715 files changed, 60853 insertions, 85259 deletions
diff --git a/lib/Target/AArch64/AArch64.h b/lib/Target/AArch64/AArch64.h
index 0297de1..1c022aa 100644
--- a/lib/Target/AArch64/AArch64.h
+++ b/lib/Target/AArch64/AArch64.h
@@ -1,4 +1,4 @@
-//==-- AArch64.h - Top-level interface for AArch64 representation -*- C++ -*-=//
+//==-- AArch64.h - Top-level interface for AArch64  --------------*- C++ -*-==//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -12,35 +12,38 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_TARGET_AARCH64_H
-#define LLVM_TARGET_AARCH64_H
+#ifndef TARGET_AArch64_H
+#define TARGET_AArch64_H
 
+#include "Utils/AArch64BaseInfo.h"
 #include "MCTargetDesc/AArch64MCTargetDesc.h"
 #include "llvm/Target/TargetMachine.h"
+#include "llvm/Support/DataTypes.h"
 
 namespace llvm {
 
-class AArch64AsmPrinter;
-class FunctionPass;
 class AArch64TargetMachine;
-class MachineInstr;
-class MCInst;
-
-FunctionPass *createAArch64ISelDAG(AArch64TargetMachine &TM,
-                                   CodeGenOpt::Level OptLevel);
+class FunctionPass;
+class MachineFunctionPass;
+
+FunctionPass *createAArch64DeadRegisterDefinitions();
+FunctionPass *createAArch64ConditionalCompares();
+FunctionPass *createAArch64AdvSIMDScalar();
+FunctionPass *createAArch64BranchRelaxation();
+FunctionPass *createAArch64ISelDag(AArch64TargetMachine &TM,
+                                 CodeGenOpt::Level OptLevel);
+FunctionPass *createAArch64StorePairSuppressPass();
+FunctionPass *createAArch64ExpandPseudoPass();
+FunctionPass *createAArch64LoadStoreOptimizationPass();
+ModulePass *createAArch64PromoteConstantPass();
+FunctionPass *createAArch64AddressTypePromotionPass();
+/// \brief Creates an ARM-specific Target Transformation Info pass.
+ImmutablePass *
+createAArch64TargetTransformInfoPass(const AArch64TargetMachine *TM);
 
 FunctionPass *createAArch64CleanupLocalDynamicTLSPass();
 
-FunctionPass *createAArch64BranchFixupPass();
-
-/// \brief Creates an AArch64-specific Target Transformation Info pass.
-ImmutablePass *createAArch64TargetTransformInfoPass(
-                                                const AArch64TargetMachine *TM);
-
-void LowerAArch64MachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
-                                      AArch64AsmPrinter &AP);
-
-
-}
+FunctionPass *createAArch64CollectLOHPass();
+} // end namespace llvm
 
 #endif
diff --git a/lib/Target/AArch64/AArch64.td b/lib/Target/AArch64/AArch64.td
index e49afd6..1ad5ac8 100644
--- a/lib/Target/AArch64/AArch64.td
+++ b/lib/Target/AArch64/AArch64.td
@@ -1,4 +1,4 @@
-//===- AArch64.td - Describe the AArch64 Target Machine -------*- tblgen -*-==//
+//=- AArch64.td - Describe the AArch64 Target Machine --------*- tablegen -*-=//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,12 +7,11 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This is the top level entry point for the AArch64 target.
 //
 //===----------------------------------------------------------------------===//
 
 //===----------------------------------------------------------------------===//
-// Target-independent interfaces
+// Target-independent interfaces which we are implementing
 //===----------------------------------------------------------------------===//
 
 include "llvm/Target/Target.td"
@@ -22,7 +21,7 @@ include "llvm/Target/Target.td"
 //
 
 def FeatureFPARMv8 : SubtargetFeature<"fp-armv8", "HasFPARMv8", "true",
-  "Enable ARMv8 FP">;
+                                       "Enable ARMv8 FP">;
 
 def FeatureNEON : SubtargetFeature<"neon", "HasNEON", "true",
   "Enable Advanced SIMD instructions", [FeatureFPARMv8]>;
@@ -30,54 +29,106 @@ def FeatureNEON : SubtargetFeature<"neon", "HasNEON", "true",
 def FeatureCrypto : SubtargetFeature<"crypto", "HasCrypto", "true",
   "Enable cryptographic instructions">;
 
+def FeatureCRC : SubtargetFeature<"crc", "HasCRC", "true",
+  "Enable ARMv8 CRC-32 checksum instructions">;
+
+/// Cyclone has register move instructions which are "free".
+def FeatureZCRegMove : SubtargetFeature<"zcm", "HasZeroCycleRegMove", "true",
+                                        "Has zero-cycle register moves">;
+
+/// Cyclone has instructions which zero registers for "free".
+def FeatureZCZeroing : SubtargetFeature<"zcz", "HasZeroCycleZeroing", "true",
+                                        "Has zero-cycle zeroing instructions">;
+
+//===----------------------------------------------------------------------===//
+// Register File Description
+//===----------------------------------------------------------------------===//
+
+include "AArch64RegisterInfo.td"
+include "AArch64CallingConvention.td"
+
+//===----------------------------------------------------------------------===//
+// Instruction Descriptions
 //===----------------------------------------------------------------------===//
-// AArch64 Processors
-//
 
 include "AArch64Schedule.td"
+include "AArch64InstrInfo.td"
 
-class ProcNoItin<string Name, list<SubtargetFeature> Features>
- : Processor<Name, NoItineraries, Features>;
+def AArch64InstrInfo : InstrInfo;
 
-def : Processor<"generic", GenericItineraries, [FeatureFPARMv8, FeatureNEON]>;
+//===----------------------------------------------------------------------===//
+// AArch64 Processors supported.
+//
+include "AArch64SchedA53.td"
+include "AArch64SchedCyclone.td"
 
 def ProcA53     : SubtargetFeature<"a53", "ARMProcFamily", "CortexA53",
                                    "Cortex-A53 ARM processors",
                                    [FeatureFPARMv8,
                                    FeatureNEON,
-                                   FeatureCrypto]>;
+                                   FeatureCrypto,
+                                   FeatureCRC]>;
 
 def ProcA57     : SubtargetFeature<"a57", "ARMProcFamily", "CortexA57",
                                    "Cortex-A57 ARM processors",
                                    [FeatureFPARMv8,
                                    FeatureNEON,
-                                   FeatureCrypto]>;
+                                   FeatureCrypto,
+                                   FeatureCRC]>;
+
+def ProcCyclone : SubtargetFeature<"cyclone", "ARMProcFamily", "Cyclone",
+                                   "Cyclone",
+                                   [FeatureFPARMv8,
+                                   FeatureNEON,
+                                   FeatureCrypto,
+                                   FeatureCRC,
+                                   FeatureZCRegMove, FeatureZCZeroing]>;
+
+def : ProcessorModel<"generic", NoSchedModel, [FeatureFPARMv8,
+                                              FeatureNEON,
+                                              FeatureCRC]>;
 
 def : ProcessorModel<"cortex-a53", CortexA53Model, [ProcA53]>;
-def : Processor<"cortex-a57", NoItineraries, [ProcA57]>;
+def : ProcessorModel<"cortex-a57", NoSchedModel, [ProcA57]>;
+def : ProcessorModel<"cyclone", CycloneModel, [ProcCyclone]>;
 
 //===----------------------------------------------------------------------===//
-// Register File Description
+// Assembly parser
 //===----------------------------------------------------------------------===//
 
-include "AArch64RegisterInfo.td"
+def GenericAsmParserVariant : AsmParserVariant {
+  int Variant = 0;
+  string Name = "generic";
+}
 
-include "AArch64CallingConv.td"
+def AppleAsmParserVariant : AsmParserVariant {
+  int Variant = 1;
+  string Name = "apple-neon";
+}
 
 //===----------------------------------------------------------------------===//
-// Instruction Descriptions
+// Assembly printer
 //===----------------------------------------------------------------------===//
+// AArch64 Uses the MC printer for asm output, so make sure the TableGen
+// AsmWriter bits get associated with the correct class.
+def GenericAsmWriter : AsmWriter {
+  string AsmWriterClassName  = "InstPrinter";
+  int Variant = 0;
+  bit isMCAsmWriter = 1;
+}
 
-include "AArch64InstrInfo.td"
-
-def AArch64InstrInfo : InstrInfo {
-  let noNamedPositionallyEncodedOperands = 1;
+def AppleAsmWriter : AsmWriter {
+  let AsmWriterClassName = "AppleInstPrinter";
+  int Variant = 1;
+  int isMCAsmWriter = 1;
 }
 
 //===----------------------------------------------------------------------===//
-// Declare the target which we are implementing
+// Target Declaration
 //===----------------------------------------------------------------------===//
 
 def AArch64 : Target {
   let InstructionSet = AArch64InstrInfo;
+  let AssemblyParserVariants = [GenericAsmParserVariant, AppleAsmParserVariant];
+  let AssemblyWriters = [GenericAsmWriter, AppleAsmWriter];
 }
diff --git a/lib/Target/ARM64/ARM64AddressTypePromotion.cpp b/lib/Target/AArch64/AArch64AddressTypePromotion.cpp
index 72fa6af..04906f6 100644
--- a/lib/Target/ARM64/ARM64AddressTypePromotion.cpp
+++ b/lib/Target/AArch64/AArch64AddressTypePromotion.cpp
@@ -1,5 +1,4 @@
-
-//===-- ARM64AddressTypePromotion.cpp --- Promote type for addr accesses -===//
+//===-- AArch64AddressTypePromotion.cpp --- Promote type for addr accesses -==//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -29,8 +28,7 @@
 // FIXME: This pass may be useful for other targets too.
 // ===---------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "arm64-type-promotion"
-#include "ARM64.h"
+#include "AArch64.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
@@ -46,41 +44,43 @@
 
 using namespace llvm;
 
+#define DEBUG_TYPE "aarch64-type-promotion"
+
 static cl::opt<bool>
-EnableAddressTypePromotion("arm64-type-promotion", cl::Hidden,
+EnableAddressTypePromotion("aarch64-type-promotion", cl::Hidden,
                            cl::desc("Enable the type promotion pass"),
                            cl::init(true));
 static cl::opt<bool>
-EnableMerge("arm64-type-promotion-merge", cl::Hidden,
+EnableMerge("aarch64-type-promotion-merge", cl::Hidden,
             cl::desc("Enable merging of redundant sexts when one is dominating"
                      " the other."),
             cl::init(true));
 
 //===----------------------------------------------------------------------===//
-//                       ARM64AddressTypePromotion
+//                       AArch64AddressTypePromotion
 //===----------------------------------------------------------------------===//
 
 namespace llvm {
-void initializeARM64AddressTypePromotionPass(PassRegistry &);
+void initializeAArch64AddressTypePromotionPass(PassRegistry &);
 }
 
 namespace {
-class ARM64AddressTypePromotion : public FunctionPass {
+class AArch64AddressTypePromotion : public FunctionPass {
 
 public:
   static char ID;
-  ARM64AddressTypePromotion()
-      : FunctionPass(ID), Func(NULL), ConsideredSExtType(NULL) {
-    initializeARM64AddressTypePromotionPass(*PassRegistry::getPassRegistry());
+  AArch64AddressTypePromotion()
+      : FunctionPass(ID), Func(nullptr), ConsideredSExtType(nullptr) {
+    initializeAArch64AddressTypePromotionPass(*PassRegistry::getPassRegistry());
   }
 
-  virtual const char *getPassName() const {
-    return "ARM64 Address Type Promotion";
+  const char *getPassName() const override {
+    return "AArch64 Address Type Promotion";
   }
 
   /// Iterate over the functions and promote the computation of interesting
   // sext instructions.
-  bool runOnFunction(Function &F);
+  bool runOnFunction(Function &F) override;
 
 private:
   /// The current function.
@@ -90,7 +90,7 @@ private:
   Type *ConsideredSExtType;
 
   // This transformation requires dominator info.
-  virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesCFG();
     AU.addRequired<DominatorTreeWrapperPass>();
     AU.addPreserved<DominatorTreeWrapperPass>();
@@ -139,19 +139,19 @@ private:
 };
 } // end anonymous namespace.
 
-char ARM64AddressTypePromotion::ID = 0;
+char AArch64AddressTypePromotion::ID = 0;
 
-INITIALIZE_PASS_BEGIN(ARM64AddressTypePromotion, "arm64-type-promotion",
-                      "ARM64 Type Promotion Pass", false, false)
+INITIALIZE_PASS_BEGIN(AArch64AddressTypePromotion, "aarch64-type-promotion",
+                      "AArch64 Type Promotion Pass", false, false)
 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_END(ARM64AddressTypePromotion, "arm64-type-promotion",
-                    "ARM64 Type Promotion Pass", false, false)
+INITIALIZE_PASS_END(AArch64AddressTypePromotion, "aarch64-type-promotion",
+                    "AArch64 Type Promotion Pass", false, false)
 
-FunctionPass *llvm::createARM64AddressTypePromotionPass() {
-  return new ARM64AddressTypePromotion();
+FunctionPass *llvm::createAArch64AddressTypePromotionPass() {
+  return new AArch64AddressTypePromotion();
 }
 
-bool ARM64AddressTypePromotion::canGetThrough(const Instruction *Inst) {
+bool AArch64AddressTypePromotion::canGetThrough(const Instruction *Inst) {
   if (isa<SExtInst>(Inst))
     return true;
 
@@ -174,7 +174,7 @@ bool ARM64AddressTypePromotion::canGetThrough(const Instruction *Inst) {
   return false;
 }
 
-bool ARM64AddressTypePromotion::shouldGetThrough(const Instruction *Inst) {
+bool AArch64AddressTypePromotion::shouldGetThrough(const Instruction *Inst) {
   // If the type of the sext is the same as the considered one, this sext
   // will become useless.
   // Otherwise, we will have to do something to preserve the original value,
@@ -210,14 +210,12 @@ static bool shouldSExtOperand(const Instruction *Inst, int OpIdx) {
 }
 
 bool
-ARM64AddressTypePromotion::shouldConsiderSExt(const Instruction *SExt) const {
+AArch64AddressTypePromotion::shouldConsiderSExt(const Instruction *SExt) const {
   if (SExt->getType() != ConsideredSExtType)
     return false;
 
-  for (Value::const_use_iterator UseIt = SExt->use_begin(),
-                                 EndUseIt = SExt->use_end();
-       UseIt != EndUseIt; ++UseIt) {
-    if (isa<GetElementPtrInst>(*UseIt))
+  for (const Use &U : SExt->uses()) {
+    if (isa<GetElementPtrInst>(*U))
       return true;
   }
 
@@ -250,7 +248,7 @@ ARM64AddressTypePromotion::shouldConsiderSExt(const Instruction *SExt) const {
 //   = a
 // Iterate on 'c'.
 bool
-ARM64AddressTypePromotion::propagateSignExtension(Instructions &SExtInsts) {
+AArch64AddressTypePromotion::propagateSignExtension(Instructions &SExtInsts) {
   DEBUG(dbgs() << "*** Propagate Sign Extension ***\n");
 
   bool LocalChange = false;
@@ -345,7 +343,7 @@ ARM64AddressTypePromotion::propagateSignExtension(Instructions &SExtInsts) {
         SExtForOpnd->moveBefore(Inst);
         Inst->setOperand(OpIdx, SExtForOpnd);
         // If more sext are required, new instructions will have to be created.
-        SExtForOpnd = NULL;
+        SExtForOpnd = nullptr;
       }
       if (SExtForOpnd == SExt) {
         DEBUG(dbgs() << "Sign extension is useless now\n");
@@ -376,11 +374,11 @@ ARM64AddressTypePromotion::propagateSignExtension(Instructions &SExtInsts) {
   return LocalChange;
 }
 
-void ARM64AddressTypePromotion::mergeSExts(ValueToInsts &ValToSExtendedUses,
-                                           SetOfInstructions &ToRemove) {
+void AArch64AddressTypePromotion::mergeSExts(ValueToInsts &ValToSExtendedUses,
+                                             SetOfInstructions &ToRemove) {
   DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
 
-  for (auto &Entry: ValToSExtendedUses) {
+  for (auto &Entry : ValToSExtendedUses) {
     Instructions &Insts = Entry.second;
     Instructions CurPts;
     for (Instruction *Inst : Insts) {
@@ -415,13 +413,13 @@ void ARM64AddressTypePromotion::mergeSExts(ValueToInsts &ValToSExtendedUses,
   }
 }
 
-void ARM64AddressTypePromotion::analyzeSExtension(Instructions &SExtInsts) {
+void AArch64AddressTypePromotion::analyzeSExtension(Instructions &SExtInsts) {
   DEBUG(dbgs() << "*** Analyze Sign Extensions ***\n");
 
   DenseMap<Value *, Instruction *> SeenChains;
 
   for (auto &BB : *Func) {
-    for (auto &II: BB) {
+    for (auto &II : BB) {
       Instruction *SExt = &II;
 
       // Collect all sext operation per type.
@@ -438,10 +436,8 @@ void ARM64AddressTypePromotion::analyzeSExtension(Instructions &SExtInsts) {
 
       bool insert = false;
       // #1.
-      for (Value::use_iterator UseIt = SExt->use_begin(),
-                               EndUseIt = SExt->use_end();
-           UseIt != EndUseIt; ++UseIt) {
-        const Instruction *Inst = dyn_cast<GetElementPtrInst>(*UseIt);
+      for (const Use &U : SExt->uses()) {
+        const Instruction *Inst = dyn_cast<GetElementPtrInst>(U);
         if (Inst && Inst->getNumOperands() > 2) {
           DEBUG(dbgs() << "Interesting use in GetElementPtrInst\n" << *Inst
                        << '\n');
@@ -469,10 +465,10 @@ void ARM64AddressTypePromotion::analyzeSExtension(Instructions &SExtInsts) {
       if (insert || AlreadySeen != SeenChains.end()) {
         DEBUG(dbgs() << "Insert\n");
         SExtInsts.push_back(SExt);
-        if (AlreadySeen != SeenChains.end() && AlreadySeen->second != NULL) {
+        if (AlreadySeen != SeenChains.end() && AlreadySeen->second != nullptr) {
           DEBUG(dbgs() << "Insert chain member\n");
           SExtInsts.push_back(AlreadySeen->second);
-          SeenChains[Last] = NULL;
+          SeenChains[Last] = nullptr;
         }
       } else {
         DEBUG(dbgs() << "Record its chain membership\n");
@@ -482,7 +478,7 @@ void ARM64AddressTypePromotion::analyzeSExtension(Instructions &SExtInsts) {
   }
 }
 
-bool ARM64AddressTypePromotion::runOnFunction(Function &F) {
+bool AArch64AddressTypePromotion::runOnFunction(Function &F) {
   if (!EnableAddressTypePromotion || F.isDeclaration())
     return false;
   Func = &F;
diff --git a/lib/Target/ARM64/ARM64AdvSIMDScalarPass.cpp b/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp
index 83f8cda..734fb21 100644
--- a/lib/Target/ARM64/ARM64AdvSIMDScalarPass.cpp
+++ b/lib/Target/AArch64/AArch64AdvSIMDScalarPass.cpp
@@ -1,4 +1,4 @@
-//===-- ARM64AdvSIMDScalar.cpp - Replace dead defs w/ zero reg --===//
+//===-- AArch64AdvSIMDScalar.cpp - Replace dead defs w/ zero reg --===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -14,7 +14,7 @@
 //===----------------------------------------------------------------------===//
 // TODO: Graph based predicate heuristics.
 // Walking the instruction list linearly will get many, perhaps most, of
-// the cases, but to do a truly throrough job of this, we need a more
+// the cases, but to do a truly thorough job of this, we need a more
 // wholistic approach.
 //
 // This optimization is very similar in spirit to the register allocator's
@@ -33,10 +33,9 @@
 // solution.
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "arm64-simd-scalar"
-#include "ARM64.h"
-#include "ARM64InstrInfo.h"
-#include "ARM64RegisterInfo.h"
+#include "AArch64.h"
+#include "AArch64InstrInfo.h"
+#include "AArch64RegisterInfo.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineFunction.h"
@@ -48,14 +47,12 @@
 #include "llvm/Support/raw_ostream.h"
 using namespace llvm;
 
-static cl::opt<bool>
-AdvSIMDScalar("arm64-simd-scalar",
-              cl::desc("enable use of AdvSIMD scalar integer instructions"),
-              cl::init(false), cl::Hidden);
+#define DEBUG_TYPE "aarch64-simd-scalar"
+
 // Allow forcing all i64 operations with equivalent SIMD instructions to use
 // them. For stress-testing the transformation function.
 static cl::opt<bool>
-TransformAll("arm64-simd-scalar-force-all",
+TransformAll("aarch64-simd-scalar-force-all",
              cl::desc("Force use of AdvSIMD scalar instructions everywhere"),
              cl::init(false), cl::Hidden);
 
@@ -64,9 +61,9 @@ STATISTIC(NumCopiesDeleted, "Number of cross-class copies deleted");
 STATISTIC(NumCopiesInserted, "Number of cross-class copies inserted");
 
 namespace {
-class ARM64AdvSIMDScalar : public MachineFunctionPass {
+class AArch64AdvSIMDScalar : public MachineFunctionPass {
   MachineRegisterInfo *MRI;
-  const ARM64InstrInfo *TII;
+  const AArch64InstrInfo *TII;
 
 private:
   // isProfitableToTransform - Predicate function to determine whether an
@@ -74,7 +71,7 @@ private:
   // instruction. "add Xd, Xn, Xm" ==> "add Dd, Da, Db", for example.
   bool isProfitableToTransform(const MachineInstr *MI) const;
 
-  // tranformInstruction - Perform the transformation of an instruction
+  // transformInstruction - Perform the transformation of an instruction
   // to its equivalant AdvSIMD scalar instruction. Update inputs and outputs
   // to be the correct register class, minimizing cross-class copies.
   void transformInstruction(MachineInstr *MI);
@@ -84,20 +81,20 @@ private:
 
 public:
   static char ID; // Pass identification, replacement for typeid.
-  explicit ARM64AdvSIMDScalar() : MachineFunctionPass(ID) {}
+  explicit AArch64AdvSIMDScalar() : MachineFunctionPass(ID) {}
 
-  virtual bool runOnMachineFunction(MachineFunction &F);
+  bool runOnMachineFunction(MachineFunction &F) override;
 
-  const char *getPassName() const {
-    return "AdvSIMD scalar operation optimization";
+  const char *getPassName() const override {
+    return "AdvSIMD Scalar Operation Optimization";
   }
 
-  virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesCFG();
     MachineFunctionPass::getAnalysisUsage(AU);
   }
 };
-char ARM64AdvSIMDScalar::ID = 0;
+char AArch64AdvSIMDScalar::ID = 0;
 } // end anonymous namespace
 
 static bool isGPR64(unsigned Reg, unsigned SubReg,
@@ -105,20 +102,20 @@ static bool isGPR64(unsigned Reg, unsigned SubReg,
   if (SubReg)
     return false;
   if (TargetRegisterInfo::isVirtualRegister(Reg))
-    return MRI->getRegClass(Reg)->hasSuperClassEq(&ARM64::GPR64RegClass);
-  return ARM64::GPR64RegClass.contains(Reg);
+    return MRI->getRegClass(Reg)->hasSuperClassEq(&AArch64::GPR64RegClass);
+  return AArch64::GPR64RegClass.contains(Reg);
 }
 
 static bool isFPR64(unsigned Reg, unsigned SubReg,
                     const MachineRegisterInfo *MRI) {
   if (TargetRegisterInfo::isVirtualRegister(Reg))
-    return (MRI->getRegClass(Reg)->hasSuperClassEq(&ARM64::FPR64RegClass) &&
+    return (MRI->getRegClass(Reg)->hasSuperClassEq(&AArch64::FPR64RegClass) &&
             SubReg == 0) ||
-           (MRI->getRegClass(Reg)->hasSuperClassEq(&ARM64::FPR128RegClass) &&
-            SubReg == ARM64::dsub);
-  // Physical register references just check the regist class directly.
-  return (ARM64::FPR64RegClass.contains(Reg) && SubReg == 0) ||
-         (ARM64::FPR128RegClass.contains(Reg) && SubReg == ARM64::dsub);
+           (MRI->getRegClass(Reg)->hasSuperClassEq(&AArch64::FPR128RegClass) &&
+            SubReg == AArch64::dsub);
+  // Physical register references just check the register class directly.
+  return (AArch64::FPR64RegClass.contains(Reg) && SubReg == 0) ||
+         (AArch64::FPR128RegClass.contains(Reg) && SubReg == AArch64::dsub);
 }
 
 // getSrcFromCopy - Get the original source register for a GPR64 <--> FPR64
@@ -128,17 +125,18 @@ static unsigned getSrcFromCopy(const MachineInstr *MI,
                                unsigned &SubReg) {
   SubReg = 0;
   // The "FMOV Xd, Dn" instruction is the typical form.
-  if (MI->getOpcode() == ARM64::FMOVDXr || MI->getOpcode() == ARM64::FMOVXDr)
+  if (MI->getOpcode() == AArch64::FMOVDXr ||
+      MI->getOpcode() == AArch64::FMOVXDr)
     return MI->getOperand(1).getReg();
   // A lane zero extract "UMOV.d Xd, Vn[0]" is equivalent. We shouldn't see
   // these at this stage, but it's easy to check for.
-  if (MI->getOpcode() == ARM64::UMOVvi64 && MI->getOperand(2).getImm() == 0) {
-    SubReg = ARM64::dsub;
+  if (MI->getOpcode() == AArch64::UMOVvi64 && MI->getOperand(2).getImm() == 0) {
+    SubReg = AArch64::dsub;
     return MI->getOperand(1).getReg();
   }
   // Or just a plain COPY instruction. This can be directly to/from FPR64,
   // or it can be a dsub subreg reference to an FPR128.
-  if (MI->getOpcode() == ARM64::COPY) {
+  if (MI->getOpcode() == AArch64::COPY) {
     if (isFPR64(MI->getOperand(0).getReg(), MI->getOperand(0).getSubReg(),
                 MRI) &&
         isGPR64(MI->getOperand(1).getReg(), MI->getOperand(1).getSubReg(), MRI))
@@ -147,7 +145,7 @@ static unsigned getSrcFromCopy(const MachineInstr *MI,
                 MRI) &&
         isFPR64(MI->getOperand(1).getReg(), MI->getOperand(1).getSubReg(),
                 MRI)) {
-      SubReg = ARM64::dsub;
+      SubReg = MI->getOperand(1).getSubReg();
       return MI->getOperand(1).getReg();
     }
   }
@@ -164,10 +162,10 @@ static int getTransformOpcode(unsigned Opc) {
   default:
     break;
   // FIXME: Lots more possibilities.
-  case ARM64::ADDXrr:
-    return ARM64::ADDv1i64;
-  case ARM64::SUBXrr:
-    return ARM64::SUBv1i64;
+  case AArch64::ADDXrr:
+    return AArch64::ADDv1i64;
+  case AArch64::SUBXrr:
+    return AArch64::SUBv1i64;
   }
   // No AdvSIMD equivalent, so just return the original opcode.
   return Opc;
@@ -181,7 +179,8 @@ static bool isTransformable(const MachineInstr *MI) {
 // isProfitableToTransform - Predicate function to determine whether an
 // instruction should be transformed to its equivalent AdvSIMD scalar
 // instruction. "add Xd, Xn, Xm" ==> "add Dd, Da, Db", for example.
-bool ARM64AdvSIMDScalar::isProfitableToTransform(const MachineInstr *MI) const {
+bool
+AArch64AdvSIMDScalar::isProfitableToTransform(const MachineInstr *MI) const {
   // If this instruction isn't eligible to be transformed (no SIMD equivalent),
   // early exit since that's the common case.
   if (!isTransformable(MI))
@@ -241,8 +240,8 @@ bool ARM64AdvSIMDScalar::isProfitableToTransform(const MachineInstr *MI) const {
     // preferable to have it use the FPR64 in most cases, as if the source
     // vector is an IMPLICIT_DEF, the INSERT_SUBREG just goes away entirely.
     // Ditto for a lane insert.
-    else if (Use->getOpcode() == ARM64::INSERT_SUBREG ||
-             Use->getOpcode() == ARM64::INSvi64gpr)
+    else if (Use->getOpcode() == AArch64::INSERT_SUBREG ||
+             Use->getOpcode() == AArch64::INSvi64gpr)
       ;
     else
       AllUsesAreCopies = false;
@@ -252,7 +251,7 @@ bool ARM64AdvSIMDScalar::isProfitableToTransform(const MachineInstr *MI) const {
   if (AllUsesAreCopies)
     --NumNewCopies;
 
-  // If a tranform will not increase the number of cross-class copies required,
+  // If a transform will not increase the number of cross-class copies required,
   // return true.
   if (NumNewCopies <= NumRemovableCopies)
     return true;
@@ -262,10 +261,10 @@ bool ARM64AdvSIMDScalar::isProfitableToTransform(const MachineInstr *MI) const {
   return TransformAll;
 }
 
-static MachineInstr *insertCopy(const ARM64InstrInfo *TII, MachineInstr *MI,
+static MachineInstr *insertCopy(const AArch64InstrInfo *TII, MachineInstr *MI,
                                 unsigned Dst, unsigned Src, bool IsKill) {
   MachineInstrBuilder MIB =
-      BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(ARM64::COPY),
+      BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AArch64::COPY),
               Dst)
           .addReg(Src, getKillRegState(IsKill));
   DEBUG(dbgs() << "    adding copy: " << *MIB);
@@ -273,10 +272,10 @@ static MachineInstr *insertCopy(const ARM64InstrInfo *TII, MachineInstr *MI,
   return MIB;
 }
 
-// tranformInstruction - Perform the transformation of an instruction
+// transformInstruction - Perform the transformation of an instruction
 // to its equivalant AdvSIMD scalar instruction. Update inputs and outputs
 // to be the correct register class, minimizing cross-class copies.
-void ARM64AdvSIMDScalar::transformInstruction(MachineInstr *MI) {
+void AArch64AdvSIMDScalar::transformInstruction(MachineInstr *MI) {
   DEBUG(dbgs() << "Scalar transform: " << *MI);
 
   MachineBasicBlock *MBB = MI->getParent();
@@ -319,19 +318,19 @@ void ARM64AdvSIMDScalar::transformInstruction(MachineInstr *MI) {
   // copy.
   if (!Src0) {
     SubReg0 = 0;
-    Src0 = MRI->createVirtualRegister(&ARM64::FPR64RegClass);
+    Src0 = MRI->createVirtualRegister(&AArch64::FPR64RegClass);
     insertCopy(TII, MI, Src0, OrigSrc0, true);
   }
   if (!Src1) {
     SubReg1 = 0;
-    Src1 = MRI->createVirtualRegister(&ARM64::FPR64RegClass);
+    Src1 = MRI->createVirtualRegister(&AArch64::FPR64RegClass);
     insertCopy(TII, MI, Src1, OrigSrc1, true);
   }
 
   // Create a vreg for the destination.
   // FIXME: No need to do this if the ultimate user expects an FPR64.
   // Check for that and avoid the copy if possible.
-  unsigned Dst = MRI->createVirtualRegister(&ARM64::FPR64RegClass);
+  unsigned Dst = MRI->createVirtualRegister(&AArch64::FPR64RegClass);
 
   // For now, all of the new instructions have the same simple three-register
   // form, so no need to special case based on what instruction we're
@@ -352,7 +351,7 @@ void ARM64AdvSIMDScalar::transformInstruction(MachineInstr *MI) {
 }
 
 // processMachineBasicBlock - Main optimzation loop.
-bool ARM64AdvSIMDScalar::processMachineBasicBlock(MachineBasicBlock *MBB) {
+bool AArch64AdvSIMDScalar::processMachineBasicBlock(MachineBasicBlock *MBB) {
   bool Changed = false;
   for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;) {
     MachineInstr *MI = I;
@@ -366,17 +365,13 @@ bool ARM64AdvSIMDScalar::processMachineBasicBlock(MachineBasicBlock *MBB) {
 }
 
 // runOnMachineFunction - Pass entry point from PassManager.
-bool ARM64AdvSIMDScalar::runOnMachineFunction(MachineFunction &mf) {
-  // Early exit if pass disabled.
-  if (!AdvSIMDScalar)
-    return false;
-
+bool AArch64AdvSIMDScalar::runOnMachineFunction(MachineFunction &mf) {
   bool Changed = false;
-  DEBUG(dbgs() << "***** ARM64AdvSIMDScalar *****\n");
+  DEBUG(dbgs() << "***** AArch64AdvSIMDScalar *****\n");
 
   const TargetMachine &TM = mf.getTarget();
   MRI = &mf.getRegInfo();
-  TII = static_cast<const ARM64InstrInfo *>(TM.getInstrInfo());
+  TII = static_cast<const AArch64InstrInfo *>(TM.getInstrInfo());
 
   // Just check things on a one-block-at-a-time basis.
   for (MachineFunction::iterator I = mf.begin(), E = mf.end(); I != E; ++I)
@@ -385,8 +380,8 @@ bool ARM64AdvSIMDScalar::runOnMachineFunction(MachineFunction &mf) {
   return Changed;
 }
 
-// createARM64AdvSIMDScalar - Factory function used by ARM64TargetMachine
+// createAArch64AdvSIMDScalar - Factory function used by AArch64TargetMachine
 // to add the pass to the PassManager.
-FunctionPass *llvm::createARM64AdvSIMDScalar() {
-  return new ARM64AdvSIMDScalar();
+FunctionPass *llvm::createAArch64AdvSIMDScalar() {
+  return new AArch64AdvSIMDScalar();
 }
diff --git a/lib/Target/AArch64/AArch64AsmPrinter.cpp b/lib/Target/AArch64/AArch64AsmPrinter.cpp
index f0b52d3..c3ee9bb 100644
--- a/lib/Target/AArch64/AArch64AsmPrinter.cpp
+++ b/lib/Target/AArch64/AArch64AsmPrinter.cpp
@@ -1,4 +1,4 @@
-//===-- AArch64AsmPrinter.cpp - Print machine code to an AArch64 .s file --===//
+//===-- AArch64AsmPrinter.cpp - AArch64 LLVM assembly writer --------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -8,236 +8,337 @@
 //===----------------------------------------------------------------------===//
 //
 // This file contains a printer that converts from our internal representation
-// of machine-dependent LLVM code to GAS-format AArch64 assembly language.
+// of machine-dependent LLVM code to the AArch64 assembly language.
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "asm-printer"
-#include "AArch64AsmPrinter.h"
+#include "AArch64.h"
+#include "AArch64MachineFunctionInfo.h"
+#include "AArch64MCInstLower.h"
+#include "AArch64RegisterInfo.h"
+#include "AArch64Subtarget.h"
 #include "InstPrinter/AArch64InstPrinter.h"
 #include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/StackMaps.h"
 #include "llvm/CodeGen/MachineModuleInfoImpls.h"
 #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/IR/DataLayout.h"
 #include "llvm/IR/DebugInfo.h"
-#include "llvm/IR/Mangler.h"
 #include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCInst.h"
-#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCInstBuilder.h"
+#include "llvm/MC/MCLinkerOptimizationHint.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/Debug.h"
 #include "llvm/Support/TargetRegistry.h"
-
 using namespace llvm;
 
-/// Try to print a floating-point register as if it belonged to a specified
-/// register-class. For example the inline asm operand modifier "b" requires its
-/// argument to be printed as "bN".
-static bool printModifiedFPRAsmOperand(const MachineOperand &MO,
-                                       const TargetRegisterInfo *TRI,
-                                       char RegType, raw_ostream &O) {
-  if (!MO.isReg())
-    return true;
-
-  for (MCRegAliasIterator AR(MO.getReg(), TRI, true); AR.isValid(); ++AR) {
-    if (AArch64::FPR8RegClass.contains(*AR)) {
-      O << RegType << TRI->getEncodingValue(MO.getReg());
-      return false;
+#define DEBUG_TYPE "asm-printer"
+
+namespace {
+
+class AArch64AsmPrinter : public AsmPrinter {
+  /// Subtarget - Keep a pointer to the AArch64Subtarget around so that we can
+  /// make the right decision when printing asm code for different targets.
+  const AArch64Subtarget *Subtarget;
+
+  AArch64MCInstLower MCInstLowering;
+  StackMaps SM;
+
+public:
+  AArch64AsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
+      : AsmPrinter(TM, Streamer),
+        Subtarget(&TM.getSubtarget<AArch64Subtarget>()),
+        MCInstLowering(OutContext, *Mang, *this), SM(*this), AArch64FI(nullptr),
+        LOHLabelCounter(0) {}
+
+  const char *getPassName() const override {
+    return "AArch64 Assembly Printer";
+  }
+
+  /// \brief Wrapper for MCInstLowering.lowerOperand() for the
+  /// tblgen'erated pseudo lowering.
+  bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp) const {
+    return MCInstLowering.lowerOperand(MO, MCOp);
+  }
+
+  void LowerSTACKMAP(MCStreamer &OutStreamer, StackMaps &SM,
+                     const MachineInstr &MI);
+  void LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM,
+                       const MachineInstr &MI);
+  /// \brief tblgen'erated driver function for lowering simple MI->MC
+  /// pseudo instructions.
+  bool emitPseudoExpansionLowering(MCStreamer &OutStreamer,
+                                   const MachineInstr *MI);
+
+  void EmitInstruction(const MachineInstr *MI) override;
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AsmPrinter::getAnalysisUsage(AU);
+    AU.setPreservesAll();
+  }
+
+  bool runOnMachineFunction(MachineFunction &F) override {
+    AArch64FI = F.getInfo<AArch64FunctionInfo>();
+    return AsmPrinter::runOnMachineFunction(F);
+  }
+
+private:
+  MachineLocation getDebugValueLocation(const MachineInstr *MI) const;
+  void printOperand(const MachineInstr *MI, unsigned OpNum, raw_ostream &O);
+  bool printAsmMRegister(const MachineOperand &MO, char Mode, raw_ostream &O);
+  bool printAsmRegInClass(const MachineOperand &MO,
+                          const TargetRegisterClass *RC, bool isVector,
+                          raw_ostream &O);
+
+  bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
+                       unsigned AsmVariant, const char *ExtraCode,
+                       raw_ostream &O) override;
+  bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNum,
+                             unsigned AsmVariant, const char *ExtraCode,
+                             raw_ostream &O) override;
+
+  void PrintDebugValueComment(const MachineInstr *MI, raw_ostream &OS);
+
+  void EmitFunctionBodyEnd() override;
+
+  MCSymbol *GetCPISymbol(unsigned CPID) const override;
+  void EmitEndOfAsmFile(Module &M) override;
+  AArch64FunctionInfo *AArch64FI;
+
+  /// \brief Emit the LOHs contained in AArch64FI.
+  void EmitLOHs();
+
+  typedef std::map<const MachineInstr *, MCSymbol *> MInstToMCSymbol;
+  MInstToMCSymbol LOHInstToLabel;
+  unsigned LOHLabelCounter;
+};
+
+} // end of anonymous namespace
+
+//===----------------------------------------------------------------------===//
+
+void AArch64AsmPrinter::EmitEndOfAsmFile(Module &M) {
+  if (Subtarget->isTargetMachO()) {
+    // Funny Darwin hack: This flag tells the linker that no global symbols
+    // contain code that falls through to other global symbols (e.g. the obvious
+    // implementation of multiple entry points).  If this doesn't occur, the
+    // linker can safely perform dead code stripping.  Since LLVM never
+    // generates code that does this, it is always safe to set.
+    OutStreamer.EmitAssemblerFlag(MCAF_SubsectionsViaSymbols);
+    SM.serializeToStackMapSection();
+  }
+
+  // Emit a .data.rel section containing any stubs that were created.
+  if (Subtarget->isTargetELF()) {
+    const TargetLoweringObjectFileELF &TLOFELF =
+      static_cast<const TargetLoweringObjectFileELF &>(getObjFileLowering());
+
+    MachineModuleInfoELF &MMIELF = MMI->getObjFileInfo<MachineModuleInfoELF>();
+
+    // Output stubs for external and common global variables.
+    MachineModuleInfoELF::SymbolListTy Stubs = MMIELF.GetGVStubList();
+    if (!Stubs.empty()) {
+      OutStreamer.SwitchSection(TLOFELF.getDataRelSection());
+      const DataLayout *TD = TM.getDataLayout();
+
+      for (unsigned i = 0, e = Stubs.size(); i != e; ++i) {
+        OutStreamer.EmitLabel(Stubs[i].first);
+        OutStreamer.EmitSymbolValue(Stubs[i].second.getPointer(),
+                                    TD->getPointerSize(0));
+      }
+      Stubs.clear();
     }
   }
 
-  // The register doesn't correspond to anything floating-point like.
-  return true;
 }
 
-/// Implements the 'w' and 'x' inline asm operand modifiers, which print a GPR
-/// with the obvious type and an immediate 0 as either wzr or xzr.
-static bool printModifiedGPRAsmOperand(const MachineOperand &MO,
-                                       const TargetRegisterInfo *TRI,
-                                       const TargetRegisterClass &RegClass,
-                                       raw_ostream &O) {
-  char Prefix = &RegClass == &AArch64::GPR32RegClass ? 'w' : 'x';
+MachineLocation
+AArch64AsmPrinter::getDebugValueLocation(const MachineInstr *MI) const {
+  MachineLocation Location;
+  assert(MI->getNumOperands() == 4 && "Invalid no. of machine operands!");
+  // Frame address.  Currently handles register +- offset only.
+  if (MI->getOperand(0).isReg() && MI->getOperand(1).isImm())
+    Location.set(MI->getOperand(0).getReg(), MI->getOperand(1).getImm());
+  else {
+    DEBUG(dbgs() << "DBG_VALUE instruction ignored! " << *MI << "\n");
+  }
+  return Location;
+}
 
-  if (MO.isImm() && MO.getImm() == 0) {
-    O << Prefix << "zr";
-    return false;
-  } else if (MO.isReg()) {
-    if (MO.getReg() == AArch64::XSP || MO.getReg() == AArch64::WSP) {
-      O << (Prefix == 'x' ? "sp" : "wsp");
-      return false;
-    }
+void AArch64AsmPrinter::EmitLOHs() {
+  SmallVector<MCSymbol *, 3> MCArgs;
 
-    for (MCRegAliasIterator AR(MO.getReg(), TRI, true); AR.isValid(); ++AR) {
-      if (RegClass.contains(*AR)) {
-        O << AArch64InstPrinter::getRegisterName(*AR);
-        return false;
-      }
+  for (const auto &D : AArch64FI->getLOHContainer()) {
+    for (const MachineInstr *MI : D.getArgs()) {
+      MInstToMCSymbol::iterator LabelIt = LOHInstToLabel.find(MI);
+      assert(LabelIt != LOHInstToLabel.end() &&
+             "Label hasn't been inserted for LOH related instruction");
+      MCArgs.push_back(LabelIt->second);
     }
+    OutStreamer.EmitLOHDirective(D.getKind(), MCArgs);
+    MCArgs.clear();
   }
+}
 
-  return true;
+void AArch64AsmPrinter::EmitFunctionBodyEnd() {
+  if (!AArch64FI->getLOHRelated().empty())
+    EmitLOHs();
 }
 
-bool AArch64AsmPrinter::printSymbolicAddress(const MachineOperand &MO,
-                                             bool PrintImmediatePrefix,
-                                             StringRef Suffix, raw_ostream &O) {
-  StringRef Name;
-  StringRef Modifier;
+/// GetCPISymbol - Return the symbol for the specified constant pool entry.
+MCSymbol *AArch64AsmPrinter::GetCPISymbol(unsigned CPID) const {
+  // Darwin uses a linker-private symbol name for constant-pools (to
+  // avoid addends on the relocation?), ELF has no such concept and
+  // uses a normal private symbol.
+  if (getDataLayout().getLinkerPrivateGlobalPrefix()[0])
+    return OutContext.GetOrCreateSymbol(
+        Twine(getDataLayout().getLinkerPrivateGlobalPrefix()) + "CPI" +
+        Twine(getFunctionNumber()) + "_" + Twine(CPID));
+
+  return OutContext.GetOrCreateSymbol(
+      Twine(getDataLayout().getPrivateGlobalPrefix()) + "CPI" +
+      Twine(getFunctionNumber()) + "_" + Twine(CPID));
+}
+
+void AArch64AsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNum,
+                                     raw_ostream &O) {
+  const MachineOperand &MO = MI->getOperand(OpNum);
   switch (MO.getType()) {
   default:
-    return true;
-  case MachineOperand::MO_GlobalAddress:
-    Name = getSymbol(MO.getGlobal())->getName();
-
-    // Global variables may be accessed either via a GOT or in various fun and
-    // interesting TLS-model specific ways. Set the prefix modifier as
-    // appropriate here.
-    if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(MO.getGlobal())) {
-      Reloc::Model RelocM = TM.getRelocationModel();
-      if (GV->isThreadLocal()) {
-        switch (TM.getTLSModel(GV)) {
-        case TLSModel::GeneralDynamic:
-          Modifier = "tlsdesc";
-          break;
-        case TLSModel::LocalDynamic:
-          Modifier = "dtprel";
-          break;
-        case TLSModel::InitialExec:
-          Modifier = "gottprel";
-          break;
-        case TLSModel::LocalExec:
-          Modifier = "tprel";
-          break;
-        }
-      } else if (Subtarget->GVIsIndirectSymbol(GV, RelocM)) {
-        Modifier = "got";
-      }
-    }
+    assert(0 && "<unknown operand type>");
+  case MachineOperand::MO_Register: {
+    unsigned Reg = MO.getReg();
+    assert(TargetRegisterInfo::isPhysicalRegister(Reg));
+    assert(!MO.getSubReg() && "Subregs should be eliminated!");
+    O << AArch64InstPrinter::getRegisterName(Reg);
+    break;
+  }
+  case MachineOperand::MO_Immediate: {
+    int64_t Imm = MO.getImm();
+    O << '#' << Imm;
     break;
-  case MachineOperand::MO_BlockAddress:
-    Name = GetBlockAddressSymbol(MO.getBlockAddress())->getName();
+  }
+  }
+}
+
+bool AArch64AsmPrinter::printAsmMRegister(const MachineOperand &MO, char Mode,
+                                          raw_ostream &O) {
+  unsigned Reg = MO.getReg();
+  switch (Mode) {
+  default:
+    return true; // Unknown mode.
+  case 'w':
+    Reg = getWRegFromXReg(Reg);
     break;
-  case MachineOperand::MO_ConstantPoolIndex:
-    Name = GetCPISymbol(MO.getIndex())->getName();
+  case 'x':
+    Reg = getXRegFromWReg(Reg);
     break;
   }
 
-  // Some instructions (notably ADRP) don't take the # prefix for
-  // immediates. Only print it if asked to.
-  if (PrintImmediatePrefix)
-    O << '#';
-
-  // Only need the joining "_" if both the prefix and the suffix are
-  // non-null. This little block simply takes care of the four possibly
-  // combinations involved there.
-  if (Modifier == "" && Suffix == "")
-    O << Name;
-  else if (Modifier == "" && Suffix != "")
-    O << ":" << Suffix << ':' << Name;
-  else if (Modifier != "" && Suffix == "")
-    O << ":" << Modifier << ':' << Name;
-  else
-    O << ":" << Modifier << '_' << Suffix << ':' << Name;
+  O << AArch64InstPrinter::getRegisterName(Reg);
+  return false;
+}
 
+// Prints the register in MO using class RC using the offset in the
+// new register class. This should not be used for cross class
+// printing.
+bool AArch64AsmPrinter::printAsmRegInClass(const MachineOperand &MO,
+                                           const TargetRegisterClass *RC,
+                                           bool isVector, raw_ostream &O) {
+  assert(MO.isReg() && "Should only get here with a register!");
+  const AArch64RegisterInfo *RI =
+      static_cast<const AArch64RegisterInfo *>(TM.getRegisterInfo());
+  unsigned Reg = MO.getReg();
+  unsigned RegToPrint = RC->getRegister(RI->getEncodingValue(Reg));
+  assert(RI->regsOverlap(RegToPrint, Reg));
+  O << AArch64InstPrinter::getRegisterName(
+           RegToPrint, isVector ? AArch64::vreg : AArch64::NoRegAltName);
   return false;
 }
 
 bool AArch64AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
                                         unsigned AsmVariant,
                                         const char *ExtraCode, raw_ostream &O) {
-  const TargetRegisterInfo *TRI = MF->getTarget().getRegisterInfo();
+  const MachineOperand &MO = MI->getOperand(OpNum);
 
-  if (!ExtraCode)
-    ExtraCode = "";
+  // First try the generic code, which knows about modifiers like 'c' and 'n'.
+  if (!AsmPrinter::PrintAsmOperand(MI, OpNum, AsmVariant, ExtraCode, O))
+    return false;
 
-  switch(ExtraCode[0]) {
-  default:
-    if (!AsmPrinter::PrintAsmOperand(MI, OpNum, AsmVariant, ExtraCode, O))
+  // Does this asm operand have a single letter operand modifier?
+  if (ExtraCode && ExtraCode[0]) {
+    if (ExtraCode[1] != 0)
+      return true; // Unknown modifier.
+
+    switch (ExtraCode[0]) {
+    default:
+      return true; // Unknown modifier.
+    case 'w':      // Print W register
+    case 'x':      // Print X register
+      if (MO.isReg())
+        return printAsmMRegister(MO, ExtraCode[0], O);
+      if (MO.isImm() && MO.getImm() == 0) {
+        unsigned Reg = ExtraCode[0] == 'w' ? AArch64::WZR : AArch64::XZR;
+        O << AArch64InstPrinter::getRegisterName(Reg);
         return false;
-    break;
-  case 'w':
-    // Output 32-bit general register operand, constant zero as wzr, or stack
-    // pointer as wsp. Ignored when used with other operand types.
-    if (!printModifiedGPRAsmOperand(MI->getOperand(OpNum), TRI,
-                                    AArch64::GPR32RegClass, O))
-      return false;
-    break;
-  case 'x':
-    // Output 64-bit general register operand, constant zero as xzr, or stack
-    // pointer as sp. Ignored when used with other operand types.
-    if (!printModifiedGPRAsmOperand(MI->getOperand(OpNum), TRI,
-                                    AArch64::GPR64RegClass, O))
-      return false;
-    break;
-  case 'H':
-    // Output higher numbered of a 64-bit general register pair
-  case 'Q':
-    // Output least significant register of a 64-bit general register pair
-  case 'R':
-    // Output most significant register of a 64-bit general register pair
-
-    // FIXME note: these three operand modifiers will require, to some extent,
-    // adding a paired GPR64 register class. Initial investigation suggests that
-    // assertions are hit unless it has a type and is made legal for that type
-    // in ISelLowering. After that step is made, the number of modifications
-    // needed explodes (operation legality, calling conventions, stores, reg
-    // copies ...).
-    llvm_unreachable("FIXME: Unimplemented register pairs");
-  case 'b':
-  case 'h':
-  case 's':
-  case 'd':
-  case 'q':
-    if (!printModifiedFPRAsmOperand(MI->getOperand(OpNum), TRI,
-                                    ExtraCode[0], O))
-      return false;
-    break;
-  case 'A':
-    // Output symbolic address with appropriate relocation modifier (also
-    // suitable for ADRP).
-    if (!printSymbolicAddress(MI->getOperand(OpNum), false, "", O))
-      return false;
-    break;
-  case 'L':
-    // Output bits 11:0 of symbolic address with appropriate :lo12: relocation
-    // modifier.
-    if (!printSymbolicAddress(MI->getOperand(OpNum), true, "lo12", O))
+      }
+      printOperand(MI, OpNum, O);
       return false;
-    break;
-  case 'G':
-    // Output bits 23:12 of symbolic address with appropriate :hi12: relocation
-    // modifier (currently only for TLS local exec).
-    if (!printSymbolicAddress(MI->getOperand(OpNum), true, "hi12", O))
+    case 'b': // Print B register.
+    case 'h': // Print H register.
+    case 's': // Print S register.
+    case 'd': // Print D register.
+    case 'q': // Print Q register.
+      if (MO.isReg()) {
+        const TargetRegisterClass *RC;
+        switch (ExtraCode[0]) {
+        case 'b':
+          RC = &AArch64::FPR8RegClass;
+          break;
+        case 'h':
+          RC = &AArch64::FPR16RegClass;
+          break;
+        case 's':
+          RC = &AArch64::FPR32RegClass;
+          break;
+        case 'd':
+          RC = &AArch64::FPR64RegClass;
+          break;
+        case 'q':
+          RC = &AArch64::FPR128RegClass;
+          break;
+        default:
+          return true;
+        }
+        return printAsmRegInClass(MO, RC, false /* vector */, O);
+      }
+      printOperand(MI, OpNum, O);
       return false;
-    break;
-  case 'a':
-    return PrintAsmMemoryOperand(MI, OpNum, AsmVariant, ExtraCode, O);
+    }
   }
 
-  // There's actually no operand modifier, which leads to a slightly eclectic
-  // set of behaviour which we have to handle here.
-  const MachineOperand &MO = MI->getOperand(OpNum);
-  switch (MO.getType()) {
-  default:
-    llvm_unreachable("Unexpected operand for inline assembly");
-  case MachineOperand::MO_Register:
-    // GCC prints the unmodified operand of a 'w' constraint as the vector
-    // register. Technically, we could allocate the argument as a VPR128, but
-    // that leads to extremely dodgy copies being generated to get the data
-    // there.
-    if (printModifiedFPRAsmOperand(MO, TRI, 'v', O))
-      O << AArch64InstPrinter::getRegisterName(MO.getReg());
-    break;
-  case MachineOperand::MO_Immediate:
-    O << '#' << MO.getImm();
-    break;
-  case MachineOperand::MO_FPImmediate:
-    assert(MO.getFPImm()->isExactlyValue(0.0) && "Only FP 0.0 expected");
-    O << "#0.0";
-    break;
-  case MachineOperand::MO_BlockAddress:
-  case MachineOperand::MO_ConstantPoolIndex:
-  case MachineOperand::MO_GlobalAddress:
-    return printSymbolicAddress(MO, false, "", O);
+  // According to ARM, we should emit x and v registers unless we have a
+  // modifier.
+  if (MO.isReg()) {
+    unsigned Reg = MO.getReg();
+
+    // If this is a w or x register, print an x register.
+    if (AArch64::GPR32allRegClass.contains(Reg) ||
+        AArch64::GPR64allRegClass.contains(Reg))
+      return printAsmMRegister(MO, 'x', O);
+
+    // If this is a b, h, s, d, or q register, print it as a v register.
+    return printAsmRegInClass(MO, &AArch64::FPR128RegClass, true /* vector */,
+                              O);
   }
 
+  printOperand(MI, OpNum, O);
   return false;
 }
 
@@ -246,15 +347,90 @@ bool AArch64AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
                                               unsigned AsmVariant,
                                               const char *ExtraCode,
                                               raw_ostream &O) {
-  // Currently both the memory constraints (m and Q) behave the same and amount
-  // to the address as a single register. In future, we may allow "m" to provide
-  // both a base and an offset.
+  if (ExtraCode && ExtraCode[0])
+    return true; // Unknown modifier.
+
   const MachineOperand &MO = MI->getOperand(OpNum);
-  assert(MO.isReg() && "unexpected inline assembly memory operand");
-  O << '[' << AArch64InstPrinter::getRegisterName(MO.getReg()) << ']';
+  assert(MO.isReg() && "unexpected inline asm memory operand");
+  O << "[" << AArch64InstPrinter::getRegisterName(MO.getReg()) << "]";
   return false;
 }
 
+void AArch64AsmPrinter::PrintDebugValueComment(const MachineInstr *MI,
+                                               raw_ostream &OS) {
+  unsigned NOps = MI->getNumOperands();
+  assert(NOps == 4);
+  OS << '\t' << MAI->getCommentString() << "DEBUG_VALUE: ";
+  // cast away const; DIetc do not take const operands for some reason.
+  DIVariable V(const_cast<MDNode *>(MI->getOperand(NOps - 1).getMetadata()));
+  OS << V.getName();
+  OS << " <- ";
+  // Frame address.  Currently handles register +- offset only.
+  assert(MI->getOperand(0).isReg() && MI->getOperand(1).isImm());
+  OS << '[';
+  printOperand(MI, 0, OS);
+  OS << '+';
+  printOperand(MI, 1, OS);
+  OS << ']';
+  OS << "+";
+  printOperand(MI, NOps - 2, OS);
+}
+
+void AArch64AsmPrinter::LowerSTACKMAP(MCStreamer &OutStreamer, StackMaps &SM,
+                                      const MachineInstr &MI) {
+  unsigned NumNOPBytes = MI.getOperand(1).getImm();
+
+  SM.recordStackMap(MI);
+  // Emit padding.
+  assert(NumNOPBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
+  for (unsigned i = 0; i < NumNOPBytes; i += 4)
+    EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::HINT).addImm(0));
+}
+
+// Lower a patchpoint of the form:
+// [<def>], <id>, <numBytes>, <target>, <numArgs>
+void AArch64AsmPrinter::LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM,
+                                        const MachineInstr &MI) {
+  SM.recordPatchPoint(MI);
+
+  PatchPointOpers Opers(&MI);
+
+  int64_t CallTarget = Opers.getMetaOper(PatchPointOpers::TargetPos).getImm();
+  unsigned EncodedBytes = 0;
+  if (CallTarget) {
+    assert((CallTarget & 0xFFFFFFFFFFFF) == CallTarget &&
+           "High 16 bits of call target should be zero.");
+    unsigned ScratchReg = MI.getOperand(Opers.getNextScratchIdx()).getReg();
+    EncodedBytes = 16;
+    // Materialize the jump address:
+    EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::MOVZWi)
+                                    .addReg(ScratchReg)
+                                    .addImm((CallTarget >> 32) & 0xFFFF)
+                                    .addImm(32));
+    EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::MOVKWi)
+                                    .addReg(ScratchReg)
+                                    .addReg(ScratchReg)
+                                    .addImm((CallTarget >> 16) & 0xFFFF)
+                                    .addImm(16));
+    EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::MOVKWi)
+                                    .addReg(ScratchReg)
+                                    .addReg(ScratchReg)
+                                    .addImm(CallTarget & 0xFFFF)
+                                    .addImm(0));
+    EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::BLR).addReg(ScratchReg));
+  }
+  // Emit padding.
+  unsigned NumBytes = Opers.getMetaOper(PatchPointOpers::NBytesPos).getImm();
+  assert(NumBytes >= EncodedBytes &&
+         "Patchpoint can't request size less than the length of a call.");
+  assert((NumBytes - EncodedBytes) % 4 == 0 &&
+         "Invalid number of NOP bytes requested!");
+  for (unsigned i = EncodedBytes; i < NumBytes; i += 4)
+    EmitToStreamer(OutStreamer, MCInstBuilder(AArch64::HINT).addImm(0));
+}
+
+// Simple pseudo-instructions have their lowering (with expansion to real
+// instructions) auto-generated.
 #include "AArch64GenMCPseudoLowering.inc"
 
 void AArch64AsmPrinter::EmitInstruction(const MachineInstr *MI) {
@@ -262,41 +438,87 @@ void AArch64AsmPrinter::EmitInstruction(const MachineInstr *MI) {
   if (emitPseudoExpansionLowering(OutStreamer, MI))
     return;
 
-  MCInst TmpInst;
-  LowerAArch64MachineInstrToMCInst(MI, TmpInst, *this);
-  EmitToStreamer(OutStreamer, TmpInst);
-}
+  if (AArch64FI->getLOHRelated().count(MI)) {
+    // Generate a label for LOH related instruction
+    MCSymbol *LOHLabel = GetTempSymbol("loh", LOHLabelCounter++);
+    // Associate the instruction with the label
+    LOHInstToLabel[MI] = LOHLabel;
+    OutStreamer.EmitLabel(LOHLabel);
+  }
 
-void AArch64AsmPrinter::EmitEndOfAsmFile(Module &M) {
-  if (Subtarget->isTargetELF()) {
-    const TargetLoweringObjectFileELF &TLOFELF =
-      static_cast<const TargetLoweringObjectFileELF &>(getObjFileLowering());
+  // Do any manual lowerings.
+  switch (MI->getOpcode()) {
+  default:
+    break;
+  case AArch64::DBG_VALUE: {
+    if (isVerbose() && OutStreamer.hasRawTextSupport()) {
+      SmallString<128> TmpStr;
+      raw_svector_ostream OS(TmpStr);
+      PrintDebugValueComment(MI, OS);
+      OutStreamer.EmitRawText(StringRef(OS.str()));
+    }
+    return;
+  }
 
-    MachineModuleInfoELF &MMIELF = MMI->getObjFileInfo<MachineModuleInfoELF>();
+  // Tail calls use pseudo instructions so they have the proper code-gen
+  // attributes (isCall, isReturn, etc.). We lower them to the real
+  // instruction here.
+  case AArch64::TCRETURNri: {
+    MCInst TmpInst;
+    TmpInst.setOpcode(AArch64::BR);
+    TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(0).getReg()));
+    EmitToStreamer(OutStreamer, TmpInst);
+    return;
+  }
+  case AArch64::TCRETURNdi: {
+    MCOperand Dest;
+    MCInstLowering.lowerOperand(MI->getOperand(0), Dest);
+    MCInst TmpInst;
+    TmpInst.setOpcode(AArch64::B);
+    TmpInst.addOperand(Dest);
+    EmitToStreamer(OutStreamer, TmpInst);
+    return;
+  }
+  case AArch64::TLSDESC_BLR: {
+    MCOperand Callee, Sym;
+    MCInstLowering.lowerOperand(MI->getOperand(0), Callee);
+    MCInstLowering.lowerOperand(MI->getOperand(1), Sym);
 
-    // Output stubs for external and common global variables.
-    MachineModuleInfoELF::SymbolListTy Stubs = MMIELF.GetGVStubList();
-    if (!Stubs.empty()) {
-      OutStreamer.SwitchSection(TLOFELF.getDataRelSection());
-      const DataLayout *TD = TM.getDataLayout();
+    // First emit a relocation-annotation. This expands to no code, but requests
+    // the following instruction gets an R_AARCH64_TLSDESC_CALL.
+    MCInst TLSDescCall;
+    TLSDescCall.setOpcode(AArch64::TLSDESCCALL);
+    TLSDescCall.addOperand(Sym);
+    EmitToStreamer(OutStreamer, TLSDescCall);
 
-      for (unsigned i = 0, e = Stubs.size(); i != e; ++i) {
-        OutStreamer.EmitLabel(Stubs[i].first);
-        OutStreamer.EmitSymbolValue(Stubs[i].second.getPointer(),
-                                    TD->getPointerSize(0));
-      }
-      Stubs.clear();
-    }
+    // Other than that it's just a normal indirect call to the function loaded
+    // from the descriptor.
+    MCInst BLR;
+    BLR.setOpcode(AArch64::BLR);
+    BLR.addOperand(Callee);
+    EmitToStreamer(OutStreamer, BLR);
+
+    return;
+  }
+
+  case TargetOpcode::STACKMAP:
+    return LowerSTACKMAP(OutStreamer, SM, *MI);
+
+  case TargetOpcode::PATCHPOINT:
+    return LowerPATCHPOINT(OutStreamer, SM, *MI);
   }
-}
 
-bool AArch64AsmPrinter::runOnMachineFunction(MachineFunction &MF) {
-  return AsmPrinter::runOnMachineFunction(MF);
+  // Finally, do the automated lowerings for everything else.
+  MCInst TmpInst;
+  MCInstLowering.Lower(MI, TmpInst);
+  EmitToStreamer(OutStreamer, TmpInst);
 }
 
 // Force static initialization.
 extern "C" void LLVMInitializeAArch64AsmPrinter() {
-    RegisterAsmPrinter<AArch64AsmPrinter> X(TheAArch64leTarget);
-    RegisterAsmPrinter<AArch64AsmPrinter> Y(TheAArch64beTarget);
-}
+  RegisterAsmPrinter<AArch64AsmPrinter> X(TheAArch64leTarget);
+  RegisterAsmPrinter<AArch64AsmPrinter> Y(TheAArch64beTarget);
 
+  RegisterAsmPrinter<AArch64AsmPrinter> Z(TheARM64leTarget);
+  RegisterAsmPrinter<AArch64AsmPrinter> W(TheARM64beTarget);
+}
diff --git a/lib/Target/AArch64/AArch64AsmPrinter.h b/lib/Target/AArch64/AArch64AsmPrinter.h
deleted file mode 100644
index 824f003..0000000
--- a/lib/Target/AArch64/AArch64AsmPrinter.h
+++ /dev/null
@@ -1,76 +0,0 @@
-// AArch64AsmPrinter.h - Print machine code to an AArch64 .s file -*- C++ -*-=//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines the AArch64 assembly printer class.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_AARCH64ASMPRINTER_H
-#define LLVM_AARCH64ASMPRINTER_H
-
-#include "AArch64.h"
-#include "AArch64TargetMachine.h"
-#include "llvm/CodeGen/AsmPrinter.h"
-#include "llvm/MC/MCStreamer.h"
-#include "llvm/Support/Compiler.h"
-
-namespace llvm {
-
-class MCOperand;
-
-class LLVM_LIBRARY_VISIBILITY AArch64AsmPrinter : public AsmPrinter {
-
-  /// Subtarget - Keep a pointer to the AArch64Subtarget around so that we can
-  /// make the right decision when printing asm code for different targets.
-  const AArch64Subtarget *Subtarget;
-
-  // emitPseudoExpansionLowering - tblgen'erated.
-  bool emitPseudoExpansionLowering(MCStreamer &OutStreamer,
-                                   const MachineInstr *MI);
-
-  public:
-  explicit AArch64AsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
-    : AsmPrinter(TM, Streamer) {
-    Subtarget = &TM.getSubtarget<AArch64Subtarget>();
-  }
-
-  bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp) const;
-
-  MCOperand lowerSymbolOperand(const MachineOperand &MO,
-                               const MCSymbol *Sym) const;
-
-  void EmitInstruction(const MachineInstr *MI);
-  void EmitEndOfAsmFile(Module &M);
-
-  bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
-                       unsigned AsmVariant, const char *ExtraCode,
-                       raw_ostream &O);
-  bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNum,
-                             unsigned AsmVariant, const char *ExtraCode,
-                             raw_ostream &O);
-
-  /// printSymbolicAddress - Given some kind of reasonably bare symbolic
-  /// reference, print out the appropriate asm string to represent it. If
-  /// appropriate, a relocation-specifier will be produced, composed of a
-  /// general class derived from the MO parameter and an instruction-specific
-  /// suffix, provided in Suffix. E.g. ":got_lo12:" if a Suffix of "lo12" is
-  /// given.
-  bool printSymbolicAddress(const MachineOperand &MO,
-                            bool PrintImmediatePrefix,
-                            StringRef Suffix, raw_ostream &O);
-
-  virtual const char *getPassName() const {
-    return "AArch64 Assembly Printer";
-  }
-
-  virtual bool runOnMachineFunction(MachineFunction &MF);
-};
-} // end namespace llvm
-
-#endif
diff --git a/lib/Target/AArch64/AArch64BranchFixupPass.cpp b/lib/Target/AArch64/AArch64BranchFixupPass.cpp
deleted file mode 100644
index c03cdde..0000000
--- a/lib/Target/AArch64/AArch64BranchFixupPass.cpp
+++ /dev/null
@@ -1,600 +0,0 @@
-//===-- AArch64BranchFixupPass.cpp - AArch64 branch fixup -----------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains a pass that fixes AArch64 branches which have ended up out
-// of range for their immediate operands.
-//
-//===----------------------------------------------------------------------===//
-
-#define DEBUG_TYPE "aarch64-branch-fixup"
-#include "AArch64.h"
-#include "AArch64InstrInfo.h"
-#include "Utils/AArch64BaseInfo.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/Format.h"
-#include "llvm/Support/raw_ostream.h"
-using namespace llvm;
-
-STATISTIC(NumSplit,      "Number of uncond branches inserted");
-STATISTIC(NumCBrFixed,   "Number of cond branches fixed");
-
-/// Return the worst case padding that could result from unknown offset bits.
-/// This does not include alignment padding caused by known offset bits.
-///
-/// @param LogAlign log2(alignment)
-/// @param KnownBits Number of known low offset bits.
-static inline unsigned UnknownPadding(unsigned LogAlign, unsigned KnownBits) {
-  if (KnownBits < LogAlign)
-    return (1u << LogAlign) - (1u << KnownBits);
-  return 0;
-}
-
-namespace {
-  /// Due to limited PC-relative displacements, conditional branches to distant
-  /// blocks may need converting into an unconditional equivalent. For example:
-  ///     tbz w1, #0, far_away
-  /// becomes
-  ///     tbnz w1, #0, skip
-  ///     b far_away
-  ///   skip:
-  class AArch64BranchFixup : public MachineFunctionPass {
-    /// Information about the offset and size of a single basic block.
-    struct BasicBlockInfo {
-      /// Distance from the beginning of the function to the beginning of this
-      /// basic block.
-      ///
-      /// Offsets are computed assuming worst case padding before an aligned
-      /// block. This means that subtracting basic block offsets always gives a
-      /// conservative estimate of the real distance which may be smaller.
-      ///
-      /// Because worst case padding is used, the computed offset of an aligned
-      /// block may not actually be aligned.
-      unsigned Offset;
-
-      /// Size of the basic block in bytes.  If the block contains inline
-      /// assembly, this is a worst case estimate.
-      ///
-      /// The size does not include any alignment padding whether from the
-      /// beginning of the block, or from an aligned jump table at the end.
-      unsigned Size;
-
-      /// The number of low bits in Offset that are known to be exact.  The
-      /// remaining bits of Offset are an upper bound.
-      uint8_t KnownBits;
-
-      /// When non-zero, the block contains instructions (inline asm) of unknown
-      /// size.  The real size may be smaller than Size bytes by a multiple of 1
-      /// << Unalign.
-      uint8_t Unalign;
-
-      BasicBlockInfo() : Offset(0), Size(0), KnownBits(0), Unalign(0) {}
-
-      /// Compute the number of known offset bits internally to this block.
-      /// This number should be used to predict worst case padding when
-      /// splitting the block.
-      unsigned internalKnownBits() const {
-        unsigned Bits = Unalign ? Unalign : KnownBits;
-        // If the block size isn't a multiple of the known bits, assume the
-        // worst case padding.
-        if (Size & ((1u << Bits) - 1))
-          Bits = countTrailingZeros(Size);
-        return Bits;
-      }
-
-      /// Compute the offset immediately following this block.  If LogAlign is
-      /// specified, return the offset the successor block will get if it has
-      /// this alignment.
-      unsigned postOffset(unsigned LogAlign = 0) const {
-        unsigned PO = Offset + Size;
-        if (!LogAlign)
-          return PO;
-        // Add alignment padding from the terminator.
-        return PO + UnknownPadding(LogAlign, internalKnownBits());
-      }
-
-      /// Compute the number of known low bits of postOffset.  If this block
-      /// contains inline asm, the number of known bits drops to the
-      /// instruction alignment.  An aligned terminator may increase the number
-      /// of know bits.
-      /// If LogAlign is given, also consider the alignment of the next block.
-      unsigned postKnownBits(unsigned LogAlign = 0) const {
-        return std::max(LogAlign, internalKnownBits());
-      }
-    };
-
-    std::vector<BasicBlockInfo> BBInfo;
-
-    /// One per immediate branch, keeping the machine instruction pointer,
-    /// conditional or unconditional, the max displacement, and (if IsCond is
-    /// true) the corresponding inverted branch opcode.
-    struct ImmBranch {
-      MachineInstr *MI;
-      unsigned OffsetBits : 31;
-      bool IsCond : 1;
-      ImmBranch(MachineInstr *mi, unsigned offsetbits, bool cond)
-        : MI(mi), OffsetBits(offsetbits), IsCond(cond) {}
-    };
-
-    /// Keep track of all the immediate branch instructions.
-    ///
-    std::vector<ImmBranch> ImmBranches;
-
-    MachineFunction *MF;
-    const AArch64InstrInfo *TII;
-  public:
-    static char ID;
-    AArch64BranchFixup() : MachineFunctionPass(ID) {}
-
-    virtual bool runOnMachineFunction(MachineFunction &MF);
-
-    virtual const char *getPassName() const {
-      return "AArch64 branch fixup pass";
-    }
-
-  private:
-    void initializeFunctionInfo();
-    MachineBasicBlock *splitBlockBeforeInstr(MachineInstr *MI);
-    void adjustBBOffsetsAfter(MachineBasicBlock *BB);
-    bool isBBInRange(MachineInstr *MI, MachineBasicBlock *BB,
-                     unsigned OffsetBits);
-    bool fixupImmediateBr(ImmBranch &Br);
-    bool fixupConditionalBr(ImmBranch &Br);
-
-    void computeBlockSize(MachineBasicBlock *MBB);
-    unsigned getOffsetOf(MachineInstr *MI) const;
-    void dumpBBs();
-    void verify();
-  };
-  char AArch64BranchFixup::ID = 0;
-}
-
-/// check BBOffsets
-void AArch64BranchFixup::verify() {
-#ifndef NDEBUG
-  for (MachineFunction::iterator MBBI = MF->begin(), E = MF->end();
-       MBBI != E; ++MBBI) {
-    MachineBasicBlock *MBB = MBBI;
-    unsigned MBBId = MBB->getNumber();
-    assert(!MBBId || BBInfo[MBBId - 1].postOffset() <= BBInfo[MBBId].Offset);
-  }
-#endif
-}
-
-/// print block size and offset information - debugging
-void AArch64BranchFixup::dumpBBs() {
-  DEBUG({
-    for (unsigned J = 0, E = BBInfo.size(); J !=E; ++J) {
-      const BasicBlockInfo &BBI = BBInfo[J];
-      dbgs() << format("%08x BB#%u\t", BBI.Offset, J)
-             << " kb=" << unsigned(BBI.KnownBits)
-             << " ua=" << unsigned(BBI.Unalign)
-             << format(" size=%#x\n", BBInfo[J].Size);
-    }
-  });
-}
-
-/// Returns an instance of the branch fixup pass.
-FunctionPass *llvm::createAArch64BranchFixupPass() {
-  return new AArch64BranchFixup();
-}
-
-bool AArch64BranchFixup::runOnMachineFunction(MachineFunction &mf) {
-  MF = &mf;
-  DEBUG(dbgs() << "***** AArch64BranchFixup ******");
-  TII = (const AArch64InstrInfo*)MF->getTarget().getInstrInfo();
-
-  // This pass invalidates liveness information when it splits basic blocks.
-  MF->getRegInfo().invalidateLiveness();
-
-  // Renumber all of the machine basic blocks in the function, guaranteeing that
-  // the numbers agree with the position of the block in the function.
-  MF->RenumberBlocks();
-
-  // Do the initial scan of the function, building up information about the
-  // sizes of each block and location of each immediate branch.
-  initializeFunctionInfo();
-
-  // Iteratively fix up branches until there is no change.
-  unsigned NoBRIters = 0;
-  bool MadeChange = false;
-  while (true) {
-    DEBUG(dbgs() << "Beginning iteration #" << NoBRIters << '\n');
-    bool BRChange = false;
-    for (unsigned i = 0, e = ImmBranches.size(); i != e; ++i)
-      BRChange |= fixupImmediateBr(ImmBranches[i]);
-    if (BRChange && ++NoBRIters > 30)
-      report_fatal_error("Branch Fix Up pass failed to converge!");
-    DEBUG(dumpBBs());
-
-    if (!BRChange)
-      break;
-    MadeChange = true;
-  }
-
-  // After a while, this might be made debug-only, but it is not expensive.
-  verify();
-
-  DEBUG(dbgs() << '\n'; dumpBBs());
-
-  BBInfo.clear();
-  ImmBranches.clear();
-
-  return MadeChange;
-}
-
-/// Return true if the specified basic block can fallthrough into the block
-/// immediately after it.
-static bool BBHasFallthrough(MachineBasicBlock *MBB) {
-  // Get the next machine basic block in the function.
-  MachineFunction::iterator MBBI = MBB;
-  // Can't fall off end of function.
-  if (std::next(MBBI) == MBB->getParent()->end())
-    return false;
-
-  MachineBasicBlock *NextBB = std::next(MBBI);
-  for (MachineBasicBlock::succ_iterator I = MBB->succ_begin(),
-       E = MBB->succ_end(); I != E; ++I)
-    if (*I == NextBB)
-      return true;
-
-  return false;
-}
-
-/// Do the initial scan of the function, building up information about the sizes
-/// of each block, and each immediate branch.
-void AArch64BranchFixup::initializeFunctionInfo() {
-  BBInfo.clear();
-  BBInfo.resize(MF->getNumBlockIDs());
-
-  // First thing, compute the size of all basic blocks, and see if the function
-  // has any inline assembly in it. If so, we have to be conservative about
-  // alignment assumptions, as we don't know for sure the size of any
-  // instructions in the inline assembly.
-  for (MachineFunction::iterator I = MF->begin(), E = MF->end(); I != E; ++I)
-    computeBlockSize(I);
-
-  // The known bits of the entry block offset are determined by the function
-  // alignment.
-  BBInfo.front().KnownBits = MF->getAlignment();
-
-  // Compute block offsets and known bits.
-  adjustBBOffsetsAfter(MF->begin());
-
-  // Now go back through the instructions and build up our data structures.
-  for (MachineFunction::iterator MBBI = MF->begin(), E = MF->end();
-       MBBI != E; ++MBBI) {
-    MachineBasicBlock &MBB = *MBBI;
-
-    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
-         I != E; ++I) {
-      if (I->isDebugValue())
-        continue;
-
-      int Opc = I->getOpcode();
-      if (I->isBranch()) {
-        bool IsCond = false;
-
-        // The offsets encoded in instructions here scale by the instruction
-        // size (4 bytes), effectively increasing their range by 2 bits.
-        unsigned Bits = 0;
-        switch (Opc) {
-        default:
-          continue;  // Ignore other JT branches
-        case AArch64::TBZxii:
-        case AArch64::TBZwii:
-        case AArch64::TBNZxii:
-        case AArch64::TBNZwii:
-          IsCond = true;
-          Bits = 14 + 2;
-          break;
-        case AArch64::Bcc:
-        case AArch64::CBZx:
-        case AArch64::CBZw:
-        case AArch64::CBNZx:
-        case AArch64::CBNZw:
-          IsCond = true;
-          Bits = 19 + 2;
-          break;
-        case AArch64::Bimm:
-          Bits = 26 + 2;
-          break;
-        }
-
-        // Record this immediate branch.
-        ImmBranches.push_back(ImmBranch(I, Bits, IsCond));
-      }
-    }
-  }
-}
-
-/// Compute the size and some alignment information for MBB.  This function
-/// updates BBInfo directly.
-void AArch64BranchFixup::computeBlockSize(MachineBasicBlock *MBB) {
-  BasicBlockInfo &BBI = BBInfo[MBB->getNumber()];
-  BBI.Size = 0;
-  BBI.Unalign = 0;
-
-  for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;
-       ++I) {
-    BBI.Size += TII->getInstSizeInBytes(*I);
-    // For inline asm, GetInstSizeInBytes returns a conservative estimate.
-    // The actual size may be smaller, but still a multiple of the instr size.
-    if (I->isInlineAsm())
-      BBI.Unalign = 2;
-  }
-}
-
-/// Return the current offset of the specified machine instruction from the
-/// start of the function.  This offset changes as stuff is moved around inside
-/// the function.
-unsigned AArch64BranchFixup::getOffsetOf(MachineInstr *MI) const {
-  MachineBasicBlock *MBB = MI->getParent();
-
-  // The offset is composed of two things: the sum of the sizes of all MBB's
-  // before this instruction's block, and the offset from the start of the block
-  // it is in.
-  unsigned Offset = BBInfo[MBB->getNumber()].Offset;
-
-  // Sum instructions before MI in MBB.
-  for (MachineBasicBlock::iterator I = MBB->begin(); &*I != MI; ++I) {
-    assert(I != MBB->end() && "Didn't find MI in its own basic block?");
-    Offset += TII->getInstSizeInBytes(*I);
-  }
-  return Offset;
-}
-
-/// Split the basic block containing MI into two blocks, which are joined by
-/// an unconditional branch.  Update data structures and renumber blocks to
-/// account for this change and returns the newly created block.
-MachineBasicBlock *
-AArch64BranchFixup::splitBlockBeforeInstr(MachineInstr *MI) {
-  MachineBasicBlock *OrigBB = MI->getParent();
-
-  // Create a new MBB for the code after the OrigBB.
-  MachineBasicBlock *NewBB =
-    MF->CreateMachineBasicBlock(OrigBB->getBasicBlock());
-  MachineFunction::iterator MBBI = OrigBB; ++MBBI;
-  MF->insert(MBBI, NewBB);
-
-  // Splice the instructions starting with MI over to NewBB.
-  NewBB->splice(NewBB->end(), OrigBB, MI, OrigBB->end());
-
-  // Add an unconditional branch from OrigBB to NewBB.
-  // Note the new unconditional branch is not being recorded.
-  // There doesn't seem to be meaningful DebugInfo available; this doesn't
-  // correspond to anything in the source.
-  BuildMI(OrigBB, DebugLoc(), TII->get(AArch64::Bimm)).addMBB(NewBB);
-  ++NumSplit;
-
-  // Update the CFG.  All succs of OrigBB are now succs of NewBB.
-  NewBB->transferSuccessors(OrigBB);
-
-  // OrigBB branches to NewBB.
-  OrigBB->addSuccessor(NewBB);
-
-  // Update internal data structures to account for the newly inserted MBB.
-  MF->RenumberBlocks(NewBB);
-
-  // Insert an entry into BBInfo to align it properly with the (newly
-  // renumbered) block numbers.
-  BBInfo.insert(BBInfo.begin() + NewBB->getNumber(), BasicBlockInfo());
-
-  // Figure out how large the OrigBB is.  As the first half of the original
-  // block, it cannot contain a tablejump.  The size includes
-  // the new jump we added.  (It should be possible to do this without
-  // recounting everything, but it's very confusing, and this is rarely
-  // executed.)
-  computeBlockSize(OrigBB);
-
-  // Figure out how large the NewMBB is.  As the second half of the original
-  // block, it may contain a tablejump.
-  computeBlockSize(NewBB);
-
-  // All BBOffsets following these blocks must be modified.
-  adjustBBOffsetsAfter(OrigBB);
-
-  return NewBB;
-}
-
-void AArch64BranchFixup::adjustBBOffsetsAfter(MachineBasicBlock *BB) {
-  unsigned BBNum = BB->getNumber();
-  for(unsigned i = BBNum + 1, e = MF->getNumBlockIDs(); i < e; ++i) {
-    // Get the offset and known bits at the end of the layout predecessor.
-    // Include the alignment of the current block.
-    unsigned LogAlign = MF->getBlockNumbered(i)->getAlignment();
-    unsigned Offset = BBInfo[i - 1].postOffset(LogAlign);
-    unsigned KnownBits = BBInfo[i - 1].postKnownBits(LogAlign);
-
-    // This is where block i begins.  Stop if the offset is already correct,
-    // and we have updated 2 blocks.  This is the maximum number of blocks
-    // changed before calling this function.
-    if (i > BBNum + 2 &&
-        BBInfo[i].Offset == Offset &&
-        BBInfo[i].KnownBits == KnownBits)
-      break;
-
-    BBInfo[i].Offset = Offset;
-    BBInfo[i].KnownBits = KnownBits;
-  }
-}
-
-/// Returns true if the distance between specific MI and specific BB can fit in
-/// MI's displacement field.
-bool AArch64BranchFixup::isBBInRange(MachineInstr *MI,
-                                     MachineBasicBlock *DestBB,
-                                     unsigned OffsetBits) {
-  int64_t BrOffset   = getOffsetOf(MI);
-  int64_t DestOffset = BBInfo[DestBB->getNumber()].Offset;
-
-  DEBUG(dbgs() << "Branch of destination BB#" << DestBB->getNumber()
-               << " from BB#" << MI->getParent()->getNumber()
-               << " bits available=" << OffsetBits
-               << " from " << getOffsetOf(MI) << " to " << DestOffset
-               << " offset " << int(DestOffset-BrOffset) << "\t" << *MI);
-
-  return isIntN(OffsetBits, DestOffset - BrOffset);
-}
-
-/// Fix up an immediate branch whose destination is too far away to fit in its
-/// displacement field.
-bool AArch64BranchFixup::fixupImmediateBr(ImmBranch &Br) {
-  MachineInstr *MI = Br.MI;
-  MachineBasicBlock *DestBB = 0;
-  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
-    if (MI->getOperand(i).isMBB()) {
-      DestBB = MI->getOperand(i).getMBB();
-      break;
-    }
-  }
-  assert(DestBB && "Branch with no destination BB?");
-
-  // Check to see if the DestBB is already in-range.
-  if (isBBInRange(MI, DestBB, Br.OffsetBits))
-    return false;
-
-  assert(Br.IsCond && "Only conditional branches should need fixup");
-  return fixupConditionalBr(Br);
-}
-
-/// Fix up a conditional branch whose destination is too far away to fit in its
-/// displacement field. It is converted to an inverse conditional branch + an
-/// unconditional branch to the destination.
-bool
-AArch64BranchFixup::fixupConditionalBr(ImmBranch &Br) {
-  MachineInstr *MI = Br.MI;
-  MachineBasicBlock *MBB = MI->getParent();
-  unsigned CondBrMBBOperand = 0;
-
-  // The general idea is to add an unconditional branch to the destination and
-  // invert the conditional branch to jump over it. Complications occur around
-  // fallthrough and unreachable ends to the block.
-  //   b.lt L1
-  //   =>
-  //   b.ge L2
-  //   b   L1
-  // L2:
-
-  // First we invert the conditional branch, by creating a replacement if
-  // necessary. This if statement contains all the special handling of different
-  // branch types.
-  if (MI->getOpcode() == AArch64::Bcc) {
-    // The basic block is operand number 1 for Bcc
-    CondBrMBBOperand = 1;
-
-    A64CC::CondCodes CC = (A64CC::CondCodes)MI->getOperand(0).getImm();
-    CC = A64InvertCondCode(CC);
-    MI->getOperand(0).setImm(CC);
-  } else {
-    MachineInstrBuilder InvertedMI;
-    int InvertedOpcode;
-    switch (MI->getOpcode()) {
-    default: llvm_unreachable("Unknown branch type");
-    case AArch64::TBZxii: InvertedOpcode = AArch64::TBNZxii; break;
-    case AArch64::TBZwii: InvertedOpcode = AArch64::TBNZwii; break;
-    case AArch64::TBNZxii: InvertedOpcode = AArch64::TBZxii; break;
-    case AArch64::TBNZwii: InvertedOpcode = AArch64::TBZwii; break;
-    case AArch64::CBZx: InvertedOpcode = AArch64::CBNZx; break;
-    case AArch64::CBZw: InvertedOpcode = AArch64::CBNZw; break;
-    case AArch64::CBNZx: InvertedOpcode = AArch64::CBZx; break;
-    case AArch64::CBNZw: InvertedOpcode = AArch64::CBZw; break;
-    }
-
-    InvertedMI = BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(InvertedOpcode));
-    for (unsigned i = 0, e= MI->getNumOperands(); i != e; ++i) {
-      InvertedMI.addOperand(MI->getOperand(i));
-      if (MI->getOperand(i).isMBB())
-        CondBrMBBOperand = i;
-    }
-
-    MI->eraseFromParent();
-    MI = Br.MI = InvertedMI;
-  }
-
-  // If the branch is at the end of its MBB and that has a fall-through block,
-  // direct the updated conditional branch to the fall-through
-  // block. Otherwise, split the MBB before the next instruction.
-  MachineInstr *BMI = &MBB->back();
-  bool NeedSplit = (BMI != MI) || !BBHasFallthrough(MBB);
-
-  ++NumCBrFixed;
-  if (BMI != MI) {
-    if (std::next(MachineBasicBlock::iterator(MI)) == std::prev(MBB->end()) &&
-        BMI->getOpcode() == AArch64::Bimm) {
-      // Last MI in the BB is an unconditional branch. We can swap destinations:
-      // b.eq L1 (temporarily b.ne L1 after first change)
-      // b   L2
-      // =>
-      // b.ne L2
-      // b   L1
-      MachineBasicBlock *NewDest = BMI->getOperand(0).getMBB();
-      if (isBBInRange(MI, NewDest, Br.OffsetBits)) {
-        DEBUG(dbgs() << "  Invert Bcc condition and swap its destination with "
-                     << *BMI);
-        MachineBasicBlock *DestBB = MI->getOperand(CondBrMBBOperand).getMBB();
-        BMI->getOperand(0).setMBB(DestBB);
-        MI->getOperand(CondBrMBBOperand).setMBB(NewDest);
-        return true;
-      }
-    }
-  }
-
-  if (NeedSplit) {
-    MachineBasicBlock::iterator MBBI = MI; ++MBBI;
-    splitBlockBeforeInstr(MBBI);
-    // No need for the branch to the next block. We're adding an unconditional
-    // branch to the destination.
-    int delta = TII->getInstSizeInBytes(MBB->back());
-    BBInfo[MBB->getNumber()].Size -= delta;
-    MBB->back().eraseFromParent();
-    // BBInfo[SplitBB].Offset is wrong temporarily, fixed below
-  }
-
-  // After splitting and removing the unconditional branch from the original BB,
-  // the structure is now:
-  // oldbb:
-  //   [things]
-  //   b.invertedCC L1
-  // splitbb/fallthroughbb:
-  //   [old b L2/real continuation]
-  //
-  // We now have to change the conditional branch to point to splitbb and add an
-  // unconditional branch after it to L1, giving the final structure:
-  // oldbb:
-  //   [things]
-  //   b.invertedCC splitbb
-  //   b L1
-  // splitbb/fallthroughbb:
-  //   [old b L2/real continuation]
-  MachineBasicBlock *NextBB = std::next(MachineFunction::iterator(MBB));
-
-  DEBUG(dbgs() << "  Insert B to BB#"
-               << MI->getOperand(CondBrMBBOperand).getMBB()->getNumber()
-               << " also invert condition and change dest. to BB#"
-               << NextBB->getNumber() << "\n");
-
-  // Insert a new unconditional branch and fixup the destination of the
-  // conditional one.  Also update the ImmBranch as well as adding a new entry
-  // for the new branch.
-  BuildMI(MBB, DebugLoc(), TII->get(AArch64::Bimm))
-    .addMBB(MI->getOperand(CondBrMBBOperand).getMBB());
-  MI->getOperand(CondBrMBBOperand).setMBB(NextBB);
-
-  BBInfo[MBB->getNumber()].Size += TII->getInstSizeInBytes(MBB->back());
-
-  // 26 bits written down in Bimm, specifying a multiple of 4.
-  unsigned OffsetBits = 26 + 2;
-  ImmBranches.push_back(ImmBranch(&MBB->back(), OffsetBits, false));
-
-  adjustBBOffsetsAfter(MBB);
-  return true;
-}
diff --git a/lib/Target/ARM64/ARM64BranchRelaxation.cpp b/lib/Target/AArch64/AArch64BranchRelaxation.cpp
index a9bbef5..5209452 100644
--- a/lib/Target/ARM64/ARM64BranchRelaxation.cpp
+++ b/lib/Target/AArch64/AArch64BranchRelaxation.cpp
@@ -1,4 +1,4 @@
-//===-- ARM64BranchRelaxation.cpp - ARM64 branch relaxation ---------------===//
+//===-- AArch64BranchRelaxation.cpp - AArch64 branch relaxation -----------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -9,10 +9,9 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "arm64-branch-relax"
-#include "ARM64.h"
-#include "ARM64InstrInfo.h"
-#include "ARM64MachineFunctionInfo.h"
+#include "AArch64.h"
+#include "AArch64InstrInfo.h"
+#include "AArch64MachineFunctionInfo.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -24,27 +23,29 @@
 #include "llvm/Support/CommandLine.h"
 using namespace llvm;
 
+#define DEBUG_TYPE "aarch64-branch-relax"
+
 static cl::opt<bool>
-BranchRelaxation("arm64-branch-relax", cl::Hidden, cl::init(true),
+BranchRelaxation("aarch64-branch-relax", cl::Hidden, cl::init(true),
                  cl::desc("Relax out of range conditional branches"));
 
 static cl::opt<unsigned>
-TBZDisplacementBits("arm64-tbz-offset-bits", cl::Hidden, cl::init(14),
+TBZDisplacementBits("aarch64-tbz-offset-bits", cl::Hidden, cl::init(14),
                     cl::desc("Restrict range of TB[N]Z instructions (DEBUG)"));
 
 static cl::opt<unsigned>
-CBZDisplacementBits("arm64-cbz-offset-bits", cl::Hidden, cl::init(19),
+CBZDisplacementBits("aarch64-cbz-offset-bits", cl::Hidden, cl::init(19),
                     cl::desc("Restrict range of CB[N]Z instructions (DEBUG)"));
 
 static cl::opt<unsigned>
-BCCDisplacementBits("arm64-bcc-offset-bits", cl::Hidden, cl::init(19),
+BCCDisplacementBits("aarch64-bcc-offset-bits", cl::Hidden, cl::init(19),
                     cl::desc("Restrict range of Bcc instructions (DEBUG)"));
 
 STATISTIC(NumSplit, "Number of basic blocks split");
 STATISTIC(NumRelaxed, "Number of conditional branches relaxed");
 
 namespace {
-class ARM64BranchRelaxation : public MachineFunctionPass {
+class AArch64BranchRelaxation : public MachineFunctionPass {
   /// BasicBlockInfo - Information about the offset and size of a single
   /// basic block.
   struct BasicBlockInfo {
@@ -76,41 +77,39 @@ class ARM64BranchRelaxation : public MachineFunctionPass {
   SmallVector<BasicBlockInfo, 16> BlockInfo;
 
   MachineFunction *MF;
-  const ARM64InstrInfo *TII;
+  const AArch64InstrInfo *TII;
 
   bool relaxBranchInstructions();
   void scanFunction();
   MachineBasicBlock *splitBlockBeforeInstr(MachineInstr *MI);
-  void adjustBlockOffsets(MachineBasicBlock *BB);
+  void adjustBlockOffsets(MachineBasicBlock &MBB);
   bool isBlockInRange(MachineInstr *MI, MachineBasicBlock *BB, unsigned Disp);
   bool fixupConditionalBranch(MachineInstr *MI);
-  void computeBlockSize(MachineBasicBlock *MBB);
+  void computeBlockSize(const MachineBasicBlock &MBB);
   unsigned getInstrOffset(MachineInstr *MI) const;
   void dumpBBs();
   void verify();
 
 public:
   static char ID;
-  ARM64BranchRelaxation() : MachineFunctionPass(ID) {}
+  AArch64BranchRelaxation() : MachineFunctionPass(ID) {}
 
-  virtual bool runOnMachineFunction(MachineFunction &MF);
+  bool runOnMachineFunction(MachineFunction &MF) override;
 
-  virtual const char *getPassName() const {
-    return "ARM64 branch relaxation pass";
+  const char *getPassName() const override {
+    return "AArch64 branch relaxation pass";
   }
 };
-char ARM64BranchRelaxation::ID = 0;
+char AArch64BranchRelaxation::ID = 0;
 }
 
 /// verify - check BBOffsets, BBSizes, alignment of islands
-void ARM64BranchRelaxation::verify() {
+void AArch64BranchRelaxation::verify() {
 #ifndef NDEBUG
   unsigned PrevNum = MF->begin()->getNumber();
-  for (MachineFunction::iterator MBBI = MF->begin(), E = MF->end(); MBBI != E;
-       ++MBBI) {
-    MachineBasicBlock *MBB = MBBI;
-    unsigned Align = MBB->getAlignment();
-    unsigned Num = MBB->getNumber();
+  for (MachineBasicBlock &MBB : *MF) {
+    unsigned Align = MBB.getAlignment();
+    unsigned Num = MBB.getNumber();
     assert(BlockInfo[Num].Offset % (1u << Align) == 0);
     assert(!Num || BlockInfo[PrevNum].postOffset() <= BlockInfo[Num].Offset);
     PrevNum = Num;
@@ -119,8 +118,8 @@ void ARM64BranchRelaxation::verify() {
 }
 
 /// print block size and offset information - debugging
-void ARM64BranchRelaxation::dumpBBs() {
-  for (auto &MBB: *MF) {
+void AArch64BranchRelaxation::dumpBBs() {
+  for (auto &MBB : *MF) {
     const BasicBlockInfo &BBI = BlockInfo[MBB.getNumber()];
     dbgs() << format("BB#%u\toffset=%08x\t", MBB.getNumber(), BBI.Offset)
            << format("size=%#x\n", BBI.Size);
@@ -133,14 +132,12 @@ static bool BBHasFallthrough(MachineBasicBlock *MBB) {
   // Get the next machine basic block in the function.
   MachineFunction::iterator MBBI = MBB;
   // Can't fall off end of function.
-  if (std::next(MBBI) == MBB->getParent()->end())
+  MachineBasicBlock *NextBB = std::next(MBBI);
+  if (NextBB == MBB->getParent()->end())
     return false;
 
-  MachineBasicBlock *NextBB = std::next(MBBI);
-  for (MachineBasicBlock::succ_iterator I = MBB->succ_begin(),
-                                        E = MBB->succ_end();
-       I != E; ++I)
-    if (*I == NextBB)
+  for (MachineBasicBlock *S : MBB->successors()) 
+    if (S == NextBB)
       return true;
 
   return false;
@@ -148,7 +145,7 @@ static bool BBHasFallthrough(MachineBasicBlock *MBB) {
 
 /// scanFunction - Do the initial scan of the function, building up
 /// information about each block.
-void ARM64BranchRelaxation::scanFunction() {
+void AArch64BranchRelaxation::scanFunction() {
   BlockInfo.clear();
   BlockInfo.resize(MF->getNumBlockIDs());
 
@@ -156,27 +153,26 @@ void ARM64BranchRelaxation::scanFunction() {
   // has any inline assembly in it. If so, we have to be conservative about
   // alignment assumptions, as we don't know for sure the size of any
   // instructions in the inline assembly.
-  for (MachineFunction::iterator I = MF->begin(), E = MF->end(); I != E; ++I)
-    computeBlockSize(I);
+  for (MachineBasicBlock &MBB : *MF)
+    computeBlockSize(MBB);
 
   // Compute block offsets and known bits.
-  adjustBlockOffsets(MF->begin());
+  adjustBlockOffsets(*MF->begin());
 }
 
 /// computeBlockSize - Compute the size for MBB.
 /// This function updates BlockInfo directly.
-void ARM64BranchRelaxation::computeBlockSize(MachineBasicBlock *MBB) {
+void AArch64BranchRelaxation::computeBlockSize(const MachineBasicBlock &MBB) {
   unsigned Size = 0;
-  for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;
-       ++I)
-    Size += TII->GetInstSizeInBytes(I);
-  BlockInfo[MBB->getNumber()].Size = Size;
+  for (const MachineInstr &MI : MBB)
+    Size += TII->GetInstSizeInBytes(&MI);
+  BlockInfo[MBB.getNumber()].Size = Size;
 }
 
 /// getInstrOffset - Return the current offset of the specified machine
 /// instruction from the start of the function.  This offset changes as stuff is
 /// moved around inside the function.
-unsigned ARM64BranchRelaxation::getInstrOffset(MachineInstr *MI) const {
+unsigned AArch64BranchRelaxation::getInstrOffset(MachineInstr *MI) const {
   MachineBasicBlock *MBB = MI->getParent();
 
   // The offset is composed of two things: the sum of the sizes of all MBB's
@@ -192,17 +188,15 @@ unsigned ARM64BranchRelaxation::getInstrOffset(MachineInstr *MI) const {
   return Offset;
 }
 
-void ARM64BranchRelaxation::adjustBlockOffsets(MachineBasicBlock *Start) {
-  unsigned PrevNum = Start->getNumber();
-  MachineFunction::iterator MBBI = Start, E = MF->end();
-  for (++MBBI; MBBI != E; ++MBBI) {
-    MachineBasicBlock *MBB = MBBI;
-    unsigned Num = MBB->getNumber();
+void AArch64BranchRelaxation::adjustBlockOffsets(MachineBasicBlock &Start) {
+  unsigned PrevNum = Start.getNumber();
+  for (auto &MBB : make_range(MachineFunction::iterator(Start), MF->end())) {
+    unsigned Num = MBB.getNumber();
     if (!Num) // block zero is never changed from offset zero.
       continue;
     // Get the offset and known bits at the end of the layout predecessor.
     // Include the alignment of the current block.
-    unsigned LogAlign = MBBI->getAlignment();
+    unsigned LogAlign = MBB.getAlignment();
     BlockInfo[Num].Offset = BlockInfo[PrevNum].postOffset(LogAlign);
     PrevNum = Num;
   }
@@ -215,7 +209,7 @@ void ARM64BranchRelaxation::adjustBlockOffsets(MachineBasicBlock *Start) {
 /// and must be updated by the caller! Other transforms follow using this
 /// utility function, so no point updating now rather than waiting.
 MachineBasicBlock *
-ARM64BranchRelaxation::splitBlockBeforeInstr(MachineInstr *MI) {
+AArch64BranchRelaxation::splitBlockBeforeInstr(MachineInstr *MI) {
   MachineBasicBlock *OrigBB = MI->getParent();
 
   // Create a new MBB for the code after the OrigBB.
@@ -232,7 +226,7 @@ ARM64BranchRelaxation::splitBlockBeforeInstr(MachineInstr *MI) {
   // Note the new unconditional branch is not being recorded.
   // There doesn't seem to be meaningful DebugInfo available; this doesn't
   // correspond to anything in the source.
-  BuildMI(OrigBB, DebugLoc(), TII->get(ARM64::B)).addMBB(NewBB);
+  BuildMI(OrigBB, DebugLoc(), TII->get(AArch64::B)).addMBB(NewBB);
 
   // Insert an entry into BlockInfo to align it properly with the block numbers.
   BlockInfo.insert(BlockInfo.begin() + NewBB->getNumber(), BasicBlockInfo());
@@ -242,14 +236,14 @@ ARM64BranchRelaxation::splitBlockBeforeInstr(MachineInstr *MI) {
   // the new jump we added.  (It should be possible to do this without
   // recounting everything, but it's very confusing, and this is rarely
   // executed.)
-  computeBlockSize(OrigBB);
+  computeBlockSize(*OrigBB);
 
   // Figure out how large the NewMBB is.  As the second half of the original
   // block, it may contain a tablejump.
-  computeBlockSize(NewBB);
+  computeBlockSize(*NewBB);
 
   // All BBOffsets following these blocks must be modified.
-  adjustBlockOffsets(OrigBB);
+  adjustBlockOffsets(*OrigBB);
 
   ++NumSplit;
 
@@ -258,9 +252,9 @@ ARM64BranchRelaxation::splitBlockBeforeInstr(MachineInstr *MI) {
 
 /// isBlockInRange - Returns true if the distance between specific MI and
 /// specific BB can fit in MI's displacement field.
-bool ARM64BranchRelaxation::isBlockInRange(MachineInstr *MI,
-                                           MachineBasicBlock *DestBB,
-                                           unsigned Bits) {
+bool AArch64BranchRelaxation::isBlockInRange(MachineInstr *MI,
+                                             MachineBasicBlock *DestBB,
+                                             unsigned Bits) {
   unsigned MaxOffs = ((1 << (Bits - 1)) - 1) << 2;
   unsigned BrOffset = getInstrOffset(MI);
   unsigned DestOffset = BlockInfo[DestBB->getNumber()].Offset;
@@ -281,13 +275,15 @@ static bool isConditionalBranch(unsigned Opc) {
   switch (Opc) {
   default:
     return false;
-  case ARM64::TBZ:
-  case ARM64::TBNZ:
-  case ARM64::CBZW:
-  case ARM64::CBNZW:
-  case ARM64::CBZX:
-  case ARM64::CBNZX:
-  case ARM64::Bcc:
+  case AArch64::TBZW:
+  case AArch64::TBNZW:
+  case AArch64::TBZX:
+  case AArch64::TBNZX:
+  case AArch64::CBZW:
+  case AArch64::CBNZW:
+  case AArch64::CBZX:
+  case AArch64::CBNZX:
+  case AArch64::Bcc:
     return true;
   }
 }
@@ -296,14 +292,16 @@ static MachineBasicBlock *getDestBlock(MachineInstr *MI) {
   switch (MI->getOpcode()) {
   default:
     assert(0 && "unexpected opcode!");
-  case ARM64::TBZ:
-  case ARM64::TBNZ:
+  case AArch64::TBZW:
+  case AArch64::TBNZW:
+  case AArch64::TBZX:
+  case AArch64::TBNZX:
     return MI->getOperand(2).getMBB();
-  case ARM64::CBZW:
-  case ARM64::CBNZW:
-  case ARM64::CBZX:
-  case ARM64::CBNZX:
-  case ARM64::Bcc:
+  case AArch64::CBZW:
+  case AArch64::CBNZW:
+  case AArch64::CBZX:
+  case AArch64::CBNZX:
+  case AArch64::Bcc:
     return MI->getOperand(1).getMBB();
   }
 }
@@ -312,13 +310,15 @@ static unsigned getOppositeConditionOpcode(unsigned Opc) {
   switch (Opc) {
   default:
     assert(0 && "unexpected opcode!");
-  case ARM64::TBNZ:    return ARM64::TBZ;
-  case ARM64::TBZ:     return ARM64::TBNZ;
-  case ARM64::CBNZW:   return ARM64::CBZW;
-  case ARM64::CBNZX:   return ARM64::CBZX;
-  case ARM64::CBZW:    return ARM64::CBNZW;
-  case ARM64::CBZX:    return ARM64::CBNZX;
-  case ARM64::Bcc:     return ARM64::Bcc; // Condition is an operand for Bcc.
+  case AArch64::TBNZW:   return AArch64::TBZW;
+  case AArch64::TBNZX:   return AArch64::TBZX;
+  case AArch64::TBZW:    return AArch64::TBNZW;
+  case AArch64::TBZX:    return AArch64::TBNZX;
+  case AArch64::CBNZW:   return AArch64::CBZW;
+  case AArch64::CBNZX:   return AArch64::CBZX;
+  case AArch64::CBZW:    return AArch64::CBNZW;
+  case AArch64::CBZX:    return AArch64::CBNZX;
+  case AArch64::Bcc:     return AArch64::Bcc; // Condition is an operand for Bcc.
   }
 }
 
@@ -326,30 +326,32 @@ static unsigned getBranchDisplacementBits(unsigned Opc) {
   switch (Opc) {
   default:
     assert(0 && "unexpected opcode!");
-  case ARM64::TBNZ:
-  case ARM64::TBZ:
+  case AArch64::TBNZW:
+  case AArch64::TBZW:
+  case AArch64::TBNZX:
+  case AArch64::TBZX:
     return TBZDisplacementBits;
-  case ARM64::CBNZW:
-  case ARM64::CBZW:
-  case ARM64::CBNZX:
-  case ARM64::CBZX:
+  case AArch64::CBNZW:
+  case AArch64::CBZW:
+  case AArch64::CBNZX:
+  case AArch64::CBZX:
     return CBZDisplacementBits;
-  case ARM64::Bcc:
+  case AArch64::Bcc:
     return BCCDisplacementBits;
   }
 }
 
 static inline void invertBccCondition(MachineInstr *MI) {
-  assert(MI->getOpcode() == ARM64::Bcc && "Unexpected opcode!");
-  ARM64CC::CondCode CC = (ARM64CC::CondCode)MI->getOperand(0).getImm();
-  CC = ARM64CC::getInvertedCondCode(CC);
+  assert(MI->getOpcode() == AArch64::Bcc && "Unexpected opcode!");
+  AArch64CC::CondCode CC = (AArch64CC::CondCode)MI->getOperand(0).getImm();
+  CC = AArch64CC::getInvertedCondCode(CC);
   MI->getOperand(0).setImm((int64_t)CC);
 }
 
 /// fixupConditionalBranch - Fix up a conditional branch whose destination is
 /// too far away to fit in its displacement field. It is converted to an inverse
 /// conditional branch + an unconditional branch to the destination.
-bool ARM64BranchRelaxation::fixupConditionalBranch(MachineInstr *MI) {
+bool AArch64BranchRelaxation::fixupConditionalBranch(MachineInstr *MI) {
   MachineBasicBlock *DestBB = getDestBlock(MI);
 
   // Add an unconditional branch to the destination and invert the branch
@@ -370,7 +372,7 @@ bool ARM64BranchRelaxation::fixupConditionalBranch(MachineInstr *MI) {
   if (BMI != MI) {
     if (std::next(MachineBasicBlock::iterator(MI)) ==
             std::prev(MBB->getLastNonDebugInstr()) &&
-        BMI->getOpcode() == ARM64::B) {
+        BMI->getOpcode() == AArch64::B) {
       // Last MI in the BB is an unconditional branch. Can we simply invert the
       // condition and swap destinations:
       // beq L1
@@ -384,13 +386,15 @@ bool ARM64BranchRelaxation::fixupConditionalBranch(MachineInstr *MI) {
         DEBUG(dbgs() << "  Invert condition and swap its destination with "
                      << *BMI);
         BMI->getOperand(0).setMBB(DestBB);
-        unsigned OpNum =
-            (MI->getOpcode() == ARM64::TBZ || MI->getOpcode() == ARM64::TBNZ)
-                ? 2
-                : 1;
+        unsigned OpNum = (MI->getOpcode() == AArch64::TBZW ||
+                          MI->getOpcode() == AArch64::TBNZW ||
+                          MI->getOpcode() == AArch64::TBZX ||
+                          MI->getOpcode() == AArch64::TBNZX)
+                             ? 2
+                             : 1;
         MI->getOperand(OpNum).setMBB(NewDest);
         MI->setDesc(TII->get(getOppositeConditionOpcode(MI->getOpcode())));
-        if (MI->getOpcode() == ARM64::Bcc)
+        if (MI->getOpcode() == AArch64::Bcc)
           invertBccCondition(MI);
         return true;
       }
@@ -426,13 +430,14 @@ bool ARM64BranchRelaxation::fixupConditionalBranch(MachineInstr *MI) {
   MachineInstrBuilder MIB = BuildMI(
       MBB, DebugLoc(), TII->get(getOppositeConditionOpcode(MI->getOpcode())))
                                 .addOperand(MI->getOperand(0));
-  if (MI->getOpcode() == ARM64::TBZ || MI->getOpcode() == ARM64::TBNZ)
+  if (MI->getOpcode() == AArch64::TBZW || MI->getOpcode() == AArch64::TBNZW ||
+      MI->getOpcode() == AArch64::TBZX || MI->getOpcode() == AArch64::TBNZX)
     MIB.addOperand(MI->getOperand(1));
-  if (MI->getOpcode() == ARM64::Bcc)
+  if (MI->getOpcode() == AArch64::Bcc)
     invertBccCondition(MIB);
   MIB.addMBB(NextBB);
   BlockInfo[MBB->getNumber()].Size += TII->GetInstSizeInBytes(&MBB->back());
-  BuildMI(MBB, DebugLoc(), TII->get(ARM64::B)).addMBB(DestBB);
+  BuildMI(MBB, DebugLoc(), TII->get(AArch64::B)).addMBB(DestBB);
   BlockInfo[MBB->getNumber()].Size += TII->GetInstSizeInBytes(&MBB->back());
 
   // Remove the old conditional branch.  It may or may not still be in MBB.
@@ -440,11 +445,11 @@ bool ARM64BranchRelaxation::fixupConditionalBranch(MachineInstr *MI) {
   MI->eraseFromParent();
 
   // Finally, keep the block offsets up to date.
-  adjustBlockOffsets(MBB);
+  adjustBlockOffsets(*MBB);
   return true;
 }
 
-bool ARM64BranchRelaxation::relaxBranchInstructions() {
+bool AArch64BranchRelaxation::relaxBranchInstructions() {
   bool Changed = false;
   // Relaxing branches involves creating new basic blocks, so re-eval
   // end() for termination.
@@ -461,16 +466,16 @@ bool ARM64BranchRelaxation::relaxBranchInstructions() {
   return Changed;
 }
 
-bool ARM64BranchRelaxation::runOnMachineFunction(MachineFunction &mf) {
+bool AArch64BranchRelaxation::runOnMachineFunction(MachineFunction &mf) {
   MF = &mf;
 
   // If the pass is disabled, just bail early.
   if (!BranchRelaxation)
     return false;
 
-  DEBUG(dbgs() << "***** ARM64BranchRelaxation *****\n");
+  DEBUG(dbgs() << "***** AArch64BranchRelaxation *****\n");
 
-  TII = (const ARM64InstrInfo *)MF->getTarget().getInstrInfo();
+  TII = (const AArch64InstrInfo *)MF->getTarget().getInstrInfo();
 
   // Renumber all of the machine basic blocks in the function, guaranteeing that
   // the numbers agree with the position of the block in the function.
@@ -498,8 +503,8 @@ bool ARM64BranchRelaxation::runOnMachineFunction(MachineFunction &mf) {
   return MadeChange;
 }
 
-/// createARM64BranchRelaxation - returns an instance of the constpool
+/// createAArch64BranchRelaxation - returns an instance of the constpool
 /// island pass.
-FunctionPass *llvm::createARM64BranchRelaxation() {
-  return new ARM64BranchRelaxation();
+FunctionPass *llvm::createAArch64BranchRelaxation() {
+  return new AArch64BranchRelaxation();
 }
diff --git a/lib/Target/AArch64/AArch64CallingConv.td b/lib/Target/AArch64/AArch64CallingConv.td
deleted file mode 100644
index 9fe6aae..0000000
--- a/lib/Target/AArch64/AArch64CallingConv.td
+++ /dev/null
@@ -1,197 +0,0 @@
-//==-- AArch64CallingConv.td - Calling Conventions for ARM ----*- tblgen -*-==//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-// This describes the calling conventions for AArch64 architecture.
-//===----------------------------------------------------------------------===//
-
-
-// The AArch64 Procedure Call Standard is unfortunately specified at a slightly
-// higher level of abstraction than LLVM's target interface presents. In
-// particular, it refers (like other ABIs, in fact) directly to
-// structs. However, generic LLVM code takes the liberty of lowering structure
-// arguments to the component fields before we see them.
-//
-// As a result, the obvious direct map from LLVM IR to PCS concepts can't be
-// implemented, so the goals of this calling convention are, in decreasing
-// priority order:
-//     1. Expose *some* way to express the concepts required to implement the
-//        generic PCS from a front-end.
-//     2. Provide a sane ABI for pure LLVM.
-//     3. Follow the generic PCS as closely as is naturally possible.
-//
-// The suggested front-end implementation of PCS features is:
-//     * Integer, float and vector arguments of all sizes which end up in
-//       registers are passed and returned via the natural LLVM type.
-//     * Structure arguments with size <= 16 bytes are passed and returned in
-//       registers as similar integer or composite types. For example:
-//       [1 x i64], [2 x i64] or [1 x i128] (if alignment 16 needed).
-//     * HFAs in registers follow rules similar to small structs: appropriate
-//       composite types.
-//     * Structure arguments with size > 16 bytes are passed via a pointer,
-//       handled completely by the front-end.
-//     * Structure return values > 16 bytes via an sret pointer argument.
-//     * Other stack-based arguments (not large structs) are passed using byval
-//       pointers. Padding arguments are added beforehand to guarantee a large
-//       struct doesn't later use integer registers.
-//
-// N.b. this means that it is the front-end's responsibility (if it cares about
-// PCS compliance) to check whether enough registers are available for an
-// argument when deciding how to pass it.
-
-class CCIfAlign<int Align, CCAction A>:
-  CCIf<"ArgFlags.getOrigAlign() == " # Align, A>;
-
-def CC_A64_APCS : CallingConv<[
-  // SRet is an LLVM-specific concept, so it takes precedence over general ABI
-  // concerns. However, this rule will be used by C/C++ frontends to implement
-  // structure return.
-  CCIfSRet<CCAssignToReg<[X8]>>,
-
-  // Put ByVal arguments directly on the stack. Minimum size and alignment of a
-  // slot is 64-bit.
-  CCIfByVal<CCPassByVal<8, 8>>,
-
-  // Canonicalise the various types that live in different floating-point
-  // registers. This makes sense because the PCS does not distinguish Short
-  // Vectors and Floating-point types.
-  CCIfType<[v1i16, v2i8], CCBitConvertToType<f16>>,
-  CCIfType<[v1i32, v4i8, v2i16], CCBitConvertToType<f32>>,
-  CCIfType<[v8i8, v4i16, v2i32, v2f32, v1i64, v1f64], CCBitConvertToType<f64>>,
-  CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
-           CCBitConvertToType<f128>>,
-
-  // PCS: "C.1: If the argument is a Half-, Single-, Double- or Quad- precision
-  // Floating-point or Short Vector Type and the NSRN is less than 8, then the
-  // argument is allocated to the least significant bits of register
-  // v[NSRN]. The NSRN is incremented by one. The argument has now been
-  // allocated."
-  CCIfType<[v1i8], CCAssignToReg<[B0, B1, B2, B3, B4, B5, B6, B7]>>,
-  CCIfType<[f16],  CCAssignToReg<[H0, H1, H2, H3, H4, H5, H6, H7]>>,
-  CCIfType<[f32],  CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7]>>,
-  CCIfType<[f64],  CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>,
-  CCIfType<[f128], CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
-
-  // PCS: "C.2: If the argument is an HFA and there are sufficient unallocated
-  // SIMD and Floating-point registers (NSRN - number of elements < 8), then the
-  // argument is allocated to SIMD and Floating-point registers (with one
-  // register per element of the HFA). The NSRN is incremented by the number of
-  // registers used. The argument has now been allocated."
-  //
-  // N.b. As above, this rule is the responsibility of the front-end.
-
-  // "C.3: If the argument is an HFA then the NSRN is set to 8 and the size of
-  // the argument is rounded up to the nearest multiple of 8 bytes."
-  //
-  // "C.4: If the argument is an HFA, a Quad-precision Floating-point or Short
-  // Vector Type then the NSAA is rounded up to the larger of 8 or the Natural
-  // Alignment of the Argument's type."
-  //
-  // It is expected that these will be satisfied by adding dummy arguments to
-  // the prototype.
-
-  // PCS: "C.5: If the argument is a Half- or Single- precision Floating-point
-  // type then the size of the argument is set to 8 bytes. The effect is as if
-  // the argument had been copied to the least significant bits of a 64-bit
-  // register and the remaining bits filled with unspecified values."
-  CCIfType<[f16, f32], CCPromoteToType<f64>>,
-
-  // PCS: "C.6: If the argument is an HFA, a Half-, Single-, Double- or Quad-
-  // precision Floating-point or Short Vector Type, then the argument is copied
-  // to memory at the adjusted NSAA. The NSAA is incremented by the size of the
-  // argument. The argument has now been allocated."
-  CCIfType<[f64], CCAssignToStack<8, 8>>,
-  CCIfType<[f128], CCAssignToStack<16, 16>>,
-
-  // PCS: "C.7: If the argument is an Integral Type, the size of the argument is
-  // less than or equal to 8 bytes and the NGRN is less than 8, the argument is
-  // copied to the least significant bits of x[NGRN]. The NGRN is incremented by
-  // one. The argument has now been allocated."
-
-  // First we implement C.8 and C.9 (128-bit types get even registers). i128 is
-  // represented as two i64s, the first one being split. If we delayed this
-  // operation C.8 would never be reached.
-  CCIfType<[i64],
-        CCIfSplit<CCAssignToRegWithShadow<[X0, X2, X4, X6], [X0, X1, X3, X5]>>>,
-
-  // Note: the promotion also implements C.14.
-  CCIfType<[i8, i16, i32], CCPromoteToType<i64>>,
-
-  // And now the real implementation of C.7
-  CCIfType<[i64], CCAssignToReg<[X0, X1, X2, X3, X4, X5, X6, X7]>>,
-
-  // PCS: "C.8: If the argument has an alignment of 16 then the NGRN is rounded
-  // up to the next even number."
-  //
-  // "C.9: If the argument is an Integral Type, the size of the argument is
-  // equal to 16 and the NGRN is less than 7, the argument is copied to x[NGRN]
-  // and x[NGRN+1], x[NGRN] shall contain the lower addressed double-word of the
-  // memory representation of the argument. The NGRN is incremented by two. The
-  // argument has now been allocated."
-  //
-  // Subtlety here: what if alignment is 16 but it is not an integral type? All
-  // floating-point types have been allocated already, which leaves composite
-  // types: this is why a front-end may need to produce i128 for a struct <= 16
-  // bytes.
-
-  // PCS: "C.10 If the argument is a Composite Type and the size in double-words
-  // of the argument is not more than 8 minus NGRN, then the argument is copied
-  // into consecutive general-purpose registers, starting at x[NGRN]. The
-  // argument is passed as though it had been loaded into the registers from a
-  // double-word aligned address with an appropriate sequence of LDR
-  // instructions loading consecutive registers from memory (the contents of any
-  // unused parts of the registers are unspecified by this standard). The NGRN
-  // is incremented by the number of registers used. The argument has now been
-  // allocated."
-  //
-  // Another one that's the responsibility of the front-end (sigh).
-
-  // PCS: "C.11: The NGRN is set to 8."
-  CCCustom<"CC_AArch64NoMoreRegs">,
-
-  // PCS: "C.12: The NSAA is rounded up to the larger of 8 or the Natural
-  // Alignment of the argument's type."
-  //
-  // PCS: "C.13: If the argument is a composite type then the argument is copied
-  // to memory at the adjusted NSAA. The NSAA is by the size of the
-  // argument. The argument has now been allocated."
-  //
-  // Note that the effect of this corresponds to a memcpy rather than register
-  // stores so that the struct ends up correctly addressable at the adjusted
-  // NSAA.
-
-  // PCS: "C.14: If the size of the argument is less than 8 bytes then the size
-  // of the argument is set to 8 bytes. The effect is as if the argument was
-  // copied to the least significant bits of a 64-bit register and the remaining
-  // bits filled with unspecified values."
-  //
-  // Integer types were widened above. Floating-point and composite types have
-  // already been allocated completely. Nothing to do.
-
-  // PCS: "C.15: The argument is copied to memory at the adjusted NSAA. The NSAA
-  // is incremented by the size of the argument. The argument has now been
-  // allocated."
-  CCIfType<[i64], CCIfSplit<CCAssignToStack<8, 16>>>,
-  CCIfType<[i64], CCAssignToStack<8, 8>>
-
-]>;
-
-// According to the PCS, X19-X30 are callee-saved, however only the low 64-bits
-// of vector registers (8-15) are callee-saved. The order here is is picked up
-// by PrologEpilogInserter.cpp to allocate stack slots, starting from top of
-// stack upon entry. This gives the customary layout of x30 at [sp-8], x29 at
-// [sp-16], ...
-def CSR_PCS : CalleeSavedRegs<(add (sequence "X%u", 30, 19),
-                                   (sequence "D%u", 15, 8))>;
-
-
-// TLS descriptor calls are extremely restricted in their changes, to allow
-// optimisations in the (hopefully) more common fast path where no real action
-// is needed. They actually have to preserve all registers, except for the
-// unavoidable X30 and the return register X0.
-def TLSDesc : CalleeSavedRegs<(add (sequence "X%u", 29, 1),
-                                   (sequence "Q%u", 31, 0))>;
diff --git a/lib/Target/ARM64/ARM64CallingConvention.td b/lib/Target/AArch64/AArch64CallingConvention.td
index 9ac888f..ded2e17 100644
--- a/lib/Target/ARM64/ARM64CallingConvention.td
+++ b/lib/Target/AArch64/AArch64CallingConvention.td
@@ -1,4 +1,4 @@
-//===- ARM64CallingConv.td - Calling Conventions for ARM64 -*- tablegen -*-===//
+//=- AArch64CallingConv.td - Calling Conventions for AArch64 -*- tablegen -*-=//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,28 +7,45 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This describes the calling conventions for ARM64 architecture.
+// This describes the calling conventions for AArch64 architecture.
 //
 //===----------------------------------------------------------------------===//
 
 /// CCIfAlign - Match of the original alignment of the arg
 class CCIfAlign<string Align, CCAction A> :
   CCIf<!strconcat("ArgFlags.getOrigAlign() == ", Align), A>;
+/// CCIfBigEndian - Match only if we're in big endian mode.
+class CCIfBigEndian<CCAction A> :
+  CCIf<"State.getTarget().getDataLayout()->isBigEndian()", A>;
+
+class CCIfUnallocated<string Reg, CCAction A> :
+  CCIf<"!State.isAllocated(AArch64::" # Reg # ")", A>;
 
 //===----------------------------------------------------------------------===//
 // ARM AAPCS64 Calling Convention
 //===----------------------------------------------------------------------===//
 
-def CC_ARM64_AAPCS : CallingConv<[
+def CC_AArch64_AAPCS : CallingConv<[
   CCIfType<[v2f32], CCBitConvertToType<v2i32>>,
-  CCIfType<[v2f64, v4f32, f128], CCBitConvertToType<v2i64>>,
+  CCIfType<[v2f64, v4f32], CCBitConvertToType<v2i64>>,
+
+  // Big endian vectors must be passed as if they were 1-element vectors so that
+  // their lanes are in a consistent order.
+  CCIfBigEndian<CCIfType<[v2i32, v2f32, v4i16, v4f16, v8i8],
+                         CCBitConvertToType<f64>>>,
+  CCIfBigEndian<CCIfType<[v2i64, v2f64, v4i32, v4f32, v8i16, v8f16, v16i8],
+                         CCBitConvertToType<f128>>>,
 
   // An SRet is passed in X8, not X0 like a normal pointer parameter.
   CCIfSRet<CCIfType<[i64], CCAssignToRegWithShadow<[X8], [W8]>>>,
 
+  // Put ByVal arguments directly on the stack. Minimum size and alignment of a
+  // slot is 64-bit.
+  CCIfByVal<CCPassByVal<8, 8>>,
+
   // Handle i1, i8, i16, i32, i64, f32, f64 and v2f64 by passing in registers,
   // up to eight each of GPR and FPR.
-  CCIfType<[i1, i8, i16], CCCustom<"CC_ARM64_Custom_i1i8i16_Reg">>,
+  CCIfType<[i1, i8, i16], CCIfUnallocated<"X7", CCPromoteToType<i32>>>,
   CCIfType<[i32], CCAssignToRegWithShadow<[W0, W1, W2, W3, W4, W5, W6, W7],
                                           [X0, X1, X2, X3, X4, X5, X6, X7]>>,
   // i128 is split to two i64s, we can't fit half to register X7.
@@ -36,7 +53,7 @@ def CC_ARM64_AAPCS : CallingConv<[
                                                     [X0, X1, X3, X5]>>>,
 
   // i128 is split to two i64s, and its stack alignment is 16 bytes.
-  CCIfType<[i64], CCIfSplit<CCAssignToStack<8, 16>>>,
+  CCIfType<[i64], CCIfSplit<CCAssignToStackWithShadow<8, 16, [X7]>>>,
 
   CCIfType<[i64], CCAssignToRegWithShadow<[X0, X1, X2, X3, X4, X5, X6, X7],
                                           [W0, W1, W2, W3, W4, W5, W6, W7]>>,
@@ -47,7 +64,7 @@ def CC_ARM64_AAPCS : CallingConv<[
   CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32],
            CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7],
                                    [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
-  CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64],
+  CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64],
            CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
 
   // If more than will fit in registers, pass them on the stack instead.
@@ -55,12 +72,20 @@ def CC_ARM64_AAPCS : CallingConv<[
   CCIfType<[i32, f32], CCAssignToStack<8, 8>>,
   CCIfType<[i64, f64, v1f64, v2f32, v1i64, v2i32, v4i16, v8i8],
            CCAssignToStack<8, 8>>,
-  CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64], CCAssignToStack<16, 16>>
+  CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64],
+           CCAssignToStack<16, 16>>
 ]>;
 
-def RetCC_ARM64_AAPCS : CallingConv<[
+def RetCC_AArch64_AAPCS : CallingConv<[
   CCIfType<[v2f32], CCBitConvertToType<v2i32>>,
-  CCIfType<[v2f64, v4f32, f128], CCBitConvertToType<v2i64>>,
+  CCIfType<[v2f64, v4f32], CCBitConvertToType<v2i64>>,
+
+  // Big endian vectors must be passed as if they were 1-element vectors so that
+  // their lanes are in a consistent order.
+  CCIfBigEndian<CCIfType<[v2i32, v2f32, v4i16, v4f16, v8i8],
+                         CCBitConvertToType<f64>>>,
+  CCIfBigEndian<CCIfType<[v2i64, v2f64, v4i32, v4f32, v8i16, v8f16, v16i8],
+                         CCBitConvertToType<f128>>>,
 
   CCIfType<[i32], CCAssignToRegWithShadow<[W0, W1, W2, W3, W4, W5, W6, W7],
                                           [X0, X1, X2, X3, X4, X5, X6, X7]>>,
@@ -73,7 +98,7 @@ def RetCC_ARM64_AAPCS : CallingConv<[
   CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32],
       CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7],
                               [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
-  CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64],
+  CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64],
       CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>
 ]>;
 
@@ -82,16 +107,20 @@ def RetCC_ARM64_AAPCS : CallingConv<[
 // from the standard one at this level:
 //     + i128s (i.e. split i64s) don't need even registers.
 //     + Stack slots are sized as needed rather than being at least 64-bit.
-def CC_ARM64_DarwinPCS : CallingConv<[
+def CC_AArch64_DarwinPCS : CallingConv<[
   CCIfType<[v2f32], CCBitConvertToType<v2i32>>,
   CCIfType<[v2f64, v4f32, f128], CCBitConvertToType<v2i64>>,
 
   // An SRet is passed in X8, not X0 like a normal pointer parameter.
   CCIfSRet<CCIfType<[i64], CCAssignToRegWithShadow<[X8], [W8]>>>,
 
+  // Put ByVal arguments directly on the stack. Minimum size and alignment of a
+  // slot is 64-bit.
+  CCIfByVal<CCPassByVal<8, 8>>,
+
   // Handle i1, i8, i16, i32, i64, f32, f64 and v2f64 by passing in registers,
   // up to eight each of GPR and FPR.
-  CCIfType<[i1, i8, i16], CCCustom<"CC_ARM64_Custom_i1i8i16_Reg">>,
+  CCIfType<[i1, i8, i16], CCIfUnallocated<"X7", CCPromoteToType<i32>>>,
   CCIfType<[i32], CCAssignToRegWithShadow<[W0, W1, W2, W3, W4, W5, W6, W7],
                                           [X0, X1, X2, X3, X4, X5, X6, X7]>>,
   // i128 is split to two i64s, we can't fit half to register X7.
@@ -114,14 +143,15 @@ def CC_ARM64_DarwinPCS : CallingConv<[
            CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
 
   // If more than will fit in registers, pass them on the stack instead.
-  CCIfType<[i1, i8, i16], CCCustom<"CC_ARM64_Custom_i1i8i16_Stack">>,
+  CCIfType<[i1, i8], CCAssignToStack<1, 1>>,
+  CCIfType<[i16], CCAssignToStack<2, 2>>,
   CCIfType<[i32, f32], CCAssignToStack<4, 4>>,
   CCIfType<[i64, f64, v1f64, v2f32, v1i64, v2i32, v4i16, v8i8],
            CCAssignToStack<8, 8>>,
   CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64], CCAssignToStack<16, 16>>
 ]>;
 
-def CC_ARM64_DarwinPCS_VarArg : CallingConv<[
+def CC_AArch64_DarwinPCS_VarArg : CallingConv<[
   CCIfType<[v2f32], CCBitConvertToType<v2i32>>,
   CCIfType<[v2f64, v4f32, f128], CCBitConvertToType<v2i64>>,
 
@@ -140,9 +170,9 @@ def CC_ARM64_DarwinPCS_VarArg : CallingConv<[
 // in register and the remaining arguments on stack. We allow 32bit stack slots,
 // so that WebKit can write partial values in the stack and define the other
 // 32bit quantity as undef.
-def CC_ARM64_WebKit_JS : CallingConv<[
+def CC_AArch64_WebKit_JS : CallingConv<[
   // Handle i1, i8, i16, i32, and i64 passing in register X0 (W0).
-  CCIfType<[i1, i8, i16], CCCustom<"CC_ARM64_WebKit_JS_i1i8i16_Reg">>,
+  CCIfType<[i1, i8, i16], CCIfUnallocated<"X0", CCPromoteToType<i32>>>,
   CCIfType<[i32], CCAssignToRegWithShadow<[W0], [X0]>>,
   CCIfType<[i64], CCAssignToRegWithShadow<[X0], [W0]>>,
 
@@ -152,7 +182,7 @@ def CC_ARM64_WebKit_JS : CallingConv<[
   CCIfType<[i64, f64], CCAssignToStack<8, 8>>
 ]>;
 
-def RetCC_ARM64_WebKit_JS : CallingConv<[
+def RetCC_AArch64_WebKit_JS : CallingConv<[
   CCIfType<[i32], CCAssignToRegWithShadow<[W0, W1, W2, W3, W4, W5, W6, W7],
                                           [X0, X1, X2, X3, X4, X5, X6, X7]>>,
   CCIfType<[i64], CCAssignToRegWithShadow<[X0, X1, X2, X3, X4, X5, X6, X7],
@@ -171,7 +201,7 @@ def RetCC_ARM64_WebKit_JS : CallingConv<[
 // It would be better to model its preservation semantics properly (create a
 // vreg on entry, use it in RET & tail call generation; make that vreg def if we
 // end up saving LR as part of a call frame). Watch this space...
-def CSR_ARM64_AAPCS : CalleeSavedRegs<(add LR, FP, X19, X20, X21, X22,
+def CSR_AArch64_AAPCS : CalleeSavedRegs<(add LR, FP, X19, X20, X21, X22,
                                            X23, X24, X25, X26, X27, X28,
                                            D8,  D9,  D10, D11,
                                            D12, D13, D14, D15)>;
@@ -184,24 +214,24 @@ def CSR_ARM64_AAPCS : CalleeSavedRegs<(add LR, FP, X19, X20, X21, X22,
 // (For generic ARM 64-bit ABI code, clang will not generate constructors or
 // destructors with 'this' returns, so this RegMask will not be used in that
 // case)
-def CSR_ARM64_AAPCS_ThisReturn : CalleeSavedRegs<(add CSR_ARM64_AAPCS, X0)>;
+def CSR_AArch64_AAPCS_ThisReturn : CalleeSavedRegs<(add CSR_AArch64_AAPCS, X0)>;
 
 // The function used by Darwin to obtain the address of a thread-local variable
 // guarantees more than a normal AAPCS function. x16 and x17 are used on the
 // fast path for calculation, but other registers except X0 (argument/return)
 // and LR (it is a call, after all) are preserved.
-def CSR_ARM64_TLS_Darwin
+def CSR_AArch64_TLS_Darwin
     : CalleeSavedRegs<(add (sub (sequence "X%u", 1, 28), X16, X17),
                            FP,
                            (sequence "Q%u", 0, 31))>;
 
 // The ELF stub used for TLS-descriptor access saves every feasible
 // register. Only X0 and LR are clobbered.
-def CSR_ARM64_TLS_ELF
+def CSR_AArch64_TLS_ELF
     : CalleeSavedRegs<(add (sequence "X%u", 1, 28), FP,
                            (sequence "Q%u", 0, 31))>;
 
-def CSR_ARM64_AllRegs
+def CSR_AArch64_AllRegs
     : CalleeSavedRegs<(add (sequence "W%u", 0, 30), WSP,
                            (sequence "X%u", 0, 28), FP, LR, SP,
                            (sequence "B%u", 0, 31), (sequence "H%u", 0, 31),
diff --git a/lib/Target/ARM64/ARM64CleanupLocalDynamicTLSPass.cpp b/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp
index e3f8248..4d23dc5 100644
--- a/lib/Target/ARM64/ARM64CleanupLocalDynamicTLSPass.cpp
+++ b/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp
@@ -1,4 +1,4 @@
-//===-- ARM64CleanupLocalDynamicTLSPass.cpp -----------------------*- C++ -*-=//
+//===-- AArch64CleanupLocalDynamicTLSPass.cpp ---------------------*- C++ -*-=//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -22,10 +22,10 @@
 // pass looks through a function and performs such combinations.
 //
 //===----------------------------------------------------------------------===//
-#include "ARM64.h"
-#include "ARM64InstrInfo.h"
-#include "ARM64MachineFunctionInfo.h"
-#include "ARM64TargetMachine.h"
+#include "AArch64.h"
+#include "AArch64InstrInfo.h"
+#include "AArch64MachineFunctionInfo.h"
+#include "AArch64TargetMachine.h"
 #include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
@@ -38,8 +38,8 @@ struct LDTLSCleanup : public MachineFunctionPass {
   static char ID;
   LDTLSCleanup() : MachineFunctionPass(ID) {}
 
-  virtual bool runOnMachineFunction(MachineFunction &MF) {
-    ARM64FunctionInfo *AFI = MF.getInfo<ARM64FunctionInfo>();
+  bool runOnMachineFunction(MachineFunction &MF) override {
+    AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
     if (AFI->getNumLocalDynamicTLSAccesses() < 2) {
       // No point folding accesses if there isn't at least two.
       return false;
@@ -62,7 +62,7 @@ struct LDTLSCleanup : public MachineFunctionPass {
     for (MachineBasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;
          ++I) {
       switch (I->getOpcode()) {
-      case ARM64::TLSDESC_BLR:
+      case AArch64::TLSDESC_BLR:
         // Make sure it's a local dynamic access.
         if (!I->getOperand(1).isSymbol() ||
             strcmp(I->getOperand(1).getSymbolName(), "_TLS_MODULE_BASE_"))
@@ -92,15 +92,15 @@ struct LDTLSCleanup : public MachineFunctionPass {
   MachineInstr *replaceTLSBaseAddrCall(MachineInstr *I,
                                        unsigned TLSBaseAddrReg) {
     MachineFunction *MF = I->getParent()->getParent();
-    const ARM64TargetMachine *TM =
-        static_cast<const ARM64TargetMachine *>(&MF->getTarget());
-    const ARM64InstrInfo *TII = TM->getInstrInfo();
+    const AArch64TargetMachine *TM =
+        static_cast<const AArch64TargetMachine *>(&MF->getTarget());
+    const AArch64InstrInfo *TII = TM->getInstrInfo();
 
     // Insert a Copy from TLSBaseAddrReg to x0, which is where the rest of the
     // code sequence assumes the address will be.
-    MachineInstr *Copy =
-        BuildMI(*I->getParent(), I, I->getDebugLoc(),
-                TII->get(TargetOpcode::COPY), ARM64::X0).addReg(TLSBaseAddrReg);
+    MachineInstr *Copy = BuildMI(*I->getParent(), I, I->getDebugLoc(),
+                                 TII->get(TargetOpcode::COPY),
+                                 AArch64::X0).addReg(TLSBaseAddrReg);
 
     // Erase the TLS_base_addr instruction.
     I->eraseFromParent();
@@ -112,28 +112,28 @@ struct LDTLSCleanup : public MachineFunctionPass {
   // inserting a copy instruction after I. Returns the new instruction.
   MachineInstr *setRegister(MachineInstr *I, unsigned *TLSBaseAddrReg) {
     MachineFunction *MF = I->getParent()->getParent();
-    const ARM64TargetMachine *TM =
-        static_cast<const ARM64TargetMachine *>(&MF->getTarget());
-    const ARM64InstrInfo *TII = TM->getInstrInfo();
+    const AArch64TargetMachine *TM =
+        static_cast<const AArch64TargetMachine *>(&MF->getTarget());
+    const AArch64InstrInfo *TII = TM->getInstrInfo();
 
     // Create a virtual register for the TLS base address.
     MachineRegisterInfo &RegInfo = MF->getRegInfo();
-    *TLSBaseAddrReg = RegInfo.createVirtualRegister(&ARM64::GPR64RegClass);
+    *TLSBaseAddrReg = RegInfo.createVirtualRegister(&AArch64::GPR64RegClass);
 
     // Insert a copy from X0 to TLSBaseAddrReg for later.
     MachineInstr *Next = I->getNextNode();
     MachineInstr *Copy = BuildMI(*I->getParent(), Next, I->getDebugLoc(),
                                  TII->get(TargetOpcode::COPY),
-                                 *TLSBaseAddrReg).addReg(ARM64::X0);
+                                 *TLSBaseAddrReg).addReg(AArch64::X0);
 
     return Copy;
   }
 
-  virtual const char *getPassName() const {
+  const char *getPassName() const override {
     return "Local Dynamic TLS Access Clean-up";
   }
 
-  virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesCFG();
     AU.addRequired<MachineDominatorTree>();
     MachineFunctionPass::getAnalysisUsage(AU);
@@ -142,6 +142,6 @@ struct LDTLSCleanup : public MachineFunctionPass {
 }
 
 char LDTLSCleanup::ID = 0;
-FunctionPass *llvm::createARM64CleanupLocalDynamicTLSPass() {
+FunctionPass *llvm::createAArch64CleanupLocalDynamicTLSPass() {
   return new LDTLSCleanup();
 }
diff --git a/lib/Target/ARM64/ARM64CollectLOH.cpp b/lib/Target/AArch64/AArch64CollectLOH.cpp
index f52778f..6b1f096 100644
--- a/lib/Target/ARM64/ARM64CollectLOH.cpp
+++ b/lib/Target/AArch64/AArch64CollectLOH.cpp
@@ -1,4 +1,4 @@
-//===-------------- ARM64CollectLOH.cpp - ARM64 collect LOH pass --*- C++ -*-=//
+//===---------- AArch64CollectLOH.cpp - AArch64 collect LOH pass --*- C++ -*-=//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -85,8 +85,8 @@
 // This LOH aims at getting rid of redundant ADRP instructions.
 //
 // The overall design for emitting the LOHs is:
-// 1. ARM64CollectLOH (this pass) records the LOHs in the ARM64FunctionInfo.
-// 2. ARM64AsmPrinter reads the LOHs from ARM64FunctionInfo and it:
+// 1. AArch64CollectLOH (this pass) records the LOHs in the AArch64FunctionInfo.
+// 2. AArch64AsmPrinter reads the LOHs from AArch64FunctionInfo and it:
 //     1. Associates them a label.
 //     2. Emits them in a MCStreamer (EmitLOHDirective).
 //         - The MCMachOStreamer records them into the MCAssembler.
@@ -98,11 +98,10 @@
 //         - Other ObjectWriters ignore them.
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "arm64-collect-loh"
-#include "ARM64.h"
-#include "ARM64InstrInfo.h"
-#include "ARM64MachineFunctionInfo.h"
-#include "MCTargetDesc/ARM64AddressingModes.h"
+#include "AArch64.h"
+#include "AArch64InstrInfo.h"
+#include "AArch64MachineFunctionInfo.h"
+#include "MCTargetDesc/AArch64AddressingModes.h"
 #include "llvm/ADT/BitVector.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/MapVector.h"
@@ -123,14 +122,16 @@
 #include "llvm/ADT/Statistic.h"
 using namespace llvm;
 
+#define DEBUG_TYPE "aarch64-collect-loh"
+
 static cl::opt<bool>
-PreCollectRegister("arm64-collect-loh-pre-collect-register", cl::Hidden,
+PreCollectRegister("aarch64-collect-loh-pre-collect-register", cl::Hidden,
                    cl::desc("Restrict analysis to registers invovled"
                             " in LOHs"),
                    cl::init(true));
 
 static cl::opt<bool>
-BasicBlockScopeOnly("arm64-collect-loh-bb-only", cl::Hidden,
+BasicBlockScopeOnly("aarch64-collect-loh-bb-only", cl::Hidden,
                     cl::desc("Restrict analysis at basic block scope"),
                     cl::init(true));
 
@@ -163,23 +164,23 @@ STATISTIC(NumADRSimpleCandidate, "Number of simplifiable ADRP + ADD");
 STATISTIC(NumADRComplexCandidate, "Number of too complex ADRP + ADD");
 
 namespace llvm {
-void initializeARM64CollectLOHPass(PassRegistry &);
+void initializeAArch64CollectLOHPass(PassRegistry &);
 }
 
 namespace {
-struct ARM64CollectLOH : public MachineFunctionPass {
+struct AArch64CollectLOH : public MachineFunctionPass {
   static char ID;
-  ARM64CollectLOH() : MachineFunctionPass(ID) {
-    initializeARM64CollectLOHPass(*PassRegistry::getPassRegistry());
+  AArch64CollectLOH() : MachineFunctionPass(ID) {
+    initializeAArch64CollectLOHPass(*PassRegistry::getPassRegistry());
   }
 
-  virtual bool runOnMachineFunction(MachineFunction &Fn);
+  bool runOnMachineFunction(MachineFunction &MF) override;
 
-  virtual const char *getPassName() const {
-    return "ARM64 Collect Linker Optimization Hint (LOH)";
+  const char *getPassName() const override {
+    return "AArch64 Collect Linker Optimization Hint (LOH)";
   }
 
-  void getAnalysisUsage(AnalysisUsage &AU) const {
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesAll();
     MachineFunctionPass::getAnalysisUsage(AU);
     AU.addRequired<MachineDominatorTree>();
@@ -213,14 +214,14 @@ typedef DenseMap<unsigned, unsigned> MapRegToId;
 typedef SmallVector<unsigned, 32> MapIdToReg;
 } // end anonymous namespace.
 
-char ARM64CollectLOH::ID = 0;
+char AArch64CollectLOH::ID = 0;
 
-INITIALIZE_PASS_BEGIN(ARM64CollectLOH, "arm64-collect-loh",
-                      "ARM64 Collect Linker Optimization Hint (LOH)", false,
+INITIALIZE_PASS_BEGIN(AArch64CollectLOH, "aarch64-collect-loh",
+                      "AArch64 Collect Linker Optimization Hint (LOH)", false,
                       false)
 INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
-INITIALIZE_PASS_END(ARM64CollectLOH, "arm64-collect-loh",
-                    "ARM64 Collect Linker Optimization Hint (LOH)", false,
+INITIALIZE_PASS_END(AArch64CollectLOH, "aarch64-collect-loh",
+                    "AArch64 Collect Linker Optimization Hint (LOH)", false,
                     false)
 
 /// Given a couple (MBB, reg) get the corresponding set of instruction from
@@ -230,15 +231,14 @@ INITIALIZE_PASS_END(ARM64CollectLOH, "arm64-collect-loh",
 /// \param nbRegs is used internally allocate some memory. It must be consistent
 /// with the way sets is used.
 static SetOfMachineInstr &getSet(BlockToSetOfInstrsPerColor &sets,
-                                 const MachineBasicBlock *MBB, unsigned reg,
+                                 const MachineBasicBlock &MBB, unsigned reg,
                                  unsigned nbRegs) {
   SetOfMachineInstr *result;
-  BlockToSetOfInstrsPerColor::iterator it = sets.find(MBB);
-  if (it != sets.end()) {
+  BlockToSetOfInstrsPerColor::iterator it = sets.find(&MBB);
+  if (it != sets.end())
     result = it->second;
-  } else {
-    result = sets[MBB] = new SetOfMachineInstr[nbRegs];
-  }
+  else
+    result = sets[&MBB] = new SetOfMachineInstr[nbRegs];
 
   return result[reg];
 }
@@ -251,18 +251,18 @@ static SetOfMachineInstr &getSet(BlockToSetOfInstrsPerColor &sets,
 /// "sets[reg]".
 /// \pre set[reg] is valid.
 static SetOfMachineInstr &getUses(InstrToInstrs *sets, unsigned reg,
-                                  const MachineInstr *MI) {
-  return sets[reg][MI];
+                                  const MachineInstr &MI) {
+  return sets[reg][&MI];
 }
 
 /// Same as getUses but does not modify the input map: sets.
 /// \return NULL if the couple (reg, MI) is not in sets.
 static const SetOfMachineInstr *getUses(const InstrToInstrs *sets, unsigned reg,
-                                        const MachineInstr *MI) {
-  InstrToInstrs::const_iterator Res = sets[reg].find(MI);
+                                        const MachineInstr &MI) {
+  InstrToInstrs::const_iterator Res = sets[reg].find(&MI);
   if (Res != sets[reg].end())
     return &(Res->second);
-  return NULL;
+  return nullptr;
 }
 
 /// Initialize the reaching definition algorithm:
@@ -276,41 +276,36 @@ static const SetOfMachineInstr *getUses(const InstrToInstrs *sets, unsigned reg,
 /// definition. It also consider definitions of ADRP instructions as uses and
 /// ignore other uses. The ADRPMode is used to collect the information for LHO
 /// that involve ADRP operation only.
-static void initReachingDef(MachineFunction *MF,
+static void initReachingDef(MachineFunction &MF,
                             InstrToInstrs *ColorOpToReachedUses,
                             BlockToInstrPerColor &Gen, BlockToRegSet &Kill,
                             BlockToSetOfInstrsPerColor &ReachableUses,
                             const MapRegToId &RegToId,
                             const MachineInstr *DummyOp, bool ADRPMode) {
-  const TargetMachine &TM = MF->getTarget();
+  const TargetMachine &TM = MF.getTarget();
   const TargetRegisterInfo *TRI = TM.getRegisterInfo();
 
   unsigned NbReg = RegToId.size();
 
-  for (MachineFunction::const_iterator IMBB = MF->begin(), IMBBEnd = MF->end();
-       IMBB != IMBBEnd; ++IMBB) {
-    const MachineBasicBlock *MBB = &(*IMBB);
-    const MachineInstr **&BBGen = Gen[MBB];
+  for (MachineBasicBlock &MBB : MF) {
+    const MachineInstr **&BBGen = Gen[&MBB];
     BBGen = new const MachineInstr *[NbReg];
     memset(BBGen, 0, sizeof(const MachineInstr *) * NbReg);
 
-    BitVector &BBKillSet = Kill[MBB];
+    BitVector &BBKillSet = Kill[&MBB];
     BBKillSet.resize(NbReg);
-    for (MachineBasicBlock::const_iterator II = MBB->begin(), IEnd = MBB->end();
-         II != IEnd; ++II) {
-      bool IsADRP = II->getOpcode() == ARM64::ADRP;
+    for (const MachineInstr &MI : MBB) {
+      bool IsADRP = MI.getOpcode() == AArch64::ADRP;
 
       // Process uses first.
       if (IsADRP || !ADRPMode)
-        for (MachineInstr::const_mop_iterator IO = II->operands_begin(),
-                                              IOEnd = II->operands_end();
-             IO != IOEnd; ++IO) {
+        for (const MachineOperand &MO : MI.operands()) {
           // Treat ADRP def as use, as the goal of the analysis is to find
           // ADRP defs reached by other ADRP defs.
-          if (!IO->isReg() || (!ADRPMode && !IO->isUse()) ||
-              (ADRPMode && (!IsADRP || !IO->isDef())))
+          if (!MO.isReg() || (!ADRPMode && !MO.isUse()) ||
+              (ADRPMode && (!IsADRP || !MO.isDef())))
             continue;
-          unsigned CurReg = IO->getReg();
+          unsigned CurReg = MO.getReg();
           MapRegToId::const_iterator ItCurRegId = RegToId.find(CurReg);
           if (ItCurRegId == RegToId.end())
             continue;
@@ -318,20 +313,18 @@ static void initReachingDef(MachineFunction *MF,
 
           // if CurReg has not been defined, this use is reachable.
           if (!BBGen[CurReg] && !BBKillSet.test(CurReg))
-            getSet(ReachableUses, MBB, CurReg, NbReg).insert(&(*II));
+            getSet(ReachableUses, MBB, CurReg, NbReg).insert(&MI);
           // current basic block definition for this color, if any, is in Gen.
           if (BBGen[CurReg])
-            getUses(ColorOpToReachedUses, CurReg, BBGen[CurReg]).insert(&(*II));
+            getUses(ColorOpToReachedUses, CurReg, *BBGen[CurReg]).insert(&MI);
         }
 
       // Process clobbers.
-      for (MachineInstr::const_mop_iterator IO = II->operands_begin(),
-                                            IOEnd = II->operands_end();
-           IO != IOEnd; ++IO) {
-        if (!IO->isRegMask())
+      for (const MachineOperand &MO : MI.operands()) {
+        if (!MO.isRegMask())
           continue;
         // Clobbers kill the related colors.
-        const uint32_t *PreservedRegs = IO->getRegMask();
+        const uint32_t *PreservedRegs = MO.getRegMask();
 
         // Set generated regs.
         for (const auto Entry : RegToId) {
@@ -342,19 +335,17 @@ static void initReachingDef(MachineFunction *MF,
             // Do not register clobbered definition for no ADRP.
             // This definition is not used anyway (otherwise register
             // allocation is wrong).
-            BBGen[Reg] = ADRPMode ? II : NULL;
+            BBGen[Reg] = ADRPMode ? &MI : nullptr;
             BBKillSet.set(Reg);
           }
         }
       }
 
-      // Process defs
-      for (MachineInstr::const_mop_iterator IO = II->operands_begin(),
-                                            IOEnd = II->operands_end();
-           IO != IOEnd; ++IO) {
-        if (!IO->isReg() || !IO->isDef())
+      // Process register defs.
+      for (const MachineOperand &MO : MI.operands()) {
+        if (!MO.isReg() || !MO.isDef())
           continue;
-        unsigned CurReg = IO->getReg();
+        unsigned CurReg = MO.getReg();
         MapRegToId::const_iterator ItCurRegId = RegToId.find(CurReg);
         if (ItCurRegId == RegToId.end())
           continue;
@@ -365,19 +356,19 @@ static void initReachingDef(MachineFunction *MF,
                  "Sub-register of an "
                  "involved register, not recorded as involved!");
           BBKillSet.set(ItRegId->second);
-          BBGen[ItRegId->second] = &(*II);
+          BBGen[ItRegId->second] = &MI;
         }
-        BBGen[ItCurRegId->second] = &(*II);
+        BBGen[ItCurRegId->second] = &MI;
       }
     }
 
     // If we restrict our analysis to basic block scope, conservatively add a
     // dummy
     // use for each generated value.
-    if (!ADRPMode && DummyOp && !MBB->succ_empty())
+    if (!ADRPMode && DummyOp && !MBB.succ_empty())
       for (unsigned CurReg = 0; CurReg < NbReg; ++CurReg)
         if (BBGen[CurReg])
-          getUses(ColorOpToReachedUses, CurReg, BBGen[CurReg]).insert(DummyOp);
+          getUses(ColorOpToReachedUses, CurReg, *BBGen[CurReg]).insert(DummyOp);
   }
 }
 
@@ -390,7 +381,7 @@ static void initReachingDef(MachineFunction *MF,
 ///                 op.reachedUses
 ///
 ///           Out[bb] = Gen[bb] U (In[bb] - Kill[bb])
-static void reachingDefAlgorithm(MachineFunction *MF,
+static void reachingDefAlgorithm(MachineFunction &MF,
                                  InstrToInstrs *ColorOpToReachedUses,
                                  BlockToSetOfInstrsPerColor &In,
                                  BlockToSetOfInstrsPerColor &Out,
@@ -400,10 +391,7 @@ static void reachingDefAlgorithm(MachineFunction *MF,
   bool HasChanged;
   do {
     HasChanged = false;
-    for (MachineFunction::const_iterator IMBB = MF->begin(),
-                                         IMBBEnd = MF->end();
-         IMBB != IMBBEnd; ++IMBB) {
-      const MachineBasicBlock *MBB = &(*IMBB);
+    for (MachineBasicBlock &MBB : MF) {
       unsigned CurReg;
       for (CurReg = 0; CurReg < NbReg; ++CurReg) {
         SetOfMachineInstr &BBInSet = getSet(In, MBB, CurReg, NbReg);
@@ -412,24 +400,21 @@ static void reachingDefAlgorithm(MachineFunction *MF,
         SetOfMachineInstr &BBOutSet = getSet(Out, MBB, CurReg, NbReg);
         unsigned Size = BBOutSet.size();
         //   In[bb][color] = U Out[bb.predecessors][color]
-        for (MachineBasicBlock::const_pred_iterator
-                 PredMBB = MBB->pred_begin(),
-                 EndPredMBB = MBB->pred_end();
-             PredMBB != EndPredMBB; ++PredMBB) {
+        for (MachineBasicBlock *PredMBB : MBB.predecessors()) {
           SetOfMachineInstr &PredOutSet = getSet(Out, *PredMBB, CurReg, NbReg);
           BBInSet.insert(PredOutSet.begin(), PredOutSet.end());
         }
         //   insert reachableUses[bb][color] in each in[bb][color] op.reachedses
-        for (const MachineInstr *MI: BBInSet) {
+        for (const MachineInstr *MI : BBInSet) {
           SetOfMachineInstr &OpReachedUses =
-              getUses(ColorOpToReachedUses, CurReg, MI);
+              getUses(ColorOpToReachedUses, CurReg, *MI);
           OpReachedUses.insert(BBReachableUses.begin(), BBReachableUses.end());
         }
         //           Out[bb] = Gen[bb] U (In[bb] - Kill[bb])
-        if (!Kill[MBB].test(CurReg))
+        if (!Kill[&MBB].test(CurReg))
           BBOutSet.insert(BBInSet.begin(), BBInSet.end());
-        if (Gen[MBB][CurReg])
-          BBOutSet.insert(Gen[MBB][CurReg]);
+        if (Gen[&MBB][CurReg])
+          BBOutSet.insert(Gen[&MBB][CurReg]);
         HasChanged |= BBOutSet.size() != Size;
       }
     }
@@ -442,38 +427,31 @@ static void finitReachingDef(BlockToSetOfInstrsPerColor &In,
                              BlockToSetOfInstrsPerColor &Out,
                              BlockToInstrPerColor &Gen,
                              BlockToSetOfInstrsPerColor &ReachableUses) {
-  for (BlockToSetOfInstrsPerColor::const_iterator IT = Out.begin(),
-                                                  End = Out.end();
-       IT != End; ++IT)
-    delete[] IT->second;
-  for (BlockToSetOfInstrsPerColor::const_iterator IT = In.begin(),
-                                                  End = In.end();
-       IT != End; ++IT)
-    delete[] IT->second;
-  for (BlockToSetOfInstrsPerColor::const_iterator IT = ReachableUses.begin(),
-                                                  End = ReachableUses.end();
-       IT != End; ++IT)
-    delete[] IT->second;
-  for (BlockToInstrPerColor::const_iterator IT = Gen.begin(), End = Gen.end();
-       IT != End; ++IT)
-    delete[] IT->second;
+  for (auto &IT : Out)
+    delete[] IT.second;
+  for (auto &IT : In)
+    delete[] IT.second;
+  for (auto &IT : ReachableUses)
+    delete[] IT.second;
+  for (auto &IT : Gen)
+    delete[] IT.second;
 }
 
-/// Reaching definiton algorithm.
+/// Reaching definition algorithm.
 /// \param MF function on which the algorithm will operate.
 /// \param[out] ColorOpToReachedUses will contain the result of the reaching
 /// def algorithm.
 /// \param ADRPMode specify whether the reaching def algorithm should be tuned
 /// for ADRP optimization. \see initReachingDef for more details.
 /// \param DummyOp if not NULL, the algorithm will work at
-/// basic block scope and will set for every exposed defintion a use to
+/// basic block scope and will set for every exposed definition a use to
 /// @p DummyOp.
 /// \pre ColorOpToReachedUses is an array of at least number of registers of
 /// InstrToInstrs.
-static void reachingDef(MachineFunction *MF,
+static void reachingDef(MachineFunction &MF,
                         InstrToInstrs *ColorOpToReachedUses,
                         const MapRegToId &RegToId, bool ADRPMode = false,
-                        const MachineInstr *DummyOp = NULL) {
+                        const MachineInstr *DummyOp = nullptr) {
   // structures:
   // For each basic block.
   // Out: a set per color of definitions that reach the
@@ -511,17 +489,12 @@ static void printReachingDef(const InstrToInstrs *ColorOpToReachedUses,
       continue;
     DEBUG(dbgs() << "*** Reg " << PrintReg(IdToReg[CurReg], TRI) << " ***\n");
 
-    InstrToInstrs::const_iterator DefsIt = ColorOpToReachedUses[CurReg].begin();
-    InstrToInstrs::const_iterator DefsItEnd =
-        ColorOpToReachedUses[CurReg].end();
-    for (; DefsIt != DefsItEnd; ++DefsIt) {
+    for (const auto &DefsIt : ColorOpToReachedUses[CurReg]) {
       DEBUG(dbgs() << "Def:\n");
-      DEBUG(DefsIt->first->print(dbgs()));
+      DEBUG(DefsIt.first->print(dbgs()));
       DEBUG(dbgs() << "Reachable uses:\n");
-      for (SetOfMachineInstr::const_iterator UsesIt = DefsIt->second.begin(),
-                                             UsesItEnd = DefsIt->second.end();
-           UsesIt != UsesItEnd; ++UsesIt) {
-        DEBUG((*UsesIt)->print(dbgs()));
+      for (const MachineInstr *MI : DefsIt.second) {
+        DEBUG(MI->print(dbgs()));
       }
     }
   }
@@ -536,9 +509,9 @@ static bool canDefBePartOfLOH(const MachineInstr *Def) {
   switch (Opc) {
   default:
     return false;
-  case ARM64::ADRP:
+  case AArch64::ADRP:
     return true;
-  case ARM64::ADDXri:
+  case AArch64::ADDXri:
     // Check immediate to see if the immediate is an address.
     switch (Def->getOperand(2).getType()) {
     default:
@@ -549,7 +522,7 @@ static bool canDefBePartOfLOH(const MachineInstr *Def) {
     case MachineOperand::MO_BlockAddress:
       return true;
     }
-  case ARM64::LDRXui:
+  case AArch64::LDRXui:
     // Check immediate to see if the immediate is an address.
     switch (Def->getOperand(2).getType()) {
     default:
@@ -568,13 +541,13 @@ static bool isCandidateStore(const MachineInstr *Instr) {
   switch (Instr->getOpcode()) {
   default:
     return false;
-  case ARM64::STRBui:
-  case ARM64::STRHui:
-  case ARM64::STRWui:
-  case ARM64::STRXui:
-  case ARM64::STRSui:
-  case ARM64::STRDui:
-  case ARM64::STRQui:
+  case AArch64::STRBui:
+  case AArch64::STRHui:
+  case AArch64::STRWui:
+  case AArch64::STRXui:
+  case AArch64::STRSui:
+  case AArch64::STRDui:
+  case AArch64::STRQui:
     // In case we have str xA, [xA, #imm], this is two different uses
     // of xA and we cannot fold, otherwise the xA stored may be wrong,
     // even if #imm == 0.
@@ -584,7 +557,7 @@ static bool isCandidateStore(const MachineInstr *Instr) {
   return false;
 }
 
-/// Given the result of a reaching defintion algorithm in ColorOpToReachedUses,
+/// Given the result of a reaching definition algorithm in ColorOpToReachedUses,
 /// Build the Use to Defs information and filter out obvious non-LOH candidates.
 /// In ADRPMode, non-LOH candidates are "uses" with non-ADRP definitions.
 /// In non-ADRPMode, non-LOH candidates are "uses" with several definition,
@@ -603,34 +576,29 @@ static void reachedUsesToDefs(InstrToInstrs &UseToReachingDefs,
     if (ColorOpToReachedUses[CurReg].empty())
       continue;
 
-    InstrToInstrs::const_iterator DefsIt = ColorOpToReachedUses[CurReg].begin();
-    InstrToInstrs::const_iterator DefsItEnd =
-        ColorOpToReachedUses[CurReg].end();
-    for (; DefsIt != DefsItEnd; ++DefsIt) {
-      for (SetOfMachineInstr::const_iterator UsesIt = DefsIt->second.begin(),
-                                             UsesItEnd = DefsIt->second.end();
-           UsesIt != UsesItEnd; ++UsesIt) {
-        const MachineInstr *Def = DefsIt->first;
+    for (const auto &DefsIt : ColorOpToReachedUses[CurReg]) {
+      for (const MachineInstr *MI : DefsIt.second) {
+        const MachineInstr *Def = DefsIt.first;
         MapRegToId::const_iterator It;
         // if all the reaching defs are not adrp, this use will not be
         // simplifiable.
-        if ((ADRPMode && Def->getOpcode() != ARM64::ADRP) ||
+        if ((ADRPMode && Def->getOpcode() != AArch64::ADRP) ||
             (!ADRPMode && !canDefBePartOfLOH(Def)) ||
-            (!ADRPMode && isCandidateStore(*UsesIt) &&
+            (!ADRPMode && isCandidateStore(MI) &&
              // store are LOH candidate iff the end of the chain is used as
              // base.
-             ((It = RegToId.find((*UsesIt)->getOperand(1).getReg())) == EndIt ||
+             ((It = RegToId.find((MI)->getOperand(1).getReg())) == EndIt ||
               It->second != CurReg))) {
-          NotCandidate.insert(*UsesIt);
+          NotCandidate.insert(MI);
           continue;
         }
         // Do not consider self reaching as a simplifiable case for ADRP.
-        if (!ADRPMode || *UsesIt != DefsIt->first) {
-          UseToReachingDefs[*UsesIt].insert(DefsIt->first);
+        if (!ADRPMode || MI != DefsIt.first) {
+          UseToReachingDefs[MI].insert(DefsIt.first);
           // If UsesIt has several reaching definitions, it is not
           // candidate for simplificaton in non-ADRPMode.
-          if (!ADRPMode && UseToReachingDefs[*UsesIt].size() > 1)
-            NotCandidate.insert(*UsesIt);
+          if (!ADRPMode && UseToReachingDefs[MI].size() > 1)
+            NotCandidate.insert(MI);
         }
       }
     }
@@ -647,10 +615,10 @@ static void reachedUsesToDefs(InstrToInstrs &UseToReachingDefs,
 /// Based on the use to defs information (in ADRPMode), compute the
 /// opportunities of LOH ADRP-related.
 static void computeADRP(const InstrToInstrs &UseToDefs,
-                        ARM64FunctionInfo &ARM64FI,
+                        AArch64FunctionInfo &AArch64FI,
                         const MachineDominatorTree *MDT) {
   DEBUG(dbgs() << "*** Compute LOH for ADRP\n");
-  for (const auto &Entry: UseToDefs) {
+  for (const auto &Entry : UseToDefs) {
     unsigned Size = Entry.second.size();
     if (Size == 0)
       continue;
@@ -666,7 +634,7 @@ static void computeADRP(const InstrToInstrs &UseToDefs,
       SmallVector<const MachineInstr *, 2> Args;
       Args.push_back(L2);
       Args.push_back(L1);
-      ARM64FI.addLOHDirective(MCLOH_AdrpAdrp, Args);
+      AArch64FI.addLOHDirective(MCLOH_AdrpAdrp, Args);
       ++NumADRPSimpleCandidate;
     }
 #ifdef DEBUG
@@ -688,19 +656,19 @@ static bool isCandidateLoad(const MachineInstr *Instr) {
   switch (Instr->getOpcode()) {
   default:
     return false;
-  case ARM64::LDRSBWui:
-  case ARM64::LDRSBXui:
-  case ARM64::LDRSHWui:
-  case ARM64::LDRSHXui:
-  case ARM64::LDRSWui:
-  case ARM64::LDRBui:
-  case ARM64::LDRHui:
-  case ARM64::LDRWui:
-  case ARM64::LDRXui:
-  case ARM64::LDRSui:
-  case ARM64::LDRDui:
-  case ARM64::LDRQui:
-    if (Instr->getOperand(2).getTargetFlags() & ARM64II::MO_GOT)
+  case AArch64::LDRSBWui:
+  case AArch64::LDRSBXui:
+  case AArch64::LDRSHWui:
+  case AArch64::LDRSHXui:
+  case AArch64::LDRSWui:
+  case AArch64::LDRBui:
+  case AArch64::LDRHui:
+  case AArch64::LDRWui:
+  case AArch64::LDRXui:
+  case AArch64::LDRSui:
+  case AArch64::LDRDui:
+  case AArch64::LDRQui:
+    if (Instr->getOperand(2).getTargetFlags() & AArch64II::MO_GOT)
       return false;
     return true;
   }
@@ -713,12 +681,12 @@ static bool supportLoadFromLiteral(const MachineInstr *Instr) {
   switch (Instr->getOpcode()) {
   default:
     return false;
-  case ARM64::LDRSWui:
-  case ARM64::LDRWui:
-  case ARM64::LDRXui:
-  case ARM64::LDRSui:
-  case ARM64::LDRDui:
-  case ARM64::LDRQui:
+  case AArch64::LDRSWui:
+  case AArch64::LDRWui:
+  case AArch64::LDRXui:
+  case AArch64::LDRSui:
+  case AArch64::LDRDui:
+  case AArch64::LDRQui:
     return true;
   }
   // Unreachable.
@@ -737,7 +705,7 @@ static bool isCandidate(const MachineInstr *Instr,
     return false;
 
   const MachineInstr *Def = *UseToDefs.find(Instr)->second.begin();
-  if (Def->getOpcode() != ARM64::ADRP) {
+  if (Def->getOpcode() != AArch64::ADRP) {
     // At this point, Def is ADDXri or LDRXui of the right type of
     // symbol, because we filtered out the uses that were not defined
     // by these kind of instructions (+ ADRP).
@@ -747,8 +715,9 @@ static bool isCandidate(const MachineInstr *Instr,
     if (!MDT->dominates(Def, Instr))
       return false;
     // Move one node up in the simple chain.
-    if (UseToDefs.find(Def) == UseToDefs.end()
-                               // The map may contain garbage we have to ignore.
+    if (UseToDefs.find(Def) ==
+            UseToDefs.end()
+            // The map may contain garbage we have to ignore.
         ||
         UseToDefs.find(Def)->second.empty())
       return false;
@@ -759,52 +728,52 @@ static bool isCandidate(const MachineInstr *Instr,
   // - top is ADRP.
   // - check the simple chain property: each intermediate node must
   // dominates the next one.
-  if (Def->getOpcode() == ARM64::ADRP)
+  if (Def->getOpcode() == AArch64::ADRP)
     return MDT->dominates(Def, Instr);
   return false;
 }
 
-static bool registerADRCandidate(const MachineInstr *Use,
+static bool registerADRCandidate(const MachineInstr &Use,
                                  const InstrToInstrs &UseToDefs,
                                  const InstrToInstrs *DefsPerColorToUses,
-                                 ARM64FunctionInfo &ARM64FI,
+                                 AArch64FunctionInfo &AArch64FI,
                                  SetOfMachineInstr *InvolvedInLOHs,
                                  const MapRegToId &RegToId) {
   // Look for opportunities to turn ADRP -> ADD or
   // ADRP -> LDR GOTPAGEOFF into ADR.
   // If ADRP has more than one use. Give up.
-  if (Use->getOpcode() != ARM64::ADDXri &&
-      (Use->getOpcode() != ARM64::LDRXui ||
-       !(Use->getOperand(2).getTargetFlags() & ARM64II::MO_GOT)))
+  if (Use.getOpcode() != AArch64::ADDXri &&
+      (Use.getOpcode() != AArch64::LDRXui ||
+       !(Use.getOperand(2).getTargetFlags() & AArch64II::MO_GOT)))
     return false;
-  InstrToInstrs::const_iterator It = UseToDefs.find(Use);
+  InstrToInstrs::const_iterator It = UseToDefs.find(&Use);
   // The map may contain garbage that we need to ignore.
   if (It == UseToDefs.end() || It->second.empty())
     return false;
-  const MachineInstr *Def = *It->second.begin();
-  if (Def->getOpcode() != ARM64::ADRP)
+  const MachineInstr &Def = **It->second.begin();
+  if (Def.getOpcode() != AArch64::ADRP)
     return false;
   // Check the number of users of ADRP.
   const SetOfMachineInstr *Users =
       getUses(DefsPerColorToUses,
-              RegToId.find(Def->getOperand(0).getReg())->second, Def);
+              RegToId.find(Def.getOperand(0).getReg())->second, Def);
   if (Users->size() > 1) {
     ++NumADRComplexCandidate;
     return false;
   }
   ++NumADRSimpleCandidate;
-  assert((!InvolvedInLOHs || InvolvedInLOHs->insert(Def)) &&
+  assert((!InvolvedInLOHs || InvolvedInLOHs->insert(&Def)) &&
          "ADRP already involved in LOH.");
-  assert((!InvolvedInLOHs || InvolvedInLOHs->insert(Use)) &&
+  assert((!InvolvedInLOHs || InvolvedInLOHs->insert(&Use)) &&
          "ADD already involved in LOH.");
-  DEBUG(dbgs() << "Record AdrpAdd\n" << *Def << '\n' << *Use << '\n');
+  DEBUG(dbgs() << "Record AdrpAdd\n" << Def << '\n' << Use << '\n');
 
   SmallVector<const MachineInstr *, 2> Args;
-  Args.push_back(Def);
-  Args.push_back(Use);
+  Args.push_back(&Def);
+  Args.push_back(&Use);
 
-  ARM64FI.addLOHDirective(Use->getOpcode() == ARM64::ADDXri ? MCLOH_AdrpAdd
-                                                            : MCLOH_AdrpLdrGot,
+  AArch64FI.addLOHDirective(Use.getOpcode() == AArch64::ADDXri ? MCLOH_AdrpAdd
+                                                           : MCLOH_AdrpLdrGot,
                           Args);
   return true;
 }
@@ -813,9 +782,9 @@ static bool registerADRCandidate(const MachineInstr *Use,
 /// opportunities of LOH non-ADRP-related
 static void computeOthers(const InstrToInstrs &UseToDefs,
                           const InstrToInstrs *DefsPerColorToUses,
-                          ARM64FunctionInfo &ARM64FI, const MapRegToId &RegToId,
+                          AArch64FunctionInfo &AArch64FI, const MapRegToId &RegToId,
                           const MachineDominatorTree *MDT) {
-  SetOfMachineInstr *InvolvedInLOHs = NULL;
+  SetOfMachineInstr *InvolvedInLOHs = nullptr;
 #ifdef DEBUG
   SetOfMachineInstr InvolvedInLOHsStorage;
   InvolvedInLOHs = &InvolvedInLOHsStorage;
@@ -831,20 +800,18 @@ static void computeOthers(const InstrToInstrs &UseToDefs,
   // to be changed.
   SetOfMachineInstr PotentialCandidates;
   SetOfMachineInstr PotentialADROpportunities;
-  for (InstrToInstrs::const_iterator UseIt = UseToDefs.begin(),
-                                     EndUseIt = UseToDefs.end();
-       UseIt != EndUseIt; ++UseIt) {
+  for (auto &Use : UseToDefs) {
     // If no definition is available, this is a non candidate.
-    if (UseIt->second.empty())
+    if (Use.second.empty())
       continue;
     // Keep only instructions that are load or store and at the end of
     // a ADRP -> ADD/LDR/Nothing chain.
     // We already filtered out the no-chain cases.
-    if (!isCandidate(UseIt->first, UseToDefs, MDT)) {
-      PotentialADROpportunities.insert(UseIt->first);
+    if (!isCandidate(Use.first, UseToDefs, MDT)) {
+      PotentialADROpportunities.insert(Use.first);
       continue;
     }
-    PotentialCandidates.insert(UseIt->first);
+    PotentialCandidates.insert(Use.first);
   }
 
   // Make the following distinctions for statistics as the linker does
@@ -862,38 +829,34 @@ static void computeOthers(const InstrToInstrs &UseToDefs,
   // PotentialCandidates are result of a chain ADRP -> ADD/LDR ->
   // A potential candidate becomes a candidate, if its current immediate
   // operand is zero and all nodes of the chain have respectively only one user
-  SetOfMachineInstr::const_iterator CandidateIt, EndCandidateIt;
 #ifdef DEBUG
   SetOfMachineInstr DefsOfPotentialCandidates;
 #endif
-  for (CandidateIt = PotentialCandidates.begin(),
-      EndCandidateIt = PotentialCandidates.end();
-       CandidateIt != EndCandidateIt; ++CandidateIt) {
-    const MachineInstr *Candidate = *CandidateIt;
+  for (const MachineInstr *Candidate : PotentialCandidates) {
     // Get the definition of the candidate i.e., ADD or LDR.
     const MachineInstr *Def = *UseToDefs.find(Candidate)->second.begin();
     // Record the elements of the chain.
     const MachineInstr *L1 = Def;
-    const MachineInstr *L2 = NULL;
+    const MachineInstr *L2 = nullptr;
     unsigned ImmediateDefOpc = Def->getOpcode();
-    if (Def->getOpcode() != ARM64::ADRP) {
+    if (Def->getOpcode() != AArch64::ADRP) {
       // Check the number of users of this node.
       const SetOfMachineInstr *Users =
           getUses(DefsPerColorToUses,
-                  RegToId.find(Def->getOperand(0).getReg())->second, Def);
+                  RegToId.find(Def->getOperand(0).getReg())->second, *Def);
       if (Users->size() > 1) {
 #ifdef DEBUG
         // if all the uses of this def are in potential candidate, this is
         // a complex candidate of level 2.
-        SetOfMachineInstr::const_iterator UseIt = Users->begin();
-        SetOfMachineInstr::const_iterator EndUseIt = Users->end();
-        for (; UseIt != EndUseIt; ++UseIt) {
-          if (!PotentialCandidates.count(*UseIt)) {
+        bool IsLevel2 = true;
+        for (const MachineInstr *MI : *Users) {
+          if (!PotentialCandidates.count(MI)) {
             ++NumTooCplxLvl2;
+            IsLevel2 = false;
             break;
           }
         }
-        if (UseIt == EndUseIt)
+        if (IsLevel2)
           ++NumCplxLvl2;
 #endif // DEBUG
         PotentialADROpportunities.insert(Def);
@@ -908,7 +871,7 @@ static void computeOthers(const InstrToInstrs &UseToDefs,
     // Check the number of users of the first node in the chain, i.e., ADRP
     const SetOfMachineInstr *Users =
         getUses(DefsPerColorToUses,
-                RegToId.find(Def->getOperand(0).getReg())->second, Def);
+                RegToId.find(Def->getOperand(0).getReg())->second, *Def);
     if (Users->size() > 1) {
 #ifdef DEBUG
       // if all the uses of this def are in the defs of the potential candidate,
@@ -923,7 +886,7 @@ static void computeOthers(const InstrToInstrs &UseToDefs,
         }
       }
       bool Found = false;
-      for (auto &Use: *Users) {
+      for (auto &Use : *Users) {
         if (!DefsOfPotentialCandidates.count(Use)) {
           ++NumTooCplxLvl1;
           Found = true;
@@ -936,15 +899,15 @@ static void computeOthers(const InstrToInstrs &UseToDefs,
       continue;
     }
 
-    bool IsL2Add = (ImmediateDefOpc == ARM64::ADDXri);
+    bool IsL2Add = (ImmediateDefOpc == AArch64::ADDXri);
     // If the chain is three instructions long and ldr is the second element,
     // then this ldr must load form GOT, otherwise this is not a correct chain.
-    if (L2 && !IsL2Add && L2->getOperand(2).getTargetFlags() != ARM64II::MO_GOT)
+    if (L2 && !IsL2Add && L2->getOperand(2).getTargetFlags() != AArch64II::MO_GOT)
       continue;
     SmallVector<const MachineInstr *, 3> Args;
     MCLOHType Kind;
     if (isCandidateLoad(Candidate)) {
-      if (L2 == NULL) {
+      if (!L2) {
         // At this point, the candidate LOH indicates that the ldr instruction
         // may use a direct access to the symbol. There is not such encoding
         // for loads of byte and half.
@@ -981,18 +944,18 @@ static void computeOthers(const InstrToInstrs &UseToDefs,
 #ifdef DEBUG
         // get the immediate of the load
         if (Candidate->getOperand(2).getImm() == 0)
-          if (ImmediateDefOpc == ARM64::ADDXri)
+          if (ImmediateDefOpc == AArch64::ADDXri)
             ++NumADDToLDR;
           else
             ++NumLDRToLDR;
-        else if (ImmediateDefOpc == ARM64::ADDXri)
+        else if (ImmediateDefOpc == AArch64::ADDXri)
           ++NumADDToLDRWithImm;
         else
           ++NumLDRToLDRWithImm;
 #endif // DEBUG
       }
     } else {
-      if (ImmediateDefOpc == ARM64::ADRP)
+      if (ImmediateDefOpc == AArch64::ADRP)
         continue;
       else {
 
@@ -1015,23 +978,23 @@ static void computeOthers(const InstrToInstrs &UseToDefs,
 #ifdef DEBUG
         // get the immediate of the store
         if (Candidate->getOperand(2).getImm() == 0)
-          if (ImmediateDefOpc == ARM64::ADDXri)
+          if (ImmediateDefOpc == AArch64::ADDXri)
             ++NumADDToSTR;
           else
             ++NumLDRToSTR;
-        else if (ImmediateDefOpc == ARM64::ADDXri)
+        else if (ImmediateDefOpc == AArch64::ADDXri)
           ++NumADDToSTRWithImm;
         else
           ++NumLDRToSTRWithImm;
 #endif // DEBUG
       }
     }
-    ARM64FI.addLOHDirective(Kind, Args);
+    AArch64FI.addLOHDirective(Kind, Args);
   }
 
   // Now, we grabbed all the big patterns, check ADR opportunities.
-  for (const MachineInstr *Candidate: PotentialADROpportunities)
-    registerADRCandidate(Candidate, UseToDefs, DefsPerColorToUses, ARM64FI,
+  for (const MachineInstr *Candidate : PotentialADROpportunities)
+    registerADRCandidate(*Candidate, UseToDefs, DefsPerColorToUses, AArch64FI,
                          InvolvedInLOHs, RegToId);
 }
 
@@ -1053,18 +1016,14 @@ static void collectInvolvedReg(MachineFunction &MF, MapRegToId &RegToId,
   }
 
   DEBUG(dbgs() << "** Collect Involved Register\n");
-  for (MachineFunction::const_iterator IMBB = MF.begin(), IMBBEnd = MF.end();
-       IMBB != IMBBEnd; ++IMBB)
-    for (MachineBasicBlock::const_iterator II = IMBB->begin(),
-                                           IEnd = IMBB->end();
-         II != IEnd; ++II) {
-
-      if (!canDefBePartOfLOH(II))
+  for (const auto &MBB : MF) {
+    for (const MachineInstr &MI : MBB) {
+      if (!canDefBePartOfLOH(&MI))
         continue;
 
       // Process defs
-      for (MachineInstr::const_mop_iterator IO = II->operands_begin(),
-                                            IOEnd = II->operands_end();
+      for (MachineInstr::const_mop_iterator IO = MI.operands_begin(),
+                                            IOEnd = MI.operands_end();
            IO != IOEnd; ++IO) {
         if (!IO->isReg() || !IO->isDef())
           continue;
@@ -1079,31 +1038,32 @@ static void collectInvolvedReg(MachineFunction &MF, MapRegToId &RegToId,
           }
       }
     }
+  }
 }
 
-bool ARM64CollectLOH::runOnMachineFunction(MachineFunction &Fn) {
-  const TargetMachine &TM = Fn.getTarget();
+bool AArch64CollectLOH::runOnMachineFunction(MachineFunction &MF) {
+  const TargetMachine &TM = MF.getTarget();
   const TargetRegisterInfo *TRI = TM.getRegisterInfo();
   const MachineDominatorTree *MDT = &getAnalysis<MachineDominatorTree>();
 
   MapRegToId RegToId;
   MapIdToReg IdToReg;
-  ARM64FunctionInfo *ARM64FI = Fn.getInfo<ARM64FunctionInfo>();
-  assert(ARM64FI && "No MachineFunctionInfo for this function!");
+  AArch64FunctionInfo *AArch64FI = MF.getInfo<AArch64FunctionInfo>();
+  assert(AArch64FI && "No MachineFunctionInfo for this function!");
 
-  DEBUG(dbgs() << "Looking for LOH in " << Fn.getName() << '\n');
+  DEBUG(dbgs() << "Looking for LOH in " << MF.getName() << '\n');
 
-  collectInvolvedReg(Fn, RegToId, IdToReg, TRI);
+  collectInvolvedReg(MF, RegToId, IdToReg, TRI);
   if (RegToId.empty())
     return false;
 
-  MachineInstr *DummyOp = NULL;
+  MachineInstr *DummyOp = nullptr;
   if (BasicBlockScopeOnly) {
-    const ARM64InstrInfo *TII =
-        static_cast<const ARM64InstrInfo *>(TM.getInstrInfo());
+    const AArch64InstrInfo *TII =
+        static_cast<const AArch64InstrInfo *>(TM.getInstrInfo());
     // For local analysis, create a dummy operation to record uses that are not
     // local.
-    DummyOp = Fn.CreateMachineInstr(TII->get(ARM64::COPY), DebugLoc());
+    DummyOp = MF.CreateMachineInstr(TII->get(AArch64::COPY), DebugLoc());
   }
 
   unsigned NbReg = RegToId.size();
@@ -1114,7 +1074,7 @@ bool ARM64CollectLOH::runOnMachineFunction(MachineFunction &Fn) {
 
   // Compute the reaching def in ADRP mode, meaning ADRP definitions
   // are first considered as uses.
-  reachingDef(&Fn, ColorOpToReachedUses, RegToId, true, DummyOp);
+  reachingDef(MF, ColorOpToReachedUses, RegToId, true, DummyOp);
   DEBUG(dbgs() << "ADRP reaching defs\n");
   DEBUG(printReachingDef(ColorOpToReachedUses, NbReg, TRI, IdToReg));
 
@@ -1124,14 +1084,14 @@ bool ARM64CollectLOH::runOnMachineFunction(MachineFunction &Fn) {
   reachedUsesToDefs(ADRPToReachingDefs, ColorOpToReachedUses, RegToId, true);
 
   // Compute LOH for ADRP.
-  computeADRP(ADRPToReachingDefs, *ARM64FI, MDT);
+  computeADRP(ADRPToReachingDefs, *AArch64FI, MDT);
   delete[] ColorOpToReachedUses;
 
   // Continue with general ADRP -> ADD/LDR -> LDR/STR pattern.
   ColorOpToReachedUses = new InstrToInstrs[NbReg];
 
   // first perform a regular reaching def analysis.
-  reachingDef(&Fn, ColorOpToReachedUses, RegToId, false, DummyOp);
+  reachingDef(MF, ColorOpToReachedUses, RegToId, false, DummyOp);
   DEBUG(dbgs() << "All reaching defs\n");
   DEBUG(printReachingDef(ColorOpToReachedUses, NbReg, TRI, IdToReg));
 
@@ -1140,18 +1100,18 @@ bool ARM64CollectLOH::runOnMachineFunction(MachineFunction &Fn) {
   reachedUsesToDefs(UsesToReachingDefs, ColorOpToReachedUses, RegToId, false);
 
   // Compute other than AdrpAdrp LOH.
-  computeOthers(UsesToReachingDefs, ColorOpToReachedUses, *ARM64FI, RegToId,
+  computeOthers(UsesToReachingDefs, ColorOpToReachedUses, *AArch64FI, RegToId,
                 MDT);
   delete[] ColorOpToReachedUses;
 
   if (BasicBlockScopeOnly)
-    Fn.DeleteMachineInstr(DummyOp);
+    MF.DeleteMachineInstr(DummyOp);
 
   return Modified;
 }
 
-/// createARM64CollectLOHPass - returns an instance of the Statistic for
+/// createAArch64CollectLOHPass - returns an instance of the Statistic for
 /// linker optimization pass.
-FunctionPass *llvm::createARM64CollectLOHPass() {
-  return new ARM64CollectLOH();
+FunctionPass *llvm::createAArch64CollectLOHPass() {
+  return new AArch64CollectLOH();
 }
diff --git a/lib/Target/ARM64/ARM64ConditionalCompares.cpp b/lib/Target/AArch64/AArch64ConditionalCompares.cpp
index b495afa..452cdec 100644
--- a/lib/Target/ARM64/ARM64ConditionalCompares.cpp
+++ b/lib/Target/AArch64/AArch64ConditionalCompares.cpp
@@ -1,4 +1,4 @@
-//===-- ARM64ConditionalCompares.cpp --- CCMP formation for ARM64 ---------===//
+//===-- AArch64ConditionalCompares.cpp --- CCMP formation for AArch64 -----===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,7 +7,7 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This file implements the ARM64ConditionalCompares pass which reduces
+// This file implements the AArch64ConditionalCompares pass which reduces
 // branching and code size by using the conditional compare instructions CCMP,
 // CCMN, and FCMP.
 //
@@ -17,8 +17,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "arm64-ccmp"
-#include "ARM64.h"
+#include "AArch64.h"
 #include "llvm/ADT/BitVector.h"
 #include "llvm/ADT/DepthFirstIterator.h"
 #include "llvm/ADT/SetVector.h"
@@ -43,14 +42,16 @@
 
 using namespace llvm;
 
+#define DEBUG_TYPE "aarch64-ccmp"
+
 // Absolute maximum number of instructions allowed per speculated block.
 // This bypasses all other heuristics, so it should be set fairly high.
 static cl::opt<unsigned> BlockInstrLimit(
-    "arm64-ccmp-limit", cl::init(30), cl::Hidden,
+    "aarch64-ccmp-limit", cl::init(30), cl::Hidden,
     cl::desc("Maximum number of instructions per speculated block."));
 
 // Stress testing mode - disable heuristics.
-static cl::opt<bool> Stress("arm64-stress-ccmp", cl::Hidden,
+static cl::opt<bool> Stress("aarch64-stress-ccmp", cl::Hidden,
                             cl::desc("Turn all knobs to 11"));
 
 STATISTIC(NumConsidered, "Number of ccmps considered");
@@ -62,8 +63,8 @@ STATISTIC(NumCmpBranchRejs, "Number of ccmps rejected (CmpBB branch)");
 STATISTIC(NumCmpTermRejs, "Number of ccmps rejected (CmpBB is cbz...)");
 STATISTIC(NumImmRangeRejs, "Number of ccmps rejected (Imm out of range)");
 STATISTIC(NumLiveDstRejs, "Number of ccmps rejected (Cmp dest live)");
-STATISTIC(NumMultCPSRUses, "Number of ccmps rejected (CPSR used)");
-STATISTIC(NumUnknCPSRDefs, "Number of ccmps rejected (CPSR def unknown)");
+STATISTIC(NumMultNZCVUses, "Number of ccmps rejected (NZCV used)");
+STATISTIC(NumUnknNZCVDefs, "Number of ccmps rejected (NZCV def unknown)");
 
 STATISTIC(NumSpeculateRejs, "Number of ccmps rejected (Can't speculate)");
 
@@ -97,7 +98,7 @@ STATISTIC(NumCompBranches, "Number of cbz/cbnz branches converted");
 //
 // The cmp-conversion turns the compare instruction in CmpBB into a conditional
 // compare, and merges CmpBB into Head, speculatively executing its
-// instructions. The ARM64 conditional compare instructions have an immediate
+// instructions. The AArch64 conditional compare instructions have an immediate
 // operand that specifies the NZCV flag values when the condition is false and
 // the compare isn't executed. This makes it possible to chain compares with
 // different condition codes.
@@ -147,7 +148,7 @@ public:
   /// else.
   MachineBasicBlock *Head;
 
-  /// The block containing cmp+br.cond with a sucessor shared with Head.
+  /// The block containing cmp+br.cond with a successor shared with Head.
   MachineBasicBlock *CmpBB;
 
   /// The common successor for Head and CmpBB.
@@ -161,13 +162,13 @@ private:
   SmallVector<MachineOperand, 4> HeadCond;
 
   /// The condition code that makes Head branch to CmpBB.
-  ARM64CC::CondCode HeadCmpBBCC;
+  AArch64CC::CondCode HeadCmpBBCC;
 
   /// The branch condition in CmpBB.
   SmallVector<MachineOperand, 4> CmpBBCond;
 
   /// The condition code that makes CmpBB branch to Tail.
-  ARM64CC::CondCode CmpBBTailCC;
+  AArch64CC::CondCode CmpBBTailCC;
 
   /// Check if the Tail PHIs are trivially convertible.
   bool trivialTailPHIs();
@@ -212,13 +213,14 @@ public:
 // Check that all PHIs in Tail are selecting the same value from Head and CmpBB.
 // This means that no if-conversion is required when merging CmpBB into Head.
 bool SSACCmpConv::trivialTailPHIs() {
-  for (MachineBasicBlock::iterator I = Tail->begin(), E = Tail->end();
-       I != E && I->isPHI(); ++I) {
+  for (auto &I : *Tail) {
+    if (!I.isPHI())
+      break;
     unsigned HeadReg = 0, CmpBBReg = 0;
     // PHI operands come in (VReg, MBB) pairs.
-    for (unsigned oi = 1, oe = I->getNumOperands(); oi != oe; oi += 2) {
-      MachineBasicBlock *MBB = I->getOperand(oi + 1).getMBB();
-      unsigned Reg = I->getOperand(oi).getReg();
+    for (unsigned oi = 1, oe = I.getNumOperands(); oi != oe; oi += 2) {
+      MachineBasicBlock *MBB = I.getOperand(oi + 1).getMBB();
+      unsigned Reg = I.getOperand(oi).getReg();
       if (MBB == Head) {
         assert((!HeadReg || HeadReg == Reg) && "Inconsistent PHI operands");
         HeadReg = Reg;
@@ -237,24 +239,25 @@ bool SSACCmpConv::trivialTailPHIs() {
 // Assuming that trivialTailPHIs() is true, update the Tail PHIs by simply
 // removing the CmpBB operands. The Head operands will be identical.
 void SSACCmpConv::updateTailPHIs() {
-  for (MachineBasicBlock::iterator I = Tail->begin(), E = Tail->end();
-       I != E && I->isPHI(); ++I) {
+  for (auto &I : *Tail) {
+    if (!I.isPHI())
+      break;
     // I is a PHI. It can have multiple entries for CmpBB.
-    for (unsigned oi = I->getNumOperands(); oi > 2; oi -= 2) {
+    for (unsigned oi = I.getNumOperands(); oi > 2; oi -= 2) {
       // PHI operands are (Reg, MBB) at (oi-2, oi-1).
-      if (I->getOperand(oi - 1).getMBB() == CmpBB) {
-        I->RemoveOperand(oi - 1);
-        I->RemoveOperand(oi - 2);
+      if (I.getOperand(oi - 1).getMBB() == CmpBB) {
+        I.RemoveOperand(oi - 1);
+        I.RemoveOperand(oi - 2);
       }
     }
   }
 }
 
-// This pass runs before the ARM64DeadRegisterDefinitions pass, so compares are
-// still writing virtual registers without any uses.
+// This pass runs before the AArch64DeadRegisterDefinitions pass, so compares
+// are still writing virtual registers without any uses.
 bool SSACCmpConv::isDeadDef(unsigned DstReg) {
   // Writes to the zero register are dead.
-  if (DstReg == ARM64::WZR || DstReg == ARM64::XZR)
+  if (DstReg == AArch64::WZR || DstReg == AArch64::XZR)
     return true;
   if (!TargetRegisterInfo::isVirtualRegister(DstReg))
     return false;
@@ -266,11 +269,11 @@ bool SSACCmpConv::isDeadDef(unsigned DstReg) {
 // Parse a condition code returned by AnalyzeBranch, and compute the CondCode
 // corresponding to TBB.
 // Return
-static bool parseCond(ArrayRef<MachineOperand> Cond, ARM64CC::CondCode &CC) {
+static bool parseCond(ArrayRef<MachineOperand> Cond, AArch64CC::CondCode &CC) {
   // A normal br.cond simply has the condition code.
   if (Cond[0].getImm() != -1) {
     assert(Cond.size() == 1 && "Unknown Cond array format");
-    CC = (ARM64CC::CondCode)(int)Cond[0].getImm();
+    CC = (AArch64CC::CondCode)(int)Cond[0].getImm();
     return true;
   }
   // For tbz and cbz instruction, the opcode is next.
@@ -279,15 +282,15 @@ static bool parseCond(ArrayRef<MachineOperand> Cond, ARM64CC::CondCode &CC) {
     // This includes tbz / tbnz branches which can't be converted to
     // ccmp + br.cond.
     return false;
-  case ARM64::CBZW:
-  case ARM64::CBZX:
+  case AArch64::CBZW:
+  case AArch64::CBZX:
     assert(Cond.size() == 3 && "Unknown Cond array format");
-    CC = ARM64CC::EQ;
+    CC = AArch64CC::EQ;
     return true;
-  case ARM64::CBNZW:
-  case ARM64::CBNZX:
+  case AArch64::CBNZW:
+  case AArch64::CBNZX:
     assert(Cond.size() == 3 && "Unknown Cond array format");
-    CC = ARM64CC::NE;
+    CC = AArch64CC::NE;
     return true;
   }
 }
@@ -295,20 +298,20 @@ static bool parseCond(ArrayRef<MachineOperand> Cond, ARM64CC::CondCode &CC) {
 MachineInstr *SSACCmpConv::findConvertibleCompare(MachineBasicBlock *MBB) {
   MachineBasicBlock::iterator I = MBB->getFirstTerminator();
   if (I == MBB->end())
-    return 0;
+    return nullptr;
   // The terminator must be controlled by the flags.
-  if (!I->readsRegister(ARM64::CPSR)) {
+  if (!I->readsRegister(AArch64::NZCV)) {
     switch (I->getOpcode()) {
-    case ARM64::CBZW:
-    case ARM64::CBZX:
-    case ARM64::CBNZW:
-    case ARM64::CBNZX:
+    case AArch64::CBZW:
+    case AArch64::CBZX:
+    case AArch64::CBNZW:
+    case AArch64::CBNZX:
       // These can be converted into a ccmp against #0.
       return I;
     }
     ++NumCmpTermRejs;
     DEBUG(dbgs() << "Flags not used by terminator: " << *I);
-    return 0;
+    return nullptr;
   }
 
   // Now find the instruction controlling the terminator.
@@ -317,56 +320,56 @@ MachineInstr *SSACCmpConv::findConvertibleCompare(MachineBasicBlock *MBB) {
     assert(!I->isTerminator() && "Spurious terminator");
     switch (I->getOpcode()) {
     // cmp is an alias for subs with a dead destination register.
-    case ARM64::SUBSWri:
-    case ARM64::SUBSXri:
+    case AArch64::SUBSWri:
+    case AArch64::SUBSXri:
     // cmn is an alias for adds with a dead destination register.
-    case ARM64::ADDSWri:
-    case ARM64::ADDSXri:
+    case AArch64::ADDSWri:
+    case AArch64::ADDSXri:
       // Check that the immediate operand is within range, ccmp wants a uimm5.
       // Rd = SUBSri Rn, imm, shift
       if (I->getOperand(3).getImm() || !isUInt<5>(I->getOperand(2).getImm())) {
         DEBUG(dbgs() << "Immediate out of range for ccmp: " << *I);
         ++NumImmRangeRejs;
-        return 0;
+        return nullptr;
       }
     // Fall through.
-    case ARM64::SUBSWrr:
-    case ARM64::SUBSXrr:
-    case ARM64::ADDSWrr:
-    case ARM64::ADDSXrr:
+    case AArch64::SUBSWrr:
+    case AArch64::SUBSXrr:
+    case AArch64::ADDSWrr:
+    case AArch64::ADDSXrr:
       if (isDeadDef(I->getOperand(0).getReg()))
         return I;
       DEBUG(dbgs() << "Can't convert compare with live destination: " << *I);
       ++NumLiveDstRejs;
-      return 0;
-    case ARM64::FCMPSrr:
-    case ARM64::FCMPDrr:
-    case ARM64::FCMPESrr:
-    case ARM64::FCMPEDrr:
+      return nullptr;
+    case AArch64::FCMPSrr:
+    case AArch64::FCMPDrr:
+    case AArch64::FCMPESrr:
+    case AArch64::FCMPEDrr:
       return I;
     }
 
     // Check for flag reads and clobbers.
     MIOperands::PhysRegInfo PRI =
-        MIOperands(I).analyzePhysReg(ARM64::CPSR, TRI);
+        MIOperands(I).analyzePhysReg(AArch64::NZCV, TRI);
 
     if (PRI.Reads) {
       // The ccmp doesn't produce exactly the same flags as the original
       // compare, so reject the transform if there are uses of the flags
       // besides the terminators.
       DEBUG(dbgs() << "Can't create ccmp with multiple uses: " << *I);
-      ++NumMultCPSRUses;
-      return 0;
+      ++NumMultNZCVUses;
+      return nullptr;
     }
 
     if (PRI.Clobbers) {
       DEBUG(dbgs() << "Not convertible compare: " << *I);
-      ++NumUnknCPSRDefs;
-      return 0;
+      ++NumUnknNZCVDefs;
+      return nullptr;
     }
   }
   DEBUG(dbgs() << "Flags not defined in BB#" << MBB->getNumber() << '\n');
-  return 0;
+  return nullptr;
 }
 
 /// Determine if all the instructions in MBB can safely
@@ -376,7 +379,7 @@ MachineInstr *SSACCmpConv::findConvertibleCompare(MachineBasicBlock *MBB) {
 ///
 bool SSACCmpConv::canSpeculateInstrs(MachineBasicBlock *MBB,
                                      const MachineInstr *CmpMI) {
-  // Reject any live-in physregs. It's probably CPSR/EFLAGS, and very hard to
+  // Reject any live-in physregs. It's probably NZCV/EFLAGS, and very hard to
   // get right.
   if (!MBB->livein_empty()) {
     DEBUG(dbgs() << "BB#" << MBB->getNumber() << " has live-ins.\n");
@@ -387,10 +390,8 @@ bool SSACCmpConv::canSpeculateInstrs(MachineBasicBlock *MBB,
 
   // Check all instructions, except the terminators. It is assumed that
   // terminators never have side effects or define any used register values.
-  for (MachineBasicBlock::iterator I = MBB->begin(),
-                                   E = MBB->getFirstTerminator();
-       I != E; ++I) {
-    if (I->isDebugValue())
+  for (auto &I : make_range(MBB->begin(), MBB->getFirstTerminator())) {
+    if (I.isDebugValue())
       continue;
 
     if (++InstrCount > BlockInstrLimit && !Stress) {
@@ -400,29 +401,29 @@ bool SSACCmpConv::canSpeculateInstrs(MachineBasicBlock *MBB,
     }
 
     // There shouldn't normally be any phis in a single-predecessor block.
-    if (I->isPHI()) {
-      DEBUG(dbgs() << "Can't hoist: " << *I);
+    if (I.isPHI()) {
+      DEBUG(dbgs() << "Can't hoist: " << I);
       return false;
     }
 
     // Don't speculate loads. Note that it may be possible and desirable to
     // speculate GOT or constant pool loads that are guaranteed not to trap,
     // but we don't support that for now.
-    if (I->mayLoad()) {
-      DEBUG(dbgs() << "Won't speculate load: " << *I);
+    if (I.mayLoad()) {
+      DEBUG(dbgs() << "Won't speculate load: " << I);
       return false;
     }
 
     // We never speculate stores, so an AA pointer isn't necessary.
     bool DontMoveAcrossStore = true;
-    if (!I->isSafeToMove(TII, 0, DontMoveAcrossStore)) {
-      DEBUG(dbgs() << "Can't speculate: " << *I);
+    if (!I.isSafeToMove(TII, nullptr, DontMoveAcrossStore)) {
+      DEBUG(dbgs() << "Can't speculate: " << I);
       return false;
     }
 
-    // Only CmpMI is alowed to clobber the flags.
-    if (&*I != CmpMI && I->modifiesRegister(ARM64::CPSR, TRI)) {
-      DEBUG(dbgs() << "Clobbers flags: " << *I);
+    // Only CmpMI is allowed to clobber the flags.
+    if (&I != CmpMI && I.modifiesRegister(AArch64::NZCV, TRI)) {
+      DEBUG(dbgs() << "Clobbers flags: " << I);
       return false;
     }
   }
@@ -434,7 +435,7 @@ bool SSACCmpConv::canSpeculateInstrs(MachineBasicBlock *MBB,
 ///
 bool SSACCmpConv::canConvert(MachineBasicBlock *MBB) {
   Head = MBB;
-  Tail = CmpBB = 0;
+  Tail = CmpBB = nullptr;
 
   if (Head->succ_size() != 2)
     return false;
@@ -494,7 +495,7 @@ bool SSACCmpConv::canConvert(MachineBasicBlock *MBB) {
 
   // The branch we're looking to eliminate must be analyzable.
   HeadCond.clear();
-  MachineBasicBlock *TBB = 0, *FBB = 0;
+  MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
   if (TII->AnalyzeBranch(*Head, TBB, FBB, HeadCond)) {
     DEBUG(dbgs() << "Head branch not analyzable.\n");
     ++NumHeadBranchRejs;
@@ -518,11 +519,11 @@ bool SSACCmpConv::canConvert(MachineBasicBlock *MBB) {
   // Make sure the branch direction is right.
   if (TBB != CmpBB) {
     assert(TBB == Tail && "Unexpected TBB");
-    HeadCmpBBCC = ARM64CC::getInvertedCondCode(HeadCmpBBCC);
+    HeadCmpBBCC = AArch64CC::getInvertedCondCode(HeadCmpBBCC);
   }
 
   CmpBBCond.clear();
-  TBB = FBB = 0;
+  TBB = FBB = nullptr;
   if (TII->AnalyzeBranch(*CmpBB, TBB, FBB, CmpBBCond)) {
     DEBUG(dbgs() << "CmpBB branch not analyzable.\n");
     ++NumCmpBranchRejs;
@@ -542,10 +543,10 @@ bool SSACCmpConv::canConvert(MachineBasicBlock *MBB) {
   }
 
   if (TBB != Tail)
-    CmpBBTailCC = ARM64CC::getInvertedCondCode(CmpBBTailCC);
+    CmpBBTailCC = AArch64CC::getInvertedCondCode(CmpBBTailCC);
 
-  DEBUG(dbgs() << "Head->CmpBB on " << ARM64CC::getCondCodeName(HeadCmpBBCC)
-               << ", CmpBB->Tail on " << ARM64CC::getCondCodeName(CmpBBTailCC)
+  DEBUG(dbgs() << "Head->CmpBB on " << AArch64CC::getCondCodeName(HeadCmpBBCC)
+               << ", CmpBB->Tail on " << AArch64CC::getCondCodeName(CmpBBTailCC)
                << '\n');
 
   CmpMI = findConvertibleCompare(CmpBB);
@@ -578,13 +579,13 @@ void SSACCmpConv::convert(SmallVectorImpl<MachineBasicBlock *> &RemovedBlocks) {
     ++NumCompBranches;
     unsigned Opc = 0;
     switch (HeadCond[1].getImm()) {
-    case ARM64::CBZW:
-    case ARM64::CBNZW:
-      Opc = ARM64::SUBSWri;
+    case AArch64::CBZW:
+    case AArch64::CBNZW:
+      Opc = AArch64::SUBSWri;
       break;
-    case ARM64::CBZX:
-    case ARM64::CBNZX:
-      Opc = ARM64::SUBSXri;
+    case AArch64::CBZX:
+    case AArch64::CBNZX:
+      Opc = AArch64::SUBSXri;
       break;
     default:
       llvm_unreachable("Cannot convert Head branch");
@@ -614,27 +615,27 @@ void SSACCmpConv::convert(SmallVectorImpl<MachineBasicBlock *> &RemovedBlocks) {
   switch (CmpMI->getOpcode()) {
   default:
     llvm_unreachable("Unknown compare opcode");
-  case ARM64::SUBSWri:    Opc = ARM64::CCMPWi; break;
-  case ARM64::SUBSWrr:    Opc = ARM64::CCMPWr; break;
-  case ARM64::SUBSXri:    Opc = ARM64::CCMPXi; break;
-  case ARM64::SUBSXrr:    Opc = ARM64::CCMPXr; break;
-  case ARM64::ADDSWri:    Opc = ARM64::CCMNWi; break;
-  case ARM64::ADDSWrr:    Opc = ARM64::CCMNWr; break;
-  case ARM64::ADDSXri:    Opc = ARM64::CCMNXi; break;
-  case ARM64::ADDSXrr:    Opc = ARM64::CCMNXr; break;
-  case ARM64::FCMPSrr:    Opc = ARM64::FCCMPSrr; FirstOp = 0; break;
-  case ARM64::FCMPDrr:    Opc = ARM64::FCCMPDrr; FirstOp = 0; break;
-  case ARM64::FCMPESrr:   Opc = ARM64::FCCMPESrr; FirstOp = 0; break;
-  case ARM64::FCMPEDrr:   Opc = ARM64::FCCMPEDrr; FirstOp = 0; break;
-  case ARM64::CBZW:
-  case ARM64::CBNZW:
-    Opc = ARM64::CCMPWi;
+  case AArch64::SUBSWri:    Opc = AArch64::CCMPWi; break;
+  case AArch64::SUBSWrr:    Opc = AArch64::CCMPWr; break;
+  case AArch64::SUBSXri:    Opc = AArch64::CCMPXi; break;
+  case AArch64::SUBSXrr:    Opc = AArch64::CCMPXr; break;
+  case AArch64::ADDSWri:    Opc = AArch64::CCMNWi; break;
+  case AArch64::ADDSWrr:    Opc = AArch64::CCMNWr; break;
+  case AArch64::ADDSXri:    Opc = AArch64::CCMNXi; break;
+  case AArch64::ADDSXrr:    Opc = AArch64::CCMNXr; break;
+  case AArch64::FCMPSrr:    Opc = AArch64::FCCMPSrr; FirstOp = 0; break;
+  case AArch64::FCMPDrr:    Opc = AArch64::FCCMPDrr; FirstOp = 0; break;
+  case AArch64::FCMPESrr:   Opc = AArch64::FCCMPESrr; FirstOp = 0; break;
+  case AArch64::FCMPEDrr:   Opc = AArch64::FCCMPEDrr; FirstOp = 0; break;
+  case AArch64::CBZW:
+  case AArch64::CBNZW:
+    Opc = AArch64::CCMPWi;
     FirstOp = 0;
     isZBranch = true;
     break;
-  case ARM64::CBZX:
-  case ARM64::CBNZX:
-    Opc = ARM64::CCMPXi;
+  case AArch64::CBZX:
+  case AArch64::CBNZX:
+    Opc = AArch64::CCMPXi;
     FirstOp = 0;
     isZBranch = true;
     break;
@@ -645,7 +646,7 @@ void SSACCmpConv::convert(SmallVectorImpl<MachineBasicBlock *> &RemovedBlocks) {
   // The NZCV immediate operand should provide flags for the case where Head
   // would have branched to Tail. These flags should cause the new Head
   // terminator to branch to tail.
-  unsigned NZCV = ARM64CC::getNZCVToSatisfyCondCode(CmpBBTailCC);
+  unsigned NZCV = AArch64CC::getNZCVToSatisfyCondCode(CmpBBTailCC);
   const MCInstrDesc &MCID = TII->get(Opc);
   MRI->constrainRegClass(CmpMI->getOperand(FirstOp).getReg(),
                          TII->getRegClass(MCID, 0, TRI, *MF));
@@ -664,10 +665,10 @@ void SSACCmpConv::convert(SmallVectorImpl<MachineBasicBlock *> &RemovedBlocks) {
   // If CmpMI was a terminator, we need a new conditional branch to replace it.
   // This now becomes a Head terminator.
   if (isZBranch) {
-    bool isNZ = CmpMI->getOpcode() == ARM64::CBNZW ||
-                CmpMI->getOpcode() == ARM64::CBNZX;
-    BuildMI(*Head, CmpMI, CmpMI->getDebugLoc(), TII->get(ARM64::Bcc))
-        .addImm(isNZ ? ARM64CC::NE : ARM64CC::EQ)
+    bool isNZ = CmpMI->getOpcode() == AArch64::CBNZW ||
+                CmpMI->getOpcode() == AArch64::CBNZX;
+    BuildMI(*Head, CmpMI, CmpMI->getDebugLoc(), TII->get(AArch64::Bcc))
+        .addImm(isNZ ? AArch64CC::NE : AArch64CC::EQ)
         .addOperand(CmpMI->getOperand(1)); // Branch target.
   }
   CmpMI->eraseFromParent();
@@ -686,10 +687,10 @@ int SSACCmpConv::expectedCodeSizeDelta() const {
   // plus a branch instruction.
   if (HeadCond[0].getImm() == -1) {
     switch (HeadCond[1].getImm()) {
-    case ARM64::CBZW:
-    case ARM64::CBNZW:
-    case ARM64::CBZX:
-    case ARM64::CBNZX:
+    case AArch64::CBZW:
+    case AArch64::CBNZW:
+    case AArch64::CBZX:
+    case AArch64::CBNZX:
       // Therefore delta += 1
       delta = 1;
       break;
@@ -705,21 +706,21 @@ int SSACCmpConv::expectedCodeSizeDelta() const {
   default:
     --delta;
     break;
-  case ARM64::CBZW:
-  case ARM64::CBNZW:
-  case ARM64::CBZX:
-  case ARM64::CBNZX:
+  case AArch64::CBZW:
+  case AArch64::CBNZW:
+  case AArch64::CBZX:
+  case AArch64::CBNZX:
     break;
   }
   return delta;
 }
 
 //===----------------------------------------------------------------------===//
-//                       ARM64ConditionalCompares Pass
+//                       AArch64ConditionalCompares Pass
 //===----------------------------------------------------------------------===//
 
 namespace {
-class ARM64ConditionalCompares : public MachineFunctionPass {
+class AArch64ConditionalCompares : public MachineFunctionPass {
   const TargetInstrInfo *TII;
   const TargetRegisterInfo *TRI;
   const MCSchedModel *SchedModel;
@@ -734,10 +735,12 @@ class ARM64ConditionalCompares : public MachineFunctionPass {
 
 public:
   static char ID;
-  ARM64ConditionalCompares() : MachineFunctionPass(ID) {}
-  void getAnalysisUsage(AnalysisUsage &AU) const;
-  bool runOnMachineFunction(MachineFunction &MF);
-  const char *getPassName() const { return "ARM64 Conditional Compares"; }
+  AArch64ConditionalCompares() : MachineFunctionPass(ID) {}
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+  bool runOnMachineFunction(MachineFunction &MF) override;
+  const char *getPassName() const override {
+    return "AArch64 Conditional Compares";
+  }
 
 private:
   bool tryConvert(MachineBasicBlock *);
@@ -748,25 +751,25 @@ private:
 };
 } // end anonymous namespace
 
-char ARM64ConditionalCompares::ID = 0;
+char AArch64ConditionalCompares::ID = 0;
 
 namespace llvm {
-void initializeARM64ConditionalComparesPass(PassRegistry &);
+void initializeAArch64ConditionalComparesPass(PassRegistry &);
 }
 
-INITIALIZE_PASS_BEGIN(ARM64ConditionalCompares, "arm64-ccmp", "ARM64 CCMP Pass",
-                      false, false)
+INITIALIZE_PASS_BEGIN(AArch64ConditionalCompares, "aarch64-ccmp",
+                      "AArch64 CCMP Pass", false, false)
 INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo)
 INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
 INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics)
-INITIALIZE_PASS_END(ARM64ConditionalCompares, "arm64-ccmp", "ARM64 CCMP Pass",
-                    false, false)
+INITIALIZE_PASS_END(AArch64ConditionalCompares, "aarch64-ccmp",
+                    "AArch64 CCMP Pass", false, false)
 
-FunctionPass *llvm::createARM64ConditionalCompares() {
-  return new ARM64ConditionalCompares();
+FunctionPass *llvm::createAArch64ConditionalCompares() {
+  return new AArch64ConditionalCompares();
 }
 
-void ARM64ConditionalCompares::getAnalysisUsage(AnalysisUsage &AU) const {
+void AArch64ConditionalCompares::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.addRequired<MachineBranchProbabilityInfo>();
   AU.addRequired<MachineDominatorTree>();
   AU.addPreserved<MachineDominatorTree>();
@@ -778,8 +781,8 @@ void ARM64ConditionalCompares::getAnalysisUsage(AnalysisUsage &AU) const {
 }
 
 /// Update the dominator tree after if-conversion erased some blocks.
-void
-ARM64ConditionalCompares::updateDomTree(ArrayRef<MachineBasicBlock *> Removed) {
+void AArch64ConditionalCompares::updateDomTree(
+    ArrayRef<MachineBasicBlock *> Removed) {
   // convert() removes CmpBB which was previously dominated by Head.
   // CmpBB children should be transferred to Head.
   MachineDomTreeNode *HeadNode = DomTree->getNode(CmpConv.Head);
@@ -795,7 +798,7 @@ ARM64ConditionalCompares::updateDomTree(ArrayRef<MachineBasicBlock *> Removed) {
 
 /// Update LoopInfo after if-conversion.
 void
-ARM64ConditionalCompares::updateLoops(ArrayRef<MachineBasicBlock *> Removed) {
+AArch64ConditionalCompares::updateLoops(ArrayRef<MachineBasicBlock *> Removed) {
   if (!Loops)
     return;
   for (unsigned i = 0, e = Removed.size(); i != e; ++i)
@@ -803,7 +806,7 @@ ARM64ConditionalCompares::updateLoops(ArrayRef<MachineBasicBlock *> Removed) {
 }
 
 /// Invalidate MachineTraceMetrics before if-conversion.
-void ARM64ConditionalCompares::invalidateTraces() {
+void AArch64ConditionalCompares::invalidateTraces() {
   Traces->invalidate(CmpConv.Head);
   Traces->invalidate(CmpConv.CmpBB);
 }
@@ -811,7 +814,7 @@ void ARM64ConditionalCompares::invalidateTraces() {
 /// Apply cost model and heuristics to the if-conversion in IfConv.
 /// Return true if the conversion is a good idea.
 ///
-bool ARM64ConditionalCompares::shouldConvert() {
+bool AArch64ConditionalCompares::shouldConvert() {
   // Stress testing mode disables all cost considerations.
   if (Stress)
     return true;
@@ -872,7 +875,7 @@ bool ARM64ConditionalCompares::shouldConvert() {
   return true;
 }
 
-bool ARM64ConditionalCompares::tryConvert(MachineBasicBlock *MBB) {
+bool AArch64ConditionalCompares::tryConvert(MachineBasicBlock *MBB) {
   bool Changed = false;
   while (CmpConv.canConvert(MBB) && shouldConvert()) {
     invalidateTraces();
@@ -885,8 +888,8 @@ bool ARM64ConditionalCompares::tryConvert(MachineBasicBlock *MBB) {
   return Changed;
 }
 
-bool ARM64ConditionalCompares::runOnMachineFunction(MachineFunction &MF) {
-  DEBUG(dbgs() << "********** ARM64 Conditional Compares **********\n"
+bool AArch64ConditionalCompares::runOnMachineFunction(MachineFunction &MF) {
+  DEBUG(dbgs() << "********** AArch64 Conditional Compares **********\n"
                << "********** Function: " << MF.getName() << '\n');
   TII = MF.getTarget().getInstrInfo();
   TRI = MF.getTarget().getRegisterInfo();
@@ -896,7 +899,7 @@ bool ARM64ConditionalCompares::runOnMachineFunction(MachineFunction &MF) {
   DomTree = &getAnalysis<MachineDominatorTree>();
   Loops = getAnalysisIfAvailable<MachineLoopInfo>();
   Traces = &getAnalysis<MachineTraceMetrics>();
-  MinInstr = 0;
+  MinInstr = nullptr;
   MinSize = MF.getFunction()->getAttributes().hasAttribute(
       AttributeSet::FunctionIndex, Attribute::MinSize);
 
@@ -906,11 +909,9 @@ bool ARM64ConditionalCompares::runOnMachineFunction(MachineFunction &MF) {
   // Visit blocks in dominator tree pre-order. The pre-order enables multiple
   // cmp-conversions from the same head block.
   // Note that updateDomTree() modifies the children of the DomTree node
-  // currently being visited. The df_iterator supports that, it doesn't look at
+  // currently being visited. The df_iterator supports that; it doesn't look at
   // child_begin() / child_end() until after a node has been visited.
-  for (df_iterator<MachineDominatorTree *> I = df_begin(DomTree),
-                                           E = df_end(DomTree);
-       I != E; ++I)
+  for (auto *I : depth_first(DomTree))
     if (tryConvert(I->getBlock()))
       Changed = true;
 
diff --git a/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp b/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp
new file mode 100644
index 0000000..a2d853c
--- /dev/null
+++ b/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp
@@ -0,0 +1,134 @@
+//==-- AArch64DeadRegisterDefinitions.cpp - Replace dead defs w/ zero reg --==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// When allowed by the instruction, replace a dead definition of a GPR with
+// the zero register. This makes the code a bit friendlier towards the
+// hardware's register renamer.
+//===----------------------------------------------------------------------===//
+
+#include "AArch64.h"
+#include "AArch64RegisterInfo.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-dead-defs"
+
+STATISTIC(NumDeadDefsReplaced, "Number of dead definitions replaced");
+
+namespace {
+class AArch64DeadRegisterDefinitions : public MachineFunctionPass {
+private:
+  const TargetRegisterInfo *TRI;
+  bool implicitlyDefinesOverlappingReg(unsigned Reg, const MachineInstr &MI);
+  bool processMachineBasicBlock(MachineBasicBlock &MBB);
+  bool usesFrameIndex(const MachineInstr &MI);
+public:
+  static char ID; // Pass identification, replacement for typeid.
+  explicit AArch64DeadRegisterDefinitions() : MachineFunctionPass(ID) {}
+
+  virtual bool runOnMachineFunction(MachineFunction &F) override;
+
+  const char *getPassName() const override { return "Dead register definitions"; }
+
+  virtual void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+};
+char AArch64DeadRegisterDefinitions::ID = 0;
+} // end anonymous namespace
+
+bool AArch64DeadRegisterDefinitions::implicitlyDefinesOverlappingReg(
+    unsigned Reg, const MachineInstr &MI) {
+  for (const MachineOperand &MO : MI.implicit_operands())
+    if (MO.isReg() && MO.isDef())
+      if (TRI->regsOverlap(Reg, MO.getReg()))
+        return true;
+  return false;
+}
+
+bool AArch64DeadRegisterDefinitions::usesFrameIndex(const MachineInstr &MI) {
+  for (const MachineOperand &Op : MI.uses())
+    if (Op.isFI())
+      return true;
+  return false;
+}
+
+bool AArch64DeadRegisterDefinitions::processMachineBasicBlock(
+    MachineBasicBlock &MBB) {
+  bool Changed = false;
+  for (MachineInstr &MI : MBB) {
+    if (usesFrameIndex(MI)) {
+      // We need to skip this instruction because while it appears to have a
+      // dead def it uses a frame index which might expand into a multi
+      // instruction sequence during EPI.
+      DEBUG(dbgs() << "    Ignoring, operand is frame index\n");
+      continue;
+    }
+    for (int i = 0, e = MI.getDesc().getNumDefs(); i != e; ++i) {
+      MachineOperand &MO = MI.getOperand(i);
+      if (MO.isReg() && MO.isDead() && MO.isDef()) {
+        assert(!MO.isImplicit() && "Unexpected implicit def!");
+        DEBUG(dbgs() << "  Dead def operand #" << i << " in:\n    ";
+              MI.print(dbgs()));
+        // Be careful not to change the register if it's a tied operand.
+        if (MI.isRegTiedToUseOperand(i)) {
+          DEBUG(dbgs() << "    Ignoring, def is tied operand.\n");
+          continue;
+        }
+        // Don't change the register if there's an implicit def of a subreg or
+        // supperreg.
+        if (implicitlyDefinesOverlappingReg(MO.getReg(), MI)) {
+          DEBUG(dbgs() << "    Ignoring, implicitly defines overlap reg.\n");
+          continue;
+        }
+        // Make sure the instruction take a register class that contains
+        // the zero register and replace it if so.
+        unsigned NewReg;
+        switch (MI.getDesc().OpInfo[i].RegClass) {
+        default:
+          DEBUG(dbgs() << "    Ignoring, register is not a GPR.\n");
+          continue;
+        case AArch64::GPR32RegClassID:
+          NewReg = AArch64::WZR;
+          break;
+        case AArch64::GPR64RegClassID:
+          NewReg = AArch64::XZR;
+          break;
+        }
+        DEBUG(dbgs() << "    Replacing with zero register. New:\n      ");
+        MO.setReg(NewReg);
+        DEBUG(MI.print(dbgs()));
+        ++NumDeadDefsReplaced;
+      }
+    }
+  }
+  return Changed;
+}
+
+// Scan the function for instructions that have a dead definition of a
+// register. Replace that register with the zero register when possible.
+bool AArch64DeadRegisterDefinitions::runOnMachineFunction(MachineFunction &MF) {
+  TRI = MF.getTarget().getRegisterInfo();
+  bool Changed = false;
+  DEBUG(dbgs() << "***** AArch64DeadRegisterDefinitions *****\n");
+
+  for (auto &MBB : MF)
+    if (processMachineBasicBlock(MBB))
+      Changed = true;
+  return Changed;
+}
+
+FunctionPass *llvm::createAArch64DeadRegisterDefinitions() {
+  return new AArch64DeadRegisterDefinitions();
+}
diff --git a/lib/Target/ARM64/ARM64ExpandPseudoInsts.cpp b/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
index e082baf..a76fd76 100644
--- a/lib/Target/ARM64/ARM64ExpandPseudoInsts.cpp
+++ b/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -1,4 +1,4 @@
-//===-- ARM64ExpandPseudoInsts.cpp - Expand pseudo instructions ---*- C++ -*-=//
+//==-- AArch64ExpandPseudoInsts.cpp - Expand pseudo instructions --*- C++ -*-=//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -14,25 +14,25 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "MCTargetDesc/ARM64AddressingModes.h"
-#include "ARM64InstrInfo.h"
+#include "MCTargetDesc/AArch64AddressingModes.h"
+#include "AArch64InstrInfo.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/Support/MathExtras.h"
 using namespace llvm;
 
 namespace {
-class ARM64ExpandPseudo : public MachineFunctionPass {
+class AArch64ExpandPseudo : public MachineFunctionPass {
 public:
   static char ID;
-  ARM64ExpandPseudo() : MachineFunctionPass(ID) {}
+  AArch64ExpandPseudo() : MachineFunctionPass(ID) {}
 
-  const ARM64InstrInfo *TII;
+  const AArch64InstrInfo *TII;
 
-  virtual bool runOnMachineFunction(MachineFunction &Fn);
+  bool runOnMachineFunction(MachineFunction &Fn) override;
 
-  virtual const char *getPassName() const {
-    return "ARM64 pseudo instruction expansion pass";
+  const char *getPassName() const override {
+    return "AArch64 pseudo instruction expansion pass";
   }
 
 private:
@@ -41,7 +41,7 @@ private:
   bool expandMOVImm(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
                     unsigned BitSize);
 };
-char ARM64ExpandPseudo::ID = 0;
+char AArch64ExpandPseudo::ID = 0;
 }
 
 /// \brief Transfer implicit operands on the pseudo instruction to the
@@ -87,17 +87,17 @@ static uint64_t replicateChunk(uint64_t Imm, unsigned FromIdx, unsigned ToIdx) {
 static bool tryOrrMovk(uint64_t UImm, uint64_t OrrImm, MachineInstr &MI,
                        MachineBasicBlock &MBB,
                        MachineBasicBlock::iterator &MBBI,
-                       const ARM64InstrInfo *TII, unsigned ChunkIdx) {
+                       const AArch64InstrInfo *TII, unsigned ChunkIdx) {
   assert(ChunkIdx < 4 && "Out of range chunk index specified!");
   const unsigned ShiftAmt = ChunkIdx * 16;
 
   uint64_t Encoding;
-  if (ARM64_AM::processLogicalImmediate(OrrImm, 64, Encoding)) {
+  if (AArch64_AM::processLogicalImmediate(OrrImm, 64, Encoding)) {
     // Create the ORR-immediate instruction.
     MachineInstrBuilder MIB =
-        BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM64::ORRXri))
+        BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ORRXri))
             .addOperand(MI.getOperand(0))
-            .addReg(ARM64::XZR)
+            .addReg(AArch64::XZR)
             .addImm(Encoding);
 
     // Create the MOVK instruction.
@@ -105,11 +105,11 @@ static bool tryOrrMovk(uint64_t UImm, uint64_t OrrImm, MachineInstr &MI,
     const unsigned DstReg = MI.getOperand(0).getReg();
     const bool DstIsDead = MI.getOperand(0).isDead();
     MachineInstrBuilder MIB1 =
-        BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM64::MOVKXi))
+        BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::MOVKXi))
             .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
             .addReg(DstReg)
             .addImm(Imm16)
-            .addImm(ARM64_AM::getShifterImm(ARM64_AM::LSL, ShiftAmt));
+            .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt));
 
     transferImpOps(MI, MIB, MIB1);
     MI.eraseFromParent();
@@ -124,7 +124,7 @@ static bool tryOrrMovk(uint64_t UImm, uint64_t OrrImm, MachineInstr &MI,
 static bool canUseOrr(uint64_t Chunk, uint64_t &Encoding) {
   Chunk = (Chunk << 48) | (Chunk << 32) | (Chunk << 16) | Chunk;
 
-  return ARM64_AM::processLogicalImmediate(Chunk, 64, Encoding);
+  return AArch64_AM::processLogicalImmediate(Chunk, 64, Encoding);
 }
 
 /// \brief Check for identical 16-bit chunks within the constant and if so
@@ -138,7 +138,7 @@ static bool canUseOrr(uint64_t Chunk, uint64_t &Encoding) {
 static bool tryToreplicateChunks(uint64_t UImm, MachineInstr &MI,
                                  MachineBasicBlock &MBB,
                                  MachineBasicBlock::iterator &MBBI,
-                                 const ARM64InstrInfo *TII) {
+                                 const AArch64InstrInfo *TII) {
   typedef DenseMap<uint64_t, unsigned> CountMap;
   CountMap Counts;
 
@@ -162,9 +162,9 @@ static bool tryToreplicateChunks(uint64_t UImm, MachineInstr &MI,
     const bool CountThree = Count == 3;
     // Create the ORR-immediate instruction.
     MachineInstrBuilder MIB =
-        BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM64::ORRXri))
+        BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ORRXri))
             .addOperand(MI.getOperand(0))
-            .addReg(ARM64::XZR)
+            .addReg(AArch64::XZR)
             .addImm(Encoding);
 
     const unsigned DstReg = MI.getOperand(0).getReg();
@@ -182,12 +182,12 @@ static bool tryToreplicateChunks(uint64_t UImm, MachineInstr &MI,
 
     // Create the first MOVK instruction.
     MachineInstrBuilder MIB1 =
-        BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM64::MOVKXi))
+        BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::MOVKXi))
             .addReg(DstReg,
                     RegState::Define | getDeadRegState(DstIsDead && CountThree))
             .addReg(DstReg)
             .addImm(Imm16)
-            .addImm(ARM64_AM::getShifterImm(ARM64_AM::LSL, ShiftAmt));
+            .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt));
 
     // In case we have three instances the whole constant is now materialized
     // and we can exit.
@@ -207,11 +207,11 @@ static bool tryToreplicateChunks(uint64_t UImm, MachineInstr &MI,
 
     // Create the second MOVK instruction.
     MachineInstrBuilder MIB2 =
-        BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM64::MOVKXi))
+        BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::MOVKXi))
             .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
             .addReg(DstReg)
             .addImm(Imm16)
-            .addImm(ARM64_AM::getShifterImm(ARM64_AM::LSL, ShiftAmt));
+            .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt));
 
     transferImpOps(MI, MIB, MIB2);
     MI.eraseFromParent();
@@ -272,7 +272,7 @@ static uint64_t updateImm(uint64_t Imm, unsigned Idx, bool Clear) {
 static bool trySequenceOfOnes(uint64_t UImm, MachineInstr &MI,
                               MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator &MBBI,
-                              const ARM64InstrInfo *TII) {
+                              const AArch64InstrInfo *TII) {
   const int NotSet = -1;
   const uint64_t Mask = 0xFFFF;
 
@@ -343,11 +343,11 @@ static bool trySequenceOfOnes(uint64_t UImm, MachineInstr &MI,
 
   // Create the ORR-immediate instruction.
   uint64_t Encoding = 0;
-  ARM64_AM::processLogicalImmediate(OrrImm, 64, Encoding);
+  AArch64_AM::processLogicalImmediate(OrrImm, 64, Encoding);
   MachineInstrBuilder MIB =
-      BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM64::ORRXri))
+      BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ORRXri))
           .addOperand(MI.getOperand(0))
-          .addReg(ARM64::XZR)
+          .addReg(AArch64::XZR)
           .addImm(Encoding);
 
   const unsigned DstReg = MI.getOperand(0).getReg();
@@ -356,12 +356,13 @@ static bool trySequenceOfOnes(uint64_t UImm, MachineInstr &MI,
   const bool SingleMovk = SecondMovkIdx == NotSet;
   // Create the first MOVK instruction.
   MachineInstrBuilder MIB1 =
-      BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM64::MOVKXi))
+      BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::MOVKXi))
           .addReg(DstReg,
                   RegState::Define | getDeadRegState(DstIsDead && SingleMovk))
           .addReg(DstReg)
           .addImm(getChunk(UImm, FirstMovkIdx))
-          .addImm(ARM64_AM::getShifterImm(ARM64_AM::LSL, FirstMovkIdx * 16));
+          .addImm(
+              AArch64_AM::getShifterImm(AArch64_AM::LSL, FirstMovkIdx * 16));
 
   // Early exit in case we only need to emit a single MOVK instruction.
   if (SingleMovk) {
@@ -372,11 +373,12 @@ static bool trySequenceOfOnes(uint64_t UImm, MachineInstr &MI,
 
   // Create the second MOVK instruction.
   MachineInstrBuilder MIB2 =
-      BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM64::MOVKXi))
+      BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::MOVKXi))
           .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
           .addReg(DstReg)
           .addImm(getChunk(UImm, SecondMovkIdx))
-          .addImm(ARM64_AM::getShifterImm(ARM64_AM::LSL, SecondMovkIdx * 16));
+          .addImm(
+              AArch64_AM::getShifterImm(AArch64_AM::LSL, SecondMovkIdx * 16));
 
   transferImpOps(MI, MIB, MIB2);
   MI.eraseFromParent();
@@ -385,9 +387,9 @@ static bool trySequenceOfOnes(uint64_t UImm, MachineInstr &MI,
 
 /// \brief Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more
 /// real move-immediate instructions to synthesize the immediate.
-bool ARM64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB,
-                                     MachineBasicBlock::iterator MBBI,
-                                     unsigned BitSize) {
+bool AArch64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB,
+                                       MachineBasicBlock::iterator MBBI,
+                                       unsigned BitSize) {
   MachineInstr &MI = *MBBI;
   uint64_t Imm = MI.getOperand(1).getImm();
   const unsigned Mask = 0xFFFF;
@@ -395,12 +397,12 @@ bool ARM64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB,
   // Try a MOVI instruction (aka ORR-immediate with the zero register).
   uint64_t UImm = Imm << (64 - BitSize) >> (64 - BitSize);
   uint64_t Encoding;
-  if (ARM64_AM::processLogicalImmediate(UImm, BitSize, Encoding)) {
-    unsigned Opc = (BitSize == 32 ? ARM64::ORRWri : ARM64::ORRXri);
+  if (AArch64_AM::processLogicalImmediate(UImm, BitSize, Encoding)) {
+    unsigned Opc = (BitSize == 32 ? AArch64::ORRWri : AArch64::ORRXri);
     MachineInstrBuilder MIB =
         BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc))
             .addOperand(MI.getOperand(0))
-            .addReg(BitSize == 32 ? ARM64::WZR : ARM64::XZR)
+            .addReg(BitSize == 32 ? AArch64::WZR : AArch64::XZR)
             .addImm(Encoding);
     transferImpOps(MI, MIB, MIB);
     MI.eraseFromParent();
@@ -504,9 +506,9 @@ bool ARM64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB,
   unsigned FirstOpc;
   if (BitSize == 32) {
     Imm &= (1LL << 32) - 1;
-    FirstOpc = (isNeg ? ARM64::MOVNWi : ARM64::MOVZWi);
+    FirstOpc = (isNeg ? AArch64::MOVNWi : AArch64::MOVZWi);
   } else {
-    FirstOpc = (isNeg ? ARM64::MOVNXi : ARM64::MOVZXi);
+    FirstOpc = (isNeg ? AArch64::MOVNXi : AArch64::MOVZXi);
   }
   unsigned Shift = 0;     // LSL amount for high bits with MOVZ/MOVN
   unsigned LastShift = 0; // LSL amount for last MOVK
@@ -524,7 +526,7 @@ bool ARM64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB,
           .addReg(DstReg, RegState::Define |
                               getDeadRegState(DstIsDead && Shift == LastShift))
           .addImm(Imm16)
-          .addImm(ARM64_AM::getShifterImm(ARM64_AM::LSL, Shift));
+          .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, Shift));
 
   // If a MOVN was used for the high bits of a negative value, flip the rest
   // of the bits back for use with MOVK.
@@ -538,7 +540,7 @@ bool ARM64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB,
   }
 
   MachineInstrBuilder MIB2;
-  unsigned Opc = (BitSize == 32 ? ARM64::MOVKWi : ARM64::MOVKXi);
+  unsigned Opc = (BitSize == 32 ? AArch64::MOVKWi : AArch64::MOVKXi);
   while (Shift != LastShift) {
     Shift -= 16;
     Imm16 = (Imm >> Shift) & Mask;
@@ -550,7 +552,7 @@ bool ARM64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB,
                            getDeadRegState(DstIsDead && Shift == LastShift))
                .addReg(DstReg)
                .addImm(Imm16)
-               .addImm(ARM64_AM::getShifterImm(ARM64_AM::LSL, Shift));
+               .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, Shift));
   }
 
   transferImpOps(MI, MIB1, MIB2);
@@ -560,7 +562,7 @@ bool ARM64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB,
 
 /// \brief If MBBI references a pseudo instruction that should be expanded here,
 /// do the expansion and return true.  Otherwise return false.
-bool ARM64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
+bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
                                  MachineBasicBlock::iterator MBBI) {
   MachineInstr &MI = *MBBI;
   unsigned Opcode = MI.getOpcode();
@@ -568,67 +570,76 @@ bool ARM64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
   default:
     break;
 
-  case ARM64::ADDWrr:
-  case ARM64::SUBWrr:
-  case ARM64::ADDXrr:
-  case ARM64::SUBXrr:
-  case ARM64::ADDSWrr:
-  case ARM64::SUBSWrr:
-  case ARM64::ADDSXrr:
-  case ARM64::SUBSXrr:
-  case ARM64::ANDWrr:
-  case ARM64::ANDXrr:
-  case ARM64::BICWrr:
-  case ARM64::BICXrr:
-  case ARM64::EONWrr:
-  case ARM64::EONXrr:
-  case ARM64::EORWrr:
-  case ARM64::EORXrr:
-  case ARM64::ORNWrr:
-  case ARM64::ORNXrr:
-  case ARM64::ORRWrr:
-  case ARM64::ORRXrr: {
+  case AArch64::ADDWrr:
+  case AArch64::SUBWrr:
+  case AArch64::ADDXrr:
+  case AArch64::SUBXrr:
+  case AArch64::ADDSWrr:
+  case AArch64::SUBSWrr:
+  case AArch64::ADDSXrr:
+  case AArch64::SUBSXrr:
+  case AArch64::ANDWrr:
+  case AArch64::ANDXrr:
+  case AArch64::BICWrr:
+  case AArch64::BICXrr:
+  case AArch64::ANDSWrr:
+  case AArch64::ANDSXrr:
+  case AArch64::BICSWrr:
+  case AArch64::BICSXrr:
+  case AArch64::EONWrr:
+  case AArch64::EONXrr:
+  case AArch64::EORWrr:
+  case AArch64::EORXrr:
+  case AArch64::ORNWrr:
+  case AArch64::ORNXrr:
+  case AArch64::ORRWrr:
+  case AArch64::ORRXrr: {
     unsigned Opcode;
     switch (MI.getOpcode()) {
     default:
       return false;
-    case ARM64::ADDWrr:      Opcode = ARM64::ADDWrs; break;
-    case ARM64::SUBWrr:      Opcode = ARM64::SUBWrs; break;
-    case ARM64::ADDXrr:      Opcode = ARM64::ADDXrs; break;
-    case ARM64::SUBXrr:      Opcode = ARM64::SUBXrs; break;
-    case ARM64::ADDSWrr:     Opcode = ARM64::ADDSWrs; break;
-    case ARM64::SUBSWrr:     Opcode = ARM64::SUBSWrs; break;
-    case ARM64::ADDSXrr:     Opcode = ARM64::ADDSXrs; break;
-    case ARM64::SUBSXrr:     Opcode = ARM64::SUBSXrs; break;
-    case ARM64::ANDWrr:      Opcode = ARM64::ANDWrs; break;
-    case ARM64::ANDXrr:      Opcode = ARM64::ANDXrs; break;
-    case ARM64::BICWrr:      Opcode = ARM64::BICWrs; break;
-    case ARM64::BICXrr:      Opcode = ARM64::BICXrs; break;
-    case ARM64::EONWrr:      Opcode = ARM64::EONWrs; break;
-    case ARM64::EONXrr:      Opcode = ARM64::EONXrs; break;
-    case ARM64::EORWrr:      Opcode = ARM64::EORWrs; break;
-    case ARM64::EORXrr:      Opcode = ARM64::EORXrs; break;
-    case ARM64::ORNWrr:      Opcode = ARM64::ORNWrs; break;
-    case ARM64::ORNXrr:      Opcode = ARM64::ORNXrs; break;
-    case ARM64::ORRWrr:      Opcode = ARM64::ORRWrs; break;
-    case ARM64::ORRXrr:      Opcode = ARM64::ORRXrs; break;
+    case AArch64::ADDWrr:      Opcode = AArch64::ADDWrs; break;
+    case AArch64::SUBWrr:      Opcode = AArch64::SUBWrs; break;
+    case AArch64::ADDXrr:      Opcode = AArch64::ADDXrs; break;
+    case AArch64::SUBXrr:      Opcode = AArch64::SUBXrs; break;
+    case AArch64::ADDSWrr:     Opcode = AArch64::ADDSWrs; break;
+    case AArch64::SUBSWrr:     Opcode = AArch64::SUBSWrs; break;
+    case AArch64::ADDSXrr:     Opcode = AArch64::ADDSXrs; break;
+    case AArch64::SUBSXrr:     Opcode = AArch64::SUBSXrs; break;
+    case AArch64::ANDWrr:      Opcode = AArch64::ANDWrs; break;
+    case AArch64::ANDXrr:      Opcode = AArch64::ANDXrs; break;
+    case AArch64::BICWrr:      Opcode = AArch64::BICWrs; break;
+    case AArch64::BICXrr:      Opcode = AArch64::BICXrs; break;
+    case AArch64::ANDSWrr:     Opcode = AArch64::ANDSWrs; break;
+    case AArch64::ANDSXrr:     Opcode = AArch64::ANDSXrs; break;
+    case AArch64::BICSWrr:     Opcode = AArch64::BICSWrs; break;
+    case AArch64::BICSXrr:     Opcode = AArch64::BICSXrs; break;
+    case AArch64::EONWrr:      Opcode = AArch64::EONWrs; break;
+    case AArch64::EONXrr:      Opcode = AArch64::EONXrs; break;
+    case AArch64::EORWrr:      Opcode = AArch64::EORWrs; break;
+    case AArch64::EORXrr:      Opcode = AArch64::EORXrs; break;
+    case AArch64::ORNWrr:      Opcode = AArch64::ORNWrs; break;
+    case AArch64::ORNXrr:      Opcode = AArch64::ORNXrs; break;
+    case AArch64::ORRWrr:      Opcode = AArch64::ORRWrs; break;
+    case AArch64::ORRXrr:      Opcode = AArch64::ORRXrs; break;
     }
     MachineInstrBuilder MIB1 =
         BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opcode),
                 MI.getOperand(0).getReg())
             .addOperand(MI.getOperand(1))
             .addOperand(MI.getOperand(2))
-            .addImm(ARM64_AM::getShifterImm(ARM64_AM::LSL, 0));
+            .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
     transferImpOps(MI, MIB1, MIB1);
     MI.eraseFromParent();
     return true;
   }
 
-  case ARM64::FCVTSHpseudo: {
+  case AArch64::FCVTSHpseudo: {
     MachineOperand Src = MI.getOperand(1);
     Src.setImplicit();
-    unsigned SrcH = TII->getRegisterInfo().getSubReg(Src.getReg(), ARM64::hsub);
-    auto MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM64::FCVTSHr))
+    unsigned SrcH =
+        TII->getRegisterInfo().getSubReg(Src.getReg(), AArch64::hsub);
+    auto MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::FCVTSHr))
                    .addOperand(MI.getOperand(0))
                    .addReg(SrcH, RegState::Undef)
                    .addOperand(Src);
@@ -636,33 +647,34 @@ bool ARM64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
     MI.eraseFromParent();
     return true;
   }
-  case ARM64::LOADgot: {
+  case AArch64::LOADgot: {
     // Expand into ADRP + LDR.
     unsigned DstReg = MI.getOperand(0).getReg();
     const MachineOperand &MO1 = MI.getOperand(1);
     unsigned Flags = MO1.getTargetFlags();
     MachineInstrBuilder MIB1 =
-        BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM64::ADRP), DstReg);
+        BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ADRP), DstReg);
     MachineInstrBuilder MIB2 =
-        BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM64::LDRXui))
+        BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::LDRXui))
             .addOperand(MI.getOperand(0))
             .addReg(DstReg);
 
     if (MO1.isGlobal()) {
-      MIB1.addGlobalAddress(MO1.getGlobal(), 0, Flags | ARM64II::MO_PAGE);
+      MIB1.addGlobalAddress(MO1.getGlobal(), 0, Flags | AArch64II::MO_PAGE);
       MIB2.addGlobalAddress(MO1.getGlobal(), 0,
-                            Flags | ARM64II::MO_PAGEOFF | ARM64II::MO_NC);
+                            Flags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
     } else if (MO1.isSymbol()) {
-      MIB1.addExternalSymbol(MO1.getSymbolName(), Flags | ARM64II::MO_PAGE);
+      MIB1.addExternalSymbol(MO1.getSymbolName(), Flags | AArch64II::MO_PAGE);
       MIB2.addExternalSymbol(MO1.getSymbolName(),
-                             Flags | ARM64II::MO_PAGEOFF | ARM64II::MO_NC);
+                             Flags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
     } else {
       assert(MO1.isCPI() &&
              "Only expect globals, externalsymbols, or constant pools");
       MIB1.addConstantPoolIndex(MO1.getIndex(), MO1.getOffset(),
-                                Flags | ARM64II::MO_PAGE);
+                                Flags | AArch64II::MO_PAGE);
       MIB2.addConstantPoolIndex(MO1.getIndex(), MO1.getOffset(),
-                                Flags | ARM64II::MO_PAGEOFF | ARM64II::MO_NC);
+                                Flags | AArch64II::MO_PAGEOFF |
+                                    AArch64II::MO_NC);
     }
 
     transferImpOps(MI, MIB1, MIB2);
@@ -670,20 +682,20 @@ bool ARM64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
     return true;
   }
 
-  case ARM64::MOVaddr:
-  case ARM64::MOVaddrJT:
-  case ARM64::MOVaddrCP:
-  case ARM64::MOVaddrBA:
-  case ARM64::MOVaddrTLS:
-  case ARM64::MOVaddrEXT: {
+  case AArch64::MOVaddr:
+  case AArch64::MOVaddrJT:
+  case AArch64::MOVaddrCP:
+  case AArch64::MOVaddrBA:
+  case AArch64::MOVaddrTLS:
+  case AArch64::MOVaddrEXT: {
     // Expand into ADRP + ADD.
     unsigned DstReg = MI.getOperand(0).getReg();
     MachineInstrBuilder MIB1 =
-        BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM64::ADRP), DstReg)
+        BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ADRP), DstReg)
             .addOperand(MI.getOperand(1));
 
     MachineInstrBuilder MIB2 =
-        BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM64::ADDXri))
+        BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ADDXri))
             .addOperand(MI.getOperand(0))
             .addReg(DstReg)
             .addOperand(MI.getOperand(2))
@@ -694,13 +706,13 @@ bool ARM64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
     return true;
   }
 
-  case ARM64::MOVi32imm:
+  case AArch64::MOVi32imm:
     return expandMOVImm(MBB, MBBI, 32);
-  case ARM64::MOVi64imm:
+  case AArch64::MOVi64imm:
     return expandMOVImm(MBB, MBBI, 64);
-  case ARM64::RET_ReallyLR:
-    BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM64::RET))
-        .addReg(ARM64::LR);
+  case AArch64::RET_ReallyLR:
+    BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::RET))
+        .addReg(AArch64::LR);
     MI.eraseFromParent();
     return true;
   }
@@ -709,7 +721,7 @@ bool ARM64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
 
 /// \brief Iterate over the instructions in basic block MBB and expand any
 /// pseudo instructions.  Return true if anything was modified.
-bool ARM64ExpandPseudo::expandMBB(MachineBasicBlock &MBB) {
+bool AArch64ExpandPseudo::expandMBB(MachineBasicBlock &MBB) {
   bool Modified = false;
 
   MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
@@ -722,8 +734,8 @@ bool ARM64ExpandPseudo::expandMBB(MachineBasicBlock &MBB) {
   return Modified;
 }
 
-bool ARM64ExpandPseudo::runOnMachineFunction(MachineFunction &MF) {
-  TII = static_cast<const ARM64InstrInfo *>(MF.getTarget().getInstrInfo());
+bool AArch64ExpandPseudo::runOnMachineFunction(MachineFunction &MF) {
+  TII = static_cast<const AArch64InstrInfo *>(MF.getTarget().getInstrInfo());
 
   bool Modified = false;
   for (auto &MBB : MF)
@@ -732,6 +744,6 @@ bool ARM64ExpandPseudo::runOnMachineFunction(MachineFunction &MF) {
 }
 
 /// \brief Returns an instance of the pseudo instruction expansion pass.
-FunctionPass *llvm::createARM64ExpandPseudoPass() {
-  return new ARM64ExpandPseudo();
+FunctionPass *llvm::createAArch64ExpandPseudoPass() {
+  return new AArch64ExpandPseudo();
 }
diff --git a/lib/Target/ARM64/ARM64FastISel.cpp b/lib/Target/AArch64/AArch64FastISel.cpp
index 51b0f76..c3b5369 100644
--- a/lib/Target/ARM64/ARM64FastISel.cpp
+++ b/lib/Target/AArch64/AArch64FastISel.cpp
@@ -1,4 +1,4 @@
-//===-- ARM6464FastISel.cpp - ARM64 FastISel implementation ---------------===//
+//===-- AArch6464FastISel.cpp - AArch64 FastISel implementation -----------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,17 +7,16 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This file defines the ARM64-specific support for the FastISel class. Some
+// This file defines the AArch64-specific support for the FastISel class. Some
 // of the target-specific code is generated by tablegen in the file
-// ARM64GenFastISel.inc, which is #included here.
+// AArch64GenFastISel.inc, which is #included here.
 //
 //===----------------------------------------------------------------------===//
 
-#include "ARM64.h"
-#include "ARM64TargetMachine.h"
-#include "ARM64Subtarget.h"
-#include "ARM64CallingConv.h"
-#include "MCTargetDesc/ARM64AddressingModes.h"
+#include "AArch64.h"
+#include "AArch64TargetMachine.h"
+#include "AArch64Subtarget.h"
+#include "MCTargetDesc/AArch64AddressingModes.h"
 #include "llvm/CodeGen/CallingConvLower.h"
 #include "llvm/CodeGen/FastISel.h"
 #include "llvm/CodeGen/FunctionLoweringInfo.h"
@@ -40,7 +39,7 @@ using namespace llvm;
 
 namespace {
 
-class ARM64FastISel : public FastISel {
+class AArch64FastISel : public FastISel {
 
   class Address {
   public:
@@ -85,9 +84,9 @@ class ARM64FastISel : public FastISel {
     bool isValid() { return isFIBase() || (isRegBase() && getReg() != 0); }
   };
 
-  /// Subtarget - Keep a pointer to the ARM64Subtarget around so that we can
+  /// Subtarget - Keep a pointer to the AArch64Subtarget around so that we can
   /// make the right decision when generating code for different targets.
-  const ARM64Subtarget *Subtarget;
+  const AArch64Subtarget *Subtarget;
   LLVMContext *Context;
 
 private:
@@ -130,8 +129,8 @@ private:
   unsigned EmitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT, bool isZExt);
   unsigned Emiti1Ext(unsigned SrcReg, MVT DestVT, bool isZExt);
 
-  unsigned ARM64MaterializeFP(const ConstantFP *CFP, MVT VT);
-  unsigned ARM64MaterializeGV(const GlobalValue *GV);
+  unsigned AArch64MaterializeFP(const ConstantFP *CFP, MVT VT);
+  unsigned AArch64MaterializeGV(const GlobalValue *GV);
 
   // Call handling routines.
 private:
@@ -147,32 +146,32 @@ private:
 
 public:
   // Backend specific FastISel code.
-  virtual unsigned TargetMaterializeAlloca(const AllocaInst *AI);
-  virtual unsigned TargetMaterializeConstant(const Constant *C);
+  unsigned TargetMaterializeAlloca(const AllocaInst *AI) override;
+  unsigned TargetMaterializeConstant(const Constant *C) override;
 
-  explicit ARM64FastISel(FunctionLoweringInfo &funcInfo,
+  explicit AArch64FastISel(FunctionLoweringInfo &funcInfo,
                          const TargetLibraryInfo *libInfo)
       : FastISel(funcInfo, libInfo) {
-    Subtarget = &TM.getSubtarget<ARM64Subtarget>();
+    Subtarget = &TM.getSubtarget<AArch64Subtarget>();
     Context = &funcInfo.Fn->getContext();
   }
 
-  virtual bool TargetSelectInstruction(const Instruction *I);
+  bool TargetSelectInstruction(const Instruction *I) override;
 
-#include "ARM64GenFastISel.inc"
+#include "AArch64GenFastISel.inc"
 };
 
 } // end anonymous namespace
 
-#include "ARM64GenCallingConv.inc"
+#include "AArch64GenCallingConv.inc"
 
-CCAssignFn *ARM64FastISel::CCAssignFnForCall(CallingConv::ID CC) const {
+CCAssignFn *AArch64FastISel::CCAssignFnForCall(CallingConv::ID CC) const {
   if (CC == CallingConv::WebKit_JS)
-    return CC_ARM64_WebKit_JS;
-  return Subtarget->isTargetDarwin() ? CC_ARM64_DarwinPCS : CC_ARM64_AAPCS;
+    return CC_AArch64_WebKit_JS;
+  return Subtarget->isTargetDarwin() ? CC_AArch64_DarwinPCS : CC_AArch64_AAPCS;
 }
 
-unsigned ARM64FastISel::TargetMaterializeAlloca(const AllocaInst *AI) {
+unsigned AArch64FastISel::TargetMaterializeAlloca(const AllocaInst *AI) {
   assert(TLI.getValueType(AI->getType(), true) == MVT::i64 &&
          "Alloca should always return a pointer.");
 
@@ -184,8 +183,8 @@ unsigned ARM64FastISel::TargetMaterializeAlloca(const AllocaInst *AI) {
       FuncInfo.StaticAllocaMap.find(AI);
 
   if (SI != FuncInfo.StaticAllocaMap.end()) {
-    unsigned ResultReg = createResultReg(&ARM64::GPR64RegClass);
-    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::ADDXri),
+    unsigned ResultReg = createResultReg(&AArch64::GPR64RegClass);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADDXri),
             ResultReg)
         .addFrameIndex(SI->second)
         .addImm(0)
@@ -196,7 +195,10 @@ unsigned ARM64FastISel::TargetMaterializeAlloca(const AllocaInst *AI) {
   return 0;
 }
 
-unsigned ARM64FastISel::ARM64MaterializeFP(const ConstantFP *CFP, MVT VT) {
+unsigned AArch64FastISel::AArch64MaterializeFP(const ConstantFP *CFP, MVT VT) {
+  if (VT != MVT::f32 && VT != MVT::f64)
+    return 0;
+
   const APFloat Val = CFP->getValueAPF();
   bool is64bit = (VT == MVT::f64);
 
@@ -206,11 +208,11 @@ unsigned ARM64FastISel::ARM64MaterializeFP(const ConstantFP *CFP, MVT VT) {
     int Imm;
     unsigned Opc;
     if (is64bit) {
-      Imm = ARM64_AM::getFP64Imm(Val);
-      Opc = ARM64::FMOVDi;
+      Imm = AArch64_AM::getFP64Imm(Val);
+      Opc = AArch64::FMOVDi;
     } else {
-      Imm = ARM64_AM::getFP32Imm(Val);
-      Opc = ARM64::FMOVSi;
+      Imm = AArch64_AM::getFP32Imm(Val);
+      Opc = AArch64::FMOVSi;
     }
     unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT));
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
@@ -225,24 +227,29 @@ unsigned ARM64FastISel::ARM64MaterializeFP(const ConstantFP *CFP, MVT VT) {
     Align = DL.getTypeAllocSize(CFP->getType());
 
   unsigned Idx = MCP.getConstantPoolIndex(cast<Constant>(CFP), Align);
-  unsigned ADRPReg = createResultReg(&ARM64::GPR64RegClass);
-  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::ADRP),
-          ADRPReg).addConstantPoolIndex(Idx, 0, ARM64II::MO_PAGE);
+  unsigned ADRPReg = createResultReg(&AArch64::GPR64commonRegClass);
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADRP),
+          ADRPReg).addConstantPoolIndex(Idx, 0, AArch64II::MO_PAGE);
 
-  unsigned Opc = is64bit ? ARM64::LDRDui : ARM64::LDRSui;
+  unsigned Opc = is64bit ? AArch64::LDRDui : AArch64::LDRSui;
   unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT));
   BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
       .addReg(ADRPReg)
-      .addConstantPoolIndex(Idx, 0, ARM64II::MO_PAGEOFF | ARM64II::MO_NC);
+      .addConstantPoolIndex(Idx, 0, AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
   return ResultReg;
 }
 
-unsigned ARM64FastISel::ARM64MaterializeGV(const GlobalValue *GV) {
+unsigned AArch64FastISel::AArch64MaterializeGV(const GlobalValue *GV) {
   // We can't handle thread-local variables quickly yet. Unfortunately we have
   // to peer through any aliases to find out if that rule applies.
   const GlobalValue *TLSGV = GV;
   if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
-    TLSGV = GA->getAliasedGlobal();
+    TLSGV = GA->getAliasee();
+
+  // MachO still uses GOT for large code-model accesses, but ELF requires
+  // movz/movk sequences, which FastISel doesn't handle yet.
+  if (TM.getCodeModel() != CodeModel::Small && !Subtarget->isTargetMachO())
+    return 0;
 
   if (const GlobalVariable *GVar = dyn_cast<GlobalVariable>(TLSGV))
     if (GVar->isThreadLocal())
@@ -253,35 +260,38 @@ unsigned ARM64FastISel::ARM64MaterializeGV(const GlobalValue *GV) {
   EVT DestEVT = TLI.getValueType(GV->getType(), true);
   if (!DestEVT.isSimple())
     return 0;
-  MVT DestVT = DestEVT.getSimpleVT();
 
-  unsigned ADRPReg = createResultReg(&ARM64::GPR64RegClass);
-  unsigned ResultReg = createResultReg(TLI.getRegClassFor(DestVT));
+  unsigned ADRPReg = createResultReg(&AArch64::GPR64commonRegClass);
+  unsigned ResultReg;
 
-  if (OpFlags & ARM64II::MO_GOT) {
+  if (OpFlags & AArch64II::MO_GOT) {
     // ADRP + LDRX
-    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::ADRP),
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADRP),
             ADRPReg)
-        .addGlobalAddress(GV, 0, ARM64II::MO_GOT | ARM64II::MO_PAGE);
-    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::LDRXui),
+        .addGlobalAddress(GV, 0, AArch64II::MO_GOT | AArch64II::MO_PAGE);
+
+    ResultReg = createResultReg(&AArch64::GPR64RegClass);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::LDRXui),
             ResultReg)
         .addReg(ADRPReg)
-        .addGlobalAddress(GV, 0, ARM64II::MO_GOT | ARM64II::MO_PAGEOFF |
-                                     ARM64II::MO_NC);
+        .addGlobalAddress(GV, 0, AArch64II::MO_GOT | AArch64II::MO_PAGEOFF |
+                          AArch64II::MO_NC);
   } else {
     // ADRP + ADDX
-    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::ADRP),
-            ADRPReg).addGlobalAddress(GV, 0, ARM64II::MO_PAGE);
-    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::ADDXri),
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADRP),
+            ADRPReg).addGlobalAddress(GV, 0, AArch64II::MO_PAGE);
+
+    ResultReg = createResultReg(&AArch64::GPR64spRegClass);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADDXri),
             ResultReg)
         .addReg(ADRPReg)
-        .addGlobalAddress(GV, 0, ARM64II::MO_PAGEOFF | ARM64II::MO_NC)
+        .addGlobalAddress(GV, 0, AArch64II::MO_PAGEOFF | AArch64II::MO_NC)
         .addImm(0);
   }
   return ResultReg;
 }
 
-unsigned ARM64FastISel::TargetMaterializeConstant(const Constant *C) {
+unsigned AArch64FastISel::TargetMaterializeConstant(const Constant *C) {
   EVT CEVT = TLI.getValueType(C->getType(), true);
 
   // Only handle simple types.
@@ -291,16 +301,16 @@ unsigned ARM64FastISel::TargetMaterializeConstant(const Constant *C) {
 
   // FIXME: Handle ConstantInt.
   if (const ConstantFP *CFP = dyn_cast<ConstantFP>(C))
-    return ARM64MaterializeFP(CFP, VT);
+    return AArch64MaterializeFP(CFP, VT);
   else if (const GlobalValue *GV = dyn_cast<GlobalValue>(C))
-    return ARM64MaterializeGV(GV);
+    return AArch64MaterializeGV(GV);
 
   return 0;
 }
 
 // Computes the address to get to an object.
-bool ARM64FastISel::ComputeAddress(const Value *Obj, Address &Addr) {
-  const User *U = NULL;
+bool AArch64FastISel::ComputeAddress(const Value *Obj, Address &Addr) {
+  const User *U = nullptr;
   unsigned Opcode = Instruction::UserOp1;
   if (const Instruction *I = dyn_cast<Instruction>(Obj)) {
     // Don't walk into other basic blocks unless the object is an alloca from
@@ -407,7 +417,7 @@ bool ARM64FastISel::ComputeAddress(const Value *Obj, Address &Addr) {
   return Addr.isValid();
 }
 
-bool ARM64FastISel::isTypeLegal(Type *Ty, MVT &VT) {
+bool AArch64FastISel::isTypeLegal(Type *Ty, MVT &VT) {
   EVT evt = TLI.getValueType(Ty, true);
 
   // Only handle simple types.
@@ -415,12 +425,16 @@ bool ARM64FastISel::isTypeLegal(Type *Ty, MVT &VT) {
     return false;
   VT = evt.getSimpleVT();
 
-  // Handle all legal types, i.e. a register that will directly hold this
+  // This is a legal type, but it's not something we handle in fast-isel.
+  if (VT == MVT::f128)
+    return false;
+
+  // Handle all other legal types, i.e. a register that will directly hold this
   // value.
   return TLI.isTypeLegal(VT);
 }
 
-bool ARM64FastISel::isLoadStoreTypeLegal(Type *Ty, MVT &VT) {
+bool AArch64FastISel::isLoadStoreTypeLegal(Type *Ty, MVT &VT) {
   if (isTypeLegal(Ty, VT))
     return true;
 
@@ -432,8 +446,8 @@ bool ARM64FastISel::isLoadStoreTypeLegal(Type *Ty, MVT &VT) {
   return false;
 }
 
-bool ARM64FastISel::SimplifyAddress(Address &Addr, MVT VT, int64_t ScaleFactor,
-                                    bool UseUnscaled) {
+bool AArch64FastISel::SimplifyAddress(Address &Addr, MVT VT,
+                                      int64_t ScaleFactor, bool UseUnscaled) {
   bool needsLowering = false;
   int64_t Offset = Addr.getOffset();
   switch (VT.SimpleTy) {
@@ -476,9 +490,9 @@ bool ARM64FastISel::SimplifyAddress(Address &Addr, MVT VT, int64_t ScaleFactor,
   return true;
 }
 
-void ARM64FastISel::AddLoadStoreOperands(Address &Addr,
-                                         const MachineInstrBuilder &MIB,
-                                         unsigned Flags, bool UseUnscaled) {
+void AArch64FastISel::AddLoadStoreOperands(Address &Addr,
+                                           const MachineInstrBuilder &MIB,
+                                           unsigned Flags, bool UseUnscaled) {
   int64_t Offset = Addr.getOffset();
   // Frame base works a bit differently. Handle it separately.
   if (Addr.getKind() == Address::FrameIndexBase) {
@@ -497,8 +511,8 @@ void ARM64FastISel::AddLoadStoreOperands(Address &Addr,
   }
 }
 
-bool ARM64FastISel::EmitLoad(MVT VT, unsigned &ResultReg, Address Addr,
-                             bool UseUnscaled) {
+bool AArch64FastISel::EmitLoad(MVT VT, unsigned &ResultReg, Address Addr,
+                               bool UseUnscaled) {
   // Negative offsets require unscaled, 9-bit, signed immediate offsets.
   // Otherwise, we try using scaled, 12-bit, unsigned immediate offsets.
   if (!UseUnscaled && Addr.getOffset() < 0)
@@ -515,32 +529,32 @@ bool ARM64FastISel::EmitLoad(MVT VT, unsigned &ResultReg, Address Addr,
     VTIsi1 = true;
   // Intentional fall-through.
   case MVT::i8:
-    Opc = UseUnscaled ? ARM64::LDURBBi : ARM64::LDRBBui;
-    RC = &ARM64::GPR32RegClass;
+    Opc = UseUnscaled ? AArch64::LDURBBi : AArch64::LDRBBui;
+    RC = &AArch64::GPR32RegClass;
     ScaleFactor = 1;
     break;
   case MVT::i16:
-    Opc = UseUnscaled ? ARM64::LDURHHi : ARM64::LDRHHui;
-    RC = &ARM64::GPR32RegClass;
+    Opc = UseUnscaled ? AArch64::LDURHHi : AArch64::LDRHHui;
+    RC = &AArch64::GPR32RegClass;
     ScaleFactor = 2;
     break;
   case MVT::i32:
-    Opc = UseUnscaled ? ARM64::LDURWi : ARM64::LDRWui;
-    RC = &ARM64::GPR32RegClass;
+    Opc = UseUnscaled ? AArch64::LDURWi : AArch64::LDRWui;
+    RC = &AArch64::GPR32RegClass;
     ScaleFactor = 4;
     break;
   case MVT::i64:
-    Opc = UseUnscaled ? ARM64::LDURXi : ARM64::LDRXui;
-    RC = &ARM64::GPR64RegClass;
+    Opc = UseUnscaled ? AArch64::LDURXi : AArch64::LDRXui;
+    RC = &AArch64::GPR64RegClass;
     ScaleFactor = 8;
     break;
   case MVT::f32:
-    Opc = UseUnscaled ? ARM64::LDURSi : ARM64::LDRSui;
+    Opc = UseUnscaled ? AArch64::LDURSi : AArch64::LDRSui;
     RC = TLI.getRegClassFor(VT);
     ScaleFactor = 4;
     break;
   case MVT::f64:
-    Opc = UseUnscaled ? ARM64::LDURDi : ARM64::LDRDui;
+    Opc = UseUnscaled ? AArch64::LDURDi : AArch64::LDRDui;
     RC = TLI.getRegClassFor(VT);
     ScaleFactor = 8;
     break;
@@ -567,17 +581,18 @@ bool ARM64FastISel::EmitLoad(MVT VT, unsigned &ResultReg, Address Addr,
 
   // Loading an i1 requires special handling.
   if (VTIsi1) {
-    unsigned ANDReg = createResultReg(&ARM64::GPR32RegClass);
-    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::ANDWri),
+    MRI.constrainRegClass(ResultReg, &AArch64::GPR32RegClass);
+    unsigned ANDReg = createResultReg(&AArch64::GPR32spRegClass);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ANDWri),
             ANDReg)
         .addReg(ResultReg)
-        .addImm(ARM64_AM::encodeLogicalImmediate(1, 32));
+        .addImm(AArch64_AM::encodeLogicalImmediate(1, 32));
     ResultReg = ANDReg;
   }
   return true;
 }
 
-bool ARM64FastISel::SelectLoad(const Instruction *I) {
+bool AArch64FastISel::SelectLoad(const Instruction *I) {
   MVT VT;
   // Verify we have a legal type before going any further.  Currently, we handle
   // simple types that will directly fit in a register (i32/f32/i64/f64) or
@@ -598,8 +613,8 @@ bool ARM64FastISel::SelectLoad(const Instruction *I) {
   return true;
 }
 
-bool ARM64FastISel::EmitStore(MVT VT, unsigned SrcReg, Address Addr,
-                              bool UseUnscaled) {
+bool AArch64FastISel::EmitStore(MVT VT, unsigned SrcReg, Address Addr,
+                                bool UseUnscaled) {
   // Negative offsets require unscaled, 9-bit, signed immediate offsets.
   // Otherwise, we try using scaled, 12-bit, unsigned immediate offsets.
   if (!UseUnscaled && Addr.getOffset() < 0)
@@ -615,27 +630,27 @@ bool ARM64FastISel::EmitStore(MVT VT, unsigned SrcReg, Address Addr,
   case MVT::i1:
     VTIsi1 = true;
   case MVT::i8:
-    StrOpc = UseUnscaled ? ARM64::STURBBi : ARM64::STRBBui;
+    StrOpc = UseUnscaled ? AArch64::STURBBi : AArch64::STRBBui;
     ScaleFactor = 1;
     break;
   case MVT::i16:
-    StrOpc = UseUnscaled ? ARM64::STURHHi : ARM64::STRHHui;
+    StrOpc = UseUnscaled ? AArch64::STURHHi : AArch64::STRHHui;
     ScaleFactor = 2;
     break;
   case MVT::i32:
-    StrOpc = UseUnscaled ? ARM64::STURWi : ARM64::STRWui;
+    StrOpc = UseUnscaled ? AArch64::STURWi : AArch64::STRWui;
     ScaleFactor = 4;
     break;
   case MVT::i64:
-    StrOpc = UseUnscaled ? ARM64::STURXi : ARM64::STRXui;
+    StrOpc = UseUnscaled ? AArch64::STURXi : AArch64::STRXui;
     ScaleFactor = 8;
     break;
   case MVT::f32:
-    StrOpc = UseUnscaled ? ARM64::STURSi : ARM64::STRSui;
+    StrOpc = UseUnscaled ? AArch64::STURSi : AArch64::STRSui;
     ScaleFactor = 4;
     break;
   case MVT::f64:
-    StrOpc = UseUnscaled ? ARM64::STURDi : ARM64::STRDui;
+    StrOpc = UseUnscaled ? AArch64::STURDi : AArch64::STRDui;
     ScaleFactor = 8;
     break;
   }
@@ -655,11 +670,12 @@ bool ARM64FastISel::EmitStore(MVT VT, unsigned SrcReg, Address Addr,
 
   // Storing an i1 requires special handling.
   if (VTIsi1) {
-    unsigned ANDReg = createResultReg(&ARM64::GPR32RegClass);
-    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::ANDWri),
+    MRI.constrainRegClass(SrcReg, &AArch64::GPR32RegClass);
+    unsigned ANDReg = createResultReg(&AArch64::GPR32spRegClass);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ANDWri),
             ANDReg)
         .addReg(SrcReg)
-        .addImm(ARM64_AM::encodeLogicalImmediate(1, 32));
+        .addImm(AArch64_AM::encodeLogicalImmediate(1, 32));
     SrcReg = ANDReg;
   }
   // Create the base instruction, then add the operands.
@@ -669,7 +685,7 @@ bool ARM64FastISel::EmitStore(MVT VT, unsigned SrcReg, Address Addr,
   return true;
 }
 
-bool ARM64FastISel::SelectStore(const Instruction *I) {
+bool AArch64FastISel::SelectStore(const Instruction *I) {
   MVT VT;
   Value *Op0 = I->getOperand(0);
   // Verify we have a legal type before going any further.  Currently, we handle
@@ -694,53 +710,53 @@ bool ARM64FastISel::SelectStore(const Instruction *I) {
   return true;
 }
 
-static ARM64CC::CondCode getCompareCC(CmpInst::Predicate Pred) {
+static AArch64CC::CondCode getCompareCC(CmpInst::Predicate Pred) {
   switch (Pred) {
   case CmpInst::FCMP_ONE:
   case CmpInst::FCMP_UEQ:
   default:
     // AL is our "false" for now. The other two need more compares.
-    return ARM64CC::AL;
+    return AArch64CC::AL;
   case CmpInst::ICMP_EQ:
   case CmpInst::FCMP_OEQ:
-    return ARM64CC::EQ;
+    return AArch64CC::EQ;
   case CmpInst::ICMP_SGT:
   case CmpInst::FCMP_OGT:
-    return ARM64CC::GT;
+    return AArch64CC::GT;
   case CmpInst::ICMP_SGE:
   case CmpInst::FCMP_OGE:
-    return ARM64CC::GE;
+    return AArch64CC::GE;
   case CmpInst::ICMP_UGT:
   case CmpInst::FCMP_UGT:
-    return ARM64CC::HI;
+    return AArch64CC::HI;
   case CmpInst::FCMP_OLT:
-    return ARM64CC::MI;
+    return AArch64CC::MI;
   case CmpInst::ICMP_ULE:
   case CmpInst::FCMP_OLE:
-    return ARM64CC::LS;
+    return AArch64CC::LS;
   case CmpInst::FCMP_ORD:
-    return ARM64CC::VC;
+    return AArch64CC::VC;
   case CmpInst::FCMP_UNO:
-    return ARM64CC::VS;
+    return AArch64CC::VS;
   case CmpInst::FCMP_UGE:
-    return ARM64CC::PL;
+    return AArch64CC::PL;
   case CmpInst::ICMP_SLT:
   case CmpInst::FCMP_ULT:
-    return ARM64CC::LT;
+    return AArch64CC::LT;
   case CmpInst::ICMP_SLE:
   case CmpInst::FCMP_ULE:
-    return ARM64CC::LE;
+    return AArch64CC::LE;
   case CmpInst::FCMP_UNE:
   case CmpInst::ICMP_NE:
-    return ARM64CC::NE;
+    return AArch64CC::NE;
   case CmpInst::ICMP_UGE:
-    return ARM64CC::CS;
+    return AArch64CC::HS;
   case CmpInst::ICMP_ULT:
-    return ARM64CC::CC;
+    return AArch64CC::LO;
   }
 }
 
-bool ARM64FastISel::SelectBranch(const Instruction *I) {
+bool AArch64FastISel::SelectBranch(const Instruction *I) {
   const BranchInst *BI = cast<BranchInst>(I);
   MachineBasicBlock *TBB = FuncInfo.MBBMap[BI->getSuccessor(0)];
   MachineBasicBlock *FBB = FuncInfo.MBBMap[BI->getSuccessor(1)];
@@ -748,8 +764,8 @@ bool ARM64FastISel::SelectBranch(const Instruction *I) {
   if (const CmpInst *CI = dyn_cast<CmpInst>(BI->getCondition())) {
     if (CI->hasOneUse() && (CI->getParent() == I->getParent())) {
       // We may not handle every CC for now.
-      ARM64CC::CondCode CC = getCompareCC(CI->getPredicate());
-      if (CC == ARM64CC::AL)
+      AArch64CC::CondCode CC = getCompareCC(CI->getPredicate());
+      if (CC == AArch64CC::AL)
         return false;
 
       // Emit the cmp.
@@ -757,7 +773,7 @@ bool ARM64FastISel::SelectBranch(const Instruction *I) {
         return false;
 
       // Emit the branch.
-      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::Bcc))
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::Bcc))
           .addImm(CC)
           .addMBB(TBB);
       FuncInfo.MBB->addSuccessor(TBB);
@@ -776,25 +792,27 @@ bool ARM64FastISel::SelectBranch(const Instruction *I) {
       // Issue an extract_subreg to get the lower 32-bits.
       if (SrcVT == MVT::i64)
         CondReg = FastEmitInst_extractsubreg(MVT::i32, CondReg, /*Kill=*/true,
-                                             ARM64::sub_32);
+                                             AArch64::sub_32);
 
-      unsigned ANDReg = createResultReg(&ARM64::GPR32RegClass);
-      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::ANDWri),
-              ANDReg)
+      MRI.constrainRegClass(CondReg, &AArch64::GPR32RegClass);
+      unsigned ANDReg = createResultReg(&AArch64::GPR32spRegClass);
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+              TII.get(AArch64::ANDWri), ANDReg)
           .addReg(CondReg)
-          .addImm(ARM64_AM::encodeLogicalImmediate(1, 32));
-      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::SUBSWri))
+          .addImm(AArch64_AM::encodeLogicalImmediate(1, 32));
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+              TII.get(AArch64::SUBSWri))
           .addReg(ANDReg)
           .addReg(ANDReg)
           .addImm(0)
           .addImm(0);
 
-      unsigned CC = ARM64CC::NE;
+      unsigned CC = AArch64CC::NE;
       if (FuncInfo.MBB->isLayoutSuccessor(TBB)) {
         std::swap(TBB, FBB);
-        CC = ARM64CC::EQ;
+        CC = AArch64CC::EQ;
       }
-      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::Bcc))
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::Bcc))
           .addImm(CC)
           .addMBB(TBB);
       FuncInfo.MBB->addSuccessor(TBB);
@@ -805,7 +823,7 @@ bool ARM64FastISel::SelectBranch(const Instruction *I) {
                  dyn_cast<ConstantInt>(BI->getCondition())) {
     uint64_t Imm = CI->getZExtValue();
     MachineBasicBlock *Target = (Imm == 0) ? FBB : TBB;
-    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::B))
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::B))
         .addMBB(Target);
     FuncInfo.MBB->addSuccessor(Target);
     return true;
@@ -822,19 +840,19 @@ bool ARM64FastISel::SelectBranch(const Instruction *I) {
   // Regardless, the compare has been done in the predecessor block,
   // and it left a value for us in a virtual register.  Ergo, we test
   // the one-bit value left in the virtual register.
-  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::SUBSWri),
-          ARM64::WZR)
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::SUBSWri),
+          AArch64::WZR)
       .addReg(CondReg)
       .addImm(0)
       .addImm(0);
 
-  unsigned CC = ARM64CC::NE;
+  unsigned CC = AArch64CC::NE;
   if (FuncInfo.MBB->isLayoutSuccessor(TBB)) {
     std::swap(TBB, FBB);
-    CC = ARM64CC::EQ;
+    CC = AArch64CC::EQ;
   }
 
-  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::Bcc))
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::Bcc))
       .addImm(CC)
       .addMBB(TBB);
   FuncInfo.MBB->addSuccessor(TBB);
@@ -842,14 +860,14 @@ bool ARM64FastISel::SelectBranch(const Instruction *I) {
   return true;
 }
 
-bool ARM64FastISel::SelectIndirectBr(const Instruction *I) {
+bool AArch64FastISel::SelectIndirectBr(const Instruction *I) {
   const IndirectBrInst *BI = cast<IndirectBrInst>(I);
   unsigned AddrReg = getRegForValue(BI->getOperand(0));
   if (AddrReg == 0)
     return false;
 
   // Emit the indirect branch.
-  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::BR))
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::BR))
       .addReg(AddrReg);
 
   // Make sure the CFG is up-to-date.
@@ -859,7 +877,7 @@ bool ARM64FastISel::SelectIndirectBr(const Instruction *I) {
   return true;
 }
 
-bool ARM64FastISel::EmitCmp(Value *Src1Value, Value *Src2Value, bool isZExt) {
+bool AArch64FastISel::EmitCmp(Value *Src1Value, Value *Src2Value, bool isZExt) {
   Type *Ty = Src1Value->getType();
   EVT SrcEVT = TLI.getValueType(Ty, true);
   if (!SrcEVT.isSimple())
@@ -903,26 +921,26 @@ bool ARM64FastISel::EmitCmp(Value *Src1Value, Value *Src2Value, bool isZExt) {
     needsExt = true;
   // Intentional fall-through.
   case MVT::i32:
-    ZReg = ARM64::WZR;
+    ZReg = AArch64::WZR;
     if (UseImm)
-      CmpOpc = isNegativeImm ? ARM64::ADDSWri : ARM64::SUBSWri;
+      CmpOpc = isNegativeImm ? AArch64::ADDSWri : AArch64::SUBSWri;
     else
-      CmpOpc = ARM64::SUBSWrr;
+      CmpOpc = AArch64::SUBSWrr;
     break;
   case MVT::i64:
-    ZReg = ARM64::XZR;
+    ZReg = AArch64::XZR;
     if (UseImm)
-      CmpOpc = isNegativeImm ? ARM64::ADDSXri : ARM64::SUBSXri;
+      CmpOpc = isNegativeImm ? AArch64::ADDSXri : AArch64::SUBSXri;
     else
-      CmpOpc = ARM64::SUBSXrr;
+      CmpOpc = AArch64::SUBSXrr;
     break;
   case MVT::f32:
     isICmp = false;
-    CmpOpc = UseImm ? ARM64::FCMPSri : ARM64::FCMPSrr;
+    CmpOpc = UseImm ? AArch64::FCMPSri : AArch64::FCMPSrr;
     break;
   case MVT::f64:
     isICmp = false;
-    CmpOpc = UseImm ? ARM64::FCMPDri : ARM64::FCMPDrr;
+    CmpOpc = UseImm ? AArch64::FCMPDri : AArch64::FCMPDrr;
     break;
   }
 
@@ -973,12 +991,12 @@ bool ARM64FastISel::EmitCmp(Value *Src1Value, Value *Src2Value, bool isZExt) {
   return true;
 }
 
-bool ARM64FastISel::SelectCmp(const Instruction *I) {
+bool AArch64FastISel::SelectCmp(const Instruction *I) {
   const CmpInst *CI = cast<CmpInst>(I);
 
   // We may not handle every CC for now.
-  ARM64CC::CondCode CC = getCompareCC(CI->getPredicate());
-  if (CC == ARM64CC::AL)
+  AArch64CC::CondCode CC = getCompareCC(CI->getPredicate());
+  if (CC == AArch64CC::AL)
     return false;
 
   // Emit the cmp.
@@ -986,19 +1004,19 @@ bool ARM64FastISel::SelectCmp(const Instruction *I) {
     return false;
 
   // Now set a register based on the comparison.
-  ARM64CC::CondCode invertedCC = getInvertedCondCode(CC);
-  unsigned ResultReg = createResultReg(&ARM64::GPR32RegClass);
-  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::CSINCWr),
+  AArch64CC::CondCode invertedCC = getInvertedCondCode(CC);
+  unsigned ResultReg = createResultReg(&AArch64::GPR32RegClass);
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::CSINCWr),
           ResultReg)
-      .addReg(ARM64::WZR)
-      .addReg(ARM64::WZR)
+      .addReg(AArch64::WZR)
+      .addReg(AArch64::WZR)
       .addImm(invertedCC);
 
   UpdateValueMap(I, ResultReg);
   return true;
 }
 
-bool ARM64FastISel::SelectSelect(const Instruction *I) {
+bool AArch64FastISel::SelectSelect(const Instruction *I) {
   const SelectInst *SI = cast<SelectInst>(I);
 
   EVT DestEVT = TLI.getValueType(SI->getType(), true);
@@ -1020,13 +1038,15 @@ bool ARM64FastISel::SelectSelect(const Instruction *I) {
   if (FalseReg == 0)
     return false;
 
-  unsigned ANDReg = createResultReg(&ARM64::GPR32RegClass);
-  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::ANDWri),
+
+  MRI.constrainRegClass(CondReg, &AArch64::GPR32RegClass);
+  unsigned ANDReg = createResultReg(&AArch64::GPR32spRegClass);
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ANDWri),
           ANDReg)
       .addReg(CondReg)
-      .addImm(ARM64_AM::encodeLogicalImmediate(1, 32));
+      .addImm(AArch64_AM::encodeLogicalImmediate(1, 32));
 
-  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::SUBSWri))
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::SUBSWri))
       .addReg(ANDReg)
       .addReg(ANDReg)
       .addImm(0)
@@ -1037,16 +1057,16 @@ bool ARM64FastISel::SelectSelect(const Instruction *I) {
   default:
     return false;
   case MVT::i32:
-    SelectOpc = ARM64::CSELWr;
+    SelectOpc = AArch64::CSELWr;
     break;
   case MVT::i64:
-    SelectOpc = ARM64::CSELXr;
+    SelectOpc = AArch64::CSELXr;
     break;
   case MVT::f32:
-    SelectOpc = ARM64::FCSELSrrr;
+    SelectOpc = AArch64::FCSELSrrr;
     break;
   case MVT::f64:
-    SelectOpc = ARM64::FCSELDrrr;
+    SelectOpc = AArch64::FCSELDrrr;
     break;
   }
 
@@ -1055,13 +1075,13 @@ bool ARM64FastISel::SelectSelect(const Instruction *I) {
           ResultReg)
       .addReg(TrueReg)
       .addReg(FalseReg)
-      .addImm(ARM64CC::NE);
+      .addImm(AArch64CC::NE);
 
   UpdateValueMap(I, ResultReg);
   return true;
 }
 
-bool ARM64FastISel::SelectFPExt(const Instruction *I) {
+bool AArch64FastISel::SelectFPExt(const Instruction *I) {
   Value *V = I->getOperand(0);
   if (!I->getType()->isDoubleTy() || !V->getType()->isFloatTy())
     return false;
@@ -1070,14 +1090,14 @@ bool ARM64FastISel::SelectFPExt(const Instruction *I) {
   if (Op == 0)
     return false;
 
-  unsigned ResultReg = createResultReg(&ARM64::FPR64RegClass);
-  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::FCVTDSr),
+  unsigned ResultReg = createResultReg(&AArch64::FPR64RegClass);
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::FCVTDSr),
           ResultReg).addReg(Op);
   UpdateValueMap(I, ResultReg);
   return true;
 }
 
-bool ARM64FastISel::SelectFPTrunc(const Instruction *I) {
+bool AArch64FastISel::SelectFPTrunc(const Instruction *I) {
   Value *V = I->getOperand(0);
   if (!I->getType()->isFloatTy() || !V->getType()->isDoubleTy())
     return false;
@@ -1086,15 +1106,15 @@ bool ARM64FastISel::SelectFPTrunc(const Instruction *I) {
   if (Op == 0)
     return false;
 
-  unsigned ResultReg = createResultReg(&ARM64::FPR32RegClass);
-  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::FCVTSDr),
+  unsigned ResultReg = createResultReg(&AArch64::FPR32RegClass);
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::FCVTSDr),
           ResultReg).addReg(Op);
   UpdateValueMap(I, ResultReg);
   return true;
 }
 
 // FPToUI and FPToSI
-bool ARM64FastISel::SelectFPToInt(const Instruction *I, bool Signed) {
+bool AArch64FastISel::SelectFPToInt(const Instruction *I, bool Signed) {
   MVT DestVT;
   if (!isTypeLegal(I->getType(), DestVT) || DestVT.isVector())
     return false;
@@ -1104,30 +1124,35 @@ bool ARM64FastISel::SelectFPToInt(const Instruction *I, bool Signed) {
     return false;
 
   EVT SrcVT = TLI.getValueType(I->getOperand(0)->getType(), true);
+  if (SrcVT == MVT::f128)
+    return false;
 
   unsigned Opc;
   if (SrcVT == MVT::f64) {
     if (Signed)
-      Opc = (DestVT == MVT::i32) ? ARM64::FCVTZSUWDr : ARM64::FCVTZSUXDr;
+      Opc = (DestVT == MVT::i32) ? AArch64::FCVTZSUWDr : AArch64::FCVTZSUXDr;
     else
-      Opc = (DestVT == MVT::i32) ? ARM64::FCVTZUUWDr : ARM64::FCVTZUUXDr;
+      Opc = (DestVT == MVT::i32) ? AArch64::FCVTZUUWDr : AArch64::FCVTZUUXDr;
   } else {
     if (Signed)
-      Opc = (DestVT == MVT::i32) ? ARM64::FCVTZSUWSr : ARM64::FCVTZSUXSr;
+      Opc = (DestVT == MVT::i32) ? AArch64::FCVTZSUWSr : AArch64::FCVTZSUXSr;
     else
-      Opc = (DestVT == MVT::i32) ? ARM64::FCVTZUUWSr : ARM64::FCVTZUUXSr;
+      Opc = (DestVT == MVT::i32) ? AArch64::FCVTZUUWSr : AArch64::FCVTZUUXSr;
   }
-  unsigned ResultReg = createResultReg(TLI.getRegClassFor(DestVT));
+  unsigned ResultReg = createResultReg(
+      DestVT == MVT::i32 ? &AArch64::GPR32RegClass : &AArch64::GPR64RegClass);
   BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
       .addReg(SrcReg);
   UpdateValueMap(I, ResultReg);
   return true;
 }
 
-bool ARM64FastISel::SelectIntToFP(const Instruction *I, bool Signed) {
+bool AArch64FastISel::SelectIntToFP(const Instruction *I, bool Signed) {
   MVT DestVT;
   if (!isTypeLegal(I->getType(), DestVT) || DestVT.isVector())
     return false;
+  assert ((DestVT == MVT::f32 || DestVT == MVT::f64) &&
+          "Unexpected value type.");
 
   unsigned SrcReg = getRegForValue(I->getOperand(0));
   if (SrcReg == 0)
@@ -1143,17 +1168,20 @@ bool ARM64FastISel::SelectIntToFP(const Instruction *I, bool Signed) {
       return false;
   }
 
+  MRI.constrainRegClass(SrcReg, SrcVT == MVT::i64 ? &AArch64::GPR64RegClass
+                                                  : &AArch64::GPR32RegClass);
+
   unsigned Opc;
   if (SrcVT == MVT::i64) {
     if (Signed)
-      Opc = (DestVT == MVT::f32) ? ARM64::SCVTFUXSri : ARM64::SCVTFUXDri;
+      Opc = (DestVT == MVT::f32) ? AArch64::SCVTFUXSri : AArch64::SCVTFUXDri;
     else
-      Opc = (DestVT == MVT::f32) ? ARM64::UCVTFUXSri : ARM64::UCVTFUXDri;
+      Opc = (DestVT == MVT::f32) ? AArch64::UCVTFUXSri : AArch64::UCVTFUXDri;
   } else {
     if (Signed)
-      Opc = (DestVT == MVT::f32) ? ARM64::SCVTFUWSri : ARM64::SCVTFUWDri;
+      Opc = (DestVT == MVT::f32) ? AArch64::SCVTFUWSri : AArch64::SCVTFUWDri;
     else
-      Opc = (DestVT == MVT::f32) ? ARM64::UCVTFUWSri : ARM64::UCVTFUWDri;
+      Opc = (DestVT == MVT::f32) ? AArch64::UCVTFUWSri : AArch64::UCVTFUWDri;
   }
 
   unsigned ResultReg = createResultReg(TLI.getRegClassFor(DestVT));
@@ -1163,12 +1191,11 @@ bool ARM64FastISel::SelectIntToFP(const Instruction *I, bool Signed) {
   return true;
 }
 
-bool ARM64FastISel::ProcessCallArgs(SmallVectorImpl<Value *> &Args,
-                                    SmallVectorImpl<unsigned> &ArgRegs,
-                                    SmallVectorImpl<MVT> &ArgVTs,
-                                    SmallVectorImpl<ISD::ArgFlagsTy> &ArgFlags,
-                                    SmallVectorImpl<unsigned> &RegArgs,
-                                    CallingConv::ID CC, unsigned &NumBytes) {
+bool AArch64FastISel::ProcessCallArgs(
+    SmallVectorImpl<Value *> &Args, SmallVectorImpl<unsigned> &ArgRegs,
+    SmallVectorImpl<MVT> &ArgVTs, SmallVectorImpl<ISD::ArgFlagsTy> &ArgFlags,
+    SmallVectorImpl<unsigned> &RegArgs, CallingConv::ID CC,
+    unsigned &NumBytes) {
   SmallVector<CCValAssign, 16> ArgLocs;
   CCState CCInfo(CC, false, *FuncInfo.MF, TM, ArgLocs, *Context);
   CCInfo.AnalyzeCallOperands(ArgVTs, ArgFlags, CCAssignFnForCall(CC));
@@ -1227,10 +1254,16 @@ bool ARM64FastISel::ProcessCallArgs(SmallVectorImpl<Value *> &Args,
       assert(VA.isMemLoc() && "Assuming store on stack.");
 
       // Need to store on the stack.
+      unsigned ArgSize = VA.getLocVT().getSizeInBits() / 8;
+
+      unsigned BEAlign = 0;
+      if (ArgSize < 8 && !Subtarget->isLittleEndian())
+        BEAlign = 8 - ArgSize;
+
       Address Addr;
       Addr.setKind(Address::RegBase);
-      Addr.setReg(ARM64::SP);
-      Addr.setOffset(VA.getLocMemOffset());
+      Addr.setReg(AArch64::SP);
+      Addr.setOffset(VA.getLocMemOffset() + BEAlign);
 
       if (!EmitStore(ArgVT, Arg, Addr))
         return false;
@@ -1239,9 +1272,9 @@ bool ARM64FastISel::ProcessCallArgs(SmallVectorImpl<Value *> &Args,
   return true;
 }
 
-bool ARM64FastISel::FinishCall(MVT RetVT, SmallVectorImpl<unsigned> &UsedRegs,
-                               const Instruction *I, CallingConv::ID CC,
-                               unsigned &NumBytes) {
+bool AArch64FastISel::FinishCall(MVT RetVT, SmallVectorImpl<unsigned> &UsedRegs,
+                                 const Instruction *I, CallingConv::ID CC,
+                                 unsigned &NumBytes) {
   // Issue CALLSEQ_END
   unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
   BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackUp))
@@ -1273,8 +1306,8 @@ bool ARM64FastISel::FinishCall(MVT RetVT, SmallVectorImpl<unsigned> &UsedRegs,
   return true;
 }
 
-bool ARM64FastISel::SelectCall(const Instruction *I,
-                               const char *IntrMemName = 0) {
+bool AArch64FastISel::SelectCall(const Instruction *I,
+                                 const char *IntrMemName = nullptr) {
   const CallInst *CI = cast<CallInst>(I);
   const Value *Callee = CI->getCalledValue();
 
@@ -1367,7 +1400,7 @@ bool ARM64FastISel::SelectCall(const Instruction *I,
 
   // Issue the call.
   MachineInstrBuilder MIB;
-  MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::BL));
+  MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::BL));
   if (!IntrMemName)
     MIB.addGlobalAddress(GV, 0, 0);
   else
@@ -1392,15 +1425,15 @@ bool ARM64FastISel::SelectCall(const Instruction *I,
   return true;
 }
 
-bool ARM64FastISel::IsMemCpySmall(uint64_t Len, unsigned Alignment) {
+bool AArch64FastISel::IsMemCpySmall(uint64_t Len, unsigned Alignment) {
   if (Alignment)
     return Len / Alignment <= 4;
   else
     return Len < 32;
 }
 
-bool ARM64FastISel::TryEmitSmallMemCpy(Address Dest, Address Src, uint64_t Len,
-                                       unsigned Alignment) {
+bool AArch64FastISel::TryEmitSmallMemCpy(Address Dest, Address Src,
+                                         uint64_t Len, unsigned Alignment) {
   // Make sure we don't bloat code by inlining very large memcpy's.
   if (!IsMemCpySmall(Len, Alignment))
     return false;
@@ -1452,7 +1485,7 @@ bool ARM64FastISel::TryEmitSmallMemCpy(Address Dest, Address Src, uint64_t Len,
   return true;
 }
 
-bool ARM64FastISel::SelectIntrinsicCall(const IntrinsicInst &I) {
+bool AArch64FastISel::SelectIntrinsicCall(const IntrinsicInst &I) {
   // FIXME: Handle more intrinsics.
   switch (I.getIntrinsicID()) {
   default:
@@ -1510,7 +1543,7 @@ bool ARM64FastISel::SelectIntrinsicCall(const IntrinsicInst &I) {
     return SelectCall(&I, "memset");
   }
   case Intrinsic::trap: {
-    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::BRK))
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::BRK))
         .addImm(1);
     return true;
   }
@@ -1518,7 +1551,7 @@ bool ARM64FastISel::SelectIntrinsicCall(const IntrinsicInst &I) {
   return false;
 }
 
-bool ARM64FastISel::SelectRet(const Instruction *I) {
+bool AArch64FastISel::SelectRet(const Instruction *I) {
   const ReturnInst *Ret = cast<ReturnInst>(I);
   const Function &F = *I->getParent()->getParent();
 
@@ -1540,8 +1573,8 @@ bool ARM64FastISel::SelectRet(const Instruction *I) {
     SmallVector<CCValAssign, 16> ValLocs;
     CCState CCInfo(CC, F.isVarArg(), *FuncInfo.MF, TM, ValLocs,
                    I->getContext());
-    CCAssignFn *RetCC = CC == CallingConv::WebKit_JS ? RetCC_ARM64_WebKit_JS
-                                                     : RetCC_ARM64_AAPCS;
+    CCAssignFn *RetCC = CC == CallingConv::WebKit_JS ? RetCC_AArch64_WebKit_JS
+                                                     : RetCC_AArch64_AAPCS;
     CCInfo.AnalyzeReturn(Outs, RetCC);
 
     // Only handle a single return value for now.
@@ -1570,7 +1603,14 @@ bool ARM64FastISel::SelectRet(const Instruction *I) {
     EVT RVEVT = TLI.getValueType(RV->getType());
     if (!RVEVT.isSimple())
       return false;
+
+    // Vectors (of > 1 lane) in big endian need tricky handling.
+    if (RVEVT.isVector() && RVEVT.getVectorNumElements() > 1)
+      return false;
+
     MVT RVVT = RVEVT.getSimpleVT();
+    if (RVVT == MVT::f128)
+      return false;
     MVT DestVT = VA.getValVT();
     // Special handling for extended integers.
     if (RVVT != DestVT) {
@@ -1595,13 +1635,13 @@ bool ARM64FastISel::SelectRet(const Instruction *I) {
   }
 
   MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
-                                    TII.get(ARM64::RET_ReallyLR));
+                                    TII.get(AArch64::RET_ReallyLR));
   for (unsigned i = 0, e = RetRegs.size(); i != e; ++i)
     MIB.addReg(RetRegs[i], RegState::Implicit);
   return true;
 }
 
-bool ARM64FastISel::SelectTrunc(const Instruction *I) {
+bool AArch64FastISel::SelectTrunc(const Instruction *I) {
   Type *DestTy = I->getType();
   Value *Op = I->getOperand(0);
   Type *SrcTy = Op->getType();
@@ -1648,13 +1688,14 @@ bool ARM64FastISel::SelectTrunc(const Instruction *I) {
     }
     // Issue an extract_subreg to get the lower 32-bits.
     unsigned Reg32 = FastEmitInst_extractsubreg(MVT::i32, SrcReg, /*Kill=*/true,
-                                                ARM64::sub_32);
+                                                AArch64::sub_32);
+    MRI.constrainRegClass(Reg32, &AArch64::GPR32RegClass);
     // Create the AND instruction which performs the actual truncation.
-    unsigned ANDReg = createResultReg(&ARM64::GPR32RegClass);
-    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::ANDWri),
+    unsigned ANDReg = createResultReg(&AArch64::GPR32spRegClass);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ANDWri),
             ANDReg)
         .addReg(Reg32)
-        .addImm(ARM64_AM::encodeLogicalImmediate(Mask, 32));
+        .addImm(AArch64_AM::encodeLogicalImmediate(Mask, 32));
     SrcReg = ANDReg;
   }
 
@@ -1662,7 +1703,7 @@ bool ARM64FastISel::SelectTrunc(const Instruction *I) {
   return true;
 }
 
-unsigned ARM64FastISel::Emiti1Ext(unsigned SrcReg, MVT DestVT, bool isZExt) {
+unsigned AArch64FastISel::Emiti1Ext(unsigned SrcReg, MVT DestVT, bool isZExt) {
   assert((DestVT == MVT::i8 || DestVT == MVT::i16 || DestVT == MVT::i32 ||
           DestVT == MVT::i64) &&
          "Unexpected value type.");
@@ -1671,21 +1712,22 @@ unsigned ARM64FastISel::Emiti1Ext(unsigned SrcReg, MVT DestVT, bool isZExt) {
     DestVT = MVT::i32;
 
   if (isZExt) {
-    unsigned ResultReg = createResultReg(&ARM64::GPR32RegClass);
-    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::ANDWri),
+    MRI.constrainRegClass(SrcReg, &AArch64::GPR32RegClass);
+    unsigned ResultReg = createResultReg(&AArch64::GPR32spRegClass);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ANDWri),
             ResultReg)
         .addReg(SrcReg)
-        .addImm(ARM64_AM::encodeLogicalImmediate(1, 32));
+        .addImm(AArch64_AM::encodeLogicalImmediate(1, 32));
 
     if (DestVT == MVT::i64) {
       // We're ZExt i1 to i64.  The ANDWri Wd, Ws, #1 implicitly clears the
       // upper 32 bits.  Emit a SUBREG_TO_REG to extend from Wd to Xd.
-      unsigned Reg64 = MRI.createVirtualRegister(&ARM64::GPR64RegClass);
+      unsigned Reg64 = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
-              TII.get(ARM64::SUBREG_TO_REG), Reg64)
+              TII.get(AArch64::SUBREG_TO_REG), Reg64)
           .addImm(0)
           .addReg(ResultReg)
-          .addImm(ARM64::sub_32);
+          .addImm(AArch64::sub_32);
       ResultReg = Reg64;
     }
     return ResultReg;
@@ -1694,8 +1736,8 @@ unsigned ARM64FastISel::Emiti1Ext(unsigned SrcReg, MVT DestVT, bool isZExt) {
       // FIXME: We're SExt i1 to i64.
       return 0;
     }
-    unsigned ResultReg = createResultReg(&ARM64::GPR32RegClass);
-    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(ARM64::SBFMWri),
+    unsigned ResultReg = createResultReg(&AArch64::GPR32RegClass);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::SBFMWri),
             ResultReg)
         .addReg(SrcReg)
         .addImm(0)
@@ -1704,8 +1746,8 @@ unsigned ARM64FastISel::Emiti1Ext(unsigned SrcReg, MVT DestVT, bool isZExt) {
   }
 }
 
-unsigned ARM64FastISel::EmitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT,
-                                   bool isZExt) {
+unsigned AArch64FastISel::EmitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT,
+                                     bool isZExt) {
   assert(DestVT != MVT::i1 && "ZeroExt/SignExt an i1?");
   unsigned Opc;
   unsigned Imm = 0;
@@ -1717,21 +1759,21 @@ unsigned ARM64FastISel::EmitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT,
     return Emiti1Ext(SrcReg, DestVT, isZExt);
   case MVT::i8:
     if (DestVT == MVT::i64)
-      Opc = isZExt ? ARM64::UBFMXri : ARM64::SBFMXri;
+      Opc = isZExt ? AArch64::UBFMXri : AArch64::SBFMXri;
     else
-      Opc = isZExt ? ARM64::UBFMWri : ARM64::SBFMWri;
+      Opc = isZExt ? AArch64::UBFMWri : AArch64::SBFMWri;
     Imm = 7;
     break;
   case MVT::i16:
     if (DestVT == MVT::i64)
-      Opc = isZExt ? ARM64::UBFMXri : ARM64::SBFMXri;
+      Opc = isZExt ? AArch64::UBFMXri : AArch64::SBFMXri;
     else
-      Opc = isZExt ? ARM64::UBFMWri : ARM64::SBFMWri;
+      Opc = isZExt ? AArch64::UBFMWri : AArch64::SBFMWri;
     Imm = 15;
     break;
   case MVT::i32:
     assert(DestVT == MVT::i64 && "IntExt i32 to i32?!?");
-    Opc = isZExt ? ARM64::UBFMXri : ARM64::SBFMXri;
+    Opc = isZExt ? AArch64::UBFMXri : AArch64::SBFMXri;
     Imm = 31;
     break;
   }
@@ -1739,6 +1781,15 @@ unsigned ARM64FastISel::EmitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT,
   // Handle i8 and i16 as i32.
   if (DestVT == MVT::i8 || DestVT == MVT::i16)
     DestVT = MVT::i32;
+  else if (DestVT == MVT::i64) {
+    unsigned Src64 = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(AArch64::SUBREG_TO_REG), Src64)
+        .addImm(0)
+        .addReg(SrcReg)
+        .addImm(AArch64::sub_32);
+    SrcReg = Src64;
+  }
 
   unsigned ResultReg = createResultReg(TLI.getRegClassFor(DestVT));
   BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
@@ -1749,7 +1800,7 @@ unsigned ARM64FastISel::EmitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT,
   return ResultReg;
 }
 
-bool ARM64FastISel::SelectIntExt(const Instruction *I) {
+bool AArch64FastISel::SelectIntExt(const Instruction *I) {
   // On ARM, in general, integer casts don't involve legal types; this code
   // handles promotable integers.  The high bits for a type smaller than
   // the register size are assumed to be undefined.
@@ -1778,7 +1829,7 @@ bool ARM64FastISel::SelectIntExt(const Instruction *I) {
   return true;
 }
 
-bool ARM64FastISel::SelectRem(const Instruction *I, unsigned ISDOpcode) {
+bool AArch64FastISel::SelectRem(const Instruction *I, unsigned ISDOpcode) {
   EVT DestEVT = TLI.getValueType(I->getType(), true);
   if (!DestEVT.isSimple())
     return false;
@@ -1793,13 +1844,13 @@ bool ARM64FastISel::SelectRem(const Instruction *I, unsigned ISDOpcode) {
   default:
     return false;
   case ISD::SREM:
-    DivOpc = is64bit ? ARM64::SDIVXr : ARM64::SDIVWr;
+    DivOpc = is64bit ? AArch64::SDIVXr : AArch64::SDIVWr;
     break;
   case ISD::UREM:
-    DivOpc = is64bit ? ARM64::UDIVXr : ARM64::UDIVWr;
+    DivOpc = is64bit ? AArch64::UDIVXr : AArch64::UDIVWr;
     break;
   }
-  unsigned MSubOpc = is64bit ? ARM64::MSUBXrrr : ARM64::MSUBWrrr;
+  unsigned MSubOpc = is64bit ? AArch64::MSUBXrrr : AArch64::MSUBWrrr;
   unsigned Src0Reg = getRegForValue(I->getOperand(0));
   if (!Src0Reg)
     return false;
@@ -1808,21 +1859,22 @@ bool ARM64FastISel::SelectRem(const Instruction *I, unsigned ISDOpcode) {
   if (!Src1Reg)
     return false;
 
-  unsigned ResultReg = createResultReg(TLI.getRegClassFor(DestVT));
-  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(DivOpc), ResultReg)
+  unsigned QuotReg = createResultReg(TLI.getRegClassFor(DestVT));
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(DivOpc), QuotReg)
       .addReg(Src0Reg)
       .addReg(Src1Reg);
-  // The remainder is computed as numerator – (quotient * denominator) using the
+  // The remainder is computed as numerator - (quotient * denominator) using the
   // MSUB instruction.
+  unsigned ResultReg = createResultReg(TLI.getRegClassFor(DestVT));
   BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(MSubOpc), ResultReg)
-      .addReg(ResultReg)
+      .addReg(QuotReg)
       .addReg(Src1Reg)
       .addReg(Src0Reg);
   UpdateValueMap(I, ResultReg);
   return true;
 }
 
-bool ARM64FastISel::SelectMul(const Instruction *I) {
+bool AArch64FastISel::SelectMul(const Instruction *I) {
   EVT SrcEVT = TLI.getValueType(I->getOperand(0)->getType(), true);
   if (!SrcEVT.isSimple())
     return false;
@@ -1841,12 +1893,12 @@ bool ARM64FastISel::SelectMul(const Instruction *I) {
   case MVT::i8:
   case MVT::i16:
   case MVT::i32:
-    ZReg = ARM64::WZR;
-    Opc = ARM64::MADDWrrr;
+    ZReg = AArch64::WZR;
+    Opc = AArch64::MADDWrrr;
     break;
   case MVT::i64:
-    ZReg = ARM64::XZR;
-    Opc = ARM64::MADDXrrr;
+    ZReg = AArch64::XZR;
+    Opc = AArch64::MADDXrrr;
     break;
   }
 
@@ -1868,7 +1920,7 @@ bool ARM64FastISel::SelectMul(const Instruction *I) {
   return true;
 }
 
-bool ARM64FastISel::TargetSelectInstruction(const Instruction *I) {
+bool AArch64FastISel::TargetSelectInstruction(const Instruction *I) {
   switch (I->getOpcode()) {
   default:
     break;
@@ -1918,12 +1970,12 @@ bool ARM64FastISel::TargetSelectInstruction(const Instruction *I) {
   }
   return false;
   // Silence warnings.
-  (void)&CC_ARM64_DarwinPCS_VarArg;
+  (void)&CC_AArch64_DarwinPCS_VarArg;
 }
 
 namespace llvm {
-llvm::FastISel *ARM64::createFastISel(FunctionLoweringInfo &funcInfo,
-                                      const TargetLibraryInfo *libInfo) {
-  return new ARM64FastISel(funcInfo, libInfo);
+llvm::FastISel *AArch64::createFastISel(FunctionLoweringInfo &funcInfo,
+                                        const TargetLibraryInfo *libInfo) {
+  return new AArch64FastISel(funcInfo, libInfo);
 }
 }
diff --git a/lib/Target/AArch64/AArch64FrameLowering.cpp b/lib/Target/AArch64/AArch64FrameLowering.cpp
index b29587a..deb306a 100644
--- a/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -1,4 +1,4 @@
-//===- AArch64FrameLowering.cpp - AArch64 Frame Information ---------------===//
+//===- AArch64FrameLowering.cpp - AArch64 Frame Lowering -------*- C++ -*-====//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -11,227 +11,444 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "AArch64.h"
 #include "AArch64FrameLowering.h"
 #include "AArch64InstrInfo.h"
 #include "AArch64MachineFunctionInfo.h"
+#include "AArch64Subtarget.h"
+#include "AArch64TargetMachine.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineMemOperand.h"
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/RegisterScavenging.h"
-#include "llvm/IR/Function.h"
-#include "llvm/MC/MachineLocation.h"
 #include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/raw_ostream.h"
 
 using namespace llvm;
 
-void AArch64FrameLowering::splitSPAdjustments(uint64_t Total,
-                                              uint64_t &Initial,
-                                              uint64_t &Residual) const {
-  // 0x1f0 here is a pessimistic (i.e. realistic) boundary: x-register LDP
-  // instructions have a 7-bit signed immediate scaled by 8, giving a reach of
-  // 0x1f8, but stack adjustment should always be a multiple of 16.
-  if (Total <= 0x1f0) {
-    Initial = Total;
-    Residual = 0;
-  } else {
-    Initial = 0x1f0;
-    Residual = Total - Initial;
+#define DEBUG_TYPE "frame-info"
+
+static cl::opt<bool> EnableRedZone("aarch64-redzone",
+                                   cl::desc("enable use of redzone on AArch64"),
+                                   cl::init(false), cl::Hidden);
+
+STATISTIC(NumRedZoneFunctions, "Number of functions using red zone");
+
+static unsigned estimateStackSize(MachineFunction &MF) {
+  const MachineFrameInfo *FFI = MF.getFrameInfo();
+  int Offset = 0;
+  for (int i = FFI->getObjectIndexBegin(); i != 0; ++i) {
+    int FixedOff = -FFI->getObjectOffset(i);
+    if (FixedOff > Offset)
+      Offset = FixedOff;
+  }
+  for (unsigned i = 0, e = FFI->getObjectIndexEnd(); i != e; ++i) {
+    if (FFI->isDeadObjectIndex(i))
+      continue;
+    Offset += FFI->getObjectSize(i);
+    unsigned Align = FFI->getObjectAlignment(i);
+    // Adjust to alignment boundary
+    Offset = (Offset + Align - 1) / Align * Align;
   }
+  // This does not include the 16 bytes used for fp and lr.
+  return (unsigned)Offset;
 }
 
-void AArch64FrameLowering::emitPrologue(MachineFunction &MF) const {
-  AArch64MachineFunctionInfo *FuncInfo =
-    MF.getInfo<AArch64MachineFunctionInfo>();
-  MachineBasicBlock &MBB = MF.front();
-  MachineBasicBlock::iterator MBBI = MBB.begin();
-  MachineFrameInfo *MFI = MF.getFrameInfo();
-  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
-  DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
+bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const {
+  if (!EnableRedZone)
+    return false;
+  // Don't use the red zone if the function explicitly asks us not to.
+  // This is typically used for kernel code.
+  if (MF.getFunction()->getAttributes().hasAttribute(
+          AttributeSet::FunctionIndex, Attribute::NoRedZone))
+    return false;
 
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+  unsigned NumBytes = AFI->getLocalStackSize();
+
+  // Note: currently hasFP() is always true for hasCalls(), but that's an
+  // implementation detail of the current code, not a strict requirement,
+  // so stay safe here and check both.
+  if (MFI->hasCalls() || hasFP(MF) || NumBytes > 128)
+    return false;
+  return true;
+}
+
+/// hasFP - Return true if the specified function should have a dedicated frame
+/// pointer register.
+bool AArch64FrameLowering::hasFP(const MachineFunction &MF) const {
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+
+#ifndef NDEBUG
+  const TargetRegisterInfo *RegInfo = MF.getTarget().getRegisterInfo();
+  assert(!RegInfo->needsStackRealignment(MF) &&
+         "No stack realignment on AArch64!");
+#endif
+
+  return (MFI->hasCalls() || MFI->hasVarSizedObjects() ||
+          MFI->isFrameAddressTaken());
+}
+
+/// hasReservedCallFrame - Under normal circumstances, when a frame pointer is
+/// not required, we reserve argument space for call sites in the function
+/// immediately on entry to the current function.  This eliminates the need for
+/// add/sub sp brackets around call sites.  Returns true if the call frame is
+/// included as part of the stack frame.
+bool
+AArch64FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
+  return !MF.getFrameInfo()->hasVarSizedObjects();
+}
+
+void AArch64FrameLowering::eliminateCallFramePseudoInstr(
+    MachineFunction &MF, MachineBasicBlock &MBB,
+    MachineBasicBlock::iterator I) const {
+  const AArch64InstrInfo *TII =
+      static_cast<const AArch64InstrInfo *>(MF.getTarget().getInstrInfo());
+  DebugLoc DL = I->getDebugLoc();
+  int Opc = I->getOpcode();
+  bool IsDestroy = Opc == TII->getCallFrameDestroyOpcode();
+  uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0;
+
+  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+  if (!TFI->hasReservedCallFrame(MF)) {
+    unsigned Align = getStackAlignment();
+
+    int64_t Amount = I->getOperand(0).getImm();
+    Amount = RoundUpToAlignment(Amount, Align);
+    if (!IsDestroy)
+      Amount = -Amount;
+
+    // N.b. if CalleePopAmount is valid but zero (i.e. callee would pop, but it
+    // doesn't have to pop anything), then the first operand will be zero too so
+    // this adjustment is a no-op.
+    if (CalleePopAmount == 0) {
+      // FIXME: in-function stack adjustment for calls is limited to 24-bits
+      // because there's no guaranteed temporary register available.
+      //
+      // ADD/SUB (immediate) has only LSL #0 and LSL #12 avaiable.
+      // 1) For offset <= 12-bit, we use LSL #0
+      // 2) For 12-bit <= offset <= 24-bit, we use two instructions. One uses
+      // LSL #0, and the other uses LSL #12.
+      //
+      // Mostly call frames will be allocated at the start of a function so
+      // this is OK, but it is a limitation that needs dealing with.
+      assert(Amount > -0xffffff && Amount < 0xffffff && "call frame too large");
+      emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP, Amount, TII);
+    }
+  } else if (CalleePopAmount != 0) {
+    // If the calling convention demands that the callee pops arguments from the
+    // stack, we want to add it back if we have a reserved call frame.
+    assert(CalleePopAmount < 0xffffff && "call frame too large");
+    emitFrameOffset(MBB, I, DL, AArch64::SP, AArch64::SP, -CalleePopAmount,
+                    TII);
+  }
+  MBB.erase(I);
+}
+
+void AArch64FrameLowering::emitCalleeSavedFrameMoves(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+    unsigned FramePtr) const {
+  MachineFunction &MF = *MBB.getParent();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
   MachineModuleInfo &MMI = MF.getMMI();
   const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo();
-  bool NeedsFrameMoves = MMI.hasDebugInfo()
-    || MF.getFunction()->needsUnwindTableEntry();
-
-  uint64_t NumInitialBytes, NumResidualBytes;
-
-  // Currently we expect the stack to be laid out by
-  //     sub sp, sp, #initial
-  //     stp x29, x30, [sp, #offset]
-  //     ...
-  //     str xxx, [sp, #offset]
-  //     sub sp, sp, #rest (possibly via extra instructions).
-  if (MFI->getCalleeSavedInfo().size()) {
-    // If there are callee-saved registers, we want to store them efficiently as
-    // a block, and virtual base assignment happens too early to do it for us so
-    // we adjust the stack in two phases: first just for callee-saved fiddling,
-    // then to allocate the rest of the frame.
-    splitSPAdjustments(MFI->getStackSize(), NumInitialBytes, NumResidualBytes);
-  } else {
-    // If there aren't any callee-saved registers, two-phase adjustment is
-    // inefficient. It's more efficient to adjust with NumInitialBytes too
-    // because when we're in a "callee pops argument space" situation, that pop
-    // must be tacked onto Initial for correctness.
-    NumInitialBytes = MFI->getStackSize();
-    NumResidualBytes = 0;
-  }
+  const AArch64InstrInfo *TII = TM.getInstrInfo();
+  DebugLoc DL = MBB.findDebugLoc(MBBI);
+
+  // Add callee saved registers to move list.
+  const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
+  if (CSI.empty())
+    return;
+
+  const DataLayout *TD = MF.getTarget().getDataLayout();
+  bool HasFP = hasFP(MF);
+
+  // Calculate amount of bytes used for return address storing.
+  int stackGrowth = -TD->getPointerSize(0);
+
+  // Calculate offsets.
+  int64_t saveAreaOffset = (HasFP ? 2 : 1) * stackGrowth;
+  unsigned TotalSkipped = 0;
+  for (const auto &Info : CSI) {
+    unsigned Reg = Info.getReg();
+    int64_t Offset = MFI->getObjectOffset(Info.getFrameIdx()) -
+                     getOffsetOfLocalArea() + saveAreaOffset;
+
+    // Don't output a new CFI directive if we're re-saving the frame pointer or
+    // link register. This happens when the PrologEpilogInserter has inserted an
+    // extra "STP" of the frame pointer and link register -- the "emitPrologue"
+    // method automatically generates the directives when frame pointers are
+    // used. If we generate CFI directives for the extra "STP"s, the linker will
+    // lose track of the correct values for the frame pointer and link register.
+    if (HasFP && (FramePtr == Reg || Reg == AArch64::LR)) {
+      TotalSkipped += stackGrowth;
+      continue;
+    }
 
-  // Tell everyone else how much adjustment we're expecting them to use. In
-  // particular if an adjustment is required for a tail call the epilogue could
-  // have a different view of things.
-  FuncInfo->setInitialStackAdjust(NumInitialBytes);
-
-  emitSPUpdate(MBB, MBBI, DL, TII, AArch64::X16, -NumInitialBytes,
-               MachineInstr::FrameSetup);
-
-  if (NeedsFrameMoves && NumInitialBytes) {
-    // We emit this update even if the CFA is set from a frame pointer later so
-    // that the CFA is valid in the interim.
-    MachineLocation Dst(MachineLocation::VirtualFP);
-    unsigned Reg = MRI->getDwarfRegNum(AArch64::XSP, true);
-    unsigned CFIIndex = MMI.addFrameInst(
-        MCCFIInstruction::createDefCfa(nullptr, Reg, -NumInitialBytes));
-    BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
+    unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true);
+    unsigned CFIIndex = MMI.addFrameInst(MCCFIInstruction::createOffset(
+        nullptr, DwarfReg, Offset - TotalSkipped));
+    BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
         .addCFIIndex(CFIIndex);
   }
+}
 
-  // Otherwise we need to set the frame pointer and/or add a second stack
-  // adjustment.
-
-  bool FPNeedsSetting = hasFP(MF);
-  for (; MBBI != MBB.end(); ++MBBI) {
-    // Note that this search makes strong assumptions about the operation used
-    // to store the frame-pointer: it must be "STP x29, x30, ...". This could
-    // change in future, but until then there's no point in implementing
-    // untestable more generic cases.
-    if (FPNeedsSetting && MBBI->getOpcode() == AArch64::LSPair64_STR
-                       && MBBI->getOperand(0).getReg() == AArch64::X29) {
-      int64_t X29FrameIdx = MBBI->getOperand(2).getIndex();
-      FuncInfo->setFramePointerOffset(MFI->getObjectOffset(X29FrameIdx));
-
-      ++MBBI;
-      emitRegUpdate(MBB, MBBI, DL, TII, AArch64::X29, AArch64::XSP,
-                    AArch64::X29,
-                    NumInitialBytes + MFI->getObjectOffset(X29FrameIdx),
-                    MachineInstr::FrameSetup);
+void AArch64FrameLowering::emitPrologue(MachineFunction &MF) const {
+  MachineBasicBlock &MBB = MF.front(); // Prologue goes in entry BB.
+  MachineBasicBlock::iterator MBBI = MBB.begin();
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  const Function *Fn = MF.getFunction();
+  const AArch64RegisterInfo *RegInfo = TM.getRegisterInfo();
+  const AArch64InstrInfo *TII = TM.getInstrInfo();
+  MachineModuleInfo &MMI = MF.getMMI();
+  AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+  bool needsFrameMoves = MMI.hasDebugInfo() || Fn->needsUnwindTableEntry();
+  bool HasFP = hasFP(MF);
+  DebugLoc DL = MBB.findDebugLoc(MBBI);
 
-      // The offset adjustment used when emitting debugging locations relative
-      // to whatever frame base is set. AArch64 uses the default frame base (FP
-      // or SP) and this adjusts the calculations to be correct.
-      MFI->setOffsetAdjustment(- MFI->getObjectOffset(X29FrameIdx)
-                               - MFI->getStackSize());
-
-      if (NeedsFrameMoves) {
-        unsigned Reg = MRI->getDwarfRegNum(AArch64::X29, true);
-        unsigned Offset = MFI->getObjectOffset(X29FrameIdx);
-        unsigned CFIIndex = MMI.addFrameInst(
-            MCCFIInstruction::createDefCfa(nullptr, Reg, Offset));
-        BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
-            .addCFIIndex(CFIIndex);
-      }
+  int NumBytes = (int)MFI->getStackSize();
+  if (!AFI->hasStackFrame()) {
+    assert(!HasFP && "unexpected function without stack frame but with FP");
+
+    // All of the stack allocation is for locals.
+    AFI->setLocalStackSize(NumBytes);
 
-      FPNeedsSetting = false;
+    // Label used to tie together the PROLOG_LABEL and the MachineMoves.
+    MCSymbol *FrameLabel = MMI.getContext().CreateTempSymbol();
+
+    // REDZONE: If the stack size is less than 128 bytes, we don't need
+    // to actually allocate.
+    if (NumBytes && !canUseRedZone(MF)) {
+      emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, -NumBytes, TII,
+                      MachineInstr::FrameSetup);
+
+      // Encode the stack size of the leaf function.
+      unsigned CFIIndex = MMI.addFrameInst(
+          MCCFIInstruction::createDefCfaOffset(FrameLabel, -NumBytes));
+      BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+          .addCFIIndex(CFIIndex);
+    } else if (NumBytes) {
+      ++NumRedZoneFunctions;
     }
 
-    if (!MBBI->getFlag(MachineInstr::FrameSetup))
-      break;
+    return;
   }
 
-  assert(!FPNeedsSetting && "Frame pointer couldn't be set");
+  // Only set up FP if we actually need to.
+  int FPOffset = 0;
+  if (HasFP) {
+    // First instruction must a) allocate the stack  and b) have an immediate
+    // that is a multiple of -2.
+    assert((MBBI->getOpcode() == AArch64::STPXpre ||
+            MBBI->getOpcode() == AArch64::STPDpre) &&
+           MBBI->getOperand(3).getReg() == AArch64::SP &&
+           MBBI->getOperand(4).getImm() < 0 &&
+           (MBBI->getOperand(4).getImm() & 1) == 0);
+
+    // Frame pointer is fp = sp - 16. Since the  STPXpre subtracts the space
+    // required for the callee saved register area we get the frame pointer
+    // by addding that offset - 16 = -getImm()*8 - 2*8 = -(getImm() + 2) * 8.
+    FPOffset = -(MBBI->getOperand(4).getImm() + 2) * 8;
+    assert(FPOffset >= 0 && "Bad Framepointer Offset");
+  }
 
-  emitSPUpdate(MBB, MBBI, DL, TII, AArch64::X16, -NumResidualBytes,
-               MachineInstr::FrameSetup);
+  // Move past the saves of the callee-saved registers.
+  while (MBBI->getOpcode() == AArch64::STPXi ||
+         MBBI->getOpcode() == AArch64::STPDi ||
+         MBBI->getOpcode() == AArch64::STPXpre ||
+         MBBI->getOpcode() == AArch64::STPDpre) {
+    ++MBBI;
+    NumBytes -= 16;
+  }
+  assert(NumBytes >= 0 && "Negative stack allocation size!?");
+  if (HasFP) {
+    // Issue    sub fp, sp, FPOffset or
+    //          mov fp,sp          when FPOffset is zero.
+    // Note: All stores of callee-saved registers are marked as "FrameSetup".
+    // This code marks the instruction(s) that set the FP also.
+    emitFrameOffset(MBB, MBBI, DL, AArch64::FP, AArch64::SP, FPOffset, TII,
+                    MachineInstr::FrameSetup);
+  }
 
-  // Now we emit the rest of the frame setup information, if necessary: we've
-  // already noted the FP and initial SP moves so we're left with the prologue's
-  // final SP update and callee-saved register locations.
-  if (!NeedsFrameMoves)
-    return;
+  // All of the remaining stack allocations are for locals.
+  AFI->setLocalStackSize(NumBytes);
 
-  // The rest of the stack adjustment
-  if (!hasFP(MF) && NumResidualBytes) {
-    MachineLocation Dst(MachineLocation::VirtualFP);
-    unsigned Reg = MRI->getDwarfRegNum(AArch64::XSP, true);
-    unsigned Offset = NumResidualBytes + NumInitialBytes;
-    unsigned CFIIndex =
-        MMI.addFrameInst(MCCFIInstruction::createDefCfa(nullptr, Reg, -Offset));
-    BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
-        .addCFIIndex(CFIIndex);
+  // Allocate space for the rest of the frame.
+  if (NumBytes) {
+    // If we're a leaf function, try using the red zone.
+    if (!canUseRedZone(MF))
+      emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, -NumBytes, TII,
+                      MachineInstr::FrameSetup);
   }
 
-  // And any callee-saved registers (it's fine to leave them to the end here,
-  // because the old values are still valid at this point.
-  const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
-  if (CSI.size()) {
-    for (std::vector<CalleeSavedInfo>::const_iterator I = CSI.begin(),
-           E = CSI.end(); I != E; ++I) {
-      unsigned Offset = MFI->getObjectOffset(I->getFrameIdx());
-      unsigned Reg = MRI->getDwarfRegNum(I->getReg(), true);
+  // If we need a base pointer, set it up here. It's whatever the value of the
+  // stack pointer is at this point. Any variable size objects will be allocated
+  // after this, so we can still use the base pointer to reference locals.
+  //
+  // FIXME: Clarify FrameSetup flags here.
+  // Note: Use emitFrameOffset() like above for FP if the FrameSetup flag is
+  // needed.
+  //
+  if (RegInfo->hasBasePointer(MF))
+    TII->copyPhysReg(MBB, MBBI, DL, AArch64::X19, AArch64::SP, false);
+
+  if (needsFrameMoves) {
+    const DataLayout *TD = MF.getTarget().getDataLayout();
+    const int StackGrowth = -TD->getPointerSize(0);
+    unsigned FramePtr = RegInfo->getFrameRegister(MF);
+
+    // An example of the prologue:
+    //
+    //     .globl __foo
+    //     .align 2
+    //  __foo:
+    // Ltmp0:
+    //     .cfi_startproc
+    //     .cfi_personality 155, ___gxx_personality_v0
+    // Leh_func_begin:
+    //     .cfi_lsda 16, Lexception33
+    //
+    //     stp  xa,bx, [sp, -#offset]!
+    //     ...
+    //     stp  x28, x27, [sp, #offset-32]
+    //     stp  fp, lr, [sp, #offset-16]
+    //     add  fp, sp, #offset - 16
+    //     sub  sp, sp, #1360
+    //
+    // The Stack:
+    //       +-------------------------------------------+
+    // 10000 | ........ | ........ | ........ | ........ |
+    // 10004 | ........ | ........ | ........ | ........ |
+    //       +-------------------------------------------+
+    // 10008 | ........ | ........ | ........ | ........ |
+    // 1000c | ........ | ........ | ........ | ........ |
+    //       +===========================================+
+    // 10010 |                X28 Register               |
+    // 10014 |                X28 Register               |
+    //       +-------------------------------------------+
+    // 10018 |                X27 Register               |
+    // 1001c |                X27 Register               |
+    //       +===========================================+
+    // 10020 |                Frame Pointer              |
+    // 10024 |                Frame Pointer              |
+    //       +-------------------------------------------+
+    // 10028 |                Link Register              |
+    // 1002c |                Link Register              |
+    //       +===========================================+
+    // 10030 | ........ | ........ | ........ | ........ |
+    // 10034 | ........ | ........ | ........ | ........ |
+    //       +-------------------------------------------+
+    // 10038 | ........ | ........ | ........ | ........ |
+    // 1003c | ........ | ........ | ........ | ........ |
+    //       +-------------------------------------------+
+    //
+    //     [sp] = 10030        ::    >>initial value<<
+    //     sp = 10020          ::  stp fp, lr, [sp, #-16]!
+    //     fp = sp == 10020    ::  mov fp, sp
+    //     [sp] == 10020       ::  stp x28, x27, [sp, #-16]!
+    //     sp == 10010         ::    >>final value<<
+    //
+    // The frame pointer (w29) points to address 10020. If we use an offset of
+    // '16' from 'w29', we get the CFI offsets of -8 for w30, -16 for w29, -24
+    // for w27, and -32 for w28:
+    //
+    //  Ltmp1:
+    //     .cfi_def_cfa w29, 16
+    //  Ltmp2:
+    //     .cfi_offset w30, -8
+    //  Ltmp3:
+    //     .cfi_offset w29, -16
+    //  Ltmp4:
+    //     .cfi_offset w27, -24
+    //  Ltmp5:
+    //     .cfi_offset w28, -32
+
+    if (HasFP) {
+      // Define the current CFA rule to use the provided FP.
+      unsigned Reg = RegInfo->getDwarfRegNum(FramePtr, true);
       unsigned CFIIndex = MMI.addFrameInst(
-          MCCFIInstruction::createOffset(nullptr, Reg, Offset));
-      BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
+          MCCFIInstruction::createDefCfa(nullptr, Reg, 2 * StackGrowth));
+      BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+          .addCFIIndex(CFIIndex);
+
+      // Record the location of the stored LR
+      unsigned LR = RegInfo->getDwarfRegNum(AArch64::LR, true);
+      CFIIndex = MMI.addFrameInst(
+          MCCFIInstruction::createOffset(nullptr, LR, StackGrowth));
+      BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+          .addCFIIndex(CFIIndex);
+
+      // Record the location of the stored FP
+      CFIIndex = MMI.addFrameInst(
+          MCCFIInstruction::createOffset(nullptr, Reg, 2 * StackGrowth));
+      BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+          .addCFIIndex(CFIIndex);
+    } else {
+      // Encode the stack size of the leaf function.
+      unsigned CFIIndex = MMI.addFrameInst(
+          MCCFIInstruction::createDefCfaOffset(nullptr, -MFI->getStackSize()));
+      BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
           .addCFIIndex(CFIIndex);
     }
+
+    // Now emit the moves for whatever callee saved regs we have.
+    emitCalleeSavedFrameMoves(MBB, MBBI, FramePtr);
   }
 }
 
-void
-AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
-                                   MachineBasicBlock &MBB) const {
-  AArch64MachineFunctionInfo *FuncInfo =
-    MF.getInfo<AArch64MachineFunctionInfo>();
+static bool isCalleeSavedRegister(unsigned Reg, const MCPhysReg *CSRegs) {
+  for (unsigned i = 0; CSRegs[i]; ++i)
+    if (Reg == CSRegs[i])
+      return true;
+  return false;
+}
+
+static bool isCSRestore(MachineInstr *MI, const MCPhysReg *CSRegs) {
+  unsigned RtIdx = 0;
+  if (MI->getOpcode() == AArch64::LDPXpost ||
+      MI->getOpcode() == AArch64::LDPDpost)
+    RtIdx = 1;
+
+  if (MI->getOpcode() == AArch64::LDPXpost ||
+      MI->getOpcode() == AArch64::LDPDpost ||
+      MI->getOpcode() == AArch64::LDPXi || MI->getOpcode() == AArch64::LDPDi) {
+    if (!isCalleeSavedRegister(MI->getOperand(RtIdx).getReg(), CSRegs) ||
+        !isCalleeSavedRegister(MI->getOperand(RtIdx + 1).getReg(), CSRegs) ||
+        MI->getOperand(RtIdx + 2).getReg() != AArch64::SP)
+      return false;
+    return true;
+  }
 
+  return false;
+}
+
+void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
+                                        MachineBasicBlock &MBB) const {
   MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
+  assert(MBBI->isReturn() && "Can only insert epilog into returning blocks");
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  const AArch64InstrInfo *TII =
+      static_cast<const AArch64InstrInfo *>(MF.getTarget().getInstrInfo());
+  const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>(
+      MF.getTarget().getRegisterInfo());
   DebugLoc DL = MBBI->getDebugLoc();
-  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
-  MachineFrameInfo &MFI = *MF.getFrameInfo();
   unsigned RetOpcode = MBBI->getOpcode();
 
+  int NumBytes = MFI->getStackSize();
+  const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+
   // Initial and residual are named for consitency with the prologue. Note that
   // in the epilogue, the residual adjustment is executed first.
-  uint64_t NumInitialBytes = FuncInfo->getInitialStackAdjust();
-  uint64_t NumResidualBytes = MFI.getStackSize() - NumInitialBytes;
   uint64_t ArgumentPopSize = 0;
-  if (RetOpcode == AArch64::TC_RETURNdi ||
-      RetOpcode == AArch64::TC_RETURNxi) {
-    MachineOperand &JumpTarget = MBBI->getOperand(0);
+  if (RetOpcode == AArch64::TCRETURNdi || RetOpcode == AArch64::TCRETURNri) {
     MachineOperand &StackAdjust = MBBI->getOperand(1);
 
-    MachineInstrBuilder MIB;
-    if (RetOpcode == AArch64::TC_RETURNdi) {
-      MIB = BuildMI(MBB, MBBI, DL, TII.get(AArch64::TAIL_Bimm));
-      if (JumpTarget.isGlobal()) {
-        MIB.addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset(),
-                             JumpTarget.getTargetFlags());
-      } else {
-        assert(JumpTarget.isSymbol() && "unexpected tail call destination");
-        MIB.addExternalSymbol(JumpTarget.getSymbolName(),
-                              JumpTarget.getTargetFlags());
-      }
-    } else {
-      assert(RetOpcode == AArch64::TC_RETURNxi && JumpTarget.isReg()
-             && "Unexpected tail call");
-
-      MIB = BuildMI(MBB, MBBI, DL, TII.get(AArch64::TAIL_BRx));
-      MIB.addReg(JumpTarget.getReg(), RegState::Kill);
-    }
-
-    // Add the extra operands onto the new tail call instruction even though
-    // they're not used directly (so that liveness is tracked properly etc).
-    for (unsigned i = 2, e = MBBI->getNumOperands(); i != e; ++i)
-        MIB->addOperand(MBBI->getOperand(i));
-
-
-    // Delete the pseudo instruction TC_RETURN.
-    MachineInstr *NewMI = std::prev(MBBI);
-    MBB.erase(MBBI);
-    MBBI = NewMI;
-
     // For a tail-call in a callee-pops-arguments environment, some or all of
     // the stack may actually be in use for the call's arguments, this is
     // calculated during LowerCall and consumed here...
@@ -241,386 +458,434 @@ AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
     // conveniently stored in the MachineFunctionInfo by
     // LowerFormalArguments. This will, of course, be zero for the C calling
     // convention.
-    ArgumentPopSize = FuncInfo->getArgumentStackToRestore();
+    ArgumentPopSize = AFI->getArgumentStackToRestore();
   }
 
-  assert(NumInitialBytes % 16 == 0 && NumResidualBytes % 16 == 0
-         && "refusing to adjust stack by misaligned amt");
-
-  // We may need to address callee-saved registers differently, so find out the
-  // bound on the frame indices.
-  const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
-  int MinCSFI = 0;
-  int MaxCSFI = -1;
-
-  if (CSI.size()) {
-    MinCSFI = CSI[0].getFrameIdx();
-    MaxCSFI = CSI[CSI.size() - 1].getFrameIdx();
+  // The stack frame should be like below,
+  //
+  //      ----------------------                     ---
+  //      |                    |                      |
+  //      | BytesInStackArgArea|              CalleeArgStackSize
+  //      | (NumReusableBytes) |                (of tail call)
+  //      |                    |                     ---
+  //      |                    |                      |
+  //      ---------------------|        ---           |
+  //      |                    |         |            |
+  //      |   CalleeSavedReg   |         |            |
+  //      | (NumRestores * 16) |         |            |
+  //      |                    |         |            |
+  //      ---------------------|         |         NumBytes
+  //      |                    |     StackSize  (StackAdjustUp)
+  //      |   LocalStackSize   |         |            |
+  //      | (covering callee   |         |            |
+  //      |       args)        |         |            |
+  //      |                    |         |            |
+  //      ----------------------        ---          ---
+  //
+  // So NumBytes = StackSize + BytesInStackArgArea - CalleeArgStackSize
+  //             = StackSize + ArgumentPopSize
+  //
+  // AArch64TargetLowering::LowerCall figures out ArgumentPopSize and keeps
+  // it as the 2nd argument of AArch64ISD::TC_RETURN.
+  NumBytes += ArgumentPopSize;
+
+  unsigned NumRestores = 0;
+  // Move past the restores of the callee-saved registers.
+  MachineBasicBlock::iterator LastPopI = MBBI;
+  const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF);
+  if (LastPopI != MBB.begin()) {
+    do {
+      ++NumRestores;
+      --LastPopI;
+    } while (LastPopI != MBB.begin() && isCSRestore(LastPopI, CSRegs));
+    if (!isCSRestore(LastPopI, CSRegs)) {
+      ++LastPopI;
+      --NumRestores;
+    }
   }
-
-  // The "residual" stack update comes first from this direction and guarantees
-  // that SP is NumInitialBytes below its value on function entry, either by a
-  // direct update or restoring it from the frame pointer.
-  if (NumInitialBytes + ArgumentPopSize != 0) {
-    emitSPUpdate(MBB, MBBI, DL, TII, AArch64::X16,
-                 NumInitialBytes + ArgumentPopSize);
-    --MBBI;
+  NumBytes -= NumRestores * 16;
+  assert(NumBytes >= 0 && "Negative stack allocation size!?");
+
+  if (!hasFP(MF)) {
+    // If this was a redzone leaf function, we don't need to restore the
+    // stack pointer.
+    if (!canUseRedZone(MF))
+      emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP, NumBytes,
+                      TII);
+    return;
   }
 
+  // Restore the original stack pointer.
+  // FIXME: Rather than doing the math here, we should instead just use
+  // non-post-indexed loads for the restores if we aren't actually going to
+  // be able to save any instructions.
+  if (NumBytes || MFI->hasVarSizedObjects())
+    emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::FP,
+                    -(NumRestores - 1) * 16, TII, MachineInstr::NoFlags);
+}
 
-  // MBBI now points to the instruction just past the last callee-saved
-  // restoration (either RET/B if NumInitialBytes == 0, or the "ADD sp, sp"
-  // otherwise).
+/// getFrameIndexOffset - Returns the displacement from the frame register to
+/// the stack frame of the specified index.
+int AArch64FrameLowering::getFrameIndexOffset(const MachineFunction &MF,
+                                              int FI) const {
+  unsigned FrameReg;
+  return getFrameIndexReference(MF, FI, FrameReg);
+}
 
-  // Now we need to find out where to put the bulk of the stack adjustment
-  MachineBasicBlock::iterator FirstEpilogue = MBBI;
-  while (MBBI != MBB.begin()) {
-    --MBBI;
+/// getFrameIndexReference - Provide a base+offset reference to an FI slot for
+/// debug info.  It's the same as what we use for resolving the code-gen
+/// references for now.  FIXME: This can go wrong when references are
+/// SP-relative and simple call frames aren't used.
+int AArch64FrameLowering::getFrameIndexReference(const MachineFunction &MF,
+                                                 int FI,
+                                                 unsigned &FrameReg) const {
+  return resolveFrameIndexReference(MF, FI, FrameReg);
+}
 
-    unsigned FrameOp;
-    for (FrameOp = 0; FrameOp < MBBI->getNumOperands(); ++FrameOp) {
-      if (MBBI->getOperand(FrameOp).isFI())
-        break;
+int AArch64FrameLowering::resolveFrameIndexReference(const MachineFunction &MF,
+                                                     int FI, unsigned &FrameReg,
+                                                     bool PreferFP) const {
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>(
+      MF.getTarget().getRegisterInfo());
+  const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+  int FPOffset = MFI->getObjectOffset(FI) + 16;
+  int Offset = MFI->getObjectOffset(FI) + MFI->getStackSize();
+  bool isFixed = MFI->isFixedObjectIndex(FI);
+
+  // Use frame pointer to reference fixed objects. Use it for locals if
+  // there are VLAs (and thus the SP isn't reliable as a base).
+  // Make sure useFPForScavengingIndex() does the right thing for the emergency
+  // spill slot.
+  bool UseFP = false;
+  if (AFI->hasStackFrame()) {
+    // Note: Keeping the following as multiple 'if' statements rather than
+    // merging to a single expression for readability.
+    //
+    // Argument access should always use the FP.
+    if (isFixed) {
+      UseFP = hasFP(MF);
+    } else if (hasFP(MF) && !RegInfo->hasBasePointer(MF)) {
+      // Use SP or FP, whichever gives us the best chance of the offset
+      // being in range for direct access. If the FPOffset is positive,
+      // that'll always be best, as the SP will be even further away.
+      // If the FPOffset is negative, we have to keep in mind that the
+      // available offset range for negative offsets is smaller than for
+      // positive ones. If we have variable sized objects, we're stuck with
+      // using the FP regardless, though, as the SP offset is unknown
+      // and we don't have a base pointer available. If an offset is
+      // available via the FP and the SP, use whichever is closest.
+      if (PreferFP || MFI->hasVarSizedObjects() || FPOffset >= 0 ||
+          (FPOffset >= -256 && Offset > -FPOffset))
+        UseFP = true;
     }
-
-    // If this instruction doesn't have a frame index we've reached the end of
-    // the callee-save restoration.
-    if (FrameOp == MBBI->getNumOperands())
-      break;
-
-    // Likewise if it *is* a local reference, but not to a callee-saved object.
-    int FrameIdx = MBBI->getOperand(FrameOp).getIndex();
-    if (FrameIdx < MinCSFI || FrameIdx > MaxCSFI)
-      break;
-
-    FirstEpilogue = MBBI;
   }
 
-  if (MF.getFrameInfo()->hasVarSizedObjects()) {
-    int64_t StaticFrameBase;
-    StaticFrameBase = -(NumInitialBytes + FuncInfo->getFramePointerOffset());
-    emitRegUpdate(MBB, FirstEpilogue, DL, TII,
-                  AArch64::XSP, AArch64::X29, AArch64::NoRegister,
-                  StaticFrameBase);
-  } else {
-    emitSPUpdate(MBB, FirstEpilogue, DL,TII, AArch64::X16, NumResidualBytes);
+  if (UseFP) {
+    FrameReg = RegInfo->getFrameRegister(MF);
+    return FPOffset;
   }
-}
 
-int64_t
-AArch64FrameLowering::resolveFrameIndexReference(MachineFunction &MF,
-                                                 int FrameIndex,
-                                                 unsigned &FrameReg,
-                                                 int SPAdj,
-                                                 bool IsCalleeSaveOp) const {
-  AArch64MachineFunctionInfo *FuncInfo =
-    MF.getInfo<AArch64MachineFunctionInfo>();
-  MachineFrameInfo *MFI = MF.getFrameInfo();
-
-  int64_t TopOfFrameOffset = MFI->getObjectOffset(FrameIndex);
-
-  assert(!(IsCalleeSaveOp && FuncInfo->getInitialStackAdjust() == 0)
-         && "callee-saved register in unexpected place");
-
-  // If the frame for this function is particularly large, we adjust the stack
-  // in two phases which means the callee-save related operations see a
-  // different (intermediate) stack size.
-  int64_t FrameRegPos;
-  if (IsCalleeSaveOp) {
-    FrameReg = AArch64::XSP;
-    FrameRegPos = -static_cast<int64_t>(FuncInfo->getInitialStackAdjust());
-  } else if (useFPForAddressing(MF)) {
-    // Have to use the frame pointer since we have no idea where SP is.
-    FrameReg = AArch64::X29;
-    FrameRegPos = FuncInfo->getFramePointerOffset();
-  } else {
-    FrameReg = AArch64::XSP;
-    FrameRegPos = -static_cast<int64_t>(MFI->getStackSize()) + SPAdj;
+  // Use the base pointer if we have one.
+  if (RegInfo->hasBasePointer(MF))
+    FrameReg = RegInfo->getBaseRegister();
+  else {
+    FrameReg = AArch64::SP;
+    // If we're using the red zone for this function, the SP won't actually
+    // be adjusted, so the offsets will be negative. They're also all
+    // within range of the signed 9-bit immediate instructions.
+    if (canUseRedZone(MF))
+      Offset -= AFI->getLocalStackSize();
   }
 
-  return TopOfFrameOffset - FrameRegPos;
+  return Offset;
 }
 
-void
-AArch64FrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
-                                                       RegScavenger *RS) const {
-  const AArch64RegisterInfo *RegInfo =
-    static_cast<const AArch64RegisterInfo *>(MF.getTarget().getRegisterInfo());
-  MachineFrameInfo *MFI = MF.getFrameInfo();
-  const AArch64InstrInfo &TII =
-    *static_cast<const AArch64InstrInfo *>(MF.getTarget().getInstrInfo());
-
-  if (hasFP(MF)) {
-    MF.getRegInfo().setPhysRegUsed(AArch64::X29);
-    MF.getRegInfo().setPhysRegUsed(AArch64::X30);
-  }
+static unsigned getPrologueDeath(MachineFunction &MF, unsigned Reg) {
+  if (Reg != AArch64::LR)
+    return getKillRegState(true);
 
-  // If addressing of local variables is going to be more complicated than
-  // shoving a base register and an offset into the instruction then we may well
-  // need to scavenge registers. We should either specifically add an
-  // callee-save register for this purpose or allocate an extra spill slot.
-  bool BigStack =
-    MFI->estimateStackSize(MF) >= TII.estimateRSStackLimit(MF)
-    || MFI->hasVarSizedObjects() // Access will be from X29: messes things up
-    || (MFI->adjustsStack() && !hasReservedCallFrame(MF));
-
-  if (!BigStack)
-    return;
-
-  // We certainly need some slack space for the scavenger, preferably an extra
-  // register.
-  const uint16_t *CSRegs = RegInfo->getCalleeSavedRegs();
-  uint16_t ExtraReg = AArch64::NoRegister;
-
-  for (unsigned i = 0; CSRegs[i]; ++i) {
-    if (AArch64::GPR64RegClass.contains(CSRegs[i]) &&
-        !MF.getRegInfo().isPhysRegUsed(CSRegs[i])) {
-      ExtraReg = CSRegs[i];
-      break;
-    }
-  }
-
-  if (ExtraReg != 0) {
-    MF.getRegInfo().setPhysRegUsed(ExtraReg);
-  } else {
-    assert(RS && "Expect register scavenger to be available");
-
-    // Create a stack slot for scavenging purposes. PrologEpilogInserter
-    // helpfully places it near either SP or FP for us to avoid
-    // infinitely-regression during scavenging.
-    const TargetRegisterClass *RC = &AArch64::GPR64RegClass;
-    RS->addScavengingFrameIndex(MFI->CreateStackObject(RC->getSize(),
-                                                       RC->getAlignment(),
-                                                       false));
-  }
+  // LR maybe referred to later by an @llvm.returnaddress intrinsic.
+  bool LRLiveIn = MF.getRegInfo().isLiveIn(AArch64::LR);
+  bool LRKill = !(LRLiveIn && MF.getFrameInfo()->isReturnAddressTaken());
+  return getKillRegState(LRKill);
 }
 
-bool AArch64FrameLowering::determinePrologueDeath(MachineBasicBlock &MBB,
-                                                  unsigned Reg) const {
-  // If @llvm.returnaddress is called then it will refer to X30 by some means;
-  // the prologue store does not kill the register.
-  if (Reg == AArch64::X30) {
-    if (MBB.getParent()->getFrameInfo()->isReturnAddressTaken()
-        && MBB.getParent()->getRegInfo().isLiveIn(Reg))
-    return false;
+bool AArch64FrameLowering::spillCalleeSavedRegisters(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+    const std::vector<CalleeSavedInfo> &CSI,
+    const TargetRegisterInfo *TRI) const {
+  MachineFunction &MF = *MBB.getParent();
+  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+  unsigned Count = CSI.size();
+  DebugLoc DL;
+  assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!");
+
+  if (MI != MBB.end())
+    DL = MI->getDebugLoc();
+
+  for (unsigned i = 0; i < Count; i += 2) {
+    unsigned idx = Count - i - 2;
+    unsigned Reg1 = CSI[idx].getReg();
+    unsigned Reg2 = CSI[idx + 1].getReg();
+    // GPRs and FPRs are saved in pairs of 64-bit regs. We expect the CSI
+    // list to come in sorted by frame index so that we can issue the store
+    // pair instructions directly. Assert if we see anything otherwise.
+    //
+    // The order of the registers in the list is controlled by
+    // getCalleeSavedRegs(), so they will always be in-order, as well.
+    assert(CSI[idx].getFrameIdx() + 1 == CSI[idx + 1].getFrameIdx() &&
+           "Out of order callee saved regs!");
+    unsigned StrOpc;
+    assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!");
+    assert((i & 1) == 0 && "Odd index for callee-saved reg spill!");
+    // Issue sequence of non-sp increment and pi sp spills for cs regs. The
+    // first spill is a pre-increment that allocates the stack.
+    // For example:
+    //    stp     x22, x21, [sp, #-48]!   // addImm(-6)
+    //    stp     x20, x19, [sp, #16]    // addImm(+2)
+    //    stp     fp, lr, [sp, #32]      // addImm(+4)
+    // Rationale: This sequence saves uop updates compared to a sequence of
+    // pre-increment spills like stp xi,xj,[sp,#-16]!
+    // Note: Similar rational and sequence for restores in epilog.
+    if (AArch64::GPR64RegClass.contains(Reg1)) {
+      assert(AArch64::GPR64RegClass.contains(Reg2) &&
+             "Expected GPR64 callee-saved register pair!");
+      // For first spill use pre-increment store.
+      if (i == 0)
+        StrOpc = AArch64::STPXpre;
+      else
+        StrOpc = AArch64::STPXi;
+    } else if (AArch64::FPR64RegClass.contains(Reg1)) {
+      assert(AArch64::FPR64RegClass.contains(Reg2) &&
+             "Expected FPR64 callee-saved register pair!");
+      // For first spill use pre-increment store.
+      if (i == 0)
+        StrOpc = AArch64::STPDpre;
+      else
+        StrOpc = AArch64::STPDi;
+    } else
+      llvm_unreachable("Unexpected callee saved register!");
+    DEBUG(dbgs() << "CSR spill: (" << TRI->getName(Reg1) << ", "
+                 << TRI->getName(Reg2) << ") -> fi#(" << CSI[idx].getFrameIdx()
+                 << ", " << CSI[idx + 1].getFrameIdx() << ")\n");
+    // Compute offset: i = 0 => offset = -Count;
+    //                 i = 2 => offset = -(Count - 2) + Count = 2 = i; etc.
+    const int Offset = (i == 0) ? -Count : i;
+    assert((Offset >= -64 && Offset <= 63) &&
+           "Offset out of bounds for STP immediate");
+    MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(StrOpc));
+    if (StrOpc == AArch64::STPDpre || StrOpc == AArch64::STPXpre)
+      MIB.addReg(AArch64::SP, RegState::Define);
+
+    MIB.addReg(Reg2, getPrologueDeath(MF, Reg2))
+        .addReg(Reg1, getPrologueDeath(MF, Reg1))
+        .addReg(AArch64::SP)
+        .addImm(Offset) // [sp, #offset * 8], where factor * 8 is implicit
+        .setMIFlag(MachineInstr::FrameSetup);
   }
-
-  // In all other cases, physical registers are dead after they've been saved
-  // but live at the beginning of the prologue block.
-  MBB.addLiveIn(Reg);
   return true;
 }
 
-void
-AArch64FrameLowering::emitFrameMemOps(bool isPrologue, MachineBasicBlock &MBB,
-                                      MachineBasicBlock::iterator MBBI,
-                                      const std::vector<CalleeSavedInfo> &CSI,
-                                      const TargetRegisterInfo *TRI,
-                                      const LoadStoreMethod PossClasses[],
-                                      unsigned NumClasses) const {
-  DebugLoc DL = MBB.findDebugLoc(MBBI);
+bool AArch64FrameLowering::restoreCalleeSavedRegisters(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+    const std::vector<CalleeSavedInfo> &CSI,
+    const TargetRegisterInfo *TRI) const {
   MachineFunction &MF = *MBB.getParent();
-  MachineFrameInfo &MFI = *MF.getFrameInfo();
   const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
+  unsigned Count = CSI.size();
+  DebugLoc DL;
+  assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!");
+
+  if (MI != MBB.end())
+    DL = MI->getDebugLoc();
+
+  for (unsigned i = 0; i < Count; i += 2) {
+    unsigned Reg1 = CSI[i].getReg();
+    unsigned Reg2 = CSI[i + 1].getReg();
+    // GPRs and FPRs are saved in pairs of 64-bit regs. We expect the CSI
+    // list to come in sorted by frame index so that we can issue the store
+    // pair instructions directly. Assert if we see anything otherwise.
+    assert(CSI[i].getFrameIdx() + 1 == CSI[i + 1].getFrameIdx() &&
+           "Out of order callee saved regs!");
+    // Issue sequence of non-sp increment and sp-pi restores for cs regs. Only
+    // the last load is sp-pi post-increment and de-allocates the stack:
+    // For example:
+    //    ldp     fp, lr, [sp, #32]       // addImm(+4)
+    //    ldp     x20, x19, [sp, #16]     // addImm(+2)
+    //    ldp     x22, x21, [sp], #48     // addImm(+6)
+    // Note: see comment in spillCalleeSavedRegisters()
+    unsigned LdrOpc;
+
+    assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!");
+    assert((i & 1) == 0 && "Odd index for callee-saved reg spill!");
+    if (AArch64::GPR64RegClass.contains(Reg1)) {
+      assert(AArch64::GPR64RegClass.contains(Reg2) &&
+             "Expected GPR64 callee-saved register pair!");
+      if (i == Count - 2)
+        LdrOpc = AArch64::LDPXpost;
+      else
+        LdrOpc = AArch64::LDPXi;
+    } else if (AArch64::FPR64RegClass.contains(Reg1)) {
+      assert(AArch64::FPR64RegClass.contains(Reg2) &&
+             "Expected FPR64 callee-saved register pair!");
+      if (i == Count - 2)
+        LdrOpc = AArch64::LDPDpost;
+      else
+        LdrOpc = AArch64::LDPDi;
+    } else
+      llvm_unreachable("Unexpected callee saved register!");
+    DEBUG(dbgs() << "CSR restore: (" << TRI->getName(Reg1) << ", "
+                 << TRI->getName(Reg2) << ") -> fi#(" << CSI[i].getFrameIdx()
+                 << ", " << CSI[i + 1].getFrameIdx() << ")\n");
+
+    // Compute offset: i = 0 => offset = Count - 2; i = 2 => offset = Count - 4;
+    // etc.
+    const int Offset = (i == Count - 2) ? Count : Count - i - 2;
+    assert((Offset >= -64 && Offset <= 63) &&
+           "Offset out of bounds for LDP immediate");
+    MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(LdrOpc));
+    if (LdrOpc == AArch64::LDPXpost || LdrOpc == AArch64::LDPDpost)
+      MIB.addReg(AArch64::SP, RegState::Define);
+
+    MIB.addReg(Reg2, getDefRegState(true))
+        .addReg(Reg1, getDefRegState(true))
+        .addReg(AArch64::SP)
+        .addImm(Offset); // [sp], #offset * 8  or [sp, #offset * 8]
+                         // where the factor * 8 is implicit
+  }
+  return true;
+}
 
-  // A certain amount of implicit contract is present here. The actual stack
-  // offsets haven't been allocated officially yet, so for strictly correct code
-  // we rely on the fact that the elements of CSI are allocated in order
-  // starting at SP, purely as dictated by size and alignment. In practice since
-  // this function handles the only accesses to those slots it's not quite so
-  // important.
-  //
-  // We have also ordered the Callee-saved register list in AArch64CallingConv
-  // so that the above scheme puts registers in order: in particular we want
-  // &X30 to be &X29+8 for an ABI-correct frame record (PCS 5.2.2)
-  for (unsigned i = 0, e = CSI.size(); i < e; ++i) {
-    unsigned Reg = CSI[i].getReg();
-
-    // First we need to find out which register class the register belongs to so
-    // that we can use the correct load/store instrucitons.
-    unsigned ClassIdx;
-    for (ClassIdx = 0; ClassIdx < NumClasses; ++ClassIdx) {
-      if (PossClasses[ClassIdx].RegClass->contains(Reg))
-        break;
-    }
-    assert(ClassIdx != NumClasses
-           && "Asked to store register in unexpected class");
-    const TargetRegisterClass &TheClass = *PossClasses[ClassIdx].RegClass;
-
-    // Now we need to decide whether it's possible to emit a paired instruction:
-    // for this we want the next register to be in the same class.
-    MachineInstrBuilder NewMI;
-    bool Pair = false;
-    if (i + 1 < CSI.size() && TheClass.contains(CSI[i+1].getReg())) {
-      Pair = true;
-      unsigned StLow = 0, StHigh = 0;
-      if (isPrologue) {
-        // Most of these registers will be live-in to the MBB and killed by our
-        // store, though there are exceptions (see determinePrologueDeath).
-        StLow = getKillRegState(determinePrologueDeath(MBB, CSI[i+1].getReg()));
-        StHigh = getKillRegState(determinePrologueDeath(MBB, CSI[i].getReg()));
-      } else {
-        StLow = RegState::Define;
-        StHigh = RegState::Define;
-      }
+void AArch64FrameLowering::processFunctionBeforeCalleeSavedScan(
+    MachineFunction &MF, RegScavenger *RS) const {
+  const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>(
+      MF.getTarget().getRegisterInfo());
+  AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+  MachineRegisterInfo *MRI = &MF.getRegInfo();
+  SmallVector<unsigned, 4> UnspilledCSGPRs;
+  SmallVector<unsigned, 4> UnspilledCSFPRs;
 
-      NewMI = BuildMI(MBB, MBBI, DL, TII.get(PossClasses[ClassIdx].PairOpcode))
-                .addReg(CSI[i+1].getReg(), StLow)
-                .addReg(CSI[i].getReg(), StHigh);
+  // The frame record needs to be created by saving the appropriate registers
+  if (hasFP(MF)) {
+    MRI->setPhysRegUsed(AArch64::FP);
+    MRI->setPhysRegUsed(AArch64::LR);
+  }
 
-      // If it's a paired op, we've consumed two registers
-      ++i;
-    } else {
-      unsigned State;
-      if (isPrologue) {
-        State = getKillRegState(determinePrologueDeath(MBB, CSI[i].getReg()));
+  // Spill the BasePtr if it's used. Do this first thing so that the
+  // getCalleeSavedRegs() below will get the right answer.
+  if (RegInfo->hasBasePointer(MF))
+    MRI->setPhysRegUsed(RegInfo->getBaseRegister());
+
+  // If any callee-saved registers are used, the frame cannot be eliminated.
+  unsigned NumGPRSpilled = 0;
+  unsigned NumFPRSpilled = 0;
+  bool ExtraCSSpill = false;
+  bool CanEliminateFrame = true;
+  DEBUG(dbgs() << "*** processFunctionBeforeCalleeSavedScan\nUsed CSRs:");
+  const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF);
+
+  // Check pairs of consecutive callee-saved registers.
+  for (unsigned i = 0; CSRegs[i]; i += 2) {
+    assert(CSRegs[i + 1] && "Odd number of callee-saved registers!");
+
+    const unsigned OddReg = CSRegs[i];
+    const unsigned EvenReg = CSRegs[i + 1];
+    assert((AArch64::GPR64RegClass.contains(OddReg) &&
+            AArch64::GPR64RegClass.contains(EvenReg)) ^
+               (AArch64::FPR64RegClass.contains(OddReg) &&
+                AArch64::FPR64RegClass.contains(EvenReg)) &&
+           "Register class mismatch!");
+
+    const bool OddRegUsed = MRI->isPhysRegUsed(OddReg);
+    const bool EvenRegUsed = MRI->isPhysRegUsed(EvenReg);
+
+    // Early exit if none of the registers in the register pair is actually
+    // used.
+    if (!OddRegUsed && !EvenRegUsed) {
+      if (AArch64::GPR64RegClass.contains(OddReg)) {
+        UnspilledCSGPRs.push_back(OddReg);
+        UnspilledCSGPRs.push_back(EvenReg);
       } else {
-        State = RegState::Define;
+        UnspilledCSFPRs.push_back(OddReg);
+        UnspilledCSFPRs.push_back(EvenReg);
       }
+      continue;
+    }
 
-      NewMI = BuildMI(MBB, MBBI, DL,
-                      TII.get(PossClasses[ClassIdx].SingleOpcode))
-                .addReg(CSI[i].getReg(), State);
+    unsigned Reg = AArch64::NoRegister;
+    // If only one of the registers of the register pair is used, make sure to
+    // mark the other one as used as well.
+    if (OddRegUsed ^ EvenRegUsed) {
+      // Find out which register is the additional spill.
+      Reg = OddRegUsed ? EvenReg : OddReg;
+      MRI->setPhysRegUsed(Reg);
     }
 
-    // Note that the FrameIdx refers to the second register in a pair: it will
-    // be allocated the smaller numeric address and so is the one an LDP/STP
-    // address must use.
-    int FrameIdx = CSI[i].getFrameIdx();
-    MachineMemOperand::MemOperandFlags Flags;
-    Flags = isPrologue ? MachineMemOperand::MOStore : MachineMemOperand::MOLoad;
-    MachineMemOperand *MMO =
-      MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FrameIdx),
-                             Flags,
-                             Pair ? TheClass.getSize() * 2 : TheClass.getSize(),
-                             MFI.getObjectAlignment(FrameIdx));
-
-    NewMI.addFrameIndex(FrameIdx)
-      .addImm(0)                  // address-register offset
-      .addMemOperand(MMO);
-
-    if (isPrologue)
-      NewMI.setMIFlags(MachineInstr::FrameSetup);
-
-    // For aesthetic reasons, during an epilogue we want to emit complementary
-    // operations to the prologue, but in the opposite order. So we still
-    // iterate through the CalleeSavedInfo list in order, but we put the
-    // instructions successively earlier in the MBB.
-    if (!isPrologue)
-      --MBBI;
+    DEBUG(dbgs() << ' ' << PrintReg(OddReg, RegInfo));
+    DEBUG(dbgs() << ' ' << PrintReg(EvenReg, RegInfo));
+
+    assert(((OddReg == AArch64::LR && EvenReg == AArch64::FP) ||
+            (RegInfo->getEncodingValue(OddReg) + 1 ==
+             RegInfo->getEncodingValue(EvenReg))) &&
+           "Register pair of non-adjacent registers!");
+    if (AArch64::GPR64RegClass.contains(OddReg)) {
+      NumGPRSpilled += 2;
+      // If it's not a reserved register, we can use it in lieu of an
+      // emergency spill slot for the register scavenger.
+      // FIXME: It would be better to instead keep looking and choose another
+      // unspilled register that isn't reserved, if there is one.
+      if (Reg != AArch64::NoRegister && !RegInfo->isReservedReg(MF, Reg))
+        ExtraCSSpill = true;
+    } else
+      NumFPRSpilled += 2;
+
+    CanEliminateFrame = false;
   }
-}
-
-bool
-AArch64FrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB,
-                                        MachineBasicBlock::iterator MBBI,
-                                        const std::vector<CalleeSavedInfo> &CSI,
-                                        const TargetRegisterInfo *TRI) const {
-  if (CSI.empty())
-    return false;
-
-  static const LoadStoreMethod PossibleClasses[] = {
-    {&AArch64::GPR64RegClass, AArch64::LSPair64_STR, AArch64::LS64_STR},
-    {&AArch64::FPR64RegClass, AArch64::LSFPPair64_STR, AArch64::LSFP64_STR},
-  };
-  const unsigned NumClasses = llvm::array_lengthof(PossibleClasses);
-
-  emitFrameMemOps(/* isPrologue = */ true, MBB, MBBI, CSI, TRI,
-                  PossibleClasses, NumClasses);
-
-  return true;
-}
-
-bool
-AArch64FrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
-                                        MachineBasicBlock::iterator MBBI,
-                                        const std::vector<CalleeSavedInfo> &CSI,
-                                        const TargetRegisterInfo *TRI) const {
-
-  if (CSI.empty())
-    return false;
-
-  static const LoadStoreMethod PossibleClasses[] = {
-    {&AArch64::GPR64RegClass, AArch64::LSPair64_LDR, AArch64::LS64_LDR},
-    {&AArch64::FPR64RegClass, AArch64::LSFPPair64_LDR, AArch64::LSFP64_LDR},
-  };
-  const unsigned NumClasses = llvm::array_lengthof(PossibleClasses);
-
-  emitFrameMemOps(/* isPrologue = */ false, MBB, MBBI, CSI, TRI,
-                  PossibleClasses, NumClasses);
-
-  return true;
-}
-
-bool
-AArch64FrameLowering::hasFP(const MachineFunction &MF) const {
-  const MachineFrameInfo *MFI = MF.getFrameInfo();
-  const TargetRegisterInfo *RI = MF.getTarget().getRegisterInfo();
-
-  // This is a decision of ABI compliance. The AArch64 PCS gives various options
-  // for conformance, and even at the most stringent level more or less permits
-  // elimination for leaf functions because there's no loss of functionality
-  // (for debugging etc)..
-  if (MF.getTarget().Options.DisableFramePointerElim(MF) && MFI->hasCalls())
-    return true;
 
-  // The following are hard-limits: incorrect code will be generated if we try
-  // to omit the frame.
-  return (RI->needsStackRealignment(MF) ||
-          MFI->hasVarSizedObjects() ||
-          MFI->isFrameAddressTaken());
-}
-
-bool
-AArch64FrameLowering::useFPForAddressing(const MachineFunction &MF) const {
-  return MF.getFrameInfo()->hasVarSizedObjects();
-}
-
-bool
-AArch64FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
-  const MachineFrameInfo *MFI = MF.getFrameInfo();
-
-  // Of the various reasons for having a frame pointer, it's actually only
-  // variable-sized objects that prevent reservation of a call frame.
-  return !(hasFP(MF) && MFI->hasVarSizedObjects());
-}
-
-void
-AArch64FrameLowering::eliminateCallFramePseudoInstr(
-                                MachineFunction &MF,
-                                MachineBasicBlock &MBB,
-                                MachineBasicBlock::iterator MI) const {
-  const AArch64InstrInfo &TII =
-    *static_cast<const AArch64InstrInfo *>(MF.getTarget().getInstrInfo());
-  DebugLoc dl = MI->getDebugLoc();
-  int Opcode = MI->getOpcode();
-  bool IsDestroy = Opcode == TII.getCallFrameDestroyOpcode();
-  uint64_t CalleePopAmount = IsDestroy ? MI->getOperand(1).getImm() : 0;
-
-  if (!hasReservedCallFrame(MF)) {
-    unsigned Align = getStackAlignment();
-
-    int64_t Amount = MI->getOperand(0).getImm();
-    Amount = RoundUpToAlignment(Amount, Align);
-    if (!IsDestroy) Amount = -Amount;
+  // FIXME: Set BigStack if any stack slot references may be out of range.
+  // For now, just conservatively guestimate based on unscaled indexing
+  // range. We'll end up allocating an unnecessary spill slot a lot, but
+  // realistically that's not a big deal at this stage of the game.
+  // The CSR spill slots have not been allocated yet, so estimateStackSize
+  // won't include them.
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  unsigned CFSize = estimateStackSize(MF) + 8 * (NumGPRSpilled + NumFPRSpilled);
+  DEBUG(dbgs() << "Estimated stack frame size: " << CFSize << " bytes.\n");
+  bool BigStack = (CFSize >= 256);
+  if (BigStack || !CanEliminateFrame || RegInfo->cannotEliminateFrame(MF))
+    AFI->setHasStackFrame(true);
+
+  // Estimate if we might need to scavenge a register at some point in order
+  // to materialize a stack offset. If so, either spill one additional
+  // callee-saved register or reserve a special spill slot to facilitate
+  // register scavenging. If we already spilled an extra callee-saved register
+  // above to keep the number of spills even, we don't need to do anything else
+  // here.
+  if (BigStack && !ExtraCSSpill) {
+
+    // If we're adding a register to spill here, we have to add two of them
+    // to keep the number of regs to spill even.
+    assert(((UnspilledCSGPRs.size() & 1) == 0) && "Odd number of registers!");
+    unsigned Count = 0;
+    while (!UnspilledCSGPRs.empty() && Count < 2) {
+      unsigned Reg = UnspilledCSGPRs.back();
+      UnspilledCSGPRs.pop_back();
+      DEBUG(dbgs() << "Spilling " << PrintReg(Reg, RegInfo)
+                   << " to get a scratch register.\n");
+      MRI->setPhysRegUsed(Reg);
+      ExtraCSSpill = true;
+      ++Count;
+    }
 
-    // N.b. if CalleePopAmount is valid but zero (i.e. callee would pop, but it
-    // doesn't have to pop anything), then the first operand will be zero too so
-    // this adjustment is a no-op.
-    if (CalleePopAmount == 0) {
-      // FIXME: in-function stack adjustment for calls is limited to 12-bits
-      // because there's no guaranteed temporary register available. Mostly call
-      // frames will be allocated at the start of a function so this is OK, but
-      // it is a limitation that needs dealing with.
-      assert(Amount > -0xfff && Amount < 0xfff && "call frame too large");
-      emitSPUpdate(MBB, MI, dl, TII, AArch64::NoRegister, Amount);
+    // If we didn't find an extra callee-saved register to spill, create
+    // an emergency spill slot.
+    if (!ExtraCSSpill) {
+      const TargetRegisterClass *RC = &AArch64::GPR64RegClass;
+      int FI = MFI->CreateStackObject(RC->getSize(), RC->getAlignment(), false);
+      RS->addScavengingFrameIndex(FI);
+      DEBUG(dbgs() << "No available CS registers, allocated fi#" << FI
+                   << " as the emergency spill slot.\n");
     }
-  } else if (CalleePopAmount != 0) {
-    // If the calling convention demands that the callee pops arguments from the
-    // stack, we want to add it back if we have a reserved call frame.
-    assert(CalleePopAmount < 0xfff && "call frame too large");
-    emitSPUpdate(MBB, MI, dl, TII, AArch64::NoRegister, -CalleePopAmount);
   }
-
-  MBB.erase(MI);
 }
diff --git a/lib/Target/AArch64/AArch64FrameLowering.h b/lib/Target/AArch64/AArch64FrameLowering.h
index 032dd90..0e00d16 100644
--- a/lib/Target/AArch64/AArch64FrameLowering.h
+++ b/lib/Target/AArch64/AArch64FrameLowering.h
@@ -1,4 +1,4 @@
-//==- AArch64FrameLowering.h - Define frame lowering for AArch64 -*- C++ -*--=//
+//==-- AArch64FrameLowering.h - TargetFrameLowering for AArch64 --*- C++ -*-==//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,100 +7,67 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This class implements the AArch64-specific parts of the TargetFrameLowering
-// class.
+//
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_AARCH64_FRAMEINFO_H
-#define LLVM_AARCH64_FRAMEINFO_H
+#ifndef AArch64_FRAMELOWERING_H
+#define AArch64_FRAMELOWERING_H
 
-#include "AArch64Subtarget.h"
 #include "llvm/Target/TargetFrameLowering.h"
 
 namespace llvm {
+
 class AArch64Subtarget;
+class AArch64TargetMachine;
 
 class AArch64FrameLowering : public TargetFrameLowering {
-private:
-  // In order to unify the spilling and restoring of callee-saved registers into
-  // emitFrameMemOps, we need to be able to specify which instructions to use
-  // for the relevant memory operations on each register class. An array of the
-  // following struct is populated and passed in to achieve this.
-  struct LoadStoreMethod {
-    const TargetRegisterClass *RegClass; // E.g. GPR64RegClass
-
-    // The preferred instruction.
-    unsigned PairOpcode; // E.g. LSPair64_STR
-
-    // Sometimes only a single register can be handled at once.
-    unsigned SingleOpcode; // E.g. LS64_STR
-  };
-protected:
-  const AArch64Subtarget &STI;
+  const AArch64TargetMachine &TM;
 
 public:
-  explicit AArch64FrameLowering(const AArch64Subtarget &sti)
-    : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, 16, 0, 16),
-      STI(sti) {
-  }
+  explicit AArch64FrameLowering(const AArch64TargetMachine &TM,
+                              const AArch64Subtarget &STI)
+      : TargetFrameLowering(StackGrowsDown, 16, 0, 16,
+                            false /*StackRealignable*/),
+        TM(TM) {}
 
-  /// emitProlog/emitEpilog - These methods insert prolog and epilog code into
-  /// the function.
-  virtual void emitPrologue(MachineFunction &MF) const;
-  virtual void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;
-
-  /// Decides how much stack adjustment to perform in each phase of the prologue
-  /// and epilogue.
-  void splitSPAdjustments(uint64_t Total, uint64_t &Initial,
-                          uint64_t &Residual) const;
-
-  int64_t resolveFrameIndexReference(MachineFunction &MF, int FrameIndex,
-                                     unsigned &FrameReg, int SPAdj,
-                                     bool IsCalleeSaveOp) const;
-
-  virtual void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
-                                                    RegScavenger *RS) const;
-
-  virtual bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
-                                        MachineBasicBlock::iterator MI,
-                                        const std::vector<CalleeSavedInfo> &CSI,
-                                        const TargetRegisterInfo *TRI) const;
-  virtual bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
-                                        MachineBasicBlock::iterator MI,
-                                        const std::vector<CalleeSavedInfo> &CSI,
-                                        const TargetRegisterInfo *TRI) const;
+  void emitCalleeSavedFrameMoves(MachineBasicBlock &MBB,
+                                 MachineBasicBlock::iterator MBBI,
+                                 unsigned FramePtr) const;
 
   void eliminateCallFramePseudoInstr(MachineFunction &MF,
-                                     MachineBasicBlock &MBB,
-                                     MachineBasicBlock::iterator MI) const;
-
-  /// If the register is X30 (i.e. LR) and the return address is used in the
-  /// function then the callee-save store doesn't actually kill the register,
-  /// otherwise it does.
-  bool determinePrologueDeath(MachineBasicBlock &MBB, unsigned Reg) const;
-
-  /// This function emits the loads or stores required during prologue and
-  /// epilogue as efficiently as possible.
-  ///
-  /// The operations involved in setting up and tearing down the frame are
-  /// similar enough to warrant a shared function, particularly as discrepancies
-  /// between the two would be disastrous.
-  void emitFrameMemOps(bool isStore, MachineBasicBlock &MBB,
-                       MachineBasicBlock::iterator MI,
-                       const std::vector<CalleeSavedInfo> &CSI,
-                       const TargetRegisterInfo *TRI,
-                       const LoadStoreMethod PossibleClasses[],
-                       unsigned NumClasses) const;
-
-
-  virtual bool hasFP(const MachineFunction &MF) const;
-
-  virtual bool useFPForAddressing(const MachineFunction &MF) const;
-
-  /// On AA
-  virtual bool hasReservedCallFrame(const MachineFunction &MF) const;
+                                  MachineBasicBlock &MBB,
+                                  MachineBasicBlock::iterator I) const override;
 
+  /// emitProlog/emitEpilog - These methods insert prolog and epilog code into
+  /// the function.
+  void emitPrologue(MachineFunction &MF) const override;
+  void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+
+  int getFrameIndexOffset(const MachineFunction &MF, int FI) const override;
+  int getFrameIndexReference(const MachineFunction &MF, int FI,
+                             unsigned &FrameReg) const override;
+  int resolveFrameIndexReference(const MachineFunction &MF, int FI,
+                                 unsigned &FrameReg,
+                                 bool PreferFP = false) const;
+  bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                 MachineBasicBlock::iterator MI,
+                                 const std::vector<CalleeSavedInfo> &CSI,
+                                 const TargetRegisterInfo *TRI) const override;
+
+  bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                  MachineBasicBlock::iterator MI,
+                                  const std::vector<CalleeSavedInfo> &CSI,
+                                  const TargetRegisterInfo *TRI) const override;
+
+  /// \brief Can this function use the red zone for local allocations.
+  bool canUseRedZone(const MachineFunction &MF) const;
+
+  bool hasFP(const MachineFunction &MF) const override;
+  bool hasReservedCallFrame(const MachineFunction &MF) const override;
+
+  void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
+                                            RegScavenger *RS) const override;
 };
 
 } // End llvm namespace
diff --git a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index dac4b32..7007ffc 100644
--- a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -11,118 +11,119 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "aarch64-isel"
-#include "AArch64.h"
-#include "AArch64InstrInfo.h"
-#include "AArch64Subtarget.h"
 #include "AArch64TargetMachine.h"
-#include "Utils/AArch64BaseInfo.h"
+#include "MCTargetDesc/AArch64AddressingModes.h"
 #include "llvm/ADT/APSInt.h"
 #include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/IR/Function.h" // To access function attributes.
 #include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/Intrinsics.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
 
 using namespace llvm;
 
+#define DEBUG_TYPE "aarch64-isel"
+
 //===--------------------------------------------------------------------===//
-/// AArch64 specific code to select AArch64 machine instructions for
-/// SelectionDAG operations.
+/// AArch64DAGToDAGISel - AArch64 specific code to select AArch64 machine
+/// instructions for SelectionDAG operations.
 ///
 namespace {
 
 class AArch64DAGToDAGISel : public SelectionDAGISel {
   AArch64TargetMachine &TM;
 
-  /// Keep a pointer to the AArch64Subtarget around so that we can
+  /// Subtarget - Keep a pointer to the AArch64Subtarget around so that we can
   /// make the right decision when generating code for different targets.
   const AArch64Subtarget *Subtarget;
 
+  bool ForCodeSize;
+
 public:
   explicit AArch64DAGToDAGISel(AArch64TargetMachine &tm,
                                CodeGenOpt::Level OptLevel)
-    : SelectionDAGISel(tm, OptLevel), TM(tm),
-      Subtarget(&TM.getSubtarget<AArch64Subtarget>()) {
-  }
+      : SelectionDAGISel(tm, OptLevel), TM(tm), Subtarget(nullptr),
+        ForCodeSize(false) {}
 
-  virtual const char *getPassName() const {
+  const char *getPassName() const override {
     return "AArch64 Instruction Selection";
   }
 
-  // Include the pieces autogenerated from the target description.
-#include "AArch64GenDAGISel.inc"
-
-  template<unsigned MemSize>
-  bool SelectOffsetUImm12(SDValue N, SDValue &UImm12) {
-    const ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N);
-    if (!CN || CN->getZExtValue() % MemSize != 0
-        || CN->getZExtValue() / MemSize > 0xfff)
-      return false;
-
-    UImm12 =  CurDAG->getTargetConstant(CN->getZExtValue() / MemSize, MVT::i64);
-    return true;
-  }
-
-  template<unsigned RegWidth>
-  bool SelectCVTFixedPosOperand(SDValue N, SDValue &FixedPos) {
-    return SelectCVTFixedPosOperand(N, FixedPos, RegWidth);
-  }
-
-  /// Used for pre-lowered address-reference nodes, so we already know
-  /// the fields match. This operand's job is simply to add an
-  /// appropriate shift operand to the MOVZ/MOVK instruction.
-  template<unsigned LogShift>
-  bool SelectMOVWAddressRef(SDValue N, SDValue &Imm, SDValue &Shift) {
-    Imm = N;
-    Shift = CurDAG->getTargetConstant(LogShift, MVT::i32);
-    return true;
+  bool runOnMachineFunction(MachineFunction &MF) override {
+    AttributeSet FnAttrs = MF.getFunction()->getAttributes();
+    ForCodeSize =
+        FnAttrs.hasAttribute(AttributeSet::FunctionIndex,
+                             Attribute::OptimizeForSize) ||
+        FnAttrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::MinSize);
+    Subtarget = &TM.getSubtarget<AArch64Subtarget>();
+    return SelectionDAGISel::runOnMachineFunction(MF);
   }
 
-  bool SelectFPZeroOperand(SDValue N, SDValue &Dummy);
-
-  bool SelectCVTFixedPosOperand(SDValue N, SDValue &FixedPos,
-                                unsigned RegWidth);
+  SDNode *Select(SDNode *Node) override;
 
+  /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
+  /// inline asm expressions.
   bool SelectInlineAsmMemoryOperand(const SDValue &Op,
                                     char ConstraintCode,
-                                    std::vector<SDValue> &OutOps);
-
-  bool SelectLogicalImm(SDValue N, SDValue &Imm);
-
-  template<unsigned RegWidth>
-  bool SelectTSTBOperand(SDValue N, SDValue &FixedPos) {
-    return SelectTSTBOperand(N, FixedPos, RegWidth);
+                                    std::vector<SDValue> &OutOps) override;
+
+  SDNode *SelectMLAV64LaneV128(SDNode *N);
+  SDNode *SelectMULLV64LaneV128(unsigned IntNo, SDNode *N);
+  bool SelectArithExtendedRegister(SDValue N, SDValue &Reg, SDValue &Shift);
+  bool SelectArithImmed(SDValue N, SDValue &Val, SDValue &Shift);
+  bool SelectNegArithImmed(SDValue N, SDValue &Val, SDValue &Shift);
+  bool SelectArithShiftedRegister(SDValue N, SDValue &Reg, SDValue &Shift) {
+    return SelectShiftedRegister(N, false, Reg, Shift);
+  }
+  bool SelectLogicalShiftedRegister(SDValue N, SDValue &Reg, SDValue &Shift) {
+    return SelectShiftedRegister(N, true, Reg, Shift);
+  }
+  bool SelectAddrModeIndexed8(SDValue N, SDValue &Base, SDValue &OffImm) {
+    return SelectAddrModeIndexed(N, 1, Base, OffImm);
+  }
+  bool SelectAddrModeIndexed16(SDValue N, SDValue &Base, SDValue &OffImm) {
+    return SelectAddrModeIndexed(N, 2, Base, OffImm);
+  }
+  bool SelectAddrModeIndexed32(SDValue N, SDValue &Base, SDValue &OffImm) {
+    return SelectAddrModeIndexed(N, 4, Base, OffImm);
+  }
+  bool SelectAddrModeIndexed64(SDValue N, SDValue &Base, SDValue &OffImm) {
+    return SelectAddrModeIndexed(N, 8, Base, OffImm);
+  }
+  bool SelectAddrModeIndexed128(SDValue N, SDValue &Base, SDValue &OffImm) {
+    return SelectAddrModeIndexed(N, 16, Base, OffImm);
+  }
+  bool SelectAddrModeUnscaled8(SDValue N, SDValue &Base, SDValue &OffImm) {
+    return SelectAddrModeUnscaled(N, 1, Base, OffImm);
+  }
+  bool SelectAddrModeUnscaled16(SDValue N, SDValue &Base, SDValue &OffImm) {
+    return SelectAddrModeUnscaled(N, 2, Base, OffImm);
+  }
+  bool SelectAddrModeUnscaled32(SDValue N, SDValue &Base, SDValue &OffImm) {
+    return SelectAddrModeUnscaled(N, 4, Base, OffImm);
+  }
+  bool SelectAddrModeUnscaled64(SDValue N, SDValue &Base, SDValue &OffImm) {
+    return SelectAddrModeUnscaled(N, 8, Base, OffImm);
+  }
+  bool SelectAddrModeUnscaled128(SDValue N, SDValue &Base, SDValue &OffImm) {
+    return SelectAddrModeUnscaled(N, 16, Base, OffImm);
   }
 
-  bool SelectTSTBOperand(SDValue N, SDValue &FixedPos, unsigned RegWidth);
-
-  SDNode *SelectAtomic(SDNode *N, unsigned Op8, unsigned Op16, unsigned Op32,
-                       unsigned Op64);
-
-  /// Put the given constant into a pool and return a DAG which will give its
-  /// address.
-  SDValue getConstantPoolItemAddress(SDLoc DL, const Constant *CV);
-
-  SDNode *TrySelectToMoveImm(SDNode *N);
-  SDNode *LowerToFPLitPool(SDNode *Node);
-  SDNode *SelectToLitPool(SDNode *N);
-
-  SDNode* Select(SDNode*);
-private:
-  /// Get the opcode for table lookup instruction
-  unsigned getTBLOpc(bool IsExt, bool Is64Bit, unsigned NumOfVec);
-
-  /// Select NEON table lookup intrinsics.  NumVecs should be 1, 2, 3 or 4.
-  /// IsExt is to indicate if the result will be extended with an argument.
-  SDNode *SelectVTBL(SDNode *N, unsigned NumVecs, bool IsExt);
+  template<int Width>
+  bool SelectAddrModeWRO(SDValue N, SDValue &Base, SDValue &Offset,
+                         SDValue &SignExtend, SDValue &DoShift) {
+    return SelectAddrModeWRO(N, Width / 8, Base, Offset, SignExtend, DoShift);
+  }
 
-  /// Select NEON load intrinsics.  NumVecs should be 1, 2, 3 or 4.
-  SDNode *SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs,
-                    const uint16_t *Opcode);
+  template<int Width>
+  bool SelectAddrModeXRO(SDValue N, SDValue &Base, SDValue &Offset,
+                         SDValue &SignExtend, SDValue &DoShift) {
+    return SelectAddrModeXRO(N, Width / 8, Base, Offset, SignExtend, DoShift);
+  }
 
-  /// Select NEON store intrinsics.  NumVecs should be 1, 2, 3 or 4.
-  SDNode *SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs,
-                    const uint16_t *Opcodes);
 
   /// Form sequences of consecutive 64/128-bit registers for use in NEON
   /// instructions making use of a vector-list (e.g. ldN, tbl). Vecs must have
@@ -136,315 +137,713 @@ private:
   SDValue createTuple(ArrayRef<SDValue> Vecs, unsigned RegClassIDs[],
                       unsigned SubRegs[]);
 
-  /// Select NEON load-duplicate intrinsics.  NumVecs should be 2, 3 or 4.
-  /// The opcode array specifies the instructions used for load.
-  SDNode *SelectVLDDup(SDNode *N, bool isUpdating, unsigned NumVecs,
-                       const uint16_t *Opcodes);
+  SDNode *SelectTable(SDNode *N, unsigned NumVecs, unsigned Opc, bool isExt);
+
+  SDNode *SelectIndexedLoad(SDNode *N, bool &Done);
+
+  SDNode *SelectLoad(SDNode *N, unsigned NumVecs, unsigned Opc,
+                     unsigned SubRegIdx);
+  SDNode *SelectPostLoad(SDNode *N, unsigned NumVecs, unsigned Opc,
+                         unsigned SubRegIdx);
+  SDNode *SelectLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc);
+  SDNode *SelectPostLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc);
+
+  SDNode *SelectStore(SDNode *N, unsigned NumVecs, unsigned Opc);
+  SDNode *SelectPostStore(SDNode *N, unsigned NumVecs, unsigned Opc);
+  SDNode *SelectStoreLane(SDNode *N, unsigned NumVecs, unsigned Opc);
+  SDNode *SelectPostStoreLane(SDNode *N, unsigned NumVecs, unsigned Opc);
+
+  SDNode *SelectSIMDAddSubNarrowing(unsigned IntNo, SDNode *Node);
+  SDNode *SelectSIMDXtnNarrowing(unsigned IntNo, SDNode *Node);
+
+  SDNode *SelectBitfieldExtractOp(SDNode *N);
+  SDNode *SelectBitfieldInsertOp(SDNode *N);
+
+  SDNode *SelectLIBM(SDNode *N);
+
+// Include the pieces autogenerated from the target description.
+#include "AArch64GenDAGISel.inc"
+
+private:
+  bool SelectShiftedRegister(SDValue N, bool AllowROR, SDValue &Reg,
+                             SDValue &Shift);
+  bool SelectAddrModeIndexed(SDValue N, unsigned Size, SDValue &Base,
+                             SDValue &OffImm);
+  bool SelectAddrModeUnscaled(SDValue N, unsigned Size, SDValue &Base,
+                              SDValue &OffImm);
+  bool SelectAddrModeWRO(SDValue N, unsigned Size, SDValue &Base,
+                         SDValue &Offset, SDValue &SignExtend,
+                         SDValue &DoShift);
+  bool SelectAddrModeXRO(SDValue N, unsigned Size, SDValue &Base,
+                         SDValue &Offset, SDValue &SignExtend,
+                         SDValue &DoShift);
+  bool isWorthFolding(SDValue V) const;
+  bool SelectExtendedSHL(SDValue N, unsigned Size, bool WantExtend,
+                         SDValue &Offset, SDValue &SignExtend);
 
-  /// Select NEON load/store lane intrinsics.  NumVecs should be 2, 3 or 4.
-  /// The opcode arrays specify the instructions used for load/store.
-  SDNode *SelectVLDSTLane(SDNode *N, bool IsLoad, bool isUpdating,
-                          unsigned NumVecs, const uint16_t *Opcodes);
+  template<unsigned RegWidth>
+  bool SelectCVTFixedPosOperand(SDValue N, SDValue &FixedPos) {
+    return SelectCVTFixedPosOperand(N, FixedPos, RegWidth);
+  }
 
-  SDValue getTargetSubregToReg(int SRIdx, SDLoc DL, EVT VT, EVT VTD,
-                               SDValue Operand);
+  bool SelectCVTFixedPosOperand(SDValue N, SDValue &FixedPos, unsigned Width);
 };
+} // end anonymous namespace
+
+/// isIntImmediate - This method tests to see if the node is a constant
+/// operand. If so Imm will receive the 32-bit value.
+static bool isIntImmediate(const SDNode *N, uint64_t &Imm) {
+  if (const ConstantSDNode *C = dyn_cast<const ConstantSDNode>(N)) {
+    Imm = C->getZExtValue();
+    return true;
+  }
+  return false;
 }
 
-bool
-AArch64DAGToDAGISel::SelectCVTFixedPosOperand(SDValue N, SDValue &FixedPos,
-                                              unsigned RegWidth) {
-  const ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(N);
-  if (!CN) return false;
+// isIntImmediate - This method tests to see if a constant operand.
+// If so Imm will receive the value.
+static bool isIntImmediate(SDValue N, uint64_t &Imm) {
+  return isIntImmediate(N.getNode(), Imm);
+}
 
-  // An FCVT[SU] instruction performs: convertToInt(Val * 2^fbits) where fbits
-  // is between 1 and 32 for a destination w-register, or 1 and 64 for an
-  // x-register.
-  //
-  // By this stage, we've detected (fp_to_[su]int (fmul Val, THIS_NODE)) so we
-  // want THIS_NODE to be 2^fbits. This is much easier to deal with using
-  // integers.
-  bool IsExact;
+// isOpcWithIntImmediate - This method tests to see if the node is a specific
+// opcode and that it has a immediate integer right operand.
+// If so Imm will receive the 32 bit value.
+static bool isOpcWithIntImmediate(const SDNode *N, unsigned Opc,
+                                  uint64_t &Imm) {
+  return N->getOpcode() == Opc &&
+         isIntImmediate(N->getOperand(1).getNode(), Imm);
+}
 
-  // fbits is between 1 and 64 in the worst-case, which means the fmul
-  // could have 2^64 as an actual operand. Need 65 bits of precision.
-  APSInt IntVal(65, true);
-  CN->getValueAPF().convertToInteger(IntVal, APFloat::rmTowardZero, &IsExact);
+bool AArch64DAGToDAGISel::SelectInlineAsmMemoryOperand(
+    const SDValue &Op, char ConstraintCode, std::vector<SDValue> &OutOps) {
+  assert(ConstraintCode == 'm' && "unexpected asm memory constraint");
+  // Require the address to be in a register.  That is safe for all AArch64
+  // variants and it is hard to do anything much smarter without knowing
+  // how the operand is used.
+  OutOps.push_back(Op);
+  return false;
+}
 
-  // N.b. isPowerOf2 also checks for > 0.
-  if (!IsExact || !IntVal.isPowerOf2()) return false;
-  unsigned FBits = IntVal.logBase2();
+/// SelectArithImmed - Select an immediate value that can be represented as
+/// a 12-bit value shifted left by either 0 or 12.  If so, return true with
+/// Val set to the 12-bit value and Shift set to the shifter operand.
+bool AArch64DAGToDAGISel::SelectArithImmed(SDValue N, SDValue &Val,
+                                           SDValue &Shift) {
+  // This function is called from the addsub_shifted_imm ComplexPattern,
+  // which lists [imm] as the list of opcode it's interested in, however
+  // we still need to check whether the operand is actually an immediate
+  // here because the ComplexPattern opcode list is only used in
+  // root-level opcode matching.
+  if (!isa<ConstantSDNode>(N.getNode()))
+    return false;
 
-  // Checks above should have guaranteed that we haven't lost information in
-  // finding FBits, but it must still be in range.
-  if (FBits == 0 || FBits > RegWidth) return false;
+  uint64_t Immed = cast<ConstantSDNode>(N.getNode())->getZExtValue();
+  unsigned ShiftAmt;
 
-  FixedPos = CurDAG->getTargetConstant(64 - FBits, MVT::i32);
+  if (Immed >> 12 == 0) {
+    ShiftAmt = 0;
+  } else if ((Immed & 0xfff) == 0 && Immed >> 24 == 0) {
+    ShiftAmt = 12;
+    Immed = Immed >> 12;
+  } else
+    return false;
+
+  unsigned ShVal = AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt);
+  Val = CurDAG->getTargetConstant(Immed, MVT::i32);
+  Shift = CurDAG->getTargetConstant(ShVal, MVT::i32);
   return true;
 }
 
-bool
-AArch64DAGToDAGISel::SelectInlineAsmMemoryOperand(const SDValue &Op,
-                                                 char ConstraintCode,
-                                                 std::vector<SDValue> &OutOps) {
-  switch (ConstraintCode) {
-  default: llvm_unreachable("Unrecognised AArch64 memory constraint");
-  case 'm':
-    // FIXME: more freedom is actually permitted for 'm'. We can go
-    // hunting for a base and an offset if we want. Of course, since
-    // we don't really know how the operand is going to be used we're
-    // probably restricted to the load/store pair's simm7 as an offset
-    // range anyway.
-  case 'Q':
-    OutOps.push_back(Op);
+/// SelectNegArithImmed - As above, but negates the value before trying to
+/// select it.
+bool AArch64DAGToDAGISel::SelectNegArithImmed(SDValue N, SDValue &Val,
+                                              SDValue &Shift) {
+  // This function is called from the addsub_shifted_imm ComplexPattern,
+  // which lists [imm] as the list of opcode it's interested in, however
+  // we still need to check whether the operand is actually an immediate
+  // here because the ComplexPattern opcode list is only used in
+  // root-level opcode matching.
+  if (!isa<ConstantSDNode>(N.getNode()))
+    return false;
+
+  // The immediate operand must be a 24-bit zero-extended immediate.
+  uint64_t Immed = cast<ConstantSDNode>(N.getNode())->getZExtValue();
+
+  // This negation is almost always valid, but "cmp wN, #0" and "cmn wN, #0"
+  // have the opposite effect on the C flag, so this pattern mustn't match under
+  // those circumstances.
+  if (Immed == 0)
+    return false;
+
+  if (N.getValueType() == MVT::i32)
+    Immed = ~((uint32_t)Immed) + 1;
+  else
+    Immed = ~Immed + 1ULL;
+  if (Immed & 0xFFFFFFFFFF000000ULL)
+    return false;
+
+  Immed &= 0xFFFFFFULL;
+  return SelectArithImmed(CurDAG->getConstant(Immed, MVT::i32), Val, Shift);
+}
+
+/// getShiftTypeForNode - Translate a shift node to the corresponding
+/// ShiftType value.
+static AArch64_AM::ShiftExtendType getShiftTypeForNode(SDValue N) {
+  switch (N.getOpcode()) {
+  default:
+    return AArch64_AM::InvalidShiftExtend;
+  case ISD::SHL:
+    return AArch64_AM::LSL;
+  case ISD::SRL:
+    return AArch64_AM::LSR;
+  case ISD::SRA:
+    return AArch64_AM::ASR;
+  case ISD::ROTR:
+    return AArch64_AM::ROR;
   }
+}
 
+/// \brief Determine wether it is worth to fold V into an extended register.
+bool AArch64DAGToDAGISel::isWorthFolding(SDValue V) const {
+  // it hurts if the a value is used at least twice, unless we are optimizing
+  // for code size.
+  if (ForCodeSize || V.hasOneUse())
+    return true;
   return false;
 }
 
-bool
-AArch64DAGToDAGISel::SelectFPZeroOperand(SDValue N, SDValue &Dummy) {
-  ConstantFPSDNode *Imm = dyn_cast<ConstantFPSDNode>(N);
-  if (!Imm || !Imm->getValueAPF().isPosZero())
+/// SelectShiftedRegister - Select a "shifted register" operand.  If the value
+/// is not shifted, set the Shift operand to default of "LSL 0".  The logical
+/// instructions allow the shifted register to be rotated, but the arithmetic
+/// instructions do not.  The AllowROR parameter specifies whether ROR is
+/// supported.
+bool AArch64DAGToDAGISel::SelectShiftedRegister(SDValue N, bool AllowROR,
+                                                SDValue &Reg, SDValue &Shift) {
+  AArch64_AM::ShiftExtendType ShType = getShiftTypeForNode(N);
+  if (ShType == AArch64_AM::InvalidShiftExtend)
+    return false;
+  if (!AllowROR && ShType == AArch64_AM::ROR)
     return false;
 
-  // Doesn't actually carry any information, but keeps TableGen quiet.
-  Dummy = CurDAG->getTargetConstant(0, MVT::i32);
-  return true;
+  if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+    unsigned BitSize = N.getValueType().getSizeInBits();
+    unsigned Val = RHS->getZExtValue() & (BitSize - 1);
+    unsigned ShVal = AArch64_AM::getShifterImm(ShType, Val);
+
+    Reg = N.getOperand(0);
+    Shift = CurDAG->getTargetConstant(ShVal, MVT::i32);
+    return isWorthFolding(N);
+  }
+
+  return false;
 }
 
-bool AArch64DAGToDAGISel::SelectLogicalImm(SDValue N, SDValue &Imm) {
-  uint32_t Bits;
-  uint32_t RegWidth = N.getValueType().getSizeInBits();
+/// getExtendTypeForNode - Translate an extend node to the corresponding
+/// ExtendType value.
+static AArch64_AM::ShiftExtendType
+getExtendTypeForNode(SDValue N, bool IsLoadStore = false) {
+  if (N.getOpcode() == ISD::SIGN_EXTEND ||
+      N.getOpcode() == ISD::SIGN_EXTEND_INREG) {
+    EVT SrcVT;
+    if (N.getOpcode() == ISD::SIGN_EXTEND_INREG)
+      SrcVT = cast<VTSDNode>(N.getOperand(1))->getVT();
+    else
+      SrcVT = N.getOperand(0).getValueType();
+
+    if (!IsLoadStore && SrcVT == MVT::i8)
+      return AArch64_AM::SXTB;
+    else if (!IsLoadStore && SrcVT == MVT::i16)
+      return AArch64_AM::SXTH;
+    else if (SrcVT == MVT::i32)
+      return AArch64_AM::SXTW;
+    assert(SrcVT != MVT::i64 && "extend from 64-bits?");
+
+    return AArch64_AM::InvalidShiftExtend;
+  } else if (N.getOpcode() == ISD::ZERO_EXTEND ||
+             N.getOpcode() == ISD::ANY_EXTEND) {
+    EVT SrcVT = N.getOperand(0).getValueType();
+    if (!IsLoadStore && SrcVT == MVT::i8)
+      return AArch64_AM::UXTB;
+    else if (!IsLoadStore && SrcVT == MVT::i16)
+      return AArch64_AM::UXTH;
+    else if (SrcVT == MVT::i32)
+      return AArch64_AM::UXTW;
+    assert(SrcVT != MVT::i64 && "extend from 64-bits?");
+
+    return AArch64_AM::InvalidShiftExtend;
+  } else if (N.getOpcode() == ISD::AND) {
+    ConstantSDNode *CSD = dyn_cast<ConstantSDNode>(N.getOperand(1));
+    if (!CSD)
+      return AArch64_AM::InvalidShiftExtend;
+    uint64_t AndMask = CSD->getZExtValue();
+
+    switch (AndMask) {
+    default:
+      return AArch64_AM::InvalidShiftExtend;
+    case 0xFF:
+      return !IsLoadStore ? AArch64_AM::UXTB : AArch64_AM::InvalidShiftExtend;
+    case 0xFFFF:
+      return !IsLoadStore ? AArch64_AM::UXTH : AArch64_AM::InvalidShiftExtend;
+    case 0xFFFFFFFF:
+      return AArch64_AM::UXTW;
+    }
+  }
 
-  ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N);
-  if (!CN) return false;
+  return AArch64_AM::InvalidShiftExtend;
+}
+
+// Helper for SelectMLAV64LaneV128 - Recognize high lane extracts.
+static bool checkHighLaneIndex(SDNode *DL, SDValue &LaneOp, int &LaneIdx) {
+  if (DL->getOpcode() != AArch64ISD::DUPLANE16 &&
+      DL->getOpcode() != AArch64ISD::DUPLANE32)
+    return false;
 
-  if (!A64Imms::isLogicalImm(RegWidth, CN->getZExtValue(), Bits))
+  SDValue SV = DL->getOperand(0);
+  if (SV.getOpcode() != ISD::INSERT_SUBVECTOR)
     return false;
 
-  Imm = CurDAG->getTargetConstant(Bits, MVT::i32);
+  SDValue EV = SV.getOperand(1);
+  if (EV.getOpcode() != ISD::EXTRACT_SUBVECTOR)
+    return false;
+
+  ConstantSDNode *DLidx = cast<ConstantSDNode>(DL->getOperand(1).getNode());
+  ConstantSDNode *EVidx = cast<ConstantSDNode>(EV.getOperand(1).getNode());
+  LaneIdx = DLidx->getSExtValue() + EVidx->getSExtValue();
+  LaneOp = EV.getOperand(0);
+
   return true;
 }
 
-SDNode *AArch64DAGToDAGISel::TrySelectToMoveImm(SDNode *Node) {
-  SDNode *ResNode;
-  SDLoc dl(Node);
-  EVT DestType = Node->getValueType(0);
-  unsigned DestWidth = DestType.getSizeInBits();
-
-  unsigned MOVOpcode;
-  EVT MOVType;
-  int UImm16, Shift;
-  uint32_t LogicalBits;
-
-  uint64_t BitPat = cast<ConstantSDNode>(Node)->getZExtValue();
-  if (A64Imms::isMOVZImm(DestWidth, BitPat, UImm16, Shift)) {
-    MOVType = DestType;
-    MOVOpcode = DestWidth == 64 ? AArch64::MOVZxii : AArch64::MOVZwii;
-  } else if (A64Imms::isMOVNImm(DestWidth, BitPat, UImm16, Shift)) {
-    MOVType = DestType;
-    MOVOpcode = DestWidth == 64 ? AArch64::MOVNxii : AArch64::MOVNwii;
-  } else if (DestWidth == 64 && A64Imms::isMOVNImm(32, BitPat, UImm16, Shift)) {
-    // To get something like 0x0000_0000_ffff_1234 into a 64-bit register we can
-    // use a 32-bit instruction: "movn w0, 0xedbc".
-    MOVType = MVT::i32;
-    MOVOpcode = AArch64::MOVNwii;
-  } else if (A64Imms::isLogicalImm(DestWidth, BitPat, LogicalBits))  {
-    MOVOpcode = DestWidth == 64 ? AArch64::ORRxxi : AArch64::ORRwwi;
-    uint16_t ZR = DestWidth == 64 ? AArch64::XZR : AArch64::WZR;
-
-    return CurDAG->getMachineNode(MOVOpcode, dl, DestType,
-                              CurDAG->getRegister(ZR, DestType),
-                              CurDAG->getTargetConstant(LogicalBits, MVT::i32));
-  } else {
-    // Can't handle it in one instruction. There's scope for permitting two (or
-    // more) instructions, but that'll need more thought.
-    return NULL;
+// Helper for SelectOpcV64LaneV128 - Recogzine operatinos where one operand is a
+// high lane extract.
+static bool checkV64LaneV128(SDValue Op0, SDValue Op1, SDValue &StdOp,
+                             SDValue &LaneOp, int &LaneIdx) {
+
+  if (!checkHighLaneIndex(Op0.getNode(), LaneOp, LaneIdx)) {
+    std::swap(Op0, Op1);
+    if (!checkHighLaneIndex(Op0.getNode(), LaneOp, LaneIdx))
+      return false;
+  }
+  StdOp = Op1;
+  return true;
+}
+
+/// SelectMLAV64LaneV128 - AArch64 supports vector MLAs where one multiplicand
+/// is a lane in the upper half of a 128-bit vector.  Recognize and select this
+/// so that we don't emit unnecessary lane extracts.
+SDNode *AArch64DAGToDAGISel::SelectMLAV64LaneV128(SDNode *N) {
+  SDValue Op0 = N->getOperand(0);
+  SDValue Op1 = N->getOperand(1);
+  SDValue MLAOp1;   // Will hold ordinary multiplicand for MLA.
+  SDValue MLAOp2;   // Will hold lane-accessed multiplicand for MLA.
+  int LaneIdx = -1; // Will hold the lane index.
+
+  if (Op1.getOpcode() != ISD::MUL ||
+      !checkV64LaneV128(Op1.getOperand(0), Op1.getOperand(1), MLAOp1, MLAOp2,
+                        LaneIdx)) {
+    std::swap(Op0, Op1);
+    if (Op1.getOpcode() != ISD::MUL ||
+        !checkV64LaneV128(Op1.getOperand(0), Op1.getOperand(1), MLAOp1, MLAOp2,
+                          LaneIdx))
+      return nullptr;
   }
 
-  ResNode = CurDAG->getMachineNode(MOVOpcode, dl, MOVType,
-                                   CurDAG->getTargetConstant(UImm16, MVT::i32),
-                                   CurDAG->getTargetConstant(Shift, MVT::i32));
+  SDValue LaneIdxVal = CurDAG->getTargetConstant(LaneIdx, MVT::i64);
+
+  SDValue Ops[] = { Op0, MLAOp1, MLAOp2, LaneIdxVal };
+
+  unsigned MLAOpc = ~0U;
 
-  if (MOVType != DestType) {
-    ResNode = CurDAG->getMachineNode(TargetOpcode::SUBREG_TO_REG, dl,
-                          MVT::i64, MVT::i32, MVT::Other,
-                          CurDAG->getTargetConstant(0, MVT::i64),
-                          SDValue(ResNode, 0),
-                          CurDAG->getTargetConstant(AArch64::sub_32, MVT::i32));
+  switch (N->getSimpleValueType(0).SimpleTy) {
+  default:
+    llvm_unreachable("Unrecognized MLA.");
+  case MVT::v4i16:
+    MLAOpc = AArch64::MLAv4i16_indexed;
+    break;
+  case MVT::v8i16:
+    MLAOpc = AArch64::MLAv8i16_indexed;
+    break;
+  case MVT::v2i32:
+    MLAOpc = AArch64::MLAv2i32_indexed;
+    break;
+  case MVT::v4i32:
+    MLAOpc = AArch64::MLAv4i32_indexed;
+    break;
   }
 
-  return ResNode;
+  return CurDAG->getMachineNode(MLAOpc, SDLoc(N), N->getValueType(0), Ops);
+}
+
+SDNode *AArch64DAGToDAGISel::SelectMULLV64LaneV128(unsigned IntNo, SDNode *N) {
+  SDValue SMULLOp0;
+  SDValue SMULLOp1;
+  int LaneIdx;
+
+  if (!checkV64LaneV128(N->getOperand(1), N->getOperand(2), SMULLOp0, SMULLOp1,
+                        LaneIdx))
+    return nullptr;
+
+  SDValue LaneIdxVal = CurDAG->getTargetConstant(LaneIdx, MVT::i64);
+
+  SDValue Ops[] = { SMULLOp0, SMULLOp1, LaneIdxVal };
+
+  unsigned SMULLOpc = ~0U;
+
+  if (IntNo == Intrinsic::aarch64_neon_smull) {
+    switch (N->getSimpleValueType(0).SimpleTy) {
+    default:
+      llvm_unreachable("Unrecognized SMULL.");
+    case MVT::v4i32:
+      SMULLOpc = AArch64::SMULLv4i16_indexed;
+      break;
+    case MVT::v2i64:
+      SMULLOpc = AArch64::SMULLv2i32_indexed;
+      break;
+    }
+  } else if (IntNo == Intrinsic::aarch64_neon_umull) {
+    switch (N->getSimpleValueType(0).SimpleTy) {
+    default:
+      llvm_unreachable("Unrecognized SMULL.");
+    case MVT::v4i32:
+      SMULLOpc = AArch64::UMULLv4i16_indexed;
+      break;
+    case MVT::v2i64:
+      SMULLOpc = AArch64::UMULLv2i32_indexed;
+      break;
+    }
+  } else
+    llvm_unreachable("Unrecognized intrinsic.");
+
+  return CurDAG->getMachineNode(SMULLOpc, SDLoc(N), N->getValueType(0), Ops);
+}
+
+/// Instructions that accept extend modifiers like UXTW expect the register
+/// being extended to be a GPR32, but the incoming DAG might be acting on a
+/// GPR64 (either via SEXT_INREG or AND). Extract the appropriate low bits if
+/// this is the case.
+static SDValue narrowIfNeeded(SelectionDAG *CurDAG, SDValue N) {
+  if (N.getValueType() == MVT::i32)
+    return N;
+
+  SDValue SubReg = CurDAG->getTargetConstant(AArch64::sub_32, MVT::i32);
+  MachineSDNode *Node = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
+                                               SDLoc(N), MVT::i32, N, SubReg);
+  return SDValue(Node, 0);
+}
+
+
+/// SelectArithExtendedRegister - Select a "extended register" operand.  This
+/// operand folds in an extend followed by an optional left shift.
+bool AArch64DAGToDAGISel::SelectArithExtendedRegister(SDValue N, SDValue &Reg,
+                                                      SDValue &Shift) {
+  unsigned ShiftVal = 0;
+  AArch64_AM::ShiftExtendType Ext;
+
+  if (N.getOpcode() == ISD::SHL) {
+    ConstantSDNode *CSD = dyn_cast<ConstantSDNode>(N.getOperand(1));
+    if (!CSD)
+      return false;
+    ShiftVal = CSD->getZExtValue();
+    if (ShiftVal > 4)
+      return false;
+
+    Ext = getExtendTypeForNode(N.getOperand(0));
+    if (Ext == AArch64_AM::InvalidShiftExtend)
+      return false;
+
+    Reg = N.getOperand(0).getOperand(0);
+  } else {
+    Ext = getExtendTypeForNode(N);
+    if (Ext == AArch64_AM::InvalidShiftExtend)
+      return false;
+
+    Reg = N.getOperand(0);
+  }
+
+  // AArch64 mandates that the RHS of the operation must use the smallest
+  // register classs that could contain the size being extended from.  Thus,
+  // if we're folding a (sext i8), we need the RHS to be a GPR32, even though
+  // there might not be an actual 32-bit value in the program.  We can
+  // (harmlessly) synthesize one by injected an EXTRACT_SUBREG here.
+  assert(Ext != AArch64_AM::UXTX && Ext != AArch64_AM::SXTX);
+  Reg = narrowIfNeeded(CurDAG, Reg);
+  Shift = CurDAG->getTargetConstant(getArithExtendImm(Ext, ShiftVal), MVT::i32);
+  return isWorthFolding(N);
 }
 
-SDValue
-AArch64DAGToDAGISel::getConstantPoolItemAddress(SDLoc DL,
-                                                const Constant *CV) {
-  EVT PtrVT = getTargetLowering()->getPointerTy();
-
-  switch (getTargetLowering()->getTargetMachine().getCodeModel()) {
-  case CodeModel::Small: {
-    unsigned Alignment =
-      getTargetLowering()->getDataLayout()->getABITypeAlignment(CV->getType());
-    return CurDAG->getNode(
-        AArch64ISD::WrapperSmall, DL, PtrVT,
-        CurDAG->getTargetConstantPool(CV, PtrVT, 0, 0, AArch64II::MO_NO_FLAG),
-        CurDAG->getTargetConstantPool(CV, PtrVT, 0, 0, AArch64II::MO_LO12),
-        CurDAG->getConstant(Alignment, MVT::i32));
-  }
-  case CodeModel::Large: {
-    SDNode *LitAddr;
-    LitAddr = CurDAG->getMachineNode(
-        AArch64::MOVZxii, DL, PtrVT,
-        CurDAG->getTargetConstantPool(CV, PtrVT, 0, 0, AArch64II::MO_ABS_G3),
-        CurDAG->getTargetConstant(3, MVT::i32));
-    LitAddr = CurDAG->getMachineNode(
-        AArch64::MOVKxii, DL, PtrVT, SDValue(LitAddr, 0),
-        CurDAG->getTargetConstantPool(CV, PtrVT, 0, 0, AArch64II::MO_ABS_G2_NC),
-        CurDAG->getTargetConstant(2, MVT::i32));
-    LitAddr = CurDAG->getMachineNode(
-        AArch64::MOVKxii, DL, PtrVT, SDValue(LitAddr, 0),
-        CurDAG->getTargetConstantPool(CV, PtrVT, 0, 0, AArch64II::MO_ABS_G1_NC),
-        CurDAG->getTargetConstant(1, MVT::i32));
-    LitAddr = CurDAG->getMachineNode(
-        AArch64::MOVKxii, DL, PtrVT, SDValue(LitAddr, 0),
-        CurDAG->getTargetConstantPool(CV, PtrVT, 0, 0, AArch64II::MO_ABS_G0_NC),
-        CurDAG->getTargetConstant(0, MVT::i32));
-    return SDValue(LitAddr, 0);
+/// SelectAddrModeIndexed - Select a "register plus scaled unsigned 12-bit
+/// immediate" address.  The "Size" argument is the size in bytes of the memory
+/// reference, which determines the scale.
+bool AArch64DAGToDAGISel::SelectAddrModeIndexed(SDValue N, unsigned Size,
+                                              SDValue &Base, SDValue &OffImm) {
+  const TargetLowering *TLI = getTargetLowering();
+  if (N.getOpcode() == ISD::FrameIndex) {
+    int FI = cast<FrameIndexSDNode>(N)->getIndex();
+    Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy());
+    OffImm = CurDAG->getTargetConstant(0, MVT::i64);
+    return true;
   }
-  default:
-    llvm_unreachable("Only small and large code models supported now");
+
+  if (N.getOpcode() == AArch64ISD::ADDlow) {
+    GlobalAddressSDNode *GAN =
+        dyn_cast<GlobalAddressSDNode>(N.getOperand(1).getNode());
+    Base = N.getOperand(0);
+    OffImm = N.getOperand(1);
+    if (!GAN)
+      return true;
+
+    const GlobalValue *GV = GAN->getGlobal();
+    unsigned Alignment = GV->getAlignment();
+    const DataLayout *DL = TLI->getDataLayout();
+    if (Alignment == 0 && !Subtarget->isTargetDarwin())
+      Alignment = DL->getABITypeAlignment(GV->getType()->getElementType());
+
+    if (Alignment >= Size)
+      return true;
   }
+
+  if (CurDAG->isBaseWithConstantOffset(N)) {
+    if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+      int64_t RHSC = (int64_t)RHS->getZExtValue();
+      unsigned Scale = Log2_32(Size);
+      if ((RHSC & (Size - 1)) == 0 && RHSC >= 0 && RHSC < (0x1000 << Scale)) {
+        Base = N.getOperand(0);
+        if (Base.getOpcode() == ISD::FrameIndex) {
+          int FI = cast<FrameIndexSDNode>(Base)->getIndex();
+          Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy());
+        }
+        OffImm = CurDAG->getTargetConstant(RHSC >> Scale, MVT::i64);
+        return true;
+      }
+    }
+  }
+
+  // Before falling back to our general case, check if the unscaled
+  // instructions can handle this. If so, that's preferable.
+  if (SelectAddrModeUnscaled(N, Size, Base, OffImm))
+    return false;
+
+  // Base only. The address will be materialized into a register before
+  // the memory is accessed.
+  //    add x0, Xbase, #offset
+  //    ldr x0, [x0]
+  Base = N;
+  OffImm = CurDAG->getTargetConstant(0, MVT::i64);
+  return true;
 }
 
-SDNode *AArch64DAGToDAGISel::SelectToLitPool(SDNode *Node) {
-  SDLoc DL(Node);
-  uint64_t UnsignedVal = cast<ConstantSDNode>(Node)->getZExtValue();
-  int64_t SignedVal = cast<ConstantSDNode>(Node)->getSExtValue();
-  EVT DestType = Node->getValueType(0);
-
-  // Since we may end up loading a 64-bit constant from a 32-bit entry the
-  // constant in the pool may have a different type to the eventual node.
-  ISD::LoadExtType Extension;
-  EVT MemType;
-
-  assert((DestType == MVT::i64 || DestType == MVT::i32)
-         && "Only expect integer constants at the moment");
-
-  if (DestType == MVT::i32) {
-    Extension = ISD::NON_EXTLOAD;
-    MemType = MVT::i32;
-  } else if (UnsignedVal <= UINT32_MAX) {
-    Extension = ISD::ZEXTLOAD;
-    MemType = MVT::i32;
-  } else if (SignedVal >= INT32_MIN && SignedVal <= INT32_MAX) {
-    Extension = ISD::SEXTLOAD;
-    MemType = MVT::i32;
-  } else {
-    Extension = ISD::NON_EXTLOAD;
-    MemType = MVT::i64;
-  }
-
-  Constant *CV = ConstantInt::get(Type::getIntNTy(*CurDAG->getContext(),
-                                                  MemType.getSizeInBits()),
-                                  UnsignedVal);
-  SDValue PoolAddr = getConstantPoolItemAddress(DL, CV);
-  unsigned Alignment =
-    getTargetLowering()->getDataLayout()->getABITypeAlignment(CV->getType());
-
-  return CurDAG->getExtLoad(Extension, DL, DestType, CurDAG->getEntryNode(),
-                            PoolAddr,
-                            MachinePointerInfo::getConstantPool(), MemType,
-                            /* isVolatile = */ false,
-                            /* isNonTemporal = */ false,
-                            Alignment).getNode();
+/// SelectAddrModeUnscaled - Select a "register plus unscaled signed 9-bit
+/// immediate" address.  This should only match when there is an offset that
+/// is not valid for a scaled immediate addressing mode.  The "Size" argument
+/// is the size in bytes of the memory reference, which is needed here to know
+/// what is valid for a scaled immediate.
+bool AArch64DAGToDAGISel::SelectAddrModeUnscaled(SDValue N, unsigned Size,
+                                                 SDValue &Base,
+                                                 SDValue &OffImm) {
+  if (!CurDAG->isBaseWithConstantOffset(N))
+    return false;
+  if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
+    int64_t RHSC = RHS->getSExtValue();
+    // If the offset is valid as a scaled immediate, don't match here.
+    if ((RHSC & (Size - 1)) == 0 && RHSC >= 0 &&
+        RHSC < (0x1000 << Log2_32(Size)))
+      return false;
+    if (RHSC >= -256 && RHSC < 256) {
+      Base = N.getOperand(0);
+      if (Base.getOpcode() == ISD::FrameIndex) {
+        int FI = cast<FrameIndexSDNode>(Base)->getIndex();
+        const TargetLowering *TLI = getTargetLowering();
+        Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy());
+      }
+      OffImm = CurDAG->getTargetConstant(RHSC, MVT::i64);
+      return true;
+    }
+  }
+  return false;
 }
 
-SDNode *AArch64DAGToDAGISel::LowerToFPLitPool(SDNode *Node) {
-  SDLoc DL(Node);
-  const ConstantFP *FV = cast<ConstantFPSDNode>(Node)->getConstantFPValue();
-  EVT DestType = Node->getValueType(0);
-
-  unsigned Alignment =
-    getTargetLowering()->getDataLayout()->getABITypeAlignment(FV->getType());
-  SDValue PoolAddr = getConstantPoolItemAddress(DL, FV);
-
-  return CurDAG->getLoad(DestType, DL, CurDAG->getEntryNode(), PoolAddr,
-                         MachinePointerInfo::getConstantPool(),
-                         /* isVolatile = */ false,
-                         /* isNonTemporal = */ false,
-                         /* isInvariant = */ true,
-                         Alignment).getNode();
+static SDValue Widen(SelectionDAG *CurDAG, SDValue N) {
+  SDValue SubReg = CurDAG->getTargetConstant(AArch64::sub_32, MVT::i32);
+  SDValue ImpDef = SDValue(
+      CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, SDLoc(N), MVT::i64),
+      0);
+  MachineSDNode *Node = CurDAG->getMachineNode(
+      TargetOpcode::INSERT_SUBREG, SDLoc(N), MVT::i64, ImpDef, N, SubReg);
+  return SDValue(Node, 0);
 }
 
-bool
-AArch64DAGToDAGISel::SelectTSTBOperand(SDValue N, SDValue &FixedPos,
-                                       unsigned RegWidth) {
-  const ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N);
-  if (!CN) return false;
+/// \brief Check if the given SHL node (\p N), can be used to form an
+/// extended register for an addressing mode.
+bool AArch64DAGToDAGISel::SelectExtendedSHL(SDValue N, unsigned Size,
+                                            bool WantExtend, SDValue &Offset,
+                                            SDValue &SignExtend) {
+  assert(N.getOpcode() == ISD::SHL && "Invalid opcode.");
+  ConstantSDNode *CSD = dyn_cast<ConstantSDNode>(N.getOperand(1));
+  if (!CSD || (CSD->getZExtValue() & 0x7) != CSD->getZExtValue())
+    return false;
 
-  uint64_t Val = CN->getZExtValue();
+  if (WantExtend) {
+    AArch64_AM::ShiftExtendType Ext =
+        getExtendTypeForNode(N.getOperand(0), true);
+    if (Ext == AArch64_AM::InvalidShiftExtend)
+      return false;
 
-  if (!isPowerOf2_64(Val)) return false;
+    Offset = narrowIfNeeded(CurDAG, N.getOperand(0).getOperand(0));
+    SignExtend = CurDAG->getTargetConstant(Ext == AArch64_AM::SXTW, MVT::i32);
+  } else {
+    Offset = N.getOperand(0);
+    SignExtend = CurDAG->getTargetConstant(0, MVT::i32);
+  }
 
-  unsigned TestedBit = Log2_64(Val);
-  // Checks above should have guaranteed that we haven't lost information in
-  // finding TestedBit, but it must still be in range.
-  if (TestedBit >= RegWidth) return false;
+  unsigned LegalShiftVal = Log2_32(Size);
+  unsigned ShiftVal = CSD->getZExtValue();
 
-  FixedPos = CurDAG->getTargetConstant(TestedBit, MVT::i64);
-  return true;
+  if (ShiftVal != 0 && ShiftVal != LegalShiftVal)
+    return false;
+
+  if (isWorthFolding(N))
+    return true;
+
+  return false;
 }
 
-SDNode *AArch64DAGToDAGISel::SelectAtomic(SDNode *Node, unsigned Op8,
-                                          unsigned Op16,unsigned Op32,
-                                          unsigned Op64) {
-  // Mostly direct translation to the given operations, except that we preserve
-  // the AtomicOrdering for use later on.
-  AtomicSDNode *AN = cast<AtomicSDNode>(Node);
-  EVT VT = AN->getMemoryVT();
-
-  unsigned Op;
-  if (VT == MVT::i8)
-    Op = Op8;
-  else if (VT == MVT::i16)
-    Op = Op16;
-  else if (VT == MVT::i32)
-    Op = Op32;
-  else if (VT == MVT::i64)
-    Op = Op64;
-  else
-    llvm_unreachable("Unexpected atomic operation");
+bool AArch64DAGToDAGISel::SelectAddrModeWRO(SDValue N, unsigned Size,
+                                            SDValue &Base, SDValue &Offset,
+                                            SDValue &SignExtend,
+                                            SDValue &DoShift) {
+  if (N.getOpcode() != ISD::ADD)
+    return false;
+  SDValue LHS = N.getOperand(0);
+  SDValue RHS = N.getOperand(1);
 
-  SmallVector<SDValue, 4> Ops;
-  for (unsigned i = 1; i < AN->getNumOperands(); ++i)
-      Ops.push_back(AN->getOperand(i));
+  // We don't want to match immediate adds here, because they are better lowered
+  // to the register-immediate addressing modes.
+  if (isa<ConstantSDNode>(LHS) || isa<ConstantSDNode>(RHS))
+    return false;
 
-  Ops.push_back(CurDAG->getTargetConstant(AN->getOrdering(), MVT::i32));
-  Ops.push_back(AN->getOperand(0)); // Chain moves to the end
+  // Check if this particular node is reused in any non-memory related
+  // operation.  If yes, do not try to fold this node into the address
+  // computation, since the computation will be kept.
+  const SDNode *Node = N.getNode();
+  for (SDNode *UI : Node->uses()) {
+    if (!isa<MemSDNode>(*UI))
+      return false;
+  }
+
+  // Remember if it is worth folding N when it produces extended register.
+  bool IsExtendedRegisterWorthFolding = isWorthFolding(N);
+
+  // Try to match a shifted extend on the RHS.
+  if (IsExtendedRegisterWorthFolding && RHS.getOpcode() == ISD::SHL &&
+      SelectExtendedSHL(RHS, Size, true, Offset, SignExtend)) {
+    Base = LHS;
+    DoShift = CurDAG->getTargetConstant(true, MVT::i32);
+    return true;
+  }
+
+  // Try to match a shifted extend on the LHS.
+  if (IsExtendedRegisterWorthFolding && LHS.getOpcode() == ISD::SHL &&
+      SelectExtendedSHL(LHS, Size, true, Offset, SignExtend)) {
+    Base = RHS;
+    DoShift = CurDAG->getTargetConstant(true, MVT::i32);
+    return true;
+  }
+
+  // There was no shift, whatever else we find.
+  DoShift = CurDAG->getTargetConstant(false, MVT::i32);
+
+  AArch64_AM::ShiftExtendType Ext = AArch64_AM::InvalidShiftExtend;
+  // Try to match an unshifted extend on the LHS.
+  if (IsExtendedRegisterWorthFolding &&
+      (Ext = getExtendTypeForNode(LHS, true)) !=
+          AArch64_AM::InvalidShiftExtend) {
+    Base = RHS;
+    Offset = narrowIfNeeded(CurDAG, LHS.getOperand(0));
+    SignExtend = CurDAG->getTargetConstant(Ext == AArch64_AM::SXTW, MVT::i32);
+    if (isWorthFolding(LHS))
+      return true;
+  }
+
+  // Try to match an unshifted extend on the RHS.
+  if (IsExtendedRegisterWorthFolding &&
+      (Ext = getExtendTypeForNode(RHS, true)) !=
+          AArch64_AM::InvalidShiftExtend) {
+    Base = LHS;
+    Offset = narrowIfNeeded(CurDAG, RHS.getOperand(0));
+    SignExtend = CurDAG->getTargetConstant(Ext == AArch64_AM::SXTW, MVT::i32);
+    if (isWorthFolding(RHS))
+      return true;
+  }
 
-  return CurDAG->SelectNodeTo(Node, Op,
-                              AN->getValueType(0), MVT::Other,
-                              &Ops[0], Ops.size());
+  return false;
+}
+
+bool AArch64DAGToDAGISel::SelectAddrModeXRO(SDValue N, unsigned Size,
+                                            SDValue &Base, SDValue &Offset,
+                                            SDValue &SignExtend,
+                                            SDValue &DoShift) {
+  if (N.getOpcode() != ISD::ADD)
+    return false;
+  SDValue LHS = N.getOperand(0);
+  SDValue RHS = N.getOperand(1);
+
+  // We don't want to match immediate adds here, because they are better lowered
+  // to the register-immediate addressing modes.
+  if (isa<ConstantSDNode>(LHS) || isa<ConstantSDNode>(RHS))
+    return false;
+
+  // Check if this particular node is reused in any non-memory related
+  // operation.  If yes, do not try to fold this node into the address
+  // computation, since the computation will be kept.
+  const SDNode *Node = N.getNode();
+  for (SDNode *UI : Node->uses()) {
+    if (!isa<MemSDNode>(*UI))
+      return false;
+  }
+
+  // Remember if it is worth folding N when it produces extended register.
+  bool IsExtendedRegisterWorthFolding = isWorthFolding(N);
+
+  // Try to match a shifted extend on the RHS.
+  if (IsExtendedRegisterWorthFolding && RHS.getOpcode() == ISD::SHL &&
+      SelectExtendedSHL(RHS, Size, false, Offset, SignExtend)) {
+    Base = LHS;
+    DoShift = CurDAG->getTargetConstant(true, MVT::i32);
+    return true;
+  }
+
+  // Try to match a shifted extend on the LHS.
+  if (IsExtendedRegisterWorthFolding && LHS.getOpcode() == ISD::SHL &&
+      SelectExtendedSHL(LHS, Size, false, Offset, SignExtend)) {
+    Base = RHS;
+    DoShift = CurDAG->getTargetConstant(true, MVT::i32);
+    return true;
+  }
+
+  // Match any non-shifted, non-extend, non-immediate add expression.
+  Base = LHS;
+  Offset = RHS;
+  SignExtend = CurDAG->getTargetConstant(false, MVT::i32);
+  DoShift = CurDAG->getTargetConstant(false, MVT::i32);
+  // Reg1 + Reg2 is free: no check needed.
+  return true;
 }
 
 SDValue AArch64DAGToDAGISel::createDTuple(ArrayRef<SDValue> Regs) {
-  static unsigned RegClassIDs[] = { AArch64::DPairRegClassID,
-                                    AArch64::DTripleRegClassID,
-                                    AArch64::DQuadRegClassID };
-  static unsigned SubRegs[] = { AArch64::dsub_0, AArch64::dsub_1,
-                                AArch64::dsub_2, AArch64::dsub_3 };
+  static unsigned RegClassIDs[] = {
+      AArch64::DDRegClassID, AArch64::DDDRegClassID, AArch64::DDDDRegClassID};
+  static unsigned SubRegs[] = { AArch64::dsub0, AArch64::dsub1,
+                                AArch64::dsub2, AArch64::dsub3 };
 
   return createTuple(Regs, RegClassIDs, SubRegs);
 }
 
 SDValue AArch64DAGToDAGISel::createQTuple(ArrayRef<SDValue> Regs) {
-  static unsigned RegClassIDs[] = { AArch64::QPairRegClassID,
-                                    AArch64::QTripleRegClassID,
-                                    AArch64::QQuadRegClassID };
-  static unsigned SubRegs[] = { AArch64::qsub_0, AArch64::qsub_1,
-                                AArch64::qsub_2, AArch64::qsub_3 };
+  static unsigned RegClassIDs[] = {
+      AArch64::QQRegClassID, AArch64::QQQRegClassID, AArch64::QQQQRegClassID};
+  static unsigned SubRegs[] = { AArch64::qsub0, AArch64::qsub1,
+                                AArch64::qsub2, AArch64::qsub3 };
 
   return createTuple(Regs, RegClassIDs, SubRegs);
 }
@@ -478,1100 +877,2159 @@ SDValue AArch64DAGToDAGISel::createTuple(ArrayRef<SDValue> Regs,
   return SDValue(N, 0);
 }
 
+SDNode *AArch64DAGToDAGISel::SelectTable(SDNode *N, unsigned NumVecs,
+                                         unsigned Opc, bool isExt) {
+  SDLoc dl(N);
+  EVT VT = N->getValueType(0);
+
+  unsigned ExtOff = isExt;
+
+  // Form a REG_SEQUENCE to force register allocation.
+  unsigned Vec0Off = ExtOff + 1;
+  SmallVector<SDValue, 4> Regs(N->op_begin() + Vec0Off,
+                               N->op_begin() + Vec0Off + NumVecs);
+  SDValue RegSeq = createQTuple(Regs);
 
-// Get the register stride update opcode of a VLD/VST instruction that
-// is otherwise equivalent to the given fixed stride updating instruction.
-static unsigned getVLDSTRegisterUpdateOpcode(unsigned Opc) {
-  switch (Opc) {
-  default: break;
-  case AArch64::LD1WB_8B_fixed: return AArch64::LD1WB_8B_register;
-  case AArch64::LD1WB_4H_fixed: return AArch64::LD1WB_4H_register;
-  case AArch64::LD1WB_2S_fixed: return AArch64::LD1WB_2S_register;
-  case AArch64::LD1WB_1D_fixed: return AArch64::LD1WB_1D_register;
-  case AArch64::LD1WB_16B_fixed: return AArch64::LD1WB_16B_register;
-  case AArch64::LD1WB_8H_fixed: return AArch64::LD1WB_8H_register;
-  case AArch64::LD1WB_4S_fixed: return AArch64::LD1WB_4S_register;
-  case AArch64::LD1WB_2D_fixed: return AArch64::LD1WB_2D_register;
-
-  case AArch64::LD2WB_8B_fixed: return AArch64::LD2WB_8B_register;
-  case AArch64::LD2WB_4H_fixed: return AArch64::LD2WB_4H_register;
-  case AArch64::LD2WB_2S_fixed: return AArch64::LD2WB_2S_register;
-  case AArch64::LD2WB_16B_fixed: return AArch64::LD2WB_16B_register;
-  case AArch64::LD2WB_8H_fixed: return AArch64::LD2WB_8H_register;
-  case AArch64::LD2WB_4S_fixed: return AArch64::LD2WB_4S_register;
-  case AArch64::LD2WB_2D_fixed: return AArch64::LD2WB_2D_register;
-
-  case AArch64::LD3WB_8B_fixed: return AArch64::LD3WB_8B_register;
-  case AArch64::LD3WB_4H_fixed: return AArch64::LD3WB_4H_register;
-  case AArch64::LD3WB_2S_fixed: return AArch64::LD3WB_2S_register;
-  case AArch64::LD3WB_16B_fixed: return AArch64::LD3WB_16B_register;
-  case AArch64::LD3WB_8H_fixed: return AArch64::LD3WB_8H_register;
-  case AArch64::LD3WB_4S_fixed: return AArch64::LD3WB_4S_register;
-  case AArch64::LD3WB_2D_fixed: return AArch64::LD3WB_2D_register;
-
-  case AArch64::LD4WB_8B_fixed: return AArch64::LD4WB_8B_register;
-  case AArch64::LD4WB_4H_fixed: return AArch64::LD4WB_4H_register;
-  case AArch64::LD4WB_2S_fixed: return AArch64::LD4WB_2S_register;
-  case AArch64::LD4WB_16B_fixed: return AArch64::LD4WB_16B_register;
-  case AArch64::LD4WB_8H_fixed: return AArch64::LD4WB_8H_register;
-  case AArch64::LD4WB_4S_fixed: return AArch64::LD4WB_4S_register;
-  case AArch64::LD4WB_2D_fixed: return AArch64::LD4WB_2D_register;
-
-  case AArch64::LD1x2WB_8B_fixed: return AArch64::LD1x2WB_8B_register;
-  case AArch64::LD1x2WB_4H_fixed: return AArch64::LD1x2WB_4H_register;
-  case AArch64::LD1x2WB_2S_fixed: return AArch64::LD1x2WB_2S_register;
-  case AArch64::LD1x2WB_1D_fixed: return AArch64::LD1x2WB_1D_register;
-  case AArch64::LD1x2WB_16B_fixed: return AArch64::LD1x2WB_16B_register;
-  case AArch64::LD1x2WB_8H_fixed: return AArch64::LD1x2WB_8H_register;
-  case AArch64::LD1x2WB_4S_fixed: return AArch64::LD1x2WB_4S_register;
-  case AArch64::LD1x2WB_2D_fixed: return AArch64::LD1x2WB_2D_register;
-
-  case AArch64::LD1x3WB_8B_fixed: return AArch64::LD1x3WB_8B_register;
-  case AArch64::LD1x3WB_4H_fixed: return AArch64::LD1x3WB_4H_register;
-  case AArch64::LD1x3WB_2S_fixed: return AArch64::LD1x3WB_2S_register;
-  case AArch64::LD1x3WB_1D_fixed: return AArch64::LD1x3WB_1D_register;
-  case AArch64::LD1x3WB_16B_fixed: return AArch64::LD1x3WB_16B_register;
-  case AArch64::LD1x3WB_8H_fixed: return AArch64::LD1x3WB_8H_register;
-  case AArch64::LD1x3WB_4S_fixed: return AArch64::LD1x3WB_4S_register;
-  case AArch64::LD1x3WB_2D_fixed: return AArch64::LD1x3WB_2D_register;
-
-  case AArch64::LD1x4WB_8B_fixed: return AArch64::LD1x4WB_8B_register;
-  case AArch64::LD1x4WB_4H_fixed: return AArch64::LD1x4WB_4H_register;
-  case AArch64::LD1x4WB_2S_fixed: return AArch64::LD1x4WB_2S_register;
-  case AArch64::LD1x4WB_1D_fixed: return AArch64::LD1x4WB_1D_register;
-  case AArch64::LD1x4WB_16B_fixed: return AArch64::LD1x4WB_16B_register;
-  case AArch64::LD1x4WB_8H_fixed: return AArch64::LD1x4WB_8H_register;
-  case AArch64::LD1x4WB_4S_fixed: return AArch64::LD1x4WB_4S_register;
-  case AArch64::LD1x4WB_2D_fixed: return AArch64::LD1x4WB_2D_register;
-
-  case AArch64::ST1WB_8B_fixed: return AArch64::ST1WB_8B_register;
-  case AArch64::ST1WB_4H_fixed: return AArch64::ST1WB_4H_register;
-  case AArch64::ST1WB_2S_fixed: return AArch64::ST1WB_2S_register;
-  case AArch64::ST1WB_1D_fixed: return AArch64::ST1WB_1D_register;
-  case AArch64::ST1WB_16B_fixed: return AArch64::ST1WB_16B_register;
-  case AArch64::ST1WB_8H_fixed: return AArch64::ST1WB_8H_register;
-  case AArch64::ST1WB_4S_fixed: return AArch64::ST1WB_4S_register;
-  case AArch64::ST1WB_2D_fixed: return AArch64::ST1WB_2D_register;
-
-  case AArch64::ST2WB_8B_fixed: return AArch64::ST2WB_8B_register;
-  case AArch64::ST2WB_4H_fixed: return AArch64::ST2WB_4H_register;
-  case AArch64::ST2WB_2S_fixed: return AArch64::ST2WB_2S_register;
-  case AArch64::ST2WB_16B_fixed: return AArch64::ST2WB_16B_register;
-  case AArch64::ST2WB_8H_fixed: return AArch64::ST2WB_8H_register;
-  case AArch64::ST2WB_4S_fixed: return AArch64::ST2WB_4S_register;
-  case AArch64::ST2WB_2D_fixed: return AArch64::ST2WB_2D_register;
-
-  case AArch64::ST3WB_8B_fixed: return AArch64::ST3WB_8B_register;
-  case AArch64::ST3WB_4H_fixed: return AArch64::ST3WB_4H_register;
-  case AArch64::ST3WB_2S_fixed: return AArch64::ST3WB_2S_register;
-  case AArch64::ST3WB_16B_fixed: return AArch64::ST3WB_16B_register;
-  case AArch64::ST3WB_8H_fixed: return AArch64::ST3WB_8H_register;
-  case AArch64::ST3WB_4S_fixed: return AArch64::ST3WB_4S_register;
-  case AArch64::ST3WB_2D_fixed: return AArch64::ST3WB_2D_register;
-
-  case AArch64::ST4WB_8B_fixed: return AArch64::ST4WB_8B_register;
-  case AArch64::ST4WB_4H_fixed: return AArch64::ST4WB_4H_register;
-  case AArch64::ST4WB_2S_fixed: return AArch64::ST4WB_2S_register;
-  case AArch64::ST4WB_16B_fixed: return AArch64::ST4WB_16B_register;
-  case AArch64::ST4WB_8H_fixed: return AArch64::ST4WB_8H_register;
-  case AArch64::ST4WB_4S_fixed: return AArch64::ST4WB_4S_register;
-  case AArch64::ST4WB_2D_fixed: return AArch64::ST4WB_2D_register;
-
-  case AArch64::ST1x2WB_8B_fixed: return AArch64::ST1x2WB_8B_register;
-  case AArch64::ST1x2WB_4H_fixed: return AArch64::ST1x2WB_4H_register;
-  case AArch64::ST1x2WB_2S_fixed: return AArch64::ST1x2WB_2S_register;
-  case AArch64::ST1x2WB_1D_fixed: return AArch64::ST1x2WB_1D_register;
-  case AArch64::ST1x2WB_16B_fixed: return AArch64::ST1x2WB_16B_register;
-  case AArch64::ST1x2WB_8H_fixed: return AArch64::ST1x2WB_8H_register;
-  case AArch64::ST1x2WB_4S_fixed: return AArch64::ST1x2WB_4S_register;
-  case AArch64::ST1x2WB_2D_fixed: return AArch64::ST1x2WB_2D_register;
-
-  case AArch64::ST1x3WB_8B_fixed: return AArch64::ST1x3WB_8B_register;
-  case AArch64::ST1x3WB_4H_fixed: return AArch64::ST1x3WB_4H_register;
-  case AArch64::ST1x3WB_2S_fixed: return AArch64::ST1x3WB_2S_register;
-  case AArch64::ST1x3WB_1D_fixed: return AArch64::ST1x3WB_1D_register;
-  case AArch64::ST1x3WB_16B_fixed: return AArch64::ST1x3WB_16B_register;
-  case AArch64::ST1x3WB_8H_fixed: return AArch64::ST1x3WB_8H_register;
-  case AArch64::ST1x3WB_4S_fixed: return AArch64::ST1x3WB_4S_register;
-  case AArch64::ST1x3WB_2D_fixed: return AArch64::ST1x3WB_2D_register;
-
-  case AArch64::ST1x4WB_8B_fixed: return AArch64::ST1x4WB_8B_register;
-  case AArch64::ST1x4WB_4H_fixed: return AArch64::ST1x4WB_4H_register;
-  case AArch64::ST1x4WB_2S_fixed: return AArch64::ST1x4WB_2S_register;
-  case AArch64::ST1x4WB_1D_fixed: return AArch64::ST1x4WB_1D_register;
-  case AArch64::ST1x4WB_16B_fixed: return AArch64::ST1x4WB_16B_register;
-  case AArch64::ST1x4WB_8H_fixed: return AArch64::ST1x4WB_8H_register;
-  case AArch64::ST1x4WB_4S_fixed: return AArch64::ST1x4WB_4S_register;
-  case AArch64::ST1x4WB_2D_fixed: return AArch64::ST1x4WB_2D_register;
-
-  // Post-index of duplicate loads
-  case AArch64::LD2R_WB_8B_fixed: return AArch64::LD2R_WB_8B_register;
-  case AArch64::LD2R_WB_4H_fixed: return AArch64::LD2R_WB_4H_register;
-  case AArch64::LD2R_WB_2S_fixed: return AArch64::LD2R_WB_2S_register;
-  case AArch64::LD2R_WB_1D_fixed: return AArch64::LD2R_WB_1D_register;
-  case AArch64::LD2R_WB_16B_fixed: return AArch64::LD2R_WB_16B_register;
-  case AArch64::LD2R_WB_8H_fixed: return AArch64::LD2R_WB_8H_register;
-  case AArch64::LD2R_WB_4S_fixed: return AArch64::LD2R_WB_4S_register;
-  case AArch64::LD2R_WB_2D_fixed: return AArch64::LD2R_WB_2D_register;
-
-  case AArch64::LD3R_WB_8B_fixed: return AArch64::LD3R_WB_8B_register;
-  case AArch64::LD3R_WB_4H_fixed: return AArch64::LD3R_WB_4H_register;
-  case AArch64::LD3R_WB_2S_fixed: return AArch64::LD3R_WB_2S_register;
-  case AArch64::LD3R_WB_1D_fixed: return AArch64::LD3R_WB_1D_register;
-  case AArch64::LD3R_WB_16B_fixed: return AArch64::LD3R_WB_16B_register;
-  case AArch64::LD3R_WB_8H_fixed: return AArch64::LD3R_WB_8H_register;
-  case AArch64::LD3R_WB_4S_fixed: return AArch64::LD3R_WB_4S_register;
-  case AArch64::LD3R_WB_2D_fixed: return AArch64::LD3R_WB_2D_register;
-
-  case AArch64::LD4R_WB_8B_fixed: return AArch64::LD4R_WB_8B_register;
-  case AArch64::LD4R_WB_4H_fixed: return AArch64::LD4R_WB_4H_register;
-  case AArch64::LD4R_WB_2S_fixed: return AArch64::LD4R_WB_2S_register;
-  case AArch64::LD4R_WB_1D_fixed: return AArch64::LD4R_WB_1D_register;
-  case AArch64::LD4R_WB_16B_fixed: return AArch64::LD4R_WB_16B_register;
-  case AArch64::LD4R_WB_8H_fixed: return AArch64::LD4R_WB_8H_register;
-  case AArch64::LD4R_WB_4S_fixed: return AArch64::LD4R_WB_4S_register;
-  case AArch64::LD4R_WB_2D_fixed: return AArch64::LD4R_WB_2D_register;
-
-  // Post-index of lane loads
-  case AArch64::LD2LN_WB_B_fixed: return AArch64::LD2LN_WB_B_register;
-  case AArch64::LD2LN_WB_H_fixed: return AArch64::LD2LN_WB_H_register;
-  case AArch64::LD2LN_WB_S_fixed: return AArch64::LD2LN_WB_S_register;
-  case AArch64::LD2LN_WB_D_fixed: return AArch64::LD2LN_WB_D_register;
-
-  case AArch64::LD3LN_WB_B_fixed: return AArch64::LD3LN_WB_B_register;
-  case AArch64::LD3LN_WB_H_fixed: return AArch64::LD3LN_WB_H_register;
-  case AArch64::LD3LN_WB_S_fixed: return AArch64::LD3LN_WB_S_register;
-  case AArch64::LD3LN_WB_D_fixed: return AArch64::LD3LN_WB_D_register;
-
-  case AArch64::LD4LN_WB_B_fixed: return AArch64::LD4LN_WB_B_register;
-  case AArch64::LD4LN_WB_H_fixed: return AArch64::LD4LN_WB_H_register;
-  case AArch64::LD4LN_WB_S_fixed: return AArch64::LD4LN_WB_S_register;
-  case AArch64::LD4LN_WB_D_fixed: return AArch64::LD4LN_WB_D_register;
-
-  // Post-index of lane stores
-  case AArch64::ST2LN_WB_B_fixed: return AArch64::ST2LN_WB_B_register;
-  case AArch64::ST2LN_WB_H_fixed: return AArch64::ST2LN_WB_H_register;
-  case AArch64::ST2LN_WB_S_fixed: return AArch64::ST2LN_WB_S_register;
-  case AArch64::ST2LN_WB_D_fixed: return AArch64::ST2LN_WB_D_register;
-
-  case AArch64::ST3LN_WB_B_fixed: return AArch64::ST3LN_WB_B_register;
-  case AArch64::ST3LN_WB_H_fixed: return AArch64::ST3LN_WB_H_register;
-  case AArch64::ST3LN_WB_S_fixed: return AArch64::ST3LN_WB_S_register;
-  case AArch64::ST3LN_WB_D_fixed: return AArch64::ST3LN_WB_D_register;
-
-  case AArch64::ST4LN_WB_B_fixed: return AArch64::ST4LN_WB_B_register;
-  case AArch64::ST4LN_WB_H_fixed: return AArch64::ST4LN_WB_H_register;
-  case AArch64::ST4LN_WB_S_fixed: return AArch64::ST4LN_WB_S_register;
-  case AArch64::ST4LN_WB_D_fixed: return AArch64::ST4LN_WB_D_register;
-  }
-  return Opc; // If not one we handle, return it unchanged.
+  SmallVector<SDValue, 6> Ops;
+  if (isExt)
+    Ops.push_back(N->getOperand(1));
+  Ops.push_back(RegSeq);
+  Ops.push_back(N->getOperand(NumVecs + ExtOff + 1));
+  return CurDAG->getMachineNode(Opc, dl, VT, Ops);
 }
 
-SDNode *AArch64DAGToDAGISel::SelectVLD(SDNode *N, bool isUpdating,
-                                       unsigned NumVecs,
-                                       const uint16_t *Opcodes) {
-  assert(NumVecs >= 1 && NumVecs <= 4 && "VLD NumVecs out-of-range");
+SDNode *AArch64DAGToDAGISel::SelectIndexedLoad(SDNode *N, bool &Done) {
+  LoadSDNode *LD = cast<LoadSDNode>(N);
+  if (LD->isUnindexed())
+    return nullptr;
+  EVT VT = LD->getMemoryVT();
+  EVT DstVT = N->getValueType(0);
+  ISD::MemIndexedMode AM = LD->getAddressingMode();
+  bool IsPre = AM == ISD::PRE_INC || AM == ISD::PRE_DEC;
+
+  // We're not doing validity checking here. That was done when checking
+  // if we should mark the load as indexed or not. We're just selecting
+  // the right instruction.
+  unsigned Opcode = 0;
+
+  ISD::LoadExtType ExtType = LD->getExtensionType();
+  bool InsertTo64 = false;
+  if (VT == MVT::i64)
+    Opcode = IsPre ? AArch64::LDRXpre : AArch64::LDRXpost;
+  else if (VT == MVT::i32) {
+    if (ExtType == ISD::NON_EXTLOAD)
+      Opcode = IsPre ? AArch64::LDRWpre : AArch64::LDRWpost;
+    else if (ExtType == ISD::SEXTLOAD)
+      Opcode = IsPre ? AArch64::LDRSWpre : AArch64::LDRSWpost;
+    else {
+      Opcode = IsPre ? AArch64::LDRWpre : AArch64::LDRWpost;
+      InsertTo64 = true;
+      // The result of the load is only i32. It's the subreg_to_reg that makes
+      // it into an i64.
+      DstVT = MVT::i32;
+    }
+  } else if (VT == MVT::i16) {
+    if (ExtType == ISD::SEXTLOAD) {
+      if (DstVT == MVT::i64)
+        Opcode = IsPre ? AArch64::LDRSHXpre : AArch64::LDRSHXpost;
+      else
+        Opcode = IsPre ? AArch64::LDRSHWpre : AArch64::LDRSHWpost;
+    } else {
+      Opcode = IsPre ? AArch64::LDRHHpre : AArch64::LDRHHpost;
+      InsertTo64 = DstVT == MVT::i64;
+      // The result of the load is only i32. It's the subreg_to_reg that makes
+      // it into an i64.
+      DstVT = MVT::i32;
+    }
+  } else if (VT == MVT::i8) {
+    if (ExtType == ISD::SEXTLOAD) {
+      if (DstVT == MVT::i64)
+        Opcode = IsPre ? AArch64::LDRSBXpre : AArch64::LDRSBXpost;
+      else
+        Opcode = IsPre ? AArch64::LDRSBWpre : AArch64::LDRSBWpost;
+    } else {
+      Opcode = IsPre ? AArch64::LDRBBpre : AArch64::LDRBBpost;
+      InsertTo64 = DstVT == MVT::i64;
+      // The result of the load is only i32. It's the subreg_to_reg that makes
+      // it into an i64.
+      DstVT = MVT::i32;
+    }
+  } else if (VT == MVT::f32) {
+    Opcode = IsPre ? AArch64::LDRSpre : AArch64::LDRSpost;
+  } else if (VT == MVT::f64 || VT.is64BitVector()) {
+    Opcode = IsPre ? AArch64::LDRDpre : AArch64::LDRDpost;
+  } else if (VT.is128BitVector()) {
+    Opcode = IsPre ? AArch64::LDRQpre : AArch64::LDRQpost;
+  } else
+    return nullptr;
+  SDValue Chain = LD->getChain();
+  SDValue Base = LD->getBasePtr();
+  ConstantSDNode *OffsetOp = cast<ConstantSDNode>(LD->getOffset());
+  int OffsetVal = (int)OffsetOp->getZExtValue();
+  SDValue Offset = CurDAG->getTargetConstant(OffsetVal, MVT::i64);
+  SDValue Ops[] = { Base, Offset, Chain };
+  SDNode *Res = CurDAG->getMachineNode(Opcode, SDLoc(N), MVT::i64, DstVT,
+                                       MVT::Other, Ops);
+  // Either way, we're replacing the node, so tell the caller that.
+  Done = true;
+  SDValue LoadedVal = SDValue(Res, 1);
+  if (InsertTo64) {
+    SDValue SubReg = CurDAG->getTargetConstant(AArch64::sub_32, MVT::i32);
+    LoadedVal =
+        SDValue(CurDAG->getMachineNode(
+                    AArch64::SUBREG_TO_REG, SDLoc(N), MVT::i64,
+                    CurDAG->getTargetConstant(0, MVT::i64), LoadedVal, SubReg),
+                0);
+  }
+
+  ReplaceUses(SDValue(N, 0), LoadedVal);
+  ReplaceUses(SDValue(N, 1), SDValue(Res, 0));
+  ReplaceUses(SDValue(N, 2), SDValue(Res, 2));
+
+  return nullptr;
+}
 
+SDNode *AArch64DAGToDAGISel::SelectLoad(SDNode *N, unsigned NumVecs,
+                                        unsigned Opc, unsigned SubRegIdx) {
+  SDLoc dl(N);
   EVT VT = N->getValueType(0);
-  unsigned OpcodeIndex;
-  bool is64BitVector = VT.is64BitVector();
-  switch (VT.getScalarType().getSizeInBits()) {
-  case 8: OpcodeIndex = is64BitVector ? 0 : 4; break;
-  case 16: OpcodeIndex = is64BitVector ? 1 : 5; break;
-  case 32: OpcodeIndex = is64BitVector ? 2 : 6; break;
-  case 64: OpcodeIndex = is64BitVector ? 3 : 7; break;
-  default: llvm_unreachable("unhandled vector load type");
-  }
-  unsigned Opc = Opcodes[OpcodeIndex];
+  SDValue Chain = N->getOperand(0);
 
-  SmallVector<SDValue, 2> Ops;
-  unsigned AddrOpIdx = isUpdating ? 1 : 2;
-  Ops.push_back(N->getOperand(AddrOpIdx)); // Push back the Memory Address
+  SmallVector<SDValue, 6> Ops;
+  Ops.push_back(N->getOperand(2)); // Mem operand;
+  Ops.push_back(Chain);
 
-  if (isUpdating) {
-    SDValue Inc = N->getOperand(AddrOpIdx + 1);
-    if (!isa<ConstantSDNode>(Inc.getNode())) // Increment in Register
-      Opc = getVLDSTRegisterUpdateOpcode(Opc);
-    Ops.push_back(Inc);
-  }
+  std::vector<EVT> ResTys;
+  ResTys.push_back(MVT::Untyped);
+  ResTys.push_back(MVT::Other);
 
-  Ops.push_back(N->getOperand(0)); // Push back the Chain
+  SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
+  SDValue SuperReg = SDValue(Ld, 0);
+  for (unsigned i = 0; i < NumVecs; ++i)
+    ReplaceUses(SDValue(N, i),
+        CurDAG->getTargetExtractSubreg(SubRegIdx + i, dl, VT, SuperReg));
 
-  SmallVector<EVT, 3> ResTys;
-  // Push back the type of return super register
-  if (NumVecs == 1)
-    ResTys.push_back(VT);
-  else if (NumVecs == 3)
-    ResTys.push_back(MVT::Untyped);
-  else {
-    EVT ResTy = EVT::getVectorVT(*CurDAG->getContext(), MVT::i64,
-                                 is64BitVector ? NumVecs : NumVecs * 2);
-    ResTys.push_back(ResTy);
-  }
-
-  if (isUpdating)
-    ResTys.push_back(MVT::i64); // Type of the updated register
-  ResTys.push_back(MVT::Other); // Type of the Chain
+  ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 1));
+  return nullptr;
+}
+
+SDNode *AArch64DAGToDAGISel::SelectPostLoad(SDNode *N, unsigned NumVecs,
+                                            unsigned Opc, unsigned SubRegIdx) {
   SDLoc dl(N);
-  SDNode *VLd = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
+  EVT VT = N->getValueType(0);
+  SDValue Chain = N->getOperand(0);
 
-  // Transfer memoperands.
-  MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
-  MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
-  cast<MachineSDNode>(VLd)->setMemRefs(MemOp, MemOp + 1);
+  SmallVector<SDValue, 6> Ops;
+  Ops.push_back(N->getOperand(1)); // Mem operand
+  Ops.push_back(N->getOperand(2)); // Incremental
+  Ops.push_back(Chain);
+
+  std::vector<EVT> ResTys;
+  ResTys.push_back(MVT::i64); // Type of the write back register
+  ResTys.push_back(MVT::Untyped);
+  ResTys.push_back(MVT::Other);
+
+  SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
 
+  // Update uses of write back register
+  ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 0));
+
+  // Update uses of vector list
+  SDValue SuperReg = SDValue(Ld, 1);
   if (NumVecs == 1)
-    return VLd;
-
-  // If NumVecs > 1, the return result is a super register containing 2-4
-  // consecutive vector registers.
-  SDValue SuperReg = SDValue(VLd, 0);
-
-  unsigned Sub0 = is64BitVector ? AArch64::dsub_0 : AArch64::qsub_0;
-  for (unsigned Vec = 0; Vec < NumVecs; ++Vec)
-    ReplaceUses(SDValue(N, Vec),
-                CurDAG->getTargetExtractSubreg(Sub0 + Vec, dl, VT, SuperReg));
-  // Update users of the Chain
-  ReplaceUses(SDValue(N, NumVecs), SDValue(VLd, 1));
-  if (isUpdating)
-    ReplaceUses(SDValue(N, NumVecs + 1), SDValue(VLd, 2));
-
-  return NULL;
+    ReplaceUses(SDValue(N, 0), SuperReg);
+  else
+    for (unsigned i = 0; i < NumVecs; ++i)
+      ReplaceUses(SDValue(N, i),
+          CurDAG->getTargetExtractSubreg(SubRegIdx + i, dl, VT, SuperReg));
+
+  // Update the chain
+  ReplaceUses(SDValue(N, NumVecs + 1), SDValue(Ld, 2));
+  return nullptr;
 }
 
-SDNode *AArch64DAGToDAGISel::SelectVST(SDNode *N, bool isUpdating,
-                                       unsigned NumVecs,
-                                       const uint16_t *Opcodes) {
-  assert(NumVecs >= 1 && NumVecs <= 4 && "VST NumVecs out-of-range");
+SDNode *AArch64DAGToDAGISel::SelectStore(SDNode *N, unsigned NumVecs,
+                                         unsigned Opc) {
   SDLoc dl(N);
+  EVT VT = N->getOperand(2)->getValueType(0);
 
-  MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
-  MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
+  // Form a REG_SEQUENCE to force register allocation.
+  bool Is128Bit = VT.getSizeInBits() == 128;
+  SmallVector<SDValue, 4> Regs(N->op_begin() + 2, N->op_begin() + 2 + NumVecs);
+  SDValue RegSeq = Is128Bit ? createQTuple(Regs) : createDTuple(Regs);
 
-  unsigned AddrOpIdx = isUpdating ? 1 : 2;
-  unsigned Vec0Idx = 3;
-  EVT VT = N->getOperand(Vec0Idx).getValueType();
-  unsigned OpcodeIndex;
-  bool is64BitVector = VT.is64BitVector();
-  switch (VT.getScalarType().getSizeInBits()) {
-  case 8: OpcodeIndex = is64BitVector ? 0 : 4; break;
-  case 16: OpcodeIndex = is64BitVector ? 1 : 5; break;
-  case 32: OpcodeIndex = is64BitVector ? 2 : 6; break;
-  case 64: OpcodeIndex = is64BitVector ? 3 : 7; break;
-  default: llvm_unreachable("unhandled vector store type");
-  }
-  unsigned Opc = Opcodes[OpcodeIndex];
+  SmallVector<SDValue, 6> Ops;
+  Ops.push_back(RegSeq);
+  Ops.push_back(N->getOperand(NumVecs + 2));
+  Ops.push_back(N->getOperand(0));
+  SDNode *St = CurDAG->getMachineNode(Opc, dl, N->getValueType(0), Ops);
+
+  return St;
+}
 
+SDNode *AArch64DAGToDAGISel::SelectPostStore(SDNode *N, unsigned NumVecs,
+                                             unsigned Opc) {
+  SDLoc dl(N);
+  EVT VT = N->getOperand(2)->getValueType(0);
   SmallVector<EVT, 2> ResTys;
-  if (isUpdating)
-    ResTys.push_back(MVT::i64);
+  ResTys.push_back(MVT::i64);   // Type of the write back register
   ResTys.push_back(MVT::Other); // Type for the Chain
 
+  // Form a REG_SEQUENCE to force register allocation.
+  bool Is128Bit = VT.getSizeInBits() == 128;
+  SmallVector<SDValue, 4> Regs(N->op_begin() + 1, N->op_begin() + 1 + NumVecs);
+  SDValue RegSeq = Is128Bit ? createQTuple(Regs) : createDTuple(Regs);
+
   SmallVector<SDValue, 6> Ops;
-  Ops.push_back(N->getOperand(AddrOpIdx)); // Push back the Memory Address
+  Ops.push_back(RegSeq);
+  Ops.push_back(N->getOperand(NumVecs + 1)); // base register
+  Ops.push_back(N->getOperand(NumVecs + 2)); // Incremental
+  Ops.push_back(N->getOperand(0)); // Chain
+  SDNode *St = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
 
-  if (isUpdating) {
-    SDValue Inc = N->getOperand(AddrOpIdx + 1);
-    if (!isa<ConstantSDNode>(Inc.getNode())) // Increment in Register
-      Opc = getVLDSTRegisterUpdateOpcode(Opc);
-    Ops.push_back(Inc);
+  return St;
+}
+
+/// WidenVector - Given a value in the V64 register class, produce the
+/// equivalent value in the V128 register class.
+class WidenVector {
+  SelectionDAG &DAG;
+
+public:
+  WidenVector(SelectionDAG &DAG) : DAG(DAG) {}
+
+  SDValue operator()(SDValue V64Reg) {
+    EVT VT = V64Reg.getValueType();
+    unsigned NarrowSize = VT.getVectorNumElements();
+    MVT EltTy = VT.getVectorElementType().getSimpleVT();
+    MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize);
+    SDLoc DL(V64Reg);
+
+    SDValue Undef =
+        SDValue(DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, WideTy), 0);
+    return DAG.getTargetInsertSubreg(AArch64::dsub, DL, WideTy, Undef, V64Reg);
   }
+};
+
+/// NarrowVector - Given a value in the V128 register class, produce the
+/// equivalent value in the V64 register class.
+static SDValue NarrowVector(SDValue V128Reg, SelectionDAG &DAG) {
+  EVT VT = V128Reg.getValueType();
+  unsigned WideSize = VT.getVectorNumElements();
+  MVT EltTy = VT.getVectorElementType().getSimpleVT();
+  MVT NarrowTy = MVT::getVectorVT(EltTy, WideSize / 2);
+
+  return DAG.getTargetExtractSubreg(AArch64::dsub, SDLoc(V128Reg), NarrowTy,
+                                    V128Reg);
+}
+
+SDNode *AArch64DAGToDAGISel::SelectLoadLane(SDNode *N, unsigned NumVecs,
+                                            unsigned Opc) {
+  SDLoc dl(N);
+  EVT VT = N->getValueType(0);
+  bool Narrow = VT.getSizeInBits() == 64;
+
+  // Form a REG_SEQUENCE to force register allocation.
+  SmallVector<SDValue, 4> Regs(N->op_begin() + 2, N->op_begin() + 2 + NumVecs);
+
+  if (Narrow)
+    std::transform(Regs.begin(), Regs.end(), Regs.begin(),
+                   WidenVector(*CurDAG));
+
+  SDValue RegSeq = createQTuple(Regs);
 
-  SmallVector<SDValue, 4> Regs(N->op_begin() + Vec0Idx,
-                               N->op_begin() + Vec0Idx + NumVecs);
-  SDValue SrcReg = is64BitVector ? createDTuple(Regs) : createQTuple(Regs);
-  Ops.push_back(SrcReg);
+  std::vector<EVT> ResTys;
+  ResTys.push_back(MVT::Untyped);
+  ResTys.push_back(MVT::Other);
 
-  // Push back the Chain
+  unsigned LaneNo =
+      cast<ConstantSDNode>(N->getOperand(NumVecs + 2))->getZExtValue();
+
+  SmallVector<SDValue, 6> Ops;
+  Ops.push_back(RegSeq);
+  Ops.push_back(CurDAG->getTargetConstant(LaneNo, MVT::i64));
+  Ops.push_back(N->getOperand(NumVecs + 3));
   Ops.push_back(N->getOperand(0));
+  SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
+  SDValue SuperReg = SDValue(Ld, 0);
+
+  EVT WideVT = RegSeq.getOperand(1)->getValueType(0);
+  static unsigned QSubs[] = { AArch64::qsub0, AArch64::qsub1, AArch64::qsub2,
+                              AArch64::qsub3 };
+  for (unsigned i = 0; i < NumVecs; ++i) {
+    SDValue NV = CurDAG->getTargetExtractSubreg(QSubs[i], dl, WideVT, SuperReg);
+    if (Narrow)
+      NV = NarrowVector(NV, *CurDAG);
+    ReplaceUses(SDValue(N, i), NV);
+  }
 
-  // Transfer memoperands.
-  SDNode *VSt = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
-  cast<MachineSDNode>(VSt)->setMemRefs(MemOp, MemOp + 1);
+  ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 1));
 
-  return VSt;
+  return Ld;
 }
 
-SDValue
-AArch64DAGToDAGISel::getTargetSubregToReg(int SRIdx, SDLoc DL, EVT VT, EVT VTD,
-                                          SDValue Operand) {
-  SDNode *Reg = CurDAG->getMachineNode(TargetOpcode::SUBREG_TO_REG, DL,
-                        VT, VTD, MVT::Other,
-                        CurDAG->getTargetConstant(0, MVT::i64),
-                        Operand,
-                        CurDAG->getTargetConstant(AArch64::sub_64, MVT::i32));
-  return SDValue(Reg, 0);
+SDNode *AArch64DAGToDAGISel::SelectPostLoadLane(SDNode *N, unsigned NumVecs,
+                                                unsigned Opc) {
+  SDLoc dl(N);
+  EVT VT = N->getValueType(0);
+  bool Narrow = VT.getSizeInBits() == 64;
+
+  // Form a REG_SEQUENCE to force register allocation.
+  SmallVector<SDValue, 4> Regs(N->op_begin() + 1, N->op_begin() + 1 + NumVecs);
+
+  if (Narrow)
+    std::transform(Regs.begin(), Regs.end(), Regs.begin(),
+                   WidenVector(*CurDAG));
+
+  SDValue RegSeq = createQTuple(Regs);
+
+  std::vector<EVT> ResTys;
+  ResTys.push_back(MVT::i64); // Type of the write back register
+  ResTys.push_back(MVT::Untyped);
+  ResTys.push_back(MVT::Other);
+
+  unsigned LaneNo =
+      cast<ConstantSDNode>(N->getOperand(NumVecs + 1))->getZExtValue();
+
+  SmallVector<SDValue, 6> Ops;
+  Ops.push_back(RegSeq);
+  Ops.push_back(CurDAG->getTargetConstant(LaneNo, MVT::i64)); // Lane Number
+  Ops.push_back(N->getOperand(NumVecs + 2)); // Base register
+  Ops.push_back(N->getOperand(NumVecs + 3)); // Incremental
+  Ops.push_back(N->getOperand(0));
+  SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
+
+  // Update uses of the write back register
+  ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 0));
+
+  // Update uses of the vector list
+  SDValue SuperReg = SDValue(Ld, 1);
+  if (NumVecs == 1) {
+    ReplaceUses(SDValue(N, 0),
+                Narrow ? NarrowVector(SuperReg, *CurDAG) : SuperReg);
+  } else {
+    EVT WideVT = RegSeq.getOperand(1)->getValueType(0);
+    static unsigned QSubs[] = { AArch64::qsub0, AArch64::qsub1, AArch64::qsub2,
+                                AArch64::qsub3 };
+    for (unsigned i = 0; i < NumVecs; ++i) {
+      SDValue NV = CurDAG->getTargetExtractSubreg(QSubs[i], dl, WideVT,
+                                                  SuperReg);
+      if (Narrow)
+        NV = NarrowVector(NV, *CurDAG);
+      ReplaceUses(SDValue(N, i), NV);
+    }
+  }
+
+  // Update the Chain
+  ReplaceUses(SDValue(N, NumVecs + 1), SDValue(Ld, 2));
+
+  return Ld;
 }
 
-SDNode *AArch64DAGToDAGISel::SelectVLDDup(SDNode *N, bool isUpdating,
-                                          unsigned NumVecs,
-                                          const uint16_t *Opcodes) {
-  assert(NumVecs >=2 && NumVecs <= 4 && "Load Dup NumVecs out-of-range");
+SDNode *AArch64DAGToDAGISel::SelectStoreLane(SDNode *N, unsigned NumVecs,
+                                             unsigned Opc) {
   SDLoc dl(N);
+  EVT VT = N->getOperand(2)->getValueType(0);
+  bool Narrow = VT.getSizeInBits() == 64;
+
+  // Form a REG_SEQUENCE to force register allocation.
+  SmallVector<SDValue, 4> Regs(N->op_begin() + 2, N->op_begin() + 2 + NumVecs);
+
+  if (Narrow)
+    std::transform(Regs.begin(), Regs.end(), Regs.begin(),
+                   WidenVector(*CurDAG));
+
+  SDValue RegSeq = createQTuple(Regs);
+
+  unsigned LaneNo =
+      cast<ConstantSDNode>(N->getOperand(NumVecs + 2))->getZExtValue();
 
-  EVT VT = N->getValueType(0);
-  unsigned OpcodeIndex;
-  bool is64BitVector = VT.is64BitVector();
-  switch (VT.getScalarType().getSizeInBits()) {
-  case 8: OpcodeIndex = is64BitVector ? 0 : 4; break;
-  case 16: OpcodeIndex = is64BitVector ? 1 : 5; break;
-  case 32: OpcodeIndex = is64BitVector ? 2 : 6; break;
-  case 64: OpcodeIndex = is64BitVector ? 3 : 7; break;
-  default: llvm_unreachable("unhandled vector duplicate lane load type");
-  }
-  unsigned Opc = Opcodes[OpcodeIndex];
-
-  SDValue SuperReg;
   SmallVector<SDValue, 6> Ops;
-  Ops.push_back(N->getOperand(1)); // Push back the Memory Address
-  if (isUpdating) {
-    SDValue Inc = N->getOperand(2);
-    if (!isa<ConstantSDNode>(Inc.getNode())) // Increment in Register
-      Opc = getVLDSTRegisterUpdateOpcode(Opc);
-    Ops.push_back(Inc);
-  }
-  Ops.push_back(N->getOperand(0)); // Push back the Chain
-
-  SmallVector<EVT, 3> ResTys;
-  // Push back the type of return super register
-  if (NumVecs == 3)
-    ResTys.push_back(MVT::Untyped);
-  else {
-    EVT ResTy = EVT::getVectorVT(*CurDAG->getContext(), MVT::i64,
-                                 is64BitVector ? NumVecs : NumVecs * 2);
-    ResTys.push_back(ResTy);
-  }
-  if (isUpdating)
-    ResTys.push_back(MVT::i64); // Type of the updated register
-  ResTys.push_back(MVT::Other); // Type of the Chain
-  SDNode *VLdDup = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
+  Ops.push_back(RegSeq);
+  Ops.push_back(CurDAG->getTargetConstant(LaneNo, MVT::i64));
+  Ops.push_back(N->getOperand(NumVecs + 3));
+  Ops.push_back(N->getOperand(0));
+  SDNode *St = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
 
   // Transfer memoperands.
   MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
   MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
-  cast<MachineSDNode>(VLdDup)->setMemRefs(MemOp, MemOp + 1);
-
-  SuperReg = SDValue(VLdDup, 0);
-  unsigned Sub0 = is64BitVector ? AArch64::dsub_0 : AArch64::qsub_0;
-  // Update uses of each registers in super register
-  for (unsigned Vec = 0; Vec < NumVecs; ++Vec)
-    ReplaceUses(SDValue(N, Vec),
-                CurDAG->getTargetExtractSubreg(Sub0 + Vec, dl, VT, SuperReg));
-  // Update uses of the Chain
-  ReplaceUses(SDValue(N, NumVecs), SDValue(VLdDup, 1));
-  if (isUpdating)
-    ReplaceUses(SDValue(N, NumVecs + 1), SDValue(VLdDup, 2));
-  return NULL;
+  cast<MachineSDNode>(St)->setMemRefs(MemOp, MemOp + 1);
+
+  return St;
 }
 
-// We only have 128-bit vector type of load/store lane instructions.
-// If it is 64-bit vector, we also select it to the 128-bit instructions.
-// Just use SUBREG_TO_REG to adapt the input to 128-bit vector and
-// EXTRACT_SUBREG to get the 64-bit vector from the 128-bit vector output.
-SDNode *AArch64DAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad,
-                                             bool isUpdating, unsigned NumVecs,
-                                             const uint16_t *Opcodes) {
-  assert(NumVecs >= 2 && NumVecs <= 4 && "VLDSTLane NumVecs out-of-range");
+SDNode *AArch64DAGToDAGISel::SelectPostStoreLane(SDNode *N, unsigned NumVecs,
+                                                 unsigned Opc) {
   SDLoc dl(N);
-  unsigned AddrOpIdx = isUpdating ? 1 : 2;
-  unsigned Vec0Idx = 3;
+  EVT VT = N->getOperand(2)->getValueType(0);
+  bool Narrow = VT.getSizeInBits() == 64;
 
-  SDValue Chain = N->getOperand(0);
-  unsigned Lane =
-      cast<ConstantSDNode>(N->getOperand(Vec0Idx + NumVecs))->getZExtValue();
-  EVT VT = N->getOperand(Vec0Idx).getValueType();
-  bool is64BitVector = VT.is64BitVector();
-  EVT VT64; // 64-bit Vector Type
-
-  if (is64BitVector) {
-    VT64 = VT;
-    VT = EVT::getVectorVT(*CurDAG->getContext(), VT.getVectorElementType(),
-                          VT.getVectorNumElements() * 2);
-  }
-
-  unsigned OpcodeIndex;
-  switch (VT.getScalarType().getSizeInBits()) {
-  case 8: OpcodeIndex = 0; break;
-  case 16: OpcodeIndex = 1; break;
-  case 32: OpcodeIndex = 2; break;
-  case 64: OpcodeIndex = 3; break;
-  default: llvm_unreachable("unhandled vector lane load/store type");
-  }
-  unsigned Opc = Opcodes[OpcodeIndex];
-
-  SmallVector<EVT, 3> ResTys;
-  if (IsLoad) {
-    // Push back the type of return super register
-    if (NumVecs == 3)
-      ResTys.push_back(MVT::Untyped);
-    else {
-      EVT ResTy = EVT::getVectorVT(*CurDAG->getContext(), MVT::i64,
-                                   is64BitVector ? NumVecs : NumVecs * 2);
-      ResTys.push_back(ResTy);
-    }
-  }
-  if (isUpdating)
-    ResTys.push_back(MVT::i64); // Type of the updated register
-  ResTys.push_back(MVT::Other); // Type of Chain
-  SmallVector<SDValue, 5> Ops;
-  Ops.push_back(N->getOperand(AddrOpIdx)); // Push back the Memory Address
-  if (isUpdating) {
-    SDValue Inc = N->getOperand(AddrOpIdx + 1);
-    if (!isa<ConstantSDNode>(Inc.getNode())) // Increment in Register
-      Opc = getVLDSTRegisterUpdateOpcode(Opc);
-    Ops.push_back(Inc);
-  }
-
-  SmallVector<SDValue, 4> Regs(N->op_begin() + Vec0Idx,
-                               N->op_begin() + Vec0Idx + NumVecs);
-  if (is64BitVector)
-    for (unsigned i = 0; i < Regs.size(); i++)
-      Regs[i] = getTargetSubregToReg(AArch64::sub_64, dl, VT, VT64, Regs[i]);
-  SDValue SuperReg = createQTuple(Regs);
-
-  Ops.push_back(SuperReg); // Source Reg
-  SDValue LaneValue = CurDAG->getTargetConstant(Lane, MVT::i32);
-  Ops.push_back(LaneValue);
-  Ops.push_back(Chain); // Push back the Chain
-
-  SDNode *VLdLn = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
+  // Form a REG_SEQUENCE to force register allocation.
+  SmallVector<SDValue, 4> Regs(N->op_begin() + 1, N->op_begin() + 1 + NumVecs);
+
+  if (Narrow)
+    std::transform(Regs.begin(), Regs.end(), Regs.begin(),
+                   WidenVector(*CurDAG));
+
+  SDValue RegSeq = createQTuple(Regs);
+
+  SmallVector<EVT, 2> ResTys;
+  ResTys.push_back(MVT::i64);   // Type of the write back register
+  ResTys.push_back(MVT::Other);
+
+  unsigned LaneNo =
+      cast<ConstantSDNode>(N->getOperand(NumVecs + 1))->getZExtValue();
+
+  SmallVector<SDValue, 6> Ops;
+  Ops.push_back(RegSeq);
+  Ops.push_back(CurDAG->getTargetConstant(LaneNo, MVT::i64));
+  Ops.push_back(N->getOperand(NumVecs + 2)); // Base Register
+  Ops.push_back(N->getOperand(NumVecs + 3)); // Incremental
+  Ops.push_back(N->getOperand(0));
+  SDNode *St = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
+
+  // Transfer memoperands.
   MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
   MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
-  cast<MachineSDNode>(VLdLn)->setMemRefs(MemOp, MemOp + 1);
-  if (!IsLoad)
-    return VLdLn;
-
-  // Extract the subregisters.
-  SuperReg = SDValue(VLdLn, 0);
-  unsigned Sub0 = AArch64::qsub_0;
-  // Update uses of each registers in super register
-  for (unsigned Vec = 0; Vec < NumVecs; ++Vec) {
-    SDValue SUB0 = CurDAG->getTargetExtractSubreg(Sub0 + Vec, dl, VT, SuperReg);
-    if (is64BitVector) {
-      SUB0 = CurDAG->getTargetExtractSubreg(AArch64::sub_64, dl, VT64, SUB0);
-    }
-    ReplaceUses(SDValue(N, Vec), SUB0);
+  cast<MachineSDNode>(St)->setMemRefs(MemOp, MemOp + 1);
+
+  return St;
+}
+
+static bool isBitfieldExtractOpFromAnd(SelectionDAG *CurDAG, SDNode *N,
+                                       unsigned &Opc, SDValue &Opd0,
+                                       unsigned &LSB, unsigned &MSB,
+                                       unsigned NumberOfIgnoredLowBits,
+                                       bool BiggerPattern) {
+  assert(N->getOpcode() == ISD::AND &&
+         "N must be a AND operation to call this function");
+
+  EVT VT = N->getValueType(0);
+
+  // Here we can test the type of VT and return false when the type does not
+  // match, but since it is done prior to that call in the current context
+  // we turned that into an assert to avoid redundant code.
+  assert((VT == MVT::i32 || VT == MVT::i64) &&
+         "Type checking must have been done before calling this function");
+
+  // FIXME: simplify-demanded-bits in DAGCombine will probably have
+  // changed the AND node to a 32-bit mask operation. We'll have to
+  // undo that as part of the transform here if we want to catch all
+  // the opportunities.
+  // Currently the NumberOfIgnoredLowBits argument helps to recover
+  // form these situations when matching bigger pattern (bitfield insert).
+
+  // For unsigned extracts, check for a shift right and mask
+  uint64_t And_imm = 0;
+  if (!isOpcWithIntImmediate(N, ISD::AND, And_imm))
+    return false;
+
+  const SDNode *Op0 = N->getOperand(0).getNode();
+
+  // Because of simplify-demanded-bits in DAGCombine, the mask may have been
+  // simplified. Try to undo that
+  And_imm |= (1 << NumberOfIgnoredLowBits) - 1;
+
+  // The immediate is a mask of the low bits iff imm & (imm+1) == 0
+  if (And_imm & (And_imm + 1))
+    return false;
+
+  bool ClampMSB = false;
+  uint64_t Srl_imm = 0;
+  // Handle the SRL + ANY_EXTEND case.
+  if (VT == MVT::i64 && Op0->getOpcode() == ISD::ANY_EXTEND &&
+      isOpcWithIntImmediate(Op0->getOperand(0).getNode(), ISD::SRL, Srl_imm)) {
+    // Extend the incoming operand of the SRL to 64-bit.
+    Opd0 = Widen(CurDAG, Op0->getOperand(0).getOperand(0));
+    // Make sure to clamp the MSB so that we preserve the semantics of the
+    // original operations.
+    ClampMSB = true;
+  } else if (VT == MVT::i32 && Op0->getOpcode() == ISD::TRUNCATE &&
+             isOpcWithIntImmediate(Op0->getOperand(0).getNode(), ISD::SRL,
+                                   Srl_imm)) {
+    // If the shift result was truncated, we can still combine them.
+    Opd0 = Op0->getOperand(0).getOperand(0);
+
+    // Use the type of SRL node.
+    VT = Opd0->getValueType(0);
+  } else if (isOpcWithIntImmediate(Op0, ISD::SRL, Srl_imm)) {
+    Opd0 = Op0->getOperand(0);
+  } else if (BiggerPattern) {
+    // Let's pretend a 0 shift right has been performed.
+    // The resulting code will be at least as good as the original one
+    // plus it may expose more opportunities for bitfield insert pattern.
+    // FIXME: Currently we limit this to the bigger pattern, because
+    // some optimizations expect AND and not UBFM
+    Opd0 = N->getOperand(0);
+  } else
+    return false;
+
+  assert((BiggerPattern || (Srl_imm > 0 && Srl_imm < VT.getSizeInBits())) &&
+         "bad amount in shift node!");
+
+  LSB = Srl_imm;
+  MSB = Srl_imm + (VT == MVT::i32 ? CountTrailingOnes_32(And_imm)
+                                  : CountTrailingOnes_64(And_imm)) -
+        1;
+  if (ClampMSB)
+    // Since we're moving the extend before the right shift operation, we need
+    // to clamp the MSB to make sure we don't shift in undefined bits instead of
+    // the zeros which would get shifted in with the original right shift
+    // operation.
+    MSB = MSB > 31 ? 31 : MSB;
+
+  Opc = VT == MVT::i32 ? AArch64::UBFMWri : AArch64::UBFMXri;
+  return true;
+}
+
+static bool isOneBitExtractOpFromShr(SDNode *N, unsigned &Opc, SDValue &Opd0,
+                                     unsigned &LSB, unsigned &MSB) {
+  // We are looking for the following pattern which basically extracts a single
+  // bit from the source value and places it in the LSB of the destination
+  // value, all other bits of the destination value or set to zero:
+  //
+  // Value2 = AND Value, MaskImm
+  // SRL Value2, ShiftImm
+  //
+  // with MaskImm >> ShiftImm == 1.
+  //
+  // This gets selected into a single UBFM:
+  //
+  // UBFM Value, ShiftImm, ShiftImm
+  //
+
+  if (N->getOpcode() != ISD::SRL)
+    return false;
+
+  uint64_t And_mask = 0;
+  if (!isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::AND, And_mask))
+    return false;
+
+  Opd0 = N->getOperand(0).getOperand(0);
+
+  uint64_t Srl_imm = 0;
+  if (!isIntImmediate(N->getOperand(1), Srl_imm))
+    return false;
+
+  // Check whether we really have a one bit extract here.
+  if (And_mask >> Srl_imm == 0x1) {
+    if (N->getValueType(0) == MVT::i32)
+      Opc = AArch64::UBFMWri;
+    else
+      Opc = AArch64::UBFMXri;
+
+    LSB = MSB = Srl_imm;
+
+    return true;
   }
-  ReplaceUses(SDValue(N, NumVecs), SDValue(VLdLn, 1));
-  if (isUpdating)
-    ReplaceUses(SDValue(N, NumVecs + 1), SDValue(VLdLn, 2));
-  return NULL;
+
+  return false;
 }
 
-unsigned AArch64DAGToDAGISel::getTBLOpc(bool IsExt, bool Is64Bit,
-                                        unsigned NumOfVec) {
-  assert(NumOfVec >= 1 && NumOfVec <= 4 && "VST NumVecs out-of-range");
+static bool isBitfieldExtractOpFromShr(SDNode *N, unsigned &Opc, SDValue &Opd0,
+                                       unsigned &LSB, unsigned &MSB,
+                                       bool BiggerPattern) {
+  assert((N->getOpcode() == ISD::SRA || N->getOpcode() == ISD::SRL) &&
+         "N must be a SHR/SRA operation to call this function");
 
-  unsigned Opc = 0;
-  switch (NumOfVec) {
+  EVT VT = N->getValueType(0);
+
+  // Here we can test the type of VT and return false when the type does not
+  // match, but since it is done prior to that call in the current context
+  // we turned that into an assert to avoid redundant code.
+  assert((VT == MVT::i32 || VT == MVT::i64) &&
+         "Type checking must have been done before calling this function");
+
+  // Check for AND + SRL doing a one bit extract.
+  if (isOneBitExtractOpFromShr(N, Opc, Opd0, LSB, MSB))
+    return true;
+
+  // we're looking for a shift of a shift
+  uint64_t Shl_imm = 0;
+  uint64_t Trunc_bits = 0;
+  if (isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::SHL, Shl_imm)) {
+    Opd0 = N->getOperand(0).getOperand(0);
+  } else if (VT == MVT::i32 && N->getOpcode() == ISD::SRL &&
+             N->getOperand(0).getNode()->getOpcode() == ISD::TRUNCATE) {
+    // We are looking for a shift of truncate. Truncate from i64 to i32 could
+    // be considered as setting high 32 bits as zero. Our strategy here is to
+    // always generate 64bit UBFM. This consistency will help the CSE pass
+    // later find more redundancy.
+    Opd0 = N->getOperand(0).getOperand(0);
+    Trunc_bits = Opd0->getValueType(0).getSizeInBits() - VT.getSizeInBits();
+    VT = Opd0->getValueType(0);
+    assert(VT == MVT::i64 && "the promoted type should be i64");
+  } else if (BiggerPattern) {
+    // Let's pretend a 0 shift left has been performed.
+    // FIXME: Currently we limit this to the bigger pattern case,
+    // because some optimizations expect AND and not UBFM
+    Opd0 = N->getOperand(0);
+  } else
+    return false;
+
+  assert(Shl_imm < VT.getSizeInBits() && "bad amount in shift node!");
+  uint64_t Srl_imm = 0;
+  if (!isIntImmediate(N->getOperand(1), Srl_imm))
+    return false;
+
+  assert(Srl_imm > 0 && Srl_imm < VT.getSizeInBits() &&
+         "bad amount in shift node!");
+  // Note: The width operand is encoded as width-1.
+  unsigned Width = VT.getSizeInBits() - Trunc_bits - Srl_imm - 1;
+  int sLSB = Srl_imm - Shl_imm;
+  if (sLSB < 0)
+    return false;
+  LSB = sLSB;
+  MSB = LSB + Width;
+  // SRA requires a signed extraction
+  if (VT == MVT::i32)
+    Opc = N->getOpcode() == ISD::SRA ? AArch64::SBFMWri : AArch64::UBFMWri;
+  else
+    Opc = N->getOpcode() == ISD::SRA ? AArch64::SBFMXri : AArch64::UBFMXri;
+  return true;
+}
+
+static bool isBitfieldExtractOp(SelectionDAG *CurDAG, SDNode *N, unsigned &Opc,
+                                SDValue &Opd0, unsigned &LSB, unsigned &MSB,
+                                unsigned NumberOfIgnoredLowBits = 0,
+                                bool BiggerPattern = false) {
+  if (N->getValueType(0) != MVT::i32 && N->getValueType(0) != MVT::i64)
+    return false;
+
+  switch (N->getOpcode()) {
   default:
+    if (!N->isMachineOpcode())
+      return false;
     break;
-  case 1:
-    if (IsExt)
-      Opc = Is64Bit ? AArch64::TBX1_8b : AArch64::TBX1_16b;
+  case ISD::AND:
+    return isBitfieldExtractOpFromAnd(CurDAG, N, Opc, Opd0, LSB, MSB,
+                                      NumberOfIgnoredLowBits, BiggerPattern);
+  case ISD::SRL:
+  case ISD::SRA:
+    return isBitfieldExtractOpFromShr(N, Opc, Opd0, LSB, MSB, BiggerPattern);
+  }
+
+  unsigned NOpc = N->getMachineOpcode();
+  switch (NOpc) {
+  default:
+    return false;
+  case AArch64::SBFMWri:
+  case AArch64::UBFMWri:
+  case AArch64::SBFMXri:
+  case AArch64::UBFMXri:
+    Opc = NOpc;
+    Opd0 = N->getOperand(0);
+    LSB = cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
+    MSB = cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
+    return true;
+  }
+  // Unreachable
+  return false;
+}
+
+SDNode *AArch64DAGToDAGISel::SelectBitfieldExtractOp(SDNode *N) {
+  unsigned Opc, LSB, MSB;
+  SDValue Opd0;
+  if (!isBitfieldExtractOp(CurDAG, N, Opc, Opd0, LSB, MSB))
+    return nullptr;
+
+  EVT VT = N->getValueType(0);
+
+  // If the bit extract operation is 64bit but the original type is 32bit, we
+  // need to add one EXTRACT_SUBREG.
+  if ((Opc == AArch64::SBFMXri || Opc == AArch64::UBFMXri) && VT == MVT::i32) {
+    SDValue Ops64[] = {Opd0, CurDAG->getTargetConstant(LSB, MVT::i64),
+                       CurDAG->getTargetConstant(MSB, MVT::i64)};
+
+    SDNode *BFM = CurDAG->getMachineNode(Opc, SDLoc(N), MVT::i64, Ops64);
+    SDValue SubReg = CurDAG->getTargetConstant(AArch64::sub_32, MVT::i32);
+    MachineSDNode *Node =
+        CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, SDLoc(N), MVT::i32,
+                               SDValue(BFM, 0), SubReg);
+    return Node;
+  }
+
+  SDValue Ops[] = {Opd0, CurDAG->getTargetConstant(LSB, VT),
+                   CurDAG->getTargetConstant(MSB, VT)};
+  return CurDAG->SelectNodeTo(N, Opc, VT, Ops);
+}
+
+/// Does DstMask form a complementary pair with the mask provided by
+/// BitsToBeInserted, suitable for use in a BFI instruction. Roughly speaking,
+/// this asks whether DstMask zeroes precisely those bits that will be set by
+/// the other half.
+static bool isBitfieldDstMask(uint64_t DstMask, APInt BitsToBeInserted,
+                              unsigned NumberOfIgnoredHighBits, EVT VT) {
+  assert((VT == MVT::i32 || VT == MVT::i64) &&
+         "i32 or i64 mask type expected!");
+  unsigned BitWidth = VT.getSizeInBits() - NumberOfIgnoredHighBits;
+
+  APInt SignificantDstMask = APInt(BitWidth, DstMask);
+  APInt SignificantBitsToBeInserted = BitsToBeInserted.zextOrTrunc(BitWidth);
+
+  return (SignificantDstMask & SignificantBitsToBeInserted) == 0 &&
+         (SignificantDstMask | SignificantBitsToBeInserted).isAllOnesValue();
+}
+
+// Look for bits that will be useful for later uses.
+// A bit is consider useless as soon as it is dropped and never used
+// before it as been dropped.
+// E.g., looking for useful bit of x
+// 1. y = x & 0x7
+// 2. z = y >> 2
+// After #1, x useful bits are 0x7, then the useful bits of x, live through
+// y.
+// After #2, the useful bits of x are 0x4.
+// However, if x is used on an unpredicatable instruction, then all its bits
+// are useful.
+// E.g.
+// 1. y = x & 0x7
+// 2. z = y >> 2
+// 3. str x, [@x]
+static void getUsefulBits(SDValue Op, APInt &UsefulBits, unsigned Depth = 0);
+
+static void getUsefulBitsFromAndWithImmediate(SDValue Op, APInt &UsefulBits,
+                                              unsigned Depth) {
+  uint64_t Imm =
+      cast<const ConstantSDNode>(Op.getOperand(1).getNode())->getZExtValue();
+  Imm = AArch64_AM::decodeLogicalImmediate(Imm, UsefulBits.getBitWidth());
+  UsefulBits &= APInt(UsefulBits.getBitWidth(), Imm);
+  getUsefulBits(Op, UsefulBits, Depth + 1);
+}
+
+static void getUsefulBitsFromBitfieldMoveOpd(SDValue Op, APInt &UsefulBits,
+                                             uint64_t Imm, uint64_t MSB,
+                                             unsigned Depth) {
+  // inherit the bitwidth value
+  APInt OpUsefulBits(UsefulBits);
+  OpUsefulBits = 1;
+
+  if (MSB >= Imm) {
+    OpUsefulBits = OpUsefulBits.shl(MSB - Imm + 1);
+    --OpUsefulBits;
+    // The interesting part will be in the lower part of the result
+    getUsefulBits(Op, OpUsefulBits, Depth + 1);
+    // The interesting part was starting at Imm in the argument
+    OpUsefulBits = OpUsefulBits.shl(Imm);
+  } else {
+    OpUsefulBits = OpUsefulBits.shl(MSB + 1);
+    --OpUsefulBits;
+    // The interesting part will be shifted in the result
+    OpUsefulBits = OpUsefulBits.shl(OpUsefulBits.getBitWidth() - Imm);
+    getUsefulBits(Op, OpUsefulBits, Depth + 1);
+    // The interesting part was at zero in the argument
+    OpUsefulBits = OpUsefulBits.lshr(OpUsefulBits.getBitWidth() - Imm);
+  }
+
+  UsefulBits &= OpUsefulBits;
+}
+
+static void getUsefulBitsFromUBFM(SDValue Op, APInt &UsefulBits,
+                                  unsigned Depth) {
+  uint64_t Imm =
+      cast<const ConstantSDNode>(Op.getOperand(1).getNode())->getZExtValue();
+  uint64_t MSB =
+      cast<const ConstantSDNode>(Op.getOperand(2).getNode())->getZExtValue();
+
+  getUsefulBitsFromBitfieldMoveOpd(Op, UsefulBits, Imm, MSB, Depth);
+}
+
+static void getUsefulBitsFromOrWithShiftedReg(SDValue Op, APInt &UsefulBits,
+                                              unsigned Depth) {
+  uint64_t ShiftTypeAndValue =
+      cast<const ConstantSDNode>(Op.getOperand(2).getNode())->getZExtValue();
+  APInt Mask(UsefulBits);
+  Mask.clearAllBits();
+  Mask.flipAllBits();
+
+  if (AArch64_AM::getShiftType(ShiftTypeAndValue) == AArch64_AM::LSL) {
+    // Shift Left
+    uint64_t ShiftAmt = AArch64_AM::getShiftValue(ShiftTypeAndValue);
+    Mask = Mask.shl(ShiftAmt);
+    getUsefulBits(Op, Mask, Depth + 1);
+    Mask = Mask.lshr(ShiftAmt);
+  } else if (AArch64_AM::getShiftType(ShiftTypeAndValue) == AArch64_AM::LSR) {
+    // Shift Right
+    // We do not handle AArch64_AM::ASR, because the sign will change the
+    // number of useful bits
+    uint64_t ShiftAmt = AArch64_AM::getShiftValue(ShiftTypeAndValue);
+    Mask = Mask.lshr(ShiftAmt);
+    getUsefulBits(Op, Mask, Depth + 1);
+    Mask = Mask.shl(ShiftAmt);
+  } else
+    return;
+
+  UsefulBits &= Mask;
+}
+
+static void getUsefulBitsFromBFM(SDValue Op, SDValue Orig, APInt &UsefulBits,
+                                 unsigned Depth) {
+  uint64_t Imm =
+      cast<const ConstantSDNode>(Op.getOperand(2).getNode())->getZExtValue();
+  uint64_t MSB =
+      cast<const ConstantSDNode>(Op.getOperand(3).getNode())->getZExtValue();
+
+  if (Op.getOperand(1) == Orig)
+    return getUsefulBitsFromBitfieldMoveOpd(Op, UsefulBits, Imm, MSB, Depth);
+
+  APInt OpUsefulBits(UsefulBits);
+  OpUsefulBits = 1;
+
+  if (MSB >= Imm) {
+    OpUsefulBits = OpUsefulBits.shl(MSB - Imm + 1);
+    --OpUsefulBits;
+    UsefulBits &= ~OpUsefulBits;
+    getUsefulBits(Op, UsefulBits, Depth + 1);
+  } else {
+    OpUsefulBits = OpUsefulBits.shl(MSB + 1);
+    --OpUsefulBits;
+    UsefulBits = ~(OpUsefulBits.shl(OpUsefulBits.getBitWidth() - Imm));
+    getUsefulBits(Op, UsefulBits, Depth + 1);
+  }
+}
+
+static void getUsefulBitsForUse(SDNode *UserNode, APInt &UsefulBits,
+                                SDValue Orig, unsigned Depth) {
+
+  // Users of this node should have already been instruction selected
+  // FIXME: Can we turn that into an assert?
+  if (!UserNode->isMachineOpcode())
+    return;
+
+  switch (UserNode->getMachineOpcode()) {
+  default:
+    return;
+  case AArch64::ANDSWri:
+  case AArch64::ANDSXri:
+  case AArch64::ANDWri:
+  case AArch64::ANDXri:
+    // We increment Depth only when we call the getUsefulBits
+    return getUsefulBitsFromAndWithImmediate(SDValue(UserNode, 0), UsefulBits,
+                                             Depth);
+  case AArch64::UBFMWri:
+  case AArch64::UBFMXri:
+    return getUsefulBitsFromUBFM(SDValue(UserNode, 0), UsefulBits, Depth);
+
+  case AArch64::ORRWrs:
+  case AArch64::ORRXrs:
+    if (UserNode->getOperand(1) != Orig)
+      return;
+    return getUsefulBitsFromOrWithShiftedReg(SDValue(UserNode, 0), UsefulBits,
+                                             Depth);
+  case AArch64::BFMWri:
+  case AArch64::BFMXri:
+    return getUsefulBitsFromBFM(SDValue(UserNode, 0), Orig, UsefulBits, Depth);
+  }
+}
+
+static void getUsefulBits(SDValue Op, APInt &UsefulBits, unsigned Depth) {
+  if (Depth >= 6)
+    return;
+  // Initialize UsefulBits
+  if (!Depth) {
+    unsigned Bitwidth = Op.getValueType().getScalarType().getSizeInBits();
+    // At the beginning, assume every produced bits is useful
+    UsefulBits = APInt(Bitwidth, 0);
+    UsefulBits.flipAllBits();
+  }
+  APInt UsersUsefulBits(UsefulBits.getBitWidth(), 0);
+
+  for (SDNode *Node : Op.getNode()->uses()) {
+    // A use cannot produce useful bits
+    APInt UsefulBitsForUse = APInt(UsefulBits);
+    getUsefulBitsForUse(Node, UsefulBitsForUse, Op, Depth);
+    UsersUsefulBits |= UsefulBitsForUse;
+  }
+  // UsefulBits contains the produced bits that are meaningful for the
+  // current definition, thus a user cannot make a bit meaningful at
+  // this point
+  UsefulBits &= UsersUsefulBits;
+}
+
+/// Create a machine node performing a notional SHL of Op by ShlAmount. If
+/// ShlAmount is negative, do a (logical) right-shift instead. If ShlAmount is
+/// 0, return Op unchanged.
+static SDValue getLeftShift(SelectionDAG *CurDAG, SDValue Op, int ShlAmount) {
+  if (ShlAmount == 0)
+    return Op;
+
+  EVT VT = Op.getValueType();
+  unsigned BitWidth = VT.getSizeInBits();
+  unsigned UBFMOpc = BitWidth == 32 ? AArch64::UBFMWri : AArch64::UBFMXri;
+
+  SDNode *ShiftNode;
+  if (ShlAmount > 0) {
+    // LSL wD, wN, #Amt == UBFM wD, wN, #32-Amt, #31-Amt
+    ShiftNode = CurDAG->getMachineNode(
+        UBFMOpc, SDLoc(Op), VT, Op,
+        CurDAG->getTargetConstant(BitWidth - ShlAmount, VT),
+        CurDAG->getTargetConstant(BitWidth - 1 - ShlAmount, VT));
+  } else {
+    // LSR wD, wN, #Amt == UBFM wD, wN, #Amt, #32-1
+    assert(ShlAmount < 0 && "expected right shift");
+    int ShrAmount = -ShlAmount;
+    ShiftNode = CurDAG->getMachineNode(
+        UBFMOpc, SDLoc(Op), VT, Op, CurDAG->getTargetConstant(ShrAmount, VT),
+        CurDAG->getTargetConstant(BitWidth - 1, VT));
+  }
+
+  return SDValue(ShiftNode, 0);
+}
+
+/// Does this tree qualify as an attempt to move a bitfield into position,
+/// essentially "(and (shl VAL, N), Mask)".
+static bool isBitfieldPositioningOp(SelectionDAG *CurDAG, SDValue Op,
+                                    SDValue &Src, int &ShiftAmount,
+                                    int &MaskWidth) {
+  EVT VT = Op.getValueType();
+  unsigned BitWidth = VT.getSizeInBits();
+  (void)BitWidth;
+  assert(BitWidth == 32 || BitWidth == 64);
+
+  APInt KnownZero, KnownOne;
+  CurDAG->computeKnownBits(Op, KnownZero, KnownOne);
+
+  // Non-zero in the sense that they're not provably zero, which is the key
+  // point if we want to use this value
+  uint64_t NonZeroBits = (~KnownZero).getZExtValue();
+
+  // Discard a constant AND mask if present. It's safe because the node will
+  // already have been factored into the computeKnownBits calculation above.
+  uint64_t AndImm;
+  if (isOpcWithIntImmediate(Op.getNode(), ISD::AND, AndImm)) {
+    assert((~APInt(BitWidth, AndImm) & ~KnownZero) == 0);
+    Op = Op.getOperand(0);
+  }
+
+  uint64_t ShlImm;
+  if (!isOpcWithIntImmediate(Op.getNode(), ISD::SHL, ShlImm))
+    return false;
+  Op = Op.getOperand(0);
+
+  if (!isShiftedMask_64(NonZeroBits))
+    return false;
+
+  ShiftAmount = countTrailingZeros(NonZeroBits);
+  MaskWidth = CountTrailingOnes_64(NonZeroBits >> ShiftAmount);
+
+  // BFI encompasses sufficiently many nodes that it's worth inserting an extra
+  // LSL/LSR if the mask in NonZeroBits doesn't quite match up with the ISD::SHL
+  // amount.
+  Src = getLeftShift(CurDAG, Op, ShlImm - ShiftAmount);
+
+  return true;
+}
+
+// Given a OR operation, check if we have the following pattern
+// ubfm c, b, imm, imm2 (or something that does the same jobs, see
+//                       isBitfieldExtractOp)
+// d = e & mask2 ; where mask is a binary sequence of 1..10..0 and
+//                 countTrailingZeros(mask2) == imm2 - imm + 1
+// f = d | c
+// if yes, given reference arguments will be update so that one can replace
+// the OR instruction with:
+// f = Opc Opd0, Opd1, LSB, MSB ; where Opc is a BFM, LSB = imm, and MSB = imm2
+static bool isBitfieldInsertOpFromOr(SDNode *N, unsigned &Opc, SDValue &Dst,
+                                     SDValue &Src, unsigned &ImmR,
+                                     unsigned &ImmS, SelectionDAG *CurDAG) {
+  assert(N->getOpcode() == ISD::OR && "Expect a OR operation");
+
+  // Set Opc
+  EVT VT = N->getValueType(0);
+  if (VT == MVT::i32)
+    Opc = AArch64::BFMWri;
+  else if (VT == MVT::i64)
+    Opc = AArch64::BFMXri;
+  else
+    return false;
+
+  // Because of simplify-demanded-bits in DAGCombine, involved masks may not
+  // have the expected shape. Try to undo that.
+  APInt UsefulBits;
+  getUsefulBits(SDValue(N, 0), UsefulBits);
+
+  unsigned NumberOfIgnoredLowBits = UsefulBits.countTrailingZeros();
+  unsigned NumberOfIgnoredHighBits = UsefulBits.countLeadingZeros();
+
+  // OR is commutative, check both possibilities (does llvm provide a
+  // way to do that directely, e.g., via code matcher?)
+  SDValue OrOpd1Val = N->getOperand(1);
+  SDNode *OrOpd0 = N->getOperand(0).getNode();
+  SDNode *OrOpd1 = N->getOperand(1).getNode();
+  for (int i = 0; i < 2;
+       ++i, std::swap(OrOpd0, OrOpd1), OrOpd1Val = N->getOperand(0)) {
+    unsigned BFXOpc;
+    int DstLSB, Width;
+    if (isBitfieldExtractOp(CurDAG, OrOpd0, BFXOpc, Src, ImmR, ImmS,
+                            NumberOfIgnoredLowBits, true)) {
+      // Check that the returned opcode is compatible with the pattern,
+      // i.e., same type and zero extended (U and not S)
+      if ((BFXOpc != AArch64::UBFMXri && VT == MVT::i64) ||
+          (BFXOpc != AArch64::UBFMWri && VT == MVT::i32))
+        continue;
+
+      // Compute the width of the bitfield insertion
+      DstLSB = 0;
+      Width = ImmS - ImmR + 1;
+      // FIXME: This constraint is to catch bitfield insertion we may
+      // want to widen the pattern if we want to grab general bitfied
+      // move case
+      if (Width <= 0)
+        continue;
+
+      // If the mask on the insertee is correct, we have a BFXIL operation. We
+      // can share the ImmR and ImmS values from the already-computed UBFM.
+    } else if (isBitfieldPositioningOp(CurDAG, SDValue(OrOpd0, 0), Src,
+                                       DstLSB, Width)) {
+      ImmR = (VT.getSizeInBits() - DstLSB) % VT.getSizeInBits();
+      ImmS = Width - 1;
+    } else
+      continue;
+
+    // Check the second part of the pattern
+    EVT VT = OrOpd1->getValueType(0);
+    assert((VT == MVT::i32 || VT == MVT::i64) && "unexpected OR operand");
+
+    // Compute the Known Zero for the candidate of the first operand.
+    // This allows to catch more general case than just looking for
+    // AND with imm. Indeed, simplify-demanded-bits may have removed
+    // the AND instruction because it proves it was useless.
+    APInt KnownZero, KnownOne;
+    CurDAG->computeKnownBits(OrOpd1Val, KnownZero, KnownOne);
+
+    // Check if there is enough room for the second operand to appear
+    // in the first one
+    APInt BitsToBeInserted =
+        APInt::getBitsSet(KnownZero.getBitWidth(), DstLSB, DstLSB + Width);
+
+    if ((BitsToBeInserted & ~KnownZero) != 0)
+      continue;
+
+    // Set the first operand
+    uint64_t Imm;
+    if (isOpcWithIntImmediate(OrOpd1, ISD::AND, Imm) &&
+        isBitfieldDstMask(Imm, BitsToBeInserted, NumberOfIgnoredHighBits, VT))
+      // In that case, we can eliminate the AND
+      Dst = OrOpd1->getOperand(0);
     else
-      Opc = Is64Bit ? AArch64::TBL1_8b : AArch64::TBL1_16b;
+      // Maybe the AND has been removed by simplify-demanded-bits
+      // or is useful because it discards more bits
+      Dst = OrOpd1Val;
+
+    // both parts match
+    return true;
+  }
+
+  return false;
+}
+
+SDNode *AArch64DAGToDAGISel::SelectBitfieldInsertOp(SDNode *N) {
+  if (N->getOpcode() != ISD::OR)
+    return nullptr;
+
+  unsigned Opc;
+  unsigned LSB, MSB;
+  SDValue Opd0, Opd1;
+
+  if (!isBitfieldInsertOpFromOr(N, Opc, Opd0, Opd1, LSB, MSB, CurDAG))
+    return nullptr;
+
+  EVT VT = N->getValueType(0);
+  SDValue Ops[] = { Opd0,
+                    Opd1,
+                    CurDAG->getTargetConstant(LSB, VT),
+                    CurDAG->getTargetConstant(MSB, VT) };
+  return CurDAG->SelectNodeTo(N, Opc, VT, Ops);
+}
+
+SDNode *AArch64DAGToDAGISel::SelectLIBM(SDNode *N) {
+  EVT VT = N->getValueType(0);
+  unsigned Variant;
+  unsigned Opc;
+  unsigned FRINTXOpcs[] = { AArch64::FRINTXSr, AArch64::FRINTXDr };
+
+  if (VT == MVT::f32) {
+    Variant = 0;
+  } else if (VT == MVT::f64) {
+    Variant = 1;
+  } else
+    return nullptr; // Unrecognized argument type. Fall back on default codegen.
+
+  // Pick the FRINTX variant needed to set the flags.
+  unsigned FRINTXOpc = FRINTXOpcs[Variant];
+
+  switch (N->getOpcode()) {
+  default:
+    return nullptr; // Unrecognized libm ISD node. Fall back on default codegen.
+  case ISD::FCEIL: {
+    unsigned FRINTPOpcs[] = { AArch64::FRINTPSr, AArch64::FRINTPDr };
+    Opc = FRINTPOpcs[Variant];
     break;
-  case 2:
-    if (IsExt)
-      Opc = Is64Bit ? AArch64::TBX2_8b : AArch64::TBX2_16b;
-    else
-      Opc = Is64Bit ? AArch64::TBL2_8b : AArch64::TBL2_16b;
+  }
+  case ISD::FFLOOR: {
+    unsigned FRINTMOpcs[] = { AArch64::FRINTMSr, AArch64::FRINTMDr };
+    Opc = FRINTMOpcs[Variant];
     break;
-  case 3:
-    if (IsExt)
-      Opc = Is64Bit ? AArch64::TBX3_8b : AArch64::TBX3_16b;
-    else
-      Opc = Is64Bit ? AArch64::TBL3_8b : AArch64::TBL3_16b;
+  }
+  case ISD::FTRUNC: {
+    unsigned FRINTZOpcs[] = { AArch64::FRINTZSr, AArch64::FRINTZDr };
+    Opc = FRINTZOpcs[Variant];
     break;
-  case 4:
-    if (IsExt)
-      Opc = Is64Bit ? AArch64::TBX4_8b : AArch64::TBX4_16b;
-    else
-      Opc = Is64Bit ? AArch64::TBL4_8b : AArch64::TBL4_16b;
+  }
+  case ISD::FROUND: {
+    unsigned FRINTAOpcs[] = { AArch64::FRINTASr, AArch64::FRINTADr };
+    Opc = FRINTAOpcs[Variant];
     break;
   }
+  }
+
+  SDLoc dl(N);
+  SDValue In = N->getOperand(0);
+  SmallVector<SDValue, 2> Ops;
+  Ops.push_back(In);
+
+  if (!TM.Options.UnsafeFPMath) {
+    SDNode *FRINTX = CurDAG->getMachineNode(FRINTXOpc, dl, VT, MVT::Glue, In);
+    Ops.push_back(SDValue(FRINTX, 1));
+  }
 
-  return Opc;
+  return CurDAG->getMachineNode(Opc, dl, VT, Ops);
 }
 
-SDNode *AArch64DAGToDAGISel::SelectVTBL(SDNode *N, unsigned NumVecs,
-                                        bool IsExt) {
-  assert(NumVecs >= 1 && NumVecs <= 4 && "VST NumVecs out-of-range");
-  SDLoc dl(N);
+bool
+AArch64DAGToDAGISel::SelectCVTFixedPosOperand(SDValue N, SDValue &FixedPos,
+                                              unsigned RegWidth) {
+  APFloat FVal(0.0);
+  if (ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(N))
+    FVal = CN->getValueAPF();
+  else if (LoadSDNode *LN = dyn_cast<LoadSDNode>(N)) {
+    // Some otherwise illegal constants are allowed in this case.
+    if (LN->getOperand(1).getOpcode() != AArch64ISD::ADDlow ||
+        !isa<ConstantPoolSDNode>(LN->getOperand(1)->getOperand(1)))
+      return false;
 
-  // Check the element of look up table is 64-bit or not
-  unsigned Vec0Idx = IsExt ? 2 : 1;
-  assert(!N->getOperand(Vec0Idx + 0).getValueType().is64BitVector() &&
-         "The element of lookup table for vtbl and vtbx must be 128-bit");
+    ConstantPoolSDNode *CN =
+        dyn_cast<ConstantPoolSDNode>(LN->getOperand(1)->getOperand(1));
+    FVal = cast<ConstantFP>(CN->getConstVal())->getValueAPF();
+  } else
+    return false;
 
-  // Check the return value type is 64-bit or not
-  EVT ResVT = N->getValueType(0);
-  bool is64BitRes = ResVT.is64BitVector();
+  // An FCVT[SU] instruction performs: convertToInt(Val * 2^fbits) where fbits
+  // is between 1 and 32 for a destination w-register, or 1 and 64 for an
+  // x-register.
+  //
+  // By this stage, we've detected (fp_to_[su]int (fmul Val, THIS_NODE)) so we
+  // want THIS_NODE to be 2^fbits. This is much easier to deal with using
+  // integers.
+  bool IsExact;
 
-  // Create new SDValue for vector list
-  SmallVector<SDValue, 4> Regs(N->op_begin() + Vec0Idx,
-                               N->op_begin() + Vec0Idx + NumVecs);
-  SDValue TblReg = createQTuple(Regs);
-  unsigned Opc = getTBLOpc(IsExt, is64BitRes, NumVecs);
+  // fbits is between 1 and 64 in the worst-case, which means the fmul
+  // could have 2^64 as an actual operand. Need 65 bits of precision.
+  APSInt IntVal(65, true);
+  FVal.convertToInteger(IntVal, APFloat::rmTowardZero, &IsExact);
 
-  SmallVector<SDValue, 3> Ops;
-  if (IsExt)
-    Ops.push_back(N->getOperand(1));
-  Ops.push_back(TblReg);
-  Ops.push_back(N->getOperand(Vec0Idx + NumVecs));
-  return CurDAG->getMachineNode(Opc, dl, ResVT, Ops);
+  // N.b. isPowerOf2 also checks for > 0.
+  if (!IsExact || !IntVal.isPowerOf2()) return false;
+  unsigned FBits = IntVal.logBase2();
+
+  // Checks above should have guaranteed that we haven't lost information in
+  // finding FBits, but it must still be in range.
+  if (FBits == 0 || FBits > RegWidth) return false;
+
+  FixedPos = CurDAG->getTargetConstant(FBits, MVT::i32);
+  return true;
 }
 
 SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
   // Dump information about the Node being selected
-  DEBUG(dbgs() << "Selecting: "; Node->dump(CurDAG); dbgs() << "\n");
+  DEBUG(errs() << "Selecting: ");
+  DEBUG(Node->dump(CurDAG));
+  DEBUG(errs() << "\n");
 
+  // If we have a custom node, we already have selected!
   if (Node->isMachineOpcode()) {
-    DEBUG(dbgs() << "== "; Node->dump(CurDAG); dbgs() << "\n");
+    DEBUG(errs() << "== "; Node->dump(CurDAG); errs() << "\n");
     Node->setNodeId(-1);
-    return NULL;
-  }
-
-  switch (Node->getOpcode()) {
-  case ISD::ATOMIC_LOAD_ADD:
-    return SelectAtomic(Node,
-                        AArch64::ATOMIC_LOAD_ADD_I8,
-                        AArch64::ATOMIC_LOAD_ADD_I16,
-                        AArch64::ATOMIC_LOAD_ADD_I32,
-                        AArch64::ATOMIC_LOAD_ADD_I64);
-  case ISD::ATOMIC_LOAD_SUB:
-    return SelectAtomic(Node,
-                        AArch64::ATOMIC_LOAD_SUB_I8,
-                        AArch64::ATOMIC_LOAD_SUB_I16,
-                        AArch64::ATOMIC_LOAD_SUB_I32,
-                        AArch64::ATOMIC_LOAD_SUB_I64);
-  case ISD::ATOMIC_LOAD_AND:
-    return SelectAtomic(Node,
-                        AArch64::ATOMIC_LOAD_AND_I8,
-                        AArch64::ATOMIC_LOAD_AND_I16,
-                        AArch64::ATOMIC_LOAD_AND_I32,
-                        AArch64::ATOMIC_LOAD_AND_I64);
-  case ISD::ATOMIC_LOAD_OR:
-    return SelectAtomic(Node,
-                        AArch64::ATOMIC_LOAD_OR_I8,
-                        AArch64::ATOMIC_LOAD_OR_I16,
-                        AArch64::ATOMIC_LOAD_OR_I32,
-                        AArch64::ATOMIC_LOAD_OR_I64);
-  case ISD::ATOMIC_LOAD_XOR:
-    return SelectAtomic(Node,
-                        AArch64::ATOMIC_LOAD_XOR_I8,
-                        AArch64::ATOMIC_LOAD_XOR_I16,
-                        AArch64::ATOMIC_LOAD_XOR_I32,
-                        AArch64::ATOMIC_LOAD_XOR_I64);
-  case ISD::ATOMIC_LOAD_NAND:
-    return SelectAtomic(Node,
-                        AArch64::ATOMIC_LOAD_NAND_I8,
-                        AArch64::ATOMIC_LOAD_NAND_I16,
-                        AArch64::ATOMIC_LOAD_NAND_I32,
-                        AArch64::ATOMIC_LOAD_NAND_I64);
-  case ISD::ATOMIC_LOAD_MIN:
-    return SelectAtomic(Node,
-                        AArch64::ATOMIC_LOAD_MIN_I8,
-                        AArch64::ATOMIC_LOAD_MIN_I16,
-                        AArch64::ATOMIC_LOAD_MIN_I32,
-                        AArch64::ATOMIC_LOAD_MIN_I64);
-  case ISD::ATOMIC_LOAD_MAX:
-    return SelectAtomic(Node,
-                        AArch64::ATOMIC_LOAD_MAX_I8,
-                        AArch64::ATOMIC_LOAD_MAX_I16,
-                        AArch64::ATOMIC_LOAD_MAX_I32,
-                        AArch64::ATOMIC_LOAD_MAX_I64);
-  case ISD::ATOMIC_LOAD_UMIN:
-    return SelectAtomic(Node,
-                        AArch64::ATOMIC_LOAD_UMIN_I8,
-                        AArch64::ATOMIC_LOAD_UMIN_I16,
-                        AArch64::ATOMIC_LOAD_UMIN_I32,
-                        AArch64::ATOMIC_LOAD_UMIN_I64);
-  case ISD::ATOMIC_LOAD_UMAX:
-    return SelectAtomic(Node,
-                        AArch64::ATOMIC_LOAD_UMAX_I8,
-                        AArch64::ATOMIC_LOAD_UMAX_I16,
-                        AArch64::ATOMIC_LOAD_UMAX_I32,
-                        AArch64::ATOMIC_LOAD_UMAX_I64);
-  case ISD::ATOMIC_SWAP:
-    return SelectAtomic(Node,
-                        AArch64::ATOMIC_SWAP_I8,
-                        AArch64::ATOMIC_SWAP_I16,
-                        AArch64::ATOMIC_SWAP_I32,
-                        AArch64::ATOMIC_SWAP_I64);
-  case ISD::ATOMIC_CMP_SWAP:
-    return SelectAtomic(Node,
-                        AArch64::ATOMIC_CMP_SWAP_I8,
-                        AArch64::ATOMIC_CMP_SWAP_I16,
-                        AArch64::ATOMIC_CMP_SWAP_I32,
-                        AArch64::ATOMIC_CMP_SWAP_I64);
-  case ISD::FrameIndex: {
-    int FI = cast<FrameIndexSDNode>(Node)->getIndex();
-    EVT PtrTy = getTargetLowering()->getPointerTy();
-    SDValue TFI = CurDAG->getTargetFrameIndex(FI, PtrTy);
-    return CurDAG->SelectNodeTo(Node, AArch64::ADDxxi_lsl0_s, PtrTy,
-                                TFI, CurDAG->getTargetConstant(0, PtrTy));
+    return nullptr;
   }
-  case ISD::Constant: {
-    SDNode *ResNode = 0;
-    if (cast<ConstantSDNode>(Node)->getZExtValue() == 0) {
-      // XZR and WZR are probably even better than an actual move: most of the
-      // time they can be folded into another instruction with *no* cost.
-
-      EVT Ty = Node->getValueType(0);
-      assert((Ty == MVT::i32 || Ty == MVT::i64) && "unexpected type");
-      uint16_t Register = Ty == MVT::i32 ? AArch64::WZR : AArch64::XZR;
-      ResNode = CurDAG->getCopyFromReg(CurDAG->getEntryNode(),
-                                       SDLoc(Node),
-                                       Register, Ty).getNode();
-    }
 
-    // Next best option is a move-immediate, see if we can do that.
-    if (!ResNode) {
-      ResNode = TrySelectToMoveImm(Node);
-    }
+  // Few custom selection stuff.
+  SDNode *ResNode = nullptr;
+  EVT VT = Node->getValueType(0);
 
-    if (ResNode)
-      return ResNode;
+  switch (Node->getOpcode()) {
+  default:
+    break;
 
-    // If even that fails we fall back to a lit-pool entry at the moment. Future
-    // tuning may change this to a sequence of MOVZ/MOVN/MOVK instructions.
-    ResNode = SelectToLitPool(Node);
-    assert(ResNode && "We need *some* way to materialise a constant");
+  case ISD::ADD:
+    if (SDNode *I = SelectMLAV64LaneV128(Node))
+      return I;
+    break;
 
-    // We want to continue selection at this point since the litpool access
-    // generated used generic nodes for simplicity.
-    ReplaceUses(SDValue(Node, 0), SDValue(ResNode, 0));
-    Node = ResNode;
+  case ISD::LOAD: {
+    // Try to select as an indexed load. Fall through to normal processing
+    // if we can't.
+    bool Done = false;
+    SDNode *I = SelectIndexedLoad(Node, Done);
+    if (Done)
+      return I;
     break;
   }
-  case ISD::ConstantFP: {
-    if (A64Imms::isFPImm(cast<ConstantFPSDNode>(Node)->getValueAPF())) {
-      // FMOV will take care of it from TableGen
-      break;
-    }
 
-    SDNode *ResNode = LowerToFPLitPool(Node);
-    ReplaceUses(SDValue(Node, 0), SDValue(ResNode, 0));
+  case ISD::SRL:
+  case ISD::AND:
+  case ISD::SRA:
+    if (SDNode *I = SelectBitfieldExtractOp(Node))
+      return I;
+    break;
 
-    // We want to continue selection at this point since the litpool access
-    // generated used generic nodes for simplicity.
-    Node = ResNode;
+  case ISD::OR:
+    if (SDNode *I = SelectBitfieldInsertOp(Node))
+      return I;
     break;
+
+  case ISD::EXTRACT_VECTOR_ELT: {
+    // Extracting lane zero is a special case where we can just use a plain
+    // EXTRACT_SUBREG instruction, which will become FMOV. This is easier for
+    // the rest of the compiler, especially the register allocator and copyi
+    // propagation, to reason about, so is preferred when it's possible to
+    // use it.
+    ConstantSDNode *LaneNode = cast<ConstantSDNode>(Node->getOperand(1));
+    // Bail and use the default Select() for non-zero lanes.
+    if (LaneNode->getZExtValue() != 0)
+      break;
+    // If the element type is not the same as the result type, likewise
+    // bail and use the default Select(), as there's more to do than just
+    // a cross-class COPY. This catches extracts of i8 and i16 elements
+    // since they will need an explicit zext.
+    if (VT != Node->getOperand(0).getValueType().getVectorElementType())
+      break;
+    unsigned SubReg;
+    switch (Node->getOperand(0)
+                .getValueType()
+                .getVectorElementType()
+                .getSizeInBits()) {
+    default:
+      assert(0 && "Unexpected vector element type!");
+    case 64:
+      SubReg = AArch64::dsub;
+      break;
+    case 32:
+      SubReg = AArch64::ssub;
+      break;
+    case 16: // FALLTHROUGH
+    case 8:
+      llvm_unreachable("unexpected zext-requiring extract element!");
+    }
+    SDValue Extract = CurDAG->getTargetExtractSubreg(SubReg, SDLoc(Node), VT,
+                                                     Node->getOperand(0));
+    DEBUG(dbgs() << "ISEL: Custom selection!\n=> ");
+    DEBUG(Extract->dumpr(CurDAG));
+    DEBUG(dbgs() << "\n");
+    return Extract.getNode();
   }
-  case AArch64ISD::NEON_LD1_UPD: {
-    static const uint16_t Opcodes[] = {
-      AArch64::LD1WB_8B_fixed,  AArch64::LD1WB_4H_fixed,
-      AArch64::LD1WB_2S_fixed,  AArch64::LD1WB_1D_fixed,
-      AArch64::LD1WB_16B_fixed, AArch64::LD1WB_8H_fixed,
-      AArch64::LD1WB_4S_fixed,  AArch64::LD1WB_2D_fixed
-    };
-    return SelectVLD(Node, true, 1, Opcodes);
-  }
-  case AArch64ISD::NEON_LD2_UPD: {
-    static const uint16_t Opcodes[] = {
-      AArch64::LD2WB_8B_fixed,  AArch64::LD2WB_4H_fixed,
-      AArch64::LD2WB_2S_fixed,  AArch64::LD1x2WB_1D_fixed,
-      AArch64::LD2WB_16B_fixed, AArch64::LD2WB_8H_fixed,
-      AArch64::LD2WB_4S_fixed,  AArch64::LD2WB_2D_fixed
-    };
-    return SelectVLD(Node, true, 2, Opcodes);
-  }
-  case AArch64ISD::NEON_LD3_UPD: {
-    static const uint16_t Opcodes[] = {
-      AArch64::LD3WB_8B_fixed,  AArch64::LD3WB_4H_fixed,
-      AArch64::LD3WB_2S_fixed,  AArch64::LD1x3WB_1D_fixed,
-      AArch64::LD3WB_16B_fixed, AArch64::LD3WB_8H_fixed,
-      AArch64::LD3WB_4S_fixed,  AArch64::LD3WB_2D_fixed
-    };
-    return SelectVLD(Node, true, 3, Opcodes);
-  }
-  case AArch64ISD::NEON_LD4_UPD: {
-    static const uint16_t Opcodes[] = {
-      AArch64::LD4WB_8B_fixed,  AArch64::LD4WB_4H_fixed,
-      AArch64::LD4WB_2S_fixed,  AArch64::LD1x4WB_1D_fixed,
-      AArch64::LD4WB_16B_fixed, AArch64::LD4WB_8H_fixed,
-      AArch64::LD4WB_4S_fixed,  AArch64::LD4WB_2D_fixed
-    };
-    return SelectVLD(Node, true, 4, Opcodes);
-  }
-  case AArch64ISD::NEON_LD1x2_UPD: {
-    static const uint16_t Opcodes[] = {
-      AArch64::LD1x2WB_8B_fixed,  AArch64::LD1x2WB_4H_fixed,
-      AArch64::LD1x2WB_2S_fixed,  AArch64::LD1x2WB_1D_fixed,
-      AArch64::LD1x2WB_16B_fixed, AArch64::LD1x2WB_8H_fixed,
-      AArch64::LD1x2WB_4S_fixed,  AArch64::LD1x2WB_2D_fixed
-    };
-    return SelectVLD(Node, true, 2, Opcodes);
-  }
-  case AArch64ISD::NEON_LD1x3_UPD: {
-    static const uint16_t Opcodes[] = {
-      AArch64::LD1x3WB_8B_fixed,  AArch64::LD1x3WB_4H_fixed,
-      AArch64::LD1x3WB_2S_fixed,  AArch64::LD1x3WB_1D_fixed,
-      AArch64::LD1x3WB_16B_fixed, AArch64::LD1x3WB_8H_fixed,
-      AArch64::LD1x3WB_4S_fixed,  AArch64::LD1x3WB_2D_fixed
-    };
-    return SelectVLD(Node, true, 3, Opcodes);
-  }
-  case AArch64ISD::NEON_LD1x4_UPD: {
-    static const uint16_t Opcodes[] = {
-      AArch64::LD1x4WB_8B_fixed,  AArch64::LD1x4WB_4H_fixed,
-      AArch64::LD1x4WB_2S_fixed,  AArch64::LD1x4WB_1D_fixed,
-      AArch64::LD1x4WB_16B_fixed, AArch64::LD1x4WB_8H_fixed,
-      AArch64::LD1x4WB_4S_fixed,  AArch64::LD1x4WB_2D_fixed
-    };
-    return SelectVLD(Node, true, 4, Opcodes);
-  }
-  case AArch64ISD::NEON_ST1_UPD: {
-    static const uint16_t Opcodes[] = {
-      AArch64::ST1WB_8B_fixed,  AArch64::ST1WB_4H_fixed,
-      AArch64::ST1WB_2S_fixed,  AArch64::ST1WB_1D_fixed,
-      AArch64::ST1WB_16B_fixed, AArch64::ST1WB_8H_fixed,
-      AArch64::ST1WB_4S_fixed,  AArch64::ST1WB_2D_fixed
-    };
-    return SelectVST(Node, true, 1, Opcodes);
-  }
-  case AArch64ISD::NEON_ST2_UPD: {
-    static const uint16_t Opcodes[] = {
-      AArch64::ST2WB_8B_fixed,  AArch64::ST2WB_4H_fixed,
-      AArch64::ST2WB_2S_fixed,  AArch64::ST1x2WB_1D_fixed,
-      AArch64::ST2WB_16B_fixed, AArch64::ST2WB_8H_fixed,
-      AArch64::ST2WB_4S_fixed,  AArch64::ST2WB_2D_fixed
-    };
-    return SelectVST(Node, true, 2, Opcodes);
-  }
-  case AArch64ISD::NEON_ST3_UPD: {
-    static const uint16_t Opcodes[] = {
-      AArch64::ST3WB_8B_fixed,  AArch64::ST3WB_4H_fixed,
-      AArch64::ST3WB_2S_fixed,  AArch64::ST1x3WB_1D_fixed,
-      AArch64::ST3WB_16B_fixed, AArch64::ST3WB_8H_fixed,
-      AArch64::ST3WB_4S_fixed,  AArch64::ST3WB_2D_fixed
-    };
-    return SelectVST(Node, true, 3, Opcodes);
-  }
-  case AArch64ISD::NEON_ST4_UPD: {
-    static const uint16_t Opcodes[] = {
-      AArch64::ST4WB_8B_fixed,  AArch64::ST4WB_4H_fixed,
-      AArch64::ST4WB_2S_fixed,  AArch64::ST1x4WB_1D_fixed,
-      AArch64::ST4WB_16B_fixed, AArch64::ST4WB_8H_fixed,
-      AArch64::ST4WB_4S_fixed,  AArch64::ST4WB_2D_fixed
-    };
-    return SelectVST(Node, true, 4, Opcodes);
-  }
-  case AArch64ISD::NEON_LD2DUP: {
-    static const uint16_t Opcodes[] = {
-        AArch64::LD2R_8B, AArch64::LD2R_4H, AArch64::LD2R_2S,
-        AArch64::LD2R_1D, AArch64::LD2R_16B, AArch64::LD2R_8H,
-        AArch64::LD2R_4S, AArch64::LD2R_2D
-    };
-    return SelectVLDDup(Node, false, 2, Opcodes);
-  }
-  case AArch64ISD::NEON_LD3DUP: {
-    static const uint16_t Opcodes[] = {
-        AArch64::LD3R_8B, AArch64::LD3R_4H, AArch64::LD3R_2S,
-        AArch64::LD3R_1D, AArch64::LD3R_16B, AArch64::LD3R_8H,
-        AArch64::LD3R_4S, AArch64::LD3R_2D
-    };
-    return SelectVLDDup(Node, false, 3, Opcodes);
-  }
-  case AArch64ISD::NEON_LD4DUP: {
-    static const uint16_t Opcodes[] = {
-        AArch64::LD4R_8B, AArch64::LD4R_4H, AArch64::LD4R_2S,
-        AArch64::LD4R_1D, AArch64::LD4R_16B, AArch64::LD4R_8H,
-        AArch64::LD4R_4S, AArch64::LD4R_2D
-    };
-    return SelectVLDDup(Node, false, 4, Opcodes);
-  }
-  case AArch64ISD::NEON_LD2DUP_UPD: {
-    static const uint16_t Opcodes[] = {
-      AArch64::LD2R_WB_8B_fixed,  AArch64::LD2R_WB_4H_fixed,
-      AArch64::LD2R_WB_2S_fixed,  AArch64::LD2R_WB_1D_fixed,
-      AArch64::LD2R_WB_16B_fixed, AArch64::LD2R_WB_8H_fixed,
-      AArch64::LD2R_WB_4S_fixed,  AArch64::LD2R_WB_2D_fixed
-    };
-    return SelectVLDDup(Node, true, 2, Opcodes);
-  }
-  case AArch64ISD::NEON_LD3DUP_UPD: {
-    static const uint16_t Opcodes[] = {
-      AArch64::LD3R_WB_8B_fixed,  AArch64::LD3R_WB_4H_fixed,
-      AArch64::LD3R_WB_2S_fixed,  AArch64::LD3R_WB_1D_fixed,
-      AArch64::LD3R_WB_16B_fixed, AArch64::LD3R_WB_8H_fixed,
-      AArch64::LD3R_WB_4S_fixed,  AArch64::LD3R_WB_2D_fixed
-    };
-    return SelectVLDDup(Node, true, 3, Opcodes);
-  }
-  case AArch64ISD::NEON_LD4DUP_UPD: {
-    static const uint16_t Opcodes[] = {
-      AArch64::LD4R_WB_8B_fixed,  AArch64::LD4R_WB_4H_fixed,
-      AArch64::LD4R_WB_2S_fixed,  AArch64::LD4R_WB_1D_fixed,
-      AArch64::LD4R_WB_16B_fixed, AArch64::LD4R_WB_8H_fixed,
-      AArch64::LD4R_WB_4S_fixed,  AArch64::LD4R_WB_2D_fixed
-    };
-    return SelectVLDDup(Node, true, 4, Opcodes);
-  }
-  case AArch64ISD::NEON_LD2LN_UPD: {
-    static const uint16_t Opcodes[] = {
-        AArch64::LD2LN_WB_B_fixed, AArch64::LD2LN_WB_H_fixed,
-        AArch64::LD2LN_WB_S_fixed, AArch64::LD2LN_WB_D_fixed
-    };
-    return SelectVLDSTLane(Node, true, true, 2, Opcodes);
-  }
-  case AArch64ISD::NEON_LD3LN_UPD: {
-    static const uint16_t Opcodes[] = {
-        AArch64::LD3LN_WB_B_fixed, AArch64::LD3LN_WB_H_fixed,
-        AArch64::LD3LN_WB_S_fixed, AArch64::LD3LN_WB_D_fixed
-    };
-    return SelectVLDSTLane(Node, true, true, 3, Opcodes);
-  }
-  case AArch64ISD::NEON_LD4LN_UPD: {
-    static const uint16_t Opcodes[] = {
-        AArch64::LD4LN_WB_B_fixed, AArch64::LD4LN_WB_H_fixed,
-        AArch64::LD4LN_WB_S_fixed, AArch64::LD4LN_WB_D_fixed
-    };
-    return SelectVLDSTLane(Node, true, true, 4, Opcodes);
-  }
-  case AArch64ISD::NEON_ST2LN_UPD: {
-    static const uint16_t Opcodes[] = {
-        AArch64::ST2LN_WB_B_fixed, AArch64::ST2LN_WB_H_fixed,
-        AArch64::ST2LN_WB_S_fixed, AArch64::ST2LN_WB_D_fixed
-    };
-    return SelectVLDSTLane(Node, false, true, 2, Opcodes);
-  }
-  case AArch64ISD::NEON_ST3LN_UPD: {
-    static const uint16_t Opcodes[] = {
-        AArch64::ST3LN_WB_B_fixed, AArch64::ST3LN_WB_H_fixed,
-        AArch64::ST3LN_WB_S_fixed, AArch64::ST3LN_WB_D_fixed
-    };
-    return SelectVLDSTLane(Node, false, true, 3, Opcodes);
-  }
-  case AArch64ISD::NEON_ST4LN_UPD: {
-    static const uint16_t Opcodes[] = {
-        AArch64::ST4LN_WB_B_fixed, AArch64::ST4LN_WB_H_fixed,
-        AArch64::ST4LN_WB_S_fixed, AArch64::ST4LN_WB_D_fixed
-    };
-    return SelectVLDSTLane(Node, false, true, 4, Opcodes);
-  }
-  case AArch64ISD::NEON_ST1x2_UPD: {
-    static const uint16_t Opcodes[] = {
-      AArch64::ST1x2WB_8B_fixed,  AArch64::ST1x2WB_4H_fixed,
-      AArch64::ST1x2WB_2S_fixed,  AArch64::ST1x2WB_1D_fixed,
-      AArch64::ST1x2WB_16B_fixed, AArch64::ST1x2WB_8H_fixed,
-      AArch64::ST1x2WB_4S_fixed,  AArch64::ST1x2WB_2D_fixed
-    };
-    return SelectVST(Node, true, 2, Opcodes);
-  }
-  case AArch64ISD::NEON_ST1x3_UPD: {
-    static const uint16_t Opcodes[] = {
-      AArch64::ST1x3WB_8B_fixed,  AArch64::ST1x3WB_4H_fixed,
-      AArch64::ST1x3WB_2S_fixed,  AArch64::ST1x3WB_1D_fixed,
-      AArch64::ST1x3WB_16B_fixed, AArch64::ST1x3WB_8H_fixed,
-      AArch64::ST1x3WB_4S_fixed,  AArch64::ST1x3WB_2D_fixed
-    };
-    return SelectVST(Node, true, 3, Opcodes);
-  }
-  case AArch64ISD::NEON_ST1x4_UPD: {
-    static const uint16_t Opcodes[] = {
-      AArch64::ST1x4WB_8B_fixed,  AArch64::ST1x4WB_4H_fixed,
-      AArch64::ST1x4WB_2S_fixed,  AArch64::ST1x4WB_1D_fixed,
-      AArch64::ST1x4WB_16B_fixed, AArch64::ST1x4WB_8H_fixed,
-      AArch64::ST1x4WB_4S_fixed,  AArch64::ST1x4WB_2D_fixed
-    };
-    return SelectVST(Node, true, 4, Opcodes);
-  }
-  case ISD::INTRINSIC_WO_CHAIN: {
-    unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(0))->getZExtValue();
-    bool IsExt = false;
-    switch (IntNo) {
-      default:
-        break;
-      case Intrinsic::aarch64_neon_vtbx1:
-        IsExt = true;
-      case Intrinsic::aarch64_neon_vtbl1:
-        return SelectVTBL(Node, 1, IsExt);
-      case Intrinsic::aarch64_neon_vtbx2:
-        IsExt = true;
-      case Intrinsic::aarch64_neon_vtbl2:
-        return SelectVTBL(Node, 2, IsExt);
-      case Intrinsic::aarch64_neon_vtbx3:
-        IsExt = true;
-      case Intrinsic::aarch64_neon_vtbl3:
-        return SelectVTBL(Node, 3, IsExt);
-      case Intrinsic::aarch64_neon_vtbx4:
-        IsExt = true;
-      case Intrinsic::aarch64_neon_vtbl4:
-        return SelectVTBL(Node, 4, IsExt);
+  case ISD::Constant: {
+    // Materialize zero constants as copies from WZR/XZR.  This allows
+    // the coalescer to propagate these into other instructions.
+    ConstantSDNode *ConstNode = cast<ConstantSDNode>(Node);
+    if (ConstNode->isNullValue()) {
+      if (VT == MVT::i32)
+        return CurDAG->getCopyFromReg(CurDAG->getEntryNode(), SDLoc(Node),
+                                      AArch64::WZR, MVT::i32).getNode();
+      else if (VT == MVT::i64)
+        return CurDAG->getCopyFromReg(CurDAG->getEntryNode(), SDLoc(Node),
+                                      AArch64::XZR, MVT::i64).getNode();
     }
     break;
   }
-  case ISD::INTRINSIC_VOID:
+
+  case ISD::FrameIndex: {
+    // Selects to ADDXri FI, 0 which in turn will become ADDXri SP, imm.
+    int FI = cast<FrameIndexSDNode>(Node)->getIndex();
+    unsigned Shifter = AArch64_AM::getShifterImm(AArch64_AM::LSL, 0);
+    const TargetLowering *TLI = getTargetLowering();
+    SDValue TFI = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy());
+    SDValue Ops[] = { TFI, CurDAG->getTargetConstant(0, MVT::i32),
+                      CurDAG->getTargetConstant(Shifter, MVT::i32) };
+    return CurDAG->SelectNodeTo(Node, AArch64::ADDXri, MVT::i64, Ops);
+  }
   case ISD::INTRINSIC_W_CHAIN: {
     unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
     switch (IntNo) {
     default:
       break;
-    case Intrinsic::arm_neon_vld1: {
-      static const uint16_t Opcodes[] = {
-          AArch64::LD1_8B,  AArch64::LD1_4H, AArch64::LD1_2S, AArch64::LD1_1D,
-          AArch64::LD1_16B, AArch64::LD1_8H, AArch64::LD1_4S, AArch64::LD1_2D
-      };
-      return SelectVLD(Node, false, 1, Opcodes);
-    }
-    case Intrinsic::arm_neon_vld2: {
-      static const uint16_t Opcodes[] = {
-          AArch64::LD2_8B,  AArch64::LD2_4H, AArch64::LD2_2S, AArch64::LD1x2_1D,
-          AArch64::LD2_16B, AArch64::LD2_8H, AArch64::LD2_4S, AArch64::LD2_2D
-      };
-      return SelectVLD(Node, false, 2, Opcodes);
-    }
-    case Intrinsic::arm_neon_vld3: {
-      static const uint16_t Opcodes[] = {
-          AArch64::LD3_8B,  AArch64::LD3_4H, AArch64::LD3_2S, AArch64::LD1x3_1D,
-          AArch64::LD3_16B, AArch64::LD3_8H, AArch64::LD3_4S, AArch64::LD3_2D
-      };
-      return SelectVLD(Node, false, 3, Opcodes);
+    case Intrinsic::aarch64_ldaxp:
+    case Intrinsic::aarch64_ldxp: {
+      unsigned Op =
+          IntNo == Intrinsic::aarch64_ldaxp ? AArch64::LDAXPX : AArch64::LDXPX;
+      SDValue MemAddr = Node->getOperand(2);
+      SDLoc DL(Node);
+      SDValue Chain = Node->getOperand(0);
+
+      SDNode *Ld = CurDAG->getMachineNode(Op, DL, MVT::i64, MVT::i64,
+                                          MVT::Other, MemAddr, Chain);
+
+      // Transfer memoperands.
+      MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+      MemOp[0] = cast<MemIntrinsicSDNode>(Node)->getMemOperand();
+      cast<MachineSDNode>(Ld)->setMemRefs(MemOp, MemOp + 1);
+      return Ld;
     }
-    case Intrinsic::arm_neon_vld4: {
-      static const uint16_t Opcodes[] = {
-          AArch64::LD4_8B,  AArch64::LD4_4H, AArch64::LD4_2S, AArch64::LD1x4_1D,
-          AArch64::LD4_16B, AArch64::LD4_8H, AArch64::LD4_4S, AArch64::LD4_2D
-      };
-      return SelectVLD(Node, false, 4, Opcodes);
+    case Intrinsic::aarch64_stlxp:
+    case Intrinsic::aarch64_stxp: {
+      unsigned Op =
+          IntNo == Intrinsic::aarch64_stlxp ? AArch64::STLXPX : AArch64::STXPX;
+      SDLoc DL(Node);
+      SDValue Chain = Node->getOperand(0);
+      SDValue ValLo = Node->getOperand(2);
+      SDValue ValHi = Node->getOperand(3);
+      SDValue MemAddr = Node->getOperand(4);
+
+      // Place arguments in the right order.
+      SmallVector<SDValue, 7> Ops;
+      Ops.push_back(ValLo);
+      Ops.push_back(ValHi);
+      Ops.push_back(MemAddr);
+      Ops.push_back(Chain);
+
+      SDNode *St = CurDAG->getMachineNode(Op, DL, MVT::i32, MVT::Other, Ops);
+      // Transfer memoperands.
+      MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+      MemOp[0] = cast<MemIntrinsicSDNode>(Node)->getMemOperand();
+      cast<MachineSDNode>(St)->setMemRefs(MemOp, MemOp + 1);
+
+      return St;
     }
-    case Intrinsic::aarch64_neon_vld1x2: {
-      static const uint16_t Opcodes[] = {
-          AArch64::LD1x2_8B, AArch64::LD1x2_4H,  AArch64::LD1x2_2S,
-          AArch64::LD1x2_1D, AArch64::LD1x2_16B, AArch64::LD1x2_8H,
-          AArch64::LD1x2_4S, AArch64::LD1x2_2D
-      };
-      return SelectVLD(Node, false, 2, Opcodes);
-    }
-    case Intrinsic::aarch64_neon_vld1x3: {
-      static const uint16_t Opcodes[] = {
-          AArch64::LD1x3_8B, AArch64::LD1x3_4H,  AArch64::LD1x3_2S,
-          AArch64::LD1x3_1D, AArch64::LD1x3_16B, AArch64::LD1x3_8H,
-          AArch64::LD1x3_4S, AArch64::LD1x3_2D
-      };
-      return SelectVLD(Node, false, 3, Opcodes);
-    }
-    case Intrinsic::aarch64_neon_vld1x4: {
-      static const uint16_t Opcodes[] = {
-          AArch64::LD1x4_8B, AArch64::LD1x4_4H,  AArch64::LD1x4_2S,
-          AArch64::LD1x4_1D, AArch64::LD1x4_16B, AArch64::LD1x4_8H,
-          AArch64::LD1x4_4S, AArch64::LD1x4_2D
-      };
-      return SelectVLD(Node, false, 4, Opcodes);
-    }
-    case Intrinsic::arm_neon_vst1: {
-      static const uint16_t Opcodes[] = {
-          AArch64::ST1_8B,  AArch64::ST1_4H, AArch64::ST1_2S, AArch64::ST1_1D,
-          AArch64::ST1_16B, AArch64::ST1_8H, AArch64::ST1_4S, AArch64::ST1_2D
-      };
-      return SelectVST(Node, false, 1, Opcodes);
-    }
-    case Intrinsic::arm_neon_vst2: {
-      static const uint16_t Opcodes[] = {
-          AArch64::ST2_8B,  AArch64::ST2_4H, AArch64::ST2_2S, AArch64::ST1x2_1D,
-          AArch64::ST2_16B, AArch64::ST2_8H, AArch64::ST2_4S, AArch64::ST2_2D
-      };
-      return SelectVST(Node, false, 2, Opcodes);
+    case Intrinsic::aarch64_neon_ld1x2:
+      if (VT == MVT::v8i8)
+        return SelectLoad(Node, 2, AArch64::LD1Twov8b, AArch64::dsub0);
+      else if (VT == MVT::v16i8)
+        return SelectLoad(Node, 2, AArch64::LD1Twov16b, AArch64::qsub0);
+      else if (VT == MVT::v4i16)
+        return SelectLoad(Node, 2, AArch64::LD1Twov4h, AArch64::dsub0);
+      else if (VT == MVT::v8i16)
+        return SelectLoad(Node, 2, AArch64::LD1Twov8h, AArch64::qsub0);
+      else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+        return SelectLoad(Node, 2, AArch64::LD1Twov2s, AArch64::dsub0);
+      else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+        return SelectLoad(Node, 2, AArch64::LD1Twov4s, AArch64::qsub0);
+      else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+        return SelectLoad(Node, 2, AArch64::LD1Twov1d, AArch64::dsub0);
+      else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+        return SelectLoad(Node, 2, AArch64::LD1Twov2d, AArch64::qsub0);
+      break;
+    case Intrinsic::aarch64_neon_ld1x3:
+      if (VT == MVT::v8i8)
+        return SelectLoad(Node, 3, AArch64::LD1Threev8b, AArch64::dsub0);
+      else if (VT == MVT::v16i8)
+        return SelectLoad(Node, 3, AArch64::LD1Threev16b, AArch64::qsub0);
+      else if (VT == MVT::v4i16)
+        return SelectLoad(Node, 3, AArch64::LD1Threev4h, AArch64::dsub0);
+      else if (VT == MVT::v8i16)
+        return SelectLoad(Node, 3, AArch64::LD1Threev8h, AArch64::qsub0);
+      else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+        return SelectLoad(Node, 3, AArch64::LD1Threev2s, AArch64::dsub0);
+      else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+        return SelectLoad(Node, 3, AArch64::LD1Threev4s, AArch64::qsub0);
+      else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+        return SelectLoad(Node, 3, AArch64::LD1Threev1d, AArch64::dsub0);
+      else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+        return SelectLoad(Node, 3, AArch64::LD1Threev2d, AArch64::qsub0);
+      break;
+    case Intrinsic::aarch64_neon_ld1x4:
+      if (VT == MVT::v8i8)
+        return SelectLoad(Node, 4, AArch64::LD1Fourv8b, AArch64::dsub0);
+      else if (VT == MVT::v16i8)
+        return SelectLoad(Node, 4, AArch64::LD1Fourv16b, AArch64::qsub0);
+      else if (VT == MVT::v4i16)
+        return SelectLoad(Node, 4, AArch64::LD1Fourv4h, AArch64::dsub0);
+      else if (VT == MVT::v8i16)
+        return SelectLoad(Node, 4, AArch64::LD1Fourv8h, AArch64::qsub0);
+      else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+        return SelectLoad(Node, 4, AArch64::LD1Fourv2s, AArch64::dsub0);
+      else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+        return SelectLoad(Node, 4, AArch64::LD1Fourv4s, AArch64::qsub0);
+      else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+        return SelectLoad(Node, 4, AArch64::LD1Fourv1d, AArch64::dsub0);
+      else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+        return SelectLoad(Node, 4, AArch64::LD1Fourv2d, AArch64::qsub0);
+      break;
+    case Intrinsic::aarch64_neon_ld2:
+      if (VT == MVT::v8i8)
+        return SelectLoad(Node, 2, AArch64::LD2Twov8b, AArch64::dsub0);
+      else if (VT == MVT::v16i8)
+        return SelectLoad(Node, 2, AArch64::LD2Twov16b, AArch64::qsub0);
+      else if (VT == MVT::v4i16)
+        return SelectLoad(Node, 2, AArch64::LD2Twov4h, AArch64::dsub0);
+      else if (VT == MVT::v8i16)
+        return SelectLoad(Node, 2, AArch64::LD2Twov8h, AArch64::qsub0);
+      else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+        return SelectLoad(Node, 2, AArch64::LD2Twov2s, AArch64::dsub0);
+      else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+        return SelectLoad(Node, 2, AArch64::LD2Twov4s, AArch64::qsub0);
+      else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+        return SelectLoad(Node, 2, AArch64::LD1Twov1d, AArch64::dsub0);
+      else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+        return SelectLoad(Node, 2, AArch64::LD2Twov2d, AArch64::qsub0);
+      break;
+    case Intrinsic::aarch64_neon_ld3:
+      if (VT == MVT::v8i8)
+        return SelectLoad(Node, 3, AArch64::LD3Threev8b, AArch64::dsub0);
+      else if (VT == MVT::v16i8)
+        return SelectLoad(Node, 3, AArch64::LD3Threev16b, AArch64::qsub0);
+      else if (VT == MVT::v4i16)
+        return SelectLoad(Node, 3, AArch64::LD3Threev4h, AArch64::dsub0);
+      else if (VT == MVT::v8i16)
+        return SelectLoad(Node, 3, AArch64::LD3Threev8h, AArch64::qsub0);
+      else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+        return SelectLoad(Node, 3, AArch64::LD3Threev2s, AArch64::dsub0);
+      else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+        return SelectLoad(Node, 3, AArch64::LD3Threev4s, AArch64::qsub0);
+      else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+        return SelectLoad(Node, 3, AArch64::LD1Threev1d, AArch64::dsub0);
+      else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+        return SelectLoad(Node, 3, AArch64::LD3Threev2d, AArch64::qsub0);
+      break;
+    case Intrinsic::aarch64_neon_ld4:
+      if (VT == MVT::v8i8)
+        return SelectLoad(Node, 4, AArch64::LD4Fourv8b, AArch64::dsub0);
+      else if (VT == MVT::v16i8)
+        return SelectLoad(Node, 4, AArch64::LD4Fourv16b, AArch64::qsub0);
+      else if (VT == MVT::v4i16)
+        return SelectLoad(Node, 4, AArch64::LD4Fourv4h, AArch64::dsub0);
+      else if (VT == MVT::v8i16)
+        return SelectLoad(Node, 4, AArch64::LD4Fourv8h, AArch64::qsub0);
+      else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+        return SelectLoad(Node, 4, AArch64::LD4Fourv2s, AArch64::dsub0);
+      else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+        return SelectLoad(Node, 4, AArch64::LD4Fourv4s, AArch64::qsub0);
+      else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+        return SelectLoad(Node, 4, AArch64::LD1Fourv1d, AArch64::dsub0);
+      else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+        return SelectLoad(Node, 4, AArch64::LD4Fourv2d, AArch64::qsub0);
+      break;
+    case Intrinsic::aarch64_neon_ld2r:
+      if (VT == MVT::v8i8)
+        return SelectLoad(Node, 2, AArch64::LD2Rv8b, AArch64::dsub0);
+      else if (VT == MVT::v16i8)
+        return SelectLoad(Node, 2, AArch64::LD2Rv16b, AArch64::qsub0);
+      else if (VT == MVT::v4i16)
+        return SelectLoad(Node, 2, AArch64::LD2Rv4h, AArch64::dsub0);
+      else if (VT == MVT::v8i16)
+        return SelectLoad(Node, 2, AArch64::LD2Rv8h, AArch64::qsub0);
+      else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+        return SelectLoad(Node, 2, AArch64::LD2Rv2s, AArch64::dsub0);
+      else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+        return SelectLoad(Node, 2, AArch64::LD2Rv4s, AArch64::qsub0);
+      else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+        return SelectLoad(Node, 2, AArch64::LD2Rv1d, AArch64::dsub0);
+      else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+        return SelectLoad(Node, 2, AArch64::LD2Rv2d, AArch64::qsub0);
+      break;
+    case Intrinsic::aarch64_neon_ld3r:
+      if (VT == MVT::v8i8)
+        return SelectLoad(Node, 3, AArch64::LD3Rv8b, AArch64::dsub0);
+      else if (VT == MVT::v16i8)
+        return SelectLoad(Node, 3, AArch64::LD3Rv16b, AArch64::qsub0);
+      else if (VT == MVT::v4i16)
+        return SelectLoad(Node, 3, AArch64::LD3Rv4h, AArch64::dsub0);
+      else if (VT == MVT::v8i16)
+        return SelectLoad(Node, 3, AArch64::LD3Rv8h, AArch64::qsub0);
+      else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+        return SelectLoad(Node, 3, AArch64::LD3Rv2s, AArch64::dsub0);
+      else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+        return SelectLoad(Node, 3, AArch64::LD3Rv4s, AArch64::qsub0);
+      else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+        return SelectLoad(Node, 3, AArch64::LD3Rv1d, AArch64::dsub0);
+      else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+        return SelectLoad(Node, 3, AArch64::LD3Rv2d, AArch64::qsub0);
+      break;
+    case Intrinsic::aarch64_neon_ld4r:
+      if (VT == MVT::v8i8)
+        return SelectLoad(Node, 4, AArch64::LD4Rv8b, AArch64::dsub0);
+      else if (VT == MVT::v16i8)
+        return SelectLoad(Node, 4, AArch64::LD4Rv16b, AArch64::qsub0);
+      else if (VT == MVT::v4i16)
+        return SelectLoad(Node, 4, AArch64::LD4Rv4h, AArch64::dsub0);
+      else if (VT == MVT::v8i16)
+        return SelectLoad(Node, 4, AArch64::LD4Rv8h, AArch64::qsub0);
+      else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+        return SelectLoad(Node, 4, AArch64::LD4Rv2s, AArch64::dsub0);
+      else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+        return SelectLoad(Node, 4, AArch64::LD4Rv4s, AArch64::qsub0);
+      else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+        return SelectLoad(Node, 4, AArch64::LD4Rv1d, AArch64::dsub0);
+      else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+        return SelectLoad(Node, 4, AArch64::LD4Rv2d, AArch64::qsub0);
+      break;
+    case Intrinsic::aarch64_neon_ld2lane:
+      if (VT == MVT::v16i8 || VT == MVT::v8i8)
+        return SelectLoadLane(Node, 2, AArch64::LD2i8);
+      else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+        return SelectLoadLane(Node, 2, AArch64::LD2i16);
+      else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+               VT == MVT::v2f32)
+        return SelectLoadLane(Node, 2, AArch64::LD2i32);
+      else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+               VT == MVT::v1f64)
+        return SelectLoadLane(Node, 2, AArch64::LD2i64);
+      break;
+    case Intrinsic::aarch64_neon_ld3lane:
+      if (VT == MVT::v16i8 || VT == MVT::v8i8)
+        return SelectLoadLane(Node, 3, AArch64::LD3i8);
+      else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+        return SelectLoadLane(Node, 3, AArch64::LD3i16);
+      else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+               VT == MVT::v2f32)
+        return SelectLoadLane(Node, 3, AArch64::LD3i32);
+      else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+               VT == MVT::v1f64)
+        return SelectLoadLane(Node, 3, AArch64::LD3i64);
+      break;
+    case Intrinsic::aarch64_neon_ld4lane:
+      if (VT == MVT::v16i8 || VT == MVT::v8i8)
+        return SelectLoadLane(Node, 4, AArch64::LD4i8);
+      else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+        return SelectLoadLane(Node, 4, AArch64::LD4i16);
+      else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+               VT == MVT::v2f32)
+        return SelectLoadLane(Node, 4, AArch64::LD4i32);
+      else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+               VT == MVT::v1f64)
+        return SelectLoadLane(Node, 4, AArch64::LD4i64);
+      break;
     }
-    case Intrinsic::arm_neon_vst3: {
-      static const uint16_t Opcodes[] = {
-          AArch64::ST3_8B,  AArch64::ST3_4H, AArch64::ST3_2S, AArch64::ST1x3_1D,
-          AArch64::ST3_16B, AArch64::ST3_8H, AArch64::ST3_4S, AArch64::ST3_2D
-      };
-      return SelectVST(Node, false, 3, Opcodes);
+  } break;
+  case ISD::INTRINSIC_WO_CHAIN: {
+    unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(0))->getZExtValue();
+    switch (IntNo) {
+    default:
+      break;
+    case Intrinsic::aarch64_neon_tbl2:
+      return SelectTable(Node, 2, VT == MVT::v8i8 ? AArch64::TBLv8i8Two
+                                                  : AArch64::TBLv16i8Two,
+                         false);
+    case Intrinsic::aarch64_neon_tbl3:
+      return SelectTable(Node, 3, VT == MVT::v8i8 ? AArch64::TBLv8i8Three
+                                                  : AArch64::TBLv16i8Three,
+                         false);
+    case Intrinsic::aarch64_neon_tbl4:
+      return SelectTable(Node, 4, VT == MVT::v8i8 ? AArch64::TBLv8i8Four
+                                                  : AArch64::TBLv16i8Four,
+                         false);
+    case Intrinsic::aarch64_neon_tbx2:
+      return SelectTable(Node, 2, VT == MVT::v8i8 ? AArch64::TBXv8i8Two
+                                                  : AArch64::TBXv16i8Two,
+                         true);
+    case Intrinsic::aarch64_neon_tbx3:
+      return SelectTable(Node, 3, VT == MVT::v8i8 ? AArch64::TBXv8i8Three
+                                                  : AArch64::TBXv16i8Three,
+                         true);
+    case Intrinsic::aarch64_neon_tbx4:
+      return SelectTable(Node, 4, VT == MVT::v8i8 ? AArch64::TBXv8i8Four
+                                                  : AArch64::TBXv16i8Four,
+                         true);
+    case Intrinsic::aarch64_neon_smull:
+    case Intrinsic::aarch64_neon_umull:
+      if (SDNode *N = SelectMULLV64LaneV128(IntNo, Node))
+        return N;
+      break;
     }
-    case Intrinsic::arm_neon_vst4: {
-      static const uint16_t Opcodes[] = {
-          AArch64::ST4_8B,  AArch64::ST4_4H, AArch64::ST4_2S, AArch64::ST1x4_1D,
-          AArch64::ST4_16B, AArch64::ST4_8H, AArch64::ST4_4S, AArch64::ST4_2D
-      };
-      return SelectVST(Node, false, 4, Opcodes);
+    break;
+  }
+  case ISD::INTRINSIC_VOID: {
+    unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
+    if (Node->getNumOperands() >= 3)
+      VT = Node->getOperand(2)->getValueType(0);
+    switch (IntNo) {
+    default:
+      break;
+    case Intrinsic::aarch64_neon_st1x2: {
+      if (VT == MVT::v8i8)
+        return SelectStore(Node, 2, AArch64::ST1Twov8b);
+      else if (VT == MVT::v16i8)
+        return SelectStore(Node, 2, AArch64::ST1Twov16b);
+      else if (VT == MVT::v4i16)
+        return SelectStore(Node, 2, AArch64::ST1Twov4h);
+      else if (VT == MVT::v8i16)
+        return SelectStore(Node, 2, AArch64::ST1Twov8h);
+      else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+        return SelectStore(Node, 2, AArch64::ST1Twov2s);
+      else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+        return SelectStore(Node, 2, AArch64::ST1Twov4s);
+      else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+        return SelectStore(Node, 2, AArch64::ST1Twov2d);
+      else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+        return SelectStore(Node, 2, AArch64::ST1Twov1d);
+      break;
     }
-    case Intrinsic::aarch64_neon_vst1x2: {
-      static const uint16_t Opcodes[] = {
-          AArch64::ST1x2_8B, AArch64::ST1x2_4H,  AArch64::ST1x2_2S,
-          AArch64::ST1x2_1D, AArch64::ST1x2_16B, AArch64::ST1x2_8H,
-          AArch64::ST1x2_4S, AArch64::ST1x2_2D
-      };
-      return SelectVST(Node, false, 2, Opcodes);
+    case Intrinsic::aarch64_neon_st1x3: {
+      if (VT == MVT::v8i8)
+        return SelectStore(Node, 3, AArch64::ST1Threev8b);
+      else if (VT == MVT::v16i8)
+        return SelectStore(Node, 3, AArch64::ST1Threev16b);
+      else if (VT == MVT::v4i16)
+        return SelectStore(Node, 3, AArch64::ST1Threev4h);
+      else if (VT == MVT::v8i16)
+        return SelectStore(Node, 3, AArch64::ST1Threev8h);
+      else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+        return SelectStore(Node, 3, AArch64::ST1Threev2s);
+      else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+        return SelectStore(Node, 3, AArch64::ST1Threev4s);
+      else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+        return SelectStore(Node, 3, AArch64::ST1Threev2d);
+      else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+        return SelectStore(Node, 3, AArch64::ST1Threev1d);
+      break;
     }
-    case Intrinsic::aarch64_neon_vst1x3: {
-      static const uint16_t Opcodes[] = {
-          AArch64::ST1x3_8B, AArch64::ST1x3_4H,  AArch64::ST1x3_2S,
-          AArch64::ST1x3_1D, AArch64::ST1x3_16B, AArch64::ST1x3_8H,
-          AArch64::ST1x3_4S, AArch64::ST1x3_2D
-      };
-      return SelectVST(Node, false, 3, Opcodes);
+    case Intrinsic::aarch64_neon_st1x4: {
+      if (VT == MVT::v8i8)
+        return SelectStore(Node, 4, AArch64::ST1Fourv8b);
+      else if (VT == MVT::v16i8)
+        return SelectStore(Node, 4, AArch64::ST1Fourv16b);
+      else if (VT == MVT::v4i16)
+        return SelectStore(Node, 4, AArch64::ST1Fourv4h);
+      else if (VT == MVT::v8i16)
+        return SelectStore(Node, 4, AArch64::ST1Fourv8h);
+      else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+        return SelectStore(Node, 4, AArch64::ST1Fourv2s);
+      else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+        return SelectStore(Node, 4, AArch64::ST1Fourv4s);
+      else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+        return SelectStore(Node, 4, AArch64::ST1Fourv2d);
+      else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+        return SelectStore(Node, 4, AArch64::ST1Fourv1d);
+      break;
     }
-    case Intrinsic::aarch64_neon_vst1x4: {
-      static const uint16_t Opcodes[] = {
-          AArch64::ST1x4_8B, AArch64::ST1x4_4H,  AArch64::ST1x4_2S,
-          AArch64::ST1x4_1D, AArch64::ST1x4_16B, AArch64::ST1x4_8H,
-          AArch64::ST1x4_4S, AArch64::ST1x4_2D
-      };
-      return SelectVST(Node, false, 4, Opcodes);
+    case Intrinsic::aarch64_neon_st2: {
+      if (VT == MVT::v8i8)
+        return SelectStore(Node, 2, AArch64::ST2Twov8b);
+      else if (VT == MVT::v16i8)
+        return SelectStore(Node, 2, AArch64::ST2Twov16b);
+      else if (VT == MVT::v4i16)
+        return SelectStore(Node, 2, AArch64::ST2Twov4h);
+      else if (VT == MVT::v8i16)
+        return SelectStore(Node, 2, AArch64::ST2Twov8h);
+      else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+        return SelectStore(Node, 2, AArch64::ST2Twov2s);
+      else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+        return SelectStore(Node, 2, AArch64::ST2Twov4s);
+      else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+        return SelectStore(Node, 2, AArch64::ST2Twov2d);
+      else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+        return SelectStore(Node, 2, AArch64::ST1Twov1d);
+      break;
     }
-    case Intrinsic::arm_neon_vld2lane: {
-      static const uint16_t Opcodes[] = {
-          AArch64::LD2LN_B, AArch64::LD2LN_H, AArch64::LD2LN_S, AArch64::LD2LN_D
-      };
-      return SelectVLDSTLane(Node, true, false, 2, Opcodes);
+    case Intrinsic::aarch64_neon_st3: {
+      if (VT == MVT::v8i8)
+        return SelectStore(Node, 3, AArch64::ST3Threev8b);
+      else if (VT == MVT::v16i8)
+        return SelectStore(Node, 3, AArch64::ST3Threev16b);
+      else if (VT == MVT::v4i16)
+        return SelectStore(Node, 3, AArch64::ST3Threev4h);
+      else if (VT == MVT::v8i16)
+        return SelectStore(Node, 3, AArch64::ST3Threev8h);
+      else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+        return SelectStore(Node, 3, AArch64::ST3Threev2s);
+      else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+        return SelectStore(Node, 3, AArch64::ST3Threev4s);
+      else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+        return SelectStore(Node, 3, AArch64::ST3Threev2d);
+      else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+        return SelectStore(Node, 3, AArch64::ST1Threev1d);
+      break;
     }
-    case Intrinsic::arm_neon_vld3lane: {
-      static const uint16_t Opcodes[] = {
-          AArch64::LD3LN_B, AArch64::LD3LN_H, AArch64::LD3LN_S, AArch64::LD3LN_D
-      };
-      return SelectVLDSTLane(Node, true, false, 3, Opcodes);
+    case Intrinsic::aarch64_neon_st4: {
+      if (VT == MVT::v8i8)
+        return SelectStore(Node, 4, AArch64::ST4Fourv8b);
+      else if (VT == MVT::v16i8)
+        return SelectStore(Node, 4, AArch64::ST4Fourv16b);
+      else if (VT == MVT::v4i16)
+        return SelectStore(Node, 4, AArch64::ST4Fourv4h);
+      else if (VT == MVT::v8i16)
+        return SelectStore(Node, 4, AArch64::ST4Fourv8h);
+      else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+        return SelectStore(Node, 4, AArch64::ST4Fourv2s);
+      else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+        return SelectStore(Node, 4, AArch64::ST4Fourv4s);
+      else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+        return SelectStore(Node, 4, AArch64::ST4Fourv2d);
+      else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+        return SelectStore(Node, 4, AArch64::ST1Fourv1d);
+      break;
     }
-    case Intrinsic::arm_neon_vld4lane: {
-      static const uint16_t Opcodes[] = {
-          AArch64::LD4LN_B, AArch64::LD4LN_H, AArch64::LD4LN_S, AArch64::LD4LN_D
-      };
-      return SelectVLDSTLane(Node, true, false, 4, Opcodes);
+    case Intrinsic::aarch64_neon_st2lane: {
+      if (VT == MVT::v16i8 || VT == MVT::v8i8)
+        return SelectStoreLane(Node, 2, AArch64::ST2i8);
+      else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+        return SelectStoreLane(Node, 2, AArch64::ST2i16);
+      else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+               VT == MVT::v2f32)
+        return SelectStoreLane(Node, 2, AArch64::ST2i32);
+      else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+               VT == MVT::v1f64)
+        return SelectStoreLane(Node, 2, AArch64::ST2i64);
+      break;
     }
-    case Intrinsic::arm_neon_vst2lane: {
-      static const uint16_t Opcodes[] = {
-          AArch64::ST2LN_B, AArch64::ST2LN_H, AArch64::ST2LN_S, AArch64::ST2LN_D
-      };
-      return SelectVLDSTLane(Node, false, false, 2, Opcodes);
+    case Intrinsic::aarch64_neon_st3lane: {
+      if (VT == MVT::v16i8 || VT == MVT::v8i8)
+        return SelectStoreLane(Node, 3, AArch64::ST3i8);
+      else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+        return SelectStoreLane(Node, 3, AArch64::ST3i16);
+      else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+               VT == MVT::v2f32)
+        return SelectStoreLane(Node, 3, AArch64::ST3i32);
+      else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+               VT == MVT::v1f64)
+        return SelectStoreLane(Node, 3, AArch64::ST3i64);
+      break;
     }
-    case Intrinsic::arm_neon_vst3lane: {
-      static const uint16_t Opcodes[] = {
-          AArch64::ST3LN_B, AArch64::ST3LN_H, AArch64::ST3LN_S, AArch64::ST3LN_D
-      };
-      return SelectVLDSTLane(Node, false, false, 3, Opcodes);
+    case Intrinsic::aarch64_neon_st4lane: {
+      if (VT == MVT::v16i8 || VT == MVT::v8i8)
+        return SelectStoreLane(Node, 4, AArch64::ST4i8);
+      else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+        return SelectStoreLane(Node, 4, AArch64::ST4i16);
+      else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+               VT == MVT::v2f32)
+        return SelectStoreLane(Node, 4, AArch64::ST4i32);
+      else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+               VT == MVT::v1f64)
+        return SelectStoreLane(Node, 4, AArch64::ST4i64);
+      break;
     }
-    case Intrinsic::arm_neon_vst4lane: {
-      static const uint16_t Opcodes[] = {
-          AArch64::ST4LN_B, AArch64::ST4LN_H, AArch64::ST4LN_S, AArch64::ST4LN_D
-      };
-      return SelectVLDSTLane(Node, false, false, 4, Opcodes);
     }
-    } // End of switch IntNo
+  }
+  case AArch64ISD::LD2post: {
+    if (VT == MVT::v8i8)
+      return SelectPostLoad(Node, 2, AArch64::LD2Twov8b_POST, AArch64::dsub0);
+    else if (VT == MVT::v16i8)
+      return SelectPostLoad(Node, 2, AArch64::LD2Twov16b_POST, AArch64::qsub0);
+    else if (VT == MVT::v4i16)
+      return SelectPostLoad(Node, 2, AArch64::LD2Twov4h_POST, AArch64::dsub0);
+    else if (VT == MVT::v8i16)
+      return SelectPostLoad(Node, 2, AArch64::LD2Twov8h_POST, AArch64::qsub0);
+    else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+      return SelectPostLoad(Node, 2, AArch64::LD2Twov2s_POST, AArch64::dsub0);
+    else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+      return SelectPostLoad(Node, 2, AArch64::LD2Twov4s_POST, AArch64::qsub0);
+    else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+      return SelectPostLoad(Node, 2, AArch64::LD1Twov1d_POST, AArch64::dsub0);
+    else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+      return SelectPostLoad(Node, 2, AArch64::LD2Twov2d_POST, AArch64::qsub0);
+    break;
+  }
+  case AArch64ISD::LD3post: {
+    if (VT == MVT::v8i8)
+      return SelectPostLoad(Node, 3, AArch64::LD3Threev8b_POST, AArch64::dsub0);
+    else if (VT == MVT::v16i8)
+      return SelectPostLoad(Node, 3, AArch64::LD3Threev16b_POST, AArch64::qsub0);
+    else if (VT == MVT::v4i16)
+      return SelectPostLoad(Node, 3, AArch64::LD3Threev4h_POST, AArch64::dsub0);
+    else if (VT == MVT::v8i16)
+      return SelectPostLoad(Node, 3, AArch64::LD3Threev8h_POST, AArch64::qsub0);
+    else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+      return SelectPostLoad(Node, 3, AArch64::LD3Threev2s_POST, AArch64::dsub0);
+    else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+      return SelectPostLoad(Node, 3, AArch64::LD3Threev4s_POST, AArch64::qsub0);
+    else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+      return SelectPostLoad(Node, 3, AArch64::LD1Threev1d_POST, AArch64::dsub0);
+    else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+      return SelectPostLoad(Node, 3, AArch64::LD3Threev2d_POST, AArch64::qsub0);
+    break;
+  }
+  case AArch64ISD::LD4post: {
+    if (VT == MVT::v8i8)
+      return SelectPostLoad(Node, 4, AArch64::LD4Fourv8b_POST, AArch64::dsub0);
+    else if (VT == MVT::v16i8)
+      return SelectPostLoad(Node, 4, AArch64::LD4Fourv16b_POST, AArch64::qsub0);
+    else if (VT == MVT::v4i16)
+      return SelectPostLoad(Node, 4, AArch64::LD4Fourv4h_POST, AArch64::dsub0);
+    else if (VT == MVT::v8i16)
+      return SelectPostLoad(Node, 4, AArch64::LD4Fourv8h_POST, AArch64::qsub0);
+    else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+      return SelectPostLoad(Node, 4, AArch64::LD4Fourv2s_POST, AArch64::dsub0);
+    else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+      return SelectPostLoad(Node, 4, AArch64::LD4Fourv4s_POST, AArch64::qsub0);
+    else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+      return SelectPostLoad(Node, 4, AArch64::LD1Fourv1d_POST, AArch64::dsub0);
+    else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+      return SelectPostLoad(Node, 4, AArch64::LD4Fourv2d_POST, AArch64::qsub0);
+    break;
+  }
+  case AArch64ISD::LD1x2post: {
+    if (VT == MVT::v8i8)
+      return SelectPostLoad(Node, 2, AArch64::LD1Twov8b_POST, AArch64::dsub0);
+    else if (VT == MVT::v16i8)
+      return SelectPostLoad(Node, 2, AArch64::LD1Twov16b_POST, AArch64::qsub0);
+    else if (VT == MVT::v4i16)
+      return SelectPostLoad(Node, 2, AArch64::LD1Twov4h_POST, AArch64::dsub0);
+    else if (VT == MVT::v8i16)
+      return SelectPostLoad(Node, 2, AArch64::LD1Twov8h_POST, AArch64::qsub0);
+    else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+      return SelectPostLoad(Node, 2, AArch64::LD1Twov2s_POST, AArch64::dsub0);
+    else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+      return SelectPostLoad(Node, 2, AArch64::LD1Twov4s_POST, AArch64::qsub0);
+    else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+      return SelectPostLoad(Node, 2, AArch64::LD1Twov1d_POST, AArch64::dsub0);
+    else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+      return SelectPostLoad(Node, 2, AArch64::LD1Twov2d_POST, AArch64::qsub0);
+    break;
+  }
+  case AArch64ISD::LD1x3post: {
+    if (VT == MVT::v8i8)
+      return SelectPostLoad(Node, 3, AArch64::LD1Threev8b_POST, AArch64::dsub0);
+    else if (VT == MVT::v16i8)
+      return SelectPostLoad(Node, 3, AArch64::LD1Threev16b_POST, AArch64::qsub0);
+    else if (VT == MVT::v4i16)
+      return SelectPostLoad(Node, 3, AArch64::LD1Threev4h_POST, AArch64::dsub0);
+    else if (VT == MVT::v8i16)
+      return SelectPostLoad(Node, 3, AArch64::LD1Threev8h_POST, AArch64::qsub0);
+    else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+      return SelectPostLoad(Node, 3, AArch64::LD1Threev2s_POST, AArch64::dsub0);
+    else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+      return SelectPostLoad(Node, 3, AArch64::LD1Threev4s_POST, AArch64::qsub0);
+    else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+      return SelectPostLoad(Node, 3, AArch64::LD1Threev1d_POST, AArch64::dsub0);
+    else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+      return SelectPostLoad(Node, 3, AArch64::LD1Threev2d_POST, AArch64::qsub0);
+    break;
+  }
+  case AArch64ISD::LD1x4post: {
+    if (VT == MVT::v8i8)
+      return SelectPostLoad(Node, 4, AArch64::LD1Fourv8b_POST, AArch64::dsub0);
+    else if (VT == MVT::v16i8)
+      return SelectPostLoad(Node, 4, AArch64::LD1Fourv16b_POST, AArch64::qsub0);
+    else if (VT == MVT::v4i16)
+      return SelectPostLoad(Node, 4, AArch64::LD1Fourv4h_POST, AArch64::dsub0);
+    else if (VT == MVT::v8i16)
+      return SelectPostLoad(Node, 4, AArch64::LD1Fourv8h_POST, AArch64::qsub0);
+    else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+      return SelectPostLoad(Node, 4, AArch64::LD1Fourv2s_POST, AArch64::dsub0);
+    else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+      return SelectPostLoad(Node, 4, AArch64::LD1Fourv4s_POST, AArch64::qsub0);
+    else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+      return SelectPostLoad(Node, 4, AArch64::LD1Fourv1d_POST, AArch64::dsub0);
+    else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+      return SelectPostLoad(Node, 4, AArch64::LD1Fourv2d_POST, AArch64::qsub0);
+    break;
+  }
+  case AArch64ISD::LD1DUPpost: {
+    if (VT == MVT::v8i8)
+      return SelectPostLoad(Node, 1, AArch64::LD1Rv8b_POST, AArch64::dsub0);
+    else if (VT == MVT::v16i8)
+      return SelectPostLoad(Node, 1, AArch64::LD1Rv16b_POST, AArch64::qsub0);
+    else if (VT == MVT::v4i16)
+      return SelectPostLoad(Node, 1, AArch64::LD1Rv4h_POST, AArch64::dsub0);
+    else if (VT == MVT::v8i16)
+      return SelectPostLoad(Node, 1, AArch64::LD1Rv8h_POST, AArch64::qsub0);
+    else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+      return SelectPostLoad(Node, 1, AArch64::LD1Rv2s_POST, AArch64::dsub0);
+    else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+      return SelectPostLoad(Node, 1, AArch64::LD1Rv4s_POST, AArch64::qsub0);
+    else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+      return SelectPostLoad(Node, 1, AArch64::LD1Rv1d_POST, AArch64::dsub0);
+    else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+      return SelectPostLoad(Node, 1, AArch64::LD1Rv2d_POST, AArch64::qsub0);
+    break;
+  }
+  case AArch64ISD::LD2DUPpost: {
+    if (VT == MVT::v8i8)
+      return SelectPostLoad(Node, 2, AArch64::LD2Rv8b_POST, AArch64::dsub0);
+    else if (VT == MVT::v16i8)
+      return SelectPostLoad(Node, 2, AArch64::LD2Rv16b_POST, AArch64::qsub0);
+    else if (VT == MVT::v4i16)
+      return SelectPostLoad(Node, 2, AArch64::LD2Rv4h_POST, AArch64::dsub0);
+    else if (VT == MVT::v8i16)
+      return SelectPostLoad(Node, 2, AArch64::LD2Rv8h_POST, AArch64::qsub0);
+    else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+      return SelectPostLoad(Node, 2, AArch64::LD2Rv2s_POST, AArch64::dsub0);
+    else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+      return SelectPostLoad(Node, 2, AArch64::LD2Rv4s_POST, AArch64::qsub0);
+    else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+      return SelectPostLoad(Node, 2, AArch64::LD2Rv1d_POST, AArch64::dsub0);
+    else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+      return SelectPostLoad(Node, 2, AArch64::LD2Rv2d_POST, AArch64::qsub0);
+    break;
+  }
+  case AArch64ISD::LD3DUPpost: {
+    if (VT == MVT::v8i8)
+      return SelectPostLoad(Node, 3, AArch64::LD3Rv8b_POST, AArch64::dsub0);
+    else if (VT == MVT::v16i8)
+      return SelectPostLoad(Node, 3, AArch64::LD3Rv16b_POST, AArch64::qsub0);
+    else if (VT == MVT::v4i16)
+      return SelectPostLoad(Node, 3, AArch64::LD3Rv4h_POST, AArch64::dsub0);
+    else if (VT == MVT::v8i16)
+      return SelectPostLoad(Node, 3, AArch64::LD3Rv8h_POST, AArch64::qsub0);
+    else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+      return SelectPostLoad(Node, 3, AArch64::LD3Rv2s_POST, AArch64::dsub0);
+    else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+      return SelectPostLoad(Node, 3, AArch64::LD3Rv4s_POST, AArch64::qsub0);
+    else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+      return SelectPostLoad(Node, 3, AArch64::LD3Rv1d_POST, AArch64::dsub0);
+    else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+      return SelectPostLoad(Node, 3, AArch64::LD3Rv2d_POST, AArch64::qsub0);
+    break;
+  }
+  case AArch64ISD::LD4DUPpost: {
+    if (VT == MVT::v8i8)
+      return SelectPostLoad(Node, 4, AArch64::LD4Rv8b_POST, AArch64::dsub0);
+    else if (VT == MVT::v16i8)
+      return SelectPostLoad(Node, 4, AArch64::LD4Rv16b_POST, AArch64::qsub0);
+    else if (VT == MVT::v4i16)
+      return SelectPostLoad(Node, 4, AArch64::LD4Rv4h_POST, AArch64::dsub0);
+    else if (VT == MVT::v8i16)
+      return SelectPostLoad(Node, 4, AArch64::LD4Rv8h_POST, AArch64::qsub0);
+    else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+      return SelectPostLoad(Node, 4, AArch64::LD4Rv2s_POST, AArch64::dsub0);
+    else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+      return SelectPostLoad(Node, 4, AArch64::LD4Rv4s_POST, AArch64::qsub0);
+    else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+      return SelectPostLoad(Node, 4, AArch64::LD4Rv1d_POST, AArch64::dsub0);
+    else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+      return SelectPostLoad(Node, 4, AArch64::LD4Rv2d_POST, AArch64::qsub0);
+    break;
+  }
+  case AArch64ISD::LD1LANEpost: {
+    if (VT == MVT::v16i8 || VT == MVT::v8i8)
+      return SelectPostLoadLane(Node, 1, AArch64::LD1i8_POST);
+    else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+      return SelectPostLoadLane(Node, 1, AArch64::LD1i16_POST);
+    else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+             VT == MVT::v2f32)
+      return SelectPostLoadLane(Node, 1, AArch64::LD1i32_POST);
+    else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+             VT == MVT::v1f64)
+      return SelectPostLoadLane(Node, 1, AArch64::LD1i64_POST);
+    break;
+  }
+  case AArch64ISD::LD2LANEpost: {
+    if (VT == MVT::v16i8 || VT == MVT::v8i8)
+      return SelectPostLoadLane(Node, 2, AArch64::LD2i8_POST);
+    else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+      return SelectPostLoadLane(Node, 2, AArch64::LD2i16_POST);
+    else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+             VT == MVT::v2f32)
+      return SelectPostLoadLane(Node, 2, AArch64::LD2i32_POST);
+    else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+             VT == MVT::v1f64)
+      return SelectPostLoadLane(Node, 2, AArch64::LD2i64_POST);
+    break;
+  }
+  case AArch64ISD::LD3LANEpost: {
+    if (VT == MVT::v16i8 || VT == MVT::v8i8)
+      return SelectPostLoadLane(Node, 3, AArch64::LD3i8_POST);
+    else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+      return SelectPostLoadLane(Node, 3, AArch64::LD3i16_POST);
+    else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+             VT == MVT::v2f32)
+      return SelectPostLoadLane(Node, 3, AArch64::LD3i32_POST);
+    else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+             VT == MVT::v1f64)
+      return SelectPostLoadLane(Node, 3, AArch64::LD3i64_POST);
+    break;
+  }
+  case AArch64ISD::LD4LANEpost: {
+    if (VT == MVT::v16i8 || VT == MVT::v8i8)
+      return SelectPostLoadLane(Node, 4, AArch64::LD4i8_POST);
+    else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+      return SelectPostLoadLane(Node, 4, AArch64::LD4i16_POST);
+    else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+             VT == MVT::v2f32)
+      return SelectPostLoadLane(Node, 4, AArch64::LD4i32_POST);
+    else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+             VT == MVT::v1f64)
+      return SelectPostLoadLane(Node, 4, AArch64::LD4i64_POST);
+    break;
+  }
+  case AArch64ISD::ST2post: {
+    VT = Node->getOperand(1).getValueType();
+    if (VT == MVT::v8i8)
+      return SelectPostStore(Node, 2, AArch64::ST2Twov8b_POST);
+    else if (VT == MVT::v16i8)
+      return SelectPostStore(Node, 2, AArch64::ST2Twov16b_POST);
+    else if (VT == MVT::v4i16)
+      return SelectPostStore(Node, 2, AArch64::ST2Twov4h_POST);
+    else if (VT == MVT::v8i16)
+      return SelectPostStore(Node, 2, AArch64::ST2Twov8h_POST);
+    else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+      return SelectPostStore(Node, 2, AArch64::ST2Twov2s_POST);
+    else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+      return SelectPostStore(Node, 2, AArch64::ST2Twov4s_POST);
+    else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+      return SelectPostStore(Node, 2, AArch64::ST2Twov2d_POST);
+    else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+      return SelectPostStore(Node, 2, AArch64::ST1Twov1d_POST);
+    break;
+  }
+  case AArch64ISD::ST3post: {
+    VT = Node->getOperand(1).getValueType();
+    if (VT == MVT::v8i8)
+      return SelectPostStore(Node, 3, AArch64::ST3Threev8b_POST);
+    else if (VT == MVT::v16i8)
+      return SelectPostStore(Node, 3, AArch64::ST3Threev16b_POST);
+    else if (VT == MVT::v4i16)
+      return SelectPostStore(Node, 3, AArch64::ST3Threev4h_POST);
+    else if (VT == MVT::v8i16)
+      return SelectPostStore(Node, 3, AArch64::ST3Threev8h_POST);
+    else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+      return SelectPostStore(Node, 3, AArch64::ST3Threev2s_POST);
+    else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+      return SelectPostStore(Node, 3, AArch64::ST3Threev4s_POST);
+    else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+      return SelectPostStore(Node, 3, AArch64::ST3Threev2d_POST);
+    else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+      return SelectPostStore(Node, 3, AArch64::ST1Threev1d_POST);
+    break;
+  }
+  case AArch64ISD::ST4post: {
+    VT = Node->getOperand(1).getValueType();
+    if (VT == MVT::v8i8)
+      return SelectPostStore(Node, 4, AArch64::ST4Fourv8b_POST);
+    else if (VT == MVT::v16i8)
+      return SelectPostStore(Node, 4, AArch64::ST4Fourv16b_POST);
+    else if (VT == MVT::v4i16)
+      return SelectPostStore(Node, 4, AArch64::ST4Fourv4h_POST);
+    else if (VT == MVT::v8i16)
+      return SelectPostStore(Node, 4, AArch64::ST4Fourv8h_POST);
+    else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+      return SelectPostStore(Node, 4, AArch64::ST4Fourv2s_POST);
+    else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+      return SelectPostStore(Node, 4, AArch64::ST4Fourv4s_POST);
+    else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+      return SelectPostStore(Node, 4, AArch64::ST4Fourv2d_POST);
+    else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+      return SelectPostStore(Node, 4, AArch64::ST1Fourv1d_POST);
+    break;
+  }
+  case AArch64ISD::ST1x2post: {
+    VT = Node->getOperand(1).getValueType();
+    if (VT == MVT::v8i8)
+      return SelectPostStore(Node, 2, AArch64::ST1Twov8b_POST);
+    else if (VT == MVT::v16i8)
+      return SelectPostStore(Node, 2, AArch64::ST1Twov16b_POST);
+    else if (VT == MVT::v4i16)
+      return SelectPostStore(Node, 2, AArch64::ST1Twov4h_POST);
+    else if (VT == MVT::v8i16)
+      return SelectPostStore(Node, 2, AArch64::ST1Twov8h_POST);
+    else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+      return SelectPostStore(Node, 2, AArch64::ST1Twov2s_POST);
+    else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+      return SelectPostStore(Node, 2, AArch64::ST1Twov4s_POST);
+    else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+      return SelectPostStore(Node, 2, AArch64::ST1Twov1d_POST);
+    else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+      return SelectPostStore(Node, 2, AArch64::ST1Twov2d_POST);
+    break;
+  }
+  case AArch64ISD::ST1x3post: {
+    VT = Node->getOperand(1).getValueType();
+    if (VT == MVT::v8i8)
+      return SelectPostStore(Node, 3, AArch64::ST1Threev8b_POST);
+    else if (VT == MVT::v16i8)
+      return SelectPostStore(Node, 3, AArch64::ST1Threev16b_POST);
+    else if (VT == MVT::v4i16)
+      return SelectPostStore(Node, 3, AArch64::ST1Threev4h_POST);
+    else if (VT == MVT::v8i16)
+      return SelectPostStore(Node, 3, AArch64::ST1Threev8h_POST);
+    else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+      return SelectPostStore(Node, 3, AArch64::ST1Threev2s_POST);
+    else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+      return SelectPostStore(Node, 3, AArch64::ST1Threev4s_POST);
+    else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+      return SelectPostStore(Node, 3, AArch64::ST1Threev1d_POST);
+    else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+      return SelectPostStore(Node, 3, AArch64::ST1Threev2d_POST);
+    break;
+  }
+  case AArch64ISD::ST1x4post: {
+    VT = Node->getOperand(1).getValueType();
+    if (VT == MVT::v8i8)
+      return SelectPostStore(Node, 4, AArch64::ST1Fourv8b_POST);
+    else if (VT == MVT::v16i8)
+      return SelectPostStore(Node, 4, AArch64::ST1Fourv16b_POST);
+    else if (VT == MVT::v4i16)
+      return SelectPostStore(Node, 4, AArch64::ST1Fourv4h_POST);
+    else if (VT == MVT::v8i16)
+      return SelectPostStore(Node, 4, AArch64::ST1Fourv8h_POST);
+    else if (VT == MVT::v2i32 || VT == MVT::v2f32)
+      return SelectPostStore(Node, 4, AArch64::ST1Fourv2s_POST);
+    else if (VT == MVT::v4i32 || VT == MVT::v4f32)
+      return SelectPostStore(Node, 4, AArch64::ST1Fourv4s_POST);
+    else if (VT == MVT::v1i64 || VT == MVT::v1f64)
+      return SelectPostStore(Node, 4, AArch64::ST1Fourv1d_POST);
+    else if (VT == MVT::v2i64 || VT == MVT::v2f64)
+      return SelectPostStore(Node, 4, AArch64::ST1Fourv2d_POST);
+    break;
+  }
+  case AArch64ISD::ST2LANEpost: {
+    VT = Node->getOperand(1).getValueType();
+    if (VT == MVT::v16i8 || VT == MVT::v8i8)
+      return SelectPostStoreLane(Node, 2, AArch64::ST2i8_POST);
+    else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+      return SelectPostStoreLane(Node, 2, AArch64::ST2i16_POST);
+    else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+             VT == MVT::v2f32)
+      return SelectPostStoreLane(Node, 2, AArch64::ST2i32_POST);
+    else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+             VT == MVT::v1f64)
+      return SelectPostStoreLane(Node, 2, AArch64::ST2i64_POST);
+    break;
+  }
+  case AArch64ISD::ST3LANEpost: {
+    VT = Node->getOperand(1).getValueType();
+    if (VT == MVT::v16i8 || VT == MVT::v8i8)
+      return SelectPostStoreLane(Node, 3, AArch64::ST3i8_POST);
+    else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+      return SelectPostStoreLane(Node, 3, AArch64::ST3i16_POST);
+    else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+             VT == MVT::v2f32)
+      return SelectPostStoreLane(Node, 3, AArch64::ST3i32_POST);
+    else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+             VT == MVT::v1f64)
+      return SelectPostStoreLane(Node, 3, AArch64::ST3i64_POST);
+    break;
+  }
+  case AArch64ISD::ST4LANEpost: {
+    VT = Node->getOperand(1).getValueType();
+    if (VT == MVT::v16i8 || VT == MVT::v8i8)
+      return SelectPostStoreLane(Node, 4, AArch64::ST4i8_POST);
+    else if (VT == MVT::v8i16 || VT == MVT::v4i16)
+      return SelectPostStoreLane(Node, 4, AArch64::ST4i16_POST);
+    else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
+             VT == MVT::v2f32)
+      return SelectPostStoreLane(Node, 4, AArch64::ST4i32_POST);
+    else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
+             VT == MVT::v1f64)
+      return SelectPostStoreLane(Node, 4, AArch64::ST4i64_POST);
     break;
-  } // End of case ISD::INTRINSIC_VOID and :ISD::INTRINSIC_W_CHAIN
-  default:
-    break; // Let generic code handle it
   }
 
-  SDNode *ResNode = SelectCode(Node);
+  case ISD::FCEIL:
+  case ISD::FFLOOR:
+  case ISD::FTRUNC:
+  case ISD::FROUND:
+    if (SDNode *I = SelectLIBM(Node))
+      return I;
+    break;
+  }
 
-  DEBUG(dbgs() << "=> ";
-        if (ResNode == NULL || ResNode == Node)
-          Node->dump(CurDAG);
-        else
-          ResNode->dump(CurDAG);
-        dbgs() << "\n");
+  // Select the default instruction
+  ResNode = SelectCode(Node);
+
+  DEBUG(errs() << "=> ");
+  if (ResNode == nullptr || ResNode == Node)
+    DEBUG(Node->dump(CurDAG));
+  else
+    DEBUG(ResNode->dump(CurDAG));
+  DEBUG(errs() << "\n");
 
   return ResNode;
 }
 
-/// This pass converts a legalized DAG into a AArch64-specific DAG, ready for
-/// instruction scheduling.
-FunctionPass *llvm::createAArch64ISelDAG(AArch64TargetMachine &TM,
+/// createAArch64ISelDag - This pass converts a legalized DAG into a
+/// AArch64-specific DAG, ready for instruction scheduling.
+FunctionPass *llvm::createAArch64ISelDag(AArch64TargetMachine &TM,
                                          CodeGenOpt::Level OptLevel) {
   return new AArch64DAGToDAGISel(TM, OptLevel);
 }
diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp
index 388973a..80d6669 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1,4 +1,4 @@
-//===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation -----===//
+//===-- AArch64ISelLowering.cpp - AArch64 DAG Lowering Implementation  ----===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,46 +7,87 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This file defines the interfaces that AArch64 uses to lower LLVM code into a
-// selection DAG.
+// This file implements the AArch64TargetLowering class.
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "aarch64-isel"
-#include "AArch64.h"
 #include "AArch64ISelLowering.h"
+#include "AArch64PerfectShuffle.h"
+#include "AArch64Subtarget.h"
 #include "AArch64MachineFunctionInfo.h"
 #include "AArch64TargetMachine.h"
 #include "AArch64TargetObjectFile.h"
-#include "Utils/AArch64BaseInfo.h"
-#include "llvm/CodeGen/Analysis.h"
+#include "MCTargetDesc/AArch64AddressingModes.h"
+#include "llvm/ADT/Statistic.h"
 #include "llvm/CodeGen/CallingConvLower.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
-#include "llvm/IR/CallingConv.h"
-#include "llvm/Support/MathExtras.h"
-
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetOptions.h"
 using namespace llvm;
 
-static TargetLoweringObjectFile *createTLOF(AArch64TargetMachine &TM) {
-  assert (TM.getSubtarget<AArch64Subtarget>().isTargetELF() &&
-          "unknown subtarget type");
-  return new AArch64ElfTargetObjectFile();
-}
+#define DEBUG_TYPE "aarch64-lower"
 
-AArch64TargetLowering::AArch64TargetLowering(AArch64TargetMachine &TM)
-  : TargetLowering(TM, createTLOF(TM)), Itins(TM.getInstrItineraryData()) {
+STATISTIC(NumTailCalls, "Number of tail calls");
+STATISTIC(NumShiftInserts, "Number of vector shift inserts");
+
+enum AlignMode {
+  StrictAlign,
+  NoStrictAlign
+};
+
+static cl::opt<AlignMode>
+Align(cl::desc("Load/store alignment support"),
+      cl::Hidden, cl::init(NoStrictAlign),
+      cl::values(
+          clEnumValN(StrictAlign,   "aarch64-strict-align",
+                     "Disallow all unaligned memory accesses"),
+          clEnumValN(NoStrictAlign, "aarch64-no-strict-align",
+                     "Allow unaligned memory accesses"),
+          clEnumValEnd));
+
+// Place holder until extr generation is tested fully.
+static cl::opt<bool>
+EnableAArch64ExtrGeneration("aarch64-extr-generation", cl::Hidden,
+                          cl::desc("Allow AArch64 (or (shift)(shift))->extract"),
+                          cl::init(true));
+
+static cl::opt<bool>
+EnableAArch64SlrGeneration("aarch64-shift-insert-generation", cl::Hidden,
+                         cl::desc("Allow AArch64 SLI/SRI formation"),
+                         cl::init(false));
 
-  const AArch64Subtarget *Subtarget = &TM.getSubtarget<AArch64Subtarget>();
+//===----------------------------------------------------------------------===//
+// AArch64 Lowering public interface.
+//===----------------------------------------------------------------------===//
+static TargetLoweringObjectFile *createTLOF(TargetMachine &TM) {
+  if (TM.getSubtarget<AArch64Subtarget>().isTargetDarwin())
+    return new AArch64_MachoTargetObjectFile();
+
+  return new AArch64_ELFTargetObjectFile();
+}
 
-  // SIMD compares set the entire lane's bits to 1
+AArch64TargetLowering::AArch64TargetLowering(AArch64TargetMachine &TM)
+    : TargetLowering(TM, createTLOF(TM)) {
+  Subtarget = &TM.getSubtarget<AArch64Subtarget>();
+
+  // AArch64 doesn't have comparisons which set GPRs or setcc instructions, so
+  // we have to make something up. Arbitrarily, choose ZeroOrOne.
+  setBooleanContents(ZeroOrOneBooleanContent);
+  // When comparing vectors the result sets the different elements in the
+  // vector to all-one or all-zero.
   setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
 
-  // Scalar register <-> type mapping
-  addRegisterClass(MVT::i32, &AArch64::GPR32RegClass);
-  addRegisterClass(MVT::i64, &AArch64::GPR64RegClass);
+  // Set up the register classes.
+  addRegisterClass(MVT::i32, &AArch64::GPR32allRegClass);
+  addRegisterClass(MVT::i64, &AArch64::GPR64allRegClass);
 
   if (Subtarget->hasFPARMv8()) {
     addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
@@ -56,201 +97,86 @@ AArch64TargetLowering::AArch64TargetLowering(AArch64TargetMachine &TM)
   }
 
   if (Subtarget->hasNEON()) {
-    // And the vectors
-    addRegisterClass(MVT::v1i8,  &AArch64::FPR8RegClass);
-    addRegisterClass(MVT::v1i16, &AArch64::FPR16RegClass);
-    addRegisterClass(MVT::v1i32, &AArch64::FPR32RegClass);
-    addRegisterClass(MVT::v1i64, &AArch64::FPR64RegClass);
-    addRegisterClass(MVT::v1f64, &AArch64::FPR64RegClass);
-    addRegisterClass(MVT::v8i8,  &AArch64::FPR64RegClass);
-    addRegisterClass(MVT::v4i16, &AArch64::FPR64RegClass);
-    addRegisterClass(MVT::v2i32, &AArch64::FPR64RegClass);
-    addRegisterClass(MVT::v1i64, &AArch64::FPR64RegClass);
-    addRegisterClass(MVT::v2f32, &AArch64::FPR64RegClass);
-    addRegisterClass(MVT::v16i8, &AArch64::FPR128RegClass);
-    addRegisterClass(MVT::v8i16, &AArch64::FPR128RegClass);
-    addRegisterClass(MVT::v4i32, &AArch64::FPR128RegClass);
-    addRegisterClass(MVT::v2i64, &AArch64::FPR128RegClass);
-    addRegisterClass(MVT::v4f32, &AArch64::FPR128RegClass);
-    addRegisterClass(MVT::v2f64, &AArch64::FPR128RegClass);
+    addRegisterClass(MVT::v16i8, &AArch64::FPR8RegClass);
+    addRegisterClass(MVT::v8i16, &AArch64::FPR16RegClass);
+    // Someone set us up the NEON.
+    addDRTypeForNEON(MVT::v2f32);
+    addDRTypeForNEON(MVT::v8i8);
+    addDRTypeForNEON(MVT::v4i16);
+    addDRTypeForNEON(MVT::v2i32);
+    addDRTypeForNEON(MVT::v1i64);
+    addDRTypeForNEON(MVT::v1f64);
+
+    addQRTypeForNEON(MVT::v4f32);
+    addQRTypeForNEON(MVT::v2f64);
+    addQRTypeForNEON(MVT::v16i8);
+    addQRTypeForNEON(MVT::v8i16);
+    addQRTypeForNEON(MVT::v4i32);
+    addQRTypeForNEON(MVT::v2i64);
   }
 
+  // Compute derived properties from the register classes
   computeRegisterProperties();
 
-  // We combine OR nodes for bitfield and NEON BSL operations.
-  setTargetDAGCombine(ISD::OR);
-
-  setTargetDAGCombine(ISD::AND);
-  setTargetDAGCombine(ISD::SRA);
-  setTargetDAGCombine(ISD::SRL);
-  setTargetDAGCombine(ISD::SHL);
-
-  setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
-  setTargetDAGCombine(ISD::INTRINSIC_VOID);
-  setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
-
-  // AArch64 does not have i1 loads, or much of anything for i1 really.
-  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
-  setLoadExtAction(ISD::ZEXTLOAD, MVT::i1, Promote);
-  setLoadExtAction(ISD::EXTLOAD, MVT::i1, Promote);
-
-  setStackPointerRegisterToSaveRestore(AArch64::XSP);
-  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
-  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
-  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
-
-  // We'll lower globals to wrappers for selection.
+  // Provide all sorts of operation actions
   setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
   setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
-
-  // A64 instructions have the comparison predicate attached to the user of the
-  // result, but having a separate comparison is valuable for matching.
+  setOperationAction(ISD::SETCC, MVT::i32, Custom);
+  setOperationAction(ISD::SETCC, MVT::i64, Custom);
+  setOperationAction(ISD::SETCC, MVT::f32, Custom);
+  setOperationAction(ISD::SETCC, MVT::f64, Custom);
+  setOperationAction(ISD::BRCOND, MVT::Other, Expand);
   setOperationAction(ISD::BR_CC, MVT::i32, Custom);
   setOperationAction(ISD::BR_CC, MVT::i64, Custom);
   setOperationAction(ISD::BR_CC, MVT::f32, Custom);
   setOperationAction(ISD::BR_CC, MVT::f64, Custom);
-
   setOperationAction(ISD::SELECT, MVT::i32, Custom);
   setOperationAction(ISD::SELECT, MVT::i64, Custom);
   setOperationAction(ISD::SELECT, MVT::f32, Custom);
   setOperationAction(ISD::SELECT, MVT::f64, Custom);
-
   setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
   setOperationAction(ISD::SELECT_CC, MVT::i64, Custom);
   setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
   setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
-
-  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
-
-  setOperationAction(ISD::SETCC, MVT::i32, Custom);
-  setOperationAction(ISD::SETCC, MVT::i64, Custom);
-  setOperationAction(ISD::SETCC, MVT::f32, Custom);
-  setOperationAction(ISD::SETCC, MVT::f64, Custom);
-
   setOperationAction(ISD::BR_JT, MVT::Other, Expand);
-  setOperationAction(ISD::JumpTable, MVT::i32, Custom);
   setOperationAction(ISD::JumpTable, MVT::i64, Custom);
 
-  setOperationAction(ISD::VASTART, MVT::Other, Custom);
-  setOperationAction(ISD::VACOPY, MVT::Other, Custom);
-  setOperationAction(ISD::VAEND, MVT::Other, Expand);
-  setOperationAction(ISD::VAARG, MVT::Other, Expand);
-
-  setOperationAction(ISD::BlockAddress, MVT::i64, Custom);
-  setOperationAction(ISD::ConstantPool, MVT::i64, Custom);
-
-  setOperationAction(ISD::ROTL, MVT::i32, Expand);
-  setOperationAction(ISD::ROTL, MVT::i64, Expand);
-
-  setOperationAction(ISD::UREM, MVT::i32, Expand);
-  setOperationAction(ISD::UREM, MVT::i64, Expand);
-  setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
-  setOperationAction(ISD::UDIVREM, MVT::i64, Expand);
-
-  setOperationAction(ISD::SREM, MVT::i32, Expand);
-  setOperationAction(ISD::SREM, MVT::i64, Expand);
-  setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
-  setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
-
-  setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand);
-  setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
-  setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand);
-  setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
-
-  setOperationAction(ISD::CTPOP, MVT::i32, Expand);
-  setOperationAction(ISD::CTPOP, MVT::i64, Expand);
-
-  // Legal floating-point operations.
-  setOperationAction(ISD::FABS, MVT::f32, Legal);
-  setOperationAction(ISD::FABS, MVT::f64, Legal);
-
-  setOperationAction(ISD::FCEIL, MVT::f32, Legal);
-  setOperationAction(ISD::FCEIL, MVT::f64, Legal);
-
-  setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
-  setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
-
-  setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal);
-  setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal);
-
-  setOperationAction(ISD::FNEG, MVT::f32, Legal);
-  setOperationAction(ISD::FNEG, MVT::f64, Legal);
-
-  setOperationAction(ISD::FRINT, MVT::f32, Legal);
-  setOperationAction(ISD::FRINT, MVT::f64, Legal);
-
-  setOperationAction(ISD::FSQRT, MVT::f32, Legal);
-  setOperationAction(ISD::FSQRT, MVT::f64, Legal);
-
-  setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
-  setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
-
-  setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
-  setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
-  setOperationAction(ISD::ConstantFP, MVT::f128, Legal);
-
-  // Illegal floating-point operations.
-  setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
-  setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
-
-  setOperationAction(ISD::FCOS, MVT::f32, Expand);
-  setOperationAction(ISD::FCOS, MVT::f64, Expand);
-
-  setOperationAction(ISD::FEXP, MVT::f32, Expand);
-  setOperationAction(ISD::FEXP, MVT::f64, Expand);
-
-  setOperationAction(ISD::FEXP2, MVT::f32, Expand);
-  setOperationAction(ISD::FEXP2, MVT::f64, Expand);
-
-  setOperationAction(ISD::FLOG, MVT::f32, Expand);
-  setOperationAction(ISD::FLOG, MVT::f64, Expand);
-
-  setOperationAction(ISD::FLOG2, MVT::f32, Expand);
-  setOperationAction(ISD::FLOG2, MVT::f64, Expand);
-
-  setOperationAction(ISD::FLOG10, MVT::f32, Expand);
-  setOperationAction(ISD::FLOG10, MVT::f64, Expand);
-
-  setOperationAction(ISD::FPOW, MVT::f32, Expand);
-  setOperationAction(ISD::FPOW, MVT::f64, Expand);
-
-  setOperationAction(ISD::FPOWI, MVT::f32, Expand);
-  setOperationAction(ISD::FPOWI, MVT::f64, Expand);
+  setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
+  setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
+  setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);
 
   setOperationAction(ISD::FREM, MVT::f32, Expand);
   setOperationAction(ISD::FREM, MVT::f64, Expand);
+  setOperationAction(ISD::FREM, MVT::f80, Expand);
 
-  setOperationAction(ISD::FSIN, MVT::f32, Expand);
-  setOperationAction(ISD::FSIN, MVT::f64, Expand);
-
-  setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
-  setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
+  // Custom lowering hooks are needed for XOR
+  // to fold it into CSINC/CSINV.
+  setOperationAction(ISD::XOR, MVT::i32, Custom);
+  setOperationAction(ISD::XOR, MVT::i64, Custom);
 
   // Virtually no operation on f128 is legal, but LLVM can't expand them when
   // there's a valid register class, so we need custom operations in most cases.
-  setOperationAction(ISD::FABS,       MVT::f128, Expand);
-  setOperationAction(ISD::FADD,       MVT::f128, Custom);
-  setOperationAction(ISD::FCOPYSIGN,  MVT::f128, Expand);
-  setOperationAction(ISD::FCOS,       MVT::f128, Expand);
-  setOperationAction(ISD::FDIV,       MVT::f128, Custom);
-  setOperationAction(ISD::FMA,        MVT::f128, Expand);
-  setOperationAction(ISD::FMUL,       MVT::f128, Custom);
-  setOperationAction(ISD::FNEG,       MVT::f128, Expand);
-  setOperationAction(ISD::FP_EXTEND,  MVT::f128, Expand);
-  setOperationAction(ISD::FP_ROUND,   MVT::f128, Expand);
-  setOperationAction(ISD::FPOW,       MVT::f128, Expand);
-  setOperationAction(ISD::FREM,       MVT::f128, Expand);
-  setOperationAction(ISD::FRINT,      MVT::f128, Expand);
-  setOperationAction(ISD::FSIN,       MVT::f128, Expand);
-  setOperationAction(ISD::FSINCOS,    MVT::f128, Expand);
-  setOperationAction(ISD::FSQRT,      MVT::f128, Expand);
-  setOperationAction(ISD::FSUB,       MVT::f128, Custom);
-  setOperationAction(ISD::FTRUNC,     MVT::f128, Expand);
-  setOperationAction(ISD::SETCC,      MVT::f128, Custom);
-  setOperationAction(ISD::BR_CC,      MVT::f128, Custom);
-  setOperationAction(ISD::SELECT,     MVT::f128, Expand);
-  setOperationAction(ISD::SELECT_CC,  MVT::f128, Custom);
-  setOperationAction(ISD::FP_EXTEND,  MVT::f128, Custom);
+  setOperationAction(ISD::FABS, MVT::f128, Expand);
+  setOperationAction(ISD::FADD, MVT::f128, Custom);
+  setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand);
+  setOperationAction(ISD::FCOS, MVT::f128, Expand);
+  setOperationAction(ISD::FDIV, MVT::f128, Custom);
+  setOperationAction(ISD::FMA, MVT::f128, Expand);
+  setOperationAction(ISD::FMUL, MVT::f128, Custom);
+  setOperationAction(ISD::FNEG, MVT::f128, Expand);
+  setOperationAction(ISD::FPOW, MVT::f128, Expand);
+  setOperationAction(ISD::FREM, MVT::f128, Expand);
+  setOperationAction(ISD::FRINT, MVT::f128, Expand);
+  setOperationAction(ISD::FSIN, MVT::f128, Expand);
+  setOperationAction(ISD::FSINCOS, MVT::f128, Expand);
+  setOperationAction(ISD::FSQRT, MVT::f128, Expand);
+  setOperationAction(ISD::FSUB, MVT::f128, Custom);
+  setOperationAction(ISD::FTRUNC, MVT::f128, Expand);
+  setOperationAction(ISD::SETCC, MVT::f128, Custom);
+  setOperationAction(ISD::BR_CC, MVT::f128, Custom);
+  setOperationAction(ISD::SELECT, MVT::f128, Custom);
+  setOperationAction(ISD::SELECT_CC, MVT::f128, Custom);
+  setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
 
   // Lowering for many of the conversions is actually specified by the non-f128
   // type. The LowerXXX function will be trivial when f128 isn't involved.
@@ -266,623 +192,583 @@ AArch64TargetLowering::AArch64TargetLowering(AArch64TargetMachine &TM)
   setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
   setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
   setOperationAction(ISD::UINT_TO_FP, MVT::i128, Custom);
-  setOperationAction(ISD::FP_ROUND,  MVT::f32, Custom);
-  setOperationAction(ISD::FP_ROUND,  MVT::f64, Custom);
-
-  // i128 shift operation support
-  setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
-  setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
-  setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);
+  setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
+  setOperationAction(ISD::FP_ROUND, MVT::f64, Custom);
 
-  // This prevents LLVM trying to compress double constants into a floating
-  // constant-pool entry and trying to load from there. It's of doubtful benefit
-  // for A64: we'd need LDR followed by FCVT, I believe.
-  setLoadExtAction(ISD::EXTLOAD, MVT::f64, Expand);
-  setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
-  setLoadExtAction(ISD::EXTLOAD, MVT::f16, Expand);
+  // Variable arguments.
+  setOperationAction(ISD::VASTART, MVT::Other, Custom);
+  setOperationAction(ISD::VAARG, MVT::Other, Custom);
+  setOperationAction(ISD::VACOPY, MVT::Other, Custom);
+  setOperationAction(ISD::VAEND, MVT::Other, Expand);
 
-  setTruncStoreAction(MVT::f128, MVT::f64, Expand);
-  setTruncStoreAction(MVT::f128, MVT::f32, Expand);
-  setTruncStoreAction(MVT::f128, MVT::f16, Expand);
-  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
-  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
-  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
+  // Variable-sized objects.
+  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
+  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
+  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
 
+  // Exception handling.
+  // FIXME: These are guesses. Has this been defined yet?
   setExceptionPointerRegister(AArch64::X0);
   setExceptionSelectorRegister(AArch64::X1);
 
-  if (Subtarget->hasNEON()) {
-    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v8i8, Expand);
-    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Expand);
-    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i32, Expand);
-    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v1i64, Expand);
-    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v16i8, Expand);
-    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v8i16, Expand);
-    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i32, Expand);
-    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i64, Expand);
-
-    setOperationAction(ISD::BUILD_VECTOR, MVT::v1i8, Custom);
-    setOperationAction(ISD::BUILD_VECTOR, MVT::v8i8, Custom);
-    setOperationAction(ISD::BUILD_VECTOR, MVT::v16i8, Custom);
-    setOperationAction(ISD::BUILD_VECTOR, MVT::v1i16, Custom);
-    setOperationAction(ISD::BUILD_VECTOR, MVT::v4i16, Custom);
-    setOperationAction(ISD::BUILD_VECTOR, MVT::v8i16, Custom);
-    setOperationAction(ISD::BUILD_VECTOR, MVT::v1i32, Custom);
-    setOperationAction(ISD::BUILD_VECTOR, MVT::v2i32, Custom);
-    setOperationAction(ISD::BUILD_VECTOR, MVT::v4i32, Custom);
-    setOperationAction(ISD::BUILD_VECTOR, MVT::v1i64, Custom);
-    setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom);
-    setOperationAction(ISD::BUILD_VECTOR, MVT::v2f32, Custom);
-    setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
-    setOperationAction(ISD::BUILD_VECTOR, MVT::v1f64, Custom);
-    setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom);
-
-    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i8, Custom);
-    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i8, Custom);
-    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i16, Custom);
-    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i16, Custom);
-    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i32, Custom);
-    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4i32, Custom);
-    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v1i64, Custom);
-    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2i64, Custom);
-    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f32, Custom);
-    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
-    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v1f64, Custom);
-    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v2f64, Custom);
-
-    setOperationAction(ISD::CONCAT_VECTORS, MVT::v2i32, Legal);
-    setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i8, Legal);
-    setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i16, Legal);
-    setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Legal);
-    setOperationAction(ISD::CONCAT_VECTORS, MVT::v2i64, Legal);
-    setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32, Legal);
-    setOperationAction(ISD::CONCAT_VECTORS, MVT::v2f64, Legal);
-
-    setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i8, Custom);
-    setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i16, Custom);
-    setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i8, Custom);
-    setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i16, Custom);
-    setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom);
-
-    setOperationAction(ISD::SETCC, MVT::v8i8, Custom);
-    setOperationAction(ISD::SETCC, MVT::v16i8, Custom);
-    setOperationAction(ISD::SETCC, MVT::v4i16, Custom);
-    setOperationAction(ISD::SETCC, MVT::v8i16, Custom);
-    setOperationAction(ISD::SETCC, MVT::v2i32, Custom);
-    setOperationAction(ISD::SETCC, MVT::v4i32, Custom);
-    setOperationAction(ISD::SETCC, MVT::v1i64, Custom);
-    setOperationAction(ISD::SETCC, MVT::v2i64, Custom);
-    setOperationAction(ISD::SETCC, MVT::v2f32, Custom);
-    setOperationAction(ISD::SETCC, MVT::v4f32, Custom);
-    setOperationAction(ISD::SETCC, MVT::v1f64, Custom);
-    setOperationAction(ISD::SETCC, MVT::v2f64, Custom);
-
-    setOperationAction(ISD::FFLOOR, MVT::v2f32, Legal);
-    setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal);
-    setOperationAction(ISD::FFLOOR, MVT::v1f64, Legal);
-    setOperationAction(ISD::FFLOOR, MVT::v2f64, Legal);
-
-    setOperationAction(ISD::FCEIL, MVT::v2f32, Legal);
-    setOperationAction(ISD::FCEIL, MVT::v4f32, Legal);
-    setOperationAction(ISD::FCEIL, MVT::v1f64, Legal);
-    setOperationAction(ISD::FCEIL, MVT::v2f64, Legal);
-
-    setOperationAction(ISD::FTRUNC, MVT::v2f32, Legal);
-    setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal);
-    setOperationAction(ISD::FTRUNC, MVT::v1f64, Legal);
-    setOperationAction(ISD::FTRUNC, MVT::v2f64, Legal);
-
-    setOperationAction(ISD::FRINT, MVT::v2f32, Legal);
-    setOperationAction(ISD::FRINT, MVT::v4f32, Legal);
-    setOperationAction(ISD::FRINT, MVT::v1f64, Legal);
-    setOperationAction(ISD::FRINT, MVT::v2f64, Legal);
-
-    setOperationAction(ISD::FNEARBYINT, MVT::v2f32, Legal);
-    setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Legal);
-    setOperationAction(ISD::FNEARBYINT, MVT::v1f64, Legal);
-    setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Legal);
-
-    setOperationAction(ISD::FROUND, MVT::v2f32, Legal);
-    setOperationAction(ISD::FROUND, MVT::v4f32, Legal);
-    setOperationAction(ISD::FROUND, MVT::v1f64, Legal);
-    setOperationAction(ISD::FROUND, MVT::v2f64, Legal);
-
-    setOperationAction(ISD::SINT_TO_FP, MVT::v1i8, Custom);
-    setOperationAction(ISD::SINT_TO_FP, MVT::v1i16, Custom);
-    setOperationAction(ISD::SINT_TO_FP, MVT::v1i32, Custom);
-    setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Custom);
-    setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
-    setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Custom);
+  // Constant pool entries
+  setOperationAction(ISD::ConstantPool, MVT::i64, Custom);
 
-    setOperationAction(ISD::UINT_TO_FP, MVT::v1i8, Custom);
-    setOperationAction(ISD::UINT_TO_FP, MVT::v1i16, Custom);
-    setOperationAction(ISD::UINT_TO_FP, MVT::v1i32, Custom);
-    setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
-    setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);
-    setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Custom);
+  // BlockAddress
+  setOperationAction(ISD::BlockAddress, MVT::i64, Custom);
 
-    setOperationAction(ISD::FP_TO_SINT, MVT::v1i8, Custom);
-    setOperationAction(ISD::FP_TO_SINT, MVT::v1i16, Custom);
-    setOperationAction(ISD::FP_TO_SINT, MVT::v1i32, Custom);
-    setOperationAction(ISD::FP_TO_SINT, MVT::v4i16, Custom);
-    setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);
-    setOperationAction(ISD::FP_TO_SINT, MVT::v2i64, Custom);
-
-    setOperationAction(ISD::FP_TO_UINT, MVT::v1i8, Custom);
-    setOperationAction(ISD::FP_TO_UINT, MVT::v1i16, Custom);
-    setOperationAction(ISD::FP_TO_UINT, MVT::v1i32, Custom);
-    setOperationAction(ISD::FP_TO_UINT, MVT::v4i16, Custom);
-    setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom);
-    setOperationAction(ISD::FP_TO_UINT, MVT::v2i64, Custom);
-
-    // Neon does not support vector divide/remainder operations except
-    // floating-point divide.
-    setOperationAction(ISD::SDIV, MVT::v1i8, Expand);
-    setOperationAction(ISD::SDIV, MVT::v8i8, Expand);
-    setOperationAction(ISD::SDIV, MVT::v16i8, Expand);
-    setOperationAction(ISD::SDIV, MVT::v1i16, Expand);
-    setOperationAction(ISD::SDIV, MVT::v4i16, Expand);
-    setOperationAction(ISD::SDIV, MVT::v8i16, Expand);
-    setOperationAction(ISD::SDIV, MVT::v1i32, Expand);
-    setOperationAction(ISD::SDIV, MVT::v2i32, Expand);
-    setOperationAction(ISD::SDIV, MVT::v4i32, Expand);
-    setOperationAction(ISD::SDIV, MVT::v1i64, Expand);
-    setOperationAction(ISD::SDIV, MVT::v2i64, Expand);
-
-    setOperationAction(ISD::UDIV, MVT::v1i8, Expand);
-    setOperationAction(ISD::UDIV, MVT::v8i8, Expand);
-    setOperationAction(ISD::UDIV, MVT::v16i8, Expand);
-    setOperationAction(ISD::UDIV, MVT::v1i16, Expand);
-    setOperationAction(ISD::UDIV, MVT::v4i16, Expand);
-    setOperationAction(ISD::UDIV, MVT::v8i16, Expand);
-    setOperationAction(ISD::UDIV, MVT::v1i32, Expand);
-    setOperationAction(ISD::UDIV, MVT::v2i32, Expand);
-    setOperationAction(ISD::UDIV, MVT::v4i32, Expand);
-    setOperationAction(ISD::UDIV, MVT::v1i64, Expand);
-    setOperationAction(ISD::UDIV, MVT::v2i64, Expand);
-
-    setOperationAction(ISD::SREM, MVT::v1i8, Expand);
-    setOperationAction(ISD::SREM, MVT::v8i8, Expand);
-    setOperationAction(ISD::SREM, MVT::v16i8, Expand);
-    setOperationAction(ISD::SREM, MVT::v1i16, Expand);
-    setOperationAction(ISD::SREM, MVT::v4i16, Expand);
-    setOperationAction(ISD::SREM, MVT::v8i16, Expand);
-    setOperationAction(ISD::SREM, MVT::v1i32, Expand);
-    setOperationAction(ISD::SREM, MVT::v2i32, Expand);
-    setOperationAction(ISD::SREM, MVT::v4i32, Expand);
-    setOperationAction(ISD::SREM, MVT::v1i64, Expand);
-    setOperationAction(ISD::SREM, MVT::v2i64, Expand);
-
-    setOperationAction(ISD::UREM, MVT::v1i8, Expand);
-    setOperationAction(ISD::UREM, MVT::v8i8, Expand);
-    setOperationAction(ISD::UREM, MVT::v16i8, Expand);
-    setOperationAction(ISD::UREM, MVT::v1i16, Expand);
-    setOperationAction(ISD::UREM, MVT::v4i16, Expand);
-    setOperationAction(ISD::UREM, MVT::v8i16, Expand);
-    setOperationAction(ISD::UREM, MVT::v1i32, Expand);
-    setOperationAction(ISD::UREM, MVT::v2i32, Expand);
-    setOperationAction(ISD::UREM, MVT::v4i32, Expand);
-    setOperationAction(ISD::UREM, MVT::v1i64, Expand);
-    setOperationAction(ISD::UREM, MVT::v2i64, Expand);
-
-    setOperationAction(ISD::FREM, MVT::v2f32, Expand);
-    setOperationAction(ISD::FREM, MVT::v4f32, Expand);
-    setOperationAction(ISD::FREM, MVT::v1f64, Expand);
-    setOperationAction(ISD::FREM, MVT::v2f64, Expand);
-
-    setOperationAction(ISD::SELECT, MVT::v8i8, Expand);
-    setOperationAction(ISD::SELECT, MVT::v16i8, Expand);
-    setOperationAction(ISD::SELECT, MVT::v4i16, Expand);
-    setOperationAction(ISD::SELECT, MVT::v8i16, Expand);
-    setOperationAction(ISD::SELECT, MVT::v2i32, Expand);
-    setOperationAction(ISD::SELECT, MVT::v4i32, Expand);
-    setOperationAction(ISD::SELECT, MVT::v1i64, Expand);
-    setOperationAction(ISD::SELECT, MVT::v2i64, Expand);
-    setOperationAction(ISD::SELECT, MVT::v2f32, Expand);
-    setOperationAction(ISD::SELECT, MVT::v4f32, Expand);
-    setOperationAction(ISD::SELECT, MVT::v1f64, Expand);
-    setOperationAction(ISD::SELECT, MVT::v2f64, Expand);
-
-    setOperationAction(ISD::SELECT_CC, MVT::v8i8, Custom);
-    setOperationAction(ISD::SELECT_CC, MVT::v16i8, Custom);
-    setOperationAction(ISD::SELECT_CC, MVT::v4i16, Custom);
-    setOperationAction(ISD::SELECT_CC, MVT::v8i16, Custom);
-    setOperationAction(ISD::SELECT_CC, MVT::v2i32, Custom);
-    setOperationAction(ISD::SELECT_CC, MVT::v4i32, Custom);
-    setOperationAction(ISD::SELECT_CC, MVT::v1i64, Custom);
-    setOperationAction(ISD::SELECT_CC, MVT::v2i64, Custom);
-    setOperationAction(ISD::SELECT_CC, MVT::v2f32, Custom);
-    setOperationAction(ISD::SELECT_CC, MVT::v4f32, Custom);
-    setOperationAction(ISD::SELECT_CC, MVT::v1f64, Custom);
-    setOperationAction(ISD::SELECT_CC, MVT::v2f64, Custom);
-
-    // Vector ExtLoad and TruncStore are expanded.
-    for (unsigned I = MVT::FIRST_VECTOR_VALUETYPE;
-         I <= MVT::LAST_VECTOR_VALUETYPE; ++I) {
-      MVT VT = (MVT::SimpleValueType) I;
-      setLoadExtAction(ISD::SEXTLOAD, VT, Expand);
-      setLoadExtAction(ISD::ZEXTLOAD, VT, Expand);
-      setLoadExtAction(ISD::EXTLOAD, VT, Expand);
-      for (unsigned II = MVT::FIRST_VECTOR_VALUETYPE;
-           II <= MVT::LAST_VECTOR_VALUETYPE; ++II) {
-        MVT VT1 = (MVT::SimpleValueType) II;
-        // A TruncStore has two vector types of the same number of elements
-        // and different element sizes.
-        if (VT.getVectorNumElements() == VT1.getVectorNumElements() &&
-            VT.getVectorElementType().getSizeInBits()
-                > VT1.getVectorElementType().getSizeInBits())
-          setTruncStoreAction(VT, VT1, Expand);
-      }
-    }
+  // Add/Sub overflow ops with MVT::Glues are lowered to NZCV dependences.
+  setOperationAction(ISD::ADDC, MVT::i32, Custom);
+  setOperationAction(ISD::ADDE, MVT::i32, Custom);
+  setOperationAction(ISD::SUBC, MVT::i32, Custom);
+  setOperationAction(ISD::SUBE, MVT::i32, Custom);
+  setOperationAction(ISD::ADDC, MVT::i64, Custom);
+  setOperationAction(ISD::ADDE, MVT::i64, Custom);
+  setOperationAction(ISD::SUBC, MVT::i64, Custom);
+  setOperationAction(ISD::SUBE, MVT::i64, Custom);
+
+  // AArch64 lacks both left-rotate and popcount instructions.
+  setOperationAction(ISD::ROTL, MVT::i32, Expand);
+  setOperationAction(ISD::ROTL, MVT::i64, Expand);
 
-    // There is no v1i64/v2i64 multiply, expand v1i64/v2i64 to GPR i64 multiply.
-    // FIXME: For a v2i64 multiply, we copy VPR to GPR and do 2 i64 multiplies,
-    // and then copy back to VPR. This solution may be optimized by Following 3
-    // NEON instructions:
-    //        pmull  v2.1q, v0.1d, v1.1d
-    //        pmull2 v3.1q, v0.2d, v1.2d
-    //        ins    v2.d[1], v3.d[0]
-    // As currently we can't verify the correctness of such assumption, we can
-    // do such optimization in the future.
-    setOperationAction(ISD::MUL, MVT::v1i64, Expand);
-    setOperationAction(ISD::MUL, MVT::v2i64, Expand);
+  // AArch64 doesn't have {U|S}MUL_LOHI.
+  setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
+  setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
 
-    setOperationAction(ISD::FCOS, MVT::v2f64, Expand);
-    setOperationAction(ISD::FCOS, MVT::v4f32, Expand);
-    setOperationAction(ISD::FCOS, MVT::v2f32, Expand);
-    setOperationAction(ISD::FSIN, MVT::v2f64, Expand);
-    setOperationAction(ISD::FSIN, MVT::v4f32, Expand);
-    setOperationAction(ISD::FSIN, MVT::v2f32, Expand);
-    setOperationAction(ISD::FPOW, MVT::v2f64, Expand);
-    setOperationAction(ISD::FPOW, MVT::v4f32, Expand);
-    setOperationAction(ISD::FPOW, MVT::v2f32, Expand);
-  }
 
-  setTargetDAGCombine(ISD::SETCC);
-  setTargetDAGCombine(ISD::SIGN_EXTEND);
-  setTargetDAGCombine(ISD::VSELECT);
-}
+  // Expand the undefined-at-zero variants to cttz/ctlz to their defined-at-zero
+  // counterparts, which AArch64 supports directly.
+  setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand);
+  setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand);
+  setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
+  setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);
 
-EVT AArch64TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
-  // It's reasonably important that this value matches the "natural" legal
-  // promotion from i1 for scalar types. Otherwise LegalizeTypes can get itself
-  // in a twist (e.g. inserting an any_extend which then becomes i64 -> i64).
-  if (!VT.isVector()) return MVT::i32;
-  return VT.changeVectorElementTypeToInteger();
-}
+  setOperationAction(ISD::CTPOP, MVT::i32, Custom);
+  setOperationAction(ISD::CTPOP, MVT::i64, Custom);
 
-static void getExclusiveOperation(unsigned Size, AtomicOrdering Ord,
-                                  unsigned &LdrOpc,
-                                  unsigned &StrOpc) {
-  static const unsigned LoadBares[] = {AArch64::LDXR_byte, AArch64::LDXR_hword,
-                                       AArch64::LDXR_word, AArch64::LDXR_dword};
-  static const unsigned LoadAcqs[] = {AArch64::LDAXR_byte, AArch64::LDAXR_hword,
-                                     AArch64::LDAXR_word, AArch64::LDAXR_dword};
-  static const unsigned StoreBares[] = {AArch64::STXR_byte, AArch64::STXR_hword,
-                                       AArch64::STXR_word, AArch64::STXR_dword};
-  static const unsigned StoreRels[] = {AArch64::STLXR_byte,AArch64::STLXR_hword,
-                                     AArch64::STLXR_word, AArch64::STLXR_dword};
-
-  const unsigned *LoadOps, *StoreOps;
-  if (Ord == Acquire || Ord == AcquireRelease || Ord == SequentiallyConsistent)
-    LoadOps = LoadAcqs;
-  else
-    LoadOps = LoadBares;
+  setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
+  setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
+  setOperationAction(ISD::SREM, MVT::i32, Expand);
+  setOperationAction(ISD::SREM, MVT::i64, Expand);
+  setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
+  setOperationAction(ISD::UDIVREM, MVT::i64, Expand);
+  setOperationAction(ISD::UREM, MVT::i32, Expand);
+  setOperationAction(ISD::UREM, MVT::i64, Expand);
 
-  if (Ord == Release || Ord == AcquireRelease || Ord == SequentiallyConsistent)
-    StoreOps = StoreRels;
-  else
-    StoreOps = StoreBares;
+  // Custom lower Add/Sub/Mul with overflow.
+  setOperationAction(ISD::SADDO, MVT::i32, Custom);
+  setOperationAction(ISD::SADDO, MVT::i64, Custom);
+  setOperationAction(ISD::UADDO, MVT::i32, Custom);
+  setOperationAction(ISD::UADDO, MVT::i64, Custom);
+  setOperationAction(ISD::SSUBO, MVT::i32, Custom);
+  setOperationAction(ISD::SSUBO, MVT::i64, Custom);
+  setOperationAction(ISD::USUBO, MVT::i32, Custom);
+  setOperationAction(ISD::USUBO, MVT::i64, Custom);
+  setOperationAction(ISD::SMULO, MVT::i32, Custom);
+  setOperationAction(ISD::SMULO, MVT::i64, Custom);
+  setOperationAction(ISD::UMULO, MVT::i32, Custom);
+  setOperationAction(ISD::UMULO, MVT::i64, Custom);
 
-  assert(isPowerOf2_32(Size) && Size <= 8 &&
-         "unsupported size for atomic binary op!");
+  setOperationAction(ISD::FSIN, MVT::f32, Expand);
+  setOperationAction(ISD::FSIN, MVT::f64, Expand);
+  setOperationAction(ISD::FCOS, MVT::f32, Expand);
+  setOperationAction(ISD::FCOS, MVT::f64, Expand);
+  setOperationAction(ISD::FPOW, MVT::f32, Expand);
+  setOperationAction(ISD::FPOW, MVT::f64, Expand);
+  setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
+  setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
+
+  // AArch64 has implementations of a lot of rounding-like FP operations.
+  static MVT RoundingTypes[] = { MVT::f32, MVT::f64};
+  for (unsigned I = 0; I < array_lengthof(RoundingTypes); ++I) {
+    MVT Ty = RoundingTypes[I];
+    setOperationAction(ISD::FFLOOR, Ty, Legal);
+    setOperationAction(ISD::FNEARBYINT, Ty, Legal);
+    setOperationAction(ISD::FCEIL, Ty, Legal);
+    setOperationAction(ISD::FRINT, Ty, Legal);
+    setOperationAction(ISD::FTRUNC, Ty, Legal);
+    setOperationAction(ISD::FROUND, Ty, Legal);
+  }
 
-  LdrOpc = LoadOps[Log2_32(Size)];
-  StrOpc = StoreOps[Log2_32(Size)];
-}
+  setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
 
-// FIXME: AArch64::DTripleRegClass and AArch64::QTripleRegClass don't really
-// have value type mapped, and they are both being defined as MVT::untyped.
-// Without knowing the MVT type, MachineLICM::getRegisterClassIDAndCost
-// would fail to figure out the register pressure correctly.
-std::pair<const TargetRegisterClass*, uint8_t>
-AArch64TargetLowering::findRepresentativeClass(MVT VT) const{
-  const TargetRegisterClass *RRC = 0;
-  uint8_t Cost = 1;
-  switch (VT.SimpleTy) {
-  default:
-    return TargetLowering::findRepresentativeClass(VT);
-  case MVT::v4i64:
-    RRC = &AArch64::QPairRegClass;
-    Cost = 2;
-    break;
-  case MVT::v8i64:
-    RRC = &AArch64::QQuadRegClass;
-    Cost = 4;
-    break;
+  if (Subtarget->isTargetMachO()) {
+    // For iOS, we don't want to the normal expansion of a libcall to
+    // sincos. We want to issue a libcall to __sincos_stret to avoid memory
+    // traffic.
+    setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
+    setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
+  } else {
+    setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
+    setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
   }
-  return std::make_pair(RRC, Cost);
-}
 
-MachineBasicBlock *
-AArch64TargetLowering::emitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
-                                        unsigned Size,
-                                        unsigned BinOpcode) const {
-  // This also handles ATOMIC_SWAP, indicated by BinOpcode==0.
-  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  // AArch64 does not have floating-point extending loads, i1 sign-extending
+  // load, floating-point truncating stores, or v2i32->v2i16 truncating store.
+  setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
+  setLoadExtAction(ISD::EXTLOAD, MVT::f64, Expand);
+  setLoadExtAction(ISD::EXTLOAD, MVT::f80, Expand);
+  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Expand);
+  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
+  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
+  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
+  setTruncStoreAction(MVT::f128, MVT::f80, Expand);
+  setTruncStoreAction(MVT::f128, MVT::f64, Expand);
+  setTruncStoreAction(MVT::f128, MVT::f32, Expand);
+  setTruncStoreAction(MVT::f128, MVT::f16, Expand);
+  // Indexed loads and stores are supported.
+  for (unsigned im = (unsigned)ISD::PRE_INC;
+       im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
+    setIndexedLoadAction(im, MVT::i8, Legal);
+    setIndexedLoadAction(im, MVT::i16, Legal);
+    setIndexedLoadAction(im, MVT::i32, Legal);
+    setIndexedLoadAction(im, MVT::i64, Legal);
+    setIndexedLoadAction(im, MVT::f64, Legal);
+    setIndexedLoadAction(im, MVT::f32, Legal);
+    setIndexedStoreAction(im, MVT::i8, Legal);
+    setIndexedStoreAction(im, MVT::i16, Legal);
+    setIndexedStoreAction(im, MVT::i32, Legal);
+    setIndexedStoreAction(im, MVT::i64, Legal);
+    setIndexedStoreAction(im, MVT::f64, Legal);
+    setIndexedStoreAction(im, MVT::f32, Legal);
+  }
 
-  const BasicBlock *LLVM_BB = BB->getBasicBlock();
-  MachineFunction *MF = BB->getParent();
-  MachineFunction::iterator It = BB;
-  ++It;
+  // Trap.
+  setOperationAction(ISD::TRAP, MVT::Other, Legal);
 
-  unsigned dest = MI->getOperand(0).getReg();
-  unsigned ptr = MI->getOperand(1).getReg();
-  unsigned incr = MI->getOperand(2).getReg();
-  AtomicOrdering Ord = static_cast<AtomicOrdering>(MI->getOperand(3).getImm());
-  DebugLoc dl = MI->getDebugLoc();
-
-  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
-
-  unsigned ldrOpc, strOpc;
-  getExclusiveOperation(Size, Ord, ldrOpc, strOpc);
-
-  MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
-  MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
-  MF->insert(It, loopMBB);
-  MF->insert(It, exitMBB);
-
-  // Transfer the remainder of BB and its successor edges to exitMBB.
-  exitMBB->splice(exitMBB->begin(), BB,
-                  std::next(MachineBasicBlock::iterator(MI)), BB->end());
-  exitMBB->transferSuccessorsAndUpdatePHIs(BB);
-
-  const TargetRegisterClass *TRC
-    = Size == 8 ? &AArch64::GPR64RegClass : &AArch64::GPR32RegClass;
-  unsigned scratch = (!BinOpcode) ? incr : MRI.createVirtualRegister(TRC);
-
-  //  thisMBB:
-  //   ...
-  //   fallthrough --> loopMBB
-  BB->addSuccessor(loopMBB);
-
-  //  loopMBB:
-  //   ldxr dest, ptr
-  //   <binop> scratch, dest, incr
-  //   stxr stxr_status, scratch, ptr
-  //   cbnz stxr_status, loopMBB
-  //   fallthrough --> exitMBB
-  BB = loopMBB;
-  BuildMI(BB, dl, TII->get(ldrOpc), dest).addReg(ptr);
-  if (BinOpcode) {
-    // All arithmetic operations we'll be creating are designed to take an extra
-    // shift or extend operand, which we can conveniently set to zero.
-
-    // Operand order needs to go the other way for NAND.
-    if (BinOpcode == AArch64::BICwww_lsl || BinOpcode == AArch64::BICxxx_lsl)
-      BuildMI(BB, dl, TII->get(BinOpcode), scratch)
-        .addReg(incr).addReg(dest).addImm(0);
-    else
-      BuildMI(BB, dl, TII->get(BinOpcode), scratch)
-        .addReg(dest).addReg(incr).addImm(0);
-  }
+  // We combine OR nodes for bitfield operations.
+  setTargetDAGCombine(ISD::OR);
 
-  // From the stxr, the register is GPR32; from the cmp it's GPR32wsp
-  unsigned stxr_status = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
-  MRI.constrainRegClass(stxr_status, &AArch64::GPR32wspRegClass);
+  // Vector add and sub nodes may conceal a high-half opportunity.
+  // Also, try to fold ADD into CSINC/CSINV..
+  setTargetDAGCombine(ISD::ADD);
+  setTargetDAGCombine(ISD::SUB);
 
-  BuildMI(BB, dl, TII->get(strOpc), stxr_status).addReg(scratch).addReg(ptr);
-  BuildMI(BB, dl, TII->get(AArch64::CBNZw))
-    .addReg(stxr_status).addMBB(loopMBB);
+  setTargetDAGCombine(ISD::XOR);
+  setTargetDAGCombine(ISD::SINT_TO_FP);
+  setTargetDAGCombine(ISD::UINT_TO_FP);
 
-  BB->addSuccessor(loopMBB);
-  BB->addSuccessor(exitMBB);
+  setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
 
-  //  exitMBB:
-  //   ...
-  BB = exitMBB;
+  setTargetDAGCombine(ISD::ANY_EXTEND);
+  setTargetDAGCombine(ISD::ZERO_EXTEND);
+  setTargetDAGCombine(ISD::SIGN_EXTEND);
+  setTargetDAGCombine(ISD::BITCAST);
+  setTargetDAGCombine(ISD::CONCAT_VECTORS);
+  setTargetDAGCombine(ISD::STORE);
 
-  MI->eraseFromParent();   // The instruction is gone now.
+  setTargetDAGCombine(ISD::MUL);
 
-  return BB;
-}
+  setTargetDAGCombine(ISD::SELECT);
+  setTargetDAGCombine(ISD::VSELECT);
 
-MachineBasicBlock *
-AArch64TargetLowering::emitAtomicBinaryMinMax(MachineInstr *MI,
-                                              MachineBasicBlock *BB,
-                                              unsigned Size,
-                                              unsigned CmpOp,
-                                              A64CC::CondCodes Cond) const {
-  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  setTargetDAGCombine(ISD::INTRINSIC_VOID);
+  setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
+  setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
 
-  const BasicBlock *LLVM_BB = BB->getBasicBlock();
-  MachineFunction *MF = BB->getParent();
-  MachineFunction::iterator It = BB;
-  ++It;
+  MaxStoresPerMemset = MaxStoresPerMemsetOptSize = 8;
+  MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 4;
+  MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = 4;
 
-  unsigned dest = MI->getOperand(0).getReg();
-  unsigned ptr = MI->getOperand(1).getReg();
-  unsigned incr = MI->getOperand(2).getReg();
-  AtomicOrdering Ord = static_cast<AtomicOrdering>(MI->getOperand(3).getImm());
+  setStackPointerRegisterToSaveRestore(AArch64::SP);
 
-  unsigned oldval = dest;
-  DebugLoc dl = MI->getDebugLoc();
+  setSchedulingPreference(Sched::Hybrid);
 
-  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
-  const TargetRegisterClass *TRC, *TRCsp;
-  if (Size == 8) {
-    TRC = &AArch64::GPR64RegClass;
-    TRCsp = &AArch64::GPR64xspRegClass;
-  } else {
-    TRC = &AArch64::GPR32RegClass;
-    TRCsp = &AArch64::GPR32wspRegClass;
-  }
+  // Enable TBZ/TBNZ
+  MaskAndBranchFoldingIsLegal = true;
 
-  unsigned ldrOpc, strOpc;
-  getExclusiveOperation(Size, Ord, ldrOpc, strOpc);
+  setMinFunctionAlignment(2);
 
-  MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
-  MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
-  MF->insert(It, loopMBB);
-  MF->insert(It, exitMBB);
+  RequireStrictAlign = (Align == StrictAlign);
 
-  // Transfer the remainder of BB and its successor edges to exitMBB.
-  exitMBB->splice(exitMBB->begin(), BB,
-                  std::next(MachineBasicBlock::iterator(MI)), BB->end());
-  exitMBB->transferSuccessorsAndUpdatePHIs(BB);
+  setHasExtractBitsInsn(true);
 
-  unsigned scratch = MRI.createVirtualRegister(TRC);
-  MRI.constrainRegClass(scratch, TRCsp);
+  if (Subtarget->hasNEON()) {
+    // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
+    // silliness like this:
+    setOperationAction(ISD::FABS, MVT::v1f64, Expand);
+    setOperationAction(ISD::FADD, MVT::v1f64, Expand);
+    setOperationAction(ISD::FCEIL, MVT::v1f64, Expand);
+    setOperationAction(ISD::FCOPYSIGN, MVT::v1f64, Expand);
+    setOperationAction(ISD::FCOS, MVT::v1f64, Expand);
+    setOperationAction(ISD::FDIV, MVT::v1f64, Expand);
+    setOperationAction(ISD::FFLOOR, MVT::v1f64, Expand);
+    setOperationAction(ISD::FMA, MVT::v1f64, Expand);
+    setOperationAction(ISD::FMUL, MVT::v1f64, Expand);
+    setOperationAction(ISD::FNEARBYINT, MVT::v1f64, Expand);
+    setOperationAction(ISD::FNEG, MVT::v1f64, Expand);
+    setOperationAction(ISD::FPOW, MVT::v1f64, Expand);
+    setOperationAction(ISD::FREM, MVT::v1f64, Expand);
+    setOperationAction(ISD::FROUND, MVT::v1f64, Expand);
+    setOperationAction(ISD::FRINT, MVT::v1f64, Expand);
+    setOperationAction(ISD::FSIN, MVT::v1f64, Expand);
+    setOperationAction(ISD::FSINCOS, MVT::v1f64, Expand);
+    setOperationAction(ISD::FSQRT, MVT::v1f64, Expand);
+    setOperationAction(ISD::FSUB, MVT::v1f64, Expand);
+    setOperationAction(ISD::FTRUNC, MVT::v1f64, Expand);
+    setOperationAction(ISD::SETCC, MVT::v1f64, Expand);
+    setOperationAction(ISD::BR_CC, MVT::v1f64, Expand);
+    setOperationAction(ISD::SELECT, MVT::v1f64, Expand);
+    setOperationAction(ISD::SELECT_CC, MVT::v1f64, Expand);
+    setOperationAction(ISD::FP_EXTEND, MVT::v1f64, Expand);
 
-  //  thisMBB:
-  //   ...
-  //   fallthrough --> loopMBB
-  BB->addSuccessor(loopMBB);
+    setOperationAction(ISD::FP_TO_SINT, MVT::v1i64, Expand);
+    setOperationAction(ISD::FP_TO_UINT, MVT::v1i64, Expand);
+    setOperationAction(ISD::SINT_TO_FP, MVT::v1i64, Expand);
+    setOperationAction(ISD::UINT_TO_FP, MVT::v1i64, Expand);
+    setOperationAction(ISD::FP_ROUND, MVT::v1f64, Expand);
 
-  //  loopMBB:
-  //   ldxr dest, ptr
-  //   cmp incr, dest (, sign extend if necessary)
-  //   csel scratch, dest, incr, cond
-  //   stxr stxr_status, scratch, ptr
-  //   cbnz stxr_status, loopMBB
-  //   fallthrough --> exitMBB
-  BB = loopMBB;
-  BuildMI(BB, dl, TII->get(ldrOpc), dest).addReg(ptr);
+    setOperationAction(ISD::MUL, MVT::v1i64, Expand);
 
-  // Build compare and cmov instructions.
-  MRI.constrainRegClass(incr, TRCsp);
-  BuildMI(BB, dl, TII->get(CmpOp))
-    .addReg(incr).addReg(oldval).addImm(0);
+    // AArch64 doesn't have a direct vector ->f32 conversion instructions for
+    // elements smaller than i32, so promote the input to i32 first.
+    setOperationAction(ISD::UINT_TO_FP, MVT::v4i8, Promote);
+    setOperationAction(ISD::SINT_TO_FP, MVT::v4i8, Promote);
+    setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Promote);
+    setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Promote);
+    // Similarly, there is no direct i32 -> f64 vector conversion instruction.
+    setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
+    setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);
+    setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Custom);
+    setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Custom);
 
-  BuildMI(BB, dl, TII->get(Size == 8 ? AArch64::CSELxxxc : AArch64::CSELwwwc),
-          scratch)
-    .addReg(oldval).addReg(incr).addImm(Cond);
+    // AArch64 doesn't have MUL.2d:
+    setOperationAction(ISD::MUL, MVT::v2i64, Expand);
+    setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Legal);
+    setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
+    // Likewise, narrowing and extending vector loads/stores aren't handled
+    // directly.
+    for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
+         VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) {
+
+      setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT,
+                         Expand);
+
+      setOperationAction(ISD::MULHS, (MVT::SimpleValueType)VT, Expand);
+      setOperationAction(ISD::SMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
+      setOperationAction(ISD::MULHU, (MVT::SimpleValueType)VT, Expand);
+      setOperationAction(ISD::UMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
+
+      setOperationAction(ISD::BSWAP, (MVT::SimpleValueType)VT, Expand);
+
+      for (unsigned InnerVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
+           InnerVT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
+        setTruncStoreAction((MVT::SimpleValueType)VT,
+                            (MVT::SimpleValueType)InnerVT, Expand);
+      setLoadExtAction(ISD::SEXTLOAD, (MVT::SimpleValueType)VT, Expand);
+      setLoadExtAction(ISD::ZEXTLOAD, (MVT::SimpleValueType)VT, Expand);
+      setLoadExtAction(ISD::EXTLOAD, (MVT::SimpleValueType)VT, Expand);
+    }
 
-  unsigned stxr_status = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
-  MRI.constrainRegClass(stxr_status, &AArch64::GPR32wspRegClass);
+    // AArch64 has implementations of a lot of rounding-like FP operations.
+    static MVT RoundingVecTypes[] = {MVT::v2f32, MVT::v4f32, MVT::v2f64 };
+    for (unsigned I = 0; I < array_lengthof(RoundingVecTypes); ++I) {
+      MVT Ty = RoundingVecTypes[I];
+      setOperationAction(ISD::FFLOOR, Ty, Legal);
+      setOperationAction(ISD::FNEARBYINT, Ty, Legal);
+      setOperationAction(ISD::FCEIL, Ty, Legal);
+      setOperationAction(ISD::FRINT, Ty, Legal);
+      setOperationAction(ISD::FTRUNC, Ty, Legal);
+      setOperationAction(ISD::FROUND, Ty, Legal);
+    }
+  }
+}
 
-  BuildMI(BB, dl, TII->get(strOpc), stxr_status)
-    .addReg(scratch).addReg(ptr);
-  BuildMI(BB, dl, TII->get(AArch64::CBNZw))
-    .addReg(stxr_status).addMBB(loopMBB);
+void AArch64TargetLowering::addTypeForNEON(EVT VT, EVT PromotedBitwiseVT) {
+  if (VT == MVT::v2f32) {
+    setOperationAction(ISD::LOAD, VT.getSimpleVT(), Promote);
+    AddPromotedToType(ISD::LOAD, VT.getSimpleVT(), MVT::v2i32);
 
-  BB->addSuccessor(loopMBB);
-  BB->addSuccessor(exitMBB);
+    setOperationAction(ISD::STORE, VT.getSimpleVT(), Promote);
+    AddPromotedToType(ISD::STORE, VT.getSimpleVT(), MVT::v2i32);
+  } else if (VT == MVT::v2f64 || VT == MVT::v4f32) {
+    setOperationAction(ISD::LOAD, VT.getSimpleVT(), Promote);
+    AddPromotedToType(ISD::LOAD, VT.getSimpleVT(), MVT::v2i64);
 
-  //  exitMBB:
-  //   ...
-  BB = exitMBB;
+    setOperationAction(ISD::STORE, VT.getSimpleVT(), Promote);
+    AddPromotedToType(ISD::STORE, VT.getSimpleVT(), MVT::v2i64);
+  }
 
-  MI->eraseFromParent();   // The instruction is gone now.
+  // Mark vector float intrinsics as expand.
+  if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) {
+    setOperationAction(ISD::FSIN, VT.getSimpleVT(), Expand);
+    setOperationAction(ISD::FCOS, VT.getSimpleVT(), Expand);
+    setOperationAction(ISD::FPOWI, VT.getSimpleVT(), Expand);
+    setOperationAction(ISD::FPOW, VT.getSimpleVT(), Expand);
+    setOperationAction(ISD::FLOG, VT.getSimpleVT(), Expand);
+    setOperationAction(ISD::FLOG2, VT.getSimpleVT(), Expand);
+    setOperationAction(ISD::FLOG10, VT.getSimpleVT(), Expand);
+    setOperationAction(ISD::FEXP, VT.getSimpleVT(), Expand);
+    setOperationAction(ISD::FEXP2, VT.getSimpleVT(), Expand);
+  }
 
-  return BB;
+  setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT.getSimpleVT(), Custom);
+  setOperationAction(ISD::INSERT_VECTOR_ELT, VT.getSimpleVT(), Custom);
+  setOperationAction(ISD::BUILD_VECTOR, VT.getSimpleVT(), Custom);
+  setOperationAction(ISD::VECTOR_SHUFFLE, VT.getSimpleVT(), Custom);
+  setOperationAction(ISD::EXTRACT_SUBVECTOR, VT.getSimpleVT(), Custom);
+  setOperationAction(ISD::SRA, VT.getSimpleVT(), Custom);
+  setOperationAction(ISD::SRL, VT.getSimpleVT(), Custom);
+  setOperationAction(ISD::SHL, VT.getSimpleVT(), Custom);
+  setOperationAction(ISD::AND, VT.getSimpleVT(), Custom);
+  setOperationAction(ISD::OR, VT.getSimpleVT(), Custom);
+  setOperationAction(ISD::SETCC, VT.getSimpleVT(), Custom);
+  setOperationAction(ISD::CONCAT_VECTORS, VT.getSimpleVT(), Legal);
+
+  setOperationAction(ISD::SELECT, VT.getSimpleVT(), Expand);
+  setOperationAction(ISD::SELECT_CC, VT.getSimpleVT(), Expand);
+  setOperationAction(ISD::VSELECT, VT.getSimpleVT(), Expand);
+  setLoadExtAction(ISD::EXTLOAD, VT.getSimpleVT(), Expand);
+
+  // CNT supports only B element sizes.
+  if (VT != MVT::v8i8 && VT != MVT::v16i8)
+    setOperationAction(ISD::CTPOP, VT.getSimpleVT(), Expand);
+
+  setOperationAction(ISD::UDIV, VT.getSimpleVT(), Expand);
+  setOperationAction(ISD::SDIV, VT.getSimpleVT(), Expand);
+  setOperationAction(ISD::UREM, VT.getSimpleVT(), Expand);
+  setOperationAction(ISD::SREM, VT.getSimpleVT(), Expand);
+  setOperationAction(ISD::FREM, VT.getSimpleVT(), Expand);
+
+  setOperationAction(ISD::FP_TO_SINT, VT.getSimpleVT(), Custom);
+  setOperationAction(ISD::FP_TO_UINT, VT.getSimpleVT(), Custom);
+
+  if (Subtarget->isLittleEndian()) {
+    for (unsigned im = (unsigned)ISD::PRE_INC;
+         im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
+      setIndexedLoadAction(im, VT.getSimpleVT(), Legal);
+      setIndexedStoreAction(im, VT.getSimpleVT(), Legal);
+    }
+  }
 }
 
-MachineBasicBlock *
-AArch64TargetLowering::emitAtomicCmpSwap(MachineInstr *MI,
-                                         MachineBasicBlock *BB,
-                                         unsigned Size) const {
-  unsigned dest    = MI->getOperand(0).getReg();
-  unsigned ptr     = MI->getOperand(1).getReg();
-  unsigned oldval  = MI->getOperand(2).getReg();
-  unsigned newval  = MI->getOperand(3).getReg();
-  AtomicOrdering Ord = static_cast<AtomicOrdering>(MI->getOperand(4).getImm());
-  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
-  DebugLoc dl = MI->getDebugLoc();
-
-  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
-  const TargetRegisterClass *TRCsp;
-  TRCsp = Size == 8 ? &AArch64::GPR64xspRegClass : &AArch64::GPR32wspRegClass;
-
-  unsigned ldrOpc, strOpc;
-  getExclusiveOperation(Size, Ord, ldrOpc, strOpc);
-
-  MachineFunction *MF = BB->getParent();
-  const BasicBlock *LLVM_BB = BB->getBasicBlock();
-  MachineFunction::iterator It = BB;
-  ++It; // insert the new blocks after the current block
-
-  MachineBasicBlock *loop1MBB = MF->CreateMachineBasicBlock(LLVM_BB);
-  MachineBasicBlock *loop2MBB = MF->CreateMachineBasicBlock(LLVM_BB);
-  MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
-  MF->insert(It, loop1MBB);
-  MF->insert(It, loop2MBB);
-  MF->insert(It, exitMBB);
-
-  // Transfer the remainder of BB and its successor edges to exitMBB.
-  exitMBB->splice(exitMBB->begin(), BB,
-                  std::next(MachineBasicBlock::iterator(MI)), BB->end());
-  exitMBB->transferSuccessorsAndUpdatePHIs(BB);
-
-  //  thisMBB:
-  //   ...
-  //   fallthrough --> loop1MBB
-  BB->addSuccessor(loop1MBB);
-
-  // loop1MBB:
-  //   ldxr dest, [ptr]
-  //   cmp dest, oldval
-  //   b.ne exitMBB
-  BB = loop1MBB;
-  BuildMI(BB, dl, TII->get(ldrOpc), dest).addReg(ptr);
-
-  unsigned CmpOp = Size == 8 ? AArch64::CMPxx_lsl : AArch64::CMPww_lsl;
-  MRI.constrainRegClass(dest, TRCsp);
-  BuildMI(BB, dl, TII->get(CmpOp))
-    .addReg(dest).addReg(oldval).addImm(0);
-  BuildMI(BB, dl, TII->get(AArch64::Bcc))
-    .addImm(A64CC::NE).addMBB(exitMBB);
-  BB->addSuccessor(loop2MBB);
-  BB->addSuccessor(exitMBB);
-
-  // loop2MBB:
-  //   strex stxr_status, newval, [ptr]
-  //   cbnz stxr_status, loop1MBB
-  BB = loop2MBB;
-  unsigned stxr_status = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
-  MRI.constrainRegClass(stxr_status, &AArch64::GPR32wspRegClass);
-
-  BuildMI(BB, dl, TII->get(strOpc), stxr_status).addReg(newval).addReg(ptr);
-  BuildMI(BB, dl, TII->get(AArch64::CBNZw))
-    .addReg(stxr_status).addMBB(loop1MBB);
-  BB->addSuccessor(loop1MBB);
-  BB->addSuccessor(exitMBB);
-
-  //  exitMBB:
-  //   ...
-  BB = exitMBB;
-
-  MI->eraseFromParent();   // The instruction is gone now.
-
-  return BB;
+void AArch64TargetLowering::addDRTypeForNEON(MVT VT) {
+  addRegisterClass(VT, &AArch64::FPR64RegClass);
+  addTypeForNEON(VT, MVT::v2i32);
+}
+
+void AArch64TargetLowering::addQRTypeForNEON(MVT VT) {
+  addRegisterClass(VT, &AArch64::FPR128RegClass);
+  addTypeForNEON(VT, MVT::v4i32);
+}
+
+EVT AArch64TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
+  if (!VT.isVector())
+    return MVT::i32;
+  return VT.changeVectorElementTypeToInteger();
+}
+
+/// computeKnownBitsForTargetNode - Determine which of the bits specified in
+/// Mask are known to be either zero or one and return them in the
+/// KnownZero/KnownOne bitsets.
+void AArch64TargetLowering::computeKnownBitsForTargetNode(
+    const SDValue Op, APInt &KnownZero, APInt &KnownOne,
+    const SelectionDAG &DAG, unsigned Depth) const {
+  switch (Op.getOpcode()) {
+  default:
+    break;
+  case AArch64ISD::CSEL: {
+    APInt KnownZero2, KnownOne2;
+    DAG.computeKnownBits(Op->getOperand(0), KnownZero, KnownOne, Depth + 1);
+    DAG.computeKnownBits(Op->getOperand(1), KnownZero2, KnownOne2, Depth + 1);
+    KnownZero &= KnownZero2;
+    KnownOne &= KnownOne2;
+    break;
+  }
+  case ISD::INTRINSIC_W_CHAIN: {
+   ConstantSDNode *CN = cast<ConstantSDNode>(Op->getOperand(1));
+    Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
+    switch (IntID) {
+    default: return;
+    case Intrinsic::aarch64_ldaxr:
+    case Intrinsic::aarch64_ldxr: {
+      unsigned BitWidth = KnownOne.getBitWidth();
+      EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
+      unsigned MemBits = VT.getScalarType().getSizeInBits();
+      KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
+      return;
+    }
+    }
+    break;
+  }
+  case ISD::INTRINSIC_WO_CHAIN:
+  case ISD::INTRINSIC_VOID: {
+    unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+    switch (IntNo) {
+    default:
+      break;
+    case Intrinsic::aarch64_neon_umaxv:
+    case Intrinsic::aarch64_neon_uminv: {
+      // Figure out the datatype of the vector operand. The UMINV instruction
+      // will zero extend the result, so we can mark as known zero all the
+      // bits larger than the element datatype. 32-bit or larget doesn't need
+      // this as those are legal types and will be handled by isel directly.
+      MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
+      unsigned BitWidth = KnownZero.getBitWidth();
+      if (VT == MVT::v8i8 || VT == MVT::v16i8) {
+        assert(BitWidth >= 8 && "Unexpected width!");
+        APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 8);
+        KnownZero |= Mask;
+      } else if (VT == MVT::v4i16 || VT == MVT::v8i16) {
+        assert(BitWidth >= 16 && "Unexpected width!");
+        APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
+        KnownZero |= Mask;
+      }
+      break;
+    } break;
+    }
+  }
+  }
+}
+
+MVT AArch64TargetLowering::getScalarShiftAmountTy(EVT LHSTy) const {
+  return MVT::i64;
+}
+
+unsigned AArch64TargetLowering::getMaximalGlobalOffset() const {
+  // FIXME: On AArch64, this depends on the type.
+  // Basically, the addressable offsets are o to 4095 * Ty.getSizeInBytes().
+  // and the offset has to be a multiple of the related size in bytes.
+  return 4095;
+}
+
+FastISel *
+AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
+                                      const TargetLibraryInfo *libInfo) const {
+  return AArch64::createFastISel(funcInfo, libInfo);
+}
+
+const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
+  switch (Opcode) {
+  default:
+    return nullptr;
+  case AArch64ISD::CALL:              return "AArch64ISD::CALL";
+  case AArch64ISD::ADRP:              return "AArch64ISD::ADRP";
+  case AArch64ISD::ADDlow:            return "AArch64ISD::ADDlow";
+  case AArch64ISD::LOADgot:           return "AArch64ISD::LOADgot";
+  case AArch64ISD::RET_FLAG:          return "AArch64ISD::RET_FLAG";
+  case AArch64ISD::BRCOND:            return "AArch64ISD::BRCOND";
+  case AArch64ISD::CSEL:              return "AArch64ISD::CSEL";
+  case AArch64ISD::FCSEL:             return "AArch64ISD::FCSEL";
+  case AArch64ISD::CSINV:             return "AArch64ISD::CSINV";
+  case AArch64ISD::CSNEG:             return "AArch64ISD::CSNEG";
+  case AArch64ISD::CSINC:             return "AArch64ISD::CSINC";
+  case AArch64ISD::THREAD_POINTER:    return "AArch64ISD::THREAD_POINTER";
+  case AArch64ISD::TLSDESC_CALL:      return "AArch64ISD::TLSDESC_CALL";
+  case AArch64ISD::ADC:               return "AArch64ISD::ADC";
+  case AArch64ISD::SBC:               return "AArch64ISD::SBC";
+  case AArch64ISD::ADDS:              return "AArch64ISD::ADDS";
+  case AArch64ISD::SUBS:              return "AArch64ISD::SUBS";
+  case AArch64ISD::ADCS:              return "AArch64ISD::ADCS";
+  case AArch64ISD::SBCS:              return "AArch64ISD::SBCS";
+  case AArch64ISD::ANDS:              return "AArch64ISD::ANDS";
+  case AArch64ISD::FCMP:              return "AArch64ISD::FCMP";
+  case AArch64ISD::FMIN:              return "AArch64ISD::FMIN";
+  case AArch64ISD::FMAX:              return "AArch64ISD::FMAX";
+  case AArch64ISD::DUP:               return "AArch64ISD::DUP";
+  case AArch64ISD::DUPLANE8:          return "AArch64ISD::DUPLANE8";
+  case AArch64ISD::DUPLANE16:         return "AArch64ISD::DUPLANE16";
+  case AArch64ISD::DUPLANE32:         return "AArch64ISD::DUPLANE32";
+  case AArch64ISD::DUPLANE64:         return "AArch64ISD::DUPLANE64";
+  case AArch64ISD::MOVI:              return "AArch64ISD::MOVI";
+  case AArch64ISD::MOVIshift:         return "AArch64ISD::MOVIshift";
+  case AArch64ISD::MOVIedit:          return "AArch64ISD::MOVIedit";
+  case AArch64ISD::MOVImsl:           return "AArch64ISD::MOVImsl";
+  case AArch64ISD::FMOV:              return "AArch64ISD::FMOV";
+  case AArch64ISD::MVNIshift:         return "AArch64ISD::MVNIshift";
+  case AArch64ISD::MVNImsl:           return "AArch64ISD::MVNImsl";
+  case AArch64ISD::BICi:              return "AArch64ISD::BICi";
+  case AArch64ISD::ORRi:              return "AArch64ISD::ORRi";
+  case AArch64ISD::BSL:               return "AArch64ISD::BSL";
+  case AArch64ISD::NEG:               return "AArch64ISD::NEG";
+  case AArch64ISD::EXTR:              return "AArch64ISD::EXTR";
+  case AArch64ISD::ZIP1:              return "AArch64ISD::ZIP1";
+  case AArch64ISD::ZIP2:              return "AArch64ISD::ZIP2";
+  case AArch64ISD::UZP1:              return "AArch64ISD::UZP1";
+  case AArch64ISD::UZP2:              return "AArch64ISD::UZP2";
+  case AArch64ISD::TRN1:              return "AArch64ISD::TRN1";
+  case AArch64ISD::TRN2:              return "AArch64ISD::TRN2";
+  case AArch64ISD::REV16:             return "AArch64ISD::REV16";
+  case AArch64ISD::REV32:             return "AArch64ISD::REV32";
+  case AArch64ISD::REV64:             return "AArch64ISD::REV64";
+  case AArch64ISD::EXT:               return "AArch64ISD::EXT";
+  case AArch64ISD::VSHL:              return "AArch64ISD::VSHL";
+  case AArch64ISD::VLSHR:             return "AArch64ISD::VLSHR";
+  case AArch64ISD::VASHR:             return "AArch64ISD::VASHR";
+  case AArch64ISD::CMEQ:              return "AArch64ISD::CMEQ";
+  case AArch64ISD::CMGE:              return "AArch64ISD::CMGE";
+  case AArch64ISD::CMGT:              return "AArch64ISD::CMGT";
+  case AArch64ISD::CMHI:              return "AArch64ISD::CMHI";
+  case AArch64ISD::CMHS:              return "AArch64ISD::CMHS";
+  case AArch64ISD::FCMEQ:             return "AArch64ISD::FCMEQ";
+  case AArch64ISD::FCMGE:             return "AArch64ISD::FCMGE";
+  case AArch64ISD::FCMGT:             return "AArch64ISD::FCMGT";
+  case AArch64ISD::CMEQz:             return "AArch64ISD::CMEQz";
+  case AArch64ISD::CMGEz:             return "AArch64ISD::CMGEz";
+  case AArch64ISD::CMGTz:             return "AArch64ISD::CMGTz";
+  case AArch64ISD::CMLEz:             return "AArch64ISD::CMLEz";
+  case AArch64ISD::CMLTz:             return "AArch64ISD::CMLTz";
+  case AArch64ISD::FCMEQz:            return "AArch64ISD::FCMEQz";
+  case AArch64ISD::FCMGEz:            return "AArch64ISD::FCMGEz";
+  case AArch64ISD::FCMGTz:            return "AArch64ISD::FCMGTz";
+  case AArch64ISD::FCMLEz:            return "AArch64ISD::FCMLEz";
+  case AArch64ISD::FCMLTz:            return "AArch64ISD::FCMLTz";
+  case AArch64ISD::NOT:               return "AArch64ISD::NOT";
+  case AArch64ISD::BIT:               return "AArch64ISD::BIT";
+  case AArch64ISD::CBZ:               return "AArch64ISD::CBZ";
+  case AArch64ISD::CBNZ:              return "AArch64ISD::CBNZ";
+  case AArch64ISD::TBZ:               return "AArch64ISD::TBZ";
+  case AArch64ISD::TBNZ:              return "AArch64ISD::TBNZ";
+  case AArch64ISD::TC_RETURN:         return "AArch64ISD::TC_RETURN";
+  case AArch64ISD::SITOF:             return "AArch64ISD::SITOF";
+  case AArch64ISD::UITOF:             return "AArch64ISD::UITOF";
+  case AArch64ISD::SQSHL_I:           return "AArch64ISD::SQSHL_I";
+  case AArch64ISD::UQSHL_I:           return "AArch64ISD::UQSHL_I";
+  case AArch64ISD::SRSHR_I:           return "AArch64ISD::SRSHR_I";
+  case AArch64ISD::URSHR_I:           return "AArch64ISD::URSHR_I";
+  case AArch64ISD::SQSHLU_I:          return "AArch64ISD::SQSHLU_I";
+  case AArch64ISD::WrapperLarge:      return "AArch64ISD::WrapperLarge";
+  case AArch64ISD::LD2post:           return "AArch64ISD::LD2post";
+  case AArch64ISD::LD3post:           return "AArch64ISD::LD3post";
+  case AArch64ISD::LD4post:           return "AArch64ISD::LD4post";
+  case AArch64ISD::ST2post:           return "AArch64ISD::ST2post";
+  case AArch64ISD::ST3post:           return "AArch64ISD::ST3post";
+  case AArch64ISD::ST4post:           return "AArch64ISD::ST4post";
+  case AArch64ISD::LD1x2post:         return "AArch64ISD::LD1x2post";
+  case AArch64ISD::LD1x3post:         return "AArch64ISD::LD1x3post";
+  case AArch64ISD::LD1x4post:         return "AArch64ISD::LD1x4post";
+  case AArch64ISD::ST1x2post:         return "AArch64ISD::ST1x2post";
+  case AArch64ISD::ST1x3post:         return "AArch64ISD::ST1x3post";
+  case AArch64ISD::ST1x4post:         return "AArch64ISD::ST1x4post";
+  case AArch64ISD::LD1DUPpost:        return "AArch64ISD::LD1DUPpost";
+  case AArch64ISD::LD2DUPpost:        return "AArch64ISD::LD2DUPpost";
+  case AArch64ISD::LD3DUPpost:        return "AArch64ISD::LD3DUPpost";
+  case AArch64ISD::LD4DUPpost:        return "AArch64ISD::LD4DUPpost";
+  case AArch64ISD::LD1LANEpost:       return "AArch64ISD::LD1LANEpost";
+  case AArch64ISD::LD2LANEpost:       return "AArch64ISD::LD2LANEpost";
+  case AArch64ISD::LD3LANEpost:       return "AArch64ISD::LD3LANEpost";
+  case AArch64ISD::LD4LANEpost:       return "AArch64ISD::LD4LANEpost";
+  case AArch64ISD::ST2LANEpost:       return "AArch64ISD::ST2LANEpost";
+  case AArch64ISD::ST3LANEpost:       return "AArch64ISD::ST3LANEpost";
+  case AArch64ISD::ST4LANEpost:       return "AArch64ISD::ST4LANEpost";
+  }
 }
 
 MachineBasicBlock *
 AArch64TargetLowering::EmitF128CSEL(MachineInstr *MI,
                                     MachineBasicBlock *MBB) const {
-  // We materialise the F128CSEL pseudo-instruction using conditional branches
-  // and loads, giving an instruciton sequence like:
-  //     str q0, [sp]
-  //     b.ne IfTrue
-  //     b Finish
-  // IfTrue:
-  //     str q1, [sp]
-  // Finish:
-  //     ldr q0, [sp]
-  //
-  // Using virtual registers would probably not be beneficial since COPY
-  // instructions are expensive for f128 (there's no actual instruction to
-  // implement them).
-  //
-  // An alternative would be to do an integer-CSEL on some address. E.g.:
-  //     mov x0, sp
-  //     add x1, sp, #16
-  //     str q0, [x0]
-  //     str q1, [x1]
-  //     csel x0, x0, x1, ne
-  //     ldr q0, [x0]
-  //
-  // It's unclear which approach is actually optimal.
+  // We materialise the F128CSEL pseudo-instruction as some control flow and a
+  // phi node:
+
+  // OrigBB:
+  //     [... previous instrs leading to comparison ...]
+  //     b.ne TrueBB
+  //     b EndBB
+  // TrueBB:
+  //     ; Fallthrough
+  // EndBB:
+  //     Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]
+
   const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
   MachineFunction *MF = MBB->getParent();
   const BasicBlock *LLVM_BB = MBB->getBasicBlock();
@@ -906,49 +792,24 @@ AArch64TargetLowering::EmitF128CSEL(MachineInstr *MI,
                 MBB->end());
   EndBB->transferSuccessorsAndUpdatePHIs(MBB);
 
-  // We need somewhere to store the f128 value needed.
-  int ScratchFI = MF->getFrameInfo()->CreateSpillStackObject(16, 16);
-
-  //     [... start of incoming MBB ...]
-  //     str qIFFALSE, [sp]
-  //     b.cc IfTrue
-  //     b Done
-  BuildMI(MBB, DL, TII->get(AArch64::LSFP128_STR))
-    .addReg(IfFalseReg)
-    .addFrameIndex(ScratchFI)
-    .addImm(0);
-  BuildMI(MBB, DL, TII->get(AArch64::Bcc))
-    .addImm(CondCode)
-    .addMBB(TrueBB);
-  BuildMI(MBB, DL, TII->get(AArch64::Bimm))
-    .addMBB(EndBB);
+  BuildMI(MBB, DL, TII->get(AArch64::Bcc)).addImm(CondCode).addMBB(TrueBB);
+  BuildMI(MBB, DL, TII->get(AArch64::B)).addMBB(EndBB);
   MBB->addSuccessor(TrueBB);
   MBB->addSuccessor(EndBB);
 
+  // TrueBB falls through to the end.
+  TrueBB->addSuccessor(EndBB);
+
   if (!NZCVKilled) {
-    // NZCV is live-through TrueBB.
     TrueBB->addLiveIn(AArch64::NZCV);
     EndBB->addLiveIn(AArch64::NZCV);
   }
 
-  // IfTrue:
-  //     str qIFTRUE, [sp]
-  BuildMI(TrueBB, DL, TII->get(AArch64::LSFP128_STR))
-    .addReg(IfTrueReg)
-    .addFrameIndex(ScratchFI)
-    .addImm(0);
-
-  // Note: fallthrough. We can rely on LLVM adding a branch if it reorders the
-  // blocks.
-  TrueBB->addSuccessor(EndBB);
-
-  // Done:
-  //     ldr qDEST, [sp]
-  //     [... rest of incoming MBB ...]
-  MachineInstr *StartOfEnd = EndBB->begin();
-  BuildMI(*EndBB, StartOfEnd, DL, TII->get(AArch64::LSFP128_LDR), DestReg)
-    .addFrameIndex(ScratchFI)
-    .addImm(0);
+  BuildMI(*EndBB, EndBB->begin(), DL, TII->get(AArch64::PHI), DestReg)
+      .addReg(IfTrueReg)
+      .addMBB(TrueBB)
+      .addReg(IfFalseReg)
+      .addMBB(MBB);
 
   MI->eraseFromParent();
   return EndBB;
@@ -956,853 +817,1140 @@ AArch64TargetLowering::EmitF128CSEL(MachineInstr *MI,
 
 MachineBasicBlock *
 AArch64TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
-                                                 MachineBasicBlock *MBB) const {
+                                                 MachineBasicBlock *BB) const {
   switch (MI->getOpcode()) {
-  default: llvm_unreachable("Unhandled instruction with custom inserter");
+  default:
+#ifndef NDEBUG
+    MI->dump();
+#endif
+    assert(0 && "Unexpected instruction for custom inserter!");
+    break;
+
   case AArch64::F128CSEL:
-    return EmitF128CSEL(MI, MBB);
-  case AArch64::ATOMIC_LOAD_ADD_I8:
-    return emitAtomicBinary(MI, MBB, 1, AArch64::ADDwww_lsl);
-  case AArch64::ATOMIC_LOAD_ADD_I16:
-    return emitAtomicBinary(MI, MBB, 2, AArch64::ADDwww_lsl);
-  case AArch64::ATOMIC_LOAD_ADD_I32:
-    return emitAtomicBinary(MI, MBB, 4, AArch64::ADDwww_lsl);
-  case AArch64::ATOMIC_LOAD_ADD_I64:
-    return emitAtomicBinary(MI, MBB, 8, AArch64::ADDxxx_lsl);
-
-  case AArch64::ATOMIC_LOAD_SUB_I8:
-    return emitAtomicBinary(MI, MBB, 1, AArch64::SUBwww_lsl);
-  case AArch64::ATOMIC_LOAD_SUB_I16:
-    return emitAtomicBinary(MI, MBB, 2, AArch64::SUBwww_lsl);
-  case AArch64::ATOMIC_LOAD_SUB_I32:
-    return emitAtomicBinary(MI, MBB, 4, AArch64::SUBwww_lsl);
-  case AArch64::ATOMIC_LOAD_SUB_I64:
-    return emitAtomicBinary(MI, MBB, 8, AArch64::SUBxxx_lsl);
-
-  case AArch64::ATOMIC_LOAD_AND_I8:
-    return emitAtomicBinary(MI, MBB, 1, AArch64::ANDwww_lsl);
-  case AArch64::ATOMIC_LOAD_AND_I16:
-    return emitAtomicBinary(MI, MBB, 2, AArch64::ANDwww_lsl);
-  case AArch64::ATOMIC_LOAD_AND_I32:
-    return emitAtomicBinary(MI, MBB, 4, AArch64::ANDwww_lsl);
-  case AArch64::ATOMIC_LOAD_AND_I64:
-    return emitAtomicBinary(MI, MBB, 8, AArch64::ANDxxx_lsl);
-
-  case AArch64::ATOMIC_LOAD_OR_I8:
-    return emitAtomicBinary(MI, MBB, 1, AArch64::ORRwww_lsl);
-  case AArch64::ATOMIC_LOAD_OR_I16:
-    return emitAtomicBinary(MI, MBB, 2, AArch64::ORRwww_lsl);
-  case AArch64::ATOMIC_LOAD_OR_I32:
-    return emitAtomicBinary(MI, MBB, 4, AArch64::ORRwww_lsl);
-  case AArch64::ATOMIC_LOAD_OR_I64:
-    return emitAtomicBinary(MI, MBB, 8, AArch64::ORRxxx_lsl);
-
-  case AArch64::ATOMIC_LOAD_XOR_I8:
-    return emitAtomicBinary(MI, MBB, 1, AArch64::EORwww_lsl);
-  case AArch64::ATOMIC_LOAD_XOR_I16:
-    return emitAtomicBinary(MI, MBB, 2, AArch64::EORwww_lsl);
-  case AArch64::ATOMIC_LOAD_XOR_I32:
-    return emitAtomicBinary(MI, MBB, 4, AArch64::EORwww_lsl);
-  case AArch64::ATOMIC_LOAD_XOR_I64:
-    return emitAtomicBinary(MI, MBB, 8, AArch64::EORxxx_lsl);
-
-  case AArch64::ATOMIC_LOAD_NAND_I8:
-    return emitAtomicBinary(MI, MBB, 1, AArch64::BICwww_lsl);
-  case AArch64::ATOMIC_LOAD_NAND_I16:
-    return emitAtomicBinary(MI, MBB, 2, AArch64::BICwww_lsl);
-  case AArch64::ATOMIC_LOAD_NAND_I32:
-    return emitAtomicBinary(MI, MBB, 4, AArch64::BICwww_lsl);
-  case AArch64::ATOMIC_LOAD_NAND_I64:
-    return emitAtomicBinary(MI, MBB, 8, AArch64::BICxxx_lsl);
-
-  case AArch64::ATOMIC_LOAD_MIN_I8:
-    return emitAtomicBinaryMinMax(MI, MBB, 1, AArch64::CMPww_sxtb, A64CC::GT);
-  case AArch64::ATOMIC_LOAD_MIN_I16:
-    return emitAtomicBinaryMinMax(MI, MBB, 2, AArch64::CMPww_sxth, A64CC::GT);
-  case AArch64::ATOMIC_LOAD_MIN_I32:
-    return emitAtomicBinaryMinMax(MI, MBB, 4, AArch64::CMPww_lsl, A64CC::GT);
-  case AArch64::ATOMIC_LOAD_MIN_I64:
-    return emitAtomicBinaryMinMax(MI, MBB, 8, AArch64::CMPxx_lsl, A64CC::GT);
-
-  case AArch64::ATOMIC_LOAD_MAX_I8:
-    return emitAtomicBinaryMinMax(MI, MBB, 1, AArch64::CMPww_sxtb, A64CC::LT);
-  case AArch64::ATOMIC_LOAD_MAX_I16:
-    return emitAtomicBinaryMinMax(MI, MBB, 2, AArch64::CMPww_sxth, A64CC::LT);
-  case AArch64::ATOMIC_LOAD_MAX_I32:
-    return emitAtomicBinaryMinMax(MI, MBB, 4, AArch64::CMPww_lsl, A64CC::LT);
-  case AArch64::ATOMIC_LOAD_MAX_I64:
-    return emitAtomicBinaryMinMax(MI, MBB, 8, AArch64::CMPxx_lsl, A64CC::LT);
-
-  case AArch64::ATOMIC_LOAD_UMIN_I8:
-    return emitAtomicBinaryMinMax(MI, MBB, 1, AArch64::CMPww_uxtb, A64CC::HI);
-  case AArch64::ATOMIC_LOAD_UMIN_I16:
-    return emitAtomicBinaryMinMax(MI, MBB, 2, AArch64::CMPww_uxth, A64CC::HI);
-  case AArch64::ATOMIC_LOAD_UMIN_I32:
-    return emitAtomicBinaryMinMax(MI, MBB, 4, AArch64::CMPww_lsl, A64CC::HI);
-  case AArch64::ATOMIC_LOAD_UMIN_I64:
-    return emitAtomicBinaryMinMax(MI, MBB, 8, AArch64::CMPxx_lsl, A64CC::HI);
-
-  case AArch64::ATOMIC_LOAD_UMAX_I8:
-    return emitAtomicBinaryMinMax(MI, MBB, 1, AArch64::CMPww_uxtb, A64CC::LO);
-  case AArch64::ATOMIC_LOAD_UMAX_I16:
-    return emitAtomicBinaryMinMax(MI, MBB, 2, AArch64::CMPww_uxth, A64CC::LO);
-  case AArch64::ATOMIC_LOAD_UMAX_I32:
-    return emitAtomicBinaryMinMax(MI, MBB, 4, AArch64::CMPww_lsl, A64CC::LO);
-  case AArch64::ATOMIC_LOAD_UMAX_I64:
-    return emitAtomicBinaryMinMax(MI, MBB, 8, AArch64::CMPxx_lsl, A64CC::LO);
-
-  case AArch64::ATOMIC_SWAP_I8:
-    return emitAtomicBinary(MI, MBB, 1, 0);
-  case AArch64::ATOMIC_SWAP_I16:
-    return emitAtomicBinary(MI, MBB, 2, 0);
-  case AArch64::ATOMIC_SWAP_I32:
-    return emitAtomicBinary(MI, MBB, 4, 0);
-  case AArch64::ATOMIC_SWAP_I64:
-    return emitAtomicBinary(MI, MBB, 8, 0);
-
-  case AArch64::ATOMIC_CMP_SWAP_I8:
-    return emitAtomicCmpSwap(MI, MBB, 1);
-  case AArch64::ATOMIC_CMP_SWAP_I16:
-    return emitAtomicCmpSwap(MI, MBB, 2);
-  case AArch64::ATOMIC_CMP_SWAP_I32:
-    return emitAtomicCmpSwap(MI, MBB, 4);
-  case AArch64::ATOMIC_CMP_SWAP_I64:
-    return emitAtomicCmpSwap(MI, MBB, 8);
+    return EmitF128CSEL(MI, BB);
+
+  case TargetOpcode::STACKMAP:
+  case TargetOpcode::PATCHPOINT:
+    return emitPatchPoint(MI, BB);
   }
+  llvm_unreachable("Unexpected instruction for custom inserter!");
 }
 
+//===----------------------------------------------------------------------===//
+// AArch64 Lowering private implementation.
+//===----------------------------------------------------------------------===//
 
-const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
-  switch (Opcode) {
-  case AArch64ISD::BR_CC:          return "AArch64ISD::BR_CC";
-  case AArch64ISD::Call:           return "AArch64ISD::Call";
-  case AArch64ISD::FPMOV:          return "AArch64ISD::FPMOV";
-  case AArch64ISD::GOTLoad:        return "AArch64ISD::GOTLoad";
-  case AArch64ISD::BFI:            return "AArch64ISD::BFI";
-  case AArch64ISD::EXTR:           return "AArch64ISD::EXTR";
-  case AArch64ISD::Ret:            return "AArch64ISD::Ret";
-  case AArch64ISD::SBFX:           return "AArch64ISD::SBFX";
-  case AArch64ISD::SELECT_CC:      return "AArch64ISD::SELECT_CC";
-  case AArch64ISD::SETCC:          return "AArch64ISD::SETCC";
-  case AArch64ISD::TC_RETURN:      return "AArch64ISD::TC_RETURN";
-  case AArch64ISD::THREAD_POINTER: return "AArch64ISD::THREAD_POINTER";
-  case AArch64ISD::TLSDESCCALL:    return "AArch64ISD::TLSDESCCALL";
-  case AArch64ISD::WrapperLarge:   return "AArch64ISD::WrapperLarge";
-  case AArch64ISD::WrapperSmall:   return "AArch64ISD::WrapperSmall";
-
-  case AArch64ISD::NEON_MOVIMM:
-    return "AArch64ISD::NEON_MOVIMM";
-  case AArch64ISD::NEON_MVNIMM:
-    return "AArch64ISD::NEON_MVNIMM";
-  case AArch64ISD::NEON_FMOVIMM:
-    return "AArch64ISD::NEON_FMOVIMM";
-  case AArch64ISD::NEON_CMP:
-    return "AArch64ISD::NEON_CMP";
-  case AArch64ISD::NEON_CMPZ:
-    return "AArch64ISD::NEON_CMPZ";
-  case AArch64ISD::NEON_TST:
-    return "AArch64ISD::NEON_TST";
-  case AArch64ISD::NEON_QSHLs:
-    return "AArch64ISD::NEON_QSHLs";
-  case AArch64ISD::NEON_QSHLu:
-    return "AArch64ISD::NEON_QSHLu";
-  case AArch64ISD::NEON_VDUP:
-    return "AArch64ISD::NEON_VDUP";
-  case AArch64ISD::NEON_VDUPLANE:
-    return "AArch64ISD::NEON_VDUPLANE";
-  case AArch64ISD::NEON_REV16:
-    return "AArch64ISD::NEON_REV16";
-  case AArch64ISD::NEON_REV32:
-    return "AArch64ISD::NEON_REV32";
-  case AArch64ISD::NEON_REV64:
-    return "AArch64ISD::NEON_REV64";
-  case AArch64ISD::NEON_UZP1:
-    return "AArch64ISD::NEON_UZP1";
-  case AArch64ISD::NEON_UZP2:
-    return "AArch64ISD::NEON_UZP2";
-  case AArch64ISD::NEON_ZIP1:
-    return "AArch64ISD::NEON_ZIP1";
-  case AArch64ISD::NEON_ZIP2:
-    return "AArch64ISD::NEON_ZIP2";
-  case AArch64ISD::NEON_TRN1:
-    return "AArch64ISD::NEON_TRN1";
-  case AArch64ISD::NEON_TRN2:
-    return "AArch64ISD::NEON_TRN2";
-  case AArch64ISD::NEON_LD1_UPD:
-    return "AArch64ISD::NEON_LD1_UPD";
-  case AArch64ISD::NEON_LD2_UPD:
-    return "AArch64ISD::NEON_LD2_UPD";
-  case AArch64ISD::NEON_LD3_UPD:
-    return "AArch64ISD::NEON_LD3_UPD";
-  case AArch64ISD::NEON_LD4_UPD:
-    return "AArch64ISD::NEON_LD4_UPD";
-  case AArch64ISD::NEON_ST1_UPD:
-    return "AArch64ISD::NEON_ST1_UPD";
-  case AArch64ISD::NEON_ST2_UPD:
-    return "AArch64ISD::NEON_ST2_UPD";
-  case AArch64ISD::NEON_ST3_UPD:
-    return "AArch64ISD::NEON_ST3_UPD";
-  case AArch64ISD::NEON_ST4_UPD:
-    return "AArch64ISD::NEON_ST4_UPD";
-  case AArch64ISD::NEON_LD1x2_UPD:
-    return "AArch64ISD::NEON_LD1x2_UPD";
-  case AArch64ISD::NEON_LD1x3_UPD:
-    return "AArch64ISD::NEON_LD1x3_UPD";
-  case AArch64ISD::NEON_LD1x4_UPD:
-    return "AArch64ISD::NEON_LD1x4_UPD";
-  case AArch64ISD::NEON_ST1x2_UPD:
-    return "AArch64ISD::NEON_ST1x2_UPD";
-  case AArch64ISD::NEON_ST1x3_UPD:
-    return "AArch64ISD::NEON_ST1x3_UPD";
-  case AArch64ISD::NEON_ST1x4_UPD:
-    return "AArch64ISD::NEON_ST1x4_UPD";
-  case AArch64ISD::NEON_LD2DUP:
-    return "AArch64ISD::NEON_LD2DUP";
-  case AArch64ISD::NEON_LD3DUP:
-    return "AArch64ISD::NEON_LD3DUP";
-  case AArch64ISD::NEON_LD4DUP:
-    return "AArch64ISD::NEON_LD4DUP";
-  case AArch64ISD::NEON_LD2DUP_UPD:
-    return "AArch64ISD::NEON_LD2DUP_UPD";
-  case AArch64ISD::NEON_LD3DUP_UPD:
-    return "AArch64ISD::NEON_LD3DUP_UPD";
-  case AArch64ISD::NEON_LD4DUP_UPD:
-    return "AArch64ISD::NEON_LD4DUP_UPD";
-  case AArch64ISD::NEON_LD2LN_UPD:
-    return "AArch64ISD::NEON_LD2LN_UPD";
-  case AArch64ISD::NEON_LD3LN_UPD:
-    return "AArch64ISD::NEON_LD3LN_UPD";
-  case AArch64ISD::NEON_LD4LN_UPD:
-    return "AArch64ISD::NEON_LD4LN_UPD";
-  case AArch64ISD::NEON_ST2LN_UPD:
-    return "AArch64ISD::NEON_ST2LN_UPD";
-  case AArch64ISD::NEON_ST3LN_UPD:
-    return "AArch64ISD::NEON_ST3LN_UPD";
-  case AArch64ISD::NEON_ST4LN_UPD:
-    return "AArch64ISD::NEON_ST4LN_UPD";
-  case AArch64ISD::NEON_VEXTRACT:
-    return "AArch64ISD::NEON_VEXTRACT";
+//===----------------------------------------------------------------------===//
+// Lowering Code
+//===----------------------------------------------------------------------===//
+
+/// changeIntCCToAArch64CC - Convert a DAG integer condition code to an AArch64
+/// CC
+static AArch64CC::CondCode changeIntCCToAArch64CC(ISD::CondCode CC) {
+  switch (CC) {
   default:
-    return NULL;
+    llvm_unreachable("Unknown condition code!");
+  case ISD::SETNE:
+    return AArch64CC::NE;
+  case ISD::SETEQ:
+    return AArch64CC::EQ;
+  case ISD::SETGT:
+    return AArch64CC::GT;
+  case ISD::SETGE:
+    return AArch64CC::GE;
+  case ISD::SETLT:
+    return AArch64CC::LT;
+  case ISD::SETLE:
+    return AArch64CC::LE;
+  case ISD::SETUGT:
+    return AArch64CC::HI;
+  case ISD::SETUGE:
+    return AArch64CC::HS;
+  case ISD::SETULT:
+    return AArch64CC::LO;
+  case ISD::SETULE:
+    return AArch64CC::LS;
   }
 }
 
-static const uint16_t AArch64FPRArgRegs[] = {
-  AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3,
-  AArch64::Q4, AArch64::Q5, AArch64::Q6, AArch64::Q7
-};
-static const unsigned NumFPRArgRegs = llvm::array_lengthof(AArch64FPRArgRegs);
+/// changeFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64 CC.
+static void changeFPCCToAArch64CC(ISD::CondCode CC,
+                                  AArch64CC::CondCode &CondCode,
+                                  AArch64CC::CondCode &CondCode2) {
+  CondCode2 = AArch64CC::AL;
+  switch (CC) {
+  default:
+    llvm_unreachable("Unknown FP condition!");
+  case ISD::SETEQ:
+  case ISD::SETOEQ:
+    CondCode = AArch64CC::EQ;
+    break;
+  case ISD::SETGT:
+  case ISD::SETOGT:
+    CondCode = AArch64CC::GT;
+    break;
+  case ISD::SETGE:
+  case ISD::SETOGE:
+    CondCode = AArch64CC::GE;
+    break;
+  case ISD::SETOLT:
+    CondCode = AArch64CC::MI;
+    break;
+  case ISD::SETOLE:
+    CondCode = AArch64CC::LS;
+    break;
+  case ISD::SETONE:
+    CondCode = AArch64CC::MI;
+    CondCode2 = AArch64CC::GT;
+    break;
+  case ISD::SETO:
+    CondCode = AArch64CC::VC;
+    break;
+  case ISD::SETUO:
+    CondCode = AArch64CC::VS;
+    break;
+  case ISD::SETUEQ:
+    CondCode = AArch64CC::EQ;
+    CondCode2 = AArch64CC::VS;
+    break;
+  case ISD::SETUGT:
+    CondCode = AArch64CC::HI;
+    break;
+  case ISD::SETUGE:
+    CondCode = AArch64CC::PL;
+    break;
+  case ISD::SETLT:
+  case ISD::SETULT:
+    CondCode = AArch64CC::LT;
+    break;
+  case ISD::SETLE:
+  case ISD::SETULE:
+    CondCode = AArch64CC::LE;
+    break;
+  case ISD::SETNE:
+  case ISD::SETUNE:
+    CondCode = AArch64CC::NE;
+    break;
+  }
+}
 
-static const uint16_t AArch64ArgRegs[] = {
-  AArch64::X0, AArch64::X1, AArch64::X2, AArch64::X3,
-  AArch64::X4, AArch64::X5, AArch64::X6, AArch64::X7
-};
-static const unsigned NumArgRegs = llvm::array_lengthof(AArch64ArgRegs);
+/// changeVectorFPCCToAArch64CC - Convert a DAG fp condition code to an AArch64
+/// CC usable with the vector instructions. Fewer operations are available
+/// without a real NZCV register, so we have to use less efficient combinations
+/// to get the same effect.
+static void changeVectorFPCCToAArch64CC(ISD::CondCode CC,
+                                        AArch64CC::CondCode &CondCode,
+                                        AArch64CC::CondCode &CondCode2,
+                                        bool &Invert) {
+  Invert = false;
+  switch (CC) {
+  default:
+    // Mostly the scalar mappings work fine.
+    changeFPCCToAArch64CC(CC, CondCode, CondCode2);
+    break;
+  case ISD::SETUO:
+    Invert = true; // Fallthrough
+  case ISD::SETO:
+    CondCode = AArch64CC::MI;
+    CondCode2 = AArch64CC::GE;
+    break;
+  case ISD::SETUEQ:
+  case ISD::SETULT:
+  case ISD::SETULE:
+  case ISD::SETUGT:
+  case ISD::SETUGE:
+    // All of the compare-mask comparisons are ordered, but we can switch
+    // between the two by a double inversion. E.g. ULE == !OGT.
+    Invert = true;
+    changeFPCCToAArch64CC(getSetCCInverse(CC, false), CondCode, CondCode2);
+    break;
+  }
+}
+
+static bool isLegalArithImmed(uint64_t C) {
+  // Matches AArch64DAGToDAGISel::SelectArithImmed().
+  return (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
+}
 
-static bool CC_AArch64NoMoreRegs(unsigned ValNo, MVT ValVT, MVT LocVT,
-                                 CCValAssign::LocInfo LocInfo,
-                                 ISD::ArgFlagsTy ArgFlags, CCState &State) {
-  // Mark all remaining general purpose registers as allocated. We don't
-  // backtrack: if (for example) an i128 gets put on the stack, no subsequent
-  // i64 will go in registers (C.11).
-  for (unsigned i = 0; i < NumArgRegs; ++i)
-    State.AllocateReg(AArch64ArgRegs[i]);
+static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
+                              SDLoc dl, SelectionDAG &DAG) {
+  EVT VT = LHS.getValueType();
+
+  if (VT.isFloatingPoint())
+    return DAG.getNode(AArch64ISD::FCMP, dl, VT, LHS, RHS);
+
+  // The CMP instruction is just an alias for SUBS, and representing it as
+  // SUBS means that it's possible to get CSE with subtract operations.
+  // A later phase can perform the optimization of setting the destination
+  // register to WZR/XZR if it ends up being unused.
+  unsigned Opcode = AArch64ISD::SUBS;
+
+  if (RHS.getOpcode() == ISD::SUB && isa<ConstantSDNode>(RHS.getOperand(0)) &&
+      cast<ConstantSDNode>(RHS.getOperand(0))->getZExtValue() == 0 &&
+      (CC == ISD::SETEQ || CC == ISD::SETNE)) {
+    // We'd like to combine a (CMP op1, (sub 0, op2) into a CMN instruction on
+    // the grounds that "op1 - (-op2) == op1 + op2". However, the C and V flags
+    // can be set differently by this operation. It comes down to whether
+    // "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are then
+    // everything is fine. If not then the optimization is wrong. Thus general
+    // comparisons are only valid if op2 != 0.
+
+    // So, finally, the only LLVM-native comparisons that don't mention C and V
+    // are SETEQ and SETNE. They're the only ones we can safely use CMN for in
+    // the absence of information about op2.
+    Opcode = AArch64ISD::ADDS;
+    RHS = RHS.getOperand(1);
+  } else if (LHS.getOpcode() == ISD::AND && isa<ConstantSDNode>(RHS) &&
+             cast<ConstantSDNode>(RHS)->getZExtValue() == 0 &&
+             !isUnsignedIntSetCC(CC)) {
+    // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
+    // (a.k.a. ANDS) except that the flags are only guaranteed to work for one
+    // of the signed comparisons.
+    Opcode = AArch64ISD::ANDS;
+    RHS = LHS.getOperand(1);
+    LHS = LHS.getOperand(0);
+  }
 
-  return false;
+  return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS)
+      .getValue(1);
 }
 
-#include "AArch64GenCallingConv.inc"
+static SDValue getAArch64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
+                             SDValue &AArch64cc, SelectionDAG &DAG, SDLoc dl) {
+  if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
+    EVT VT = RHS.getValueType();
+    uint64_t C = RHSC->getZExtValue();
+    if (!isLegalArithImmed(C)) {
+      // Constant does not fit, try adjusting it by one?
+      switch (CC) {
+      default:
+        break;
+      case ISD::SETLT:
+      case ISD::SETGE:
+        if ((VT == MVT::i32 && C != 0x80000000 &&
+             isLegalArithImmed((uint32_t)(C - 1))) ||
+            (VT == MVT::i64 && C != 0x80000000ULL &&
+             isLegalArithImmed(C - 1ULL))) {
+          CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
+          C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
+          RHS = DAG.getConstant(C, VT);
+        }
+        break;
+      case ISD::SETULT:
+      case ISD::SETUGE:
+        if ((VT == MVT::i32 && C != 0 &&
+             isLegalArithImmed((uint32_t)(C - 1))) ||
+            (VT == MVT::i64 && C != 0ULL && isLegalArithImmed(C - 1ULL))) {
+          CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
+          C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
+          RHS = DAG.getConstant(C, VT);
+        }
+        break;
+      case ISD::SETLE:
+      case ISD::SETGT:
+        if ((VT == MVT::i32 && C != 0x7fffffff &&
+             isLegalArithImmed((uint32_t)(C + 1))) ||
+            (VT == MVT::i64 && C != 0x7ffffffffffffffULL &&
+             isLegalArithImmed(C + 1ULL))) {
+          CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
+          C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
+          RHS = DAG.getConstant(C, VT);
+        }
+        break;
+      case ISD::SETULE:
+      case ISD::SETUGT:
+        if ((VT == MVT::i32 && C != 0xffffffff &&
+             isLegalArithImmed((uint32_t)(C + 1))) ||
+            (VT == MVT::i64 && C != 0xfffffffffffffffULL &&
+             isLegalArithImmed(C + 1ULL))) {
+          CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
+          C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
+          RHS = DAG.getConstant(C, VT);
+        }
+        break;
+      }
+    }
+  }
 
-CCAssignFn *AArch64TargetLowering::CCAssignFnForNode(CallingConv::ID CC) const {
+  SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
+  AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
+  AArch64cc = DAG.getConstant(AArch64CC, MVT::i32);
+  return Cmp;
+}
 
-  switch(CC) {
-  default: llvm_unreachable("Unsupported calling convention");
-  case CallingConv::Fast:
-  case CallingConv::C:
-    return CC_A64_APCS;
+static std::pair<SDValue, SDValue>
+getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) {
+  assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) &&
+         "Unsupported value type");
+  SDValue Value, Overflow;
+  SDLoc DL(Op);
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+  unsigned Opc = 0;
+  switch (Op.getOpcode()) {
+  default:
+    llvm_unreachable("Unknown overflow instruction!");
+  case ISD::SADDO:
+    Opc = AArch64ISD::ADDS;
+    CC = AArch64CC::VS;
+    break;
+  case ISD::UADDO:
+    Opc = AArch64ISD::ADDS;
+    CC = AArch64CC::HS;
+    break;
+  case ISD::SSUBO:
+    Opc = AArch64ISD::SUBS;
+    CC = AArch64CC::VS;
+    break;
+  case ISD::USUBO:
+    Opc = AArch64ISD::SUBS;
+    CC = AArch64CC::LO;
+    break;
+  // Multiply needs a little bit extra work.
+  case ISD::SMULO:
+  case ISD::UMULO: {
+    CC = AArch64CC::NE;
+    bool IsSigned = (Op.getOpcode() == ISD::SMULO) ? true : false;
+    if (Op.getValueType() == MVT::i32) {
+      unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+      // For a 32 bit multiply with overflow check we want the instruction
+      // selector to generate a widening multiply (SMADDL/UMADDL). For that we
+      // need to generate the following pattern:
+      // (i64 add 0, (i64 mul (i64 sext|zext i32 %a), (i64 sext|zext i32 %b))
+      LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS);
+      RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS);
+      SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
+      SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::i64, Mul,
+                                DAG.getConstant(0, MVT::i64));
+      // On AArch64 the upper 32 bits are always zero extended for a 32 bit
+      // operation. We need to clear out the upper 32 bits, because we used a
+      // widening multiply that wrote all 64 bits. In the end this should be a
+      // noop.
+      Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Add);
+      if (IsSigned) {
+        // The signed overflow check requires more than just a simple check for
+        // any bit set in the upper 32 bits of the result. These bits could be
+        // just the sign bits of a negative number. To perform the overflow
+        // check we have to arithmetic shift right the 32nd bit of the result by
+        // 31 bits. Then we compare the result to the upper 32 bits.
+        SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Add,
+                                        DAG.getConstant(32, MVT::i64));
+        UpperBits = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, UpperBits);
+        SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i32, Value,
+                                        DAG.getConstant(31, MVT::i64));
+        // It is important that LowerBits is last, otherwise the arithmetic
+        // shift will not be folded into the compare (SUBS).
+        SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32);
+        Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
+                       .getValue(1);
+      } else {
+        // The overflow check for unsigned multiply is easy. We only need to
+        // check if any of the upper 32 bits are set. This can be done with a
+        // CMP (shifted register). For that we need to generate the following
+        // pattern:
+        // (i64 AArch64ISD::SUBS i64 0, (i64 srl i64 %Mul, i64 32)
+        SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Mul,
+                                        DAG.getConstant(32, MVT::i64));
+        SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
+        Overflow =
+            DAG.getNode(AArch64ISD::SUBS, DL, VTs, DAG.getConstant(0, MVT::i64),
+                        UpperBits).getValue(1);
+      }
+      break;
+    }
+    assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type");
+    // For the 64 bit multiply
+    Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
+    if (IsSigned) {
+      SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS);
+      SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value,
+                                      DAG.getConstant(63, MVT::i64));
+      // It is important that LowerBits is last, otherwise the arithmetic
+      // shift will not be folded into the compare (SUBS).
+      SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
+      Overflow = DAG.getNode(AArch64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
+                     .getValue(1);
+    } else {
+      SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS);
+      SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
+      Overflow =
+          DAG.getNode(AArch64ISD::SUBS, DL, VTs, DAG.getConstant(0, MVT::i64),
+                      UpperBits).getValue(1);
+    }
+    break;
+  }
+  } // switch (...)
+
+  if (Opc) {
+    SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
+
+    // Emit the AArch64 operation with overflow check.
+    Value = DAG.getNode(Opc, DL, VTs, LHS, RHS);
+    Overflow = Value.getValue(1);
   }
+  return std::make_pair(Value, Overflow);
 }
 
-void
-AArch64TargetLowering::SaveVarArgRegisters(CCState &CCInfo, SelectionDAG &DAG,
-                                           SDLoc DL, SDValue &Chain) const {
-  MachineFunction &MF = DAG.getMachineFunction();
-  MachineFrameInfo *MFI = MF.getFrameInfo();
-  AArch64MachineFunctionInfo *FuncInfo
-    = MF.getInfo<AArch64MachineFunctionInfo>();
+SDValue AArch64TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG,
+                                             RTLIB::Libcall Call) const {
+  SmallVector<SDValue, 2> Ops;
+  for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i)
+    Ops.push_back(Op.getOperand(i));
 
-  SmallVector<SDValue, 8> MemOps;
+  return makeLibCall(DAG, Call, MVT::f128, &Ops[0], Ops.size(), false,
+                     SDLoc(Op)).first;
+}
 
-  unsigned FirstVariadicGPR = CCInfo.getFirstUnallocated(AArch64ArgRegs,
-                                                         NumArgRegs);
-  unsigned FirstVariadicFPR = CCInfo.getFirstUnallocated(AArch64FPRArgRegs,
-                                                         NumFPRArgRegs);
+static SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) {
+  SDValue Sel = Op.getOperand(0);
+  SDValue Other = Op.getOperand(1);
 
-  unsigned GPRSaveSize = 8 * (NumArgRegs - FirstVariadicGPR);
-  int GPRIdx = 0;
-  if (GPRSaveSize != 0) {
-    GPRIdx = MFI->CreateStackObject(GPRSaveSize, 8, false);
+  // If neither operand is a SELECT_CC, give up.
+  if (Sel.getOpcode() != ISD::SELECT_CC)
+    std::swap(Sel, Other);
+  if (Sel.getOpcode() != ISD::SELECT_CC)
+    return Op;
 
-    SDValue FIN = DAG.getFrameIndex(GPRIdx, getPointerTy());
+  // The folding we want to perform is:
+  // (xor x, (select_cc a, b, cc, 0, -1) )
+  //   -->
+  // (csel x, (xor x, -1), cc ...)
+  //
+  // The latter will get matched to a CSINV instruction.
 
-    for (unsigned i = FirstVariadicGPR; i < NumArgRegs; ++i) {
-      unsigned VReg = MF.addLiveIn(AArch64ArgRegs[i], &AArch64::GPR64RegClass);
-      SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
-      SDValue Store = DAG.getStore(Val.getValue(1), DL, Val, FIN,
-                                   MachinePointerInfo::getStack(i * 8),
-                                   false, false, 0);
-      MemOps.push_back(Store);
-      FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), FIN,
-                        DAG.getConstant(8, getPointerTy()));
-    }
-  }
+  ISD::CondCode CC = cast<CondCodeSDNode>(Sel.getOperand(4))->get();
+  SDValue LHS = Sel.getOperand(0);
+  SDValue RHS = Sel.getOperand(1);
+  SDValue TVal = Sel.getOperand(2);
+  SDValue FVal = Sel.getOperand(3);
+  SDLoc dl(Sel);
 
-  if (getSubtarget()->hasFPARMv8()) {
-  unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
-  int FPRIdx = 0;
-    // According to the AArch64 Procedure Call Standard, section B.1/B.3, we
-    // can omit a register save area if we know we'll never use registers of
-    // that class.
-    if (FPRSaveSize != 0) {
-      FPRIdx = MFI->CreateStackObject(FPRSaveSize, 16, false);
+  // FIXME: This could be generalized to non-integer comparisons.
+  if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
+    return Op;
 
-      SDValue FIN = DAG.getFrameIndex(FPRIdx, getPointerTy());
+  ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
+  ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
 
-      for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) {
-        unsigned VReg = MF.addLiveIn(AArch64FPRArgRegs[i],
-            &AArch64::FPR128RegClass);
-        SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128);
-        SDValue Store = DAG.getStore(Val.getValue(1), DL, Val, FIN,
-            MachinePointerInfo::getStack(i * 16),
-            false, false, 0);
-        MemOps.push_back(Store);
-        FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), FIN,
-            DAG.getConstant(16, getPointerTy()));
-      }
-    }
-    FuncInfo->setVariadicFPRIdx(FPRIdx);
-    FuncInfo->setVariadicFPRSize(FPRSaveSize);
+  // The the values aren't constants, this isn't the pattern we're looking for.
+  if (!CFVal || !CTVal)
+    return Op;
+
+  // We can commute the SELECT_CC by inverting the condition.  This
+  // might be needed to make this fit into a CSINV pattern.
+  if (CTVal->isAllOnesValue() && CFVal->isNullValue()) {
+    std::swap(TVal, FVal);
+    std::swap(CTVal, CFVal);
+    CC = ISD::getSetCCInverse(CC, true);
   }
 
-  unsigned StackOffset = RoundUpToAlignment(CCInfo.getNextStackOffset(), 8);
-  int StackIdx = MFI->CreateFixedObject(8, StackOffset, true);
+  // If the constants line up, perform the transform!
+  if (CTVal->isNullValue() && CFVal->isAllOnesValue()) {
+    SDValue CCVal;
+    SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
 
-  FuncInfo->setVariadicStackIdx(StackIdx);
-  FuncInfo->setVariadicGPRIdx(GPRIdx);
-  FuncInfo->setVariadicGPRSize(GPRSaveSize);
+    FVal = Other;
+    TVal = DAG.getNode(ISD::XOR, dl, Other.getValueType(), Other,
+                       DAG.getConstant(-1ULL, Other.getValueType()));
 
-  if (!MemOps.empty()) {
-    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, &MemOps[0],
-                        MemOps.size());
+    return DAG.getNode(AArch64ISD::CSEL, dl, Sel.getValueType(), FVal, TVal,
+                       CCVal, Cmp);
   }
+
+  return Op;
 }
 
+static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
+  EVT VT = Op.getValueType();
 
-SDValue
-AArch64TargetLowering::LowerFormalArguments(SDValue Chain,
-                                      CallingConv::ID CallConv, bool isVarArg,
-                                      const SmallVectorImpl<ISD::InputArg> &Ins,
-                                      SDLoc dl, SelectionDAG &DAG,
-                                      SmallVectorImpl<SDValue> &InVals) const {
-  MachineFunction &MF = DAG.getMachineFunction();
-  AArch64MachineFunctionInfo *FuncInfo
-    = MF.getInfo<AArch64MachineFunctionInfo>();
-  MachineFrameInfo *MFI = MF.getFrameInfo();
-  bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
+  // Let legalize expand this if it isn't a legal type yet.
+  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
+    return SDValue();
 
-  SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
-                 getTargetMachine(), ArgLocs, *DAG.getContext());
-  CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForNode(CallConv));
+  SDVTList VTs = DAG.getVTList(VT, MVT::i32);
 
-  SmallVector<SDValue, 16> ArgValues;
+  unsigned Opc;
+  bool ExtraOp = false;
+  switch (Op.getOpcode()) {
+  default:
+    assert(0 && "Invalid code");
+  case ISD::ADDC:
+    Opc = AArch64ISD::ADDS;
+    break;
+  case ISD::SUBC:
+    Opc = AArch64ISD::SUBS;
+    break;
+  case ISD::ADDE:
+    Opc = AArch64ISD::ADCS;
+    ExtraOp = true;
+    break;
+  case ISD::SUBE:
+    Opc = AArch64ISD::SBCS;
+    ExtraOp = true;
+    break;
+  }
 
-  SDValue ArgValue;
-  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
-    CCValAssign &VA = ArgLocs[i];
-    ISD::ArgFlagsTy Flags = Ins[i].Flags;
+  if (!ExtraOp)
+    return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1));
+  return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1),
+                     Op.getOperand(2));
+}
 
-    if (Flags.isByVal()) {
-      // Byval is used for small structs and HFAs in the PCS, but the system
-      // should work in a non-compliant manner for larger structs.
-      EVT PtrTy = getPointerTy();
-      int Size = Flags.getByValSize();
-      unsigned NumRegs = (Size + 7) / 8;
+static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
+  // Let legalize expand this if it isn't a legal type yet.
+  if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
+    return SDValue();
 
-      uint32_t BEAlign = 0;
-      if (Size < 8 && !getSubtarget()->isLittle())
-        BEAlign = 8-Size;
-      unsigned FrameIdx = MFI->CreateFixedObject(8 * NumRegs,
-                                                 VA.getLocMemOffset() + BEAlign,
-                                                 false);
-      SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrTy);
-      InVals.push_back(FrameIdxN);
+  AArch64CC::CondCode CC;
+  // The actual operation that sets the overflow or carry flag.
+  SDValue Value, Overflow;
+  std::tie(Value, Overflow) = getAArch64XALUOOp(CC, Op, DAG);
 
-      continue;
-    } else if (VA.isRegLoc()) {
-      MVT RegVT = VA.getLocVT();
-      const TargetRegisterClass *RC = getRegClassFor(RegVT);
-      unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
+  // We use 0 and 1 as false and true values.
+  SDValue TVal = DAG.getConstant(1, MVT::i32);
+  SDValue FVal = DAG.getConstant(0, MVT::i32);
 
-      ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
-    } else { // VA.isRegLoc()
-      assert(VA.isMemLoc());
+  // We use an inverted condition, because the conditional select is inverted
+  // too. This will allow it to be selected to a single instruction:
+  // CSINC Wd, WZR, WZR, invert(cond).
+  SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), MVT::i32);
+  Overflow = DAG.getNode(AArch64ISD::CSEL, SDLoc(Op), MVT::i32, FVal, TVal,
+                         CCVal, Overflow);
 
-      int FI = MFI->CreateFixedObject(VA.getLocVT().getSizeInBits()/8,
-                                      VA.getLocMemOffset(), true);
+  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
+  return DAG.getNode(ISD::MERGE_VALUES, SDLoc(Op), VTs, Value, Overflow);
+}
 
-      SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
-      ArgValue = DAG.getLoad(VA.getLocVT(), dl, Chain, FIN,
-                             MachinePointerInfo::getFixedStack(FI),
-                             false, false, false, 0);
+// Prefetch operands are:
+// 1: Address to prefetch
+// 2: bool isWrite
+// 3: int locality (0 = no locality ... 3 = extreme locality)
+// 4: bool isDataCache
+static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) {
+  SDLoc DL(Op);
+  unsigned IsWrite = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
+  unsigned Locality = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
+  // The data thing is not used.
+  // unsigned isData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
+
+  bool IsStream = !Locality;
+  // When the locality number is set
+  if (Locality) {
+    // The front-end should have filtered out the out-of-range values
+    assert(Locality <= 3 && "Prefetch locality out-of-range");
+    // The locality degree is the opposite of the cache speed.
+    // Put the number the other way around.
+    // The encoding starts at 0 for level 1
+    Locality = 3 - Locality;
+  }
 
+  // built the mask value encoding the expected behavior.
+  unsigned PrfOp = (IsWrite << 4) |     // Load/Store bit
+                   (Locality << 1) |    // Cache level bits
+                   (unsigned)IsStream;  // Stream bit
+  return DAG.getNode(AArch64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0),
+                     DAG.getConstant(PrfOp, MVT::i32), Op.getOperand(1));
+}
 
-    }
+SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  assert(Op.getValueType() == MVT::f128 && "Unexpected lowering");
 
-    switch (VA.getLocInfo()) {
-    default: llvm_unreachable("Unknown loc info!");
-    case CCValAssign::Full: break;
-    case CCValAssign::BCvt:
-      ArgValue = DAG.getNode(ISD::BITCAST,dl, VA.getValVT(), ArgValue);
-      break;
-    case CCValAssign::SExt:
-    case CCValAssign::ZExt:
-    case CCValAssign::AExt:
-    case CCValAssign::FPExt: {
-      unsigned DestSize = VA.getValVT().getSizeInBits();
-      unsigned DestSubReg;
-
-      switch (DestSize) {
-      case 8: DestSubReg = AArch64::sub_8; break;
-      case 16: DestSubReg = AArch64::sub_16; break;
-      case 32: DestSubReg = AArch64::sub_32; break;
-      case 64: DestSubReg = AArch64::sub_64; break;
-      default: llvm_unreachable("Unexpected argument promotion");
-      }
+  RTLIB::Libcall LC;
+  LC = RTLIB::getFPEXT(Op.getOperand(0).getValueType(), Op.getValueType());
 
-      ArgValue = SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl,
-                                   VA.getValVT(), ArgValue,
-                                   DAG.getTargetConstant(DestSubReg, MVT::i32)),
-                         0);
-      break;
-    }
-    }
+  return LowerF128Call(Op, DAG, LC);
+}
 
-    InVals.push_back(ArgValue);
+SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
+                                             SelectionDAG &DAG) const {
+  if (Op.getOperand(0).getValueType() != MVT::f128) {
+    // It's legal except when f128 is involved
+    return Op;
   }
 
-  if (isVarArg)
-    SaveVarArgRegisters(CCInfo, DAG, dl, Chain);
+  RTLIB::Libcall LC;
+  LC = RTLIB::getFPROUND(Op.getOperand(0).getValueType(), Op.getValueType());
 
-  unsigned StackArgSize = CCInfo.getNextStackOffset();
-  if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) {
-    // This is a non-standard ABI so by fiat I say we're allowed to make full
-    // use of the stack area to be popped, which must be aligned to 16 bytes in
-    // any case:
-    StackArgSize = RoundUpToAlignment(StackArgSize, 16);
+  // FP_ROUND node has a second operand indicating whether it is known to be
+  // precise. That doesn't take part in the LibCall so we can't directly use
+  // LowerF128Call.
+  SDValue SrcVal = Op.getOperand(0);
+  return makeLibCall(DAG, LC, Op.getValueType(), &SrcVal, 1,
+                     /*isSigned*/ false, SDLoc(Op)).first;
+}
 
-    // If we're expected to restore the stack (e.g. fastcc) then we'll be adding
-    // a multiple of 16.
-    FuncInfo->setArgumentStackToRestore(StackArgSize);
+static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) {
+  // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
+  // Any additional optimization in this function should be recorded
+  // in the cost tables.
+  EVT InVT = Op.getOperand(0).getValueType();
+  EVT VT = Op.getValueType();
 
-    // This realignment carries over to the available bytes below. Our own
-    // callers will guarantee the space is free by giving an aligned value to
-    // CALLSEQ_START.
+  // FP_TO_XINT conversion from the same type are legal.
+  if (VT.getSizeInBits() == InVT.getSizeInBits())
+    return Op;
+
+  if (InVT == MVT::v2f64 || InVT == MVT::v4f32) {
+    SDLoc dl(Op);
+    SDValue Cv =
+        DAG.getNode(Op.getOpcode(), dl, InVT.changeVectorElementTypeToInteger(),
+                    Op.getOperand(0));
+    return DAG.getNode(ISD::TRUNCATE, dl, VT, Cv);
+  } else if (InVT == MVT::v2f32) {
+    SDLoc dl(Op);
+    SDValue Ext = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v2f64, Op.getOperand(0));
+    return DAG.getNode(Op.getOpcode(), dl, VT, Ext);
   }
-  // Even if we're not expected to free up the space, it's useful to know how
-  // much is there while considering tail calls (because we can reuse it).
-  FuncInfo->setBytesInStackArgArea(StackArgSize);
 
-  return Chain;
+  // Type changing conversions are illegal.
+  return SDValue();
 }
 
-SDValue
-AArch64TargetLowering::LowerReturn(SDValue Chain,
-                                   CallingConv::ID CallConv, bool isVarArg,
-                                   const SmallVectorImpl<ISD::OutputArg> &Outs,
-                                   const SmallVectorImpl<SDValue> &OutVals,
-                                   SDLoc dl, SelectionDAG &DAG) const {
-  // CCValAssign - represent the assignment of the return value to a location.
-  SmallVector<CCValAssign, 16> RVLocs;
+SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  if (Op.getOperand(0).getValueType().isVector())
+    return LowerVectorFP_TO_INT(Op, DAG);
 
-  // CCState - Info about the registers and stack slots.
-  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
-                 getTargetMachine(), RVLocs, *DAG.getContext());
+  if (Op.getOperand(0).getValueType() != MVT::f128) {
+    // It's legal except when f128 is involved
+    return Op;
+  }
 
-  // Analyze outgoing return values.
-  CCInfo.AnalyzeReturn(Outs, CCAssignFnForNode(CallConv));
+  RTLIB::Libcall LC;
+  if (Op.getOpcode() == ISD::FP_TO_SINT)
+    LC = RTLIB::getFPTOSINT(Op.getOperand(0).getValueType(), Op.getValueType());
+  else
+    LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), Op.getValueType());
 
-  SDValue Flag;
-  SmallVector<SDValue, 4> RetOps(1, Chain);
+  SmallVector<SDValue, 2> Ops;
+  for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i)
+    Ops.push_back(Op.getOperand(i));
 
-  for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
-    // PCS: "If the type, T, of the result of a function is such that
-    // void func(T arg) would require that arg be passed as a value in a
-    // register (or set of registers) according to the rules in 5.4, then the
-    // result is returned in the same registers as would be used for such an
-    // argument.
-    //
-    // Otherwise, the caller shall reserve a block of memory of sufficient
-    // size and alignment to hold the result. The address of the memory block
-    // shall be passed as an additional argument to the function in x8."
-    //
-    // This is implemented in two places. The register-return values are dealt
-    // with here, more complex returns are passed as an sret parameter, which
-    // means we don't have to worry about it during actual return.
-    CCValAssign &VA = RVLocs[i];
-    assert(VA.isRegLoc() && "Only register-returns should be created by PCS");
+  return makeLibCall(DAG, LC, Op.getValueType(), &Ops[0], Ops.size(), false,
+                     SDLoc(Op)).first;
+}
 
+static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
+  // Warning: We maintain cost tables in AArch64TargetTransformInfo.cpp.
+  // Any additional optimization in this function should be recorded
+  // in the cost tables.
+  EVT VT = Op.getValueType();
+  SDLoc dl(Op);
+  SDValue In = Op.getOperand(0);
+  EVT InVT = In.getValueType();
 
-    SDValue Arg = OutVals[i];
+  // v2i32 to v2f32 is legal.
+  if (VT == MVT::v2f32 && InVT == MVT::v2i32)
+    return Op;
 
-    // There's no convenient note in the ABI about this as there is for normal
-    // arguments, but it says return values are passed in the same registers as
-    // an argument would be. I believe that includes the comments about
-    // unspecified higher bits, putting the burden of widening on the *caller*
-    // for return values.
-    switch (VA.getLocInfo()) {
-    default: llvm_unreachable("Unknown loc info");
-    case CCValAssign::Full: break;
-    case CCValAssign::SExt:
-    case CCValAssign::ZExt:
-    case CCValAssign::AExt:
-      // Floating-point values should only be extended when they're going into
-      // memory, which can't happen here so an integer extend is acceptable.
-      Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
-      break;
-    case CCValAssign::BCvt:
-      Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
-      break;
-    }
+  // This function only handles v2f64 outputs.
+  if (VT == MVT::v2f64) {
+    // Extend the input argument to a v2i64 that we can feed into the
+    // floating point conversion. Zero or sign extend based on whether
+    // we're doing a signed or unsigned float conversion.
+    unsigned Opc =
+        Op.getOpcode() == ISD::UINT_TO_FP ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
+    assert(Op.getNumOperands() == 1 && "FP conversions take one argument");
+    SDValue Promoted = DAG.getNode(Opc, dl, MVT::v2i64, Op.getOperand(0));
+    return DAG.getNode(Op.getOpcode(), dl, Op.getValueType(), Promoted);
+  }
 
-    Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag);
-    Flag = Chain.getValue(1);
-    RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
+  // Scalarize v2i64 to v2f32 conversions.
+  std::vector<SDValue> BuildVectorOps;
+  for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
+    SDValue Sclr = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, In,
+                               DAG.getConstant(i, MVT::i64));
+    Sclr = DAG.getNode(Op->getOpcode(), dl, MVT::f32, Sclr);
+    BuildVectorOps.push_back(Sclr);
   }
 
-  RetOps[0] = Chain;  // Update chain.
+  return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, BuildVectorOps);
+}
 
-  // Add the flag if we have it.
-  if (Flag.getNode())
-    RetOps.push_back(Flag);
+SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
+                                            SelectionDAG &DAG) const {
+  if (Op.getValueType().isVector())
+    return LowerVectorINT_TO_FP(Op, DAG);
 
-  return DAG.getNode(AArch64ISD::Ret, dl, MVT::Other,
-                     &RetOps[0], RetOps.size());
-}
+  // i128 conversions are libcalls.
+  if (Op.getOperand(0).getValueType() == MVT::i128)
+    return SDValue();
+
+  // Other conversions are legal, unless it's to the completely software-based
+  // fp128.
+  if (Op.getValueType() != MVT::f128)
+    return Op;
 
-unsigned AArch64TargetLowering::getByValTypeAlignment(Type *Ty) const {
-  // This is a new backend. For anything more precise than this a FE should
-  // set an explicit alignment.
-  return 4;
+  RTLIB::Libcall LC;
+  if (Op.getOpcode() == ISD::SINT_TO_FP)
+    LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType());
+  else
+    LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType());
+
+  return LowerF128Call(Op, DAG, LC);
 }
 
-SDValue
-AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
-                                 SmallVectorImpl<SDValue> &InVals) const {
-  SelectionDAG &DAG                     = CLI.DAG;
-  SDLoc &dl                             = CLI.DL;
-  SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
-  SmallVectorImpl<SDValue> &OutVals     = CLI.OutVals;
-  SmallVectorImpl<ISD::InputArg> &Ins   = CLI.Ins;
-  SDValue Chain                         = CLI.Chain;
-  SDValue Callee                        = CLI.Callee;
-  bool &IsTailCall                      = CLI.IsTailCall;
-  CallingConv::ID CallConv              = CLI.CallConv;
-  bool IsVarArg                         = CLI.IsVarArg;
+SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
+                                            SelectionDAG &DAG) const {
+  // For iOS, we want to call an alternative entry point: __sincos_stret,
+  // which returns the values in two S / D registers.
+  SDLoc dl(Op);
+  SDValue Arg = Op.getOperand(0);
+  EVT ArgVT = Arg.getValueType();
+  Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
 
-  MachineFunction &MF = DAG.getMachineFunction();
-  AArch64MachineFunctionInfo *FuncInfo
-    = MF.getInfo<AArch64MachineFunctionInfo>();
-  bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
-  bool IsStructRet = !Outs.empty() && Outs[0].Flags.isSRet();
-  bool IsSibCall = false;
+  ArgListTy Args;
+  ArgListEntry Entry;
 
-  if (IsTailCall) {
-    IsTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
-                    IsVarArg, IsStructRet, MF.getFunction()->hasStructRetAttr(),
-                                                   Outs, OutVals, Ins, DAG);
+  Entry.Node = Arg;
+  Entry.Ty = ArgTy;
+  Entry.isSExt = false;
+  Entry.isZExt = false;
+  Args.push_back(Entry);
 
-    // A sibling call is one where we're under the usual C ABI and not planning
-    // to change that but can still do a tail call:
-    if (!TailCallOpt && IsTailCall)
-      IsSibCall = true;
-  }
+  const char *LibcallName =
+      (ArgVT == MVT::f64) ? "__sincos_stret" : "__sincosf_stret";
+  SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy());
 
-  SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(),
-                 getTargetMachine(), ArgLocs, *DAG.getContext());
-  CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CallConv));
+  StructType *RetTy = StructType::get(ArgTy, ArgTy, NULL);
+  TargetLowering::CallLoweringInfo CLI(DAG);
+  CLI.setDebugLoc(dl).setChain(DAG.getEntryNode())
+    .setCallee(CallingConv::Fast, RetTy, Callee, &Args, 0);
 
-  // On AArch64 (and all other architectures I'm aware of) the most this has to
-  // do is adjust the stack pointer.
-  unsigned NumBytes = RoundUpToAlignment(CCInfo.getNextStackOffset(), 16);
-  if (IsSibCall) {
-    // Since we're not changing the ABI to make this a tail call, the memory
-    // operands are already available in the caller's incoming argument space.
-    NumBytes = 0;
+  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
+  return CallResult.first;
+}
+
+SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  switch (Op.getOpcode()) {
+  default:
+    llvm_unreachable("unimplemented operand");
+    return SDValue();
+  case ISD::GlobalAddress:
+    return LowerGlobalAddress(Op, DAG);
+  case ISD::GlobalTLSAddress:
+    return LowerGlobalTLSAddress(Op, DAG);
+  case ISD::SETCC:
+    return LowerSETCC(Op, DAG);
+  case ISD::BR_CC:
+    return LowerBR_CC(Op, DAG);
+  case ISD::SELECT:
+    return LowerSELECT(Op, DAG);
+  case ISD::SELECT_CC:
+    return LowerSELECT_CC(Op, DAG);
+  case ISD::JumpTable:
+    return LowerJumpTable(Op, DAG);
+  case ISD::ConstantPool:
+    return LowerConstantPool(Op, DAG);
+  case ISD::BlockAddress:
+    return LowerBlockAddress(Op, DAG);
+  case ISD::VASTART:
+    return LowerVASTART(Op, DAG);
+  case ISD::VACOPY:
+    return LowerVACOPY(Op, DAG);
+  case ISD::VAARG:
+    return LowerVAARG(Op, DAG);
+  case ISD::ADDC:
+  case ISD::ADDE:
+  case ISD::SUBC:
+  case ISD::SUBE:
+    return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
+  case ISD::SADDO:
+  case ISD::UADDO:
+  case ISD::SSUBO:
+  case ISD::USUBO:
+  case ISD::SMULO:
+  case ISD::UMULO:
+    return LowerXALUO(Op, DAG);
+  case ISD::FADD:
+    return LowerF128Call(Op, DAG, RTLIB::ADD_F128);
+  case ISD::FSUB:
+    return LowerF128Call(Op, DAG, RTLIB::SUB_F128);
+  case ISD::FMUL:
+    return LowerF128Call(Op, DAG, RTLIB::MUL_F128);
+  case ISD::FDIV:
+    return LowerF128Call(Op, DAG, RTLIB::DIV_F128);
+  case ISD::FP_ROUND:
+    return LowerFP_ROUND(Op, DAG);
+  case ISD::FP_EXTEND:
+    return LowerFP_EXTEND(Op, DAG);
+  case ISD::FRAMEADDR:
+    return LowerFRAMEADDR(Op, DAG);
+  case ISD::RETURNADDR:
+    return LowerRETURNADDR(Op, DAG);
+  case ISD::INSERT_VECTOR_ELT:
+    return LowerINSERT_VECTOR_ELT(Op, DAG);
+  case ISD::EXTRACT_VECTOR_ELT:
+    return LowerEXTRACT_VECTOR_ELT(Op, DAG);
+  case ISD::BUILD_VECTOR:
+    return LowerBUILD_VECTOR(Op, DAG);
+  case ISD::VECTOR_SHUFFLE:
+    return LowerVECTOR_SHUFFLE(Op, DAG);
+  case ISD::EXTRACT_SUBVECTOR:
+    return LowerEXTRACT_SUBVECTOR(Op, DAG);
+  case ISD::SRA:
+  case ISD::SRL:
+  case ISD::SHL:
+    return LowerVectorSRA_SRL_SHL(Op, DAG);
+  case ISD::SHL_PARTS:
+    return LowerShiftLeftParts(Op, DAG);
+  case ISD::SRL_PARTS:
+  case ISD::SRA_PARTS:
+    return LowerShiftRightParts(Op, DAG);
+  case ISD::CTPOP:
+    return LowerCTPOP(Op, DAG);
+  case ISD::FCOPYSIGN:
+    return LowerFCOPYSIGN(Op, DAG);
+  case ISD::AND:
+    return LowerVectorAND(Op, DAG);
+  case ISD::OR:
+    return LowerVectorOR(Op, DAG);
+  case ISD::XOR:
+    return LowerXOR(Op, DAG);
+  case ISD::PREFETCH:
+    return LowerPREFETCH(Op, DAG);
+  case ISD::SINT_TO_FP:
+  case ISD::UINT_TO_FP:
+    return LowerINT_TO_FP(Op, DAG);
+  case ISD::FP_TO_SINT:
+  case ISD::FP_TO_UINT:
+    return LowerFP_TO_INT(Op, DAG);
+  case ISD::FSINCOS:
+    return LowerFSINCOS(Op, DAG);
   }
+}
 
-  // FPDiff is the byte offset of the call's argument area from the callee's.
-  // Stores to callee stack arguments will be placed in FixedStackSlots offset
-  // by this amount for a tail call. In a sibling call it must be 0 because the
-  // caller will deallocate the entire stack and the callee still expects its
-  // arguments to begin at SP+0. Completely unused for non-tail calls.
-  int FPDiff = 0;
+/// getFunctionAlignment - Return the Log2 alignment of this function.
+unsigned AArch64TargetLowering::getFunctionAlignment(const Function *F) const {
+  return 2;
+}
 
-  if (IsTailCall && !IsSibCall) {
-    unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
+//===----------------------------------------------------------------------===//
+//                      Calling Convention Implementation
+//===----------------------------------------------------------------------===//
 
-    // FPDiff will be negative if this tail call requires more space than we
-    // would automatically have in our incoming argument space. Positive if we
-    // can actually shrink the stack.
-    FPDiff = NumReusableBytes - NumBytes;
+#include "AArch64GenCallingConv.inc"
 
-    // The stack pointer must be 16-byte aligned at all times it's used for a
-    // memory operation, which in practice means at *all* times and in
-    // particular across call boundaries. Therefore our own arguments started at
-    // a 16-byte aligned SP and the delta applied for the tail call should
-    // satisfy the same constraint.
-    assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
+/// Selects the correct CCAssignFn for a the given CallingConvention
+/// value.
+CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
+                                                     bool IsVarArg) const {
+  switch (CC) {
+  default:
+    llvm_unreachable("Unsupported calling convention.");
+  case CallingConv::WebKit_JS:
+    return CC_AArch64_WebKit_JS;
+  case CallingConv::C:
+  case CallingConv::Fast:
+    if (!Subtarget->isTargetDarwin())
+      return CC_AArch64_AAPCS;
+    return IsVarArg ? CC_AArch64_DarwinPCS_VarArg : CC_AArch64_DarwinPCS;
   }
+}
 
-  if (!IsSibCall)
-    Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true),
-                                 dl);
-
-  SDValue StackPtr = DAG.getCopyFromReg(Chain, dl, AArch64::XSP,
-                                        getPointerTy());
+SDValue AArch64TargetLowering::LowerFormalArguments(
+    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+    const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL, SelectionDAG &DAG,
+    SmallVectorImpl<SDValue> &InVals) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
 
-  SmallVector<SDValue, 8> MemOpChains;
-  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
+  // Assign locations to all of the incoming arguments.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+                 getTargetMachine(), ArgLocs, *DAG.getContext());
 
+  // At this point, Ins[].VT may already be promoted to i32. To correctly
+  // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
+  // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
+  // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here
+  // we use a special version of AnalyzeFormalArguments to pass in ValVT and
+  // LocVT.
+  unsigned NumArgs = Ins.size();
+  Function::const_arg_iterator CurOrigArg = MF.getFunction()->arg_begin();
+  unsigned CurArgIdx = 0;
+  for (unsigned i = 0; i != NumArgs; ++i) {
+    MVT ValVT = Ins[i].VT;
+    std::advance(CurOrigArg, Ins[i].OrigArgIndex - CurArgIdx);
+    CurArgIdx = Ins[i].OrigArgIndex;
+
+    // Get type of the original argument.
+    EVT ActualVT = getValueType(CurOrigArg->getType(), /*AllowUnknown*/ true);
+    MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other;
+    // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
+    if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
+      ValVT = MVT::i8;
+    else if (ActualMVT == MVT::i16)
+      ValVT = MVT::i16;
+
+    CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false);
+    bool Res =
+        AssignFn(i, ValVT, ValVT, CCValAssign::Full, Ins[i].Flags, CCInfo);
+    assert(!Res && "Call operand has unhandled type");
+    (void)Res;
+  }
+  assert(ArgLocs.size() == Ins.size());
+  SmallVector<SDValue, 16> ArgValues;
   for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
     CCValAssign &VA = ArgLocs[i];
-    ISD::ArgFlagsTy Flags = Outs[i].Flags;
-    SDValue Arg = OutVals[i];
-
-    // Callee does the actual widening, so all extensions just use an implicit
-    // definition of the rest of the Loc. Aesthetically, this would be nicer as
-    // an ANY_EXTEND, but that isn't valid for floating-point types and this
-    // alternative works on integer types too.
-    switch (VA.getLocInfo()) {
-    default: llvm_unreachable("Unknown loc info!");
-    case CCValAssign::Full: break;
-    case CCValAssign::SExt:
-    case CCValAssign::ZExt:
-    case CCValAssign::AExt:
-    case CCValAssign::FPExt: {
-      unsigned SrcSize = VA.getValVT().getSizeInBits();
-      unsigned SrcSubReg;
-
-      switch (SrcSize) {
-      case 8: SrcSubReg = AArch64::sub_8; break;
-      case 16: SrcSubReg = AArch64::sub_16; break;
-      case 32: SrcSubReg = AArch64::sub_32; break;
-      case 64: SrcSubReg = AArch64::sub_64; break;
-      default: llvm_unreachable("Unexpected argument promotion");
-      }
 
-      Arg = SDValue(DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, dl,
-                                    VA.getLocVT(),
-                                    DAG.getUNDEF(VA.getLocVT()),
-                                    Arg,
-                                    DAG.getTargetConstant(SrcSubReg, MVT::i32)),
-                    0);
+    if (Ins[i].Flags.isByVal()) {
+      // Byval is used for HFAs in the PCS, but the system should work in a
+      // non-compliant manner for larger structs.
+      EVT PtrTy = getPointerTy();
+      int Size = Ins[i].Flags.getByValSize();
+      unsigned NumRegs = (Size + 7) / 8;
 
-      break;
-    }
-    case CCValAssign::BCvt:
-      Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
-      break;
-    }
+      // FIXME: This works on big-endian for composite byvals, which are the common
+      // case. It should also work for fundamental types too.
+      unsigned FrameIdx =
+        MFI->CreateFixedObject(8 * NumRegs, VA.getLocMemOffset(), false);
+      SDValue FrameIdxN = DAG.getFrameIndex(FrameIdx, PtrTy);
+      InVals.push_back(FrameIdxN);
 
-    if (VA.isRegLoc()) {
-      // A normal register (sub-) argument. For now we just note it down because
-      // we want to copy things into registers as late as possible to avoid
-      // register-pressure (and possibly worse).
-      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
       continue;
-    }
+    } if (VA.isRegLoc()) {
+      // Arguments stored in registers.
+      EVT RegVT = VA.getLocVT();
+
+      SDValue ArgValue;
+      const TargetRegisterClass *RC;
+
+      if (RegVT == MVT::i32)
+        RC = &AArch64::GPR32RegClass;
+      else if (RegVT == MVT::i64)
+        RC = &AArch64::GPR64RegClass;
+      else if (RegVT == MVT::f32)
+        RC = &AArch64::FPR32RegClass;
+      else if (RegVT == MVT::f64 || RegVT.is64BitVector())
+        RC = &AArch64::FPR64RegClass;
+      else if (RegVT == MVT::f128 || RegVT.is128BitVector())
+        RC = &AArch64::FPR128RegClass;
+      else
+        llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
 
-    assert(VA.isMemLoc() && "unexpected argument location");
+      // Transform the arguments in physical registers into virtual ones.
+      unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
+      ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT);
 
-    SDValue DstAddr;
-    MachinePointerInfo DstInfo;
-    if (IsTailCall) {
-      uint32_t OpSize = Flags.isByVal() ? Flags.getByValSize() :
-                                          VA.getLocVT().getSizeInBits();
-      OpSize = (OpSize + 7) / 8;
-      int32_t Offset = VA.getLocMemOffset() + FPDiff;
-      int FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
+      // If this is an 8, 16 or 32-bit value, it is really passed promoted
+      // to 64 bits.  Insert an assert[sz]ext to capture this, then
+      // truncate to the right size.
+      switch (VA.getLocInfo()) {
+      default:
+        llvm_unreachable("Unknown loc info!");
+      case CCValAssign::Full:
+        break;
+      case CCValAssign::BCvt:
+        ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue);
+        break;
+      case CCValAssign::AExt:
+      case CCValAssign::SExt:
+      case CCValAssign::ZExt:
+        // SelectionDAGBuilder will insert appropriate AssertZExt & AssertSExt
+        // nodes after our lowering.
+        assert(RegVT == Ins[i].VT && "incorrect register location selected");
+        break;
+      }
 
-      DstAddr = DAG.getFrameIndex(FI, getPointerTy());
-      DstInfo = MachinePointerInfo::getFixedStack(FI);
+      InVals.push_back(ArgValue);
+
+    } else { // VA.isRegLoc()
+      assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem");
+      unsigned ArgOffset = VA.getLocMemOffset();
+      unsigned ArgSize = VA.getLocVT().getSizeInBits() / 8;
 
-      // Make sure any stack arguments overlapping with where we're storing are
-      // loaded before this eventual operation. Otherwise they'll be clobbered.
-      Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI);
-    } else {
-      uint32_t OpSize = Flags.isByVal() ? Flags.getByValSize()*8 :
-                                          VA.getLocVT().getSizeInBits();
-      OpSize = (OpSize + 7) / 8;
       uint32_t BEAlign = 0;
-      if (OpSize < 8 && !getSubtarget()->isLittle())
-        BEAlign = 8-OpSize;
-      SDValue PtrOff = DAG.getIntPtrConstant(VA.getLocMemOffset() + BEAlign);
+      if (ArgSize < 8 && !Subtarget->isLittleEndian())
+        BEAlign = 8 - ArgSize;
 
-      DstAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, PtrOff);
-      DstInfo = MachinePointerInfo::getStack(VA.getLocMemOffset());
-    }
+      int FI = MFI->CreateFixedObject(ArgSize, ArgOffset + BEAlign, true);
 
-    if (Flags.isByVal()) {
-      SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i64);
-      SDValue Cpy = DAG.getMemcpy(Chain, dl, DstAddr, Arg, SizeNode,
-                                  Flags.getByValAlign(),
-                                  /*isVolatile = */ false,
-                                  /*alwaysInline = */ false,
-                                  DstInfo, MachinePointerInfo(0));
-      MemOpChains.push_back(Cpy);
-    } else {
-      // Normal stack argument, put it where it's needed.
-      SDValue Store = DAG.getStore(Chain, dl, Arg, DstAddr, DstInfo,
-                                   false, false, 0);
-      MemOpChains.push_back(Store);
-    }
-  }
+      // Create load nodes to retrieve arguments from the stack.
+      SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
+      SDValue ArgValue;
 
-  // The loads and stores generated above shouldn't clash with each
-  // other. Combining them with this TokenFactor notes that fact for the rest of
-  // the backend.
-  if (!MemOpChains.empty())
-    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
-                        &MemOpChains[0], MemOpChains.size());
+      ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
+      switch (VA.getLocInfo()) {
+      default:
+        break;
+      case CCValAssign::SExt:
+        ExtType = ISD::SEXTLOAD;
+        break;
+      case CCValAssign::ZExt:
+        ExtType = ISD::ZEXTLOAD;
+        break;
+      case CCValAssign::AExt:
+        ExtType = ISD::EXTLOAD;
+        break;
+      }
 
-  // Most of the rest of the instructions need to be glued together; we don't
-  // want assignments to actual registers used by a call to be rearranged by a
-  // well-meaning scheduler.
-  SDValue InFlag;
+      ArgValue = DAG.getExtLoad(ExtType, DL, VA.getValVT(), Chain, FIN,
+                                MachinePointerInfo::getFixedStack(FI),
+                                VA.getLocVT(),
+                                false, false, false, 0);
 
-  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
-    Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
-                             RegsToPass[i].second, InFlag);
-    InFlag = Chain.getValue(1);
+      InVals.push_back(ArgValue);
+    }
   }
 
-  // The linker is responsible for inserting veneers when necessary to put a
-  // function call destination in range, so we don't need to bother with a
-  // wrapper here.
-  if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
-    const GlobalValue *GV = G->getGlobal();
-    Callee = DAG.getTargetGlobalAddress(GV, dl, getPointerTy());
-  } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
-    const char *Sym = S->getSymbol();
-    Callee = DAG.getTargetExternalSymbol(Sym, getPointerTy());
-  }
+  // varargs
+  if (isVarArg) {
+    if (!Subtarget->isTargetDarwin()) {
+      // The AAPCS variadic function ABI is identical to the non-variadic
+      // one. As a result there may be more arguments in registers and we should
+      // save them for future reference.
+      saveVarArgRegisters(CCInfo, DAG, DL, Chain);
+    }
 
-  // We don't usually want to end the call-sequence here because we would tidy
-  // the frame up *after* the call, however in the ABI-changing tail-call case
-  // we've carefully laid out the parameters so that when sp is reset they'll be
-  // in the correct location.
-  if (IsTailCall && !IsSibCall) {
-    Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
-                               DAG.getIntPtrConstant(0, true), InFlag, dl);
-    InFlag = Chain.getValue(1);
+    AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+    // This will point to the next argument passed via stack.
+    unsigned StackOffset = CCInfo.getNextStackOffset();
+    // We currently pass all varargs at 8-byte alignment.
+    StackOffset = ((StackOffset + 7) & ~7);
+    AFI->setVarArgsStackIndex(MFI->CreateFixedObject(4, StackOffset, true));
   }
 
-  // We produce the following DAG scheme for the actual call instruction:
-  //     (AArch64Call Chain, Callee, reg1, ..., regn, preserveMask, inflag?
-  //
-  // Most arguments aren't going to be used and just keep the values live as
-  // far as LLVM is concerned. It's expected to be selected as simply "bl
-  // callee" (for a direct, non-tail call).
-  std::vector<SDValue> Ops;
-  Ops.push_back(Chain);
-  Ops.push_back(Callee);
+  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
+  unsigned StackArgSize = CCInfo.getNextStackOffset();
+  bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
+  if (DoesCalleeRestoreStack(CallConv, TailCallOpt)) {
+    // This is a non-standard ABI so by fiat I say we're allowed to make full
+    // use of the stack area to be popped, which must be aligned to 16 bytes in
+    // any case:
+    StackArgSize = RoundUpToAlignment(StackArgSize, 16);
 
-  if (IsTailCall) {
-    // Each tail call may have to adjust the stack by a different amount, so
-    // this information must travel along with the operation for eventual
-    // consumption by emitEpilogue.
-    Ops.push_back(DAG.getTargetConstant(FPDiff, MVT::i32));
+    // If we're expected to restore the stack (e.g. fastcc) then we'll be adding
+    // a multiple of 16.
+    FuncInfo->setArgumentStackToRestore(StackArgSize);
+
+    // This realignment carries over to the available bytes below. Our own
+    // callers will guarantee the space is free by giving an aligned value to
+    // CALLSEQ_START.
   }
+  // Even if we're not expected to free up the space, it's useful to know how
+  // much is there while considering tail calls (because we can reuse it).
+  FuncInfo->setBytesInStackArgArea(StackArgSize);
 
-  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
-    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
-                                  RegsToPass[i].second.getValueType()));
+  return Chain;
+}
 
+void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
+                                                SelectionDAG &DAG, SDLoc DL,
+                                                SDValue &Chain) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
 
-  // Add a register mask operand representing the call-preserved registers. This
-  // is used later in codegen to constrain register-allocation.
-  const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
-  const uint32_t *Mask = TRI->getCallPreservedMask(CallConv);
-  assert(Mask && "Missing call preserved mask for calling convention");
-  Ops.push_back(DAG.getRegisterMask(Mask));
+  SmallVector<SDValue, 8> MemOps;
 
-  // If we needed glue, put it in as the last argument.
-  if (InFlag.getNode())
-    Ops.push_back(InFlag);
+  static const MCPhysReg GPRArgRegs[] = { AArch64::X0, AArch64::X1, AArch64::X2,
+                                          AArch64::X3, AArch64::X4, AArch64::X5,
+                                          AArch64::X6, AArch64::X7 };
+  static const unsigned NumGPRArgRegs = array_lengthof(GPRArgRegs);
+  unsigned FirstVariadicGPR =
+      CCInfo.getFirstUnallocated(GPRArgRegs, NumGPRArgRegs);
 
-  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+  unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR);
+  int GPRIdx = 0;
+  if (GPRSaveSize != 0) {
+    GPRIdx = MFI->CreateStackObject(GPRSaveSize, 8, false);
 
-  if (IsTailCall) {
-    return DAG.getNode(AArch64ISD::TC_RETURN, dl, NodeTys, &Ops[0], Ops.size());
+    SDValue FIN = DAG.getFrameIndex(GPRIdx, getPointerTy());
+
+    for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) {
+      unsigned VReg = MF.addLiveIn(GPRArgRegs[i], &AArch64::GPR64RegClass);
+      SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
+      SDValue Store =
+          DAG.getStore(Val.getValue(1), DL, Val, FIN,
+                       MachinePointerInfo::getStack(i * 8), false, false, 0);
+      MemOps.push_back(Store);
+      FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), FIN,
+                        DAG.getConstant(8, getPointerTy()));
+    }
   }
+  FuncInfo->setVarArgsGPRIndex(GPRIdx);
+  FuncInfo->setVarArgsGPRSize(GPRSaveSize);
 
-  Chain = DAG.getNode(AArch64ISD::Call, dl, NodeTys, &Ops[0], Ops.size());
-  InFlag = Chain.getValue(1);
+  if (Subtarget->hasFPARMv8()) {
+    static const MCPhysReg FPRArgRegs[] = {
+        AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3,
+        AArch64::Q4, AArch64::Q5, AArch64::Q6, AArch64::Q7};
+    static const unsigned NumFPRArgRegs = array_lengthof(FPRArgRegs);
+    unsigned FirstVariadicFPR =
+        CCInfo.getFirstUnallocated(FPRArgRegs, NumFPRArgRegs);
+
+    unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
+    int FPRIdx = 0;
+    if (FPRSaveSize != 0) {
+      FPRIdx = MFI->CreateStackObject(FPRSaveSize, 16, false);
 
-  // Now we can reclaim the stack, just as well do it before working out where
-  // our return value is.
-  if (!IsSibCall) {
-    uint64_t CalleePopBytes
-      = DoesCalleeRestoreStack(CallConv, TailCallOpt) ? NumBytes : 0;
+      SDValue FIN = DAG.getFrameIndex(FPRIdx, getPointerTy());
 
-    Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
-                               DAG.getIntPtrConstant(CalleePopBytes, true),
-                               InFlag, dl);
-    InFlag = Chain.getValue(1);
+      for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) {
+        unsigned VReg = MF.addLiveIn(FPRArgRegs[i], &AArch64::FPR128RegClass);
+        SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::f128);
+
+        SDValue Store =
+            DAG.getStore(Val.getValue(1), DL, Val, FIN,
+                         MachinePointerInfo::getStack(i * 16), false, false, 0);
+        MemOps.push_back(Store);
+        FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), FIN,
+                          DAG.getConstant(16, getPointerTy()));
+      }
+    }
+    FuncInfo->setVarArgsFPRIndex(FPRIdx);
+    FuncInfo->setVarArgsFPRSize(FPRSaveSize);
   }
 
-  return LowerCallResult(Chain, InFlag, CallConv,
-                         IsVarArg, Ins, dl, DAG, InVals);
+  if (!MemOps.empty()) {
+    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
+  }
 }
 
-SDValue
-AArch64TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
-                                      CallingConv::ID CallConv, bool IsVarArg,
-                                      const SmallVectorImpl<ISD::InputArg> &Ins,
-                                      SDLoc dl, SelectionDAG &DAG,
-                                      SmallVectorImpl<SDValue> &InVals) const {
+/// LowerCallResult - Lower the result values of a call into the
+/// appropriate copies out of appropriate physical registers.
+SDValue AArch64TargetLowering::LowerCallResult(
+    SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
+    const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL, SelectionDAG &DAG,
+    SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
+    SDValue ThisVal) const {
+  CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS
+                          ? RetCC_AArch64_WebKit_JS
+                          : RetCC_AArch64_AAPCS;
   // Assign locations to each value returned by this call.
   SmallVector<CCValAssign, 16> RVLocs;
-  CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(),
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
                  getTargetMachine(), RVLocs, *DAG.getContext());
-  CCInfo.AnalyzeCallResult(Ins, CCAssignFnForNode(CallConv));
+  CCInfo.AnalyzeCallResult(Ins, RetCC);
 
+  // Copy all of the result registers out of their specified physreg.
   for (unsigned i = 0; i != RVLocs.size(); ++i) {
     CCValAssign VA = RVLocs[i];
 
-    // Return values that are too big to fit into registers should use an sret
-    // pointer, so this can be a lot simpler than the main argument code.
-    assert(VA.isRegLoc() && "Memory locations not expected for call return");
+    // Pass 'this' value directly from the argument to return value, to avoid
+    // reg unit interference
+    if (i == 0 && isThisReturn) {
+      assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 &&
+             "unexpected return calling convention register assignment");
+      InVals.push_back(ThisVal);
+      continue;
+    }
 
-    SDValue Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(),
-                                     InFlag);
+    SDValue Val =
+        DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag);
     Chain = Val.getValue(1);
     InFlag = Val.getValue(2);
 
     switch (VA.getLocInfo()) {
-    default: llvm_unreachable("Unknown loc info!");
-    case CCValAssign::Full: break;
-    case CCValAssign::BCvt:
-      Val = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), Val);
+    default:
+      llvm_unreachable("Unknown loc info!");
+    case CCValAssign::Full:
       break;
-    case CCValAssign::ZExt:
-    case CCValAssign::SExt:
-    case CCValAssign::AExt:
-      // Floating-point arguments only get extended/truncated if they're going
-      // in memory, so using the integer operation is acceptable here.
-      Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
+    case CCValAssign::BCvt:
+      Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
       break;
     }
 
@@ -1812,17 +1960,12 @@ AArch64TargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
   return Chain;
 }
 
-bool
-AArch64TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
-                                    CallingConv::ID CalleeCC,
-                                    bool IsVarArg,
-                                    bool IsCalleeStructRet,
-                                    bool IsCallerStructRet,
-                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
-                                    const SmallVectorImpl<SDValue> &OutVals,
-                                    const SmallVectorImpl<ISD::InputArg> &Ins,
-                                    SelectionDAG& DAG) const {
-
+bool AArch64TargetLowering::isEligibleForTailCallOptimization(
+    SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
+    bool isCalleeStructRet, bool isCallerStructRet,
+    const SmallVectorImpl<ISD::OutputArg> &Outs,
+    const SmallVectorImpl<SDValue> &OutVals,
+    const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
   // For CallingConv::C this function knows whether the ABI needs
   // changing. That's not true for other conventions so they will have to opt in
   // manually.
@@ -1838,7 +1981,8 @@ AArch64TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
   // we want to reuse during a tail call. Working around this *is* possible (see
   // X86) but less efficient and uglier in LowerCall.
   for (Function::const_arg_iterator i = CallerF->arg_begin(),
-         e = CallerF->arg_end(); i != e; ++i)
+                                    e = CallerF->arg_end();
+       i != e; ++i)
     if (i->hasByValAttr())
       return false;
 
@@ -1854,10 +1998,10 @@ AArch64TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
 
   // I want anyone implementing a new calling convention to think long and hard
   // about this assert.
-  assert((!IsVarArg || CalleeCC == CallingConv::C)
-         && "Unexpected variadic calling convention");
+  assert((!isVarArg || CalleeCC == CallingConv::C) &&
+         "Unexpected variadic calling convention");
 
-  if (IsVarArg && !Outs.empty()) {
+  if (isVarArg && !Outs.empty()) {
     // At least two cases here: if caller is fastcc then we can't have any
     // memory arguments (we'd be expected to clean up the stack afterwards). If
     // caller is C then we could potentially use its argument area.
@@ -1865,10 +2009,10 @@ AArch64TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
     // FIXME: for now we take the most conservative of these in both cases:
     // disallow all variadic memory operands.
     SmallVector<CCValAssign, 16> ArgLocs;
-    CCState CCInfo(CalleeCC, IsVarArg, DAG.getMachineFunction(),
+    CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
                    getTargetMachine(), ArgLocs, *DAG.getContext());
 
-    CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CalleeCC));
+    CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, true));
     for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
       if (!ArgLocs[i].isRegLoc())
         return false;
@@ -1880,12 +2024,12 @@ AArch64TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
     SmallVector<CCValAssign, 16> RVLocs1;
     CCState CCInfo1(CalleeCC, false, DAG.getMachineFunction(),
                     getTargetMachine(), RVLocs1, *DAG.getContext());
-    CCInfo1.AnalyzeCallResult(Ins, CCAssignFnForNode(CalleeCC));
+    CCInfo1.AnalyzeCallResult(Ins, CCAssignFnForCall(CalleeCC, isVarArg));
 
     SmallVector<CCValAssign, 16> RVLocs2;
     CCState CCInfo2(CallerCC, false, DAG.getMachineFunction(),
                     getTargetMachine(), RVLocs2, *DAG.getContext());
-    CCInfo2.AnalyzeCallResult(Ins, CCAssignFnForNode(CallerCC));
+    CCInfo2.AnalyzeCallResult(Ins, CCAssignFnForCall(CallerCC, isVarArg));
 
     if (RVLocs1.size() != RVLocs2.size())
       return false;
@@ -1909,28 +2053,18 @@ AArch64TargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
     return true;
 
   SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CalleeCC, IsVarArg, DAG.getMachineFunction(),
+  CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
                  getTargetMachine(), ArgLocs, *DAG.getContext());
 
-  CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForNode(CalleeCC));
+  CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg));
 
-  const AArch64MachineFunctionInfo *FuncInfo
-    = MF.getInfo<AArch64MachineFunctionInfo>();
+  const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
 
   // If the stack arguments for this call would fit into our own save area then
   // the call can be made tail.
   return CCInfo.getNextStackOffset() <= FuncInfo->getBytesInStackArgArea();
 }
 
-bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC,
-                                                   bool TailCallOpt) const {
-  return CallCC == CallingConv::Fast && TailCallOpt;
-}
-
-bool AArch64TargetLowering::IsTailCallConvention(CallingConv::ID CallCC) const {
-  return CallCC == CallingConv::Fast;
-}
-
 SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain,
                                                    SelectionDAG &DAG,
                                                    MachineFrameInfo *MFI,
@@ -1946,7 +2080,8 @@ SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain,
 
   // Add a chain value for each stack argument corresponding
   for (SDNode::use_iterator U = DAG.getEntryNode().getNode()->use_begin(),
-         UE = DAG.getEntryNode().getNode()->use_end(); U != UE; ++U)
+                            UE = DAG.getEntryNode().getNode()->use_end();
+       U != UE; ++U)
     if (LoadSDNode *L = dyn_cast<LoadSDNode>(*U))
       if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr()))
         if (FI->getIndex() < 0) {
@@ -1959,625 +2094,609 @@ SDValue AArch64TargetLowering::addTokenForArgument(SDValue Chain,
             ArgChains.push_back(SDValue(L, 1));
         }
 
-   // Build a tokenfactor for all the chains.
-   return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other,
-                      &ArgChains[0], ArgChains.size());
+  // Build a tokenfactor for all the chains.
+  return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
 }
 
-static A64CC::CondCodes IntCCToA64CC(ISD::CondCode CC) {
-  switch (CC) {
-  case ISD::SETEQ:  return A64CC::EQ;
-  case ISD::SETGT:  return A64CC::GT;
-  case ISD::SETGE:  return A64CC::GE;
-  case ISD::SETLT:  return A64CC::LT;
-  case ISD::SETLE:  return A64CC::LE;
-  case ISD::SETNE:  return A64CC::NE;
-  case ISD::SETUGT: return A64CC::HI;
-  case ISD::SETUGE: return A64CC::HS;
-  case ISD::SETULT: return A64CC::LO;
-  case ISD::SETULE: return A64CC::LS;
-  default: llvm_unreachable("Unexpected condition code");
-  }
-}
-
-bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Val) const {
-  // icmp is implemented using adds/subs immediate, which take an unsigned
-  // 12-bit immediate, optionally shifted left by 12 bits.
-
-  // Symmetric by using adds/subs
-  if (Val < 0)
-    Val = -Val;
-
-  return (Val & ~0xfff) == 0 || (Val & ~0xfff000) == 0;
-}
-
-SDValue AArch64TargetLowering::getSelectableIntSetCC(SDValue LHS, SDValue RHS,
-                                        ISD::CondCode CC, SDValue &A64cc,
-                                        SelectionDAG &DAG, SDLoc &dl) const {
-  if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
-    int64_t C = 0;
-    EVT VT = RHSC->getValueType(0);
-    bool knownInvalid = false;
-
-    // I'm not convinced the rest of LLVM handles these edge cases properly, but
-    // we can at least get it right.
-    if (isSignedIntSetCC(CC)) {
-      C = RHSC->getSExtValue();
-    } else if (RHSC->getZExtValue() > INT64_MAX) {
-      // A 64-bit constant not representable by a signed 64-bit integer is far
-      // too big to fit into a SUBS immediate anyway.
-      knownInvalid = true;
-    } else {
-      C = RHSC->getZExtValue();
-    }
-
-    if (!knownInvalid && !isLegalICmpImmediate(C)) {
-      // Constant does not fit, try adjusting it by one?
-      switch (CC) {
-      default: break;
-      case ISD::SETLT:
-      case ISD::SETGE:
-        if (isLegalICmpImmediate(C-1)) {
-          CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
-          RHS = DAG.getConstant(C-1, VT);
-        }
-        break;
-      case ISD::SETULT:
-      case ISD::SETUGE:
-        if (isLegalICmpImmediate(C-1)) {
-          CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
-          RHS = DAG.getConstant(C-1, VT);
-        }
-        break;
-      case ISD::SETLE:
-      case ISD::SETGT:
-        if (isLegalICmpImmediate(C+1)) {
-          CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
-          RHS = DAG.getConstant(C+1, VT);
-        }
-        break;
-      case ISD::SETULE:
-      case ISD::SETUGT:
-        if (isLegalICmpImmediate(C+1)) {
-          CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
-          RHS = DAG.getConstant(C+1, VT);
-        }
-        break;
-      }
-    }
-  }
-
-  A64CC::CondCodes CondCode = IntCCToA64CC(CC);
-  A64cc = DAG.getConstant(CondCode, MVT::i32);
-  return DAG.getNode(AArch64ISD::SETCC, dl, MVT::i32, LHS, RHS,
-                     DAG.getCondCode(CC));
+bool AArch64TargetLowering::DoesCalleeRestoreStack(CallingConv::ID CallCC,
+                                                   bool TailCallOpt) const {
+  return CallCC == CallingConv::Fast && TailCallOpt;
 }
 
-static A64CC::CondCodes FPCCToA64CC(ISD::CondCode CC,
-                                    A64CC::CondCodes &Alternative) {
-  A64CC::CondCodes CondCode = A64CC::Invalid;
-  Alternative = A64CC::Invalid;
-
-  switch (CC) {
-  default: llvm_unreachable("Unknown FP condition!");
-  case ISD::SETEQ:
-  case ISD::SETOEQ: CondCode = A64CC::EQ; break;
-  case ISD::SETGT:
-  case ISD::SETOGT: CondCode = A64CC::GT; break;
-  case ISD::SETGE:
-  case ISD::SETOGE: CondCode = A64CC::GE; break;
-  case ISD::SETOLT: CondCode = A64CC::MI; break;
-  case ISD::SETOLE: CondCode = A64CC::LS; break;
-  case ISD::SETONE: CondCode = A64CC::MI; Alternative = A64CC::GT; break;
-  case ISD::SETO:   CondCode = A64CC::VC; break;
-  case ISD::SETUO:  CondCode = A64CC::VS; break;
-  case ISD::SETUEQ: CondCode = A64CC::EQ; Alternative = A64CC::VS; break;
-  case ISD::SETUGT: CondCode = A64CC::HI; break;
-  case ISD::SETUGE: CondCode = A64CC::PL; break;
-  case ISD::SETLT:
-  case ISD::SETULT: CondCode = A64CC::LT; break;
-  case ISD::SETLE:
-  case ISD::SETULE: CondCode = A64CC::LE; break;
-  case ISD::SETNE:
-  case ISD::SETUNE: CondCode = A64CC::NE; break;
-  }
-  return CondCode;
+bool AArch64TargetLowering::IsTailCallConvention(CallingConv::ID CallCC) const {
+  return CallCC == CallingConv::Fast;
 }
 
+/// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain,
+/// and add input and output parameter nodes.
 SDValue
-AArch64TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
-  SDLoc DL(Op);
-  EVT PtrVT = getPointerTy();
-  const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
-
-  switch(getTargetMachine().getCodeModel()) {
-  case CodeModel::Small:
-    // The most efficient code is PC-relative anyway for the small memory model,
-    // so we don't need to worry about relocation model.
-    return DAG.getNode(AArch64ISD::WrapperSmall, DL, PtrVT,
-                       DAG.getTargetBlockAddress(BA, PtrVT, 0,
-                                                 AArch64II::MO_NO_FLAG),
-                       DAG.getTargetBlockAddress(BA, PtrVT, 0,
-                                                 AArch64II::MO_LO12),
-                       DAG.getConstant(/*Alignment=*/ 4, MVT::i32));
-  case CodeModel::Large:
-    return DAG.getNode(
-      AArch64ISD::WrapperLarge, DL, PtrVT,
-      DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_ABS_G3),
-      DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_ABS_G2_NC),
-      DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_ABS_G1_NC),
-      DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_ABS_G0_NC));
-  default:
-    llvm_unreachable("Only small and large code models supported now");
-  }
-}
-
+AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
+                                 SmallVectorImpl<SDValue> &InVals) const {
+  SelectionDAG &DAG = CLI.DAG;
+  SDLoc &DL = CLI.DL;
+  SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
+  SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
+  SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
+  SDValue Chain = CLI.Chain;
+  SDValue Callee = CLI.Callee;
+  bool &IsTailCall = CLI.IsTailCall;
+  CallingConv::ID CallConv = CLI.CallConv;
+  bool IsVarArg = CLI.IsVarArg;
 
-// (BRCOND chain, val, dest)
-SDValue
-AArch64TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
-  SDLoc dl(Op);
-  SDValue Chain = Op.getOperand(0);
-  SDValue TheBit = Op.getOperand(1);
-  SDValue DestBB = Op.getOperand(2);
+  MachineFunction &MF = DAG.getMachineFunction();
+  bool IsStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
+  bool IsThisReturn = false;
 
-  // AArch64 BooleanContents is the default UndefinedBooleanContent, which means
-  // that as the consumer we are responsible for ignoring rubbish in higher
-  // bits.
-  TheBit = DAG.getNode(ISD::AND, dl, MVT::i32, TheBit,
-                       DAG.getConstant(1, MVT::i32));
+  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
+  bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
+  bool IsSibCall = false;
 
-  SDValue A64CMP = DAG.getNode(AArch64ISD::SETCC, dl, MVT::i32, TheBit,
-                               DAG.getConstant(0, TheBit.getValueType()),
-                               DAG.getCondCode(ISD::SETNE));
+  if (IsTailCall) {
+    // Check if it's really possible to do a tail call.
+    IsTailCall = isEligibleForTailCallOptimization(
+        Callee, CallConv, IsVarArg, IsStructRet,
+        MF.getFunction()->hasStructRetAttr(), Outs, OutVals, Ins, DAG);
+    if (!IsTailCall && CLI.CS && CLI.CS->isMustTailCall())
+      report_fatal_error("failed to perform tail call elimination on a call "
+                         "site marked musttail");
 
-  return DAG.getNode(AArch64ISD::BR_CC, dl, MVT::Other, Chain,
-                     A64CMP, DAG.getConstant(A64CC::NE, MVT::i32),
-                     DestBB);
-}
+    // A sibling call is one where we're under the usual C ABI and not planning
+    // to change that but can still do a tail call:
+    if (!TailCallOpt && IsTailCall)
+      IsSibCall = true;
 
-// (BR_CC chain, condcode, lhs, rhs, dest)
-SDValue
-AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
-  SDLoc dl(Op);
-  SDValue Chain = Op.getOperand(0);
-  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
-  SDValue LHS = Op.getOperand(2);
-  SDValue RHS = Op.getOperand(3);
-  SDValue DestBB = Op.getOperand(4);
+    if (IsTailCall)
+      ++NumTailCalls;
+  }
 
-  if (LHS.getValueType() == MVT::f128) {
-    // f128 comparisons are lowered to runtime calls by a routine which sets
-    // LHS, RHS and CC appropriately for the rest of this function to continue.
-    softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl);
+  // Analyze operands of the call, assigning locations to each operand.
+  SmallVector<CCValAssign, 16> ArgLocs;
+  CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(),
+                 getTargetMachine(), ArgLocs, *DAG.getContext());
 
-    // If softenSetCCOperands returned a scalar, we need to compare the result
-    // against zero to select between true and false values.
-    if (RHS.getNode() == 0) {
-      RHS = DAG.getConstant(0, LHS.getValueType());
-      CC = ISD::SETNE;
+  if (IsVarArg) {
+    // Handle fixed and variable vector arguments differently.
+    // Variable vector arguments always go into memory.
+    unsigned NumArgs = Outs.size();
+
+    for (unsigned i = 0; i != NumArgs; ++i) {
+      MVT ArgVT = Outs[i].VT;
+      ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
+      CCAssignFn *AssignFn = CCAssignFnForCall(CallConv,
+                                               /*IsVarArg=*/ !Outs[i].IsFixed);
+      bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo);
+      assert(!Res && "Call operand has unhandled type");
+      (void)Res;
+    }
+  } else {
+    // At this point, Outs[].VT may already be promoted to i32. To correctly
+    // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
+    // i8 to CC_AArch64_AAPCS with i32 being ValVT and i8 being LocVT.
+    // Since AnalyzeCallOperands uses Ins[].VT for both ValVT and LocVT, here
+    // we use a special version of AnalyzeCallOperands to pass in ValVT and
+    // LocVT.
+    unsigned NumArgs = Outs.size();
+    for (unsigned i = 0; i != NumArgs; ++i) {
+      MVT ValVT = Outs[i].VT;
+      // Get type of the original argument.
+      EVT ActualVT = getValueType(CLI.getArgs()[Outs[i].OrigArgIndex].Ty,
+                                  /*AllowUnknown*/ true);
+      MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ValVT;
+      ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
+      // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
+      if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
+        ValVT = MVT::i8;
+      else if (ActualMVT == MVT::i16)
+        ValVT = MVT::i16;
+
+      CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false);
+      bool Res = AssignFn(i, ValVT, ValVT, CCValAssign::Full, ArgFlags, CCInfo);
+      assert(!Res && "Call operand has unhandled type");
+      (void)Res;
     }
   }
 
-  if (LHS.getValueType().isInteger()) {
-    SDValue A64cc;
-
-    // Integers are handled in a separate function because the combinations of
-    // immediates and tests can get hairy and we may want to fiddle things.
-    SDValue CmpOp = getSelectableIntSetCC(LHS, RHS, CC, A64cc, DAG, dl);
+  // Get a count of how many bytes are to be pushed on the stack.
+  unsigned NumBytes = CCInfo.getNextStackOffset();
 
-    return DAG.getNode(AArch64ISD::BR_CC, dl, MVT::Other,
-                       Chain, CmpOp, A64cc, DestBB);
+  if (IsSibCall) {
+    // Since we're not changing the ABI to make this a tail call, the memory
+    // operands are already available in the caller's incoming argument space.
+    NumBytes = 0;
   }
 
-  // Note that some LLVM floating-point CondCodes can't be lowered to a single
-  // conditional branch, hence FPCCToA64CC can set a second test, where either
-  // passing is sufficient.
-  A64CC::CondCodes CondCode, Alternative = A64CC::Invalid;
-  CondCode = FPCCToA64CC(CC, Alternative);
-  SDValue A64cc = DAG.getConstant(CondCode, MVT::i32);
-  SDValue SetCC = DAG.getNode(AArch64ISD::SETCC, dl, MVT::i32, LHS, RHS,
-                              DAG.getCondCode(CC));
-  SDValue A64BR_CC = DAG.getNode(AArch64ISD::BR_CC, dl, MVT::Other,
-                                 Chain, SetCC, A64cc, DestBB);
+  // FPDiff is the byte offset of the call's argument area from the callee's.
+  // Stores to callee stack arguments will be placed in FixedStackSlots offset
+  // by this amount for a tail call. In a sibling call it must be 0 because the
+  // caller will deallocate the entire stack and the callee still expects its
+  // arguments to begin at SP+0. Completely unused for non-tail calls.
+  int FPDiff = 0;
 
-  if (Alternative != A64CC::Invalid) {
-    A64cc = DAG.getConstant(Alternative, MVT::i32);
-    A64BR_CC = DAG.getNode(AArch64ISD::BR_CC, dl, MVT::Other,
-                           A64BR_CC, SetCC, A64cc, DestBB);
+  if (IsTailCall && !IsSibCall) {
+    unsigned NumReusableBytes = FuncInfo->getBytesInStackArgArea();
 
-  }
+    // Since callee will pop argument stack as a tail call, we must keep the
+    // popped size 16-byte aligned.
+    NumBytes = RoundUpToAlignment(NumBytes, 16);
 
-  return A64BR_CC;
-}
+    // FPDiff will be negative if this tail call requires more space than we
+    // would automatically have in our incoming argument space. Positive if we
+    // can actually shrink the stack.
+    FPDiff = NumReusableBytes - NumBytes;
 
-SDValue
-AArch64TargetLowering::LowerF128ToCall(SDValue Op, SelectionDAG &DAG,
-                                       RTLIB::Libcall Call) const {
-  ArgListTy Args;
-  ArgListEntry Entry;
-  for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
-    EVT ArgVT = Op.getOperand(i).getValueType();
-    Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
-    Entry.Node = Op.getOperand(i); Entry.Ty = ArgTy;
-    Entry.isSExt = false;
-    Entry.isZExt = false;
-    Args.push_back(Entry);
+    // The stack pointer must be 16-byte aligned at all times it's used for a
+    // memory operation, which in practice means at *all* times and in
+    // particular across call boundaries. Therefore our own arguments started at
+    // a 16-byte aligned SP and the delta applied for the tail call should
+    // satisfy the same constraint.
+    assert(FPDiff % 16 == 0 && "unaligned stack on tail call");
   }
-  SDValue Callee = DAG.getExternalSymbol(getLibcallName(Call), getPointerTy());
 
-  Type *RetTy = Op.getValueType().getTypeForEVT(*DAG.getContext());
+  // Adjust the stack pointer for the new arguments...
+  // These operations are automatically eliminated by the prolog/epilog pass
+  if (!IsSibCall)
+    Chain =
+        DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true), DL);
 
-  // By default, the input chain to this libcall is the entry node of the
-  // function. If the libcall is going to be emitted as a tail call then
-  // isUsedByReturnOnly will change it to the right chain if the return
-  // node which is being folded has a non-entry input chain.
-  SDValue InChain = DAG.getEntryNode();
+  SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP, getPointerTy());
 
-  // isTailCall may be true since the callee does not reference caller stack
-  // frame. Check if it's in the right position.
-  SDValue TCChain = InChain;
-  bool isTailCall = isInTailCallPosition(DAG, Op.getNode(), TCChain);
-  if (isTailCall)
-    InChain = TCChain;
+  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
+  SmallVector<SDValue, 8> MemOpChains;
 
-  TargetLowering::
-  CallLoweringInfo CLI(InChain, RetTy, false, false, false, false,
-                    0, getLibcallCallingConv(Call), isTailCall,
-                    /*doesNotReturn=*/false, /*isReturnValueUsed=*/true,
-                    Callee, Args, DAG, SDLoc(Op));
-  std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
+  // Walk the register/memloc assignments, inserting copies/loads.
+  for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e;
+       ++i, ++realArgIdx) {
+    CCValAssign &VA = ArgLocs[i];
+    SDValue Arg = OutVals[realArgIdx];
+    ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
 
-  if (!CallInfo.second.getNode())
-    // It's a tailcall, return the chain (which is the DAG root).
-    return DAG.getRoot();
+    // Promote the value if needed.
+    switch (VA.getLocInfo()) {
+    default:
+      llvm_unreachable("Unknown loc info!");
+    case CCValAssign::Full:
+      break;
+    case CCValAssign::SExt:
+      Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
+      break;
+    case CCValAssign::ZExt:
+      Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
+      break;
+    case CCValAssign::AExt:
+      if (Outs[realArgIdx].ArgVT == MVT::i1) {
+        // AAPCS requires i1 to be zero-extended to 8-bits by the caller.
+        Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
+        Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i8, Arg);
+      }
+      Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
+      break;
+    case CCValAssign::BCvt:
+      Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
+      break;
+    case CCValAssign::FPExt:
+      Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
+      break;
+    }
 
-  return CallInfo.first;
-}
+    if (VA.isRegLoc()) {
+      if (realArgIdx == 0 && Flags.isReturned() && Outs[0].VT == MVT::i64) {
+        assert(VA.getLocVT() == MVT::i64 &&
+               "unexpected calling convention register assignment");
+        assert(!Ins.empty() && Ins[0].VT == MVT::i64 &&
+               "unexpected use of 'returned'");
+        IsThisReturn = true;
+      }
+      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+    } else {
+      assert(VA.isMemLoc());
 
-SDValue
-AArch64TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
-  if (Op.getOperand(0).getValueType() != MVT::f128) {
-    // It's legal except when f128 is involved
-    return Op;
-  }
+      SDValue DstAddr;
+      MachinePointerInfo DstInfo;
 
-  RTLIB::Libcall LC;
-  LC  = RTLIB::getFPROUND(Op.getOperand(0).getValueType(), Op.getValueType());
+      // FIXME: This works on big-endian for composite byvals, which are the
+      // common case. It should also work for fundamental types too.
+      uint32_t BEAlign = 0;
+      unsigned OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
+                                        : VA.getLocVT().getSizeInBits();
+      OpSize = (OpSize + 7) / 8;
+      if (!Subtarget->isLittleEndian() && !Flags.isByVal()) {
+        if (OpSize < 8)
+          BEAlign = 8 - OpSize;
+      }
+      unsigned LocMemOffset = VA.getLocMemOffset();
+      int32_t Offset = LocMemOffset + BEAlign;
+      SDValue PtrOff = DAG.getIntPtrConstant(Offset);
+      PtrOff = DAG.getNode(ISD::ADD, DL, getPointerTy(), StackPtr, PtrOff);
+
+      if (IsTailCall) {
+        Offset = Offset + FPDiff;
+        int FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
+
+        DstAddr = DAG.getFrameIndex(FI, getPointerTy());
+        DstInfo = MachinePointerInfo::getFixedStack(FI);
+
+        // Make sure any stack arguments overlapping with where we're storing
+        // are loaded before this eventual operation. Otherwise they'll be
+        // clobbered.
+        Chain = addTokenForArgument(Chain, DAG, MF.getFrameInfo(), FI);
+      } else {
+        SDValue PtrOff = DAG.getIntPtrConstant(Offset);
 
-  SDValue SrcVal = Op.getOperand(0);
-  return makeLibCall(DAG, LC, Op.getValueType(), &SrcVal, 1,
-                     /*isSigned*/ false, SDLoc(Op)).first;
-}
+        DstAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), StackPtr, PtrOff);
+        DstInfo = MachinePointerInfo::getStack(LocMemOffset);
+      }
 
-SDValue
-AArch64TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
-  assert(Op.getValueType() == MVT::f128 && "Unexpected lowering");
+      if (Outs[i].Flags.isByVal()) {
+        SDValue SizeNode =
+            DAG.getConstant(Outs[i].Flags.getByValSize(), MVT::i64);
+        SDValue Cpy = DAG.getMemcpy(
+            Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(),
+            /*isVolatile = */ false,
+            /*alwaysInline = */ false, DstInfo, MachinePointerInfo());
 
-  RTLIB::Libcall LC;
-  LC  = RTLIB::getFPEXT(Op.getOperand(0).getValueType(), Op.getValueType());
+        MemOpChains.push_back(Cpy);
+      } else {
+        // Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already
+        // promoted to a legal register type i32, we should truncate Arg back to
+        // i1/i8/i16.
+        if (Arg.getValueType().isSimple() &&
+            Arg.getValueType().getSimpleVT() == MVT::i32 &&
+            (VA.getLocVT() == MVT::i1 || VA.getLocVT() == MVT::i8 ||
+             VA.getLocVT() == MVT::i16))
+          Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getLocVT(), Arg);
+
+        SDValue Store =
+            DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, false, false, 0);
+        MemOpChains.push_back(Store);
+      }
+    }
+  }
 
-  return LowerF128ToCall(Op, DAG, LC);
-}
+  if (!MemOpChains.empty())
+    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
 
-static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG,
-                                    bool IsSigned) {
-  SDLoc dl(Op);
-  EVT VT = Op.getValueType();
-  SDValue Vec = Op.getOperand(0);
-  EVT OpVT = Vec.getValueType();
-  unsigned Opc = IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
-
-  if (VT.getVectorNumElements() == 1) {
-    assert(OpVT == MVT::v1f64 && "Unexpected vector type!");
-    if (VT.getSizeInBits() == OpVT.getSizeInBits())
-      return Op;
-    return DAG.UnrollVectorOp(Op.getNode());
-  }
-
-  if (VT.getSizeInBits() > OpVT.getSizeInBits()) {
-    assert(Vec.getValueType() == MVT::v2f32 && VT == MVT::v2i64 &&
-           "Unexpected vector type!");
-    Vec = DAG.getNode(ISD::FP_EXTEND, dl, MVT::v2f64, Vec);
-    return DAG.getNode(Opc, dl, VT, Vec);
-  } else if (VT.getSizeInBits() < OpVT.getSizeInBits()) {
-    EVT CastVT = EVT::getIntegerVT(*DAG.getContext(),
-                                   OpVT.getVectorElementType().getSizeInBits());
-    CastVT =
-        EVT::getVectorVT(*DAG.getContext(), CastVT, VT.getVectorNumElements());
-    Vec = DAG.getNode(Opc, dl, CastVT, Vec);
-    return DAG.getNode(ISD::TRUNCATE, dl, VT, Vec);
-  }
-  return DAG.getNode(Opc, dl, VT, Vec);
-}
-
-static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
-  // We custom lower concat_vectors with 4, 8, or 16 operands that are all the
-  // same operand and of type v1* using the DUP instruction.
-  unsigned NumOps = Op->getNumOperands();
-  if (NumOps == 2) {
-    assert(Op.getValueType().getSizeInBits() == 128 && "unexpected concat");
-    return Op;
+  // Build a sequence of copy-to-reg nodes chained together with token chain
+  // and flag operands which copy the outgoing args into the appropriate regs.
+  SDValue InFlag;
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+    Chain = DAG.getCopyToReg(Chain, DL, RegsToPass[i].first,
+                             RegsToPass[i].second, InFlag);
+    InFlag = Chain.getValue(1);
   }
 
-  if (NumOps != 4 && NumOps != 8 && NumOps != 16)
-    return SDValue();
-
-  // Must be a single value for VDUP.
-  SDValue Op0 = Op.getOperand(0);
-  for (unsigned i = 1; i < NumOps; ++i) {
-    SDValue OpN = Op.getOperand(i);
-    if (Op0 != OpN)
-      return SDValue();
+  // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
+  // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
+  // node so that legalize doesn't hack it.
+  if (getTargetMachine().getCodeModel() == CodeModel::Large &&
+      Subtarget->isTargetMachO()) {
+    if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+      const GlobalValue *GV = G->getGlobal();
+      bool InternalLinkage = GV->hasInternalLinkage();
+      if (InternalLinkage)
+        Callee = DAG.getTargetGlobalAddress(GV, DL, getPointerTy(), 0, 0);
+      else {
+        Callee = DAG.getTargetGlobalAddress(GV, DL, getPointerTy(), 0,
+                                            AArch64II::MO_GOT);
+        Callee = DAG.getNode(AArch64ISD::LOADgot, DL, getPointerTy(), Callee);
+      }
+    } else if (ExternalSymbolSDNode *S =
+                   dyn_cast<ExternalSymbolSDNode>(Callee)) {
+      const char *Sym = S->getSymbol();
+      Callee =
+          DAG.getTargetExternalSymbol(Sym, getPointerTy(), AArch64II::MO_GOT);
+      Callee = DAG.getNode(AArch64ISD::LOADgot, DL, getPointerTy(), Callee);
+    }
+  } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
+    const GlobalValue *GV = G->getGlobal();
+    Callee = DAG.getTargetGlobalAddress(GV, DL, getPointerTy(), 0, 0);
+  } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
+    const char *Sym = S->getSymbol();
+    Callee = DAG.getTargetExternalSymbol(Sym, getPointerTy(), 0);
   }
 
-  // Verify the value type.
-  EVT EltVT = Op0.getValueType();
-  switch (NumOps) {
-  default: llvm_unreachable("Unexpected number of operands");
-  case 4:
-    if (EltVT != MVT::v1i16 && EltVT != MVT::v1i32)
-      return SDValue();
-    break;
-  case 8:
-    if (EltVT != MVT::v1i8 && EltVT != MVT::v1i16)
-      return SDValue();
-    break;
-  case 16:
-    if (EltVT != MVT::v1i8)
-      return SDValue();
-    break;
+  // We don't usually want to end the call-sequence here because we would tidy
+  // the frame up *after* the call, however in the ABI-changing tail-call case
+  // we've carefully laid out the parameters so that when sp is reset they'll be
+  // in the correct location.
+  if (IsTailCall && !IsSibCall) {
+    Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
+                               DAG.getIntPtrConstant(0, true), InFlag, DL);
+    InFlag = Chain.getValue(1);
   }
 
-  SDLoc DL(Op);
-  EVT VT = Op.getValueType();
-  // VDUP produces better code for constants.
-  if (Op0->getOpcode() == ISD::BUILD_VECTOR)
-    return DAG.getNode(AArch64ISD::NEON_VDUP, DL, VT, Op0->getOperand(0));
-  return DAG.getNode(AArch64ISD::NEON_VDUPLANE, DL, VT, Op0,
-                     DAG.getConstant(0, MVT::i64));
-}
+  std::vector<SDValue> Ops;
+  Ops.push_back(Chain);
+  Ops.push_back(Callee);
 
-SDValue
-AArch64TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG,
-                                      bool IsSigned) const {
-  if (Op.getValueType().isVector())
-    return LowerVectorFP_TO_INT(Op, DAG, IsSigned);
-  if (Op.getOperand(0).getValueType() != MVT::f128) {
-    // It's legal except when f128 is involved
-    return Op;
+  if (IsTailCall) {
+    // Each tail call may have to adjust the stack by a different amount, so
+    // this information must travel along with the operation for eventual
+    // consumption by emitEpilogue.
+    Ops.push_back(DAG.getTargetConstant(FPDiff, MVT::i32));
   }
 
-  RTLIB::Libcall LC;
-  if (IsSigned)
-    LC = RTLIB::getFPTOSINT(Op.getOperand(0).getValueType(), Op.getValueType());
-  else
-    LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), Op.getValueType());
+  // Add argument registers to the end of the list so that they are known live
+  // into the call.
+  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
+    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
+                                  RegsToPass[i].second.getValueType()));
 
-  return LowerF128ToCall(Op, DAG, LC);
-}
+  // Add a register mask operand representing the call-preserved registers.
+  const uint32_t *Mask;
+  const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
+  const AArch64RegisterInfo *ARI =
+      static_cast<const AArch64RegisterInfo *>(TRI);
+  if (IsThisReturn) {
+    // For 'this' returns, use the X0-preserving mask if applicable
+    Mask = ARI->getThisReturnPreservedMask(CallConv);
+    if (!Mask) {
+      IsThisReturn = false;
+      Mask = ARI->getCallPreservedMask(CallConv);
+    }
+  } else
+    Mask = ARI->getCallPreservedMask(CallConv);
 
-SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{
-  MachineFunction &MF = DAG.getMachineFunction();
-  MachineFrameInfo *MFI = MF.getFrameInfo();
-  MFI->setReturnAddressIsTaken(true);
+  assert(Mask && "Missing call preserved mask for calling convention");
+  Ops.push_back(DAG.getRegisterMask(Mask));
 
-  if (verifyReturnAddressArgumentIsConstant(Op, DAG))
-    return SDValue();
+  if (InFlag.getNode())
+    Ops.push_back(InFlag);
 
-  EVT VT = Op.getValueType();
-  SDLoc dl(Op);
-  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
-  if (Depth) {
-    SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
-    SDValue Offset = DAG.getConstant(8, MVT::i64);
-    return DAG.getLoad(VT, dl, DAG.getEntryNode(),
-                       DAG.getNode(ISD::ADD, dl, VT, FrameAddr, Offset),
-                       MachinePointerInfo(), false, false, false, 0);
-  }
+  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
 
-  // Return X30, which contains the return address. Mark it an implicit live-in.
-  unsigned Reg = MF.addLiveIn(AArch64::X30, getRegClassFor(MVT::i64));
-  return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, MVT::i64);
-}
+  // If we're doing a tall call, use a TC_RETURN here rather than an
+  // actual call instruction.
+  if (IsTailCall)
+    return DAG.getNode(AArch64ISD::TC_RETURN, DL, NodeTys, Ops);
 
+  // Returns a chain and a flag for retval copy to use.
+  Chain = DAG.getNode(AArch64ISD::CALL, DL, NodeTys, Ops);
+  InFlag = Chain.getValue(1);
 
-SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG)
-                                              const {
-  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
-  MFI->setFrameAddressIsTaken(true);
+  uint64_t CalleePopBytes = DoesCalleeRestoreStack(CallConv, TailCallOpt)
+                                ? RoundUpToAlignment(NumBytes, 16)
+                                : 0;
 
-  EVT VT = Op.getValueType();
-  SDLoc dl(Op);
-  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
-  unsigned FrameReg = AArch64::X29;
-  SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
-  while (Depth--)
-    FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
-                            MachinePointerInfo(),
-                            false, false, false, 0);
-  return FrameAddr;
+  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
+                             DAG.getIntPtrConstant(CalleePopBytes, true),
+                             InFlag, DL);
+  if (!Ins.empty())
+    InFlag = Chain.getValue(1);
+
+  // Handle result values, copying them out of physregs into vregs that we
+  // return.
+  return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG,
+                         InVals, IsThisReturn,
+                         IsThisReturn ? OutVals[0] : SDValue());
+}
+
+bool AArch64TargetLowering::CanLowerReturn(
+    CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
+    const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
+  CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS
+                          ? RetCC_AArch64_WebKit_JS
+                          : RetCC_AArch64_AAPCS;
+  SmallVector<CCValAssign, 16> RVLocs;
+  CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), RVLocs, Context);
+  return CCInfo.CheckReturn(Outs, RetCC);
 }
 
 SDValue
-AArch64TargetLowering::LowerGlobalAddressELFLarge(SDValue Op,
-                                                  SelectionDAG &DAG) const {
-  assert(getTargetMachine().getCodeModel() == CodeModel::Large);
-  assert(getTargetMachine().getRelocationModel() == Reloc::Static);
+AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
+                                   bool isVarArg,
+                                   const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                   const SmallVectorImpl<SDValue> &OutVals,
+                                   SDLoc DL, SelectionDAG &DAG) const {
+  CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS
+                          ? RetCC_AArch64_WebKit_JS
+                          : RetCC_AArch64_AAPCS;
+  SmallVector<CCValAssign, 16> RVLocs;
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
+                 getTargetMachine(), RVLocs, *DAG.getContext());
+  CCInfo.AnalyzeReturn(Outs, RetCC);
 
-  EVT PtrVT = getPointerTy();
-  SDLoc dl(Op);
-  const GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
-  const GlobalValue *GV = GN->getGlobal();
+  // Copy the result values into the output registers.
+  SDValue Flag;
+  SmallVector<SDValue, 4> RetOps(1, Chain);
+  for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size();
+       ++i, ++realRVLocIdx) {
+    CCValAssign &VA = RVLocs[i];
+    assert(VA.isRegLoc() && "Can only return in registers!");
+    SDValue Arg = OutVals[realRVLocIdx];
+
+    switch (VA.getLocInfo()) {
+    default:
+      llvm_unreachable("Unknown loc info!");
+    case CCValAssign::Full:
+      if (Outs[i].ArgVT == MVT::i1) {
+        // AAPCS requires i1 to be zero-extended to i8 by the producer of the
+        // value. This is strictly redundant on Darwin (which uses "zeroext
+        // i1"), but will be optimised out before ISel.
+        Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
+        Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
+      }
+      break;
+    case CCValAssign::BCvt:
+      Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
+      break;
+    }
+
+    Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag);
+    Flag = Chain.getValue(1);
+    RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
+  }
 
-  SDValue GlobalAddr = DAG.getNode(
-      AArch64ISD::WrapperLarge, dl, PtrVT,
-      DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, AArch64II::MO_ABS_G3),
-      DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, AArch64II::MO_ABS_G2_NC),
-      DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, AArch64II::MO_ABS_G1_NC),
-      DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, AArch64II::MO_ABS_G0_NC));
+  RetOps[0] = Chain; // Update chain.
 
-  if (GN->getOffset() != 0)
-    return DAG.getNode(ISD::ADD, dl, PtrVT, GlobalAddr,
-                       DAG.getConstant(GN->getOffset(), PtrVT));
+  // Add the flag if we have it.
+  if (Flag.getNode())
+    RetOps.push_back(Flag);
 
-  return GlobalAddr;
+  return DAG.getNode(AArch64ISD::RET_FLAG, DL, MVT::Other, RetOps);
 }
 
-SDValue
-AArch64TargetLowering::LowerGlobalAddressELFSmall(SDValue Op,
-                                                  SelectionDAG &DAG) const {
-  assert(getTargetMachine().getCodeModel() == CodeModel::Small);
+//===----------------------------------------------------------------------===//
+//  Other Lowering Code
+//===----------------------------------------------------------------------===//
 
+SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
+                                                  SelectionDAG &DAG) const {
   EVT PtrVT = getPointerTy();
-  SDLoc dl(Op);
-  const GlobalAddressSDNode *GN = cast<GlobalAddressSDNode>(Op);
-  const GlobalValue *GV = GN->getGlobal();
-  unsigned Alignment = GV->getAlignment();
-  Reloc::Model RelocM = getTargetMachine().getRelocationModel();
-  if (GV->isWeakForLinker() && GV->isDeclaration() && RelocM == Reloc::Static) {
-    // Weak undefined symbols can't use ADRP/ADD pair since they should evaluate
-    // to zero when they remain undefined. In PIC mode the GOT can take care of
-    // this, but in absolute mode we use a constant pool load.
-    SDValue PoolAddr;
-    PoolAddr = DAG.getNode(AArch64ISD::WrapperSmall, dl, PtrVT,
-                           DAG.getTargetConstantPool(GV, PtrVT, 0, 0,
-                                                     AArch64II::MO_NO_FLAG),
-                           DAG.getTargetConstantPool(GV, PtrVT, 0, 0,
-                                                     AArch64II::MO_LO12),
-                           DAG.getConstant(8, MVT::i32));
-    SDValue GlobalAddr = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), PoolAddr,
-                                     MachinePointerInfo::getConstantPool(),
-                                     /*isVolatile=*/ false,
-                                     /*isNonTemporal=*/ true,
-                                     /*isInvariant=*/ true, 8);
-    if (GN->getOffset() != 0)
-      return DAG.getNode(ISD::ADD, dl, PtrVT, GlobalAddr,
-                         DAG.getConstant(GN->getOffset(), PtrVT));
-
-    return GlobalAddr;
-  }
-
-  if (Alignment == 0) {
-    const PointerType *GVPtrTy = cast<PointerType>(GV->getType());
-    if (GVPtrTy->getElementType()->isSized()) {
-      Alignment
-        = getDataLayout()->getABITypeAlignment(GVPtrTy->getElementType());
-    } else {
-      // Be conservative if we can't guess, not that it really matters:
-      // functions and labels aren't valid for loads, and the methods used to
-      // actually calculate an address work with any alignment.
-      Alignment = 1;
-    }
+  SDLoc DL(Op);
+  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
+  unsigned char OpFlags =
+      Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
+
+  assert(cast<GlobalAddressSDNode>(Op)->getOffset() == 0 &&
+         "unexpected offset in global node");
+
+  // This also catched the large code model case for Darwin.
+  if ((OpFlags & AArch64II::MO_GOT) != 0) {
+    SDValue GotAddr = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags);
+    // FIXME: Once remat is capable of dealing with instructions with register
+    // operands, expand this into two nodes instead of using a wrapper node.
+    return DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, GotAddr);
   }
 
-  unsigned char HiFixup, LoFixup;
-  bool UseGOT = getSubtarget()->GVIsIndirectSymbol(GV, RelocM);
-
-  if (UseGOT) {
-    HiFixup = AArch64II::MO_GOT;
-    LoFixup = AArch64II::MO_GOT_LO12;
-    Alignment = 8;
+  if (getTargetMachine().getCodeModel() == CodeModel::Large) {
+    const unsigned char MO_NC = AArch64II::MO_NC;
+    return DAG.getNode(
+        AArch64ISD::WrapperLarge, DL, PtrVT,
+        DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G3),
+        DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G2 | MO_NC),
+        DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G1 | MO_NC),
+        DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_G0 | MO_NC));
   } else {
-    HiFixup = AArch64II::MO_NO_FLAG;
-    LoFixup = AArch64II::MO_LO12;
+    // Use ADRP/ADD or ADRP/LDR for everything else: the small model on ELF and
+    // the only correct model on Darwin.
+    SDValue Hi = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0,
+                                            OpFlags | AArch64II::MO_PAGE);
+    unsigned char LoFlags = OpFlags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC;
+    SDValue Lo = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, LoFlags);
+
+    SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi);
+    return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo);
   }
+}
 
-  // AArch64's small model demands the following sequence:
-  // ADRP x0, somewhere
-  // ADD x0, x0, #:lo12:somewhere ; (or LDR directly).
-  SDValue GlobalRef = DAG.getNode(AArch64ISD::WrapperSmall, dl, PtrVT,
-                                  DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
-                                                             HiFixup),
-                                  DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
-                                                             LoFixup),
-                                  DAG.getConstant(Alignment, MVT::i32));
+/// \brief Convert a TLS address reference into the correct sequence of loads
+/// and calls to compute the variable's address (for Darwin, currently) and
+/// return an SDValue containing the final node.
 
-  if (UseGOT) {
-    GlobalRef = DAG.getNode(AArch64ISD::GOTLoad, dl, PtrVT, DAG.getEntryNode(),
-                            GlobalRef);
-  }
+/// Darwin only has one TLS scheme which must be capable of dealing with the
+/// fully general situation, in the worst case. This means:
+///     + "extern __thread" declaration.
+///     + Defined in a possibly unknown dynamic library.
+///
+/// The general system is that each __thread variable has a [3 x i64] descriptor
+/// which contains information used by the runtime to calculate the address. The
+/// only part of this the compiler needs to know about is the first xword, which
+/// contains a function pointer that must be called with the address of the
+/// entire descriptor in "x0".
+///
+/// Since this descriptor may be in a different unit, in general even the
+/// descriptor must be accessed via an indirect load. The "ideal" code sequence
+/// is:
+///     adrp x0, _var@TLVPPAGE
+///     ldr x0, [x0, _var@TLVPPAGEOFF]   ; x0 now contains address of descriptor
+///     ldr x1, [x0]                     ; x1 contains 1st entry of descriptor,
+///                                      ; the function pointer
+///     blr x1                           ; Uses descriptor address in x0
+///     ; Address of _var is now in x0.
+///
+/// If the address of _var's descriptor *is* known to the linker, then it can
+/// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for
+/// a slight efficiency gain.
+SDValue
+AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op,
+                                                   SelectionDAG &DAG) const {
+  assert(Subtarget->isTargetDarwin() && "TLS only supported on Darwin");
 
-  if (GN->getOffset() != 0)
-    return DAG.getNode(ISD::ADD, dl, PtrVT, GlobalRef,
-                       DAG.getConstant(GN->getOffset(), PtrVT));
+  SDLoc DL(Op);
+  MVT PtrVT = getPointerTy();
+  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
 
-  return GlobalRef;
-}
+  SDValue TLVPAddr =
+      DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
+  SDValue DescAddr = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TLVPAddr);
 
-SDValue
-AArch64TargetLowering::LowerGlobalAddressELF(SDValue Op,
-                                             SelectionDAG &DAG) const {
-  // TableGen doesn't have easy access to the CodeModel or RelocationModel, so
-  // we make those distinctions here.
-
-  switch (getTargetMachine().getCodeModel()) {
-  case CodeModel::Small:
-    return LowerGlobalAddressELFSmall(Op, DAG);
-  case CodeModel::Large:
-    return LowerGlobalAddressELFLarge(Op, DAG);
-  default:
-    llvm_unreachable("Only small and large code models supported now");
-  }
-}
+  // The first entry in the descriptor is a function pointer that we must call
+  // to obtain the address of the variable.
+  SDValue Chain = DAG.getEntryNode();
+  SDValue FuncTLVGet =
+      DAG.getLoad(MVT::i64, DL, Chain, DescAddr, MachinePointerInfo::getGOT(),
+                  false, true, true, 8);
+  Chain = FuncTLVGet.getValue(1);
 
-SDValue
-AArch64TargetLowering::LowerConstantPool(SDValue Op,
-                                         SelectionDAG &DAG) const {
-  SDLoc DL(Op);
-  EVT PtrVT = getPointerTy();
-  ConstantPoolSDNode *CN = cast<ConstantPoolSDNode>(Op);
-  const Constant *C = CN->getConstVal();
-
-  switch(getTargetMachine().getCodeModel()) {
-  case CodeModel::Small:
-    // The most efficient code is PC-relative anyway for the small memory model,
-    // so we don't need to worry about relocation model.
-    return DAG.getNode(AArch64ISD::WrapperSmall, DL, PtrVT,
-                       DAG.getTargetConstantPool(C, PtrVT, 0, 0,
-                                                 AArch64II::MO_NO_FLAG),
-                       DAG.getTargetConstantPool(C, PtrVT, 0, 0,
-                                                 AArch64II::MO_LO12),
-                       DAG.getConstant(CN->getAlignment(), MVT::i32));
-  case CodeModel::Large:
-    return DAG.getNode(
-      AArch64ISD::WrapperLarge, DL, PtrVT,
-      DAG.getTargetConstantPool(C, PtrVT, 0, 0, AArch64II::MO_ABS_G3),
-      DAG.getTargetConstantPool(C, PtrVT, 0, 0, AArch64II::MO_ABS_G2_NC),
-      DAG.getTargetConstantPool(C, PtrVT, 0, 0, AArch64II::MO_ABS_G1_NC),
-      DAG.getTargetConstantPool(C, PtrVT, 0, 0, AArch64II::MO_ABS_G0_NC));
-  default:
-    llvm_unreachable("Only small and large code models supported now");
-  }
+  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+  MFI->setAdjustsStack(true);
+
+  // TLS calls preserve all registers except those that absolutely must be
+  // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
+  // silly).
+  const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
+  const AArch64RegisterInfo *ARI =
+      static_cast<const AArch64RegisterInfo *>(TRI);
+  const uint32_t *Mask = ARI->getTLSCallPreservedMask();
+
+  // Finally, we can make the call. This is just a degenerate version of a
+  // normal AArch64 call node: x0 takes the address of the descriptor, and
+  // returns the address of the variable in this thread.
+  Chain = DAG.getCopyToReg(Chain, DL, AArch64::X0, DescAddr, SDValue());
+  Chain =
+      DAG.getNode(AArch64ISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
+                  Chain, FuncTLVGet, DAG.getRegister(AArch64::X0, MVT::i64),
+                  DAG.getRegisterMask(Mask), Chain.getValue(1));
+  return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Chain.getValue(1));
 }
 
-SDValue AArch64TargetLowering::LowerTLSDescCall(SDValue SymAddr,
-                                                SDValue DescAddr,
-                                                SDLoc DL,
-                                                SelectionDAG &DAG) const {
+/// When accessing thread-local variables under either the general-dynamic or
+/// local-dynamic system, we make a "TLS-descriptor" call. The variable will
+/// have a descriptor, accessible via a PC-relative ADRP, and whose first entry
+/// is a function pointer to carry out the resolution. This function takes the
+/// address of the descriptor in X0 and returns the TPIDR_EL0 offset in X0. All
+/// other registers (except LR, NZCV) are preserved.
+///
+/// Thus, the ideal call sequence on AArch64 is:
+///
+///     adrp x0, :tlsdesc:thread_var
+///     ldr x8, [x0, :tlsdesc_lo12:thread_var]
+///     add x0, x0, :tlsdesc_lo12:thread_var
+///     .tlsdesccall thread_var
+///     blr x8
+///     (TPIDR_EL0 offset now in x0).
+///
+/// The ".tlsdesccall" directive instructs the assembler to insert a particular
+/// relocation to help the linker relax this sequence if it turns out to be too
+/// conservative.
+///
+/// FIXME: we currently produce an extra, duplicated, ADRP instruction, but this
+/// is harmless.
+SDValue AArch64TargetLowering::LowerELFTLSDescCall(SDValue SymAddr,
+                                                   SDValue DescAddr, SDLoc DL,
+                                                   SelectionDAG &DAG) const {
   EVT PtrVT = getPointerTy();
 
   // The function we need to call is simply the first entry in the GOT for this
   // descriptor, load it in preparation.
-  SDValue Func, Chain;
-  Func = DAG.getNode(AArch64ISD::GOTLoad, DL, PtrVT, DAG.getEntryNode(),
-                     DescAddr);
+  SDValue Func = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, SymAddr);
+
+  // TLS calls preserve all registers except those that absolutely must be
+  // trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
+  // silly).
+  const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
+  const AArch64RegisterInfo *ARI =
+      static_cast<const AArch64RegisterInfo *>(TRI);
+  const uint32_t *Mask = ARI->getTLSCallPreservedMask();
 
   // The function takes only one argument: the address of the descriptor itself
   // in X0.
-  SDValue Glue;
+  SDValue Glue, Chain;
   Chain = DAG.getCopyToReg(DAG.getEntryNode(), DL, AArch64::X0, DescAddr, Glue);
   Glue = Chain.getValue(1);
 
-  // Finally, there's a special calling-convention which means that the lookup
-  // must preserve all registers (except X0, obviously).
-  const TargetRegisterInfo *TRI  = getTargetMachine().getRegisterInfo();
-  const AArch64RegisterInfo *A64RI
-    = static_cast<const AArch64RegisterInfo *>(TRI);
-  const uint32_t *Mask = A64RI->getTLSDescCallPreservedMask();
-
   // We're now ready to populate the argument list, as with a normal call:
-  std::vector<SDValue> Ops;
+  SmallVector<SDValue, 6> Ops;
   Ops.push_back(Chain);
   Ops.push_back(Func);
   Ops.push_back(SymAddr);
@@ -2586,22 +2705,18 @@ SDValue AArch64TargetLowering::LowerTLSDescCall(SDValue SymAddr,
   Ops.push_back(Glue);
 
   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
-  Chain = DAG.getNode(AArch64ISD::TLSDESCCALL, DL, NodeTys, &Ops[0],
-                      Ops.size());
+  Chain = DAG.getNode(AArch64ISD::TLSDESC_CALL, DL, NodeTys, Ops);
   Glue = Chain.getValue(1);
 
-  // After the call, the offset from TPIDR_EL0 is in X0, copy it out and pass it
-  // back to the generic handling code.
   return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Glue);
 }
 
 SDValue
-AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op,
-                                             SelectionDAG &DAG) const {
-  assert(getSubtarget()->isTargetELF() &&
-         "TLS not implemented for non-ELF targets");
-  assert(getTargetMachine().getCodeModel() == CodeModel::Small
-         && "TLS only supported in small memory model");
+AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op,
+                                                SelectionDAG &DAG) const {
+  assert(Subtarget->isTargetELF() && "This function expects an ELF target");
+  assert(getTargetMachine().getCodeModel() == CodeModel::Small &&
+         "ELF TLS only supported in small memory model");
   const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
 
   TLSModel::Model Model = getTargetMachine().getTLSModel(GA->getGlobal());
@@ -2613,39 +2728,22 @@ AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op,
 
   SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT);
 
-  if (Model == TLSModel::InitialExec) {
-    TPOff = DAG.getNode(AArch64ISD::WrapperSmall, DL, PtrVT,
-                        DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0,
-                                                   AArch64II::MO_GOTTPREL),
-                        DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0,
-                                                   AArch64II::MO_GOTTPREL_LO12),
-                        DAG.getConstant(8, MVT::i32));
-    TPOff = DAG.getNode(AArch64ISD::GOTLoad, DL, PtrVT, DAG.getEntryNode(),
-                        TPOff);
-  } else if (Model == TLSModel::LocalExec) {
-    SDValue HiVar = DAG.getTargetGlobalAddress(GV, DL, MVT::i64, 0,
-                                               AArch64II::MO_TPREL_G1);
-    SDValue LoVar = DAG.getTargetGlobalAddress(GV, DL, MVT::i64, 0,
-                                               AArch64II::MO_TPREL_G0_NC);
-
-    TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZxii, DL, PtrVT, HiVar,
-                                       DAG.getTargetConstant(1, MVT::i32)), 0);
-    TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKxii, DL, PtrVT,
-                                       TPOff, LoVar,
-                                       DAG.getTargetConstant(0, MVT::i32)), 0);
-  } else if (Model == TLSModel::GeneralDynamic) {
-    // Accesses used in this sequence go via the TLS descriptor which lives in
-    // the GOT. Prepare an address we can use to handle this.
-    SDValue HiDesc = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0,
-                                                AArch64II::MO_TLSDESC);
-    SDValue LoDesc = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0,
-                                                AArch64II::MO_TLSDESC_LO12);
-    SDValue DescAddr = DAG.getNode(AArch64ISD::WrapperSmall, DL, PtrVT,
-                                   HiDesc, LoDesc,
-                                   DAG.getConstant(8, MVT::i32));
-    SDValue SymAddr = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0);
-
-    TPOff = LowerTLSDescCall(SymAddr, DescAddr, DL, DAG);
+  if (Model == TLSModel::LocalExec) {
+    SDValue HiVar = DAG.getTargetGlobalAddress(
+        GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G1);
+    SDValue LoVar = DAG.getTargetGlobalAddress(
+        GV, DL, PtrVT, 0,
+        AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC);
+
+    TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
+                                       DAG.getTargetConstant(16, MVT::i32)),
+                    0);
+    TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
+                                       DAG.getTargetConstant(0, MVT::i32)),
+                    0);
+  } else if (Model == TLSModel::InitialExec) {
+    TPOff = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
+    TPOff = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TPOff);
   } else if (Model == TLSModel::LocalDynamic) {
     // Local-dynamic accesses proceed in two phases. A general-dynamic TLS
     // descriptor call against the special symbol _TLS_MODULE_BASE_ to calculate
@@ -2653,367 +2751,354 @@ AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op,
     // calculation.
 
     // These accesses will need deduplicating if there's more than one.
-    AArch64MachineFunctionInfo* MFI = DAG.getMachineFunction()
-      .getInfo<AArch64MachineFunctionInfo>();
+    AArch64FunctionInfo *MFI =
+        DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
     MFI->incNumLocalDynamicTLSAccesses();
 
-
-    // Get the location of _TLS_MODULE_BASE_:
-    SDValue HiDesc = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT,
-                                                AArch64II::MO_TLSDESC);
-    SDValue LoDesc = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT,
-                                                AArch64II::MO_TLSDESC_LO12);
-    SDValue DescAddr = DAG.getNode(AArch64ISD::WrapperSmall, DL, PtrVT,
-                                   HiDesc, LoDesc,
-                                   DAG.getConstant(8, MVT::i32));
-    SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT);
-
-    ThreadBase = LowerTLSDescCall(SymAddr, DescAddr, DL, DAG);
-
-    // Get the variable's offset from _TLS_MODULE_BASE_
-    SDValue HiVar = DAG.getTargetGlobalAddress(GV, DL, MVT::i64, 0,
-                                               AArch64II::MO_DTPREL_G1);
-    SDValue LoVar = DAG.getTargetGlobalAddress(GV, DL, MVT::i64, 0,
-                                               AArch64II::MO_DTPREL_G0_NC);
-
-    TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZxii, DL, PtrVT, HiVar,
-                                       DAG.getTargetConstant(0, MVT::i32)), 0);
-    TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKxii, DL, PtrVT,
-                                       TPOff, LoVar,
-                                       DAG.getTargetConstant(0, MVT::i32)), 0);
+    // Accesses used in this sequence go via the TLS descriptor which lives in
+    // the GOT. Prepare an address we can use to handle this.
+    SDValue HiDesc = DAG.getTargetExternalSymbol(
+        "_TLS_MODULE_BASE_", PtrVT, AArch64II::MO_TLS | AArch64II::MO_PAGE);
+    SDValue LoDesc = DAG.getTargetExternalSymbol(
+        "_TLS_MODULE_BASE_", PtrVT,
+        AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
+
+    // First argument to the descriptor call is the address of the descriptor
+    // itself.
+    SDValue DescAddr = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, HiDesc);
+    DescAddr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, DescAddr, LoDesc);
+
+    // The call needs a relocation too for linker relaxation. It doesn't make
+    // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
+    // the address.
+    SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT,
+                                                  AArch64II::MO_TLS);
+
+    // Now we can calculate the offset from TPIDR_EL0 to this module's
+    // thread-local area.
+    TPOff = LowerELFTLSDescCall(SymAddr, DescAddr, DL, DAG);
+
+    // Now use :dtprel_whatever: operations to calculate this variable's offset
+    // in its thread-storage area.
+    SDValue HiVar = DAG.getTargetGlobalAddress(
+        GV, DL, MVT::i64, 0, AArch64II::MO_TLS | AArch64II::MO_G1);
+    SDValue LoVar = DAG.getTargetGlobalAddress(
+        GV, DL, MVT::i64, 0,
+        AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC);
+
+    SDValue DTPOff =
+        SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
+                                   DAG.getTargetConstant(16, MVT::i32)),
+                0);
+    DTPOff =
+        SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, DTPOff, LoVar,
+                                   DAG.getTargetConstant(0, MVT::i32)),
+                0);
+
+    TPOff = DAG.getNode(ISD::ADD, DL, PtrVT, TPOff, DTPOff);
+  } else if (Model == TLSModel::GeneralDynamic) {
+    // Accesses used in this sequence go via the TLS descriptor which lives in
+    // the GOT. Prepare an address we can use to handle this.
+    SDValue HiDesc = DAG.getTargetGlobalAddress(
+        GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_PAGE);
+    SDValue LoDesc = DAG.getTargetGlobalAddress(
+        GV, DL, PtrVT, 0,
+        AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
+
+    // First argument to the descriptor call is the address of the descriptor
+    // itself.
+    SDValue DescAddr = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, HiDesc);
+    DescAddr = DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, DescAddr, LoDesc);
+
+    // The call needs a relocation too for linker relaxation. It doesn't make
+    // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
+    // the address.
+    SDValue SymAddr =
+        DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
+
+    // Finally we can make a call to calculate the offset from tpidr_el0.
+    TPOff = LowerELFTLSDescCall(SymAddr, DescAddr, DL, DAG);
   } else
-      llvm_unreachable("Unsupported TLS access model");
-
+    llvm_unreachable("Unsupported ELF TLS access model");
 
   return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
 }
 
-static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG,
-                                    bool IsSigned) {
+SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op,
+                                                     SelectionDAG &DAG) const {
+  if (Subtarget->isTargetDarwin())
+    return LowerDarwinGlobalTLSAddress(Op, DAG);
+  else if (Subtarget->isTargetELF())
+    return LowerELFGlobalTLSAddress(Op, DAG);
+
+  llvm_unreachable("Unexpected platform trying to use TLS");
+}
+SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
+  SDValue Chain = Op.getOperand(0);
+  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
+  SDValue LHS = Op.getOperand(2);
+  SDValue RHS = Op.getOperand(3);
+  SDValue Dest = Op.getOperand(4);
   SDLoc dl(Op);
-  EVT VT = Op.getValueType();
-  SDValue Vec = Op.getOperand(0);
-  unsigned Opc = IsSigned ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
 
-  if (VT.getVectorNumElements() == 1) {
-    assert(VT == MVT::v1f64 && "Unexpected vector type!");
-    if (VT.getSizeInBits() == Vec.getValueSizeInBits())
-      return Op;
-    return DAG.UnrollVectorOp(Op.getNode());
-  }
+  // Handle f128 first, since lowering it will result in comparing the return
+  // value of a libcall against zero, which is just what the rest of LowerBR_CC
+  // is expecting to deal with.
+  if (LHS.getValueType() == MVT::f128) {
+    softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl);
 
-  if (VT.getSizeInBits() < Vec.getValueSizeInBits()) {
-    assert(Vec.getValueType() == MVT::v2i64 && VT == MVT::v2f32 &&
-           "Unexpected vector type!");
-    Vec = DAG.getNode(Opc, dl, MVT::v2f64, Vec);
-    return DAG.getNode(ISD::FP_ROUND, dl, VT, Vec, DAG.getIntPtrConstant(0));
-  } else if (VT.getSizeInBits() > Vec.getValueSizeInBits()) {
-    unsigned CastOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
-    EVT CastVT = EVT::getIntegerVT(*DAG.getContext(),
-                                   VT.getVectorElementType().getSizeInBits());
-    CastVT =
-        EVT::getVectorVT(*DAG.getContext(), CastVT, VT.getVectorNumElements());
-    Vec = DAG.getNode(CastOpc, dl, CastVT, Vec);
+    // If softenSetCCOperands returned a scalar, we need to compare the result
+    // against zero to select between true and false values.
+    if (!RHS.getNode()) {
+      RHS = DAG.getConstant(0, LHS.getValueType());
+      CC = ISD::SETNE;
+    }
   }
 
-  return DAG.getNode(Opc, dl, VT, Vec);
-}
+  // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
+  // instruction.
+  unsigned Opc = LHS.getOpcode();
+  if (LHS.getResNo() == 1 && isa<ConstantSDNode>(RHS) &&
+      cast<ConstantSDNode>(RHS)->isOne() &&
+      (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
+       Opc == ISD::USUBO || Opc == ISD::SMULO || Opc == ISD::UMULO)) {
+    assert((CC == ISD::SETEQ || CC == ISD::SETNE) &&
+           "Unexpected condition code.");
+    // Only lower legal XALUO ops.
+    if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0)))
+      return SDValue();
 
-SDValue
-AArch64TargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG,
-                                      bool IsSigned) const {
-  if (Op.getValueType().isVector())
-    return LowerVectorINT_TO_FP(Op, DAG, IsSigned);
-  if (Op.getValueType() != MVT::f128) {
-    // Legal for everything except f128.
-    return Op;
-  }
+    // The actual operation with overflow check.
+    AArch64CC::CondCode OFCC;
+    SDValue Value, Overflow;
+    std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, LHS.getValue(0), DAG);
 
-  RTLIB::Libcall LC;
-  if (IsSigned)
-    LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType());
-  else
-    LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType());
+    if (CC == ISD::SETNE)
+      OFCC = getInvertedCondCode(OFCC);
+    SDValue CCVal = DAG.getConstant(OFCC, MVT::i32);
 
-  return LowerF128ToCall(Op, DAG, LC);
-}
+    return DAG.getNode(AArch64ISD::BRCOND, SDLoc(LHS), MVT::Other, Chain, Dest,
+                       CCVal, Overflow);
+  }
 
+  if (LHS.getValueType().isInteger()) {
+    assert((LHS.getValueType() == RHS.getValueType()) &&
+           (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
+
+    // If the RHS of the comparison is zero, we can potentially fold this
+    // to a specialized branch.
+    const ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
+    if (RHSC && RHSC->getZExtValue() == 0) {
+      if (CC == ISD::SETEQ) {
+        // See if we can use a TBZ to fold in an AND as well.
+        // TBZ has a smaller branch displacement than CBZ.  If the offset is
+        // out of bounds, a late MI-layer pass rewrites branches.
+        // 403.gcc is an example that hits this case.
+        if (LHS.getOpcode() == ISD::AND &&
+            isa<ConstantSDNode>(LHS.getOperand(1)) &&
+            isPowerOf2_64(LHS.getConstantOperandVal(1))) {
+          SDValue Test = LHS.getOperand(0);
+          uint64_t Mask = LHS.getConstantOperandVal(1);
+
+          // TBZ only operates on i64's, but the ext should be free.
+          if (Test.getValueType() == MVT::i32)
+            Test = DAG.getAnyExtOrTrunc(Test, dl, MVT::i64);
+
+          return DAG.getNode(AArch64ISD::TBZ, dl, MVT::Other, Chain, Test,
+                             DAG.getConstant(Log2_64(Mask), MVT::i64), Dest);
+        }
 
-SDValue
-AArch64TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
-  JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
-  SDLoc dl(JT);
-  EVT PtrVT = getPointerTy();
+        return DAG.getNode(AArch64ISD::CBZ, dl, MVT::Other, Chain, LHS, Dest);
+      } else if (CC == ISD::SETNE) {
+        // See if we can use a TBZ to fold in an AND as well.
+        // TBZ has a smaller branch displacement than CBZ.  If the offset is
+        // out of bounds, a late MI-layer pass rewrites branches.
+        // 403.gcc is an example that hits this case.
+        if (LHS.getOpcode() == ISD::AND &&
+            isa<ConstantSDNode>(LHS.getOperand(1)) &&
+            isPowerOf2_64(LHS.getConstantOperandVal(1))) {
+          SDValue Test = LHS.getOperand(0);
+          uint64_t Mask = LHS.getConstantOperandVal(1);
+
+          // TBNZ only operates on i64's, but the ext should be free.
+          if (Test.getValueType() == MVT::i32)
+            Test = DAG.getAnyExtOrTrunc(Test, dl, MVT::i64);
+
+          return DAG.getNode(AArch64ISD::TBNZ, dl, MVT::Other, Chain, Test,
+                             DAG.getConstant(Log2_64(Mask), MVT::i64), Dest);
+        }
 
-  // When compiling PIC, jump tables get put in the code section so a static
-  // relocation-style is acceptable for both cases.
-  switch (getTargetMachine().getCodeModel()) {
-  case CodeModel::Small:
-    return DAG.getNode(AArch64ISD::WrapperSmall, dl, PtrVT,
-                       DAG.getTargetJumpTable(JT->getIndex(), PtrVT),
-                       DAG.getTargetJumpTable(JT->getIndex(), PtrVT,
-                                              AArch64II::MO_LO12),
-                       DAG.getConstant(1, MVT::i32));
-  case CodeModel::Large:
-    return DAG.getNode(
-      AArch64ISD::WrapperLarge, dl, PtrVT,
-      DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_ABS_G3),
-      DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_ABS_G2_NC),
-      DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_ABS_G1_NC),
-      DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_ABS_G0_NC));
-  default:
-    llvm_unreachable("Only small and large code models supported now");
-  }
-}
+        return DAG.getNode(AArch64ISD::CBNZ, dl, MVT::Other, Chain, LHS, Dest);
+      }
+    }
 
-// (SELECT testbit, iftrue, iffalse)
-SDValue
-AArch64TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
-  SDLoc dl(Op);
-  SDValue TheBit = Op.getOperand(0);
-  SDValue IfTrue = Op.getOperand(1);
-  SDValue IfFalse = Op.getOperand(2);
+    SDValue CCVal;
+    SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
+    return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
+                       Cmp);
+  }
 
-  // AArch64 BooleanContents is the default UndefinedBooleanContent, which means
-  // that as the consumer we are responsible for ignoring rubbish in higher
-  // bits.
-  TheBit = DAG.getNode(ISD::AND, dl, MVT::i32, TheBit,
-                       DAG.getConstant(1, MVT::i32));
-  SDValue A64CMP = DAG.getNode(AArch64ISD::SETCC, dl, MVT::i32, TheBit,
-                               DAG.getConstant(0, TheBit.getValueType()),
-                               DAG.getCondCode(ISD::SETNE));
+  assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
+
+  // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
+  // clean.  Some of them require two branches to implement.
+  SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
+  AArch64CC::CondCode CC1, CC2;
+  changeFPCCToAArch64CC(CC, CC1, CC2);
+  SDValue CC1Val = DAG.getConstant(CC1, MVT::i32);
+  SDValue BR1 =
+      DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CC1Val, Cmp);
+  if (CC2 != AArch64CC::AL) {
+    SDValue CC2Val = DAG.getConstant(CC2, MVT::i32);
+    return DAG.getNode(AArch64ISD::BRCOND, dl, MVT::Other, BR1, Dest, CC2Val,
+                       Cmp);
+  }
 
-  return DAG.getNode(AArch64ISD::SELECT_CC, dl, Op.getValueType(),
-                     A64CMP, IfTrue, IfFalse,
-                     DAG.getConstant(A64CC::NE, MVT::i32));
+  return BR1;
 }
 
-static SDValue LowerVectorSETCC(SDValue Op, SelectionDAG &DAG) {
-  SDLoc DL(Op);
-  SDValue LHS = Op.getOperand(0);
-  SDValue RHS = Op.getOperand(1);
-  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
+SDValue AArch64TargetLowering::LowerFCOPYSIGN(SDValue Op,
+                                              SelectionDAG &DAG) const {
   EVT VT = Op.getValueType();
-  bool Invert = false;
-  SDValue Op0, Op1;
-  unsigned Opcode;
+  SDLoc DL(Op);
 
-  if (LHS.getValueType().isInteger()) {
+  SDValue In1 = Op.getOperand(0);
+  SDValue In2 = Op.getOperand(1);
+  EVT SrcVT = In2.getValueType();
+  if (SrcVT != VT) {
+    if (SrcVT == MVT::f32 && VT == MVT::f64)
+      In2 = DAG.getNode(ISD::FP_EXTEND, DL, VT, In2);
+    else if (SrcVT == MVT::f64 && VT == MVT::f32)
+      In2 = DAG.getNode(ISD::FP_ROUND, DL, VT, In2, DAG.getIntPtrConstant(0));
+    else
+      // FIXME: Src type is different, bail out for now. Can VT really be a
+      // vector type?
+      return SDValue();
+  }
 
-    // Attempt to use Vector Integer Compare Mask Test instruction.
-    // TST = icmp ne (and (op0, op1), zero).
-    if (CC == ISD::SETNE) {
-      if (((LHS.getOpcode() == ISD::AND) &&
-           ISD::isBuildVectorAllZeros(RHS.getNode())) ||
-          ((RHS.getOpcode() == ISD::AND) &&
-           ISD::isBuildVectorAllZeros(LHS.getNode()))) {
-
-        SDValue AndOp = (LHS.getOpcode() == ISD::AND) ? LHS : RHS;
-        SDValue NewLHS = DAG.getNode(ISD::BITCAST, DL, VT, AndOp.getOperand(0));
-        SDValue NewRHS = DAG.getNode(ISD::BITCAST, DL, VT, AndOp.getOperand(1));
-        return DAG.getNode(AArch64ISD::NEON_TST, DL, VT, NewLHS, NewRHS);
-      }
+  EVT VecVT;
+  EVT EltVT;
+  SDValue EltMask, VecVal1, VecVal2;
+  if (VT == MVT::f32 || VT == MVT::v2f32 || VT == MVT::v4f32) {
+    EltVT = MVT::i32;
+    VecVT = MVT::v4i32;
+    EltMask = DAG.getConstant(0x80000000ULL, EltVT);
+
+    if (!VT.isVector()) {
+      VecVal1 = DAG.getTargetInsertSubreg(AArch64::ssub, DL, VecVT,
+                                          DAG.getUNDEF(VecVT), In1);
+      VecVal2 = DAG.getTargetInsertSubreg(AArch64::ssub, DL, VecVT,
+                                          DAG.getUNDEF(VecVT), In2);
+    } else {
+      VecVal1 = DAG.getNode(ISD::BITCAST, DL, VecVT, In1);
+      VecVal2 = DAG.getNode(ISD::BITCAST, DL, VecVT, In2);
     }
-
-    // Attempt to use Vector Integer Compare Mask against Zero instr (Signed).
-    // Note: Compare against Zero does not support unsigned predicates.
-    if ((ISD::isBuildVectorAllZeros(RHS.getNode()) ||
-         ISD::isBuildVectorAllZeros(LHS.getNode())) &&
-        !isUnsignedIntSetCC(CC)) {
-
-      // If LHS is the zero value, swap operands and CondCode.
-      if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
-        CC = getSetCCSwappedOperands(CC);
-        Op0 = RHS;
-      } else
-        Op0 = LHS;
-
-      // Ensure valid CondCode for Compare Mask against Zero instruction:
-      // EQ, GE, GT, LE, LT.
-      if (ISD::SETNE == CC) {
-        Invert = true;
-        CC = ISD::SETEQ;
-      }
-
-      // Using constant type to differentiate integer and FP compares with zero.
-      Op1 = DAG.getConstant(0, MVT::i32);
-      Opcode = AArch64ISD::NEON_CMPZ;
-
+  } else if (VT == MVT::f64 || VT == MVT::v2f64) {
+    EltVT = MVT::i64;
+    VecVT = MVT::v2i64;
+
+    // We want to materialize a mask with the the high bit set, but the AdvSIMD
+    // immediate moves cannot materialize that in a single instruction for
+    // 64-bit elements. Instead, materialize zero and then negate it.
+    EltMask = DAG.getConstant(0, EltVT);
+
+    if (!VT.isVector()) {
+      VecVal1 = DAG.getTargetInsertSubreg(AArch64::dsub, DL, VecVT,
+                                          DAG.getUNDEF(VecVT), In1);
+      VecVal2 = DAG.getTargetInsertSubreg(AArch64::dsub, DL, VecVT,
+                                          DAG.getUNDEF(VecVT), In2);
     } else {
-      // Attempt to use Vector Integer Compare Mask instr (Signed/Unsigned).
-      // Ensure valid CondCode for Compare Mask instr: EQ, GE, GT, UGE, UGT.
-      bool Swap = false;
-      switch (CC) {
-      default:
-        llvm_unreachable("Illegal integer comparison.");
-      case ISD::SETEQ:
-      case ISD::SETGT:
-      case ISD::SETGE:
-      case ISD::SETUGT:
-      case ISD::SETUGE:
-        break;
-      case ISD::SETNE:
-        Invert = true;
-        CC = ISD::SETEQ;
-        break;
-      case ISD::SETULT:
-      case ISD::SETULE:
-      case ISD::SETLT:
-      case ISD::SETLE:
-        Swap = true;
-        CC = getSetCCSwappedOperands(CC);
-      }
-
-      if (Swap)
-        std::swap(LHS, RHS);
-
-      Opcode = AArch64ISD::NEON_CMP;
-      Op0 = LHS;
-      Op1 = RHS;
+      VecVal1 = DAG.getNode(ISD::BITCAST, DL, VecVT, In1);
+      VecVal2 = DAG.getNode(ISD::BITCAST, DL, VecVT, In2);
     }
+  } else {
+    llvm_unreachable("Invalid type for copysign!");
+  }
 
-    // Generate Compare Mask instr or Compare Mask against Zero instr.
-    SDValue NeonCmp =
-        DAG.getNode(Opcode, DL, VT, Op0, Op1, DAG.getCondCode(CC));
+  std::vector<SDValue> BuildVectorOps;
+  for (unsigned i = 0; i < VecVT.getVectorNumElements(); ++i)
+    BuildVectorOps.push_back(EltMask);
 
-    if (Invert)
-      NeonCmp = DAG.getNOT(DL, NeonCmp, VT);
+  SDValue BuildVec = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, BuildVectorOps);
 
-    return NeonCmp;
+  // If we couldn't materialize the mask above, then the mask vector will be
+  // the zero vector, and we need to negate it here.
+  if (VT == MVT::f64 || VT == MVT::v2f64) {
+    BuildVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, BuildVec);
+    BuildVec = DAG.getNode(ISD::FNEG, DL, MVT::v2f64, BuildVec);
+    BuildVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, BuildVec);
   }
 
-  // Now handle Floating Point cases.
-  // Attempt to use Vector Floating Point Compare Mask against Zero instruction.
-  if (ISD::isBuildVectorAllZeros(RHS.getNode()) ||
-      ISD::isBuildVectorAllZeros(LHS.getNode())) {
-
-    // If LHS is the zero value, swap operands and CondCode.
-    if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
-      CC = getSetCCSwappedOperands(CC);
-      Op0 = RHS;
-    } else
-      Op0 = LHS;
+  SDValue Sel =
+      DAG.getNode(AArch64ISD::BIT, DL, VecVT, VecVal1, VecVal2, BuildVec);
 
-    // Using constant type to differentiate integer and FP compares with zero.
-    Op1 = DAG.getConstantFP(0, MVT::f32);
-    Opcode = AArch64ISD::NEON_CMPZ;
-  } else {
-    // Attempt to use Vector Floating Point Compare Mask instruction.
-    Op0 = LHS;
-    Op1 = RHS;
-    Opcode = AArch64ISD::NEON_CMP;
-  }
+  if (VT == MVT::f32)
+    return DAG.getTargetExtractSubreg(AArch64::ssub, DL, VT, Sel);
+  else if (VT == MVT::f64)
+    return DAG.getTargetExtractSubreg(AArch64::dsub, DL, VT, Sel);
+  else
+    return DAG.getNode(ISD::BITCAST, DL, VT, Sel);
+}
 
-  SDValue NeonCmpAlt;
-  // Some register compares have to be implemented with swapped CC and operands,
-  // e.g.: OLT implemented as OGT with swapped operands.
-  bool SwapIfRegArgs = false;
+SDValue AArch64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const {
+  if (DAG.getMachineFunction().getFunction()->getAttributes().hasAttribute(
+          AttributeSet::FunctionIndex, Attribute::NoImplicitFloat))
+    return SDValue();
 
-  // Ensure valid CondCode for FP Compare Mask against Zero instruction:
-  // EQ, GE, GT, LE, LT.
-  // And ensure valid CondCode for FP Compare Mask instruction: EQ, GE, GT.
-  switch (CC) {
-  default:
-    llvm_unreachable("Illegal FP comparison");
-  case ISD::SETUNE:
-  case ISD::SETNE:
-    Invert = true; // Fallthrough
-  case ISD::SETOEQ:
-  case ISD::SETEQ:
-    CC = ISD::SETEQ;
-    break;
-  case ISD::SETOLT:
-  case ISD::SETLT:
-    CC = ISD::SETLT;
-    SwapIfRegArgs = true;
-    break;
-  case ISD::SETOGT:
-  case ISD::SETGT:
-    CC = ISD::SETGT;
-    break;
-  case ISD::SETOLE:
-  case ISD::SETLE:
-    CC = ISD::SETLE;
-    SwapIfRegArgs = true;
-    break;
-  case ISD::SETOGE:
-  case ISD::SETGE:
-    CC = ISD::SETGE;
-    break;
-  case ISD::SETUGE:
-    Invert = true;
-    CC = ISD::SETLT;
-    SwapIfRegArgs = true;
-    break;
-  case ISD::SETULE:
-    Invert = true;
-    CC = ISD::SETGT;
-    break;
-  case ISD::SETUGT:
-    Invert = true;
-    CC = ISD::SETLE;
-    SwapIfRegArgs = true;
-    break;
-  case ISD::SETULT:
-    Invert = true;
-    CC = ISD::SETGE;
-    break;
-  case ISD::SETUEQ:
-    Invert = true; // Fallthrough
-  case ISD::SETONE:
-    // Expand this to (OGT |OLT).
-    NeonCmpAlt =
-        DAG.getNode(Opcode, DL, VT, Op0, Op1, DAG.getCondCode(ISD::SETGT));
-    CC = ISD::SETLT;
-    SwapIfRegArgs = true;
-    break;
-  case ISD::SETUO:
-    Invert = true; // Fallthrough
-  case ISD::SETO:
-    // Expand this to (OGE | OLT).
-    NeonCmpAlt =
-        DAG.getNode(Opcode, DL, VT, Op0, Op1, DAG.getCondCode(ISD::SETGE));
-    CC = ISD::SETLT;
-    SwapIfRegArgs = true;
-    break;
-  }
+  // While there is no integer popcount instruction, it can
+  // be more efficiently lowered to the following sequence that uses
+  // AdvSIMD registers/instructions as long as the copies to/from
+  // the AdvSIMD registers are cheap.
+  //  FMOV    D0, X0        // copy 64-bit int to vector, high bits zero'd
+  //  CNT     V0.8B, V0.8B  // 8xbyte pop-counts
+  //  ADDV    B0, V0.8B     // sum 8xbyte pop-counts
+  //  UMOV    X0, V0.B[0]   // copy byte result back to integer reg
+  SDValue Val = Op.getOperand(0);
+  SDLoc DL(Op);
+  EVT VT = Op.getValueType();
+  SDValue ZeroVec = DAG.getUNDEF(MVT::v8i8);
 
-  if (Opcode == AArch64ISD::NEON_CMP && SwapIfRegArgs) {
-    CC = getSetCCSwappedOperands(CC);
-    std::swap(Op0, Op1);
+  SDValue VecVal;
+  if (VT == MVT::i32) {
+    VecVal = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Val);
+    VecVal = DAG.getTargetInsertSubreg(AArch64::ssub, DL, MVT::v8i8, ZeroVec,
+                                       VecVal);
+  } else {
+    VecVal = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val);
   }
 
-  // Generate FP Compare Mask instr or FP Compare Mask against Zero instr
-  SDValue NeonCmp = DAG.getNode(Opcode, DL, VT, Op0, Op1, DAG.getCondCode(CC));
+  SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, VecVal);
+  SDValue UaddLV = DAG.getNode(
+      ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32,
+      DAG.getConstant(Intrinsic::aarch64_neon_uaddlv, MVT::i32), CtPop);
 
-  if (NeonCmpAlt.getNode())
-    NeonCmp = DAG.getNode(ISD::OR, DL, VT, NeonCmp, NeonCmpAlt);
+  if (VT == MVT::i64)
+    UaddLV = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, UaddLV);
+  return UaddLV;
+}
 
-  if (Invert)
-    NeonCmp = DAG.getNOT(DL, NeonCmp, VT);
+SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
 
-  return NeonCmp;
-}
+  if (Op.getValueType().isVector())
+    return LowerVSETCC(Op, DAG);
 
-// (SETCC lhs, rhs, condcode)
-SDValue
-AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
-  SDLoc dl(Op);
   SDValue LHS = Op.getOperand(0);
   SDValue RHS = Op.getOperand(1);
   ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
-  EVT VT = Op.getValueType();
+  SDLoc dl(Op);
 
-  if (VT.isVector())
-    return LowerVectorSETCC(Op, DAG);
+  // We chose ZeroOrOneBooleanContents, so use zero and one.
+  EVT VT = Op.getValueType();
+  SDValue TVal = DAG.getConstant(1, VT);
+  SDValue FVal = DAG.getConstant(0, VT);
 
+  // Handle f128 first, since one possible outcome is a normal integer
+  // comparison which gets picked up by the next if statement.
   if (LHS.getValueType() == MVT::f128) {
-    // f128 comparisons will be lowered to libcalls giving a valid LHS and RHS
-    // for the rest of the function (some i32 or i64 values).
     softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl);
 
     // If softenSetCCOperands returned a scalar, use it.
-    if (RHS.getNode() == 0) {
+    if (!RHS.getNode()) {
       assert(LHS.getValueType() == Op.getValueType() &&
              "Unexpected setcc expansion!");
       return LHS;
@@ -3021,205 +3106,403 @@ AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
   }
 
   if (LHS.getValueType().isInteger()) {
-    SDValue A64cc;
+    SDValue CCVal;
+    SDValue Cmp =
+        getAArch64Cmp(LHS, RHS, ISD::getSetCCInverse(CC, true), CCVal, DAG, dl);
+
+    // Note that we inverted the condition above, so we reverse the order of
+    // the true and false operands here.  This will allow the setcc to be
+    // matched to a single CSINC instruction.
+    return DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CCVal, Cmp);
+  }
+
+  // Now we know we're dealing with FP values.
+  assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
+
+  // If that fails, we'll need to perform an FCMP + CSEL sequence.  Go ahead
+  // and do the comparison.
+  SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
 
-    // Integers are handled in a separate function because the combinations of
-    // immediates and tests can get hairy and we may want to fiddle things.
-    SDValue CmpOp = getSelectableIntSetCC(LHS, RHS, CC, A64cc, DAG, dl);
+  AArch64CC::CondCode CC1, CC2;
+  changeFPCCToAArch64CC(CC, CC1, CC2);
+  if (CC2 == AArch64CC::AL) {
+    changeFPCCToAArch64CC(ISD::getSetCCInverse(CC, false), CC1, CC2);
+    SDValue CC1Val = DAG.getConstant(CC1, MVT::i32);
 
-    return DAG.getNode(AArch64ISD::SELECT_CC, dl, VT,
-                       CmpOp, DAG.getConstant(1, VT), DAG.getConstant(0, VT),
-                       A64cc);
+    // Note that we inverted the condition above, so we reverse the order of
+    // the true and false operands here.  This will allow the setcc to be
+    // matched to a single CSINC instruction.
+    return DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CC1Val, Cmp);
+  } else {
+    // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't
+    // totally clean.  Some of them require two CSELs to implement.  As is in
+    // this case, we emit the first CSEL and then emit a second using the output
+    // of the first as the RHS.  We're effectively OR'ing the two CC's together.
+
+    // FIXME: It would be nice if we could match the two CSELs to two CSINCs.
+    SDValue CC1Val = DAG.getConstant(CC1, MVT::i32);
+    SDValue CS1 =
+        DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);
+
+    SDValue CC2Val = DAG.getConstant(CC2, MVT::i32);
+    return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
   }
+}
 
-  // Note that some LLVM floating-point CondCodes can't be lowered to a single
-  // conditional branch, hence FPCCToA64CC can set a second test, where either
-  // passing is sufficient.
-  A64CC::CondCodes CondCode, Alternative = A64CC::Invalid;
-  CondCode = FPCCToA64CC(CC, Alternative);
-  SDValue A64cc = DAG.getConstant(CondCode, MVT::i32);
-  SDValue CmpOp = DAG.getNode(AArch64ISD::SETCC, dl, MVT::i32, LHS, RHS,
-                              DAG.getCondCode(CC));
-  SDValue A64SELECT_CC = DAG.getNode(AArch64ISD::SELECT_CC, dl, VT,
-                                     CmpOp, DAG.getConstant(1, VT),
-                                     DAG.getConstant(0, VT), A64cc);
+/// A SELECT_CC operation is really some kind of max or min if both values being
+/// compared are, in some sense, equal to the results in either case. However,
+/// it is permissible to compare f32 values and produce directly extended f64
+/// values.
+///
+/// Extending the comparison operands would also be allowed, but is less likely
+/// to happen in practice since their use is right here. Note that truncate
+/// operations would *not* be semantically equivalent.
+static bool selectCCOpsAreFMaxCompatible(SDValue Cmp, SDValue Result) {
+  if (Cmp == Result)
+    return true;
 
-  if (Alternative != A64CC::Invalid) {
-    A64cc = DAG.getConstant(Alternative, MVT::i32);
-    A64SELECT_CC = DAG.getNode(AArch64ISD::SELECT_CC, dl, VT, CmpOp,
-                               DAG.getConstant(1, VT), A64SELECT_CC, A64cc);
+  ConstantFPSDNode *CCmp = dyn_cast<ConstantFPSDNode>(Cmp);
+  ConstantFPSDNode *CResult = dyn_cast<ConstantFPSDNode>(Result);
+  if (CCmp && CResult && Cmp.getValueType() == MVT::f32 &&
+      Result.getValueType() == MVT::f64) {
+    bool Lossy;
+    APFloat CmpVal = CCmp->getValueAPF();
+    CmpVal.convert(APFloat::IEEEdouble, APFloat::rmNearestTiesToEven, &Lossy);
+    return CResult->getValueAPF().bitwiseIsEqual(CmpVal);
   }
 
-  return A64SELECT_CC;
+  return Result->getOpcode() == ISD::FP_EXTEND && Result->getOperand(0) == Cmp;
 }
 
-static SDValue LowerVectorSELECT_CC(SDValue Op, SelectionDAG &DAG) {
-  SDLoc dl(Op);
-  SDValue LHS = Op.getOperand(0);
-  SDValue RHS = Op.getOperand(1);
-  SDValue IfTrue = Op.getOperand(2);
-  SDValue IfFalse = Op.getOperand(3);
-  EVT IfTrueVT = IfTrue.getValueType();
-  EVT CondVT = IfTrueVT.changeVectorElementTypeToInteger();
-  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
+SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
+                                           SelectionDAG &DAG) const {
+  SDValue CC = Op->getOperand(0);
+  SDValue TVal = Op->getOperand(1);
+  SDValue FVal = Op->getOperand(2);
+  SDLoc DL(Op);
 
-  // If LHS & RHS are floating point and IfTrue & IfFalse are vectors, we will
-  // use NEON compare.
-  if ((LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64)) {
-    EVT EltVT = LHS.getValueType();
-    unsigned EltNum = 128 / EltVT.getSizeInBits();
-    EVT VT = EVT::getVectorVT(*DAG.getContext(), EltVT, EltNum);
-    unsigned SubConstant =
-        (LHS.getValueType() == MVT::f32) ? AArch64::sub_32 :AArch64::sub_64;
-    EVT CEltT = (LHS.getValueType() == MVT::f32) ? MVT::i32 : MVT::i64;
-    EVT CVT = EVT::getVectorVT(*DAG.getContext(), CEltT, EltNum);
-
-    LHS
-      = SDValue(DAG.getMachineNode(TargetOpcode::SUBREG_TO_REG, dl,
-                  VT, DAG.getTargetConstant(0, MVT::i32), LHS,
-                  DAG.getTargetConstant(SubConstant, MVT::i32)), 0);
-    RHS
-      = SDValue(DAG.getMachineNode(TargetOpcode::SUBREG_TO_REG, dl,
-                  VT, DAG.getTargetConstant(0, MVT::i32), RHS,
-                  DAG.getTargetConstant(SubConstant, MVT::i32)), 0);
-
-    SDValue VSetCC = DAG.getSetCC(dl, CVT, LHS, RHS, CC);
-    SDValue ResCC = LowerVectorSETCC(VSetCC, DAG);
-    if (CEltT.getSizeInBits() < IfTrueVT.getSizeInBits()) {
-      EVT DUPVT =
-          EVT::getVectorVT(*DAG.getContext(), CEltT,
-                           IfTrueVT.getSizeInBits() / CEltT.getSizeInBits());
-      ResCC = DAG.getNode(AArch64ISD::NEON_VDUPLANE, dl, DUPVT, ResCC,
-                          DAG.getConstant(0, MVT::i64, false));
-
-      ResCC = DAG.getNode(ISD::BITCAST, dl, CondVT, ResCC);
-    } else {
-      // FIXME: If IfTrue & IfFalse hold v1i8, v1i16 or v1i32, this function
-      // can't handle them and will hit this assert.
-      assert(CEltT.getSizeInBits() == IfTrueVT.getSizeInBits() &&
-             "Vector of IfTrue & IfFalse is too small.");
-
-      unsigned ExEltNum =
-          EltNum * IfTrueVT.getSizeInBits() / ResCC.getValueSizeInBits();
-      EVT ExVT = EVT::getVectorVT(*DAG.getContext(), CEltT, ExEltNum);
-      ResCC = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ExVT, ResCC,
-                          DAG.getConstant(0, MVT::i64, false));
-      ResCC = DAG.getNode(ISD::BITCAST, dl, CondVT, ResCC);
-    }
-    SDValue VSelect = DAG.getNode(ISD::VSELECT, dl, IfTrue.getValueType(),
-                                  ResCC, IfTrue, IfFalse);
-    return VSelect;
-  }
-
-  // Here we handle the case that LHS & RHS are integer and IfTrue & IfFalse are
-  // vectors.
-  A64CC::CondCodes CondCode, Alternative = A64CC::Invalid;
-  CondCode = FPCCToA64CC(CC, Alternative);
-  SDValue A64cc = DAG.getConstant(CondCode, MVT::i32);
-  SDValue SetCC = DAG.getNode(AArch64ISD::SETCC, dl, MVT::i32, LHS, RHS,
-                              DAG.getCondCode(CC));
-  EVT SEVT = MVT::i32;
-  if (IfTrue.getValueType().getVectorElementType().getSizeInBits() > 32)
-    SEVT = MVT::i64;
-  SDValue AllOne = DAG.getConstant(-1, SEVT);
-  SDValue AllZero = DAG.getConstant(0, SEVT);
-  SDValue A64SELECT_CC = DAG.getNode(AArch64ISD::SELECT_CC, dl, SEVT, SetCC,
-                                     AllOne, AllZero, A64cc);
-
-  if (Alternative != A64CC::Invalid) {
-    A64cc = DAG.getConstant(Alternative, MVT::i32);
-    A64SELECT_CC = DAG.getNode(AArch64ISD::SELECT_CC, dl, Op.getValueType(),
-                               SetCC, AllOne, A64SELECT_CC, A64cc);
-  }
-  SDValue VDup;
-  if (IfTrue.getValueType().getVectorNumElements() == 1)
-    VDup = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, CondVT, A64SELECT_CC);
+  unsigned Opc = CC.getOpcode();
+  // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select
+  // instruction.
+  if (CC.getResNo() == 1 &&
+      (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
+       Opc == ISD::USUBO || Opc == ISD::SMULO || Opc == ISD::UMULO)) {
+    // Only lower legal XALUO ops.
+    if (!DAG.getTargetLoweringInfo().isTypeLegal(CC->getValueType(0)))
+      return SDValue();
+
+    AArch64CC::CondCode OFCC;
+    SDValue Value, Overflow;
+    std::tie(Value, Overflow) = getAArch64XALUOOp(OFCC, CC.getValue(0), DAG);
+    SDValue CCVal = DAG.getConstant(OFCC, MVT::i32);
+
+    return DAG.getNode(AArch64ISD::CSEL, DL, Op.getValueType(), TVal, FVal,
+                       CCVal, Overflow);
+  }
+
+  if (CC.getOpcode() == ISD::SETCC)
+    return DAG.getSelectCC(DL, CC.getOperand(0), CC.getOperand(1), TVal, FVal,
+                           cast<CondCodeSDNode>(CC.getOperand(2))->get());
   else
-    VDup = DAG.getNode(AArch64ISD::NEON_VDUP, dl, CondVT, A64SELECT_CC);
-  SDValue VSelect = DAG.getNode(ISD::VSELECT, dl, IfTrue.getValueType(),
-                                VDup, IfTrue, IfFalse);
-  return VSelect;
+    return DAG.getSelectCC(DL, CC, DAG.getConstant(0, CC.getValueType()), TVal,
+                           FVal, ISD::SETNE);
 }
 
-// (SELECT_CC lhs, rhs, iftrue, iffalse, condcode)
-SDValue
-AArch64TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
-  SDLoc dl(Op);
+SDValue AArch64TargetLowering::LowerSELECT_CC(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
   SDValue LHS = Op.getOperand(0);
   SDValue RHS = Op.getOperand(1);
-  SDValue IfTrue = Op.getOperand(2);
-  SDValue IfFalse = Op.getOperand(3);
-  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
-
-  if (IfTrue.getValueType().isVector())
-    return LowerVectorSELECT_CC(Op, DAG);
+  SDValue TVal = Op.getOperand(2);
+  SDValue FVal = Op.getOperand(3);
+  SDLoc dl(Op);
 
+  // Handle f128 first, because it will result in a comparison of some RTLIB
+  // call result against zero.
   if (LHS.getValueType() == MVT::f128) {
-    // f128 comparisons are lowered to libcalls, but slot in nicely here
-    // afterwards.
     softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl);
 
     // If softenSetCCOperands returned a scalar, we need to compare the result
     // against zero to select between true and false values.
-    if (RHS.getNode() == 0) {
+    if (!RHS.getNode()) {
       RHS = DAG.getConstant(0, LHS.getValueType());
       CC = ISD::SETNE;
     }
   }
 
+  // Handle integers first.
   if (LHS.getValueType().isInteger()) {
-    SDValue A64cc;
+    assert((LHS.getValueType() == RHS.getValueType()) &&
+           (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
+
+    unsigned Opcode = AArch64ISD::CSEL;
+
+    // If both the TVal and the FVal are constants, see if we can swap them in
+    // order to for a CSINV or CSINC out of them.
+    ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
+    ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
+
+    if (CTVal && CFVal && CTVal->isAllOnesValue() && CFVal->isNullValue()) {
+      std::swap(TVal, FVal);
+      std::swap(CTVal, CFVal);
+      CC = ISD::getSetCCInverse(CC, true);
+    } else if (CTVal && CFVal && CTVal->isOne() && CFVal->isNullValue()) {
+      std::swap(TVal, FVal);
+      std::swap(CTVal, CFVal);
+      CC = ISD::getSetCCInverse(CC, true);
+    } else if (TVal.getOpcode() == ISD::XOR) {
+      // If TVal is a NOT we want to swap TVal and FVal so that we can match
+      // with a CSINV rather than a CSEL.
+      ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(TVal.getOperand(1));
+
+      if (CVal && CVal->isAllOnesValue()) {
+        std::swap(TVal, FVal);
+        std::swap(CTVal, CFVal);
+        CC = ISD::getSetCCInverse(CC, true);
+      }
+    } else if (TVal.getOpcode() == ISD::SUB) {
+      // If TVal is a negation (SUB from 0) we want to swap TVal and FVal so
+      // that we can match with a CSNEG rather than a CSEL.
+      ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(TVal.getOperand(0));
+
+      if (CVal && CVal->isNullValue()) {
+        std::swap(TVal, FVal);
+        std::swap(CTVal, CFVal);
+        CC = ISD::getSetCCInverse(CC, true);
+      }
+    } else if (CTVal && CFVal) {
+      const int64_t TrueVal = CTVal->getSExtValue();
+      const int64_t FalseVal = CFVal->getSExtValue();
+      bool Swap = false;
 
-    // Integers are handled in a separate function because the combinations of
-    // immediates and tests can get hairy and we may want to fiddle things.
-    SDValue CmpOp = getSelectableIntSetCC(LHS, RHS, CC, A64cc, DAG, dl);
+      // If both TVal and FVal are constants, see if FVal is the
+      // inverse/negation/increment of TVal and generate a CSINV/CSNEG/CSINC
+      // instead of a CSEL in that case.
+      if (TrueVal == ~FalseVal) {
+        Opcode = AArch64ISD::CSINV;
+      } else if (TrueVal == -FalseVal) {
+        Opcode = AArch64ISD::CSNEG;
+      } else if (TVal.getValueType() == MVT::i32) {
+        // If our operands are only 32-bit wide, make sure we use 32-bit
+        // arithmetic for the check whether we can use CSINC. This ensures that
+        // the addition in the check will wrap around properly in case there is
+        // an overflow (which would not be the case if we do the check with
+        // 64-bit arithmetic).
+        const uint32_t TrueVal32 = CTVal->getZExtValue();
+        const uint32_t FalseVal32 = CFVal->getZExtValue();
+
+        if ((TrueVal32 == FalseVal32 + 1) || (TrueVal32 + 1 == FalseVal32)) {
+          Opcode = AArch64ISD::CSINC;
+
+          if (TrueVal32 > FalseVal32) {
+            Swap = true;
+          }
+        }
+        // 64-bit check whether we can use CSINC.
+      } else if ((TrueVal == FalseVal + 1) || (TrueVal + 1 == FalseVal)) {
+        Opcode = AArch64ISD::CSINC;
+
+        if (TrueVal > FalseVal) {
+          Swap = true;
+        }
+      }
+
+      // Swap TVal and FVal if necessary.
+      if (Swap) {
+        std::swap(TVal, FVal);
+        std::swap(CTVal, CFVal);
+        CC = ISD::getSetCCInverse(CC, true);
+      }
+
+      if (Opcode != AArch64ISD::CSEL) {
+        // Drop FVal since we can get its value by simply inverting/negating
+        // TVal.
+        FVal = TVal;
+      }
+    }
+
+    SDValue CCVal;
+    SDValue Cmp = getAArch64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
 
-    return DAG.getNode(AArch64ISD::SELECT_CC, dl, Op.getValueType(), CmpOp,
-                       IfTrue, IfFalse, A64cc);
+    EVT VT = Op.getValueType();
+    return DAG.getNode(Opcode, dl, VT, TVal, FVal, CCVal, Cmp);
   }
 
-  // Note that some LLVM floating-point CondCodes can't be lowered to a single
-  // conditional branch, hence FPCCToA64CC can set a second test, where either
-  // passing is sufficient.
-  A64CC::CondCodes CondCode, Alternative = A64CC::Invalid;
-  CondCode = FPCCToA64CC(CC, Alternative);
-  SDValue A64cc = DAG.getConstant(CondCode, MVT::i32);
-  SDValue SetCC = DAG.getNode(AArch64ISD::SETCC, dl, MVT::i32, LHS, RHS,
-                              DAG.getCondCode(CC));
-  SDValue A64SELECT_CC = DAG.getNode(AArch64ISD::SELECT_CC, dl,
-                                     Op.getValueType(),
-                                     SetCC, IfTrue, IfFalse, A64cc);
+  // Now we know we're dealing with FP values.
+  assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
+  assert(LHS.getValueType() == RHS.getValueType());
+  EVT VT = Op.getValueType();
+
+  // Try to match this select into a max/min operation, which have dedicated
+  // opcode in the instruction set.
+  // FIXME: This is not correct in the presence of NaNs, so we only enable this
+  // in no-NaNs mode.
+  if (getTargetMachine().Options.NoNaNsFPMath) {
+    SDValue MinMaxLHS = TVal, MinMaxRHS = FVal;
+    if (selectCCOpsAreFMaxCompatible(LHS, MinMaxRHS) &&
+        selectCCOpsAreFMaxCompatible(RHS, MinMaxLHS)) {
+      CC = ISD::getSetCCSwappedOperands(CC);
+      std::swap(MinMaxLHS, MinMaxRHS);
+    }
 
-  if (Alternative != A64CC::Invalid) {
-    A64cc = DAG.getConstant(Alternative, MVT::i32);
-    A64SELECT_CC = DAG.getNode(AArch64ISD::SELECT_CC, dl, Op.getValueType(),
-                               SetCC, IfTrue, A64SELECT_CC, A64cc);
+    if (selectCCOpsAreFMaxCompatible(LHS, MinMaxLHS) &&
+        selectCCOpsAreFMaxCompatible(RHS, MinMaxRHS)) {
+      switch (CC) {
+      default:
+        break;
+      case ISD::SETGT:
+      case ISD::SETGE:
+      case ISD::SETUGT:
+      case ISD::SETUGE:
+      case ISD::SETOGT:
+      case ISD::SETOGE:
+        return DAG.getNode(AArch64ISD::FMAX, dl, VT, MinMaxLHS, MinMaxRHS);
+        break;
+      case ISD::SETLT:
+      case ISD::SETLE:
+      case ISD::SETULT:
+      case ISD::SETULE:
+      case ISD::SETOLT:
+      case ISD::SETOLE:
+        return DAG.getNode(AArch64ISD::FMIN, dl, VT, MinMaxLHS, MinMaxRHS);
+        break;
+      }
+    }
+  }
 
+  // If that fails, we'll need to perform an FCMP + CSEL sequence.  Go ahead
+  // and do the comparison.
+  SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
+
+  // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
+  // clean.  Some of them require two CSELs to implement.
+  AArch64CC::CondCode CC1, CC2;
+  changeFPCCToAArch64CC(CC, CC1, CC2);
+  SDValue CC1Val = DAG.getConstant(CC1, MVT::i32);
+  SDValue CS1 = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);
+
+  // If we need a second CSEL, emit it, using the output of the first as the
+  // RHS.  We're effectively OR'ing the two CC's together.
+  if (CC2 != AArch64CC::AL) {
+    SDValue CC2Val = DAG.getConstant(CC2, MVT::i32);
+    return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
   }
 
-  return A64SELECT_CC;
+  // Otherwise, return the output of the first CSEL.
+  return CS1;
 }
 
-SDValue
-AArch64TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const {
-  const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
-  const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
+SDValue AArch64TargetLowering::LowerJumpTable(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  // Jump table entries as PC relative offsets. No additional tweaking
+  // is necessary here. Just get the address of the jump table.
+  JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
+  EVT PtrVT = getPointerTy();
+  SDLoc DL(Op);
 
-  // We have to make sure we copy the entire structure: 8+8+8+4+4 = 32 bytes
-  // rather than just 8.
-  return DAG.getMemcpy(Op.getOperand(0), SDLoc(Op),
-                       Op.getOperand(1), Op.getOperand(2),
-                       DAG.getConstant(32, MVT::i32), 8, false, false,
-                       MachinePointerInfo(DestSV), MachinePointerInfo(SrcSV));
+  if (getTargetMachine().getCodeModel() == CodeModel::Large &&
+      !Subtarget->isTargetMachO()) {
+    const unsigned char MO_NC = AArch64II::MO_NC;
+    return DAG.getNode(
+        AArch64ISD::WrapperLarge, DL, PtrVT,
+        DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_G3),
+        DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_G2 | MO_NC),
+        DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_G1 | MO_NC),
+        DAG.getTargetJumpTable(JT->getIndex(), PtrVT,
+                               AArch64II::MO_G0 | MO_NC));
+  }
+
+  SDValue Hi =
+      DAG.getTargetJumpTable(JT->getIndex(), PtrVT, AArch64II::MO_PAGE);
+  SDValue Lo = DAG.getTargetJumpTable(JT->getIndex(), PtrVT,
+                                      AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
+  SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi);
+  return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo);
 }
 
-SDValue
-AArch64TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
+SDValue AArch64TargetLowering::LowerConstantPool(SDValue Op,
+                                                 SelectionDAG &DAG) const {
+  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
+  EVT PtrVT = getPointerTy();
+  SDLoc DL(Op);
+
+  if (getTargetMachine().getCodeModel() == CodeModel::Large) {
+    // Use the GOT for the large code model on iOS.
+    if (Subtarget->isTargetMachO()) {
+      SDValue GotAddr = DAG.getTargetConstantPool(
+          CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(),
+          AArch64II::MO_GOT);
+      return DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, GotAddr);
+    }
+
+    const unsigned char MO_NC = AArch64II::MO_NC;
+    return DAG.getNode(
+        AArch64ISD::WrapperLarge, DL, PtrVT,
+        DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(),
+                                  CP->getOffset(), AArch64II::MO_G3),
+        DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(),
+                                  CP->getOffset(), AArch64II::MO_G2 | MO_NC),
+        DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(),
+                                  CP->getOffset(), AArch64II::MO_G1 | MO_NC),
+        DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(),
+                                  CP->getOffset(), AArch64II::MO_G0 | MO_NC));
+  } else {
+    // Use ADRP/ADD or ADRP/LDR for everything else: the small memory model on
+    // ELF, the only valid one on Darwin.
+    SDValue Hi =
+        DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(),
+                                  CP->getOffset(), AArch64II::MO_PAGE);
+    SDValue Lo = DAG.getTargetConstantPool(
+        CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(),
+        AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
+
+    SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi);
+    return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo);
+  }
+}
+
+SDValue AArch64TargetLowering::LowerBlockAddress(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
+  EVT PtrVT = getPointerTy();
+  SDLoc DL(Op);
+  if (getTargetMachine().getCodeModel() == CodeModel::Large &&
+      !Subtarget->isTargetMachO()) {
+    const unsigned char MO_NC = AArch64II::MO_NC;
+    return DAG.getNode(
+        AArch64ISD::WrapperLarge, DL, PtrVT,
+        DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G3),
+        DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G2 | MO_NC),
+        DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G1 | MO_NC),
+        DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_G0 | MO_NC));
+  } else {
+    SDValue Hi = DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_PAGE);
+    SDValue Lo = DAG.getTargetBlockAddress(BA, PtrVT, 0, AArch64II::MO_PAGEOFF |
+                                                             AArch64II::MO_NC);
+    SDValue ADRP = DAG.getNode(AArch64ISD::ADRP, DL, PtrVT, Hi);
+    return DAG.getNode(AArch64ISD::ADDlow, DL, PtrVT, ADRP, Lo);
+  }
+}
+
+SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op,
+                                                 SelectionDAG &DAG) const {
+  AArch64FunctionInfo *FuncInfo =
+      DAG.getMachineFunction().getInfo<AArch64FunctionInfo>();
+
+  SDLoc DL(Op);
+  SDValue FR =
+      DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), getPointerTy());
+  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
+  return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
+                      MachinePointerInfo(SV), false, false, 0);
+}
+
+SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op,
+                                                SelectionDAG &DAG) const {
   // The layout of the va_list struct is specified in the AArch64 Procedure Call
   // Standard, section B.3.
   MachineFunction &MF = DAG.getMachineFunction();
-  AArch64MachineFunctionInfo *FuncInfo
-    = MF.getInfo<AArch64MachineFunctionInfo>();
+  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
   SDLoc DL(Op);
 
   SDValue Chain = Op.getOperand(0);
@@ -3228,498 +3511,2894 @@ AArch64TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
   SmallVector<SDValue, 4> MemOps;
 
   // void *__stack at offset 0
-  SDValue Stack = DAG.getFrameIndex(FuncInfo->getVariadicStackIdx(),
-                                    getPointerTy());
+  SDValue Stack =
+      DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), getPointerTy());
   MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList,
-                                MachinePointerInfo(SV), false, false, 0));
+                                MachinePointerInfo(SV), false, false, 8));
 
   // void *__gr_top at offset 8
-  int GPRSize = FuncInfo->getVariadicGPRSize();
+  int GPRSize = FuncInfo->getVarArgsGPRSize();
   if (GPRSize > 0) {
     SDValue GRTop, GRTopAddr;
 
     GRTopAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList,
                             DAG.getConstant(8, getPointerTy()));
 
-    GRTop = DAG.getFrameIndex(FuncInfo->getVariadicGPRIdx(), getPointerTy());
+    GRTop = DAG.getFrameIndex(FuncInfo->getVarArgsGPRIndex(), getPointerTy());
     GRTop = DAG.getNode(ISD::ADD, DL, getPointerTy(), GRTop,
                         DAG.getConstant(GPRSize, getPointerTy()));
 
     MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr,
-                                  MachinePointerInfo(SV, 8),
-                                  false, false, 0));
+                                  MachinePointerInfo(SV, 8), false, false, 8));
   }
 
   // void *__vr_top at offset 16
-  int FPRSize = FuncInfo->getVariadicFPRSize();
+  int FPRSize = FuncInfo->getVarArgsFPRSize();
   if (FPRSize > 0) {
     SDValue VRTop, VRTopAddr;
     VRTopAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList,
                             DAG.getConstant(16, getPointerTy()));
 
-    VRTop = DAG.getFrameIndex(FuncInfo->getVariadicFPRIdx(), getPointerTy());
+    VRTop = DAG.getFrameIndex(FuncInfo->getVarArgsFPRIndex(), getPointerTy());
     VRTop = DAG.getNode(ISD::ADD, DL, getPointerTy(), VRTop,
                         DAG.getConstant(FPRSize, getPointerTy()));
 
     MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr,
-                                  MachinePointerInfo(SV, 16),
-                                  false, false, 0));
+                                  MachinePointerInfo(SV, 16), false, false, 8));
   }
 
   // int __gr_offs at offset 24
   SDValue GROffsAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList,
                                    DAG.getConstant(24, getPointerTy()));
   MemOps.push_back(DAG.getStore(Chain, DL, DAG.getConstant(-GPRSize, MVT::i32),
-                                GROffsAddr, MachinePointerInfo(SV, 24),
-                                false, false, 0));
+                                GROffsAddr, MachinePointerInfo(SV, 24), false,
+                                false, 4));
 
   // int __vr_offs at offset 28
   SDValue VROffsAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList,
                                    DAG.getConstant(28, getPointerTy()));
   MemOps.push_back(DAG.getStore(Chain, DL, DAG.getConstant(-FPRSize, MVT::i32),
-                                VROffsAddr, MachinePointerInfo(SV, 28),
-                                false, false, 0));
+                                VROffsAddr, MachinePointerInfo(SV, 28), false,
+                                false, 4));
 
-  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, &MemOps[0],
-                     MemOps.size());
+  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
 }
 
-SDValue
-AArch64TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
-  switch (Op.getOpcode()) {
-  default: llvm_unreachable("Don't know how to custom lower this!");
-  case ISD::FADD: return LowerF128ToCall(Op, DAG, RTLIB::ADD_F128);
-  case ISD::FSUB: return LowerF128ToCall(Op, DAG, RTLIB::SUB_F128);
-  case ISD::FMUL: return LowerF128ToCall(Op, DAG, RTLIB::MUL_F128);
-  case ISD::FDIV: return LowerF128ToCall(Op, DAG, RTLIB::DIV_F128);
-  case ISD::FP_TO_SINT: return LowerFP_TO_INT(Op, DAG, true);
-  case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG, false);
-  case ISD::SINT_TO_FP: return LowerINT_TO_FP(Op, DAG, true);
-  case ISD::UINT_TO_FP: return LowerINT_TO_FP(Op, DAG, false);
-  case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG);
-  case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
-  case ISD::RETURNADDR:    return LowerRETURNADDR(Op, DAG);
-  case ISD::FRAMEADDR:     return LowerFRAMEADDR(Op, DAG);
-
-  case ISD::SHL_PARTS:     return LowerShiftLeftParts(Op, DAG);
-  case ISD::SRL_PARTS:
-  case ISD::SRA_PARTS:     return LowerShiftRightParts(Op, DAG);
-
-  case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
-  case ISD::BRCOND: return LowerBRCOND(Op, DAG);
-  case ISD::BR_CC: return LowerBR_CC(Op, DAG);
-  case ISD::GlobalAddress: return LowerGlobalAddressELF(Op, DAG);
-  case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
-  case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
-  case ISD::JumpTable: return LowerJumpTable(Op, DAG);
-  case ISD::SELECT: return LowerSELECT(Op, DAG);
-  case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
-  case ISD::SETCC: return LowerSETCC(Op, DAG);
-  case ISD::VACOPY: return LowerVACOPY(Op, DAG);
-  case ISD::VASTART: return LowerVASTART(Op, DAG);
-  case ISD::BUILD_VECTOR:
-    return LowerBUILD_VECTOR(Op, DAG, getSubtarget());
-  case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
-  case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG);
+SDValue AArch64TargetLowering::LowerVASTART(SDValue Op,
+                                            SelectionDAG &DAG) const {
+  return Subtarget->isTargetDarwin() ? LowerDarwin_VASTART(Op, DAG)
+                                     : LowerAAPCS_VASTART(Op, DAG);
+}
+
+SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op,
+                                           SelectionDAG &DAG) const {
+  // AAPCS has three pointers and two ints (= 32 bytes), Darwin has single
+  // pointer.
+  unsigned VaListSize = Subtarget->isTargetDarwin() ? 8 : 32;
+  const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
+  const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
+
+  return DAG.getMemcpy(Op.getOperand(0), SDLoc(Op), Op.getOperand(1),
+                       Op.getOperand(2), DAG.getConstant(VaListSize, MVT::i32),
+                       8, false, false, MachinePointerInfo(DestSV),
+                       MachinePointerInfo(SrcSV));
+}
+
+SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
+  assert(Subtarget->isTargetDarwin() &&
+         "automatic va_arg instruction only works on Darwin");
+
+  const Value *V = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
+  EVT VT = Op.getValueType();
+  SDLoc DL(Op);
+  SDValue Chain = Op.getOperand(0);
+  SDValue Addr = Op.getOperand(1);
+  unsigned Align = Op.getConstantOperandVal(3);
+
+  SDValue VAList = DAG.getLoad(getPointerTy(), DL, Chain, Addr,
+                               MachinePointerInfo(V), false, false, false, 0);
+  Chain = VAList.getValue(1);
+
+  if (Align > 8) {
+    assert(((Align & (Align - 1)) == 0) && "Expected Align to be a power of 2");
+    VAList = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList,
+                         DAG.getConstant(Align - 1, getPointerTy()));
+    VAList = DAG.getNode(ISD::AND, DL, getPointerTy(), VAList,
+                         DAG.getConstant(-(int64_t)Align, getPointerTy()));
   }
 
-  return SDValue();
+  Type *ArgTy = VT.getTypeForEVT(*DAG.getContext());
+  uint64_t ArgSize = getDataLayout()->getTypeAllocSize(ArgTy);
+
+  // Scalar integer and FP values smaller than 64 bits are implicitly extended
+  // up to 64 bits.  At the very least, we have to increase the striding of the
+  // vaargs list to match this, and for FP values we need to introduce
+  // FP_ROUND nodes as well.
+  if (VT.isInteger() && !VT.isVector())
+    ArgSize = 8;
+  bool NeedFPTrunc = false;
+  if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) {
+    ArgSize = 8;
+    NeedFPTrunc = true;
+  }
+
+  // Increment the pointer, VAList, to the next vaarg
+  SDValue VANext = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList,
+                               DAG.getConstant(ArgSize, getPointerTy()));
+  // Store the incremented VAList to the legalized pointer
+  SDValue APStore = DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V),
+                                 false, false, 0);
+
+  // Load the actual argument out of the pointer VAList
+  if (NeedFPTrunc) {
+    // Load the value as an f64.
+    SDValue WideFP = DAG.getLoad(MVT::f64, DL, APStore, VAList,
+                                 MachinePointerInfo(), false, false, false, 0);
+    // Round the value down to an f32.
+    SDValue NarrowFP = DAG.getNode(ISD::FP_ROUND, DL, VT, WideFP.getValue(0),
+                                   DAG.getIntPtrConstant(1));
+    SDValue Ops[] = { NarrowFP, WideFP.getValue(1) };
+    // Merge the rounded value with the chain output of the load.
+    return DAG.getMergeValues(Ops, DL);
+  }
+
+  return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo(), false,
+                     false, false, 0);
 }
 
-/// Check if the specified splat value corresponds to a valid vector constant
-/// for a Neon instruction with a "modified immediate" operand (e.g., MOVI).  If
-/// so, return the encoded 8-bit immediate and the OpCmode instruction fields
-/// values.
-static bool isNeonModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
-                              unsigned SplatBitSize, SelectionDAG &DAG,
-                              bool is128Bits, NeonModImmType type, EVT &VT,
-                              unsigned &Imm, unsigned &OpCmode) {
-  switch (SplatBitSize) {
+SDValue AArch64TargetLowering::LowerFRAMEADDR(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+  MFI->setFrameAddressIsTaken(true);
+
+  EVT VT = Op.getValueType();
+  SDLoc DL(Op);
+  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+  SDValue FrameAddr =
+      DAG.getCopyFromReg(DAG.getEntryNode(), DL, AArch64::FP, VT);
+  while (Depth--)
+    FrameAddr = DAG.getLoad(VT, DL, DAG.getEntryNode(), FrameAddr,
+                            MachinePointerInfo(), false, false, false, 0);
+  return FrameAddr;
+}
+
+// FIXME? Maybe this could be a TableGen attribute on some registers and
+// this table could be generated automatically from RegInfo.
+unsigned AArch64TargetLowering::getRegisterByName(const char* RegName,
+                                                  EVT VT) const {
+  unsigned Reg = StringSwitch<unsigned>(RegName)
+                       .Case("sp", AArch64::SP)
+                       .Default(0);
+  if (Reg)
+    return Reg;
+  report_fatal_error("Invalid register name global variable");
+}
+
+SDValue AArch64TargetLowering::LowerRETURNADDR(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+  MFI->setReturnAddressIsTaken(true);
+
+  EVT VT = Op.getValueType();
+  SDLoc DL(Op);
+  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+  if (Depth) {
+    SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
+    SDValue Offset = DAG.getConstant(8, getPointerTy());
+    return DAG.getLoad(VT, DL, DAG.getEntryNode(),
+                       DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset),
+                       MachinePointerInfo(), false, false, false, 0);
+  }
+
+  // Return LR, which contains the return address. Mark it an implicit live-in.
+  unsigned Reg = MF.addLiveIn(AArch64::LR, &AArch64::GPR64RegClass);
+  return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
+}
+
+/// LowerShiftRightParts - Lower SRA_PARTS, which returns two
+/// i64 values and take a 2 x i64 value to shift plus a shift amount.
+SDValue AArch64TargetLowering::LowerShiftRightParts(SDValue Op,
+                                                    SelectionDAG &DAG) const {
+  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
+  EVT VT = Op.getValueType();
+  unsigned VTBits = VT.getSizeInBits();
+  SDLoc dl(Op);
+  SDValue ShOpLo = Op.getOperand(0);
+  SDValue ShOpHi = Op.getOperand(1);
+  SDValue ShAmt = Op.getOperand(2);
+  SDValue ARMcc;
+  unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
+
+  assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
+
+  SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64,
+                                 DAG.getConstant(VTBits, MVT::i64), ShAmt);
+  SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
+  SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt,
+                                   DAG.getConstant(VTBits, MVT::i64));
+  SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
+
+  SDValue Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, MVT::i64),
+                               ISD::SETGE, dl, DAG);
+  SDValue CCVal = DAG.getConstant(AArch64CC::GE, MVT::i32);
+
+  SDValue FalseValLo = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
+  SDValue TrueValLo = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
+  SDValue Lo =
+      DAG.getNode(AArch64ISD::CSEL, dl, VT, TrueValLo, FalseValLo, CCVal, Cmp);
+
+  // AArch64 shifts larger than the register width are wrapped rather than
+  // clamped, so we can't just emit "hi >> x".
+  SDValue FalseValHi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
+  SDValue TrueValHi = Opc == ISD::SRA
+                          ? DAG.getNode(Opc, dl, VT, ShOpHi,
+                                        DAG.getConstant(VTBits - 1, MVT::i64))
+                          : DAG.getConstant(0, VT);
+  SDValue Hi =
+      DAG.getNode(AArch64ISD::CSEL, dl, VT, TrueValHi, FalseValHi, CCVal, Cmp);
+
+  SDValue Ops[2] = { Lo, Hi };
+  return DAG.getMergeValues(Ops, dl);
+}
+
+/// LowerShiftLeftParts - Lower SHL_PARTS, which returns two
+/// i64 values and take a 2 x i64 value to shift plus a shift amount.
+SDValue AArch64TargetLowering::LowerShiftLeftParts(SDValue Op,
+                                                 SelectionDAG &DAG) const {
+  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
+  EVT VT = Op.getValueType();
+  unsigned VTBits = VT.getSizeInBits();
+  SDLoc dl(Op);
+  SDValue ShOpLo = Op.getOperand(0);
+  SDValue ShOpHi = Op.getOperand(1);
+  SDValue ShAmt = Op.getOperand(2);
+  SDValue ARMcc;
+
+  assert(Op.getOpcode() == ISD::SHL_PARTS);
+  SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64,
+                                 DAG.getConstant(VTBits, MVT::i64), ShAmt);
+  SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
+  SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt,
+                                   DAG.getConstant(VTBits, MVT::i64));
+  SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
+  SDValue Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
+
+  SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
+
+  SDValue Cmp = emitComparison(ExtraShAmt, DAG.getConstant(0, MVT::i64),
+                               ISD::SETGE, dl, DAG);
+  SDValue CCVal = DAG.getConstant(AArch64CC::GE, MVT::i32);
+  SDValue Hi =
+      DAG.getNode(AArch64ISD::CSEL, dl, VT, Tmp3, FalseVal, CCVal, Cmp);
+
+  // AArch64 shifts of larger than register sizes are wrapped rather than
+  // clamped, so we can't just emit "lo << a" if a is too big.
+  SDValue TrueValLo = DAG.getConstant(0, VT);
+  SDValue FalseValLo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
+  SDValue Lo =
+      DAG.getNode(AArch64ISD::CSEL, dl, VT, TrueValLo, FalseValLo, CCVal, Cmp);
+
+  SDValue Ops[2] = { Lo, Hi };
+  return DAG.getMergeValues(Ops, dl);
+}
+
+bool AArch64TargetLowering::isOffsetFoldingLegal(
+    const GlobalAddressSDNode *GA) const {
+  // The AArch64 target doesn't support folding offsets into global addresses.
+  return false;
+}
+
+bool AArch64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
+  // We can materialize #0.0 as fmov $Rd, XZR for 64-bit and 32-bit cases.
+  // FIXME: We should be able to handle f128 as well with a clever lowering.
+  if (Imm.isPosZero() && (VT == MVT::f64 || VT == MVT::f32))
+    return true;
+
+  if (VT == MVT::f64)
+    return AArch64_AM::getFP64Imm(Imm) != -1;
+  else if (VT == MVT::f32)
+    return AArch64_AM::getFP32Imm(Imm) != -1;
+  return false;
+}
+
+//===----------------------------------------------------------------------===//
+//                          AArch64 Optimization Hooks
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+//                          AArch64 Inline Assembly Support
+//===----------------------------------------------------------------------===//
+
+// Table of Constraints
+// TODO: This is the current set of constraints supported by ARM for the
+// compiler, not all of them may make sense, e.g. S may be difficult to support.
+//
+// r - A general register
+// w - An FP/SIMD register of some size in the range v0-v31
+// x - An FP/SIMD register of some size in the range v0-v15
+// I - Constant that can be used with an ADD instruction
+// J - Constant that can be used with a SUB instruction
+// K - Constant that can be used with a 32-bit logical instruction
+// L - Constant that can be used with a 64-bit logical instruction
+// M - Constant that can be used as a 32-bit MOV immediate
+// N - Constant that can be used as a 64-bit MOV immediate
+// Q - A memory reference with base register and no offset
+// S - A symbolic address
+// Y - Floating point constant zero
+// Z - Integer constant zero
+//
+//   Note that general register operands will be output using their 64-bit x
+// register name, whatever the size of the variable, unless the asm operand
+// is prefixed by the %w modifier. Floating-point and SIMD register operands
+// will be output with the v prefix unless prefixed by the %b, %h, %s, %d or
+// %q modifier.
+
+/// getConstraintType - Given a constraint letter, return the type of
+/// constraint it is for this target.
+AArch64TargetLowering::ConstraintType
+AArch64TargetLowering::getConstraintType(const std::string &Constraint) const {
+  if (Constraint.size() == 1) {
+    switch (Constraint[0]) {
+    default:
+      break;
+    case 'z':
+      return C_Other;
+    case 'x':
+    case 'w':
+      return C_RegisterClass;
+    // An address with a single base register. Due to the way we
+    // currently handle addresses it is the same as 'r'.
+    case 'Q':
+      return C_Memory;
+    }
+  }
+  return TargetLowering::getConstraintType(Constraint);
+}
+
+/// Examine constraint type and operand type and determine a weight value.
+/// This object must already have been set up with the operand type
+/// and the current alternative constraint selected.
+TargetLowering::ConstraintWeight
+AArch64TargetLowering::getSingleConstraintMatchWeight(
+    AsmOperandInfo &info, const char *constraint) const {
+  ConstraintWeight weight = CW_Invalid;
+  Value *CallOperandVal = info.CallOperandVal;
+  // If we don't have a value, we can't do a match,
+  // but allow it at the lowest weight.
+  if (!CallOperandVal)
+    return CW_Default;
+  Type *type = CallOperandVal->getType();
+  // Look at the constraint type.
+  switch (*constraint) {
   default:
-    llvm_unreachable("unexpected size for isNeonModifiedImm");
-  case 8: {
-    if (type != Neon_Mov_Imm)
-      return false;
-    assert((SplatBits & ~0xff) == 0 && "one byte splat value is too big");
-    // Neon movi per byte: Op=0, Cmode=1110.
-    OpCmode = 0xe;
-    Imm = SplatBits;
-    VT = is128Bits ? MVT::v16i8 : MVT::v8i8;
+    weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
+    break;
+  case 'x':
+  case 'w':
+    if (type->isFloatingPointTy() || type->isVectorTy())
+      weight = CW_Register;
+    break;
+  case 'z':
+    weight = CW_Constant;
     break;
   }
-  case 16: {
-    // Neon move inst per halfword
-    VT = is128Bits ? MVT::v8i16 : MVT::v4i16;
-    if ((SplatBits & ~0xff) == 0) {
-      // Value = 0x00nn is 0x00nn LSL 0
-      // movi: Op=0, Cmode=1000; mvni: Op=1, Cmode=1000
-      // bic:  Op=1, Cmode=1001;  orr:  Op=0, Cmode=1001
-      // Op=x, Cmode=100y
-      Imm = SplatBits;
-      OpCmode = 0x8;
+  return weight;
+}
+
+std::pair<unsigned, const TargetRegisterClass *>
+AArch64TargetLowering::getRegForInlineAsmConstraint(
+    const std::string &Constraint, MVT VT) const {
+  if (Constraint.size() == 1) {
+    switch (Constraint[0]) {
+    case 'r':
+      if (VT.getSizeInBits() == 64)
+        return std::make_pair(0U, &AArch64::GPR64commonRegClass);
+      return std::make_pair(0U, &AArch64::GPR32commonRegClass);
+    case 'w':
+      if (VT == MVT::f32)
+        return std::make_pair(0U, &AArch64::FPR32RegClass);
+      if (VT.getSizeInBits() == 64)
+        return std::make_pair(0U, &AArch64::FPR64RegClass);
+      if (VT.getSizeInBits() == 128)
+        return std::make_pair(0U, &AArch64::FPR128RegClass);
       break;
-    }
-    if ((SplatBits & ~0xff00) == 0) {
-      // Value = 0xnn00 is 0x00nn LSL 8
-      // movi: Op=0, Cmode=1010; mvni: Op=1, Cmode=1010
-      // bic:  Op=1, Cmode=1011;  orr:  Op=0, Cmode=1011
-      // Op=x, Cmode=101x
-      Imm = SplatBits >> 8;
-      OpCmode = 0xa;
+    // The instructions that this constraint is designed for can
+    // only take 128-bit registers so just use that regclass.
+    case 'x':
+      if (VT.getSizeInBits() == 128)
+        return std::make_pair(0U, &AArch64::FPR128_loRegClass);
       break;
     }
-    // can't handle any other
-    return false;
   }
+  if (StringRef("{cc}").equals_lower(Constraint))
+    return std::make_pair(unsigned(AArch64::NZCV), &AArch64::CCRRegClass);
 
-  case 32: {
-    // First the LSL variants (MSL is unusable by some interested instructions).
+  // Use the default implementation in TargetLowering to convert the register
+  // constraint into a member of a register class.
+  std::pair<unsigned, const TargetRegisterClass *> Res;
+  Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
+
+  // Not found as a standard register?
+  if (!Res.second) {
+    unsigned Size = Constraint.size();
+    if ((Size == 4 || Size == 5) && Constraint[0] == '{' &&
+        tolower(Constraint[1]) == 'v' && Constraint[Size - 1] == '}') {
+      const std::string Reg =
+          std::string(&Constraint[2], &Constraint[Size - 1]);
+      int RegNo = atoi(Reg.c_str());
+      if (RegNo >= 0 && RegNo <= 31) {
+        // v0 - v31 are aliases of q0 - q31.
+        // By default we'll emit v0-v31 for this unless there's a modifier where
+        // we'll emit the correct register as well.
+        Res.first = AArch64::FPR128RegClass.getRegister(RegNo);
+        Res.second = &AArch64::FPR128RegClass;
+      }
+    }
+  }
 
-    // Neon move instr per word, shift zeros
-    VT = is128Bits ? MVT::v4i32 : MVT::v2i32;
-    if ((SplatBits & ~0xff) == 0) {
-      // Value = 0x000000nn is 0x000000nn LSL 0
-      // movi: Op=0, Cmode= 0000; mvni: Op=1, Cmode= 0000
-      // bic:  Op=1, Cmode= 0001; orr:  Op=0, Cmode= 0001
-      // Op=x, Cmode=000x
-      Imm = SplatBits;
-      OpCmode = 0;
-      break;
+  return Res;
+}
+
+/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
+/// vector.  If it is invalid, don't add anything to Ops.
+void AArch64TargetLowering::LowerAsmOperandForConstraint(
+    SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops,
+    SelectionDAG &DAG) const {
+  SDValue Result;
+
+  // Currently only support length 1 constraints.
+  if (Constraint.length() != 1)
+    return;
+
+  char ConstraintLetter = Constraint[0];
+  switch (ConstraintLetter) {
+  default:
+    break;
+
+  // This set of constraints deal with valid constants for various instructions.
+  // Validate and return a target constant for them if we can.
+  case 'z': {
+    // 'z' maps to xzr or wzr so it needs an input of 0.
+    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
+    if (!C || C->getZExtValue() != 0)
+      return;
+
+    if (Op.getValueType() == MVT::i64)
+      Result = DAG.getRegister(AArch64::XZR, MVT::i64);
+    else
+      Result = DAG.getRegister(AArch64::WZR, MVT::i32);
+    break;
+  }
+
+  case 'I':
+  case 'J':
+  case 'K':
+  case 'L':
+  case 'M':
+  case 'N':
+    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
+    if (!C)
+      return;
+
+    // Grab the value and do some validation.
+    uint64_t CVal = C->getZExtValue();
+    switch (ConstraintLetter) {
+    // The I constraint applies only to simple ADD or SUB immediate operands:
+    // i.e. 0 to 4095 with optional shift by 12
+    // The J constraint applies only to ADD or SUB immediates that would be
+    // valid when negated, i.e. if [an add pattern] were to be output as a SUB
+    // instruction [or vice versa], in other words -1 to -4095 with optional
+    // left shift by 12.
+    case 'I':
+      if (isUInt<12>(CVal) || isShiftedUInt<12, 12>(CVal))
+        break;
+      return;
+    case 'J': {
+      uint64_t NVal = -C->getSExtValue();
+      if (isUInt<12>(NVal) || isShiftedUInt<12, 12>(NVal))
+        break;
+      return;
     }
-    if ((SplatBits & ~0xff00) == 0) {
-      // Value = 0x0000nn00 is 0x000000nn LSL 8
-      // movi: Op=0, Cmode= 0010;  mvni: Op=1, Cmode= 0010
-      // bic:  Op=1, Cmode= 0011;  orr : Op=0, Cmode= 0011
-      // Op=x, Cmode=001x
-      Imm = SplatBits >> 8;
-      OpCmode = 0x2;
-      break;
+    // The K and L constraints apply *only* to logical immediates, including
+    // what used to be the MOVI alias for ORR (though the MOVI alias has now
+    // been removed and MOV should be used). So these constraints have to
+    // distinguish between bit patterns that are valid 32-bit or 64-bit
+    // "bitmask immediates": for example 0xaaaaaaaa is a valid bimm32 (K), but
+    // not a valid bimm64 (L) where 0xaaaaaaaaaaaaaaaa would be valid, and vice
+    // versa.
+    case 'K':
+      if (AArch64_AM::isLogicalImmediate(CVal, 32))
+        break;
+      return;
+    case 'L':
+      if (AArch64_AM::isLogicalImmediate(CVal, 64))
+        break;
+      return;
+    // The M and N constraints are a superset of K and L respectively, for use
+    // with the MOV (immediate) alias. As well as the logical immediates they
+    // also match 32 or 64-bit immediates that can be loaded either using a
+    // *single* MOVZ or MOVN , such as 32-bit 0x12340000, 0x00001234, 0xffffedca
+    // (M) or 64-bit 0x1234000000000000 (N) etc.
+    // As a note some of this code is liberally stolen from the asm parser.
+    case 'M': {
+      if (!isUInt<32>(CVal))
+        return;
+      if (AArch64_AM::isLogicalImmediate(CVal, 32))
+        break;
+      if ((CVal & 0xFFFF) == CVal)
+        break;
+      if ((CVal & 0xFFFF0000ULL) == CVal)
+        break;
+      uint64_t NCVal = ~(uint32_t)CVal;
+      if ((NCVal & 0xFFFFULL) == NCVal)
+        break;
+      if ((NCVal & 0xFFFF0000ULL) == NCVal)
+        break;
+      return;
     }
-    if ((SplatBits & ~0xff0000) == 0) {
-      // Value = 0x00nn0000 is 0x000000nn LSL 16
-      // movi: Op=0, Cmode= 0100; mvni: Op=1, Cmode= 0100
-      // bic:  Op=1, Cmode= 0101; orr:  Op=0, Cmode= 0101
-      // Op=x, Cmode=010x
-      Imm = SplatBits >> 16;
-      OpCmode = 0x4;
-      break;
+    case 'N': {
+      if (AArch64_AM::isLogicalImmediate(CVal, 64))
+        break;
+      if ((CVal & 0xFFFFULL) == CVal)
+        break;
+      if ((CVal & 0xFFFF0000ULL) == CVal)
+        break;
+      if ((CVal & 0xFFFF00000000ULL) == CVal)
+        break;
+      if ((CVal & 0xFFFF000000000000ULL) == CVal)
+        break;
+      uint64_t NCVal = ~CVal;
+      if ((NCVal & 0xFFFFULL) == NCVal)
+        break;
+      if ((NCVal & 0xFFFF0000ULL) == NCVal)
+        break;
+      if ((NCVal & 0xFFFF00000000ULL) == NCVal)
+        break;
+      if ((NCVal & 0xFFFF000000000000ULL) == NCVal)
+        break;
+      return;
     }
-    if ((SplatBits & ~0xff000000) == 0) {
-      // Value = 0xnn000000 is 0x000000nn LSL 24
-      // movi: Op=0, Cmode= 0110; mvni: Op=1, Cmode= 0110
-      // bic:  Op=1, Cmode= 0111; orr:  Op=0, Cmode= 0111
-      // Op=x, Cmode=011x
-      Imm = SplatBits >> 24;
-      OpCmode = 0x6;
-      break;
+    default:
+      return;
     }
 
-    // Now the MSL immediates.
+    // All assembler immediates are 64-bit integers.
+    Result = DAG.getTargetConstant(CVal, MVT::i64);
+    break;
+  }
 
-    // Neon move instr per word, shift ones
-    if ((SplatBits & ~0xffff) == 0 &&
-        ((SplatBits | SplatUndef) & 0xff) == 0xff) {
-      // Value = 0x0000nnff is 0x000000nn MSL 8
-      // movi: Op=0, Cmode= 1100; mvni: Op=1, Cmode= 1100
-      // Op=x, Cmode=1100
-      Imm = SplatBits >> 8;
-      OpCmode = 0xc;
-      break;
+  if (Result.getNode()) {
+    Ops.push_back(Result);
+    return;
+  }
+
+  return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
+}
+
+//===----------------------------------------------------------------------===//
+//                     AArch64 Advanced SIMD Support
+//===----------------------------------------------------------------------===//
+
+/// WidenVector - Given a value in the V64 register class, produce the
+/// equivalent value in the V128 register class.
+static SDValue WidenVector(SDValue V64Reg, SelectionDAG &DAG) {
+  EVT VT = V64Reg.getValueType();
+  unsigned NarrowSize = VT.getVectorNumElements();
+  MVT EltTy = VT.getVectorElementType().getSimpleVT();
+  MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize);
+  SDLoc DL(V64Reg);
+
+  return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideTy, DAG.getUNDEF(WideTy),
+                     V64Reg, DAG.getConstant(0, MVT::i32));
+}
+
+/// getExtFactor - Determine the adjustment factor for the position when
+/// generating an "extract from vector registers" instruction.
+static unsigned getExtFactor(SDValue &V) {
+  EVT EltType = V.getValueType().getVectorElementType();
+  return EltType.getSizeInBits() / 8;
+}
+
+/// NarrowVector - Given a value in the V128 register class, produce the
+/// equivalent value in the V64 register class.
+static SDValue NarrowVector(SDValue V128Reg, SelectionDAG &DAG) {
+  EVT VT = V128Reg.getValueType();
+  unsigned WideSize = VT.getVectorNumElements();
+  MVT EltTy = VT.getVectorElementType().getSimpleVT();
+  MVT NarrowTy = MVT::getVectorVT(EltTy, WideSize / 2);
+  SDLoc DL(V128Reg);
+
+  return DAG.getTargetExtractSubreg(AArch64::dsub, DL, NarrowTy, V128Reg);
+}
+
+// Gather data to see if the operation can be modelled as a
+// shuffle in combination with VEXTs.
+SDValue AArch64TargetLowering::ReconstructShuffle(SDValue Op,
+                                                  SelectionDAG &DAG) const {
+  SDLoc dl(Op);
+  EVT VT = Op.getValueType();
+  unsigned NumElts = VT.getVectorNumElements();
+
+  SmallVector<SDValue, 2> SourceVecs;
+  SmallVector<unsigned, 2> MinElts;
+  SmallVector<unsigned, 2> MaxElts;
+
+  for (unsigned i = 0; i < NumElts; ++i) {
+    SDValue V = Op.getOperand(i);
+    if (V.getOpcode() == ISD::UNDEF)
+      continue;
+    else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) {
+      // A shuffle can only come from building a vector from various
+      // elements of other vectors.
+      return SDValue();
     }
-    if ((SplatBits & ~0xffffff) == 0 &&
-        ((SplatBits | SplatUndef) & 0xffff) == 0xffff) {
-      // Value = 0x00nnffff is 0x000000nn MSL 16
-      // movi: Op=1, Cmode= 1101; mvni: Op=1, Cmode= 1101
-      // Op=x, Cmode=1101
-      Imm = SplatBits >> 16;
-      OpCmode = 0xd;
-      break;
+
+    // Record this extraction against the appropriate vector if possible...
+    SDValue SourceVec = V.getOperand(0);
+    unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue();
+    bool FoundSource = false;
+    for (unsigned j = 0; j < SourceVecs.size(); ++j) {
+      if (SourceVecs[j] == SourceVec) {
+        if (MinElts[j] > EltNo)
+          MinElts[j] = EltNo;
+        if (MaxElts[j] < EltNo)
+          MaxElts[j] = EltNo;
+        FoundSource = true;
+        break;
+      }
+    }
+
+    // Or record a new source if not...
+    if (!FoundSource) {
+      SourceVecs.push_back(SourceVec);
+      MinElts.push_back(EltNo);
+      MaxElts.push_back(EltNo);
+    }
+  }
+
+  // Currently only do something sane when at most two source vectors
+  // involved.
+  if (SourceVecs.size() > 2)
+    return SDValue();
+
+  SDValue ShuffleSrcs[2] = { DAG.getUNDEF(VT), DAG.getUNDEF(VT) };
+  int VEXTOffsets[2] = { 0, 0 };
+
+  // This loop extracts the usage patterns of the source vectors
+  // and prepares appropriate SDValues for a shuffle if possible.
+  for (unsigned i = 0; i < SourceVecs.size(); ++i) {
+    if (SourceVecs[i].getValueType() == VT) {
+      // No VEXT necessary
+      ShuffleSrcs[i] = SourceVecs[i];
+      VEXTOffsets[i] = 0;
+      continue;
+    } else if (SourceVecs[i].getValueType().getVectorNumElements() < NumElts) {
+      // We can pad out the smaller vector for free, so if it's part of a
+      // shuffle...
+      ShuffleSrcs[i] = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, SourceVecs[i],
+                                   DAG.getUNDEF(SourceVecs[i].getValueType()));
+      continue;
+    }
+
+    // Don't attempt to extract subvectors from BUILD_VECTOR sources
+    // that expand or trunc the original value.
+    // TODO: We can try to bitcast and ANY_EXTEND the result but
+    // we need to consider the cost of vector ANY_EXTEND, and the
+    // legality of all the types.
+    if (SourceVecs[i].getValueType().getVectorElementType() !=
+        VT.getVectorElementType())
+      return SDValue();
+
+    // Since only 64-bit and 128-bit vectors are legal on ARM and
+    // we've eliminated the other cases...
+    assert(SourceVecs[i].getValueType().getVectorNumElements() == 2 * NumElts &&
+           "unexpected vector sizes in ReconstructShuffle");
+
+    if (MaxElts[i] - MinElts[i] >= NumElts) {
+      // Span too large for a VEXT to cope
+      return SDValue();
+    }
+
+    if (MinElts[i] >= NumElts) {
+      // The extraction can just take the second half
+      VEXTOffsets[i] = NumElts;
+      ShuffleSrcs[i] =
+          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SourceVecs[i],
+                      DAG.getIntPtrConstant(NumElts));
+    } else if (MaxElts[i] < NumElts) {
+      // The extraction can just take the first half
+      VEXTOffsets[i] = 0;
+      ShuffleSrcs[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
+                                   SourceVecs[i], DAG.getIntPtrConstant(0));
+    } else {
+      // An actual VEXT is needed
+      VEXTOffsets[i] = MinElts[i];
+      SDValue VEXTSrc1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
+                                     SourceVecs[i], DAG.getIntPtrConstant(0));
+      SDValue VEXTSrc2 =
+          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SourceVecs[i],
+                      DAG.getIntPtrConstant(NumElts));
+      unsigned Imm = VEXTOffsets[i] * getExtFactor(VEXTSrc1);
+      ShuffleSrcs[i] = DAG.getNode(AArch64ISD::EXT, dl, VT, VEXTSrc1, VEXTSrc2,
+                                   DAG.getConstant(Imm, MVT::i32));
+    }
+  }
+
+  SmallVector<int, 8> Mask;
+
+  for (unsigned i = 0; i < NumElts; ++i) {
+    SDValue Entry = Op.getOperand(i);
+    if (Entry.getOpcode() == ISD::UNDEF) {
+      Mask.push_back(-1);
+      continue;
+    }
+
+    SDValue ExtractVec = Entry.getOperand(0);
+    int ExtractElt =
+        cast<ConstantSDNode>(Op.getOperand(i).getOperand(1))->getSExtValue();
+    if (ExtractVec == SourceVecs[0]) {
+      Mask.push_back(ExtractElt - VEXTOffsets[0]);
+    } else {
+      Mask.push_back(ExtractElt + NumElts - VEXTOffsets[1]);
     }
-    // can't handle any other
+  }
+
+  // Final check before we try to produce nonsense...
+  if (isShuffleMaskLegal(Mask, VT))
+    return DAG.getVectorShuffle(VT, dl, ShuffleSrcs[0], ShuffleSrcs[1],
+                                &Mask[0]);
+
+  return SDValue();
+}
+
+// check if an EXT instruction can handle the shuffle mask when the
+// vector sources of the shuffle are the same.
+static bool isSingletonEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
+  unsigned NumElts = VT.getVectorNumElements();
+
+  // Assume that the first shuffle index is not UNDEF.  Fail if it is.
+  if (M[0] < 0)
+    return false;
+
+  Imm = M[0];
+
+  // If this is a VEXT shuffle, the immediate value is the index of the first
+  // element.  The other shuffle indices must be the successive elements after
+  // the first one.
+  unsigned ExpectedElt = Imm;
+  for (unsigned i = 1; i < NumElts; ++i) {
+    // Increment the expected index.  If it wraps around, just follow it
+    // back to index zero and keep going.
+    ++ExpectedElt;
+    if (ExpectedElt == NumElts)
+      ExpectedElt = 0;
+
+    if (M[i] < 0)
+      continue; // ignore UNDEF indices
+    if (ExpectedElt != static_cast<unsigned>(M[i]))
+      return false;
+  }
+
+  return true;
+}
+
+// check if an EXT instruction can handle the shuffle mask when the
+// vector sources of the shuffle are different.
+static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT,
+                      unsigned &Imm) {
+  // Look for the first non-undef element.
+  const int *FirstRealElt = std::find_if(M.begin(), M.end(),
+      [](int Elt) {return Elt >= 0;});
+
+  // Benefit form APInt to handle overflow when calculating expected element.
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned MaskBits = APInt(32, NumElts * 2).logBase2();
+  APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1);
+  // The following shuffle indices must be the successive elements after the
+  // first real element.
+  const int *FirstWrongElt = std::find_if(FirstRealElt + 1, M.end(),
+      [&](int Elt) {return Elt != ExpectedElt++ && Elt != -1;});
+  if (FirstWrongElt != M.end())
+    return false;
+
+  // The index of an EXT is the first element if it is not UNDEF.
+  // Watch out for the beginning UNDEFs. The EXT index should be the expected
+  // value of the first element.  E.g. 
+  // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>.
+  // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>.
+  // ExpectedElt is the last mask index plus 1.
+  Imm = ExpectedElt.getZExtValue();
+
+  // There are two difference cases requiring to reverse input vectors.
+  // For example, for vector <4 x i32> we have the following cases,
+  // Case 1: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, -1, 0>)
+  // Case 2: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, 7, 0>)
+  // For both cases, we finally use mask <5, 6, 7, 0>, which requires
+  // to reverse two input vectors.
+  if (Imm < NumElts)
+    ReverseEXT = true;
+  else
+    Imm -= NumElts;
+
+  return true;
+}
+
+/// isREVMask - Check if a vector shuffle corresponds to a REV
+/// instruction with the specified blocksize.  (The order of the elements
+/// within each block of the vector is reversed.)
+static bool isREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) {
+  assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
+         "Only possible block sizes for REV are: 16, 32, 64");
+
+  unsigned EltSz = VT.getVectorElementType().getSizeInBits();
+  if (EltSz == 64)
+    return false;
+
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned BlockElts = M[0] + 1;
+  // If the first shuffle index is UNDEF, be optimistic.
+  if (M[0] < 0)
+    BlockElts = BlockSize / EltSz;
+
+  if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz)
     return false;
+
+  for (unsigned i = 0; i < NumElts; ++i) {
+    if (M[i] < 0)
+      continue; // ignore UNDEF indices
+    if ((unsigned)M[i] != (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts))
+      return false;
+  }
+
+  return true;
+}
+
+static bool isZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
+  unsigned NumElts = VT.getVectorNumElements();
+  WhichResult = (M[0] == 0 ? 0 : 1);
+  unsigned Idx = WhichResult * NumElts / 2;
+  for (unsigned i = 0; i != NumElts; i += 2) {
+    if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
+        (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx + NumElts))
+      return false;
+    Idx += 1;
+  }
+
+  return true;
+}
+
+static bool isUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
+  unsigned NumElts = VT.getVectorNumElements();
+  WhichResult = (M[0] == 0 ? 0 : 1);
+  for (unsigned i = 0; i != NumElts; ++i) {
+    if (M[i] < 0)
+      continue; // ignore UNDEF indices
+    if ((unsigned)M[i] != 2 * i + WhichResult)
+      return false;
+  }
+
+  return true;
+}
+
+static bool isTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
+  unsigned NumElts = VT.getVectorNumElements();
+  WhichResult = (M[0] == 0 ? 0 : 1);
+  for (unsigned i = 0; i < NumElts; i += 2) {
+    if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
+        (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + NumElts + WhichResult))
+      return false;
   }
+  return true;
+}
 
-  case 64: {
-    if (type != Neon_Mov_Imm)
+/// isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of
+/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
+/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
+static bool isZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
+  unsigned NumElts = VT.getVectorNumElements();
+  WhichResult = (M[0] == 0 ? 0 : 1);
+  unsigned Idx = WhichResult * NumElts / 2;
+  for (unsigned i = 0; i != NumElts; i += 2) {
+    if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
+        (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx))
       return false;
-    // Neon move instr bytemask, where each byte is either 0x00 or 0xff.
-    // movi Op=1, Cmode=1110.
-    OpCmode = 0x1e;
-    uint64_t BitMask = 0xff;
-    uint64_t Val = 0;
-    unsigned ImmMask = 1;
-    Imm = 0;
-    for (int ByteNum = 0; ByteNum < 8; ++ByteNum) {
-      if (((SplatBits | SplatUndef) & BitMask) == BitMask) {
-        Val |= BitMask;
-        Imm |= ImmMask;
-      } else if ((SplatBits & BitMask) != 0) {
+    Idx += 1;
+  }
+
+  return true;
+}
+
+/// isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of
+/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
+/// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>,
+static bool isUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
+  unsigned Half = VT.getVectorNumElements() / 2;
+  WhichResult = (M[0] == 0 ? 0 : 1);
+  for (unsigned j = 0; j != 2; ++j) {
+    unsigned Idx = WhichResult;
+    for (unsigned i = 0; i != Half; ++i) {
+      int MIdx = M[i + j * Half];
+      if (MIdx >= 0 && (unsigned)MIdx != Idx)
         return false;
-      }
-      BitMask <<= 8;
-      ImmMask <<= 1;
+      Idx += 2;
     }
-    SplatBits = Val;
-    VT = is128Bits ? MVT::v2i64 : MVT::v1i64;
-    break;
-  }
   }
 
   return true;
 }
 
-static SDValue PerformANDCombine(SDNode *N,
-                                 TargetLowering::DAGCombinerInfo &DCI) {
+/// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of
+/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
+/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
+static bool isTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
+  unsigned NumElts = VT.getVectorNumElements();
+  WhichResult = (M[0] == 0 ? 0 : 1);
+  for (unsigned i = 0; i < NumElts; i += 2) {
+    if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
+        (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + WhichResult))
+      return false;
+  }
+  return true;
+}
 
-  SelectionDAG &DAG = DCI.DAG;
-  SDLoc DL(N);
-  EVT VT = N->getValueType(0);
+static bool isINSMask(ArrayRef<int> M, int NumInputElements,
+                      bool &DstIsLeft, int &Anomaly) {
+  if (M.size() != static_cast<size_t>(NumInputElements))
+    return false;
 
-  // We're looking for an SRA/SHL pair which form an SBFX.
+  int NumLHSMatch = 0, NumRHSMatch = 0;
+  int LastLHSMismatch = -1, LastRHSMismatch = -1;
 
-  if (VT != MVT::i32 && VT != MVT::i64)
-    return SDValue();
+  for (int i = 0; i < NumInputElements; ++i) {
+    if (M[i] == -1) {
+      ++NumLHSMatch;
+      ++NumRHSMatch;
+      continue;
+    }
 
-  if (!isa<ConstantSDNode>(N->getOperand(1)))
-    return SDValue();
+    if (M[i] == i)
+      ++NumLHSMatch;
+    else
+      LastLHSMismatch = i;
 
-  uint64_t TruncMask = N->getConstantOperandVal(1);
-  if (!isMask_64(TruncMask))
-    return SDValue();
+    if (M[i] == i + NumInputElements)
+      ++NumRHSMatch;
+    else
+      LastRHSMismatch = i;
+  }
 
-  uint64_t Width = CountPopulation_64(TruncMask);
-  SDValue Shift = N->getOperand(0);
+  if (NumLHSMatch == NumInputElements - 1) {
+    DstIsLeft = true;
+    Anomaly = LastLHSMismatch;
+    return true;
+  } else if (NumRHSMatch == NumInputElements - 1) {
+    DstIsLeft = false;
+    Anomaly = LastRHSMismatch;
+    return true;
+  }
 
-  if (Shift.getOpcode() != ISD::SRL)
-    return SDValue();
+  return false;
+}
 
-  if (!isa<ConstantSDNode>(Shift->getOperand(1)))
+static bool isConcatMask(ArrayRef<int> Mask, EVT VT, bool SplitLHS) {
+  if (VT.getSizeInBits() != 128)
+    return false;
+
+  unsigned NumElts = VT.getVectorNumElements();
+
+  for (int I = 0, E = NumElts / 2; I != E; I++) {
+    if (Mask[I] != I)
+      return false;
+  }
+
+  int Offset = NumElts / 2;
+  for (int I = NumElts / 2, E = NumElts; I != E; I++) {
+    if (Mask[I] != I + SplitLHS * Offset)
+      return false;
+  }
+
+  return true;
+}
+
+static SDValue tryFormConcatFromShuffle(SDValue Op, SelectionDAG &DAG) {
+  SDLoc DL(Op);
+  EVT VT = Op.getValueType();
+  SDValue V0 = Op.getOperand(0);
+  SDValue V1 = Op.getOperand(1);
+  ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op)->getMask();
+
+  if (VT.getVectorElementType() != V0.getValueType().getVectorElementType() ||
+      VT.getVectorElementType() != V1.getValueType().getVectorElementType())
     return SDValue();
-  uint64_t LSB = Shift->getConstantOperandVal(1);
 
-  if (LSB > VT.getSizeInBits() || Width > VT.getSizeInBits())
+  bool SplitV0 = V0.getValueType().getSizeInBits() == 128;
+
+  if (!isConcatMask(Mask, VT, SplitV0))
     return SDValue();
 
-  return DAG.getNode(AArch64ISD::UBFX, DL, VT, Shift.getOperand(0),
-                     DAG.getConstant(LSB, MVT::i64),
-                     DAG.getConstant(LSB + Width - 1, MVT::i64));
-}
-
-/// For a true bitfield insert, the bits getting into that contiguous mask
-/// should come from the low part of an existing value: they must be formed from
-/// a compatible SHL operation (unless they're already low). This function
-/// checks that condition and returns the least-significant bit that's
-/// intended. If the operation not a field preparation, -1 is returned.
-static int32_t getLSBForBFI(SelectionDAG &DAG, SDLoc DL, EVT VT,
-                            SDValue &MaskedVal, uint64_t Mask) {
-  if (!isShiftedMask_64(Mask))
-    return -1;
-
-  // Now we need to alter MaskedVal so that it is an appropriate input for a BFI
-  // instruction. BFI will do a left-shift by LSB before applying the mask we've
-  // spotted, so in general we should pre-emptively "undo" that by making sure
-  // the incoming bits have had a right-shift applied to them.
-  //
-  // This right shift, however, will combine with existing left/right shifts. In
-  // the simplest case of a completely straight bitfield operation, it will be
-  // expected to completely cancel out with an existing SHL. More complicated
-  // cases (e.g. bitfield to bitfield copy) may still need a real shift before
-  // the BFI.
-
-  uint64_t LSB = countTrailingZeros(Mask);
-  int64_t ShiftRightRequired = LSB;
-  if (MaskedVal.getOpcode() == ISD::SHL &&
-      isa<ConstantSDNode>(MaskedVal.getOperand(1))) {
-    ShiftRightRequired -= MaskedVal.getConstantOperandVal(1);
-    MaskedVal = MaskedVal.getOperand(0);
-  } else if (MaskedVal.getOpcode() == ISD::SRL &&
-             isa<ConstantSDNode>(MaskedVal.getOperand(1))) {
-    ShiftRightRequired += MaskedVal.getConstantOperandVal(1);
-    MaskedVal = MaskedVal.getOperand(0);
-  }
-
-  if (ShiftRightRequired > 0)
-    MaskedVal = DAG.getNode(ISD::SRL, DL, VT, MaskedVal,
-                            DAG.getConstant(ShiftRightRequired, MVT::i64));
-  else if (ShiftRightRequired < 0) {
-    // We could actually end up with a residual left shift, for example with
-    // "struc.bitfield = val << 1".
-    MaskedVal = DAG.getNode(ISD::SHL, DL, VT, MaskedVal,
-                            DAG.getConstant(-ShiftRightRequired, MVT::i64));
-  }
-
-  return LSB;
-}
-
-/// Searches from N for an existing AArch64ISD::BFI node, possibly surrounded by
-/// a mask and an extension. Returns true if a BFI was found and provides
-/// information on its surroundings.
-static bool findMaskedBFI(SDValue N, SDValue &BFI, uint64_t &Mask,
-                          bool &Extended) {
-  Extended = false;
-  if (N.getOpcode() == ISD::ZERO_EXTEND) {
-    Extended = true;
-    N = N.getOperand(0);
-  }
-
-  if (N.getOpcode() == ISD::AND && isa<ConstantSDNode>(N.getOperand(1))) {
-    Mask = N->getConstantOperandVal(1);
-    N = N.getOperand(0);
+  EVT CastVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
+                                VT.getVectorNumElements() / 2);
+  if (SplitV0) {
+    V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V0,
+                     DAG.getConstant(0, MVT::i64));
+  }
+  if (V1.getValueType().getSizeInBits() == 128) {
+    V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V1,
+                     DAG.getConstant(0, MVT::i64));
+  }
+  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, V0, V1);
+}
+
+/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
+/// the specified operations to build the shuffle.
+static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
+                                      SDValue RHS, SelectionDAG &DAG,
+                                      SDLoc dl) {
+  unsigned OpNum = (PFEntry >> 26) & 0x0F;
+  unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1);
+  unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1);
+
+  enum {
+    OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
+    OP_VREV,
+    OP_VDUP0,
+    OP_VDUP1,
+    OP_VDUP2,
+    OP_VDUP3,
+    OP_VEXT1,
+    OP_VEXT2,
+    OP_VEXT3,
+    OP_VUZPL, // VUZP, left result
+    OP_VUZPR, // VUZP, right result
+    OP_VZIPL, // VZIP, left result
+    OP_VZIPR, // VZIP, right result
+    OP_VTRNL, // VTRN, left result
+    OP_VTRNR  // VTRN, right result
+  };
+
+  if (OpNum == OP_COPY) {
+    if (LHSID == (1 * 9 + 2) * 9 + 3)
+      return LHS;
+    assert(LHSID == ((4 * 9 + 5) * 9 + 6) * 9 + 7 && "Illegal OP_COPY!");
+    return RHS;
+  }
+
+  SDValue OpLHS, OpRHS;
+  OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
+  OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl);
+  EVT VT = OpLHS.getValueType();
+
+  switch (OpNum) {
+  default:
+    llvm_unreachable("Unknown shuffle opcode!");
+  case OP_VREV:
+    // VREV divides the vector in half and swaps within the half.
+    if (VT.getVectorElementType() == MVT::i32 ||
+        VT.getVectorElementType() == MVT::f32)
+      return DAG.getNode(AArch64ISD::REV64, dl, VT, OpLHS);
+    // vrev <4 x i16> -> REV32
+    if (VT.getVectorElementType() == MVT::i16)
+      return DAG.getNode(AArch64ISD::REV32, dl, VT, OpLHS);
+    // vrev <4 x i8> -> REV16
+    assert(VT.getVectorElementType() == MVT::i8);
+    return DAG.getNode(AArch64ISD::REV16, dl, VT, OpLHS);
+  case OP_VDUP0:
+  case OP_VDUP1:
+  case OP_VDUP2:
+  case OP_VDUP3: {
+    EVT EltTy = VT.getVectorElementType();
+    unsigned Opcode;
+    if (EltTy == MVT::i8)
+      Opcode = AArch64ISD::DUPLANE8;
+    else if (EltTy == MVT::i16)
+      Opcode = AArch64ISD::DUPLANE16;
+    else if (EltTy == MVT::i32 || EltTy == MVT::f32)
+      Opcode = AArch64ISD::DUPLANE32;
+    else if (EltTy == MVT::i64 || EltTy == MVT::f64)
+      Opcode = AArch64ISD::DUPLANE64;
+    else
+      llvm_unreachable("Invalid vector element type?");
+
+    if (VT.getSizeInBits() == 64)
+      OpLHS = WidenVector(OpLHS, DAG);
+    SDValue Lane = DAG.getConstant(OpNum - OP_VDUP0, MVT::i64);
+    return DAG.getNode(Opcode, dl, VT, OpLHS, Lane);
+  }
+  case OP_VEXT1:
+  case OP_VEXT2:
+  case OP_VEXT3: {
+    unsigned Imm = (OpNum - OP_VEXT1 + 1) * getExtFactor(OpLHS);
+    return DAG.getNode(AArch64ISD::EXT, dl, VT, OpLHS, OpRHS,
+                       DAG.getConstant(Imm, MVT::i32));
+  }
+  case OP_VUZPL:
+    return DAG.getNode(AArch64ISD::UZP1, dl, DAG.getVTList(VT, VT), OpLHS,
+                       OpRHS);
+  case OP_VUZPR:
+    return DAG.getNode(AArch64ISD::UZP2, dl, DAG.getVTList(VT, VT), OpLHS,
+                       OpRHS);
+  case OP_VZIPL:
+    return DAG.getNode(AArch64ISD::ZIP1, dl, DAG.getVTList(VT, VT), OpLHS,
+                       OpRHS);
+  case OP_VZIPR:
+    return DAG.getNode(AArch64ISD::ZIP2, dl, DAG.getVTList(VT, VT), OpLHS,
+                       OpRHS);
+  case OP_VTRNL:
+    return DAG.getNode(AArch64ISD::TRN1, dl, DAG.getVTList(VT, VT), OpLHS,
+                       OpRHS);
+  case OP_VTRNR:
+    return DAG.getNode(AArch64ISD::TRN2, dl, DAG.getVTList(VT, VT), OpLHS,
+                       OpRHS);
+  }
+}
+
+static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask,
+                           SelectionDAG &DAG) {
+  // Check to see if we can use the TBL instruction.
+  SDValue V1 = Op.getOperand(0);
+  SDValue V2 = Op.getOperand(1);
+  SDLoc DL(Op);
+
+  EVT EltVT = Op.getValueType().getVectorElementType();
+  unsigned BytesPerElt = EltVT.getSizeInBits() / 8;
+
+  SmallVector<SDValue, 8> TBLMask;
+  for (int Val : ShuffleMask) {
+    for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) {
+      unsigned Offset = Byte + Val * BytesPerElt;
+      TBLMask.push_back(DAG.getConstant(Offset, MVT::i32));
+    }
+  }
+
+  MVT IndexVT = MVT::v8i8;
+  unsigned IndexLen = 8;
+  if (Op.getValueType().getSizeInBits() == 128) {
+    IndexVT = MVT::v16i8;
+    IndexLen = 16;
+  }
+
+  SDValue V1Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V1);
+  SDValue V2Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V2);
+
+  SDValue Shuffle;
+  if (V2.getNode()->getOpcode() == ISD::UNDEF) {
+    if (IndexLen == 8)
+      V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst);
+    Shuffle = DAG.getNode(
+        ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
+        DAG.getConstant(Intrinsic::aarch64_neon_tbl1, MVT::i32), V1Cst,
+        DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT,
+                    makeArrayRef(TBLMask.data(), IndexLen)));
   } else {
-    // Mask is the whole width.
-    Mask = -1ULL >> (64 - N.getValueType().getSizeInBits());
+    if (IndexLen == 8) {
+      V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V2Cst);
+      Shuffle = DAG.getNode(
+          ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
+          DAG.getConstant(Intrinsic::aarch64_neon_tbl1, MVT::i32), V1Cst,
+          DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT,
+                      makeArrayRef(TBLMask.data(), IndexLen)));
+    } else {
+      // FIXME: We cannot, for the moment, emit a TBL2 instruction because we
+      // cannot currently represent the register constraints on the input
+      // table registers.
+      //  Shuffle = DAG.getNode(AArch64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst,
+      //                   DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT,
+      //                               &TBLMask[0], IndexLen));
+      Shuffle = DAG.getNode(
+          ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
+          DAG.getConstant(Intrinsic::aarch64_neon_tbl2, MVT::i32), V1Cst, V2Cst,
+          DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT,
+                      makeArrayRef(TBLMask.data(), IndexLen)));
+    }
+  }
+  return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle);
+}
+
+static unsigned getDUPLANEOp(EVT EltType) {
+  if (EltType == MVT::i8)
+    return AArch64ISD::DUPLANE8;
+  if (EltType == MVT::i16)
+    return AArch64ISD::DUPLANE16;
+  if (EltType == MVT::i32 || EltType == MVT::f32)
+    return AArch64ISD::DUPLANE32;
+  if (EltType == MVT::i64 || EltType == MVT::f64)
+    return AArch64ISD::DUPLANE64;
+
+  llvm_unreachable("Invalid vector element type?");
+}
+
+SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
+                                                   SelectionDAG &DAG) const {
+  SDLoc dl(Op);
+  EVT VT = Op.getValueType();
+
+  ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
+
+  // Convert shuffles that are directly supported on NEON to target-specific
+  // DAG nodes, instead of keeping them as shuffles and matching them again
+  // during code selection.  This is more efficient and avoids the possibility
+  // of inconsistencies between legalization and selection.
+  ArrayRef<int> ShuffleMask = SVN->getMask();
+
+  SDValue V1 = Op.getOperand(0);
+  SDValue V2 = Op.getOperand(1);
+
+  if (ShuffleVectorSDNode::isSplatMask(&ShuffleMask[0],
+                                       V1.getValueType().getSimpleVT())) {
+    int Lane = SVN->getSplatIndex();
+    // If this is undef splat, generate it via "just" vdup, if possible.
+    if (Lane == -1)
+      Lane = 0;
+
+    if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR)
+      return DAG.getNode(AArch64ISD::DUP, dl, V1.getValueType(),
+                         V1.getOperand(0));
+    // Test if V1 is a BUILD_VECTOR and the lane being referenced is a non-
+    // constant. If so, we can just reference the lane's definition directly.
+    if (V1.getOpcode() == ISD::BUILD_VECTOR &&
+        !isa<ConstantSDNode>(V1.getOperand(Lane)))
+      return DAG.getNode(AArch64ISD::DUP, dl, VT, V1.getOperand(Lane));
+
+    // Otherwise, duplicate from the lane of the input vector.
+    unsigned Opcode = getDUPLANEOp(V1.getValueType().getVectorElementType());
+
+    // SelectionDAGBuilder may have "helpfully" already extracted or conatenated
+    // to make a vector of the same size as this SHUFFLE. We can ignore the
+    // extract entirely, and canonicalise the concat using WidenVector.
+    if (V1.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
+      Lane += cast<ConstantSDNode>(V1.getOperand(1))->getZExtValue();
+      V1 = V1.getOperand(0);
+    } else if (V1.getOpcode() == ISD::CONCAT_VECTORS) {
+      unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2;
+      Lane -= Idx * VT.getVectorNumElements() / 2;
+      V1 = WidenVector(V1.getOperand(Idx), DAG);
+    } else if (VT.getSizeInBits() == 64)
+      V1 = WidenVector(V1, DAG);
+
+    return DAG.getNode(Opcode, dl, VT, V1, DAG.getConstant(Lane, MVT::i64));
+  }
+
+  if (isREVMask(ShuffleMask, VT, 64))
+    return DAG.getNode(AArch64ISD::REV64, dl, V1.getValueType(), V1, V2);
+  if (isREVMask(ShuffleMask, VT, 32))
+    return DAG.getNode(AArch64ISD::REV32, dl, V1.getValueType(), V1, V2);
+  if (isREVMask(ShuffleMask, VT, 16))
+    return DAG.getNode(AArch64ISD::REV16, dl, V1.getValueType(), V1, V2);
+
+  bool ReverseEXT = false;
+  unsigned Imm;
+  if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm)) {
+    if (ReverseEXT)
+      std::swap(V1, V2);
+    Imm *= getExtFactor(V1);
+    return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V2,
+                       DAG.getConstant(Imm, MVT::i32));
+  } else if (V2->getOpcode() == ISD::UNDEF &&
+             isSingletonEXTMask(ShuffleMask, VT, Imm)) {
+    Imm *= getExtFactor(V1);
+    return DAG.getNode(AArch64ISD::EXT, dl, V1.getValueType(), V1, V1,
+                       DAG.getConstant(Imm, MVT::i32));
+  }
+
+  unsigned WhichResult;
+  if (isZIPMask(ShuffleMask, VT, WhichResult)) {
+    unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
+    return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
+  }
+  if (isUZPMask(ShuffleMask, VT, WhichResult)) {
+    unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
+    return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
+  }
+  if (isTRNMask(ShuffleMask, VT, WhichResult)) {
+    unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
+    return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
+  }
+
+  if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
+    unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
+    return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
+  }
+  if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
+    unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
+    return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
+  }
+  if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
+    unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
+    return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
+  }
+
+  SDValue Concat = tryFormConcatFromShuffle(Op, DAG);
+  if (Concat.getNode())
+    return Concat;
+
+  bool DstIsLeft;
+  int Anomaly;
+  int NumInputElements = V1.getValueType().getVectorNumElements();
+  if (isINSMask(ShuffleMask, NumInputElements, DstIsLeft, Anomaly)) {
+    SDValue DstVec = DstIsLeft ? V1 : V2;
+    SDValue DstLaneV = DAG.getConstant(Anomaly, MVT::i64);
+
+    SDValue SrcVec = V1;
+    int SrcLane = ShuffleMask[Anomaly];
+    if (SrcLane >= NumInputElements) {
+      SrcVec = V2;
+      SrcLane -= VT.getVectorNumElements();
+    }
+    SDValue SrcLaneV = DAG.getConstant(SrcLane, MVT::i64);
+
+    EVT ScalarVT = VT.getVectorElementType();
+    if (ScalarVT.getSizeInBits() < 32)
+      ScalarVT = MVT::i32;
+
+    return DAG.getNode(
+        ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
+        DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, SrcVec, SrcLaneV),
+        DstLaneV);
+  }
+
+  // If the shuffle is not directly supported and it has 4 elements, use
+  // the PerfectShuffle-generated table to synthesize it from other shuffles.
+  unsigned NumElts = VT.getVectorNumElements();
+  if (NumElts == 4) {
+    unsigned PFIndexes[4];
+    for (unsigned i = 0; i != 4; ++i) {
+      if (ShuffleMask[i] < 0)
+        PFIndexes[i] = 8;
+      else
+        PFIndexes[i] = ShuffleMask[i];
+    }
+
+    // Compute the index in the perfect shuffle table.
+    unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
+                            PFIndexes[2] * 9 + PFIndexes[3];
+    unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
+    unsigned Cost = (PFEntry >> 30);
+
+    if (Cost <= 4)
+      return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
   }
 
-  if (N.getOpcode() == AArch64ISD::BFI) {
-    BFI = N;
+  return GenerateTBL(Op, ShuffleMask, DAG);
+}
+
+static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits,
+                               APInt &UndefBits) {
+  EVT VT = BVN->getValueType(0);
+  APInt SplatBits, SplatUndef;
+  unsigned SplatBitSize;
+  bool HasAnyUndefs;
+  if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
+    unsigned NumSplats = VT.getSizeInBits() / SplatBitSize;
+
+    for (unsigned i = 0; i < NumSplats; ++i) {
+      CnstBits <<= SplatBitSize;
+      UndefBits <<= SplatBitSize;
+      CnstBits |= SplatBits.zextOrTrunc(VT.getSizeInBits());
+      UndefBits |= (SplatBits ^ SplatUndef).zextOrTrunc(VT.getSizeInBits());
+    }
+
     return true;
   }
 
   return false;
 }
 
-/// Try to combine a subtree (rooted at an OR) into a "masked BFI" node, which
-/// is roughly equivalent to (and (BFI ...), mask). This form is used because it
-/// can often be further combined with a larger mask. Ultimately, we want mask
-/// to be 2^32-1 or 2^64-1 so the AND can be skipped.
-static SDValue tryCombineToBFI(SDNode *N,
-                               TargetLowering::DAGCombinerInfo &DCI,
-                               const AArch64Subtarget *Subtarget) {
-  SelectionDAG &DAG = DCI.DAG;
-  SDLoc DL(N);
+SDValue AArch64TargetLowering::LowerVectorAND(SDValue Op,
+                                              SelectionDAG &DAG) const {
+  BuildVectorSDNode *BVN =
+      dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode());
+  SDValue LHS = Op.getOperand(0);
+  SDLoc dl(Op);
+  EVT VT = Op.getValueType();
+
+  if (!BVN)
+    return Op;
+
+  APInt CnstBits(VT.getSizeInBits(), 0);
+  APInt UndefBits(VT.getSizeInBits(), 0);
+  if (resolveBuildVector(BVN, CnstBits, UndefBits)) {
+    // We only have BIC vector immediate instruction, which is and-not.
+    CnstBits = ~CnstBits;
+
+    // We make use of a little bit of goto ickiness in order to avoid having to
+    // duplicate the immediate matching logic for the undef toggled case.
+    bool SecondTry = false;
+  AttemptModImm:
+
+    if (CnstBits.getHiBits(64) == CnstBits.getLoBits(64)) {
+      CnstBits = CnstBits.zextOrTrunc(64);
+      uint64_t CnstVal = CnstBits.getZExtValue();
+
+      if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) {
+        CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal);
+        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+        SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
+                                  DAG.getConstant(CnstVal, MVT::i32),
+                                  DAG.getConstant(0, MVT::i32));
+        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+      }
+
+      if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) {
+        CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal);
+        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+        SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
+                                  DAG.getConstant(CnstVal, MVT::i32),
+                                  DAG.getConstant(8, MVT::i32));
+        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+      }
+
+      if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) {
+        CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal);
+        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+        SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
+                                  DAG.getConstant(CnstVal, MVT::i32),
+                                  DAG.getConstant(16, MVT::i32));
+        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+      }
+
+      if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) {
+        CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal);
+        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+        SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
+                                  DAG.getConstant(CnstVal, MVT::i32),
+                                  DAG.getConstant(24, MVT::i32));
+        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+      }
+
+      if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) {
+        CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal);
+        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
+        SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
+                                  DAG.getConstant(CnstVal, MVT::i32),
+                                  DAG.getConstant(0, MVT::i32));
+        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+      }
+
+      if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) {
+        CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal);
+        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
+        SDValue Mov = DAG.getNode(AArch64ISD::BICi, dl, MovTy, LHS,
+                                  DAG.getConstant(CnstVal, MVT::i32),
+                                  DAG.getConstant(8, MVT::i32));
+        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+      }
+    }
+
+    if (SecondTry)
+      goto FailedModImm;
+    SecondTry = true;
+    CnstBits = ~UndefBits;
+    goto AttemptModImm;
+  }
+
+// We can always fall back to a non-immediate AND.
+FailedModImm:
+  return Op;
+}
+
+// Specialized code to quickly find if PotentialBVec is a BuildVector that
+// consists of only the same constant int value, returned in reference arg
+// ConstVal
+static bool isAllConstantBuildVector(const SDValue &PotentialBVec,
+                                     uint64_t &ConstVal) {
+  BuildVectorSDNode *Bvec = dyn_cast<BuildVectorSDNode>(PotentialBVec);
+  if (!Bvec)
+    return false;
+  ConstantSDNode *FirstElt = dyn_cast<ConstantSDNode>(Bvec->getOperand(0));
+  if (!FirstElt)
+    return false;
+  EVT VT = Bvec->getValueType(0);
+  unsigned NumElts = VT.getVectorNumElements();
+  for (unsigned i = 1; i < NumElts; ++i)
+    if (dyn_cast<ConstantSDNode>(Bvec->getOperand(i)) != FirstElt)
+      return false;
+  ConstVal = FirstElt->getZExtValue();
+  return true;
+}
+
+static unsigned getIntrinsicID(const SDNode *N) {
+  unsigned Opcode = N->getOpcode();
+  switch (Opcode) {
+  default:
+    return Intrinsic::not_intrinsic;
+  case ISD::INTRINSIC_WO_CHAIN: {
+    unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
+    if (IID < Intrinsic::num_intrinsics)
+      return IID;
+    return Intrinsic::not_intrinsic;
+  }
+  }
+}
+
+// Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)),
+// to (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a
+// BUILD_VECTORs with constant element C1, C2 is a constant, and C1 == ~C2.
+// Also, logical shift right -> sri, with the same structure.
+static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
   EVT VT = N->getValueType(0);
 
-  assert(N->getOpcode() == ISD::OR && "Unexpected root");
+  if (!VT.isVector())
+    return SDValue();
 
-  // We need the LHS to be (and SOMETHING, MASK). Find out what that mask is or
-  // abandon the effort.
-  SDValue LHS = N->getOperand(0);
-  if (LHS.getOpcode() != ISD::AND)
+  SDLoc DL(N);
+
+  // Is the first op an AND?
+  const SDValue And = N->getOperand(0);
+  if (And.getOpcode() != ISD::AND)
     return SDValue();
 
-  uint64_t LHSMask;
-  if (isa<ConstantSDNode>(LHS.getOperand(1)))
-    LHSMask = LHS->getConstantOperandVal(1);
-  else
+  // Is the second op an shl or lshr?
+  SDValue Shift = N->getOperand(1);
+  // This will have been turned into: AArch64ISD::VSHL vector, #shift
+  // or AArch64ISD::VLSHR vector, #shift
+  unsigned ShiftOpc = Shift.getOpcode();
+  if ((ShiftOpc != AArch64ISD::VSHL && ShiftOpc != AArch64ISD::VLSHR))
     return SDValue();
+  bool IsShiftRight = ShiftOpc == AArch64ISD::VLSHR;
 
-  // We also need the RHS to be (and SOMETHING, MASK). Find out what that mask
-  // is or abandon the effort.
-  SDValue RHS = N->getOperand(1);
-  if (RHS.getOpcode() != ISD::AND)
+  // Is the shift amount constant?
+  ConstantSDNode *C2node = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
+  if (!C2node)
     return SDValue();
 
-  uint64_t RHSMask;
-  if (isa<ConstantSDNode>(RHS.getOperand(1)))
-    RHSMask = RHS->getConstantOperandVal(1);
-  else
+  // Is the and mask vector all constant?
+  uint64_t C1;
+  if (!isAllConstantBuildVector(And.getOperand(1), C1))
     return SDValue();
 
-  // Can't do anything if the masks are incompatible.
-  if (LHSMask & RHSMask)
+  // Is C1 == ~C2, taking into account how much one can shift elements of a
+  // particular size?
+  uint64_t C2 = C2node->getZExtValue();
+  unsigned ElemSizeInBits = VT.getVectorElementType().getSizeInBits();
+  if (C2 > ElemSizeInBits)
+    return SDValue();
+  unsigned ElemMask = (1 << ElemSizeInBits) - 1;
+  if ((C1 & ElemMask) != (~C2 & ElemMask))
     return SDValue();
 
-  // Now we need one of the masks to be a contiguous field. Without loss of
-  // generality that should be the RHS one.
-  SDValue Bitfield = LHS.getOperand(0);
-  if (getLSBForBFI(DAG, DL, VT, Bitfield, LHSMask) != -1) {
-    // We know that LHS is a candidate new value, and RHS isn't already a better
-    // one.
-    std::swap(LHS, RHS);
-    std::swap(LHSMask, RHSMask);
+  SDValue X = And.getOperand(0);
+  SDValue Y = Shift.getOperand(0);
+
+  unsigned Intrin =
+      IsShiftRight ? Intrinsic::aarch64_neon_vsri : Intrinsic::aarch64_neon_vsli;
+  SDValue ResultSLI =
+      DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
+                  DAG.getConstant(Intrin, MVT::i32), X, Y, Shift.getOperand(1));
+
+  DEBUG(dbgs() << "aarch64-lower: transformed: \n");
+  DEBUG(N->dump(&DAG));
+  DEBUG(dbgs() << "into: \n");
+  DEBUG(ResultSLI->dump(&DAG));
+
+  ++NumShiftInserts;
+  return ResultSLI;
+}
+
+SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
+                                             SelectionDAG &DAG) const {
+  // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2))
+  if (EnableAArch64SlrGeneration) {
+    SDValue Res = tryLowerToSLI(Op.getNode(), DAG);
+    if (Res.getNode())
+      return Res;
+  }
+
+  BuildVectorSDNode *BVN =
+      dyn_cast<BuildVectorSDNode>(Op.getOperand(0).getNode());
+  SDValue LHS = Op.getOperand(1);
+  SDLoc dl(Op);
+  EVT VT = Op.getValueType();
+
+  // OR commutes, so try swapping the operands.
+  if (!BVN) {
+    LHS = Op.getOperand(0);
+    BVN = dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode());
   }
+  if (!BVN)
+    return Op;
 
-  // We've done our best to put the right operands in the right places, all we
-  // can do now is check whether a BFI exists.
-  Bitfield = RHS.getOperand(0);
-  int32_t LSB = getLSBForBFI(DAG, DL, VT, Bitfield, RHSMask);
-  if (LSB == -1)
+  APInt CnstBits(VT.getSizeInBits(), 0);
+  APInt UndefBits(VT.getSizeInBits(), 0);
+  if (resolveBuildVector(BVN, CnstBits, UndefBits)) {
+    // We make use of a little bit of goto ickiness in order to avoid having to
+    // duplicate the immediate matching logic for the undef toggled case.
+    bool SecondTry = false;
+  AttemptModImm:
+
+    if (CnstBits.getHiBits(64) == CnstBits.getLoBits(64)) {
+      CnstBits = CnstBits.zextOrTrunc(64);
+      uint64_t CnstVal = CnstBits.getZExtValue();
+
+      if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) {
+        CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal);
+        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+        SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
+                                  DAG.getConstant(CnstVal, MVT::i32),
+                                  DAG.getConstant(0, MVT::i32));
+        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+      }
+
+      if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) {
+        CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal);
+        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+        SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
+                                  DAG.getConstant(CnstVal, MVT::i32),
+                                  DAG.getConstant(8, MVT::i32));
+        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+      }
+
+      if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) {
+        CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal);
+        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+        SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
+                                  DAG.getConstant(CnstVal, MVT::i32),
+                                  DAG.getConstant(16, MVT::i32));
+        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+      }
+
+      if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) {
+        CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal);
+        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+        SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
+                                  DAG.getConstant(CnstVal, MVT::i32),
+                                  DAG.getConstant(24, MVT::i32));
+        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+      }
+
+      if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) {
+        CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal);
+        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
+        SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
+                                  DAG.getConstant(CnstVal, MVT::i32),
+                                  DAG.getConstant(0, MVT::i32));
+        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+      }
+
+      if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) {
+        CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal);
+        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
+        SDValue Mov = DAG.getNode(AArch64ISD::ORRi, dl, MovTy, LHS,
+                                  DAG.getConstant(CnstVal, MVT::i32),
+                                  DAG.getConstant(8, MVT::i32));
+        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+      }
+    }
+
+    if (SecondTry)
+      goto FailedModImm;
+    SecondTry = true;
+    CnstBits = UndefBits;
+    goto AttemptModImm;
+  }
+
+// We can always fall back to a non-immediate OR.
+FailedModImm:
+  return Op;
+}
+
+SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
+                                                 SelectionDAG &DAG) const {
+  BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
+  SDLoc dl(Op);
+  EVT VT = Op.getValueType();
+
+  APInt CnstBits(VT.getSizeInBits(), 0);
+  APInt UndefBits(VT.getSizeInBits(), 0);
+  if (resolveBuildVector(BVN, CnstBits, UndefBits)) {
+    // We make use of a little bit of goto ickiness in order to avoid having to
+    // duplicate the immediate matching logic for the undef toggled case.
+    bool SecondTry = false;
+  AttemptModImm:
+
+    if (CnstBits.getHiBits(64) == CnstBits.getLoBits(64)) {
+      CnstBits = CnstBits.zextOrTrunc(64);
+      uint64_t CnstVal = CnstBits.getZExtValue();
+
+      // Certain magic vector constants (used to express things like NOT
+      // and NEG) are passed through unmodified.  This allows codegen patterns
+      // for these operations to match.  Special-purpose patterns will lower
+      // these immediates to MOVIs if it proves necessary.
+      if (VT.isInteger() && (CnstVal == 0 || CnstVal == ~0ULL))
+        return Op;
+
+      // The many faces of MOVI...
+      if (AArch64_AM::isAdvSIMDModImmType10(CnstVal)) {
+        CnstVal = AArch64_AM::encodeAdvSIMDModImmType10(CnstVal);
+        if (VT.getSizeInBits() == 128) {
+          SDValue Mov = DAG.getNode(AArch64ISD::MOVIedit, dl, MVT::v2i64,
+                                    DAG.getConstant(CnstVal, MVT::i32));
+          return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+        }
+
+        // Support the V64 version via subregister insertion.
+        SDValue Mov = DAG.getNode(AArch64ISD::MOVIedit, dl, MVT::f64,
+                                  DAG.getConstant(CnstVal, MVT::i32));
+        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+      }
+
+      if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) {
+        CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal);
+        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+        SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
+                                  DAG.getConstant(CnstVal, MVT::i32),
+                                  DAG.getConstant(0, MVT::i32));
+        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+      }
+
+      if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) {
+        CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal);
+        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+        SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
+                                  DAG.getConstant(CnstVal, MVT::i32),
+                                  DAG.getConstant(8, MVT::i32));
+        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+      }
+
+      if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) {
+        CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal);
+        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+        SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
+                                  DAG.getConstant(CnstVal, MVT::i32),
+                                  DAG.getConstant(16, MVT::i32));
+        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+      }
+
+      if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) {
+        CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal);
+        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+        SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
+                                  DAG.getConstant(CnstVal, MVT::i32),
+                                  DAG.getConstant(24, MVT::i32));
+        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+      }
+
+      if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) {
+        CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal);
+        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
+        SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
+                                  DAG.getConstant(CnstVal, MVT::i32),
+                                  DAG.getConstant(0, MVT::i32));
+        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+      }
+
+      if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) {
+        CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal);
+        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
+        SDValue Mov = DAG.getNode(AArch64ISD::MOVIshift, dl, MovTy,
+                                  DAG.getConstant(CnstVal, MVT::i32),
+                                  DAG.getConstant(8, MVT::i32));
+        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+      }
+
+      if (AArch64_AM::isAdvSIMDModImmType7(CnstVal)) {
+        CnstVal = AArch64_AM::encodeAdvSIMDModImmType7(CnstVal);
+        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+        SDValue Mov = DAG.getNode(AArch64ISD::MOVImsl, dl, MovTy,
+                                  DAG.getConstant(CnstVal, MVT::i32),
+                                  DAG.getConstant(264, MVT::i32));
+        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+      }
+
+      if (AArch64_AM::isAdvSIMDModImmType8(CnstVal)) {
+        CnstVal = AArch64_AM::encodeAdvSIMDModImmType8(CnstVal);
+        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+        SDValue Mov = DAG.getNode(AArch64ISD::MOVImsl, dl, MovTy,
+                                  DAG.getConstant(CnstVal, MVT::i32),
+                                  DAG.getConstant(272, MVT::i32));
+        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+      }
+
+      if (AArch64_AM::isAdvSIMDModImmType9(CnstVal)) {
+        CnstVal = AArch64_AM::encodeAdvSIMDModImmType9(CnstVal);
+        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8;
+        SDValue Mov = DAG.getNode(AArch64ISD::MOVI, dl, MovTy,
+                                  DAG.getConstant(CnstVal, MVT::i32));
+        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+      }
+
+      // The few faces of FMOV...
+      if (AArch64_AM::isAdvSIMDModImmType11(CnstVal)) {
+        CnstVal = AArch64_AM::encodeAdvSIMDModImmType11(CnstVal);
+        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4f32 : MVT::v2f32;
+        SDValue Mov = DAG.getNode(AArch64ISD::FMOV, dl, MovTy,
+                                  DAG.getConstant(CnstVal, MVT::i32));
+        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+      }
+
+      if (AArch64_AM::isAdvSIMDModImmType12(CnstVal) &&
+          VT.getSizeInBits() == 128) {
+        CnstVal = AArch64_AM::encodeAdvSIMDModImmType12(CnstVal);
+        SDValue Mov = DAG.getNode(AArch64ISD::FMOV, dl, MVT::v2f64,
+                                  DAG.getConstant(CnstVal, MVT::i32));
+        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+      }
+
+      // The many faces of MVNI...
+      CnstVal = ~CnstVal;
+      if (AArch64_AM::isAdvSIMDModImmType1(CnstVal)) {
+        CnstVal = AArch64_AM::encodeAdvSIMDModImmType1(CnstVal);
+        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+        SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
+                                  DAG.getConstant(CnstVal, MVT::i32),
+                                  DAG.getConstant(0, MVT::i32));
+        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+      }
+
+      if (AArch64_AM::isAdvSIMDModImmType2(CnstVal)) {
+        CnstVal = AArch64_AM::encodeAdvSIMDModImmType2(CnstVal);
+        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+        SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
+                                  DAG.getConstant(CnstVal, MVT::i32),
+                                  DAG.getConstant(8, MVT::i32));
+        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+      }
+
+      if (AArch64_AM::isAdvSIMDModImmType3(CnstVal)) {
+        CnstVal = AArch64_AM::encodeAdvSIMDModImmType3(CnstVal);
+        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+        SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
+                                  DAG.getConstant(CnstVal, MVT::i32),
+                                  DAG.getConstant(16, MVT::i32));
+        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+      }
+
+      if (AArch64_AM::isAdvSIMDModImmType4(CnstVal)) {
+        CnstVal = AArch64_AM::encodeAdvSIMDModImmType4(CnstVal);
+        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+        SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
+                                  DAG.getConstant(CnstVal, MVT::i32),
+                                  DAG.getConstant(24, MVT::i32));
+        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+      }
+
+      if (AArch64_AM::isAdvSIMDModImmType5(CnstVal)) {
+        CnstVal = AArch64_AM::encodeAdvSIMDModImmType5(CnstVal);
+        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
+        SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
+                                  DAG.getConstant(CnstVal, MVT::i32),
+                                  DAG.getConstant(0, MVT::i32));
+        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+      }
+
+      if (AArch64_AM::isAdvSIMDModImmType6(CnstVal)) {
+        CnstVal = AArch64_AM::encodeAdvSIMDModImmType6(CnstVal);
+        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
+        SDValue Mov = DAG.getNode(AArch64ISD::MVNIshift, dl, MovTy,
+                                  DAG.getConstant(CnstVal, MVT::i32),
+                                  DAG.getConstant(8, MVT::i32));
+        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+      }
+
+      if (AArch64_AM::isAdvSIMDModImmType7(CnstVal)) {
+        CnstVal = AArch64_AM::encodeAdvSIMDModImmType7(CnstVal);
+        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+        SDValue Mov = DAG.getNode(AArch64ISD::MVNImsl, dl, MovTy,
+                                  DAG.getConstant(CnstVal, MVT::i32),
+                                  DAG.getConstant(264, MVT::i32));
+        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+      }
+
+      if (AArch64_AM::isAdvSIMDModImmType8(CnstVal)) {
+        CnstVal = AArch64_AM::encodeAdvSIMDModImmType8(CnstVal);
+        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
+        SDValue Mov = DAG.getNode(AArch64ISD::MVNImsl, dl, MovTy,
+                                  DAG.getConstant(CnstVal, MVT::i32),
+                                  DAG.getConstant(272, MVT::i32));
+        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
+      }
+    }
+
+    if (SecondTry)
+      goto FailedModImm;
+    SecondTry = true;
+    CnstBits = UndefBits;
+    goto AttemptModImm;
+  }
+FailedModImm:
+
+  // Scan through the operands to find some interesting properties we can
+  // exploit:
+  //   1) If only one value is used, we can use a DUP, or
+  //   2) if only the low element is not undef, we can just insert that, or
+  //   3) if only one constant value is used (w/ some non-constant lanes),
+  //      we can splat the constant value into the whole vector then fill
+  //      in the non-constant lanes.
+  //   4) FIXME: If different constant values are used, but we can intelligently
+  //             select the values we'll be overwriting for the non-constant
+  //             lanes such that we can directly materialize the vector
+  //             some other way (MOVI, e.g.), we can be sneaky.
+  unsigned NumElts = VT.getVectorNumElements();
+  bool isOnlyLowElement = true;
+  bool usesOnlyOneValue = true;
+  bool usesOnlyOneConstantValue = true;
+  bool isConstant = true;
+  unsigned NumConstantLanes = 0;
+  SDValue Value;
+  SDValue ConstantValue;
+  for (unsigned i = 0; i < NumElts; ++i) {
+    SDValue V = Op.getOperand(i);
+    if (V.getOpcode() == ISD::UNDEF)
+      continue;
+    if (i > 0)
+      isOnlyLowElement = false;
+    if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V))
+      isConstant = false;
+
+    if (isa<ConstantSDNode>(V) || isa<ConstantFPSDNode>(V)) {
+      ++NumConstantLanes;
+      if (!ConstantValue.getNode())
+        ConstantValue = V;
+      else if (ConstantValue != V)
+        usesOnlyOneConstantValue = false;
+    }
+
+    if (!Value.getNode())
+      Value = V;
+    else if (V != Value)
+      usesOnlyOneValue = false;
+  }
+
+  if (!Value.getNode())
+    return DAG.getUNDEF(VT);
+
+  if (isOnlyLowElement)
+    return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);
+
+  // Use DUP for non-constant splats.  For f32 constant splats, reduce to
+  // i32 and try again.
+  if (usesOnlyOneValue) {
+    if (!isConstant) {
+      if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+          Value.getValueType() != VT)
+        return DAG.getNode(AArch64ISD::DUP, dl, VT, Value);
+
+      // This is actually a DUPLANExx operation, which keeps everything vectory.
+
+      // DUPLANE works on 128-bit vectors, widen it if necessary.
+      SDValue Lane = Value.getOperand(1);
+      Value = Value.getOperand(0);
+      if (Value.getValueType().getSizeInBits() == 64)
+        Value = WidenVector(Value, DAG);
+
+      unsigned Opcode = getDUPLANEOp(VT.getVectorElementType());
+      return DAG.getNode(Opcode, dl, VT, Value, Lane);
+    }
+
+    if (VT.getVectorElementType().isFloatingPoint()) {
+      SmallVector<SDValue, 8> Ops;
+      MVT NewType =
+          (VT.getVectorElementType() == MVT::f32) ? MVT::i32 : MVT::i64;
+      for (unsigned i = 0; i < NumElts; ++i)
+        Ops.push_back(DAG.getNode(ISD::BITCAST, dl, NewType, Op.getOperand(i)));
+      EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts);
+      SDValue Val = DAG.getNode(ISD::BUILD_VECTOR, dl, VecVT, Ops);
+      Val = LowerBUILD_VECTOR(Val, DAG);
+      if (Val.getNode())
+        return DAG.getNode(ISD::BITCAST, dl, VT, Val);
+    }
+  }
+
+  // If there was only one constant value used and for more than one lane,
+  // start by splatting that value, then replace the non-constant lanes. This
+  // is better than the default, which will perform a separate initialization
+  // for each lane.
+  if (NumConstantLanes > 0 && usesOnlyOneConstantValue) {
+    SDValue Val = DAG.getNode(AArch64ISD::DUP, dl, VT, ConstantValue);
+    // Now insert the non-constant lanes.
+    for (unsigned i = 0; i < NumElts; ++i) {
+      SDValue V = Op.getOperand(i);
+      SDValue LaneIdx = DAG.getConstant(i, MVT::i64);
+      if (!isa<ConstantSDNode>(V) && !isa<ConstantFPSDNode>(V)) {
+        // Note that type legalization likely mucked about with the VT of the
+        // source operand, so we may have to convert it here before inserting.
+        Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Val, V, LaneIdx);
+      }
+    }
+    return Val;
+  }
+
+  // If all elements are constants and the case above didn't get hit, fall back
+  // to the default expansion, which will generate a load from the constant
+  // pool.
+  if (isConstant)
+    return SDValue();
+
+  // Empirical tests suggest this is rarely worth it for vectors of length <= 2.
+  if (NumElts >= 4) {
+    SDValue shuffle = ReconstructShuffle(Op, DAG);
+    if (shuffle != SDValue())
+      return shuffle;
+  }
+
+  // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
+  // know the default expansion would otherwise fall back on something even
+  // worse. For a vector with one or two non-undef values, that's
+  // scalar_to_vector for the elements followed by a shuffle (provided the
+  // shuffle is valid for the target) and materialization element by element
+  // on the stack followed by a load for everything else.
+  if (!isConstant && !usesOnlyOneValue) {
+    SDValue Vec = DAG.getUNDEF(VT);
+    SDValue Op0 = Op.getOperand(0);
+    unsigned ElemSize = VT.getVectorElementType().getSizeInBits();
+    unsigned i = 0;
+    // For 32 and 64 bit types, use INSERT_SUBREG for lane zero to
+    // a) Avoid a RMW dependency on the full vector register, and
+    // b) Allow the register coalescer to fold away the copy if the
+    //    value is already in an S or D register.
+    if (Op0.getOpcode() != ISD::UNDEF && (ElemSize == 32 || ElemSize == 64)) {
+      unsigned SubIdx = ElemSize == 32 ? AArch64::ssub : AArch64::dsub;
+      MachineSDNode *N =
+          DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, dl, VT, Vec, Op0,
+                             DAG.getTargetConstant(SubIdx, MVT::i32));
+      Vec = SDValue(N, 0);
+      ++i;
+    }
+    for (; i < NumElts; ++i) {
+      SDValue V = Op.getOperand(i);
+      if (V.getOpcode() == ISD::UNDEF)
+        continue;
+      SDValue LaneIdx = DAG.getConstant(i, MVT::i64);
+      Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
+    }
+    return Vec;
+  }
+
+  // Just use the default expansion. We failed to find a better alternative.
+  return SDValue();
+}
+
+SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
+                                                      SelectionDAG &DAG) const {
+  assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!");
+
+  // Check for non-constant lane.
+  if (!isa<ConstantSDNode>(Op.getOperand(2)))
     return SDValue();
 
-  uint32_t Width = CountPopulation_64(RHSMask);
-  assert(Width && "Expected non-zero bitfield width");
+  EVT VT = Op.getOperand(0).getValueType();
 
-  SDValue BFI = DAG.getNode(AArch64ISD::BFI, DL, VT,
-                            LHS.getOperand(0), Bitfield,
-                            DAG.getConstant(LSB, MVT::i64),
-                            DAG.getConstant(Width, MVT::i64));
+  // Insertion/extraction are legal for V128 types.
+  if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
+      VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64)
+    return Op;
+
+  if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
+      VT != MVT::v1i64 && VT != MVT::v2f32)
+    return SDValue();
 
-  // Mask is trivial
-  if ((LHSMask | RHSMask) == (-1ULL >> (64 - VT.getSizeInBits())))
-    return BFI;
+  // For V64 types, we perform insertion by expanding the value
+  // to a V128 type and perform the insertion on that.
+  SDLoc DL(Op);
+  SDValue WideVec = WidenVector(Op.getOperand(0), DAG);
+  EVT WideTy = WideVec.getValueType();
 
-  return DAG.getNode(ISD::AND, DL, VT, BFI,
-                     DAG.getConstant(LHSMask | RHSMask, VT));
+  SDValue Node = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, WideTy, WideVec,
+                             Op.getOperand(1), Op.getOperand(2));
+  // Re-narrow the resultant vector.
+  return NarrowVector(Node, DAG);
 }
 
-/// Search for the bitwise combining (with careful masks) of a MaskedBFI and its
-/// original input. This is surprisingly common because SROA splits things up
-/// into i8 chunks, so the originally detected MaskedBFI may actually only act
-/// on the low (say) byte of a word. This is then orred into the rest of the
-/// word afterwards.
-///
-/// Basic input: (or (and OLDFIELD, MASK1), (MaskedBFI MASK2, OLDFIELD, ...)).
-///
-/// If MASK1 and MASK2 are compatible, we can fold the whole thing into the
-/// MaskedBFI. We can also deal with a certain amount of extend/truncate being
-/// involved.
-static SDValue tryCombineToLargerBFI(SDNode *N,
-                                     TargetLowering::DAGCombinerInfo &DCI,
-                                     const AArch64Subtarget *Subtarget) {
-  SelectionDAG &DAG = DCI.DAG;
-  SDLoc DL(N);
-  EVT VT = N->getValueType(0);
+SDValue
+AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!");
 
-  // First job is to hunt for a MaskedBFI on either the left or right. Swap
-  // operands if it's actually on the right.
-  SDValue BFI;
-  SDValue PossExtraMask;
-  uint64_t ExistingMask = 0;
-  bool Extended = false;
-  if (findMaskedBFI(N->getOperand(0), BFI, ExistingMask, Extended))
-    PossExtraMask = N->getOperand(1);
-  else if (findMaskedBFI(N->getOperand(1), BFI, ExistingMask, Extended))
-    PossExtraMask = N->getOperand(0);
-  else
+  // Check for non-constant lane.
+  if (!isa<ConstantSDNode>(Op.getOperand(1)))
+    return SDValue();
+
+  EVT VT = Op.getOperand(0).getValueType();
+
+  // Insertion/extraction are legal for V128 types.
+  if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
+      VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64)
+    return Op;
+
+  if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
+      VT != MVT::v1i64 && VT != MVT::v2f32)
+    return SDValue();
+
+  // For V64 types, we perform extraction by expanding the value
+  // to a V128 type and perform the extraction on that.
+  SDLoc DL(Op);
+  SDValue WideVec = WidenVector(Op.getOperand(0), DAG);
+  EVT WideTy = WideVec.getValueType();
+
+  EVT ExtrTy = WideTy.getVectorElementType();
+  if (ExtrTy == MVT::i16 || ExtrTy == MVT::i8)
+    ExtrTy = MVT::i32;
+
+  // For extractions, we just return the result directly.
+  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtrTy, WideVec,
+                     Op.getOperand(1));
+}
+
+SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
+                                                      SelectionDAG &DAG) const {
+  EVT VT = Op.getOperand(0).getValueType();
+  SDLoc dl(Op);
+  // Just in case...
+  if (!VT.isVector())
     return SDValue();
 
-  // We can only combine a BFI with another compatible mask.
-  if (PossExtraMask.getOpcode() != ISD::AND ||
-      !isa<ConstantSDNode>(PossExtraMask.getOperand(1)))
+  ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+  if (!Cst)
     return SDValue();
+  unsigned Val = Cst->getZExtValue();
+
+  unsigned Size = Op.getValueType().getSizeInBits();
+  if (Val == 0) {
+    switch (Size) {
+    case 8:
+      return DAG.getTargetExtractSubreg(AArch64::bsub, dl, Op.getValueType(),
+                                        Op.getOperand(0));
+    case 16:
+      return DAG.getTargetExtractSubreg(AArch64::hsub, dl, Op.getValueType(),
+                                        Op.getOperand(0));
+    case 32:
+      return DAG.getTargetExtractSubreg(AArch64::ssub, dl, Op.getValueType(),
+                                        Op.getOperand(0));
+    case 64:
+      return DAG.getTargetExtractSubreg(AArch64::dsub, dl, Op.getValueType(),
+                                        Op.getOperand(0));
+    default:
+      llvm_unreachable("Unexpected vector type in extract_subvector!");
+    }
+  }
+  // If this is extracting the upper 64-bits of a 128-bit vector, we match
+  // that directly.
+  if (Size == 64 && Val * VT.getVectorElementType().getSizeInBits() == 64)
+    return Op;
+
+  return SDValue();
+}
+
+bool AArch64TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
+                                               EVT VT) const {
+  if (VT.getVectorNumElements() == 4 &&
+      (VT.is128BitVector() || VT.is64BitVector())) {
+    unsigned PFIndexes[4];
+    for (unsigned i = 0; i != 4; ++i) {
+      if (M[i] < 0)
+        PFIndexes[i] = 8;
+      else
+        PFIndexes[i] = M[i];
+    }
+
+    // Compute the index in the perfect shuffle table.
+    unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
+                            PFIndexes[2] * 9 + PFIndexes[3];
+    unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
+    unsigned Cost = (PFEntry >> 30);
+
+    if (Cost <= 4)
+      return true;
+  }
+
+  bool DummyBool;
+  int DummyInt;
+  unsigned DummyUnsigned;
+
+  return (ShuffleVectorSDNode::isSplatMask(&M[0], VT) || isREVMask(M, VT, 64) ||
+          isREVMask(M, VT, 32) || isREVMask(M, VT, 16) ||
+          isEXTMask(M, VT, DummyBool, DummyUnsigned) ||
+          // isTBLMask(M, VT) || // FIXME: Port TBL support from ARM.
+          isTRNMask(M, VT, DummyUnsigned) || isUZPMask(M, VT, DummyUnsigned) ||
+          isZIPMask(M, VT, DummyUnsigned) ||
+          isTRN_v_undef_Mask(M, VT, DummyUnsigned) ||
+          isUZP_v_undef_Mask(M, VT, DummyUnsigned) ||
+          isZIP_v_undef_Mask(M, VT, DummyUnsigned) ||
+          isINSMask(M, VT.getVectorNumElements(), DummyBool, DummyInt) ||
+          isConcatMask(M, VT, VT.getSizeInBits() == 128));
+}
+
+/// getVShiftImm - Check if this is a valid build_vector for the immediate
+/// operand of a vector shift operation, where all the elements of the
+/// build_vector must have the same constant integer value.
+static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
+  // Ignore bit_converts.
+  while (Op.getOpcode() == ISD::BITCAST)
+    Op = Op.getOperand(0);
+  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
+  APInt SplatBits, SplatUndef;
+  unsigned SplatBitSize;
+  bool HasAnyUndefs;
+  if (!BVN || !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
+                                    HasAnyUndefs, ElementBits) ||
+      SplatBitSize > ElementBits)
+    return false;
+  Cnt = SplatBits.getSExtValue();
+  return true;
+}
+
+/// isVShiftLImm - Check if this is a valid build_vector for the immediate
+/// operand of a vector shift left operation.  That value must be in the range:
+///   0 <= Value < ElementBits for a left shift; or
+///   0 <= Value <= ElementBits for a long left shift.
+static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
+  assert(VT.isVector() && "vector shift count is not a vector type");
+  unsigned ElementBits = VT.getVectorElementType().getSizeInBits();
+  if (!getVShiftImm(Op, ElementBits, Cnt))
+    return false;
+  return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
+}
+
+/// isVShiftRImm - Check if this is a valid build_vector for the immediate
+/// operand of a vector shift right operation.  For a shift opcode, the value
+/// is positive, but for an intrinsic the value count must be negative. The
+/// absolute value must be in the range:
+///   1 <= |Value| <= ElementBits for a right shift; or
+///   1 <= |Value| <= ElementBits/2 for a narrow right shift.
+static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic,
+                         int64_t &Cnt) {
+  assert(VT.isVector() && "vector shift count is not a vector type");
+  unsigned ElementBits = VT.getVectorElementType().getSizeInBits();
+  if (!getVShiftImm(Op, ElementBits, Cnt))
+    return false;
+  if (isIntrinsic)
+    Cnt = -Cnt;
+  return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
+}
+
+SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
+                                                      SelectionDAG &DAG) const {
+  EVT VT = Op.getValueType();
+  SDLoc DL(Op);
+  int64_t Cnt;
+
+  if (!Op.getOperand(1).getValueType().isVector())
+    return Op;
+  unsigned EltSize = VT.getVectorElementType().getSizeInBits();
+
+  switch (Op.getOpcode()) {
+  default:
+    llvm_unreachable("unexpected shift opcode");
+
+  case ISD::SHL:
+    if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize)
+      return DAG.getNode(AArch64ISD::VSHL, SDLoc(Op), VT, Op.getOperand(0),
+                         DAG.getConstant(Cnt, MVT::i32));
+    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
+                       DAG.getConstant(Intrinsic::aarch64_neon_ushl, MVT::i32),
+                       Op.getOperand(0), Op.getOperand(1));
+  case ISD::SRA:
+  case ISD::SRL:
+    // Right shift immediate
+    if (isVShiftRImm(Op.getOperand(1), VT, false, false, Cnt) &&
+        Cnt < EltSize) {
+      unsigned Opc =
+          (Op.getOpcode() == ISD::SRA) ? AArch64ISD::VASHR : AArch64ISD::VLSHR;
+      return DAG.getNode(Opc, SDLoc(Op), VT, Op.getOperand(0),
+                         DAG.getConstant(Cnt, MVT::i32));
+    }
+
+    // Right shift register.  Note, there is not a shift right register
+    // instruction, but the shift left register instruction takes a signed
+    // value, where negative numbers specify a right shift.
+    unsigned Opc = (Op.getOpcode() == ISD::SRA) ? Intrinsic::aarch64_neon_sshl
+                                                : Intrinsic::aarch64_neon_ushl;
+    // negate the shift amount
+    SDValue NegShift = DAG.getNode(AArch64ISD::NEG, DL, VT, Op.getOperand(1));
+    SDValue NegShiftLeft =
+        DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
+                    DAG.getConstant(Opc, MVT::i32), Op.getOperand(0), NegShift);
+    return NegShiftLeft;
+  }
+
+  return SDValue();
+}
+
+static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS,
+                                    AArch64CC::CondCode CC, bool NoNans, EVT VT,
+                                    SDLoc dl, SelectionDAG &DAG) {
+  EVT SrcVT = LHS.getValueType();
+
+  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode());
+  APInt CnstBits(VT.getSizeInBits(), 0);
+  APInt UndefBits(VT.getSizeInBits(), 0);
+  bool IsCnst = BVN && resolveBuildVector(BVN, CnstBits, UndefBits);
+  bool IsZero = IsCnst && (CnstBits == 0);
+
+  if (SrcVT.getVectorElementType().isFloatingPoint()) {
+    switch (CC) {
+    default:
+      return SDValue();
+    case AArch64CC::NE: {
+      SDValue Fcmeq;
+      if (IsZero)
+        Fcmeq = DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
+      else
+        Fcmeq = DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
+      return DAG.getNode(AArch64ISD::NOT, dl, VT, Fcmeq);
+    }
+    case AArch64CC::EQ:
+      if (IsZero)
+        return DAG.getNode(AArch64ISD::FCMEQz, dl, VT, LHS);
+      return DAG.getNode(AArch64ISD::FCMEQ, dl, VT, LHS, RHS);
+    case AArch64CC::GE:
+      if (IsZero)
+        return DAG.getNode(AArch64ISD::FCMGEz, dl, VT, LHS);
+      return DAG.getNode(AArch64ISD::FCMGE, dl, VT, LHS, RHS);
+    case AArch64CC::GT:
+      if (IsZero)
+        return DAG.getNode(AArch64ISD::FCMGTz, dl, VT, LHS);
+      return DAG.getNode(AArch64ISD::FCMGT, dl, VT, LHS, RHS);
+    case AArch64CC::LS:
+      if (IsZero)
+        return DAG.getNode(AArch64ISD::FCMLEz, dl, VT, LHS);
+      return DAG.getNode(AArch64ISD::FCMGE, dl, VT, RHS, LHS);
+    case AArch64CC::LT:
+      if (!NoNans)
+        return SDValue();
+    // If we ignore NaNs then we can use to the MI implementation.
+    // Fallthrough.
+    case AArch64CC::MI:
+      if (IsZero)
+        return DAG.getNode(AArch64ISD::FCMLTz, dl, VT, LHS);
+      return DAG.getNode(AArch64ISD::FCMGT, dl, VT, RHS, LHS);
+    }
+  }
+
+  switch (CC) {
+  default:
+    return SDValue();
+  case AArch64CC::NE: {
+    SDValue Cmeq;
+    if (IsZero)
+      Cmeq = DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS);
+    else
+      Cmeq = DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS);
+    return DAG.getNode(AArch64ISD::NOT, dl, VT, Cmeq);
+  }
+  case AArch64CC::EQ:
+    if (IsZero)
+      return DAG.getNode(AArch64ISD::CMEQz, dl, VT, LHS);
+    return DAG.getNode(AArch64ISD::CMEQ, dl, VT, LHS, RHS);
+  case AArch64CC::GE:
+    if (IsZero)
+      return DAG.getNode(AArch64ISD::CMGEz, dl, VT, LHS);
+    return DAG.getNode(AArch64ISD::CMGE, dl, VT, LHS, RHS);
+  case AArch64CC::GT:
+    if (IsZero)
+      return DAG.getNode(AArch64ISD::CMGTz, dl, VT, LHS);
+    return DAG.getNode(AArch64ISD::CMGT, dl, VT, LHS, RHS);
+  case AArch64CC::LE:
+    if (IsZero)
+      return DAG.getNode(AArch64ISD::CMLEz, dl, VT, LHS);
+    return DAG.getNode(AArch64ISD::CMGE, dl, VT, RHS, LHS);
+  case AArch64CC::LS:
+    return DAG.getNode(AArch64ISD::CMHS, dl, VT, RHS, LHS);
+  case AArch64CC::LO:
+    return DAG.getNode(AArch64ISD::CMHI, dl, VT, RHS, LHS);
+  case AArch64CC::LT:
+    if (IsZero)
+      return DAG.getNode(AArch64ISD::CMLTz, dl, VT, LHS);
+    return DAG.getNode(AArch64ISD::CMGT, dl, VT, RHS, LHS);
+  case AArch64CC::HI:
+    return DAG.getNode(AArch64ISD::CMHI, dl, VT, LHS, RHS);
+  case AArch64CC::HS:
+    return DAG.getNode(AArch64ISD::CMHS, dl, VT, LHS, RHS);
+  }
+}
 
-  uint64_t ExtraMask = PossExtraMask->getConstantOperandVal(1);
+SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
+                                           SelectionDAG &DAG) const {
+  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+  SDLoc dl(Op);
+
+  if (LHS.getValueType().getVectorElementType().isInteger()) {
+    assert(LHS.getValueType() == RHS.getValueType());
+    AArch64CC::CondCode AArch64CC = changeIntCCToAArch64CC(CC);
+    return EmitVectorComparison(LHS, RHS, AArch64CC, false, Op.getValueType(),
+                                dl, DAG);
+  }
+
+  assert(LHS.getValueType().getVectorElementType() == MVT::f32 ||
+         LHS.getValueType().getVectorElementType() == MVT::f64);
+
+  // Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
+  // clean.  Some of them require two branches to implement.
+  AArch64CC::CondCode CC1, CC2;
+  bool ShouldInvert;
+  changeVectorFPCCToAArch64CC(CC, CC1, CC2, ShouldInvert);
 
-  // Masks must be compatible.
-  if (ExtraMask & ExistingMask)
+  bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath;
+  SDValue Cmp =
+      EmitVectorComparison(LHS, RHS, CC1, NoNaNs, Op.getValueType(), dl, DAG);
+  if (!Cmp.getNode())
     return SDValue();
 
-  SDValue OldBFIVal = BFI.getOperand(0);
-  SDValue NewBFIVal = BFI.getOperand(1);
-  if (Extended) {
-    // We skipped a ZERO_EXTEND above, so the input to the MaskedBFIs should be
-    // 32-bit and we'll be forming a 64-bit MaskedBFI. The MaskedBFI arguments
-    // need to be made compatible.
-    assert(VT == MVT::i64 && BFI.getValueType() == MVT::i32
-           && "Invalid types for BFI");
-    OldBFIVal = DAG.getNode(ISD::ANY_EXTEND, DL, VT, OldBFIVal);
-    NewBFIVal = DAG.getNode(ISD::ANY_EXTEND, DL, VT, NewBFIVal);
+  if (CC2 != AArch64CC::AL) {
+    SDValue Cmp2 =
+        EmitVectorComparison(LHS, RHS, CC2, NoNaNs, Op.getValueType(), dl, DAG);
+    if (!Cmp2.getNode())
+      return SDValue();
+
+    Cmp = DAG.getNode(ISD::OR, dl, Cmp.getValueType(), Cmp, Cmp2);
+  }
+
+  if (ShouldInvert)
+    return Cmp = DAG.getNOT(dl, Cmp, Cmp.getValueType());
+
+  return Cmp;
+}
+
+/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
+/// MemIntrinsicNodes.  The associated MachineMemOperands record the alignment
+/// specified in the intrinsic calls.
+bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
+                                               const CallInst &I,
+                                               unsigned Intrinsic) const {
+  switch (Intrinsic) {
+  case Intrinsic::aarch64_neon_ld2:
+  case Intrinsic::aarch64_neon_ld3:
+  case Intrinsic::aarch64_neon_ld4:
+  case Intrinsic::aarch64_neon_ld1x2:
+  case Intrinsic::aarch64_neon_ld1x3:
+  case Intrinsic::aarch64_neon_ld1x4:
+  case Intrinsic::aarch64_neon_ld2lane:
+  case Intrinsic::aarch64_neon_ld3lane:
+  case Intrinsic::aarch64_neon_ld4lane:
+  case Intrinsic::aarch64_neon_ld2r:
+  case Intrinsic::aarch64_neon_ld3r:
+  case Intrinsic::aarch64_neon_ld4r: {
+    Info.opc = ISD::INTRINSIC_W_CHAIN;
+    // Conservatively set memVT to the entire set of vectors loaded.
+    uint64_t NumElts = getDataLayout()->getTypeAllocSize(I.getType()) / 8;
+    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
+    Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1);
+    Info.offset = 0;
+    Info.align = 0;
+    Info.vol = false; // volatile loads with NEON intrinsics not supported
+    Info.readMem = true;
+    Info.writeMem = false;
+    return true;
+  }
+  case Intrinsic::aarch64_neon_st2:
+  case Intrinsic::aarch64_neon_st3:
+  case Intrinsic::aarch64_neon_st4:
+  case Intrinsic::aarch64_neon_st1x2:
+  case Intrinsic::aarch64_neon_st1x3:
+  case Intrinsic::aarch64_neon_st1x4:
+  case Intrinsic::aarch64_neon_st2lane:
+  case Intrinsic::aarch64_neon_st3lane:
+  case Intrinsic::aarch64_neon_st4lane: {
+    Info.opc = ISD::INTRINSIC_VOID;
+    // Conservatively set memVT to the entire set of vectors stored.
+    unsigned NumElts = 0;
+    for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) {
+      Type *ArgTy = I.getArgOperand(ArgI)->getType();
+      if (!ArgTy->isVectorTy())
+        break;
+      NumElts += getDataLayout()->getTypeAllocSize(ArgTy) / 8;
+    }
+    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
+    Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1);
+    Info.offset = 0;
+    Info.align = 0;
+    Info.vol = false; // volatile stores with NEON intrinsics not supported
+    Info.readMem = false;
+    Info.writeMem = true;
+    return true;
+  }
+  case Intrinsic::aarch64_ldaxr:
+  case Intrinsic::aarch64_ldxr: {
+    PointerType *PtrTy = cast<PointerType>(I.getArgOperand(0)->getType());
+    Info.opc = ISD::INTRINSIC_W_CHAIN;
+    Info.memVT = MVT::getVT(PtrTy->getElementType());
+    Info.ptrVal = I.getArgOperand(0);
+    Info.offset = 0;
+    Info.align = getDataLayout()->getABITypeAlignment(PtrTy->getElementType());
+    Info.vol = true;
+    Info.readMem = true;
+    Info.writeMem = false;
+    return true;
+  }
+  case Intrinsic::aarch64_stlxr:
+  case Intrinsic::aarch64_stxr: {
+    PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType());
+    Info.opc = ISD::INTRINSIC_W_CHAIN;
+    Info.memVT = MVT::getVT(PtrTy->getElementType());
+    Info.ptrVal = I.getArgOperand(1);
+    Info.offset = 0;
+    Info.align = getDataLayout()->getABITypeAlignment(PtrTy->getElementType());
+    Info.vol = true;
+    Info.readMem = false;
+    Info.writeMem = true;
+    return true;
+  }
+  case Intrinsic::aarch64_ldaxp:
+  case Intrinsic::aarch64_ldxp: {
+    Info.opc = ISD::INTRINSIC_W_CHAIN;
+    Info.memVT = MVT::i128;
+    Info.ptrVal = I.getArgOperand(0);
+    Info.offset = 0;
+    Info.align = 16;
+    Info.vol = true;
+    Info.readMem = true;
+    Info.writeMem = false;
+    return true;
+  }
+  case Intrinsic::aarch64_stlxp:
+  case Intrinsic::aarch64_stxp: {
+    Info.opc = ISD::INTRINSIC_W_CHAIN;
+    Info.memVT = MVT::i128;
+    Info.ptrVal = I.getArgOperand(2);
+    Info.offset = 0;
+    Info.align = 16;
+    Info.vol = true;
+    Info.readMem = false;
+    Info.writeMem = true;
+    return true;
+  }
+  default:
+    break;
   }
 
-  // We need the MaskedBFI to be combined with a mask of the *same* value.
-  if (PossExtraMask.getOperand(0) != OldBFIVal)
+  return false;
+}
+
+// Truncations from 64-bit GPR to 32-bit GPR is free.
+bool AArch64TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
+  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
+    return false;
+  unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
+  unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
+  if (NumBits1 <= NumBits2)
+    return false;
+  return true;
+}
+bool AArch64TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
+  if (!VT1.isInteger() || !VT2.isInteger())
+    return false;
+  unsigned NumBits1 = VT1.getSizeInBits();
+  unsigned NumBits2 = VT2.getSizeInBits();
+  if (NumBits1 <= NumBits2)
+    return false;
+  return true;
+}
+
+// All 32-bit GPR operations implicitly zero the high-half of the corresponding
+// 64-bit GPR.
+bool AArch64TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
+  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
+    return false;
+  unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
+  unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
+  if (NumBits1 == 32 && NumBits2 == 64)
+    return true;
+  return false;
+}
+bool AArch64TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
+  if (!VT1.isInteger() || !VT2.isInteger())
+    return false;
+  unsigned NumBits1 = VT1.getSizeInBits();
+  unsigned NumBits2 = VT2.getSizeInBits();
+  if (NumBits1 == 32 && NumBits2 == 64)
+    return true;
+  return false;
+}
+
+bool AArch64TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
+  EVT VT1 = Val.getValueType();
+  if (isZExtFree(VT1, VT2)) {
+    return true;
+  }
+
+  if (Val.getOpcode() != ISD::LOAD)
+    return false;
+
+  // 8-, 16-, and 32-bit integer loads all implicitly zero-extend.
+  return (VT1.isSimple() && VT1.isInteger() && VT2.isSimple() &&
+          VT2.isInteger() && VT1.getSizeInBits() <= 32);
+}
+
+bool AArch64TargetLowering::hasPairedLoad(Type *LoadedType,
+                                          unsigned &RequiredAligment) const {
+  if (!LoadedType->isIntegerTy() && !LoadedType->isFloatTy())
+    return false;
+  // Cyclone supports unaligned accesses.
+  RequiredAligment = 0;
+  unsigned NumBits = LoadedType->getPrimitiveSizeInBits();
+  return NumBits == 32 || NumBits == 64;
+}
+
+bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType,
+                                          unsigned &RequiredAligment) const {
+  if (!LoadedType.isSimple() ||
+      (!LoadedType.isInteger() && !LoadedType.isFloatingPoint()))
+    return false;
+  // Cyclone supports unaligned accesses.
+  RequiredAligment = 0;
+  unsigned NumBits = LoadedType.getSizeInBits();
+  return NumBits == 32 || NumBits == 64;
+}
+
+static bool memOpAlign(unsigned DstAlign, unsigned SrcAlign,
+                       unsigned AlignCheck) {
+  return ((SrcAlign == 0 || SrcAlign % AlignCheck == 0) &&
+          (DstAlign == 0 || DstAlign % AlignCheck == 0));
+}
+
+EVT AArch64TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
+                                               unsigned SrcAlign, bool IsMemset,
+                                               bool ZeroMemset,
+                                               bool MemcpyStrSrc,
+                                               MachineFunction &MF) const {
+  // Don't use AdvSIMD to implement 16-byte memset. It would have taken one
+  // instruction to materialize the v2i64 zero and one store (with restrictive
+  // addressing mode). Just do two i64 store of zero-registers.
+  bool Fast;
+  const Function *F = MF.getFunction();
+  if (Subtarget->hasFPARMv8() && !IsMemset && Size >= 16 &&
+      !F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
+                                       Attribute::NoImplicitFloat) &&
+      (memOpAlign(SrcAlign, DstAlign, 16) ||
+       (allowsUnalignedMemoryAccesses(MVT::f128, 0, &Fast) && Fast)))
+    return MVT::f128;
+
+  return Size >= 8 ? MVT::i64 : MVT::i32;
+}
+
+// 12-bit optionally shifted immediates are legal for adds.
+bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const {
+  if ((Immed >> 12) == 0 || ((Immed & 0xfff) == 0 && Immed >> 24 == 0))
+    return true;
+  return false;
+}
+
+// Integer comparisons are implemented with ADDS/SUBS, so the range of valid
+// immediates is the same as for an add or a sub.
+bool AArch64TargetLowering::isLegalICmpImmediate(int64_t Immed) const {
+  if (Immed < 0)
+    Immed *= -1;
+  return isLegalAddImmediate(Immed);
+}
+
+/// isLegalAddressingMode - Return true if the addressing mode represented
+/// by AM is legal for this target, for a load/store of the specified type.
+bool AArch64TargetLowering::isLegalAddressingMode(const AddrMode &AM,
+                                                  Type *Ty) const {
+  // AArch64 has five basic addressing modes:
+  //  reg
+  //  reg + 9-bit signed offset
+  //  reg + SIZE_IN_BYTES * 12-bit unsigned offset
+  //  reg1 + reg2
+  //  reg + SIZE_IN_BYTES * reg
+
+  // No global is ever allowed as a base.
+  if (AM.BaseGV)
+    return false;
+
+  // No reg+reg+imm addressing.
+  if (AM.HasBaseReg && AM.BaseOffs && AM.Scale)
+    return false;
+
+  // check reg + imm case:
+  // i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12
+  uint64_t NumBytes = 0;
+  if (Ty->isSized()) {
+    uint64_t NumBits = getDataLayout()->getTypeSizeInBits(Ty);
+    NumBytes = NumBits / 8;
+    if (!isPowerOf2_64(NumBits))
+      NumBytes = 0;
+  }
+
+  if (!AM.Scale) {
+    int64_t Offset = AM.BaseOffs;
+
+    // 9-bit signed offset
+    if (Offset >= -(1LL << 9) && Offset <= (1LL << 9) - 1)
+      return true;
+
+    // 12-bit unsigned offset
+    unsigned shift = Log2_64(NumBytes);
+    if (NumBytes && Offset > 0 && (Offset / NumBytes) <= (1LL << 12) - 1 &&
+        // Must be a multiple of NumBytes (NumBytes is a power of 2)
+        (Offset >> shift) << shift == Offset)
+      return true;
+    return false;
+  }
+
+  // Check reg1 + SIZE_IN_BYTES * reg2 and reg1 + reg2
+
+  if (!AM.Scale || AM.Scale == 1 ||
+      (AM.Scale > 0 && (uint64_t)AM.Scale == NumBytes))
+    return true;
+  return false;
+}
+
+int AArch64TargetLowering::getScalingFactorCost(const AddrMode &AM,
+                                                Type *Ty) const {
+  // Scaling factors are not free at all.
+  // Operands                     | Rt Latency
+  // -------------------------------------------
+  // Rt, [Xn, Xm]                 | 4
+  // -------------------------------------------
+  // Rt, [Xn, Xm, lsl #imm]       | Rn: 4 Rm: 5
+  // Rt, [Xn, Wm, <extend> #imm]  |
+  if (isLegalAddressingMode(AM, Ty))
+    // Scale represents reg2 * scale, thus account for 1 if
+    // it is not equal to 0 or 1.
+    return AM.Scale != 0 && AM.Scale != 1;
+  return -1;
+}
+
+bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
+  VT = VT.getScalarType();
+
+  if (!VT.isSimple())
+    return false;
+
+  switch (VT.getSimpleVT().SimpleTy) {
+  case MVT::f32:
+  case MVT::f64:
+    return true;
+  default:
+    break;
+  }
+
+  return false;
+}
+
+const MCPhysReg *
+AArch64TargetLowering::getScratchRegisters(CallingConv::ID) const {
+  // LR is a callee-save register, but we must treat it as clobbered by any call
+  // site. Hence we include LR in the scratch registers, which are in turn added
+  // as implicit-defs for stackmaps and patchpoints.
+  static const MCPhysReg ScratchRegs[] = {
+    AArch64::X16, AArch64::X17, AArch64::LR, 0
+  };
+  return ScratchRegs;
+}
+
+bool
+AArch64TargetLowering::isDesirableToCommuteWithShift(const SDNode *N) const {
+  EVT VT = N->getValueType(0);
+    // If N is unsigned bit extraction: ((x >> C) & mask), then do not combine
+    // it with shift to let it be lowered to UBFX.
+  if (N->getOpcode() == ISD::AND && (VT == MVT::i32 || VT == MVT::i64) &&
+      isa<ConstantSDNode>(N->getOperand(1))) {
+    uint64_t TruncMask = N->getConstantOperandVal(1);
+    if (isMask_64(TruncMask) &&
+      N->getOperand(0).getOpcode() == ISD::SRL &&
+      isa<ConstantSDNode>(N->getOperand(0)->getOperand(1)))
+      return false;
+  }
+  return true;
+}
+
+bool AArch64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
+                                                              Type *Ty) const {
+  assert(Ty->isIntegerTy());
+
+  unsigned BitSize = Ty->getPrimitiveSizeInBits();
+  if (BitSize == 0)
+    return false;
+
+  int64_t Val = Imm.getSExtValue();
+  if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, BitSize))
+    return true;
+
+  if ((int64_t)Val < 0)
+    Val = ~Val;
+  if (BitSize == 32)
+    Val &= (1LL << 32) - 1;
+
+  unsigned LZ = countLeadingZeros((uint64_t)Val);
+  unsigned Shift = (63 - LZ) / 16;
+  // MOVZ is free so return true for one or fewer MOVK.
+  return (Shift < 3) ? true : false;
+}
+
+// Generate SUBS and CSEL for integer abs.
+static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) {
+  EVT VT = N->getValueType(0);
+
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  SDLoc DL(N);
+
+  // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1)
+  // and change it to SUB and CSEL.
+  if (VT.isInteger() && N->getOpcode() == ISD::XOR &&
+      N0.getOpcode() == ISD::ADD && N0.getOperand(1) == N1 &&
+      N1.getOpcode() == ISD::SRA && N1.getOperand(0) == N0.getOperand(0))
+    if (ConstantSDNode *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1)))
+      if (Y1C->getAPIntValue() == VT.getSizeInBits() - 1) {
+        SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, VT),
+                                  N0.getOperand(0));
+        // Generate SUBS & CSEL.
+        SDValue Cmp =
+            DAG.getNode(AArch64ISD::SUBS, DL, DAG.getVTList(VT, MVT::i32),
+                        N0.getOperand(0), DAG.getConstant(0, VT));
+        return DAG.getNode(AArch64ISD::CSEL, DL, VT, N0.getOperand(0), Neg,
+                           DAG.getConstant(AArch64CC::PL, MVT::i32),
+                           SDValue(Cmp.getNode(), 1));
+      }
+  return SDValue();
+}
+
+// performXorCombine - Attempts to handle integer ABS.
+static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG,
+                                 TargetLowering::DAGCombinerInfo &DCI,
+                                 const AArch64Subtarget *Subtarget) {
+  if (DCI.isBeforeLegalizeOps())
     return SDValue();
 
-  BFI = DAG.getNode(AArch64ISD::BFI, DL, VT,
-                    OldBFIVal, NewBFIVal,
-                    BFI.getOperand(2), BFI.getOperand(3));
+  return performIntegerAbsCombine(N, DAG);
+}
 
-  // If the masking is trivial, we don't need to create it.
-  if ((ExtraMask | ExistingMask) == (-1ULL >> (64 - VT.getSizeInBits())))
-    return BFI;
+static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
+                                 TargetLowering::DAGCombinerInfo &DCI,
+                                 const AArch64Subtarget *Subtarget) {
+  if (DCI.isBeforeLegalizeOps())
+    return SDValue();
 
-  return DAG.getNode(ISD::AND, DL, VT, BFI,
-                     DAG.getConstant(ExtraMask | ExistingMask, VT));
+  // Multiplication of a power of two plus/minus one can be done more
+  // cheaply as as shift+add/sub. For now, this is true unilaterally. If
+  // future CPUs have a cheaper MADD instruction, this may need to be
+  // gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and
+  // 64-bit is 5 cycles, so this is always a win.
+  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
+    APInt Value = C->getAPIntValue();
+    EVT VT = N->getValueType(0);
+    APInt VP1 = Value + 1;
+    if (VP1.isPowerOf2()) {
+      // Multiplying by one less than a power of two, replace with a shift
+      // and a subtract.
+      SDValue ShiftedVal =
+          DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0),
+                      DAG.getConstant(VP1.logBase2(), MVT::i64));
+      return DAG.getNode(ISD::SUB, SDLoc(N), VT, ShiftedVal, N->getOperand(0));
+    }
+    APInt VM1 = Value - 1;
+    if (VM1.isPowerOf2()) {
+      // Multiplying by one more than a power of two, replace with a shift
+      // and an add.
+      SDValue ShiftedVal =
+          DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0),
+                      DAG.getConstant(VM1.logBase2(), MVT::i64));
+      return DAG.getNode(ISD::ADD, SDLoc(N), VT, ShiftedVal, N->getOperand(0));
+    }
+  }
+  return SDValue();
+}
+
+static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG) {
+  EVT VT = N->getValueType(0);
+  if (VT != MVT::f32 && VT != MVT::f64)
+    return SDValue();
+  // Only optimize when the source and destination types have the same width.
+  if (VT.getSizeInBits() != N->getOperand(0).getValueType().getSizeInBits())
+    return SDValue();
+
+  // If the result of an integer load is only used by an integer-to-float
+  // conversion, use a fp load instead and a AdvSIMD scalar {S|U}CVTF instead.
+  // This eliminates an "integer-to-vector-move UOP and improve throughput.
+  SDValue N0 = N->getOperand(0);
+  if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
+      // Do not change the width of a volatile load.
+      !cast<LoadSDNode>(N0)->isVolatile()) {
+    LoadSDNode *LN0 = cast<LoadSDNode>(N0);
+    SDValue Load = DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(),
+                               LN0->getPointerInfo(), LN0->isVolatile(),
+                               LN0->isNonTemporal(), LN0->isInvariant(),
+                               LN0->getAlignment());
+
+    // Make sure successors of the original load stay after it by updating them
+    // to use the new Chain.
+    DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), Load.getValue(1));
+
+    unsigned Opcode =
+        (N->getOpcode() == ISD::SINT_TO_FP) ? AArch64ISD::SITOF : AArch64ISD::UITOF;
+    return DAG.getNode(Opcode, SDLoc(N), VT, Load);
+  }
+
+  return SDValue();
 }
 
 /// An EXTR instruction is made up of two shifts, ORed together. This helper
@@ -3782,298 +6461,952 @@ static SDValue tryCombineToEXTR(SDNode *N,
     std::swap(ShiftLHS, ShiftRHS);
   }
 
-  return DAG.getNode(AArch64ISD::EXTR, DL, VT,
-                     LHS, RHS,
+  return DAG.getNode(AArch64ISD::EXTR, DL, VT, LHS, RHS,
                      DAG.getConstant(ShiftRHS, MVT::i64));
 }
 
-/// Target-specific dag combine xforms for ISD::OR
-static SDValue PerformORCombine(SDNode *N,
-                                TargetLowering::DAGCombinerInfo &DCI,
-                                const AArch64Subtarget *Subtarget) {
-
+static SDValue tryCombineToBSL(SDNode *N,
+                                TargetLowering::DAGCombinerInfo &DCI) {
+  EVT VT = N->getValueType(0);
   SelectionDAG &DAG = DCI.DAG;
   SDLoc DL(N);
+
+  if (!VT.isVector())
+    return SDValue();
+
+  SDValue N0 = N->getOperand(0);
+  if (N0.getOpcode() != ISD::AND)
+    return SDValue();
+
+  SDValue N1 = N->getOperand(1);
+  if (N1.getOpcode() != ISD::AND)
+    return SDValue();
+
+  // We only have to look for constant vectors here since the general, variable
+  // case can be handled in TableGen.
+  unsigned Bits = VT.getVectorElementType().getSizeInBits();
+  uint64_t BitMask = Bits == 64 ? -1ULL : ((1ULL << Bits) - 1);
+  for (int i = 1; i >= 0; --i)
+    for (int j = 1; j >= 0; --j) {
+      BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(i));
+      BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(j));
+      if (!BVN0 || !BVN1)
+        continue;
+
+      bool FoundMatch = true;
+      for (unsigned k = 0; k < VT.getVectorNumElements(); ++k) {
+        ConstantSDNode *CN0 = dyn_cast<ConstantSDNode>(BVN0->getOperand(k));
+        ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(BVN1->getOperand(k));
+        if (!CN0 || !CN1 ||
+            CN0->getZExtValue() != (BitMask & ~CN1->getZExtValue())) {
+          FoundMatch = false;
+          break;
+        }
+      }
+
+      if (FoundMatch)
+        return DAG.getNode(AArch64ISD::BSL, DL, VT, SDValue(BVN0, 0),
+                           N0->getOperand(1 - i), N1->getOperand(1 - j));
+    }
+
+  return SDValue();
+}
+
+static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
+                                const AArch64Subtarget *Subtarget) {
+  // Attempt to form an EXTR from (or (shl VAL1, #N), (srl VAL2, #RegWidth-N))
+  if (!EnableAArch64ExtrGeneration)
+    return SDValue();
+  SelectionDAG &DAG = DCI.DAG;
   EVT VT = N->getValueType(0);
 
-  if(!DAG.getTargetLoweringInfo().isTypeLegal(VT))
+  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
     return SDValue();
 
-  // Attempt to recognise bitfield-insert operations.
-  SDValue Res = tryCombineToBFI(N, DCI, Subtarget);
+  SDValue Res = tryCombineToEXTR(N, DCI);
   if (Res.getNode())
     return Res;
 
-  // Attempt to combine an existing MaskedBFI operation into one with a larger
-  // mask.
-  Res = tryCombineToLargerBFI(N, DCI, Subtarget);
+  Res = tryCombineToBSL(N, DCI);
   if (Res.getNode())
     return Res;
 
-  Res = tryCombineToEXTR(N, DCI);
-  if (Res.getNode())
-    return Res;
+  return SDValue();
+}
 
-  if (!Subtarget->hasNEON())
+static SDValue performBitcastCombine(SDNode *N,
+                                     TargetLowering::DAGCombinerInfo &DCI,
+                                     SelectionDAG &DAG) {
+  // Wait 'til after everything is legalized to try this. That way we have
+  // legal vector types and such.
+  if (DCI.isBeforeLegalizeOps())
     return SDValue();
 
-  // Attempt to use vector immediate-form BSL
-  // (or (and B, A), (and C, ~A)) => (VBSL A, B, C) when A is a constant.
+  // Remove extraneous bitcasts around an extract_subvector.
+  // For example,
+  //    (v4i16 (bitconvert
+  //             (extract_subvector (v2i64 (bitconvert (v8i16 ...)), (i64 1)))))
+  //  becomes
+  //    (extract_subvector ((v8i16 ...), (i64 4)))
 
-  SDValue N0 = N->getOperand(0);
-  if (N0.getOpcode() != ISD::AND)
+  // Only interested in 64-bit vectors as the ultimate result.
+  EVT VT = N->getValueType(0);
+  if (!VT.isVector())
     return SDValue();
-
-  SDValue N1 = N->getOperand(1);
-  if (N1.getOpcode() != ISD::AND)
+  if (VT.getSimpleVT().getSizeInBits() != 64)
     return SDValue();
-
-  if (VT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
-    APInt SplatUndef;
-    unsigned SplatBitSize;
-    bool HasAnyUndefs;
-    BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(1));
-    APInt SplatBits0;
-    if (BVN0 && BVN0->isConstantSplat(SplatBits0, SplatUndef, SplatBitSize,
-                                      HasAnyUndefs) &&
-        !HasAnyUndefs) {
-      BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(1));
-      APInt SplatBits1;
-      if (BVN1 && BVN1->isConstantSplat(SplatBits1, SplatUndef, SplatBitSize,
-                                        HasAnyUndefs) && !HasAnyUndefs &&
-          SplatBits0.getBitWidth() == SplatBits1.getBitWidth() &&
-          SplatBits0 == ~SplatBits1) {
-
-        return DAG.getNode(ISD::VSELECT, DL, VT, N0->getOperand(1),
-                           N0->getOperand(0), N1->getOperand(0));
-      }
-    }
+  // Is the operand an extract_subvector starting at the beginning or halfway
+  // point of the vector? A low half may also come through as an
+  // EXTRACT_SUBREG, so look for that, too.
+  SDValue Op0 = N->getOperand(0);
+  if (Op0->getOpcode() != ISD::EXTRACT_SUBVECTOR &&
+      !(Op0->isMachineOpcode() &&
+        Op0->getMachineOpcode() == AArch64::EXTRACT_SUBREG))
+    return SDValue();
+  uint64_t idx = cast<ConstantSDNode>(Op0->getOperand(1))->getZExtValue();
+  if (Op0->getOpcode() == ISD::EXTRACT_SUBVECTOR) {
+    if (Op0->getValueType(0).getVectorNumElements() != idx && idx != 0)
+      return SDValue();
+  } else if (Op0->getMachineOpcode() == AArch64::EXTRACT_SUBREG) {
+    if (idx != AArch64::dsub)
+      return SDValue();
+    // The dsub reference is equivalent to a lane zero subvector reference.
+    idx = 0;
   }
+  // Look through the bitcast of the input to the extract.
+  if (Op0->getOperand(0)->getOpcode() != ISD::BITCAST)
+    return SDValue();
+  SDValue Source = Op0->getOperand(0)->getOperand(0);
+  // If the source type has twice the number of elements as our destination
+  // type, we know this is an extract of the high or low half of the vector.
+  EVT SVT = Source->getValueType(0);
+  if (SVT.getVectorNumElements() != VT.getVectorNumElements() * 2)
+    return SDValue();
 
-  return SDValue();
+  DEBUG(dbgs() << "aarch64-lower: bitcast extract_subvector simplification\n");
+
+  // Create the simplified form to just extract the low or high half of the
+  // vector directly rather than bothering with the bitcasts.
+  SDLoc dl(N);
+  unsigned NumElements = VT.getVectorNumElements();
+  if (idx) {
+    SDValue HalfIdx = DAG.getConstant(NumElements, MVT::i64);
+    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Source, HalfIdx);
+  } else {
+    SDValue SubReg = DAG.getTargetConstant(AArch64::dsub, MVT::i32);
+    return SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl, VT,
+                                      Source, SubReg),
+                   0);
+  }
 }
 
-/// Target-specific dag combine xforms for ISD::SRA
-static SDValue PerformSRACombine(SDNode *N,
-                                 TargetLowering::DAGCombinerInfo &DCI) {
+static SDValue performConcatVectorsCombine(SDNode *N,
+                                           TargetLowering::DAGCombinerInfo &DCI,
+                                           SelectionDAG &DAG) {
+  // Wait 'til after everything is legalized to try this. That way we have
+  // legal vector types and such.
+  if (DCI.isBeforeLegalizeOps())
+    return SDValue();
 
-  SelectionDAG &DAG = DCI.DAG;
-  SDLoc DL(N);
+  SDLoc dl(N);
   EVT VT = N->getValueType(0);
 
-  // We're looking for an SRA/SHL pair which form an SBFX.
+  // If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector
+  // splat. The indexed instructions are going to be expecting a DUPLANE64, so
+  // canonicalise to that.
+  if (N->getOperand(0) == N->getOperand(1) && VT.getVectorNumElements() == 2) {
+    assert(VT.getVectorElementType().getSizeInBits() == 64);
+    return DAG.getNode(AArch64ISD::DUPLANE64, dl, VT,
+                       WidenVector(N->getOperand(0), DAG),
+                       DAG.getConstant(0, MVT::i64));
+  }
 
-  if (VT != MVT::i32 && VT != MVT::i64)
+  // Canonicalise concat_vectors so that the right-hand vector has as few
+  // bit-casts as possible before its real operation. The primary matching
+  // destination for these operations will be the narrowing "2" instructions,
+  // which depend on the operation being performed on this right-hand vector.
+  // For example,
+  //    (concat_vectors LHS,  (v1i64 (bitconvert (v4i16 RHS))))
+  // becomes
+  //    (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS))
+
+  SDValue Op1 = N->getOperand(1);
+  if (Op1->getOpcode() != ISD::BITCAST)
     return SDValue();
+  SDValue RHS = Op1->getOperand(0);
+  MVT RHSTy = RHS.getValueType().getSimpleVT();
+  // If the RHS is not a vector, this is not the pattern we're looking for.
+  if (!RHSTy.isVector())
+    return SDValue();
+
+  DEBUG(dbgs() << "aarch64-lower: concat_vectors bitcast simplification\n");
+
+  MVT ConcatTy = MVT::getVectorVT(RHSTy.getVectorElementType(),
+                                  RHSTy.getVectorNumElements() * 2);
+  return DAG.getNode(
+      ISD::BITCAST, dl, VT,
+      DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatTy,
+                  DAG.getNode(ISD::BITCAST, dl, RHSTy, N->getOperand(0)), RHS));
+}
 
-  if (!isa<ConstantSDNode>(N->getOperand(1)))
+static SDValue tryCombineFixedPointConvert(SDNode *N,
+                                           TargetLowering::DAGCombinerInfo &DCI,
+                                           SelectionDAG &DAG) {
+  // Wait 'til after everything is legalized to try this. That way we have
+  // legal vector types and such.
+  if (DCI.isBeforeLegalizeOps())
     return SDValue();
+  // Transform a scalar conversion of a value from a lane extract into a
+  // lane extract of a vector conversion. E.g., from foo1 to foo2:
+  // double foo1(int64x2_t a) { return vcvtd_n_f64_s64(a[1], 9); }
+  // double foo2(int64x2_t a) { return vcvtq_n_f64_s64(a, 9)[1]; }
+  //
+  // The second form interacts better with instruction selection and the
+  // register allocator to avoid cross-class register copies that aren't
+  // coalescable due to a lane reference.
+
+  // Check the operand and see if it originates from a lane extract.
+  SDValue Op1 = N->getOperand(1);
+  if (Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+    // Yep, no additional predication needed. Perform the transform.
+    SDValue IID = N->getOperand(0);
+    SDValue Shift = N->getOperand(2);
+    SDValue Vec = Op1.getOperand(0);
+    SDValue Lane = Op1.getOperand(1);
+    EVT ResTy = N->getValueType(0);
+    EVT VecResTy;
+    SDLoc DL(N);
+
+    // The vector width should be 128 bits by the time we get here, even
+    // if it started as 64 bits (the extract_vector handling will have
+    // done so).
+    assert(Vec.getValueType().getSizeInBits() == 128 &&
+           "unexpected vector size on extract_vector_elt!");
+    if (Vec.getValueType() == MVT::v4i32)
+      VecResTy = MVT::v4f32;
+    else if (Vec.getValueType() == MVT::v2i64)
+      VecResTy = MVT::v2f64;
+    else
+      assert(0 && "unexpected vector type!");
 
-  uint64_t ExtraSignBits = N->getConstantOperandVal(1);
-  SDValue Shift = N->getOperand(0);
+    SDValue Convert =
+        DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VecResTy, IID, Vec, Shift);
+    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResTy, Convert, Lane);
+  }
+  return SDValue();
+}
 
-  if (Shift.getOpcode() != ISD::SHL)
+// AArch64 high-vector "long" operations are formed by performing the non-high
+// version on an extract_subvector of each operand which gets the high half:
+//
+//  (longop2 LHS, RHS) == (longop (extract_high LHS), (extract_high RHS))
+//
+// However, there are cases which don't have an extract_high explicitly, but
+// have another operation that can be made compatible with one for free. For
+// example:
+//
+//  (dupv64 scalar) --> (extract_high (dup128 scalar))
+//
+// This routine does the actual conversion of such DUPs, once outer routines
+// have determined that everything else is in order.
+static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) {
+  // We can handle most types of duplicate, but the lane ones have an extra
+  // operand saying *which* lane, so we need to know.
+  bool IsDUPLANE;
+  switch (N.getOpcode()) {
+  case AArch64ISD::DUP:
+    IsDUPLANE = false;
+    break;
+  case AArch64ISD::DUPLANE8:
+  case AArch64ISD::DUPLANE16:
+  case AArch64ISD::DUPLANE32:
+  case AArch64ISD::DUPLANE64:
+    IsDUPLANE = true;
+    break;
+  default:
     return SDValue();
+  }
 
-  if (!isa<ConstantSDNode>(Shift->getOperand(1)))
+  MVT NarrowTy = N.getSimpleValueType();
+  if (!NarrowTy.is64BitVector())
     return SDValue();
 
-  uint64_t BitsOnLeft = Shift->getConstantOperandVal(1);
-  uint64_t Width = VT.getSizeInBits() - ExtraSignBits;
-  uint64_t LSB = VT.getSizeInBits() - Width - BitsOnLeft;
+  MVT ElementTy = NarrowTy.getVectorElementType();
+  unsigned NumElems = NarrowTy.getVectorNumElements();
+  MVT NewDUPVT = MVT::getVectorVT(ElementTy, NumElems * 2);
 
-  if (LSB > VT.getSizeInBits() || Width > VT.getSizeInBits())
-    return SDValue();
+  SDValue NewDUP;
+  if (IsDUPLANE)
+    NewDUP = DAG.getNode(N.getOpcode(), SDLoc(N), NewDUPVT, N.getOperand(0),
+                         N.getOperand(1));
+  else
+    NewDUP = DAG.getNode(AArch64ISD::DUP, SDLoc(N), NewDUPVT, N.getOperand(0));
 
-  return DAG.getNode(AArch64ISD::SBFX, DL, VT, Shift.getOperand(0),
-                     DAG.getConstant(LSB, MVT::i64),
-                     DAG.getConstant(LSB + Width - 1, MVT::i64));
+  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N.getNode()), NarrowTy,
+                     NewDUP, DAG.getConstant(NumElems, MVT::i64));
 }
 
-/// Check if this is a valid build_vector for the immediate operand of
-/// a vector shift operation, where all the elements of the build_vector
-/// must have the same constant integer value.
-static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
-  // Ignore bit_converts.
-  while (Op.getOpcode() == ISD::BITCAST)
-    Op = Op.getOperand(0);
-  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
-  APInt SplatBits, SplatUndef;
-  unsigned SplatBitSize;
-  bool HasAnyUndefs;
-  if (!BVN || !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
-                                      HasAnyUndefs, ElementBits) ||
-      SplatBitSize > ElementBits)
-    return false;
-  Cnt = SplatBits.getSExtValue();
-  return true;
+static bool isEssentiallyExtractSubvector(SDValue N) {
+  if (N.getOpcode() == ISD::EXTRACT_SUBVECTOR)
+    return true;
+
+  return N.getOpcode() == ISD::BITCAST &&
+         N.getOperand(0).getOpcode() == ISD::EXTRACT_SUBVECTOR;
 }
 
-/// Check if this is a valid build_vector for the immediate operand of
-/// a vector shift left operation.  That value must be in the range:
-/// 0 <= Value < ElementBits
-static bool isVShiftLImm(SDValue Op, EVT VT, int64_t &Cnt) {
-  assert(VT.isVector() && "vector shift count is not a vector type");
-  unsigned ElementBits = VT.getVectorElementType().getSizeInBits();
-  if (!getVShiftImm(Op, ElementBits, Cnt))
+/// \brief Helper structure to keep track of ISD::SET_CC operands.
+struct GenericSetCCInfo {
+  const SDValue *Opnd0;
+  const SDValue *Opnd1;
+  ISD::CondCode CC;
+};
+
+/// \brief Helper structure to keep track of a SET_CC lowered into AArch64 code.
+struct AArch64SetCCInfo {
+  const SDValue *Cmp;
+  AArch64CC::CondCode CC;
+};
+
+/// \brief Helper structure to keep track of SetCC information.
+union SetCCInfo {
+  GenericSetCCInfo Generic;
+  AArch64SetCCInfo AArch64;
+};
+
+/// \brief Helper structure to be able to read SetCC information.  If set to
+/// true, IsAArch64 field, Info is a AArch64SetCCInfo, otherwise Info is a
+/// GenericSetCCInfo.
+struct SetCCInfoAndKind {
+  SetCCInfo Info;
+  bool IsAArch64;
+};
+
+/// \brief Check whether or not \p Op is a SET_CC operation, either a generic or
+/// an
+/// AArch64 lowered one.
+/// \p SetCCInfo is filled accordingly.
+/// \post SetCCInfo is meanginfull only when this function returns true.
+/// \return True when Op is a kind of SET_CC operation.
+static bool isSetCC(SDValue Op, SetCCInfoAndKind &SetCCInfo) {
+  // If this is a setcc, this is straight forward.
+  if (Op.getOpcode() == ISD::SETCC) {
+    SetCCInfo.Info.Generic.Opnd0 = &Op.getOperand(0);
+    SetCCInfo.Info.Generic.Opnd1 = &Op.getOperand(1);
+    SetCCInfo.Info.Generic.CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
+    SetCCInfo.IsAArch64 = false;
+    return true;
+  }
+  // Otherwise, check if this is a matching csel instruction.
+  // In other words:
+  // - csel 1, 0, cc
+  // - csel 0, 1, !cc
+  if (Op.getOpcode() != AArch64ISD::CSEL)
+    return false;
+  // Set the information about the operands.
+  // TODO: we want the operands of the Cmp not the csel
+  SetCCInfo.Info.AArch64.Cmp = &Op.getOperand(3);
+  SetCCInfo.IsAArch64 = true;
+  SetCCInfo.Info.AArch64.CC = static_cast<AArch64CC::CondCode>(
+      cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
+
+  // Check that the operands matches the constraints:
+  // (1) Both operands must be constants.
+  // (2) One must be 1 and the other must be 0.
+  ConstantSDNode *TValue = dyn_cast<ConstantSDNode>(Op.getOperand(0));
+  ConstantSDNode *FValue = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+
+  // Check (1).
+  if (!TValue || !FValue)
     return false;
-  return (Cnt >= 0 && Cnt < ElementBits);
+
+  // Check (2).
+  if (!TValue->isOne()) {
+    // Update the comparison when we are interested in !cc.
+    std::swap(TValue, FValue);
+    SetCCInfo.Info.AArch64.CC =
+        AArch64CC::getInvertedCondCode(SetCCInfo.Info.AArch64.CC);
+  }
+  return TValue->isOne() && FValue->isNullValue();
 }
 
-/// Check if this is a valid build_vector for the immediate operand of a
-/// vector shift right operation. The value must be in the range:
-///   1 <= Value <= ElementBits
-static bool isVShiftRImm(SDValue Op, EVT VT, int64_t &Cnt) {
-  assert(VT.isVector() && "vector shift count is not a vector type");
-  unsigned ElementBits = VT.getVectorElementType().getSizeInBits();
-  if (!getVShiftImm(Op, ElementBits, Cnt))
-    return false;
-  return (Cnt >= 1 && Cnt <= ElementBits);
+// Returns true if Op is setcc or zext of setcc.
+static bool isSetCCOrZExtSetCC(const SDValue& Op, SetCCInfoAndKind &Info) {
+  if (isSetCC(Op, Info))
+    return true;
+  return ((Op.getOpcode() == ISD::ZERO_EXTEND) &&
+    isSetCC(Op->getOperand(0), Info));
 }
 
-static SDValue GenForSextInreg(SDNode *N,
-                               TargetLowering::DAGCombinerInfo &DCI,
-                               EVT SrcVT, EVT DestVT, EVT SubRegVT,
-                               const int *Mask, SDValue Src) {
-  SelectionDAG &DAG = DCI.DAG;
-  SDValue Bitcast
-    = DAG.getNode(ISD::BITCAST, SDLoc(N), SrcVT, Src);
-  SDValue Sext
-    = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), DestVT, Bitcast);
-  SDValue ShuffleVec
-    = DAG.getVectorShuffle(DestVT, SDLoc(N), Sext, DAG.getUNDEF(DestVT), Mask);
-  SDValue ExtractSubreg
-    = SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, SDLoc(N),
-                SubRegVT, ShuffleVec,
-                DAG.getTargetConstant(AArch64::sub_64, MVT::i32)), 0);
-  return ExtractSubreg;
-}
-
-/// Checks for vector shifts and lowers them.
-static SDValue PerformShiftCombine(SDNode *N,
-                                   TargetLowering::DAGCombinerInfo &DCI,
-                                   const AArch64Subtarget *ST) {
-  SelectionDAG &DAG = DCI.DAG;
-  EVT VT = N->getValueType(0);
-  if (N->getOpcode() == ISD::SRA && (VT == MVT::i32 || VT == MVT::i64))
-    return PerformSRACombine(N, DCI);
+// The folding we want to perform is:
+// (add x, [zext] (setcc cc ...) )
+//   -->
+// (csel x, (add x, 1), !cc ...)
+//
+// The latter will get matched to a CSINC instruction.
+static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG) {
+  assert(Op && Op->getOpcode() == ISD::ADD && "Unexpected operation!");
+  SDValue LHS = Op->getOperand(0);
+  SDValue RHS = Op->getOperand(1);
+  SetCCInfoAndKind InfoAndKind;
+
+  // If neither operand is a SET_CC, give up.
+  if (!isSetCCOrZExtSetCC(LHS, InfoAndKind)) {
+    std::swap(LHS, RHS);
+    if (!isSetCCOrZExtSetCC(LHS, InfoAndKind))
+      return SDValue();
+  }
 
-  // We're looking for an SRA/SHL pair to help generating instruction
-  //   sshll  v0.8h, v0.8b, #0
-  // The instruction STXL is also the alias of this instruction.
-  //
-  // For example, for DAG like below,
-  //   v2i32 = sra (v2i32 (shl v2i32, 16)), 16
-  // we can transform it into
-  //   v2i32 = EXTRACT_SUBREG 
-  //             (v4i32 (suffle_vector
-  //                       (v4i32 (sext (v4i16 (bitcast v2i32))), 
-  //                       undef, (0, 2, u, u)),
-  //             sub_64
-  //
-  // With this transformation we expect to generate "SSHLL + UZIP1"
-  // Sometimes UZIP1 can be optimized away by combining with other context.
-  int64_t ShrCnt, ShlCnt;
-  if (N->getOpcode() == ISD::SRA
-      && (VT == MVT::v2i32 || VT == MVT::v4i16)
-      && isVShiftRImm(N->getOperand(1), VT, ShrCnt)
-      && N->getOperand(0).getOpcode() == ISD::SHL
-      && isVShiftRImm(N->getOperand(0).getOperand(1), VT, ShlCnt)) {
-    SDValue Src = N->getOperand(0).getOperand(0);
-    if (VT == MVT::v2i32 && ShrCnt == 16 && ShlCnt == 16) {
-      // sext_inreg(v2i32, v2i16)
-      // We essentially only care the Mask {0, 2, u, u}
-      int Mask[4] = {0, 2, 4, 6};
-      return GenForSextInreg(N, DCI, MVT::v4i16, MVT::v4i32, MVT::v2i32,
-                             Mask, Src); 
-    }
-    else if (VT == MVT::v2i32 && ShrCnt == 24 && ShlCnt == 24) {
-      // sext_inreg(v2i16, v2i8)
-      // We essentially only care the Mask {0, u, 4, u, u, u, u, u, u, u, u, u}
-      int Mask[8] = {0, 2, 4, 6, 8, 10, 12, 14};
-      return GenForSextInreg(N, DCI, MVT::v8i8, MVT::v8i16, MVT::v2i32,
-                             Mask, Src);
-    }
-    else if (VT == MVT::v4i16 && ShrCnt == 8 && ShlCnt == 8) {
-      // sext_inreg(v4i16, v4i8)
-      // We essentially only care the Mask {0, 2, 4, 6, u, u, u, u, u, u, u, u}
-      int Mask[8] = {0, 2, 4, 6, 8, 10, 12, 14};
-      return GenForSextInreg(N, DCI, MVT::v8i8, MVT::v8i16, MVT::v4i16,
-                             Mask, Src);
-    }
+  // FIXME: This could be generatized to work for FP comparisons.
+  EVT CmpVT = InfoAndKind.IsAArch64
+                  ? InfoAndKind.Info.AArch64.Cmp->getOperand(0).getValueType()
+                  : InfoAndKind.Info.Generic.Opnd0->getValueType();
+  if (CmpVT != MVT::i32 && CmpVT != MVT::i64)
+    return SDValue();
+
+  SDValue CCVal;
+  SDValue Cmp;
+  SDLoc dl(Op);
+  if (InfoAndKind.IsAArch64) {
+    CCVal = DAG.getConstant(
+        AArch64CC::getInvertedCondCode(InfoAndKind.Info.AArch64.CC), MVT::i32);
+    Cmp = *InfoAndKind.Info.AArch64.Cmp;
+  } else
+    Cmp = getAArch64Cmp(*InfoAndKind.Info.Generic.Opnd0,
+                      *InfoAndKind.Info.Generic.Opnd1,
+                      ISD::getSetCCInverse(InfoAndKind.Info.Generic.CC, true),
+                      CCVal, DAG, dl);
+
+  EVT VT = Op->getValueType(0);
+  LHS = DAG.getNode(ISD::ADD, dl, VT, RHS, DAG.getConstant(1, VT));
+  return DAG.getNode(AArch64ISD::CSEL, dl, VT, RHS, LHS, CCVal, Cmp);
+}
+
+// The basic add/sub long vector instructions have variants with "2" on the end
+// which act on the high-half of their inputs. They are normally matched by
+// patterns like:
+//
+// (add (zeroext (extract_high LHS)),
+//      (zeroext (extract_high RHS)))
+// -> uaddl2 vD, vN, vM
+//
+// However, if one of the extracts is something like a duplicate, this
+// instruction can still be used profitably. This function puts the DAG into a
+// more appropriate form for those patterns to trigger.
+static SDValue performAddSubLongCombine(SDNode *N,
+                                        TargetLowering::DAGCombinerInfo &DCI,
+                                        SelectionDAG &DAG) {
+  if (DCI.isBeforeLegalizeOps())
+    return SDValue();
+
+  MVT VT = N->getSimpleValueType(0);
+  if (!VT.is128BitVector()) {
+    if (N->getOpcode() == ISD::ADD)
+      return performSetccAddFolding(N, DAG);
+    return SDValue();
   }
 
-  // Nothing to be done for scalar shifts.
-  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-  if (!VT.isVector() || !TLI.isTypeLegal(VT))
+  // Make sure both branches are extended in the same way.
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+  if ((LHS.getOpcode() != ISD::ZERO_EXTEND &&
+       LHS.getOpcode() != ISD::SIGN_EXTEND) ||
+      LHS.getOpcode() != RHS.getOpcode())
     return SDValue();
 
-  assert(ST->hasNEON() && "unexpected vector shift");
-  int64_t Cnt;
+  unsigned ExtType = LHS.getOpcode();
 
-  switch (N->getOpcode()) {
-  default:
-    llvm_unreachable("unexpected shift opcode");
+  // It's not worth doing if at least one of the inputs isn't already an
+  // extract, but we don't know which it'll be so we have to try both.
+  if (isEssentiallyExtractSubvector(LHS.getOperand(0))) {
+    RHS = tryExtendDUPToExtractHigh(RHS.getOperand(0), DAG);
+    if (!RHS.getNode())
+      return SDValue();
 
-  case ISD::SHL:
-    if (isVShiftLImm(N->getOperand(1), VT, Cnt)) {
-      SDValue RHS =
-          DAG.getNode(AArch64ISD::NEON_VDUP, SDLoc(N->getOperand(1)), VT,
-                      DAG.getConstant(Cnt, MVT::i32));
-      return DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0), RHS);
-    }
-    break;
+    RHS = DAG.getNode(ExtType, SDLoc(N), VT, RHS);
+  } else if (isEssentiallyExtractSubvector(RHS.getOperand(0))) {
+    LHS = tryExtendDUPToExtractHigh(LHS.getOperand(0), DAG);
+    if (!LHS.getNode())
+      return SDValue();
 
-  case ISD::SRA:
-  case ISD::SRL:
-    if (isVShiftRImm(N->getOperand(1), VT, Cnt)) {
-      SDValue RHS =
-          DAG.getNode(AArch64ISD::NEON_VDUP, SDLoc(N->getOperand(1)), VT,
-                      DAG.getConstant(Cnt, MVT::i32));
-      return DAG.getNode(N->getOpcode(), SDLoc(N), VT, N->getOperand(0), RHS);
-    }
+    LHS = DAG.getNode(ExtType, SDLoc(N), VT, LHS);
+  }
+
+  return DAG.getNode(N->getOpcode(), SDLoc(N), VT, LHS, RHS);
+}
+
+// Massage DAGs which we can use the high-half "long" operations on into
+// something isel will recognize better. E.g.
+//
+// (aarch64_neon_umull (extract_high vec) (dupv64 scalar)) -->
+//   (aarch64_neon_umull (extract_high (v2i64 vec)))
+//                     (extract_high (v2i64 (dup128 scalar)))))
+//
+static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N,
+                                       TargetLowering::DAGCombinerInfo &DCI,
+                                       SelectionDAG &DAG) {
+  if (DCI.isBeforeLegalizeOps())
+    return SDValue();
+
+  SDValue LHS = N->getOperand(1);
+  SDValue RHS = N->getOperand(2);
+  assert(LHS.getValueType().is64BitVector() &&
+         RHS.getValueType().is64BitVector() &&
+         "unexpected shape for long operation");
+
+  // Either node could be a DUP, but it's not worth doing both of them (you'd
+  // just as well use the non-high version) so look for a corresponding extract
+  // operation on the other "wing".
+  if (isEssentiallyExtractSubvector(LHS)) {
+    RHS = tryExtendDUPToExtractHigh(RHS, DAG);
+    if (!RHS.getNode())
+      return SDValue();
+  } else if (isEssentiallyExtractSubvector(RHS)) {
+    LHS = tryExtendDUPToExtractHigh(LHS, DAG);
+    if (!LHS.getNode())
+      return SDValue();
+  }
+
+  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), N->getValueType(0),
+                     N->getOperand(0), LHS, RHS);
+}
+
+static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) {
+  MVT ElemTy = N->getSimpleValueType(0).getScalarType();
+  unsigned ElemBits = ElemTy.getSizeInBits();
+
+  int64_t ShiftAmount;
+  if (BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(2))) {
+    APInt SplatValue, SplatUndef;
+    unsigned SplatBitSize;
+    bool HasAnyUndefs;
+    if (!BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
+                              HasAnyUndefs, ElemBits) ||
+        SplatBitSize != ElemBits)
+      return SDValue();
+
+    ShiftAmount = SplatValue.getSExtValue();
+  } else if (ConstantSDNode *CVN = dyn_cast<ConstantSDNode>(N->getOperand(2))) {
+    ShiftAmount = CVN->getSExtValue();
+  } else
+    return SDValue();
+
+  unsigned Opcode;
+  bool IsRightShift;
+  switch (IID) {
+  default:
+    llvm_unreachable("Unknown shift intrinsic");
+  case Intrinsic::aarch64_neon_sqshl:
+    Opcode = AArch64ISD::SQSHL_I;
+    IsRightShift = false;
+    break;
+  case Intrinsic::aarch64_neon_uqshl:
+    Opcode = AArch64ISD::UQSHL_I;
+    IsRightShift = false;
+    break;
+  case Intrinsic::aarch64_neon_srshl:
+    Opcode = AArch64ISD::SRSHR_I;
+    IsRightShift = true;
+    break;
+  case Intrinsic::aarch64_neon_urshl:
+    Opcode = AArch64ISD::URSHR_I;
+    IsRightShift = true;
+    break;
+  case Intrinsic::aarch64_neon_sqshlu:
+    Opcode = AArch64ISD::SQSHLU_I;
+    IsRightShift = false;
     break;
   }
 
+  if (IsRightShift && ShiftAmount <= -1 && ShiftAmount >= -(int)ElemBits)
+    return DAG.getNode(Opcode, SDLoc(N), N->getValueType(0), N->getOperand(1),
+                       DAG.getConstant(-ShiftAmount, MVT::i32));
+  else if (!IsRightShift && ShiftAmount >= 0 && ShiftAmount <= ElemBits)
+    return DAG.getNode(Opcode, SDLoc(N), N->getValueType(0), N->getOperand(1),
+                       DAG.getConstant(ShiftAmount, MVT::i32));
+
   return SDValue();
 }
 
-/// ARM-specific DAG combining for intrinsics.
-static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) {
-  unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
+// The CRC32[BH] instructions ignore the high bits of their data operand. Since
+// the intrinsics must be legal and take an i32, this means there's almost
+// certainly going to be a zext in the DAG which we can eliminate.
+static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG) {
+  SDValue AndN = N->getOperand(2);
+  if (AndN.getOpcode() != ISD::AND)
+    return SDValue();
+
+  ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(AndN.getOperand(1));
+  if (!CMask || CMask->getZExtValue() != Mask)
+    return SDValue();
+
+  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), MVT::i32,
+                     N->getOperand(0), N->getOperand(1), AndN.getOperand(0));
+}
 
-  switch (IntNo) {
+static SDValue performIntrinsicCombine(SDNode *N,
+                                       TargetLowering::DAGCombinerInfo &DCI,
+                                       const AArch64Subtarget *Subtarget) {
+  SelectionDAG &DAG = DCI.DAG;
+  unsigned IID = getIntrinsicID(N);
+  switch (IID) {
   default:
-    // Don't do anything for most intrinsics.
     break;
+  case Intrinsic::aarch64_neon_vcvtfxs2fp:
+  case Intrinsic::aarch64_neon_vcvtfxu2fp:
+    return tryCombineFixedPointConvert(N, DCI, DAG);
+    break;
+  case Intrinsic::aarch64_neon_fmax:
+    return DAG.getNode(AArch64ISD::FMAX, SDLoc(N), N->getValueType(0),
+                       N->getOperand(1), N->getOperand(2));
+  case Intrinsic::aarch64_neon_fmin:
+    return DAG.getNode(AArch64ISD::FMIN, SDLoc(N), N->getValueType(0),
+                       N->getOperand(1), N->getOperand(2));
+  case Intrinsic::aarch64_neon_smull:
+  case Intrinsic::aarch64_neon_umull:
+  case Intrinsic::aarch64_neon_pmull:
+  case Intrinsic::aarch64_neon_sqdmull:
+    return tryCombineLongOpWithDup(IID, N, DCI, DAG);
+  case Intrinsic::aarch64_neon_sqshl:
+  case Intrinsic::aarch64_neon_uqshl:
+  case Intrinsic::aarch64_neon_sqshlu:
+  case Intrinsic::aarch64_neon_srshl:
+  case Intrinsic::aarch64_neon_urshl:
+    return tryCombineShiftImm(IID, N, DAG);
+  case Intrinsic::aarch64_crc32b:
+  case Intrinsic::aarch64_crc32cb:
+    return tryCombineCRC32(0xff, N, DAG);
+  case Intrinsic::aarch64_crc32h:
+  case Intrinsic::aarch64_crc32ch:
+    return tryCombineCRC32(0xffff, N, DAG);
+  }
+  return SDValue();
+}
 
-  case Intrinsic::arm_neon_vqshifts:
-  case Intrinsic::arm_neon_vqshiftu:
-    EVT VT = N->getOperand(1).getValueType();
-    int64_t Cnt;
-    if (!isVShiftLImm(N->getOperand(2), VT, Cnt))
-      break;
-    unsigned VShiftOpc = (IntNo == Intrinsic::arm_neon_vqshifts)
-                             ? AArch64ISD::NEON_QSHLs
-                             : AArch64ISD::NEON_QSHLu;
-    return DAG.getNode(VShiftOpc, SDLoc(N), N->getValueType(0),
-                       N->getOperand(1), DAG.getConstant(Cnt, MVT::i32));
+static SDValue performExtendCombine(SDNode *N,
+                                    TargetLowering::DAGCombinerInfo &DCI,
+                                    SelectionDAG &DAG) {
+  // If we see something like (zext (sabd (extract_high ...), (DUP ...))) then
+  // we can convert that DUP into another extract_high (of a bigger DUP), which
+  // helps the backend to decide that an sabdl2 would be useful, saving a real
+  // extract_high operation.
+  if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ZERO_EXTEND &&
+      N->getOperand(0).getOpcode() == ISD::INTRINSIC_WO_CHAIN) {
+    SDNode *ABDNode = N->getOperand(0).getNode();
+    unsigned IID = getIntrinsicID(ABDNode);
+    if (IID == Intrinsic::aarch64_neon_sabd ||
+        IID == Intrinsic::aarch64_neon_uabd) {
+      SDValue NewABD = tryCombineLongOpWithDup(IID, ABDNode, DCI, DAG);
+      if (!NewABD.getNode())
+        return SDValue();
+
+      return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0),
+                         NewABD);
+    }
+  }
+
+  // This is effectively a custom type legalization for AArch64.
+  //
+  // Type legalization will split an extend of a small, legal, type to a larger
+  // illegal type by first splitting the destination type, often creating
+  // illegal source types, which then get legalized in isel-confusing ways,
+  // leading to really terrible codegen. E.g.,
+  //   %result = v8i32 sext v8i8 %value
+  // becomes
+  //   %losrc = extract_subreg %value, ...
+  //   %hisrc = extract_subreg %value, ...
+  //   %lo = v4i32 sext v4i8 %losrc
+  //   %hi = v4i32 sext v4i8 %hisrc
+  // Things go rapidly downhill from there.
+  //
+  // For AArch64, the [sz]ext vector instructions can only go up one element
+  // size, so we can, e.g., extend from i8 to i16, but to go from i8 to i32
+  // take two instructions.
+  //
+  // This implies that the most efficient way to do the extend from v8i8
+  // to two v4i32 values is to first extend the v8i8 to v8i16, then do
+  // the normal splitting to happen for the v8i16->v8i32.
+
+  // This is pre-legalization to catch some cases where the default
+  // type legalization will create ill-tempered code.
+  if (!DCI.isBeforeLegalizeOps())
+    return SDValue();
+
+  // We're only interested in cleaning things up for non-legal vector types
+  // here. If both the source and destination are legal, things will just
+  // work naturally without any fiddling.
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  EVT ResVT = N->getValueType(0);
+  if (!ResVT.isVector() || TLI.isTypeLegal(ResVT))
+    return SDValue();
+  // If the vector type isn't a simple VT, it's beyond the scope of what
+  // we're  worried about here. Let legalization do its thing and hope for
+  // the best.
+  if (!ResVT.isSimple())
+    return SDValue();
+
+  SDValue Src = N->getOperand(0);
+  MVT SrcVT = Src->getValueType(0).getSimpleVT();
+  // If the source VT is a 64-bit vector, we can play games and get the
+  // better results we want.
+  if (SrcVT.getSizeInBits() != 64)
+    return SDValue();
+
+  unsigned SrcEltSize = SrcVT.getVectorElementType().getSizeInBits();
+  unsigned ElementCount = SrcVT.getVectorNumElements();
+  SrcVT = MVT::getVectorVT(MVT::getIntegerVT(SrcEltSize * 2), ElementCount);
+  SDLoc DL(N);
+  Src = DAG.getNode(N->getOpcode(), DL, SrcVT, Src);
+
+  // Now split the rest of the operation into two halves, each with a 64
+  // bit source.
+  EVT LoVT, HiVT;
+  SDValue Lo, Hi;
+  unsigned NumElements = ResVT.getVectorNumElements();
+  assert(!(NumElements & 1) && "Splitting vector, but not in half!");
+  LoVT = HiVT = EVT::getVectorVT(*DAG.getContext(),
+                                 ResVT.getVectorElementType(), NumElements / 2);
+
+  EVT InNVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getVectorElementType(),
+                               LoVT.getVectorNumElements());
+  Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src,
+                   DAG.getIntPtrConstant(0));
+  Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src,
+                   DAG.getIntPtrConstant(InNVT.getVectorNumElements()));
+  Lo = DAG.getNode(N->getOpcode(), DL, LoVT, Lo);
+  Hi = DAG.getNode(N->getOpcode(), DL, HiVT, Hi);
+
+  // Now combine the parts back together so we still have a single result
+  // like the combiner expects.
+  return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Lo, Hi);
+}
+
+/// Replace a splat of a scalar to a vector store by scalar stores of the scalar
+/// value. The load store optimizer pass will merge them to store pair stores.
+/// This has better performance than a splat of the scalar followed by a split
+/// vector store. Even if the stores are not merged it is four stores vs a dup,
+/// followed by an ext.b and two stores.
+static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode *St) {
+  SDValue StVal = St->getValue();
+  EVT VT = StVal.getValueType();
+
+  // Don't replace floating point stores, they possibly won't be transformed to
+  // stp because of the store pair suppress pass.
+  if (VT.isFloatingPoint())
+    return SDValue();
+
+  // Check for insert vector elements.
+  if (StVal.getOpcode() != ISD::INSERT_VECTOR_ELT)
+    return SDValue();
+
+  // We can express a splat as store pair(s) for 2 or 4 elements.
+  unsigned NumVecElts = VT.getVectorNumElements();
+  if (NumVecElts != 4 && NumVecElts != 2)
+    return SDValue();
+  SDValue SplatVal = StVal.getOperand(1);
+  unsigned RemainInsertElts = NumVecElts - 1;
+
+  // Check that this is a splat.
+  while (--RemainInsertElts) {
+    SDValue NextInsertElt = StVal.getOperand(0);
+    if (NextInsertElt.getOpcode() != ISD::INSERT_VECTOR_ELT)
+      return SDValue();
+    if (NextInsertElt.getOperand(1) != SplatVal)
+      return SDValue();
+    StVal = NextInsertElt;
+  }
+  unsigned OrigAlignment = St->getAlignment();
+  unsigned EltOffset = NumVecElts == 4 ? 4 : 8;
+  unsigned Alignment = std::min(OrigAlignment, EltOffset);
+
+  // Create scalar stores. This is at least as good as the code sequence for a
+  // split unaligned store wich is a dup.s, ext.b, and two stores.
+  // Most of the time the three stores should be replaced by store pair
+  // instructions (stp).
+  SDLoc DL(St);
+  SDValue BasePtr = St->getBasePtr();
+  SDValue NewST1 =
+      DAG.getStore(St->getChain(), DL, SplatVal, BasePtr, St->getPointerInfo(),
+                   St->isVolatile(), St->isNonTemporal(), St->getAlignment());
+
+  unsigned Offset = EltOffset;
+  while (--NumVecElts) {
+    SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
+                                    DAG.getConstant(Offset, MVT::i64));
+    NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr,
+                          St->getPointerInfo(), St->isVolatile(),
+                          St->isNonTemporal(), Alignment);
+    Offset += EltOffset;
+  }
+  return NewST1;
+}
+
+static SDValue performSTORECombine(SDNode *N,
+                                   TargetLowering::DAGCombinerInfo &DCI,
+                                   SelectionDAG &DAG,
+                                   const AArch64Subtarget *Subtarget) {
+  if (!DCI.isBeforeLegalize())
+    return SDValue();
+
+  StoreSDNode *S = cast<StoreSDNode>(N);
+  if (S->isVolatile())
+    return SDValue();
+
+  // Cyclone has bad performance on unaligned 16B stores when crossing line and
+  // page boundries. We want to split such stores.
+  if (!Subtarget->isCyclone())
+    return SDValue();
+
+  // Don't split at Oz.
+  MachineFunction &MF = DAG.getMachineFunction();
+  bool IsMinSize = MF.getFunction()->getAttributes().hasAttribute(
+      AttributeSet::FunctionIndex, Attribute::MinSize);
+  if (IsMinSize)
+    return SDValue();
+
+  SDValue StVal = S->getValue();
+  EVT VT = StVal.getValueType();
+
+  // Don't split v2i64 vectors. Memcpy lowering produces those and splitting
+  // those up regresses performance on micro-benchmarks and olden/bh.
+  if (!VT.isVector() || VT.getVectorNumElements() < 2 || VT == MVT::v2i64)
+    return SDValue();
+
+  // Split unaligned 16B stores. They are terrible for performance.
+  // Don't split stores with alignment of 1 or 2. Code that uses clang vector
+  // extensions can use this to mark that it does not want splitting to happen
+  // (by underspecifying alignment to be 1 or 2). Furthermore, the chance of
+  // eliminating alignment hazards is only 1 in 8 for alignment of 2.
+  if (VT.getSizeInBits() != 128 || S->getAlignment() >= 16 ||
+      S->getAlignment() <= 2)
+    return SDValue();
+
+  // If we get a splat of a scalar convert this vector store to a store of
+  // scalars. They will be merged into store pairs thereby removing two
+  // instructions.
+  SDValue ReplacedSplat = replaceSplatVectorStore(DAG, S);
+  if (ReplacedSplat != SDValue())
+    return ReplacedSplat;
+
+  SDLoc DL(S);
+  unsigned NumElts = VT.getVectorNumElements() / 2;
+  // Split VT into two.
+  EVT HalfVT =
+      EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), NumElts);
+  SDValue SubVector0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
+                                   DAG.getIntPtrConstant(0));
+  SDValue SubVector1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
+                                   DAG.getIntPtrConstant(NumElts));
+  SDValue BasePtr = S->getBasePtr();
+  SDValue NewST1 =
+      DAG.getStore(S->getChain(), DL, SubVector0, BasePtr, S->getPointerInfo(),
+                   S->isVolatile(), S->isNonTemporal(), S->getAlignment());
+  SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
+                                  DAG.getConstant(8, MVT::i64));
+  return DAG.getStore(NewST1.getValue(0), DL, SubVector1, OffsetPtr,
+                      S->getPointerInfo(), S->isVolatile(), S->isNonTemporal(),
+                      S->getAlignment());
+}
+
+/// Target-specific DAG combine function for post-increment LD1 (lane) and
+/// post-increment LD1R.
+static SDValue performPostLD1Combine(SDNode *N,
+                                     TargetLowering::DAGCombinerInfo &DCI,
+                                     bool IsLaneOp) {
+  if (DCI.isBeforeLegalizeOps())
+    return SDValue();
+
+  SelectionDAG &DAG = DCI.DAG;
+  EVT VT = N->getValueType(0);
+
+  unsigned LoadIdx = IsLaneOp ? 1 : 0;
+  SDNode *LD = N->getOperand(LoadIdx).getNode();
+  // If it is not LOAD, can not do such combine.
+  if (LD->getOpcode() != ISD::LOAD)
+    return SDValue();
+
+  LoadSDNode *LoadSDN = cast<LoadSDNode>(LD);
+  EVT MemVT = LoadSDN->getMemoryVT();
+  // Check if memory operand is the same type as the vector element.
+  if (MemVT != VT.getVectorElementType())
+    return SDValue();
+
+  // Check if there are other uses. If so, do not combine as it will introduce
+  // an extra load.
+  for (SDNode::use_iterator UI = LD->use_begin(), UE = LD->use_end(); UI != UE;
+       ++UI) {
+    if (UI.getUse().getResNo() == 1) // Ignore uses of the chain result.
+      continue;
+    if (*UI != N)
+      return SDValue();
   }
 
+  SDValue Addr = LD->getOperand(1);
+  SDValue Vector = N->getOperand(0);
+  // Search for a use of the address operand that is an increment.
+  for (SDNode::use_iterator UI = Addr.getNode()->use_begin(), UE =
+       Addr.getNode()->use_end(); UI != UE; ++UI) {
+    SDNode *User = *UI;
+    if (User->getOpcode() != ISD::ADD
+        || UI.getUse().getResNo() != Addr.getResNo())
+      continue;
+
+    // Check that the add is independent of the load.  Otherwise, folding it
+    // would create a cycle.
+    if (User->isPredecessorOf(LD) || LD->isPredecessorOf(User))
+      continue;
+    // Also check that add is not used in the vector operand.  This would also
+    // create a cycle.
+    if (User->isPredecessorOf(Vector.getNode()))
+      continue;
+
+    // If the increment is a constant, it must match the memory ref size.
+    SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
+    if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
+      uint32_t IncVal = CInc->getZExtValue();
+      unsigned NumBytes = VT.getScalarSizeInBits() / 8;
+      if (IncVal != NumBytes)
+        continue;
+      Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
+    }
+
+    SmallVector<SDValue, 8> Ops;
+    Ops.push_back(LD->getOperand(0));  // Chain
+    if (IsLaneOp) {
+      Ops.push_back(Vector);           // The vector to be inserted
+      Ops.push_back(N->getOperand(2)); // The lane to be inserted in the vector
+    }
+    Ops.push_back(Addr);
+    Ops.push_back(Inc);
+
+    EVT Tys[3] = { VT, MVT::i64, MVT::Other };
+    SDVTList SDTys = DAG.getVTList(ArrayRef<EVT>(Tys, 3));
+    unsigned NewOp = IsLaneOp ? AArch64ISD::LD1LANEpost : AArch64ISD::LD1DUPpost;
+    SDValue UpdN = DAG.getMemIntrinsicNode(NewOp, SDLoc(N), SDTys, Ops,
+                                           MemVT,
+                                           LoadSDN->getMemOperand());
+
+    // Update the uses.
+    std::vector<SDValue> NewResults;
+    NewResults.push_back(SDValue(LD, 0));             // The result of load
+    NewResults.push_back(SDValue(UpdN.getNode(), 2)); // Chain
+    DCI.CombineTo(LD, NewResults);
+    DCI.CombineTo(N, SDValue(UpdN.getNode(), 0));     // Dup/Inserted Result
+    DCI.CombineTo(User, SDValue(UpdN.getNode(), 1));  // Write back register
+
+    break;
+  }
   return SDValue();
 }
 
 /// Target-specific DAG combine function for NEON load/store intrinsics
 /// to merge base address updates.
-static SDValue CombineBaseUpdate(SDNode *N,
-                                 TargetLowering::DAGCombinerInfo &DCI) {
+static SDValue performNEONPostLDSTCombine(SDNode *N,
+                                          TargetLowering::DAGCombinerInfo &DCI,
+                                          SelectionDAG &DAG) {
   if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
     return SDValue();
 
-  SelectionDAG &DAG = DCI.DAG;
-  bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID ||
-                      N->getOpcode() == ISD::INTRINSIC_W_CHAIN);
-  unsigned AddrOpIdx = (isIntrinsic ? 2 : 1);
+  unsigned AddrOpIdx = N->getNumOperands() - 1;
   SDValue Addr = N->getOperand(AddrOpIdx);
 
   // Search for a use of the address operand that is an increment.
@@ -4090,106 +7423,96 @@ static SDValue CombineBaseUpdate(SDNode *N,
       continue;
 
     // Find the new opcode for the updating load/store.
-    bool isLoad = true;
-    bool isLaneOp = false;
+    bool IsStore = false;
+    bool IsLaneOp = false;
+    bool IsDupOp = false;
     unsigned NewOpc = 0;
     unsigned NumVecs = 0;
-    if (isIntrinsic) {
-      unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
-      switch (IntNo) {
-      default: llvm_unreachable("unexpected intrinsic for Neon base update");
-      case Intrinsic::arm_neon_vld1:       NewOpc = AArch64ISD::NEON_LD1_UPD;
-        NumVecs = 1; break;
-      case Intrinsic::arm_neon_vld2:       NewOpc = AArch64ISD::NEON_LD2_UPD;
-        NumVecs = 2; break;
-      case Intrinsic::arm_neon_vld3:       NewOpc = AArch64ISD::NEON_LD3_UPD;
-        NumVecs = 3; break;
-      case Intrinsic::arm_neon_vld4:       NewOpc = AArch64ISD::NEON_LD4_UPD;
-        NumVecs = 4; break;
-      case Intrinsic::arm_neon_vst1:       NewOpc = AArch64ISD::NEON_ST1_UPD;
-        NumVecs = 1; isLoad = false; break;
-      case Intrinsic::arm_neon_vst2:       NewOpc = AArch64ISD::NEON_ST2_UPD;
-        NumVecs = 2; isLoad = false; break;
-      case Intrinsic::arm_neon_vst3:       NewOpc = AArch64ISD::NEON_ST3_UPD;
-        NumVecs = 3; isLoad = false; break;
-      case Intrinsic::arm_neon_vst4:       NewOpc = AArch64ISD::NEON_ST4_UPD;
-        NumVecs = 4; isLoad = false; break;
-      case Intrinsic::aarch64_neon_vld1x2: NewOpc = AArch64ISD::NEON_LD1x2_UPD;
-        NumVecs = 2; break;
-      case Intrinsic::aarch64_neon_vld1x3: NewOpc = AArch64ISD::NEON_LD1x3_UPD;
-        NumVecs = 3; break;
-      case Intrinsic::aarch64_neon_vld1x4: NewOpc = AArch64ISD::NEON_LD1x4_UPD;
-        NumVecs = 4; break;
-      case Intrinsic::aarch64_neon_vst1x2: NewOpc = AArch64ISD::NEON_ST1x2_UPD;
-        NumVecs = 2; isLoad = false; break;
-      case Intrinsic::aarch64_neon_vst1x3: NewOpc = AArch64ISD::NEON_ST1x3_UPD;
-        NumVecs = 3; isLoad = false; break;
-      case Intrinsic::aarch64_neon_vst1x4: NewOpc = AArch64ISD::NEON_ST1x4_UPD;
-        NumVecs = 4; isLoad = false; break;
-      case Intrinsic::arm_neon_vld2lane:   NewOpc = AArch64ISD::NEON_LD2LN_UPD;
-        NumVecs = 2; isLaneOp = true; break;
-      case Intrinsic::arm_neon_vld3lane:   NewOpc = AArch64ISD::NEON_LD3LN_UPD;
-        NumVecs = 3; isLaneOp = true; break;
-      case Intrinsic::arm_neon_vld4lane:   NewOpc = AArch64ISD::NEON_LD4LN_UPD;
-        NumVecs = 4; isLaneOp = true; break;
-      case Intrinsic::arm_neon_vst2lane:   NewOpc = AArch64ISD::NEON_ST2LN_UPD;
-        NumVecs = 2; isLoad = false; isLaneOp = true; break;
-      case Intrinsic::arm_neon_vst3lane:   NewOpc = AArch64ISD::NEON_ST3LN_UPD;
-        NumVecs = 3; isLoad = false; isLaneOp = true; break;
-      case Intrinsic::arm_neon_vst4lane:   NewOpc = AArch64ISD::NEON_ST4LN_UPD;
-        NumVecs = 4; isLoad = false; isLaneOp = true; break;
-      }
-    } else {
-      isLaneOp = true;
-      switch (N->getOpcode()) {
-      default: llvm_unreachable("unexpected opcode for Neon base update");
-      case AArch64ISD::NEON_LD2DUP: NewOpc = AArch64ISD::NEON_LD2DUP_UPD;
-        NumVecs = 2; break;
-      case AArch64ISD::NEON_LD3DUP: NewOpc = AArch64ISD::NEON_LD3DUP_UPD;
-        NumVecs = 3; break;
-      case AArch64ISD::NEON_LD4DUP: NewOpc = AArch64ISD::NEON_LD4DUP_UPD;
-        NumVecs = 4; break;
-      }
+    unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+    switch (IntNo) {
+    default: llvm_unreachable("unexpected intrinsic for Neon base update");
+    case Intrinsic::aarch64_neon_ld2:       NewOpc = AArch64ISD::LD2post;
+      NumVecs = 2; break;
+    case Intrinsic::aarch64_neon_ld3:       NewOpc = AArch64ISD::LD3post;
+      NumVecs = 3; break;
+    case Intrinsic::aarch64_neon_ld4:       NewOpc = AArch64ISD::LD4post;
+      NumVecs = 4; break;
+    case Intrinsic::aarch64_neon_st2:       NewOpc = AArch64ISD::ST2post;
+      NumVecs = 2; IsStore = true; break;
+    case Intrinsic::aarch64_neon_st3:       NewOpc = AArch64ISD::ST3post;
+      NumVecs = 3; IsStore = true; break;
+    case Intrinsic::aarch64_neon_st4:       NewOpc = AArch64ISD::ST4post;
+      NumVecs = 4; IsStore = true; break;
+    case Intrinsic::aarch64_neon_ld1x2:     NewOpc = AArch64ISD::LD1x2post;
+      NumVecs = 2; break;
+    case Intrinsic::aarch64_neon_ld1x3:     NewOpc = AArch64ISD::LD1x3post;
+      NumVecs = 3; break;
+    case Intrinsic::aarch64_neon_ld1x4:     NewOpc = AArch64ISD::LD1x4post;
+      NumVecs = 4; break;
+    case Intrinsic::aarch64_neon_st1x2:     NewOpc = AArch64ISD::ST1x2post;
+      NumVecs = 2; IsStore = true; break;
+    case Intrinsic::aarch64_neon_st1x3:     NewOpc = AArch64ISD::ST1x3post;
+      NumVecs = 3; IsStore = true; break;
+    case Intrinsic::aarch64_neon_st1x4:     NewOpc = AArch64ISD::ST1x4post;
+      NumVecs = 4; IsStore = true; break;
+    case Intrinsic::aarch64_neon_ld2r:      NewOpc = AArch64ISD::LD2DUPpost;
+      NumVecs = 2; IsDupOp = true; break;
+    case Intrinsic::aarch64_neon_ld3r:      NewOpc = AArch64ISD::LD3DUPpost;
+      NumVecs = 3; IsDupOp = true; break;
+    case Intrinsic::aarch64_neon_ld4r:      NewOpc = AArch64ISD::LD4DUPpost;
+      NumVecs = 4; IsDupOp = true; break;
+    case Intrinsic::aarch64_neon_ld2lane:   NewOpc = AArch64ISD::LD2LANEpost;
+      NumVecs = 2; IsLaneOp = true; break;
+    case Intrinsic::aarch64_neon_ld3lane:   NewOpc = AArch64ISD::LD3LANEpost;
+      NumVecs = 3; IsLaneOp = true; break;
+    case Intrinsic::aarch64_neon_ld4lane:   NewOpc = AArch64ISD::LD4LANEpost;
+      NumVecs = 4; IsLaneOp = true; break;
+    case Intrinsic::aarch64_neon_st2lane:   NewOpc = AArch64ISD::ST2LANEpost;
+      NumVecs = 2; IsStore = true; IsLaneOp = true; break;
+    case Intrinsic::aarch64_neon_st3lane:   NewOpc = AArch64ISD::ST3LANEpost;
+      NumVecs = 3; IsStore = true; IsLaneOp = true; break;
+    case Intrinsic::aarch64_neon_st4lane:   NewOpc = AArch64ISD::ST4LANEpost;
+      NumVecs = 4; IsStore = true; IsLaneOp = true; break;
     }
 
-    // Find the size of memory referenced by the load/store.
     EVT VecTy;
-    if (isLoad)
-      VecTy = N->getValueType(0);
+    if (IsStore)
+      VecTy = N->getOperand(2).getValueType();
     else
-      VecTy = N->getOperand(AddrOpIdx + 1).getValueType();
-    unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
-    if (isLaneOp)
-      NumBytes /= VecTy.getVectorNumElements();
+      VecTy = N->getValueType(0);
 
     // If the increment is a constant, it must match the memory ref size.
     SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
     if (ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode())) {
       uint32_t IncVal = CInc->getZExtValue();
+      unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
+      if (IsLaneOp || IsDupOp)
+        NumBytes /= VecTy.getVectorNumElements();
       if (IncVal != NumBytes)
         continue;
-      Inc = DAG.getTargetConstant(IncVal, MVT::i32);
+      Inc = DAG.getRegister(AArch64::XZR, MVT::i64);
     }
+    SmallVector<SDValue, 8> Ops;
+    Ops.push_back(N->getOperand(0)); // Incoming chain
+    // Load lane and store have vector list as input.
+    if (IsLaneOp || IsStore)
+      for (unsigned i = 2; i < AddrOpIdx; ++i)
+        Ops.push_back(N->getOperand(i));
+    Ops.push_back(Addr); // Base register
+    Ops.push_back(Inc);
 
-    // Create the new updating load/store node.
+    // Return Types.
     EVT Tys[6];
-    unsigned NumResultVecs = (isLoad ? NumVecs : 0);
+    unsigned NumResultVecs = (IsStore ? 0 : NumVecs);
     unsigned n;
     for (n = 0; n < NumResultVecs; ++n)
       Tys[n] = VecTy;
-    Tys[n++] = MVT::i64;
-    Tys[n] = MVT::Other;
-    SDVTList SDTys = DAG.getVTList(Tys, NumResultVecs + 2);
-    SmallVector<SDValue, 8> Ops;
-    Ops.push_back(N->getOperand(0)); // incoming chain
-    Ops.push_back(N->getOperand(AddrOpIdx));
-    Ops.push_back(Inc);
-    for (unsigned i = AddrOpIdx + 1; i < N->getNumOperands(); ++i) {
-      Ops.push_back(N->getOperand(i));
-    }
+    Tys[n++] = MVT::i64;  // Type of write back register
+    Tys[n] = MVT::Other;  // Type of the chain
+    SDVTList SDTys = DAG.getVTList(ArrayRef<EVT>(Tys, NumResultVecs + 2));
+
     MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N);
-    SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys,
-                                           Ops.data(), Ops.size(),
+    SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys, Ops,
                                            MemInt->getMemoryVT(),
                                            MemInt->getMemOperand());
 
@@ -4198,7 +7521,7 @@ static SDValue CombineBaseUpdate(SDNode *N,
     for (unsigned i = 0; i < NumResultVecs; ++i) {
       NewResults.push_back(SDValue(UpdN.getNode(), i));
     }
-    NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1)); // chain
+    NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1));
     DCI.CombineTo(N, NewResults);
     DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
 
@@ -4207,107 +7530,58 @@ static SDValue CombineBaseUpdate(SDNode *N,
   return SDValue();
 }
 
-/// For a VDUPLANE node N, check if its source operand is a vldN-lane (N > 1)
-/// intrinsic, and if all the other uses of that intrinsic are also VDUPLANEs.
-/// If so, combine them to a vldN-dup operation and return true.
-static SDValue CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
-  SelectionDAG &DAG = DCI.DAG;
-  EVT VT = N->getValueType(0);
-
-  // Check if the VDUPLANE operand is a vldN-dup intrinsic.
-  SDNode *VLD = N->getOperand(0).getNode();
-  if (VLD->getOpcode() != ISD::INTRINSIC_W_CHAIN)
+// Optimize compare with zero and branch.
+static SDValue performBRCONDCombine(SDNode *N,
+                                    TargetLowering::DAGCombinerInfo &DCI,
+                                    SelectionDAG &DAG) {
+  SDValue Chain = N->getOperand(0);
+  SDValue Dest = N->getOperand(1);
+  SDValue CCVal = N->getOperand(2);
+  SDValue Cmp = N->getOperand(3);
+
+  assert(isa<ConstantSDNode>(CCVal) && "Expected a ConstantSDNode here!");
+  unsigned CC = cast<ConstantSDNode>(CCVal)->getZExtValue();
+  if (CC != AArch64CC::EQ && CC != AArch64CC::NE)
     return SDValue();
-  unsigned NumVecs = 0;
-  unsigned NewOpc = 0;
-  unsigned IntNo = cast<ConstantSDNode>(VLD->getOperand(1))->getZExtValue();
-  if (IntNo == Intrinsic::arm_neon_vld2lane) {
-    NumVecs = 2;
-    NewOpc = AArch64ISD::NEON_LD2DUP;
-  } else if (IntNo == Intrinsic::arm_neon_vld3lane) {
-    NumVecs = 3;
-    NewOpc = AArch64ISD::NEON_LD3DUP;
-  } else if (IntNo == Intrinsic::arm_neon_vld4lane) {
-    NumVecs = 4;
-    NewOpc = AArch64ISD::NEON_LD4DUP;
-  } else {
+
+  unsigned CmpOpc = Cmp.getOpcode();
+  if (CmpOpc != AArch64ISD::ADDS && CmpOpc != AArch64ISD::SUBS)
     return SDValue();
-  }
 
-  // First check that all the vldN-lane uses are VDUPLANEs and that the lane
-  // numbers match the load.
-  unsigned VLDLaneNo =
-      cast<ConstantSDNode>(VLD->getOperand(NumVecs + 3))->getZExtValue();
-  for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();
-       UI != UE; ++UI) {
-    // Ignore uses of the chain result.
-    if (UI.getUse().getResNo() == NumVecs)
-      continue;
-    SDNode *User = *UI;
-    if (User->getOpcode() != AArch64ISD::NEON_VDUPLANE ||
-        VLDLaneNo != cast<ConstantSDNode>(User->getOperand(1))->getZExtValue())
-      return SDValue();
-  }
+  // Only attempt folding if there is only one use of the flag and no use of the
+  // value.
+  if (!Cmp->hasNUsesOfValue(0, 0) || !Cmp->hasNUsesOfValue(1, 1))
+    return SDValue();
 
-  // Create the vldN-dup node.
-  EVT Tys[5];
-  unsigned n;
-  for (n = 0; n < NumVecs; ++n)
-    Tys[n] = VT;
-  Tys[n] = MVT::Other;
-  SDVTList SDTys = DAG.getVTList(Tys, NumVecs + 1);
-  SDValue Ops[] = { VLD->getOperand(0), VLD->getOperand(2) };
-  MemIntrinsicSDNode *VLDMemInt = cast<MemIntrinsicSDNode>(VLD);
-  SDValue VLDDup = DAG.getMemIntrinsicNode(NewOpc, SDLoc(VLD), SDTys, Ops, 2,
-                                           VLDMemInt->getMemoryVT(),
-                                           VLDMemInt->getMemOperand());
-
-  // Update the uses.
-  for (SDNode::use_iterator UI = VLD->use_begin(), UE = VLD->use_end();
-       UI != UE; ++UI) {
-    unsigned ResNo = UI.getUse().getResNo();
-    // Ignore uses of the chain result.
-    if (ResNo == NumVecs)
-      continue;
-    SDNode *User = *UI;
-    DCI.CombineTo(User, SDValue(VLDDup.getNode(), ResNo));
-  }
+  SDValue LHS = Cmp.getOperand(0);
+  SDValue RHS = Cmp.getOperand(1);
 
-  // Now the vldN-lane intrinsic is dead except for its chain result.
-  // Update uses of the chain.
-  std::vector<SDValue> VLDDupResults;
-  for (unsigned n = 0; n < NumVecs; ++n)
-    VLDDupResults.push_back(SDValue(VLDDup.getNode(), n));
-  VLDDupResults.push_back(SDValue(VLDDup.getNode(), NumVecs));
-  DCI.CombineTo(VLD, VLDDupResults);
+  assert(LHS.getValueType() == RHS.getValueType() &&
+         "Expected the value type to be the same for both operands!");
+  if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
+    return SDValue();
 
-  return SDValue(N, 0);
-}
+  if (isa<ConstantSDNode>(LHS) && cast<ConstantSDNode>(LHS)->isNullValue())
+    std::swap(LHS, RHS);
 
-// v1i1 setcc ->
-//     v1i1 (bitcast (i1 setcc (extract_vector_elt, extract_vector_elt))
-// FIXME: Currently the type legalizer can't handle SETCC having v1i1 as result.
-// If it can legalize "v1i1 SETCC" correctly, no need to combine such SETCC.
-static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG) {
-  EVT ResVT = N->getValueType(0);
+  if (!isa<ConstantSDNode>(RHS) || !cast<ConstantSDNode>(RHS)->isNullValue())
+    return SDValue();
 
-  if (!ResVT.isVector() || ResVT.getVectorNumElements() != 1 ||
-      ResVT.getVectorElementType() != MVT::i1)
+  if (LHS.getOpcode() == ISD::SHL || LHS.getOpcode() == ISD::SRA ||
+      LHS.getOpcode() == ISD::SRL)
     return SDValue();
 
-  SDValue LHS = N->getOperand(0);
-  SDValue RHS = N->getOperand(1);
-  EVT CmpVT = LHS.getValueType();
-  LHS = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N),
-                    CmpVT.getVectorElementType(), LHS,
-                    DAG.getConstant(0, MVT::i64));
-  RHS = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N),
-                    CmpVT.getVectorElementType(), RHS,
-                    DAG.getConstant(0, MVT::i64));
-  SDValue SetCC =
-      DAG.getSetCC(SDLoc(N), MVT::i1, LHS, RHS,
-                   cast<CondCodeSDNode>(N->getOperand(2))->get());
-  return DAG.getNode(ISD::BITCAST, SDLoc(N), ResVT, SetCC);
+  // Fold the compare into the branch instruction.
+  SDValue BR;
+  if (CC == AArch64CC::EQ)
+    BR = DAG.getNode(AArch64ISD::CBZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
+  else
+    BR = DAG.getNode(AArch64ISD::CBNZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
+
+  // Do not add new nodes to DAG combiner worklist.
+  DCI.CombineTo(N, BR, false);
+
+  return SDValue();
 }
 
 // vselect (v1i1 setcc) ->
@@ -4315,7 +7589,7 @@ static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG) {
 // FIXME: Currently the type legalizer can't handle VSELECT having v1i1 as
 // condition. If it can legalize "VSELECT v1i1" correctly, no need to combine
 // such VSELECT.
-static SDValue PerformVSelectCombine(SDNode *N, SelectionDAG &DAG) {
+static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG) {
   SDValue N0 = N->getOperand(0);
   EVT CCVT = N0.getValueType();
 
@@ -4340,79 +7614,109 @@ static SDValue PerformVSelectCombine(SDNode *N, SelectionDAG &DAG) {
                      IfTrue, IfFalse);
 }
 
-// sign_extend (extract_vector_elt (v1i1 setcc)) ->
-//     extract_vector_elt (v1iXX setcc)
-// (XX is the size of the compared operand type)
-static SDValue PerformSignExtendCombine(SDNode *N, SelectionDAG &DAG) {
+/// A vector select: "(select vL, vR, (setcc LHS, RHS))" is best performed with
+/// the compare-mask instructions rather than going via NZCV, even if LHS and
+/// RHS are really scalar. This replaces any scalar setcc in the above pattern
+/// with a vector one followed by a DUP shuffle on the result.
+static SDValue performSelectCombine(SDNode *N, SelectionDAG &DAG) {
   SDValue N0 = N->getOperand(0);
-  SDValue Vec = N0.getOperand(0);
+  EVT ResVT = N->getValueType(0);
 
-  if (N0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
-      Vec.getOpcode() != ISD::SETCC)
+  if (!N->getOperand(1).getValueType().isVector())
     return SDValue();
 
-  EVT ResVT = N->getValueType(0);
-  EVT CmpVT = Vec.getOperand(0).getValueType();
-  // Only optimize when the result type is of the same size as the element
-  // type of the compared operand.
-  if (ResVT.getSizeInBits() != CmpVT.getVectorElementType().getSizeInBits())
+  if (N0.getOpcode() != ISD::SETCC || N0.getValueType() != MVT::i1)
     return SDValue();
 
-  SDValue Lane = N0.getOperand(1);
-  SDValue SetCC =
-      DAG.getSetCC(SDLoc(N), CmpVT.changeVectorElementTypeToInteger(),
-                   Vec.getOperand(0), Vec.getOperand(1),
-                   cast<CondCodeSDNode>(Vec.getOperand(2))->get());
-  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), ResVT,
-                     SetCC, Lane);
+  SDLoc DL(N0);
+
+  EVT SrcVT = N0.getOperand(0).getValueType();
+  SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT,
+                           ResVT.getSizeInBits() / SrcVT.getSizeInBits());
+  EVT CCVT = SrcVT.changeVectorElementTypeToInteger();
+
+  // First perform a vector comparison, where lane 0 is the one we're interested
+  // in.
+  SDValue LHS =
+      DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(0));
+  SDValue RHS =
+      DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, SrcVT, N0.getOperand(1));
+  SDValue SetCC = DAG.getNode(ISD::SETCC, DL, CCVT, LHS, RHS, N0.getOperand(2));
+
+  // Now duplicate the comparison mask we want across all other lanes.
+  SmallVector<int, 8> DUPMask(CCVT.getVectorNumElements(), 0);
+  SDValue Mask = DAG.getVectorShuffle(CCVT, DL, SetCC, SetCC, DUPMask.data());
+  Mask = DAG.getNode(ISD::BITCAST, DL, ResVT.changeVectorElementTypeToInteger(),
+                     Mask);
+
+  return DAG.getSelect(DL, ResVT, Mask, N->getOperand(1), N->getOperand(2));
 }
 
-SDValue
-AArch64TargetLowering::PerformDAGCombine(SDNode *N,
-                                         DAGCombinerInfo &DCI) const {
+SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
+                                                 DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
   switch (N->getOpcode()) {
-  default: break;
-  case ISD::AND: return PerformANDCombine(N, DCI);
-  case ISD::OR: return PerformORCombine(N, DCI, getSubtarget());
-  case ISD::SHL:
-  case ISD::SRA:
-  case ISD::SRL:
-    return PerformShiftCombine(N, DCI, getSubtarget());
-  case ISD::SETCC: return PerformSETCCCombine(N, DCI.DAG);
-  case ISD::VSELECT: return PerformVSelectCombine(N, DCI.DAG);
-  case ISD::SIGN_EXTEND: return PerformSignExtendCombine(N, DCI.DAG);
+  default:
+    break;
+  case ISD::ADD:
+  case ISD::SUB:
+    return performAddSubLongCombine(N, DCI, DAG);
+  case ISD::XOR:
+    return performXorCombine(N, DAG, DCI, Subtarget);
+  case ISD::MUL:
+    return performMulCombine(N, DAG, DCI, Subtarget);
+  case ISD::SINT_TO_FP:
+  case ISD::UINT_TO_FP:
+    return performIntToFpCombine(N, DAG);
+  case ISD::OR:
+    return performORCombine(N, DCI, Subtarget);
   case ISD::INTRINSIC_WO_CHAIN:
-    return PerformIntrinsicCombine(N, DCI.DAG);
-  case AArch64ISD::NEON_VDUPLANE:
-    return CombineVLDDUP(N, DCI);
-  case AArch64ISD::NEON_LD2DUP:
-  case AArch64ISD::NEON_LD3DUP:
-  case AArch64ISD::NEON_LD4DUP:
-    return CombineBaseUpdate(N, DCI);
+    return performIntrinsicCombine(N, DCI, Subtarget);
+  case ISD::ANY_EXTEND:
+  case ISD::ZERO_EXTEND:
+  case ISD::SIGN_EXTEND:
+    return performExtendCombine(N, DCI, DAG);
+  case ISD::BITCAST:
+    return performBitcastCombine(N, DCI, DAG);
+  case ISD::CONCAT_VECTORS:
+    return performConcatVectorsCombine(N, DCI, DAG);
+  case ISD::SELECT:
+    return performSelectCombine(N, DAG);
+  case ISD::VSELECT:
+    return performVSelectCombine(N, DCI.DAG);
+  case ISD::STORE:
+    return performSTORECombine(N, DCI, DAG, Subtarget);
+  case AArch64ISD::BRCOND:
+    return performBRCONDCombine(N, DCI, DAG);
+  case AArch64ISD::DUP:
+    return performPostLD1Combine(N, DCI, false);
+  case ISD::INSERT_VECTOR_ELT:
+    return performPostLD1Combine(N, DCI, true);
   case ISD::INTRINSIC_VOID:
   case ISD::INTRINSIC_W_CHAIN:
     switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
-    case Intrinsic::arm_neon_vld1:
-    case Intrinsic::arm_neon_vld2:
-    case Intrinsic::arm_neon_vld3:
-    case Intrinsic::arm_neon_vld4:
-    case Intrinsic::arm_neon_vst1:
-    case Intrinsic::arm_neon_vst2:
-    case Intrinsic::arm_neon_vst3:
-    case Intrinsic::arm_neon_vst4:
-    case Intrinsic::arm_neon_vld2lane:
-    case Intrinsic::arm_neon_vld3lane:
-    case Intrinsic::arm_neon_vld4lane:
-    case Intrinsic::aarch64_neon_vld1x2:
-    case Intrinsic::aarch64_neon_vld1x3:
-    case Intrinsic::aarch64_neon_vld1x4:
-    case Intrinsic::aarch64_neon_vst1x2:
-    case Intrinsic::aarch64_neon_vst1x3:
-    case Intrinsic::aarch64_neon_vst1x4:
-    case Intrinsic::arm_neon_vst2lane:
-    case Intrinsic::arm_neon_vst3lane:
-    case Intrinsic::arm_neon_vst4lane:
-      return CombineBaseUpdate(N, DCI);
+    case Intrinsic::aarch64_neon_ld2:
+    case Intrinsic::aarch64_neon_ld3:
+    case Intrinsic::aarch64_neon_ld4:
+    case Intrinsic::aarch64_neon_ld1x2:
+    case Intrinsic::aarch64_neon_ld1x3:
+    case Intrinsic::aarch64_neon_ld1x4:
+    case Intrinsic::aarch64_neon_ld2lane:
+    case Intrinsic::aarch64_neon_ld3lane:
+    case Intrinsic::aarch64_neon_ld4lane:
+    case Intrinsic::aarch64_neon_ld2r:
+    case Intrinsic::aarch64_neon_ld3r:
+    case Intrinsic::aarch64_neon_ld4r:
+    case Intrinsic::aarch64_neon_st2:
+    case Intrinsic::aarch64_neon_st3:
+    case Intrinsic::aarch64_neon_st4:
+    case Intrinsic::aarch64_neon_st1x2:
+    case Intrinsic::aarch64_neon_st1x3:
+    case Intrinsic::aarch64_neon_st1x4:
+    case Intrinsic::aarch64_neon_st2lane:
+    case Intrinsic::aarch64_neon_st3lane:
+    case Intrinsic::aarch64_neon_st4lane:
+      return performNEONPostLDSTCombine(N, DCI, DAG);
     default:
       break;
     }
@@ -4420,979 +7724,214 @@ AArch64TargetLowering::PerformDAGCombine(SDNode *N,
   return SDValue();
 }
 
-bool
-AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
-  VT = VT.getScalarType();
-
-  if (!VT.isSimple())
-    return false;
-
-  switch (VT.getSimpleVT().SimpleTy) {
-  case MVT::f16:
-  case MVT::f32:
-  case MVT::f64:
-    return true;
-  case MVT::f128:
+// Check if the return value is used as only a return value, as otherwise
+// we can't perform a tail-call. In particular, we need to check for
+// target ISD nodes that are returns and any other "odd" constructs
+// that the generic analysis code won't necessarily catch.
+bool AArch64TargetLowering::isUsedByReturnOnly(SDNode *N,
+                                               SDValue &Chain) const {
+  if (N->getNumValues() != 1)
     return false;
-  default:
-    break;
-  }
-
-  return false;
-}
-// Check whether a shuffle_vector could be presented as concat_vector.
-bool AArch64TargetLowering::isConcatVector(SDValue Op, SelectionDAG &DAG,
-                                           SDValue V0, SDValue V1,
-                                           const int *Mask,
-                                           SDValue &Res) const {
-  SDLoc DL(Op);
-  EVT VT = Op.getValueType();
-  if (VT.getSizeInBits() != 128)
+  if (!N->hasNUsesOfValue(1, 0))
     return false;
-  if (VT.getVectorElementType() != V0.getValueType().getVectorElementType() ||
-      VT.getVectorElementType() != V1.getValueType().getVectorElementType())
-    return false;
-
-  unsigned NumElts = VT.getVectorNumElements();
-  bool isContactVector = true;
-  bool splitV0 = false;
-  if (V0.getValueType().getSizeInBits() == 128)
-    splitV0 = true;
-
-  for (int I = 0, E = NumElts / 2; I != E; I++) {
-    if (Mask[I] != I) {
-      isContactVector = false;
-      break;
-    }
-  }
-
-  if (isContactVector) {
-    int offset = NumElts / 2;
-    for (int I = NumElts / 2, E = NumElts; I != E; I++) {
-      if (Mask[I] != I + splitV0 * offset) {
-        isContactVector = false;
-        break;
-      }
-    }
-  }
-
-  if (isContactVector) {
-    EVT CastVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
-                                  NumElts / 2);
-    if (splitV0) {
-      V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V0,
-                       DAG.getConstant(0, MVT::i64));
-    }
-    if (V1.getValueType().getSizeInBits() == 128) {
-      V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, CastVT, V1,
-                       DAG.getConstant(0, MVT::i64));
-    }
-    Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, V0, V1);
-    return true;
-  }
-  return false;
-}
-
-// Check whether a Build Vector could be presented as Shuffle Vector.
-// This Shuffle Vector maybe not legalized, so the length of its operand and
-// the length of result may not equal.
-bool AArch64TargetLowering::isKnownShuffleVector(SDValue Op, SelectionDAG &DAG,
-                                                 SDValue &V0, SDValue &V1,
-                                                 int *Mask) const {
-  SDLoc DL(Op);
-  EVT VT = Op.getValueType();
-  unsigned NumElts = VT.getVectorNumElements();
-  unsigned V0NumElts = 0;
 
-  // Check if all elements are extracted from less than 3 vectors.
-  for (unsigned i = 0; i < NumElts; ++i) {
-    SDValue Elt = Op.getOperand(i);
-    if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
-        Elt.getOperand(0).getValueType().getVectorElementType() !=
-            VT.getVectorElementType())
+  SDValue TCChain = Chain;
+  SDNode *Copy = *N->use_begin();
+  if (Copy->getOpcode() == ISD::CopyToReg) {
+    // If the copy has a glue operand, we conservatively assume it isn't safe to
+    // perform a tail call.
+    if (Copy->getOperand(Copy->getNumOperands() - 1).getValueType() ==
+        MVT::Glue)
       return false;
+    TCChain = Copy->getOperand(0);
+  } else if (Copy->getOpcode() != ISD::FP_EXTEND)
+    return false;
 
-    if (V0.getNode() == 0) {
-      V0 = Elt.getOperand(0);
-      V0NumElts = V0.getValueType().getVectorNumElements();
-    }
-    if (Elt.getOperand(0) == V0) {
-      Mask[i] = (cast<ConstantSDNode>(Elt->getOperand(1))->getZExtValue());
-      continue;
-    } else if (V1.getNode() == 0) {
-      V1 = Elt.getOperand(0);
-    }
-    if (Elt.getOperand(0) == V1) {
-      unsigned Lane = cast<ConstantSDNode>(Elt->getOperand(1))->getZExtValue();
-      Mask[i] = (Lane + V0NumElts);
-      continue;
-    } else {
+  bool HasRet = false;
+  for (SDNode *Node : Copy->uses()) {
+    if (Node->getOpcode() != AArch64ISD::RET_FLAG)
       return false;
-    }
+    HasRet = true;
   }
-  return true;
-}
-
-// LowerShiftRightParts - Lower SRL_PARTS and SRA_PARTS, which returns two
-/// i64 values and take a 2 x i64 value to shift plus a shift amount.
-SDValue AArch64TargetLowering::LowerShiftRightParts(SDValue Op,
-                                                SelectionDAG &DAG) const {
-  assert(Op.getNumOperands() == 3 && "Not a quad-shift!");
-  EVT VT = Op.getValueType();
-  unsigned VTBits = VT.getSizeInBits();
-  SDLoc dl(Op);
-  SDValue ShOpLo = Op.getOperand(0);
-  SDValue ShOpHi = Op.getOperand(1);
-  SDValue ShAmt  = Op.getOperand(2);
-  unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
-
-  assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
-  SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64,
-                                 DAG.getConstant(VTBits, MVT::i64), ShAmt);
-  SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
-  SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt,
-                                   DAG.getConstant(VTBits, MVT::i64));
-  SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
-  SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
-  SDValue TrueVal = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
-  SDValue Tmp3 = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
-
-  SDValue A64cc;
-  SDValue CmpOp = getSelectableIntSetCC(ExtraShAmt,
-                                        DAG.getConstant(0, MVT::i64),
-                                        ISD::SETGE, A64cc,
-                                        DAG, dl);
 
-  SDValue Hi = DAG.getNode(AArch64ISD::SELECT_CC, dl, VT, CmpOp,
-                           DAG.getConstant(0, Tmp3.getValueType()), Tmp3,
-                           A64cc);
-  SDValue Lo = DAG.getNode(AArch64ISD::SELECT_CC, dl, VT, CmpOp,
-                           TrueVal, FalseVal, A64cc);
+  if (!HasRet)
+    return false;
 
-  SDValue Ops[2] = { Lo, Hi };
-  return DAG.getMergeValues(Ops, 2, dl);
+  Chain = TCChain;
+  return true;
 }
 
-/// LowerShiftLeftParts - Lower SHL_PARTS, which returns two
-/// i64 values and take a 2 x i64 value to shift plus a shift amount.
-SDValue AArch64TargetLowering::LowerShiftLeftParts(SDValue Op,
-                                               SelectionDAG &DAG) const {
-  assert(Op.getNumOperands() == 3 && "Not a quad-shift!");
-  EVT VT = Op.getValueType();
-  unsigned VTBits = VT.getSizeInBits();
-  SDLoc dl(Op);
-  SDValue ShOpLo = Op.getOperand(0);
-  SDValue ShOpHi = Op.getOperand(1);
-  SDValue ShAmt  = Op.getOperand(2);
-
-  assert(Op.getOpcode() == ISD::SHL_PARTS);
-  SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64,
-                                 DAG.getConstant(VTBits, MVT::i64), ShAmt);
-  SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
-  SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt,
-                                   DAG.getConstant(VTBits, MVT::i64));
-  SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
-  SDValue Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
-  SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
-  SDValue Tmp4 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
-
-  SDValue A64cc;
-  SDValue CmpOp = getSelectableIntSetCC(ExtraShAmt,
-                                        DAG.getConstant(0, MVT::i64),
-                                        ISD::SETGE, A64cc,
-                                        DAG, dl);
-
-  SDValue Lo = DAG.getNode(AArch64ISD::SELECT_CC, dl, VT, CmpOp,
-                           DAG.getConstant(0, Tmp4.getValueType()), Tmp4,
-                           A64cc);
-  SDValue Hi = DAG.getNode(AArch64ISD::SELECT_CC, dl, VT, CmpOp,
-                           Tmp3, FalseVal, A64cc);
+// Return whether the an instruction can potentially be optimized to a tail
+// call. This will cause the optimizers to attempt to move, or duplicate,
+// return instructions to help enable tail call optimizations for this
+// instruction.
+bool AArch64TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
+  if (!CI->isTailCall())
+    return false;
 
-  SDValue Ops[2] = { Lo, Hi };
-  return DAG.getMergeValues(Ops, 2, dl);
+  return true;
 }
 
-// If this is a case we can't handle, return null and let the default
-// expansion code take care of it.
-SDValue
-AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
-                                         const AArch64Subtarget *ST) const {
-
-  BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
-  SDLoc DL(Op);
-  EVT VT = Op.getValueType();
-
-  APInt SplatBits, SplatUndef;
-  unsigned SplatBitSize;
-  bool HasAnyUndefs;
-
-  unsigned UseNeonMov = VT.getSizeInBits() >= 64;
-
-  // Note we favor lowering MOVI over MVNI.
-  // This has implications on the definition of patterns in TableGen to select
-  // BIC immediate instructions but not ORR immediate instructions.
-  // If this lowering order is changed, TableGen patterns for BIC immediate and
-  // ORR immediate instructions have to be updated.
-  if (UseNeonMov &&
-      BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
-    if (SplatBitSize <= 64) {
-      // First attempt to use vector immediate-form MOVI
-      EVT NeonMovVT;
-      unsigned Imm = 0;
-      unsigned OpCmode = 0;
-
-      if (isNeonModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(),
-                            SplatBitSize, DAG, VT.is128BitVector(),
-                            Neon_Mov_Imm, NeonMovVT, Imm, OpCmode)) {
-        SDValue ImmVal = DAG.getTargetConstant(Imm, MVT::i32);
-        SDValue OpCmodeVal = DAG.getConstant(OpCmode, MVT::i32);
-
-        if (ImmVal.getNode() && OpCmodeVal.getNode()) {
-          SDValue NeonMov = DAG.getNode(AArch64ISD::NEON_MOVIMM, DL, NeonMovVT,
-                                        ImmVal, OpCmodeVal);
-          return DAG.getNode(ISD::BITCAST, DL, VT, NeonMov);
-        }
-      }
-
-      // Then attempt to use vector immediate-form MVNI
-      uint64_t NegatedImm = (~SplatBits).getZExtValue();
-      if (isNeonModifiedImm(NegatedImm, SplatUndef.getZExtValue(), SplatBitSize,
-                            DAG, VT.is128BitVector(), Neon_Mvn_Imm, NeonMovVT,
-                            Imm, OpCmode)) {
-        SDValue ImmVal = DAG.getTargetConstant(Imm, MVT::i32);
-        SDValue OpCmodeVal = DAG.getConstant(OpCmode, MVT::i32);
-        if (ImmVal.getNode() && OpCmodeVal.getNode()) {
-          SDValue NeonMov = DAG.getNode(AArch64ISD::NEON_MVNIMM, DL, NeonMovVT,
-                                        ImmVal, OpCmodeVal);
-          return DAG.getNode(ISD::BITCAST, DL, VT, NeonMov);
-        }
-      }
-
-      // Attempt to use vector immediate-form FMOV
-      if (((VT == MVT::v2f32 || VT == MVT::v4f32) && SplatBitSize == 32) ||
-          (VT == MVT::v2f64 && SplatBitSize == 64)) {
-        APFloat RealVal(
-            SplatBitSize == 32 ? APFloat::IEEEsingle : APFloat::IEEEdouble,
-            SplatBits);
-        uint32_t ImmVal;
-        if (A64Imms::isFPImm(RealVal, ImmVal)) {
-          SDValue Val = DAG.getTargetConstant(ImmVal, MVT::i32);
-          return DAG.getNode(AArch64ISD::NEON_FMOVIMM, DL, VT, Val);
-        }
-      }
-    }
-  }
-
-  unsigned NumElts = VT.getVectorNumElements();
-  bool isOnlyLowElement = true;
-  bool usesOnlyOneValue = true;
-  bool hasDominantValue = false;
-  bool isConstant = true;
-
-  // Map of the number of times a particular SDValue appears in the
-  // element list.
-  DenseMap<SDValue, unsigned> ValueCounts;
-  SDValue Value;
-  for (unsigned i = 0; i < NumElts; ++i) {
-    SDValue V = Op.getOperand(i);
-    if (V.getOpcode() == ISD::UNDEF)
-      continue;
-    if (i > 0)
-      isOnlyLowElement = false;
-    if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V))
-      isConstant = false;
-
-    ValueCounts.insert(std::make_pair(V, 0));
-    unsigned &Count = ValueCounts[V];
-
-    // Is this value dominant? (takes up more than half of the lanes)
-    if (++Count > (NumElts / 2)) {
-      hasDominantValue = true;
-      Value = V;
-    }
-  }
-  if (ValueCounts.size() != 1)
-    usesOnlyOneValue = false;
-  if (!Value.getNode() && ValueCounts.size() > 0)
-    Value = ValueCounts.begin()->first;
-
-  if (ValueCounts.size() == 0)
-    return DAG.getUNDEF(VT);
-
-  if (isOnlyLowElement)
-    return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Value);
-
-  unsigned EltSize = VT.getVectorElementType().getSizeInBits();
-  if (hasDominantValue && EltSize <= 64) {
-    // Use VDUP for non-constant splats.
-    if (!isConstant) {
-      SDValue N;
-
-      // If we are DUPing a value that comes directly from a vector, we could
-      // just use DUPLANE. We can only do this if the lane being extracted
-      // is at a constant index, as the DUP from lane instructions only have
-      // constant-index forms.
-      //
-      // If there is a TRUNCATE between EXTRACT_VECTOR_ELT and DUP, we can
-      // remove TRUNCATE for DUPLANE by apdating the source vector to
-      // appropriate vector type and lane index.
-      //
-      // FIXME: for now we have v1i8, v1i16, v1i32 legal vector types, if they
-      // are not legal any more, no need to check the type size in bits should
-      // be large than 64.
-      SDValue V = Value;
-      if (Value->getOpcode() == ISD::TRUNCATE)
-        V = Value->getOperand(0);
-      if (V->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
-          isa<ConstantSDNode>(V->getOperand(1)) &&
-          V->getOperand(0).getValueType().getSizeInBits() >= 64) {
-
-        // If the element size of source vector is larger than DUPLANE
-        // element size, we can do transformation by,
-        // 1) bitcasting source register to smaller element vector
-        // 2) mutiplying the lane index by SrcEltSize/ResEltSize
-        // For example, we can lower
-        //     "v8i16 vdup_lane(v4i32, 1)"
-        // to be
-        //     "v8i16 vdup_lane(v8i16 bitcast(v4i32), 2)".
-        SDValue SrcVec = V->getOperand(0);
-        unsigned SrcEltSize =
-            SrcVec.getValueType().getVectorElementType().getSizeInBits();
-        unsigned ResEltSize = VT.getVectorElementType().getSizeInBits();
-        if (SrcEltSize > ResEltSize) {
-          assert((SrcEltSize % ResEltSize == 0) && "Invalid element size");
-          SDValue BitCast;
-          unsigned SrcSize = SrcVec.getValueType().getSizeInBits();
-          unsigned ResSize = VT.getSizeInBits();
-
-          if (SrcSize > ResSize) {
-            assert((SrcSize % ResSize == 0) && "Invalid vector size");
-            EVT CastVT =
-                EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
-                                 SrcSize / ResEltSize);
-            BitCast = DAG.getNode(ISD::BITCAST, DL, CastVT, SrcVec);
-          } else {
-            assert((SrcSize == ResSize) && "Invalid vector size of source vec");
-            BitCast = DAG.getNode(ISD::BITCAST, DL, VT, SrcVec);
-          }
-
-          unsigned LaneIdx = V->getConstantOperandVal(1);
-          SDValue Lane =
-              DAG.getConstant((SrcEltSize / ResEltSize) * LaneIdx, MVT::i64);
-          N = DAG.getNode(AArch64ISD::NEON_VDUPLANE, DL, VT, BitCast, Lane);
-        } else {
-          assert((SrcEltSize == ResEltSize) &&
-                 "Invalid element size of source vec");
-          N = DAG.getNode(AArch64ISD::NEON_VDUPLANE, DL, VT, V->getOperand(0),
-                          V->getOperand(1));
-        }
-      } else
-        N = DAG.getNode(AArch64ISD::NEON_VDUP, DL, VT, Value);
-
-      if (!usesOnlyOneValue) {
-        // The dominant value was splatted as 'N', but we now have to insert
-        // all differing elements.
-        for (unsigned I = 0; I < NumElts; ++I) {
-          if (Op.getOperand(I) == Value)
-            continue;
-          SmallVector<SDValue, 3> Ops;
-          Ops.push_back(N);
-          Ops.push_back(Op.getOperand(I));
-          Ops.push_back(DAG.getConstant(I, MVT::i64));
-          N = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, &Ops[0], 3);
-        }
-      }
-      return N;
-    }
-    if (usesOnlyOneValue && isConstant) {
-      return DAG.getNode(AArch64ISD::NEON_VDUP, DL, VT, Value);
-    }
-  }
-  // If all elements are constants and the case above didn't get hit, fall back
-  // to the default expansion, which will generate a load from the constant
-  // pool.
-  if (isConstant)
-    return SDValue();
-
-  // Try to lower this in lowering ShuffleVector way.
-  SDValue V0, V1;
-  int Mask[16];
-  if (isKnownShuffleVector(Op, DAG, V0, V1, Mask)) {
-    unsigned V0NumElts = V0.getValueType().getVectorNumElements();
-    if (!V1.getNode() && V0NumElts == NumElts * 2) {
-      V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V0,
-                       DAG.getConstant(NumElts, MVT::i64));
-      V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V0,
-                       DAG.getConstant(0, MVT::i64));
-      V0NumElts = V0.getValueType().getVectorNumElements();
-    }
-
-    if (V1.getNode() && NumElts == V0NumElts &&
-        V0NumElts == V1.getValueType().getVectorNumElements()) {
-      SDValue Shuffle = DAG.getVectorShuffle(VT, DL, V0, V1, Mask);
-      if (Shuffle.getOpcode() != ISD::VECTOR_SHUFFLE)
-        return Shuffle;
-      else
-        return LowerVECTOR_SHUFFLE(Shuffle, DAG);
-    } else {
-      SDValue Res;
-      if (isConcatVector(Op, DAG, V0, V1, Mask, Res))
-        return Res;
-    }
-  }
+bool AArch64TargetLowering::getIndexedAddressParts(SDNode *Op, SDValue &Base,
+                                                   SDValue &Offset,
+                                                   ISD::MemIndexedMode &AM,
+                                                   bool &IsInc,
+                                                   SelectionDAG &DAG) const {
+  if (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB)
+    return false;
 
-  // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
-  // know the default expansion would otherwise fall back on something even
-  // worse. For a vector with one or two non-undef values, that's
-  // scalar_to_vector for the elements followed by a shuffle (provided the
-  // shuffle is valid for the target) and materialization element by element
-  // on the stack followed by a load for everything else.
-  if (!isConstant && !usesOnlyOneValue) {
-    SDValue Vec = DAG.getUNDEF(VT);
-    for (unsigned i = 0 ; i < NumElts; ++i) {
-      SDValue V = Op.getOperand(i);
-      if (V.getOpcode() == ISD::UNDEF)
-        continue;
-      SDValue LaneIdx = DAG.getConstant(i, MVT::i64);
-      Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, Vec, V, LaneIdx);
-    }
-    return Vec;
+  Base = Op->getOperand(0);
+  // All of the indexed addressing mode instructions take a signed
+  // 9 bit immediate offset.
+  if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1))) {
+    int64_t RHSC = (int64_t)RHS->getZExtValue();
+    if (RHSC >= 256 || RHSC <= -256)
+      return false;
+    IsInc = (Op->getOpcode() == ISD::ADD);
+    Offset = Op->getOperand(1);
+    return true;
   }
-  return SDValue();
+  return false;
 }
 
-/// isREVMask - Check if a vector shuffle corresponds to a REV
-/// instruction with the specified blocksize.  (The order of the elements
-/// within each block of the vector is reversed.)
-static bool isREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) {
-  assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
-         "Only possible block sizes for REV are: 16, 32, 64");
-
-  unsigned EltSz = VT.getVectorElementType().getSizeInBits();
-  if (EltSz == 64)
+bool AArch64TargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
+                                                      SDValue &Offset,
+                                                      ISD::MemIndexedMode &AM,
+                                                      SelectionDAG &DAG) const {
+  EVT VT;
+  SDValue Ptr;
+  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
+    VT = LD->getMemoryVT();
+    Ptr = LD->getBasePtr();
+  } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
+    VT = ST->getMemoryVT();
+    Ptr = ST->getBasePtr();
+  } else
     return false;
 
-  unsigned NumElts = VT.getVectorNumElements();
-  unsigned BlockElts = M[0] + 1;
-  // If the first shuffle index is UNDEF, be optimistic.
-  if (M[0] < 0)
-    BlockElts = BlockSize / EltSz;
-
-  if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz)
+  bool IsInc;
+  if (!getIndexedAddressParts(Ptr.getNode(), Base, Offset, AM, IsInc, DAG))
     return false;
-
-  for (unsigned i = 0; i < NumElts; ++i) {
-    if (M[i] < 0)
-      continue; // ignore UNDEF indices
-    if ((unsigned)M[i] != (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts))
-      return false;
-  }
-
+  AM = IsInc ? ISD::PRE_INC : ISD::PRE_DEC;
   return true;
 }
 
-// isPermuteMask - Check whether the vector shuffle matches to UZP, ZIP and
-// TRN instruction.
-static unsigned isPermuteMask(ArrayRef<int> M, EVT VT, bool isV2undef) {
-  unsigned NumElts = VT.getVectorNumElements();
-  if (NumElts < 4)
-    return 0;
-
-  bool ismatch = true;
-
-  // Check UZP1
-  for (unsigned i = 0; i < NumElts; ++i) {
-    unsigned answer = i * 2;
-    if (isV2undef && answer >= NumElts)
-      answer -= NumElts;
-    if (M[i] != -1 && (unsigned)M[i] != answer) {
-      ismatch = false;
-      break;
-    }
-  }
-  if (ismatch)
-    return AArch64ISD::NEON_UZP1;
-
-  // Check UZP2
-  ismatch = true;
-  for (unsigned i = 0; i < NumElts; ++i) {
-    unsigned answer = i * 2 + 1;
-    if (isV2undef && answer >= NumElts)
-      answer -= NumElts;
-    if (M[i] != -1 && (unsigned)M[i] != answer) {
-      ismatch = false;
-      break;
-    }
-  }
-  if (ismatch)
-    return AArch64ISD::NEON_UZP2;
-
-  // Check ZIP1
-  ismatch = true;
-  for (unsigned i = 0; i < NumElts; ++i) {
-    unsigned answer = i / 2 + NumElts * (i % 2);
-    if (isV2undef && answer >= NumElts)
-      answer -= NumElts;
-    if (M[i] != -1 && (unsigned)M[i] != answer) {
-      ismatch = false;
-      break;
-    }
-  }
-  if (ismatch)
-    return AArch64ISD::NEON_ZIP1;
-
-  // Check ZIP2
-  ismatch = true;
-  for (unsigned i = 0; i < NumElts; ++i) {
-    unsigned answer = (NumElts + i) / 2 + NumElts * (i % 2);
-    if (isV2undef && answer >= NumElts)
-      answer -= NumElts;
-    if (M[i] != -1 && (unsigned)M[i] != answer) {
-      ismatch = false;
-      break;
-    }
-  }
-  if (ismatch)
-    return AArch64ISD::NEON_ZIP2;
-
-  // Check TRN1
-  ismatch = true;
-  for (unsigned i = 0; i < NumElts; ++i) {
-    unsigned answer = i + (NumElts - 1) * (i % 2);
-    if (isV2undef && answer >= NumElts)
-      answer -= NumElts;
-    if (M[i] != -1 && (unsigned)M[i] != answer) {
-      ismatch = false;
-      break;
-    }
-  }
-  if (ismatch)
-    return AArch64ISD::NEON_TRN1;
-
-  // Check TRN2
-  ismatch = true;
-  for (unsigned i = 0; i < NumElts; ++i) {
-    unsigned answer = 1 + i + (NumElts - 1) * (i % 2);
-    if (isV2undef && answer >= NumElts)
-      answer -= NumElts;
-    if (M[i] != -1 && (unsigned)M[i] != answer) {
-      ismatch = false;
-      break;
-    }
-  }
-  if (ismatch)
-    return AArch64ISD::NEON_TRN2;
-
-  return 0;
-}
-
-SDValue
-AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
-                                           SelectionDAG &DAG) const {
-  SDValue V1 = Op.getOperand(0);
-  SDValue V2 = Op.getOperand(1);
-  SDLoc dl(Op);
-  EVT VT = Op.getValueType();
-  ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
-
-  // Convert shuffles that are directly supported on NEON to target-specific
-  // DAG nodes, instead of keeping them as shuffles and matching them again
-  // during code selection.  This is more efficient and avoids the possibility
-  // of inconsistencies between legalization and selection.
-  ArrayRef<int> ShuffleMask = SVN->getMask();
-
-  unsigned EltSize = VT.getVectorElementType().getSizeInBits();
-  if (EltSize > 64)
-    return SDValue();
-
-  if (isREVMask(ShuffleMask, VT, 64))
-    return DAG.getNode(AArch64ISD::NEON_REV64, dl, VT, V1);
-  if (isREVMask(ShuffleMask, VT, 32))
-    return DAG.getNode(AArch64ISD::NEON_REV32, dl, VT, V1);
-  if (isREVMask(ShuffleMask, VT, 16))
-    return DAG.getNode(AArch64ISD::NEON_REV16, dl, VT, V1);
-
-  unsigned ISDNo;
-  if (V2.getOpcode() == ISD::UNDEF)
-    ISDNo = isPermuteMask(ShuffleMask, VT, true);
-  else
-    ISDNo = isPermuteMask(ShuffleMask, VT, false);
-
-  if (ISDNo) {
-    if (V2.getOpcode() == ISD::UNDEF)
-      return DAG.getNode(ISDNo, dl, VT, V1, V1);
-    else
-      return DAG.getNode(ISDNo, dl, VT, V1, V2);
-  }
-
-  SDValue Res;
-  if (isConcatVector(Op, DAG, V1, V2, &ShuffleMask[0], Res))
-    return Res;
-
-  // If the element of shuffle mask are all the same constant, we can
-  // transform it into either NEON_VDUP or NEON_VDUPLANE
-  if (ShuffleVectorSDNode::isSplatMask(&ShuffleMask[0], VT)) {
-    int Lane = SVN->getSplatIndex();
-    // If this is undef splat, generate it via "just" vdup, if possible.
-    if (Lane == -1) Lane = 0;
-
-    // Test if V1 is a SCALAR_TO_VECTOR.
-    if (V1.getOpcode() == ISD::SCALAR_TO_VECTOR) {
-      return DAG.getNode(AArch64ISD::NEON_VDUP, dl, VT, V1.getOperand(0));
-    }
-    // Test if V1 is a BUILD_VECTOR which is equivalent to a SCALAR_TO_VECTOR.
-    if (V1.getOpcode() == ISD::BUILD_VECTOR) {
-      bool IsScalarToVector = true;
-      for (unsigned i = 0, e = V1.getNumOperands(); i != e; ++i)
-        if (V1.getOperand(i).getOpcode() != ISD::UNDEF &&
-            i != (unsigned)Lane) {
-          IsScalarToVector = false;
-          break;
-        }
-      if (IsScalarToVector)
-        return DAG.getNode(AArch64ISD::NEON_VDUP, dl, VT,
-                           V1.getOperand(Lane));
-    }
-
-    // Test if V1 is a EXTRACT_SUBVECTOR.
-    if (V1.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
-      int ExtLane = cast<ConstantSDNode>(V1.getOperand(1))->getZExtValue();
-      return DAG.getNode(AArch64ISD::NEON_VDUPLANE, dl, VT, V1.getOperand(0),
-                         DAG.getConstant(Lane + ExtLane, MVT::i64));
-    }
-    // Test if V1 is a CONCAT_VECTORS.
-    if (V1.getOpcode() == ISD::CONCAT_VECTORS &&
-        V1.getOperand(1).getOpcode() == ISD::UNDEF) {
-      SDValue Op0 = V1.getOperand(0);
-      assert((unsigned)Lane < Op0.getValueType().getVectorNumElements() &&
-             "Invalid vector lane access");
-      return DAG.getNode(AArch64ISD::NEON_VDUPLANE, dl, VT, Op0,
-                         DAG.getConstant(Lane, MVT::i64));
-    }
-
-    return DAG.getNode(AArch64ISD::NEON_VDUPLANE, dl, VT, V1,
-                       DAG.getConstant(Lane, MVT::i64));
-  }
-
-  int Length = ShuffleMask.size();
-  int V1EltNum = V1.getValueType().getVectorNumElements();
-
-  // If the number of v1 elements is the same as the number of shuffle mask
-  // element and the shuffle masks are sequential values, we can transform
-  // it into NEON_VEXTRACT.
-  if (V1EltNum == Length) {
-    // Check if the shuffle mask is sequential.
-    int SkipUndef = 0;
-    while (ShuffleMask[SkipUndef] == -1) {
-      SkipUndef++;
-    }
-    int CurMask = ShuffleMask[SkipUndef];
-    if (CurMask >= SkipUndef) {
-      bool IsSequential = true;
-      for (int I = SkipUndef; I < Length; ++I) {
-        if (ShuffleMask[I] != -1 && ShuffleMask[I] != CurMask) {
-          IsSequential = false;
-          break;
-        }
-        CurMask++;
-      }
-      if (IsSequential) {
-        assert((EltSize % 8 == 0) && "Bitsize of vector element is incorrect");
-        unsigned VecSize = EltSize * V1EltNum;
-        unsigned Index = (EltSize / 8) * (ShuffleMask[SkipUndef] - SkipUndef);
-        if (VecSize == 64 || VecSize == 128)
-          return DAG.getNode(AArch64ISD::NEON_VEXTRACT, dl, VT, V1, V2,
-                             DAG.getConstant(Index, MVT::i64));
-      }
-    }
-  }
-
-  // For shuffle mask like "0, 1, 2, 3, 4, 5, 13, 7", try to generate insert
-  // by element from V2 to V1 .
-  // If shuffle mask is like "0, 1, 10, 11, 12, 13, 14, 15", V2 would be a
-  // better choice to be inserted than V1 as less insert needed, so we count
-  // element to be inserted for both V1 and V2, and select less one as insert
-  // target.
-
-  // Collect elements need to be inserted and their index.
-  SmallVector<int, 8> NV1Elt;
-  SmallVector<int, 8> N1Index;
-  SmallVector<int, 8> NV2Elt;
-  SmallVector<int, 8> N2Index;
-  for (int I = 0; I != Length; ++I) {
-    if (ShuffleMask[I] != I) {
-      NV1Elt.push_back(ShuffleMask[I]);
-      N1Index.push_back(I);
-    }
-  }
-  for (int I = 0; I != Length; ++I) {
-    if (ShuffleMask[I] != (I + V1EltNum)) {
-      NV2Elt.push_back(ShuffleMask[I]);
-      N2Index.push_back(I);
-    }
-  }
-
-  // Decide which to be inserted. If all lanes mismatch, neither V1 nor V2
-  // will be inserted.
-  SDValue InsV = V1;
-  SmallVector<int, 8> InsMasks = NV1Elt;
-  SmallVector<int, 8> InsIndex = N1Index;
-  if ((int)NV1Elt.size() != Length || (int)NV2Elt.size() != Length) {
-    if (NV1Elt.size() > NV2Elt.size()) {
-      InsV = V2;
-      InsMasks = NV2Elt;
-      InsIndex = N2Index;
-    }
-  } else {
-    InsV = DAG.getNode(ISD::UNDEF, dl, VT);
-  }
-
-  for (int I = 0, E = InsMasks.size(); I != E; ++I) {
-    SDValue ExtV = V1;
-    int Mask = InsMasks[I];
-    if (Mask >= V1EltNum) {
-      ExtV = V2;
-      Mask -= V1EltNum;
-    }
-    // Any value type smaller than i32 is illegal in AArch64, and this lower
-    // function is called after legalize pass, so we need to legalize
-    // the result here.
-    EVT EltVT;
-    if (VT.getVectorElementType().isFloatingPoint())
-      EltVT = (EltSize == 64) ? MVT::f64 : MVT::f32;
-    else
-      EltVT = (EltSize == 64) ? MVT::i64 : MVT::i32;
+bool AArch64TargetLowering::getPostIndexedAddressParts(
+    SDNode *N, SDNode *Op, SDValue &Base, SDValue &Offset,
+    ISD::MemIndexedMode &AM, SelectionDAG &DAG) const {
+  EVT VT;
+  SDValue Ptr;
+  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
+    VT = LD->getMemoryVT();
+    Ptr = LD->getBasePtr();
+  } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
+    VT = ST->getMemoryVT();
+    Ptr = ST->getBasePtr();
+  } else
+    return false;
 
-    if (Mask >= 0) {
-      ExtV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, ExtV,
-                         DAG.getConstant(Mask, MVT::i64));
-      InsV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, InsV, ExtV,
-                         DAG.getConstant(InsIndex[I], MVT::i64));
-    }
-  }
-  return InsV;
+  bool IsInc;
+  if (!getIndexedAddressParts(Op, Base, Offset, AM, IsInc, DAG))
+    return false;
+  // Post-indexing updates the base, so it's not a valid transform
+  // if that's not the same as the load's pointer.
+  if (Ptr != Base)
+    return false;
+  AM = IsInc ? ISD::POST_INC : ISD::POST_DEC;
+  return true;
 }
 
-AArch64TargetLowering::ConstraintType
-AArch64TargetLowering::getConstraintType(const std::string &Constraint) const {
-  if (Constraint.size() == 1) {
-    switch (Constraint[0]) {
-    default: break;
-    case 'w': // An FP/SIMD vector register
-      return C_RegisterClass;
-    case 'I': // Constant that can be used with an ADD instruction
-    case 'J': // Constant that can be used with a SUB instruction
-    case 'K': // Constant that can be used with a 32-bit logical instruction
-    case 'L': // Constant that can be used with a 64-bit logical instruction
-    case 'M': // Constant that can be used as a 32-bit MOV immediate
-    case 'N': // Constant that can be used as a 64-bit MOV immediate
-    case 'Y': // Floating point constant zero
-    case 'Z': // Integer constant zero
-      return C_Other;
-    case 'Q': // A memory reference with base register and no offset
-      return C_Memory;
-    case 'S': // A symbolic address
-      return C_Other;
-    }
+void AArch64TargetLowering::ReplaceNodeResults(
+    SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
+  switch (N->getOpcode()) {
+  default:
+    llvm_unreachable("Don't know how to custom expand this");
+  case ISD::FP_TO_UINT:
+  case ISD::FP_TO_SINT:
+    assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion");
+    // Let normal code take care of it by not adding anything to Results.
+    return;
   }
-
-  // FIXME: Ump, Utf, Usa, Ush
-  // Ump: A memory address suitable for ldp/stp in SI, DI, SF and DF modes,
-  //      whatever they may be
-  // Utf: A memory address suitable for ldp/stp in TF mode, whatever it may be
-  // Usa: An absolute symbolic address
-  // Ush: The high part (bits 32:12) of a pc-relative symbolic address
-  assert(Constraint != "Ump" && Constraint != "Utf" && Constraint != "Usa"
-         && Constraint != "Ush" && "Unimplemented constraints");
-
-  return TargetLowering::getConstraintType(Constraint);
 }
 
-TargetLowering::ConstraintWeight
-AArch64TargetLowering::getSingleConstraintMatchWeight(AsmOperandInfo &Info,
-                                                const char *Constraint) const {
-
-  llvm_unreachable("Constraint weight unimplemented");
+bool AArch64TargetLowering::shouldExpandAtomicInIR(Instruction *Inst) const {
+  // Loads and stores less than 128-bits are already atomic; ones above that
+  // are doomed anyway, so defer to the default libcall and blame the OS when
+  // things go wrong:
+  if (StoreInst *SI = dyn_cast<StoreInst>(Inst))
+    return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 128;
+  else if (LoadInst *LI = dyn_cast<LoadInst>(Inst))
+    return LI->getType()->getPrimitiveSizeInBits() == 128;
+
+  // For the real atomic operations, we have ldxr/stxr up to 128 bits.
+  return Inst->getType()->getPrimitiveSizeInBits() <= 128;
 }
 
-void
-AArch64TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
-                                                    std::string &Constraint,
-                                                    std::vector<SDValue> &Ops,
-                                                    SelectionDAG &DAG) const {
-  SDValue Result(0, 0);
-
-  // Only length 1 constraints are C_Other.
-  if (Constraint.size() != 1) return;
-
-  // Only C_Other constraints get lowered like this. That means constants for us
-  // so return early if there's no hope the constraint can be lowered.
-
-  switch(Constraint[0]) {
-  default: break;
-  case 'I': case 'J': case 'K': case 'L':
-  case 'M': case 'N': case 'Z': {
-    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
-    if (!C)
-      return;
-
-    uint64_t CVal = C->getZExtValue();
-    uint32_t Bits;
-
-    switch (Constraint[0]) {
-    default:
-      // FIXME: 'M' and 'N' are MOV pseudo-insts -- unsupported in assembly. 'J'
-      // is a peculiarly useless SUB constraint.
-      llvm_unreachable("Unimplemented C_Other constraint");
-    case 'I':
-      if (CVal <= 0xfff)
-        break;
-      return;
-    case 'K':
-      if (A64Imms::isLogicalImm(32, CVal, Bits))
-        break;
-      return;
-    case 'L':
-      if (A64Imms::isLogicalImm(64, CVal, Bits))
-        break;
-      return;
-    case 'Z':
-      if (CVal == 0)
-        break;
-      return;
-    }
-
-    Result = DAG.getTargetConstant(CVal, Op.getValueType());
-    break;
-  }
-  case 'S': {
-    // An absolute symbolic address or label reference.
-    if (const GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Op)) {
-      Result = DAG.getTargetGlobalAddress(GA->getGlobal(), SDLoc(Op),
-                                          GA->getValueType(0));
-    } else if (const BlockAddressSDNode *BA
-                 = dyn_cast<BlockAddressSDNode>(Op)) {
-      Result = DAG.getTargetBlockAddress(BA->getBlockAddress(),
-                                         BA->getValueType(0));
-    } else if (const ExternalSymbolSDNode *ES
-                 = dyn_cast<ExternalSymbolSDNode>(Op)) {
-      Result = DAG.getTargetExternalSymbol(ES->getSymbol(),
-                                           ES->getValueType(0));
-    } else
-      return;
-    break;
-  }
-  case 'Y':
-    if (const ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
-      if (CFP->isExactlyValue(0.0)) {
-        Result = DAG.getTargetConstantFP(0.0, CFP->getValueType(0));
-        break;
-      }
-    }
-    return;
+Value *AArch64TargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
+                                             AtomicOrdering Ord) const {
+  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
+  Type *ValTy = cast<PointerType>(Addr->getType())->getElementType();
+  bool IsAcquire =
+      Ord == Acquire || Ord == AcquireRelease || Ord == SequentiallyConsistent;
+
+  // Since i128 isn't legal and intrinsics don't get type-lowered, the ldrexd
+  // intrinsic must return {i64, i64} and we have to recombine them into a
+  // single i128 here.
+  if (ValTy->getPrimitiveSizeInBits() == 128) {
+    Intrinsic::ID Int =
+        IsAcquire ? Intrinsic::aarch64_ldaxp : Intrinsic::aarch64_ldxp;
+    Function *Ldxr = llvm::Intrinsic::getDeclaration(M, Int);
+
+    Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
+    Value *LoHi = Builder.CreateCall(Ldxr, Addr, "lohi");
+
+    Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
+    Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
+    Lo = Builder.CreateZExt(Lo, ValTy, "lo64");
+    Hi = Builder.CreateZExt(Hi, ValTy, "hi64");
+    return Builder.CreateOr(
+        Lo, Builder.CreateShl(Hi, ConstantInt::get(ValTy, 64)), "val64");
   }
 
-  if (Result.getNode()) {
-    Ops.push_back(Result);
-    return;
-  }
+  Type *Tys[] = { Addr->getType() };
+  Intrinsic::ID Int =
+      IsAcquire ? Intrinsic::aarch64_ldaxr : Intrinsic::aarch64_ldxr;
+  Function *Ldxr = llvm::Intrinsic::getDeclaration(M, Int, Tys);
 
-  // It's an unknown constraint for us. Let generic code have a go.
-  TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
+  return Builder.CreateTruncOrBitCast(
+      Builder.CreateCall(Ldxr, Addr),
+      cast<PointerType>(Addr->getType())->getElementType());
 }
 
-std::pair<unsigned, const TargetRegisterClass*>
-AArch64TargetLowering::getRegForInlineAsmConstraint(
-                                                  const std::string &Constraint,
-                                                  MVT VT) const {
-  if (Constraint.size() == 1) {
-    switch (Constraint[0]) {
-    case 'r':
-      if (VT.getSizeInBits() <= 32)
-        return std::make_pair(0U, &AArch64::GPR32RegClass);
-      else if (VT == MVT::i64)
-        return std::make_pair(0U, &AArch64::GPR64RegClass);
-      break;
-    case 'w':
-      if (VT == MVT::f16)
-        return std::make_pair(0U, &AArch64::FPR16RegClass);
-      else if (VT == MVT::f32)
-        return std::make_pair(0U, &AArch64::FPR32RegClass);
-      else if (VT.getSizeInBits() == 64)
-        return std::make_pair(0U, &AArch64::FPR64RegClass);
-      else if (VT.getSizeInBits() == 128)
-        return std::make_pair(0U, &AArch64::FPR128RegClass);
-      break;
-    }
+Value *AArch64TargetLowering::emitStoreConditional(IRBuilder<> &Builder,
+                                                   Value *Val, Value *Addr,
+                                                   AtomicOrdering Ord) const {
+  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
+  bool IsRelease =
+      Ord == Release || Ord == AcquireRelease || Ord == SequentiallyConsistent;
+
+  // Since the intrinsics must have legal type, the i128 intrinsics take two
+  // parameters: "i64, i64". We must marshal Val into the appropriate form
+  // before the call.
+  if (Val->getType()->getPrimitiveSizeInBits() == 128) {
+    Intrinsic::ID Int =
+        IsRelease ? Intrinsic::aarch64_stlxp : Intrinsic::aarch64_stxp;
+    Function *Stxr = Intrinsic::getDeclaration(M, Int);
+    Type *Int64Ty = Type::getInt64Ty(M->getContext());
+
+    Value *Lo = Builder.CreateTrunc(Val, Int64Ty, "lo");
+    Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 64), Int64Ty, "hi");
+    Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
+    return Builder.CreateCall3(Stxr, Lo, Hi, Addr);
   }
 
-  // Use the default implementation in TargetLowering to convert the register
-  // constraint into a member of a register class.
-  return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
-}
+  Intrinsic::ID Int =
+      IsRelease ? Intrinsic::aarch64_stlxr : Intrinsic::aarch64_stxr;
+  Type *Tys[] = { Addr->getType() };
+  Function *Stxr = Intrinsic::getDeclaration(M, Int, Tys);
 
-/// Represent NEON load and store intrinsics as MemIntrinsicNodes.
-/// The associated MachineMemOperands record the alignment specified
-/// in the intrinsic calls.
-bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
-                                               const CallInst &I,
-                                               unsigned Intrinsic) const {
-  switch (Intrinsic) {
-  case Intrinsic::arm_neon_vld1:
-  case Intrinsic::arm_neon_vld2:
-  case Intrinsic::arm_neon_vld3:
-  case Intrinsic::arm_neon_vld4:
-  case Intrinsic::aarch64_neon_vld1x2:
-  case Intrinsic::aarch64_neon_vld1x3:
-  case Intrinsic::aarch64_neon_vld1x4:
-  case Intrinsic::arm_neon_vld2lane:
-  case Intrinsic::arm_neon_vld3lane:
-  case Intrinsic::arm_neon_vld4lane: {
-    Info.opc = ISD::INTRINSIC_W_CHAIN;
-    // Conservatively set memVT to the entire set of vectors loaded.
-    uint64_t NumElts = getDataLayout()->getTypeAllocSize(I.getType()) / 8;
-    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
-    Info.ptrVal = I.getArgOperand(0);
-    Info.offset = 0;
-    Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1);
-    Info.align = cast<ConstantInt>(AlignArg)->getZExtValue();
-    Info.vol = false; // volatile loads with NEON intrinsics not supported
-    Info.readMem = true;
-    Info.writeMem = false;
-    return true;
-  }
-  case Intrinsic::arm_neon_vst1:
-  case Intrinsic::arm_neon_vst2:
-  case Intrinsic::arm_neon_vst3:
-  case Intrinsic::arm_neon_vst4:
-  case Intrinsic::aarch64_neon_vst1x2:
-  case Intrinsic::aarch64_neon_vst1x3:
-  case Intrinsic::aarch64_neon_vst1x4:
-  case Intrinsic::arm_neon_vst2lane:
-  case Intrinsic::arm_neon_vst3lane:
-  case Intrinsic::arm_neon_vst4lane: {
-    Info.opc = ISD::INTRINSIC_VOID;
-    // Conservatively set memVT to the entire set of vectors stored.
-    unsigned NumElts = 0;
-    for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) {
-      Type *ArgTy = I.getArgOperand(ArgI)->getType();
-      if (!ArgTy->isVectorTy())
-        break;
-      NumElts += getDataLayout()->getTypeAllocSize(ArgTy) / 8;
-    }
-    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
-    Info.ptrVal = I.getArgOperand(0);
-    Info.offset = 0;
-    Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1);
-    Info.align = cast<ConstantInt>(AlignArg)->getZExtValue();
-    Info.vol = false; // volatile stores with NEON intrinsics not supported
-    Info.readMem = false;
-    Info.writeMem = true;
-    return true;
-  }
-  default:
-    break;
-  }
-
-  return false;
+  return Builder.CreateCall2(
+      Stxr, Builder.CreateZExtOrBitCast(
+                Val, Stxr->getFunctionType()->getParamType(0)),
+      Addr);
 }
diff --git a/lib/Target/AArch64/AArch64ISelLowering.h b/lib/Target/AArch64/AArch64ISelLowering.h
index e946b25..de16c4d 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/lib/Target/AArch64/AArch64ISelLowering.h
@@ -12,364 +12,453 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_TARGET_AARCH64_ISELLOWERING_H
-#define LLVM_TARGET_AARCH64_ISELLOWERING_H
+#ifndef LLVM_TARGET_AArch64_ISELLOWERING_H
+#define LLVM_TARGET_AArch64_ISELLOWERING_H
 
-#include "Utils/AArch64BaseInfo.h"
 #include "llvm/CodeGen/CallingConvLower.h"
 #include "llvm/CodeGen/SelectionDAG.h"
-#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/CallingConv.h"
 #include "llvm/Target/TargetLowering.h"
 
 namespace llvm {
+
 namespace AArch64ISD {
-  enum NodeType {
-    // Start the numbering from where ISD NodeType finishes.
-    FIRST_NUMBER = ISD::BUILTIN_OP_END,
-
-    // This is a conditional branch which also notes the flag needed
-    // (eq/sgt/...). A64 puts this information on the branches rather than
-    // compares as LLVM does.
-    BR_CC,
-
-    // A node to be selected to an actual call operation: either BL or BLR in
-    // the absence of tail calls.
-    Call,
-
-    // Indicates a floating-point immediate which fits into the format required
-    // by the FMOV instructions. First (and only) operand is the 8-bit encoded
-    // value of that immediate.
-    FPMOV,
-
-    // Corresponds directly to an EXTR instruction. Operands are an LHS an RHS
-    // and an LSB.
-    EXTR,
-
-    // Wraps a load from the GOT, which should always be performed with a 64-bit
-    // load instruction. This prevents the DAG combiner folding a truncate to
-    // form a smaller memory access.
-    GOTLoad,
-
-    // Performs a bitfield insert. Arguments are: the value being inserted into;
-    // the value being inserted; least significant bit changed; width of the
-    // field.
-    BFI,
-
-    // Simply a convenient node inserted during ISelLowering to represent
-    // procedure return. Will almost certainly be selected to "RET".
-    Ret,
-
-    /// Extracts a field of contiguous bits from the source and sign extends
-    /// them into a single register. Arguments are: source; immr; imms. Note
-    /// these are pre-encoded since DAG matching can't cope with combining LSB
-    /// and Width into these values itself.
-    SBFX,
-
-    /// This is an A64-ification of the standard LLVM SELECT_CC operation. The
-    /// main difference is that it only has the values and an A64 condition,
-    /// which will be produced by a setcc instruction.
-    SELECT_CC,
-
-    /// This serves most of the functions of the LLVM SETCC instruction, for two
-    /// purposes. First, it prevents optimisations from fiddling with the
-    /// compare after we've moved the CondCode information onto the SELECT_CC or
-    /// BR_CC instructions. Second, it gives a legal instruction for the actual
-    /// comparison.
-    ///
-    /// It keeps a record of the condition flags asked for because certain
-    /// instructions are only valid for a subset of condition codes.
-    SETCC,
-
-    // Designates a node which is a tail call: both a call and a return
-    // instruction as far as selction is concerned. It should be selected to an
-    // unconditional branch. Has the usual plethora of call operands, but: 1st
-    // is callee, 2nd is stack adjustment required immediately before branch.
-    TC_RETURN,
-
-    // Designates a call used to support the TLS descriptor ABI. The call itself
-    // will be indirect ("BLR xN") but a relocation-specifier (".tlsdesccall
-    // var") must be attached somehow during code generation. It takes two
-    // operands: the callee and the symbol to be relocated against.
-    TLSDESCCALL,
-
-    // Leaf node which will be lowered to an appropriate MRS to obtain the
-    // thread pointer: TPIDR_EL0.
-    THREAD_POINTER,
-
-    /// Extracts a field of contiguous bits from the source and zero extends
-    /// them into a single register. Arguments are: source; immr; imms. Note
-    /// these are pre-encoded since DAG matching can't cope with combining LSB
-    /// and Width into these values itself.
-    UBFX,
-
-    // Wraps an address which the ISelLowering phase has decided should be
-    // created using the large memory model style: i.e. a sequence of four
-    // movz/movk instructions.
-    WrapperLarge,
-
-    // Wraps an address which the ISelLowering phase has decided should be
-    // created using the small memory model style: i.e. adrp/add or
-    // adrp/mem-op. This exists to prevent bare TargetAddresses which may never
-    // get selected.
-    WrapperSmall,
-
-    // Vector move immediate
-    NEON_MOVIMM,
-
-    // Vector Move Inverted Immediate
-    NEON_MVNIMM,
-
-    // Vector FP move immediate
-    NEON_FMOVIMM,
-
-    // Vector permute
-    NEON_UZP1,
-    NEON_UZP2,
-    NEON_ZIP1,
-    NEON_ZIP2,
-    NEON_TRN1,
-    NEON_TRN2,
-
-    // Vector Element reverse
-    NEON_REV64,
-    NEON_REV32,
-    NEON_REV16,
-
-    // Vector compare
-    NEON_CMP,
-
-    // Vector compare zero
-    NEON_CMPZ,
-
-    // Vector compare bitwise test
-    NEON_TST,
-
-    // Vector saturating shift
-    NEON_QSHLs,
-    NEON_QSHLu,
-
-    // Vector dup
-    NEON_VDUP,
-
-    // Vector dup by lane
-    NEON_VDUPLANE,
-
-    // Vector extract
-    NEON_VEXTRACT,
-
-    // NEON duplicate lane loads
-    NEON_LD2DUP = ISD::FIRST_TARGET_MEMORY_OPCODE,
-    NEON_LD3DUP,
-    NEON_LD4DUP,
-
-    // NEON loads with post-increment base updates:
-    NEON_LD1_UPD,
-    NEON_LD2_UPD,
-    NEON_LD3_UPD,
-    NEON_LD4_UPD,
-    NEON_LD1x2_UPD,
-    NEON_LD1x3_UPD,
-    NEON_LD1x4_UPD,
-
-    // NEON stores with post-increment base updates:
-    NEON_ST1_UPD,
-    NEON_ST2_UPD,
-    NEON_ST3_UPD,
-    NEON_ST4_UPD,
-    NEON_ST1x2_UPD,
-    NEON_ST1x3_UPD,
-    NEON_ST1x4_UPD,
-
-    // NEON duplicate lane loads with post-increment base updates:
-    NEON_LD2DUP_UPD,
-    NEON_LD3DUP_UPD,
-    NEON_LD4DUP_UPD,
-
-    // NEON lane loads with post-increment base updates:
-    NEON_LD2LN_UPD,
-    NEON_LD3LN_UPD,
-    NEON_LD4LN_UPD,
-
-    // NEON lane store with post-increment base updates:
-    NEON_ST2LN_UPD,
-    NEON_ST3LN_UPD,
-    NEON_ST4LN_UPD
-  };
-}
 
+enum {
+  FIRST_NUMBER = ISD::BUILTIN_OP_END,
+  WrapperLarge, // 4-instruction MOVZ/MOVK sequence for 64-bit addresses.
+  CALL,         // Function call.
+
+  // Almost the same as a normal call node, except that a TLSDesc relocation is
+  // needed so the linker can relax it correctly if possible.
+  TLSDESC_CALL,
+  ADRP,     // Page address of a TargetGlobalAddress operand.
+  ADDlow,   // Add the low 12 bits of a TargetGlobalAddress operand.
+  LOADgot,  // Load from automatically generated descriptor (e.g. Global
+            // Offset Table, TLS record).
+  RET_FLAG, // Return with a flag operand. Operand 0 is the chain operand.
+  BRCOND,   // Conditional branch instruction; "b.cond".
+  CSEL,
+  FCSEL, // Conditional move instruction.
+  CSINV, // Conditional select invert.
+  CSNEG, // Conditional select negate.
+  CSINC, // Conditional select increment.
+
+  // Pointer to the thread's local storage area. Materialised from TPIDR_EL0 on
+  // ELF.
+  THREAD_POINTER,
+  ADC,
+  SBC, // adc, sbc instructions
+
+  // Arithmetic instructions which write flags.
+  ADDS,
+  SUBS,
+  ADCS,
+  SBCS,
+  ANDS,
+
+  // Floating point comparison
+  FCMP,
+
+  // Floating point max and min instructions.
+  FMAX,
+  FMIN,
+
+  // Scalar extract
+  EXTR,
+
+  // Scalar-to-vector duplication
+  DUP,
+  DUPLANE8,
+  DUPLANE16,
+  DUPLANE32,
+  DUPLANE64,
+
+  // Vector immedate moves
+  MOVI,
+  MOVIshift,
+  MOVIedit,
+  MOVImsl,
+  FMOV,
+  MVNIshift,
+  MVNImsl,
+
+  // Vector immediate ops
+  BICi,
+  ORRi,
+
+  // Vector bit select: similar to ISD::VSELECT but not all bits within an
+  // element must be identical.
+  BSL,
+
+  // Vector arithmetic negation
+  NEG,
+
+  // Vector shuffles
+  ZIP1,
+  ZIP2,
+  UZP1,
+  UZP2,
+  TRN1,
+  TRN2,
+  REV16,
+  REV32,
+  REV64,
+  EXT,
+
+  // Vector shift by scalar
+  VSHL,
+  VLSHR,
+  VASHR,
+
+  // Vector shift by scalar (again)
+  SQSHL_I,
+  UQSHL_I,
+  SQSHLU_I,
+  SRSHR_I,
+  URSHR_I,
+
+  // Vector comparisons
+  CMEQ,
+  CMGE,
+  CMGT,
+  CMHI,
+  CMHS,
+  FCMEQ,
+  FCMGE,
+  FCMGT,
+
+  // Vector zero comparisons
+  CMEQz,
+  CMGEz,
+  CMGTz,
+  CMLEz,
+  CMLTz,
+  FCMEQz,
+  FCMGEz,
+  FCMGTz,
+  FCMLEz,
+  FCMLTz,
+
+  // Vector bitwise negation
+  NOT,
+
+  // Vector bitwise selection
+  BIT,
+
+  // Compare-and-branch
+  CBZ,
+  CBNZ,
+  TBZ,
+  TBNZ,
+
+  // Tail calls
+  TC_RETURN,
+
+  // Custom prefetch handling
+  PREFETCH,
+
+  // {s|u}int to FP within a FP register.
+  SITOF,
+  UITOF,
+
+  // NEON Load/Store with post-increment base updates
+  LD2post = ISD::FIRST_TARGET_MEMORY_OPCODE,
+  LD3post,
+  LD4post,
+  ST2post,
+  ST3post,
+  ST4post,
+  LD1x2post,
+  LD1x3post,
+  LD1x4post,
+  ST1x2post,
+  ST1x3post,
+  ST1x4post,
+  LD1DUPpost,
+  LD2DUPpost,
+  LD3DUPpost,
+  LD4DUPpost,
+  LD1LANEpost,
+  LD2LANEpost,
+  LD3LANEpost,
+  LD4LANEpost,
+  ST2LANEpost,
+  ST3LANEpost,
+  ST4LANEpost
+};
+
+} // end namespace AArch64ISD
 
 class AArch64Subtarget;
 class AArch64TargetMachine;
 
 class AArch64TargetLowering : public TargetLowering {
+  bool RequireStrictAlign;
+
 public:
   explicit AArch64TargetLowering(AArch64TargetMachine &TM);
 
-  const char *getTargetNodeName(unsigned Opcode) const;
+  /// Selects the correct CCAssignFn for a the given CallingConvention
+  /// value.
+  CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg) const;
+
+  /// computeKnownBitsForTargetNode - Determine which of the bits specified in
+  /// Mask are known to be either zero or one and return them in the
+  /// KnownZero/KnownOne bitsets.
+  void computeKnownBitsForTargetNode(const SDValue Op, APInt &KnownZero,
+                                     APInt &KnownOne, const SelectionDAG &DAG,
+                                     unsigned Depth = 0) const override;
+
+  MVT getScalarShiftAmountTy(EVT LHSTy) const override;
+
+  /// allowsUnalignedMemoryAccesses - Returns true if the target allows
+  /// unaligned memory accesses. of the specified type.
+  bool allowsUnalignedMemoryAccesses(EVT VT, unsigned AddrSpace = 0,
+                                     bool *Fast = nullptr) const override {
+    if (RequireStrictAlign)
+      return false;
+    // FIXME: True for Cyclone, but not necessary others.
+    if (Fast)
+      *Fast = true;
+    return true;
+  }
 
-  CCAssignFn *CCAssignFnForNode(CallingConv::ID CC) const;
+  /// LowerOperation - Provide custom lowering hooks for some operations.
+  SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
 
-  SDValue LowerFormalArguments(SDValue Chain,
-                               CallingConv::ID CallConv, bool isVarArg,
-                               const SmallVectorImpl<ISD::InputArg> &Ins,
-                               SDLoc dl, SelectionDAG &DAG,
-                               SmallVectorImpl<SDValue> &InVals) const;
+  const char *getTargetNodeName(unsigned Opcode) const override;
 
-  SDValue LowerReturn(SDValue Chain,
-                      CallingConv::ID CallConv, bool isVarArg,
-                      const SmallVectorImpl<ISD::OutputArg> &Outs,
-                      const SmallVectorImpl<SDValue> &OutVals,
-                      SDLoc dl, SelectionDAG &DAG) const;
+  SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
 
-  virtual unsigned getByValTypeAlignment(Type *Ty) const override;
+  /// getFunctionAlignment - Return the Log2 alignment of this function.
+  unsigned getFunctionAlignment(const Function *F) const;
 
-  SDValue LowerCall(CallLoweringInfo &CLI,
-                    SmallVectorImpl<SDValue> &InVals) const;
+  /// getMaximalGlobalOffset - Returns the maximal possible offset which can
+  /// be used for loads / stores from the global.
+  unsigned getMaximalGlobalOffset() const override;
 
-  SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
-                          CallingConv::ID CallConv, bool IsVarArg,
-                          const SmallVectorImpl<ISD::InputArg> &Ins,
-                          SDLoc dl, SelectionDAG &DAG,
-                          SmallVectorImpl<SDValue> &InVals) const;
+  /// Returns true if a cast between SrcAS and DestAS is a noop.
+  bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override {
+    // Addrspacecasts are always noops.
+    return true;
+  }
 
-  SDValue LowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerShiftRightParts(SDValue Op, SelectionDAG &DAG) const;
+  /// createFastISel - This method returns a target specific FastISel object,
+  /// or null if the target does not support "fast" ISel.
+  FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
+                           const TargetLibraryInfo *libInfo) const override;
 
-  bool isConcatVector(SDValue Op, SelectionDAG &DAG, SDValue V0, SDValue V1,
-                      const int *Mask, SDValue &Res) const;
+  bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override;
 
-  bool isKnownShuffleVector(SDValue Op, SelectionDAG &DAG, SDValue &V0,
-                            SDValue &V1, int *Mask) const;
+  bool isFPImmLegal(const APFloat &Imm, EVT VT) const override;
 
-  SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
-                            const AArch64Subtarget *ST) const;
+  /// isShuffleMaskLegal - Return true if the given shuffle mask can be
+  /// codegen'd directly, or if it should be stack expanded.
+  bool isShuffleMaskLegal(const SmallVectorImpl<int> &M, EVT VT) const override;
 
-  SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
+  /// getSetCCResultType - Return the ISD::SETCC ValueType
+  EVT getSetCCResultType(LLVMContext &Context, EVT VT) const override;
 
-  void SaveVarArgRegisters(CCState &CCInfo, SelectionDAG &DAG, SDLoc DL,
-                           SDValue &Chain) const;
+  SDValue ReconstructShuffle(SDValue Op, SelectionDAG &DAG) const;
 
-  /// IsEligibleForTailCallOptimization - Check whether the call is eligible
-  /// for tail call optimization. Targets which want to do tail call
-  /// optimization should implement this function.
-  bool IsEligibleForTailCallOptimization(SDValue Callee,
-                                    CallingConv::ID CalleeCC,
-                                    bool IsVarArg,
-                                    bool IsCalleeStructRet,
-                                    bool IsCallerStructRet,
-                                    const SmallVectorImpl<ISD::OutputArg> &Outs,
-                                    const SmallVectorImpl<SDValue> &OutVals,
-                                    const SmallVectorImpl<ISD::InputArg> &Ins,
-                                    SelectionDAG& DAG) const;
+  MachineBasicBlock *EmitF128CSEL(MachineInstr *MI,
+                                  MachineBasicBlock *BB) const;
 
-  /// Finds the incoming stack arguments which overlap the given fixed stack
-  /// object and incorporates their load into the current chain. This prevents
-  /// an upcoming store from clobbering the stack argument before it's used.
-  SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG,
-                              MachineFrameInfo *MFI, int ClobberedFI) const;
+  MachineBasicBlock *
+  EmitInstrWithCustomInserter(MachineInstr *MI,
+                              MachineBasicBlock *MBB) const override;
 
-  EVT getSetCCResultType(LLVMContext &Context, EVT VT) const;
+  bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I,
+                          unsigned Intrinsic) const override;
 
-  bool DoesCalleeRestoreStack(CallingConv::ID CallCC, bool TailCallOpt) const;
+  bool isTruncateFree(Type *Ty1, Type *Ty2) const override;
+  bool isTruncateFree(EVT VT1, EVT VT2) const override;
 
-  bool IsTailCallConvention(CallingConv::ID CallCC) const;
+  bool isZExtFree(Type *Ty1, Type *Ty2) const override;
+  bool isZExtFree(EVT VT1, EVT VT2) const override;
+  bool isZExtFree(SDValue Val, EVT VT2) const override;
 
-  SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const;
+  bool hasPairedLoad(Type *LoadedType,
+                     unsigned &RequiredAligment) const override;
+  bool hasPairedLoad(EVT LoadedType, unsigned &RequiredAligment) const override;
 
-  bool isLegalICmpImmediate(int64_t Val) const;
-  SDValue getSelectableIntSetCC(SDValue LHS, SDValue RHS, ISD::CondCode CC,
-                         SDValue &A64cc, SelectionDAG &DAG, SDLoc &dl) const;
+  bool isLegalAddImmediate(int64_t) const override;
+  bool isLegalICmpImmediate(int64_t) const override;
 
-  virtual MachineBasicBlock *
-  EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *MBB) const;
+  EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign,
+                          bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc,
+                          MachineFunction &MF) const override;
 
-  MachineBasicBlock *
-  emitAtomicBinary(MachineInstr *MI, MachineBasicBlock *MBB,
-                   unsigned Size, unsigned Opcode) const;
+  /// isLegalAddressingMode - Return true if the addressing mode represented
+  /// by AM is legal for this target, for a load/store of the specified type.
+  bool isLegalAddressingMode(const AddrMode &AM, Type *Ty) const override;
 
-  MachineBasicBlock *
-  emitAtomicBinaryMinMax(MachineInstr *MI, MachineBasicBlock *BB,
-                         unsigned Size, unsigned CmpOp,
-                         A64CC::CondCodes Cond) const;
-  MachineBasicBlock *
-  emitAtomicCmpSwap(MachineInstr *MI, MachineBasicBlock *BB,
-                    unsigned Size) const;
+  /// \brief Return the cost of the scaling factor used in the addressing
+  /// mode represented by AM for this target, for a load/store
+  /// of the specified type.
+  /// If the AM is supported, the return value must be >= 0.
+  /// If the AM is not supported, it returns a negative value.
+  int getScalingFactorCost(const AddrMode &AM, Type *Ty) const override;
 
-  MachineBasicBlock *
-  EmitF128CSEL(MachineInstr *MI, MachineBasicBlock *MBB) const;
+  /// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster
+  /// than a pair of fmul and fadd instructions. fmuladd intrinsics will be
+  /// expanded to FMAs when this method returns true, otherwise fmuladd is
+  /// expanded to fmul + fadd.
+  bool isFMAFasterThanFMulAndFAdd(EVT VT) const override;
 
-  SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerF128ToCall(SDValue Op, SelectionDAG &DAG,
-                          RTLIB::Libcall Call) const;
-  SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG, bool IsSigned) const;
-  SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
+  const MCPhysReg *getScratchRegisters(CallingConv::ID CC) const override;
 
-  SDValue LowerGlobalAddressELFSmall(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerGlobalAddressELFLarge(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerGlobalAddressELF(SDValue Op, SelectionDAG &DAG) const;
+  /// \brief Returns false if N is a bit extraction pattern of (X >> C) & Mask.
+  bool isDesirableToCommuteWithShift(const SDNode *N) const override;
 
-  SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
+  /// \brief Returns true if it is beneficial to convert a load of a constant
+  /// to just the constant itself.
+  bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
+                                         Type *Ty) const override;
 
-  SDValue LowerTLSDescCall(SDValue SymAddr, SDValue DescAddr, SDLoc DL,
-                           SelectionDAG &DAG) const;
-  SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG, bool IsSigned) const;
-  SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerVACOPY(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
+  Value *emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
+                        AtomicOrdering Ord) const override;
+  Value *emitStoreConditional(IRBuilder<> &Builder, Value *Val,
+                              Value *Addr, AtomicOrdering Ord) const override;
 
-  virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+  bool shouldExpandAtomicInIR(Instruction *Inst) const override;
 
-  /// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster
-  /// than a pair of fmul and fadd instructions. fmuladd intrinsics will be
-  /// expanded to FMAs when this method returns true, otherwise fmuladd is
-  /// expanded to fmul + fadd.
-  virtual bool isFMAFasterThanFMulAndFAdd(EVT VT) const;
+private:
+  /// Subtarget - Keep a pointer to the AArch64Subtarget around so that we can
+  /// make the right decision when generating code for different targets.
+  const AArch64Subtarget *Subtarget;
 
-  ConstraintType getConstraintType(const std::string &Constraint) const;
+  void addTypeForNEON(EVT VT, EVT PromotedBitwiseVT);
+  void addDRTypeForNEON(MVT VT);
+  void addQRTypeForNEON(MVT VT);
 
-  ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &Info,
-                                                  const char *Constraint) const;
-  void LowerAsmOperandForConstraint(SDValue Op,
-                                    std::string &Constraint,
-                                    std::vector<SDValue> &Ops,
-                                    SelectionDAG &DAG) const;
+  SDValue
+  LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+                       const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL,
+                       SelectionDAG &DAG,
+                       SmallVectorImpl<SDValue> &InVals) const override;
 
-  std::pair<unsigned, const TargetRegisterClass*>
-  getRegForInlineAsmConstraint(const std::string &Constraint, MVT VT) const;
+  SDValue LowerCall(CallLoweringInfo & /*CLI*/,
+                    SmallVectorImpl<SDValue> &InVals) const override;
 
-  virtual bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I,
-                                  unsigned Intrinsic) const override;
+  SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
+                          CallingConv::ID CallConv, bool isVarArg,
+                          const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL,
+                          SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
+                          bool isThisReturn, SDValue ThisVal) const;
+
+  bool isEligibleForTailCallOptimization(
+      SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
+      bool isCalleeStructRet, bool isCallerStructRet,
+      const SmallVectorImpl<ISD::OutputArg> &Outs,
+      const SmallVectorImpl<SDValue> &OutVals,
+      const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const;
 
-protected:
-  std::pair<const TargetRegisterClass*, uint8_t>
-  findRepresentativeClass(MVT VT) const;
+  /// Finds the incoming stack arguments which overlap the given fixed stack
+  /// object and incorporates their load into the current chain. This prevents
+  /// an upcoming store from clobbering the stack argument before it's used.
+  SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG,
+                              MachineFrameInfo *MFI, int ClobberedFI) const;
 
-private:
-  const InstrItineraryData *Itins;
+  bool DoesCalleeRestoreStack(CallingConv::ID CallCC, bool TailCallOpt) const;
 
-  const AArch64Subtarget *getSubtarget() const {
-    return &getTargetMachine().getSubtarget<AArch64Subtarget>();
-  }
-};
-enum NeonModImmType {
-  Neon_Mov_Imm,
-  Neon_Mvn_Imm
+  bool IsTailCallConvention(CallingConv::ID CallCC) const;
+
+  void saveVarArgRegisters(CCState &CCInfo, SelectionDAG &DAG, SDLoc DL,
+                           SDValue &Chain) const;
+
+  bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
+                      bool isVarArg,
+                      const SmallVectorImpl<ISD::OutputArg> &Outs,
+                      LLVMContext &Context) const override;
+
+  SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+                      const SmallVectorImpl<ISD::OutputArg> &Outs,
+                      const SmallVectorImpl<SDValue> &OutVals, SDLoc DL,
+                      SelectionDAG &DAG) const override;
+
+  SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerDarwinGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerELFGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerELFTLSDescCall(SDValue SymAddr, SDValue DescAddr, SDLoc DL,
+                              SelectionDAG &DAG) const;
+  SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerAAPCS_VASTART(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerDarwin_VASTART(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerVACOPY(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerVectorSRA_SRL_SHL(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerShiftRightParts(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerCTPOP(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerF128Call(SDValue Op, SelectionDAG &DAG,
+                        RTLIB::Libcall Call) const;
+  SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerVectorAND(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerVectorOR(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const;
+
+  ConstraintType
+  getConstraintType(const std::string &Constraint) const override;
+  unsigned getRegisterByName(const char* RegName, EVT VT) const override;
+
+  /// Examine constraint string and operand type and determine a weight value.
+  /// The operand object must already have been set up with the operand type.
+  ConstraintWeight
+  getSingleConstraintMatchWeight(AsmOperandInfo &info,
+                                 const char *constraint) const override;
+
+  std::pair<unsigned, const TargetRegisterClass *>
+  getRegForInlineAsmConstraint(const std::string &Constraint,
+                               MVT VT) const override;
+  void LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint,
+                                    std::vector<SDValue> &Ops,
+                                    SelectionDAG &DAG) const override;
+
+  bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const override;
+  bool mayBeEmittedAsTailCall(CallInst *CI) const override;
+  bool getIndexedAddressParts(SDNode *Op, SDValue &Base, SDValue &Offset,
+                              ISD::MemIndexedMode &AM, bool &IsInc,
+                              SelectionDAG &DAG) const;
+  bool getPreIndexedAddressParts(SDNode *N, SDValue &Base, SDValue &Offset,
+                                 ISD::MemIndexedMode &AM,
+                                 SelectionDAG &DAG) const override;
+  bool getPostIndexedAddressParts(SDNode *N, SDNode *Op, SDValue &Base,
+                                  SDValue &Offset, ISD::MemIndexedMode &AM,
+                                  SelectionDAG &DAG) const override;
+
+  void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
+                          SelectionDAG &DAG) const override;
 };
 
-extern SDValue ScanBUILD_VECTOR(SDValue Op, bool &isOnlyLowElement,
-                                bool &usesOnlyOneValue, bool &hasDominantValue,
-                                bool &isConstant, bool &isUNDEF);
-} // namespace llvm
+namespace AArch64 {
+FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
+                         const TargetLibraryInfo *libInfo);
+} // end namespace AArch64
+
+} // end namespace llvm
 
-#endif // LLVM_TARGET_AARCH64_ISELLOWERING_H
+#endif // LLVM_TARGET_AArch64_ISELLOWERING_H
diff --git a/lib/Target/AArch64/AArch64InstrAtomics.td b/lib/Target/AArch64/AArch64InstrAtomics.td
new file mode 100644
index 0000000..3b9e3c6
--- /dev/null
+++ b/lib/Target/AArch64/AArch64InstrAtomics.td
@@ -0,0 +1,364 @@
+//=- AArch64InstrAtomics.td - AArch64 Atomic codegen support -*- tablegen -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// AArch64 Atomic operand code-gen constructs.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------
+// Atomic fences
+//===----------------------------------
+def : Pat<(atomic_fence (i64 4), (imm)), (DMB (i32 0x9))>;
+def : Pat<(atomic_fence (imm), (imm)), (DMB (i32 0xb))>;
+
+//===----------------------------------
+// Atomic loads
+//===----------------------------------
+
+// When they're actually atomic, only one addressing mode (GPR64sp) is
+// supported, but when they're relaxed and anything can be used, all the
+// standard modes would be valid and may give efficiency gains.
+
+// A atomic load operation that actually needs acquire semantics.
+class acquiring_load<PatFrag base>
+  : PatFrag<(ops node:$ptr), (base node:$ptr), [{
+  AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering();
+  assert(Ordering != AcquireRelease && "unexpected load ordering");
+  return Ordering == Acquire || Ordering == SequentiallyConsistent;
+}]>;
+
+// An atomic load operation that does not need either acquire or release
+// semantics.
+class relaxed_load<PatFrag base>
+  : PatFrag<(ops node:$ptr), (base node:$ptr), [{
+  AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering();
+  return Ordering == Monotonic || Ordering == Unordered;
+}]>;
+
+// 8-bit loads
+def : Pat<(acquiring_load<atomic_load_8>  GPR64sp:$ptr), (LDARB GPR64sp:$ptr)>;
+def : Pat<(relaxed_load<atomic_load_8> (ro_Windexed8 GPR64sp:$Rn, GPR32:$Rm,
+                                                     ro_Wextend8:$offset)),
+          (LDRBBroW GPR64sp:$Rn, GPR32:$Rm, ro_Wextend8:$offset)>;
+def : Pat<(relaxed_load<atomic_load_8> (ro_Xindexed8 GPR64sp:$Rn, GPR64:$Rm,
+                                                     ro_Xextend8:$offset)),
+          (LDRBBroX GPR64sp:$Rn, GPR64:$Rm, ro_Xextend8:$offset)>;
+def : Pat<(relaxed_load<atomic_load_8> (am_indexed8 GPR64sp:$Rn,
+                                                    uimm12s1:$offset)),
+          (LDRBBui GPR64sp:$Rn, uimm12s1:$offset)>;
+def : Pat<(relaxed_load<atomic_load_8>
+               (am_unscaled8 GPR64sp:$Rn, simm9:$offset)),
+          (LDURBBi GPR64sp:$Rn, simm9:$offset)>;
+
+// 16-bit loads
+def : Pat<(acquiring_load<atomic_load_16> GPR64sp:$ptr), (LDARH GPR64sp:$ptr)>;
+def : Pat<(relaxed_load<atomic_load_16> (ro_Windexed16 GPR64sp:$Rn, GPR32:$Rm,
+                                                       ro_Wextend16:$extend)),
+          (LDRHHroW GPR64sp:$Rn, GPR32:$Rm, ro_Wextend16:$extend)>;
+def : Pat<(relaxed_load<atomic_load_16> (ro_Xindexed16 GPR64sp:$Rn, GPR64:$Rm,
+                                                       ro_Xextend16:$extend)),
+          (LDRHHroX GPR64sp:$Rn, GPR64:$Rm, ro_Xextend16:$extend)>;
+def : Pat<(relaxed_load<atomic_load_16> (am_indexed16 GPR64sp:$Rn,
+                                                      uimm12s2:$offset)),
+          (LDRHHui GPR64sp:$Rn, uimm12s2:$offset)>;
+def : Pat<(relaxed_load<atomic_load_16>
+               (am_unscaled16 GPR64sp:$Rn, simm9:$offset)),
+          (LDURHHi GPR64sp:$Rn, simm9:$offset)>;
+
+// 32-bit loads
+def : Pat<(acquiring_load<atomic_load_32> GPR64sp:$ptr), (LDARW GPR64sp:$ptr)>;
+def : Pat<(relaxed_load<atomic_load_32> (ro_Windexed32 GPR64sp:$Rn, GPR32:$Rm,
+                                                       ro_Wextend32:$extend)),
+          (LDRWroW GPR64sp:$Rn, GPR32:$Rm, ro_Wextend32:$extend)>;
+def : Pat<(relaxed_load<atomic_load_32> (ro_Xindexed32 GPR64sp:$Rn, GPR64:$Rm,
+                                                       ro_Xextend32:$extend)),
+          (LDRWroX GPR64sp:$Rn, GPR64:$Rm, ro_Xextend32:$extend)>;
+def : Pat<(relaxed_load<atomic_load_32> (am_indexed32 GPR64sp:$Rn,
+                                                      uimm12s4:$offset)),
+          (LDRWui GPR64sp:$Rn, uimm12s4:$offset)>;
+def : Pat<(relaxed_load<atomic_load_32>
+               (am_unscaled32 GPR64sp:$Rn, simm9:$offset)),
+          (LDURWi GPR64sp:$Rn, simm9:$offset)>;
+
+// 64-bit loads
+def : Pat<(acquiring_load<atomic_load_64> GPR64sp:$ptr), (LDARX GPR64sp:$ptr)>;
+def : Pat<(relaxed_load<atomic_load_64> (ro_Windexed64 GPR64sp:$Rn, GPR32:$Rm,
+                                                       ro_Wextend64:$extend)),
+          (LDRXroW GPR64sp:$Rn, GPR32:$Rm, ro_Wextend64:$extend)>;
+def : Pat<(relaxed_load<atomic_load_64> (ro_Xindexed64 GPR64sp:$Rn, GPR64:$Rm,
+                                                       ro_Xextend64:$extend)),
+          (LDRXroX GPR64sp:$Rn, GPR64:$Rm, ro_Xextend64:$extend)>;
+def : Pat<(relaxed_load<atomic_load_64> (am_indexed64 GPR64sp:$Rn,
+                                                      uimm12s8:$offset)),
+          (LDRXui GPR64sp:$Rn, uimm12s8:$offset)>;
+def : Pat<(relaxed_load<atomic_load_64>
+               (am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
+          (LDURXi GPR64sp:$Rn, simm9:$offset)>;
+
+//===----------------------------------
+// Atomic stores
+//===----------------------------------
+
+// When they're actually atomic, only one addressing mode (GPR64sp) is
+// supported, but when they're relaxed and anything can be used, all the
+// standard modes would be valid and may give efficiency gains.
+
+// A store operation that actually needs release semantics.
+class releasing_store<PatFrag base>
+  : PatFrag<(ops node:$ptr, node:$val), (base node:$ptr, node:$val), [{
+  AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering();
+  assert(Ordering != AcquireRelease && "unexpected store ordering");
+  return Ordering == Release || Ordering == SequentiallyConsistent;
+}]>;
+
+// An atomic store operation that doesn't actually need to be atomic on AArch64.
+class relaxed_store<PatFrag base>
+  : PatFrag<(ops node:$ptr, node:$val), (base node:$ptr, node:$val), [{
+  AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering();
+  return Ordering == Monotonic || Ordering == Unordered;
+}]>;
+
+// 8-bit stores
+def : Pat<(releasing_store<atomic_store_8> GPR64sp:$ptr, GPR32:$val),
+          (STLRB GPR32:$val, GPR64sp:$ptr)>;
+def : Pat<(relaxed_store<atomic_store_8>
+               (ro_Windexed8 GPR64sp:$Rn, GPR32:$Rm, ro_Wextend8:$extend),
+               GPR32:$val),
+          (STRBBroW GPR32:$val, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend8:$extend)>;
+def : Pat<(relaxed_store<atomic_store_8>
+               (ro_Xindexed8 GPR64sp:$Rn, GPR64:$Rm, ro_Xextend8:$extend),
+               GPR32:$val),
+          (STRBBroX GPR32:$val, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend8:$extend)>;
+def : Pat<(relaxed_store<atomic_store_8>
+               (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset), GPR32:$val),
+          (STRBBui GPR32:$val, GPR64sp:$Rn, uimm12s1:$offset)>;
+def : Pat<(relaxed_store<atomic_store_8>
+               (am_unscaled8 GPR64sp:$Rn, simm9:$offset), GPR32:$val),
+          (STURBBi GPR32:$val, GPR64sp:$Rn, simm9:$offset)>;
+
+// 16-bit stores
+def : Pat<(releasing_store<atomic_store_16> GPR64sp:$ptr, GPR32:$val),
+          (STLRH GPR32:$val, GPR64sp:$ptr)>;
+def : Pat<(relaxed_store<atomic_store_16> (ro_Windexed16 GPR64sp:$Rn, GPR32:$Rm,
+                                                         ro_Wextend16:$extend),
+                                          GPR32:$val),
+          (STRHHroW GPR32:$val, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend16:$extend)>;
+def : Pat<(relaxed_store<atomic_store_16> (ro_Xindexed16 GPR64sp:$Rn, GPR64:$Rm,
+                                                         ro_Xextend16:$extend),
+                                          GPR32:$val),
+          (STRHHroX GPR32:$val, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend16:$extend)>;
+def : Pat<(relaxed_store<atomic_store_16>
+              (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset), GPR32:$val),
+          (STRHHui GPR32:$val, GPR64sp:$Rn, uimm12s2:$offset)>;
+def : Pat<(relaxed_store<atomic_store_16>
+               (am_unscaled16 GPR64sp:$Rn, simm9:$offset), GPR32:$val),
+          (STURHHi GPR32:$val, GPR64sp:$Rn, simm9:$offset)>;
+
+// 32-bit stores
+def : Pat<(releasing_store<atomic_store_32> GPR64sp:$ptr, GPR32:$val),
+          (STLRW GPR32:$val, GPR64sp:$ptr)>;
+def : Pat<(relaxed_store<atomic_store_32> (ro_Windexed32 GPR64sp:$Rn, GPR32:$Rm,
+                                                         ro_Wextend32:$extend),
+                                          GPR32:$val),
+          (STRWroW GPR32:$val, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend32:$extend)>;
+def : Pat<(relaxed_store<atomic_store_32> (ro_Xindexed32 GPR64sp:$Rn, GPR64:$Rm,
+                                                         ro_Xextend32:$extend),
+                                          GPR32:$val),
+          (STRWroX GPR32:$val, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend32:$extend)>;
+def : Pat<(relaxed_store<atomic_store_32>
+              (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset), GPR32:$val),
+          (STRWui GPR32:$val, GPR64sp:$Rn, uimm12s4:$offset)>;
+def : Pat<(relaxed_store<atomic_store_32>
+               (am_unscaled32 GPR64sp:$Rn, simm9:$offset), GPR32:$val),
+          (STURWi GPR32:$val, GPR64sp:$Rn, simm9:$offset)>;
+
+// 64-bit stores
+def : Pat<(releasing_store<atomic_store_64> GPR64sp:$ptr, GPR64:$val),
+          (STLRX GPR64:$val, GPR64sp:$ptr)>;
+def : Pat<(relaxed_store<atomic_store_64> (ro_Windexed64 GPR64sp:$Rn, GPR32:$Rm,
+                                                         ro_Wextend16:$extend),
+                                          GPR64:$val),
+          (STRXroW GPR64:$val, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend64:$extend)>;
+def : Pat<(relaxed_store<atomic_store_64> (ro_Xindexed64 GPR64sp:$Rn, GPR64:$Rm,
+                                                         ro_Xextend16:$extend),
+                                          GPR64:$val),
+          (STRXroX GPR64:$val, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend64:$extend)>;
+def : Pat<(relaxed_store<atomic_store_64>
+              (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset), GPR64:$val),
+          (STRXui GPR64:$val, GPR64sp:$Rn, uimm12s8:$offset)>;
+def : Pat<(relaxed_store<atomic_store_64>
+               (am_unscaled64 GPR64sp:$Rn, simm9:$offset), GPR64:$val),
+          (STURXi GPR64:$val, GPR64sp:$Rn, simm9:$offset)>;
+
+//===----------------------------------
+// Low-level exclusive operations
+//===----------------------------------
+
+// Load-exclusives.
+
+def ldxr_1 : PatFrag<(ops node:$ptr), (int_aarch64_ldxr node:$ptr), [{
+  return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i8;
+}]>;
+
+def ldxr_2 : PatFrag<(ops node:$ptr), (int_aarch64_ldxr node:$ptr), [{
+  return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i16;
+}]>;
+
+def ldxr_4 : PatFrag<(ops node:$ptr), (int_aarch64_ldxr node:$ptr), [{
+  return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i32;
+}]>;
+
+def ldxr_8 : PatFrag<(ops node:$ptr), (int_aarch64_ldxr node:$ptr), [{
+  return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i64;
+}]>;
+
+def : Pat<(ldxr_1 GPR64sp:$addr),
+          (SUBREG_TO_REG (i64 0), (LDXRB GPR64sp:$addr), sub_32)>;
+def : Pat<(ldxr_2 GPR64sp:$addr),
+          (SUBREG_TO_REG (i64 0), (LDXRH GPR64sp:$addr), sub_32)>;
+def : Pat<(ldxr_4 GPR64sp:$addr),
+          (SUBREG_TO_REG (i64 0), (LDXRW GPR64sp:$addr), sub_32)>;
+def : Pat<(ldxr_8 GPR64sp:$addr), (LDXRX GPR64sp:$addr)>;
+
+def : Pat<(and (ldxr_1 GPR64sp:$addr), 0xff),
+          (SUBREG_TO_REG (i64 0), (LDXRB GPR64sp:$addr), sub_32)>;
+def : Pat<(and (ldxr_2 GPR64sp:$addr), 0xffff),
+          (SUBREG_TO_REG (i64 0), (LDXRH GPR64sp:$addr), sub_32)>;
+def : Pat<(and (ldxr_4 GPR64sp:$addr), 0xffffffff),
+          (SUBREG_TO_REG (i64 0), (LDXRW GPR64sp:$addr), sub_32)>;
+
+// Load-exclusives.
+
+def ldaxr_1 : PatFrag<(ops node:$ptr), (int_aarch64_ldaxr node:$ptr), [{
+  return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i8;
+}]>;
+
+def ldaxr_2 : PatFrag<(ops node:$ptr), (int_aarch64_ldaxr node:$ptr), [{
+  return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i16;
+}]>;
+
+def ldaxr_4 : PatFrag<(ops node:$ptr), (int_aarch64_ldaxr node:$ptr), [{
+  return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i32;
+}]>;
+
+def ldaxr_8 : PatFrag<(ops node:$ptr), (int_aarch64_ldaxr node:$ptr), [{
+  return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i64;
+}]>;
+
+def : Pat<(ldaxr_1 GPR64sp:$addr),
+          (SUBREG_TO_REG (i64 0), (LDAXRB GPR64sp:$addr), sub_32)>;
+def : Pat<(ldaxr_2 GPR64sp:$addr),
+          (SUBREG_TO_REG (i64 0), (LDAXRH GPR64sp:$addr), sub_32)>;
+def : Pat<(ldaxr_4 GPR64sp:$addr),
+          (SUBREG_TO_REG (i64 0), (LDAXRW GPR64sp:$addr), sub_32)>;
+def : Pat<(ldaxr_8 GPR64sp:$addr), (LDAXRX GPR64sp:$addr)>;
+
+def : Pat<(and (ldaxr_1 GPR64sp:$addr), 0xff),
+          (SUBREG_TO_REG (i64 0), (LDAXRB GPR64sp:$addr), sub_32)>;
+def : Pat<(and (ldaxr_2 GPR64sp:$addr), 0xffff),
+          (SUBREG_TO_REG (i64 0), (LDAXRH GPR64sp:$addr), sub_32)>;
+def : Pat<(and (ldaxr_4 GPR64sp:$addr), 0xffffffff),
+          (SUBREG_TO_REG (i64 0), (LDAXRW GPR64sp:$addr), sub_32)>;
+
+// Store-exclusives.
+
+def stxr_1 : PatFrag<(ops node:$val, node:$ptr),
+                     (int_aarch64_stxr node:$val, node:$ptr), [{
+  return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i8;
+}]>;
+
+def stxr_2 : PatFrag<(ops node:$val, node:$ptr),
+                     (int_aarch64_stxr node:$val, node:$ptr), [{
+  return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i16;
+}]>;
+
+def stxr_4 : PatFrag<(ops node:$val, node:$ptr),
+                     (int_aarch64_stxr node:$val, node:$ptr), [{
+  return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i32;
+}]>;
+
+def stxr_8 : PatFrag<(ops node:$val, node:$ptr),
+                     (int_aarch64_stxr node:$val, node:$ptr), [{
+  return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i64;
+}]>;
+
+
+def : Pat<(stxr_1 GPR64:$val, GPR64sp:$addr),
+          (STXRB (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>;
+def : Pat<(stxr_2 GPR64:$val, GPR64sp:$addr),
+          (STXRH (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>;
+def : Pat<(stxr_4 GPR64:$val, GPR64sp:$addr),
+          (STXRW (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>;
+def : Pat<(stxr_8 GPR64:$val, GPR64sp:$addr),
+          (STXRX GPR64:$val, GPR64sp:$addr)>;
+
+def : Pat<(stxr_1 (zext (and GPR32:$val, 0xff)), GPR64sp:$addr),
+          (STXRB GPR32:$val, GPR64sp:$addr)>;
+def : Pat<(stxr_2 (zext (and GPR32:$val, 0xffff)), GPR64sp:$addr),
+          (STXRH GPR32:$val, GPR64sp:$addr)>;
+def : Pat<(stxr_4 (zext GPR32:$val), GPR64sp:$addr),
+          (STXRW GPR32:$val, GPR64sp:$addr)>;
+
+def : Pat<(stxr_1 (and GPR64:$val, 0xff), GPR64sp:$addr),
+          (STXRB (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>;
+def : Pat<(stxr_2 (and GPR64:$val, 0xffff), GPR64sp:$addr),
+          (STXRH (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>;
+def : Pat<(stxr_4 (and GPR64:$val, 0xffffffff), GPR64sp:$addr),
+          (STXRW (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>;
+
+// Store-release-exclusives.
+
+def stlxr_1 : PatFrag<(ops node:$val, node:$ptr),
+                     (int_aarch64_stlxr node:$val, node:$ptr), [{
+  return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i8;
+}]>;
+
+def stlxr_2 : PatFrag<(ops node:$val, node:$ptr),
+                     (int_aarch64_stlxr node:$val, node:$ptr), [{
+  return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i16;
+}]>;
+
+def stlxr_4 : PatFrag<(ops node:$val, node:$ptr),
+                     (int_aarch64_stlxr node:$val, node:$ptr), [{
+  return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i32;
+}]>;
+
+def stlxr_8 : PatFrag<(ops node:$val, node:$ptr),
+                     (int_aarch64_stlxr node:$val, node:$ptr), [{
+  return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i64;
+}]>;
+
+
+def : Pat<(stlxr_1 GPR64:$val, GPR64sp:$addr),
+          (STLXRB (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>;
+def : Pat<(stlxr_2 GPR64:$val, GPR64sp:$addr),
+          (STLXRH (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>;
+def : Pat<(stlxr_4 GPR64:$val, GPR64sp:$addr),
+          (STLXRW (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>;
+def : Pat<(stlxr_8 GPR64:$val, GPR64sp:$addr),
+          (STLXRX GPR64:$val, GPR64sp:$addr)>;
+
+def : Pat<(stlxr_1 (zext (and GPR32:$val, 0xff)), GPR64sp:$addr),
+          (STLXRB GPR32:$val, GPR64sp:$addr)>;
+def : Pat<(stlxr_2 (zext (and GPR32:$val, 0xffff)), GPR64sp:$addr),
+          (STLXRH GPR32:$val, GPR64sp:$addr)>;
+def : Pat<(stlxr_4 (zext GPR32:$val), GPR64sp:$addr),
+          (STLXRW GPR32:$val, GPR64sp:$addr)>;
+
+def : Pat<(stlxr_1 (and GPR64:$val, 0xff), GPR64sp:$addr),
+          (STLXRB (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>;
+def : Pat<(stlxr_2 (and GPR64:$val, 0xffff), GPR64sp:$addr),
+          (STLXRH (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>;
+def : Pat<(stlxr_4 (and GPR64:$val, 0xffffffff), GPR64sp:$addr),
+          (STLXRW (EXTRACT_SUBREG GPR64:$val, sub_32), GPR64sp:$addr)>;
+
+
+// And clear exclusive.
+
+def : Pat<(int_aarch64_clrex), (CLREX 0xf)>;
diff --git a/lib/Target/AArch64/AArch64InstrFormats.td b/lib/Target/AArch64/AArch64InstrFormats.td
index 4cc3813..d455d7e 100644
--- a/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/lib/Target/AArch64/AArch64InstrFormats.td
@@ -1,4 +1,4 @@
-//===- AArch64InstrFormats.td - AArch64 Instruction Formats --*- tablegen -*-=//
+//===- AArch64InstrFormats.td - AArch64 Instruction Formats --*- tblgen -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -6,1482 +6,8569 @@
 // License. See LICENSE.TXT for details.
 //
 //===----------------------------------------------------------------------===//
-// This file describes AArch64 instruction formats, down to the level of the
-// instruction's overall class.
-//===----------------------------------------------------------------------===//
-
 
 //===----------------------------------------------------------------------===//
-// A64 Instruction Format Definitions.
-//===----------------------------------------------------------------------===//
+//  Describe AArch64 instructions format here
+//
+
+// Format specifies the encoding used by the instruction.  This is part of the
+// ad-hoc solution used to emit machine instruction encodings by our machine
+// code emitter.
+class Format<bits<2> val> {
+  bits<2> Value = val;
+}
 
-// A64 is currently the only instruction set supported by the AArch64
-// architecture.
-class A64Inst<dag outs, dag ins, string asmstr, list<dag> patterns,
-              InstrItinClass itin>
-    : Instruction {
-  // All A64 instructions are 32-bit. This field will be filled in
-  // gradually going down the hierarchy.
-  field bits<32> Inst;
+def PseudoFrm   : Format<0>;
+def NormalFrm   : Format<1>; // Do we need any others?
 
+// AArch64 Instruction Format
+class AArch64Inst<Format f, string cstr> : Instruction {
+  field bits<32> Inst; // Instruction encoding.
+  // Mask of bits that cause an encoding to be UNPREDICTABLE.
+  // If a bit is set, then if the corresponding bit in the
+  // target encoding differs from its value in the "Inst" field,
+  // the instruction is UNPREDICTABLE (SoftFail in abstract parlance).
   field bits<32> Unpredictable = 0;
   // SoftFail is the generic name for this field, but we alias it so
   // as to make it more obvious what it means in ARM-land.
   field bits<32> SoftFail = Unpredictable;
+  let Namespace   = "AArch64";
+  Format F        = f;
+  bits<2> Form    = F.Value;
+  let Pattern     = [];
+  let Constraints = cstr;
+}
+
+// Pseudo instructions (don't have encoding information)
+class Pseudo<dag oops, dag iops, list<dag> pattern, string cstr = "">
+    : AArch64Inst<PseudoFrm, cstr> {
+  dag OutOperandList = oops;
+  dag InOperandList  = iops;
+  let Pattern        = pattern;
+  let isCodeGenOnly  = 1;
+}
 
-  // LLVM-level model of the AArch64/A64 distinction.
-  let Namespace = "AArch64";
-  let DecoderNamespace = "A64";
+// Real instructions (have encoding information)
+class EncodedI<string cstr, list<dag> pattern> : AArch64Inst<NormalFrm, cstr> {
+  let Pattern = pattern;
   let Size = 4;
+}
 
-  // Set the templated fields
-  let OutOperandList = outs;
-  let InOperandList = ins;
-  let AsmString = asmstr;
-  let Pattern = patterns;
-  let Itinerary = itin;
+// Normal instructions
+class I<dag oops, dag iops, string asm, string operands, string cstr,
+        list<dag> pattern>
+    : EncodedI<cstr, pattern> {
+  dag OutOperandList = oops;
+  dag InOperandList  = iops;
+  let AsmString      = !strconcat(asm, operands);
 }
 
-class PseudoInst<dag outs, dag ins, list<dag> patterns> : Instruction {
-  let Namespace = "AArch64";
+class TriOpFrag<dag res> : PatFrag<(ops node:$LHS, node:$MHS, node:$RHS), res>;
+class BinOpFrag<dag res> : PatFrag<(ops node:$LHS, node:$RHS), res>;
+class UnOpFrag<dag res>  : PatFrag<(ops node:$LHS), res>;
+
+// Helper fragment for an extract of the high portion of a 128-bit vector.
+def extract_high_v16i8 :
+   UnOpFrag<(extract_subvector (v16i8 node:$LHS), (i64 8))>;
+def extract_high_v8i16 :
+   UnOpFrag<(extract_subvector (v8i16 node:$LHS), (i64 4))>;
+def extract_high_v4i32 :
+   UnOpFrag<(extract_subvector (v4i32 node:$LHS), (i64 2))>;
+def extract_high_v2i64 :
+   UnOpFrag<(extract_subvector (v2i64 node:$LHS), (i64 1))>;
+
+//===----------------------------------------------------------------------===//
+// Asm Operand Classes.
+//
 
-  let OutOperandList = outs;
-  let InOperandList= ins;
-  let Pattern = patterns;
-  let isCodeGenOnly = 1;
-  let isPseudo = 1;
+// Shifter operand for arithmetic shifted encodings.
+def ShifterOperand : AsmOperandClass {
+  let Name = "Shifter";
 }
 
-// Represents a pseudo-instruction that represents a single A64 instruction for
-// whatever reason, the eventual result will be a 32-bit real instruction.
-class A64PseudoInst<dag outs, dag ins, list<dag> patterns>
-  : PseudoInst<outs, ins, patterns> {
-  let Size = 4;
+// Shifter operand for mov immediate encodings.
+def MovImm32ShifterOperand : AsmOperandClass {
+  let SuperClasses = [ShifterOperand];
+  let Name = "MovImm32Shifter";
+  let RenderMethod = "addShifterOperands";
+  let DiagnosticType = "InvalidMovImm32Shift";
+}
+def MovImm64ShifterOperand : AsmOperandClass {
+  let SuperClasses = [ShifterOperand];
+  let Name = "MovImm64Shifter";
+  let RenderMethod = "addShifterOperands";
+  let DiagnosticType = "InvalidMovImm64Shift";
+}
+
+// Shifter operand for arithmetic register shifted encodings.
+class ArithmeticShifterOperand<int width> : AsmOperandClass {
+  let SuperClasses = [ShifterOperand];
+  let Name = "ArithmeticShifter" # width;
+  let PredicateMethod = "isArithmeticShifter<" # width # ">";
+  let RenderMethod = "addShifterOperands";
+  let DiagnosticType = "AddSubRegShift" # width;
 }
 
-// As above, this will be a single A64 instruction, but we can actually give the
-// expansion in TableGen.
-class A64PseudoExpand<dag outs, dag ins, list<dag> patterns, dag Result>
-  : A64PseudoInst<outs, ins, patterns>,
-    PseudoInstExpansion<Result>;
+def ArithmeticShifterOperand32 : ArithmeticShifterOperand<32>;
+def ArithmeticShifterOperand64 : ArithmeticShifterOperand<64>;
 
+// Shifter operand for logical register shifted encodings.
+class LogicalShifterOperand<int width> : AsmOperandClass {
+  let SuperClasses = [ShifterOperand];
+  let Name = "LogicalShifter" # width;
+  let PredicateMethod = "isLogicalShifter<" # width # ">";
+  let RenderMethod = "addShifterOperands";
+  let DiagnosticType = "AddSubRegShift" # width;
+}
 
-// First, some common cross-hierarchy register formats.
+def LogicalShifterOperand32 : LogicalShifterOperand<32>;
+def LogicalShifterOperand64 : LogicalShifterOperand<64>;
 
-class A64InstRd<dag outs, dag ins, string asmstr,
-                list<dag> patterns, InstrItinClass itin>
-  : A64Inst<outs, ins, asmstr, patterns, itin> {
-  bits<5> Rd;
+// Shifter operand for logical vector 128/64-bit shifted encodings.
+def LogicalVecShifterOperand : AsmOperandClass {
+  let SuperClasses = [ShifterOperand];
+  let Name = "LogicalVecShifter";
+  let RenderMethod = "addShifterOperands";
+}
+def LogicalVecHalfWordShifterOperand : AsmOperandClass {
+  let SuperClasses = [LogicalVecShifterOperand];
+  let Name = "LogicalVecHalfWordShifter";
+  let RenderMethod = "addShifterOperands";
+}
 
-  let Inst{4-0} = Rd;
+// The "MSL" shifter on the vector MOVI instruction.
+def MoveVecShifterOperand : AsmOperandClass {
+  let SuperClasses = [ShifterOperand];
+  let Name = "MoveVecShifter";
+  let RenderMethod = "addShifterOperands";
 }
 
-class A64InstRt<dag outs, dag ins, string asmstr,
-                list<dag> patterns, InstrItinClass itin>
-  : A64Inst<outs, ins, asmstr, patterns, itin> {
-  bits<5> Rt;
+// Extend operand for arithmetic encodings.
+def ExtendOperand : AsmOperandClass {
+  let Name = "Extend";
+  let DiagnosticType = "AddSubRegExtendLarge";
+}
+def ExtendOperand64 : AsmOperandClass {
+  let SuperClasses = [ExtendOperand];
+  let Name = "Extend64";
+  let DiagnosticType = "AddSubRegExtendSmall";
+}
+// 'extend' that's a lsl of a 64-bit register.
+def ExtendOperandLSL64 : AsmOperandClass {
+  let SuperClasses = [ExtendOperand];
+  let Name = "ExtendLSL64";
+  let RenderMethod = "addExtend64Operands";
+  let DiagnosticType = "AddSubRegExtendLarge";
+}
+
+// 8-bit floating-point immediate encodings.
+def FPImmOperand : AsmOperandClass {
+  let Name = "FPImm";
+  let ParserMethod = "tryParseFPImm";
+  let DiagnosticType = "InvalidFPImm";
+}
+
+def CondCode : AsmOperandClass {
+  let Name = "CondCode";
+  let DiagnosticType = "InvalidCondCode";
+}
+
+// A 32-bit register pasrsed as 64-bit
+def GPR32as64Operand : AsmOperandClass {
+  let Name = "GPR32as64";
+}
+def GPR32as64 : RegisterOperand<GPR32> {
+  let ParserMatchClass = GPR32as64Operand;
+}
+
+// 8-bit immediate for AdvSIMD where 64-bit values of the form:
+// aaaaaaaa bbbbbbbb cccccccc dddddddd eeeeeeee ffffffff gggggggg hhhhhhhh
+// are encoded as the eight bit value 'abcdefgh'.
+def SIMDImmType10Operand : AsmOperandClass { let Name = "SIMDImmType10"; }
+
+
+//===----------------------------------------------------------------------===//
+// Operand Definitions.
+//
+
+// ADR[P] instruction labels.
+def AdrpOperand : AsmOperandClass {
+  let Name = "AdrpLabel";
+  let ParserMethod = "tryParseAdrpLabel";
+  let DiagnosticType = "InvalidLabel";
+}
+def adrplabel : Operand<i64> {
+  let EncoderMethod = "getAdrLabelOpValue";
+  let PrintMethod = "printAdrpLabel";
+  let ParserMatchClass = AdrpOperand;
+}
+
+def AdrOperand : AsmOperandClass {
+  let Name = "AdrLabel";
+  let ParserMethod = "tryParseAdrLabel";
+  let DiagnosticType = "InvalidLabel";
+}
+def adrlabel : Operand<i64> {
+  let EncoderMethod = "getAdrLabelOpValue";
+  let ParserMatchClass = AdrOperand;
+}
+
+// simm9 predicate - True if the immediate is in the range [-256, 255].
+def SImm9Operand : AsmOperandClass {
+  let Name = "SImm9";
+  let DiagnosticType = "InvalidMemoryIndexedSImm9";
+}
+def simm9 : Operand<i64>, ImmLeaf<i64, [{ return Imm >= -256 && Imm < 256; }]> {
+  let ParserMatchClass = SImm9Operand;
+}
+
+// simm7sN predicate - True if the immediate is a multiple of N in the range
+// [-64 * N, 63 * N].
+class SImm7Scaled<int Scale> : AsmOperandClass {
+  let Name = "SImm7s" # Scale;
+  let DiagnosticType = "InvalidMemoryIndexed" # Scale # "SImm7";
+}
+
+def SImm7s4Operand : SImm7Scaled<4>;
+def SImm7s8Operand : SImm7Scaled<8>;
+def SImm7s16Operand : SImm7Scaled<16>;
+
+def simm7s4 : Operand<i32> {
+  let ParserMatchClass = SImm7s4Operand;
+  let PrintMethod = "printImmScale<4>";
+}
+
+def simm7s8 : Operand<i32> {
+  let ParserMatchClass = SImm7s8Operand;
+  let PrintMethod = "printImmScale<8>";
+}
+
+def simm7s16 : Operand<i32> {
+  let ParserMatchClass = SImm7s16Operand;
+  let PrintMethod = "printImmScale<16>";
+}
+
+class AsmImmRange<int Low, int High> : AsmOperandClass {
+  let Name = "Imm" # Low # "_" # High;
+  let DiagnosticType = "InvalidImm" # Low # "_" # High;
+}
+
+def Imm1_8Operand : AsmImmRange<1, 8>;
+def Imm1_16Operand : AsmImmRange<1, 16>;
+def Imm1_32Operand : AsmImmRange<1, 32>;
+def Imm1_64Operand : AsmImmRange<1, 64>;
+
+def MovZSymbolG3AsmOperand : AsmOperandClass {
+  let Name = "MovZSymbolG3";
+  let RenderMethod = "addImmOperands";
+}
+
+def movz_symbol_g3 : Operand<i32> {
+  let ParserMatchClass = MovZSymbolG3AsmOperand;
+}
+
+def MovZSymbolG2AsmOperand : AsmOperandClass {
+  let Name = "MovZSymbolG2";
+  let RenderMethod = "addImmOperands";
+}
+
+def movz_symbol_g2 : Operand<i32> {
+  let ParserMatchClass = MovZSymbolG2AsmOperand;
+}
+
+def MovZSymbolG1AsmOperand : AsmOperandClass {
+  let Name = "MovZSymbolG1";
+  let RenderMethod = "addImmOperands";
+}
+
+def movz_symbol_g1 : Operand<i32> {
+  let ParserMatchClass = MovZSymbolG1AsmOperand;
+}
+
+def MovZSymbolG0AsmOperand : AsmOperandClass {
+  let Name = "MovZSymbolG0";
+  let RenderMethod = "addImmOperands";
+}
+
+def movz_symbol_g0 : Operand<i32> {
+  let ParserMatchClass = MovZSymbolG0AsmOperand;
+}
+
+def MovKSymbolG3AsmOperand : AsmOperandClass {
+  let Name = "MovKSymbolG3";
+  let RenderMethod = "addImmOperands";
+}
+
+def movk_symbol_g3 : Operand<i32> {
+  let ParserMatchClass = MovKSymbolG3AsmOperand;
+}
+
+def MovKSymbolG2AsmOperand : AsmOperandClass {
+  let Name = "MovKSymbolG2";
+  let RenderMethod = "addImmOperands";
+}
+
+def movk_symbol_g2 : Operand<i32> {
+  let ParserMatchClass = MovKSymbolG2AsmOperand;
+}
+
+def MovKSymbolG1AsmOperand : AsmOperandClass {
+  let Name = "MovKSymbolG1";
+  let RenderMethod = "addImmOperands";
+}
+
+def movk_symbol_g1 : Operand<i32> {
+  let ParserMatchClass = MovKSymbolG1AsmOperand;
+}
+
+def MovKSymbolG0AsmOperand : AsmOperandClass {
+  let Name = "MovKSymbolG0";
+  let RenderMethod = "addImmOperands";
+}
+
+def movk_symbol_g0 : Operand<i32> {
+  let ParserMatchClass = MovKSymbolG0AsmOperand;
+}
+
+class fixedpoint_i32<ValueType FloatVT>
+  : Operand<FloatVT>,
+    ComplexPattern<FloatVT, 1, "SelectCVTFixedPosOperand<32>", [fpimm, ld]> {
+  let EncoderMethod = "getFixedPointScaleOpValue";
+  let DecoderMethod = "DecodeFixedPointScaleImm32";
+  let ParserMatchClass = Imm1_32Operand;
+}
+
+class fixedpoint_i64<ValueType FloatVT>
+  : Operand<FloatVT>,
+    ComplexPattern<FloatVT, 1, "SelectCVTFixedPosOperand<64>", [fpimm, ld]> {
+  let EncoderMethod = "getFixedPointScaleOpValue";
+  let DecoderMethod = "DecodeFixedPointScaleImm64";
+  let ParserMatchClass = Imm1_64Operand;
+}
+
+def fixedpoint_f32_i32 : fixedpoint_i32<f32>;
+def fixedpoint_f64_i32 : fixedpoint_i32<f64>;
+
+def fixedpoint_f32_i64 : fixedpoint_i64<f32>;
+def fixedpoint_f64_i64 : fixedpoint_i64<f64>;
+
+def vecshiftR8 : Operand<i32>, ImmLeaf<i32, [{
+  return (((uint32_t)Imm) > 0) && (((uint32_t)Imm) < 9);
+}]> {
+  let EncoderMethod = "getVecShiftR8OpValue";
+  let DecoderMethod = "DecodeVecShiftR8Imm";
+  let ParserMatchClass = Imm1_8Operand;
+}
+def vecshiftR16 : Operand<i32>, ImmLeaf<i32, [{
+  return (((uint32_t)Imm) > 0) && (((uint32_t)Imm) < 17);
+}]> {
+  let EncoderMethod = "getVecShiftR16OpValue";
+  let DecoderMethod = "DecodeVecShiftR16Imm";
+  let ParserMatchClass = Imm1_16Operand;
+}
+def vecshiftR16Narrow : Operand<i32>, ImmLeaf<i32, [{
+  return (((uint32_t)Imm) > 0) && (((uint32_t)Imm) < 9);
+}]> {
+  let EncoderMethod = "getVecShiftR16OpValue";
+  let DecoderMethod = "DecodeVecShiftR16ImmNarrow";
+  let ParserMatchClass = Imm1_8Operand;
+}
+def vecshiftR32 : Operand<i32>, ImmLeaf<i32, [{
+  return (((uint32_t)Imm) > 0) && (((uint32_t)Imm) < 33);
+}]> {
+  let EncoderMethod = "getVecShiftR32OpValue";
+  let DecoderMethod = "DecodeVecShiftR32Imm";
+  let ParserMatchClass = Imm1_32Operand;
+}
+def vecshiftR32Narrow : Operand<i32>, ImmLeaf<i32, [{
+  return (((uint32_t)Imm) > 0) && (((uint32_t)Imm) < 17);
+}]> {
+  let EncoderMethod = "getVecShiftR32OpValue";
+  let DecoderMethod = "DecodeVecShiftR32ImmNarrow";
+  let ParserMatchClass = Imm1_16Operand;
+}
+def vecshiftR64 : Operand<i32>, ImmLeaf<i32, [{
+  return (((uint32_t)Imm) > 0) && (((uint32_t)Imm) < 65);
+}]> {
+  let EncoderMethod = "getVecShiftR64OpValue";
+  let DecoderMethod = "DecodeVecShiftR64Imm";
+  let ParserMatchClass = Imm1_64Operand;
+}
+def vecshiftR64Narrow : Operand<i32>, ImmLeaf<i32, [{
+  return (((uint32_t)Imm) > 0) && (((uint32_t)Imm) < 33);
+}]> {
+  let EncoderMethod = "getVecShiftR64OpValue";
+  let DecoderMethod = "DecodeVecShiftR64ImmNarrow";
+  let ParserMatchClass = Imm1_32Operand;
+}
+
+def Imm0_7Operand : AsmImmRange<0, 7>;
+def Imm0_15Operand : AsmImmRange<0, 15>;
+def Imm0_31Operand : AsmImmRange<0, 31>;
+def Imm0_63Operand : AsmImmRange<0, 63>;
+
+def vecshiftL8 : Operand<i32>, ImmLeaf<i32, [{
+  return (((uint32_t)Imm) < 8);
+}]> {
+  let EncoderMethod = "getVecShiftL8OpValue";
+  let DecoderMethod = "DecodeVecShiftL8Imm";
+  let ParserMatchClass = Imm0_7Operand;
+}
+def vecshiftL16 : Operand<i32>, ImmLeaf<i32, [{
+  return (((uint32_t)Imm) < 16);
+}]> {
+  let EncoderMethod = "getVecShiftL16OpValue";
+  let DecoderMethod = "DecodeVecShiftL16Imm";
+  let ParserMatchClass = Imm0_15Operand;
+}
+def vecshiftL32 : Operand<i32>, ImmLeaf<i32, [{
+  return (((uint32_t)Imm) < 32);
+}]> {
+  let EncoderMethod = "getVecShiftL32OpValue";
+  let DecoderMethod = "DecodeVecShiftL32Imm";
+  let ParserMatchClass = Imm0_31Operand;
+}
+def vecshiftL64 : Operand<i32>, ImmLeaf<i32, [{
+  return (((uint32_t)Imm) < 64);
+}]> {
+  let EncoderMethod = "getVecShiftL64OpValue";
+  let DecoderMethod = "DecodeVecShiftL64Imm";
+  let ParserMatchClass = Imm0_63Operand;
+}
+
+
+// Crazy immediate formats used by 32-bit and 64-bit logical immediate
+// instructions for splatting repeating bit patterns across the immediate.
+def logical_imm32_XFORM : SDNodeXForm<imm, [{
+  uint64_t enc = AArch64_AM::encodeLogicalImmediate(N->getZExtValue(), 32);
+  return CurDAG->getTargetConstant(enc, MVT::i32);
+}]>;
+def logical_imm64_XFORM : SDNodeXForm<imm, [{
+  uint64_t enc = AArch64_AM::encodeLogicalImmediate(N->getZExtValue(), 64);
+  return CurDAG->getTargetConstant(enc, MVT::i32);
+}]>;
+
+def LogicalImm32Operand : AsmOperandClass {
+  let Name = "LogicalImm32";
+  let DiagnosticType = "LogicalSecondSource";
+}
+def LogicalImm64Operand : AsmOperandClass {
+  let Name = "LogicalImm64";
+  let DiagnosticType = "LogicalSecondSource";
+}
+def logical_imm32 : Operand<i32>, PatLeaf<(imm), [{
+  return AArch64_AM::isLogicalImmediate(N->getZExtValue(), 32);
+}], logical_imm32_XFORM> {
+  let PrintMethod = "printLogicalImm32";
+  let ParserMatchClass = LogicalImm32Operand;
+}
+def logical_imm64 : Operand<i64>, PatLeaf<(imm), [{
+  return AArch64_AM::isLogicalImmediate(N->getZExtValue(), 64);
+}], logical_imm64_XFORM> {
+  let PrintMethod = "printLogicalImm64";
+  let ParserMatchClass = LogicalImm64Operand;
+}
+
+// imm0_65535 predicate - True if the immediate is in the range [0,65535].
+def Imm0_65535Operand : AsmImmRange<0, 65535>;
+def imm0_65535 : Operand<i32>, ImmLeaf<i32, [{
+  return ((uint32_t)Imm) < 65536;
+}]> {
+  let ParserMatchClass = Imm0_65535Operand;
+  let PrintMethod = "printHexImm";
+}
+
+// imm0_255 predicate - True if the immediate is in the range [0,255].
+def Imm0_255Operand : AsmOperandClass { let Name = "Imm0_255"; }
+def imm0_255 : Operand<i32>, ImmLeaf<i32, [{
+  return ((uint32_t)Imm) < 256;
+}]> {
+  let ParserMatchClass = Imm0_255Operand;
+  let PrintMethod = "printHexImm";
+}
+
+// imm0_127 predicate - True if the immediate is in the range [0,127]
+def Imm0_127Operand : AsmImmRange<0, 127>;
+def imm0_127 : Operand<i32>, ImmLeaf<i32, [{
+  return ((uint32_t)Imm) < 128;
+}]> {
+  let ParserMatchClass = Imm0_127Operand;
+  let PrintMethod = "printHexImm";
+}
+
+// NOTE: These imm0_N operands have to be of type i64 because i64 is the size
+// for all shift-amounts.
+
+// imm0_63 predicate - True if the immediate is in the range [0,63]
+def imm0_63 : Operand<i64>, ImmLeaf<i64, [{
+  return ((uint64_t)Imm) < 64;
+}]> {
+  let ParserMatchClass = Imm0_63Operand;
+}
+
+// imm0_31 predicate - True if the immediate is in the range [0,31]
+def imm0_31 : Operand<i64>, ImmLeaf<i64, [{
+  return ((uint64_t)Imm) < 32;
+}]> {
+  let ParserMatchClass = Imm0_31Operand;
+}
+
+// imm0_15 predicate - True if the immediate is in the range [0,15]
+def imm0_15 : Operand<i64>, ImmLeaf<i64, [{
+  return ((uint64_t)Imm) < 16;
+}]> {
+  let ParserMatchClass = Imm0_15Operand;
+}
+
+// imm0_7 predicate - True if the immediate is in the range [0,7]
+def imm0_7 : Operand<i64>, ImmLeaf<i64, [{
+  return ((uint64_t)Imm) < 8;
+}]> {
+  let ParserMatchClass = Imm0_7Operand;
+}
+
+// An arithmetic shifter operand:
+//  {7-6} - shift type: 00 = lsl, 01 = lsr, 10 = asr
+//  {5-0} - imm6
+class arith_shift<ValueType Ty, int width> : Operand<Ty> {
+  let PrintMethod = "printShifter";
+  let ParserMatchClass = !cast<AsmOperandClass>(
+                         "ArithmeticShifterOperand" # width);
+}
+
+def arith_shift32 : arith_shift<i32, 32>;
+def arith_shift64 : arith_shift<i64, 64>;
+
+class arith_shifted_reg<ValueType Ty, RegisterClass regclass, int width>
+    : Operand<Ty>,
+      ComplexPattern<Ty, 2, "SelectArithShiftedRegister", []> {
+  let PrintMethod = "printShiftedRegister";
+  let MIOperandInfo = (ops regclass, !cast<Operand>("arith_shift" # width));
+}
+
+def arith_shifted_reg32 : arith_shifted_reg<i32, GPR32, 32>;
+def arith_shifted_reg64 : arith_shifted_reg<i64, GPR64, 64>;
+
+// An arithmetic shifter operand:
+//  {7-6} - shift type: 00 = lsl, 01 = lsr, 10 = asr, 11 = ror
+//  {5-0} - imm6
+class logical_shift<int width> : Operand<i32> {
+  let PrintMethod = "printShifter";
+  let ParserMatchClass = !cast<AsmOperandClass>(
+                         "LogicalShifterOperand" # width);
+}
+
+def logical_shift32 : logical_shift<32>;
+def logical_shift64 : logical_shift<64>;
+
+class logical_shifted_reg<ValueType Ty, RegisterClass regclass, Operand shiftop>
+    : Operand<Ty>,
+      ComplexPattern<Ty, 2, "SelectLogicalShiftedRegister", []> {
+  let PrintMethod = "printShiftedRegister";
+  let MIOperandInfo = (ops regclass, shiftop);
+}
 
+def logical_shifted_reg32 : logical_shifted_reg<i32, GPR32, logical_shift32>;
+def logical_shifted_reg64 : logical_shifted_reg<i64, GPR64, logical_shift64>;
+
+// A logical vector shifter operand:
+//  {7-6} - shift type: 00 = lsl
+//  {5-0} - imm6: #0, #8, #16, or #24
+def logical_vec_shift : Operand<i32> {
+  let PrintMethod = "printShifter";
+  let EncoderMethod = "getVecShifterOpValue";
+  let ParserMatchClass = LogicalVecShifterOperand;
+}
+
+// A logical vector half-word shifter operand:
+//  {7-6} - shift type: 00 = lsl
+//  {5-0} - imm6: #0 or #8
+def logical_vec_hw_shift : Operand<i32> {
+  let PrintMethod = "printShifter";
+  let EncoderMethod = "getVecShifterOpValue";
+  let ParserMatchClass = LogicalVecHalfWordShifterOperand;
+}
+
+// A vector move shifter operand:
+//  {0} - imm1: #8 or #16
+def move_vec_shift : Operand<i32> {
+  let PrintMethod = "printShifter";
+  let EncoderMethod = "getMoveVecShifterOpValue";
+  let ParserMatchClass = MoveVecShifterOperand;
+}
+
+def AddSubImmOperand : AsmOperandClass {
+  let Name = "AddSubImm";
+  let ParserMethod = "tryParseAddSubImm";
+  let DiagnosticType = "AddSubSecondSource";
+}
+// An ADD/SUB immediate shifter operand:
+//  second operand:
+//  {7-6} - shift type: 00 = lsl
+//  {5-0} - imm6: #0 or #12
+class addsub_shifted_imm<ValueType Ty>
+    : Operand<Ty>, ComplexPattern<Ty, 2, "SelectArithImmed", [imm]> {
+  let PrintMethod = "printAddSubImm";
+  let EncoderMethod = "getAddSubImmOpValue";
+  let ParserMatchClass = AddSubImmOperand;
+  let MIOperandInfo = (ops i32imm, i32imm);
+}
+
+def addsub_shifted_imm32 : addsub_shifted_imm<i32>;
+def addsub_shifted_imm64 : addsub_shifted_imm<i64>;
+
+class neg_addsub_shifted_imm<ValueType Ty>
+    : Operand<Ty>, ComplexPattern<Ty, 2, "SelectNegArithImmed", [imm]> {
+  let PrintMethod = "printAddSubImm";
+  let EncoderMethod = "getAddSubImmOpValue";
+  let ParserMatchClass = AddSubImmOperand;
+  let MIOperandInfo = (ops i32imm, i32imm);
+}
+
+def neg_addsub_shifted_imm32 : neg_addsub_shifted_imm<i32>;
+def neg_addsub_shifted_imm64 : neg_addsub_shifted_imm<i64>;
+
+// An extend operand:
+//  {5-3} - extend type
+//  {2-0} - imm3
+def arith_extend : Operand<i32> {
+  let PrintMethod = "printArithExtend";
+  let ParserMatchClass = ExtendOperand;
+}
+def arith_extend64 : Operand<i32> {
+  let PrintMethod = "printArithExtend";
+  let ParserMatchClass = ExtendOperand64;
+}
+
+// 'extend' that's a lsl of a 64-bit register.
+def arith_extendlsl64 : Operand<i32> {
+  let PrintMethod = "printArithExtend";
+  let ParserMatchClass = ExtendOperandLSL64;
+}
+
+class arith_extended_reg32<ValueType Ty> : Operand<Ty>,
+                    ComplexPattern<Ty, 2, "SelectArithExtendedRegister", []> {
+  let PrintMethod = "printExtendedRegister";
+  let MIOperandInfo = (ops GPR32, arith_extend);
+}
+
+class arith_extended_reg32to64<ValueType Ty> : Operand<Ty>,
+                    ComplexPattern<Ty, 2, "SelectArithExtendedRegister", []> {
+  let PrintMethod = "printExtendedRegister";
+  let MIOperandInfo = (ops GPR32, arith_extend64);
+}
+
+// Floating-point immediate.
+def fpimm32 : Operand<f32>,
+              PatLeaf<(f32 fpimm), [{
+      return AArch64_AM::getFP32Imm(N->getValueAPF()) != -1;
+    }], SDNodeXForm<fpimm, [{
+      APFloat InVal = N->getValueAPF();
+      uint32_t enc = AArch64_AM::getFP32Imm(InVal);
+      return CurDAG->getTargetConstant(enc, MVT::i32);
+    }]>> {
+  let ParserMatchClass = FPImmOperand;
+  let PrintMethod = "printFPImmOperand";
+}
+def fpimm64 : Operand<f64>,
+              PatLeaf<(f64 fpimm), [{
+      return AArch64_AM::getFP64Imm(N->getValueAPF()) != -1;
+    }], SDNodeXForm<fpimm, [{
+      APFloat InVal = N->getValueAPF();
+      uint32_t enc = AArch64_AM::getFP64Imm(InVal);
+      return CurDAG->getTargetConstant(enc, MVT::i32);
+    }]>> {
+  let ParserMatchClass = FPImmOperand;
+  let PrintMethod = "printFPImmOperand";
+}
+
+def fpimm8 : Operand<i32> {
+  let ParserMatchClass = FPImmOperand;
+  let PrintMethod = "printFPImmOperand";
+}
+
+def fpimm0 : PatLeaf<(fpimm), [{
+  return N->isExactlyValue(+0.0);
+}]>;
+
+// Vector lane operands
+class AsmVectorIndex<string Suffix> : AsmOperandClass {
+  let Name = "VectorIndex" # Suffix;
+  let DiagnosticType = "InvalidIndex" # Suffix;
+}
+def VectorIndex1Operand : AsmVectorIndex<"1">;
+def VectorIndexBOperand : AsmVectorIndex<"B">;
+def VectorIndexHOperand : AsmVectorIndex<"H">;
+def VectorIndexSOperand : AsmVectorIndex<"S">;
+def VectorIndexDOperand : AsmVectorIndex<"D">;
+
+def VectorIndex1 : Operand<i64>, ImmLeaf<i64, [{
+  return ((uint64_t)Imm) == 1;
+}]> {
+  let ParserMatchClass = VectorIndex1Operand;
+  let PrintMethod = "printVectorIndex";
+  let MIOperandInfo = (ops i64imm);
+}
+def VectorIndexB : Operand<i64>, ImmLeaf<i64, [{
+  return ((uint64_t)Imm) < 16;
+}]> {
+  let ParserMatchClass = VectorIndexBOperand;
+  let PrintMethod = "printVectorIndex";
+  let MIOperandInfo = (ops i64imm);
+}
+def VectorIndexH : Operand<i64>, ImmLeaf<i64, [{
+  return ((uint64_t)Imm) < 8;
+}]> {
+  let ParserMatchClass = VectorIndexHOperand;
+  let PrintMethod = "printVectorIndex";
+  let MIOperandInfo = (ops i64imm);
+}
+def VectorIndexS : Operand<i64>, ImmLeaf<i64, [{
+  return ((uint64_t)Imm) < 4;
+}]> {
+  let ParserMatchClass = VectorIndexSOperand;
+  let PrintMethod = "printVectorIndex";
+  let MIOperandInfo = (ops i64imm);
+}
+def VectorIndexD : Operand<i64>, ImmLeaf<i64, [{
+  return ((uint64_t)Imm) < 2;
+}]> {
+  let ParserMatchClass = VectorIndexDOperand;
+  let PrintMethod = "printVectorIndex";
+  let MIOperandInfo = (ops i64imm);
+}
+
+// 8-bit immediate for AdvSIMD where 64-bit values of the form:
+// aaaaaaaa bbbbbbbb cccccccc dddddddd eeeeeeee ffffffff gggggggg hhhhhhhh
+// are encoded as the eight bit value 'abcdefgh'.
+def simdimmtype10 : Operand<i32>,
+                    PatLeaf<(f64 fpimm), [{
+      return AArch64_AM::isAdvSIMDModImmType10(N->getValueAPF()
+                                               .bitcastToAPInt()
+                                               .getZExtValue());
+    }], SDNodeXForm<fpimm, [{
+      APFloat InVal = N->getValueAPF();
+      uint32_t enc = AArch64_AM::encodeAdvSIMDModImmType10(N->getValueAPF()
+                                                           .bitcastToAPInt()
+                                                           .getZExtValue());
+      return CurDAG->getTargetConstant(enc, MVT::i32);
+    }]>> {
+  let ParserMatchClass = SIMDImmType10Operand;
+  let PrintMethod = "printSIMDType10Operand";
+}
+
+
+//---
+// System management
+//---
+
+// Base encoding for system instruction operands.
+let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in
+class BaseSystemI<bit L, dag oops, dag iops, string asm, string operands>
+    : I<oops, iops, asm, operands, "", []> {
+  let Inst{31-22} = 0b1101010100;
+  let Inst{21}    = L;
+}
+
+// System instructions which do not have an Rt register.
+class SimpleSystemI<bit L, dag iops, string asm, string operands>
+    : BaseSystemI<L, (outs), iops, asm, operands> {
+  let Inst{4-0} = 0b11111;
+}
+
+// System instructions which have an Rt register.
+class RtSystemI<bit L, dag oops, dag iops, string asm, string operands>
+    : BaseSystemI<L, oops, iops, asm, operands>,
+      Sched<[WriteSys]> {
+  bits<5> Rt;
   let Inst{4-0} = Rt;
 }
 
+// Hint instructions that take both a CRm and a 3-bit immediate.
+class HintI<string mnemonic>
+    : SimpleSystemI<0, (ins imm0_127:$imm), mnemonic#" $imm", "">,
+      Sched<[WriteHint]> {
+  bits <7> imm;
+  let Inst{20-12} = 0b000110010;
+  let Inst{11-5} = imm;
+}
+
+// System instructions taking a single literal operand which encodes into
+// CRm. op2 differentiates the opcodes.
+def BarrierAsmOperand : AsmOperandClass {
+  let Name = "Barrier";
+  let ParserMethod = "tryParseBarrierOperand";
+}
+def barrier_op : Operand<i32> {
+  let PrintMethod = "printBarrierOption";
+  let ParserMatchClass = BarrierAsmOperand;
+}
+class CRmSystemI<Operand crmtype, bits<3> opc, string asm>
+    : SimpleSystemI<0, (ins crmtype:$CRm), asm, "\t$CRm">,
+      Sched<[WriteBarrier]> {
+  bits<4> CRm;
+  let Inst{20-12} = 0b000110011;
+  let Inst{11-8} = CRm;
+  let Inst{7-5} = opc;
+}
+
+// MRS/MSR system instructions. These have different operand classes because
+// a different subset of registers can be accessed through each instruction.
+def MRSSystemRegisterOperand : AsmOperandClass {
+  let Name = "MRSSystemRegister";
+  let ParserMethod = "tryParseSysReg";
+  let DiagnosticType = "MRS";
+}
+// concatenation of 1, op0, op1, CRn, CRm, op2. 16-bit immediate.
+def mrs_sysreg_op : Operand<i32> {
+  let ParserMatchClass = MRSSystemRegisterOperand;
+  let DecoderMethod = "DecodeMRSSystemRegister";
+  let PrintMethod = "printMRSSystemRegister";
+}
 
-class A64InstRdn<dag outs, dag ins, string asmstr,
-                 list<dag> patterns, InstrItinClass itin>
-    : A64InstRd<outs, ins, asmstr, patterns, itin> {
-  // Inherit rdt
-  bits<5> Rn;
+def MSRSystemRegisterOperand : AsmOperandClass {
+  let Name = "MSRSystemRegister";
+  let ParserMethod = "tryParseSysReg";
+  let DiagnosticType = "MSR";
+}
+def msr_sysreg_op : Operand<i32> {
+  let ParserMatchClass = MSRSystemRegisterOperand;
+  let DecoderMethod = "DecodeMSRSystemRegister";
+  let PrintMethod = "printMSRSystemRegister";
+}
+
+class MRSI : RtSystemI<1, (outs GPR64:$Rt), (ins mrs_sysreg_op:$systemreg),
+                       "mrs", "\t$Rt, $systemreg"> {
+  bits<15> systemreg;
+  let Inst{20} = 1;
+  let Inst{19-5} = systemreg;
+}
+
+// FIXME: Some of these def NZCV, others don't. Best way to model that?
+// Explicitly modeling each of the system register as a register class
+// would do it, but feels like overkill at this point.
+class MSRI : RtSystemI<0, (outs), (ins msr_sysreg_op:$systemreg, GPR64:$Rt),
+                       "msr", "\t$systemreg, $Rt"> {
+  bits<15> systemreg;
+  let Inst{20} = 1;
+  let Inst{19-5} = systemreg;
+}
+
+def SystemPStateFieldOperand : AsmOperandClass {
+  let Name = "SystemPStateField";
+  let ParserMethod = "tryParseSysReg";
+}
+def pstatefield_op : Operand<i32> {
+  let ParserMatchClass = SystemPStateFieldOperand;
+  let PrintMethod = "printSystemPStateField";
+}
+
+let Defs = [NZCV] in
+class MSRpstateI
+  : SimpleSystemI<0, (ins pstatefield_op:$pstate_field, imm0_15:$imm),
+                  "msr", "\t$pstate_field, $imm">,
+    Sched<[WriteSys]> {
+  bits<6> pstatefield;
+  bits<4> imm;
+  let Inst{20-19} = 0b00;
+  let Inst{18-16} = pstatefield{5-3};
+  let Inst{15-12} = 0b0100;
+  let Inst{11-8} = imm;
+  let Inst{7-5} = pstatefield{2-0};
+
+  let DecoderMethod = "DecodeSystemPStateInstruction";
+}
+
+// SYS and SYSL generic system instructions.
+def SysCRAsmOperand : AsmOperandClass {
+  let Name = "SysCR";
+  let ParserMethod = "tryParseSysCROperand";
+}
+
+def sys_cr_op : Operand<i32> {
+  let PrintMethod = "printSysCROperand";
+  let ParserMatchClass = SysCRAsmOperand;
+}
+
+class SystemXtI<bit L, string asm>
+  : RtSystemI<L, (outs),
+       (ins imm0_7:$op1, sys_cr_op:$Cn, sys_cr_op:$Cm, imm0_7:$op2, GPR64:$Rt),
+       asm, "\t$op1, $Cn, $Cm, $op2, $Rt"> {
+  bits<3> op1;
+  bits<4> Cn;
+  bits<4> Cm;
+  bits<3> op2;
+  let Inst{20-19} = 0b01;
+  let Inst{18-16} = op1;
+  let Inst{15-12} = Cn;
+  let Inst{11-8}  = Cm;
+  let Inst{7-5}   = op2;
+}
+
+class SystemLXtI<bit L, string asm>
+  : RtSystemI<L, (outs),
+       (ins GPR64:$Rt, imm0_7:$op1, sys_cr_op:$Cn, sys_cr_op:$Cm, imm0_7:$op2),
+       asm, "\t$Rt, $op1, $Cn, $Cm, $op2"> {
+  bits<3> op1;
+  bits<4> Cn;
+  bits<4> Cm;
+  bits<3> op2;
+  let Inst{20-19} = 0b01;
+  let Inst{18-16} = op1;
+  let Inst{15-12} = Cn;
+  let Inst{11-8}  = Cm;
+  let Inst{7-5}   = op2;
+}
+
+
+// Branch (register) instructions:
+//
+//  case opc of
+//    0001 blr
+//    0000 br
+//    0101 dret
+//    0100 eret
+//    0010 ret
+//    otherwise UNDEFINED
+class BaseBranchReg<bits<4> opc, dag oops, dag iops, string asm,
+                    string operands, list<dag> pattern>
+    : I<oops, iops, asm, operands, "", pattern>, Sched<[WriteBrReg]> {
+  let Inst{31-25} = 0b1101011;
+  let Inst{24-21} = opc;
+  let Inst{20-16} = 0b11111;
+  let Inst{15-10} = 0b000000;
+  let Inst{4-0}   = 0b00000;
+}
 
+class BranchReg<bits<4> opc, string asm, list<dag> pattern>
+    : BaseBranchReg<opc, (outs), (ins GPR64:$Rn), asm, "\t$Rn", pattern> {
+  bits<5> Rn;
   let Inst{9-5} = Rn;
 }
 
-class A64InstRtn<dag outs, dag ins, string asmstr,
-                list<dag> patterns, InstrItinClass itin>
-    : A64InstRt<outs, ins, asmstr, patterns, itin> {
-  // Inherit rdt
+let mayLoad = 0, mayStore = 0, hasSideEffects = 1, isReturn = 1 in
+class SpecialReturn<bits<4> opc, string asm>
+    : BaseBranchReg<opc, (outs), (ins), asm, "", []> {
+  let Inst{9-5} = 0b11111;
+}
+
+//---
+// Conditional branch instruction.
+//---
+
+// Condition code.
+// 4-bit immediate. Pretty-printed as <cc>
+def ccode : Operand<i32> {
+  let PrintMethod = "printCondCode";
+  let ParserMatchClass = CondCode;
+}
+def inv_ccode : Operand<i32> {
+  let PrintMethod = "printInverseCondCode";
+  let ParserMatchClass = CondCode;
+}
+
+// Conditional branch target. 19-bit immediate. The low two bits of the target
+// offset are implied zero and so are not part of the immediate.
+def PCRelLabel19Operand : AsmOperandClass {
+  let Name = "PCRelLabel19";
+  let DiagnosticType = "InvalidLabel";
+}
+def am_brcond : Operand<OtherVT> {
+  let EncoderMethod = "getCondBranchTargetOpValue";
+  let DecoderMethod = "DecodePCRelLabel19";
+  let PrintMethod = "printAlignedLabel";
+  let ParserMatchClass = PCRelLabel19Operand;
+}
+
+class BranchCond : I<(outs), (ins ccode:$cond, am_brcond:$target),
+                     "b", ".$cond\t$target", "",
+                     [(AArch64brcond bb:$target, imm:$cond, NZCV)]>,
+                   Sched<[WriteBr]> {
+  let isBranch = 1;
+  let isTerminator = 1;
+  let Uses = [NZCV];
+
+  bits<4> cond;
+  bits<19> target;
+  let Inst{31-24} = 0b01010100;
+  let Inst{23-5} = target;
+  let Inst{4} = 0;
+  let Inst{3-0} = cond;
+}
+
+//---
+// Compare-and-branch instructions.
+//---
+class BaseCmpBranch<RegisterClass regtype, bit op, string asm, SDNode node>
+    : I<(outs), (ins regtype:$Rt, am_brcond:$target),
+         asm, "\t$Rt, $target", "",
+         [(node regtype:$Rt, bb:$target)]>,
+      Sched<[WriteBr]> {
+  let isBranch = 1;
+  let isTerminator = 1;
+
+  bits<5> Rt;
+  bits<19> target;
+  let Inst{30-25} = 0b011010;
+  let Inst{24}    = op;
+  let Inst{23-5}  = target;
+  let Inst{4-0}   = Rt;
+}
+
+multiclass CmpBranch<bit op, string asm, SDNode node> {
+  def W : BaseCmpBranch<GPR32, op, asm, node> {
+    let Inst{31} = 0;
+  }
+  def X : BaseCmpBranch<GPR64, op, asm, node> {
+    let Inst{31} = 1;
+  }
+}
+
+//---
+// Test-bit-and-branch instructions.
+//---
+// Test-and-branch target. 14-bit sign-extended immediate. The low two bits of
+// the target offset are implied zero and so are not part of the immediate.
+def BranchTarget14Operand : AsmOperandClass {
+  let Name = "BranchTarget14";
+}
+def am_tbrcond : Operand<OtherVT> {
+  let EncoderMethod = "getTestBranchTargetOpValue";
+  let PrintMethod = "printAlignedLabel";
+  let ParserMatchClass = BranchTarget14Operand;
+}
+
+// AsmOperand classes to emit (or not) special diagnostics
+def TBZImm0_31Operand : AsmOperandClass {
+  let Name = "TBZImm0_31";
+  let PredicateMethod = "isImm0_31";
+  let RenderMethod = "addImm0_31Operands";
+}
+def TBZImm32_63Operand : AsmOperandClass {
+  let Name = "Imm32_63";
+  let DiagnosticType = "InvalidImm0_63";
+}
+
+class tbz_imm0_31<AsmOperandClass matcher> : Operand<i64>, ImmLeaf<i64, [{
+  return (((uint32_t)Imm) < 32);
+}]> {
+  let ParserMatchClass = matcher;
+}
+
+def tbz_imm0_31_diag : tbz_imm0_31<Imm0_31Operand>;
+def tbz_imm0_31_nodiag : tbz_imm0_31<TBZImm0_31Operand>;
+
+def tbz_imm32_63 : Operand<i64>, ImmLeaf<i64, [{
+  return (((uint32_t)Imm) > 31) && (((uint32_t)Imm) < 64);
+}]> {
+  let ParserMatchClass = TBZImm32_63Operand;
+}
+
+class BaseTestBranch<RegisterClass regtype, Operand immtype,
+                     bit op, string asm, SDNode node>
+    : I<(outs), (ins regtype:$Rt, immtype:$bit_off, am_tbrcond:$target),
+       asm, "\t$Rt, $bit_off, $target", "",
+       [(node regtype:$Rt, immtype:$bit_off, bb:$target)]>,
+      Sched<[WriteBr]> {
+  let isBranch = 1;
+  let isTerminator = 1;
+
+  bits<5> Rt;
+  bits<6> bit_off;
+  bits<14> target;
+
+  let Inst{30-25} = 0b011011;
+  let Inst{24}    = op;
+  let Inst{23-19} = bit_off{4-0};
+  let Inst{18-5}  = target;
+  let Inst{4-0}   = Rt;
+
+  let DecoderMethod = "DecodeTestAndBranch";
+}
+
+multiclass TestBranch<bit op, string asm, SDNode node> {
+  def W : BaseTestBranch<GPR32, tbz_imm0_31_diag, op, asm, node> {
+    let Inst{31} = 0;
+  }
+
+  def X : BaseTestBranch<GPR64, tbz_imm32_63, op, asm, node> {
+    let Inst{31} = 1;
+  }
+
+  // Alias X-reg with 0-31 imm to W-Reg.
+  def : InstAlias<asm # "\t$Rd, $imm, $target",
+                  (!cast<Instruction>(NAME#"W") GPR32as64:$Rd,
+                  tbz_imm0_31_nodiag:$imm, am_tbrcond:$target), 0>;
+  def : Pat<(node GPR64:$Rn, tbz_imm0_31_diag:$imm, bb:$target),
+            (!cast<Instruction>(NAME#"W") (EXTRACT_SUBREG GPR64:$Rn, sub_32),
+            tbz_imm0_31_diag:$imm, bb:$target)>;
+}
+
+//---
+// Unconditional branch (immediate) instructions.
+//---
+def BranchTarget26Operand : AsmOperandClass {
+  let Name = "BranchTarget26";
+  let DiagnosticType = "InvalidLabel";
+}
+def am_b_target : Operand<OtherVT> {
+  let EncoderMethod = "getBranchTargetOpValue";
+  let PrintMethod = "printAlignedLabel";
+  let ParserMatchClass = BranchTarget26Operand;
+}
+def am_bl_target : Operand<i64> {
+  let EncoderMethod = "getBranchTargetOpValue";
+  let PrintMethod = "printAlignedLabel";
+  let ParserMatchClass = BranchTarget26Operand;
+}
+
+class BImm<bit op, dag iops, string asm, list<dag> pattern>
+    : I<(outs), iops, asm, "\t$addr", "", pattern>, Sched<[WriteBr]> {
+  bits<26> addr;
+  let Inst{31}    = op;
+  let Inst{30-26} = 0b00101;
+  let Inst{25-0}  = addr;
+
+  let DecoderMethod = "DecodeUnconditionalBranch";
+}
+
+class BranchImm<bit op, string asm, list<dag> pattern>
+    : BImm<op, (ins am_b_target:$addr), asm, pattern>;
+class CallImm<bit op, string asm, list<dag> pattern>
+    : BImm<op, (ins am_bl_target:$addr), asm, pattern>;
+
+//---
+// Basic one-operand data processing instructions.
+//---
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseOneOperandData<bits<3> opc, RegisterClass regtype, string asm,
+                         SDPatternOperator node>
+  : I<(outs regtype:$Rd), (ins regtype:$Rn), asm, "\t$Rd, $Rn", "",
+      [(set regtype:$Rd, (node regtype:$Rn))]>,
+    Sched<[WriteI, ReadI]> {
+  bits<5> Rd;
   bits<5> Rn;
 
-  let Inst{9-5} = Rn;
+  let Inst{30-13} = 0b101101011000000000;
+  let Inst{12-10} = opc;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
 }
 
-// Instructions taking Rt,Rt2,Rn
-class A64InstRtt2n<dag outs, dag ins, string asmstr,
-                   list<dag> patterns, InstrItinClass itin>
-  : A64InstRtn<outs, ins, asmstr, patterns, itin> {
-  bits<5> Rt2;
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+multiclass OneOperandData<bits<3> opc, string asm,
+                          SDPatternOperator node = null_frag> {
+  def Wr : BaseOneOperandData<opc, GPR32, asm, node> {
+    let Inst{31} = 0;
+  }
 
-  let Inst{14-10} = Rt2;
+  def Xr : BaseOneOperandData<opc, GPR64, asm, node> {
+    let Inst{31} = 1;
+  }
 }
 
-class A64InstRdnm<dag outs, dag ins, string asmstr,
-                  list<dag> patterns, InstrItinClass itin>
-  : A64InstRdn<outs, ins, asmstr, patterns, itin> {
-  bits<5> Rm;
+class OneWRegData<bits<3> opc, string asm, SDPatternOperator node>
+    : BaseOneOperandData<opc, GPR32, asm, node> {
+  let Inst{31} = 0;
+}
 
-  let Inst{20-16} = Rm;
+class OneXRegData<bits<3> opc, string asm, SDPatternOperator node>
+    : BaseOneOperandData<opc, GPR64, asm, node> {
+  let Inst{31} = 1;
 }
 
-class A64InstRtnm<dag outs, dag ins, string asmstr,
-                  list<dag> patterns, InstrItinClass itin>
-  : A64InstRtn<outs, ins, asmstr, patterns, itin> {
+//---
+// Basic two-operand data processing instructions.
+//---
+class BaseBaseAddSubCarry<bit isSub, RegisterClass regtype, string asm,
+                          list<dag> pattern>
+    : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm),
+        asm, "\t$Rd, $Rn, $Rm", "", pattern>,
+      Sched<[WriteI, ReadI, ReadI]> {
+  let Uses = [NZCV];
+  bits<5> Rd;
+  bits<5> Rn;
   bits<5> Rm;
-
+  let Inst{30}    = isSub;
+  let Inst{28-21} = 0b11010000;
   let Inst{20-16} = Rm;
+  let Inst{15-10} = 0;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
 }
 
-//===----------------------------------------------------------------------===//
-//
-// Actual A64 Instruction Formats
-//
+class BaseAddSubCarry<bit isSub, RegisterClass regtype, string asm,
+                      SDNode OpNode>
+    : BaseBaseAddSubCarry<isSub, regtype, asm,
+        [(set regtype:$Rd, (OpNode regtype:$Rn, regtype:$Rm, NZCV))]>;
+
+class BaseAddSubCarrySetFlags<bit isSub, RegisterClass regtype, string asm,
+                              SDNode OpNode>
+    : BaseBaseAddSubCarry<isSub, regtype, asm,
+        [(set regtype:$Rd, (OpNode regtype:$Rn, regtype:$Rm, NZCV)),
+         (implicit NZCV)]> {
+  let Defs = [NZCV];
+}
 
-// Format for Add-subtract (extended register) instructions.
-class A64I_addsubext<bit sf, bit op, bit S, bits<2> opt, bits<3> option,
-                     dag outs, dag ins, string asmstr, list<dag> patterns,
-                     InstrItinClass itin>
-    : A64InstRdnm<outs, ins, asmstr, patterns, itin> {
-    bits<3> Imm3;
-
-    let Inst{31} = sf;
-    let Inst{30} = op;
-    let Inst{29} = S;
-    let Inst{28-24} = 0b01011;
-    let Inst{23-22} = opt;
-    let Inst{21} = 0b1;
-    // Rm inherited in 20-16
-    let Inst{15-13} = option;
-    let Inst{12-10} = Imm3;
-    // Rn inherited in 9-5
-    // Rd inherited in 4-0
-}
-
-// Format for Add-subtract (immediate) instructions.
-class A64I_addsubimm<bit sf, bit op, bit S, bits<2> shift,
-                     dag outs, dag ins, string asmstr,
-                     list<dag> patterns, InstrItinClass itin>
-  : A64InstRdn<outs, ins, asmstr, patterns, itin> {
-  bits<12> Imm12;
+multiclass AddSubCarry<bit isSub, string asm, string asm_setflags,
+                       SDNode OpNode, SDNode OpNode_setflags> {
+  def Wr : BaseAddSubCarry<isSub, GPR32, asm, OpNode> {
+    let Inst{31} = 0;
+    let Inst{29} = 0;
+  }
+  def Xr : BaseAddSubCarry<isSub, GPR64, asm, OpNode> {
+    let Inst{31} = 1;
+    let Inst{29} = 0;
+  }
+
+  // Sets flags.
+  def SWr : BaseAddSubCarrySetFlags<isSub, GPR32, asm_setflags,
+                                    OpNode_setflags> {
+    let Inst{31} = 0;
+    let Inst{29} = 1;
+  }
+  def SXr : BaseAddSubCarrySetFlags<isSub, GPR64, asm_setflags,
+                                    OpNode_setflags> {
+    let Inst{31} = 1;
+    let Inst{29} = 1;
+  }
+}
 
-  let Inst{31} = sf;
-  let Inst{30} = op;
-  let Inst{29} = S;
-  let Inst{28-24} = 0b10001;
-  let Inst{23-22} = shift;
-  let Inst{21-10} = Imm12;
-}
-
-// Format for Add-subtract (shifted register) instructions.
-class A64I_addsubshift<bit sf, bit op, bit S, bits<2> shift,
-                       dag outs, dag ins, string asmstr, list<dag> patterns,
-                       InstrItinClass itin>
-    : A64InstRdnm<outs, ins, asmstr, patterns, itin> {
-    bits<6> Imm6;
-
-    let Inst{31} = sf;
-    let Inst{30} = op;
-    let Inst{29} = S;
-    let Inst{28-24} = 0b01011;
-    let Inst{23-22} = shift;
-    let Inst{21} = 0b0;
-    // Rm inherited in 20-16
-    let Inst{15-10} = Imm6;
-    // Rn inherited in 9-5
-    // Rd inherited in 4-0
-}
-
-// Format for Add-subtract (with carry) instructions.
-class A64I_addsubcarry<bit sf, bit op, bit S, bits<6> opcode2,
-                       dag outs, dag ins, string asmstr, list<dag> patterns,
-                       InstrItinClass itin>
-    : A64InstRdnm<outs, ins, asmstr, patterns, itin> {
-    let Inst{31} = sf;
-    let Inst{30} = op;
-    let Inst{29} = S;
-    let Inst{28-21} = 0b11010000;
-    // Rm inherited in 20-16
-    let Inst{15-10} = opcode2;
-    // Rn inherited in 9-5
-    // Rd inherited in 4-0
-}
-
-
-// Format for Bitfield instructions
-class A64I_bitfield<bit sf, bits<2> opc, bit n,
-                    dag outs, dag ins, string asmstr,
-                    list<dag> patterns, InstrItinClass itin>
-  : A64InstRdn<outs, ins, asmstr, patterns, itin> {
-  bits<6> ImmR;
-  bits<6> ImmS;
+class BaseTwoOperand<bits<4> opc, RegisterClass regtype, string asm,
+                     SDPatternOperator OpNode>
+  : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm),
+      asm, "\t$Rd, $Rn, $Rm", "",
+      [(set regtype:$Rd, (OpNode regtype:$Rn, regtype:$Rm))]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  bits<5> Rm;
+  let Inst{30-21} = 0b0011010110;
+  let Inst{20-16} = Rm;
+  let Inst{15-14} = 0b00;
+  let Inst{13-10} = opc;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
 
-  let Inst{31} = sf;
-  let Inst{30-29} = opc;
-  let Inst{28-23} = 0b100110;
-  let Inst{22} = n;
-  let Inst{21-16} = ImmR;
-  let Inst{15-10} = ImmS;
-  // Inherit Rn in 9-5
-  // Inherit Rd in 4-0
+class BaseDiv<bit isSigned, RegisterClass regtype, string asm,
+              SDPatternOperator OpNode>
+    : BaseTwoOperand<{0,0,1,?}, regtype, asm, OpNode> {
+  let Inst{10}    = isSigned;
 }
 
-// Format for compare and branch (immediate) instructions.
-class A64I_cmpbr<bit sf, bit op,
-                  dag outs, dag ins, string asmstr,
-                  list<dag> patterns, InstrItinClass itin>
-  : A64InstRt<outs, ins, asmstr, patterns, itin> {
-  bits<19> Label;
+multiclass Div<bit isSigned, string asm, SDPatternOperator OpNode> {
+  def Wr : BaseDiv<isSigned, GPR32, asm, OpNode>,
+           Sched<[WriteID32, ReadID, ReadID]> {
+    let Inst{31} = 0;
+  }
+  def Xr : BaseDiv<isSigned, GPR64, asm, OpNode>,
+           Sched<[WriteID64, ReadID, ReadID]> {
+    let Inst{31} = 1;
+  }
+}
 
-  let Inst{31} = sf;
-  let Inst{30-25} = 0b011010;
-  let Inst{24} = op;
-  let Inst{23-5} = Label;
-  // Inherit Rt in 4-0
+class BaseShift<bits<2> shift_type, RegisterClass regtype, string asm,
+                SDPatternOperator OpNode = null_frag>
+  : BaseTwoOperand<{1,0,?,?}, regtype, asm, OpNode>,
+    Sched<[WriteIS, ReadI]> {
+  let Inst{11-10} = shift_type;
 }
 
-// Format for conditional branch (immediate) instructions.
-class A64I_condbr<bit o1, bit o0,
-                  dag outs, dag ins, string asmstr,
-                  list<dag> patterns, InstrItinClass itin>
-  : A64Inst<outs, ins, asmstr, patterns, itin> {
-  bits<19> Label;
-  bits<4> Cond;
+multiclass Shift<bits<2> shift_type, string asm, SDNode OpNode> {
+  def Wr : BaseShift<shift_type, GPR32, asm> {
+    let Inst{31} = 0;
+  }
+
+  def Xr : BaseShift<shift_type, GPR64, asm, OpNode> {
+    let Inst{31} = 1;
+  }
+
+  def : Pat<(i32 (OpNode GPR32:$Rn, i64:$Rm)),
+            (!cast<Instruction>(NAME # "Wr") GPR32:$Rn,
+                                             (EXTRACT_SUBREG i64:$Rm, sub_32))>;
 
-  let Inst{31-25} = 0b0101010;
-  let Inst{24} = o1;
-  let Inst{23-5} = Label;
-  let Inst{4} = o0;
-  let Inst{3-0} = Cond;
+  def : Pat<(i32 (OpNode GPR32:$Rn, (i64 (zext GPR32:$Rm)))),
+            (!cast<Instruction>(NAME # "Wr") GPR32:$Rn, GPR32:$Rm)>;
+
+  def : Pat<(i32 (OpNode GPR32:$Rn, (i64 (anyext GPR32:$Rm)))),
+            (!cast<Instruction>(NAME # "Wr") GPR32:$Rn, GPR32:$Rm)>;
+
+  def : Pat<(i32 (OpNode GPR32:$Rn, (i64 (sext GPR32:$Rm)))),
+            (!cast<Instruction>(NAME # "Wr") GPR32:$Rn, GPR32:$Rm)>;
 }
 
-// Format for conditional compare (immediate) instructions.
-class A64I_condcmpimm<bit sf, bit op, bit o2, bit o3, bit s,
-                      dag outs, dag ins, string asmstr,
-                      list<dag> patterns, InstrItinClass itin>
-  : A64Inst<outs, ins, asmstr, patterns, itin> {
+class ShiftAlias<string asm, Instruction inst, RegisterClass regtype>
+    : InstAlias<asm#" $dst, $src1, $src2",
+                (inst regtype:$dst, regtype:$src1, regtype:$src2), 0>;
+
+class BaseMulAccum<bit isSub, bits<3> opc, RegisterClass multype,
+                       RegisterClass addtype, string asm,
+                       list<dag> pattern>
+  : I<(outs addtype:$Rd), (ins multype:$Rn, multype:$Rm, addtype:$Ra),
+      asm, "\t$Rd, $Rn, $Rm, $Ra", "", pattern> {
+  bits<5> Rd;
   bits<5> Rn;
-  bits<5> UImm5;
-  bits<4> NZCVImm;
-  bits<4> Cond;
+  bits<5> Rm;
+  bits<5> Ra;
+  let Inst{30-24} = 0b0011011;
+  let Inst{23-21} = opc;
+  let Inst{20-16} = Rm;
+  let Inst{15}    = isSub;
+  let Inst{14-10} = Ra;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
 
-  let Inst{31} = sf;
-  let Inst{30} = op;
-  let Inst{29} = s;
-  let Inst{28-21} = 0b11010010;
-  let Inst{20-16} = UImm5;
-  let Inst{15-12} = Cond;
-  let Inst{11} = 0b1;
-  let Inst{10} = o2;
-  let Inst{9-5} = Rn;
-  let Inst{4} = o3;
-  let Inst{3-0} = NZCVImm;
+multiclass MulAccum<bit isSub, string asm, SDNode AccNode> {
+  def Wrrr : BaseMulAccum<isSub, 0b000, GPR32, GPR32, asm,
+      [(set GPR32:$Rd, (AccNode GPR32:$Ra, (mul GPR32:$Rn, GPR32:$Rm)))]>,
+      Sched<[WriteIM32, ReadIMA, ReadIM, ReadIM]> {
+    let Inst{31} = 0;
+  }
+
+  def Xrrr : BaseMulAccum<isSub, 0b000, GPR64, GPR64, asm,
+      [(set GPR64:$Rd, (AccNode GPR64:$Ra, (mul GPR64:$Rn, GPR64:$Rm)))]>,
+      Sched<[WriteIM64, ReadIMA, ReadIM, ReadIM]> {
+    let Inst{31} = 1;
+  }
+}
+
+class WideMulAccum<bit isSub, bits<3> opc, string asm,
+                   SDNode AccNode, SDNode ExtNode>
+  : BaseMulAccum<isSub, opc, GPR32, GPR64, asm,
+    [(set GPR64:$Rd, (AccNode GPR64:$Ra,
+                            (mul (ExtNode GPR32:$Rn), (ExtNode GPR32:$Rm))))]>,
+    Sched<[WriteIM32, ReadIMA, ReadIM, ReadIM]> {
+  let Inst{31} = 1;
 }
 
-// Format for conditional compare (register) instructions.
-class A64I_condcmpreg<bit sf, bit op, bit o2, bit o3, bit s,
-                      dag outs, dag ins, string asmstr,
-                      list<dag> patterns, InstrItinClass itin>
-  : A64Inst<outs, ins, asmstr, patterns, itin> {
+class MulHi<bits<3> opc, string asm, SDNode OpNode>
+  : I<(outs GPR64:$Rd), (ins GPR64:$Rn, GPR64:$Rm),
+      asm, "\t$Rd, $Rn, $Rm", "",
+      [(set GPR64:$Rd, (OpNode GPR64:$Rn, GPR64:$Rm))]>,
+    Sched<[WriteIM64, ReadIM, ReadIM]> {
+  bits<5> Rd;
   bits<5> Rn;
   bits<5> Rm;
-  bits<4> NZCVImm;
-  bits<4> Cond;
+  let Inst{31-24} = 0b10011011;
+  let Inst{23-21} = opc;
+  let Inst{20-16} = Rm;
+  let Inst{15}    = 0;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+
+  // The Ra field of SMULH and UMULH is unused: it should be assembled as 31
+  // (i.e. all bits 1) but is ignored by the processor.
+  let PostEncoderMethod = "fixMulHigh";
+}
 
+class MulAccumWAlias<string asm, Instruction inst>
+    : InstAlias<asm#" $dst, $src1, $src2",
+                (inst GPR32:$dst, GPR32:$src1, GPR32:$src2, WZR)>;
+class MulAccumXAlias<string asm, Instruction inst>
+    : InstAlias<asm#" $dst, $src1, $src2",
+                (inst GPR64:$dst, GPR64:$src1, GPR64:$src2, XZR)>;
+class WideMulAccumAlias<string asm, Instruction inst>
+    : InstAlias<asm#" $dst, $src1, $src2",
+                (inst GPR64:$dst, GPR32:$src1, GPR32:$src2, XZR)>;
+
+class BaseCRC32<bit sf, bits<2> sz, bit C, RegisterClass StreamReg,
+              SDPatternOperator OpNode, string asm>
+  : I<(outs GPR32:$Rd), (ins GPR32:$Rn, StreamReg:$Rm),
+      asm, "\t$Rd, $Rn, $Rm", "",
+      [(set GPR32:$Rd, (OpNode GPR32:$Rn, StreamReg:$Rm))]>,
+    Sched<[WriteISReg, ReadI, ReadISReg]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  bits<5> Rm;
 
   let Inst{31} = sf;
-  let Inst{30} = op;
-  let Inst{29} = s;
-  let Inst{28-21} = 0b11010010;
+  let Inst{30-21} = 0b0011010110;
   let Inst{20-16} = Rm;
-  let Inst{15-12} = Cond;
-  let Inst{11} = 0b0;
-  let Inst{10} = o2;
+  let Inst{15-13} = 0b010;
+  let Inst{12} = C;
+  let Inst{11-10} = sz;
   let Inst{9-5} = Rn;
-  let Inst{4} = o3;
-  let Inst{3-0} = NZCVImm;
+  let Inst{4-0} = Rd;
+  let Predicates = [HasCRC];
 }
 
-// Format for conditional select instructions.
-class A64I_condsel<bit sf, bit op, bit s, bits<2> op2,
-                   dag outs, dag ins, string asmstr,
-                   list<dag> patterns, InstrItinClass itin>
-  : A64InstRdnm<outs, ins, asmstr, patterns, itin> {
-  bits<4> Cond;
+//---
+// Address generation.
+//---
+
+class ADRI<bit page, string asm, Operand adr, list<dag> pattern>
+    : I<(outs GPR64:$Xd), (ins adr:$label), asm, "\t$Xd, $label", "",
+        pattern>,
+      Sched<[WriteI]> {
+  bits<5>  Xd;
+  bits<21> label;
+  let Inst{31}    = page;
+  let Inst{30-29} = label{1-0};
+  let Inst{28-24} = 0b10000;
+  let Inst{23-5}  = label{20-2};
+  let Inst{4-0}   = Xd;
 
-  let Inst{31} = sf;
-  let Inst{30} = op;
-  let Inst{29} = s;
-  let Inst{28-21} = 0b11010100;
-  // Inherit Rm in 20-16
-  let Inst{15-12} = Cond;
-  let Inst{11-10} = op2;
-  // Inherit Rn in 9-5
-  // Inherit Rd in 4-0
+  let DecoderMethod = "DecodeAdrInstruction";
 }
 
-// Format for data processing (1 source) instructions
-class A64I_dp_1src<bit sf, bit S, bits<5> opcode2, bits<6> opcode,
-                string asmstr, dag outs, dag ins,
-                list<dag> patterns, InstrItinClass itin>
-  : A64InstRdn<outs, ins, asmstr, patterns, itin> {
-  let Inst{31} = sf;
-  let Inst{30} = 0b1;
-  let Inst{29} = S;
-  let Inst{28-21} = 0b11010110;
-  let Inst{20-16} = opcode2;
-  let Inst{15-10} = opcode;
-}
-
-// Format for data processing (2 source) instructions
-class A64I_dp_2src<bit sf, bits<6> opcode, bit S,
-                string asmstr, dag outs, dag ins,
-                list<dag> patterns, InstrItinClass itin>
-  : A64InstRdnm<outs, ins, asmstr, patterns, itin> {
-  let Inst{31} = sf;
-  let Inst{30} = 0b0;
-  let Inst{29} = S;
-  let Inst{28-21} = 0b11010110;
-  let Inst{15-10} = opcode;
+//---
+// Move immediate.
+//---
+
+def movimm32_imm : Operand<i32> {
+  let ParserMatchClass = Imm0_65535Operand;
+  let EncoderMethod = "getMoveWideImmOpValue";
+  let PrintMethod = "printHexImm";
+}
+def movimm32_shift : Operand<i32> {
+  let PrintMethod = "printShifter";
+  let ParserMatchClass = MovImm32ShifterOperand;
+}
+def movimm64_shift : Operand<i32> {
+  let PrintMethod = "printShifter";
+  let ParserMatchClass = MovImm64ShifterOperand;
 }
 
-// Format for data-processing (3 source) instructions
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseMoveImmediate<bits<2> opc, RegisterClass regtype, Operand shifter,
+                        string asm>
+  : I<(outs regtype:$Rd), (ins movimm32_imm:$imm, shifter:$shift),
+       asm, "\t$Rd, $imm$shift", "", []>,
+    Sched<[WriteImm]> {
+  bits<5> Rd;
+  bits<16> imm;
+  bits<6> shift;
+  let Inst{30-29} = opc;
+  let Inst{28-23} = 0b100101;
+  let Inst{22-21} = shift{5-4};
+  let Inst{20-5}  = imm;
+  let Inst{4-0}   = Rd;
 
-class A64I_dp3<bit sf, bits<6> opcode,
-               dag outs, dag ins, string asmstr,
-               list<dag> patterns, InstrItinClass itin>
-  : A64InstRdnm<outs, ins, asmstr, patterns, itin> {
-  let Inst{31} = sf;
-  let Inst{30-29} = opcode{5-4};
-  let Inst{28-24} = 0b11011;
-  let Inst{23-21} = opcode{3-1};
-  // Inherits Rm in 20-16
-  let Inst{15} = opcode{0};
-  // {14-10} mostly Ra, but unspecified for SMULH/UMULH
-  // Inherits Rn in 9-5
-  // Inherits Rd in 4-0
-}
-
-// Format for exception generation instructions
-class A64I_exception<bits<3> opc, bits<3> op2, bits<2> ll,
-                     dag outs, dag ins, string asmstr,
-                     list<dag> patterns, InstrItinClass itin>
-  : A64Inst<outs, ins, asmstr, patterns, itin> {
-  bits<16> UImm16;
+  let DecoderMethod = "DecodeMoveImmInstruction";
+}
 
-  let Inst{31-24} = 0b11010100;
-  let Inst{23-21} = opc;
-  let Inst{20-5} = UImm16;
-  let Inst{4-2} = op2;
-  let Inst{1-0} = ll;
+multiclass MoveImmediate<bits<2> opc, string asm> {
+  def Wi : BaseMoveImmediate<opc, GPR32, movimm32_shift, asm> {
+    let Inst{31} = 0;
+  }
+
+  def Xi : BaseMoveImmediate<opc, GPR64, movimm64_shift, asm> {
+    let Inst{31} = 1;
+  }
 }
 
-// Format for extract (immediate) instructions
-class A64I_extract<bit sf, bits<3> op, bit n,
-                   dag outs, dag ins, string asmstr,
-                   list<dag> patterns, InstrItinClass itin>
-  : A64InstRdnm<outs, ins, asmstr, patterns, itin> {
-  bits<6> LSB;
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseInsertImmediate<bits<2> opc, RegisterClass regtype, Operand shifter,
+                          string asm>
+  : I<(outs regtype:$Rd),
+      (ins regtype:$src, movimm32_imm:$imm, shifter:$shift),
+       asm, "\t$Rd, $imm$shift", "$src = $Rd", []>,
+    Sched<[WriteI, ReadI]> {
+  bits<5> Rd;
+  bits<16> imm;
+  bits<6> shift;
+  let Inst{30-29} = opc;
+  let Inst{28-23} = 0b100101;
+  let Inst{22-21} = shift{5-4};
+  let Inst{20-5}  = imm;
+  let Inst{4-0}   = Rd;
 
-  let Inst{31} = sf;
-  let Inst{30-29} = op{2-1};
-  let Inst{28-23} = 0b100111;
-  let Inst{22} = n;
-  let Inst{21} = op{0};
-  // Inherits Rm in bits 20-16
-  let Inst{15-10} = LSB;
-  // Inherits Rn in 9-5
-  // Inherits Rd in 4-0
+  let DecoderMethod = "DecodeMoveImmInstruction";
 }
 
-let Predicates = [HasFPARMv8] in {
+multiclass InsertImmediate<bits<2> opc, string asm> {
+  def Wi : BaseInsertImmediate<opc, GPR32, movimm32_shift, asm> {
+    let Inst{31} = 0;
+  }
+
+  def Xi : BaseInsertImmediate<opc, GPR64, movimm64_shift, asm> {
+    let Inst{31} = 1;
+  }
+}
 
-// Format for floating-point compare instructions.
-class A64I_fpcmp<bit m, bit s, bits<2> type, bits<2> op, bits<5> opcode2,
-                dag outs, dag ins, string asmstr,
-                list<dag> patterns, InstrItinClass itin>
-  : A64Inst<outs, ins, asmstr, patterns, itin> {
+//---
+// Add/Subtract
+//---
+
+class BaseAddSubImm<bit isSub, bit setFlags, RegisterClass dstRegtype,
+                    RegisterClass srcRegtype, addsub_shifted_imm immtype,
+                    string asm, SDPatternOperator OpNode>
+    : I<(outs dstRegtype:$Rd), (ins srcRegtype:$Rn, immtype:$imm),
+        asm, "\t$Rd, $Rn, $imm", "",
+        [(set dstRegtype:$Rd, (OpNode srcRegtype:$Rn, immtype:$imm))]>,
+      Sched<[WriteI, ReadI]>  {
+  bits<5>  Rd;
+  bits<5>  Rn;
+  bits<14> imm;
+  let Inst{30}    = isSub;
+  let Inst{29}    = setFlags;
+  let Inst{28-24} = 0b10001;
+  let Inst{23-22} = imm{13-12}; // '00' => lsl #0, '01' => lsl #12
+  let Inst{21-10} = imm{11-0};
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+  let DecoderMethod = "DecodeBaseAddSubImm";
+}
+
+class BaseAddSubRegPseudo<RegisterClass regtype,
+                          SDPatternOperator OpNode>
+    : Pseudo<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm),
+             [(set regtype:$Rd, (OpNode regtype:$Rn, regtype:$Rm))]>,
+      Sched<[WriteI, ReadI, ReadI]>;
+
+class BaseAddSubSReg<bit isSub, bit setFlags, RegisterClass regtype,
+                     arith_shifted_reg shifted_regtype, string asm,
+                     SDPatternOperator OpNode>
+    : I<(outs regtype:$Rd), (ins regtype:$Rn, shifted_regtype:$Rm),
+        asm, "\t$Rd, $Rn, $Rm", "",
+        [(set regtype:$Rd, (OpNode regtype:$Rn, shifted_regtype:$Rm))]>,
+      Sched<[WriteISReg, ReadI, ReadISReg]> {
+  // The operands are in order to match the 'addr' MI operands, so we
+  // don't need an encoder method and by-name matching. Just use the default
+  // in-order handling. Since we're using by-order, make sure the names
+  // do not match.
+  bits<5> dst;
+  bits<5> src1;
+  bits<5> src2;
+  bits<8> shift;
+  let Inst{30}    = isSub;
+  let Inst{29}    = setFlags;
+  let Inst{28-24} = 0b01011;
+  let Inst{23-22} = shift{7-6};
+  let Inst{21}    = 0;
+  let Inst{20-16} = src2;
+  let Inst{15-10} = shift{5-0};
+  let Inst{9-5}   = src1;
+  let Inst{4-0}   = dst;
+
+  let DecoderMethod = "DecodeThreeAddrSRegInstruction";
+}
+
+class BaseAddSubEReg<bit isSub, bit setFlags, RegisterClass dstRegtype,
+                     RegisterClass src1Regtype, Operand src2Regtype,
+                     string asm, SDPatternOperator OpNode>
+    : I<(outs dstRegtype:$R1),
+        (ins src1Regtype:$R2, src2Regtype:$R3),
+        asm, "\t$R1, $R2, $R3", "",
+        [(set dstRegtype:$R1, (OpNode src1Regtype:$R2, src2Regtype:$R3))]>,
+      Sched<[WriteIEReg, ReadI, ReadIEReg]> {
+  bits<5> Rd;
   bits<5> Rn;
   bits<5> Rm;
+  bits<6> ext;
+  let Inst{30}    = isSub;
+  let Inst{29}    = setFlags;
+  let Inst{28-24} = 0b01011;
+  let Inst{23-21} = 0b001;
+  let Inst{20-16} = Rm;
+  let Inst{15-13} = ext{5-3};
+  let Inst{12-10} = ext{2-0};
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
 
-  let Inst{31} = m;
-  let Inst{30} = 0b0;
-  let Inst{29} = s;
-  let Inst{28-24} = 0b11110;
-  let Inst{23-22} = type;
-  let Inst{21} = 0b1;
+  let DecoderMethod = "DecodeAddSubERegInstruction";
+}
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseAddSubEReg64<bit isSub, bit setFlags, RegisterClass dstRegtype,
+                       RegisterClass src1Regtype, RegisterClass src2Regtype,
+                       Operand ext_op, string asm>
+    : I<(outs dstRegtype:$Rd),
+        (ins src1Regtype:$Rn, src2Regtype:$Rm, ext_op:$ext),
+        asm, "\t$Rd, $Rn, $Rm$ext", "", []>,
+      Sched<[WriteIEReg, ReadI, ReadIEReg]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  bits<5> Rm;
+  bits<6> ext;
+  let Inst{30}    = isSub;
+  let Inst{29}    = setFlags;
+  let Inst{28-24} = 0b01011;
+  let Inst{23-21} = 0b001;
   let Inst{20-16} = Rm;
-  let Inst{15-14} = op;
-  let Inst{13-10} = 0b1000;
-  let Inst{9-5} = Rn;
-  let Inst{4-0} = opcode2;
+  let Inst{15}    = ext{5};
+  let Inst{12-10} = ext{2-0};
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+
+  let DecoderMethod = "DecodeAddSubERegInstruction";
+}
+
+// Aliases for register+register add/subtract.
+class AddSubRegAlias<string asm, Instruction inst, RegisterClass dstRegtype,
+                     RegisterClass src1Regtype, RegisterClass src2Regtype,
+                     int shiftExt>
+    : InstAlias<asm#" $dst, $src1, $src2",
+                (inst dstRegtype:$dst, src1Regtype:$src1, src2Regtype:$src2,
+                      shiftExt)>;
+
+multiclass AddSub<bit isSub, string mnemonic,
+                  SDPatternOperator OpNode = null_frag> {
+  let hasSideEffects = 0 in {
+  // Add/Subtract immediate
+  def Wri  : BaseAddSubImm<isSub, 0, GPR32sp, GPR32sp, addsub_shifted_imm32,
+                           mnemonic, OpNode> {
+    let Inst{31} = 0;
+  }
+  def Xri  : BaseAddSubImm<isSub, 0, GPR64sp, GPR64sp, addsub_shifted_imm64,
+                           mnemonic, OpNode> {
+    let Inst{31} = 1;
+  }
+
+  // Add/Subtract register - Only used for CodeGen
+  def Wrr : BaseAddSubRegPseudo<GPR32, OpNode>;
+  def Xrr : BaseAddSubRegPseudo<GPR64, OpNode>;
+
+  // Add/Subtract shifted register
+  def Wrs : BaseAddSubSReg<isSub, 0, GPR32, arith_shifted_reg32, mnemonic,
+                           OpNode> {
+    let Inst{31} = 0;
+  }
+  def Xrs : BaseAddSubSReg<isSub, 0, GPR64, arith_shifted_reg64, mnemonic,
+                           OpNode> {
+    let Inst{31} = 1;
+  }
+  }
+
+  // Add/Subtract extended register
+  let AddedComplexity = 1, hasSideEffects = 0 in {
+  def Wrx : BaseAddSubEReg<isSub, 0, GPR32sp, GPR32sp,
+                           arith_extended_reg32<i32>, mnemonic, OpNode> {
+    let Inst{31} = 0;
+  }
+  def Xrx : BaseAddSubEReg<isSub, 0, GPR64sp, GPR64sp,
+                           arith_extended_reg32to64<i64>, mnemonic, OpNode> {
+    let Inst{31} = 1;
+  }
+  }
+
+  def Xrx64 : BaseAddSubEReg64<isSub, 0, GPR64sp, GPR64sp, GPR64,
+                               arith_extendlsl64, mnemonic> {
+    // UXTX and SXTX only.
+    let Inst{14-13} = 0b11;
+    let Inst{31} = 1;
+  }
+
+  // Register/register aliases with no shift when SP is not used.
+  def : AddSubRegAlias<mnemonic, !cast<Instruction>(NAME#"Wrs"),
+                       GPR32, GPR32, GPR32, 0>;
+  def : AddSubRegAlias<mnemonic, !cast<Instruction>(NAME#"Xrs"),
+                       GPR64, GPR64, GPR64, 0>;
+
+  // Register/register aliases with no shift when either the destination or
+  // first source register is SP.
+  def : AddSubRegAlias<mnemonic, !cast<Instruction>(NAME#"Wrx"),
+                       GPR32sponly, GPR32sp, GPR32, 16>; // UXTW #0
+  def : AddSubRegAlias<mnemonic, !cast<Instruction>(NAME#"Wrx"),
+                       GPR32sp, GPR32sponly, GPR32, 16>; // UXTW #0
+  def : AddSubRegAlias<mnemonic,
+                       !cast<Instruction>(NAME#"Xrx64"),
+                       GPR64sponly, GPR64sp, GPR64, 24>; // UXTX #0
+  def : AddSubRegAlias<mnemonic,
+                       !cast<Instruction>(NAME#"Xrx64"),
+                       GPR64sp, GPR64sponly, GPR64, 24>; // UXTX #0
 }
 
-// Format for floating-point conditional compare instructions.
-class A64I_fpccmp<bit m, bit s, bits<2> type, bit op,
-                 dag outs, dag ins, string asmstr,
-                 list<dag> patterns, InstrItinClass itin>
-  : A64InstRdn<outs, ins, asmstr, patterns, itin> {
+multiclass AddSubS<bit isSub, string mnemonic, SDNode OpNode, string cmp> {
+  let isCompare = 1, Defs = [NZCV] in {
+  // Add/Subtract immediate
+  def Wri  : BaseAddSubImm<isSub, 1, GPR32, GPR32sp, addsub_shifted_imm32,
+                           mnemonic, OpNode> {
+    let Inst{31} = 0;
+  }
+  def Xri  : BaseAddSubImm<isSub, 1, GPR64, GPR64sp, addsub_shifted_imm64,
+                           mnemonic, OpNode> {
+    let Inst{31} = 1;
+  }
+
+  // Add/Subtract register
+  def Wrr : BaseAddSubRegPseudo<GPR32, OpNode>;
+  def Xrr : BaseAddSubRegPseudo<GPR64, OpNode>;
+
+  // Add/Subtract shifted register
+  def Wrs : BaseAddSubSReg<isSub, 1, GPR32, arith_shifted_reg32, mnemonic,
+                           OpNode> {
+    let Inst{31} = 0;
+  }
+  def Xrs : BaseAddSubSReg<isSub, 1, GPR64, arith_shifted_reg64, mnemonic,
+                           OpNode> {
+    let Inst{31} = 1;
+  }
+
+  // Add/Subtract extended register
+  let AddedComplexity = 1 in {
+  def Wrx : BaseAddSubEReg<isSub, 1, GPR32, GPR32sp,
+                           arith_extended_reg32<i32>, mnemonic, OpNode> {
+    let Inst{31} = 0;
+  }
+  def Xrx : BaseAddSubEReg<isSub, 1, GPR64, GPR64sp,
+                           arith_extended_reg32<i64>, mnemonic, OpNode> {
+    let Inst{31} = 1;
+  }
+  }
+
+  def Xrx64 : BaseAddSubEReg64<isSub, 1, GPR64, GPR64sp, GPR64,
+                               arith_extendlsl64, mnemonic> {
+    // UXTX and SXTX only.
+    let Inst{14-13} = 0b11;
+    let Inst{31} = 1;
+  }
+  } // Defs = [NZCV]
+
+  // Compare aliases
+  def : InstAlias<cmp#" $src, $imm", (!cast<Instruction>(NAME#"Wri")
+                  WZR, GPR32sp:$src, addsub_shifted_imm32:$imm), 5>;
+  def : InstAlias<cmp#" $src, $imm", (!cast<Instruction>(NAME#"Xri")
+                  XZR, GPR64sp:$src, addsub_shifted_imm64:$imm), 5>;
+  def : InstAlias<cmp#" $src1, $src2$sh", (!cast<Instruction>(NAME#"Wrx")
+                  WZR, GPR32sp:$src1, GPR32:$src2, arith_extend:$sh), 4>;
+  def : InstAlias<cmp#" $src1, $src2$sh", (!cast<Instruction>(NAME#"Xrx")
+                  XZR, GPR64sp:$src1, GPR32:$src2, arith_extend:$sh), 4>;
+  def : InstAlias<cmp#" $src1, $src2$sh", (!cast<Instruction>(NAME#"Xrx64")
+                  XZR, GPR64sp:$src1, GPR64:$src2, arith_extendlsl64:$sh), 4>;
+  def : InstAlias<cmp#" $src1, $src2$sh", (!cast<Instruction>(NAME#"Wrs")
+                  WZR, GPR32:$src1, GPR32:$src2, arith_shift32:$sh), 4>;
+  def : InstAlias<cmp#" $src1, $src2$sh", (!cast<Instruction>(NAME#"Xrs")
+                  XZR, GPR64:$src1, GPR64:$src2, arith_shift64:$sh), 4>;
+
+  // Compare shorthands
+  def : InstAlias<cmp#" $src1, $src2", (!cast<Instruction>(NAME#"Wrs")
+                  WZR, GPR32:$src1, GPR32:$src2, 0), 5>;
+  def : InstAlias<cmp#" $src1, $src2", (!cast<Instruction>(NAME#"Xrs")
+                  XZR, GPR64:$src1, GPR64:$src2, 0), 5>;
+
+  // Register/register aliases with no shift when SP is not used.
+  def : AddSubRegAlias<mnemonic, !cast<Instruction>(NAME#"Wrs"),
+                       GPR32, GPR32, GPR32, 0>;
+  def : AddSubRegAlias<mnemonic, !cast<Instruction>(NAME#"Xrs"),
+                       GPR64, GPR64, GPR64, 0>;
+
+  // Register/register aliases with no shift when the first source register
+  // is SP.
+  def : AddSubRegAlias<mnemonic, !cast<Instruction>(NAME#"Wrx"),
+                       GPR32, GPR32sponly, GPR32, 16>; // UXTW #0
+  def : AddSubRegAlias<mnemonic,
+                       !cast<Instruction>(NAME#"Xrx64"),
+                       GPR64, GPR64sponly, GPR64, 24>; // UXTX #0
+}
+
+//---
+// Extract
+//---
+def SDTA64EXTR : SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>,
+                                      SDTCisPtrTy<3>]>;
+def AArch64Extr : SDNode<"AArch64ISD::EXTR", SDTA64EXTR>;
+
+class BaseExtractImm<RegisterClass regtype, Operand imm_type, string asm,
+                     list<dag> patterns>
+    : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, imm_type:$imm),
+         asm, "\t$Rd, $Rn, $Rm, $imm", "", patterns>,
+      Sched<[WriteExtr, ReadExtrHi]> {
+  bits<5> Rd;
   bits<5> Rn;
   bits<5> Rm;
-  bits<4> NZCVImm;
-  bits<4> Cond;
+  bits<6> imm;
 
-  let Inst{31} = m;
-  let Inst{30} = 0b0;
-  let Inst{29} = s;
-  let Inst{28-24} = 0b11110;
-  let Inst{23-22} = type;
-  let Inst{21} = 0b1;
+  let Inst{30-23} = 0b00100111;
+  let Inst{21}    = 0;
   let Inst{20-16} = Rm;
-  let Inst{15-12} = Cond;
-  let Inst{11-10} = 0b01;
-  let Inst{9-5} = Rn;
-  let Inst{4} = op;
-  let Inst{3-0} = NZCVImm;
+  let Inst{15-10} = imm;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
 }
 
-// Format for floating-point conditional select instructions.
-class A64I_fpcondsel<bit m, bit s, bits<2> type,
-                     dag outs, dag ins, string asmstr,
-                     list<dag> patterns, InstrItinClass itin>
-  : A64InstRdnm<outs, ins, asmstr, patterns, itin> {
-  bits<4> Cond;
+multiclass ExtractImm<string asm> {
+  def Wrri : BaseExtractImm<GPR32, imm0_31, asm,
+                      [(set GPR32:$Rd,
+                        (AArch64Extr GPR32:$Rn, GPR32:$Rm, imm0_31:$imm))]> {
+    let Inst{31} = 0;
+    let Inst{22} = 0;
+    // imm<5> must be zero.
+    let imm{5}   = 0;
+  }
+  def Xrri : BaseExtractImm<GPR64, imm0_63, asm,
+                      [(set GPR64:$Rd,
+                        (AArch64Extr GPR64:$Rn, GPR64:$Rm, imm0_63:$imm))]> {
+
+    let Inst{31} = 1;
+    let Inst{22} = 1;
+  }
+}
 
-  let Inst{31} = m;
-  let Inst{30} = 0b0;
-  let Inst{29} = s;
-  let Inst{28-24} = 0b11110;
-  let Inst{23-22} = type;
-  let Inst{21} = 0b1;
-  // Inherit Rm in 20-16
-  let Inst{15-12} = Cond;
-  let Inst{11-10} = 0b11;
-  // Inherit Rn in 9-5
-  // Inherit Rd in 4-0
+//---
+// Bitfield
+//---
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseBitfieldImm<bits<2> opc,
+                      RegisterClass regtype, Operand imm_type, string asm>
+    : I<(outs regtype:$Rd), (ins regtype:$Rn, imm_type:$immr, imm_type:$imms),
+         asm, "\t$Rd, $Rn, $immr, $imms", "", []>,
+      Sched<[WriteIS, ReadI]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  bits<6> immr;
+  bits<6> imms;
+
+  let Inst{30-29} = opc;
+  let Inst{28-23} = 0b100110;
+  let Inst{21-16} = immr;
+  let Inst{15-10} = imms;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
 }
 
+multiclass BitfieldImm<bits<2> opc, string asm> {
+  def Wri : BaseBitfieldImm<opc, GPR32, imm0_31, asm> {
+    let Inst{31} = 0;
+    let Inst{22} = 0;
+    // imms<5> and immr<5> must be zero, else ReservedValue().
+    let Inst{21} = 0;
+    let Inst{15} = 0;
+  }
+  def Xri : BaseBitfieldImm<opc, GPR64, imm0_63, asm> {
+    let Inst{31} = 1;
+    let Inst{22} = 1;
+  }
+}
 
-// Format for floating-point data-processing (1 source) instructions.
-class A64I_fpdp1<bit m, bit s, bits<2> type, bits<6> opcode,
-                 dag outs, dag ins, string asmstr,
-                 list<dag> patterns, InstrItinClass itin>
-  : A64InstRdn<outs, ins, asmstr, patterns, itin> {
-  let Inst{31} = m;
-  let Inst{30} = 0b0;
-  let Inst{29} = s;
-  let Inst{28-24} = 0b11110;
-  let Inst{23-22} = type;
-  let Inst{21} = 0b1;
-  let Inst{20-15} = opcode;
-  let Inst{14-10} = 0b10000;
-  // Inherit Rn in 9-5
-  // Inherit Rd in 4-0
-}
-
-// Format for floating-point data-processing (2 sources) instructions.
-class A64I_fpdp2<bit m, bit s, bits<2> type, bits<4> opcode,
-                 dag outs, dag ins, string asmstr,
-                 list<dag> patterns, InstrItinClass itin>
-  : A64InstRdnm<outs, ins, asmstr, patterns, itin> {
-  let Inst{31} = m;
-  let Inst{30} = 0b0;
-  let Inst{29} = s;
-  let Inst{28-24} = 0b11110;
-  let Inst{23-22} = type;
-  let Inst{21} = 0b1;
-  // Inherit Rm in 20-16
-  let Inst{15-12} = opcode;
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseBitfieldImmWith2RegArgs<bits<2> opc,
+                      RegisterClass regtype, Operand imm_type, string asm>
+    : I<(outs regtype:$Rd), (ins regtype:$src, regtype:$Rn, imm_type:$immr,
+                             imm_type:$imms),
+         asm, "\t$Rd, $Rn, $immr, $imms", "$src = $Rd", []>,
+      Sched<[WriteIS, ReadI]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  bits<6> immr;
+  bits<6> imms;
+
+  let Inst{30-29} = opc;
+  let Inst{28-23} = 0b100110;
+  let Inst{21-16} = immr;
+  let Inst{15-10} = imms;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+multiclass BitfieldImmWith2RegArgs<bits<2> opc, string asm> {
+  def Wri : BaseBitfieldImmWith2RegArgs<opc, GPR32, imm0_31, asm> {
+    let Inst{31} = 0;
+    let Inst{22} = 0;
+    // imms<5> and immr<5> must be zero, else ReservedValue().
+    let Inst{21} = 0;
+    let Inst{15} = 0;
+  }
+  def Xri : BaseBitfieldImmWith2RegArgs<opc, GPR64, imm0_63, asm> {
+    let Inst{31} = 1;
+    let Inst{22} = 1;
+  }
+}
+
+//---
+// Logical
+//---
+
+// Logical (immediate)
+class BaseLogicalImm<bits<2> opc, RegisterClass dregtype,
+                     RegisterClass sregtype, Operand imm_type, string asm,
+                     list<dag> pattern>
+    : I<(outs dregtype:$Rd), (ins sregtype:$Rn, imm_type:$imm),
+         asm, "\t$Rd, $Rn, $imm", "", pattern>,
+      Sched<[WriteI, ReadI]> {
+  bits<5>  Rd;
+  bits<5>  Rn;
+  bits<13> imm;
+  let Inst{30-29} = opc;
+  let Inst{28-23} = 0b100100;
+  let Inst{22}    = imm{12};
+  let Inst{21-16} = imm{11-6};
+  let Inst{15-10} = imm{5-0};
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+
+  let DecoderMethod = "DecodeLogicalImmInstruction";
+}
+
+// Logical (shifted register)
+class BaseLogicalSReg<bits<2> opc, bit N, RegisterClass regtype,
+                      logical_shifted_reg shifted_regtype, string asm,
+                      list<dag> pattern>
+    : I<(outs regtype:$Rd), (ins regtype:$Rn, shifted_regtype:$Rm),
+        asm, "\t$Rd, $Rn, $Rm", "", pattern>,
+      Sched<[WriteISReg, ReadI, ReadISReg]> {
+  // The operands are in order to match the 'addr' MI operands, so we
+  // don't need an encoder method and by-name matching. Just use the default
+  // in-order handling. Since we're using by-order, make sure the names
+  // do not match.
+  bits<5> dst;
+  bits<5> src1;
+  bits<5> src2;
+  bits<8> shift;
+  let Inst{30-29} = opc;
+  let Inst{28-24} = 0b01010;
+  let Inst{23-22} = shift{7-6};
+  let Inst{21}    = N;
+  let Inst{20-16} = src2;
+  let Inst{15-10} = shift{5-0};
+  let Inst{9-5}   = src1;
+  let Inst{4-0}   = dst;
+
+  let DecoderMethod = "DecodeThreeAddrSRegInstruction";
+}
+
+// Aliases for register+register logical instructions.
+class LogicalRegAlias<string asm, Instruction inst, RegisterClass regtype>
+    : InstAlias<asm#" $dst, $src1, $src2",
+                (inst regtype:$dst, regtype:$src1, regtype:$src2, 0)>;
+
+let AddedComplexity = 6 in
+multiclass LogicalImm<bits<2> opc, string mnemonic, SDNode OpNode> {
+  def Wri : BaseLogicalImm<opc, GPR32sp, GPR32, logical_imm32, mnemonic,
+                           [(set GPR32sp:$Rd, (OpNode GPR32:$Rn,
+                                               logical_imm32:$imm))]> {
+    let Inst{31} = 0;
+    let Inst{22} = 0; // 64-bit version has an additional bit of immediate.
+  }
+  def Xri : BaseLogicalImm<opc, GPR64sp, GPR64, logical_imm64, mnemonic,
+                           [(set GPR64sp:$Rd, (OpNode GPR64:$Rn,
+                                               logical_imm64:$imm))]> {
+    let Inst{31} = 1;
+  }
+}
+
+multiclass LogicalImmS<bits<2> opc, string mnemonic, SDNode OpNode> {
+  let isCompare = 1, Defs = [NZCV] in {
+  def Wri  : BaseLogicalImm<opc, GPR32, GPR32, logical_imm32, mnemonic,
+      [(set GPR32:$Rd, (OpNode GPR32:$Rn, logical_imm32:$imm))]> {
+    let Inst{31} = 0;
+    let Inst{22} = 0; // 64-bit version has an additional bit of immediate.
+  }
+  def Xri  : BaseLogicalImm<opc, GPR64, GPR64, logical_imm64, mnemonic,
+      [(set GPR64:$Rd, (OpNode GPR64:$Rn, logical_imm64:$imm))]> {
+    let Inst{31} = 1;
+  }
+  } // end Defs = [NZCV]
+}
+
+class BaseLogicalRegPseudo<RegisterClass regtype, SDPatternOperator OpNode>
+    : Pseudo<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm),
+             [(set regtype:$Rd, (OpNode regtype:$Rn, regtype:$Rm))]>,
+      Sched<[WriteI, ReadI, ReadI]>;
+
+// Split from LogicalImm as not all instructions have both.
+multiclass LogicalReg<bits<2> opc, bit N, string mnemonic,
+                      SDPatternOperator OpNode> {
+  def Wrr : BaseLogicalRegPseudo<GPR32, OpNode>;
+  def Xrr : BaseLogicalRegPseudo<GPR64, OpNode>;
+
+  def Wrs : BaseLogicalSReg<opc, N, GPR32, logical_shifted_reg32, mnemonic,
+                            [(set GPR32:$Rd, (OpNode GPR32:$Rn,
+                                                 logical_shifted_reg32:$Rm))]> {
+    let Inst{31} = 0;
+  }
+  def Xrs : BaseLogicalSReg<opc, N, GPR64, logical_shifted_reg64, mnemonic,
+                            [(set GPR64:$Rd, (OpNode GPR64:$Rn,
+                                                 logical_shifted_reg64:$Rm))]> {
+    let Inst{31} = 1;
+  }
+
+  def : LogicalRegAlias<mnemonic,
+                        !cast<Instruction>(NAME#"Wrs"), GPR32>;
+  def : LogicalRegAlias<mnemonic,
+                        !cast<Instruction>(NAME#"Xrs"), GPR64>;
+}
+
+// Split from LogicalReg to allow setting NZCV Defs
+multiclass LogicalRegS<bits<2> opc, bit N, string mnemonic,
+                       SDPatternOperator OpNode = null_frag> {
+  let Defs = [NZCV], mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
+  def Wrr : BaseLogicalRegPseudo<GPR32, OpNode>;
+  def Xrr : BaseLogicalRegPseudo<GPR64, OpNode>;
+
+  def Wrs : BaseLogicalSReg<opc, N, GPR32, logical_shifted_reg32, mnemonic,
+            [(set GPR32:$Rd, (OpNode GPR32:$Rn, logical_shifted_reg32:$Rm))]> {
+    let Inst{31} = 0;
+  }
+  def Xrs : BaseLogicalSReg<opc, N, GPR64, logical_shifted_reg64, mnemonic,
+            [(set GPR64:$Rd, (OpNode GPR64:$Rn, logical_shifted_reg64:$Rm))]> {
+    let Inst{31} = 1;
+  }
+  } // Defs = [NZCV]
+
+  def : LogicalRegAlias<mnemonic,
+                        !cast<Instruction>(NAME#"Wrs"), GPR32>;
+  def : LogicalRegAlias<mnemonic,
+                        !cast<Instruction>(NAME#"Xrs"), GPR64>;
+}
+
+//---
+// Conditionally set flags
+//---
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseCondSetFlagsImm<bit op, RegisterClass regtype, string asm>
+    : I<(outs), (ins regtype:$Rn, imm0_31:$imm, imm0_15:$nzcv, ccode:$cond),
+         asm, "\t$Rn, $imm, $nzcv, $cond", "", []>,
+      Sched<[WriteI, ReadI]> {
+  let Uses = [NZCV];
+  let Defs = [NZCV];
+
+  bits<5> Rn;
+  bits<5> imm;
+  bits<4> nzcv;
+  bits<4> cond;
+
+  let Inst{30}    = op;
+  let Inst{29-21} = 0b111010010;
+  let Inst{20-16} = imm;
+  let Inst{15-12} = cond;
   let Inst{11-10} = 0b10;
-  // Inherit Rn in 9-5
-  // Inherit Rd in 4-0
+  let Inst{9-5}   = Rn;
+  let Inst{4}     = 0b0;
+  let Inst{3-0}   = nzcv;
 }
 
-// Format for floating-point data-processing (3 sources) instructions.
-class A64I_fpdp3<bit m, bit s, bits<2> type, bit o1, bit o0,
-                 dag outs, dag ins, string asmstr,
-                 list<dag> patterns, InstrItinClass itin>
-  : A64InstRdnm<outs, ins, asmstr, patterns, itin> {
-  bits<5> Ra;
+multiclass CondSetFlagsImm<bit op, string asm> {
+  def Wi : BaseCondSetFlagsImm<op, GPR32, asm> {
+    let Inst{31} = 0;
+  }
+  def Xi : BaseCondSetFlagsImm<op, GPR64, asm> {
+    let Inst{31} = 1;
+  }
+}
 
-  let Inst{31} = m;
-  let Inst{30} = 0b0;
-  let Inst{29} = s;
-  let Inst{28-24} = 0b11111;
-  let Inst{23-22} = type;
-  let Inst{21} = o1;
-  // Inherit Rm in 20-16
-  let Inst{15} = o0;
-  let Inst{14-10} = Ra;
-  // Inherit Rn in 9-5
-  // Inherit Rd in 4-0
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseCondSetFlagsReg<bit op, RegisterClass regtype, string asm>
+    : I<(outs), (ins regtype:$Rn, regtype:$Rm, imm0_15:$nzcv, ccode:$cond),
+         asm, "\t$Rn, $Rm, $nzcv, $cond", "", []>,
+      Sched<[WriteI, ReadI, ReadI]> {
+  let Uses = [NZCV];
+  let Defs = [NZCV];
+
+  bits<5> Rn;
+  bits<5> Rm;
+  bits<4> nzcv;
+  bits<4> cond;
+
+  let Inst{30}    = op;
+  let Inst{29-21} = 0b111010010;
+  let Inst{20-16} = Rm;
+  let Inst{15-12} = cond;
+  let Inst{11-10} = 0b00;
+  let Inst{9-5}   = Rn;
+  let Inst{4}     = 0b0;
+  let Inst{3-0}   = nzcv;
 }
 
-// Format for floating-point <-> fixed-point conversion instructions.
-class A64I_fpfixed<bit sf, bit s, bits<2> type, bits<2> mode, bits<3> opcode,
-                 dag outs, dag ins, string asmstr,
-                 list<dag> patterns, InstrItinClass itin>
-  : A64InstRdn<outs, ins, asmstr, patterns, itin> {
-  bits<6> Scale;
+multiclass CondSetFlagsReg<bit op, string asm> {
+  def Wr : BaseCondSetFlagsReg<op, GPR32, asm> {
+    let Inst{31} = 0;
+  }
+  def Xr : BaseCondSetFlagsReg<op, GPR64, asm> {
+    let Inst{31} = 1;
+  }
+}
 
-  let Inst{31} = sf;
-  let Inst{30} = 0b0;
-  let Inst{29} = s;
-  let Inst{28-24} = 0b11110;
-  let Inst{23-22} = type;
-  let Inst{21} = 0b0;
-  let Inst{20-19} = mode;
-  let Inst{18-16} = opcode;
-  let Inst{15-10} = Scale;
-  // Inherit Rn in 9-5
-  // Inherit Rd in 4-0
+//---
+// Conditional select
+//---
+
+class BaseCondSelect<bit op, bits<2> op2, RegisterClass regtype, string asm>
+    : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, ccode:$cond),
+         asm, "\t$Rd, $Rn, $Rm, $cond", "",
+         [(set regtype:$Rd,
+               (AArch64csel regtype:$Rn, regtype:$Rm, (i32 imm:$cond), NZCV))]>,
+      Sched<[WriteI, ReadI, ReadI]> {
+  let Uses = [NZCV];
+
+  bits<5> Rd;
+  bits<5> Rn;
+  bits<5> Rm;
+  bits<4> cond;
+
+  let Inst{30}    = op;
+  let Inst{29-21} = 0b011010100;
+  let Inst{20-16} = Rm;
+  let Inst{15-12} = cond;
+  let Inst{11-10} = op2;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
 }
 
-// Format for floating-point <-> integer conversion instructions.
-class A64I_fpint<bit sf, bit s, bits<2> type, bits<2> rmode, bits<3> opcode,
-                 dag outs, dag ins, string asmstr,
-                 list<dag> patterns, InstrItinClass itin>
-  : A64InstRdn<outs, ins, asmstr, patterns, itin> {
-  let Inst{31} = sf;
-  let Inst{30} = 0b0;
-  let Inst{29} = s;
-  let Inst{28-24} = 0b11110;
-  let Inst{23-22} = type;
-  let Inst{21} = 0b1;
-  let Inst{20-19} = rmode;
-  let Inst{18-16} = opcode;
-  let Inst{15-10} = 0b000000;
-  // Inherit Rn in 9-5
-  // Inherit Rd in 4-0
+multiclass CondSelect<bit op, bits<2> op2, string asm> {
+  def Wr : BaseCondSelect<op, op2, GPR32, asm> {
+    let Inst{31} = 0;
+  }
+  def Xr : BaseCondSelect<op, op2, GPR64, asm> {
+    let Inst{31} = 1;
+  }
 }
 
+class BaseCondSelectOp<bit op, bits<2> op2, RegisterClass regtype, string asm,
+                       PatFrag frag>
+    : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, ccode:$cond),
+         asm, "\t$Rd, $Rn, $Rm, $cond", "",
+         [(set regtype:$Rd,
+               (AArch64csel regtype:$Rn, (frag regtype:$Rm),
+               (i32 imm:$cond), NZCV))]>,
+      Sched<[WriteI, ReadI, ReadI]> {
+  let Uses = [NZCV];
 
-// Format for floating-point immediate instructions.
-class A64I_fpimm<bit m, bit s, bits<2> type, bits<5> imm5,
-                 dag outs, dag ins, string asmstr,
-                 list<dag> patterns, InstrItinClass itin>
-  : A64InstRd<outs, ins, asmstr, patterns, itin> {
-  bits<8> Imm8;
+  bits<5> Rd;
+  bits<5> Rn;
+  bits<5> Rm;
+  bits<4> cond;
 
-  let Inst{31} = m;
-  let Inst{30} = 0b0;
-  let Inst{29} = s;
-  let Inst{28-24} = 0b11110;
-  let Inst{23-22} = type;
-  let Inst{21} = 0b1;
-  let Inst{20-13} = Imm8;
-  let Inst{12-10} = 0b100;
-  let Inst{9-5} = imm5;
-  // Inherit Rd in 4-0
+  let Inst{30}    = op;
+  let Inst{29-21} = 0b011010100;
+  let Inst{20-16} = Rm;
+  let Inst{15-12} = cond;
+  let Inst{11-10} = op2;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+def inv_cond_XFORM : SDNodeXForm<imm, [{
+  AArch64CC::CondCode CC = static_cast<AArch64CC::CondCode>(N->getZExtValue());
+  return CurDAG->getTargetConstant(AArch64CC::getInvertedCondCode(CC), MVT::i32);
+}]>;
+
+multiclass CondSelectOp<bit op, bits<2> op2, string asm, PatFrag frag> {
+  def Wr : BaseCondSelectOp<op, op2, GPR32, asm, frag> {
+    let Inst{31} = 0;
+  }
+  def Xr : BaseCondSelectOp<op, op2, GPR64, asm, frag> {
+    let Inst{31} = 1;
+  }
+
+  def : Pat<(AArch64csel (frag GPR32:$Rm), GPR32:$Rn, (i32 imm:$cond), NZCV),
+            (!cast<Instruction>(NAME # Wr) GPR32:$Rn, GPR32:$Rm,
+                                           (inv_cond_XFORM imm:$cond))>;
+
+  def : Pat<(AArch64csel (frag GPR64:$Rm), GPR64:$Rn, (i32 imm:$cond), NZCV),
+            (!cast<Instruction>(NAME # Xr) GPR64:$Rn, GPR64:$Rm,
+                                           (inv_cond_XFORM imm:$cond))>;
+}
+
+//---
+// Special Mask Value
+//---
+def maski8_or_more : Operand<i32>,
+  ImmLeaf<i32, [{ return (Imm & 0xff) == 0xff; }]> {
+}
+def maski16_or_more : Operand<i32>,
+  ImmLeaf<i32, [{ return (Imm & 0xffff) == 0xffff; }]> {
+}
+
+
+//---
+// Load/store
+//---
+
+// (unsigned immediate)
+// Indexed for 8-bit registers. offset is in range [0,4095].
+def am_indexed8 : ComplexPattern<i64, 2, "SelectAddrModeIndexed8", []>;
+def am_indexed16 : ComplexPattern<i64, 2, "SelectAddrModeIndexed16", []>;
+def am_indexed32 : ComplexPattern<i64, 2, "SelectAddrModeIndexed32", []>;
+def am_indexed64 : ComplexPattern<i64, 2, "SelectAddrModeIndexed64", []>;
+def am_indexed128 : ComplexPattern<i64, 2, "SelectAddrModeIndexed128", []>;
+
+class UImm12OffsetOperand<int Scale> : AsmOperandClass {
+  let Name = "UImm12Offset" # Scale;
+  let RenderMethod = "addUImm12OffsetOperands<" # Scale # ">";
+  let PredicateMethod = "isUImm12Offset<" # Scale # ">";
+  let DiagnosticType = "InvalidMemoryIndexed" # Scale;
+}
+
+def UImm12OffsetScale1Operand : UImm12OffsetOperand<1>;
+def UImm12OffsetScale2Operand : UImm12OffsetOperand<2>;
+def UImm12OffsetScale4Operand : UImm12OffsetOperand<4>;
+def UImm12OffsetScale8Operand : UImm12OffsetOperand<8>;
+def UImm12OffsetScale16Operand : UImm12OffsetOperand<16>;
+
+class uimm12_scaled<int Scale> : Operand<i64> {
+  let ParserMatchClass
+   = !cast<AsmOperandClass>("UImm12OffsetScale" # Scale # "Operand");
+  let EncoderMethod
+   = "getLdStUImm12OpValue<AArch64::fixup_aarch64_ldst_imm12_scale" # Scale # ">";
+  let PrintMethod = "printUImm12Offset<" # Scale # ">";
+}
+
+def uimm12s1 : uimm12_scaled<1>;
+def uimm12s2 : uimm12_scaled<2>;
+def uimm12s4 : uimm12_scaled<4>;
+def uimm12s8 : uimm12_scaled<8>;
+def uimm12s16 : uimm12_scaled<16>;
+
+class BaseLoadStoreUI<bits<2> sz, bit V, bits<2> opc, dag oops, dag iops,
+                      string asm, list<dag> pattern>
+    : I<oops, iops, asm, "\t$Rt, [$Rn, $offset]", "", pattern> {
+  bits<5> Rt;
+
+  bits<5> Rn;
+  bits<12> offset;
+
+  let Inst{31-30} = sz;
+  let Inst{29-27} = 0b111;
+  let Inst{26}    = V;
+  let Inst{25-24} = 0b01;
+  let Inst{23-22} = opc;
+  let Inst{21-10} = offset;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rt;
+
+  let DecoderMethod = "DecodeUnsignedLdStInstruction";
+}
+
+multiclass LoadUI<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+                  Operand indextype, string asm, list<dag> pattern> {
+  let AddedComplexity = 10, mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
+  def ui : BaseLoadStoreUI<sz, V, opc, (outs regtype:$Rt),
+                           (ins GPR64sp:$Rn, indextype:$offset),
+                           asm, pattern>,
+           Sched<[WriteLD]>;
+
+  def : InstAlias<asm # " $Rt, [$Rn]",
+                  (!cast<Instruction>(NAME # "ui") regtype:$Rt, GPR64sp:$Rn, 0)>;
 }
 
+multiclass StoreUI<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+             Operand indextype, string asm, list<dag> pattern> {
+  let AddedComplexity = 10, mayLoad = 0, mayStore = 1, hasSideEffects = 0 in
+  def ui : BaseLoadStoreUI<sz, V, opc, (outs),
+                           (ins regtype:$Rt, GPR64sp:$Rn, indextype:$offset),
+                           asm, pattern>,
+           Sched<[WriteST]>;
+
+  def : InstAlias<asm # " $Rt, [$Rn]",
+                  (!cast<Instruction>(NAME # "ui") regtype:$Rt, GPR64sp:$Rn, 0)>;
+}
+
+def PrefetchOperand : AsmOperandClass {
+  let Name = "Prefetch";
+  let ParserMethod = "tryParsePrefetch";
+}
+def prfop : Operand<i32> {
+  let PrintMethod = "printPrefetchOp";
+  let ParserMatchClass = PrefetchOperand;
 }
 
-// Format for load-register (literal) instructions.
-class A64I_LDRlit<bits<2> opc, bit v,
-                  dag outs, dag ins, string asmstr,
-                  list<dag> patterns, InstrItinClass itin>
-  : A64InstRt<outs, ins, asmstr, patterns, itin> {
-  bits<19> Imm19;
+let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in
+class PrefetchUI<bits<2> sz, bit V, bits<2> opc, string asm, list<dag> pat>
+    : BaseLoadStoreUI<sz, V, opc,
+                      (outs), (ins prfop:$Rt, GPR64sp:$Rn, uimm12s8:$offset),
+                      asm, pat>,
+      Sched<[WriteLD]>;
+
+//---
+// Load literal
+//---
+
+// Load literal address: 19-bit immediate. The low two bits of the target
+// offset are implied zero and so are not part of the immediate.
+def am_ldrlit : Operand<OtherVT> {
+  let EncoderMethod = "getLoadLiteralOpValue";
+  let DecoderMethod = "DecodePCRelLabel19";
+  let PrintMethod = "printAlignedLabel";
+  let ParserMatchClass = PCRelLabel19Operand;
+}
 
+let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
+class LoadLiteral<bits<2> opc, bit V, RegisterClass regtype, string asm>
+    : I<(outs regtype:$Rt), (ins am_ldrlit:$label),
+        asm, "\t$Rt, $label", "", []>,
+      Sched<[WriteLD]> {
+  bits<5> Rt;
+  bits<19> label;
   let Inst{31-30} = opc;
   let Inst{29-27} = 0b011;
-  let Inst{26} = v;
+  let Inst{26}    = V;
   let Inst{25-24} = 0b00;
-  let Inst{23-5} = Imm19;
-  // Inherit Rt in 4-0
+  let Inst{23-5}  = label;
+  let Inst{4-0}   = Rt;
 }
 
-// Format for load-store exclusive instructions.
-class A64I_LDSTex_tn<bits<2> size, bit o2, bit L, bit o1, bit o0,
-                 dag outs, dag ins, string asmstr,
-                 list <dag> patterns, InstrItinClass itin>
-  : A64InstRtn<outs, ins, asmstr, patterns, itin> {
-  let Inst{31-30} = size;
-  let Inst{29-24} = 0b001000;
-  let Inst{23} = o2;
-  let Inst{22} = L;
-  let Inst{21} = o1;
-  let Inst{15} = o0;
+let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in
+class PrefetchLiteral<bits<2> opc, bit V, string asm, list<dag> pat>
+    : I<(outs), (ins prfop:$Rt, am_ldrlit:$label),
+        asm, "\t$Rt, $label", "", pat>,
+      Sched<[WriteLD]> {
+  bits<5> Rt;
+  bits<19> label;
+  let Inst{31-30} = opc;
+  let Inst{29-27} = 0b011;
+  let Inst{26}    = V;
+  let Inst{25-24} = 0b00;
+  let Inst{23-5}  = label;
+  let Inst{4-0}   = Rt;
 }
 
-class A64I_LDSTex_tt2n<bits<2> size, bit o2, bit L, bit o1, bit o0,
-                     dag outs, dag ins, string asmstr,
-                     list <dag> patterns, InstrItinClass itin>:
-      A64I_LDSTex_tn<size, o2, L, o1, o0, outs, ins, asmstr, patterns, itin>{
-   bits<5> Rt2;
-   let Inst{14-10} = Rt2;
+//---
+// Load/store register offset
+//---
+
+def ro_Xindexed8 : ComplexPattern<i64, 4, "SelectAddrModeXRO<8>", []>;
+def ro_Xindexed16 : ComplexPattern<i64, 4, "SelectAddrModeXRO<16>", []>;
+def ro_Xindexed32 : ComplexPattern<i64, 4, "SelectAddrModeXRO<32>", []>;
+def ro_Xindexed64 : ComplexPattern<i64, 4, "SelectAddrModeXRO<64>", []>;
+def ro_Xindexed128 : ComplexPattern<i64, 4, "SelectAddrModeXRO<128>", []>;
+
+def ro_Windexed8 : ComplexPattern<i64, 4, "SelectAddrModeWRO<8>", []>;
+def ro_Windexed16 : ComplexPattern<i64, 4, "SelectAddrModeWRO<16>", []>;
+def ro_Windexed32 : ComplexPattern<i64, 4, "SelectAddrModeWRO<32>", []>;
+def ro_Windexed64 : ComplexPattern<i64, 4, "SelectAddrModeWRO<64>", []>;
+def ro_Windexed128 : ComplexPattern<i64, 4, "SelectAddrModeWRO<128>", []>;
+
+class MemExtendOperand<string Reg, int Width> : AsmOperandClass {
+  let Name = "Mem" # Reg # "Extend" # Width;
+  let PredicateMethod = "isMem" # Reg # "Extend<" # Width # ">";
+  let RenderMethod = "addMemExtendOperands";
+  let DiagnosticType = "InvalidMemory" # Reg # "Extend" # Width;
 }
 
-class A64I_LDSTex_stn<bits<2> size, bit o2, bit L, bit o1, bit o0,
-                     dag outs, dag ins, string asmstr,
-                     list <dag> patterns, InstrItinClass itin>:
-      A64I_LDSTex_tn<size, o2, L, o1, o0, outs, ins, asmstr, patterns, itin>{
-   bits<5> Rs;
-   let Inst{20-16} = Rs;
+def MemWExtend8Operand : MemExtendOperand<"W", 8> {
+  // The address "[x0, x1, lsl #0]" actually maps to the variant which performs
+  // the trivial shift.
+  let RenderMethod = "addMemExtend8Operands";
+}
+def MemWExtend16Operand : MemExtendOperand<"W", 16>;
+def MemWExtend32Operand : MemExtendOperand<"W", 32>;
+def MemWExtend64Operand : MemExtendOperand<"W", 64>;
+def MemWExtend128Operand : MemExtendOperand<"W", 128>;
+
+def MemXExtend8Operand : MemExtendOperand<"X", 8> {
+  // The address "[x0, x1, lsl #0]" actually maps to the variant which performs
+  // the trivial shift.
+  let RenderMethod = "addMemExtend8Operands";
+}
+def MemXExtend16Operand : MemExtendOperand<"X", 16>;
+def MemXExtend32Operand : MemExtendOperand<"X", 32>;
+def MemXExtend64Operand : MemExtendOperand<"X", 64>;
+def MemXExtend128Operand : MemExtendOperand<"X", 128>;
+
+class ro_extend<AsmOperandClass ParserClass, string Reg, int Width>
+        : Operand<i32> {
+  let ParserMatchClass = ParserClass;
+  let PrintMethod = "printMemExtend<'" # Reg # "', " # Width # ">";
+  let DecoderMethod = "DecodeMemExtend";
+  let EncoderMethod = "getMemExtendOpValue";
+  let MIOperandInfo = (ops i32imm:$signed, i32imm:$doshift);
 }
 
-class A64I_LDSTex_stt2n<bits<2> size, bit o2, bit L, bit o1, bit o0,
-                     dag outs, dag ins, string asmstr,
-                     list <dag> patterns, InstrItinClass itin>:
-      A64I_LDSTex_stn<size, o2, L, o1, o0, outs, ins, asmstr, patterns, itin>{
-   bits<5> Rt2;
-   let Inst{14-10} = Rt2;
+def ro_Wextend8   : ro_extend<MemWExtend8Operand,   "w", 8>;
+def ro_Wextend16  : ro_extend<MemWExtend16Operand,  "w", 16>;
+def ro_Wextend32  : ro_extend<MemWExtend32Operand,  "w", 32>;
+def ro_Wextend64  : ro_extend<MemWExtend64Operand,  "w", 64>;
+def ro_Wextend128 : ro_extend<MemWExtend128Operand, "w", 128>;
+
+def ro_Xextend8   : ro_extend<MemXExtend8Operand,   "x", 8>;
+def ro_Xextend16  : ro_extend<MemXExtend16Operand,  "x", 16>;
+def ro_Xextend32  : ro_extend<MemXExtend32Operand,  "x", 32>;
+def ro_Xextend64  : ro_extend<MemXExtend64Operand,  "x", 64>;
+def ro_Xextend128 : ro_extend<MemXExtend128Operand, "x", 128>;
+
+class ROAddrMode<ComplexPattern windex, ComplexPattern xindex,
+                  Operand wextend, Operand xextend>  {
+  // CodeGen-level pattern covering the entire addressing mode.
+  ComplexPattern Wpat = windex;
+  ComplexPattern Xpat = xindex;
+
+  // Asm-level Operand covering the valid "uxtw #3" style syntax.
+  Operand Wext = wextend;
+  Operand Xext = xextend;
 }
 
-// Format for load-store register (immediate post-indexed) instructions
-class A64I_LSpostind<bits<2> size, bit v, bits<2> opc,
-                     dag outs, dag ins, string asmstr,
-                     list<dag> patterns, InstrItinClass itin>
-  : A64InstRtn<outs, ins, asmstr, patterns, itin> {
-  bits<9> SImm9;
+def ro8 : ROAddrMode<ro_Windexed8, ro_Xindexed8, ro_Wextend8, ro_Xextend8>;
+def ro16 : ROAddrMode<ro_Windexed16, ro_Xindexed16, ro_Wextend16, ro_Xextend16>;
+def ro32 : ROAddrMode<ro_Windexed32, ro_Xindexed32, ro_Wextend32, ro_Xextend32>;
+def ro64 : ROAddrMode<ro_Windexed64, ro_Xindexed64, ro_Wextend64, ro_Xextend64>;
+def ro128 : ROAddrMode<ro_Windexed128, ro_Xindexed128, ro_Wextend128,
+                       ro_Xextend128>;
 
-  let Inst{31-30} = size;
+class LoadStore8RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+                      string asm, dag ins, dag outs, list<dag> pat>
+    : I<ins, outs, asm, "\t$Rt, [$Rn, $Rm, $extend]", "", pat> {
+  bits<5> Rt;
+  bits<5> Rn;
+  bits<5> Rm;
+  bits<2> extend;
+  let Inst{31-30} = sz;
   let Inst{29-27} = 0b111;
-  let Inst{26} = v;
+  let Inst{26}    = V;
   let Inst{25-24} = 0b00;
   let Inst{23-22} = opc;
-  let Inst{21} = 0b0;
-  let Inst{20-12} = SImm9;
-  let Inst{11-10} = 0b01;
-  // Inherit Rn in 9-5
-  // Inherit Rt in 4-0
+  let Inst{21}    = 1;
+  let Inst{20-16} = Rm;
+  let Inst{15}    = extend{1}; // sign extend Rm?
+  let Inst{14}    = 1;
+  let Inst{12}    = extend{0}; // do shift?
+  let Inst{11-10} = 0b10;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rt;
 }
 
-// Format for load-store register (immediate pre-indexed) instructions
-class A64I_LSpreind<bits<2> size, bit v, bits<2> opc,
-                    dag outs, dag ins, string asmstr,
-                    list<dag> patterns, InstrItinClass itin>
-  : A64InstRtn<outs, ins, asmstr, patterns, itin> {
-  bits<9> SImm9;
+class ROInstAlias<string asm, RegisterClass regtype, Instruction INST>
+  : InstAlias<asm # " $Rt, [$Rn, $Rm]",
+              (INST regtype:$Rt, GPR64sp:$Rn, GPR64:$Rm, 0, 0)>;
+
+multiclass Load8RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+                   string asm, ValueType Ty, SDPatternOperator loadop> {
+  let AddedComplexity = 10 in
+  def roW : LoadStore8RO<sz, V, opc, regtype, asm,
+                 (outs regtype:$Rt),
+                 (ins GPR64sp:$Rn, GPR32:$Rm, ro_Wextend8:$extend),
+                 [(set (Ty regtype:$Rt),
+                       (loadop (ro_Windexed8 GPR64sp:$Rn, GPR32:$Rm,
+                                             ro_Wextend8:$extend)))]>,
+           Sched<[WriteLDIdx, ReadAdrBase]> {
+    let Inst{13} = 0b0;
+  }
+
+  let AddedComplexity = 10 in
+  def roX : LoadStore8RO<sz, V, opc, regtype, asm,
+                 (outs regtype:$Rt),
+                 (ins GPR64sp:$Rn, GPR64:$Rm, ro_Xextend8:$extend),
+                 [(set (Ty regtype:$Rt),
+                       (loadop (ro_Xindexed8 GPR64sp:$Rn, GPR64:$Rm,
+                                             ro_Xextend8:$extend)))]>,
+           Sched<[WriteLDIdx, ReadAdrBase]> {
+    let Inst{13} = 0b1;
+  }
+
+  def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
+}
 
+multiclass Store8RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+                    string asm, ValueType Ty, SDPatternOperator storeop> {
+  let AddedComplexity = 10 in
+  def roW : LoadStore8RO<sz, V, opc, regtype, asm, (outs),
+                 (ins regtype:$Rt, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend8:$extend),
+                 [(storeop (Ty regtype:$Rt),
+                           (ro_Windexed8 GPR64sp:$Rn, GPR32:$Rm,
+                                         ro_Wextend8:$extend))]>,
+            Sched<[WriteSTIdx, ReadAdrBase]> {
+    let Inst{13} = 0b0;
+  }
+
+  let AddedComplexity = 10 in
+  def roX : LoadStore8RO<sz, V, opc, regtype, asm, (outs),
+                 (ins regtype:$Rt, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend8:$extend),
+                 [(storeop (Ty regtype:$Rt),
+                           (ro_Xindexed8 GPR64sp:$Rn, GPR64:$Rm,
+                                         ro_Xextend8:$extend))]>,
+            Sched<[WriteSTIdx, ReadAdrBase]> {
+    let Inst{13} = 0b1;
+  }
+
+  def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
+}
 
-  let Inst{31-30} = size;
+class LoadStore16RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+                      string asm, dag ins, dag outs, list<dag> pat>
+    : I<ins, outs, asm, "\t$Rt, [$Rn, $Rm, $extend]", "", pat> {
+  bits<5> Rt;
+  bits<5> Rn;
+  bits<5> Rm;
+  bits<2> extend;
+  let Inst{31-30} = sz;
   let Inst{29-27} = 0b111;
-  let Inst{26} = v;
+  let Inst{26}    = V;
   let Inst{25-24} = 0b00;
   let Inst{23-22} = opc;
-  let Inst{21} = 0b0;
-  let Inst{20-12} = SImm9;
-  let Inst{11-10} = 0b11;
-  // Inherit Rn in 9-5
-  // Inherit Rt in 4-0
+  let Inst{21}    = 1;
+  let Inst{20-16} = Rm;
+  let Inst{15}    = extend{1}; // sign extend Rm?
+  let Inst{14}    = 1;
+  let Inst{12}    = extend{0}; // do shift?
+  let Inst{11-10} = 0b10;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rt;
 }
 
-// Format for load-store register (unprivileged) instructions
-class A64I_LSunpriv<bits<2> size, bit v, bits<2> opc,
-                    dag outs, dag ins, string asmstr,
-                    list<dag> patterns, InstrItinClass itin>
-  : A64InstRtn<outs, ins, asmstr, patterns, itin> {
-  bits<9> SImm9;
+multiclass Load16RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+                    string asm, ValueType Ty, SDPatternOperator loadop> {
+  let AddedComplexity = 10 in
+  def roW : LoadStore16RO<sz, V, opc, regtype, asm, (outs regtype:$Rt),
+                 (ins GPR64sp:$Rn, GPR32:$Rm, ro_Wextend16:$extend),
+                 [(set (Ty regtype:$Rt),
+                       (loadop (ro_Windexed16 GPR64sp:$Rn, GPR32:$Rm,
+                                              ro_Wextend16:$extend)))]>,
+            Sched<[WriteLDIdx, ReadAdrBase]> {
+    let Inst{13} = 0b0;
+  }
+
+  let AddedComplexity = 10 in
+  def roX : LoadStore16RO<sz, V, opc, regtype, asm, (outs regtype:$Rt),
+                 (ins GPR64sp:$Rn, GPR64:$Rm, ro_Xextend16:$extend),
+                 [(set (Ty regtype:$Rt),
+                       (loadop (ro_Xindexed16 GPR64sp:$Rn, GPR64:$Rm,
+                                             ro_Xextend16:$extend)))]>,
+            Sched<[WriteLDIdx, ReadAdrBase]> {
+    let Inst{13} = 0b1;
+  }
+
+  def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
+}
 
+multiclass Store16RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+                     string asm, ValueType Ty, SDPatternOperator storeop> {
+  let AddedComplexity = 10 in
+  def roW : LoadStore16RO<sz, V, opc, regtype, asm, (outs),
+                (ins regtype:$Rt, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend16:$extend),
+                [(storeop (Ty regtype:$Rt),
+                          (ro_Windexed16 GPR64sp:$Rn, GPR32:$Rm,
+                                         ro_Wextend16:$extend))]>,
+           Sched<[WriteSTIdx, ReadAdrBase]> {
+    let Inst{13} = 0b0;
+  }
+
+  let AddedComplexity = 10 in
+  def roX : LoadStore16RO<sz, V, opc, regtype, asm, (outs),
+                (ins regtype:$Rt, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend16:$extend),
+                [(storeop (Ty regtype:$Rt),
+                          (ro_Xindexed16 GPR64sp:$Rn, GPR64:$Rm,
+                                         ro_Xextend16:$extend))]>,
+           Sched<[WriteSTIdx, ReadAdrBase]> {
+    let Inst{13} = 0b1;
+  }
+
+  def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
+}
 
-  let Inst{31-30} = size;
+class LoadStore32RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+                      string asm, dag ins, dag outs, list<dag> pat>
+    : I<ins, outs, asm, "\t$Rt, [$Rn, $Rm, $extend]", "", pat> {
+  bits<5> Rt;
+  bits<5> Rn;
+  bits<5> Rm;
+  bits<2> extend;
+  let Inst{31-30} = sz;
   let Inst{29-27} = 0b111;
-  let Inst{26} = v;
+  let Inst{26}    = V;
   let Inst{25-24} = 0b00;
   let Inst{23-22} = opc;
-  let Inst{21} = 0b0;
-  let Inst{20-12} = SImm9;
+  let Inst{21}    = 1;
+  let Inst{20-16} = Rm;
+  let Inst{15}    = extend{1}; // sign extend Rm?
+  let Inst{14}    = 1;
+  let Inst{12}    = extend{0}; // do shift?
   let Inst{11-10} = 0b10;
-  // Inherit Rn in 9-5
-  // Inherit Rt in 4-0
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rt;
+}
+
+multiclass Load32RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+                    string asm, ValueType Ty, SDPatternOperator loadop> {
+  let AddedComplexity = 10 in
+  def roW : LoadStore32RO<sz, V, opc, regtype, asm, (outs regtype:$Rt),
+                 (ins GPR64sp:$Rn, GPR32:$Rm, ro_Wextend32:$extend),
+                 [(set (Ty regtype:$Rt),
+                       (loadop (ro_Windexed32 GPR64sp:$Rn, GPR32:$Rm,
+                                              ro_Wextend32:$extend)))]>,
+           Sched<[WriteLDIdx, ReadAdrBase]> {
+    let Inst{13} = 0b0;
+  }
+
+  let AddedComplexity = 10 in
+  def roX : LoadStore32RO<sz, V, opc, regtype, asm, (outs regtype:$Rt),
+                 (ins GPR64sp:$Rn, GPR64:$Rm, ro_Xextend32:$extend),
+                 [(set (Ty regtype:$Rt),
+                       (loadop (ro_Xindexed32 GPR64sp:$Rn, GPR64:$Rm,
+                                              ro_Xextend32:$extend)))]>,
+           Sched<[WriteLDIdx, ReadAdrBase]> {
+    let Inst{13} = 0b1;
+  }
+
+  def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
 }
 
-// Format for load-store (unscaled immediate) instructions.
-class A64I_LSunalimm<bits<2> size, bit v, bits<2> opc,
-                     dag outs, dag ins, string asmstr,
-                     list<dag> patterns, InstrItinClass itin>
-  : A64InstRtn<outs, ins, asmstr, patterns, itin> {
-  bits<9> SImm9;
+multiclass Store32RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+                     string asm, ValueType Ty, SDPatternOperator storeop> {
+  let AddedComplexity = 10 in
+  def roW : LoadStore32RO<sz, V, opc, regtype, asm, (outs),
+                (ins regtype:$Rt, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend32:$extend),
+                [(storeop (Ty regtype:$Rt),
+                          (ro_Windexed32 GPR64sp:$Rn, GPR32:$Rm,
+                                         ro_Wextend32:$extend))]>,
+            Sched<[WriteSTIdx, ReadAdrBase]> {
+    let Inst{13} = 0b0;
+  }
+
+  let AddedComplexity = 10 in
+  def roX : LoadStore32RO<sz, V, opc, regtype, asm, (outs),
+                (ins regtype:$Rt, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend32:$extend),
+                [(storeop (Ty regtype:$Rt),
+                          (ro_Xindexed32 GPR64sp:$Rn, GPR64:$Rm,
+                                        ro_Xextend32:$extend))]>,
+            Sched<[WriteSTIdx, ReadAdrBase]> {
+    let Inst{13} = 0b1;
+  }
+
+  def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
+}
 
-  let Inst{31-30} = size;
+class LoadStore64RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+                      string asm, dag ins, dag outs, list<dag> pat>
+    : I<ins, outs, asm, "\t$Rt, [$Rn, $Rm, $extend]", "", pat> {
+  bits<5> Rt;
+  bits<5> Rn;
+  bits<5> Rm;
+  bits<2> extend;
+  let Inst{31-30} = sz;
   let Inst{29-27} = 0b111;
-  let Inst{26} = v;
+  let Inst{26}    = V;
   let Inst{25-24} = 0b00;
   let Inst{23-22} = opc;
-  let Inst{21} = 0b0;
-  let Inst{20-12} = SImm9;
-  let Inst{11-10} = 0b00;
-  // Inherit Rn in 9-5
-  // Inherit Rt in 4-0
+  let Inst{21}    = 1;
+  let Inst{20-16} = Rm;
+  let Inst{15}    = extend{1}; // sign extend Rm?
+  let Inst{14}    = 1;
+  let Inst{12}    = extend{0}; // do shift?
+  let Inst{11-10} = 0b10;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rt;
 }
 
+multiclass Load64RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+                    string asm, ValueType Ty, SDPatternOperator loadop> {
+  let AddedComplexity = 10, mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
+  def roW : LoadStore64RO<sz, V, opc, regtype, asm, (outs regtype:$Rt),
+                (ins GPR64sp:$Rn, GPR32:$Rm, ro_Wextend64:$extend),
+                [(set (Ty regtype:$Rt),
+                      (loadop (ro_Windexed64 GPR64sp:$Rn, GPR32:$Rm,
+                                             ro_Wextend64:$extend)))]>,
+           Sched<[WriteLDIdx, ReadAdrBase]> {
+    let Inst{13} = 0b0;
+  }
+
+  let AddedComplexity = 10, mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
+  def roX : LoadStore64RO<sz, V, opc, regtype, asm, (outs regtype:$Rt),
+                (ins GPR64sp:$Rn, GPR64:$Rm, ro_Xextend64:$extend),
+                 [(set (Ty regtype:$Rt),
+                       (loadop (ro_Xindexed64 GPR64sp:$Rn, GPR64:$Rm,
+                                              ro_Xextend64:$extend)))]>,
+           Sched<[WriteLDIdx, ReadAdrBase]> {
+    let Inst{13} = 0b1;
+  }
+
+  def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
+}
 
-// Format for load-store (unsigned immediate) instructions.
-class A64I_LSunsigimm<bits<2> size, bit v, bits<2> opc,
-                      dag outs, dag ins, string asmstr,
-                      list<dag> patterns, InstrItinClass itin>
-  : A64InstRtn<outs, ins, asmstr, patterns, itin> {
-  bits<12> UImm12;
+multiclass Store64RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+                     string asm, ValueType Ty, SDPatternOperator storeop> {
+  let AddedComplexity = 10, mayLoad = 0, mayStore = 1, hasSideEffects = 0 in
+  def roW : LoadStore64RO<sz, V, opc, regtype, asm, (outs),
+                (ins regtype:$Rt, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend64:$extend),
+                [(storeop (Ty regtype:$Rt),
+                          (ro_Windexed64 GPR64sp:$Rn, GPR32:$Rm,
+                                         ro_Wextend64:$extend))]>,
+            Sched<[WriteSTIdx, ReadAdrBase]> {
+    let Inst{13} = 0b0;
+  }
+
+  let AddedComplexity = 10, mayLoad = 0, mayStore = 1, hasSideEffects = 0 in
+  def roX : LoadStore64RO<sz, V, opc, regtype, asm, (outs),
+                (ins regtype:$Rt, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend64:$extend),
+                [(storeop (Ty regtype:$Rt),
+                          (ro_Xindexed64 GPR64sp:$Rn, GPR64:$Rm,
+                                         ro_Xextend64:$extend))]>,
+            Sched<[WriteSTIdx, ReadAdrBase]> {
+    let Inst{13} = 0b1;
+  }
+
+  def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
+}
 
-  let Inst{31-30} = size;
+class LoadStore128RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+                      string asm, dag ins, dag outs, list<dag> pat>
+    : I<ins, outs, asm, "\t$Rt, [$Rn, $Rm, $extend]", "", pat> {
+  bits<5> Rt;
+  bits<5> Rn;
+  bits<5> Rm;
+  bits<2> extend;
+  let Inst{31-30} = sz;
   let Inst{29-27} = 0b111;
-  let Inst{26} = v;
-  let Inst{25-24} = 0b01;
+  let Inst{26}    = V;
+  let Inst{25-24} = 0b00;
   let Inst{23-22} = opc;
-  let Inst{21-10} = UImm12;
+  let Inst{21}    = 1;
+  let Inst{20-16} = Rm;
+  let Inst{15}    = extend{1}; // sign extend Rm?
+  let Inst{14}    = 1;
+  let Inst{12}    = extend{0}; // do shift?
+  let Inst{11-10} = 0b10;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rt;
 }
 
-// Format for load-store register (register offset) instructions.
-class A64I_LSregoff<bits<2> size, bit v, bits<2> opc, bit optionlo,
-                    dag outs, dag ins, string asmstr,
-                    list<dag> patterns, InstrItinClass itin>
-  : A64InstRtn<outs, ins, asmstr, patterns, itin> {
-  bits<5> Rm;
+multiclass Load128RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+                     string asm, ValueType Ty, SDPatternOperator loadop> {
+  let AddedComplexity = 10, mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
+  def roW : LoadStore128RO<sz, V, opc, regtype, asm, (outs regtype:$Rt),
+                (ins GPR64sp:$Rn, GPR32:$Rm, ro_Wextend128:$extend),
+                 [(set (Ty regtype:$Rt),
+                       (loadop (ro_Windexed128 GPR64sp:$Rn, GPR32:$Rm,
+                                               ro_Wextend128:$extend)))]>,
+            Sched<[WriteLDIdx, ReadAdrBase]> {
+    let Inst{13} = 0b0;
+  }
+
+  let AddedComplexity = 10, mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
+  def roX : LoadStore128RO<sz, V, opc, regtype, asm, (outs regtype:$Rt),
+                (ins GPR64sp:$Rn, GPR64:$Rm, ro_Xextend128:$extend),
+                 [(set (Ty regtype:$Rt),
+                       (loadop (ro_Xindexed128 GPR64sp:$Rn, GPR64:$Rm,
+                                               ro_Xextend128:$extend)))]>,
+            Sched<[WriteLDIdx, ReadAdrBase]> {
+    let Inst{13} = 0b1;
+  }
+
+  def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
+}
 
-  // Complex operand selection needed for these instructions, so they
-  // need an "addr" field for encoding/decoding to be generated.
-  bits<3> Ext;
-  // OptionHi = Ext{2-1}
-  // S = Ext{0}
+multiclass Store128RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+                      string asm, ValueType Ty, SDPatternOperator storeop> {
+  let AddedComplexity = 10, mayLoad = 0, mayStore = 1, hasSideEffects = 0 in
+  def roW : LoadStore128RO<sz, V, opc, regtype, asm, (outs),
+               (ins regtype:$Rt, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend128:$extend),
+                [(storeop (Ty regtype:$Rt),
+                          (ro_Windexed128 GPR64sp:$Rn, GPR32:$Rm,
+                                          ro_Wextend128:$extend))]>,
+            Sched<[WriteSTIdx, ReadAdrBase]> {
+    let Inst{13} = 0b0;
+  }
+
+  let AddedComplexity = 10, mayLoad = 0, mayStore = 1, hasSideEffects = 0 in
+  def roX : LoadStore128RO<sz, V, opc, regtype, asm, (outs),
+               (ins regtype:$Rt, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend128:$extend),
+                [(storeop (Ty regtype:$Rt),
+                          (ro_Xindexed128 GPR64sp:$Rn, GPR64:$Rm,
+                                          ro_Xextend128:$extend))]>,
+            Sched<[WriteSTIdx, ReadAdrBase]> {
+    let Inst{13} = 0b1;
+  }
+
+  def : ROInstAlias<asm, regtype, !cast<Instruction>(NAME # "roX")>;
+}
 
-  let Inst{31-30} = size;
+let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in
+class BasePrefetchRO<bits<2> sz, bit V, bits<2> opc, dag outs, dag ins,
+                     string asm, list<dag> pat>
+    : I<outs, ins, asm, "\t$Rt, [$Rn, $Rm, $extend]", "", pat>,
+      Sched<[WriteLD]> {
+  bits<5> Rt;
+  bits<5> Rn;
+  bits<5> Rm;
+  bits<2> extend;
+  let Inst{31-30} = sz;
   let Inst{29-27} = 0b111;
-  let Inst{26} = v;
+  let Inst{26}    = V;
   let Inst{25-24} = 0b00;
   let Inst{23-22} = opc;
-  let Inst{21} = 0b1;
+  let Inst{21}    = 1;
   let Inst{20-16} = Rm;
-  let Inst{15-14} = Ext{2-1};
-  let Inst{13} = optionlo;
-  let Inst{12} = Ext{0};
+  let Inst{15}    = extend{1}; // sign extend Rm?
+  let Inst{14}    = 1;
+  let Inst{12}    = extend{0}; // do shift?
   let Inst{11-10} = 0b10;
-  // Inherits Rn in 9-5
-  // Inherits Rt in 4-0
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rt;
+}
 
-  let AddedComplexity = 50;
+multiclass PrefetchRO<bits<2> sz, bit V, bits<2> opc, string asm> {
+  def roW : BasePrefetchRO<sz, V, opc, (outs),
+                (ins prfop:$Rt, GPR64sp:$Rn, GPR32:$Rm, ro_Wextend64:$extend),
+                asm, [(AArch64Prefetch imm:$Rt,
+                                     (ro_Windexed64 GPR64sp:$Rn, GPR32:$Rm,
+                                                    ro_Wextend64:$extend))]> {
+    let Inst{13} = 0b0;
+  }
+
+  def roX : BasePrefetchRO<sz, V, opc, (outs),
+                (ins prfop:$Rt, GPR64sp:$Rn, GPR64:$Rm, ro_Xextend64:$extend),
+                asm,  [(AArch64Prefetch imm:$Rt,
+                                      (ro_Xindexed64 GPR64sp:$Rn, GPR64:$Rm,
+                                                     ro_Xextend64:$extend))]> {
+    let Inst{13} = 0b1;
+  }
+
+  def : InstAlias<"prfm $Rt, [$Rn, $Rm]",
+               (!cast<Instruction>(NAME # "roX") prfop:$Rt,
+                                                 GPR64sp:$Rn, GPR64:$Rm, 0, 0)>;
 }
 
-// Format for Load-store register pair (offset) instructions
-class A64I_LSPoffset<bits<2> opc, bit v, bit l,
-                      dag outs, dag ins, string asmstr,
-                      list<dag> patterns, InstrItinClass itin>
-  : A64InstRtt2n<outs, ins, asmstr, patterns, itin> {
-  bits<7> SImm7;
+//---
+// Load/store unscaled immediate
+//---
 
-  let Inst{31-30} = opc;
-  let Inst{29-27} = 0b101;
-  let Inst{26} = v;
-  let Inst{25-23} = 0b010;
-  let Inst{22} = l;
-  let Inst{21-15} = SImm7;
-  // Inherit Rt2 in 14-10
-  // Inherit Rn in 9-5
-  // Inherit Rt in 4-0
+def am_unscaled8 :  ComplexPattern<i64, 2, "SelectAddrModeUnscaled8", []>;
+def am_unscaled16 : ComplexPattern<i64, 2, "SelectAddrModeUnscaled16", []>;
+def am_unscaled32 : ComplexPattern<i64, 2, "SelectAddrModeUnscaled32", []>;
+def am_unscaled64 : ComplexPattern<i64, 2, "SelectAddrModeUnscaled64", []>;
+def am_unscaled128 :ComplexPattern<i64, 2, "SelectAddrModeUnscaled128", []>;
+
+class BaseLoadStoreUnscale<bits<2> sz, bit V, bits<2> opc, dag oops, dag iops,
+                           string asm, list<dag> pattern>
+    : I<oops, iops, asm, "\t$Rt, [$Rn, $offset]", "", pattern> {
+  bits<5> Rt;
+  bits<5> Rn;
+  bits<9> offset;
+  let Inst{31-30} = sz;
+  let Inst{29-27} = 0b111;
+  let Inst{26}    = V;
+  let Inst{25-24} = 0b00;
+  let Inst{23-22} = opc;
+  let Inst{21}    = 0;
+  let Inst{20-12} = offset;
+  let Inst{11-10} = 0b00;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rt;
+
+  let DecoderMethod = "DecodeSignedLdStInstruction";
+}
+
+multiclass LoadUnscaled<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+                   string asm, list<dag> pattern> {
+  let AddedComplexity = 1 in // try this before LoadUI
+  def i : BaseLoadStoreUnscale<sz, V, opc, (outs regtype:$Rt),
+                               (ins GPR64sp:$Rn, simm9:$offset), asm, pattern>,
+          Sched<[WriteLD]>;
+
+  def : InstAlias<asm # " $Rt, [$Rn]",
+                  (!cast<Instruction>(NAME # "i") regtype:$Rt, GPR64sp:$Rn, 0)>;
+}
+
+multiclass StoreUnscaled<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+                         string asm, list<dag> pattern> {
+  let AddedComplexity = 1 in // try this before StoreUI
+  def i : BaseLoadStoreUnscale<sz, V, opc, (outs),
+                               (ins regtype:$Rt, GPR64sp:$Rn, simm9:$offset),
+                               asm, pattern>,
+          Sched<[WriteST]>;
+
+  def : InstAlias<asm # " $Rt, [$Rn]",
+                  (!cast<Instruction>(NAME # "i") regtype:$Rt, GPR64sp:$Rn, 0)>;
+}
+
+multiclass PrefetchUnscaled<bits<2> sz, bit V, bits<2> opc, string asm,
+                            list<dag> pat> {
+  let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in
+  def i : BaseLoadStoreUnscale<sz, V, opc, (outs),
+                               (ins prfop:$Rt, GPR64sp:$Rn, simm9:$offset),
+                               asm, pat>,
+          Sched<[WriteLD]>;
+
+  def : InstAlias<asm # " $Rt, [$Rn]",
+                  (!cast<Instruction>(NAME # "i") prfop:$Rt, GPR64sp:$Rn, 0)>;
+}
+
+//---
+// Load/store unscaled immediate, unprivileged
+//---
+
+class BaseLoadStoreUnprivileged<bits<2> sz, bit V, bits<2> opc,
+                                dag oops, dag iops, string asm>
+    : I<oops, iops, asm, "\t$Rt, [$Rn, $offset]", "", []> {
+  bits<5> Rt;
+  bits<5> Rn;
+  bits<9> offset;
+  let Inst{31-30} = sz;
+  let Inst{29-27} = 0b111;
+  let Inst{26}    = V;
+  let Inst{25-24} = 0b00;
+  let Inst{23-22} = opc;
+  let Inst{21}    = 0;
+  let Inst{20-12} = offset;
+  let Inst{11-10} = 0b10;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rt;
+
+  let DecoderMethod = "DecodeSignedLdStInstruction";
+}
+
+multiclass LoadUnprivileged<bits<2> sz, bit V, bits<2> opc,
+                            RegisterClass regtype, string asm> {
+  let mayStore = 0, mayLoad = 1, hasSideEffects = 0 in
+  def i : BaseLoadStoreUnprivileged<sz, V, opc, (outs regtype:$Rt),
+                                    (ins GPR64sp:$Rn, simm9:$offset), asm>,
+          Sched<[WriteLD]>;
+
+  def : InstAlias<asm # " $Rt, [$Rn]",
+                  (!cast<Instruction>(NAME # "i") regtype:$Rt, GPR64sp:$Rn, 0)>;
+}
+
+multiclass StoreUnprivileged<bits<2> sz, bit V, bits<2> opc,
+                             RegisterClass regtype, string asm> {
+  let mayStore = 1, mayLoad = 0, hasSideEffects = 0 in
+  def i : BaseLoadStoreUnprivileged<sz, V, opc, (outs),
+                                 (ins regtype:$Rt, GPR64sp:$Rn, simm9:$offset),
+                                 asm>,
+          Sched<[WriteST]>;
+
+  def : InstAlias<asm # " $Rt, [$Rn]",
+                  (!cast<Instruction>(NAME # "i") regtype:$Rt, GPR64sp:$Rn, 0)>;
 }
 
-// Format for Load-store register pair (post-indexed) instructions
-class A64I_LSPpostind<bits<2> opc, bit v, bit l,
-                      dag outs, dag ins, string asmstr,
-                      list<dag> patterns, InstrItinClass itin>
-  : A64InstRtt2n<outs, ins, asmstr, patterns, itin> {
-  bits<7> SImm7;
+//---
+// Load/store pre-indexed
+//---
+
+class BaseLoadStorePreIdx<bits<2> sz, bit V, bits<2> opc, dag oops, dag iops,
+                          string asm, string cstr, list<dag> pat>
+    : I<oops, iops, asm, "\t$Rt, [$Rn, $offset]!", cstr, pat> {
+  bits<5> Rt;
+  bits<5> Rn;
+  bits<9> offset;
+  let Inst{31-30} = sz;
+  let Inst{29-27} = 0b111;
+  let Inst{26}    = V;
+  let Inst{25-24} = 0;
+  let Inst{23-22} = opc;
+  let Inst{21}    = 0;
+  let Inst{20-12} = offset;
+  let Inst{11-10} = 0b11;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rt;
+
+  let DecoderMethod = "DecodeSignedLdStInstruction";
+}
 
+let hasSideEffects = 0 in {
+let mayStore = 0, mayLoad = 1 in
+class LoadPreIdx<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+             string asm>
+    : BaseLoadStorePreIdx<sz, V, opc,
+                     (outs GPR64sp:$wback, regtype:$Rt),
+                     (ins GPR64sp:$Rn, simm9:$offset), asm,
+                     "$Rn = $wback", []>,
+      Sched<[WriteLD, WriteAdr]>;
+
+let mayStore = 1, mayLoad = 0 in
+class StorePreIdx<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+                  string asm, SDPatternOperator storeop, ValueType Ty>
+    : BaseLoadStorePreIdx<sz, V, opc,
+                      (outs GPR64sp:$wback),
+                      (ins regtype:$Rt, GPR64sp:$Rn, simm9:$offset),
+                      asm, "$Rn = $wback",
+      [(set GPR64sp:$wback,
+            (storeop (Ty regtype:$Rt), GPR64sp:$Rn, simm9:$offset))]>,
+      Sched<[WriteAdr, WriteST]>;
+} // hasSideEffects = 0
+
+//---
+// Load/store post-indexed
+//---
+
+// (pre-index) load/stores.
+class BaseLoadStorePostIdx<bits<2> sz, bit V, bits<2> opc, dag oops, dag iops,
+                          string asm, string cstr, list<dag> pat>
+    : I<oops, iops, asm, "\t$Rt, [$Rn], $offset", cstr, pat> {
+  bits<5> Rt;
+  bits<5> Rn;
+  bits<9> offset;
+  let Inst{31-30} = sz;
+  let Inst{29-27} = 0b111;
+  let Inst{26}    = V;
+  let Inst{25-24} = 0b00;
+  let Inst{23-22} = opc;
+  let Inst{21}    = 0b0;
+  let Inst{20-12} = offset;
+  let Inst{11-10} = 0b01;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rt;
+
+  let DecoderMethod = "DecodeSignedLdStInstruction";
+}
+
+let hasSideEffects = 0 in {
+let mayStore = 0, mayLoad = 1 in
+class LoadPostIdx<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+             string asm>
+    : BaseLoadStorePostIdx<sz, V, opc,
+                      (outs GPR64sp:$wback, regtype:$Rt),
+                      (ins GPR64sp:$Rn, simm9:$offset),
+                      asm, "$Rn = $wback", []>,
+      Sched<[WriteLD, WriteI]>;
+
+let mayStore = 1, mayLoad = 0 in
+class StorePostIdx<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
+                   string asm, SDPatternOperator storeop, ValueType Ty>
+    : BaseLoadStorePostIdx<sz, V, opc,
+                      (outs GPR64sp:$wback),
+                      (ins regtype:$Rt, GPR64sp:$Rn, simm9:$offset),
+                       asm, "$Rn = $wback",
+      [(set GPR64sp:$wback,
+            (storeop (Ty regtype:$Rt), GPR64sp:$Rn, simm9:$offset))]>,
+    Sched<[WriteAdr, WriteST, ReadAdrBase]>;
+} // hasSideEffects = 0
+
+
+//---
+// Load/store pair
+//---
+
+// (indexed, offset)
+
+class BaseLoadStorePairOffset<bits<2> opc, bit V, bit L, dag oops, dag iops,
+                              string asm>
+    : I<oops, iops, asm, "\t$Rt, $Rt2, [$Rn, $offset]", "", []> {
+  bits<5> Rt;
+  bits<5> Rt2;
+  bits<5> Rn;
+  bits<7> offset;
   let Inst{31-30} = opc;
   let Inst{29-27} = 0b101;
-  let Inst{26} = v;
-  let Inst{25-23} = 0b001;
-  let Inst{22} = l;
-  let Inst{21-15} = SImm7;
-  // Inherit Rt2 in 14-10
-  // Inherit Rn in 9-5
-  // Inherit Rt in 4-0
+  let Inst{26}    = V;
+  let Inst{25-23} = 0b010;
+  let Inst{22}    = L;
+  let Inst{21-15} = offset;
+  let Inst{14-10} = Rt2;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rt;
+
+  let DecoderMethod = "DecodePairLdStInstruction";
+}
+
+multiclass LoadPairOffset<bits<2> opc, bit V, RegisterClass regtype,
+                          Operand indextype, string asm> {
+  let hasSideEffects = 0, mayStore = 0, mayLoad = 1 in
+  def i : BaseLoadStorePairOffset<opc, V, 1,
+                                  (outs regtype:$Rt, regtype:$Rt2),
+                                  (ins GPR64sp:$Rn, indextype:$offset), asm>,
+          Sched<[WriteLD, WriteLDHi]>;
+
+  def : InstAlias<asm # " $Rt, $Rt2, [$Rn]",
+                  (!cast<Instruction>(NAME # "i") regtype:$Rt, regtype:$Rt2,
+                                                  GPR64sp:$Rn, 0)>;
 }
 
-// Format for Load-store register pair (pre-indexed) instructions
-class A64I_LSPpreind<bits<2> opc, bit v, bit l,
-                      dag outs, dag ins, string asmstr,
-                      list<dag> patterns, InstrItinClass itin>
-  : A64InstRtt2n<outs, ins, asmstr, patterns, itin> {
-  bits<7> SImm7;
 
+multiclass StorePairOffset<bits<2> opc, bit V, RegisterClass regtype,
+                           Operand indextype, string asm> {
+  let hasSideEffects = 0, mayLoad = 0, mayStore = 1 in
+  def i : BaseLoadStorePairOffset<opc, V, 0, (outs),
+                                  (ins regtype:$Rt, regtype:$Rt2,
+                                       GPR64sp:$Rn, indextype:$offset),
+                                  asm>,
+          Sched<[WriteSTP]>;
+
+  def : InstAlias<asm # " $Rt, $Rt2, [$Rn]",
+                  (!cast<Instruction>(NAME # "i") regtype:$Rt, regtype:$Rt2,
+                                                  GPR64sp:$Rn, 0)>;
+}
+
+// (pre-indexed)
+class BaseLoadStorePairPreIdx<bits<2> opc, bit V, bit L, dag oops, dag iops,
+                              string asm>
+    : I<oops, iops, asm, "\t$Rt, $Rt2, [$Rn, $offset]!", "$Rn = $wback", []> {
+  bits<5> Rt;
+  bits<5> Rt2;
+  bits<5> Rn;
+  bits<7> offset;
   let Inst{31-30} = opc;
   let Inst{29-27} = 0b101;
-  let Inst{26} = v;
+  let Inst{26}    = V;
   let Inst{25-23} = 0b011;
-  let Inst{22} = l;
-  let Inst{21-15} = SImm7;
-  // Inherit Rt2 in 14-10
-  // Inherit Rn in 9-5
-  // Inherit Rt in 4-0
+  let Inst{22}    = L;
+  let Inst{21-15} = offset;
+  let Inst{14-10} = Rt2;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rt;
+
+  let DecoderMethod = "DecodePairLdStInstruction";
 }
 
-// Format for Load-store non-temporal register pair (offset) instructions
-class A64I_LSPnontemp<bits<2> opc, bit v, bit l,
-                      dag outs, dag ins, string asmstr,
-                      list<dag> patterns, InstrItinClass itin>
-  : A64InstRtt2n<outs, ins, asmstr, patterns, itin> {
-  bits<7> SImm7;
+let hasSideEffects = 0 in {
+let mayStore = 0, mayLoad = 1 in
+class LoadPairPreIdx<bits<2> opc, bit V, RegisterClass regtype,
+                     Operand indextype, string asm>
+    : BaseLoadStorePairPreIdx<opc, V, 1,
+                              (outs GPR64sp:$wback, regtype:$Rt, regtype:$Rt2),
+                              (ins GPR64sp:$Rn, indextype:$offset), asm>,
+      Sched<[WriteLD, WriteLDHi, WriteAdr]>;
+
+let mayStore = 1, mayLoad = 0 in
+class StorePairPreIdx<bits<2> opc, bit V, RegisterClass regtype,
+                      Operand indextype, string asm>
+    : BaseLoadStorePairPreIdx<opc, V, 0, (outs GPR64sp:$wback),
+                             (ins regtype:$Rt, regtype:$Rt2,
+                                  GPR64sp:$Rn, indextype:$offset),
+                             asm>,
+      Sched<[WriteAdr, WriteSTP]>;
+} // hasSideEffects = 0
+
+// (post-indexed)
+
+class BaseLoadStorePairPostIdx<bits<2> opc, bit V, bit L, dag oops, dag iops,
+                              string asm>
+    : I<oops, iops, asm, "\t$Rt, $Rt2, [$Rn], $offset", "$Rn = $wback", []> {
+  bits<5> Rt;
+  bits<5> Rt2;
+  bits<5> Rn;
+  bits<7> offset;
+  let Inst{31-30} = opc;
+  let Inst{29-27} = 0b101;
+  let Inst{26}    = V;
+  let Inst{25-23} = 0b001;
+  let Inst{22}    = L;
+  let Inst{21-15} = offset;
+  let Inst{14-10} = Rt2;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rt;
 
+  let DecoderMethod = "DecodePairLdStInstruction";
+}
+
+let hasSideEffects = 0 in {
+let mayStore = 0, mayLoad = 1 in
+class LoadPairPostIdx<bits<2> opc, bit V, RegisterClass regtype,
+                      Operand idxtype, string asm>
+    : BaseLoadStorePairPostIdx<opc, V, 1,
+                              (outs GPR64sp:$wback, regtype:$Rt, regtype:$Rt2),
+                              (ins GPR64sp:$Rn, idxtype:$offset), asm>,
+      Sched<[WriteLD, WriteLDHi, WriteAdr]>;
+
+let mayStore = 1, mayLoad = 0 in
+class StorePairPostIdx<bits<2> opc, bit V, RegisterClass regtype,
+                       Operand idxtype, string asm>
+    : BaseLoadStorePairPostIdx<opc, V, 0, (outs),
+                             (ins GPR64sp:$wback, regtype:$Rt, regtype:$Rt2,
+                                  GPR64sp:$Rn, idxtype:$offset),
+                             asm>,
+      Sched<[WriteAdr, WriteSTP]>;
+} // hasSideEffects = 0
+
+//  (no-allocate)
+
+class BaseLoadStorePairNoAlloc<bits<2> opc, bit V, bit L, dag oops, dag iops,
+                              string asm>
+    : I<oops, iops, asm, "\t$Rt, $Rt2, [$Rn, $offset]", "", []> {
+  bits<5> Rt;
+  bits<5> Rt2;
+  bits<5> Rn;
+  bits<7> offset;
   let Inst{31-30} = opc;
   let Inst{29-27} = 0b101;
-  let Inst{26} = v;
+  let Inst{26}    = V;
   let Inst{25-23} = 0b000;
-  let Inst{22} = l;
-  let Inst{21-15} = SImm7;
-  // Inherit Rt2 in 14-10
-  // Inherit Rn in 9-5
-  // Inherit Rt in 4-0
-}
-
-// Format for Logical (immediate) instructions
-class A64I_logicalimm<bit sf, bits<2> opc,
-                      dag outs, dag ins, string asmstr,
-                      list<dag> patterns, InstrItinClass itin>
-  : A64InstRdn<outs, ins, asmstr, patterns, itin> {
-  bit N;
-  bits<6> ImmR;
-  bits<6> ImmS;
-
-  // N, ImmR and ImmS have no separate existence in any assembly syntax (or for
-  // selection), so we'll combine them into a single field here.
-  bits<13> Imm;
-  // N = Imm{12};
-  // ImmR = Imm{11-6};
-  // ImmS = Imm{5-0};
+  let Inst{22}    = L;
+  let Inst{21-15} = offset;
+  let Inst{14-10} = Rt2;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rt;
 
-  let Inst{31} = sf;
-  let Inst{30-29} = opc;
-  let Inst{28-23} = 0b100100;
-  let Inst{22} = Imm{12};
-  let Inst{21-16} = Imm{11-6};
-  let Inst{15-10} = Imm{5-0};
-  // Rn inherited in 9-5
-  // Rd inherited in 4-0
+  let DecoderMethod = "DecodePairLdStInstruction";
 }
 
-// Format for Logical (shifted register) instructions
-class A64I_logicalshift<bit sf, bits<2> opc, bits<2> shift, bit N,
-                        dag outs, dag ins, string asmstr,
-                        list<dag> patterns, InstrItinClass itin>
-  : A64InstRdnm<outs, ins, asmstr, patterns, itin> {
-  bits<6> Imm6;
+multiclass LoadPairNoAlloc<bits<2> opc, bit V, RegisterClass regtype,
+                           Operand indextype, string asm> {
+  let hasSideEffects = 0, mayStore = 0, mayLoad = 1 in
+  def i : BaseLoadStorePairNoAlloc<opc, V, 1,
+                                   (outs regtype:$Rt, regtype:$Rt2),
+                                   (ins GPR64sp:$Rn, indextype:$offset), asm>,
+          Sched<[WriteLD, WriteLDHi]>;
 
-  let Inst{31} = sf;
-  let Inst{30-29} = opc;
-  let Inst{28-24} = 0b01010;
-  let Inst{23-22} = shift;
-  let Inst{21} = N;
-  // Rm inherited
-  let Inst{15-10} = Imm6;
-  // Rn inherited
-  // Rd inherited
-}
-
-// Format for Move wide (immediate)
-class A64I_movw<bit sf, bits<2> opc,
-                dag outs, dag ins, string asmstr,
-                list<dag> patterns, InstrItinClass itin>
-  : A64InstRd<outs, ins, asmstr, patterns, itin> {
-  bits<16> UImm16;
-  bits<2> Shift; // Called "hw" officially
 
-  let Inst{31} = sf;
-  let Inst{30-29} = opc;
-  let Inst{28-23} = 0b100101;
-  let Inst{22-21} = Shift;
-  let Inst{20-5} = UImm16;
-  // Inherits Rd in 4-0
+  def : InstAlias<asm # "\t$Rt, $Rt2, [$Rn]",
+                  (!cast<Instruction>(NAME # "i") regtype:$Rt, regtype:$Rt2,
+                                                  GPR64sp:$Rn, 0)>;
 }
 
-// Format for PC-relative addressing instructions, ADR and ADRP.
-class A64I_PCADR<bit op,
-                 dag outs, dag ins, string asmstr,
-                 list<dag> patterns, InstrItinClass itin>
-  : A64InstRd<outs, ins, asmstr, patterns, itin> {
-  bits<21> Label;
+multiclass StorePairNoAlloc<bits<2> opc, bit V, RegisterClass regtype,
+                      Operand indextype, string asm> {
+  let hasSideEffects = 0, mayStore = 1, mayLoad = 0 in
+  def i : BaseLoadStorePairNoAlloc<opc, V, 0, (outs),
+                                   (ins regtype:$Rt, regtype:$Rt2,
+                                        GPR64sp:$Rn, indextype:$offset),
+                                   asm>,
+          Sched<[WriteSTP]>;
+
+  def : InstAlias<asm # "\t$Rt, $Rt2, [$Rn]",
+                  (!cast<Instruction>(NAME # "i") regtype:$Rt, regtype:$Rt2,
+                                                  GPR64sp:$Rn, 0)>;
+}
 
-  let Inst{31} = op;
-  let Inst{30-29} = Label{1-0};
-  let Inst{28-24} = 0b10000;
-  let Inst{23-5} = Label{20-2};
+//---
+// Load/store exclusive
+//---
+
+// True exclusive operations write to and/or read from the system's exclusive
+// monitors, which as far as a compiler is concerned can be modelled as a
+// random shared memory address. Hence LoadExclusive mayStore.
+//
+// Since these instructions have the undefined register bits set to 1 in
+// their canonical form, we need a post encoder method to set those bits
+// to 1 when encoding these instructions. We do this using the
+// fixLoadStoreExclusive function. This function has template parameters:
+//
+// fixLoadStoreExclusive<int hasRs, int hasRt2>
+//
+// hasRs indicates that the instruction uses the Rs field, so we won't set
+// it to 1 (and the same for Rt2). We don't need template parameters for
+// the other register fields since Rt and Rn are always used.
+//
+let hasSideEffects = 1, mayLoad = 1, mayStore = 1 in
+class BaseLoadStoreExclusive<bits<2> sz, bit o2, bit L, bit o1, bit o0,
+                             dag oops, dag iops, string asm, string operands>
+    : I<oops, iops, asm, operands, "", []> {
+  let Inst{31-30} = sz;
+  let Inst{29-24} = 0b001000;
+  let Inst{23}    = o2;
+  let Inst{22}    = L;
+  let Inst{21}    = o1;
+  let Inst{15}    = o0;
+
+  let DecoderMethod = "DecodeExclusiveLdStInstruction";
 }
 
-// Format for system instructions
-class A64I_system<bit l,
-                  dag outs, dag ins, string asmstr,
-                  list<dag> patterns, InstrItinClass itin>
-  : A64Inst<outs, ins, asmstr, patterns, itin> {
-  bits<2> Op0;
-  bits<3> Op1;
-  bits<4> CRn;
-  bits<4> CRm;
-  bits<3> Op2;
+// Neither Rs nor Rt2 operands.
+class LoadStoreExclusiveSimple<bits<2> sz, bit o2, bit L, bit o1, bit o0,
+                               dag oops, dag iops, string asm, string operands>
+    : BaseLoadStoreExclusive<sz, o2, L, o1, o0, oops, iops, asm, operands> {
   bits<5> Rt;
+  bits<5> Rn;
+  let Inst{9-5} = Rn;
+  let Inst{4-0} = Rt;
 
-  let Inst{31-22} = 0b1101010100;
-  let Inst{21} = l;
-  let Inst{20-19} = Op0;
-  let Inst{18-16} = Op1;
-  let Inst{15-12} = CRn;
-  let Inst{11-8} = CRm;
-  let Inst{7-5} = Op2;
+  let PostEncoderMethod = "fixLoadStoreExclusive<0,0>";
+}
+
+// Simple load acquires don't set the exclusive monitor
+let mayLoad = 1, mayStore = 0 in
+class LoadAcquire<bits<2> sz, bit o2, bit L, bit o1, bit o0,
+                  RegisterClass regtype, string asm>
+    : LoadStoreExclusiveSimple<sz, o2, L, o1, o0, (outs regtype:$Rt),
+                               (ins GPR64sp0:$Rn), asm, "\t$Rt, [$Rn]">,
+      Sched<[WriteLD]>;
+
+class LoadExclusive<bits<2> sz, bit o2, bit L, bit o1, bit o0,
+                    RegisterClass regtype, string asm>
+    : LoadStoreExclusiveSimple<sz, o2, L, o1, o0, (outs regtype:$Rt),
+                               (ins GPR64sp0:$Rn), asm, "\t$Rt, [$Rn]">,
+      Sched<[WriteLD]>;
+
+class LoadExclusivePair<bits<2> sz, bit o2, bit L, bit o1, bit o0,
+                       RegisterClass regtype, string asm>
+    : BaseLoadStoreExclusive<sz, o2, L, o1, o0,
+                             (outs regtype:$Rt, regtype:$Rt2),
+                             (ins GPR64sp0:$Rn), asm,
+                             "\t$Rt, $Rt2, [$Rn]">,
+      Sched<[WriteLD, WriteLDHi]> {
+  bits<5> Rt;
+  bits<5> Rt2;
+  bits<5> Rn;
+  let Inst{14-10} = Rt2;
+  let Inst{9-5} = Rn;
   let Inst{4-0} = Rt;
 
-  // These instructions can do horrible things.
-  let hasSideEffects = 1;
+  let PostEncoderMethod = "fixLoadStoreExclusive<0,1>";
 }
 
-// Format for unconditional branch (immediate) instructions
-class A64I_Bimm<bit op,
-                dag outs, dag ins, string asmstr,
-                list<dag> patterns, InstrItinClass itin>
-  : A64Inst<outs, ins, asmstr, patterns, itin> {
-  // Doubly special in not even sharing register fields with other
-  // instructions, so we create our own Rn here.
-  bits<26> Label;
+// Simple store release operations do not check the exclusive monitor.
+let mayLoad = 0, mayStore = 1 in
+class StoreRelease<bits<2> sz, bit o2, bit L, bit o1, bit o0,
+                   RegisterClass regtype, string asm>
+    : LoadStoreExclusiveSimple<sz, o2, L, o1, o0, (outs),
+                               (ins regtype:$Rt, GPR64sp0:$Rn),
+                               asm, "\t$Rt, [$Rn]">,
+      Sched<[WriteST]>;
+
+let mayLoad = 1, mayStore = 1 in
+class StoreExclusive<bits<2> sz, bit o2, bit L, bit o1, bit o0,
+                     RegisterClass regtype, string asm>
+    : BaseLoadStoreExclusive<sz, o2, L, o1, o0, (outs GPR32:$Ws),
+                             (ins regtype:$Rt, GPR64sp0:$Rn),
+                             asm, "\t$Ws, $Rt, [$Rn]">,
+      Sched<[WriteSTX]> {
+  bits<5> Ws;
+  bits<5> Rt;
+  bits<5> Rn;
+  let Inst{20-16} = Ws;
+  let Inst{9-5} = Rn;
+  let Inst{4-0} = Rt;
 
-  let Inst{31} = op;
-  let Inst{30-26} = 0b00101;
-  let Inst{25-0} = Label;
+  let Constraints = "@earlyclobber $Ws";
+  let PostEncoderMethod = "fixLoadStoreExclusive<1,0>";
+}
+
+class StoreExclusivePair<bits<2> sz, bit o2, bit L, bit o1, bit o0,
+                         RegisterClass regtype, string asm>
+    : BaseLoadStoreExclusive<sz, o2, L, o1, o0,
+                             (outs GPR32:$Ws),
+                             (ins regtype:$Rt, regtype:$Rt2, GPR64sp0:$Rn),
+                              asm, "\t$Ws, $Rt, $Rt2, [$Rn]">,
+      Sched<[WriteSTX]> {
+  bits<5> Ws;
+  bits<5> Rt;
+  bits<5> Rt2;
+  bits<5> Rn;
+  let Inst{20-16} = Ws;
+  let Inst{14-10} = Rt2;
+  let Inst{9-5} = Rn;
+  let Inst{4-0} = Rt;
+
+  let Constraints = "@earlyclobber $Ws";
 }
 
-// Format for Test & branch (immediate) instructions
-class A64I_TBimm<bit op,
-                dag outs, dag ins, string asmstr,
-                list<dag> patterns, InstrItinClass itin>
-  : A64InstRt<outs, ins, asmstr, patterns, itin> {
-  // Doubly special in not even sharing register fields with other
-  // instructions, so we create our own Rn here.
-  bits<6> Imm;
-  bits<14> Label;
+//---
+// Exception generation
+//---
 
-  let Inst{31} = Imm{5};
-  let Inst{30-25} = 0b011011;
-  let Inst{24} = op;
-  let Inst{23-19} = Imm{4-0};
-  let Inst{18-5} = Label;
-  // Inherit Rt in 4-0
+let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in
+class ExceptionGeneration<bits<3> op1, bits<2> ll, string asm>
+    : I<(outs), (ins imm0_65535:$imm), asm, "\t$imm", "", []>,
+      Sched<[WriteSys]> {
+  bits<16> imm;
+  let Inst{31-24} = 0b11010100;
+  let Inst{23-21} = op1;
+  let Inst{20-5}  = imm;
+  let Inst{4-2}   = 0b000;
+  let Inst{1-0}   = ll;
+}
+
+let Predicates = [HasFPARMv8] in {
+
+//---
+// Floating point to integer conversion
+//---
+
+class BaseFPToIntegerUnscaled<bits<2> type, bits<2> rmode, bits<3> opcode,
+                      RegisterClass srcType, RegisterClass dstType,
+                      string asm, list<dag> pattern>
+    : I<(outs dstType:$Rd), (ins srcType:$Rn),
+         asm, "\t$Rd, $Rn", "", pattern>,
+      Sched<[WriteFCvt]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  let Inst{30-29} = 0b00;
+  let Inst{28-24} = 0b11110;
+  let Inst{23-22} = type;
+  let Inst{21}    = 1;
+  let Inst{20-19} = rmode;
+  let Inst{18-16} = opcode;
+  let Inst{15-10} = 0;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
 }
 
-// Format for Unconditional branch (register) instructions, including
-// RET.  Shares no fields with instructions further up the hierarchy
-// so top-level.
-class A64I_Breg<bits<4> opc, bits<5> op2, bits<6> op3, bits<5> op4,
-                dag outs, dag ins, string asmstr,
-                list<dag> patterns, InstrItinClass itin>
-  : A64Inst<outs, ins, asmstr, patterns, itin> {
-  // Doubly special in not even sharing register fields with other
-  // instructions, so we create our own Rn here.
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseFPToInteger<bits<2> type, bits<2> rmode, bits<3> opcode,
+                      RegisterClass srcType, RegisterClass dstType,
+                      Operand immType, string asm, list<dag> pattern>
+    : I<(outs dstType:$Rd), (ins srcType:$Rn, immType:$scale),
+         asm, "\t$Rd, $Rn, $scale", "", pattern>,
+      Sched<[WriteFCvt]> {
+  bits<5> Rd;
   bits<5> Rn;
+  bits<6> scale;
+  let Inst{30-29} = 0b00;
+  let Inst{28-24} = 0b11110;
+  let Inst{23-22} = type;
+  let Inst{21}    = 0;
+  let Inst{20-19} = rmode;
+  let Inst{18-16} = opcode;
+  let Inst{15-10} = scale;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
 
-  let Inst{31-25} = 0b1101011;
-  let Inst{24-21} = opc;
-  let Inst{20-16} = op2;
-  let Inst{15-10} = op3;
+multiclass FPToIntegerUnscaled<bits<2> rmode, bits<3> opcode, string asm,
+           SDPatternOperator OpN> {
+  // Unscaled single-precision to 32-bit
+  def UWSr : BaseFPToIntegerUnscaled<0b00, rmode, opcode, FPR32, GPR32, asm,
+                                     [(set GPR32:$Rd, (OpN FPR32:$Rn))]> {
+    let Inst{31} = 0; // 32-bit GPR flag
+  }
+
+  // Unscaled single-precision to 64-bit
+  def UXSr : BaseFPToIntegerUnscaled<0b00, rmode, opcode, FPR32, GPR64, asm,
+                                     [(set GPR64:$Rd, (OpN FPR32:$Rn))]> {
+    let Inst{31} = 1; // 64-bit GPR flag
+  }
+
+  // Unscaled double-precision to 32-bit
+  def UWDr : BaseFPToIntegerUnscaled<0b01, rmode, opcode, FPR64, GPR32, asm,
+                                     [(set GPR32:$Rd, (OpN (f64 FPR64:$Rn)))]> {
+    let Inst{31} = 0; // 32-bit GPR flag
+  }
+
+  // Unscaled double-precision to 64-bit
+  def UXDr : BaseFPToIntegerUnscaled<0b01, rmode, opcode, FPR64, GPR64, asm,
+                                     [(set GPR64:$Rd, (OpN (f64 FPR64:$Rn)))]> {
+    let Inst{31} = 1; // 64-bit GPR flag
+  }
+}
+
+multiclass FPToIntegerScaled<bits<2> rmode, bits<3> opcode, string asm,
+                             SDPatternOperator OpN> {
+  // Scaled single-precision to 32-bit
+  def SWSri : BaseFPToInteger<0b00, rmode, opcode, FPR32, GPR32,
+                              fixedpoint_f32_i32, asm,
+              [(set GPR32:$Rd, (OpN (fmul FPR32:$Rn,
+                                          fixedpoint_f32_i32:$scale)))]> {
+    let Inst{31} = 0; // 32-bit GPR flag
+    let scale{5} = 1;
+  }
+
+  // Scaled single-precision to 64-bit
+  def SXSri : BaseFPToInteger<0b00, rmode, opcode, FPR32, GPR64,
+                              fixedpoint_f32_i64, asm,
+              [(set GPR64:$Rd, (OpN (fmul FPR32:$Rn,
+                                          fixedpoint_f32_i64:$scale)))]> {
+    let Inst{31} = 1; // 64-bit GPR flag
+  }
+
+  // Scaled double-precision to 32-bit
+  def SWDri : BaseFPToInteger<0b01, rmode, opcode, FPR64, GPR32,
+                              fixedpoint_f64_i32, asm,
+              [(set GPR32:$Rd, (OpN (fmul FPR64:$Rn,
+                                          fixedpoint_f64_i32:$scale)))]> {
+    let Inst{31} = 0; // 32-bit GPR flag
+    let scale{5} = 1;
+  }
+
+  // Scaled double-precision to 64-bit
+  def SXDri : BaseFPToInteger<0b01, rmode, opcode, FPR64, GPR64,
+                              fixedpoint_f64_i64, asm,
+              [(set GPR64:$Rd, (OpN (fmul FPR64:$Rn,
+                                          fixedpoint_f64_i64:$scale)))]> {
+    let Inst{31} = 1; // 64-bit GPR flag
+  }
+}
+
+//---
+// Integer to floating point conversion
+//---
+
+let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in
+class BaseIntegerToFP<bit isUnsigned,
+                      RegisterClass srcType, RegisterClass dstType,
+                      Operand immType, string asm, list<dag> pattern>
+    : I<(outs dstType:$Rd), (ins srcType:$Rn, immType:$scale),
+         asm, "\t$Rd, $Rn, $scale", "", pattern>,
+      Sched<[WriteFCvt]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  bits<6> scale;
+  let Inst{30-23} = 0b00111100;
+  let Inst{21-17} = 0b00001;
+  let Inst{16}    = isUnsigned;
+  let Inst{15-10} = scale;
   let Inst{9-5}   = Rn;
-  let Inst{4-0}   = op4;
+  let Inst{4-0}   = Rd;
 }
 
+class BaseIntegerToFPUnscaled<bit isUnsigned,
+                      RegisterClass srcType, RegisterClass dstType,
+                      ValueType dvt, string asm, SDNode node>
+    : I<(outs dstType:$Rd), (ins srcType:$Rn),
+         asm, "\t$Rd, $Rn", "", [(set (dvt dstType:$Rd), (node srcType:$Rn))]>,
+      Sched<[WriteFCvt]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  bits<6> scale;
+  let Inst{30-23} = 0b00111100;
+  let Inst{21-17} = 0b10001;
+  let Inst{16}    = isUnsigned;
+  let Inst{15-10} = 0b000000;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
 
-//===----------------------------------------------------------------------===//
-//
-// Neon Instruction Format Definitions.
-//
+multiclass IntegerToFP<bit isUnsigned, string asm, SDNode node> {
+  // Unscaled
+  def UWSri: BaseIntegerToFPUnscaled<isUnsigned, GPR32, FPR32, f32, asm, node> {
+    let Inst{31} = 0; // 32-bit GPR flag
+    let Inst{22} = 0; // 32-bit FPR flag
+  }
+
+  def UWDri: BaseIntegerToFPUnscaled<isUnsigned, GPR32, FPR64, f64, asm, node> {
+    let Inst{31} = 0; // 32-bit GPR flag
+    let Inst{22} = 1; // 64-bit FPR flag
+  }
+
+  def UXSri: BaseIntegerToFPUnscaled<isUnsigned, GPR64, FPR32, f32, asm, node> {
+    let Inst{31} = 1; // 64-bit GPR flag
+    let Inst{22} = 0; // 32-bit FPR flag
+  }
+
+  def UXDri: BaseIntegerToFPUnscaled<isUnsigned, GPR64, FPR64, f64, asm, node> {
+    let Inst{31} = 1; // 64-bit GPR flag
+    let Inst{22} = 1; // 64-bit FPR flag
+  }
+
+  // Scaled
+  def SWSri: BaseIntegerToFP<isUnsigned, GPR32, FPR32, fixedpoint_f32_i32, asm,
+                             [(set FPR32:$Rd,
+                                   (fdiv (node GPR32:$Rn),
+                                         fixedpoint_f32_i32:$scale))]> {
+    let Inst{31} = 0; // 32-bit GPR flag
+    let Inst{22} = 0; // 32-bit FPR flag
+    let scale{5} = 1;
+  }
+
+  def SWDri: BaseIntegerToFP<isUnsigned, GPR32, FPR64, fixedpoint_f64_i32, asm,
+                             [(set FPR64:$Rd,
+                                   (fdiv (node GPR32:$Rn),
+                                         fixedpoint_f64_i32:$scale))]> {
+    let Inst{31} = 0; // 32-bit GPR flag
+    let Inst{22} = 1; // 64-bit FPR flag
+    let scale{5} = 1;
+  }
+
+  def SXSri: BaseIntegerToFP<isUnsigned, GPR64, FPR32, fixedpoint_f32_i64, asm,
+                             [(set FPR32:$Rd,
+                                   (fdiv (node GPR64:$Rn),
+                                         fixedpoint_f32_i64:$scale))]> {
+    let Inst{31} = 1; // 64-bit GPR flag
+    let Inst{22} = 0; // 32-bit FPR flag
+  }
+
+  def SXDri: BaseIntegerToFP<isUnsigned, GPR64, FPR64, fixedpoint_f64_i64, asm,
+                             [(set FPR64:$Rd,
+                                   (fdiv (node GPR64:$Rn),
+                                         fixedpoint_f64_i64:$scale))]> {
+    let Inst{31} = 1; // 64-bit GPR flag
+    let Inst{22} = 1; // 64-bit FPR flag
+  }
+}
+
+//---
+// Unscaled integer <-> floating point conversion (i.e. FMOV)
+//---
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseUnscaledConversion<bits<2> rmode, bits<3> opcode,
+                      RegisterClass srcType, RegisterClass dstType,
+                      string asm>
+    : I<(outs dstType:$Rd), (ins srcType:$Rn), asm, "\t$Rd, $Rn", "",
+        // We use COPY_TO_REGCLASS for these bitconvert operations.
+        // copyPhysReg() expands the resultant COPY instructions after
+        // regalloc is done. This gives greater freedom for the allocator
+        // and related passes (coalescing, copy propagation, et. al.) to
+        // be more effective.
+        [/*(set (dvt dstType:$Rd), (bitconvert (svt srcType:$Rn)))*/]>,
+      Sched<[WriteFCopy]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  let Inst{30-23} = 0b00111100;
+  let Inst{21}    = 1;
+  let Inst{20-19} = rmode;
+  let Inst{18-16} = opcode;
+  let Inst{15-10} = 0b000000;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseUnscaledConversionToHigh<bits<2> rmode, bits<3> opcode,
+                     RegisterClass srcType, RegisterOperand dstType, string asm,
+                     string kind>
+    : I<(outs dstType:$Rd), (ins srcType:$Rn, VectorIndex1:$idx), asm,
+        "{\t$Rd"#kind#"$idx, $Rn|"#kind#"\t$Rd$idx, $Rn}", "", []>,
+      Sched<[WriteFCopy]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  let Inst{30-23} = 0b00111101;
+  let Inst{21}    = 1;
+  let Inst{20-19} = rmode;
+  let Inst{18-16} = opcode;
+  let Inst{15-10} = 0b000000;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+
+  let DecoderMethod =  "DecodeFMOVLaneInstruction";
+}
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseUnscaledConversionFromHigh<bits<2> rmode, bits<3> opcode,
+                     RegisterOperand srcType, RegisterClass dstType, string asm,
+                     string kind>
+    : I<(outs dstType:$Rd), (ins srcType:$Rn, VectorIndex1:$idx), asm,
+        "{\t$Rd, $Rn"#kind#"$idx|"#kind#"\t$Rd, $Rn$idx}", "", []>,
+      Sched<[WriteFCopy]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  let Inst{30-23} = 0b00111101;
+  let Inst{21}    = 1;
+  let Inst{20-19} = rmode;
+  let Inst{18-16} = opcode;
+  let Inst{15-10} = 0b000000;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+
+  let DecoderMethod =  "DecodeFMOVLaneInstruction";
+}
+
+
+
+multiclass UnscaledConversion<string asm> {
+  def WSr : BaseUnscaledConversion<0b00, 0b111, GPR32, FPR32, asm> {
+    let Inst{31} = 0; // 32-bit GPR flag
+    let Inst{22} = 0; // 32-bit FPR flag
+  }
+
+  def XDr : BaseUnscaledConversion<0b00, 0b111, GPR64, FPR64, asm> {
+    let Inst{31} = 1; // 64-bit GPR flag
+    let Inst{22} = 1; // 64-bit FPR flag
+  }
+
+  def SWr : BaseUnscaledConversion<0b00, 0b110, FPR32, GPR32, asm> {
+    let Inst{31} = 0; // 32-bit GPR flag
+    let Inst{22} = 0; // 32-bit FPR flag
+  }
+
+  def DXr : BaseUnscaledConversion<0b00, 0b110, FPR64, GPR64, asm> {
+    let Inst{31} = 1; // 64-bit GPR flag
+    let Inst{22} = 1; // 64-bit FPR flag
+  }
+
+  def XDHighr : BaseUnscaledConversionToHigh<0b01, 0b111, GPR64, V128,
+                                             asm, ".d"> {
+    let Inst{31} = 1;
+    let Inst{22} = 0;
+  }
+
+  def DXHighr : BaseUnscaledConversionFromHigh<0b01, 0b110, V128, GPR64,
+                                               asm, ".d"> {
+    let Inst{31} = 1;
+    let Inst{22} = 0;
+  }
+}
+
+//---
+// Floating point conversion
+//---
+
+class BaseFPConversion<bits<2> type, bits<2> opcode, RegisterClass dstType,
+                       RegisterClass srcType, string asm, list<dag> pattern>
+    : I<(outs dstType:$Rd), (ins srcType:$Rn), asm, "\t$Rd, $Rn", "", pattern>,
+      Sched<[WriteFCvt]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  let Inst{31-24} = 0b00011110;
+  let Inst{23-22} = type;
+  let Inst{21-17} = 0b10001;
+  let Inst{16-15} = opcode;
+  let Inst{14-10} = 0b10000;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+multiclass FPConversion<string asm> {
+  // Double-precision to Half-precision
+  def HDr : BaseFPConversion<0b01, 0b11, FPR16, FPR64, asm,
+                             [(set FPR16:$Rd, (fround FPR64:$Rn))]>;
+
+  // Double-precision to Single-precision
+  def SDr : BaseFPConversion<0b01, 0b00, FPR32, FPR64, asm,
+                             [(set FPR32:$Rd, (fround FPR64:$Rn))]>;
+
+  // Half-precision to Double-precision
+  def DHr : BaseFPConversion<0b11, 0b01, FPR64, FPR16, asm,
+                             [(set FPR64:$Rd, (fextend FPR16:$Rn))]>;
+
+  // Half-precision to Single-precision
+  def SHr : BaseFPConversion<0b11, 0b00, FPR32, FPR16, asm,
+                             [(set FPR32:$Rd, (fextend FPR16:$Rn))]>;
+
+  // Single-precision to Double-precision
+  def DSr : BaseFPConversion<0b00, 0b01, FPR64, FPR32, asm,
+                             [(set FPR64:$Rd, (fextend FPR32:$Rn))]>;
+
+  // Single-precision to Half-precision
+  def HSr : BaseFPConversion<0b00, 0b11, FPR16, FPR32, asm,
+                             [(set FPR16:$Rd, (fround FPR32:$Rn))]>;
+}
+
+//---
+// Single operand floating point data processing
+//---
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseSingleOperandFPData<bits<4> opcode, RegisterClass regtype,
+                              ValueType vt, string asm, SDPatternOperator node>
+    : I<(outs regtype:$Rd), (ins regtype:$Rn), asm, "\t$Rd, $Rn", "",
+         [(set (vt regtype:$Rd), (node (vt regtype:$Rn)))]>,
+      Sched<[WriteF]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  let Inst{31-23} = 0b000111100;
+  let Inst{21-19} = 0b100;
+  let Inst{18-15} = opcode;
+  let Inst{14-10} = 0b10000;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+multiclass SingleOperandFPData<bits<4> opcode, string asm,
+                               SDPatternOperator node = null_frag> {
+  def Sr : BaseSingleOperandFPData<opcode, FPR32, f32, asm, node> {
+    let Inst{22} = 0; // 32-bit size flag
+  }
+
+  def Dr : BaseSingleOperandFPData<opcode, FPR64, f64, asm, node> {
+    let Inst{22} = 1; // 64-bit size flag
+  }
+}
+
+//---
+// Two operand floating point data processing
+//---
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseTwoOperandFPData<bits<4> opcode, RegisterClass regtype,
+                           string asm, list<dag> pat>
+    : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm),
+         asm, "\t$Rd, $Rn, $Rm", "", pat>,
+      Sched<[WriteF]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  bits<5> Rm;
+  let Inst{31-23} = 0b000111100;
+  let Inst{21}    = 1;
+  let Inst{20-16} = Rm;
+  let Inst{15-12} = opcode;
+  let Inst{11-10} = 0b10;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+multiclass TwoOperandFPData<bits<4> opcode, string asm,
+                            SDPatternOperator node = null_frag> {
+  def Srr : BaseTwoOperandFPData<opcode, FPR32, asm,
+                         [(set (f32 FPR32:$Rd),
+                               (node (f32 FPR32:$Rn), (f32 FPR32:$Rm)))]> {
+    let Inst{22} = 0; // 32-bit size flag
+  }
+
+  def Drr : BaseTwoOperandFPData<opcode, FPR64, asm,
+                         [(set (f64 FPR64:$Rd),
+                               (node (f64 FPR64:$Rn), (f64 FPR64:$Rm)))]> {
+    let Inst{22} = 1; // 64-bit size flag
+  }
+}
+
+multiclass TwoOperandFPDataNeg<bits<4> opcode, string asm, SDNode node> {
+  def Srr : BaseTwoOperandFPData<opcode, FPR32, asm,
+                  [(set FPR32:$Rd, (fneg (node FPR32:$Rn, (f32 FPR32:$Rm))))]> {
+    let Inst{22} = 0; // 32-bit size flag
+  }
+
+  def Drr : BaseTwoOperandFPData<opcode, FPR64, asm,
+                  [(set FPR64:$Rd, (fneg (node FPR64:$Rn, (f64 FPR64:$Rm))))]> {
+    let Inst{22} = 1; // 64-bit size flag
+  }
+}
+
+
+//---
+// Three operand floating point data processing
+//---
+
+class BaseThreeOperandFPData<bit isNegated, bit isSub,
+                             RegisterClass regtype, string asm, list<dag> pat>
+    : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, regtype: $Ra),
+         asm, "\t$Rd, $Rn, $Rm, $Ra", "", pat>,
+      Sched<[WriteFMul]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  bits<5> Rm;
+  bits<5> Ra;
+  let Inst{31-23} = 0b000111110;
+  let Inst{21}    = isNegated;
+  let Inst{20-16} = Rm;
+  let Inst{15}    = isSub;
+  let Inst{14-10} = Ra;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+multiclass ThreeOperandFPData<bit isNegated, bit isSub,string asm,
+                              SDPatternOperator node> {
+  def Srrr : BaseThreeOperandFPData<isNegated, isSub, FPR32, asm,
+            [(set FPR32:$Rd,
+                  (node (f32 FPR32:$Rn), (f32 FPR32:$Rm), (f32 FPR32:$Ra)))]> {
+    let Inst{22} = 0; // 32-bit size flag
+  }
+
+  def Drrr : BaseThreeOperandFPData<isNegated, isSub, FPR64, asm,
+            [(set FPR64:$Rd,
+                  (node (f64 FPR64:$Rn), (f64 FPR64:$Rm), (f64 FPR64:$Ra)))]> {
+    let Inst{22} = 1; // 64-bit size flag
+  }
+}
+
+//---
+// Floating point data comparisons
+//---
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseOneOperandFPComparison<bit signalAllNans,
+                                 RegisterClass regtype, string asm,
+                                 list<dag> pat>
+    : I<(outs), (ins regtype:$Rn), asm, "\t$Rn, #0.0", "", pat>,
+      Sched<[WriteFCmp]> {
+  bits<5> Rn;
+  let Inst{31-23} = 0b000111100;
+  let Inst{21}    = 1;
+
+  let Inst{15-10} = 0b001000;
+  let Inst{9-5}   = Rn;
+  let Inst{4}     = signalAllNans;
+  let Inst{3-0}   = 0b1000;
+
+  // Rm should be 0b00000 canonically, but we need to accept any value.
+  let PostEncoderMethod = "fixOneOperandFPComparison";
+}
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseTwoOperandFPComparison<bit signalAllNans, RegisterClass regtype,
+                                string asm, list<dag> pat>
+    : I<(outs), (ins regtype:$Rn, regtype:$Rm), asm, "\t$Rn, $Rm", "", pat>,
+      Sched<[WriteFCmp]> {
+  bits<5> Rm;
+  bits<5> Rn;
+  let Inst{31-23} = 0b000111100;
+  let Inst{21}    = 1;
+  let Inst{20-16} = Rm;
+  let Inst{15-10} = 0b001000;
+  let Inst{9-5}   = Rn;
+  let Inst{4}     = signalAllNans;
+  let Inst{3-0}   = 0b0000;
+}
+
+multiclass FPComparison<bit signalAllNans, string asm,
+                        SDPatternOperator OpNode = null_frag> {
+  let Defs = [NZCV] in {
+  def Srr : BaseTwoOperandFPComparison<signalAllNans, FPR32, asm,
+      [(OpNode FPR32:$Rn, (f32 FPR32:$Rm)), (implicit NZCV)]> {
+    let Inst{22} = 0;
+  }
+
+  def Sri : BaseOneOperandFPComparison<signalAllNans, FPR32, asm,
+      [(OpNode (f32 FPR32:$Rn), fpimm0), (implicit NZCV)]> {
+    let Inst{22} = 0;
+  }
+
+  def Drr : BaseTwoOperandFPComparison<signalAllNans, FPR64, asm,
+      [(OpNode FPR64:$Rn, (f64 FPR64:$Rm)), (implicit NZCV)]> {
+    let Inst{22} = 1;
+  }
+
+  def Dri : BaseOneOperandFPComparison<signalAllNans, FPR64, asm,
+      [(OpNode (f64 FPR64:$Rn), fpimm0), (implicit NZCV)]> {
+    let Inst{22} = 1;
+  }
+  } // Defs = [NZCV]
+}
+
+//---
+// Floating point conditional comparisons
+//---
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseFPCondComparison<bit signalAllNans,
+                              RegisterClass regtype, string asm>
+    : I<(outs), (ins regtype:$Rn, regtype:$Rm, imm0_15:$nzcv, ccode:$cond),
+         asm, "\t$Rn, $Rm, $nzcv, $cond", "", []>,
+      Sched<[WriteFCmp]> {
+  bits<5> Rn;
+  bits<5> Rm;
+  bits<4> nzcv;
+  bits<4> cond;
+
+  let Inst{31-23} = 0b000111100;
+  let Inst{21}    = 1;
+  let Inst{20-16} = Rm;
+  let Inst{15-12} = cond;
+  let Inst{11-10} = 0b01;
+  let Inst{9-5}   = Rn;
+  let Inst{4}     = signalAllNans;
+  let Inst{3-0}   = nzcv;
+}
+
+multiclass FPCondComparison<bit signalAllNans, string asm> {
+  let Defs = [NZCV], Uses = [NZCV] in {
+  def Srr : BaseFPCondComparison<signalAllNans, FPR32, asm> {
+    let Inst{22} = 0;
+  }
+
+  def Drr : BaseFPCondComparison<signalAllNans, FPR64, asm> {
+    let Inst{22} = 1;
+  }
+  } // Defs = [NZCV], Uses = [NZCV]
+}
+
+//---
+// Floating point conditional select
+//---
+
+class BaseFPCondSelect<RegisterClass regtype, ValueType vt, string asm>
+    : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, ccode:$cond),
+         asm, "\t$Rd, $Rn, $Rm, $cond", "",
+         [(set regtype:$Rd,
+               (AArch64csel (vt regtype:$Rn), regtype:$Rm,
+                          (i32 imm:$cond), NZCV))]>,
+      Sched<[WriteF]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  bits<5> Rm;
+  bits<4> cond;
+
+  let Inst{31-23} = 0b000111100;
+  let Inst{21}    = 1;
+  let Inst{20-16} = Rm;
+  let Inst{15-12} = cond;
+  let Inst{11-10} = 0b11;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+multiclass FPCondSelect<string asm> {
+  let Uses = [NZCV] in {
+  def Srrr : BaseFPCondSelect<FPR32, f32, asm> {
+    let Inst{22} = 0;
+  }
+
+  def Drrr : BaseFPCondSelect<FPR64, f64, asm> {
+    let Inst{22} = 1;
+  }
+  } // Uses = [NZCV]
+}
+
+//---
+// Floating move immediate
+//---
+
+class BaseFPMoveImmediate<RegisterClass regtype, Operand fpimmtype, string asm>
+  : I<(outs regtype:$Rd), (ins fpimmtype:$imm), asm, "\t$Rd, $imm", "",
+      [(set regtype:$Rd, fpimmtype:$imm)]>,
+    Sched<[WriteFImm]> {
+  bits<5> Rd;
+  bits<8> imm;
+  let Inst{31-23} = 0b000111100;
+  let Inst{21}    = 1;
+  let Inst{20-13} = imm;
+  let Inst{12-5}  = 0b10000000;
+  let Inst{4-0}   = Rd;
+}
+
+multiclass FPMoveImmediate<string asm> {
+  def Si : BaseFPMoveImmediate<FPR32, fpimm32, asm> {
+    let Inst{22} = 0;
+  }
+
+  def Di : BaseFPMoveImmediate<FPR64, fpimm64, asm> {
+    let Inst{22} = 1;
+  }
+}
+} // end of 'let Predicates = [HasFPARMv8]'
+
+//----------------------------------------------------------------------------
+// AdvSIMD
+//----------------------------------------------------------------------------
 
 let Predicates = [HasNEON] in {
 
-class NeonInstAlias<string Asm, dag Result, bit Emit = 0b1>
-  : InstAlias<Asm, Result, Emit> {
+//----------------------------------------------------------------------------
+// AdvSIMD three register vector instructions
+//----------------------------------------------------------------------------
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseSIMDThreeSameVector<bit Q, bit U, bits<2> size, bits<5> opcode,
+                        RegisterOperand regtype, string asm, string kind,
+                        list<dag> pattern>
+  : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), asm,
+      "{\t$Rd" # kind # ", $Rn" # kind # ", $Rm" # kind #
+      "|" # kind # "\t$Rd, $Rn, $Rm|}", "", pattern>,
+    Sched<[WriteV]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  bits<5> Rm;
+  let Inst{31}    = 0;
+  let Inst{30}    = Q;
+  let Inst{29}    = U;
+  let Inst{28-24} = 0b01110;
+  let Inst{23-22} = size;
+  let Inst{21}    = 1;
+  let Inst{20-16} = Rm;
+  let Inst{15-11} = opcode;
+  let Inst{10}    = 1;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseSIMDThreeSameVectorTied<bit Q, bit U, bits<2> size, bits<5> opcode,
+                        RegisterOperand regtype, string asm, string kind,
+                        list<dag> pattern>
+  : I<(outs regtype:$dst), (ins regtype:$Rd, regtype:$Rn, regtype:$Rm), asm,
+      "{\t$Rd" # kind # ", $Rn" # kind # ", $Rm" # kind #
+      "|" # kind # "\t$Rd, $Rn, $Rm}", "$Rd = $dst", pattern>,
+    Sched<[WriteV]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  bits<5> Rm;
+  let Inst{31}    = 0;
+  let Inst{30}    = Q;
+  let Inst{29}    = U;
+  let Inst{28-24} = 0b01110;
+  let Inst{23-22} = size;
+  let Inst{21}    = 1;
+  let Inst{20-16} = Rm;
+  let Inst{15-11} = opcode;
+  let Inst{10}    = 1;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+// All operand sizes distinguished in the encoding.
+multiclass SIMDThreeSameVector<bit U, bits<5> opc, string asm,
+                               SDPatternOperator OpNode> {
+  def v8i8  : BaseSIMDThreeSameVector<0, U, 0b00, opc, V64,
+                                      asm, ".8b",
+         [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>;
+  def v16i8 : BaseSIMDThreeSameVector<1, U, 0b00, opc, V128,
+                                      asm, ".16b",
+         [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn), (v16i8 V128:$Rm)))]>;
+  def v4i16 : BaseSIMDThreeSameVector<0, U, 0b01, opc, V64,
+                                      asm, ".4h",
+         [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>;
+  def v8i16 : BaseSIMDThreeSameVector<1, U, 0b01, opc, V128,
+                                      asm, ".8h",
+         [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn), (v8i16 V128:$Rm)))]>;
+  def v2i32 : BaseSIMDThreeSameVector<0, U, 0b10, opc, V64,
+                                      asm, ".2s",
+         [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>;
+  def v4i32 : BaseSIMDThreeSameVector<1, U, 0b10, opc, V128,
+                                      asm, ".4s",
+         [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn), (v4i32 V128:$Rm)))]>;
+  def v2i64 : BaseSIMDThreeSameVector<1, U, 0b11, opc, V128,
+                                      asm, ".2d",
+         [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn), (v2i64 V128:$Rm)))]>;
+}
+
+// As above, but D sized elements unsupported.
+multiclass SIMDThreeSameVectorBHS<bit U, bits<5> opc, string asm,
+                                  SDPatternOperator OpNode> {
+  def v8i8  : BaseSIMDThreeSameVector<0, U, 0b00, opc, V64,
+                                      asm, ".8b",
+        [(set V64:$Rd, (v8i8 (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm))))]>;
+  def v16i8 : BaseSIMDThreeSameVector<1, U, 0b00, opc, V128,
+                                      asm, ".16b",
+        [(set V128:$Rd, (v16i8 (OpNode (v16i8 V128:$Rn), (v16i8 V128:$Rm))))]>;
+  def v4i16 : BaseSIMDThreeSameVector<0, U, 0b01, opc, V64,
+                                      asm, ".4h",
+        [(set V64:$Rd, (v4i16 (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm))))]>;
+  def v8i16 : BaseSIMDThreeSameVector<1, U, 0b01, opc, V128,
+                                      asm, ".8h",
+        [(set V128:$Rd, (v8i16 (OpNode (v8i16 V128:$Rn), (v8i16 V128:$Rm))))]>;
+  def v2i32 : BaseSIMDThreeSameVector<0, U, 0b10, opc, V64,
+                                      asm, ".2s",
+        [(set V64:$Rd, (v2i32 (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm))))]>;
+  def v4i32 : BaseSIMDThreeSameVector<1, U, 0b10, opc, V128,
+                                      asm, ".4s",
+        [(set V128:$Rd, (v4i32 (OpNode (v4i32 V128:$Rn), (v4i32 V128:$Rm))))]>;
+}
+
+multiclass SIMDThreeSameVectorBHSTied<bit U, bits<5> opc, string asm,
+                                  SDPatternOperator OpNode> {
+  def v8i8  : BaseSIMDThreeSameVectorTied<0, U, 0b00, opc, V64,
+                                      asm, ".8b",
+      [(set (v8i8 V64:$dst),
+            (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>;
+  def v16i8 : BaseSIMDThreeSameVectorTied<1, U, 0b00, opc, V128,
+                                      asm, ".16b",
+      [(set (v16i8 V128:$dst),
+            (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn), (v16i8 V128:$Rm)))]>;
+  def v4i16 : BaseSIMDThreeSameVectorTied<0, U, 0b01, opc, V64,
+                                      asm, ".4h",
+      [(set (v4i16 V64:$dst),
+            (OpNode (v4i16 V64:$Rd), (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>;
+  def v8i16 : BaseSIMDThreeSameVectorTied<1, U, 0b01, opc, V128,
+                                      asm, ".8h",
+      [(set (v8i16 V128:$dst),
+            (OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn), (v8i16 V128:$Rm)))]>;
+  def v2i32 : BaseSIMDThreeSameVectorTied<0, U, 0b10, opc, V64,
+                                      asm, ".2s",
+      [(set (v2i32 V64:$dst),
+            (OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>;
+  def v4i32 : BaseSIMDThreeSameVectorTied<1, U, 0b10, opc, V128,
+                                      asm, ".4s",
+      [(set (v4i32 V128:$dst),
+            (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn), (v4i32 V128:$Rm)))]>;
+}
+
+// As above, but only B sized elements supported.
+multiclass SIMDThreeSameVectorB<bit U, bits<5> opc, string asm,
+                                SDPatternOperator OpNode> {
+  def v8i8  : BaseSIMDThreeSameVector<0, U, 0b00, opc, V64,
+                                      asm, ".8b",
+    [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>;
+  def v16i8 : BaseSIMDThreeSameVector<1, U, 0b00, opc, V128,
+                                      asm, ".16b",
+    [(set (v16i8 V128:$Rd),
+          (OpNode (v16i8 V128:$Rn), (v16i8 V128:$Rm)))]>;
 }
 
-// Format AdvSIMD bitwise extract
-class NeonI_BitExtract<bit q, bits<2> op2,
-                       dag outs, dag ins, string asmstr,
-                       list<dag> patterns, InstrItinClass itin>
-  : A64InstRdnm<outs, ins, asmstr, patterns, itin> {
-  let Inst{31} = 0b0;
-  let Inst{30} = q;
+// As above, but only S and D sized floating point elements supported.
+multiclass SIMDThreeSameVectorFP<bit U, bit S, bits<5> opc,
+                                 string asm, SDPatternOperator OpNode> {
+  def v2f32 : BaseSIMDThreeSameVector<0, U, {S,0}, opc, V64,
+                                      asm, ".2s",
+        [(set (v2f32 V64:$Rd), (OpNode (v2f32 V64:$Rn), (v2f32 V64:$Rm)))]>;
+  def v4f32 : BaseSIMDThreeSameVector<1, U, {S,0}, opc, V128,
+                                      asm, ".4s",
+        [(set (v4f32 V128:$Rd), (OpNode (v4f32 V128:$Rn), (v4f32 V128:$Rm)))]>;
+  def v2f64 : BaseSIMDThreeSameVector<1, U, {S,1}, opc, V128,
+                                      asm, ".2d",
+        [(set (v2f64 V128:$Rd), (OpNode (v2f64 V128:$Rn), (v2f64 V128:$Rm)))]>;
+}
+
+multiclass SIMDThreeSameVectorFPCmp<bit U, bit S, bits<5> opc,
+                                    string asm,
+                                    SDPatternOperator OpNode> {
+  def v2f32 : BaseSIMDThreeSameVector<0, U, {S,0}, opc, V64,
+                                      asm, ".2s",
+        [(set (v2i32 V64:$Rd), (OpNode (v2f32 V64:$Rn), (v2f32 V64:$Rm)))]>;
+  def v4f32 : BaseSIMDThreeSameVector<1, U, {S,0}, opc, V128,
+                                      asm, ".4s",
+        [(set (v4i32 V128:$Rd), (OpNode (v4f32 V128:$Rn), (v4f32 V128:$Rm)))]>;
+  def v2f64 : BaseSIMDThreeSameVector<1, U, {S,1}, opc, V128,
+                                      asm, ".2d",
+        [(set (v2i64 V128:$Rd), (OpNode (v2f64 V128:$Rn), (v2f64 V128:$Rm)))]>;
+}
+
+multiclass SIMDThreeSameVectorFPTied<bit U, bit S, bits<5> opc,
+                                 string asm, SDPatternOperator OpNode> {
+  def v2f32 : BaseSIMDThreeSameVectorTied<0, U, {S,0}, opc, V64,
+                                      asm, ".2s",
+     [(set (v2f32 V64:$dst),
+           (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn), (v2f32 V64:$Rm)))]>;
+  def v4f32 : BaseSIMDThreeSameVectorTied<1, U, {S,0}, opc, V128,
+                                      asm, ".4s",
+     [(set (v4f32 V128:$dst),
+           (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn), (v4f32 V128:$Rm)))]>;
+  def v2f64 : BaseSIMDThreeSameVectorTied<1, U, {S,1}, opc, V128,
+                                      asm, ".2d",
+     [(set (v2f64 V128:$dst),
+           (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn), (v2f64 V128:$Rm)))]>;
+}
+
+// As above, but D and B sized elements unsupported.
+multiclass SIMDThreeSameVectorHS<bit U, bits<5> opc, string asm,
+                                SDPatternOperator OpNode> {
+  def v4i16 : BaseSIMDThreeSameVector<0, U, 0b01, opc, V64,
+                                      asm, ".4h",
+        [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>;
+  def v8i16 : BaseSIMDThreeSameVector<1, U, 0b01, opc, V128,
+                                      asm, ".8h",
+        [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn), (v8i16 V128:$Rm)))]>;
+  def v2i32 : BaseSIMDThreeSameVector<0, U, 0b10, opc, V64,
+                                      asm, ".2s",
+        [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>;
+  def v4i32 : BaseSIMDThreeSameVector<1, U, 0b10, opc, V128,
+                                      asm, ".4s",
+        [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn), (v4i32 V128:$Rm)))]>;
+}
+
+// Logical three vector ops share opcode bits, and only use B sized elements.
+multiclass SIMDLogicalThreeVector<bit U, bits<2> size, string asm,
+                                  SDPatternOperator OpNode = null_frag> {
+  def v8i8  : BaseSIMDThreeSameVector<0, U, size, 0b00011, V64,
+                                     asm, ".8b",
+                         [(set (v8i8 V64:$Rd), (OpNode V64:$Rn, V64:$Rm))]>;
+  def v16i8  : BaseSIMDThreeSameVector<1, U, size, 0b00011, V128,
+                                     asm, ".16b",
+                         [(set (v16i8 V128:$Rd), (OpNode V128:$Rn, V128:$Rm))]>;
+
+  def : Pat<(v4i16 (OpNode V64:$LHS, V64:$RHS)),
+          (!cast<Instruction>(NAME#"v8i8") V64:$LHS, V64:$RHS)>;
+  def : Pat<(v2i32 (OpNode V64:$LHS, V64:$RHS)),
+          (!cast<Instruction>(NAME#"v8i8") V64:$LHS, V64:$RHS)>;
+  def : Pat<(v1i64 (OpNode V64:$LHS, V64:$RHS)),
+          (!cast<Instruction>(NAME#"v8i8") V64:$LHS, V64:$RHS)>;
+
+  def : Pat<(v8i16 (OpNode V128:$LHS, V128:$RHS)),
+      (!cast<Instruction>(NAME#"v16i8") V128:$LHS, V128:$RHS)>;
+  def : Pat<(v4i32 (OpNode V128:$LHS, V128:$RHS)),
+      (!cast<Instruction>(NAME#"v16i8") V128:$LHS, V128:$RHS)>;
+  def : Pat<(v2i64 (OpNode V128:$LHS, V128:$RHS)),
+      (!cast<Instruction>(NAME#"v16i8") V128:$LHS, V128:$RHS)>;
+}
+
+multiclass SIMDLogicalThreeVectorTied<bit U, bits<2> size,
+                                  string asm, SDPatternOperator OpNode> {
+  def v8i8  : BaseSIMDThreeSameVectorTied<0, U, size, 0b00011, V64,
+                                     asm, ".8b",
+             [(set (v8i8 V64:$dst),
+                   (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>;
+  def v16i8  : BaseSIMDThreeSameVectorTied<1, U, size, 0b00011, V128,
+                                     asm, ".16b",
+             [(set (v16i8 V128:$dst),
+                   (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn),
+                           (v16i8 V128:$Rm)))]>;
+
+  def : Pat<(v4i16 (OpNode (v4i16 V64:$LHS), (v4i16 V64:$MHS),
+                           (v4i16 V64:$RHS))),
+          (!cast<Instruction>(NAME#"v8i8")
+            V64:$LHS, V64:$MHS, V64:$RHS)>;
+  def : Pat<(v2i32 (OpNode (v2i32 V64:$LHS), (v2i32 V64:$MHS),
+                           (v2i32 V64:$RHS))),
+          (!cast<Instruction>(NAME#"v8i8")
+            V64:$LHS, V64:$MHS, V64:$RHS)>;
+  def : Pat<(v1i64 (OpNode (v1i64 V64:$LHS), (v1i64 V64:$MHS),
+                           (v1i64 V64:$RHS))),
+          (!cast<Instruction>(NAME#"v8i8")
+            V64:$LHS, V64:$MHS, V64:$RHS)>;
+
+  def : Pat<(v8i16 (OpNode (v8i16 V128:$LHS), (v8i16 V128:$MHS),
+                           (v8i16 V128:$RHS))),
+      (!cast<Instruction>(NAME#"v16i8")
+        V128:$LHS, V128:$MHS, V128:$RHS)>;
+  def : Pat<(v4i32 (OpNode (v4i32 V128:$LHS), (v4i32 V128:$MHS),
+                           (v4i32 V128:$RHS))),
+      (!cast<Instruction>(NAME#"v16i8")
+        V128:$LHS, V128:$MHS, V128:$RHS)>;
+  def : Pat<(v2i64 (OpNode (v2i64 V128:$LHS), (v2i64 V128:$MHS),
+                           (v2i64 V128:$RHS))),
+      (!cast<Instruction>(NAME#"v16i8")
+        V128:$LHS, V128:$MHS, V128:$RHS)>;
+}
+
+
+//----------------------------------------------------------------------------
+// AdvSIMD two register vector instructions.
+//----------------------------------------------------------------------------
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseSIMDTwoSameVector<bit Q, bit U, bits<2> size, bits<5> opcode,
+                        RegisterOperand regtype, string asm, string dstkind,
+                        string srckind, list<dag> pattern>
+  : I<(outs regtype:$Rd), (ins regtype:$Rn), asm,
+      "{\t$Rd" # dstkind # ", $Rn" # srckind #
+      "|" # dstkind # "\t$Rd, $Rn}", "", pattern>,
+    Sched<[WriteV]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  let Inst{31}    = 0;
+  let Inst{30}    = Q;
+  let Inst{29}    = U;
+  let Inst{28-24} = 0b01110;
+  let Inst{23-22} = size;
+  let Inst{21-17} = 0b10000;
+  let Inst{16-12} = opcode;
+  let Inst{11-10} = 0b10;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseSIMDTwoSameVectorTied<bit Q, bit U, bits<2> size, bits<5> opcode,
+                            RegisterOperand regtype, string asm, string dstkind,
+                            string srckind, list<dag> pattern>
+  : I<(outs regtype:$dst), (ins regtype:$Rd, regtype:$Rn), asm,
+      "{\t$Rd" # dstkind # ", $Rn" # srckind #
+      "|" # dstkind # "\t$Rd, $Rn}", "$Rd = $dst", pattern>,
+    Sched<[WriteV]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  let Inst{31}    = 0;
+  let Inst{30}    = Q;
+  let Inst{29}    = U;
+  let Inst{28-24} = 0b01110;
+  let Inst{23-22} = size;
+  let Inst{21-17} = 0b10000;
+  let Inst{16-12} = opcode;
+  let Inst{11-10} = 0b10;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+// Supports B, H, and S element sizes.
+multiclass SIMDTwoVectorBHS<bit U, bits<5> opc, string asm,
+                            SDPatternOperator OpNode> {
+  def v8i8  : BaseSIMDTwoSameVector<0, U, 0b00, opc, V64,
+                                      asm, ".8b", ".8b",
+                          [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>;
+  def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, V128,
+                                      asm, ".16b", ".16b",
+                          [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>;
+  def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, V64,
+                                      asm, ".4h", ".4h",
+                          [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn)))]>;
+  def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, V128,
+                                      asm, ".8h", ".8h",
+                          [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn)))]>;
+  def v2i32 : BaseSIMDTwoSameVector<0, U, 0b10, opc, V64,
+                                      asm, ".2s", ".2s",
+                          [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>;
+  def v4i32 : BaseSIMDTwoSameVector<1, U, 0b10, opc, V128,
+                                      asm, ".4s", ".4s",
+                          [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>;
+}
+
+class BaseSIMDVectorLShiftLongBySize<bit Q, bits<2> size,
+                            RegisterOperand regtype, string asm, string dstkind,
+                            string srckind, string amount>
+  : I<(outs V128:$Rd), (ins regtype:$Rn), asm,
+      "{\t$Rd" # dstkind # ", $Rn" # srckind # ", #" # amount #
+      "|" # dstkind # "\t$Rd, $Rn, #" #  amount # "}", "", []>,
+    Sched<[WriteV]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  let Inst{31}    = 0;
+  let Inst{30}    = Q;
   let Inst{29-24} = 0b101110;
-  let Inst{23-22} = op2;
-  let Inst{21} = 0b0;
-  // Inherit Rm in 20-16
-  let Inst{15} = 0b0;
-  // imm4 in 14-11
-  let Inst{10} = 0b0;
-  // Inherit Rn in 9-5
-  // Inherit Rd in 4-0
-}
-
-// Format AdvSIMD perm
-class NeonI_Perm<bit q, bits<2> size, bits<3> opcode,
-                 dag outs, dag ins, string asmstr,
-                 list<dag> patterns, InstrItinClass itin>
-  : A64InstRdnm<outs, ins, asmstr, patterns, itin> {
-  let Inst{31} = 0b0;
-  let Inst{30} = q;
-  let Inst{29-24} = 0b001110;
   let Inst{23-22} = size;
-  let Inst{21} = 0b0;
-  // Inherit Rm in 20-16
-  let Inst{15} = 0b0;
-  let Inst{14-12} = opcode;
+  let Inst{21-10} = 0b100001001110;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+multiclass SIMDVectorLShiftLongBySizeBHS {
+  let neverHasSideEffects = 1 in {
+  def v8i8  : BaseSIMDVectorLShiftLongBySize<0, 0b00, V64,
+                                             "shll", ".8h",  ".8b", "8">;
+  def v16i8 : BaseSIMDVectorLShiftLongBySize<1, 0b00, V128,
+                                             "shll2", ".8h", ".16b", "8">;
+  def v4i16 : BaseSIMDVectorLShiftLongBySize<0, 0b01, V64,
+                                             "shll", ".4s",  ".4h", "16">;
+  def v8i16 : BaseSIMDVectorLShiftLongBySize<1, 0b01, V128,
+                                             "shll2", ".4s", ".8h", "16">;
+  def v2i32 : BaseSIMDVectorLShiftLongBySize<0, 0b10, V64,
+                                             "shll", ".2d",  ".2s", "32">;
+  def v4i32 : BaseSIMDVectorLShiftLongBySize<1, 0b10, V128,
+                                             "shll2", ".2d", ".4s", "32">;
+  }
+}
+
+// Supports all element sizes.
+multiclass SIMDLongTwoVector<bit U, bits<5> opc, string asm,
+                             SDPatternOperator OpNode> {
+  def v8i8_v4i16  : BaseSIMDTwoSameVector<0, U, 0b00, opc, V64,
+                                      asm, ".4h", ".8b",
+               [(set (v4i16 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>;
+  def v16i8_v8i16 : BaseSIMDTwoSameVector<1, U, 0b00, opc, V128,
+                                      asm, ".8h", ".16b",
+               [(set (v8i16 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>;
+  def v4i16_v2i32 : BaseSIMDTwoSameVector<0, U, 0b01, opc, V64,
+                                      asm, ".2s", ".4h",
+               [(set (v2i32 V64:$Rd), (OpNode (v4i16 V64:$Rn)))]>;
+  def v8i16_v4i32 : BaseSIMDTwoSameVector<1, U, 0b01, opc, V128,
+                                      asm, ".4s", ".8h",
+               [(set (v4i32 V128:$Rd), (OpNode (v8i16 V128:$Rn)))]>;
+  def v2i32_v1i64 : BaseSIMDTwoSameVector<0, U, 0b10, opc, V64,
+                                      asm, ".1d", ".2s",
+               [(set (v1i64 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>;
+  def v4i32_v2i64 : BaseSIMDTwoSameVector<1, U, 0b10, opc, V128,
+                                      asm, ".2d", ".4s",
+               [(set (v2i64 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>;
+}
+
+multiclass SIMDLongTwoVectorTied<bit U, bits<5> opc, string asm,
+                                 SDPatternOperator OpNode> {
+  def v8i8_v4i16  : BaseSIMDTwoSameVectorTied<0, U, 0b00, opc, V64,
+                                          asm, ".4h", ".8b",
+      [(set (v4i16 V64:$dst), (OpNode (v4i16 V64:$Rd),
+                                      (v8i8 V64:$Rn)))]>;
+  def v16i8_v8i16 : BaseSIMDTwoSameVectorTied<1, U, 0b00, opc, V128,
+                                          asm, ".8h", ".16b",
+      [(set (v8i16 V128:$dst), (OpNode (v8i16 V128:$Rd),
+                                      (v16i8 V128:$Rn)))]>;
+  def v4i16_v2i32 : BaseSIMDTwoSameVectorTied<0, U, 0b01, opc, V64,
+                                          asm, ".2s", ".4h",
+      [(set (v2i32 V64:$dst), (OpNode (v2i32 V64:$Rd),
+                                      (v4i16 V64:$Rn)))]>;
+  def v8i16_v4i32 : BaseSIMDTwoSameVectorTied<1, U, 0b01, opc, V128,
+                                          asm, ".4s", ".8h",
+      [(set (v4i32 V128:$dst), (OpNode (v4i32 V128:$Rd),
+                                      (v8i16 V128:$Rn)))]>;
+  def v2i32_v1i64 : BaseSIMDTwoSameVectorTied<0, U, 0b10, opc, V64,
+                                          asm, ".1d", ".2s",
+      [(set (v1i64 V64:$dst), (OpNode (v1i64 V64:$Rd),
+                                      (v2i32 V64:$Rn)))]>;
+  def v4i32_v2i64 : BaseSIMDTwoSameVectorTied<1, U, 0b10, opc, V128,
+                                          asm, ".2d", ".4s",
+      [(set (v2i64 V128:$dst), (OpNode (v2i64 V128:$Rd),
+                                      (v4i32 V128:$Rn)))]>;
+}
+
+// Supports all element sizes, except 1xD.
+multiclass SIMDTwoVectorBHSDTied<bit U, bits<5> opc, string asm,
+                                  SDPatternOperator OpNode> {
+  def v8i8  : BaseSIMDTwoSameVectorTied<0, U, 0b00, opc, V64,
+                                    asm, ".8b", ".8b",
+    [(set (v8i8 V64:$dst), (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn)))]>;
+  def v16i8 : BaseSIMDTwoSameVectorTied<1, U, 0b00, opc, V128,
+                                    asm, ".16b", ".16b",
+    [(set (v16i8 V128:$dst), (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn)))]>;
+  def v4i16 : BaseSIMDTwoSameVectorTied<0, U, 0b01, opc, V64,
+                                    asm, ".4h", ".4h",
+    [(set (v4i16 V64:$dst), (OpNode (v4i16 V64:$Rd), (v4i16 V64:$Rn)))]>;
+  def v8i16 : BaseSIMDTwoSameVectorTied<1, U, 0b01, opc, V128,
+                                    asm, ".8h", ".8h",
+    [(set (v8i16 V128:$dst), (OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn)))]>;
+  def v2i32 : BaseSIMDTwoSameVectorTied<0, U, 0b10, opc, V64,
+                                    asm, ".2s", ".2s",
+    [(set (v2i32 V64:$dst), (OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn)))]>;
+  def v4i32 : BaseSIMDTwoSameVectorTied<1, U, 0b10, opc, V128,
+                                    asm, ".4s", ".4s",
+    [(set (v4i32 V128:$dst), (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn)))]>;
+  def v2i64 : BaseSIMDTwoSameVectorTied<1, U, 0b11, opc, V128,
+                                    asm, ".2d", ".2d",
+    [(set (v2i64 V128:$dst), (OpNode (v2i64 V128:$Rd), (v2i64 V128:$Rn)))]>;
+}
+
+multiclass SIMDTwoVectorBHSD<bit U, bits<5> opc, string asm,
+                             SDPatternOperator OpNode = null_frag> {
+  def v8i8  : BaseSIMDTwoSameVector<0, U, 0b00, opc, V64,
+                                asm, ".8b", ".8b",
+    [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>;
+  def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, V128,
+                                asm, ".16b", ".16b",
+    [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>;
+  def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, V64,
+                                asm, ".4h", ".4h",
+    [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn)))]>;
+  def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, V128,
+                                asm, ".8h", ".8h",
+    [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn)))]>;
+  def v2i32 : BaseSIMDTwoSameVector<0, U, 0b10, opc, V64,
+                                asm, ".2s", ".2s",
+    [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>;
+  def v4i32 : BaseSIMDTwoSameVector<1, U, 0b10, opc, V128,
+                                asm, ".4s", ".4s",
+    [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>;
+  def v2i64 : BaseSIMDTwoSameVector<1, U, 0b11, opc, V128,
+                                asm, ".2d", ".2d",
+    [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn)))]>;
+}
+
+
+// Supports only B element sizes.
+multiclass SIMDTwoVectorB<bit U, bits<2> size, bits<5> opc, string asm,
+                          SDPatternOperator OpNode> {
+  def v8i8  : BaseSIMDTwoSameVector<0, U, size, opc, V64,
+                                asm, ".8b", ".8b",
+                    [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>;
+  def v16i8 : BaseSIMDTwoSameVector<1, U, size, opc, V128,
+                                asm, ".16b", ".16b",
+                    [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>;
+
+}
+
+// Supports only B and H element sizes.
+multiclass SIMDTwoVectorBH<bit U, bits<5> opc, string asm,
+                                SDPatternOperator OpNode> {
+  def v8i8  : BaseSIMDTwoSameVector<0, U, 0b00, opc, V64,
+                                asm, ".8b", ".8b",
+                    [(set (v8i8 V64:$Rd), (OpNode V64:$Rn))]>;
+  def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, V128,
+                                asm, ".16b", ".16b",
+                    [(set (v16i8 V128:$Rd), (OpNode V128:$Rn))]>;
+  def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, V64,
+                                asm, ".4h", ".4h",
+                    [(set (v4i16 V64:$Rd), (OpNode V64:$Rn))]>;
+  def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, V128,
+                                asm, ".8h", ".8h",
+                    [(set (v8i16 V128:$Rd), (OpNode V128:$Rn))]>;
+}
+
+// Supports only S and D element sizes, uses high bit of the size field
+// as an extra opcode bit.
+multiclass SIMDTwoVectorFP<bit U, bit S, bits<5> opc, string asm,
+                           SDPatternOperator OpNode> {
+  def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, V64,
+                                asm, ".2s", ".2s",
+                          [(set (v2f32 V64:$Rd), (OpNode (v2f32 V64:$Rn)))]>;
+  def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, V128,
+                                asm, ".4s", ".4s",
+                          [(set (v4f32 V128:$Rd), (OpNode (v4f32 V128:$Rn)))]>;
+  def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, V128,
+                                asm, ".2d", ".2d",
+                          [(set (v2f64 V128:$Rd), (OpNode (v2f64 V128:$Rn)))]>;
+}
+
+// Supports only S element size.
+multiclass SIMDTwoVectorS<bit U, bit S, bits<5> opc, string asm,
+                           SDPatternOperator OpNode> {
+  def v2i32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, V64,
+                                asm, ".2s", ".2s",
+                          [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>;
+  def v4i32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, V128,
+                                asm, ".4s", ".4s",
+                          [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>;
+}
+
+
+multiclass SIMDTwoVectorFPToInt<bit U, bit S, bits<5> opc, string asm,
+                           SDPatternOperator OpNode> {
+  def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, V64,
+                                asm, ".2s", ".2s",
+                          [(set (v2i32 V64:$Rd), (OpNode (v2f32 V64:$Rn)))]>;
+  def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, V128,
+                                asm, ".4s", ".4s",
+                          [(set (v4i32 V128:$Rd), (OpNode (v4f32 V128:$Rn)))]>;
+  def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, V128,
+                                asm, ".2d", ".2d",
+                          [(set (v2i64 V128:$Rd), (OpNode (v2f64 V128:$Rn)))]>;
+}
+
+multiclass SIMDTwoVectorIntToFP<bit U, bit S, bits<5> opc, string asm,
+                           SDPatternOperator OpNode> {
+  def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, V64,
+                                asm, ".2s", ".2s",
+                          [(set (v2f32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>;
+  def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, V128,
+                                asm, ".4s", ".4s",
+                          [(set (v4f32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>;
+  def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, V128,
+                                asm, ".2d", ".2d",
+                          [(set (v2f64 V128:$Rd), (OpNode (v2i64 V128:$Rn)))]>;
+}
+
+
+class BaseSIMDMixedTwoVector<bit Q, bit U, bits<2> size, bits<5> opcode,
+                           RegisterOperand inreg, RegisterOperand outreg,
+                           string asm, string outkind, string inkind,
+                           list<dag> pattern>
+  : I<(outs outreg:$Rd), (ins inreg:$Rn), asm,
+      "{\t$Rd" # outkind # ", $Rn" # inkind #
+      "|" # outkind # "\t$Rd, $Rn}", "", pattern>,
+    Sched<[WriteV]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  let Inst{31}    = 0;
+  let Inst{30}    = Q;
+  let Inst{29}    = U;
+  let Inst{28-24} = 0b01110;
+  let Inst{23-22} = size;
+  let Inst{21-17} = 0b10000;
+  let Inst{16-12} = opcode;
   let Inst{11-10} = 0b10;
-  // Inherit Rn in 9-5
-  // Inherit Rd in 4-0
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
 }
 
-// Format AdvSIMD table lookup
-class NeonI_TBL<bit q, bits<2> op2, bits<2> len, bit op,
-                dag outs, dag ins, string asmstr,
-                list<dag> patterns, InstrItinClass itin>
-  : A64InstRdnm<outs, ins, asmstr, patterns, itin> {
-  let Inst{31} = 0b0;
-  let Inst{30} = q;
-  let Inst{29-24} = 0b001110;
-  let Inst{23-22} = op2;
-  let Inst{21} = 0b0;
-  // Inherit Rm in 20-16
-  let Inst{15} = 0b0;
-  let Inst{14-13} = len;
-  let Inst{12} = op;
-  let Inst{11-10} = 0b00;
-  // Inherit Rn in 9-5
-  // Inherit Rd in 4-0
-}
-
-// Format AdvSIMD 3 vector registers with same vector type
-class NeonI_3VSame<bit q, bit u, bits<2> size, bits<5> opcode,
-                   dag outs, dag ins, string asmstr,
-                   list<dag> patterns, InstrItinClass itin>
-  : A64InstRdnm<outs, ins, asmstr, patterns, itin> {
-  let Inst{31} = 0b0;
-  let Inst{30} = q;
-  let Inst{29} = u;
+class BaseSIMDMixedTwoVectorTied<bit Q, bit U, bits<2> size, bits<5> opcode,
+                           RegisterOperand inreg, RegisterOperand outreg,
+                           string asm, string outkind, string inkind,
+                           list<dag> pattern>
+  : I<(outs outreg:$dst), (ins outreg:$Rd, inreg:$Rn), asm,
+      "{\t$Rd" # outkind # ", $Rn" # inkind #
+      "|" # outkind # "\t$Rd, $Rn}", "$Rd = $dst", pattern>,
+    Sched<[WriteV]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  let Inst{31}    = 0;
+  let Inst{30}    = Q;
+  let Inst{29}    = U;
   let Inst{28-24} = 0b01110;
   let Inst{23-22} = size;
-  let Inst{21} = 0b1;
-  // Inherit Rm in 20-16
-  let Inst{15-11} = opcode;
-  let Inst{10} = 0b1;
-  // Inherit Rn in 9-5
-  // Inherit Rd in 4-0
-}
-
-// Format AdvSIMD 3 vector registers with different vector type
-class NeonI_3VDiff<bit q, bit u, bits<2> size, bits<4> opcode,
-                   dag outs, dag ins, string asmstr,
-                   list<dag> patterns, InstrItinClass itin>
-  : A64InstRdnm<outs, ins, asmstr, patterns, itin> {
-  let Inst{31} = 0b0;
-  let Inst{30} = q;
-  let Inst{29} = u;
+  let Inst{21-17} = 0b10000;
+  let Inst{16-12} = opcode;
+  let Inst{11-10} = 0b10;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+multiclass SIMDMixedTwoVector<bit U, bits<5> opc, string asm,
+                              SDPatternOperator OpNode> {
+  def v8i8  : BaseSIMDMixedTwoVector<0, U, 0b00, opc, V128, V64,
+                                      asm, ".8b", ".8h",
+        [(set (v8i8 V64:$Rd), (OpNode (v8i16 V128:$Rn)))]>;
+  def v16i8 : BaseSIMDMixedTwoVectorTied<1, U, 0b00, opc, V128, V128,
+                                      asm#"2", ".16b", ".8h", []>;
+  def v4i16 : BaseSIMDMixedTwoVector<0, U, 0b01, opc, V128, V64,
+                                      asm, ".4h", ".4s",
+        [(set (v4i16 V64:$Rd), (OpNode (v4i32 V128:$Rn)))]>;
+  def v8i16 : BaseSIMDMixedTwoVectorTied<1, U, 0b01, opc, V128, V128,
+                                      asm#"2", ".8h", ".4s", []>;
+  def v2i32 : BaseSIMDMixedTwoVector<0, U, 0b10, opc, V128, V64,
+                                      asm, ".2s", ".2d",
+        [(set (v2i32 V64:$Rd), (OpNode (v2i64 V128:$Rn)))]>;
+  def v4i32 : BaseSIMDMixedTwoVectorTied<1, U, 0b10, opc, V128, V128,
+                                      asm#"2", ".4s", ".2d", []>;
+
+  def : Pat<(concat_vectors (v8i8 V64:$Rd), (OpNode (v8i16 V128:$Rn))),
+            (!cast<Instruction>(NAME # "v16i8")
+                (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>;
+  def : Pat<(concat_vectors (v4i16 V64:$Rd), (OpNode (v4i32 V128:$Rn))),
+            (!cast<Instruction>(NAME # "v8i16")
+                (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>;
+  def : Pat<(concat_vectors (v2i32 V64:$Rd), (OpNode (v2i64 V128:$Rn))),
+            (!cast<Instruction>(NAME # "v4i32")
+                (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>;
+}
+
+class BaseSIMDCmpTwoVector<bit Q, bit U, bits<2> size, bits<5> opcode,
+                           RegisterOperand regtype,
+                           string asm, string kind, string zero,
+                           ValueType dty, ValueType sty, SDNode OpNode>
+  : I<(outs regtype:$Rd), (ins regtype:$Rn), asm,
+      "{\t$Rd" # kind # ", $Rn" # kind # ", #" # zero #
+      "|" # kind # "\t$Rd, $Rn, #" # zero # "}", "",
+      [(set (dty regtype:$Rd), (OpNode (sty regtype:$Rn)))]>,
+    Sched<[WriteV]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  let Inst{31}    = 0;
+  let Inst{30}    = Q;
+  let Inst{29}    = U;
   let Inst{28-24} = 0b01110;
   let Inst{23-22} = size;
-  let Inst{21} = 0b1;
-  // Inherit Rm in 20-16
-  let Inst{15-12} = opcode;
-  let Inst{11} = 0b0;
-  let Inst{10} = 0b0;
-  // Inherit Rn in 9-5
-  // Inherit Rd in 4-0
-}
-
-// Format AdvSIMD two registers and an element
-class NeonI_2VElem<bit q, bit u, bits<2> size, bits<4> opcode,
-                   dag outs, dag ins, string asmstr,
-                   list<dag> patterns, InstrItinClass itin>
-  : A64InstRdnm<outs, ins, asmstr, patterns, itin> {
-  let Inst{31} = 0b0;
-  let Inst{30} = q;
-  let Inst{29} = u;
-  let Inst{28-24} = 0b01111;
+  let Inst{21-17} = 0b10000;
+  let Inst{16-12} = opcode;
+  let Inst{11-10} = 0b10;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+// Comparisons support all element sizes, except 1xD.
+multiclass SIMDCmpTwoVector<bit U, bits<5> opc, string asm,
+                            SDNode OpNode> {
+  def v8i8rz  : BaseSIMDCmpTwoVector<0, U, 0b00, opc, V64,
+                                     asm, ".8b", "0",
+                                     v8i8, v8i8, OpNode>;
+  def v16i8rz : BaseSIMDCmpTwoVector<1, U, 0b00, opc, V128,
+                                     asm, ".16b", "0",
+                                     v16i8, v16i8, OpNode>;
+  def v4i16rz : BaseSIMDCmpTwoVector<0, U, 0b01, opc, V64,
+                                     asm, ".4h", "0",
+                                     v4i16, v4i16, OpNode>;
+  def v8i16rz : BaseSIMDCmpTwoVector<1, U, 0b01, opc, V128,
+                                     asm, ".8h", "0",
+                                     v8i16, v8i16, OpNode>;
+  def v2i32rz : BaseSIMDCmpTwoVector<0, U, 0b10, opc, V64,
+                                     asm, ".2s", "0",
+                                     v2i32, v2i32, OpNode>;
+  def v4i32rz : BaseSIMDCmpTwoVector<1, U, 0b10, opc, V128,
+                                     asm, ".4s", "0",
+                                     v4i32, v4i32, OpNode>;
+  def v2i64rz : BaseSIMDCmpTwoVector<1, U, 0b11, opc, V128,
+                                     asm, ".2d", "0",
+                                     v2i64, v2i64, OpNode>;
+}
+
+// FP Comparisons support only S and D element sizes.
+multiclass SIMDFPCmpTwoVector<bit U, bit S, bits<5> opc,
+                              string asm, SDNode OpNode> {
+
+  def v2i32rz : BaseSIMDCmpTwoVector<0, U, {S,0}, opc, V64,
+                                     asm, ".2s", "0.0",
+                                     v2i32, v2f32, OpNode>;
+  def v4i32rz : BaseSIMDCmpTwoVector<1, U, {S,0}, opc, V128,
+                                     asm, ".4s", "0.0",
+                                     v4i32, v4f32, OpNode>;
+  def v2i64rz : BaseSIMDCmpTwoVector<1, U, {S,1}, opc, V128,
+                                     asm, ".2d", "0.0",
+                                     v2i64, v2f64, OpNode>;
+
+  def : InstAlias<asm # " $Vd.2s, $Vn.2s, #0",
+                  (!cast<Instruction>(NAME # v2i32rz) V64:$Vd, V64:$Vn), 0>;
+  def : InstAlias<asm # " $Vd.4s, $Vn.4s, #0",
+                  (!cast<Instruction>(NAME # v4i32rz) V128:$Vd, V128:$Vn), 0>;
+  def : InstAlias<asm # " $Vd.2d, $Vn.2d, #0",
+                  (!cast<Instruction>(NAME # v2i64rz) V128:$Vd, V128:$Vn), 0>;
+  def : InstAlias<asm # ".2s $Vd, $Vn, #0",
+                  (!cast<Instruction>(NAME # v2i32rz) V64:$Vd, V64:$Vn), 0>;
+  def : InstAlias<asm # ".4s $Vd, $Vn, #0",
+                  (!cast<Instruction>(NAME # v4i32rz) V128:$Vd, V128:$Vn), 0>;
+  def : InstAlias<asm # ".2d $Vd, $Vn, #0",
+                  (!cast<Instruction>(NAME # v2i64rz) V128:$Vd, V128:$Vn), 0>;
+}
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseSIMDFPCvtTwoVector<bit Q, bit U, bits<2> size, bits<5> opcode,
+                             RegisterOperand outtype, RegisterOperand intype,
+                             string asm, string VdTy, string VnTy,
+                             list<dag> pattern>
+  : I<(outs outtype:$Rd), (ins intype:$Rn), asm,
+      !strconcat("\t$Rd", VdTy, ", $Rn", VnTy), "", pattern>,
+    Sched<[WriteV]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  let Inst{31}    = 0;
+  let Inst{30}    = Q;
+  let Inst{29}    = U;
+  let Inst{28-24} = 0b01110;
+  let Inst{23-22} = size;
+  let Inst{21-17} = 0b10000;
+  let Inst{16-12} = opcode;
+  let Inst{11-10} = 0b10;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+class BaseSIMDFPCvtTwoVectorTied<bit Q, bit U, bits<2> size, bits<5> opcode,
+                             RegisterOperand outtype, RegisterOperand intype,
+                             string asm, string VdTy, string VnTy,
+                             list<dag> pattern>
+  : I<(outs outtype:$dst), (ins outtype:$Rd, intype:$Rn), asm,
+      !strconcat("\t$Rd", VdTy, ", $Rn", VnTy), "$Rd = $dst", pattern>,
+    Sched<[WriteV]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  let Inst{31}    = 0;
+  let Inst{30}    = Q;
+  let Inst{29}    = U;
+  let Inst{28-24} = 0b01110;
   let Inst{23-22} = size;
-  // l in Inst{21}
-  // m in Inst{20}
-  // Inherit Rm in 19-16
+  let Inst{21-17} = 0b10000;
+  let Inst{16-12} = opcode;
+  let Inst{11-10} = 0b10;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+multiclass SIMDFPWidenTwoVector<bit U, bit S, bits<5> opc, string asm> {
+  def v4i16 : BaseSIMDFPCvtTwoVector<0, U, {S,0}, opc, V128, V64,
+                                    asm, ".4s", ".4h", []>;
+  def v8i16 : BaseSIMDFPCvtTwoVector<1, U, {S,0}, opc, V128, V128,
+                                    asm#"2", ".4s", ".8h", []>;
+  def v2i32 : BaseSIMDFPCvtTwoVector<0, U, {S,1}, opc, V128, V64,
+                                    asm, ".2d", ".2s", []>;
+  def v4i32 : BaseSIMDFPCvtTwoVector<1, U, {S,1}, opc, V128, V128,
+                                    asm#"2", ".2d", ".4s", []>;
+}
+
+multiclass SIMDFPNarrowTwoVector<bit U, bit S, bits<5> opc, string asm> {
+  def v4i16 : BaseSIMDFPCvtTwoVector<0, U, {S,0}, opc, V64, V128,
+                                    asm, ".4h", ".4s", []>;
+  def v8i16 : BaseSIMDFPCvtTwoVectorTied<1, U, {S,0}, opc, V128, V128,
+                                    asm#"2", ".8h", ".4s", []>;
+  def v2i32 : BaseSIMDFPCvtTwoVector<0, U, {S,1}, opc, V64, V128,
+                                    asm, ".2s", ".2d", []>;
+  def v4i32 : BaseSIMDFPCvtTwoVectorTied<1, U, {S,1}, opc, V128, V128,
+                                    asm#"2", ".4s", ".2d", []>;
+}
+
+multiclass SIMDFPInexactCvtTwoVector<bit U, bit S, bits<5> opc, string asm,
+                                     Intrinsic OpNode> {
+  def v2f32 : BaseSIMDFPCvtTwoVector<0, U, {S,1}, opc, V64, V128,
+                                     asm, ".2s", ".2d",
+                          [(set (v2f32 V64:$Rd), (OpNode (v2f64 V128:$Rn)))]>;
+  def v4f32 : BaseSIMDFPCvtTwoVectorTied<1, U, {S,1}, opc, V128, V128,
+                                    asm#"2", ".4s", ".2d", []>;
+
+  def : Pat<(concat_vectors (v2f32 V64:$Rd), (OpNode (v2f64 V128:$Rn))),
+            (!cast<Instruction>(NAME # "v4f32")
+                (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>;
+}
+
+//----------------------------------------------------------------------------
+// AdvSIMD three register different-size vector instructions.
+//----------------------------------------------------------------------------
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseSIMDDifferentThreeVector<bit U, bits<3> size, bits<4> opcode,
+                      RegisterOperand outtype, RegisterOperand intype1,
+                      RegisterOperand intype2, string asm,
+                      string outkind, string inkind1, string inkind2,
+                      list<dag> pattern>
+  : I<(outs outtype:$Rd), (ins intype1:$Rn, intype2:$Rm), asm,
+      "{\t$Rd" # outkind # ", $Rn" # inkind1 # ", $Rm" # inkind2 #
+      "|" # outkind # "\t$Rd, $Rn, $Rm}", "", pattern>,
+    Sched<[WriteV]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  bits<5> Rm;
+  let Inst{31}    = 0;
+  let Inst{30}    = size{0};
+  let Inst{29}    = U;
+  let Inst{28-24} = 0b01110;
+  let Inst{23-22} = size{2-1};
+  let Inst{21}    = 1;
+  let Inst{20-16} = Rm;
   let Inst{15-12} = opcode;
-  // h in Inst{11}
-  let Inst{10} = 0b0;
-  // Inherit Rn in 9-5
-  // Inherit Rd in 4-0
-}
-
-// Format AdvSIMD 1 vector register with modified immediate
-class NeonI_1VModImm<bit q, bit op,
-                     dag outs, dag ins, string asmstr,
-                     list<dag> patterns, InstrItinClass itin>
-  : A64InstRd<outs,ins, asmstr, patterns, itin> {
-  bits<8> Imm;
-  bits<4> cmode;
-  let Inst{31} = 0b0;
-  let Inst{30} = q;
-  let Inst{29} = op;
-  let Inst{28-19} = 0b0111100000;
-  let Inst{15-12} = cmode;
-  let Inst{11} = 0b0; // o2
-  let Inst{10} = 1;
-  // Inherit Rd in 4-0
-  let Inst{18-16} = Imm{7-5}; // imm a:b:c
-  let Inst{9-5} = Imm{4-0};   // imm d:e:f:g:h
+  let Inst{11-10} = 0b00;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseSIMDDifferentThreeVectorTied<bit U, bits<3> size, bits<4> opcode,
+                      RegisterOperand outtype, RegisterOperand intype1,
+                      RegisterOperand intype2, string asm,
+                      string outkind, string inkind1, string inkind2,
+                      list<dag> pattern>
+  : I<(outs outtype:$dst), (ins outtype:$Rd, intype1:$Rn, intype2:$Rm), asm,
+      "{\t$Rd" # outkind # ", $Rn" # inkind1 # ", $Rm" # inkind2 #
+      "|" # outkind # "\t$Rd, $Rn, $Rm}", "$Rd = $dst", pattern>,
+    Sched<[WriteV]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  bits<5> Rm;
+  let Inst{31}    = 0;
+  let Inst{30}    = size{0};
+  let Inst{29}    = U;
+  let Inst{28-24} = 0b01110;
+  let Inst{23-22} = size{2-1};
+  let Inst{21}    = 1;
+  let Inst{20-16} = Rm;
+  let Inst{15-12} = opcode;
+  let Inst{11-10} = 0b00;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+// FIXME: TableGen doesn't know how to deal with expanded types that also
+//        change the element count (in this case, placing the results in
+//        the high elements of the result register rather than the low
+//        elements). Until that's fixed, we can't code-gen those.
+multiclass SIMDNarrowThreeVectorBHS<bit U, bits<4> opc, string asm,
+                                    Intrinsic IntOp> {
+  def v8i16_v8i8   : BaseSIMDDifferentThreeVector<U, 0b000, opc,
+                                                  V64, V128, V128,
+                                                  asm, ".8b", ".8h", ".8h",
+     [(set (v8i8 V64:$Rd), (IntOp (v8i16 V128:$Rn), (v8i16 V128:$Rm)))]>;
+  def v8i16_v16i8  : BaseSIMDDifferentThreeVectorTied<U, 0b001, opc,
+                                                  V128, V128, V128,
+                                                  asm#"2", ".16b", ".8h", ".8h",
+     []>;
+  def v4i32_v4i16  : BaseSIMDDifferentThreeVector<U, 0b010, opc,
+                                                  V64, V128, V128,
+                                                  asm, ".4h", ".4s", ".4s",
+     [(set (v4i16 V64:$Rd), (IntOp (v4i32 V128:$Rn), (v4i32 V128:$Rm)))]>;
+  def v4i32_v8i16  : BaseSIMDDifferentThreeVectorTied<U, 0b011, opc,
+                                                  V128, V128, V128,
+                                                  asm#"2", ".8h", ".4s", ".4s",
+     []>;
+  def v2i64_v2i32  : BaseSIMDDifferentThreeVector<U, 0b100, opc,
+                                                  V64, V128, V128,
+                                                  asm, ".2s", ".2d", ".2d",
+     [(set (v2i32 V64:$Rd), (IntOp (v2i64 V128:$Rn), (v2i64 V128:$Rm)))]>;
+  def v2i64_v4i32  : BaseSIMDDifferentThreeVectorTied<U, 0b101, opc,
+                                                  V128, V128, V128,
+                                                  asm#"2", ".4s", ".2d", ".2d",
+     []>;
+
+
+  // Patterns for the '2' variants involve INSERT_SUBREG, which you can't put in
+  // a version attached to an instruction.
+  def : Pat<(concat_vectors (v8i8 V64:$Rd), (IntOp (v8i16 V128:$Rn),
+                                                   (v8i16 V128:$Rm))),
+            (!cast<Instruction>(NAME # "v8i16_v16i8")
+                (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub),
+                V128:$Rn, V128:$Rm)>;
+  def : Pat<(concat_vectors (v4i16 V64:$Rd), (IntOp (v4i32 V128:$Rn),
+                                                    (v4i32 V128:$Rm))),
+            (!cast<Instruction>(NAME # "v4i32_v8i16")
+                (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub),
+                V128:$Rn, V128:$Rm)>;
+  def : Pat<(concat_vectors (v2i32 V64:$Rd), (IntOp (v2i64 V128:$Rn),
+                                                    (v2i64 V128:$Rm))),
+            (!cast<Instruction>(NAME # "v2i64_v4i32")
+                (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub),
+                V128:$Rn, V128:$Rm)>;
+}
+
+multiclass SIMDDifferentThreeVectorBD<bit U, bits<4> opc, string asm,
+                                      Intrinsic IntOp> {
+  def v8i8   : BaseSIMDDifferentThreeVector<U, 0b000, opc,
+                                            V128, V64, V64,
+                                            asm, ".8h", ".8b", ".8b",
+      [(set (v8i16 V128:$Rd), (IntOp (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>;
+  def v16i8  : BaseSIMDDifferentThreeVector<U, 0b001, opc,
+                                            V128, V128, V128,
+                                            asm#"2", ".8h", ".16b", ".16b", []>;
+  let Predicates = [HasCrypto] in {
+    def v1i64  : BaseSIMDDifferentThreeVector<U, 0b110, opc,
+                                              V128, V64, V64,
+                                              asm, ".1q", ".1d", ".1d", []>;
+    def v2i64  : BaseSIMDDifferentThreeVector<U, 0b111, opc,
+                                              V128, V128, V128,
+                                              asm#"2", ".1q", ".2d", ".2d", []>;
+  }
+
+  def : Pat<(v8i16 (IntOp (v8i8 (extract_high_v16i8 V128:$Rn)),
+                          (v8i8 (extract_high_v16i8 V128:$Rm)))),
+      (!cast<Instruction>(NAME#"v16i8") V128:$Rn, V128:$Rm)>;
+}
+
+multiclass SIMDLongThreeVectorHS<bit U, bits<4> opc, string asm,
+                                 SDPatternOperator OpNode> {
+  def v4i16_v4i32  : BaseSIMDDifferentThreeVector<U, 0b010, opc,
+                                                  V128, V64, V64,
+                                                  asm, ".4s", ".4h", ".4h",
+      [(set (v4i32 V128:$Rd), (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>;
+  def v8i16_v4i32  : BaseSIMDDifferentThreeVector<U, 0b011, opc,
+                                                  V128, V128, V128,
+                                                  asm#"2", ".4s", ".8h", ".8h",
+      [(set (v4i32 V128:$Rd), (OpNode (extract_high_v8i16 V128:$Rn),
+                                      (extract_high_v8i16 V128:$Rm)))]>;
+  def v2i32_v2i64  : BaseSIMDDifferentThreeVector<U, 0b100, opc,
+                                                  V128, V64, V64,
+                                                  asm, ".2d", ".2s", ".2s",
+      [(set (v2i64 V128:$Rd), (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>;
+  def v4i32_v2i64  : BaseSIMDDifferentThreeVector<U, 0b101, opc,
+                                                  V128, V128, V128,
+                                                  asm#"2", ".2d", ".4s", ".4s",
+      [(set (v2i64 V128:$Rd), (OpNode (extract_high_v4i32 V128:$Rn),
+                                      (extract_high_v4i32 V128:$Rm)))]>;
+}
+
+multiclass SIMDLongThreeVectorBHSabdl<bit U, bits<4> opc, string asm,
+                                  SDPatternOperator OpNode = null_frag> {
+  def v8i8_v8i16   : BaseSIMDDifferentThreeVector<U, 0b000, opc,
+                                                  V128, V64, V64,
+                                                  asm, ".8h", ".8b", ".8b",
+      [(set (v8i16 V128:$Rd),
+            (zext (v8i8 (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm)))))]>;
+  def v16i8_v8i16  : BaseSIMDDifferentThreeVector<U, 0b001, opc,
+                                                 V128, V128, V128,
+                                                 asm#"2", ".8h", ".16b", ".16b",
+      [(set (v8i16 V128:$Rd),
+            (zext (v8i8 (OpNode (extract_high_v16i8 V128:$Rn),
+                                (extract_high_v16i8 V128:$Rm)))))]>;
+  def v4i16_v4i32  : BaseSIMDDifferentThreeVector<U, 0b010, opc,
+                                                  V128, V64, V64,
+                                                  asm, ".4s", ".4h", ".4h",
+      [(set (v4i32 V128:$Rd),
+            (zext (v4i16 (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm)))))]>;
+  def v8i16_v4i32  : BaseSIMDDifferentThreeVector<U, 0b011, opc,
+                                                  V128, V128, V128,
+                                                  asm#"2", ".4s", ".8h", ".8h",
+      [(set (v4i32 V128:$Rd),
+            (zext (v4i16 (OpNode (extract_high_v8i16 V128:$Rn),
+                                  (extract_high_v8i16 V128:$Rm)))))]>;
+  def v2i32_v2i64  : BaseSIMDDifferentThreeVector<U, 0b100, opc,
+                                                  V128, V64, V64,
+                                                  asm, ".2d", ".2s", ".2s",
+      [(set (v2i64 V128:$Rd),
+            (zext (v2i32 (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm)))))]>;
+  def v4i32_v2i64  : BaseSIMDDifferentThreeVector<U, 0b101, opc,
+                                                  V128, V128, V128,
+                                                  asm#"2", ".2d", ".4s", ".4s",
+      [(set (v2i64 V128:$Rd),
+            (zext (v2i32 (OpNode (extract_high_v4i32 V128:$Rn),
+                                 (extract_high_v4i32 V128:$Rm)))))]>;
+}
+
+multiclass SIMDLongThreeVectorTiedBHSabal<bit U, bits<4> opc,
+                                          string asm,
+                                          SDPatternOperator OpNode> {
+  def v8i8_v8i16   : BaseSIMDDifferentThreeVectorTied<U, 0b000, opc,
+                                                  V128, V64, V64,
+                                                  asm, ".8h", ".8b", ".8b",
+    [(set (v8i16 V128:$dst),
+          (add (v8i16 V128:$Rd),
+               (zext (v8i8 (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm))))))]>;
+  def v16i8_v8i16  : BaseSIMDDifferentThreeVectorTied<U, 0b001, opc,
+                                                 V128, V128, V128,
+                                                 asm#"2", ".8h", ".16b", ".16b",
+    [(set (v8i16 V128:$dst),
+          (add (v8i16 V128:$Rd),
+               (zext (v8i8 (OpNode (extract_high_v16i8 V128:$Rn),
+                                   (extract_high_v16i8 V128:$Rm))))))]>;
+  def v4i16_v4i32  : BaseSIMDDifferentThreeVectorTied<U, 0b010, opc,
+                                                  V128, V64, V64,
+                                                  asm, ".4s", ".4h", ".4h",
+    [(set (v4i32 V128:$dst),
+          (add (v4i32 V128:$Rd),
+               (zext (v4i16 (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm))))))]>;
+  def v8i16_v4i32  : BaseSIMDDifferentThreeVectorTied<U, 0b011, opc,
+                                                  V128, V128, V128,
+                                                  asm#"2", ".4s", ".8h", ".8h",
+    [(set (v4i32 V128:$dst),
+          (add (v4i32 V128:$Rd),
+               (zext (v4i16 (OpNode (extract_high_v8i16 V128:$Rn),
+                                    (extract_high_v8i16 V128:$Rm))))))]>;
+  def v2i32_v2i64  : BaseSIMDDifferentThreeVectorTied<U, 0b100, opc,
+                                                  V128, V64, V64,
+                                                  asm, ".2d", ".2s", ".2s",
+    [(set (v2i64 V128:$dst),
+          (add (v2i64 V128:$Rd),
+               (zext (v2i32 (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm))))))]>;
+  def v4i32_v2i64  : BaseSIMDDifferentThreeVectorTied<U, 0b101, opc,
+                                                  V128, V128, V128,
+                                                  asm#"2", ".2d", ".4s", ".4s",
+    [(set (v2i64 V128:$dst),
+          (add (v2i64 V128:$Rd),
+               (zext (v2i32 (OpNode (extract_high_v4i32 V128:$Rn),
+                                    (extract_high_v4i32 V128:$Rm))))))]>;
+}
+
+multiclass SIMDLongThreeVectorBHS<bit U, bits<4> opc, string asm,
+                                  SDPatternOperator OpNode = null_frag> {
+  def v8i8_v8i16   : BaseSIMDDifferentThreeVector<U, 0b000, opc,
+                                                  V128, V64, V64,
+                                                  asm, ".8h", ".8b", ".8b",
+      [(set (v8i16 V128:$Rd), (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>;
+  def v16i8_v8i16  : BaseSIMDDifferentThreeVector<U, 0b001, opc,
+                                                 V128, V128, V128,
+                                                 asm#"2", ".8h", ".16b", ".16b",
+      [(set (v8i16 V128:$Rd), (OpNode (extract_high_v16i8 V128:$Rn),
+                                      (extract_high_v16i8 V128:$Rm)))]>;
+  def v4i16_v4i32  : BaseSIMDDifferentThreeVector<U, 0b010, opc,
+                                                  V128, V64, V64,
+                                                  asm, ".4s", ".4h", ".4h",
+      [(set (v4i32 V128:$Rd), (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>;
+  def v8i16_v4i32  : BaseSIMDDifferentThreeVector<U, 0b011, opc,
+                                                  V128, V128, V128,
+                                                  asm#"2", ".4s", ".8h", ".8h",
+      [(set (v4i32 V128:$Rd), (OpNode (extract_high_v8i16 V128:$Rn),
+                                      (extract_high_v8i16 V128:$Rm)))]>;
+  def v2i32_v2i64  : BaseSIMDDifferentThreeVector<U, 0b100, opc,
+                                                  V128, V64, V64,
+                                                  asm, ".2d", ".2s", ".2s",
+      [(set (v2i64 V128:$Rd), (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>;
+  def v4i32_v2i64  : BaseSIMDDifferentThreeVector<U, 0b101, opc,
+                                                  V128, V128, V128,
+                                                  asm#"2", ".2d", ".4s", ".4s",
+      [(set (v2i64 V128:$Rd), (OpNode (extract_high_v4i32 V128:$Rn),
+                                      (extract_high_v4i32 V128:$Rm)))]>;
+}
+
+multiclass SIMDLongThreeVectorTiedBHS<bit U, bits<4> opc,
+                                      string asm,
+                                      SDPatternOperator OpNode> {
+  def v8i8_v8i16   : BaseSIMDDifferentThreeVectorTied<U, 0b000, opc,
+                                                  V128, V64, V64,
+                                                  asm, ".8h", ".8b", ".8b",
+    [(set (v8i16 V128:$dst),
+          (OpNode (v8i16 V128:$Rd), (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>;
+  def v16i8_v8i16  : BaseSIMDDifferentThreeVectorTied<U, 0b001, opc,
+                                                 V128, V128, V128,
+                                                 asm#"2", ".8h", ".16b", ".16b",
+    [(set (v8i16 V128:$dst),
+          (OpNode (v8i16 V128:$Rd),
+                  (extract_high_v16i8 V128:$Rn),
+                  (extract_high_v16i8 V128:$Rm)))]>;
+  def v4i16_v4i32  : BaseSIMDDifferentThreeVectorTied<U, 0b010, opc,
+                                                  V128, V64, V64,
+                                                  asm, ".4s", ".4h", ".4h",
+    [(set (v4i32 V128:$dst),
+          (OpNode (v4i32 V128:$Rd), (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>;
+  def v8i16_v4i32  : BaseSIMDDifferentThreeVectorTied<U, 0b011, opc,
+                                                  V128, V128, V128,
+                                                  asm#"2", ".4s", ".8h", ".8h",
+    [(set (v4i32 V128:$dst),
+          (OpNode (v4i32 V128:$Rd),
+                  (extract_high_v8i16 V128:$Rn),
+                  (extract_high_v8i16 V128:$Rm)))]>;
+  def v2i32_v2i64  : BaseSIMDDifferentThreeVectorTied<U, 0b100, opc,
+                                                  V128, V64, V64,
+                                                  asm, ".2d", ".2s", ".2s",
+    [(set (v2i64 V128:$dst),
+          (OpNode (v2i64 V128:$Rd), (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>;
+  def v4i32_v2i64  : BaseSIMDDifferentThreeVectorTied<U, 0b101, opc,
+                                                  V128, V128, V128,
+                                                  asm#"2", ".2d", ".4s", ".4s",
+    [(set (v2i64 V128:$dst),
+          (OpNode (v2i64 V128:$Rd),
+                  (extract_high_v4i32 V128:$Rn),
+                  (extract_high_v4i32 V128:$Rm)))]>;
+}
+
+multiclass SIMDLongThreeVectorSQDMLXTiedHS<bit U, bits<4> opc, string asm,
+                                           SDPatternOperator Accum> {
+  def v4i16_v4i32  : BaseSIMDDifferentThreeVectorTied<U, 0b010, opc,
+                                                  V128, V64, V64,
+                                                  asm, ".4s", ".4h", ".4h",
+    [(set (v4i32 V128:$dst),
+          (Accum (v4i32 V128:$Rd),
+                 (v4i32 (int_aarch64_neon_sqdmull (v4i16 V64:$Rn),
+                                                (v4i16 V64:$Rm)))))]>;
+  def v8i16_v4i32  : BaseSIMDDifferentThreeVectorTied<U, 0b011, opc,
+                                                  V128, V128, V128,
+                                                  asm#"2", ".4s", ".8h", ".8h",
+    [(set (v4i32 V128:$dst),
+          (Accum (v4i32 V128:$Rd),
+                 (v4i32 (int_aarch64_neon_sqdmull (extract_high_v8i16 V128:$Rn),
+                                            (extract_high_v8i16 V128:$Rm)))))]>;
+  def v2i32_v2i64  : BaseSIMDDifferentThreeVectorTied<U, 0b100, opc,
+                                                  V128, V64, V64,
+                                                  asm, ".2d", ".2s", ".2s",
+    [(set (v2i64 V128:$dst),
+          (Accum (v2i64 V128:$Rd),
+                 (v2i64 (int_aarch64_neon_sqdmull (v2i32 V64:$Rn),
+                                                (v2i32 V64:$Rm)))))]>;
+  def v4i32_v2i64  : BaseSIMDDifferentThreeVectorTied<U, 0b101, opc,
+                                                  V128, V128, V128,
+                                                  asm#"2", ".2d", ".4s", ".4s",
+    [(set (v2i64 V128:$dst),
+          (Accum (v2i64 V128:$Rd),
+                 (v2i64 (int_aarch64_neon_sqdmull (extract_high_v4i32 V128:$Rn),
+                                            (extract_high_v4i32 V128:$Rm)))))]>;
+}
+
+multiclass SIMDWideThreeVectorBHS<bit U, bits<4> opc, string asm,
+                                  SDPatternOperator OpNode> {
+  def v8i8_v8i16   : BaseSIMDDifferentThreeVector<U, 0b000, opc,
+                                                  V128, V128, V64,
+                                                  asm, ".8h", ".8h", ".8b",
+       [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn), (v8i8 V64:$Rm)))]>;
+  def v16i8_v8i16  : BaseSIMDDifferentThreeVector<U, 0b001, opc,
+                                                  V128, V128, V128,
+                                                  asm#"2", ".8h", ".8h", ".16b",
+       [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn),
+                                       (extract_high_v16i8 V128:$Rm)))]>;
+  def v4i16_v4i32  : BaseSIMDDifferentThreeVector<U, 0b010, opc,
+                                                  V128, V128, V64,
+                                                  asm, ".4s", ".4s", ".4h",
+       [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn), (v4i16 V64:$Rm)))]>;
+  def v8i16_v4i32  : BaseSIMDDifferentThreeVector<U, 0b011, opc,
+                                                  V128, V128, V128,
+                                                  asm#"2", ".4s", ".4s", ".8h",
+       [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn),
+                                       (extract_high_v8i16 V128:$Rm)))]>;
+  def v2i32_v2i64  : BaseSIMDDifferentThreeVector<U, 0b100, opc,
+                                                  V128, V128, V64,
+                                                  asm, ".2d", ".2d", ".2s",
+       [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn), (v2i32 V64:$Rm)))]>;
+  def v4i32_v2i64  : BaseSIMDDifferentThreeVector<U, 0b101, opc,
+                                                  V128, V128, V128,
+                                                  asm#"2", ".2d", ".2d", ".4s",
+       [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn),
+                                       (extract_high_v4i32 V128:$Rm)))]>;
+}
+
+//----------------------------------------------------------------------------
+// AdvSIMD bitwise extract from vector
+//----------------------------------------------------------------------------
+
+class BaseSIMDBitwiseExtract<bit size, RegisterOperand regtype, ValueType vty,
+                             string asm, string kind>
+  : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, i32imm:$imm), asm,
+      "{\t$Rd" # kind # ", $Rn" # kind # ", $Rm" # kind # ", $imm" #
+      "|" # kind # "\t$Rd, $Rn, $Rm, $imm}", "",
+      [(set (vty regtype:$Rd),
+            (AArch64ext regtype:$Rn, regtype:$Rm, (i32 imm:$imm)))]>,
+    Sched<[WriteV]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  bits<5> Rm;
+  bits<4> imm;
+  let Inst{31}    = 0;
+  let Inst{30}    = size;
+  let Inst{29-21} = 0b101110000;
+  let Inst{20-16} = Rm;
+  let Inst{15}    = 0;
+  let Inst{14-11} = imm;
+  let Inst{10}    = 0;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+
+multiclass SIMDBitwiseExtract<string asm> {
+  def v8i8  : BaseSIMDBitwiseExtract<0, V64, v8i8, asm, ".8b"> {
+    let imm{3} = 0;
+  }
+  def v16i8 : BaseSIMDBitwiseExtract<1, V128, v16i8, asm, ".16b">;
+}
+
+//----------------------------------------------------------------------------
+// AdvSIMD zip vector
+//----------------------------------------------------------------------------
+
+class BaseSIMDZipVector<bits<3> size, bits<3> opc, RegisterOperand regtype,
+                        string asm, string kind, SDNode OpNode, ValueType valty>
+  : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), asm,
+      "{\t$Rd" # kind # ", $Rn" # kind # ", $Rm" # kind #
+      "|" # kind # "\t$Rd, $Rn, $Rm}", "",
+      [(set (valty regtype:$Rd), (OpNode regtype:$Rn, regtype:$Rm))]>,
+    Sched<[WriteV]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  bits<5> Rm;
+  let Inst{31}    = 0;
+  let Inst{30}    = size{0};
+  let Inst{29-24} = 0b001110;
+  let Inst{23-22} = size{2-1};
+  let Inst{21}    = 0;
+  let Inst{20-16} = Rm;
+  let Inst{15}    = 0;
+  let Inst{14-12} = opc;
+  let Inst{11-10} = 0b10;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
 }
 
-// Format AdvSIMD 3 scalar registers with same type
+multiclass SIMDZipVector<bits<3>opc, string asm,
+                         SDNode OpNode> {
+  def v8i8   : BaseSIMDZipVector<0b000, opc, V64,
+      asm, ".8b", OpNode, v8i8>;
+  def v16i8  : BaseSIMDZipVector<0b001, opc, V128,
+      asm, ".16b", OpNode, v16i8>;
+  def v4i16  : BaseSIMDZipVector<0b010, opc, V64,
+      asm, ".4h", OpNode, v4i16>;
+  def v8i16  : BaseSIMDZipVector<0b011, opc, V128,
+      asm, ".8h", OpNode, v8i16>;
+  def v2i32  : BaseSIMDZipVector<0b100, opc, V64,
+      asm, ".2s", OpNode, v2i32>;
+  def v4i32  : BaseSIMDZipVector<0b101, opc, V128,
+      asm, ".4s", OpNode, v4i32>;
+  def v2i64  : BaseSIMDZipVector<0b111, opc, V128,
+      asm, ".2d", OpNode, v2i64>;
+
+  def : Pat<(v2f32 (OpNode V64:$Rn, V64:$Rm)),
+        (!cast<Instruction>(NAME#"v2i32") V64:$Rn, V64:$Rm)>;
+  def : Pat<(v4f32 (OpNode V128:$Rn, V128:$Rm)),
+        (!cast<Instruction>(NAME#"v4i32") V128:$Rn, V128:$Rm)>;
+  def : Pat<(v2f64 (OpNode V128:$Rn, V128:$Rm)),
+        (!cast<Instruction>(NAME#"v2i64") V128:$Rn, V128:$Rm)>;
+}
 
-class NeonI_Scalar3Same<bit u, bits<2> size, bits<5> opcode,
-                          dag outs, dag ins, string asmstr,
-                          list<dag> patterns, InstrItinClass itin>
-  : A64InstRdnm<outs, ins, asmstr, patterns, itin> {
-  let Inst{31} = 0b0;
-  let Inst{30} = 0b1;
-  let Inst{29} = u;
+//----------------------------------------------------------------------------
+// AdvSIMD three register scalar instructions
+//----------------------------------------------------------------------------
+
+let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in
+class BaseSIMDThreeScalar<bit U, bits<2> size, bits<5> opcode,
+                        RegisterClass regtype, string asm,
+                        list<dag> pattern>
+  : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), asm,
+      "\t$Rd, $Rn, $Rm", "", pattern>,
+    Sched<[WriteV]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  bits<5> Rm;
+  let Inst{31-30} = 0b01;
+  let Inst{29}    = U;
   let Inst{28-24} = 0b11110;
   let Inst{23-22} = size;
-  let Inst{21} = 0b1;
-  // Inherit Rm in 20-16
+  let Inst{21}    = 1;
+  let Inst{20-16} = Rm;
   let Inst{15-11} = opcode;
-  let Inst{10} = 0b1;
-  // Inherit Rn in 9-5
-  // Inherit Rd in 4-0
+  let Inst{10}    = 1;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
 }
 
+multiclass SIMDThreeScalarD<bit U, bits<5> opc, string asm,
+                            SDPatternOperator OpNode> {
+  def v1i64  : BaseSIMDThreeScalar<U, 0b11, opc, FPR64, asm,
+    [(set (v1i64 FPR64:$Rd), (OpNode (v1i64 FPR64:$Rn), (v1i64 FPR64:$Rm)))]>;
+}
 
-// Format AdvSIMD 2 vector registers miscellaneous
-class NeonI_2VMisc<bit q, bit u, bits<2> size, bits<5> opcode,
-                   dag outs, dag ins, string asmstr,
-                   list<dag> patterns, InstrItinClass itin>
-  : A64InstRdn<outs, ins, asmstr, patterns, itin> {
-  let Inst{31} = 0b0;
-  let Inst{30} = q;
-  let Inst{29} = u;
-  let Inst{28-24} = 0b01110;
+multiclass SIMDThreeScalarBHSD<bit U, bits<5> opc, string asm,
+                               SDPatternOperator OpNode> {
+  def v1i64  : BaseSIMDThreeScalar<U, 0b11, opc, FPR64, asm,
+    [(set (v1i64 FPR64:$Rd), (OpNode (v1i64 FPR64:$Rn), (v1i64 FPR64:$Rm)))]>;
+  def v1i32  : BaseSIMDThreeScalar<U, 0b10, opc, FPR32, asm, []>;
+  def v1i16  : BaseSIMDThreeScalar<U, 0b01, opc, FPR16, asm, []>;
+  def v1i8   : BaseSIMDThreeScalar<U, 0b00, opc, FPR8 , asm, []>;
+
+  def : Pat<(i64 (OpNode (i64 FPR64:$Rn), (i64 FPR64:$Rm))),
+            (!cast<Instruction>(NAME#"v1i64") FPR64:$Rn, FPR64:$Rm)>;
+  def : Pat<(i32 (OpNode (i32 FPR32:$Rn), (i32 FPR32:$Rm))),
+            (!cast<Instruction>(NAME#"v1i32") FPR32:$Rn, FPR32:$Rm)>;
+}
+
+multiclass SIMDThreeScalarHS<bit U, bits<5> opc, string asm,
+                             SDPatternOperator OpNode> {
+  def v1i32  : BaseSIMDThreeScalar<U, 0b10, opc, FPR32, asm,
+                             [(set FPR32:$Rd, (OpNode FPR32:$Rn, FPR32:$Rm))]>;
+  def v1i16  : BaseSIMDThreeScalar<U, 0b01, opc, FPR16, asm, []>;
+}
+
+multiclass SIMDThreeScalarSD<bit U, bit S, bits<5> opc, string asm,
+                             SDPatternOperator OpNode = null_frag> {
+  let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
+    def #NAME#64 : BaseSIMDThreeScalar<U, {S,1}, opc, FPR64, asm,
+      [(set (f64 FPR64:$Rd), (OpNode (f64 FPR64:$Rn), (f64 FPR64:$Rm)))]>;
+    def #NAME#32 : BaseSIMDThreeScalar<U, {S,0}, opc, FPR32, asm,
+      [(set FPR32:$Rd, (OpNode FPR32:$Rn, FPR32:$Rm))]>;
+  }
+
+  def : Pat<(v1f64 (OpNode (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
+            (!cast<Instruction>(NAME # "64") FPR64:$Rn, FPR64:$Rm)>;
+}
+
+multiclass SIMDThreeScalarFPCmp<bit U, bit S, bits<5> opc, string asm,
+                                SDPatternOperator OpNode = null_frag> {
+  let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
+    def #NAME#64 : BaseSIMDThreeScalar<U, {S,1}, opc, FPR64, asm,
+      [(set (i64 FPR64:$Rd), (OpNode (f64 FPR64:$Rn), (f64 FPR64:$Rm)))]>;
+    def #NAME#32 : BaseSIMDThreeScalar<U, {S,0}, opc, FPR32, asm,
+      [(set (i32 FPR32:$Rd), (OpNode (f32 FPR32:$Rn), (f32 FPR32:$Rm)))]>;
+  }
+
+  def : Pat<(v1i64 (OpNode (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
+            (!cast<Instruction>(NAME # "64") FPR64:$Rn, FPR64:$Rm)>;
+}
+
+class BaseSIMDThreeScalarMixed<bit U, bits<2> size, bits<5> opcode,
+              dag oops, dag iops, string asm, string cstr, list<dag> pat>
+  : I<oops, iops, asm,
+      "\t$Rd, $Rn, $Rm", cstr, pat>,
+    Sched<[WriteV]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  bits<5> Rm;
+  let Inst{31-30} = 0b01;
+  let Inst{29}    = U;
+  let Inst{28-24} = 0b11110;
+  let Inst{23-22} = size;
+  let Inst{21}    = 1;
+  let Inst{20-16} = Rm;
+  let Inst{15-11} = opcode;
+  let Inst{10}    = 0;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+multiclass SIMDThreeScalarMixedHS<bit U, bits<5> opc, string asm,
+                                  SDPatternOperator OpNode = null_frag> {
+  def i16  : BaseSIMDThreeScalarMixed<U, 0b01, opc,
+                                      (outs FPR32:$Rd),
+                                      (ins FPR16:$Rn, FPR16:$Rm), asm, "", []>;
+  def i32  : BaseSIMDThreeScalarMixed<U, 0b10, opc,
+                                      (outs FPR64:$Rd),
+                                      (ins FPR32:$Rn, FPR32:$Rm), asm, "",
+            [(set (i64 FPR64:$Rd), (OpNode (i32 FPR32:$Rn), (i32 FPR32:$Rm)))]>;
+}
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+multiclass SIMDThreeScalarMixedTiedHS<bit U, bits<5> opc, string asm,
+                                  SDPatternOperator OpNode = null_frag> {
+  def i16  : BaseSIMDThreeScalarMixed<U, 0b01, opc,
+                                      (outs FPR32:$dst),
+                                      (ins FPR32:$Rd, FPR16:$Rn, FPR16:$Rm),
+                                      asm, "$Rd = $dst", []>;
+  def i32  : BaseSIMDThreeScalarMixed<U, 0b10, opc,
+                                      (outs FPR64:$dst),
+                                      (ins FPR64:$Rd, FPR32:$Rn, FPR32:$Rm),
+                                      asm, "$Rd = $dst",
+            [(set (i64 FPR64:$dst),
+                  (OpNode (i64 FPR64:$Rd), (i32 FPR32:$Rn), (i32 FPR32:$Rm)))]>;
+}
+
+//----------------------------------------------------------------------------
+// AdvSIMD two register scalar instructions
+//----------------------------------------------------------------------------
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseSIMDTwoScalar<bit U, bits<2> size, bits<5> opcode,
+                        RegisterClass regtype, RegisterClass regtype2,
+                        string asm, list<dag> pat>
+  : I<(outs regtype:$Rd), (ins regtype2:$Rn), asm,
+      "\t$Rd, $Rn", "", pat>,
+    Sched<[WriteV]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  let Inst{31-30} = 0b01;
+  let Inst{29}    = U;
+  let Inst{28-24} = 0b11110;
   let Inst{23-22} = size;
   let Inst{21-17} = 0b10000;
   let Inst{16-12} = opcode;
   let Inst{11-10} = 0b10;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
 
-  // Inherit Rn in 9-5
-  // Inherit Rd in 4-0
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseSIMDTwoScalarTied<bit U, bits<2> size, bits<5> opcode,
+                        RegisterClass regtype, RegisterClass regtype2,
+                        string asm, list<dag> pat>
+  : I<(outs regtype:$dst), (ins regtype:$Rd, regtype2:$Rn), asm,
+      "\t$Rd, $Rn", "$Rd = $dst", pat>,
+    Sched<[WriteV]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  let Inst{31-30} = 0b01;
+  let Inst{29}    = U;
+  let Inst{28-24} = 0b11110;
+  let Inst{23-22} = size;
+  let Inst{21-17} = 0b10000;
+  let Inst{16-12} = opcode;
+  let Inst{11-10} = 0b10;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
 }
 
-// Format AdvSIMD 2 vector 1 immediate shift
-class NeonI_2VShiftImm<bit q, bit u, bits<5> opcode,
-                       dag outs, dag ins, string asmstr,
-                       list<dag> patterns, InstrItinClass itin>
-  : A64InstRdn<outs, ins, asmstr, patterns, itin> {
-  bits<7> Imm;
-  let Inst{31} = 0b0;
-  let Inst{30} = q;
-  let Inst{29} = u;
-  let Inst{28-23} = 0b011110;
-  let Inst{22-16} = Imm;
-  let Inst{15-11} = opcode;
-  let Inst{10} = 0b1;
-  
-  // Inherit Rn in 9-5
-  // Inherit Rd in 4-0
-}
-
-// Format AdvSIMD duplicate and insert
-class NeonI_copy<bit q, bit op, bits<4> imm4,
-                 dag outs, dag ins, string asmstr,
-                 list<dag> patterns, InstrItinClass itin>
-  : A64InstRdn<outs, ins, asmstr, patterns, itin> {
-  bits<5> Imm5;
-  let Inst{31} = 0b0;
-  let Inst{30} = q;
-  let Inst{29} = op;
-  let Inst{28-21} = 0b01110000;
-  let Inst{20-16} = Imm5;
-  let Inst{15} = 0b0;
-  let Inst{14-11} = imm4;
-  let Inst{10} = 0b1;
-  
-  // Inherit Rn in 9-5
-  // Inherit Rd in 4-0
-}
-// Format AdvSIMD insert from element to vector
-class NeonI_insert<bit q, bit op,
-                  dag outs, dag ins, string asmstr,
-                  list<dag> patterns, InstrItinClass itin>
-  : A64InstRdn<outs, ins, asmstr, patterns, itin> {
-  bits<5> Imm5;
-  bits<4> Imm4;
-  let Inst{31} = 0b0;
-  let Inst{30} = q;
-  let Inst{29} = op;
-  let Inst{28-21} = 0b01110000;
-  let Inst{20-16} = Imm5;
-  let Inst{15} = 0b0;
-  let Inst{14-11} = Imm4;
-  let Inst{10} = 0b1;
-  
-  // Inherit Rn in 9-5
-  // Inherit Rd in 4-0
-}
-
-// Format AdvSIMD scalar pairwise
-class NeonI_ScalarPair<bit u, bits<2> size, bits<5> opcode,
-                          dag outs, dag ins, string asmstr,
-                          list<dag> patterns, InstrItinClass itin>
-  : A64InstRdn<outs, ins, asmstr, patterns, itin> {
-  let Inst{31} = 0b0;
-  let Inst{30} = 0b1;
-  let Inst{29} = u;
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseSIMDCmpTwoScalar<bit U, bits<2> size, bits<5> opcode,
+                        RegisterClass regtype, string asm, string zero>
+  : I<(outs regtype:$Rd), (ins regtype:$Rn), asm,
+      "\t$Rd, $Rn, #" # zero, "", []>,
+    Sched<[WriteV]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  let Inst{31-30} = 0b01;
+  let Inst{29}    = U;
   let Inst{28-24} = 0b11110;
   let Inst{23-22} = size;
-  let Inst{21-17} = 0b11000;
+  let Inst{21-17} = 0b10000;
+  let Inst{16-12} = opcode;
+  let Inst{11-10} = 0b10;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+class SIMDInexactCvtTwoScalar<bits<5> opcode, string asm>
+  : I<(outs FPR32:$Rd), (ins FPR64:$Rn), asm, "\t$Rd, $Rn", "",
+     [(set (f32 FPR32:$Rd), (int_aarch64_sisd_fcvtxn (f64 FPR64:$Rn)))]>,
+    Sched<[WriteV]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  let Inst{31-17} = 0b011111100110000;
   let Inst{16-12} = opcode;
   let Inst{11-10} = 0b10;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+multiclass SIMDCmpTwoScalarD<bit U, bits<5> opc, string asm,
+                             SDPatternOperator OpNode> {
+  def v1i64rz  : BaseSIMDCmpTwoScalar<U, 0b11, opc, FPR64, asm, "0">;
 
-  // Inherit Rn in 9-5
-  // Inherit Rd in 4-0
+  def : Pat<(v1i64 (OpNode FPR64:$Rn)),
+            (!cast<Instruction>(NAME # v1i64rz) FPR64:$Rn)>;
 }
 
-// Format AdvSIMD 2 vector across lanes
-class NeonI_2VAcross<bit q, bit u, bits<2> size, bits<5> opcode,
-                     dag outs, dag ins, string asmstr,
-                     list<dag> patterns, InstrItinClass itin>
-  : A64InstRdn<outs, ins, asmstr, patterns, itin>
-{
-  let Inst{31} = 0b0;
-  let Inst{30} = q;
-  let Inst{29} = u;
-  let Inst{28-24} = 0b01110;
+multiclass SIMDCmpTwoScalarSD<bit U, bit S, bits<5> opc, string asm,
+                              SDPatternOperator OpNode> {
+  def v1i64rz  : BaseSIMDCmpTwoScalar<U, {S,1}, opc, FPR64, asm, "0.0">;
+  def v1i32rz  : BaseSIMDCmpTwoScalar<U, {S,0}, opc, FPR32, asm, "0.0">;
+
+  def : InstAlias<asm # " $Rd, $Rn, #0",
+                  (!cast<Instruction>(NAME # v1i64rz) FPR64:$Rd, FPR64:$Rn), 0>;
+  def : InstAlias<asm # " $Rd, $Rn, #0",
+                  (!cast<Instruction>(NAME # v1i32rz) FPR32:$Rd, FPR32:$Rn), 0>;
+
+  def : Pat<(v1i64 (OpNode (v1f64 FPR64:$Rn))),
+            (!cast<Instruction>(NAME # v1i64rz) FPR64:$Rn)>;
+}
+
+multiclass SIMDTwoScalarD<bit U, bits<5> opc, string asm,
+                          SDPatternOperator OpNode = null_frag> {
+  def v1i64       : BaseSIMDTwoScalar<U, 0b11, opc, FPR64, FPR64, asm,
+    [(set (v1i64 FPR64:$Rd), (OpNode (v1i64 FPR64:$Rn)))]>;
+
+  def : Pat<(i64 (OpNode (i64 FPR64:$Rn))),
+            (!cast<Instruction>(NAME # "v1i64") FPR64:$Rn)>;
+}
+
+multiclass SIMDTwoScalarSD<bit U, bit S, bits<5> opc, string asm> {
+  def v1i64       : BaseSIMDTwoScalar<U, {S,1}, opc, FPR64, FPR64, asm,[]>;
+  def v1i32       : BaseSIMDTwoScalar<U, {S,0}, opc, FPR32, FPR32, asm,[]>;
+}
+
+multiclass SIMDTwoScalarCVTSD<bit U, bit S, bits<5> opc, string asm,
+                              SDPatternOperator OpNode> {
+  def v1i64 : BaseSIMDTwoScalar<U, {S,1}, opc, FPR64, FPR64, asm,
+                                [(set FPR64:$Rd, (OpNode (f64 FPR64:$Rn)))]>;
+  def v1i32 : BaseSIMDTwoScalar<U, {S,0}, opc, FPR32, FPR32, asm,
+                                [(set FPR32:$Rd, (OpNode (f32 FPR32:$Rn)))]>;
+}
+
+multiclass SIMDTwoScalarBHSD<bit U, bits<5> opc, string asm,
+                             SDPatternOperator OpNode = null_frag> {
+  let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
+    def v1i64  : BaseSIMDTwoScalar<U, 0b11, opc, FPR64, FPR64, asm,
+           [(set (i64 FPR64:$Rd), (OpNode (i64 FPR64:$Rn)))]>;
+    def v1i32  : BaseSIMDTwoScalar<U, 0b10, opc, FPR32, FPR32, asm,
+           [(set (i32 FPR32:$Rd), (OpNode (i32 FPR32:$Rn)))]>;
+    def v1i16  : BaseSIMDTwoScalar<U, 0b01, opc, FPR16, FPR16, asm, []>;
+    def v1i8   : BaseSIMDTwoScalar<U, 0b00, opc, FPR8 , FPR8 , asm, []>;
+  }
+
+  def : Pat<(v1i64 (OpNode (v1i64 FPR64:$Rn))),
+            (!cast<Instruction>(NAME # v1i64) FPR64:$Rn)>;
+}
+
+multiclass SIMDTwoScalarBHSDTied<bit U, bits<5> opc, string asm,
+                                 Intrinsic OpNode> {
+  let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
+    def v1i64  : BaseSIMDTwoScalarTied<U, 0b11, opc, FPR64, FPR64, asm,
+        [(set (i64 FPR64:$dst), (OpNode (i64 FPR64:$Rd), (i64 FPR64:$Rn)))]>;
+    def v1i32  : BaseSIMDTwoScalarTied<U, 0b10, opc, FPR32, FPR32, asm,
+        [(set (i32 FPR32:$dst), (OpNode (i32 FPR32:$Rd), (i32 FPR32:$Rn)))]>;
+    def v1i16  : BaseSIMDTwoScalarTied<U, 0b01, opc, FPR16, FPR16, asm, []>;
+    def v1i8   : BaseSIMDTwoScalarTied<U, 0b00, opc, FPR8 , FPR8 , asm, []>;
+  }
+
+  def : Pat<(v1i64 (OpNode (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn))),
+            (!cast<Instruction>(NAME # v1i64) FPR64:$Rd, FPR64:$Rn)>;
+}
+
+
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+multiclass SIMDTwoScalarMixedBHS<bit U, bits<5> opc, string asm,
+                                 SDPatternOperator OpNode = null_frag> {
+  def v1i32  : BaseSIMDTwoScalar<U, 0b10, opc, FPR32, FPR64, asm,
+        [(set (i32 FPR32:$Rd), (OpNode (i64 FPR64:$Rn)))]>;
+  def v1i16  : BaseSIMDTwoScalar<U, 0b01, opc, FPR16, FPR32, asm, []>;
+  def v1i8   : BaseSIMDTwoScalar<U, 0b00, opc, FPR8 , FPR16, asm, []>;
+}
+
+//----------------------------------------------------------------------------
+// AdvSIMD scalar pairwise instructions
+//----------------------------------------------------------------------------
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseSIMDPairwiseScalar<bit U, bits<2> size, bits<5> opcode,
+                        RegisterOperand regtype, RegisterOperand vectype,
+                        string asm, string kind>
+  : I<(outs regtype:$Rd), (ins vectype:$Rn), asm,
+      "{\t$Rd, $Rn" # kind # "|" # kind # "\t$Rd, $Rn}", "", []>,
+    Sched<[WriteV]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  let Inst{31-30} = 0b01;
+  let Inst{29}    = U;
+  let Inst{28-24} = 0b11110;
   let Inst{23-22} = size;
   let Inst{21-17} = 0b11000;
   let Inst{16-12} = opcode;
   let Inst{11-10} = 0b10;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
 
-  // Inherit Rn in 9-5
-  // Inherit Rd in 4-0
+multiclass SIMDPairwiseScalarD<bit U, bits<5> opc, string asm> {
+  def v2i64p : BaseSIMDPairwiseScalar<U, 0b11, opc, FPR64Op, V128,
+                                      asm, ".2d">;
 }
 
-// Format AdvSIMD scalar two registers miscellaneous
-class NeonI_Scalar2SameMisc<bit u, bits<2> size, bits<5> opcode, dag outs, dag ins,
-                            string asmstr, list<dag> patterns, InstrItinClass itin>
-  : A64InstRdn<outs, ins, asmstr, patterns, itin> {
-  let Inst{31} = 0b0;
-  let Inst{30} = 0b1;
-  let Inst{29} = u;
-  let Inst{28-24} = 0b11110;
+multiclass SIMDPairwiseScalarSD<bit U, bit S, bits<5> opc, string asm> {
+  def v2i32p : BaseSIMDPairwiseScalar<U, {S,0}, opc, FPR32Op, V64,
+                                      asm, ".2s">;
+  def v2i64p : BaseSIMDPairwiseScalar<U, {S,1}, opc, FPR64Op, V128,
+                                      asm, ".2d">;
+}
+
+//----------------------------------------------------------------------------
+// AdvSIMD across lanes instructions
+//----------------------------------------------------------------------------
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseSIMDAcrossLanes<bit Q, bit U, bits<2> size, bits<5> opcode,
+                          RegisterClass regtype, RegisterOperand vectype,
+                          string asm, string kind, list<dag> pattern>
+  : I<(outs regtype:$Rd), (ins vectype:$Rn), asm,
+      "{\t$Rd, $Rn" # kind # "|" # kind # "\t$Rd, $Rn}", "", pattern>,
+    Sched<[WriteV]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  let Inst{31}    = 0;
+  let Inst{30}    = Q;
+  let Inst{29}    = U;
+  let Inst{28-24} = 0b01110;
   let Inst{23-22} = size;
-  let Inst{21-17} = 0b10000;
+  let Inst{21-17} = 0b11000;
   let Inst{16-12} = opcode;
   let Inst{11-10} = 0b10;
-  // Inherit Rn in 9-5
-  // Inherit Rd in 4-0
-}
-
-// Format AdvSIMD vector load/store multiple N-element structure
-class NeonI_LdStMult<bit q, bit l, bits<4> opcode, bits<2> size,
-                    dag outs, dag ins, string asmstr,
-                    list<dag> patterns, InstrItinClass itin>
-  : A64InstRtn<outs, ins, asmstr, patterns, itin>
-{
-  let Inst{31} = 0b0;
-  let Inst{30} = q;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+multiclass SIMDAcrossLanesBHS<bit U, bits<5> opcode,
+                              string asm> {
+  def v8i8v  : BaseSIMDAcrossLanes<0, U, 0b00, opcode, FPR8,  V64,
+                                   asm, ".8b", []>;
+  def v16i8v : BaseSIMDAcrossLanes<1, U, 0b00, opcode, FPR8,  V128,
+                                   asm, ".16b", []>;
+  def v4i16v : BaseSIMDAcrossLanes<0, U, 0b01, opcode, FPR16, V64,
+                                   asm, ".4h", []>;
+  def v8i16v : BaseSIMDAcrossLanes<1, U, 0b01, opcode, FPR16, V128,
+                                   asm, ".8h", []>;
+  def v4i32v : BaseSIMDAcrossLanes<1, U, 0b10, opcode, FPR32, V128,
+                                   asm, ".4s", []>;
+}
+
+multiclass SIMDAcrossLanesHSD<bit U, bits<5> opcode, string asm> {
+  def v8i8v  : BaseSIMDAcrossLanes<0, U, 0b00, opcode, FPR16, V64,
+                                   asm, ".8b", []>;
+  def v16i8v : BaseSIMDAcrossLanes<1, U, 0b00, opcode, FPR16, V128,
+                                   asm, ".16b", []>;
+  def v4i16v : BaseSIMDAcrossLanes<0, U, 0b01, opcode, FPR32, V64,
+                                   asm, ".4h", []>;
+  def v8i16v : BaseSIMDAcrossLanes<1, U, 0b01, opcode, FPR32, V128,
+                                   asm, ".8h", []>;
+  def v4i32v : BaseSIMDAcrossLanes<1, U, 0b10, opcode, FPR64, V128,
+                                   asm, ".4s", []>;
+}
+
+multiclass SIMDAcrossLanesS<bits<5> opcode, bit sz1, string asm,
+                            Intrinsic intOp> {
+  def v4i32v : BaseSIMDAcrossLanes<1, 1, {sz1, 0}, opcode, FPR32, V128,
+                                   asm, ".4s",
+        [(set FPR32:$Rd, (intOp (v4f32 V128:$Rn)))]>;
+}
+
+//----------------------------------------------------------------------------
+// AdvSIMD INS/DUP instructions
+//----------------------------------------------------------------------------
+
+// FIXME: There has got to be a better way to factor these. ugh.
+
+class BaseSIMDInsDup<bit Q, bit op, dag outs, dag ins, string asm,
+                     string operands, string constraints, list<dag> pattern>
+  : I<outs, ins, asm, operands, constraints, pattern>,
+    Sched<[WriteV]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  let Inst{31} = 0;
+  let Inst{30} = Q;
+  let Inst{29} = op;
+  let Inst{28-21} = 0b01110000;
+  let Inst{15} = 0;
+  let Inst{10} = 1;
+  let Inst{9-5} = Rn;
+  let Inst{4-0} = Rd;
+}
+
+class SIMDDupFromMain<bit Q, bits<5> imm5, string size, ValueType vectype,
+                      RegisterOperand vecreg, RegisterClass regtype>
+  : BaseSIMDInsDup<Q, 0, (outs vecreg:$Rd), (ins regtype:$Rn), "dup",
+                   "{\t$Rd" # size # ", $Rn" #
+                   "|" # size # "\t$Rd, $Rn}", "",
+                   [(set (vectype vecreg:$Rd), (AArch64dup regtype:$Rn))]> {
+  let Inst{20-16} = imm5;
+  let Inst{14-11} = 0b0001;
+}
+
+class SIMDDupFromElement<bit Q, string dstkind, string srckind,
+                         ValueType vectype, ValueType insreg,
+                         RegisterOperand vecreg, Operand idxtype,
+                         ValueType elttype, SDNode OpNode>
+  : BaseSIMDInsDup<Q, 0, (outs vecreg:$Rd), (ins V128:$Rn, idxtype:$idx), "dup",
+                   "{\t$Rd" # dstkind # ", $Rn" # srckind # "$idx" #
+                   "|" # dstkind # "\t$Rd, $Rn$idx}", "",
+                 [(set (vectype vecreg:$Rd),
+                       (OpNode (insreg V128:$Rn), idxtype:$idx))]> {
+  let Inst{14-11} = 0b0000;
+}
+
+class SIMDDup64FromElement
+  : SIMDDupFromElement<1, ".2d", ".d", v2i64, v2i64, V128,
+                       VectorIndexD, i64, AArch64duplane64> {
+  bits<1> idx;
+  let Inst{20} = idx;
+  let Inst{19-16} = 0b1000;
+}
+
+class SIMDDup32FromElement<bit Q, string size, ValueType vectype,
+                           RegisterOperand vecreg>
+  : SIMDDupFromElement<Q, size, ".s", vectype, v4i32, vecreg,
+                       VectorIndexS, i64, AArch64duplane32> {
+  bits<2> idx;
+  let Inst{20-19} = idx;
+  let Inst{18-16} = 0b100;
+}
+
+class SIMDDup16FromElement<bit Q, string size, ValueType vectype,
+                           RegisterOperand vecreg>
+  : SIMDDupFromElement<Q, size, ".h", vectype, v8i16, vecreg,
+                       VectorIndexH, i64, AArch64duplane16> {
+  bits<3> idx;
+  let Inst{20-18} = idx;
+  let Inst{17-16} = 0b10;
+}
+
+class SIMDDup8FromElement<bit Q, string size, ValueType vectype,
+                          RegisterOperand vecreg>
+  : SIMDDupFromElement<Q, size, ".b", vectype, v16i8, vecreg,
+                       VectorIndexB, i64, AArch64duplane8> {
+  bits<4> idx;
+  let Inst{20-17} = idx;
+  let Inst{16} = 1;
+}
+
+class BaseSIMDMov<bit Q, string size, bits<4> imm4, RegisterClass regtype,
+                  Operand idxtype, string asm, list<dag> pattern>
+  : BaseSIMDInsDup<Q, 0, (outs regtype:$Rd), (ins V128:$Rn, idxtype:$idx), asm,
+                   "{\t$Rd, $Rn" # size # "$idx" #
+                   "|" # size # "\t$Rd, $Rn$idx}", "", pattern> {
+  let Inst{14-11} = imm4;
+}
+
+class SIMDSMov<bit Q, string size, RegisterClass regtype,
+               Operand idxtype>
+  : BaseSIMDMov<Q, size, 0b0101, regtype, idxtype, "smov", []>;
+class SIMDUMov<bit Q, string size, ValueType vectype, RegisterClass regtype,
+               Operand idxtype>
+  : BaseSIMDMov<Q, size, 0b0111, regtype, idxtype, "umov",
+      [(set regtype:$Rd, (vector_extract (vectype V128:$Rn), idxtype:$idx))]>;
+
+class SIMDMovAlias<string asm, string size, Instruction inst,
+                   RegisterClass regtype, Operand idxtype>
+    : InstAlias<asm#"{\t$dst, $src"#size#"$idx" #
+                    "|" # size # "\t$dst, $src$idx}",
+                (inst regtype:$dst, V128:$src, idxtype:$idx)>;
+
+multiclass SMov {
+  def vi8to32 : SIMDSMov<0, ".b", GPR32, VectorIndexB> {
+    bits<4> idx;
+    let Inst{20-17} = idx;
+    let Inst{16} = 1;
+  }
+  def vi8to64 : SIMDSMov<1, ".b", GPR64, VectorIndexB> {
+    bits<4> idx;
+    let Inst{20-17} = idx;
+    let Inst{16} = 1;
+  }
+  def vi16to32 : SIMDSMov<0, ".h", GPR32, VectorIndexH> {
+    bits<3> idx;
+    let Inst{20-18} = idx;
+    let Inst{17-16} = 0b10;
+  }
+  def vi16to64 : SIMDSMov<1, ".h", GPR64, VectorIndexH> {
+    bits<3> idx;
+    let Inst{20-18} = idx;
+    let Inst{17-16} = 0b10;
+  }
+  def vi32to64 : SIMDSMov<1, ".s", GPR64, VectorIndexS> {
+    bits<2> idx;
+    let Inst{20-19} = idx;
+    let Inst{18-16} = 0b100;
+  }
+}
+
+multiclass UMov {
+  def vi8 : SIMDUMov<0, ".b", v16i8, GPR32, VectorIndexB> {
+    bits<4> idx;
+    let Inst{20-17} = idx;
+    let Inst{16} = 1;
+  }
+  def vi16 : SIMDUMov<0, ".h", v8i16, GPR32, VectorIndexH> {
+    bits<3> idx;
+    let Inst{20-18} = idx;
+    let Inst{17-16} = 0b10;
+  }
+  def vi32 : SIMDUMov<0, ".s", v4i32, GPR32, VectorIndexS> {
+    bits<2> idx;
+    let Inst{20-19} = idx;
+    let Inst{18-16} = 0b100;
+  }
+  def vi64 : SIMDUMov<1, ".d", v2i64, GPR64, VectorIndexD> {
+    bits<1> idx;
+    let Inst{20} = idx;
+    let Inst{19-16} = 0b1000;
+  }
+  def : SIMDMovAlias<"mov", ".s",
+                     !cast<Instruction>(NAME#"vi32"),
+                     GPR32, VectorIndexS>;
+  def : SIMDMovAlias<"mov", ".d",
+                     !cast<Instruction>(NAME#"vi64"),
+                     GPR64, VectorIndexD>;
+}
+
+class SIMDInsFromMain<string size, ValueType vectype,
+                      RegisterClass regtype, Operand idxtype>
+  : BaseSIMDInsDup<1, 0, (outs V128:$dst),
+                   (ins V128:$Rd, idxtype:$idx, regtype:$Rn), "ins",
+                   "{\t$Rd" # size # "$idx, $Rn" #
+                   "|" # size # "\t$Rd$idx, $Rn}",
+                   "$Rd = $dst",
+            [(set V128:$dst,
+              (vector_insert (vectype V128:$Rd), regtype:$Rn, idxtype:$idx))]> {
+  let Inst{14-11} = 0b0011;
+}
+
+class SIMDInsFromElement<string size, ValueType vectype,
+                         ValueType elttype, Operand idxtype>
+  : BaseSIMDInsDup<1, 1, (outs V128:$dst),
+                   (ins V128:$Rd, idxtype:$idx, V128:$Rn, idxtype:$idx2), "ins",
+                   "{\t$Rd" # size # "$idx, $Rn" # size # "$idx2" #
+                   "|" # size # "\t$Rd$idx, $Rn$idx2}",
+                   "$Rd = $dst",
+         [(set V128:$dst,
+               (vector_insert
+                 (vectype V128:$Rd),
+                 (elttype (vector_extract (vectype V128:$Rn), idxtype:$idx2)),
+                 idxtype:$idx))]>;
+
+class SIMDInsMainMovAlias<string size, Instruction inst,
+                          RegisterClass regtype, Operand idxtype>
+    : InstAlias<"mov" # "{\t$dst" # size # "$idx, $src" #
+                        "|" # size #"\t$dst$idx, $src}",
+                (inst V128:$dst, idxtype:$idx, regtype:$src)>;
+class SIMDInsElementMovAlias<string size, Instruction inst,
+                             Operand idxtype>
+    : InstAlias<"mov" # "{\t$dst" # size # "$idx, $src" # size # "$idx2" #
+                      # "|" # size #" $dst$idx, $src$idx2}",
+                (inst V128:$dst, idxtype:$idx, V128:$src, idxtype:$idx2)>;
+
+
+multiclass SIMDIns {
+  def vi8gpr : SIMDInsFromMain<".b", v16i8, GPR32, VectorIndexB> {
+    bits<4> idx;
+    let Inst{20-17} = idx;
+    let Inst{16} = 1;
+  }
+  def vi16gpr : SIMDInsFromMain<".h", v8i16, GPR32, VectorIndexH> {
+    bits<3> idx;
+    let Inst{20-18} = idx;
+    let Inst{17-16} = 0b10;
+  }
+  def vi32gpr : SIMDInsFromMain<".s", v4i32, GPR32, VectorIndexS> {
+    bits<2> idx;
+    let Inst{20-19} = idx;
+    let Inst{18-16} = 0b100;
+  }
+  def vi64gpr : SIMDInsFromMain<".d", v2i64, GPR64, VectorIndexD> {
+    bits<1> idx;
+    let Inst{20} = idx;
+    let Inst{19-16} = 0b1000;
+  }
+
+  def vi8lane : SIMDInsFromElement<".b", v16i8, i32, VectorIndexB> {
+    bits<4> idx;
+    bits<4> idx2;
+    let Inst{20-17} = idx;
+    let Inst{16} = 1;
+    let Inst{14-11} = idx2;
+  }
+  def vi16lane : SIMDInsFromElement<".h", v8i16, i32, VectorIndexH> {
+    bits<3> idx;
+    bits<3> idx2;
+    let Inst{20-18} = idx;
+    let Inst{17-16} = 0b10;
+    let Inst{14-12} = idx2;
+    let Inst{11} = 0;
+  }
+  def vi32lane : SIMDInsFromElement<".s", v4i32, i32, VectorIndexS> {
+    bits<2> idx;
+    bits<2> idx2;
+    let Inst{20-19} = idx;
+    let Inst{18-16} = 0b100;
+    let Inst{14-13} = idx2;
+    let Inst{12-11} = 0;
+  }
+  def vi64lane : SIMDInsFromElement<".d", v2i64, i64, VectorIndexD> {
+    bits<1> idx;
+    bits<1> idx2;
+    let Inst{20} = idx;
+    let Inst{19-16} = 0b1000;
+    let Inst{14} = idx2;
+    let Inst{13-11} = 0;
+  }
+
+  // For all forms of the INS instruction, the "mov" mnemonic is the
+  // preferred alias. Why they didn't just call the instruction "mov" in
+  // the first place is a very good question indeed...
+  def : SIMDInsMainMovAlias<".b", !cast<Instruction>(NAME#"vi8gpr"),
+                         GPR32, VectorIndexB>;
+  def : SIMDInsMainMovAlias<".h", !cast<Instruction>(NAME#"vi16gpr"),
+                         GPR32, VectorIndexH>;
+  def : SIMDInsMainMovAlias<".s", !cast<Instruction>(NAME#"vi32gpr"),
+                         GPR32, VectorIndexS>;
+  def : SIMDInsMainMovAlias<".d", !cast<Instruction>(NAME#"vi64gpr"),
+                         GPR64, VectorIndexD>;
+
+  def : SIMDInsElementMovAlias<".b", !cast<Instruction>(NAME#"vi8lane"),
+                         VectorIndexB>;
+  def : SIMDInsElementMovAlias<".h", !cast<Instruction>(NAME#"vi16lane"),
+                         VectorIndexH>;
+  def : SIMDInsElementMovAlias<".s", !cast<Instruction>(NAME#"vi32lane"),
+                         VectorIndexS>;
+  def : SIMDInsElementMovAlias<".d", !cast<Instruction>(NAME#"vi64lane"),
+                         VectorIndexD>;
+}
+
+//----------------------------------------------------------------------------
+// AdvSIMD TBL/TBX
+//----------------------------------------------------------------------------
+
+let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in
+class BaseSIMDTableLookup<bit Q, bits<2> len, bit op, RegisterOperand vectype,
+                          RegisterOperand listtype, string asm, string kind>
+  : I<(outs vectype:$Vd), (ins listtype:$Vn, vectype:$Vm), asm,
+       "\t$Vd" # kind # ", $Vn, $Vm" # kind, "", []>,
+    Sched<[WriteV]> {
+  bits<5> Vd;
+  bits<5> Vn;
+  bits<5> Vm;
+  let Inst{31}    = 0;
+  let Inst{30}    = Q;
+  let Inst{29-21} = 0b001110000;
+  let Inst{20-16} = Vm;
+  let Inst{15}    = 0;
+  let Inst{14-13} = len;
+  let Inst{12}    = op;
+  let Inst{11-10} = 0b00;
+  let Inst{9-5}   = Vn;
+  let Inst{4-0}   = Vd;
+}
+
+let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in
+class BaseSIMDTableLookupTied<bit Q, bits<2> len, bit op, RegisterOperand vectype,
+                          RegisterOperand listtype, string asm, string kind>
+  : I<(outs vectype:$dst), (ins vectype:$Vd, listtype:$Vn, vectype:$Vm), asm,
+       "\t$Vd" # kind # ", $Vn, $Vm" # kind, "$Vd = $dst", []>,
+    Sched<[WriteV]> {
+  bits<5> Vd;
+  bits<5> Vn;
+  bits<5> Vm;
+  let Inst{31}    = 0;
+  let Inst{30}    = Q;
+  let Inst{29-21} = 0b001110000;
+  let Inst{20-16} = Vm;
+  let Inst{15}    = 0;
+  let Inst{14-13} = len;
+  let Inst{12}    = op;
+  let Inst{11-10} = 0b00;
+  let Inst{9-5}   = Vn;
+  let Inst{4-0}   = Vd;
+}
+
+class SIMDTableLookupAlias<string asm, Instruction inst,
+                          RegisterOperand vectype, RegisterOperand listtype>
+    : InstAlias<!strconcat(asm, "\t$dst, $lst, $index"),
+                (inst vectype:$dst, listtype:$lst, vectype:$index), 0>;
+
+multiclass SIMDTableLookup<bit op, string asm> {
+  def v8i8One   : BaseSIMDTableLookup<0, 0b00, op, V64, VecListOne16b,
+                                      asm, ".8b">;
+  def v8i8Two   : BaseSIMDTableLookup<0, 0b01, op, V64, VecListTwo16b,
+                                      asm, ".8b">;
+  def v8i8Three : BaseSIMDTableLookup<0, 0b10, op, V64, VecListThree16b,
+                                      asm, ".8b">;
+  def v8i8Four  : BaseSIMDTableLookup<0, 0b11, op, V64, VecListFour16b,
+                                      asm, ".8b">;
+  def v16i8One  : BaseSIMDTableLookup<1, 0b00, op, V128, VecListOne16b,
+                                      asm, ".16b">;
+  def v16i8Two  : BaseSIMDTableLookup<1, 0b01, op, V128, VecListTwo16b,
+                                      asm, ".16b">;
+  def v16i8Three: BaseSIMDTableLookup<1, 0b10, op, V128, VecListThree16b,
+                                      asm, ".16b">;
+  def v16i8Four : BaseSIMDTableLookup<1, 0b11, op, V128, VecListFour16b,
+                                      asm, ".16b">;
+
+  def : SIMDTableLookupAlias<asm # ".8b",
+                         !cast<Instruction>(NAME#"v8i8One"),
+                         V64, VecListOne128>;
+  def : SIMDTableLookupAlias<asm # ".8b",
+                         !cast<Instruction>(NAME#"v8i8Two"),
+                         V64, VecListTwo128>;
+  def : SIMDTableLookupAlias<asm # ".8b",
+                         !cast<Instruction>(NAME#"v8i8Three"),
+                         V64, VecListThree128>;
+  def : SIMDTableLookupAlias<asm # ".8b",
+                         !cast<Instruction>(NAME#"v8i8Four"),
+                         V64, VecListFour128>;
+  def : SIMDTableLookupAlias<asm # ".16b",
+                         !cast<Instruction>(NAME#"v16i8One"),
+                         V128, VecListOne128>;
+  def : SIMDTableLookupAlias<asm # ".16b",
+                         !cast<Instruction>(NAME#"v16i8Two"),
+                         V128, VecListTwo128>;
+  def : SIMDTableLookupAlias<asm # ".16b",
+                         !cast<Instruction>(NAME#"v16i8Three"),
+                         V128, VecListThree128>;
+  def : SIMDTableLookupAlias<asm # ".16b",
+                         !cast<Instruction>(NAME#"v16i8Four"),
+                         V128, VecListFour128>;
+}
+
+multiclass SIMDTableLookupTied<bit op, string asm> {
+  def v8i8One   : BaseSIMDTableLookupTied<0, 0b00, op, V64, VecListOne16b,
+                                      asm, ".8b">;
+  def v8i8Two   : BaseSIMDTableLookupTied<0, 0b01, op, V64, VecListTwo16b,
+                                      asm, ".8b">;
+  def v8i8Three : BaseSIMDTableLookupTied<0, 0b10, op, V64, VecListThree16b,
+                                      asm, ".8b">;
+  def v8i8Four  : BaseSIMDTableLookupTied<0, 0b11, op, V64, VecListFour16b,
+                                      asm, ".8b">;
+  def v16i8One  : BaseSIMDTableLookupTied<1, 0b00, op, V128, VecListOne16b,
+                                      asm, ".16b">;
+  def v16i8Two  : BaseSIMDTableLookupTied<1, 0b01, op, V128, VecListTwo16b,
+                                      asm, ".16b">;
+  def v16i8Three: BaseSIMDTableLookupTied<1, 0b10, op, V128, VecListThree16b,
+                                      asm, ".16b">;
+  def v16i8Four : BaseSIMDTableLookupTied<1, 0b11, op, V128, VecListFour16b,
+                                      asm, ".16b">;
+
+  def : SIMDTableLookupAlias<asm # ".8b",
+                         !cast<Instruction>(NAME#"v8i8One"),
+                         V64, VecListOne128>;
+  def : SIMDTableLookupAlias<asm # ".8b",
+                         !cast<Instruction>(NAME#"v8i8Two"),
+                         V64, VecListTwo128>;
+  def : SIMDTableLookupAlias<asm # ".8b",
+                         !cast<Instruction>(NAME#"v8i8Three"),
+                         V64, VecListThree128>;
+  def : SIMDTableLookupAlias<asm # ".8b",
+                         !cast<Instruction>(NAME#"v8i8Four"),
+                         V64, VecListFour128>;
+  def : SIMDTableLookupAlias<asm # ".16b",
+                         !cast<Instruction>(NAME#"v16i8One"),
+                         V128, VecListOne128>;
+  def : SIMDTableLookupAlias<asm # ".16b",
+                         !cast<Instruction>(NAME#"v16i8Two"),
+                         V128, VecListTwo128>;
+  def : SIMDTableLookupAlias<asm # ".16b",
+                         !cast<Instruction>(NAME#"v16i8Three"),
+                         V128, VecListThree128>;
+  def : SIMDTableLookupAlias<asm # ".16b",
+                         !cast<Instruction>(NAME#"v16i8Four"),
+                         V128, VecListFour128>;
+}
+
+
+//----------------------------------------------------------------------------
+// AdvSIMD scalar CPY
+//----------------------------------------------------------------------------
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseSIMDScalarCPY<RegisterClass regtype, RegisterOperand vectype,
+                        string kind, Operand idxtype>
+  : I<(outs regtype:$dst), (ins vectype:$src, idxtype:$idx), "mov",
+       "{\t$dst, $src" # kind # "$idx" #
+       "|\t$dst, $src$idx}", "", []>,
+    Sched<[WriteV]> {
+  bits<5> dst;
+  bits<5> src;
+  let Inst{31-21} = 0b01011110000;
+  let Inst{15-10} = 0b000001;
+  let Inst{9-5}   = src;
+  let Inst{4-0}   = dst;
+}
+
+class SIMDScalarCPYAlias<string asm, string size, Instruction inst,
+      RegisterClass regtype, RegisterOperand vectype, Operand idxtype>
+    : InstAlias<asm # "{\t$dst, $src" # size # "$index" #
+                    # "|\t$dst, $src$index}",
+                (inst regtype:$dst, vectype:$src, idxtype:$index), 0>;
+
+
+multiclass SIMDScalarCPY<string asm> {
+  def i8  : BaseSIMDScalarCPY<FPR8,  V128, ".b", VectorIndexB> {
+    bits<4> idx;
+    let Inst{20-17} = idx;
+    let Inst{16} = 1;
+  }
+  def i16 : BaseSIMDScalarCPY<FPR16, V128, ".h", VectorIndexH> {
+    bits<3> idx;
+    let Inst{20-18} = idx;
+    let Inst{17-16} = 0b10;
+  }
+  def i32 : BaseSIMDScalarCPY<FPR32, V128, ".s", VectorIndexS> {
+    bits<2> idx;
+    let Inst{20-19} = idx;
+    let Inst{18-16} = 0b100;
+  }
+  def i64 : BaseSIMDScalarCPY<FPR64, V128, ".d", VectorIndexD> {
+    bits<1> idx;
+    let Inst{20} = idx;
+    let Inst{19-16} = 0b1000;
+  }
+
+  def : Pat<(v1i64 (scalar_to_vector (i64 (vector_extract (v2i64 V128:$src),
+                                                          VectorIndexD:$idx)))),
+            (!cast<Instruction>(NAME # i64) V128:$src, VectorIndexD:$idx)>;
+
+  // 'DUP' mnemonic aliases.
+  def : SIMDScalarCPYAlias<"dup", ".b",
+                           !cast<Instruction>(NAME#"i8"),
+                           FPR8, V128, VectorIndexB>;
+  def : SIMDScalarCPYAlias<"dup", ".h",
+                           !cast<Instruction>(NAME#"i16"),
+                           FPR16, V128, VectorIndexH>;
+  def : SIMDScalarCPYAlias<"dup", ".s",
+                           !cast<Instruction>(NAME#"i32"),
+                           FPR32, V128, VectorIndexS>;
+  def : SIMDScalarCPYAlias<"dup", ".d",
+                           !cast<Instruction>(NAME#"i64"),
+                           FPR64, V128, VectorIndexD>;
+}
+
+//----------------------------------------------------------------------------
+// AdvSIMD modified immediate instructions
+//----------------------------------------------------------------------------
+
+class BaseSIMDModifiedImm<bit Q, bit op, dag oops, dag iops,
+                          string asm, string op_string,
+                          string cstr, list<dag> pattern>
+  : I<oops, iops, asm, op_string, cstr, pattern>,
+    Sched<[WriteV]> {
+  bits<5> Rd;
+  bits<8> imm8;
+  let Inst{31}    = 0;
+  let Inst{30}    = Q;
+  let Inst{29}    = op;
+  let Inst{28-19} = 0b0111100000;
+  let Inst{18-16} = imm8{7-5};
+  let Inst{11-10} = 0b01;
+  let Inst{9-5}   = imm8{4-0};
+  let Inst{4-0}   = Rd;
+}
+
+class BaseSIMDModifiedImmVector<bit Q, bit op, RegisterOperand vectype,
+                                Operand immtype, dag opt_shift_iop,
+                                string opt_shift, string asm, string kind,
+                                list<dag> pattern>
+  : BaseSIMDModifiedImm<Q, op, (outs vectype:$Rd),
+                        !con((ins immtype:$imm8), opt_shift_iop), asm,
+                        "{\t$Rd" # kind # ", $imm8" # opt_shift #
+                        "|" # kind # "\t$Rd, $imm8" # opt_shift # "}",
+                        "", pattern> {
+  let DecoderMethod = "DecodeModImmInstruction";
+}
+
+class BaseSIMDModifiedImmVectorTied<bit Q, bit op, RegisterOperand vectype,
+                                Operand immtype, dag opt_shift_iop,
+                                string opt_shift, string asm, string kind,
+                                list<dag> pattern>
+  : BaseSIMDModifiedImm<Q, op, (outs vectype:$dst),
+                        !con((ins vectype:$Rd, immtype:$imm8), opt_shift_iop),
+                        asm, "{\t$Rd" # kind # ", $imm8" # opt_shift #
+                             "|" # kind # "\t$Rd, $imm8" # opt_shift # "}",
+                        "$Rd = $dst", pattern> {
+  let DecoderMethod = "DecodeModImmTiedInstruction";
+}
+
+class BaseSIMDModifiedImmVectorShift<bit Q, bit op, bits<2> b15_b12,
+                                     RegisterOperand vectype, string asm,
+                                     string kind, list<dag> pattern>
+  : BaseSIMDModifiedImmVector<Q, op, vectype, imm0_255,
+                              (ins logical_vec_shift:$shift),
+                              "$shift", asm, kind, pattern> {
+  bits<2> shift;
+  let Inst{15}    = b15_b12{1};
+  let Inst{14-13} = shift;
+  let Inst{12}    = b15_b12{0};
+}
+
+class BaseSIMDModifiedImmVectorShiftTied<bit Q, bit op, bits<2> b15_b12,
+                                     RegisterOperand vectype, string asm,
+                                     string kind, list<dag> pattern>
+  : BaseSIMDModifiedImmVectorTied<Q, op, vectype, imm0_255,
+                              (ins logical_vec_shift:$shift),
+                              "$shift", asm, kind, pattern> {
+  bits<2> shift;
+  let Inst{15}    = b15_b12{1};
+  let Inst{14-13} = shift;
+  let Inst{12}    = b15_b12{0};
+}
+
+
+class BaseSIMDModifiedImmVectorShiftHalf<bit Q, bit op, bits<2> b15_b12,
+                                         RegisterOperand vectype, string asm,
+                                         string kind, list<dag> pattern>
+  : BaseSIMDModifiedImmVector<Q, op, vectype, imm0_255,
+                              (ins logical_vec_hw_shift:$shift),
+                              "$shift", asm, kind, pattern> {
+  bits<2> shift;
+  let Inst{15} = b15_b12{1};
+  let Inst{14} = 0;
+  let Inst{13} = shift{0};
+  let Inst{12} = b15_b12{0};
+}
+
+class BaseSIMDModifiedImmVectorShiftHalfTied<bit Q, bit op, bits<2> b15_b12,
+                                         RegisterOperand vectype, string asm,
+                                         string kind, list<dag> pattern>
+  : BaseSIMDModifiedImmVectorTied<Q, op, vectype, imm0_255,
+                              (ins logical_vec_hw_shift:$shift),
+                              "$shift", asm, kind, pattern> {
+  bits<2> shift;
+  let Inst{15} = b15_b12{1};
+  let Inst{14} = 0;
+  let Inst{13} = shift{0};
+  let Inst{12} = b15_b12{0};
+}
+
+multiclass SIMDModifiedImmVectorShift<bit op, bits<2> hw_cmode, bits<2> w_cmode,
+                                      string asm> {
+  def v4i16 : BaseSIMDModifiedImmVectorShiftHalf<0, op, hw_cmode, V64,
+                                                 asm, ".4h", []>;
+  def v8i16 : BaseSIMDModifiedImmVectorShiftHalf<1, op, hw_cmode, V128,
+                                                 asm, ".8h", []>;
+
+  def v2i32 : BaseSIMDModifiedImmVectorShift<0, op, w_cmode, V64,
+                                             asm, ".2s", []>;
+  def v4i32 : BaseSIMDModifiedImmVectorShift<1, op, w_cmode, V128,
+                                             asm, ".4s", []>;
+}
+
+multiclass SIMDModifiedImmVectorShiftTied<bit op, bits<2> hw_cmode,
+                                      bits<2> w_cmode, string asm,
+                                      SDNode OpNode> {
+  def v4i16 : BaseSIMDModifiedImmVectorShiftHalfTied<0, op, hw_cmode, V64,
+                                                 asm, ".4h",
+             [(set (v4i16 V64:$dst), (OpNode V64:$Rd,
+                                             imm0_255:$imm8,
+                                             (i32 imm:$shift)))]>;
+  def v8i16 : BaseSIMDModifiedImmVectorShiftHalfTied<1, op, hw_cmode, V128,
+                                                 asm, ".8h",
+             [(set (v8i16 V128:$dst), (OpNode V128:$Rd,
+                                              imm0_255:$imm8,
+                                              (i32 imm:$shift)))]>;
+
+  def v2i32 : BaseSIMDModifiedImmVectorShiftTied<0, op, w_cmode, V64,
+                                             asm, ".2s",
+             [(set (v2i32 V64:$dst), (OpNode V64:$Rd,
+                                             imm0_255:$imm8,
+                                             (i32 imm:$shift)))]>;
+  def v4i32 : BaseSIMDModifiedImmVectorShiftTied<1, op, w_cmode, V128,
+                                             asm, ".4s",
+             [(set (v4i32 V128:$dst), (OpNode V128:$Rd,
+                                              imm0_255:$imm8,
+                                              (i32 imm:$shift)))]>;
+}
+
+class SIMDModifiedImmMoveMSL<bit Q, bit op, bits<4> cmode,
+                             RegisterOperand vectype, string asm,
+                             string kind, list<dag> pattern>
+  : BaseSIMDModifiedImmVector<Q, op, vectype, imm0_255,
+                              (ins move_vec_shift:$shift),
+                              "$shift", asm, kind, pattern> {
+  bits<1> shift;
+  let Inst{15-13} = cmode{3-1};
+  let Inst{12}    = shift;
+}
+
+class SIMDModifiedImmVectorNoShift<bit Q, bit op, bits<4> cmode,
+                                   RegisterOperand vectype,
+                                   Operand imm_type, string asm,
+                                   string kind, list<dag> pattern>
+  : BaseSIMDModifiedImmVector<Q, op, vectype, imm_type, (ins), "",
+                              asm, kind, pattern> {
+  let Inst{15-12} = cmode;
+}
+
+class SIMDModifiedImmScalarNoShift<bit Q, bit op, bits<4> cmode, string asm,
+                                   list<dag> pattern>
+  : BaseSIMDModifiedImm<Q, op, (outs FPR64:$Rd), (ins simdimmtype10:$imm8), asm,
+                        "\t$Rd, $imm8", "", pattern> {
+  let Inst{15-12} = cmode;
+  let DecoderMethod = "DecodeModImmInstruction";
+}
+
+//----------------------------------------------------------------------------
+// AdvSIMD indexed element
+//----------------------------------------------------------------------------
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseSIMDIndexed<bit Q, bit U, bit Scalar, bits<2> size, bits<4> opc,
+                      RegisterOperand dst_reg, RegisterOperand lhs_reg,
+                      RegisterOperand rhs_reg, Operand vec_idx, string asm,
+                      string apple_kind, string dst_kind, string lhs_kind,
+                      string rhs_kind, list<dag> pattern>
+  : I<(outs dst_reg:$Rd), (ins lhs_reg:$Rn, rhs_reg:$Rm, vec_idx:$idx),
+      asm,
+      "{\t$Rd" # dst_kind # ", $Rn" # lhs_kind # ", $Rm" # rhs_kind # "$idx" #
+      "|" # apple_kind # "\t$Rd, $Rn, $Rm$idx}", "", pattern>,
+    Sched<[WriteV]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  bits<5> Rm;
+
+  let Inst{31}    = 0;
+  let Inst{30}    = Q;
+  let Inst{29}    = U;
+  let Inst{28}    = Scalar;
+  let Inst{27-24} = 0b1111;
+  let Inst{23-22} = size;
+  // Bit 21 must be set by the derived class.
+  let Inst{20-16} = Rm;
+  let Inst{15-12} = opc;
+  // Bit 11 must be set by the derived class.
+  let Inst{10}    = 0;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseSIMDIndexedTied<bit Q, bit U, bit Scalar, bits<2> size, bits<4> opc,
+                      RegisterOperand dst_reg, RegisterOperand lhs_reg,
+                      RegisterOperand rhs_reg, Operand vec_idx, string asm,
+                      string apple_kind, string dst_kind, string lhs_kind,
+                      string rhs_kind, list<dag> pattern>
+  : I<(outs dst_reg:$dst),
+      (ins dst_reg:$Rd, lhs_reg:$Rn, rhs_reg:$Rm, vec_idx:$idx), asm,
+      "{\t$Rd" # dst_kind # ", $Rn" # lhs_kind # ", $Rm" # rhs_kind # "$idx" #
+      "|" # apple_kind # "\t$Rd, $Rn, $Rm$idx}", "$Rd = $dst", pattern>,
+    Sched<[WriteV]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  bits<5> Rm;
+
+  let Inst{31}    = 0;
+  let Inst{30}    = Q;
+  let Inst{29}    = U;
+  let Inst{28}    = Scalar;
+  let Inst{27-24} = 0b1111;
+  let Inst{23-22} = size;
+  // Bit 21 must be set by the derived class.
+  let Inst{20-16} = Rm;
+  let Inst{15-12} = opc;
+  // Bit 11 must be set by the derived class.
+  let Inst{10}    = 0;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+multiclass SIMDFPIndexedSD<bit U, bits<4> opc, string asm,
+                           SDPatternOperator OpNode> {
+  def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc,
+                                      V64, V64,
+                                      V128, VectorIndexS,
+                                      asm, ".2s", ".2s", ".2s", ".s",
+    [(set (v2f32 V64:$Rd),
+        (OpNode (v2f32 V64:$Rn),
+         (v2f32 (AArch64duplane32 (v4f32 V128:$Rm), VectorIndexS:$idx))))]> {
+    bits<2> idx;
+    let Inst{11} = idx{1};
+    let Inst{21} = idx{0};
+  }
+
+  def v4i32_indexed : BaseSIMDIndexed<1, U, 0, 0b10, opc,
+                                      V128, V128,
+                                      V128, VectorIndexS,
+                                      asm, ".4s", ".4s", ".4s", ".s",
+    [(set (v4f32 V128:$Rd),
+        (OpNode (v4f32 V128:$Rn),
+         (v4f32 (AArch64duplane32 (v4f32 V128:$Rm), VectorIndexS:$idx))))]> {
+    bits<2> idx;
+    let Inst{11} = idx{1};
+    let Inst{21} = idx{0};
+  }
+
+  def v2i64_indexed : BaseSIMDIndexed<1, U, 0, 0b11, opc,
+                                      V128, V128,
+                                      V128, VectorIndexD,
+                                      asm, ".2d", ".2d", ".2d", ".d",
+    [(set (v2f64 V128:$Rd),
+        (OpNode (v2f64 V128:$Rn),
+         (v2f64 (AArch64duplane64 (v2f64 V128:$Rm), VectorIndexD:$idx))))]> {
+    bits<1> idx;
+    let Inst{11} = idx{0};
+    let Inst{21} = 0;
+  }
+
+  def v1i32_indexed : BaseSIMDIndexed<1, U, 1, 0b10, opc,
+                                      FPR32Op, FPR32Op, V128, VectorIndexS,
+                                      asm, ".s", "", "", ".s",
+    [(set (f32 FPR32Op:$Rd),
+          (OpNode (f32 FPR32Op:$Rn),
+                  (f32 (vector_extract (v4f32 V128:$Rm),
+                                       VectorIndexS:$idx))))]> {
+    bits<2> idx;
+    let Inst{11} = idx{1};
+    let Inst{21} = idx{0};
+  }
+
+  def v1i64_indexed : BaseSIMDIndexed<1, U, 1, 0b11, opc,
+                                      FPR64Op, FPR64Op, V128, VectorIndexD,
+                                      asm, ".d", "", "", ".d",
+    [(set (f64 FPR64Op:$Rd),
+          (OpNode (f64 FPR64Op:$Rn),
+                  (f64 (vector_extract (v2f64 V128:$Rm),
+                                       VectorIndexD:$idx))))]> {
+    bits<1> idx;
+    let Inst{11} = idx{0};
+    let Inst{21} = 0;
+  }
+}
+
+multiclass SIMDFPIndexedSDTiedPatterns<string INST, SDPatternOperator OpNode> {
+  // 2 variants for the .2s version: DUPLANE from 128-bit and DUP scalar.
+  def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn),
+                           (AArch64duplane32 (v4f32 V128:$Rm),
+                                           VectorIndexS:$idx))),
+            (!cast<Instruction>(INST # v2i32_indexed)
+                V64:$Rd, V64:$Rn, V128:$Rm, VectorIndexS:$idx)>;
+  def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn),
+                           (AArch64dup (f32 FPR32Op:$Rm)))),
+            (!cast<Instruction>(INST # "v2i32_indexed") V64:$Rd, V64:$Rn,
+                (SUBREG_TO_REG (i32 0), FPR32Op:$Rm, ssub), (i64 0))>;
+
+
+  // 2 variants for the .4s version: DUPLANE from 128-bit and DUP scalar.
+  def : Pat<(v4f32 (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn),
+                           (AArch64duplane32 (v4f32 V128:$Rm),
+                                           VectorIndexS:$idx))),
+            (!cast<Instruction>(INST # "v4i32_indexed")
+                V128:$Rd, V128:$Rn, V128:$Rm, VectorIndexS:$idx)>;
+  def : Pat<(v4f32 (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn),
+                           (AArch64dup (f32 FPR32Op:$Rm)))),
+            (!cast<Instruction>(INST # "v4i32_indexed") V128:$Rd, V128:$Rn,
+                (SUBREG_TO_REG (i32 0), FPR32Op:$Rm, ssub), (i64 0))>;
+
+  // 2 variants for the .2d version: DUPLANE from 128-bit and DUP scalar.
+  def : Pat<(v2f64 (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn),
+                           (AArch64duplane64 (v2f64 V128:$Rm),
+                                           VectorIndexD:$idx))),
+            (!cast<Instruction>(INST # "v2i64_indexed")
+                V128:$Rd, V128:$Rn, V128:$Rm, VectorIndexS:$idx)>;
+  def : Pat<(v2f64 (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn),
+                           (AArch64dup (f64 FPR64Op:$Rm)))),
+            (!cast<Instruction>(INST # "v2i64_indexed") V128:$Rd, V128:$Rn,
+                (SUBREG_TO_REG (i32 0), FPR64Op:$Rm, dsub), (i64 0))>;
+
+  // 2 variants for 32-bit scalar version: extract from .2s or from .4s
+  def : Pat<(f32 (OpNode (f32 FPR32:$Rd), (f32 FPR32:$Rn),
+                         (vector_extract (v4f32 V128:$Rm), VectorIndexS:$idx))),
+            (!cast<Instruction>(INST # "v1i32_indexed") FPR32:$Rd, FPR32:$Rn,
+                V128:$Rm, VectorIndexS:$idx)>;
+  def : Pat<(f32 (OpNode (f32 FPR32:$Rd), (f32 FPR32:$Rn),
+                         (vector_extract (v2f32 V64:$Rm), VectorIndexS:$idx))),
+            (!cast<Instruction>(INST # "v1i32_indexed") FPR32:$Rd, FPR32:$Rn,
+                (SUBREG_TO_REG (i32 0), V64:$Rm, dsub), VectorIndexS:$idx)>;
+
+  // 1 variant for 64-bit scalar version: extract from .1d or from .2d
+  def : Pat<(f64 (OpNode (f64 FPR64:$Rd), (f64 FPR64:$Rn),
+                         (vector_extract (v2f64 V128:$Rm), VectorIndexD:$idx))),
+            (!cast<Instruction>(INST # "v1i64_indexed") FPR64:$Rd, FPR64:$Rn,
+                V128:$Rm, VectorIndexD:$idx)>;
+}
+
+multiclass SIMDFPIndexedSDTied<bit U, bits<4> opc, string asm> {
+  def v2i32_indexed : BaseSIMDIndexedTied<0, U, 0, 0b10, opc, V64, V64,
+                                          V128, VectorIndexS,
+                                          asm, ".2s", ".2s", ".2s", ".s", []> {
+    bits<2> idx;
+    let Inst{11} = idx{1};
+    let Inst{21} = idx{0};
+  }
+
+  def v4i32_indexed : BaseSIMDIndexedTied<1, U, 0, 0b10, opc,
+                                      V128, V128,
+                                      V128, VectorIndexS,
+                                      asm, ".4s", ".4s", ".4s", ".s", []> {
+    bits<2> idx;
+    let Inst{11} = idx{1};
+    let Inst{21} = idx{0};
+  }
+
+  def v2i64_indexed : BaseSIMDIndexedTied<1, U, 0, 0b11, opc,
+                                      V128, V128,
+                                      V128, VectorIndexD,
+                                      asm, ".2d", ".2d", ".2d", ".d", []> {
+    bits<1> idx;
+    let Inst{11} = idx{0};
+    let Inst{21} = 0;
+  }
+
+
+  def v1i32_indexed : BaseSIMDIndexedTied<1, U, 1, 0b10, opc,
+                                      FPR32Op, FPR32Op, V128, VectorIndexS,
+                                      asm, ".s", "", "", ".s", []> {
+    bits<2> idx;
+    let Inst{11} = idx{1};
+    let Inst{21} = idx{0};
+  }
+
+  def v1i64_indexed : BaseSIMDIndexedTied<1, U, 1, 0b11, opc,
+                                      FPR64Op, FPR64Op, V128, VectorIndexD,
+                                      asm, ".d", "", "", ".d", []> {
+    bits<1> idx;
+    let Inst{11} = idx{0};
+    let Inst{21} = 0;
+  }
+}
+
+multiclass SIMDIndexedHS<bit U, bits<4> opc, string asm,
+                         SDPatternOperator OpNode> {
+  def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b01, opc, V64, V64,
+                                      V128_lo, VectorIndexH,
+                                      asm, ".4h", ".4h", ".4h", ".h",
+    [(set (v4i16 V64:$Rd),
+        (OpNode (v4i16 V64:$Rn),
+         (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
+    bits<3> idx;
+    let Inst{11} = idx{2};
+    let Inst{21} = idx{1};
+    let Inst{20} = idx{0};
+  }
+
+  def v8i16_indexed : BaseSIMDIndexed<1, U, 0, 0b01, opc,
+                                      V128, V128,
+                                      V128_lo, VectorIndexH,
+                                      asm, ".8h", ".8h", ".8h", ".h",
+    [(set (v8i16 V128:$Rd),
+       (OpNode (v8i16 V128:$Rn),
+         (v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
+    bits<3> idx;
+    let Inst{11} = idx{2};
+    let Inst{21} = idx{1};
+    let Inst{20} = idx{0};
+  }
+
+  def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc,
+                                      V64, V64,
+                                      V128, VectorIndexS,
+                                      asm, ".2s", ".2s", ".2s",  ".s",
+    [(set (v2i32 V64:$Rd),
+       (OpNode (v2i32 V64:$Rn),
+          (v2i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> {
+    bits<2> idx;
+    let Inst{11} = idx{1};
+    let Inst{21} = idx{0};
+  }
+
+  def v4i32_indexed : BaseSIMDIndexed<1, U, 0, 0b10, opc,
+                                      V128, V128,
+                                      V128, VectorIndexS,
+                                      asm, ".4s", ".4s", ".4s", ".s",
+    [(set (v4i32 V128:$Rd),
+       (OpNode (v4i32 V128:$Rn),
+          (v4i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> {
+    bits<2> idx;
+    let Inst{11} = idx{1};
+    let Inst{21} = idx{0};
+  }
+
+  def v1i16_indexed : BaseSIMDIndexed<1, U, 1, 0b01, opc,
+                                      FPR16Op, FPR16Op, V128_lo, VectorIndexH,
+                                      asm, ".h", "", "", ".h", []> {
+    bits<3> idx;
+    let Inst{11} = idx{2};
+    let Inst{21} = idx{1};
+    let Inst{20} = idx{0};
+  }
+
+  def v1i32_indexed : BaseSIMDIndexed<1, U, 1, 0b10, opc,
+                                      FPR32Op, FPR32Op, V128, VectorIndexS,
+                                      asm, ".s", "", "", ".s",
+      [(set (i32 FPR32Op:$Rd),
+            (OpNode FPR32Op:$Rn,
+                    (i32 (vector_extract (v4i32 V128:$Rm),
+                                         VectorIndexS:$idx))))]> {
+    bits<2> idx;
+    let Inst{11} = idx{1};
+    let Inst{21} = idx{0};
+  }
+}
+
+multiclass SIMDVectorIndexedHS<bit U, bits<4> opc, string asm,
+                               SDPatternOperator OpNode> {
+  def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b01, opc,
+                                      V64, V64,
+                                      V128_lo, VectorIndexH,
+                                      asm, ".4h", ".4h", ".4h", ".h",
+    [(set (v4i16 V64:$Rd),
+        (OpNode (v4i16 V64:$Rn),
+         (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
+    bits<3> idx;
+    let Inst{11} = idx{2};
+    let Inst{21} = idx{1};
+    let Inst{20} = idx{0};
+  }
+
+  def v8i16_indexed : BaseSIMDIndexed<1, U, 0, 0b01, opc,
+                                      V128, V128,
+                                      V128_lo, VectorIndexH,
+                                      asm, ".8h", ".8h", ".8h", ".h",
+    [(set (v8i16 V128:$Rd),
+       (OpNode (v8i16 V128:$Rn),
+         (v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
+    bits<3> idx;
+    let Inst{11} = idx{2};
+    let Inst{21} = idx{1};
+    let Inst{20} = idx{0};
+  }
+
+  def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc,
+                                      V64, V64,
+                                      V128, VectorIndexS,
+                                      asm, ".2s", ".2s", ".2s", ".s",
+    [(set (v2i32 V64:$Rd),
+       (OpNode (v2i32 V64:$Rn),
+          (v2i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> {
+    bits<2> idx;
+    let Inst{11} = idx{1};
+    let Inst{21} = idx{0};
+  }
+
+  def v4i32_indexed : BaseSIMDIndexed<1, U, 0, 0b10, opc,
+                                      V128, V128,
+                                      V128, VectorIndexS,
+                                      asm, ".4s", ".4s", ".4s", ".s",
+    [(set (v4i32 V128:$Rd),
+       (OpNode (v4i32 V128:$Rn),
+          (v4i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> {
+    bits<2> idx;
+    let Inst{11} = idx{1};
+    let Inst{21} = idx{0};
+  }
+}
+
+multiclass SIMDVectorIndexedHSTied<bit U, bits<4> opc, string asm,
+                                   SDPatternOperator OpNode> {
+  def v4i16_indexed : BaseSIMDIndexedTied<0, U, 0, 0b01, opc, V64, V64,
+                                          V128_lo, VectorIndexH,
+                                          asm, ".4h", ".4h", ".4h", ".h",
+    [(set (v4i16 V64:$dst),
+        (OpNode (v4i16 V64:$Rd),(v4i16 V64:$Rn),
+         (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
+    bits<3> idx;
+    let Inst{11} = idx{2};
+    let Inst{21} = idx{1};
+    let Inst{20} = idx{0};
+  }
+
+  def v8i16_indexed : BaseSIMDIndexedTied<1, U, 0, 0b01, opc,
+                                      V128, V128,
+                                      V128_lo, VectorIndexH,
+                                      asm, ".8h", ".8h", ".8h", ".h",
+    [(set (v8i16 V128:$dst),
+       (OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn),
+         (v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
+    bits<3> idx;
+    let Inst{11} = idx{2};
+    let Inst{21} = idx{1};
+    let Inst{20} = idx{0};
+  }
+
+  def v2i32_indexed : BaseSIMDIndexedTied<0, U, 0, 0b10, opc,
+                                      V64, V64,
+                                      V128, VectorIndexS,
+                                      asm, ".2s", ".2s", ".2s", ".s",
+    [(set (v2i32 V64:$dst),
+       (OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn),
+          (v2i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> {
+    bits<2> idx;
+    let Inst{11} = idx{1};
+    let Inst{21} = idx{0};
+  }
+
+  def v4i32_indexed : BaseSIMDIndexedTied<1, U, 0, 0b10, opc,
+                                      V128, V128,
+                                      V128, VectorIndexS,
+                                      asm, ".4s", ".4s", ".4s", ".s",
+    [(set (v4i32 V128:$dst),
+       (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn),
+          (v4i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> {
+    bits<2> idx;
+    let Inst{11} = idx{1};
+    let Inst{21} = idx{0};
+  }
+}
+
+multiclass SIMDIndexedLongSD<bit U, bits<4> opc, string asm,
+                             SDPatternOperator OpNode> {
+  def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b01, opc,
+                                      V128, V64,
+                                      V128_lo, VectorIndexH,
+                                      asm, ".4s", ".4s", ".4h", ".h",
+    [(set (v4i32 V128:$Rd),
+        (OpNode (v4i16 V64:$Rn),
+         (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
+    bits<3> idx;
+    let Inst{11} = idx{2};
+    let Inst{21} = idx{1};
+    let Inst{20} = idx{0};
+  }
+
+  def v8i16_indexed : BaseSIMDIndexed<1, U, 0, 0b01, opc,
+                                      V128, V128,
+                                      V128_lo, VectorIndexH,
+                                      asm#"2", ".4s", ".4s", ".8h", ".h",
+    [(set (v4i32 V128:$Rd),
+          (OpNode (extract_high_v8i16 V128:$Rn),
+                  (extract_high_v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm),
+                                                      VectorIndexH:$idx))))]> {
+
+    bits<3> idx;
+    let Inst{11} = idx{2};
+    let Inst{21} = idx{1};
+    let Inst{20} = idx{0};
+  }
+
+  def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc,
+                                      V128, V64,
+                                      V128, VectorIndexS,
+                                      asm, ".2d", ".2d", ".2s", ".s",
+    [(set (v2i64 V128:$Rd),
+        (OpNode (v2i32 V64:$Rn),
+         (v2i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> {
+    bits<2> idx;
+    let Inst{11} = idx{1};
+    let Inst{21} = idx{0};
+  }
+
+  def v4i32_indexed : BaseSIMDIndexed<1, U, 0, 0b10, opc,
+                                      V128, V128,
+                                      V128, VectorIndexS,
+                                      asm#"2", ".2d", ".2d", ".4s", ".s",
+    [(set (v2i64 V128:$Rd),
+          (OpNode (extract_high_v4i32 V128:$Rn),
+                  (extract_high_v4i32 (AArch64duplane32 (v4i32 V128:$Rm),
+                                                      VectorIndexS:$idx))))]> {
+    bits<2> idx;
+    let Inst{11} = idx{1};
+    let Inst{21} = idx{0};
+  }
+
+  def v1i32_indexed : BaseSIMDIndexed<1, U, 1, 0b01, opc,
+                                      FPR32Op, FPR16Op, V128_lo, VectorIndexH,
+                                      asm, ".h", "", "", ".h", []> {
+    bits<3> idx;
+    let Inst{11} = idx{2};
+    let Inst{21} = idx{1};
+    let Inst{20} = idx{0};
+  }
+
+  def v1i64_indexed : BaseSIMDIndexed<1, U, 1, 0b10, opc,
+                                      FPR64Op, FPR32Op, V128, VectorIndexS,
+                                      asm, ".s", "", "", ".s", []> {
+    bits<2> idx;
+    let Inst{11} = idx{1};
+    let Inst{21} = idx{0};
+  }
+}
+
+multiclass SIMDIndexedLongSQDMLXSDTied<bit U, bits<4> opc, string asm,
+                                       SDPatternOperator Accum> {
+  def v4i16_indexed : BaseSIMDIndexedTied<0, U, 0, 0b01, opc,
+                                      V128, V64,
+                                      V128_lo, VectorIndexH,
+                                      asm, ".4s", ".4s", ".4h", ".h",
+    [(set (v4i32 V128:$dst),
+          (Accum (v4i32 V128:$Rd),
+                 (v4i32 (int_aarch64_neon_sqdmull
+                             (v4i16 V64:$Rn),
+                             (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm),
+                                                    VectorIndexH:$idx))))))]> {
+    bits<3> idx;
+    let Inst{11} = idx{2};
+    let Inst{21} = idx{1};
+    let Inst{20} = idx{0};
+  }
+
+  // FIXME: it would be nice to use the scalar (v1i32) instruction here, but an
+  // intermediate EXTRACT_SUBREG would be untyped.
+  def : Pat<(i32 (Accum (i32 FPR32Op:$Rd),
+                (i32 (vector_extract (v4i32
+                         (int_aarch64_neon_sqdmull (v4i16 V64:$Rn),
+                             (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm),
+                                                    VectorIndexH:$idx)))),
+                         (i64 0))))),
+            (EXTRACT_SUBREG
+                (!cast<Instruction>(NAME # v4i16_indexed)
+                    (SUBREG_TO_REG (i32 0), FPR32Op:$Rd, ssub), V64:$Rn,
+                    V128_lo:$Rm, VectorIndexH:$idx),
+                ssub)>;
+
+  def v8i16_indexed : BaseSIMDIndexedTied<1, U, 0, 0b01, opc,
+                                      V128, V128,
+                                      V128_lo, VectorIndexH,
+                                      asm#"2", ".4s", ".4s", ".8h", ".h",
+    [(set (v4i32 V128:$dst),
+          (Accum (v4i32 V128:$Rd),
+                 (v4i32 (int_aarch64_neon_sqdmull
+                            (extract_high_v8i16 V128:$Rn),
+                            (extract_high_v8i16
+                                (AArch64duplane16 (v8i16 V128_lo:$Rm),
+                                                VectorIndexH:$idx))))))]> {
+    bits<3> idx;
+    let Inst{11} = idx{2};
+    let Inst{21} = idx{1};
+    let Inst{20} = idx{0};
+  }
+
+  def v2i32_indexed : BaseSIMDIndexedTied<0, U, 0, 0b10, opc,
+                                      V128, V64,
+                                      V128, VectorIndexS,
+                                      asm, ".2d", ".2d", ".2s", ".s",
+    [(set (v2i64 V128:$dst),
+        (Accum (v2i64 V128:$Rd),
+               (v2i64 (int_aarch64_neon_sqdmull
+                          (v2i32 V64:$Rn),
+                          (v2i32 (AArch64duplane32 (v4i32 V128:$Rm),
+                                                 VectorIndexS:$idx))))))]> {
+    bits<2> idx;
+    let Inst{11} = idx{1};
+    let Inst{21} = idx{0};
+  }
+
+  def v4i32_indexed : BaseSIMDIndexedTied<1, U, 0, 0b10, opc,
+                                      V128, V128,
+                                      V128, VectorIndexS,
+                                      asm#"2", ".2d", ".2d", ".4s", ".s",
+    [(set (v2i64 V128:$dst),
+          (Accum (v2i64 V128:$Rd),
+                 (v2i64 (int_aarch64_neon_sqdmull
+                            (extract_high_v4i32 V128:$Rn),
+                            (extract_high_v4i32
+                                (AArch64duplane32 (v4i32 V128:$Rm),
+                                                VectorIndexS:$idx))))))]> {
+    bits<2> idx;
+    let Inst{11} = idx{1};
+    let Inst{21} = idx{0};
+  }
+
+  def v1i32_indexed : BaseSIMDIndexedTied<1, U, 1, 0b01, opc,
+                                      FPR32Op, FPR16Op, V128_lo, VectorIndexH,
+                                      asm, ".h", "", "", ".h", []> {
+    bits<3> idx;
+    let Inst{11} = idx{2};
+    let Inst{21} = idx{1};
+    let Inst{20} = idx{0};
+  }
+
+
+  def v1i64_indexed : BaseSIMDIndexedTied<1, U, 1, 0b10, opc,
+                                      FPR64Op, FPR32Op, V128, VectorIndexS,
+                                      asm, ".s", "", "", ".s",
+    [(set (i64 FPR64Op:$dst),
+          (Accum (i64 FPR64Op:$Rd),
+                 (i64 (int_aarch64_neon_sqdmulls_scalar
+                            (i32 FPR32Op:$Rn),
+                            (i32 (vector_extract (v4i32 V128:$Rm),
+                                                 VectorIndexS:$idx))))))]> {
+
+    bits<2> idx;
+    let Inst{11} = idx{1};
+    let Inst{21} = idx{0};
+  }
+}
+
+multiclass SIMDVectorIndexedLongSD<bit U, bits<4> opc, string asm,
+                                   SDPatternOperator OpNode> {
+  let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
+  def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b01, opc,
+                                      V128, V64,
+                                      V128_lo, VectorIndexH,
+                                      asm, ".4s", ".4s", ".4h", ".h",
+    [(set (v4i32 V128:$Rd),
+        (OpNode (v4i16 V64:$Rn),
+         (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
+    bits<3> idx;
+    let Inst{11} = idx{2};
+    let Inst{21} = idx{1};
+    let Inst{20} = idx{0};
+  }
+
+  def v8i16_indexed : BaseSIMDIndexed<1, U, 0, 0b01, opc,
+                                      V128, V128,
+                                      V128_lo, VectorIndexH,
+                                      asm#"2", ".4s", ".4s", ".8h", ".h",
+    [(set (v4i32 V128:$Rd),
+          (OpNode (extract_high_v8i16 V128:$Rn),
+                  (extract_high_v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm),
+                                                      VectorIndexH:$idx))))]> {
+
+    bits<3> idx;
+    let Inst{11} = idx{2};
+    let Inst{21} = idx{1};
+    let Inst{20} = idx{0};
+  }
+
+  def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc,
+                                      V128, V64,
+                                      V128, VectorIndexS,
+                                      asm, ".2d", ".2d", ".2s", ".s",
+    [(set (v2i64 V128:$Rd),
+        (OpNode (v2i32 V64:$Rn),
+         (v2i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> {
+    bits<2> idx;
+    let Inst{11} = idx{1};
+    let Inst{21} = idx{0};
+  }
+
+  def v4i32_indexed : BaseSIMDIndexed<1, U, 0, 0b10, opc,
+                                      V128, V128,
+                                      V128, VectorIndexS,
+                                      asm#"2", ".2d", ".2d", ".4s", ".s",
+    [(set (v2i64 V128:$Rd),
+          (OpNode (extract_high_v4i32 V128:$Rn),
+                  (extract_high_v4i32 (AArch64duplane32 (v4i32 V128:$Rm),
+                                                      VectorIndexS:$idx))))]> {
+    bits<2> idx;
+    let Inst{11} = idx{1};
+    let Inst{21} = idx{0};
+  }
+  }
+}
+
+multiclass SIMDVectorIndexedLongSDTied<bit U, bits<4> opc, string asm,
+                                       SDPatternOperator OpNode> {
+  let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
+  def v4i16_indexed : BaseSIMDIndexedTied<0, U, 0, 0b01, opc,
+                                      V128, V64,
+                                      V128_lo, VectorIndexH,
+                                      asm, ".4s", ".4s", ".4h", ".h",
+    [(set (v4i32 V128:$dst),
+        (OpNode (v4i32 V128:$Rd), (v4i16 V64:$Rn),
+         (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
+    bits<3> idx;
+    let Inst{11} = idx{2};
+    let Inst{21} = idx{1};
+    let Inst{20} = idx{0};
+  }
+
+  def v8i16_indexed : BaseSIMDIndexedTied<1, U, 0, 0b01, opc,
+                                      V128, V128,
+                                      V128_lo, VectorIndexH,
+                                      asm#"2", ".4s", ".4s", ".8h", ".h",
+    [(set (v4i32 V128:$dst),
+          (OpNode (v4i32 V128:$Rd),
+                  (extract_high_v8i16 V128:$Rn),
+                  (extract_high_v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm),
+                                                      VectorIndexH:$idx))))]> {
+    bits<3> idx;
+    let Inst{11} = idx{2};
+    let Inst{21} = idx{1};
+    let Inst{20} = idx{0};
+  }
+
+  def v2i32_indexed : BaseSIMDIndexedTied<0, U, 0, 0b10, opc,
+                                      V128, V64,
+                                      V128, VectorIndexS,
+                                      asm, ".2d", ".2d", ".2s", ".s",
+    [(set (v2i64 V128:$dst),
+        (OpNode (v2i64 V128:$Rd), (v2i32 V64:$Rn),
+         (v2i32 (AArch64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> {
+    bits<2> idx;
+    let Inst{11} = idx{1};
+    let Inst{21} = idx{0};
+  }
+
+  def v4i32_indexed : BaseSIMDIndexedTied<1, U, 0, 0b10, opc,
+                                      V128, V128,
+                                      V128, VectorIndexS,
+                                      asm#"2", ".2d", ".2d", ".4s", ".s",
+    [(set (v2i64 V128:$dst),
+          (OpNode (v2i64 V128:$Rd),
+                  (extract_high_v4i32 V128:$Rn),
+                  (extract_high_v4i32 (AArch64duplane32 (v4i32 V128:$Rm),
+                                                      VectorIndexS:$idx))))]> {
+    bits<2> idx;
+    let Inst{11} = idx{1};
+    let Inst{21} = idx{0};
+  }
+  }
+}
+
+//----------------------------------------------------------------------------
+// AdvSIMD scalar shift by immediate
+//----------------------------------------------------------------------------
+
+let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in
+class BaseSIMDScalarShift<bit U, bits<5> opc, bits<7> fixed_imm,
+                     RegisterClass regtype1, RegisterClass regtype2,
+                     Operand immtype, string asm, list<dag> pattern>
+  : I<(outs regtype1:$Rd), (ins regtype2:$Rn, immtype:$imm),
+      asm, "\t$Rd, $Rn, $imm", "", pattern>,
+    Sched<[WriteV]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  bits<7> imm;
+  let Inst{31-30} = 0b01;
+  let Inst{29}    = U;
+  let Inst{28-23} = 0b111110;
+  let Inst{22-16} = fixed_imm;
+  let Inst{15-11} = opc;
+  let Inst{10}    = 1;
+  let Inst{9-5} = Rn;
+  let Inst{4-0} = Rd;
+}
+
+let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in
+class BaseSIMDScalarShiftTied<bit U, bits<5> opc, bits<7> fixed_imm,
+                     RegisterClass regtype1, RegisterClass regtype2,
+                     Operand immtype, string asm, list<dag> pattern>
+  : I<(outs regtype1:$dst), (ins regtype1:$Rd, regtype2:$Rn, immtype:$imm),
+      asm, "\t$Rd, $Rn, $imm", "$Rd = $dst", pattern>,
+    Sched<[WriteV]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  bits<7> imm;
+  let Inst{31-30} = 0b01;
+  let Inst{29}    = U;
+  let Inst{28-23} = 0b111110;
+  let Inst{22-16} = fixed_imm;
+  let Inst{15-11} = opc;
+  let Inst{10}    = 1;
+  let Inst{9-5} = Rn;
+  let Inst{4-0} = Rd;
+}
+
+
+multiclass SIMDScalarRShiftSD<bit U, bits<5> opc, string asm> {
+  def s : BaseSIMDScalarShift<U, opc, {0,1,?,?,?,?,?},
+                              FPR32, FPR32, vecshiftR32, asm, []> {
+    let Inst{20-16} = imm{4-0};
+  }
+
+  def d : BaseSIMDScalarShift<U, opc, {1,?,?,?,?,?,?},
+                              FPR64, FPR64, vecshiftR64, asm, []> {
+    let Inst{21-16} = imm{5-0};
+  }
+}
+
+multiclass SIMDScalarRShiftD<bit U, bits<5> opc, string asm,
+                             SDPatternOperator OpNode> {
+  def d : BaseSIMDScalarShift<U, opc, {1,?,?,?,?,?,?},
+                              FPR64, FPR64, vecshiftR64, asm,
+  [(set (i64 FPR64:$Rd),
+     (OpNode (i64 FPR64:$Rn), (i32 vecshiftR64:$imm)))]> {
+    let Inst{21-16} = imm{5-0};
+  }
+
+  def : Pat<(v1i64 (OpNode (v1i64 FPR64:$Rn), (i32 vecshiftR64:$imm))),
+            (!cast<Instruction>(NAME # "d") FPR64:$Rn, vecshiftR64:$imm)>;
+}
+
+multiclass SIMDScalarRShiftDTied<bit U, bits<5> opc, string asm,
+                                 SDPatternOperator OpNode = null_frag> {
+  def d : BaseSIMDScalarShiftTied<U, opc, {1,?,?,?,?,?,?},
+                              FPR64, FPR64, vecshiftR64, asm,
+  [(set (i64 FPR64:$dst), (OpNode (i64 FPR64:$Rd), (i64 FPR64:$Rn),
+                                                   (i32 vecshiftR64:$imm)))]> {
+    let Inst{21-16} = imm{5-0};
+  }
+
+  def : Pat<(v1i64 (OpNode (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn),
+                           (i32 vecshiftR64:$imm))),
+            (!cast<Instruction>(NAME # "d") FPR64:$Rd, FPR64:$Rn,
+                                            vecshiftR64:$imm)>;
+}
+
+multiclass SIMDScalarLShiftD<bit U, bits<5> opc, string asm,
+                             SDPatternOperator OpNode> {
+  def d : BaseSIMDScalarShift<U, opc, {1,?,?,?,?,?,?},
+                              FPR64, FPR64, vecshiftL64, asm,
+    [(set (v1i64 FPR64:$Rd),
+       (OpNode (v1i64 FPR64:$Rn), (i32 vecshiftL64:$imm)))]> {
+    let Inst{21-16} = imm{5-0};
+  }
+}
+
+let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in
+multiclass SIMDScalarLShiftDTied<bit U, bits<5> opc, string asm> {
+  def d : BaseSIMDScalarShiftTied<U, opc, {1,?,?,?,?,?,?},
+                              FPR64, FPR64, vecshiftL64, asm, []> {
+    let Inst{21-16} = imm{5-0};
+  }
+}
+
+let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in
+multiclass SIMDScalarRShiftBHS<bit U, bits<5> opc, string asm,
+                               SDPatternOperator OpNode = null_frag> {
+  def b : BaseSIMDScalarShift<U, opc, {0,0,0,1,?,?,?},
+                              FPR8, FPR16, vecshiftR8, asm, []> {
+    let Inst{18-16} = imm{2-0};
+  }
+
+  def h : BaseSIMDScalarShift<U, opc, {0,0,1,?,?,?,?},
+                              FPR16, FPR32, vecshiftR16, asm, []> {
+    let Inst{19-16} = imm{3-0};
+  }
+
+  def s : BaseSIMDScalarShift<U, opc, {0,1,?,?,?,?,?},
+                              FPR32, FPR64, vecshiftR32, asm,
+    [(set (i32 FPR32:$Rd), (OpNode (i64 FPR64:$Rn), vecshiftR32:$imm))]> {
+    let Inst{20-16} = imm{4-0};
+  }
+}
+
+multiclass SIMDScalarLShiftBHSD<bit U, bits<5> opc, string asm,
+                                SDPatternOperator OpNode> {
+  def b : BaseSIMDScalarShift<U, opc, {0,0,0,1,?,?,?},
+                              FPR8, FPR8, vecshiftL8, asm, []> {
+    let Inst{18-16} = imm{2-0};
+  }
+
+  def h : BaseSIMDScalarShift<U, opc, {0,0,1,?,?,?,?},
+                              FPR16, FPR16, vecshiftL16, asm, []> {
+    let Inst{19-16} = imm{3-0};
+  }
+
+  def s : BaseSIMDScalarShift<U, opc, {0,1,?,?,?,?,?},
+                              FPR32, FPR32, vecshiftL32, asm,
+    [(set (i32 FPR32:$Rd), (OpNode (i32 FPR32:$Rn), (i32 vecshiftL32:$imm)))]> {
+    let Inst{20-16} = imm{4-0};
+  }
+
+  def d : BaseSIMDScalarShift<U, opc, {1,?,?,?,?,?,?},
+                              FPR64, FPR64, vecshiftL64, asm,
+    [(set (i64 FPR64:$Rd), (OpNode (i64 FPR64:$Rn), (i32 vecshiftL64:$imm)))]> {
+    let Inst{21-16} = imm{5-0};
+  }
+
+  def : Pat<(v1i64 (OpNode (v1i64 FPR64:$Rn), (i32 vecshiftL64:$imm))),
+            (!cast<Instruction>(NAME # "d") FPR64:$Rn, vecshiftL64:$imm)>;
+}
+
+multiclass SIMDScalarRShiftBHSD<bit U, bits<5> opc, string asm> {
+  def b : BaseSIMDScalarShift<U, opc, {0,0,0,1,?,?,?},
+                              FPR8, FPR8, vecshiftR8, asm, []> {
+    let Inst{18-16} = imm{2-0};
+  }
+
+  def h : BaseSIMDScalarShift<U, opc, {0,0,1,?,?,?,?},
+                              FPR16, FPR16, vecshiftR16, asm, []> {
+    let Inst{19-16} = imm{3-0};
+  }
+
+  def s : BaseSIMDScalarShift<U, opc, {0,1,?,?,?,?,?},
+                              FPR32, FPR32, vecshiftR32, asm, []> {
+    let Inst{20-16} = imm{4-0};
+  }
+
+  def d : BaseSIMDScalarShift<U, opc, {1,?,?,?,?,?,?},
+                              FPR64, FPR64, vecshiftR64, asm, []> {
+    let Inst{21-16} = imm{5-0};
+  }
+}
+
+//----------------------------------------------------------------------------
+// AdvSIMD vector x indexed element
+//----------------------------------------------------------------------------
+
+let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in
+class BaseSIMDVectorShift<bit Q, bit U, bits<5> opc, bits<7> fixed_imm,
+                     RegisterOperand dst_reg, RegisterOperand src_reg,
+                     Operand immtype,
+                     string asm, string dst_kind, string src_kind,
+                     list<dag> pattern>
+  : I<(outs dst_reg:$Rd), (ins src_reg:$Rn, immtype:$imm),
+      asm, "{\t$Rd" # dst_kind # ", $Rn" # src_kind # ", $imm" #
+           "|" # dst_kind # "\t$Rd, $Rn, $imm}", "", pattern>,
+    Sched<[WriteV]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  let Inst{31}    = 0;
+  let Inst{30}    = Q;
+  let Inst{29}    = U;
+  let Inst{28-23} = 0b011110;
+  let Inst{22-16} = fixed_imm;
+  let Inst{15-11} = opc;
+  let Inst{10}    = 1;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in
+class BaseSIMDVectorShiftTied<bit Q, bit U, bits<5> opc, bits<7> fixed_imm,
+                     RegisterOperand vectype1, RegisterOperand vectype2,
+                     Operand immtype,
+                     string asm, string dst_kind, string src_kind,
+                     list<dag> pattern>
+  : I<(outs vectype1:$dst), (ins vectype1:$Rd, vectype2:$Rn, immtype:$imm),
+      asm, "{\t$Rd" # dst_kind # ", $Rn" # src_kind # ", $imm" #
+           "|" # dst_kind # "\t$Rd, $Rn, $imm}", "$Rd = $dst", pattern>,
+    Sched<[WriteV]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  let Inst{31}    = 0;
+  let Inst{30}    = Q;
+  let Inst{29}    = U;
+  let Inst{28-23} = 0b011110;
+  let Inst{22-16} = fixed_imm;
+  let Inst{15-11} = opc;
+  let Inst{10}    = 1;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
+multiclass SIMDVectorRShiftSD<bit U, bits<5> opc, string asm,
+                              Intrinsic OpNode> {
+  def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?},
+                                  V64, V64, vecshiftR32,
+                                  asm, ".2s", ".2s",
+      [(set (v2i32 V64:$Rd), (OpNode (v2f32 V64:$Rn), (i32 imm:$imm)))]> {
+    bits<5> imm;
+    let Inst{20-16} = imm;
+  }
+
+  def v4i32_shift : BaseSIMDVectorShift<1, U, opc, {0,1,?,?,?,?,?},
+                                  V128, V128, vecshiftR32,
+                                  asm, ".4s", ".4s",
+      [(set (v4i32 V128:$Rd), (OpNode (v4f32 V128:$Rn), (i32 imm:$imm)))]> {
+    bits<5> imm;
+    let Inst{20-16} = imm;
+  }
+
+  def v2i64_shift : BaseSIMDVectorShift<1, U, opc, {1,?,?,?,?,?,?},
+                                  V128, V128, vecshiftR64,
+                                  asm, ".2d", ".2d",
+      [(set (v2i64 V128:$Rd), (OpNode (v2f64 V128:$Rn), (i32 imm:$imm)))]> {
+    bits<6> imm;
+    let Inst{21-16} = imm;
+  }
+}
+
+multiclass SIMDVectorRShiftSDToFP<bit U, bits<5> opc, string asm,
+                                  Intrinsic OpNode> {
+  def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?},
+                                  V64, V64, vecshiftR32,
+                                  asm, ".2s", ".2s",
+      [(set (v2f32 V64:$Rd), (OpNode (v2i32 V64:$Rn), (i32 imm:$imm)))]> {
+    bits<5> imm;
+    let Inst{20-16} = imm;
+  }
+
+  def v4i32_shift : BaseSIMDVectorShift<1, U, opc, {0,1,?,?,?,?,?},
+                                  V128, V128, vecshiftR32,
+                                  asm, ".4s", ".4s",
+      [(set (v4f32 V128:$Rd), (OpNode (v4i32 V128:$Rn), (i32 imm:$imm)))]> {
+    bits<5> imm;
+    let Inst{20-16} = imm;
+  }
+
+  def v2i64_shift : BaseSIMDVectorShift<1, U, opc, {1,?,?,?,?,?,?},
+                                  V128, V128, vecshiftR64,
+                                  asm, ".2d", ".2d",
+      [(set (v2f64 V128:$Rd), (OpNode (v2i64 V128:$Rn), (i32 imm:$imm)))]> {
+    bits<6> imm;
+    let Inst{21-16} = imm;
+  }
+}
+
+multiclass SIMDVectorRShiftNarrowBHS<bit U, bits<5> opc, string asm,
+                                     SDPatternOperator OpNode> {
+  def v8i8_shift : BaseSIMDVectorShift<0, U, opc, {0,0,0,1,?,?,?},
+                                  V64, V128, vecshiftR16Narrow,
+                                  asm, ".8b", ".8h",
+      [(set (v8i8 V64:$Rd), (OpNode (v8i16 V128:$Rn), vecshiftR16Narrow:$imm))]> {
+    bits<3> imm;
+    let Inst{18-16} = imm;
+  }
+
+  def v16i8_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,0,1,?,?,?},
+                                  V128, V128, vecshiftR16Narrow,
+                                  asm#"2", ".16b", ".8h", []> {
+    bits<3> imm;
+    let Inst{18-16} = imm;
+    let hasSideEffects = 0;
+  }
+
+  def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?},
+                                  V64, V128, vecshiftR32Narrow,
+                                  asm, ".4h", ".4s",
+      [(set (v4i16 V64:$Rd), (OpNode (v4i32 V128:$Rn), vecshiftR32Narrow:$imm))]> {
+    bits<4> imm;
+    let Inst{19-16} = imm;
+  }
+
+  def v8i16_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,1,?,?,?,?},
+                                  V128, V128, vecshiftR32Narrow,
+                                  asm#"2", ".8h", ".4s", []> {
+    bits<4> imm;
+    let Inst{19-16} = imm;
+    let hasSideEffects = 0;
+  }
+
+  def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?},
+                                  V64, V128, vecshiftR64Narrow,
+                                  asm, ".2s", ".2d",
+      [(set (v2i32 V64:$Rd), (OpNode (v2i64 V128:$Rn), vecshiftR64Narrow:$imm))]> {
+    bits<5> imm;
+    let Inst{20-16} = imm;
+  }
+
+  def v4i32_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,1,?,?,?,?,?},
+                                  V128, V128, vecshiftR64Narrow,
+                                  asm#"2", ".4s", ".2d", []> {
+    bits<5> imm;
+    let Inst{20-16} = imm;
+    let hasSideEffects = 0;
+  }
+
+  // TableGen doesn't like patters w/ INSERT_SUBREG on the instructions
+  // themselves, so put them here instead.
+
+  // Patterns involving what's effectively an insert high and a normal
+  // intrinsic, represented by CONCAT_VECTORS.
+  def : Pat<(concat_vectors (v8i8 V64:$Rd),(OpNode (v8i16 V128:$Rn),
+                                                   vecshiftR16Narrow:$imm)),
+            (!cast<Instruction>(NAME # "v16i8_shift")
+                (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub),
+                V128:$Rn, vecshiftR16Narrow:$imm)>;
+  def : Pat<(concat_vectors (v4i16 V64:$Rd), (OpNode (v4i32 V128:$Rn),
+                                                     vecshiftR32Narrow:$imm)),
+            (!cast<Instruction>(NAME # "v8i16_shift")
+                (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub),
+                V128:$Rn, vecshiftR32Narrow:$imm)>;
+  def : Pat<(concat_vectors (v2i32 V64:$Rd), (OpNode (v2i64 V128:$Rn),
+                                                     vecshiftR64Narrow:$imm)),
+            (!cast<Instruction>(NAME # "v4i32_shift")
+                (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub),
+                V128:$Rn, vecshiftR64Narrow:$imm)>;
+}
+
+multiclass SIMDVectorLShiftBHSD<bit U, bits<5> opc, string asm,
+                                SDPatternOperator OpNode> {
+  def v8i8_shift : BaseSIMDVectorShift<0, U, opc, {0,0,0,1,?,?,?},
+                                  V64, V64, vecshiftL8,
+                                  asm, ".8b", ".8b",
+                 [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn),
+                       (i32 vecshiftL8:$imm)))]> {
+    bits<3> imm;
+    let Inst{18-16} = imm;
+  }
+
+  def v16i8_shift : BaseSIMDVectorShift<1, U, opc, {0,0,0,1,?,?,?},
+                                  V128, V128, vecshiftL8,
+                                  asm, ".16b", ".16b",
+             [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn),
+                   (i32 vecshiftL8:$imm)))]> {
+    bits<3> imm;
+    let Inst{18-16} = imm;
+  }
+
+  def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?},
+                                  V64, V64, vecshiftL16,
+                                  asm, ".4h", ".4h",
+              [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn),
+                    (i32 vecshiftL16:$imm)))]> {
+    bits<4> imm;
+    let Inst{19-16} = imm;
+  }
+
+  def v8i16_shift : BaseSIMDVectorShift<1, U, opc, {0,0,1,?,?,?,?},
+                                  V128, V128, vecshiftL16,
+                                  asm, ".8h", ".8h",
+            [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn),
+                  (i32 vecshiftL16:$imm)))]> {
+    bits<4> imm;
+    let Inst{19-16} = imm;
+  }
+
+  def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?},
+                                  V64, V64, vecshiftL32,
+                                  asm, ".2s", ".2s",
+              [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn),
+                    (i32 vecshiftL32:$imm)))]> {
+    bits<5> imm;
+    let Inst{20-16} = imm;
+  }
+
+  def v4i32_shift : BaseSIMDVectorShift<1, U, opc, {0,1,?,?,?,?,?},
+                                  V128, V128, vecshiftL32,
+                                  asm, ".4s", ".4s",
+            [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn),
+                  (i32 vecshiftL32:$imm)))]> {
+    bits<5> imm;
+    let Inst{20-16} = imm;
+  }
+
+  def v2i64_shift : BaseSIMDVectorShift<1, U, opc, {1,?,?,?,?,?,?},
+                                  V128, V128, vecshiftL64,
+                                  asm, ".2d", ".2d",
+            [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn),
+                  (i32 vecshiftL64:$imm)))]> {
+    bits<6> imm;
+    let Inst{21-16} = imm;
+  }
+}
+
+multiclass SIMDVectorRShiftBHSD<bit U, bits<5> opc, string asm,
+                                SDPatternOperator OpNode> {
+  def v8i8_shift : BaseSIMDVectorShift<0, U, opc, {0,0,0,1,?,?,?},
+                                  V64, V64, vecshiftR8,
+                                  asm, ".8b", ".8b",
+                 [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn),
+                       (i32 vecshiftR8:$imm)))]> {
+    bits<3> imm;
+    let Inst{18-16} = imm;
+  }
+
+  def v16i8_shift : BaseSIMDVectorShift<1, U, opc, {0,0,0,1,?,?,?},
+                                  V128, V128, vecshiftR8,
+                                  asm, ".16b", ".16b",
+             [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn),
+                   (i32 vecshiftR8:$imm)))]> {
+    bits<3> imm;
+    let Inst{18-16} = imm;
+  }
+
+  def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?},
+                                  V64, V64, vecshiftR16,
+                                  asm, ".4h", ".4h",
+              [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn),
+                    (i32 vecshiftR16:$imm)))]> {
+    bits<4> imm;
+    let Inst{19-16} = imm;
+  }
+
+  def v8i16_shift : BaseSIMDVectorShift<1, U, opc, {0,0,1,?,?,?,?},
+                                  V128, V128, vecshiftR16,
+                                  asm, ".8h", ".8h",
+            [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn),
+                  (i32 vecshiftR16:$imm)))]> {
+    bits<4> imm;
+    let Inst{19-16} = imm;
+  }
+
+  def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?},
+                                  V64, V64, vecshiftR32,
+                                  asm, ".2s", ".2s",
+              [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn),
+                    (i32 vecshiftR32:$imm)))]> {
+    bits<5> imm;
+    let Inst{20-16} = imm;
+  }
+
+  def v4i32_shift : BaseSIMDVectorShift<1, U, opc, {0,1,?,?,?,?,?},
+                                  V128, V128, vecshiftR32,
+                                  asm, ".4s", ".4s",
+            [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn),
+                  (i32 vecshiftR32:$imm)))]> {
+    bits<5> imm;
+    let Inst{20-16} = imm;
+  }
+
+  def v2i64_shift : BaseSIMDVectorShift<1, U, opc, {1,?,?,?,?,?,?},
+                                  V128, V128, vecshiftR64,
+                                  asm, ".2d", ".2d",
+            [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn),
+                  (i32 vecshiftR64:$imm)))]> {
+    bits<6> imm;
+    let Inst{21-16} = imm;
+  }
+}
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+multiclass SIMDVectorRShiftBHSDTied<bit U, bits<5> opc, string asm,
+                                    SDPatternOperator OpNode = null_frag> {
+  def v8i8_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,0,0,1,?,?,?},
+                                  V64, V64, vecshiftR8, asm, ".8b", ".8b",
+                 [(set (v8i8 V64:$dst),
+                   (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn),
+                           (i32 vecshiftR8:$imm)))]> {
+    bits<3> imm;
+    let Inst{18-16} = imm;
+  }
+
+  def v16i8_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,0,1,?,?,?},
+                                  V128, V128, vecshiftR8, asm, ".16b", ".16b",
+             [(set (v16i8 V128:$dst),
+               (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn),
+                       (i32 vecshiftR8:$imm)))]> {
+    bits<3> imm;
+    let Inst{18-16} = imm;
+  }
+
+  def v4i16_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,0,1,?,?,?,?},
+                                  V64, V64, vecshiftR16, asm, ".4h", ".4h",
+              [(set (v4i16 V64:$dst),
+                (OpNode (v4i16 V64:$Rd), (v4i16 V64:$Rn),
+                        (i32 vecshiftR16:$imm)))]> {
+    bits<4> imm;
+    let Inst{19-16} = imm;
+  }
+
+  def v8i16_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,1,?,?,?,?},
+                                  V128, V128, vecshiftR16, asm, ".8h", ".8h",
+            [(set (v8i16 V128:$dst),
+              (OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn),
+                      (i32 vecshiftR16:$imm)))]> {
+    bits<4> imm;
+    let Inst{19-16} = imm;
+  }
+
+  def v2i32_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,1,?,?,?,?,?},
+                                  V64, V64, vecshiftR32, asm, ".2s", ".2s",
+              [(set (v2i32 V64:$dst),
+                (OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn),
+                        (i32 vecshiftR32:$imm)))]> {
+    bits<5> imm;
+    let Inst{20-16} = imm;
+  }
+
+  def v4i32_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,1,?,?,?,?,?},
+                                  V128, V128, vecshiftR32, asm, ".4s", ".4s",
+            [(set (v4i32 V128:$dst),
+              (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn),
+                      (i32 vecshiftR32:$imm)))]> {
+    bits<5> imm;
+    let Inst{20-16} = imm;
+  }
+
+  def v2i64_shift : BaseSIMDVectorShiftTied<1, U, opc, {1,?,?,?,?,?,?},
+                                  V128, V128, vecshiftR64,
+                                  asm, ".2d", ".2d", [(set (v2i64 V128:$dst),
+              (OpNode (v2i64 V128:$Rd), (v2i64 V128:$Rn),
+                      (i32 vecshiftR64:$imm)))]> {
+    bits<6> imm;
+    let Inst{21-16} = imm;
+  }
+}
+
+multiclass SIMDVectorLShiftBHSDTied<bit U, bits<5> opc, string asm,
+                                    SDPatternOperator OpNode = null_frag> {
+  def v8i8_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,0,0,1,?,?,?},
+                                  V64, V64, vecshiftL8,
+                                  asm, ".8b", ".8b",
+                    [(set (v8i8 V64:$dst),
+                          (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn),
+                                  (i32 vecshiftL8:$imm)))]> {
+    bits<3> imm;
+    let Inst{18-16} = imm;
+  }
+
+  def v16i8_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,0,1,?,?,?},
+                                  V128, V128, vecshiftL8,
+                                  asm, ".16b", ".16b",
+                    [(set (v16i8 V128:$dst),
+                          (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn),
+                                  (i32 vecshiftL8:$imm)))]> {
+    bits<3> imm;
+    let Inst{18-16} = imm;
+  }
+
+  def v4i16_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,0,1,?,?,?,?},
+                                  V64, V64, vecshiftL16,
+                                  asm, ".4h", ".4h",
+                    [(set (v4i16 V64:$dst),
+                           (OpNode (v4i16 V64:$Rd), (v4i16 V64:$Rn),
+                                   (i32 vecshiftL16:$imm)))]> {
+    bits<4> imm;
+    let Inst{19-16} = imm;
+  }
+
+  def v8i16_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,1,?,?,?,?},
+                                  V128, V128, vecshiftL16,
+                                  asm, ".8h", ".8h",
+                    [(set (v8i16 V128:$dst),
+                          (OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn),
+                                  (i32 vecshiftL16:$imm)))]> {
+    bits<4> imm;
+    let Inst{19-16} = imm;
+  }
+
+  def v2i32_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,1,?,?,?,?,?},
+                                  V64, V64, vecshiftL32,
+                                  asm, ".2s", ".2s",
+                    [(set (v2i32 V64:$dst),
+                          (OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn),
+                                  (i32 vecshiftL32:$imm)))]> {
+    bits<5> imm;
+    let Inst{20-16} = imm;
+  }
+
+  def v4i32_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,1,?,?,?,?,?},
+                                  V128, V128, vecshiftL32,
+                                  asm, ".4s", ".4s",
+                    [(set (v4i32 V128:$dst),
+                          (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn),
+                                  (i32 vecshiftL32:$imm)))]> {
+    bits<5> imm;
+    let Inst{20-16} = imm;
+  }
+
+  def v2i64_shift : BaseSIMDVectorShiftTied<1, U, opc, {1,?,?,?,?,?,?},
+                                  V128, V128, vecshiftL64,
+                                  asm, ".2d", ".2d",
+                    [(set (v2i64 V128:$dst),
+                          (OpNode (v2i64 V128:$Rd), (v2i64 V128:$Rn),
+                                  (i32 vecshiftL64:$imm)))]> {
+    bits<6> imm;
+    let Inst{21-16} = imm;
+  }
+}
+
+multiclass SIMDVectorLShiftLongBHSD<bit U, bits<5> opc, string asm,
+                                   SDPatternOperator OpNode> {
+  def v8i8_shift : BaseSIMDVectorShift<0, U, opc, {0,0,0,1,?,?,?},
+                                  V128, V64, vecshiftL8, asm, ".8h", ".8b",
+      [(set (v8i16 V128:$Rd), (OpNode (v8i8 V64:$Rn), vecshiftL8:$imm))]> {
+    bits<3> imm;
+    let Inst{18-16} = imm;
+  }
+
+  def v16i8_shift : BaseSIMDVectorShift<1, U, opc, {0,0,0,1,?,?,?},
+                                  V128, V128, vecshiftL8,
+                                  asm#"2", ".8h", ".16b",
+      [(set (v8i16 V128:$Rd),
+            (OpNode (extract_high_v16i8 V128:$Rn), vecshiftL8:$imm))]> {
+    bits<3> imm;
+    let Inst{18-16} = imm;
+  }
+
+  def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?},
+                                  V128, V64, vecshiftL16, asm, ".4s", ".4h",
+      [(set (v4i32 V128:$Rd), (OpNode (v4i16 V64:$Rn), vecshiftL16:$imm))]> {
+    bits<4> imm;
+    let Inst{19-16} = imm;
+  }
+
+  def v8i16_shift : BaseSIMDVectorShift<1, U, opc, {0,0,1,?,?,?,?},
+                                  V128, V128, vecshiftL16,
+                                  asm#"2", ".4s", ".8h",
+      [(set (v4i32 V128:$Rd),
+            (OpNode (extract_high_v8i16 V128:$Rn), vecshiftL16:$imm))]> {
+
+    bits<4> imm;
+    let Inst{19-16} = imm;
+  }
+
+  def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?},
+                                  V128, V64, vecshiftL32, asm, ".2d", ".2s",
+      [(set (v2i64 V128:$Rd), (OpNode (v2i32 V64:$Rn), vecshiftL32:$imm))]> {
+    bits<5> imm;
+    let Inst{20-16} = imm;
+  }
+
+  def v4i32_shift : BaseSIMDVectorShift<1, U, opc, {0,1,?,?,?,?,?},
+                                  V128, V128, vecshiftL32,
+                                  asm#"2", ".2d", ".4s",
+      [(set (v2i64 V128:$Rd),
+            (OpNode (extract_high_v4i32 V128:$Rn), vecshiftL32:$imm))]> {
+    bits<5> imm;
+    let Inst{20-16} = imm;
+  }
+}
+
+
+//---
+// Vector load/store
+//---
+// SIMD ldX/stX no-index memory references don't allow the optional
+// ", #0" constant and handle post-indexing explicitly, so we use
+// a more specialized parse method for them. Otherwise, it's the same as
+// the general GPR64sp handling.
+
+class BaseSIMDLdSt<bit Q, bit L, bits<4> opcode, bits<2> size,
+                   string asm, dag oops, dag iops, list<dag> pattern>
+  : I<oops, iops, asm, "\t$Vt, [$Rn]", "", pattern> {
+  bits<5> Vt;
+  bits<5> Rn;
+  let Inst{31} = 0;
+  let Inst{30} = Q;
   let Inst{29-23} = 0b0011000;
-  let Inst{22} = l;
+  let Inst{22} = L;
   let Inst{21-16} = 0b000000;
   let Inst{15-12} = opcode;
   let Inst{11-10} = size;
-  
-  // Inherit Rn in 9-5
-  // Inherit Rt in 4-0
-}
-
-// Format AdvSIMD vector load/store multiple N-element structure (post-index)
-class NeonI_LdStMult_Post<bit q, bit l, bits<4> opcode, bits<2> size,
-                         dag outs, dag ins, string asmstr,
-                         list<dag> patterns, InstrItinClass itin>
-  : A64InstRtnm<outs, ins, asmstr, patterns, itin>
-{
-  let Inst{31} = 0b0;
-  let Inst{30} = q;
+  let Inst{9-5} = Rn;
+  let Inst{4-0} = Vt;
+}
+
+class BaseSIMDLdStPost<bit Q, bit L, bits<4> opcode, bits<2> size,
+                       string asm, dag oops, dag iops>
+  : I<oops, iops, asm, "\t$Vt, [$Rn], $Xm", "$Rn = $wback", []> {
+  bits<5> Vt;
+  bits<5> Rn;
+  bits<5> Xm;
+  let Inst{31} = 0;
+  let Inst{30} = Q;
   let Inst{29-23} = 0b0011001;
-  let Inst{22} = l;
-  let Inst{21} = 0b0;
-  // Inherit Rm in 20-16
+  let Inst{22} = L;
+  let Inst{21} = 0;
+  let Inst{20-16} = Xm;
   let Inst{15-12} = opcode;
   let Inst{11-10} = size;
-  // Inherit Rn in 9-5
-  // Inherit Rt in 4-0
-}
-
-// Format AdvSIMD vector load Single N-element structure to all lanes
-class NeonI_LdOne_Dup<bit q, bit r, bits<3> opcode, bits<2> size, dag outs,
-                      dag ins, string asmstr, list<dag> patterns,
-                      InstrItinClass itin>
-  : A64InstRtn<outs, ins, asmstr, patterns, itin>
-{
-  let Inst{31} = 0b0;
-  let Inst{30} = q;
-  let Inst{29-23} = 0b0011010;
-  let Inst{22} = 0b1;
-  let Inst{21} = r;
-  let Inst{20-16} = 0b00000;
+  let Inst{9-5} = Rn;
+  let Inst{4-0} = Vt;
+}
+
+// The immediate form of AdvSIMD post-indexed addressing is encoded with
+// register post-index addressing from the zero register.
+multiclass SIMDLdStAliases<string asm, string layout, string Count,
+                           int Offset, int Size> {
+  // E.g. "ld1 { v0.8b, v1.8b }, [x1], #16"
+  //      "ld1\t$Vt, [$Rn], #16"
+  // may get mapped to
+  //      (LD1Twov8b_POST VecListTwo8b:$Vt, GPR64sp:$Rn, XZR)
+  def : InstAlias<asm # "\t$Vt, [$Rn], #" # Offset,
+                  (!cast<Instruction>(NAME # Count # "v" # layout # "_POST")
+                      GPR64sp:$Rn,
+                      !cast<RegisterOperand>("VecList" # Count # layout):$Vt,
+                      XZR), 1>;
+
+  // E.g. "ld1.8b { v0, v1 }, [x1], #16"
+  //      "ld1.8b\t$Vt, [$Rn], #16"
+  // may get mapped to
+  //      (LD1Twov8b_POST VecListTwo64:$Vt, GPR64sp:$Rn, XZR)
+  def : InstAlias<asm # "." # layout # "\t$Vt, [$Rn], #" # Offset,
+                  (!cast<Instruction>(NAME # Count # "v" # layout # "_POST")
+                      GPR64sp:$Rn,
+                      !cast<RegisterOperand>("VecList" # Count # Size):$Vt,
+                      XZR), 0>;
+
+  // E.g. "ld1.8b { v0, v1 }, [x1]"
+  //      "ld1\t$Vt, [$Rn]"
+  // may get mapped to
+  //      (LD1Twov8b VecListTwo64:$Vt, GPR64sp:$Rn)
+  def : InstAlias<asm # "." # layout # "\t$Vt, [$Rn]",
+                  (!cast<Instruction>(NAME # Count # "v" # layout)
+                      !cast<RegisterOperand>("VecList" # Count # Size):$Vt,
+                      GPR64sp:$Rn), 0>;
+
+  // E.g. "ld1.8b { v0, v1 }, [x1], x2"
+  //      "ld1\t$Vt, [$Rn], $Xm"
+  // may get mapped to
+  //      (LD1Twov8b_POST VecListTwo64:$Vt, GPR64sp:$Rn, GPR64pi8:$Xm)
+  def : InstAlias<asm # "." # layout # "\t$Vt, [$Rn], $Xm",
+                  (!cast<Instruction>(NAME # Count # "v" # layout # "_POST")
+                      GPR64sp:$Rn,
+                      !cast<RegisterOperand>("VecList" # Count # Size):$Vt,
+                      !cast<RegisterOperand>("GPR64pi" # Offset):$Xm), 0>;
+}
+
+multiclass BaseSIMDLdN<string Count, string asm, string veclist, int Offset128,
+                       int Offset64, bits<4> opcode> {
+  let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in {
+    def v16b: BaseSIMDLdSt<1, 1, opcode, 0b00, asm,
+                           (outs !cast<RegisterOperand>(veclist # "16b"):$Vt),
+                           (ins GPR64sp:$Rn), []>;
+    def v8h : BaseSIMDLdSt<1, 1, opcode, 0b01, asm,
+                           (outs !cast<RegisterOperand>(veclist # "8h"):$Vt),
+                           (ins GPR64sp:$Rn), []>;
+    def v4s : BaseSIMDLdSt<1, 1, opcode, 0b10, asm,
+                           (outs !cast<RegisterOperand>(veclist # "4s"):$Vt),
+                           (ins GPR64sp:$Rn), []>;
+    def v2d : BaseSIMDLdSt<1, 1, opcode, 0b11, asm,
+                           (outs !cast<RegisterOperand>(veclist # "2d"):$Vt),
+                           (ins GPR64sp:$Rn), []>;
+    def v8b : BaseSIMDLdSt<0, 1, opcode, 0b00, asm,
+                           (outs !cast<RegisterOperand>(veclist # "8b"):$Vt),
+                           (ins GPR64sp:$Rn), []>;
+    def v4h : BaseSIMDLdSt<0, 1, opcode, 0b01, asm,
+                           (outs !cast<RegisterOperand>(veclist # "4h"):$Vt),
+                           (ins GPR64sp:$Rn), []>;
+    def v2s : BaseSIMDLdSt<0, 1, opcode, 0b10, asm,
+                           (outs !cast<RegisterOperand>(veclist # "2s"):$Vt),
+                           (ins GPR64sp:$Rn), []>;
+
+
+    def v16b_POST: BaseSIMDLdStPost<1, 1, opcode, 0b00, asm,
+                       (outs GPR64sp:$wback,
+                             !cast<RegisterOperand>(veclist # "16b"):$Vt),
+                       (ins GPR64sp:$Rn,
+                            !cast<RegisterOperand>("GPR64pi" # Offset128):$Xm)>;
+    def v8h_POST : BaseSIMDLdStPost<1, 1, opcode, 0b01, asm,
+                       (outs GPR64sp:$wback,
+                             !cast<RegisterOperand>(veclist # "8h"):$Vt),
+                       (ins GPR64sp:$Rn,
+                            !cast<RegisterOperand>("GPR64pi" # Offset128):$Xm)>;
+    def v4s_POST : BaseSIMDLdStPost<1, 1, opcode, 0b10, asm,
+                       (outs GPR64sp:$wback,
+                             !cast<RegisterOperand>(veclist # "4s"):$Vt),
+                       (ins GPR64sp:$Rn,
+                            !cast<RegisterOperand>("GPR64pi" # Offset128):$Xm)>;
+    def v2d_POST : BaseSIMDLdStPost<1, 1, opcode, 0b11, asm,
+                       (outs GPR64sp:$wback,
+                             !cast<RegisterOperand>(veclist # "2d"):$Vt),
+                       (ins GPR64sp:$Rn,
+                            !cast<RegisterOperand>("GPR64pi" # Offset128):$Xm)>;
+    def v8b_POST : BaseSIMDLdStPost<0, 1, opcode, 0b00, asm,
+                       (outs GPR64sp:$wback,
+                             !cast<RegisterOperand>(veclist # "8b"):$Vt),
+                       (ins GPR64sp:$Rn,
+                            !cast<RegisterOperand>("GPR64pi" # Offset64):$Xm)>;
+    def v4h_POST : BaseSIMDLdStPost<0, 1, opcode, 0b01, asm,
+                       (outs GPR64sp:$wback,
+                             !cast<RegisterOperand>(veclist # "4h"):$Vt),
+                       (ins GPR64sp:$Rn,
+                            !cast<RegisterOperand>("GPR64pi" # Offset64):$Xm)>;
+    def v2s_POST : BaseSIMDLdStPost<0, 1, opcode, 0b10, asm,
+                       (outs GPR64sp:$wback,
+                             !cast<RegisterOperand>(veclist # "2s"):$Vt),
+                       (ins GPR64sp:$Rn,
+                            !cast<RegisterOperand>("GPR64pi" # Offset64):$Xm)>;
+  }
+
+  defm : SIMDLdStAliases<asm, "16b", Count, Offset128, 128>;
+  defm : SIMDLdStAliases<asm, "8h", Count, Offset128, 128>;
+  defm : SIMDLdStAliases<asm, "4s", Count, Offset128, 128>;
+  defm : SIMDLdStAliases<asm, "2d", Count, Offset128, 128>;
+  defm : SIMDLdStAliases<asm, "8b", Count, Offset64, 64>;
+  defm : SIMDLdStAliases<asm, "4h", Count, Offset64, 64>;
+  defm : SIMDLdStAliases<asm, "2s", Count, Offset64, 64>;
+}
+
+// Only ld1/st1 has a v1d version.
+multiclass BaseSIMDStN<string Count, string asm, string veclist, int Offset128,
+                       int Offset64, bits<4> opcode> {
+  let hasSideEffects = 0, mayStore = 1, mayLoad = 0 in {
+    def v16b : BaseSIMDLdSt<1, 0, opcode, 0b00, asm, (outs),
+                            (ins !cast<RegisterOperand>(veclist # "16b"):$Vt,
+                                 GPR64sp:$Rn), []>;
+    def v8h : BaseSIMDLdSt<1, 0, opcode, 0b01, asm, (outs),
+                           (ins !cast<RegisterOperand>(veclist # "8h"):$Vt,
+                                GPR64sp:$Rn), []>;
+    def v4s : BaseSIMDLdSt<1, 0, opcode, 0b10, asm, (outs),
+                           (ins !cast<RegisterOperand>(veclist # "4s"):$Vt,
+                                GPR64sp:$Rn), []>;
+    def v2d : BaseSIMDLdSt<1, 0, opcode, 0b11, asm, (outs),
+                           (ins !cast<RegisterOperand>(veclist # "2d"):$Vt,
+                                GPR64sp:$Rn), []>;
+    def v8b : BaseSIMDLdSt<0, 0, opcode, 0b00, asm, (outs),
+                           (ins !cast<RegisterOperand>(veclist # "8b"):$Vt,
+                                GPR64sp:$Rn), []>;
+    def v4h : BaseSIMDLdSt<0, 0, opcode, 0b01, asm, (outs),
+                           (ins !cast<RegisterOperand>(veclist # "4h"):$Vt,
+                                GPR64sp:$Rn), []>;
+    def v2s : BaseSIMDLdSt<0, 0, opcode, 0b10, asm, (outs),
+                           (ins !cast<RegisterOperand>(veclist # "2s"):$Vt,
+                                GPR64sp:$Rn), []>;
+
+    def v16b_POST : BaseSIMDLdStPost<1, 0, opcode, 0b00, asm,
+                       (outs GPR64sp:$wback),
+                       (ins !cast<RegisterOperand>(veclist # "16b"):$Vt,
+                            GPR64sp:$Rn,
+                            !cast<RegisterOperand>("GPR64pi" # Offset128):$Xm)>;
+    def v8h_POST : BaseSIMDLdStPost<1, 0, opcode, 0b01, asm,
+                       (outs GPR64sp:$wback),
+                       (ins !cast<RegisterOperand>(veclist # "8h"):$Vt,
+                            GPR64sp:$Rn,
+                            !cast<RegisterOperand>("GPR64pi" # Offset128):$Xm)>;
+    def v4s_POST : BaseSIMDLdStPost<1, 0, opcode, 0b10, asm,
+                       (outs GPR64sp:$wback),
+                       (ins !cast<RegisterOperand>(veclist # "4s"):$Vt,
+                            GPR64sp:$Rn,
+                            !cast<RegisterOperand>("GPR64pi" # Offset128):$Xm)>;
+    def v2d_POST : BaseSIMDLdStPost<1, 0, opcode, 0b11, asm,
+                       (outs GPR64sp:$wback),
+                       (ins !cast<RegisterOperand>(veclist # "2d"):$Vt,
+                            GPR64sp:$Rn,
+                            !cast<RegisterOperand>("GPR64pi" # Offset128):$Xm)>;
+    def v8b_POST : BaseSIMDLdStPost<0, 0, opcode, 0b00, asm,
+                       (outs GPR64sp:$wback),
+                       (ins !cast<RegisterOperand>(veclist # "8b"):$Vt,
+                            GPR64sp:$Rn,
+                            !cast<RegisterOperand>("GPR64pi" # Offset64):$Xm)>;
+    def v4h_POST : BaseSIMDLdStPost<0, 0, opcode, 0b01, asm,
+                       (outs GPR64sp:$wback),
+                       (ins !cast<RegisterOperand>(veclist # "4h"):$Vt,
+                            GPR64sp:$Rn,
+                            !cast<RegisterOperand>("GPR64pi" # Offset64):$Xm)>;
+    def v2s_POST : BaseSIMDLdStPost<0, 0, opcode, 0b10, asm,
+                       (outs GPR64sp:$wback),
+                       (ins !cast<RegisterOperand>(veclist # "2s"):$Vt,
+                            GPR64sp:$Rn,
+                            !cast<RegisterOperand>("GPR64pi" # Offset64):$Xm)>;
+  }
+
+  defm : SIMDLdStAliases<asm, "16b", Count, Offset128, 128>;
+  defm : SIMDLdStAliases<asm, "8h", Count, Offset128, 128>;
+  defm : SIMDLdStAliases<asm, "4s", Count, Offset128, 128>;
+  defm : SIMDLdStAliases<asm, "2d", Count, Offset128, 128>;
+  defm : SIMDLdStAliases<asm, "8b", Count, Offset64, 64>;
+  defm : SIMDLdStAliases<asm, "4h", Count, Offset64, 64>;
+  defm : SIMDLdStAliases<asm, "2s", Count, Offset64, 64>;
+}
+
+multiclass BaseSIMDLd1<string Count, string asm, string veclist,
+                       int Offset128, int Offset64, bits<4> opcode>
+  : BaseSIMDLdN<Count, asm, veclist, Offset128, Offset64, opcode> {
+
+  // LD1 instructions have extra "1d" variants.
+  let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in {
+    def v1d : BaseSIMDLdSt<0, 1, opcode, 0b11, asm,
+                           (outs !cast<RegisterOperand>(veclist # "1d"):$Vt),
+                           (ins GPR64sp:$Rn), []>;
+
+    def v1d_POST : BaseSIMDLdStPost<0, 1, opcode, 0b11, asm,
+                       (outs GPR64sp:$wback,
+                             !cast<RegisterOperand>(veclist # "1d"):$Vt),
+                       (ins GPR64sp:$Rn,
+                            !cast<RegisterOperand>("GPR64pi" # Offset64):$Xm)>;
+  }
+
+  defm : SIMDLdStAliases<asm, "1d", Count, Offset64, 64>;
+}
+
+multiclass BaseSIMDSt1<string Count, string asm, string veclist,
+                       int Offset128, int Offset64, bits<4> opcode>
+  : BaseSIMDStN<Count, asm, veclist, Offset128, Offset64, opcode> {
+
+  // ST1 instructions have extra "1d" variants.
+  let hasSideEffects = 0, mayLoad = 0, mayStore = 1 in {
+    def v1d : BaseSIMDLdSt<0, 0, opcode, 0b11, asm, (outs),
+                           (ins !cast<RegisterOperand>(veclist # "1d"):$Vt,
+                                GPR64sp:$Rn), []>;
+
+    def v1d_POST : BaseSIMDLdStPost<0, 0, opcode, 0b11, asm,
+                       (outs GPR64sp:$wback),
+                       (ins !cast<RegisterOperand>(veclist # "1d"):$Vt,
+                            GPR64sp:$Rn,
+                            !cast<RegisterOperand>("GPR64pi" # Offset64):$Xm)>;
+  }
+
+  defm : SIMDLdStAliases<asm, "1d", Count, Offset64, 64>;
+}
+
+multiclass SIMDLd1Multiple<string asm> {
+  defm One   : BaseSIMDLd1<"One", asm, "VecListOne", 16, 8,  0b0111>;
+  defm Two   : BaseSIMDLd1<"Two", asm, "VecListTwo", 32, 16, 0b1010>;
+  defm Three : BaseSIMDLd1<"Three", asm, "VecListThree", 48, 24, 0b0110>;
+  defm Four  : BaseSIMDLd1<"Four", asm, "VecListFour", 64, 32, 0b0010>;
+}
+
+multiclass SIMDSt1Multiple<string asm> {
+  defm One   : BaseSIMDSt1<"One", asm, "VecListOne", 16, 8,  0b0111>;
+  defm Two   : BaseSIMDSt1<"Two", asm, "VecListTwo", 32, 16, 0b1010>;
+  defm Three : BaseSIMDSt1<"Three", asm, "VecListThree", 48, 24, 0b0110>;
+  defm Four  : BaseSIMDSt1<"Four", asm, "VecListFour", 64, 32, 0b0010>;
+}
+
+multiclass SIMDLd2Multiple<string asm> {
+  defm Two : BaseSIMDLdN<"Two", asm, "VecListTwo", 32, 16, 0b1000>;
+}
+
+multiclass SIMDSt2Multiple<string asm> {
+  defm Two : BaseSIMDStN<"Two", asm, "VecListTwo", 32, 16, 0b1000>;
+}
+
+multiclass SIMDLd3Multiple<string asm> {
+  defm Three : BaseSIMDLdN<"Three", asm, "VecListThree", 48, 24, 0b0100>;
+}
+
+multiclass SIMDSt3Multiple<string asm> {
+  defm Three : BaseSIMDStN<"Three", asm, "VecListThree", 48, 24, 0b0100>;
+}
+
+multiclass SIMDLd4Multiple<string asm> {
+  defm Four : BaseSIMDLdN<"Four", asm, "VecListFour", 64, 32, 0b0000>;
+}
+
+multiclass SIMDSt4Multiple<string asm> {
+  defm Four : BaseSIMDStN<"Four", asm, "VecListFour", 64, 32, 0b0000>;
+}
+
+//---
+// AdvSIMD Load/store single-element
+//---
+
+class BaseSIMDLdStSingle<bit L, bit R, bits<3> opcode,
+                         string asm, string operands, string cst,
+                         dag oops, dag iops, list<dag> pattern>
+  : I<oops, iops, asm, operands, cst, pattern> {
+  bits<5> Vt;
+  bits<5> Rn;
+  let Inst{31} = 0;
+  let Inst{29-24} = 0b001101;
+  let Inst{22} = L;
+  let Inst{21} = R;
+  let Inst{15-13} = opcode;
+  let Inst{9-5} = Rn;
+  let Inst{4-0} = Vt;
+}
+
+class BaseSIMDLdStSingleTied<bit L, bit R, bits<3> opcode,
+                         string asm, string operands, string cst,
+                         dag oops, dag iops, list<dag> pattern>
+  : I<oops, iops, asm, operands, "$Vt = $dst," # cst, pattern> {
+  bits<5> Vt;
+  bits<5> Rn;
+  let Inst{31} = 0;
+  let Inst{29-24} = 0b001101;
+  let Inst{22} = L;
+  let Inst{21} = R;
   let Inst{15-13} = opcode;
-  let Inst{12} = 0b0;
+  let Inst{9-5} = Rn;
+  let Inst{4-0} = Vt;
+}
+
+
+let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
+class BaseSIMDLdR<bit Q, bit R, bits<3> opcode, bit S, bits<2> size, string asm,
+                  Operand listtype>
+  : BaseSIMDLdStSingle<1, R, opcode, asm, "\t$Vt, [$Rn]", "",
+                       (outs listtype:$Vt), (ins GPR64sp:$Rn),
+                       []> {
+  let Inst{30} = Q;
+  let Inst{23} = 0;
+  let Inst{20-16} = 0b00000;
+  let Inst{12} = S;
+  let Inst{11-10} = size;
+}
+let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
+class BaseSIMDLdRPost<bit Q, bit R, bits<3> opcode, bit S, bits<2> size,
+                      string asm, Operand listtype, Operand GPR64pi>
+  : BaseSIMDLdStSingle<1, R, opcode, asm, "\t$Vt, [$Rn], $Xm",
+                       "$Rn = $wback",
+                       (outs GPR64sp:$wback, listtype:$Vt),
+                       (ins GPR64sp:$Rn, GPR64pi:$Xm), []> {
+  bits<5> Xm;
+  let Inst{30} = Q;
+  let Inst{23} = 1;
+  let Inst{20-16} = Xm;
+  let Inst{12} = S;
   let Inst{11-10} = size;
+}
+
+multiclass SIMDLdrAliases<string asm, string layout, string Count,
+                          int Offset, int Size> {
+  // E.g. "ld1r { v0.8b }, [x1], #1"
+  //      "ld1r.8b\t$Vt, [$Rn], #1"
+  // may get mapped to
+  //      (LD1Rv8b_POST VecListOne8b:$Vt, GPR64sp:$Rn, XZR)
+  def : InstAlias<asm # "\t$Vt, [$Rn], #" # Offset,
+                  (!cast<Instruction>(NAME # "v" # layout # "_POST")
+                      GPR64sp:$Rn,
+                      !cast<RegisterOperand>("VecList" # Count # layout):$Vt,
+                      XZR), 1>;
+
+  // E.g. "ld1r.8b { v0 }, [x1], #1"
+  //      "ld1r.8b\t$Vt, [$Rn], #1"
+  // may get mapped to
+  //      (LD1Rv8b_POST VecListOne64:$Vt, GPR64sp:$Rn, XZR)
+  def : InstAlias<asm # "." # layout # "\t$Vt, [$Rn], #" # Offset,
+                  (!cast<Instruction>(NAME # "v" # layout # "_POST")
+                      GPR64sp:$Rn,
+                      !cast<RegisterOperand>("VecList" # Count # Size):$Vt,
+                      XZR), 0>;
+
+  // E.g. "ld1r.8b { v0 }, [x1]"
+  //      "ld1r.8b\t$Vt, [$Rn]"
+  // may get mapped to
+  //      (LD1Rv8b VecListOne64:$Vt, GPR64sp:$Rn)
+  def : InstAlias<asm # "." # layout # "\t$Vt, [$Rn]",
+                  (!cast<Instruction>(NAME # "v" # layout)
+                      !cast<RegisterOperand>("VecList" # Count # Size):$Vt,
+                      GPR64sp:$Rn), 0>;
+
+  // E.g. "ld1r.8b { v0 }, [x1], x2"
+  //      "ld1r.8b\t$Vt, [$Rn], $Xm"
+  // may get mapped to
+  //      (LD1Rv8b_POST VecListOne64:$Vt, GPR64sp:$Rn, GPR64pi1:$Xm)
+  def : InstAlias<asm # "." # layout # "\t$Vt, [$Rn], $Xm",
+                  (!cast<Instruction>(NAME # "v" # layout # "_POST")
+                      GPR64sp:$Rn,
+                      !cast<RegisterOperand>("VecList" # Count # Size):$Vt,
+                      !cast<RegisterOperand>("GPR64pi" # Offset):$Xm), 0>;
+}
+
+multiclass SIMDLdR<bit R, bits<3> opcode, bit S, string asm, string Count,
+  int Offset1, int Offset2, int Offset4, int Offset8> {
+  def v8b : BaseSIMDLdR<0, R, opcode, S, 0b00, asm,
+                        !cast<Operand>("VecList" # Count # "8b")>;
+  def v16b: BaseSIMDLdR<1, R, opcode, S, 0b00, asm,
+                        !cast<Operand>("VecList" # Count #"16b")>;
+  def v4h : BaseSIMDLdR<0, R, opcode, S, 0b01, asm,
+                        !cast<Operand>("VecList" # Count #"4h")>;
+  def v8h : BaseSIMDLdR<1, R, opcode, S, 0b01, asm,
+                        !cast<Operand>("VecList" # Count #"8h")>;
+  def v2s : BaseSIMDLdR<0, R, opcode, S, 0b10, asm,
+                        !cast<Operand>("VecList" # Count #"2s")>;
+  def v4s : BaseSIMDLdR<1, R, opcode, S, 0b10, asm,
+                        !cast<Operand>("VecList" # Count #"4s")>;
+  def v1d : BaseSIMDLdR<0, R, opcode, S, 0b11, asm,
+                        !cast<Operand>("VecList" # Count #"1d")>;
+  def v2d : BaseSIMDLdR<1, R, opcode, S, 0b11, asm,
+                        !cast<Operand>("VecList" # Count #"2d")>;
+
+  def v8b_POST : BaseSIMDLdRPost<0, R, opcode, S, 0b00, asm,
+                                 !cast<Operand>("VecList" # Count # "8b"),
+                                 !cast<Operand>("GPR64pi" # Offset1)>;
+  def v16b_POST: BaseSIMDLdRPost<1, R, opcode, S, 0b00, asm,
+                                 !cast<Operand>("VecList" # Count # "16b"),
+                                 !cast<Operand>("GPR64pi" # Offset1)>;
+  def v4h_POST : BaseSIMDLdRPost<0, R, opcode, S, 0b01, asm,
+                                 !cast<Operand>("VecList" # Count # "4h"),
+                                 !cast<Operand>("GPR64pi" # Offset2)>;
+  def v8h_POST : BaseSIMDLdRPost<1, R, opcode, S, 0b01, asm,
+                                 !cast<Operand>("VecList" # Count # "8h"),
+                                 !cast<Operand>("GPR64pi" # Offset2)>;
+  def v2s_POST : BaseSIMDLdRPost<0, R, opcode, S, 0b10, asm,
+                                 !cast<Operand>("VecList" # Count # "2s"),
+                                 !cast<Operand>("GPR64pi" # Offset4)>;
+  def v4s_POST : BaseSIMDLdRPost<1, R, opcode, S, 0b10, asm,
+                                 !cast<Operand>("VecList" # Count # "4s"),
+                                 !cast<Operand>("GPR64pi" # Offset4)>;
+  def v1d_POST : BaseSIMDLdRPost<0, R, opcode, S, 0b11, asm,
+                                 !cast<Operand>("VecList" # Count # "1d"),
+                                 !cast<Operand>("GPR64pi" # Offset8)>;
+  def v2d_POST : BaseSIMDLdRPost<1, R, opcode, S, 0b11, asm,
+                                 !cast<Operand>("VecList" # Count # "2d"),
+                                 !cast<Operand>("GPR64pi" # Offset8)>;
+
+  defm : SIMDLdrAliases<asm, "8b",  Count, Offset1,  64>;
+  defm : SIMDLdrAliases<asm, "16b", Count, Offset1, 128>;
+  defm : SIMDLdrAliases<asm, "4h",  Count, Offset2,  64>;
+  defm : SIMDLdrAliases<asm, "8h",  Count, Offset2, 128>;
+  defm : SIMDLdrAliases<asm, "2s",  Count, Offset4,  64>;
+  defm : SIMDLdrAliases<asm, "4s",  Count, Offset4, 128>;
+  defm : SIMDLdrAliases<asm, "1d",  Count, Offset8,  64>;
+  defm : SIMDLdrAliases<asm, "2d",  Count, Offset8, 128>;
+}
 
-  // Inherit Rn in 9-5
-  // Inherit Rt in 4-0
-}
-
-// Format AdvSIMD vector load/store Single N-element structure to/from one lane
-class NeonI_LdStOne_Lane<bit l, bit r, bits<2> op2_1, bit op0, dag outs,
-                         dag ins, string asmstr,
-                         list<dag> patterns, InstrItinClass itin>
-  : A64InstRtn<outs, ins, asmstr, patterns, itin>
-{
-  bits<4> lane;
-  let Inst{31} = 0b0;
-  let Inst{29-23} = 0b0011010;
-  let Inst{22} = l;
-  let Inst{21} = r;
+class SIMDLdStSingleB<bit L, bit R, bits<3> opcode, string asm,
+                      dag oops, dag iops, list<dag> pattern>
+  : BaseSIMDLdStSingle<L, R, opcode, asm, "\t$Vt$idx, [$Rn]", "", oops, iops,
+                       pattern> {
+  // idx encoded in Q:S:size fields.
+  bits<4> idx;
+  let Inst{30} = idx{3};
+  let Inst{23} = 0;
   let Inst{20-16} = 0b00000;
-  let Inst{15-14} = op2_1;
-  let Inst{13} = op0;
-  
-  // Inherit Rn in 9-5
-  // Inherit Rt in 4-0
-}
-
-// Format AdvSIMD post-index vector load Single N-element structure to all lanes
-class NeonI_LdOne_Dup_Post<bit q, bit r, bits<3> opcode, bits<2> size, dag outs,
-                           dag ins, string asmstr, list<dag> patterns,
-                           InstrItinClass itin>
-  : A64InstRtnm<outs, ins, asmstr, patterns, itin>
-{
-  let Inst{31} = 0b0;
-  let Inst{30} = q;
-  let Inst{29-23} = 0b0011011;
-  let Inst{22} = 0b1;
-  let Inst{21} = r;
-  // Inherit Rm in 20-16
-  let Inst{15-13} = opcode;
-  let Inst{12} = 0b0;
+  let Inst{12} = idx{2};
+  let Inst{11-10} = idx{1-0};
+}
+class SIMDLdStSingleBTied<bit L, bit R, bits<3> opcode, string asm,
+                      dag oops, dag iops, list<dag> pattern>
+  : BaseSIMDLdStSingleTied<L, R, opcode, asm, "\t$Vt$idx, [$Rn]", "",
+                           oops, iops, pattern> {
+  // idx encoded in Q:S:size fields.
+  bits<4> idx;
+  let Inst{30} = idx{3};
+  let Inst{23} = 0;
+  let Inst{20-16} = 0b00000;
+  let Inst{12} = idx{2};
+  let Inst{11-10} = idx{1-0};
+}
+class SIMDLdStSingleBPost<bit L, bit R, bits<3> opcode, string asm,
+                          dag oops, dag iops>
+  : BaseSIMDLdStSingle<L, R, opcode, asm, "\t$Vt$idx, [$Rn], $Xm",
+                       "$Rn = $wback", oops, iops, []> {
+  // idx encoded in Q:S:size fields.
+  bits<4> idx;
+  bits<5> Xm;
+  let Inst{30} = idx{3};
+  let Inst{23} = 1;
+  let Inst{20-16} = Xm;
+  let Inst{12} = idx{2};
+  let Inst{11-10} = idx{1-0};
+}
+class SIMDLdStSingleBTiedPost<bit L, bit R, bits<3> opcode, string asm,
+                          dag oops, dag iops>
+  : BaseSIMDLdStSingleTied<L, R, opcode, asm, "\t$Vt$idx, [$Rn], $Xm",
+                           "$Rn = $wback", oops, iops, []> {
+  // idx encoded in Q:S:size fields.
+  bits<4> idx;
+  bits<5> Xm;
+  let Inst{30} = idx{3};
+  let Inst{23} = 1;
+  let Inst{20-16} = Xm;
+  let Inst{12} = idx{2};
+  let Inst{11-10} = idx{1-0};
+}
+
+class SIMDLdStSingleH<bit L, bit R, bits<3> opcode, bit size, string asm,
+                      dag oops, dag iops, list<dag> pattern>
+  : BaseSIMDLdStSingle<L, R, opcode, asm, "\t$Vt$idx, [$Rn]", "", oops, iops,
+                       pattern> {
+  // idx encoded in Q:S:size<1> fields.
+  bits<3> idx;
+  let Inst{30} = idx{2};
+  let Inst{23} = 0;
+  let Inst{20-16} = 0b00000;
+  let Inst{12} = idx{1};
+  let Inst{11} = idx{0};
+  let Inst{10} = size;
+}
+class SIMDLdStSingleHTied<bit L, bit R, bits<3> opcode, bit size, string asm,
+                      dag oops, dag iops, list<dag> pattern>
+  : BaseSIMDLdStSingleTied<L, R, opcode, asm, "\t$Vt$idx, [$Rn]", "",
+                           oops, iops, pattern> {
+  // idx encoded in Q:S:size<1> fields.
+  bits<3> idx;
+  let Inst{30} = idx{2};
+  let Inst{23} = 0;
+  let Inst{20-16} = 0b00000;
+  let Inst{12} = idx{1};
+  let Inst{11} = idx{0};
+  let Inst{10} = size;
+}
+
+class SIMDLdStSingleHPost<bit L, bit R, bits<3> opcode, bit size, string asm,
+                          dag oops, dag iops>
+  : BaseSIMDLdStSingle<L, R, opcode, asm, "\t$Vt$idx, [$Rn], $Xm",
+                       "$Rn = $wback", oops, iops, []> {
+  // idx encoded in Q:S:size<1> fields.
+  bits<3> idx;
+  bits<5> Xm;
+  let Inst{30} = idx{2};
+  let Inst{23} = 1;
+  let Inst{20-16} = Xm;
+  let Inst{12} = idx{1};
+  let Inst{11} = idx{0};
+  let Inst{10} = size;
+}
+class SIMDLdStSingleHTiedPost<bit L, bit R, bits<3> opcode, bit size, string asm,
+                          dag oops, dag iops>
+  : BaseSIMDLdStSingleTied<L, R, opcode, asm, "\t$Vt$idx, [$Rn], $Xm",
+                           "$Rn = $wback", oops, iops, []> {
+  // idx encoded in Q:S:size<1> fields.
+  bits<3> idx;
+  bits<5> Xm;
+  let Inst{30} = idx{2};
+  let Inst{23} = 1;
+  let Inst{20-16} = Xm;
+  let Inst{12} = idx{1};
+  let Inst{11} = idx{0};
+  let Inst{10} = size;
+}
+class SIMDLdStSingleS<bit L, bit R, bits<3> opcode, bits<2> size, string asm,
+                      dag oops, dag iops, list<dag> pattern>
+  : BaseSIMDLdStSingle<L, R, opcode, asm, "\t$Vt$idx, [$Rn]", "", oops, iops,
+                       pattern> {
+  // idx encoded in Q:S fields.
+  bits<2> idx;
+  let Inst{30} = idx{1};
+  let Inst{23} = 0;
+  let Inst{20-16} = 0b00000;
+  let Inst{12} = idx{0};
+  let Inst{11-10} = size;
+}
+class SIMDLdStSingleSTied<bit L, bit R, bits<3> opcode, bits<2> size, string asm,
+                      dag oops, dag iops, list<dag> pattern>
+  : BaseSIMDLdStSingleTied<L, R, opcode, asm, "\t$Vt$idx, [$Rn]", "",
+                           oops, iops, pattern> {
+  // idx encoded in Q:S fields.
+  bits<2> idx;
+  let Inst{30} = idx{1};
+  let Inst{23} = 0;
+  let Inst{20-16} = 0b00000;
+  let Inst{12} = idx{0};
+  let Inst{11-10} = size;
+}
+class SIMDLdStSingleSPost<bit L, bit R, bits<3> opcode, bits<2> size,
+                          string asm, dag oops, dag iops>
+  : BaseSIMDLdStSingle<L, R, opcode, asm, "\t$Vt$idx, [$Rn], $Xm",
+                       "$Rn = $wback", oops, iops, []> {
+  // idx encoded in Q:S fields.
+  bits<2> idx;
+  bits<5> Xm;
+  let Inst{30} = idx{1};
+  let Inst{23} = 1;
+  let Inst{20-16} = Xm;
+  let Inst{12} = idx{0};
+  let Inst{11-10} = size;
+}
+class SIMDLdStSingleSTiedPost<bit L, bit R, bits<3> opcode, bits<2> size,
+                          string asm, dag oops, dag iops>
+  : BaseSIMDLdStSingleTied<L, R, opcode, asm, "\t$Vt$idx, [$Rn], $Xm",
+                           "$Rn = $wback", oops, iops, []> {
+  // idx encoded in Q:S fields.
+  bits<2> idx;
+  bits<5> Xm;
+  let Inst{30} = idx{1};
+  let Inst{23} = 1;
+  let Inst{20-16} = Xm;
+  let Inst{12} = idx{0};
+  let Inst{11-10} = size;
+}
+class SIMDLdStSingleD<bit L, bit R, bits<3> opcode, bits<2> size, string asm,
+                      dag oops, dag iops, list<dag> pattern>
+  : BaseSIMDLdStSingle<L, R, opcode, asm, "\t$Vt$idx, [$Rn]", "", oops, iops,
+                       pattern> {
+  // idx encoded in Q field.
+  bits<1> idx;
+  let Inst{30} = idx;
+  let Inst{23} = 0;
+  let Inst{20-16} = 0b00000;
+  let Inst{12} = 0;
+  let Inst{11-10} = size;
+}
+class SIMDLdStSingleDTied<bit L, bit R, bits<3> opcode, bits<2> size, string asm,
+                      dag oops, dag iops, list<dag> pattern>
+  : BaseSIMDLdStSingleTied<L, R, opcode, asm, "\t$Vt$idx, [$Rn]", "",
+                           oops, iops, pattern> {
+  // idx encoded in Q field.
+  bits<1> idx;
+  let Inst{30} = idx;
+  let Inst{23} = 0;
+  let Inst{20-16} = 0b00000;
+  let Inst{12} = 0;
+  let Inst{11-10} = size;
+}
+class SIMDLdStSingleDPost<bit L, bit R, bits<3> opcode, bits<2> size,
+                          string asm, dag oops, dag iops>
+  : BaseSIMDLdStSingle<L, R, opcode, asm, "\t$Vt$idx, [$Rn], $Xm",
+                       "$Rn = $wback", oops, iops, []> {
+  // idx encoded in Q field.
+  bits<1> idx;
+  bits<5> Xm;
+  let Inst{30} = idx;
+  let Inst{23} = 1;
+  let Inst{20-16} = Xm;
+  let Inst{12} = 0;
+  let Inst{11-10} = size;
+}
+class SIMDLdStSingleDTiedPost<bit L, bit R, bits<3> opcode, bits<2> size,
+                          string asm, dag oops, dag iops>
+  : BaseSIMDLdStSingleTied<L, R, opcode, asm, "\t$Vt$idx, [$Rn], $Xm",
+                           "$Rn = $wback", oops, iops, []> {
+  // idx encoded in Q field.
+  bits<1> idx;
+  bits<5> Xm;
+  let Inst{30} = idx;
+  let Inst{23} = 1;
+  let Inst{20-16} = Xm;
+  let Inst{12} = 0;
   let Inst{11-10} = size;
+}
 
-  // Inherit Rn in 9-5
-  // Inherit Rt in 4-0
-}
-
-// Format AdvSIMD post-index vector load/store Single N-element structure
-// to/from one lane
-class NeonI_LdStOne_Lane_Post<bit l, bit r, bits<2> op2_1, bit op0, dag outs,
-                         dag ins, string asmstr,
-                         list<dag> patterns, InstrItinClass itin>
-  : A64InstRtnm<outs, ins, asmstr, patterns, itin>
-{
-  bits<4> lane;
-  let Inst{31} = 0b0;
-  let Inst{29-23} = 0b0011011;
-  let Inst{22} = l;
-  let Inst{21} = r;
-  // Inherit Rm in 20-16
-  let Inst{15-14} = op2_1;
-  let Inst{13} = op0;
-  
-  // Inherit Rn in 9-5
-  // Inherit Rt in 4-0
-}
-
-// Format AdvSIMD 3 scalar registers with different type
-
-class NeonI_Scalar3Diff<bit u, bits<2> size, bits<4> opcode,
-                          dag outs, dag ins, string asmstr,
-                          list<dag> patterns, InstrItinClass itin>
-  : A64InstRdnm<outs, ins, asmstr, patterns, itin> {
-  let Inst{31-30} = 0b01;
-  let Inst{29} = u;
-  let Inst{28-24} = 0b11110;
-  let Inst{23-22} = size;
-  let Inst{21} = 0b1;
-  // Inherit Rm in 20-16
-  let Inst{15-12} = opcode;
-  let Inst{11-10} = 0b00;
-  // Inherit Rn in 9-5
-  // Inherit Rd in 4-0
+let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
+multiclass SIMDLdSingleBTied<bit R, bits<3> opcode, string asm,
+                         RegisterOperand listtype,
+                         RegisterOperand GPR64pi> {
+  def i8 : SIMDLdStSingleBTied<1, R, opcode, asm,
+                           (outs listtype:$dst),
+                           (ins listtype:$Vt, VectorIndexB:$idx,
+                                GPR64sp:$Rn), []>;
+
+  def i8_POST : SIMDLdStSingleBTiedPost<1, R, opcode, asm,
+                            (outs GPR64sp:$wback, listtype:$dst),
+                            (ins listtype:$Vt, VectorIndexB:$idx,
+                                 GPR64sp:$Rn, GPR64pi:$Xm)>;
+}
+let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
+multiclass SIMDLdSingleHTied<bit R, bits<3> opcode, bit size, string asm,
+                         RegisterOperand listtype,
+                         RegisterOperand GPR64pi> {
+  def i16 : SIMDLdStSingleHTied<1, R, opcode, size, asm,
+                            (outs listtype:$dst),
+                            (ins listtype:$Vt, VectorIndexH:$idx,
+                                 GPR64sp:$Rn), []>;
+
+  def i16_POST : SIMDLdStSingleHTiedPost<1, R, opcode, size, asm,
+                            (outs GPR64sp:$wback, listtype:$dst),
+                            (ins listtype:$Vt, VectorIndexH:$idx,
+                                 GPR64sp:$Rn, GPR64pi:$Xm)>;
+}
+let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
+multiclass SIMDLdSingleSTied<bit R, bits<3> opcode, bits<2> size,string asm,
+                         RegisterOperand listtype,
+                         RegisterOperand GPR64pi> {
+  def i32 : SIMDLdStSingleSTied<1, R, opcode, size, asm,
+                            (outs listtype:$dst),
+                            (ins listtype:$Vt, VectorIndexS:$idx,
+                                 GPR64sp:$Rn), []>;
+
+  def i32_POST : SIMDLdStSingleSTiedPost<1, R, opcode, size, asm,
+                            (outs GPR64sp:$wback, listtype:$dst),
+                            (ins listtype:$Vt, VectorIndexS:$idx,
+                                 GPR64sp:$Rn, GPR64pi:$Xm)>;
+}
+let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
+multiclass SIMDLdSingleDTied<bit R, bits<3> opcode, bits<2> size, string asm,
+                         RegisterOperand listtype, RegisterOperand GPR64pi> {
+  def i64 : SIMDLdStSingleDTied<1, R, opcode, size, asm,
+                            (outs listtype:$dst),
+                            (ins listtype:$Vt, VectorIndexD:$idx,
+                                 GPR64sp:$Rn), []>;
+
+  def i64_POST : SIMDLdStSingleDTiedPost<1, R, opcode, size, asm,
+                            (outs GPR64sp:$wback, listtype:$dst),
+                            (ins listtype:$Vt, VectorIndexD:$idx,
+                                 GPR64sp:$Rn, GPR64pi:$Xm)>;
+}
+let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in
+multiclass SIMDStSingleB<bit R, bits<3> opcode, string asm,
+                         RegisterOperand listtype, RegisterOperand GPR64pi> {
+  def i8 : SIMDLdStSingleB<0, R, opcode, asm,
+                           (outs), (ins listtype:$Vt, VectorIndexB:$idx,
+                                        GPR64sp:$Rn), []>;
+
+  def i8_POST : SIMDLdStSingleBPost<0, R, opcode, asm,
+                                    (outs GPR64sp:$wback),
+                                    (ins listtype:$Vt, VectorIndexB:$idx,
+                                         GPR64sp:$Rn, GPR64pi:$Xm)>;
+}
+let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in
+multiclass SIMDStSingleH<bit R, bits<3> opcode, bit size, string asm,
+                         RegisterOperand listtype, RegisterOperand GPR64pi> {
+  def i16 : SIMDLdStSingleH<0, R, opcode, size, asm,
+                            (outs), (ins listtype:$Vt, VectorIndexH:$idx,
+                                         GPR64sp:$Rn), []>;
+
+  def i16_POST : SIMDLdStSingleHPost<0, R, opcode, size, asm,
+                            (outs GPR64sp:$wback),
+                            (ins listtype:$Vt, VectorIndexH:$idx,
+                                 GPR64sp:$Rn, GPR64pi:$Xm)>;
+}
+let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in
+multiclass SIMDStSingleS<bit R, bits<3> opcode, bits<2> size,string asm,
+                         RegisterOperand listtype, RegisterOperand GPR64pi> {
+  def i32 : SIMDLdStSingleS<0, R, opcode, size, asm,
+                            (outs), (ins listtype:$Vt, VectorIndexS:$idx,
+                                         GPR64sp:$Rn), []>;
+
+  def i32_POST : SIMDLdStSingleSPost<0, R, opcode, size, asm,
+                            (outs GPR64sp:$wback),
+                            (ins listtype:$Vt, VectorIndexS:$idx,
+                                 GPR64sp:$Rn, GPR64pi:$Xm)>;
+}
+let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in
+multiclass SIMDStSingleD<bit R, bits<3> opcode, bits<2> size, string asm,
+                         RegisterOperand listtype, RegisterOperand GPR64pi> {
+  def i64 : SIMDLdStSingleD<0, R, opcode, size, asm,
+                            (outs), (ins listtype:$Vt, VectorIndexD:$idx,
+                                         GPR64sp:$Rn), []>;
+
+  def i64_POST : SIMDLdStSingleDPost<0, R, opcode, size, asm,
+                            (outs GPR64sp:$wback),
+                            (ins listtype:$Vt, VectorIndexD:$idx,
+                                 GPR64sp:$Rn, GPR64pi:$Xm)>;
 }
 
-// Format AdvSIMD scalar shift by immediate
+multiclass SIMDLdStSingleAliases<string asm, string layout, string Type,
+                                 string Count, int Offset, Operand idxtype> {
+  // E.g. "ld1 { v0.8b }[0], [x1], #1"
+  //      "ld1\t$Vt, [$Rn], #1"
+  // may get mapped to
+  //      (LD1Rv8b_POST VecListOne8b:$Vt, GPR64sp:$Rn, XZR)
+  def : InstAlias<asm # "\t$Vt$idx, [$Rn], #" # Offset,
+                  (!cast<Instruction>(NAME # Type  # "_POST")
+                      GPR64sp:$Rn,
+                      !cast<RegisterOperand>("VecList" # Count # layout):$Vt,
+                      idxtype:$idx, XZR), 1>;
+
+  // E.g. "ld1.8b { v0 }[0], [x1], #1"
+  //      "ld1.8b\t$Vt, [$Rn], #1"
+  // may get mapped to
+  //      (LD1Rv8b_POST VecListOne64:$Vt, GPR64sp:$Rn, XZR)
+  def : InstAlias<asm # "." # layout # "\t$Vt$idx, [$Rn], #" # Offset,
+                  (!cast<Instruction>(NAME # Type # "_POST")
+                      GPR64sp:$Rn,
+                      !cast<RegisterOperand>("VecList" # Count # "128"):$Vt,
+                      idxtype:$idx, XZR), 0>;
+
+  // E.g. "ld1.8b { v0 }[0], [x1]"
+  //      "ld1.8b\t$Vt, [$Rn]"
+  // may get mapped to
+  //      (LD1Rv8b VecListOne64:$Vt, GPR64sp:$Rn)
+  def : InstAlias<asm # "." # layout # "\t$Vt$idx, [$Rn]",
+                      (!cast<Instruction>(NAME # Type)
+                         !cast<RegisterOperand>("VecList" # Count # "128"):$Vt,
+                         idxtype:$idx, GPR64sp:$Rn), 0>;
+
+  // E.g. "ld1.8b { v0 }[0], [x1], x2"
+  //      "ld1.8b\t$Vt, [$Rn], $Xm"
+  // may get mapped to
+  //      (LD1Rv8b_POST VecListOne64:$Vt, GPR64sp:$Rn, GPR64pi1:$Xm)
+  def : InstAlias<asm # "." # layout # "\t$Vt$idx, [$Rn], $Xm",
+                      (!cast<Instruction>(NAME # Type # "_POST")
+                         GPR64sp:$Rn,
+                         !cast<RegisterOperand>("VecList" # Count # "128"):$Vt,
+                         idxtype:$idx,
+                         !cast<RegisterOperand>("GPR64pi" # Offset):$Xm), 0>;
+}
 
-class NeonI_ScalarShiftImm<bit u, bits<5> opcode,
-                           dag outs, dag ins, string asmstr,
-                           list<dag> patterns, InstrItinClass itin>
-  : A64InstRdn<outs, ins, asmstr, patterns, itin> {
-  bits<4> Imm4;
-  bits<3> Imm3;
-  let Inst{31-30} = 0b01;
-  let Inst{29} = u;
-  let Inst{28-23} = 0b111110;
-  let Inst{22-19} = Imm4;
-  let Inst{18-16} = Imm3;
-  let Inst{15-11} = opcode;
-  let Inst{10} = 0b1;
-  // Inherit Rn in 9-5
-  // Inherit Rd in 4-0
+multiclass SIMDLdSt1SingleAliases<string asm> {
+  defm : SIMDLdStSingleAliases<asm, "b", "i8",  "One", 1, VectorIndexB>;
+  defm : SIMDLdStSingleAliases<asm, "h", "i16", "One", 2, VectorIndexH>;
+  defm : SIMDLdStSingleAliases<asm, "s", "i32", "One", 4, VectorIndexS>;
+  defm : SIMDLdStSingleAliases<asm, "d", "i64", "One", 8, VectorIndexD>;
 }
 
-// Format AdvSIMD crypto AES
-class NeonI_Crypto_AES<bits<2> size, bits<5> opcode,
-                       dag outs, dag ins, string asmstr,
-                       list<dag> patterns, InstrItinClass itin>
-  : A64InstRdn<outs, ins, asmstr, patterns, itin> {
-  let Inst{31-24} = 0b01001110;
-  let Inst{23-22} = size;
-  let Inst{21-17} = 0b10100;
-  let Inst{16-12} = opcode;
-  let Inst{11-10} = 0b10;
-  // Inherit Rn in 9-5
-  // Inherit Rd in 4-0
+multiclass SIMDLdSt2SingleAliases<string asm> {
+  defm : SIMDLdStSingleAliases<asm, "b", "i8",  "Two", 2,  VectorIndexB>;
+  defm : SIMDLdStSingleAliases<asm, "h", "i16", "Two", 4,  VectorIndexH>;
+  defm : SIMDLdStSingleAliases<asm, "s", "i32", "Two", 8,  VectorIndexS>;
+  defm : SIMDLdStSingleAliases<asm, "d", "i64", "Two", 16, VectorIndexD>;
 }
 
-// Format AdvSIMD crypto SHA
-class NeonI_Crypto_SHA<bits<2> size, bits<5> opcode,
-                       dag outs, dag ins, string asmstr,
-                       list<dag> patterns, InstrItinClass itin>
-  : A64InstRdn<outs, ins, asmstr, patterns, itin> {
-  let Inst{31-24} = 0b01011110;
-  let Inst{23-22} = size;
-  let Inst{21-17} = 0b10100;
-  let Inst{16-12} = opcode;
+multiclass SIMDLdSt3SingleAliases<string asm> {
+  defm : SIMDLdStSingleAliases<asm, "b", "i8",  "Three", 3,  VectorIndexB>;
+  defm : SIMDLdStSingleAliases<asm, "h", "i16", "Three", 6,  VectorIndexH>;
+  defm : SIMDLdStSingleAliases<asm, "s", "i32", "Three", 12, VectorIndexS>;
+  defm : SIMDLdStSingleAliases<asm, "d", "i64", "Three", 24, VectorIndexD>;
+}
+
+multiclass SIMDLdSt4SingleAliases<string asm> {
+  defm : SIMDLdStSingleAliases<asm, "b", "i8",  "Four", 4,  VectorIndexB>;
+  defm : SIMDLdStSingleAliases<asm, "h", "i16", "Four", 8,  VectorIndexH>;
+  defm : SIMDLdStSingleAliases<asm, "s", "i32", "Four", 16, VectorIndexS>;
+  defm : SIMDLdStSingleAliases<asm, "d", "i64", "Four", 32, VectorIndexD>;
+}
+} // end of 'let Predicates = [HasNEON]'
+
+//----------------------------------------------------------------------------
+// Crypto extensions
+//----------------------------------------------------------------------------
+
+let Predicates = [HasCrypto] in {
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class AESBase<bits<4> opc, string asm, dag outs, dag ins, string cstr,
+              list<dag> pat>
+  : I<outs, ins, asm, "{\t$Rd.16b, $Rn.16b|.16b\t$Rd, $Rn}", cstr, pat>,
+    Sched<[WriteV]>{
+  bits<5> Rd;
+  bits<5> Rn;
+  let Inst{31-16} = 0b0100111000101000;
+  let Inst{15-12} = opc;
   let Inst{11-10} = 0b10;
-  // Inherit Rn in 9-5
-  // Inherit Rd in 4-0
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
 }
 
-// Format AdvSIMD crypto 3V SHA
-class NeonI_Crypto_3VSHA<bits<2> size, bits<3> opcode,
-                         dag outs, dag ins, string asmstr,
-                         list<dag> patterns, InstrItinClass itin>
-  : A64InstRdnm<outs, ins, asmstr, patterns, itin> {
-  let Inst{31-24} = 0b01011110;
-  let Inst{23-22} = size;
-  let Inst{21} = 0b0;
-  // Inherit Rm in 20-16
-  let Inst{15} = 0b0;
-  let Inst{14-12} = opcode;
+class AESInst<bits<4> opc, string asm, Intrinsic OpNode>
+  : AESBase<opc, asm, (outs V128:$Rd), (ins V128:$Rn), "",
+            [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>;
+
+class AESTiedInst<bits<4> opc, string asm, Intrinsic OpNode>
+  : AESBase<opc, asm, (outs V128:$dst), (ins V128:$Rd, V128:$Rn),
+            "$Rd = $dst",
+            [(set (v16i8 V128:$dst),
+                  (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn)))]>;
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class SHA3OpTiedInst<bits<3> opc, string asm, string dst_lhs_kind,
+                     dag oops, dag iops, list<dag> pat>
+  : I<oops, iops, asm,
+      "{\t$Rd" # dst_lhs_kind # ", $Rn" # dst_lhs_kind # ", $Rm.4s" #
+      "|.4s\t$Rd, $Rn, $Rm}", "$Rd = $dst", pat>,
+    Sched<[WriteV]>{
+  bits<5> Rd;
+  bits<5> Rn;
+  bits<5> Rm;
+  let Inst{31-21} = 0b01011110000;
+  let Inst{20-16} = Rm;
+  let Inst{15}    = 0;
+  let Inst{14-12} = opc;
   let Inst{11-10} = 0b00;
-  // Inherit Rn in 9-5
-  // Inherit Rd in 4-0
-}
-
-// Format AdvSIMD scalar x indexed element
-class NeonI_ScalarXIndexedElem<bit u, bit szhi, bit szlo,
-                               bits<4> opcode, dag outs, dag ins,
-                               string asmstr, list<dag> patterns,
-                               InstrItinClass itin>
-  : A64InstRdnm<outs, ins, asmstr, patterns, itin>
-{
-  let Inst{31} = 0b0;
-  let Inst{30} = 0b1;
-  let Inst{29} = u;
-  let Inst{28-24} = 0b11111;
-  let Inst{23} = szhi;
-  let Inst{22} = szlo;
-  // l in Inst{21}
-  // m in Instr{20}
-  // Inherit Rm in 19-16
-  let Inst{15-12} = opcode;
-  // h in Inst{11}
-  let Inst{10} = 0b0;
-  // Inherit Rn in 9-5
-  // Inherit Rd in 4-0
-}
-// Format AdvSIMD scalar copy - insert from element to scalar
-class NeonI_ScalarCopy<dag outs, dag ins, string asmstr,
-                       list<dag> patterns, InstrItinClass itin>
-  : NeonI_copy<0b1, 0b0, 0b0000, outs, ins, asmstr, patterns, itin> {
-  let Inst{28} = 0b1;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
 }
+
+class SHATiedInstQSV<bits<3> opc, string asm, Intrinsic OpNode>
+  : SHA3OpTiedInst<opc, asm, "", (outs FPR128:$dst),
+                   (ins FPR128:$Rd, FPR32:$Rn, V128:$Rm),
+                   [(set (v4i32 FPR128:$dst),
+                         (OpNode (v4i32 FPR128:$Rd), (i32 FPR32:$Rn),
+                                 (v4i32 V128:$Rm)))]>;
+
+class SHATiedInstVVV<bits<3> opc, string asm, Intrinsic OpNode>
+  : SHA3OpTiedInst<opc, asm, ".4s", (outs V128:$dst),
+                   (ins V128:$Rd, V128:$Rn, V128:$Rm),
+                   [(set (v4i32 V128:$dst),
+                         (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn),
+                                 (v4i32 V128:$Rm)))]>;
+
+class SHATiedInstQQV<bits<3> opc, string asm, Intrinsic OpNode>
+  : SHA3OpTiedInst<opc, asm, "", (outs FPR128:$dst),
+                   (ins FPR128:$Rd, FPR128:$Rn, V128:$Rm),
+                   [(set (v4i32 FPR128:$dst),
+                         (OpNode (v4i32 FPR128:$Rd), (v4i32 FPR128:$Rn),
+                                 (v4i32 V128:$Rm)))]>;
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class SHA2OpInst<bits<4> opc, string asm, string kind,
+                 string cstr, dag oops, dag iops,
+                 list<dag> pat>
+  : I<oops, iops, asm, "{\t$Rd" # kind # ", $Rn" # kind #
+                       "|" # kind # "\t$Rd, $Rn}", cstr, pat>,
+    Sched<[WriteV]>{
+  bits<5> Rd;
+  bits<5> Rn;
+  let Inst{31-16} = 0b0101111000101000;
+  let Inst{15-12} = opc;
+  let Inst{11-10} = 0b10;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
 }
 
+class SHATiedInstVV<bits<4> opc, string asm, Intrinsic OpNode>
+  : SHA2OpInst<opc, asm, ".4s", "$Rd = $dst", (outs V128:$dst),
+               (ins V128:$Rd, V128:$Rn),
+               [(set (v4i32 V128:$dst),
+                     (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn)))]>;
+
+class SHAInstSS<bits<4> opc, string asm, Intrinsic OpNode>
+  : SHA2OpInst<opc, asm, "", "", (outs FPR32:$Rd), (ins FPR32:$Rn),
+               [(set (i32 FPR32:$Rd), (OpNode (i32 FPR32:$Rn)))]>;
+} // end of 'let Predicates = [HasCrypto]'
+
+// Allow the size specifier tokens to be upper case, not just lower.
+def : TokenAlias<".8B", ".8b">;
+def : TokenAlias<".4H", ".4h">;
+def : TokenAlias<".2S", ".2s">;
+def : TokenAlias<".1D", ".1d">;
+def : TokenAlias<".16B", ".16b">;
+def : TokenAlias<".8H", ".8h">;
+def : TokenAlias<".4S", ".4s">;
+def : TokenAlias<".2D", ".2d">;
+def : TokenAlias<".1Q", ".1q">;
+def : TokenAlias<".B", ".b">;
+def : TokenAlias<".H", ".h">;
+def : TokenAlias<".S", ".s">;
+def : TokenAlias<".D", ".d">;
+def : TokenAlias<".Q", ".q">;
diff --git a/lib/Target/AArch64/AArch64InstrInfo.cpp b/lib/Target/AArch64/AArch64InstrInfo.cpp
index afb2034..ff115c0 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -11,257 +11,83 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "AArch64.h"
 #include "AArch64InstrInfo.h"
-#include "AArch64MachineFunctionInfo.h"
-#include "AArch64TargetMachine.h"
-#include "MCTargetDesc/AArch64MCTargetDesc.h"
-#include "Utils/AArch64BaseInfo.h"
-#include "llvm/CodeGen/MachineConstantPool.h"
-#include "llvm/CodeGen/MachineDominators.h"
+#include "AArch64Subtarget.h"
+#include "MCTargetDesc/AArch64AddressingModes.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/IR/Function.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/MC/MCInst.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/TargetRegistry.h"
-#include <algorithm>
+
+using namespace llvm;
 
 #define GET_INSTRINFO_CTOR_DTOR
 #include "AArch64GenInstrInfo.inc"
 
-using namespace llvm;
-
 AArch64InstrInfo::AArch64InstrInfo(const AArch64Subtarget &STI)
-  : AArch64GenInstrInfo(AArch64::ADJCALLSTACKDOWN, AArch64::ADJCALLSTACKUP),
-    Subtarget(STI) {}
+    : AArch64GenInstrInfo(AArch64::ADJCALLSTACKDOWN, AArch64::ADJCALLSTACKUP),
+      RI(this, &STI), Subtarget(STI) {}
 
-void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
-                                   MachineBasicBlock::iterator I, DebugLoc DL,
-                                   unsigned DestReg, unsigned SrcReg,
-                                   bool KillSrc) const {
-  unsigned Opc = 0;
-  unsigned ZeroReg = 0;
-  if (DestReg == AArch64::XSP || SrcReg == AArch64::XSP) {
-    // E.g. ADD xDst, xsp, #0 (, lsl #0)
-    BuildMI(MBB, I, DL, get(AArch64::ADDxxi_lsl0_s), DestReg)
-      .addReg(SrcReg)
-      .addImm(0);
-    return;
-  } else if (DestReg == AArch64::WSP || SrcReg == AArch64::WSP) {
-    // E.g. ADD wDST, wsp, #0 (, lsl #0)
-    BuildMI(MBB, I, DL, get(AArch64::ADDwwi_lsl0_s), DestReg)
-      .addReg(SrcReg)
-      .addImm(0);
-    return;
-  } else if (DestReg == AArch64::NZCV) {
-    assert(AArch64::GPR64RegClass.contains(SrcReg));
-    // E.g. MSR NZCV, xDST
-    BuildMI(MBB, I, DL, get(AArch64::MSRix))
-      .addImm(A64SysReg::NZCV)
-      .addReg(SrcReg);
-  } else if (SrcReg == AArch64::NZCV) {
-    assert(AArch64::GPR64RegClass.contains(DestReg));
-    // E.g. MRS xDST, NZCV
-    BuildMI(MBB, I, DL, get(AArch64::MRSxi), DestReg)
-      .addImm(A64SysReg::NZCV);
-  } else if (AArch64::GPR64RegClass.contains(DestReg)) {
-    if(AArch64::GPR64RegClass.contains(SrcReg)){
-      Opc = AArch64::ORRxxx_lsl;
-      ZeroReg = AArch64::XZR;
-    } else{
-      assert(AArch64::FPR64RegClass.contains(SrcReg));
-      BuildMI(MBB, I, DL, get(AArch64::FMOVxd), DestReg)
-        .addReg(SrcReg);
-      return;
-    }
-  } else if (AArch64::GPR32RegClass.contains(DestReg)) {
-    if(AArch64::GPR32RegClass.contains(SrcReg)){
-      Opc = AArch64::ORRwww_lsl;
-      ZeroReg = AArch64::WZR;
-    } else{
-      assert(AArch64::FPR32RegClass.contains(SrcReg));
-      BuildMI(MBB, I, DL, get(AArch64::FMOVws), DestReg)
-        .addReg(SrcReg);
-      return;
-    }
-  } else if (AArch64::FPR32RegClass.contains(DestReg)) {
-    if(AArch64::FPR32RegClass.contains(SrcReg)){
-      BuildMI(MBB, I, DL, get(AArch64::FMOVss), DestReg)
-        .addReg(SrcReg);
-      return;
-    }
-    else {
-      assert(AArch64::GPR32RegClass.contains(SrcReg));
-      BuildMI(MBB, I, DL, get(AArch64::FMOVsw), DestReg)
-        .addReg(SrcReg);
-      return;
-    }
-  } else if (AArch64::FPR64RegClass.contains(DestReg)) {
-    if(AArch64::FPR64RegClass.contains(SrcReg)){
-      BuildMI(MBB, I, DL, get(AArch64::FMOVdd), DestReg)
-        .addReg(SrcReg);
-      return;
-    }
-    else {
-      assert(AArch64::GPR64RegClass.contains(SrcReg));
-      BuildMI(MBB, I, DL, get(AArch64::FMOVdx), DestReg)
-        .addReg(SrcReg);
-      return;
-    }
-  } else if (AArch64::FPR128RegClass.contains(DestReg)) {
-    assert(AArch64::FPR128RegClass.contains(SrcReg));
+/// GetInstSize - Return the number of bytes of code the specified
+/// instruction may be.  This returns the maximum number of bytes.
+unsigned AArch64InstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const {
+  const MCInstrDesc &Desc = MI->getDesc();
 
-    // If NEON is enable, we use ORR to implement this copy.
-    // If NEON isn't available, emit STR and LDR to handle this.
-    if(getSubTarget().hasNEON()) {
-      BuildMI(MBB, I, DL, get(AArch64::ORRvvv_16B), DestReg)
-        .addReg(SrcReg)
-        .addReg(SrcReg);
-      return;
-    } else {
-      BuildMI(MBB, I, DL, get(AArch64::LSFP128_PreInd_STR), AArch64::XSP)
-        .addReg(SrcReg)
-        .addReg(AArch64::XSP)
-        .addImm(0x1ff & -16);
-
-      BuildMI(MBB, I, DL, get(AArch64::LSFP128_PostInd_LDR), DestReg)
-        .addReg(AArch64::XSP, RegState::Define)
-        .addReg(AArch64::XSP)
-        .addImm(16);
-      return;
-    }
-  } else if (AArch64::FPR8RegClass.contains(DestReg, SrcReg)) {
-    // The copy of two FPR8 registers is implemented by the copy of two FPR32
-    const TargetRegisterInfo *TRI = &getRegisterInfo();
-    unsigned Dst = TRI->getMatchingSuperReg(DestReg, AArch64::sub_8,
-                                            &AArch64::FPR32RegClass);
-    unsigned Src = TRI->getMatchingSuperReg(SrcReg, AArch64::sub_8,
-                                            &AArch64::FPR32RegClass);
-    BuildMI(MBB, I, DL, get(AArch64::FMOVss), Dst)
-      .addReg(Src);
-    return;
-  } else if (AArch64::FPR16RegClass.contains(DestReg, SrcReg)) {
-    // The copy of two FPR16 registers is implemented by the copy of two FPR32
-    const TargetRegisterInfo *TRI = &getRegisterInfo();
-    unsigned Dst = TRI->getMatchingSuperReg(DestReg, AArch64::sub_16,
-                                            &AArch64::FPR32RegClass);
-    unsigned Src = TRI->getMatchingSuperReg(SrcReg, AArch64::sub_16,
-                                            &AArch64::FPR32RegClass);
-    BuildMI(MBB, I, DL, get(AArch64::FMOVss), Dst)
-      .addReg(Src);
-    return;
-  } else {
-    CopyPhysRegTuple(MBB, I, DL, DestReg, SrcReg);
-    return;
+  switch (Desc.getOpcode()) {
+  default:
+    // Anything not explicitly designated otherwise is a nomal 4-byte insn.
+    return 4;
+  case TargetOpcode::DBG_VALUE:
+  case TargetOpcode::EH_LABEL:
+  case TargetOpcode::IMPLICIT_DEF:
+  case TargetOpcode::KILL:
+    return 0;
   }
 
-  // E.g. ORR xDst, xzr, xSrc, lsl #0
-  BuildMI(MBB, I, DL, get(Opc), DestReg)
-    .addReg(ZeroReg)
-    .addReg(SrcReg)
-    .addImm(0);
-}
-
-void AArch64InstrInfo::CopyPhysRegTuple(MachineBasicBlock &MBB,
-                                        MachineBasicBlock::iterator I,
-                                        DebugLoc DL, unsigned DestReg,
-                                        unsigned SrcReg) const {
-  unsigned SubRegs;
-  bool IsQRegs;
-  if (AArch64::DPairRegClass.contains(DestReg, SrcReg)) {
-    SubRegs = 2;
-    IsQRegs = false;
-  } else if (AArch64::DTripleRegClass.contains(DestReg, SrcReg)) {
-    SubRegs = 3;
-    IsQRegs = false;
-  } else if (AArch64::DQuadRegClass.contains(DestReg, SrcReg)) {
-    SubRegs = 4;
-    IsQRegs = false;
-  } else if (AArch64::QPairRegClass.contains(DestReg, SrcReg)) {
-    SubRegs = 2;
-    IsQRegs = true;
-  } else if (AArch64::QTripleRegClass.contains(DestReg, SrcReg)) {
-    SubRegs = 3;
-    IsQRegs = true;
-  } else if (AArch64::QQuadRegClass.contains(DestReg, SrcReg)) {
-    SubRegs = 4;
-    IsQRegs = true;
-  } else
-    llvm_unreachable("Unknown register class");
-
-  unsigned BeginIdx = IsQRegs ? AArch64::qsub_0 : AArch64::dsub_0;
-  int Spacing = 1;
-  const TargetRegisterInfo *TRI = &getRegisterInfo();
-  // Copy register tuples backward when the first Dest reg overlaps
-  // with SrcReg.
-  if (TRI->regsOverlap(SrcReg, TRI->getSubReg(DestReg, BeginIdx))) {
-    BeginIdx = BeginIdx + (SubRegs - 1);
-    Spacing = -1;
-  }
-
-  unsigned Opc = IsQRegs ? AArch64::ORRvvv_16B : AArch64::ORRvvv_8B;
-  for (unsigned i = 0; i != SubRegs; ++i) {
-    unsigned Dst = TRI->getSubReg(DestReg, BeginIdx + i * Spacing);
-    unsigned Src = TRI->getSubReg(SrcReg, BeginIdx + i * Spacing);
-    assert(Dst && Src && "Bad sub-register");
-    BuildMI(MBB, I, I->getDebugLoc(), get(Opc), Dst)
-        .addReg(Src)
-        .addReg(Src);
-  }
-  return;
-}
-
-/// Does the Opcode represent a conditional branch that we can remove and re-add
-/// at the end of a basic block?
-static bool isCondBranch(unsigned Opc) {
-  return Opc == AArch64::Bcc || Opc == AArch64::CBZw || Opc == AArch64::CBZx ||
-         Opc == AArch64::CBNZw || Opc == AArch64::CBNZx ||
-         Opc == AArch64::TBZwii || Opc == AArch64::TBZxii ||
-         Opc == AArch64::TBNZwii || Opc == AArch64::TBNZxii;
-}
-
-/// Takes apart a given conditional branch MachineInstr (see isCondBranch),
-/// setting TBB to the destination basic block and populating the Cond vector
-/// with data necessary to recreate the conditional branch at a later
-/// date. First element will be the opcode, and subsequent ones define the
-/// conditions being branched on in an instruction-specific manner.
-static void classifyCondBranch(MachineInstr *I, MachineBasicBlock *&TBB,
-                               SmallVectorImpl<MachineOperand> &Cond) {
-  switch(I->getOpcode()) {
-  case AArch64::Bcc:
-  case AArch64::CBZw:
-  case AArch64::CBZx:
-  case AArch64::CBNZw:
-  case AArch64::CBNZx:
-    // These instructions just have one predicate operand in position 0 (either
-    // a condition code or a register being compared).
-    Cond.push_back(MachineOperand::CreateImm(I->getOpcode()));
-    Cond.push_back(I->getOperand(0));
-    TBB = I->getOperand(1).getMBB();
-    return;
-  case AArch64::TBZwii:
-  case AArch64::TBZxii:
-  case AArch64::TBNZwii:
-  case AArch64::TBNZxii:
-    // These have two predicate operands: a register and a bit position.
-    Cond.push_back(MachineOperand::CreateImm(I->getOpcode()));
-    Cond.push_back(I->getOperand(0));
-    Cond.push_back(I->getOperand(1));
-    TBB = I->getOperand(2).getMBB();
-    return;
+  llvm_unreachable("GetInstSizeInBytes()- Unable to determin insn size");
+}
+
+static void parseCondBranch(MachineInstr *LastInst, MachineBasicBlock *&Target,
+                            SmallVectorImpl<MachineOperand> &Cond) {
+  // Block ends with fall-through condbranch.
+  switch (LastInst->getOpcode()) {
   default:
-    llvm_unreachable("Unknown conditional branch to classify");
+    llvm_unreachable("Unknown branch instruction?");
+  case AArch64::Bcc:
+    Target = LastInst->getOperand(1).getMBB();
+    Cond.push_back(LastInst->getOperand(0));
+    break;
+  case AArch64::CBZW:
+  case AArch64::CBZX:
+  case AArch64::CBNZW:
+  case AArch64::CBNZX:
+    Target = LastInst->getOperand(1).getMBB();
+    Cond.push_back(MachineOperand::CreateImm(-1));
+    Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
+    Cond.push_back(LastInst->getOperand(0));
+    break;
+  case AArch64::TBZW:
+  case AArch64::TBZX:
+  case AArch64::TBNZW:
+  case AArch64::TBNZX:
+    Target = LastInst->getOperand(2).getMBB();
+    Cond.push_back(MachineOperand::CreateImm(-1));
+    Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
+    Cond.push_back(LastInst->getOperand(0));
+    Cond.push_back(LastInst->getOperand(1));
   }
 }
 
-
-bool
-AArch64InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB,
-                                MachineBasicBlock *&FBB,
-                                SmallVectorImpl<MachineOperand> &Cond,
-                                bool AllowModify) const {
+// Branch analysis.
+bool AArch64InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
+                                   MachineBasicBlock *&TBB,
+                                   MachineBasicBlock *&FBB,
+                                   SmallVectorImpl<MachineOperand> &Cond,
+                                   bool AllowModify) const {
   // If the block has no terminators, it just falls into the block after it.
   MachineBasicBlock::iterator I = MBB.end();
   if (I == MBB.begin())
@@ -281,15 +107,16 @@ AArch64InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB,
   // If there is only one terminator instruction, process it.
   unsigned LastOpc = LastInst->getOpcode();
   if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) {
-    if (LastOpc == AArch64::Bimm) {
+    if (isUncondBranchOpcode(LastOpc)) {
       TBB = LastInst->getOperand(0).getMBB();
       return false;
     }
-    if (isCondBranch(LastOpc)) {
-      classifyCondBranch(LastInst, TBB, Cond);
+    if (isCondBranchOpcode(LastOpc)) {
+      // Block ends with fall-through condbranch.
+      parseCondBranch(LastInst, TBB, Cond);
       return false;
     }
-    return true;  // Can't handle indirect branch.
+    return true; // Can't handle indirect branch.
   }
 
   // Get the instruction before it if it is a terminator.
@@ -298,8 +125,8 @@ AArch64InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB,
 
   // If AllowModify is true and the block ends with two or more unconditional
   // branches, delete all but the first unconditional branch.
-  if (AllowModify && LastOpc == AArch64::Bimm) {
-    while (SecondLastOpc == AArch64::Bimm) {
+  if (AllowModify && isUncondBranchOpcode(LastOpc)) {
+    while (isUncondBranchOpcode(SecondLastOpc)) {
       LastInst->eraseFromParent();
       LastInst = SecondLastInst;
       LastOpc = LastInst->getOpcode();
@@ -319,23 +146,15 @@ AArch64InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB,
     return true;
 
   // If the block ends with a B and a Bcc, handle it.
-  if (LastOpc == AArch64::Bimm) {
-    if (SecondLastOpc == AArch64::Bcc) {
-      TBB =  SecondLastInst->getOperand(1).getMBB();
-      Cond.push_back(MachineOperand::CreateImm(AArch64::Bcc));
-      Cond.push_back(SecondLastInst->getOperand(0));
-      FBB = LastInst->getOperand(0).getMBB();
-      return false;
-    } else if (isCondBranch(SecondLastOpc)) {
-      classifyCondBranch(SecondLastInst, TBB, Cond);
-      FBB = LastInst->getOperand(0).getMBB();
-      return false;
-    }
+  if (isCondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
+    parseCondBranch(SecondLastInst, TBB, Cond);
+    FBB = LastInst->getOperand(0).getMBB();
+    return false;
   }
 
   // If the block ends with two unconditional branches, handle it.  The second
   // one is not executed, so remove it.
-  if (SecondLastOpc == AArch64::Bimm && LastOpc == AArch64::Bimm) {
+  if (isUncondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
     TBB = SecondLastInst->getOperand(0).getMBB();
     I = LastInst;
     if (AllowModify)
@@ -343,84 +162,72 @@ AArch64InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB,
     return false;
   }
 
+  // ...likewise if it ends with an indirect branch followed by an unconditional
+  // branch.
+  if (isIndirectBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
+    I = LastInst;
+    if (AllowModify)
+      I->eraseFromParent();
+    return true;
+  }
+
   // Otherwise, can't handle this.
   return true;
 }
 
 bool AArch64InstrInfo::ReverseBranchCondition(
-                                  SmallVectorImpl<MachineOperand> &Cond) const {
-  switch (Cond[0].getImm()) {
-  case AArch64::Bcc: {
-    A64CC::CondCodes CC = static_cast<A64CC::CondCodes>(Cond[1].getImm());
-    CC = A64InvertCondCode(CC);
-    Cond[1].setImm(CC);
-    return false;
-  }
-  case AArch64::CBZw:
-    Cond[0].setImm(AArch64::CBNZw);
-    return false;
-  case AArch64::CBZx:
-    Cond[0].setImm(AArch64::CBNZx);
-    return false;
-  case AArch64::CBNZw:
-    Cond[0].setImm(AArch64::CBZw);
-    return false;
-  case AArch64::CBNZx:
-    Cond[0].setImm(AArch64::CBZx);
-    return false;
-  case AArch64::TBZwii:
-    Cond[0].setImm(AArch64::TBNZwii);
-    return false;
-  case AArch64::TBZxii:
-    Cond[0].setImm(AArch64::TBNZxii);
-    return false;
-  case AArch64::TBNZwii:
-    Cond[0].setImm(AArch64::TBZwii);
-    return false;
-  case AArch64::TBNZxii:
-    Cond[0].setImm(AArch64::TBZxii);
-    return false;
-  default:
-    llvm_unreachable("Unknown branch type");
-  }
-}
-
-
-unsigned
-AArch64InstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
-                               MachineBasicBlock *FBB,
-                               const SmallVectorImpl<MachineOperand> &Cond,
-                               DebugLoc DL) const {
-  if (FBB == 0 && Cond.empty()) {
-    BuildMI(&MBB, DL, get(AArch64::Bimm)).addMBB(TBB);
-    return 1;
-  } else if (FBB == 0) {
-    MachineInstrBuilder MIB = BuildMI(&MBB, DL, get(Cond[0].getImm()));
-    for (int i = 1, e = Cond.size(); i != e; ++i)
-      MIB.addOperand(Cond[i]);
-    MIB.addMBB(TBB);
-    return 1;
+    SmallVectorImpl<MachineOperand> &Cond) const {
+  if (Cond[0].getImm() != -1) {
+    // Regular Bcc
+    AArch64CC::CondCode CC = (AArch64CC::CondCode)(int)Cond[0].getImm();
+    Cond[0].setImm(AArch64CC::getInvertedCondCode(CC));
+  } else {
+    // Folded compare-and-branch
+    switch (Cond[1].getImm()) {
+    default:
+      llvm_unreachable("Unknown conditional branch!");
+    case AArch64::CBZW:
+      Cond[1].setImm(AArch64::CBNZW);
+      break;
+    case AArch64::CBNZW:
+      Cond[1].setImm(AArch64::CBZW);
+      break;
+    case AArch64::CBZX:
+      Cond[1].setImm(AArch64::CBNZX);
+      break;
+    case AArch64::CBNZX:
+      Cond[1].setImm(AArch64::CBZX);
+      break;
+    case AArch64::TBZW:
+      Cond[1].setImm(AArch64::TBNZW);
+      break;
+    case AArch64::TBNZW:
+      Cond[1].setImm(AArch64::TBZW);
+      break;
+    case AArch64::TBZX:
+      Cond[1].setImm(AArch64::TBNZX);
+      break;
+    case AArch64::TBNZX:
+      Cond[1].setImm(AArch64::TBZX);
+      break;
+    }
   }
 
-  MachineInstrBuilder MIB = BuildMI(&MBB, DL, get(Cond[0].getImm()));
-  for (int i = 1, e = Cond.size(); i != e; ++i)
-    MIB.addOperand(Cond[i]);
-  MIB.addMBB(TBB);
-
-  BuildMI(&MBB, DL, get(AArch64::Bimm)).addMBB(FBB);
-  return 2;
+  return false;
 }
 
 unsigned AArch64InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
   MachineBasicBlock::iterator I = MBB.end();
-  if (I == MBB.begin()) return 0;
+  if (I == MBB.begin())
+    return 0;
   --I;
   while (I->isDebugValue()) {
     if (I == MBB.begin())
       return 0;
     --I;
   }
-  if (I->getOpcode() != AArch64::Bimm && !isCondBranch(I->getOpcode()))
+  if (!isUncondBranchOpcode(I->getOpcode()) &&
+      !isCondBranchOpcode(I->getOpcode()))
     return 0;
 
   // Remove the branch.
@@ -428,9 +235,10 @@ unsigned AArch64InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
 
   I = MBB.end();
 
-  if (I == MBB.begin()) return 1;
+  if (I == MBB.begin())
+    return 1;
   --I;
-  if (!isCondBranch(I->getOpcode()))
+  if (!isCondBranchOpcode(I->getOpcode()))
     return 1;
 
   // Remove the branch.
@@ -438,542 +246,1838 @@ unsigned AArch64InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
   return 2;
 }
 
-bool
-AArch64InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MBBI) const {
-  MachineInstr &MI = *MBBI;
-  MachineBasicBlock &MBB = *MI.getParent();
+void AArch64InstrInfo::instantiateCondBranch(
+    MachineBasicBlock &MBB, DebugLoc DL, MachineBasicBlock *TBB,
+    const SmallVectorImpl<MachineOperand> &Cond) const {
+  if (Cond[0].getImm() != -1) {
+    // Regular Bcc
+    BuildMI(&MBB, DL, get(AArch64::Bcc)).addImm(Cond[0].getImm()).addMBB(TBB);
+  } else {
+    // Folded compare-and-branch
+    const MachineInstrBuilder MIB =
+        BuildMI(&MBB, DL, get(Cond[1].getImm())).addReg(Cond[2].getReg());
+    if (Cond.size() > 3)
+      MIB.addImm(Cond[3].getImm());
+    MIB.addMBB(TBB);
+  }
+}
 
-  unsigned Opcode = MI.getOpcode();
-  switch (Opcode) {
-  case AArch64::TLSDESC_BLRx: {
-    MachineInstr *NewMI =
-      BuildMI(MBB, MBBI, MI.getDebugLoc(), get(AArch64::TLSDESCCALL))
-        .addOperand(MI.getOperand(1));
-    MI.setDesc(get(AArch64::BLRx));
-
-    llvm::finalizeBundle(MBB, NewMI, *++MBBI);
-    return true;
-    }
+unsigned AArch64InstrInfo::InsertBranch(
+    MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB,
+    const SmallVectorImpl<MachineOperand> &Cond, DebugLoc DL) const {
+  // Shouldn't be a fall through.
+  assert(TBB && "InsertBranch must not be told to insert a fallthrough");
+
+  if (!FBB) {
+    if (Cond.empty()) // Unconditional branch?
+      BuildMI(&MBB, DL, get(AArch64::B)).addMBB(TBB);
+    else
+      instantiateCondBranch(MBB, DL, TBB, Cond);
+    return 1;
+  }
+
+  // Two-way conditional branch.
+  instantiateCondBranch(MBB, DL, TBB, Cond);
+  BuildMI(&MBB, DL, get(AArch64::B)).addMBB(FBB);
+  return 2;
+}
+
+// Find the original register that VReg is copied from.
+static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg) {
+  while (TargetRegisterInfo::isVirtualRegister(VReg)) {
+    const MachineInstr *DefMI = MRI.getVRegDef(VReg);
+    if (!DefMI->isFullCopy())
+      return VReg;
+    VReg = DefMI->getOperand(1).getReg();
+  }
+  return VReg;
+}
+
+// Determine if VReg is defined by an instruction that can be folded into a
+// csel instruction. If so, return the folded opcode, and the replacement
+// register.
+static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg,
+                                unsigned *NewVReg = nullptr) {
+  VReg = removeCopies(MRI, VReg);
+  if (!TargetRegisterInfo::isVirtualRegister(VReg))
+    return 0;
+
+  bool Is64Bit = AArch64::GPR64allRegClass.hasSubClassEq(MRI.getRegClass(VReg));
+  const MachineInstr *DefMI = MRI.getVRegDef(VReg);
+  unsigned Opc = 0;
+  unsigned SrcOpNum = 0;
+  switch (DefMI->getOpcode()) {
+  case AArch64::ADDSXri:
+  case AArch64::ADDSWri:
+    // if NZCV is used, do not fold.
+    if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) == -1)
+      return 0;
+  // fall-through to ADDXri and ADDWri.
+  case AArch64::ADDXri:
+  case AArch64::ADDWri:
+    // add x, 1 -> csinc.
+    if (!DefMI->getOperand(2).isImm() || DefMI->getOperand(2).getImm() != 1 ||
+        DefMI->getOperand(3).getImm() != 0)
+      return 0;
+    SrcOpNum = 1;
+    Opc = Is64Bit ? AArch64::CSINCXr : AArch64::CSINCWr;
+    break;
+
+  case AArch64::ORNXrr:
+  case AArch64::ORNWrr: {
+    // not x -> csinv, represented as orn dst, xzr, src.
+    unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
+    if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
+      return 0;
+    SrcOpNum = 2;
+    Opc = Is64Bit ? AArch64::CSINVXr : AArch64::CSINVWr;
+    break;
+  }
+
+  case AArch64::SUBSXrr:
+  case AArch64::SUBSWrr:
+    // if NZCV is used, do not fold.
+    if (DefMI->findRegisterDefOperandIdx(AArch64::NZCV, true) == -1)
+      return 0;
+  // fall-through to SUBXrr and SUBWrr.
+  case AArch64::SUBXrr:
+  case AArch64::SUBWrr: {
+    // neg x -> csneg, represented as sub dst, xzr, src.
+    unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
+    if (ZReg != AArch64::XZR && ZReg != AArch64::WZR)
+      return 0;
+    SrcOpNum = 2;
+    Opc = Is64Bit ? AArch64::CSNEGXr : AArch64::CSNEGWr;
+    break;
+  }
   default:
+    return 0;
+  }
+  assert(Opc && SrcOpNum && "Missing parameters");
+
+  if (NewVReg)
+    *NewVReg = DefMI->getOperand(SrcOpNum).getReg();
+  return Opc;
+}
+
+bool AArch64InstrInfo::canInsertSelect(
+    const MachineBasicBlock &MBB, const SmallVectorImpl<MachineOperand> &Cond,
+    unsigned TrueReg, unsigned FalseReg, int &CondCycles, int &TrueCycles,
+    int &FalseCycles) const {
+  // Check register classes.
+  const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+  const TargetRegisterClass *RC =
+      RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg));
+  if (!RC)
     return false;
+
+  // Expanding cbz/tbz requires an extra cycle of latency on the condition.
+  unsigned ExtraCondLat = Cond.size() != 1;
+
+  // GPRs are handled by csel.
+  // FIXME: Fold in x+1, -x, and ~x when applicable.
+  if (AArch64::GPR64allRegClass.hasSubClassEq(RC) ||
+      AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
+    // Single-cycle csel, csinc, csinv, and csneg.
+    CondCycles = 1 + ExtraCondLat;
+    TrueCycles = FalseCycles = 1;
+    if (canFoldIntoCSel(MRI, TrueReg))
+      TrueCycles = 0;
+    else if (canFoldIntoCSel(MRI, FalseReg))
+      FalseCycles = 0;
+    return true;
   }
 
+  // Scalar floating point is handled by fcsel.
+  // FIXME: Form fabs, fmin, and fmax when applicable.
+  if (AArch64::FPR64RegClass.hasSubClassEq(RC) ||
+      AArch64::FPR32RegClass.hasSubClassEq(RC)) {
+    CondCycles = 5 + ExtraCondLat;
+    TrueCycles = FalseCycles = 2;
+    return true;
+  }
+
+  // Can't do vectors.
   return false;
 }
 
-void
-AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
-                                      MachineBasicBlock::iterator MBBI,
-                                      unsigned SrcReg, bool isKill,
-                                      int FrameIdx,
-                                      const TargetRegisterClass *RC,
-                                      const TargetRegisterInfo *TRI) const {
-  DebugLoc DL = MBB.findDebugLoc(MBBI);
-  MachineFunction &MF = *MBB.getParent();
-  MachineFrameInfo &MFI = *MF.getFrameInfo();
-  unsigned Align = MFI.getObjectAlignment(FrameIdx);
-
-  MachineMemOperand *MMO
-    = MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FrameIdx),
-                              MachineMemOperand::MOStore,
-                              MFI.getObjectSize(FrameIdx),
-                              Align);
-
-  unsigned StoreOp = 0;
-  if (RC->hasType(MVT::i64) || RC->hasType(MVT::i32)) {
-    switch(RC->getSize()) {
-    case 4: StoreOp = AArch64::LS32_STR; break;
-    case 8: StoreOp = AArch64::LS64_STR; break;
+void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB,
+                                    MachineBasicBlock::iterator I, DebugLoc DL,
+                                    unsigned DstReg,
+                                    const SmallVectorImpl<MachineOperand> &Cond,
+                                    unsigned TrueReg, unsigned FalseReg) const {
+  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+
+  // Parse the condition code, see parseCondBranch() above.
+  AArch64CC::CondCode CC;
+  switch (Cond.size()) {
+  default:
+    llvm_unreachable("Unknown condition opcode in Cond");
+  case 1: // b.cc
+    CC = AArch64CC::CondCode(Cond[0].getImm());
+    break;
+  case 3: { // cbz/cbnz
+    // We must insert a compare against 0.
+    bool Is64Bit;
+    switch (Cond[1].getImm()) {
     default:
-      llvm_unreachable("Unknown size for regclass");
-    }
-  } else if (AArch64::FPR8RegClass.hasSubClassEq(RC)) {
-    StoreOp = AArch64::LSFP8_STR;
-  } else if (AArch64::FPR16RegClass.hasSubClassEq(RC)) {
-    StoreOp = AArch64::LSFP16_STR;
-  } else if (RC->hasType(MVT::f32) || RC->hasType(MVT::f64) ||
-             RC->hasType(MVT::f128)) {
-    switch (RC->getSize()) {
-    case 4: StoreOp = AArch64::LSFP32_STR; break;
-    case 8: StoreOp = AArch64::LSFP64_STR; break;
-    case 16: StoreOp = AArch64::LSFP128_STR; break;
+      llvm_unreachable("Unknown branch opcode in Cond");
+    case AArch64::CBZW:
+      Is64Bit = 0;
+      CC = AArch64CC::EQ;
+      break;
+    case AArch64::CBZX:
+      Is64Bit = 1;
+      CC = AArch64CC::EQ;
+      break;
+    case AArch64::CBNZW:
+      Is64Bit = 0;
+      CC = AArch64CC::NE;
+      break;
+    case AArch64::CBNZX:
+      Is64Bit = 1;
+      CC = AArch64CC::NE;
+      break;
+    }
+    unsigned SrcReg = Cond[2].getReg();
+    if (Is64Bit) {
+      // cmp reg, #0 is actually subs xzr, reg, #0.
+      MRI.constrainRegClass(SrcReg, &AArch64::GPR64spRegClass);
+      BuildMI(MBB, I, DL, get(AArch64::SUBSXri), AArch64::XZR)
+          .addReg(SrcReg)
+          .addImm(0)
+          .addImm(0);
+    } else {
+      MRI.constrainRegClass(SrcReg, &AArch64::GPR32spRegClass);
+      BuildMI(MBB, I, DL, get(AArch64::SUBSWri), AArch64::WZR)
+          .addReg(SrcReg)
+          .addImm(0)
+          .addImm(0);
+    }
+    break;
+  }
+  case 4: { // tbz/tbnz
+    // We must insert a tst instruction.
+    switch (Cond[1].getImm()) {
     default:
-      llvm_unreachable("Unknown size for regclass");
-    }
-  } else { // For a super register class has more than one sub registers
-    if (AArch64::DPairRegClass.hasSubClassEq(RC))
-      StoreOp = AArch64::ST1x2_8B;
-    else if (AArch64::DTripleRegClass.hasSubClassEq(RC))
-      StoreOp = AArch64::ST1x3_8B;
-    else if (AArch64::DQuadRegClass.hasSubClassEq(RC))
-      StoreOp = AArch64::ST1x4_8B;
-    else if (AArch64::QPairRegClass.hasSubClassEq(RC))
-      StoreOp = AArch64::ST1x2_16B;
-    else if (AArch64::QTripleRegClass.hasSubClassEq(RC))
-      StoreOp = AArch64::ST1x3_16B;
-    else if (AArch64::QQuadRegClass.hasSubClassEq(RC))
-      StoreOp = AArch64::ST1x4_16B;
+      llvm_unreachable("Unknown branch opcode in Cond");
+    case AArch64::TBZW:
+    case AArch64::TBZX:
+      CC = AArch64CC::EQ;
+      break;
+    case AArch64::TBNZW:
+    case AArch64::TBNZX:
+      CC = AArch64CC::NE;
+      break;
+    }
+    // cmp reg, #foo is actually ands xzr, reg, #1<<foo.
+    if (Cond[1].getImm() == AArch64::TBZW || Cond[1].getImm() == AArch64::TBNZW)
+      BuildMI(MBB, I, DL, get(AArch64::ANDSWri), AArch64::WZR)
+          .addReg(Cond[2].getReg())
+          .addImm(
+              AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 32));
     else
-      llvm_unreachable("Unknown reg class");
+      BuildMI(MBB, I, DL, get(AArch64::ANDSXri), AArch64::XZR)
+          .addReg(Cond[2].getReg())
+          .addImm(
+              AArch64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 64));
+    break;
+  }
+  }
 
-    MachineInstrBuilder NewMI = BuildMI(MBB, MBBI, DL, get(StoreOp));
-    // Vector store has different operands from other store instructions.
-    NewMI.addFrameIndex(FrameIdx)
-         .addReg(SrcReg, getKillRegState(isKill))
-         .addMemOperand(MMO);
-    return;
+  unsigned Opc = 0;
+  const TargetRegisterClass *RC = nullptr;
+  bool TryFold = false;
+  if (MRI.constrainRegClass(DstReg, &AArch64::GPR64RegClass)) {
+    RC = &AArch64::GPR64RegClass;
+    Opc = AArch64::CSELXr;
+    TryFold = true;
+  } else if (MRI.constrainRegClass(DstReg, &AArch64::GPR32RegClass)) {
+    RC = &AArch64::GPR32RegClass;
+    Opc = AArch64::CSELWr;
+    TryFold = true;
+  } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR64RegClass)) {
+    RC = &AArch64::FPR64RegClass;
+    Opc = AArch64::FCSELDrrr;
+  } else if (MRI.constrainRegClass(DstReg, &AArch64::FPR32RegClass)) {
+    RC = &AArch64::FPR32RegClass;
+    Opc = AArch64::FCSELSrrr;
+  }
+  assert(RC && "Unsupported regclass");
+
+  // Try folding simple instructions into the csel.
+  if (TryFold) {
+    unsigned NewVReg = 0;
+    unsigned FoldedOpc = canFoldIntoCSel(MRI, TrueReg, &NewVReg);
+    if (FoldedOpc) {
+      // The folded opcodes csinc, csinc and csneg apply the operation to
+      // FalseReg, so we need to invert the condition.
+      CC = AArch64CC::getInvertedCondCode(CC);
+      TrueReg = FalseReg;
+    } else
+      FoldedOpc = canFoldIntoCSel(MRI, FalseReg, &NewVReg);
+
+    // Fold the operation. Leave any dead instructions for DCE to clean up.
+    if (FoldedOpc) {
+      FalseReg = NewVReg;
+      Opc = FoldedOpc;
+      // The extends the live range of NewVReg.
+      MRI.clearKillFlags(NewVReg);
+    }
   }
 
-  MachineInstrBuilder NewMI = BuildMI(MBB, MBBI, DL, get(StoreOp));
-  NewMI.addReg(SrcReg, getKillRegState(isKill))
-    .addFrameIndex(FrameIdx)
-    .addImm(0)
-    .addMemOperand(MMO);
+  // Pull all virtual register into the appropriate class.
+  MRI.constrainRegClass(TrueReg, RC);
+  MRI.constrainRegClass(FalseReg, RC);
 
+  // Insert the csel.
+  BuildMI(MBB, I, DL, get(Opc), DstReg).addReg(TrueReg).addReg(FalseReg).addImm(
+      CC);
 }
 
-void
-AArch64InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
-                                       MachineBasicBlock::iterator MBBI,
-                                       unsigned DestReg, int FrameIdx,
-                                       const TargetRegisterClass *RC,
-                                       const TargetRegisterInfo *TRI) const {
-  DebugLoc DL = MBB.findDebugLoc(MBBI);
-  MachineFunction &MF = *MBB.getParent();
-  MachineFrameInfo &MFI = *MF.getFrameInfo();
-  unsigned Align = MFI.getObjectAlignment(FrameIdx);
-
-  MachineMemOperand *MMO
-    = MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FrameIdx),
-                              MachineMemOperand::MOLoad,
-                              MFI.getObjectSize(FrameIdx),
-                              Align);
-
-  unsigned LoadOp = 0;
-  if (RC->hasType(MVT::i64) || RC->hasType(MVT::i32)) {
-    switch(RC->getSize()) {
-    case 4: LoadOp = AArch64::LS32_LDR; break;
-    case 8: LoadOp = AArch64::LS64_LDR; break;
-    default:
-      llvm_unreachable("Unknown size for regclass");
-    }
-  } else if (AArch64::FPR8RegClass.hasSubClassEq(RC)) {
-    LoadOp = AArch64::LSFP8_LDR;
-  } else if (AArch64::FPR16RegClass.hasSubClassEq(RC)) {
-    LoadOp = AArch64::LSFP16_LDR;
-  } else if (RC->hasType(MVT::f32) || RC->hasType(MVT::f64) ||
-             RC->hasType(MVT::f128)) {
-    switch (RC->getSize()) {
-    case 4: LoadOp = AArch64::LSFP32_LDR; break;
-    case 8: LoadOp = AArch64::LSFP64_LDR; break;
-    case 16: LoadOp = AArch64::LSFP128_LDR; break;
-    default:
-      llvm_unreachable("Unknown size for regclass");
-    }
-  } else { // For a super register class has more than one sub registers
-    if (AArch64::DPairRegClass.hasSubClassEq(RC))
-      LoadOp = AArch64::LD1x2_8B;
-    else if (AArch64::DTripleRegClass.hasSubClassEq(RC))
-      LoadOp = AArch64::LD1x3_8B;
-    else if (AArch64::DQuadRegClass.hasSubClassEq(RC))
-      LoadOp = AArch64::LD1x4_8B;
-    else if (AArch64::QPairRegClass.hasSubClassEq(RC))
-      LoadOp = AArch64::LD1x2_16B;
-    else if (AArch64::QTripleRegClass.hasSubClassEq(RC))
-      LoadOp = AArch64::LD1x3_16B;
-    else if (AArch64::QQuadRegClass.hasSubClassEq(RC))
-      LoadOp = AArch64::LD1x4_16B;
-    else
-      llvm_unreachable("Unknown reg class");
+bool AArch64InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
+                                             unsigned &SrcReg, unsigned &DstReg,
+                                             unsigned &SubIdx) const {
+  switch (MI.getOpcode()) {
+  default:
+    return false;
+  case AArch64::SBFMXri: // aka sxtw
+  case AArch64::UBFMXri: // aka uxtw
+    // Check for the 32 -> 64 bit extension case, these instructions can do
+    // much more.
+    if (MI.getOperand(2).getImm() != 0 || MI.getOperand(3).getImm() != 31)
+      return false;
+    // This is a signed or unsigned 32 -> 64 bit extension.
+    SrcReg = MI.getOperand(1).getReg();
+    DstReg = MI.getOperand(0).getReg();
+    SubIdx = AArch64::sub_32;
+    return true;
+  }
+}
 
-    MachineInstrBuilder NewMI = BuildMI(MBB, MBBI, DL, get(LoadOp), DestReg);
-    // Vector load has different operands from other load instructions.
-    NewMI.addFrameIndex(FrameIdx)
-         .addMemOperand(MMO);
-    return;
+/// analyzeCompare - For a comparison instruction, return the source registers
+/// in SrcReg and SrcReg2, and the value it compares against in CmpValue.
+/// Return true if the comparison instruction can be analyzed.
+bool AArch64InstrInfo::analyzeCompare(const MachineInstr *MI, unsigned &SrcReg,
+                                      unsigned &SrcReg2, int &CmpMask,
+                                      int &CmpValue) const {
+  switch (MI->getOpcode()) {
+  default:
+    break;
+  case AArch64::SUBSWrr:
+  case AArch64::SUBSWrs:
+  case AArch64::SUBSWrx:
+  case AArch64::SUBSXrr:
+  case AArch64::SUBSXrs:
+  case AArch64::SUBSXrx:
+  case AArch64::ADDSWrr:
+  case AArch64::ADDSWrs:
+  case AArch64::ADDSWrx:
+  case AArch64::ADDSXrr:
+  case AArch64::ADDSXrs:
+  case AArch64::ADDSXrx:
+    // Replace SUBSWrr with SUBWrr if NZCV is not used.
+    SrcReg = MI->getOperand(1).getReg();
+    SrcReg2 = MI->getOperand(2).getReg();
+    CmpMask = ~0;
+    CmpValue = 0;
+    return true;
+  case AArch64::SUBSWri:
+  case AArch64::ADDSWri:
+  case AArch64::SUBSXri:
+  case AArch64::ADDSXri:
+    SrcReg = MI->getOperand(1).getReg();
+    SrcReg2 = 0;
+    CmpMask = ~0;
+    CmpValue = MI->getOperand(2).getImm();
+    return true;
+  case AArch64::ANDSWri:
+  case AArch64::ANDSXri:
+    // ANDS does not use the same encoding scheme as the others xxxS
+    // instructions.
+    SrcReg = MI->getOperand(1).getReg();
+    SrcReg2 = 0;
+    CmpMask = ~0;
+    CmpValue = AArch64_AM::decodeLogicalImmediate(
+        MI->getOperand(2).getImm(),
+        MI->getOpcode() == AArch64::ANDSWri ? 32 : 64);
+    return true;
+  }
+
+  return false;
+}
+
+static bool UpdateOperandRegClass(MachineInstr *Instr) {
+  MachineBasicBlock *MBB = Instr->getParent();
+  assert(MBB && "Can't get MachineBasicBlock here");
+  MachineFunction *MF = MBB->getParent();
+  assert(MF && "Can't get MachineFunction here");
+  const TargetMachine *TM = &MF->getTarget();
+  const TargetInstrInfo *TII = TM->getInstrInfo();
+  const TargetRegisterInfo *TRI = TM->getRegisterInfo();
+  MachineRegisterInfo *MRI = &MF->getRegInfo();
+
+  for (unsigned OpIdx = 0, EndIdx = Instr->getNumOperands(); OpIdx < EndIdx;
+       ++OpIdx) {
+    MachineOperand &MO = Instr->getOperand(OpIdx);
+    const TargetRegisterClass *OpRegCstraints =
+        Instr->getRegClassConstraint(OpIdx, TII, TRI);
+
+    // If there's no constraint, there's nothing to do.
+    if (!OpRegCstraints)
+      continue;
+    // If the operand is a frame index, there's nothing to do here.
+    // A frame index operand will resolve correctly during PEI.
+    if (MO.isFI())
+      continue;
+
+    assert(MO.isReg() &&
+           "Operand has register constraints without being a register!");
+
+    unsigned Reg = MO.getReg();
+    if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
+      if (!OpRegCstraints->contains(Reg))
+        return false;
+    } else if (!OpRegCstraints->hasSubClassEq(MRI->getRegClass(Reg)) &&
+               !MRI->constrainRegClass(Reg, OpRegCstraints))
+      return false;
   }
 
-  MachineInstrBuilder NewMI = BuildMI(MBB, MBBI, DL, get(LoadOp), DestReg);
-  NewMI.addFrameIndex(FrameIdx)
-       .addImm(0)
-       .addMemOperand(MMO);
+  return true;
 }
 
-unsigned AArch64InstrInfo::estimateRSStackLimit(MachineFunction &MF) const {
-  unsigned Limit = (1 << 16) - 1;
-  for (MachineFunction::iterator BB = MF.begin(),E = MF.end(); BB != E; ++BB) {
-    for (MachineBasicBlock::iterator I = BB->begin(), E = BB->end();
-         I != E; ++I) {
-      for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
-        if (!I->getOperand(i).isFI()) continue;
+/// optimizeCompareInstr - Convert the instruction supplying the argument to the
+/// comparison into one that sets the zero bit in the flags register.
+bool AArch64InstrInfo::optimizeCompareInstr(
+    MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2, int CmpMask,
+    int CmpValue, const MachineRegisterInfo *MRI) const {
+
+  // Replace SUBSWrr with SUBWrr if NZCV is not used.
+  int Cmp_NZCV = CmpInstr->findRegisterDefOperandIdx(AArch64::NZCV, true);
+  if (Cmp_NZCV != -1) {
+    unsigned NewOpc;
+    switch (CmpInstr->getOpcode()) {
+    default:
+      return false;
+    case AArch64::ADDSWrr:      NewOpc = AArch64::ADDWrr; break;
+    case AArch64::ADDSWri:      NewOpc = AArch64::ADDWri; break;
+    case AArch64::ADDSWrs:      NewOpc = AArch64::ADDWrs; break;
+    case AArch64::ADDSWrx:      NewOpc = AArch64::ADDWrx; break;
+    case AArch64::ADDSXrr:      NewOpc = AArch64::ADDXrr; break;
+    case AArch64::ADDSXri:      NewOpc = AArch64::ADDXri; break;
+    case AArch64::ADDSXrs:      NewOpc = AArch64::ADDXrs; break;
+    case AArch64::ADDSXrx:      NewOpc = AArch64::ADDXrx; break;
+    case AArch64::SUBSWrr:      NewOpc = AArch64::SUBWrr; break;
+    case AArch64::SUBSWri:      NewOpc = AArch64::SUBWri; break;
+    case AArch64::SUBSWrs:      NewOpc = AArch64::SUBWrs; break;
+    case AArch64::SUBSWrx:      NewOpc = AArch64::SUBWrx; break;
+    case AArch64::SUBSXrr:      NewOpc = AArch64::SUBXrr; break;
+    case AArch64::SUBSXri:      NewOpc = AArch64::SUBXri; break;
+    case AArch64::SUBSXrs:      NewOpc = AArch64::SUBXrs; break;
+    case AArch64::SUBSXrx:      NewOpc = AArch64::SUBXrx; break;
+    }
+
+    const MCInstrDesc &MCID = get(NewOpc);
+    CmpInstr->setDesc(MCID);
+    CmpInstr->RemoveOperand(Cmp_NZCV);
+    bool succeeded = UpdateOperandRegClass(CmpInstr);
+    (void)succeeded;
+    assert(succeeded && "Some operands reg class are incompatible!");
+    return true;
+  }
+
+  // Continue only if we have a "ri" where immediate is zero.
+  if (CmpValue != 0 || SrcReg2 != 0)
+    return false;
+
+  // CmpInstr is a Compare instruction if destination register is not used.
+  if (!MRI->use_nodbg_empty(CmpInstr->getOperand(0).getReg()))
+    return false;
+
+  // Get the unique definition of SrcReg.
+  MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg);
+  if (!MI)
+    return false;
+
+  // We iterate backward, starting from the instruction before CmpInstr and
+  // stop when reaching the definition of the source register or done with the
+  // basic block, to check whether NZCV is used or modified in between.
+  MachineBasicBlock::iterator I = CmpInstr, E = MI,
+                              B = CmpInstr->getParent()->begin();
+
+  // Early exit if CmpInstr is at the beginning of the BB.
+  if (I == B)
+    return false;
+
+  // Check whether the definition of SrcReg is in the same basic block as
+  // Compare. If not, we can't optimize away the Compare.
+  if (MI->getParent() != CmpInstr->getParent())
+    return false;
+
+  // Check that NZCV isn't set between the comparison instruction and the one we
+  // want to change.
+  const TargetRegisterInfo *TRI = &getRegisterInfo();
+  for (--I; I != E; --I) {
+    const MachineInstr &Instr = *I;
 
-        // When using ADDxxi_lsl0_s to get the address of a stack object, 0xfff
-        // is the largest offset guaranteed to fit in the immediate offset.
-        if (I->getOpcode() == AArch64::ADDxxi_lsl0_s) {
-          Limit = std::min(Limit, 0xfffu);
-          break;
-        }
+    if (Instr.modifiesRegister(AArch64::NZCV, TRI) ||
+        Instr.readsRegister(AArch64::NZCV, TRI))
+      // This instruction modifies or uses NZCV after the one we want to
+      // change. We can't do this transformation.
+      return false;
+    if (I == B)
+      // The 'and' is below the comparison instruction.
+      return false;
+  }
+
+  unsigned NewOpc = MI->getOpcode();
+  switch (MI->getOpcode()) {
+  default:
+    return false;
+  case AArch64::ADDSWrr:
+  case AArch64::ADDSWri:
+  case AArch64::ADDSXrr:
+  case AArch64::ADDSXri:
+  case AArch64::SUBSWrr:
+  case AArch64::SUBSWri:
+  case AArch64::SUBSXrr:
+  case AArch64::SUBSXri:
+    break;
+  case AArch64::ADDWrr:    NewOpc = AArch64::ADDSWrr; break;
+  case AArch64::ADDWri:    NewOpc = AArch64::ADDSWri; break;
+  case AArch64::ADDXrr:    NewOpc = AArch64::ADDSXrr; break;
+  case AArch64::ADDXri:    NewOpc = AArch64::ADDSXri; break;
+  case AArch64::ADCWr:     NewOpc = AArch64::ADCSWr; break;
+  case AArch64::ADCXr:     NewOpc = AArch64::ADCSXr; break;
+  case AArch64::SUBWrr:    NewOpc = AArch64::SUBSWrr; break;
+  case AArch64::SUBWri:    NewOpc = AArch64::SUBSWri; break;
+  case AArch64::SUBXrr:    NewOpc = AArch64::SUBSXrr; break;
+  case AArch64::SUBXri:    NewOpc = AArch64::SUBSXri; break;
+  case AArch64::SBCWr:     NewOpc = AArch64::SBCSWr; break;
+  case AArch64::SBCXr:     NewOpc = AArch64::SBCSXr; break;
+  case AArch64::ANDWri:    NewOpc = AArch64::ANDSWri; break;
+  case AArch64::ANDXri:    NewOpc = AArch64::ANDSXri; break;
+  }
 
-        int AccessScale, MinOffset, MaxOffset;
-        getAddressConstraints(*I, AccessScale, MinOffset, MaxOffset);
-        Limit = std::min(Limit, static_cast<unsigned>(MaxOffset));
+  // Scan forward for the use of NZCV.
+  // When checking against MI: if it's a conditional code requires
+  // checking of V bit, then this is not safe to do.
+  // It is safe to remove CmpInstr if NZCV is redefined or killed.
+  // If we are done with the basic block, we need to check whether NZCV is
+  // live-out.
+  bool IsSafe = false;
+  for (MachineBasicBlock::iterator I = CmpInstr,
+                                   E = CmpInstr->getParent()->end();
+       !IsSafe && ++I != E;) {
+    const MachineInstr &Instr = *I;
+    for (unsigned IO = 0, EO = Instr.getNumOperands(); !IsSafe && IO != EO;
+         ++IO) {
+      const MachineOperand &MO = Instr.getOperand(IO);
+      if (MO.isRegMask() && MO.clobbersPhysReg(AArch64::NZCV)) {
+        IsSafe = true;
+        break;
+      }
+      if (!MO.isReg() || MO.getReg() != AArch64::NZCV)
+        continue;
+      if (MO.isDef()) {
+        IsSafe = true;
+        break;
+      }
 
-        break; // At most one FI per instruction
+      // Decode the condition code.
+      unsigned Opc = Instr.getOpcode();
+      AArch64CC::CondCode CC;
+      switch (Opc) {
+      default:
+        return false;
+      case AArch64::Bcc:
+        CC = (AArch64CC::CondCode)Instr.getOperand(IO - 2).getImm();
+        break;
+      case AArch64::CSINVWr:
+      case AArch64::CSINVXr:
+      case AArch64::CSINCWr:
+      case AArch64::CSINCXr:
+      case AArch64::CSELWr:
+      case AArch64::CSELXr:
+      case AArch64::CSNEGWr:
+      case AArch64::CSNEGXr:
+      case AArch64::FCSELSrrr:
+      case AArch64::FCSELDrrr:
+        CC = (AArch64CC::CondCode)Instr.getOperand(IO - 1).getImm();
+        break;
+      }
+
+      // It is not safe to remove Compare instruction if Overflow(V) is used.
+      switch (CC) {
+      default:
+        // NZCV can be used multiple times, we should continue.
+        break;
+      case AArch64CC::VS:
+      case AArch64CC::VC:
+      case AArch64CC::GE:
+      case AArch64CC::LT:
+      case AArch64CC::GT:
+      case AArch64CC::LE:
+        return false;
       }
     }
   }
 
-  return Limit;
+  // If NZCV is not killed nor re-defined, we should check whether it is
+  // live-out. If it is live-out, do not optimize.
+  if (!IsSafe) {
+    MachineBasicBlock *ParentBlock = CmpInstr->getParent();
+    for (auto *MBB : ParentBlock->successors())
+      if (MBB->isLiveIn(AArch64::NZCV))
+        return false;
+  }
+
+  // Update the instruction to set NZCV.
+  MI->setDesc(get(NewOpc));
+  CmpInstr->eraseFromParent();
+  bool succeeded = UpdateOperandRegClass(MI);
+  (void)succeeded;
+  assert(succeeded && "Some operands reg class are incompatible!");
+  MI->addRegisterDefined(AArch64::NZCV, TRI);
+  return true;
 }
-void AArch64InstrInfo::getAddressConstraints(const MachineInstr &MI,
-                                             int &AccessScale, int &MinOffset,
-                                             int &MaxOffset) const {
-  switch (MI.getOpcode()) {
+
+/// Return true if this is this instruction has a non-zero immediate
+bool AArch64InstrInfo::hasShiftedReg(const MachineInstr *MI) const {
+  switch (MI->getOpcode()) {
   default:
-    llvm_unreachable("Unknown load/store kind");
-  case TargetOpcode::DBG_VALUE:
-    AccessScale = 1;
-    MinOffset = INT_MIN;
-    MaxOffset = INT_MAX;
-    return;
-  case AArch64::LS8_LDR: case AArch64::LS8_STR:
-  case AArch64::LSFP8_LDR: case AArch64::LSFP8_STR:
-  case AArch64::LDRSBw:
-  case AArch64::LDRSBx:
-    AccessScale = 1;
-    MinOffset = 0;
-    MaxOffset = 0xfff;
-    return;
-  case AArch64::LS16_LDR: case AArch64::LS16_STR:
-  case AArch64::LSFP16_LDR: case AArch64::LSFP16_STR:
-  case AArch64::LDRSHw:
-  case AArch64::LDRSHx:
-    AccessScale = 2;
-    MinOffset = 0;
-    MaxOffset = 0xfff * AccessScale;
-    return;
-  case AArch64::LS32_LDR:  case AArch64::LS32_STR:
-  case AArch64::LSFP32_LDR: case AArch64::LSFP32_STR:
-  case AArch64::LDRSWx:
-  case AArch64::LDPSWx:
-    AccessScale = 4;
-    MinOffset = 0;
-    MaxOffset = 0xfff * AccessScale;
-    return;
-  case AArch64::LS64_LDR: case AArch64::LS64_STR:
-  case AArch64::LSFP64_LDR: case AArch64::LSFP64_STR:
-  case AArch64::PRFM:
-    AccessScale = 8;
-    MinOffset = 0;
-    MaxOffset = 0xfff * AccessScale;
-    return;
-  case AArch64::LSFP128_LDR: case AArch64::LSFP128_STR:
-    AccessScale = 16;
-    MinOffset = 0;
-    MaxOffset = 0xfff * AccessScale;
-    return;
-  case AArch64::LSPair32_LDR: case AArch64::LSPair32_STR:
-  case AArch64::LSFPPair32_LDR: case AArch64::LSFPPair32_STR:
-    AccessScale = 4;
-    MinOffset = -0x40 * AccessScale;
-    MaxOffset = 0x3f * AccessScale;
-    return;
-  case AArch64::LSPair64_LDR: case AArch64::LSPair64_STR:
-  case AArch64::LSFPPair64_LDR: case AArch64::LSFPPair64_STR:
-    AccessScale = 8;
-    MinOffset = -0x40 * AccessScale;
-    MaxOffset = 0x3f * AccessScale;
-    return;
-  case AArch64::LSFPPair128_LDR: case AArch64::LSFPPair128_STR:
-    AccessScale = 16;
-    MinOffset = -0x40 * AccessScale;
-    MaxOffset = 0x3f * AccessScale;
-    return;
-  case AArch64::LD1x2_8B: case AArch64::ST1x2_8B:
-    AccessScale = 16;
-    MinOffset = 0;
-    MaxOffset = 0xfff * AccessScale;
-    return;
-  case AArch64::LD1x3_8B: case AArch64::ST1x3_8B:
-    AccessScale = 24;
-    MinOffset = 0;
-    MaxOffset = 0xfff * AccessScale;
-    return;
-  case AArch64::LD1x4_8B: case AArch64::ST1x4_8B:
-  case AArch64::LD1x2_16B: case AArch64::ST1x2_16B:
-    AccessScale = 32;
-    MinOffset = 0;
-    MaxOffset = 0xfff * AccessScale;
-    return;
-  case AArch64::LD1x3_16B: case AArch64::ST1x3_16B:
-    AccessScale = 48;
-    MinOffset = 0;
-    MaxOffset = 0xfff * AccessScale;
-    return;
-  case AArch64::LD1x4_16B: case AArch64::ST1x4_16B:
-    AccessScale = 64;
-    MinOffset = 0;
-    MaxOffset = 0xfff * AccessScale;
-    return;
+    break;
+  case AArch64::ADDSWrs:
+  case AArch64::ADDSXrs:
+  case AArch64::ADDWrs:
+  case AArch64::ADDXrs:
+  case AArch64::ANDSWrs:
+  case AArch64::ANDSXrs:
+  case AArch64::ANDWrs:
+  case AArch64::ANDXrs:
+  case AArch64::BICSWrs:
+  case AArch64::BICSXrs:
+  case AArch64::BICWrs:
+  case AArch64::BICXrs:
+  case AArch64::CRC32Brr:
+  case AArch64::CRC32CBrr:
+  case AArch64::CRC32CHrr:
+  case AArch64::CRC32CWrr:
+  case AArch64::CRC32CXrr:
+  case AArch64::CRC32Hrr:
+  case AArch64::CRC32Wrr:
+  case AArch64::CRC32Xrr:
+  case AArch64::EONWrs:
+  case AArch64::EONXrs:
+  case AArch64::EORWrs:
+  case AArch64::EORXrs:
+  case AArch64::ORNWrs:
+  case AArch64::ORNXrs:
+  case AArch64::ORRWrs:
+  case AArch64::ORRXrs:
+  case AArch64::SUBSWrs:
+  case AArch64::SUBSXrs:
+  case AArch64::SUBWrs:
+  case AArch64::SUBXrs:
+    if (MI->getOperand(3).isImm()) {
+      unsigned val = MI->getOperand(3).getImm();
+      return (val != 0);
+    }
+    break;
   }
+  return false;
 }
 
-unsigned AArch64InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
-  const MCInstrDesc &MCID = MI.getDesc();
-  const MachineBasicBlock &MBB = *MI.getParent();
-  const MachineFunction &MF = *MBB.getParent();
-  const MCAsmInfo &MAI = *MF.getTarget().getMCAsmInfo();
+/// Return true if this is this instruction has a non-zero immediate
+bool AArch64InstrInfo::hasExtendedReg(const MachineInstr *MI) const {
+  switch (MI->getOpcode()) {
+  default:
+    break;
+  case AArch64::ADDSWrx:
+  case AArch64::ADDSXrx:
+  case AArch64::ADDSXrx64:
+  case AArch64::ADDWrx:
+  case AArch64::ADDXrx:
+  case AArch64::ADDXrx64:
+  case AArch64::SUBSWrx:
+  case AArch64::SUBSXrx:
+  case AArch64::SUBSXrx64:
+  case AArch64::SUBWrx:
+  case AArch64::SUBXrx:
+  case AArch64::SUBXrx64:
+    if (MI->getOperand(3).isImm()) {
+      unsigned val = MI->getOperand(3).getImm();
+      return (val != 0);
+    }
+    break;
+  }
 
-  if (MCID.getSize())
-    return MCID.getSize();
+  return false;
+}
 
-  if (MI.getOpcode() == AArch64::INLINEASM)
-    return getInlineAsmLength(MI.getOperand(0).getSymbolName(), MAI);
+// Return true if this instruction simply sets its single destination register
+// to zero. This is equivalent to a register rename of the zero-register.
+bool AArch64InstrInfo::isGPRZero(const MachineInstr *MI) const {
+  switch (MI->getOpcode()) {
+  default:
+    break;
+  case AArch64::MOVZWi:
+  case AArch64::MOVZXi: // movz Rd, #0 (LSL #0)
+    if (MI->getOperand(1).isImm() && MI->getOperand(1).getImm() == 0) {
+      assert(MI->getDesc().getNumOperands() == 3 &&
+             MI->getOperand(2).getImm() == 0 && "invalid MOVZi operands");
+      return true;
+    }
+    break;
+  case AArch64::ANDWri: // and Rd, Rzr, #imm
+    return MI->getOperand(1).getReg() == AArch64::WZR;
+  case AArch64::ANDXri:
+    return MI->getOperand(1).getReg() == AArch64::XZR;
+  case TargetOpcode::COPY:
+    return MI->getOperand(1).getReg() == AArch64::WZR;
+  }
+  return false;
+}
 
-  switch (MI.getOpcode()) {
-  case TargetOpcode::BUNDLE:
-    return getInstBundleLength(MI);
-  case TargetOpcode::IMPLICIT_DEF:
-  case TargetOpcode::KILL:
-  case TargetOpcode::CFI_INSTRUCTION:
-  case TargetOpcode::EH_LABEL:
-  case TargetOpcode::GC_LABEL:
-  case TargetOpcode::DBG_VALUE:
-  case AArch64::TLSDESCCALL:
-    return 0;
+// Return true if this instruction simply renames a general register without
+// modifying bits.
+bool AArch64InstrInfo::isGPRCopy(const MachineInstr *MI) const {
+  switch (MI->getOpcode()) {
   default:
-    llvm_unreachable("Unknown instruction class");
+    break;
+  case TargetOpcode::COPY: {
+    // GPR32 copies will by lowered to ORRXrs
+    unsigned DstReg = MI->getOperand(0).getReg();
+    return (AArch64::GPR32RegClass.contains(DstReg) ||
+            AArch64::GPR64RegClass.contains(DstReg));
   }
+  case AArch64::ORRXrs: // orr Xd, Xzr, Xm (LSL #0)
+    if (MI->getOperand(1).getReg() == AArch64::XZR) {
+      assert(MI->getDesc().getNumOperands() == 4 &&
+             MI->getOperand(3).getImm() == 0 && "invalid ORRrs operands");
+      return true;
+    }
+  case AArch64::ADDXri: // add Xd, Xn, #0 (LSL #0)
+    if (MI->getOperand(2).getImm() == 0) {
+      assert(MI->getDesc().getNumOperands() == 4 &&
+             MI->getOperand(3).getImm() == 0 && "invalid ADDXri operands");
+      return true;
+    }
+  }
+  return false;
 }
 
-unsigned AArch64InstrInfo::getInstBundleLength(const MachineInstr &MI) const {
-  unsigned Size = 0;
-  MachineBasicBlock::const_instr_iterator I = MI;
-  MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
-  while (++I != E && I->isInsideBundle()) {
-    assert(!I->isBundle() && "No nested bundle!");
-    Size += getInstSizeInBytes(*I);
+// Return true if this instruction simply renames a general register without
+// modifying bits.
+bool AArch64InstrInfo::isFPRCopy(const MachineInstr *MI) const {
+  switch (MI->getOpcode()) {
+  default:
+    break;
+  case TargetOpcode::COPY: {
+    // FPR64 copies will by lowered to ORR.16b
+    unsigned DstReg = MI->getOperand(0).getReg();
+    return (AArch64::FPR64RegClass.contains(DstReg) ||
+            AArch64::FPR128RegClass.contains(DstReg));
   }
-  return Size;
+  case AArch64::ORRv16i8:
+    if (MI->getOperand(1).getReg() == MI->getOperand(2).getReg()) {
+      assert(MI->getDesc().getNumOperands() == 3 && MI->getOperand(0).isReg() &&
+             "invalid ORRv16i8 operands");
+      return true;
+    }
+  }
+  return false;
 }
 
-bool llvm::rewriteA64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
-                                unsigned FrameReg, int &Offset,
-                                const AArch64InstrInfo &TII) {
-  MachineBasicBlock &MBB = *MI.getParent();
-  MachineFunction &MF = *MBB.getParent();
-  MachineFrameInfo &MFI = *MF.getFrameInfo();
+unsigned AArch64InstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
+                                               int &FrameIndex) const {
+  switch (MI->getOpcode()) {
+  default:
+    break;
+  case AArch64::LDRWui:
+  case AArch64::LDRXui:
+  case AArch64::LDRBui:
+  case AArch64::LDRHui:
+  case AArch64::LDRSui:
+  case AArch64::LDRDui:
+  case AArch64::LDRQui:
+    if (MI->getOperand(0).getSubReg() == 0 && MI->getOperand(1).isFI() &&
+        MI->getOperand(2).isImm() && MI->getOperand(2).getImm() == 0) {
+      FrameIndex = MI->getOperand(1).getIndex();
+      return MI->getOperand(0).getReg();
+    }
+    break;
+  }
 
-  MFI.getObjectOffset(FrameRegIdx);
-  llvm_unreachable("Unimplemented rewriteFrameIndex");
+  return 0;
 }
 
-void llvm::emitRegUpdate(MachineBasicBlock &MBB,
-                         MachineBasicBlock::iterator MBBI,
-                         DebugLoc dl, const TargetInstrInfo &TII,
-                         unsigned DstReg, unsigned SrcReg, unsigned ScratchReg,
-                         int64_t NumBytes, MachineInstr::MIFlag MIFlags) {
-  if (NumBytes == 0 && DstReg == SrcReg)
-    return;
-  else if (abs64(NumBytes) & ~0xffffff) {
-    // Generically, we have to materialize the offset into a temporary register
-    // and subtract it. There are a couple of ways this could be done, for now
-    // we'll use a movz/movk or movn/movk sequence.
-    uint64_t Bits = static_cast<uint64_t>(abs64(NumBytes));
-    BuildMI(MBB, MBBI, dl, TII.get(AArch64::MOVZxii), ScratchReg)
-      .addImm(0xffff & Bits).addImm(0)
-      .setMIFlags(MIFlags);
-
-    Bits >>= 16;
-    if (Bits & 0xffff) {
-      BuildMI(MBB, MBBI, dl, TII.get(AArch64::MOVKxii), ScratchReg)
-        .addReg(ScratchReg)
-        .addImm(0xffff & Bits).addImm(1)
-        .setMIFlags(MIFlags);
-    }
-
-    Bits >>= 16;
-    if (Bits & 0xffff) {
-      BuildMI(MBB, MBBI, dl, TII.get(AArch64::MOVKxii), ScratchReg)
-        .addReg(ScratchReg)
-        .addImm(0xffff & Bits).addImm(2)
-        .setMIFlags(MIFlags);
-    }
-
-    Bits >>= 16;
-    if (Bits & 0xffff) {
-      BuildMI(MBB, MBBI, dl, TII.get(AArch64::MOVKxii), ScratchReg)
-        .addReg(ScratchReg)
-        .addImm(0xffff & Bits).addImm(3)
-        .setMIFlags(MIFlags);
-    }
-
-    // ADD DST, SRC, xTMP (, lsl #0)
-    unsigned AddOp = NumBytes > 0 ? AArch64::ADDxxx_uxtx : AArch64::SUBxxx_uxtx;
-    BuildMI(MBB, MBBI, dl, TII.get(AddOp), DstReg)
-      .addReg(SrcReg, RegState::Kill)
-      .addReg(ScratchReg, RegState::Kill)
-      .addImm(0)
-      .setMIFlag(MIFlags);
-    return;
+unsigned AArch64InstrInfo::isStoreToStackSlot(const MachineInstr *MI,
+                                              int &FrameIndex) const {
+  switch (MI->getOpcode()) {
+  default:
+    break;
+  case AArch64::STRWui:
+  case AArch64::STRXui:
+  case AArch64::STRBui:
+  case AArch64::STRHui:
+  case AArch64::STRSui:
+  case AArch64::STRDui:
+  case AArch64::STRQui:
+    if (MI->getOperand(0).getSubReg() == 0 && MI->getOperand(1).isFI() &&
+        MI->getOperand(2).isImm() && MI->getOperand(2).getImm() == 0) {
+      FrameIndex = MI->getOperand(1).getIndex();
+      return MI->getOperand(0).getReg();
+    }
+    break;
   }
+  return 0;
+}
 
-  // Now we know that the adjustment can be done in at most two add/sub
-  // (immediate) instructions, which is always more efficient than a
-  // literal-pool load, or even a hypothetical movz/movk/add sequence
+/// Return true if this is load/store scales or extends its register offset.
+/// This refers to scaling a dynamic index as opposed to scaled immediates.
+/// MI should be a memory op that allows scaled addressing.
+bool AArch64InstrInfo::isScaledAddr(const MachineInstr *MI) const {
+  switch (MI->getOpcode()) {
+  default:
+    break;
+  case AArch64::LDRBBroW:
+  case AArch64::LDRBroW:
+  case AArch64::LDRDroW:
+  case AArch64::LDRHHroW:
+  case AArch64::LDRHroW:
+  case AArch64::LDRQroW:
+  case AArch64::LDRSBWroW:
+  case AArch64::LDRSBXroW:
+  case AArch64::LDRSHWroW:
+  case AArch64::LDRSHXroW:
+  case AArch64::LDRSWroW:
+  case AArch64::LDRSroW:
+  case AArch64::LDRWroW:
+  case AArch64::LDRXroW:
+  case AArch64::STRBBroW:
+  case AArch64::STRBroW:
+  case AArch64::STRDroW:
+  case AArch64::STRHHroW:
+  case AArch64::STRHroW:
+  case AArch64::STRQroW:
+  case AArch64::STRSroW:
+  case AArch64::STRWroW:
+  case AArch64::STRXroW:
+  case AArch64::LDRBBroX:
+  case AArch64::LDRBroX:
+  case AArch64::LDRDroX:
+  case AArch64::LDRHHroX:
+  case AArch64::LDRHroX:
+  case AArch64::LDRQroX:
+  case AArch64::LDRSBWroX:
+  case AArch64::LDRSBXroX:
+  case AArch64::LDRSHWroX:
+  case AArch64::LDRSHXroX:
+  case AArch64::LDRSWroX:
+  case AArch64::LDRSroX:
+  case AArch64::LDRWroX:
+  case AArch64::LDRXroX:
+  case AArch64::STRBBroX:
+  case AArch64::STRBroX:
+  case AArch64::STRDroX:
+  case AArch64::STRHHroX:
+  case AArch64::STRHroX:
+  case AArch64::STRQroX:
+  case AArch64::STRSroX:
+  case AArch64::STRWroX:
+  case AArch64::STRXroX:
+
+    unsigned Val = MI->getOperand(3).getImm();
+    AArch64_AM::ShiftExtendType ExtType = AArch64_AM::getMemExtendType(Val);
+    return (ExtType != AArch64_AM::UXTX) || AArch64_AM::getMemDoShift(Val);
+  }
+  return false;
+}
 
-  // Decide whether we're doing addition or subtraction
-  unsigned LowOp, HighOp;
-  if (NumBytes >= 0) {
-    LowOp = AArch64::ADDxxi_lsl0_s;
-    HighOp = AArch64::ADDxxi_lsl12_s;
-  } else {
-    LowOp = AArch64::SUBxxi_lsl0_s;
-    HighOp = AArch64::SUBxxi_lsl12_s;
-    NumBytes = abs64(NumBytes);
+/// Check all MachineMemOperands for a hint to suppress pairing.
+bool AArch64InstrInfo::isLdStPairSuppressed(const MachineInstr *MI) const {
+  assert(MOSuppressPair < (1 << MachineMemOperand::MOTargetNumBits) &&
+         "Too many target MO flags");
+  for (auto *MM : MI->memoperands()) {
+    if (MM->getFlags() &
+        (MOSuppressPair << MachineMemOperand::MOTargetStartBit)) {
+      return true;
+    }
   }
+  return false;
+}
 
-  // If we're here, at the very least a move needs to be produced, which just
-  // happens to be materializable by an ADD.
-  if ((NumBytes & 0xfff) || NumBytes == 0) {
-    BuildMI(MBB, MBBI, dl, TII.get(LowOp), DstReg)
-      .addReg(SrcReg, RegState::Kill)
-      .addImm(NumBytes & 0xfff)
-      .setMIFlag(MIFlags);
+/// Set a flag on the first MachineMemOperand to suppress pairing.
+void AArch64InstrInfo::suppressLdStPair(MachineInstr *MI) const {
+  if (MI->memoperands_empty())
+    return;
 
-    // Next update should use the register we've just defined.
-    SrcReg = DstReg;
-  }
+  assert(MOSuppressPair < (1 << MachineMemOperand::MOTargetNumBits) &&
+         "Too many target MO flags");
+  (*MI->memoperands_begin())
+      ->setFlags(MOSuppressPair << MachineMemOperand::MOTargetStartBit);
+}
+
+bool
+AArch64InstrInfo::getLdStBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg,
+                                       unsigned &Offset,
+                                       const TargetRegisterInfo *TRI) const {
+  switch (LdSt->getOpcode()) {
+  default:
+    return false;
+  case AArch64::STRSui:
+  case AArch64::STRDui:
+  case AArch64::STRQui:
+  case AArch64::STRXui:
+  case AArch64::STRWui:
+  case AArch64::LDRSui:
+  case AArch64::LDRDui:
+  case AArch64::LDRQui:
+  case AArch64::LDRXui:
+  case AArch64::LDRWui:
+    if (!LdSt->getOperand(1).isReg() || !LdSt->getOperand(2).isImm())
+      return false;
+    BaseReg = LdSt->getOperand(1).getReg();
+    MachineFunction &MF = *LdSt->getParent()->getParent();
+    unsigned Width = getRegClass(LdSt->getDesc(), 0, TRI, MF)->getSize();
+    Offset = LdSt->getOperand(2).getImm() * Width;
+    return true;
+  };
+}
 
-  if (NumBytes & 0xfff000) {
-    BuildMI(MBB, MBBI, dl, TII.get(HighOp), DstReg)
-      .addReg(SrcReg, RegState::Kill)
-      .addImm(NumBytes >> 12)
-      .setMIFlag(MIFlags);
+/// Detect opportunities for ldp/stp formation.
+///
+/// Only called for LdSt for which getLdStBaseRegImmOfs returns true.
+bool AArch64InstrInfo::shouldClusterLoads(MachineInstr *FirstLdSt,
+                                          MachineInstr *SecondLdSt,
+                                          unsigned NumLoads) const {
+  // Only cluster up to a single pair.
+  if (NumLoads > 1)
+    return false;
+  if (FirstLdSt->getOpcode() != SecondLdSt->getOpcode())
+    return false;
+  // getLdStBaseRegImmOfs guarantees that oper 2 isImm.
+  unsigned Ofs1 = FirstLdSt->getOperand(2).getImm();
+  // Allow 6 bits of positive range.
+  if (Ofs1 > 64)
+    return false;
+  // The caller should already have ordered First/SecondLdSt by offset.
+  unsigned Ofs2 = SecondLdSt->getOperand(2).getImm();
+  return Ofs1 + 1 == Ofs2;
+}
+
+bool AArch64InstrInfo::shouldScheduleAdjacent(MachineInstr *First,
+                                              MachineInstr *Second) const {
+  // Cyclone can fuse CMN, CMP followed by Bcc.
+
+  // FIXME: B0 can also fuse:
+  // AND, BIC, ORN, ORR, or EOR (optional S) followed by Bcc or CBZ or CBNZ.
+  if (Second->getOpcode() != AArch64::Bcc)
+    return false;
+  switch (First->getOpcode()) {
+  default:
+    return false;
+  case AArch64::SUBSWri:
+  case AArch64::ADDSWri:
+  case AArch64::ANDSWri:
+  case AArch64::SUBSXri:
+  case AArch64::ADDSXri:
+  case AArch64::ANDSXri:
+    return true;
   }
 }
 
-void llvm::emitSPUpdate(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
-                        DebugLoc dl, const TargetInstrInfo &TII,
-                        unsigned ScratchReg, int64_t NumBytes,
-                        MachineInstr::MIFlag MIFlags) {
-  emitRegUpdate(MBB, MI, dl, TII, AArch64::XSP, AArch64::XSP, AArch64::X16,
-                NumBytes, MIFlags);
+MachineInstr *AArch64InstrInfo::emitFrameIndexDebugValue(MachineFunction &MF,
+                                                         int FrameIx,
+                                                         uint64_t Offset,
+                                                         const MDNode *MDPtr,
+                                                         DebugLoc DL) const {
+  MachineInstrBuilder MIB = BuildMI(MF, DL, get(AArch64::DBG_VALUE))
+                                .addFrameIndex(FrameIx)
+                                .addImm(0)
+                                .addImm(Offset)
+                                .addMetadata(MDPtr);
+  return &*MIB;
 }
 
+static const MachineInstrBuilder &AddSubReg(const MachineInstrBuilder &MIB,
+                                            unsigned Reg, unsigned SubIdx,
+                                            unsigned State,
+                                            const TargetRegisterInfo *TRI) {
+  if (!SubIdx)
+    return MIB.addReg(Reg, State);
 
-namespace {
-  struct LDTLSCleanup : public MachineFunctionPass {
-    static char ID;
-    LDTLSCleanup() : MachineFunctionPass(ID) {}
+  if (TargetRegisterInfo::isPhysicalRegister(Reg))
+    return MIB.addReg(TRI->getSubReg(Reg, SubIdx), State);
+  return MIB.addReg(Reg, State, SubIdx);
+}
 
-    virtual bool runOnMachineFunction(MachineFunction &MF) {
-      AArch64MachineFunctionInfo* MFI
-        = MF.getInfo<AArch64MachineFunctionInfo>();
-      if (MFI->getNumLocalDynamicTLSAccesses() < 2) {
-        // No point folding accesses if there isn't at least two.
-        return false;
-      }
+static bool forwardCopyWillClobberTuple(unsigned DestReg, unsigned SrcReg,
+                                        unsigned NumRegs) {
+  // We really want the positive remainder mod 32 here, that happens to be
+  // easily obtainable with a mask.
+  return ((DestReg - SrcReg) & 0x1f) < NumRegs;
+}
 
-      MachineDominatorTree *DT = &getAnalysis<MachineDominatorTree>();
-      return VisitNode(DT->getRootNode(), 0);
-    }
-
-    // Visit the dominator subtree rooted at Node in pre-order.
-    // If TLSBaseAddrReg is non-null, then use that to replace any
-    // TLS_base_addr instructions. Otherwise, create the register
-    // when the first such instruction is seen, and then use it
-    // as we encounter more instructions.
-    bool VisitNode(MachineDomTreeNode *Node, unsigned TLSBaseAddrReg) {
-      MachineBasicBlock *BB = Node->getBlock();
-      bool Changed = false;
-
-      // Traverse the current block.
-      for (MachineBasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;
-           ++I) {
-        switch (I->getOpcode()) {
-        case AArch64::TLSDESC_BLRx:
-          // Make sure it's a local dynamic access.
-          if (!I->getOperand(1).isSymbol() ||
-              strcmp(I->getOperand(1).getSymbolName(), "_TLS_MODULE_BASE_"))
-            break;
-
-          if (TLSBaseAddrReg)
-            I = ReplaceTLSBaseAddrCall(I, TLSBaseAddrReg);
-          else
-            I = SetRegister(I, &TLSBaseAddrReg);
-          Changed = true;
-          break;
-        default:
-          break;
-        }
-      }
+void AArch64InstrInfo::copyPhysRegTuple(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator I, DebugLoc DL,
+    unsigned DestReg, unsigned SrcReg, bool KillSrc, unsigned Opcode,
+    llvm::ArrayRef<unsigned> Indices) const {
+  assert(getSubTarget().hasNEON() &&
+         "Unexpected register copy without NEON");
+  const TargetRegisterInfo *TRI = &getRegisterInfo();
+  uint16_t DestEncoding = TRI->getEncodingValue(DestReg);
+  uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg);
+  unsigned NumRegs = Indices.size();
+
+  int SubReg = 0, End = NumRegs, Incr = 1;
+  if (forwardCopyWillClobberTuple(DestEncoding, SrcEncoding, NumRegs)) {
+    SubReg = NumRegs - 1;
+    End = -1;
+    Incr = -1;
+  }
 
-      // Visit the children of this block in the dominator tree.
-      for (MachineDomTreeNode::iterator I = Node->begin(), E = Node->end();
-           I != E; ++I) {
-        Changed |= VisitNode(*I, TLSBaseAddrReg);
+  for (; SubReg != End; SubReg += Incr) {
+    const MachineInstrBuilder &MIB = BuildMI(MBB, I, DL, get(Opcode));
+    AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI);
+    AddSubReg(MIB, SrcReg, Indices[SubReg], 0, TRI);
+    AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI);
+  }
+}
+
+void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator I, DebugLoc DL,
+                                   unsigned DestReg, unsigned SrcReg,
+                                   bool KillSrc) const {
+  if (AArch64::GPR32spRegClass.contains(DestReg) &&
+      (AArch64::GPR32spRegClass.contains(SrcReg) || SrcReg == AArch64::WZR)) {
+    const TargetRegisterInfo *TRI = &getRegisterInfo();
+
+    if (DestReg == AArch64::WSP || SrcReg == AArch64::WSP) {
+      // If either operand is WSP, expand to ADD #0.
+      if (Subtarget.hasZeroCycleRegMove()) {
+        // Cyclone recognizes "ADD Xd, Xn, #0" as a zero-cycle register move.
+        unsigned DestRegX = TRI->getMatchingSuperReg(DestReg, AArch64::sub_32,
+                                                     &AArch64::GPR64spRegClass);
+        unsigned SrcRegX = TRI->getMatchingSuperReg(SrcReg, AArch64::sub_32,
+                                                    &AArch64::GPR64spRegClass);
+        // This instruction is reading and writing X registers.  This may upset
+        // the register scavenger and machine verifier, so we need to indicate
+        // that we are reading an undefined value from SrcRegX, but a proper
+        // value from SrcReg.
+        BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestRegX)
+            .addReg(SrcRegX, RegState::Undef)
+            .addImm(0)
+            .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
+            .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
+      } else {
+        BuildMI(MBB, I, DL, get(AArch64::ADDWri), DestReg)
+            .addReg(SrcReg, getKillRegState(KillSrc))
+            .addImm(0)
+            .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
       }
+    } else if (SrcReg == AArch64::WZR && Subtarget.hasZeroCycleZeroing()) {
+      BuildMI(MBB, I, DL, get(AArch64::MOVZWi), DestReg).addImm(0).addImm(
+          AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
+    } else {
+      if (Subtarget.hasZeroCycleRegMove()) {
+        // Cyclone recognizes "ORR Xd, XZR, Xm" as a zero-cycle register move.
+        unsigned DestRegX = TRI->getMatchingSuperReg(DestReg, AArch64::sub_32,
+                                                     &AArch64::GPR64spRegClass);
+        unsigned SrcRegX = TRI->getMatchingSuperReg(SrcReg, AArch64::sub_32,
+                                                    &AArch64::GPR64spRegClass);
+        // This instruction is reading and writing X registers.  This may upset
+        // the register scavenger and machine verifier, so we need to indicate
+        // that we are reading an undefined value from SrcRegX, but a proper
+        // value from SrcReg.
+        BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestRegX)
+            .addReg(AArch64::XZR)
+            .addReg(SrcRegX, RegState::Undef)
+            .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
+      } else {
+        // Otherwise, expand to ORR WZR.
+        BuildMI(MBB, I, DL, get(AArch64::ORRWrr), DestReg)
+            .addReg(AArch64::WZR)
+            .addReg(SrcReg, getKillRegState(KillSrc));
+      }
+    }
+    return;
+  }
 
-      return Changed;
+  if (AArch64::GPR64spRegClass.contains(DestReg) &&
+      (AArch64::GPR64spRegClass.contains(SrcReg) || SrcReg == AArch64::XZR)) {
+    if (DestReg == AArch64::SP || SrcReg == AArch64::SP) {
+      // If either operand is SP, expand to ADD #0.
+      BuildMI(MBB, I, DL, get(AArch64::ADDXri), DestReg)
+          .addReg(SrcReg, getKillRegState(KillSrc))
+          .addImm(0)
+          .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
+    } else if (SrcReg == AArch64::XZR && Subtarget.hasZeroCycleZeroing()) {
+      BuildMI(MBB, I, DL, get(AArch64::MOVZXi), DestReg).addImm(0).addImm(
+          AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
+    } else {
+      // Otherwise, expand to ORR XZR.
+      BuildMI(MBB, I, DL, get(AArch64::ORRXrr), DestReg)
+          .addReg(AArch64::XZR)
+          .addReg(SrcReg, getKillRegState(KillSrc));
     }
+    return;
+  }
 
-    // Replace the TLS_base_addr instruction I with a copy from
-    // TLSBaseAddrReg, returning the new instruction.
-    MachineInstr *ReplaceTLSBaseAddrCall(MachineInstr *I,
-                                         unsigned TLSBaseAddrReg) {
-      MachineFunction *MF = I->getParent()->getParent();
-      const AArch64TargetMachine *TM =
-          static_cast<const AArch64TargetMachine *>(&MF->getTarget());
-      const AArch64InstrInfo *TII = TM->getInstrInfo();
+  // Copy a DDDD register quad by copying the individual sub-registers.
+  if (AArch64::DDDDRegClass.contains(DestReg) &&
+      AArch64::DDDDRegClass.contains(SrcReg)) {
+    static const unsigned Indices[] = { AArch64::dsub0, AArch64::dsub1,
+                                        AArch64::dsub2, AArch64::dsub3 };
+    copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
+                     Indices);
+    return;
+  }
 
-      // Insert a Copy from TLSBaseAddrReg to x0, which is where the rest of the
-      // code sequence assumes the address will be.
-      MachineInstr *Copy = BuildMI(*I->getParent(), I, I->getDebugLoc(),
-                                   TII->get(TargetOpcode::COPY),
-                                   AArch64::X0)
-        .addReg(TLSBaseAddrReg);
+  // Copy a DDD register triple by copying the individual sub-registers.
+  if (AArch64::DDDRegClass.contains(DestReg) &&
+      AArch64::DDDRegClass.contains(SrcReg)) {
+    static const unsigned Indices[] = { AArch64::dsub0, AArch64::dsub1,
+                                        AArch64::dsub2 };
+    copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
+                     Indices);
+    return;
+  }
 
-      // Erase the TLS_base_addr instruction.
-      I->eraseFromParent();
+  // Copy a DD register pair by copying the individual sub-registers.
+  if (AArch64::DDRegClass.contains(DestReg) &&
+      AArch64::DDRegClass.contains(SrcReg)) {
+    static const unsigned Indices[] = { AArch64::dsub0, AArch64::dsub1 };
+    copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv8i8,
+                     Indices);
+    return;
+  }
+
+  // Copy a QQQQ register quad by copying the individual sub-registers.
+  if (AArch64::QQQQRegClass.contains(DestReg) &&
+      AArch64::QQQQRegClass.contains(SrcReg)) {
+    static const unsigned Indices[] = { AArch64::qsub0, AArch64::qsub1,
+                                        AArch64::qsub2, AArch64::qsub3 };
+    copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
+                     Indices);
+    return;
+  }
+
+  // Copy a QQQ register triple by copying the individual sub-registers.
+  if (AArch64::QQQRegClass.contains(DestReg) &&
+      AArch64::QQQRegClass.contains(SrcReg)) {
+    static const unsigned Indices[] = { AArch64::qsub0, AArch64::qsub1,
+                                        AArch64::qsub2 };
+    copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
+                     Indices);
+    return;
+  }
+
+  // Copy a QQ register pair by copying the individual sub-registers.
+  if (AArch64::QQRegClass.contains(DestReg) &&
+      AArch64::QQRegClass.contains(SrcReg)) {
+    static const unsigned Indices[] = { AArch64::qsub0, AArch64::qsub1 };
+    copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, AArch64::ORRv16i8,
+                     Indices);
+    return;
+  }
 
-      return Copy;
+  if (AArch64::FPR128RegClass.contains(DestReg) &&
+      AArch64::FPR128RegClass.contains(SrcReg)) {
+    if(getSubTarget().hasNEON()) {
+      BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
+          .addReg(SrcReg)
+          .addReg(SrcReg, getKillRegState(KillSrc));
+    } else {
+      BuildMI(MBB, I, DL, get(AArch64::STRQpre))
+        .addReg(AArch64::SP, RegState::Define)
+        .addReg(SrcReg, getKillRegState(KillSrc))
+        .addReg(AArch64::SP)
+        .addImm(-16);
+      BuildMI(MBB, I, DL, get(AArch64::LDRQpre))
+        .addReg(AArch64::SP, RegState::Define)
+        .addReg(DestReg, RegState::Define)
+        .addReg(AArch64::SP)
+        .addImm(16);
     }
+    return;
+  }
 
-    // Create a virtal register in *TLSBaseAddrReg, and populate it by
-    // inserting a copy instruction after I. Returns the new instruction.
-    MachineInstr *SetRegister(MachineInstr *I, unsigned *TLSBaseAddrReg) {
-      MachineFunction *MF = I->getParent()->getParent();
-      const AArch64TargetMachine *TM =
-          static_cast<const AArch64TargetMachine *>(&MF->getTarget());
-      const AArch64InstrInfo *TII = TM->getInstrInfo();
+  if (AArch64::FPR64RegClass.contains(DestReg) &&
+      AArch64::FPR64RegClass.contains(SrcReg)) {
+    if(getSubTarget().hasNEON()) {
+      DestReg = RI.getMatchingSuperReg(DestReg, AArch64::dsub,
+                                       &AArch64::FPR128RegClass);
+      SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::dsub,
+                                      &AArch64::FPR128RegClass);
+      BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
+          .addReg(SrcReg)
+          .addReg(SrcReg, getKillRegState(KillSrc));
+    } else {
+      BuildMI(MBB, I, DL, get(AArch64::FMOVDr), DestReg)
+          .addReg(SrcReg, getKillRegState(KillSrc));
+    }
+    return;
+  }
 
-      // Create a virtual register for the TLS base address.
-      MachineRegisterInfo &RegInfo = MF->getRegInfo();
-      *TLSBaseAddrReg = RegInfo.createVirtualRegister(&AArch64::GPR64RegClass);
+  if (AArch64::FPR32RegClass.contains(DestReg) &&
+      AArch64::FPR32RegClass.contains(SrcReg)) {
+    if(getSubTarget().hasNEON()) {
+      DestReg = RI.getMatchingSuperReg(DestReg, AArch64::ssub,
+                                       &AArch64::FPR128RegClass);
+      SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::ssub,
+                                      &AArch64::FPR128RegClass);
+      BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
+          .addReg(SrcReg)
+          .addReg(SrcReg, getKillRegState(KillSrc));
+    } else {
+      BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
+          .addReg(SrcReg, getKillRegState(KillSrc));
+    }
+    return;
+  }
 
-      // Insert a copy from X0 to TLSBaseAddrReg for later.
-      MachineInstr *Next = I->getNextNode();
-      MachineInstr *Copy = BuildMI(*I->getParent(), Next, I->getDebugLoc(),
-                                   TII->get(TargetOpcode::COPY),
-                                   *TLSBaseAddrReg)
-        .addReg(AArch64::X0);
+  if (AArch64::FPR16RegClass.contains(DestReg) &&
+      AArch64::FPR16RegClass.contains(SrcReg)) {
+    if(getSubTarget().hasNEON()) {
+      DestReg = RI.getMatchingSuperReg(DestReg, AArch64::hsub,
+                                       &AArch64::FPR128RegClass);
+      SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::hsub,
+                                      &AArch64::FPR128RegClass);
+      BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
+          .addReg(SrcReg)
+          .addReg(SrcReg, getKillRegState(KillSrc));
+    } else {
+      DestReg = RI.getMatchingSuperReg(DestReg, AArch64::hsub,
+                                       &AArch64::FPR32RegClass);
+      SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::hsub,
+                                      &AArch64::FPR32RegClass);
+      BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
+          .addReg(SrcReg, getKillRegState(KillSrc));
+    }
+    return;
+  }
 
-      return Copy;
+  if (AArch64::FPR8RegClass.contains(DestReg) &&
+      AArch64::FPR8RegClass.contains(SrcReg)) {
+    if(getSubTarget().hasNEON()) {
+      DestReg = RI.getMatchingSuperReg(DestReg, AArch64::bsub,
+                                       &AArch64::FPR128RegClass);
+      SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::bsub,
+                                      &AArch64::FPR128RegClass);
+      BuildMI(MBB, I, DL, get(AArch64::ORRv16i8), DestReg)
+          .addReg(SrcReg)
+          .addReg(SrcReg, getKillRegState(KillSrc));
+    } else {
+      DestReg = RI.getMatchingSuperReg(DestReg, AArch64::bsub,
+                                       &AArch64::FPR32RegClass);
+      SrcReg = RI.getMatchingSuperReg(SrcReg, AArch64::bsub,
+                                      &AArch64::FPR32RegClass);
+      BuildMI(MBB, I, DL, get(AArch64::FMOVSr), DestReg)
+          .addReg(SrcReg, getKillRegState(KillSrc));
     }
+    return;
+  }
+
+  // Copies between GPR64 and FPR64.
+  if (AArch64::FPR64RegClass.contains(DestReg) &&
+      AArch64::GPR64RegClass.contains(SrcReg)) {
+    BuildMI(MBB, I, DL, get(AArch64::FMOVXDr), DestReg)
+        .addReg(SrcReg, getKillRegState(KillSrc));
+    return;
+  }
+  if (AArch64::GPR64RegClass.contains(DestReg) &&
+      AArch64::FPR64RegClass.contains(SrcReg)) {
+    BuildMI(MBB, I, DL, get(AArch64::FMOVDXr), DestReg)
+        .addReg(SrcReg, getKillRegState(KillSrc));
+    return;
+  }
+  // Copies between GPR32 and FPR32.
+  if (AArch64::FPR32RegClass.contains(DestReg) &&
+      AArch64::GPR32RegClass.contains(SrcReg)) {
+    BuildMI(MBB, I, DL, get(AArch64::FMOVWSr), DestReg)
+        .addReg(SrcReg, getKillRegState(KillSrc));
+    return;
+  }
+  if (AArch64::GPR32RegClass.contains(DestReg) &&
+      AArch64::FPR32RegClass.contains(SrcReg)) {
+    BuildMI(MBB, I, DL, get(AArch64::FMOVSWr), DestReg)
+        .addReg(SrcReg, getKillRegState(KillSrc));
+    return;
+  }
 
-    virtual const char *getPassName() const {
-      return "Local Dynamic TLS Access Clean-up";
+  if (DestReg == AArch64::NZCV) {
+    assert(AArch64::GPR64RegClass.contains(SrcReg) && "Invalid NZCV copy");
+    BuildMI(MBB, I, DL, get(AArch64::MSR))
+      .addImm(AArch64SysReg::NZCV)
+      .addReg(SrcReg, getKillRegState(KillSrc))
+      .addReg(AArch64::NZCV, RegState::Implicit | RegState::Define);
+    return;
+  }
+
+  if (SrcReg == AArch64::NZCV) {
+    assert(AArch64::GPR64RegClass.contains(DestReg) && "Invalid NZCV copy");
+    BuildMI(MBB, I, DL, get(AArch64::MRS))
+      .addReg(DestReg)
+      .addImm(AArch64SysReg::NZCV)
+      .addReg(AArch64::NZCV, RegState::Implicit | getKillRegState(KillSrc));
+    return;
+  }
+
+  llvm_unreachable("unimplemented reg-to-reg copy");
+}
+
+void AArch64InstrInfo::storeRegToStackSlot(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned SrcReg,
+    bool isKill, int FI, const TargetRegisterClass *RC,
+    const TargetRegisterInfo *TRI) const {
+  DebugLoc DL;
+  if (MBBI != MBB.end())
+    DL = MBBI->getDebugLoc();
+  MachineFunction &MF = *MBB.getParent();
+  MachineFrameInfo &MFI = *MF.getFrameInfo();
+  unsigned Align = MFI.getObjectAlignment(FI);
+
+  MachinePointerInfo PtrInfo(PseudoSourceValue::getFixedStack(FI));
+  MachineMemOperand *MMO = MF.getMachineMemOperand(
+      PtrInfo, MachineMemOperand::MOStore, MFI.getObjectSize(FI), Align);
+  unsigned Opc = 0;
+  bool Offset = true;
+  switch (RC->getSize()) {
+  case 1:
+    if (AArch64::FPR8RegClass.hasSubClassEq(RC))
+      Opc = AArch64::STRBui;
+    break;
+  case 2:
+    if (AArch64::FPR16RegClass.hasSubClassEq(RC))
+      Opc = AArch64::STRHui;
+    break;
+  case 4:
+    if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
+      Opc = AArch64::STRWui;
+      if (TargetRegisterInfo::isVirtualRegister(SrcReg))
+        MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR32RegClass);
+      else
+        assert(SrcReg != AArch64::WSP);
+    } else if (AArch64::FPR32RegClass.hasSubClassEq(RC))
+      Opc = AArch64::STRSui;
+    break;
+  case 8:
+    if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
+      Opc = AArch64::STRXui;
+      if (TargetRegisterInfo::isVirtualRegister(SrcReg))
+        MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);
+      else
+        assert(SrcReg != AArch64::SP);
+    } else if (AArch64::FPR64RegClass.hasSubClassEq(RC))
+      Opc = AArch64::STRDui;
+    break;
+  case 16:
+    if (AArch64::FPR128RegClass.hasSubClassEq(RC))
+      Opc = AArch64::STRQui;
+    else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
+      assert(getSubTarget().hasNEON() &&
+             "Unexpected register store without NEON");
+      Opc = AArch64::ST1Twov1d, Offset = false;
+    }
+    break;
+  case 24:
+    if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
+      assert(getSubTarget().hasNEON() &&
+             "Unexpected register store without NEON");
+      Opc = AArch64::ST1Threev1d, Offset = false;
+    }
+    break;
+  case 32:
+    if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
+      assert(getSubTarget().hasNEON() &&
+             "Unexpected register store without NEON");
+      Opc = AArch64::ST1Fourv1d, Offset = false;
+    } else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
+      assert(getSubTarget().hasNEON() &&
+             "Unexpected register store without NEON");
+      Opc = AArch64::ST1Twov2d, Offset = false;
     }
+    break;
+  case 48:
+    if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
+      assert(getSubTarget().hasNEON() &&
+             "Unexpected register store without NEON");
+      Opc = AArch64::ST1Threev2d, Offset = false;
+    }
+    break;
+  case 64:
+    if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
+      assert(getSubTarget().hasNEON() &&
+             "Unexpected register store without NEON");
+      Opc = AArch64::ST1Fourv2d, Offset = false;
+    }
+    break;
+  }
+  assert(Opc && "Unknown register class");
+
+  const MachineInstrBuilder &MI = BuildMI(MBB, MBBI, DL, get(Opc))
+                                      .addReg(SrcReg, getKillRegState(isKill))
+                                      .addFrameIndex(FI);
+
+  if (Offset)
+    MI.addImm(0);
+  MI.addMemOperand(MMO);
+}
+
+void AArch64InstrInfo::loadRegFromStackSlot(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned DestReg,
+    int FI, const TargetRegisterClass *RC,
+    const TargetRegisterInfo *TRI) const {
+  DebugLoc DL;
+  if (MBBI != MBB.end())
+    DL = MBBI->getDebugLoc();
+  MachineFunction &MF = *MBB.getParent();
+  MachineFrameInfo &MFI = *MF.getFrameInfo();
+  unsigned Align = MFI.getObjectAlignment(FI);
+  MachinePointerInfo PtrInfo(PseudoSourceValue::getFixedStack(FI));
+  MachineMemOperand *MMO = MF.getMachineMemOperand(
+      PtrInfo, MachineMemOperand::MOLoad, MFI.getObjectSize(FI), Align);
 
-    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
-      AU.setPreservesCFG();
-      AU.addRequired<MachineDominatorTree>();
-      MachineFunctionPass::getAnalysisUsage(AU);
+  unsigned Opc = 0;
+  bool Offset = true;
+  switch (RC->getSize()) {
+  case 1:
+    if (AArch64::FPR8RegClass.hasSubClassEq(RC))
+      Opc = AArch64::LDRBui;
+    break;
+  case 2:
+    if (AArch64::FPR16RegClass.hasSubClassEq(RC))
+      Opc = AArch64::LDRHui;
+    break;
+  case 4:
+    if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
+      Opc = AArch64::LDRWui;
+      if (TargetRegisterInfo::isVirtualRegister(DestReg))
+        MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR32RegClass);
+      else
+        assert(DestReg != AArch64::WSP);
+    } else if (AArch64::FPR32RegClass.hasSubClassEq(RC))
+      Opc = AArch64::LDRSui;
+    break;
+  case 8:
+    if (AArch64::GPR64allRegClass.hasSubClassEq(RC)) {
+      Opc = AArch64::LDRXui;
+      if (TargetRegisterInfo::isVirtualRegister(DestReg))
+        MF.getRegInfo().constrainRegClass(DestReg, &AArch64::GPR64RegClass);
+      else
+        assert(DestReg != AArch64::SP);
+    } else if (AArch64::FPR64RegClass.hasSubClassEq(RC))
+      Opc = AArch64::LDRDui;
+    break;
+  case 16:
+    if (AArch64::FPR128RegClass.hasSubClassEq(RC))
+      Opc = AArch64::LDRQui;
+    else if (AArch64::DDRegClass.hasSubClassEq(RC)) {
+      assert(getSubTarget().hasNEON() &&
+             "Unexpected register load without NEON");
+      Opc = AArch64::LD1Twov1d, Offset = false;
     }
-  };
+    break;
+  case 24:
+    if (AArch64::DDDRegClass.hasSubClassEq(RC)) {
+      assert(getSubTarget().hasNEON() &&
+             "Unexpected register load without NEON");
+      Opc = AArch64::LD1Threev1d, Offset = false;
+    }
+    break;
+  case 32:
+    if (AArch64::DDDDRegClass.hasSubClassEq(RC)) {
+      assert(getSubTarget().hasNEON() &&
+             "Unexpected register load without NEON");
+      Opc = AArch64::LD1Fourv1d, Offset = false;
+    } else if (AArch64::QQRegClass.hasSubClassEq(RC)) {
+      assert(getSubTarget().hasNEON() &&
+             "Unexpected register load without NEON");
+      Opc = AArch64::LD1Twov2d, Offset = false;
+    }
+    break;
+  case 48:
+    if (AArch64::QQQRegClass.hasSubClassEq(RC)) {
+      assert(getSubTarget().hasNEON() &&
+             "Unexpected register load without NEON");
+      Opc = AArch64::LD1Threev2d, Offset = false;
+    }
+    break;
+  case 64:
+    if (AArch64::QQQQRegClass.hasSubClassEq(RC)) {
+      assert(getSubTarget().hasNEON() &&
+             "Unexpected register load without NEON");
+      Opc = AArch64::LD1Fourv2d, Offset = false;
+    }
+    break;
+  }
+  assert(Opc && "Unknown register class");
+
+  const MachineInstrBuilder &MI = BuildMI(MBB, MBBI, DL, get(Opc))
+                                      .addReg(DestReg, getDefRegState(true))
+                                      .addFrameIndex(FI);
+  if (Offset)
+    MI.addImm(0);
+  MI.addMemOperand(MMO);
+}
+
+void llvm::emitFrameOffset(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator MBBI, DebugLoc DL,
+                           unsigned DestReg, unsigned SrcReg, int Offset,
+                           const AArch64InstrInfo *TII,
+                           MachineInstr::MIFlag Flag, bool SetNZCV) {
+  if (DestReg == SrcReg && Offset == 0)
+    return;
+
+  bool isSub = Offset < 0;
+  if (isSub)
+    Offset = -Offset;
+
+  // FIXME: If the offset won't fit in 24-bits, compute the offset into a
+  // scratch register.  If DestReg is a virtual register, use it as the
+  // scratch register; otherwise, create a new virtual register (to be
+  // replaced by the scavenger at the end of PEI).  That case can be optimized
+  // slightly if DestReg is SP which is always 16-byte aligned, so the scratch
+  // register can be loaded with offset%8 and the add/sub can use an extending
+  // instruction with LSL#3.
+  // Currently the function handles any offsets but generates a poor sequence
+  // of code.
+  //  assert(Offset < (1 << 24) && "unimplemented reg plus immediate");
+
+  unsigned Opc;
+  if (SetNZCV)
+    Opc = isSub ? AArch64::SUBSXri : AArch64::ADDSXri;
+  else
+    Opc = isSub ? AArch64::SUBXri : AArch64::ADDXri;
+  const unsigned MaxEncoding = 0xfff;
+  const unsigned ShiftSize = 12;
+  const unsigned MaxEncodableValue = MaxEncoding << ShiftSize;
+  while (((unsigned)Offset) >= (1 << ShiftSize)) {
+    unsigned ThisVal;
+    if (((unsigned)Offset) > MaxEncodableValue) {
+      ThisVal = MaxEncodableValue;
+    } else {
+      ThisVal = Offset & MaxEncodableValue;
+    }
+    assert((ThisVal >> ShiftSize) <= MaxEncoding &&
+           "Encoding cannot handle value that big");
+    BuildMI(MBB, MBBI, DL, TII->get(Opc), DestReg)
+        .addReg(SrcReg)
+        .addImm(ThisVal >> ShiftSize)
+        .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftSize))
+        .setMIFlag(Flag);
+
+    SrcReg = DestReg;
+    Offset -= ThisVal;
+    if (Offset == 0)
+      return;
+  }
+  BuildMI(MBB, MBBI, DL, TII->get(Opc), DestReg)
+      .addReg(SrcReg)
+      .addImm(Offset)
+      .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0))
+      .setMIFlag(Flag);
 }
 
-char LDTLSCleanup::ID = 0;
-FunctionPass*
-llvm::createAArch64CleanupLocalDynamicTLSPass() { return new LDTLSCleanup(); }
+MachineInstr *
+AArch64InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI,
+                                        const SmallVectorImpl<unsigned> &Ops,
+                                        int FrameIndex) const {
+  // This is a bit of a hack. Consider this instruction:
+  //
+  //   %vreg0<def> = COPY %SP; GPR64all:%vreg0
+  //
+  // We explicitly chose GPR64all for the virtual register so such a copy might
+  // be eliminated by RegisterCoalescer. However, that may not be possible, and
+  // %vreg0 may even spill. We can't spill %SP, and since it is in the GPR64all
+  // register class, TargetInstrInfo::foldMemoryOperand() is going to try.
+  //
+  // To prevent that, we are going to constrain the %vreg0 register class here.
+  //
+  // <rdar://problem/11522048>
+  //
+  if (MI->isCopy()) {
+    unsigned DstReg = MI->getOperand(0).getReg();
+    unsigned SrcReg = MI->getOperand(1).getReg();
+    if (SrcReg == AArch64::SP &&
+        TargetRegisterInfo::isVirtualRegister(DstReg)) {
+      MF.getRegInfo().constrainRegClass(DstReg, &AArch64::GPR64RegClass);
+      return nullptr;
+    }
+    if (DstReg == AArch64::SP &&
+        TargetRegisterInfo::isVirtualRegister(SrcReg)) {
+      MF.getRegInfo().constrainRegClass(SrcReg, &AArch64::GPR64RegClass);
+      return nullptr;
+    }
+  }
+
+  // Cannot fold.
+  return nullptr;
+}
+
+int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI, int &Offset,
+                                    bool *OutUseUnscaledOp,
+                                    unsigned *OutUnscaledOp,
+                                    int *EmittableOffset) {
+  int Scale = 1;
+  bool IsSigned = false;
+  // The ImmIdx should be changed case by case if it is not 2.
+  unsigned ImmIdx = 2;
+  unsigned UnscaledOp = 0;
+  // Set output values in case of early exit.
+  if (EmittableOffset)
+    *EmittableOffset = 0;
+  if (OutUseUnscaledOp)
+    *OutUseUnscaledOp = false;
+  if (OutUnscaledOp)
+    *OutUnscaledOp = 0;
+  switch (MI.getOpcode()) {
+  default:
+    assert(0 && "unhandled opcode in rewriteAArch64FrameIndex");
+  // Vector spills/fills can't take an immediate offset.
+  case AArch64::LD1Twov2d:
+  case AArch64::LD1Threev2d:
+  case AArch64::LD1Fourv2d:
+  case AArch64::LD1Twov1d:
+  case AArch64::LD1Threev1d:
+  case AArch64::LD1Fourv1d:
+  case AArch64::ST1Twov2d:
+  case AArch64::ST1Threev2d:
+  case AArch64::ST1Fourv2d:
+  case AArch64::ST1Twov1d:
+  case AArch64::ST1Threev1d:
+  case AArch64::ST1Fourv1d:
+    return AArch64FrameOffsetCannotUpdate;
+  case AArch64::PRFMui:
+    Scale = 8;
+    UnscaledOp = AArch64::PRFUMi;
+    break;
+  case AArch64::LDRXui:
+    Scale = 8;
+    UnscaledOp = AArch64::LDURXi;
+    break;
+  case AArch64::LDRWui:
+    Scale = 4;
+    UnscaledOp = AArch64::LDURWi;
+    break;
+  case AArch64::LDRBui:
+    Scale = 1;
+    UnscaledOp = AArch64::LDURBi;
+    break;
+  case AArch64::LDRHui:
+    Scale = 2;
+    UnscaledOp = AArch64::LDURHi;
+    break;
+  case AArch64::LDRSui:
+    Scale = 4;
+    UnscaledOp = AArch64::LDURSi;
+    break;
+  case AArch64::LDRDui:
+    Scale = 8;
+    UnscaledOp = AArch64::LDURDi;
+    break;
+  case AArch64::LDRQui:
+    Scale = 16;
+    UnscaledOp = AArch64::LDURQi;
+    break;
+  case AArch64::LDRBBui:
+    Scale = 1;
+    UnscaledOp = AArch64::LDURBBi;
+    break;
+  case AArch64::LDRHHui:
+    Scale = 2;
+    UnscaledOp = AArch64::LDURHHi;
+    break;
+  case AArch64::LDRSBXui:
+    Scale = 1;
+    UnscaledOp = AArch64::LDURSBXi;
+    break;
+  case AArch64::LDRSBWui:
+    Scale = 1;
+    UnscaledOp = AArch64::LDURSBWi;
+    break;
+  case AArch64::LDRSHXui:
+    Scale = 2;
+    UnscaledOp = AArch64::LDURSHXi;
+    break;
+  case AArch64::LDRSHWui:
+    Scale = 2;
+    UnscaledOp = AArch64::LDURSHWi;
+    break;
+  case AArch64::LDRSWui:
+    Scale = 4;
+    UnscaledOp = AArch64::LDURSWi;
+    break;
+
+  case AArch64::STRXui:
+    Scale = 8;
+    UnscaledOp = AArch64::STURXi;
+    break;
+  case AArch64::STRWui:
+    Scale = 4;
+    UnscaledOp = AArch64::STURWi;
+    break;
+  case AArch64::STRBui:
+    Scale = 1;
+    UnscaledOp = AArch64::STURBi;
+    break;
+  case AArch64::STRHui:
+    Scale = 2;
+    UnscaledOp = AArch64::STURHi;
+    break;
+  case AArch64::STRSui:
+    Scale = 4;
+    UnscaledOp = AArch64::STURSi;
+    break;
+  case AArch64::STRDui:
+    Scale = 8;
+    UnscaledOp = AArch64::STURDi;
+    break;
+  case AArch64::STRQui:
+    Scale = 16;
+    UnscaledOp = AArch64::STURQi;
+    break;
+  case AArch64::STRBBui:
+    Scale = 1;
+    UnscaledOp = AArch64::STURBBi;
+    break;
+  case AArch64::STRHHui:
+    Scale = 2;
+    UnscaledOp = AArch64::STURHHi;
+    break;
+
+  case AArch64::LDPXi:
+  case AArch64::LDPDi:
+  case AArch64::STPXi:
+  case AArch64::STPDi:
+    IsSigned = true;
+    Scale = 8;
+    break;
+  case AArch64::LDPQi:
+  case AArch64::STPQi:
+    IsSigned = true;
+    Scale = 16;
+    break;
+  case AArch64::LDPWi:
+  case AArch64::LDPSi:
+  case AArch64::STPWi:
+  case AArch64::STPSi:
+    IsSigned = true;
+    Scale = 4;
+    break;
+
+  case AArch64::LDURXi:
+  case AArch64::LDURWi:
+  case AArch64::LDURBi:
+  case AArch64::LDURHi:
+  case AArch64::LDURSi:
+  case AArch64::LDURDi:
+  case AArch64::LDURQi:
+  case AArch64::LDURHHi:
+  case AArch64::LDURBBi:
+  case AArch64::LDURSBXi:
+  case AArch64::LDURSBWi:
+  case AArch64::LDURSHXi:
+  case AArch64::LDURSHWi:
+  case AArch64::LDURSWi:
+  case AArch64::STURXi:
+  case AArch64::STURWi:
+  case AArch64::STURBi:
+  case AArch64::STURHi:
+  case AArch64::STURSi:
+  case AArch64::STURDi:
+  case AArch64::STURQi:
+  case AArch64::STURBBi:
+  case AArch64::STURHHi:
+    Scale = 1;
+    break;
+  }
+
+  Offset += MI.getOperand(ImmIdx).getImm() * Scale;
+
+  bool useUnscaledOp = false;
+  // If the offset doesn't match the scale, we rewrite the instruction to
+  // use the unscaled instruction instead. Likewise, if we have a negative
+  // offset (and have an unscaled op to use).
+  if ((Offset & (Scale - 1)) != 0 || (Offset < 0 && UnscaledOp != 0))
+    useUnscaledOp = true;
+
+  // Use an unscaled addressing mode if the instruction has a negative offset
+  // (or if the instruction is already using an unscaled addressing mode).
+  unsigned MaskBits;
+  if (IsSigned) {
+    // ldp/stp instructions.
+    MaskBits = 7;
+    Offset /= Scale;
+  } else if (UnscaledOp == 0 || useUnscaledOp) {
+    MaskBits = 9;
+    IsSigned = true;
+    Scale = 1;
+  } else {
+    MaskBits = 12;
+    IsSigned = false;
+    Offset /= Scale;
+  }
+
+  // Attempt to fold address computation.
+  int MaxOff = (1 << (MaskBits - IsSigned)) - 1;
+  int MinOff = (IsSigned ? (-MaxOff - 1) : 0);
+  if (Offset >= MinOff && Offset <= MaxOff) {
+    if (EmittableOffset)
+      *EmittableOffset = Offset;
+    Offset = 0;
+  } else {
+    int NewOff = Offset < 0 ? MinOff : MaxOff;
+    if (EmittableOffset)
+      *EmittableOffset = NewOff;
+    Offset = (Offset - NewOff) * Scale;
+  }
+  if (OutUseUnscaledOp)
+    *OutUseUnscaledOp = useUnscaledOp;
+  if (OutUnscaledOp)
+    *OutUnscaledOp = UnscaledOp;
+  return AArch64FrameOffsetCanUpdate |
+         (Offset == 0 ? AArch64FrameOffsetIsLegal : 0);
+}
+
+bool llvm::rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
+                                    unsigned FrameReg, int &Offset,
+                                    const AArch64InstrInfo *TII) {
+  unsigned Opcode = MI.getOpcode();
+  unsigned ImmIdx = FrameRegIdx + 1;
+
+  if (Opcode == AArch64::ADDSXri || Opcode == AArch64::ADDXri) {
+    Offset += MI.getOperand(ImmIdx).getImm();
+    emitFrameOffset(*MI.getParent(), MI, MI.getDebugLoc(),
+                    MI.getOperand(0).getReg(), FrameReg, Offset, TII,
+                    MachineInstr::NoFlags, (Opcode == AArch64::ADDSXri));
+    MI.eraseFromParent();
+    Offset = 0;
+    return true;
+  }
+
+  int NewOffset;
+  unsigned UnscaledOp;
+  bool UseUnscaledOp;
+  int Status = isAArch64FrameOffsetLegal(MI, Offset, &UseUnscaledOp,
+                                         &UnscaledOp, &NewOffset);
+  if (Status & AArch64FrameOffsetCanUpdate) {
+    if (Status & AArch64FrameOffsetIsLegal)
+      // Replace the FrameIndex with FrameReg.
+      MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
+    if (UseUnscaledOp)
+      MI.setDesc(TII->get(UnscaledOp));
+
+    MI.getOperand(ImmIdx).ChangeToImmediate(NewOffset);
+    return Offset == 0;
+  }
+
+  return false;
+}
+
+void AArch64InstrInfo::getNoopForMachoTarget(MCInst &NopInst) const {
+  NopInst.setOpcode(AArch64::HINT);
+  NopInst.addOperand(MCOperand::CreateImm(0));
+}
diff --git a/lib/Target/AArch64/AArch64InstrInfo.h b/lib/Target/AArch64/AArch64InstrInfo.h
index ad20f9c..90ce75f 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/lib/Target/AArch64/AArch64InstrInfo.h
@@ -11,9 +11,10 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_TARGET_AARCH64INSTRINFO_H
-#define LLVM_TARGET_AARCH64INSTRINFO_H
+#ifndef LLVM_TARGET_AArch64INSTRINFO_H
+#define LLVM_TARGET_AArch64INSTRINFO_H
 
+#include "AArch64.h"
 #include "AArch64RegisterInfo.h"
 #include "llvm/Target/TargetInstrInfo.h"
 
@@ -23,89 +24,208 @@
 namespace llvm {
 
 class AArch64Subtarget;
+class AArch64TargetMachine;
 
 class AArch64InstrInfo : public AArch64GenInstrInfo {
+  // Reserve bits in the MachineMemOperand target hint flags, starting at 1.
+  // They will be shifted into MOTargetHintStart when accessed.
+  enum TargetMemOperandFlags {
+    MOSuppressPair = 1
+  };
+
   const AArch64RegisterInfo RI;
   const AArch64Subtarget &Subtarget;
+
 public:
-  explicit AArch64InstrInfo(const AArch64Subtarget &TM);
+  explicit AArch64InstrInfo(const AArch64Subtarget &STI);
 
   /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info.  As
   /// such, whenever a client has an instance of instruction info, it should
   /// always be able to get register info as well (through this method).
-  ///
-  const TargetRegisterInfo &getRegisterInfo() const { return RI; }
+  const AArch64RegisterInfo &getRegisterInfo() const { return RI; }
 
   const AArch64Subtarget &getSubTarget() const { return Subtarget; }
 
-  void copyPhysReg(MachineBasicBlock &MBB,
-                   MachineBasicBlock::iterator I, DebugLoc DL,
-                   unsigned DestReg, unsigned SrcReg,
-                   bool KillSrc) const;
-  void CopyPhysRegTuple(MachineBasicBlock &MBB,
-                        MachineBasicBlock::iterator I, DebugLoc DL,
-                        unsigned DestReg, unsigned SrcReg) const;
+  unsigned GetInstSizeInBytes(const MachineInstr *MI) const;
+
+  bool isCoalescableExtInstr(const MachineInstr &MI, unsigned &SrcReg,
+                             unsigned &DstReg, unsigned &SubIdx) const override;
+
+  unsigned isLoadFromStackSlot(const MachineInstr *MI,
+                               int &FrameIndex) const override;
+  unsigned isStoreToStackSlot(const MachineInstr *MI,
+                              int &FrameIndex) const override;
+
+  /// Returns true if there is a shiftable register and that the shift value
+  /// is non-zero.
+  bool hasShiftedReg(const MachineInstr *MI) const;
+
+  /// Returns true if there is an extendable register and that the extending
+  /// value is non-zero.
+  bool hasExtendedReg(const MachineInstr *MI) const;
+
+  /// \brief Does this instruction set its full destination register to zero?
+  bool isGPRZero(const MachineInstr *MI) const;
+
+  /// \brief Does this instruction rename a GPR without modifying bits?
+  bool isGPRCopy(const MachineInstr *MI) const;
+
+  /// \brief Does this instruction rename an FPR without modifying bits?
+  bool isFPRCopy(const MachineInstr *MI) const;
+
+  /// Return true if this is load/store scales or extends its register offset.
+  /// This refers to scaling a dynamic index as opposed to scaled immediates.
+  /// MI should be a memory op that allows scaled addressing.
+  bool isScaledAddr(const MachineInstr *MI) const;
+
+  /// Return true if pairing the given load or store is hinted to be
+  /// unprofitable.
+  bool isLdStPairSuppressed(const MachineInstr *MI) const;
+
+  /// Hint that pairing the given load or store is unprofitable.
+  void suppressLdStPair(MachineInstr *MI) const;
+
+  bool getLdStBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg,
+                            unsigned &Offset,
+                            const TargetRegisterInfo *TRI) const override;
+
+  bool enableClusterLoads() const override { return true; }
+
+  bool shouldClusterLoads(MachineInstr *FirstLdSt, MachineInstr *SecondLdSt,
+                          unsigned NumLoads) const override;
+
+  bool shouldScheduleAdjacent(MachineInstr *First,
+                              MachineInstr *Second) const override;
+
+  MachineInstr *emitFrameIndexDebugValue(MachineFunction &MF, int FrameIx,
+                                         uint64_t Offset, const MDNode *MDPtr,
+                                         DebugLoc DL) const;
+  void copyPhysRegTuple(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                        DebugLoc DL, unsigned DestReg, unsigned SrcReg,
+                        bool KillSrc, unsigned Opcode,
+                        llvm::ArrayRef<unsigned> Indices) const;
+  void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+                   DebugLoc DL, unsigned DestReg, unsigned SrcReg,
+                   bool KillSrc) const override;
 
   void storeRegToStackSlot(MachineBasicBlock &MBB,
-                           MachineBasicBlock::iterator MI,
-                           unsigned SrcReg, bool isKill, int FrameIndex,
+                           MachineBasicBlock::iterator MBBI, unsigned SrcReg,
+                           bool isKill, int FrameIndex,
                            const TargetRegisterClass *RC,
-                           const TargetRegisterInfo *TRI) const;
+                           const TargetRegisterInfo *TRI) const override;
+
   void loadRegFromStackSlot(MachineBasicBlock &MBB,
-                            MachineBasicBlock::iterator MBBI,
-                            unsigned DestReg, int FrameIdx,
-                            const TargetRegisterClass *RC,
-                            const TargetRegisterInfo *TRI) const;
+                            MachineBasicBlock::iterator MBBI, unsigned DestReg,
+                            int FrameIndex, const TargetRegisterClass *RC,
+                            const TargetRegisterInfo *TRI) const override;
+
+  MachineInstr *
+  foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI,
+                        const SmallVectorImpl<unsigned> &Ops,
+                        int FrameIndex) const override;
 
   bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
                      MachineBasicBlock *&FBB,
                      SmallVectorImpl<MachineOperand> &Cond,
-                     bool AllowModify = false) const;
+                     bool AllowModify = false) const override;
+  unsigned RemoveBranch(MachineBasicBlock &MBB) const override;
   unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
                         MachineBasicBlock *FBB,
                         const SmallVectorImpl<MachineOperand> &Cond,
-                        DebugLoc DL) const;
-  unsigned RemoveBranch(MachineBasicBlock &MBB) const;
-  bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const;
-
-  bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const;
-
-  /// Look through the instructions in this function and work out the largest
-  /// the stack frame can be while maintaining the ability to address local
-  /// slots with no complexities.
-  unsigned estimateRSStackLimit(MachineFunction &MF) const;
-
-  /// getAddressConstraints - For loads and stores (and PRFMs) taking an
-  /// immediate offset, this function determines the constraints required for
-  /// the immediate. It must satisfy:
-  ///    + MinOffset <= imm <= MaxOffset
-  ///    + imm % OffsetScale == 0
-  void getAddressConstraints(const MachineInstr &MI, int &AccessScale,
-                             int &MinOffset, int &MaxOffset) const;
-
-
-  unsigned getInstSizeInBytes(const MachineInstr &MI) const;
-
-  unsigned getInstBundleLength(const MachineInstr &MI) const;
-
+                        DebugLoc DL) const override;
+  bool
+  ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override;
+  bool canInsertSelect(const MachineBasicBlock &,
+                       const SmallVectorImpl<MachineOperand> &Cond, unsigned,
+                       unsigned, int &, int &, int &) const override;
+  void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+                    DebugLoc DL, unsigned DstReg,
+                    const SmallVectorImpl<MachineOperand> &Cond,
+                    unsigned TrueReg, unsigned FalseReg) const override;
+  void getNoopForMachoTarget(MCInst &NopInst) const override;
+
+  /// analyzeCompare - For a comparison instruction, return the source registers
+  /// in SrcReg and SrcReg2, and the value it compares against in CmpValue.
+  /// Return true if the comparison instruction can be analyzed.
+  bool analyzeCompare(const MachineInstr *MI, unsigned &SrcReg,
+                      unsigned &SrcReg2, int &CmpMask,
+                      int &CmpValue) const override;
+  /// optimizeCompareInstr - Convert the instruction supplying the argument to
+  /// the comparison into one that sets the zero bit in the flags register.
+  bool optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg,
+                            unsigned SrcReg2, int CmpMask, int CmpValue,
+                            const MachineRegisterInfo *MRI) const override;
+
+private:
+  void instantiateCondBranch(MachineBasicBlock &MBB, DebugLoc DL,
+                             MachineBasicBlock *TBB,
+                             const SmallVectorImpl<MachineOperand> &Cond) const;
 };
 
-bool rewriteA64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
-                          unsigned FrameReg, int &Offset,
-                          const AArch64InstrInfo &TII);
-
+/// emitFrameOffset - Emit instructions as needed to set DestReg to SrcReg
+/// plus Offset.  This is intended to be used from within the prolog/epilog
+/// insertion (PEI) pass, where a virtual scratch register may be allocated
+/// if necessary, to be replaced by the scavenger at the end of PEI.
+void emitFrameOffset(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+                     DebugLoc DL, unsigned DestReg, unsigned SrcReg, int Offset,
+                     const AArch64InstrInfo *TII,
+                     MachineInstr::MIFlag = MachineInstr::NoFlags,
+                     bool SetNZCV = false);
+
+/// rewriteAArch64FrameIndex - Rewrite MI to access 'Offset' bytes from the
+/// FP. Return false if the offset could not be handled directly in MI, and
+/// return the left-over portion by reference.
+bool rewriteAArch64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
+                            unsigned FrameReg, int &Offset,
+                            const AArch64InstrInfo *TII);
+
+/// \brief Use to report the frame offset status in isAArch64FrameOffsetLegal.
+enum AArch64FrameOffsetStatus {
+  AArch64FrameOffsetCannotUpdate = 0x0, ///< Offset cannot apply.
+  AArch64FrameOffsetIsLegal = 0x1,      ///< Offset is legal.
+  AArch64FrameOffsetCanUpdate = 0x2     ///< Offset can apply, at least partly.
+};
 
-void emitRegUpdate(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
-                   DebugLoc dl, const TargetInstrInfo &TII,
-                   unsigned DstReg, unsigned SrcReg, unsigned ScratchReg,
-                   int64_t NumBytes,
-                   MachineInstr::MIFlag MIFlags = MachineInstr::NoFlags);
+/// \brief Check if the @p Offset is a valid frame offset for @p MI.
+/// The returned value reports the validity of the frame offset for @p MI.
+/// It uses the values defined by AArch64FrameOffsetStatus for that.
+/// If result == AArch64FrameOffsetCannotUpdate, @p MI cannot be updated to
+/// use an offset.eq
+/// If result & AArch64FrameOffsetIsLegal, @p Offset can completely be
+/// rewriten in @p MI.
+/// If result & AArch64FrameOffsetCanUpdate, @p Offset contains the
+/// amount that is off the limit of the legal offset.
+/// If set, @p OutUseUnscaledOp will contain the whether @p MI should be
+/// turned into an unscaled operator, which opcode is in @p OutUnscaledOp.
+/// If set, @p EmittableOffset contains the amount that can be set in @p MI
+/// (possibly with @p OutUnscaledOp if OutUseUnscaledOp is true) and that
+/// is a legal offset.
+int isAArch64FrameOffsetLegal(const MachineInstr &MI, int &Offset,
+                            bool *OutUseUnscaledOp = nullptr,
+                            unsigned *OutUnscaledOp = nullptr,
+                            int *EmittableOffset = nullptr);
+
+static inline bool isUncondBranchOpcode(int Opc) { return Opc == AArch64::B; }
+
+static inline bool isCondBranchOpcode(int Opc) {
+  switch (Opc) {
+  case AArch64::Bcc:
+  case AArch64::CBZW:
+  case AArch64::CBZX:
+  case AArch64::CBNZW:
+  case AArch64::CBNZX:
+  case AArch64::TBZW:
+  case AArch64::TBZX:
+  case AArch64::TBNZW:
+  case AArch64::TBNZX:
+    return true;
+  default:
+    return false;
+  }
+}
 
-void emitSPUpdate(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
-                  DebugLoc dl, const TargetInstrInfo &TII,
-                  unsigned ScratchReg, int64_t NumBytes,
-                  MachineInstr::MIFlag MIFlags = MachineInstr::NoFlags);
+static inline bool isIndirectBranchOpcode(int Opc) { return Opc == AArch64::BR; }
 
-}
+} // end namespace llvm
 
 #endif
diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td
index 7d7a641..9ad36e8 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/lib/Target/AArch64/AArch64InstrInfo.td
@@ -1,4 +1,4 @@
-//===----- AArch64InstrInfo.td - AArch64 Instruction Info ----*- tablegen -*-=//
+//=- AArch64InstrInfo.td - Describe the AArch64 Instructions -*- tablegen -*-=//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,7 +7,7 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This file describes the AArch64 scalar instructions in TableGen format.
+// AArch64 Instruction definitions.
 //
 //===----------------------------------------------------------------------===//
 
@@ -19,5368 +19,5266 @@ def HasFPARMv8       : Predicate<"Subtarget->hasFPARMv8()">,
 def HasNEON          : Predicate<"Subtarget->hasNEON()">,
                                  AssemblerPredicate<"FeatureNEON", "neon">;
 def HasCrypto        : Predicate<"Subtarget->hasCrypto()">,
-                                 AssemblerPredicate<"FeatureCrypto","crypto">;
-
-// Use fused MAC if more precision in FP computation is allowed.
-def UseFusedMAC      : Predicate<"(TM.Options.AllowFPOpFusion =="
-                                 " FPOpFusion::Fast)">;
-include "AArch64InstrFormats.td"
-
-//===----------------------------------------------------------------------===//
-//  AArch64 specific pattern fragments.
-//
-// An 'fmul' node with a single use.
-def fmul_su : PatFrag<(ops node:$lhs, node:$rhs), (fmul node:$lhs, node:$rhs),[{
-  return N->hasOneUse();
-}]>;
-
+                                 AssemblerPredicate<"FeatureCrypto", "crypto">;
+def HasCRC           : Predicate<"Subtarget->hasCRC()">,
+                                 AssemblerPredicate<"FeatureCRC", "crc">;
+def IsLE             : Predicate<"Subtarget->isLittleEndian()">;
+def IsBE             : Predicate<"!Subtarget->isLittleEndian()">;
 
 //===----------------------------------------------------------------------===//
-// Target-specific ISD nodes and profiles
-//===----------------------------------------------------------------------===//
-
-def SDT_A64ret : SDTypeProfile<0, 0, []>;
-def A64ret : SDNode<"AArch64ISD::Ret", SDT_A64ret, [SDNPHasChain,
-                                                    SDNPOptInGlue,
-                                                    SDNPVariadic]>;
-
-// (ins NZCV, Condition, Dest)
-def SDT_A64br_cc : SDTypeProfile<0, 3, [SDTCisVT<0, i32>]>;
-def A64br_cc : SDNode<"AArch64ISD::BR_CC", SDT_A64br_cc, [SDNPHasChain]>;
-
-// (outs Result), (ins NZCV, IfTrue, IfFalse, Condition)
-def SDT_A64select_cc : SDTypeProfile<1, 4, [SDTCisVT<1, i32>,
-                                            SDTCisSameAs<0, 2>,
-                                            SDTCisSameAs<2, 3>]>;
-def A64select_cc : SDNode<"AArch64ISD::SELECT_CC", SDT_A64select_cc>;
-
-// (outs NZCV), (ins LHS, RHS, Condition)
-def SDT_A64setcc : SDTypeProfile<1, 3, [SDTCisVT<0, i32>,
-                                        SDTCisSameAs<1, 2>]>;
-def A64setcc : SDNode<"AArch64ISD::SETCC", SDT_A64setcc>;
-
-
-// (outs GPR64), (ins)
-def A64threadpointer : SDNode<"AArch64ISD::THREAD_POINTER", SDTPtrLeaf>;
-
-// A64 compares don't care about the cond really (they set all flags) so a
-// simple binary operator is useful.
-def A64cmp : PatFrag<(ops node:$lhs, node:$rhs),
-                     (A64setcc node:$lhs, node:$rhs, cond)>;
-
-
-// When matching a notional (CMP op1, (sub 0, op2)), we'd like to use a CMN
-// instruction on the grounds that "op1 - (-op2) == op1 + op2". However, the C
-// and V flags can be set differently by this operation. It comes down to
-// whether "SInt(~op2)+1 == SInt(~op2+1)" (and the same for UInt). If they are
-// then everything is fine. If not then the optimization is wrong. Thus general
-// comparisons are only valid if op2 != 0.
-
-// So, finally, the only LLVM-native comparisons that don't mention C and V are
-// SETEQ and SETNE. They're the only ones we can safely use CMN for in the
-// absence of information about op2.
-def equality_cond : PatLeaf<(cond), [{
-  return N->get() == ISD::SETEQ || N->get() == ISD::SETNE;
-}]>;
-
-def A64cmn : PatFrag<(ops node:$lhs, node:$rhs),
-                     (A64setcc node:$lhs, (sub 0, node:$rhs), equality_cond)>;
-
-// There are two layers of indirection here, driven by the following
-// considerations.
-//     + TableGen does not know CodeModel or Reloc so that decision should be
-//       made for a variable/address at ISelLowering.
-//     + The output of ISelLowering should be selectable (hence the Wrapper,
-//       rather than a bare target opcode)
-def SDTAArch64WrapperLarge : SDTypeProfile<1, 4, [SDTCisSameAs<0, 1>,
-                                                  SDTCisSameAs<0, 2>,
-                                                  SDTCisSameAs<0, 3>,
-                                                  SDTCisSameAs<0, 4>,
-                                                  SDTCisPtrTy<0>]>;
-
-def A64WrapperLarge :SDNode<"AArch64ISD::WrapperLarge", SDTAArch64WrapperLarge>;
-
-def SDTAArch64WrapperSmall : SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>,
-                                                  SDTCisSameAs<1, 2>,
-                                                  SDTCisVT<3, i32>,
-                                                  SDTCisPtrTy<0>]>;
-
-def A64WrapperSmall :SDNode<"AArch64ISD::WrapperSmall", SDTAArch64WrapperSmall>;
-
-
-def SDTAArch64GOTLoad : SDTypeProfile<1, 1, [SDTCisPtrTy<0>, SDTCisPtrTy<1>]>;
-def A64GOTLoad : SDNode<"AArch64ISD::GOTLoad", SDTAArch64GOTLoad,
-                        [SDNPHasChain]>;
-
-
-// (A64BFI LHS, RHS, LSB, Width)
-def SDTA64BFI : SDTypeProfile<1, 4, [SDTCisSameAs<0, 1>,
-                                     SDTCisSameAs<1, 2>,
-                                     SDTCisVT<3, i64>,
-                                     SDTCisVT<4, i64>]>;
-
-def A64Bfi : SDNode<"AArch64ISD::BFI", SDTA64BFI>;
-
-// (A64EXTR HiReg, LoReg, LSB)
-def SDTA64EXTR : SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>,
-                                      SDTCisVT<3, i64>]>;
-def A64Extr : SDNode<"AArch64ISD::EXTR", SDTA64EXTR>;
-
-// (A64[SU]BFX Field, ImmR, ImmS).
+// AArch64-specific DAG Nodes.
 //
-// Note that ImmR and ImmS are already encoded for the actual instructions. The
-// more natural LSB and Width mix together to form ImmR and ImmS, something
-// which TableGen can't handle.
-def SDTA64BFX : SDTypeProfile<1, 3, [SDTCisVT<2, i64>, SDTCisVT<3, i64>]>;
-def A64Sbfx : SDNode<"AArch64ISD::SBFX", SDTA64BFX>;
 
-def A64Ubfx : SDNode<"AArch64ISD::UBFX", SDTA64BFX>;
+// SDTBinaryArithWithFlagsOut - RES1, FLAGS = op LHS, RHS
+def SDTBinaryArithWithFlagsOut : SDTypeProfile<2, 2,
+                                              [SDTCisSameAs<0, 2>,
+                                               SDTCisSameAs<0, 3>,
+                                               SDTCisInt<0>, SDTCisVT<1, i32>]>;
+
+// SDTBinaryArithWithFlagsIn - RES1, FLAGS = op LHS, RHS, FLAGS
+def SDTBinaryArithWithFlagsIn : SDTypeProfile<1, 3,
+                                            [SDTCisSameAs<0, 1>,
+                                             SDTCisSameAs<0, 2>,
+                                             SDTCisInt<0>,
+                                             SDTCisVT<3, i32>]>;
+
+// SDTBinaryArithWithFlagsInOut - RES1, FLAGS = op LHS, RHS, FLAGS
+def SDTBinaryArithWithFlagsInOut : SDTypeProfile<2, 3,
+                                            [SDTCisSameAs<0, 2>,
+                                             SDTCisSameAs<0, 3>,
+                                             SDTCisInt<0>,
+                                             SDTCisVT<1, i32>,
+                                             SDTCisVT<4, i32>]>;
+
+def SDT_AArch64Brcond  : SDTypeProfile<0, 3,
+                                     [SDTCisVT<0, OtherVT>, SDTCisVT<1, i32>,
+                                      SDTCisVT<2, i32>]>;
+def SDT_AArch64cbz : SDTypeProfile<0, 2, [SDTCisInt<0>, SDTCisVT<1, OtherVT>]>;
+def SDT_AArch64tbz : SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisInt<1>,
+                                        SDTCisVT<2, OtherVT>]>;
+
+
+def SDT_AArch64CSel  : SDTypeProfile<1, 4,
+                                   [SDTCisSameAs<0, 1>,
+                                    SDTCisSameAs<0, 2>,
+                                    SDTCisInt<3>,
+                                    SDTCisVT<4, i32>]>;
+def SDT_AArch64FCmp   : SDTypeProfile<0, 2,
+                                   [SDTCisFP<0>,
+                                    SDTCisSameAs<0, 1>]>;
+def SDT_AArch64Dup   : SDTypeProfile<1, 1, [SDTCisVec<0>]>;
+def SDT_AArch64DupLane   : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisInt<2>]>;
+def SDT_AArch64Zip   : SDTypeProfile<1, 2, [SDTCisVec<0>,
+                                          SDTCisSameAs<0, 1>,
+                                          SDTCisSameAs<0, 2>]>;
+def SDT_AArch64MOVIedit : SDTypeProfile<1, 1, [SDTCisInt<1>]>;
+def SDT_AArch64MOVIshift : SDTypeProfile<1, 2, [SDTCisInt<1>, SDTCisInt<2>]>;
+def SDT_AArch64vecimm : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+                                           SDTCisInt<2>, SDTCisInt<3>]>;
+def SDT_AArch64UnaryVec: SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0,1>]>;
+def SDT_AArch64ExtVec: SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+                                          SDTCisSameAs<0,2>, SDTCisInt<3>]>;
+def SDT_AArch64vshift : SDTypeProfile<1, 2, [SDTCisSameAs<0,1>, SDTCisInt<2>]>;
+
+def SDT_AArch64unvec : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0,1>]>;
+def SDT_AArch64fcmpz : SDTypeProfile<1, 1, []>;
+def SDT_AArch64fcmp  : SDTypeProfile<1, 2, [SDTCisSameAs<1,2>]>;
+def SDT_AArch64binvec : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+                                           SDTCisSameAs<0,2>]>;
+def SDT_AArch64trivec : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+                                           SDTCisSameAs<0,2>,
+                                           SDTCisSameAs<0,3>]>;
+def SDT_AArch64TCRET : SDTypeProfile<0, 2, [SDTCisPtrTy<0>]>;
+def SDT_AArch64PREFETCH : SDTypeProfile<0, 2, [SDTCisVT<0, i32>, SDTCisPtrTy<1>]>;
+
+def SDT_AArch64ITOF  : SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisSameAs<0,1>]>;
+
+def SDT_AArch64TLSDescCall : SDTypeProfile<0, -2, [SDTCisPtrTy<0>,
+                                                 SDTCisPtrTy<1>]>;
+def SDT_AArch64WrapperLarge : SDTypeProfile<1, 4,
+                                        [SDTCisVT<0, i64>, SDTCisVT<1, i32>,
+                                         SDTCisSameAs<1, 2>, SDTCisSameAs<1, 3>,
+                                         SDTCisSameAs<1, 4>]>;
+
+
+// Node definitions.
+def AArch64adrp          : SDNode<"AArch64ISD::ADRP", SDTIntUnaryOp, []>;
+def AArch64addlow        : SDNode<"AArch64ISD::ADDlow", SDTIntBinOp, []>;
+def AArch64LOADgot       : SDNode<"AArch64ISD::LOADgot", SDTIntUnaryOp>;
+def AArch64callseq_start : SDNode<"ISD::CALLSEQ_START",
+                                SDCallSeqStart<[ SDTCisVT<0, i32> ]>,
+                                [SDNPHasChain, SDNPOutGlue]>;
+def AArch64callseq_end   : SDNode<"ISD::CALLSEQ_END",
+                                SDCallSeqEnd<[ SDTCisVT<0, i32>,
+                                               SDTCisVT<1, i32> ]>,
+                                [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
+def AArch64call          : SDNode<"AArch64ISD::CALL",
+                                SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>,
+                                [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+                                 SDNPVariadic]>;
+def AArch64brcond        : SDNode<"AArch64ISD::BRCOND", SDT_AArch64Brcond,
+                                [SDNPHasChain]>;
+def AArch64cbz           : SDNode<"AArch64ISD::CBZ", SDT_AArch64cbz,
+                                [SDNPHasChain]>;
+def AArch64cbnz           : SDNode<"AArch64ISD::CBNZ", SDT_AArch64cbz,
+                                [SDNPHasChain]>;
+def AArch64tbz           : SDNode<"AArch64ISD::TBZ", SDT_AArch64tbz,
+                                [SDNPHasChain]>;
+def AArch64tbnz           : SDNode<"AArch64ISD::TBNZ", SDT_AArch64tbz,
+                                [SDNPHasChain]>;
+
+
+def AArch64csel          : SDNode<"AArch64ISD::CSEL", SDT_AArch64CSel>;
+def AArch64csinv         : SDNode<"AArch64ISD::CSINV", SDT_AArch64CSel>;
+def AArch64csneg         : SDNode<"AArch64ISD::CSNEG", SDT_AArch64CSel>;
+def AArch64csinc         : SDNode<"AArch64ISD::CSINC", SDT_AArch64CSel>;
+def AArch64retflag       : SDNode<"AArch64ISD::RET_FLAG", SDTNone,
+                                [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+def AArch64adc       : SDNode<"AArch64ISD::ADC",  SDTBinaryArithWithFlagsIn >;
+def AArch64sbc       : SDNode<"AArch64ISD::SBC",  SDTBinaryArithWithFlagsIn>;
+def AArch64add_flag  : SDNode<"AArch64ISD::ADDS",  SDTBinaryArithWithFlagsOut,
+                            [SDNPCommutative]>;
+def AArch64sub_flag  : SDNode<"AArch64ISD::SUBS",  SDTBinaryArithWithFlagsOut>;
+def AArch64and_flag  : SDNode<"AArch64ISD::ANDS",  SDTBinaryArithWithFlagsOut,
+                            [SDNPCommutative]>;
+def AArch64adc_flag  : SDNode<"AArch64ISD::ADCS",  SDTBinaryArithWithFlagsInOut>;
+def AArch64sbc_flag  : SDNode<"AArch64ISD::SBCS",  SDTBinaryArithWithFlagsInOut>;
+
+def AArch64threadpointer : SDNode<"AArch64ISD::THREAD_POINTER", SDTPtrLeaf>;
+
+def AArch64fcmp      : SDNode<"AArch64ISD::FCMP", SDT_AArch64FCmp>;
+
+def AArch64fmax      : SDNode<"AArch64ISD::FMAX", SDTFPBinOp>;
+def AArch64fmin      : SDNode<"AArch64ISD::FMIN", SDTFPBinOp>;
+
+def AArch64dup       : SDNode<"AArch64ISD::DUP", SDT_AArch64Dup>;
+def AArch64duplane8  : SDNode<"AArch64ISD::DUPLANE8", SDT_AArch64DupLane>;
+def AArch64duplane16 : SDNode<"AArch64ISD::DUPLANE16", SDT_AArch64DupLane>;
+def AArch64duplane32 : SDNode<"AArch64ISD::DUPLANE32", SDT_AArch64DupLane>;
+def AArch64duplane64 : SDNode<"AArch64ISD::DUPLANE64", SDT_AArch64DupLane>;
+
+def AArch64zip1      : SDNode<"AArch64ISD::ZIP1", SDT_AArch64Zip>;
+def AArch64zip2      : SDNode<"AArch64ISD::ZIP2", SDT_AArch64Zip>;
+def AArch64uzp1      : SDNode<"AArch64ISD::UZP1", SDT_AArch64Zip>;
+def AArch64uzp2      : SDNode<"AArch64ISD::UZP2", SDT_AArch64Zip>;
+def AArch64trn1      : SDNode<"AArch64ISD::TRN1", SDT_AArch64Zip>;
+def AArch64trn2      : SDNode<"AArch64ISD::TRN2", SDT_AArch64Zip>;
+
+def AArch64movi_edit : SDNode<"AArch64ISD::MOVIedit", SDT_AArch64MOVIedit>;
+def AArch64movi_shift : SDNode<"AArch64ISD::MOVIshift", SDT_AArch64MOVIshift>;
+def AArch64movi_msl : SDNode<"AArch64ISD::MOVImsl", SDT_AArch64MOVIshift>;
+def AArch64mvni_shift : SDNode<"AArch64ISD::MVNIshift", SDT_AArch64MOVIshift>;
+def AArch64mvni_msl : SDNode<"AArch64ISD::MVNImsl", SDT_AArch64MOVIshift>;
+def AArch64movi : SDNode<"AArch64ISD::MOVI", SDT_AArch64MOVIedit>;
+def AArch64fmov : SDNode<"AArch64ISD::FMOV", SDT_AArch64MOVIedit>;
+
+def AArch64rev16 : SDNode<"AArch64ISD::REV16", SDT_AArch64UnaryVec>;
+def AArch64rev32 : SDNode<"AArch64ISD::REV32", SDT_AArch64UnaryVec>;
+def AArch64rev64 : SDNode<"AArch64ISD::REV64", SDT_AArch64UnaryVec>;
+def AArch64ext : SDNode<"AArch64ISD::EXT", SDT_AArch64ExtVec>;
+
+def AArch64vashr : SDNode<"AArch64ISD::VASHR", SDT_AArch64vshift>;
+def AArch64vlshr : SDNode<"AArch64ISD::VLSHR", SDT_AArch64vshift>;
+def AArch64vshl : SDNode<"AArch64ISD::VSHL", SDT_AArch64vshift>;
+def AArch64sqshli : SDNode<"AArch64ISD::SQSHL_I", SDT_AArch64vshift>;
+def AArch64uqshli : SDNode<"AArch64ISD::UQSHL_I", SDT_AArch64vshift>;
+def AArch64sqshlui : SDNode<"AArch64ISD::SQSHLU_I", SDT_AArch64vshift>;
+def AArch64srshri : SDNode<"AArch64ISD::SRSHR_I", SDT_AArch64vshift>;
+def AArch64urshri : SDNode<"AArch64ISD::URSHR_I", SDT_AArch64vshift>;
+
+def AArch64not: SDNode<"AArch64ISD::NOT", SDT_AArch64unvec>;
+def AArch64bit: SDNode<"AArch64ISD::BIT", SDT_AArch64trivec>;
+def AArch64bsl: SDNode<"AArch64ISD::BSL", SDT_AArch64trivec>;
+
+def AArch64cmeq: SDNode<"AArch64ISD::CMEQ", SDT_AArch64binvec>;
+def AArch64cmge: SDNode<"AArch64ISD::CMGE", SDT_AArch64binvec>;
+def AArch64cmgt: SDNode<"AArch64ISD::CMGT", SDT_AArch64binvec>;
+def AArch64cmhi: SDNode<"AArch64ISD::CMHI", SDT_AArch64binvec>;
+def AArch64cmhs: SDNode<"AArch64ISD::CMHS", SDT_AArch64binvec>;
+
+def AArch64fcmeq: SDNode<"AArch64ISD::FCMEQ", SDT_AArch64fcmp>;
+def AArch64fcmge: SDNode<"AArch64ISD::FCMGE", SDT_AArch64fcmp>;
+def AArch64fcmgt: SDNode<"AArch64ISD::FCMGT", SDT_AArch64fcmp>;
+
+def AArch64cmeqz: SDNode<"AArch64ISD::CMEQz", SDT_AArch64unvec>;
+def AArch64cmgez: SDNode<"AArch64ISD::CMGEz", SDT_AArch64unvec>;
+def AArch64cmgtz: SDNode<"AArch64ISD::CMGTz", SDT_AArch64unvec>;
+def AArch64cmlez: SDNode<"AArch64ISD::CMLEz", SDT_AArch64unvec>;
+def AArch64cmltz: SDNode<"AArch64ISD::CMLTz", SDT_AArch64unvec>;
+def AArch64cmtst : PatFrag<(ops node:$LHS, node:$RHS),
+                        (AArch64not (AArch64cmeqz (and node:$LHS, node:$RHS)))>;
+
+def AArch64fcmeqz: SDNode<"AArch64ISD::FCMEQz", SDT_AArch64fcmpz>;
+def AArch64fcmgez: SDNode<"AArch64ISD::FCMGEz", SDT_AArch64fcmpz>;
+def AArch64fcmgtz: SDNode<"AArch64ISD::FCMGTz", SDT_AArch64fcmpz>;
+def AArch64fcmlez: SDNode<"AArch64ISD::FCMLEz", SDT_AArch64fcmpz>;
+def AArch64fcmltz: SDNode<"AArch64ISD::FCMLTz", SDT_AArch64fcmpz>;
+
+def AArch64bici: SDNode<"AArch64ISD::BICi", SDT_AArch64vecimm>;
+def AArch64orri: SDNode<"AArch64ISD::ORRi", SDT_AArch64vecimm>;
+
+def AArch64neg : SDNode<"AArch64ISD::NEG", SDT_AArch64unvec>;
+
+def AArch64tcret: SDNode<"AArch64ISD::TC_RETURN", SDT_AArch64TCRET,
+                  [SDNPHasChain,  SDNPOptInGlue, SDNPVariadic]>;
+
+def AArch64Prefetch        : SDNode<"AArch64ISD::PREFETCH", SDT_AArch64PREFETCH,
+                               [SDNPHasChain, SDNPSideEffect]>;
+
+def AArch64sitof: SDNode<"AArch64ISD::SITOF", SDT_AArch64ITOF>;
+def AArch64uitof: SDNode<"AArch64ISD::UITOF", SDT_AArch64ITOF>;
+
+def AArch64tlsdesc_call : SDNode<"AArch64ISD::TLSDESC_CALL",
+                                 SDT_AArch64TLSDescCall,
+                                 [SDNPInGlue, SDNPOutGlue, SDNPHasChain,
+                                  SDNPVariadic]>;
+
+def AArch64WrapperLarge : SDNode<"AArch64ISD::WrapperLarge",
+                                 SDT_AArch64WrapperLarge>;
 
-class BinOpFrag<dag res> : PatFrag<(ops node:$LHS, node:$RHS), res>;
 
 //===----------------------------------------------------------------------===//
-// Call sequence pseudo-instructions
-//===----------------------------------------------------------------------===//
 
+//===----------------------------------------------------------------------===//
 
-def SDT_AArch64Call : SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>;
-def AArch64Call : SDNode<"AArch64ISD::Call", SDT_AArch64Call,
-                     [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, SDNPVariadic]>;
-
-def AArch64tcret : SDNode<"AArch64ISD::TC_RETURN", SDT_AArch64Call,
-                          [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
-
-// The TLSDESCCALL node is a variant call which goes to an indirectly calculated
-// destination but needs a relocation against a fixed symbol. As such it has two
-// certain operands: the callee and the relocated variable.
+// AArch64 Instruction Predicate Definitions.
 //
-// The TLS ABI only allows it to be selected to a BLR instructin (with
-// appropriate relocation).
-def SDTTLSDescCall : SDTypeProfile<0, -2, [SDTCisPtrTy<0>, SDTCisPtrTy<1>]>;
-
-def A64tlsdesc_blr : SDNode<"AArch64ISD::TLSDESCCALL", SDTTLSDescCall,
-                            [SDNPInGlue, SDNPOutGlue, SDNPHasChain,
-                             SDNPVariadic]>;
-
-
-def SDT_AArch64CallSeqStart : SDCallSeqStart<[ SDTCisPtrTy<0> ]>;
-def AArch64callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_AArch64CallSeqStart,
-                                  [SDNPHasChain, SDNPOutGlue]>;
+def HasZCZ    : Predicate<"Subtarget->hasZeroCycleZeroing()">;
+def NoZCZ     : Predicate<"!Subtarget->hasZeroCycleZeroing()">;
+def IsDarwin  : Predicate<"Subtarget->isTargetDarwin()">;
+def IsNotDarwin: Predicate<"!Subtarget->isTargetDarwin()">;
+def ForCodeSize   : Predicate<"ForCodeSize">;
+def NotForCodeSize   : Predicate<"!ForCodeSize">;
 
-def SDT_AArch64CallSeqEnd   : SDCallSeqEnd<[ SDTCisPtrTy<0>, SDTCisPtrTy<1> ]>;
-def AArch64callseq_end : SDNode<"ISD::CALLSEQ_END",   SDT_AArch64CallSeqEnd,
-                                [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
-
-
-
-// These pseudo-instructions have special semantics by virtue of being passed to
-// the InstrInfo constructor. CALLSEQ_START/CALLSEQ_END are produced by
-// LowerCall to (in our case) tell the back-end about stack adjustments for
-// arguments passed on the stack. Here we select those markers to
-// pseudo-instructions which explicitly set the stack, and finally in the
-// RegisterInfo we convert them to a true stack adjustment.
-let Defs = [XSP], Uses = [XSP] in {
-  def ADJCALLSTACKDOWN : PseudoInst<(outs), (ins i64imm:$amt),
-                                    [(AArch64callseq_start timm:$amt)]>;
-
-  def ADJCALLSTACKUP : PseudoInst<(outs), (ins i64imm:$amt1, i64imm:$amt2),
-                                 [(AArch64callseq_end timm:$amt1, timm:$amt2)]>;
-}
+include "AArch64InstrFormats.td"
 
 //===----------------------------------------------------------------------===//
-// Atomic operation pseudo-instructions
-//===----------------------------------------------------------------------===//
-
-// These get selected from C++ code as a pretty much direct translation from the
-// generic DAG nodes. The one exception is the AtomicOrdering is added as an
-// operand so that the eventual lowering can make use of it and choose
-// acquire/release operations when required.
-
-let usesCustomInserter = 1, hasCtrlDep = 1, mayLoad = 1, mayStore = 1 in {
-multiclass AtomicSizes {
-  def _I8 : PseudoInst<(outs GPR32:$dst),
-                       (ins GPR64xsp:$ptr, GPR32:$incr, i32imm:$ordering), []>;
-  def _I16 : PseudoInst<(outs GPR32:$dst),
-                        (ins GPR64xsp:$ptr, GPR32:$incr, i32imm:$ordering), []>;
-  def _I32 : PseudoInst<(outs GPR32:$dst),
-                        (ins GPR64xsp:$ptr, GPR32:$incr, i32imm:$ordering), []>;
-  def _I64 : PseudoInst<(outs GPR64:$dst),
-                        (ins GPR64xsp:$ptr, GPR64:$incr, i32imm:$ordering), []>;
-}
-}
-
-defm ATOMIC_LOAD_ADD  : AtomicSizes;
-defm ATOMIC_LOAD_SUB  : AtomicSizes;
-defm ATOMIC_LOAD_AND  : AtomicSizes;
-defm ATOMIC_LOAD_OR   : AtomicSizes;
-defm ATOMIC_LOAD_XOR  : AtomicSizes;
-defm ATOMIC_LOAD_NAND : AtomicSizes;
-defm ATOMIC_SWAP      : AtomicSizes;
-let Defs = [NZCV] in {
-  // These operations need a CMP to calculate the correct value
-  defm ATOMIC_LOAD_MIN  : AtomicSizes;
-  defm ATOMIC_LOAD_MAX  : AtomicSizes;
-  defm ATOMIC_LOAD_UMIN : AtomicSizes;
-  defm ATOMIC_LOAD_UMAX : AtomicSizes;
-}
-
-class AtomicCmpSwap<RegisterClass GPRData>
-  : PseudoInst<(outs GPRData:$dst),
-               (ins GPR64xsp:$ptr, GPRData:$old, GPRData:$new,
-                    i32imm:$ordering), []> {
-  let usesCustomInserter = 1;
-  let hasCtrlDep = 1;
-  let mayLoad = 1;
-  let mayStore = 1;
-  let Defs = [NZCV];
-}
-
-def ATOMIC_CMP_SWAP_I8  : AtomicCmpSwap<GPR32>;
-def ATOMIC_CMP_SWAP_I16 : AtomicCmpSwap<GPR32>;
-def ATOMIC_CMP_SWAP_I32 : AtomicCmpSwap<GPR32>;
-def ATOMIC_CMP_SWAP_I64 : AtomicCmpSwap<GPR64>;
 
 //===----------------------------------------------------------------------===//
-// Add-subtract (extended register) instructions
+// Miscellaneous instructions.
 //===----------------------------------------------------------------------===//
-// Contains: ADD, ADDS, SUB, SUBS + aliases CMN, CMP
-
-// The RHS of these operations is conceptually a sign/zero-extended
-// register, optionally shifted left by 1-4. The extension can be a
-// NOP (e.g. "sxtx" sign-extending a 64-bit register to 64-bits) but
-// must be specified with one exception:
-
-// If one of the registers is sp/wsp then LSL is an alias for UXTW in
-// 32-bit instructions and UXTX in 64-bit versions, the shift amount
-// is not optional in that case (but can explicitly be 0), and the
-// entire suffix can be skipped (e.g. "add sp, x3, x2").
-
-multiclass extend_operands<string PREFIX, string Diag> {
-     def _asmoperand : AsmOperandClass {
-         let Name = PREFIX;
-         let RenderMethod = "addRegExtendOperands";
-         let PredicateMethod = "isRegExtend<A64SE::" # PREFIX # ">";
-         let DiagnosticType = "AddSubRegExtend" # Diag;
-     }
-
-     def _operand : Operand<i64>,
-                    ImmLeaf<i64, [{ return Imm >= 0 && Imm <= 4; }]> {
-         let PrintMethod = "printRegExtendOperand<A64SE::" # PREFIX # ">";
-         let DecoderMethod = "DecodeRegExtendOperand";
-         let ParserMatchClass = !cast<AsmOperandClass>(PREFIX # "_asmoperand");
-     }
-}
-
-defm UXTB : extend_operands<"UXTB", "Small">;
-defm UXTH : extend_operands<"UXTH", "Small">;
-defm UXTW : extend_operands<"UXTW", "Small">;
-defm UXTX : extend_operands<"UXTX", "Large">;
-defm SXTB : extend_operands<"SXTB", "Small">;
-defm SXTH : extend_operands<"SXTH", "Small">;
-defm SXTW : extend_operands<"SXTW", "Small">;
-defm SXTX : extend_operands<"SXTX", "Large">;
-
-def LSL_extasmoperand : AsmOperandClass {
-    let Name = "RegExtendLSL";
-    let RenderMethod = "addRegExtendOperands";
-    let DiagnosticType = "AddSubRegExtendLarge";
-}
-
-def LSL_extoperand : Operand<i64> {
-    let ParserMatchClass = LSL_extasmoperand;
-}
-
-
-// The patterns for various sign-extensions are a little ugly and
-// non-uniform because everything has already been promoted to the
-// legal i64 and i32 types. We'll wrap the various variants up in a
-// class for use later.
-class extend_types {
-    dag uxtb; dag uxth; dag uxtw; dag uxtx;
-    dag sxtb; dag sxth; dag sxtw; dag sxtx;
-    ValueType ty;
-    RegisterClass GPR;
-}
-
-def extends_to_i64 : extend_types {
-    let uxtb = (and (anyext i32:$Rm), 255);
-    let uxth = (and (anyext i32:$Rm), 65535);
-    let uxtw = (zext i32:$Rm);
-    let uxtx = (i64 $Rm);
 
-    let sxtb = (sext_inreg (anyext i32:$Rm), i8);
-    let sxth = (sext_inreg (anyext i32:$Rm), i16);
-    let sxtw = (sext i32:$Rm);
-    let sxtx = (i64 $Rm);
-
-    let ty = i64;
-    let GPR = GPR64xsp;
-}
-
-
-def extends_to_i32 : extend_types {
-    let uxtb = (and i32:$Rm, 255);
-    let uxth = (and i32:$Rm, 65535);
-    let uxtw = (i32 i32:$Rm);
-    let uxtx = (i32 i32:$Rm);
-
-    let sxtb = (sext_inreg i32:$Rm, i8);
-    let sxth = (sext_inreg i32:$Rm, i16);
-    let sxtw = (i32 i32:$Rm);
-    let sxtx = (i32 i32:$Rm);
-
-    let ty = i32;
-    let GPR = GPR32wsp;
-}
-
-// Now, six of the extensions supported are easy and uniform: if the source size
-// is 32-bits or less, then Rm is always a 32-bit register. We'll instantiate
-// those instructions in one block.
-
-// The uxtx/sxtx could potentially be merged in, but three facts dissuaded me:
-//     + It would break the naming scheme: either ADDxx_uxtx or ADDww_uxtx would
-//       be impossible.
-//     + Patterns are very different as well.
-//     + Passing different registers would be ugly (more fields in extend_types
-//       would probably be the best option).
-multiclass addsub_exts<bit sf, bit op, bit S, string asmop,
-                       SDPatternOperator opfrag,
-                       dag outs, extend_types exts> {
-    def w_uxtb : A64I_addsubext<sf, op, S, 0b00, 0b000,
-                    outs, (ins exts.GPR:$Rn, GPR32:$Rm, UXTB_operand:$Imm3),
-                    !strconcat(asmop, "$Rn, $Rm, $Imm3"),
-                    [(opfrag exts.ty:$Rn, (shl exts.uxtb, UXTB_operand:$Imm3))],
-                    NoItinerary>,
-                 Sched<[WriteALU, ReadALU, ReadALU]>;
-    def w_uxth : A64I_addsubext<sf, op, S, 0b00, 0b001,
-                    outs, (ins exts.GPR:$Rn, GPR32:$Rm, UXTH_operand:$Imm3),
-                    !strconcat(asmop, "$Rn, $Rm, $Imm3"),
-                    [(opfrag exts.ty:$Rn, (shl exts.uxth, UXTH_operand:$Imm3))],
-                    NoItinerary>,
-                 Sched<[WriteALU, ReadALU, ReadALU]>;
-    def w_uxtw : A64I_addsubext<sf, op, S, 0b00, 0b010,
-                    outs, (ins exts.GPR:$Rn, GPR32:$Rm, UXTW_operand:$Imm3),
-                    !strconcat(asmop, "$Rn, $Rm, $Imm3"),
-                    [(opfrag exts.ty:$Rn, (shl exts.uxtw, UXTW_operand:$Imm3))],
-                    NoItinerary>,
-                 Sched<[WriteALU, ReadALU, ReadALU]>;
-
-    def w_sxtb : A64I_addsubext<sf, op, S, 0b00, 0b100,
-                    outs, (ins exts.GPR:$Rn, GPR32:$Rm, SXTB_operand:$Imm3),
-                    !strconcat(asmop, "$Rn, $Rm, $Imm3"),
-                    [(opfrag exts.ty:$Rn, (shl exts.sxtb, SXTB_operand:$Imm3))],
-                    NoItinerary>,
-                 Sched<[WriteALU, ReadALU, ReadALU]>;
-    def w_sxth : A64I_addsubext<sf, op, S, 0b00, 0b101,
-                    outs, (ins exts.GPR:$Rn, GPR32:$Rm, SXTH_operand:$Imm3),
-                    !strconcat(asmop, "$Rn, $Rm, $Imm3"),
-                    [(opfrag exts.ty:$Rn, (shl exts.sxth, SXTH_operand:$Imm3))],
-                    NoItinerary>,
-                 Sched<[WriteALU, ReadALU, ReadALU]>;
-    def w_sxtw : A64I_addsubext<sf, op, S, 0b00, 0b110,
-                    outs, (ins exts.GPR:$Rn, GPR32:$Rm, SXTW_operand:$Imm3),
-                    !strconcat(asmop, "$Rn, $Rm, $Imm3"),
-                    [(opfrag exts.ty:$Rn, (shl exts.sxtw, SXTW_operand:$Imm3))],
-                    NoItinerary>,
-                 Sched<[WriteALU, ReadALU, ReadALU]>;
-}
-
-// These two could be merge in with the above, but their patterns aren't really
-// necessary and the naming-scheme would necessarily break:
-multiclass addsub_xxtx<bit op, bit S, string asmop, SDPatternOperator opfrag,
-                       dag outs> {
-    def x_uxtx : A64I_addsubext<0b1, op, S, 0b00, 0b011,
-                   outs,
-                   (ins GPR64xsp:$Rn, GPR64:$Rm, UXTX_operand:$Imm3),
-                   !strconcat(asmop, "$Rn, $Rm, $Imm3"),
-                   [(opfrag i64:$Rn, (shl i64:$Rm, UXTX_operand:$Imm3))],
-                   NoItinerary>,
-                 Sched<[WriteALU, ReadALU, ReadALU]>;
-
-    def x_sxtx : A64I_addsubext<0b1, op, S, 0b00, 0b111,
-                   outs,
-                   (ins GPR64xsp:$Rn, GPR64:$Rm, SXTX_operand:$Imm3),
-                   !strconcat(asmop, "$Rn, $Rm, $Imm3"),
-                   [/* No Pattern: same as uxtx */],
-                   NoItinerary>,
-                 Sched<[WriteALU, ReadALU, ReadALU]>;
-}
-
-multiclass addsub_wxtx<bit op, bit S, string asmop, dag outs> {
-    def w_uxtx : A64I_addsubext<0b0, op, S, 0b00, 0b011,
-                   outs, (ins GPR32wsp:$Rn, GPR32:$Rm, UXTX_operand:$Imm3),
-                   !strconcat(asmop, "$Rn, $Rm, $Imm3"),
-                   [/* No pattern: probably same as uxtw */],
-                   NoItinerary>,
-                 Sched<[WriteALU, ReadALU, ReadALU]>;
-
-    def w_sxtx : A64I_addsubext<0b0, op, S, 0b00, 0b111,
-                   outs, (ins GPR32wsp:$Rn, GPR32:$Rm, SXTX_operand:$Imm3),
-                   !strconcat(asmop, "$Rn, $Rm, $Imm3"),
-                   [/* No Pattern: probably same as uxtw */],
-                   NoItinerary>,
-                 Sched<[WriteALU, ReadALU, ReadALU]>;
-}
-
-class SetRD<RegisterClass RC, SDPatternOperator op>
- : PatFrag<(ops node:$lhs, node:$rhs), (set RC:$Rd, (op node:$lhs, node:$rhs))>;
-class SetNZCV<SDPatternOperator op>
-  : PatFrag<(ops node:$lhs, node:$rhs), (set NZCV, (op node:$lhs, node:$rhs))>;
-
-defm ADDxx :addsub_exts<0b1, 0b0, 0b0, "add\t$Rd, ", SetRD<GPR64xsp, add>,
-                        (outs GPR64xsp:$Rd), extends_to_i64>,
-            addsub_xxtx<     0b0, 0b0, "add\t$Rd, ", SetRD<GPR64xsp, add>,
-                        (outs GPR64xsp:$Rd)>;
-defm ADDww :addsub_exts<0b0, 0b0, 0b0, "add\t$Rd, ", SetRD<GPR32wsp, add>,
-                        (outs GPR32wsp:$Rd), extends_to_i32>,
-            addsub_wxtx<     0b0, 0b0, "add\t$Rd, ",
-                        (outs GPR32wsp:$Rd)>;
-defm SUBxx :addsub_exts<0b1, 0b1, 0b0, "sub\t$Rd, ", SetRD<GPR64xsp, sub>,
-                        (outs GPR64xsp:$Rd), extends_to_i64>,
-            addsub_xxtx<     0b1, 0b0, "sub\t$Rd, ", SetRD<GPR64xsp, sub>,
-                        (outs GPR64xsp:$Rd)>;
-defm SUBww :addsub_exts<0b0, 0b1, 0b0, "sub\t$Rd, ", SetRD<GPR32wsp, sub>,
-                        (outs GPR32wsp:$Rd), extends_to_i32>,
-            addsub_wxtx<     0b1, 0b0, "sub\t$Rd, ",
-                        (outs GPR32wsp:$Rd)>;
-
-let Defs = [NZCV] in {
-defm ADDSxx :addsub_exts<0b1, 0b0, 0b1, "adds\t$Rd, ", SetRD<GPR64, addc>,
-                         (outs GPR64:$Rd), extends_to_i64>,
-             addsub_xxtx<     0b0, 0b1, "adds\t$Rd, ", SetRD<GPR64, addc>,
-                         (outs GPR64:$Rd)>;
-defm ADDSww :addsub_exts<0b0, 0b0, 0b1, "adds\t$Rd, ", SetRD<GPR32, addc>,
-                         (outs GPR32:$Rd), extends_to_i32>,
-             addsub_wxtx<     0b0, 0b1, "adds\t$Rd, ",
-                         (outs GPR32:$Rd)>;
-defm SUBSxx :addsub_exts<0b1, 0b1, 0b1, "subs\t$Rd, ", SetRD<GPR64, subc>,
-                         (outs GPR64:$Rd), extends_to_i64>,
-             addsub_xxtx<     0b1, 0b1, "subs\t$Rd, ", SetRD<GPR64, subc>,
-                         (outs GPR64:$Rd)>;
-defm SUBSww :addsub_exts<0b0, 0b1, 0b1, "subs\t$Rd, ", SetRD<GPR32, subc>,
-                         (outs GPR32:$Rd), extends_to_i32>,
-             addsub_wxtx<     0b1, 0b1, "subs\t$Rd, ",
-                         (outs GPR32:$Rd)>;
-
-
-let SchedRW = [WriteCMP, ReadCMP, ReadCMP], Rd = 0b11111, isCompare = 1 in {
-defm CMNx : addsub_exts<0b1, 0b0, 0b1, "cmn\t", SetNZCV<A64cmn>,
-                        (outs), extends_to_i64>,
-            addsub_xxtx<     0b0, 0b1, "cmn\t", SetNZCV<A64cmn>, (outs)>;
-defm CMNw : addsub_exts<0b0, 0b0, 0b1, "cmn\t", SetNZCV<A64cmn>,
-                        (outs), extends_to_i32>,
-            addsub_wxtx<     0b0, 0b1, "cmn\t", (outs)>;
-defm CMPx : addsub_exts<0b1, 0b1, 0b1, "cmp\t", SetNZCV<A64cmp>,
-                        (outs), extends_to_i64>,
-            addsub_xxtx<     0b1, 0b1, "cmp\t", SetNZCV<A64cmp>, (outs)>;
-defm CMPw : addsub_exts<0b0, 0b1, 0b1, "cmp\t", SetNZCV<A64cmp>,
-                        (outs), extends_to_i32>,
-            addsub_wxtx<     0b1, 0b1, "cmp\t", (outs)>;
-}
-}
-
-// Now patterns for the operation without a shift being needed. No patterns are
-// created for uxtx/sxtx since they're non-uniform and it's expected that
-// add/sub (shifted register) will handle those cases anyway.
-multiclass addsubext_noshift_patterns<string prefix, SDPatternOperator nodeop,
-                                      extend_types exts> {
-    def : Pat<(nodeop exts.ty:$Rn, exts.uxtb),
-              (!cast<Instruction>(prefix # "w_uxtb") $Rn, $Rm, 0)>;
-    def : Pat<(nodeop exts.ty:$Rn, exts.uxth),
-              (!cast<Instruction>(prefix # "w_uxth") $Rn, $Rm, 0)>;
-    def : Pat<(nodeop exts.ty:$Rn, exts.uxtw),
-              (!cast<Instruction>(prefix # "w_uxtw") $Rn, $Rm, 0)>;
-
-    def : Pat<(nodeop exts.ty:$Rn, exts.sxtb),
-              (!cast<Instruction>(prefix # "w_sxtb") $Rn, $Rm, 0)>;
-    def : Pat<(nodeop exts.ty:$Rn, exts.sxth),
-              (!cast<Instruction>(prefix # "w_sxth") $Rn, $Rm, 0)>;
-    def : Pat<(nodeop exts.ty:$Rn, exts.sxtw),
-              (!cast<Instruction>(prefix # "w_sxtw") $Rn, $Rm, 0)>;
-}
-
-defm : addsubext_noshift_patterns<"ADDxx", add, extends_to_i64>;
-defm : addsubext_noshift_patterns<"ADDww", add, extends_to_i32>;
-defm : addsubext_noshift_patterns<"SUBxx", sub, extends_to_i64>;
-defm : addsubext_noshift_patterns<"SUBww", sub, extends_to_i32>;
-
-defm : addsubext_noshift_patterns<"CMNx", A64cmn, extends_to_i64>;
-defm : addsubext_noshift_patterns<"CMNw", A64cmn, extends_to_i32>;
-defm : addsubext_noshift_patterns<"CMPx", A64cmp, extends_to_i64>;
-defm : addsubext_noshift_patterns<"CMPw", A64cmp, extends_to_i32>;
-
-// An extend of "lsl #imm" is valid if and only if one of Rn and Rd is
-// sp/wsp. It is synonymous with uxtx/uxtw depending on the size of the
-// operation. Also permitted in this case is complete omission of the argument,
-// which implies "lsl #0".
-multiclass lsl_aliases<string asmop, Instruction inst, RegisterClass GPR_Rd,
-                       RegisterClass GPR_Rn, RegisterClass GPR_Rm> {
-    def : InstAlias<!strconcat(asmop, " $Rd, $Rn, $Rm"),
-                    (inst GPR_Rd:$Rd, GPR_Rn:$Rn, GPR_Rm:$Rm, 0)>;
-
-    def : InstAlias<!strconcat(asmop, " $Rd, $Rn, $Rm, $LSL"),
-                (inst GPR_Rd:$Rd, GPR_Rn:$Rn, GPR_Rm:$Rm, LSL_extoperand:$LSL)>;
-
-}
-
-defm : lsl_aliases<"add",  ADDxxx_uxtx,  Rxsp, GPR64xsp, GPR64>;
-defm : lsl_aliases<"add",  ADDxxx_uxtx,  GPR64xsp, Rxsp, GPR64>;
-defm : lsl_aliases<"add",  ADDwww_uxtw,  Rwsp, GPR32wsp, GPR32>;
-defm : lsl_aliases<"add",  ADDwww_uxtw,  GPR32wsp, Rwsp, GPR32>;
-defm : lsl_aliases<"sub",  SUBxxx_uxtx,  Rxsp, GPR64xsp, GPR64>;
-defm : lsl_aliases<"sub",  SUBxxx_uxtx,  GPR64xsp, Rxsp, GPR64>;
-defm : lsl_aliases<"sub",  SUBwww_uxtw,  Rwsp, GPR32wsp, GPR32>;
-defm : lsl_aliases<"sub",  SUBwww_uxtw,  GPR32wsp, Rwsp, GPR32>;
-
-// Rd cannot be sp for flag-setting variants so only half of the aliases are
-// needed.
-defm : lsl_aliases<"adds", ADDSxxx_uxtx, GPR64, Rxsp, GPR64>;
-defm : lsl_aliases<"adds", ADDSwww_uxtw, GPR32, Rwsp, GPR32>;
-defm : lsl_aliases<"subs", SUBSxxx_uxtx, GPR64, Rxsp, GPR64>;
-defm : lsl_aliases<"subs", SUBSwww_uxtw, GPR32, Rwsp, GPR32>;
-
-// CMP unfortunately has to be different because the instruction doesn't have a
-// dest register.
-multiclass cmp_lsl_aliases<string asmop, Instruction inst,
-                       RegisterClass GPR_Rn, RegisterClass GPR_Rm> {
-    def : InstAlias<!strconcat(asmop, " $Rn, $Rm"),
-                    (inst GPR_Rn:$Rn, GPR_Rm:$Rm, 0)>;
-
-    def : InstAlias<!strconcat(asmop, " $Rn, $Rm, $LSL"),
-                    (inst GPR_Rn:$Rn, GPR_Rm:$Rm, LSL_extoperand:$LSL)>;
-}
-
-defm : cmp_lsl_aliases<"cmp", CMPxx_uxtx, Rxsp, GPR64>;
-defm : cmp_lsl_aliases<"cmp", CMPww_uxtw, Rwsp, GPR32>;
-defm : cmp_lsl_aliases<"cmn", CMNxx_uxtx, Rxsp, GPR64>;
-defm : cmp_lsl_aliases<"cmn", CMNww_uxtw, Rwsp, GPR32>;
+let Defs = [SP], Uses = [SP], hasSideEffects = 1, isCodeGenOnly = 1 in {
+def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt),
+                              [(AArch64callseq_start timm:$amt)]>;
+def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
+                            [(AArch64callseq_end timm:$amt1, timm:$amt2)]>;
+} // Defs = [SP], Uses = [SP], hasSideEffects = 1, isCodeGenOnly = 1
+
+let isReMaterializable = 1, isCodeGenOnly = 1 in {
+// FIXME: The following pseudo instructions are only needed because remat
+// cannot handle multiple instructions.  When that changes, they can be
+// removed, along with the AArch64Wrapper node.
+
+let AddedComplexity = 10 in
+def LOADgot : Pseudo<(outs GPR64:$dst), (ins i64imm:$addr),
+                     [(set GPR64:$dst, (AArch64LOADgot tglobaladdr:$addr))]>,
+              Sched<[WriteLDAdr]>;
+
+// The MOVaddr instruction should match only when the add is not folded
+// into a load or store address.
+def MOVaddr
+    : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low),
+             [(set GPR64:$dst, (AArch64addlow (AArch64adrp tglobaladdr:$hi),
+                                            tglobaladdr:$low))]>,
+      Sched<[WriteAdrAdr]>;
+def MOVaddrJT
+    : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low),
+             [(set GPR64:$dst, (AArch64addlow (AArch64adrp tjumptable:$hi),
+                                             tjumptable:$low))]>,
+      Sched<[WriteAdrAdr]>;
+def MOVaddrCP
+    : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low),
+             [(set GPR64:$dst, (AArch64addlow (AArch64adrp tconstpool:$hi),
+                                             tconstpool:$low))]>,
+      Sched<[WriteAdrAdr]>;
+def MOVaddrBA
+    : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low),
+             [(set GPR64:$dst, (AArch64addlow (AArch64adrp tblockaddress:$hi),
+                                             tblockaddress:$low))]>,
+      Sched<[WriteAdrAdr]>;
+def MOVaddrTLS
+    : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low),
+             [(set GPR64:$dst, (AArch64addlow (AArch64adrp tglobaltlsaddr:$hi),
+                                            tglobaltlsaddr:$low))]>,
+      Sched<[WriteAdrAdr]>;
+def MOVaddrEXT
+    : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low),
+             [(set GPR64:$dst, (AArch64addlow (AArch64adrp texternalsym:$hi),
+                                            texternalsym:$low))]>,
+      Sched<[WriteAdrAdr]>;
+
+} // isReMaterializable, isCodeGenOnly
+
+def : Pat<(AArch64LOADgot tglobaltlsaddr:$addr),
+          (LOADgot tglobaltlsaddr:$addr)>;
+
+def : Pat<(AArch64LOADgot texternalsym:$addr),
+          (LOADgot texternalsym:$addr)>;
+
+def : Pat<(AArch64LOADgot tconstpool:$addr),
+          (LOADgot tconstpool:$addr)>;
 
 //===----------------------------------------------------------------------===//
-// Add-subtract (immediate) instructions
+// System instructions.
 //===----------------------------------------------------------------------===//
-// Contains: ADD, ADDS, SUB, SUBS + aliases CMN, CMP, MOV
-
-// These instructions accept a 12-bit unsigned immediate, optionally shifted
-// left by 12 bits. Official assembly format specifies a 12 bit immediate with
-// one of "", "LSL #0", "LSL #12" supplementary operands.
-
-// There are surprisingly few ways to make this work with TableGen, so this
-// implementation has separate instructions for the "LSL #0" and "LSL #12"
-// variants.
-
-// If the MCInst retained a single combined immediate (which could be 0x123000,
-// for example) then both components (imm & shift) would have to be delegated to
-// a single assembly operand. This would entail a separate operand parser
-// (because the LSL would have to live in the same AArch64Operand as the
-// immediate to be accessible); assembly parsing is rather complex and
-// error-prone C++ code.
-//
-// By splitting the immediate, we can delegate handling this optional operand to
-// an InstAlias. Supporting functions to generate the correct MCInst are still
-// required, but these are essentially trivial and parsing can remain generic.
-//
-// Rejected plans with rationale:
-// ------------------------------
-//
-// In an ideal world you'de have two first class immediate operands (in
-// InOperandList, specifying imm12 and shift). Unfortunately this is not
-// selectable by any means I could discover.
-//
-// An Instruction with two MCOperands hidden behind a single entry in
-// InOperandList (expanded by ComplexPatterns and MIOperandInfo) was functional,
-// but required more C++ code to handle encoding/decoding. Parsing (the intended
-// main beneficiary) ended up equally complex because of the optional nature of
-// "LSL #0".
-//
-// Attempting to circumvent the need for a custom OperandParser above by giving
-// InstAliases without the "lsl #0" failed. add/sub could be accommodated but
-// the cmp/cmn aliases didn't use the MIOperandInfo to determine how operands
-// should be parsed: there was no way to accommodate an "lsl #12".
-
-let ParserMethod = "ParseImmWithLSLOperand",
-    RenderMethod = "addImmWithLSLOperands" in {
-  // Derived PredicateMethod fields are different for each
-  def addsubimm_lsl0_asmoperand : AsmOperandClass {
-    let Name = "AddSubImmLSL0";
-    // If an error is reported against this operand, instruction could also be a
-    // register variant.
-    let DiagnosticType = "AddSubSecondSource";
-  }
-
-  def addsubimm_lsl12_asmoperand : AsmOperandClass {
-    let Name = "AddSubImmLSL12";
-    let DiagnosticType = "AddSubSecondSource";
-  }
-}
-
-def shr_12_XFORM : SDNodeXForm<imm, [{
-  return CurDAG->getTargetConstant(N->getSExtValue() >> 12, MVT::i32);
-}]>;
 
-def shr_12_neg_XFORM : SDNodeXForm<imm, [{
-  return CurDAG->getTargetConstant((-N->getSExtValue()) >> 12, MVT::i32);
-}]>;
-
-def neg_XFORM : SDNodeXForm<imm, [{
-  return CurDAG->getTargetConstant(-N->getSExtValue(), MVT::i32);
-}]>;
+def HINT  : HintI<"hint">;
+def : InstAlias<"nop",  (HINT 0b000)>;
+def : InstAlias<"yield",(HINT 0b001)>;
+def : InstAlias<"wfe",  (HINT 0b010)>;
+def : InstAlias<"wfi",  (HINT 0b011)>;
+def : InstAlias<"sev",  (HINT 0b100)>;
+def : InstAlias<"sevl", (HINT 0b101)>;
 
+  // As far as LLVM is concerned this writes to the system's exclusive monitors.
+let mayLoad = 1, mayStore = 1 in
+def CLREX : CRmSystemI<imm0_15, 0b010, "clrex">;
 
-multiclass addsub_imm_operands<ValueType ty> {
- let PrintMethod = "printAddSubImmLSL0Operand",
-      EncoderMethod = "getAddSubImmOpValue",
-      ParserMatchClass = addsubimm_lsl0_asmoperand in {
-    def _posimm_lsl0 : Operand<ty>,
-        ImmLeaf<ty, [{ return Imm >= 0 && (Imm & ~0xfff) == 0; }]>;
-    def _negimm_lsl0 : Operand<ty>,
-        ImmLeaf<ty, [{ return Imm < 0 && (-Imm & ~0xfff) == 0; }],
-                neg_XFORM>;
-  }
-
-  let PrintMethod = "printAddSubImmLSL12Operand",
-      EncoderMethod = "getAddSubImmOpValue",
-      ParserMatchClass = addsubimm_lsl12_asmoperand in {
-    def _posimm_lsl12 : Operand<ty>,
-        ImmLeaf<ty, [{ return Imm >= 0 && (Imm & ~0xfff000) == 0; }],
-                shr_12_XFORM>;
-
-    def _negimm_lsl12 : Operand<ty>,
-        ImmLeaf<ty, [{ return Imm < 0 && (-Imm & ~0xfff000) == 0; }],
-                shr_12_neg_XFORM>;
-  }
-}
-
-// The add operands don't need any transformation
-defm addsubimm_operand_i32 : addsub_imm_operands<i32>;
-defm addsubimm_operand_i64 : addsub_imm_operands<i64>;
-
-multiclass addsubimm_varieties<string prefix, bit sf, bit op, bits<2> shift,
-                               string asmop, string cmpasmop,
-                               Operand imm_operand, Operand cmp_imm_operand,
-                               RegisterClass GPR, RegisterClass GPRsp,
-                               AArch64Reg ZR, ValueType Ty> {
-    // All registers for non-S variants allow SP
-  def _s : A64I_addsubimm<sf, op, 0b0, shift,
-                         (outs GPRsp:$Rd),
-                         (ins GPRsp:$Rn, imm_operand:$Imm12),
-                         !strconcat(asmop, "\t$Rd, $Rn, $Imm12"),
-                         [(set Ty:$Rd, (add Ty:$Rn, imm_operand:$Imm12))],
-                         NoItinerary>,
-           Sched<[WriteALU, ReadALU]>;
-
-
-  // S variants can read SP but would write to ZR
-  def _S : A64I_addsubimm<sf, op, 0b1, shift,
-                         (outs GPR:$Rd),
-                         (ins GPRsp:$Rn, imm_operand:$Imm12),
-                         !strconcat(asmop, "s\t$Rd, $Rn, $Imm12"),
-                         [(set Ty:$Rd, (addc Ty:$Rn, imm_operand:$Imm12))],
-                         NoItinerary>,
-           Sched<[WriteALU, ReadALU]> {
-    let Defs = [NZCV];
-  }
-
-  // Note that the pattern here for ADDS is subtle. Canonically CMP
-  // a, b becomes SUBS a, b. If b < 0 then this is equivalent to
-  // ADDS a, (-b). This is not true in general.
-  def _cmp : A64I_addsubimm<sf, op, 0b1, shift,
-                            (outs), (ins GPRsp:$Rn, imm_operand:$Imm12),
-                            !strconcat(cmpasmop, " $Rn, $Imm12"),
-                            [(set NZCV,
-                                  (A64cmp Ty:$Rn, cmp_imm_operand:$Imm12))],
-                            NoItinerary>,
-           Sched<[WriteCMP, ReadCMP]> {
-    let Rd = 0b11111;
-    let Defs = [NZCV];
-    let isCompare = 1;
-  }
-}
+def DMB   : CRmSystemI<barrier_op, 0b101, "dmb">;
+def DSB   : CRmSystemI<barrier_op, 0b100, "dsb">;
+def ISB   : CRmSystemI<barrier_op, 0b110, "isb">;
+def : InstAlias<"clrex", (CLREX 0xf)>;
+def : InstAlias<"isb", (ISB 0xf)>;
 
+def MRS    : MRSI;
+def MSR    : MSRI;
+def MSRpstate: MSRpstateI;
 
-multiclass addsubimm_shifts<string prefix, bit sf, bit op,
-           string asmop, string cmpasmop, string operand, string cmpoperand,
-           RegisterClass GPR, RegisterClass GPRsp, AArch64Reg ZR,
-           ValueType Ty> {
-  defm _lsl0 : addsubimm_varieties<prefix # "_lsl0", sf, op, 0b00,
-                                   asmop, cmpasmop,
-                                   !cast<Operand>(operand # "_lsl0"),
-                                   !cast<Operand>(cmpoperand # "_lsl0"),
-                                   GPR, GPRsp, ZR, Ty>;
-
-  defm _lsl12 : addsubimm_varieties<prefix # "_lsl12", sf, op, 0b01,
-                                    asmop, cmpasmop,
-                                    !cast<Operand>(operand # "_lsl12"),
-                                    !cast<Operand>(cmpoperand # "_lsl12"),
-                                    GPR, GPRsp, ZR, Ty>;
-}
+// The thread pointer (on Linux, at least, where this has been implemented) is
+// TPIDR_EL0.
+def : Pat<(AArch64threadpointer), (MRS 0xde82)>;
 
-defm ADDwwi : addsubimm_shifts<"ADDwi", 0b0, 0b0, "add", "cmn",
-                              "addsubimm_operand_i32_posimm",
-                              "addsubimm_operand_i32_negimm",
-                              GPR32, GPR32wsp, WZR, i32>;
-defm ADDxxi : addsubimm_shifts<"ADDxi", 0b1, 0b0, "add", "cmn",
-                              "addsubimm_operand_i64_posimm",
-                              "addsubimm_operand_i64_negimm",
-                              GPR64, GPR64xsp, XZR, i64>;
-defm SUBwwi : addsubimm_shifts<"SUBwi", 0b0, 0b1, "sub", "cmp",
-                              "addsubimm_operand_i32_negimm",
-                              "addsubimm_operand_i32_posimm",
-                              GPR32, GPR32wsp, WZR, i32>;
-defm SUBxxi : addsubimm_shifts<"SUBxi", 0b1, 0b1, "sub", "cmp",
-                              "addsubimm_operand_i64_negimm",
-                              "addsubimm_operand_i64_posimm",
-                              GPR64, GPR64xsp, XZR, i64>;
-
-multiclass MOVsp<RegisterClass GPRsp, RegisterClass SP, Instruction addop> {
-  def _fromsp : InstAlias<"mov $Rd, $Rn",
-                          (addop GPRsp:$Rd, SP:$Rn, 0),
-                          0b1>;
-
-  def _tosp : InstAlias<"mov $Rd, $Rn",
-                        (addop SP:$Rd, GPRsp:$Rn, 0),
-                        0b1>;
-}
+// Generic system instructions
+def SYSxt  : SystemXtI<0, "sys">;
+def SYSLxt : SystemLXtI<1, "sysl">;
 
-// Recall Rxsp is a RegisterClass containing *just* xsp.
-defm MOVxx : MOVsp<GPR64xsp, Rxsp, ADDxxi_lsl0_s>;
-defm MOVww : MOVsp<GPR32wsp, Rwsp, ADDwwi_lsl0_s>;
+def : InstAlias<"sys $op1, $Cn, $Cm, $op2",
+                (SYSxt imm0_7:$op1, sys_cr_op:$Cn,
+                 sys_cr_op:$Cm, imm0_7:$op2, XZR)>;
 
 //===----------------------------------------------------------------------===//
-// Add-subtract (shifted register) instructions
+// Move immediate instructions.
 //===----------------------------------------------------------------------===//
-// Contains: ADD, ADDS, SUB, SUBS + aliases CMN, CMP, NEG, NEGS
-
-//===-------------------------------
-// 1. The "shifted register" operands. Shared with logical insts.
-//===-------------------------------
-
-multiclass shift_operands<string prefix, string form> {
-  def _asmoperand_i32 : AsmOperandClass {
-    let Name = "Shift" # form # "i32";
-    let RenderMethod = "addShiftOperands";
-    let PredicateMethod = "isShift<A64SE::" # form # ", false>";
-    let DiagnosticType = "AddSubRegShift32";
-  }
 
-  // Note that the operand type is intentionally i64 because the DAGCombiner
-  // puts these into a canonical form.
-  def _i32 : Operand<i64>, ImmLeaf<i64, [{ return Imm >= 0 && Imm <= 31; }]> {
-    let ParserMatchClass
-          = !cast<AsmOperandClass>(prefix # "_asmoperand_i32");
-    let PrintMethod = "printShiftOperand<A64SE::" # form # ">";
-    let DecoderMethod = "Decode32BitShiftOperand";
-  }
+defm MOVK : InsertImmediate<0b11, "movk">;
+defm MOVN : MoveImmediate<0b00, "movn">;
 
-  def _asmoperand_i64 : AsmOperandClass {
-      let Name = "Shift" # form # "i64";
-      let RenderMethod = "addShiftOperands";
-      let PredicateMethod = "isShift<A64SE::" # form # ", true>";
-      let DiagnosticType = "AddSubRegShift64";
-  }
+let PostEncoderMethod = "fixMOVZ" in
+defm MOVZ : MoveImmediate<0b10, "movz">;
 
-  def _i64 : Operand<i64>, ImmLeaf<i64, [{ return Imm >= 0 && Imm <= 63; }]> {
-    let ParserMatchClass
-          = !cast<AsmOperandClass>(prefix # "_asmoperand_i64");
-    let PrintMethod = "printShiftOperand<A64SE::" # form # ">";
-  }
-}
+// First group of aliases covers an implicit "lsl #0".
+def : InstAlias<"movk $dst, $imm", (MOVKWi GPR32:$dst, imm0_65535:$imm, 0)>;
+def : InstAlias<"movk $dst, $imm", (MOVKXi GPR64:$dst, imm0_65535:$imm, 0)>;
+def : InstAlias<"movn $dst, $imm", (MOVNWi GPR32:$dst, imm0_65535:$imm, 0)>;
+def : InstAlias<"movn $dst, $imm", (MOVNXi GPR64:$dst, imm0_65535:$imm, 0)>;
+def : InstAlias<"movz $dst, $imm", (MOVZWi GPR32:$dst, imm0_65535:$imm, 0)>;
+def : InstAlias<"movz $dst, $imm", (MOVZXi GPR64:$dst, imm0_65535:$imm, 0)>;
 
-defm lsl_operand : shift_operands<"lsl_operand", "LSL">;
-defm lsr_operand : shift_operands<"lsr_operand", "LSR">;
-defm asr_operand : shift_operands<"asr_operand", "ASR">;
-
-// Not used for add/sub, but defined here for completeness. The "logical
-// (shifted register)" instructions *do* have an ROR variant.
-defm ror_operand : shift_operands<"ror_operand", "ROR">;
-
-//===-------------------------------
-// 2. The basic 3.5-operand ADD/SUB/ADDS/SUBS instructions.
-//===-------------------------------
-
-// N.b. the commutable parameter is just !N. It will be first against the wall
-// when the revolution comes.
-multiclass addsub_shifts<string prefix, bit sf, bit op, bit s, bit commutable,
-                         string asmop, SDPatternOperator opfrag, ValueType ty,
-                         RegisterClass GPR, list<Register> defs> {
-  let isCommutable = commutable, Defs = defs in {
-  def _lsl : A64I_addsubshift<sf, op, s, 0b00,
-                       (outs GPR:$Rd),
-                       (ins GPR:$Rn, GPR:$Rm,
-                            !cast<Operand>("lsl_operand_" # ty):$Imm6),
-                       !strconcat(asmop, "\t$Rd, $Rn, $Rm, $Imm6"),
-                       [(set GPR:$Rd, (opfrag ty:$Rn, (shl ty:$Rm,
-                            !cast<Operand>("lsl_operand_" # ty):$Imm6))
-                       )],
-                       NoItinerary>,
-             Sched<[WriteALU, ReadALU]>;
-
-  def _lsr : A64I_addsubshift<sf, op, s, 0b01,
-                       (outs GPR:$Rd),
-                       (ins GPR:$Rn, GPR:$Rm,
-                            !cast<Operand>("lsr_operand_" # ty):$Imm6),
-                       !strconcat(asmop, "\t$Rd, $Rn, $Rm, $Imm6"),
-                       [(set ty:$Rd, (opfrag ty:$Rn, (srl ty:$Rm,
-                            !cast<Operand>("lsr_operand_" # ty):$Imm6))
-                       )],
-                       NoItinerary>,
-             Sched<[WriteALU, ReadALU]>;
-
-  def _asr : A64I_addsubshift<sf, op, s, 0b10,
-                       (outs GPR:$Rd),
-                       (ins GPR:$Rn, GPR:$Rm,
-                            !cast<Operand>("asr_operand_" # ty):$Imm6),
-                       !strconcat(asmop, "\t$Rd, $Rn, $Rm, $Imm6"),
-                       [(set ty:$Rd, (opfrag ty:$Rn, (sra ty:$Rm,
-                            !cast<Operand>("asr_operand_" # ty):$Imm6))
-                       )],
-                       NoItinerary>,
-             Sched<[WriteALU, ReadALU]>;
-  }
-
-  def _noshift
-      : InstAlias<!strconcat(asmop, " $Rd, $Rn, $Rm"),
-                 (!cast<Instruction>(prefix # "_lsl") GPR:$Rd, GPR:$Rn,
-                                                      GPR:$Rm, 0)>;
-
-  def : Pat<(opfrag ty:$Rn, ty:$Rm),
-            (!cast<Instruction>(prefix # "_lsl") $Rn, $Rm, 0)>;
-}
-
-multiclass addsub_sizes<string prefix, bit op, bit s, bit commutable,
-                         string asmop, SDPatternOperator opfrag,
-                         list<Register> defs> {
-  defm xxx : addsub_shifts<prefix # "xxx", 0b1, op, s,
-                           commutable, asmop, opfrag, i64, GPR64, defs>;
-  defm www : addsub_shifts<prefix # "www", 0b0, op, s,
-                           commutable, asmop, opfrag, i32, GPR32, defs>;
-}
+// Next, we have various ELF relocations with the ":XYZ_g0:sym" syntax.
+def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g3:$sym, 48)>;
+def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g2:$sym, 32)>;
+def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g1:$sym, 16)>;
+def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g0:$sym, 0)>;
 
+def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g3:$sym, 48)>;
+def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g2:$sym, 32)>;
+def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g1:$sym, 16)>;
+def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g0:$sym, 0)>;
 
-defm ADD : addsub_sizes<"ADD", 0b0, 0b0, 0b1, "add", add, []>;
-defm SUB : addsub_sizes<"SUB", 0b1, 0b0, 0b0, "sub", sub, []>;
+def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g3:$sym, 48)>;
+def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g2:$sym, 32)>;
+def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g1:$sym, 16)>;
+def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g0:$sym, 0)>;
 
-defm ADDS : addsub_sizes<"ADDS", 0b0, 0b1, 0b1, "adds", addc, [NZCV]>;
-defm SUBS : addsub_sizes<"SUBS", 0b1, 0b1, 0b0, "subs", subc, [NZCV]>;
+def : InstAlias<"movz $Rd, $sym", (MOVZWi GPR32:$Rd, movz_symbol_g1:$sym, 16)>;
+def : InstAlias<"movz $Rd, $sym", (MOVZWi GPR32:$Rd, movz_symbol_g0:$sym, 0)>;
 
-//===-------------------------------
-// 1. The NEG/NEGS aliases
-//===-------------------------------
+def : InstAlias<"movn $Rd, $sym", (MOVNWi GPR32:$Rd, movz_symbol_g1:$sym, 16)>;
+def : InstAlias<"movn $Rd, $sym", (MOVNWi GPR32:$Rd, movz_symbol_g0:$sym, 0)>;
 
-multiclass neg_alias<Instruction INST, RegisterClass GPR, Register ZR,
-                     ValueType ty, Operand shift_operand, SDNode shiftop> {
-   def : InstAlias<"neg $Rd, $Rm, $Imm6",
-                   (INST GPR:$Rd, ZR, GPR:$Rm, shift_operand:$Imm6)>;
+def : InstAlias<"movk $Rd, $sym", (MOVKWi GPR32:$Rd, movk_symbol_g1:$sym, 16)>;
+def : InstAlias<"movk $Rd, $sym", (MOVKWi GPR32:$Rd, movk_symbol_g0:$sym, 0)>;
 
-   def : Pat<(sub 0, (shiftop ty:$Rm, shift_operand:$Imm6)),
-             (INST ZR, $Rm, shift_operand:$Imm6)>;
-}
+// Final group of aliases covers true "mov $Rd, $imm" cases.
+multiclass movw_mov_alias<string basename,Instruction INST, RegisterClass GPR,
+                          int width, int shift> {
+  def _asmoperand : AsmOperandClass {
+    let Name = basename # width # "_lsl" # shift # "MovAlias";
+    let PredicateMethod = "is" # basename # "MovAlias<" # width # ", "
+                               # shift # ">";
+    let RenderMethod = "add" # basename # "MovAliasOperands<" # shift # ">";
+  }
 
-defm : neg_alias<SUBwww_lsl, GPR32, WZR, i32, lsl_operand_i32, shl>;
-defm : neg_alias<SUBwww_lsr, GPR32, WZR, i32, lsr_operand_i32, srl>;
-defm : neg_alias<SUBwww_asr, GPR32, WZR, i32, asr_operand_i32, sra>;
-def : InstAlias<"neg $Rd, $Rm", (SUBwww_lsl GPR32:$Rd, WZR, GPR32:$Rm, 0)>;
-def : Pat<(sub 0, i32:$Rm), (SUBwww_lsl WZR, $Rm, 0)>;
-
-defm : neg_alias<SUBxxx_lsl, GPR64, XZR, i64, lsl_operand_i64, shl>;
-defm : neg_alias<SUBxxx_lsr, GPR64, XZR, i64, lsr_operand_i64, srl>;
-defm : neg_alias<SUBxxx_asr, GPR64, XZR, i64, asr_operand_i64, sra>;
-def : InstAlias<"neg $Rd, $Rm", (SUBxxx_lsl GPR64:$Rd, XZR, GPR64:$Rm, 0)>;
-def : Pat<(sub 0, i64:$Rm), (SUBxxx_lsl XZR, $Rm, 0)>;
-
-// NEGS doesn't get any patterns yet: defining multiple outputs means C++ has to
-// be involved.
-class negs_alias<Instruction INST, RegisterClass GPR,
-                 Register ZR, Operand shift_operand, SDNode shiftop>
-  : InstAlias<"negs $Rd, $Rm, $Imm6",
-              (INST GPR:$Rd, ZR, GPR:$Rm, shift_operand:$Imm6)>;
-
-def : negs_alias<SUBSwww_lsl, GPR32, WZR, lsl_operand_i32, shl>;
-def : negs_alias<SUBSwww_lsr, GPR32, WZR, lsr_operand_i32, srl>;
-def : negs_alias<SUBSwww_asr, GPR32, WZR, asr_operand_i32, sra>;
-def : InstAlias<"negs $Rd, $Rm", (SUBSwww_lsl GPR32:$Rd, WZR, GPR32:$Rm, 0)>;
-
-def : negs_alias<SUBSxxx_lsl, GPR64, XZR, lsl_operand_i64, shl>;
-def : negs_alias<SUBSxxx_lsr, GPR64, XZR, lsr_operand_i64, srl>;
-def : negs_alias<SUBSxxx_asr, GPR64, XZR, asr_operand_i64, sra>;
-def : InstAlias<"negs $Rd, $Rm", (SUBSxxx_lsl GPR64:$Rd, XZR, GPR64:$Rm, 0)>;
-
-//===-------------------------------
-// 1. The CMP/CMN aliases
-//===-------------------------------
-
-multiclass cmp_shifts<string prefix, bit sf, bit op, bit commutable,
-                      string asmop, SDPatternOperator opfrag, ValueType ty,
-                      RegisterClass GPR> {
-  let isCommutable = commutable, Rd = 0b11111, Defs = [NZCV] in {
-  def _lsl : A64I_addsubshift<sf, op, 0b1, 0b00,
-                       (outs),
-                       (ins GPR:$Rn, GPR:$Rm,
-                            !cast<Operand>("lsl_operand_" # ty):$Imm6),
-                       !strconcat(asmop, "\t$Rn, $Rm, $Imm6"),
-                       [(set NZCV, (opfrag ty:$Rn, (shl ty:$Rm,
-                            !cast<Operand>("lsl_operand_" # ty):$Imm6))
-                       )],
-                       NoItinerary>,
-             Sched<[WriteCMP, ReadCMP, ReadCMP]>;
-
-  def _lsr : A64I_addsubshift<sf, op, 0b1, 0b01,
-                       (outs),
-                       (ins GPR:$Rn, GPR:$Rm,
-                            !cast<Operand>("lsr_operand_" # ty):$Imm6),
-                       !strconcat(asmop, "\t$Rn, $Rm, $Imm6"),
-                       [(set NZCV, (opfrag ty:$Rn, (srl ty:$Rm,
-                            !cast<Operand>("lsr_operand_" # ty):$Imm6))
-                       )],
-                       NoItinerary>,
-             Sched<[WriteCMP, ReadCMP, ReadCMP]>;
-
-  def _asr : A64I_addsubshift<sf, op, 0b1, 0b10,
-                       (outs),
-                       (ins GPR:$Rn, GPR:$Rm,
-                            !cast<Operand>("asr_operand_" # ty):$Imm6),
-                       !strconcat(asmop, "\t$Rn, $Rm, $Imm6"),
-                       [(set NZCV, (opfrag ty:$Rn, (sra ty:$Rm,
-                            !cast<Operand>("asr_operand_" # ty):$Imm6))
-                       )],
-                       NoItinerary>,
-             Sched<[WriteCMP, ReadCMP, ReadCMP]>;
+  def _movimm : Operand<i32> {
+    let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_asmoperand");
   }
 
-  def _noshift
-      : InstAlias<!strconcat(asmop, " $Rn, $Rm"),
-                 (!cast<Instruction>(prefix # "_lsl") GPR:$Rn, GPR:$Rm, 0)>;
+  def : InstAlias<"mov $Rd, $imm",
+                  (INST GPR:$Rd, !cast<Operand>(NAME # "_movimm"):$imm, shift)>;
+}
+
+defm : movw_mov_alias<"MOVZ", MOVZWi, GPR32, 32, 0>;
+defm : movw_mov_alias<"MOVZ", MOVZWi, GPR32, 32, 16>;
+
+defm : movw_mov_alias<"MOVZ", MOVZXi, GPR64, 64, 0>;
+defm : movw_mov_alias<"MOVZ", MOVZXi, GPR64, 64, 16>;
+defm : movw_mov_alias<"MOVZ", MOVZXi, GPR64, 64, 32>;
+defm : movw_mov_alias<"MOVZ", MOVZXi, GPR64, 64, 48>;
+
+defm : movw_mov_alias<"MOVN", MOVNWi, GPR32, 32, 0>;
+defm : movw_mov_alias<"MOVN", MOVNWi, GPR32, 32, 16>;
+
+defm : movw_mov_alias<"MOVN", MOVNXi, GPR64, 64, 0>;
+defm : movw_mov_alias<"MOVN", MOVNXi, GPR64, 64, 16>;
+defm : movw_mov_alias<"MOVN", MOVNXi, GPR64, 64, 32>;
+defm : movw_mov_alias<"MOVN", MOVNXi, GPR64, 64, 48>;
+
+let isReMaterializable = 1, isCodeGenOnly = 1, isMoveImm = 1,
+    isAsCheapAsAMove = 1 in {
+// FIXME: The following pseudo instructions are only needed because remat
+// cannot handle multiple instructions.  When that changes, we can select
+// directly to the real instructions and get rid of these pseudos.
+
+def MOVi32imm
+    : Pseudo<(outs GPR32:$dst), (ins i32imm:$src),
+             [(set GPR32:$dst, imm:$src)]>,
+      Sched<[WriteImm]>;
+def MOVi64imm
+    : Pseudo<(outs GPR64:$dst), (ins i64imm:$src),
+             [(set GPR64:$dst, imm:$src)]>,
+      Sched<[WriteImm]>;
+} // isReMaterializable, isCodeGenOnly
+
+// If possible, we want to use MOVi32imm even for 64-bit moves. This gives the
+// eventual expansion code fewer bits to worry about getting right. Marshalling
+// the types is a little tricky though:
+def i64imm_32bit : ImmLeaf<i64, [{
+  return (Imm & 0xffffffffULL) == static_cast<uint64_t>(Imm);
+}]>;
 
-  def : Pat<(opfrag ty:$Rn, ty:$Rm),
-            (!cast<Instruction>(prefix # "_lsl") $Rn, $Rm, 0)>;
-}
+def trunc_imm : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(N->getZExtValue(), MVT::i32);
+}]>;
 
-defm CMPww : cmp_shifts<"CMPww", 0b0, 0b1, 0b0, "cmp", A64cmp, i32, GPR32>;
-defm CMPxx : cmp_shifts<"CMPxx", 0b1, 0b1, 0b0, "cmp", A64cmp, i64, GPR64>;
+def : Pat<(i64 i64imm_32bit:$src),
+          (SUBREG_TO_REG (i64 0), (MOVi32imm (trunc_imm imm:$src)), sub_32)>;
+
+// Deal with the various forms of (ELF) large addressing with MOVZ/MOVK
+// sequences.
+def : Pat<(AArch64WrapperLarge tglobaladdr:$g3, tglobaladdr:$g2,
+                             tglobaladdr:$g1, tglobaladdr:$g0),
+          (MOVKXi (MOVKXi (MOVKXi (MOVZXi tglobaladdr:$g3, 48),
+                                  tglobaladdr:$g2, 32),
+                          tglobaladdr:$g1, 16),
+                  tglobaladdr:$g0, 0)>;
+
+def : Pat<(AArch64WrapperLarge tblockaddress:$g3, tblockaddress:$g2,
+                             tblockaddress:$g1, tblockaddress:$g0),
+          (MOVKXi (MOVKXi (MOVKXi (MOVZXi tblockaddress:$g3, 48),
+                                  tblockaddress:$g2, 32),
+                          tblockaddress:$g1, 16),
+                  tblockaddress:$g0, 0)>;
+
+def : Pat<(AArch64WrapperLarge tconstpool:$g3, tconstpool:$g2,
+                             tconstpool:$g1, tconstpool:$g0),
+          (MOVKXi (MOVKXi (MOVKXi (MOVZXi tconstpool:$g3, 48),
+                                  tconstpool:$g2, 32),
+                          tconstpool:$g1, 16),
+                  tconstpool:$g0, 0)>;
+
+def : Pat<(AArch64WrapperLarge tjumptable:$g3, tjumptable:$g2,
+                             tjumptable:$g1, tjumptable:$g0),
+          (MOVKXi (MOVKXi (MOVKXi (MOVZXi tjumptable:$g3, 48),
+                                  tjumptable:$g2, 32),
+                          tjumptable:$g1, 16),
+                  tjumptable:$g0, 0)>;
 
-defm CMNww : cmp_shifts<"CMNww", 0b0, 0b0, 0b1, "cmn", A64cmn, i32, GPR32>;
-defm CMNxx : cmp_shifts<"CMNxx", 0b1, 0b0, 0b1, "cmn", A64cmn, i64, GPR64>;
 
 //===----------------------------------------------------------------------===//
-// Add-subtract (with carry) instructions
+// Arithmetic instructions.
 //===----------------------------------------------------------------------===//
-// Contains: ADC, ADCS, SBC, SBCS + aliases NGC, NGCS
-
-multiclass A64I_addsubcarrySizes<bit op, bit s, string asmop> {
-  let Uses = [NZCV] in {
-    def www : A64I_addsubcarry<0b0, op, s, 0b000000,
-                               (outs GPR32:$Rd), (ins GPR32:$Rn, GPR32:$Rm),
-                               !strconcat(asmop, "\t$Rd, $Rn, $Rm"),
-                               [], NoItinerary>,
-              Sched<[WriteALU, ReadALU, ReadALU]>;
-
-    def xxx : A64I_addsubcarry<0b1, op, s, 0b000000,
-                               (outs GPR64:$Rd), (ins GPR64:$Rn, GPR64:$Rm),
-                               !strconcat(asmop, "\t$Rd, $Rn, $Rm"),
-                               [], NoItinerary>,
-              Sched<[WriteALU, ReadALU, ReadALU]>;
-  }
-}
-
-let isCommutable = 1 in {
-  defm ADC : A64I_addsubcarrySizes<0b0, 0b0, "adc">;
-}
-
-defm SBC : A64I_addsubcarrySizes<0b1, 0b0, "sbc">;
-
-let Defs = [NZCV] in {
-  let isCommutable = 1 in {
-    defm ADCS : A64I_addsubcarrySizes<0b0, 0b1, "adcs">;
-  }
 
-  defm SBCS : A64I_addsubcarrySizes<0b1, 0b1, "sbcs">;
-}
-
-def : InstAlias<"ngc $Rd, $Rm", (SBCwww GPR32:$Rd, WZR, GPR32:$Rm)>;
-def : InstAlias<"ngc $Rd, $Rm", (SBCxxx GPR64:$Rd, XZR, GPR64:$Rm)>;
-def : InstAlias<"ngcs $Rd, $Rm", (SBCSwww GPR32:$Rd, WZR, GPR32:$Rm)>;
-def : InstAlias<"ngcs $Rd, $Rm", (SBCSxxx GPR64:$Rd, XZR, GPR64:$Rm)>;
+// Add/subtract with carry.
+defm ADC : AddSubCarry<0, "adc", "adcs", AArch64adc, AArch64adc_flag>;
+defm SBC : AddSubCarry<1, "sbc", "sbcs", AArch64sbc, AArch64sbc_flag>;
+
+def : InstAlias<"ngc $dst, $src",  (SBCWr  GPR32:$dst, WZR, GPR32:$src)>;
+def : InstAlias<"ngc $dst, $src",  (SBCXr  GPR64:$dst, XZR, GPR64:$src)>;
+def : InstAlias<"ngcs $dst, $src", (SBCSWr GPR32:$dst, WZR, GPR32:$src)>;
+def : InstAlias<"ngcs $dst, $src", (SBCSXr GPR64:$dst, XZR, GPR64:$src)>;
+
+// Add/subtract
+defm ADD : AddSub<0, "add", add>;
+defm SUB : AddSub<1, "sub">;
+
+def : InstAlias<"mov $dst, $src",
+                (ADDWri GPR32sponly:$dst, GPR32sp:$src, 0, 0)>;
+def : InstAlias<"mov $dst, $src",
+                (ADDWri GPR32sp:$dst, GPR32sponly:$src, 0, 0)>;
+def : InstAlias<"mov $dst, $src",
+                (ADDXri GPR64sponly:$dst, GPR64sp:$src, 0, 0)>;
+def : InstAlias<"mov $dst, $src",
+                (ADDXri GPR64sp:$dst, GPR64sponly:$src, 0, 0)>;
+
+defm ADDS : AddSubS<0, "adds", AArch64add_flag, "cmn">;
+defm SUBS : AddSubS<1, "subs", AArch64sub_flag, "cmp">;
+
+// Use SUBS instead of SUB to enable CSE between SUBS and SUB.
+def : Pat<(sub GPR32sp:$Rn, addsub_shifted_imm32:$imm),
+          (SUBSWri GPR32sp:$Rn, addsub_shifted_imm32:$imm)>;
+def : Pat<(sub GPR64sp:$Rn, addsub_shifted_imm64:$imm),
+          (SUBSXri GPR64sp:$Rn, addsub_shifted_imm64:$imm)>;
+def : Pat<(sub GPR32:$Rn, GPR32:$Rm),
+          (SUBSWrr GPR32:$Rn, GPR32:$Rm)>;
+def : Pat<(sub GPR64:$Rn, GPR64:$Rm),
+          (SUBSXrr GPR64:$Rn, GPR64:$Rm)>;
+def : Pat<(sub GPR32:$Rn, arith_shifted_reg32:$Rm),
+          (SUBSWrs GPR32:$Rn, arith_shifted_reg32:$Rm)>;
+def : Pat<(sub GPR64:$Rn, arith_shifted_reg64:$Rm),
+          (SUBSXrs GPR64:$Rn, arith_shifted_reg64:$Rm)>;
+def : Pat<(sub GPR32sp:$R2, arith_extended_reg32<i32>:$R3),
+          (SUBSWrx GPR32sp:$R2, arith_extended_reg32<i32>:$R3)>;
+def : Pat<(sub GPR64sp:$R2, arith_extended_reg32to64<i64>:$R3),
+          (SUBSXrx GPR64sp:$R2, arith_extended_reg32to64<i64>:$R3)>;
+
+// Because of the immediate format for add/sub-imm instructions, the
+// expression (add x, -1) must be transformed to (SUB{W,X}ri x, 1).
+//  These patterns capture that transformation.
+let AddedComplexity = 1 in {
+def : Pat<(add GPR32:$Rn, neg_addsub_shifted_imm32:$imm),
+          (SUBSWri GPR32:$Rn, neg_addsub_shifted_imm32:$imm)>;
+def : Pat<(add GPR64:$Rn, neg_addsub_shifted_imm64:$imm),
+          (SUBSXri GPR64:$Rn, neg_addsub_shifted_imm64:$imm)>;
+def : Pat<(sub GPR32:$Rn, neg_addsub_shifted_imm32:$imm),
+          (ADDWri GPR32:$Rn, neg_addsub_shifted_imm32:$imm)>;
+def : Pat<(sub GPR64:$Rn, neg_addsub_shifted_imm64:$imm),
+          (ADDXri GPR64:$Rn, neg_addsub_shifted_imm64:$imm)>;
+}
+
+// Because of the immediate format for add/sub-imm instructions, the
+// expression (add x, -1) must be transformed to (SUB{W,X}ri x, 1).
+//  These patterns capture that transformation.
+let AddedComplexity = 1 in {
+def : Pat<(AArch64add_flag GPR32:$Rn, neg_addsub_shifted_imm32:$imm),
+          (SUBSWri GPR32:$Rn, neg_addsub_shifted_imm32:$imm)>;
+def : Pat<(AArch64add_flag GPR64:$Rn, neg_addsub_shifted_imm64:$imm),
+          (SUBSXri GPR64:$Rn, neg_addsub_shifted_imm64:$imm)>;
+def : Pat<(AArch64sub_flag GPR32:$Rn, neg_addsub_shifted_imm32:$imm),
+          (ADDSWri GPR32:$Rn, neg_addsub_shifted_imm32:$imm)>;
+def : Pat<(AArch64sub_flag GPR64:$Rn, neg_addsub_shifted_imm64:$imm),
+          (ADDSXri GPR64:$Rn, neg_addsub_shifted_imm64:$imm)>;
+}
+
+def : InstAlias<"neg $dst, $src", (SUBWrs GPR32:$dst, WZR, GPR32:$src, 0), 3>;
+def : InstAlias<"neg $dst, $src", (SUBXrs GPR64:$dst, XZR, GPR64:$src, 0), 3>;
+def : InstAlias<"neg $dst, $src$shift",
+                (SUBWrs GPR32:$dst, WZR, GPR32:$src, arith_shift32:$shift), 2>;
+def : InstAlias<"neg $dst, $src$shift",
+                (SUBXrs GPR64:$dst, XZR, GPR64:$src, arith_shift64:$shift), 2>;
+
+def : InstAlias<"negs $dst, $src", (SUBSWrs GPR32:$dst, WZR, GPR32:$src, 0), 3>;
+def : InstAlias<"negs $dst, $src", (SUBSXrs GPR64:$dst, XZR, GPR64:$src, 0), 3>;
+def : InstAlias<"negs $dst, $src$shift",
+                (SUBSWrs GPR32:$dst, WZR, GPR32:$src, arith_shift32:$shift), 2>;
+def : InstAlias<"negs $dst, $src$shift",
+                (SUBSXrs GPR64:$dst, XZR, GPR64:$src, arith_shift64:$shift), 2>;
+
+
+// Unsigned/Signed divide
+defm UDIV : Div<0, "udiv", udiv>;
+defm SDIV : Div<1, "sdiv", sdiv>;
+let isCodeGenOnly = 1 in {
+defm UDIV_Int : Div<0, "udiv", int_aarch64_udiv>;
+defm SDIV_Int : Div<1, "sdiv", int_aarch64_sdiv>;
+}
+
+// Variable shift
+defm ASRV : Shift<0b10, "asr", sra>;
+defm LSLV : Shift<0b00, "lsl", shl>;
+defm LSRV : Shift<0b01, "lsr", srl>;
+defm RORV : Shift<0b11, "ror", rotr>;
+
+def : ShiftAlias<"asrv", ASRVWr, GPR32>;
+def : ShiftAlias<"asrv", ASRVXr, GPR64>;
+def : ShiftAlias<"lslv", LSLVWr, GPR32>;
+def : ShiftAlias<"lslv", LSLVXr, GPR64>;
+def : ShiftAlias<"lsrv", LSRVWr, GPR32>;
+def : ShiftAlias<"lsrv", LSRVXr, GPR64>;
+def : ShiftAlias<"rorv", RORVWr, GPR32>;
+def : ShiftAlias<"rorv", RORVXr, GPR64>;
+
+// Multiply-add
+let AddedComplexity = 7 in {
+defm MADD : MulAccum<0, "madd", add>;
+defm MSUB : MulAccum<1, "msub", sub>;
+
+def : Pat<(i32 (mul GPR32:$Rn, GPR32:$Rm)),
+          (MADDWrrr GPR32:$Rn, GPR32:$Rm, WZR)>;
+def : Pat<(i64 (mul GPR64:$Rn, GPR64:$Rm)),
+          (MADDXrrr GPR64:$Rn, GPR64:$Rm, XZR)>;
+
+def : Pat<(i32 (ineg (mul GPR32:$Rn, GPR32:$Rm))),
+          (MSUBWrrr GPR32:$Rn, GPR32:$Rm, WZR)>;
+def : Pat<(i64 (ineg (mul GPR64:$Rn, GPR64:$Rm))),
+          (MSUBXrrr GPR64:$Rn, GPR64:$Rm, XZR)>;
+} // AddedComplexity = 7
+
+let AddedComplexity = 5 in {
+def SMADDLrrr : WideMulAccum<0, 0b001, "smaddl", add, sext>;
+def SMSUBLrrr : WideMulAccum<1, 0b001, "smsubl", sub, sext>;
+def UMADDLrrr : WideMulAccum<0, 0b101, "umaddl", add, zext>;
+def UMSUBLrrr : WideMulAccum<1, 0b101, "umsubl", sub, zext>;
+
+def : Pat<(i64 (mul (sext GPR32:$Rn), (sext GPR32:$Rm))),
+          (SMADDLrrr GPR32:$Rn, GPR32:$Rm, XZR)>;
+def : Pat<(i64 (mul (zext GPR32:$Rn), (zext GPR32:$Rm))),
+          (UMADDLrrr GPR32:$Rn, GPR32:$Rm, XZR)>;
+
+def : Pat<(i64 (ineg (mul (sext GPR32:$Rn), (sext GPR32:$Rm)))),
+          (SMSUBLrrr GPR32:$Rn, GPR32:$Rm, XZR)>;
+def : Pat<(i64 (ineg (mul (zext GPR32:$Rn), (zext GPR32:$Rm)))),
+          (UMSUBLrrr GPR32:$Rn, GPR32:$Rm, XZR)>;
+} // AddedComplexity = 5
+
+def : MulAccumWAlias<"mul", MADDWrrr>;
+def : MulAccumXAlias<"mul", MADDXrrr>;
+def : MulAccumWAlias<"mneg", MSUBWrrr>;
+def : MulAccumXAlias<"mneg", MSUBXrrr>;
+def : WideMulAccumAlias<"smull", SMADDLrrr>;
+def : WideMulAccumAlias<"smnegl", SMSUBLrrr>;
+def : WideMulAccumAlias<"umull", UMADDLrrr>;
+def : WideMulAccumAlias<"umnegl", UMSUBLrrr>;
+
+// Multiply-high
+def SMULHrr : MulHi<0b010, "smulh", mulhs>;
+def UMULHrr : MulHi<0b110, "umulh", mulhu>;
+
+// CRC32
+def CRC32Brr : BaseCRC32<0, 0b00, 0, GPR32, int_aarch64_crc32b, "crc32b">;
+def CRC32Hrr : BaseCRC32<0, 0b01, 0, GPR32, int_aarch64_crc32h, "crc32h">;
+def CRC32Wrr : BaseCRC32<0, 0b10, 0, GPR32, int_aarch64_crc32w, "crc32w">;
+def CRC32Xrr : BaseCRC32<1, 0b11, 0, GPR64, int_aarch64_crc32x, "crc32x">;
+
+def CRC32CBrr : BaseCRC32<0, 0b00, 1, GPR32, int_aarch64_crc32cb, "crc32cb">;
+def CRC32CHrr : BaseCRC32<0, 0b01, 1, GPR32, int_aarch64_crc32ch, "crc32ch">;
+def CRC32CWrr : BaseCRC32<0, 0b10, 1, GPR32, int_aarch64_crc32cw, "crc32cw">;
+def CRC32CXrr : BaseCRC32<1, 0b11, 1, GPR64, int_aarch64_crc32cx, "crc32cx">;
 
-// Note that adde and sube can form a chain longer than two (e.g. for 256-bit
-// addition). So the flag-setting instructions are appropriate.
-def : Pat<(adde i32:$Rn, i32:$Rm), (ADCSwww $Rn, $Rm)>;
-def : Pat<(adde i64:$Rn, i64:$Rm), (ADCSxxx $Rn, $Rm)>;
-def : Pat<(sube i32:$Rn, i32:$Rm), (SBCSwww $Rn, $Rm)>;
-def : Pat<(sube i64:$Rn, i64:$Rm), (SBCSxxx $Rn, $Rm)>;
 
 //===----------------------------------------------------------------------===//
-// Bitfield
+// Logical instructions.
 //===----------------------------------------------------------------------===//
-// Contains: SBFM, BFM, UBFM, [SU]XT[BHW], ASR, LSR, LSL, SBFI[ZX], BFI, BFXIL,
-//     UBFIZ, UBFX
-
-// Because of the rather complicated nearly-overlapping aliases, the decoding of
-// this range of instructions is handled manually. The architectural
-// instructions are BFM, SBFM and UBFM but a disassembler should never produce
-// these.
-//
-// In the end, the best option was to use BFM instructions for decoding under
-// almost all circumstances, but to create aliasing *Instructions* for each of
-// the canonical forms and specify a completely custom decoder which would
-// substitute the correct MCInst as needed.
-//
-// This also simplifies instruction selection, parsing etc because the MCInsts
-// have a shape that's closer to their use in code.
-
-//===-------------------------------
-// 1. The architectural BFM instructions
-//===-------------------------------
-
-def uimm5_asmoperand : AsmOperandClass {
-  let Name = "UImm5";
-  let PredicateMethod = "isUImm<5>";
-  let RenderMethod = "addImmOperands";
-  let DiagnosticType = "UImm5";
-}
-
-def uimm6_asmoperand : AsmOperandClass {
-  let Name = "UImm6";
-  let PredicateMethod = "isUImm<6>";
-  let RenderMethod = "addImmOperands";
-  let DiagnosticType = "UImm6";
-}
-
-def bitfield32_imm : Operand<i64>,
-                     ImmLeaf<i64, [{ return Imm >= 0 && Imm < 32; }]> {
-  let ParserMatchClass = uimm5_asmoperand;
-
-  let DecoderMethod = "DecodeBitfield32ImmOperand";
-}
-
-
-def bitfield64_imm : Operand<i64>,
-                     ImmLeaf<i64, [{ return Imm >= 0 && Imm < 64; }]> {
-  let ParserMatchClass = uimm6_asmoperand;
-
-  // Default decoder works in 64-bit case: the 6-bit field can take any value.
-}
-
-multiclass A64I_bitfieldSizes<bits<2> opc, string asmop> {
-  def wwii : A64I_bitfield<0b0, opc, 0b0, (outs GPR32:$Rd),
-                    (ins GPR32:$Rn, bitfield32_imm:$ImmR, bitfield32_imm:$ImmS),
-                    !strconcat(asmop, "\t$Rd, $Rn, $ImmR, $ImmS"),
-                    [], NoItinerary>,
-             Sched<[WriteALU, ReadALU]> {
-    let DecoderMethod = "DecodeBitfieldInstruction";
-  }
-
-  def xxii : A64I_bitfield<0b1, opc, 0b1, (outs GPR64:$Rd),
-                    (ins GPR64:$Rn, bitfield64_imm:$ImmR, bitfield64_imm:$ImmS),
-                    !strconcat(asmop, "\t$Rd, $Rn, $ImmR, $ImmS"),
-                    [], NoItinerary>,
-             Sched<[WriteALU, ReadALU]> {
-    let DecoderMethod = "DecodeBitfieldInstruction";
-  }
-}
-
-defm SBFM : A64I_bitfieldSizes<0b00, "sbfm">;
-defm UBFM : A64I_bitfieldSizes<0b10, "ubfm">;
-
-// BFM instructions modify the destination register rather than defining it
-// completely.
-def BFMwwii :
-  A64I_bitfield<0b0, 0b01, 0b0, (outs GPR32:$Rd),
-        (ins GPR32:$src, GPR32:$Rn, bitfield32_imm:$ImmR, bitfield32_imm:$ImmS),
-        "bfm\t$Rd, $Rn, $ImmR, $ImmS", [], NoItinerary>,
-  Sched<[WriteALU, ReadALU, ReadALU]> {
-  let DecoderMethod = "DecodeBitfieldInstruction";
-  let Constraints = "$src = $Rd";
-}
-
-def BFMxxii :
-  A64I_bitfield<0b1, 0b01, 0b1, (outs GPR64:$Rd),
-        (ins GPR64:$src, GPR64:$Rn, bitfield64_imm:$ImmR, bitfield64_imm:$ImmS),
-        "bfm\t$Rd, $Rn, $ImmR, $ImmS", [], NoItinerary>,
-  Sched<[WriteALU, ReadALU, ReadALU]> {
-  let DecoderMethod = "DecodeBitfieldInstruction";
-  let Constraints = "$src = $Rd";
-}
-
-
-//===-------------------------------
-// 2. Extend aliases to 64-bit dest
-//===-------------------------------
-
-// Unfortunately the extensions that end up as 64-bits cannot be handled by an
-// instruction alias: their syntax is (for example) "SXTB x0, w0", which needs
-// to be mapped to "SBFM x0, x0, #0, 7" (changing the class of Rn). InstAlias is
-// not capable of such a map as far as I'm aware
-
-// Note that these instructions are strictly more specific than the
-// BFM ones (in ImmR) so they can handle their own decoding.
-class A64I_bf_ext<bit sf, bits<2> opc, RegisterClass GPRDest, ValueType dty,
-                    string asmop, bits<6> imms, dag pattern>
-  : A64I_bitfield<sf, opc, sf,
-                  (outs GPRDest:$Rd), (ins GPR32:$Rn),
-                  !strconcat(asmop, "\t$Rd, $Rn"),
-                  [(set dty:$Rd, pattern)], NoItinerary>,
-    Sched<[WriteALU, ReadALU]> {
-  let ImmR = 0b000000;
-  let ImmS = imms;
-}
-
-// Signed extensions
-def SXTBxw : A64I_bf_ext<0b1, 0b00, GPR64, i64, "sxtb", 7,
-                         (sext_inreg (anyext i32:$Rn), i8)>;
-def SXTBww : A64I_bf_ext<0b0, 0b00, GPR32, i32, "sxtb", 7,
-                         (sext_inreg i32:$Rn, i8)>;
-def SXTHxw : A64I_bf_ext<0b1, 0b00, GPR64, i64, "sxth", 15,
-                         (sext_inreg (anyext i32:$Rn), i16)>;
-def SXTHww : A64I_bf_ext<0b0, 0b00, GPR32, i32, "sxth", 15,
-                         (sext_inreg i32:$Rn, i16)>;
-def SXTWxw : A64I_bf_ext<0b1, 0b00, GPR64, i64, "sxtw", 31, (sext i32:$Rn)>;
-
-// Unsigned extensions
-def UXTBww : A64I_bf_ext<0b0, 0b10, GPR32, i32, "uxtb", 7,
-                         (and i32:$Rn, 255)>;
-def UXTHww : A64I_bf_ext<0b0, 0b10, GPR32, i32, "uxth", 15,
-                         (and i32:$Rn, 65535)>;
-
-// The 64-bit unsigned variants are not strictly architectural but recommended
-// for consistency.
-let isAsmParserOnly = 1 in {
-  def UXTBxw : A64I_bf_ext<0b0, 0b10, GPR64, i64, "uxtb", 7,
-                           (and (anyext i32:$Rn), 255)>;
-  def UXTHxw : A64I_bf_ext<0b0, 0b10, GPR64, i64, "uxth", 15,
-                           (and (anyext i32:$Rn), 65535)>;
-}
-
-// Extra patterns for when the source register is actually 64-bits
-// too. There's no architectural difference here, it's just LLVM
-// shinanigans. There's no need for equivalent zero-extension patterns
-// because they'll already be caught by logical (immediate) matching.
-def : Pat<(sext_inreg i64:$Rn, i8),
-          (SXTBxw (EXTRACT_SUBREG $Rn, sub_32))>;
-def : Pat<(sext_inreg i64:$Rn, i16),
-          (SXTHxw (EXTRACT_SUBREG $Rn, sub_32))>;
-def : Pat<(sext_inreg i64:$Rn, i32),
-          (SXTWxw (EXTRACT_SUBREG $Rn, sub_32))>;
-
-
-//===-------------------------------
-// 3. Aliases for ASR and LSR (the simple shifts)
-//===-------------------------------
-
-// These also handle their own decoding because ImmS being set makes
-// them take precedence over BFM.
-multiclass A64I_shift<bits<2> opc, string asmop, SDNode opnode> {
-  def wwi : A64I_bitfield<0b0, opc, 0b0,
-                    (outs GPR32:$Rd), (ins GPR32:$Rn, bitfield32_imm:$ImmR),
-                    !strconcat(asmop, "\t$Rd, $Rn, $ImmR"),
-                    [(set i32:$Rd, (opnode i32:$Rn, bitfield32_imm:$ImmR))],
-                    NoItinerary>,
-            Sched<[WriteALU, ReadALU]> {
-    let ImmS = 31;
-  }
-
-  def xxi : A64I_bitfield<0b1, opc, 0b1,
-                    (outs GPR64:$Rd), (ins GPR64:$Rn, bitfield64_imm:$ImmR),
-                    !strconcat(asmop, "\t$Rd, $Rn, $ImmR"),
-                    [(set i64:$Rd, (opnode i64:$Rn, bitfield64_imm:$ImmR))],
-                    NoItinerary>,
-            Sched<[WriteALU, ReadALU]> {
-    let ImmS = 63;
-  }
-
-}
-
-defm ASR : A64I_shift<0b00, "asr", sra>;
-defm LSR : A64I_shift<0b10, "lsr", srl>;
-
-//===-------------------------------
-// 4. Aliases for LSL
-//===-------------------------------
-
-// Unfortunately LSL and subsequent aliases are much more complicated. We need
-// to be able to say certain output instruction fields depend in a complex
-// manner on combinations of input assembly fields).
-//
-// MIOperandInfo *might* have been able to do it, but at the cost of
-// significantly more C++ code.
-
-// N.b. contrary to usual practice these operands store the shift rather than
-// the machine bits in an MCInst. The complexity overhead of consistency
-// outweighed the benefits in this case (custom asmparser, printer and selection
-// vs custom encoder).
-def bitfield32_lsl_imm : Operand<i64>,
-                         ImmLeaf<i64, [{ return Imm >= 0 && Imm <= 31; }]> {
-  let ParserMatchClass = uimm5_asmoperand;
-  let EncoderMethod = "getBitfield32LSLOpValue";
-}
-
-def bitfield64_lsl_imm : Operand<i64>,
-                         ImmLeaf<i64, [{ return Imm >= 0 && Imm <= 63; }]> {
-  let ParserMatchClass = uimm6_asmoperand;
-  let EncoderMethod = "getBitfield64LSLOpValue";
-}
-
-class A64I_bitfield_lsl<bit sf, RegisterClass GPR, ValueType ty,
-                        Operand operand>
-  : A64I_bitfield<sf, 0b10, sf, (outs GPR:$Rd), (ins GPR:$Rn, operand:$FullImm),
-                  "lsl\t$Rd, $Rn, $FullImm",
-                  [(set ty:$Rd, (shl ty:$Rn, operand:$FullImm))],
-                  NoItinerary>,
-    Sched<[WriteALU, ReadALU]> {
-  bits<12> FullImm;
-  let ImmR = FullImm{5-0};
-  let ImmS = FullImm{11-6};
-
-  // No disassembler allowed because it would overlap with BFM which does the
-  // actual work.
-  let isAsmParserOnly = 1;
-}
-
-def LSLwwi : A64I_bitfield_lsl<0b0, GPR32, i32, bitfield32_lsl_imm>;
-def LSLxxi : A64I_bitfield_lsl<0b1, GPR64, i64, bitfield64_lsl_imm>;
-
-//===-------------------------------
-// 5. Aliases for bitfield extract instructions
-//===-------------------------------
-
-def bfx32_width_asmoperand : AsmOperandClass {
-  let Name = "BFX32Width";
-  let PredicateMethod = "isBitfieldWidth<32>";
-  let RenderMethod = "addBFXWidthOperands";
-  let DiagnosticType = "Width32";
-}
-
-def bfx32_width : Operand<i64>, ImmLeaf<i64, [{ return true; }]> {
-  let PrintMethod = "printBFXWidthOperand";
-  let ParserMatchClass = bfx32_width_asmoperand;
-}
-
-def bfx64_width_asmoperand : AsmOperandClass {
-  let Name = "BFX64Width";
-  let PredicateMethod = "isBitfieldWidth<64>";
-  let RenderMethod = "addBFXWidthOperands";
-  let DiagnosticType = "Width64";
-}
-
-def bfx64_width : Operand<i64> {
-  let PrintMethod = "printBFXWidthOperand";
-  let ParserMatchClass = bfx64_width_asmoperand;
-}
-
-
-multiclass A64I_bitfield_extract<bits<2> opc, string asmop, SDNode op> {
-  def wwii : A64I_bitfield<0b0, opc, 0b0, (outs GPR32:$Rd),
-                       (ins GPR32:$Rn, bitfield32_imm:$ImmR, bfx32_width:$ImmS),
-                       !strconcat(asmop, "\t$Rd, $Rn, $ImmR, $ImmS"),
-                       [(set i32:$Rd, (op i32:$Rn, imm:$ImmR, imm:$ImmS))],
-                       NoItinerary>,
-             Sched<[WriteALU, ReadALU]> {
-    // As above, no disassembler allowed.
-    let isAsmParserOnly = 1;
-  }
-
-  def xxii : A64I_bitfield<0b1, opc, 0b1, (outs GPR64:$Rd),
-                       (ins GPR64:$Rn, bitfield64_imm:$ImmR, bfx64_width:$ImmS),
-                       !strconcat(asmop, "\t$Rd, $Rn, $ImmR, $ImmS"),
-                       [(set i64:$Rd, (op i64:$Rn, imm:$ImmR, imm:$ImmS))],
-                       NoItinerary>,
-             Sched<[WriteALU, ReadALU]> {
-    // As above, no disassembler allowed.
-    let isAsmParserOnly = 1;
-  }
-}
-
-defm SBFX :  A64I_bitfield_extract<0b00, "sbfx", A64Sbfx>;
-defm UBFX :  A64I_bitfield_extract<0b10, "ubfx", A64Ubfx>;
-
-// Again, variants based on BFM modify Rd so need it as an input too.
-def BFXILwwii : A64I_bitfield<0b0, 0b01, 0b0, (outs GPR32:$Rd),
-                          (ins GPR32:$src, GPR32:$Rn, bitfield32_imm:$ImmR, bfx32_width:$ImmS),
-                          "bfxil\t$Rd, $Rn, $ImmR, $ImmS", [], NoItinerary>,
-                Sched<[WriteALU, ReadALU, ReadALU]> {
-  // As above, no disassembler allowed.
-  let isAsmParserOnly = 1;
-  let Constraints = "$src = $Rd";
-}
-
-def BFXILxxii : A64I_bitfield<0b1, 0b01, 0b1, (outs GPR64:$Rd),
-                          (ins GPR64:$src, GPR64:$Rn, bitfield64_imm:$ImmR, bfx64_width:$ImmS),
-                          "bfxil\t$Rd, $Rn, $ImmR, $ImmS", [], NoItinerary>,
-                Sched<[WriteALU, ReadALU, ReadALU]> {
-  // As above, no disassembler allowed.
-  let isAsmParserOnly = 1;
-  let Constraints = "$src = $Rd";
-}
-
-// SBFX instructions can do a 1-instruction sign-extension of boolean values.
-def : Pat<(sext_inreg i64:$Rn, i1), (SBFXxxii $Rn, 0, 0)>;
-def : Pat<(sext_inreg i32:$Rn, i1), (SBFXwwii $Rn, 0, 0)>;
-def : Pat<(i64 (sext_inreg (anyext i32:$Rn), i1)),
-          (SBFXxxii (SUBREG_TO_REG (i64 0), $Rn, sub_32), 0, 0)>;
-
-// UBFX makes sense as an implementation of a 64-bit zero-extension too. Could
-// use either 64-bit or 32-bit variant, but 32-bit might be more efficient.
-def : Pat<(i64 (zext i32:$Rn)), (SUBREG_TO_REG (i64 0), (UBFXwwii $Rn, 0, 31),
-                                         sub_32)>;
-
-//===-------------------------------
-// 6. Aliases for bitfield insert instructions
-//===-------------------------------
-
-def bfi32_lsb_asmoperand : AsmOperandClass {
-  let Name = "BFI32LSB";
-  let PredicateMethod = "isUImm<5>";
-  let RenderMethod = "addBFILSBOperands<32>";
-  let DiagnosticType = "UImm5";
-}
-
-def bfi32_lsb : Operand<i64>,
-                ImmLeaf<i64, [{ return Imm >= 0 && Imm <= 31; }]> {
-  let PrintMethod = "printBFILSBOperand<32>";
-  let ParserMatchClass = bfi32_lsb_asmoperand;
-}
-
-def bfi64_lsb_asmoperand : AsmOperandClass {
-  let Name = "BFI64LSB";
-  let PredicateMethod = "isUImm<6>";
-  let RenderMethod = "addBFILSBOperands<64>";
-  let DiagnosticType = "UImm6";
-}
-
-def bfi64_lsb : Operand<i64>,
-                ImmLeaf<i64, [{ return Imm >= 0 && Imm <= 63; }]> {
-  let PrintMethod = "printBFILSBOperand<64>";
-  let ParserMatchClass = bfi64_lsb_asmoperand;
-}
-
-// Width verification is performed during conversion so width operand can be
-// shared between 32/64-bit cases. Still needed for the print method though
-// because ImmR encodes "width - 1".
-def bfi32_width_asmoperand : AsmOperandClass {
-  let Name = "BFI32Width";
-  let PredicateMethod = "isBitfieldWidth<32>";
-  let RenderMethod = "addBFIWidthOperands";
-  let DiagnosticType = "Width32";
-}
-
-def bfi32_width : Operand<i64>,
-                  ImmLeaf<i64, [{ return Imm >= 1 && Imm <= 32; }]> {
-  let PrintMethod = "printBFIWidthOperand";
-  let ParserMatchClass = bfi32_width_asmoperand;
-}
-
-def bfi64_width_asmoperand : AsmOperandClass {
-  let Name = "BFI64Width";
-  let PredicateMethod = "isBitfieldWidth<64>";
-  let RenderMethod = "addBFIWidthOperands";
-  let DiagnosticType = "Width64";
-}
-
-def bfi64_width : Operand<i64>,
-                  ImmLeaf<i64, [{ return Imm >= 1 && Imm <= 64; }]> {
-  let PrintMethod = "printBFIWidthOperand";
-  let ParserMatchClass = bfi64_width_asmoperand;
-}
-
-multiclass A64I_bitfield_insert<bits<2> opc, string asmop> {
-  def wwii : A64I_bitfield<0b0, opc, 0b0, (outs GPR32:$Rd),
-                           (ins GPR32:$Rn, bfi32_lsb:$ImmR, bfi32_width:$ImmS),
-                           !strconcat(asmop, "\t$Rd, $Rn, $ImmR, $ImmS"),
-                           [], NoItinerary>,
-             Sched<[WriteALU, ReadALU]> {
-    // As above, no disassembler allowed.
-    let isAsmParserOnly = 1;
-  }
-
-  def xxii : A64I_bitfield<0b1, opc, 0b1, (outs GPR64:$Rd),
-                           (ins GPR64:$Rn, bfi64_lsb:$ImmR, bfi64_width:$ImmS),
-                           !strconcat(asmop, "\t$Rd, $Rn, $ImmR, $ImmS"),
-                           [], NoItinerary>,
-             Sched<[WriteALU, ReadALU]> {
-    // As above, no disassembler allowed.
-    let isAsmParserOnly = 1;
-  }
-}
-
-defm SBFIZ :  A64I_bitfield_insert<0b00, "sbfiz">;
-defm UBFIZ :  A64I_bitfield_insert<0b10, "ubfiz">;
 
+// (immediate)
+defm ANDS : LogicalImmS<0b11, "ands", AArch64and_flag>;
+defm AND  : LogicalImm<0b00, "and", and>;
+defm EOR  : LogicalImm<0b10, "eor", xor>;
+defm ORR  : LogicalImm<0b01, "orr", or>;
+
+// FIXME: these aliases *are* canonical sometimes (when movz can't be
+// used). Actually, it seems to be working right now, but putting logical_immXX
+// here is a bit dodgy on the AsmParser side too.
+def : InstAlias<"mov $dst, $imm", (ORRWri GPR32sp:$dst, WZR,
+                                          logical_imm32:$imm), 0>;
+def : InstAlias<"mov $dst, $imm", (ORRXri GPR64sp:$dst, XZR,
+                                          logical_imm64:$imm), 0>;
+
+
+// (register)
+defm ANDS : LogicalRegS<0b11, 0, "ands", AArch64and_flag>;
+defm BICS : LogicalRegS<0b11, 1, "bics",
+                        BinOpFrag<(AArch64and_flag node:$LHS, (not node:$RHS))>>;
+defm AND  : LogicalReg<0b00, 0, "and", and>;
+defm BIC  : LogicalReg<0b00, 1, "bic",
+                       BinOpFrag<(and node:$LHS, (not node:$RHS))>>;
+defm EON  : LogicalReg<0b10, 1, "eon",
+                       BinOpFrag<(xor node:$LHS, (not node:$RHS))>>;
+defm EOR  : LogicalReg<0b10, 0, "eor", xor>;
+defm ORN  : LogicalReg<0b01, 1, "orn",
+                       BinOpFrag<(or node:$LHS, (not node:$RHS))>>;
+defm ORR  : LogicalReg<0b01, 0, "orr", or>;
+
+def : InstAlias<"mov $dst, $src", (ORRWrs GPR32:$dst, WZR, GPR32:$src, 0), 2>;
+def : InstAlias<"mov $dst, $src", (ORRXrs GPR64:$dst, XZR, GPR64:$src, 0), 2>;
+
+def : InstAlias<"mvn $Wd, $Wm", (ORNWrs GPR32:$Wd, WZR, GPR32:$Wm, 0), 3>;
+def : InstAlias<"mvn $Xd, $Xm", (ORNXrs GPR64:$Xd, XZR, GPR64:$Xm, 0), 3>;
+
+def : InstAlias<"mvn $Wd, $Wm$sh",
+                (ORNWrs GPR32:$Wd, WZR, GPR32:$Wm, logical_shift32:$sh), 2>;
+def : InstAlias<"mvn $Xd, $Xm$sh",
+                (ORNXrs GPR64:$Xd, XZR, GPR64:$Xm, logical_shift64:$sh), 2>;
+
+def : InstAlias<"tst $src1, $src2",
+                (ANDSWri WZR, GPR32:$src1, logical_imm32:$src2), 2>;
+def : InstAlias<"tst $src1, $src2",
+                (ANDSXri XZR, GPR64:$src1, logical_imm64:$src2), 2>;
+
+def : InstAlias<"tst $src1, $src2",
+                        (ANDSWrs WZR, GPR32:$src1, GPR32:$src2, 0), 3>;
+def : InstAlias<"tst $src1, $src2",
+                        (ANDSXrs XZR, GPR64:$src1, GPR64:$src2, 0), 3>;
+
+def : InstAlias<"tst $src1, $src2$sh",
+               (ANDSWrs WZR, GPR32:$src1, GPR32:$src2, logical_shift32:$sh), 2>;
+def : InstAlias<"tst $src1, $src2$sh",
+               (ANDSXrs XZR, GPR64:$src1, GPR64:$src2, logical_shift64:$sh), 2>;
+
+
+def : Pat<(not GPR32:$Wm), (ORNWrr WZR, GPR32:$Wm)>;
+def : Pat<(not GPR64:$Xm), (ORNXrr XZR, GPR64:$Xm)>;
 
-def BFIwwii : A64I_bitfield<0b0, 0b01, 0b0, (outs GPR32:$Rd),
-                (ins GPR32:$src, GPR32:$Rn, bfi32_lsb:$ImmR, bfi32_width:$ImmS),
-                "bfi\t$Rd, $Rn, $ImmR, $ImmS", [], NoItinerary>,
-              Sched<[WriteALU, ReadALU, ReadALU]> {
-  // As above, no disassembler allowed.
-  let isAsmParserOnly = 1;
-  let Constraints = "$src = $Rd";
-}
-
-def BFIxxii : A64I_bitfield<0b1, 0b01, 0b1, (outs GPR64:$Rd),
-                (ins GPR64:$src, GPR64:$Rn, bfi64_lsb:$ImmR, bfi64_width:$ImmS),
-                "bfi\t$Rd, $Rn, $ImmR, $ImmS", [], NoItinerary>,
-              Sched<[WriteALU, ReadALU, ReadALU]> {
-  // As above, no disassembler allowed.
-  let isAsmParserOnly = 1;
-  let Constraints = "$src = $Rd";
-}
 
 //===----------------------------------------------------------------------===//
-// Compare and branch (immediate)
+// One operand data processing instructions.
 //===----------------------------------------------------------------------===//
-// Contains: CBZ, CBNZ
 
-class label_asmoperand<int width, int scale> : AsmOperandClass {
-  let Name = "Label" # width # "_" # scale;
-  let PredicateMethod = "isLabel<" # width # "," # scale # ">";
-  let RenderMethod = "addLabelOperands<" # width # ", " # scale # ">";
-  let DiagnosticType = "Label";
-}
-
-def label_wid19_scal4_asmoperand : label_asmoperand<19, 4>;
-
-// All conditional immediate branches are the same really: 19 signed bits scaled
-// by the instruction-size (4).
-def bcc_target : Operand<OtherVT> {
-  // This label is a 19-bit offset from PC, scaled by the instruction-width: 4.
-  let ParserMatchClass = label_wid19_scal4_asmoperand;
-  let PrintMethod = "printLabelOperand<19, 4>";
-  let EncoderMethod = "getLabelOpValue<AArch64::fixup_a64_condbr>";
-  let OperandType = "OPERAND_PCREL";
-}
-
-multiclass cmpbr_sizes<bit op, string asmop, ImmLeaf SETOP> {
-  let isBranch = 1, isTerminator = 1 in {
-  def x : A64I_cmpbr<0b1, op,
-                     (outs),
-                     (ins GPR64:$Rt, bcc_target:$Label),
-                     !strconcat(asmop,"\t$Rt, $Label"),
-                     [(A64br_cc (A64cmp i64:$Rt, 0), SETOP, bb:$Label)],
-                     NoItinerary>,
-          Sched<[WriteBr, ReadBr]>;
-
-  def w : A64I_cmpbr<0b0, op,
-                     (outs),
-                     (ins GPR32:$Rt, bcc_target:$Label),
-                     !strconcat(asmop,"\t$Rt, $Label"),
-                     [(A64br_cc (A64cmp i32:$Rt, 0), SETOP, bb:$Label)],
-                     NoItinerary>,
-          Sched<[WriteBr, ReadBr]>;
-  }
-}
-
-defm CBZ  : cmpbr_sizes<0b0, "cbz",  ImmLeaf<i32, [{
-  return Imm == A64CC::EQ;
-}]> >;
-defm CBNZ : cmpbr_sizes<0b1, "cbnz", ImmLeaf<i32, [{
-  return Imm == A64CC::NE;
-}]> >;
+defm CLS    : OneOperandData<0b101, "cls">;
+defm CLZ    : OneOperandData<0b100, "clz", ctlz>;
+defm RBIT   : OneOperandData<0b000, "rbit">;
+def  REV16Wr : OneWRegData<0b001, "rev16",
+                                  UnOpFrag<(rotr (bswap node:$LHS), (i64 16))>>;
+def  REV16Xr : OneXRegData<0b001, "rev16", null_frag>;
+
+def : Pat<(cttz GPR32:$Rn),
+          (CLZWr (RBITWr GPR32:$Rn))>;
+def : Pat<(cttz GPR64:$Rn),
+          (CLZXr (RBITXr GPR64:$Rn))>;
+def : Pat<(ctlz (or (shl (xor (sra GPR32:$Rn, (i64 31)), GPR32:$Rn), (i64 1)),
+                (i32 1))),
+          (CLSWr GPR32:$Rn)>;
+def : Pat<(ctlz (or (shl (xor (sra GPR64:$Rn, (i64 63)), GPR64:$Rn), (i64 1)),
+                (i64 1))),
+          (CLSXr GPR64:$Rn)>;
+
+// Unlike the other one operand instructions, the instructions with the "rev"
+// mnemonic do *not* just different in the size bit, but actually use different
+// opcode bits for the different sizes.
+def REVWr   : OneWRegData<0b010, "rev", bswap>;
+def REVXr   : OneXRegData<0b011, "rev", bswap>;
+def REV32Xr : OneXRegData<0b010, "rev32",
+                                 UnOpFrag<(rotr (bswap node:$LHS), (i64 32))>>;
+
+// The bswap commutes with the rotr so we want a pattern for both possible
+// orders.
+def : Pat<(bswap (rotr GPR32:$Rn, (i64 16))), (REV16Wr GPR32:$Rn)>;
+def : Pat<(bswap (rotr GPR64:$Rn, (i64 32))), (REV32Xr GPR64:$Rn)>;
 
 //===----------------------------------------------------------------------===//
-// Conditional branch (immediate) instructions
+// Bitfield immediate extraction instruction.
 //===----------------------------------------------------------------------===//
-// Contains: B.cc
-
-def cond_code_asmoperand : AsmOperandClass {
-  let Name = "CondCode";
-  let DiagnosticType = "CondCode";
-}
-
-def cond_code : Operand<i32>, ImmLeaf<i32, [{
-  return Imm >= 0 && Imm <= 15;
-}]> {
-  let PrintMethod = "printCondCodeOperand";
-  let ParserMatchClass = cond_code_asmoperand;
-}
-
-def Bcc : A64I_condbr<0b0, 0b0, (outs),
-                (ins cond_code:$Cond, bcc_target:$Label),
-                "b.$Cond $Label", [(A64br_cc NZCV, (i32 imm:$Cond), bb:$Label)],
-                NoItinerary>,
-          Sched<[WriteBr]> {
-  let Uses = [NZCV];
-  let isBranch = 1;
-  let isTerminator = 1;
-}
+let neverHasSideEffects = 1 in
+defm EXTR : ExtractImm<"extr">;
+def : InstAlias<"ror $dst, $src, $shift",
+            (EXTRWrri GPR32:$dst, GPR32:$src, GPR32:$src, imm0_31:$shift)>;
+def : InstAlias<"ror $dst, $src, $shift",
+            (EXTRXrri GPR64:$dst, GPR64:$src, GPR64:$src, imm0_63:$shift)>;
+
+def : Pat<(rotr GPR32:$Rn, (i64 imm0_31:$imm)),
+          (EXTRWrri GPR32:$Rn, GPR32:$Rn, imm0_31:$imm)>;
+def : Pat<(rotr GPR64:$Rn, (i64 imm0_63:$imm)),
+          (EXTRXrri GPR64:$Rn, GPR64:$Rn, imm0_63:$imm)>;
 
 //===----------------------------------------------------------------------===//
-// Conditional compare (immediate) instructions
+// Other bitfield immediate instructions.
 //===----------------------------------------------------------------------===//
-// Contains: CCMN, CCMP
-
-def uimm4_asmoperand : AsmOperandClass {
-  let Name = "UImm4";
-  let PredicateMethod = "isUImm<4>";
-  let RenderMethod = "addImmOperands";
-  let DiagnosticType = "UImm4";
-}
-
-def uimm4 : Operand<i32> {
-  let ParserMatchClass = uimm4_asmoperand;
+let neverHasSideEffects = 1 in {
+defm BFM  : BitfieldImmWith2RegArgs<0b01, "bfm">;
+defm SBFM : BitfieldImm<0b00, "sbfm">;
+defm UBFM : BitfieldImm<0b10, "ubfm">;
 }
 
-def uimm5 : Operand<i32> {
-  let ParserMatchClass = uimm5_asmoperand;
-}
-
-// The only difference between this operand and the one for instructions like
-// B.cc is that it's parsed manually. The other get parsed implicitly as part of
-// the mnemonic handling.
-def cond_code_op_asmoperand : AsmOperandClass {
-  let Name = "CondCodeOp";
-  let RenderMethod = "addCondCodeOperands";
-  let PredicateMethod = "isCondCode";
-  let ParserMethod = "ParseCondCodeOperand";
-  let DiagnosticType = "CondCode";
-}
+def i32shift_a : Operand<i64>, SDNodeXForm<imm, [{
+  uint64_t enc = (32 - N->getZExtValue()) & 0x1f;
+  return CurDAG->getTargetConstant(enc, MVT::i64);
+}]>;
 
-def cond_code_op : Operand<i32> {
-  let PrintMethod = "printCondCodeOperand";
-  let ParserMatchClass = cond_code_op_asmoperand;
-}
+def i32shift_b : Operand<i64>, SDNodeXForm<imm, [{
+  uint64_t enc = 31 - N->getZExtValue();
+  return CurDAG->getTargetConstant(enc, MVT::i64);
+}]>;
 
-class A64I_condcmpimmImpl<bit sf, bit op, RegisterClass GPR, string asmop>
-  : A64I_condcmpimm<sf, op, 0b0, 0b0, 0b1, (outs),
-                (ins GPR:$Rn, uimm5:$UImm5, uimm4:$NZCVImm, cond_code_op:$Cond),
-                !strconcat(asmop, "\t$Rn, $UImm5, $NZCVImm, $Cond"),
-                [], NoItinerary>,
-    Sched<[WriteCMP, ReadCMP]> {
-  let Defs = [NZCV];
-}
+// min(7, 31 - shift_amt)
+def i32shift_sext_i8 : Operand<i64>, SDNodeXForm<imm, [{
+  uint64_t enc = 31 - N->getZExtValue();
+  enc = enc > 7 ? 7 : enc;
+  return CurDAG->getTargetConstant(enc, MVT::i64);
+}]>;
 
-def CCMNwi : A64I_condcmpimmImpl<0b0, 0b0, GPR32, "ccmn">;
-def CCMNxi : A64I_condcmpimmImpl<0b1, 0b0, GPR64, "ccmn">;
-def CCMPwi : A64I_condcmpimmImpl<0b0, 0b1, GPR32, "ccmp">;
-def CCMPxi : A64I_condcmpimmImpl<0b1, 0b1, GPR64, "ccmp">;
+// min(15, 31 - shift_amt)
+def i32shift_sext_i16 : Operand<i64>, SDNodeXForm<imm, [{
+  uint64_t enc = 31 - N->getZExtValue();
+  enc = enc > 15 ? 15 : enc;
+  return CurDAG->getTargetConstant(enc, MVT::i64);
+}]>;
 
-//===----------------------------------------------------------------------===//
-// Conditional compare (register) instructions
-//===----------------------------------------------------------------------===//
-// Contains: CCMN, CCMP
-
-class A64I_condcmpregImpl<bit sf, bit op, RegisterClass GPR, string asmop>
-  : A64I_condcmpreg<sf, op, 0b0, 0b0, 0b1,
-                    (outs),
-                    (ins GPR:$Rn, GPR:$Rm, uimm4:$NZCVImm, cond_code_op:$Cond),
-                    !strconcat(asmop, "\t$Rn, $Rm, $NZCVImm, $Cond"),
-                    [], NoItinerary>,
-    Sched<[WriteCMP, ReadCMP, ReadCMP]> {
-  let Defs = [NZCV];
-}
+def i64shift_a : Operand<i64>, SDNodeXForm<imm, [{
+  uint64_t enc = (64 - N->getZExtValue()) & 0x3f;
+  return CurDAG->getTargetConstant(enc, MVT::i64);
+}]>;
 
-def CCMNww : A64I_condcmpregImpl<0b0, 0b0, GPR32, "ccmn">;
-def CCMNxx : A64I_condcmpregImpl<0b1, 0b0, GPR64, "ccmn">;
-def CCMPww : A64I_condcmpregImpl<0b0, 0b1, GPR32, "ccmp">;
-def CCMPxx : A64I_condcmpregImpl<0b1, 0b1, GPR64, "ccmp">;
+def i64shift_b : Operand<i64>, SDNodeXForm<imm, [{
+  uint64_t enc = 63 - N->getZExtValue();
+  return CurDAG->getTargetConstant(enc, MVT::i64);
+}]>;
 
-//===----------------------------------------------------------------------===//
-// Conditional select instructions
-//===----------------------------------------------------------------------===//
-// Contains: CSEL, CSINC, CSINV, CSNEG + aliases CSET, CSETM, CINC, CINV, CNEG
-
-// Condition code which is encoded as the inversion (semantically rather than
-// bitwise) in the instruction.
-def inv_cond_code_op_asmoperand : AsmOperandClass {
-  let Name = "InvCondCodeOp";
-  let RenderMethod = "addInvCondCodeOperands";
-  let PredicateMethod = "isCondCode";
-  let ParserMethod = "ParseCondCodeOperand";
-  let DiagnosticType = "CondCode";
-}
+// min(7, 63 - shift_amt)
+def i64shift_sext_i8 : Operand<i64>, SDNodeXForm<imm, [{
+  uint64_t enc = 63 - N->getZExtValue();
+  enc = enc > 7 ? 7 : enc;
+  return CurDAG->getTargetConstant(enc, MVT::i64);
+}]>;
 
-def inv_cond_code_op : Operand<i32> {
-  let ParserMatchClass = inv_cond_code_op_asmoperand;
-}
+// min(15, 63 - shift_amt)
+def i64shift_sext_i16 : Operand<i64>, SDNodeXForm<imm, [{
+  uint64_t enc = 63 - N->getZExtValue();
+  enc = enc > 15 ? 15 : enc;
+  return CurDAG->getTargetConstant(enc, MVT::i64);
+}]>;
 
-// Having a separate operand for the selectable use-case is debatable, but gives
-// consistency with cond_code.
-def inv_cond_XFORM : SDNodeXForm<imm, [{
-  A64CC::CondCodes CC = static_cast<A64CC::CondCodes>(N->getZExtValue());
-  return CurDAG->getTargetConstant(A64InvertCondCode(CC), MVT::i32);
+// min(31, 63 - shift_amt)
+def i64shift_sext_i32 : Operand<i64>, SDNodeXForm<imm, [{
+  uint64_t enc = 63 - N->getZExtValue();
+  enc = enc > 31 ? 31 : enc;
+  return CurDAG->getTargetConstant(enc, MVT::i64);
 }]>;
 
-def inv_cond_code
-  : ImmLeaf<i32, [{ return Imm >= 0 && Imm <= 15; }], inv_cond_XFORM>;
-
-
-multiclass A64I_condselSizes<bit op, bits<2> op2, string asmop,
-                             SDPatternOperator select> {
-  let Uses = [NZCV] in {
-    def wwwc : A64I_condsel<0b0, op, 0b0, op2,
-                            (outs GPR32:$Rd),
-                            (ins GPR32:$Rn, GPR32:$Rm, cond_code_op:$Cond),
-                            !strconcat(asmop, "\t$Rd, $Rn, $Rm, $Cond"),
-                            [(set i32:$Rd, (select i32:$Rn, i32:$Rm))],
-                            NoItinerary>,
-               Sched<[WriteCMP, ReadCMP, ReadCMP]>;
-
-
-    def xxxc : A64I_condsel<0b1, op, 0b0, op2,
-                            (outs GPR64:$Rd),
-                            (ins GPR64:$Rn, GPR64:$Rm, cond_code_op:$Cond),
-                            !strconcat(asmop, "\t$Rd, $Rn, $Rm, $Cond"),
-                            [(set i64:$Rd, (select i64:$Rn, i64:$Rm))],
-                            NoItinerary>,
-               Sched<[WriteCMP, ReadCMP, ReadCMP]>;
-  }
-}
+def : Pat<(shl GPR32:$Rn, (i64 imm0_31:$imm)),
+          (UBFMWri GPR32:$Rn, (i64 (i32shift_a imm0_31:$imm)),
+                              (i64 (i32shift_b imm0_31:$imm)))>;
+def : Pat<(shl GPR64:$Rn, (i64 imm0_63:$imm)),
+          (UBFMXri GPR64:$Rn, (i64 (i64shift_a imm0_63:$imm)),
+                              (i64 (i64shift_b imm0_63:$imm)))>;
 
-def simple_select
-  : PatFrag<(ops node:$lhs, node:$rhs),
-            (A64select_cc NZCV, node:$lhs, node:$rhs, (i32 imm:$Cond))>;
-
-class complex_select<SDPatternOperator opnode>
-  : PatFrag<(ops node:$lhs, node:$rhs),
-        (A64select_cc NZCV, node:$lhs, (opnode node:$rhs), (i32 imm:$Cond))>;
-
-
-defm CSEL : A64I_condselSizes<0b0, 0b00, "csel", simple_select>;
-defm CSINC : A64I_condselSizes<0b0, 0b01, "csinc",
-                               complex_select<PatFrag<(ops node:$val),
-                                                      (add node:$val, 1)>>>;
-defm CSINV : A64I_condselSizes<0b1, 0b00, "csinv", complex_select<not>>;
-defm CSNEG : A64I_condselSizes<0b1, 0b01, "csneg", complex_select<ineg>>;
-
-// Now the instruction aliases, which fit nicely into LLVM's model:
-
-def : InstAlias<"cset $Rd, $Cond",
-                (CSINCwwwc GPR32:$Rd, WZR, WZR, inv_cond_code_op:$Cond)>;
-def : InstAlias<"cset $Rd, $Cond",
-                (CSINCxxxc GPR64:$Rd, XZR, XZR, inv_cond_code_op:$Cond)>;
-def : InstAlias<"csetm $Rd, $Cond",
-                (CSINVwwwc GPR32:$Rd, WZR, WZR, inv_cond_code_op:$Cond)>;
-def : InstAlias<"csetm $Rd, $Cond",
-                (CSINVxxxc GPR64:$Rd, XZR, XZR, inv_cond_code_op:$Cond)>;
-def : InstAlias<"cinc $Rd, $Rn, $Cond",
-           (CSINCwwwc GPR32:$Rd, GPR32:$Rn, GPR32:$Rn, inv_cond_code_op:$Cond)>;
-def : InstAlias<"cinc $Rd, $Rn, $Cond",
-           (CSINCxxxc GPR64:$Rd, GPR64:$Rn, GPR64:$Rn, inv_cond_code_op:$Cond)>;
-def : InstAlias<"cinv $Rd, $Rn, $Cond",
-           (CSINVwwwc GPR32:$Rd, GPR32:$Rn, GPR32:$Rn, inv_cond_code_op:$Cond)>;
-def : InstAlias<"cinv $Rd, $Rn, $Cond",
-           (CSINVxxxc GPR64:$Rd, GPR64:$Rn, GPR64:$Rn, inv_cond_code_op:$Cond)>;
-def : InstAlias<"cneg $Rd, $Rn, $Cond",
-           (CSNEGwwwc GPR32:$Rd, GPR32:$Rn, GPR32:$Rn, inv_cond_code_op:$Cond)>;
-def : InstAlias<"cneg $Rd, $Rn, $Cond",
-           (CSNEGxxxc GPR64:$Rd, GPR64:$Rn, GPR64:$Rn, inv_cond_code_op:$Cond)>;
-
-// Finally some helper patterns.
-
-// For CSET (a.k.a. zero-extension of icmp)
-def : Pat<(A64select_cc NZCV, 0, 1, cond_code:$Cond),
-          (CSINCwwwc WZR, WZR, cond_code:$Cond)>;
-def : Pat<(A64select_cc NZCV, 1, 0, inv_cond_code:$Cond),
-          (CSINCwwwc WZR, WZR, inv_cond_code:$Cond)>;
-
-def : Pat<(A64select_cc NZCV, 0, 1, cond_code:$Cond),
-          (CSINCxxxc XZR, XZR, cond_code:$Cond)>;
-def : Pat<(A64select_cc NZCV, 1, 0, inv_cond_code:$Cond),
-          (CSINCxxxc XZR, XZR, inv_cond_code:$Cond)>;
-
-// For CSETM (a.k.a. sign-extension of icmp)
-def : Pat<(A64select_cc NZCV, 0, -1, cond_code:$Cond),
-          (CSINVwwwc WZR, WZR, cond_code:$Cond)>;
-def : Pat<(A64select_cc NZCV, -1, 0, inv_cond_code:$Cond),
-          (CSINVwwwc WZR, WZR, inv_cond_code:$Cond)>;
-
-def : Pat<(A64select_cc NZCV, 0, -1, cond_code:$Cond),
-          (CSINVxxxc XZR, XZR, cond_code:$Cond)>;
-def : Pat<(A64select_cc NZCV, -1, 0, inv_cond_code:$Cond),
-          (CSINVxxxc XZR, XZR, inv_cond_code:$Cond)>;
-
-// CINC, CINV and CNEG get dealt with automatically, which leaves the issue of
-// commutativity. The instructions are to complex for isCommutable to be used,
-// so we have to create the patterns manually:
-
-// No commutable pattern for CSEL since the commuted version is isomorphic.
-
-// CSINC
-def :Pat<(A64select_cc NZCV, (add i32:$Rm, 1), i32:$Rn, inv_cond_code:$Cond),
-         (CSINCwwwc $Rn, $Rm, inv_cond_code:$Cond)>;
-def :Pat<(A64select_cc NZCV, (add i64:$Rm, 1), i64:$Rn, inv_cond_code:$Cond),
-         (CSINCxxxc $Rn, $Rm, inv_cond_code:$Cond)>;
-
-// CSINV
-def :Pat<(A64select_cc NZCV, (not i32:$Rm), i32:$Rn, inv_cond_code:$Cond),
-         (CSINVwwwc $Rn, $Rm, inv_cond_code:$Cond)>;
-def :Pat<(A64select_cc NZCV, (not i64:$Rm), i64:$Rn, inv_cond_code:$Cond),
-         (CSINVxxxc $Rn, $Rm, inv_cond_code:$Cond)>;
-
-// CSNEG
-def :Pat<(A64select_cc NZCV, (ineg i32:$Rm), i32:$Rn, inv_cond_code:$Cond),
-         (CSNEGwwwc $Rn, $Rm, inv_cond_code:$Cond)>;
-def :Pat<(A64select_cc NZCV, (ineg i64:$Rm), i64:$Rn, inv_cond_code:$Cond),
-         (CSNEGxxxc $Rn, $Rm, inv_cond_code:$Cond)>;
+let AddedComplexity = 10 in {
+def : Pat<(sra GPR32:$Rn, (i64 imm0_31:$imm)),
+          (SBFMWri GPR32:$Rn, imm0_31:$imm, 31)>;
+def : Pat<(sra GPR64:$Rn, (i64 imm0_63:$imm)),
+          (SBFMXri GPR64:$Rn, imm0_63:$imm, 63)>;
+}
+
+def : InstAlias<"asr $dst, $src, $shift",
+                (SBFMWri GPR32:$dst, GPR32:$src, imm0_31:$shift, 31)>;
+def : InstAlias<"asr $dst, $src, $shift",
+                (SBFMXri GPR64:$dst, GPR64:$src, imm0_63:$shift, 63)>;
+def : InstAlias<"sxtb $dst, $src", (SBFMWri GPR32:$dst, GPR32:$src, 0, 7)>;
+def : InstAlias<"sxtb $dst, $src", (SBFMXri GPR64:$dst, GPR64:$src, 0, 7)>;
+def : InstAlias<"sxth $dst, $src", (SBFMWri GPR32:$dst, GPR32:$src, 0, 15)>;
+def : InstAlias<"sxth $dst, $src", (SBFMXri GPR64:$dst, GPR64:$src, 0, 15)>;
+def : InstAlias<"sxtw $dst, $src", (SBFMXri GPR64:$dst, GPR64:$src, 0, 31)>;
+
+def : Pat<(srl GPR32:$Rn, (i64 imm0_31:$imm)),
+          (UBFMWri GPR32:$Rn, imm0_31:$imm, 31)>;
+def : Pat<(srl GPR64:$Rn, (i64 imm0_63:$imm)),
+          (UBFMXri GPR64:$Rn, imm0_63:$imm, 63)>;
+
+def : InstAlias<"lsr $dst, $src, $shift",
+                (UBFMWri GPR32:$dst, GPR32:$src, imm0_31:$shift, 31)>;
+def : InstAlias<"lsr $dst, $src, $shift",
+                (UBFMXri GPR64:$dst, GPR64:$src, imm0_63:$shift, 63)>;
+def : InstAlias<"uxtb $dst, $src", (UBFMWri GPR32:$dst, GPR32:$src, 0, 7)>;
+def : InstAlias<"uxtb $dst, $src", (UBFMXri GPR64:$dst, GPR64:$src, 0, 7)>;
+def : InstAlias<"uxth $dst, $src", (UBFMWri GPR32:$dst, GPR32:$src, 0, 15)>;
+def : InstAlias<"uxth $dst, $src", (UBFMXri GPR64:$dst, GPR64:$src, 0, 15)>;
+def : InstAlias<"uxtw $dst, $src", (UBFMXri GPR64:$dst, GPR64:$src, 0, 31)>;
 
 //===----------------------------------------------------------------------===//
-// Data Processing (1 source) instructions
+// Conditionally set flags instructions.
 //===----------------------------------------------------------------------===//
-// Contains: RBIT, REV16, REV, REV32, CLZ, CLS.
-
-// We define an unary operator which always fails. We will use this to
-// define unary operators that cannot be matched.
-
-class A64I_dp_1src_impl<bit sf, bits<6> opcode, string asmop,
-                   list<dag> patterns, RegisterClass GPRrc,
-                   InstrItinClass itin>:
-      A64I_dp_1src<sf,
-                   0,
-                   0b00000,
-                   opcode,
-                   !strconcat(asmop, "\t$Rd, $Rn"),
-                   (outs GPRrc:$Rd),
-                   (ins GPRrc:$Rn),
-                   patterns,
-                   itin>,
-      Sched<[WriteALU, ReadALU]>;
-
-multiclass A64I_dp_1src <bits<6> opcode, string asmop> {
-  let hasSideEffects = 0 in {
-    def ww : A64I_dp_1src_impl<0b0, opcode, asmop, [], GPR32, NoItinerary>;
-    def xx : A64I_dp_1src_impl<0b1, opcode, asmop, [], GPR64, NoItinerary>;
-  }
-}
+defm CCMN : CondSetFlagsImm<0, "ccmn">;
+defm CCMP : CondSetFlagsImm<1, "ccmp">;
 
-defm RBIT  : A64I_dp_1src<0b000000, "rbit">;
-defm CLS   : A64I_dp_1src<0b000101, "cls">;
-defm CLZ   : A64I_dp_1src<0b000100, "clz">;
-
-def : Pat<(ctlz i32:$Rn), (CLZww $Rn)>;
-def : Pat<(ctlz i64:$Rn), (CLZxx $Rn)>;
-def : Pat<(ctlz_zero_undef i32:$Rn), (CLZww $Rn)>;
-def : Pat<(ctlz_zero_undef i64:$Rn), (CLZxx $Rn)>;
-
-def : Pat<(cttz i32:$Rn), (CLZww (RBITww $Rn))>;
-def : Pat<(cttz i64:$Rn), (CLZxx (RBITxx $Rn))>;
-def : Pat<(cttz_zero_undef i32:$Rn), (CLZww (RBITww $Rn))>;
-def : Pat<(cttz_zero_undef i64:$Rn), (CLZxx (RBITxx $Rn))>;
-
-
-def REVww : A64I_dp_1src_impl<0b0, 0b000010, "rev",
-                              [(set i32:$Rd, (bswap i32:$Rn))],
-                              GPR32, NoItinerary>;
-def REVxx : A64I_dp_1src_impl<0b1, 0b000011, "rev",
-                              [(set i64:$Rd, (bswap i64:$Rn))],
-                              GPR64, NoItinerary>;
-def REV32xx : A64I_dp_1src_impl<0b1, 0b000010, "rev32",
-                          [(set i64:$Rd, (bswap (rotr i64:$Rn, (i64 32))))],
-                          GPR64, NoItinerary>;
-def REV16ww : A64I_dp_1src_impl<0b0, 0b000001, "rev16",
-                          [(set i32:$Rd, (bswap (rotr i32:$Rn, (i64 16))))],
-                          GPR32,
-                          NoItinerary>;
-def REV16xx : A64I_dp_1src_impl<0b1, 0b000001, "rev16", [], GPR64, NoItinerary>;
+defm CCMN : CondSetFlagsReg<0, "ccmn">;
+defm CCMP : CondSetFlagsReg<1, "ccmp">;
 
 //===----------------------------------------------------------------------===//
-// Data Processing (2 sources) instructions
+// Conditional select instructions.
 //===----------------------------------------------------------------------===//
-// Contains: CRC32C?[BHWX], UDIV, SDIV, LSLV, LSRV, ASRV, RORV + aliases LSL,
-//           LSR, ASR, ROR
-
-
-class dp_2src_impl<bit sf, bits<6> opcode, string asmop, list<dag> patterns,
-                   RegisterClass GPRsp,
-                   InstrItinClass itin>:
-      A64I_dp_2src<sf,
-                   opcode,
-                   0,
-                   !strconcat(asmop, "\t$Rd, $Rn, $Rm"),
-                   (outs GPRsp:$Rd),
-                   (ins GPRsp:$Rn, GPRsp:$Rm),
-                   patterns,
-                   itin>,
-	  Sched<[WriteALU, ReadALU, ReadALU]>;
-
-multiclass dp_2src_crc<bit c, string asmop> {
-  def B_www : dp_2src_impl<0b0, {0, 1, 0, c, 0, 0},
-                           !strconcat(asmop, "b"), [], GPR32, NoItinerary>;
-  def H_www : dp_2src_impl<0b0, {0, 1, 0, c, 0, 1},
-                           !strconcat(asmop, "h"), [], GPR32, NoItinerary>;
-  def W_www : dp_2src_impl<0b0, {0, 1, 0, c, 1, 0},
-                           !strconcat(asmop, "w"), [], GPR32, NoItinerary>;
-  def X_wwx : A64I_dp_2src<0b1, {0, 1, 0, c, 1, 1}, 0b0,
-                           !strconcat(asmop, "x\t$Rd, $Rn, $Rm"),
-                           (outs GPR32:$Rd), (ins GPR32:$Rn, GPR64:$Rm), [],
-                           NoItinerary>,
-	          Sched<[WriteALU, ReadALU, ReadALU]>;
-}
-
-multiclass dp_2src_zext <bits<6> opcode, string asmop, SDPatternOperator op> {
-   def www : dp_2src_impl<0b0,
-                         opcode,
-                         asmop,
-                         [(set i32:$Rd,
-                               (op i32:$Rn, (i64 (zext i32:$Rm))))],
-                         GPR32,
-                         NoItinerary>;
-   def xxx : dp_2src_impl<0b1,
-                         opcode,
-                         asmop,
-                         [(set i64:$Rd, (op i64:$Rn, i64:$Rm))],
-                         GPR64,
-                         NoItinerary>;
-}
-
-
-multiclass dp_2src <bits<6> opcode, string asmop, SDPatternOperator op> {
-    def www : dp_2src_impl<0b0,
-                         opcode,
-                         asmop,
-                         [(set i32:$Rd, (op i32:$Rn, i32:$Rm))],
-                         GPR32,
-                         NoItinerary>;
-   def xxx : dp_2src_impl<0b1,
-                         opcode,
-                         asmop,
-                         [(set i64:$Rd, (op i64:$Rn, i64:$Rm))],
-                         GPR64,
-                         NoItinerary>;
-}
-
-// Here we define the data processing 2 source instructions.
-defm CRC32  : dp_2src_crc<0b0, "crc32">;
-defm CRC32C : dp_2src_crc<0b1, "crc32c">;
-
-let SchedRW = [WriteDiv, ReadDiv, ReadDiv] in {
-  defm UDIV : dp_2src<0b000010, "udiv", udiv>;
-  defm SDIV : dp_2src<0b000011, "sdiv", sdiv>;
-}
-
-let SchedRW = [WriteALUs, ReadALU, ReadALU] in {
-  defm LSLV : dp_2src_zext<0b001000, "lsl", shl>;
-  defm LSRV : dp_2src_zext<0b001001, "lsr", srl>;
-  defm ASRV : dp_2src_zext<0b001010, "asr", sra>;
-  defm RORV : dp_2src_zext<0b001011, "ror", rotr>;
-}
-
-// Extra patterns for an incoming 64-bit value for a 32-bit
-// operation. Since the LLVM operations are undefined (as in C) if the
-// RHS is out of range, it's perfectly permissible to discard the high
-// bits of the GPR64.
-def : Pat<(shl i32:$Rn, i64:$Rm),
-          (LSLVwww $Rn, (EXTRACT_SUBREG $Rm, sub_32))>;
-def : Pat<(srl i32:$Rn, i64:$Rm),
-          (LSRVwww $Rn, (EXTRACT_SUBREG $Rm, sub_32))>;
-def : Pat<(sra i32:$Rn, i64:$Rm),
-          (ASRVwww $Rn, (EXTRACT_SUBREG $Rm, sub_32))>;
-def : Pat<(rotr i32:$Rn, i64:$Rm),
-          (RORVwww $Rn, (EXTRACT_SUBREG $Rm, sub_32))>;
-
-// Here we define the aliases for the data processing 2 source instructions.
-def LSL_mnemonic : MnemonicAlias<"lslv", "lsl">;
-def LSR_mnemonic : MnemonicAlias<"lsrv", "lsr">;
-def ASR_menmonic : MnemonicAlias<"asrv", "asr">;
-def ROR_menmonic : MnemonicAlias<"rorv", "ror">;
+defm CSEL  : CondSelect<0, 0b00, "csel">;
+
+def inc : PatFrag<(ops node:$in), (add node:$in, 1)>;
+defm CSINC : CondSelectOp<0, 0b01, "csinc", inc>;
+defm CSINV : CondSelectOp<1, 0b00, "csinv", not>;
+defm CSNEG : CondSelectOp<1, 0b01, "csneg", ineg>;
+
+def : Pat<(AArch64csinv GPR32:$tval, GPR32:$fval, (i32 imm:$cc), NZCV),
+          (CSINVWr GPR32:$tval, GPR32:$fval, (i32 imm:$cc))>;
+def : Pat<(AArch64csinv GPR64:$tval, GPR64:$fval, (i32 imm:$cc), NZCV),
+          (CSINVXr GPR64:$tval, GPR64:$fval, (i32 imm:$cc))>;
+def : Pat<(AArch64csneg GPR32:$tval, GPR32:$fval, (i32 imm:$cc), NZCV),
+          (CSNEGWr GPR32:$tval, GPR32:$fval, (i32 imm:$cc))>;
+def : Pat<(AArch64csneg GPR64:$tval, GPR64:$fval, (i32 imm:$cc), NZCV),
+          (CSNEGXr GPR64:$tval, GPR64:$fval, (i32 imm:$cc))>;
+def : Pat<(AArch64csinc GPR32:$tval, GPR32:$fval, (i32 imm:$cc), NZCV),
+          (CSINCWr GPR32:$tval, GPR32:$fval, (i32 imm:$cc))>;
+def : Pat<(AArch64csinc GPR64:$tval, GPR64:$fval, (i32 imm:$cc), NZCV),
+          (CSINCXr GPR64:$tval, GPR64:$fval, (i32 imm:$cc))>;
+
+def : Pat<(AArch64csel (i32 0), (i32 1), (i32 imm:$cc), NZCV),
+          (CSINCWr WZR, WZR, (i32 imm:$cc))>;
+def : Pat<(AArch64csel (i64 0), (i64 1), (i32 imm:$cc), NZCV),
+          (CSINCXr XZR, XZR, (i32 imm:$cc))>;
+def : Pat<(AArch64csel (i32 0), (i32 -1), (i32 imm:$cc), NZCV),
+          (CSINVWr WZR, WZR, (i32 imm:$cc))>;
+def : Pat<(AArch64csel (i64 0), (i64 -1), (i32 imm:$cc), NZCV),
+          (CSINVXr XZR, XZR, (i32 imm:$cc))>;
+
+// The inverse of the condition code from the alias instruction is what is used
+// in the aliased instruction. The parser all ready inverts the condition code
+// for these aliases.
+def : InstAlias<"cset $dst, $cc",
+                (CSINCWr GPR32:$dst, WZR, WZR, inv_ccode:$cc)>;
+def : InstAlias<"cset $dst, $cc",
+                (CSINCXr GPR64:$dst, XZR, XZR, inv_ccode:$cc)>;
+
+def : InstAlias<"csetm $dst, $cc",
+                (CSINVWr GPR32:$dst, WZR, WZR, inv_ccode:$cc)>;
+def : InstAlias<"csetm $dst, $cc",
+                (CSINVXr GPR64:$dst, XZR, XZR, inv_ccode:$cc)>;
+
+def : InstAlias<"cinc $dst, $src, $cc",
+                (CSINCWr GPR32:$dst, GPR32:$src, GPR32:$src, inv_ccode:$cc)>;
+def : InstAlias<"cinc $dst, $src, $cc",
+                (CSINCXr GPR64:$dst, GPR64:$src, GPR64:$src, inv_ccode:$cc)>;
+
+def : InstAlias<"cinv $dst, $src, $cc",
+                (CSINVWr GPR32:$dst, GPR32:$src, GPR32:$src, inv_ccode:$cc)>;
+def : InstAlias<"cinv $dst, $src, $cc",
+                (CSINVXr GPR64:$dst, GPR64:$src, GPR64:$src, inv_ccode:$cc)>;
+
+def : InstAlias<"cneg $dst, $src, $cc",
+                (CSNEGWr GPR32:$dst, GPR32:$src, GPR32:$src, inv_ccode:$cc)>;
+def : InstAlias<"cneg $dst, $src, $cc",
+                (CSNEGXr GPR64:$dst, GPR64:$src, GPR64:$src, inv_ccode:$cc)>;
 
 //===----------------------------------------------------------------------===//
-// Data Processing (3 sources) instructions
+// PC-relative instructions.
 //===----------------------------------------------------------------------===//
-// Contains: MADD, MSUB, SMADDL, SMSUBL, SMULH, UMADDL, UMSUBL, UMULH
-//    + aliases MUL, MNEG, SMULL, SMNEGL, UMULL, UMNEGL
-
-class A64I_dp3_4operand<bit sf, bits<6> opcode, RegisterClass AccReg,
-                        ValueType AccTy, RegisterClass SrcReg,
-                        string asmop, dag pattern>
-  : A64I_dp3<sf, opcode,
-             (outs AccReg:$Rd), (ins SrcReg:$Rn, SrcReg:$Rm, AccReg:$Ra),
-             !strconcat(asmop, "\t$Rd, $Rn, $Rm, $Ra"),
-             [(set AccTy:$Rd, pattern)], NoItinerary>,
-    Sched<[WriteMAC, ReadMAC, ReadMAC, ReadMAC]> {
-  bits<5> Ra;
-  let Inst{14-10} = Ra;
-
-  RegisterClass AccGPR = AccReg;
-  RegisterClass SrcGPR = SrcReg;
-}
-
-def MADDwwww : A64I_dp3_4operand<0b0, 0b000000, GPR32, i32, GPR32, "madd",
-                                 (add i32:$Ra, (mul i32:$Rn, i32:$Rm))>;
-def MADDxxxx : A64I_dp3_4operand<0b1, 0b000000, GPR64, i64, GPR64, "madd",
-                                 (add i64:$Ra, (mul i64:$Rn, i64:$Rm))>;
-
-def MSUBwwww : A64I_dp3_4operand<0b0, 0b000001, GPR32, i32, GPR32, "msub",
-                                 (sub i32:$Ra, (mul i32:$Rn, i32:$Rm))>;
-def MSUBxxxx : A64I_dp3_4operand<0b1, 0b000001, GPR64, i64, GPR64, "msub",
-                                 (sub i64:$Ra, (mul i64:$Rn, i64:$Rm))>;
-
-def SMADDLxwwx : A64I_dp3_4operand<0b1, 0b000010, GPR64, i64, GPR32, "smaddl",
-                     (add i64:$Ra, (mul (i64 (sext i32:$Rn)), (sext i32:$Rm)))>;
-def SMSUBLxwwx : A64I_dp3_4operand<0b1, 0b000011, GPR64, i64, GPR32, "smsubl",
-                     (sub i64:$Ra, (mul (i64 (sext i32:$Rn)), (sext i32:$Rm)))>;
-
-def UMADDLxwwx : A64I_dp3_4operand<0b1, 0b001010, GPR64, i64, GPR32, "umaddl",
-                     (add i64:$Ra, (mul (i64 (zext i32:$Rn)), (zext i32:$Rm)))>;
-def UMSUBLxwwx : A64I_dp3_4operand<0b1, 0b001011, GPR64, i64, GPR32, "umsubl",
-                     (sub i64:$Ra, (mul (i64 (zext i32:$Rn)), (zext i32:$Rm)))>;
-
-let isCommutable = 1, PostEncoderMethod = "fixMulHigh" in {
-  def UMULHxxx : A64I_dp3<0b1, 0b001100, (outs GPR64:$Rd),
-                          (ins GPR64:$Rn, GPR64:$Rm),
-                          "umulh\t$Rd, $Rn, $Rm",
-                          [(set i64:$Rd, (mulhu i64:$Rn, i64:$Rm))],
-                          NoItinerary>,
-                 Sched<[WriteMAC, ReadMAC, ReadMAC]>;
-
-  def SMULHxxx : A64I_dp3<0b1, 0b000100, (outs GPR64:$Rd),
-                          (ins GPR64:$Rn, GPR64:$Rm),
-                          "smulh\t$Rd, $Rn, $Rm",
-                          [(set i64:$Rd, (mulhs i64:$Rn, i64:$Rm))],
-                          NoItinerary>,
-                 Sched<[WriteMAC, ReadMAC, ReadMAC]>;
-}
-
-multiclass A64I_dp3_3operand<string asmop, A64I_dp3_4operand INST,
-                             Register ZR, dag pattern> {
-  def : InstAlias<asmop # " $Rd, $Rn, $Rm",
-                  (INST INST.AccGPR:$Rd, INST.SrcGPR:$Rn, INST.SrcGPR:$Rm, ZR)>;
-
-  def : Pat<pattern, (INST $Rn, $Rm, ZR)>;
-}
-
-defm : A64I_dp3_3operand<"mul", MADDwwww, WZR, (mul i32:$Rn, i32:$Rm)>;
-defm : A64I_dp3_3operand<"mul", MADDxxxx, XZR, (mul i64:$Rn, i64:$Rm)>;
-
-defm : A64I_dp3_3operand<"mneg", MSUBwwww, WZR,
-                         (sub 0, (mul i32:$Rn, i32:$Rm))>;
-defm : A64I_dp3_3operand<"mneg", MSUBxxxx, XZR,
-                         (sub 0, (mul i64:$Rn, i64:$Rm))>;
-
-defm : A64I_dp3_3operand<"smull", SMADDLxwwx, XZR,
-                         (mul (i64 (sext i32:$Rn)), (sext i32:$Rm))>;
-defm : A64I_dp3_3operand<"smnegl", SMSUBLxwwx, XZR,
-                       (sub 0, (mul (i64 (sext i32:$Rn)), (sext i32:$Rm)))>;
-
-defm : A64I_dp3_3operand<"umull", UMADDLxwwx, XZR,
-                         (mul (i64 (zext i32:$Rn)), (zext i32:$Rm))>;
-defm : A64I_dp3_3operand<"umnegl", UMSUBLxwwx, XZR,
-                       (sub 0, (mul (i64 (zext i32:$Rn)), (zext i32:$Rm)))>;
-
-
-//===----------------------------------------------------------------------===//
-// Exception generation
-//===----------------------------------------------------------------------===//
-// Contains: SVC, HVC, SMC, BRK, HLT, DCPS1, DCPS2, DCPS3
-
-def uimm16_asmoperand : AsmOperandClass {
-  let Name = "UImm16";
-  let PredicateMethod = "isUImm<16>";
-  let RenderMethod = "addImmOperands";
-  let DiagnosticType = "UImm16";
-}
+let isReMaterializable = 1 in {
+let neverHasSideEffects = 1, mayStore = 0, mayLoad = 0 in {
+def ADR  : ADRI<0, "adr", adrlabel, []>;
+} // neverHasSideEffects = 1
 
-def uimm16 : Operand<i32> {
-  let ParserMatchClass = uimm16_asmoperand;
-}
-
-class A64I_exceptImpl<bits<3> opc, bits<2> ll, string asmop>
-  : A64I_exception<opc, 0b000, ll, (outs), (ins uimm16:$UImm16),
-                   !strconcat(asmop, "\t$UImm16"), [], NoItinerary>,
-    Sched<[WriteBr]> {
-  let isBranch = 1;
-  let isTerminator = 1;
-}
+def ADRP : ADRI<1, "adrp", adrplabel,
+                [(set GPR64:$Xd, (AArch64adrp tglobaladdr:$label))]>;
+} // isReMaterializable = 1
 
-def SVCi : A64I_exceptImpl<0b000, 0b01, "svc">;
-def HVCi : A64I_exceptImpl<0b000, 0b10, "hvc">;
-def SMCi : A64I_exceptImpl<0b000, 0b11, "smc">;
-def BRKi : A64I_exceptImpl<0b001, 0b00, "brk">;
-def HLTi : A64I_exceptImpl<0b010, 0b00, "hlt">;
-
-def DCPS1i : A64I_exceptImpl<0b101, 0b01, "dcps1">;
-def DCPS2i : A64I_exceptImpl<0b101, 0b10, "dcps2">;
-def DCPS3i : A64I_exceptImpl<0b101, 0b11, "dcps3">;
-
-// The immediate is optional for the DCPS instructions, defaulting to 0.
-def : InstAlias<"dcps1", (DCPS1i 0)>;
-def : InstAlias<"dcps2", (DCPS2i 0)>;
-def : InstAlias<"dcps3", (DCPS3i 0)>;
+// page address of a constant pool entry, block address
+def : Pat<(AArch64adrp tconstpool:$cp), (ADRP tconstpool:$cp)>;
+def : Pat<(AArch64adrp tblockaddress:$cp), (ADRP tblockaddress:$cp)>;
 
 //===----------------------------------------------------------------------===//
-// Extract (immediate)
+// Unconditional branch (register) instructions.
 //===----------------------------------------------------------------------===//
-// Contains: EXTR + alias ROR
-
-def EXTRwwwi : A64I_extract<0b0, 0b000, 0b0,
-                            (outs GPR32:$Rd),
-                            (ins GPR32:$Rn, GPR32:$Rm, bitfield32_imm:$LSB),
-                            "extr\t$Rd, $Rn, $Rm, $LSB",
-                            [(set i32:$Rd,
-                                  (A64Extr i32:$Rn, i32:$Rm, imm:$LSB))],
-                            NoItinerary>,
-               Sched<[WriteALU, ReadALU, ReadALU]>;
-def EXTRxxxi : A64I_extract<0b1, 0b000, 0b1,
-                            (outs GPR64:$Rd),
-                            (ins GPR64:$Rn, GPR64:$Rm, bitfield64_imm:$LSB),
-                            "extr\t$Rd, $Rn, $Rm, $LSB",
-                            [(set i64:$Rd,
-                                  (A64Extr i64:$Rn, i64:$Rm, imm:$LSB))],
-                            NoItinerary>,
-               Sched<[WriteALU, ReadALU, ReadALU]>;
-
-def : InstAlias<"ror $Rd, $Rs, $LSB",
-               (EXTRwwwi GPR32:$Rd, GPR32:$Rs, GPR32:$Rs, bitfield32_imm:$LSB)>;
-def : InstAlias<"ror $Rd, $Rs, $LSB",
-               (EXTRxxxi GPR64:$Rd, GPR64:$Rs, GPR64:$Rs, bitfield64_imm:$LSB)>;
-
-def : Pat<(rotr i32:$Rn, bitfield32_imm:$LSB),
-          (EXTRwwwi $Rn, $Rn, bitfield32_imm:$LSB)>;
-def : Pat<(rotr i64:$Rn, bitfield64_imm:$LSB),
-          (EXTRxxxi $Rn, $Rn, bitfield64_imm:$LSB)>;
 
-//===----------------------------------------------------------------------===//
-// Floating-point compare instructions
-//===----------------------------------------------------------------------===//
-// Contains: FCMP, FCMPE
+let isReturn = 1, isTerminator = 1, isBarrier = 1 in {
+def RET  : BranchReg<0b0010, "ret", []>;
+def DRPS : SpecialReturn<0b0101, "drps">;
+def ERET : SpecialReturn<0b0100, "eret">;
+} // isReturn = 1, isTerminator = 1, isBarrier = 1
 
-def fpzero_asmoperand : AsmOperandClass {
-  let Name = "FPZero";
-  let ParserMethod = "ParseFPImmOperand";
-  let DiagnosticType = "FPZero";
-}
+// Default to the LR register.
+def : InstAlias<"ret", (RET LR)>;
 
-def fpz32 : Operand<f32>,
-            ComplexPattern<f32, 1, "SelectFPZeroOperand", [fpimm]> {
-  let ParserMatchClass = fpzero_asmoperand;
-  let PrintMethod = "printFPZeroOperand";
-  let DecoderMethod = "DecodeFPZeroOperand";
-}
+let isCall = 1, Defs = [LR], Uses = [SP] in {
+def BLR : BranchReg<0b0001, "blr", [(AArch64call GPR64:$Rn)]>;
+} // isCall
 
-def fpz64 : Operand<f64>,
-            ComplexPattern<f64, 1, "SelectFPZeroOperand", [fpimm]> {
-  let ParserMatchClass = fpzero_asmoperand;
-  let PrintMethod = "printFPZeroOperand";
-  let DecoderMethod = "DecodeFPZeroOperand";
-}
+let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
+def BR  : BranchReg<0b0000, "br", [(brind GPR64:$Rn)]>;
+} // isBranch, isTerminator, isBarrier, isIndirectBranch
 
-def fpz64movi : Operand<i64>,
-            ComplexPattern<f64, 1, "SelectFPZeroOperand", [fpimm]> {
-  let ParserMatchClass = fpzero_asmoperand;
-  let PrintMethod = "printFPZeroOperand";
-  let DecoderMethod = "DecodeFPZeroOperand";
+// Create a separate pseudo-instruction for codegen to use so that we don't
+// flag lr as used in every function. It'll be restored before the RET by the
+// epilogue if it's legitimately used.
+def RET_ReallyLR : Pseudo<(outs), (ins), [(AArch64retflag)]> {
+  let isTerminator = 1;
+  let isBarrier = 1;
+  let isReturn = 1;
 }
 
-multiclass A64I_fpcmpSignal<bits<2> type, bit imm, dag ins, dag pattern> {
-  def _quiet : A64I_fpcmp<0b0, 0b0, type, 0b00, {0b0, imm, 0b0, 0b0, 0b0},
-                          (outs), ins, "fcmp\t$Rn, $Rm", [pattern],
-                          NoItinerary>,
-               Sched<[WriteFPALU, ReadFPALU, ReadFPALU]> {
-    let Defs = [NZCV];
-  }
-
-  def _sig : A64I_fpcmp<0b0, 0b0, type, 0b00, {0b1, imm, 0b0, 0b0, 0b0},
-                        (outs), ins, "fcmpe\t$Rn, $Rm", [], NoItinerary>,
-             Sched<[WriteFPALU, ReadFPALU, ReadFPALU]> {
-    let Defs = [NZCV];
-  }
+// This is a directive-like pseudo-instruction. The purpose is to insert an
+// R_AARCH64_TLSDESC_CALL relocation at the offset of the following instruction
+// (which in the usual case is a BLR).
+let hasSideEffects = 1 in
+def TLSDESCCALL : Pseudo<(outs), (ins i64imm:$sym), []> {
+  let AsmString = ".tlsdesccall $sym";
 }
 
-defm FCMPss : A64I_fpcmpSignal<0b00, 0b0, (ins FPR32:$Rn, FPR32:$Rm),
-                               (set NZCV, (A64cmp f32:$Rn, f32:$Rm))>;
-defm FCMPdd : A64I_fpcmpSignal<0b01, 0b0, (ins FPR64:$Rn, FPR64:$Rm),
-                               (set NZCV, (A64cmp f64:$Rn, f64:$Rm))>;
-
-// What would be Rm should be written as 0; note that even though it's called
-// "$Rm" here to fit in with the InstrFormats, it's actually an immediate.
-defm FCMPsi : A64I_fpcmpSignal<0b00, 0b1, (ins FPR32:$Rn, fpz32:$Rm),
-                               (set NZCV, (A64cmp f32:$Rn, fpz32:$Rm))>;
-
-defm FCMPdi : A64I_fpcmpSignal<0b01, 0b1, (ins FPR64:$Rn, fpz64:$Rm),
-                               (set NZCV, (A64cmp f64:$Rn, fpz64:$Rm))>;
-
+// Pseudo-instruction representing a BLR with attached TLSDESC relocation. It
+// gets expanded to two MCInsts during lowering.
+let isCall = 1, Defs = [LR] in
+def TLSDESC_BLR
+    : Pseudo<(outs), (ins GPR64:$dest, i64imm:$sym),
+             [(AArch64tlsdesc_call GPR64:$dest, tglobaltlsaddr:$sym)]>;
 
+def : Pat<(AArch64tlsdesc_call GPR64:$dest, texternalsym:$sym),
+          (TLSDESC_BLR GPR64:$dest, texternalsym:$sym)>;
 //===----------------------------------------------------------------------===//
-// Floating-point conditional compare instructions
+// Conditional branch (immediate) instruction.
 //===----------------------------------------------------------------------===//
-// Contains: FCCMP, FCCMPE
-
-class A64I_fpccmpImpl<bits<2> type, bit op, RegisterClass FPR, string asmop>
-  : A64I_fpccmp<0b0, 0b0, type, op,
-                (outs),
-                (ins FPR:$Rn, FPR:$Rm, uimm4:$NZCVImm, cond_code_op:$Cond),
-                !strconcat(asmop, "\t$Rn, $Rm, $NZCVImm, $Cond"),
-                [], NoItinerary>,
-    Sched<[WriteFPALU, ReadFPALU, ReadFPALU]> {
-  let Defs = [NZCV];
-}
-
-def FCCMPss : A64I_fpccmpImpl<0b00, 0b0, FPR32, "fccmp">;
-def FCCMPEss : A64I_fpccmpImpl<0b00, 0b1, FPR32, "fccmpe">;
-def FCCMPdd : A64I_fpccmpImpl<0b01, 0b0, FPR64, "fccmp">;
-def FCCMPEdd : A64I_fpccmpImpl<0b01, 0b1, FPR64, "fccmpe">;
+def Bcc : BranchCond;
 
 //===----------------------------------------------------------------------===//
-// Floating-point conditional select instructions
+// Compare-and-branch instructions.
 //===----------------------------------------------------------------------===//
-// Contains: FCSEL
-
-let Uses = [NZCV] in {
-  def FCSELsssc : A64I_fpcondsel<0b0, 0b0, 0b00, (outs FPR32:$Rd),
-                                 (ins FPR32:$Rn, FPR32:$Rm, cond_code_op:$Cond),
-                                 "fcsel\t$Rd, $Rn, $Rm, $Cond",
-                                 [(set f32:$Rd,
-                                       (simple_select f32:$Rn, f32:$Rm))],
-                                 NoItinerary>,
-                  Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
-
-
-  def FCSELdddc : A64I_fpcondsel<0b0, 0b0, 0b01, (outs FPR64:$Rd),
-                                 (ins FPR64:$Rn, FPR64:$Rm, cond_code_op:$Cond),
-                                 "fcsel\t$Rd, $Rn, $Rm, $Cond",
-                                 [(set f64:$Rd,
-                                       (simple_select f64:$Rn, f64:$Rm))],
-                                 NoItinerary>,
-                  Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
-}
+defm CBZ  : CmpBranch<0, "cbz", AArch64cbz>;
+defm CBNZ : CmpBranch<1, "cbnz", AArch64cbnz>;
 
 //===----------------------------------------------------------------------===//
-// Floating-point data-processing (1 source)
+// Test-bit-and-branch instructions.
 //===----------------------------------------------------------------------===//
-// Contains: FMOV, FABS, FNEG, FSQRT, FCVT, FRINT[NPMZAXI].
-
-def FPNoUnop : PatFrag<(ops node:$val), (fneg node:$val),
-                       [{ (void)N; return false; }]>;
-
-// First we do the fairly trivial bunch with uniform "OP s, s" and "OP d, d"
-// syntax. Default to no pattern because most are odd enough not to have one.
-multiclass A64I_fpdp1sizes<bits<6> opcode, string asmstr,
-                           SDPatternOperator opnode = FPNoUnop> {
-  def ss : A64I_fpdp1<0b0, 0b0, 0b00, opcode, (outs FPR32:$Rd), (ins FPR32:$Rn),
-                     !strconcat(asmstr, "\t$Rd, $Rn"),
-                     [(set f32:$Rd, (opnode f32:$Rn))],
-                     NoItinerary>,
-           Sched<[WriteFPALU, ReadFPALU]>;
-
-  def dd : A64I_fpdp1<0b0, 0b0, 0b01, opcode, (outs FPR64:$Rd), (ins FPR64:$Rn),
-                     !strconcat(asmstr, "\t$Rd, $Rn"),
-                     [(set f64:$Rd, (opnode f64:$Rn))],
-                     NoItinerary>,
-           Sched<[WriteFPALU, ReadFPALU]>;
-}
-
-defm FMOV   : A64I_fpdp1sizes<0b000000, "fmov">;
-defm FABS   : A64I_fpdp1sizes<0b000001, "fabs", fabs>;
-defm FNEG   : A64I_fpdp1sizes<0b000010, "fneg", fneg>;
-let SchedRW = [WriteFPSqrt, ReadFPSqrt] in {
-  defm FSQRT  : A64I_fpdp1sizes<0b000011, "fsqrt", fsqrt>;
-}
-
-defm FRINTN : A64I_fpdp1sizes<0b001000, "frintn">;
-defm FRINTP : A64I_fpdp1sizes<0b001001, "frintp", fceil>;
-defm FRINTM : A64I_fpdp1sizes<0b001010, "frintm", ffloor>;
-defm FRINTZ : A64I_fpdp1sizes<0b001011, "frintz", ftrunc>;
-defm FRINTA : A64I_fpdp1sizes<0b001100, "frinta">;
-defm FRINTX : A64I_fpdp1sizes<0b001110, "frintx", frint>;
-defm FRINTI : A64I_fpdp1sizes<0b001111, "frinti", fnearbyint>;
-
-// The FCVT instrucitons have different source and destination register-types,
-// but the fields are uniform everywhere a D-register (say) crops up. Package
-// this information in a Record.
-class FCVTRegType<RegisterClass rc, bits<2> fld, ValueType vt> {
-    RegisterClass Class = rc;
-    ValueType VT = vt;
-    bit t1 = fld{1};
-    bit t0 = fld{0};
-}
-
-def FCVT16 : FCVTRegType<FPR16, 0b11, f16>;
-def FCVT32 : FCVTRegType<FPR32, 0b00, f32>;
-def FCVT64 : FCVTRegType<FPR64, 0b01, f64>;
-
-class A64I_fpdp1_fcvt<FCVTRegType DestReg, FCVTRegType SrcReg, SDNode opnode>
-  : A64I_fpdp1<0b0, 0b0, {SrcReg.t1, SrcReg.t0},
-               {0,0,0,1, DestReg.t1, DestReg.t0},
-               (outs DestReg.Class:$Rd), (ins SrcReg.Class:$Rn),
-               "fcvt\t$Rd, $Rn",
-               [(set DestReg.VT:$Rd, (opnode SrcReg.VT:$Rn))], NoItinerary>,
-    Sched<[WriteFPALU, ReadFPALU]>;
-
-def FCVTds : A64I_fpdp1_fcvt<FCVT64, FCVT32, fextend>;
-def FCVThs : A64I_fpdp1_fcvt<FCVT16, FCVT32, fround>;
-def FCVTsd : A64I_fpdp1_fcvt<FCVT32, FCVT64, fround>;
-def FCVThd : A64I_fpdp1_fcvt<FCVT16, FCVT64, fround>;
-def FCVTsh : A64I_fpdp1_fcvt<FCVT32, FCVT16, fextend>;
-def FCVTdh : A64I_fpdp1_fcvt<FCVT64, FCVT16, fextend>;
-
+defm TBZ  : TestBranch<0, "tbz", AArch64tbz>;
+defm TBNZ : TestBranch<1, "tbnz", AArch64tbnz>;
 
 //===----------------------------------------------------------------------===//
-// Floating-point data-processing (2 sources) instructions
+// Unconditional branch (immediate) instructions.
 //===----------------------------------------------------------------------===//
-// Contains: FMUL, FDIV, FADD, FSUB, FMAX, FMIN, FMAXNM, FMINNM, FNMUL
-
-def FPNoBinop : PatFrag<(ops node:$lhs, node:$rhs), (fadd node:$lhs, node:$rhs),
-                      [{ (void)N; return false; }]>;
-
-multiclass A64I_fpdp2sizes<bits<4> opcode, string asmstr,
-                           SDPatternOperator opnode> {
-  def sss : A64I_fpdp2<0b0, 0b0, 0b00, opcode,
-                      (outs FPR32:$Rd),
-                      (ins FPR32:$Rn, FPR32:$Rm),
-                      !strconcat(asmstr, "\t$Rd, $Rn, $Rm"),
-                      [(set f32:$Rd, (opnode f32:$Rn, f32:$Rm))],
-                      NoItinerary>,
-            Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
-
-  def ddd : A64I_fpdp2<0b0, 0b0, 0b01, opcode,
-                      (outs FPR64:$Rd),
-                      (ins FPR64:$Rn, FPR64:$Rm),
-                      !strconcat(asmstr, "\t$Rd, $Rn, $Rm"),
-                      [(set f64:$Rd, (opnode f64:$Rn, f64:$Rm))],
-                      NoItinerary>,
-            Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
-}
+let isBranch = 1, isTerminator = 1, isBarrier = 1 in {
+def B  : BranchImm<0, "b", [(br bb:$addr)]>;
+} // isBranch, isTerminator, isBarrier
 
-let isCommutable = 1 in {
-  let SchedRW = [WriteFPMul, ReadFPMul, ReadFPMul] in {
-    defm FMUL   : A64I_fpdp2sizes<0b0000, "fmul", fmul>;
-  }
-  defm FADD   : A64I_fpdp2sizes<0b0010, "fadd", fadd>;
-
-  // No patterns for these.
-  defm FMAX   : A64I_fpdp2sizes<0b0100, "fmax", FPNoBinop>;
-  defm FMIN   : A64I_fpdp2sizes<0b0101, "fmin", FPNoBinop>;
-  defm FMAXNM : A64I_fpdp2sizes<0b0110, "fmaxnm", FPNoBinop>;
-  defm FMINNM : A64I_fpdp2sizes<0b0111, "fminnm", FPNoBinop>;
-
-  let SchedRW = [WriteFPMul, ReadFPMul, ReadFPMul] in {
-    defm FNMUL  : A64I_fpdp2sizes<0b1000, "fnmul",
-                                  PatFrag<(ops node:$lhs, node:$rhs),
-                                          (fneg (fmul node:$lhs, node:$rhs))> >;
-  }
-}
-
-let SchedRW = [WriteFPDiv, ReadFPDiv, ReadFPDiv] in {
-  defm FDIV : A64I_fpdp2sizes<0b0001, "fdiv", fdiv>;
-}
-defm FSUB : A64I_fpdp2sizes<0b0011, "fsub", fsub>;
+let isCall = 1, Defs = [LR], Uses = [SP] in {
+def BL : CallImm<1, "bl", [(AArch64call tglobaladdr:$addr)]>;
+} // isCall
+def : Pat<(AArch64call texternalsym:$func), (BL texternalsym:$func)>;
 
 //===----------------------------------------------------------------------===//
-// Floating-point data-processing (3 sources) instructions
+// Exception generation instructions.
 //===----------------------------------------------------------------------===//
-// Contains: FMADD, FMSUB, FNMADD, FNMSUB
-
-def fmsub : PatFrag<(ops node:$Rn, node:$Rm, node:$Ra),
-                    (fma (fneg node:$Rn),  node:$Rm, node:$Ra)>;
-def fnmsub : PatFrag<(ops node:$Rn, node:$Rm, node:$Ra),
-                     (fma node:$Rn,  node:$Rm, (fneg node:$Ra))>;
-def fnmadd : PatFrag<(ops node:$Rn, node:$Rm, node:$Ra),
-                     (fma (fneg node:$Rn),  node:$Rm, (fneg node:$Ra))>;
-
-class A64I_fpdp3Impl<string asmop, RegisterClass FPR, ValueType VT,
-                     bits<2> type, bit o1, bit o0, SDPatternOperator fmakind>
-  : A64I_fpdp3<0b0, 0b0, type, o1, o0, (outs FPR:$Rd),
-               (ins FPR:$Rn, FPR:$Rm, FPR:$Ra),
-               !strconcat(asmop,"\t$Rd, $Rn, $Rm, $Ra"),
-               [(set VT:$Rd, (fmakind VT:$Rn, VT:$Rm, VT:$Ra))],
-               NoItinerary>,
-    Sched<[WriteFPMAC, ReadFPMAC, ReadFPMAC, ReadFPMAC]>;
-
-def FMADDssss  : A64I_fpdp3Impl<"fmadd",  FPR32, f32, 0b00, 0b0, 0b0, fma>;
-def FMSUBssss  : A64I_fpdp3Impl<"fmsub",  FPR32, f32, 0b00, 0b0, 0b1, fmsub>;
-def FNMADDssss : A64I_fpdp3Impl<"fnmadd", FPR32, f32, 0b00, 0b1, 0b0, fnmadd>;
-def FNMSUBssss : A64I_fpdp3Impl<"fnmsub", FPR32, f32, 0b00, 0b1, 0b1, fnmsub>;
-
-def FMADDdddd  : A64I_fpdp3Impl<"fmadd",  FPR64, f64, 0b01, 0b0, 0b0, fma>;
-def FMSUBdddd  : A64I_fpdp3Impl<"fmsub",  FPR64, f64, 0b01, 0b0, 0b1, fmsub>;
-def FNMADDdddd : A64I_fpdp3Impl<"fnmadd", FPR64, f64, 0b01, 0b1, 0b0, fnmadd>;
-def FNMSUBdddd : A64I_fpdp3Impl<"fnmsub", FPR64, f64, 0b01, 0b1, 0b1, fnmsub>;
-
-// Extra patterns for when we're allowed to optimise separate multiplication and
-// addition.
-let Predicates = [HasFPARMv8, UseFusedMAC] in {
-def : Pat<(f32 (fadd FPR32:$Ra, (f32 (fmul_su FPR32:$Rn, FPR32:$Rm)))),
-          (FMADDssss FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>;
-def : Pat<(f32 (fsub FPR32:$Ra, (f32 (fmul_su FPR32:$Rn, FPR32:$Rm)))),
-          (FMSUBssss FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>;
-def : Pat<(f32 (fsub (f32 (fneg FPR32:$Ra)), (f32 (fmul_su FPR32:$Rn, FPR32:$Rm)))),
-          (FNMADDssss FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>;
-def : Pat<(f32 (fsub (f32 (fmul_su FPR32:$Rn, FPR32:$Rm)), FPR32:$Ra)),
-          (FNMSUBssss FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>;
-
-def : Pat<(f64 (fadd FPR64:$Ra, (f64 (fmul_su FPR64:$Rn, FPR64:$Rm)))),
-          (FMADDdddd FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>;
-def : Pat<(f64 (fsub FPR64:$Ra, (f64 (fmul_su FPR64:$Rn, FPR64:$Rm)))),
-          (FMSUBdddd FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>;
-def : Pat<(f64 (fsub (f64 (fneg FPR64:$Ra)), (f64 (fmul_su FPR64:$Rn, FPR64:$Rm)))),
-          (FNMADDdddd FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>;
-def : Pat<(f64 (fsub (f64 (fmul_su FPR64:$Rn, FPR64:$Rm)), FPR64:$Ra)),
-          (FNMSUBdddd FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>;
-}
-
+def BRK   : ExceptionGeneration<0b001, 0b00, "brk">;
+def DCPS1 : ExceptionGeneration<0b101, 0b01, "dcps1">;
+def DCPS2 : ExceptionGeneration<0b101, 0b10, "dcps2">;
+def DCPS3 : ExceptionGeneration<0b101, 0b11, "dcps3">;
+def HLT   : ExceptionGeneration<0b010, 0b00, "hlt">;
+def HVC   : ExceptionGeneration<0b000, 0b10, "hvc">;
+def SMC   : ExceptionGeneration<0b000, 0b11, "smc">;
+def SVC   : ExceptionGeneration<0b000, 0b01, "svc">;
+
+// DCPSn defaults to an immediate operand of zero if unspecified.
+def : InstAlias<"dcps1", (DCPS1 0)>;
+def : InstAlias<"dcps2", (DCPS2 0)>;
+def : InstAlias<"dcps3", (DCPS3 0)>;
 
 //===----------------------------------------------------------------------===//
-// Floating-point <-> fixed-point conversion instructions
+// Load instructions.
 //===----------------------------------------------------------------------===//
-// Contains: FCVTZS, FCVTZU, SCVTF, UCVTF
-
-// #1-#32 allowed, encoded as "64 - <specified imm>
-def fixedpos_asmoperand_i32 : AsmOperandClass {
-  let Name = "CVTFixedPos32";
-  let RenderMethod = "addCVTFixedPosOperands";
-  let PredicateMethod = "isCVTFixedPos<32>";
-  let DiagnosticType = "CVTFixedPos32";
-}
 
-// Also encoded as "64 - <specified imm>" but #1-#64 allowed.
-def fixedpos_asmoperand_i64 : AsmOperandClass {
-  let Name = "CVTFixedPos64";
-  let RenderMethod = "addCVTFixedPosOperands";
-  let PredicateMethod = "isCVTFixedPos<64>";
-  let DiagnosticType = "CVTFixedPos64";
+// Pair (indexed, offset)
+defm LDPW : LoadPairOffset<0b00, 0, GPR32, simm7s4, "ldp">;
+defm LDPX : LoadPairOffset<0b10, 0, GPR64, simm7s8, "ldp">;
+defm LDPS : LoadPairOffset<0b00, 1, FPR32, simm7s4, "ldp">;
+defm LDPD : LoadPairOffset<0b01, 1, FPR64, simm7s8, "ldp">;
+defm LDPQ : LoadPairOffset<0b10, 1, FPR128, simm7s16, "ldp">;
+
+defm LDPSW : LoadPairOffset<0b01, 0, GPR64, simm7s4, "ldpsw">;
+
+// Pair (pre-indexed)
+def LDPWpre : LoadPairPreIdx<0b00, 0, GPR32, simm7s4, "ldp">;
+def LDPXpre : LoadPairPreIdx<0b10, 0, GPR64, simm7s8, "ldp">;
+def LDPSpre : LoadPairPreIdx<0b00, 1, FPR32, simm7s4, "ldp">;
+def LDPDpre : LoadPairPreIdx<0b01, 1, FPR64, simm7s8, "ldp">;
+def LDPQpre : LoadPairPreIdx<0b10, 1, FPR128, simm7s16, "ldp">;
+
+def LDPSWpre : LoadPairPreIdx<0b01, 0, GPR64, simm7s4, "ldpsw">;
+
+// Pair (post-indexed)
+def LDPWpost : LoadPairPostIdx<0b00, 0, GPR32, simm7s4, "ldp">;
+def LDPXpost : LoadPairPostIdx<0b10, 0, GPR64, simm7s8, "ldp">;
+def LDPSpost : LoadPairPostIdx<0b00, 1, FPR32, simm7s4, "ldp">;
+def LDPDpost : LoadPairPostIdx<0b01, 1, FPR64, simm7s8, "ldp">;
+def LDPQpost : LoadPairPostIdx<0b10, 1, FPR128, simm7s16, "ldp">;
+
+def LDPSWpost : LoadPairPostIdx<0b01, 0, GPR64, simm7s4, "ldpsw">;
+
+
+// Pair (no allocate)
+defm LDNPW : LoadPairNoAlloc<0b00, 0, GPR32, simm7s4, "ldnp">;
+defm LDNPX : LoadPairNoAlloc<0b10, 0, GPR64, simm7s8, "ldnp">;
+defm LDNPS : LoadPairNoAlloc<0b00, 1, FPR32, simm7s4, "ldnp">;
+defm LDNPD : LoadPairNoAlloc<0b01, 1, FPR64, simm7s8, "ldnp">;
+defm LDNPQ : LoadPairNoAlloc<0b10, 1, FPR128, simm7s16, "ldnp">;
+
+//---
+// (register offset)
+//---
+
+// Integer
+defm LDRBB : Load8RO<0b00,  0, 0b01, GPR32, "ldrb", i32, zextloadi8>;
+defm LDRHH : Load16RO<0b01, 0, 0b01, GPR32, "ldrh", i32, zextloadi16>;
+defm LDRW  : Load32RO<0b10, 0, 0b01, GPR32, "ldr", i32, load>;
+defm LDRX  : Load64RO<0b11, 0, 0b01, GPR64, "ldr", i64, load>;
+
+// Floating-point
+defm LDRB : Load8RO<0b00,   1, 0b01, FPR8,   "ldr", untyped, load>;
+defm LDRH : Load16RO<0b01,  1, 0b01, FPR16,  "ldr", f16, load>;
+defm LDRS : Load32RO<0b10,  1, 0b01, FPR32,  "ldr", f32, load>;
+defm LDRD : Load64RO<0b11,  1, 0b01, FPR64,  "ldr", f64, load>;
+defm LDRQ : Load128RO<0b00, 1, 0b11, FPR128, "ldr", f128, load>;
+
+// Load sign-extended half-word
+defm LDRSHW : Load16RO<0b01, 0, 0b11, GPR32, "ldrsh", i32, sextloadi16>;
+defm LDRSHX : Load16RO<0b01, 0, 0b10, GPR64, "ldrsh", i64, sextloadi16>;
+
+// Load sign-extended byte
+defm LDRSBW : Load8RO<0b00, 0, 0b11, GPR32, "ldrsb", i32, sextloadi8>;
+defm LDRSBX : Load8RO<0b00, 0, 0b10, GPR64, "ldrsb", i64, sextloadi8>;
+
+// Load sign-extended word
+defm LDRSW  : Load32RO<0b10, 0, 0b10, GPR64, "ldrsw", i64, sextloadi32>;
+
+// Pre-fetch.
+defm PRFM : PrefetchRO<0b11, 0, 0b10, "prfm">;
+
+// For regular load, we do not have any alignment requirement.
+// Thus, it is safe to directly map the vector loads with interesting
+// addressing modes.
+// FIXME: We could do the same for bitconvert to floating point vectors.
+multiclass ScalToVecROLoadPat<ROAddrMode ro, SDPatternOperator loadop,
+                              ValueType ScalTy, ValueType VecTy,
+                              Instruction LOADW, Instruction LOADX,
+                              SubRegIndex sub> {
+  def : Pat<(VecTy (scalar_to_vector (ScalTy
+              (loadop (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$offset))))),
+            (INSERT_SUBREG (VecTy (IMPLICIT_DEF)),
+                           (LOADW GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$offset),
+                           sub)>;
+
+  def : Pat<(VecTy (scalar_to_vector (ScalTy
+              (loadop (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$offset))))),
+            (INSERT_SUBREG (VecTy (IMPLICIT_DEF)),
+                           (LOADX GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$offset),
+                           sub)>;
 }
 
-// We need the cartesian product of f32/f64 i32/i64 operands for
-// conversions:
-//   + Selection needs to use operands of correct floating type
-//   + Assembly parsing and decoding depend on integer width
-class cvtfix_i32_op<ValueType FloatVT>
-  : Operand<FloatVT>,
-    ComplexPattern<FloatVT, 1, "SelectCVTFixedPosOperand<32>", [fpimm]> {
-  let ParserMatchClass = fixedpos_asmoperand_i32;
-  let DecoderMethod = "DecodeCVT32FixedPosOperand";
-  let PrintMethod = "printCVTFixedPosOperand";
-}
+let AddedComplexity = 10 in {
+defm : ScalToVecROLoadPat<ro8,  extloadi8,  i32, v8i8,  LDRBroW, LDRBroX, bsub>;
+defm : ScalToVecROLoadPat<ro8,  extloadi8,  i32, v16i8, LDRBroW, LDRBroX, bsub>;
 
-class cvtfix_i64_op<ValueType FloatVT>
-  : Operand<FloatVT>,
-    ComplexPattern<FloatVT, 1, "SelectCVTFixedPosOperand<64>", [fpimm]> {
-  let ParserMatchClass = fixedpos_asmoperand_i64;
-  let PrintMethod = "printCVTFixedPosOperand";
-}
+defm : ScalToVecROLoadPat<ro16, extloadi16, i32, v4i16, LDRHroW, LDRHroX, hsub>;
+defm : ScalToVecROLoadPat<ro16, extloadi16, i32, v8i16, LDRHroW, LDRHroX, hsub>;
 
-// Because of the proliferation of weird operands, it's not really
-// worth going for a multiclass here. Oh well.
-
-class A64I_fptofix<bit sf, bits<2> type, bits<3> opcode,
-                   RegisterClass GPR, RegisterClass FPR,
-                   ValueType DstTy, ValueType SrcTy,
-                   Operand scale_op, string asmop, SDNode cvtop>
-  : A64I_fpfixed<sf, 0b0, type, 0b11, opcode,
-                 (outs GPR:$Rd), (ins FPR:$Rn, scale_op:$Scale),
-                 !strconcat(asmop, "\t$Rd, $Rn, $Scale"),
-                 [(set DstTy:$Rd, (cvtop (fmul SrcTy:$Rn, scale_op:$Scale)))],
-                 NoItinerary>,
-    Sched<[WriteFPALU, ReadFPALU]>;
-
-def FCVTZSwsi : A64I_fptofix<0b0, 0b00, 0b000, GPR32, FPR32, i32, f32,
-                             cvtfix_i32_op<f32>, "fcvtzs", fp_to_sint>;
-def FCVTZSxsi : A64I_fptofix<0b1, 0b00, 0b000, GPR64, FPR32, i64, f32,
-                             cvtfix_i64_op<f32>, "fcvtzs", fp_to_sint>;
-def FCVTZUwsi : A64I_fptofix<0b0, 0b00, 0b001, GPR32, FPR32, i32, f32,
-                             cvtfix_i32_op<f32>, "fcvtzu", fp_to_uint>;
-def FCVTZUxsi : A64I_fptofix<0b1, 0b00, 0b001, GPR64, FPR32, i64, f32,
-                             cvtfix_i64_op<f32>, "fcvtzu", fp_to_uint>;
-
-def FCVTZSwdi : A64I_fptofix<0b0, 0b01, 0b000, GPR32, FPR64, i32, f64,
-                             cvtfix_i32_op<f64>, "fcvtzs", fp_to_sint>;
-def FCVTZSxdi : A64I_fptofix<0b1, 0b01, 0b000, GPR64, FPR64, i64, f64,
-                             cvtfix_i64_op<f64>, "fcvtzs", fp_to_sint>;
-def FCVTZUwdi : A64I_fptofix<0b0, 0b01, 0b001, GPR32, FPR64, i32, f64,
-                             cvtfix_i32_op<f64>, "fcvtzu", fp_to_uint>;
-def FCVTZUxdi : A64I_fptofix<0b1, 0b01, 0b001, GPR64, FPR64, i64, f64,
-                             cvtfix_i64_op<f64>, "fcvtzu", fp_to_uint>;
-
-
-class A64I_fixtofp<bit sf, bits<2> type, bits<3> opcode,
-                   RegisterClass FPR, RegisterClass GPR,
-                   ValueType DstTy, ValueType SrcTy,
-                   Operand scale_op, string asmop, SDNode cvtop>
-  : A64I_fpfixed<sf, 0b0, type, 0b00, opcode,
-                 (outs FPR:$Rd), (ins GPR:$Rn, scale_op:$Scale),
-                 !strconcat(asmop, "\t$Rd, $Rn, $Scale"),
-                 [(set DstTy:$Rd, (fdiv (cvtop SrcTy:$Rn), scale_op:$Scale))],
-                 NoItinerary>,
-    Sched<[WriteFPALU, ReadFPALU]>;
-
-def SCVTFswi : A64I_fixtofp<0b0, 0b00, 0b010, FPR32, GPR32, f32, i32,
-                            cvtfix_i32_op<f32>, "scvtf", sint_to_fp>;
-def SCVTFsxi : A64I_fixtofp<0b1, 0b00, 0b010, FPR32, GPR64, f32, i64,
-                            cvtfix_i64_op<f32>, "scvtf", sint_to_fp>;
-def UCVTFswi : A64I_fixtofp<0b0, 0b00, 0b011, FPR32, GPR32, f32, i32,
-                            cvtfix_i32_op<f32>, "ucvtf", uint_to_fp>;
-def UCVTFsxi : A64I_fixtofp<0b1, 0b00, 0b011, FPR32, GPR64, f32, i64,
-                            cvtfix_i64_op<f32>, "ucvtf", uint_to_fp>;
-def SCVTFdwi : A64I_fixtofp<0b0, 0b01, 0b010, FPR64, GPR32, f64, i32,
-                            cvtfix_i32_op<f64>, "scvtf", sint_to_fp>;
-def SCVTFdxi : A64I_fixtofp<0b1, 0b01, 0b010, FPR64, GPR64, f64, i64,
-                            cvtfix_i64_op<f64>, "scvtf", sint_to_fp>;
-def UCVTFdwi : A64I_fixtofp<0b0, 0b01, 0b011, FPR64, GPR32, f64, i32,
-                            cvtfix_i32_op<f64>, "ucvtf", uint_to_fp>;
-def UCVTFdxi : A64I_fixtofp<0b1, 0b01, 0b011, FPR64, GPR64, f64, i64,
-                            cvtfix_i64_op<f64>, "ucvtf", uint_to_fp>;
+defm : ScalToVecROLoadPat<ro32, load,       i32, v2i32, LDRSroW, LDRSroX, ssub>;
+defm : ScalToVecROLoadPat<ro32, load,       i32, v4i32, LDRSroW, LDRSroX, ssub>;
 
-//===----------------------------------------------------------------------===//
-// Floating-point <-> integer conversion instructions
-//===----------------------------------------------------------------------===//
-// Contains: FCVTZS, FCVTZU, SCVTF, UCVTF
-
-class A64I_fpintI<bit sf, bits<2> type, bits<2> rmode, bits<3> opcode,
-                   RegisterClass DestPR, RegisterClass SrcPR, string asmop>
-  : A64I_fpint<sf, 0b0, type, rmode, opcode, (outs DestPR:$Rd), (ins SrcPR:$Rn),
-               !strconcat(asmop, "\t$Rd, $Rn"), [], NoItinerary>,
-    Sched<[WriteFPALU, ReadFPALU]>;
-
-multiclass A64I_fptointRM<bits<2> rmode, bit o2, string asmop> {
-  def Sws : A64I_fpintI<0b0, 0b00, rmode, {o2, 0, 0},
-                        GPR32, FPR32, asmop # "s">;
-  def Sxs : A64I_fpintI<0b1, 0b00, rmode, {o2, 0, 0},
-                        GPR64, FPR32, asmop # "s">;
-  def Uws : A64I_fpintI<0b0, 0b00, rmode, {o2, 0, 1},
-                        GPR32, FPR32, asmop # "u">;
-  def Uxs : A64I_fpintI<0b1, 0b00, rmode, {o2, 0, 1},
-                        GPR64, FPR32, asmop # "u">;
-
-  def Swd : A64I_fpintI<0b0, 0b01, rmode, {o2, 0, 0},
-                        GPR32, FPR64, asmop # "s">;
-  def Sxd : A64I_fpintI<0b1, 0b01, rmode, {o2, 0, 0},
-                        GPR64, FPR64, asmop # "s">;
-  def Uwd : A64I_fpintI<0b0, 0b01, rmode, {o2, 0, 1},
-                        GPR32, FPR64, asmop # "u">;
-  def Uxd : A64I_fpintI<0b1, 0b01, rmode, {o2, 0, 1},
-                        GPR64, FPR64, asmop # "u">;
-}
+defm : ScalToVecROLoadPat<ro32, load,       f32, v2f32, LDRSroW, LDRSroX, ssub>;
+defm : ScalToVecROLoadPat<ro32, load,       f32, v4f32, LDRSroW, LDRSroX, ssub>;
 
-defm FCVTN : A64I_fptointRM<0b00, 0b0, "fcvtn">;
-defm FCVTP : A64I_fptointRM<0b01, 0b0, "fcvtp">;
-defm FCVTM : A64I_fptointRM<0b10, 0b0, "fcvtm">;
-defm FCVTZ : A64I_fptointRM<0b11, 0b0, "fcvtz">;
-defm FCVTA : A64I_fptointRM<0b00, 0b1, "fcvta">;
-
-let Predicates = [HasFPARMv8] in {
-def : Pat<(i32 (fp_to_sint f32:$Rn)), (FCVTZSws $Rn)>;
-def : Pat<(i64 (fp_to_sint f32:$Rn)), (FCVTZSxs $Rn)>;
-def : Pat<(i32 (fp_to_uint f32:$Rn)), (FCVTZUws $Rn)>;
-def : Pat<(i64 (fp_to_uint f32:$Rn)), (FCVTZUxs $Rn)>;
-def : Pat<(i32 (fp_to_sint f64:$Rn)), (FCVTZSwd $Rn)>;
-def : Pat<(i64 (fp_to_sint f64:$Rn)), (FCVTZSxd $Rn)>;
-def : Pat<(i32 (fp_to_uint f64:$Rn)), (FCVTZUwd $Rn)>;
-def : Pat<(i64 (fp_to_uint f64:$Rn)), (FCVTZUxd $Rn)>;
-}
+defm : ScalToVecROLoadPat<ro64, load,       i64, v2i64, LDRDroW, LDRDroX, dsub>;
 
-multiclass A64I_inttofp<bit o0, string asmop> {
-  def CVTFsw : A64I_fpintI<0b0, 0b00, 0b00, {0, 1, o0}, FPR32, GPR32, asmop>;
-  def CVTFsx : A64I_fpintI<0b1, 0b00, 0b00, {0, 1, o0}, FPR32, GPR64, asmop>;
-  def CVTFdw : A64I_fpintI<0b0, 0b01, 0b00, {0, 1, o0}, FPR64, GPR32, asmop>;
-  def CVTFdx : A64I_fpintI<0b1, 0b01, 0b00, {0, 1, o0}, FPR64, GPR64, asmop>;
-}
-
-defm S : A64I_inttofp<0b0, "scvtf">;
-defm U : A64I_inttofp<0b1, "ucvtf">;
-
-let Predicates = [HasFPARMv8] in {
-def : Pat<(f32 (sint_to_fp i32:$Rn)), (SCVTFsw $Rn)>;
-def : Pat<(f32 (sint_to_fp i64:$Rn)), (SCVTFsx $Rn)>;
-def : Pat<(f64 (sint_to_fp i32:$Rn)), (SCVTFdw $Rn)>;
-def : Pat<(f64 (sint_to_fp i64:$Rn)), (SCVTFdx $Rn)>;
-def : Pat<(f32 (uint_to_fp i32:$Rn)), (UCVTFsw $Rn)>;
-def : Pat<(f32 (uint_to_fp i64:$Rn)), (UCVTFsx $Rn)>;
-def : Pat<(f64 (uint_to_fp i32:$Rn)), (UCVTFdw $Rn)>;
-def : Pat<(f64 (uint_to_fp i64:$Rn)), (UCVTFdx $Rn)>;
-}
+defm : ScalToVecROLoadPat<ro64, load,       f64, v2f64, LDRDroW, LDRDroX, dsub>;
 
-def FMOVws : A64I_fpintI<0b0, 0b00, 0b00, 0b110, GPR32, FPR32, "fmov">;
-def FMOVsw : A64I_fpintI<0b0, 0b00, 0b00, 0b111, FPR32, GPR32, "fmov">;
-def FMOVxd : A64I_fpintI<0b1, 0b01, 0b00, 0b110, GPR64, FPR64, "fmov">;
-def FMOVdx : A64I_fpintI<0b1, 0b01, 0b00, 0b111, FPR64, GPR64, "fmov">;
 
-let Predicates = [HasFPARMv8] in {
-def : Pat<(i32 (bitconvert f32:$Rn)), (FMOVws $Rn)>;
-def : Pat<(f32 (bitconvert i32:$Rn)), (FMOVsw $Rn)>;
-def : Pat<(i64 (bitconvert f64:$Rn)), (FMOVxd $Rn)>;
-def : Pat<(f64 (bitconvert i64:$Rn)), (FMOVdx $Rn)>;
-}
+def : Pat <(v1i64 (scalar_to_vector (i64
+                      (load (ro_Windexed64 GPR64sp:$Rn, GPR32:$Rm,
+                                           ro_Wextend64:$extend))))),
+           (LDRDroW GPR64sp:$Rn, GPR32:$Rm, ro_Wextend64:$extend)>;
 
-def lane1_asmoperand : AsmOperandClass {
-  let Name = "Lane1";
-  let RenderMethod = "addImmOperands";
-  let DiagnosticType = "Lane1";
+def : Pat <(v1i64 (scalar_to_vector (i64
+                      (load (ro_Xindexed64 GPR64sp:$Rn, GPR64:$Rm,
+                                           ro_Xextend64:$extend))))),
+           (LDRDroX GPR64sp:$Rn, GPR64:$Rm, ro_Xextend64:$extend)>;
 }
 
-def lane1 : Operand<i32> {
-  let ParserMatchClass = lane1_asmoperand;
-  let PrintMethod = "printBareImmOperand";
-}
+// Match all load 64 bits width whose type is compatible with FPR64
+multiclass VecROLoadPat<ROAddrMode ro, ValueType VecTy,
+                        Instruction LOADW, Instruction LOADX> {
 
-let DecoderMethod =  "DecodeFMOVLaneInstruction" in {
-  def FMOVxv : A64I_fpint<0b1, 0b0, 0b10, 0b01, 0b110,
-                          (outs GPR64:$Rd), (ins VPR128:$Rn, lane1:$Lane),
-                          "fmov\t$Rd, $Rn.d[$Lane]", [], NoItinerary>,
-               Sched<[WriteFPALU, ReadFPALU]>;
+  def : Pat<(VecTy (load (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend))),
+            (LOADW GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)>;
 
-  def FMOVvx : A64I_fpint<0b1, 0b0, 0b10, 0b01, 0b111,
-                          (outs VPR128:$Rd), (ins GPR64:$Rn, lane1:$Lane),
-                          "fmov\t$Rd.d[$Lane], $Rn", [], NoItinerary>,
-               Sched<[WriteFPALU, ReadFPALU]>;
+  def : Pat<(VecTy (load (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend))),
+            (LOADX GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)>;
 }
 
-let Predicates = [HasFPARMv8] in {
-def : InstAlias<"fmov $Rd, $Rn.2d[$Lane]",
-                (FMOVxv GPR64:$Rd, VPR128:$Rn, lane1:$Lane), 0b0>;
-
-def : InstAlias<"fmov $Rd.2d[$Lane], $Rn",
-                (FMOVvx VPR128:$Rd, GPR64:$Rn, lane1:$Lane), 0b0>;
-}
-
-//===----------------------------------------------------------------------===//
-// Floating-point immediate instructions
-//===----------------------------------------------------------------------===//
-// Contains: FMOV
-
-def fpimm_asmoperand : AsmOperandClass {
-  let Name = "FMOVImm";
-  let ParserMethod = "ParseFPImmOperand";
-  let DiagnosticType = "FPImm";
-}
-
-// The MCOperand for these instructions are the encoded 8-bit values.
-def SDXF_fpimm : SDNodeXForm<fpimm, [{
-  uint32_t Imm8;
-  A64Imms::isFPImm(N->getValueAPF(), Imm8);
-  return CurDAG->getTargetConstant(Imm8, MVT::i32);
-}]>;
-
-class fmov_operand<ValueType FT>
-  : Operand<i32>,
-    PatLeaf<(FT fpimm), [{ return A64Imms::isFPImm(N->getValueAPF()); }],
-            SDXF_fpimm> {
-  let PrintMethod = "printFPImmOperand";
-  let ParserMatchClass = fpimm_asmoperand;
+let AddedComplexity = 10 in {
+let Predicates = [IsLE] in {
+  // We must do vector loads with LD1 in big-endian.
+  defm : VecROLoadPat<ro64, v2i32, LDRDroW, LDRDroX>;
+  defm : VecROLoadPat<ro64, v2f32, LDRDroW, LDRDroX>;
+  defm : VecROLoadPat<ro64, v8i8,  LDRDroW, LDRDroX>;
+  defm : VecROLoadPat<ro64, v4i16, LDRDroW, LDRDroX>;
+}
+
+defm : VecROLoadPat<ro64, v1i64,  LDRDroW, LDRDroX>;
+defm : VecROLoadPat<ro64, v1f64,  LDRDroW, LDRDroX>;
+
+// Match all load 128 bits width whose type is compatible with FPR128
+let Predicates = [IsLE] in {
+  // We must do vector loads with LD1 in big-endian.
+  defm : VecROLoadPat<ro128, v2i64,  LDRQroW, LDRQroX>;
+  defm : VecROLoadPat<ro128, v2f64,  LDRQroW, LDRQroX>;
+  defm : VecROLoadPat<ro128, v4i32,  LDRQroW, LDRQroX>;
+  defm : VecROLoadPat<ro128, v4f32,  LDRQroW, LDRQroX>;
+  defm : VecROLoadPat<ro128, v8i16,  LDRQroW, LDRQroX>;
+  defm : VecROLoadPat<ro128, v16i8,  LDRQroW, LDRQroX>;
+}
+} // AddedComplexity = 10
+
+// zextload -> i64
+multiclass ExtLoadTo64ROPat<ROAddrMode ro, SDPatternOperator loadop,
+                            Instruction INSTW, Instruction INSTX> {
+  def : Pat<(i64 (loadop (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend))),
+            (SUBREG_TO_REG (i64 0),
+                           (INSTW GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend),
+                           sub_32)>;
+
+  def : Pat<(i64 (loadop (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend))),
+            (SUBREG_TO_REG (i64 0),
+                           (INSTX GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend),
+                           sub_32)>;
 }
 
-def fmov32_operand : fmov_operand<f32>;
-def fmov64_operand : fmov_operand<f64>;
-
-class A64I_fpimm_impl<bits<2> type, RegisterClass Reg, ValueType VT,
-                      Operand fmov_operand>
-  : A64I_fpimm<0b0, 0b0, type, 0b00000,
-               (outs Reg:$Rd),
-               (ins fmov_operand:$Imm8),
-               "fmov\t$Rd, $Imm8",
-               [(set VT:$Rd, fmov_operand:$Imm8)],
-               NoItinerary>,
-    Sched<[WriteFPALU]>;
-
-def FMOVsi : A64I_fpimm_impl<0b00, FPR32, f32, fmov32_operand>;
-def FMOVdi : A64I_fpimm_impl<0b01, FPR64, f64, fmov64_operand>;
+let AddedComplexity = 10 in {
+  defm : ExtLoadTo64ROPat<ro8,  zextloadi8,  LDRBBroW, LDRBBroX>;
+  defm : ExtLoadTo64ROPat<ro16, zextloadi16, LDRHHroW, LDRHHroX>;
+  defm : ExtLoadTo64ROPat<ro32, zextloadi32, LDRWroW,  LDRWroX>;
 
-//===----------------------------------------------------------------------===//
-// Load-register (literal) instructions
-//===----------------------------------------------------------------------===//
-// Contains: LDR, LDRSW, PRFM
+  // zextloadi1 -> zextloadi8
+  defm : ExtLoadTo64ROPat<ro8,  zextloadi1,  LDRBBroW, LDRBBroX>;
 
-def ldrlit_label_asmoperand : AsmOperandClass {
-  let Name = "LoadLitLabel";
-  let RenderMethod = "addLabelOperands<19, 4>";
-  let DiagnosticType = "Label";
-}
+  // extload -> zextload
+  defm : ExtLoadTo64ROPat<ro8,  extloadi8,   LDRBBroW, LDRBBroX>;
+  defm : ExtLoadTo64ROPat<ro16, extloadi16,  LDRHHroW, LDRHHroX>;
+  defm : ExtLoadTo64ROPat<ro32, extloadi32,  LDRWroW,  LDRWroX>;
 
-def ldrlit_label : Operand<i64> {
-  let EncoderMethod = "getLoadLitLabelOpValue";
-
-  // This label is a 19-bit offset from PC, scaled by the instruction-width: 4.
-  let PrintMethod = "printLabelOperand<19, 4>";
-  let ParserMatchClass = ldrlit_label_asmoperand;
-  let OperandType = "OPERAND_PCREL";
+  // extloadi1 -> zextloadi8
+  defm : ExtLoadTo64ROPat<ro8,  extloadi1,   LDRBBroW, LDRBBroX>;
 }
 
-// Various instructions take an immediate value (which can always be used),
-// where some numbers have a symbolic name to make things easier. These operands
-// and the associated functions abstract away the differences.
-multiclass namedimm<string prefix, string mapper> {
-  def _asmoperand : AsmOperandClass {
-    let Name = "NamedImm" # prefix;
-    let PredicateMethod = "isUImm";
-    let RenderMethod = "addImmOperands";
-    let ParserMethod = "ParseNamedImmOperand<" # mapper # ">";
-    let DiagnosticType = "NamedImm_" # prefix;
-  }
 
-  def _op : Operand<i32> {
-    let ParserMatchClass = !cast<AsmOperandClass>(prefix # "_asmoperand");
-    let PrintMethod = "printNamedImmOperand<" # mapper # ">";
-    let DecoderMethod = "DecodeNamedImmOperand<" # mapper # ">";
-  }
-}
-
-defm prefetch : namedimm<"prefetch", "A64PRFM::PRFMMapper">;
+// zextload -> i64
+multiclass ExtLoadTo32ROPat<ROAddrMode ro, SDPatternOperator loadop,
+                            Instruction INSTW, Instruction INSTX> {
+  def : Pat<(i32 (loadop (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend))),
+            (INSTW GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)>;
 
-class A64I_LDRlitSimple<bits<2> opc, bit v, RegisterClass OutReg,
-                      list<dag> patterns = []>
-   : A64I_LDRlit<opc, v, (outs OutReg:$Rt), (ins ldrlit_label:$Imm19),
-                 "ldr\t$Rt, $Imm19", patterns, NoItinerary>,
-     Sched<[WriteLd]>;
+  def : Pat<(i32 (loadop (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend))),
+            (INSTX GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)>;
 
-let mayLoad = 1 in {
-  def LDRw_lit : A64I_LDRlitSimple<0b00, 0b0, GPR32>;
-  def LDRx_lit : A64I_LDRlitSimple<0b01, 0b0, GPR64>;
 }
 
-let Predicates = [HasFPARMv8] in {
-def LDRs_lit  : A64I_LDRlitSimple<0b00, 0b1, FPR32>;
-def LDRd_lit  : A64I_LDRlitSimple<0b01, 0b1, FPR64>;
+let AddedComplexity = 10 in {
+  // extload -> zextload
+  defm : ExtLoadTo32ROPat<ro8,  extloadi8,   LDRBBroW, LDRBBroX>;
+  defm : ExtLoadTo32ROPat<ro16, extloadi16,  LDRHHroW, LDRHHroX>;
+  defm : ExtLoadTo32ROPat<ro32, extloadi32,  LDRWroW,  LDRWroX>;
+
+  // zextloadi1 -> zextloadi8
+  defm : ExtLoadTo32ROPat<ro8, zextloadi1, LDRBBroW, LDRBBroX>;
+}
+
+//---
+// (unsigned immediate)
+//---
+defm LDRX : LoadUI<0b11, 0, 0b01, GPR64, uimm12s8, "ldr",
+                   [(set GPR64:$Rt,
+                         (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)))]>;
+defm LDRW : LoadUI<0b10, 0, 0b01, GPR32, uimm12s4, "ldr",
+                   [(set GPR32:$Rt,
+                         (load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset)))]>;
+defm LDRB : LoadUI<0b00, 1, 0b01, FPR8, uimm12s1, "ldr",
+                   [(set FPR8:$Rt,
+                         (load (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset)))]>;
+defm LDRH : LoadUI<0b01, 1, 0b01, FPR16, uimm12s2, "ldr",
+                   [(set (f16 FPR16:$Rt),
+                         (load (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset)))]>;
+defm LDRS : LoadUI<0b10, 1, 0b01, FPR32, uimm12s4, "ldr",
+                   [(set (f32 FPR32:$Rt),
+                         (load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset)))]>;
+defm LDRD : LoadUI<0b11, 1, 0b01, FPR64, uimm12s8, "ldr",
+                   [(set (f64 FPR64:$Rt),
+                         (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)))]>;
+defm LDRQ : LoadUI<0b00, 1, 0b11, FPR128, uimm12s16, "ldr",
+                 [(set (f128 FPR128:$Rt),
+                       (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)))]>;
+
+// For regular load, we do not have any alignment requirement.
+// Thus, it is safe to directly map the vector loads with interesting
+// addressing modes.
+// FIXME: We could do the same for bitconvert to floating point vectors.
+def : Pat <(v8i8 (scalar_to_vector (i32
+               (extloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))),
+           (INSERT_SUBREG (v8i8 (IMPLICIT_DEF)),
+                          (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>;
+def : Pat <(v16i8 (scalar_to_vector (i32
+               (extloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))),
+           (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+                          (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub)>;
+def : Pat <(v4i16 (scalar_to_vector (i32
+               (extloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))),
+           (INSERT_SUBREG (v4i16 (IMPLICIT_DEF)),
+                          (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>;
+def : Pat <(v8i16 (scalar_to_vector (i32
+               (extloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))),
+           (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)),
+                          (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub)>;
+def : Pat <(v2i32 (scalar_to_vector (i32
+               (load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))),
+           (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)),
+                          (LDRSui GPR64sp:$Rn, uimm12s4:$offset), ssub)>;
+def : Pat <(v4i32 (scalar_to_vector (i32
+               (load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))),
+           (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)),
+                          (LDRSui GPR64sp:$Rn, uimm12s4:$offset), ssub)>;
+def : Pat <(v1i64 (scalar_to_vector (i64
+               (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))))),
+           (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
+def : Pat <(v2i64 (scalar_to_vector (i64
+               (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))))),
+           (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)),
+                          (LDRDui GPR64sp:$Rn, uimm12s8:$offset), dsub)>;
+
+// Match all load 64 bits width whose type is compatible with FPR64
+let Predicates = [IsLE] in {
+  // We must use LD1 to perform vector loads in big-endian.
+  def : Pat<(v2f32 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))),
+            (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
+  def : Pat<(v8i8 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))),
+            (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
+  def : Pat<(v4i16 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))),
+            (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
+  def : Pat<(v2i32 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))),
+            (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
+}
+def : Pat<(v1f64 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))),
+          (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
+def : Pat<(v1i64 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))),
+          (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
+
+// Match all load 128 bits width whose type is compatible with FPR128
+let Predicates = [IsLE] in {
+  // We must use LD1 to perform vector loads in big-endian.
+  def : Pat<(v4f32 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))),
+            (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>;
+  def : Pat<(v2f64 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))),
+            (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>;
+  def : Pat<(v16i8 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))),
+            (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>;
+  def : Pat<(v8i16 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))),
+            (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>;
+  def : Pat<(v4i32 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))),
+            (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>;
+  def : Pat<(v2i64 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))),
+            (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>;
+}
+def : Pat<(f128  (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))),
+          (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>;
+
+defm LDRHH : LoadUI<0b01, 0, 0b01, GPR32, uimm12s2, "ldrh",
+                    [(set GPR32:$Rt,
+                          (zextloadi16 (am_indexed16 GPR64sp:$Rn,
+                                                     uimm12s2:$offset)))]>;
+defm LDRBB : LoadUI<0b00, 0, 0b01, GPR32, uimm12s1, "ldrb",
+                    [(set GPR32:$Rt,
+                          (zextloadi8 (am_indexed8 GPR64sp:$Rn,
+                                                   uimm12s1:$offset)))]>;
+// zextload -> i64
+def : Pat<(i64 (zextloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))),
+    (SUBREG_TO_REG (i64 0), (LDRBBui GPR64sp:$Rn, uimm12s1:$offset), sub_32)>;
+def : Pat<(i64 (zextloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))),
+    (SUBREG_TO_REG (i64 0), (LDRHHui GPR64sp:$Rn, uimm12s2:$offset), sub_32)>;
+
+// zextloadi1 -> zextloadi8
+def : Pat<(i32 (zextloadi1 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))),
+          (LDRBBui GPR64sp:$Rn, uimm12s1:$offset)>;
+def : Pat<(i64 (zextloadi1 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))),
+    (SUBREG_TO_REG (i64 0), (LDRBBui GPR64sp:$Rn, uimm12s1:$offset), sub_32)>;
+
+// extload -> zextload
+def : Pat<(i32 (extloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))),
+          (LDRHHui GPR64sp:$Rn, uimm12s2:$offset)>;
+def : Pat<(i32 (extloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))),
+          (LDRBBui GPR64sp:$Rn, uimm12s1:$offset)>;
+def : Pat<(i32 (extloadi1 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))),
+          (LDRBBui GPR64sp:$Rn, uimm12s1:$offset)>;
+def : Pat<(i64 (extloadi32 (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))),
+    (SUBREG_TO_REG (i64 0), (LDRWui GPR64sp:$Rn, uimm12s4:$offset), sub_32)>;
+def : Pat<(i64 (extloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))),
+    (SUBREG_TO_REG (i64 0), (LDRHHui GPR64sp:$Rn, uimm12s2:$offset), sub_32)>;
+def : Pat<(i64 (extloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))),
+    (SUBREG_TO_REG (i64 0), (LDRBBui GPR64sp:$Rn, uimm12s1:$offset), sub_32)>;
+def : Pat<(i64 (extloadi1 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))),
+    (SUBREG_TO_REG (i64 0), (LDRBBui GPR64sp:$Rn, uimm12s1:$offset), sub_32)>;
+
+// load sign-extended half-word
+defm LDRSHW : LoadUI<0b01, 0, 0b11, GPR32, uimm12s2, "ldrsh",
+                     [(set GPR32:$Rt,
+                           (sextloadi16 (am_indexed16 GPR64sp:$Rn,
+                                                      uimm12s2:$offset)))]>;
+defm LDRSHX : LoadUI<0b01, 0, 0b10, GPR64, uimm12s2, "ldrsh",
+                     [(set GPR64:$Rt,
+                           (sextloadi16 (am_indexed16 GPR64sp:$Rn,
+                                                      uimm12s2:$offset)))]>;
+
+// load sign-extended byte
+defm LDRSBW : LoadUI<0b00, 0, 0b11, GPR32, uimm12s1, "ldrsb",
+                     [(set GPR32:$Rt,
+                           (sextloadi8 (am_indexed8 GPR64sp:$Rn,
+                                                    uimm12s1:$offset)))]>;
+defm LDRSBX : LoadUI<0b00, 0, 0b10, GPR64, uimm12s1, "ldrsb",
+                     [(set GPR64:$Rt,
+                           (sextloadi8 (am_indexed8 GPR64sp:$Rn,
+                                                    uimm12s1:$offset)))]>;
+
+// load sign-extended word
+defm LDRSW  : LoadUI<0b10, 0, 0b10, GPR64, uimm12s4, "ldrsw",
+                     [(set GPR64:$Rt,
+                           (sextloadi32 (am_indexed32 GPR64sp:$Rn,
+                                                      uimm12s4:$offset)))]>;
+
+// load zero-extended word
+def : Pat<(i64 (zextloadi32 (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))),
+      (SUBREG_TO_REG (i64 0), (LDRWui GPR64sp:$Rn, uimm12s4:$offset), sub_32)>;
+
+// Pre-fetch.
+def PRFMui : PrefetchUI<0b11, 0, 0b10, "prfm",
+                        [(AArch64Prefetch imm:$Rt,
+                                        (am_indexed64 GPR64sp:$Rn,
+                                                      uimm12s8:$offset))]>;
+
+def : InstAlias<"prfm $Rt, [$Rn]", (PRFMui prfop:$Rt, GPR64sp:$Rn, 0)>;
+
+//---
+// (literal)
+def LDRWl : LoadLiteral<0b00, 0, GPR32, "ldr">;
+def LDRXl : LoadLiteral<0b01, 0, GPR64, "ldr">;
+def LDRSl : LoadLiteral<0b00, 1, FPR32, "ldr">;
+def LDRDl : LoadLiteral<0b01, 1, FPR64, "ldr">;
+def LDRQl : LoadLiteral<0b10, 1, FPR128, "ldr">;
+
+// load sign-extended word
+def LDRSWl : LoadLiteral<0b10, 0, GPR64, "ldrsw">;
+
+// prefetch
+def PRFMl : PrefetchLiteral<0b11, 0, "prfm", []>;
+//                   [(AArch64Prefetch imm:$Rt, tglobaladdr:$label)]>;
+
+//---
+// (unscaled immediate)
+defm LDURX : LoadUnscaled<0b11, 0, 0b01, GPR64, "ldur",
+                    [(set GPR64:$Rt,
+                          (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset)))]>;
+defm LDURW : LoadUnscaled<0b10, 0, 0b01, GPR32, "ldur",
+                    [(set GPR32:$Rt,
+                          (load (am_unscaled32 GPR64sp:$Rn, simm9:$offset)))]>;
+defm LDURB : LoadUnscaled<0b00, 1, 0b01, FPR8, "ldur",
+                    [(set FPR8:$Rt,
+                          (load (am_unscaled8 GPR64sp:$Rn, simm9:$offset)))]>;
+defm LDURH : LoadUnscaled<0b01, 1, 0b01, FPR16, "ldur",
+                    [(set FPR16:$Rt,
+                          (load (am_unscaled16 GPR64sp:$Rn, simm9:$offset)))]>;
+defm LDURS : LoadUnscaled<0b10, 1, 0b01, FPR32, "ldur",
+                    [(set (f32 FPR32:$Rt),
+                          (load (am_unscaled32 GPR64sp:$Rn, simm9:$offset)))]>;
+defm LDURD : LoadUnscaled<0b11, 1, 0b01, FPR64, "ldur",
+                    [(set (f64 FPR64:$Rt),
+                          (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset)))]>;
+defm LDURQ : LoadUnscaled<0b00, 1, 0b11, FPR128, "ldur",
+                    [(set (f128 FPR128:$Rt),
+                          (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset)))]>;
+
+defm LDURHH
+    : LoadUnscaled<0b01, 0, 0b01, GPR32, "ldurh",
+             [(set GPR32:$Rt,
+                    (zextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset)))]>;
+defm LDURBB
+    : LoadUnscaled<0b00, 0, 0b01, GPR32, "ldurb",
+             [(set GPR32:$Rt,
+                    (zextloadi8 (am_unscaled16 GPR64sp:$Rn, simm9:$offset)))]>;
+
+// Match all load 64 bits width whose type is compatible with FPR64
+let Predicates = [IsLE] in {
+  def : Pat<(v2f32 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))),
+            (LDURDi GPR64sp:$Rn, simm9:$offset)>;
+  def : Pat<(v2i32 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))),
+            (LDURDi GPR64sp:$Rn, simm9:$offset)>;
+  def : Pat<(v4i16 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))),
+            (LDURDi GPR64sp:$Rn, simm9:$offset)>;
+  def : Pat<(v8i8 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))),
+            (LDURDi GPR64sp:$Rn, simm9:$offset)>;
+}
+def : Pat<(v1f64 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))),
+          (LDURDi GPR64sp:$Rn, simm9:$offset)>;
+def : Pat<(v1i64 (load (am_unscaled64 GPR64sp:$Rn, simm9:$offset))),
+          (LDURDi GPR64sp:$Rn, simm9:$offset)>;
+
+// Match all load 128 bits width whose type is compatible with FPR128
+let Predicates = [IsLE] in {
+  def : Pat<(v2f64 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))),
+            (LDURQi GPR64sp:$Rn, simm9:$offset)>;
+  def : Pat<(v2i64 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))),
+            (LDURQi GPR64sp:$Rn, simm9:$offset)>;
+  def : Pat<(v4f32 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))),
+            (LDURQi GPR64sp:$Rn, simm9:$offset)>;
+  def : Pat<(v4i32 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))),
+            (LDURQi GPR64sp:$Rn, simm9:$offset)>;
+  def : Pat<(v8i16 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))),
+            (LDURQi GPR64sp:$Rn, simm9:$offset)>;
+  def : Pat<(v16i8 (load (am_unscaled128 GPR64sp:$Rn, simm9:$offset))),
+            (LDURQi GPR64sp:$Rn, simm9:$offset)>;
+}
+
+//  anyext -> zext
+def : Pat<(i32 (extloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))),
+          (LDURHHi GPR64sp:$Rn, simm9:$offset)>;
+def : Pat<(i32 (extloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))),
+          (LDURBBi GPR64sp:$Rn, simm9:$offset)>;
+def : Pat<(i32 (extloadi1 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))),
+          (LDURBBi GPR64sp:$Rn, simm9:$offset)>;
+def : Pat<(i64 (extloadi32 (am_unscaled32 GPR64sp:$Rn, simm9:$offset))),
+    (SUBREG_TO_REG (i64 0), (LDURWi GPR64sp:$Rn, simm9:$offset), sub_32)>;
+def : Pat<(i64 (extloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))),
+    (SUBREG_TO_REG (i64 0), (LDURHHi GPR64sp:$Rn, simm9:$offset), sub_32)>;
+def : Pat<(i64 (extloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))),
+    (SUBREG_TO_REG (i64 0), (LDURBBi GPR64sp:$Rn, simm9:$offset), sub_32)>;
+def : Pat<(i64 (extloadi1 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))),
+    (SUBREG_TO_REG (i64 0), (LDURBBi GPR64sp:$Rn, simm9:$offset), sub_32)>;
+// unscaled zext
+def : Pat<(i32 (zextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))),
+          (LDURHHi GPR64sp:$Rn, simm9:$offset)>;
+def : Pat<(i32 (zextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))),
+          (LDURBBi GPR64sp:$Rn, simm9:$offset)>;
+def : Pat<(i32 (zextloadi1 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))),
+          (LDURBBi GPR64sp:$Rn, simm9:$offset)>;
+def : Pat<(i64 (zextloadi32 (am_unscaled32 GPR64sp:$Rn, simm9:$offset))),
+    (SUBREG_TO_REG (i64 0), (LDURWi GPR64sp:$Rn, simm9:$offset), sub_32)>;
+def : Pat<(i64 (zextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))),
+    (SUBREG_TO_REG (i64 0), (LDURHHi GPR64sp:$Rn, simm9:$offset), sub_32)>;
+def : Pat<(i64 (zextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))),
+    (SUBREG_TO_REG (i64 0), (LDURBBi GPR64sp:$Rn, simm9:$offset), sub_32)>;
+def : Pat<(i64 (zextloadi1 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))),
+    (SUBREG_TO_REG (i64 0), (LDURBBi GPR64sp:$Rn, simm9:$offset), sub_32)>;
+
+
+//---
+// LDR mnemonics fall back to LDUR for negative or unaligned offsets.
+
+// Define new assembler match classes as we want to only match these when
+// the don't otherwise match the scaled addressing mode for LDR/STR. Don't
+// associate a DiagnosticType either, as we want the diagnostic for the
+// canonical form (the scaled operand) to take precedence.
+class SImm9OffsetOperand<int Width> : AsmOperandClass {
+  let Name = "SImm9OffsetFB" # Width;
+  let PredicateMethod = "isSImm9OffsetFB<" # Width # ">";
+  let RenderMethod = "addImmOperands";
 }
 
-let mayLoad = 1 in {
-  let Predicates = [HasFPARMv8] in {
-  def LDRq_lit : A64I_LDRlitSimple<0b10, 0b1, FPR128>;
-  }
-
-  def LDRSWx_lit : A64I_LDRlit<0b10, 0b0,
-                               (outs GPR64:$Rt),
-                               (ins ldrlit_label:$Imm19),
-                               "ldrsw\t$Rt, $Imm19",
-                               [], NoItinerary>,
-                   Sched<[WriteLd]>;
-
-  def PRFM_lit : A64I_LDRlit<0b11, 0b0,
-                             (outs), (ins prefetch_op:$Rt, ldrlit_label:$Imm19),
-                             "prfm\t$Rt, $Imm19",
-                             [], NoItinerary>,
-                 Sched<[WriteLd, ReadLd]>;
-}
+def SImm9OffsetFB8Operand : SImm9OffsetOperand<8>;
+def SImm9OffsetFB16Operand : SImm9OffsetOperand<16>;
+def SImm9OffsetFB32Operand : SImm9OffsetOperand<32>;
+def SImm9OffsetFB64Operand : SImm9OffsetOperand<64>;
+def SImm9OffsetFB128Operand : SImm9OffsetOperand<128>;
+
+def simm9_offset_fb8 : Operand<i64> {
+  let ParserMatchClass = SImm9OffsetFB8Operand;
+}
+def simm9_offset_fb16 : Operand<i64> {
+  let ParserMatchClass = SImm9OffsetFB16Operand;
+}
+def simm9_offset_fb32 : Operand<i64> {
+  let ParserMatchClass = SImm9OffsetFB32Operand;
+}
+def simm9_offset_fb64 : Operand<i64> {
+  let ParserMatchClass = SImm9OffsetFB64Operand;
+}
+def simm9_offset_fb128 : Operand<i64> {
+  let ParserMatchClass = SImm9OffsetFB128Operand;
+}
+
+def : InstAlias<"ldr $Rt, [$Rn, $offset]",
+                (LDURXi GPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb64:$offset), 0>;
+def : InstAlias<"ldr $Rt, [$Rn, $offset]",
+                (LDURWi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb32:$offset), 0>;
+def : InstAlias<"ldr $Rt, [$Rn, $offset]",
+                (LDURBi FPR8:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>;
+def : InstAlias<"ldr $Rt, [$Rn, $offset]",
+                (LDURHi FPR16:$Rt, GPR64sp:$Rn, simm9_offset_fb16:$offset), 0>;
+def : InstAlias<"ldr $Rt, [$Rn, $offset]",
+                (LDURSi FPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb32:$offset), 0>;
+def : InstAlias<"ldr $Rt, [$Rn, $offset]",
+                (LDURDi FPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb64:$offset), 0>;
+def : InstAlias<"ldr $Rt, [$Rn, $offset]",
+               (LDURQi FPR128:$Rt, GPR64sp:$Rn, simm9_offset_fb128:$offset), 0>;
+
+// zextload -> i64
+def : Pat<(i64 (zextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))),
+  (SUBREG_TO_REG (i64 0), (LDURBBi GPR64sp:$Rn, simm9:$offset), sub_32)>;
+def : Pat<(i64 (zextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))),
+  (SUBREG_TO_REG (i64 0), (LDURHHi GPR64sp:$Rn, simm9:$offset), sub_32)>;
+
+// load sign-extended half-word
+defm LDURSHW
+    : LoadUnscaled<0b01, 0, 0b11, GPR32, "ldursh",
+               [(set GPR32:$Rt,
+                    (sextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset)))]>;
+defm LDURSHX
+    : LoadUnscaled<0b01, 0, 0b10, GPR64, "ldursh",
+              [(set GPR64:$Rt,
+                    (sextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset)))]>;
+
+// load sign-extended byte
+defm LDURSBW
+    : LoadUnscaled<0b00, 0, 0b11, GPR32, "ldursb",
+                [(set GPR32:$Rt,
+                      (sextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset)))]>;
+defm LDURSBX
+    : LoadUnscaled<0b00, 0, 0b10, GPR64, "ldursb",
+                [(set GPR64:$Rt,
+                      (sextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset)))]>;
+
+// load sign-extended word
+defm LDURSW
+    : LoadUnscaled<0b10, 0, 0b10, GPR64, "ldursw",
+              [(set GPR64:$Rt,
+                    (sextloadi32 (am_unscaled32 GPR64sp:$Rn, simm9:$offset)))]>;
+
+// zero and sign extending aliases from generic LDR* mnemonics to LDUR*.
+def : InstAlias<"ldrb $Rt, [$Rn, $offset]",
+                (LDURBBi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>;
+def : InstAlias<"ldrh $Rt, [$Rn, $offset]",
+                (LDURHHi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb16:$offset), 0>;
+def : InstAlias<"ldrsb $Rt, [$Rn, $offset]",
+                (LDURSBWi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>;
+def : InstAlias<"ldrsb $Rt, [$Rn, $offset]",
+                (LDURSBXi GPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>;
+def : InstAlias<"ldrsh $Rt, [$Rn, $offset]",
+                (LDURSHWi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb16:$offset), 0>;
+def : InstAlias<"ldrsh $Rt, [$Rn, $offset]",
+                (LDURSHXi GPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb16:$offset), 0>;
+def : InstAlias<"ldrsw $Rt, [$Rn, $offset]",
+                (LDURSWi GPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb32:$offset), 0>;
+
+// Pre-fetch.
+defm PRFUM : PrefetchUnscaled<0b11, 0, 0b10, "prfum",
+                  [(AArch64Prefetch imm:$Rt,
+                                  (am_unscaled64 GPR64sp:$Rn, simm9:$offset))]>;
+
+//---
+// (unscaled immediate, unprivileged)
+defm LDTRX : LoadUnprivileged<0b11, 0, 0b01, GPR64, "ldtr">;
+defm LDTRW : LoadUnprivileged<0b10, 0, 0b01, GPR32, "ldtr">;
+
+defm LDTRH : LoadUnprivileged<0b01, 0, 0b01, GPR32, "ldtrh">;
+defm LDTRB : LoadUnprivileged<0b00, 0, 0b01, GPR32, "ldtrb">;
+
+// load sign-extended half-word
+defm LDTRSHW : LoadUnprivileged<0b01, 0, 0b11, GPR32, "ldtrsh">;
+defm LDTRSHX : LoadUnprivileged<0b01, 0, 0b10, GPR64, "ldtrsh">;
+
+// load sign-extended byte
+defm LDTRSBW : LoadUnprivileged<0b00, 0, 0b11, GPR32, "ldtrsb">;
+defm LDTRSBX : LoadUnprivileged<0b00, 0, 0b10, GPR64, "ldtrsb">;
+
+// load sign-extended word
+defm LDTRSW  : LoadUnprivileged<0b10, 0, 0b10, GPR64, "ldtrsw">;
+
+//---
+// (immediate pre-indexed)
+def LDRWpre : LoadPreIdx<0b10, 0, 0b01, GPR32, "ldr">;
+def LDRXpre : LoadPreIdx<0b11, 0, 0b01, GPR64, "ldr">;
+def LDRBpre : LoadPreIdx<0b00, 1, 0b01, FPR8,  "ldr">;
+def LDRHpre : LoadPreIdx<0b01, 1, 0b01, FPR16, "ldr">;
+def LDRSpre : LoadPreIdx<0b10, 1, 0b01, FPR32, "ldr">;
+def LDRDpre : LoadPreIdx<0b11, 1, 0b01, FPR64, "ldr">;
+def LDRQpre : LoadPreIdx<0b00, 1, 0b11, FPR128, "ldr">;
+
+// load sign-extended half-word
+def LDRSHWpre : LoadPreIdx<0b01, 0, 0b11, GPR32, "ldrsh">;
+def LDRSHXpre : LoadPreIdx<0b01, 0, 0b10, GPR64, "ldrsh">;
+
+// load sign-extended byte
+def LDRSBWpre : LoadPreIdx<0b00, 0, 0b11, GPR32, "ldrsb">;
+def LDRSBXpre : LoadPreIdx<0b00, 0, 0b10, GPR64, "ldrsb">;
+
+// load zero-extended byte
+def LDRBBpre : LoadPreIdx<0b00, 0, 0b01, GPR32, "ldrb">;
+def LDRHHpre : LoadPreIdx<0b01, 0, 0b01, GPR32, "ldrh">;
+
+// load sign-extended word
+def LDRSWpre : LoadPreIdx<0b10, 0, 0b10, GPR64, "ldrsw">;
+
+//---
+// (immediate post-indexed)
+def LDRWpost : LoadPostIdx<0b10, 0, 0b01, GPR32, "ldr">;
+def LDRXpost : LoadPostIdx<0b11, 0, 0b01, GPR64, "ldr">;
+def LDRBpost : LoadPostIdx<0b00, 1, 0b01, FPR8,  "ldr">;
+def LDRHpost : LoadPostIdx<0b01, 1, 0b01, FPR16, "ldr">;
+def LDRSpost : LoadPostIdx<0b10, 1, 0b01, FPR32, "ldr">;
+def LDRDpost : LoadPostIdx<0b11, 1, 0b01, FPR64, "ldr">;
+def LDRQpost : LoadPostIdx<0b00, 1, 0b11, FPR128, "ldr">;
+
+// load sign-extended half-word
+def LDRSHWpost : LoadPostIdx<0b01, 0, 0b11, GPR32, "ldrsh">;
+def LDRSHXpost : LoadPostIdx<0b01, 0, 0b10, GPR64, "ldrsh">;
+
+// load sign-extended byte
+def LDRSBWpost : LoadPostIdx<0b00, 0, 0b11, GPR32, "ldrsb">;
+def LDRSBXpost : LoadPostIdx<0b00, 0, 0b10, GPR64, "ldrsb">;
+
+// load zero-extended byte
+def LDRBBpost : LoadPostIdx<0b00, 0, 0b01, GPR32, "ldrb">;
+def LDRHHpost : LoadPostIdx<0b01, 0, 0b01, GPR32, "ldrh">;
+
+// load sign-extended word
+def LDRSWpost : LoadPostIdx<0b10, 0, 0b10, GPR64, "ldrsw">;
 
 //===----------------------------------------------------------------------===//
-// Load-store exclusive instructions
+// Store instructions.
 //===----------------------------------------------------------------------===//
-// Contains: STXRB, STXRH, STXR, LDXRB, LDXRH, LDXR. STXP, LDXP, STLXRB,
-//           STLXRH, STLXR, LDAXRB, LDAXRH, LDAXR, STLXP, LDAXP, STLRB,
-//           STLRH, STLR, LDARB, LDARH, LDAR
-
-// Since these instructions have the undefined register bits set to 1 in
-// their canonical form, we need a post encoder method to set those bits
-// to 1 when encoding these instructions. We do this using the
-// fixLoadStoreExclusive function. This function has template parameters:
-//
-// fixLoadStoreExclusive<int hasRs, int hasRt2>
-//
-// hasRs indicates that the instruction uses the Rs field, so we won't set
-// it to 1 (and the same for Rt2). We don't need template parameters for
-// the other register fiels since Rt and Rn are always used.
-
-// This operand parses a GPR64xsp register, followed by an optional immediate
-// #0.
-def GPR64xsp0_asmoperand : AsmOperandClass {
-  let Name = "GPR64xsp0";
-  let PredicateMethod = "isWrappedReg";
-  let RenderMethod = "addRegOperands";
-  let ParserMethod = "ParseLSXAddressOperand";
-  // Diagnostics are provided by ParserMethod
-}
-
-def GPR64xsp0 : RegisterOperand<GPR64xsp> {
-  let ParserMatchClass = GPR64xsp0_asmoperand;
-}
-
-//===----------------------------------
-// Store-exclusive (releasing & normal)
-//===----------------------------------
-
-class A64I_SRexs_impl<bits<2> size, bits<3> opcode, string asm, dag outs,
-                        dag ins, list<dag> pat,
-                        InstrItinClass itin> :
-       A64I_LDSTex_stn <size,
-                        opcode{2}, 0, opcode{1}, opcode{0},
-                        outs, ins,
-                        !strconcat(asm, "\t$Rs, $Rt, [$Rn]"),
-                        pat, itin> {
-  let mayStore = 1;
-  let PostEncoderMethod = "fixLoadStoreExclusive<1,0>";
-  let Constraints = "@earlyclobber $Rs";
-}
-
-multiclass A64I_SRex<string asmstr, bits<3> opcode, string prefix> {
-  def _byte:  A64I_SRexs_impl<0b00, opcode, !strconcat(asmstr, "b"),
-                              (outs GPR32:$Rs), (ins GPR32:$Rt, GPR64xsp0:$Rn),
-                              [], NoItinerary>,
-              Sched<[WriteSt, ReadSt, ReadSt]>;
-
-  def _hword:  A64I_SRexs_impl<0b01, opcode, !strconcat(asmstr, "h"),
-                               (outs GPR32:$Rs), (ins GPR32:$Rt, GPR64xsp0:$Rn),
-                               [],NoItinerary>,
-               Sched<[WriteSt, ReadSt, ReadSt]>;
-
-  def _word:  A64I_SRexs_impl<0b10, opcode, asmstr,
-                              (outs GPR32:$Rs), (ins GPR32:$Rt, GPR64xsp0:$Rn),
-                              [], NoItinerary>,
-              Sched<[WriteSt, ReadSt, ReadSt]>;
-
-  def _dword: A64I_SRexs_impl<0b11, opcode, asmstr,
-                              (outs GPR32:$Rs), (ins GPR64:$Rt, GPR64xsp0:$Rn),
-                              [], NoItinerary>,
-              Sched<[WriteSt, ReadSt, ReadSt]>;
-}
-
-defm STXR  : A64I_SRex<"stxr",  0b000, "STXR">;
-defm STLXR : A64I_SRex<"stlxr", 0b001, "STLXR">;
-
-//===----------------------------------
-// Loads
-//===----------------------------------
-
-class A64I_LRexs_impl<bits<2> size, bits<3> opcode, string asm, dag outs,
-                        dag ins, list<dag> pat,
-                        InstrItinClass itin> :
-        A64I_LDSTex_tn <size,
-                        opcode{2}, 1, opcode{1}, opcode{0},
-                        outs, ins,
-                        !strconcat(asm, "\t$Rt, [$Rn]"),
-                        pat, itin> {
-  let mayLoad = 1;
-  let PostEncoderMethod = "fixLoadStoreExclusive<0,0>";
-}
 
-multiclass A64I_LRex<string asmstr, bits<3> opcode> {
-  def _byte:  A64I_LRexs_impl<0b00, opcode, !strconcat(asmstr, "b"),
-                            (outs GPR32:$Rt), (ins GPR64xsp0:$Rn),
-                            [], NoItinerary>,
-              Sched<[WriteLd]>;
-
-  def _hword:  A64I_LRexs_impl<0b01, opcode, !strconcat(asmstr, "h"),
-                            (outs GPR32:$Rt), (ins GPR64xsp0:$Rn),
-                            [], NoItinerary>,
-               Sched<[WriteLd]>;
-
-  def _word:  A64I_LRexs_impl<0b10, opcode, asmstr,
-                            (outs GPR32:$Rt), (ins GPR64xsp0:$Rn),
-                            [], NoItinerary>,
-              Sched<[WriteLd]>;
-
-  def _dword: A64I_LRexs_impl<0b11, opcode, asmstr,
-                            (outs GPR64:$Rt), (ins GPR64xsp0:$Rn),
-                            [], NoItinerary>,
-              Sched<[WriteLd]>;
+// Pair (indexed, offset)
+// FIXME: Use dedicated range-checked addressing mode operand here.
+defm STPW : StorePairOffset<0b00, 0, GPR32, simm7s4, "stp">;
+defm STPX : StorePairOffset<0b10, 0, GPR64, simm7s8, "stp">;
+defm STPS : StorePairOffset<0b00, 1, FPR32, simm7s4, "stp">;
+defm STPD : StorePairOffset<0b01, 1, FPR64, simm7s8, "stp">;
+defm STPQ : StorePairOffset<0b10, 1, FPR128, simm7s16, "stp">;
+
+// Pair (pre-indexed)
+def STPWpre : StorePairPreIdx<0b00, 0, GPR32, simm7s4, "stp">;
+def STPXpre : StorePairPreIdx<0b10, 0, GPR64, simm7s8, "stp">;
+def STPSpre : StorePairPreIdx<0b00, 1, FPR32, simm7s4, "stp">;
+def STPDpre : StorePairPreIdx<0b01, 1, FPR64, simm7s8, "stp">;
+def STPQpre : StorePairPreIdx<0b10, 1, FPR128, simm7s16, "stp">;
+
+// Pair (pre-indexed)
+def STPWpost : StorePairPostIdx<0b00, 0, GPR32, simm7s4, "stp">;
+def STPXpost : StorePairPostIdx<0b10, 0, GPR64, simm7s8, "stp">;
+def STPSpost : StorePairPostIdx<0b00, 1, FPR32, simm7s4, "stp">;
+def STPDpost : StorePairPostIdx<0b01, 1, FPR64, simm7s8, "stp">;
+def STPQpost : StorePairPostIdx<0b10, 1, FPR128, simm7s16, "stp">;
+
+// Pair (no allocate)
+defm STNPW : StorePairNoAlloc<0b00, 0, GPR32, simm7s4, "stnp">;
+defm STNPX : StorePairNoAlloc<0b10, 0, GPR64, simm7s8, "stnp">;
+defm STNPS : StorePairNoAlloc<0b00, 1, FPR32, simm7s4, "stnp">;
+defm STNPD : StorePairNoAlloc<0b01, 1, FPR64, simm7s8, "stnp">;
+defm STNPQ : StorePairNoAlloc<0b10, 1, FPR128, simm7s16, "stnp">;
+
+//---
+// (Register offset)
+
+// Integer
+defm STRBB : Store8RO< 0b00, 0, 0b00, GPR32, "strb", i32, truncstorei8>;
+defm STRHH : Store16RO<0b01, 0, 0b00, GPR32, "strh", i32, truncstorei16>;
+defm STRW  : Store32RO<0b10, 0, 0b00, GPR32, "str",  i32, store>;
+defm STRX  : Store64RO<0b11, 0, 0b00, GPR64, "str",  i64, store>;
+
+
+// Floating-point
+defm STRB : Store8RO< 0b00,  1, 0b00, FPR8,   "str", untyped, store>;
+defm STRH : Store16RO<0b01,  1, 0b00, FPR16,  "str", f16,     store>;
+defm STRS : Store32RO<0b10,  1, 0b00, FPR32,  "str", f32,     store>;
+defm STRD : Store64RO<0b11,  1, 0b00, FPR64,  "str", f64,     store>;
+defm STRQ : Store128RO<0b00, 1, 0b10, FPR128, "str", f128,    store>;
+
+multiclass TruncStoreFrom64ROPat<ROAddrMode ro, SDPatternOperator storeop,
+                                 Instruction STRW, Instruction STRX> {
+
+  def : Pat<(storeop GPR64:$Rt,
+                     (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)),
+            (STRW (EXTRACT_SUBREG GPR64:$Rt, sub_32),
+                  GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)>;
+
+  def : Pat<(storeop GPR64:$Rt,
+                     (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)),
+            (STRX (EXTRACT_SUBREG GPR64:$Rt, sub_32),
+                  GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)>;
 }
 
-defm LDXR  : A64I_LRex<"ldxr",  0b000>;
-defm LDAXR : A64I_LRex<"ldaxr", 0b001>;
-defm LDAR  : A64I_LRex<"ldar",  0b101>;
-
-class acquiring_load<PatFrag base>
-  : PatFrag<(ops node:$ptr), (base node:$ptr), [{
-  AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering();
-  return Ordering == Acquire || Ordering == SequentiallyConsistent;
-}]>;
-
-def atomic_load_acquire_8  : acquiring_load<atomic_load_8>;
-def atomic_load_acquire_16 : acquiring_load<atomic_load_16>;
-def atomic_load_acquire_32 : acquiring_load<atomic_load_32>;
-def atomic_load_acquire_64 : acquiring_load<atomic_load_64>;
-
-def : Pat<(atomic_load_acquire_8  i64:$Rn), (LDAR_byte  $Rn)>;
-def : Pat<(atomic_load_acquire_16 i64:$Rn), (LDAR_hword $Rn)>;
-def : Pat<(atomic_load_acquire_32 i64:$Rn), (LDAR_word  $Rn)>;
-def : Pat<(atomic_load_acquire_64 i64:$Rn), (LDAR_dword $Rn)>;
-
-//===----------------------------------
-// Store-release (no exclusivity)
-//===----------------------------------
-
-class A64I_SLexs_impl<bits<2> size, bits<3> opcode, string asm, dag outs,
-                        dag ins, list<dag> pat,
-                        InstrItinClass itin> :
-        A64I_LDSTex_tn <size,
-                        opcode{2}, 0, opcode{1}, opcode{0},
-                        outs, ins,
-                        !strconcat(asm, "\t$Rt, [$Rn]"),
-                        pat, itin> {
-  let mayStore = 1;
-  let PostEncoderMethod = "fixLoadStoreExclusive<0,0>";
-}
-
-class releasing_store<PatFrag base>
-  : PatFrag<(ops node:$ptr, node:$val), (base node:$ptr, node:$val), [{
-  AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering();
-  return Ordering == Release || Ordering == SequentiallyConsistent;
-}]>;
-
-def atomic_store_release_8  : releasing_store<atomic_store_8>;
-def atomic_store_release_16 : releasing_store<atomic_store_16>;
-def atomic_store_release_32 : releasing_store<atomic_store_32>;
-def atomic_store_release_64 : releasing_store<atomic_store_64>;
-
-multiclass A64I_SLex<string asmstr, bits<3> opcode, string prefix> {
-  def _byte:  A64I_SLexs_impl<0b00, opcode, !strconcat(asmstr, "b"),
-                            (outs), (ins GPR32:$Rt, GPR64xsp0:$Rn),
-                            [(atomic_store_release_8 i64:$Rn, i32:$Rt)],
-                            NoItinerary>,
-              Sched<[WriteSt, ReadSt, ReadSt]>;
-
-  def _hword:  A64I_SLexs_impl<0b01, opcode, !strconcat(asmstr, "h"),
-                           (outs), (ins GPR32:$Rt, GPR64xsp0:$Rn),
-                           [(atomic_store_release_16 i64:$Rn, i32:$Rt)],
-                           NoItinerary>,
-               Sched<[WriteSt, ReadSt, ReadSt]>;
-
-  def _word:  A64I_SLexs_impl<0b10, opcode, asmstr,
-                           (outs), (ins GPR32:$Rt, GPR64xsp0:$Rn),
-                           [(atomic_store_release_32 i64:$Rn, i32:$Rt)],
-                           NoItinerary>,
-              Sched<[WriteSt, ReadSt, ReadSt]>;
-
-  def _dword: A64I_SLexs_impl<0b11, opcode, asmstr,
-                           (outs), (ins GPR64:$Rt, GPR64xsp0:$Rn),
-                           [(atomic_store_release_64 i64:$Rn, i64:$Rt)],
-                           NoItinerary>,
-              Sched<[WriteSt, ReadSt, ReadSt]>;
-}
-
-defm STLR  : A64I_SLex<"stlr", 0b101, "STLR">;
-
-//===----------------------------------
-// Store-exclusive pair (releasing & normal)
-//===----------------------------------
-
-class A64I_SPexs_impl<bits<2> size, bits<3> opcode, string asm, dag outs,
-                        dag ins, list<dag> pat,
-                        InstrItinClass itin> :
-     A64I_LDSTex_stt2n <size,
-                        opcode{2}, 0, opcode{1}, opcode{0},
-                        outs, ins,
-                        !strconcat(asm, "\t$Rs, $Rt, $Rt2, [$Rn]"),
-                        pat, itin> {
-  let mayStore = 1;
-}
-
-
-multiclass A64I_SPex<string asmstr, bits<3> opcode> {
-  def _word:  A64I_SPexs_impl<0b10, opcode, asmstr, (outs),
-                            (ins GPR32:$Rs, GPR32:$Rt, GPR32:$Rt2,
-                                 GPR64xsp0:$Rn),
-                            [], NoItinerary>,
-              Sched<[WriteSt, ReadSt, ReadSt, ReadSt]>;
-
-  def _dword: A64I_SPexs_impl<0b11, opcode, asmstr, (outs),
-                            (ins GPR32:$Rs, GPR64:$Rt, GPR64:$Rt2,
-                                            GPR64xsp0:$Rn),
-                            [], NoItinerary>,
-              Sched<[WriteSt, ReadSt, ReadSt, ReadSt]>;
+let AddedComplexity = 10 in {
+  // truncstore i64
+  defm : TruncStoreFrom64ROPat<ro8,  truncstorei8,  STRBBroW, STRBBroX>;
+  defm : TruncStoreFrom64ROPat<ro16, truncstorei16, STRHHroW, STRHHroX>;
+  defm : TruncStoreFrom64ROPat<ro32, truncstorei32, STRWroW,  STRWroX>;
 }
 
-defm STXP  : A64I_SPex<"stxp", 0b010>;
-defm STLXP : A64I_SPex<"stlxp", 0b011>;
-
-//===----------------------------------
-// Load-exclusive pair (acquiring & normal)
-//===----------------------------------
-
-class A64I_LPexs_impl<bits<2> size, bits<3> opcode, string asm, dag outs,
-                        dag ins, list<dag> pat,
-                        InstrItinClass itin> :
-      A64I_LDSTex_tt2n <size,
-                        opcode{2}, 1, opcode{1}, opcode{0},
-                        outs, ins,
-                        !strconcat(asm, "\t$Rt, $Rt2, [$Rn]"),
-                        pat, itin>{
-  let mayLoad = 1;
-  let DecoderMethod = "DecodeLoadPairExclusiveInstruction";
-  let PostEncoderMethod = "fixLoadStoreExclusive<0,1>";
-}
+multiclass VecROStorePat<ROAddrMode ro, ValueType VecTy, RegisterClass FPR,
+                         Instruction STRW, Instruction STRX> {
+  def : Pat<(store (VecTy FPR:$Rt),
+                   (ro.Wpat GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)),
+            (STRW FPR:$Rt, GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend)>;
 
-multiclass A64I_LPex<string asmstr, bits<3> opcode> {
-  def _word:  A64I_LPexs_impl<0b10, opcode, asmstr,
-                            (outs GPR32:$Rt, GPR32:$Rt2),
-                            (ins GPR64xsp0:$Rn),
-                            [], NoItinerary>,
-              Sched<[WriteLd, WriteLd, ReadLd]>;
-
-  def _dword: A64I_LPexs_impl<0b11, opcode, asmstr,
-                            (outs GPR64:$Rt, GPR64:$Rt2),
-                            (ins GPR64xsp0:$Rn),
-                            [], NoItinerary>,
-              Sched<[WriteLd, WriteLd, ReadLd]>;
+  def : Pat<(store (VecTy FPR:$Rt),
+                   (ro.Xpat GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)),
+            (STRX FPR:$Rt, GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend)>;
 }
 
-defm LDXP  : A64I_LPex<"ldxp", 0b010>;
-defm LDAXP : A64I_LPex<"ldaxp", 0b011>;
+let AddedComplexity = 10 in {
+// Match all store 64 bits width whose type is compatible with FPR64
+let Predicates = [IsLE] in {
+  // We must use ST1 to store vectors in big-endian.
+  defm : VecROStorePat<ro64, v2i32, FPR64, STRDroW, STRDroX>;
+  defm : VecROStorePat<ro64, v2f32, FPR64, STRDroW, STRDroX>;
+  defm : VecROStorePat<ro64, v4i16, FPR64, STRDroW, STRDroX>;
+  defm : VecROStorePat<ro64, v8i8, FPR64, STRDroW, STRDroX>;
+}
+
+defm : VecROStorePat<ro64, v1i64, FPR64, STRDroW, STRDroX>;
+defm : VecROStorePat<ro64, v1f64, FPR64, STRDroW, STRDroX>;
+
+// Match all store 128 bits width whose type is compatible with FPR128
+let Predicates = [IsLE] in {
+  // We must use ST1 to store vectors in big-endian.
+  defm : VecROStorePat<ro128, v2i64, FPR128, STRQroW, STRQroX>;
+  defm : VecROStorePat<ro128, v2f64, FPR128, STRQroW, STRQroX>;
+  defm : VecROStorePat<ro128, v4i32, FPR128, STRQroW, STRQroX>;
+  defm : VecROStorePat<ro128, v4f32, FPR128, STRQroW, STRQroX>;
+  defm : VecROStorePat<ro128, v8i16, FPR128, STRQroW, STRQroX>;
+  defm : VecROStorePat<ro128, v16i8, FPR128, STRQroW, STRQroX>;
+}
+} // AddedComplexity = 10
+
+//---
+// (unsigned immediate)
+defm STRX : StoreUI<0b11, 0, 0b00, GPR64, uimm12s8, "str",
+                   [(store GPR64:$Rt,
+                            (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))]>;
+defm STRW : StoreUI<0b10, 0, 0b00, GPR32, uimm12s4, "str",
+                    [(store GPR32:$Rt,
+                            (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))]>;
+defm STRB : StoreUI<0b00, 1, 0b00, FPR8, uimm12s1, "str",
+                    [(store FPR8:$Rt,
+                            (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))]>;
+defm STRH : StoreUI<0b01, 1, 0b00, FPR16, uimm12s2, "str",
+                    [(store (f16 FPR16:$Rt),
+                            (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))]>;
+defm STRS : StoreUI<0b10, 1, 0b00, FPR32, uimm12s4, "str",
+                    [(store (f32 FPR32:$Rt),
+                            (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))]>;
+defm STRD : StoreUI<0b11, 1, 0b00, FPR64, uimm12s8, "str",
+                    [(store (f64 FPR64:$Rt),
+                            (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))]>;
+defm STRQ : StoreUI<0b00, 1, 0b10, FPR128, uimm12s16, "str", []>;
+
+defm STRHH : StoreUI<0b01, 0, 0b00, GPR32, uimm12s2, "strh",
+                     [(truncstorei16 GPR32:$Rt,
+                                     (am_indexed16 GPR64sp:$Rn,
+                                                   uimm12s2:$offset))]>;
+defm STRBB : StoreUI<0b00, 0, 0b00, GPR32, uimm12s1,  "strb",
+                     [(truncstorei8 GPR32:$Rt,
+                                    (am_indexed8 GPR64sp:$Rn,
+                                                 uimm12s1:$offset))]>;
+
+// Match all store 64 bits width whose type is compatible with FPR64
+let AddedComplexity = 10 in {
+let Predicates = [IsLE] in {
+  // We must use ST1 to store vectors in big-endian.
+  def : Pat<(store (v2f32 FPR64:$Rt),
+                   (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)),
+            (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>;
+  def : Pat<(store (v8i8 FPR64:$Rt),
+                   (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)),
+            (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>;
+  def : Pat<(store (v4i16 FPR64:$Rt),
+                   (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)),
+            (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>;
+  def : Pat<(store (v2i32 FPR64:$Rt),
+                   (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)),
+            (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>;
+}
+def : Pat<(store (v1f64 FPR64:$Rt),
+                 (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)),
+          (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>;
+def : Pat<(store (v1i64 FPR64:$Rt),
+                 (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)),
+          (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>;
+
+// Match all store 128 bits width whose type is compatible with FPR128
+let Predicates = [IsLE] in {
+  // We must use ST1 to store vectors in big-endian.
+  def : Pat<(store (v4f32 FPR128:$Rt),
+                   (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)),
+            (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>;
+  def : Pat<(store (v2f64 FPR128:$Rt),
+                   (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)),
+            (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>;
+  def : Pat<(store (v16i8 FPR128:$Rt),
+                   (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)),
+            (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>;
+  def : Pat<(store (v8i16 FPR128:$Rt),
+                   (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)),
+            (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>;
+  def : Pat<(store (v4i32 FPR128:$Rt),
+                   (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)),
+            (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>;
+  def : Pat<(store (v2i64 FPR128:$Rt),
+                   (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)),
+            (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>;
+}
+def : Pat<(store (f128  FPR128:$Rt),
+                 (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)),
+          (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>;
+
+// truncstore i64
+def : Pat<(truncstorei32 GPR64:$Rt,
+                         (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset)),
+  (STRWui (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, uimm12s4:$offset)>;
+def : Pat<(truncstorei16 GPR64:$Rt,
+                         (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset)),
+  (STRHHui (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, uimm12s2:$offset)>;
+def : Pat<(truncstorei8 GPR64:$Rt, (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset)),
+  (STRBBui (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, uimm12s1:$offset)>;
+
+} // AddedComplexity = 10
+
+//---
+// (unscaled immediate)
+defm STURX : StoreUnscaled<0b11, 0, 0b00, GPR64, "stur",
+                         [(store GPR64:$Rt,
+                                 (am_unscaled64 GPR64sp:$Rn, simm9:$offset))]>;
+defm STURW : StoreUnscaled<0b10, 0, 0b00, GPR32, "stur",
+                         [(store GPR32:$Rt,
+                                 (am_unscaled32 GPR64sp:$Rn, simm9:$offset))]>;
+defm STURB : StoreUnscaled<0b00, 1, 0b00, FPR8, "stur",
+                         [(store FPR8:$Rt,
+                                 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))]>;
+defm STURH : StoreUnscaled<0b01, 1, 0b00, FPR16, "stur",
+                         [(store (f16 FPR16:$Rt),
+                                 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))]>;
+defm STURS : StoreUnscaled<0b10, 1, 0b00, FPR32, "stur",
+                         [(store (f32 FPR32:$Rt),
+                                 (am_unscaled32 GPR64sp:$Rn, simm9:$offset))]>;
+defm STURD : StoreUnscaled<0b11, 1, 0b00, FPR64, "stur",
+                         [(store (f64 FPR64:$Rt),
+                                 (am_unscaled64 GPR64sp:$Rn, simm9:$offset))]>;
+defm STURQ : StoreUnscaled<0b00, 1, 0b10, FPR128, "stur",
+                         [(store (f128 FPR128:$Rt),
+                                 (am_unscaled128 GPR64sp:$Rn, simm9:$offset))]>;
+defm STURHH : StoreUnscaled<0b01, 0, 0b00, GPR32, "sturh",
+                         [(truncstorei16 GPR32:$Rt,
+                                 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))]>;
+defm STURBB : StoreUnscaled<0b00, 0, 0b00, GPR32, "sturb",
+                         [(truncstorei8 GPR32:$Rt,
+                                  (am_unscaled8 GPR64sp:$Rn, simm9:$offset))]>;
+
+// Match all store 64 bits width whose type is compatible with FPR64
+let Predicates = [IsLE] in {
+  // We must use ST1 to store vectors in big-endian.
+  def : Pat<(store (v2f32 FPR64:$Rt),
+                   (am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
+            (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+  def : Pat<(store (v8i8 FPR64:$Rt),
+                   (am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
+            (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+  def : Pat<(store (v4i16 FPR64:$Rt),
+                   (am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
+            (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+  def : Pat<(store (v2i32 FPR64:$Rt),
+                   (am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
+            (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+}
+def : Pat<(store (v1f64 FPR64:$Rt), (am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
+          (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+def : Pat<(store (v1i64 FPR64:$Rt), (am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
+          (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+
+// Match all store 128 bits width whose type is compatible with FPR128
+let Predicates = [IsLE] in {
+  // We must use ST1 to store vectors in big-endian.
+  def : Pat<(store (v4f32 FPR128:$Rt),
+                   (am_unscaled128 GPR64sp:$Rn, simm9:$offset)),
+            (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+  def : Pat<(store (v2f64 FPR128:$Rt),
+                   (am_unscaled128 GPR64sp:$Rn, simm9:$offset)),
+            (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+  def : Pat<(store (v16i8 FPR128:$Rt),
+                   (am_unscaled128 GPR64sp:$Rn, simm9:$offset)),
+            (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+  def : Pat<(store (v8i16 FPR128:$Rt),
+                   (am_unscaled128 GPR64sp:$Rn, simm9:$offset)),
+            (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+  def : Pat<(store (v4i32 FPR128:$Rt),
+                   (am_unscaled128 GPR64sp:$Rn, simm9:$offset)),
+            (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+  def : Pat<(store (v2i64 FPR128:$Rt),
+                   (am_unscaled128 GPR64sp:$Rn, simm9:$offset)),
+            (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+  def : Pat<(store (v2f64 FPR128:$Rt),
+                   (am_unscaled128 GPR64sp:$Rn, simm9:$offset)),
+            (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+}
+
+// unscaled i64 truncating stores
+def : Pat<(truncstorei32 GPR64:$Rt, (am_unscaled32 GPR64sp:$Rn, simm9:$offset)),
+  (STURWi (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, simm9:$offset)>;
+def : Pat<(truncstorei16 GPR64:$Rt, (am_unscaled16 GPR64sp:$Rn, simm9:$offset)),
+  (STURHHi (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, simm9:$offset)>;
+def : Pat<(truncstorei8 GPR64:$Rt, (am_unscaled8 GPR64sp:$Rn, simm9:$offset)),
+  (STURBBi (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$Rn, simm9:$offset)>;
+
+//---
+// STR mnemonics fall back to STUR for negative or unaligned offsets.
+def : InstAlias<"str $Rt, [$Rn, $offset]",
+                (STURXi GPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb64:$offset), 0>;
+def : InstAlias<"str $Rt, [$Rn, $offset]",
+                (STURWi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb32:$offset), 0>;
+def : InstAlias<"str $Rt, [$Rn, $offset]",
+                (STURBi FPR8:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>;
+def : InstAlias<"str $Rt, [$Rn, $offset]",
+                (STURHi FPR16:$Rt, GPR64sp:$Rn, simm9_offset_fb16:$offset), 0>;
+def : InstAlias<"str $Rt, [$Rn, $offset]",
+                (STURSi FPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb32:$offset), 0>;
+def : InstAlias<"str $Rt, [$Rn, $offset]",
+                (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9_offset_fb64:$offset), 0>;
+def : InstAlias<"str $Rt, [$Rn, $offset]",
+                (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9_offset_fb128:$offset), 0>;
+
+def : InstAlias<"strb $Rt, [$Rn, $offset]",
+                (STURBBi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb8:$offset), 0>;
+def : InstAlias<"strh $Rt, [$Rn, $offset]",
+                (STURHHi GPR32:$Rt, GPR64sp:$Rn, simm9_offset_fb16:$offset), 0>;
+
+//---
+// (unscaled immediate, unprivileged)
+defm STTRW : StoreUnprivileged<0b10, 0, 0b00, GPR32, "sttr">;
+defm STTRX : StoreUnprivileged<0b11, 0, 0b00, GPR64, "sttr">;
+
+defm STTRH : StoreUnprivileged<0b01, 0, 0b00, GPR32, "sttrh">;
+defm STTRB : StoreUnprivileged<0b00, 0, 0b00, GPR32, "sttrb">;
+
+//---
+// (immediate pre-indexed)
+def STRWpre : StorePreIdx<0b10, 0, 0b00, GPR32, "str",  pre_store, i32>;
+def STRXpre : StorePreIdx<0b11, 0, 0b00, GPR64, "str",  pre_store, i64>;
+def STRBpre : StorePreIdx<0b00, 1, 0b00, FPR8,  "str",  pre_store, untyped>;
+def STRHpre : StorePreIdx<0b01, 1, 0b00, FPR16, "str",  pre_store, f16>;
+def STRSpre : StorePreIdx<0b10, 1, 0b00, FPR32, "str",  pre_store, f32>;
+def STRDpre : StorePreIdx<0b11, 1, 0b00, FPR64, "str",  pre_store, f64>;
+def STRQpre : StorePreIdx<0b00, 1, 0b10, FPR128, "str", pre_store, f128>;
+
+def STRBBpre : StorePreIdx<0b00, 0, 0b00, GPR32, "strb", pre_truncsti8,  i32>;
+def STRHHpre : StorePreIdx<0b01, 0, 0b00, GPR32, "strh", pre_truncsti16, i32>;
+
+// truncstore i64
+def : Pat<(pre_truncsti32 GPR64:$Rt, GPR64sp:$addr, simm9:$off),
+  (STRWpre (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$addr,
+           simm9:$off)>;
+def : Pat<(pre_truncsti16 GPR64:$Rt, GPR64sp:$addr, simm9:$off),
+  (STRHHpre (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$addr,
+            simm9:$off)>;
+def : Pat<(pre_truncsti8 GPR64:$Rt, GPR64sp:$addr, simm9:$off),
+  (STRBBpre (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$addr,
+            simm9:$off)>;
+
+def : Pat<(pre_store (v8i8 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
+          (STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(pre_store (v4i16 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
+          (STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(pre_store (v2i32 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
+          (STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(pre_store (v2f32 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
+          (STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(pre_store (v1i64 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
+          (STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(pre_store (v1f64 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
+          (STRDpre FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
+
+def : Pat<(pre_store (v16i8 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
+          (STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(pre_store (v8i16 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
+          (STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(pre_store (v4i32 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
+          (STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(pre_store (v4f32 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
+          (STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(pre_store (v2i64 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
+          (STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(pre_store (v2f64 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
+          (STRQpre FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
+
+//---
+// (immediate post-indexed)
+def STRWpost : StorePostIdx<0b10, 0, 0b00, GPR32,  "str", post_store, i32>;
+def STRXpost : StorePostIdx<0b11, 0, 0b00, GPR64,  "str", post_store, i64>;
+def STRBpost : StorePostIdx<0b00, 1, 0b00, FPR8,   "str", post_store, untyped>;
+def STRHpost : StorePostIdx<0b01, 1, 0b00, FPR16,  "str", post_store, f16>;
+def STRSpost : StorePostIdx<0b10, 1, 0b00, FPR32,  "str", post_store, f32>;
+def STRDpost : StorePostIdx<0b11, 1, 0b00, FPR64,  "str", post_store, f64>;
+def STRQpost : StorePostIdx<0b00, 1, 0b10, FPR128, "str", post_store, f128>;
+
+def STRBBpost : StorePostIdx<0b00, 0, 0b00, GPR32, "strb", post_truncsti8, i32>;
+def STRHHpost : StorePostIdx<0b01, 0, 0b00, GPR32, "strh", post_truncsti16, i32>;
+
+// truncstore i64
+def : Pat<(post_truncsti32 GPR64:$Rt, GPR64sp:$addr, simm9:$off),
+  (STRWpost (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$addr,
+            simm9:$off)>;
+def : Pat<(post_truncsti16 GPR64:$Rt, GPR64sp:$addr, simm9:$off),
+  (STRHHpost (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$addr,
+             simm9:$off)>;
+def : Pat<(post_truncsti8 GPR64:$Rt, GPR64sp:$addr, simm9:$off),
+  (STRBBpost (EXTRACT_SUBREG GPR64:$Rt, sub_32), GPR64sp:$addr,
+             simm9:$off)>;
+
+def : Pat<(post_store (v8i8 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
+          (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(post_store (v4i16 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
+          (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(post_store (v2i32 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
+          (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(post_store (v2f32 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
+          (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(post_store (v1i64 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
+          (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(post_store (v1f64 FPR64:$Rt), GPR64sp:$addr, simm9:$off),
+          (STRDpost FPR64:$Rt, GPR64sp:$addr, simm9:$off)>;
+
+def : Pat<(post_store (v16i8 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
+          (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(post_store (v8i16 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
+          (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(post_store (v4i32 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
+          (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(post_store (v4f32 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
+          (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(post_store (v2i64 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
+          (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
+def : Pat<(post_store (v2f64 FPR128:$Rt), GPR64sp:$addr, simm9:$off),
+          (STRQpost FPR128:$Rt, GPR64sp:$addr, simm9:$off)>;
 
 //===----------------------------------------------------------------------===//
-// Load-store register (unscaled immediate) instructions
-//===----------------------------------------------------------------------===//
-// Contains: LDURB, LDURH, LDRUSB, LDRUSH, LDRUSW, STUR, STURB, STURH and PRFUM
-//
-// and
-//
-//===----------------------------------------------------------------------===//
-// Load-store register (register offset) instructions
-//===----------------------------------------------------------------------===//
-// Contains: LDRB, LDRH, LDRSB, LDRSH, LDRSW, STR, STRB, STRH and PRFM
-//
-// and
-//
-//===----------------------------------------------------------------------===//
-// Load-store register (unsigned immediate) instructions
+// Load/store exclusive instructions.
 //===----------------------------------------------------------------------===//
-// Contains: LDRB, LDRH, LDRSB, LDRSH, LDRSW, STR, STRB, STRH and PRFM
-//
-// and
-//
-//===----------------------------------------------------------------------===//
-// Load-store register (immediate post-indexed) instructions
-//===----------------------------------------------------------------------===//
-// Contains: STRB, STRH, STR, LDRB, LDRH, LDR, LDRSB, LDRSH, LDRSW
-//
-// and
-//
-//===----------------------------------------------------------------------===//
-// Load-store register (immediate pre-indexed) instructions
-//===----------------------------------------------------------------------===//
-// Contains: STRB, STRH, STR, LDRB, LDRH, LDR, LDRSB, LDRSH, LDRSW
-
-// Note that patterns are much later on in a completely separate section (they
-// need ADRPxi to be defined).
-
-//===-------------------------------
-// 1. Various operands needed
-//===-------------------------------
-
-//===-------------------------------
-// 1.1 Unsigned 12-bit immediate operands
-//===-------------------------------
-// The addressing mode for these instructions consists of an unsigned 12-bit
-// immediate which is scaled by the size of the memory access.
-//
-// We represent this in the MC layer by two operands:
-//     1. A base register.
-//     2. A 12-bit immediate: not multiplied by access size, so "LDR x0,[x0,#8]"
-//        would have '1' in this field.
-// This means that separate functions are needed for converting representations
-// which *are* aware of the intended access size.
-
-// Anything that creates an MCInst (Decoding, selection and AsmParsing) has to
-// know the access size via some means. An isolated operand does not have this
-// information unless told from here, which means we need separate tablegen
-// Operands for each access size. This multiclass takes care of instantiating
-// the correct template functions in the rest of the backend.
-
-//===-------------------------------
-// 1.1 Unsigned 12-bit immediate operands
-//===-------------------------------
-
-multiclass offsets_uimm12<int MemSize, string prefix> {
-  def uimm12_asmoperand : AsmOperandClass {
-    let Name = "OffsetUImm12_" # MemSize;
-    let PredicateMethod = "isOffsetUImm12<" # MemSize # ">";
-    let RenderMethod = "addOffsetUImm12Operands<" # MemSize # ">";
-    let DiagnosticType = "LoadStoreUImm12_" # MemSize;
-  }
 
-  // Pattern is really no more than an ImmLeaf, but predicated on MemSize which
-  // complicates things beyond TableGen's ken.
-  def uimm12 : Operand<i64>,
-               ComplexPattern<i64, 1, "SelectOffsetUImm12<" # MemSize # ">"> {
-    let ParserMatchClass
-      = !cast<AsmOperandClass>(prefix # uimm12_asmoperand);
+def LDARW  : LoadAcquire   <0b10, 1, 1, 0, 1, GPR32, "ldar">;
+def LDARX  : LoadAcquire   <0b11, 1, 1, 0, 1, GPR64, "ldar">;
+def LDARB  : LoadAcquire   <0b00, 1, 1, 0, 1, GPR32, "ldarb">;
+def LDARH  : LoadAcquire   <0b01, 1, 1, 0, 1, GPR32, "ldarh">;
 
-    let PrintMethod = "printOffsetUImm12Operand<" # MemSize # ">";
-    let EncoderMethod = "getOffsetUImm12OpValue<" # MemSize # ">";
-  }
-}
-
-defm byte_  : offsets_uimm12<1, "byte_">;
-defm hword_ : offsets_uimm12<2, "hword_">;
-defm word_  : offsets_uimm12<4, "word_">;
-defm dword_ : offsets_uimm12<8, "dword_">;
-defm qword_ : offsets_uimm12<16, "qword_">;
-
-//===-------------------------------
-// 1.1 Signed 9-bit immediate operands
-//===-------------------------------
-
-// The MCInst is expected to store the bit-wise encoding of the value,
-// which amounts to lopping off the extended sign bits.
-def SDXF_simm9 : SDNodeXForm<imm, [{
-  return CurDAG->getTargetConstant(N->getZExtValue() & 0x1ff, MVT::i32);
-}]>;
-
-def simm9_asmoperand : AsmOperandClass {
-  let Name = "SImm9";
-  let PredicateMethod = "isSImm<9>";
-  let RenderMethod = "addSImmOperands<9>";
-  let DiagnosticType = "LoadStoreSImm9";
-}
-
-def simm9 : Operand<i64>,
-            ImmLeaf<i64, [{ return Imm >= -0x100 && Imm <= 0xff; }],
-            SDXF_simm9> {
-  let PrintMethod = "printOffsetSImm9Operand";
-  let ParserMatchClass = simm9_asmoperand;
-}
-
-
-//===-------------------------------
-// 1.3 Register offset extensions
-//===-------------------------------
-
-// The assembly-syntax for these addressing-modes is:
-//    [<Xn|SP>, <R><m> {, <extend> {<amount>}}]
-//
-// The essential semantics are:
-//     + <amount> is a shift: #<log(transfer size)> or #0
-//     + <R> can be W or X.
-//     + If <R> is W, <extend> can be UXTW or SXTW
-//     + If <R> is X, <extend> can be LSL or SXTX
-//
-// The trickiest of those constraints is that Rm can be either GPR32 or GPR64,
-// which will need separate instructions for LLVM type-consistency. We'll also
-// need separate operands, of course.
-multiclass regexts<int MemSize, int RmSize, RegisterClass GPR,
-                   string Rm, string prefix> {
-  def regext_asmoperand : AsmOperandClass {
-    let Name = "AddrRegExtend_" # MemSize # "_" #  Rm;
-    let PredicateMethod = "isAddrRegExtend<" # MemSize # "," # RmSize # ">";
-    let RenderMethod = "addAddrRegExtendOperands<" # MemSize # ">";
-    let DiagnosticType = "LoadStoreExtend" # RmSize # "_" # MemSize;
-  }
+def LDAXRW : LoadExclusive <0b10, 0, 1, 0, 1, GPR32, "ldaxr">;
+def LDAXRX : LoadExclusive <0b11, 0, 1, 0, 1, GPR64, "ldaxr">;
+def LDAXRB : LoadExclusive <0b00, 0, 1, 0, 1, GPR32, "ldaxrb">;
+def LDAXRH : LoadExclusive <0b01, 0, 1, 0, 1, GPR32, "ldaxrh">;
 
-  def regext : Operand<i64> {
-    let PrintMethod
-      = "printAddrRegExtendOperand<" # MemSize # ", " # RmSize # ">";
+def LDXRW  : LoadExclusive <0b10, 0, 1, 0, 0, GPR32, "ldxr">;
+def LDXRX  : LoadExclusive <0b11, 0, 1, 0, 0, GPR64, "ldxr">;
+def LDXRB  : LoadExclusive <0b00, 0, 1, 0, 0, GPR32, "ldxrb">;
+def LDXRH  : LoadExclusive <0b01, 0, 1, 0, 0, GPR32, "ldxrh">;
 
-    let DecoderMethod = "DecodeAddrRegExtendOperand";
-    let ParserMatchClass
-      = !cast<AsmOperandClass>(prefix # regext_asmoperand);
-  }
-}
-
-multiclass regexts_wx<int MemSize, string prefix> {
-  // Rm is an X-register if LSL or SXTX are specified as the shift.
-  defm Xm_ : regexts<MemSize, 64, GPR64, "Xm", prefix # "Xm_">;
+def STLRW  : StoreRelease  <0b10, 1, 0, 0, 1, GPR32, "stlr">;
+def STLRX  : StoreRelease  <0b11, 1, 0, 0, 1, GPR64, "stlr">;
+def STLRB  : StoreRelease  <0b00, 1, 0, 0, 1, GPR32, "stlrb">;
+def STLRH  : StoreRelease  <0b01, 1, 0, 0, 1, GPR32, "stlrh">;
 
-  // Rm is a W-register if UXTW or SXTW are specified as the shift.
-  defm Wm_ : regexts<MemSize, 32, GPR32, "Wm", prefix # "Wm_">;
-}
+def STLXRW : StoreExclusive<0b10, 0, 0, 0, 1, GPR32, "stlxr">;
+def STLXRX : StoreExclusive<0b11, 0, 0, 0, 1, GPR64, "stlxr">;
+def STLXRB : StoreExclusive<0b00, 0, 0, 0, 1, GPR32, "stlxrb">;
+def STLXRH : StoreExclusive<0b01, 0, 0, 0, 1, GPR32, "stlxrh">;
 
-defm byte_  : regexts_wx<1, "byte_">;
-defm hword_ : regexts_wx<2, "hword_">;
-defm word_  : regexts_wx<4, "word_">;
-defm dword_ : regexts_wx<8, "dword_">;
-defm qword_ : regexts_wx<16, "qword_">;
+def STXRW  : StoreExclusive<0b10, 0, 0, 0, 0, GPR32, "stxr">;
+def STXRX  : StoreExclusive<0b11, 0, 0, 0, 0, GPR64, "stxr">;
+def STXRB  : StoreExclusive<0b00, 0, 0, 0, 0, GPR32, "stxrb">;
+def STXRH  : StoreExclusive<0b01, 0, 0, 0, 0, GPR32, "stxrh">;
 
+def LDAXPW : LoadExclusivePair<0b10, 0, 1, 1, 1, GPR32, "ldaxp">;
+def LDAXPX : LoadExclusivePair<0b11, 0, 1, 1, 1, GPR64, "ldaxp">;
 
-//===------------------------------
-// 2. The instructions themselves.
-//===------------------------------
+def LDXPW  : LoadExclusivePair<0b10, 0, 1, 1, 0, GPR32, "ldxp">;
+def LDXPX  : LoadExclusivePair<0b11, 0, 1, 1, 0, GPR64, "ldxp">;
 
-// We have the following instructions to implement:
-// |                 | B     | H     | W     | X      |
-// |-----------------+-------+-------+-------+--------|
-// | unsigned str    | STRB  | STRH  | STR   | STR    |
-// | unsigned ldr    | LDRB  | LDRH  | LDR   | LDR    |
-// | signed ldr to W | LDRSB | LDRSH | -     | -      |
-// | signed ldr to X | LDRSB | LDRSH | LDRSW | (PRFM) |
+def STLXPW : StoreExclusivePair<0b10, 0, 0, 1, 1, GPR32, "stlxp">;
+def STLXPX : StoreExclusivePair<0b11, 0, 0, 1, 1, GPR64, "stlxp">;
 
-// This will instantiate the LDR/STR instructions you'd expect to use for an
-// unsigned datatype (first two rows above) or floating-point register, which is
-// reasonably uniform across all access sizes.
-
-
-//===------------------------------
-// 2.1 Regular instructions
-//===------------------------------
-
-// This class covers the basic unsigned or irrelevantly-signed loads and stores,
-// to general-purpose and floating-point registers.
-
-class AddrParams<string prefix> {
-  Operand uimm12 = !cast<Operand>(prefix # "_uimm12");
-
-  Operand regextWm = !cast<Operand>(prefix # "_Wm_regext");
-  Operand regextXm = !cast<Operand>(prefix # "_Xm_regext");
-}
-
-def byte_addrparams : AddrParams<"byte">;
-def hword_addrparams : AddrParams<"hword">;
-def word_addrparams : AddrParams<"word">;
-def dword_addrparams : AddrParams<"dword">;
-def qword_addrparams : AddrParams<"qword">;
-
-multiclass A64I_LDRSTR_unsigned<string prefix, bits<2> size, bit v,
-                                bit high_opc, string asmsuffix,
-                                RegisterClass GPR, AddrParams params> {
-  // Unsigned immediate
-  def _STR : A64I_LSunsigimm<size, v, {high_opc, 0b0},
-                     (outs), (ins GPR:$Rt, GPR64xsp:$Rn, params.uimm12:$UImm12),
-                     "str" # asmsuffix # "\t$Rt, [$Rn, $UImm12]",
-                     [], NoItinerary>,
-             Sched<[WriteSt, ReadSt, ReadSt]> {
-    let mayStore = 1;
-  }
-  def : InstAlias<"str" # asmsuffix # " $Rt, [$Rn]",
-                (!cast<Instruction>(prefix # "_STR") GPR:$Rt, GPR64xsp:$Rn, 0)>;
-
-  def _LDR : A64I_LSunsigimm<size, v, {high_opc, 0b1},
-                      (outs GPR:$Rt), (ins GPR64xsp:$Rn, params.uimm12:$UImm12),
-                      "ldr" #  asmsuffix # "\t$Rt, [$Rn, $UImm12]",
-                      [], NoItinerary>,
-             Sched<[WriteLd, ReadLd]> {
-    let mayLoad = 1;
-  }
-  def : InstAlias<"ldr" # asmsuffix # " $Rt, [$Rn]",
-                (!cast<Instruction>(prefix # "_LDR") GPR:$Rt, GPR64xsp:$Rn, 0)>;
-
-  // Register offset (four of these: load/store and Wm/Xm).
-  let mayLoad = 1 in {
-    def _Wm_RegOffset_LDR : A64I_LSregoff<size, v, {high_opc, 0b1}, 0b0,
-                            (outs GPR:$Rt),
-                            (ins GPR64xsp:$Rn, GPR32:$Rm, params.regextWm:$Ext),
-                            "ldr" # asmsuffix # "\t$Rt, [$Rn, $Rm, $Ext]",
-                            [], NoItinerary>,
-                            Sched<[WriteLd, ReadLd, ReadLd]>;
-
-    def _Xm_RegOffset_LDR : A64I_LSregoff<size, v, {high_opc, 0b1}, 0b1,
-                            (outs GPR:$Rt),
-                            (ins GPR64xsp:$Rn, GPR64:$Rm, params.regextXm:$Ext),
-                            "ldr" # asmsuffix # "\t$Rt, [$Rn, $Rm, $Ext]",
-                            [], NoItinerary>,
-                            Sched<[WriteLd, ReadLd, ReadLd]>;
-  }
-  def : InstAlias<"ldr" # asmsuffix # " $Rt, [$Rn, $Rm]",
-        (!cast<Instruction>(prefix # "_Xm_RegOffset_LDR") GPR:$Rt, GPR64xsp:$Rn,
-                                                          GPR64:$Rm, 2)>;
-
-  let mayStore = 1 in {
-    def _Wm_RegOffset_STR : A64I_LSregoff<size, v, {high_opc, 0b0}, 0b0,
-                                  (outs), (ins GPR:$Rt, GPR64xsp:$Rn, GPR32:$Rm,
-                                               params.regextWm:$Ext),
-                                  "str" # asmsuffix # "\t$Rt, [$Rn, $Rm, $Ext]",
-                                  [], NoItinerary>,
-                            Sched<[WriteSt, ReadSt, ReadSt, ReadSt]>;
-
-    def _Xm_RegOffset_STR : A64I_LSregoff<size, v, {high_opc, 0b0}, 0b1,
-                                  (outs), (ins GPR:$Rt, GPR64xsp:$Rn, GPR64:$Rm,
-                                               params.regextXm:$Ext),
-                                  "str" # asmsuffix # "\t$Rt, [$Rn, $Rm, $Ext]",
-                                  [], NoItinerary>,
-                            Sched<[WriteSt, ReadSt, ReadSt, ReadSt]>;
-  }
-  def : InstAlias<"str" # asmsuffix # " $Rt, [$Rn, $Rm]",
-      (!cast<Instruction>(prefix # "_Xm_RegOffset_STR") GPR:$Rt, GPR64xsp:$Rn,
-                                                        GPR64:$Rm, 2)>;
-
-  // Unaligned immediate
-  def _STUR : A64I_LSunalimm<size, v, {high_opc, 0b0},
-                             (outs), (ins GPR:$Rt, GPR64xsp:$Rn, simm9:$SImm9),
-                             "stur" # asmsuffix # "\t$Rt, [$Rn, $SImm9]",
-                             [], NoItinerary>,
-              Sched<[WriteSt, ReadSt, ReadSt]> {
-    let mayStore = 1;
-  }
-  def : InstAlias<"stur" # asmsuffix # " $Rt, [$Rn]",
-               (!cast<Instruction>(prefix # "_STUR") GPR:$Rt, GPR64xsp:$Rn, 0)>;
-
-  def _LDUR : A64I_LSunalimm<size, v, {high_opc, 0b1},
-                             (outs GPR:$Rt), (ins GPR64xsp:$Rn, simm9:$SImm9),
-                             "ldur" # asmsuffix # "\t$Rt, [$Rn, $SImm9]",
-                             [], NoItinerary>,
-              Sched<[WriteLd, ReadLd]> {
-    let mayLoad = 1;
-  }
-  def : InstAlias<"ldur" # asmsuffix # " $Rt, [$Rn]",
-               (!cast<Instruction>(prefix # "_LDUR") GPR:$Rt, GPR64xsp:$Rn, 0)>;
-
-  // Post-indexed
-  def _PostInd_STR : A64I_LSpostind<size, v, {high_opc, 0b0},
-                               (outs GPR64xsp:$Rn_wb),
-                               (ins GPR:$Rt, GPR64xsp:$Rn, simm9:$SImm9),
-                               "str" # asmsuffix # "\t$Rt, [$Rn], $SImm9",
-                               [], NoItinerary>,
-                     Sched<[WriteSt, ReadSt, ReadSt]> {
-    let Constraints = "$Rn = $Rn_wb";
-    let mayStore = 1;
-
-    // Decoder only needed for unpredictability checking (FIXME).
-    let DecoderMethod = "DecodeSingleIndexedInstruction";
-  }
-
-  def _PostInd_LDR : A64I_LSpostind<size, v, {high_opc, 0b1},
-                                    (outs GPR:$Rt, GPR64xsp:$Rn_wb),
-                                    (ins GPR64xsp:$Rn, simm9:$SImm9),
-                                    "ldr" # asmsuffix # "\t$Rt, [$Rn], $SImm9",
-                                    [], NoItinerary>,
-                     Sched<[WriteLd, WriteLd, ReadLd]> {
-    let mayLoad = 1;
-    let Constraints = "$Rn = $Rn_wb";
-    let DecoderMethod = "DecodeSingleIndexedInstruction";
-  }
-
-  // Pre-indexed
-  def _PreInd_STR : A64I_LSpreind<size, v, {high_opc, 0b0},
-                               (outs GPR64xsp:$Rn_wb),
-                               (ins GPR:$Rt, GPR64xsp:$Rn, simm9:$SImm9),
-                               "str" # asmsuffix # "\t$Rt, [$Rn, $SImm9]!",
-                               [], NoItinerary>,
-                    Sched<[WriteSt, ReadSt, ReadSt]> {
-    let Constraints = "$Rn = $Rn_wb";
-    let mayStore = 1;
-
-    // Decoder only needed for unpredictability checking (FIXME).
-    let DecoderMethod = "DecodeSingleIndexedInstruction";
-  }
-
-  def _PreInd_LDR : A64I_LSpreind<size, v, {high_opc, 0b1},
-                                    (outs GPR:$Rt, GPR64xsp:$Rn_wb),
-                                    (ins GPR64xsp:$Rn, simm9:$SImm9),
-                                    "ldr" # asmsuffix # "\t$Rt, [$Rn, $SImm9]!",
-                                    [], NoItinerary>,
-                    Sched<[WriteLd, WriteLd, ReadLd]> {
-    let mayLoad = 1;
-    let Constraints = "$Rn = $Rn_wb";
-    let DecoderMethod = "DecodeSingleIndexedInstruction";
-  }
-
-}
-
-// STRB/LDRB: First define the instructions
-defm LS8
-  : A64I_LDRSTR_unsigned<"LS8", 0b00, 0b0, 0b0, "b", GPR32, byte_addrparams>;
-
-// STRH/LDRH
-defm LS16
-  : A64I_LDRSTR_unsigned<"LS16", 0b01, 0b0, 0b0, "h", GPR32, hword_addrparams>;
-
-
-// STR/LDR to/from a W register
-defm LS32
-  : A64I_LDRSTR_unsigned<"LS32", 0b10, 0b0, 0b0, "", GPR32, word_addrparams>;
-
-// STR/LDR to/from an X register
-defm LS64
-  : A64I_LDRSTR_unsigned<"LS64", 0b11, 0b0, 0b0, "", GPR64, dword_addrparams>;
-
-let Predicates = [HasFPARMv8] in {
-// STR/LDR to/from a B register
-defm LSFP8
-  : A64I_LDRSTR_unsigned<"LSFP8", 0b00, 0b1, 0b0, "", FPR8, byte_addrparams>;
-
-// STR/LDR to/from an H register
-defm LSFP16
-  : A64I_LDRSTR_unsigned<"LSFP16", 0b01, 0b1, 0b0, "", FPR16, hword_addrparams>;
-
-// STR/LDR to/from an S register
-defm LSFP32
-  : A64I_LDRSTR_unsigned<"LSFP32", 0b10, 0b1, 0b0, "", FPR32, word_addrparams>;
-// STR/LDR to/from a D register
-defm LSFP64
-  : A64I_LDRSTR_unsigned<"LSFP64", 0b11, 0b1, 0b0, "", FPR64, dword_addrparams>;
-// STR/LDR to/from a Q register
-defm LSFP128
-  : A64I_LDRSTR_unsigned<"LSFP128", 0b00, 0b1, 0b1, "", FPR128,
-                         qword_addrparams>;
-}
-
-//===------------------------------
-// 2.3 Signed loads
-//===------------------------------
-
-// Byte and half-word signed loads can both go into either an X or a W register,
-// so it's worth factoring out. Signed word loads don't fit because there is no
-// W version.
-multiclass A64I_LDR_signed<bits<2> size, string asmopcode, AddrParams params,
-                           string prefix> {
-  // Unsigned offset
-  def w : A64I_LSunsigimm<size, 0b0, 0b11,
-                          (outs GPR32:$Rt),
-                          (ins GPR64xsp:$Rn, params.uimm12:$UImm12),
-                          "ldrs" # asmopcode # "\t$Rt, [$Rn, $UImm12]",
-                          [], NoItinerary>,
-          Sched<[WriteLd, ReadLd]> {
-    let mayLoad = 1;
-  }
-  def : InstAlias<"ldrs" # asmopcode # " $Rt, [$Rn]",
-                  (!cast<Instruction>(prefix # w) GPR32:$Rt, GPR64xsp:$Rn, 0)>;
-
-  def x : A64I_LSunsigimm<size, 0b0, 0b10,
-                          (outs GPR64:$Rt),
-                          (ins GPR64xsp:$Rn, params.uimm12:$UImm12),
-                          "ldrs" # asmopcode # "\t$Rt, [$Rn, $UImm12]",
-                          [], NoItinerary>,
-          Sched<[WriteLd, ReadLd]> {
-    let mayLoad = 1;
-  }
-  def : InstAlias<"ldrs" # asmopcode # " $Rt, [$Rn]",
-                  (!cast<Instruction>(prefix # x) GPR64:$Rt, GPR64xsp:$Rn, 0)>;
-
-  // Register offset
-  let mayLoad = 1 in {
-    def w_Wm_RegOffset : A64I_LSregoff<size, 0b0, 0b11, 0b0,
-                            (outs GPR32:$Rt),
-                            (ins GPR64xsp:$Rn, GPR32:$Rm, params.regextWm:$Ext),
-                            "ldrs" # asmopcode # "\t$Rt, [$Rn, $Rm, $Ext]",
-                            [], NoItinerary>,
-                         Sched<[WriteLd, ReadLd, ReadLd]>;
-
-    def w_Xm_RegOffset : A64I_LSregoff<size, 0b0, 0b11, 0b1,
-                            (outs GPR32:$Rt),
-                            (ins GPR64xsp:$Rn, GPR64:$Rm, params.regextXm:$Ext),
-                            "ldrs" # asmopcode # "\t$Rt, [$Rn, $Rm, $Ext]",
-                            [], NoItinerary>,
-                         Sched<[WriteLd, ReadLd, ReadLd]>;
-
-    def x_Wm_RegOffset : A64I_LSregoff<size, 0b0, 0b10, 0b0,
-                            (outs GPR64:$Rt),
-                            (ins GPR64xsp:$Rn, GPR32:$Rm, params.regextWm:$Ext),
-                            "ldrs" # asmopcode # "\t$Rt, [$Rn, $Rm, $Ext]",
-                            [], NoItinerary>,
-                         Sched<[WriteLd, ReadLd, ReadLd]>;
-
-    def x_Xm_RegOffset : A64I_LSregoff<size, 0b0, 0b10, 0b1,
-                            (outs GPR64:$Rt),
-                            (ins GPR64xsp:$Rn, GPR64:$Rm, params.regextXm:$Ext),
-                            "ldrs" # asmopcode # "\t$Rt, [$Rn, $Rm, $Ext]",
-                            [], NoItinerary>,
-                         Sched<[WriteLd, ReadLd, ReadLd]>;
-  }
-  def : InstAlias<"ldrs" # asmopcode # " $Rt, [$Rn, $Rm]",
-        (!cast<Instruction>(prefix # "w_Xm_RegOffset") GPR32:$Rt, GPR64xsp:$Rn,
-                                                       GPR64:$Rm, 2)>;
-
-  def : InstAlias<"ldrs" # asmopcode # " $Rt, [$Rn, $Rm]",
-        (!cast<Instruction>(prefix # "x_Xm_RegOffset") GPR64:$Rt, GPR64xsp:$Rn,
-                                                       GPR64:$Rm, 2)>;
-
-
-  let mayLoad = 1 in {
-    // Unaligned offset
-    def w_U : A64I_LSunalimm<size, 0b0, 0b11,
-                             (outs GPR32:$Rt),
-                             (ins GPR64xsp:$Rn, simm9:$SImm9),
-                             "ldurs" # asmopcode # "\t$Rt, [$Rn, $SImm9]",
-                             [], NoItinerary>,
-              Sched<[WriteLd, ReadLd]>;
-
-    def x_U : A64I_LSunalimm<size, 0b0, 0b10,
-                             (outs GPR64:$Rt),
-                             (ins GPR64xsp:$Rn, simm9:$SImm9),
-                             "ldurs" # asmopcode # "\t$Rt, [$Rn, $SImm9]",
-                             [], NoItinerary>,
-              Sched<[WriteLd, ReadLd]>;
-
-
-    // Post-indexed
-    def w_PostInd : A64I_LSpostind<size, 0b0, 0b11,
-                                 (outs GPR32:$Rt, GPR64xsp:$Rn_wb),
-                                 (ins GPR64xsp:$Rn, simm9:$SImm9),
-                                 "ldrs" # asmopcode # "\t$Rt, [$Rn], $SImm9",
-                                 [], NoItinerary>,
-                    Sched<[WriteLd, WriteLd, ReadLd]> {
-      let Constraints = "$Rn = $Rn_wb";
-      let DecoderMethod = "DecodeSingleIndexedInstruction";
-    }
-
-    def x_PostInd : A64I_LSpostind<size, 0b0, 0b10,
-                                   (outs GPR64:$Rt, GPR64xsp:$Rn_wb),
-                                   (ins GPR64xsp:$Rn, simm9:$SImm9),
-                                   "ldrs" # asmopcode # "\t$Rt, [$Rn], $SImm9",
-                                   [], NoItinerary>,
-                    Sched<[WriteLd, WriteLd, ReadLd]> {
-      let Constraints = "$Rn = $Rn_wb";
-      let DecoderMethod = "DecodeSingleIndexedInstruction";
-    }
-
-    // Pre-indexed
-    def w_PreInd : A64I_LSpreind<size, 0b0, 0b11,
-                                 (outs GPR32:$Rt, GPR64xsp:$Rn_wb),
-                                 (ins GPR64xsp:$Rn, simm9:$SImm9),
-                                 "ldrs" # asmopcode # "\t$Rt, [$Rn, $SImm9]!",
-                                 [], NoItinerary>,
-                   Sched<[WriteLd, WriteLd, ReadLd]> {
-      let Constraints = "$Rn = $Rn_wb";
-      let DecoderMethod = "DecodeSingleIndexedInstruction";
-    }
-
-    def x_PreInd : A64I_LSpreind<size, 0b0, 0b10,
-                                 (outs GPR64:$Rt, GPR64xsp:$Rn_wb),
-                                 (ins GPR64xsp:$Rn, simm9:$SImm9),
-                                 "ldrs" # asmopcode # "\t$Rt, [$Rn, $SImm9]!",
-                                 [], NoItinerary>,
-                   Sched<[WriteLd, WriteLd, ReadLd]> {
-      let Constraints = "$Rn = $Rn_wb";
-      let DecoderMethod = "DecodeSingleIndexedInstruction";
-    }
-  } // let mayLoad = 1
-}
-
-// LDRSB
-defm LDRSB : A64I_LDR_signed<0b00, "b", byte_addrparams, "LDRSB">;
-// LDRSH
-defm LDRSH : A64I_LDR_signed<0b01, "h", hword_addrparams, "LDRSH">;
-
-// LDRSW: load a 32-bit register, sign-extending to 64-bits.
-def LDRSWx
-    : A64I_LSunsigimm<0b10, 0b0, 0b10,
-                    (outs GPR64:$Rt),
-                    (ins GPR64xsp:$Rn, word_uimm12:$UImm12),
-                    "ldrsw\t$Rt, [$Rn, $UImm12]",
-                    [], NoItinerary>,
-      Sched<[WriteLd, ReadLd]> {
-  let mayLoad = 1;
-}
-def : InstAlias<"ldrsw $Rt, [$Rn]", (LDRSWx GPR64:$Rt, GPR64xsp:$Rn, 0)>;
-
-let mayLoad = 1 in {
-  def LDRSWx_Wm_RegOffset : A64I_LSregoff<0b10, 0b0, 0b10, 0b0,
-                             (outs GPR64:$Rt),
-                             (ins GPR64xsp:$Rn, GPR32:$Rm, word_Wm_regext:$Ext),
-                             "ldrsw\t$Rt, [$Rn, $Rm, $Ext]",
-                             [], NoItinerary>,
-                            Sched<[WriteLd, ReadLd, ReadLd]>;
-
-  def LDRSWx_Xm_RegOffset : A64I_LSregoff<0b10, 0b0, 0b10, 0b1,
-                             (outs GPR64:$Rt),
-                             (ins GPR64xsp:$Rn, GPR64:$Rm, word_Xm_regext:$Ext),
-                             "ldrsw\t$Rt, [$Rn, $Rm, $Ext]",
-                             [], NoItinerary>,
-                            Sched<[WriteLd, ReadLd, ReadLd]>;
-}
-def : InstAlias<"ldrsw $Rt, [$Rn, $Rm]",
-                (LDRSWx_Xm_RegOffset GPR64:$Rt, GPR64xsp:$Rn, GPR64:$Rm, 2)>;
-
-
-def LDURSWx
-    : A64I_LSunalimm<0b10, 0b0, 0b10,
-                    (outs GPR64:$Rt),
-                    (ins GPR64xsp:$Rn, simm9:$SImm9),
-                    "ldursw\t$Rt, [$Rn, $SImm9]",
-                    [], NoItinerary>,
-      Sched<[WriteLd, ReadLd]> {
-  let mayLoad = 1;
-}
-def : InstAlias<"ldursw $Rt, [$Rn]", (LDURSWx GPR64:$Rt, GPR64xsp:$Rn, 0)>;
-
-def LDRSWx_PostInd
-    : A64I_LSpostind<0b10, 0b0, 0b10,
-                    (outs GPR64:$Rt, GPR64xsp:$Rn_wb),
-                    (ins GPR64xsp:$Rn, simm9:$SImm9),
-                    "ldrsw\t$Rt, [$Rn], $SImm9",
-                    [], NoItinerary>,
-      Sched<[WriteLd, WriteLd, ReadLd]> {
-  let mayLoad = 1;
-  let Constraints = "$Rn = $Rn_wb";
-  let DecoderMethod = "DecodeSingleIndexedInstruction";
-}
-
-def LDRSWx_PreInd : A64I_LSpreind<0b10, 0b0, 0b10,
-                                 (outs GPR64:$Rt, GPR64xsp:$Rn_wb),
-                                 (ins GPR64xsp:$Rn, simm9:$SImm9),
-                                 "ldrsw\t$Rt, [$Rn, $SImm9]!",
-                                 [], NoItinerary>,
-                    Sched<[WriteLd, WriteLd, ReadLd]> {
-  let mayLoad = 1;
-  let Constraints = "$Rn = $Rn_wb";
-  let DecoderMethod = "DecodeSingleIndexedInstruction";
-}
-
-//===------------------------------
-// 2.4 Prefetch operations
-//===------------------------------
-
-def PRFM : A64I_LSunsigimm<0b11, 0b0, 0b10, (outs),
-                 (ins prefetch_op:$Rt, GPR64xsp:$Rn, dword_uimm12:$UImm12),
-                 "prfm\t$Rt, [$Rn, $UImm12]",
-                 [], NoItinerary>,
-           Sched<[WritePreLd, ReadPreLd]> {
-  let mayLoad = 1;
-}
-def : InstAlias<"prfm $Rt, [$Rn]",
-                (PRFM prefetch_op:$Rt, GPR64xsp:$Rn, 0)>;
-
-let mayLoad = 1 in {
-  def PRFM_Wm_RegOffset : A64I_LSregoff<0b11, 0b0, 0b10, 0b0, (outs),
-                                        (ins prefetch_op:$Rt, GPR64xsp:$Rn,
-                                             GPR32:$Rm, dword_Wm_regext:$Ext),
-                                        "prfm\t$Rt, [$Rn, $Rm, $Ext]",
-                                        [], NoItinerary>,
-                          Sched<[WritePreLd, ReadPreLd]>;
-  def PRFM_Xm_RegOffset : A64I_LSregoff<0b11, 0b0, 0b10, 0b1, (outs),
-                                        (ins prefetch_op:$Rt, GPR64xsp:$Rn,
-                                             GPR64:$Rm, dword_Xm_regext:$Ext),
-                                        "prfm\t$Rt, [$Rn, $Rm, $Ext]",
-                                        [], NoItinerary>,
-                          Sched<[WritePreLd, ReadPreLd]>;
-}
-
-def : InstAlias<"prfm $Rt, [$Rn, $Rm]",
-                (PRFM_Xm_RegOffset prefetch_op:$Rt, GPR64xsp:$Rn,
-                                   GPR64:$Rm, 2)>;
-
-
-def PRFUM : A64I_LSunalimm<0b11, 0b0, 0b10, (outs),
-                         (ins prefetch_op:$Rt, GPR64xsp:$Rn, simm9:$SImm9),
-                         "prfum\t$Rt, [$Rn, $SImm9]",
-                         [], NoItinerary>,
-            Sched<[WritePreLd, ReadPreLd]> {
-  let mayLoad = 1;
-}
-def : InstAlias<"prfum $Rt, [$Rn]",
-                (PRFUM prefetch_op:$Rt, GPR64xsp:$Rn, 0)>;
+def STXPW  : StoreExclusivePair<0b10, 0, 0, 1, 0, GPR32, "stxp">;
+def STXPX  : StoreExclusivePair<0b11, 0, 0, 1, 0, GPR64, "stxp">;
 
 //===----------------------------------------------------------------------===//
-// Load-store register (unprivileged) instructions
+// Scaled floating point to integer conversion instructions.
 //===----------------------------------------------------------------------===//
-// Contains: LDTRB, LDTRH, LDTRSB, LDTRSH, LDTRSW, STTR, STTRB and STTRH
-
-// These instructions very much mirror the "unscaled immediate" loads, but since
-// there are no floating-point variants we need to split them out into their own
-// section to avoid instantiation of "ldtr d0, [sp]" etc.
-
-multiclass A64I_LDTRSTTR<bits<2> size, string asmsuffix, RegisterClass GPR,
-                         string prefix> {
-  def _UnPriv_STR : A64I_LSunpriv<size, 0b0, 0b00,
-                              (outs), (ins GPR:$Rt, GPR64xsp:$Rn, simm9:$SImm9),
-                              "sttr" # asmsuffix # "\t$Rt, [$Rn, $SImm9]",
-                              [], NoItinerary>,
-                    Sched<[WriteLd, ReadLd]> {
-    let mayStore = 1;
-  }
-
-  def : InstAlias<"sttr" # asmsuffix # " $Rt, [$Rn]",
-         (!cast<Instruction>(prefix # "_UnPriv_STR") GPR:$Rt, GPR64xsp:$Rn, 0)>;
-
-  def _UnPriv_LDR : A64I_LSunpriv<size, 0b0, 0b01,
-                               (outs GPR:$Rt), (ins GPR64xsp:$Rn, simm9:$SImm9),
-                               "ldtr" # asmsuffix # "\t$Rt, [$Rn, $SImm9]",
-                               [], NoItinerary>,
-                    Sched<[WriteLd, ReadLd]> {
-    let mayLoad = 1;
-  }
-
-  def : InstAlias<"ldtr" # asmsuffix # " $Rt, [$Rn]",
-         (!cast<Instruction>(prefix # "_UnPriv_LDR") GPR:$Rt, GPR64xsp:$Rn, 0)>;
-
-}
-
-// STTRB/LDTRB: First define the instructions
-defm LS8 : A64I_LDTRSTTR<0b00, "b", GPR32, "LS8">;
-
-// STTRH/LDTRH
-defm LS16 : A64I_LDTRSTTR<0b01, "h", GPR32, "LS16">;
-
-// STTR/LDTR to/from a W register
-defm LS32 : A64I_LDTRSTTR<0b10, "", GPR32, "LS32">;
-
-// STTR/LDTR to/from an X register
-defm LS64 : A64I_LDTRSTTR<0b11, "", GPR64, "LS64">;
-
-// Now a class for the signed instructions that can go to either 32 or 64
-// bits...
-multiclass A64I_LDTR_signed<bits<2> size, string asmopcode, string prefix> {
-  let mayLoad = 1 in {
-    def w : A64I_LSunpriv<size, 0b0, 0b11,
-                          (outs GPR32:$Rt),
-                          (ins GPR64xsp:$Rn, simm9:$SImm9),
-                          "ldtrs" # asmopcode # "\t$Rt, [$Rn, $SImm9]",
-                          [], NoItinerary>,
-            Sched<[WriteLd, ReadLd]>;
-
-    def x : A64I_LSunpriv<size, 0b0, 0b10,
-                          (outs GPR64:$Rt),
-                          (ins GPR64xsp:$Rn, simm9:$SImm9),
-                          "ldtrs" # asmopcode # "\t$Rt, [$Rn, $SImm9]",
-                          [], NoItinerary>,
-            Sched<[WriteLd, ReadLd]>;
-  }
-
-  def : InstAlias<"ldtrs" # asmopcode # " $Rt, [$Rn]",
-                 (!cast<Instruction>(prefix # "w") GPR32:$Rt, GPR64xsp:$Rn, 0)>;
-
-  def : InstAlias<"ldtrs" # asmopcode # " $Rt, [$Rn]",
-                 (!cast<Instruction>(prefix # "x") GPR64:$Rt, GPR64xsp:$Rn, 0)>;
-
-}
 
-// LDTRSB
-defm LDTRSB : A64I_LDTR_signed<0b00, "b", "LDTRSB">;
-// LDTRSH
-defm LDTRSH : A64I_LDTR_signed<0b01, "h", "LDTRSH">;
-
-// And finally LDTRSW which only goes to 64 bits.
-def LDTRSWx : A64I_LSunpriv<0b10, 0b0, 0b10,
-                            (outs GPR64:$Rt),
-                            (ins GPR64xsp:$Rn, simm9:$SImm9),
-                            "ldtrsw\t$Rt, [$Rn, $SImm9]",
-                            [], NoItinerary>,
-              Sched<[WriteLd, ReadLd]> {
-  let mayLoad = 1;
+defm FCVTAS : FPToIntegerUnscaled<0b00, 0b100, "fcvtas", int_aarch64_neon_fcvtas>;
+defm FCVTAU : FPToIntegerUnscaled<0b00, 0b101, "fcvtau", int_aarch64_neon_fcvtau>;
+defm FCVTMS : FPToIntegerUnscaled<0b10, 0b000, "fcvtms", int_aarch64_neon_fcvtms>;
+defm FCVTMU : FPToIntegerUnscaled<0b10, 0b001, "fcvtmu", int_aarch64_neon_fcvtmu>;
+defm FCVTNS : FPToIntegerUnscaled<0b00, 0b000, "fcvtns", int_aarch64_neon_fcvtns>;
+defm FCVTNU : FPToIntegerUnscaled<0b00, 0b001, "fcvtnu", int_aarch64_neon_fcvtnu>;
+defm FCVTPS : FPToIntegerUnscaled<0b01, 0b000, "fcvtps", int_aarch64_neon_fcvtps>;
+defm FCVTPU : FPToIntegerUnscaled<0b01, 0b001, "fcvtpu", int_aarch64_neon_fcvtpu>;
+defm FCVTZS : FPToIntegerUnscaled<0b11, 0b000, "fcvtzs", fp_to_sint>;
+defm FCVTZU : FPToIntegerUnscaled<0b11, 0b001, "fcvtzu", fp_to_uint>;
+defm FCVTZS : FPToIntegerScaled<0b11, 0b000, "fcvtzs", fp_to_sint>;
+defm FCVTZU : FPToIntegerScaled<0b11, 0b001, "fcvtzu", fp_to_uint>;
+let isCodeGenOnly = 1 in {
+defm FCVTZS_Int : FPToIntegerUnscaled<0b11, 0b000, "fcvtzs", int_aarch64_neon_fcvtzs>;
+defm FCVTZU_Int : FPToIntegerUnscaled<0b11, 0b001, "fcvtzu", int_aarch64_neon_fcvtzu>;
+defm FCVTZS_Int : FPToIntegerScaled<0b11, 0b000, "fcvtzs", int_aarch64_neon_fcvtzs>;
+defm FCVTZU_Int : FPToIntegerScaled<0b11, 0b001, "fcvtzu", int_aarch64_neon_fcvtzu>;
 }
-def : InstAlias<"ldtrsw $Rt, [$Rn]", (LDTRSWx GPR64:$Rt, GPR64xsp:$Rn, 0)>;
 
 //===----------------------------------------------------------------------===//
-// Load-store register pair (offset) instructions
+// Scaled integer to floating point conversion instructions.
 //===----------------------------------------------------------------------===//
-//
-// and
-//
-//===----------------------------------------------------------------------===//
-// Load-store register pair (post-indexed) instructions
-//===----------------------------------------------------------------------===//
-// Contains: STP, LDP, LDPSW
-//
-// and
-//
-//===----------------------------------------------------------------------===//
-// Load-store register pair (pre-indexed) instructions
-//===----------------------------------------------------------------------===//
-// Contains: STP, LDP, LDPSW
-//
-// and
-//
-//===----------------------------------------------------------------------===//
-// Load-store non-temporal register pair (offset) instructions
-//===----------------------------------------------------------------------===//
-// Contains: STNP, LDNP
-
-
-// Anything that creates an MCInst (Decoding, selection and AsmParsing) has to
-// know the access size via some means. An isolated operand does not have this
-// information unless told from here, which means we need separate tablegen
-// Operands for each access size. This multiclass takes care of instantiating
-// the correct template functions in the rest of the backend.
-
-multiclass offsets_simm7<string MemSize, string prefix> {
-  // The bare signed 7-bit immediate is used in post-indexed instructions, but
-  // because of the scaling performed a generic "simm7" operand isn't
-  // appropriate here either.
-  def simm7_asmoperand : AsmOperandClass {
-    let Name = "SImm7_Scaled" # MemSize;
-    let PredicateMethod = "isSImm7Scaled<" # MemSize # ">";
-    let RenderMethod = "addSImm7ScaledOperands<" # MemSize # ">";
-    let DiagnosticType = "LoadStoreSImm7_" # MemSize;
-  }
-
-  def simm7 : Operand<i64> {
-    let PrintMethod = "printSImm7ScaledOperand<" # MemSize # ">";
-    let ParserMatchClass = !cast<AsmOperandClass>(prefix # "simm7_asmoperand");
-  }
-}
-
-defm word_  : offsets_simm7<"4", "word_">;
-defm dword_ : offsets_simm7<"8", "dword_">;
-defm qword_ : offsets_simm7<"16", "qword_">;
-
-multiclass A64I_LSPsimple<bits<2> opc, bit v, RegisterClass SomeReg,
-                          Operand simm7, string prefix> {
-  def _STR : A64I_LSPoffset<opc, v, 0b0, (outs),
-                    (ins SomeReg:$Rt, SomeReg:$Rt2, GPR64xsp:$Rn, simm7:$SImm7),
-                    "stp\t$Rt, $Rt2, [$Rn, $SImm7]", [], NoItinerary>,
-             Sched<[WriteLd, ReadLd]> {
-    let mayStore = 1;
-    let DecoderMethod = "DecodeLDSTPairInstruction";
-  }
-  def : InstAlias<"stp $Rt, $Rt2, [$Rn]",
-                  (!cast<Instruction>(prefix # "_STR") SomeReg:$Rt,
-                                                SomeReg:$Rt2, GPR64xsp:$Rn, 0)>;
-
-  def _LDR : A64I_LSPoffset<opc, v, 0b1,
-                            (outs SomeReg:$Rt, SomeReg:$Rt2),
-                            (ins GPR64xsp:$Rn, simm7:$SImm7),
-                            "ldp\t$Rt, $Rt2, [$Rn, $SImm7]", [], NoItinerary>,
-             Sched<[WriteLd, WriteLd, ReadLd]> {
-    let mayLoad = 1;
-    let DecoderMethod = "DecodeLDSTPairInstruction";
-  }
-  def : InstAlias<"ldp $Rt, $Rt2, [$Rn]",
-                  (!cast<Instruction>(prefix # "_LDR") SomeReg:$Rt,
-                                                SomeReg:$Rt2, GPR64xsp:$Rn, 0)>;
-
-  def _PostInd_STR : A64I_LSPpostind<opc, v, 0b0,
-                               (outs GPR64xsp:$Rn_wb),
-                               (ins SomeReg:$Rt, SomeReg:$Rt2,
-                                    GPR64xsp:$Rn,
-                                    simm7:$SImm7),
-                               "stp\t$Rt, $Rt2, [$Rn], $SImm7",
-                               [], NoItinerary>,
-                     Sched<[WriteSt, ReadSt, ReadSt, ReadSt]> {
-    let mayStore = 1;
-    let Constraints = "$Rn = $Rn_wb";
-
-    // Decoder only needed for unpredictability checking (FIXME).
-    let DecoderMethod = "DecodeLDSTPairInstruction";
-  }
-
-  def _PostInd_LDR : A64I_LSPpostind<opc, v, 0b1,
-                        (outs SomeReg:$Rt, SomeReg:$Rt2, GPR64xsp:$Rn_wb),
-                        (ins GPR64xsp:$Rn, simm7:$SImm7),
-                        "ldp\t$Rt, $Rt2, [$Rn], $SImm7",
-                        [], NoItinerary>,
-                     Sched<[WriteLd, WriteLd, WriteLd, ReadLd]> {
-    let mayLoad = 1;
-    let Constraints = "$Rn = $Rn_wb";
-    let DecoderMethod = "DecodeLDSTPairInstruction";
-  }
-
-  def _PreInd_STR : A64I_LSPpreind<opc, v, 0b0, (outs GPR64xsp:$Rn_wb),
-                       (ins SomeReg:$Rt, SomeReg:$Rt2, GPR64xsp:$Rn, simm7:$SImm7),
-                       "stp\t$Rt, $Rt2, [$Rn, $SImm7]!",
-                       [], NoItinerary>,
-                    Sched<[WriteSt, ReadSt, ReadSt, ReadSt]> {
-    let mayStore = 1;
-    let Constraints = "$Rn = $Rn_wb";
-    let DecoderMethod = "DecodeLDSTPairInstruction";
-  }
-
-  def _PreInd_LDR : A64I_LSPpreind<opc, v, 0b1,
-                              (outs SomeReg:$Rt, SomeReg:$Rt2, GPR64xsp:$Rn_wb),
-                              (ins GPR64xsp:$Rn, simm7:$SImm7),
-                              "ldp\t$Rt, $Rt2, [$Rn, $SImm7]!",
-                              [], NoItinerary>,
-                    Sched<[WriteLd, WriteLd, WriteLd, ReadLd]> {
-    let mayLoad = 1;
-    let Constraints = "$Rn = $Rn_wb";
-    let DecoderMethod = "DecodeLDSTPairInstruction";
-  }
 
-  def _NonTemp_STR : A64I_LSPnontemp<opc, v, 0b0, (outs),
-                       (ins SomeReg:$Rt, SomeReg:$Rt2, GPR64xsp:$Rn, simm7:$SImm7),
-                       "stnp\t$Rt, $Rt2, [$Rn, $SImm7]", [], NoItinerary>,
-                     Sched<[WriteSt, ReadSt, ReadSt, ReadSt]> {
-    let mayStore = 1;
-    let DecoderMethod = "DecodeLDSTPairInstruction";
-  }
-  def : InstAlias<"stnp $Rt, $Rt2, [$Rn]",
-                  (!cast<Instruction>(prefix # "_NonTemp_STR") SomeReg:$Rt,
-                                                SomeReg:$Rt2, GPR64xsp:$Rn, 0)>;
-
-  def _NonTemp_LDR : A64I_LSPnontemp<opc, v, 0b1,
-                            (outs SomeReg:$Rt, SomeReg:$Rt2),
-                            (ins GPR64xsp:$Rn, simm7:$SImm7),
-                            "ldnp\t$Rt, $Rt2, [$Rn, $SImm7]", [], NoItinerary>,
-                     Sched<[WriteLd, WriteLd, ReadLd]> {
-    let mayLoad = 1;
-    let DecoderMethod = "DecodeLDSTPairInstruction";
-  }
-  def : InstAlias<"ldnp $Rt, $Rt2, [$Rn]",
-                  (!cast<Instruction>(prefix # "_NonTemp_LDR") SomeReg:$Rt,
-                                                SomeReg:$Rt2, GPR64xsp:$Rn, 0)>;
-
-}
-
-
-defm LSPair32 : A64I_LSPsimple<0b00, 0b0, GPR32, word_simm7, "LSPair32">;
-defm LSPair64 : A64I_LSPsimple<0b10, 0b0, GPR64, dword_simm7, "LSPair64">;
-
-let Predicates = [HasFPARMv8] in {
-defm LSFPPair32 : A64I_LSPsimple<0b00, 0b1, FPR32, word_simm7, "LSFPPair32">;
-defm LSFPPair64 : A64I_LSPsimple<0b01, 0b1, FPR64,  dword_simm7, "LSFPPair64">;
-defm LSFPPair128 : A64I_LSPsimple<0b10, 0b1, FPR128, qword_simm7,
-                                  "LSFPPair128">;
-}
-
-
-def LDPSWx : A64I_LSPoffset<0b01, 0b0, 0b1,
-                           (outs GPR64:$Rt, GPR64:$Rt2),
-                           (ins GPR64xsp:$Rn, word_simm7:$SImm7),
-                           "ldpsw\t$Rt, $Rt2, [$Rn, $SImm7]", [], NoItinerary>,
-             Sched<[WriteLd, WriteLd, ReadLd]> {
-  let mayLoad = 1;
-  let DecoderMethod = "DecodeLDSTPairInstruction";
-}
-def : InstAlias<"ldpsw $Rt, $Rt2, [$Rn]",
-                (LDPSWx GPR64:$Rt, GPR64:$Rt2, GPR64xsp:$Rn, 0)>;
-
-def LDPSWx_PostInd : A64I_LSPpostind<0b01, 0b0, 0b1,
-                                  (outs GPR64:$Rt, GPR64:$Rt2, GPR64:$Rn_wb),
-                                  (ins GPR64xsp:$Rn, word_simm7:$SImm7),
-                                  "ldpsw\t$Rt, $Rt2, [$Rn], $SImm7",
-                                  [], NoItinerary>,
-                     Sched<[WriteLd, WriteLd, WriteLd, ReadLd]> {
-  let mayLoad = 1;
-  let Constraints = "$Rn = $Rn_wb";
-  let DecoderMethod = "DecodeLDSTPairInstruction";
-}
-
-def LDPSWx_PreInd : A64I_LSPpreind<0b01, 0b0, 0b1,
-                                   (outs GPR64:$Rt, GPR64:$Rt2, GPR64:$Rn_wb),
-                                   (ins GPR64xsp:$Rn, word_simm7:$SImm7),
-                                   "ldpsw\t$Rt, $Rt2, [$Rn, $SImm7]!",
-                                   [], NoItinerary>,
-                    Sched<[WriteLd, WriteLd, WriteLd, ReadLd]> {
-  let mayLoad = 1;
-  let Constraints = "$Rn = $Rn_wb";
-  let DecoderMethod = "DecodeLDSTPairInstruction";
-}
+defm SCVTF : IntegerToFP<0, "scvtf", sint_to_fp>;
+defm UCVTF : IntegerToFP<1, "ucvtf", uint_to_fp>;
 
 //===----------------------------------------------------------------------===//
-// Logical (immediate) instructions
+// Unscaled integer to floating point conversion instruction.
 //===----------------------------------------------------------------------===//
-// Contains: AND, ORR, EOR, ANDS, + aliases TST, MOV
 
-multiclass logical_imm_operands<string prefix, string note,
-                                int size, ValueType VT> {
-  def _asmoperand : AsmOperandClass {
-    let Name = "LogicalImm" # note # size;
-    let PredicateMethod = "isLogicalImm" # note # "<" # size # ">";
-    let RenderMethod = "addLogicalImmOperands<" # size # ">";
-    let DiagnosticType = "LogicalSecondSource";
-  }
+defm FMOV : UnscaledConversion<"fmov">;
 
-  def _operand
-        : Operand<VT>, ComplexPattern<VT, 1, "SelectLogicalImm", [imm]> {
-    let ParserMatchClass = !cast<AsmOperandClass>(prefix # "_asmoperand");
-    let PrintMethod = "printLogicalImmOperand<" # size # ">";
-    let DecoderMethod = "DecodeLogicalImmOperand<" # size # ">";
-  }
-}
-
-defm logical_imm32 : logical_imm_operands<"logical_imm32", "", 32, i32>;
-defm logical_imm64 : logical_imm_operands<"logical_imm64", "", 64, i64>;
-
-// The mov versions only differ in assembly parsing, where they
-// exclude values representable with either MOVZ or MOVN.
-defm logical_imm32_mov
-  : logical_imm_operands<"logical_imm32_mov", "MOV", 32, i32>;
-defm logical_imm64_mov
-  : logical_imm_operands<"logical_imm64_mov", "MOV", 64, i64>;
-
-
-multiclass A64I_logimmSizes<bits<2> opc, string asmop, SDNode opnode> {
-  def wwi : A64I_logicalimm<0b0, opc, (outs GPR32wsp:$Rd),
-                         (ins GPR32:$Rn, logical_imm32_operand:$Imm),
-                         !strconcat(asmop, "\t$Rd, $Rn, $Imm"),
-                         [(set i32:$Rd,
-                               (opnode i32:$Rn, logical_imm32_operand:$Imm))],
-                         NoItinerary>,
-            Sched<[WriteALU, ReadALU]>;
-
-  def xxi : A64I_logicalimm<0b1, opc, (outs GPR64xsp:$Rd),
-                         (ins GPR64:$Rn, logical_imm64_operand:$Imm),
-                         !strconcat(asmop, "\t$Rd, $Rn, $Imm"),
-                         [(set i64:$Rd,
-                               (opnode i64:$Rn, logical_imm64_operand:$Imm))],
-                         NoItinerary>,
-            Sched<[WriteALU, ReadALU]>;
-}
-
-defm AND : A64I_logimmSizes<0b00, "and", and>;
-defm ORR : A64I_logimmSizes<0b01, "orr", or>;
-defm EOR : A64I_logimmSizes<0b10, "eor", xor>;
-
-let Defs = [NZCV] in {
-  def ANDSwwi : A64I_logicalimm<0b0, 0b11, (outs GPR32:$Rd),
-                                (ins GPR32:$Rn, logical_imm32_operand:$Imm),
-                                "ands\t$Rd, $Rn, $Imm",
-                                [], NoItinerary>,
-                Sched<[WriteALU, ReadALU]>;
-
-  def ANDSxxi : A64I_logicalimm<0b1, 0b11, (outs GPR64:$Rd),
-                                (ins GPR64:$Rn, logical_imm64_operand:$Imm),
-                                "ands\t$Rd, $Rn, $Imm",
-                                [], NoItinerary>,
-                Sched<[WriteALU, ReadALU]>;
-}
-
-
-def : InstAlias<"tst $Rn, $Imm",
-                (ANDSwwi WZR, GPR32:$Rn, logical_imm32_operand:$Imm)>;
-def : InstAlias<"tst $Rn, $Imm",
-                (ANDSxxi XZR, GPR64:$Rn, logical_imm64_operand:$Imm)>;
-def : InstAlias<"mov $Rd, $Imm",
-                (ORRwwi GPR32wsp:$Rd, WZR, logical_imm32_mov_operand:$Imm)>;
-def : InstAlias<"mov $Rd, $Imm",
-                (ORRxxi GPR64xsp:$Rd, XZR, logical_imm64_mov_operand:$Imm)>;
+def : Pat<(f32 (fpimm0)), (FMOVWSr WZR)>, Requires<[NoZCZ]>;
+def : Pat<(f64 (fpimm0)), (FMOVXDr XZR)>, Requires<[NoZCZ]>;
 
 //===----------------------------------------------------------------------===//
-// Logical (shifted register) instructions
+// Floating point conversion instruction.
 //===----------------------------------------------------------------------===//
-// Contains: AND, BIC, ORR, ORN, EOR, EON, ANDS, BICS + aliases TST, MVN, MOV
-
-// Operand for optimizing (icmp (and LHS, RHS), 0, SomeCode). In theory "ANDS"
-// behaves differently for unsigned comparisons, so we defensively only allow
-// signed or n/a as the operand. In practice "unsigned greater than 0" is "not
-// equal to 0" and LLVM gives us this.
-def signed_cond : PatLeaf<(cond), [{
-  return !isUnsignedIntSetCC(N->get());
-}]>;
 
+defm FCVT : FPConversion<"fcvt">;
 
-// These instructions share their "shift" operands with add/sub (shifted
-// register instructions). They are defined there.
-
-// N.b. the commutable parameter is just !N. It will be first against the wall
-// when the revolution comes.
-multiclass logical_shifts<string prefix, bit sf, bits<2> opc,
-                          bit N, bit commutable,
-                          string asmop, SDPatternOperator opfrag, ValueType ty,
-                          RegisterClass GPR, list<Register> defs> {
-  let isCommutable = commutable, Defs = defs in {
-  def _lsl : A64I_logicalshift<sf, opc, 0b00, N,
-                       (outs GPR:$Rd),
-                       (ins GPR:$Rn, GPR:$Rm,
-                            !cast<Operand>("lsl_operand_" # ty):$Imm6),
-                       !strconcat(asmop, "\t$Rd, $Rn, $Rm, $Imm6"),
-                       [(set ty:$Rd, (opfrag ty:$Rn, (shl ty:$Rm,
-                            !cast<Operand>("lsl_operand_" # ty):$Imm6))
-                       )],
-                       NoItinerary>,
-             Sched<[WriteALU, ReadALU, ReadALU]>;
-
-  def _lsr : A64I_logicalshift<sf, opc, 0b01, N,
-                       (outs GPR:$Rd),
-                       (ins GPR:$Rn, GPR:$Rm,
-                            !cast<Operand>("lsr_operand_" # ty):$Imm6),
-                       !strconcat(asmop, "\t$Rd, $Rn, $Rm, $Imm6"),
-                       [(set ty:$Rd, (opfrag ty:$Rn, (srl ty:$Rm,
-                            !cast<Operand>("lsr_operand_" # ty):$Imm6))
-                       )],
-                       NoItinerary>,
-             Sched<[WriteALU, ReadALU, ReadALU]>;
-
-  def _asr : A64I_logicalshift<sf, opc, 0b10, N,
-                       (outs GPR:$Rd),
-                       (ins GPR:$Rn, GPR:$Rm,
-                            !cast<Operand>("asr_operand_" # ty):$Imm6),
-                       !strconcat(asmop, "\t$Rd, $Rn, $Rm, $Imm6"),
-                       [(set ty:$Rd, (opfrag ty:$Rn, (sra ty:$Rm,
-                            !cast<Operand>("asr_operand_" # ty):$Imm6))
-                       )],
-                       NoItinerary>,
-             Sched<[WriteALU, ReadALU, ReadALU]>;
-
-  def _ror : A64I_logicalshift<sf, opc, 0b11, N,
-                       (outs GPR:$Rd),
-                       (ins GPR:$Rn, GPR:$Rm,
-                            !cast<Operand>("ror_operand_" # ty):$Imm6),
-                       !strconcat(asmop, "\t$Rd, $Rn, $Rm, $Imm6"),
-                       [(set ty:$Rd, (opfrag ty:$Rn, (rotr ty:$Rm,
-                            !cast<Operand>("ror_operand_" # ty):$Imm6))
-                       )],
-                       NoItinerary>,
-             Sched<[WriteALU, ReadALU, ReadALU]>;
-  }
+def : Pat<(f32_to_f16 FPR32:$Rn),
+          (i32 (COPY_TO_REGCLASS
+                   (f32 (SUBREG_TO_REG (i32 0), (FCVTHSr FPR32:$Rn), hsub)),
+                   GPR32))>;
 
-  def _noshift
-      : InstAlias<!strconcat(asmop, " $Rd, $Rn, $Rm"),
-                 (!cast<Instruction>(prefix # "_lsl") GPR:$Rd, GPR:$Rn,
-                                                      GPR:$Rm, 0)>;
-
-  def : Pat<(opfrag ty:$Rn, ty:$Rm),
-            (!cast<Instruction>(prefix # "_lsl") $Rn, $Rm, 0)>;
-}
-
-multiclass logical_sizes<string prefix, bits<2> opc, bit N, bit commutable,
-                         string asmop, SDPatternOperator opfrag,
-                         list<Register> defs> {
-  defm xxx : logical_shifts<prefix # "xxx", 0b1, opc, N,
-                            commutable, asmop, opfrag, i64, GPR64, defs>;
-  defm www : logical_shifts<prefix # "www", 0b0, opc, N,
-                            commutable, asmop, opfrag, i32, GPR32, defs>;
-}
-
-
-defm AND : logical_sizes<"AND", 0b00, 0b0, 0b1, "and", and, []>;
-defm ORR : logical_sizes<"ORR", 0b01, 0b0, 0b1, "orr", or, []>;
-defm EOR : logical_sizes<"EOR", 0b10, 0b0, 0b1, "eor", xor, []>;
-defm ANDS : logical_sizes<"ANDS", 0b11, 0b0, 0b1, "ands",
-             PatFrag<(ops node:$lhs, node:$rhs), (and node:$lhs, node:$rhs),
-                     [{ (void)N; return false; }]>,
-             [NZCV]>;
-
-defm BIC : logical_sizes<"BIC", 0b00, 0b1, 0b0, "bic",
-                         PatFrag<(ops node:$lhs, node:$rhs),
-                                 (and node:$lhs, (not node:$rhs))>, []>;
-defm ORN : logical_sizes<"ORN", 0b01, 0b1, 0b0, "orn",
-                         PatFrag<(ops node:$lhs, node:$rhs),
-                                 (or node:$lhs, (not node:$rhs))>, []>;
-defm EON : logical_sizes<"EON", 0b10, 0b1, 0b0, "eon",
-                         PatFrag<(ops node:$lhs, node:$rhs),
-                                 (xor node:$lhs, (not node:$rhs))>, []>;
-defm BICS : logical_sizes<"BICS", 0b11, 0b1, 0b0, "bics",
-                          PatFrag<(ops node:$lhs, node:$rhs),
-                                  (and node:$lhs, (not node:$rhs)),
-                                  [{ (void)N; return false; }]>,
-                          [NZCV]>;
-
-multiclass tst_shifts<string prefix, bit sf, ValueType ty, RegisterClass GPR> {
-  let isCommutable = 1, Rd = 0b11111, Defs = [NZCV] in {
-  def _lsl : A64I_logicalshift<sf, 0b11, 0b00, 0b0,
-                       (outs),
-                       (ins GPR:$Rn, GPR:$Rm,
-                            !cast<Operand>("lsl_operand_" # ty):$Imm6),
-                       "tst\t$Rn, $Rm, $Imm6",
-                       [(set NZCV, (A64setcc (and ty:$Rn, (shl ty:$Rm,
-                           !cast<Operand>("lsl_operand_" # ty):$Imm6)),
-                                          0, signed_cond))],
-                       NoItinerary>,
-             Sched<[WriteALU, ReadALU, ReadALU]>;
-
-
-  def _lsr : A64I_logicalshift<sf, 0b11, 0b01, 0b0,
-                       (outs),
-                       (ins GPR:$Rn, GPR:$Rm,
-                            !cast<Operand>("lsr_operand_" # ty):$Imm6),
-                       "tst\t$Rn, $Rm, $Imm6",
-                       [(set NZCV, (A64setcc (and ty:$Rn, (srl ty:$Rm,
-                           !cast<Operand>("lsr_operand_" # ty):$Imm6)),
-                                          0, signed_cond))],
-                       NoItinerary>,
-             Sched<[WriteALU, ReadALU, ReadALU]>;
-
-  def _asr : A64I_logicalshift<sf, 0b11, 0b10, 0b0,
-                       (outs),
-                       (ins GPR:$Rn, GPR:$Rm,
-                            !cast<Operand>("asr_operand_" # ty):$Imm6),
-                       "tst\t$Rn, $Rm, $Imm6",
-                       [(set NZCV, (A64setcc (and ty:$Rn, (sra ty:$Rm,
-                           !cast<Operand>("asr_operand_" # ty):$Imm6)),
-                                          0, signed_cond))],
-                       NoItinerary>,
-             Sched<[WriteALU, ReadALU, ReadALU]>;
-
-  def _ror : A64I_logicalshift<sf, 0b11, 0b11, 0b0,
-                       (outs),
-                       (ins GPR:$Rn, GPR:$Rm,
-                            !cast<Operand>("ror_operand_" # ty):$Imm6),
-                       "tst\t$Rn, $Rm, $Imm6",
-                       [(set NZCV, (A64setcc (and ty:$Rn, (rotr ty:$Rm,
-                           !cast<Operand>("ror_operand_" # ty):$Imm6)),
-                                          0, signed_cond))],
-                       NoItinerary>,
-             Sched<[WriteALU, ReadALU, ReadALU]>;
-  }
-
-  def _noshift : InstAlias<"tst $Rn, $Rm",
-                     (!cast<Instruction>(prefix # "_lsl") GPR:$Rn, GPR:$Rm, 0)>;
-
-  def : Pat<(A64setcc (and ty:$Rn, ty:$Rm), 0, signed_cond),
-            (!cast<Instruction>(prefix # "_lsl") $Rn, $Rm, 0)>;
-}
-
-defm TSTxx : tst_shifts<"TSTxx", 0b1, i64, GPR64>;
-defm TSTww : tst_shifts<"TSTww", 0b0, i32, GPR32>;
-
-
-multiclass mvn_shifts<string prefix, bit sf, ValueType ty, RegisterClass GPR> {
-  let isCommutable = 0, Rn = 0b11111 in {
-  def _lsl : A64I_logicalshift<sf, 0b01, 0b00, 0b1,
-                       (outs GPR:$Rd),
-                       (ins GPR:$Rm,
-                            !cast<Operand>("lsl_operand_" # ty):$Imm6),
-                       "mvn\t$Rd, $Rm, $Imm6",
-                       [(set ty:$Rd, (not (shl ty:$Rm,
-                         !cast<Operand>("lsl_operand_" # ty):$Imm6)))],
-                       NoItinerary>,
-             Sched<[WriteALU, ReadALU, ReadALU]>;
-
-
-  def _lsr : A64I_logicalshift<sf, 0b01, 0b01, 0b1,
-                       (outs GPR:$Rd),
-                       (ins GPR:$Rm,
-                            !cast<Operand>("lsr_operand_" # ty):$Imm6),
-                       "mvn\t$Rd, $Rm, $Imm6",
-                       [(set ty:$Rd, (not (srl ty:$Rm,
-                         !cast<Operand>("lsr_operand_" # ty):$Imm6)))],
-                       NoItinerary>,
-             Sched<[WriteALU, ReadALU, ReadALU]>;
-
-  def _asr : A64I_logicalshift<sf, 0b01, 0b10, 0b1,
-                       (outs GPR:$Rd),
-                       (ins GPR:$Rm,
-                            !cast<Operand>("asr_operand_" # ty):$Imm6),
-                       "mvn\t$Rd, $Rm, $Imm6",
-                       [(set ty:$Rd, (not (sra ty:$Rm,
-                         !cast<Operand>("asr_operand_" # ty):$Imm6)))],
-                       NoItinerary>,
-             Sched<[WriteALU, ReadALU, ReadALU]>;
-
-  def _ror : A64I_logicalshift<sf, 0b01, 0b11, 0b1,
-                       (outs GPR:$Rd),
-                       (ins GPR:$Rm,
-                            !cast<Operand>("ror_operand_" # ty):$Imm6),
-                       "mvn\t$Rd, $Rm, $Imm6",
-                       [(set ty:$Rd, (not (rotr ty:$Rm,
-                         !cast<Operand>("lsl_operand_" # ty):$Imm6)))],
-                       NoItinerary>,
-             Sched<[WriteALU, ReadALU, ReadALU]>;
-  }
-
-  def _noshift : InstAlias<"mvn $Rn, $Rm",
-                     (!cast<Instruction>(prefix # "_lsl") GPR:$Rn, GPR:$Rm, 0)>;
-
-  def : Pat<(not ty:$Rm),
-            (!cast<Instruction>(prefix # "_lsl") $Rm, 0)>;
-}
-
-defm MVNxx : mvn_shifts<"MVNxx", 0b1, i64, GPR64>;
-defm MVNww : mvn_shifts<"MVNww", 0b0, i32, GPR32>;
-
-def MOVxx :InstAlias<"mov $Rd, $Rm", (ORRxxx_lsl GPR64:$Rd, XZR, GPR64:$Rm, 0)>;
-def MOVww :InstAlias<"mov $Rd, $Rm", (ORRwww_lsl GPR32:$Rd, WZR, GPR32:$Rm, 0)>;
+def FCVTSHpseudo : Pseudo<(outs FPR32:$Rd), (ins FPR32:$Rn),
+                          [(set (f32 FPR32:$Rd), (f16_to_f32 i32:$Rn))]>;
 
 //===----------------------------------------------------------------------===//
-// Move wide (immediate) instructions
+// Floating point single operand instructions.
 //===----------------------------------------------------------------------===//
-// Contains: MOVN, MOVZ, MOVK + MOV aliases
-
-// A wide variety of different relocations are needed for variants of these
-// instructions, so it turns out that we need a different operand for all of
-// them.
-multiclass movw_operands<string prefix, string instname, int width> {
-  def _imm_asmoperand : AsmOperandClass {
-    let Name = instname # width # "Shifted" # shift;
-    let PredicateMethod = "is" # instname # width # "Imm";
-    let RenderMethod = "addMoveWideImmOperands";
-    let ParserMethod = "ParseImmWithLSLOperand";
-    let DiagnosticType = "MOVWUImm16";
-  }
 
-  def _imm : Operand<i64> {
-    let ParserMatchClass = !cast<AsmOperandClass>(prefix # "_imm_asmoperand");
-    let PrintMethod = "printMoveWideImmOperand";
-    let EncoderMethod = "getMoveWideImmOpValue";
-    let DecoderMethod = "DecodeMoveWideImmOperand<" # width # ">";
+defm FABS   : SingleOperandFPData<0b0001, "fabs", fabs>;
+defm FMOV   : SingleOperandFPData<0b0000, "fmov">;
+defm FNEG   : SingleOperandFPData<0b0010, "fneg", fneg>;
+defm FRINTA : SingleOperandFPData<0b1100, "frinta", frnd>;
+defm FRINTI : SingleOperandFPData<0b1111, "frinti", fnearbyint>;
+defm FRINTM : SingleOperandFPData<0b1010, "frintm", ffloor>;
+defm FRINTN : SingleOperandFPData<0b1000, "frintn", int_aarch64_neon_frintn>;
+defm FRINTP : SingleOperandFPData<0b1001, "frintp", fceil>;
 
-    let MIOperandInfo = (ops uimm16:$UImm16, imm:$Shift);
-  }
-}
+def : Pat<(v1f64 (int_aarch64_neon_frintn (v1f64 FPR64:$Rn))),
+          (FRINTNDr FPR64:$Rn)>;
 
-defm movn32 : movw_operands<"movn32", "MOVN", 32>;
-defm movn64 : movw_operands<"movn64", "MOVN", 64>;
-defm movz32 : movw_operands<"movz32", "MOVZ", 32>;
-defm movz64 : movw_operands<"movz64", "MOVZ", 64>;
-defm movk32 : movw_operands<"movk32", "MOVK", 32>;
-defm movk64 : movw_operands<"movk64", "MOVK", 64>;
-
-multiclass A64I_movwSizes<bits<2> opc, string asmop, dag ins32bit,
-                          dag ins64bit> {
-
-  def wii : A64I_movw<0b0, opc, (outs GPR32:$Rd), ins32bit,
-                      !strconcat(asmop, "\t$Rd, $FullImm"),
-                      [], NoItinerary>,
-            Sched<[WriteALU]> {
-    bits<18> FullImm;
-    let UImm16 = FullImm{15-0};
-    let Shift = FullImm{17-16};
-  }
-
-  def xii : A64I_movw<0b1, opc, (outs GPR64:$Rd), ins64bit,
-                      !strconcat(asmop, "\t$Rd, $FullImm"),
-                      [], NoItinerary>,
-            Sched<[WriteALU]> {
-    bits<18> FullImm;
-    let UImm16 = FullImm{15-0};
-    let Shift = FullImm{17-16};
-  }
+// FRINTX is inserted to set the flags as required by FENV_ACCESS ON behavior
+// in the C spec. Setting hasSideEffects ensures it is not DCE'd.
+// <rdar://problem/13715968>
+// TODO: We should really model the FPSR flags correctly. This is really ugly.
+let hasSideEffects = 1 in {
+defm FRINTX : SingleOperandFPData<0b1110, "frintx", frint>;
 }
 
-let isMoveImm = 1, isReMaterializable = 1,
-    isAsCheapAsAMove = 1, hasSideEffects = 0 in {
-  defm MOVN : A64I_movwSizes<0b00, "movn",
-                             (ins movn32_imm:$FullImm),
-                             (ins movn64_imm:$FullImm)>;
-
-  // Some relocations are able to convert between a MOVZ and a MOVN. If these
-  // are applied the instruction must be emitted with the corresponding bits as
-  // 0, which means a MOVZ needs to override that bit from the default.
-  let PostEncoderMethod = "fixMOVZ" in
-  defm MOVZ : A64I_movwSizes<0b10, "movz",
-                             (ins movz32_imm:$FullImm),
-                             (ins movz64_imm:$FullImm)>;
-}
-
-let Constraints = "$src = $Rd",
-    SchedRW = [WriteALU, ReadALU] in
-defm MOVK : A64I_movwSizes<0b11, "movk",
-                           (ins GPR32:$src, movk32_imm:$FullImm),
-                           (ins GPR64:$src, movk64_imm:$FullImm)>;
-
+defm FRINTZ : SingleOperandFPData<0b1011, "frintz", ftrunc>;
 
-// And now the "MOV" aliases. These also need their own operands because what
-// they accept is completely different to what the base instructions accept.
-multiclass movalias_operand<string prefix, string basename,
-                            string immpredicate, int width> {
-  def _asmoperand : AsmOperandClass {
-    let Name = basename # width # "MovAlias";
-    let PredicateMethod
-          = "isMoveWideMovAlias<" # width # ", A64Imms::" # immpredicate # ">";
-    let RenderMethod
-      = "addMoveWideMovAliasOperands<" # width # ", "
-                                       # "A64Imms::" # immpredicate # ">";
-  }
-
-  def _movimm : Operand<i64> {
-    let ParserMatchClass = !cast<AsmOperandClass>(prefix # "_asmoperand");
-
-    let MIOperandInfo = (ops uimm16:$UImm16, imm:$Shift);
-  }
+let SchedRW = [WriteFDiv] in {
+defm FSQRT  : SingleOperandFPData<0b0011, "fsqrt", fsqrt>;
 }
 
-defm movz32 : movalias_operand<"movz32", "MOVZ", "isMOVZImm", 32>;
-defm movz64 : movalias_operand<"movz64", "MOVZ", "isMOVZImm", 64>;
-defm movn32 : movalias_operand<"movn32", "MOVN", "isOnlyMOVNImm", 32>;
-defm movn64 : movalias_operand<"movn64", "MOVN", "isOnlyMOVNImm", 64>;
-
-// FIXME: these are officially canonical aliases, but TableGen is too limited to
-// print them at the moment. I believe in this case an "AliasPredicate" method
-// will need to be implemented. to allow it, as well as the more generally
-// useful handling of non-register, non-constant operands.
-class movalias<Instruction INST, RegisterClass GPR, Operand operand>
-  : InstAlias<"mov $Rd, $FullImm", (INST GPR:$Rd, operand:$FullImm)>;
-
-def : movalias<MOVZwii, GPR32, movz32_movimm>;
-def : movalias<MOVZxii, GPR64, movz64_movimm>;
-def : movalias<MOVNwii, GPR32, movn32_movimm>;
-def : movalias<MOVNxii, GPR64, movn64_movimm>;
-
-def movw_addressref_g0 : ComplexPattern<i64, 2, "SelectMOVWAddressRef<0>">;
-def movw_addressref_g1 : ComplexPattern<i64, 2, "SelectMOVWAddressRef<1>">;
-def movw_addressref_g2 : ComplexPattern<i64, 2, "SelectMOVWAddressRef<2>">;
-def movw_addressref_g3 : ComplexPattern<i64, 2, "SelectMOVWAddressRef<3>">;
-
-def : Pat<(A64WrapperLarge movw_addressref_g3:$G3, movw_addressref_g2:$G2,
-                           movw_addressref_g1:$G1, movw_addressref_g0:$G0),
-          (MOVKxii (MOVKxii (MOVKxii (MOVZxii movw_addressref_g3:$G3),
-                                     movw_addressref_g2:$G2),
-                            movw_addressref_g1:$G1),
-                   movw_addressref_g0:$G0)>;
-
 //===----------------------------------------------------------------------===//
-// PC-relative addressing instructions
+// Floating point two operand instructions.
 //===----------------------------------------------------------------------===//
-// Contains: ADR, ADRP
-
-def adr_label : Operand<i64> {
-  let EncoderMethod = "getLabelOpValue<AArch64::fixup_a64_adr_prel>";
 
-  // This label is a 21-bit offset from PC, unscaled
-  let PrintMethod = "printLabelOperand<21, 1>";
-  let ParserMatchClass = label_asmoperand<21, 1>;
-  let OperandType = "OPERAND_PCREL";
-}
-
-def adrp_label_asmoperand : AsmOperandClass {
-  let Name = "AdrpLabel";
-  let RenderMethod = "addLabelOperands<21, 4096>";
-  let DiagnosticType = "Label";
-}
-
-def adrp_label : Operand<i64> {
-  let EncoderMethod = "getAdrpLabelOpValue";
-
-  // This label is a 21-bit offset from PC, scaled by the page-size: 4096.
-  let PrintMethod = "printLabelOperand<21, 4096>";
-  let ParserMatchClass = adrp_label_asmoperand;
-  let OperandType = "OPERAND_PCREL";
-}
-
-let hasSideEffects = 0 in {
-  def ADRxi : A64I_PCADR<0b0, (outs GPR64:$Rd), (ins adr_label:$Label),
-                         "adr\t$Rd, $Label", [], NoItinerary>,
-              Sched<[WriteALUs]>;
-
-  def ADRPxi : A64I_PCADR<0b1, (outs GPR64:$Rd), (ins adrp_label:$Label),
-                          "adrp\t$Rd, $Label", [], NoItinerary>,
-               Sched<[WriteALUs]>;
-}
+defm FADD   : TwoOperandFPData<0b0010, "fadd", fadd>;
+let SchedRW = [WriteFDiv] in {
+defm FDIV   : TwoOperandFPData<0b0001, "fdiv", fdiv>;
+}
+defm FMAXNM : TwoOperandFPData<0b0110, "fmaxnm", int_aarch64_neon_fmaxnm>;
+defm FMAX   : TwoOperandFPData<0b0100, "fmax", AArch64fmax>;
+defm FMINNM : TwoOperandFPData<0b0111, "fminnm", int_aarch64_neon_fminnm>;
+defm FMIN   : TwoOperandFPData<0b0101, "fmin", AArch64fmin>;
+let SchedRW = [WriteFMul] in {
+defm FMUL   : TwoOperandFPData<0b0000, "fmul", fmul>;
+defm FNMUL  : TwoOperandFPDataNeg<0b1000, "fnmul", fmul>;
+}
+defm FSUB   : TwoOperandFPData<0b0011, "fsub", fsub>;
+
+def : Pat<(v1f64 (AArch64fmax (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
+          (FMAXDrr FPR64:$Rn, FPR64:$Rm)>;
+def : Pat<(v1f64 (AArch64fmin (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
+          (FMINDrr FPR64:$Rn, FPR64:$Rm)>;
+def : Pat<(v1f64 (int_aarch64_neon_fmaxnm (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
+          (FMAXNMDrr FPR64:$Rn, FPR64:$Rm)>;
+def : Pat<(v1f64 (int_aarch64_neon_fminnm (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
+          (FMINNMDrr FPR64:$Rn, FPR64:$Rm)>;
 
 //===----------------------------------------------------------------------===//
-// System instructions
+// Floating point three operand instructions.
 //===----------------------------------------------------------------------===//
-// Contains: HINT, CLREX, DSB, DMB, ISB, MSR, SYS, SYSL, MRS
-//    + aliases IC, DC, AT, TLBI, NOP, YIELD, WFE, WFI, SEV, SEVL
 
-// Op1 and Op2 fields are sometimes simple 3-bit unsigned immediate values.
-def uimm3_asmoperand : AsmOperandClass {
-  let Name = "UImm3";
-  let PredicateMethod = "isUImm<3>";
-  let RenderMethod = "addImmOperands";
-  let DiagnosticType = "UImm3";
-}
+defm FMADD  : ThreeOperandFPData<0, 0, "fmadd", fma>;
+defm FMSUB  : ThreeOperandFPData<0, 1, "fmsub",
+     TriOpFrag<(fma node:$LHS, (fneg node:$MHS), node:$RHS)> >;
+defm FNMADD : ThreeOperandFPData<1, 0, "fnmadd",
+     TriOpFrag<(fneg (fma node:$LHS, node:$MHS, node:$RHS))> >;
+defm FNMSUB : ThreeOperandFPData<1, 1, "fnmsub",
+     TriOpFrag<(fma node:$LHS, node:$MHS, (fneg node:$RHS))> >;
 
-def uimm3 : Operand<i32> {
-  let ParserMatchClass = uimm3_asmoperand;
-}
+// The following def pats catch the case where the LHS of an FMA is negated.
+// The TriOpFrag above catches the case where the middle operand is negated.
 
-// The HINT alias can accept a simple unsigned 7-bit immediate.
-def uimm7_asmoperand : AsmOperandClass {
-  let Name = "UImm7";
-  let PredicateMethod = "isUImm<7>";
-  let RenderMethod = "addImmOperands";
-  let DiagnosticType = "UImm7";
-}
+// N.b. FMSUB etc have the accumulator at the *end* of (outs), unlike
+// the NEON variant.
+def : Pat<(f32 (fma (fneg FPR32:$Rn), FPR32:$Rm, FPR32:$Ra)),
+          (FMSUBSrrr FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>;
 
-def uimm7 : Operand<i32> {
-  let ParserMatchClass = uimm7_asmoperand;
-}
+def : Pat<(f64 (fma (fneg FPR64:$Rn), FPR64:$Rm, FPR64:$Ra)),
+          (FMSUBDrrr FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>;
 
-// Multiclass namedimm is defined with the prefetch operands. Most of these fit
-// into the NamedImmMapper scheme well: they either accept a named operand or
-// any immediate under a particular value (which may be 0, implying no immediate
-// is allowed).
-defm dbarrier : namedimm<"dbarrier", "A64DB::DBarrierMapper">;
-defm isb : namedimm<"isb", "A64ISB::ISBMapper">;
-defm ic : namedimm<"ic", "A64IC::ICMapper">;
-defm dc : namedimm<"dc", "A64DC::DCMapper">;
-defm at : namedimm<"at", "A64AT::ATMapper">;
-defm tlbi : namedimm<"tlbi", "A64TLBI::TLBIMapper">;
-
-// However, MRS and MSR are more complicated for a few reasons:
-//   * There are ~1000 generic names S3_<op1>_<CRn>_<CRm>_<Op2> which have an
-//     implementation-defined effect
-//   * Most registers are shared, but some are read-only or write-only.
-//   * There is a variant of MSR which accepts the same register name (SPSel),
-//     but which would have a different encoding.
-
-// In principle these could be resolved in with more complicated subclasses of
-// NamedImmMapper, however that imposes an overhead on other "named
-// immediates". Both in concrete terms with virtual tables and in unnecessary
-// abstraction.
-
-// The solution adopted here is to take the MRS/MSR Mappers out of the usual
-// hierarchy (they're not derived from NamedImmMapper) and to add logic for
-// their special situation.
-def mrs_asmoperand : AsmOperandClass {
-  let Name = "MRS";
-  let ParserMethod = "ParseSysRegOperand";
-  let DiagnosticType = "MRS";
-}
+// We handled -(a + b*c) for FNMADD above, now it's time for "(-a) + (-b)*c" and
+// "(-a) + b*(-c)".
+def : Pat<(f32 (fma (fneg FPR32:$Rn), FPR32:$Rm, (fneg FPR32:$Ra))),
+          (FNMADDSrrr FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>;
 
-def mrs_op : Operand<i32> {
-  let ParserMatchClass = mrs_asmoperand;
-  let PrintMethod = "printMRSOperand";
-  let DecoderMethod = "DecodeMRSOperand";
-}
+def : Pat<(f64 (fma (fneg FPR64:$Rn), FPR64:$Rm, (fneg FPR64:$Ra))),
+          (FNMADDDrrr FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>;
 
-def msr_asmoperand : AsmOperandClass {
-  let Name = "MSRWithReg";
+def : Pat<(f32 (fma FPR32:$Rn, (fneg FPR32:$Rm), (fneg FPR32:$Ra))),
+          (FNMADDSrrr FPR32:$Rn, FPR32:$Rm, FPR32:$Ra)>;
 
-  // Note that SPSel is valid for both this and the pstate operands, but with
-  // different immediate encodings. This is why these operands provide a string
-  // AArch64Operand rather than an immediate. The overlap is small enough that
-  // it could be resolved with hackery now, but who can say in future?
-  let ParserMethod = "ParseSysRegOperand";
-  let DiagnosticType = "MSR";
-}
-
-def msr_op : Operand<i32> {
-  let ParserMatchClass = msr_asmoperand;
-  let PrintMethod = "printMSROperand";
-  let DecoderMethod = "DecodeMSROperand";
-}
-
-def pstate_asmoperand : AsmOperandClass {
-  let Name = "MSRPState";
-  // See comment above about parser.
-  let ParserMethod = "ParseSysRegOperand";
-  let DiagnosticType = "MSR";
-}
-
-def pstate_op : Operand<i32> {
-  let ParserMatchClass = pstate_asmoperand;
-  let PrintMethod = "printNamedImmOperand<A64PState::PStateMapper>";
-  let DecoderMethod = "DecodeNamedImmOperand<A64PState::PStateMapper>";
-}
-
-// When <CRn> is specified, an assembler should accept something like "C4", not
-// the usual "#4" immediate.
-def CRx_asmoperand : AsmOperandClass {
-  let Name = "CRx";
-  let PredicateMethod = "isUImm<4>";
-  let RenderMethod = "addImmOperands";
-  let ParserMethod = "ParseCRxOperand";
-  // Diagnostics are handled in all cases by ParseCRxOperand.
-}
-
-def CRx : Operand<i32> {
-  let ParserMatchClass = CRx_asmoperand;
-  let PrintMethod = "printCRxOperand";
-}
-
-
-// Finally, we can start defining the instructions.
-
-// HINT is straightforward, with a few aliases.
-def HINTi : A64I_system<0b0, (outs), (ins uimm7:$UImm7), "hint\t$UImm7",
-                        [], NoItinerary> {
-  bits<7> UImm7;
-  let CRm = UImm7{6-3};
-  let Op2 = UImm7{2-0};
-
-  let Op0 = 0b00;
-  let Op1 = 0b011;
-  let CRn = 0b0010;
-  let Rt = 0b11111;
-}
-
-def : InstAlias<"nop", (HINTi 0)>;
-def : InstAlias<"yield", (HINTi 1)>;
-def : InstAlias<"wfe", (HINTi 2)>;
-def : InstAlias<"wfi", (HINTi 3)>;
-def : InstAlias<"sev", (HINTi 4)>;
-def : InstAlias<"sevl", (HINTi 5)>;
-
-// Quite a few instructions then follow a similar pattern of fixing common
-// fields in the bitpattern, we'll define a helper-class for them.
-class simple_sys<bits<2> op0, bits<3> op1, bits<4> crn, bits<3> op2,
-                 Operand operand, string asmop>
-  : A64I_system<0b0, (outs), (ins operand:$CRm), !strconcat(asmop, "\t$CRm"),
-                [], NoItinerary> {
-  let Op0 = op0;
-  let Op1 = op1;
-  let CRn = crn;
-  let Op2 = op2;
-  let Rt = 0b11111;
-}
-
-
-def CLREXi : simple_sys<0b00, 0b011, 0b0011, 0b010, uimm4, "clrex">;
-def DSBi : simple_sys<0b00, 0b011, 0b0011, 0b100, dbarrier_op, "dsb">;
-def DMBi : simple_sys<0b00, 0b011, 0b0011, 0b101, dbarrier_op, "dmb">;
-def ISBi : simple_sys<0b00, 0b011, 0b0011, 0b110, isb_op, "isb">;
-
-def : InstAlias<"clrex", (CLREXi 0b1111)>;
-def : InstAlias<"isb", (ISBi 0b1111)>;
-
-// (DMBi 0xb) is a "DMB ISH" instruciton, appropriate for Linux SMP
-// configurations at least.
-def : Pat<(atomic_fence imm, imm), (DMBi 0xb)>;
-
-// Any SYS bitpattern can be represented with a complex and opaque "SYS"
-// instruction.
-def SYSiccix : A64I_system<0b0, (outs),
-                           (ins uimm3:$Op1, CRx:$CRn, CRx:$CRm,
-                                uimm3:$Op2, GPR64:$Rt),
-                           "sys\t$Op1, $CRn, $CRm, $Op2, $Rt",
-                           [], NoItinerary> {
-  let Op0 = 0b01;
-}
-
-// You can skip the Xt argument whether it makes sense or not for the generic
-// SYS instruction.
-def : InstAlias<"sys $Op1, $CRn, $CRm, $Op2",
-                (SYSiccix uimm3:$Op1, CRx:$CRn, CRx:$CRm, uimm3:$Op2, XZR)>;
-
-
-// But many have aliases, which obviously don't fit into
-class SYSalias<dag ins, string asmstring>
-  : A64I_system<0b0, (outs), ins, asmstring, [], NoItinerary> {
-  let isAsmParserOnly = 1;
-
-  bits<14> SysOp;
-  let Op0 = 0b01;
-  let Op1 = SysOp{13-11};
-  let CRn = SysOp{10-7};
-  let CRm = SysOp{6-3};
-  let Op2 = SysOp{2-0};
-}
-
-def ICix : SYSalias<(ins ic_op:$SysOp, GPR64:$Rt), "ic\t$SysOp, $Rt">;
-
-def ICi : SYSalias<(ins ic_op:$SysOp), "ic\t$SysOp"> {
-  let Rt = 0b11111;
-}
-
-def DCix : SYSalias<(ins dc_op:$SysOp, GPR64:$Rt), "dc\t$SysOp, $Rt">;
-def ATix : SYSalias<(ins at_op:$SysOp, GPR64:$Rt), "at\t$SysOp, $Rt">;
-
-def TLBIix : SYSalias<(ins tlbi_op:$SysOp, GPR64:$Rt), "tlbi\t$SysOp, $Rt">;
-
-def TLBIi : SYSalias<(ins tlbi_op:$SysOp), "tlbi\t$SysOp"> {
-  let Rt = 0b11111;
-}
-
-
-def SYSLxicci : A64I_system<0b1, (outs GPR64:$Rt),
-                            (ins uimm3:$Op1, CRx:$CRn, CRx:$CRm, uimm3:$Op2),
-                            "sysl\t$Rt, $Op1, $CRn, $CRm, $Op2",
-                            [], NoItinerary> {
-  let Op0 = 0b01;
-}
-
-// The instructions themselves are rather simple for MSR and MRS.
-def MSRix : A64I_system<0b0, (outs), (ins msr_op:$SysReg, GPR64:$Rt),
-                        "msr\t$SysReg, $Rt", [], NoItinerary> {
-  bits<16> SysReg;
-  let Op0 = SysReg{15-14};
-  let Op1 = SysReg{13-11};
-  let CRn = SysReg{10-7};
-  let CRm = SysReg{6-3};
-  let Op2 = SysReg{2-0};
-}
-
-def MRSxi : A64I_system<0b1, (outs GPR64:$Rt), (ins mrs_op:$SysReg),
-                        "mrs\t$Rt, $SysReg", [], NoItinerary> {
-  bits<16> SysReg;
-  let Op0 = SysReg{15-14};
-  let Op1 = SysReg{13-11};
-  let CRn = SysReg{10-7};
-  let CRm = SysReg{6-3};
-  let Op2 = SysReg{2-0};
-}
-
-def MSRii : A64I_system<0b0, (outs), (ins pstate_op:$PState, uimm4:$CRm),
-                        "msr\t$PState, $CRm", [], NoItinerary> {
-  bits<6> PState;
-
-  let Op0 = 0b00;
-  let Op1 = PState{5-3};
-  let CRn = 0b0100;
-  let Op2 = PState{2-0};
-  let Rt = 0b11111;
-}
+def : Pat<(f64 (fma FPR64:$Rn, (fneg FPR64:$Rm), (fneg FPR64:$Ra))),
+          (FNMADDDrrr FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>;
 
 //===----------------------------------------------------------------------===//
-// Test & branch (immediate) instructions
+// Floating point comparison instructions.
 //===----------------------------------------------------------------------===//
-// Contains: TBZ, TBNZ
 
-// The bit to test is a simple unsigned 6-bit immediate in the X-register
-// versions.
-def uimm6 : Operand<i64> {
-  let ParserMatchClass = uimm6_asmoperand;
-}
-
-def label_wid14_scal4_asmoperand : label_asmoperand<14, 4>;
-
-def tbimm_target : Operand<OtherVT> {
-  let EncoderMethod = "getLabelOpValue<AArch64::fixup_a64_tstbr>";
-
-  // This label is a 14-bit offset from PC, scaled by the instruction-width: 4.
-  let PrintMethod = "printLabelOperand<14, 4>";
-  let ParserMatchClass = label_wid14_scal4_asmoperand;
-
-  let OperandType = "OPERAND_PCREL";
-}
-
-def A64eq : ImmLeaf<i32, [{ return Imm == A64CC::EQ; }]>;
-def A64ne : ImmLeaf<i32, [{ return Imm == A64CC::NE; }]>;
-
-// These instructions correspond to patterns involving "and" with a power of
-// two, which we need to be able to select.
-def tstb64_pat : ComplexPattern<i64, 1, "SelectTSTBOperand<64>">;
-def tstb32_pat : ComplexPattern<i32, 1, "SelectTSTBOperand<32>">;
-
-let isBranch = 1, isTerminator = 1 in {
-  def TBZxii : A64I_TBimm<0b0, (outs),
-                        (ins GPR64:$Rt, uimm6:$Imm, tbimm_target:$Label),
-                        "tbz\t$Rt, $Imm, $Label",
-                        [(A64br_cc (A64cmp (and i64:$Rt, tstb64_pat:$Imm), 0),
-                                   A64eq, bb:$Label)],
-                        NoItinerary>,
-               Sched<[WriteBr]>;
-
-  def TBNZxii : A64I_TBimm<0b1, (outs),
-                        (ins GPR64:$Rt, uimm6:$Imm, tbimm_target:$Label),
-                        "tbnz\t$Rt, $Imm, $Label",
-                        [(A64br_cc (A64cmp (and i64:$Rt, tstb64_pat:$Imm), 0),
-                                   A64ne, bb:$Label)],
-                        NoItinerary>,
-                Sched<[WriteBr]>;
-
-
-  // Note, these instructions overlap with the above 64-bit patterns. This is
-  // intentional, "tbz x3, #1, somewhere" and "tbz w3, #1, somewhere" would both
-  // do the same thing and are both permitted assembly. They also both have
-  // sensible DAG patterns.
-  def TBZwii : A64I_TBimm<0b0, (outs),
-                        (ins GPR32:$Rt, uimm5:$Imm, tbimm_target:$Label),
-                        "tbz\t$Rt, $Imm, $Label",
-                        [(A64br_cc (A64cmp (and i32:$Rt, tstb32_pat:$Imm), 0),
-                                   A64eq, bb:$Label)],
-                        NoItinerary>,
-               Sched<[WriteBr]> {
-    let Imm{5} = 0b0;
-  }
-
-  def TBNZwii : A64I_TBimm<0b1, (outs),
-                        (ins GPR32:$Rt, uimm5:$Imm, tbimm_target:$Label),
-                        "tbnz\t$Rt, $Imm, $Label",
-                        [(A64br_cc (A64cmp (and i32:$Rt, tstb32_pat:$Imm), 0),
-                                   A64ne, bb:$Label)],
-                        NoItinerary>,
-                Sched<[WriteBr]> {
-    let Imm{5} = 0b0;
-  }
-}
+defm FCMPE : FPComparison<1, "fcmpe">;
+defm FCMP  : FPComparison<0, "fcmp", AArch64fcmp>;
 
 //===----------------------------------------------------------------------===//
-// Unconditional branch (immediate) instructions
+// Floating point conditional comparison instructions.
 //===----------------------------------------------------------------------===//
-// Contains: B, BL
-
-def label_wid26_scal4_asmoperand : label_asmoperand<26, 4>;
-
-def bimm_target : Operand<OtherVT> {
-  let EncoderMethod = "getLabelOpValue<AArch64::fixup_a64_uncondbr>";
 
-  // This label is a 26-bit offset from PC, scaled by the instruction-width: 4.
-  let PrintMethod = "printLabelOperand<26, 4>";
-  let ParserMatchClass = label_wid26_scal4_asmoperand;
-
-  let OperandType = "OPERAND_PCREL";
-}
-
-def blimm_target : Operand<i64> {
-  let EncoderMethod = "getLabelOpValue<AArch64::fixup_a64_call>";
-
-  // This label is a 26-bit offset from PC, scaled by the instruction-width: 4.
-  let PrintMethod = "printLabelOperand<26, 4>";
-  let ParserMatchClass = label_wid26_scal4_asmoperand;
-
-  let OperandType = "OPERAND_PCREL";
-}
+defm FCCMPE : FPCondComparison<1, "fccmpe">;
+defm FCCMP  : FPCondComparison<0, "fccmp">;
 
-class A64I_BimmImpl<bit op, string asmop, list<dag> patterns, Operand lbl_type>
-  : A64I_Bimm<op, (outs), (ins lbl_type:$Label),
-              !strconcat(asmop, "\t$Label"), patterns,
-              NoItinerary>,
-    Sched<[WriteBr]>;
+//===----------------------------------------------------------------------===//
+// Floating point conditional select instruction.
+//===----------------------------------------------------------------------===//
 
-let isBranch = 1 in {
-  def Bimm : A64I_BimmImpl<0b0, "b", [(br bb:$Label)], bimm_target> {
-    let isTerminator = 1;
-    let isBarrier = 1;
-  }
+defm FCSEL : FPCondSelect<"fcsel">;
 
-  let SchedRW = [WriteBrL] in {
-    def BLimm : A64I_BimmImpl<0b1, "bl",
-                              [(AArch64Call tglobaladdr:$Label)], blimm_target> {
-      let isCall = 1;
-      let Defs = [X30];
-    }
-  }
+// CSEL instructions providing f128 types need to be handled by a
+// pseudo-instruction since the eventual code will need to introduce basic
+// blocks and control flow.
+def F128CSEL : Pseudo<(outs FPR128:$Rd),
+                      (ins FPR128:$Rn, FPR128:$Rm, ccode:$cond),
+                      [(set (f128 FPR128:$Rd),
+                            (AArch64csel FPR128:$Rn, FPR128:$Rm,
+                                       (i32 imm:$cond), NZCV))]> {
+  let Uses = [NZCV];
+  let usesCustomInserter = 1;
 }
 
-def : Pat<(AArch64Call texternalsym:$Label), (BLimm texternalsym:$Label)>;
 
 //===----------------------------------------------------------------------===//
-// Unconditional branch (register) instructions
+// Floating point immediate move.
 //===----------------------------------------------------------------------===//
-// Contains: BR, BLR, RET, ERET, DRP.
-
-// Most of the notional opcode fields in the A64I_Breg format are fixed in A64
-// at the moment.
-class A64I_BregImpl<bits<4> opc,
-                    dag outs, dag ins, string asmstr, list<dag> patterns,
-                    InstrItinClass itin = NoItinerary>
-  : A64I_Breg<opc, 0b11111, 0b000000, 0b00000,
-              outs, ins, asmstr, patterns, itin>,
-    Sched<[WriteBr]> {
-  let isBranch         = 1;
-  let isIndirectBranch = 1;
-}
-
-// Note that these are not marked isCall or isReturn because as far as LLVM is
-// concerned they're not. "ret" is just another jump unless it has been selected
-// by LLVM as the function's return.
 
-let isBranch = 1 in {
-  def BRx : A64I_BregImpl<0b0000,(outs), (ins GPR64:$Rn),
-                          "br\t$Rn", [(brind i64:$Rn)]> {
-    let isBarrier = 1;
-    let isTerminator = 1;
-  }
-
-  let SchedRW = [WriteBrL] in {
-    def BLRx : A64I_BregImpl<0b0001, (outs), (ins GPR64:$Rn),
-                             "blr\t$Rn", [(AArch64Call i64:$Rn)]> {
-      let isBarrier = 0;
-      let isCall = 1;
-      let Defs = [X30];
-    }
-  }
-
-  def RETx : A64I_BregImpl<0b0010, (outs), (ins GPR64:$Rn),
-                           "ret\t$Rn", []> {
-    let isBarrier = 1;
-    let isTerminator = 1;
-    let isReturn = 1;
-  }
-
-  // Create a separate pseudo-instruction for codegen to use so that we don't
-  // flag x30 as used in every function. It'll be restored before the RET by the
-  // epilogue if it's legitimately used.
-  def RET : A64PseudoExpand<(outs), (ins), [(A64ret)], (RETx (ops X30))> {
-    let isTerminator = 1;
-    let isBarrier = 1;
-    let isReturn = 1;
-  }
-
-  def ERET : A64I_BregImpl<0b0100, (outs), (ins), "eret", []> {
-    let Rn = 0b11111;
-    let isBarrier = 1;
-    let isTerminator = 1;
-    let isReturn = 1;
-  }
-
-  def DRPS : A64I_BregImpl<0b0101, (outs), (ins), "drps", []> {
-    let Rn = 0b11111;
-    let isBarrier = 1;
-  }
+let isReMaterializable = 1 in {
+defm FMOV : FPMoveImmediate<"fmov">;
 }
 
-def RETAlias : InstAlias<"ret", (RETx X30)>;
-
-
 //===----------------------------------------------------------------------===//
-// Address generation patterns
+// Advanced SIMD two vector instructions.
 //===----------------------------------------------------------------------===//
 
-// Primary method of address generation for the small/absolute memory model is
-// an ADRP/ADR pair:
-//     ADRP x0, some_variable
-//     ADD x0, x0, #:lo12:some_variable
-//
-// The load/store elision of the ADD is accomplished when selecting
-// addressing-modes. This just mops up the cases where that doesn't work and we
-// really need an address in some register.
-
-// This wrapper applies a LO12 modifier to the address. Otherwise we could just
-// use the same address.
-
-class ADRP_ADD<SDNode Wrapper, SDNode addrop>
- : Pat<(Wrapper addrop:$Hi, addrop:$Lo12, (i32 imm)),
-       (ADDxxi_lsl0_s (ADRPxi addrop:$Hi), addrop:$Lo12)>;
-
-def : ADRP_ADD<A64WrapperSmall, tblockaddress>;
-def : ADRP_ADD<A64WrapperSmall, texternalsym>;
-def : ADRP_ADD<A64WrapperSmall, tglobaladdr>;
-def : ADRP_ADD<A64WrapperSmall, tglobaltlsaddr>;
-def : ADRP_ADD<A64WrapperSmall, tjumptable>;
-def : ADRP_ADD<A64WrapperSmall, tconstpool>;
+defm ABS    : SIMDTwoVectorBHSD<0, 0b01011, "abs", int_aarch64_neon_abs>;
+defm CLS    : SIMDTwoVectorBHS<0, 0b00100, "cls", int_aarch64_neon_cls>;
+defm CLZ    : SIMDTwoVectorBHS<1, 0b00100, "clz", ctlz>;
+defm CMEQ   : SIMDCmpTwoVector<0, 0b01001, "cmeq", AArch64cmeqz>;
+defm CMGE   : SIMDCmpTwoVector<1, 0b01000, "cmge", AArch64cmgez>;
+defm CMGT   : SIMDCmpTwoVector<0, 0b01000, "cmgt", AArch64cmgtz>;
+defm CMLE   : SIMDCmpTwoVector<1, 0b01001, "cmle", AArch64cmlez>;
+defm CMLT   : SIMDCmpTwoVector<0, 0b01010, "cmlt", AArch64cmltz>;
+defm CNT    : SIMDTwoVectorB<0, 0b00, 0b00101, "cnt", ctpop>;
+defm FABS   : SIMDTwoVectorFP<0, 1, 0b01111, "fabs", fabs>;
+
+defm FCMEQ  : SIMDFPCmpTwoVector<0, 1, 0b01101, "fcmeq", AArch64fcmeqz>;
+defm FCMGE  : SIMDFPCmpTwoVector<1, 1, 0b01100, "fcmge", AArch64fcmgez>;
+defm FCMGT  : SIMDFPCmpTwoVector<0, 1, 0b01100, "fcmgt", AArch64fcmgtz>;
+defm FCMLE  : SIMDFPCmpTwoVector<1, 1, 0b01101, "fcmle", AArch64fcmlez>;
+defm FCMLT  : SIMDFPCmpTwoVector<0, 1, 0b01110, "fcmlt", AArch64fcmltz>;
+defm FCVTAS : SIMDTwoVectorFPToInt<0,0,0b11100, "fcvtas",int_aarch64_neon_fcvtas>;
+defm FCVTAU : SIMDTwoVectorFPToInt<1,0,0b11100, "fcvtau",int_aarch64_neon_fcvtau>;
+defm FCVTL  : SIMDFPWidenTwoVector<0, 0, 0b10111, "fcvtl">;
+def : Pat<(v4f32 (int_aarch64_neon_vcvthf2fp (v4i16 V64:$Rn))),
+          (FCVTLv4i16 V64:$Rn)>;
+def : Pat<(v4f32 (int_aarch64_neon_vcvthf2fp (extract_subvector (v8i16 V128:$Rn),
+                                                              (i64 4)))),
+          (FCVTLv8i16 V128:$Rn)>;
+def : Pat<(v2f64 (fextend (v2f32 V64:$Rn))), (FCVTLv2i32 V64:$Rn)>;
+def : Pat<(v2f64 (fextend (v2f32 (extract_subvector (v4f32 V128:$Rn),
+                                                    (i64 2))))),
+          (FCVTLv4i32 V128:$Rn)>;
+
+defm FCVTMS : SIMDTwoVectorFPToInt<0,0,0b11011, "fcvtms",int_aarch64_neon_fcvtms>;
+defm FCVTMU : SIMDTwoVectorFPToInt<1,0,0b11011, "fcvtmu",int_aarch64_neon_fcvtmu>;
+defm FCVTNS : SIMDTwoVectorFPToInt<0,0,0b11010, "fcvtns",int_aarch64_neon_fcvtns>;
+defm FCVTNU : SIMDTwoVectorFPToInt<1,0,0b11010, "fcvtnu",int_aarch64_neon_fcvtnu>;
+defm FCVTN  : SIMDFPNarrowTwoVector<0, 0, 0b10110, "fcvtn">;
+def : Pat<(v4i16 (int_aarch64_neon_vcvtfp2hf (v4f32 V128:$Rn))),
+          (FCVTNv4i16 V128:$Rn)>;
+def : Pat<(concat_vectors V64:$Rd,
+                          (v4i16 (int_aarch64_neon_vcvtfp2hf (v4f32 V128:$Rn)))),
+          (FCVTNv8i16 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>;
+def : Pat<(v2f32 (fround (v2f64 V128:$Rn))), (FCVTNv2i32 V128:$Rn)>;
+def : Pat<(concat_vectors V64:$Rd, (v2f32 (fround (v2f64 V128:$Rn)))),
+          (FCVTNv4i32 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>;
+defm FCVTPS : SIMDTwoVectorFPToInt<0,1,0b11010, "fcvtps",int_aarch64_neon_fcvtps>;
+defm FCVTPU : SIMDTwoVectorFPToInt<1,1,0b11010, "fcvtpu",int_aarch64_neon_fcvtpu>;
+defm FCVTXN : SIMDFPInexactCvtTwoVector<1, 0, 0b10110, "fcvtxn",
+                                        int_aarch64_neon_fcvtxn>;
+defm FCVTZS : SIMDTwoVectorFPToInt<0, 1, 0b11011, "fcvtzs", fp_to_sint>;
+defm FCVTZU : SIMDTwoVectorFPToInt<1, 1, 0b11011, "fcvtzu", fp_to_uint>;
+let isCodeGenOnly = 1 in {
+defm FCVTZS_Int : SIMDTwoVectorFPToInt<0, 1, 0b11011, "fcvtzs",
+                                       int_aarch64_neon_fcvtzs>;
+defm FCVTZU_Int : SIMDTwoVectorFPToInt<1, 1, 0b11011, "fcvtzu",
+                                       int_aarch64_neon_fcvtzu>;
+}
+defm FNEG   : SIMDTwoVectorFP<1, 1, 0b01111, "fneg", fneg>;
+defm FRECPE : SIMDTwoVectorFP<0, 1, 0b11101, "frecpe", int_aarch64_neon_frecpe>;
+defm FRINTA : SIMDTwoVectorFP<1, 0, 0b11000, "frinta", frnd>;
+defm FRINTI : SIMDTwoVectorFP<1, 1, 0b11001, "frinti", fnearbyint>;
+defm FRINTM : SIMDTwoVectorFP<0, 0, 0b11001, "frintm", ffloor>;
+defm FRINTN : SIMDTwoVectorFP<0, 0, 0b11000, "frintn", int_aarch64_neon_frintn>;
+defm FRINTP : SIMDTwoVectorFP<0, 1, 0b11000, "frintp", fceil>;
+defm FRINTX : SIMDTwoVectorFP<1, 0, 0b11001, "frintx", frint>;
+defm FRINTZ : SIMDTwoVectorFP<0, 1, 0b11001, "frintz", ftrunc>;
+defm FRSQRTE: SIMDTwoVectorFP<1, 1, 0b11101, "frsqrte", int_aarch64_neon_frsqrte>;
+defm FSQRT  : SIMDTwoVectorFP<1, 1, 0b11111, "fsqrt", fsqrt>;
+defm NEG    : SIMDTwoVectorBHSD<1, 0b01011, "neg",
+                               UnOpFrag<(sub immAllZerosV, node:$LHS)> >;
+defm NOT    : SIMDTwoVectorB<1, 0b00, 0b00101, "not", vnot>;
+// Aliases for MVN -> NOT.
+def : InstAlias<"mvn{ $Vd.8b, $Vn.8b|.8b $Vd, $Vn}",
+                (NOTv8i8 V64:$Vd, V64:$Vn)>;
+def : InstAlias<"mvn{ $Vd.16b, $Vn.16b|.16b $Vd, $Vn}",
+                (NOTv16i8 V128:$Vd, V128:$Vn)>;
+
+def : Pat<(AArch64neg (v8i8  V64:$Rn)),  (NEGv8i8  V64:$Rn)>;
+def : Pat<(AArch64neg (v16i8 V128:$Rn)), (NEGv16i8 V128:$Rn)>;
+def : Pat<(AArch64neg (v4i16 V64:$Rn)),  (NEGv4i16 V64:$Rn)>;
+def : Pat<(AArch64neg (v8i16 V128:$Rn)), (NEGv8i16 V128:$Rn)>;
+def : Pat<(AArch64neg (v2i32 V64:$Rn)),  (NEGv2i32 V64:$Rn)>;
+def : Pat<(AArch64neg (v4i32 V128:$Rn)), (NEGv4i32 V128:$Rn)>;
+def : Pat<(AArch64neg (v2i64 V128:$Rn)), (NEGv2i64 V128:$Rn)>;
+
+def : Pat<(AArch64not (v8i8 V64:$Rn)),   (NOTv8i8  V64:$Rn)>;
+def : Pat<(AArch64not (v16i8 V128:$Rn)), (NOTv16i8 V128:$Rn)>;
+def : Pat<(AArch64not (v4i16 V64:$Rn)),  (NOTv8i8  V64:$Rn)>;
+def : Pat<(AArch64not (v8i16 V128:$Rn)), (NOTv16i8 V128:$Rn)>;
+def : Pat<(AArch64not (v2i32 V64:$Rn)),  (NOTv8i8  V64:$Rn)>;
+def : Pat<(AArch64not (v1i64 V64:$Rn)),  (NOTv8i8  V64:$Rn)>;
+def : Pat<(AArch64not (v4i32 V128:$Rn)), (NOTv16i8 V128:$Rn)>;
+def : Pat<(AArch64not (v2i64 V128:$Rn)), (NOTv16i8 V128:$Rn)>;
+
+def : Pat<(vnot (v4i16 V64:$Rn)),  (NOTv8i8  V64:$Rn)>;
+def : Pat<(vnot (v8i16 V128:$Rn)), (NOTv16i8 V128:$Rn)>;
+def : Pat<(vnot (v2i32 V64:$Rn)),  (NOTv8i8  V64:$Rn)>;
+def : Pat<(vnot (v4i32 V128:$Rn)), (NOTv16i8 V128:$Rn)>;
+def : Pat<(vnot (v2i64 V128:$Rn)), (NOTv16i8 V128:$Rn)>;
+
+defm RBIT   : SIMDTwoVectorB<1, 0b01, 0b00101, "rbit", int_aarch64_neon_rbit>;
+defm REV16  : SIMDTwoVectorB<0, 0b00, 0b00001, "rev16", AArch64rev16>;
+defm REV32  : SIMDTwoVectorBH<1, 0b00000, "rev32", AArch64rev32>;
+defm REV64  : SIMDTwoVectorBHS<0, 0b00000, "rev64", AArch64rev64>;
+defm SADALP : SIMDLongTwoVectorTied<0, 0b00110, "sadalp",
+       BinOpFrag<(add node:$LHS, (int_aarch64_neon_saddlp node:$RHS))> >;
+defm SADDLP : SIMDLongTwoVector<0, 0b00010, "saddlp", int_aarch64_neon_saddlp>;
+defm SCVTF  : SIMDTwoVectorIntToFP<0, 0, 0b11101, "scvtf", sint_to_fp>;
+defm SHLL   : SIMDVectorLShiftLongBySizeBHS;
+defm SQABS  : SIMDTwoVectorBHSD<0, 0b00111, "sqabs", int_aarch64_neon_sqabs>;
+defm SQNEG  : SIMDTwoVectorBHSD<1, 0b00111, "sqneg", int_aarch64_neon_sqneg>;
+defm SQXTN  : SIMDMixedTwoVector<0, 0b10100, "sqxtn", int_aarch64_neon_sqxtn>;
+defm SQXTUN : SIMDMixedTwoVector<1, 0b10010, "sqxtun", int_aarch64_neon_sqxtun>;
+defm SUQADD : SIMDTwoVectorBHSDTied<0, 0b00011, "suqadd",int_aarch64_neon_suqadd>;
+defm UADALP : SIMDLongTwoVectorTied<1, 0b00110, "uadalp",
+       BinOpFrag<(add node:$LHS, (int_aarch64_neon_uaddlp node:$RHS))> >;
+defm UADDLP : SIMDLongTwoVector<1, 0b00010, "uaddlp",
+                    int_aarch64_neon_uaddlp>;
+defm UCVTF  : SIMDTwoVectorIntToFP<1, 0, 0b11101, "ucvtf", uint_to_fp>;
+defm UQXTN  : SIMDMixedTwoVector<1, 0b10100, "uqxtn", int_aarch64_neon_uqxtn>;
+defm URECPE : SIMDTwoVectorS<0, 1, 0b11100, "urecpe", int_aarch64_neon_urecpe>;
+defm URSQRTE: SIMDTwoVectorS<1, 1, 0b11100, "ursqrte", int_aarch64_neon_ursqrte>;
+defm USQADD : SIMDTwoVectorBHSDTied<1, 0b00011, "usqadd",int_aarch64_neon_usqadd>;
+defm XTN    : SIMDMixedTwoVector<0, 0b10010, "xtn", trunc>;
+
+def : Pat<(v2f32 (AArch64rev64 V64:$Rn)), (REV64v2i32 V64:$Rn)>;
+def : Pat<(v4f32 (AArch64rev64 V128:$Rn)), (REV64v4i32 V128:$Rn)>;
+
+// Patterns for vector long shift (by element width). These need to match all
+// three of zext, sext and anyext so it's easier to pull the patterns out of the
+// definition.
+multiclass SIMDVectorLShiftLongBySizeBHSPats<SDPatternOperator ext> {
+  def : Pat<(AArch64vshl (v8i16 (ext (v8i8 V64:$Rn))), (i32 8)),
+            (SHLLv8i8 V64:$Rn)>;
+  def : Pat<(AArch64vshl (v8i16 (ext (extract_high_v16i8 V128:$Rn))), (i32 8)),
+            (SHLLv16i8 V128:$Rn)>;
+  def : Pat<(AArch64vshl (v4i32 (ext (v4i16 V64:$Rn))), (i32 16)),
+            (SHLLv4i16 V64:$Rn)>;
+  def : Pat<(AArch64vshl (v4i32 (ext (extract_high_v8i16 V128:$Rn))), (i32 16)),
+            (SHLLv8i16 V128:$Rn)>;
+  def : Pat<(AArch64vshl (v2i64 (ext (v2i32 V64:$Rn))), (i32 32)),
+            (SHLLv2i32 V64:$Rn)>;
+  def : Pat<(AArch64vshl (v2i64 (ext (extract_high_v4i32 V128:$Rn))), (i32 32)),
+            (SHLLv4i32 V128:$Rn)>;
+}
+
+defm : SIMDVectorLShiftLongBySizeBHSPats<anyext>;
+defm : SIMDVectorLShiftLongBySizeBHSPats<zext>;
+defm : SIMDVectorLShiftLongBySizeBHSPats<sext>;
 
 //===----------------------------------------------------------------------===//
-// GOT access patterns
+// Advanced SIMD three vector instructions.
 //===----------------------------------------------------------------------===//
 
-class GOTLoadSmall<SDNode addrfrag>
-  : Pat<(A64GOTLoad (A64WrapperSmall addrfrag:$Hi, addrfrag:$Lo12, 8)),
-        (LS64_LDR (ADRPxi addrfrag:$Hi), addrfrag:$Lo12)>;
-
-def : GOTLoadSmall<texternalsym>;
-def : GOTLoadSmall<tglobaladdr>;
-def : GOTLoadSmall<tglobaltlsaddr>;
+defm ADD     : SIMDThreeSameVector<0, 0b10000, "add", add>;
+defm ADDP    : SIMDThreeSameVector<0, 0b10111, "addp", int_aarch64_neon_addp>;
+defm CMEQ    : SIMDThreeSameVector<1, 0b10001, "cmeq", AArch64cmeq>;
+defm CMGE    : SIMDThreeSameVector<0, 0b00111, "cmge", AArch64cmge>;
+defm CMGT    : SIMDThreeSameVector<0, 0b00110, "cmgt", AArch64cmgt>;
+defm CMHI    : SIMDThreeSameVector<1, 0b00110, "cmhi", AArch64cmhi>;
+defm CMHS    : SIMDThreeSameVector<1, 0b00111, "cmhs", AArch64cmhs>;
+defm CMTST   : SIMDThreeSameVector<0, 0b10001, "cmtst", AArch64cmtst>;
+defm FABD    : SIMDThreeSameVectorFP<1,1,0b11010,"fabd", int_aarch64_neon_fabd>;
+defm FACGE   : SIMDThreeSameVectorFPCmp<1,0,0b11101,"facge",int_aarch64_neon_facge>;
+defm FACGT   : SIMDThreeSameVectorFPCmp<1,1,0b11101,"facgt",int_aarch64_neon_facgt>;
+defm FADDP   : SIMDThreeSameVectorFP<1,0,0b11010,"faddp",int_aarch64_neon_addp>;
+defm FADD    : SIMDThreeSameVectorFP<0,0,0b11010,"fadd", fadd>;
+defm FCMEQ   : SIMDThreeSameVectorFPCmp<0, 0, 0b11100, "fcmeq", AArch64fcmeq>;
+defm FCMGE   : SIMDThreeSameVectorFPCmp<1, 0, 0b11100, "fcmge", AArch64fcmge>;
+defm FCMGT   : SIMDThreeSameVectorFPCmp<1, 1, 0b11100, "fcmgt", AArch64fcmgt>;
+defm FDIV    : SIMDThreeSameVectorFP<1,0,0b11111,"fdiv", fdiv>;
+defm FMAXNMP : SIMDThreeSameVectorFP<1,0,0b11000,"fmaxnmp", int_aarch64_neon_fmaxnmp>;
+defm FMAXNM  : SIMDThreeSameVectorFP<0,0,0b11000,"fmaxnm", int_aarch64_neon_fmaxnm>;
+defm FMAXP   : SIMDThreeSameVectorFP<1,0,0b11110,"fmaxp", int_aarch64_neon_fmaxp>;
+defm FMAX    : SIMDThreeSameVectorFP<0,0,0b11110,"fmax", AArch64fmax>;
+defm FMINNMP : SIMDThreeSameVectorFP<1,1,0b11000,"fminnmp", int_aarch64_neon_fminnmp>;
+defm FMINNM  : SIMDThreeSameVectorFP<0,1,0b11000,"fminnm", int_aarch64_neon_fminnm>;
+defm FMINP   : SIMDThreeSameVectorFP<1,1,0b11110,"fminp", int_aarch64_neon_fminp>;
+defm FMIN    : SIMDThreeSameVectorFP<0,1,0b11110,"fmin", AArch64fmin>;
+
+// NOTE: The operands of the PatFrag are reordered on FMLA/FMLS because the
+// instruction expects the addend first, while the fma intrinsic puts it last.
+defm FMLA     : SIMDThreeSameVectorFPTied<0, 0, 0b11001, "fmla",
+            TriOpFrag<(fma node:$RHS, node:$MHS, node:$LHS)> >;
+defm FMLS     : SIMDThreeSameVectorFPTied<0, 1, 0b11001, "fmls",
+            TriOpFrag<(fma node:$MHS, (fneg node:$RHS), node:$LHS)> >;
+
+// The following def pats catch the case where the LHS of an FMA is negated.
+// The TriOpFrag above catches the case where the middle operand is negated.
+def : Pat<(v2f32 (fma (fneg V64:$Rn), V64:$Rm, V64:$Rd)),
+          (FMLSv2f32 V64:$Rd, V64:$Rn, V64:$Rm)>;
+
+def : Pat<(v4f32 (fma (fneg V128:$Rn), V128:$Rm, V128:$Rd)),
+          (FMLSv4f32 V128:$Rd, V128:$Rn, V128:$Rm)>;
+
+def : Pat<(v2f64 (fma (fneg V128:$Rn), V128:$Rm, V128:$Rd)),
+          (FMLSv2f64 V128:$Rd, V128:$Rn, V128:$Rm)>;
+
+defm FMULX    : SIMDThreeSameVectorFP<0,0,0b11011,"fmulx", int_aarch64_neon_fmulx>;
+defm FMUL     : SIMDThreeSameVectorFP<1,0,0b11011,"fmul", fmul>;
+defm FRECPS   : SIMDThreeSameVectorFP<0,0,0b11111,"frecps", int_aarch64_neon_frecps>;
+defm FRSQRTS  : SIMDThreeSameVectorFP<0,1,0b11111,"frsqrts", int_aarch64_neon_frsqrts>;
+defm FSUB     : SIMDThreeSameVectorFP<0,1,0b11010,"fsub", fsub>;
+defm MLA      : SIMDThreeSameVectorBHSTied<0, 0b10010, "mla",
+                      TriOpFrag<(add node:$LHS, (mul node:$MHS, node:$RHS))> >;
+defm MLS      : SIMDThreeSameVectorBHSTied<1, 0b10010, "mls",
+                      TriOpFrag<(sub node:$LHS, (mul node:$MHS, node:$RHS))> >;
+defm MUL      : SIMDThreeSameVectorBHS<0, 0b10011, "mul", mul>;
+defm PMUL     : SIMDThreeSameVectorB<1, 0b10011, "pmul", int_aarch64_neon_pmul>;
+defm SABA     : SIMDThreeSameVectorBHSTied<0, 0b01111, "saba",
+      TriOpFrag<(add node:$LHS, (int_aarch64_neon_sabd node:$MHS, node:$RHS))> >;
+defm SABD     : SIMDThreeSameVectorBHS<0,0b01110,"sabd", int_aarch64_neon_sabd>;
+defm SHADD    : SIMDThreeSameVectorBHS<0,0b00000,"shadd", int_aarch64_neon_shadd>;
+defm SHSUB    : SIMDThreeSameVectorBHS<0,0b00100,"shsub", int_aarch64_neon_shsub>;
+defm SMAXP    : SIMDThreeSameVectorBHS<0,0b10100,"smaxp", int_aarch64_neon_smaxp>;
+defm SMAX     : SIMDThreeSameVectorBHS<0,0b01100,"smax", int_aarch64_neon_smax>;
+defm SMINP    : SIMDThreeSameVectorBHS<0,0b10101,"sminp", int_aarch64_neon_sminp>;
+defm SMIN     : SIMDThreeSameVectorBHS<0,0b01101,"smin", int_aarch64_neon_smin>;
+defm SQADD    : SIMDThreeSameVector<0,0b00001,"sqadd", int_aarch64_neon_sqadd>;
+defm SQDMULH  : SIMDThreeSameVectorHS<0,0b10110,"sqdmulh",int_aarch64_neon_sqdmulh>;
+defm SQRDMULH : SIMDThreeSameVectorHS<1,0b10110,"sqrdmulh",int_aarch64_neon_sqrdmulh>;
+defm SQRSHL   : SIMDThreeSameVector<0,0b01011,"sqrshl", int_aarch64_neon_sqrshl>;
+defm SQSHL    : SIMDThreeSameVector<0,0b01001,"sqshl", int_aarch64_neon_sqshl>;
+defm SQSUB    : SIMDThreeSameVector<0,0b00101,"sqsub", int_aarch64_neon_sqsub>;
+defm SRHADD   : SIMDThreeSameVectorBHS<0,0b00010,"srhadd",int_aarch64_neon_srhadd>;
+defm SRSHL    : SIMDThreeSameVector<0,0b01010,"srshl", int_aarch64_neon_srshl>;
+defm SSHL     : SIMDThreeSameVector<0,0b01000,"sshl", int_aarch64_neon_sshl>;
+defm SUB      : SIMDThreeSameVector<1,0b10000,"sub", sub>;
+defm UABA     : SIMDThreeSameVectorBHSTied<1, 0b01111, "uaba",
+      TriOpFrag<(add node:$LHS, (int_aarch64_neon_uabd node:$MHS, node:$RHS))> >;
+defm UABD     : SIMDThreeSameVectorBHS<1,0b01110,"uabd", int_aarch64_neon_uabd>;
+defm UHADD    : SIMDThreeSameVectorBHS<1,0b00000,"uhadd", int_aarch64_neon_uhadd>;
+defm UHSUB    : SIMDThreeSameVectorBHS<1,0b00100,"uhsub", int_aarch64_neon_uhsub>;
+defm UMAXP    : SIMDThreeSameVectorBHS<1,0b10100,"umaxp", int_aarch64_neon_umaxp>;
+defm UMAX     : SIMDThreeSameVectorBHS<1,0b01100,"umax", int_aarch64_neon_umax>;
+defm UMINP    : SIMDThreeSameVectorBHS<1,0b10101,"uminp", int_aarch64_neon_uminp>;
+defm UMIN     : SIMDThreeSameVectorBHS<1,0b01101,"umin", int_aarch64_neon_umin>;
+defm UQADD    : SIMDThreeSameVector<1,0b00001,"uqadd", int_aarch64_neon_uqadd>;
+defm UQRSHL   : SIMDThreeSameVector<1,0b01011,"uqrshl", int_aarch64_neon_uqrshl>;
+defm UQSHL    : SIMDThreeSameVector<1,0b01001,"uqshl", int_aarch64_neon_uqshl>;
+defm UQSUB    : SIMDThreeSameVector<1,0b00101,"uqsub", int_aarch64_neon_uqsub>;
+defm URHADD   : SIMDThreeSameVectorBHS<1,0b00010,"urhadd", int_aarch64_neon_urhadd>;
+defm URSHL    : SIMDThreeSameVector<1,0b01010,"urshl", int_aarch64_neon_urshl>;
+defm USHL     : SIMDThreeSameVector<1,0b01000,"ushl", int_aarch64_neon_ushl>;
+
+defm AND : SIMDLogicalThreeVector<0, 0b00, "and", and>;
+defm BIC : SIMDLogicalThreeVector<0, 0b01, "bic",
+                                  BinOpFrag<(and node:$LHS, (vnot node:$RHS))> >;
+defm BIF : SIMDLogicalThreeVector<1, 0b11, "bif">;
+defm BIT : SIMDLogicalThreeVectorTied<1, 0b10, "bit", AArch64bit>;
+defm BSL : SIMDLogicalThreeVectorTied<1, 0b01, "bsl",
+    TriOpFrag<(or (and node:$LHS, node:$MHS), (and (vnot node:$LHS), node:$RHS))>>;
+defm EOR : SIMDLogicalThreeVector<1, 0b00, "eor", xor>;
+defm ORN : SIMDLogicalThreeVector<0, 0b11, "orn",
+                                  BinOpFrag<(or node:$LHS, (vnot node:$RHS))> >;
+defm ORR : SIMDLogicalThreeVector<0, 0b10, "orr", or>;
+
+def : Pat<(AArch64bsl (v8i8 V64:$Rd), V64:$Rn, V64:$Rm),
+          (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>;
+def : Pat<(AArch64bsl (v4i16 V64:$Rd), V64:$Rn, V64:$Rm),
+          (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>;
+def : Pat<(AArch64bsl (v2i32 V64:$Rd), V64:$Rn, V64:$Rm),
+          (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>;
+def : Pat<(AArch64bsl (v1i64 V64:$Rd), V64:$Rn, V64:$Rm),
+          (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>;
+
+def : Pat<(AArch64bsl (v16i8 V128:$Rd), V128:$Rn, V128:$Rm),
+          (BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>;
+def : Pat<(AArch64bsl (v8i16 V128:$Rd), V128:$Rn, V128:$Rm),
+          (BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>;
+def : Pat<(AArch64bsl (v4i32 V128:$Rd), V128:$Rn, V128:$Rm),
+          (BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>;
+def : Pat<(AArch64bsl (v2i64 V128:$Rd), V128:$Rn, V128:$Rm),
+          (BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>;
+
+def : InstAlias<"mov{\t$dst.16b, $src.16b|.16b\t$dst, $src}",
+                (ORRv16i8 V128:$dst, V128:$src, V128:$src), 1>;
+def : InstAlias<"mov{\t$dst.8h, $src.8h|.8h\t$dst, $src}",
+                (ORRv16i8 V128:$dst, V128:$src, V128:$src), 0>;
+def : InstAlias<"mov{\t$dst.4s, $src.4s|.4s\t$dst, $src}",
+                (ORRv16i8 V128:$dst, V128:$src, V128:$src), 0>;
+def : InstAlias<"mov{\t$dst.2d, $src.2d|.2d\t$dst, $src}",
+                (ORRv16i8 V128:$dst, V128:$src, V128:$src), 0>;
+
+def : InstAlias<"mov{\t$dst.8b, $src.8b|.8b\t$dst, $src}",
+                (ORRv8i8 V64:$dst, V64:$src, V64:$src), 1>;
+def : InstAlias<"mov{\t$dst.4h, $src.4h|.4h\t$dst, $src}",
+                (ORRv8i8 V64:$dst, V64:$src, V64:$src), 0>;
+def : InstAlias<"mov{\t$dst.2s, $src.2s|.2s\t$dst, $src}",
+                (ORRv8i8 V64:$dst, V64:$src, V64:$src), 0>;
+def : InstAlias<"mov{\t$dst.1d, $src.1d|.1d\t$dst, $src}",
+                (ORRv8i8 V64:$dst, V64:$src, V64:$src), 0>;
+
+def : InstAlias<"{cmls\t$dst.8b, $src1.8b, $src2.8b" #
+                "|cmls.8b\t$dst, $src1, $src2}",
+                (CMHSv8i8 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{cmls\t$dst.16b, $src1.16b, $src2.16b" #
+                "|cmls.16b\t$dst, $src1, $src2}",
+                (CMHSv16i8 V128:$dst, V128:$src2, V128:$src1), 0>;
+def : InstAlias<"{cmls\t$dst.4h, $src1.4h, $src2.4h" #
+                "|cmls.4h\t$dst, $src1, $src2}",
+                (CMHSv4i16 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{cmls\t$dst.8h, $src1.8h, $src2.8h" #
+                "|cmls.8h\t$dst, $src1, $src2}",
+                (CMHSv8i16 V128:$dst, V128:$src2, V128:$src1), 0>;
+def : InstAlias<"{cmls\t$dst.2s, $src1.2s, $src2.2s" #
+                "|cmls.2s\t$dst, $src1, $src2}",
+                (CMHSv2i32 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{cmls\t$dst.4s, $src1.4s, $src2.4s" #
+                "|cmls.4s\t$dst, $src1, $src2}",
+                (CMHSv4i32 V128:$dst, V128:$src2, V128:$src1), 0>;
+def : InstAlias<"{cmls\t$dst.2d, $src1.2d, $src2.2d" #
+                "|cmls.2d\t$dst, $src1, $src2}",
+                (CMHSv2i64 V128:$dst, V128:$src2, V128:$src1), 0>;
+
+def : InstAlias<"{cmlo\t$dst.8b, $src1.8b, $src2.8b" #
+                "|cmlo.8b\t$dst, $src1, $src2}",
+                (CMHIv8i8 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{cmlo\t$dst.16b, $src1.16b, $src2.16b" #
+                "|cmlo.16b\t$dst, $src1, $src2}",
+                (CMHIv16i8 V128:$dst, V128:$src2, V128:$src1), 0>;
+def : InstAlias<"{cmlo\t$dst.4h, $src1.4h, $src2.4h" #
+                "|cmlo.4h\t$dst, $src1, $src2}",
+                (CMHIv4i16 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{cmlo\t$dst.8h, $src1.8h, $src2.8h" #
+                "|cmlo.8h\t$dst, $src1, $src2}",
+                (CMHIv8i16 V128:$dst, V128:$src2, V128:$src1), 0>;
+def : InstAlias<"{cmlo\t$dst.2s, $src1.2s, $src2.2s" #
+                "|cmlo.2s\t$dst, $src1, $src2}",
+                (CMHIv2i32 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{cmlo\t$dst.4s, $src1.4s, $src2.4s" #
+                "|cmlo.4s\t$dst, $src1, $src2}",
+                (CMHIv4i32 V128:$dst, V128:$src2, V128:$src1), 0>;
+def : InstAlias<"{cmlo\t$dst.2d, $src1.2d, $src2.2d" #
+                "|cmlo.2d\t$dst, $src1, $src2}",
+                (CMHIv2i64 V128:$dst, V128:$src2, V128:$src1), 0>;
+
+def : InstAlias<"{cmle\t$dst.8b, $src1.8b, $src2.8b" #
+                "|cmle.8b\t$dst, $src1, $src2}",
+                (CMGEv8i8 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{cmle\t$dst.16b, $src1.16b, $src2.16b" #
+                "|cmle.16b\t$dst, $src1, $src2}",
+                (CMGEv16i8 V128:$dst, V128:$src2, V128:$src1), 0>;
+def : InstAlias<"{cmle\t$dst.4h, $src1.4h, $src2.4h" #
+                "|cmle.4h\t$dst, $src1, $src2}",
+                (CMGEv4i16 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{cmle\t$dst.8h, $src1.8h, $src2.8h" #
+                "|cmle.8h\t$dst, $src1, $src2}",
+                (CMGEv8i16 V128:$dst, V128:$src2, V128:$src1), 0>;
+def : InstAlias<"{cmle\t$dst.2s, $src1.2s, $src2.2s" #
+                "|cmle.2s\t$dst, $src1, $src2}",
+                (CMGEv2i32 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{cmle\t$dst.4s, $src1.4s, $src2.4s" #
+                "|cmle.4s\t$dst, $src1, $src2}",
+                (CMGEv4i32 V128:$dst, V128:$src2, V128:$src1), 0>;
+def : InstAlias<"{cmle\t$dst.2d, $src1.2d, $src2.2d" #
+                "|cmle.2d\t$dst, $src1, $src2}",
+                (CMGEv2i64 V128:$dst, V128:$src2, V128:$src1), 0>;
+
+def : InstAlias<"{cmlt\t$dst.8b, $src1.8b, $src2.8b" #
+                "|cmlt.8b\t$dst, $src1, $src2}",
+                (CMGTv8i8 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{cmlt\t$dst.16b, $src1.16b, $src2.16b" #
+                "|cmlt.16b\t$dst, $src1, $src2}",
+                (CMGTv16i8 V128:$dst, V128:$src2, V128:$src1), 0>;
+def : InstAlias<"{cmlt\t$dst.4h, $src1.4h, $src2.4h" #
+                "|cmlt.4h\t$dst, $src1, $src2}",
+                (CMGTv4i16 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{cmlt\t$dst.8h, $src1.8h, $src2.8h" #
+                "|cmlt.8h\t$dst, $src1, $src2}",
+                (CMGTv8i16 V128:$dst, V128:$src2, V128:$src1), 0>;
+def : InstAlias<"{cmlt\t$dst.2s, $src1.2s, $src2.2s" #
+                "|cmlt.2s\t$dst, $src1, $src2}",
+                (CMGTv2i32 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{cmlt\t$dst.4s, $src1.4s, $src2.4s" #
+                "|cmlt.4s\t$dst, $src1, $src2}",
+                (CMGTv4i32 V128:$dst, V128:$src2, V128:$src1), 0>;
+def : InstAlias<"{cmlt\t$dst.2d, $src1.2d, $src2.2d" #
+                "|cmlt.2d\t$dst, $src1, $src2}",
+                (CMGTv2i64 V128:$dst, V128:$src2, V128:$src1), 0>;
+
+def : InstAlias<"{fcmle\t$dst.2s, $src1.2s, $src2.2s" #
+                "|fcmle.2s\t$dst, $src1, $src2}",
+                (FCMGEv2f32 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{fcmle\t$dst.4s, $src1.4s, $src2.4s" #
+                "|fcmle.4s\t$dst, $src1, $src2}",
+                (FCMGEv4f32 V128:$dst, V128:$src2, V128:$src1), 0>;
+def : InstAlias<"{fcmle\t$dst.2d, $src1.2d, $src2.2d" #
+                "|fcmle.2d\t$dst, $src1, $src2}",
+                (FCMGEv2f64 V128:$dst, V128:$src2, V128:$src1), 0>;
+
+def : InstAlias<"{fcmlt\t$dst.2s, $src1.2s, $src2.2s" #
+                "|fcmlt.2s\t$dst, $src1, $src2}",
+                (FCMGTv2f32 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{fcmlt\t$dst.4s, $src1.4s, $src2.4s" #
+                "|fcmlt.4s\t$dst, $src1, $src2}",
+                (FCMGTv4f32 V128:$dst, V128:$src2, V128:$src1), 0>;
+def : InstAlias<"{fcmlt\t$dst.2d, $src1.2d, $src2.2d" #
+                "|fcmlt.2d\t$dst, $src1, $src2}",
+                (FCMGTv2f64 V128:$dst, V128:$src2, V128:$src1), 0>;
+
+def : InstAlias<"{facle\t$dst.2s, $src1.2s, $src2.2s" #
+                "|facle.2s\t$dst, $src1, $src2}",
+                (FACGEv2f32 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{facle\t$dst.4s, $src1.4s, $src2.4s" #
+                "|facle.4s\t$dst, $src1, $src2}",
+                (FACGEv4f32 V128:$dst, V128:$src2, V128:$src1), 0>;
+def : InstAlias<"{facle\t$dst.2d, $src1.2d, $src2.2d" #
+                "|facle.2d\t$dst, $src1, $src2}",
+                (FACGEv2f64 V128:$dst, V128:$src2, V128:$src1), 0>;
+
+def : InstAlias<"{faclt\t$dst.2s, $src1.2s, $src2.2s" #
+                "|faclt.2s\t$dst, $src1, $src2}",
+                (FACGTv2f32 V64:$dst, V64:$src2, V64:$src1), 0>;
+def : InstAlias<"{faclt\t$dst.4s, $src1.4s, $src2.4s" #
+                "|faclt.4s\t$dst, $src1, $src2}",
+                (FACGTv4f32 V128:$dst, V128:$src2, V128:$src1), 0>;
+def : InstAlias<"{faclt\t$dst.2d, $src1.2d, $src2.2d" #
+                "|faclt.2d\t$dst, $src1, $src2}",
+                (FACGTv2f64 V128:$dst, V128:$src2, V128:$src1), 0>;
 
 //===----------------------------------------------------------------------===//
-// Tail call handling
+// Advanced SIMD three scalar instructions.
 //===----------------------------------------------------------------------===//
 
-let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [XSP] in {
-  def TC_RETURNdi
-    : PseudoInst<(outs), (ins i64imm:$dst, i32imm:$FPDiff),
-                 [(AArch64tcret tglobaladdr:$dst, (i32 timm:$FPDiff))]>;
-
-  def TC_RETURNxi
-    : PseudoInst<(outs), (ins tcGPR64:$dst, i32imm:$FPDiff),
-                 [(AArch64tcret i64:$dst, (i32 timm:$FPDiff))]>;
-}
-
-let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,
-    Uses = [XSP] in {
-  def TAIL_Bimm : A64PseudoExpand<(outs), (ins bimm_target:$Label), [],
-                                  (Bimm bimm_target:$Label)>;
-
-  def TAIL_BRx : A64PseudoExpand<(outs), (ins tcGPR64:$Rd), [],
-                                 (BRx GPR64:$Rd)>;
-}
-
-
-def : Pat<(AArch64tcret texternalsym:$dst, (i32 timm:$FPDiff)),
-          (TC_RETURNdi texternalsym:$dst, imm:$FPDiff)>;
+defm ADD      : SIMDThreeScalarD<0, 0b10000, "add", add>;
+defm CMEQ     : SIMDThreeScalarD<1, 0b10001, "cmeq", AArch64cmeq>;
+defm CMGE     : SIMDThreeScalarD<0, 0b00111, "cmge", AArch64cmge>;
+defm CMGT     : SIMDThreeScalarD<0, 0b00110, "cmgt", AArch64cmgt>;
+defm CMHI     : SIMDThreeScalarD<1, 0b00110, "cmhi", AArch64cmhi>;
+defm CMHS     : SIMDThreeScalarD<1, 0b00111, "cmhs", AArch64cmhs>;
+defm CMTST    : SIMDThreeScalarD<0, 0b10001, "cmtst", AArch64cmtst>;
+defm FABD     : SIMDThreeScalarSD<1, 1, 0b11010, "fabd", int_aarch64_sisd_fabd>;
+def : Pat<(v1f64 (int_aarch64_neon_fabd (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
+          (FABD64 FPR64:$Rn, FPR64:$Rm)>;
+defm FACGE    : SIMDThreeScalarFPCmp<1, 0, 0b11101, "facge",
+                                     int_aarch64_neon_facge>;
+defm FACGT    : SIMDThreeScalarFPCmp<1, 1, 0b11101, "facgt",
+                                     int_aarch64_neon_facgt>;
+defm FCMEQ    : SIMDThreeScalarFPCmp<0, 0, 0b11100, "fcmeq", AArch64fcmeq>;
+defm FCMGE    : SIMDThreeScalarFPCmp<1, 0, 0b11100, "fcmge", AArch64fcmge>;
+defm FCMGT    : SIMDThreeScalarFPCmp<1, 1, 0b11100, "fcmgt", AArch64fcmgt>;
+defm FMULX    : SIMDThreeScalarSD<0, 0, 0b11011, "fmulx", int_aarch64_neon_fmulx>;
+defm FRECPS   : SIMDThreeScalarSD<0, 0, 0b11111, "frecps", int_aarch64_neon_frecps>;
+defm FRSQRTS  : SIMDThreeScalarSD<0, 1, 0b11111, "frsqrts", int_aarch64_neon_frsqrts>;
+defm SQADD    : SIMDThreeScalarBHSD<0, 0b00001, "sqadd", int_aarch64_neon_sqadd>;
+defm SQDMULH  : SIMDThreeScalarHS<  0, 0b10110, "sqdmulh", int_aarch64_neon_sqdmulh>;
+defm SQRDMULH : SIMDThreeScalarHS<  1, 0b10110, "sqrdmulh", int_aarch64_neon_sqrdmulh>;
+defm SQRSHL   : SIMDThreeScalarBHSD<0, 0b01011, "sqrshl",int_aarch64_neon_sqrshl>;
+defm SQSHL    : SIMDThreeScalarBHSD<0, 0b01001, "sqshl", int_aarch64_neon_sqshl>;
+defm SQSUB    : SIMDThreeScalarBHSD<0, 0b00101, "sqsub", int_aarch64_neon_sqsub>;
+defm SRSHL    : SIMDThreeScalarD<   0, 0b01010, "srshl", int_aarch64_neon_srshl>;
+defm SSHL     : SIMDThreeScalarD<   0, 0b01000, "sshl", int_aarch64_neon_sshl>;
+defm SUB      : SIMDThreeScalarD<   1, 0b10000, "sub", sub>;
+defm UQADD    : SIMDThreeScalarBHSD<1, 0b00001, "uqadd", int_aarch64_neon_uqadd>;
+defm UQRSHL   : SIMDThreeScalarBHSD<1, 0b01011, "uqrshl",int_aarch64_neon_uqrshl>;
+defm UQSHL    : SIMDThreeScalarBHSD<1, 0b01001, "uqshl", int_aarch64_neon_uqshl>;
+defm UQSUB    : SIMDThreeScalarBHSD<1, 0b00101, "uqsub", int_aarch64_neon_uqsub>;
+defm URSHL    : SIMDThreeScalarD<   1, 0b01010, "urshl", int_aarch64_neon_urshl>;
+defm USHL     : SIMDThreeScalarD<   1, 0b01000, "ushl", int_aarch64_neon_ushl>;
+
+def : InstAlias<"cmls $dst, $src1, $src2",
+                (CMHSv1i64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>;
+def : InstAlias<"cmle $dst, $src1, $src2",
+                (CMGEv1i64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>;
+def : InstAlias<"cmlo $dst, $src1, $src2",
+                (CMHIv1i64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>;
+def : InstAlias<"cmlt $dst, $src1, $src2",
+                (CMGTv1i64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>;
+def : InstAlias<"fcmle $dst, $src1, $src2",
+                (FCMGE32 FPR32:$dst, FPR32:$src2, FPR32:$src1), 0>;
+def : InstAlias<"fcmle $dst, $src1, $src2",
+                (FCMGE64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>;
+def : InstAlias<"fcmlt $dst, $src1, $src2",
+                (FCMGT32 FPR32:$dst, FPR32:$src2, FPR32:$src1), 0>;
+def : InstAlias<"fcmlt $dst, $src1, $src2",
+                (FCMGT64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>;
+def : InstAlias<"facle $dst, $src1, $src2",
+                (FACGE32 FPR32:$dst, FPR32:$src2, FPR32:$src1), 0>;
+def : InstAlias<"facle $dst, $src1, $src2",
+                (FACGE64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>;
+def : InstAlias<"faclt $dst, $src1, $src2",
+                (FACGT32 FPR32:$dst, FPR32:$src2, FPR32:$src1), 0>;
+def : InstAlias<"faclt $dst, $src1, $src2",
+                (FACGT64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>;
 
 //===----------------------------------------------------------------------===//
-// Thread local storage
+// Advanced SIMD three scalar instructions (mixed operands).
 //===----------------------------------------------------------------------===//
+defm SQDMULL  : SIMDThreeScalarMixedHS<0, 0b11010, "sqdmull",
+                                       int_aarch64_neon_sqdmulls_scalar>;
+defm SQDMLAL  : SIMDThreeScalarMixedTiedHS<0, 0b10010, "sqdmlal">;
+defm SQDMLSL  : SIMDThreeScalarMixedTiedHS<0, 0b10110, "sqdmlsl">;
+
+def : Pat<(i64 (int_aarch64_neon_sqadd (i64 FPR64:$Rd),
+                   (i64 (int_aarch64_neon_sqdmulls_scalar (i32 FPR32:$Rn),
+                                                        (i32 FPR32:$Rm))))),
+          (SQDMLALi32 FPR64:$Rd, FPR32:$Rn, FPR32:$Rm)>;
+def : Pat<(i64 (int_aarch64_neon_sqsub (i64 FPR64:$Rd),
+                   (i64 (int_aarch64_neon_sqdmulls_scalar (i32 FPR32:$Rn),
+                                                        (i32 FPR32:$Rm))))),
+          (SQDMLSLi32 FPR64:$Rd, FPR32:$Rn, FPR32:$Rm)>;
 
-// This is a pseudo-instruction representing the ".tlsdesccall" directive in
-// assembly. Its effect is to insert an R_AARCH64_TLSDESC_CALL relocation at the
-// current location. It should always be immediately followed by a BLR
-// instruction, and is intended solely for relaxation by the linker.
-
-def : Pat<(A64threadpointer), (MRSxi 0xde82)>;
-
-def TLSDESCCALL : PseudoInst<(outs), (ins i64imm:$Lbl), []> {
-  let hasSideEffects = 1;
-}
-
-def TLSDESC_BLRx : PseudoInst<(outs), (ins GPR64:$Rn, i64imm:$Var),
-                            [(A64tlsdesc_blr i64:$Rn, tglobaltlsaddr:$Var)]> {
-  let isCall = 1;
-  let Defs = [X30];
-}
+//===----------------------------------------------------------------------===//
+// Advanced SIMD two scalar instructions.
+//===----------------------------------------------------------------------===//
 
-def : Pat<(A64tlsdesc_blr i64:$Rn, texternalsym:$Var),
-          (TLSDESC_BLRx $Rn, texternalsym:$Var)>;
+defm ABS    : SIMDTwoScalarD<    0, 0b01011, "abs", int_aarch64_neon_abs>;
+defm CMEQ   : SIMDCmpTwoScalarD< 0, 0b01001, "cmeq", AArch64cmeqz>;
+defm CMGE   : SIMDCmpTwoScalarD< 1, 0b01000, "cmge", AArch64cmgez>;
+defm CMGT   : SIMDCmpTwoScalarD< 0, 0b01000, "cmgt", AArch64cmgtz>;
+defm CMLE   : SIMDCmpTwoScalarD< 1, 0b01001, "cmle", AArch64cmlez>;
+defm CMLT   : SIMDCmpTwoScalarD< 0, 0b01010, "cmlt", AArch64cmltz>;
+defm FCMEQ  : SIMDCmpTwoScalarSD<0, 1, 0b01101, "fcmeq", AArch64fcmeqz>;
+defm FCMGE  : SIMDCmpTwoScalarSD<1, 1, 0b01100, "fcmge", AArch64fcmgez>;
+defm FCMGT  : SIMDCmpTwoScalarSD<0, 1, 0b01100, "fcmgt", AArch64fcmgtz>;
+defm FCMLE  : SIMDCmpTwoScalarSD<1, 1, 0b01101, "fcmle", AArch64fcmlez>;
+defm FCMLT  : SIMDCmpTwoScalarSD<0, 1, 0b01110, "fcmlt", AArch64fcmltz>;
+defm FCVTAS : SIMDTwoScalarSD<   0, 0, 0b11100, "fcvtas">;
+defm FCVTAU : SIMDTwoScalarSD<   1, 0, 0b11100, "fcvtau">;
+defm FCVTMS : SIMDTwoScalarSD<   0, 0, 0b11011, "fcvtms">;
+defm FCVTMU : SIMDTwoScalarSD<   1, 0, 0b11011, "fcvtmu">;
+defm FCVTNS : SIMDTwoScalarSD<   0, 0, 0b11010, "fcvtns">;
+defm FCVTNU : SIMDTwoScalarSD<   1, 0, 0b11010, "fcvtnu">;
+defm FCVTPS : SIMDTwoScalarSD<   0, 1, 0b11010, "fcvtps">;
+defm FCVTPU : SIMDTwoScalarSD<   1, 1, 0b11010, "fcvtpu">;
+def  FCVTXNv1i64 : SIMDInexactCvtTwoScalar<0b10110, "fcvtxn">;
+defm FCVTZS : SIMDTwoScalarSD<   0, 1, 0b11011, "fcvtzs">;
+defm FCVTZU : SIMDTwoScalarSD<   1, 1, 0b11011, "fcvtzu">;
+defm FRECPE : SIMDTwoScalarSD<   0, 1, 0b11101, "frecpe">;
+defm FRECPX : SIMDTwoScalarSD<   0, 1, 0b11111, "frecpx">;
+defm FRSQRTE : SIMDTwoScalarSD<  1, 1, 0b11101, "frsqrte">;
+defm NEG    : SIMDTwoScalarD<    1, 0b01011, "neg",
+                                 UnOpFrag<(sub immAllZerosV, node:$LHS)> >;
+defm SCVTF  : SIMDTwoScalarCVTSD<   0, 0, 0b11101, "scvtf", AArch64sitof>;
+defm SQABS  : SIMDTwoScalarBHSD< 0, 0b00111, "sqabs", int_aarch64_neon_sqabs>;
+defm SQNEG  : SIMDTwoScalarBHSD< 1, 0b00111, "sqneg", int_aarch64_neon_sqneg>;
+defm SQXTN  : SIMDTwoScalarMixedBHS< 0, 0b10100, "sqxtn", int_aarch64_neon_scalar_sqxtn>;
+defm SQXTUN : SIMDTwoScalarMixedBHS< 1, 0b10010, "sqxtun", int_aarch64_neon_scalar_sqxtun>;
+defm SUQADD : SIMDTwoScalarBHSDTied< 0, 0b00011, "suqadd",
+                                     int_aarch64_neon_suqadd>;
+defm UCVTF  : SIMDTwoScalarCVTSD<   1, 0, 0b11101, "ucvtf", AArch64uitof>;
+defm UQXTN  : SIMDTwoScalarMixedBHS<1, 0b10100, "uqxtn", int_aarch64_neon_scalar_uqxtn>;
+defm USQADD : SIMDTwoScalarBHSDTied< 1, 0b00011, "usqadd",
+                                    int_aarch64_neon_usqadd>;
+
+def : Pat<(AArch64neg (v1i64 V64:$Rn)), (NEGv1i64 V64:$Rn)>;
+
+def : Pat<(v1i64 (int_aarch64_neon_fcvtas (v1f64 FPR64:$Rn))),
+          (FCVTASv1i64 FPR64:$Rn)>;
+def : Pat<(v1i64 (int_aarch64_neon_fcvtau (v1f64 FPR64:$Rn))),
+          (FCVTAUv1i64 FPR64:$Rn)>;
+def : Pat<(v1i64 (int_aarch64_neon_fcvtms (v1f64 FPR64:$Rn))),
+          (FCVTMSv1i64 FPR64:$Rn)>;
+def : Pat<(v1i64 (int_aarch64_neon_fcvtmu (v1f64 FPR64:$Rn))),
+          (FCVTMUv1i64 FPR64:$Rn)>;
+def : Pat<(v1i64 (int_aarch64_neon_fcvtns (v1f64 FPR64:$Rn))),
+          (FCVTNSv1i64 FPR64:$Rn)>;
+def : Pat<(v1i64 (int_aarch64_neon_fcvtnu (v1f64 FPR64:$Rn))),
+          (FCVTNUv1i64 FPR64:$Rn)>;
+def : Pat<(v1i64 (int_aarch64_neon_fcvtps (v1f64 FPR64:$Rn))),
+          (FCVTPSv1i64 FPR64:$Rn)>;
+def : Pat<(v1i64 (int_aarch64_neon_fcvtpu (v1f64 FPR64:$Rn))),
+          (FCVTPUv1i64 FPR64:$Rn)>;
+
+def : Pat<(f32 (int_aarch64_neon_frecpe (f32 FPR32:$Rn))),
+          (FRECPEv1i32 FPR32:$Rn)>;
+def : Pat<(f64 (int_aarch64_neon_frecpe (f64 FPR64:$Rn))),
+          (FRECPEv1i64 FPR64:$Rn)>;
+def : Pat<(v1f64 (int_aarch64_neon_frecpe (v1f64 FPR64:$Rn))),
+          (FRECPEv1i64 FPR64:$Rn)>;
+
+def : Pat<(f32 (int_aarch64_neon_frecpx (f32 FPR32:$Rn))),
+          (FRECPXv1i32 FPR32:$Rn)>;
+def : Pat<(f64 (int_aarch64_neon_frecpx (f64 FPR64:$Rn))),
+          (FRECPXv1i64 FPR64:$Rn)>;
+
+def : Pat<(f32 (int_aarch64_neon_frsqrte (f32 FPR32:$Rn))),
+          (FRSQRTEv1i32 FPR32:$Rn)>;
+def : Pat<(f64 (int_aarch64_neon_frsqrte (f64 FPR64:$Rn))),
+          (FRSQRTEv1i64 FPR64:$Rn)>;
+def : Pat<(v1f64 (int_aarch64_neon_frsqrte (v1f64 FPR64:$Rn))),
+          (FRSQRTEv1i64 FPR64:$Rn)>;
+
+// If an integer is about to be converted to a floating point value,
+// just load it on the floating point unit.
+// Here are the patterns for 8 and 16-bits to float.
+// 8-bits -> float.
+multiclass UIntToFPROLoadPat<ValueType DstTy, ValueType SrcTy,
+                             SDPatternOperator loadop, Instruction UCVTF,
+                             ROAddrMode ro, Instruction LDRW, Instruction LDRX,
+                             SubRegIndex sub> {
+  def : Pat<(DstTy (uint_to_fp (SrcTy
+                     (loadop (ro.Wpat GPR64sp:$Rn, GPR32:$Rm,
+                                      ro.Wext:$extend))))),
+           (UCVTF (INSERT_SUBREG (DstTy (IMPLICIT_DEF)),
+                                 (LDRW GPR64sp:$Rn, GPR32:$Rm, ro.Wext:$extend),
+                                 sub))>;
+
+  def : Pat<(DstTy (uint_to_fp (SrcTy
+                     (loadop (ro.Xpat GPR64sp:$Rn, GPR64:$Rm,
+                                      ro.Wext:$extend))))),
+           (UCVTF (INSERT_SUBREG (DstTy (IMPLICIT_DEF)),
+                                 (LDRX GPR64sp:$Rn, GPR64:$Rm, ro.Xext:$extend),
+                                 sub))>;
+}
+
+defm : UIntToFPROLoadPat<f32, i32, zextloadi8,
+                         UCVTFv1i32, ro8, LDRBroW, LDRBroX, bsub>;
+def : Pat <(f32 (uint_to_fp (i32
+               (zextloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))),
+           (UCVTFv1i32 (INSERT_SUBREG (f32 (IMPLICIT_DEF)),
+                          (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub))>;
+def : Pat <(f32 (uint_to_fp (i32
+                     (zextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))))),
+           (UCVTFv1i32 (INSERT_SUBREG (f32 (IMPLICIT_DEF)),
+                          (LDURBi GPR64sp:$Rn, simm9:$offset), bsub))>;
+// 16-bits -> float.
+defm : UIntToFPROLoadPat<f32, i32, zextloadi16,
+                         UCVTFv1i32, ro16, LDRHroW, LDRHroX, hsub>;
+def : Pat <(f32 (uint_to_fp (i32
+                  (zextloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))),
+           (UCVTFv1i32 (INSERT_SUBREG (f32 (IMPLICIT_DEF)),
+                          (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub))>;
+def : Pat <(f32 (uint_to_fp (i32
+                  (zextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))))),
+           (UCVTFv1i32 (INSERT_SUBREG (f32 (IMPLICIT_DEF)),
+                          (LDURHi GPR64sp:$Rn, simm9:$offset), hsub))>;
+// 32-bits are handled in target specific dag combine:
+// performIntToFpCombine.
+// 64-bits integer to 32-bits floating point, not possible with
+// UCVTF on floating point registers (both source and destination
+// must have the same size).
+
+// Here are the patterns for 8, 16, 32, and 64-bits to double.
+// 8-bits -> double.
+defm : UIntToFPROLoadPat<f64, i32, zextloadi8,
+                         UCVTFv1i64, ro8, LDRBroW, LDRBroX, bsub>;
+def : Pat <(f64 (uint_to_fp (i32
+                    (zextloadi8 (am_indexed8 GPR64sp:$Rn, uimm12s1:$offset))))),
+           (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
+                          (LDRBui GPR64sp:$Rn, uimm12s1:$offset), bsub))>;
+def : Pat <(f64 (uint_to_fp (i32
+                  (zextloadi8 (am_unscaled8 GPR64sp:$Rn, simm9:$offset))))),
+           (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
+                          (LDURBi GPR64sp:$Rn, simm9:$offset), bsub))>;
+// 16-bits -> double.
+defm : UIntToFPROLoadPat<f64, i32, zextloadi16,
+                         UCVTFv1i64, ro16, LDRHroW, LDRHroX, hsub>;
+def : Pat <(f64 (uint_to_fp (i32
+                  (zextloadi16 (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))))),
+           (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
+                          (LDRHui GPR64sp:$Rn, uimm12s2:$offset), hsub))>;
+def : Pat <(f64 (uint_to_fp (i32
+                  (zextloadi16 (am_unscaled16 GPR64sp:$Rn, simm9:$offset))))),
+           (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
+                          (LDURHi GPR64sp:$Rn, simm9:$offset), hsub))>;
+// 32-bits -> double.
+defm : UIntToFPROLoadPat<f64, i32, load,
+                         UCVTFv1i64, ro32, LDRSroW, LDRSroX, ssub>;
+def : Pat <(f64 (uint_to_fp (i32
+                  (load (am_indexed32 GPR64sp:$Rn, uimm12s4:$offset))))),
+           (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
+                          (LDRSui GPR64sp:$Rn, uimm12s4:$offset), ssub))>;
+def : Pat <(f64 (uint_to_fp (i32
+                  (load (am_unscaled32 GPR64sp:$Rn, simm9:$offset))))),
+           (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
+                          (LDURSi GPR64sp:$Rn, simm9:$offset), ssub))>;
+// 64-bits -> double are handled in target specific dag combine:
+// performIntToFpCombine.
 
 //===----------------------------------------------------------------------===//
-// Bitfield patterns
+// Advanced SIMD three different-sized vector instructions.
 //===----------------------------------------------------------------------===//
 
-def bfi32_lsb_to_immr : SDNodeXForm<imm, [{
-  return CurDAG->getTargetConstant((32 - N->getZExtValue()) % 32, MVT::i64);
+defm ADDHN  : SIMDNarrowThreeVectorBHS<0,0b0100,"addhn", int_aarch64_neon_addhn>;
+defm SUBHN  : SIMDNarrowThreeVectorBHS<0,0b0110,"subhn", int_aarch64_neon_subhn>;
+defm RADDHN : SIMDNarrowThreeVectorBHS<1,0b0100,"raddhn",int_aarch64_neon_raddhn>;
+defm RSUBHN : SIMDNarrowThreeVectorBHS<1,0b0110,"rsubhn",int_aarch64_neon_rsubhn>;
+defm PMULL  : SIMDDifferentThreeVectorBD<0,0b1110,"pmull",int_aarch64_neon_pmull>;
+defm SABAL  : SIMDLongThreeVectorTiedBHSabal<0,0b0101,"sabal",
+                                             int_aarch64_neon_sabd>;
+defm SABDL   : SIMDLongThreeVectorBHSabdl<0, 0b0111, "sabdl",
+                                          int_aarch64_neon_sabd>;
+defm SADDL   : SIMDLongThreeVectorBHS<   0, 0b0000, "saddl",
+            BinOpFrag<(add (sext node:$LHS), (sext node:$RHS))>>;
+defm SADDW   : SIMDWideThreeVectorBHS<   0, 0b0001, "saddw",
+                 BinOpFrag<(add node:$LHS, (sext node:$RHS))>>;
+defm SMLAL   : SIMDLongThreeVectorTiedBHS<0, 0b1000, "smlal",
+    TriOpFrag<(add node:$LHS, (int_aarch64_neon_smull node:$MHS, node:$RHS))>>;
+defm SMLSL   : SIMDLongThreeVectorTiedBHS<0, 0b1010, "smlsl",
+    TriOpFrag<(sub node:$LHS, (int_aarch64_neon_smull node:$MHS, node:$RHS))>>;
+defm SMULL   : SIMDLongThreeVectorBHS<0, 0b1100, "smull", int_aarch64_neon_smull>;
+defm SQDMLAL : SIMDLongThreeVectorSQDMLXTiedHS<0, 0b1001, "sqdmlal",
+                                               int_aarch64_neon_sqadd>;
+defm SQDMLSL : SIMDLongThreeVectorSQDMLXTiedHS<0, 0b1011, "sqdmlsl",
+                                               int_aarch64_neon_sqsub>;
+defm SQDMULL : SIMDLongThreeVectorHS<0, 0b1101, "sqdmull",
+                                     int_aarch64_neon_sqdmull>;
+defm SSUBL   : SIMDLongThreeVectorBHS<0, 0b0010, "ssubl",
+                 BinOpFrag<(sub (sext node:$LHS), (sext node:$RHS))>>;
+defm SSUBW   : SIMDWideThreeVectorBHS<0, 0b0011, "ssubw",
+                 BinOpFrag<(sub node:$LHS, (sext node:$RHS))>>;
+defm UABAL   : SIMDLongThreeVectorTiedBHSabal<1, 0b0101, "uabal",
+                                              int_aarch64_neon_uabd>;
+defm UABDL   : SIMDLongThreeVectorBHSabdl<1, 0b0111, "uabdl",
+                                          int_aarch64_neon_uabd>;
+defm UADDL   : SIMDLongThreeVectorBHS<1, 0b0000, "uaddl",
+                 BinOpFrag<(add (zext node:$LHS), (zext node:$RHS))>>;
+defm UADDW   : SIMDWideThreeVectorBHS<1, 0b0001, "uaddw",
+                 BinOpFrag<(add node:$LHS, (zext node:$RHS))>>;
+defm UMLAL   : SIMDLongThreeVectorTiedBHS<1, 0b1000, "umlal",
+    TriOpFrag<(add node:$LHS, (int_aarch64_neon_umull node:$MHS, node:$RHS))>>;
+defm UMLSL   : SIMDLongThreeVectorTiedBHS<1, 0b1010, "umlsl",
+    TriOpFrag<(sub node:$LHS, (int_aarch64_neon_umull node:$MHS, node:$RHS))>>;
+defm UMULL   : SIMDLongThreeVectorBHS<1, 0b1100, "umull", int_aarch64_neon_umull>;
+defm USUBL   : SIMDLongThreeVectorBHS<1, 0b0010, "usubl",
+                 BinOpFrag<(sub (zext node:$LHS), (zext node:$RHS))>>;
+defm USUBW   : SIMDWideThreeVectorBHS<   1, 0b0011, "usubw",
+                 BinOpFrag<(sub node:$LHS, (zext node:$RHS))>>;
+
+// Patterns for 64-bit pmull
+def : Pat<(int_aarch64_neon_pmull64 V64:$Rn, V64:$Rm),
+          (PMULLv1i64 V64:$Rn, V64:$Rm)>;
+def : Pat<(int_aarch64_neon_pmull64 (vector_extract (v2i64 V128:$Rn), (i64 1)),
+                                  (vector_extract (v2i64 V128:$Rm), (i64 1))),
+          (PMULLv2i64 V128:$Rn, V128:$Rm)>;
+
+// CodeGen patterns for addhn and subhn instructions, which can actually be
+// written in LLVM IR without too much difficulty.
+
+// ADDHN
+def : Pat<(v8i8 (trunc (v8i16 (AArch64vlshr (add V128:$Rn, V128:$Rm), (i32 8))))),
+          (ADDHNv8i16_v8i8 V128:$Rn, V128:$Rm)>;
+def : Pat<(v4i16 (trunc (v4i32 (AArch64vlshr (add V128:$Rn, V128:$Rm),
+                                           (i32 16))))),
+          (ADDHNv4i32_v4i16 V128:$Rn, V128:$Rm)>;
+def : Pat<(v2i32 (trunc (v2i64 (AArch64vlshr (add V128:$Rn, V128:$Rm),
+                                           (i32 32))))),
+          (ADDHNv2i64_v2i32 V128:$Rn, V128:$Rm)>;
+def : Pat<(concat_vectors (v8i8 V64:$Rd),
+                          (trunc (v8i16 (AArch64vlshr (add V128:$Rn, V128:$Rm),
+                                                    (i32 8))))),
+          (ADDHNv8i16_v16i8 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub),
+                            V128:$Rn, V128:$Rm)>;
+def : Pat<(concat_vectors (v4i16 V64:$Rd),
+                          (trunc (v4i32 (AArch64vlshr (add V128:$Rn, V128:$Rm),
+                                                    (i32 16))))),
+          (ADDHNv4i32_v8i16 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub),
+                            V128:$Rn, V128:$Rm)>;
+def : Pat<(concat_vectors (v2i32 V64:$Rd),
+                          (trunc (v2i64 (AArch64vlshr (add V128:$Rn, V128:$Rm),
+                                                    (i32 32))))),
+          (ADDHNv2i64_v4i32 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub),
+                            V128:$Rn, V128:$Rm)>;
+
+// SUBHN
+def : Pat<(v8i8 (trunc (v8i16 (AArch64vlshr (sub V128:$Rn, V128:$Rm), (i32 8))))),
+          (SUBHNv8i16_v8i8 V128:$Rn, V128:$Rm)>;
+def : Pat<(v4i16 (trunc (v4i32 (AArch64vlshr (sub V128:$Rn, V128:$Rm),
+                                           (i32 16))))),
+          (SUBHNv4i32_v4i16 V128:$Rn, V128:$Rm)>;
+def : Pat<(v2i32 (trunc (v2i64 (AArch64vlshr (sub V128:$Rn, V128:$Rm),
+                                           (i32 32))))),
+          (SUBHNv2i64_v2i32 V128:$Rn, V128:$Rm)>;
+def : Pat<(concat_vectors (v8i8 V64:$Rd),
+                          (trunc (v8i16 (AArch64vlshr (sub V128:$Rn, V128:$Rm),
+                                                    (i32 8))))),
+          (SUBHNv8i16_v16i8 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub),
+                            V128:$Rn, V128:$Rm)>;
+def : Pat<(concat_vectors (v4i16 V64:$Rd),
+                          (trunc (v4i32 (AArch64vlshr (sub V128:$Rn, V128:$Rm),
+                                                    (i32 16))))),
+          (SUBHNv4i32_v8i16 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub),
+                            V128:$Rn, V128:$Rm)>;
+def : Pat<(concat_vectors (v2i32 V64:$Rd),
+                          (trunc (v2i64 (AArch64vlshr (sub V128:$Rn, V128:$Rm),
+                                                    (i32 32))))),
+          (SUBHNv2i64_v4i32 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub),
+                            V128:$Rn, V128:$Rm)>;
+
+//----------------------------------------------------------------------------
+// AdvSIMD bitwise extract from vector instruction.
+//----------------------------------------------------------------------------
+
+defm EXT : SIMDBitwiseExtract<"ext">;
+
+def : Pat<(v4i16 (AArch64ext V64:$Rn, V64:$Rm, (i32 imm:$imm))),
+          (EXTv8i8 V64:$Rn, V64:$Rm, imm:$imm)>;
+def : Pat<(v8i16 (AArch64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))),
+          (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>;
+def : Pat<(v2i32 (AArch64ext V64:$Rn, V64:$Rm, (i32 imm:$imm))),
+          (EXTv8i8 V64:$Rn, V64:$Rm, imm:$imm)>;
+def : Pat<(v2f32 (AArch64ext V64:$Rn, V64:$Rm, (i32 imm:$imm))),
+          (EXTv8i8 V64:$Rn, V64:$Rm, imm:$imm)>;
+def : Pat<(v4i32 (AArch64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))),
+          (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>;
+def : Pat<(v4f32 (AArch64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))),
+          (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>;
+def : Pat<(v2i64 (AArch64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))),
+          (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>;
+def : Pat<(v2f64 (AArch64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))),
+          (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>;
+
+// We use EXT to handle extract_subvector to copy the upper 64-bits of a
+// 128-bit vector.
+def : Pat<(v8i8  (extract_subvector V128:$Rn, (i64 8))),
+          (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>;
+def : Pat<(v4i16 (extract_subvector V128:$Rn, (i64 4))),
+          (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>;
+def : Pat<(v2i32 (extract_subvector V128:$Rn, (i64 2))),
+          (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>;
+def : Pat<(v1i64 (extract_subvector V128:$Rn, (i64 1))),
+          (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>;
+def : Pat<(v2f32 (extract_subvector V128:$Rn, (i64 2))),
+          (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>;
+def : Pat<(v1f64 (extract_subvector V128:$Rn, (i64 1))),
+          (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>;
+
+
+//----------------------------------------------------------------------------
+// AdvSIMD zip vector
+//----------------------------------------------------------------------------
+
+defm TRN1 : SIMDZipVector<0b010, "trn1", AArch64trn1>;
+defm TRN2 : SIMDZipVector<0b110, "trn2", AArch64trn2>;
+defm UZP1 : SIMDZipVector<0b001, "uzp1", AArch64uzp1>;
+defm UZP2 : SIMDZipVector<0b101, "uzp2", AArch64uzp2>;
+defm ZIP1 : SIMDZipVector<0b011, "zip1", AArch64zip1>;
+defm ZIP2 : SIMDZipVector<0b111, "zip2", AArch64zip2>;
+
+//----------------------------------------------------------------------------
+// AdvSIMD TBL/TBX instructions
+//----------------------------------------------------------------------------
+
+defm TBL : SIMDTableLookup<    0, "tbl">;
+defm TBX : SIMDTableLookupTied<1, "tbx">;
+
+def : Pat<(v8i8 (int_aarch64_neon_tbl1 (v16i8 VecListOne128:$Rn), (v8i8 V64:$Ri))),
+          (TBLv8i8One VecListOne128:$Rn, V64:$Ri)>;
+def : Pat<(v16i8 (int_aarch64_neon_tbl1 (v16i8 V128:$Ri), (v16i8 V128:$Rn))),
+          (TBLv16i8One V128:$Ri, V128:$Rn)>;
+
+def : Pat<(v8i8 (int_aarch64_neon_tbx1 (v8i8 V64:$Rd),
+                  (v16i8 VecListOne128:$Rn), (v8i8 V64:$Ri))),
+          (TBXv8i8One V64:$Rd, VecListOne128:$Rn, V64:$Ri)>;
+def : Pat<(v16i8 (int_aarch64_neon_tbx1 (v16i8 V128:$Rd),
+                   (v16i8 V128:$Ri), (v16i8 V128:$Rn))),
+          (TBXv16i8One V128:$Rd, V128:$Ri, V128:$Rn)>;
+
+
+//----------------------------------------------------------------------------
+// AdvSIMD scalar CPY instruction
+//----------------------------------------------------------------------------
+
+defm CPY : SIMDScalarCPY<"cpy">;
+
+//----------------------------------------------------------------------------
+// AdvSIMD scalar pairwise instructions
+//----------------------------------------------------------------------------
+
+defm ADDP    : SIMDPairwiseScalarD<0, 0b11011, "addp">;
+defm FADDP   : SIMDPairwiseScalarSD<1, 0, 0b01101, "faddp">;
+defm FMAXNMP : SIMDPairwiseScalarSD<1, 0, 0b01100, "fmaxnmp">;
+defm FMAXP   : SIMDPairwiseScalarSD<1, 0, 0b01111, "fmaxp">;
+defm FMINNMP : SIMDPairwiseScalarSD<1, 1, 0b01100, "fminnmp">;
+defm FMINP   : SIMDPairwiseScalarSD<1, 1, 0b01111, "fminp">;
+def : Pat<(i64 (int_aarch64_neon_saddv (v2i64 V128:$Rn))),
+          (ADDPv2i64p V128:$Rn)>;
+def : Pat<(i64 (int_aarch64_neon_uaddv (v2i64 V128:$Rn))),
+          (ADDPv2i64p V128:$Rn)>;
+def : Pat<(f32 (int_aarch64_neon_faddv (v2f32 V64:$Rn))),
+          (FADDPv2i32p V64:$Rn)>;
+def : Pat<(f32 (int_aarch64_neon_faddv (v4f32 V128:$Rn))),
+          (FADDPv2i32p (EXTRACT_SUBREG (FADDPv4f32 V128:$Rn, V128:$Rn), dsub))>;
+def : Pat<(f64 (int_aarch64_neon_faddv (v2f64 V128:$Rn))),
+          (FADDPv2i64p V128:$Rn)>;
+def : Pat<(f32 (int_aarch64_neon_fmaxnmv (v2f32 V64:$Rn))),
+          (FMAXNMPv2i32p V64:$Rn)>;
+def : Pat<(f64 (int_aarch64_neon_fmaxnmv (v2f64 V128:$Rn))),
+          (FMAXNMPv2i64p V128:$Rn)>;
+def : Pat<(f32 (int_aarch64_neon_fmaxv (v2f32 V64:$Rn))),
+          (FMAXPv2i32p V64:$Rn)>;
+def : Pat<(f64 (int_aarch64_neon_fmaxv (v2f64 V128:$Rn))),
+          (FMAXPv2i64p V128:$Rn)>;
+def : Pat<(f32 (int_aarch64_neon_fminnmv (v2f32 V64:$Rn))),
+          (FMINNMPv2i32p V64:$Rn)>;
+def : Pat<(f64 (int_aarch64_neon_fminnmv (v2f64 V128:$Rn))),
+          (FMINNMPv2i64p V128:$Rn)>;
+def : Pat<(f32 (int_aarch64_neon_fminv (v2f32 V64:$Rn))),
+          (FMINPv2i32p V64:$Rn)>;
+def : Pat<(f64 (int_aarch64_neon_fminv (v2f64 V128:$Rn))),
+          (FMINPv2i64p V128:$Rn)>;
+
+//----------------------------------------------------------------------------
+// AdvSIMD INS/DUP instructions
+//----------------------------------------------------------------------------
+
+def DUPv8i8gpr  : SIMDDupFromMain<0, 0b00001, ".8b", v8i8, V64, GPR32>;
+def DUPv16i8gpr : SIMDDupFromMain<1, 0b00001, ".16b", v16i8, V128, GPR32>;
+def DUPv4i16gpr : SIMDDupFromMain<0, 0b00010, ".4h", v4i16, V64, GPR32>;
+def DUPv8i16gpr : SIMDDupFromMain<1, 0b00010, ".8h", v8i16, V128, GPR32>;
+def DUPv2i32gpr : SIMDDupFromMain<0, 0b00100, ".2s", v2i32, V64, GPR32>;
+def DUPv4i32gpr : SIMDDupFromMain<1, 0b00100, ".4s", v4i32, V128, GPR32>;
+def DUPv2i64gpr : SIMDDupFromMain<1, 0b01000, ".2d", v2i64, V128, GPR64>;
+
+def DUPv2i64lane : SIMDDup64FromElement;
+def DUPv2i32lane : SIMDDup32FromElement<0, ".2s", v2i32, V64>;
+def DUPv4i32lane : SIMDDup32FromElement<1, ".4s", v4i32, V128>;
+def DUPv4i16lane : SIMDDup16FromElement<0, ".4h", v4i16, V64>;
+def DUPv8i16lane : SIMDDup16FromElement<1, ".8h", v8i16, V128>;
+def DUPv8i8lane  : SIMDDup8FromElement <0, ".8b", v8i8, V64>;
+def DUPv16i8lane : SIMDDup8FromElement <1, ".16b", v16i8, V128>;
+
+def : Pat<(v2f32 (AArch64dup (f32 FPR32:$Rn))),
+          (v2f32 (DUPv2i32lane
+            (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR32:$Rn, ssub),
+            (i64 0)))>;
+def : Pat<(v4f32 (AArch64dup (f32 FPR32:$Rn))),
+          (v4f32 (DUPv4i32lane
+            (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR32:$Rn, ssub),
+            (i64 0)))>;
+def : Pat<(v2f64 (AArch64dup (f64 FPR64:$Rn))),
+          (v2f64 (DUPv2i64lane
+            (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR64:$Rn, dsub),
+            (i64 0)))>;
+
+def : Pat<(v2f32 (AArch64duplane32 (v4f32 V128:$Rn), VectorIndexS:$imm)),
+          (DUPv2i32lane V128:$Rn, VectorIndexS:$imm)>;
+def : Pat<(v4f32 (AArch64duplane32 (v4f32 V128:$Rn), VectorIndexS:$imm)),
+         (DUPv4i32lane V128:$Rn, VectorIndexS:$imm)>;
+def : Pat<(v2f64 (AArch64duplane64 (v2f64 V128:$Rn), VectorIndexD:$imm)),
+          (DUPv2i64lane V128:$Rn, VectorIndexD:$imm)>;
+
+// If there's an (AArch64dup (vector_extract ...) ...), we can use a duplane
+// instruction even if the types don't match: we just have to remap the lane
+// carefully. N.b. this trick only applies to truncations.
+def VecIndex_x2 : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(2 * N->getZExtValue(), MVT::i64);
 }]>;
-
-def bfi64_lsb_to_immr : SDNodeXForm<imm, [{
-  return CurDAG->getTargetConstant((64 - N->getZExtValue()) % 64, MVT::i64);
+def VecIndex_x4 : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(4 * N->getZExtValue(), MVT::i64);
 }]>;
-
-def bfi_width_to_imms : SDNodeXForm<imm, [{
-  return CurDAG->getTargetConstant(N->getZExtValue() - 1, MVT::i64);
+def VecIndex_x8 : SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(8 * N->getZExtValue(), MVT::i64);
 }]>;
 
+multiclass DUPWithTruncPats<ValueType ResVT, ValueType Src64VT,
+                            ValueType Src128VT, ValueType ScalVT,
+                            Instruction DUP, SDNodeXForm IdxXFORM> {
+  def : Pat<(ResVT (AArch64dup (ScalVT (vector_extract (Src128VT V128:$Rn),
+                                                     imm:$idx)))),
+            (DUP V128:$Rn, (IdxXFORM imm:$idx))>;
+
+  def : Pat<(ResVT (AArch64dup (ScalVT (vector_extract (Src64VT V64:$Rn),
+                                                     imm:$idx)))),
+            (DUP (SUBREG_TO_REG (i64 0), V64:$Rn, dsub), (IdxXFORM imm:$idx))>;
+}
+
+defm : DUPWithTruncPats<v8i8,   v4i16, v8i16, i32, DUPv8i8lane,  VecIndex_x2>;
+defm : DUPWithTruncPats<v8i8,   v2i32, v4i32, i32, DUPv8i8lane,  VecIndex_x4>;
+defm : DUPWithTruncPats<v4i16,  v2i32, v4i32, i32, DUPv4i16lane, VecIndex_x2>;
+
+defm : DUPWithTruncPats<v16i8,  v4i16, v8i16, i32, DUPv16i8lane, VecIndex_x2>;
+defm : DUPWithTruncPats<v16i8,  v2i32, v4i32, i32, DUPv16i8lane, VecIndex_x4>;
+defm : DUPWithTruncPats<v8i16,  v2i32, v4i32, i32, DUPv8i16lane, VecIndex_x2>;
+
+multiclass DUPWithTrunci64Pats<ValueType ResVT, Instruction DUP,
+                               SDNodeXForm IdxXFORM> {
+  def : Pat<(ResVT (AArch64dup (i32 (trunc (vector_extract (v2i64 V128:$Rn),
+                                                         imm:$idx))))),
+            (DUP V128:$Rn, (IdxXFORM imm:$idx))>;
+
+  def : Pat<(ResVT (AArch64dup (i32 (trunc (vector_extract (v1i64 V64:$Rn),
+                                                         imm:$idx))))),
+            (DUP (SUBREG_TO_REG (i64 0), V64:$Rn, dsub), (IdxXFORM imm:$idx))>;
+}
+
+defm : DUPWithTrunci64Pats<v8i8,  DUPv8i8lane,   VecIndex_x8>;
+defm : DUPWithTrunci64Pats<v4i16, DUPv4i16lane,  VecIndex_x4>;
+defm : DUPWithTrunci64Pats<v2i32, DUPv2i32lane,  VecIndex_x2>;
+
+defm : DUPWithTrunci64Pats<v16i8, DUPv16i8lane, VecIndex_x8>;
+defm : DUPWithTrunci64Pats<v8i16, DUPv8i16lane, VecIndex_x4>;
+defm : DUPWithTrunci64Pats<v4i32, DUPv4i32lane, VecIndex_x2>;
+
+// SMOV and UMOV definitions, with some extra patterns for convenience
+defm SMOV : SMov;
+defm UMOV : UMov;
+
+def : Pat<(sext_inreg (vector_extract (v16i8 V128:$Rn), VectorIndexB:$idx), i8),
+          (i32 (SMOVvi8to32 V128:$Rn, VectorIndexB:$idx))>;
+def : Pat<(sext_inreg (vector_extract (v16i8 V128:$Rn), VectorIndexB:$idx), i8),
+          (i64 (SMOVvi8to64 V128:$Rn, VectorIndexB:$idx))>;
+def : Pat<(sext_inreg (vector_extract (v8i16 V128:$Rn), VectorIndexH:$idx),i16),
+          (i32 (SMOVvi16to32 V128:$Rn, VectorIndexH:$idx))>;
+def : Pat<(sext_inreg (vector_extract (v8i16 V128:$Rn), VectorIndexH:$idx),i16),
+          (i64 (SMOVvi16to64 V128:$Rn, VectorIndexH:$idx))>;
+def : Pat<(sext_inreg (vector_extract (v8i16 V128:$Rn), VectorIndexH:$idx),i16),
+          (i32 (SMOVvi16to32 V128:$Rn, VectorIndexH:$idx))>;
+def : Pat<(sext (i32 (vector_extract (v4i32 V128:$Rn), VectorIndexS:$idx))),
+          (i64 (SMOVvi32to64 V128:$Rn, VectorIndexS:$idx))>;
+
+// Extracting i8 or i16 elements will have the zero-extend transformed to
+// an 'and' mask by type legalization since neither i8 nor i16 are legal types
+// for AArch64. Match these patterns here since UMOV already zeroes out the high
+// bits of the destination register.
+def : Pat<(and (vector_extract (v16i8 V128:$Rn), VectorIndexB:$idx),
+               (i32 0xff)),
+          (i32 (UMOVvi8 V128:$Rn, VectorIndexB:$idx))>;
+def : Pat<(and (vector_extract (v8i16 V128:$Rn), VectorIndexH:$idx),
+               (i32 0xffff)),
+          (i32 (UMOVvi16 V128:$Rn, VectorIndexH:$idx))>;
+
+defm INS : SIMDIns;
+
+def : Pat<(v16i8 (scalar_to_vector GPR32:$Rn)),
+          (SUBREG_TO_REG (i32 0),
+                         (f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>;
+def : Pat<(v8i8 (scalar_to_vector GPR32:$Rn)),
+          (SUBREG_TO_REG (i32 0),
+                         (f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>;
+
+def : Pat<(v8i16 (scalar_to_vector GPR32:$Rn)),
+          (SUBREG_TO_REG (i32 0),
+                         (f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>;
+def : Pat<(v4i16 (scalar_to_vector GPR32:$Rn)),
+          (SUBREG_TO_REG (i32 0),
+                         (f32 (COPY_TO_REGCLASS GPR32:$Rn, FPR32)), ssub)>;
+
+def : Pat<(v2i32 (scalar_to_vector (i32 FPR32:$Rn))),
+            (v2i32 (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)),
+                                  (i32 FPR32:$Rn), ssub))>;
+def : Pat<(v4i32 (scalar_to_vector (i32 FPR32:$Rn))),
+            (v4i32 (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)),
+                                  (i32 FPR32:$Rn), ssub))>;
+def : Pat<(v2i64 (scalar_to_vector (i64 FPR64:$Rn))),
+            (v2i64 (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)),
+                                  (i64 FPR64:$Rn), dsub))>;
+
+def : Pat<(v4f32 (scalar_to_vector (f32 FPR32:$Rn))),
+          (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR32:$Rn, ssub)>;
+def : Pat<(v2f32 (scalar_to_vector (f32 FPR32:$Rn))),
+          (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), FPR32:$Rn, ssub)>;
+def : Pat<(v2f64 (scalar_to_vector (f64 FPR64:$Rn))),
+          (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FPR64:$Rn, dsub)>;
+
+def : Pat<(v2f32 (vector_insert (v2f32 V64:$Rn),
+            (f32 FPR32:$Rm), (i64 VectorIndexS:$imm))),
+          (EXTRACT_SUBREG
+            (INSvi32lane
+              (v4f32 (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), V64:$Rn, dsub)),
+              VectorIndexS:$imm,
+              (v4f32 (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR32:$Rm, ssub)),
+              (i64 0)),
+            dsub)>;
+def : Pat<(v4f32 (vector_insert (v4f32 V128:$Rn),
+            (f32 FPR32:$Rm), (i64 VectorIndexS:$imm))),
+          (INSvi32lane
+            V128:$Rn, VectorIndexS:$imm,
+            (v4f32 (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR32:$Rm, ssub)),
+            (i64 0))>;
+def : Pat<(v2f64 (vector_insert (v2f64 V128:$Rn),
+            (f64 FPR64:$Rm), (i64 VectorIndexD:$imm))),
+          (INSvi64lane
+            V128:$Rn, VectorIndexD:$imm,
+            (v2f64 (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FPR64:$Rm, dsub)),
+            (i64 0))>;
+
+// Copy an element at a constant index in one vector into a constant indexed
+// element of another.
+// FIXME refactor to a shared class/dev parameterized on vector type, vector
+// index type and INS extension
+def : Pat<(v16i8 (int_aarch64_neon_vcopy_lane
+                   (v16i8 V128:$Vd), VectorIndexB:$idx, (v16i8 V128:$Vs),
+                   VectorIndexB:$idx2)),
+          (v16i8 (INSvi8lane
+                   V128:$Vd, VectorIndexB:$idx, V128:$Vs, VectorIndexB:$idx2)
+          )>;
+def : Pat<(v8i16 (int_aarch64_neon_vcopy_lane
+                   (v8i16 V128:$Vd), VectorIndexH:$idx, (v8i16 V128:$Vs),
+                   VectorIndexH:$idx2)),
+          (v8i16 (INSvi16lane
+                   V128:$Vd, VectorIndexH:$idx, V128:$Vs, VectorIndexH:$idx2)
+          )>;
+def : Pat<(v4i32 (int_aarch64_neon_vcopy_lane
+                   (v4i32 V128:$Vd), VectorIndexS:$idx, (v4i32 V128:$Vs),
+                   VectorIndexS:$idx2)),
+          (v4i32 (INSvi32lane
+                   V128:$Vd, VectorIndexS:$idx, V128:$Vs, VectorIndexS:$idx2)
+          )>;
+def : Pat<(v2i64 (int_aarch64_neon_vcopy_lane
+                   (v2i64 V128:$Vd), VectorIndexD:$idx, (v2i64 V128:$Vs),
+                   VectorIndexD:$idx2)),
+          (v2i64 (INSvi64lane
+                   V128:$Vd, VectorIndexD:$idx, V128:$Vs, VectorIndexD:$idx2)
+          )>;
+
+multiclass Neon_INS_elt_pattern<ValueType VT128, ValueType VT64,
+                                ValueType VTScal, Instruction INS> {
+  def : Pat<(VT128 (vector_insert V128:$src,
+                        (VTScal (vector_extract (VT128 V128:$Rn), imm:$Immn)),
+                        imm:$Immd)),
+            (INS V128:$src, imm:$Immd, V128:$Rn, imm:$Immn)>;
+
+  def : Pat<(VT128 (vector_insert V128:$src,
+                        (VTScal (vector_extract (VT64 V64:$Rn), imm:$Immn)),
+                        imm:$Immd)),
+            (INS V128:$src, imm:$Immd,
+                 (SUBREG_TO_REG (i64 0), V64:$Rn, dsub), imm:$Immn)>;
+
+  def : Pat<(VT64 (vector_insert V64:$src,
+                        (VTScal (vector_extract (VT128 V128:$Rn), imm:$Immn)),
+                        imm:$Immd)),
+            (EXTRACT_SUBREG (INS (SUBREG_TO_REG (i64 0), V64:$src, dsub),
+                                 imm:$Immd, V128:$Rn, imm:$Immn),
+                            dsub)>;
+
+  def : Pat<(VT64 (vector_insert V64:$src,
+                        (VTScal (vector_extract (VT64 V64:$Rn), imm:$Immn)),
+                        imm:$Immd)),
+            (EXTRACT_SUBREG
+                (INS (SUBREG_TO_REG (i64 0), V64:$src, dsub), imm:$Immd,
+                     (SUBREG_TO_REG (i64 0), V64:$Rn, dsub), imm:$Immn),
+                dsub)>;
+}
+
+defm : Neon_INS_elt_pattern<v4f32, v2f32, f32, INSvi32lane>;
+defm : Neon_INS_elt_pattern<v2f64, v1f64, f64, INSvi64lane>;
+defm : Neon_INS_elt_pattern<v16i8, v8i8,  i32, INSvi8lane>;
+defm : Neon_INS_elt_pattern<v8i16, v4i16, i32, INSvi16lane>;
+defm : Neon_INS_elt_pattern<v4i32, v2i32, i32, INSvi32lane>;
+defm : Neon_INS_elt_pattern<v2i64, v1i64, i64, INSvi32lane>;
+
+
+// Floating point vector extractions are codegen'd as either a sequence of
+// subregister extractions, possibly fed by an INS if the lane number is
+// anything other than zero.
+def : Pat<(vector_extract (v2f64 V128:$Rn), 0),
+          (f64 (EXTRACT_SUBREG V128:$Rn, dsub))>;
+def : Pat<(vector_extract (v4f32 V128:$Rn), 0),
+          (f32 (EXTRACT_SUBREG V128:$Rn, ssub))>;
+def : Pat<(vector_extract (v2f64 V128:$Rn), VectorIndexD:$idx),
+          (f64 (EXTRACT_SUBREG
+            (INSvi64lane (v2f64 (IMPLICIT_DEF)), 0,
+                         V128:$Rn, VectorIndexD:$idx),
+            dsub))>;
+def : Pat<(vector_extract (v4f32 V128:$Rn), VectorIndexS:$idx),
+          (f32 (EXTRACT_SUBREG
+            (INSvi32lane (v4f32 (IMPLICIT_DEF)), 0,
+                         V128:$Rn, VectorIndexS:$idx),
+            ssub))>;
+
+// All concat_vectors operations are canonicalised to act on i64 vectors for
+// AArch64. In the general case we need an instruction, which had just as well be
+// INS.
+class ConcatPat<ValueType DstTy, ValueType SrcTy>
+  : Pat<(DstTy (concat_vectors (SrcTy V64:$Rd), V64:$Rn)),
+        (INSvi64lane (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), 1,
+                     (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rn, dsub), 0)>;
+
+def : ConcatPat<v2i64, v1i64>;
+def : ConcatPat<v2f64, v1f64>;
+def : ConcatPat<v4i32, v2i32>;
+def : ConcatPat<v4f32, v2f32>;
+def : ConcatPat<v8i16, v4i16>;
+def : ConcatPat<v16i8, v8i8>;
+
+// If the high lanes are undef, though, we can just ignore them:
+class ConcatUndefPat<ValueType DstTy, ValueType SrcTy>
+  : Pat<(DstTy (concat_vectors (SrcTy V64:$Rn), undef)),
+        (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rn, dsub)>;
+
+def : ConcatUndefPat<v2i64, v1i64>;
+def : ConcatUndefPat<v2f64, v1f64>;
+def : ConcatUndefPat<v4i32, v2i32>;
+def : ConcatUndefPat<v4f32, v2f32>;
+def : ConcatUndefPat<v8i16, v4i16>;
+def : ConcatUndefPat<v16i8, v8i8>;
+
+//----------------------------------------------------------------------------
+// AdvSIMD across lanes instructions
+//----------------------------------------------------------------------------
+
+defm ADDV    : SIMDAcrossLanesBHS<0, 0b11011, "addv">;
+defm SMAXV   : SIMDAcrossLanesBHS<0, 0b01010, "smaxv">;
+defm SMINV   : SIMDAcrossLanesBHS<0, 0b11010, "sminv">;
+defm UMAXV   : SIMDAcrossLanesBHS<1, 0b01010, "umaxv">;
+defm UMINV   : SIMDAcrossLanesBHS<1, 0b11010, "uminv">;
+defm SADDLV  : SIMDAcrossLanesHSD<0, 0b00011, "saddlv">;
+defm UADDLV  : SIMDAcrossLanesHSD<1, 0b00011, "uaddlv">;
+defm FMAXNMV : SIMDAcrossLanesS<0b01100, 0, "fmaxnmv", int_aarch64_neon_fmaxnmv>;
+defm FMAXV   : SIMDAcrossLanesS<0b01111, 0, "fmaxv", int_aarch64_neon_fmaxv>;
+defm FMINNMV : SIMDAcrossLanesS<0b01100, 1, "fminnmv", int_aarch64_neon_fminnmv>;
+defm FMINV   : SIMDAcrossLanesS<0b01111, 1, "fminv", int_aarch64_neon_fminv>;
+
+multiclass SIMDAcrossLanesSignedIntrinsic<string baseOpc, Intrinsic intOp> {
+// If there is a sign extension after this intrinsic, consume it as smov already
+// performed it
+  def : Pat<(i32 (sext_inreg (i32 (intOp (v8i8 V64:$Rn))), i8)),
+        (i32 (SMOVvi8to32
+          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+            (!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), bsub),
+          (i64 0)))>;
+  def : Pat<(i32 (intOp (v8i8 V64:$Rn))),
+        (i32 (SMOVvi8to32
+          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+            (!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), bsub),
+          (i64 0)))>;
+// If there is a sign extension after this intrinsic, consume it as smov already
+// performed it
+def : Pat<(i32 (sext_inreg (i32 (intOp (v16i8 V128:$Rn))), i8)),
+        (i32 (SMOVvi8to32
+          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+           (!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn), bsub),
+          (i64 0)))>;
+def : Pat<(i32 (intOp (v16i8 V128:$Rn))),
+        (i32 (SMOVvi8to32
+          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+           (!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn), bsub),
+          (i64 0)))>;
+// If there is a sign extension after this intrinsic, consume it as smov already
+// performed it
+def : Pat<(i32 (sext_inreg (i32 (intOp (v4i16 V64:$Rn))), i16)),
+          (i32 (SMOVvi16to32
+           (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+            (!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn), hsub),
+           (i64 0)))>;
+def : Pat<(i32 (intOp (v4i16 V64:$Rn))),
+          (i32 (SMOVvi16to32
+           (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+            (!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn), hsub),
+           (i64 0)))>;
+// If there is a sign extension after this intrinsic, consume it as smov already
+// performed it
+def : Pat<(i32 (sext_inreg (i32 (intOp (v8i16 V128:$Rn))), i16)),
+        (i32 (SMOVvi16to32
+          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+           (!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn), hsub),
+          (i64 0)))>;
+def : Pat<(i32 (intOp (v8i16 V128:$Rn))),
+        (i32 (SMOVvi16to32
+          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+           (!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn), hsub),
+          (i64 0)))>;
+
+def : Pat<(i32 (intOp (v4i32 V128:$Rn))),
+        (i32 (EXTRACT_SUBREG
+          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+           (!cast<Instruction>(!strconcat(baseOpc, "v4i32v")) V128:$Rn), ssub),
+          ssub))>;
+}
+
+multiclass SIMDAcrossLanesUnsignedIntrinsic<string baseOpc, Intrinsic intOp> {
+// If there is a masking operation keeping only what has been actually
+// generated, consume it.
+  def : Pat<(i32 (and (i32 (intOp (v8i8 V64:$Rn))), maski8_or_more)),
+        (i32 (EXTRACT_SUBREG
+          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+            (!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), bsub),
+          ssub))>;
+  def : Pat<(i32 (intOp (v8i8 V64:$Rn))),
+        (i32 (EXTRACT_SUBREG
+          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+            (!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), bsub),
+          ssub))>;
+// If there is a masking operation keeping only what has been actually
+// generated, consume it.
+def : Pat<(i32 (and (i32 (intOp (v16i8 V128:$Rn))), maski8_or_more)),
+        (i32 (EXTRACT_SUBREG
+          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+            (!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn), bsub),
+          ssub))>;
+def : Pat<(i32 (intOp (v16i8 V128:$Rn))),
+        (i32 (EXTRACT_SUBREG
+          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+            (!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn), bsub),
+          ssub))>;
+
+// If there is a masking operation keeping only what has been actually
+// generated, consume it.
+def : Pat<(i32 (and (i32 (intOp (v4i16 V64:$Rn))), maski16_or_more)),
+          (i32 (EXTRACT_SUBREG
+            (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+              (!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn), hsub),
+            ssub))>;
+def : Pat<(i32 (intOp (v4i16 V64:$Rn))),
+          (i32 (EXTRACT_SUBREG
+            (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+              (!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn), hsub),
+            ssub))>;
+// If there is a masking operation keeping only what has been actually
+// generated, consume it.
+def : Pat<(i32 (and (i32 (intOp (v8i16 V128:$Rn))), maski16_or_more)),
+        (i32 (EXTRACT_SUBREG
+          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+            (!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn), hsub),
+          ssub))>;
+def : Pat<(i32 (intOp (v8i16 V128:$Rn))),
+        (i32 (EXTRACT_SUBREG
+          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+            (!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn), hsub),
+          ssub))>;
+
+def : Pat<(i32 (intOp (v4i32 V128:$Rn))),
+        (i32 (EXTRACT_SUBREG
+          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+            (!cast<Instruction>(!strconcat(baseOpc, "v4i32v")) V128:$Rn), ssub),
+          ssub))>;
+
+}
+
+multiclass SIMDAcrossLanesSignedLongIntrinsic<string baseOpc, Intrinsic intOp> {
+  def : Pat<(i32 (intOp (v8i8 V64:$Rn))),
+        (i32 (SMOVvi16to32
+          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+            (!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), hsub),
+          (i64 0)))>;
+def : Pat<(i32 (intOp (v16i8 V128:$Rn))),
+        (i32 (SMOVvi16to32
+          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+           (!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn), hsub),
+          (i64 0)))>;
+
+def : Pat<(i32 (intOp (v4i16 V64:$Rn))),
+          (i32 (EXTRACT_SUBREG
+           (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+            (!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn), ssub),
+           ssub))>;
+def : Pat<(i32 (intOp (v8i16 V128:$Rn))),
+        (i32 (EXTRACT_SUBREG
+          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+           (!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn), ssub),
+          ssub))>;
+
+def : Pat<(i64 (intOp (v4i32 V128:$Rn))),
+        (i64 (EXTRACT_SUBREG
+          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+           (!cast<Instruction>(!strconcat(baseOpc, "v4i32v")) V128:$Rn), dsub),
+          dsub))>;
+}
+
+multiclass SIMDAcrossLanesUnsignedLongIntrinsic<string baseOpc,
+                                                Intrinsic intOp> {
+  def : Pat<(i32 (intOp (v8i8 V64:$Rn))),
+        (i32 (EXTRACT_SUBREG
+          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+            (!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), hsub),
+          ssub))>;
+def : Pat<(i32 (intOp (v16i8 V128:$Rn))),
+        (i32 (EXTRACT_SUBREG
+          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+            (!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn), hsub),
+          ssub))>;
+
+def : Pat<(i32 (intOp (v4i16 V64:$Rn))),
+          (i32 (EXTRACT_SUBREG
+            (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+              (!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn), ssub),
+            ssub))>;
+def : Pat<(i32 (intOp (v8i16 V128:$Rn))),
+        (i32 (EXTRACT_SUBREG
+          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+            (!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn), ssub),
+          ssub))>;
+
+def : Pat<(i64 (intOp (v4i32 V128:$Rn))),
+        (i64 (EXTRACT_SUBREG
+          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+            (!cast<Instruction>(!strconcat(baseOpc, "v4i32v")) V128:$Rn), dsub),
+          dsub))>;
+}
+
+defm : SIMDAcrossLanesSignedIntrinsic<"ADDV",  int_aarch64_neon_saddv>;
+// vaddv_[su]32 is special; -> ADDP Vd.2S,Vn.2S,Vm.2S; return Vd.s[0];Vn==Vm
+def : Pat<(i32 (int_aarch64_neon_saddv (v2i32 V64:$Rn))),
+          (EXTRACT_SUBREG (ADDPv2i32 V64:$Rn, V64:$Rn), ssub)>;
+
+defm : SIMDAcrossLanesUnsignedIntrinsic<"ADDV",  int_aarch64_neon_uaddv>;
+// vaddv_[su]32 is special; -> ADDP Vd.2S,Vn.2S,Vm.2S; return Vd.s[0];Vn==Vm
+def : Pat<(i32 (int_aarch64_neon_uaddv (v2i32 V64:$Rn))),
+          (EXTRACT_SUBREG (ADDPv2i32 V64:$Rn, V64:$Rn), ssub)>;
+
+defm : SIMDAcrossLanesSignedIntrinsic<"SMAXV", int_aarch64_neon_smaxv>;
+def : Pat<(i32 (int_aarch64_neon_smaxv (v2i32 V64:$Rn))),
+           (EXTRACT_SUBREG (SMAXPv2i32 V64:$Rn, V64:$Rn), ssub)>;
+
+defm : SIMDAcrossLanesSignedIntrinsic<"SMINV", int_aarch64_neon_sminv>;
+def : Pat<(i32 (int_aarch64_neon_sminv (v2i32 V64:$Rn))),
+           (EXTRACT_SUBREG (SMINPv2i32 V64:$Rn, V64:$Rn), ssub)>;
+
+defm : SIMDAcrossLanesUnsignedIntrinsic<"UMAXV", int_aarch64_neon_umaxv>;
+def : Pat<(i32 (int_aarch64_neon_umaxv (v2i32 V64:$Rn))),
+           (EXTRACT_SUBREG (UMAXPv2i32 V64:$Rn, V64:$Rn), ssub)>;
+
+defm : SIMDAcrossLanesUnsignedIntrinsic<"UMINV", int_aarch64_neon_uminv>;
+def : Pat<(i32 (int_aarch64_neon_uminv (v2i32 V64:$Rn))),
+           (EXTRACT_SUBREG (UMINPv2i32 V64:$Rn, V64:$Rn), ssub)>;
+
+defm : SIMDAcrossLanesSignedLongIntrinsic<"SADDLV", int_aarch64_neon_saddlv>;
+defm : SIMDAcrossLanesUnsignedLongIntrinsic<"UADDLV", int_aarch64_neon_uaddlv>;
+
+// The vaddlv_s32 intrinsic gets mapped to SADDLP.
+def : Pat<(i64 (int_aarch64_neon_saddlv (v2i32 V64:$Rn))),
+          (i64 (EXTRACT_SUBREG
+            (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+              (SADDLPv2i32_v1i64 V64:$Rn), dsub),
+            dsub))>;
+// The vaddlv_u32 intrinsic gets mapped to UADDLP.
+def : Pat<(i64 (int_aarch64_neon_uaddlv (v2i32 V64:$Rn))),
+          (i64 (EXTRACT_SUBREG
+            (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
+              (UADDLPv2i32_v1i64 V64:$Rn), dsub),
+            dsub))>;
+
+//------------------------------------------------------------------------------
+// AdvSIMD modified immediate instructions
+//------------------------------------------------------------------------------
+
+// AdvSIMD BIC
+defm BIC : SIMDModifiedImmVectorShiftTied<1, 0b11, 0b01, "bic", AArch64bici>;
+// AdvSIMD ORR
+defm ORR : SIMDModifiedImmVectorShiftTied<0, 0b11, 0b01, "orr", AArch64orri>;
+
+def : InstAlias<"bic $Vd.4h, $imm", (BICv4i16 V64:$Vd,  imm0_255:$imm, 0)>;
+def : InstAlias<"bic $Vd.8h, $imm", (BICv8i16 V128:$Vd, imm0_255:$imm, 0)>;
+def : InstAlias<"bic $Vd.2s, $imm", (BICv2i32 V64:$Vd,  imm0_255:$imm, 0)>;
+def : InstAlias<"bic $Vd.4s, $imm", (BICv4i32 V128:$Vd, imm0_255:$imm, 0)>;
+
+def : InstAlias<"bic.4h $Vd, $imm", (BICv4i16 V64:$Vd,  imm0_255:$imm, 0), 0>;
+def : InstAlias<"bic.8h $Vd, $imm", (BICv8i16 V128:$Vd, imm0_255:$imm, 0), 0>;
+def : InstAlias<"bic.2s $Vd, $imm", (BICv2i32 V64:$Vd,  imm0_255:$imm, 0), 0>;
+def : InstAlias<"bic.4s $Vd, $imm", (BICv4i32 V128:$Vd, imm0_255:$imm, 0), 0>;
+
+def : InstAlias<"orr $Vd.4h, $imm", (ORRv4i16 V64:$Vd,  imm0_255:$imm, 0)>;
+def : InstAlias<"orr $Vd.8h, $imm", (ORRv8i16 V128:$Vd, imm0_255:$imm, 0)>;
+def : InstAlias<"orr $Vd.2s, $imm", (ORRv2i32 V64:$Vd,  imm0_255:$imm, 0)>;
+def : InstAlias<"orr $Vd.4s, $imm", (ORRv4i32 V128:$Vd, imm0_255:$imm, 0)>;
+
+def : InstAlias<"orr.4h $Vd, $imm", (ORRv4i16 V64:$Vd,  imm0_255:$imm, 0), 0>;
+def : InstAlias<"orr.8h $Vd, $imm", (ORRv8i16 V128:$Vd, imm0_255:$imm, 0), 0>;
+def : InstAlias<"orr.2s $Vd, $imm", (ORRv2i32 V64:$Vd,  imm0_255:$imm, 0), 0>;
+def : InstAlias<"orr.4s $Vd, $imm", (ORRv4i32 V128:$Vd, imm0_255:$imm, 0), 0>;
+
+// AdvSIMD FMOV
+def FMOVv2f64_ns : SIMDModifiedImmVectorNoShift<1, 1, 0b1111, V128, fpimm8,
+                                              "fmov", ".2d",
+                       [(set (v2f64 V128:$Rd), (AArch64fmov imm0_255:$imm8))]>;
+def FMOVv2f32_ns : SIMDModifiedImmVectorNoShift<0, 0, 0b1111, V64,  fpimm8,
+                                              "fmov", ".2s",
+                       [(set (v2f32 V64:$Rd), (AArch64fmov imm0_255:$imm8))]>;
+def FMOVv4f32_ns : SIMDModifiedImmVectorNoShift<1, 0, 0b1111, V128, fpimm8,
+                                              "fmov", ".4s",
+                       [(set (v4f32 V128:$Rd), (AArch64fmov imm0_255:$imm8))]>;
+
+// AdvSIMD MOVI
+
+// EDIT byte mask: scalar
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in
+def MOVID      : SIMDModifiedImmScalarNoShift<0, 1, 0b1110, "movi",
+                    [(set FPR64:$Rd, simdimmtype10:$imm8)]>;
+// The movi_edit node has the immediate value already encoded, so we use
+// a plain imm0_255 here.
+def : Pat<(f64 (AArch64movi_edit imm0_255:$shift)),
+          (MOVID imm0_255:$shift)>;
+
+def : Pat<(v1i64 immAllZerosV), (MOVID (i32 0))>;
+def : Pat<(v2i32 immAllZerosV), (MOVID (i32 0))>;
+def : Pat<(v4i16 immAllZerosV), (MOVID (i32 0))>;
+def : Pat<(v8i8  immAllZerosV), (MOVID (i32 0))>;
+
+def : Pat<(v1i64 immAllOnesV), (MOVID (i32 255))>;
+def : Pat<(v2i32 immAllOnesV), (MOVID (i32 255))>;
+def : Pat<(v4i16 immAllOnesV), (MOVID (i32 255))>;
+def : Pat<(v8i8  immAllOnesV), (MOVID (i32 255))>;
+
+// EDIT byte mask: 2d
+
+// The movi_edit node has the immediate value already encoded, so we use
+// a plain imm0_255 in the pattern
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in
+def MOVIv2d_ns   : SIMDModifiedImmVectorNoShift<1, 1, 0b1110, V128,
+                                                simdimmtype10,
+                                                "movi", ".2d",
+                   [(set (v2i64 V128:$Rd), (AArch64movi_edit imm0_255:$imm8))]>;
+
+
+// Use movi.2d to materialize 0.0 if the HW does zero-cycle zeroing.
+// Complexity is added to break a tie with a plain MOVI.
+let AddedComplexity = 1 in {
+def : Pat<(f32   fpimm0),
+          (f32 (EXTRACT_SUBREG (v2i64 (MOVIv2d_ns (i32 0))), ssub))>,
+      Requires<[HasZCZ]>;
+def : Pat<(f64   fpimm0),
+          (f64 (EXTRACT_SUBREG (v2i64 (MOVIv2d_ns (i32 0))), dsub))>,
+      Requires<[HasZCZ]>;
+}
+
+def : Pat<(v2i64 immAllZerosV), (MOVIv2d_ns (i32 0))>;
+def : Pat<(v4i32 immAllZerosV), (MOVIv2d_ns (i32 0))>;
+def : Pat<(v8i16 immAllZerosV), (MOVIv2d_ns (i32 0))>;
+def : Pat<(v16i8 immAllZerosV), (MOVIv2d_ns (i32 0))>;
+
+def : Pat<(v2i64 immAllOnesV), (MOVIv2d_ns (i32 255))>;
+def : Pat<(v4i32 immAllOnesV), (MOVIv2d_ns (i32 255))>;
+def : Pat<(v8i16 immAllOnesV), (MOVIv2d_ns (i32 255))>;
+def : Pat<(v16i8 immAllOnesV), (MOVIv2d_ns (i32 255))>;
+
+def : Pat<(v2f64 (AArch64dup (f64 fpimm0))), (MOVIv2d_ns (i32 0))>;
+def : Pat<(v4f32 (AArch64dup (f32 fpimm0))), (MOVIv2d_ns (i32 0))>;
+
+// EDIT per word & halfword: 2s, 4h, 4s, & 8h
+defm MOVI      : SIMDModifiedImmVectorShift<0, 0b10, 0b00, "movi">;
+
+def : InstAlias<"movi $Vd.4h, $imm", (MOVIv4i16 V64:$Vd,  imm0_255:$imm, 0), 0>;
+def : InstAlias<"movi $Vd.8h, $imm", (MOVIv8i16 V128:$Vd, imm0_255:$imm, 0), 0>;
+def : InstAlias<"movi $Vd.2s, $imm", (MOVIv2i32 V64:$Vd,  imm0_255:$imm, 0), 0>;
+def : InstAlias<"movi $Vd.4s, $imm", (MOVIv4i32 V128:$Vd, imm0_255:$imm, 0), 0>;
+
+def : InstAlias<"movi.4h $Vd, $imm", (MOVIv4i16 V64:$Vd,  imm0_255:$imm, 0), 0>;
+def : InstAlias<"movi.8h $Vd, $imm", (MOVIv8i16 V128:$Vd, imm0_255:$imm, 0), 0>;
+def : InstAlias<"movi.2s $Vd, $imm", (MOVIv2i32 V64:$Vd,  imm0_255:$imm, 0), 0>;
+def : InstAlias<"movi.4s $Vd, $imm", (MOVIv4i32 V128:$Vd, imm0_255:$imm, 0), 0>;
+
+def : Pat<(v2i32 (AArch64movi_shift imm0_255:$imm8, (i32 imm:$shift))),
+          (MOVIv2i32 imm0_255:$imm8, imm:$shift)>;
+def : Pat<(v4i32 (AArch64movi_shift imm0_255:$imm8, (i32 imm:$shift))),
+          (MOVIv4i32 imm0_255:$imm8, imm:$shift)>;
+def : Pat<(v4i16 (AArch64movi_shift imm0_255:$imm8, (i32 imm:$shift))),
+          (MOVIv4i16 imm0_255:$imm8, imm:$shift)>;
+def : Pat<(v8i16 (AArch64movi_shift imm0_255:$imm8, (i32 imm:$shift))),
+          (MOVIv8i16 imm0_255:$imm8, imm:$shift)>;
+
+// EDIT per word: 2s & 4s with MSL shifter
+def MOVIv2s_msl  : SIMDModifiedImmMoveMSL<0, 0, {1,1,0,?}, V64, "movi", ".2s",
+                      [(set (v2i32 V64:$Rd),
+                            (AArch64movi_msl imm0_255:$imm8, (i32 imm:$shift)))]>;
+def MOVIv4s_msl  : SIMDModifiedImmMoveMSL<1, 0, {1,1,0,?}, V128, "movi", ".4s",
+                      [(set (v4i32 V128:$Rd),
+                            (AArch64movi_msl imm0_255:$imm8, (i32 imm:$shift)))]>;
+
+// Per byte: 8b & 16b
+def MOVIv8b_ns   : SIMDModifiedImmVectorNoShift<0, 0, 0b1110, V64,  imm0_255,
+                                                 "movi", ".8b",
+                       [(set (v8i8 V64:$Rd), (AArch64movi imm0_255:$imm8))]>;
+def MOVIv16b_ns  : SIMDModifiedImmVectorNoShift<1, 0, 0b1110, V128, imm0_255,
+                                                 "movi", ".16b",
+                       [(set (v16i8 V128:$Rd), (AArch64movi imm0_255:$imm8))]>;
+
+// AdvSIMD MVNI
+
+// EDIT per word & halfword: 2s, 4h, 4s, & 8h
+defm MVNI      : SIMDModifiedImmVectorShift<1, 0b10, 0b00, "mvni">;
+
+def : InstAlias<"mvni $Vd.4h, $imm", (MVNIv4i16 V64:$Vd,  imm0_255:$imm, 0), 0>;
+def : InstAlias<"mvni $Vd.8h, $imm", (MVNIv8i16 V128:$Vd, imm0_255:$imm, 0), 0>;
+def : InstAlias<"mvni $Vd.2s, $imm", (MVNIv2i32 V64:$Vd,  imm0_255:$imm, 0), 0>;
+def : InstAlias<"mvni $Vd.4s, $imm", (MVNIv4i32 V128:$Vd, imm0_255:$imm, 0), 0>;
+
+def : InstAlias<"mvni.4h $Vd, $imm", (MVNIv4i16 V64:$Vd,  imm0_255:$imm, 0), 0>;
+def : InstAlias<"mvni.8h $Vd, $imm", (MVNIv8i16 V128:$Vd, imm0_255:$imm, 0), 0>;
+def : InstAlias<"mvni.2s $Vd, $imm", (MVNIv2i32 V64:$Vd,  imm0_255:$imm, 0), 0>;
+def : InstAlias<"mvni.4s $Vd, $imm", (MVNIv4i32 V128:$Vd, imm0_255:$imm, 0), 0>;
+
+def : Pat<(v2i32 (AArch64mvni_shift imm0_255:$imm8, (i32 imm:$shift))),
+          (MVNIv2i32 imm0_255:$imm8, imm:$shift)>;
+def : Pat<(v4i32 (AArch64mvni_shift imm0_255:$imm8, (i32 imm:$shift))),
+          (MVNIv4i32 imm0_255:$imm8, imm:$shift)>;
+def : Pat<(v4i16 (AArch64mvni_shift imm0_255:$imm8, (i32 imm:$shift))),
+          (MVNIv4i16 imm0_255:$imm8, imm:$shift)>;
+def : Pat<(v8i16 (AArch64mvni_shift imm0_255:$imm8, (i32 imm:$shift))),
+          (MVNIv8i16 imm0_255:$imm8, imm:$shift)>;
+
+// EDIT per word: 2s & 4s with MSL shifter
+def MVNIv2s_msl   : SIMDModifiedImmMoveMSL<0, 1, {1,1,0,?}, V64, "mvni", ".2s",
+                      [(set (v2i32 V64:$Rd),
+                            (AArch64mvni_msl imm0_255:$imm8, (i32 imm:$shift)))]>;
+def MVNIv4s_msl   : SIMDModifiedImmMoveMSL<1, 1, {1,1,0,?}, V128, "mvni", ".4s",
+                      [(set (v4i32 V128:$Rd),
+                            (AArch64mvni_msl imm0_255:$imm8, (i32 imm:$shift)))]>;
+
+//----------------------------------------------------------------------------
+// AdvSIMD indexed element
+//----------------------------------------------------------------------------
+
+let neverHasSideEffects = 1 in {
+  defm FMLA  : SIMDFPIndexedSDTied<0, 0b0001, "fmla">;
+  defm FMLS  : SIMDFPIndexedSDTied<0, 0b0101, "fmls">;
+}
+
+// NOTE: Operands are reordered in the FMLA/FMLS PatFrags because the
+// instruction expects the addend first, while the intrinsic expects it last.
+
+// On the other hand, there are quite a few valid combinatorial options due to
+// the commutativity of multiplication and the fact that (-x) * y = x * (-y).
+defm : SIMDFPIndexedSDTiedPatterns<"FMLA",
+           TriOpFrag<(fma node:$RHS, node:$MHS, node:$LHS)>>;
+defm : SIMDFPIndexedSDTiedPatterns<"FMLA",
+           TriOpFrag<(fma node:$MHS, node:$RHS, node:$LHS)>>;
+
+defm : SIMDFPIndexedSDTiedPatterns<"FMLS",
+           TriOpFrag<(fma node:$MHS, (fneg node:$RHS), node:$LHS)> >;
+defm : SIMDFPIndexedSDTiedPatterns<"FMLS",
+           TriOpFrag<(fma node:$RHS, (fneg node:$MHS), node:$LHS)> >;
+defm : SIMDFPIndexedSDTiedPatterns<"FMLS",
+           TriOpFrag<(fma (fneg node:$RHS), node:$MHS, node:$LHS)> >;
+defm : SIMDFPIndexedSDTiedPatterns<"FMLS",
+           TriOpFrag<(fma (fneg node:$MHS), node:$RHS, node:$LHS)> >;
+
+multiclass FMLSIndexedAfterNegPatterns<SDPatternOperator OpNode> {
+  // 3 variants for the .2s version: DUPLANE from 128-bit, DUPLANE from 64-bit
+  // and DUP scalar.
+  def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn),
+                           (AArch64duplane32 (v4f32 (fneg V128:$Rm)),
+                                           VectorIndexS:$idx))),
+            (FMLSv2i32_indexed V64:$Rd, V64:$Rn, V128:$Rm, VectorIndexS:$idx)>;
+  def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn),
+                           (v2f32 (AArch64duplane32
+                                      (v4f32 (insert_subvector undef,
+                                                 (v2f32 (fneg V64:$Rm)),
+                                                 (i32 0))),
+                                      VectorIndexS:$idx)))),
+            (FMLSv2i32_indexed V64:$Rd, V64:$Rn,
+                               (SUBREG_TO_REG (i32 0), V64:$Rm, dsub),
+                               VectorIndexS:$idx)>;
+  def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn),
+                           (AArch64dup (f32 (fneg FPR32Op:$Rm))))),
+            (FMLSv2i32_indexed V64:$Rd, V64:$Rn,
+                (SUBREG_TO_REG (i32 0), FPR32Op:$Rm, ssub), (i64 0))>;
+
+  // 3 variants for the .4s version: DUPLANE from 128-bit, DUPLANE from 64-bit
+  // and DUP scalar.
+  def : Pat<(v4f32 (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn),
+                           (AArch64duplane32 (v4f32 (fneg V128:$Rm)),
+                                           VectorIndexS:$idx))),
+            (FMLSv4i32_indexed V128:$Rd, V128:$Rn, V128:$Rm,
+                               VectorIndexS:$idx)>;
+  def : Pat<(v4f32 (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn),
+                           (v4f32 (AArch64duplane32
+                                      (v4f32 (insert_subvector undef,
+                                                 (v2f32 (fneg V64:$Rm)),
+                                                 (i32 0))),
+                                      VectorIndexS:$idx)))),
+            (FMLSv4i32_indexed V128:$Rd, V128:$Rn,
+                               (SUBREG_TO_REG (i32 0), V64:$Rm, dsub),
+                               VectorIndexS:$idx)>;
+  def : Pat<(v4f32 (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn),
+                           (AArch64dup (f32 (fneg FPR32Op:$Rm))))),
+            (FMLSv4i32_indexed V128:$Rd, V128:$Rn,
+                (SUBREG_TO_REG (i32 0), FPR32Op:$Rm, ssub), (i64 0))>;
+
+  // 2 variants for the .2d version: DUPLANE from 128-bit, and DUP scalar
+  // (DUPLANE from 64-bit would be trivial).
+  def : Pat<(v2f64 (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn),
+                           (AArch64duplane64 (v2f64 (fneg V128:$Rm)),
+                                           VectorIndexD:$idx))),
+            (FMLSv2i64_indexed
+                V128:$Rd, V128:$Rn, V128:$Rm, VectorIndexS:$idx)>;
+  def : Pat<(v2f64 (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn),
+                           (AArch64dup (f64 (fneg FPR64Op:$Rm))))),
+            (FMLSv2i64_indexed V128:$Rd, V128:$Rn,
+                (SUBREG_TO_REG (i32 0), FPR64Op:$Rm, dsub), (i64 0))>;
+
+  // 2 variants for 32-bit scalar version: extract from .2s or from .4s
+  def : Pat<(f32 (OpNode (f32 FPR32:$Rd), (f32 FPR32:$Rn),
+                         (vector_extract (v4f32 (fneg V128:$Rm)),
+                                         VectorIndexS:$idx))),
+            (FMLSv1i32_indexed FPR32:$Rd, FPR32:$Rn,
+                V128:$Rm, VectorIndexS:$idx)>;
+  def : Pat<(f32 (OpNode (f32 FPR32:$Rd), (f32 FPR32:$Rn),
+                         (vector_extract (v2f32 (fneg V64:$Rm)),
+                                         VectorIndexS:$idx))),
+            (FMLSv1i32_indexed FPR32:$Rd, FPR32:$Rn,
+                (SUBREG_TO_REG (i32 0), V64:$Rm, dsub), VectorIndexS:$idx)>;
+
+  // 1 variant for 64-bit scalar version: extract from .1d or from .2d
+  def : Pat<(f64 (OpNode (f64 FPR64:$Rd), (f64 FPR64:$Rn),
+                         (vector_extract (v2f64 (fneg V128:$Rm)),
+                                         VectorIndexS:$idx))),
+            (FMLSv1i64_indexed FPR64:$Rd, FPR64:$Rn,
+                V128:$Rm, VectorIndexS:$idx)>;
+}
+
+defm : FMLSIndexedAfterNegPatterns<
+           TriOpFrag<(fma node:$RHS, node:$MHS, node:$LHS)> >;
+defm : FMLSIndexedAfterNegPatterns<
+           TriOpFrag<(fma node:$MHS, node:$RHS, node:$LHS)> >;
+
+defm FMULX : SIMDFPIndexedSD<1, 0b1001, "fmulx", int_aarch64_neon_fmulx>;
+defm FMUL  : SIMDFPIndexedSD<0, 0b1001, "fmul", fmul>;
+
+def : Pat<(v2f32 (fmul V64:$Rn, (AArch64dup (f32 FPR32:$Rm)))),
+          (FMULv2i32_indexed V64:$Rn,
+            (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR32:$Rm, ssub),
+            (i64 0))>;
+def : Pat<(v4f32 (fmul V128:$Rn, (AArch64dup (f32 FPR32:$Rm)))),
+          (FMULv4i32_indexed V128:$Rn,
+            (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR32:$Rm, ssub),
+            (i64 0))>;
+def : Pat<(v2f64 (fmul V128:$Rn, (AArch64dup (f64 FPR64:$Rm)))),
+          (FMULv2i64_indexed V128:$Rn,
+            (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR64:$Rm, dsub),
+            (i64 0))>;
+
+defm SQDMULH : SIMDIndexedHS<0, 0b1100, "sqdmulh", int_aarch64_neon_sqdmulh>;
+defm SQRDMULH : SIMDIndexedHS<0, 0b1101, "sqrdmulh", int_aarch64_neon_sqrdmulh>;
+defm MLA   : SIMDVectorIndexedHSTied<1, 0b0000, "mla",
+              TriOpFrag<(add node:$LHS, (mul node:$MHS, node:$RHS))>>;
+defm MLS   : SIMDVectorIndexedHSTied<1, 0b0100, "mls",
+              TriOpFrag<(sub node:$LHS, (mul node:$MHS, node:$RHS))>>;
+defm MUL   : SIMDVectorIndexedHS<0, 0b1000, "mul", mul>;
+defm SMLAL : SIMDVectorIndexedLongSDTied<0, 0b0010, "smlal",
+    TriOpFrag<(add node:$LHS, (int_aarch64_neon_smull node:$MHS, node:$RHS))>>;
+defm SMLSL : SIMDVectorIndexedLongSDTied<0, 0b0110, "smlsl",
+    TriOpFrag<(sub node:$LHS, (int_aarch64_neon_smull node:$MHS, node:$RHS))>>;
+defm SMULL : SIMDVectorIndexedLongSD<0, 0b1010, "smull",
+                int_aarch64_neon_smull>;
+defm SQDMLAL : SIMDIndexedLongSQDMLXSDTied<0, 0b0011, "sqdmlal",
+                                           int_aarch64_neon_sqadd>;
+defm SQDMLSL : SIMDIndexedLongSQDMLXSDTied<0, 0b0111, "sqdmlsl",
+                                           int_aarch64_neon_sqsub>;
+defm SQDMULL : SIMDIndexedLongSD<0, 0b1011, "sqdmull", int_aarch64_neon_sqdmull>;
+defm UMLAL   : SIMDVectorIndexedLongSDTied<1, 0b0010, "umlal",
+    TriOpFrag<(add node:$LHS, (int_aarch64_neon_umull node:$MHS, node:$RHS))>>;
+defm UMLSL   : SIMDVectorIndexedLongSDTied<1, 0b0110, "umlsl",
+    TriOpFrag<(sub node:$LHS, (int_aarch64_neon_umull node:$MHS, node:$RHS))>>;
+defm UMULL   : SIMDVectorIndexedLongSD<1, 0b1010, "umull",
+                int_aarch64_neon_umull>;
+
+// A scalar sqdmull with the second operand being a vector lane can be
+// handled directly with the indexed instruction encoding.
+def : Pat<(int_aarch64_neon_sqdmulls_scalar (i32 FPR32:$Rn),
+                                          (vector_extract (v4i32 V128:$Vm),
+                                                           VectorIndexS:$idx)),
+          (SQDMULLv1i64_indexed FPR32:$Rn, V128:$Vm, VectorIndexS:$idx)>;
+
+//----------------------------------------------------------------------------
+// AdvSIMD scalar shift instructions
+//----------------------------------------------------------------------------
+defm FCVTZS : SIMDScalarRShiftSD<0, 0b11111, "fcvtzs">;
+defm FCVTZU : SIMDScalarRShiftSD<1, 0b11111, "fcvtzu">;
+defm SCVTF  : SIMDScalarRShiftSD<0, 0b11100, "scvtf">;
+defm UCVTF  : SIMDScalarRShiftSD<1, 0b11100, "ucvtf">;
+// Codegen patterns for the above. We don't put these directly on the
+// instructions because TableGen's type inference can't handle the truth.
+// Having the same base pattern for fp <--> int totally freaks it out.
+def : Pat<(int_aarch64_neon_vcvtfp2fxs FPR32:$Rn, vecshiftR32:$imm),
+          (FCVTZSs FPR32:$Rn, vecshiftR32:$imm)>;
+def : Pat<(int_aarch64_neon_vcvtfp2fxu FPR32:$Rn, vecshiftR32:$imm),
+          (FCVTZUs FPR32:$Rn, vecshiftR32:$imm)>;
+def : Pat<(i64 (int_aarch64_neon_vcvtfp2fxs (f64 FPR64:$Rn), vecshiftR64:$imm)),
+          (FCVTZSd FPR64:$Rn, vecshiftR64:$imm)>;
+def : Pat<(i64 (int_aarch64_neon_vcvtfp2fxu (f64 FPR64:$Rn), vecshiftR64:$imm)),
+          (FCVTZUd FPR64:$Rn, vecshiftR64:$imm)>;
+def : Pat<(v1i64 (int_aarch64_neon_vcvtfp2fxs (v1f64 FPR64:$Rn),
+                                            vecshiftR64:$imm)),
+          (FCVTZSd FPR64:$Rn, vecshiftR64:$imm)>;
+def : Pat<(v1i64 (int_aarch64_neon_vcvtfp2fxu (v1f64 FPR64:$Rn),
+                                            vecshiftR64:$imm)),
+          (FCVTZUd FPR64:$Rn, vecshiftR64:$imm)>;
+def : Pat<(int_aarch64_neon_vcvtfxs2fp FPR32:$Rn, vecshiftR32:$imm),
+          (SCVTFs FPR32:$Rn, vecshiftR32:$imm)>;
+def : Pat<(int_aarch64_neon_vcvtfxu2fp FPR32:$Rn, vecshiftR32:$imm),
+          (UCVTFs FPR32:$Rn, vecshiftR32:$imm)>;
+def : Pat<(f64 (int_aarch64_neon_vcvtfxs2fp (i64 FPR64:$Rn), vecshiftR64:$imm)),
+          (SCVTFd FPR64:$Rn, vecshiftR64:$imm)>;
+def : Pat<(f64 (int_aarch64_neon_vcvtfxu2fp (i64 FPR64:$Rn), vecshiftR64:$imm)),
+          (UCVTFd FPR64:$Rn, vecshiftR64:$imm)>;
+def : Pat<(v1f64 (int_aarch64_neon_vcvtfxs2fp (v1i64 FPR64:$Rn),
+                                            vecshiftR64:$imm)),
+          (SCVTFd FPR64:$Rn, vecshiftR64:$imm)>;
+def : Pat<(v1f64 (int_aarch64_neon_vcvtfxu2fp (v1i64 FPR64:$Rn),
+                                            vecshiftR64:$imm)),
+          (UCVTFd FPR64:$Rn, vecshiftR64:$imm)>;
+
+defm SHL      : SIMDScalarLShiftD<   0, 0b01010, "shl", AArch64vshl>;
+defm SLI      : SIMDScalarLShiftDTied<1, 0b01010, "sli">;
+defm SQRSHRN  : SIMDScalarRShiftBHS< 0, 0b10011, "sqrshrn",
+                                     int_aarch64_neon_sqrshrn>;
+defm SQRSHRUN : SIMDScalarRShiftBHS< 1, 0b10001, "sqrshrun",
+                                     int_aarch64_neon_sqrshrun>;
+defm SQSHLU   : SIMDScalarLShiftBHSD<1, 0b01100, "sqshlu", AArch64sqshlui>;
+defm SQSHL    : SIMDScalarLShiftBHSD<0, 0b01110, "sqshl", AArch64sqshli>;
+defm SQSHRN   : SIMDScalarRShiftBHS< 0, 0b10010, "sqshrn",
+                                     int_aarch64_neon_sqshrn>;
+defm SQSHRUN  : SIMDScalarRShiftBHS< 1, 0b10000, "sqshrun",
+                                     int_aarch64_neon_sqshrun>;
+defm SRI      : SIMDScalarRShiftDTied<   1, 0b01000, "sri">;
+defm SRSHR    : SIMDScalarRShiftD<   0, 0b00100, "srshr", AArch64srshri>;
+defm SRSRA    : SIMDScalarRShiftDTied<   0, 0b00110, "srsra",
+    TriOpFrag<(add node:$LHS,
+                   (AArch64srshri node:$MHS, node:$RHS))>>;
+defm SSHR     : SIMDScalarRShiftD<   0, 0b00000, "sshr", AArch64vashr>;
+defm SSRA     : SIMDScalarRShiftDTied<   0, 0b00010, "ssra",
+    TriOpFrag<(add node:$LHS,
+                   (AArch64vashr node:$MHS, node:$RHS))>>;
+defm UQRSHRN  : SIMDScalarRShiftBHS< 1, 0b10011, "uqrshrn",
+                                     int_aarch64_neon_uqrshrn>;
+defm UQSHL    : SIMDScalarLShiftBHSD<1, 0b01110, "uqshl", AArch64uqshli>;
+defm UQSHRN   : SIMDScalarRShiftBHS< 1, 0b10010, "uqshrn",
+                                     int_aarch64_neon_uqshrn>;
+defm URSHR    : SIMDScalarRShiftD<   1, 0b00100, "urshr", AArch64urshri>;
+defm URSRA    : SIMDScalarRShiftDTied<   1, 0b00110, "ursra",
+    TriOpFrag<(add node:$LHS,
+                   (AArch64urshri node:$MHS, node:$RHS))>>;
+defm USHR     : SIMDScalarRShiftD<   1, 0b00000, "ushr", AArch64vlshr>;
+defm USRA     : SIMDScalarRShiftDTied<   1, 0b00010, "usra",
+    TriOpFrag<(add node:$LHS,
+                   (AArch64vlshr node:$MHS, node:$RHS))>>;
+
+//----------------------------------------------------------------------------
+// AdvSIMD vector shift instructions
+//----------------------------------------------------------------------------
+defm FCVTZS:SIMDVectorRShiftSD<0, 0b11111, "fcvtzs", int_aarch64_neon_vcvtfp2fxs>;
+defm FCVTZU:SIMDVectorRShiftSD<1, 0b11111, "fcvtzu", int_aarch64_neon_vcvtfp2fxu>;
+defm SCVTF: SIMDVectorRShiftSDToFP<0, 0b11100, "scvtf",
+                                   int_aarch64_neon_vcvtfxs2fp>;
+defm RSHRN   : SIMDVectorRShiftNarrowBHS<0, 0b10001, "rshrn",
+                                         int_aarch64_neon_rshrn>;
+defm SHL     : SIMDVectorLShiftBHSD<0, 0b01010, "shl", AArch64vshl>;
+defm SHRN    : SIMDVectorRShiftNarrowBHS<0, 0b10000, "shrn",
+                          BinOpFrag<(trunc (AArch64vashr node:$LHS, node:$RHS))>>;
+defm SLI     : SIMDVectorLShiftBHSDTied<1, 0b01010, "sli", int_aarch64_neon_vsli>;
+def : Pat<(v1i64 (int_aarch64_neon_vsli (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn),
+                                      (i32 vecshiftL64:$imm))),
+          (SLId FPR64:$Rd, FPR64:$Rn, vecshiftL64:$imm)>;
+defm SQRSHRN : SIMDVectorRShiftNarrowBHS<0, 0b10011, "sqrshrn",
+                                         int_aarch64_neon_sqrshrn>;
+defm SQRSHRUN: SIMDVectorRShiftNarrowBHS<1, 0b10001, "sqrshrun",
+                                         int_aarch64_neon_sqrshrun>;
+defm SQSHLU : SIMDVectorLShiftBHSD<1, 0b01100, "sqshlu", AArch64sqshlui>;
+defm SQSHL  : SIMDVectorLShiftBHSD<0, 0b01110, "sqshl", AArch64sqshli>;
+defm SQSHRN  : SIMDVectorRShiftNarrowBHS<0, 0b10010, "sqshrn",
+                                         int_aarch64_neon_sqshrn>;
+defm SQSHRUN : SIMDVectorRShiftNarrowBHS<1, 0b10000, "sqshrun",
+                                         int_aarch64_neon_sqshrun>;
+defm SRI     : SIMDVectorRShiftBHSDTied<1, 0b01000, "sri", int_aarch64_neon_vsri>;
+def : Pat<(v1i64 (int_aarch64_neon_vsri (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn),
+                                      (i32 vecshiftR64:$imm))),
+          (SRId FPR64:$Rd, FPR64:$Rn, vecshiftR64:$imm)>;
+defm SRSHR   : SIMDVectorRShiftBHSD<0, 0b00100, "srshr", AArch64srshri>;
+defm SRSRA   : SIMDVectorRShiftBHSDTied<0, 0b00110, "srsra",
+                 TriOpFrag<(add node:$LHS,
+                                (AArch64srshri node:$MHS, node:$RHS))> >;
+defm SSHLL   : SIMDVectorLShiftLongBHSD<0, 0b10100, "sshll",
+                BinOpFrag<(AArch64vshl (sext node:$LHS), node:$RHS)>>;
+
+defm SSHR    : SIMDVectorRShiftBHSD<0, 0b00000, "sshr", AArch64vashr>;
+defm SSRA    : SIMDVectorRShiftBHSDTied<0, 0b00010, "ssra",
+                TriOpFrag<(add node:$LHS, (AArch64vashr node:$MHS, node:$RHS))>>;
+defm UCVTF   : SIMDVectorRShiftSDToFP<1, 0b11100, "ucvtf",
+                        int_aarch64_neon_vcvtfxu2fp>;
+defm UQRSHRN : SIMDVectorRShiftNarrowBHS<1, 0b10011, "uqrshrn",
+                                         int_aarch64_neon_uqrshrn>;
+defm UQSHL   : SIMDVectorLShiftBHSD<1, 0b01110, "uqshl", AArch64uqshli>;
+defm UQSHRN  : SIMDVectorRShiftNarrowBHS<1, 0b10010, "uqshrn",
+                                         int_aarch64_neon_uqshrn>;
+defm URSHR   : SIMDVectorRShiftBHSD<1, 0b00100, "urshr", AArch64urshri>;
+defm URSRA   : SIMDVectorRShiftBHSDTied<1, 0b00110, "ursra",
+                TriOpFrag<(add node:$LHS,
+                               (AArch64urshri node:$MHS, node:$RHS))> >;
+defm USHLL   : SIMDVectorLShiftLongBHSD<1, 0b10100, "ushll",
+                BinOpFrag<(AArch64vshl (zext node:$LHS), node:$RHS)>>;
+defm USHR    : SIMDVectorRShiftBHSD<1, 0b00000, "ushr", AArch64vlshr>;
+defm USRA    : SIMDVectorRShiftBHSDTied<1, 0b00010, "usra",
+                TriOpFrag<(add node:$LHS, (AArch64vlshr node:$MHS, node:$RHS))> >;
+
+// SHRN patterns for when a logical right shift was used instead of arithmetic
+// (the immediate guarantees no sign bits actually end up in the result so it
+// doesn't matter).
+def : Pat<(v8i8 (trunc (AArch64vlshr (v8i16 V128:$Rn), vecshiftR16Narrow:$imm))),
+          (SHRNv8i8_shift V128:$Rn, vecshiftR16Narrow:$imm)>;
+def : Pat<(v4i16 (trunc (AArch64vlshr (v4i32 V128:$Rn), vecshiftR32Narrow:$imm))),
+          (SHRNv4i16_shift V128:$Rn, vecshiftR32Narrow:$imm)>;
+def : Pat<(v2i32 (trunc (AArch64vlshr (v2i64 V128:$Rn), vecshiftR64Narrow:$imm))),
+          (SHRNv2i32_shift V128:$Rn, vecshiftR64Narrow:$imm)>;
+
+def : Pat<(v16i8 (concat_vectors (v8i8 V64:$Rd),
+                                 (trunc (AArch64vlshr (v8i16 V128:$Rn),
+                                                    vecshiftR16Narrow:$imm)))),
+          (SHRNv16i8_shift (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub),
+                           V128:$Rn, vecshiftR16Narrow:$imm)>;
+def : Pat<(v8i16 (concat_vectors (v4i16 V64:$Rd),
+                                 (trunc (AArch64vlshr (v4i32 V128:$Rn),
+                                                    vecshiftR32Narrow:$imm)))),
+          (SHRNv8i16_shift (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub),
+                           V128:$Rn, vecshiftR32Narrow:$imm)>;
+def : Pat<(v4i32 (concat_vectors (v2i32 V64:$Rd),
+                                 (trunc (AArch64vlshr (v2i64 V128:$Rn),
+                                                    vecshiftR64Narrow:$imm)))),
+          (SHRNv4i32_shift (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub),
+                           V128:$Rn, vecshiftR32Narrow:$imm)>;
+
+// Vector sign and zero extensions are implemented with SSHLL and USSHLL.
+// Anyexts are implemented as zexts.
+def : Pat<(v8i16 (sext   (v8i8 V64:$Rn))),  (SSHLLv8i8_shift  V64:$Rn, (i32 0))>;
+def : Pat<(v8i16 (zext   (v8i8 V64:$Rn))),  (USHLLv8i8_shift  V64:$Rn, (i32 0))>;
+def : Pat<(v8i16 (anyext (v8i8 V64:$Rn))),  (USHLLv8i8_shift  V64:$Rn, (i32 0))>;
+def : Pat<(v4i32 (sext   (v4i16 V64:$Rn))), (SSHLLv4i16_shift V64:$Rn, (i32 0))>;
+def : Pat<(v4i32 (zext   (v4i16 V64:$Rn))), (USHLLv4i16_shift V64:$Rn, (i32 0))>;
+def : Pat<(v4i32 (anyext (v4i16 V64:$Rn))), (USHLLv4i16_shift V64:$Rn, (i32 0))>;
+def : Pat<(v2i64 (sext   (v2i32 V64:$Rn))), (SSHLLv2i32_shift V64:$Rn, (i32 0))>;
+def : Pat<(v2i64 (zext   (v2i32 V64:$Rn))), (USHLLv2i32_shift V64:$Rn, (i32 0))>;
+def : Pat<(v2i64 (anyext (v2i32 V64:$Rn))), (USHLLv2i32_shift V64:$Rn, (i32 0))>;
+// Also match an extend from the upper half of a 128 bit source register.
+def : Pat<(v8i16 (anyext (v8i8 (extract_subvector V128:$Rn, (i64 8)) ))),
+          (USHLLv16i8_shift V128:$Rn, (i32 0))>;
+def : Pat<(v8i16 (zext   (v8i8 (extract_subvector V128:$Rn, (i64 8)) ))),
+          (USHLLv16i8_shift V128:$Rn, (i32 0))>;
+def : Pat<(v8i16 (sext   (v8i8 (extract_subvector V128:$Rn, (i64 8)) ))),
+          (SSHLLv16i8_shift V128:$Rn, (i32 0))>;
+def : Pat<(v4i32 (anyext (v4i16 (extract_subvector V128:$Rn, (i64 4)) ))),
+          (USHLLv8i16_shift V128:$Rn, (i32 0))>;
+def : Pat<(v4i32 (zext   (v4i16 (extract_subvector V128:$Rn, (i64 4)) ))),
+          (USHLLv8i16_shift V128:$Rn, (i32 0))>;
+def : Pat<(v4i32 (sext   (v4i16 (extract_subvector V128:$Rn, (i64 4)) ))),
+          (SSHLLv8i16_shift V128:$Rn, (i32 0))>;
+def : Pat<(v2i64 (anyext (v2i32 (extract_subvector V128:$Rn, (i64 2)) ))),
+          (USHLLv4i32_shift V128:$Rn, (i32 0))>;
+def : Pat<(v2i64 (zext   (v2i32 (extract_subvector V128:$Rn, (i64 2)) ))),
+          (USHLLv4i32_shift V128:$Rn, (i32 0))>;
+def : Pat<(v2i64 (sext   (v2i32 (extract_subvector V128:$Rn, (i64 2)) ))),
+          (SSHLLv4i32_shift V128:$Rn, (i32 0))>;
+
+// Vector shift sxtl aliases
+def : InstAlias<"sxtl.8h $dst, $src1",
+                (SSHLLv8i8_shift V128:$dst, V64:$src1, 0)>;
+def : InstAlias<"sxtl $dst.8h, $src1.8b",
+                (SSHLLv8i8_shift V128:$dst, V64:$src1, 0)>;
+def : InstAlias<"sxtl.4s $dst, $src1",
+                (SSHLLv4i16_shift V128:$dst, V64:$src1, 0)>;
+def : InstAlias<"sxtl $dst.4s, $src1.4h",
+                (SSHLLv4i16_shift V128:$dst, V64:$src1, 0)>;
+def : InstAlias<"sxtl.2d $dst, $src1",
+                (SSHLLv2i32_shift V128:$dst, V64:$src1, 0)>;
+def : InstAlias<"sxtl $dst.2d, $src1.2s",
+                (SSHLLv2i32_shift V128:$dst, V64:$src1, 0)>;
+
+// Vector shift sxtl2 aliases
+def : InstAlias<"sxtl2.8h $dst, $src1",
+                (SSHLLv16i8_shift V128:$dst, V128:$src1, 0)>;
+def : InstAlias<"sxtl2 $dst.8h, $src1.16b",
+                (SSHLLv16i8_shift V128:$dst, V128:$src1, 0)>;
+def : InstAlias<"sxtl2.4s $dst, $src1",
+                (SSHLLv8i16_shift V128:$dst, V128:$src1, 0)>;
+def : InstAlias<"sxtl2 $dst.4s, $src1.8h",
+                (SSHLLv8i16_shift V128:$dst, V128:$src1, 0)>;
+def : InstAlias<"sxtl2.2d $dst, $src1",
+                (SSHLLv4i32_shift V128:$dst, V128:$src1, 0)>;
+def : InstAlias<"sxtl2 $dst.2d, $src1.4s",
+                (SSHLLv4i32_shift V128:$dst, V128:$src1, 0)>;
+
+// Vector shift uxtl aliases
+def : InstAlias<"uxtl.8h $dst, $src1",
+                (USHLLv8i8_shift V128:$dst, V64:$src1, 0)>;
+def : InstAlias<"uxtl $dst.8h, $src1.8b",
+                (USHLLv8i8_shift V128:$dst, V64:$src1, 0)>;
+def : InstAlias<"uxtl.4s $dst, $src1",
+                (USHLLv4i16_shift V128:$dst, V64:$src1, 0)>;
+def : InstAlias<"uxtl $dst.4s, $src1.4h",
+                (USHLLv4i16_shift V128:$dst, V64:$src1, 0)>;
+def : InstAlias<"uxtl.2d $dst, $src1",
+                (USHLLv2i32_shift V128:$dst, V64:$src1, 0)>;
+def : InstAlias<"uxtl $dst.2d, $src1.2s",
+                (USHLLv2i32_shift V128:$dst, V64:$src1, 0)>;
+
+// Vector shift uxtl2 aliases
+def : InstAlias<"uxtl2.8h $dst, $src1",
+                (USHLLv16i8_shift V128:$dst, V128:$src1, 0)>;
+def : InstAlias<"uxtl2 $dst.8h, $src1.16b",
+                (USHLLv16i8_shift V128:$dst, V128:$src1, 0)>;
+def : InstAlias<"uxtl2.4s $dst, $src1",
+                (USHLLv8i16_shift V128:$dst, V128:$src1, 0)>;
+def : InstAlias<"uxtl2 $dst.4s, $src1.8h",
+                (USHLLv8i16_shift V128:$dst, V128:$src1, 0)>;
+def : InstAlias<"uxtl2.2d $dst, $src1",
+                (USHLLv4i32_shift V128:$dst, V128:$src1, 0)>;
+def : InstAlias<"uxtl2 $dst.2d, $src1.4s",
+                (USHLLv4i32_shift V128:$dst, V128:$src1, 0)>;
+
+// If an integer is about to be converted to a floating point value,
+// just load it on the floating point unit.
+// These patterns are more complex because floating point loads do not
+// support sign extension.
+// The sign extension has to be explicitly added and is only supported for
+// one step: byte-to-half, half-to-word, word-to-doubleword.
+// SCVTF GPR -> FPR is 9 cycles.
+// SCVTF FPR -> FPR is 4 cyclces.
+// (sign extension with lengthen) SXTL FPR -> FPR is 2 cycles.
+// Therefore, we can do 2 sign extensions and one SCVTF FPR -> FPR
+// and still being faster.
+// However, this is not good for code size.
+// 8-bits -> float. 2 sizes step-up.
+class SExtLoadi8CVTf32Pat<dag addrmode, dag INST>
+  : Pat<(f32 (sint_to_fp (i32 (sextloadi8 addrmode)))),
+        (SCVTFv1i32 (f32 (EXTRACT_SUBREG
+                            (SSHLLv4i16_shift
+                              (f64
+                                (EXTRACT_SUBREG
+                                  (SSHLLv8i8_shift
+                                    (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
+                                        INST,
+                                        bsub),
+                                    0),
+                                  dsub)),
+                               0),
+                             ssub)))>, Requires<[NotForCodeSize]>;
+
+def : SExtLoadi8CVTf32Pat<(ro8.Wpat GPR64sp:$Rn, GPR32:$Rm, ro8.Wext:$ext),
+                          (LDRBroW  GPR64sp:$Rn, GPR32:$Rm, ro8.Wext:$ext)>;
+def : SExtLoadi8CVTf32Pat<(ro8.Xpat GPR64sp:$Rn, GPR64:$Rm, ro8.Xext:$ext),
+                          (LDRBroX  GPR64sp:$Rn, GPR64:$Rm, ro8.Xext:$ext)>;
+def : SExtLoadi8CVTf32Pat<(am_indexed8 GPR64sp:$Rn, uimm12s1:$offset),
+                          (LDRBui GPR64sp:$Rn, uimm12s1:$offset)>;
+def : SExtLoadi8CVTf32Pat<(am_unscaled8 GPR64sp:$Rn, simm9:$offset),
+                          (LDURBi GPR64sp:$Rn, simm9:$offset)>;
+
+// 16-bits -> float. 1 size step-up.
+class SExtLoadi16CVTf32Pat<dag addrmode, dag INST>
+  : Pat<(f32 (sint_to_fp (i32 (sextloadi16 addrmode)))),
+        (SCVTFv1i32 (f32 (EXTRACT_SUBREG
+                            (SSHLLv4i16_shift
+                                (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
+                                  INST,
+                                  hsub),
+                                0),
+                            ssub)))>, Requires<[NotForCodeSize]>;
+
+def : SExtLoadi16CVTf32Pat<(ro16.Wpat GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext),
+                           (LDRHroW   GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext)>;
+def : SExtLoadi16CVTf32Pat<(ro16.Xpat GPR64sp:$Rn, GPR64:$Rm, ro16.Xext:$ext),
+                           (LDRHroX   GPR64sp:$Rn, GPR64:$Rm, ro16.Xext:$ext)>;
+def : SExtLoadi16CVTf32Pat<(am_indexed16 GPR64sp:$Rn, uimm12s2:$offset),
+                           (LDRHui GPR64sp:$Rn, uimm12s2:$offset)>;
+def : SExtLoadi16CVTf32Pat<(am_unscaled16 GPR64sp:$Rn, simm9:$offset),
+                           (LDURHi GPR64sp:$Rn, simm9:$offset)>;
+
+// 32-bits to 32-bits are handled in target specific dag combine:
+// performIntToFpCombine.
+// 64-bits integer to 32-bits floating point, not possible with
+// SCVTF on floating point registers (both source and destination
+// must have the same size).
+
+// Here are the patterns for 8, 16, 32, and 64-bits to double.
+// 8-bits -> double. 3 size step-up: give up.
+// 16-bits -> double. 2 size step.
+class SExtLoadi16CVTf64Pat<dag addrmode, dag INST>
+  : Pat <(f64 (sint_to_fp (i32 (sextloadi16 addrmode)))),
+           (SCVTFv1i64 (f64 (EXTRACT_SUBREG
+                              (SSHLLv2i32_shift
+                                 (f64
+                                  (EXTRACT_SUBREG
+                                    (SSHLLv4i16_shift
+                                      (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
+                                        INST,
+                                        hsub),
+                                     0),
+                                   dsub)),
+                               0),
+                             dsub)))>, Requires<[NotForCodeSize]>;
+
+def : SExtLoadi16CVTf64Pat<(ro16.Wpat GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext),
+                           (LDRHroW GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext)>;
+def : SExtLoadi16CVTf64Pat<(ro16.Xpat GPR64sp:$Rn, GPR64:$Rm, ro16.Xext:$ext),
+                           (LDRHroX GPR64sp:$Rn, GPR64:$Rm, ro16.Xext:$ext)>;
+def : SExtLoadi16CVTf64Pat<(am_indexed16 GPR64sp:$Rn, uimm12s2:$offset),
+                           (LDRHui GPR64sp:$Rn, uimm12s2:$offset)>;
+def : SExtLoadi16CVTf64Pat<(am_unscaled16 GPR64sp:$Rn, simm9:$offset),
+                           (LDURHi GPR64sp:$Rn, simm9:$offset)>;
+// 32-bits -> double. 1 size step-up.
+class SExtLoadi32CVTf64Pat<dag addrmode, dag INST>
+  : Pat <(f64 (sint_to_fp (i32 (load addrmode)))),
+           (SCVTFv1i64 (f64 (EXTRACT_SUBREG
+                              (SSHLLv2i32_shift
+                                (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
+                                  INST,
+                                  ssub),
+                               0),
+                             dsub)))>, Requires<[NotForCodeSize]>;
+
+def : SExtLoadi32CVTf64Pat<(ro32.Wpat GPR64sp:$Rn, GPR32:$Rm, ro32.Wext:$ext),
+                           (LDRSroW GPR64sp:$Rn, GPR32:$Rm, ro32.Wext:$ext)>;
+def : SExtLoadi32CVTf64Pat<(ro32.Xpat GPR64sp:$Rn, GPR64:$Rm, ro32.Xext:$ext),
+                           (LDRSroX GPR64sp:$Rn, GPR64:$Rm, ro32.Xext:$ext)>;
+def : SExtLoadi32CVTf64Pat<(am_indexed32 GPR64sp:$Rn, uimm12s4:$offset),
+                           (LDRSui GPR64sp:$Rn, uimm12s4:$offset)>;
+def : SExtLoadi32CVTf64Pat<(am_unscaled32 GPR64sp:$Rn, simm9:$offset),
+                           (LDURSi GPR64sp:$Rn, simm9:$offset)>;
+
+// 64-bits -> double are handled in target specific dag combine:
+// performIntToFpCombine.
+
+
+//----------------------------------------------------------------------------
+// AdvSIMD Load-Store Structure
+//----------------------------------------------------------------------------
+defm LD1 : SIMDLd1Multiple<"ld1">;
+defm LD2 : SIMDLd2Multiple<"ld2">;
+defm LD3 : SIMDLd3Multiple<"ld3">;
+defm LD4 : SIMDLd4Multiple<"ld4">;
+
+defm ST1 : SIMDSt1Multiple<"st1">;
+defm ST2 : SIMDSt2Multiple<"st2">;
+defm ST3 : SIMDSt3Multiple<"st3">;
+defm ST4 : SIMDSt4Multiple<"st4">;
+
+class Ld1Pat<ValueType ty, Instruction INST>
+  : Pat<(ty (load GPR64sp:$Rn)), (INST GPR64sp:$Rn)>;
+
+def : Ld1Pat<v16i8, LD1Onev16b>;
+def : Ld1Pat<v8i16, LD1Onev8h>;
+def : Ld1Pat<v4i32, LD1Onev4s>;
+def : Ld1Pat<v2i64, LD1Onev2d>;
+def : Ld1Pat<v8i8,  LD1Onev8b>;
+def : Ld1Pat<v4i16, LD1Onev4h>;
+def : Ld1Pat<v2i32, LD1Onev2s>;
+def : Ld1Pat<v1i64, LD1Onev1d>;
+
+class St1Pat<ValueType ty, Instruction INST>
+  : Pat<(store ty:$Vt, GPR64sp:$Rn),
+        (INST ty:$Vt, GPR64sp:$Rn)>;
+
+def : St1Pat<v16i8, ST1Onev16b>;
+def : St1Pat<v8i16, ST1Onev8h>;
+def : St1Pat<v4i32, ST1Onev4s>;
+def : St1Pat<v2i64, ST1Onev2d>;
+def : St1Pat<v8i8,  ST1Onev8b>;
+def : St1Pat<v4i16, ST1Onev4h>;
+def : St1Pat<v2i32, ST1Onev2s>;
+def : St1Pat<v1i64, ST1Onev1d>;
+
+//---
+// Single-element
+//---
+
+defm LD1R          : SIMDLdR<0, 0b110, 0, "ld1r", "One", 1, 2, 4, 8>;
+defm LD2R          : SIMDLdR<1, 0b110, 0, "ld2r", "Two", 2, 4, 8, 16>;
+defm LD3R          : SIMDLdR<0, 0b111, 0, "ld3r", "Three", 3, 6, 12, 24>;
+defm LD4R          : SIMDLdR<1, 0b111, 0, "ld4r", "Four", 4, 8, 16, 32>;
+let mayLoad = 1, neverHasSideEffects = 1 in {
+defm LD1 : SIMDLdSingleBTied<0, 0b000,       "ld1", VecListOneb,   GPR64pi1>;
+defm LD1 : SIMDLdSingleHTied<0, 0b010, 0,    "ld1", VecListOneh,   GPR64pi2>;
+defm LD1 : SIMDLdSingleSTied<0, 0b100, 0b00, "ld1", VecListOnes,   GPR64pi4>;
+defm LD1 : SIMDLdSingleDTied<0, 0b100, 0b01, "ld1", VecListOned,   GPR64pi8>;
+defm LD2 : SIMDLdSingleBTied<1, 0b000,       "ld2", VecListTwob,   GPR64pi2>;
+defm LD2 : SIMDLdSingleHTied<1, 0b010, 0,    "ld2", VecListTwoh,   GPR64pi4>;
+defm LD2 : SIMDLdSingleSTied<1, 0b100, 0b00, "ld2", VecListTwos,   GPR64pi8>;
+defm LD2 : SIMDLdSingleDTied<1, 0b100, 0b01, "ld2", VecListTwod,   GPR64pi16>;
+defm LD3 : SIMDLdSingleBTied<0, 0b001,       "ld3", VecListThreeb, GPR64pi3>;
+defm LD3 : SIMDLdSingleHTied<0, 0b011, 0,    "ld3", VecListThreeh, GPR64pi6>;
+defm LD3 : SIMDLdSingleSTied<0, 0b101, 0b00, "ld3", VecListThrees, GPR64pi12>;
+defm LD3 : SIMDLdSingleDTied<0, 0b101, 0b01, "ld3", VecListThreed, GPR64pi24>;
+defm LD4 : SIMDLdSingleBTied<1, 0b001,       "ld4", VecListFourb,  GPR64pi4>;
+defm LD4 : SIMDLdSingleHTied<1, 0b011, 0,    "ld4", VecListFourh,  GPR64pi8>;
+defm LD4 : SIMDLdSingleSTied<1, 0b101, 0b00, "ld4", VecListFours,  GPR64pi16>;
+defm LD4 : SIMDLdSingleDTied<1, 0b101, 0b01, "ld4", VecListFourd,  GPR64pi32>;
+}
+
+def : Pat<(v8i8 (AArch64dup (i32 (extloadi8 GPR64sp:$Rn)))),
+          (LD1Rv8b GPR64sp:$Rn)>;
+def : Pat<(v16i8 (AArch64dup (i32 (extloadi8 GPR64sp:$Rn)))),
+          (LD1Rv16b GPR64sp:$Rn)>;
+def : Pat<(v4i16 (AArch64dup (i32 (extloadi16 GPR64sp:$Rn)))),
+          (LD1Rv4h GPR64sp:$Rn)>;
+def : Pat<(v8i16 (AArch64dup (i32 (extloadi16 GPR64sp:$Rn)))),
+          (LD1Rv8h GPR64sp:$Rn)>;
+def : Pat<(v2i32 (AArch64dup (i32 (load GPR64sp:$Rn)))),
+          (LD1Rv2s GPR64sp:$Rn)>;
+def : Pat<(v4i32 (AArch64dup (i32 (load GPR64sp:$Rn)))),
+          (LD1Rv4s GPR64sp:$Rn)>;
+def : Pat<(v2i64 (AArch64dup (i64 (load GPR64sp:$Rn)))),
+          (LD1Rv2d GPR64sp:$Rn)>;
+def : Pat<(v1i64 (AArch64dup (i64 (load GPR64sp:$Rn)))),
+          (LD1Rv1d GPR64sp:$Rn)>;
+// Grab the floating point version too
+def : Pat<(v2f32 (AArch64dup (f32 (load GPR64sp:$Rn)))),
+          (LD1Rv2s GPR64sp:$Rn)>;
+def : Pat<(v4f32 (AArch64dup (f32 (load GPR64sp:$Rn)))),
+          (LD1Rv4s GPR64sp:$Rn)>;
+def : Pat<(v2f64 (AArch64dup (f64 (load GPR64sp:$Rn)))),
+          (LD1Rv2d GPR64sp:$Rn)>;
+def : Pat<(v1f64 (AArch64dup (f64 (load GPR64sp:$Rn)))),
+          (LD1Rv1d GPR64sp:$Rn)>;
+
+class Ld1Lane128Pat<SDPatternOperator scalar_load, Operand VecIndex,
+                    ValueType VTy, ValueType STy, Instruction LD1>
+  : Pat<(vector_insert (VTy VecListOne128:$Rd),
+           (STy (scalar_load GPR64sp:$Rn)), VecIndex:$idx),
+        (LD1 VecListOne128:$Rd, VecIndex:$idx, GPR64sp:$Rn)>;
+
+def : Ld1Lane128Pat<extloadi8,  VectorIndexB, v16i8, i32, LD1i8>;
+def : Ld1Lane128Pat<extloadi16, VectorIndexH, v8i16, i32, LD1i16>;
+def : Ld1Lane128Pat<load,       VectorIndexS, v4i32, i32, LD1i32>;
+def : Ld1Lane128Pat<load,       VectorIndexS, v4f32, f32, LD1i32>;
+def : Ld1Lane128Pat<load,       VectorIndexD, v2i64, i64, LD1i64>;
+def : Ld1Lane128Pat<load,       VectorIndexD, v2f64, f64, LD1i64>;
+
+class Ld1Lane64Pat<SDPatternOperator scalar_load, Operand VecIndex,
+                   ValueType VTy, ValueType STy, Instruction LD1>
+  : Pat<(vector_insert (VTy VecListOne64:$Rd),
+           (STy (scalar_load GPR64sp:$Rn)), VecIndex:$idx),
+        (EXTRACT_SUBREG
+            (LD1 (SUBREG_TO_REG (i32 0), VecListOne64:$Rd, dsub),
+                          VecIndex:$idx, GPR64sp:$Rn),
+            dsub)>;
+
+def : Ld1Lane64Pat<extloadi8,  VectorIndexB, v8i8,  i32, LD1i8>;
+def : Ld1Lane64Pat<extloadi16, VectorIndexH, v4i16, i32, LD1i16>;
+def : Ld1Lane64Pat<load,       VectorIndexS, v2i32, i32, LD1i32>;
+def : Ld1Lane64Pat<load,       VectorIndexS, v2f32, f32, LD1i32>;
+
+
+defm LD1 : SIMDLdSt1SingleAliases<"ld1">;
+defm LD2 : SIMDLdSt2SingleAliases<"ld2">;
+defm LD3 : SIMDLdSt3SingleAliases<"ld3">;
+defm LD4 : SIMDLdSt4SingleAliases<"ld4">;
+
+// Stores
+defm ST1 : SIMDStSingleB<0, 0b000,       "st1", VecListOneb, GPR64pi1>;
+defm ST1 : SIMDStSingleH<0, 0b010, 0,    "st1", VecListOneh, GPR64pi2>;
+defm ST1 : SIMDStSingleS<0, 0b100, 0b00, "st1", VecListOnes, GPR64pi4>;
+defm ST1 : SIMDStSingleD<0, 0b100, 0b01, "st1", VecListOned, GPR64pi8>;
+
+let AddedComplexity = 15 in
+class St1Lane128Pat<SDPatternOperator scalar_store, Operand VecIndex,
+                    ValueType VTy, ValueType STy, Instruction ST1>
+  : Pat<(scalar_store
+             (STy (vector_extract (VTy VecListOne128:$Vt), VecIndex:$idx)),
+             GPR64sp:$Rn),
+        (ST1 VecListOne128:$Vt, VecIndex:$idx, GPR64sp:$Rn)>;
+
+def : St1Lane128Pat<truncstorei8,  VectorIndexB, v16i8, i32, ST1i8>;
+def : St1Lane128Pat<truncstorei16, VectorIndexH, v8i16, i32, ST1i16>;
+def : St1Lane128Pat<store,         VectorIndexS, v4i32, i32, ST1i32>;
+def : St1Lane128Pat<store,         VectorIndexS, v4f32, f32, ST1i32>;
+def : St1Lane128Pat<store,         VectorIndexD, v2i64, i64, ST1i64>;
+def : St1Lane128Pat<store,         VectorIndexD, v2f64, f64, ST1i64>;
+
+let AddedComplexity = 15 in
+class St1Lane64Pat<SDPatternOperator scalar_store, Operand VecIndex,
+                   ValueType VTy, ValueType STy, Instruction ST1>
+  : Pat<(scalar_store
+             (STy (vector_extract (VTy VecListOne64:$Vt), VecIndex:$idx)),
+             GPR64sp:$Rn),
+        (ST1 (SUBREG_TO_REG (i32 0), VecListOne64:$Vt, dsub),
+             VecIndex:$idx, GPR64sp:$Rn)>;
+
+def : St1Lane64Pat<truncstorei8,  VectorIndexB, v8i8, i32, ST1i8>;
+def : St1Lane64Pat<truncstorei16, VectorIndexH, v4i16, i32, ST1i16>;
+def : St1Lane64Pat<store,         VectorIndexS, v2i32, i32, ST1i32>;
+def : St1Lane64Pat<store,         VectorIndexS, v2f32, f32, ST1i32>;
+
+multiclass St1LanePost64Pat<SDPatternOperator scalar_store, Operand VecIndex,
+                             ValueType VTy, ValueType STy, Instruction ST1,
+                             int offset> {
+  def : Pat<(scalar_store
+              (STy (vector_extract (VTy VecListOne64:$Vt), VecIndex:$idx)),
+              GPR64sp:$Rn, offset),
+        (ST1 (SUBREG_TO_REG (i32 0), VecListOne64:$Vt, dsub),
+             VecIndex:$idx, GPR64sp:$Rn, XZR)>;
+
+  def : Pat<(scalar_store
+              (STy (vector_extract (VTy VecListOne64:$Vt), VecIndex:$idx)),
+              GPR64sp:$Rn, GPR64:$Rm),
+        (ST1 (SUBREG_TO_REG (i32 0), VecListOne64:$Vt, dsub),
+             VecIndex:$idx, GPR64sp:$Rn, $Rm)>;
+}
+
+defm : St1LanePost64Pat<post_truncsti8, VectorIndexB, v8i8, i32, ST1i8_POST, 1>;
+defm : St1LanePost64Pat<post_truncsti16, VectorIndexH, v4i16, i32, ST1i16_POST,
+                        2>;
+defm : St1LanePost64Pat<post_store, VectorIndexS, v2i32, i32, ST1i32_POST, 4>;
+defm : St1LanePost64Pat<post_store, VectorIndexS, v2f32, f32, ST1i32_POST, 4>;
+defm : St1LanePost64Pat<post_store, VectorIndexD, v1i64, i64, ST1i64_POST, 8>;
+defm : St1LanePost64Pat<post_store, VectorIndexD, v1f64, f64, ST1i64_POST, 8>;
+
+multiclass St1LanePost128Pat<SDPatternOperator scalar_store, Operand VecIndex,
+                             ValueType VTy, ValueType STy, Instruction ST1,
+                             int offset> {
+  def : Pat<(scalar_store
+              (STy (vector_extract (VTy VecListOne128:$Vt), VecIndex:$idx)),
+              GPR64sp:$Rn, offset),
+        (ST1 VecListOne128:$Vt, VecIndex:$idx, GPR64sp:$Rn, XZR)>;
+
+  def : Pat<(scalar_store
+              (STy (vector_extract (VTy VecListOne128:$Vt), VecIndex:$idx)),
+              GPR64sp:$Rn, GPR64:$Rm),
+        (ST1 VecListOne128:$Vt, VecIndex:$idx, GPR64sp:$Rn, $Rm)>;
+}
+
+defm : St1LanePost128Pat<post_truncsti8, VectorIndexB, v16i8, i32, ST1i8_POST,
+                         1>;
+defm : St1LanePost128Pat<post_truncsti16, VectorIndexH, v8i16, i32, ST1i16_POST,
+                         2>;
+defm : St1LanePost128Pat<post_store, VectorIndexS, v4i32, i32, ST1i32_POST, 4>;
+defm : St1LanePost128Pat<post_store, VectorIndexS, v4f32, f32, ST1i32_POST, 4>;
+defm : St1LanePost128Pat<post_store, VectorIndexD, v2i64, i64, ST1i64_POST, 8>;
+defm : St1LanePost128Pat<post_store, VectorIndexD, v2f64, f64, ST1i64_POST, 8>;
+
+let mayStore = 1, neverHasSideEffects = 1 in {
+defm ST2 : SIMDStSingleB<1, 0b000,       "st2", VecListTwob,   GPR64pi2>;
+defm ST2 : SIMDStSingleH<1, 0b010, 0,    "st2", VecListTwoh,   GPR64pi4>;
+defm ST2 : SIMDStSingleS<1, 0b100, 0b00, "st2", VecListTwos,   GPR64pi8>;
+defm ST2 : SIMDStSingleD<1, 0b100, 0b01, "st2", VecListTwod,   GPR64pi16>;
+defm ST3 : SIMDStSingleB<0, 0b001,       "st3", VecListThreeb, GPR64pi3>;
+defm ST3 : SIMDStSingleH<0, 0b011, 0,    "st3", VecListThreeh, GPR64pi6>;
+defm ST3 : SIMDStSingleS<0, 0b101, 0b00, "st3", VecListThrees, GPR64pi12>;
+defm ST3 : SIMDStSingleD<0, 0b101, 0b01, "st3", VecListThreed, GPR64pi24>;
+defm ST4 : SIMDStSingleB<1, 0b001,       "st4", VecListFourb,  GPR64pi4>;
+defm ST4 : SIMDStSingleH<1, 0b011, 0,    "st4", VecListFourh,  GPR64pi8>;
+defm ST4 : SIMDStSingleS<1, 0b101, 0b00, "st4", VecListFours,  GPR64pi16>;
+defm ST4 : SIMDStSingleD<1, 0b101, 0b01, "st4", VecListFourd,  GPR64pi32>;
+}
+
+defm ST1 : SIMDLdSt1SingleAliases<"st1">;
+defm ST2 : SIMDLdSt2SingleAliases<"st2">;
+defm ST3 : SIMDLdSt3SingleAliases<"st3">;
+defm ST4 : SIMDLdSt4SingleAliases<"st4">;
+
+//----------------------------------------------------------------------------
+// Crypto extensions
+//----------------------------------------------------------------------------
+
+def AESErr   : AESTiedInst<0b0100, "aese",   int_aarch64_crypto_aese>;
+def AESDrr   : AESTiedInst<0b0101, "aesd",   int_aarch64_crypto_aesd>;
+def AESMCrr  : AESInst<    0b0110, "aesmc",  int_aarch64_crypto_aesmc>;
+def AESIMCrr : AESInst<    0b0111, "aesimc", int_aarch64_crypto_aesimc>;
+
+def SHA1Crrr     : SHATiedInstQSV<0b000, "sha1c",   int_aarch64_crypto_sha1c>;
+def SHA1Prrr     : SHATiedInstQSV<0b001, "sha1p",   int_aarch64_crypto_sha1p>;
+def SHA1Mrrr     : SHATiedInstQSV<0b010, "sha1m",   int_aarch64_crypto_sha1m>;
+def SHA1SU0rrr   : SHATiedInstVVV<0b011, "sha1su0", int_aarch64_crypto_sha1su0>;
+def SHA256Hrrr   : SHATiedInstQQV<0b100, "sha256h", int_aarch64_crypto_sha256h>;
+def SHA256H2rrr  : SHATiedInstQQV<0b101, "sha256h2",int_aarch64_crypto_sha256h2>;
+def SHA256SU1rrr :SHATiedInstVVV<0b110, "sha256su1",int_aarch64_crypto_sha256su1>;
+
+def SHA1Hrr     : SHAInstSS<    0b0000, "sha1h",    int_aarch64_crypto_sha1h>;
+def SHA1SU1rr   : SHATiedInstVV<0b0001, "sha1su1",  int_aarch64_crypto_sha1su1>;
+def SHA256SU0rr : SHATiedInstVV<0b0010, "sha256su0",int_aarch64_crypto_sha256su0>;
+
+//----------------------------------------------------------------------------
+// Compiler-pseudos
+//----------------------------------------------------------------------------
+// FIXME: Like for X86, these should go in their own separate .td file.
+
+// Any instruction that defines a 32-bit result leaves the high half of the
+// register. Truncate can be lowered to EXTRACT_SUBREG. CopyFromReg may
+// be copying from a truncate. But any other 32-bit operation will zero-extend
+// up to 64 bits.
+// FIXME: X86 also checks for CMOV here. Do we need something similar?
+def def32 : PatLeaf<(i32 GPR32:$src), [{
+  return N->getOpcode() != ISD::TRUNCATE &&
+         N->getOpcode() != TargetOpcode::EXTRACT_SUBREG &&
+         N->getOpcode() != ISD::CopyFromReg;
+}]>;
 
-// The simpler patterns deal with cases where no AND mask is actually needed
-// (either all bits are used or the low 32 bits are used).
-let AddedComplexity = 10 in {
-
-def : Pat<(A64Bfi i64:$src, i64:$Rn, imm:$ImmR, imm:$ImmS),
-           (BFIxxii $src, $Rn,
-                    (bfi64_lsb_to_immr (i64 imm:$ImmR)),
-                    (bfi_width_to_imms (i64 imm:$ImmS)))>;
-
-def : Pat<(A64Bfi i32:$src, i32:$Rn, imm:$ImmR, imm:$ImmS),
-          (BFIwwii $src, $Rn,
-                   (bfi32_lsb_to_immr (i64 imm:$ImmR)),
-                   (bfi_width_to_imms (i64 imm:$ImmS)))>;
-
-
-def : Pat<(and (A64Bfi i64:$src, i64:$Rn, imm:$ImmR, imm:$ImmS),
-               (i64 4294967295)),
-          (SUBREG_TO_REG (i64 0),
-                         (BFIwwii (EXTRACT_SUBREG $src, sub_32),
-                                  (EXTRACT_SUBREG $Rn, sub_32),
-                                  (bfi32_lsb_to_immr (i64 imm:$ImmR)),
-                                  (bfi_width_to_imms (i64 imm:$ImmS))),
-                         sub_32)>;
-
-}
-
-//===----------------------------------------------------------------------===//
-// Miscellaneous patterns
-//===----------------------------------------------------------------------===//
-
-// Truncation from 64 to 32-bits just involves renaming your register.
-def : Pat<(i32 (trunc i64:$val)), (EXTRACT_SUBREG $val, sub_32)>;
-
-// Similarly, extension where we don't care about the high bits is
-// just a rename.
-def : Pat<(i64 (anyext i32:$val)),
-          (INSERT_SUBREG (IMPLICIT_DEF), $val, sub_32)>;
-
-// SELECT instructions providing f128 types need to be handled by a
-// pseudo-instruction since the eventual code will need to introduce basic
-// blocks and control flow.
-def F128CSEL : PseudoInst<(outs FPR128:$Rd),
-                         (ins FPR128:$Rn, FPR128:$Rm, cond_code_op:$Cond),
-                         [(set f128:$Rd, (simple_select f128:$Rn, f128:$Rm))]> {
-  let Uses = [NZCV];
-  let usesCustomInserter = 1;
-}
-
-//===----------------------------------------------------------------------===//
-// Load/store patterns
-//===----------------------------------------------------------------------===//
-
-// There are lots of patterns here, because we need to allow at least three
-// parameters to vary independently.
-//   1. Instruction: "ldrb w9, [sp]", "ldrh w9, [sp]", ...
-//   2. LLVM source: zextloadi8, anyextloadi8, ...
-//   3. Address-generation: A64Wrapper, (add BASE, OFFSET), ...
+// In the case of a 32-bit def that is known to implicitly zero-extend,
+// we can use a SUBREG_TO_REG.
+def : Pat<(i64 (zext def32:$src)), (SUBREG_TO_REG (i64 0), GPR32:$src, sub_32)>;
+
+// For an anyext, we don't care what the high bits are, so we can perform an
+// INSERT_SUBREF into an IMPLICIT_DEF.
+def : Pat<(i64 (anyext GPR32:$src)),
+          (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$src, sub_32)>;
+
+// When we need to explicitly zero-extend, we use an unsigned bitfield move
+// instruction (UBFM) on the enclosing super-reg.
+def : Pat<(i64 (zext GPR32:$src)),
+ (UBFMXri (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$src, sub_32), 0, 31)>;
+
+// To sign extend, we use a signed bitfield move instruction (SBFM) on the
+// containing super-reg.
+def : Pat<(i64 (sext GPR32:$src)),
+   (SBFMXri (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$src, sub_32), 0, 31)>;
+def : Pat<(i64 (sext_inreg GPR64:$src, i32)), (SBFMXri GPR64:$src, 0, 31)>;
+def : Pat<(i64 (sext_inreg GPR64:$src, i16)), (SBFMXri GPR64:$src, 0, 15)>;
+def : Pat<(i64 (sext_inreg GPR64:$src, i8)),  (SBFMXri GPR64:$src, 0, 7)>;
+def : Pat<(i64 (sext_inreg GPR64:$src, i1)),  (SBFMXri GPR64:$src, 0, 0)>;
+def : Pat<(i32 (sext_inreg GPR32:$src, i16)), (SBFMWri GPR32:$src, 0, 15)>;
+def : Pat<(i32 (sext_inreg GPR32:$src, i8)),  (SBFMWri GPR32:$src, 0, 7)>;
+def : Pat<(i32 (sext_inreg GPR32:$src, i1)),  (SBFMWri GPR32:$src, 0, 0)>;
+
+def : Pat<(shl (sext_inreg GPR32:$Rn, i8), (i64 imm0_31:$imm)),
+          (SBFMWri GPR32:$Rn, (i64 (i32shift_a       imm0_31:$imm)),
+                              (i64 (i32shift_sext_i8 imm0_31:$imm)))>;
+def : Pat<(shl (sext_inreg GPR64:$Rn, i8), (i64 imm0_63:$imm)),
+          (SBFMXri GPR64:$Rn, (i64 (i64shift_a imm0_63:$imm)),
+                              (i64 (i64shift_sext_i8 imm0_63:$imm)))>;
+
+def : Pat<(shl (sext_inreg GPR32:$Rn, i16), (i64 imm0_31:$imm)),
+          (SBFMWri GPR32:$Rn, (i64 (i32shift_a        imm0_31:$imm)),
+                              (i64 (i32shift_sext_i16 imm0_31:$imm)))>;
+def : Pat<(shl (sext_inreg GPR64:$Rn, i16), (i64 imm0_63:$imm)),
+          (SBFMXri GPR64:$Rn, (i64 (i64shift_a        imm0_63:$imm)),
+                              (i64 (i64shift_sext_i16 imm0_63:$imm)))>;
+
+def : Pat<(shl (i64 (sext GPR32:$Rn)), (i64 imm0_63:$imm)),
+          (SBFMXri (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$Rn, sub_32),
+                   (i64 (i64shift_a        imm0_63:$imm)),
+                   (i64 (i64shift_sext_i32 imm0_63:$imm)))>;
+
+// sra patterns have an AddedComplexity of 10, so make sure we have a higher
+// AddedComplexity for the following patterns since we want to match sext + sra
+// patterns before we attempt to match a single sra node.
+let AddedComplexity = 20 in {
+// We support all sext + sra combinations which preserve at least one bit of the
+// original value which is to be sign extended. E.g. we support shifts up to
+// bitwidth-1 bits.
+def : Pat<(sra (sext_inreg GPR32:$Rn, i8), (i64 imm0_7:$imm)),
+          (SBFMWri GPR32:$Rn, (i64 imm0_7:$imm), 7)>;
+def : Pat<(sra (sext_inreg GPR64:$Rn, i8), (i64 imm0_7:$imm)),
+          (SBFMXri GPR64:$Rn, (i64 imm0_7:$imm), 7)>;
+
+def : Pat<(sra (sext_inreg GPR32:$Rn, i16), (i64 imm0_15:$imm)),
+          (SBFMWri GPR32:$Rn, (i64 imm0_15:$imm), 15)>;
+def : Pat<(sra (sext_inreg GPR64:$Rn, i16), (i64 imm0_15:$imm)),
+          (SBFMXri GPR64:$Rn, (i64 imm0_15:$imm), 15)>;
+
+def : Pat<(sra (i64 (sext GPR32:$Rn)), (i64 imm0_31:$imm)),
+          (SBFMXri (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$Rn, sub_32),
+                   (i64 imm0_31:$imm), 31)>;
+} // AddedComplexity = 20
+
+// To truncate, we can simply extract from a subregister.
+def : Pat<(i32 (trunc GPR64sp:$src)),
+          (i32 (EXTRACT_SUBREG GPR64sp:$src, sub_32))>;
+
+// __builtin_trap() uses the BRK instruction on AArch64.
+def : Pat<(trap), (BRK 1)>;
+
+// Conversions within AdvSIMD types in the same register size are free.
+// But because we need a consistent lane ordering, in big endian many
+// conversions require one or more REV instructions.
 //
-// The biggest problem turns out to be the address-generation variable. At the
-// point of instantiation we need to produce two DAGs, one for the pattern and
-// one for the instruction. Doing this at the lowest level of classes doesn't
-// work.
+// Consider a simple memory load followed by a bitconvert then a store.
+//   v0 = load v2i32
+//   v1 = BITCAST v2i32 v0 to v4i16
+//        store v4i16 v2
 //
-// Consider the simple uimm12 addressing mode, and the desire to match both (add
-// GPR64xsp:$Rn, uimm12:$Offset) and GPR64xsp:$Rn, particularly on the
-// instruction side. We'd need to insert either "GPR64xsp" and "uimm12" or
-// "GPR64xsp" and "0" into an unknown dag. !subst is not capable of this
-// operation, and PatFrags are for selection not output.
+// In big endian mode every memory access has an implicit byte swap. LDR and
+// STR do a 64-bit byte swap, whereas LD1/ST1 do a byte swap per lane - that
+// is, they treat the vector as a sequence of elements to be byte-swapped.
+// The two pairs of instructions are fundamentally incompatible. We've decided
+// to use LD1/ST1 only to simplify compiler implementation.
 //
-// As a result, the address-generation patterns are the final
-// instantiations. However, we do still need to vary the operand for the address
-// further down (At the point we're deciding A64WrapperSmall, we don't know
-// the memory width of the operation).
-
-//===------------------------------
-// 1. Basic infrastructural defs
-//===------------------------------
-
-// First, some simple classes for !foreach and !subst to use:
-class Decls {
-  dag pattern;
-}
-
-def decls : Decls;
-def ALIGN;
-def INST;
-def OFFSET;
-def SHIFT;
-
-// You can't use !subst on an actual immediate, but you *can* use it on an
-// operand record that happens to match a single immediate. So we do.
-def imm_eq0 : ImmLeaf<i64, [{ return Imm == 0; }]>;
-def imm_eq1 : ImmLeaf<i64, [{ return Imm == 1; }]>;
-def imm_eq2 : ImmLeaf<i64, [{ return Imm == 2; }]>;
-def imm_eq3 : ImmLeaf<i64, [{ return Imm == 3; }]>;
-def imm_eq4 : ImmLeaf<i64, [{ return Imm == 4; }]>;
-
-// If the low bits of a pointer are known to be 0 then an "or" is just as good
-// as addition for computing an offset. This fragment forwards that check for
-// TableGen's use.
-def add_like_or : PatFrag<(ops node:$lhs, node:$rhs), (or node:$lhs, node:$rhs),
-[{
-  return CurDAG->isBaseWithConstantOffset(SDValue(N, 0));
-}]>;
-
-// Load/store (unsigned immediate) operations with relocations against global
-// symbols (for lo12) are only valid if those symbols have correct alignment
-// (since the immediate offset is divided by the access scale, it can't have a
-// remainder).
+// LD1/ST1 perform the equivalent of a sequence of LDR/STR + REV. This makes
+// the original code sequence:
+//   v0 = load v2i32
+//   v1 = REV v2i32                  (implicit)
+//   v2 = BITCAST v2i32 v1 to v4i16
+//   v3 = REV v4i16 v2               (implicit)
+//        store v4i16 v3
 //
-// The guaranteed alignment is provided as part of the WrapperSmall
-// operation, and checked against one of these.
-def any_align   : ImmLeaf<i32, [{ (void)Imm; return true; }]>;
-def min_align2  : ImmLeaf<i32, [{ return Imm >= 2; }]>;
-def min_align4  : ImmLeaf<i32, [{ return Imm >= 4; }]>;
-def min_align8  : ImmLeaf<i32, [{ return Imm >= 8; }]>;
-def min_align16 : ImmLeaf<i32, [{ return Imm >= 16; }]>;
-
-// "Normal" load/store instructions can be used on atomic operations, provided
-// the ordering parameter is at most "monotonic". Anything above that needs
-// special handling with acquire/release instructions.
-class simple_load<PatFrag base>
-  : PatFrag<(ops node:$ptr), (base node:$ptr), [{
-  return cast<AtomicSDNode>(N)->getOrdering() <= Monotonic;
-}]>;
-
-def atomic_load_simple_i8  : simple_load<atomic_load_8>;
-def atomic_load_simple_i16 : simple_load<atomic_load_16>;
-def atomic_load_simple_i32 : simple_load<atomic_load_32>;
-def atomic_load_simple_i64 : simple_load<atomic_load_64>;
-
-class simple_store<PatFrag base>
-  : PatFrag<(ops node:$ptr, node:$val), (base node:$ptr, node:$val), [{
-  return cast<AtomicSDNode>(N)->getOrdering() <= Monotonic;
-}]>;
-
-def atomic_store_simple_i8  : simple_store<atomic_store_8>;
-def atomic_store_simple_i16 : simple_store<atomic_store_16>;
-def atomic_store_simple_i32 : simple_store<atomic_store_32>;
-def atomic_store_simple_i64 : simple_store<atomic_store_64>;
-
-//===------------------------------
-// 2. UImm12 and SImm9
-//===------------------------------
-
-// These instructions have two operands providing the address so they can be
-// treated similarly for most purposes.
-
-//===------------------------------
-// 2.1 Base patterns covering extend/truncate semantics
-//===------------------------------
-
-// Atomic patterns can be shared between integer operations of all sizes, a
-// quick multiclass here allows reuse.
-multiclass ls_atomic_pats<Instruction LOAD, Instruction STORE, dag Base,
-                          dag Offset, dag address, ValueType transty,
-                          ValueType sty> {
-  def : Pat<(!cast<PatFrag>("atomic_load_simple_" # sty) address),
-            (LOAD Base, Offset)>;
-
-  def : Pat<(!cast<PatFrag>("atomic_store_simple_" # sty) address, transty:$Rt),
-            (STORE $Rt, Base, Offset)>;
-}
-
-// Instructions accessing a memory chunk smaller than a register (or, in a
-// pinch, the same size) have a characteristic set of patterns they want to
-// match: extending loads and truncating stores. This class deals with the
-// sign-neutral version of those patterns.
+// But this is now broken - the value stored is different to the value loaded
+// due to lane reordering. To fix this, on every BITCAST we must perform two
+// other REVs:
+//   v0 = load v2i32
+//   v1 = REV v2i32                  (implicit)
+//   v2 = REV v2i32
+//   v3 = BITCAST v2i32 v2 to v4i16
+//   v4 = REV v4i16
+//   v5 = REV v4i16 v4               (implicit)
+//        store v4i16 v5
 //
-// It will be instantiated across multiple addressing-modes.
-multiclass ls_small_pats<Instruction LOAD, Instruction STORE,
-                         dag Base, dag Offset,
-                         dag address, ValueType sty>
-  : ls_atomic_pats<LOAD, STORE, Base, Offset, address, i32, sty> {
-  def : Pat<(!cast<SDNode>(zextload # sty) address), (LOAD Base, Offset)>;
-
-  def : Pat<(!cast<SDNode>(extload # sty) address), (LOAD Base, Offset)>;
-
-  // For zero-extension to 64-bits we have to tell LLVM that the whole 64-bit
-  // register was actually set.
-  def : Pat<(i64 (!cast<SDNode>(zextload # sty) address)),
-            (SUBREG_TO_REG (i64 0), (LOAD Base, Offset), sub_32)>;
-
-  def : Pat<(i64 (!cast<SDNode>(extload # sty) address)),
-            (SUBREG_TO_REG (i64 0), (LOAD Base, Offset), sub_32)>;
-
-  def : Pat<(!cast<SDNode>(truncstore # sty) i32:$Rt, address),
-            (STORE $Rt, Base, Offset)>;
-
-  // For truncating store from 64-bits, we have to manually tell LLVM to
-  // ignore the high bits of the x register.
-  def : Pat<(!cast<SDNode>(truncstore # sty) i64:$Rt, address),
-            (STORE (EXTRACT_SUBREG $Rt, sub_32), Base, Offset)>;
-}
-
-// Next come patterns for sign-extending loads.
-multiclass load_signed_pats<string T, string U, dag Base, dag Offset,
-                            dag address, ValueType sty> {
-  def : Pat<(i32 (!cast<SDNode>("sextload" # sty) address)),
-            (!cast<Instruction>("LDRS" # T # "w" # U) Base, Offset)>;
-
-  def : Pat<(i64 (!cast<SDNode>("sextload" # sty) address)),
-            (!cast<Instruction>("LDRS" # T # "x" # U) Base, Offset)>;
-
-}
-
-// and finally "natural-width" loads and stores come next.
-multiclass ls_neutral_pats<Instruction LOAD, Instruction STORE, dag Base,
-                           dag Offset, dag address, ValueType sty> {
-  def : Pat<(sty (load address)), (LOAD Base, Offset)>;
-  def : Pat<(store sty:$Rt, address), (STORE $Rt, Base, Offset)>;
-}
-
-// Integer operations also get atomic instructions to select for.
-multiclass ls_int_neutral_pats<Instruction LOAD, Instruction STORE, dag Base,
-                           dag Offset, dag address, ValueType sty>
-  : ls_neutral_pats<LOAD, STORE, Base, Offset, address, sty>,
-    ls_atomic_pats<LOAD, STORE, Base, Offset, address, sty, sty>;
-
-//===------------------------------
-// 2.2. Addressing-mode instantiations
-//===------------------------------
-
-multiclass uimm12_pats<dag address, dag Base, dag Offset> {
-  defm : ls_small_pats<LS8_LDR, LS8_STR, Base,
-                       !foreach(decls.pattern, Offset,
-                                !subst(OFFSET, byte_uimm12, decls.pattern)),
-                       !foreach(decls.pattern, address,
-                                !subst(OFFSET, byte_uimm12,
-                                !subst(ALIGN, any_align, decls.pattern))),
-                       i8>;
-  defm : ls_small_pats<LS16_LDR, LS16_STR, Base,
-                       !foreach(decls.pattern, Offset,
-                                !subst(OFFSET, hword_uimm12, decls.pattern)),
-                       !foreach(decls.pattern, address,
-                                !subst(OFFSET, hword_uimm12,
-                                !subst(ALIGN, min_align2, decls.pattern))),
-                       i16>;
-  defm : ls_small_pats<LS32_LDR, LS32_STR, Base,
-                       !foreach(decls.pattern, Offset,
-                                !subst(OFFSET, word_uimm12, decls.pattern)),
-                       !foreach(decls.pattern, address,
-                                !subst(OFFSET, word_uimm12,
-                                !subst(ALIGN, min_align4, decls.pattern))),
-                       i32>;
-
-  defm : ls_int_neutral_pats<LS32_LDR, LS32_STR, Base,
-                          !foreach(decls.pattern, Offset,
-                                   !subst(OFFSET, word_uimm12, decls.pattern)),
-                          !foreach(decls.pattern, address,
-                                   !subst(OFFSET, word_uimm12,
-                                   !subst(ALIGN, min_align4, decls.pattern))),
-                          i32>;
-
-  defm : ls_int_neutral_pats<LS64_LDR, LS64_STR, Base,
-                          !foreach(decls.pattern, Offset,
-                                   !subst(OFFSET, dword_uimm12, decls.pattern)),
-                          !foreach(decls.pattern, address,
-                                   !subst(OFFSET, dword_uimm12,
-                                   !subst(ALIGN, min_align8, decls.pattern))),
-                          i64>;
-
-  defm : ls_neutral_pats<LSFP16_LDR, LSFP16_STR, Base,
-                          !foreach(decls.pattern, Offset,
-                                   !subst(OFFSET, hword_uimm12, decls.pattern)),
-                          !foreach(decls.pattern, address,
-                                   !subst(OFFSET, hword_uimm12,
-                                   !subst(ALIGN, min_align2, decls.pattern))),
-                          f16>;
-
-  defm : ls_neutral_pats<LSFP32_LDR, LSFP32_STR, Base,
-                          !foreach(decls.pattern, Offset,
-                                   !subst(OFFSET, word_uimm12, decls.pattern)),
-                          !foreach(decls.pattern, address,
-                                   !subst(OFFSET, word_uimm12,
-                                   !subst(ALIGN, min_align4, decls.pattern))),
-                          f32>;
-
-  defm : ls_neutral_pats<LSFP64_LDR, LSFP64_STR, Base,
-                          !foreach(decls.pattern, Offset,
-                                   !subst(OFFSET, dword_uimm12, decls.pattern)),
-                          !foreach(decls.pattern, address,
-                                   !subst(OFFSET, dword_uimm12,
-                                   !subst(ALIGN, min_align8, decls.pattern))),
-                          f64>;
-
-  defm : ls_neutral_pats<LSFP128_LDR, LSFP128_STR, Base,
-                          !foreach(decls.pattern, Offset,
-                                   !subst(OFFSET, qword_uimm12, decls.pattern)),
-                          !foreach(decls.pattern, address,
-                                   !subst(OFFSET, qword_uimm12,
-                                   !subst(ALIGN, min_align16, decls.pattern))),
-                          f128>;
-
-  defm : load_signed_pats<"B", "", Base,
-                          !foreach(decls.pattern, Offset,
-                                   !subst(OFFSET, byte_uimm12, decls.pattern)),
-                          !foreach(decls.pattern, address,
-                                   !subst(OFFSET, byte_uimm12,
-                                   !subst(ALIGN, any_align, decls.pattern))),
-                          i8>;
-
-  defm : load_signed_pats<"H", "", Base,
-                          !foreach(decls.pattern, Offset,
-                                   !subst(OFFSET, hword_uimm12, decls.pattern)),
-                          !foreach(decls.pattern, address,
-                                   !subst(OFFSET, hword_uimm12,
-                                   !subst(ALIGN, min_align2, decls.pattern))),
-                          i16>;
-
-  def : Pat<(sextloadi32 !foreach(decls.pattern, address,
-                                  !subst(OFFSET, word_uimm12,
-                                  !subst(ALIGN, min_align4, decls.pattern)))),
-            (LDRSWx Base, !foreach(decls.pattern, Offset,
-                                  !subst(OFFSET, word_uimm12, decls.pattern)))>;
-}
-
-// Straightforward patterns of last resort: a pointer with or without an
-// appropriate offset.
-defm : uimm12_pats<(i64 i64:$Rn), (i64 i64:$Rn), (i64 0)>;
-defm : uimm12_pats<(add i64:$Rn, OFFSET:$UImm12),
-                   (i64 i64:$Rn), (i64 OFFSET:$UImm12)>;
-
-// The offset could be hidden behind an "or", of course:
-defm : uimm12_pats<(add_like_or i64:$Rn, OFFSET:$UImm12),
-                   (i64 i64:$Rn), (i64 OFFSET:$UImm12)>;
-
-// Global addresses under the small-absolute model should use these
-// instructions. There are ELF relocations specifically for it.
-defm : uimm12_pats<(A64WrapperSmall tglobaladdr:$Hi, tglobaladdr:$Lo12, ALIGN),
-                   (ADRPxi tglobaladdr:$Hi), (i64 tglobaladdr:$Lo12)>;
-
-defm : uimm12_pats<(A64WrapperSmall tglobaltlsaddr:$Hi, tglobaltlsaddr:$Lo12,
-                                    ALIGN),
-                   (ADRPxi tglobaltlsaddr:$Hi), (i64 tglobaltlsaddr:$Lo12)>;
-
-// External symbols that make it this far should also get standard relocations.
-defm : uimm12_pats<(A64WrapperSmall texternalsym:$Hi, texternalsym:$Lo12,
-                                    ALIGN),
-                   (ADRPxi texternalsym:$Hi), (i64 texternalsym:$Lo12)>;
-
-defm : uimm12_pats<(A64WrapperSmall tconstpool:$Hi, tconstpool:$Lo12, ALIGN),
-                   (ADRPxi tconstpool:$Hi), (i64 tconstpool:$Lo12)>;
-
-// We also want to use uimm12 instructions for local variables at the moment.
-def tframeindex_XFORM : SDNodeXForm<frameindex, [{
-  int FI = cast<FrameIndexSDNode>(N)->getIndex();
-  return CurDAG->getTargetFrameIndex(FI, MVT::i64);
-}]>;
-
-defm : uimm12_pats<(i64 frameindex:$Rn),
-                   (tframeindex_XFORM tframeindex:$Rn), (i64 0)>;
-
-// These can be much simpler than uimm12 because we don't to change the operand
-// type (e.g. LDURB and LDURH take the same operands).
-multiclass simm9_pats<dag address, dag Base, dag Offset> {
-  defm : ls_small_pats<LS8_LDUR, LS8_STUR, Base, Offset, address, i8>;
-  defm : ls_small_pats<LS16_LDUR, LS16_STUR, Base, Offset, address, i16>;
-
-  defm : ls_int_neutral_pats<LS32_LDUR, LS32_STUR, Base, Offset, address, i32>;
-  defm : ls_int_neutral_pats<LS64_LDUR, LS64_STUR, Base, Offset, address, i64>;
-
-  defm : ls_neutral_pats<LSFP16_LDUR, LSFP16_STUR, Base, Offset, address, f16>;
-  defm : ls_neutral_pats<LSFP32_LDUR, LSFP32_STUR, Base, Offset, address, f32>;
-  defm : ls_neutral_pats<LSFP64_LDUR, LSFP64_STUR, Base, Offset, address, f64>;
-  defm : ls_neutral_pats<LSFP128_LDUR, LSFP128_STUR, Base, Offset, address,
-                         f128>;
-
-  def : Pat<(i64 (zextloadi32 address)),
-            (SUBREG_TO_REG (i64 0), (LS32_LDUR Base, Offset), sub_32)>;
-
-  def : Pat<(truncstorei32 i64:$Rt, address),
-            (LS32_STUR (EXTRACT_SUBREG $Rt, sub_32), Base, Offset)>;
-
-  defm : load_signed_pats<"B", "_U", Base, Offset, address, i8>;
-  defm : load_signed_pats<"H", "_U", Base, Offset, address, i16>;
-  def : Pat<(sextloadi32 address), (LDURSWx Base, Offset)>;
-}
-
-defm : simm9_pats<(add i64:$Rn, simm9:$SImm9),
-                  (i64 $Rn), (SDXF_simm9 simm9:$SImm9)>;
-
-defm : simm9_pats<(add_like_or i64:$Rn, simm9:$SImm9),
-                  (i64 $Rn), (SDXF_simm9 simm9:$SImm9)>;
-
-
-//===------------------------------
-// 3. Register offset patterns
-//===------------------------------
-
-// Atomic patterns can be shared between integer operations of all sizes, a
-// quick multiclass here allows reuse.
-multiclass ro_atomic_pats<Instruction LOAD, Instruction STORE, dag Base,
-                          dag Offset, dag Extend, dag address,
-                          ValueType transty, ValueType sty> {
-  def : Pat<(!cast<PatFrag>("atomic_load_simple_" # sty) address),
-            (LOAD Base, Offset, Extend)>;
-
-  def : Pat<(!cast<PatFrag>("atomic_store_simple_" # sty) address, transty:$Rt),
-            (STORE $Rt, Base, Offset, Extend)>;
-}
-
-// The register offset instructions take three operands giving the instruction,
-// and have an annoying split between instructions where Rm is 32-bit and
-// 64-bit. So we need a special hierarchy to describe them. Other than that the
-// same operations should be supported as for simm9 and uimm12 addressing.
-
-multiclass ro_small_pats<Instruction LOAD, Instruction STORE,
-                         dag Base, dag Offset, dag Extend,
-                         dag address, ValueType sty>
-  : ro_atomic_pats<LOAD, STORE, Base, Offset, Extend, address, i32, sty> {
-  def : Pat<(!cast<SDNode>(zextload # sty) address),
-            (LOAD Base, Offset, Extend)>;
-
-  def : Pat<(!cast<SDNode>(extload # sty) address),
-            (LOAD Base, Offset, Extend)>;
-
-  // For zero-extension to 64-bits we have to tell LLVM that the whole 64-bit
-  // register was actually set.
-  def : Pat<(i64 (!cast<SDNode>(zextload # sty) address)),
-            (SUBREG_TO_REG (i64 0), (LOAD Base, Offset, Extend), sub_32)>;
-
-  def : Pat<(i64 (!cast<SDNode>(extload # sty) address)),
-            (SUBREG_TO_REG (i64 0), (LOAD Base, Offset, Extend), sub_32)>;
-
-  def : Pat<(!cast<SDNode>(truncstore # sty) i32:$Rt, address),
-            (STORE $Rt, Base, Offset, Extend)>;
-
-  // For truncating store from 64-bits, we have to manually tell LLVM to
-  // ignore the high bits of the x register.
-  def : Pat<(!cast<SDNode>(truncstore # sty) i64:$Rt, address),
-            (STORE (EXTRACT_SUBREG $Rt, sub_32), Base, Offset, Extend)>;
-
-}
-
-// Next come patterns for sign-extending loads.
-multiclass ro_signed_pats<string T, string Rm, dag Base, dag Offset, dag Extend,
-                          dag address, ValueType sty> {
-  def : Pat<(i32 (!cast<SDNode>("sextload" # sty) address)),
-            (!cast<Instruction>("LDRS" # T # "w_" # Rm # "_RegOffset")
-              Base, Offset, Extend)>;
-
-  def : Pat<(i64 (!cast<SDNode>("sextload" # sty) address)),
-            (!cast<Instruction>("LDRS" # T # "x_" # Rm # "_RegOffset")
-              Base, Offset, Extend)>;
-}
-
-// and finally "natural-width" loads and stores come next.
-multiclass ro_neutral_pats<Instruction LOAD, Instruction STORE,
-                           dag Base, dag Offset, dag Extend, dag address,
-                           ValueType sty> {
-  def : Pat<(sty (load address)), (LOAD Base, Offset, Extend)>;
-  def : Pat<(store sty:$Rt, address),
-            (STORE $Rt, Base, Offset, Extend)>;
-}
-
-multiclass ro_int_neutral_pats<Instruction LOAD, Instruction STORE,
-                               dag Base, dag Offset, dag Extend, dag address,
-                               ValueType sty>
-  : ro_neutral_pats<LOAD, STORE, Base, Offset, Extend, address, sty>,
-    ro_atomic_pats<LOAD, STORE, Base, Offset, Extend, address, sty, sty>;
-
-multiclass regoff_pats<string Rm, dag address, dag Base, dag Offset,
-                       dag Extend> {
-  defm : ro_small_pats<!cast<Instruction>("LS8_" # Rm # "_RegOffset_LDR"),
-                       !cast<Instruction>("LS8_" # Rm # "_RegOffset_STR"),
-                       Base, Offset, Extend,
-                       !foreach(decls.pattern, address,
-                                !subst(SHIFT, imm_eq0, decls.pattern)),
-                       i8>;
-  defm : ro_small_pats<!cast<Instruction>("LS16_" # Rm # "_RegOffset_LDR"),
-                       !cast<Instruction>("LS16_" # Rm # "_RegOffset_STR"),
-                       Base, Offset, Extend,
-                       !foreach(decls.pattern, address,
-                                !subst(SHIFT, imm_eq1, decls.pattern)),
-                       i16>;
-  defm : ro_small_pats<!cast<Instruction>("LS32_" # Rm # "_RegOffset_LDR"),
-                       !cast<Instruction>("LS32_" # Rm # "_RegOffset_STR"),
-                       Base, Offset, Extend,
-                       !foreach(decls.pattern, address,
-                                !subst(SHIFT, imm_eq2, decls.pattern)),
-                       i32>;
-
-  defm : ro_int_neutral_pats<
-                            !cast<Instruction>("LS32_" # Rm # "_RegOffset_LDR"),
-                            !cast<Instruction>("LS32_" # Rm # "_RegOffset_STR"),
-                            Base, Offset, Extend,
-                            !foreach(decls.pattern, address,
-                                     !subst(SHIFT, imm_eq2, decls.pattern)),
-                            i32>;
-
-  defm : ro_int_neutral_pats<
-                            !cast<Instruction>("LS64_" # Rm # "_RegOffset_LDR"),
-                            !cast<Instruction>("LS64_" # Rm # "_RegOffset_STR"),
-                            Base, Offset, Extend,
-                            !foreach(decls.pattern, address,
-                                     !subst(SHIFT, imm_eq3, decls.pattern)),
-                            i64>;
-
-  defm : ro_neutral_pats<!cast<Instruction>("LSFP16_" # Rm # "_RegOffset_LDR"),
-                         !cast<Instruction>("LSFP16_" # Rm # "_RegOffset_STR"),
-                         Base, Offset, Extend,
-                         !foreach(decls.pattern, address,
-                                  !subst(SHIFT, imm_eq1, decls.pattern)),
-                         f16>;
-
-  defm : ro_neutral_pats<!cast<Instruction>("LSFP32_" # Rm # "_RegOffset_LDR"),
-                         !cast<Instruction>("LSFP32_" # Rm # "_RegOffset_STR"),
-                         Base, Offset, Extend,
-                         !foreach(decls.pattern, address,
-                                  !subst(SHIFT, imm_eq2, decls.pattern)),
-                         f32>;
-
-  defm : ro_neutral_pats<!cast<Instruction>("LSFP64_" # Rm # "_RegOffset_LDR"),
-                         !cast<Instruction>("LSFP64_" # Rm # "_RegOffset_STR"),
-                         Base, Offset, Extend,
-                         !foreach(decls.pattern, address,
-                                  !subst(SHIFT, imm_eq3, decls.pattern)),
-                         f64>;
-
-  defm : ro_neutral_pats<!cast<Instruction>("LSFP128_" # Rm # "_RegOffset_LDR"),
-                         !cast<Instruction>("LSFP128_" # Rm # "_RegOffset_STR"),
-                         Base, Offset, Extend,
-                         !foreach(decls.pattern, address,
-                                  !subst(SHIFT, imm_eq4, decls.pattern)),
-                         f128>;
-
-  defm : ro_signed_pats<"B", Rm, Base, Offset, Extend,
-                        !foreach(decls.pattern, address,
-                                 !subst(SHIFT, imm_eq0, decls.pattern)),
-                        i8>;
-
-  defm : ro_signed_pats<"H", Rm, Base, Offset, Extend,
-                        !foreach(decls.pattern, address,
-                                 !subst(SHIFT, imm_eq1, decls.pattern)),
-                        i16>;
-
-  def : Pat<(sextloadi32 !foreach(decls.pattern, address,
-                                  !subst(SHIFT, imm_eq2, decls.pattern))),
-            (!cast<Instruction>("LDRSWx_" # Rm # "_RegOffset")
-              Base, Offset, Extend)>;
-}
-
-
-// Finally we're in a position to tell LLVM exactly what addresses are reachable
-// using register-offset instructions. Essentially a base plus a possibly
-// extended, possibly shifted (by access size) offset.
-
-defm : regoff_pats<"Wm", (add i64:$Rn, (sext i32:$Rm)),
-                   (i64 i64:$Rn), (i32 i32:$Rm), (i64 6)>;
-
-defm : regoff_pats<"Wm", (add i64:$Rn, (shl (sext i32:$Rm), SHIFT)),
-                   (i64 i64:$Rn), (i32 i32:$Rm), (i64 7)>;
-
-defm : regoff_pats<"Wm", (add i64:$Rn, (zext i32:$Rm)),
-                   (i64 i64:$Rn), (i32 i32:$Rm), (i64 2)>;
-
-defm : regoff_pats<"Wm", (add i64:$Rn, (shl (zext i32:$Rm), SHIFT)),
-                   (i64 i64:$Rn), (i32 i32:$Rm), (i64 3)>;
-
-defm : regoff_pats<"Xm", (add i64:$Rn, i64:$Rm),
-                   (i64 i64:$Rn), (i64 i64:$Rm), (i64 2)>;
-
-defm : regoff_pats<"Xm", (add i64:$Rn, (shl i64:$Rm, SHIFT)),
-                   (i64 i64:$Rn), (i64 i64:$Rm), (i64 3)>;
-
-//===----------------------------------------------------------------------===//
-// Advanced SIMD (NEON) Support
+// This means an extra two instructions, but actually in most cases the two REV
+// instructions can be combined into one. For example:
+//   (REV64_2s (REV64_4h X)) === (REV32_4h X)
+//
+// There is also no 128-bit REV instruction. This must be synthesized with an
+// EXT instruction.
 //
+// Most bitconverts require some sort of conversion. The only exceptions are:
+//   a) Identity conversions -  vNfX <-> vNiX
+//   b) Single-lane-to-scalar - v1fX <-> fX or v1iX <-> iX
+//
+
+let Predicates = [IsLE] in {
+def : Pat<(v8i8  (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
+def : Pat<(v4i16 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
+def : Pat<(v2i32 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
+def : Pat<(v2f32 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
+
+def : Pat<(i64 (bitconvert (v8i8  V64:$Vn))),
+          (COPY_TO_REGCLASS V64:$Vn, GPR64)>;
+def : Pat<(i64 (bitconvert (v4i16 V64:$Vn))),
+          (COPY_TO_REGCLASS V64:$Vn, GPR64)>;
+def : Pat<(i64 (bitconvert (v2i32 V64:$Vn))),
+          (COPY_TO_REGCLASS V64:$Vn, GPR64)>;
+def : Pat<(i64 (bitconvert (v2f32 V64:$Vn))),
+          (COPY_TO_REGCLASS V64:$Vn, GPR64)>;
+def : Pat<(i64 (bitconvert (v1f64 V64:$Vn))),
+          (COPY_TO_REGCLASS V64:$Vn, GPR64)>;
+}
+let Predicates = [IsBE] in {
+def : Pat<(v8i8  (bitconvert GPR64:$Xn)),
+                 (REV64v8i8 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>;
+def : Pat<(v4i16 (bitconvert GPR64:$Xn)),
+                 (REV64v4i16 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>;
+def : Pat<(v2i32 (bitconvert GPR64:$Xn)),
+                 (REV64v2i32 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>;
+def : Pat<(v2f32 (bitconvert GPR64:$Xn)),
+                 (REV64v2i32 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>;
+
+def : Pat<(i64 (bitconvert (v8i8  V64:$Vn))),
+          (REV64v8i8 (COPY_TO_REGCLASS V64:$Vn, GPR64))>;
+def : Pat<(i64 (bitconvert (v4i16 V64:$Vn))),
+          (REV64v4i16 (COPY_TO_REGCLASS V64:$Vn, GPR64))>;
+def : Pat<(i64 (bitconvert (v2i32 V64:$Vn))),
+          (REV64v2i32 (COPY_TO_REGCLASS V64:$Vn, GPR64))>;
+def : Pat<(i64 (bitconvert (v2f32 V64:$Vn))),
+          (REV64v2i32 (COPY_TO_REGCLASS V64:$Vn, GPR64))>;
+}
+def : Pat<(v1i64 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
+def : Pat<(v1f64 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
+def : Pat<(i64 (bitconvert (v1i64 V64:$Vn))),
+          (COPY_TO_REGCLASS V64:$Vn, GPR64)>;
+def : Pat<(v1i64 (scalar_to_vector GPR64:$Xn)),
+          (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
+def : Pat<(v1f64 (scalar_to_vector GPR64:$Xn)),
+          (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
+def : Pat<(v1f64 (scalar_to_vector (f64 FPR64:$Xn))), (v1f64 FPR64:$Xn)>;
+
+def : Pat<(f32 (bitconvert (i32 GPR32:$Xn))),
+          (COPY_TO_REGCLASS GPR32:$Xn, FPR32)>;
+def : Pat<(i32 (bitconvert (f32 FPR32:$Xn))),
+          (COPY_TO_REGCLASS FPR32:$Xn, GPR32)>;
+def : Pat<(f64 (bitconvert (i64 GPR64:$Xn))),
+          (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
+def : Pat<(i64 (bitconvert (f64 FPR64:$Xn))),
+          (COPY_TO_REGCLASS FPR64:$Xn, GPR64)>;
+def : Pat<(i64 (bitconvert (v1f64 V64:$Vn))),
+          (COPY_TO_REGCLASS V64:$Vn, GPR64)>;
+
+let Predicates = [IsLE] in {
+def : Pat<(v1i64 (bitconvert (v2i32 FPR64:$src))), (v1i64 FPR64:$src)>;
+def : Pat<(v1i64 (bitconvert (v4i16 FPR64:$src))), (v1i64 FPR64:$src)>;
+def : Pat<(v1i64 (bitconvert (v8i8  FPR64:$src))), (v1i64 FPR64:$src)>;
+def : Pat<(v1i64 (bitconvert (v2f32 FPR64:$src))), (v1i64 FPR64:$src)>;
+}
+let Predicates = [IsBE] in {
+def : Pat<(v1i64 (bitconvert (v2i32 FPR64:$src))),
+                             (v1i64 (REV64v2i32 FPR64:$src))>;
+def : Pat<(v1i64 (bitconvert (v4i16 FPR64:$src))),
+                             (v1i64 (REV64v4i16 FPR64:$src))>;
+def : Pat<(v1i64 (bitconvert (v8i8  FPR64:$src))),
+                             (v1i64 (REV64v8i8 FPR64:$src))>;
+def : Pat<(v1i64 (bitconvert (v2f32 FPR64:$src))),
+                             (v1i64 (REV64v2i32 FPR64:$src))>;
+}
+def : Pat<(v1i64 (bitconvert (v1f64 FPR64:$src))), (v1i64 FPR64:$src)>;
+def : Pat<(v1i64 (bitconvert (f64   FPR64:$src))), (v1i64 FPR64:$src)>;
+
+let Predicates = [IsLE] in {
+def : Pat<(v2i32 (bitconvert (v1i64 FPR64:$src))), (v2i32 FPR64:$src)>;
+def : Pat<(v2i32 (bitconvert (v4i16 FPR64:$src))), (v2i32 FPR64:$src)>;
+def : Pat<(v2i32 (bitconvert (v8i8  FPR64:$src))), (v2i32 FPR64:$src)>;
+def : Pat<(v2i32 (bitconvert (f64   FPR64:$src))), (v2i32 FPR64:$src)>;
+def : Pat<(v2i32 (bitconvert (v1f64 FPR64:$src))), (v2i32 FPR64:$src)>;
+}
+let Predicates = [IsBE] in {
+def : Pat<(v2i32 (bitconvert (v1i64 FPR64:$src))),
+                             (v2i32 (REV64v2i32 FPR64:$src))>;
+def : Pat<(v2i32 (bitconvert (v4i16 FPR64:$src))),
+                             (v2i32 (REV32v4i16 FPR64:$src))>;
+def : Pat<(v2i32 (bitconvert (v8i8  FPR64:$src))),
+                             (v2i32 (REV32v8i8 FPR64:$src))>;
+def : Pat<(v2i32 (bitconvert (f64   FPR64:$src))),
+                             (v2i32 (REV64v2i32 FPR64:$src))>;
+def : Pat<(v2i32 (bitconvert (v1f64 FPR64:$src))),
+                             (v2i32 (REV64v2i32 FPR64:$src))>;
+}
+def : Pat<(v2i32 (bitconvert (v2f32 FPR64:$src))), (v2i32 FPR64:$src)>;
+
+let Predicates = [IsLE] in {
+def : Pat<(v4i16 (bitconvert (v1i64 FPR64:$src))), (v4i16 FPR64:$src)>;
+def : Pat<(v4i16 (bitconvert (v2i32 FPR64:$src))), (v4i16 FPR64:$src)>;
+def : Pat<(v4i16 (bitconvert (v8i8  FPR64:$src))), (v4i16 FPR64:$src)>;
+def : Pat<(v4i16 (bitconvert (f64   FPR64:$src))), (v4i16 FPR64:$src)>;
+def : Pat<(v4i16 (bitconvert (v2f32 FPR64:$src))), (v4i16 FPR64:$src)>;
+def : Pat<(v4i16 (bitconvert (v1f64 FPR64:$src))), (v4i16 FPR64:$src)>;
+}
+let Predicates = [IsBE] in {
+def : Pat<(v4i16 (bitconvert (v1i64 FPR64:$src))),
+                             (v4i16 (REV64v4i16 FPR64:$src))>;
+def : Pat<(v4i16 (bitconvert (v2i32 FPR64:$src))),
+                             (v4i16 (REV32v4i16 FPR64:$src))>;
+def : Pat<(v4i16 (bitconvert (v8i8  FPR64:$src))),
+                             (v4i16 (REV16v8i8 FPR64:$src))>;
+def : Pat<(v4i16 (bitconvert (f64   FPR64:$src))),
+                             (v4i16 (REV64v4i16 FPR64:$src))>;
+def : Pat<(v4i16 (bitconvert (v2f32 FPR64:$src))),
+                             (v4i16 (REV32v4i16 FPR64:$src))>;
+def : Pat<(v4i16 (bitconvert (v1f64 FPR64:$src))),
+                             (v4i16 (REV64v4i16 FPR64:$src))>;
+}
+
+let Predicates = [IsLE] in {
+def : Pat<(v8i8  (bitconvert (v1i64 FPR64:$src))), (v8i8  FPR64:$src)>;
+def : Pat<(v8i8  (bitconvert (v2i32 FPR64:$src))), (v8i8  FPR64:$src)>;
+def : Pat<(v8i8  (bitconvert (v4i16 FPR64:$src))), (v8i8  FPR64:$src)>;
+def : Pat<(v8i8  (bitconvert (f64   FPR64:$src))), (v8i8  FPR64:$src)>;
+def : Pat<(v8i8  (bitconvert (v2f32 FPR64:$src))), (v8i8  FPR64:$src)>;
+def : Pat<(v8i8  (bitconvert (v1f64 FPR64:$src))), (v8i8  FPR64:$src)>;
+}
+let Predicates = [IsBE] in {
+def : Pat<(v8i8  (bitconvert (v1i64 FPR64:$src))),
+                             (v8i8 (REV64v8i8 FPR64:$src))>;
+def : Pat<(v8i8  (bitconvert (v2i32 FPR64:$src))),
+                             (v8i8 (REV32v8i8 FPR64:$src))>;
+def : Pat<(v8i8  (bitconvert (v4i16 FPR64:$src))),
+                             (v8i8 (REV16v8i8 FPR64:$src))>;
+def : Pat<(v8i8  (bitconvert (f64   FPR64:$src))),
+                             (v8i8 (REV64v8i8 FPR64:$src))>;
+def : Pat<(v8i8  (bitconvert (v2f32 FPR64:$src))),
+                             (v8i8 (REV32v8i8 FPR64:$src))>;
+def : Pat<(v8i8  (bitconvert (v1f64 FPR64:$src))),
+                             (v8i8 (REV64v8i8 FPR64:$src))>;
+}
+
+let Predicates = [IsLE] in {
+def : Pat<(f64   (bitconvert (v2i32 FPR64:$src))), (f64   FPR64:$src)>;
+def : Pat<(f64   (bitconvert (v4i16 FPR64:$src))), (f64   FPR64:$src)>;
+def : Pat<(f64   (bitconvert (v2f32 FPR64:$src))), (f64   FPR64:$src)>;
+def : Pat<(f64   (bitconvert (v8i8  FPR64:$src))), (f64   FPR64:$src)>;
+}
+let Predicates = [IsBE] in {
+def : Pat<(f64   (bitconvert (v2i32 FPR64:$src))),
+                             (f64 (REV64v2i32 FPR64:$src))>;
+def : Pat<(f64   (bitconvert (v4i16 FPR64:$src))),
+                             (f64 (REV64v4i16 FPR64:$src))>;
+def : Pat<(f64   (bitconvert (v2f32 FPR64:$src))),
+                             (f64 (REV64v2i32 FPR64:$src))>;
+def : Pat<(f64   (bitconvert (v8i8  FPR64:$src))),
+                             (f64 (REV64v8i8 FPR64:$src))>;
+}
+def : Pat<(f64   (bitconvert (v1i64 FPR64:$src))), (f64   FPR64:$src)>;
+def : Pat<(f64   (bitconvert (v1f64 FPR64:$src))), (f64   FPR64:$src)>;
+
+let Predicates = [IsLE] in {
+def : Pat<(v1f64 (bitconvert (v2i32 FPR64:$src))), (v1f64 FPR64:$src)>;
+def : Pat<(v1f64 (bitconvert (v4i16 FPR64:$src))), (v1f64 FPR64:$src)>;
+def : Pat<(v1f64 (bitconvert (v8i8  FPR64:$src))), (v1f64 FPR64:$src)>;
+def : Pat<(v1f64 (bitconvert (v2f32 FPR64:$src))), (v1f64 FPR64:$src)>;
+}
+let Predicates = [IsBE] in {
+def : Pat<(v1f64 (bitconvert (v2i32 FPR64:$src))),
+                             (v1f64 (REV64v2i32 FPR64:$src))>;
+def : Pat<(v1f64 (bitconvert (v4i16 FPR64:$src))),
+                             (v1f64 (REV64v4i16 FPR64:$src))>;
+def : Pat<(v1f64 (bitconvert (v8i8  FPR64:$src))),
+                             (v1f64 (REV64v8i8 FPR64:$src))>;
+def : Pat<(v1f64 (bitconvert (v2f32 FPR64:$src))),
+                             (v1f64 (REV64v2i32 FPR64:$src))>;
+}
+def : Pat<(v1f64 (bitconvert (v1i64 FPR64:$src))), (v1f64 FPR64:$src)>;
+def : Pat<(v1f64 (bitconvert (f64   FPR64:$src))), (v1f64 FPR64:$src)>;
+
+let Predicates = [IsLE] in {
+def : Pat<(v2f32 (bitconvert (v1i64 FPR64:$src))), (v2f32 FPR64:$src)>;
+def : Pat<(v2f32 (bitconvert (v4i16 FPR64:$src))), (v2f32 FPR64:$src)>;
+def : Pat<(v2f32 (bitconvert (v8i8  FPR64:$src))), (v2f32 FPR64:$src)>;
+def : Pat<(v2f32 (bitconvert (v1f64 FPR64:$src))), (v2f32 FPR64:$src)>;
+def : Pat<(v2f32 (bitconvert (f64   FPR64:$src))), (v2f32 FPR64:$src)>;
+}
+let Predicates = [IsBE] in {
+def : Pat<(v2f32 (bitconvert (v1i64 FPR64:$src))),
+                             (v2f32 (REV64v2i32 FPR64:$src))>;
+def : Pat<(v2f32 (bitconvert (v4i16 FPR64:$src))),
+                             (v2f32 (REV32v4i16 FPR64:$src))>;
+def : Pat<(v2f32 (bitconvert (v8i8  FPR64:$src))),
+                             (v2f32 (REV32v8i8 FPR64:$src))>;
+def : Pat<(v2f32 (bitconvert (v1f64 FPR64:$src))),
+                             (v2f32 (REV64v2i32 FPR64:$src))>;
+def : Pat<(v2f32 (bitconvert (f64   FPR64:$src))),
+                             (v2f32 (REV64v2i32 FPR64:$src))>;
+}
+def : Pat<(v2f32 (bitconvert (v2i32 FPR64:$src))), (v2f32 FPR64:$src)>;
+
+let Predicates = [IsLE] in {
+def : Pat<(f128 (bitconvert (v2i64 FPR128:$src))), (f128 FPR128:$src)>;
+def : Pat<(f128 (bitconvert (v4i32 FPR128:$src))), (f128 FPR128:$src)>;
+def : Pat<(f128 (bitconvert (v8i16 FPR128:$src))), (f128 FPR128:$src)>;
+def : Pat<(f128 (bitconvert (v2f64 FPR128:$src))), (f128 FPR128:$src)>;
+def : Pat<(f128 (bitconvert (v4f32 FPR128:$src))), (f128 FPR128:$src)>;
+def : Pat<(f128 (bitconvert (v16i8 FPR128:$src))), (f128 FPR128:$src)>;
+}
+let Predicates = [IsBE] in {
+def : Pat<(f128 (bitconvert (v2i64 FPR128:$src))),
+                            (f128 (EXTv16i8 FPR128:$src, FPR128:$src, (i32 8)))>;
+def : Pat<(f128 (bitconvert (v4i32 FPR128:$src))),
+                            (f128 (EXTv16i8 (REV64v4i32 FPR128:$src),
+                                            (REV64v4i32 FPR128:$src), (i32 8)))>;
+def : Pat<(f128 (bitconvert (v8i16 FPR128:$src))),
+                            (f128 (EXTv16i8 (REV64v8i16 FPR128:$src),
+                                            (REV64v8i16 FPR128:$src), (i32 8)))>;
+def : Pat<(f128 (bitconvert (v2f64 FPR128:$src))),
+                            (f128 (EXTv16i8 FPR128:$src, FPR128:$src, (i32 8)))>;
+def : Pat<(f128 (bitconvert (v4f32 FPR128:$src))),
+                            (f128 (EXTv16i8 (REV64v4i32 FPR128:$src),
+                                            (REV64v4i32 FPR128:$src), (i32 8)))>;
+def : Pat<(f128 (bitconvert (v16i8 FPR128:$src))),
+                            (f128 (EXTv16i8 (REV64v16i8 FPR128:$src),
+                                            (REV64v16i8 FPR128:$src), (i32 8)))>;
+}
+
+let Predicates = [IsLE] in {
+def : Pat<(v2f64 (bitconvert (f128  FPR128:$src))), (v2f64 FPR128:$src)>;
+def : Pat<(v2f64 (bitconvert (v4i32 FPR128:$src))), (v2f64 FPR128:$src)>;
+def : Pat<(v2f64 (bitconvert (v8i16 FPR128:$src))), (v2f64 FPR128:$src)>;
+def : Pat<(v2f64 (bitconvert (v16i8 FPR128:$src))), (v2f64 FPR128:$src)>;
+def : Pat<(v2f64 (bitconvert (v4f32 FPR128:$src))), (v2f64 FPR128:$src)>;
+}
+let Predicates = [IsBE] in {
+def : Pat<(v2f64 (bitconvert (f128  FPR128:$src))),
+                             (v2f64 (EXTv16i8 FPR128:$src,
+                                              FPR128:$src, (i32 8)))>;
+def : Pat<(v2f64 (bitconvert (v4i32 FPR128:$src))),
+                             (v2f64 (REV64v4i32 FPR128:$src))>;
+def : Pat<(v2f64 (bitconvert (v8i16 FPR128:$src))),
+                             (v2f64 (REV64v8i16 FPR128:$src))>;
+def : Pat<(v2f64 (bitconvert (v16i8 FPR128:$src))),
+                             (v2f64 (REV64v16i8 FPR128:$src))>;
+def : Pat<(v2f64 (bitconvert (v4f32 FPR128:$src))),
+                             (v2f64 (REV64v4i32 FPR128:$src))>;
+}
+def : Pat<(v2f64 (bitconvert (v2i64 FPR128:$src))), (v2f64 FPR128:$src)>;
+
+let Predicates = [IsLE] in {
+def : Pat<(v4f32 (bitconvert (f128  FPR128:$src))), (v4f32 FPR128:$src)>;
+def : Pat<(v4f32 (bitconvert (v8i16 FPR128:$src))), (v4f32 FPR128:$src)>;
+def : Pat<(v4f32 (bitconvert (v16i8 FPR128:$src))), (v4f32 FPR128:$src)>;
+def : Pat<(v4f32 (bitconvert (v2i64 FPR128:$src))), (v4f32 FPR128:$src)>;
+def : Pat<(v4f32 (bitconvert (v2f64 FPR128:$src))), (v4f32 FPR128:$src)>;
+}
+let Predicates = [IsBE] in {
+def : Pat<(v4f32 (bitconvert (f128  FPR128:$src))),
+                             (v4f32 (EXTv16i8 (REV64v4i32 FPR128:$src),
+                                    (REV64v4i32 FPR128:$src), (i32 8)))>;
+def : Pat<(v4f32 (bitconvert (v8i16 FPR128:$src))),
+                             (v4f32 (REV32v8i16 FPR128:$src))>;
+def : Pat<(v4f32 (bitconvert (v16i8 FPR128:$src))),
+                             (v4f32 (REV32v16i8 FPR128:$src))>;
+def : Pat<(v4f32 (bitconvert (v2i64 FPR128:$src))),
+                             (v4f32 (REV64v4i32 FPR128:$src))>;
+def : Pat<(v4f32 (bitconvert (v2f64 FPR128:$src))),
+                             (v4f32 (REV64v4i32 FPR128:$src))>;
+}
+def : Pat<(v4f32 (bitconvert (v4i32 FPR128:$src))), (v4f32 FPR128:$src)>;
+
+let Predicates = [IsLE] in {
+def : Pat<(v2i64 (bitconvert (f128  FPR128:$src))), (v2i64 FPR128:$src)>;
+def : Pat<(v2i64 (bitconvert (v4i32 FPR128:$src))), (v2i64 FPR128:$src)>;
+def : Pat<(v2i64 (bitconvert (v8i16 FPR128:$src))), (v2i64 FPR128:$src)>;
+def : Pat<(v2i64 (bitconvert (v16i8 FPR128:$src))), (v2i64 FPR128:$src)>;
+def : Pat<(v2i64 (bitconvert (v4f32 FPR128:$src))), (v2i64 FPR128:$src)>;
+}
+let Predicates = [IsBE] in {
+def : Pat<(v2i64 (bitconvert (f128  FPR128:$src))),
+                             (v2i64 (EXTv16i8 FPR128:$src,
+                                              FPR128:$src, (i32 8)))>;
+def : Pat<(v2i64 (bitconvert (v4i32 FPR128:$src))),
+                             (v2i64 (REV64v4i32 FPR128:$src))>;
+def : Pat<(v2i64 (bitconvert (v8i16 FPR128:$src))),
+                             (v2i64 (REV64v8i16 FPR128:$src))>;
+def : Pat<(v2i64 (bitconvert (v16i8 FPR128:$src))),
+                             (v2i64 (REV64v16i8 FPR128:$src))>;
+def : Pat<(v2i64 (bitconvert (v4f32 FPR128:$src))),
+                             (v2i64 (REV64v4i32 FPR128:$src))>;
+}
+def : Pat<(v2i64 (bitconvert (v2f64 FPR128:$src))), (v2i64 FPR128:$src)>;
+
+let Predicates = [IsLE] in {
+def : Pat<(v4i32 (bitconvert (f128  FPR128:$src))), (v4i32 FPR128:$src)>;
+def : Pat<(v4i32 (bitconvert (v2i64 FPR128:$src))), (v4i32 FPR128:$src)>;
+def : Pat<(v4i32 (bitconvert (v8i16 FPR128:$src))), (v4i32 FPR128:$src)>;
+def : Pat<(v4i32 (bitconvert (v16i8 FPR128:$src))), (v4i32 FPR128:$src)>;
+def : Pat<(v4i32 (bitconvert (v2f64 FPR128:$src))), (v4i32 FPR128:$src)>;
+}
+let Predicates = [IsBE] in {
+def : Pat<(v4i32 (bitconvert (f128  FPR128:$src))),
+                             (v4i32 (EXTv16i8 (REV64v4i32 FPR128:$src),
+                                              (REV64v4i32 FPR128:$src),
+                                              (i32 8)))>;
+def : Pat<(v4i32 (bitconvert (v2i64 FPR128:$src))),
+                             (v4i32 (REV64v4i32 FPR128:$src))>;
+def : Pat<(v4i32 (bitconvert (v8i16 FPR128:$src))),
+                             (v4i32 (REV32v8i16 FPR128:$src))>;
+def : Pat<(v4i32 (bitconvert (v16i8 FPR128:$src))),
+                             (v4i32 (REV32v16i8 FPR128:$src))>;
+def : Pat<(v4i32 (bitconvert (v2f64 FPR128:$src))),
+                             (v4i32 (REV64v4i32 FPR128:$src))>;
+}
+def : Pat<(v4i32 (bitconvert (v4f32 FPR128:$src))), (v4i32 FPR128:$src)>;
+
+let Predicates = [IsLE] in {
+def : Pat<(v8i16 (bitconvert (f128  FPR128:$src))), (v8i16 FPR128:$src)>;
+def : Pat<(v8i16 (bitconvert (v2i64 FPR128:$src))), (v8i16 FPR128:$src)>;
+def : Pat<(v8i16 (bitconvert (v4i32 FPR128:$src))), (v8i16 FPR128:$src)>;
+def : Pat<(v8i16 (bitconvert (v16i8 FPR128:$src))), (v8i16 FPR128:$src)>;
+def : Pat<(v8i16 (bitconvert (v2f64 FPR128:$src))), (v8i16 FPR128:$src)>;
+def : Pat<(v8i16 (bitconvert (v4f32 FPR128:$src))), (v8i16 FPR128:$src)>;
+}
+let Predicates = [IsBE] in {
+def : Pat<(v8i16 (bitconvert (f128  FPR128:$src))),
+                             (v8i16 (EXTv16i8 (REV64v8i16 FPR128:$src),
+                                              (REV64v8i16 FPR128:$src),
+                                              (i32 8)))>;
+def : Pat<(v8i16 (bitconvert (v2i64 FPR128:$src))),
+                             (v8i16 (REV64v8i16 FPR128:$src))>;
+def : Pat<(v8i16 (bitconvert (v4i32 FPR128:$src))),
+                             (v8i16 (REV32v8i16 FPR128:$src))>;
+def : Pat<(v8i16 (bitconvert (v16i8 FPR128:$src))),
+                             (v8i16 (REV16v16i8 FPR128:$src))>;
+def : Pat<(v8i16 (bitconvert (v2f64 FPR128:$src))),
+                             (v8i16 (REV64v8i16 FPR128:$src))>;
+def : Pat<(v8i16 (bitconvert (v4f32 FPR128:$src))),
+                             (v8i16 (REV32v8i16 FPR128:$src))>;
+}
+
+let Predicates = [IsLE] in {
+def : Pat<(v16i8 (bitconvert (f128  FPR128:$src))), (v16i8 FPR128:$src)>;
+def : Pat<(v16i8 (bitconvert (v2i64 FPR128:$src))), (v16i8 FPR128:$src)>;
+def : Pat<(v16i8 (bitconvert (v4i32 FPR128:$src))), (v16i8 FPR128:$src)>;
+def : Pat<(v16i8 (bitconvert (v8i16 FPR128:$src))), (v16i8 FPR128:$src)>;
+def : Pat<(v16i8 (bitconvert (v2f64 FPR128:$src))), (v16i8 FPR128:$src)>;
+def : Pat<(v16i8 (bitconvert (v4f32 FPR128:$src))), (v16i8 FPR128:$src)>;
+}
+let Predicates = [IsBE] in {
+def : Pat<(v16i8 (bitconvert (f128  FPR128:$src))),
+                             (v16i8 (EXTv16i8 (REV64v16i8 FPR128:$src),
+                                              (REV64v16i8 FPR128:$src),
+                                              (i32 8)))>;
+def : Pat<(v16i8 (bitconvert (v2i64 FPR128:$src))),
+                             (v16i8 (REV64v16i8 FPR128:$src))>;
+def : Pat<(v16i8 (bitconvert (v4i32 FPR128:$src))),
+                             (v16i8 (REV32v16i8 FPR128:$src))>;
+def : Pat<(v16i8 (bitconvert (v8i16 FPR128:$src))),
+                             (v16i8 (REV16v16i8 FPR128:$src))>;
+def : Pat<(v16i8 (bitconvert (v2f64 FPR128:$src))),
+                             (v16i8 (REV64v16i8 FPR128:$src))>;
+def : Pat<(v16i8 (bitconvert (v4f32 FPR128:$src))),
+                             (v16i8 (REV32v16i8 FPR128:$src))>;
+}
+
+def : Pat<(v8i8 (extract_subvector (v16i8 FPR128:$Rn), (i64 1))),
+          (EXTRACT_SUBREG (DUPv2i64lane FPR128:$Rn, 1), dsub)>;
+def : Pat<(v4i16 (extract_subvector (v8i16 FPR128:$Rn), (i64 1))),
+          (EXTRACT_SUBREG (DUPv2i64lane FPR128:$Rn, 1), dsub)>;
+def : Pat<(v2i32 (extract_subvector (v4i32 FPR128:$Rn), (i64 1))),
+          (EXTRACT_SUBREG (DUPv2i64lane FPR128:$Rn, 1), dsub)>;
+def : Pat<(v1i64 (extract_subvector (v2i64 FPR128:$Rn), (i64 1))),
+          (EXTRACT_SUBREG (DUPv2i64lane FPR128:$Rn, 1), dsub)>;
+
+// A 64-bit subvector insert to the first 128-bit vector position
+// is a subregister copy that needs no instruction.
+def : Pat<(insert_subvector undef, (v1i64 FPR64:$src), (i32 0)),
+          (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
+def : Pat<(insert_subvector undef, (v1f64 FPR64:$src), (i32 0)),
+          (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
+def : Pat<(insert_subvector undef, (v2i32 FPR64:$src), (i32 0)),
+          (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
+def : Pat<(insert_subvector undef, (v2f32 FPR64:$src), (i32 0)),
+          (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
+def : Pat<(insert_subvector undef, (v4i16 FPR64:$src), (i32 0)),
+          (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
+def : Pat<(insert_subvector undef, (v8i8 FPR64:$src), (i32 0)),
+          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
+
+// Use pair-wise add instructions when summing up the lanes for v2f64, v2i64
+// or v2f32.
+def : Pat<(i64 (add (vector_extract (v2i64 FPR128:$Rn), (i64 0)),
+                    (vector_extract (v2i64 FPR128:$Rn), (i64 1)))),
+           (i64 (ADDPv2i64p (v2i64 FPR128:$Rn)))>;
+def : Pat<(f64 (fadd (vector_extract (v2f64 FPR128:$Rn), (i64 0)),
+                     (vector_extract (v2f64 FPR128:$Rn), (i64 1)))),
+           (f64 (FADDPv2i64p (v2f64 FPR128:$Rn)))>;
+    // vector_extract on 64-bit vectors gets promoted to a 128 bit vector,
+    // so we match on v4f32 here, not v2f32. This will also catch adding
+    // the low two lanes of a true v4f32 vector.
+def : Pat<(fadd (vector_extract (v4f32 FPR128:$Rn), (i64 0)),
+                (vector_extract (v4f32 FPR128:$Rn), (i64 1))),
+          (f32 (FADDPv2i32p (EXTRACT_SUBREG FPR128:$Rn, dsub)))>;
+
+// Scalar 64-bit shifts in FPR64 registers.
+def : Pat<(i64 (int_aarch64_neon_sshl (i64 FPR64:$Rn), (i64 FPR64:$Rm))),
+          (SSHLv1i64 FPR64:$Rn, FPR64:$Rm)>;
+def : Pat<(i64 (int_aarch64_neon_ushl (i64 FPR64:$Rn), (i64 FPR64:$Rm))),
+          (USHLv1i64 FPR64:$Rn, FPR64:$Rm)>;
+def : Pat<(i64 (int_aarch64_neon_srshl (i64 FPR64:$Rn), (i64 FPR64:$Rm))),
+          (SRSHLv1i64 FPR64:$Rn, FPR64:$Rm)>;
+def : Pat<(i64 (int_aarch64_neon_urshl (i64 FPR64:$Rn), (i64 FPR64:$Rm))),
+          (URSHLv1i64 FPR64:$Rn, FPR64:$Rm)>;
+
+// Tail call return handling. These are all compiler pseudo-instructions,
+// so no encoding information or anything like that.
+let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [SP] in {
+  def TCRETURNdi : Pseudo<(outs), (ins i64imm:$dst, i32imm:$FPDiff),[]>;
+  def TCRETURNri : Pseudo<(outs), (ins tcGPR64:$dst, i32imm:$FPDiff), []>;
+}
+
+def : Pat<(AArch64tcret tcGPR64:$dst, (i32 timm:$FPDiff)),
+          (TCRETURNri tcGPR64:$dst, imm:$FPDiff)>;
+def : Pat<(AArch64tcret tglobaladdr:$dst, (i32 timm:$FPDiff)),
+          (TCRETURNdi texternalsym:$dst, imm:$FPDiff)>;
+def : Pat<(AArch64tcret texternalsym:$dst, (i32 timm:$FPDiff)),
+          (TCRETURNdi texternalsym:$dst, imm:$FPDiff)>;
 
-include "AArch64InstrNEON.td"
+include "AArch64InstrAtomics.td"
diff --git a/lib/Target/AArch64/AArch64InstrNEON.td b/lib/Target/AArch64/AArch64InstrNEON.td
deleted file mode 100644
index 0b97e3b..0000000
--- a/lib/Target/AArch64/AArch64InstrNEON.td
+++ /dev/null
@@ -1,9476 +0,0 @@
-//===-- AArch64InstrNEON.td - NEON support for AArch64 -----*- tablegen -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file describes the AArch64 NEON instruction set.
-//
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// NEON-specific DAG Nodes.
-//===----------------------------------------------------------------------===//
-
-// (outs Result), (ins Imm, OpCmode)
-def SDT_Neon_movi : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVT<1, i32>]>;
-
-def Neon_movi     : SDNode<"AArch64ISD::NEON_MOVIMM", SDT_Neon_movi>;
-
-def Neon_mvni     : SDNode<"AArch64ISD::NEON_MVNIMM", SDT_Neon_movi>;
-
-// (outs Result), (ins Imm)
-def Neon_fmovi : SDNode<"AArch64ISD::NEON_FMOVIMM", SDTypeProfile<1, 1,
-                        [SDTCisVec<0>, SDTCisVT<1, i32>]>>;
-
-// (outs Result), (ins LHS, RHS, CondCode)
-def Neon_cmp : SDNode<"AArch64ISD::NEON_CMP", SDTypeProfile<1, 3,
-                 [SDTCisVec<0>,  SDTCisSameAs<1, 2>]>>;
-
-// (outs Result), (ins LHS, 0/0.0 constant, CondCode)
-def Neon_cmpz : SDNode<"AArch64ISD::NEON_CMPZ", SDTypeProfile<1, 3,
-                 [SDTCisVec<0>,  SDTCisVec<1>]>>;
-
-// (outs Result), (ins LHS, RHS)
-def Neon_tst : SDNode<"AArch64ISD::NEON_TST", SDTypeProfile<1, 2,
-                 [SDTCisVec<0>,  SDTCisSameAs<1, 2>]>>;
-
-def SDTARMVSH : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0, 1>,
-                                     SDTCisVT<2, i32>]>;
-def Neon_sqrshlImm   : SDNode<"AArch64ISD::NEON_QSHLs", SDTARMVSH>;
-def Neon_uqrshlImm   : SDNode<"AArch64ISD::NEON_QSHLu", SDTARMVSH>;
-
-def SDTPERMUTE : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0, 1>,
-                               SDTCisSameAs<0, 2>]>;
-def Neon_uzp1    : SDNode<"AArch64ISD::NEON_UZP1", SDTPERMUTE>;
-def Neon_uzp2    : SDNode<"AArch64ISD::NEON_UZP2", SDTPERMUTE>;
-def Neon_zip1    : SDNode<"AArch64ISD::NEON_ZIP1", SDTPERMUTE>;
-def Neon_zip2    : SDNode<"AArch64ISD::NEON_ZIP2", SDTPERMUTE>;
-def Neon_trn1    : SDNode<"AArch64ISD::NEON_TRN1", SDTPERMUTE>;
-def Neon_trn2    : SDNode<"AArch64ISD::NEON_TRN2", SDTPERMUTE>;
-
-def SDTVSHUF : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0, 1>]>;
-def Neon_rev64    : SDNode<"AArch64ISD::NEON_REV64", SDTVSHUF>;
-def Neon_rev32    : SDNode<"AArch64ISD::NEON_REV32", SDTVSHUF>;
-def Neon_rev16    : SDNode<"AArch64ISD::NEON_REV16", SDTVSHUF>;
-def Neon_vdup : SDNode<"AArch64ISD::NEON_VDUP", SDTypeProfile<1, 1,
-                       [SDTCisVec<0>]>>;
-def Neon_vduplane : SDNode<"AArch64ISD::NEON_VDUPLANE", SDTypeProfile<1, 2,
-                           [SDTCisVec<0>, SDTCisVec<1>, SDTCisVT<2, i64>]>>;
-def Neon_vextract : SDNode<"AArch64ISD::NEON_VEXTRACT", SDTypeProfile<1, 3,
-                           [SDTCisVec<0>,  SDTCisSameAs<0, 1>,
-                           SDTCisSameAs<0, 2>, SDTCisVT<3, i64>]>>;
-
-//===----------------------------------------------------------------------===//
-// Addressing-mode instantiations
-//===----------------------------------------------------------------------===//
-
-multiclass ls_64_pats<dag address, dag Base, dag Offset, ValueType Ty> {
-defm : ls_neutral_pats<LSFP64_LDR, LSFP64_STR, Base,
-                      !foreach(decls.pattern, Offset,
-                               !subst(OFFSET, dword_uimm12, decls.pattern)),
-                      !foreach(decls.pattern, address,
-                               !subst(OFFSET, dword_uimm12,
-                               !subst(ALIGN, min_align8, decls.pattern))),
-                      Ty>;
-}
-
-multiclass ls_128_pats<dag address, dag Base, dag Offset, ValueType Ty> {
-defm : ls_neutral_pats<LSFP128_LDR, LSFP128_STR, Base,
-                       !foreach(decls.pattern, Offset,
-                                !subst(OFFSET, qword_uimm12, decls.pattern)),
-                       !foreach(decls.pattern, address,
-                                !subst(OFFSET, qword_uimm12,
-                                !subst(ALIGN, min_align16, decls.pattern))),
-                      Ty>;
-}
-
-multiclass uimm12_neon_pats<dag address, dag Base, dag Offset> {
-  defm : ls_64_pats<address, Base, Offset, v8i8>;
-  defm : ls_64_pats<address, Base, Offset, v4i16>;
-  defm : ls_64_pats<address, Base, Offset, v2i32>;
-  defm : ls_64_pats<address, Base, Offset, v1i64>;
-  defm : ls_64_pats<address, Base, Offset, v2f32>;
-  defm : ls_64_pats<address, Base, Offset, v1f64>;
-
-  defm : ls_128_pats<address, Base, Offset, v16i8>;
-  defm : ls_128_pats<address, Base, Offset, v8i16>;
-  defm : ls_128_pats<address, Base, Offset, v4i32>;
-  defm : ls_128_pats<address, Base, Offset, v2i64>;
-  defm : ls_128_pats<address, Base, Offset, v4f32>;
-  defm : ls_128_pats<address, Base, Offset, v2f64>;
-}
-
-defm : uimm12_neon_pats<(A64WrapperSmall
-                          tconstpool:$Hi, tconstpool:$Lo12, ALIGN),
-                        (ADRPxi tconstpool:$Hi), (i64 tconstpool:$Lo12)>;
-
-//===----------------------------------------------------------------------===//
-// Multiclasses
-//===----------------------------------------------------------------------===//
-
-multiclass NeonI_3VSame_B_sizes<bit u, bits<2> size,  bits<5> opcode,
-                                string asmop, SDPatternOperator opnode8B,
-                                SDPatternOperator opnode16B,
-                                bit Commutable = 0> {
-  let isCommutable = Commutable in {
-    def _8B :  NeonI_3VSame<0b0, u, size, opcode,
-               (outs VPR64:$Rd), (ins VPR64:$Rn, VPR64:$Rm),
-               asmop # "\t$Rd.8b, $Rn.8b, $Rm.8b",
-               [(set (v8i8 VPR64:$Rd),
-                  (v8i8 (opnode8B (v8i8 VPR64:$Rn), (v8i8 VPR64:$Rm))))],
-               NoItinerary>,
-               Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
-
-    def _16B : NeonI_3VSame<0b1, u, size, opcode,
-               (outs VPR128:$Rd), (ins VPR128:$Rn, VPR128:$Rm),
-               asmop # "\t$Rd.16b, $Rn.16b, $Rm.16b",
-               [(set (v16i8 VPR128:$Rd),
-                  (v16i8 (opnode16B (v16i8 VPR128:$Rn), (v16i8 VPR128:$Rm))))],
-               NoItinerary>,
-               Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
-  }
-
-}
-
-multiclass NeonI_3VSame_HS_sizes<bit u, bits<5> opcode,
-                                  string asmop, SDPatternOperator opnode,
-                                  bit Commutable = 0> {
-  let isCommutable = Commutable in {
-    def _4H : NeonI_3VSame<0b0, u, 0b01, opcode,
-              (outs VPR64:$Rd), (ins VPR64:$Rn, VPR64:$Rm),
-              asmop # "\t$Rd.4h, $Rn.4h, $Rm.4h",
-              [(set (v4i16 VPR64:$Rd),
-                 (v4i16 (opnode (v4i16 VPR64:$Rn), (v4i16 VPR64:$Rm))))],
-              NoItinerary>,
-              Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
-
-    def _8H : NeonI_3VSame<0b1, u, 0b01, opcode,
-              (outs VPR128:$Rd), (ins VPR128:$Rn, VPR128:$Rm),
-              asmop # "\t$Rd.8h, $Rn.8h, $Rm.8h",
-              [(set (v8i16 VPR128:$Rd),
-                 (v8i16 (opnode (v8i16 VPR128:$Rn), (v8i16 VPR128:$Rm))))],
-              NoItinerary>,
-              Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
-
-    def _2S : NeonI_3VSame<0b0, u, 0b10, opcode,
-              (outs VPR64:$Rd), (ins VPR64:$Rn, VPR64:$Rm),
-              asmop # "\t$Rd.2s, $Rn.2s, $Rm.2s",
-              [(set (v2i32 VPR64:$Rd),
-                 (v2i32 (opnode (v2i32 VPR64:$Rn), (v2i32 VPR64:$Rm))))],
-              NoItinerary>,
-              Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
-
-    def _4S : NeonI_3VSame<0b1, u, 0b10, opcode,
-              (outs VPR128:$Rd), (ins VPR128:$Rn, VPR128:$Rm),
-              asmop # "\t$Rd.4s, $Rn.4s, $Rm.4s",
-              [(set (v4i32 VPR128:$Rd),
-                 (v4i32 (opnode (v4i32 VPR128:$Rn), (v4i32 VPR128:$Rm))))],
-              NoItinerary>,
-              Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
-  }
-}
-multiclass NeonI_3VSame_BHS_sizes<bit u, bits<5> opcode,
-                                  string asmop, SDPatternOperator opnode,
-                                  bit Commutable = 0>
-   : NeonI_3VSame_HS_sizes<u, opcode,  asmop, opnode, Commutable> {
-  let isCommutable = Commutable in {
-    def _8B :  NeonI_3VSame<0b0, u, 0b00, opcode,
-               (outs VPR64:$Rd), (ins VPR64:$Rn, VPR64:$Rm),
-               asmop # "\t$Rd.8b, $Rn.8b, $Rm.8b",
-               [(set (v8i8 VPR64:$Rd),
-                  (v8i8 (opnode (v8i8 VPR64:$Rn), (v8i8 VPR64:$Rm))))],
-               NoItinerary>,
-               Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
-
-    def _16B : NeonI_3VSame<0b1, u, 0b00, opcode,
-               (outs VPR128:$Rd), (ins VPR128:$Rn, VPR128:$Rm),
-               asmop # "\t$Rd.16b, $Rn.16b, $Rm.16b",
-               [(set (v16i8 VPR128:$Rd),
-                  (v16i8 (opnode (v16i8 VPR128:$Rn), (v16i8 VPR128:$Rm))))],
-               NoItinerary>,
-               Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
-  }
-}
-
-multiclass NeonI_3VSame_BHSD_sizes<bit u, bits<5> opcode,
-                                   string asmop, SDPatternOperator opnode,
-                                   bit Commutable = 0>
-   : NeonI_3VSame_BHS_sizes<u, opcode,  asmop, opnode, Commutable> {
-  let isCommutable = Commutable in {
-    def _2D : NeonI_3VSame<0b1, u, 0b11, opcode,
-              (outs VPR128:$Rd), (ins VPR128:$Rn, VPR128:$Rm),
-              asmop # "\t$Rd.2d, $Rn.2d, $Rm.2d",
-              [(set (v2i64 VPR128:$Rd),
-                 (v2i64 (opnode (v2i64 VPR128:$Rn), (v2i64 VPR128:$Rm))))],
-              NoItinerary>,
-              Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
-  }
-}
-
-// Multiclass NeonI_3VSame_SD_sizes: Operand types are floating point types,
-// but Result types can be integer or floating point types.
-multiclass NeonI_3VSame_SD_sizes<bit u, bit size, bits<5> opcode,
-                                 string asmop, SDPatternOperator opnode,
-                                 ValueType ResTy2S, ValueType ResTy4S,
-                                 ValueType ResTy2D, bit Commutable = 0> {
-  let isCommutable = Commutable in {
-    def _2S : NeonI_3VSame<0b0, u, {size, 0b0}, opcode,
-              (outs VPR64:$Rd), (ins VPR64:$Rn, VPR64:$Rm),
-              asmop # "\t$Rd.2s, $Rn.2s, $Rm.2s",
-              [(set (ResTy2S VPR64:$Rd),
-                 (ResTy2S (opnode (v2f32 VPR64:$Rn), (v2f32 VPR64:$Rm))))],
-              NoItinerary>,
-              Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
-
-    def _4S : NeonI_3VSame<0b1, u, {size, 0b0}, opcode,
-              (outs VPR128:$Rd), (ins VPR128:$Rn, VPR128:$Rm),
-              asmop # "\t$Rd.4s, $Rn.4s, $Rm.4s",
-              [(set (ResTy4S VPR128:$Rd),
-                 (ResTy4S (opnode (v4f32 VPR128:$Rn), (v4f32 VPR128:$Rm))))],
-              NoItinerary>,
-              Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
-
-    def _2D : NeonI_3VSame<0b1, u, {size, 0b1}, opcode,
-              (outs VPR128:$Rd), (ins VPR128:$Rn, VPR128:$Rm),
-              asmop # "\t$Rd.2d, $Rn.2d, $Rm.2d",
-              [(set (ResTy2D VPR128:$Rd),
-                 (ResTy2D (opnode (v2f64 VPR128:$Rn), (v2f64 VPR128:$Rm))))],
-              NoItinerary>,
-              Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
-  }
-}
-
-//===----------------------------------------------------------------------===//
-// Instruction Definitions
-//===----------------------------------------------------------------------===//
-
-// Vector Arithmetic Instructions
-
-// Vector Add (Integer and Floating-Point)
-
-defm ADDvvv :  NeonI_3VSame_BHSD_sizes<0b0, 0b10000, "add", add, 1>;
-defm FADDvvv : NeonI_3VSame_SD_sizes<0b0, 0b0, 0b11010, "fadd", fadd,
-                                     v2f32, v4f32, v2f64, 1>;
-
-// Patterns to match add of v1i8/v1i16/v1i32 types
-def : Pat<(v1i8 (add FPR8:$Rn, FPR8:$Rm)),
-          (EXTRACT_SUBREG
-              (ADDvvv_8B (SUBREG_TO_REG (i64 0), FPR8:$Rn, sub_8),
-                         (SUBREG_TO_REG (i64 0), FPR8:$Rm, sub_8)),
-              sub_8)>;
-def : Pat<(v1i16 (add FPR16:$Rn, FPR16:$Rm)),
-          (EXTRACT_SUBREG
-              (ADDvvv_4H (SUBREG_TO_REG (i64 0), FPR16:$Rn, sub_16),
-                         (SUBREG_TO_REG (i64 0), FPR16:$Rm, sub_16)),
-              sub_16)>;
-def : Pat<(v1i32 (add FPR32:$Rn, FPR32:$Rm)),
-          (EXTRACT_SUBREG
-              (ADDvvv_2S (SUBREG_TO_REG (i64 0), FPR32:$Rn, sub_32),
-                         (SUBREG_TO_REG (i64 0), FPR32:$Rm, sub_32)),
-              sub_32)>;
-
-// Vector Sub (Integer and Floating-Point)
-
-defm SUBvvv :  NeonI_3VSame_BHSD_sizes<0b1, 0b10000, "sub", sub, 0>;
-defm FSUBvvv : NeonI_3VSame_SD_sizes<0b0, 0b1, 0b11010, "fsub", fsub,
-                                     v2f32, v4f32, v2f64, 0>;
-
-// Patterns to match sub of v1i8/v1i16/v1i32 types
-def : Pat<(v1i8 (sub FPR8:$Rn, FPR8:$Rm)),
-          (EXTRACT_SUBREG
-              (SUBvvv_8B (SUBREG_TO_REG (i64 0), FPR8:$Rn, sub_8),
-                         (SUBREG_TO_REG (i64 0), FPR8:$Rm, sub_8)),
-              sub_8)>;
-def : Pat<(v1i16 (sub FPR16:$Rn, FPR16:$Rm)),
-          (EXTRACT_SUBREG
-              (SUBvvv_4H (SUBREG_TO_REG (i64 0), FPR16:$Rn, sub_16),
-                         (SUBREG_TO_REG (i64 0), FPR16:$Rm, sub_16)),
-              sub_16)>;
-def : Pat<(v1i32 (sub FPR32:$Rn, FPR32:$Rm)),
-          (EXTRACT_SUBREG
-              (SUBvvv_2S (SUBREG_TO_REG (i64 0), FPR32:$Rn, sub_32),
-                         (SUBREG_TO_REG (i64 0), FPR32:$Rm, sub_32)),
-              sub_32)>;
-
-// Vector Multiply (Integer and Floating-Point)
-
-let SchedRW = [WriteFPMul, ReadFPMul, ReadFPMul] in {
-defm MULvvv :  NeonI_3VSame_BHS_sizes<0b0, 0b10011, "mul", mul, 1>;
-defm FMULvvv : NeonI_3VSame_SD_sizes<0b1, 0b0, 0b11011, "fmul", fmul,
-                                     v2f32, v4f32, v2f64, 1>;
-}
-
-// Patterns to match mul of v1i8/v1i16/v1i32 types
-def : Pat<(v1i8 (mul FPR8:$Rn, FPR8:$Rm)),
-          (EXTRACT_SUBREG 
-              (MULvvv_8B (SUBREG_TO_REG (i64 0), FPR8:$Rn, sub_8),
-                         (SUBREG_TO_REG (i64 0), FPR8:$Rm, sub_8)),
-              sub_8)>;
-def : Pat<(v1i16 (mul FPR16:$Rn, FPR16:$Rm)),
-          (EXTRACT_SUBREG 
-              (MULvvv_4H (SUBREG_TO_REG (i64 0), FPR16:$Rn, sub_16),
-                         (SUBREG_TO_REG (i64 0), FPR16:$Rm, sub_16)),
-              sub_16)>;
-def : Pat<(v1i32 (mul FPR32:$Rn, FPR32:$Rm)),
-          (EXTRACT_SUBREG
-              (MULvvv_2S (SUBREG_TO_REG (i64 0), FPR32:$Rn, sub_32),
-                         (SUBREG_TO_REG (i64 0), FPR32:$Rm, sub_32)),
-              sub_32)>;
-
-// Vector Multiply (Polynomial)
-
-let SchedRW = [WriteFPMul, ReadFPMul, ReadFPMul] in {
-defm PMULvvv : NeonI_3VSame_B_sizes<0b1, 0b00, 0b10011, "pmul",
-                                    int_arm_neon_vmulp, int_arm_neon_vmulp, 1>;
-}
-
-// Vector Multiply-accumulate and Multiply-subtract (Integer)
-
-// class NeonI_3VSame_Constraint_impl: NeonI_3VSame with no data type and
-// two operands constraints.
-class NeonI_3VSame_Constraint_impl<string asmop, string asmlane,
-  RegisterOperand VPRC, ValueType OpTy, bit q, bit u, bits<2> size,
-  bits<5> opcode, SDPatternOperator opnode>
-  : NeonI_3VSame<q, u, size, opcode,
-    (outs VPRC:$Rd), (ins VPRC:$src, VPRC:$Rn, VPRC:$Rm),
-    asmop # "\t$Rd" # asmlane # ", $Rn" # asmlane # ", $Rm" # asmlane,
-    [(set (OpTy VPRC:$Rd),
-       (OpTy (opnode (OpTy VPRC:$src), (OpTy VPRC:$Rn), (OpTy VPRC:$Rm))))],
-    NoItinerary>,
-    Sched<[WriteFPALU, ReadFPALU, ReadFPALU, ReadFPALU]> {
-  let Constraints = "$src = $Rd";
-}
-
-def Neon_mla : PatFrag<(ops node:$Ra, node:$Rn, node:$Rm),
-                       (add node:$Ra, (mul node:$Rn, node:$Rm))>;
-
-def Neon_mls : PatFrag<(ops node:$Ra, node:$Rn, node:$Rm),
-                       (sub node:$Ra, (mul node:$Rn, node:$Rm))>;
-
-
-let SchedRW = [WriteFPMAC, ReadFPMAC, ReadFPMAC] in {
-def MLAvvv_8B:  NeonI_3VSame_Constraint_impl<"mla", ".8b",  VPR64,  v8i8,
-                                             0b0, 0b0, 0b00, 0b10010, Neon_mla>;
-def MLAvvv_16B: NeonI_3VSame_Constraint_impl<"mla", ".16b", VPR128, v16i8,
-                                             0b1, 0b0, 0b00, 0b10010, Neon_mla>;
-def MLAvvv_4H:  NeonI_3VSame_Constraint_impl<"mla", ".4h",  VPR64,  v4i16,
-                                             0b0, 0b0, 0b01, 0b10010, Neon_mla>;
-def MLAvvv_8H:  NeonI_3VSame_Constraint_impl<"mla", ".8h",  VPR128, v8i16,
-                                             0b1, 0b0, 0b01, 0b10010, Neon_mla>;
-def MLAvvv_2S:  NeonI_3VSame_Constraint_impl<"mla", ".2s",  VPR64,  v2i32,
-                                             0b0, 0b0, 0b10, 0b10010, Neon_mla>;
-def MLAvvv_4S:  NeonI_3VSame_Constraint_impl<"mla", ".4s",  VPR128, v4i32,
-                                             0b1, 0b0, 0b10, 0b10010, Neon_mla>;
-
-def MLSvvv_8B:  NeonI_3VSame_Constraint_impl<"mls", ".8b",  VPR64,  v8i8,
-                                             0b0, 0b1, 0b00, 0b10010, Neon_mls>;
-def MLSvvv_16B: NeonI_3VSame_Constraint_impl<"mls", ".16b", VPR128, v16i8,
-                                             0b1, 0b1, 0b00, 0b10010, Neon_mls>;
-def MLSvvv_4H:  NeonI_3VSame_Constraint_impl<"mls", ".4h",  VPR64,  v4i16,
-                                             0b0, 0b1, 0b01, 0b10010, Neon_mls>;
-def MLSvvv_8H:  NeonI_3VSame_Constraint_impl<"mls", ".8h",  VPR128, v8i16,
-                                             0b1, 0b1, 0b01, 0b10010, Neon_mls>;
-def MLSvvv_2S:  NeonI_3VSame_Constraint_impl<"mls", ".2s",  VPR64,  v2i32,
-                                             0b0, 0b1, 0b10, 0b10010, Neon_mls>;
-def MLSvvv_4S:  NeonI_3VSame_Constraint_impl<"mls", ".4s",  VPR128, v4i32,
-                                             0b1, 0b1, 0b10, 0b10010, Neon_mls>;
-}
-
-// Vector Multiply-accumulate and Multiply-subtract (Floating Point)
-
-def Neon_fmla : PatFrag<(ops node:$Ra, node:$Rn, node:$Rm),
-                        (fadd node:$Ra, (fmul_su node:$Rn, node:$Rm))>;
-
-def Neon_fmls : PatFrag<(ops node:$Ra, node:$Rn, node:$Rm),
-                        (fsub node:$Ra, (fmul_su node:$Rn, node:$Rm))>;
-
-let Predicates = [HasNEON, UseFusedMAC],
-    SchedRW = [WriteFPMAC, ReadFPMAC, ReadFPMAC] in {
-def FMLAvvv_2S: NeonI_3VSame_Constraint_impl<"fmla", ".2s",  VPR64,  v2f32,
-                                             0b0, 0b0, 0b00, 0b11001, Neon_fmla>;
-def FMLAvvv_4S: NeonI_3VSame_Constraint_impl<"fmla", ".4s",  VPR128, v4f32,
-                                             0b1, 0b0, 0b00, 0b11001, Neon_fmla>;
-def FMLAvvv_2D: NeonI_3VSame_Constraint_impl<"fmla", ".2d",  VPR128, v2f64,
-                                             0b1, 0b0, 0b01, 0b11001, Neon_fmla>;
-
-def FMLSvvv_2S: NeonI_3VSame_Constraint_impl<"fmls", ".2s",  VPR64,  v2f32,
-                                              0b0, 0b0, 0b10, 0b11001, Neon_fmls>;
-def FMLSvvv_4S: NeonI_3VSame_Constraint_impl<"fmls", ".4s",  VPR128, v4f32,
-                                             0b1, 0b0, 0b10, 0b11001, Neon_fmls>;
-def FMLSvvv_2D: NeonI_3VSame_Constraint_impl<"fmls", ".2d",  VPR128, v2f64,
-                                             0b1, 0b0, 0b11, 0b11001, Neon_fmls>;
-}
-
-// We're also allowed to match the fma instruction regardless of compile
-// options.
-def : Pat<(v2f32 (fma VPR64:$Rn, VPR64:$Rm, VPR64:$Ra)),
-          (FMLAvvv_2S VPR64:$Ra, VPR64:$Rn, VPR64:$Rm)>;
-def : Pat<(v4f32 (fma VPR128:$Rn, VPR128:$Rm, VPR128:$Ra)),
-          (FMLAvvv_4S VPR128:$Ra, VPR128:$Rn, VPR128:$Rm)>;
-def : Pat<(v2f64 (fma VPR128:$Rn, VPR128:$Rm, VPR128:$Ra)),
-          (FMLAvvv_2D VPR128:$Ra, VPR128:$Rn, VPR128:$Rm)>;
-
-def : Pat<(v2f32 (fma (fneg VPR64:$Rn), VPR64:$Rm, VPR64:$Ra)),
-          (FMLSvvv_2S VPR64:$Ra, VPR64:$Rn, VPR64:$Rm)>;
-def : Pat<(v4f32 (fma (fneg VPR128:$Rn), VPR128:$Rm, VPR128:$Ra)),
-          (FMLSvvv_4S VPR128:$Ra, VPR128:$Rn, VPR128:$Rm)>;
-def : Pat<(v2f64 (fma (fneg VPR128:$Rn), VPR128:$Rm, VPR128:$Ra)),
-          (FMLSvvv_2D VPR128:$Ra, VPR128:$Rn, VPR128:$Rm)>;
-
-// Vector Divide (Floating-Point)
-
-let SchedRW = [WriteFPDiv, ReadFPDiv, ReadFPDiv] in {
-defm FDIVvvv : NeonI_3VSame_SD_sizes<0b1, 0b0, 0b11111, "fdiv", fdiv,
-                                     v2f32, v4f32, v2f64, 0>;
-}
-
-// Vector Bitwise Operations
-
-// Vector Bitwise AND
-
-defm ANDvvv : NeonI_3VSame_B_sizes<0b0, 0b00, 0b00011, "and", and, and, 1>;
-
-// Vector Bitwise Exclusive OR
-
-defm EORvvv : NeonI_3VSame_B_sizes<0b1, 0b00, 0b00011, "eor", xor, xor, 1>;
-
-// Vector Bitwise OR
-
-defm ORRvvv : NeonI_3VSame_B_sizes<0b0, 0b10, 0b00011, "orr", or, or, 1>;
-
-// ORR disassembled as MOV if Vn==Vm
-
-// Vector Move - register
-// Alias for ORR if Vn=Vm.
-// FIXME: This is actually the preferred syntax but TableGen can't deal with
-// custom printing of aliases.
-def : NeonInstAlias<"mov $Rd.8b, $Rn.8b",
-                    (ORRvvv_8B VPR64:$Rd, VPR64:$Rn, VPR64:$Rn), 0>;
-def : NeonInstAlias<"mov $Rd.16b, $Rn.16b",
-                    (ORRvvv_16B VPR128:$Rd, VPR128:$Rn, VPR128:$Rn), 0>;
-
-// The MOVI instruction takes two immediate operands.  The first is the
-// immediate encoding, while the second is the cmode.  A cmode of 14, or
-// 0b1110, produces a MOVI operation, rather than a MVNI, ORR, or BIC.
-def Neon_AllZero : PatFrag<(ops), (Neon_movi (i32 0), (i32 14))>;
-def Neon_AllOne : PatFrag<(ops), (Neon_movi (i32 255), (i32 14))>;
-
-def Neon_not8B  : PatFrag<(ops node:$in),
-                          (xor node:$in, (bitconvert (v8i8 Neon_AllOne)))>;
-def Neon_not16B : PatFrag<(ops node:$in),
-                          (xor node:$in, (bitconvert (v16i8 Neon_AllOne)))>;
-
-def Neon_orn8B : PatFrag<(ops node:$Rn, node:$Rm),
-                         (or node:$Rn, (Neon_not8B node:$Rm))>;
-
-def Neon_orn16B : PatFrag<(ops node:$Rn, node:$Rm),
-                          (or node:$Rn, (Neon_not16B node:$Rm))>;
-
-def Neon_bic8B : PatFrag<(ops node:$Rn, node:$Rm),
-                         (and node:$Rn, (Neon_not8B node:$Rm))>;
-
-def Neon_bic16B : PatFrag<(ops node:$Rn, node:$Rm),
-                          (and node:$Rn, (Neon_not16B node:$Rm))>;
-
-
-// Vector Bitwise OR NOT - register
-
-defm ORNvvv : NeonI_3VSame_B_sizes<0b0, 0b11, 0b00011, "orn",
-                                   Neon_orn8B, Neon_orn16B, 0>;
-
-// Vector Bitwise Bit Clear (AND NOT) - register
-
-defm BICvvv : NeonI_3VSame_B_sizes<0b0, 0b01, 0b00011, "bic",
-                                   Neon_bic8B, Neon_bic16B, 0>;
-
-multiclass Neon_bitwise2V_patterns<SDPatternOperator opnode8B,
-                                   SDPatternOperator opnode16B,
-                                   Instruction INST8B,
-                                   Instruction INST16B> {
-  def : Pat<(v2i32 (opnode8B VPR64:$Rn, VPR64:$Rm)),
-            (INST8B VPR64:$Rn, VPR64:$Rm)>;
-  def : Pat<(v4i16 (opnode8B VPR64:$Rn, VPR64:$Rm)),
-            (INST8B VPR64:$Rn, VPR64:$Rm)>;
-  def : Pat<(v1i64 (opnode8B VPR64:$Rn, VPR64:$Rm)),
-            (INST8B VPR64:$Rn, VPR64:$Rm)>;
-  def : Pat<(v4i32 (opnode16B VPR128:$Rn, VPR128:$Rm)),
-            (INST16B VPR128:$Rn, VPR128:$Rm)>;
-  def : Pat<(v8i16 (opnode16B VPR128:$Rn, VPR128:$Rm)),
-            (INST16B VPR128:$Rn, VPR128:$Rm)>;
-  def : Pat<(v2i64 (opnode16B VPR128:$Rn, VPR128:$Rm)),
-            (INST16B VPR128:$Rn, VPR128:$Rm)>;
-}
-
-// Additional patterns for bitwise instructions AND, EOR, ORR, BIC, ORN
-defm : Neon_bitwise2V_patterns<and, and, ANDvvv_8B, ANDvvv_16B>;
-defm : Neon_bitwise2V_patterns<or,  or,  ORRvvv_8B, ORRvvv_16B>;
-defm : Neon_bitwise2V_patterns<xor, xor, EORvvv_8B, EORvvv_16B>;
-defm : Neon_bitwise2V_patterns<Neon_bic8B, Neon_bic16B, BICvvv_8B, BICvvv_16B>;
-defm : Neon_bitwise2V_patterns<Neon_orn8B, Neon_orn16B, ORNvvv_8B, ORNvvv_16B>;
-
-//   Vector Bitwise Select
-def BSLvvv_8B  : NeonI_3VSame_Constraint_impl<"bsl", ".8b",  VPR64, v8i8,
-                                              0b0, 0b1, 0b01, 0b00011, vselect>;
-
-def BSLvvv_16B : NeonI_3VSame_Constraint_impl<"bsl", ".16b", VPR128, v16i8,
-                                              0b1, 0b1, 0b01, 0b00011, vselect>;
-
-multiclass Neon_bitwise3V_patterns<SDPatternOperator opnode,
-                                   Instruction INST8B,
-                                   Instruction INST16B> {
-  // Disassociate type from instruction definition
-  def : Pat<(v8i8 (opnode (v8i8 VPR64:$src), VPR64:$Rn, VPR64:$Rm)),
-            (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>;
-  def : Pat<(v2i32 (opnode (v2i32 VPR64:$src), VPR64:$Rn, VPR64:$Rm)),
-            (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>;
-  def : Pat<(v2f32 (opnode (v2i32 VPR64:$src), VPR64:$Rn, VPR64:$Rm)),
-            (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>;
-  def : Pat<(v4i16 (opnode (v4i16 VPR64:$src), VPR64:$Rn, VPR64:$Rm)),
-            (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>;
-  def : Pat<(v1i64 (opnode (v1i64 VPR64:$src), VPR64:$Rn, VPR64:$Rm)),
-            (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>;
-  def : Pat<(v1f64 (opnode (v1i64 VPR64:$src), VPR64:$Rn, VPR64:$Rm)),
-            (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>;
-  def : Pat<(v16i8 (opnode (v16i8 VPR128:$src), VPR128:$Rn, VPR128:$Rm)),
-            (INST16B VPR128:$src, VPR128:$Rn, VPR128:$Rm)>;
-  def : Pat<(v4i32 (opnode (v4i32 VPR128:$src), VPR128:$Rn, VPR128:$Rm)),
-            (INST16B VPR128:$src, VPR128:$Rn, VPR128:$Rm)>;
-  def : Pat<(v8i16 (opnode (v8i16 VPR128:$src), VPR128:$Rn, VPR128:$Rm)),
-            (INST16B VPR128:$src, VPR128:$Rn, VPR128:$Rm)>;
-  def : Pat<(v2i64 (opnode (v2i64 VPR128:$src), VPR128:$Rn, VPR128:$Rm)),
-            (INST16B VPR128:$src, VPR128:$Rn, VPR128:$Rm)>;
-  def : Pat<(v2f64 (opnode (v2i64 VPR128:$src), VPR128:$Rn, VPR128:$Rm)),
-            (INST16B VPR128:$src, VPR128:$Rn, VPR128:$Rm)>;
-  def : Pat<(v4f32 (opnode (v4i32 VPR128:$src), VPR128:$Rn, VPR128:$Rm)),
-            (INST16B VPR128:$src, VPR128:$Rn, VPR128:$Rm)>;
-
-  // Allow to match BSL instruction pattern with non-constant operand
-  def : Pat<(v8i8 (or (and VPR64:$Rn, VPR64:$Rd),
-                    (and VPR64:$Rm, (Neon_not8B VPR64:$Rd)))),
-          (INST8B VPR64:$Rd, VPR64:$Rn, VPR64:$Rm)>;
-  def : Pat<(v4i16 (or (and VPR64:$Rn, VPR64:$Rd),
-                     (and VPR64:$Rm, (Neon_not8B VPR64:$Rd)))),
-          (INST8B VPR64:$Rd, VPR64:$Rn, VPR64:$Rm)>;
-  def : Pat<(v2i32 (or (and VPR64:$Rn, VPR64:$Rd),
-                     (and VPR64:$Rm, (Neon_not8B VPR64:$Rd)))),
-          (INST8B VPR64:$Rd, VPR64:$Rn, VPR64:$Rm)>;
-  def : Pat<(v1i64 (or (and VPR64:$Rn, VPR64:$Rd),
-                     (and VPR64:$Rm, (Neon_not8B VPR64:$Rd)))),
-          (INST8B VPR64:$Rd, VPR64:$Rn, VPR64:$Rm)>;
-  def : Pat<(v16i8 (or (and VPR128:$Rn, VPR128:$Rd),
-                     (and VPR128:$Rm, (Neon_not16B VPR128:$Rd)))),
-          (INST16B VPR128:$Rd, VPR128:$Rn, VPR128:$Rm)>;
-  def : Pat<(v8i16 (or (and VPR128:$Rn, VPR128:$Rd),
-                     (and VPR128:$Rm, (Neon_not16B VPR128:$Rd)))),
-          (INST16B VPR128:$Rd, VPR128:$Rn, VPR128:$Rm)>;
-  def : Pat<(v4i32 (or (and VPR128:$Rn, VPR128:$Rd),
-                     (and VPR128:$Rm, (Neon_not16B VPR128:$Rd)))),
-          (INST16B VPR128:$Rd, VPR128:$Rn, VPR128:$Rm)>;
-  def : Pat<(v2i64 (or (and VPR128:$Rn, VPR128:$Rd),
-                     (and VPR128:$Rm, (Neon_not16B VPR128:$Rd)))),
-          (INST16B VPR128:$Rd, VPR128:$Rn, VPR128:$Rm)>;
-
-  // Allow to match llvm.arm.* intrinsics.
-  def : Pat<(v8i8 (int_arm_neon_vbsl (v8i8 VPR64:$src),
-                    (v8i8 VPR64:$Rn), (v8i8 VPR64:$Rm))),
-            (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>;
-  def : Pat<(v4i16 (int_arm_neon_vbsl (v4i16 VPR64:$src),
-                    (v4i16 VPR64:$Rn), (v4i16 VPR64:$Rm))),
-            (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>;
-  def : Pat<(v2i32 (int_arm_neon_vbsl (v2i32 VPR64:$src),
-                    (v2i32 VPR64:$Rn), (v2i32 VPR64:$Rm))),
-            (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>;
-  def : Pat<(v1i64 (int_arm_neon_vbsl (v1i64 VPR64:$src),
-                    (v1i64 VPR64:$Rn), (v1i64 VPR64:$Rm))),
-            (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>;
-  def : Pat<(v2f32 (int_arm_neon_vbsl (v2f32 VPR64:$src),
-                    (v2f32 VPR64:$Rn), (v2f32 VPR64:$Rm))),
-            (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>;
-  def : Pat<(v1f64 (int_arm_neon_vbsl (v1f64 VPR64:$src),
-                    (v1f64 VPR64:$Rn), (v1f64 VPR64:$Rm))),
-            (INST8B VPR64:$src, VPR64:$Rn, VPR64:$Rm)>;
-  def : Pat<(v16i8 (int_arm_neon_vbsl (v16i8 VPR128:$src),
-                    (v16i8 VPR128:$Rn), (v16i8 VPR128:$Rm))),
-            (INST16B VPR128:$src, VPR128:$Rn, VPR128:$Rm)>;
-  def : Pat<(v8i16 (int_arm_neon_vbsl (v8i16 VPR128:$src),
-                    (v8i16 VPR128:$Rn), (v8i16 VPR128:$Rm))),
-            (INST16B VPR128:$src, VPR128:$Rn, VPR128:$Rm)>;
-  def : Pat<(v4i32 (int_arm_neon_vbsl (v4i32 VPR128:$src),
-                    (v4i32 VPR128:$Rn), (v4i32 VPR128:$Rm))),
-            (INST16B VPR128:$src, VPR128:$Rn, VPR128:$Rm)>;
-  def : Pat<(v2i64 (int_arm_neon_vbsl (v2i64 VPR128:$src),
-                    (v2i64 VPR128:$Rn), (v2i64 VPR128:$Rm))),
-            (INST16B VPR128:$src, VPR128:$Rn, VPR128:$Rm)>;
-  def : Pat<(v4f32 (int_arm_neon_vbsl (v4f32 VPR128:$src),
-                    (v4f32 VPR128:$Rn), (v4f32 VPR128:$Rm))),
-            (INST16B VPR128:$src, VPR128:$Rn, VPR128:$Rm)>;
-  def : Pat<(v2f64 (int_arm_neon_vbsl (v2f64 VPR128:$src),
-                    (v2f64 VPR128:$Rn), (v2f64 VPR128:$Rm))),
-            (INST16B VPR128:$src, VPR128:$Rn, VPR128:$Rm)>;
-}
-
-// Additional patterns for bitwise instruction BSL
-defm: Neon_bitwise3V_patterns<vselect, BSLvvv_8B, BSLvvv_16B>;
-
-def Neon_NoBSLop : PatFrag<(ops node:$src, node:$Rn, node:$Rm),
-                           (vselect node:$src, node:$Rn, node:$Rm),
-                           [{ (void)N; return false; }]>;
-
-// Vector Bitwise Insert if True
-
-def BITvvv_8B  : NeonI_3VSame_Constraint_impl<"bit", ".8b", VPR64,   v8i8,
-                   0b0, 0b1, 0b10, 0b00011, Neon_NoBSLop>;
-def BITvvv_16B : NeonI_3VSame_Constraint_impl<"bit", ".16b", VPR128, v16i8,
-                   0b1, 0b1, 0b10, 0b00011, Neon_NoBSLop>;
-
-// Vector Bitwise Insert if False
-
-def BIFvvv_8B  : NeonI_3VSame_Constraint_impl<"bif", ".8b", VPR64,  v8i8,
-                                0b0, 0b1, 0b11, 0b00011, Neon_NoBSLop>;
-def BIFvvv_16B : NeonI_3VSame_Constraint_impl<"bif", ".16b", VPR128, v16i8,
-                                0b1, 0b1, 0b11, 0b00011, Neon_NoBSLop>;
-
-// Vector Absolute Difference and Accumulate (Signed, Unsigned)
-
-def Neon_uaba : PatFrag<(ops node:$Ra, node:$Rn, node:$Rm),
-                       (add node:$Ra, (int_arm_neon_vabdu node:$Rn, node:$Rm))>;
-def Neon_saba : PatFrag<(ops node:$Ra, node:$Rn, node:$Rm),
-                       (add node:$Ra, (int_arm_neon_vabds node:$Rn, node:$Rm))>;
-
-// Vector Absolute Difference and Accumulate (Unsigned)
-def UABAvvv_8B :  NeonI_3VSame_Constraint_impl<"uaba", ".8b",  VPR64,  v8i8,
-                    0b0, 0b1, 0b00, 0b01111, Neon_uaba>;
-def UABAvvv_16B : NeonI_3VSame_Constraint_impl<"uaba", ".16b", VPR128, v16i8,
-                    0b1, 0b1, 0b00, 0b01111, Neon_uaba>;
-def UABAvvv_4H :  NeonI_3VSame_Constraint_impl<"uaba", ".4h",  VPR64,  v4i16,
-                    0b0, 0b1, 0b01, 0b01111, Neon_uaba>;
-def UABAvvv_8H :  NeonI_3VSame_Constraint_impl<"uaba", ".8h",  VPR128, v8i16,
-                    0b1, 0b1, 0b01, 0b01111, Neon_uaba>;
-def UABAvvv_2S :  NeonI_3VSame_Constraint_impl<"uaba", ".2s",  VPR64,  v2i32,
-                    0b0, 0b1, 0b10, 0b01111, Neon_uaba>;
-def UABAvvv_4S :  NeonI_3VSame_Constraint_impl<"uaba", ".4s",  VPR128, v4i32,
-                    0b1, 0b1, 0b10, 0b01111, Neon_uaba>;
-
-// Vector Absolute Difference and Accumulate (Signed)
-def SABAvvv_8B :  NeonI_3VSame_Constraint_impl<"saba", ".8b",  VPR64,  v8i8,
-                    0b0, 0b0, 0b00, 0b01111, Neon_saba>;
-def SABAvvv_16B : NeonI_3VSame_Constraint_impl<"saba", ".16b", VPR128, v16i8,
-                    0b1, 0b0, 0b00, 0b01111, Neon_saba>;
-def SABAvvv_4H :  NeonI_3VSame_Constraint_impl<"saba", ".4h",  VPR64,  v4i16,
-                    0b0, 0b0, 0b01, 0b01111, Neon_saba>;
-def SABAvvv_8H :  NeonI_3VSame_Constraint_impl<"saba", ".8h",  VPR128, v8i16,
-                    0b1, 0b0, 0b01, 0b01111, Neon_saba>;
-def SABAvvv_2S :  NeonI_3VSame_Constraint_impl<"saba", ".2s",  VPR64,  v2i32,
-                    0b0, 0b0, 0b10, 0b01111, Neon_saba>;
-def SABAvvv_4S :  NeonI_3VSame_Constraint_impl<"saba", ".4s",  VPR128, v4i32,
-                    0b1, 0b0, 0b10, 0b01111, Neon_saba>;
-
-
-// Vector Absolute Difference (Signed, Unsigned)
-defm UABDvvv : NeonI_3VSame_BHS_sizes<0b1, 0b01110, "uabd", int_arm_neon_vabdu, 0>;
-defm SABDvvv : NeonI_3VSame_BHS_sizes<0b0, 0b01110, "sabd", int_arm_neon_vabds, 0>;
-
-// Vector Absolute Difference (Floating Point)
-defm FABDvvv: NeonI_3VSame_SD_sizes<0b1, 0b1, 0b11010, "fabd",
-                                    int_arm_neon_vabds, v2f32, v4f32, v2f64, 0>;
-
-// Vector Reciprocal Step (Floating Point)
-defm FRECPSvvv : NeonI_3VSame_SD_sizes<0b0, 0b0, 0b11111, "frecps",
-                                       int_arm_neon_vrecps,
-                                       v2f32, v4f32, v2f64, 0>;
-
-// Vector Reciprocal Square Root Step (Floating Point)
-defm FRSQRTSvvv : NeonI_3VSame_SD_sizes<0b0, 0b1, 0b11111, "frsqrts",
-                                        int_arm_neon_vrsqrts,
-                                        v2f32, v4f32, v2f64, 0>;
-
-// Vector Comparisons
-
-def Neon_cmeq : PatFrag<(ops node:$lhs, node:$rhs),
-                        (Neon_cmp node:$lhs, node:$rhs, SETEQ)>;
-def Neon_cmphs : PatFrag<(ops node:$lhs, node:$rhs),
-                         (Neon_cmp node:$lhs, node:$rhs, SETUGE)>;
-def Neon_cmge : PatFrag<(ops node:$lhs, node:$rhs),
-                        (Neon_cmp node:$lhs, node:$rhs, SETGE)>;
-def Neon_cmhi : PatFrag<(ops node:$lhs, node:$rhs),
-                        (Neon_cmp node:$lhs, node:$rhs, SETUGT)>;
-def Neon_cmgt : PatFrag<(ops node:$lhs, node:$rhs),
-                        (Neon_cmp node:$lhs, node:$rhs, SETGT)>;
-
-// NeonI_compare_aliases class: swaps register operands to implement
-// comparison aliases, e.g., CMLE is alias for CMGE with operands reversed.
-class NeonI_compare_aliases<string asmop, string asmlane,
-                            Instruction inst, RegisterOperand VPRC>
-  : NeonInstAlias<asmop # "\t$Rd" # asmlane #", $Rn" # asmlane #
-                    ", $Rm" # asmlane,
-                  (inst VPRC:$Rd, VPRC:$Rm, VPRC:$Rn), 0b0>;
-
-// Vector Comparisons (Integer)
-
-// Vector Compare Mask Equal (Integer)
-let isCommutable =1 in {
-defm CMEQvvv : NeonI_3VSame_BHSD_sizes<0b1, 0b10001, "cmeq", Neon_cmeq, 0>;
-}
-
-// Vector Compare Mask Higher or Same (Unsigned Integer)
-defm CMHSvvv : NeonI_3VSame_BHSD_sizes<0b1, 0b00111, "cmhs", Neon_cmphs, 0>;
-
-// Vector Compare Mask Greater Than or Equal (Integer)
-defm CMGEvvv : NeonI_3VSame_BHSD_sizes<0b0, 0b00111, "cmge", Neon_cmge, 0>;
-
-// Vector Compare Mask Higher (Unsigned Integer)
-defm CMHIvvv : NeonI_3VSame_BHSD_sizes<0b1, 0b00110, "cmhi", Neon_cmhi, 0>;
-
-// Vector Compare Mask Greater Than (Integer)
-defm CMGTvvv : NeonI_3VSame_BHSD_sizes<0b0, 0b00110, "cmgt", Neon_cmgt, 0>;
-
-// Vector Compare Mask Bitwise Test (Integer)
-defm CMTSTvvv:  NeonI_3VSame_BHSD_sizes<0b0, 0b10001, "cmtst", Neon_tst, 0>;
-
-// Vector Compare Mask Less or Same (Unsigned Integer)
-// CMLS is alias for CMHS with operands reversed.
-def CMLSvvv_8B  : NeonI_compare_aliases<"cmls", ".8b",  CMHSvvv_8B,  VPR64>;
-def CMLSvvv_16B : NeonI_compare_aliases<"cmls", ".16b", CMHSvvv_16B, VPR128>;
-def CMLSvvv_4H  : NeonI_compare_aliases<"cmls", ".4h",  CMHSvvv_4H,  VPR64>;
-def CMLSvvv_8H  : NeonI_compare_aliases<"cmls", ".8h",  CMHSvvv_8H,  VPR128>;
-def CMLSvvv_2S  : NeonI_compare_aliases<"cmls", ".2s",  CMHSvvv_2S,  VPR64>;
-def CMLSvvv_4S  : NeonI_compare_aliases<"cmls", ".4s",  CMHSvvv_4S,  VPR128>;
-def CMLSvvv_2D  : NeonI_compare_aliases<"cmls", ".2d",  CMHSvvv_2D,  VPR128>;
-
-// Vector Compare Mask Less Than or Equal (Integer)
-// CMLE is alias for CMGE with operands reversed.
-def CMLEvvv_8B  : NeonI_compare_aliases<"cmle", ".8b",  CMGEvvv_8B,  VPR64>;
-def CMLEvvv_16B : NeonI_compare_aliases<"cmle", ".16b", CMGEvvv_16B, VPR128>;
-def CMLEvvv_4H  : NeonI_compare_aliases<"cmle", ".4h",  CMGEvvv_4H,  VPR64>;
-def CMLEvvv_8H  : NeonI_compare_aliases<"cmle", ".8h",  CMGEvvv_8H,  VPR128>;
-def CMLEvvv_2S  : NeonI_compare_aliases<"cmle", ".2s",  CMGEvvv_2S,  VPR64>;
-def CMLEvvv_4S  : NeonI_compare_aliases<"cmle", ".4s",  CMGEvvv_4S,  VPR128>;
-def CMLEvvv_2D  : NeonI_compare_aliases<"cmle", ".2d",  CMGEvvv_2D,  VPR128>;
-
-// Vector Compare Mask Lower (Unsigned Integer)
-// CMLO is alias for CMHI with operands reversed.
-def CMLOvvv_8B  : NeonI_compare_aliases<"cmlo", ".8b",  CMHIvvv_8B,  VPR64>;
-def CMLOvvv_16B : NeonI_compare_aliases<"cmlo", ".16b", CMHIvvv_16B, VPR128>;
-def CMLOvvv_4H  : NeonI_compare_aliases<"cmlo", ".4h",  CMHIvvv_4H,  VPR64>;
-def CMLOvvv_8H  : NeonI_compare_aliases<"cmlo", ".8h",  CMHIvvv_8H,  VPR128>;
-def CMLOvvv_2S  : NeonI_compare_aliases<"cmlo", ".2s",  CMHIvvv_2S,  VPR64>;
-def CMLOvvv_4S  : NeonI_compare_aliases<"cmlo", ".4s",  CMHIvvv_4S,  VPR128>;
-def CMLOvvv_2D  : NeonI_compare_aliases<"cmlo", ".2d",  CMHIvvv_2D,  VPR128>;
-
-// Vector Compare Mask Less Than (Integer)
-// CMLT is alias for CMGT with operands reversed.
-def CMLTvvv_8B  : NeonI_compare_aliases<"cmlt", ".8b",  CMGTvvv_8B,  VPR64>;
-def CMLTvvv_16B : NeonI_compare_aliases<"cmlt", ".16b", CMGTvvv_16B, VPR128>;
-def CMLTvvv_4H  : NeonI_compare_aliases<"cmlt", ".4h",  CMGTvvv_4H,  VPR64>;
-def CMLTvvv_8H  : NeonI_compare_aliases<"cmlt", ".8h",  CMGTvvv_8H,  VPR128>;
-def CMLTvvv_2S  : NeonI_compare_aliases<"cmlt", ".2s",  CMGTvvv_2S,  VPR64>;
-def CMLTvvv_4S  : NeonI_compare_aliases<"cmlt", ".4s",  CMGTvvv_4S,  VPR128>;
-def CMLTvvv_2D  : NeonI_compare_aliases<"cmlt", ".2d",  CMGTvvv_2D,  VPR128>;
-
-
-def neon_uimm0_asmoperand : AsmOperandClass
-{
-  let Name = "UImm0";
-  let PredicateMethod = "isUImm<0>";
-  let RenderMethod = "addImmOperands";
-}
-
-def neon_uimm0 : Operand<i32>, ImmLeaf<i32, [{return Imm == 0;}]> {
-  let ParserMatchClass = neon_uimm0_asmoperand;
-  let PrintMethod = "printNeonUImm0Operand";
-
-}
-
-multiclass NeonI_cmpz_sizes<bit u, bits<5> opcode, string asmop, CondCode CC>
-{
-  def _8B :  NeonI_2VMisc<0b0, u, 0b00, opcode,
-             (outs VPR64:$Rd), (ins VPR64:$Rn, neon_uimm0:$Imm),
-             asmop # "\t$Rd.8b, $Rn.8b, $Imm",
-             [(set (v8i8 VPR64:$Rd),
-                (v8i8 (Neon_cmpz (v8i8 VPR64:$Rn), (i32 imm:$Imm), CC)))],
-             NoItinerary>,
-             Sched<[WriteFPALU, ReadFPALU]>;
-
-  def _16B : NeonI_2VMisc<0b1, u, 0b00, opcode,
-             (outs VPR128:$Rd), (ins VPR128:$Rn, neon_uimm0:$Imm),
-             asmop # "\t$Rd.16b, $Rn.16b, $Imm",
-             [(set (v16i8 VPR128:$Rd),
-                (v16i8 (Neon_cmpz (v16i8 VPR128:$Rn), (i32 imm:$Imm), CC)))],
-             NoItinerary>,
-             Sched<[WriteFPALU, ReadFPALU]>;
-
-  def _4H : NeonI_2VMisc<0b0, u, 0b01, opcode,
-            (outs VPR64:$Rd), (ins VPR64:$Rn, neon_uimm0:$Imm),
-            asmop # "\t$Rd.4h, $Rn.4h, $Imm",
-            [(set (v4i16 VPR64:$Rd),
-               (v4i16 (Neon_cmpz (v4i16 VPR64:$Rn), (i32 imm:$Imm), CC)))],
-            NoItinerary>,
-            Sched<[WriteFPALU, ReadFPALU]>;
-
-  def _8H : NeonI_2VMisc<0b1, u, 0b01, opcode,
-            (outs VPR128:$Rd), (ins VPR128:$Rn, neon_uimm0:$Imm),
-            asmop # "\t$Rd.8h, $Rn.8h, $Imm",
-            [(set (v8i16 VPR128:$Rd),
-               (v8i16 (Neon_cmpz (v8i16 VPR128:$Rn), (i32 imm:$Imm), CC)))],
-            NoItinerary>,
-            Sched<[WriteFPALU, ReadFPALU]>;
-
-  def _2S : NeonI_2VMisc<0b0, u, 0b10, opcode,
-            (outs VPR64:$Rd), (ins VPR64:$Rn, neon_uimm0:$Imm),
-            asmop # "\t$Rd.2s, $Rn.2s, $Imm",
-            [(set (v2i32 VPR64:$Rd),
-               (v2i32 (Neon_cmpz (v2i32 VPR64:$Rn), (i32 imm:$Imm), CC)))],
-            NoItinerary>,
-            Sched<[WriteFPALU, ReadFPALU]>;
-
-  def _4S : NeonI_2VMisc<0b1, u, 0b10, opcode,
-            (outs VPR128:$Rd), (ins VPR128:$Rn, neon_uimm0:$Imm),
-            asmop # "\t$Rd.4s, $Rn.4s, $Imm",
-            [(set (v4i32 VPR128:$Rd),
-               (v4i32 (Neon_cmpz (v4i32 VPR128:$Rn), (i32 imm:$Imm), CC)))],
-            NoItinerary>,
-            Sched<[WriteFPALU, ReadFPALU]>;
-
-  def _2D : NeonI_2VMisc<0b1, u, 0b11, opcode,
-            (outs VPR128:$Rd), (ins VPR128:$Rn, neon_uimm0:$Imm),
-            asmop # "\t$Rd.2d, $Rn.2d, $Imm",
-            [(set (v2i64 VPR128:$Rd),
-               (v2i64 (Neon_cmpz (v2i64 VPR128:$Rn), (i32 imm:$Imm), CC)))],
-            NoItinerary>,
-            Sched<[WriteFPALU, ReadFPALU]>;
-}
-
-// Vector Compare Mask Equal to Zero (Integer)
-defm CMEQvvi : NeonI_cmpz_sizes<0b0, 0b01001, "cmeq", SETEQ>;
-
-// Vector Compare Mask Greater Than or Equal to Zero (Signed Integer)
-defm CMGEvvi : NeonI_cmpz_sizes<0b1, 0b01000, "cmge", SETGE>;
-
-// Vector Compare Mask Greater Than Zero (Signed Integer)
-defm CMGTvvi : NeonI_cmpz_sizes<0b0, 0b01000, "cmgt", SETGT>;
-
-// Vector Compare Mask Less Than or Equal To Zero (Signed Integer)
-defm CMLEvvi : NeonI_cmpz_sizes<0b1, 0b01001, "cmle", SETLE>;
-
-// Vector Compare Mask Less Than Zero (Signed Integer)
-defm CMLTvvi : NeonI_cmpz_sizes<0b0, 0b01010, "cmlt", SETLT>;
-
-// Vector Comparisons (Floating Point)
-
-// Vector Compare Mask Equal (Floating Point)
-let isCommutable =1 in {
-defm FCMEQvvv : NeonI_3VSame_SD_sizes<0b0, 0b0, 0b11100, "fcmeq", Neon_cmeq,
-                                      v2i32, v4i32, v2i64, 0>;
-}
-
-// Vector Compare Mask Greater Than Or Equal (Floating Point)
-defm FCMGEvvv : NeonI_3VSame_SD_sizes<0b1, 0b0, 0b11100, "fcmge", Neon_cmge,
-                                      v2i32, v4i32, v2i64, 0>;
-
-// Vector Compare Mask Greater Than (Floating Point)
-defm FCMGTvvv : NeonI_3VSame_SD_sizes<0b1, 0b1, 0b11100, "fcmgt", Neon_cmgt,
-                                      v2i32, v4i32, v2i64, 0>;
-
-// Vector Compare Mask Less Than Or Equal (Floating Point)
-// FCMLE is alias for FCMGE with operands reversed.
-def FCMLEvvv_2S  : NeonI_compare_aliases<"fcmle", ".2s",  FCMGEvvv_2S,  VPR64>;
-def FCMLEvvv_4S  : NeonI_compare_aliases<"fcmle", ".4s",  FCMGEvvv_4S,  VPR128>;
-def FCMLEvvv_2D  : NeonI_compare_aliases<"fcmle", ".2d",  FCMGEvvv_2D,  VPR128>;
-
-// Vector Compare Mask Less Than (Floating Point)
-// FCMLT is alias for FCMGT with operands reversed.
-def FCMLTvvv_2S  : NeonI_compare_aliases<"fcmlt", ".2s",  FCMGTvvv_2S,  VPR64>;
-def FCMLTvvv_4S  : NeonI_compare_aliases<"fcmlt", ".4s",  FCMGTvvv_4S,  VPR128>;
-def FCMLTvvv_2D  : NeonI_compare_aliases<"fcmlt", ".2d",  FCMGTvvv_2D,  VPR128>;
-
-def fpzero_izero_asmoperand : AsmOperandClass {
-  let Name = "FPZeroIZero";
-  let ParserMethod = "ParseFPImm0AndImm0Operand";
-  let DiagnosticType = "FPZero";
-}
-
-def fpzz32 : Operand<f32>,
-             ComplexPattern<f32, 1, "SelectFPZeroOperand", [fpimm]> {
-  let ParserMatchClass = fpzero_izero_asmoperand;
-  let PrintMethod = "printFPZeroOperand";
-  let DecoderMethod = "DecodeFPZeroOperand";
-}
-
-multiclass NeonI_fpcmpz_sizes<bit u, bit size, bits<5> opcode,
-                              string asmop, CondCode CC>
-{
-  def _2S : NeonI_2VMisc<0b0, u, {size, 0b0}, opcode,
-            (outs VPR64:$Rd), (ins VPR64:$Rn, fpzz32:$FPImm),
-            asmop # "\t$Rd.2s, $Rn.2s, $FPImm",
-            [(set (v2i32 VPR64:$Rd),
-               (v2i32 (Neon_cmpz (v2f32 VPR64:$Rn), (f32 fpzz32:$FPImm), CC)))],
-            NoItinerary>,
-            Sched<[WriteFPALU, ReadFPALU]>;
-
-  def _4S : NeonI_2VMisc<0b1, u, {size, 0b0}, opcode,
-            (outs VPR128:$Rd), (ins VPR128:$Rn, fpzz32:$FPImm),
-            asmop # "\t$Rd.4s, $Rn.4s, $FPImm",
-            [(set (v4i32 VPR128:$Rd),
-               (v4i32 (Neon_cmpz (v4f32 VPR128:$Rn), (f32 fpzz32:$FPImm), CC)))],
-            NoItinerary>,
-            Sched<[WriteFPALU, ReadFPALU]>;
-
-  def _2D : NeonI_2VMisc<0b1, u, {size, 0b1}, opcode,
-            (outs VPR128:$Rd), (ins VPR128:$Rn, fpzz32:$FPImm),
-            asmop # "\t$Rd.2d, $Rn.2d, $FPImm",
-            [(set (v2i64 VPR128:$Rd),
-               (v2i64 (Neon_cmpz (v2f64 VPR128:$Rn), (f32 fpzz32:$FPImm), CC)))],
-            NoItinerary>,
-            Sched<[WriteFPALU, ReadFPALU]>;
-}
-
-// Vector Compare Mask Equal to Zero (Floating Point)
-defm FCMEQvvi : NeonI_fpcmpz_sizes<0b0, 0b1, 0b01101, "fcmeq", SETEQ>;
-
-// Vector Compare Mask Greater Than or Equal to Zero (Floating Point)
-defm FCMGEvvi : NeonI_fpcmpz_sizes<0b1, 0b1, 0b01100, "fcmge", SETGE>;
-
-// Vector Compare Mask Greater Than Zero (Floating Point)
-defm FCMGTvvi : NeonI_fpcmpz_sizes<0b0, 0b1, 0b01100, "fcmgt", SETGT>;
-
-// Vector Compare Mask Less Than or Equal To Zero (Floating Point)
-defm FCMLEvvi : NeonI_fpcmpz_sizes<0b1, 0b1, 0b01101, "fcmle", SETLE>;
-
-// Vector Compare Mask Less Than Zero (Floating Point)
-defm FCMLTvvi : NeonI_fpcmpz_sizes<0b0, 0b1, 0b01110, "fcmlt", SETLT>;
-
-// Vector Absolute Comparisons (Floating Point)
-
-// Vector Absolute Compare Mask Greater Than Or Equal (Floating Point)
-defm FACGEvvv : NeonI_3VSame_SD_sizes<0b1, 0b0, 0b11101, "facge",
-                                      int_arm_neon_vacge,
-                                      v2i32, v4i32, v2i64, 0>;
-
-// Vector Absolute Compare Mask Greater Than (Floating Point)
-defm FACGTvvv : NeonI_3VSame_SD_sizes<0b1, 0b1, 0b11101, "facgt",
-                                      int_arm_neon_vacgt,
-                                      v2i32, v4i32, v2i64, 0>;
-
-// Vector Absolute Compare Mask Less Than Or Equal (Floating Point)
-// FACLE is alias for FACGE with operands reversed.
-def FACLEvvv_2S  : NeonI_compare_aliases<"facle", ".2s",  FACGEvvv_2S,  VPR64>;
-def FACLEvvv_4S  : NeonI_compare_aliases<"facle", ".4s",  FACGEvvv_4S,  VPR128>;
-def FACLEvvv_2D  : NeonI_compare_aliases<"facle", ".2d",  FACGEvvv_2D,  VPR128>;
-
-// Vector Absolute Compare Mask Less Than (Floating Point)
-// FACLT is alias for FACGT with operands reversed.
-def FACLTvvv_2S  : NeonI_compare_aliases<"faclt", ".2s",  FACGTvvv_2S,  VPR64>;
-def FACLTvvv_4S  : NeonI_compare_aliases<"faclt", ".4s",  FACGTvvv_4S,  VPR128>;
-def FACLTvvv_2D  : NeonI_compare_aliases<"faclt", ".2d",  FACGTvvv_2D,  VPR128>;
-
-// Vector halving add (Integer Signed, Unsigned)
-defm SHADDvvv :  NeonI_3VSame_BHS_sizes<0b0, 0b00000, "shadd",
-                                        int_arm_neon_vhadds, 1>;
-defm UHADDvvv :  NeonI_3VSame_BHS_sizes<0b1, 0b00000, "uhadd",
-                                        int_arm_neon_vhaddu, 1>;
-
-// Vector halving sub (Integer Signed, Unsigned)
-defm SHSUBvvv :  NeonI_3VSame_BHS_sizes<0b0, 0b00100, "shsub",
-                                        int_arm_neon_vhsubs, 0>;
-defm UHSUBvvv :  NeonI_3VSame_BHS_sizes<0b1, 0b00100, "uhsub",
-                                        int_arm_neon_vhsubu, 0>;
-
-// Vector rouding halving add (Integer Signed, Unsigned)
-defm SRHADDvvv :  NeonI_3VSame_BHS_sizes<0b0, 0b00010, "srhadd",
-                                         int_arm_neon_vrhadds, 1>;
-defm URHADDvvv :  NeonI_3VSame_BHS_sizes<0b1, 0b00010, "urhadd",
-                                         int_arm_neon_vrhaddu, 1>;
-
-// Vector Saturating add (Integer Signed, Unsigned)
-defm SQADDvvv :  NeonI_3VSame_BHSD_sizes<0b0, 0b00001, "sqadd",
-                   int_arm_neon_vqadds, 1>;
-defm UQADDvvv :  NeonI_3VSame_BHSD_sizes<0b1, 0b00001, "uqadd",
-                   int_arm_neon_vqaddu, 1>;
-
-// Vector Saturating sub (Integer Signed, Unsigned)
-defm SQSUBvvv :  NeonI_3VSame_BHSD_sizes<0b0, 0b00101, "sqsub",
-                   int_arm_neon_vqsubs, 1>;
-defm UQSUBvvv :  NeonI_3VSame_BHSD_sizes<0b1, 0b00101, "uqsub",
-                   int_arm_neon_vqsubu, 1>;
-
-// Vector Shift Left (Signed and Unsigned Integer)
-defm SSHLvvv : NeonI_3VSame_BHSD_sizes<0b0, 0b01000, "sshl",
-                 int_arm_neon_vshifts, 1>;
-defm USHLvvv : NeonI_3VSame_BHSD_sizes<0b1, 0b01000, "ushl",
-                 int_arm_neon_vshiftu, 1>;
-
-// Vector Saturating Shift Left (Signed and Unsigned Integer)
-defm SQSHLvvv : NeonI_3VSame_BHSD_sizes<0b0, 0b01001, "sqshl",
-                  int_arm_neon_vqshifts, 1>;
-defm UQSHLvvv : NeonI_3VSame_BHSD_sizes<0b1, 0b01001, "uqshl",
-                  int_arm_neon_vqshiftu, 1>;
-
-// Vector Rouding Shift Left (Signed and Unsigned Integer)
-defm SRSHLvvv : NeonI_3VSame_BHSD_sizes<0b0, 0b01010, "srshl",
-                  int_arm_neon_vrshifts, 1>;
-defm URSHLvvv : NeonI_3VSame_BHSD_sizes<0b1, 0b01010, "urshl",
-                  int_arm_neon_vrshiftu, 1>;
-
-// Vector Saturating Rouding Shift Left (Signed and Unsigned Integer)
-defm SQRSHLvvv : NeonI_3VSame_BHSD_sizes<0b0, 0b01011, "sqrshl",
-                   int_arm_neon_vqrshifts, 1>;
-defm UQRSHLvvv : NeonI_3VSame_BHSD_sizes<0b1, 0b01011, "uqrshl",
-                   int_arm_neon_vqrshiftu, 1>;
-
-// Vector Maximum (Signed and Unsigned Integer)
-defm SMAXvvv : NeonI_3VSame_BHS_sizes<0b0, 0b01100, "smax", int_arm_neon_vmaxs, 1>;
-defm UMAXvvv : NeonI_3VSame_BHS_sizes<0b1, 0b01100, "umax", int_arm_neon_vmaxu, 1>;
-
-// Vector Minimum (Signed and Unsigned Integer)
-defm SMINvvv : NeonI_3VSame_BHS_sizes<0b0, 0b01101, "smin", int_arm_neon_vmins, 1>;
-defm UMINvvv : NeonI_3VSame_BHS_sizes<0b1, 0b01101, "umin", int_arm_neon_vminu, 1>;
-
-// Vector Maximum (Floating Point)
-defm FMAXvvv : NeonI_3VSame_SD_sizes<0b0, 0b0, 0b11110, "fmax",
-                                     int_arm_neon_vmaxs,
-                                     v2f32, v4f32, v2f64, 1>;
-
-// Vector Minimum (Floating Point)
-defm FMINvvv : NeonI_3VSame_SD_sizes<0b0, 0b1, 0b11110, "fmin",
-                                     int_arm_neon_vmins,
-                                     v2f32, v4f32, v2f64, 1>;
-
-// Vector maxNum (Floating Point) -  prefer a number over a quiet NaN)
-defm FMAXNMvvv : NeonI_3VSame_SD_sizes<0b0, 0b0, 0b11000, "fmaxnm",
-                                       int_aarch64_neon_vmaxnm,
-                                       v2f32, v4f32, v2f64, 1>;
-
-// Vector minNum (Floating Point) - prefer a number over a quiet NaN)
-defm FMINNMvvv : NeonI_3VSame_SD_sizes<0b0, 0b1, 0b11000, "fminnm",
-                                       int_aarch64_neon_vminnm,
-                                       v2f32, v4f32, v2f64, 1>;
-
-// Vector Maximum Pairwise (Signed and Unsigned Integer)
-defm SMAXPvvv : NeonI_3VSame_BHS_sizes<0b0, 0b10100, "smaxp", int_arm_neon_vpmaxs, 1>;
-defm UMAXPvvv : NeonI_3VSame_BHS_sizes<0b1, 0b10100, "umaxp", int_arm_neon_vpmaxu, 1>;
-
-// Vector Minimum Pairwise (Signed and Unsigned Integer)
-defm SMINPvvv : NeonI_3VSame_BHS_sizes<0b0, 0b10101, "sminp", int_arm_neon_vpmins, 1>;
-defm UMINPvvv : NeonI_3VSame_BHS_sizes<0b1, 0b10101, "uminp", int_arm_neon_vpminu, 1>;
-
-// Vector Maximum Pairwise (Floating Point)
-defm FMAXPvvv : NeonI_3VSame_SD_sizes<0b1, 0b0, 0b11110, "fmaxp",
-                                     int_arm_neon_vpmaxs, v2f32, v4f32, v2f64, 1>;
-
-// Vector Minimum Pairwise (Floating Point)
-defm FMINPvvv : NeonI_3VSame_SD_sizes<0b1, 0b1, 0b11110, "fminp",
-                                     int_arm_neon_vpmins, v2f32, v4f32, v2f64, 1>;
-
-// Vector maxNum Pairwise (Floating Point) -  prefer a number over a quiet NaN)
-defm FMAXNMPvvv : NeonI_3VSame_SD_sizes<0b1, 0b0, 0b11000, "fmaxnmp",
-                                       int_aarch64_neon_vpmaxnm,
-                                       v2f32, v4f32, v2f64, 1>;
-
-// Vector minNum Pairwise (Floating Point) -  prefer a number over a quiet NaN)
-defm FMINNMPvvv : NeonI_3VSame_SD_sizes<0b1, 0b1, 0b11000, "fminnmp",
-                                       int_aarch64_neon_vpminnm,
-                                       v2f32, v4f32, v2f64, 1>;
-
-// Vector Addition Pairwise (Integer)
-defm ADDP : NeonI_3VSame_BHSD_sizes<0b0, 0b10111, "addp", int_arm_neon_vpadd, 1>;
-
-// Vector Addition Pairwise (Floating Point)
-defm FADDP : NeonI_3VSame_SD_sizes<0b1, 0b0, 0b11010, "faddp",
-                                       int_arm_neon_vpadd,
-                                       v2f32, v4f32, v2f64, 1>;
-
-let SchedRW = [WriteFPMul, ReadFPMul, ReadFPMul] in {
-// Vector Saturating Doubling Multiply High
-defm SQDMULHvvv : NeonI_3VSame_HS_sizes<0b0, 0b10110, "sqdmulh",
-                    int_arm_neon_vqdmulh, 1>;
-
-// Vector Saturating Rouding Doubling Multiply High
-defm SQRDMULHvvv : NeonI_3VSame_HS_sizes<0b1, 0b10110, "sqrdmulh",
-                     int_arm_neon_vqrdmulh, 1>;
-
-// Vector Multiply Extended (Floating Point)
-defm FMULXvvv : NeonI_3VSame_SD_sizes<0b0, 0b0, 0b11011, "fmulx",
-                                      int_aarch64_neon_vmulx,
-                                      v2f32, v4f32, v2f64, 1>;
-}
-
-// Patterns to match llvm.aarch64.* intrinsic for 
-// ADDP, SMINP, UMINP, SMAXP, UMAXP having i32 as output
-class Neon_VectorPair_v2i32_pattern<SDPatternOperator opnode, Instruction INST>
-  : Pat<(v1i32 (opnode (v2i32 VPR64:$Rn))),
-        (EXTRACT_SUBREG
-             (v2i32 (INST (v2i32 VPR64:$Rn), (v2i32 VPR64:$Rn))),
-             sub_32)>;
-
-def : Neon_VectorPair_v2i32_pattern<int_aarch64_neon_sminv, SMINPvvv_2S>;
-def : Neon_VectorPair_v2i32_pattern<int_aarch64_neon_uminv, UMINPvvv_2S>;
-def : Neon_VectorPair_v2i32_pattern<int_aarch64_neon_smaxv, SMAXPvvv_2S>;
-def : Neon_VectorPair_v2i32_pattern<int_aarch64_neon_umaxv, UMAXPvvv_2S>;
-def : Neon_VectorPair_v2i32_pattern<int_aarch64_neon_vaddv, ADDP_2S>;
-
-// Vector Immediate Instructions
-
-multiclass neon_mov_imm_shift_asmoperands<string PREFIX>
-{
-  def _asmoperand : AsmOperandClass
-    {
-      let Name = "NeonMovImmShift" # PREFIX;
-      let RenderMethod = "addNeonMovImmShift" # PREFIX # "Operands";
-      let PredicateMethod = "isNeonMovImmShift" # PREFIX;
-    }
-}
-
-// Definition of vector immediates shift operands
-
-// The selectable use-cases extract the shift operation
-// information from the OpCmode fields encoded in the immediate.
-def neon_mod_shift_imm_XFORM : SDNodeXForm<imm, [{
-  uint64_t OpCmode = N->getZExtValue();
-  unsigned ShiftImm;
-  unsigned ShiftOnesIn;
-  unsigned HasShift =
-    A64Imms::decodeNeonModShiftImm(OpCmode, ShiftImm, ShiftOnesIn);
-  if (!HasShift) return SDValue();
-  return CurDAG->getTargetConstant(ShiftImm, MVT::i32);
-}]>;
-
-// Vector immediates shift operands which accept LSL and MSL
-// shift operators with shift value in the range of 0, 8, 16, 24 (LSL),
-// or 0, 8 (LSLH) or 8, 16 (MSL).
-defm neon_mov_imm_LSL : neon_mov_imm_shift_asmoperands<"LSL">;
-defm neon_mov_imm_MSL : neon_mov_imm_shift_asmoperands<"MSL">;
-// LSLH restricts shift amount to  0, 8 out of 0, 8, 16, 24
-defm neon_mov_imm_LSLH : neon_mov_imm_shift_asmoperands<"LSLH">;
-
-multiclass neon_mov_imm_shift_operands<string PREFIX,
-                                       string HALF, string ISHALF, code pred>
-{
-   def _operand : Operand<i32>, ImmLeaf<i32, pred, neon_mod_shift_imm_XFORM>
-    {
-      let PrintMethod =
-        "printNeonMovImmShiftOperand<A64SE::" # PREFIX # ", " # ISHALF # ">";
-      let DecoderMethod =
-        "DecodeNeonMovImmShiftOperand<A64SE::" # PREFIX # ", " # ISHALF # ">";
-      let ParserMatchClass =
-        !cast<AsmOperandClass>("neon_mov_imm_" # PREFIX # HALF # "_asmoperand");
-    }
-}
-
-defm neon_mov_imm_LSL  : neon_mov_imm_shift_operands<"LSL", "", "false", [{
-  unsigned ShiftImm;
-  unsigned ShiftOnesIn;
-  unsigned HasShift =
-    A64Imms::decodeNeonModShiftImm(Imm, ShiftImm, ShiftOnesIn);
-  return (HasShift && !ShiftOnesIn);
-}]>;
-
-defm neon_mov_imm_MSL  : neon_mov_imm_shift_operands<"MSL", "", "false", [{
-  unsigned ShiftImm;
-  unsigned ShiftOnesIn;
-  unsigned HasShift =
-    A64Imms::decodeNeonModShiftImm(Imm, ShiftImm, ShiftOnesIn);
-  return (HasShift && ShiftOnesIn);
-}]>;
-
-defm neon_mov_imm_LSLH  : neon_mov_imm_shift_operands<"LSL", "H", "true", [{
-  unsigned ShiftImm;
-  unsigned ShiftOnesIn;
-  unsigned HasShift =
-    A64Imms::decodeNeonModShiftImm(Imm, ShiftImm, ShiftOnesIn);
-  return (HasShift && !ShiftOnesIn);
-}]>;
-
-def neon_uimm1_asmoperand : AsmOperandClass
-{
-  let Name = "UImm1";
-  let PredicateMethod = "isUImm<1>";
-  let RenderMethod = "addImmOperands";
-}
-
-def neon_uimm2_asmoperand : AsmOperandClass
-{
-  let Name = "UImm2";
-  let PredicateMethod = "isUImm<2>";
-  let RenderMethod = "addImmOperands";
-}
-
-def neon_uimm8_asmoperand : AsmOperandClass
-{
-  let Name = "UImm8";
-  let PredicateMethod = "isUImm<8>";
-  let RenderMethod = "addImmOperands";
-}
-
-def neon_uimm8 : Operand<i32>, ImmLeaf<i32, [{(void)Imm; return true;}]> {
-  let ParserMatchClass = neon_uimm8_asmoperand;
-  let PrintMethod = "printUImmHexOperand";
-}
-
-def neon_uimm64_mask_asmoperand : AsmOperandClass
-{
-  let Name = "NeonUImm64Mask";
-  let PredicateMethod = "isNeonUImm64Mask";
-  let RenderMethod = "addNeonUImm64MaskOperands";
-}
-
-// MCOperand for 64-bit bytemask with each byte having only the
-// value 0x00 and 0xff is encoded as an unsigned 8-bit value
-def neon_uimm64_mask : Operand<i32>, ImmLeaf<i32, [{(void)Imm; return true;}]> {
-  let ParserMatchClass = neon_uimm64_mask_asmoperand;
-  let PrintMethod = "printNeonUImm64MaskOperand";
-}
-
-multiclass NeonI_mov_imm_lsl_sizes<string asmop, bit op,
-                                   SDPatternOperator opnode>
-{
-    // shift zeros, per word
-    def _2S  : NeonI_1VModImm<0b0, op,
-                              (outs VPR64:$Rd),
-                              (ins neon_uimm8:$Imm,
-                                neon_mov_imm_LSL_operand:$Simm),
-                              !strconcat(asmop, "\t$Rd.2s, $Imm$Simm"),
-                              [(set (v2i32 VPR64:$Rd),
-                                 (v2i32 (opnode (timm:$Imm),
-                                   (neon_mov_imm_LSL_operand:$Simm))))],
-                              NoItinerary>,
-               Sched<[WriteFPALU]> {
-       bits<2> Simm;
-       let cmode = {0b0, Simm{1}, Simm{0}, 0b0};
-     }
-
-    def _4S  : NeonI_1VModImm<0b1, op,
-                              (outs VPR128:$Rd),
-                              (ins neon_uimm8:$Imm,
-                                neon_mov_imm_LSL_operand:$Simm),
-                              !strconcat(asmop, "\t$Rd.4s, $Imm$Simm"),
-                              [(set (v4i32 VPR128:$Rd),
-                                 (v4i32 (opnode (timm:$Imm),
-                                   (neon_mov_imm_LSL_operand:$Simm))))],
-                              NoItinerary>,
-               Sched<[WriteFPALU]> {
-      bits<2> Simm;
-      let cmode = {0b0, Simm{1}, Simm{0}, 0b0};
-    }
-
-    // shift zeros, per halfword
-    def _4H  : NeonI_1VModImm<0b0, op,
-                              (outs VPR64:$Rd),
-                              (ins neon_uimm8:$Imm,
-                                neon_mov_imm_LSLH_operand:$Simm),
-                              !strconcat(asmop, "\t$Rd.4h, $Imm$Simm"),
-                              [(set (v4i16 VPR64:$Rd),
-                                 (v4i16 (opnode (timm:$Imm),
-                                   (neon_mov_imm_LSLH_operand:$Simm))))],
-                              NoItinerary>,
-               Sched<[WriteFPALU]> {
-      bit  Simm;
-      let cmode = {0b1, 0b0, Simm, 0b0};
-    }
-
-    def _8H  : NeonI_1VModImm<0b1, op,
-                              (outs VPR128:$Rd),
-                              (ins neon_uimm8:$Imm,
-                                neon_mov_imm_LSLH_operand:$Simm),
-                              !strconcat(asmop, "\t$Rd.8h, $Imm$Simm"),
-                              [(set (v8i16 VPR128:$Rd),
-                                 (v8i16 (opnode (timm:$Imm),
-                                   (neon_mov_imm_LSLH_operand:$Simm))))],
-                              NoItinerary>,
-               Sched<[WriteFPALU]> {
-      bit Simm;
-      let cmode = {0b1, 0b0, Simm, 0b0};
-     }
-}
-
-multiclass NeonI_mov_imm_with_constraint_lsl_sizes<string asmop, bit op,
-                                                   SDPatternOperator opnode,
-                                                   SDPatternOperator neonopnode>
-{
-  let Constraints = "$src = $Rd" in {
-    // shift zeros, per word
-    def _2S  : NeonI_1VModImm<0b0, op,
-                 (outs VPR64:$Rd),
-                 (ins VPR64:$src, neon_uimm8:$Imm,
-                   neon_mov_imm_LSL_operand:$Simm),
-                 !strconcat(asmop, "\t$Rd.2s, $Imm$Simm"),
-                 [(set (v2i32 VPR64:$Rd),
-                    (v2i32 (opnode (v2i32 VPR64:$src),
-                      (v2i32 (neonopnode timm:$Imm,
-                        neon_mov_imm_LSL_operand:$Simm)))))],
-                 NoItinerary>,
-               Sched<[WriteFPALU, ReadFPALU]> {
-      bits<2> Simm;
-      let cmode = {0b0, Simm{1}, Simm{0}, 0b1};
-    }
-
-    def _4S  : NeonI_1VModImm<0b1, op,
-                 (outs VPR128:$Rd),
-                 (ins VPR128:$src, neon_uimm8:$Imm,
-                   neon_mov_imm_LSL_operand:$Simm),
-                 !strconcat(asmop, "\t$Rd.4s, $Imm$Simm"),
-                 [(set (v4i32 VPR128:$Rd),
-                    (v4i32 (opnode (v4i32 VPR128:$src),
-                      (v4i32 (neonopnode timm:$Imm,
-                        neon_mov_imm_LSL_operand:$Simm)))))],
-                 NoItinerary>,
-               Sched<[WriteFPALU, ReadFPALU]> {
-      bits<2> Simm;
-      let cmode = {0b0, Simm{1}, Simm{0}, 0b1};
-    }
-
-    // shift zeros, per halfword
-    def _4H  : NeonI_1VModImm<0b0, op,
-                 (outs VPR64:$Rd),
-                 (ins VPR64:$src, neon_uimm8:$Imm,
-                   neon_mov_imm_LSLH_operand:$Simm),
-                 !strconcat(asmop, "\t$Rd.4h, $Imm$Simm"),
-                 [(set (v4i16 VPR64:$Rd),
-                    (v4i16 (opnode (v4i16 VPR64:$src),
-                       (v4i16 (neonopnode timm:$Imm,
-                          neon_mov_imm_LSL_operand:$Simm)))))],
-                 NoItinerary>,
-               Sched<[WriteFPALU, ReadFPALU]> {
-      bit  Simm;
-      let cmode = {0b1, 0b0, Simm, 0b1};
-    }
-
-    def _8H  : NeonI_1VModImm<0b1, op,
-                 (outs VPR128:$Rd),
-                 (ins VPR128:$src, neon_uimm8:$Imm,
-                   neon_mov_imm_LSLH_operand:$Simm),
-                 !strconcat(asmop, "\t$Rd.8h, $Imm$Simm"),
-                 [(set (v8i16 VPR128:$Rd),
-                    (v8i16 (opnode (v8i16 VPR128:$src),
-                      (v8i16 (neonopnode timm:$Imm,
-                        neon_mov_imm_LSL_operand:$Simm)))))],
-                 NoItinerary>,
-               Sched<[WriteFPALU, ReadFPALU]> {
-      bit Simm;
-      let cmode = {0b1, 0b0, Simm, 0b1};
-    }
-  }
-}
-
-multiclass NeonI_mov_imm_msl_sizes<string asmop, bit op,
-                                   SDPatternOperator opnode>
-{
-    // shift ones, per word
-    def _2S  : NeonI_1VModImm<0b0, op,
-                             (outs VPR64:$Rd),
-                             (ins neon_uimm8:$Imm,
-                               neon_mov_imm_MSL_operand:$Simm),
-                             !strconcat(asmop, "\t$Rd.2s, $Imm$Simm"),
-                              [(set (v2i32 VPR64:$Rd),
-                                 (v2i32 (opnode (timm:$Imm),
-                                   (neon_mov_imm_MSL_operand:$Simm))))],
-                             NoItinerary>,
-               Sched<[WriteFPALU]> {
-       bit Simm;
-       let cmode = {0b1, 0b1, 0b0, Simm};
-     }
-
-   def _4S  : NeonI_1VModImm<0b1, op,
-                              (outs VPR128:$Rd),
-                              (ins neon_uimm8:$Imm,
-                                neon_mov_imm_MSL_operand:$Simm),
-                              !strconcat(asmop, "\t$Rd.4s, $Imm$Simm"),
-                              [(set (v4i32 VPR128:$Rd),
-                                 (v4i32 (opnode (timm:$Imm),
-                                   (neon_mov_imm_MSL_operand:$Simm))))],
-                              NoItinerary>,
-              Sched<[WriteFPALU]> {
-     bit Simm;
-     let cmode = {0b1, 0b1, 0b0, Simm};
-   }
-}
-
-// Vector Move Immediate Shifted
-let isReMaterializable = 1 in {
-defm MOVIvi_lsl : NeonI_mov_imm_lsl_sizes<"movi", 0b0, Neon_movi>;
-}
-
-// Vector Move Inverted Immediate Shifted
-let isReMaterializable = 1 in {
-defm MVNIvi_lsl : NeonI_mov_imm_lsl_sizes<"mvni", 0b1, Neon_mvni>;
-}
-
-// Vector Bitwise Bit Clear (AND NOT) - immediate
-let isReMaterializable = 1 in {
-defm BICvi_lsl : NeonI_mov_imm_with_constraint_lsl_sizes<"bic", 0b1,
-                                                         and, Neon_mvni>;
-}
-
-// Vector Bitwise OR - immedidate
-
-let isReMaterializable = 1 in {
-defm ORRvi_lsl   : NeonI_mov_imm_with_constraint_lsl_sizes<"orr", 0b0,
-                                                           or, Neon_movi>;
-}
-
-// Additional patterns for Vector Bitwise Bit Clear (AND NOT) - immedidate
-// LowerBUILD_VECTOR favors lowering MOVI over MVNI.
-// BIC immediate instructions selection requires additional patterns to
-// transform Neon_movi operands into BIC immediate operands
-
-def neon_mov_imm_LSLH_transform_XFORM : SDNodeXForm<imm, [{
-  uint64_t OpCmode = N->getZExtValue();
-  unsigned ShiftImm;
-  unsigned ShiftOnesIn;
-  (void)A64Imms::decodeNeonModShiftImm(OpCmode, ShiftImm, ShiftOnesIn);
-  // LSLH restricts shift amount to  0, 8 which are encoded as 0 and 1
-  // Transform encoded shift amount 0 to 1 and 1 to 0.
-  return CurDAG->getTargetConstant(!ShiftImm, MVT::i32);
-}]>;
-
-def neon_mov_imm_LSLH_transform_operand
-  : ImmLeaf<i32, [{
-    unsigned ShiftImm;
-    unsigned ShiftOnesIn;
-    unsigned HasShift =
-      A64Imms::decodeNeonModShiftImm(Imm, ShiftImm, ShiftOnesIn);
-    return (HasShift && !ShiftOnesIn); }],
-  neon_mov_imm_LSLH_transform_XFORM>;
-
-// Transform (and A, (4h Neon_movi 0xff)) -> BIC 4h (A, 0xff, LSL 8)
-// Transform (and A, (4h Neon_movi 0xff LSL #8)) -> BIC 4h (A, 0xff)
-def : Pat<(v4i16 (and VPR64:$src,
-            (v4i16 (Neon_movi 255,
-              neon_mov_imm_LSLH_transform_operand:$Simm)))),
-          (BICvi_lsl_4H VPR64:$src, 255,
-            neon_mov_imm_LSLH_transform_operand:$Simm)>;
-
-// Transform (and A, (8h Neon_movi 8h 0xff)) -> BIC 8h (A, 0xff, LSL 8)
-// Transform (and A, (8h Neon_movi 0xff LSL #8)) -> BIC 8h (A, 0xff)
-def : Pat<(v8i16 (and VPR128:$src,
-            (v8i16 (Neon_movi 255,
-              neon_mov_imm_LSLH_transform_operand:$Simm)))),
-          (BICvi_lsl_8H VPR128:$src, 255,
-            neon_mov_imm_LSLH_transform_operand:$Simm)>;
-
-def : Pat<(v8i8 (and VPR64:$src,
-                  (bitconvert(v4i16 (Neon_movi 255,
-                    neon_mov_imm_LSLH_transform_operand:$Simm))))),
-          (BICvi_lsl_4H VPR64:$src, 255,
-            neon_mov_imm_LSLH_transform_operand:$Simm)>;
-def : Pat<(v2i32 (and VPR64:$src,
-                 (bitconvert(v4i16 (Neon_movi 255,
-                   neon_mov_imm_LSLH_transform_operand:$Simm))))),
-          (BICvi_lsl_4H VPR64:$src, 255,
-            neon_mov_imm_LSLH_transform_operand:$Simm)>;
-def : Pat<(v1i64 (and VPR64:$src,
-                (bitconvert(v4i16 (Neon_movi 255,
-                  neon_mov_imm_LSLH_transform_operand:$Simm))))),
-        (BICvi_lsl_4H VPR64:$src, 255,
-          neon_mov_imm_LSLH_transform_operand:$Simm)>;
-
-def : Pat<(v16i8 (and VPR128:$src,
-                 (bitconvert(v8i16 (Neon_movi 255,
-                   neon_mov_imm_LSLH_transform_operand:$Simm))))),
-        (BICvi_lsl_8H VPR128:$src, 255,
-          neon_mov_imm_LSLH_transform_operand:$Simm)>;
-def : Pat<(v4i32 (and VPR128:$src,
-                 (bitconvert(v8i16 (Neon_movi 255,
-                   neon_mov_imm_LSLH_transform_operand:$Simm))))),
-        (BICvi_lsl_8H VPR128:$src, 255,
-          neon_mov_imm_LSLH_transform_operand:$Simm)>;
-def : Pat<(v2i64 (and VPR128:$src,
-                 (bitconvert(v8i16 (Neon_movi 255,
-                   neon_mov_imm_LSLH_transform_operand:$Simm))))),
-        (BICvi_lsl_8H VPR128:$src, 255,
-          neon_mov_imm_LSLH_transform_operand:$Simm)>;
-
-multiclass Neon_bitwiseVi_patterns<SDPatternOperator opnode,
-                                   SDPatternOperator neonopnode,
-                                   Instruction INST4H,
-                                   Instruction INST8H,
-                                   Instruction INST2S,
-                                   Instruction INST4S> {
-  def : Pat<(v8i8 (opnode VPR64:$src,
-                    (bitconvert(v4i16 (neonopnode timm:$Imm,
-                      neon_mov_imm_LSLH_operand:$Simm))))),
-            (INST4H VPR64:$src, neon_uimm8:$Imm,
-              neon_mov_imm_LSLH_operand:$Simm)>;
-  def : Pat<(v2i32 (opnode VPR64:$src,
-                   (bitconvert(v4i16 (neonopnode timm:$Imm,
-                     neon_mov_imm_LSLH_operand:$Simm))))),
-            (INST4H VPR64:$src, neon_uimm8:$Imm,
-              neon_mov_imm_LSLH_operand:$Simm)>;
-  def : Pat<(v1i64 (opnode VPR64:$src,
-                  (bitconvert(v4i16 (neonopnode timm:$Imm,
-                    neon_mov_imm_LSLH_operand:$Simm))))),
-          (INST4H VPR64:$src, neon_uimm8:$Imm,
-            neon_mov_imm_LSLH_operand:$Simm)>;
-
-  def : Pat<(v16i8 (opnode VPR128:$src,
-                   (bitconvert(v8i16 (neonopnode timm:$Imm,
-                     neon_mov_imm_LSLH_operand:$Simm))))),
-          (INST8H VPR128:$src, neon_uimm8:$Imm,
-            neon_mov_imm_LSLH_operand:$Simm)>;
-  def : Pat<(v4i32 (opnode VPR128:$src,
-                   (bitconvert(v8i16 (neonopnode timm:$Imm,
-                     neon_mov_imm_LSLH_operand:$Simm))))),
-          (INST8H VPR128:$src, neon_uimm8:$Imm,
-            neon_mov_imm_LSLH_operand:$Simm)>;
-  def : Pat<(v2i64 (opnode VPR128:$src,
-                   (bitconvert(v8i16 (neonopnode timm:$Imm,
-                     neon_mov_imm_LSLH_operand:$Simm))))),
-          (INST8H VPR128:$src, neon_uimm8:$Imm,
-            neon_mov_imm_LSLH_operand:$Simm)>;
-
-  def : Pat<(v8i8 (opnode VPR64:$src,
-                    (bitconvert(v2i32 (neonopnode timm:$Imm,
-                      neon_mov_imm_LSLH_operand:$Simm))))),
-            (INST2S VPR64:$src, neon_uimm8:$Imm,
-              neon_mov_imm_LSLH_operand:$Simm)>;
-  def : Pat<(v4i16 (opnode VPR64:$src,
-                   (bitconvert(v2i32 (neonopnode timm:$Imm,
-                     neon_mov_imm_LSLH_operand:$Simm))))),
-            (INST2S VPR64:$src, neon_uimm8:$Imm,
-              neon_mov_imm_LSLH_operand:$Simm)>;
-  def : Pat<(v1i64 (opnode VPR64:$src,
-                  (bitconvert(v2i32 (neonopnode timm:$Imm,
-                    neon_mov_imm_LSLH_operand:$Simm))))),
-          (INST2S VPR64:$src, neon_uimm8:$Imm,
-            neon_mov_imm_LSLH_operand:$Simm)>;
-
-  def : Pat<(v16i8 (opnode VPR128:$src,
-                   (bitconvert(v4i32 (neonopnode timm:$Imm,
-                     neon_mov_imm_LSLH_operand:$Simm))))),
-          (INST4S VPR128:$src, neon_uimm8:$Imm,
-            neon_mov_imm_LSLH_operand:$Simm)>;
-  def : Pat<(v8i16 (opnode VPR128:$src,
-                   (bitconvert(v4i32 (neonopnode timm:$Imm,
-                     neon_mov_imm_LSLH_operand:$Simm))))),
-          (INST4S VPR128:$src, neon_uimm8:$Imm,
-            neon_mov_imm_LSLH_operand:$Simm)>;
-  def : Pat<(v2i64 (opnode VPR128:$src,
-                   (bitconvert(v4i32 (neonopnode timm:$Imm,
-                     neon_mov_imm_LSLH_operand:$Simm))))),
-          (INST4S VPR128:$src, neon_uimm8:$Imm,
-            neon_mov_imm_LSLH_operand:$Simm)>;
-}
-
-// Additional patterns for Vector Vector Bitwise Bit Clear (AND NOT) - immediate
-defm : Neon_bitwiseVi_patterns<and, Neon_mvni, BICvi_lsl_4H, BICvi_lsl_8H,
-                               BICvi_lsl_2S, BICvi_lsl_4S>;
-
-// Additional patterns for Vector Bitwise OR - immedidate
-defm : Neon_bitwiseVi_patterns<or, Neon_movi, ORRvi_lsl_4H, ORRvi_lsl_8H,
-                               ORRvi_lsl_2S, ORRvi_lsl_4S>;
-
-
-// Vector Move Immediate Masked
-let isReMaterializable = 1 in {
-defm MOVIvi_msl : NeonI_mov_imm_msl_sizes<"movi", 0b0, Neon_movi>;
-}
-
-// Vector Move Inverted Immediate Masked
-let isReMaterializable = 1 in {
-defm MVNIvi_msl : NeonI_mov_imm_msl_sizes<"mvni", 0b1, Neon_mvni>;
-}
-
-class NeonI_mov_imm_lsl_aliases<string asmop, string asmlane,
-                                Instruction inst, RegisterOperand VPRC>
-  : NeonInstAlias<!strconcat(asmop, "\t$Rd," # asmlane # ", $Imm"),
-                        (inst VPRC:$Rd, neon_uimm8:$Imm,  0), 0b0>;
-
-// Aliases for Vector Move Immediate Shifted
-def : NeonI_mov_imm_lsl_aliases<"movi", ".2s", MOVIvi_lsl_2S, VPR64>;
-def : NeonI_mov_imm_lsl_aliases<"movi", ".4s", MOVIvi_lsl_4S, VPR128>;
-def : NeonI_mov_imm_lsl_aliases<"movi", ".4h", MOVIvi_lsl_4H, VPR64>;
-def : NeonI_mov_imm_lsl_aliases<"movi", ".8h", MOVIvi_lsl_8H, VPR128>;
-
-// Aliases for Vector Move Inverted Immediate Shifted
-def : NeonI_mov_imm_lsl_aliases<"mvni", ".2s", MVNIvi_lsl_2S, VPR64>;
-def : NeonI_mov_imm_lsl_aliases<"mvni", ".4s", MVNIvi_lsl_4S, VPR128>;
-def : NeonI_mov_imm_lsl_aliases<"mvni", ".4h", MVNIvi_lsl_4H, VPR64>;
-def : NeonI_mov_imm_lsl_aliases<"mvni", ".8h", MVNIvi_lsl_8H, VPR128>;
-
-// Aliases for Vector Bitwise Bit Clear (AND NOT) - immediate
-def : NeonI_mov_imm_lsl_aliases<"bic", ".2s", BICvi_lsl_2S, VPR64>;
-def : NeonI_mov_imm_lsl_aliases<"bic", ".4s", BICvi_lsl_4S, VPR128>;
-def : NeonI_mov_imm_lsl_aliases<"bic", ".4h", BICvi_lsl_4H, VPR64>;
-def : NeonI_mov_imm_lsl_aliases<"bic", ".8h", BICvi_lsl_8H, VPR128>;
-
-// Aliases for Vector Bitwise OR - immedidate
-def : NeonI_mov_imm_lsl_aliases<"orr", ".2s", ORRvi_lsl_2S, VPR64>;
-def : NeonI_mov_imm_lsl_aliases<"orr", ".4s", ORRvi_lsl_4S, VPR128>;
-def : NeonI_mov_imm_lsl_aliases<"orr", ".4h", ORRvi_lsl_4H, VPR64>;
-def : NeonI_mov_imm_lsl_aliases<"orr", ".8h", ORRvi_lsl_8H, VPR128>;
-
-//  Vector Move Immediate - per byte
-let isReMaterializable = 1 in {
-def MOVIvi_8B : NeonI_1VModImm<0b0, 0b0,
-                               (outs VPR64:$Rd), (ins neon_uimm8:$Imm),
-                               "movi\t$Rd.8b, $Imm",
-                               [(set (v8i8 VPR64:$Rd),
-                                  (v8i8 (Neon_movi (timm:$Imm), (i32 imm))))],
-                                NoItinerary>,
-                Sched<[WriteFPALU]> {
-  let cmode = 0b1110;
-}
-
-def MOVIvi_16B : NeonI_1VModImm<0b1, 0b0,
-                                (outs VPR128:$Rd), (ins neon_uimm8:$Imm),
-                                "movi\t$Rd.16b, $Imm",
-                                [(set (v16i8 VPR128:$Rd),
-                                   (v16i8 (Neon_movi (timm:$Imm), (i32 imm))))],
-                                 NoItinerary>,
-                Sched<[WriteFPALU]> {
-  let cmode = 0b1110;
-}
-}
-
-// Vector Move Immediate - bytemask, per double word
-let isReMaterializable = 1 in {
-def MOVIvi_2D : NeonI_1VModImm<0b1, 0b1,
-                               (outs VPR128:$Rd), (ins neon_uimm64_mask:$Imm),
-                               "movi\t $Rd.2d, $Imm",
-                               [(set (v2i64 VPR128:$Rd),
-                                  (v2i64 (Neon_movi (timm:$Imm), (i32 imm))))],
-                               NoItinerary>,
-                Sched<[WriteFPALU]> {
-  let cmode = 0b1110;
-}
-}
-
-// Vector Move Immediate - bytemask, one doubleword
-
-let isReMaterializable = 1 in {
-def MOVIdi : NeonI_1VModImm<0b0, 0b1,
-                           (outs FPR64:$Rd), (ins neon_uimm64_mask:$Imm),
-                           "movi\t $Rd, $Imm",
-                           [(set (v1i64 FPR64:$Rd),
-                             (v1i64 (Neon_movi (timm:$Imm), (i32 imm))))],
-                           NoItinerary>,
-             Sched<[WriteFPALU]> {
-  let cmode = 0b1110;
-}
-}
-
-// Vector Floating Point Move Immediate
-
-class NeonI_FMOV_impl<string asmlane, RegisterOperand VPRC, ValueType OpTy,
-                      Operand immOpType, bit q, bit op>
-  : NeonI_1VModImm<q, op,
-                   (outs VPRC:$Rd), (ins immOpType:$Imm),
-                   "fmov\t$Rd" # asmlane # ", $Imm",
-                   [(set (OpTy VPRC:$Rd),
-                      (OpTy (Neon_fmovi (timm:$Imm))))],
-                   NoItinerary>,
-    Sched<[WriteFPALU]> {
-     let cmode = 0b1111;
-   }
-
-let isReMaterializable = 1 in {
-def FMOVvi_2S : NeonI_FMOV_impl<".2s", VPR64,  v2f32, fmov32_operand, 0b0, 0b0>;
-def FMOVvi_4S : NeonI_FMOV_impl<".4s", VPR128, v4f32, fmov32_operand, 0b1, 0b0>;
-def FMOVvi_2D : NeonI_FMOV_impl<".2d", VPR128, v2f64, fmov64_operand, 0b1, 0b1>;
-}
-
-// Vector Shift (Immediate)
-
-// Shift Right/Left Immediate - The immh:immb field of these shifts are encoded
-// as follows:
-//
-//    Offset    Encoding
-//     8        immh:immb<6:3> = '0001xxx', <imm> is encoded in immh:immb<2:0>
-//     16       immh:immb<6:4> = '001xxxx', <imm> is encoded in immh:immb<3:0>
-//     32       immh:immb<6:5> = '01xxxxx', <imm> is encoded in immh:immb<4:0>
-//     64       immh:immb<6>   = '1xxxxxx', <imm> is encoded in immh:immb<5:0>
-//
-// The shift right immediate amount, in the range 1 to element bits, is computed
-// as Offset - UInt(immh:immb).  The shift left immediate amount, in the range 0
-// to element bits - 1, is computed as UInt(immh:immb) - Offset.
-
-class shr_imm_asmoperands<string OFFSET> : AsmOperandClass {
-  let Name = "ShrImm" # OFFSET;
-  let RenderMethod = "addImmOperands";
-  let DiagnosticType = "ShrImm" # OFFSET;
-}
-
-class shr_imm<string OFFSET> : Operand<i32> {
-  let EncoderMethod = "getShiftRightImm" # OFFSET;
-  let DecoderMethod = "DecodeShiftRightImm" # OFFSET;
-  let ParserMatchClass =
-    !cast<AsmOperandClass>("shr_imm" # OFFSET # "_asmoperand");
-}
-
-def shr_imm8_asmoperand : shr_imm_asmoperands<"8">;
-def shr_imm16_asmoperand : shr_imm_asmoperands<"16">;
-def shr_imm32_asmoperand : shr_imm_asmoperands<"32">;
-def shr_imm64_asmoperand : shr_imm_asmoperands<"64">;
-
-def shr_imm8 : shr_imm<"8">, ImmLeaf<i32, [{return Imm > 0 && Imm <= 8;}]>;
-def shr_imm16 : shr_imm<"16">, ImmLeaf<i32, [{return Imm > 0 && Imm <= 16;}]>;
-def shr_imm32 : shr_imm<"32">, ImmLeaf<i32, [{return Imm > 0 && Imm <= 32;}]>;
-def shr_imm64 : shr_imm<"64">, ImmLeaf<i32, [{return Imm > 0 && Imm <= 64;}]>;
-
-class shl_imm_asmoperands<string OFFSET> : AsmOperandClass {
-  let Name = "ShlImm" # OFFSET;
-  let RenderMethod = "addImmOperands";
-  let DiagnosticType = "ShlImm" # OFFSET;
-}
-
-class shl_imm<string OFFSET> : Operand<i32> {
-  let EncoderMethod = "getShiftLeftImm" # OFFSET;
-  let DecoderMethod = "DecodeShiftLeftImm" # OFFSET;
-  let ParserMatchClass =
-    !cast<AsmOperandClass>("shl_imm" # OFFSET # "_asmoperand");
-}
-
-def shl_imm8_asmoperand : shl_imm_asmoperands<"8">;
-def shl_imm16_asmoperand : shl_imm_asmoperands<"16">;
-def shl_imm32_asmoperand : shl_imm_asmoperands<"32">;
-def shl_imm64_asmoperand : shl_imm_asmoperands<"64">;
-
-def shl_imm8 : shl_imm<"8">, ImmLeaf<i32, [{return Imm >= 0 && Imm < 8;}]>;
-def shl_imm16 : shl_imm<"16">, ImmLeaf<i32, [{return Imm >= 0 && Imm < 16;}]>;
-def shl_imm32 : shl_imm<"32">, ImmLeaf<i32, [{return Imm >= 0 && Imm < 32;}]>;
-def shl_imm64 : shl_imm<"64">, ImmLeaf<i32, [{return Imm >= 0 && Imm < 64;}]>;
-
-class N2VShift<bit q, bit u, bits<5> opcode, string asmop, string T,
-               RegisterOperand VPRC, ValueType Ty, Operand ImmTy, SDNode OpNode>
-  : NeonI_2VShiftImm<q, u, opcode,
-                     (outs VPRC:$Rd), (ins VPRC:$Rn, ImmTy:$Imm),
-                     asmop # "\t$Rd." # T # ", $Rn." # T # ", $Imm",
-                     [(set (Ty VPRC:$Rd),
-                        (Ty (OpNode (Ty VPRC:$Rn),
-                          (Ty (Neon_vdup (i32 ImmTy:$Imm))))))],
-                     NoItinerary>,
-    Sched<[WriteFPALU, ReadFPALU]>;
-
-multiclass NeonI_N2VShL<bit u, bits<5> opcode, string asmop> {
-  // 64-bit vector types.
-  def _8B : N2VShift<0b0, u, opcode, asmop, "8b", VPR64, v8i8, shl_imm8, shl> {
-    let Inst{22-19} = 0b0001;  // immh:immb = 0001xxx
-  }
-
-  def _4H : N2VShift<0b0, u, opcode, asmop, "4h", VPR64, v4i16, shl_imm16, shl> {
-    let Inst{22-20} = 0b001;   // immh:immb = 001xxxx
-  }
-
-  def _2S : N2VShift<0b0, u, opcode, asmop, "2s", VPR64, v2i32, shl_imm32, shl> {
-    let Inst{22-21} = 0b01;    // immh:immb = 01xxxxx
-  }
-
-  // 128-bit vector types.
-  def _16B : N2VShift<0b1, u, opcode, asmop, "16b", VPR128, v16i8, shl_imm8, shl> {
-    let Inst{22-19} = 0b0001;  // immh:immb = 0001xxx
-  }
-
-  def _8H : N2VShift<0b1, u, opcode, asmop, "8h", VPR128, v8i16, shl_imm16, shl> {
-    let Inst{22-20} = 0b001;   // immh:immb = 001xxxx
-  }
-
-  def _4S : N2VShift<0b1, u, opcode, asmop, "4s", VPR128, v4i32, shl_imm32, shl> {
-    let Inst{22-21} = 0b01;    // immh:immb = 01xxxxx
-  }
-
-  def _2D : N2VShift<0b1, u, opcode, asmop, "2d", VPR128, v2i64, shl_imm64, shl> {
-    let Inst{22} = 0b1;        // immh:immb = 1xxxxxx
-  }
-}
-
-multiclass NeonI_N2VShR<bit u, bits<5> opcode, string asmop, SDNode OpNode> {
-  def _8B : N2VShift<0b0, u, opcode, asmop, "8b", VPR64, v8i8, shr_imm8,
-                     OpNode> {
-    let Inst{22-19} = 0b0001;
-  }
-
-  def _4H : N2VShift<0b0, u, opcode, asmop, "4h", VPR64, v4i16, shr_imm16,
-                     OpNode> {
-    let Inst{22-20} = 0b001;
-  }
-
-  def _2S : N2VShift<0b0, u, opcode, asmop, "2s", VPR64, v2i32, shr_imm32,
-                     OpNode> {
-     let Inst{22-21} = 0b01;
-  }
-
-  def _16B : N2VShift<0b1, u, opcode, asmop, "16b", VPR128, v16i8, shr_imm8,
-                      OpNode> {
-                      let Inst{22-19} = 0b0001;
-                    }
-
-  def _8H : N2VShift<0b1, u, opcode, asmop, "8h", VPR128, v8i16, shr_imm16,
-                     OpNode> {
-                     let Inst{22-20} = 0b001;
-                    }
-
-  def _4S : N2VShift<0b1, u, opcode, asmop, "4s", VPR128, v4i32, shr_imm32,
-                     OpNode> {
-                      let Inst{22-21} = 0b01;
-                    }
-
-  def _2D : N2VShift<0b1, u, opcode, asmop, "2d", VPR128, v2i64, shr_imm64,
-                     OpNode> {
-                      let Inst{22} = 0b1;
-                    }
-}
-
-// Shift left
-
-defm SHLvvi : NeonI_N2VShL<0b0, 0b01010, "shl">;
-
-// Additional patterns to match vector shift left by immediate.
-// (v1i8/v1i16/v1i32 types)
-def : Pat<(v1i8 (shl (v1i8 FPR8:$Rn),
-                     (v1i8 (Neon_vdup (i32 (shl_imm8:$Imm)))))),
-          (EXTRACT_SUBREG
-              (SHLvvi_8B (SUBREG_TO_REG (i64 0), FPR8:$Rn, sub_8),
-                          shl_imm8:$Imm),
-              sub_8)>;
-def : Pat<(v1i16 (shl (v1i16 FPR16:$Rn),
-                      (v1i16 (Neon_vdup (i32 (shl_imm16:$Imm)))))),
-          (EXTRACT_SUBREG
-              (SHLvvi_4H (SUBREG_TO_REG (i64 0), FPR16:$Rn, sub_16),
-                          shl_imm16:$Imm),
-              sub_16)>;
-def : Pat<(v1i32 (shl (v1i32 FPR32:$Rn),
-                      (v1i32 (Neon_vdup (i32 (shl_imm32:$Imm)))))),
-          (EXTRACT_SUBREG
-              (SHLvvi_2S (SUBREG_TO_REG (i64 0), FPR32:$Rn, sub_32),
-                          shl_imm32:$Imm),
-              sub_32)>;
-
-// Shift right
-defm SSHRvvi : NeonI_N2VShR<0b0, 0b00000, "sshr", sra>;
-defm USHRvvi : NeonI_N2VShR<0b1, 0b00000, "ushr", srl>;
-
-// Additional patterns to match vector shift right by immediate.
-// (v1i8/v1i16/v1i32 types)
-def : Pat<(v1i8 (sra (v1i8 FPR8:$Rn),
-                     (v1i8 (Neon_vdup (i32 (shr_imm8:$Imm)))))),
-          (EXTRACT_SUBREG
-              (SSHRvvi_8B (SUBREG_TO_REG (i64 0), FPR8:$Rn, sub_8),
-                          shr_imm8:$Imm),
-              sub_8)>;
-def : Pat<(v1i16 (sra (v1i16 FPR16:$Rn),
-                      (v1i16 (Neon_vdup (i32 (shr_imm16:$Imm)))))),
-          (EXTRACT_SUBREG
-              (SSHRvvi_4H (SUBREG_TO_REG (i64 0), FPR16:$Rn, sub_16),
-                          shr_imm16:$Imm),
-              sub_16)>;
-def : Pat<(v1i32 (sra (v1i32 FPR32:$Rn),
-                      (v1i32 (Neon_vdup (i32 (shr_imm32:$Imm)))))),
-          (EXTRACT_SUBREG
-              (SSHRvvi_2S (SUBREG_TO_REG (i64 0), FPR32:$Rn, sub_32),
-                          shr_imm32:$Imm),
-              sub_32)>;
-def : Pat<(v1i8 (srl (v1i8 FPR8:$Rn),
-                     (v1i8 (Neon_vdup (i32 (shr_imm8:$Imm)))))),
-          (EXTRACT_SUBREG
-              (USHRvvi_8B (SUBREG_TO_REG (i64 0), FPR8:$Rn, sub_8),
-                          shr_imm8:$Imm),
-              sub_8)>;
-def : Pat<(v1i16 (srl (v1i16 FPR16:$Rn),
-                      (v1i16 (Neon_vdup (i32 (shr_imm16:$Imm)))))),
-          (EXTRACT_SUBREG
-              (USHRvvi_4H (SUBREG_TO_REG (i64 0), FPR16:$Rn, sub_16),
-                          shr_imm16:$Imm),
-              sub_16)>;
-def : Pat<(v1i32 (srl (v1i32 FPR32:$Rn),
-                      (v1i32 (Neon_vdup (i32 (shr_imm32:$Imm)))))),
-          (EXTRACT_SUBREG
-              (USHRvvi_2S (SUBREG_TO_REG (i64 0), FPR32:$Rn, sub_32),
-                          shr_imm32:$Imm),
-              sub_32)>;
-
-def Neon_High16B : PatFrag<(ops node:$in),
-                           (extract_subvector (v16i8 node:$in), (iPTR 8))>;
-def Neon_High8H  : PatFrag<(ops node:$in),
-                           (extract_subvector (v8i16 node:$in), (iPTR 4))>;
-def Neon_High4S  : PatFrag<(ops node:$in),
-                           (extract_subvector (v4i32 node:$in), (iPTR 2))>;
-def Neon_High2D  : PatFrag<(ops node:$in),
-                           (extract_subvector (v2i64 node:$in), (iPTR 1))>;
-def Neon_High4float : PatFrag<(ops node:$in),
-                               (extract_subvector (v4f32 node:$in), (iPTR 2))>;
-def Neon_High2double : PatFrag<(ops node:$in),
-                               (extract_subvector (v2f64 node:$in), (iPTR 1))>;
-
-def Neon_Low16B : PatFrag<(ops node:$in),
-                          (v8i8 (extract_subvector (v16i8 node:$in),
-                                                   (iPTR 0)))>;
-def Neon_Low8H : PatFrag<(ops node:$in),
-                         (v4i16 (extract_subvector (v8i16 node:$in),
-                                                   (iPTR 0)))>;
-def Neon_Low4S : PatFrag<(ops node:$in),
-                         (v2i32 (extract_subvector (v4i32 node:$in),
-                                                   (iPTR 0)))>;
-def Neon_Low2D : PatFrag<(ops node:$in),
-                         (v1i64 (extract_subvector (v2i64 node:$in),
-                                                   (iPTR 0)))>;
-def Neon_Low4float : PatFrag<(ops node:$in),
-                             (v2f32 (extract_subvector (v4f32 node:$in),
-                                                       (iPTR 0)))>;
-def Neon_Low2double : PatFrag<(ops node:$in),
-                              (v1f64 (extract_subvector (v2f64 node:$in),
-                                                        (iPTR 0)))>;
-
-class N2VShiftLong<bit q, bit u, bits<5> opcode, string asmop, string DestT,
-                   string SrcT, ValueType DestTy, ValueType SrcTy,
-                   Operand ImmTy, SDPatternOperator ExtOp>
-  : NeonI_2VShiftImm<q, u, opcode, (outs VPR128:$Rd),
-                     (ins VPR64:$Rn, ImmTy:$Imm),
-                     asmop # "\t$Rd." # DestT # ", $Rn." # SrcT # ", $Imm",
-                     [(set (DestTy VPR128:$Rd),
-                        (DestTy (shl
-                          (DestTy (ExtOp (SrcTy VPR64:$Rn))),
-                            (DestTy (Neon_vdup (i32 ImmTy:$Imm))))))],
-                     NoItinerary>,
-    Sched<[WriteFPALU, ReadFPALU]>;
-
-class N2VShiftLongHigh<bit q, bit u, bits<5> opcode, string asmop, string DestT,
-                       string SrcT, ValueType DestTy, ValueType SrcTy,
-                       int StartIndex, Operand ImmTy,
-                       SDPatternOperator ExtOp, PatFrag getTop>
-  : NeonI_2VShiftImm<q, u, opcode, (outs VPR128:$Rd),
-                     (ins VPR128:$Rn, ImmTy:$Imm),
-                     asmop # "2\t$Rd." # DestT # ", $Rn." # SrcT # ", $Imm",
-                     [(set (DestTy VPR128:$Rd),
-                        (DestTy (shl
-                          (DestTy (ExtOp
-                            (SrcTy (getTop VPR128:$Rn)))),
-                              (DestTy (Neon_vdup (i32 ImmTy:$Imm))))))],
-                     NoItinerary>,
-    Sched<[WriteFPALU, ReadFPALU]>;
-
-multiclass NeonI_N2VShLL<string prefix, bit u, bits<5> opcode, string asmop,
-                         SDNode ExtOp> {
-  // 64-bit vector types.
-  def _8B : N2VShiftLong<0b0, u, opcode, asmop, "8h", "8b", v8i16, v8i8,
-                         shl_imm8, ExtOp> {
-    let Inst{22-19} = 0b0001;  // immh:immb = 0001xxx
-  }
-
-  def _4H : N2VShiftLong<0b0, u, opcode, asmop, "4s", "4h", v4i32, v4i16,
-                         shl_imm16, ExtOp> {
-    let Inst{22-20} = 0b001;   // immh:immb = 001xxxx
-  }
-
-  def _2S : N2VShiftLong<0b0, u, opcode, asmop, "2d", "2s", v2i64, v2i32,
-                         shl_imm32, ExtOp> {
-    let Inst{22-21} = 0b01;    // immh:immb = 01xxxxx
-  }
-
-  // 128-bit vector types
-  def _16B : N2VShiftLongHigh<0b1, u, opcode, asmop, "8h", "16b", v8i16, v8i8,
-                              8, shl_imm8, ExtOp, Neon_High16B> {
-    let Inst{22-19} = 0b0001;  // immh:immb = 0001xxx
-  }
-
-  def _8H : N2VShiftLongHigh<0b1, u, opcode, asmop, "4s", "8h", v4i32, v4i16,
-                             4, shl_imm16, ExtOp, Neon_High8H> {
-    let Inst{22-20} = 0b001;   // immh:immb = 001xxxx
-  }
-
-  def _4S : N2VShiftLongHigh<0b1, u, opcode, asmop, "2d", "4s", v2i64, v2i32,
-                             2, shl_imm32, ExtOp, Neon_High4S> {
-    let Inst{22-21} = 0b01;    // immh:immb = 01xxxxx
-  }
-
-  // Use other patterns to match when the immediate is 0.
-  def : Pat<(v8i16 (ExtOp (v8i8 VPR64:$Rn))),
-            (!cast<Instruction>(prefix # "_8B") VPR64:$Rn, 0)>;
-
-  def : Pat<(v4i32 (ExtOp (v4i16 VPR64:$Rn))),
-            (!cast<Instruction>(prefix # "_4H") VPR64:$Rn, 0)>;
-
-  def : Pat<(v2i64 (ExtOp (v2i32 VPR64:$Rn))),
-            (!cast<Instruction>(prefix # "_2S") VPR64:$Rn, 0)>;
-
-  def : Pat<(v8i16 (ExtOp (v8i8 (Neon_High16B VPR128:$Rn)))),
-            (!cast<Instruction>(prefix # "_16B") VPR128:$Rn, 0)>;
-
-  def : Pat<(v4i32 (ExtOp (v4i16 (Neon_High8H VPR128:$Rn)))),
-            (!cast<Instruction>(prefix # "_8H") VPR128:$Rn, 0)>;
-
-  def : Pat<(v2i64 (ExtOp (v2i32 (Neon_High4S VPR128:$Rn)))),
-            (!cast<Instruction>(prefix # "_4S") VPR128:$Rn, 0)>;
-}
-
-// Shift left long
-defm SSHLLvvi : NeonI_N2VShLL<"SSHLLvvi", 0b0, 0b10100, "sshll", sext>;
-defm USHLLvvi : NeonI_N2VShLL<"USHLLvvi", 0b1, 0b10100, "ushll", zext>;
-
-class NeonI_ext_len_alias<string asmop, string lane, string laneOp,
-                       Instruction inst, RegisterOperand VPRC,
-                       RegisterOperand VPRCOp>
-  : NeonInstAlias<asmop # "\t$Rd" # lane #", $Rn" # laneOp,
-                  (inst VPRC:$Rd, VPRCOp:$Rn, 0), 0b0>;
-
-// Signed integer lengthen (vector) is alias for SSHLL Vd, Vn, #0
-// Signed integer lengthen (vector, second part) is alias for SSHLL2 Vd, Vn, #0
-// FIXME: This is actually the preferred syntax but TableGen can't deal with
-// custom printing of aliases.
-def SXTLvv_8B  : NeonI_ext_len_alias<"sxtl", ".8h", ".8b",  SSHLLvvi_8B, VPR128, VPR64>;
-def SXTLvv_4H  : NeonI_ext_len_alias<"sxtl", ".4s", ".4h",  SSHLLvvi_4H, VPR128, VPR64>;
-def SXTLvv_2S  : NeonI_ext_len_alias<"sxtl", ".2d", ".2s",  SSHLLvvi_2S, VPR128, VPR64>;
-def SXTL2vv_16B : NeonI_ext_len_alias<"sxtl2", ".8h", ".16b",  SSHLLvvi_16B, VPR128, VPR128>;
-def SXTL2vv_8H  : NeonI_ext_len_alias<"sxtl2", ".4s", ".8h",  SSHLLvvi_8H, VPR128, VPR128>;
-def SXTL2vv_4S  : NeonI_ext_len_alias<"sxtl2", ".2d", ".4s",  SSHLLvvi_4S, VPR128, VPR128>;
-
-// Unsigned integer lengthen (vector) is alias for USHLL Vd, Vn, #0
-// Unsigned integer lengthen (vector, second part) is alias for USHLL2 Vd, Vn, #0
-// FIXME: This is actually the preferred syntax but TableGen can't deal with
-// custom printing of aliases.
-def UXTLvv_8B  : NeonI_ext_len_alias<"uxtl", ".8h", ".8b",  USHLLvvi_8B, VPR128, VPR64>;
-def UXTLvv_4H  : NeonI_ext_len_alias<"uxtl", ".4s", ".4h",  USHLLvvi_4H, VPR128, VPR64>;
-def UXTLvv_2S  : NeonI_ext_len_alias<"uxtl", ".2d", ".2s",  USHLLvvi_2S, VPR128, VPR64>;
-def UXTL2vv_16B : NeonI_ext_len_alias<"uxtl2", ".8h", ".16b",  USHLLvvi_16B, VPR128, VPR128>;
-def UXTL2vv_8H  : NeonI_ext_len_alias<"uxtl2", ".4s", ".8h",  USHLLvvi_8H, VPR128, VPR128>;
-def UXTL2vv_4S  : NeonI_ext_len_alias<"uxtl2", ".2d", ".4s",  USHLLvvi_4S, VPR128, VPR128>;
-
-def : Pat<(v8i16 (anyext (v8i8 VPR64:$Rn))), (USHLLvvi_8B VPR64:$Rn, 0)>;
-def : Pat<(v4i32 (anyext (v4i16 VPR64:$Rn))), (USHLLvvi_4H VPR64:$Rn, 0)>;
-def : Pat<(v2i64 (anyext (v2i32 VPR64:$Rn))), (USHLLvvi_2S VPR64:$Rn, 0)>;
-
-// Rounding/Saturating shift
-class N2VShift_RQ<bit q, bit u, bits<5> opcode, string asmop, string T,
-                  RegisterOperand VPRC, ValueType Ty, Operand ImmTy,
-                  SDPatternOperator OpNode>
-  : NeonI_2VShiftImm<q, u, opcode,
-                     (outs VPRC:$Rd), (ins VPRC:$Rn, ImmTy:$Imm),
-                     asmop # "\t$Rd." # T # ", $Rn." # T # ", $Imm",
-                     [(set (Ty VPRC:$Rd), (Ty (OpNode (Ty VPRC:$Rn),
-                        (i32 ImmTy:$Imm))))],
-                     NoItinerary>,
-    Sched<[WriteFPALU, ReadFPALU]>;
-
-// shift right (vector by immediate)
-multiclass NeonI_N2VShR_RQ<bit u, bits<5> opcode, string asmop,
-                           SDPatternOperator OpNode> {
-  def _8B  : N2VShift_RQ<0b0, u, opcode, asmop, "8b", VPR64, v8i8, shr_imm8,
-                         OpNode> {
-    let Inst{22-19} = 0b0001;
-  }
-
-  def _4H  : N2VShift_RQ<0b0, u, opcode, asmop, "4h", VPR64, v4i16, shr_imm16,
-                         OpNode> {
-    let Inst{22-20} = 0b001;
-  }
-
-  def _2S  : N2VShift_RQ<0b0, u, opcode, asmop, "2s", VPR64, v2i32, shr_imm32,
-                         OpNode> {
-    let Inst{22-21} = 0b01;
-  }
-
-  def _16B : N2VShift_RQ<0b1, u, opcode, asmop, "16b", VPR128, v16i8, shr_imm8,
-                         OpNode> {
-    let Inst{22-19} = 0b0001;
-  }
-
-  def _8H : N2VShift_RQ<0b1, u, opcode, asmop, "8h", VPR128, v8i16, shr_imm16,
-                        OpNode> {
-    let Inst{22-20} = 0b001;
-  }
-
-  def _4S : N2VShift_RQ<0b1, u, opcode, asmop, "4s", VPR128, v4i32, shr_imm32,
-                        OpNode> {
-    let Inst{22-21} = 0b01;
-  }
-
-  def _2D : N2VShift_RQ<0b1, u, opcode, asmop, "2d", VPR128, v2i64, shr_imm64,
-                        OpNode> {
-    let Inst{22} = 0b1;
-  }
-}
-
-multiclass NeonI_N2VShL_Q<bit u, bits<5> opcode, string asmop,
-                          SDPatternOperator OpNode> {
-  // 64-bit vector types.
-  def _8B : N2VShift_RQ<0b0, u, opcode, asmop, "8b", VPR64, v8i8, shl_imm8,
-                        OpNode> {
-    let Inst{22-19} = 0b0001;
-  }
-
-  def _4H : N2VShift_RQ<0b0, u, opcode, asmop, "4h", VPR64, v4i16, shl_imm16,
-                        OpNode> {
-    let Inst{22-20} = 0b001;
-  }
-
-  def _2S : N2VShift_RQ<0b0, u, opcode, asmop, "2s", VPR64, v2i32, shl_imm32,
-                        OpNode> {
-    let Inst{22-21} = 0b01;
-  }
-
-  // 128-bit vector types.
-  def _16B : N2VShift_RQ<0b1, u, opcode, asmop, "16b", VPR128, v16i8, shl_imm8,
-                         OpNode> {
-    let Inst{22-19} = 0b0001;
-  }
-
-  def _8H : N2VShift_RQ<0b1, u, opcode, asmop, "8h", VPR128, v8i16, shl_imm16,
-                        OpNode> {
-    let Inst{22-20} = 0b001;
-  }
-
-  def _4S : N2VShift_RQ<0b1, u, opcode, asmop, "4s", VPR128, v4i32, shl_imm32,
-                        OpNode> {
-    let Inst{22-21} = 0b01;
-  }
-
-  def _2D : N2VShift_RQ<0b1, u, opcode, asmop, "2d", VPR128, v2i64, shl_imm64,
-                        OpNode> {
-    let Inst{22} = 0b1;
-  }
-}
-
-// Rounding shift right
-defm SRSHRvvi : NeonI_N2VShR_RQ<0b0, 0b00100, "srshr",
-                                int_aarch64_neon_vsrshr>;
-defm URSHRvvi : NeonI_N2VShR_RQ<0b1, 0b00100, "urshr",
-                                int_aarch64_neon_vurshr>;
-
-// Saturating shift left unsigned
-defm SQSHLUvvi : NeonI_N2VShL_Q<0b1, 0b01100, "sqshlu", int_aarch64_neon_vsqshlu>;
-
-// Saturating shift left
-defm SQSHLvvi : NeonI_N2VShL_Q<0b0, 0b01110, "sqshl", Neon_sqrshlImm>;
-defm UQSHLvvi : NeonI_N2VShL_Q<0b1, 0b01110, "uqshl", Neon_uqrshlImm>;
-
-class N2VShiftAdd<bit q, bit u, bits<5> opcode, string asmop, string T,
-                  RegisterOperand VPRC, ValueType Ty, Operand ImmTy,
-                  SDNode OpNode>
-  : NeonI_2VShiftImm<q, u, opcode,
-           (outs VPRC:$Rd), (ins VPRC:$src, VPRC:$Rn, ImmTy:$Imm),
-           asmop # "\t$Rd." # T # ", $Rn." # T # ", $Imm",
-           [(set (Ty VPRC:$Rd), (Ty (add (Ty VPRC:$src),
-              (Ty (OpNode (Ty VPRC:$Rn),
-                (Ty (Neon_vdup (i32 ImmTy:$Imm))))))))],
-           NoItinerary>,
-    Sched<[WriteFPALU, ReadFPALU, ReadFPALU]> {
-  let Constraints = "$src = $Rd";
-}
-
-// Shift Right accumulate
-multiclass NeonI_N2VShRAdd<bit u, bits<5> opcode, string asmop, SDNode OpNode> {
-  def _8B : N2VShiftAdd<0b0, u, opcode, asmop, "8b", VPR64, v8i8, shr_imm8,
-                        OpNode> {
-    let Inst{22-19} = 0b0001;
-  }
-
-  def _4H : N2VShiftAdd<0b0, u, opcode, asmop, "4h", VPR64, v4i16, shr_imm16,
-                        OpNode> {
-    let Inst{22-20} = 0b001;
-  }
-
-  def _2S : N2VShiftAdd<0b0, u, opcode, asmop, "2s", VPR64, v2i32, shr_imm32,
-                        OpNode> {
-    let Inst{22-21} = 0b01;
-  }
-
-  def _16B : N2VShiftAdd<0b1, u, opcode, asmop, "16b", VPR128, v16i8, shr_imm8,
-                         OpNode> {
-    let Inst{22-19} = 0b0001;
-  }
-
-  def _8H : N2VShiftAdd<0b1, u, opcode, asmop, "8h", VPR128, v8i16, shr_imm16,
-                        OpNode> {
-    let Inst{22-20} = 0b001;
-  }
-
-  def _4S : N2VShiftAdd<0b1, u, opcode, asmop, "4s", VPR128, v4i32, shr_imm32,
-                        OpNode> {
-    let Inst{22-21} = 0b01;
-  }
-
-  def _2D : N2VShiftAdd<0b1, u, opcode, asmop, "2d", VPR128, v2i64, shr_imm64,
-                        OpNode> {
-    let Inst{22} = 0b1;
-  }
-}
-
-// Shift right and accumulate
-defm SSRAvvi    : NeonI_N2VShRAdd<0, 0b00010, "ssra", sra>;
-defm USRAvvi    : NeonI_N2VShRAdd<1, 0b00010, "usra", srl>;
-
-// Rounding shift accumulate
-class N2VShiftAdd_R<bit q, bit u, bits<5> opcode, string asmop, string T,
-                    RegisterOperand VPRC, ValueType Ty, Operand ImmTy,
-                    SDPatternOperator OpNode>
-  : NeonI_2VShiftImm<q, u, opcode,
-                     (outs VPRC:$Rd), (ins VPRC:$src, VPRC:$Rn, ImmTy:$Imm),
-                     asmop # "\t$Rd." # T # ", $Rn." # T # ", $Imm",
-                     [(set (Ty VPRC:$Rd), (Ty (add (Ty VPRC:$src),
-                        (Ty (OpNode (Ty VPRC:$Rn), (i32 ImmTy:$Imm))))))],
-                     NoItinerary>,
-    Sched<[WriteFPALU, ReadFPALU, ReadFPALU]> {
-  let Constraints = "$src = $Rd";
-}
-
-multiclass NeonI_N2VShRAdd_R<bit u, bits<5> opcode, string asmop,
-                             SDPatternOperator OpNode> {
-  def _8B : N2VShiftAdd_R<0b0, u, opcode, asmop, "8b", VPR64, v8i8, shr_imm8,
-                          OpNode> {
-    let Inst{22-19} = 0b0001;
-  }
-
-  def _4H : N2VShiftAdd_R<0b0, u, opcode, asmop, "4h", VPR64, v4i16, shr_imm16,
-                          OpNode> {
-    let Inst{22-20} = 0b001;
-  }
-
-  def _2S : N2VShiftAdd_R<0b0, u, opcode, asmop, "2s", VPR64, v2i32, shr_imm32,
-                          OpNode> {
-    let Inst{22-21} = 0b01;
-  }
-
-  def _16B : N2VShiftAdd_R<0b1, u, opcode, asmop, "16b", VPR128, v16i8, shr_imm8,
-                           OpNode> {
-    let Inst{22-19} = 0b0001;
-  }
-
-  def _8H : N2VShiftAdd_R<0b1, u, opcode, asmop, "8h", VPR128, v8i16, shr_imm16,
-                          OpNode> {
-    let Inst{22-20} = 0b001;
-  }
-
-  def _4S : N2VShiftAdd_R<0b1, u, opcode, asmop, "4s", VPR128, v4i32, shr_imm32,
-                          OpNode> {
-    let Inst{22-21} = 0b01;
-  }
-
-  def _2D : N2VShiftAdd_R<0b1, u, opcode, asmop, "2d", VPR128, v2i64, shr_imm64,
-                          OpNode> {
-    let Inst{22} = 0b1;
-  }
-}
-
-// Rounding shift right and accumulate
-defm SRSRAvvi : NeonI_N2VShRAdd_R<0, 0b00110, "srsra", int_aarch64_neon_vsrshr>;
-defm URSRAvvi : NeonI_N2VShRAdd_R<1, 0b00110, "ursra", int_aarch64_neon_vurshr>;
-
-// Shift insert by immediate
-class N2VShiftIns<bit q, bit u, bits<5> opcode, string asmop, string T,
-                  RegisterOperand VPRC, ValueType Ty, Operand ImmTy,
-                  SDPatternOperator OpNode>
-    : NeonI_2VShiftImm<q, u, opcode,
-           (outs VPRC:$Rd), (ins VPRC:$src, VPRC:$Rn, ImmTy:$Imm),
-           asmop # "\t$Rd." # T # ", $Rn." # T # ", $Imm",
-           [(set (Ty VPRC:$Rd), (Ty (OpNode (Ty VPRC:$src), (Ty VPRC:$Rn),
-             (i32 ImmTy:$Imm))))],
-           NoItinerary>,
-      Sched<[WriteFPALU, ReadFPALU, ReadFPALU]> {
-  let Constraints = "$src = $Rd";
-}
-
-// shift left insert (vector by immediate)
-multiclass NeonI_N2VShLIns<bit u, bits<5> opcode, string asmop> {
-  def _8B : N2VShiftIns<0b0, u, opcode, asmop, "8b", VPR64, v8i8, shl_imm8,
-                        int_aarch64_neon_vsli> {
-    let Inst{22-19} = 0b0001;
-  }
-
-  def _4H : N2VShiftIns<0b0, u, opcode, asmop, "4h", VPR64, v4i16, shl_imm16,
-                        int_aarch64_neon_vsli> {
-    let Inst{22-20} = 0b001;
-  }
-
-  def _2S : N2VShiftIns<0b0, u, opcode, asmop, "2s", VPR64, v2i32, shl_imm32,
-                        int_aarch64_neon_vsli> {
-    let Inst{22-21} = 0b01;
-  }
-
-    // 128-bit vector types
-  def _16B : N2VShiftIns<0b1, u, opcode, asmop, "16b", VPR128, v16i8, shl_imm8,
-                         int_aarch64_neon_vsli> {
-    let Inst{22-19} = 0b0001;
-  }
-
-  def _8H : N2VShiftIns<0b1, u, opcode, asmop, "8h", VPR128, v8i16, shl_imm16,
-                        int_aarch64_neon_vsli> {
-    let Inst{22-20} = 0b001;
-  }
-
-  def _4S : N2VShiftIns<0b1, u, opcode, asmop, "4s", VPR128, v4i32, shl_imm32,
-                        int_aarch64_neon_vsli> {
-    let Inst{22-21} = 0b01;
-  }
-
-  def _2D : N2VShiftIns<0b1, u, opcode, asmop, "2d", VPR128, v2i64, shl_imm64,
-                        int_aarch64_neon_vsli> {
-    let Inst{22} = 0b1;
-  }
-}
-
-// shift right insert (vector by immediate)
-multiclass NeonI_N2VShRIns<bit u, bits<5> opcode, string asmop> {
-    // 64-bit vector types.
-  def _8B : N2VShiftIns<0b0, u, opcode, asmop, "8b", VPR64, v8i8, shr_imm8,
-                        int_aarch64_neon_vsri> {
-    let Inst{22-19} = 0b0001;
-  }
-
-  def _4H : N2VShiftIns<0b0, u, opcode, asmop, "4h", VPR64, v4i16, shr_imm16,
-                        int_aarch64_neon_vsri> {
-    let Inst{22-20} = 0b001;
-  }
-
-  def _2S : N2VShiftIns<0b0, u, opcode, asmop, "2s", VPR64, v2i32, shr_imm32,
-                        int_aarch64_neon_vsri> {
-    let Inst{22-21} = 0b01;
-  }
-
-    // 128-bit vector types
-  def _16B : N2VShiftIns<0b1, u, opcode, asmop, "16b", VPR128, v16i8, shr_imm8,
-                         int_aarch64_neon_vsri> {
-    let Inst{22-19} = 0b0001;
-  }
-
-  def _8H : N2VShiftIns<0b1, u, opcode, asmop, "8h", VPR128, v8i16, shr_imm16,
-                        int_aarch64_neon_vsri> {
-    let Inst{22-20} = 0b001;
-  }
-
-  def _4S : N2VShiftIns<0b1, u, opcode, asmop, "4s", VPR128, v4i32, shr_imm32,
-                        int_aarch64_neon_vsri> {
-    let Inst{22-21} = 0b01;
-  }
-
-  def _2D : N2VShiftIns<0b1, u, opcode, asmop, "2d", VPR128, v2i64, shr_imm64,
-                        int_aarch64_neon_vsri> {
-    let Inst{22} = 0b1;
-  }
-}
-
-// Shift left and insert
-defm SLIvvi   : NeonI_N2VShLIns<0b1, 0b01010, "sli">;
-
-// Shift right and insert
-defm SRIvvi   : NeonI_N2VShRIns<0b1, 0b01000, "sri">;
-
-class N2VShR_Narrow<bit q, bit u, bits<5> opcode, string asmop, string DestT,
-                    string SrcT, Operand ImmTy>
-  : NeonI_2VShiftImm<q, u, opcode,
-                     (outs VPR64:$Rd), (ins VPR128:$Rn, ImmTy:$Imm),
-                     asmop # "\t$Rd." # DestT # ", $Rn." # SrcT # ", $Imm",
-                     [], NoItinerary>,
-    Sched<[WriteFPALU, ReadFPALU]>;
-
-class N2VShR_Narrow_Hi<bit q, bit u, bits<5> opcode, string asmop, string DestT,
-                       string SrcT, Operand ImmTy>
-  : NeonI_2VShiftImm<q, u, opcode, (outs VPR128:$Rd),
-                     (ins VPR128:$src, VPR128:$Rn, ImmTy:$Imm),
-                     asmop # "\t$Rd." # DestT # ", $Rn." # SrcT # ", $Imm",
-                     [], NoItinerary>,
-    Sched<[WriteFPALU, ReadFPALU, ReadFPALU]> {
-  let Constraints = "$src = $Rd";
-}
-
-// left long shift by immediate
-multiclass NeonI_N2VShR_Narrow<bit u, bits<5> opcode, string asmop> {
-  def _8B : N2VShR_Narrow<0b0, u, opcode, asmop, "8b", "8h", shr_imm8> {
-    let Inst{22-19} = 0b0001;
-  }
-
-  def _4H : N2VShR_Narrow<0b0, u, opcode, asmop, "4h", "4s", shr_imm16> {
-    let Inst{22-20} = 0b001;
-  }
-
-  def _2S : N2VShR_Narrow<0b0, u, opcode, asmop, "2s", "2d", shr_imm32> {
-    let Inst{22-21} = 0b01;
-  }
-
-  // Shift Narrow High
-  def _16B : N2VShR_Narrow_Hi<0b1, u, opcode, asmop # "2", "16b", "8h",
-                              shr_imm8> {
-    let Inst{22-19} = 0b0001;
-  }
-
-  def _8H : N2VShR_Narrow_Hi<0b1, u, opcode, asmop # "2", "8h", "4s",
-                             shr_imm16> {
-    let Inst{22-20} = 0b001;
-  }
-
-  def _4S : N2VShR_Narrow_Hi<0b1, u, opcode, asmop # "2", "4s", "2d",
-                             shr_imm32> {
-    let Inst{22-21} = 0b01;
-  }
-}
-
-// Shift right narrow
-defm SHRNvvi : NeonI_N2VShR_Narrow<0b0, 0b10000, "shrn">;
-
-// Shift right narrow (prefix Q is saturating, prefix R is rounding)
-defm QSHRUNvvi :NeonI_N2VShR_Narrow<0b1, 0b10000, "sqshrun">;
-defm RSHRNvvi : NeonI_N2VShR_Narrow<0b0, 0b10001, "rshrn">;
-defm QRSHRUNvvi : NeonI_N2VShR_Narrow<0b1, 0b10001, "sqrshrun">;
-defm SQSHRNvvi : NeonI_N2VShR_Narrow<0b0, 0b10010, "sqshrn">;
-defm UQSHRNvvi : NeonI_N2VShR_Narrow<0b1, 0b10010, "uqshrn">;
-defm SQRSHRNvvi : NeonI_N2VShR_Narrow<0b0, 0b10011, "sqrshrn">;
-defm UQRSHRNvvi : NeonI_N2VShR_Narrow<0b1, 0b10011, "uqrshrn">;
-
-def Neon_combine_2D : PatFrag<(ops node:$Rm, node:$Rn),
-                              (v2i64 (concat_vectors (v1i64 node:$Rm),
-                                                     (v1i64 node:$Rn)))>;
-def Neon_combine_8H : PatFrag<(ops node:$Rm, node:$Rn),
-                              (v8i16 (concat_vectors (v4i16 node:$Rm),
-                                                     (v4i16 node:$Rn)))>;
-def Neon_combine_4S : PatFrag<(ops node:$Rm, node:$Rn),
-                              (v4i32 (concat_vectors (v2i32 node:$Rm),
-                                                     (v2i32 node:$Rn)))>;
-def Neon_combine_4f : PatFrag<(ops node:$Rm, node:$Rn),
-                              (v4f32 (concat_vectors (v2f32 node:$Rm),
-                                                     (v2f32 node:$Rn)))>;
-def Neon_combine_2d : PatFrag<(ops node:$Rm, node:$Rn),
-                              (v2f64 (concat_vectors (v1f64 node:$Rm),
-                                                     (v1f64 node:$Rn)))>;
-
-def Neon_lshrImm8H : PatFrag<(ops node:$lhs, node:$rhs),
-                             (v8i16 (srl (v8i16 node:$lhs),
-                               (v8i16 (Neon_vdup (i32 node:$rhs)))))>;
-def Neon_lshrImm4S : PatFrag<(ops node:$lhs, node:$rhs),
-                             (v4i32 (srl (v4i32 node:$lhs),
-                               (v4i32 (Neon_vdup (i32 node:$rhs)))))>;
-def Neon_lshrImm2D : PatFrag<(ops node:$lhs, node:$rhs),
-                             (v2i64 (srl (v2i64 node:$lhs),
-                               (v2i64 (Neon_vdup (i32 node:$rhs)))))>;
-def Neon_ashrImm8H : PatFrag<(ops node:$lhs, node:$rhs),
-                             (v8i16 (sra (v8i16 node:$lhs),
-                               (v8i16 (Neon_vdup (i32 node:$rhs)))))>;
-def Neon_ashrImm4S : PatFrag<(ops node:$lhs, node:$rhs),
-                             (v4i32 (sra (v4i32 node:$lhs),
-                               (v4i32 (Neon_vdup (i32 node:$rhs)))))>;
-def Neon_ashrImm2D : PatFrag<(ops node:$lhs, node:$rhs),
-                             (v2i64 (sra (v2i64 node:$lhs),
-                               (v2i64 (Neon_vdup (i32 node:$rhs)))))>;
-
-// Normal shift right narrow is matched by IR (srl/sra, trunc, concat_vectors)
-multiclass Neon_shiftNarrow_patterns<string shr> {
-  def : Pat<(v8i8 (trunc (!cast<PatFrag>("Neon_" # shr # "Imm8H") VPR128:$Rn,
-              (i32 shr_imm8:$Imm)))),
-            (SHRNvvi_8B VPR128:$Rn, imm:$Imm)>;
-  def : Pat<(v4i16 (trunc (!cast<PatFrag>("Neon_" # shr # "Imm4S") VPR128:$Rn,
-              (i32 shr_imm16:$Imm)))),
-            (SHRNvvi_4H VPR128:$Rn, imm:$Imm)>;
-  def : Pat<(v2i32 (trunc (!cast<PatFrag>("Neon_" # shr # "Imm2D") VPR128:$Rn,
-              (i32 shr_imm32:$Imm)))),
-            (SHRNvvi_2S VPR128:$Rn, imm:$Imm)>;
-
-  def : Pat<(Neon_combine_2D (v1i64 VPR64:$src), (v1i64 (bitconvert
-              (v8i8 (trunc (!cast<PatFrag>("Neon_" # shr # "Imm8H")
-                VPR128:$Rn, (i32 shr_imm8:$Imm))))))),
-            (SHRNvvi_16B (v2i64 (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64)),
-                         VPR128:$Rn, imm:$Imm)>;
-  def : Pat<(Neon_combine_2D (v1i64 VPR64:$src), (v1i64 (bitconvert
-              (v4i16 (trunc (!cast<PatFrag>("Neon_" # shr # "Imm4S")
-                VPR128:$Rn, (i32 shr_imm16:$Imm))))))),
-            (SHRNvvi_8H (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64),
-                        VPR128:$Rn, imm:$Imm)>;
-  def : Pat<(Neon_combine_2D (v1i64 VPR64:$src), (v1i64 (bitconvert
-              (v2i32 (trunc (!cast<PatFrag>("Neon_" # shr # "Imm2D")
-                VPR128:$Rn, (i32 shr_imm32:$Imm))))))),
-            (SHRNvvi_4S (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64),
-                        VPR128:$Rn, imm:$Imm)>;
-}
-
-multiclass Neon_shiftNarrow_QR_patterns<SDPatternOperator op, string prefix> {
-  def : Pat<(v8i8 (op (v8i16 VPR128:$Rn), shr_imm8:$Imm)),
-            (!cast<Instruction>(prefix # "_8B") VPR128:$Rn, imm:$Imm)>;
-  def : Pat<(v4i16 (op (v4i32 VPR128:$Rn), shr_imm16:$Imm)),
-            (!cast<Instruction>(prefix # "_4H") VPR128:$Rn, imm:$Imm)>;
-  def : Pat<(v2i32 (op (v2i64 VPR128:$Rn), shr_imm32:$Imm)),
-            (!cast<Instruction>(prefix # "_2S") VPR128:$Rn, imm:$Imm)>;
-
-  def : Pat<(Neon_combine_2D (v1i64 VPR64:$src),
-                (v1i64 (bitconvert (v8i8
-                    (op (v8i16 VPR128:$Rn), shr_imm8:$Imm))))),
-            (!cast<Instruction>(prefix # "_16B")
-                (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64),
-                VPR128:$Rn, imm:$Imm)>;
-  def : Pat<(Neon_combine_2D (v1i64 VPR64:$src),
-                (v1i64 (bitconvert (v4i16
-                    (op (v4i32 VPR128:$Rn), shr_imm16:$Imm))))),
-            (!cast<Instruction>(prefix # "_8H")
-                (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64),
-                VPR128:$Rn, imm:$Imm)>;
-  def : Pat<(Neon_combine_2D (v1i64 VPR64:$src),
-                (v1i64 (bitconvert (v2i32
-                    (op (v2i64 VPR128:$Rn), shr_imm32:$Imm))))),
-            (!cast<Instruction>(prefix # "_4S")
-                  (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64),
-                  VPR128:$Rn, imm:$Imm)>;
-}
-
-defm : Neon_shiftNarrow_patterns<"lshr">;
-defm : Neon_shiftNarrow_patterns<"ashr">;
-
-defm : Neon_shiftNarrow_QR_patterns<int_aarch64_neon_vsqshrun, "QSHRUNvvi">;
-defm : Neon_shiftNarrow_QR_patterns<int_aarch64_neon_vrshrn, "RSHRNvvi">;
-defm : Neon_shiftNarrow_QR_patterns<int_aarch64_neon_vsqrshrun, "QRSHRUNvvi">;
-defm : Neon_shiftNarrow_QR_patterns<int_aarch64_neon_vsqshrn, "SQSHRNvvi">;
-defm : Neon_shiftNarrow_QR_patterns<int_aarch64_neon_vuqshrn, "UQSHRNvvi">;
-defm : Neon_shiftNarrow_QR_patterns<int_aarch64_neon_vsqrshrn, "SQRSHRNvvi">;
-defm : Neon_shiftNarrow_QR_patterns<int_aarch64_neon_vuqrshrn, "UQRSHRNvvi">;
-
-// Convert fix-point and float-pointing
-class N2VCvt_Fx<bit q, bit u, bits<5> opcode, string asmop, string T,
-                RegisterOperand VPRC, ValueType DestTy, ValueType SrcTy,
-                Operand ImmTy, SDPatternOperator IntOp>
-  : NeonI_2VShiftImm<q, u, opcode,
-                     (outs VPRC:$Rd), (ins VPRC:$Rn, ImmTy:$Imm),
-                     asmop # "\t$Rd." # T # ", $Rn." # T # ", $Imm",
-                     [(set (DestTy VPRC:$Rd), (DestTy (IntOp (SrcTy VPRC:$Rn),
-                       (i32 ImmTy:$Imm))))],
-                     NoItinerary>,
-    Sched<[WriteFPALU, ReadFPALU]>;
-
-multiclass NeonI_N2VCvt_Fx2fp<bit u, bits<5> opcode, string asmop,
-                              SDPatternOperator IntOp> {
-  def _2S : N2VCvt_Fx<0, u, opcode, asmop, "2s", VPR64, v2f32, v2i32,
-                      shr_imm32, IntOp> {
-    let Inst{22-21} = 0b01;
-  }
-
-  def _4S : N2VCvt_Fx<1, u, opcode, asmop, "4s", VPR128, v4f32, v4i32,
-                      shr_imm32, IntOp> {
-    let Inst{22-21} = 0b01;
-  }
-
-  def _2D : N2VCvt_Fx<1, u, opcode, asmop, "2d", VPR128, v2f64, v2i64,
-                      shr_imm64, IntOp> {
-    let Inst{22} = 0b1;
-  }
-}
-
-multiclass NeonI_N2VCvt_Fp2fx<bit u, bits<5> opcode, string asmop,
-                              SDPatternOperator IntOp> {
-  def _2S : N2VCvt_Fx<0, u, opcode, asmop, "2s", VPR64, v2i32, v2f32,
-                      shr_imm32, IntOp> {
-    let Inst{22-21} = 0b01;
-  }
-
-  def _4S : N2VCvt_Fx<1, u, opcode, asmop, "4s", VPR128, v4i32, v4f32,
-                      shr_imm32, IntOp> {
-    let Inst{22-21} = 0b01;
-  }
-
-  def _2D : N2VCvt_Fx<1, u, opcode, asmop, "2d", VPR128, v2i64, v2f64,
-                      shr_imm64, IntOp> {
-    let Inst{22} = 0b1;
-  }
-}
-
-// Convert fixed-point to floating-point
-defm VCVTxs2f : NeonI_N2VCvt_Fx2fp<0, 0b11100, "scvtf",
-                                   int_arm_neon_vcvtfxs2fp>;
-defm VCVTxu2f : NeonI_N2VCvt_Fx2fp<1, 0b11100, "ucvtf",
-                                   int_arm_neon_vcvtfxu2fp>;
-
-// Convert floating-point to fixed-point
-defm VCVTf2xs : NeonI_N2VCvt_Fp2fx<0, 0b11111, "fcvtzs",
-                                   int_arm_neon_vcvtfp2fxs>;
-defm VCVTf2xu : NeonI_N2VCvt_Fp2fx<1, 0b11111, "fcvtzu",
-                                   int_arm_neon_vcvtfp2fxu>;
-
-multiclass Neon_sshll2_0<SDNode ext>
-{
-  def _v8i8  : PatFrag<(ops node:$Rn),
-                       (v8i16 (ext (v8i8 (Neon_High16B node:$Rn))))>;
-  def _v4i16 : PatFrag<(ops node:$Rn),
-                       (v4i32 (ext (v4i16 (Neon_High8H node:$Rn))))>;
-  def _v2i32 : PatFrag<(ops node:$Rn),
-                       (v2i64 (ext (v2i32 (Neon_High4S node:$Rn))))>;
-}
-
-defm NI_sext_high : Neon_sshll2_0<sext>;
-defm NI_zext_high : Neon_sshll2_0<zext>;
-
-
-//===----------------------------------------------------------------------===//
-// Multiclasses for NeonI_Across
-//===----------------------------------------------------------------------===//
-
-// Variant 1
-
-multiclass NeonI_2VAcross_1<bit u, bits<5> opcode,
-                            string asmop, SDPatternOperator opnode>
-{
-    def _1h8b:  NeonI_2VAcross<0b0, u, 0b00, opcode,
-                (outs FPR16:$Rd), (ins VPR64:$Rn),
-                asmop # "\t$Rd, $Rn.8b",
-                [(set (v1i16 FPR16:$Rd),
-                    (v1i16 (opnode (v8i8 VPR64:$Rn))))],
-                NoItinerary>,
-                Sched<[WriteFPALU, ReadFPALU]>;
-
-    def _1h16b: NeonI_2VAcross<0b1, u, 0b00, opcode,
-                (outs FPR16:$Rd), (ins VPR128:$Rn),
-                asmop # "\t$Rd, $Rn.16b",
-                [(set (v1i16 FPR16:$Rd),
-                    (v1i16 (opnode (v16i8 VPR128:$Rn))))],
-                NoItinerary>,
-                Sched<[WriteFPALU, ReadFPALU]>;
-
-    def _1s4h:  NeonI_2VAcross<0b0, u, 0b01, opcode,
-                (outs FPR32:$Rd), (ins VPR64:$Rn),
-                asmop # "\t$Rd, $Rn.4h",
-                [(set (v1i32 FPR32:$Rd),
-                    (v1i32 (opnode (v4i16 VPR64:$Rn))))],
-                NoItinerary>,
-                Sched<[WriteFPALU, ReadFPALU]>;
-
-    def _1s8h:  NeonI_2VAcross<0b1, u, 0b01, opcode,
-                (outs FPR32:$Rd), (ins VPR128:$Rn),
-                asmop # "\t$Rd, $Rn.8h",
-                [(set (v1i32 FPR32:$Rd),
-                    (v1i32 (opnode (v8i16 VPR128:$Rn))))],
-                NoItinerary>,
-                Sched<[WriteFPALU, ReadFPALU]>;
-
-    // _1d2s doesn't exist!
-
-    def _1d4s:  NeonI_2VAcross<0b1, u, 0b10, opcode,
-                (outs FPR64:$Rd), (ins VPR128:$Rn),
-                asmop # "\t$Rd, $Rn.4s",
-                [(set (v1i64 FPR64:$Rd),
-                    (v1i64 (opnode (v4i32 VPR128:$Rn))))],
-                NoItinerary>,
-                Sched<[WriteFPALU, ReadFPALU]>;
-}
-
-defm SADDLV : NeonI_2VAcross_1<0b0, 0b00011, "saddlv", int_aarch64_neon_saddlv>;
-defm UADDLV : NeonI_2VAcross_1<0b1, 0b00011, "uaddlv", int_aarch64_neon_uaddlv>;
-
-// Variant 2
-
-multiclass NeonI_2VAcross_2<bit u, bits<5> opcode,
-                            string asmop, SDPatternOperator opnode>
-{
-    def _1b8b:  NeonI_2VAcross<0b0, u, 0b00, opcode,
-                (outs FPR8:$Rd), (ins VPR64:$Rn),
-                asmop # "\t$Rd, $Rn.8b",
-                [(set (v1i8 FPR8:$Rd),
-                    (v1i8 (opnode (v8i8 VPR64:$Rn))))],
-                NoItinerary>,
-                Sched<[WriteFPALU, ReadFPALU]>;
-
-    def _1b16b: NeonI_2VAcross<0b1, u, 0b00, opcode,
-                (outs FPR8:$Rd), (ins VPR128:$Rn),
-                asmop # "\t$Rd, $Rn.16b",
-                [(set (v1i8 FPR8:$Rd),
-                    (v1i8 (opnode (v16i8 VPR128:$Rn))))],
-                NoItinerary>,
-                Sched<[WriteFPALU, ReadFPALU]>;
-
-    def _1h4h:  NeonI_2VAcross<0b0, u, 0b01, opcode,
-                (outs FPR16:$Rd), (ins VPR64:$Rn),
-                asmop # "\t$Rd, $Rn.4h",
-                [(set (v1i16 FPR16:$Rd),
-                    (v1i16 (opnode (v4i16 VPR64:$Rn))))],
-                NoItinerary>,
-                Sched<[WriteFPALU, ReadFPALU]>;
-
-    def _1h8h:  NeonI_2VAcross<0b1, u, 0b01, opcode,
-                (outs FPR16:$Rd), (ins VPR128:$Rn),
-                asmop # "\t$Rd, $Rn.8h",
-                [(set (v1i16 FPR16:$Rd),
-                    (v1i16 (opnode (v8i16 VPR128:$Rn))))],
-                NoItinerary>,
-                Sched<[WriteFPALU, ReadFPALU]>;
-
-    // _1s2s doesn't exist!
-
-    def _1s4s:  NeonI_2VAcross<0b1, u, 0b10, opcode,
-                (outs FPR32:$Rd), (ins VPR128:$Rn),
-                asmop # "\t$Rd, $Rn.4s",
-                [(set (v1i32 FPR32:$Rd),
-                    (v1i32 (opnode (v4i32 VPR128:$Rn))))],
-                NoItinerary>,
-                Sched<[WriteFPALU, ReadFPALU]>;
-}
-
-defm SMAXV : NeonI_2VAcross_2<0b0, 0b01010, "smaxv", int_aarch64_neon_smaxv>;
-defm UMAXV : NeonI_2VAcross_2<0b1, 0b01010, "umaxv", int_aarch64_neon_umaxv>;
-
-defm SMINV : NeonI_2VAcross_2<0b0, 0b11010, "sminv", int_aarch64_neon_sminv>;
-defm UMINV : NeonI_2VAcross_2<0b1, 0b11010, "uminv", int_aarch64_neon_uminv>;
-
-defm ADDV : NeonI_2VAcross_2<0b0, 0b11011, "addv", int_aarch64_neon_vaddv>;
-
-// Variant 3
-
-multiclass NeonI_2VAcross_3<bit u, bits<5> opcode, bits<2> size,
-                            string asmop, SDPatternOperator opnode> {
-    def _1s4s:  NeonI_2VAcross<0b1, u, size, opcode,
-                (outs FPR32:$Rd), (ins VPR128:$Rn),
-                asmop # "\t$Rd, $Rn.4s",
-                [(set (f32 FPR32:$Rd),
-                    (f32 (opnode (v4f32 VPR128:$Rn))))],
-                NoItinerary>,
-                Sched<[WriteFPALU, ReadFPALU]>;
-}
-
-defm FMAXNMV : NeonI_2VAcross_3<0b1, 0b01100, 0b00, "fmaxnmv",
-                                int_aarch64_neon_vmaxnmv>;
-defm FMINNMV : NeonI_2VAcross_3<0b1, 0b01100, 0b10, "fminnmv",
-                                int_aarch64_neon_vminnmv>;
-
-defm FMAXV : NeonI_2VAcross_3<0b1, 0b01111, 0b00, "fmaxv",
-                              int_aarch64_neon_vmaxv>;
-defm FMINV : NeonI_2VAcross_3<0b1, 0b01111, 0b10, "fminv",
-                              int_aarch64_neon_vminv>;
-
-// The followings are for instruction class (Perm)
-
-class NeonI_Permute<bit q, bits<2> size, bits<3> opcode,
-                    string asmop, RegisterOperand OpVPR, string OpS,
-                    SDPatternOperator opnode, ValueType Ty>
-  : NeonI_Perm<q, size, opcode,
-               (outs OpVPR:$Rd), (ins OpVPR:$Rn, OpVPR:$Rm),
-               asmop # "\t$Rd." # OpS # ", $Rn." # OpS # ", $Rm." # OpS,
-               [(set (Ty OpVPR:$Rd),
-                  (Ty (opnode (Ty OpVPR:$Rn), (Ty OpVPR:$Rm))))],
-               NoItinerary>,
-    Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
-
-multiclass NeonI_Perm_pat<bits<3> opcode, string asmop,
-                          SDPatternOperator opnode> {
-  def _8b  : NeonI_Permute<0b0, 0b00, opcode, asmop,
-                           VPR64, "8b", opnode, v8i8>;
-  def _16b : NeonI_Permute<0b1, 0b00, opcode, asmop,
-                           VPR128, "16b",opnode, v16i8>;
-  def _4h  : NeonI_Permute<0b0, 0b01, opcode, asmop,
-                           VPR64, "4h", opnode, v4i16>;
-  def _8h  : NeonI_Permute<0b1, 0b01, opcode, asmop,
-                           VPR128, "8h", opnode, v8i16>;
-  def _2s  : NeonI_Permute<0b0, 0b10, opcode, asmop,
-                           VPR64, "2s", opnode, v2i32>;
-  def _4s  : NeonI_Permute<0b1, 0b10, opcode, asmop,
-                           VPR128, "4s", opnode, v4i32>;
-  def _2d  : NeonI_Permute<0b1, 0b11, opcode, asmop,
-                           VPR128, "2d", opnode, v2i64>;
-}
-
-defm UZP1vvv : NeonI_Perm_pat<0b001, "uzp1", Neon_uzp1>;
-defm TRN1vvv : NeonI_Perm_pat<0b010, "trn1", Neon_trn1>;
-defm ZIP1vvv : NeonI_Perm_pat<0b011, "zip1", Neon_zip1>;
-defm UZP2vvv : NeonI_Perm_pat<0b101, "uzp2", Neon_uzp2>;
-defm TRN2vvv : NeonI_Perm_pat<0b110, "trn2", Neon_trn2>;
-defm ZIP2vvv : NeonI_Perm_pat<0b111, "zip2", Neon_zip2>;
-
-multiclass NeonI_Perm_float_pat<string INS, SDPatternOperator opnode> {
-  def : Pat<(v2f32 (opnode (v2f32 VPR64:$Rn), (v2f32 VPR64:$Rm))),
-            (!cast<Instruction>(INS # "_2s") VPR64:$Rn, VPR64:$Rm)>;
-
-  def : Pat<(v4f32 (opnode (v4f32 VPR128:$Rn), (v4f32 VPR128:$Rm))),
-            (!cast<Instruction>(INS # "_4s") VPR128:$Rn, VPR128:$Rm)>;
-
-  def : Pat<(v2f64 (opnode (v2f64 VPR128:$Rn), (v2f64 VPR128:$Rm))),
-            (!cast<Instruction>(INS # "_2d") VPR128:$Rn, VPR128:$Rm)>;
-}
-
-defm : NeonI_Perm_float_pat<"UZP1vvv", Neon_uzp1>;
-defm : NeonI_Perm_float_pat<"UZP2vvv", Neon_uzp2>;
-defm : NeonI_Perm_float_pat<"ZIP1vvv", Neon_zip1>;
-defm : NeonI_Perm_float_pat<"ZIP2vvv", Neon_zip2>;
-defm : NeonI_Perm_float_pat<"TRN1vvv", Neon_trn1>;
-defm : NeonI_Perm_float_pat<"TRN2vvv", Neon_trn2>;
-
-// The followings are for instruction class (3V Diff)
-
-// normal long/long2 pattern
-class NeonI_3VDL<bit q, bit u, bits<2> size, bits<4> opcode,
-                 string asmop, string ResS, string OpS,
-                 SDPatternOperator opnode, SDPatternOperator ext,
-                 RegisterOperand OpVPR,
-                 ValueType ResTy, ValueType OpTy>
-  : NeonI_3VDiff<q, u, size, opcode,
-                 (outs VPR128:$Rd), (ins OpVPR:$Rn, OpVPR:$Rm),
-                 asmop # "\t$Rd." # ResS # ", $Rn." # OpS # ", $Rm." # OpS,
-                 [(set (ResTy VPR128:$Rd),
-                    (ResTy (opnode (ResTy (ext (OpTy OpVPR:$Rn))),
-                                   (ResTy (ext (OpTy OpVPR:$Rm))))))],
-                 NoItinerary>,
-    Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
-
-multiclass NeonI_3VDL_s<bit u, bits<4> opcode,
-                        string asmop, SDPatternOperator opnode,
-                        bit Commutable = 0> {
-  let isCommutable = Commutable in {
-    def _8h8b : NeonI_3VDL<0b0, u, 0b00, opcode, asmop, "8h", "8b",
-                           opnode, sext, VPR64, v8i16, v8i8>;
-    def _4s4h : NeonI_3VDL<0b0, u, 0b01, opcode, asmop, "4s", "4h",
-                           opnode, sext, VPR64, v4i32, v4i16>;
-    def _2d2s : NeonI_3VDL<0b0, u, 0b10, opcode, asmop, "2d", "2s",
-                           opnode, sext, VPR64, v2i64, v2i32>;
-  }
-}
-
-multiclass NeonI_3VDL2_s<bit u, bits<4> opcode, string asmop,
-                         SDPatternOperator opnode, bit Commutable = 0> {
-  let isCommutable = Commutable in {
-    def _8h16b : NeonI_3VDL<0b1, u, 0b00, opcode, asmop, "8h", "16b",
-                            opnode, NI_sext_high_v8i8, VPR128, v8i16, v16i8>;
-    def _4s8h  : NeonI_3VDL<0b1, u, 0b01, opcode, asmop, "4s", "8h",
-                            opnode, NI_sext_high_v4i16, VPR128, v4i32, v8i16>;
-    def _2d4s  : NeonI_3VDL<0b1, u, 0b10, opcode, asmop, "2d", "4s",
-                            opnode, NI_sext_high_v2i32, VPR128, v2i64, v4i32>;
-  }
-}
-
-multiclass NeonI_3VDL_u<bit u, bits<4> opcode, string asmop,
-                        SDPatternOperator opnode, bit Commutable = 0> {
-  let isCommutable = Commutable in {
-    def _8h8b : NeonI_3VDL<0b0, u, 0b00, opcode, asmop, "8h", "8b",
-                           opnode, zext, VPR64, v8i16, v8i8>;
-    def _4s4h : NeonI_3VDL<0b0, u, 0b01, opcode, asmop, "4s", "4h",
-                           opnode, zext, VPR64, v4i32, v4i16>;
-    def _2d2s : NeonI_3VDL<0b0, u, 0b10, opcode, asmop, "2d", "2s",
-                           opnode, zext, VPR64, v2i64, v2i32>;
-  }
-}
-
-multiclass NeonI_3VDL2_u<bit u, bits<4> opcode, string asmop,
-                         SDPatternOperator opnode, bit Commutable = 0> {
-  let isCommutable = Commutable in {
-    def _8h16b : NeonI_3VDL<0b1, u, 0b00, opcode, asmop, "8h", "16b",
-                            opnode, NI_zext_high_v8i8, VPR128, v8i16, v16i8>;
-    def _4s8h : NeonI_3VDL<0b1, u, 0b01, opcode, asmop, "4s", "8h",
-                           opnode, NI_zext_high_v4i16, VPR128, v4i32, v8i16>;
-    def _2d4s : NeonI_3VDL<0b1, u, 0b10, opcode, asmop, "2d", "4s",
-                           opnode, NI_zext_high_v2i32, VPR128, v2i64, v4i32>;
-  }
-}
-
-defm SADDLvvv :  NeonI_3VDL_s<0b0, 0b0000, "saddl", add, 1>;
-defm UADDLvvv :  NeonI_3VDL_u<0b1, 0b0000, "uaddl", add, 1>;
-
-defm SADDL2vvv :  NeonI_3VDL2_s<0b0, 0b0000, "saddl2", add, 1>;
-defm UADDL2vvv :  NeonI_3VDL2_u<0b1, 0b0000, "uaddl2", add, 1>;
-
-defm SSUBLvvv :  NeonI_3VDL_s<0b0, 0b0010, "ssubl", sub, 0>;
-defm USUBLvvv :  NeonI_3VDL_u<0b1, 0b0010, "usubl", sub, 0>;
-
-defm SSUBL2vvv :  NeonI_3VDL2_s<0b0, 0b0010, "ssubl2", sub, 0>;
-defm USUBL2vvv :  NeonI_3VDL2_u<0b1, 0b0010, "usubl2", sub, 0>;
-
-// normal wide/wide2 pattern
-class NeonI_3VDW<bit q, bit u, bits<2> size, bits<4> opcode,
-                 string asmop, string ResS, string OpS,
-                 SDPatternOperator opnode, SDPatternOperator ext,
-                 RegisterOperand OpVPR,
-                 ValueType ResTy, ValueType OpTy>
-  : NeonI_3VDiff<q, u, size, opcode,
-                 (outs VPR128:$Rd), (ins VPR128:$Rn, OpVPR:$Rm),
-                 asmop # "\t$Rd." # ResS # ", $Rn." # ResS # ", $Rm." # OpS,
-                 [(set (ResTy VPR128:$Rd),
-                    (ResTy (opnode (ResTy VPR128:$Rn),
-                                   (ResTy (ext (OpTy OpVPR:$Rm))))))],
-                 NoItinerary>,
-    Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
-
-multiclass NeonI_3VDW_s<bit u, bits<4> opcode, string asmop,
-                        SDPatternOperator opnode> {
-  def _8h8b : NeonI_3VDW<0b0, u, 0b00, opcode, asmop, "8h", "8b",
-                         opnode, sext, VPR64, v8i16, v8i8>;
-  def _4s4h : NeonI_3VDW<0b0, u, 0b01, opcode, asmop, "4s", "4h",
-                         opnode, sext, VPR64, v4i32, v4i16>;
-  def _2d2s : NeonI_3VDW<0b0, u, 0b10, opcode, asmop, "2d", "2s",
-                         opnode, sext, VPR64, v2i64, v2i32>;
-}
-
-defm SADDWvvv :  NeonI_3VDW_s<0b0, 0b0001, "saddw", add>;
-defm SSUBWvvv :  NeonI_3VDW_s<0b0, 0b0011, "ssubw", sub>;
-
-multiclass NeonI_3VDW2_s<bit u, bits<4> opcode, string asmop,
-                         SDPatternOperator opnode> {
-  def _8h16b : NeonI_3VDW<0b1, u, 0b00, opcode, asmop, "8h", "16b",
-                          opnode, NI_sext_high_v8i8, VPR128, v8i16, v16i8>;
-  def _4s8h  : NeonI_3VDW<0b1, u, 0b01, opcode, asmop, "4s", "8h",
-                          opnode, NI_sext_high_v4i16, VPR128, v4i32, v8i16>;
-  def _2d4s  : NeonI_3VDW<0b1, u, 0b10, opcode, asmop, "2d", "4s",
-                          opnode, NI_sext_high_v2i32, VPR128, v2i64, v4i32>;
-}
-
-defm SADDW2vvv :  NeonI_3VDW2_s<0b0, 0b0001, "saddw2", add>;
-defm SSUBW2vvv :  NeonI_3VDW2_s<0b0, 0b0011, "ssubw2", sub>;
-
-multiclass NeonI_3VDW_u<bit u, bits<4> opcode, string asmop,
-                        SDPatternOperator opnode> {
-  def _8h8b : NeonI_3VDW<0b0, u, 0b00, opcode, asmop, "8h", "8b",
-                         opnode, zext, VPR64, v8i16, v8i8>;
-  def _4s4h : NeonI_3VDW<0b0, u, 0b01, opcode, asmop, "4s", "4h",
-                         opnode, zext, VPR64, v4i32, v4i16>;
-  def _2d2s : NeonI_3VDW<0b0, u, 0b10, opcode, asmop, "2d", "2s",
-                         opnode, zext, VPR64, v2i64, v2i32>;
-}
-
-defm UADDWvvv :  NeonI_3VDW_u<0b1, 0b0001, "uaddw", add>;
-defm USUBWvvv :  NeonI_3VDW_u<0b1, 0b0011, "usubw", sub>;
-
-multiclass NeonI_3VDW2_u<bit u, bits<4> opcode, string asmop,
-                         SDPatternOperator opnode> {
-  def _8h16b : NeonI_3VDW<0b1, u, 0b00, opcode, asmop, "8h", "16b",
-                          opnode, NI_zext_high_v8i8, VPR128, v8i16, v16i8>;
-  def _4s8h : NeonI_3VDW<0b1, u, 0b01, opcode, asmop, "4s", "8h",
-                         opnode, NI_zext_high_v4i16, VPR128, v4i32, v8i16>;
-  def _2d4s : NeonI_3VDW<0b1, u, 0b10, opcode, asmop, "2d", "4s",
-                         opnode, NI_zext_high_v2i32, VPR128, v2i64, v4i32>;
-}
-
-defm UADDW2vvv :  NeonI_3VDW2_u<0b1, 0b0001, "uaddw2", add>;
-defm USUBW2vvv :  NeonI_3VDW2_u<0b1, 0b0011, "usubw2", sub>;
-
-// Get the high half part of the vector element.
-multiclass NeonI_get_high {
-  def _8h : PatFrag<(ops node:$Rn),
-                    (v8i8 (trunc (v8i16 (srl (v8i16 node:$Rn),
-                                             (v8i16 (Neon_vdup (i32 8)))))))>;
-  def _4s : PatFrag<(ops node:$Rn),
-                    (v4i16 (trunc (v4i32 (srl (v4i32 node:$Rn),
-                                              (v4i32 (Neon_vdup (i32 16)))))))>;
-  def _2d : PatFrag<(ops node:$Rn),
-                    (v2i32 (trunc (v2i64 (srl (v2i64 node:$Rn),
-                                              (v2i64 (Neon_vdup (i32 32)))))))>;
-}
-
-defm NI_get_hi : NeonI_get_high;
-
-// pattern for addhn/subhn with 2 operands
-class NeonI_3VDN_addhn_2Op<bit q, bit u, bits<2> size, bits<4> opcode,
-                           string asmop, string ResS, string OpS,
-                           SDPatternOperator opnode, SDPatternOperator get_hi,
-                           ValueType ResTy, ValueType OpTy>
-  : NeonI_3VDiff<q, u, size, opcode,
-                 (outs VPR64:$Rd), (ins VPR128:$Rn, VPR128:$Rm),
-                 asmop # "\t$Rd." # ResS # ", $Rn." # OpS # ", $Rm." # OpS,
-                 [(set (ResTy VPR64:$Rd),
-                    (ResTy (get_hi
-                      (OpTy (opnode (OpTy VPR128:$Rn),
-                                    (OpTy VPR128:$Rm))))))],
-                 NoItinerary>,
-    Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
-
-multiclass NeonI_3VDN_addhn_2Op<bit u, bits<4> opcode, string asmop,
-                                SDPatternOperator opnode, bit Commutable = 0> {
-  let isCommutable = Commutable in {
-    def _8b8h : NeonI_3VDN_addhn_2Op<0b0, u, 0b00, opcode, asmop, "8b", "8h",
-                                     opnode, NI_get_hi_8h, v8i8, v8i16>;
-    def _4h4s : NeonI_3VDN_addhn_2Op<0b0, u, 0b01, opcode, asmop, "4h", "4s",
-                                     opnode, NI_get_hi_4s, v4i16, v4i32>;
-    def _2s2d : NeonI_3VDN_addhn_2Op<0b0, u, 0b10, opcode, asmop, "2s", "2d",
-                                     opnode, NI_get_hi_2d, v2i32, v2i64>;
-  }
-}
-
-defm ADDHNvvv  : NeonI_3VDN_addhn_2Op<0b0, 0b0100, "addhn", add, 1>;
-defm SUBHNvvv  : NeonI_3VDN_addhn_2Op<0b0, 0b0110, "subhn", sub, 0>;
-
-// pattern for operation with 2 operands
-class NeonI_3VD_2Op<bit q, bit u, bits<2> size, bits<4> opcode,
-                    string asmop, string ResS, string OpS,
-                    SDPatternOperator opnode,
-                    RegisterOperand ResVPR, RegisterOperand OpVPR,
-                    ValueType ResTy, ValueType OpTy>
-  : NeonI_3VDiff<q, u, size, opcode,
-                 (outs ResVPR:$Rd), (ins OpVPR:$Rn, OpVPR:$Rm),
-                 asmop # "\t$Rd." # ResS # ", $Rn." # OpS # ", $Rm." # OpS,
-                 [(set (ResTy ResVPR:$Rd),
-                    (ResTy (opnode (OpTy OpVPR:$Rn), (OpTy OpVPR:$Rm))))],
-                 NoItinerary>,
-    Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
-
-// normal narrow pattern
-multiclass NeonI_3VDN_2Op<bit u, bits<4> opcode, string asmop,
-                          SDPatternOperator opnode, bit Commutable = 0> {
-  let isCommutable = Commutable in {
-    def _8b8h : NeonI_3VD_2Op<0b0, u, 0b00, opcode, asmop, "8b", "8h",
-                              opnode, VPR64, VPR128, v8i8, v8i16>;
-    def _4h4s : NeonI_3VD_2Op<0b0, u, 0b01, opcode, asmop, "4h", "4s",
-                              opnode, VPR64, VPR128, v4i16, v4i32>;
-    def _2s2d : NeonI_3VD_2Op<0b0, u, 0b10, opcode, asmop, "2s", "2d",
-                              opnode, VPR64, VPR128, v2i32, v2i64>;
-  }
-}
-
-defm RADDHNvvv : NeonI_3VDN_2Op<0b1, 0b0100, "raddhn", int_arm_neon_vraddhn, 1>;
-defm RSUBHNvvv : NeonI_3VDN_2Op<0b1, 0b0110, "rsubhn", int_arm_neon_vrsubhn, 0>;
-
-// pattern for acle intrinsic with 3 operands
-class NeonI_3VDN_3Op<bit q, bit u, bits<2> size, bits<4> opcode,
-                     string asmop, string ResS, string OpS>
-  : NeonI_3VDiff<q, u, size, opcode,
-                 (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn, VPR128:$Rm),
-                 asmop # "\t$Rd." # ResS # ", $Rn." # OpS # ", $Rm." # OpS,
-                 [], NoItinerary>,
-    Sched<[WriteFPALU, ReadFPALU, ReadFPALU, ReadFPALU]> {
-  let Constraints = "$src = $Rd";
-  let neverHasSideEffects = 1;
-}
-
-multiclass NeonI_3VDN_3Op_v1<bit u, bits<4> opcode, string asmop> {
-  def _16b8h : NeonI_3VDN_3Op<0b1, u, 0b00, opcode, asmop, "16b", "8h">;
-  def _8h4s : NeonI_3VDN_3Op<0b1, u, 0b01, opcode, asmop, "8h", "4s">;
-  def _4s2d : NeonI_3VDN_3Op<0b1, u, 0b10, opcode, asmop, "4s", "2d">;
-}
-
-defm ADDHN2vvv  : NeonI_3VDN_3Op_v1<0b0, 0b0100, "addhn2">;
-defm SUBHN2vvv  : NeonI_3VDN_3Op_v1<0b0, 0b0110, "subhn2">;
-
-defm RADDHN2vvv : NeonI_3VDN_3Op_v1<0b1, 0b0100, "raddhn2">;
-defm RSUBHN2vvv : NeonI_3VDN_3Op_v1<0b1, 0b0110, "rsubhn2">;
-
-// Patterns have to be separate because there's a SUBREG_TO_REG in the output
-// part.
-class NarrowHighHalfPat<Instruction INST, ValueType DstTy, ValueType SrcTy,
-                        SDPatternOperator coreop>
-  : Pat<(Neon_combine_2D (v1i64 VPR64:$src),
-                      (v1i64 (bitconvert (DstTy (coreop (SrcTy VPR128:$Rn),
-                                                        (SrcTy VPR128:$Rm)))))),
-        (INST (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64),
-              VPR128:$Rn, VPR128:$Rm)>;
-
-// addhn2 patterns
-def : NarrowHighHalfPat<ADDHN2vvv_16b8h, v8i8,  v8i16,
-          BinOpFrag<(NI_get_hi_8h (add node:$LHS, node:$RHS))>>;
-def : NarrowHighHalfPat<ADDHN2vvv_8h4s,  v4i16, v4i32,
-          BinOpFrag<(NI_get_hi_4s (add node:$LHS, node:$RHS))>>;
-def : NarrowHighHalfPat<ADDHN2vvv_4s2d,  v2i32, v2i64,
-          BinOpFrag<(NI_get_hi_2d (add node:$LHS, node:$RHS))>>;
-
-// subhn2 patterns
-def : NarrowHighHalfPat<SUBHN2vvv_16b8h, v8i8,  v8i16,
-          BinOpFrag<(NI_get_hi_8h (sub node:$LHS, node:$RHS))>>;
-def : NarrowHighHalfPat<SUBHN2vvv_8h4s,  v4i16, v4i32,
-          BinOpFrag<(NI_get_hi_4s (sub node:$LHS, node:$RHS))>>;
-def : NarrowHighHalfPat<SUBHN2vvv_4s2d,  v2i32, v2i64,
-          BinOpFrag<(NI_get_hi_2d (sub node:$LHS, node:$RHS))>>;
-
-// raddhn2 patterns
-def : NarrowHighHalfPat<RADDHN2vvv_16b8h, v8i8,  v8i16, int_arm_neon_vraddhn>;
-def : NarrowHighHalfPat<RADDHN2vvv_8h4s,  v4i16, v4i32, int_arm_neon_vraddhn>;
-def : NarrowHighHalfPat<RADDHN2vvv_4s2d,  v2i32, v2i64, int_arm_neon_vraddhn>;
-
-// rsubhn2 patterns
-def : NarrowHighHalfPat<RSUBHN2vvv_16b8h, v8i8,  v8i16, int_arm_neon_vrsubhn>;
-def : NarrowHighHalfPat<RSUBHN2vvv_8h4s,  v4i16, v4i32, int_arm_neon_vrsubhn>;
-def : NarrowHighHalfPat<RSUBHN2vvv_4s2d,  v2i32, v2i64, int_arm_neon_vrsubhn>;
-
-// pattern that need to extend result
-class NeonI_3VDL_Ext<bit q, bit u, bits<2> size, bits<4> opcode,
-                     string asmop, string ResS, string OpS,
-                     SDPatternOperator opnode,
-                     RegisterOperand OpVPR,
-                     ValueType ResTy, ValueType OpTy, ValueType OpSTy>
-  : NeonI_3VDiff<q, u, size, opcode,
-                 (outs VPR128:$Rd), (ins OpVPR:$Rn, OpVPR:$Rm),
-                 asmop # "\t$Rd." # ResS # ", $Rn." # OpS # ", $Rm." # OpS,
-                 [(set (ResTy VPR128:$Rd),
-                    (ResTy (zext (OpSTy (opnode (OpTy OpVPR:$Rn),
-                                                (OpTy OpVPR:$Rm))))))],
-                 NoItinerary>,
-    Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
-
-multiclass NeonI_3VDL_zext<bit u, bits<4> opcode, string asmop,
-                           SDPatternOperator opnode, bit Commutable = 0> {
-  let isCommutable = Commutable in {
-    def _8h8b : NeonI_3VDL_Ext<0b0, u, 0b00, opcode, asmop, "8h", "8b",
-                               opnode, VPR64, v8i16, v8i8, v8i8>;
-    def _4s4h : NeonI_3VDL_Ext<0b0, u, 0b01, opcode, asmop, "4s", "4h",
-                               opnode, VPR64, v4i32, v4i16, v4i16>;
-    def _2d2s : NeonI_3VDL_Ext<0b0, u, 0b10, opcode, asmop, "2d", "2s",
-                               opnode, VPR64, v2i64, v2i32, v2i32>;
-  }
-}
-
-defm SABDLvvv : NeonI_3VDL_zext<0b0, 0b0111, "sabdl", int_arm_neon_vabds, 1>;
-defm UABDLvvv : NeonI_3VDL_zext<0b1, 0b0111, "uabdl", int_arm_neon_vabdu, 1>;
-
-multiclass NeonI_Op_High<SDPatternOperator op> {
-  def _16B : PatFrag<(ops node:$Rn, node:$Rm),
-                     (op (v8i8 (Neon_High16B node:$Rn)),
-                         (v8i8 (Neon_High16B node:$Rm)))>;
-  def _8H  : PatFrag<(ops node:$Rn, node:$Rm),
-                     (op (v4i16 (Neon_High8H node:$Rn)),
-                         (v4i16 (Neon_High8H node:$Rm)))>;
-  def _4S  : PatFrag<(ops node:$Rn, node:$Rm),
-                     (op (v2i32 (Neon_High4S node:$Rn)),
-                         (v2i32 (Neon_High4S node:$Rm)))>;
-}
-
-defm NI_sabdl_hi : NeonI_Op_High<int_arm_neon_vabds>;
-defm NI_uabdl_hi : NeonI_Op_High<int_arm_neon_vabdu>;
-defm NI_smull_hi : NeonI_Op_High<int_arm_neon_vmulls>;
-defm NI_umull_hi : NeonI_Op_High<int_arm_neon_vmullu>;
-defm NI_qdmull_hi : NeonI_Op_High<int_arm_neon_vqdmull>;
-defm NI_pmull_hi : NeonI_Op_High<int_arm_neon_vmullp>;
-
-multiclass NeonI_3VDL_Abd_u<bit u, bits<4> opcode, string asmop, string opnode,
-                            bit Commutable = 0> {
-  let isCommutable = Commutable in {
-    def _8h8b  : NeonI_3VDL_Ext<0b1, u, 0b00, opcode, asmop, "8h", "16b",
-                                !cast<PatFrag>(opnode # "_16B"),
-                                VPR128, v8i16, v16i8, v8i8>;
-    def _4s4h  : NeonI_3VDL_Ext<0b1, u, 0b01, opcode, asmop, "4s", "8h",
-                                !cast<PatFrag>(opnode # "_8H"),
-                                VPR128, v4i32, v8i16, v4i16>;
-    def _2d2s  : NeonI_3VDL_Ext<0b1, u, 0b10, opcode, asmop, "2d", "4s",
-                                !cast<PatFrag>(opnode # "_4S"),
-                                VPR128, v2i64, v4i32, v2i32>;
-  }
-}
-
-defm SABDL2vvv : NeonI_3VDL_Abd_u<0b0, 0b0111, "sabdl2", "NI_sabdl_hi", 1>;
-defm UABDL2vvv : NeonI_3VDL_Abd_u<0b1, 0b0111, "uabdl2", "NI_uabdl_hi", 1>;
-
-// For pattern that need two operators being chained.
-class NeonI_3VDL_Aba<bit q, bit u, bits<2> size, bits<4> opcode,
-                     string asmop, string ResS, string OpS,
-                     SDPatternOperator opnode, SDPatternOperator subop,
-                     RegisterOperand OpVPR,
-                     ValueType ResTy, ValueType OpTy, ValueType OpSTy>
-  : NeonI_3VDiff<q, u, size, opcode,
-                 (outs VPR128:$Rd), (ins VPR128:$src, OpVPR:$Rn, OpVPR:$Rm),
-                 asmop # "\t$Rd." # ResS # ", $Rn." # OpS # ", $Rm." # OpS,
-                 [(set (ResTy VPR128:$Rd),
-                    (ResTy (opnode
-                      (ResTy VPR128:$src),
-                      (ResTy (zext (OpSTy (subop (OpTy OpVPR:$Rn),
-                                                 (OpTy OpVPR:$Rm))))))))],
-                 NoItinerary>,
-    Sched<[WriteFPALU, ReadFPALU, ReadFPALU, ReadFPALU]> {
-  let Constraints = "$src = $Rd";
-}
-
-multiclass NeonI_3VDL_Aba_v1<bit u, bits<4> opcode, string asmop,
-                             SDPatternOperator opnode, SDPatternOperator subop>{
-  def _8h8b : NeonI_3VDL_Aba<0b0, u, 0b00, opcode, asmop, "8h", "8b",
-                             opnode, subop, VPR64, v8i16, v8i8, v8i8>;
-  def _4s4h : NeonI_3VDL_Aba<0b0, u, 0b01, opcode, asmop, "4s", "4h",
-                             opnode, subop, VPR64, v4i32, v4i16, v4i16>;
-  def _2d2s : NeonI_3VDL_Aba<0b0, u, 0b10, opcode, asmop, "2d", "2s",
-                             opnode, subop, VPR64, v2i64, v2i32, v2i32>;
-}
-
-defm SABALvvv :  NeonI_3VDL_Aba_v1<0b0, 0b0101, "sabal",
-                                   add, int_arm_neon_vabds>;
-defm UABALvvv :  NeonI_3VDL_Aba_v1<0b1, 0b0101, "uabal",
-                                   add, int_arm_neon_vabdu>;
-
-multiclass NeonI_3VDL2_Aba_v1<bit u, bits<4> opcode, string asmop,
-                              SDPatternOperator opnode, string subop> {
-  def _8h8b : NeonI_3VDL_Aba<0b1, u, 0b00, opcode, asmop, "8h", "16b",
-                             opnode, !cast<PatFrag>(subop # "_16B"),
-                             VPR128, v8i16, v16i8, v8i8>;
-  def _4s4h : NeonI_3VDL_Aba<0b1, u, 0b01, opcode, asmop, "4s", "8h",
-                             opnode, !cast<PatFrag>(subop # "_8H"),
-                             VPR128, v4i32, v8i16, v4i16>;
-  def _2d2s : NeonI_3VDL_Aba<0b1, u, 0b10, opcode, asmop, "2d", "4s",
-                             opnode, !cast<PatFrag>(subop # "_4S"),
-                             VPR128, v2i64, v4i32, v2i32>;
-}
-
-defm SABAL2vvv :  NeonI_3VDL2_Aba_v1<0b0, 0b0101, "sabal2", add,
-                                     "NI_sabdl_hi">;
-defm UABAL2vvv :  NeonI_3VDL2_Aba_v1<0b1, 0b0101, "uabal2", add,
-                                     "NI_uabdl_hi">;
-
-// Long pattern with 2 operands
-multiclass NeonI_3VDL_2Op<bit u, bits<4> opcode, string asmop,
-                          SDPatternOperator opnode, bit Commutable = 0> {
-  let isCommutable = Commutable,
-      SchedRW = [WriteFPMul, ReadFPMul, ReadFPMul] in {
-    def _8h8b : NeonI_3VD_2Op<0b0, u, 0b00, opcode, asmop, "8h", "8b",
-                              opnode, VPR128, VPR64, v8i16, v8i8>;
-    def _4s4h : NeonI_3VD_2Op<0b0, u, 0b01, opcode, asmop, "4s", "4h",
-                              opnode, VPR128, VPR64, v4i32, v4i16>;
-    def _2d2s : NeonI_3VD_2Op<0b0, u, 0b10, opcode, asmop, "2d", "2s",
-                              opnode, VPR128, VPR64, v2i64, v2i32>;
-  }
-}
-
-defm SMULLvvv :  NeonI_3VDL_2Op<0b0, 0b1100, "smull", int_arm_neon_vmulls, 1>;
-defm UMULLvvv :  NeonI_3VDL_2Op<0b1, 0b1100, "umull", int_arm_neon_vmullu, 1>;
-
-class NeonI_3VDL2_2Op_mull<bit q, bit u, bits<2> size, bits<4> opcode,
-                           string asmop, string ResS, string OpS,
-                           SDPatternOperator opnode,
-                           ValueType ResTy, ValueType OpTy>
-  : NeonI_3VDiff<q, u, size, opcode,
-                 (outs VPR128:$Rd), (ins VPR128:$Rn, VPR128:$Rm),
-                 asmop # "\t$Rd." # ResS # ", $Rn." # OpS # ", $Rm." # OpS,
-                 [(set (ResTy VPR128:$Rd),
-                    (ResTy (opnode (OpTy VPR128:$Rn), (OpTy VPR128:$Rm))))],
-                 NoItinerary>,
-    Sched<[WriteFPMul, ReadFPMul, ReadFPMul]>;
-
-multiclass NeonI_3VDL2_2Op_mull_v1<bit u, bits<4> opcode, string asmop,
-                                   string opnode, bit Commutable = 0> {
-  let isCommutable = Commutable in {
-    def _8h16b : NeonI_3VDL2_2Op_mull<0b1, u, 0b00, opcode, asmop, "8h", "16b",
-                                      !cast<PatFrag>(opnode # "_16B"),
-                                      v8i16, v16i8>;
-    def _4s8h : NeonI_3VDL2_2Op_mull<0b1, u, 0b01, opcode, asmop, "4s", "8h",
-                                     !cast<PatFrag>(opnode # "_8H"),
-                                     v4i32, v8i16>;
-    def _2d4s : NeonI_3VDL2_2Op_mull<0b1, u, 0b10, opcode, asmop, "2d", "4s",
-                                     !cast<PatFrag>(opnode # "_4S"),
-                                     v2i64, v4i32>;
-  }
-}
-
-defm SMULL2vvv : NeonI_3VDL2_2Op_mull_v1<0b0, 0b1100, "smull2",
-                                         "NI_smull_hi", 1>;
-defm UMULL2vvv : NeonI_3VDL2_2Op_mull_v1<0b1, 0b1100, "umull2",
-                                         "NI_umull_hi", 1>;
-
-// Long pattern with 3 operands
-class NeonI_3VDL_3Op<bit q, bit u, bits<2> size, bits<4> opcode,
-                     string asmop, string ResS, string OpS,
-                     SDPatternOperator opnode,
-                     ValueType ResTy, ValueType OpTy>
-  : NeonI_3VDiff<q, u, size, opcode,
-                 (outs VPR128:$Rd), (ins VPR128:$src, VPR64:$Rn, VPR64:$Rm),
-                 asmop # "\t$Rd." # ResS # ", $Rn." # OpS # ", $Rm." # OpS,
-                 [(set (ResTy VPR128:$Rd),
-                    (ResTy (opnode
-                      (ResTy VPR128:$src),
-                      (OpTy VPR64:$Rn), (OpTy VPR64:$Rm))))],
-               NoItinerary>,
-    Sched<[WriteFPMAC, ReadFPMAC, ReadFPMAC, ReadFPMAC]> {
-  let Constraints = "$src = $Rd";
-}
-
-multiclass NeonI_3VDL_3Op_v1<bit u, bits<4> opcode, string asmop,
-                             SDPatternOperator opnode> {
-  def _8h8b : NeonI_3VDL_3Op<0b0, u, 0b00, opcode, asmop, "8h", "8b",
-                             opnode, v8i16, v8i8>;
-  def _4s4h : NeonI_3VDL_3Op<0b0, u, 0b01, opcode, asmop, "4s", "4h",
-                             opnode, v4i32, v4i16>;
-  def _2d2s : NeonI_3VDL_3Op<0b0, u, 0b10, opcode, asmop, "2d", "2s",
-                             opnode, v2i64, v2i32>;
-}
-
-def Neon_smlal : PatFrag<(ops node:$Rd, node:$Rn, node:$Rm),
-                         (add node:$Rd,
-                            (int_arm_neon_vmulls node:$Rn, node:$Rm))>;
-
-def Neon_umlal : PatFrag<(ops node:$Rd, node:$Rn, node:$Rm),
-                         (add node:$Rd,
-                            (int_arm_neon_vmullu node:$Rn, node:$Rm))>;
-
-def Neon_smlsl : PatFrag<(ops node:$Rd, node:$Rn, node:$Rm),
-                         (sub node:$Rd,
-                            (int_arm_neon_vmulls node:$Rn, node:$Rm))>;
-
-def Neon_umlsl : PatFrag<(ops node:$Rd, node:$Rn, node:$Rm),
-                         (sub node:$Rd,
-                            (int_arm_neon_vmullu node:$Rn, node:$Rm))>;
-
-defm SMLALvvv :  NeonI_3VDL_3Op_v1<0b0, 0b1000, "smlal", Neon_smlal>;
-defm UMLALvvv :  NeonI_3VDL_3Op_v1<0b1, 0b1000, "umlal", Neon_umlal>;
-
-defm SMLSLvvv :  NeonI_3VDL_3Op_v1<0b0, 0b1010, "smlsl", Neon_smlsl>;
-defm UMLSLvvv :  NeonI_3VDL_3Op_v1<0b1, 0b1010, "umlsl", Neon_umlsl>;
-
-class NeonI_3VDL2_3Op_mlas<bit q, bit u, bits<2> size, bits<4> opcode,
-                           string asmop, string ResS, string OpS,
-                           SDPatternOperator subop, SDPatternOperator opnode,
-                           RegisterOperand OpVPR,
-                           ValueType ResTy, ValueType OpTy>
-  : NeonI_3VDiff<q, u, size, opcode,
-               (outs VPR128:$Rd), (ins VPR128:$src, OpVPR:$Rn, OpVPR:$Rm),
-               asmop # "\t$Rd." # ResS # ", $Rn." # OpS # ", $Rm." # OpS,
-               [(set (ResTy VPR128:$Rd),
-                  (ResTy (subop
-                    (ResTy VPR128:$src),
-                    (ResTy (opnode (OpTy OpVPR:$Rn), (OpTy OpVPR:$Rm))))))],
-               NoItinerary>,
-    Sched<[WriteFPMAC, ReadFPMAC, ReadFPMAC, ReadFPMAC]> {
-  let Constraints = "$src = $Rd";
-}
-
-multiclass NeonI_3VDL2_3Op_mlas_v1<bit u, bits<4> opcode, string asmop,
-                                   SDPatternOperator subop, string opnode> {
-  def _8h16b : NeonI_3VDL2_3Op_mlas<0b1, u, 0b00, opcode, asmop, "8h", "16b",
-                                    subop, !cast<PatFrag>(opnode # "_16B"),
-                                    VPR128, v8i16, v16i8>;
-  def _4s8h : NeonI_3VDL2_3Op_mlas<0b1, u, 0b01, opcode, asmop, "4s", "8h",
-                                   subop, !cast<PatFrag>(opnode # "_8H"),
-                                   VPR128, v4i32, v8i16>;
-  def _2d4s : NeonI_3VDL2_3Op_mlas<0b1, u, 0b10, opcode, asmop, "2d", "4s",
-                                   subop, !cast<PatFrag>(opnode # "_4S"),
-                                   VPR128, v2i64, v4i32>;
-}
-
-defm SMLAL2vvv :  NeonI_3VDL2_3Op_mlas_v1<0b0, 0b1000, "smlal2",
-                                          add, "NI_smull_hi">;
-defm UMLAL2vvv :  NeonI_3VDL2_3Op_mlas_v1<0b1, 0b1000, "umlal2",
-                                          add, "NI_umull_hi">;
-
-defm SMLSL2vvv :  NeonI_3VDL2_3Op_mlas_v1<0b0, 0b1010, "smlsl2",
-                                          sub, "NI_smull_hi">;
-defm UMLSL2vvv :  NeonI_3VDL2_3Op_mlas_v1<0b1, 0b1010, "umlsl2",
-                                          sub, "NI_umull_hi">;
-
-multiclass NeonI_3VDL_qdmlal_3Op_v2<bit u, bits<4> opcode, string asmop,
-                                    SDPatternOperator opnode> {
-  def _4s4h : NeonI_3VDL2_3Op_mlas<0b0, u, 0b01, opcode, asmop, "4s", "4h",
-                                   opnode, int_arm_neon_vqdmull,
-                                   VPR64, v4i32, v4i16>;
-  def _2d2s : NeonI_3VDL2_3Op_mlas<0b0, u, 0b10, opcode, asmop, "2d", "2s",
-                                   opnode, int_arm_neon_vqdmull,
-                                   VPR64, v2i64, v2i32>;
-}
-
-defm SQDMLALvvv : NeonI_3VDL_qdmlal_3Op_v2<0b0, 0b1001, "sqdmlal",
-                                           int_arm_neon_vqadds>;
-defm SQDMLSLvvv : NeonI_3VDL_qdmlal_3Op_v2<0b0, 0b1011, "sqdmlsl",
-                                           int_arm_neon_vqsubs>;
-
-multiclass NeonI_3VDL_v2<bit u, bits<4> opcode, string asmop,
-                         SDPatternOperator opnode, bit Commutable = 0> {
-  let isCommutable = Commutable in {
-    def _4s4h : NeonI_3VD_2Op<0b0, u, 0b01, opcode, asmop, "4s", "4h",
-                              opnode, VPR128, VPR64, v4i32, v4i16>;
-    def _2d2s : NeonI_3VD_2Op<0b0, u, 0b10, opcode, asmop, "2d", "2s",
-                              opnode, VPR128, VPR64, v2i64, v2i32>;
-  }
-}
-
-let SchedRW = [WriteFPMul, ReadFPMul, ReadFPMul] in {
-defm SQDMULLvvv : NeonI_3VDL_v2<0b0, 0b1101, "sqdmull",
-                                int_arm_neon_vqdmull, 1>;
-}
-
-multiclass NeonI_3VDL2_2Op_mull_v2<bit u, bits<4> opcode, string asmop,
-                                   string opnode, bit Commutable = 0> {
-  let isCommutable = Commutable in {
-    def _4s8h : NeonI_3VDL2_2Op_mull<0b1, u, 0b01, opcode, asmop, "4s", "8h",
-                                     !cast<PatFrag>(opnode # "_8H"),
-                                     v4i32, v8i16>;
-    def _2d4s : NeonI_3VDL2_2Op_mull<0b1, u, 0b10, opcode, asmop, "2d", "4s",
-                                     !cast<PatFrag>(opnode # "_4S"),
-                                     v2i64, v4i32>;
-  }
-}
-
-defm SQDMULL2vvv : NeonI_3VDL2_2Op_mull_v2<0b0, 0b1101, "sqdmull2",
-                                           "NI_qdmull_hi", 1>;
-
-multiclass NeonI_3VDL2_3Op_qdmlal_v2<bit u, bits<4> opcode, string asmop,
-                                     SDPatternOperator opnode> {
-  def _4s8h : NeonI_3VDL2_3Op_mlas<0b1, u, 0b01, opcode, asmop, "4s", "8h",
-                                   opnode, NI_qdmull_hi_8H,
-                                   VPR128, v4i32, v8i16>;
-  def _2d4s : NeonI_3VDL2_3Op_mlas<0b1, u, 0b10, opcode, asmop, "2d", "4s",
-                                   opnode, NI_qdmull_hi_4S,
-                                   VPR128, v2i64, v4i32>;
-}
-
-defm SQDMLAL2vvv : NeonI_3VDL2_3Op_qdmlal_v2<0b0, 0b1001, "sqdmlal2",
-                                             int_arm_neon_vqadds>;
-defm SQDMLSL2vvv : NeonI_3VDL2_3Op_qdmlal_v2<0b0, 0b1011, "sqdmlsl2",
-                                             int_arm_neon_vqsubs>;
-
-multiclass NeonI_3VDL_v3<bit u, bits<4> opcode, string asmop,
-                         SDPatternOperator opnode_8h8b,
-                         SDPatternOperator opnode_1q1d, bit Commutable = 0> {
-  let isCommutable = Commutable in {
-    def _8h8b : NeonI_3VD_2Op<0b0, u, 0b00, opcode, asmop, "8h", "8b",
-                              opnode_8h8b, VPR128, VPR64, v8i16, v8i8>;
-
-    def _1q1d : NeonI_3VD_2Op<0b0, u, 0b11, opcode, asmop, "1q", "1d",
-                              opnode_1q1d, VPR128, VPR64, v16i8, v1i64>;
-  }
-}
-
-let SchedRW = [WriteFPMul, ReadFPMul, ReadFPMul] in
-defm PMULLvvv : NeonI_3VDL_v3<0b0, 0b1110, "pmull", int_arm_neon_vmullp,
-                              int_aarch64_neon_vmull_p64, 1>;
-
-multiclass NeonI_3VDL2_2Op_mull_v3<bit u, bits<4> opcode, string asmop,
-                                   string opnode, bit Commutable = 0> {
-  let isCommutable = Commutable in {
-    def _8h16b : NeonI_3VDL2_2Op_mull<0b1, u, 0b00, opcode, asmop, "8h", "16b",
-                                      !cast<PatFrag>(opnode # "_16B"),
-                                      v8i16, v16i8>;
-
-    def _1q2d : 
-      NeonI_3VDiff<0b1, u, 0b11, opcode,
-                   (outs VPR128:$Rd), (ins VPR128:$Rn, VPR128:$Rm),
-                   asmop # "\t$Rd.1q, $Rn.2d, $Rm.2d",
-                   [(set (v16i8 VPR128:$Rd),
-                      (v16i8 (int_aarch64_neon_vmull_p64 
-                        (v1i64 (scalar_to_vector
-                          (i64 (vector_extract (v2i64 VPR128:$Rn), 1)))),
-                        (v1i64 (scalar_to_vector
-                          (i64 (vector_extract (v2i64 VPR128:$Rm), 1)))))))],
-                   NoItinerary>,
-      Sched<[WriteFPMul, ReadFPMul, ReadFPMul]>;
-  }
-
-  def : Pat<(v16i8 (int_aarch64_neon_vmull_p64
-                      (v1i64 (extract_subvector (v2i64 VPR128:$Rn), (i64 1))),
-                      (v1i64 (extract_subvector (v2i64 VPR128:$Rm), (i64 1))))),
-            (!cast<Instruction>(NAME # "_1q2d") VPR128:$Rn, VPR128:$Rm)>;
-}
-
-defm PMULL2vvv : NeonI_3VDL2_2Op_mull_v3<0b0, 0b1110, "pmull2", "NI_pmull_hi",
-                                         1>;
-
-// End of implementation for instruction class (3V Diff)
-
-// The followings are vector load/store multiple N-element structure
-// (class SIMD lselem).
-
-// ld1:         load multiple 1-element structure to 1/2/3/4 registers.
-// ld2/ld3/ld4: load multiple N-element structure to N registers (N = 2, 3, 4).
-//              The structure consists of a sequence of sets of N values.
-//              The first element of the structure is placed in the first lane
-//              of the first first vector, the second element in the first lane
-//              of the second vector, and so on.
-// E.g. LD1_3V_2S will load 32-bit elements {A, B, C, D, E, F} sequentially into
-// the three 64-bit vectors list {BA, DC, FE}.
-// E.g. LD3_2S will load 32-bit elements {A, B, C, D, E, F} into the three
-// 64-bit vectors list {DA, EB, FC}.
-// Store instructions store multiple structure to N registers like load.
-
-
-class NeonI_LDVList<bit q, bits<4> opcode, bits<2> size,
-                    RegisterOperand VecList, string asmop>
-  : NeonI_LdStMult<q, 1, opcode, size,
-                 (outs VecList:$Rt), (ins GPR64xsp:$Rn),
-                 asmop # "\t$Rt, [$Rn]",
-                 [],
-                 NoItinerary>,
-    Sched<[WriteVecLd, ReadVecLd]> {
-  let mayLoad = 1;
-  let neverHasSideEffects = 1;
-}
-
-multiclass LDVList_BHSD<bits<4> opcode, string List, string asmop> {
-  def _8B : NeonI_LDVList<0, opcode, 0b00,
-                          !cast<RegisterOperand>(List # "8B_operand"), asmop>;
-
-  def _4H : NeonI_LDVList<0, opcode, 0b01,
-                          !cast<RegisterOperand>(List # "4H_operand"), asmop>;
-
-  def _2S : NeonI_LDVList<0, opcode, 0b10,
-                          !cast<RegisterOperand>(List # "2S_operand"), asmop>;
-
-  def _16B : NeonI_LDVList<1, opcode, 0b00,
-                           !cast<RegisterOperand>(List # "16B_operand"), asmop>;
-
-  def _8H : NeonI_LDVList<1, opcode, 0b01,
-                          !cast<RegisterOperand>(List # "8H_operand"), asmop>;
-
-  def _4S : NeonI_LDVList<1, opcode, 0b10,
-                          !cast<RegisterOperand>(List # "4S_operand"), asmop>;
-
-  def _2D : NeonI_LDVList<1, opcode, 0b11,
-                          !cast<RegisterOperand>(List # "2D_operand"), asmop>;
-}
-
-// Load multiple N-element structure to N consecutive registers (N = 1,2,3,4)
-defm LD1 : LDVList_BHSD<0b0111, "VOne", "ld1">;
-def LD1_1D : NeonI_LDVList<0, 0b0111, 0b11, VOne1D_operand, "ld1">;
-
-defm LD2 : LDVList_BHSD<0b1000, "VPair", "ld2">;
-
-defm LD3 : LDVList_BHSD<0b0100, "VTriple", "ld3">;
-
-defm LD4 : LDVList_BHSD<0b0000, "VQuad", "ld4">;
-
-// Load multiple 1-element structure to N consecutive registers (N = 2,3,4)
-defm LD1x2 : LDVList_BHSD<0b1010, "VPair", "ld1">;
-def LD1x2_1D : NeonI_LDVList<0, 0b1010, 0b11, VPair1D_operand, "ld1">;
-
-defm LD1x3 : LDVList_BHSD<0b0110, "VTriple", "ld1">;
-def LD1x3_1D : NeonI_LDVList<0, 0b0110, 0b11, VTriple1D_operand, "ld1">;
-
-defm LD1x4 : LDVList_BHSD<0b0010, "VQuad", "ld1">;
-def LD1x4_1D : NeonI_LDVList<0, 0b0010, 0b11, VQuad1D_operand, "ld1">;
-
-class NeonI_STVList<bit q, bits<4> opcode, bits<2> size,
-                    RegisterOperand VecList, string asmop>
-  : NeonI_LdStMult<q, 0, opcode, size,
-                 (outs), (ins GPR64xsp:$Rn, VecList:$Rt),
-                 asmop # "\t$Rt, [$Rn]",
-                 [],
-                 NoItinerary>,
-    Sched<[WriteVecSt, ReadVecSt, ReadVecSt]> {
-  let mayStore = 1;
-  let neverHasSideEffects = 1;
-}
-
-multiclass STVList_BHSD<bits<4> opcode, string List, string asmop> {
-  def _8B : NeonI_STVList<0, opcode, 0b00,
-                          !cast<RegisterOperand>(List # "8B_operand"), asmop>;
-
-  def _4H : NeonI_STVList<0, opcode, 0b01,
-                          !cast<RegisterOperand>(List # "4H_operand"), asmop>;
-
-  def _2S : NeonI_STVList<0, opcode, 0b10,
-                          !cast<RegisterOperand>(List # "2S_operand"), asmop>;
-
-  def _16B : NeonI_STVList<1, opcode, 0b00,
-                           !cast<RegisterOperand>(List # "16B_operand"), asmop>;
-
-  def _8H : NeonI_STVList<1, opcode, 0b01,
-                          !cast<RegisterOperand>(List # "8H_operand"), asmop>;
-
-  def _4S : NeonI_STVList<1, opcode, 0b10,
-                          !cast<RegisterOperand>(List # "4S_operand"), asmop>;
-
-  def _2D : NeonI_STVList<1, opcode, 0b11,
-                          !cast<RegisterOperand>(List # "2D_operand"), asmop>;
-}
-
-// Store multiple N-element structures from N registers (N = 1,2,3,4)
-defm ST1 : STVList_BHSD<0b0111, "VOne", "st1">;
-def ST1_1D : NeonI_STVList<0, 0b0111, 0b11, VOne1D_operand, "st1">;
-
-defm ST2 : STVList_BHSD<0b1000, "VPair", "st2">;
-
-defm ST3 : STVList_BHSD<0b0100, "VTriple", "st3">;
-
-defm ST4 : STVList_BHSD<0b0000, "VQuad", "st4">;
-
-// Store multiple 1-element structures from N consecutive registers (N = 2,3,4)
-defm ST1x2 : STVList_BHSD<0b1010, "VPair", "st1">;
-def ST1x2_1D : NeonI_STVList<0, 0b1010, 0b11, VPair1D_operand, "st1">;
-
-defm ST1x3 : STVList_BHSD<0b0110, "VTriple", "st1">;
-def ST1x3_1D : NeonI_STVList<0, 0b0110, 0b11, VTriple1D_operand, "st1">;
-
-defm ST1x4 : STVList_BHSD<0b0010, "VQuad", "st1">;
-def ST1x4_1D : NeonI_STVList<0, 0b0010, 0b11, VQuad1D_operand, "st1">;
-
-def : Pat<(v2f64 (load GPR64xsp:$addr)), (LD1_2D GPR64xsp:$addr)>;
-def : Pat<(v2i64 (load GPR64xsp:$addr)), (LD1_2D GPR64xsp:$addr)>;
-
-def : Pat<(v4f32 (load GPR64xsp:$addr)), (LD1_4S GPR64xsp:$addr)>;
-def : Pat<(v4i32 (load GPR64xsp:$addr)), (LD1_4S GPR64xsp:$addr)>;
-
-def : Pat<(v8i16 (load GPR64xsp:$addr)), (LD1_8H GPR64xsp:$addr)>;
-def : Pat<(v16i8 (load GPR64xsp:$addr)), (LD1_16B GPR64xsp:$addr)>;
-
-def : Pat<(v1f64 (load GPR64xsp:$addr)), (LD1_1D GPR64xsp:$addr)>;
-def : Pat<(v1i64 (load GPR64xsp:$addr)), (LD1_1D GPR64xsp:$addr)>;
-
-def : Pat<(v2f32 (load GPR64xsp:$addr)), (LD1_2S GPR64xsp:$addr)>;
-def : Pat<(v2i32 (load GPR64xsp:$addr)), (LD1_2S GPR64xsp:$addr)>;
-
-def : Pat<(v4i16 (load GPR64xsp:$addr)), (LD1_4H GPR64xsp:$addr)>;
-def : Pat<(v8i8 (load GPR64xsp:$addr)), (LD1_8B GPR64xsp:$addr)>;
-
-def : Pat<(store (v2i64 VPR128:$value), GPR64xsp:$addr),
-          (ST1_2D GPR64xsp:$addr, VPR128:$value)>;
-def : Pat<(store (v2f64 VPR128:$value), GPR64xsp:$addr),
-          (ST1_2D GPR64xsp:$addr, VPR128:$value)>;
-
-def : Pat<(store (v4i32 VPR128:$value), GPR64xsp:$addr),
-          (ST1_4S GPR64xsp:$addr, VPR128:$value)>;
-def : Pat<(store (v4f32 VPR128:$value), GPR64xsp:$addr),
-          (ST1_4S GPR64xsp:$addr, VPR128:$value)>;
-
-def : Pat<(store (v8i16 VPR128:$value), GPR64xsp:$addr),
-          (ST1_8H GPR64xsp:$addr, VPR128:$value)>;
-def : Pat<(store (v16i8 VPR128:$value), GPR64xsp:$addr),
-          (ST1_16B GPR64xsp:$addr, VPR128:$value)>;
-
-def : Pat<(store (v1i64 VPR64:$value), GPR64xsp:$addr),
-          (ST1_1D GPR64xsp:$addr, VPR64:$value)>;
-def : Pat<(store (v1f64 VPR64:$value), GPR64xsp:$addr),
-          (ST1_1D GPR64xsp:$addr, VPR64:$value)>;
-
-def : Pat<(store (v2i32 VPR64:$value), GPR64xsp:$addr),
-          (ST1_2S GPR64xsp:$addr, VPR64:$value)>;
-def : Pat<(store (v2f32 VPR64:$value), GPR64xsp:$addr),
-          (ST1_2S GPR64xsp:$addr, VPR64:$value)>;
-
-def : Pat<(store (v4i16 VPR64:$value), GPR64xsp:$addr),
-          (ST1_4H GPR64xsp:$addr, VPR64:$value)>;
-def : Pat<(store (v8i8 VPR64:$value), GPR64xsp:$addr),
-          (ST1_8B GPR64xsp:$addr, VPR64:$value)>;
-
-// Match load/store of v1i8/v1i16/v1i32 type to FPR8/FPR16/FPR32 load/store.
-// FIXME: for now we have v1i8, v1i16, v1i32 legal types, if they are illegal,
-// these patterns are not needed any more.
-def : Pat<(v1i8 (load GPR64xsp:$addr)), (LSFP8_LDR $addr, 0)>;
-def : Pat<(v1i16 (load GPR64xsp:$addr)), (LSFP16_LDR $addr, 0)>;
-def : Pat<(v1i32 (load GPR64xsp:$addr)), (LSFP32_LDR $addr, 0)>;
-
-def : Pat<(store (v1i8 FPR8:$value), GPR64xsp:$addr),
-          (LSFP8_STR $value, $addr, 0)>;
-def : Pat<(store (v1i16 FPR16:$value), GPR64xsp:$addr),
-          (LSFP16_STR $value, $addr, 0)>;
-def : Pat<(store (v1i32 FPR32:$value), GPR64xsp:$addr),
-          (LSFP32_STR $value, $addr, 0)>;
-
-
-// End of vector load/store multiple N-element structure(class SIMD lselem)
-
-// The followings are post-index vector load/store multiple N-element
-// structure(class SIMD lselem-post)
-def exact1_asmoperand : AsmOperandClass {
-  let Name = "Exact1";
-  let PredicateMethod = "isExactImm<1>";
-  let RenderMethod = "addImmOperands";
-}
-def uimm_exact1 : Operand<i32>, ImmLeaf<i32, [{return Imm == 1;}]> {
-  let ParserMatchClass = exact1_asmoperand;
-}
-
-def exact2_asmoperand : AsmOperandClass {
-  let Name = "Exact2";
-  let PredicateMethod = "isExactImm<2>";
-  let RenderMethod = "addImmOperands";
-}
-def uimm_exact2 : Operand<i32>, ImmLeaf<i32, [{return Imm == 2;}]> {
-  let ParserMatchClass = exact2_asmoperand;
-}
-
-def exact3_asmoperand : AsmOperandClass {
-  let Name = "Exact3";
-  let PredicateMethod = "isExactImm<3>";
-  let RenderMethod = "addImmOperands";
-}
-def uimm_exact3 : Operand<i32>, ImmLeaf<i32, [{return Imm == 3;}]> {
-  let ParserMatchClass = exact3_asmoperand;
-}
-
-def exact4_asmoperand : AsmOperandClass {
-  let Name = "Exact4";
-  let PredicateMethod = "isExactImm<4>";
-  let RenderMethod = "addImmOperands";
-}
-def uimm_exact4 : Operand<i32>, ImmLeaf<i32, [{return Imm == 4;}]> {
-  let ParserMatchClass = exact4_asmoperand;
-}
-
-def exact6_asmoperand : AsmOperandClass {
-  let Name = "Exact6";
-  let PredicateMethod = "isExactImm<6>";
-  let RenderMethod = "addImmOperands";
-}
-def uimm_exact6 : Operand<i32>, ImmLeaf<i32, [{return Imm == 6;}]> {
-  let ParserMatchClass = exact6_asmoperand;
-}
-
-def exact8_asmoperand : AsmOperandClass {
-  let Name = "Exact8";
-  let PredicateMethod = "isExactImm<8>";
-  let RenderMethod = "addImmOperands";
-}
-def uimm_exact8 : Operand<i32>, ImmLeaf<i32, [{return Imm == 8;}]> {
-  let ParserMatchClass = exact8_asmoperand;
-}
-
-def exact12_asmoperand : AsmOperandClass {
-  let Name = "Exact12";
-  let PredicateMethod = "isExactImm<12>";
-  let RenderMethod = "addImmOperands";
-}
-def uimm_exact12 : Operand<i32>, ImmLeaf<i32, [{return Imm == 12;}]> {
-  let ParserMatchClass = exact12_asmoperand;
-}
-
-def exact16_asmoperand : AsmOperandClass {
-  let Name = "Exact16";
-  let PredicateMethod = "isExactImm<16>";
-  let RenderMethod = "addImmOperands";
-}
-def uimm_exact16 : Operand<i32>, ImmLeaf<i32, [{return Imm == 16;}]> {
-  let ParserMatchClass = exact16_asmoperand;
-}
-
-def exact24_asmoperand : AsmOperandClass {
-  let Name = "Exact24";
-  let PredicateMethod = "isExactImm<24>";
-  let RenderMethod = "addImmOperands";
-}
-def uimm_exact24 : Operand<i32>, ImmLeaf<i32, [{return Imm == 24;}]> {
-  let ParserMatchClass = exact24_asmoperand;
-}
-
-def exact32_asmoperand : AsmOperandClass {
-  let Name = "Exact32";
-  let PredicateMethod = "isExactImm<32>";
-  let RenderMethod = "addImmOperands";
-}
-def uimm_exact32 : Operand<i32>, ImmLeaf<i32, [{return Imm == 32;}]> {
-  let ParserMatchClass = exact32_asmoperand;
-}
-
-def exact48_asmoperand : AsmOperandClass {
-  let Name = "Exact48";
-  let PredicateMethod = "isExactImm<48>";
-  let RenderMethod = "addImmOperands";
-}
-def uimm_exact48 : Operand<i32>, ImmLeaf<i32, [{return Imm == 48;}]> {
-  let ParserMatchClass = exact48_asmoperand;
-}
-
-def exact64_asmoperand : AsmOperandClass {
-  let Name = "Exact64";
-  let PredicateMethod = "isExactImm<64>";
-  let RenderMethod = "addImmOperands";
-}
-def uimm_exact64 : Operand<i32>, ImmLeaf<i32, [{return Imm == 64;}]> {
-  let ParserMatchClass = exact64_asmoperand;
-}
-
-multiclass NeonI_LDWB_VList<bit q, bits<4> opcode, bits<2> size,
-                           RegisterOperand VecList, Operand ImmTy,
-                           string asmop> {
-  let Constraints = "$Rn = $wb", mayLoad = 1, neverHasSideEffects = 1,
-      DecoderMethod = "DecodeVLDSTPostInstruction" in {
-    def _fixed : NeonI_LdStMult_Post<q, 1, opcode, size,
-                     (outs VecList:$Rt, GPR64xsp:$wb),
-                     (ins GPR64xsp:$Rn, ImmTy:$amt),
-                     asmop # "\t$Rt, [$Rn], $amt",
-                     [],
-                     NoItinerary>,
-                 Sched<[WriteVecLd, WriteVecLd, ReadVecLd]> {
-      let Rm = 0b11111;
-    }
-
-    def _register : NeonI_LdStMult_Post<q, 1, opcode, size,
-                        (outs VecList:$Rt, GPR64xsp:$wb),
-                        (ins GPR64xsp:$Rn, GPR64noxzr:$Rm),
-                        asmop # "\t$Rt, [$Rn], $Rm",
-                        [],
-                        NoItinerary>,
-                    Sched<[WriteVecLd, WriteVecLd, ReadVecLd, ReadVecLd]>;
-  }
-}
-
-multiclass LDWB_VList_BHSD<bits<4> opcode, string List, Operand ImmTy,
-    Operand ImmTy2, string asmop> {
-  defm _8B : NeonI_LDWB_VList<0, opcode, 0b00,
-                              !cast<RegisterOperand>(List # "8B_operand"),
-                              ImmTy, asmop>;
-
-  defm _4H : NeonI_LDWB_VList<0, opcode, 0b01,
-                              !cast<RegisterOperand>(List # "4H_operand"),
-                              ImmTy, asmop>;
-
-  defm _2S : NeonI_LDWB_VList<0, opcode, 0b10,
-                              !cast<RegisterOperand>(List # "2S_operand"),
-                              ImmTy, asmop>;
-
-  defm _16B : NeonI_LDWB_VList<1, opcode, 0b00,
-                               !cast<RegisterOperand>(List # "16B_operand"),
-                               ImmTy2, asmop>;
-
-  defm _8H : NeonI_LDWB_VList<1, opcode, 0b01,
-                              !cast<RegisterOperand>(List # "8H_operand"),
-                              ImmTy2, asmop>;
-
-  defm _4S : NeonI_LDWB_VList<1, opcode, 0b10,
-                              !cast<RegisterOperand>(List # "4S_operand"),
-                              ImmTy2, asmop>;
-
-  defm _2D : NeonI_LDWB_VList<1, opcode, 0b11,
-                              !cast<RegisterOperand>(List # "2D_operand"),
-                              ImmTy2, asmop>;
-}
-
-// Post-index load multiple N-element structures from N registers (N = 1,2,3,4)
-defm LD1WB : LDWB_VList_BHSD<0b0111, "VOne", uimm_exact8, uimm_exact16, "ld1">;
-defm LD1WB_1D : NeonI_LDWB_VList<0, 0b0111, 0b11, VOne1D_operand, uimm_exact8,
-                                 "ld1">;
-
-defm LD2WB : LDWB_VList_BHSD<0b1000, "VPair", uimm_exact16, uimm_exact32, "ld2">;
-
-defm LD3WB : LDWB_VList_BHSD<0b0100, "VTriple", uimm_exact24, uimm_exact48,
-                             "ld3">;
-
-defm LD4WB : LDWB_VList_BHSD<0b0000, "VQuad", uimm_exact32, uimm_exact64, "ld4">;
-
-// Post-index load multiple 1-element structures from N consecutive registers
-// (N = 2,3,4)
-defm LD1x2WB : LDWB_VList_BHSD<0b1010, "VPair", uimm_exact16, uimm_exact32,
-                               "ld1">;
-defm LD1x2WB_1D : NeonI_LDWB_VList<0, 0b1010, 0b11, VPair1D_operand,
-                                   uimm_exact16, "ld1">;
-
-defm LD1x3WB : LDWB_VList_BHSD<0b0110, "VTriple", uimm_exact24, uimm_exact48,
-                               "ld1">;
-defm LD1x3WB_1D : NeonI_LDWB_VList<0, 0b0110, 0b11, VTriple1D_operand,
-                                   uimm_exact24, "ld1">;
-
-defm LD1x4WB : LDWB_VList_BHSD<0b0010, "VQuad", uimm_exact32, uimm_exact64,
-                                "ld1">;
-defm LD1x4WB_1D : NeonI_LDWB_VList<0, 0b0010, 0b11, VQuad1D_operand,
-                                   uimm_exact32, "ld1">;
-
-multiclass NeonI_STWB_VList<bit q, bits<4> opcode, bits<2> size,
-                            RegisterOperand VecList, Operand ImmTy,
-                            string asmop> {
-  let Constraints = "$Rn = $wb", mayStore = 1, neverHasSideEffects = 1,
-      DecoderMethod = "DecodeVLDSTPostInstruction" in {
-    def _fixed : NeonI_LdStMult_Post<q, 0, opcode, size,
-                     (outs GPR64xsp:$wb),
-                     (ins GPR64xsp:$Rn, ImmTy:$amt, VecList:$Rt),
-                     asmop # "\t$Rt, [$Rn], $amt",
-                     [],
-                     NoItinerary>,
-                 Sched<[WriteVecSt, ReadVecSt, ReadVecSt]> {
-      let Rm = 0b11111;
-    }
-
-    def _register : NeonI_LdStMult_Post<q, 0, opcode, size,
-                      (outs GPR64xsp:$wb),
-                      (ins GPR64xsp:$Rn, GPR64noxzr:$Rm, VecList:$Rt),
-                      asmop # "\t$Rt, [$Rn], $Rm",
-                      [],
-                      NoItinerary>,
-                    Sched<[WriteVecSt, ReadVecSt, ReadVecSt, ReadVecSt]>;
-  }
-}
-
-multiclass STWB_VList_BHSD<bits<4> opcode, string List, Operand ImmTy,
-                           Operand ImmTy2, string asmop> {
-  defm _8B : NeonI_STWB_VList<0, opcode, 0b00,
-                 !cast<RegisterOperand>(List # "8B_operand"), ImmTy, asmop>;
-
-  defm _4H : NeonI_STWB_VList<0, opcode, 0b01,
-                              !cast<RegisterOperand>(List # "4H_operand"),
-                              ImmTy, asmop>;
-
-  defm _2S : NeonI_STWB_VList<0, opcode, 0b10,
-                              !cast<RegisterOperand>(List # "2S_operand"),
-                              ImmTy, asmop>;
-
-  defm _16B : NeonI_STWB_VList<1, opcode, 0b00,
-                               !cast<RegisterOperand>(List # "16B_operand"),
-                               ImmTy2, asmop>;
-
-  defm _8H : NeonI_STWB_VList<1, opcode, 0b01,
-                              !cast<RegisterOperand>(List # "8H_operand"),
-                              ImmTy2, asmop>;
-
-  defm _4S : NeonI_STWB_VList<1, opcode, 0b10,
-                              !cast<RegisterOperand>(List # "4S_operand"),
-                              ImmTy2, asmop>;
-
-  defm _2D : NeonI_STWB_VList<1, opcode, 0b11,
-                              !cast<RegisterOperand>(List # "2D_operand"),
-                              ImmTy2, asmop>;
-}
-
-// Post-index load multiple N-element structures from N registers (N = 1,2,3,4)
-defm ST1WB : STWB_VList_BHSD<0b0111, "VOne", uimm_exact8, uimm_exact16, "st1">;
-defm ST1WB_1D : NeonI_STWB_VList<0, 0b0111, 0b11, VOne1D_operand, uimm_exact8,
-                                 "st1">;
-
-defm ST2WB : STWB_VList_BHSD<0b1000, "VPair", uimm_exact16, uimm_exact32, "st2">;
-
-defm ST3WB : STWB_VList_BHSD<0b0100, "VTriple", uimm_exact24, uimm_exact48,
-                             "st3">;
-
-defm ST4WB : STWB_VList_BHSD<0b0000, "VQuad", uimm_exact32, uimm_exact64, "st4">;
-
-// Post-index load multiple 1-element structures from N consecutive registers
-// (N = 2,3,4)
-defm ST1x2WB : STWB_VList_BHSD<0b1010, "VPair", uimm_exact16, uimm_exact32,
-                               "st1">;
-defm ST1x2WB_1D : NeonI_STWB_VList<0, 0b1010, 0b11, VPair1D_operand,
-                                   uimm_exact16, "st1">;
-
-defm ST1x3WB : STWB_VList_BHSD<0b0110, "VTriple", uimm_exact24, uimm_exact48,
-                               "st1">;
-defm ST1x3WB_1D : NeonI_STWB_VList<0, 0b0110, 0b11, VTriple1D_operand,
-                                   uimm_exact24, "st1">;
-
-defm ST1x4WB : STWB_VList_BHSD<0b0010, "VQuad", uimm_exact32, uimm_exact64,
-                               "st1">;
-defm ST1x4WB_1D : NeonI_STWB_VList<0, 0b0010, 0b11, VQuad1D_operand,
-                                   uimm_exact32, "st1">;
-
-// End of post-index vector load/store multiple N-element structure
-// (class SIMD lselem-post)
-
-// The followings are vector load/store single N-element structure
-// (class SIMD lsone).
-def neon_uimm0_bare : Operand<i64>,
-                        ImmLeaf<i64, [{return Imm == 0;}]> {
-  let ParserMatchClass = neon_uimm0_asmoperand;
-  let PrintMethod = "printUImmBareOperand";
-}
-
-def neon_uimm1_bare : Operand<i64>,
-                        ImmLeaf<i64, [{return Imm < 2;}]> {
-  let ParserMatchClass = neon_uimm1_asmoperand;
-  let PrintMethod = "printUImmBareOperand";
-}
-
-def neon_uimm2_bare : Operand<i64>,
-                        ImmLeaf<i64, [{return Imm < 4;}]> {
-  let ParserMatchClass = neon_uimm2_asmoperand;
-  let PrintMethod = "printUImmBareOperand";
-}
-
-def neon_uimm3_bare : Operand<i64>,
-                        ImmLeaf<i64, [{return Imm < 8;}]> {
-  let ParserMatchClass = uimm3_asmoperand;
-  let PrintMethod = "printUImmBareOperand";
-}
-
-def neon_uimm4_bare : Operand<i64>,
-                        ImmLeaf<i64, [{return Imm < 16;}]> {
-  let ParserMatchClass = uimm4_asmoperand;
-  let PrintMethod = "printUImmBareOperand";
-}
-
-class NeonI_LDN_Dup<bit q, bit r, bits<3> opcode, bits<2> size,
-                    RegisterOperand VecList, string asmop>
-    : NeonI_LdOne_Dup<q, r, opcode, size,
-                      (outs VecList:$Rt), (ins GPR64xsp:$Rn),
-                      asmop # "\t$Rt, [$Rn]",
-                      [],
-                      NoItinerary>,
-      Sched<[WriteVecLd, ReadVecLd]> {
-  let mayLoad = 1;
-  let neverHasSideEffects = 1;
-}
-
-multiclass LDN_Dup_BHSD<bit r, bits<3> opcode, string List, string asmop> {
-  def _8B : NeonI_LDN_Dup<0, r, opcode, 0b00,
-                          !cast<RegisterOperand>(List # "8B_operand"), asmop>;
-
-  def _4H : NeonI_LDN_Dup<0, r, opcode, 0b01,
-                          !cast<RegisterOperand>(List # "4H_operand"), asmop>;
-
-  def _2S : NeonI_LDN_Dup<0, r, opcode, 0b10,
-                          !cast<RegisterOperand>(List # "2S_operand"), asmop>;
-
-  def _1D : NeonI_LDN_Dup<0, r, opcode, 0b11,
-                          !cast<RegisterOperand>(List # "1D_operand"), asmop>;
-
-  def _16B : NeonI_LDN_Dup<1, r, opcode, 0b00,
-                           !cast<RegisterOperand>(List # "16B_operand"), asmop>;
-
-  def _8H : NeonI_LDN_Dup<1, r, opcode, 0b01,
-                          !cast<RegisterOperand>(List # "8H_operand"), asmop>;
-
-  def _4S : NeonI_LDN_Dup<1, r, opcode, 0b10,
-                          !cast<RegisterOperand>(List # "4S_operand"), asmop>;
-
-  def _2D : NeonI_LDN_Dup<1, r, opcode, 0b11,
-                          !cast<RegisterOperand>(List # "2D_operand"), asmop>;
-}
-
-// Load single 1-element structure to all lanes of 1 register
-defm LD1R : LDN_Dup_BHSD<0b0, 0b110, "VOne", "ld1r">;
-
-// Load single N-element structure to all lanes of N consecutive
-// registers (N = 2,3,4)
-defm LD2R : LDN_Dup_BHSD<0b1, 0b110, "VPair", "ld2r">;
-defm LD3R : LDN_Dup_BHSD<0b0, 0b111, "VTriple", "ld3r">;
-defm LD4R : LDN_Dup_BHSD<0b1, 0b111, "VQuad", "ld4r">;
-
-
-class LD1R_pattern <ValueType VTy, ValueType DTy, PatFrag LoadOp,
-                    Instruction INST>
-    : Pat<(VTy (Neon_vdup (DTy (LoadOp GPR64xsp:$Rn)))),
-          (VTy (INST GPR64xsp:$Rn))>;
-
-// Match all LD1R instructions
-def : LD1R_pattern<v8i8, i32, extloadi8, LD1R_8B>;
-
-def : LD1R_pattern<v16i8, i32, extloadi8, LD1R_16B>;
-
-def : LD1R_pattern<v4i16, i32, extloadi16, LD1R_4H>;
-
-def : LD1R_pattern<v8i16, i32, extloadi16, LD1R_8H>;
-
-def : LD1R_pattern<v2i32, i32, load, LD1R_2S>;
-def : LD1R_pattern<v2f32, f32, load, LD1R_2S>;
-
-def : LD1R_pattern<v4i32, i32, load, LD1R_4S>;
-def : LD1R_pattern<v4f32, f32, load, LD1R_4S>;
-
-def : LD1R_pattern<v2i64, i64, load, LD1R_2D>;
-def : LD1R_pattern<v2f64, f64, load, LD1R_2D>;
-
-class LD1R_pattern_v1 <ValueType VTy, ValueType DTy, PatFrag LoadOp,
-                       Instruction INST>
-  : Pat<(VTy (scalar_to_vector (DTy (LoadOp GPR64xsp:$Rn)))),
-        (VTy (INST GPR64xsp:$Rn))>;
-
-def : LD1R_pattern_v1<v1i64, i64, load, LD1R_1D>;
-def : LD1R_pattern_v1<v1f64, f64, load, LD1R_1D>;
-
-multiclass VectorList_Bare_BHSD<string PREFIX, int Count,
-                                RegisterClass RegList> {
-  defm B : VectorList_operands<PREFIX, "B", Count, RegList>;
-  defm H : VectorList_operands<PREFIX, "H", Count, RegList>;
-  defm S : VectorList_operands<PREFIX, "S", Count, RegList>;
-  defm D : VectorList_operands<PREFIX, "D", Count, RegList>;
-}
-
-// Special vector list operand of 128-bit vectors with bare layout.
-// i.e. only show ".b", ".h", ".s", ".d"
-defm VOne : VectorList_Bare_BHSD<"VOne", 1, FPR128>;
-defm VPair : VectorList_Bare_BHSD<"VPair", 2, QPair>;
-defm VTriple : VectorList_Bare_BHSD<"VTriple", 3, QTriple>;
-defm VQuad : VectorList_Bare_BHSD<"VQuad", 4, QQuad>;
-
-class NeonI_LDN_Lane<bit r, bits<2> op2_1, bit op0, RegisterOperand VList,
-                     Operand ImmOp, string asmop>
-    : NeonI_LdStOne_Lane<1, r, op2_1, op0,
-                         (outs VList:$Rt),
-                         (ins GPR64xsp:$Rn, VList:$src, ImmOp:$lane),
-                         asmop # "\t$Rt[$lane], [$Rn]",
-                         [],
-                         NoItinerary>,
-      Sched<[WriteVecLd, ReadVecLd, ReadVecLd]> {
-  let mayLoad = 1;
-  let neverHasSideEffects = 1;
-  let hasExtraDefRegAllocReq = 1;
-  let Constraints = "$src = $Rt";
-}
-
-multiclass LDN_Lane_BHSD<bit r, bit op0, string List, string asmop> {
-  def _B : NeonI_LDN_Lane<r, 0b00, op0,
-                          !cast<RegisterOperand>(List # "B_operand"),
-                          neon_uimm4_bare, asmop> {
-    let Inst{12-10} = lane{2-0};
-    let Inst{30} = lane{3};
-  }
-
-  def _H : NeonI_LDN_Lane<r, 0b01, op0,
-                          !cast<RegisterOperand>(List # "H_operand"),
-                          neon_uimm3_bare, asmop> {
-    let Inst{12-10} = {lane{1}, lane{0}, 0b0};
-    let Inst{30} = lane{2};
-  }
-
-  def _S : NeonI_LDN_Lane<r, 0b10, op0,
-                          !cast<RegisterOperand>(List # "S_operand"),
-                          neon_uimm2_bare, asmop> {
-    let Inst{12-10} = {lane{0}, 0b0, 0b0};
-    let Inst{30} = lane{1};
-  }
-
-  def _D : NeonI_LDN_Lane<r, 0b10, op0,
-                          !cast<RegisterOperand>(List # "D_operand"),
-                          neon_uimm1_bare, asmop> {
-    let Inst{12-10} = 0b001;
-    let Inst{30} = lane{0};
-  }
-}
-
-// Load single 1-element structure to one lane of 1 register.
-defm LD1LN : LDN_Lane_BHSD<0b0, 0b0, "VOne", "ld1">;
-
-// Load single N-element structure to one lane of N consecutive registers
-// (N = 2,3,4)
-defm LD2LN : LDN_Lane_BHSD<0b1, 0b0, "VPair", "ld2">;
-defm LD3LN : LDN_Lane_BHSD<0b0, 0b1, "VTriple", "ld3">;
-defm LD4LN : LDN_Lane_BHSD<0b1, 0b1, "VQuad", "ld4">;
-
-multiclass LD1LN_patterns<ValueType VTy, ValueType VTy2, ValueType DTy,
-                          Operand ImmOp, Operand ImmOp2, PatFrag LoadOp,
-                          Instruction INST> {
-  def : Pat<(VTy (vector_insert (VTy VPR64:$src),
-                     (DTy (LoadOp GPR64xsp:$Rn)), (ImmOp:$lane))),
-            (VTy (EXTRACT_SUBREG
-                     (INST GPR64xsp:$Rn,
-                           (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64),
-                           ImmOp:$lane),
-                     sub_64))>;
-
-  def : Pat<(VTy2 (vector_insert (VTy2 VPR128:$src),
-                      (DTy (LoadOp GPR64xsp:$Rn)), (ImmOp2:$lane))),
-            (VTy2 (INST GPR64xsp:$Rn, VPR128:$src, ImmOp2:$lane))>;
-}
-
-// Match all LD1LN instructions
-defm : LD1LN_patterns<v8i8, v16i8, i32, neon_uimm3_bare, neon_uimm4_bare,
-                      extloadi8, LD1LN_B>;
-
-defm : LD1LN_patterns<v4i16, v8i16, i32, neon_uimm2_bare, neon_uimm3_bare,
-                      extloadi16, LD1LN_H>;
-
-defm : LD1LN_patterns<v2i32, v4i32, i32, neon_uimm1_bare, neon_uimm2_bare,
-                      load, LD1LN_S>;
-defm : LD1LN_patterns<v2f32, v4f32, f32, neon_uimm1_bare, neon_uimm2_bare,
-                      load, LD1LN_S>;
-
-defm : LD1LN_patterns<v1i64, v2i64, i64, neon_uimm0_bare, neon_uimm1_bare,
-                      load, LD1LN_D>;
-defm : LD1LN_patterns<v1f64, v2f64, f64, neon_uimm0_bare, neon_uimm1_bare,
-                      load, LD1LN_D>;
-
-class NeonI_STN_Lane<bit r, bits<2> op2_1, bit op0, RegisterOperand VList,
-                     Operand ImmOp, string asmop>
-    : NeonI_LdStOne_Lane<0, r, op2_1, op0,
-                         (outs), (ins GPR64xsp:$Rn, VList:$Rt, ImmOp:$lane),
-                         asmop # "\t$Rt[$lane], [$Rn]",
-                         [],
-                         NoItinerary>,
-      Sched<[WriteVecSt, ReadVecSt, ReadVecSt]> {
-  let mayStore = 1;
-  let neverHasSideEffects = 1;
-  let hasExtraDefRegAllocReq = 1;
-}
-
-multiclass STN_Lane_BHSD<bit r, bit op0, string List, string asmop> {
-  def _B : NeonI_STN_Lane<r, 0b00, op0,
-                          !cast<RegisterOperand>(List # "B_operand"),
-                          neon_uimm4_bare, asmop> {
-    let Inst{12-10} = lane{2-0};
-    let Inst{30} = lane{3};
-  }
-
-  def _H : NeonI_STN_Lane<r, 0b01, op0,
-                          !cast<RegisterOperand>(List # "H_operand"),
-                          neon_uimm3_bare, asmop> {
-    let Inst{12-10} = {lane{1}, lane{0}, 0b0};
-    let Inst{30} = lane{2};
-  }
-
-  def _S : NeonI_STN_Lane<r, 0b10, op0,
-                          !cast<RegisterOperand>(List # "S_operand"),
-                           neon_uimm2_bare, asmop> {
-    let Inst{12-10} = {lane{0}, 0b0, 0b0};
-    let Inst{30} = lane{1};
-  }
-
-  def _D : NeonI_STN_Lane<r, 0b10, op0,
-                          !cast<RegisterOperand>(List # "D_operand"),
-                          neon_uimm1_bare, asmop>{
-    let Inst{12-10} = 0b001;
-    let Inst{30} = lane{0};
-  }
-}
-
-// Store single 1-element structure from one lane of 1 register.
-defm ST1LN : STN_Lane_BHSD<0b0, 0b0, "VOne", "st1">;
-
-// Store single N-element structure from one lane of N consecutive registers
-// (N = 2,3,4)
-defm ST2LN : STN_Lane_BHSD<0b1, 0b0, "VPair", "st2">;
-defm ST3LN : STN_Lane_BHSD<0b0, 0b1, "VTriple", "st3">;
-defm ST4LN : STN_Lane_BHSD<0b1, 0b1, "VQuad", "st4">;
-
-multiclass ST1LN_patterns<ValueType VTy, ValueType VTy2, ValueType DTy,
-                          Operand ImmOp, Operand ImmOp2, PatFrag StoreOp,
-                          Instruction INST> {
-  def : Pat<(StoreOp (DTy (vector_extract (VTy VPR64:$Rt), ImmOp:$lane)),
-                     GPR64xsp:$Rn),
-            (INST GPR64xsp:$Rn,
-                  (SUBREG_TO_REG (i64 0), VPR64:$Rt, sub_64),
-                  ImmOp:$lane)>;
-
-  def : Pat<(StoreOp (DTy (vector_extract (VTy2 VPR128:$Rt), ImmOp2:$lane)),
-                     GPR64xsp:$Rn),
-            (INST GPR64xsp:$Rn, VPR128:$Rt, ImmOp2:$lane)>;
-}
-
-// Match all ST1LN instructions
-defm : ST1LN_patterns<v8i8, v16i8, i32, neon_uimm3_bare, neon_uimm4_bare,
-                      truncstorei8, ST1LN_B>;
-
-defm : ST1LN_patterns<v4i16, v8i16, i32, neon_uimm2_bare, neon_uimm3_bare,
-                      truncstorei16, ST1LN_H>;
-
-defm : ST1LN_patterns<v2i32, v4i32, i32, neon_uimm1_bare, neon_uimm2_bare,
-                      store, ST1LN_S>;
-defm : ST1LN_patterns<v2f32, v4f32, f32, neon_uimm1_bare, neon_uimm2_bare,
-                      store, ST1LN_S>;
-
-defm : ST1LN_patterns<v1i64, v2i64, i64, neon_uimm0_bare, neon_uimm1_bare,
-                      store, ST1LN_D>;
-defm : ST1LN_patterns<v1f64, v2f64, f64, neon_uimm0_bare, neon_uimm1_bare,
-                      store, ST1LN_D>;
-
-// End of vector load/store single N-element structure (class SIMD lsone).
-
-
-// The following are post-index load/store single N-element instructions
-// (class SIMD lsone-post)
-
-multiclass NeonI_LDN_WB_Dup<bit q, bit r, bits<3> opcode, bits<2> size,
-                            RegisterOperand VecList, Operand ImmTy,
-                            string asmop> {
-  let mayLoad = 1, neverHasSideEffects = 1, Constraints = "$wb = $Rn",
-  DecoderMethod = "DecodeVLDSTLanePostInstruction" in {
-    def _fixed : NeonI_LdOne_Dup_Post<q, r, opcode, size,
-                      (outs VecList:$Rt, GPR64xsp:$wb),
-                      (ins GPR64xsp:$Rn, ImmTy:$amt),
-                      asmop # "\t$Rt, [$Rn], $amt",
-                      [],
-                      NoItinerary>,
-                 Sched<[WriteVecLd, WriteVecLd, ReadVecLd]> {
-      let Rm = 0b11111;
-    }
-
-    def _register : NeonI_LdOne_Dup_Post<q, r, opcode, size,
-                      (outs VecList:$Rt, GPR64xsp:$wb),
-                      (ins GPR64xsp:$Rn, GPR64noxzr:$Rm),
-                      asmop # "\t$Rt, [$Rn], $Rm",
-                      [],
-                      NoItinerary>,
-                    Sched<[WriteVecLd, WriteVecLd, ReadVecLd, ReadVecLd]>;
-  }
-}
-
-multiclass LDWB_Dup_BHSD<bit r, bits<3> opcode, string List, string asmop,
-                         Operand uimm_b, Operand uimm_h,
-                         Operand uimm_s, Operand uimm_d> {
-  defm _8B : NeonI_LDN_WB_Dup<0, r, opcode, 0b00,
-                              !cast<RegisterOperand>(List # "8B_operand"),
-                              uimm_b, asmop>;
-
-  defm _4H : NeonI_LDN_WB_Dup<0, r, opcode, 0b01,
-                              !cast<RegisterOperand>(List # "4H_operand"),
-                              uimm_h, asmop>;
-
-  defm _2S : NeonI_LDN_WB_Dup<0, r, opcode, 0b10,
-                              !cast<RegisterOperand>(List # "2S_operand"),
-                              uimm_s, asmop>;
-
-  defm _1D : NeonI_LDN_WB_Dup<0, r, opcode, 0b11,
-                              !cast<RegisterOperand>(List # "1D_operand"),
-                              uimm_d, asmop>;
-
-  defm _16B : NeonI_LDN_WB_Dup<1, r, opcode, 0b00,
-                               !cast<RegisterOperand>(List # "16B_operand"),
-                               uimm_b, asmop>;
-
-  defm _8H : NeonI_LDN_WB_Dup<1, r, opcode, 0b01,
-                              !cast<RegisterOperand>(List # "8H_operand"),
-                              uimm_h, asmop>;
-
-  defm _4S : NeonI_LDN_WB_Dup<1, r, opcode, 0b10,
-                              !cast<RegisterOperand>(List # "4S_operand"),
-                              uimm_s, asmop>;
-
-  defm _2D : NeonI_LDN_WB_Dup<1, r, opcode, 0b11,
-                              !cast<RegisterOperand>(List # "2D_operand"),
-                              uimm_d, asmop>;
-}
-
-// Post-index load single 1-element structure to all lanes of 1 register
-defm LD1R_WB : LDWB_Dup_BHSD<0b0, 0b110, "VOne", "ld1r", uimm_exact1,
-                             uimm_exact2, uimm_exact4, uimm_exact8>;
-
-// Post-index load single N-element structure to all lanes of N consecutive
-// registers (N = 2,3,4)
-defm LD2R_WB : LDWB_Dup_BHSD<0b1, 0b110, "VPair", "ld2r", uimm_exact2,
-                             uimm_exact4, uimm_exact8, uimm_exact16>;
-defm LD3R_WB : LDWB_Dup_BHSD<0b0, 0b111, "VTriple", "ld3r", uimm_exact3,
-                             uimm_exact6, uimm_exact12, uimm_exact24>;
-defm LD4R_WB : LDWB_Dup_BHSD<0b1, 0b111, "VQuad", "ld4r", uimm_exact4,
-                             uimm_exact8, uimm_exact16, uimm_exact32>;
-
-let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1,
-    Constraints = "$Rn = $wb, $Rt = $src",
-    DecoderMethod = "DecodeVLDSTLanePostInstruction" in {
-  class LDN_WBFx_Lane<bit r, bits<2> op2_1, bit op0, RegisterOperand VList,
-                                Operand ImmTy, Operand ImmOp, string asmop>
-      : NeonI_LdStOne_Lane_Post<1, r, op2_1, op0,
-                                (outs VList:$Rt, GPR64xsp:$wb),
-                                (ins GPR64xsp:$Rn, ImmTy:$amt,
-                                    VList:$src, ImmOp:$lane),
-                                asmop # "\t$Rt[$lane], [$Rn], $amt",
-                                [],
-                                NoItinerary>,
-        Sched<[WriteVecLd, WriteVecLd, ReadVecLd, ReadVecLd]> {
-    let Rm = 0b11111;
-  }
-
-  class LDN_WBReg_Lane<bit r, bits<2> op2_1, bit op0, RegisterOperand VList,
-                                 Operand ImmTy, Operand ImmOp, string asmop>
-      : NeonI_LdStOne_Lane_Post<1, r, op2_1, op0,
-                                (outs VList:$Rt, GPR64xsp:$wb),
-                                (ins GPR64xsp:$Rn, GPR64noxzr:$Rm,
-                                    VList:$src, ImmOp:$lane),
-                                asmop # "\t$Rt[$lane], [$Rn], $Rm",
-                                [],
-                                NoItinerary>,
-        Sched<[WriteVecLd, WriteVecLd, ReadVecLd, ReadVecLd, ReadVecLd]>;
-}
-
-multiclass LD_Lane_WB_BHSD<bit r, bit op0, string List, string asmop,
-                           Operand uimm_b, Operand uimm_h,
-                           Operand uimm_s, Operand uimm_d> {
-  def _B_fixed : LDN_WBFx_Lane<r, 0b00, op0,
-                               !cast<RegisterOperand>(List # "B_operand"),
-                               uimm_b, neon_uimm4_bare, asmop> {
-    let Inst{12-10} = lane{2-0};
-    let Inst{30} = lane{3};
-  }
-
-  def _B_register : LDN_WBReg_Lane<r, 0b00, op0,
-                                   !cast<RegisterOperand>(List # "B_operand"),
-                                   uimm_b, neon_uimm4_bare, asmop> {
-    let Inst{12-10} = lane{2-0};
-    let Inst{30} = lane{3};
-  }
-
-  def _H_fixed : LDN_WBFx_Lane<r, 0b01, op0,
-                               !cast<RegisterOperand>(List # "H_operand"),
-                               uimm_h, neon_uimm3_bare, asmop> {
-    let Inst{12-10} = {lane{1}, lane{0}, 0b0};
-    let Inst{30} = lane{2};
-  }
-
-  def _H_register : LDN_WBReg_Lane<r, 0b01, op0,
-                                   !cast<RegisterOperand>(List # "H_operand"),
-                                   uimm_h, neon_uimm3_bare, asmop> {
-    let Inst{12-10} = {lane{1}, lane{0}, 0b0};
-    let Inst{30} = lane{2};
-  }
-
-  def _S_fixed : LDN_WBFx_Lane<r, 0b10, op0,
-                               !cast<RegisterOperand>(List # "S_operand"),
-                               uimm_s, neon_uimm2_bare, asmop> {
-    let Inst{12-10} = {lane{0}, 0b0, 0b0};
-    let Inst{30} = lane{1};
-  }
-
-  def _S_register : LDN_WBReg_Lane<r, 0b10, op0,
-                                   !cast<RegisterOperand>(List # "S_operand"),
-                                   uimm_s, neon_uimm2_bare, asmop> {
-    let Inst{12-10} = {lane{0}, 0b0, 0b0};
-    let Inst{30} = lane{1};
-  }
-
-  def _D_fixed : LDN_WBFx_Lane<r, 0b10, op0,
-                               !cast<RegisterOperand>(List # "D_operand"),
-                               uimm_d, neon_uimm1_bare, asmop> {
-    let Inst{12-10} = 0b001;
-    let Inst{30} = lane{0};
-  }
-
-  def _D_register : LDN_WBReg_Lane<r, 0b10, op0,
-                                   !cast<RegisterOperand>(List # "D_operand"),
-                                   uimm_d, neon_uimm1_bare, asmop> {
-    let Inst{12-10} = 0b001;
-    let Inst{30} = lane{0};
-  }
-}
-
-// Post-index load single 1-element structure to one lane of 1 register.
-defm LD1LN_WB : LD_Lane_WB_BHSD<0b0, 0b0, "VOne", "ld1", uimm_exact1,
-                                uimm_exact2, uimm_exact4, uimm_exact8>;
-
-// Post-index load single N-element structure to one lane of N consecutive
-// registers
-// (N = 2,3,4)
-defm LD2LN_WB : LD_Lane_WB_BHSD<0b1, 0b0, "VPair", "ld2", uimm_exact2,
-                                uimm_exact4, uimm_exact8, uimm_exact16>;
-defm LD3LN_WB : LD_Lane_WB_BHSD<0b0, 0b1, "VTriple", "ld3", uimm_exact3,
-                                uimm_exact6, uimm_exact12, uimm_exact24>;
-defm LD4LN_WB : LD_Lane_WB_BHSD<0b1, 0b1, "VQuad", "ld4", uimm_exact4,
-                                uimm_exact8, uimm_exact16, uimm_exact32>;
-
-let mayStore = 1, neverHasSideEffects = 1,
-    hasExtraDefRegAllocReq = 1, Constraints = "$Rn = $wb",
-    DecoderMethod = "DecodeVLDSTLanePostInstruction" in {
-  class STN_WBFx_Lane<bit r, bits<2> op2_1, bit op0, RegisterOperand VList,
-                      Operand ImmTy, Operand ImmOp, string asmop>
-      : NeonI_LdStOne_Lane_Post<0, r, op2_1, op0,
-                                (outs GPR64xsp:$wb),
-                                (ins GPR64xsp:$Rn, ImmTy:$amt,
-                                    VList:$Rt, ImmOp:$lane),
-                                asmop # "\t$Rt[$lane], [$Rn], $amt",
-                                [],
-                                NoItinerary>,
-        Sched<[WriteVecSt, ReadVecSt, ReadVecSt]> {
-    let Rm = 0b11111;
-  }
-
-  class STN_WBReg_Lane<bit r, bits<2> op2_1, bit op0, RegisterOperand VList,
-                       Operand ImmTy, Operand ImmOp, string asmop>
-      : NeonI_LdStOne_Lane_Post<0, r, op2_1, op0,
-                                (outs GPR64xsp:$wb),
-                                (ins GPR64xsp:$Rn, GPR64noxzr:$Rm, VList:$Rt,
-                                    ImmOp:$lane),
-                                asmop # "\t$Rt[$lane], [$Rn], $Rm",
-                                [],
-                                NoItinerary>,
-        Sched<[WriteVecSt, ReadVecSt, ReadVecSt, ReadVecSt]>;
-}
-
-multiclass ST_Lane_WB_BHSD<bit r, bit op0, string List, string asmop,
-                           Operand uimm_b, Operand uimm_h,
-                           Operand uimm_s, Operand uimm_d> {
-  def _B_fixed : STN_WBFx_Lane<r, 0b00, op0,
-                               !cast<RegisterOperand>(List # "B_operand"),
-                               uimm_b, neon_uimm4_bare, asmop> {
-    let Inst{12-10} = lane{2-0};
-    let Inst{30} = lane{3};
-  }
-
-  def _B_register : STN_WBReg_Lane<r, 0b00, op0,
-                                   !cast<RegisterOperand>(List # "B_operand"),
-                                   uimm_b, neon_uimm4_bare, asmop> {
-    let Inst{12-10} = lane{2-0};
-    let Inst{30} = lane{3};
-  }
-
-  def _H_fixed : STN_WBFx_Lane<r, 0b01, op0,
-                               !cast<RegisterOperand>(List # "H_operand"),
-                               uimm_h, neon_uimm3_bare, asmop> {
-    let Inst{12-10} = {lane{1}, lane{0}, 0b0};
-    let Inst{30} = lane{2};
-  }
-
-  def _H_register : STN_WBReg_Lane<r, 0b01, op0,
-                                   !cast<RegisterOperand>(List # "H_operand"),
-                                   uimm_h, neon_uimm3_bare, asmop> {
-    let Inst{12-10} = {lane{1}, lane{0}, 0b0};
-    let Inst{30} = lane{2};
-  }
-
-  def _S_fixed : STN_WBFx_Lane<r, 0b10, op0,
-                               !cast<RegisterOperand>(List # "S_operand"),
-                               uimm_s, neon_uimm2_bare, asmop> {
-    let Inst{12-10} = {lane{0}, 0b0, 0b0};
-    let Inst{30} = lane{1};
-  }
-
-  def _S_register : STN_WBReg_Lane<r, 0b10, op0,
-                                   !cast<RegisterOperand>(List # "S_operand"),
-                                   uimm_s, neon_uimm2_bare, asmop> {
-    let Inst{12-10} = {lane{0}, 0b0, 0b0};
-    let Inst{30} = lane{1};
-  }
-
-  def _D_fixed : STN_WBFx_Lane<r, 0b10, op0,
-                               !cast<RegisterOperand>(List # "D_operand"),
-                               uimm_d, neon_uimm1_bare, asmop> {
-    let Inst{12-10} = 0b001;
-    let Inst{30} = lane{0};
-  }
-
-  def _D_register : STN_WBReg_Lane<r, 0b10, op0,
-                                   !cast<RegisterOperand>(List # "D_operand"),
-                                   uimm_d, neon_uimm1_bare, asmop> {
-    let Inst{12-10} = 0b001;
-    let Inst{30} = lane{0};
-  }
-}
-
-// Post-index store single 1-element structure from one lane of 1 register.
-defm ST1LN_WB : ST_Lane_WB_BHSD<0b0, 0b0, "VOne", "st1", uimm_exact1,
-                                uimm_exact2, uimm_exact4, uimm_exact8>;
-
-// Post-index store single N-element structure from one lane of N consecutive
-// registers (N = 2,3,4)
-defm ST2LN_WB : ST_Lane_WB_BHSD<0b1, 0b0, "VPair", "st2", uimm_exact2,
-                                uimm_exact4, uimm_exact8, uimm_exact16>;
-defm ST3LN_WB : ST_Lane_WB_BHSD<0b0, 0b1, "VTriple", "st3", uimm_exact3,
-                                uimm_exact6, uimm_exact12, uimm_exact24>;
-defm ST4LN_WB : ST_Lane_WB_BHSD<0b1, 0b1, "VQuad", "st4", uimm_exact4,
-                                uimm_exact8, uimm_exact16, uimm_exact32>;
-
-// End of post-index load/store single N-element instructions
-// (class SIMD lsone-post)
-
-// Neon Scalar instructions implementation
-// Scalar Three Same
-
-class NeonI_Scalar3Same_size<bit u, bits<2> size, bits<5> opcode, string asmop,
-                             RegisterClass FPRC>
-  : NeonI_Scalar3Same<u, size, opcode,
-                      (outs FPRC:$Rd), (ins FPRC:$Rn, FPRC:$Rm),
-                      !strconcat(asmop, "\t$Rd, $Rn, $Rm"),
-                      [],
-                      NoItinerary>,
-    Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
-
-class NeonI_Scalar3Same_D_size<bit u, bits<5> opcode, string asmop>
-  : NeonI_Scalar3Same_size<u, 0b11, opcode, asmop, FPR64>;
-
-multiclass NeonI_Scalar3Same_HS_sizes<bit u, bits<5> opcode, string asmop,
-                                      bit Commutable = 0> {
-  let isCommutable = Commutable in {
-    def hhh : NeonI_Scalar3Same_size<u, 0b01, opcode, asmop, FPR16>;
-    def sss : NeonI_Scalar3Same_size<u, 0b10, opcode, asmop, FPR32>;
-  }
-}
-
-multiclass NeonI_Scalar3Same_SD_sizes<bit u, bit size_high, bits<5> opcode,
-                                      string asmop, bit Commutable = 0> {
-  let isCommutable = Commutable in {
-    def sss : NeonI_Scalar3Same_size<u, {size_high, 0b0}, opcode, asmop, FPR32>;
-    def ddd : NeonI_Scalar3Same_size<u, {size_high, 0b1}, opcode, asmop, FPR64>;
-  }
-}
-
-multiclass NeonI_Scalar3Same_BHSD_sizes<bit u, bits<5> opcode,
-                                        string asmop, bit Commutable = 0> {
-  let isCommutable = Commutable in {
-    def bbb : NeonI_Scalar3Same_size<u, 0b00, opcode, asmop, FPR8>;
-    def hhh : NeonI_Scalar3Same_size<u, 0b01, opcode, asmop, FPR16>;
-    def sss : NeonI_Scalar3Same_size<u, 0b10, opcode, asmop, FPR32>;
-    def ddd : NeonI_Scalar3Same_size<u, 0b11, opcode, asmop, FPR64>;
-  }
-}
-
-multiclass Neon_Scalar3Same_D_size_patterns<SDPatternOperator opnode,
-                                            Instruction INSTD> {
-  def : Pat<(v1i64 (opnode (v1i64 FPR64:$Rn), (v1i64 FPR64:$Rm))),
-            (INSTD FPR64:$Rn, FPR64:$Rm)>;
-}
-
-multiclass Neon_Scalar3Same_BHSD_size_patterns<SDPatternOperator opnode,
-                                               Instruction INSTB,
-                                               Instruction INSTH,
-                                               Instruction INSTS,
-                                               Instruction INSTD>
-  : Neon_Scalar3Same_D_size_patterns<opnode, INSTD> {
-  def: Pat<(v1i8 (opnode (v1i8 FPR8:$Rn), (v1i8 FPR8:$Rm))),
-           (INSTB FPR8:$Rn, FPR8:$Rm)>;
-  def: Pat<(v1i16 (opnode (v1i16 FPR16:$Rn), (v1i16 FPR16:$Rm))),
-           (INSTH FPR16:$Rn, FPR16:$Rm)>;
-  def: Pat<(v1i32 (opnode (v1i32 FPR32:$Rn), (v1i32 FPR32:$Rm))),
-           (INSTS FPR32:$Rn, FPR32:$Rm)>;
-}
-
-multiclass Neon_Scalar3Same_HS_size_patterns<SDPatternOperator opnode,
-                                             Instruction INSTH,
-                                             Instruction INSTS> {
-  def : Pat<(v1i16 (opnode (v1i16 FPR16:$Rn), (v1i16 FPR16:$Rm))),
-            (INSTH FPR16:$Rn, FPR16:$Rm)>;
-  def : Pat<(v1i32 (opnode (v1i32 FPR32:$Rn), (v1i32 FPR32:$Rm))),
-            (INSTS FPR32:$Rn, FPR32:$Rm)>;
-}
-
-multiclass Neon_Scalar3Same_SD_size_patterns<SDPatternOperator opnode,
-                                             ValueType SResTy, ValueType STy,
-                                             Instruction INSTS, ValueType DResTy,
-                                             ValueType DTy, Instruction INSTD> {
-  def : Pat<(SResTy (opnode (STy FPR32:$Rn), (STy FPR32:$Rm))),
-            (INSTS FPR32:$Rn, FPR32:$Rm)>;
-  def : Pat<(DResTy (opnode (DTy FPR64:$Rn), (DTy FPR64:$Rm))),
-            (INSTD FPR64:$Rn, FPR64:$Rm)>;
-}
-
-class Neon_Scalar3Same_cmp_V1_D_size_patterns<CondCode CC,
-                                              Instruction INSTD>
-  : Pat<(v1i64 (Neon_cmp (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm), CC)),
-        (INSTD FPR64:$Rn, FPR64:$Rm)>;
-
-// Scalar Three Different
-
-class NeonI_Scalar3Diff_size<bit u, bits<2> size, bits<4> opcode, string asmop,
-                             RegisterClass FPRCD, RegisterClass FPRCS>
-  : NeonI_Scalar3Diff<u, size, opcode,
-                      (outs FPRCD:$Rd), (ins FPRCS:$Rn, FPRCS:$Rm),
-                      !strconcat(asmop, "\t$Rd, $Rn, $Rm"),
-                      [],
-                      NoItinerary>,
-    Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
-
-multiclass NeonI_Scalar3Diff_HS_size<bit u, bits<4> opcode, string asmop> {
-  def shh : NeonI_Scalar3Diff_size<u, 0b01, opcode, asmop, FPR32, FPR16>;
-  def dss : NeonI_Scalar3Diff_size<u, 0b10, opcode, asmop, FPR64, FPR32>;
-}
-
-multiclass NeonI_Scalar3Diff_ml_HS_size<bit u, bits<4> opcode, string asmop> {
-  let Constraints = "$Src = $Rd" in {
-    def shh : NeonI_Scalar3Diff<u, 0b01, opcode,
-                       (outs FPR32:$Rd), (ins FPR32:$Src, FPR16:$Rn, FPR16:$Rm),
-                       !strconcat(asmop, "\t$Rd, $Rn, $Rm"),
-                       [],
-                       NoItinerary>,
-              Sched<[WriteFPALU, ReadFPALU, ReadFPALU, ReadFPALU]>;
-    def dss : NeonI_Scalar3Diff<u, 0b10, opcode,
-                       (outs FPR64:$Rd), (ins FPR64:$Src, FPR32:$Rn, FPR32:$Rm),
-                       !strconcat(asmop, "\t$Rd, $Rn, $Rm"),
-                       [],
-                       NoItinerary>,
-              Sched<[WriteFPALU, ReadFPALU, ReadFPALU, ReadFPALU]>;
-  }
-}
-
-multiclass Neon_Scalar3Diff_HS_size_patterns<SDPatternOperator opnode,
-                                             Instruction INSTH,
-                                             Instruction INSTS> {
-  def : Pat<(v1i32 (opnode (v1i16 FPR16:$Rn), (v1i16 FPR16:$Rm))),
-            (INSTH FPR16:$Rn, FPR16:$Rm)>;
-  def : Pat<(v1i64 (opnode (v1i32 FPR32:$Rn), (v1i32 FPR32:$Rm))),
-            (INSTS FPR32:$Rn, FPR32:$Rm)>;
-}
-
-multiclass Neon_Scalar3Diff_ml_HS_size_patterns<SDPatternOperator opnode,
-                                             Instruction INSTH,
-                                             Instruction INSTS> {
-  def : Pat<(v1i32 (opnode (v1i32 FPR32:$Src), (v1i16 FPR16:$Rn), (v1i16 FPR16:$Rm))),
-            (INSTH FPR32:$Src, FPR16:$Rn, FPR16:$Rm)>;
-  def : Pat<(v1i64 (opnode (v1i64 FPR64:$Src), (v1i32 FPR32:$Rn), (v1i32 FPR32:$Rm))),
-            (INSTS FPR64:$Src, FPR32:$Rn, FPR32:$Rm)>;
-}
-
-// Scalar Two Registers Miscellaneous
-
-class NeonI_Scalar2SameMisc_size<bit u, bits<2> size, bits<5> opcode, string asmop,
-                             RegisterClass FPRCD, RegisterClass FPRCS>
-  : NeonI_Scalar2SameMisc<u, size, opcode,
-                          (outs FPRCD:$Rd), (ins FPRCS:$Rn),
-                          !strconcat(asmop, "\t$Rd, $Rn"),
-                          [],
-                          NoItinerary>,
-    Sched<[WriteFPALU, ReadFPALU]>;
-
-multiclass NeonI_Scalar2SameMisc_SD_size<bit u, bit size_high, bits<5> opcode,
-                                         string asmop> {
-  def ss : NeonI_Scalar2SameMisc_size<u, {size_high, 0b0}, opcode, asmop, FPR32,
-                                      FPR32>;
-  def dd : NeonI_Scalar2SameMisc_size<u, {size_high, 0b1}, opcode, asmop, FPR64,
-                                      FPR64>;
-}
-
-multiclass NeonI_Scalar2SameMisc_D_size<bit u, bits<5> opcode, string asmop> {
-  def dd : NeonI_Scalar2SameMisc_size<u, 0b11, opcode, asmop, FPR64, FPR64>;
-}
-
-multiclass NeonI_Scalar2SameMisc_BHSD_size<bit u, bits<5> opcode, string asmop>
-  : NeonI_Scalar2SameMisc_D_size<u, opcode, asmop> {
-  def bb : NeonI_Scalar2SameMisc_size<u, 0b00, opcode, asmop, FPR8, FPR8>;
-  def hh : NeonI_Scalar2SameMisc_size<u, 0b01, opcode, asmop, FPR16, FPR16>;
-  def ss : NeonI_Scalar2SameMisc_size<u, 0b10, opcode, asmop, FPR32, FPR32>;
-}
-
-class NeonI_Scalar2SameMisc_fcvtxn_D_size<bit u, bits<5> opcode, string asmop>
-  : NeonI_Scalar2SameMisc_size<u, 0b01, opcode, asmop, FPR32, FPR64>;
-
-multiclass NeonI_Scalar2SameMisc_narrow_HSD_size<bit u, bits<5> opcode,
-                                                 string asmop> {
-  def bh : NeonI_Scalar2SameMisc_size<u, 0b00, opcode, asmop, FPR8, FPR16>;
-  def hs : NeonI_Scalar2SameMisc_size<u, 0b01, opcode, asmop, FPR16, FPR32>;
-  def sd : NeonI_Scalar2SameMisc_size<u, 0b10, opcode, asmop, FPR32, FPR64>;
-}
-
-class NeonI_Scalar2SameMisc_accum_size<bit u, bits<2> size, bits<5> opcode,
-                                       string asmop, RegisterClass FPRC>
-  : NeonI_Scalar2SameMisc<u, size, opcode,
-                          (outs FPRC:$Rd), (ins FPRC:$Src, FPRC:$Rn),
-                          !strconcat(asmop, "\t$Rd, $Rn"),
-                          [],
-                          NoItinerary>,
-    Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
-
-multiclass NeonI_Scalar2SameMisc_accum_BHSD_size<bit u, bits<5> opcode,
-                                                 string asmop> {
-
-  let Constraints = "$Src = $Rd" in {
-    def bb : NeonI_Scalar2SameMisc_accum_size<u, 0b00, opcode, asmop, FPR8>;
-    def hh : NeonI_Scalar2SameMisc_accum_size<u, 0b01, opcode, asmop, FPR16>;
-    def ss : NeonI_Scalar2SameMisc_accum_size<u, 0b10, opcode, asmop, FPR32>;
-    def dd : NeonI_Scalar2SameMisc_accum_size<u, 0b11, opcode, asmop, FPR64>;
-  }
-}
-
-class Neon_Scalar2SameMisc_fcvtxn_D_size_patterns<SDPatternOperator opnode,
-                                                  Instruction INSTD>
-  : Pat<(f32 (opnode (f64 FPR64:$Rn))),
-        (INSTD FPR64:$Rn)>;
-
-multiclass Neon_Scalar2SameMisc_fcvt_SD_size_patterns<SDPatternOperator opnode,
-                                                      Instruction INSTS,
-                                                      Instruction INSTD> {
-  def : Pat<(v1i32 (opnode (f32 FPR32:$Rn))),
-            (INSTS FPR32:$Rn)>;
-  def : Pat<(v1i64 (opnode (f64 FPR64:$Rn))),
-            (INSTD FPR64:$Rn)>;
-}
-
-class Neon_Scalar2SameMisc_vcvt_D_size_patterns<SDPatternOperator opnode,
-                                                Instruction INSTD>
-  : Pat<(v1i64 (opnode (v1f64 FPR64:$Rn))),
-            (INSTD FPR64:$Rn)>;
-
-multiclass Neon_Scalar2SameMisc_cvt_SD_size_patterns<SDPatternOperator opnode,
-                                                     Instruction INSTS,
-                                                     Instruction INSTD> {
-  def : Pat<(f32 (opnode (v1i32 FPR32:$Rn))),
-            (INSTS FPR32:$Rn)>;
-  def : Pat<(f64 (opnode (v1i64 FPR64:$Rn))),
-            (INSTD FPR64:$Rn)>;
-}
-
-multiclass Neon_Scalar2SameMisc_SD_size_patterns<SDPatternOperator opnode,
-                                                 Instruction INSTS,
-                                                 Instruction INSTD> {
-  def : Pat<(f32 (opnode (f32 FPR32:$Rn))),
-            (INSTS FPR32:$Rn)>;
-  def : Pat<(f64 (opnode (f64 FPR64:$Rn))),
-            (INSTD FPR64:$Rn)>;
-}
-
-class Neon_Scalar2SameMisc_V1_D_size_patterns<SDPatternOperator opnode,
-                                              Instruction INSTD>
-  : Pat<(v1f64 (opnode (v1f64 FPR64:$Rn))),
-        (INSTD FPR64:$Rn)>;
-
-class NeonI_Scalar2SameMisc_cmpz_D_size<bit u, bits<5> opcode, string asmop>
-  : NeonI_Scalar2SameMisc<u, 0b11, opcode,
-                          (outs FPR64:$Rd), (ins FPR64:$Rn, neon_uimm0:$Imm),
-                          !strconcat(asmop, "\t$Rd, $Rn, $Imm"),
-                          [],
-                          NoItinerary>,
-    Sched<[WriteFPALU, ReadFPALU]>;
-
-multiclass NeonI_Scalar2SameMisc_cmpz_SD_size<bit u, bits<5> opcode,
-                                              string asmop> {
-  def ssi : NeonI_Scalar2SameMisc<u, 0b10, opcode,
-                           (outs FPR32:$Rd), (ins FPR32:$Rn, fpzz32:$FPImm),
-                           !strconcat(asmop, "\t$Rd, $Rn, $FPImm"),
-                           [],
-                           NoItinerary>,
-            Sched<[WriteFPALU, ReadFPALU]>;
-  def ddi : NeonI_Scalar2SameMisc<u, 0b11, opcode,
-                           (outs FPR64:$Rd), (ins FPR64:$Rn, fpzz32:$FPImm),
-                           !strconcat(asmop, "\t$Rd, $Rn, $FPImm"),
-                           [],
-                           NoItinerary>,
-            Sched<[WriteFPALU, ReadFPALU]>;
-}
-
-class Neon_Scalar2SameMisc_cmpz_D_size_patterns<SDPatternOperator opnode,
-                                                Instruction INSTD>
-  : Pat<(v1i64 (opnode (v1i64 FPR64:$Rn),
-                       (v1i64 (bitconvert (v8i8 Neon_AllZero))))),
-        (INSTD FPR64:$Rn, 0)>;
-
-class Neon_Scalar2SameMisc_cmpz_D_V1_size_patterns<CondCode CC,
-                                                   Instruction INSTD>
-  : Pat<(v1i64 (Neon_cmpz (v1i64 FPR64:$Rn),
-                          (i32 neon_uimm0:$Imm), CC)),
-        (INSTD FPR64:$Rn, neon_uimm0:$Imm)>;
-
-multiclass Neon_Scalar2SameMisc_cmpz_SD_size_patterns<SDPatternOperator opnode,
-                                                      CondCode CC,
-                                                      Instruction INSTS,
-                                                      Instruction INSTD> {
-  def : Pat<(v1i32 (opnode (f32 FPR32:$Rn), (f32 fpzz32:$FPImm))),
-            (INSTS FPR32:$Rn, fpzz32:$FPImm)>;
-  def : Pat<(v1i64 (opnode (f64 FPR64:$Rn), (f32 fpzz32:$FPImm))),
-            (INSTD FPR64:$Rn, fpzz32:$FPImm)>;
-  def : Pat<(v1i64 (Neon_cmpz (v1f64 FPR64:$Rn), (f32 fpzz32:$FPImm), CC)),
-            (INSTD FPR64:$Rn, fpzz32:$FPImm)>;
-}
-
-multiclass Neon_Scalar2SameMisc_D_size_patterns<SDPatternOperator opnode,
-                                                Instruction INSTD> {
-  def : Pat<(v1i64 (opnode (v1i64 FPR64:$Rn))),
-            (INSTD FPR64:$Rn)>;
-}
-
-multiclass Neon_Scalar2SameMisc_BHSD_size_patterns<SDPatternOperator opnode,
-                                                   Instruction INSTB,
-                                                   Instruction INSTH,
-                                                   Instruction INSTS,
-                                                   Instruction INSTD>
-  : Neon_Scalar2SameMisc_D_size_patterns<opnode, INSTD> {
-  def : Pat<(v1i8 (opnode (v1i8 FPR8:$Rn))),
-            (INSTB FPR8:$Rn)>;
-  def : Pat<(v1i16 (opnode (v1i16 FPR16:$Rn))),
-            (INSTH FPR16:$Rn)>;
-  def : Pat<(v1i32 (opnode (v1i32 FPR32:$Rn))),
-            (INSTS FPR32:$Rn)>;
-}
-
-multiclass Neon_Scalar2SameMisc_narrow_HSD_size_patterns<
-                                                       SDPatternOperator opnode,
-                                                       Instruction INSTH,
-                                                       Instruction INSTS,
-                                                       Instruction INSTD> {
-  def : Pat<(v1i8 (opnode (v1i16 FPR16:$Rn))),
-            (INSTH FPR16:$Rn)>;
-  def : Pat<(v1i16 (opnode (v1i32 FPR32:$Rn))),
-            (INSTS FPR32:$Rn)>;
-  def : Pat<(v1i32 (opnode (v1i64 FPR64:$Rn))),
-            (INSTD FPR64:$Rn)>;
-
-}
-
-multiclass Neon_Scalar2SameMisc_accum_BHSD_size_patterns<
-                                                       SDPatternOperator opnode,
-                                                       Instruction INSTB,
-                                                       Instruction INSTH,
-                                                       Instruction INSTS,
-                                                       Instruction INSTD> {
-  def : Pat<(v1i8 (opnode (v1i8 FPR8:$Src), (v1i8 FPR8:$Rn))),
-            (INSTB FPR8:$Src, FPR8:$Rn)>;
-  def : Pat<(v1i16 (opnode (v1i16 FPR16:$Src), (v1i16 FPR16:$Rn))),
-            (INSTH FPR16:$Src, FPR16:$Rn)>;
-  def : Pat<(v1i32 (opnode (v1i32 FPR32:$Src), (v1i32 FPR32:$Rn))),
-            (INSTS FPR32:$Src, FPR32:$Rn)>;
-  def : Pat<(v1i64 (opnode (v1i64 FPR64:$Src), (v1i64 FPR64:$Rn))),
-            (INSTD FPR64:$Src, FPR64:$Rn)>;
-}
-
-// Scalar Shift By Immediate
-
-class NeonI_ScalarShiftImm_size<bit u, bits<5> opcode, string asmop,
-                                RegisterClass FPRC, Operand ImmTy>
-  : NeonI_ScalarShiftImm<u, opcode,
-                         (outs FPRC:$Rd), (ins FPRC:$Rn, ImmTy:$Imm),
-                         !strconcat(asmop, "\t$Rd, $Rn, $Imm"),
-                         [], NoItinerary>,
-    Sched<[WriteFPALU, ReadFPALU]>;
-
-multiclass NeonI_ScalarShiftRightImm_D_size<bit u, bits<5> opcode,
-                                            string asmop> {
-  def ddi : NeonI_ScalarShiftImm_size<u, opcode, asmop, FPR64, shr_imm64> {
-    bits<6> Imm;
-    let Inst{22} = 0b1; // immh:immb = 1xxxxxx
-    let Inst{21-16} = Imm;
-  }
-}
-
-multiclass NeonI_ScalarShiftRightImm_BHSD_size<bit u, bits<5> opcode,
-                                               string asmop>
-  : NeonI_ScalarShiftRightImm_D_size<u, opcode, asmop> {
-  def bbi : NeonI_ScalarShiftImm_size<u, opcode, asmop, FPR8, shr_imm8> {
-    bits<3> Imm;
-    let Inst{22-19} = 0b0001; // immh:immb = 0001xxx
-    let Inst{18-16} = Imm;
-  }
-  def hhi : NeonI_ScalarShiftImm_size<u, opcode, asmop, FPR16, shr_imm16> {
-    bits<4> Imm;
-    let Inst{22-20} = 0b001; // immh:immb = 001xxxx
-    let Inst{19-16} = Imm;
-  }
-  def ssi : NeonI_ScalarShiftImm_size<u, opcode, asmop, FPR32, shr_imm32> {
-    bits<5> Imm;
-    let Inst{22-21} = 0b01; // immh:immb = 01xxxxx
-    let Inst{20-16} = Imm;
-  }
-}
-
-multiclass NeonI_ScalarShiftLeftImm_D_size<bit u, bits<5> opcode,
-                                            string asmop> {
-  def ddi : NeonI_ScalarShiftImm_size<u, opcode, asmop, FPR64, shl_imm64> {
-    bits<6> Imm;
-    let Inst{22} = 0b1; // immh:immb = 1xxxxxx
-    let Inst{21-16} = Imm;
-  }
-}
-
-multiclass NeonI_ScalarShiftLeftImm_BHSD_size<bit u, bits<5> opcode,
-                                              string asmop>
-  : NeonI_ScalarShiftLeftImm_D_size<u, opcode, asmop> {
-  def bbi : NeonI_ScalarShiftImm_size<u, opcode, asmop, FPR8, shl_imm8> {
-    bits<3> Imm;
-    let Inst{22-19} = 0b0001; // immh:immb = 0001xxx
-    let Inst{18-16} = Imm;
-  }
-  def hhi : NeonI_ScalarShiftImm_size<u, opcode, asmop, FPR16, shl_imm16> {
-    bits<4> Imm;
-    let Inst{22-20} = 0b001; // immh:immb = 001xxxx
-    let Inst{19-16} = Imm;
-  }
-  def ssi : NeonI_ScalarShiftImm_size<u, opcode, asmop, FPR32, shl_imm32> {
-    bits<5> Imm;
-    let Inst{22-21} = 0b01; // immh:immb = 01xxxxx
-    let Inst{20-16} = Imm;
-  }
-}
-
-class NeonI_ScalarShiftRightImm_accum_D_size<bit u, bits<5> opcode, string asmop>
-  : NeonI_ScalarShiftImm<u, opcode,
-                         (outs FPR64:$Rd),
-                         (ins FPR64:$Src, FPR64:$Rn, shr_imm64:$Imm),
-                         !strconcat(asmop, "\t$Rd, $Rn, $Imm"),
-                         [], NoItinerary>,
-    Sched<[WriteFPALU, ReadFPALU, ReadFPALU]> {
-    bits<6> Imm;
-    let Inst{22} = 0b1; // immh:immb = 1xxxxxx
-    let Inst{21-16} = Imm;
-    let Constraints = "$Src = $Rd";
-}
-
-class NeonI_ScalarShiftLeftImm_accum_D_size<bit u, bits<5> opcode, string asmop>
-  : NeonI_ScalarShiftImm<u, opcode,
-                         (outs FPR64:$Rd),
-                         (ins FPR64:$Src, FPR64:$Rn, shl_imm64:$Imm),
-                         !strconcat(asmop, "\t$Rd, $Rn, $Imm"),
-                         [], NoItinerary>,
-    Sched<[WriteFPALU, ReadFPALU, ReadFPALU]> {
-    bits<6> Imm;
-    let Inst{22} = 0b1; // immh:immb = 1xxxxxx
-    let Inst{21-16} = Imm;
-    let Constraints = "$Src = $Rd";
-}
-
-class NeonI_ScalarShiftImm_narrow_size<bit u, bits<5> opcode, string asmop,
-                                       RegisterClass FPRCD, RegisterClass FPRCS,
-                                       Operand ImmTy>
-  : NeonI_ScalarShiftImm<u, opcode,
-                         (outs FPRCD:$Rd), (ins FPRCS:$Rn, ImmTy:$Imm),
-                         !strconcat(asmop, "\t$Rd, $Rn, $Imm"),
-                         [], NoItinerary>,
-    Sched<[WriteFPALU, ReadFPALU]>;
-
-multiclass NeonI_ScalarShiftImm_narrow_HSD_size<bit u, bits<5> opcode,
-                                                string asmop> {
-  def bhi : NeonI_ScalarShiftImm_narrow_size<u, opcode, asmop, FPR8, FPR16,
-                                             shr_imm8> {
-    bits<3> Imm;
-    let Inst{22-19} = 0b0001; // immh:immb = 0001xxx
-    let Inst{18-16} = Imm;
-  }
-  def hsi : NeonI_ScalarShiftImm_narrow_size<u, opcode, asmop, FPR16, FPR32,
-                                             shr_imm16> {
-    bits<4> Imm;
-    let Inst{22-20} = 0b001; // immh:immb = 001xxxx
-    let Inst{19-16} = Imm;
-  }
-  def sdi : NeonI_ScalarShiftImm_narrow_size<u, opcode, asmop, FPR32, FPR64,
-                                             shr_imm32> {
-    bits<5> Imm;
-    let Inst{22-21} = 0b01; // immh:immb = 01xxxxx
-    let Inst{20-16} = Imm;
-  }
-}
-
-multiclass NeonI_ScalarShiftImm_cvt_SD_size<bit u, bits<5> opcode, string asmop> {
-  def ssi : NeonI_ScalarShiftImm_size<u, opcode, asmop, FPR32, shr_imm32> {
-    bits<5> Imm;
-    let Inst{22-21} = 0b01; // immh:immb = 01xxxxx
-    let Inst{20-16} = Imm;
-  }
-  def ddi : NeonI_ScalarShiftImm_size<u, opcode, asmop, FPR64, shr_imm64> {
-    bits<6> Imm;
-    let Inst{22} = 0b1; // immh:immb = 1xxxxxx
-    let Inst{21-16} = Imm;
-  }
-}
-
-multiclass Neon_ScalarShiftRImm_D_size_patterns<SDPatternOperator opnode,
-                                               Instruction INSTD> {
-  def ddi : Pat<(v1i64 (opnode (v1i64 FPR64:$Rn), (i32 shr_imm64:$Imm))),
-                (INSTD FPR64:$Rn, imm:$Imm)>;
-}
-
-multiclass Neon_ScalarShiftLImm_D_size_patterns<SDPatternOperator opnode,
-                                               Instruction INSTD> {
-  def ddi : Pat<(v1i64 (opnode (v1i64 FPR64:$Rn), (i32 shl_imm64:$Imm))),
-                (INSTD FPR64:$Rn, imm:$Imm)>;
-}
-
-class Neon_ScalarShiftLImm_V1_D_size_patterns<SDPatternOperator opnode,
-                                             Instruction INSTD>
-  : Pat<(v1i64 (opnode (v1i64 FPR64:$Rn),
-            (v1i64 (Neon_vdup (i32 shl_imm64:$Imm))))),
-        (INSTD FPR64:$Rn, imm:$Imm)>;
-
-class Neon_ScalarShiftRImm_V1_D_size_patterns<SDPatternOperator opnode,
-                                             Instruction INSTD>
-  : Pat<(v1i64 (opnode (v1i64 FPR64:$Rn),
-            (v1i64 (Neon_vdup (i32 shr_imm64:$Imm))))),
-        (INSTD FPR64:$Rn, imm:$Imm)>;
-
-multiclass Neon_ScalarShiftLImm_BHSD_size_patterns<SDPatternOperator opnode,
-                                                   Instruction INSTB,
-                                                   Instruction INSTH,
-                                                   Instruction INSTS,
-                                                   Instruction INSTD>
-  : Neon_ScalarShiftLImm_D_size_patterns<opnode, INSTD> {
-  def bbi : Pat<(v1i8 (opnode (v1i8 FPR8:$Rn), (i32 shl_imm8:$Imm))),
-                (INSTB FPR8:$Rn, imm:$Imm)>;
-  def hhi : Pat<(v1i16 (opnode (v1i16 FPR16:$Rn), (i32 shl_imm16:$Imm))),
-                (INSTH FPR16:$Rn, imm:$Imm)>;
-  def ssi : Pat<(v1i32 (opnode (v1i32 FPR32:$Rn), (i32 shl_imm32:$Imm))),
-                (INSTS FPR32:$Rn, imm:$Imm)>;
-}
-
-class Neon_ScalarShiftLImm_accum_D_size_patterns<SDPatternOperator opnode,
-                                                Instruction INSTD>
-  : Pat<(v1i64 (opnode (v1i64 FPR64:$Src), (v1i64 FPR64:$Rn),
-            (i32 shl_imm64:$Imm))),
-        (INSTD FPR64:$Src, FPR64:$Rn, imm:$Imm)>;
-
-class Neon_ScalarShiftRImm_accum_D_size_patterns<SDPatternOperator opnode,
-                                                Instruction INSTD>
-  : Pat<(v1i64 (opnode (v1i64 FPR64:$Src), (v1i64 FPR64:$Rn),
-            (i32 shr_imm64:$Imm))),
-        (INSTD FPR64:$Src, FPR64:$Rn, imm:$Imm)>;
-
-multiclass Neon_ScalarShiftImm_narrow_HSD_size_patterns<
-                                                       SDPatternOperator opnode,
-                                                       Instruction INSTH,
-                                                       Instruction INSTS,
-                                                       Instruction INSTD> {
-  def bhi : Pat<(v1i8 (opnode (v1i16 FPR16:$Rn), (i32 shr_imm16:$Imm))),
-                (INSTH FPR16:$Rn, imm:$Imm)>;
-  def hsi : Pat<(v1i16 (opnode (v1i32 FPR32:$Rn), (i32 shr_imm32:$Imm))),
-                (INSTS FPR32:$Rn, imm:$Imm)>;
-  def sdi : Pat<(v1i32 (opnode (v1i64 FPR64:$Rn), (i32 shr_imm64:$Imm))),
-                (INSTD FPR64:$Rn, imm:$Imm)>;
-}
-
-multiclass Neon_ScalarShiftImm_scvtf_SD_size_patterns<SDPatternOperator opnode,
-                                                      Instruction INSTS,
-                                                      Instruction INSTD> {
-  def ssi : Pat<(f32 (opnode (v1i32 FPR32:$Rn), (i32 shr_imm32:$Imm))),
-                (INSTS FPR32:$Rn, imm:$Imm)>;
-  def ddi : Pat<(f64 (opnode (v1i64 FPR64:$Rn), (i32 shr_imm64:$Imm))),
-                (INSTD FPR64:$Rn, imm:$Imm)>;
-}
-
-multiclass Neon_ScalarShiftImm_fcvts_SD_size_patterns<SDPatternOperator opnode,
-                                                      Instruction INSTS,
-                                                      Instruction INSTD> {
-  def ssi : Pat<(v1i32 (opnode (f32 FPR32:$Rn), (i32 shr_imm32:$Imm))),
-                (INSTS FPR32:$Rn, imm:$Imm)>;
-  def ddi : Pat<(v1i64 (opnode (f64 FPR64:$Rn), (i32 shr_imm64:$Imm))),
-                (INSTD FPR64:$Rn, imm:$Imm)>;
-}
-
-// Scalar Signed Shift Right (Immediate)
-defm SSHR : NeonI_ScalarShiftRightImm_D_size<0b0, 0b00000, "sshr">;
-defm : Neon_ScalarShiftRImm_D_size_patterns<int_aarch64_neon_vshrds_n, SSHRddi>;
-// Pattern to match llvm.arm.* intrinsic.
-def : Neon_ScalarShiftRImm_V1_D_size_patterns<sra, SSHRddi>;
-
-// Scalar Unsigned Shift Right (Immediate)
-defm USHR : NeonI_ScalarShiftRightImm_D_size<0b1, 0b00000, "ushr">;
-defm : Neon_ScalarShiftRImm_D_size_patterns<int_aarch64_neon_vshrdu_n, USHRddi>;
-// Pattern to match llvm.arm.* intrinsic.
-def : Neon_ScalarShiftRImm_V1_D_size_patterns<srl, USHRddi>;
-
-// Scalar Signed Rounding Shift Right (Immediate)
-defm SRSHR : NeonI_ScalarShiftRightImm_D_size<0b0, 0b00100, "srshr">;
-defm : Neon_ScalarShiftRImm_D_size_patterns<int_aarch64_neon_vsrshr, SRSHRddi>;
-
-// Scalar Unigned Rounding Shift Right (Immediate)
-defm URSHR : NeonI_ScalarShiftRightImm_D_size<0b1, 0b00100, "urshr">;
-defm : Neon_ScalarShiftRImm_D_size_patterns<int_aarch64_neon_vurshr, URSHRddi>;
-
-// Scalar Signed Shift Right and Accumulate (Immediate)
-def SSRA : NeonI_ScalarShiftRightImm_accum_D_size<0b0, 0b00010, "ssra">;
-def : Neon_ScalarShiftRImm_accum_D_size_patterns
-          <int_aarch64_neon_vsrads_n, SSRA>;
-
-// Scalar Unsigned Shift Right and Accumulate (Immediate)
-def USRA : NeonI_ScalarShiftRightImm_accum_D_size<0b1, 0b00010, "usra">;
-def : Neon_ScalarShiftRImm_accum_D_size_patterns
-          <int_aarch64_neon_vsradu_n, USRA>;
-
-// Scalar Signed Rounding Shift Right and Accumulate (Immediate)
-def SRSRA : NeonI_ScalarShiftRightImm_accum_D_size<0b0, 0b00110, "srsra">;
-def : Neon_ScalarShiftRImm_accum_D_size_patterns
-          <int_aarch64_neon_vrsrads_n, SRSRA>;
-
-// Scalar Unsigned Rounding Shift Right and Accumulate (Immediate)
-def URSRA : NeonI_ScalarShiftRightImm_accum_D_size<0b1, 0b00110, "ursra">;
-def : Neon_ScalarShiftRImm_accum_D_size_patterns
-          <int_aarch64_neon_vrsradu_n, URSRA>;
-
-// Scalar Shift Left (Immediate)
-defm SHL : NeonI_ScalarShiftLeftImm_D_size<0b0, 0b01010, "shl">;
-defm : Neon_ScalarShiftLImm_D_size_patterns<int_aarch64_neon_vshld_n, SHLddi>;
-// Pattern to match llvm.arm.* intrinsic.
-def : Neon_ScalarShiftLImm_V1_D_size_patterns<shl, SHLddi>;
-
-// Signed Saturating Shift Left (Immediate)
-defm SQSHL : NeonI_ScalarShiftLeftImm_BHSD_size<0b0, 0b01110, "sqshl">;
-defm : Neon_ScalarShiftLImm_BHSD_size_patterns<int_aarch64_neon_vqshls_n,
-                                               SQSHLbbi, SQSHLhhi,
-                                               SQSHLssi, SQSHLddi>;
-// Pattern to match llvm.arm.* intrinsic.
-defm : Neon_ScalarShiftLImm_D_size_patterns<Neon_sqrshlImm, SQSHLddi>;
-
-// Unsigned Saturating Shift Left (Immediate)
-defm UQSHL : NeonI_ScalarShiftLeftImm_BHSD_size<0b1, 0b01110, "uqshl">;
-defm : Neon_ScalarShiftLImm_BHSD_size_patterns<int_aarch64_neon_vqshlu_n,
-                                               UQSHLbbi, UQSHLhhi,
-                                               UQSHLssi, UQSHLddi>;
-// Pattern to match llvm.arm.* intrinsic.
-defm : Neon_ScalarShiftLImm_D_size_patterns<Neon_uqrshlImm, UQSHLddi>;
-
-// Signed Saturating Shift Left Unsigned (Immediate)
-defm SQSHLU : NeonI_ScalarShiftLeftImm_BHSD_size<0b1, 0b01100, "sqshlu">;
-defm : Neon_ScalarShiftLImm_BHSD_size_patterns<int_aarch64_neon_vsqshlu,
-                                               SQSHLUbbi, SQSHLUhhi,
-                                               SQSHLUssi, SQSHLUddi>;
-
-// Shift Right And Insert (Immediate)
-def SRI : NeonI_ScalarShiftRightImm_accum_D_size<0b1, 0b01000, "sri">;
-def : Neon_ScalarShiftRImm_accum_D_size_patterns
-          <int_aarch64_neon_vsri, SRI>;
-
-// Shift Left And Insert (Immediate)
-def SLI : NeonI_ScalarShiftLeftImm_accum_D_size<0b1, 0b01010, "sli">;
-def : Neon_ScalarShiftLImm_accum_D_size_patterns
-          <int_aarch64_neon_vsli, SLI>;
-
-// Signed Saturating Shift Right Narrow (Immediate)
-defm SQSHRN : NeonI_ScalarShiftImm_narrow_HSD_size<0b0, 0b10010, "sqshrn">;
-defm : Neon_ScalarShiftImm_narrow_HSD_size_patterns<int_aarch64_neon_vsqshrn,
-                                                    SQSHRNbhi, SQSHRNhsi,
-                                                    SQSHRNsdi>;
-
-// Unsigned Saturating Shift Right Narrow (Immediate)
-defm UQSHRN : NeonI_ScalarShiftImm_narrow_HSD_size<0b1, 0b10010, "uqshrn">;
-defm : Neon_ScalarShiftImm_narrow_HSD_size_patterns<int_aarch64_neon_vuqshrn,
-                                                    UQSHRNbhi, UQSHRNhsi,
-                                                    UQSHRNsdi>;
-
-// Signed Saturating Rounded Shift Right Narrow (Immediate)
-defm SQRSHRN : NeonI_ScalarShiftImm_narrow_HSD_size<0b0, 0b10011, "sqrshrn">;
-defm : Neon_ScalarShiftImm_narrow_HSD_size_patterns<int_aarch64_neon_vsqrshrn,
-                                                    SQRSHRNbhi, SQRSHRNhsi,
-                                                    SQRSHRNsdi>;
-
-// Unsigned Saturating Rounded Shift Right Narrow (Immediate)
-defm UQRSHRN : NeonI_ScalarShiftImm_narrow_HSD_size<0b1, 0b10011, "uqrshrn">;
-defm : Neon_ScalarShiftImm_narrow_HSD_size_patterns<int_aarch64_neon_vuqrshrn,
-                                                    UQRSHRNbhi, UQRSHRNhsi,
-                                                    UQRSHRNsdi>;
-
-// Signed Saturating Shift Right Unsigned Narrow (Immediate)
-defm SQSHRUN : NeonI_ScalarShiftImm_narrow_HSD_size<0b1, 0b10000, "sqshrun">;
-defm : Neon_ScalarShiftImm_narrow_HSD_size_patterns<int_aarch64_neon_vsqshrun,
-                                                    SQSHRUNbhi, SQSHRUNhsi,
-                                                    SQSHRUNsdi>;
-
-// Signed Saturating Rounded Shift Right Unsigned Narrow (Immediate)
-defm SQRSHRUN : NeonI_ScalarShiftImm_narrow_HSD_size<0b1, 0b10001, "sqrshrun">;
-defm : Neon_ScalarShiftImm_narrow_HSD_size_patterns<int_aarch64_neon_vsqrshrun,
-                                                    SQRSHRUNbhi, SQRSHRUNhsi,
-                                                    SQRSHRUNsdi>;
-
-// Scalar Signed Fixed-point Convert To Floating-Point (Immediate)
-defm SCVTF_N : NeonI_ScalarShiftImm_cvt_SD_size<0b0, 0b11100, "scvtf">;
-defm : Neon_ScalarShiftImm_scvtf_SD_size_patterns<int_aarch64_neon_vcvtfxs2fp_n,
-                                                  SCVTF_Nssi, SCVTF_Nddi>;
-
-// Scalar Unsigned Fixed-point Convert To Floating-Point (Immediate)
-defm UCVTF_N : NeonI_ScalarShiftImm_cvt_SD_size<0b1, 0b11100, "ucvtf">;
-defm : Neon_ScalarShiftImm_scvtf_SD_size_patterns<int_aarch64_neon_vcvtfxu2fp_n,
-                                                  UCVTF_Nssi, UCVTF_Nddi>;
-
-// Scalar Floating-point Convert To Signed Fixed-point (Immediate)
-defm FCVTZS_N : NeonI_ScalarShiftImm_cvt_SD_size<0b0, 0b11111, "fcvtzs">;
-defm : Neon_ScalarShiftImm_fcvts_SD_size_patterns<int_aarch64_neon_vcvtfp2fxs_n,
-                                                  FCVTZS_Nssi, FCVTZS_Nddi>;
-
-// Scalar Floating-point Convert To Unsigned Fixed-point (Immediate)
-defm FCVTZU_N : NeonI_ScalarShiftImm_cvt_SD_size<0b1, 0b11111, "fcvtzu">;
-defm : Neon_ScalarShiftImm_fcvts_SD_size_patterns<int_aarch64_neon_vcvtfp2fxu_n,
-                                                  FCVTZU_Nssi, FCVTZU_Nddi>;
-
-// Patterns For Convert Instructions Between v1f64 and v1i64
-class Neon_ScalarShiftImm_cvtf_v1f64_pattern<SDPatternOperator opnode,
-                                             Instruction INST>
-    : Pat<(v1f64 (opnode (v1i64 FPR64:$Rn), (i32 shr_imm64:$Imm))),
-          (INST FPR64:$Rn, imm:$Imm)>;
-
-class Neon_ScalarShiftImm_fcvt_v1f64_pattern<SDPatternOperator opnode,
-                                             Instruction INST>
-    : Pat<(v1i64 (opnode (v1f64 FPR64:$Rn), (i32 shr_imm64:$Imm))),
-          (INST FPR64:$Rn, imm:$Imm)>;
-
-def : Neon_ScalarShiftImm_cvtf_v1f64_pattern<int_arm_neon_vcvtfxs2fp,
-                                             SCVTF_Nddi>;
-
-def : Neon_ScalarShiftImm_cvtf_v1f64_pattern<int_arm_neon_vcvtfxu2fp,
-                                             UCVTF_Nddi>;
-
-def : Neon_ScalarShiftImm_fcvt_v1f64_pattern<int_arm_neon_vcvtfp2fxs,
-                                             FCVTZS_Nddi>;
-
-def : Neon_ScalarShiftImm_fcvt_v1f64_pattern<int_arm_neon_vcvtfp2fxu,
-                                             FCVTZU_Nddi>;
-
-// Scalar Integer Add
-let isCommutable = 1 in {
-def ADDddd : NeonI_Scalar3Same_D_size<0b0, 0b10000, "add">;
-}
-
-// Scalar Integer Sub
-def SUBddd : NeonI_Scalar3Same_D_size<0b1, 0b10000, "sub">;
-
-// Pattern for Scalar Integer Add and Sub with D register only
-defm : Neon_Scalar3Same_D_size_patterns<add, ADDddd>;
-defm : Neon_Scalar3Same_D_size_patterns<sub, SUBddd>;
-
-// Patterns to match llvm.aarch64.* intrinsic for Scalar Add, Sub
-defm : Neon_Scalar3Same_D_size_patterns<int_aarch64_neon_vaddds, ADDddd>;
-defm : Neon_Scalar3Same_D_size_patterns<int_aarch64_neon_vadddu, ADDddd>;
-defm : Neon_Scalar3Same_D_size_patterns<int_aarch64_neon_vsubds, SUBddd>;
-defm : Neon_Scalar3Same_D_size_patterns<int_aarch64_neon_vsubdu, SUBddd>;
-
-// Scalar Integer Saturating Add (Signed, Unsigned)
-defm SQADD : NeonI_Scalar3Same_BHSD_sizes<0b0, 0b00001, "sqadd", 1>;
-defm UQADD : NeonI_Scalar3Same_BHSD_sizes<0b1, 0b00001, "uqadd", 1>;
-
-// Scalar Integer Saturating Sub (Signed, Unsigned)
-defm SQSUB : NeonI_Scalar3Same_BHSD_sizes<0b0, 0b00101, "sqsub", 0>;
-defm UQSUB : NeonI_Scalar3Same_BHSD_sizes<0b1, 0b00101, "uqsub", 0>;
-
-
-// Patterns to match llvm.aarch64.* intrinsic for
-// Scalar Integer Saturating Add, Sub  (Signed, Unsigned)
-defm : Neon_Scalar3Same_BHSD_size_patterns<int_arm_neon_vqadds, SQADDbbb,
-                                           SQADDhhh, SQADDsss, SQADDddd>;
-defm : Neon_Scalar3Same_BHSD_size_patterns<int_arm_neon_vqaddu, UQADDbbb,
-                                           UQADDhhh, UQADDsss, UQADDddd>;
-defm : Neon_Scalar3Same_BHSD_size_patterns<int_arm_neon_vqsubs, SQSUBbbb,
-                                           SQSUBhhh, SQSUBsss, SQSUBddd>;
-defm : Neon_Scalar3Same_BHSD_size_patterns<int_arm_neon_vqsubu, UQSUBbbb,
-                                           UQSUBhhh, UQSUBsss, UQSUBddd>;
-
-// Scalar Integer Saturating Doubling Multiply Half High
-let SchedRW = [WriteFPMul, ReadFPMul, ReadFPMul, ReadFPMul] in
-defm SQDMULH : NeonI_Scalar3Same_HS_sizes<0b0, 0b10110, "sqdmulh", 1>;
-
-// Scalar Integer Saturating Rounding Doubling Multiply Half High
-let SchedRW = [WriteFPMul, ReadFPMul, ReadFPMul] in {
-defm SQRDMULH : NeonI_Scalar3Same_HS_sizes<0b1, 0b10110, "sqrdmulh", 1>;
-}
-
-// Patterns to match llvm.arm.* intrinsic for
-// Scalar Integer Saturating Doubling Multiply Half High and
-// Scalar Integer Saturating Rounding Doubling Multiply Half High
-defm : Neon_Scalar3Same_HS_size_patterns<int_arm_neon_vqdmulh, SQDMULHhhh,
-                                                               SQDMULHsss>;
-defm : Neon_Scalar3Same_HS_size_patterns<int_arm_neon_vqrdmulh, SQRDMULHhhh,
-                                                                SQRDMULHsss>;
-
-let SchedRW = [WriteFPMul, ReadFPMul, ReadFPMul, ReadFPMul] in {
-// Scalar Floating-point Multiply Extended
-defm FMULX : NeonI_Scalar3Same_SD_sizes<0b0, 0b0, 0b11011, "fmulx", 1>;
-}
-
-// Scalar Floating-point Reciprocal Step
-defm FRECPS : NeonI_Scalar3Same_SD_sizes<0b0, 0b0, 0b11111, "frecps", 0>;
-defm : Neon_Scalar3Same_SD_size_patterns<int_aarch64_neon_vrecps, f32, f32,
-                                         FRECPSsss, f64, f64, FRECPSddd>;
-def : Pat<(v1f64 (int_arm_neon_vrecps (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
-          (FRECPSddd FPR64:$Rn, FPR64:$Rm)>;
-
-// Scalar Floating-point Reciprocal Square Root Step
-defm FRSQRTS : NeonI_Scalar3Same_SD_sizes<0b0, 0b1, 0b11111, "frsqrts", 0>;
-defm : Neon_Scalar3Same_SD_size_patterns<int_aarch64_neon_vrsqrts, f32, f32,
-                                         FRSQRTSsss, f64, f64, FRSQRTSddd>;
-def : Pat<(v1f64 (int_arm_neon_vrsqrts (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
-          (FRSQRTSddd FPR64:$Rn, FPR64:$Rm)>;
-def : Pat<(v1f64 (fsqrt (v1f64 FPR64:$Rn))), (FSQRTdd FPR64:$Rn)>;
-
-// Patterns to match llvm.aarch64.* intrinsic for
-// Scalar Floating-point Multiply Extended,
-multiclass Neon_Scalar3Same_MULX_SD_size_patterns<SDPatternOperator opnode,
-                                                  Instruction INSTS,
-                                                  Instruction INSTD> {
-  def : Pat<(f32 (opnode (f32 FPR32:$Rn), (f32 FPR32:$Rm))),
-            (INSTS FPR32:$Rn, FPR32:$Rm)>;
-  def : Pat<(f64 (opnode (f64 FPR64:$Rn), (f64 FPR64:$Rm))),
-            (INSTD FPR64:$Rn, FPR64:$Rm)>;
-}
-
-defm : Neon_Scalar3Same_MULX_SD_size_patterns<int_aarch64_neon_vmulx,
-                                              FMULXsss, FMULXddd>;
-def : Pat<(v1f64 (int_aarch64_neon_vmulx (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
-          (FMULXddd FPR64:$Rn, FPR64:$Rm)>;
-
-// Scalar Integer Shift Left (Signed, Unsigned)
-def SSHLddd : NeonI_Scalar3Same_D_size<0b0, 0b01000, "sshl">;
-def USHLddd : NeonI_Scalar3Same_D_size<0b1, 0b01000, "ushl">;
-
-// Patterns to match llvm.arm.* intrinsic for
-// Scalar Integer Shift Left (Signed, Unsigned)
-defm : Neon_Scalar3Same_D_size_patterns<int_arm_neon_vshifts, SSHLddd>;
-defm : Neon_Scalar3Same_D_size_patterns<int_arm_neon_vshiftu, USHLddd>;
-
-// Patterns to match llvm.aarch64.* intrinsic for
-// Scalar Integer Shift Left (Signed, Unsigned)
-defm : Neon_Scalar3Same_D_size_patterns<int_aarch64_neon_vshlds, SSHLddd>;
-defm : Neon_Scalar3Same_D_size_patterns<int_aarch64_neon_vshldu, USHLddd>;
-
-// Scalar Integer Saturating Shift Left (Signed, Unsigned)
-defm SQSHL: NeonI_Scalar3Same_BHSD_sizes<0b0, 0b01001, "sqshl", 0>;
-defm UQSHL: NeonI_Scalar3Same_BHSD_sizes<0b1, 0b01001, "uqshl", 0>;
-
-// Patterns to match llvm.aarch64.* intrinsic for
-// Scalar  Integer Saturating Shift Letf (Signed, Unsigned)
-defm : Neon_Scalar3Same_BHSD_size_patterns<int_aarch64_neon_vqshls, SQSHLbbb,
-                                           SQSHLhhh, SQSHLsss, SQSHLddd>;
-defm : Neon_Scalar3Same_BHSD_size_patterns<int_aarch64_neon_vqshlu, UQSHLbbb,
-                                           UQSHLhhh, UQSHLsss, UQSHLddd>;
-
-// Patterns to match llvm.arm.* intrinsic for
-// Scalar  Integer Saturating Shift Letf (Signed, Unsigned)
-defm : Neon_Scalar3Same_D_size_patterns<int_arm_neon_vqshifts, SQSHLddd>;
-defm : Neon_Scalar3Same_D_size_patterns<int_arm_neon_vqshiftu, UQSHLddd>;
-
-// Scalar Integer Rounding Shift Left (Signed, Unsigned)
-def SRSHLddd: NeonI_Scalar3Same_D_size<0b0, 0b01010, "srshl">;
-def URSHLddd: NeonI_Scalar3Same_D_size<0b1, 0b01010, "urshl">;
-
-// Patterns to match llvm.aarch64.* intrinsic for
-// Scalar Integer Rounding Shift Left (Signed, Unsigned)
-defm : Neon_Scalar3Same_D_size_patterns<int_aarch64_neon_vrshlds, SRSHLddd>;
-defm : Neon_Scalar3Same_D_size_patterns<int_aarch64_neon_vrshldu, URSHLddd>;
-
-// Patterns to match llvm.arm.* intrinsic for
-// Scalar Integer Rounding Shift Left (Signed, Unsigned)
-defm : Neon_Scalar3Same_D_size_patterns<int_arm_neon_vrshifts, SRSHLddd>;
-defm : Neon_Scalar3Same_D_size_patterns<int_arm_neon_vrshiftu, URSHLddd>;
-
-// Scalar Integer Saturating Rounding Shift Left (Signed, Unsigned)
-defm SQRSHL: NeonI_Scalar3Same_BHSD_sizes<0b0, 0b01011, "sqrshl", 0>;
-defm UQRSHL: NeonI_Scalar3Same_BHSD_sizes<0b1, 0b01011, "uqrshl", 0>;
-
-// Patterns to match llvm.aarch64.* intrinsic for
-// Scalar Integer Saturating Rounding Shift Left (Signed, Unsigned)
-defm : Neon_Scalar3Same_BHSD_size_patterns<int_aarch64_neon_vqrshls, SQRSHLbbb,
-                                           SQRSHLhhh, SQRSHLsss, SQRSHLddd>;
-defm : Neon_Scalar3Same_BHSD_size_patterns<int_aarch64_neon_vqrshlu, UQRSHLbbb,
-                                           UQRSHLhhh, UQRSHLsss, UQRSHLddd>;
-
-// Patterns to match llvm.arm.* intrinsic for
-// Scalar Integer Saturating Rounding Shift Left (Signed, Unsigned)
-defm : Neon_Scalar3Same_D_size_patterns<int_arm_neon_vqrshifts, SQRSHLddd>;
-defm : Neon_Scalar3Same_D_size_patterns<int_arm_neon_vqrshiftu, UQRSHLddd>;
-
-let SchedRW = [WriteFPMAC, ReadFPMAC, ReadFPMAC, ReadFPMAC] in {
-// Signed Saturating Doubling Multiply-Add Long
-defm SQDMLAL : NeonI_Scalar3Diff_ml_HS_size<0b0, 0b1001, "sqdmlal">;
-}
-defm : Neon_Scalar3Diff_ml_HS_size_patterns<int_aarch64_neon_vqdmlal,
-                                            SQDMLALshh, SQDMLALdss>;
-
-// Signed Saturating Doubling Multiply-Subtract Long
-let SchedRW = [WriteFPMAC, ReadFPMAC, ReadFPMAC, ReadFPMAC] in {
-defm SQDMLSL : NeonI_Scalar3Diff_ml_HS_size<0b0, 0b1011, "sqdmlsl">;
-}
-defm : Neon_Scalar3Diff_ml_HS_size_patterns<int_aarch64_neon_vqdmlsl,
-                                            SQDMLSLshh, SQDMLSLdss>;
-
-// Signed Saturating Doubling Multiply Long
-let SchedRW = [WriteFPMul, ReadFPMul, ReadFPMul, ReadFPMul] in {
-defm SQDMULL : NeonI_Scalar3Diff_HS_size<0b0, 0b1101, "sqdmull">;
-}
-defm : Neon_Scalar3Diff_HS_size_patterns<int_arm_neon_vqdmull,
-                                         SQDMULLshh, SQDMULLdss>;
-
-// Scalar Signed Integer Convert To Floating-point
-defm SCVTF  : NeonI_Scalar2SameMisc_SD_size<0b0, 0b0, 0b11101, "scvtf">;
-defm : Neon_Scalar2SameMisc_cvt_SD_size_patterns<int_aarch64_neon_vcvtint2fps,
-                                                 SCVTFss, SCVTFdd>;
-
-// Scalar Unsigned Integer Convert To Floating-point
-defm UCVTF  : NeonI_Scalar2SameMisc_SD_size<0b1, 0b0, 0b11101, "ucvtf">;
-defm : Neon_Scalar2SameMisc_cvt_SD_size_patterns<int_aarch64_neon_vcvtint2fpu,
-                                                 UCVTFss, UCVTFdd>;
-
-// Scalar Floating-point Converts
-def FCVTXN : NeonI_Scalar2SameMisc_fcvtxn_D_size<0b1, 0b10110, "fcvtxn">;
-def : Neon_Scalar2SameMisc_fcvtxn_D_size_patterns<int_aarch64_neon_fcvtxn,
-                                                  FCVTXN>;
-
-defm FCVTNS : NeonI_Scalar2SameMisc_SD_size<0b0, 0b0, 0b11010, "fcvtns">;
-defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns<int_aarch64_neon_fcvtns,
-                                                  FCVTNSss, FCVTNSdd>;
-def : Neon_Scalar2SameMisc_vcvt_D_size_patterns<int_arm_neon_vcvtns, FCVTNSdd>;
-
-defm FCVTNU : NeonI_Scalar2SameMisc_SD_size<0b1, 0b0, 0b11010, "fcvtnu">;
-defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns<int_aarch64_neon_fcvtnu,
-                                                  FCVTNUss, FCVTNUdd>;
-def : Neon_Scalar2SameMisc_vcvt_D_size_patterns<int_arm_neon_vcvtnu, FCVTNUdd>;
-
-defm FCVTMS : NeonI_Scalar2SameMisc_SD_size<0b0, 0b0, 0b11011, "fcvtms">;
-defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns<int_aarch64_neon_fcvtms,
-                                                  FCVTMSss, FCVTMSdd>;
-def : Neon_Scalar2SameMisc_vcvt_D_size_patterns<int_arm_neon_vcvtms, FCVTMSdd>;
-
-defm FCVTMU : NeonI_Scalar2SameMisc_SD_size<0b1, 0b0, 0b11011, "fcvtmu">;
-defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns<int_aarch64_neon_fcvtmu,
-                                                  FCVTMUss, FCVTMUdd>;
-def : Neon_Scalar2SameMisc_vcvt_D_size_patterns<int_arm_neon_vcvtmu, FCVTMUdd>;
-
-defm FCVTAS : NeonI_Scalar2SameMisc_SD_size<0b0, 0b0, 0b11100, "fcvtas">;
-defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns<int_aarch64_neon_fcvtas,
-                                                  FCVTASss, FCVTASdd>;
-def : Neon_Scalar2SameMisc_vcvt_D_size_patterns<int_arm_neon_vcvtas, FCVTASdd>;
-
-defm FCVTAU : NeonI_Scalar2SameMisc_SD_size<0b1, 0b0, 0b11100, "fcvtau">;
-defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns<int_aarch64_neon_fcvtau,
-                                                  FCVTAUss, FCVTAUdd>;
-def : Neon_Scalar2SameMisc_vcvt_D_size_patterns<int_arm_neon_vcvtau, FCVTAUdd>;
-
-defm FCVTPS : NeonI_Scalar2SameMisc_SD_size<0b0, 0b1, 0b11010, "fcvtps">;
-defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns<int_aarch64_neon_fcvtps,
-                                                  FCVTPSss, FCVTPSdd>;
-def : Neon_Scalar2SameMisc_vcvt_D_size_patterns<int_arm_neon_vcvtps, FCVTPSdd>;
-
-defm FCVTPU : NeonI_Scalar2SameMisc_SD_size<0b1, 0b1, 0b11010, "fcvtpu">;
-defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns<int_aarch64_neon_fcvtpu,
-                                                  FCVTPUss, FCVTPUdd>;
-def : Neon_Scalar2SameMisc_vcvt_D_size_patterns<int_arm_neon_vcvtpu, FCVTPUdd>;
-
-defm FCVTZS : NeonI_Scalar2SameMisc_SD_size<0b0, 0b1, 0b11011, "fcvtzs">;
-defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns<int_aarch64_neon_fcvtzs,
-                                                  FCVTZSss, FCVTZSdd>;
-def : Neon_Scalar2SameMisc_vcvt_D_size_patterns<int_aarch64_neon_vcvtzs,
-                                                FCVTZSdd>;
-
-defm FCVTZU : NeonI_Scalar2SameMisc_SD_size<0b1, 0b1, 0b11011, "fcvtzu">;
-defm : Neon_Scalar2SameMisc_fcvt_SD_size_patterns<int_aarch64_neon_fcvtzu,
-                                                  FCVTZUss, FCVTZUdd>;
-def : Neon_Scalar2SameMisc_vcvt_D_size_patterns<int_aarch64_neon_vcvtzu,
-                                                FCVTZUdd>;
-
-// Patterns For Convert Instructions Between v1f64 and v1i64
-class Neon_Scalar2SameMisc_cvtf_v1f64_pattern<SDPatternOperator opnode,
-                                              Instruction INST>
-    : Pat<(v1f64 (opnode (v1i64 FPR64:$Rn))), (INST FPR64:$Rn)>;
-
-class Neon_Scalar2SameMisc_fcvt_v1f64_pattern<SDPatternOperator opnode,
-                                              Instruction INST>
-    : Pat<(v1i64 (opnode (v1f64 FPR64:$Rn))), (INST FPR64:$Rn)>;
-
-def : Neon_Scalar2SameMisc_cvtf_v1f64_pattern<sint_to_fp, SCVTFdd>;
-def : Neon_Scalar2SameMisc_cvtf_v1f64_pattern<uint_to_fp, UCVTFdd>;
-
-def : Neon_Scalar2SameMisc_fcvt_v1f64_pattern<fp_to_sint, FCVTZSdd>;
-def : Neon_Scalar2SameMisc_fcvt_v1f64_pattern<fp_to_uint, FCVTZUdd>;
-
-// Scalar Floating-point Reciprocal Estimate
-defm FRECPE : NeonI_Scalar2SameMisc_SD_size<0b0, 0b1, 0b11101, "frecpe">;
-defm : Neon_Scalar2SameMisc_SD_size_patterns<int_aarch64_neon_vrecpe,
-                                             FRECPEss, FRECPEdd>;
-def : Neon_Scalar2SameMisc_V1_D_size_patterns<int_arm_neon_vrecpe,
-                                              FRECPEdd>;
-
-// Scalar Floating-point Reciprocal Exponent
-defm FRECPX : NeonI_Scalar2SameMisc_SD_size<0b0, 0b1, 0b11111, "frecpx">;
-defm : Neon_Scalar2SameMisc_SD_size_patterns<int_aarch64_neon_vrecpx,
-                                             FRECPXss, FRECPXdd>;
-
-// Scalar Floating-point Reciprocal Square Root Estimate
-defm FRSQRTE: NeonI_Scalar2SameMisc_SD_size<0b1, 0b1, 0b11101, "frsqrte">;
-defm : Neon_Scalar2SameMisc_SD_size_patterns<int_aarch64_neon_vrsqrte,
-                                                 FRSQRTEss, FRSQRTEdd>;
-def : Neon_Scalar2SameMisc_V1_D_size_patterns<int_arm_neon_vrsqrte,
-                                              FRSQRTEdd>;
-
-// Scalar Floating-point Round
-class Neon_ScalarFloatRound_pattern<SDPatternOperator opnode, Instruction INST>
-    : Pat<(v1f64 (opnode (v1f64 FPR64:$Rn))), (INST FPR64:$Rn)>;
-
-def : Neon_ScalarFloatRound_pattern<fceil, FRINTPdd>;
-def : Neon_ScalarFloatRound_pattern<ffloor, FRINTMdd>;
-def : Neon_ScalarFloatRound_pattern<ftrunc, FRINTZdd>;
-def : Neon_ScalarFloatRound_pattern<frint, FRINTXdd>;
-def : Neon_ScalarFloatRound_pattern<fnearbyint, FRINTIdd>;
-def : Neon_ScalarFloatRound_pattern<frnd, FRINTAdd>;
-def : Neon_ScalarFloatRound_pattern<int_aarch64_neon_frintn, FRINTNdd>;
-
-// Scalar Integer Compare
-
-// Scalar Compare Bitwise Equal
-def CMEQddd: NeonI_Scalar3Same_D_size<0b1, 0b10001, "cmeq">;
-defm : Neon_Scalar3Same_D_size_patterns<int_aarch64_neon_vceq, CMEQddd>;
-
-class Neon_Scalar3Same_cmp_D_size_v1_patterns<SDPatternOperator opnode,
-                                              Instruction INSTD,
-                                              CondCode CC>
-  : Pat<(v1i64 (opnode (v1i64 FPR64:$Rn), (v1i64 FPR64:$Rm), CC)),
-        (INSTD FPR64:$Rn, FPR64:$Rm)>;
-
-def : Neon_Scalar3Same_cmp_D_size_v1_patterns<Neon_cmp, CMEQddd, SETEQ>;
-
-// Scalar Compare Signed Greather Than Or Equal
-def CMGEddd: NeonI_Scalar3Same_D_size<0b0, 0b00111, "cmge">;
-defm : Neon_Scalar3Same_D_size_patterns<int_aarch64_neon_vcge, CMGEddd>;
-def : Neon_Scalar3Same_cmp_D_size_v1_patterns<Neon_cmp, CMGEddd, SETGE>;
-
-// Scalar Compare Unsigned Higher Or Same
-def CMHSddd: NeonI_Scalar3Same_D_size<0b1, 0b00111, "cmhs">;
-defm : Neon_Scalar3Same_D_size_patterns<int_aarch64_neon_vchs, CMHSddd>;
-def : Neon_Scalar3Same_cmp_D_size_v1_patterns<Neon_cmp, CMHSddd, SETUGE>;
-
-// Scalar Compare Unsigned Higher
-def CMHIddd: NeonI_Scalar3Same_D_size<0b1, 0b00110, "cmhi">;
-defm : Neon_Scalar3Same_D_size_patterns<int_aarch64_neon_vchi, CMHIddd>;
-def : Neon_Scalar3Same_cmp_D_size_v1_patterns<Neon_cmp, CMHIddd, SETUGT>;
-
-// Scalar Compare Signed Greater Than
-def CMGTddd: NeonI_Scalar3Same_D_size<0b0, 0b00110, "cmgt">;
-defm : Neon_Scalar3Same_D_size_patterns<int_aarch64_neon_vcgt, CMGTddd>;
-def : Neon_Scalar3Same_cmp_D_size_v1_patterns<Neon_cmp, CMGTddd, SETGT>;
-
-// Scalar Compare Bitwise Test Bits
-def CMTSTddd: NeonI_Scalar3Same_D_size<0b0, 0b10001, "cmtst">;
-defm : Neon_Scalar3Same_D_size_patterns<int_aarch64_neon_vtstd, CMTSTddd>;
-defm : Neon_Scalar3Same_D_size_patterns<Neon_tst, CMTSTddd>;
-
-// Scalar Compare Bitwise Equal To Zero
-def CMEQddi: NeonI_Scalar2SameMisc_cmpz_D_size<0b0, 0b01001, "cmeq">;
-def : Neon_Scalar2SameMisc_cmpz_D_size_patterns<int_aarch64_neon_vceq,
-                                                CMEQddi>;
-def : Neon_Scalar2SameMisc_cmpz_D_V1_size_patterns<SETEQ, CMEQddi>;
-
-// Scalar Compare Signed Greather Than Or Equal To Zero
-def CMGEddi: NeonI_Scalar2SameMisc_cmpz_D_size<0b1, 0b01000, "cmge">;
-def : Neon_Scalar2SameMisc_cmpz_D_size_patterns<int_aarch64_neon_vcge,
-                                                CMGEddi>;
-def : Neon_Scalar2SameMisc_cmpz_D_V1_size_patterns<SETGE, CMGEddi>;
-
-// Scalar Compare Signed Greater Than Zero
-def CMGTddi: NeonI_Scalar2SameMisc_cmpz_D_size<0b0, 0b01000, "cmgt">;
-def : Neon_Scalar2SameMisc_cmpz_D_size_patterns<int_aarch64_neon_vcgt,
-                                                CMGTddi>;
-def : Neon_Scalar2SameMisc_cmpz_D_V1_size_patterns<SETGT, CMGTddi>;
-
-// Scalar Compare Signed Less Than Or Equal To Zero
-def CMLEddi: NeonI_Scalar2SameMisc_cmpz_D_size<0b1, 0b01001, "cmle">;
-def : Neon_Scalar2SameMisc_cmpz_D_size_patterns<int_aarch64_neon_vclez,
-                                                CMLEddi>;
-def : Neon_Scalar2SameMisc_cmpz_D_V1_size_patterns<SETLE, CMLEddi>;
-
-// Scalar Compare Less Than Zero
-def CMLTddi: NeonI_Scalar2SameMisc_cmpz_D_size<0b0, 0b01010, "cmlt">;
-def : Neon_Scalar2SameMisc_cmpz_D_size_patterns<int_aarch64_neon_vcltz,
-                                                CMLTddi>;
-def : Neon_Scalar2SameMisc_cmpz_D_V1_size_patterns<SETLT, CMLTddi>;
-
-// Scalar Floating-point Compare
-
-// Scalar Floating-point Compare Mask Equal
-defm FCMEQ: NeonI_Scalar3Same_SD_sizes<0b0, 0b0, 0b11100, "fcmeq">;
-defm : Neon_Scalar3Same_SD_size_patterns<int_aarch64_neon_fceq, v1i32, f32,
-                                         FCMEQsss, v1i64, f64, FCMEQddd>;
-def : Neon_Scalar3Same_cmp_V1_D_size_patterns<SETEQ, FCMEQddd>;
-
-// Scalar Floating-point Compare Mask Equal To Zero
-defm FCMEQZ: NeonI_Scalar2SameMisc_cmpz_SD_size<0b0, 0b01101, "fcmeq">;
-defm : Neon_Scalar2SameMisc_cmpz_SD_size_patterns<int_aarch64_neon_fceq, SETEQ,
-                                                  FCMEQZssi, FCMEQZddi>;
-
-// Scalar Floating-point Compare Mask Greater Than Or Equal
-defm FCMGE: NeonI_Scalar3Same_SD_sizes<0b1, 0b0, 0b11100, "fcmge">;
-defm : Neon_Scalar3Same_SD_size_patterns<int_aarch64_neon_fcge, v1i32, f32,
-                                         FCMGEsss, v1i64, f64, FCMGEddd>;
-def : Neon_Scalar3Same_cmp_V1_D_size_patterns<SETGE, FCMGEddd>;
-
-// Scalar Floating-point Compare Mask Greater Than Or Equal To Zero
-defm FCMGEZ: NeonI_Scalar2SameMisc_cmpz_SD_size<0b1, 0b01100, "fcmge">;
-defm : Neon_Scalar2SameMisc_cmpz_SD_size_patterns<int_aarch64_neon_fcge, SETGE,
-                                                  FCMGEZssi, FCMGEZddi>;
-
-// Scalar Floating-point Compare Mask Greather Than
-defm FCMGT: NeonI_Scalar3Same_SD_sizes<0b1, 0b1, 0b11100, "fcmgt">;
-defm : Neon_Scalar3Same_SD_size_patterns<int_aarch64_neon_fcgt, v1i32, f32,
-                                         FCMGTsss, v1i64, f64, FCMGTddd>;
-def : Neon_Scalar3Same_cmp_V1_D_size_patterns<SETGT, FCMGTddd>;
-
-// Scalar Floating-point Compare Mask Greather Than Zero
-defm FCMGTZ: NeonI_Scalar2SameMisc_cmpz_SD_size<0b0, 0b01100, "fcmgt">;
-defm : Neon_Scalar2SameMisc_cmpz_SD_size_patterns<int_aarch64_neon_fcgt, SETGT,
-                                                  FCMGTZssi, FCMGTZddi>;
-
-// Scalar Floating-point Compare Mask Less Than Or Equal To Zero
-defm FCMLEZ: NeonI_Scalar2SameMisc_cmpz_SD_size<0b1, 0b01101, "fcmle">;
-defm : Neon_Scalar2SameMisc_cmpz_SD_size_patterns<int_aarch64_neon_fclez, SETLE,
-                                                  FCMLEZssi, FCMLEZddi>;
-
-// Scalar Floating-point Compare Mask Less Than Zero
-defm FCMLTZ: NeonI_Scalar2SameMisc_cmpz_SD_size<0b0, 0b01110, "fcmlt">;
-defm : Neon_Scalar2SameMisc_cmpz_SD_size_patterns<int_aarch64_neon_fcltz, SETLT,
-                                                  FCMLTZssi, FCMLTZddi>;
-
-// Scalar Floating-point Absolute Compare Mask Greater Than Or Equal
-defm FACGE: NeonI_Scalar3Same_SD_sizes<0b1, 0b0, 0b11101, "facge">;
-defm : Neon_Scalar3Same_SD_size_patterns<int_aarch64_neon_fcage, v1i32, f32,
-                                         FACGEsss, v1i64, f64, FACGEddd>;
-def : Pat<(v1i64 (int_arm_neon_vacge (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
-          (FACGEddd FPR64:$Rn, FPR64:$Rm)>;
-
-// Scalar Floating-point Absolute Compare Mask Greater Than
-defm FACGT: NeonI_Scalar3Same_SD_sizes<0b1, 0b1, 0b11101, "facgt">;
-defm : Neon_Scalar3Same_SD_size_patterns<int_aarch64_neon_fcagt, v1i32, f32,
-                                         FACGTsss, v1i64, f64, FACGTddd>;
-def : Pat<(v1i64 (int_arm_neon_vacgt (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
-          (FACGTddd FPR64:$Rn, FPR64:$Rm)>;
-
-// Scalar Floating-point Absolute Difference
-defm FABD: NeonI_Scalar3Same_SD_sizes<0b1, 0b1, 0b11010, "fabd">;
-defm : Neon_Scalar3Same_SD_size_patterns<int_aarch64_neon_vabd, f32, f32,
-                                         FABDsss, f64, f64, FABDddd>;
-
-// Scalar Absolute Value
-defm ABS : NeonI_Scalar2SameMisc_D_size<0b0, 0b01011, "abs">;
-defm : Neon_Scalar2SameMisc_D_size_patterns<int_aarch64_neon_vabs, ABSdd>;
-
-// Scalar Signed Saturating Absolute Value
-defm SQABS : NeonI_Scalar2SameMisc_BHSD_size<0b0, 0b00111, "sqabs">;
-defm : Neon_Scalar2SameMisc_BHSD_size_patterns<int_arm_neon_vqabs,
-                                               SQABSbb, SQABShh, SQABSss, SQABSdd>;
-
-// Scalar Negate
-defm NEG : NeonI_Scalar2SameMisc_D_size<0b1, 0b01011, "neg">;
-defm : Neon_Scalar2SameMisc_D_size_patterns<int_aarch64_neon_vneg, NEGdd>;
-
-// Scalar Signed Saturating Negate
-defm SQNEG : NeonI_Scalar2SameMisc_BHSD_size<0b1, 0b00111, "sqneg">;
-defm : Neon_Scalar2SameMisc_BHSD_size_patterns<int_arm_neon_vqneg,
-                                               SQNEGbb, SQNEGhh, SQNEGss, SQNEGdd>;
-
-// Scalar Signed Saturating Accumulated of Unsigned Value
-defm SUQADD : NeonI_Scalar2SameMisc_accum_BHSD_size<0b0, 0b00011, "suqadd">;
-defm : Neon_Scalar2SameMisc_accum_BHSD_size_patterns<int_aarch64_neon_vuqadd,
-                                                     SUQADDbb, SUQADDhh,
-                                                     SUQADDss, SUQADDdd>;
-
-// Scalar Unsigned Saturating Accumulated of Signed Value
-defm USQADD : NeonI_Scalar2SameMisc_accum_BHSD_size<0b1, 0b00011, "usqadd">;
-defm : Neon_Scalar2SameMisc_accum_BHSD_size_patterns<int_aarch64_neon_vsqadd,
-                                                     USQADDbb, USQADDhh,
-                                                     USQADDss, USQADDdd>;
-
-def : Pat<(v1i64 (int_aarch64_neon_suqadd (v1i64 FPR64:$Src),
-                                          (v1i64 FPR64:$Rn))),
-          (SUQADDdd FPR64:$Src, FPR64:$Rn)>;
-
-def : Pat<(v1i64 (int_aarch64_neon_usqadd (v1i64 FPR64:$Src),
-                                          (v1i64 FPR64:$Rn))),
-          (USQADDdd FPR64:$Src, FPR64:$Rn)>;
-
-def : Pat<(v1i64 (int_arm_neon_vabs (v1i64 FPR64:$Rn))),
-          (ABSdd FPR64:$Rn)>;
-
-def : Pat<(v1i64 (int_arm_neon_vqabs (v1i64 FPR64:$Rn))),
-          (SQABSdd FPR64:$Rn)>;
-
-def : Pat<(v1i64 (int_arm_neon_vqneg (v1i64 FPR64:$Rn))),
-          (SQNEGdd FPR64:$Rn)>;
-
-def : Pat<(v1i64 (sub (v1i64 (bitconvert (v8i8 Neon_AllZero))),
-                      (v1i64 FPR64:$Rn))),
-          (NEGdd FPR64:$Rn)>;
-
-// Scalar Signed Saturating Extract Unsigned Narrow
-defm SQXTUN : NeonI_Scalar2SameMisc_narrow_HSD_size<0b1, 0b10010, "sqxtun">;
-defm : Neon_Scalar2SameMisc_narrow_HSD_size_patterns<int_arm_neon_vqmovnsu,
-                                                     SQXTUNbh, SQXTUNhs,
-                                                     SQXTUNsd>;
-
-// Scalar Signed Saturating Extract Narrow
-defm SQXTN  : NeonI_Scalar2SameMisc_narrow_HSD_size<0b0, 0b10100, "sqxtn">;
-defm : Neon_Scalar2SameMisc_narrow_HSD_size_patterns<int_arm_neon_vqmovns,
-                                                     SQXTNbh, SQXTNhs,
-                                                     SQXTNsd>;
-
-// Scalar Unsigned Saturating Extract Narrow
-defm UQXTN  : NeonI_Scalar2SameMisc_narrow_HSD_size<0b1, 0b10100, "uqxtn">;
-defm : Neon_Scalar2SameMisc_narrow_HSD_size_patterns<int_arm_neon_vqmovnu,
-                                                     UQXTNbh, UQXTNhs,
-                                                     UQXTNsd>;
-
-// Scalar Reduce Pairwise
-
-multiclass NeonI_ScalarPair_D_sizes<bit u, bit size, bits<5> opcode,
-                                     string asmop, bit Commutable = 0> {
-  let isCommutable = Commutable in {
-    def _D_2D : NeonI_ScalarPair<u, {size, 0b1}, opcode,
-                                (outs FPR64:$Rd), (ins VPR128:$Rn),
-                                !strconcat(asmop, "\t$Rd, $Rn.2d"),
-                                [],
-                                NoItinerary>,
-                Sched<[WriteFPALU, ReadFPALU]>;
-  }
-}
-
-multiclass NeonI_ScalarPair_SD_sizes<bit u, bit size, bits<5> opcode,
-                                     string asmop, bit Commutable = 0>
-  : NeonI_ScalarPair_D_sizes<u, size, opcode, asmop, Commutable> {
-  let isCommutable = Commutable in {
-    def _S_2S : NeonI_ScalarPair<u, {size, 0b0}, opcode,
-                                (outs FPR32:$Rd), (ins VPR64:$Rn),
-                                !strconcat(asmop, "\t$Rd, $Rn.2s"),
-                                [],
-                                NoItinerary>,
-                Sched<[WriteFPALU, ReadFPALU]>;
-  }
-}
-
-// Scalar Reduce Addition Pairwise (Integer) with
-// Pattern to match llvm.arm.* intrinsic
-defm ADDPvv : NeonI_ScalarPair_D_sizes<0b0, 0b1, 0b11011, "addp", 0>;
-
-// Pattern to match llvm.aarch64.* intrinsic for
-// Scalar Reduce Addition Pairwise (Integer)
-def : Pat<(v1i64 (int_aarch64_neon_vpadd (v2i64 VPR128:$Rn))),
-          (ADDPvv_D_2D VPR128:$Rn)>;
-def : Pat<(v1i64 (int_aarch64_neon_vaddv (v2i64 VPR128:$Rn))),
-          (ADDPvv_D_2D VPR128:$Rn)>;
-
-// Scalar Reduce Addition Pairwise (Floating Point)
-defm FADDPvv : NeonI_ScalarPair_SD_sizes<0b1, 0b0, 0b01101, "faddp", 0>;
-
-// Scalar Reduce Maximum Pairwise (Floating Point)
-defm FMAXPvv : NeonI_ScalarPair_SD_sizes<0b1, 0b0, 0b01111, "fmaxp", 0>;
-
-// Scalar Reduce Minimum Pairwise (Floating Point)
-defm FMINPvv : NeonI_ScalarPair_SD_sizes<0b1, 0b1, 0b01111, "fminp", 0>;
-
-// Scalar Reduce maxNum Pairwise (Floating Point)
-defm FMAXNMPvv : NeonI_ScalarPair_SD_sizes<0b1, 0b0, 0b01100, "fmaxnmp", 0>;
-
-// Scalar Reduce minNum Pairwise (Floating Point)
-defm FMINNMPvv : NeonI_ScalarPair_SD_sizes<0b1, 0b1, 0b01100, "fminnmp", 0>;
-
-multiclass Neon_ScalarPair_SD_size_patterns<SDPatternOperator opnode,
-                                            Instruction INSTS,
-                                            Instruction INSTD> {
-  def : Pat<(f32 (opnode (v2f32 VPR64:$Rn))),
-            (INSTS VPR64:$Rn)>;
-  def : Pat<(f64 (opnode (v2f64 VPR128:$Rn))),
-            (INSTD VPR128:$Rn)>;
-}
-
-// Patterns to match llvm.aarch64.* intrinsic for
-// Scalar Reduce Add, Max, Min, MaxiNum, MinNum Pairwise (Floating Point)
-defm : Neon_ScalarPair_SD_size_patterns<int_aarch64_neon_vpfadd,
-                                        FADDPvv_S_2S, FADDPvv_D_2D>;
-
-defm : Neon_ScalarPair_SD_size_patterns<int_aarch64_neon_vpmax,
-                                        FMAXPvv_S_2S, FMAXPvv_D_2D>;
-
-defm : Neon_ScalarPair_SD_size_patterns<int_aarch64_neon_vpmin,
-                                        FMINPvv_S_2S, FMINPvv_D_2D>;
-
-defm : Neon_ScalarPair_SD_size_patterns<int_aarch64_neon_vpfmaxnm,
-                                        FMAXNMPvv_S_2S, FMAXNMPvv_D_2D>;
-
-defm : Neon_ScalarPair_SD_size_patterns<int_aarch64_neon_vpfminnm,
-                                        FMINNMPvv_S_2S, FMINNMPvv_D_2D>;
-
-def : Pat<(f32 (int_aarch64_neon_vpfadd (v4f32 VPR128:$Rn))),
-          (FADDPvv_S_2S (v2f32
-               (EXTRACT_SUBREG
-                   (v4f32 (FADDP_4S (v4f32 VPR128:$Rn), (v4f32 VPR128:$Rn))),
-                   sub_64)))>;
-
-// Scalar by element Arithmetic
-
-class NeonI_ScalarXIndexedElemArith<string asmop, bits<4> opcode,
-                                    string rmlane, bit u, bit szhi, bit szlo,
-                                    RegisterClass ResFPR, RegisterClass OpFPR,
-                                    RegisterOperand OpVPR, Operand OpImm>
-  : NeonI_ScalarXIndexedElem<u, szhi, szlo, opcode,
-                             (outs ResFPR:$Rd),
-                             (ins OpFPR:$Rn, OpVPR:$MRm, OpImm:$Imm),
-                             asmop # "\t$Rd, $Rn, $MRm" # rmlane # "[$Imm]",
-                             [],
-                             NoItinerary>,
-    Sched<[WriteFPMul, ReadFPMul, ReadFPMul]> {
-  bits<3> Imm;
-  bits<5> MRm;
-}
-
-class NeonI_ScalarXIndexedElemArith_Constraint_Impl<string asmop, bits<4> opcode,
-                                                    string rmlane,
-                                                    bit u, bit szhi, bit szlo,
-                                                    RegisterClass ResFPR,
-                                                    RegisterClass OpFPR,
-                                                    RegisterOperand OpVPR,
-                                                    Operand OpImm>
-  : NeonI_ScalarXIndexedElem<u, szhi, szlo, opcode,
-                             (outs ResFPR:$Rd),
-                             (ins ResFPR:$src, OpFPR:$Rn, OpVPR:$MRm, OpImm:$Imm),
-                             asmop # "\t$Rd, $Rn, $MRm" # rmlane # "[$Imm]",
-                             [],
-                             NoItinerary>,
-    Sched<[WriteFPMAC, ReadFPMAC, ReadFPMAC, ReadFPMAC]> {
-  let Constraints = "$src = $Rd";
-  bits<3> Imm;
-  bits<5> MRm;
-}
-
-// Scalar Floating Point  multiply (scalar, by element)
-def FMULssv_4S : NeonI_ScalarXIndexedElemArith<"fmul",
-  0b1001, ".s", 0b0, 0b1, 0b0, FPR32, FPR32, VPR128, neon_uimm2_bare> {
-  let Inst{11} = Imm{1}; // h
-  let Inst{21} = Imm{0}; // l
-  let Inst{20-16} = MRm;
-}
-def FMULddv_2D : NeonI_ScalarXIndexedElemArith<"fmul",
-  0b1001, ".d", 0b0, 0b1, 0b1, FPR64, FPR64, VPR128, neon_uimm1_bare> {
-  let Inst{11} = Imm{0}; // h
-  let Inst{21} = 0b0;    // l
-  let Inst{20-16} = MRm;
-}
-
-// Scalar Floating Point  multiply extended (scalar, by element)
-def FMULXssv_4S : NeonI_ScalarXIndexedElemArith<"fmulx",
-  0b1001, ".s", 0b1, 0b1, 0b0, FPR32, FPR32, VPR128, neon_uimm2_bare> {
-  let Inst{11} = Imm{1}; // h
-  let Inst{21} = Imm{0}; // l
-  let Inst{20-16} = MRm;
-}
-def FMULXddv_2D : NeonI_ScalarXIndexedElemArith<"fmulx",
-  0b1001, ".d", 0b1, 0b1, 0b1, FPR64, FPR64, VPR128, neon_uimm1_bare> {
-  let Inst{11} = Imm{0}; // h
-  let Inst{21} = 0b0;    // l
-  let Inst{20-16} = MRm;
-}
-
-multiclass Neon_ScalarXIndexedElem_MUL_MULX_Patterns<
-  SDPatternOperator opnode,
-  Instruction INST,
-  ValueType ResTy, RegisterClass FPRC, ValueType OpTy, Operand OpImm,
-  ValueType OpNTy, ValueType ExTy, Operand OpNImm> {
-
-  def  : Pat<(ResTy (opnode (ResTy FPRC:$Rn),
-               (ResTy (vector_extract (OpTy VPR128:$MRm), OpImm:$Imm)))),
-             (ResTy (INST (ResTy FPRC:$Rn), (OpTy VPR128:$MRm), OpImm:$Imm))>;
-
-  def  : Pat<(ResTy (opnode (ResTy FPRC:$Rn),
-               (ResTy (vector_extract (OpNTy VPR64:$MRm), OpNImm:$Imm)))),
-             (ResTy (INST (ResTy FPRC:$Rn),
-               (ExTy (SUBREG_TO_REG (i64 0), VPR64:$MRm, sub_64)),
-               OpNImm:$Imm))>;
-
-  // swapped operands
-  def  : Pat<(ResTy (opnode
-               (ResTy (vector_extract (OpTy VPR128:$MRm), OpImm:$Imm)),
-               (ResTy FPRC:$Rn))),
-             (ResTy (INST (ResTy FPRC:$Rn), (OpTy VPR128:$MRm), OpImm:$Imm))>;
-
-  def  : Pat<(ResTy (opnode
-               (ResTy (vector_extract (OpNTy VPR64:$MRm), OpNImm:$Imm)),
-               (ResTy FPRC:$Rn))),
-             (ResTy (INST (ResTy FPRC:$Rn),
-               (ExTy (SUBREG_TO_REG (i64 0), VPR64:$MRm, sub_64)),
-               OpNImm:$Imm))>;
-}
-
-// Patterns for Scalar Floating Point  multiply (scalar, by element)
-defm : Neon_ScalarXIndexedElem_MUL_MULX_Patterns<fmul, FMULssv_4S,
-  f32, FPR32, v4f32, neon_uimm2_bare, v2f32, v4f32, neon_uimm1_bare>;
-defm : Neon_ScalarXIndexedElem_MUL_MULX_Patterns<fmul, FMULddv_2D,
-  f64, FPR64, v2f64, neon_uimm1_bare, v1f64, v2f64, neon_uimm0_bare>;
-
-// Patterns for Scalar Floating Point  multiply extended (scalar, by element)
-defm : Neon_ScalarXIndexedElem_MUL_MULX_Patterns<int_aarch64_neon_vmulx,
-  FMULXssv_4S, f32, FPR32, v4f32, neon_uimm2_bare,
-  v2f32, v4f32, neon_uimm1_bare>;
-defm : Neon_ScalarXIndexedElem_MUL_MULX_Patterns<int_aarch64_neon_vmulx,
-  FMULXddv_2D, f64, FPR64, v2f64, neon_uimm1_bare,
-  v1f64, v2f64, neon_uimm0_bare>;
-
-// Scalar Floating Point fused multiply-add (scalar, by element)
-def FMLAssv_4S : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"fmla",
-  0b0001, ".s", 0b0, 0b1, 0b0, FPR32, FPR32, VPR128, neon_uimm2_bare> {
-  let Inst{11} = Imm{1}; // h
-  let Inst{21} = Imm{0}; // l
-  let Inst{20-16} = MRm;
-}
-def FMLAddv_2D : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"fmla",
-  0b0001, ".d", 0b0, 0b1, 0b1, FPR64, FPR64, VPR128, neon_uimm1_bare> {
-  let Inst{11} = Imm{0}; // h
-  let Inst{21} = 0b0;    // l
-  let Inst{20-16} = MRm;
-}
-
-// Scalar Floating Point fused multiply-subtract (scalar, by element)
-def FMLSssv_4S : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"fmls",
-  0b0101, ".s", 0b0, 0b1, 0b0, FPR32, FPR32, VPR128, neon_uimm2_bare> {
-  let Inst{11} = Imm{1}; // h
-  let Inst{21} = Imm{0}; // l
-  let Inst{20-16} = MRm;
-}
-def FMLSddv_2D : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"fmls",
-  0b0101, ".d", 0b0, 0b1, 0b1, FPR64, FPR64, VPR128, neon_uimm1_bare> {
-  let Inst{11} = Imm{0}; // h
-  let Inst{21} = 0b0;    // l
-  let Inst{20-16} = MRm;
-}
-// We are allowed to match the fma instruction regardless of compile options.
-multiclass Neon_ScalarXIndexedElem_FMA_Patterns<
-  Instruction FMLAI, Instruction FMLSI,
-  ValueType ResTy, RegisterClass FPRC, ValueType OpTy, Operand OpImm,
-  ValueType OpNTy, ValueType ExTy, Operand OpNImm> {
-  // fmla
-  def  : Pat<(ResTy (fma (ResTy FPRC:$Rn),
-               (ResTy (vector_extract (OpTy VPR128:$MRm), OpImm:$Imm)),
-               (ResTy FPRC:$Ra))),
-             (ResTy (FMLAI (ResTy FPRC:$Ra),
-               (ResTy FPRC:$Rn), (OpTy VPR128:$MRm), OpImm:$Imm))>;
-
-  def  : Pat<(ResTy (fma (ResTy FPRC:$Rn),
-               (ResTy (vector_extract (OpNTy VPR64:$MRm), OpNImm:$Imm)),
-               (ResTy FPRC:$Ra))),
-             (ResTy (FMLAI (ResTy FPRC:$Ra),
-               (ResTy FPRC:$Rn),
-               (ExTy (SUBREG_TO_REG (i64 0), VPR64:$MRm, sub_64)),
-               OpNImm:$Imm))>;
-
-  // swapped fmla operands
-  def  : Pat<(ResTy (fma
-               (ResTy (vector_extract (OpTy VPR128:$MRm), OpImm:$Imm)),
-               (ResTy FPRC:$Rn),
-               (ResTy FPRC:$Ra))),
-             (ResTy (FMLAI (ResTy FPRC:$Ra),
-               (ResTy FPRC:$Rn), (OpTy VPR128:$MRm), OpImm:$Imm))>;
-
-  def  : Pat<(ResTy (fma
-               (ResTy (vector_extract (OpNTy VPR64:$MRm), OpNImm:$Imm)),
-               (ResTy FPRC:$Rn),
-               (ResTy FPRC:$Ra))),
-             (ResTy (FMLAI (ResTy FPRC:$Ra),
-               (ResTy FPRC:$Rn),
-               (ExTy (SUBREG_TO_REG (i64 0), VPR64:$MRm, sub_64)),
-               OpNImm:$Imm))>;
-
-  // fmls
-  def  : Pat<(ResTy (fma (ResTy FPRC:$Rn),
-               (fneg (ResTy (vector_extract (OpTy VPR128:$MRm), OpImm:$Imm))),
-               (ResTy FPRC:$Ra))),
-             (ResTy (FMLSI (ResTy FPRC:$Ra),
-               (ResTy FPRC:$Rn), (OpTy VPR128:$MRm), OpImm:$Imm))>;
-
-  def  : Pat<(ResTy (fma (ResTy FPRC:$Rn),
-               (fneg (ResTy (vector_extract (OpNTy VPR64:$MRm), OpNImm:$Imm))),
-               (ResTy FPRC:$Ra))),
-             (ResTy (FMLSI (ResTy FPRC:$Ra),
-               (ResTy FPRC:$Rn),
-               (ExTy (SUBREG_TO_REG (i64 0), VPR64:$MRm, sub_64)),
-               OpNImm:$Imm))>;
-
-  // swapped fmls operands
-  def  : Pat<(ResTy (fma
-               (fneg (ResTy (vector_extract (OpTy VPR128:$MRm), OpImm:$Imm))),
-               (ResTy FPRC:$Rn),
-               (ResTy FPRC:$Ra))),
-             (ResTy (FMLSI (ResTy FPRC:$Ra),
-               (ResTy FPRC:$Rn), (OpTy VPR128:$MRm), OpImm:$Imm))>;
-
-  def  : Pat<(ResTy (fma
-               (fneg (ResTy (vector_extract (OpNTy VPR64:$MRm), OpNImm:$Imm))),
-               (ResTy FPRC:$Rn),
-               (ResTy FPRC:$Ra))),
-             (ResTy (FMLSI (ResTy FPRC:$Ra),
-               (ResTy FPRC:$Rn),
-               (ExTy (SUBREG_TO_REG (i64 0), VPR64:$MRm, sub_64)),
-               OpNImm:$Imm))>;
-}
-
-// Scalar Floating Point fused multiply-add and
-// multiply-subtract (scalar, by element)
-defm : Neon_ScalarXIndexedElem_FMA_Patterns<FMLAssv_4S, FMLSssv_4S,
-  f32, FPR32, v4f32, neon_uimm2_bare, v2f32, v4f32, neon_uimm1_bare>;
-defm : Neon_ScalarXIndexedElem_FMA_Patterns<FMLAddv_2D, FMLSddv_2D,
-  f64, FPR64, v2f64, neon_uimm1_bare, v1f64, v2f64, neon_uimm0_bare>;
-defm : Neon_ScalarXIndexedElem_FMA_Patterns<FMLAddv_2D, FMLSddv_2D,
-  f64, FPR64, v2f64, neon_uimm1_bare, v1f64, v2f64, neon_uimm0_bare>;
-
-// Scalar Signed saturating doubling multiply long (scalar, by element)
-def SQDMULLshv_4H : NeonI_ScalarXIndexedElemArith<"sqdmull",
-  0b1011, ".h", 0b0, 0b0, 0b1, FPR32, FPR16, VPR64Lo, neon_uimm2_bare> {
-  let Inst{11} = 0b0; // h
-  let Inst{21} = Imm{1}; // l
-  let Inst{20} = Imm{0}; // m
-  let Inst{19-16} = MRm{3-0};
-}
-def SQDMULLshv_8H : NeonI_ScalarXIndexedElemArith<"sqdmull",
-  0b1011, ".h", 0b0, 0b0, 0b1, FPR32, FPR16, VPR128Lo, neon_uimm3_bare> {
-  let Inst{11} = Imm{2}; // h
-  let Inst{21} = Imm{1}; // l
-  let Inst{20} = Imm{0}; // m
-  let Inst{19-16} = MRm{3-0};
-}
-def SQDMULLdsv_2S : NeonI_ScalarXIndexedElemArith<"sqdmull",
-  0b1011, ".s", 0b0, 0b1, 0b0, FPR64, FPR32, VPR64, neon_uimm1_bare> {
-  let Inst{11} = 0b0;    // h
-  let Inst{21} = Imm{0}; // l
-  let Inst{20-16} = MRm;
-}
-def SQDMULLdsv_4S : NeonI_ScalarXIndexedElemArith<"sqdmull",
-  0b1011, ".s", 0b0, 0b1, 0b0, FPR64, FPR32, VPR128, neon_uimm2_bare> {
-  let Inst{11} = Imm{1};    // h
-  let Inst{21} = Imm{0};    // l
-  let Inst{20-16} = MRm;
-}
-
-multiclass Neon_ScalarXIndexedElem_MUL_Patterns<
-  SDPatternOperator opnode,
-  Instruction INST,
-  ValueType ResTy, RegisterClass FPRC,
-  ValueType OpVTy, ValueType OpTy,
-  ValueType VecOpTy, ValueType ExTy, RegisterOperand VPRC, Operand OpImm> {
-
-  def  : Pat<(ResTy (opnode (OpVTy FPRC:$Rn),
-               (OpVTy (scalar_to_vector
-                 (ExTy (vector_extract (VecOpTy VPRC:$MRm), OpImm:$Imm)))))),
-             (ResTy (INST (OpVTy FPRC:$Rn), (VecOpTy VPRC:$MRm), OpImm:$Imm))>;
-
-  def  : Pat<(ResTy (opnode (OpVTy FPRC:$Rn),
-               (OpVTy (extract_subvector (VecOpTy VPRC:$MRm), OpImm:$Imm)))),
-             (ResTy (INST (OpVTy FPRC:$Rn), (VecOpTy VPRC:$MRm), OpImm:$Imm))>;
-
-  //swapped operands
-  def  : Pat<(ResTy (opnode
-               (OpVTy (scalar_to_vector
-                 (ExTy (vector_extract (VecOpTy VPRC:$MRm), OpImm:$Imm)))),
-                 (OpVTy FPRC:$Rn))),
-             (ResTy (INST (OpVTy FPRC:$Rn), (VecOpTy VPRC:$MRm), OpImm:$Imm))>;
-
-  def  : Pat<(ResTy (opnode
-               (OpVTy (extract_subvector (VecOpTy VPRC:$MRm), OpImm:$Imm)),
-               (OpVTy FPRC:$Rn))),
-             (ResTy (INST (OpVTy FPRC:$Rn), (VecOpTy VPRC:$MRm), OpImm:$Imm))>;
-}
-
-
-// Patterns for Scalar Signed saturating doubling
-// multiply long (scalar, by element)
-defm : Neon_ScalarXIndexedElem_MUL_Patterns<int_arm_neon_vqdmull,
-  SQDMULLshv_4H, v1i32, FPR16, v1i16, i16, v4i16,
-  i32, VPR64Lo, neon_uimm2_bare>;
-defm : Neon_ScalarXIndexedElem_MUL_Patterns<int_arm_neon_vqdmull,
-  SQDMULLshv_8H, v1i32, FPR16, v1i16, i16, v8i16,
-  i32, VPR128Lo, neon_uimm3_bare>;
-defm : Neon_ScalarXIndexedElem_MUL_Patterns<int_arm_neon_vqdmull,
-  SQDMULLdsv_2S, v1i64, FPR32, v1i32, i32, v2i32,
-  i32, VPR64Lo, neon_uimm1_bare>;
-defm : Neon_ScalarXIndexedElem_MUL_Patterns<int_arm_neon_vqdmull,
-  SQDMULLdsv_4S, v1i64, FPR32, v1i32, i32, v4i32,
-  i32, VPR128Lo, neon_uimm2_bare>;
-
-// Scalar Signed saturating doubling multiply-add long (scalar, by element)
-def SQDMLALshv_4H : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"sqdmlal",
-  0b0011, ".h", 0b0, 0b0, 0b1, FPR32, FPR16, VPR64Lo, neon_uimm2_bare> {
-  let Inst{11} = 0b0; // h
-  let Inst{21} = Imm{1}; // l
-  let Inst{20} = Imm{0}; // m
-  let Inst{19-16} = MRm{3-0};
-}
-def SQDMLALshv_8H : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"sqdmlal",
-  0b0011, ".h", 0b0, 0b0, 0b1, FPR32, FPR16, VPR128Lo, neon_uimm3_bare> {
-  let Inst{11} = Imm{2}; // h
-  let Inst{21} = Imm{1}; // l
-  let Inst{20} = Imm{0}; // m
-  let Inst{19-16} = MRm{3-0};
-}
-def SQDMLALdsv_2S : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"sqdmlal",
-  0b0011, ".s", 0b0, 0b1, 0b0, FPR64, FPR32, VPR64, neon_uimm1_bare> {
-  let Inst{11} = 0b0;    // h
-  let Inst{21} = Imm{0}; // l
-  let Inst{20-16} = MRm;
-}
-def SQDMLALdsv_4S : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"sqdmlal",
-  0b0011, ".s", 0b0, 0b1, 0b0, FPR64, FPR32, VPR128, neon_uimm2_bare> {
-  let Inst{11} = Imm{1};    // h
-  let Inst{21} = Imm{0};    // l
-  let Inst{20-16} = MRm;
-}
-
-// Scalar Signed saturating doubling
-// multiply-subtract long (scalar, by element)
-def SQDMLSLshv_4H : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"sqdmlsl",
-  0b0111, ".h", 0b0, 0b0, 0b1, FPR32, FPR16, VPR64Lo, neon_uimm2_bare> {
-  let Inst{11} = 0b0; // h
-  let Inst{21} = Imm{1}; // l
-  let Inst{20} = Imm{0}; // m
-  let Inst{19-16} = MRm{3-0};
-}
-def SQDMLSLshv_8H : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"sqdmlsl",
-  0b0111, ".h", 0b0, 0b0, 0b1, FPR32, FPR16, VPR128Lo, neon_uimm3_bare> {
-  let Inst{11} = Imm{2}; // h
-  let Inst{21} = Imm{1}; // l
-  let Inst{20} = Imm{0}; // m
-  let Inst{19-16} = MRm{3-0};
-}
-def SQDMLSLdsv_2S : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"sqdmlsl",
-  0b0111, ".s", 0b0, 0b1, 0b0, FPR64, FPR32, VPR64, neon_uimm1_bare> {
-  let Inst{11} = 0b0;    // h
-  let Inst{21} = Imm{0}; // l
-  let Inst{20-16} = MRm;
-}
-def SQDMLSLdsv_4S : NeonI_ScalarXIndexedElemArith_Constraint_Impl<"sqdmlsl",
-  0b0111, ".s", 0b0, 0b1, 0b0, FPR64, FPR32, VPR128, neon_uimm2_bare> {
-  let Inst{11} = Imm{1};    // h
-  let Inst{21} = Imm{0};    // l
-  let Inst{20-16} = MRm;
-}
-
-multiclass Neon_ScalarXIndexedElem_MLAL_Patterns<
-  SDPatternOperator opnode,
-  SDPatternOperator coreopnode,
-  Instruction INST,
-  ValueType ResTy, RegisterClass ResFPRC, RegisterClass FPRC,
-  ValueType OpTy,
-  ValueType OpVTy, ValueType ExTy, RegisterOperand VPRC, Operand OpImm> {
-
-  def  : Pat<(ResTy (opnode
-               (ResTy ResFPRC:$Ra),
-               (ResTy (coreopnode (OpTy FPRC:$Rn),
-                 (OpTy (scalar_to_vector
-                   (ExTy (vector_extract (OpVTy VPRC:$MRm), OpImm:$Imm)))))))),
-             (ResTy (INST (ResTy ResFPRC:$Ra),
-               (OpTy FPRC:$Rn), (OpVTy VPRC:$MRm), OpImm:$Imm))>;
-
-  def  : Pat<(ResTy (opnode
-               (ResTy ResFPRC:$Ra),
-               (ResTy (coreopnode (OpTy FPRC:$Rn),
-                 (OpTy (extract_subvector (OpVTy VPRC:$MRm), OpImm:$Imm)))))),
-             (ResTy (INST (ResTy ResFPRC:$Ra),
-               (OpTy FPRC:$Rn), (OpVTy VPRC:$MRm), OpImm:$Imm))>;
-
-  // swapped operands
-  def  : Pat<(ResTy (opnode
-               (ResTy ResFPRC:$Ra),
-               (ResTy (coreopnode
-                 (OpTy (scalar_to_vector
-                   (ExTy (vector_extract (OpVTy VPRC:$MRm), OpImm:$Imm)))),
-                 (OpTy FPRC:$Rn))))),
-             (ResTy (INST (ResTy ResFPRC:$Ra),
-               (OpTy FPRC:$Rn), (OpVTy VPRC:$MRm), OpImm:$Imm))>;
-
-  def  : Pat<(ResTy (opnode
-               (ResTy ResFPRC:$Ra),
-               (ResTy (coreopnode
-                 (OpTy (extract_subvector (OpVTy VPRC:$MRm), OpImm:$Imm)),
-                 (OpTy FPRC:$Rn))))),
-             (ResTy (INST (ResTy ResFPRC:$Ra),
-               (OpTy FPRC:$Rn), (OpVTy VPRC:$MRm), OpImm:$Imm))>;
-}
-
-// Patterns for Scalar Signed saturating
-// doubling multiply-add long (scalar, by element)
-defm : Neon_ScalarXIndexedElem_MLAL_Patterns<int_arm_neon_vqadds,
-  int_arm_neon_vqdmull, SQDMLALshv_4H, v1i32, FPR32, FPR16, v1i16, v4i16,
-  i32, VPR64Lo, neon_uimm2_bare>;
-defm : Neon_ScalarXIndexedElem_MLAL_Patterns<int_arm_neon_vqadds,
-  int_arm_neon_vqdmull, SQDMLALshv_8H, v1i32, FPR32, FPR16, v1i16, v8i16,
-  i32, VPR128Lo, neon_uimm3_bare>;
-defm : Neon_ScalarXIndexedElem_MLAL_Patterns<int_arm_neon_vqadds,
-  int_arm_neon_vqdmull, SQDMLALdsv_2S, v1i64, FPR64, FPR32, v1i32, v2i32,
-  i32, VPR64Lo, neon_uimm1_bare>;
-defm : Neon_ScalarXIndexedElem_MLAL_Patterns<int_arm_neon_vqadds,
-  int_arm_neon_vqdmull, SQDMLALdsv_4S, v1i64, FPR64, FPR32, v1i32, v4i32,
-  i32, VPR128Lo, neon_uimm2_bare>;
-
-// Patterns for Scalar Signed saturating
-// doubling multiply-sub long (scalar, by element)
-defm : Neon_ScalarXIndexedElem_MLAL_Patterns<int_arm_neon_vqsubs,
-  int_arm_neon_vqdmull, SQDMLSLshv_4H, v1i32, FPR32, FPR16, v1i16, v4i16,
-  i32, VPR64Lo, neon_uimm2_bare>;
-defm : Neon_ScalarXIndexedElem_MLAL_Patterns<int_arm_neon_vqsubs,
-  int_arm_neon_vqdmull, SQDMLSLshv_8H, v1i32, FPR32, FPR16, v1i16, v8i16,
-  i32, VPR128Lo, neon_uimm3_bare>;
-defm : Neon_ScalarXIndexedElem_MLAL_Patterns<int_arm_neon_vqsubs,
-  int_arm_neon_vqdmull, SQDMLSLdsv_2S, v1i64, FPR64, FPR32, v1i32, v2i32,
-  i32, VPR64Lo, neon_uimm1_bare>;
-defm : Neon_ScalarXIndexedElem_MLAL_Patterns<int_arm_neon_vqsubs,
-  int_arm_neon_vqdmull, SQDMLSLdsv_4S, v1i64, FPR64, FPR32, v1i32, v4i32,
-  i32, VPR128Lo, neon_uimm2_bare>;
-
-// Scalar Signed saturating doubling multiply returning
-// high half (scalar, by element)
-def SQDMULHhhv_4H : NeonI_ScalarXIndexedElemArith<"sqdmulh",
-  0b1100, ".h", 0b0, 0b0, 0b1, FPR16, FPR16, VPR64Lo, neon_uimm2_bare> {
-  let Inst{11} = 0b0; // h
-  let Inst{21} = Imm{1}; // l
-  let Inst{20} = Imm{0}; // m
-  let Inst{19-16} = MRm{3-0};
-}
-def SQDMULHhhv_8H : NeonI_ScalarXIndexedElemArith<"sqdmulh",
-  0b1100, ".h", 0b0, 0b0, 0b1, FPR16, FPR16, VPR128Lo, neon_uimm3_bare> {
-  let Inst{11} = Imm{2}; // h
-  let Inst{21} = Imm{1}; // l
-  let Inst{20} = Imm{0}; // m
-  let Inst{19-16} = MRm{3-0};
-}
-def SQDMULHssv_2S : NeonI_ScalarXIndexedElemArith<"sqdmulh",
-  0b1100, ".s", 0b0, 0b1, 0b0, FPR32, FPR32, VPR64, neon_uimm1_bare> {
-  let Inst{11} = 0b0;    // h
-  let Inst{21} = Imm{0}; // l
-  let Inst{20-16} = MRm;
-}
-def SQDMULHssv_4S : NeonI_ScalarXIndexedElemArith<"sqdmulh",
-  0b1100, ".s", 0b0, 0b1, 0b0, FPR32, FPR32, VPR128, neon_uimm2_bare> {
-  let Inst{11} = Imm{1};    // h
-  let Inst{21} = Imm{0};    // l
-  let Inst{20-16} = MRm;
-}
-
-// Patterns for Scalar Signed saturating doubling multiply returning
-// high half (scalar, by element)
-defm : Neon_ScalarXIndexedElem_MUL_Patterns<int_arm_neon_vqdmulh,
-  SQDMULHhhv_4H, v1i16, FPR16, v1i16, i16, v4i16,
-  i32, VPR64Lo, neon_uimm2_bare>;
-defm : Neon_ScalarXIndexedElem_MUL_Patterns<int_arm_neon_vqdmulh,
-  SQDMULHhhv_8H, v1i16, FPR16, v1i16, i16, v8i16,
-  i32, VPR128Lo, neon_uimm3_bare>;
-defm : Neon_ScalarXIndexedElem_MUL_Patterns<int_arm_neon_vqdmulh,
-  SQDMULHssv_2S, v1i32, FPR32, v1i32, i32, v2i32,
-  i32, VPR64Lo, neon_uimm1_bare>;
-defm : Neon_ScalarXIndexedElem_MUL_Patterns<int_arm_neon_vqdmulh,
-  SQDMULHssv_4S, v1i32, FPR32, v1i32, i32, v4i32,
-  i32, VPR128Lo, neon_uimm2_bare>;
-
-// Scalar Signed saturating rounding doubling multiply
-// returning high half (scalar, by element)
-def SQRDMULHhhv_4H : NeonI_ScalarXIndexedElemArith<"sqrdmulh",
-  0b1101, ".h", 0b0, 0b0, 0b1, FPR16, FPR16, VPR64Lo, neon_uimm2_bare> {
-  let Inst{11} = 0b0; // h
-  let Inst{21} = Imm{1}; // l
-  let Inst{20} = Imm{0}; // m
-  let Inst{19-16} = MRm{3-0};
-}
-def SQRDMULHhhv_8H : NeonI_ScalarXIndexedElemArith<"sqrdmulh",
-  0b1101, ".h", 0b0, 0b0, 0b1, FPR16, FPR16, VPR128Lo, neon_uimm3_bare> {
-  let Inst{11} = Imm{2}; // h
-  let Inst{21} = Imm{1}; // l
-  let Inst{20} = Imm{0}; // m
-  let Inst{19-16} = MRm{3-0};
-}
-def SQRDMULHssv_2S : NeonI_ScalarXIndexedElemArith<"sqrdmulh",
-  0b1101, ".s", 0b0, 0b1, 0b0, FPR32, FPR32, VPR64, neon_uimm1_bare> {
-  let Inst{11} = 0b0;    // h
-  let Inst{21} = Imm{0}; // l
-  let Inst{20-16} = MRm;
-}
-def SQRDMULHssv_4S : NeonI_ScalarXIndexedElemArith<"sqrdmulh",
-  0b1101, ".s", 0b0, 0b1, 0b0, FPR32, FPR32, VPR128, neon_uimm2_bare> {
-  let Inst{11} = Imm{1};    // h
-  let Inst{21} = Imm{0};    // l
-  let Inst{20-16} = MRm;
-}
-
-defm : Neon_ScalarXIndexedElem_MUL_Patterns<int_arm_neon_vqrdmulh,
-  SQRDMULHhhv_4H, v1i16, FPR16, v1i16, i16, v4i16, i32,
-  VPR64Lo, neon_uimm2_bare>;
-defm : Neon_ScalarXIndexedElem_MUL_Patterns<int_arm_neon_vqrdmulh,
-  SQRDMULHhhv_8H, v1i16, FPR16, v1i16, i16, v8i16, i32,
-  VPR128Lo, neon_uimm3_bare>;
-defm : Neon_ScalarXIndexedElem_MUL_Patterns<int_arm_neon_vqrdmulh,
-  SQRDMULHssv_2S, v1i32, FPR32, v1i32, i32, v2i32, i32,
-  VPR64Lo, neon_uimm1_bare>;
-defm : Neon_ScalarXIndexedElem_MUL_Patterns<int_arm_neon_vqrdmulh,
-  SQRDMULHssv_4S, v1i32, FPR32, v1i32, i32, v4i32, i32,
-  VPR128Lo, neon_uimm2_bare>;
-
-// Scalar general arithmetic operation
-class Neon_Scalar_GeneralMath2D_pattern<SDPatternOperator opnode,
-                                        Instruction INST> 
-    : Pat<(v1f64 (opnode (v1f64 FPR64:$Rn))), (INST FPR64:$Rn)>;
-
-class Neon_Scalar_GeneralMath3D_pattern<SDPatternOperator opnode,
-                                        Instruction INST> 
-    : Pat<(v1f64 (opnode (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
-          (INST FPR64:$Rn, FPR64:$Rm)>;
-
-class Neon_Scalar_GeneralMath4D_pattern<SDPatternOperator opnode,
-                                        Instruction INST> 
-    : Pat<(v1f64 (opnode (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm),
-              (v1f64 FPR64:$Ra))),
-          (INST FPR64:$Rn, FPR64:$Rm, FPR64:$Ra)>;
-
-def : Neon_Scalar_GeneralMath3D_pattern<fadd, FADDddd>;
-def : Neon_Scalar_GeneralMath3D_pattern<fmul, FMULddd>;
-def : Neon_Scalar_GeneralMath3D_pattern<fsub, FSUBddd>;
-def : Neon_Scalar_GeneralMath3D_pattern<fdiv, FDIVddd>;
-def : Neon_Scalar_GeneralMath3D_pattern<int_arm_neon_vabds, FABDddd>;
-def : Neon_Scalar_GeneralMath3D_pattern<int_arm_neon_vmaxs, FMAXddd>;
-def : Neon_Scalar_GeneralMath3D_pattern<int_arm_neon_vmins, FMINddd>;
-def : Neon_Scalar_GeneralMath3D_pattern<int_aarch64_neon_vmaxnm, FMAXNMddd>;
-def : Neon_Scalar_GeneralMath3D_pattern<int_aarch64_neon_vminnm, FMINNMddd>;
-
-def : Neon_Scalar_GeneralMath2D_pattern<fabs, FABSdd>;
-def : Neon_Scalar_GeneralMath2D_pattern<fneg, FNEGdd>;
-
-def : Neon_Scalar_GeneralMath4D_pattern<fma, FMADDdddd>;
-def : Neon_Scalar_GeneralMath4D_pattern<fmsub, FMSUBdddd>;
-
-// Scalar Copy - DUP element to scalar
-class NeonI_Scalar_DUP<string asmop, string asmlane,
-                       RegisterClass ResRC, RegisterOperand VPRC,
-                       Operand OpImm>
-  : NeonI_ScalarCopy<(outs ResRC:$Rd), (ins VPRC:$Rn, OpImm:$Imm),
-                     asmop # "\t$Rd, $Rn." # asmlane # "[$Imm]",
-                     [],
-                     NoItinerary>,
-    Sched<[WriteFPALU, ReadFPALU]> {
-  bits<4> Imm;
-}
-
-def DUPbv_B : NeonI_Scalar_DUP<"dup", "b", FPR8, VPR128, neon_uimm4_bare> {
-  let Inst{20-16} = {Imm{3}, Imm{2}, Imm{1}, Imm{0}, 0b1};
-}
-def DUPhv_H : NeonI_Scalar_DUP<"dup", "h", FPR16, VPR128, neon_uimm3_bare> {
-  let Inst{20-16} = {Imm{2}, Imm{1}, Imm{0}, 0b1, 0b0};
-}
-def DUPsv_S : NeonI_Scalar_DUP<"dup", "s", FPR32, VPR128, neon_uimm2_bare> {
-  let Inst{20-16} = {Imm{1}, Imm{0}, 0b1, 0b0, 0b0};
-}
-def DUPdv_D : NeonI_Scalar_DUP<"dup", "d", FPR64, VPR128, neon_uimm1_bare> {
-  let Inst{20-16} = {Imm, 0b1, 0b0, 0b0, 0b0};
-}
-
-def : Pat<(f32 (vector_extract (v4f32 VPR128:$Rn), 0)),
-          (f32 (EXTRACT_SUBREG (v4f32 VPR128:$Rn), sub_32))>;
-def : Pat<(f32 (vector_extract (v4f32 VPR128:$Rn), 1)),
-          (f32 (DUPsv_S (v4f32 VPR128:$Rn), 1))>;
-def : Pat<(f32 (vector_extract (v4f32 VPR128:$Rn), 2)),
-          (f32 (DUPsv_S (v4f32 VPR128:$Rn), 2))>;
-def : Pat<(f32 (vector_extract (v4f32 VPR128:$Rn), 3)),
-          (f32 (DUPsv_S (v4f32 VPR128:$Rn), 3))>;
-
-def : Pat<(f64 (vector_extract (v2f64 VPR128:$Rn), 0)),
-          (f64 (EXTRACT_SUBREG (v2f64 VPR128:$Rn), sub_64))>;
-def : Pat<(f64 (vector_extract (v2f64 VPR128:$Rn), 1)),
-          (f64 (DUPdv_D (v2f64 VPR128:$Rn), 1))>;
-
-def : Pat<(f32 (vector_extract (v2f32 VPR64:$Rn), 0)),
-          (f32 (EXTRACT_SUBREG (v2f32 VPR64:$Rn), sub_32))>;
-def : Pat<(f32 (vector_extract (v2f32 VPR64:$Rn), 1)),
-          (f32 (DUPsv_S (v4f32 (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)),
-            1))>;
-
-def : Pat<(f64 (vector_extract (v1f64 VPR64:$Rn), 0)),
-          (f64 (EXTRACT_SUBREG (v1f64 VPR64:$Rn), sub_64))>;
-
-multiclass NeonI_Scalar_DUP_Ext_Vec_pattern<Instruction DUPI,
-  ValueType ResTy, ValueType OpTy,Operand OpLImm,
-  ValueType NOpTy, ValueType ExTy, Operand OpNImm> {
-
-  def : Pat<(ResTy (extract_subvector (OpTy VPR128:$Rn), OpLImm:$Imm)),
-            (ResTy (DUPI VPR128:$Rn, OpLImm:$Imm))>;
-
-  def : Pat<(ResTy (extract_subvector (NOpTy VPR64:$Rn), OpNImm:$Imm)),
-            (ResTy (DUPI
-              (ExTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)),
-                OpNImm:$Imm))>;
-}
-
-// Patterns for extract subvectors of v1ix data using scalar DUP instructions.
-defm : NeonI_Scalar_DUP_Ext_Vec_pattern<DUPbv_B, v1i8, v16i8, neon_uimm4_bare,
-                                        v8i8, v16i8, neon_uimm3_bare>;
-defm : NeonI_Scalar_DUP_Ext_Vec_pattern<DUPhv_H, v1i16, v8i16, neon_uimm3_bare,
-                                        v4i16, v8i16, neon_uimm2_bare>;
-defm : NeonI_Scalar_DUP_Ext_Vec_pattern<DUPsv_S, v1i32, v4i32, neon_uimm2_bare,
-                                        v2i32, v4i32, neon_uimm1_bare>;
-
-multiclass NeonI_Scalar_DUP_Copy_pattern1<Instruction DUPI, ValueType ResTy,
-                                          ValueType OpTy, ValueType ElemTy,
-                                          Operand OpImm, ValueType OpNTy,
-                                          ValueType ExTy, Operand OpNImm> {
-
-  def : Pat<(ResTy (vector_insert (ResTy undef),
-              (ElemTy (vector_extract (OpTy VPR128:$Rn), OpImm:$Imm)),
-              (neon_uimm0_bare:$Imm))),
-            (ResTy (DUPI (OpTy VPR128:$Rn), OpImm:$Imm))>;
-
-  def : Pat<(ResTy (vector_insert (ResTy undef),
-              (ElemTy (vector_extract (OpNTy VPR64:$Rn), OpNImm:$Imm)),
-              (OpNImm:$Imm))),
-            (ResTy (DUPI
-              (ExTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)),
-              OpNImm:$Imm))>;
-}
-
-multiclass NeonI_Scalar_DUP_Copy_pattern2<Instruction DUPI, ValueType ResTy,
-                                          ValueType OpTy, ValueType ElemTy,
-                                          Operand OpImm, ValueType OpNTy,
-                                          ValueType ExTy, Operand OpNImm> {
-
-  def : Pat<(ResTy (scalar_to_vector
-              (ElemTy (vector_extract (OpTy VPR128:$Rn), OpImm:$Imm)))),
-            (ResTy (DUPI (OpTy VPR128:$Rn), OpImm:$Imm))>;
-
-  def : Pat<(ResTy (scalar_to_vector
-              (ElemTy (vector_extract (OpNTy VPR64:$Rn), OpNImm:$Imm)))),
-            (ResTy (DUPI
-              (ExTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)),
-              OpNImm:$Imm))>;
-}
-
-// Patterns for vector copy to v1ix and v1fx vectors using scalar DUP
-// instructions.
-defm : NeonI_Scalar_DUP_Copy_pattern1<DUPdv_D,
-  v1i64, v2i64, i64, neon_uimm1_bare,
-  v1i64, v2i64, neon_uimm0_bare>;
-defm : NeonI_Scalar_DUP_Copy_pattern1<DUPsv_S,
-  v1i32, v4i32, i32, neon_uimm2_bare,
-  v2i32, v4i32, neon_uimm1_bare>;
-defm : NeonI_Scalar_DUP_Copy_pattern1<DUPhv_H,
-  v1i16, v8i16, i32, neon_uimm3_bare,
-  v4i16, v8i16, neon_uimm2_bare>;
-defm : NeonI_Scalar_DUP_Copy_pattern1<DUPbv_B,
-  v1i8, v16i8, i32, neon_uimm4_bare,
-  v8i8, v16i8, neon_uimm3_bare>;
-defm : NeonI_Scalar_DUP_Copy_pattern2<DUPdv_D,
-  v1i64, v2i64, i64, neon_uimm1_bare,
-  v1i64, v2i64, neon_uimm0_bare>;
-defm : NeonI_Scalar_DUP_Copy_pattern2<DUPsv_S,
-  v1i32, v4i32, i32, neon_uimm2_bare,
-  v2i32, v4i32, neon_uimm1_bare>;
-defm : NeonI_Scalar_DUP_Copy_pattern2<DUPhv_H,
-  v1i16, v8i16, i32, neon_uimm3_bare,
-  v4i16, v8i16, neon_uimm2_bare>;
-defm : NeonI_Scalar_DUP_Copy_pattern2<DUPbv_B,
-  v1i8, v16i8, i32, neon_uimm4_bare,
-  v8i8, v16i8, neon_uimm3_bare>;
-
-multiclass NeonI_Scalar_DUP_alias<string asmop, string asmlane,
-                                  Instruction DUPI, Operand OpImm,
-                                  RegisterClass ResRC> {
-  def : NeonInstAlias<!strconcat(asmop, "$Rd, $Rn" # asmlane # "[$Imm]"),
-          (DUPI ResRC:$Rd, VPR128:$Rn, OpImm:$Imm), 0b0>;
-}
-
-// Aliases for Scalar copy - DUP element (scalar)
-// FIXME: This is actually the preferred syntax but TableGen can't deal with
-// custom printing of aliases.
-defm : NeonI_Scalar_DUP_alias<"mov", ".b", DUPbv_B, neon_uimm4_bare, FPR8>;
-defm : NeonI_Scalar_DUP_alias<"mov", ".h", DUPhv_H, neon_uimm3_bare, FPR16>;
-defm : NeonI_Scalar_DUP_alias<"mov", ".s", DUPsv_S, neon_uimm2_bare, FPR32>;
-defm : NeonI_Scalar_DUP_alias<"mov", ".d", DUPdv_D, neon_uimm1_bare, FPR64>;
-
-multiclass NeonI_SDUP<PatFrag GetLow, PatFrag GetHigh, ValueType ResTy,
-                      ValueType OpTy> {
-  def : Pat<(ResTy (GetLow VPR128:$Rn)),
-            (ResTy (EXTRACT_SUBREG (OpTy VPR128:$Rn), sub_64))>;
-  def : Pat<(ResTy (GetHigh VPR128:$Rn)),
-            (ResTy (DUPdv_D (OpTy VPR128:$Rn), 1))>;
-}
-
-defm : NeonI_SDUP<Neon_Low16B, Neon_High16B, v8i8, v16i8>;
-defm : NeonI_SDUP<Neon_Low8H, Neon_High8H, v4i16, v8i16>;
-defm : NeonI_SDUP<Neon_Low4S, Neon_High4S, v2i32, v4i32>;
-defm : NeonI_SDUP<Neon_Low2D, Neon_High2D, v1i64, v2i64>;
-defm : NeonI_SDUP<Neon_Low4float, Neon_High4float, v2f32, v4f32>;
-defm : NeonI_SDUP<Neon_Low2double, Neon_High2double, v1f64, v2f64>;
-
-// The following is for sext/zext from v1xx to v1xx
-multiclass NeonI_ext<string prefix, SDNode ExtOp> {
-  // v1i32 -> v1i64
-  def : Pat<(v1i64 (ExtOp (v1i32 FPR32:$Rn))),
-            (EXTRACT_SUBREG 
-              (v2i64 (!cast<Instruction>(prefix # "_2S")
-                (v2i32 (SUBREG_TO_REG (i64 0), $Rn, sub_32)), 0)),
-              sub_64)>;
-  
-  // v1i16 -> v1i32
-  def : Pat<(v1i32 (ExtOp (v1i16 FPR16:$Rn))),
-            (EXTRACT_SUBREG 
-              (v4i32 (!cast<Instruction>(prefix # "_4H")
-                (v4i16 (SUBREG_TO_REG (i64 0), $Rn, sub_16)), 0)),
-              sub_32)>;
-  
-  // v1i8 -> v1i16
-  def : Pat<(v1i16 (ExtOp (v1i8 FPR8:$Rn))),
-            (EXTRACT_SUBREG 
-              (v8i16 (!cast<Instruction>(prefix # "_8B")
-                (v8i8 (SUBREG_TO_REG (i64 0), $Rn, sub_8)), 0)),
-              sub_16)>;
-}
-
-defm NeonI_zext : NeonI_ext<"USHLLvvi", zext>;
-defm NeonI_sext : NeonI_ext<"SSHLLvvi", sext>;
-
-// zext v1i8 -> v1i32
-def : Pat<(v1i32 (zext (v1i8 FPR8:$Rn))),
-          (v1i32 (EXTRACT_SUBREG
-            (v1i64 (SUBREG_TO_REG (i64 0),
-              (v1i8 (DUPbv_B
-                (v16i8 (SUBREG_TO_REG (i64 0), $Rn, sub_8)),
-                0)),
-              sub_8)),
-            sub_32))>;
-
-// zext v1i8 -> v1i64
-def : Pat<(v1i64 (zext (v1i8 FPR8:$Rn))),
-          (v1i64 (SUBREG_TO_REG (i64 0),
-            (v1i8 (DUPbv_B
-              (v16i8 (SUBREG_TO_REG (i64 0), $Rn, sub_8)),
-              0)),
-            sub_8))>;
-
-// zext v1i16 -> v1i64
-def : Pat<(v1i64 (zext (v1i16 FPR16:$Rn))),
-          (v1i64 (SUBREG_TO_REG (i64 0),
-            (v1i16 (DUPhv_H
-              (v8i16 (SUBREG_TO_REG (i64 0), $Rn, sub_16)),
-              0)),
-            sub_16))>;
-
-// sext v1i8 -> v1i32
-def : Pat<(v1i32 (sext (v1i8 FPR8:$Rn))),
-          (EXTRACT_SUBREG
-            (v4i32 (SSHLLvvi_4H
-              (v4i16 (SUBREG_TO_REG (i64 0),
-                (v1i16 (EXTRACT_SUBREG 
-                  (v8i16 (SSHLLvvi_8B
-                    (v8i8 (SUBREG_TO_REG (i64 0), $Rn, sub_8)), 0)),
-                  sub_16)),
-                sub_16)), 0)),
-            sub_32)>;
-              
-// sext v1i8 -> v1i64
-def : Pat<(v1i64 (sext (v1i8 FPR8:$Rn))),
-          (EXTRACT_SUBREG 
-            (v2i64 (SSHLLvvi_2S
-              (v2i32 (SUBREG_TO_REG (i64 0),
-                (v1i32 (EXTRACT_SUBREG
-                  (v4i32 (SSHLLvvi_4H
-                    (v4i16 (SUBREG_TO_REG (i64 0),
-                      (v1i16 (EXTRACT_SUBREG 
-                        (v8i16 (SSHLLvvi_8B
-                          (v8i8 (SUBREG_TO_REG (i64 0), $Rn, sub_8)), 0)),
-                        sub_16)),
-                      sub_16)), 0)),
-                  sub_32)),
-                sub_32)), 0)),
-            sub_64)>;
-
-  
-// sext v1i16 -> v1i64
-def : Pat<(v1i64 (sext (v1i16 FPR16:$Rn))),
-          (EXTRACT_SUBREG
-            (v2i64 (SSHLLvvi_2S
-              (v2i32 (SUBREG_TO_REG (i64 0),
-                (v1i32 (EXTRACT_SUBREG 
-                  (v4i32 (SSHLLvvi_4H
-                    (v4i16 (SUBREG_TO_REG (i64 0), $Rn, sub_16)), 0)),
-                  sub_32)),
-                sub_32)), 0)),
-            sub_64)>;
-
-//===----------------------------------------------------------------------===//
-// Non-Instruction Patterns
-//===----------------------------------------------------------------------===//
-
-// 64-bit vector bitcasts...
-
-def : Pat<(v1i64 (bitconvert (v8i8  VPR64:$src))), (v1i64 VPR64:$src)>;
-def : Pat<(v2f32 (bitconvert (v8i8  VPR64:$src))), (v2f32 VPR64:$src)>;
-def : Pat<(v2i32 (bitconvert (v8i8  VPR64:$src))), (v2i32 VPR64:$src)>;
-def : Pat<(v4i16 (bitconvert (v8i8  VPR64:$src))), (v4i16 VPR64:$src)>;
-
-def : Pat<(v1i64 (bitconvert (v4i16  VPR64:$src))), (v1i64 VPR64:$src)>;
-def : Pat<(v2i32 (bitconvert (v4i16  VPR64:$src))), (v2i32 VPR64:$src)>;
-def : Pat<(v2f32 (bitconvert (v4i16  VPR64:$src))), (v2f32 VPR64:$src)>;
-def : Pat<(v8i8  (bitconvert (v4i16  VPR64:$src))), (v8i8 VPR64:$src)>;
-
-def : Pat<(v1i64 (bitconvert (v2i32  VPR64:$src))), (v1i64 VPR64:$src)>;
-def : Pat<(v2f32 (bitconvert (v2i32  VPR64:$src))), (v2f32 VPR64:$src)>;
-def : Pat<(v4i16 (bitconvert (v2i32  VPR64:$src))), (v4i16 VPR64:$src)>;
-def : Pat<(v8i8  (bitconvert (v2i32  VPR64:$src))), (v8i8 VPR64:$src)>;
-
-def : Pat<(v1i64 (bitconvert (v2f32  VPR64:$src))), (v1i64 VPR64:$src)>;
-def : Pat<(v2i32 (bitconvert (v2f32  VPR64:$src))), (v2i32 VPR64:$src)>;
-def : Pat<(v4i16 (bitconvert (v2f32  VPR64:$src))), (v4i16 VPR64:$src)>;
-def : Pat<(v8i8  (bitconvert (v2f32  VPR64:$src))), (v8i8 VPR64:$src)>;
-
-def : Pat<(v2f32 (bitconvert (v1i64  VPR64:$src))), (v2f32 VPR64:$src)>;
-def : Pat<(v2i32 (bitconvert (v1i64  VPR64:$src))), (v2i32 VPR64:$src)>;
-def : Pat<(v4i16 (bitconvert (v1i64  VPR64:$src))), (v4i16 VPR64:$src)>;
-def : Pat<(v8i8  (bitconvert (v1i64  VPR64:$src))), (v8i8 VPR64:$src)>;
-
-def : Pat<(v1i64 (bitconvert (v1f64  VPR64:$src))), (v1i64 VPR64:$src)>;
-def : Pat<(v2f32 (bitconvert (v1f64  VPR64:$src))), (v2f32 VPR64:$src)>;
-def : Pat<(v2i32 (bitconvert (v1f64  VPR64:$src))), (v2i32 VPR64:$src)>;
-def : Pat<(v4i16 (bitconvert (v1f64  VPR64:$src))), (v4i16 VPR64:$src)>;
-def : Pat<(v8i8 (bitconvert (v1f64  VPR64:$src))), (v8i8 VPR64:$src)>;
-def : Pat<(f64   (bitconvert (v1f64  VPR64:$src))), (f64 VPR64:$src)>;
-
-def : Pat<(v1f64 (bitconvert (v1i64  VPR64:$src))), (v1f64 VPR64:$src)>;
-def : Pat<(v1f64 (bitconvert (v2f32  VPR64:$src))), (v1f64 VPR64:$src)>;
-def : Pat<(v1f64 (bitconvert (v2i32  VPR64:$src))), (v1f64 VPR64:$src)>;
-def : Pat<(v1f64 (bitconvert (v4i16  VPR64:$src))), (v1f64 VPR64:$src)>;
-def : Pat<(v1f64 (bitconvert (v8i8  VPR64:$src))), (v1f64 VPR64:$src)>;
-def : Pat<(v1f64 (bitconvert (f64  VPR64:$src))), (v1f64 VPR64:$src)>;
-
-// ..and 128-bit vector bitcasts...
-
-def : Pat<(v2f64 (bitconvert (v16i8  VPR128:$src))), (v2f64 VPR128:$src)>;
-def : Pat<(v2i64 (bitconvert (v16i8  VPR128:$src))), (v2i64 VPR128:$src)>;
-def : Pat<(v4f32 (bitconvert (v16i8  VPR128:$src))), (v4f32 VPR128:$src)>;
-def : Pat<(v4i32 (bitconvert (v16i8  VPR128:$src))), (v4i32 VPR128:$src)>;
-def : Pat<(v8i16 (bitconvert (v16i8  VPR128:$src))), (v8i16 VPR128:$src)>;
-
-def : Pat<(v2f64 (bitconvert (v8i16  VPR128:$src))), (v2f64 VPR128:$src)>;
-def : Pat<(v2i64 (bitconvert (v8i16  VPR128:$src))), (v2i64 VPR128:$src)>;
-def : Pat<(v4i32 (bitconvert (v8i16  VPR128:$src))), (v4i32 VPR128:$src)>;
-def : Pat<(v4f32 (bitconvert (v8i16  VPR128:$src))), (v4f32 VPR128:$src)>;
-def : Pat<(v16i8 (bitconvert (v8i16  VPR128:$src))), (v16i8 VPR128:$src)>;
-
-def : Pat<(v2f64 (bitconvert (v4i32  VPR128:$src))), (v2f64 VPR128:$src)>;
-def : Pat<(v2i64 (bitconvert (v4i32  VPR128:$src))), (v2i64 VPR128:$src)>;
-def : Pat<(v4f32 (bitconvert (v4i32  VPR128:$src))), (v4f32 VPR128:$src)>;
-def : Pat<(v8i16 (bitconvert (v4i32  VPR128:$src))), (v8i16 VPR128:$src)>;
-def : Pat<(v16i8 (bitconvert (v4i32  VPR128:$src))), (v16i8 VPR128:$src)>;
-
-def : Pat<(v2f64 (bitconvert (v4f32  VPR128:$src))), (v2f64 VPR128:$src)>;
-def : Pat<(v2i64 (bitconvert (v4f32  VPR128:$src))), (v2i64 VPR128:$src)>;
-def : Pat<(v4i32 (bitconvert (v4f32  VPR128:$src))), (v4i32 VPR128:$src)>;
-def : Pat<(v8i16 (bitconvert (v4f32  VPR128:$src))), (v8i16 VPR128:$src)>;
-def : Pat<(v16i8 (bitconvert (v4f32  VPR128:$src))), (v16i8 VPR128:$src)>;
-
-def : Pat<(v2f64 (bitconvert (v2i64  VPR128:$src))), (v2f64 VPR128:$src)>;
-def : Pat<(v4f32 (bitconvert (v2i64  VPR128:$src))), (v4f32 VPR128:$src)>;
-def : Pat<(v4i32 (bitconvert (v2i64  VPR128:$src))), (v4i32 VPR128:$src)>;
-def : Pat<(v8i16 (bitconvert (v2i64  VPR128:$src))), (v8i16 VPR128:$src)>;
-def : Pat<(v16i8 (bitconvert (v2i64  VPR128:$src))), (v16i8 VPR128:$src)>;
-
-def : Pat<(v2i64 (bitconvert (v2f64  VPR128:$src))), (v2i64 VPR128:$src)>;
-def : Pat<(v4f32 (bitconvert (v2f64  VPR128:$src))), (v4f32 VPR128:$src)>;
-def : Pat<(v4i32 (bitconvert (v2f64  VPR128:$src))), (v4i32 VPR128:$src)>;
-def : Pat<(v8i16 (bitconvert (v2f64  VPR128:$src))), (v8i16 VPR128:$src)>;
-def : Pat<(v16i8 (bitconvert (v2f64  VPR128:$src))), (v16i8 VPR128:$src)>;
-
-// ...and scalar bitcasts...
-def : Pat<(f16 (bitconvert (v1i16  FPR16:$src))), (f16 FPR16:$src)>;
-def : Pat<(f32 (bitconvert (v1i32  FPR32:$src))), (f32 FPR32:$src)>;
-def : Pat<(f64 (bitconvert (v1i64  FPR64:$src))), (f64 FPR64:$src)>;
-def : Pat<(f64 (bitconvert (v1f64  FPR64:$src))), (f64 FPR64:$src)>;
-
-def : Pat<(i64 (bitconvert (v1i64  FPR64:$src))), (FMOVxd $src)>;
-def : Pat<(i64 (bitconvert (v1f64  FPR64:$src))), (FMOVxd $src)>;
-def : Pat<(i64 (bitconvert (v2i32  FPR64:$src))), (FMOVxd $src)>;
-def : Pat<(i64 (bitconvert (v2f32  FPR64:$src))), (FMOVxd $src)>;
-def : Pat<(i64 (bitconvert (v4i16  FPR64:$src))), (FMOVxd $src)>;
-def : Pat<(i64 (bitconvert (v8i8  FPR64:$src))), (FMOVxd $src)>;
-
-def : Pat<(i32 (bitconvert (v1i32  FPR32:$src))), (FMOVws $src)>;
-
-def : Pat<(v8i8  (bitconvert (v1i64  VPR64:$src))), (v8i8 VPR64:$src)>;
-def : Pat<(v4i16 (bitconvert (v1i64  VPR64:$src))), (v4i16 VPR64:$src)>;
-def : Pat<(v2i32 (bitconvert (v1i64  VPR64:$src))), (v2i32 VPR64:$src)>;
-
-def : Pat<(f64   (bitconvert (v8i8  VPR64:$src))), (f64 VPR64:$src)>;
-def : Pat<(f64   (bitconvert (v4i16  VPR64:$src))), (f64 VPR64:$src)>;
-def : Pat<(f64   (bitconvert (v2i32  VPR64:$src))), (f64 VPR64:$src)>;
-def : Pat<(f64   (bitconvert (v2f32  VPR64:$src))), (f64 VPR64:$src)>;
-def : Pat<(f64   (bitconvert (v1i64  VPR64:$src))), (f64 VPR64:$src)>;
-
-def : Pat<(f128  (bitconvert (v16i8  VPR128:$src))), (f128 VPR128:$src)>;
-def : Pat<(f128  (bitconvert (v8i16  VPR128:$src))), (f128 VPR128:$src)>;
-def : Pat<(f128  (bitconvert (v4i32  VPR128:$src))), (f128 VPR128:$src)>;
-def : Pat<(f128  (bitconvert (v2i64  VPR128:$src))), (f128 VPR128:$src)>;
-def : Pat<(f128  (bitconvert (v4f32  VPR128:$src))), (f128 VPR128:$src)>;
-def : Pat<(f128  (bitconvert (v2f64  VPR128:$src))), (f128 VPR128:$src)>;
-
-def : Pat<(v1i16 (bitconvert (f16  FPR16:$src))), (v1i16 FPR16:$src)>;
-def : Pat<(v1i32 (bitconvert (f32  FPR32:$src))), (v1i32 FPR32:$src)>;
-def : Pat<(v1i64 (bitconvert (f64  FPR64:$src))), (v1i64 FPR64:$src)>;
-def : Pat<(v1f64 (bitconvert (f64  FPR64:$src))), (v1f64 FPR64:$src)>;
-
-def : Pat<(v1i64 (bitconvert (i64  GPR64:$src))), (FMOVdx $src)>;
-def : Pat<(v1f64 (bitconvert (i64  GPR64:$src))), (FMOVdx $src)>;
-def : Pat<(v2i32 (bitconvert (i64  GPR64:$src))), (FMOVdx $src)>;
-def : Pat<(v2f32 (bitconvert (i64  GPR64:$src))), (FMOVdx $src)>;
-def : Pat<(v4i16 (bitconvert (i64  GPR64:$src))), (FMOVdx $src)>;
-def : Pat<(v8i8 (bitconvert (i64  GPR64:$src))), (FMOVdx $src)>;
-
-def : Pat<(v1i32 (bitconvert (i32  GPR32:$src))), (FMOVsw $src)>;
-
-def : Pat<(v8i8   (bitconvert (f64   FPR64:$src))), (v8i8 FPR64:$src)>;
-def : Pat<(v4i16  (bitconvert (f64   FPR64:$src))), (v4i16 FPR64:$src)>;
-def : Pat<(v2i32  (bitconvert (f64   FPR64:$src))), (v2i32 FPR64:$src)>;
-def : Pat<(v2f32  (bitconvert (f64   FPR64:$src))), (v2f32 FPR64:$src)>;
-def : Pat<(v1i64  (bitconvert (f64   FPR64:$src))), (v1i64 FPR64:$src)>;
-
-def : Pat<(v16i8  (bitconvert (f128   FPR128:$src))), (v16i8 FPR128:$src)>;
-def : Pat<(v8i16  (bitconvert (f128   FPR128:$src))), (v8i16 FPR128:$src)>;
-def : Pat<(v4i32  (bitconvert (f128   FPR128:$src))), (v4i32 FPR128:$src)>;
-def : Pat<(v2i64  (bitconvert (f128   FPR128:$src))), (v2i64 FPR128:$src)>;
-def : Pat<(v4f32  (bitconvert (f128   FPR128:$src))), (v4f32 FPR128:$src)>;
-def : Pat<(v2f64  (bitconvert (f128   FPR128:$src))), (v2f64 FPR128:$src)>;
-
-// Scalar Three Same
-
-def neon_uimm3 : Operand<i64>,
-                   ImmLeaf<i64, [{return Imm < 8;}]> {
-  let ParserMatchClass = uimm3_asmoperand;
-  let PrintMethod = "printUImmHexOperand";
-}
-
-def neon_uimm4 : Operand<i64>,
-                   ImmLeaf<i64, [{return Imm < 16;}]> {
-  let ParserMatchClass = uimm4_asmoperand;
-  let PrintMethod = "printUImmHexOperand";
-}
-
-// Bitwise Extract
-class NeonI_Extract<bit q, bits<2> op2, string asmop,
-                    string OpS, RegisterOperand OpVPR, Operand OpImm>
-  : NeonI_BitExtract<q, op2, (outs OpVPR:$Rd),
-                     (ins OpVPR:$Rn, OpVPR:$Rm, OpImm:$Index),
-                     asmop # "\t$Rd." # OpS # ", $Rn." # OpS #
-                     ", $Rm." # OpS # ", $Index",
-                     [],
-                     NoItinerary>,
-    Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>{
-  bits<4> Index;
-}
-
-def EXTvvvi_8b : NeonI_Extract<0b0, 0b00, "ext", "8b",
-                               VPR64, neon_uimm3> {
-  let Inst{14-11} = {0b0, Index{2}, Index{1}, Index{0}};
-}
-
-def EXTvvvi_16b: NeonI_Extract<0b1, 0b00, "ext", "16b",
-                               VPR128, neon_uimm4> {
-  let Inst{14-11} = Index;
-}
-
-class NI_Extract<ValueType OpTy, RegisterOperand OpVPR, Instruction INST,
-                 Operand OpImm>
-  : Pat<(OpTy (Neon_vextract (OpTy OpVPR:$Rn), (OpTy OpVPR:$Rm),
-                                 (i64 OpImm:$Imm))),
-              (INST OpVPR:$Rn, OpVPR:$Rm, OpImm:$Imm)>;
-
-def : NI_Extract<v8i8,  VPR64,  EXTvvvi_8b,  neon_uimm3>;
-def : NI_Extract<v4i16, VPR64,  EXTvvvi_8b,  neon_uimm3>;
-def : NI_Extract<v2i32, VPR64,  EXTvvvi_8b,  neon_uimm3>;
-def : NI_Extract<v1i64, VPR64,  EXTvvvi_8b,  neon_uimm3>;
-def : NI_Extract<v2f32, VPR64,  EXTvvvi_8b,  neon_uimm3>;
-def : NI_Extract<v1f64, VPR64,  EXTvvvi_8b,  neon_uimm3>;
-def : NI_Extract<v16i8, VPR128, EXTvvvi_16b, neon_uimm4>;
-def : NI_Extract<v8i16, VPR128, EXTvvvi_16b, neon_uimm4>;
-def : NI_Extract<v4i32, VPR128, EXTvvvi_16b, neon_uimm4>;
-def : NI_Extract<v2i64, VPR128, EXTvvvi_16b, neon_uimm4>;
-def : NI_Extract<v4f32, VPR128, EXTvvvi_16b, neon_uimm4>;
-def : NI_Extract<v2f64, VPR128, EXTvvvi_16b, neon_uimm4>;
-
-// Table lookup
-class NI_TBL<bit q, bits<2> op2, bits<2> len, bit op,
-             string asmop, string OpS, RegisterOperand OpVPR,
-             RegisterOperand VecList>
-  : NeonI_TBL<q, op2, len, op,
-              (outs OpVPR:$Rd), (ins VecList:$Rn, OpVPR:$Rm),
-              asmop # "\t$Rd." # OpS # ", $Rn, $Rm." # OpS,
-              [],
-              NoItinerary>,
-    Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
-
-// The vectors in look up table are always 16b
-multiclass NI_TBL_pat<bits<2> len, bit op, string asmop, string List> {
-  def _8b  : NI_TBL<0, 0b00, len, op, asmop, "8b", VPR64,
-                    !cast<RegisterOperand>(List # "16B_operand")>;
-
-  def _16b : NI_TBL<1, 0b00, len, op, asmop, "16b", VPR128,
-                    !cast<RegisterOperand>(List # "16B_operand")>;
-}
-
-defm TBL1 : NI_TBL_pat<0b00, 0b0, "tbl", "VOne">;
-defm TBL2 : NI_TBL_pat<0b01, 0b0, "tbl", "VPair">;
-defm TBL3 : NI_TBL_pat<0b10, 0b0, "tbl", "VTriple">;
-defm TBL4 : NI_TBL_pat<0b11, 0b0, "tbl", "VQuad">;
-
-// Table lookup extension
-class NI_TBX<bit q, bits<2> op2, bits<2> len, bit op,
-             string asmop, string OpS, RegisterOperand OpVPR,
-             RegisterOperand VecList>
-  : NeonI_TBL<q, op2, len, op,
-              (outs OpVPR:$Rd), (ins OpVPR:$src, VecList:$Rn, OpVPR:$Rm),
-              asmop # "\t$Rd." # OpS # ", $Rn, $Rm." # OpS,
-              [],
-              NoItinerary>,
-    Sched<[WriteFPALU, ReadFPALU, ReadFPALU, ReadFPALU]> {
-  let Constraints = "$src = $Rd";
-}
-
-// The vectors in look up table are always 16b
-multiclass NI_TBX_pat<bits<2> len, bit op, string asmop, string List> {
-  def _8b  : NI_TBX<0, 0b00, len, op, asmop, "8b", VPR64,
-                    !cast<RegisterOperand>(List # "16B_operand")>;
-
-  def _16b : NI_TBX<1, 0b00, len, op, asmop, "16b", VPR128,
-                    !cast<RegisterOperand>(List # "16B_operand")>;
-}
-
-defm TBX1 : NI_TBX_pat<0b00, 0b1, "tbx", "VOne">;
-defm TBX2 : NI_TBX_pat<0b01, 0b1, "tbx", "VPair">;
-defm TBX3 : NI_TBX_pat<0b10, 0b1, "tbx", "VTriple">;
-defm TBX4 : NI_TBX_pat<0b11, 0b1, "tbx", "VQuad">;
-
-class NeonI_INS_main<string asmop, string Res, ValueType ResTy,
-                     RegisterClass OpGPR, ValueType OpTy, Operand OpImm>
-  : NeonI_copy<0b1, 0b0, 0b0011,
-               (outs VPR128:$Rd), (ins VPR128:$src, OpGPR:$Rn, OpImm:$Imm),
-               asmop # "\t$Rd." # Res # "[$Imm], $Rn",
-               [(set (ResTy VPR128:$Rd),
-                 (ResTy (vector_insert
-                   (ResTy VPR128:$src),
-                   (OpTy OpGPR:$Rn),
-                   (OpImm:$Imm))))],
-               NoItinerary>,
-    Sched<[WriteFPALU, ReadFPALU, ReadFPALU]> {
-  bits<4> Imm;
-  let Constraints = "$src = $Rd";
-}
-
-//Insert element (vector, from main)
-def INSbw : NeonI_INS_main<"ins", "b", v16i8, GPR32, i32,
-                           neon_uimm4_bare> {
-  let Inst{20-16} = {Imm{3}, Imm{2}, Imm{1}, Imm{0}, 0b1};
-}
-def INShw : NeonI_INS_main<"ins", "h", v8i16, GPR32, i32,
-                           neon_uimm3_bare> {
-  let Inst{20-16} = {Imm{2}, Imm{1}, Imm{0}, 0b1, 0b0};
-}
-def INSsw : NeonI_INS_main<"ins", "s", v4i32, GPR32, i32,
-                           neon_uimm2_bare> {
-  let Inst{20-16} = {Imm{1}, Imm{0}, 0b1, 0b0, 0b0};
-}
-def INSdx : NeonI_INS_main<"ins", "d", v2i64, GPR64, i64,
-                           neon_uimm1_bare> {
-  let Inst{20-16} = {Imm, 0b1, 0b0, 0b0, 0b0};
-}
-
-def : NeonInstAlias<"mov $Rd.b[$Imm], $Rn",
-                    (INSbw VPR128:$Rd, GPR32:$Rn, neon_uimm4_bare:$Imm), 0>;
-def : NeonInstAlias<"mov $Rd.h[$Imm], $Rn",
-                    (INShw VPR128:$Rd, GPR32:$Rn, neon_uimm3_bare:$Imm), 0>;
-def : NeonInstAlias<"mov $Rd.s[$Imm], $Rn",
-                    (INSsw VPR128:$Rd, GPR32:$Rn, neon_uimm2_bare:$Imm), 0>;
-def : NeonInstAlias<"mov $Rd.d[$Imm], $Rn",
-                    (INSdx VPR128:$Rd, GPR64:$Rn, neon_uimm1_bare:$Imm), 0>;
-
-class Neon_INS_main_pattern <ValueType ResTy,ValueType ExtResTy,
-                             RegisterClass OpGPR, ValueType OpTy,
-                             Operand OpImm, Instruction INS>
-  : Pat<(ResTy (vector_insert
-              (ResTy VPR64:$src),
-              (OpTy OpGPR:$Rn),
-              (OpImm:$Imm))),
-        (ResTy (EXTRACT_SUBREG
-          (ExtResTy (INS (ExtResTy (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64)),
-            OpGPR:$Rn, OpImm:$Imm)), sub_64))>;
-
-def INSbw_pattern : Neon_INS_main_pattern<v8i8, v16i8, GPR32, i32,
-                                          neon_uimm3_bare, INSbw>;
-def INShw_pattern : Neon_INS_main_pattern<v4i16, v8i16, GPR32, i32,
-                                          neon_uimm2_bare, INShw>;
-def INSsw_pattern : Neon_INS_main_pattern<v2i32, v4i32, GPR32, i32,
-                                          neon_uimm1_bare, INSsw>;
-def INSdx_pattern : Neon_INS_main_pattern<v1i64, v2i64, GPR64, i64,
-                                          neon_uimm0_bare, INSdx>;
-
-class NeonI_INS_element<string asmop, string Res, Operand ResImm>
-  : NeonI_insert<0b1, 0b1,
-                 (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn,
-                 ResImm:$Immd, ResImm:$Immn),
-                 asmop # "\t$Rd." # Res # "[$Immd], $Rn." # Res # "[$Immn]",
-                 [],
-                 NoItinerary>,
-    Sched<[WriteFPALU, ReadFPALU, ReadFPALU]> {
-  let Constraints = "$src = $Rd";
-  bits<4> Immd;
-  bits<4> Immn;
-}
-
-//Insert element (vector, from element)
-def INSELb : NeonI_INS_element<"ins", "b", neon_uimm4_bare> {
-  let Inst{20-16} = {Immd{3}, Immd{2}, Immd{1}, Immd{0}, 0b1};
-  let Inst{14-11} = {Immn{3}, Immn{2}, Immn{1}, Immn{0}};
-}
-def INSELh : NeonI_INS_element<"ins", "h", neon_uimm3_bare> {
-  let Inst{20-16} = {Immd{2}, Immd{1}, Immd{0}, 0b1, 0b0};
-  let Inst{14-11} = {Immn{2}, Immn{1}, Immn{0}, 0b0};
-  // bit 11 is unspecified, but should be set to zero.
-}
-def INSELs : NeonI_INS_element<"ins", "s", neon_uimm2_bare> {
-  let Inst{20-16} = {Immd{1}, Immd{0}, 0b1, 0b0, 0b0};
-  let Inst{14-11} = {Immn{1}, Immn{0}, 0b0, 0b0};
-  // bits 11-12 are unspecified, but should be set to zero.
-}
-def INSELd : NeonI_INS_element<"ins", "d", neon_uimm1_bare> {
-  let Inst{20-16} = {Immd, 0b1, 0b0, 0b0, 0b0};
-  let Inst{14-11} = {Immn{0}, 0b0, 0b0, 0b0};
-  // bits 11-13 are unspecified, but should be set to zero.
-}
-
-def : NeonInstAlias<"mov $Rd.b[$Immd], $Rn.b[$Immn]",
-                    (INSELb VPR128:$Rd, VPR128:$Rn,
-                      neon_uimm4_bare:$Immd, neon_uimm4_bare:$Immn), 0>;
-def : NeonInstAlias<"mov $Rd.h[$Immd], $Rn.h[$Immn]",
-                    (INSELh VPR128:$Rd, VPR128:$Rn,
-                      neon_uimm3_bare:$Immd, neon_uimm3_bare:$Immn), 0>;
-def : NeonInstAlias<"mov $Rd.s[$Immd], $Rn.s[$Immn]",
-                    (INSELs VPR128:$Rd, VPR128:$Rn,
-                      neon_uimm2_bare:$Immd, neon_uimm2_bare:$Immn), 0>;
-def : NeonInstAlias<"mov $Rd.d[$Immd], $Rn.d[$Immn]",
-                    (INSELd VPR128:$Rd, VPR128:$Rn,
-                      neon_uimm1_bare:$Immd, neon_uimm1_bare:$Immn), 0>;
-
-multiclass Neon_INS_elt_pattern<ValueType ResTy, ValueType NaTy,
-                                ValueType MidTy, Operand StImm, Operand NaImm,
-                                Instruction INS> {
-def : Pat<(ResTy (vector_insert
-            (ResTy VPR128:$src),
-            (MidTy (vector_extract
-              (ResTy VPR128:$Rn),
-              (StImm:$Immn))),
-            (StImm:$Immd))),
-          (INS (ResTy VPR128:$src), (ResTy VPR128:$Rn),
-              StImm:$Immd, StImm:$Immn)>;
-
-def : Pat <(ResTy (vector_insert
-             (ResTy VPR128:$src),
-             (MidTy (vector_extract
-               (NaTy VPR64:$Rn),
-               (NaImm:$Immn))),
-             (StImm:$Immd))),
-           (INS (ResTy VPR128:$src),
-             (ResTy (SUBREG_TO_REG (i64 0), (NaTy VPR64:$Rn), sub_64)),
-             StImm:$Immd, NaImm:$Immn)>;
-
-def : Pat <(NaTy (vector_insert
-             (NaTy VPR64:$src),
-             (MidTy (vector_extract
-               (ResTy VPR128:$Rn),
-               (StImm:$Immn))),
-             (NaImm:$Immd))),
-           (NaTy (EXTRACT_SUBREG
-             (ResTy (INS
-               (ResTy (SUBREG_TO_REG (i64 0), (NaTy VPR64:$src), sub_64)),
-               (ResTy VPR128:$Rn),
-               NaImm:$Immd, StImm:$Immn)),
-             sub_64))>;
-
-def : Pat <(NaTy (vector_insert
-             (NaTy VPR64:$src),
-             (MidTy (vector_extract
-               (NaTy VPR64:$Rn),
-               (NaImm:$Immn))),
-             (NaImm:$Immd))),
-           (NaTy (EXTRACT_SUBREG
-             (ResTy (INS
-               (ResTy (SUBREG_TO_REG (i64 0), (NaTy VPR64:$src), sub_64)),
-               (ResTy (SUBREG_TO_REG (i64 0), (NaTy VPR64:$Rn), sub_64)),
-               NaImm:$Immd, NaImm:$Immn)),
-             sub_64))>;
-}
-
-defm : Neon_INS_elt_pattern<v4f32, v2f32, f32, neon_uimm2_bare,
-                            neon_uimm1_bare, INSELs>;
-defm : Neon_INS_elt_pattern<v2f64, v1f64, f64, neon_uimm1_bare,
-                            neon_uimm0_bare, INSELd>;
-defm : Neon_INS_elt_pattern<v16i8, v8i8, i32, neon_uimm4_bare,
-                            neon_uimm3_bare, INSELb>;
-defm : Neon_INS_elt_pattern<v8i16, v4i16, i32, neon_uimm3_bare,
-                            neon_uimm2_bare, INSELh>;
-defm : Neon_INS_elt_pattern<v4i32, v2i32, i32, neon_uimm2_bare,
-                            neon_uimm1_bare, INSELs>;
-defm : Neon_INS_elt_pattern<v2i64, v1i64, i64, neon_uimm1_bare,
-                            neon_uimm0_bare, INSELd>;
-
-multiclass Neon_INS_elt_float_pattern<ValueType ResTy, ValueType NaTy,
-                                      ValueType MidTy,
-                                      RegisterClass OpFPR, Operand ResImm,
-                                      SubRegIndex SubIndex, Instruction INS> {
-def : Pat <(ResTy (vector_insert
-             (ResTy VPR128:$src),
-             (MidTy OpFPR:$Rn),
-             (ResImm:$Imm))),
-           (INS (ResTy VPR128:$src),
-             (ResTy (SUBREG_TO_REG (i64 0), OpFPR:$Rn, SubIndex)),
-             ResImm:$Imm,
-             (i64 0))>;
-
-def : Pat <(NaTy (vector_insert
-             (NaTy VPR64:$src),
-             (MidTy OpFPR:$Rn),
-             (ResImm:$Imm))),
-           (NaTy (EXTRACT_SUBREG
-             (ResTy (INS
-               (ResTy (SUBREG_TO_REG (i64 0), (NaTy VPR64:$src), sub_64)),
-               (ResTy (SUBREG_TO_REG (i64 0), (MidTy OpFPR:$Rn), SubIndex)),
-               ResImm:$Imm,
-               (i64 0))),
-             sub_64))>;
-}
-
-defm : Neon_INS_elt_float_pattern<v4f32, v2f32, f32, FPR32, neon_uimm2_bare,
-                                  sub_32, INSELs>;
-defm : Neon_INS_elt_float_pattern<v2f64, v1f64, f64, FPR64, neon_uimm1_bare,
-                                  sub_64, INSELd>;
-
-class NeonI_SMOV<string asmop, string Res, bit Q,
-                 ValueType OpTy, ValueType eleTy,
-                 Operand OpImm, RegisterClass ResGPR, ValueType ResTy>
-  : NeonI_copy<Q, 0b0, 0b0101,
-               (outs ResGPR:$Rd), (ins VPR128:$Rn, OpImm:$Imm),
-               asmop # "\t$Rd, $Rn." # Res # "[$Imm]",
-               [(set (ResTy ResGPR:$Rd),
-                 (ResTy (sext_inreg
-                   (ResTy (vector_extract
-                     (OpTy VPR128:$Rn), (OpImm:$Imm))),
-                   eleTy)))],
-               NoItinerary>,
-    Sched<[WriteFPALU, ReadFPALU]> {
-  bits<4> Imm;
-}
-
-//Signed integer move (main, from element)
-def SMOVwb : NeonI_SMOV<"smov", "b", 0b0, v16i8, i8, neon_uimm4_bare,
-                        GPR32, i32> {
-  let Inst{20-16} = {Imm{3}, Imm{2}, Imm{1}, Imm{0}, 0b1};
-}
-def SMOVwh : NeonI_SMOV<"smov", "h", 0b0, v8i16, i16, neon_uimm3_bare,
-                        GPR32, i32> {
-  let Inst{20-16} = {Imm{2}, Imm{1}, Imm{0}, 0b1, 0b0};
-}
-def SMOVxb : NeonI_SMOV<"smov", "b", 0b1, v16i8, i8, neon_uimm4_bare,
-                        GPR64, i64> {
-  let Inst{20-16} = {Imm{3}, Imm{2}, Imm{1}, Imm{0}, 0b1};
-}
-def SMOVxh : NeonI_SMOV<"smov", "h", 0b1, v8i16, i16, neon_uimm3_bare,
-                        GPR64, i64> {
-  let Inst{20-16} = {Imm{2}, Imm{1}, Imm{0}, 0b1, 0b0};
-}
-def SMOVxs : NeonI_SMOV<"smov", "s", 0b1, v4i32, i32, neon_uimm2_bare,
-                        GPR64, i64> {
-  let Inst{20-16} = {Imm{1}, Imm{0}, 0b1, 0b0, 0b0};
-}
-
-multiclass Neon_SMOVx_pattern <ValueType StTy, ValueType NaTy,
-                               ValueType eleTy, Operand StImm,  Operand NaImm,
-                               Instruction SMOVI> {
-  def : Pat<(i64 (sext_inreg
-              (i64 (anyext
-                (i32 (vector_extract
-                  (StTy VPR128:$Rn), (StImm:$Imm))))),
-              eleTy)),
-            (SMOVI VPR128:$Rn, StImm:$Imm)>;
-
-  def : Pat<(i64 (sext
-              (i32 (vector_extract
-                (StTy VPR128:$Rn), (StImm:$Imm))))),
-            (SMOVI VPR128:$Rn, StImm:$Imm)>;
-
-  def : Pat<(i64 (sext_inreg
-              (i64 (vector_extract
-                (NaTy VPR64:$Rn), (NaImm:$Imm))),
-              eleTy)),
-            (SMOVI (StTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)),
-              NaImm:$Imm)>;
-
-  def : Pat<(i64 (sext_inreg
-              (i64 (anyext
-                (i32 (vector_extract
-                  (NaTy VPR64:$Rn), (NaImm:$Imm))))),
-              eleTy)),
-            (SMOVI (StTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)),
-              NaImm:$Imm)>;
-
-  def : Pat<(i64 (sext
-              (i32 (vector_extract
-                (NaTy VPR64:$Rn), (NaImm:$Imm))))),
-            (SMOVI (StTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)),
-              NaImm:$Imm)>;
-}
-
-defm : Neon_SMOVx_pattern<v16i8, v8i8, i8, neon_uimm4_bare,
-                          neon_uimm3_bare, SMOVxb>;
-defm : Neon_SMOVx_pattern<v8i16, v4i16, i16, neon_uimm3_bare,
-                          neon_uimm2_bare, SMOVxh>;
-defm : Neon_SMOVx_pattern<v4i32, v2i32, i32, neon_uimm2_bare,
-                          neon_uimm1_bare, SMOVxs>;
-
-class Neon_SMOVw_pattern <ValueType StTy, ValueType NaTy,
-                          ValueType eleTy, Operand StImm,  Operand NaImm,
-                          Instruction SMOVI>
-  : Pat<(i32 (sext_inreg
-          (i32 (vector_extract
-            (NaTy VPR64:$Rn), (NaImm:$Imm))),
-          eleTy)),
-        (SMOVI (StTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)),
-          NaImm:$Imm)>;
-
-def : Neon_SMOVw_pattern<v16i8, v8i8, i8, neon_uimm4_bare,
-                         neon_uimm3_bare, SMOVwb>;
-def : Neon_SMOVw_pattern<v8i16, v4i16, i16, neon_uimm3_bare,
-                         neon_uimm2_bare, SMOVwh>;
-
-class NeonI_UMOV<string asmop, string Res, bit Q,
-                 ValueType OpTy, Operand OpImm,
-                 RegisterClass ResGPR, ValueType ResTy>
-  : NeonI_copy<Q, 0b0, 0b0111,
-               (outs ResGPR:$Rd), (ins VPR128:$Rn, OpImm:$Imm),
-               asmop # "\t$Rd, $Rn." # Res # "[$Imm]",
-               [(set (ResTy ResGPR:$Rd),
-                  (ResTy (vector_extract
-                    (OpTy VPR128:$Rn), (OpImm:$Imm))))],
-               NoItinerary>,
-    Sched<[WriteFPALU, ReadFPALU]> {
-  bits<4> Imm;
-}
-
-//Unsigned integer move (main, from element)
-def UMOVwb : NeonI_UMOV<"umov", "b", 0b0, v16i8, neon_uimm4_bare,
-                         GPR32, i32> {
-  let Inst{20-16} = {Imm{3}, Imm{2}, Imm{1}, Imm{0}, 0b1};
-}
-def UMOVwh : NeonI_UMOV<"umov", "h", 0b0, v8i16, neon_uimm3_bare,
-                         GPR32, i32> {
-  let Inst{20-16} = {Imm{2}, Imm{1}, Imm{0}, 0b1, 0b0};
-}
-def UMOVws : NeonI_UMOV<"umov", "s", 0b0, v4i32, neon_uimm2_bare,
-                         GPR32, i32> {
-  let Inst{20-16} = {Imm{1}, Imm{0}, 0b1, 0b0, 0b0};
-}
-def UMOVxd : NeonI_UMOV<"umov", "d", 0b1, v2i64, neon_uimm1_bare,
-                         GPR64, i64> {
-  let Inst{20-16} = {Imm, 0b1, 0b0, 0b0, 0b0};
-}
-
-def : NeonInstAlias<"mov $Rd, $Rn.s[$Imm]",
-                    (UMOVws GPR32:$Rd, VPR128:$Rn, neon_uimm2_bare:$Imm), 0>;
-def : NeonInstAlias<"mov $Rd, $Rn.d[$Imm]",
-                    (UMOVxd GPR64:$Rd, VPR128:$Rn, neon_uimm1_bare:$Imm), 0>;
-
-class Neon_UMOV_pattern <ValueType StTy, ValueType NaTy, ValueType ResTy,
-                         Operand StImm,  Operand NaImm,
-                         Instruction SMOVI>
-  : Pat<(ResTy (vector_extract
-          (NaTy VPR64:$Rn), NaImm:$Imm)),
-        (SMOVI (StTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)),
-          NaImm:$Imm)>;
-
-def : Neon_UMOV_pattern<v16i8, v8i8, i32, neon_uimm4_bare,
-                        neon_uimm3_bare, UMOVwb>;
-def : Neon_UMOV_pattern<v8i16, v4i16, i32, neon_uimm3_bare,
-                        neon_uimm2_bare, UMOVwh>;
-def : Neon_UMOV_pattern<v4i32, v2i32, i32, neon_uimm2_bare,
-                        neon_uimm1_bare, UMOVws>;
-
-def : Pat<(i32 (and
-            (i32 (vector_extract
-              (v16i8 VPR128:$Rn), (neon_uimm4_bare:$Imm))),
-            255)),
-          (UMOVwb VPR128:$Rn, neon_uimm4_bare:$Imm)>;
-
-def : Pat<(i32 (and
-            (i32 (vector_extract
-              (v8i16 VPR128:$Rn), (neon_uimm3_bare:$Imm))),
-            65535)),
-          (UMOVwh VPR128:$Rn, neon_uimm3_bare:$Imm)>;
-
-def : Pat<(i64 (zext
-            (i32 (vector_extract
-              (v2i64 VPR128:$Rn), (neon_uimm1_bare:$Imm))))),
-          (UMOVxd VPR128:$Rn, neon_uimm1_bare:$Imm)>;
-
-def : Pat<(i32 (and
-            (i32 (vector_extract
-              (v8i8 VPR64:$Rn), (neon_uimm3_bare:$Imm))),
-            255)),
-          (UMOVwb (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64),
-            neon_uimm3_bare:$Imm)>;
-
-def : Pat<(i32 (and
-            (i32 (vector_extract
-              (v4i16 VPR64:$Rn), (neon_uimm2_bare:$Imm))),
-            65535)),
-          (UMOVwh (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64),
-            neon_uimm2_bare:$Imm)>;
-
-def : Pat<(i64 (zext
-            (i32 (vector_extract
-              (v1i64 VPR64:$Rn), (neon_uimm0_bare:$Imm))))),
-          (UMOVxd (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64),
-            neon_uimm0_bare:$Imm)>;
-
-// Additional copy patterns for scalar types
-def : Pat<(i32 (vector_extract (v1i8 FPR8:$Rn), (i64 0))),
-          (UMOVwb (v16i8
-            (SUBREG_TO_REG (i64 0), FPR8:$Rn, sub_8)), (i64 0))>;
-
-def : Pat<(i32 (vector_extract (v1i16 FPR16:$Rn), (i64 0))),
-          (UMOVwh (v8i16
-            (SUBREG_TO_REG (i64 0), FPR16:$Rn, sub_16)), (i64 0))>;
-
-def : Pat<(i32 (vector_extract (v1i32 FPR32:$Rn), (i64 0))),
-          (FMOVws FPR32:$Rn)>;
-
-def : Pat<(i64 (vector_extract (v1i64 FPR64:$Rn), (i64 0))),
-          (FMOVxd FPR64:$Rn)>;
-
-def : Pat<(f64 (vector_extract (v1f64 FPR64:$Rn), (i64 0))),
-          (f64 FPR64:$Rn)>;
-
-def : Pat<(v1i8 (scalar_to_vector GPR32:$Rn)),
-          (v1i8 (EXTRACT_SUBREG (v16i8
-            (INSbw (v16i8 (IMPLICIT_DEF)), $Rn, (i64 0))),
-            sub_8))>;
-
-def : Pat<(v1i16 (scalar_to_vector GPR32:$Rn)),
-          (v1i16 (EXTRACT_SUBREG (v8i16
-            (INShw (v8i16 (IMPLICIT_DEF)), $Rn, (i64 0))),
-            sub_16))>;
-
-def : Pat<(v1i32 (scalar_to_vector GPR32:$src)),
-          (FMOVsw $src)>;
-
-def : Pat<(v1i64 (scalar_to_vector GPR64:$src)),
-          (FMOVdx $src)>;
-
-def : Pat<(v8i8 (scalar_to_vector GPR32:$Rn)),
-          (v8i8 (EXTRACT_SUBREG (v16i8
-            (INSbw (v16i8 (IMPLICIT_DEF)), $Rn, (i64 0))),
-            sub_64))>;
-
-def : Pat<(v4i16 (scalar_to_vector GPR32:$Rn)),
-          (v4i16 (EXTRACT_SUBREG (v8i16
-            (INShw (v8i16 (IMPLICIT_DEF)), $Rn, (i64 0))),
-            sub_64))>;
-
-def : Pat<(v2i32 (scalar_to_vector GPR32:$Rn)),
-          (v2i32 (EXTRACT_SUBREG (v16i8
-            (INSsw (v4i32 (IMPLICIT_DEF)), $Rn, (i64 0))),
-            sub_64))>;
-
-def : Pat<(v16i8 (scalar_to_vector GPR32:$Rn)),
-          (INSbw (v16i8 (IMPLICIT_DEF)), $Rn, (i64 0))>;
-
-def : Pat<(v8i16 (scalar_to_vector GPR32:$Rn)),
-          (INShw (v8i16 (IMPLICIT_DEF)), $Rn, (i64 0))>;
-
-def : Pat<(v4i32 (scalar_to_vector GPR32:$Rn)),
-          (INSsw (v4i32 (IMPLICIT_DEF)), $Rn, (i64 0))>;
-
-def : Pat<(v2i64 (scalar_to_vector GPR64:$Rn)),
-          (INSdx (v2i64 (IMPLICIT_DEF)), $Rn, (i64 0))>;
-
-def : Pat<(v2f32 (scalar_to_vector (f32 FPR32:$Rn))),
-          (SUBREG_TO_REG (i64 0), FPR32:$Rn, sub_32)>;
-def : Pat<(v4f32 (scalar_to_vector (f32 FPR32:$Rn))),
-          (SUBREG_TO_REG (i64 0), FPR32:$Rn, sub_32)>;
-
-def : Pat<(v1f64 (scalar_to_vector (f64 FPR64:$Rn))),
-          (v1f64 FPR64:$Rn)>;
-
-def : Pat<(v2f64 (scalar_to_vector (f64 FPR64:$src))),
-          (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)),
-                         (f64 FPR64:$src), sub_64)>;
-
-class NeonI_DUP_Elt<bit Q, string asmop, string rdlane,  string rnlane,
-                    RegisterOperand ResVPR, Operand OpImm>
-  : NeonI_copy<Q, 0b0, 0b0000, (outs ResVPR:$Rd),
-               (ins VPR128:$Rn, OpImm:$Imm),
-               asmop # "\t$Rd" # rdlane # ", $Rn" # rnlane # "[$Imm]",
-               [],
-               NoItinerary>,
-    Sched<[WriteFPALU, ReadFPALU]> {
-  bits<4> Imm;
-}
-
-def DUPELT16b : NeonI_DUP_Elt<0b1, "dup", ".16b", ".b", VPR128,
-                              neon_uimm4_bare> {
-  let Inst{20-16} = {Imm{3}, Imm{2}, Imm{1}, Imm{0}, 0b1};
-}
-
-def DUPELT8h : NeonI_DUP_Elt<0b1, "dup", ".8h", ".h", VPR128,
-                              neon_uimm3_bare> {
-  let Inst{20-16} = {Imm{2}, Imm{1}, Imm{0}, 0b1, 0b0};
-}
-
-def DUPELT4s : NeonI_DUP_Elt<0b1, "dup", ".4s", ".s", VPR128,
-                              neon_uimm2_bare> {
-  let Inst{20-16} = {Imm{1}, Imm{0}, 0b1, 0b0, 0b0};
-}
-
-def DUPELT2d : NeonI_DUP_Elt<0b1, "dup", ".2d", ".d", VPR128,
-                              neon_uimm1_bare> {
-  let Inst{20-16} = {Imm, 0b1, 0b0, 0b0, 0b0};
-}
-
-def DUPELT8b : NeonI_DUP_Elt<0b0, "dup", ".8b", ".b", VPR64,
-                              neon_uimm4_bare> {
-  let Inst{20-16} = {Imm{3}, Imm{2}, Imm{1}, Imm{0}, 0b1};
-}
-
-def DUPELT4h : NeonI_DUP_Elt<0b0, "dup", ".4h", ".h", VPR64,
-                              neon_uimm3_bare> {
-  let Inst{20-16} = {Imm{2}, Imm{1}, Imm{0}, 0b1, 0b0};
-}
-
-def DUPELT2s : NeonI_DUP_Elt<0b0, "dup", ".2s", ".s", VPR64,
-                              neon_uimm2_bare> {
-  let Inst{20-16} = {Imm{1}, Imm{0}, 0b1, 0b0, 0b0};
-}
-
-multiclass NeonI_DUP_Elt_pattern<Instruction DUPELT, ValueType ResTy,
-                                       ValueType OpTy,ValueType NaTy,
-                                       ValueType ExTy, Operand OpLImm,
-                                       Operand OpNImm> {
-def  : Pat<(ResTy (Neon_vduplane (OpTy VPR128:$Rn), OpLImm:$Imm)),
-        (ResTy (DUPELT (OpTy VPR128:$Rn), OpLImm:$Imm))>;
-
-def : Pat<(ResTy (Neon_vduplane
-            (NaTy VPR64:$Rn), OpNImm:$Imm)),
-          (ResTy (DUPELT
-            (ExTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), OpNImm:$Imm))>;
-}
-defm : NeonI_DUP_Elt_pattern<DUPELT16b, v16i8, v16i8, v8i8, v16i8,
-                             neon_uimm4_bare, neon_uimm3_bare>;
-defm : NeonI_DUP_Elt_pattern<DUPELT8b, v8i8, v16i8, v8i8, v16i8,
-                             neon_uimm4_bare, neon_uimm3_bare>;
-defm : NeonI_DUP_Elt_pattern<DUPELT8h, v8i16, v8i16, v4i16, v8i16,
-                             neon_uimm3_bare, neon_uimm2_bare>;
-defm : NeonI_DUP_Elt_pattern<DUPELT4h, v4i16, v8i16, v4i16, v8i16,
-                             neon_uimm3_bare, neon_uimm2_bare>;
-defm : NeonI_DUP_Elt_pattern<DUPELT4s, v4i32, v4i32, v2i32, v4i32,
-                             neon_uimm2_bare, neon_uimm1_bare>;
-defm : NeonI_DUP_Elt_pattern<DUPELT2s, v2i32, v4i32, v2i32, v4i32,
-                             neon_uimm2_bare, neon_uimm1_bare>;
-defm : NeonI_DUP_Elt_pattern<DUPELT2d, v2i64, v2i64, v1i64, v2i64,
-                             neon_uimm1_bare, neon_uimm0_bare>;
-defm : NeonI_DUP_Elt_pattern<DUPELT4s, v4f32, v4f32, v2f32, v4f32,
-                             neon_uimm2_bare, neon_uimm1_bare>;
-defm : NeonI_DUP_Elt_pattern<DUPELT2s, v2f32, v4f32, v2f32, v4f32,
-                             neon_uimm2_bare, neon_uimm1_bare>;
-defm : NeonI_DUP_Elt_pattern<DUPELT2d, v2f64, v2f64, v1f64, v2f64,
-                             neon_uimm1_bare, neon_uimm0_bare>;
-
-def : Pat<(v2f32 (Neon_vdup (f32 FPR32:$Rn))),
-          (v2f32 (DUPELT2s
-            (SUBREG_TO_REG (i64 0), FPR32:$Rn, sub_32),
-            (i64 0)))>;
-def : Pat<(v4f32 (Neon_vdup (f32 FPR32:$Rn))),
-          (v4f32 (DUPELT4s
-            (SUBREG_TO_REG (i64 0), FPR32:$Rn, sub_32),
-            (i64 0)))>;
-def : Pat<(v2f64 (Neon_vdup (f64 FPR64:$Rn))),
-          (v2f64 (DUPELT2d
-            (SUBREG_TO_REG (i64 0), FPR64:$Rn, sub_64),
-            (i64 0)))>;
-
-multiclass NeonI_DUP_pattern<Instruction DUPELT, ValueType ResTy,
-                             ValueType OpTy, RegisterClass OpRC,
-                             Operand OpNImm, SubRegIndex SubIndex> {
-def : Pat<(ResTy (Neon_vduplane (OpTy OpRC:$Rn), OpNImm:$Imm)),
-          (ResTy (DUPELT
-            (SUBREG_TO_REG (i64 0), OpRC:$Rn, SubIndex), OpNImm:$Imm))>;
-}
-
-defm : NeonI_DUP_pattern<DUPELT4h, v4i16, v1i16, FPR16, neon_uimm2_bare,sub_16>;
-defm : NeonI_DUP_pattern<DUPELT4s, v4i32, v1i32, FPR32, neon_uimm2_bare,sub_32>;
-defm : NeonI_DUP_pattern<DUPELT8b, v8i8, v1i8, FPR8, neon_uimm3_bare, sub_8>;
-defm : NeonI_DUP_pattern<DUPELT8h, v8i16, v1i16, FPR16, neon_uimm3_bare,sub_16>;
-defm : NeonI_DUP_pattern<DUPELT16b, v16i8, v1i8, FPR8, neon_uimm4_bare, sub_8>;
-
-class NeonI_DUP<bit Q, string asmop, string rdlane,
-                RegisterOperand ResVPR, ValueType ResTy,
-                RegisterClass OpGPR, ValueType OpTy>
-  : NeonI_copy<Q, 0b0, 0b0001, (outs ResVPR:$Rd), (ins OpGPR:$Rn),
-               asmop # "\t$Rd" # rdlane # ", $Rn",
-               [(set (ResTy ResVPR:$Rd),
-                 (ResTy (Neon_vdup (OpTy OpGPR:$Rn))))],
-               NoItinerary>,
-    Sched<[WriteFPALU, ReadFPALU]>;
-
-def DUP16b : NeonI_DUP<0b1, "dup", ".16b", VPR128, v16i8, GPR32, i32> {
-  let Inst{20-16} = 0b00001;
-  // bits 17-20 are unspecified, but should be set to zero.
-}
-
-def DUP8h : NeonI_DUP<0b1, "dup", ".8h", VPR128, v8i16, GPR32, i32> {
-  let Inst{20-16} = 0b00010;
-  // bits 18-20 are unspecified, but should be set to zero.
-}
-
-def DUP4s : NeonI_DUP<0b1, "dup", ".4s", VPR128, v4i32, GPR32, i32> {
-  let Inst{20-16} = 0b00100;
-  // bits 19-20 are unspecified, but should be set to zero.
-}
-
-def DUP2d : NeonI_DUP<0b1, "dup", ".2d", VPR128, v2i64, GPR64, i64> {
-  let Inst{20-16} = 0b01000;
-  // bit 20 is unspecified, but should be set to zero.
-}
-
-def DUP8b : NeonI_DUP<0b0, "dup", ".8b", VPR64, v8i8, GPR32, i32> {
-  let Inst{20-16} = 0b00001;
-  // bits 17-20 are unspecified, but should be set to zero.
-}
-
-def DUP4h : NeonI_DUP<0b0, "dup", ".4h", VPR64, v4i16, GPR32, i32> {
-  let Inst{20-16} = 0b00010;
-  // bits 18-20 are unspecified, but should be set to zero.
-}
-
-def DUP2s : NeonI_DUP<0b0, "dup", ".2s", VPR64, v2i32, GPR32, i32> {
-  let Inst{20-16} = 0b00100;
-  // bits 19-20 are unspecified, but should be set to zero.
-}
-
-// patterns for CONCAT_VECTORS
-multiclass Concat_Vector_Pattern<ValueType ResTy, ValueType OpTy> {
-def : Pat<(ResTy (concat_vectors (OpTy VPR64:$Rn), undef)),
-          (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)>;
-def : Pat<(ResTy (concat_vectors (OpTy VPR64:$Rn), (OpTy VPR64:$Rm))),
-          (INSELd
-            (v2i64 (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)),
-            (v2i64 (SUBREG_TO_REG (i64 0), VPR64:$Rm, sub_64)),
-            (i64 1),
-            (i64 0))>;
-def : Pat<(ResTy (concat_vectors (OpTy VPR64:$Rn), (OpTy VPR64:$Rn))),
-          (DUPELT2d
-            (v2i64 (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)),
-            (i64 0))> ;
-}
-
-defm : Concat_Vector_Pattern<v16i8, v8i8>;
-defm : Concat_Vector_Pattern<v8i16, v4i16>;
-defm : Concat_Vector_Pattern<v4i32, v2i32>;
-defm : Concat_Vector_Pattern<v2i64, v1i64>;
-defm : Concat_Vector_Pattern<v4f32, v2f32>;
-defm : Concat_Vector_Pattern<v2f64, v1f64>;
-
-def : Pat<(v2i32 (concat_vectors (v1i32 FPR32:$Rn), undef)),
-          (v2i32 (SUBREG_TO_REG(i64 0), $Rn, sub_32))>;
-def : Pat<(v2i32 (concat_vectors (v1i32 FPR32:$Rn), (v1i32 FPR32:$Rm))),
-          (EXTRACT_SUBREG 
-            (v4i32 (INSELs
-              (v4i32 (SUBREG_TO_REG (i64 0), FPR32:$Rn, sub_32)),
-              (v4i32 (SUBREG_TO_REG (i64 0), FPR32:$Rm, sub_32)),
-              (i64 1),
-              (i64 0))),
-            sub_64)>;
-def : Pat<(v2i32 (concat_vectors (v1i32 FPR32:$Rn), (v1i32 FPR32:$Rn))),
-          (DUPELT2s (v4i32 (SUBREG_TO_REG(i64 0), $Rn, sub_32)), 0)>;
-
-//patterns for EXTRACT_SUBVECTOR
-def : Pat<(v8i8 (extract_subvector (v16i8 VPR128:$Rn), (i64 0))),
-          (v8i8 (EXTRACT_SUBREG VPR128:$Rn, sub_64))>;
-def : Pat<(v4i16 (extract_subvector (v8i16 VPR128:$Rn), (i64 0))),
-          (v4i16 (EXTRACT_SUBREG VPR128:$Rn, sub_64))>;
-def : Pat<(v2i32 (extract_subvector (v4i32 VPR128:$Rn), (i64 0))),
-          (v2i32 (EXTRACT_SUBREG VPR128:$Rn, sub_64))>;
-def : Pat<(v1i64 (extract_subvector (v2i64 VPR128:$Rn), (i64 0))),
-          (v1i64 (EXTRACT_SUBREG VPR128:$Rn, sub_64))>;
-def : Pat<(v2f32 (extract_subvector (v4f32 VPR128:$Rn), (i64 0))),
-          (v2f32 (EXTRACT_SUBREG VPR128:$Rn, sub_64))>;
-def : Pat<(v1f64 (extract_subvector (v2f64 VPR128:$Rn), (i64 0))),
-          (v1f64 (EXTRACT_SUBREG VPR128:$Rn, sub_64))>;
-
-// The followings are for instruction class (3V Elem)
-
-// Variant 1
-
-class NI_2VE<bit q, bit u, bits<2> size, bits<4> opcode,
-             string asmop, string ResS, string OpS, string EleOpS,
-             Operand OpImm, RegisterOperand ResVPR,
-             RegisterOperand OpVPR, RegisterOperand EleOpVPR>
-  : NeonI_2VElem<q, u, size, opcode,
-                 (outs ResVPR:$Rd), (ins ResVPR:$src, OpVPR:$Rn,
-                                         EleOpVPR:$Re, OpImm:$Index),
-                 asmop # "\t$Rd." # ResS # ", $Rn." # OpS #
-                 ", $Re." # EleOpS # "[$Index]",
-                 [],
-                 NoItinerary>,
-    Sched<[WriteFPMAC, ReadFPMAC, ReadFPMAC, ReadFPMAC]> {
-  bits<3> Index;
-  bits<5> Re;
-
-  let Constraints = "$src = $Rd";
-}
-
-multiclass NI_2VE_v1<bit u, bits<4> opcode, string asmop> {
-  // vector register class for element is always 128-bit to cover the max index
-  def _2s4s : NI_2VE<0b0, u, 0b10, opcode, asmop, "2s", "2s", "s",
-                     neon_uimm2_bare, VPR64, VPR64, VPR128> {
-    let Inst{11} = {Index{1}};
-    let Inst{21} = {Index{0}};
-    let Inst{20-16} = Re;
-  }
-
-  def _4s4s : NI_2VE<0b1, u, 0b10, opcode, asmop, "4s", "4s", "s",
-                     neon_uimm2_bare, VPR128, VPR128, VPR128> {
-    let Inst{11} = {Index{1}};
-    let Inst{21} = {Index{0}};
-    let Inst{20-16} = Re;
-  }
-
-  // Index operations on 16-bit(H) elements are restricted to using v0-v15.
-  def _4h8h : NI_2VE<0b0, u, 0b01, opcode, asmop, "4h", "4h", "h",
-                     neon_uimm3_bare, VPR64, VPR64, VPR128Lo> {
-    let Inst{11} = {Index{2}};
-    let Inst{21} = {Index{1}};
-    let Inst{20} = {Index{0}};
-    let Inst{19-16} = Re{3-0};
-  }
-
-  def _8h8h : NI_2VE<0b1, u, 0b01, opcode, asmop, "8h", "8h", "h",
-                     neon_uimm3_bare, VPR128, VPR128, VPR128Lo> {
-    let Inst{11} = {Index{2}};
-    let Inst{21} = {Index{1}};
-    let Inst{20} = {Index{0}};
-    let Inst{19-16} = Re{3-0};
-  }
-}
-
-defm MLAvve : NI_2VE_v1<0b1, 0b0000, "mla">;
-defm MLSvve : NI_2VE_v1<0b1, 0b0100, "mls">;
-
-// Pattern for lane in 128-bit vector
-class NI_2VE_laneq<Instruction INST, Operand OpImm, SDPatternOperator op,
-                   RegisterOperand ResVPR, RegisterOperand OpVPR,
-                   RegisterOperand EleOpVPR, ValueType ResTy, ValueType OpTy,
-                   ValueType EleOpTy>
-  : Pat<(ResTy (op (ResTy ResVPR:$src), (OpTy OpVPR:$Rn),
-          (OpTy (Neon_vduplane (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))),
-        (INST ResVPR:$src, OpVPR:$Rn, EleOpVPR:$Re, OpImm:$Index)>;
-
-// Pattern for lane in 64-bit vector
-class NI_2VE_lane<Instruction INST, Operand OpImm, SDPatternOperator op,
-                  RegisterOperand ResVPR, RegisterOperand OpVPR,
-                  RegisterOperand EleOpVPR, ValueType ResTy, ValueType OpTy,
-                  ValueType EleOpTy>
-  : Pat<(ResTy (op (ResTy ResVPR:$src), (OpTy OpVPR:$Rn),
-          (OpTy (Neon_vduplane (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))),
-        (INST ResVPR:$src, OpVPR:$Rn,
-          (SUBREG_TO_REG (i64 0), EleOpVPR:$Re, sub_64), OpImm:$Index)>;
-
-multiclass NI_2VE_v1_pat<string subop, SDPatternOperator op>
-{
-  def : NI_2VE_laneq<!cast<Instruction>(subop # "_2s4s"), neon_uimm2_bare,
-                     op, VPR64, VPR64, VPR128, v2i32, v2i32, v4i32>;
-
-  def : NI_2VE_laneq<!cast<Instruction>(subop # "_4s4s"), neon_uimm2_bare,
-                     op, VPR128, VPR128, VPR128, v4i32, v4i32, v4i32>;
-
-  def : NI_2VE_laneq<!cast<Instruction>(subop # "_4h8h"), neon_uimm3_bare,
-                     op, VPR64, VPR64, VPR128Lo, v4i16, v4i16, v8i16>;
-
-  def : NI_2VE_laneq<!cast<Instruction>(subop # "_8h8h"), neon_uimm3_bare,
-                     op, VPR128, VPR128, VPR128Lo, v8i16, v8i16, v8i16>;
-
-  // Index can only be half of the max value for lane in 64-bit vector
-
-  def : NI_2VE_lane<!cast<Instruction>(subop # "_2s4s"), neon_uimm1_bare,
-                    op, VPR64, VPR64, VPR64, v2i32, v2i32, v2i32>;
-
-  def : NI_2VE_lane<!cast<Instruction>(subop # "_4h8h"), neon_uimm2_bare,
-                    op, VPR64, VPR64, VPR64Lo, v4i16, v4i16, v4i16>;
-}
-
-defm MLA_lane_v1 : NI_2VE_v1_pat<"MLAvve", Neon_mla>;
-defm MLS_lane_v1 : NI_2VE_v1_pat<"MLSvve", Neon_mls>;
-
-class NI_2VE_2op<bit q, bit u, bits<2> size, bits<4> opcode,
-                 string asmop, string ResS, string OpS, string EleOpS,
-                 Operand OpImm, RegisterOperand ResVPR,
-                 RegisterOperand OpVPR, RegisterOperand EleOpVPR>
-  : NeonI_2VElem<q, u, size, opcode,
-                 (outs ResVPR:$Rd), (ins OpVPR:$Rn,
-                                         EleOpVPR:$Re, OpImm:$Index),
-                 asmop # "\t$Rd." # ResS # ", $Rn." # OpS #
-                 ", $Re." # EleOpS # "[$Index]",
-                 [],
-                 NoItinerary>,
-    Sched<[WriteFPALU, ReadFPALU, ReadFPALU]> {
-  bits<3> Index;
-  bits<5> Re;
-}
-
-multiclass NI_2VE_v1_2op<bit u, bits<4> opcode, string asmop> {
-  // vector register class for element is always 128-bit to cover the max index
-  def _2s4s : NI_2VE_2op<0b0, u, 0b10, opcode, asmop, "2s", "2s", "s",
-                         neon_uimm2_bare, VPR64, VPR64, VPR128> {
-    let Inst{11} = {Index{1}};
-    let Inst{21} = {Index{0}};
-    let Inst{20-16} = Re;
-  }
-
-  def _4s4s : NI_2VE_2op<0b1, u, 0b10, opcode, asmop, "4s", "4s", "s",
-                         neon_uimm2_bare, VPR128, VPR128, VPR128> {
-    let Inst{11} = {Index{1}};
-    let Inst{21} = {Index{0}};
-    let Inst{20-16} = Re;
-  }
-
-  // Index operations on 16-bit(H) elements are restricted to using v0-v15.
-  def _4h8h : NI_2VE_2op<0b0, u, 0b01, opcode, asmop, "4h", "4h", "h",
-                         neon_uimm3_bare, VPR64, VPR64, VPR128Lo> {
-    let Inst{11} = {Index{2}};
-    let Inst{21} = {Index{1}};
-    let Inst{20} = {Index{0}};
-    let Inst{19-16} = Re{3-0};
-  }
-
-  def _8h8h : NI_2VE_2op<0b1, u, 0b01, opcode, asmop, "8h", "8h", "h",
-                         neon_uimm3_bare, VPR128, VPR128, VPR128Lo> {
-    let Inst{11} = {Index{2}};
-    let Inst{21} = {Index{1}};
-    let Inst{20} = {Index{0}};
-    let Inst{19-16} = Re{3-0};
-  }
-}
-
-let SchedRW = [WriteFPMul, ReadFPMul, ReadFPMul] in {
-defm MULve : NI_2VE_v1_2op<0b0, 0b1000, "mul">;
-defm SQDMULHve : NI_2VE_v1_2op<0b0, 0b1100, "sqdmulh">;
-defm SQRDMULHve : NI_2VE_v1_2op<0b0, 0b1101, "sqrdmulh">;
-}
-
-// Pattern for lane in 128-bit vector
-class NI_2VE_mul_laneq<Instruction INST, Operand OpImm, SDPatternOperator op,
-                       RegisterOperand OpVPR, RegisterOperand EleOpVPR,
-                       ValueType ResTy, ValueType OpTy, ValueType EleOpTy>
-  : Pat<(ResTy (op (OpTy OpVPR:$Rn),
-          (OpTy (Neon_vduplane (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))),
-        (INST OpVPR:$Rn, EleOpVPR:$Re, OpImm:$Index)>;
-
-// Pattern for lane in 64-bit vector
-class NI_2VE_mul_lane<Instruction INST, Operand OpImm, SDPatternOperator op,
-                      RegisterOperand OpVPR, RegisterOperand EleOpVPR,
-                      ValueType ResTy, ValueType OpTy, ValueType EleOpTy>
-  : Pat<(ResTy (op (OpTy OpVPR:$Rn),
-          (OpTy (Neon_vduplane (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))),
-        (INST OpVPR:$Rn,
-          (SUBREG_TO_REG (i64 0), EleOpVPR:$Re, sub_64), OpImm:$Index)>;
-
-multiclass NI_2VE_mul_v1_pat<string subop, SDPatternOperator op> {
-  def : NI_2VE_mul_laneq<!cast<Instruction>(subop # "_2s4s"), neon_uimm2_bare,
-                         op, VPR64, VPR128, v2i32, v2i32, v4i32>;
-
-  def : NI_2VE_mul_laneq<!cast<Instruction>(subop # "_4s4s"), neon_uimm2_bare,
-                         op, VPR128, VPR128, v4i32, v4i32, v4i32>;
-
-  def : NI_2VE_mul_laneq<!cast<Instruction>(subop # "_4h8h"), neon_uimm3_bare,
-                         op, VPR64, VPR128Lo, v4i16, v4i16, v8i16>;
-
-  def : NI_2VE_mul_laneq<!cast<Instruction>(subop # "_8h8h"), neon_uimm3_bare,
-                         op, VPR128, VPR128Lo, v8i16, v8i16, v8i16>;
-
-  // Index can only be half of the max value for lane in 64-bit vector
-
-  def : NI_2VE_mul_lane<!cast<Instruction>(subop # "_2s4s"), neon_uimm1_bare,
-                        op, VPR64, VPR64, v2i32, v2i32, v2i32>;
-
-  def : NI_2VE_mul_lane<!cast<Instruction>(subop # "_4h8h"), neon_uimm2_bare,
-                        op, VPR64, VPR64Lo, v4i16, v4i16, v4i16>;
-}
-
-defm MUL_lane_v1 : NI_2VE_mul_v1_pat<"MULve", mul>;
-defm SQDMULH_lane_v1 : NI_2VE_mul_v1_pat<"SQDMULHve", int_arm_neon_vqdmulh>;
-defm SQRDMULH_lane_v1 : NI_2VE_mul_v1_pat<"SQRDMULHve", int_arm_neon_vqrdmulh>;
-
-// Variant 2
-
-multiclass NI_2VE_v2_2op<bit u, bits<4> opcode, string asmop> {
-  // vector register class for element is always 128-bit to cover the max index
-  def _2s4s : NI_2VE_2op<0b0, u, 0b10, opcode, asmop, "2s", "2s", "s",
-                         neon_uimm2_bare, VPR64, VPR64, VPR128> {
-    let Inst{11} = {Index{1}};
-    let Inst{21} = {Index{0}};
-    let Inst{20-16} = Re;
-  }
-
-  def _4s4s : NI_2VE_2op<0b1, u, 0b10, opcode, asmop, "4s", "4s", "s",
-                         neon_uimm2_bare, VPR128, VPR128, VPR128> {
-    let Inst{11} = {Index{1}};
-    let Inst{21} = {Index{0}};
-    let Inst{20-16} = Re;
-  }
-
-  // _1d2d doesn't exist!
-
-  def _2d2d : NI_2VE_2op<0b1, u, 0b11, opcode, asmop, "2d", "2d", "d",
-                         neon_uimm1_bare, VPR128, VPR128, VPR128> {
-    let Inst{11} = {Index{0}};
-    let Inst{21} = 0b0;
-    let Inst{20-16} = Re;
-  }
-}
-
-let SchedRW = [WriteFPMul, ReadFPMul, ReadFPMul] in {
-defm FMULve : NI_2VE_v2_2op<0b0, 0b1001, "fmul">;
-defm FMULXve : NI_2VE_v2_2op<0b1, 0b1001, "fmulx">;
-}
-
-class NI_2VE_mul_lane_2d<Instruction INST, Operand OpImm, SDPatternOperator op,
-                         RegisterOperand OpVPR, RegisterOperand EleOpVPR,
-                         ValueType ResTy, ValueType OpTy, ValueType EleOpTy,
-                         SDPatternOperator coreop>
-  : Pat<(ResTy (op (OpTy OpVPR:$Rn),
-          (OpTy (coreop (EleOpTy EleOpVPR:$Re), (EleOpTy EleOpVPR:$Re))))),
-        (INST OpVPR:$Rn,
-          (SUBREG_TO_REG (i64 0), EleOpVPR:$Re, sub_64), 0)>;
-
-multiclass NI_2VE_mul_v2_pat<string subop, SDPatternOperator op> {
-  def : NI_2VE_mul_laneq<!cast<Instruction>(subop # "_2s4s"), neon_uimm2_bare,
-                         op, VPR64, VPR128, v2f32, v2f32, v4f32>;
-
-  def : NI_2VE_mul_laneq<!cast<Instruction>(subop # "_4s4s"), neon_uimm2_bare,
-                         op, VPR128, VPR128, v4f32, v4f32, v4f32>;
-
-  def : NI_2VE_mul_laneq<!cast<Instruction>(subop # "_2d2d"), neon_uimm1_bare,
-                         op, VPR128, VPR128, v2f64, v2f64, v2f64>;
-
-  // Index can only be half of the max value for lane in 64-bit vector
-
-  def : NI_2VE_mul_lane<!cast<Instruction>(subop # "_2s4s"), neon_uimm1_bare,
-                        op, VPR64, VPR64, v2f32, v2f32, v2f32>;
-
-  def : NI_2VE_mul_lane_2d<!cast<Instruction>(subop # "_2d2d"), neon_uimm1_bare,
-                           op, VPR128, VPR64, v2f64, v2f64, v1f64,
-                           BinOpFrag<(Neon_combine_2d node:$LHS, node:$RHS)>>;
-}
-
-defm FMUL_lane_v2 : NI_2VE_mul_v2_pat<"FMULve", fmul>;
-defm FMULX_lane_v2 : NI_2VE_mul_v2_pat<"FMULXve", int_aarch64_neon_vmulx>;
-
-def : Pat<(v2f32 (fmul (v2f32 (Neon_vdup (f32 FPR32:$Re))),
-                       (v2f32 VPR64:$Rn))),
-          (FMULve_2s4s VPR64:$Rn, (SUBREG_TO_REG (i32 0), $Re, sub_32), 0)>;
-
-def : Pat<(v4f32 (fmul (v4f32 (Neon_vdup (f32 FPR32:$Re))),
-                       (v4f32 VPR128:$Rn))),
-          (FMULve_4s4s VPR128:$Rn, (SUBREG_TO_REG (i32 0), $Re, sub_32), 0)>;
-
-def : Pat<(v2f64 (fmul (v2f64 (Neon_vdup (f64 FPR64:$Re))),
-                       (v2f64 VPR128:$Rn))),
-          (FMULve_2d2d VPR128:$Rn, (SUBREG_TO_REG (i64 0), $Re, sub_64), 0)>;
-
-// The followings are patterns using fma
-// -ffp-contract=fast generates fma
-
-multiclass NI_2VE_v2<bit u, bits<4> opcode, string asmop> {
-  // vector register class for element is always 128-bit to cover the max index
-  def _2s4s : NI_2VE<0b0, u, 0b10, opcode, asmop, "2s", "2s", "s",
-                     neon_uimm2_bare, VPR64, VPR64, VPR128> {
-    let Inst{11} = {Index{1}};
-    let Inst{21} = {Index{0}};
-    let Inst{20-16} = Re;
-  }
-
-  def _4s4s : NI_2VE<0b1, u, 0b10, opcode, asmop, "4s", "4s", "s",
-                     neon_uimm2_bare, VPR128, VPR128, VPR128> {
-    let Inst{11} = {Index{1}};
-    let Inst{21} = {Index{0}};
-    let Inst{20-16} = Re;
-  }
-
-  // _1d2d doesn't exist!
-
-  def _2d2d : NI_2VE<0b1, u, 0b11, opcode, asmop, "2d", "2d", "d",
-                     neon_uimm1_bare, VPR128, VPR128, VPR128> {
-    let Inst{11} = {Index{0}};
-    let Inst{21} = 0b0;
-    let Inst{20-16} = Re;
-  }
-}
-
-defm FMLAvve : NI_2VE_v2<0b0, 0b0001, "fmla">;
-defm FMLSvve : NI_2VE_v2<0b0, 0b0101, "fmls">;
-
-// Pattern for lane in 128-bit vector
-class NI_2VEswap_laneq<Instruction INST, Operand OpImm, SDPatternOperator op,
-                       RegisterOperand ResVPR, RegisterOperand OpVPR,
-                       ValueType ResTy, ValueType OpTy,
-                       SDPatternOperator coreop>
-  : Pat<(ResTy (op (ResTy (coreop (OpTy OpVPR:$Re), (i64 OpImm:$Index))),
-                   (ResTy ResVPR:$Rn), (ResTy ResVPR:$src))),
-        (INST ResVPR:$src, ResVPR:$Rn, OpVPR:$Re, OpImm:$Index)>;
-
-// Pattern for lane 0
-class NI_2VEfma_lane0<Instruction INST, SDPatternOperator op,
-                      RegisterOperand ResVPR, ValueType ResTy>
-  : Pat<(ResTy (op (ResTy ResVPR:$Rn),
-                   (ResTy (Neon_vdup (f32 FPR32:$Re))),
-                   (ResTy ResVPR:$src))),
-        (INST ResVPR:$src, ResVPR:$Rn,
-              (SUBREG_TO_REG (i32 0), $Re, sub_32), 0)>;
-
-// Pattern for lane in 64-bit vector
-class NI_2VEswap_lane<Instruction INST, Operand OpImm, SDPatternOperator op,
-                      RegisterOperand ResVPR, RegisterOperand OpVPR,
-                      ValueType ResTy, ValueType OpTy,
-                      SDPatternOperator coreop>
-  : Pat<(ResTy (op (ResTy (coreop (OpTy OpVPR:$Re), (i64 OpImm:$Index))),
-                   (ResTy ResVPR:$Rn), (ResTy ResVPR:$src))),
-        (INST ResVPR:$src, ResVPR:$Rn,
-          (SUBREG_TO_REG (i64 0), OpVPR:$Re, sub_64), OpImm:$Index)>;
-
-// Pattern for lane in 64-bit vector
-class NI_2VEswap_lane_2d2d<Instruction INST, Operand OpImm,
-                           SDPatternOperator op,
-                           RegisterOperand ResVPR, RegisterOperand OpVPR,
-                           ValueType ResTy, ValueType OpTy,
-                           SDPatternOperator coreop>
-  : Pat<(ResTy (op (ResTy (coreop (OpTy OpVPR:$Re), (OpTy OpVPR:$Re))),
-                   (ResTy ResVPR:$Rn), (ResTy ResVPR:$src))),
-        (INST ResVPR:$src, ResVPR:$Rn,
-          (SUBREG_TO_REG (i64 0), OpVPR:$Re, sub_64), 0)>;
-
-
-multiclass NI_2VE_fma_v2_pat<string subop, SDPatternOperator op> {
-  def : NI_2VEswap_laneq<!cast<Instruction>(subop # "_2s4s"),
-                         neon_uimm2_bare, op, VPR64, VPR128, v2f32, v4f32,
-                         BinOpFrag<(Neon_vduplane node:$LHS, node:$RHS)>>;
-
-  def : NI_2VEfma_lane0<!cast<Instruction>(subop # "_2s4s"),
-                        op, VPR64, v2f32>;
-
-  def : NI_2VEswap_laneq<!cast<Instruction>(subop # "_4s4s"),
-                         neon_uimm2_bare, op, VPR128, VPR128, v4f32, v4f32,
-                         BinOpFrag<(Neon_vduplane node:$LHS, node:$RHS)>>;
-
-  def : NI_2VEfma_lane0<!cast<Instruction>(subop # "_4s4s"),
-                        op, VPR128, v4f32>;
-
-  def : NI_2VEswap_laneq<!cast<Instruction>(subop # "_2d2d"),
-                         neon_uimm1_bare, op, VPR128, VPR128, v2f64, v2f64,
-                         BinOpFrag<(Neon_vduplane node:$LHS, node:$RHS)>>;
-
-  // Index can only be half of the max value for lane in 64-bit vector
-
-  def : NI_2VEswap_lane<!cast<Instruction>(subop # "_2s4s"),
-                        neon_uimm1_bare, op, VPR64, VPR64, v2f32, v2f32,
-                        BinOpFrag<(Neon_vduplane node:$LHS, node:$RHS)>>;
-
-  def : NI_2VEswap_lane_2d2d<!cast<Instruction>(subop # "_2d2d"),
-                             neon_uimm1_bare, op, VPR128, VPR64, v2f64, v1f64,
-                             BinOpFrag<(Neon_combine_2d node:$LHS, node:$RHS)>>;
-}
-
-defm FMLA_lane_v2_s : NI_2VE_fma_v2_pat<"FMLAvve", fma>;
-
-// Pattern for lane 0
-class NI_2VEfms_lane0<Instruction INST, SDPatternOperator op,
-                      RegisterOperand ResVPR, ValueType ResTy>
-  : Pat<(ResTy (op (ResTy (fneg ResVPR:$Rn)),
-                   (ResTy (Neon_vdup (f32 FPR32:$Re))),
-                   (ResTy ResVPR:$src))),
-        (INST ResVPR:$src, ResVPR:$Rn,
-              (SUBREG_TO_REG (i32 0), $Re, sub_32), 0)>;
-
-multiclass NI_2VE_fms_v2_pat<string subop, SDPatternOperator op>
-{
-  def : NI_2VEswap_laneq<!cast<Instruction>(subop # "_2s4s"),
-                         neon_uimm2_bare, op, VPR64, VPR128, v2f32, v4f32,
-                         BinOpFrag<(fneg (Neon_vduplane node:$LHS, node:$RHS))>>;
-
-  def : NI_2VEswap_laneq<!cast<Instruction>(subop # "_2s4s"),
-                         neon_uimm2_bare, op, VPR64, VPR128, v2f32, v4f32,
-                         BinOpFrag<(Neon_vduplane
-                                     (fneg node:$LHS), node:$RHS)>>;
-
-  def : NI_2VEfms_lane0<!cast<Instruction>(subop # "_2s4s"),
-                        op, VPR64, v2f32>;
-
-  def : NI_2VEswap_laneq<!cast<Instruction>(subop # "_4s4s"),
-                         neon_uimm2_bare, op, VPR128, VPR128, v4f32, v4f32,
-                         BinOpFrag<(fneg (Neon_vduplane
-                                     node:$LHS, node:$RHS))>>;
-
-  def : NI_2VEswap_laneq<!cast<Instruction>(subop # "_4s4s"),
-                         neon_uimm2_bare, op, VPR128, VPR128, v4f32, v4f32,
-                         BinOpFrag<(Neon_vduplane
-                                     (fneg node:$LHS), node:$RHS)>>;
-
-  def : NI_2VEfms_lane0<!cast<Instruction>(subop # "_4s4s"),
-                        op, VPR128, v4f32>;
-
-  def : NI_2VEswap_laneq<!cast<Instruction>(subop # "_2d2d"),
-                         neon_uimm1_bare, op, VPR128, VPR128, v2f64, v2f64,
-                         BinOpFrag<(fneg (Neon_vduplane
-                                     node:$LHS, node:$RHS))>>;
-
-  def : NI_2VEswap_laneq<!cast<Instruction>(subop # "_2d2d"),
-                         neon_uimm1_bare, op, VPR128, VPR128, v2f64, v2f64,
-                         BinOpFrag<(Neon_vduplane
-                                     (fneg node:$LHS), node:$RHS)>>;
-
-  // Index can only be half of the max value for lane in 64-bit vector
-
-  def : NI_2VEswap_lane<!cast<Instruction>(subop # "_2s4s"),
-                        neon_uimm1_bare, op, VPR64, VPR64, v2f32, v2f32,
-                        BinOpFrag<(fneg (Neon_vduplane
-                                    node:$LHS, node:$RHS))>>;
-
-  def : NI_2VEswap_lane<!cast<Instruction>(subop # "_2s4s"),
-                        neon_uimm1_bare, op, VPR64, VPR64, v2f32, v2f32,
-                        BinOpFrag<(Neon_vduplane
-                                    (fneg node:$LHS), node:$RHS)>>;
-
-  def : NI_2VEswap_lane<!cast<Instruction>(subop # "_4s4s"),
-                        neon_uimm1_bare, op, VPR128, VPR64, v4f32, v2f32,
-                        BinOpFrag<(fneg (Neon_vduplane node:$LHS, node:$RHS))>>;
-
-  def : NI_2VEswap_lane<!cast<Instruction>(subop # "_4s4s"),
-                        neon_uimm1_bare, op, VPR128, VPR64, v4f32, v2f32,
-                        BinOpFrag<(Neon_vduplane (fneg node:$LHS), node:$RHS)>>;
-
-  def : NI_2VEswap_lane_2d2d<!cast<Instruction>(subop # "_2d2d"),
-                             neon_uimm1_bare, op, VPR128, VPR64, v2f64, v1f64,
-                             BinOpFrag<(fneg (Neon_combine_2d
-                                         node:$LHS, node:$RHS))>>;
-
-  def : NI_2VEswap_lane_2d2d<!cast<Instruction>(subop # "_2d2d"),
-                             neon_uimm1_bare, op, VPR128, VPR64, v2f64, v1f64,
-                             BinOpFrag<(Neon_combine_2d
-                                         (fneg node:$LHS), (fneg node:$RHS))>>;
-}
-
-defm FMLS_lane_v2_s : NI_2VE_fms_v2_pat<"FMLSvve", fma>;
-
-// Variant 3: Long type
-// E.g. SMLAL : 4S/4H/H (v0-v15), 2D/2S/S
-//      SMLAL2: 4S/8H/H (v0-v15), 2D/4S/S
-
-multiclass NI_2VE_v3<bit u, bits<4> opcode, string asmop> {
-  // vector register class for element is always 128-bit to cover the max index
-  def _2d2s : NI_2VE<0b0, u, 0b10, opcode, asmop, "2d", "2s", "s",
-                     neon_uimm2_bare, VPR128, VPR64, VPR128> {
-    let Inst{11} = {Index{1}};
-    let Inst{21} = {Index{0}};
-    let Inst{20-16} = Re;
-  }
-
-  def _2d4s : NI_2VE<0b1, u, 0b10, opcode, asmop # "2", "2d", "4s", "s",
-                     neon_uimm2_bare, VPR128, VPR128, VPR128> {
-    let Inst{11} = {Index{1}};
-    let Inst{21} = {Index{0}};
-    let Inst{20-16} = Re;
-  }
-
-  // Index operations on 16-bit(H) elements are restricted to using v0-v15.
-  def _4s8h : NI_2VE<0b1, u, 0b01, opcode, asmop # "2", "4s", "8h", "h",
-                     neon_uimm3_bare, VPR128, VPR128, VPR128Lo> {
-    let Inst{11} = {Index{2}};
-    let Inst{21} = {Index{1}};
-    let Inst{20} = {Index{0}};
-    let Inst{19-16} = Re{3-0};
-  }
-
-  def _4s4h : NI_2VE<0b0, u, 0b01, opcode, asmop, "4s", "4h", "h",
-                     neon_uimm3_bare, VPR128, VPR64, VPR128Lo> {
-    let Inst{11} = {Index{2}};
-    let Inst{21} = {Index{1}};
-    let Inst{20} = {Index{0}};
-    let Inst{19-16} = Re{3-0};
-  }
-}
-
-defm SMLALvve : NI_2VE_v3<0b0, 0b0010, "smlal">;
-defm UMLALvve : NI_2VE_v3<0b1, 0b0010, "umlal">;
-defm SMLSLvve : NI_2VE_v3<0b0, 0b0110, "smlsl">;
-defm UMLSLvve : NI_2VE_v3<0b1, 0b0110, "umlsl">;
-defm SQDMLALvve : NI_2VE_v3<0b0, 0b0011, "sqdmlal">;
-defm SQDMLSLvve : NI_2VE_v3<0b0, 0b0111, "sqdmlsl">;
-
-multiclass NI_2VE_v3_2op<bit u, bits<4> opcode, string asmop> {
-  // vector register class for element is always 128-bit to cover the max index
-  def _2d2s : NI_2VE_2op<0b0, u, 0b10, opcode, asmop, "2d", "2s", "s",
-                         neon_uimm2_bare, VPR128, VPR64, VPR128> {
-    let Inst{11} = {Index{1}};
-    let Inst{21} = {Index{0}};
-    let Inst{20-16} = Re;
-  }
-
-  def _2d4s : NI_2VE_2op<0b1, u, 0b10, opcode, asmop # "2", "2d", "4s", "s",
-                         neon_uimm2_bare, VPR128, VPR128, VPR128> {
-    let Inst{11} = {Index{1}};
-    let Inst{21} = {Index{0}};
-    let Inst{20-16} = Re;
-  }
-
-  // Index operations on 16-bit(H) elements are restricted to using v0-v15.
-  def _4s8h : NI_2VE_2op<0b1, u, 0b01, opcode, asmop # "2", "4s", "8h", "h",
-                         neon_uimm3_bare, VPR128, VPR128, VPR128Lo> {
-    let Inst{11} = {Index{2}};
-    let Inst{21} = {Index{1}};
-    let Inst{20} = {Index{0}};
-    let Inst{19-16} = Re{3-0};
-  }
-
-  def _4s4h : NI_2VE_2op<0b0, u, 0b01, opcode, asmop, "4s", "4h", "h",
-                         neon_uimm3_bare, VPR128, VPR64, VPR128Lo> {
-    let Inst{11} = {Index{2}};
-    let Inst{21} = {Index{1}};
-    let Inst{20} = {Index{0}};
-    let Inst{19-16} = Re{3-0};
-  }
-}
-
-let SchedRW = [WriteFPMul, ReadFPMul, ReadFPMul] in {
-defm SMULLve : NI_2VE_v3_2op<0b0, 0b1010, "smull">;
-defm UMULLve : NI_2VE_v3_2op<0b1, 0b1010, "umull">;
-defm SQDMULLve : NI_2VE_v3_2op<0b0, 0b1011, "sqdmull">;
-}
-
-def : Pat<(v1f64 (scalar_to_vector (f64 FPR64:$src))),
-          (FMOVdd $src)>;
-
-// Pattern for lane in 128-bit vector
-class NI_2VEL2_laneq<Instruction INST, Operand OpImm, SDPatternOperator op,
-                     RegisterOperand EleOpVPR, ValueType ResTy,
-                     ValueType OpTy, ValueType EleOpTy, ValueType HalfOpTy,
-                     SDPatternOperator hiop>
-  : Pat<(ResTy (op (ResTy VPR128:$src),
-          (HalfOpTy (hiop (OpTy VPR128:$Rn))),
-          (HalfOpTy (Neon_vduplane
-                      (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))),
-        (INST VPR128:$src, VPR128:$Rn, EleOpVPR:$Re, OpImm:$Index)>;
-
-// Pattern for lane in 64-bit vector
-class NI_2VEL2_lane<Instruction INST, Operand OpImm, SDPatternOperator op,
-                    RegisterOperand EleOpVPR, ValueType ResTy,
-                    ValueType OpTy, ValueType EleOpTy, ValueType HalfOpTy,
-                    SDPatternOperator hiop>
-  : Pat<(ResTy (op (ResTy VPR128:$src),
-          (HalfOpTy (hiop (OpTy VPR128:$Rn))),
-          (HalfOpTy (Neon_vduplane
-                      (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))),
-        (INST VPR128:$src, VPR128:$Rn,
-          (SUBREG_TO_REG (i64 0), EleOpVPR:$Re, sub_64), OpImm:$Index)>;
-
-class NI_2VEL2_lane0<Instruction INST, SDPatternOperator op,
-                     ValueType ResTy, ValueType OpTy, ValueType HalfOpTy,
-                     SDPatternOperator hiop, Instruction DupInst>
-  : Pat<(ResTy (op (ResTy VPR128:$src),
-          (HalfOpTy (hiop (OpTy VPR128:$Rn))),
-          (HalfOpTy (Neon_vdup (i32 GPR32:$Re))))),
-        (INST VPR128:$src, VPR128:$Rn, (DupInst $Re), 0)>;
-
-multiclass NI_2VEL_v3_pat<string subop, SDPatternOperator op> {
-  def : NI_2VE_laneq<!cast<Instruction>(subop # "_4s4h"), neon_uimm3_bare,
-                     op, VPR128, VPR64, VPR128Lo, v4i32, v4i16, v8i16>;
-
-  def : NI_2VE_laneq<!cast<Instruction>(subop # "_2d2s"), neon_uimm2_bare,
-                     op, VPR128, VPR64, VPR128, v2i64, v2i32, v4i32>;
-
-  def : NI_2VEL2_laneq<!cast<Instruction>(subop # "_4s8h"), neon_uimm3_bare,
-                       op, VPR128Lo, v4i32, v8i16, v8i16, v4i16, Neon_High8H>;
-
-  def : NI_2VEL2_laneq<!cast<Instruction>(subop # "_2d4s"), neon_uimm2_bare,
-                       op, VPR128, v2i64, v4i32, v4i32, v2i32, Neon_High4S>;
-
-  def : NI_2VEL2_lane0<!cast<Instruction>(subop # "_4s8h"),
-                       op, v4i32, v8i16, v4i16, Neon_High8H, DUP8h>;
-
-  def : NI_2VEL2_lane0<!cast<Instruction>(subop # "_2d4s"),
-                       op, v2i64, v4i32, v2i32, Neon_High4S, DUP4s>;
-
-  // Index can only be half of the max value for lane in 64-bit vector
-
-  def : NI_2VE_lane<!cast<Instruction>(subop # "_4s4h"), neon_uimm2_bare,
-                    op, VPR128, VPR64, VPR64Lo, v4i32, v4i16, v4i16>;
-
-  def : NI_2VE_lane<!cast<Instruction>(subop # "_2d2s"), neon_uimm1_bare,
-                    op, VPR128, VPR64, VPR64, v2i64, v2i32, v2i32>;
-
-  def : NI_2VEL2_lane<!cast<Instruction>(subop # "_4s8h"), neon_uimm2_bare,
-                      op, VPR64Lo, v4i32, v8i16, v4i16, v4i16, Neon_High8H>;
-
-  def : NI_2VEL2_lane<!cast<Instruction>(subop # "_2d4s"), neon_uimm1_bare,
-                      op, VPR64, v2i64, v4i32, v2i32, v2i32, Neon_High4S>;
-}
-
-defm SMLAL_lane_v3 : NI_2VEL_v3_pat<"SMLALvve", Neon_smlal>;
-defm UMLAL_lane_v3 : NI_2VEL_v3_pat<"UMLALvve", Neon_umlal>;
-defm SMLSL_lane_v3 : NI_2VEL_v3_pat<"SMLSLvve", Neon_smlsl>;
-defm UMLSL_lane_v3 : NI_2VEL_v3_pat<"UMLSLvve", Neon_umlsl>;
-
-// Pattern for lane in 128-bit vector
-class NI_2VEL2_mul_laneq<Instruction INST, Operand OpImm, SDPatternOperator op,
-                         RegisterOperand EleOpVPR, ValueType ResTy,
-                         ValueType OpTy, ValueType EleOpTy, ValueType HalfOpTy,
-                         SDPatternOperator hiop>
-  : Pat<(ResTy (op
-          (HalfOpTy (hiop (OpTy VPR128:$Rn))),
-          (HalfOpTy (Neon_vduplane
-                      (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))),
-        (INST VPR128:$Rn, EleOpVPR:$Re, OpImm:$Index)>;
-
-// Pattern for lane in 64-bit vector
-class NI_2VEL2_mul_lane<Instruction INST, Operand OpImm, SDPatternOperator op,
-                        RegisterOperand EleOpVPR, ValueType ResTy,
-                        ValueType OpTy, ValueType EleOpTy, ValueType HalfOpTy,
-                        SDPatternOperator hiop>
-  : Pat<(ResTy (op
-          (HalfOpTy (hiop (OpTy VPR128:$Rn))),
-          (HalfOpTy (Neon_vduplane
-                      (EleOpTy EleOpVPR:$Re), (i64 OpImm:$Index))))),
-        (INST VPR128:$Rn,
-          (SUBREG_TO_REG (i64 0), EleOpVPR:$Re, sub_64), OpImm:$Index)>;
-
-// Pattern for fixed lane 0
-class NI_2VEL2_mul_lane0<Instruction INST, SDPatternOperator op,
-                         ValueType ResTy, ValueType OpTy, ValueType HalfOpTy,
-                         SDPatternOperator hiop, Instruction DupInst>
-  : Pat<(ResTy (op
-          (HalfOpTy (hiop (OpTy VPR128:$Rn))),
-          (HalfOpTy (Neon_vdup (i32 GPR32:$Re))))),
-        (INST VPR128:$Rn, (DupInst $Re), 0)>;
-
-multiclass NI_2VEL_mul_v3_pat<string subop, SDPatternOperator op> {
-  def : NI_2VE_mul_laneq<!cast<Instruction>(subop # "_4s4h"), neon_uimm3_bare,
-                         op, VPR64, VPR128Lo, v4i32, v4i16, v8i16>;
-
-  def : NI_2VE_mul_laneq<!cast<Instruction>(subop # "_2d2s"), neon_uimm2_bare,
-                         op, VPR64, VPR128, v2i64, v2i32, v4i32>;
-
-  def : NI_2VEL2_mul_laneq<!cast<Instruction>(subop # "_4s8h"), neon_uimm3_bare,
-                         op, VPR128Lo, v4i32, v8i16, v8i16, v4i16, Neon_High8H>;
-
-  def : NI_2VEL2_mul_laneq<!cast<Instruction>(subop # "_2d4s"), neon_uimm2_bare,
-                           op, VPR128, v2i64, v4i32, v4i32, v2i32, Neon_High4S>;
-
-  def : NI_2VEL2_mul_lane0<!cast<Instruction>(subop # "_4s8h"),
-                           op, v4i32, v8i16, v4i16, Neon_High8H, DUP8h>;
-
-  def : NI_2VEL2_mul_lane0<!cast<Instruction>(subop # "_2d4s"),
-                           op, v2i64, v4i32, v2i32, Neon_High4S, DUP4s>;
-
-  // Index can only be half of the max value for lane in 64-bit vector
-
-  def : NI_2VE_mul_lane<!cast<Instruction>(subop # "_4s4h"), neon_uimm2_bare,
-                        op, VPR64, VPR64Lo, v4i32, v4i16, v4i16>;
-
-  def : NI_2VE_mul_lane<!cast<Instruction>(subop # "_2d2s"), neon_uimm1_bare,
-                        op, VPR64, VPR64, v2i64, v2i32, v2i32>;
-
-  def : NI_2VEL2_mul_lane<!cast<Instruction>(subop # "_4s8h"), neon_uimm2_bare,
-                          op, VPR64Lo, v4i32, v8i16, v4i16, v4i16, Neon_High8H>;
-
-  def : NI_2VEL2_mul_lane<!cast<Instruction>(subop # "_2d4s"), neon_uimm1_bare,
-                          op, VPR64, v2i64, v4i32, v2i32, v2i32, Neon_High4S>;
-}
-
-defm SMULL_lane_v3 : NI_2VEL_mul_v3_pat<"SMULLve", int_arm_neon_vmulls>;
-defm UMULL_lane_v3 : NI_2VEL_mul_v3_pat<"UMULLve", int_arm_neon_vmullu>;
-defm SQDMULL_lane_v3 : NI_2VEL_mul_v3_pat<"SQDMULLve", int_arm_neon_vqdmull>;
-
-multiclass NI_qdma<SDPatternOperator op> {
-  def _4s : PatFrag<(ops node:$Ra, node:$Rn, node:$Rm),
-                    (op node:$Ra,
-                      (v4i32 (int_arm_neon_vqdmull node:$Rn, node:$Rm)))>;
-
-  def _2d : PatFrag<(ops node:$Ra, node:$Rn, node:$Rm),
-                    (op node:$Ra,
-                      (v2i64 (int_arm_neon_vqdmull node:$Rn, node:$Rm)))>;
-}
-
-defm Neon_qdmlal : NI_qdma<int_arm_neon_vqadds>;
-defm Neon_qdmlsl : NI_qdma<int_arm_neon_vqsubs>;
-
-multiclass NI_2VEL_v3_qdma_pat<string subop, string op> {
-  def : NI_2VE_laneq<!cast<Instruction>(subop # "_4s4h"), neon_uimm3_bare,
-                     !cast<PatFrag>(op # "_4s"), VPR128, VPR64, VPR128Lo,
-                     v4i32, v4i16, v8i16>;
-
-  def : NI_2VE_laneq<!cast<Instruction>(subop # "_2d2s"), neon_uimm2_bare,
-                     !cast<PatFrag>(op # "_2d"), VPR128, VPR64, VPR128,
-                     v2i64, v2i32, v4i32>;
-
-  def : NI_2VEL2_laneq<!cast<Instruction>(subop # "_4s8h"), neon_uimm3_bare,
-                       !cast<PatFrag>(op # "_4s"), VPR128Lo,
-                       v4i32, v8i16, v8i16, v4i16, Neon_High8H>;
-
-  def : NI_2VEL2_laneq<!cast<Instruction>(subop # "_2d4s"), neon_uimm2_bare,
-                       !cast<PatFrag>(op # "_2d"), VPR128,
-                       v2i64, v4i32, v4i32, v2i32, Neon_High4S>;
-
-  def : NI_2VEL2_lane0<!cast<Instruction>(subop # "_4s8h"),
-                       !cast<PatFrag>(op # "_4s"),
-                       v4i32, v8i16, v4i16, Neon_High8H, DUP8h>;
-
-  def : NI_2VEL2_lane0<!cast<Instruction>(subop # "_2d4s"),
-                       !cast<PatFrag>(op # "_2d"),
-                       v2i64, v4i32, v2i32, Neon_High4S, DUP4s>;
-
-  // Index can only be half of the max value for lane in 64-bit vector
-
-  def : NI_2VE_lane<!cast<Instruction>(subop # "_4s4h"), neon_uimm2_bare,
-                    !cast<PatFrag>(op # "_4s"), VPR128, VPR64, VPR64Lo,
-                    v4i32, v4i16, v4i16>;
-
-  def : NI_2VE_lane<!cast<Instruction>(subop # "_2d2s"), neon_uimm1_bare,
-                    !cast<PatFrag>(op # "_2d"), VPR128, VPR64, VPR64,
-                    v2i64, v2i32, v2i32>;
-
-  def : NI_2VEL2_lane<!cast<Instruction>(subop # "_4s8h"), neon_uimm2_bare,
-                      !cast<PatFrag>(op # "_4s"), VPR64Lo,
-                      v4i32, v8i16, v4i16, v4i16, Neon_High8H>;
-
-  def : NI_2VEL2_lane<!cast<Instruction>(subop # "_2d4s"), neon_uimm1_bare,
-                      !cast<PatFrag>(op # "_2d"), VPR64,
-                      v2i64, v4i32, v2i32, v2i32, Neon_High4S>;
-}
-
-defm SQDMLAL_lane_v3 : NI_2VEL_v3_qdma_pat<"SQDMLALvve", "Neon_qdmlal">;
-defm SQDMLSL_lane_v3 : NI_2VEL_v3_qdma_pat<"SQDMLSLvve", "Neon_qdmlsl">;
-
-// End of implementation for instruction class (3V Elem)
-
-class NeonI_REV<string asmop, string Res, bits<2> size, bit Q, bit U,
-                bits<5> opcode, RegisterOperand ResVPR, ValueType ResTy,
-                SDPatternOperator Neon_Rev>
-  : NeonI_2VMisc<Q, U, size, opcode,
-               (outs ResVPR:$Rd), (ins ResVPR:$Rn),
-               asmop # "\t$Rd." # Res # ", $Rn." # Res,
-               [(set (ResTy ResVPR:$Rd),
-                  (ResTy (Neon_Rev (ResTy ResVPR:$Rn))))],
-               NoItinerary>,
-    Sched<[WriteFPALU, ReadFPALU]>;
-
-def REV64_16b : NeonI_REV<"rev64", "16b", 0b00, 0b1, 0b0, 0b00000, VPR128,
-                          v16i8, Neon_rev64>;
-def REV64_8h : NeonI_REV<"rev64", "8h", 0b01, 0b1, 0b0, 0b00000, VPR128,
-                         v8i16, Neon_rev64>;
-def REV64_4s : NeonI_REV<"rev64", "4s", 0b10, 0b1, 0b0, 0b00000, VPR128,
-                         v4i32, Neon_rev64>;
-def REV64_8b : NeonI_REV<"rev64", "8b", 0b00, 0b0, 0b0, 0b00000, VPR64,
-                         v8i8, Neon_rev64>;
-def REV64_4h : NeonI_REV<"rev64", "4h", 0b01, 0b0, 0b0, 0b00000, VPR64,
-                         v4i16, Neon_rev64>;
-def REV64_2s : NeonI_REV<"rev64", "2s", 0b10, 0b0, 0b0, 0b00000, VPR64,
-                         v2i32, Neon_rev64>;
-
-def : Pat<(v4f32 (Neon_rev64 (v4f32 VPR128:$Rn))), (REV64_4s VPR128:$Rn)>;
-def : Pat<(v2f32 (Neon_rev64 (v2f32 VPR64:$Rn))), (REV64_2s VPR64:$Rn)>;
-
-def REV32_16b : NeonI_REV<"rev32", "16b", 0b00, 0b1, 0b1, 0b00000, VPR128,
-                          v16i8, Neon_rev32>;
-def REV32_8h : NeonI_REV<"rev32", "8h", 0b01, 0b1, 0b1, 0b00000, VPR128,
-                          v8i16, Neon_rev32>;
-def REV32_8b : NeonI_REV<"rev32", "8b", 0b00, 0b0, 0b1, 0b00000, VPR64,
-                         v8i8, Neon_rev32>;
-def REV32_4h : NeonI_REV<"rev32", "4h", 0b01, 0b0, 0b1, 0b00000, VPR64,
-                         v4i16, Neon_rev32>;
-
-def REV16_16b : NeonI_REV<"rev16", "16b", 0b00, 0b1, 0b0, 0b00001, VPR128,
-                          v16i8, Neon_rev16>;
-def REV16_8b : NeonI_REV<"rev16", "8b", 0b00, 0b0, 0b0, 0b00001, VPR64,
-                         v8i8, Neon_rev16>;
-
-multiclass NeonI_PairwiseAdd<string asmop, bit U, bits<5> opcode,
-                             SDPatternOperator Neon_Padd> {
-  def 16b8h : NeonI_2VMisc<0b1, U, 0b00, opcode,
-                           (outs VPR128:$Rd), (ins VPR128:$Rn),
-                           asmop # "\t$Rd.8h, $Rn.16b",
-                           [(set (v8i16 VPR128:$Rd),
-                              (v8i16 (Neon_Padd (v16i8 VPR128:$Rn))))],
-                           NoItinerary>,
-              Sched<[WriteFPALU, ReadFPALU]>;
-
-  def 8b4h : NeonI_2VMisc<0b0, U, 0b00, opcode,
-                          (outs VPR64:$Rd), (ins VPR64:$Rn),
-                          asmop # "\t$Rd.4h, $Rn.8b",
-                          [(set (v4i16 VPR64:$Rd),
-                             (v4i16 (Neon_Padd (v8i8 VPR64:$Rn))))],
-                          NoItinerary>,
-             Sched<[WriteFPALU, ReadFPALU]>;
-
-  def 8h4s : NeonI_2VMisc<0b1, U, 0b01, opcode,
-                           (outs VPR128:$Rd), (ins VPR128:$Rn),
-                           asmop # "\t$Rd.4s, $Rn.8h",
-                           [(set (v4i32 VPR128:$Rd),
-                              (v4i32 (Neon_Padd (v8i16 VPR128:$Rn))))],
-                           NoItinerary>,
-             Sched<[WriteFPALU, ReadFPALU]>;
-
-  def 4h2s : NeonI_2VMisc<0b0, U, 0b01, opcode,
-                          (outs VPR64:$Rd), (ins VPR64:$Rn),
-                          asmop # "\t$Rd.2s, $Rn.4h",
-                          [(set (v2i32 VPR64:$Rd),
-                             (v2i32 (Neon_Padd (v4i16 VPR64:$Rn))))],
-                          NoItinerary>,
-             Sched<[WriteFPALU, ReadFPALU]>;
-
-  def 4s2d : NeonI_2VMisc<0b1, U, 0b10, opcode,
-                           (outs VPR128:$Rd), (ins VPR128:$Rn),
-                           asmop # "\t$Rd.2d, $Rn.4s",
-                           [(set (v2i64 VPR128:$Rd),
-                              (v2i64 (Neon_Padd (v4i32 VPR128:$Rn))))],
-                           NoItinerary>,
-             Sched<[WriteFPALU, ReadFPALU]>;
-
-  def 2s1d : NeonI_2VMisc<0b0, U, 0b10, opcode,
-                          (outs VPR64:$Rd), (ins VPR64:$Rn),
-                          asmop # "\t$Rd.1d, $Rn.2s",
-                          [(set (v1i64 VPR64:$Rd),
-                             (v1i64 (Neon_Padd (v2i32 VPR64:$Rn))))],
-                          NoItinerary>,
-             Sched<[WriteFPALU, ReadFPALU]>;
-}
-
-defm SADDLP : NeonI_PairwiseAdd<"saddlp", 0b0, 0b00010,
-                                int_arm_neon_vpaddls>;
-defm UADDLP : NeonI_PairwiseAdd<"uaddlp", 0b1, 0b00010,
-                                int_arm_neon_vpaddlu>;
-
-def : Pat<(v1i64 (int_aarch64_neon_saddlv (v2i32 VPR64:$Rn))),
-          (SADDLP2s1d $Rn)>;
-def : Pat<(v1i64 (int_aarch64_neon_uaddlv (v2i32 VPR64:$Rn))),
-          (UADDLP2s1d $Rn)>;
-
-multiclass NeonI_PairwiseAddAcc<string asmop, bit U, bits<5> opcode,
-                             SDPatternOperator Neon_Padd> {
-  let Constraints = "$src = $Rd" in {
-    def 16b8h : NeonI_2VMisc<0b1, U, 0b00, opcode,
-                             (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn),
-                             asmop # "\t$Rd.8h, $Rn.16b",
-                             [(set (v8i16 VPR128:$Rd),
-                                (v8i16 (Neon_Padd
-                                  (v8i16 VPR128:$src), (v16i8 VPR128:$Rn))))],
-                             NoItinerary>,
-                Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
-
-    def 8b4h : NeonI_2VMisc<0b0, U, 0b00, opcode,
-                            (outs VPR64:$Rd), (ins VPR64:$src, VPR64:$Rn),
-                            asmop # "\t$Rd.4h, $Rn.8b",
-                            [(set (v4i16 VPR64:$Rd),
-                               (v4i16 (Neon_Padd
-                                 (v4i16 VPR64:$src), (v8i8 VPR64:$Rn))))],
-                            NoItinerary>,
-               Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
-
-    def 8h4s : NeonI_2VMisc<0b1, U, 0b01, opcode,
-                            (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn),
-                            asmop # "\t$Rd.4s, $Rn.8h",
-                            [(set (v4i32 VPR128:$Rd),
-                               (v4i32 (Neon_Padd
-                                 (v4i32 VPR128:$src), (v8i16 VPR128:$Rn))))],
-                            NoItinerary>,
-               Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
-
-    def 4h2s : NeonI_2VMisc<0b0, U, 0b01, opcode,
-                            (outs VPR64:$Rd), (ins VPR64:$src, VPR64:$Rn),
-                            asmop # "\t$Rd.2s, $Rn.4h",
-                            [(set (v2i32 VPR64:$Rd),
-                               (v2i32 (Neon_Padd
-                                 (v2i32 VPR64:$src), (v4i16 VPR64:$Rn))))],
-                            NoItinerary>,
-               Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
-
-    def 4s2d : NeonI_2VMisc<0b1, U, 0b10, opcode,
-                            (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn),
-                            asmop # "\t$Rd.2d, $Rn.4s",
-                            [(set (v2i64 VPR128:$Rd),
-                               (v2i64 (Neon_Padd
-                                 (v2i64 VPR128:$src), (v4i32 VPR128:$Rn))))],
-                            NoItinerary>,
-               Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
-
-    def 2s1d : NeonI_2VMisc<0b0, U, 0b10, opcode,
-                            (outs VPR64:$Rd), (ins VPR64:$src, VPR64:$Rn),
-                            asmop # "\t$Rd.1d, $Rn.2s",
-                            [(set (v1i64 VPR64:$Rd),
-                               (v1i64 (Neon_Padd
-                                 (v1i64 VPR64:$src), (v2i32 VPR64:$Rn))))],
-                            NoItinerary>,
-               Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
-  }
-}
-
-defm SADALP : NeonI_PairwiseAddAcc<"sadalp", 0b0, 0b00110,
-                                   int_arm_neon_vpadals>;
-defm UADALP : NeonI_PairwiseAddAcc<"uadalp", 0b1, 0b00110,
-                                   int_arm_neon_vpadalu>;
-
-multiclass NeonI_2VMisc_BHSDsize_1Arg<string asmop, bit U, bits<5> opcode> {
-  def 16b : NeonI_2VMisc<0b1, U, 0b00, opcode,
-                         (outs VPR128:$Rd), (ins VPR128:$Rn),
-                         asmop # "\t$Rd.16b, $Rn.16b",
-                         [], NoItinerary>,
-            Sched<[WriteFPALU, ReadFPALU]>;
-
-  def 8h : NeonI_2VMisc<0b1, U, 0b01, opcode,
-                        (outs VPR128:$Rd), (ins VPR128:$Rn),
-                        asmop # "\t$Rd.8h, $Rn.8h",
-                        [], NoItinerary>,
-           Sched<[WriteFPALU, ReadFPALU]>;
-
-  def 4s : NeonI_2VMisc<0b1, U, 0b10, opcode,
-                        (outs VPR128:$Rd), (ins VPR128:$Rn),
-                        asmop # "\t$Rd.4s, $Rn.4s",
-                        [], NoItinerary>,
-           Sched<[WriteFPALU, ReadFPALU]>;
-
-  def 2d : NeonI_2VMisc<0b1, U, 0b11, opcode,
-                        (outs VPR128:$Rd), (ins VPR128:$Rn),
-                        asmop # "\t$Rd.2d, $Rn.2d",
-                        [], NoItinerary>,
-           Sched<[WriteFPALU, ReadFPALU]>;
-
-  def 8b : NeonI_2VMisc<0b0, U, 0b00, opcode,
-                         (outs VPR64:$Rd), (ins VPR64:$Rn),
-                         asmop # "\t$Rd.8b, $Rn.8b",
-                         [], NoItinerary>,
-           Sched<[WriteFPALU, ReadFPALU]>;
-
-  def 4h : NeonI_2VMisc<0b0, U, 0b01, opcode,
-                        (outs VPR64:$Rd), (ins VPR64:$Rn),
-                        asmop # "\t$Rd.4h, $Rn.4h",
-                        [], NoItinerary>,
-           Sched<[WriteFPALU, ReadFPALU]>;
-
-  def 2s : NeonI_2VMisc<0b0, U, 0b10, opcode,
-                        (outs VPR64:$Rd), (ins VPR64:$Rn),
-                        asmop # "\t$Rd.2s, $Rn.2s",
-                        [], NoItinerary>,
-           Sched<[WriteFPALU, ReadFPALU]>;
-}
-
-defm SQABS : NeonI_2VMisc_BHSDsize_1Arg<"sqabs", 0b0, 0b00111>;
-defm SQNEG : NeonI_2VMisc_BHSDsize_1Arg<"sqneg", 0b1, 0b00111>;
-defm ABS : NeonI_2VMisc_BHSDsize_1Arg<"abs", 0b0, 0b01011>;
-defm NEG : NeonI_2VMisc_BHSDsize_1Arg<"neg", 0b1, 0b01011>;
-
-multiclass NeonI_2VMisc_BHSD_1Arg_Pattern<string Prefix,
-                                          SDPatternOperator Neon_Op> {
-  def : Pat<(v16i8 (Neon_Op (v16i8 VPR128:$Rn))),
-            (v16i8 (!cast<Instruction>(Prefix # 16b) (v16i8 VPR128:$Rn)))>;
-
-  def : Pat<(v8i16 (Neon_Op (v8i16 VPR128:$Rn))),
-            (v8i16 (!cast<Instruction>(Prefix # 8h) (v8i16 VPR128:$Rn)))>;
-
-  def : Pat<(v4i32 (Neon_Op (v4i32 VPR128:$Rn))),
-            (v4i32 (!cast<Instruction>(Prefix # 4s) (v4i32 VPR128:$Rn)))>;
-
-  def : Pat<(v2i64 (Neon_Op (v2i64 VPR128:$Rn))),
-            (v2i64 (!cast<Instruction>(Prefix # 2d) (v2i64 VPR128:$Rn)))>;
-
-  def : Pat<(v8i8 (Neon_Op (v8i8 VPR64:$Rn))),
-            (v8i8 (!cast<Instruction>(Prefix # 8b) (v8i8 VPR64:$Rn)))>;
-
-  def : Pat<(v4i16 (Neon_Op (v4i16 VPR64:$Rn))),
-            (v4i16 (!cast<Instruction>(Prefix # 4h) (v4i16 VPR64:$Rn)))>;
-
-  def : Pat<(v2i32 (Neon_Op (v2i32 VPR64:$Rn))),
-            (v2i32 (!cast<Instruction>(Prefix # 2s) (v2i32 VPR64:$Rn)))>;
-}
-
-defm : NeonI_2VMisc_BHSD_1Arg_Pattern<"SQABS", int_arm_neon_vqabs>;
-defm : NeonI_2VMisc_BHSD_1Arg_Pattern<"SQNEG", int_arm_neon_vqneg>;
-defm : NeonI_2VMisc_BHSD_1Arg_Pattern<"ABS", int_arm_neon_vabs>;
-
-def : Pat<(v16i8 (sub
-            (v16i8 Neon_AllZero),
-            (v16i8 VPR128:$Rn))),
-          (v16i8 (NEG16b (v16i8 VPR128:$Rn)))>;
-def : Pat<(v8i8 (sub
-            (v8i8 Neon_AllZero),
-            (v8i8 VPR64:$Rn))),
-          (v8i8 (NEG8b (v8i8 VPR64:$Rn)))>;
-def : Pat<(v8i16 (sub
-            (v8i16 (bitconvert (v16i8 Neon_AllZero))),
-            (v8i16 VPR128:$Rn))),
-          (v8i16 (NEG8h (v8i16 VPR128:$Rn)))>;
-def : Pat<(v4i16 (sub
-            (v4i16 (bitconvert (v8i8 Neon_AllZero))),
-            (v4i16 VPR64:$Rn))),
-          (v4i16 (NEG4h (v4i16 VPR64:$Rn)))>;
-def : Pat<(v4i32 (sub
-            (v4i32 (bitconvert (v16i8 Neon_AllZero))),
-            (v4i32 VPR128:$Rn))),
-          (v4i32 (NEG4s (v4i32 VPR128:$Rn)))>;
-def : Pat<(v2i32 (sub
-            (v2i32 (bitconvert (v8i8 Neon_AllZero))),
-            (v2i32 VPR64:$Rn))),
-          (v2i32 (NEG2s (v2i32 VPR64:$Rn)))>;
-def : Pat<(v2i64 (sub
-            (v2i64 (bitconvert (v16i8 Neon_AllZero))),
-            (v2i64 VPR128:$Rn))),
-          (v2i64 (NEG2d (v2i64 VPR128:$Rn)))>;
-
-multiclass NeonI_2VMisc_BHSDsize_2Args<string asmop, bit U, bits<5> opcode> {
-  let Constraints = "$src = $Rd" in {
-    def 16b : NeonI_2VMisc<0b1, U, 0b00, opcode,
-                           (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn),
-                           asmop # "\t$Rd.16b, $Rn.16b",
-                           [], NoItinerary>,
-              Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
-
-    def 8h : NeonI_2VMisc<0b1, U, 0b01, opcode,
-                          (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn),
-                          asmop # "\t$Rd.8h, $Rn.8h",
-                          [], NoItinerary>,
-             Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
-
-    def 4s : NeonI_2VMisc<0b1, U, 0b10, opcode,
-                          (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn),
-                          asmop # "\t$Rd.4s, $Rn.4s",
-                          [], NoItinerary>,
-             Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
-
-    def 2d : NeonI_2VMisc<0b1, U, 0b11, opcode,
-                          (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn),
-                          asmop # "\t$Rd.2d, $Rn.2d",
-                          [], NoItinerary>,
-             Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
-
-    def 8b : NeonI_2VMisc<0b0, U, 0b00, opcode,
-                          (outs VPR64:$Rd), (ins VPR64:$src, VPR64:$Rn),
-                          asmop # "\t$Rd.8b, $Rn.8b",
-                          [], NoItinerary>,
-             Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
-
-    def 4h : NeonI_2VMisc<0b0, U, 0b01, opcode,
-                          (outs VPR64:$Rd), (ins VPR64:$src, VPR64:$Rn),
-                          asmop # "\t$Rd.4h, $Rn.4h",
-                          [], NoItinerary>,
-             Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
-
-    def 2s : NeonI_2VMisc<0b0, U, 0b10, opcode,
-                          (outs VPR64:$Rd), (ins VPR64:$src, VPR64:$Rn),
-                          asmop # "\t$Rd.2s, $Rn.2s",
-                          [], NoItinerary>,
-             Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
-  }
-}
-
-defm SUQADD : NeonI_2VMisc_BHSDsize_2Args<"suqadd", 0b0, 0b00011>;
-defm USQADD : NeonI_2VMisc_BHSDsize_2Args<"usqadd", 0b1, 0b00011>;
-
-multiclass NeonI_2VMisc_BHSD_2Args_Pattern<string Prefix,
-                                           SDPatternOperator Neon_Op> {
-  def : Pat<(v16i8 (Neon_Op (v16i8 VPR128:$src), (v16i8 VPR128:$Rn))),
-            (v16i8 (!cast<Instruction>(Prefix # 16b)
-              (v16i8 VPR128:$src), (v16i8 VPR128:$Rn)))>;
-
-  def : Pat<(v8i16 (Neon_Op (v8i16 VPR128:$src), (v8i16 VPR128:$Rn))),
-            (v8i16 (!cast<Instruction>(Prefix # 8h)
-              (v8i16 VPR128:$src), (v8i16 VPR128:$Rn)))>;
-
-  def : Pat<(v4i32 (Neon_Op (v4i32 VPR128:$src), (v4i32 VPR128:$Rn))),
-            (v4i32 (!cast<Instruction>(Prefix # 4s)
-              (v4i32 VPR128:$src), (v4i32 VPR128:$Rn)))>;
-
-  def : Pat<(v2i64 (Neon_Op (v2i64 VPR128:$src), (v2i64 VPR128:$Rn))),
-            (v2i64 (!cast<Instruction>(Prefix # 2d)
-              (v2i64 VPR128:$src), (v2i64 VPR128:$Rn)))>;
-
-  def : Pat<(v8i8 (Neon_Op (v8i8 VPR64:$src), (v8i8 VPR64:$Rn))),
-            (v8i8 (!cast<Instruction>(Prefix # 8b)
-              (v8i8 VPR64:$src), (v8i8 VPR64:$Rn)))>;
-
-  def : Pat<(v4i16 (Neon_Op (v4i16 VPR64:$src), (v4i16 VPR64:$Rn))),
-            (v4i16 (!cast<Instruction>(Prefix # 4h)
-              (v4i16 VPR64:$src), (v4i16 VPR64:$Rn)))>;
-
-  def : Pat<(v2i32 (Neon_Op (v2i32 VPR64:$src), (v2i32 VPR64:$Rn))),
-            (v2i32 (!cast<Instruction>(Prefix # 2s)
-              (v2i32 VPR64:$src), (v2i32 VPR64:$Rn)))>;
-}
-
-defm : NeonI_2VMisc_BHSD_2Args_Pattern<"SUQADD", int_aarch64_neon_suqadd>;
-defm : NeonI_2VMisc_BHSD_2Args_Pattern<"USQADD", int_aarch64_neon_usqadd>;
-
-multiclass NeonI_2VMisc_BHSsizes<string asmop, bit U,
-                          SDPatternOperator Neon_Op> {
-  def 16b : NeonI_2VMisc<0b1, U, 0b00, 0b00100,
-                         (outs VPR128:$Rd), (ins VPR128:$Rn),
-                         asmop # "\t$Rd.16b, $Rn.16b",
-                         [(set (v16i8 VPR128:$Rd),
-                            (v16i8 (Neon_Op (v16i8 VPR128:$Rn))))],
-                         NoItinerary>,
-            Sched<[WriteFPALU, ReadFPALU]>;
-
-  def 8h : NeonI_2VMisc<0b1, U, 0b01, 0b00100,
-                        (outs VPR128:$Rd), (ins VPR128:$Rn),
-                        asmop # "\t$Rd.8h, $Rn.8h",
-                        [(set (v8i16 VPR128:$Rd),
-                           (v8i16 (Neon_Op (v8i16 VPR128:$Rn))))],
-                        NoItinerary>,
-           Sched<[WriteFPALU, ReadFPALU]>;
-
-  def 4s : NeonI_2VMisc<0b1, U, 0b10, 0b00100,
-                        (outs VPR128:$Rd), (ins VPR128:$Rn),
-                        asmop # "\t$Rd.4s, $Rn.4s",
-                        [(set (v4i32 VPR128:$Rd),
-                           (v4i32 (Neon_Op (v4i32 VPR128:$Rn))))],
-                        NoItinerary>,
-           Sched<[WriteFPALU, ReadFPALU]>;
-
-  def 8b : NeonI_2VMisc<0b0, U, 0b00, 0b00100,
-                        (outs VPR64:$Rd), (ins VPR64:$Rn),
-                        asmop # "\t$Rd.8b, $Rn.8b",
-                        [(set (v8i8 VPR64:$Rd),
-                           (v8i8 (Neon_Op (v8i8 VPR64:$Rn))))],
-                        NoItinerary>,
-           Sched<[WriteFPALU, ReadFPALU]>;
-
-  def 4h : NeonI_2VMisc<0b0, U, 0b01, 0b00100,
-                        (outs VPR64:$Rd), (ins VPR64:$Rn),
-                        asmop # "\t$Rd.4h, $Rn.4h",
-                        [(set (v4i16 VPR64:$Rd),
-                           (v4i16 (Neon_Op (v4i16 VPR64:$Rn))))],
-                        NoItinerary>,
-           Sched<[WriteFPALU, ReadFPALU]>;
-
-  def 2s : NeonI_2VMisc<0b0, U, 0b10, 0b00100,
-                        (outs VPR64:$Rd), (ins VPR64:$Rn),
-                        asmop # "\t$Rd.2s, $Rn.2s",
-                        [(set (v2i32 VPR64:$Rd),
-                           (v2i32 (Neon_Op (v2i32 VPR64:$Rn))))],
-                        NoItinerary>,
-           Sched<[WriteFPALU, ReadFPALU]>;
-}
-
-defm CLS : NeonI_2VMisc_BHSsizes<"cls", 0b0, int_arm_neon_vcls>;
-defm CLZ : NeonI_2VMisc_BHSsizes<"clz", 0b1, ctlz>;
-
-multiclass NeonI_2VMisc_Bsize<string asmop, bit U, bits<2> size,
-                              bits<5> Opcode> {
-  def 16b : NeonI_2VMisc<0b1, U, size, Opcode,
-                         (outs VPR128:$Rd), (ins VPR128:$Rn),
-                         asmop # "\t$Rd.16b, $Rn.16b",
-                         [], NoItinerary>,
-            Sched<[WriteFPALU, ReadFPALU]>;
-
-  def 8b : NeonI_2VMisc<0b0, U, size, Opcode,
-                        (outs VPR64:$Rd), (ins VPR64:$Rn),
-                        asmop # "\t$Rd.8b, $Rn.8b",
-                        [], NoItinerary>,
-           Sched<[WriteFPALU, ReadFPALU]>;
-}
-
-defm CNT : NeonI_2VMisc_Bsize<"cnt", 0b0, 0b00, 0b00101>;
-defm NOT : NeonI_2VMisc_Bsize<"not", 0b1, 0b00, 0b00101>;
-defm RBIT : NeonI_2VMisc_Bsize<"rbit", 0b1, 0b01, 0b00101>;
-
-def : NeonInstAlias<"mvn $Rd.16b, $Rn.16b",
-                    (NOT16b VPR128:$Rd, VPR128:$Rn), 0>;
-def : NeonInstAlias<"mvn $Rd.8b, $Rn.8b",
-                    (NOT8b VPR64:$Rd, VPR64:$Rn), 0>;
-
-def : Pat<(v16i8 (ctpop (v16i8 VPR128:$Rn))),
-          (v16i8 (CNT16b (v16i8 VPR128:$Rn)))>;
-def : Pat<(v8i8 (ctpop (v8i8 VPR64:$Rn))),
-          (v8i8 (CNT8b (v8i8 VPR64:$Rn)))>;
-
-def : Pat<(v16i8 (xor
-            (v16i8 VPR128:$Rn),
-            (v16i8 Neon_AllOne))),
-          (v16i8 (NOT16b (v16i8 VPR128:$Rn)))>;
-def : Pat<(v8i8 (xor
-            (v8i8 VPR64:$Rn),
-            (v8i8 Neon_AllOne))),
-          (v8i8 (NOT8b (v8i8 VPR64:$Rn)))>;
-def : Pat<(v8i16 (xor
-            (v8i16 VPR128:$Rn),
-            (v8i16 (bitconvert (v16i8 Neon_AllOne))))),
-          (NOT16b VPR128:$Rn)>;
-def : Pat<(v4i16 (xor
-            (v4i16 VPR64:$Rn),
-            (v4i16 (bitconvert (v8i8 Neon_AllOne))))),
-          (NOT8b VPR64:$Rn)>;
-def : Pat<(v4i32 (xor
-            (v4i32 VPR128:$Rn),
-            (v4i32 (bitconvert (v16i8 Neon_AllOne))))),
-          (NOT16b VPR128:$Rn)>;
-def : Pat<(v2i32 (xor
-            (v2i32 VPR64:$Rn),
-            (v2i32 (bitconvert (v8i8 Neon_AllOne))))),
-          (NOT8b VPR64:$Rn)>;
-def : Pat<(v2i64 (xor
-            (v2i64 VPR128:$Rn),
-            (v2i64 (bitconvert (v16i8 Neon_AllOne))))),
-          (NOT16b VPR128:$Rn)>;
-
-def : Pat<(v16i8 (int_aarch64_neon_rbit (v16i8 VPR128:$Rn))),
-          (v16i8 (RBIT16b (v16i8 VPR128:$Rn)))>;
-def : Pat<(v8i8 (int_aarch64_neon_rbit (v8i8 VPR64:$Rn))),
-          (v8i8 (RBIT8b (v8i8 VPR64:$Rn)))>;
-
-multiclass NeonI_2VMisc_SDsizes<string asmop, bit U, bits<5> opcode,
-                                SDPatternOperator Neon_Op> {
-  def 4s : NeonI_2VMisc<0b1, U, 0b10, opcode,
-                        (outs VPR128:$Rd), (ins VPR128:$Rn),
-                        asmop # "\t$Rd.4s, $Rn.4s",
-                        [(set (v4f32 VPR128:$Rd),
-                           (v4f32 (Neon_Op (v4f32 VPR128:$Rn))))],
-                        NoItinerary>,
-           Sched<[WriteFPALU, ReadFPALU]>;
-
-  def 2d : NeonI_2VMisc<0b1, U, 0b11, opcode,
-                        (outs VPR128:$Rd), (ins VPR128:$Rn),
-                        asmop # "\t$Rd.2d, $Rn.2d",
-                        [(set (v2f64 VPR128:$Rd),
-                           (v2f64 (Neon_Op (v2f64 VPR128:$Rn))))],
-                        NoItinerary>,
-           Sched<[WriteFPALU, ReadFPALU]>;
-
-  def 2s : NeonI_2VMisc<0b0, U, 0b10, opcode,
-                        (outs VPR64:$Rd), (ins VPR64:$Rn),
-                        asmop # "\t$Rd.2s, $Rn.2s",
-                        [(set (v2f32 VPR64:$Rd),
-                           (v2f32 (Neon_Op (v2f32 VPR64:$Rn))))],
-                        NoItinerary>,
-           Sched<[WriteFPALU, ReadFPALU]>;
-}
-
-defm FABS : NeonI_2VMisc_SDsizes<"fabs", 0b0, 0b01111, fabs>;
-defm FNEG : NeonI_2VMisc_SDsizes<"fneg", 0b1, 0b01111, fneg>;
-
-multiclass NeonI_2VMisc_HSD_Narrow<string asmop, bit U, bits<5> opcode> {
-  def 8h8b : NeonI_2VMisc<0b0, U, 0b00, opcode,
-                          (outs VPR64:$Rd), (ins VPR128:$Rn),
-                          asmop # "\t$Rd.8b, $Rn.8h",
-                          [], NoItinerary>,
-             Sched<[WriteFPALU, ReadFPALU]>;
-
-  def 4s4h : NeonI_2VMisc<0b0, U, 0b01, opcode,
-                          (outs VPR64:$Rd), (ins VPR128:$Rn),
-                          asmop # "\t$Rd.4h, $Rn.4s",
-                          [], NoItinerary>,
-             Sched<[WriteFPALU, ReadFPALU]>;
-
-  def 2d2s : NeonI_2VMisc<0b0, U, 0b10, opcode,
-                          (outs VPR64:$Rd), (ins VPR128:$Rn),
-                          asmop # "\t$Rd.2s, $Rn.2d",
-                          [], NoItinerary>,
-             Sched<[WriteFPALU, ReadFPALU]>;
-
-  let Constraints = "$Rd = $src" in {
-    def 8h16b : NeonI_2VMisc<0b1, U, 0b00, opcode,
-                             (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn),
-                             asmop # "2\t$Rd.16b, $Rn.8h",
-                             [], NoItinerary>,
-                Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
-
-    def 4s8h : NeonI_2VMisc<0b1, U, 0b01, opcode,
-                            (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn),
-                            asmop # "2\t$Rd.8h, $Rn.4s",
-                            [], NoItinerary>,
-               Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
-
-    def 2d4s : NeonI_2VMisc<0b1, U, 0b10, opcode,
-                            (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn),
-                            asmop # "2\t$Rd.4s, $Rn.2d",
-                            [], NoItinerary>,
-               Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
-  }
-}
-
-defm XTN : NeonI_2VMisc_HSD_Narrow<"xtn", 0b0, 0b10010>;
-defm SQXTUN : NeonI_2VMisc_HSD_Narrow<"sqxtun", 0b1, 0b10010>;
-defm SQXTN : NeonI_2VMisc_HSD_Narrow<"sqxtn", 0b0, 0b10100>;
-defm UQXTN : NeonI_2VMisc_HSD_Narrow<"uqxtn", 0b1, 0b10100>;
-
-multiclass NeonI_2VMisc_Narrow_Patterns<string Prefix,
-                                        SDPatternOperator Neon_Op> {
-  def : Pat<(v8i8 (Neon_Op (v8i16 VPR128:$Rn))),
-            (v8i8 (!cast<Instruction>(Prefix # 8h8b) (v8i16 VPR128:$Rn)))>;
-
-  def : Pat<(v4i16 (Neon_Op (v4i32 VPR128:$Rn))),
-            (v4i16 (!cast<Instruction>(Prefix # 4s4h) (v4i32 VPR128:$Rn)))>;
-
-  def : Pat<(v2i32 (Neon_Op (v2i64 VPR128:$Rn))),
-            (v2i32 (!cast<Instruction>(Prefix # 2d2s) (v2i64 VPR128:$Rn)))>;
-
-  def : Pat<(v16i8 (concat_vectors
-              (v8i8 VPR64:$src),
-              (v8i8 (Neon_Op (v8i16 VPR128:$Rn))))),
-            (!cast<Instruction>(Prefix # 8h16b)
-              (SUBREG_TO_REG (i32 0), VPR64:$src, sub_64),
-              VPR128:$Rn)>;
-
-  def : Pat<(v8i16 (concat_vectors
-              (v4i16 VPR64:$src),
-              (v4i16 (Neon_Op (v4i32 VPR128:$Rn))))),
-            (!cast<Instruction>(Prefix # 4s8h)
-              (SUBREG_TO_REG (i32 0), VPR64:$src, sub_64),
-              VPR128:$Rn)>;
-
-  def : Pat<(v4i32 (concat_vectors
-              (v2i32 VPR64:$src),
-              (v2i32 (Neon_Op (v2i64 VPR128:$Rn))))),
-            (!cast<Instruction>(Prefix # 2d4s)
-              (SUBREG_TO_REG (i32 0), VPR64:$src, sub_64),
-              VPR128:$Rn)>;
-}
-
-defm : NeonI_2VMisc_Narrow_Patterns<"XTN", trunc>;
-defm : NeonI_2VMisc_Narrow_Patterns<"SQXTUN", int_arm_neon_vqmovnsu>;
-defm : NeonI_2VMisc_Narrow_Patterns<"SQXTN", int_arm_neon_vqmovns>;
-defm : NeonI_2VMisc_Narrow_Patterns<"UQXTN", int_arm_neon_vqmovnu>;
-
-multiclass NeonI_2VMisc_SHIFT<string asmop, bit U, bits<5> opcode> {
-  let DecoderMethod = "DecodeSHLLInstruction" in {
-    def 8b8h : NeonI_2VMisc<0b0, U, 0b00, opcode,
-                            (outs VPR128:$Rd),
-                            (ins VPR64:$Rn, uimm_exact8:$Imm),
-                            asmop # "\t$Rd.8h, $Rn.8b, $Imm",
-                            [], NoItinerary>,
-               Sched<[WriteFPALU, ReadFPALU]>;
-
-    def 4h4s : NeonI_2VMisc<0b0, U, 0b01, opcode,
-                            (outs VPR128:$Rd),
-                            (ins VPR64:$Rn, uimm_exact16:$Imm),
-                            asmop # "\t$Rd.4s, $Rn.4h, $Imm",
-                            [], NoItinerary>,
-               Sched<[WriteFPALU, ReadFPALU]>;
-
-    def 2s2d : NeonI_2VMisc<0b0, U, 0b10, opcode,
-                            (outs VPR128:$Rd),
-                            (ins VPR64:$Rn, uimm_exact32:$Imm),
-                            asmop # "\t$Rd.2d, $Rn.2s, $Imm",
-                            [], NoItinerary>,
-               Sched<[WriteFPALU, ReadFPALU]>;
-
-    def 16b8h : NeonI_2VMisc<0b1, U, 0b00, opcode,
-                            (outs VPR128:$Rd),
-                            (ins VPR128:$Rn, uimm_exact8:$Imm),
-                            asmop # "2\t$Rd.8h, $Rn.16b, $Imm",
-                            [], NoItinerary>,
-                Sched<[WriteFPALU, ReadFPALU]>;
-
-    def 8h4s : NeonI_2VMisc<0b1, U, 0b01, opcode,
-                            (outs VPR128:$Rd),
-                            (ins VPR128:$Rn, uimm_exact16:$Imm),
-                            asmop # "2\t$Rd.4s, $Rn.8h, $Imm",
-                            [], NoItinerary>,
-               Sched<[WriteFPALU, ReadFPALU]>;
-
-    def 4s2d : NeonI_2VMisc<0b1, U, 0b10, opcode,
-                            (outs VPR128:$Rd),
-                            (ins VPR128:$Rn, uimm_exact32:$Imm),
-                            asmop # "2\t$Rd.2d, $Rn.4s, $Imm",
-                            [], NoItinerary>,
-               Sched<[WriteFPALU, ReadFPALU]>;
-  }
-}
-
-defm SHLL : NeonI_2VMisc_SHIFT<"shll", 0b1, 0b10011>;
-
-class NeonI_SHLL_Patterns<ValueType OpTy, ValueType DesTy,
-                          SDPatternOperator ExtOp, Operand Neon_Imm,
-                          string suffix>
-  : Pat<(DesTy (shl
-          (DesTy (ExtOp (OpTy VPR64:$Rn))),
-            (DesTy (Neon_vdup
-              (i32 Neon_Imm:$Imm))))),
-        (!cast<Instruction>("SHLL" # suffix) VPR64:$Rn, Neon_Imm:$Imm)>;
-
-class NeonI_SHLL_High_Patterns<ValueType OpTy, ValueType DesTy,
-                               SDPatternOperator ExtOp, Operand Neon_Imm,
-                               string suffix, PatFrag GetHigh>
-  : Pat<(DesTy (shl
-          (DesTy (ExtOp
-            (OpTy (GetHigh VPR128:$Rn)))),
-              (DesTy (Neon_vdup
-                (i32 Neon_Imm:$Imm))))),
-        (!cast<Instruction>("SHLL" # suffix) VPR128:$Rn, Neon_Imm:$Imm)>;
-
-def : NeonI_SHLL_Patterns<v8i8, v8i16, zext, uimm_exact8, "8b8h">;
-def : NeonI_SHLL_Patterns<v8i8, v8i16, sext, uimm_exact8, "8b8h">;
-def : NeonI_SHLL_Patterns<v4i16, v4i32, zext, uimm_exact16, "4h4s">;
-def : NeonI_SHLL_Patterns<v4i16, v4i32, sext, uimm_exact16, "4h4s">;
-def : NeonI_SHLL_Patterns<v2i32, v2i64, zext, uimm_exact32, "2s2d">;
-def : NeonI_SHLL_Patterns<v2i32, v2i64, sext, uimm_exact32, "2s2d">;
-def : NeonI_SHLL_High_Patterns<v8i8, v8i16, zext, uimm_exact8, "16b8h",
-                               Neon_High16B>;
-def : NeonI_SHLL_High_Patterns<v8i8, v8i16, sext, uimm_exact8, "16b8h",
-                               Neon_High16B>;
-def : NeonI_SHLL_High_Patterns<v4i16, v4i32, zext, uimm_exact16, "8h4s",
-                               Neon_High8H>;
-def : NeonI_SHLL_High_Patterns<v4i16, v4i32, sext, uimm_exact16, "8h4s",
-                               Neon_High8H>;
-def : NeonI_SHLL_High_Patterns<v2i32, v2i64, zext, uimm_exact32, "4s2d",
-                               Neon_High4S>;
-def : NeonI_SHLL_High_Patterns<v2i32, v2i64, sext, uimm_exact32, "4s2d",
-                               Neon_High4S>;
-
-multiclass NeonI_2VMisc_SD_Narrow<string asmop, bit U, bits<5> opcode> {
-  def 4s4h : NeonI_2VMisc<0b0, U, 0b00, opcode,
-                          (outs VPR64:$Rd), (ins VPR128:$Rn),
-                          asmop # "\t$Rd.4h, $Rn.4s",
-                          [], NoItinerary>,
-             Sched<[WriteFPALU, ReadFPALU]>;
-
-  def 2d2s : NeonI_2VMisc<0b0, U, 0b01, opcode,
-                          (outs VPR64:$Rd), (ins VPR128:$Rn),
-                          asmop # "\t$Rd.2s, $Rn.2d",
-                          [], NoItinerary>,
-             Sched<[WriteFPALU, ReadFPALU]>;
-
-  let Constraints = "$src = $Rd" in {
-    def 4s8h : NeonI_2VMisc<0b1, U, 0b00, opcode,
-                            (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn),
-                            asmop # "2\t$Rd.8h, $Rn.4s",
-                            [], NoItinerary>,
-               Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
-
-    def 2d4s : NeonI_2VMisc<0b1, U, 0b01, opcode,
-                            (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn),
-                            asmop # "2\t$Rd.4s, $Rn.2d",
-                            [], NoItinerary>,
-               Sched<[WriteFPALU, ReadFPALU, ReadFPALU]>;
-  }
-}
-
-defm FCVTN : NeonI_2VMisc_SD_Narrow<"fcvtn", 0b0, 0b10110>;
-
-multiclass NeonI_2VMisc_Narrow_Pattern<string prefix,
-                                       SDPatternOperator f32_to_f16_Op,
-                                       SDPatternOperator f64_to_f32_Op> {
-
-  def : Pat<(v4i16 (f32_to_f16_Op (v4f32 VPR128:$Rn))),
-              (!cast<Instruction>(prefix # "4s4h") (v4f32 VPR128:$Rn))>;
-
-  def : Pat<(v8i16 (concat_vectors
-                (v4i16 VPR64:$src),
-                (v4i16 (f32_to_f16_Op (v4f32 VPR128:$Rn))))),
-                  (!cast<Instruction>(prefix # "4s8h")
-                    (v4f32 (SUBREG_TO_REG (i32 0), VPR64:$src, sub_64)),
-                    (v4f32 VPR128:$Rn))>;
-
-  def : Pat<(v2f32 (f64_to_f32_Op (v2f64 VPR128:$Rn))),
-            (!cast<Instruction>(prefix # "2d2s") (v2f64 VPR128:$Rn))>;
-
-  def : Pat<(v4f32 (concat_vectors
-              (v2f32 VPR64:$src),
-              (v2f32 (f64_to_f32_Op (v2f64 VPR128:$Rn))))),
-                (!cast<Instruction>(prefix # "2d4s")
-                  (v4f32 (SUBREG_TO_REG (i32 0), VPR64:$src, sub_64)),
-                  (v2f64 VPR128:$Rn))>;
-}
-
-defm : NeonI_2VMisc_Narrow_Pattern<"FCVTN", int_arm_neon_vcvtfp2hf, fround>;
-
-multiclass NeonI_2VMisc_D_Narrow<string asmop, string prefix, bit U,
-                                 bits<5> opcode> {
-  def 2d2s : NeonI_2VMisc<0b0, U, 0b01, opcode,
-                          (outs VPR64:$Rd), (ins VPR128:$Rn),
-                          asmop # "\t$Rd.2s, $Rn.2d",
-                          [], NoItinerary>,
-             Sched<[WriteFPALU, ReadFPALU]>;
-
-  def 2d4s : NeonI_2VMisc<0b1, U, 0b01, opcode,
-                          (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn),
-                          asmop # "2\t$Rd.4s, $Rn.2d",
-                          [], NoItinerary>,
-             Sched<[WriteFPALU, ReadFPALU, ReadFPALU]> {
-    let Constraints = "$src = $Rd";
-  }
-
-  def : Pat<(v2f32 (int_aarch64_neon_vcvtxn (v2f64 VPR128:$Rn))),
-            (!cast<Instruction>(prefix # "2d2s") VPR128:$Rn)>;
-
-  def : Pat<(v4f32 (concat_vectors
-              (v2f32 VPR64:$src),
-              (v2f32 (int_aarch64_neon_vcvtxn (v2f64 VPR128:$Rn))))),
-            (!cast<Instruction>(prefix # "2d4s")
-               (v4f32 (SUBREG_TO_REG (i32 0), VPR64:$src, sub_64)),
-               VPR128:$Rn)>;
-}
-
-defm FCVTXN : NeonI_2VMisc_D_Narrow<"fcvtxn","FCVTXN", 0b1, 0b10110>;
-
-def Neon_High4Float : PatFrag<(ops node:$in),
-                              (extract_subvector (v4f32 node:$in), (iPTR 2))>;
-
-multiclass NeonI_2VMisc_HS_Extend<string asmop, bit U, bits<5> opcode> {
-  def 4h4s : NeonI_2VMisc<0b0, U, 0b00, opcode,
-                          (outs VPR128:$Rd), (ins VPR64:$Rn),
-                          asmop # "\t$Rd.4s, $Rn.4h",
-                          [], NoItinerary>,
-             Sched<[WriteFPALU, ReadFPALU]>;
-
-  def 2s2d : NeonI_2VMisc<0b0, U, 0b01, opcode,
-                          (outs VPR128:$Rd), (ins VPR64:$Rn),
-                          asmop # "\t$Rd.2d, $Rn.2s",
-                          [], NoItinerary>,
-             Sched<[WriteFPALU, ReadFPALU]>;
-
-  def 8h4s : NeonI_2VMisc<0b1, U, 0b00, opcode,
-                          (outs VPR128:$Rd), (ins VPR128:$Rn),
-                          asmop # "2\t$Rd.4s, $Rn.8h",
-                          [], NoItinerary>,
-             Sched<[WriteFPALU, ReadFPALU]>;
-
-  def 4s2d : NeonI_2VMisc<0b1, U, 0b01, opcode,
-                          (outs VPR128:$Rd), (ins VPR128:$Rn),
-                          asmop # "2\t$Rd.2d, $Rn.4s",
-                          [], NoItinerary>,
-             Sched<[WriteFPALU, ReadFPALU]>;
-}
-
-defm FCVTL : NeonI_2VMisc_HS_Extend<"fcvtl", 0b0, 0b10111>;
-
-multiclass NeonI_2VMisc_Extend_Pattern<string prefix> {
-  def : Pat<(v4f32 (int_arm_neon_vcvthf2fp (v4i16 VPR64:$Rn))),
-            (!cast<Instruction>(prefix # "4h4s") VPR64:$Rn)>;
-
-  def : Pat<(v4f32 (int_arm_neon_vcvthf2fp
-              (v4i16 (Neon_High8H
-                (v8i16 VPR128:$Rn))))),
-            (!cast<Instruction>(prefix # "8h4s") VPR128:$Rn)>;
-
-  def : Pat<(v2f64 (fextend (v2f32 VPR64:$Rn))),
-            (!cast<Instruction>(prefix # "2s2d") VPR64:$Rn)>;
-
-  def : Pat<(v2f64 (fextend
-              (v2f32 (Neon_High4Float
-                (v4f32 VPR128:$Rn))))),
-            (!cast<Instruction>(prefix # "4s2d") VPR128:$Rn)>;
-}
-
-defm : NeonI_2VMisc_Extend_Pattern<"FCVTL">;
-
-multiclass NeonI_2VMisc_SD_Conv<string asmop, bit Size, bit U, bits<5> opcode,
-                                ValueType ResTy4s, ValueType OpTy4s,
-                                ValueType ResTy2d, ValueType OpTy2d,
-                                ValueType ResTy2s, ValueType OpTy2s,
-                                SDPatternOperator Neon_Op> {
-
-  def 4s : NeonI_2VMisc<0b1, U, {Size, 0b0}, opcode,
-                        (outs VPR128:$Rd), (ins VPR128:$Rn),
-                        asmop # "\t$Rd.4s, $Rn.4s",
-                        [(set (ResTy4s VPR128:$Rd),
-                           (ResTy4s (Neon_Op (OpTy4s VPR128:$Rn))))],
-                        NoItinerary>,
-           Sched<[WriteFPALU, ReadFPALU]>;
-
-  def 2d : NeonI_2VMisc<0b1, U, {Size, 0b1}, opcode,
-                        (outs VPR128:$Rd), (ins VPR128:$Rn),
-                        asmop # "\t$Rd.2d, $Rn.2d",
-                        [(set (ResTy2d VPR128:$Rd),
-                           (ResTy2d (Neon_Op (OpTy2d VPR128:$Rn))))],
-                        NoItinerary>,
-           Sched<[WriteFPALU, ReadFPALU]>;
-
-  def 2s : NeonI_2VMisc<0b0, U, {Size, 0b0}, opcode,
-                        (outs VPR64:$Rd), (ins VPR64:$Rn),
-                        asmop # "\t$Rd.2s, $Rn.2s",
-                        [(set (ResTy2s VPR64:$Rd),
-                           (ResTy2s (Neon_Op (OpTy2s VPR64:$Rn))))],
-                        NoItinerary>,
-           Sched<[WriteFPALU, ReadFPALU]>;
-}
-
-multiclass NeonI_2VMisc_fp_to_int<string asmop, bit Size, bit U,
-                                  bits<5> opcode, SDPatternOperator Neon_Op> {
-  defm _ : NeonI_2VMisc_SD_Conv<asmop, Size, U, opcode, v4i32, v4f32, v2i64,
-                                v2f64, v2i32, v2f32, Neon_Op>;
-}
-
-defm FCVTNS : NeonI_2VMisc_fp_to_int<"fcvtns", 0b0, 0b0, 0b11010,
-                                     int_arm_neon_vcvtns>;
-defm FCVTNU : NeonI_2VMisc_fp_to_int<"fcvtnu", 0b0, 0b1, 0b11010,
-                                     int_arm_neon_vcvtnu>;
-defm FCVTPS : NeonI_2VMisc_fp_to_int<"fcvtps", 0b1, 0b0, 0b11010,
-                                     int_arm_neon_vcvtps>;
-defm FCVTPU : NeonI_2VMisc_fp_to_int<"fcvtpu", 0b1, 0b1, 0b11010,
-                                     int_arm_neon_vcvtpu>;
-defm FCVTMS : NeonI_2VMisc_fp_to_int<"fcvtms", 0b0, 0b0, 0b11011,
-                                     int_arm_neon_vcvtms>;
-defm FCVTMU : NeonI_2VMisc_fp_to_int<"fcvtmu", 0b0, 0b1, 0b11011,
-                                     int_arm_neon_vcvtmu>;
-defm FCVTZS : NeonI_2VMisc_fp_to_int<"fcvtzs", 0b1, 0b0, 0b11011, fp_to_sint>;
-defm FCVTZU : NeonI_2VMisc_fp_to_int<"fcvtzu", 0b1, 0b1, 0b11011, fp_to_uint>;
-defm FCVTAS : NeonI_2VMisc_fp_to_int<"fcvtas", 0b0, 0b0, 0b11100,
-                                     int_arm_neon_vcvtas>;
-defm FCVTAU : NeonI_2VMisc_fp_to_int<"fcvtau", 0b0, 0b1, 0b11100,
-                                     int_arm_neon_vcvtau>;
-
-multiclass NeonI_2VMisc_int_to_fp<string asmop, bit Size, bit U,
-                                  bits<5> opcode, SDPatternOperator Neon_Op> {
-  defm _ : NeonI_2VMisc_SD_Conv<asmop, Size, U, opcode, v4f32, v4i32, v2f64,
-                                v2i64, v2f32, v2i32, Neon_Op>;
-}
-
-defm SCVTF : NeonI_2VMisc_int_to_fp<"scvtf", 0b0, 0b0, 0b11101, sint_to_fp>;
-defm UCVTF : NeonI_2VMisc_int_to_fp<"ucvtf", 0b0, 0b1, 0b11101, uint_to_fp>;
-
-multiclass NeonI_2VMisc_fp_to_fp<string asmop, bit Size, bit U,
-                                 bits<5> opcode, SDPatternOperator Neon_Op> {
-  defm _ : NeonI_2VMisc_SD_Conv<asmop, Size, U, opcode, v4f32, v4f32, v2f64,
-                                v2f64, v2f32, v2f32, Neon_Op>;
-}
-
-defm FRINTN : NeonI_2VMisc_fp_to_fp<"frintn", 0b0, 0b0, 0b11000,
-                                     int_aarch64_neon_frintn>;
-defm FRINTA : NeonI_2VMisc_fp_to_fp<"frinta", 0b0, 0b1, 0b11000, frnd>;
-defm FRINTP : NeonI_2VMisc_fp_to_fp<"frintp", 0b1, 0b0, 0b11000, fceil>;
-defm FRINTM : NeonI_2VMisc_fp_to_fp<"frintm", 0b0, 0b0, 0b11001, ffloor>;
-defm FRINTX : NeonI_2VMisc_fp_to_fp<"frintx", 0b0, 0b1, 0b11001, frint>;
-defm FRINTZ : NeonI_2VMisc_fp_to_fp<"frintz", 0b1, 0b0, 0b11001, ftrunc>;
-defm FRINTI : NeonI_2VMisc_fp_to_fp<"frinti", 0b1, 0b1, 0b11001, fnearbyint>;
-defm FRECPE : NeonI_2VMisc_fp_to_fp<"frecpe", 0b1, 0b0, 0b11101,
-                                    int_arm_neon_vrecpe>;
-defm FRSQRTE : NeonI_2VMisc_fp_to_fp<"frsqrte", 0b1, 0b1, 0b11101,
-                                     int_arm_neon_vrsqrte>;
-let SchedRW = [WriteFPSqrt, ReadFPSqrt] in {
-defm FSQRT : NeonI_2VMisc_fp_to_fp<"fsqrt", 0b1, 0b1, 0b11111, fsqrt>;
-}
-
-multiclass NeonI_2VMisc_S_Conv<string asmop, bit Size, bit U,
-                               bits<5> opcode, SDPatternOperator Neon_Op> {
-  def 4s : NeonI_2VMisc<0b1, U, {Size, 0b0}, opcode,
-                        (outs VPR128:$Rd), (ins VPR128:$Rn),
-                        asmop # "\t$Rd.4s, $Rn.4s",
-                        [(set (v4i32 VPR128:$Rd),
-                           (v4i32 (Neon_Op (v4i32 VPR128:$Rn))))],
-                        NoItinerary>,
-           Sched<[WriteFPALU, ReadFPALU]>;
-
-  def 2s : NeonI_2VMisc<0b0, U, {Size, 0b0}, opcode,
-                        (outs VPR64:$Rd), (ins VPR64:$Rn),
-                        asmop # "\t$Rd.2s, $Rn.2s",
-                        [(set (v2i32 VPR64:$Rd),
-                           (v2i32 (Neon_Op (v2i32 VPR64:$Rn))))],
-                        NoItinerary>,
-           Sched<[WriteFPALU, ReadFPALU]>;
-}
-
-defm URECPE : NeonI_2VMisc_S_Conv<"urecpe", 0b1, 0b0, 0b11100,
-                                  int_arm_neon_vrecpe>;
-defm URSQRTE : NeonI_2VMisc_S_Conv<"ursqrte", 0b1, 0b1, 0b11100,
-                                   int_arm_neon_vrsqrte>;
-
-// Crypto Class
-class NeonI_Cryptoaes_2v<bits<2> size, bits<5> opcode,
-                         string asmop, SDPatternOperator opnode>
-  : NeonI_Crypto_AES<size, opcode,
-                     (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn),
-                     asmop # "\t$Rd.16b, $Rn.16b",
-                     [(set (v16i8 VPR128:$Rd),
-                        (v16i8 (opnode (v16i8 VPR128:$src),
-                                       (v16i8 VPR128:$Rn))))],
-                     NoItinerary>,
-    Sched<[WriteFPALU, ReadFPALU, ReadFPALU]> {
-  let Constraints = "$src = $Rd";
-  let Predicates = [HasNEON, HasCrypto];
-}
-
-def AESE : NeonI_Cryptoaes_2v<0b00, 0b00100, "aese", int_arm_neon_aese>;
-def AESD : NeonI_Cryptoaes_2v<0b00, 0b00101, "aesd", int_arm_neon_aesd>;
-
-class NeonI_Cryptoaes<bits<2> size, bits<5> opcode,
-                      string asmop, SDPatternOperator opnode>
-  : NeonI_Crypto_AES<size, opcode,
-                     (outs VPR128:$Rd), (ins VPR128:$Rn),
-                     asmop # "\t$Rd.16b, $Rn.16b",
-                     [(set (v16i8 VPR128:$Rd),
-                        (v16i8 (opnode (v16i8 VPR128:$Rn))))],
-                     NoItinerary>,
-    Sched<[WriteFPALU, ReadFPALU]>;
-
-def AESMC : NeonI_Cryptoaes<0b00, 0b00110, "aesmc", int_arm_neon_aesmc>;
-def AESIMC : NeonI_Cryptoaes<0b00, 0b00111, "aesimc", int_arm_neon_aesimc>;
-
-class NeonI_Cryptosha_vv<bits<2> size, bits<5> opcode,
-                         string asmop, SDPatternOperator opnode>
-  : NeonI_Crypto_SHA<size, opcode,
-                     (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn),
-                     asmop # "\t$Rd.4s, $Rn.4s",
-                     [(set (v4i32 VPR128:$Rd),
-                        (v4i32 (opnode (v4i32 VPR128:$src),
-                                       (v4i32 VPR128:$Rn))))],
-                     NoItinerary>,
-    Sched<[WriteFPALU, ReadFPALU, ReadFPALU]> {
-  let Constraints = "$src = $Rd";
-  let Predicates = [HasNEON, HasCrypto];
-}
-
-def SHA1SU1 : NeonI_Cryptosha_vv<0b00, 0b00001, "sha1su1",
-                                 int_arm_neon_sha1su1>;
-def SHA256SU0 : NeonI_Cryptosha_vv<0b00, 0b00010, "sha256su0",
-                                   int_arm_neon_sha256su0>;
-
-class NeonI_Cryptosha_ss<bits<2> size, bits<5> opcode,
-                         string asmop, SDPatternOperator opnode>
-  : NeonI_Crypto_SHA<size, opcode,
-                     (outs FPR32:$Rd), (ins FPR32:$Rn),
-                     asmop # "\t$Rd, $Rn",
-                     [], NoItinerary>,
-    Sched<[WriteFPALU, ReadFPALU]> {
-  let Predicates = [HasNEON, HasCrypto];
-  let hasSideEffects = 0;
-}
-
-def SHA1H : NeonI_Cryptosha_ss<0b00, 0b00000, "sha1h", int_arm_neon_sha1h>;
-def : Pat<(i32 (int_arm_neon_sha1h i32:$Rn)),
-          (COPY_TO_REGCLASS (SHA1H (COPY_TO_REGCLASS i32:$Rn, FPR32)), GPR32)>;
-
-
-class NeonI_Cryptosha3_vvv<bits<2> size, bits<3> opcode, string asmop,
-                           SDPatternOperator opnode>
-  : NeonI_Crypto_3VSHA<size, opcode,
-                       (outs VPR128:$Rd),
-                       (ins VPR128:$src, VPR128:$Rn, VPR128:$Rm),
-                       asmop # "\t$Rd.4s, $Rn.4s, $Rm.4s",
-                       [(set (v4i32 VPR128:$Rd),
-                          (v4i32 (opnode (v4i32 VPR128:$src),
-                                         (v4i32 VPR128:$Rn),
-                                         (v4i32 VPR128:$Rm))))],
-                       NoItinerary>,
-    Sched<[WriteFPALU, ReadFPALU, ReadFPALU, ReadFPALU]> {
-  let Constraints = "$src = $Rd";
-  let Predicates = [HasNEON, HasCrypto];
-}
-
-def SHA1SU0 : NeonI_Cryptosha3_vvv<0b00, 0b011, "sha1su0",
-                                   int_arm_neon_sha1su0>;
-def SHA256SU1 : NeonI_Cryptosha3_vvv<0b00, 0b110, "sha256su1",
-                                     int_arm_neon_sha256su1>;
-
-class NeonI_Cryptosha3_qqv<bits<2> size, bits<3> opcode, string asmop,
-                           SDPatternOperator opnode>
-  : NeonI_Crypto_3VSHA<size, opcode,
-                       (outs FPR128:$Rd),
-                       (ins FPR128:$src, FPR128:$Rn, VPR128:$Rm),
-                       asmop # "\t$Rd, $Rn, $Rm.4s",
-                       [(set (v4i32 FPR128:$Rd),
-                          (v4i32 (opnode (v4i32 FPR128:$src),
-                                         (v4i32 FPR128:$Rn),
-                                         (v4i32 VPR128:$Rm))))],
-                       NoItinerary>,
-    Sched<[WriteFPALU, ReadFPALU, ReadFPALU, ReadFPALU]> {
-  let Constraints = "$src = $Rd";
-  let Predicates = [HasNEON, HasCrypto];
-}
-
-def SHA256H : NeonI_Cryptosha3_qqv<0b00, 0b100, "sha256h",
-                                   int_arm_neon_sha256h>;
-def SHA256H2 : NeonI_Cryptosha3_qqv<0b00, 0b101, "sha256h2",
-                                    int_arm_neon_sha256h2>;
-
-class NeonI_Cryptosha3_qsv<bits<2> size, bits<3> opcode, string asmop>
-  : NeonI_Crypto_3VSHA<size, opcode,
-                       (outs FPR128:$Rd),
-                       (ins FPR128:$src, FPR32:$Rn, VPR128:$Rm),
-                       asmop # "\t$Rd, $Rn, $Rm.4s",
-                       [], NoItinerary>,
-    Sched<[WriteFPALU, ReadFPALU, ReadFPALU, ReadFPALU]> {
-  let Constraints = "$src = $Rd";
-  let hasSideEffects = 0;
-  let Predicates = [HasNEON, HasCrypto];
-}
-
-def SHA1C : NeonI_Cryptosha3_qsv<0b00, 0b000, "sha1c">;
-def SHA1P : NeonI_Cryptosha3_qsv<0b00, 0b001, "sha1p">;
-def SHA1M : NeonI_Cryptosha3_qsv<0b00, 0b010, "sha1m">;
-
-def : Pat<(int_arm_neon_sha1c v4i32:$hash_abcd, i32:$hash_e, v4i32:$wk),
-          (SHA1C v4i32:$hash_abcd,
-                 (COPY_TO_REGCLASS i32:$hash_e, FPR32), v4i32:$wk)>;
-def : Pat<(int_arm_neon_sha1m v4i32:$hash_abcd, i32:$hash_e, v4i32:$wk),
-          (SHA1M v4i32:$hash_abcd,
-                 (COPY_TO_REGCLASS i32:$hash_e, FPR32), v4i32:$wk)>;
-def : Pat<(int_arm_neon_sha1p v4i32:$hash_abcd, i32:$hash_e, v4i32:$wk),
-          (SHA1P v4i32:$hash_abcd,
-                 (COPY_TO_REGCLASS i32:$hash_e, FPR32), v4i32:$wk)>;
-
-// Additional patterns to match shl to USHL.
-def : Pat<(v8i8 (shl (v8i8 VPR64:$Rn), (v8i8 VPR64:$Rm))),
-          (USHLvvv_8B $Rn, $Rm)>;
-def : Pat<(v4i16 (shl (v4i16 VPR64:$Rn), (v4i16 VPR64:$Rm))),
-          (USHLvvv_4H $Rn, $Rm)>;
-def : Pat<(v2i32 (shl (v2i32 VPR64:$Rn), (v2i32 VPR64:$Rm))),
-          (USHLvvv_2S $Rn, $Rm)>;
-def : Pat<(v1i64 (shl (v1i64 FPR64:$Rn), (v1i64 FPR64:$Rm))),
-          (USHLddd $Rn, $Rm)>;
-def : Pat<(v16i8 (shl (v16i8 VPR128:$Rn), (v16i8 VPR128:$Rm))),
-          (USHLvvv_16B $Rn, $Rm)>;
-def : Pat<(v8i16 (shl (v8i16 VPR128:$Rn), (v8i16 VPR128:$Rm))),
-          (USHLvvv_8H $Rn, $Rm)>;
-def : Pat<(v4i32 (shl (v4i32 VPR128:$Rn), (v4i32 VPR128:$Rm))),
-          (USHLvvv_4S $Rn, $Rm)>;
-def : Pat<(v2i64 (shl (v2i64 VPR128:$Rn), (v2i64 VPR128:$Rm))),
-          (USHLvvv_2D $Rn, $Rm)>;
-
-def : Pat<(v1i8 (shl (v1i8 FPR8:$Rn), (v1i8 FPR8:$Rm))),
-          (EXTRACT_SUBREG
-              (USHLvvv_8B (SUBREG_TO_REG (i64 0), FPR8:$Rn, sub_8),
-                          (SUBREG_TO_REG (i64 0), FPR8:$Rm, sub_8)),
-              sub_8)>;
-def : Pat<(v1i16 (shl (v1i16 FPR16:$Rn), (v1i16 FPR16:$Rm))),
-          (EXTRACT_SUBREG
-              (USHLvvv_4H (SUBREG_TO_REG (i64 0), FPR16:$Rn, sub_16),
-                          (SUBREG_TO_REG (i64 0), FPR16:$Rm, sub_16)),
-              sub_16)>;
-def : Pat<(v1i32 (shl (v1i32 FPR32:$Rn), (v1i32 FPR32:$Rm))),
-          (EXTRACT_SUBREG
-              (USHLvvv_2S (SUBREG_TO_REG (i64 0), FPR32:$Rn, sub_32),
-                          (SUBREG_TO_REG (i64 0), FPR32:$Rm, sub_32)),
-              sub_32)>;
-
-// Additional patterns to match sra, srl.
-// For a vector right shift by vector, the shift amounts of SSHL/USHL are
-// negative. Negate the vector of shift amount first.
-def : Pat<(v8i8 (srl (v8i8 VPR64:$Rn), (v8i8 VPR64:$Rm))),
-          (USHLvvv_8B $Rn, (NEG8b $Rm))>;
-def : Pat<(v4i16 (srl (v4i16 VPR64:$Rn), (v4i16 VPR64:$Rm))),
-          (USHLvvv_4H $Rn, (NEG4h $Rm))>;
-def : Pat<(v2i32 (srl (v2i32 VPR64:$Rn), (v2i32 VPR64:$Rm))),
-          (USHLvvv_2S $Rn, (NEG2s $Rm))>;
-def : Pat<(v1i64 (srl (v1i64 FPR64:$Rn), (v1i64 FPR64:$Rm))),
-          (USHLddd $Rn, (NEGdd $Rm))>;
-def : Pat<(v16i8 (srl (v16i8 VPR128:$Rn), (v16i8 VPR128:$Rm))),
-          (USHLvvv_16B $Rn, (NEG16b $Rm))>;
-def : Pat<(v8i16 (srl (v8i16 VPR128:$Rn), (v8i16 VPR128:$Rm))),
-          (USHLvvv_8H $Rn, (NEG8h $Rm))>;
-def : Pat<(v4i32 (srl (v4i32 VPR128:$Rn), (v4i32 VPR128:$Rm))),
-          (USHLvvv_4S $Rn, (NEG4s $Rm))>;
-def : Pat<(v2i64 (srl (v2i64 VPR128:$Rn), (v2i64 VPR128:$Rm))),
-          (USHLvvv_2D $Rn, (NEG2d $Rm))>;
-
-def : Pat<(v1i8 (srl (v1i8 FPR8:$Rn), (v1i8 FPR8:$Rm))),
-          (EXTRACT_SUBREG
-              (USHLvvv_8B (SUBREG_TO_REG (i64 0), FPR8:$Rn, sub_8),
-                          (NEG8b (SUBREG_TO_REG (i64 0), FPR8:$Rm, sub_8))),
-              sub_8)>;
-def : Pat<(v1i16 (srl (v1i16 FPR16:$Rn), (v1i16 FPR16:$Rm))),
-          (EXTRACT_SUBREG
-              (USHLvvv_4H (SUBREG_TO_REG (i64 0), FPR16:$Rn, sub_16),
-                          (NEG4h (SUBREG_TO_REG (i64 0), FPR16:$Rm, sub_16))),
-              sub_16)>;
-def : Pat<(v1i32 (srl (v1i32 FPR32:$Rn), (v1i32 FPR32:$Rm))),
-          (EXTRACT_SUBREG
-              (USHLvvv_2S (SUBREG_TO_REG (i64 0), FPR32:$Rn, sub_32),
-                          (NEG2s (SUBREG_TO_REG (i64 0), FPR32:$Rm, sub_32))),
-              sub_32)>;
-
-def : Pat<(v8i8 (sra (v8i8 VPR64:$Rn), (v8i8 VPR64:$Rm))),
-          (SSHLvvv_8B $Rn, (NEG8b $Rm))>;
-def : Pat<(v4i16 (sra (v4i16 VPR64:$Rn), (v4i16 VPR64:$Rm))),
-          (SSHLvvv_4H $Rn, (NEG4h $Rm))>;
-def : Pat<(v2i32 (sra (v2i32 VPR64:$Rn), (v2i32 VPR64:$Rm))),
-          (SSHLvvv_2S $Rn, (NEG2s $Rm))>;
-def : Pat<(v1i64 (sra (v1i64 FPR64:$Rn), (v1i64 FPR64:$Rm))),
-          (SSHLddd $Rn, (NEGdd $Rm))>;
-def : Pat<(v16i8 (sra (v16i8 VPR128:$Rn), (v16i8 VPR128:$Rm))),
-          (SSHLvvv_16B $Rn, (NEG16b $Rm))>;
-def : Pat<(v8i16 (sra (v8i16 VPR128:$Rn), (v8i16 VPR128:$Rm))),
-          (SSHLvvv_8H $Rn, (NEG8h $Rm))>;
-def : Pat<(v4i32 (sra (v4i32 VPR128:$Rn), (v4i32 VPR128:$Rm))),
-          (SSHLvvv_4S $Rn, (NEG4s $Rm))>;
-def : Pat<(v2i64 (sra (v2i64 VPR128:$Rn), (v2i64 VPR128:$Rm))),
-          (SSHLvvv_2D $Rn, (NEG2d $Rm))>;
-
-def : Pat<(v1i8 (sra (v1i8 FPR8:$Rn), (v1i8 FPR8:$Rm))),
-          (EXTRACT_SUBREG
-              (SSHLvvv_8B (SUBREG_TO_REG (i64 0), FPR8:$Rn, sub_8),
-                          (NEG8b (SUBREG_TO_REG (i64 0), FPR8:$Rm, sub_8))),
-              sub_8)>;
-def : Pat<(v1i16 (sra (v1i16 FPR16:$Rn), (v1i16 FPR16:$Rm))),
-          (EXTRACT_SUBREG
-              (SSHLvvv_4H (SUBREG_TO_REG (i64 0), FPR16:$Rn, sub_16),
-                          (NEG4h (SUBREG_TO_REG (i64 0), FPR16:$Rm, sub_16))),
-              sub_16)>;
-def : Pat<(v1i32 (sra (v1i32 FPR32:$Rn), (v1i32 FPR32:$Rm))),
-          (EXTRACT_SUBREG
-              (SSHLvvv_2S (SUBREG_TO_REG (i64 0), FPR32:$Rn, sub_32),
-                          (NEG2s (SUBREG_TO_REG (i64 0), FPR32:$Rm, sub_32))),
-              sub_32)>;
-
-//
-// Patterns for handling half-precision values
-//
-
-// Convert between f16 value and f32 value
-def : Pat<(f32 (f16_to_f32 (i32 GPR32:$Rn))),
-          (FCVTsh (EXTRACT_SUBREG (FMOVsw $Rn), sub_16))>;
-def : Pat<(i32 (f32_to_f16 (f32 FPR32:$Rn))),
-          (FMOVws (SUBREG_TO_REG (i64 0), (f16 (FCVThs $Rn)), sub_16))>;
-
-// Convert f16 value coming in as i16 value to f32
-def : Pat<(f32 (f16_to_f32 (i32 (and (i32 GPR32:$Rn), 65535)))),
-          (FCVTsh (EXTRACT_SUBREG (FMOVsw GPR32:$Rn), sub_16))>;
-def : Pat<(f32 (f16_to_f32 (i32 (assertzext GPR32:$Rn)))),
-          (FCVTsh (EXTRACT_SUBREG (FMOVsw GPR32:$Rn), sub_16))>;
-
-def : Pat<(f32 (f16_to_f32 (i32 (assertzext (i32 (
-            f32_to_f16 (f32 FPR32:$Rn))))))),
-          (f32 FPR32:$Rn)>;
-
-// Patterns for vector extract of half-precision FP value in i16 storage type
-def : Pat<(f32 (f16_to_f32 ( i32 (and (i32 (vector_extract
-            (v4i16 VPR64:$Rn), neon_uimm2_bare:$Imm)), 65535)))),
-          (FCVTsh (f16 (DUPhv_H
-            (v8i16 (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)),
-            neon_uimm2_bare:$Imm)))>;
-
-def : Pat<(f32 (f16_to_f32 ( i32 (and (i32 (vector_extract
-            (v8i16 VPR128:$Rn), neon_uimm3_bare:$Imm)), 65535)))),
-          (FCVTsh (f16 (DUPhv_H (v8i16 VPR128:$Rn), neon_uimm3_bare:$Imm)))>;
-
-// Patterns for vector insert of half-precision FP value 0 in i16 storage type
-def : Pat<(v8i16 (vector_insert (v8i16 VPR128:$Rn),
-            (i32 (assertsext (i32 (fp_to_sint(f32 (f16_to_f32 (i32 0))))))),
-            (neon_uimm3_bare:$Imm))),
-          (v8i16 (INSELh (v8i16 VPR128:$Rn),
-            (v8i16 (SUBREG_TO_REG (i64 0),
-              (f16 (EXTRACT_SUBREG (f32 (FMOVsw (i32 WZR))), sub_16)),
-              sub_16)),
-            neon_uimm3_bare:$Imm, 0))>;
-
-def : Pat<(v4i16 (vector_insert (v4i16 VPR64:$Rn),
-            (i32 (assertsext (i32 (fp_to_sint(f32 (f16_to_f32 (i32 0))))))),
-            (neon_uimm2_bare:$Imm))),
-          (v4i16 (EXTRACT_SUBREG
-            (v8i16 (INSELh
-              (v8i16 (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)),
-              (v8i16 (SUBREG_TO_REG (i64 0),
-                (f16 (EXTRACT_SUBREG (f32 (FMOVsw (i32 WZR))), sub_16)),
-                sub_16)),
-              neon_uimm2_bare:$Imm, 0)),
-            sub_64))>;
-
-// Patterns for vector insert of half-precision FP value in i16 storage type
-def : Pat<(v8i16 (vector_insert (v8i16 VPR128:$Rn),
-            (i32 (assertsext (i32 (fp_to_sint
-              (f32 (f16_to_f32 (i32 (and (i32 GPR32:$src), 65535)))))))),
-            (neon_uimm3_bare:$Imm))),
-          (v8i16 (INSELh (v8i16 VPR128:$Rn),
-            (v8i16 (SUBREG_TO_REG (i64 0),
-              (f16 (EXTRACT_SUBREG (f32 (FMOVsw (i32 GPR32:$src))), sub_16)),
-              sub_16)),
-            neon_uimm3_bare:$Imm, 0))>;
-
-def : Pat<(v4i16 (vector_insert (v4i16 VPR64:$Rn),
-            (i32 (assertsext (i32 (fp_to_sint
-              (f32 (f16_to_f32 (i32 (and (i32 GPR32:$src), 65535)))))))),
-            (neon_uimm2_bare:$Imm))),
-          (v4i16 (EXTRACT_SUBREG
-            (v8i16 (INSELh
-              (v8i16 (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)),
-              (v8i16 (SUBREG_TO_REG (i64 0),
-                (f16 (EXTRACT_SUBREG (f32 (FMOVsw (i32 GPR32:$src))), sub_16)),
-                sub_16)),
-              neon_uimm2_bare:$Imm, 0)),
-            sub_64))>;
-
-def : Pat<(v8i16 (vector_insert (v8i16 VPR128:$Rn),
-            (i32 (vector_extract (v8i16 VPR128:$src), neon_uimm3_bare:$Imm2)),
-              (neon_uimm3_bare:$Imm1))),
-          (v8i16 (INSELh (v8i16 VPR128:$Rn), (v8i16 VPR128:$src),
-            neon_uimm3_bare:$Imm1, neon_uimm3_bare:$Imm2))>;
-
-// Patterns for vector copy of half-precision FP value in i16 storage type
-def : Pat<(v8i16 (vector_insert (v8i16 VPR128:$Rn),
-            (i32 (assertsext (i32 (fp_to_sint(f32 (f16_to_f32 (i32 (and (i32
-              (vector_extract (v8i16 VPR128:$src), neon_uimm3_bare:$Imm2)),
-              65535)))))))),
-            (neon_uimm3_bare:$Imm1))),
-          (v8i16 (INSELh (v8i16 VPR128:$Rn), (v8i16 VPR128:$src),
-            neon_uimm3_bare:$Imm1, neon_uimm3_bare:$Imm2))>;
-
-def : Pat<(v4i16 (vector_insert (v4i16 VPR64:$Rn),
-            (i32 (assertsext (i32 (fp_to_sint(f32 (f16_to_f32 (i32 (and (i32
-              (vector_extract (v4i16 VPR64:$src), neon_uimm3_bare:$Imm2)),
-              65535)))))))),
-            (neon_uimm3_bare:$Imm1))),
-          (v4i16 (EXTRACT_SUBREG
-            (v8i16 (INSELh
-              (v8i16 (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)),
-              (v8i16 (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64)),
-              neon_uimm3_bare:$Imm1, neon_uimm3_bare:$Imm2)),
-            sub_64))>;
-
-
diff --git a/lib/Target/ARM64/ARM64LoadStoreOptimizer.cpp b/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
index c0031a4..e7454be 100644
--- a/lib/Target/ARM64/ARM64LoadStoreOptimizer.cpp
+++ b/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -1,4 +1,4 @@
-//===-- ARM64LoadStoreOptimizer.cpp - ARM64 load/store opt. pass --*- C++ -*-=//
+//=- AArch64LoadStoreOptimizer.cpp - AArch64 load/store opt. pass -*- C++ -*-=//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -12,9 +12,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "arm64-ldst-opt"
-#include "ARM64InstrInfo.h"
-#include "MCTargetDesc/ARM64AddressingModes.h"
+#include "AArch64InstrInfo.h"
+#include "MCTargetDesc/AArch64AddressingModes.h"
 #include "llvm/ADT/BitVector.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
@@ -30,7 +29,9 @@
 #include "llvm/ADT/Statistic.h"
 using namespace llvm;
 
-/// ARM64AllocLoadStoreOpt - Post-register allocation pass to combine
+#define DEBUG_TYPE "aarch64-ldst-opt"
+
+/// AArch64AllocLoadStoreOpt - Post-register allocation pass to combine
 /// load / store instructions to form ldp / stp instructions.
 
 STATISTIC(NumPairCreated, "Number of load/store pair instructions generated");
@@ -39,23 +40,21 @@ STATISTIC(NumPreFolded, "Number of pre-index updates folded");
 STATISTIC(NumUnscaledPairCreated,
           "Number of load/store from unscaled generated");
 
-static cl::opt<bool> DoLoadStoreOpt("arm64-load-store-opt", cl::init(true),
-                                    cl::Hidden);
-static cl::opt<unsigned> ScanLimit("arm64-load-store-scan-limit", cl::init(20),
+static cl::opt<unsigned> ScanLimit("aarch64-load-store-scan-limit", cl::init(20),
                                    cl::Hidden);
 
 // Place holder while testing unscaled load/store combining
 static cl::opt<bool>
-EnableARM64UnscaledMemOp("arm64-unscaled-mem-op", cl::Hidden,
-                         cl::desc("Allow ARM64 unscaled load/store combining"),
+EnableAArch64UnscaledMemOp("aarch64-unscaled-mem-op", cl::Hidden,
+                         cl::desc("Allow AArch64 unscaled load/store combining"),
                          cl::init(true));
 
 namespace {
-struct ARM64LoadStoreOpt : public MachineFunctionPass {
+struct AArch64LoadStoreOpt : public MachineFunctionPass {
   static char ID;
-  ARM64LoadStoreOpt() : MachineFunctionPass(ID) {}
+  AArch64LoadStoreOpt() : MachineFunctionPass(ID) {}
 
-  const ARM64InstrInfo *TII;
+  const AArch64InstrInfo *TII;
   const TargetRegisterInfo *TRI;
 
   // Scan the instructions looking for a load/store that can be combined
@@ -70,7 +69,7 @@ struct ARM64LoadStoreOpt : public MachineFunctionPass {
   // Merge the two instructions indicated into a single pair-wise instruction.
   // If mergeForward is true, erase the first instruction and fold its
   // operation into the second. If false, the reverse. Return the instruction
-  // following the first instruction (which may change during proecessing).
+  // following the first instruction (which may change during processing).
   MachineBasicBlock::iterator
   mergePairedInsns(MachineBasicBlock::iterator I,
                    MachineBasicBlock::iterator Paired, bool mergeForward);
@@ -100,79 +99,79 @@ struct ARM64LoadStoreOpt : public MachineFunctionPass {
 
   bool optimizeBlock(MachineBasicBlock &MBB);
 
-  virtual bool runOnMachineFunction(MachineFunction &Fn);
+  bool runOnMachineFunction(MachineFunction &Fn) override;
 
-  virtual const char *getPassName() const {
-    return "ARM64 load / store optimization pass";
+  const char *getPassName() const override {
+    return "AArch64 load / store optimization pass";
   }
 
 private:
   int getMemSize(MachineInstr *MemMI);
 };
-char ARM64LoadStoreOpt::ID = 0;
+char AArch64LoadStoreOpt::ID = 0;
 }
 
 static bool isUnscaledLdst(unsigned Opc) {
   switch (Opc) {
   default:
     return false;
-  case ARM64::STURSi:
+  case AArch64::STURSi:
     return true;
-  case ARM64::STURDi:
+  case AArch64::STURDi:
     return true;
-  case ARM64::STURQi:
+  case AArch64::STURQi:
     return true;
-  case ARM64::STURWi:
+  case AArch64::STURWi:
     return true;
-  case ARM64::STURXi:
+  case AArch64::STURXi:
     return true;
-  case ARM64::LDURSi:
+  case AArch64::LDURSi:
     return true;
-  case ARM64::LDURDi:
+  case AArch64::LDURDi:
     return true;
-  case ARM64::LDURQi:
+  case AArch64::LDURQi:
     return true;
-  case ARM64::LDURWi:
+  case AArch64::LDURWi:
     return true;
-  case ARM64::LDURXi:
+  case AArch64::LDURXi:
     return true;
   }
 }
 
 // Size in bytes of the data moved by an unscaled load or store
-int ARM64LoadStoreOpt::getMemSize(MachineInstr *MemMI) {
+int AArch64LoadStoreOpt::getMemSize(MachineInstr *MemMI) {
   switch (MemMI->getOpcode()) {
   default:
     llvm_unreachable("Opcode has has unknown size!");
-  case ARM64::STRSui:
-  case ARM64::STURSi:
+  case AArch64::STRSui:
+  case AArch64::STURSi:
     return 4;
-  case ARM64::STRDui:
-  case ARM64::STURDi:
+  case AArch64::STRDui:
+  case AArch64::STURDi:
     return 8;
-  case ARM64::STRQui:
-  case ARM64::STURQi:
+  case AArch64::STRQui:
+  case AArch64::STURQi:
     return 16;
-  case ARM64::STRWui:
-  case ARM64::STURWi:
+  case AArch64::STRWui:
+  case AArch64::STURWi:
     return 4;
-  case ARM64::STRXui:
-  case ARM64::STURXi:
+  case AArch64::STRXui:
+  case AArch64::STURXi:
     return 8;
-  case ARM64::LDRSui:
-  case ARM64::LDURSi:
+  case AArch64::LDRSui:
+  case AArch64::LDURSi:
     return 4;
-  case ARM64::LDRDui:
-  case ARM64::LDURDi:
+  case AArch64::LDRDui:
+  case AArch64::LDURDi:
     return 8;
-  case ARM64::LDRQui:
-  case ARM64::LDURQi:
+  case AArch64::LDRQui:
+  case AArch64::LDURQi:
     return 16;
-  case ARM64::LDRWui:
-  case ARM64::LDURWi:
+  case AArch64::LDRWui:
+  case AArch64::LDURWi:
     return 4;
-  case ARM64::LDRXui:
-  case ARM64::LDURXi:
+  case AArch64::LDRXui:
+  case AArch64::LDURXi:
     return 8;
   }
 }
@@ -181,36 +180,36 @@ static unsigned getMatchingPairOpcode(unsigned Opc) {
   switch (Opc) {
   default:
     llvm_unreachable("Opcode has no pairwise equivalent!");
-  case ARM64::STRSui:
-  case ARM64::STURSi:
-    return ARM64::STPSi;
-  case ARM64::STRDui:
-  case ARM64::STURDi:
-    return ARM64::STPDi;
-  case ARM64::STRQui:
-  case ARM64::STURQi:
-    return ARM64::STPQi;
-  case ARM64::STRWui:
-  case ARM64::STURWi:
-    return ARM64::STPWi;
-  case ARM64::STRXui:
-  case ARM64::STURXi:
-    return ARM64::STPXi;
-  case ARM64::LDRSui:
-  case ARM64::LDURSi:
-    return ARM64::LDPSi;
-  case ARM64::LDRDui:
-  case ARM64::LDURDi:
-    return ARM64::LDPDi;
-  case ARM64::LDRQui:
-  case ARM64::LDURQi:
-    return ARM64::LDPQi;
-  case ARM64::LDRWui:
-  case ARM64::LDURWi:
-    return ARM64::LDPWi;
-  case ARM64::LDRXui:
-  case ARM64::LDURXi:
-    return ARM64::LDPXi;
+  case AArch64::STRSui:
+  case AArch64::STURSi:
+    return AArch64::STPSi;
+  case AArch64::STRDui:
+  case AArch64::STURDi:
+    return AArch64::STPDi;
+  case AArch64::STRQui:
+  case AArch64::STURQi:
+    return AArch64::STPQi;
+  case AArch64::STRWui:
+  case AArch64::STURWi:
+    return AArch64::STPWi;
+  case AArch64::STRXui:
+  case AArch64::STURXi:
+    return AArch64::STPXi;
+  case AArch64::LDRSui:
+  case AArch64::LDURSi:
+    return AArch64::LDPSi;
+  case AArch64::LDRDui:
+  case AArch64::LDURDi:
+    return AArch64::LDPDi;
+  case AArch64::LDRQui:
+  case AArch64::LDURQi:
+    return AArch64::LDPQi;
+  case AArch64::LDRWui:
+  case AArch64::LDURWi:
+    return AArch64::LDPWi;
+  case AArch64::LDRXui:
+  case AArch64::LDURXi:
+    return AArch64::LDPXi;
   }
 }
 
@@ -218,16 +217,16 @@ static unsigned getPreIndexedOpcode(unsigned Opc) {
   switch (Opc) {
   default:
     llvm_unreachable("Opcode has no pre-indexed equivalent!");
-  case ARM64::STRSui:    return ARM64::STRSpre;
-  case ARM64::STRDui:    return ARM64::STRDpre;
-  case ARM64::STRQui:    return ARM64::STRQpre;
-  case ARM64::STRWui:    return ARM64::STRWpre;
-  case ARM64::STRXui:    return ARM64::STRXpre;
-  case ARM64::LDRSui:    return ARM64::LDRSpre;
-  case ARM64::LDRDui:    return ARM64::LDRDpre;
-  case ARM64::LDRQui:    return ARM64::LDRQpre;
-  case ARM64::LDRWui:    return ARM64::LDRWpre;
-  case ARM64::LDRXui:    return ARM64::LDRXpre;
+  case AArch64::STRSui:    return AArch64::STRSpre;
+  case AArch64::STRDui:    return AArch64::STRDpre;
+  case AArch64::STRQui:    return AArch64::STRQpre;
+  case AArch64::STRWui:    return AArch64::STRWpre;
+  case AArch64::STRXui:    return AArch64::STRXpre;
+  case AArch64::LDRSui:    return AArch64::LDRSpre;
+  case AArch64::LDRDui:    return AArch64::LDRDpre;
+  case AArch64::LDRQui:    return AArch64::LDRQpre;
+  case AArch64::LDRWui:    return AArch64::LDRWpre;
+  case AArch64::LDRXui:    return AArch64::LDRXpre;
   }
 }
 
@@ -235,33 +234,33 @@ static unsigned getPostIndexedOpcode(unsigned Opc) {
   switch (Opc) {
   default:
     llvm_unreachable("Opcode has no post-indexed wise equivalent!");
-  case ARM64::STRSui:
-    return ARM64::STRSpost;
-  case ARM64::STRDui:
-    return ARM64::STRDpost;
-  case ARM64::STRQui:
-    return ARM64::STRQpost;
-  case ARM64::STRWui:
-    return ARM64::STRWpost;
-  case ARM64::STRXui:
-    return ARM64::STRXpost;
-  case ARM64::LDRSui:
-    return ARM64::LDRSpost;
-  case ARM64::LDRDui:
-    return ARM64::LDRDpost;
-  case ARM64::LDRQui:
-    return ARM64::LDRQpost;
-  case ARM64::LDRWui:
-    return ARM64::LDRWpost;
-  case ARM64::LDRXui:
-    return ARM64::LDRXpost;
+  case AArch64::STRSui:
+    return AArch64::STRSpost;
+  case AArch64::STRDui:
+    return AArch64::STRDpost;
+  case AArch64::STRQui:
+    return AArch64::STRQpost;
+  case AArch64::STRWui:
+    return AArch64::STRWpost;
+  case AArch64::STRXui:
+    return AArch64::STRXpost;
+  case AArch64::LDRSui:
+    return AArch64::LDRSpost;
+  case AArch64::LDRDui:
+    return AArch64::LDRDpost;
+  case AArch64::LDRQui:
+    return AArch64::LDRQpost;
+  case AArch64::LDRWui:
+    return AArch64::LDRWpost;
+  case AArch64::LDRXui:
+    return AArch64::LDRXpost;
   }
 }
 
 MachineBasicBlock::iterator
-ARM64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
-                                    MachineBasicBlock::iterator Paired,
-                                    bool mergeForward) {
+AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
+                                      MachineBasicBlock::iterator Paired,
+                                      bool mergeForward) {
   MachineBasicBlock::iterator NextI = I;
   ++NextI;
   // If NextI is the second of the two instructions to be merged, we need
@@ -272,7 +271,8 @@ ARM64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
     ++NextI;
 
   bool IsUnscaled = isUnscaledLdst(I->getOpcode());
-  int OffsetStride = IsUnscaled && EnableARM64UnscaledMemOp ? getMemSize(I) : 1;
+  int OffsetStride =
+      IsUnscaled && EnableAArch64UnscaledMemOp ? getMemSize(I) : 1;
 
   unsigned NewOpc = getMatchingPairOpcode(I->getOpcode());
   // Insert our new paired instruction after whichever of the paired
@@ -295,7 +295,7 @@ ARM64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
   }
   // Handle Unscaled
   int OffsetImm = RtMI->getOperand(2).getImm();
-  if (IsUnscaled && EnableARM64UnscaledMemOp)
+  if (IsUnscaled && EnableAArch64UnscaledMemOp)
     OffsetImm /= OffsetStride;
 
   // Construct the new instruction.
@@ -373,8 +373,8 @@ static int alignTo(int Num, int PowOf2) {
 /// findMatchingInsn - Scan the instructions looking for a load/store that can
 /// be combined with the current instruction into a load/store pair.
 MachineBasicBlock::iterator
-ARM64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
-                                    bool &mergeForward, unsigned Limit) {
+AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
+                                      bool &mergeForward, unsigned Limit) {
   MachineBasicBlock::iterator E = I->getParent()->end();
   MachineBasicBlock::iterator MBBI = I;
   MachineInstr *FirstMI = I;
@@ -395,7 +395,7 @@ ARM64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
   if (FirstMI->modifiesRegister(BaseReg, TRI))
     return E;
   int OffsetStride =
-      IsUnscaled && EnableARM64UnscaledMemOp ? getMemSize(FirstMI) : 1;
+      IsUnscaled && EnableAArch64UnscaledMemOp ? getMemSize(FirstMI) : 1;
   if (!inBoundsForPair(IsUnscaled, Offset, OffsetStride))
     return E;
 
@@ -445,7 +445,7 @@ ARM64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
         // If the alignment requirements of the paired (scaled) instruction
         // can't express the offset of the unscaled input, bail and keep
         // looking.
-        if (IsUnscaled && EnableARM64UnscaledMemOp &&
+        if (IsUnscaled && EnableAArch64UnscaledMemOp &&
             (alignTo(MinOffset, OffsetStride) != MinOffset)) {
           trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
           continue;
@@ -508,10 +508,10 @@ ARM64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
 }
 
 MachineBasicBlock::iterator
-ARM64LoadStoreOpt::mergePreIdxUpdateInsn(MachineBasicBlock::iterator I,
-                                         MachineBasicBlock::iterator Update) {
-  assert((Update->getOpcode() == ARM64::ADDXri ||
-          Update->getOpcode() == ARM64::SUBXri) &&
+AArch64LoadStoreOpt::mergePreIdxUpdateInsn(MachineBasicBlock::iterator I,
+                                           MachineBasicBlock::iterator Update) {
+  assert((Update->getOpcode() == AArch64::ADDXri ||
+          Update->getOpcode() == AArch64::SUBXri) &&
          "Unexpected base register update instruction to merge!");
   MachineBasicBlock::iterator NextI = I;
   // Return the instruction following the merged instruction, which is
@@ -521,14 +521,15 @@ ARM64LoadStoreOpt::mergePreIdxUpdateInsn(MachineBasicBlock::iterator I,
     ++NextI;
 
   int Value = Update->getOperand(2).getImm();
-  assert(ARM64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 &&
+  assert(AArch64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 &&
          "Can't merge 1 << 12 offset into pre-indexed load / store");
-  if (Update->getOpcode() == ARM64::SUBXri)
+  if (Update->getOpcode() == AArch64::SUBXri)
     Value = -Value;
 
   unsigned NewOpc = getPreIndexedOpcode(I->getOpcode());
   MachineInstrBuilder MIB =
       BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
+          .addOperand(Update->getOperand(0))
           .addOperand(I->getOperand(0))
           .addOperand(I->getOperand(1))
           .addImm(Value);
@@ -550,11 +551,10 @@ ARM64LoadStoreOpt::mergePreIdxUpdateInsn(MachineBasicBlock::iterator I,
   return NextI;
 }
 
-MachineBasicBlock::iterator
-ARM64LoadStoreOpt::mergePostIdxUpdateInsn(MachineBasicBlock::iterator I,
-                                          MachineBasicBlock::iterator Update) {
-  assert((Update->getOpcode() == ARM64::ADDXri ||
-          Update->getOpcode() == ARM64::SUBXri) &&
+MachineBasicBlock::iterator AArch64LoadStoreOpt::mergePostIdxUpdateInsn(
+    MachineBasicBlock::iterator I, MachineBasicBlock::iterator Update) {
+  assert((Update->getOpcode() == AArch64::ADDXri ||
+          Update->getOpcode() == AArch64::SUBXri) &&
          "Unexpected base register update instruction to merge!");
   MachineBasicBlock::iterator NextI = I;
   // Return the instruction following the merged instruction, which is
@@ -564,14 +564,15 @@ ARM64LoadStoreOpt::mergePostIdxUpdateInsn(MachineBasicBlock::iterator I,
     ++NextI;
 
   int Value = Update->getOperand(2).getImm();
-  assert(ARM64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 &&
+  assert(AArch64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 &&
          "Can't merge 1 << 12 offset into post-indexed load / store");
-  if (Update->getOpcode() == ARM64::SUBXri)
+  if (Update->getOpcode() == AArch64::SUBXri)
     Value = -Value;
 
   unsigned NewOpc = getPostIndexedOpcode(I->getOpcode());
   MachineInstrBuilder MIB =
       BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
+          .addOperand(Update->getOperand(0))
           .addOperand(I->getOperand(0))
           .addOperand(I->getOperand(1))
           .addImm(Value);
@@ -598,17 +599,17 @@ static bool isMatchingUpdateInsn(MachineInstr *MI, unsigned BaseReg,
   switch (MI->getOpcode()) {
   default:
     break;
-  case ARM64::SUBXri:
+  case AArch64::SUBXri:
     // Negate the offset for a SUB instruction.
     Offset *= -1;
   // FALLTHROUGH
-  case ARM64::ADDXri:
+  case AArch64::ADDXri:
     // Make sure it's a vanilla immediate operand, not a relocation or
     // anything else we can't handle.
     if (!MI->getOperand(2).isImm())
       break;
     // Watch out for 1 << 12 shifted value.
-    if (ARM64_AM::getShiftValue(MI->getOperand(3).getImm()))
+    if (AArch64_AM::getShiftValue(MI->getOperand(3).getImm()))
       break;
     // If the instruction has the base register as source and dest and the
     // immediate will fit in a signed 9-bit integer, then we have a match.
@@ -626,9 +627,8 @@ static bool isMatchingUpdateInsn(MachineInstr *MI, unsigned BaseReg,
   return false;
 }
 
-MachineBasicBlock::iterator
-ARM64LoadStoreOpt::findMatchingUpdateInsnForward(MachineBasicBlock::iterator I,
-                                                 unsigned Limit, int Value) {
+MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
+    MachineBasicBlock::iterator I, unsigned Limit, int Value) {
   MachineBasicBlock::iterator E = I->getParent()->end();
   MachineInstr *MemMI = I;
   MachineBasicBlock::iterator MBBI = I;
@@ -681,9 +681,8 @@ ARM64LoadStoreOpt::findMatchingUpdateInsnForward(MachineBasicBlock::iterator I,
   return E;
 }
 
-MachineBasicBlock::iterator
-ARM64LoadStoreOpt::findMatchingUpdateInsnBackward(MachineBasicBlock::iterator I,
-                                                  unsigned Limit) {
+MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
+    MachineBasicBlock::iterator I, unsigned Limit) {
   MachineBasicBlock::iterator B = I->getParent()->begin();
   MachineBasicBlock::iterator E = I->getParent()->end();
   MachineInstr *MemMI = I;
@@ -735,7 +734,7 @@ ARM64LoadStoreOpt::findMatchingUpdateInsnBackward(MachineBasicBlock::iterator I,
   return E;
 }
 
-bool ARM64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) {
+bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) {
   bool Modified = false;
   // Two tranformations to do here:
   // 1) Find loads and stores that can be merged into a single load or store
@@ -761,27 +760,27 @@ bool ARM64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) {
       // Just move on to the next instruction.
       ++MBBI;
       break;
-    case ARM64::STRSui:
-    case ARM64::STRDui:
-    case ARM64::STRQui:
-    case ARM64::STRXui:
-    case ARM64::STRWui:
-    case ARM64::LDRSui:
-    case ARM64::LDRDui:
-    case ARM64::LDRQui:
-    case ARM64::LDRXui:
-    case ARM64::LDRWui:
+    case AArch64::STRSui:
+    case AArch64::STRDui:
+    case AArch64::STRQui:
+    case AArch64::STRXui:
+    case AArch64::STRWui:
+    case AArch64::LDRSui:
+    case AArch64::LDRDui:
+    case AArch64::LDRQui:
+    case AArch64::LDRXui:
+    case AArch64::LDRWui:
     // do the unscaled versions as well
-    case ARM64::STURSi:
-    case ARM64::STURDi:
-    case ARM64::STURQi:
-    case ARM64::STURWi:
-    case ARM64::STURXi:
-    case ARM64::LDURSi:
-    case ARM64::LDURDi:
-    case ARM64::LDURQi:
-    case ARM64::LDURWi:
-    case ARM64::LDURXi: {
+    case AArch64::STURSi:
+    case AArch64::STURDi:
+    case AArch64::STURQi:
+    case AArch64::STURWi:
+    case AArch64::STURXi:
+    case AArch64::LDURSi:
+    case AArch64::LDURDi:
+    case AArch64::LDURQi:
+    case AArch64::LDURWi:
+    case AArch64::LDURXi: {
       // If this is a volatile load/store, don't mess with it.
       if (MI->hasOrderedMemoryRef()) {
         ++MBBI;
@@ -793,7 +792,7 @@ bool ARM64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) {
         break;
       }
       // Check if this load/store has a hint to avoid pair formation.
-      // MachineMemOperands hints are set by the ARM64StorePairSuppress pass.
+      // MachineMemOperands hints are set by the AArch64StorePairSuppress pass.
       if (TII->isLdStPairSuppressed(MI)) {
         ++MBBI;
         break;
@@ -832,27 +831,27 @@ bool ARM64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) {
       // Just move on to the next instruction.
       ++MBBI;
       break;
-    case ARM64::STRSui:
-    case ARM64::STRDui:
-    case ARM64::STRQui:
-    case ARM64::STRXui:
-    case ARM64::STRWui:
-    case ARM64::LDRSui:
-    case ARM64::LDRDui:
-    case ARM64::LDRQui:
-    case ARM64::LDRXui:
-    case ARM64::LDRWui:
+    case AArch64::STRSui:
+    case AArch64::STRDui:
+    case AArch64::STRQui:
+    case AArch64::STRXui:
+    case AArch64::STRWui:
+    case AArch64::LDRSui:
+    case AArch64::LDRDui:
+    case AArch64::LDRQui:
+    case AArch64::LDRXui:
+    case AArch64::LDRWui:
     // do the unscaled versions as well
-    case ARM64::STURSi:
-    case ARM64::STURDi:
-    case ARM64::STURQi:
-    case ARM64::STURWi:
-    case ARM64::STURXi:
-    case ARM64::LDURSi:
-    case ARM64::LDURDi:
-    case ARM64::LDURQi:
-    case ARM64::LDURWi:
-    case ARM64::LDURXi: {
+    case AArch64::STURSi:
+    case AArch64::STURDi:
+    case AArch64::STURQi:
+    case AArch64::STURWi:
+    case AArch64::STURXi:
+    case AArch64::LDURSi:
+    case AArch64::LDURDi:
+    case AArch64::LDURQi:
+    case AArch64::LDURWi:
+    case AArch64::LDURXi: {
       // Make sure this is a reg+imm (as opposed to an address reloc).
       if (!MI->getOperand(2).isImm()) {
         ++MBBI;
@@ -893,7 +892,7 @@ bool ARM64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) {
       // ldr x1, [x0, #64]
       // add x0, x0, #64
       //   merged into:
-      // ldr x1, [x0], #64
+      // ldr x1, [x0, #64]!
 
       // The immediate in the load/store is scaled by the size of the register
       // being loaded. The immediate in the add we're looking for,
@@ -921,13 +920,9 @@ bool ARM64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) {
   return Modified;
 }
 
-bool ARM64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
-  // Early exit if pass disabled.
-  if (!DoLoadStoreOpt)
-    return false;
-
+bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
   const TargetMachine &TM = Fn.getTarget();
-  TII = static_cast<const ARM64InstrInfo *>(TM.getInstrInfo());
+  TII = static_cast<const AArch64InstrInfo *>(TM.getInstrInfo());
   TRI = TM.getRegisterInfo();
 
   bool Modified = false;
@@ -942,6 +937,6 @@ bool ARM64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
 
 /// createARMLoadStoreOptimizationPass - returns an instance of the load / store
 /// optimization pass.
-FunctionPass *llvm::createARM64LoadStoreOptimizationPass() {
-  return new ARM64LoadStoreOpt();
+FunctionPass *llvm::createAArch64LoadStoreOptimizationPass() {
+  return new AArch64LoadStoreOpt();
 }
diff --git a/lib/Target/AArch64/AArch64MCInstLower.cpp b/lib/Target/AArch64/AArch64MCInstLower.cpp
index 3842bfd..ab6d375 100644
--- a/lib/Target/AArch64/AArch64MCInstLower.cpp
+++ b/lib/Target/AArch64/AArch64MCInstLower.cpp
@@ -1,4 +1,4 @@
-//===-- AArch64MCInstLower.cpp - Convert AArch64 MachineInstr to an MCInst -==//
+//==-- AArch64MCInstLower.cpp - Convert AArch64 MachineInstr to an MCInst --==//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -12,146 +12,191 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "AArch64AsmPrinter.h"
-#include "AArch64TargetMachine.h"
+#include "AArch64MCInstLower.h"
 #include "MCTargetDesc/AArch64MCExpr.h"
 #include "Utils/AArch64BaseInfo.h"
-#include "llvm/ADT/SmallString.h"
 #include "llvm/CodeGen/AsmPrinter.h"
-#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/IR/Mangler.h"
-#include "llvm/MC/MCAsmInfo.h"
-#include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
-
+#include "llvm/Support/CodeGen.h"
+#include "llvm/Target/TargetMachine.h"
 using namespace llvm;
 
-MCOperand
-AArch64AsmPrinter::lowerSymbolOperand(const MachineOperand &MO,
-                                      const MCSymbol *Sym) const {
-  const MCExpr *Expr = 0;
+AArch64MCInstLower::AArch64MCInstLower(MCContext &ctx, Mangler &mang,
+                                       AsmPrinter &printer)
+    : Ctx(ctx), Printer(printer), TargetTriple(printer.getTargetTriple()) {}
 
-  Expr = MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_None, OutContext);
+MCSymbol *
+AArch64MCInstLower::GetGlobalAddressSymbol(const MachineOperand &MO) const {
+  return Printer.getSymbol(MO.getGlobal());
+}
 
-  switch (MO.getTargetFlags()) {
-  case AArch64II::MO_GOT:
-    Expr = AArch64MCExpr::CreateGOT(Expr, OutContext);
-    break;
-  case AArch64II::MO_GOT_LO12:
-    Expr = AArch64MCExpr::CreateGOTLo12(Expr, OutContext);
-    break;
-  case AArch64II::MO_LO12:
-    Expr = AArch64MCExpr::CreateLo12(Expr, OutContext);
-    break;
-  case AArch64II::MO_DTPREL_G1:
-    Expr = AArch64MCExpr::CreateDTPREL_G1(Expr, OutContext);
-    break;
-  case AArch64II::MO_DTPREL_G0_NC:
-    Expr = AArch64MCExpr::CreateDTPREL_G0_NC(Expr, OutContext);
-    break;
-  case AArch64II::MO_GOTTPREL:
-    Expr = AArch64MCExpr::CreateGOTTPREL(Expr, OutContext);
-    break;
-  case AArch64II::MO_GOTTPREL_LO12:
-    Expr = AArch64MCExpr::CreateGOTTPRELLo12(Expr, OutContext);
-    break;
-  case AArch64II::MO_TLSDESC:
-    Expr = AArch64MCExpr::CreateTLSDesc(Expr, OutContext);
-    break;
-  case AArch64II::MO_TLSDESC_LO12:
-    Expr = AArch64MCExpr::CreateTLSDescLo12(Expr, OutContext);
-    break;
-  case AArch64II::MO_TPREL_G1:
-    Expr = AArch64MCExpr::CreateTPREL_G1(Expr, OutContext);
-    break;
-  case AArch64II::MO_TPREL_G0_NC:
-    Expr = AArch64MCExpr::CreateTPREL_G0_NC(Expr, OutContext);
-    break;
-  case AArch64II::MO_ABS_G3:
-    Expr = AArch64MCExpr::CreateABS_G3(Expr, OutContext);
-    break;
-  case AArch64II::MO_ABS_G2_NC:
-    Expr = AArch64MCExpr::CreateABS_G2_NC(Expr, OutContext);
-    break;
-  case AArch64II::MO_ABS_G1_NC:
-    Expr = AArch64MCExpr::CreateABS_G1_NC(Expr, OutContext);
-    break;
-  case AArch64II::MO_ABS_G0_NC:
-    Expr = AArch64MCExpr::CreateABS_G0_NC(Expr, OutContext);
-    break;
-  case AArch64II::MO_NO_FLAG:
-    // Expr is already correct
-    break;
-  default:
-    llvm_unreachable("Unexpected MachineOperand flag");
+MCSymbol *
+AArch64MCInstLower::GetExternalSymbolSymbol(const MachineOperand &MO) const {
+  return Printer.GetExternalSymbolSymbol(MO.getSymbolName());
+}
+
+MCOperand AArch64MCInstLower::lowerSymbolOperandDarwin(const MachineOperand &MO,
+                                                       MCSymbol *Sym) const {
+  // FIXME: We would like an efficient form for this, so we don't have to do a
+  // lot of extra uniquing.
+  MCSymbolRefExpr::VariantKind RefKind = MCSymbolRefExpr::VK_None;
+  if ((MO.getTargetFlags() & AArch64II::MO_GOT) != 0) {
+    if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_PAGE)
+      RefKind = MCSymbolRefExpr::VK_GOTPAGE;
+    else if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) ==
+             AArch64II::MO_PAGEOFF)
+      RefKind = MCSymbolRefExpr::VK_GOTPAGEOFF;
+    else
+      assert(0 && "Unexpected target flags with MO_GOT on GV operand");
+  } else if ((MO.getTargetFlags() & AArch64II::MO_TLS) != 0) {
+    if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_PAGE)
+      RefKind = MCSymbolRefExpr::VK_TLVPPAGE;
+    else if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) ==
+             AArch64II::MO_PAGEOFF)
+      RefKind = MCSymbolRefExpr::VK_TLVPPAGEOFF;
+    else
+      llvm_unreachable("Unexpected target flags with MO_TLS on GV operand");
+  } else {
+    if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_PAGE)
+      RefKind = MCSymbolRefExpr::VK_PAGE;
+    else if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) ==
+             AArch64II::MO_PAGEOFF)
+      RefKind = MCSymbolRefExpr::VK_PAGEOFF;
   }
+  const MCExpr *Expr = MCSymbolRefExpr::Create(Sym, RefKind, Ctx);
+  if (!MO.isJTI() && MO.getOffset())
+    Expr = MCBinaryExpr::CreateAdd(
+        Expr, MCConstantExpr::Create(MO.getOffset(), Ctx), Ctx);
+  return MCOperand::CreateExpr(Expr);
+}
+
+MCOperand AArch64MCInstLower::lowerSymbolOperandELF(const MachineOperand &MO,
+                                                    MCSymbol *Sym) const {
+  uint32_t RefFlags = 0;
 
+  if (MO.getTargetFlags() & AArch64II::MO_GOT)
+    RefFlags |= AArch64MCExpr::VK_GOT;
+  else if (MO.getTargetFlags() & AArch64II::MO_TLS) {
+    TLSModel::Model Model;
+    if (MO.isGlobal()) {
+      const GlobalValue *GV = MO.getGlobal();
+      Model = Printer.TM.getTLSModel(GV);
+    } else {
+      assert(MO.isSymbol() &&
+             StringRef(MO.getSymbolName()) == "_TLS_MODULE_BASE_" &&
+             "unexpected external TLS symbol");
+      Model = TLSModel::GeneralDynamic;
+    }
+    switch (Model) {
+    case TLSModel::InitialExec:
+      RefFlags |= AArch64MCExpr::VK_GOTTPREL;
+      break;
+    case TLSModel::LocalExec:
+      RefFlags |= AArch64MCExpr::VK_TPREL;
+      break;
+    case TLSModel::LocalDynamic:
+      RefFlags |= AArch64MCExpr::VK_DTPREL;
+      break;
+    case TLSModel::GeneralDynamic:
+      RefFlags |= AArch64MCExpr::VK_TLSDESC;
+      break;
+    }
+  } else {
+    // No modifier means this is a generic reference, classified as absolute for
+    // the cases where it matters (:abs_g0: etc).
+    RefFlags |= AArch64MCExpr::VK_ABS;
+  }
+
+  if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_PAGE)
+    RefFlags |= AArch64MCExpr::VK_PAGE;
+  else if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) ==
+           AArch64II::MO_PAGEOFF)
+    RefFlags |= AArch64MCExpr::VK_PAGEOFF;
+  else if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_G3)
+    RefFlags |= AArch64MCExpr::VK_G3;
+  else if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_G2)
+    RefFlags |= AArch64MCExpr::VK_G2;
+  else if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_G1)
+    RefFlags |= AArch64MCExpr::VK_G1;
+  else if ((MO.getTargetFlags() & AArch64II::MO_FRAGMENT) == AArch64II::MO_G0)
+    RefFlags |= AArch64MCExpr::VK_G0;
+
+  if (MO.getTargetFlags() & AArch64II::MO_NC)
+    RefFlags |= AArch64MCExpr::VK_NC;
+
+  const MCExpr *Expr =
+      MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_None, Ctx);
   if (!MO.isJTI() && MO.getOffset())
-    Expr = MCBinaryExpr::CreateAdd(Expr,
-                                   MCConstantExpr::Create(MO.getOffset(),
-                                                          OutContext),
-                                   OutContext);
+    Expr = MCBinaryExpr::CreateAdd(
+        Expr, MCConstantExpr::Create(MO.getOffset(), Ctx), Ctx);
+
+  AArch64MCExpr::VariantKind RefKind;
+  RefKind = static_cast<AArch64MCExpr::VariantKind>(RefFlags);
+  Expr = AArch64MCExpr::Create(Expr, RefKind, Ctx);
 
   return MCOperand::CreateExpr(Expr);
 }
 
-bool AArch64AsmPrinter::lowerOperand(const MachineOperand &MO,
-                                     MCOperand &MCOp) const {
+MCOperand AArch64MCInstLower::LowerSymbolOperand(const MachineOperand &MO,
+                                                 MCSymbol *Sym) const {
+  if (TargetTriple.isOSDarwin())
+    return lowerSymbolOperandDarwin(MO, Sym);
+
+  assert(TargetTriple.isOSBinFormatELF() && "Expect Darwin or ELF target");
+  return lowerSymbolOperandELF(MO, Sym);
+}
+
+bool AArch64MCInstLower::lowerOperand(const MachineOperand &MO,
+                                      MCOperand &MCOp) const {
   switch (MO.getType()) {
-  default: llvm_unreachable("unknown operand type");
+  default:
+    assert(0 && "unknown operand type");
   case MachineOperand::MO_Register:
+    // Ignore all implicit register operands.
     if (MO.isImplicit())
       return false;
-    assert(!MO.getSubReg() && "Subregs should be eliminated!");
     MCOp = MCOperand::CreateReg(MO.getReg());
     break;
+  case MachineOperand::MO_RegisterMask:
+    // Regmasks are like implicit defs.
+    return false;
   case MachineOperand::MO_Immediate:
     MCOp = MCOperand::CreateImm(MO.getImm());
     break;
-  case MachineOperand::MO_FPImmediate: {
-    assert(MO.getFPImm()->isZero() && "Only fp imm 0.0 is supported");
-    MCOp = MCOperand::CreateFPImm(0.0);
-    break;
-  }
-  case MachineOperand::MO_BlockAddress:
-    MCOp = lowerSymbolOperand(MO, GetBlockAddressSymbol(MO.getBlockAddress()));
-    break;
-  case MachineOperand::MO_ExternalSymbol:
-    MCOp = lowerSymbolOperand(MO, GetExternalSymbolSymbol(MO.getSymbolName()));
+  case MachineOperand::MO_MachineBasicBlock:
+    MCOp = MCOperand::CreateExpr(
+        MCSymbolRefExpr::Create(MO.getMBB()->getSymbol(), Ctx));
     break;
   case MachineOperand::MO_GlobalAddress:
-    MCOp = lowerSymbolOperand(MO, getSymbol(MO.getGlobal()));
+    MCOp = LowerSymbolOperand(MO, GetGlobalAddressSymbol(MO));
     break;
-  case MachineOperand::MO_MachineBasicBlock:
-    MCOp = MCOperand::CreateExpr(MCSymbolRefExpr::Create(
-                                   MO.getMBB()->getSymbol(), OutContext));
+  case MachineOperand::MO_ExternalSymbol:
+    MCOp = LowerSymbolOperand(MO, GetExternalSymbolSymbol(MO));
     break;
   case MachineOperand::MO_JumpTableIndex:
-    MCOp = lowerSymbolOperand(MO, GetJTISymbol(MO.getIndex()));
+    MCOp = LowerSymbolOperand(MO, Printer.GetJTISymbol(MO.getIndex()));
     break;
   case MachineOperand::MO_ConstantPoolIndex:
-    MCOp = lowerSymbolOperand(MO, GetCPISymbol(MO.getIndex()));
+    MCOp = LowerSymbolOperand(MO, Printer.GetCPISymbol(MO.getIndex()));
+    break;
+  case MachineOperand::MO_BlockAddress:
+    MCOp = LowerSymbolOperand(
+        MO, Printer.GetBlockAddressSymbol(MO.getBlockAddress()));
     break;
-  case MachineOperand::MO_RegisterMask:
-    // Ignore call clobbers
-    return false;
-
   }
-
   return true;
 }
 
-void llvm::LowerAArch64MachineInstrToMCInst(const MachineInstr *MI,
-                                            MCInst &OutMI,
-                                            AArch64AsmPrinter &AP) {
+void AArch64MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
   OutMI.setOpcode(MI->getOpcode());
 
   for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
-    const MachineOperand &MO = MI->getOperand(i);
-
     MCOperand MCOp;
-    if (AP.lowerOperand(MO, MCOp))
+    if (lowerOperand(MI->getOperand(i), MCOp))
       OutMI.addOperand(MCOp);
   }
 }
diff --git a/lib/Target/ARM64/ARM64MCInstLower.h b/lib/Target/AArch64/AArch64MCInstLower.h
index 7e3a2c8..ba50ba9 100644
--- a/lib/Target/ARM64/ARM64MCInstLower.h
+++ b/lib/Target/AArch64/AArch64MCInstLower.h
@@ -1,4 +1,4 @@
-//===-- ARM64MCInstLower.h - Lower MachineInstr to MCInst ----------------===//
+//===-- AArch64MCInstLower.h - Lower MachineInstr to MCInst ---------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef ARM64_MCINSTLOWER_H
-#define ARM64_MCINSTLOWER_H
+#ifndef AArch64_MCINSTLOWER_H
+#define AArch64_MCINSTLOWER_H
 
 #include "llvm/ADT/Triple.h"
 #include "llvm/Support/Compiler.h"
@@ -25,15 +25,15 @@ class MachineModuleInfoMachO;
 class MachineOperand;
 class Mangler;
 
-/// ARM64MCInstLower - This class is used to lower an MachineInstr
+/// AArch64MCInstLower - This class is used to lower an MachineInstr
 /// into an MCInst.
-class LLVM_LIBRARY_VISIBILITY ARM64MCInstLower {
+class LLVM_LIBRARY_VISIBILITY AArch64MCInstLower {
   MCContext &Ctx;
   AsmPrinter &Printer;
   Triple TargetTriple;
 
 public:
-  ARM64MCInstLower(MCContext &ctx, Mangler &mang, AsmPrinter &printer);
+  AArch64MCInstLower(MCContext &ctx, Mangler &mang, AsmPrinter &printer);
 
   bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp) const;
   void Lower(const MachineInstr *MI, MCInst &OutMI) const;
diff --git a/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp b/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp
deleted file mode 100644
index f45d8f7..0000000
--- a/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp
+++ /dev/null
@@ -1,18 +0,0 @@
-//===-- AArch64MachineFuctionInfo.cpp - AArch64 machine function info -----===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file just contains the anchor for the AArch64MachineFunctionInfo to
-// force vtable emission.
-//
-//===----------------------------------------------------------------------===//
-#include "AArch64MachineFunctionInfo.h"
-
-using namespace llvm;
-
-void AArch64MachineFunctionInfo::anchor() { }
diff --git a/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/lib/Target/AArch64/AArch64MachineFunctionInfo.h
index 33da54f..7c257ba 100644
--- a/lib/Target/AArch64/AArch64MachineFunctionInfo.h
+++ b/lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -1,4 +1,4 @@
-//=- AArch64MachineFuctionInfo.h - AArch64 machine function info -*- C++ -*-==//
+//=- AArch64MachineFuctionInfo.h - AArch64 machine function info --*- C++ -*-=//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -11,17 +11,19 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef AARCH64MACHINEFUNCTIONINFO_H
-#define AARCH64MACHINEFUNCTIONINFO_H
+#ifndef AArch64MACHINEFUNCTIONINFO_H
+#define AArch64MACHINEFUNCTIONINFO_H
 
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/MC/MCLinkerOptimizationHint.h"
 
 namespace llvm {
 
-/// This class is derived from MachineFunctionInfo and contains private AArch64
-/// target-specific information for each MachineFunction.
-class AArch64MachineFunctionInfo : public MachineFunctionInfo {
-  virtual void anchor();
+/// AArch64FunctionInfo - This class is derived from MachineFunctionInfo and
+/// contains private AArch64-specific information for each MachineFunction.
+class AArch64FunctionInfo : public MachineFunctionInfo {
 
   /// Number of bytes of arguments this function has on the stack. If the callee
   /// is expected to restore the argument stack this should be a multiple of 16,
@@ -39,111 +41,123 @@ class AArch64MachineFunctionInfo : public MachineFunctionInfo {
   /// callee is expected to pop the args.
   unsigned ArgumentStackToRestore;
 
-  /// If the stack needs to be adjusted on frame entry in two stages, this
-  /// records the size of the first adjustment just prior to storing
-  /// callee-saved registers. The callee-saved slots are addressed assuming
-  /// SP == <incoming-SP> - InitialStackAdjust.
-  unsigned InitialStackAdjust;
+  /// HasStackFrame - True if this function has a stack frame. Set by
+  /// processFunctionBeforeCalleeSavedScan().
+  bool HasStackFrame;
 
-  /// Number of local-dynamic TLS accesses.
-  unsigned NumLocalDynamics;
+  /// \brief Amount of stack frame size, not including callee-saved registers.
+  unsigned LocalStackSize;
 
-  /// @see AArch64 Procedure Call Standard, B.3
-  ///
-  /// The Frame index of the area where LowerFormalArguments puts the
-  /// general-purpose registers that might contain variadic parameters.
-  int VariadicGPRIdx;
+  /// \brief Number of TLS accesses using the special (combinable)
+  /// _TLS_MODULE_BASE_ symbol.
+  unsigned NumLocalDynamicTLSAccesses;
 
-  /// @see AArch64 Procedure Call Standard, B.3
-  ///
-  /// The size of the frame object used to store the general-purpose registers
-  /// which might contain variadic arguments. This is the offset from
-  /// VariadicGPRIdx to what's stored in __gr_top.
-  unsigned VariadicGPRSize;
+  /// \brief FrameIndex for start of varargs area for arguments passed on the
+  /// stack.
+  int VarArgsStackIndex;
 
-  /// @see AArch64 Procedure Call Standard, B.3
-  ///
-  /// The Frame index of the area where LowerFormalArguments puts the
-  /// floating-point registers that might contain variadic parameters.
-  int VariadicFPRIdx;
+  /// \brief FrameIndex for start of varargs area for arguments passed in
+  /// general purpose registers.
+  int VarArgsGPRIndex;
 
-  /// @see AArch64 Procedure Call Standard, B.3
-  ///
-  /// The size of the frame object used to store the floating-point registers
-  /// which might contain variadic arguments. This is the offset from
-  /// VariadicFPRIdx to what's stored in __vr_top.
-  unsigned VariadicFPRSize;
+  /// \brief Size of the varargs area for arguments passed in general purpose
+  /// registers.
+  unsigned VarArgsGPRSize;
 
-  /// @see AArch64 Procedure Call Standard, B.3
-  ///
-  /// The Frame index of an object pointing just past the last known stacked
-  /// argument on entry to a variadic function. This goes into the __stack field
-  /// of the va_list type.
-  int VariadicStackIdx;
+  /// \brief FrameIndex for start of varargs area for arguments passed in
+  /// floating-point registers.
+  int VarArgsFPRIndex;
 
-  /// The offset of the frame pointer from the stack pointer on function
-  /// entry. This is expected to be negative.
-  int FramePointerOffset;
+  /// \brief Size of the varargs area for arguments passed in floating-point
+  /// registers.
+  unsigned VarArgsFPRSize;
 
 public:
-  AArch64MachineFunctionInfo()
-    : BytesInStackArgArea(0),
-      ArgumentStackToRestore(0),
-      InitialStackAdjust(0),
-      NumLocalDynamics(0),
-      VariadicGPRIdx(0),
-      VariadicGPRSize(0),
-      VariadicFPRIdx(0),
-      VariadicFPRSize(0),
-      VariadicStackIdx(0),
-      FramePointerOffset(0) {}
-
-  explicit AArch64MachineFunctionInfo(MachineFunction &MF)
-    : BytesInStackArgArea(0),
-      ArgumentStackToRestore(0),
-      InitialStackAdjust(0),
-      NumLocalDynamics(0),
-      VariadicGPRIdx(0),
-      VariadicGPRSize(0),
-      VariadicFPRIdx(0),
-      VariadicFPRSize(0),
-      VariadicStackIdx(0),
-      FramePointerOffset(0) {}
+  AArch64FunctionInfo()
+      : BytesInStackArgArea(0), ArgumentStackToRestore(0), HasStackFrame(false),
+        NumLocalDynamicTLSAccesses(0), VarArgsStackIndex(0), VarArgsGPRIndex(0),
+        VarArgsGPRSize(0), VarArgsFPRIndex(0), VarArgsFPRSize(0) {}
+
+  explicit AArch64FunctionInfo(MachineFunction &MF)
+      : BytesInStackArgArea(0), ArgumentStackToRestore(0), HasStackFrame(false),
+        NumLocalDynamicTLSAccesses(0), VarArgsStackIndex(0), VarArgsGPRIndex(0),
+        VarArgsGPRSize(0), VarArgsFPRIndex(0), VarArgsFPRSize(0) {
+    (void)MF;
+  }
 
   unsigned getBytesInStackArgArea() const { return BytesInStackArgArea; }
-  void setBytesInStackArgArea (unsigned bytes) { BytesInStackArgArea = bytes;}
+  void setBytesInStackArgArea(unsigned bytes) { BytesInStackArgArea = bytes; }
 
   unsigned getArgumentStackToRestore() const { return ArgumentStackToRestore; }
   void setArgumentStackToRestore(unsigned bytes) {
     ArgumentStackToRestore = bytes;
   }
 
-  unsigned getInitialStackAdjust() const { return InitialStackAdjust; }
-  void setInitialStackAdjust(unsigned bytes) { InitialStackAdjust = bytes; }
+  bool hasStackFrame() const { return HasStackFrame; }
+  void setHasStackFrame(bool s) { HasStackFrame = s; }
 
-  unsigned getNumLocalDynamicTLSAccesses() const { return NumLocalDynamics; }
-  void incNumLocalDynamicTLSAccesses() { ++NumLocalDynamics; }
+  void setLocalStackSize(unsigned Size) { LocalStackSize = Size; }
+  unsigned getLocalStackSize() const { return LocalStackSize; }
 
-  int getVariadicGPRIdx() const { return VariadicGPRIdx; }
-  void setVariadicGPRIdx(int Idx) { VariadicGPRIdx = Idx; }
+  void incNumLocalDynamicTLSAccesses() { ++NumLocalDynamicTLSAccesses; }
+  unsigned getNumLocalDynamicTLSAccesses() const {
+    return NumLocalDynamicTLSAccesses;
+  }
 
-  unsigned getVariadicGPRSize() const { return VariadicGPRSize; }
-  void setVariadicGPRSize(unsigned Size) { VariadicGPRSize = Size; }
+  int getVarArgsStackIndex() const { return VarArgsStackIndex; }
+  void setVarArgsStackIndex(int Index) { VarArgsStackIndex = Index; }
 
-  int getVariadicFPRIdx() const { return VariadicFPRIdx; }
-  void setVariadicFPRIdx(int Idx) { VariadicFPRIdx = Idx; }
+  int getVarArgsGPRIndex() const { return VarArgsGPRIndex; }
+  void setVarArgsGPRIndex(int Index) { VarArgsGPRIndex = Index; }
 
-  unsigned getVariadicFPRSize() const { return VariadicFPRSize; }
-  void setVariadicFPRSize(unsigned Size) { VariadicFPRSize = Size; }
+  unsigned getVarArgsGPRSize() const { return VarArgsGPRSize; }
+  void setVarArgsGPRSize(unsigned Size) { VarArgsGPRSize = Size; }
 
-  int getVariadicStackIdx() const { return VariadicStackIdx; }
-  void setVariadicStackIdx(int Idx) { VariadicStackIdx = Idx; }
+  int getVarArgsFPRIndex() const { return VarArgsFPRIndex; }
+  void setVarArgsFPRIndex(int Index) { VarArgsFPRIndex = Index; }
 
-  int getFramePointerOffset() const { return FramePointerOffset; }
-  void setFramePointerOffset(int Idx) { FramePointerOffset = Idx; }
+  unsigned getVarArgsFPRSize() const { return VarArgsFPRSize; }
+  void setVarArgsFPRSize(unsigned Size) { VarArgsFPRSize = Size; }
 
-};
+  typedef SmallPtrSet<const MachineInstr *, 16> SetOfInstructions;
+
+  const SetOfInstructions &getLOHRelated() const { return LOHRelated; }
+
+  // Shortcuts for LOH related types.
+  class MILOHDirective {
+    MCLOHType Kind;
 
+    /// Arguments of this directive. Order matters.
+    SmallVector<const MachineInstr *, 3> Args;
+
+  public:
+    typedef SmallVectorImpl<const MachineInstr *> LOHArgs;
+
+    MILOHDirective(MCLOHType Kind, const LOHArgs &Args)
+        : Kind(Kind), Args(Args.begin(), Args.end()) {
+      assert(isValidMCLOHType(Kind) && "Invalid LOH directive type!");
+    }
+
+    MCLOHType getKind() const { return Kind; }
+    const LOHArgs &getArgs() const { return Args; }
+  };
+
+  typedef MILOHDirective::LOHArgs MILOHArgs;
+  typedef SmallVector<MILOHDirective, 32> MILOHContainer;
+
+  const MILOHContainer &getLOHContainer() const { return LOHContainerSet; }
+
+  /// Add a LOH directive of this @p Kind and this @p Args.
+  void addLOHDirective(MCLOHType Kind, const MILOHArgs &Args) {
+    LOHContainerSet.push_back(MILOHDirective(Kind, Args));
+    LOHRelated.insert(Args.begin(), Args.end());
+  }
+
+private:
+  // Hold the lists of LOHs.
+  MILOHContainer LOHContainerSet;
+  SetOfInstructions LOHRelated;
+};
 } // End llvm namespace
 
-#endif
+#endif // AArch64MACHINEFUNCTIONINFO_H
diff --git a/lib/Target/ARM64/ARM64PerfectShuffle.h b/lib/Target/AArch64/AArch64PerfectShuffle.h
index 6759236..b22fa24 100644
--- a/lib/Target/ARM64/ARM64PerfectShuffle.h
+++ b/lib/Target/AArch64/AArch64PerfectShuffle.h
@@ -1,4 +1,4 @@
-//===-- ARM64PerfectShuffle.h - AdvSIMD Perfect Shuffle Table -------------===//
+//===-- AArch64PerfectShuffle.h - AdvSIMD Perfect Shuffle Table -----------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
diff --git a/lib/Target/ARM64/ARM64PromoteConstant.cpp b/lib/Target/AArch64/AArch64PromoteConstant.cpp
index 9fbaedb..4723cc4 100644
--- a/lib/Target/ARM64/ARM64PromoteConstant.cpp
+++ b/lib/Target/AArch64/AArch64PromoteConstant.cpp
@@ -1,5 +1,4 @@
-
-//===-- ARM64PromoteConstant.cpp --- Promote constant to global for ARM64 -===//
+//=- AArch64PromoteConstant.cpp --- Promote constant to global for AArch64 -==//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -8,23 +7,20 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This file implements the ARM64PromoteConstant pass which promotes constant
-// to global variables when this is likely to be more efficient.
-// Currently only types related to constant vector (i.e., constant vector, array
-// of constant vectors, constant structure with a constant vector field, etc.)
-// are promoted to global variables.
-// Indeed, constant vector are likely to be lowered in target constant pool
-// during instruction selection.
-// Therefore, the access will remain the same (memory load), but the structures
-// types are not split into different constant pool accesses for each field.
-// The bonus side effect is that created globals may be merged by the global
-// merge pass.
+// This file implements the AArch64PromoteConstant pass which promotes constants
+// to global variables when this is likely to be more efficient. Currently only
+// types related to constant vector (i.e., constant vector, array of constant
+// vectors, constant structure with a constant vector field, etc.) are promoted
+// to global variables. Constant vectors are likely to be lowered in target
+// constant pool during instruction selection already; therefore, the access
+// will remain the same (memory load), but the structure types are not split
+// into different constant pool accesses for each field. A bonus side effect is
+// that created globals may be merged by the global merge pass.
 //
 // FIXME: This pass may be useful for other targets too.
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "arm64-promote-const"
-#include "ARM64.h"
+#include "AArch64.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/SmallSet.h"
@@ -44,15 +40,17 @@
 
 using namespace llvm;
 
+#define DEBUG_TYPE "aarch64-promote-const"
+
 // Stress testing mode - disable heuristics.
-static cl::opt<bool> Stress("arm64-stress-promote-const", cl::Hidden,
+static cl::opt<bool> Stress("aarch64-stress-promote-const", cl::Hidden,
                             cl::desc("Promote all vector constants"));
 
 STATISTIC(NumPromoted, "Number of promoted constants");
 STATISTIC(NumPromotedUses, "Number of promoted constants uses");
 
 //===----------------------------------------------------------------------===//
-//                       ARM64PromoteConstant
+//                       AArch64PromoteConstant
 //===----------------------------------------------------------------------===//
 
 namespace {
@@ -74,26 +72,28 @@ namespace {
 ///   return ret;
 /// }
 ///
-/// The constants in that example are folded into the uses. Thus, 4 different
+/// The constants in this example are folded into the uses. Thus, 4 different
 /// constants are created.
+///
 /// As their type is vector the cheapest way to create them is to load them
 /// for the memory.
-/// Therefore the final assembly final has 4 different load.
-/// With this pass enabled, only one load is issued for the constants.
-class ARM64PromoteConstant : public ModulePass {
+///
+/// Therefore the final assembly final has 4 different loads. With this pass
+/// enabled, only one load is issued for the constants.
+class AArch64PromoteConstant : public ModulePass {
 
 public:
   static char ID;
-  ARM64PromoteConstant() : ModulePass(ID) {}
+  AArch64PromoteConstant() : ModulePass(ID) {}
 
-  virtual const char *getPassName() const { return "ARM64 Promote Constant"; }
+  const char *getPassName() const override { return "AArch64 Promote Constant"; }
 
   /// Iterate over the functions and promote the interesting constants into
   /// global variables with module scope.
-  bool runOnModule(Module &M) {
+  bool runOnModule(Module &M) override {
     DEBUG(dbgs() << getPassName() << '\n');
     bool Changed = false;
-    for (auto &MF: M) {
+    for (auto &MF : M) {
       Changed |= runOnFunction(MF);
     }
     return Changed;
@@ -106,18 +106,18 @@ private:
   bool runOnFunction(Function &F);
 
   // This transformation requires dominator info
-  virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesCFG();
     AU.addRequired<DominatorTreeWrapperPass>();
     AU.addPreserved<DominatorTreeWrapperPass>();
   }
 
-  /// Type to store a list of User
+  /// Type to store a list of User.
   typedef SmallVector<Value::user_iterator, 4> Users;
   /// Map an insertion point to all the uses it dominates.
   typedef DenseMap<Instruction *, Users> InsertionPoints;
   /// Map a function to the required insertion point of load for a
-  /// global variable
+  /// global variable.
   typedef DenseMap<Function *, InsertionPoints> InsertionPointsPerFunc;
 
   /// Find the closest point that dominates the given Use.
@@ -187,34 +187,34 @@ private:
                                              Value::user_iterator &UseIt,
                                              InsertionPoints::iterator &IPI,
                                              InsertionPoints &InsertPts) {
-    // Record the dominated use
+    // Record the dominated use.
     IPI->second.push_back(UseIt);
     // Transfer the dominated uses of IPI to NewPt
     // Inserting into the DenseMap may invalidate existing iterator.
     // Keep a copy of the key to find the iterator to erase.
     Instruction *OldInstr = IPI->first;
     InsertPts.insert(InsertionPoints::value_type(NewPt, IPI->second));
-    // Erase IPI
+    // Erase IPI.
     IPI = InsertPts.find(OldInstr);
     InsertPts.erase(IPI);
   }
 };
 } // end anonymous namespace
 
-char ARM64PromoteConstant::ID = 0;
+char AArch64PromoteConstant::ID = 0;
 
 namespace llvm {
-void initializeARM64PromoteConstantPass(PassRegistry &);
+void initializeAArch64PromoteConstantPass(PassRegistry &);
 }
 
-INITIALIZE_PASS_BEGIN(ARM64PromoteConstant, "arm64-promote-const",
-                      "ARM64 Promote Constant Pass", false, false)
+INITIALIZE_PASS_BEGIN(AArch64PromoteConstant, "aarch64-promote-const",
+                      "AArch64 Promote Constant Pass", false, false)
 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_END(ARM64PromoteConstant, "arm64-promote-const",
-                    "ARM64 Promote Constant Pass", false, false)
+INITIALIZE_PASS_END(AArch64PromoteConstant, "aarch64-promote-const",
+                    "AArch64 Promote Constant Pass", false, false)
 
-ModulePass *llvm::createARM64PromoteConstantPass() {
-  return new ARM64PromoteConstant();
+ModulePass *llvm::createAArch64PromoteConstantPass() {
+  return new AArch64PromoteConstant();
 }
 
 /// Check if the given type uses a vector type.
@@ -243,26 +243,26 @@ static bool shouldConvertUse(const Constant *Cst, const Instruction *Instr,
   if (isa<const ShuffleVectorInst>(Instr) && OpIdx == 2)
     return false;
 
-  // extractvalue instruction expects a const idx
+  // extractvalue instruction expects a const idx.
   if (isa<const ExtractValueInst>(Instr) && OpIdx > 0)
     return false;
 
-  // extractvalue instruction expects a const idx
+  // extractvalue instruction expects a const idx.
   if (isa<const InsertValueInst>(Instr) && OpIdx > 1)
     return false;
 
   if (isa<const AllocaInst>(Instr) && OpIdx > 0)
     return false;
 
-  // Alignment argument must be constant
+  // Alignment argument must be constant.
   if (isa<const LoadInst>(Instr) && OpIdx > 0)
     return false;
 
-  // Alignment argument must be constant
+  // Alignment argument must be constant.
   if (isa<const StoreInst>(Instr) && OpIdx > 1)
     return false;
 
-  // Index must be constant
+  // Index must be constant.
   if (isa<const GetElementPtrInst>(Instr) && OpIdx > 0)
     return false;
 
@@ -271,19 +271,19 @@ static bool shouldConvertUse(const Constant *Cst, const Instruction *Instr,
   if (isa<const LandingPadInst>(Instr))
     return false;
 
-  // switch instruction expects constants to compare to
+  // Switch instruction expects constants to compare to.
   if (isa<const SwitchInst>(Instr))
     return false;
 
-  // Expected address must be a constant
+  // Expected address must be a constant.
   if (isa<const IndirectBrInst>(Instr))
     return false;
 
-  // Do not mess with intrinsic
+  // Do not mess with intrinsics.
   if (isa<const IntrinsicInst>(Instr))
     return false;
 
-  // Do not mess with inline asm
+  // Do not mess with inline asm.
   const CallInst *CI = dyn_cast<const CallInst>(Instr);
   if (CI && isa<const InlineAsm>(CI->getCalledValue()))
     return false;
@@ -329,9 +329,9 @@ static bool shouldConvert(const Constant *Cst) {
 }
 
 Instruction *
-ARM64PromoteConstant::findInsertionPoint(Value::user_iterator &Use) {
+AArch64PromoteConstant::findInsertionPoint(Value::user_iterator &Use) {
   // If this user is a phi, the insertion point is in the related
-  // incoming basic block
+  // incoming basic block.
   PHINode *PhiInst = dyn_cast<PHINode>(*Use);
   Instruction *InsertionPoint;
   if (PhiInst)
@@ -343,46 +343,43 @@ ARM64PromoteConstant::findInsertionPoint(Value::user_iterator &Use) {
   return InsertionPoint;
 }
 
-bool ARM64PromoteConstant::isDominated(Instruction *NewPt,
-                                       Value::user_iterator &UseIt,
-                                       InsertionPoints &InsertPts) {
+bool AArch64PromoteConstant::isDominated(Instruction *NewPt,
+                                         Value::user_iterator &UseIt,
+                                         InsertionPoints &InsertPts) {
 
   DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>(
       *NewPt->getParent()->getParent()).getDomTree();
 
-  // Traverse all the existing insertion point and check if one is dominating
-  // NewPt
-  for (InsertionPoints::iterator IPI = InsertPts.begin(),
-                                 EndIPI = InsertPts.end();
-       IPI != EndIPI; ++IPI) {
-    if (NewPt == IPI->first || DT.dominates(IPI->first, NewPt) ||
-        // When IPI->first is a terminator instruction, DT may think that
+  // Traverse all the existing insertion points and check if one is dominating
+  // NewPt. If it is, remember that.
+  for (auto &IPI : InsertPts) {
+    if (NewPt == IPI.first || DT.dominates(IPI.first, NewPt) ||
+        // When IPI.first is a terminator instruction, DT may think that
         // the result is defined on the edge.
         // Here we are testing the insertion point, not the definition.
-        (IPI->first->getParent() != NewPt->getParent() &&
-         DT.dominates(IPI->first->getParent(), NewPt->getParent()))) {
-      // No need to insert this point
-      // Record the dominated use
+        (IPI.first->getParent() != NewPt->getParent() &&
+         DT.dominates(IPI.first->getParent(), NewPt->getParent()))) {
+      // No need to insert this point. Just record the dominated use.
       DEBUG(dbgs() << "Insertion point dominated by:\n");
-      DEBUG(IPI->first->print(dbgs()));
+      DEBUG(IPI.first->print(dbgs()));
       DEBUG(dbgs() << '\n');
-      IPI->second.push_back(UseIt);
+      IPI.second.push_back(UseIt);
       return true;
     }
   }
   return false;
 }
 
-bool ARM64PromoteConstant::tryAndMerge(Instruction *NewPt,
-                                       Value::user_iterator &UseIt,
-                                       InsertionPoints &InsertPts) {
+bool AArch64PromoteConstant::tryAndMerge(Instruction *NewPt,
+                                         Value::user_iterator &UseIt,
+                                         InsertionPoints &InsertPts) {
   DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>(
       *NewPt->getParent()->getParent()).getDomTree();
   BasicBlock *NewBB = NewPt->getParent();
 
   // Traverse all the existing insertion point and check if one is dominated by
   // NewPt and thus useless or can be combined with NewPt into a common
-  // dominator
+  // dominator.
   for (InsertionPoints::iterator IPI = InsertPts.begin(),
                                  EndIPI = InsertPts.end();
        IPI != EndIPI; ++IPI) {
@@ -400,19 +397,19 @@ bool ARM64PromoteConstant::tryAndMerge(Instruction *NewPt,
 
     // Look for a common dominator
     BasicBlock *CommonDominator = DT.findNearestCommonDominator(NewBB, CurBB);
-    // If none exists, we cannot merge these two points
+    // If none exists, we cannot merge these two points.
     if (!CommonDominator)
       continue;
 
     if (CommonDominator != NewBB) {
-      // By construction, the CommonDominator cannot be CurBB
+      // By construction, the CommonDominator cannot be CurBB.
       assert(CommonDominator != CurBB &&
              "Instruction has not been rejected during isDominated check!");
       // Take the last instruction of the CommonDominator as insertion point
       NewPt = CommonDominator->getTerminator();
     }
     // else, CommonDominator is the block of NewBB, hence NewBB is the last
-    // possible insertion point in that block
+    // possible insertion point in that block.
     DEBUG(dbgs() << "Merge insertion point with:\n");
     DEBUG(IPI->first->print(dbgs()));
     DEBUG(dbgs() << '\n');
@@ -424,17 +421,17 @@ bool ARM64PromoteConstant::tryAndMerge(Instruction *NewPt,
   return false;
 }
 
-void ARM64PromoteConstant::computeInsertionPoints(
+void AArch64PromoteConstant::computeInsertionPoints(
     Constant *Val, InsertionPointsPerFunc &InsPtsPerFunc) {
   DEBUG(dbgs() << "** Compute insertion points **\n");
   for (Value::user_iterator UseIt = Val->user_begin(),
                             EndUseIt = Val->user_end();
        UseIt != EndUseIt; ++UseIt) {
-    // If the user is not an Instruction, we cannot modify it
+    // If the user is not an Instruction, we cannot modify it.
     if (!isa<Instruction>(*UseIt))
       continue;
 
-    // Filter out uses that should not be converted
+    // Filter out uses that should not be converted.
     if (!shouldConvertUse(Val, cast<Instruction>(*UseIt), UseIt.getOperandNo()))
       continue;
 
@@ -466,19 +463,18 @@ void ARM64PromoteConstant::computeInsertionPoints(
   }
 }
 
-bool
-ARM64PromoteConstant::insertDefinitions(Constant *Cst,
-                                        InsertionPointsPerFunc &InsPtsPerFunc) {
-  // We will create one global variable per Module
+bool AArch64PromoteConstant::insertDefinitions(
+    Constant *Cst, InsertionPointsPerFunc &InsPtsPerFunc) {
+  // We will create one global variable per Module.
   DenseMap<Module *, GlobalVariable *> ModuleToMergedGV;
   bool HasChanged = false;
 
-  // Traverse all insertion points in all the function
+  // Traverse all insertion points in all the function.
   for (InsertionPointsPerFunc::iterator FctToInstPtsIt = InsPtsPerFunc.begin(),
                                         EndIt = InsPtsPerFunc.end();
        FctToInstPtsIt != EndIt; ++FctToInstPtsIt) {
     InsertionPoints &InsertPts = FctToInstPtsIt->second;
-// Do more check for debug purposes
+// Do more checking for debug purposes.
 #ifndef NDEBUG
     DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>(
         *FctToInstPtsIt->first).getDomTree();
@@ -491,8 +487,8 @@ ARM64PromoteConstant::insertDefinitions(Constant *Cst,
         ModuleToMergedGV.find(M);
     if (MapIt == ModuleToMergedGV.end()) {
       PromotedGV = new GlobalVariable(
-          *M, Cst->getType(), true, GlobalValue::InternalLinkage, 0,
-          "_PromotedConst", 0, GlobalVariable::NotThreadLocal);
+          *M, Cst->getType(), true, GlobalValue::InternalLinkage, nullptr,
+          "_PromotedConst", nullptr, GlobalVariable::NotThreadLocal);
       PromotedGV->setInitializer(Cst);
       ModuleToMergedGV[M] = PromotedGV;
       DEBUG(dbgs() << "Global replacement: ");
@@ -507,7 +503,7 @@ ARM64PromoteConstant::insertDefinitions(Constant *Cst,
     for (InsertionPoints::iterator IPI = InsertPts.begin(),
                                    EndIPI = InsertPts.end();
          IPI != EndIPI; ++IPI) {
-      // Create the load of the global variable
+      // Create the load of the global variable.
       IRBuilder<> Builder(IPI->first->getParent(), IPI->first);
       LoadInst *LoadedCst = Builder.CreateLoad(PromotedGV);
       DEBUG(dbgs() << "**********\n");
@@ -515,21 +511,19 @@ ARM64PromoteConstant::insertDefinitions(Constant *Cst,
       DEBUG(LoadedCst->print(dbgs()));
       DEBUG(dbgs() << '\n');
 
-      // Update the dominated uses
+      // Update the dominated uses.
       Users &DominatedUsers = IPI->second;
-      for (Users::iterator UseIt = DominatedUsers.begin(),
-                           EndIt = DominatedUsers.end();
-           UseIt != EndIt; ++UseIt) {
+      for (Value::user_iterator Use : DominatedUsers) {
 #ifndef NDEBUG
-        assert((DT.dominates(LoadedCst, cast<Instruction>(**UseIt)) ||
-                (isa<PHINode>(**UseIt) &&
-                 DT.dominates(LoadedCst, findInsertionPoint(*UseIt)))) &&
+        assert((DT.dominates(LoadedCst, cast<Instruction>(*Use)) ||
+                (isa<PHINode>(*Use) &&
+                 DT.dominates(LoadedCst, findInsertionPoint(Use)))) &&
                "Inserted definition does not dominate all its uses!");
 #endif
-        DEBUG(dbgs() << "Use to update " << UseIt->getOperandNo() << ":");
-        DEBUG((*UseIt)->print(dbgs()));
+        DEBUG(dbgs() << "Use to update " << Use.getOperandNo() << ":");
+        DEBUG(Use->print(dbgs()));
         DEBUG(dbgs() << '\n');
-        (*UseIt)->setOperand(UseIt->getOperandNo(), LoadedCst);
+        Use->setOperand(Use.getOperandNo(), LoadedCst);
         ++NumPromotedUses;
       }
     }
@@ -537,13 +531,13 @@ ARM64PromoteConstant::insertDefinitions(Constant *Cst,
   return HasChanged;
 }
 
-bool ARM64PromoteConstant::computeAndInsertDefinitions(Constant *Val) {
+bool AArch64PromoteConstant::computeAndInsertDefinitions(Constant *Val) {
   InsertionPointsPerFunc InsertPtsPerFunc;
   computeInsertionPoints(Val, InsertPtsPerFunc);
   return insertDefinitions(Val, InsertPtsPerFunc);
 }
 
-bool ARM64PromoteConstant::promoteConstant(Constant *Cst) {
+bool AArch64PromoteConstant::promoteConstant(Constant *Cst) {
   assert(Cst && "Given variable is not a valid constant.");
 
   if (!shouldConvert(Cst))
@@ -557,24 +551,23 @@ bool ARM64PromoteConstant::promoteConstant(Constant *Cst) {
   return computeAndInsertDefinitions(Cst);
 }
 
-bool ARM64PromoteConstant::runOnFunction(Function &F) {
-  // Look for instructions using constant vector
-  // Promote that constant to a global variable.
-  // Create as few load of this variable as possible and update the uses
-  // accordingly
+bool AArch64PromoteConstant::runOnFunction(Function &F) {
+  // Look for instructions using constant vector. Promote that constant to a
+  // global variable. Create as few loads of this variable as possible and
+  // update the uses accordingly.
   bool LocalChange = false;
   SmallSet<Constant *, 8> AlreadyChecked;
 
   for (auto &MBB : F) {
-    for (auto &MI: MBB) {
-      // Traverse the operand, looking for constant vectors
-      // Replace them by a load of a global variable of type constant vector
+    for (auto &MI : MBB) {
+      // Traverse the operand, looking for constant vectors. Replace them by a
+      // load of a global variable of constant vector type.
       for (unsigned OpIdx = 0, EndOpIdx = MI.getNumOperands();
            OpIdx != EndOpIdx; ++OpIdx) {
         Constant *Cst = dyn_cast<Constant>(MI.getOperand(OpIdx));
-        // There is no point is promoting global value, they are already global.
-        // Do not promote constant expression, as they may require some code
-        // expansion.
+        // There is no point in promoting global values as they are already
+        // global. Do not promote constant expressions either, as they may
+        // require some code expansion.
         if (Cst && !isa<GlobalValue>(Cst) && !isa<ConstantExpr>(Cst) &&
             AlreadyChecked.insert(Cst))
           LocalChange |= promoteConstant(Cst);
diff --git a/lib/Target/AArch64/AArch64RegisterInfo.cpp b/lib/Target/AArch64/AArch64RegisterInfo.cpp
index 06e1ffb..01b9587 100644
--- a/lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ b/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -12,175 +12,393 @@
 //
 //===----------------------------------------------------------------------===//
 
-
 #include "AArch64RegisterInfo.h"
 #include "AArch64FrameLowering.h"
-#include "AArch64MachineFunctionInfo.h"
-#include "AArch64TargetMachine.h"
-#include "MCTargetDesc/AArch64MCTargetDesc.h"
+#include "AArch64InstrInfo.h"
+#include "AArch64Subtarget.h"
+#include "MCTargetDesc/AArch64AddressingModes.h"
 #include "llvm/ADT/BitVector.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetOptions.h"
+
+using namespace llvm;
 
 #define GET_REGINFO_TARGET_DESC
 #include "AArch64GenRegisterInfo.inc"
 
-using namespace llvm;
+AArch64RegisterInfo::AArch64RegisterInfo(const AArch64InstrInfo *tii,
+                                         const AArch64Subtarget *sti)
+    : AArch64GenRegisterInfo(AArch64::LR), TII(tii), STI(sti) {}
 
-AArch64RegisterInfo::AArch64RegisterInfo()
-  : AArch64GenRegisterInfo(AArch64::X30) {
-}
-
-const uint16_t *
+const MCPhysReg *
 AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
-  return CSR_PCS_SaveList;
-}
-
-const uint32_t*
-AArch64RegisterInfo::getCallPreservedMask(CallingConv::ID) const {
-  return CSR_PCS_RegMask;
+  assert(MF && "Invalid MachineFunction pointer.");
+  if (MF->getFunction()->getCallingConv() == CallingConv::AnyReg)
+    return CSR_AArch64_AllRegs_SaveList;
+  else
+    return CSR_AArch64_AAPCS_SaveList;
 }
 
-const uint32_t *AArch64RegisterInfo::getTLSDescCallPreservedMask() const {
-  return TLSDesc_RegMask;
+const uint32_t *
+AArch64RegisterInfo::getCallPreservedMask(CallingConv::ID CC) const {
+  if (CC == CallingConv::AnyReg)
+    return CSR_AArch64_AllRegs_RegMask;
+  else
+    return CSR_AArch64_AAPCS_RegMask;
 }
 
-const TargetRegisterClass *
-AArch64RegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const {
-  if (RC == &AArch64::FlagClassRegClass)
-    return &AArch64::GPR64RegClass;
+const uint32_t *AArch64RegisterInfo::getTLSCallPreservedMask() const {
+  if (STI->isTargetDarwin())
+    return CSR_AArch64_TLS_Darwin_RegMask;
 
-  return RC;
+  assert(STI->isTargetELF() && "only expect Darwin or ELF TLS");
+  return CSR_AArch64_TLS_ELF_RegMask;
 }
 
-
+const uint32_t *
+AArch64RegisterInfo::getThisReturnPreservedMask(CallingConv::ID) const {
+  // This should return a register mask that is the same as that returned by
+  // getCallPreservedMask but that additionally preserves the register used for
+  // the first i64 argument (which must also be the register used to return a
+  // single i64 return value)
+  //
+  // In case that the calling convention does not use the same register for
+  // both, the function should return NULL (does not currently apply)
+  return CSR_AArch64_AAPCS_ThisReturn_RegMask;
+}
 
 BitVector
 AArch64RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
-  BitVector Reserved(getNumRegs());
   const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
 
-  Reserved.set(AArch64::XSP);
-  Reserved.set(AArch64::WSP);
-
+  // FIXME: avoid re-calculating this every time.
+  BitVector Reserved(getNumRegs());
+  Reserved.set(AArch64::SP);
   Reserved.set(AArch64::XZR);
+  Reserved.set(AArch64::WSP);
   Reserved.set(AArch64::WZR);
 
-  if (TFI->hasFP(MF)) {
-    Reserved.set(AArch64::X29);
+  if (TFI->hasFP(MF) || STI->isTargetDarwin()) {
+    Reserved.set(AArch64::FP);
     Reserved.set(AArch64::W29);
   }
 
+  if (STI->isTargetDarwin()) {
+    Reserved.set(AArch64::X18); // Platform register
+    Reserved.set(AArch64::W18);
+  }
+
+  if (hasBasePointer(MF)) {
+    Reserved.set(AArch64::X19);
+    Reserved.set(AArch64::W19);
+  }
+
   return Reserved;
 }
 
-static bool hasFrameOffset(int opcode) {
-  return opcode != AArch64::LD1x2_8B  && opcode != AArch64::LD1x3_8B  &&
-         opcode != AArch64::LD1x4_8B  && opcode != AArch64::ST1x2_8B  &&
-         opcode != AArch64::ST1x3_8B  && opcode != AArch64::ST1x4_8B  &&
-         opcode != AArch64::LD1x2_16B && opcode != AArch64::LD1x3_16B &&
-         opcode != AArch64::LD1x4_16B && opcode != AArch64::ST1x2_16B &&
-         opcode != AArch64::ST1x3_16B && opcode != AArch64::ST1x4_16B;
-}
+bool AArch64RegisterInfo::isReservedReg(const MachineFunction &MF,
+                                      unsigned Reg) const {
+  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
 
-void
-AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MBBI,
-                                         int SPAdj,
-                                         unsigned FIOperandNum,
-                                         RegScavenger *RS) const {
-  assert(SPAdj == 0 && "Cannot deal with nonzero SPAdj yet");
-  MachineInstr &MI = *MBBI;
-  MachineBasicBlock &MBB = *MI.getParent();
-  MachineFunction &MF = *MBB.getParent();
-  MachineFrameInfo *MFI = MF.getFrameInfo();
-  const AArch64FrameLowering *TFI =
-   static_cast<const AArch64FrameLowering *>(MF.getTarget().getFrameLowering());
-
-  // In order to work out the base and offset for addressing, the FrameLowering
-  // code needs to know (sometimes) whether the instruction is storing/loading a
-  // callee-saved register, or whether it's a more generic
-  // operation. Fortunately the frame indices are used *only* for that purpose
-  // and are contiguous, so we can check here.
-  const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
-  int MinCSFI = 0;
-  int MaxCSFI = -1;
-
-  if (CSI.size()) {
-    MinCSFI = CSI[0].getFrameIdx();
-    MaxCSFI = CSI[CSI.size() - 1].getFrameIdx();
+  switch (Reg) {
+  default:
+    break;
+  case AArch64::SP:
+  case AArch64::XZR:
+  case AArch64::WSP:
+  case AArch64::WZR:
+    return true;
+  case AArch64::X18:
+  case AArch64::W18:
+    return STI->isTargetDarwin();
+  case AArch64::FP:
+  case AArch64::W29:
+    return TFI->hasFP(MF) || STI->isTargetDarwin();
+  case AArch64::W19:
+  case AArch64::X19:
+    return hasBasePointer(MF);
   }
 
-  int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
-  bool IsCalleeSaveOp = FrameIndex >= MinCSFI && FrameIndex <= MaxCSFI;
+  return false;
+}
 
-  unsigned FrameReg;
-  int64_t Offset;
-  Offset = TFI->resolveFrameIndexReference(MF, FrameIndex, FrameReg, SPAdj,
-                                           IsCalleeSaveOp);
-  // A vector load/store instruction doesn't have an offset operand.
-  bool HasOffsetOp = hasFrameOffset(MI.getOpcode());
-  if (HasOffsetOp)
-    Offset += MI.getOperand(FIOperandNum + 1).getImm();
+const TargetRegisterClass *
+AArch64RegisterInfo::getPointerRegClass(const MachineFunction &MF,
+                                      unsigned Kind) const {
+  return &AArch64::GPR64RegClass;
+}
 
-  // DBG_VALUE instructions have no real restrictions so they can be handled
-  // easily.
-  if (MI.isDebugValue()) {
-    MI.getOperand(FIOperandNum).ChangeToRegister(FrameReg, /*isDef=*/ false);
-    MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset);
-    return;
-  }
+const TargetRegisterClass *
+AArch64RegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const {
+  if (RC == &AArch64::CCRRegClass)
+    return &AArch64::GPR64RegClass; // Only MSR & MRS copy NZCV.
+  return RC;
+}
 
-  const AArch64InstrInfo &TII =
-    *static_cast<const AArch64InstrInfo*>(MF.getTarget().getInstrInfo());
-  int MinOffset, MaxOffset, OffsetScale;
-  if (MI.getOpcode() == AArch64::ADDxxi_lsl0_s || !HasOffsetOp) {
-    MinOffset = 0;
-    MaxOffset = 0xfff;
-    OffsetScale = 1;
-  } else {
-    // Load/store of a stack object
-    TII.getAddressConstraints(MI, OffsetScale, MinOffset, MaxOffset);
-  }
+unsigned AArch64RegisterInfo::getBaseRegister() const { return AArch64::X19; }
 
-  // There are two situations we don't use frame + offset directly in the
-  // instruction:
-  // (1) The offset can't really be scaled
-  // (2) Can't encode offset as it doesn't have an offset operand
-  if ((Offset % OffsetScale != 0 || Offset < MinOffset || Offset > MaxOffset) ||
-      (!HasOffsetOp && Offset != 0)) {
-    unsigned BaseReg =
-      MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
-    emitRegUpdate(MBB, MBBI, MBBI->getDebugLoc(), TII,
-                  BaseReg, FrameReg, BaseReg, Offset);
-    FrameReg = BaseReg;
-    Offset = 0;
-  }
+bool AArch64RegisterInfo::hasBasePointer(const MachineFunction &MF) const {
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
 
-  // Negative offsets are expected if we address from FP, but for
-  // now this checks nothing has gone horribly wrong.
-  assert(Offset >= 0 && "Unexpected negative offset from SP");
+  // In the presence of variable sized objects, if the fixed stack size is
+  // large enough that referencing from the FP won't result in things being
+  // in range relatively often, we can use a base pointer to allow access
+  // from the other direction like the SP normally works.
+  if (MFI->hasVarSizedObjects()) {
+    // Conservatively estimate whether the negative offset from the frame
+    // pointer will be sufficient to reach. If a function has a smallish
+    // frame, it's less likely to have lots of spills and callee saved
+    // space, so it's all more likely to be within range of the frame pointer.
+    // If it's wrong, we'll materialize the constant and still get to the
+    // object; it's just suboptimal. Negative offsets use the unscaled
+    // load/store instructions, which have a 9-bit signed immediate.
+    if (MFI->getLocalFrameSize() < 256)
+      return false;
+    return true;
+  }
 
-  MI.getOperand(FIOperandNum).ChangeToRegister(FrameReg, false, false, true);
-  if (HasOffsetOp)
-    MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset / OffsetScale);
+  return false;
 }
 
 unsigned
 AArch64RegisterInfo::getFrameRegister(const MachineFunction &MF) const {
   const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
 
-  if (TFI->hasFP(MF))
-    return AArch64::X29;
-  else
-    return AArch64::XSP;
+  return TFI->hasFP(MF) ? AArch64::FP : AArch64::SP;
+}
+
+bool AArch64RegisterInfo::requiresRegisterScavenging(
+    const MachineFunction &MF) const {
+  return true;
+}
+
+bool AArch64RegisterInfo::requiresVirtualBaseRegisters(
+    const MachineFunction &MF) const {
+  return true;
 }
 
 bool
 AArch64RegisterInfo::useFPForScavengingIndex(const MachineFunction &MF) const {
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  // AArch64FrameLowering::resolveFrameIndexReference() can always fall back
+  // to the stack pointer, so only put the emergency spill slot next to the
+  // FP when there's no better way to access it (SP or base pointer).
+  return MFI->hasVarSizedObjects() && !hasBasePointer(MF);
+}
+
+bool AArch64RegisterInfo::requiresFrameIndexScavenging(
+    const MachineFunction &MF) const {
+  return true;
+}
+
+bool
+AArch64RegisterInfo::cannotEliminateFrame(const MachineFunction &MF) const {
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  // Only consider eliminating leaf frames.
+  if (MFI->hasCalls() || (MF.getTarget().Options.DisableFramePointerElim(MF) &&
+                          MFI->adjustsStack()))
+    return true;
+  return MFI->hasVarSizedObjects() || MFI->isFrameAddressTaken();
+}
+
+/// needsFrameBaseReg - Returns true if the instruction's frame index
+/// reference would be better served by a base register other than FP
+/// or SP. Used by LocalStackFrameAllocation to determine which frame index
+/// references it should create new base registers for.
+bool AArch64RegisterInfo::needsFrameBaseReg(MachineInstr *MI,
+                                            int64_t Offset) const {
+  for (unsigned i = 0; !MI->getOperand(i).isFI(); ++i)
+    assert(i < MI->getNumOperands() &&
+           "Instr doesn't have FrameIndex operand!");
+
+  // It's the load/store FI references that cause issues, as it can be difficult
+  // to materialize the offset if it won't fit in the literal field. Estimate
+  // based on the size of the local frame and some conservative assumptions
+  // about the rest of the stack frame (note, this is pre-regalloc, so
+  // we don't know everything for certain yet) whether this offset is likely
+  // to be out of range of the immediate. Return true if so.
+
+  // We only generate virtual base registers for loads and stores, so
+  // return false for everything else.
+  if (!MI->mayLoad() && !MI->mayStore())
+    return false;
+
+  // Without a virtual base register, if the function has variable sized
+  // objects, all fixed-size local references will be via the frame pointer,
+  // Approximate the offset and see if it's legal for the instruction.
+  // Note that the incoming offset is based on the SP value at function entry,
+  // so it'll be negative.
+  MachineFunction &MF = *MI->getParent()->getParent();
   const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
-  const AArch64FrameLowering *AFI
-    = static_cast<const AArch64FrameLowering*>(TFI);
-  return AFI->useFPForAddressing(MF);
+  MachineFrameInfo *MFI = MF.getFrameInfo();
+
+  // Estimate an offset from the frame pointer.
+  // Conservatively assume all GPR callee-saved registers get pushed.
+  // FP, LR, X19-X28, D8-D15. 64-bits each.
+  int64_t FPOffset = Offset - 16 * 20;
+  // Estimate an offset from the stack pointer.
+  // The incoming offset is relating to the SP at the start of the function,
+  // but when we access the local it'll be relative to the SP after local
+  // allocation, so adjust our SP-relative offset by that allocation size.
+  Offset += MFI->getLocalFrameSize();
+  // Assume that we'll have at least some spill slots allocated.
+  // FIXME: This is a total SWAG number. We should run some statistics
+  //        and pick a real one.
+  Offset += 128; // 128 bytes of spill slots
+
+  // If there is a frame pointer, try using it.
+  // The FP is only available if there is no dynamic realignment. We
+  // don't know for sure yet whether we'll need that, so we guess based
+  // on whether there are any local variables that would trigger it.
+  if (TFI->hasFP(MF) && isFrameOffsetLegal(MI, FPOffset))
+    return false;
+
+  // If we can reference via the stack pointer or base pointer, try that.
+  // FIXME: This (and the code that resolves the references) can be improved
+  //        to only disallow SP relative references in the live range of
+  //        the VLA(s). In practice, it's unclear how much difference that
+  //        would make, but it may be worth doing.
+  if (isFrameOffsetLegal(MI, Offset))
+    return false;
+
+  // The offset likely isn't legal; we want to allocate a virtual base register.
+  return true;
+}
+
+bool AArch64RegisterInfo::isFrameOffsetLegal(const MachineInstr *MI,
+                                             int64_t Offset) const {
+  assert(Offset <= INT_MAX && "Offset too big to fit in int.");
+  assert(MI && "Unable to get the legal offset for nil instruction.");
+  int SaveOffset = Offset;
+  return isAArch64FrameOffsetLegal(*MI, SaveOffset) & AArch64FrameOffsetIsLegal;
+}
+
+/// Insert defining instruction(s) for BaseReg to be a pointer to FrameIdx
+/// at the beginning of the basic block.
+void AArch64RegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
+                                                       unsigned BaseReg,
+                                                       int FrameIdx,
+                                                       int64_t Offset) const {
+  MachineBasicBlock::iterator Ins = MBB->begin();
+  DebugLoc DL; // Defaults to "unknown"
+  if (Ins != MBB->end())
+    DL = Ins->getDebugLoc();
+
+  const MCInstrDesc &MCID = TII->get(AArch64::ADDXri);
+  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+  const MachineFunction &MF = *MBB->getParent();
+  MRI.constrainRegClass(BaseReg, TII->getRegClass(MCID, 0, this, MF));
+  unsigned Shifter = AArch64_AM::getShifterImm(AArch64_AM::LSL, 0);
+
+  BuildMI(*MBB, Ins, DL, MCID, BaseReg)
+      .addFrameIndex(FrameIdx)
+      .addImm(Offset)
+      .addImm(Shifter);
 }
+
+void AArch64RegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
+                                            int64_t Offset) const {
+  int Off = Offset; // ARM doesn't need the general 64-bit offsets
+  unsigned i = 0;
+
+  while (!MI.getOperand(i).isFI()) {
+    ++i;
+    assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!");
+  }
+  bool Done = rewriteAArch64FrameIndex(MI, i, BaseReg, Off, TII);
+  assert(Done && "Unable to resolve frame index!");
+  (void)Done;
+}
+
+void AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
+                                              int SPAdj, unsigned FIOperandNum,
+                                              RegScavenger *RS) const {
+  assert(SPAdj == 0 && "Unexpected");
+
+  MachineInstr &MI = *II;
+  MachineBasicBlock &MBB = *MI.getParent();
+  MachineFunction &MF = *MBB.getParent();
+  const AArch64FrameLowering *TFI = static_cast<const AArch64FrameLowering *>(
+      MF.getTarget().getFrameLowering());
+
+  int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
+  unsigned FrameReg;
+  int Offset;
+
+  // Special handling of dbg_value, stackmap and patchpoint instructions.
+  if (MI.isDebugValue() || MI.getOpcode() == TargetOpcode::STACKMAP ||
+      MI.getOpcode() == TargetOpcode::PATCHPOINT) {
+    Offset = TFI->resolveFrameIndexReference(MF, FrameIndex, FrameReg,
+                                             /*PreferFP=*/true);
+    Offset += MI.getOperand(FIOperandNum + 1).getImm();
+    MI.getOperand(FIOperandNum).ChangeToRegister(FrameReg, false /*isDef*/);
+    MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset);
+    return;
+  }
+
+  // Modify MI as necessary to handle as much of 'Offset' as possible
+  Offset = TFI->resolveFrameIndexReference(MF, FrameIndex, FrameReg);
+  if (rewriteAArch64FrameIndex(MI, FIOperandNum, FrameReg, Offset, TII))
+    return;
+
+  assert((!RS || !RS->isScavengingFrameIndex(FrameIndex)) &&
+         "Emergency spill slot is out of reach");
+
+  // If we get here, the immediate doesn't fit into the instruction.  We folded
+  // as much as possible above.  Handle the rest, providing a register that is
+  // SP+LargeImm.
+  unsigned ScratchReg =
+      MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
+  emitFrameOffset(MBB, II, MI.getDebugLoc(), ScratchReg, FrameReg, Offset, TII);
+  MI.getOperand(FIOperandNum).ChangeToRegister(ScratchReg, false, false, true);
+}
+
+namespace llvm {
+
+unsigned AArch64RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
+                                                  MachineFunction &MF) const {
+  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
+
+  switch (RC->getID()) {
+  default:
+    return 0;
+  case AArch64::GPR32RegClassID:
+  case AArch64::GPR32spRegClassID:
+  case AArch64::GPR32allRegClassID:
+  case AArch64::GPR64spRegClassID:
+  case AArch64::GPR64allRegClassID:
+  case AArch64::GPR64RegClassID:
+  case AArch64::GPR32commonRegClassID:
+  case AArch64::GPR64commonRegClassID:
+    return 32 - 1                                      // XZR/SP
+           - (TFI->hasFP(MF) || STI->isTargetDarwin()) // FP
+           - STI->isTargetDarwin() // X18 reserved as platform register
+           - hasBasePointer(MF);   // X19
+  case AArch64::FPR8RegClassID:
+  case AArch64::FPR16RegClassID:
+  case AArch64::FPR32RegClassID:
+  case AArch64::FPR64RegClassID:
+  case AArch64::FPR128RegClassID:
+    return 32;
+
+  case AArch64::DDRegClassID:
+  case AArch64::DDDRegClassID:
+  case AArch64::DDDDRegClassID:
+  case AArch64::QQRegClassID:
+  case AArch64::QQQRegClassID:
+  case AArch64::QQQQRegClassID:
+    return 32;
+
+  case AArch64::FPR128_loRegClassID:
+    return 16;
+  }
+}
+
+} // namespace llvm
diff --git a/lib/Target/AArch64/AArch64RegisterInfo.h b/lib/Target/AArch64/AArch64RegisterInfo.h
index 4d67943..76af1ed 100644
--- a/lib/Target/AArch64/AArch64RegisterInfo.h
+++ b/lib/Target/AArch64/AArch64RegisterInfo.h
@@ -1,4 +1,4 @@
-//==- AArch64RegisterInfo.h - AArch64 Register Information Impl -*- C++ -*-===//
+//==- AArch64RegisterInfo.h - AArch64 Register Information Impl --*- C++ -*-==//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,14 +7,12 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This file contains the AArch64 implementation of the MCRegisterInfo class.
+// This file contains the AArch64 implementation of the MRegisterInfo class.
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_TARGET_AARCH64REGISTERINFO_H
-#define LLVM_TARGET_AARCH64REGISTERINFO_H
-
-#include "llvm/Target/TargetRegisterInfo.h"
+#ifndef LLVM_TARGET_AArch64REGISTERINFO_H
+#define LLVM_TARGET_AArch64REGISTERINFO_H
 
 #define GET_REGINFO_HEADER
 #include "AArch64GenRegisterInfo.inc"
@@ -23,49 +21,81 @@ namespace llvm {
 
 class AArch64InstrInfo;
 class AArch64Subtarget;
+class MachineFunction;
+class RegScavenger;
+class TargetRegisterClass;
 
 struct AArch64RegisterInfo : public AArch64GenRegisterInfo {
-  AArch64RegisterInfo();
+private:
+  const AArch64InstrInfo *TII;
+  const AArch64Subtarget *STI;
 
-  const uint16_t *getCalleeSavedRegs(const MachineFunction *MF = 0) const;
-  const uint32_t *getCallPreservedMask(CallingConv::ID) const;
+public:
+  AArch64RegisterInfo(const AArch64InstrInfo *tii, const AArch64Subtarget *sti);
 
-  const uint32_t *getTLSDescCallPreservedMask() const;
+  bool isReservedReg(const MachineFunction &MF, unsigned Reg) const;
 
-  BitVector getReservedRegs(const MachineFunction &MF) const;
-  unsigned getFrameRegister(const MachineFunction &MF) const;
+  /// Code Generation virtual methods...
+  const MCPhysReg *
+  getCalleeSavedRegs(const MachineFunction *MF = nullptr) const override;
+  const uint32_t *getCallPreservedMask(CallingConv::ID) const override;
 
-  void eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj,
-                           unsigned FIOperandNum,
-                           RegScavenger *Rs = NULL) const;
+  unsigned getCSRFirstUseCost() const override {
+    // The cost will be compared against BlockFrequency where entry has the
+    // value of 1 << 14. A value of 5 will choose to spill or split really
+    // cold path instead of using a callee-saved register.
+    return 5;
+  }
 
-  /// getCrossCopyRegClass - Returns a legal register class to copy a register
-  /// in the specified class to or from. Returns original class if it is
-  /// possible to copy between a two registers of the specified class.
+  // Calls involved in thread-local variable lookup save more registers than
+  // normal calls, so they need a different mask to represent this.
+  const uint32_t *getTLSCallPreservedMask() const;
+
+  /// getThisReturnPreservedMask - Returns a call preserved mask specific to the
+  /// case that 'returned' is on an i64 first argument if the calling convention
+  /// is one that can (partially) model this attribute with a preserved mask
+  /// (i.e. it is a calling convention that uses the same register for the first
+  /// i64 argument and an i64 return value)
+  ///
+  /// Should return NULL in the case that the calling convention does not have
+  /// this property
+  const uint32_t *getThisReturnPreservedMask(CallingConv::ID) const;
+
+  BitVector getReservedRegs(const MachineFunction &MF) const override;
   const TargetRegisterClass *
-  getCrossCopyRegClass(const TargetRegisterClass *RC) const;
-
-  /// getLargestLegalSuperClass - Returns the largest super class of RC that is
-  /// legal to use in the current sub-target and has the same spill size.
-  const TargetRegisterClass*
-  getLargestLegalSuperClass(const TargetRegisterClass *RC) const {
-    if (RC == &AArch64::tcGPR64RegClass)
-      return &AArch64::GPR64RegClass;
-
-    return RC;
-  }
+  getPointerRegClass(const MachineFunction &MF,
+                     unsigned Kind = 0) const override;
+  const TargetRegisterClass *
+  getCrossCopyRegClass(const TargetRegisterClass *RC) const override;
+
+  bool requiresRegisterScavenging(const MachineFunction &MF) const override;
+  bool useFPForScavengingIndex(const MachineFunction &MF) const override;
+  bool requiresFrameIndexScavenging(const MachineFunction &MF) const override;
+
+  bool needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const override;
+  bool isFrameOffsetLegal(const MachineInstr *MI,
+                          int64_t Offset) const override;
+  void materializeFrameBaseRegister(MachineBasicBlock *MBB, unsigned BaseReg,
+                                    int FrameIdx,
+                                    int64_t Offset) const override;
+  void resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
+                         int64_t Offset) const override;
+  void eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj,
+                           unsigned FIOperandNum,
+                           RegScavenger *RS = nullptr) const override;
+  bool cannotEliminateFrame(const MachineFunction &MF) const;
 
-  bool requiresRegisterScavenging(const MachineFunction &MF) const {
-    return true;
-  }
+  bool requiresVirtualBaseRegisters(const MachineFunction &MF) const override;
+  bool hasBasePointer(const MachineFunction &MF) const;
+  unsigned getBaseRegister() const;
 
-  bool requiresFrameIndexScavenging(const MachineFunction &MF) const {
-    return true;
-  }
+  // Debug information queries.
+  unsigned getFrameRegister(const MachineFunction &MF) const override;
 
-  bool useFPForScavengingIndex(const MachineFunction &MF) const;
+  unsigned getRegPressureLimit(const TargetRegisterClass *RC,
+                               MachineFunction &MF) const override;
 };
 
 } // end namespace llvm
 
-#endif // LLVM_TARGET_AARCH64REGISTERINFO_H
+#endif // LLVM_TARGET_AArch64REGISTERINFO_H
diff --git a/lib/Target/AArch64/AArch64RegisterInfo.td b/lib/Target/AArch64/AArch64RegisterInfo.td
index 9de7abd..21c927f 100644
--- a/lib/Target/AArch64/AArch64RegisterInfo.td
+++ b/lib/Target/AArch64/AArch64RegisterInfo.td
@@ -1,4 +1,4 @@
-//===- AArch64RegisterInfo.td - ARM Register defs ----------*- tablegen -*-===//
+//=- AArch64RegisterInfo.td - Describe the AArch64 Regisers --*- tablegen -*-=//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,284 +7,587 @@
 //
 //===----------------------------------------------------------------------===//
 //
-//  This file contains declarations that describe the AArch64 register file
 //
 //===----------------------------------------------------------------------===//
 
-let Namespace = "AArch64" in {
-def sub_128 : SubRegIndex<128>;
-def sub_64 : SubRegIndex<64>;
-def sub_32 : SubRegIndex<32>;
-def sub_16 : SubRegIndex<16>;
-def sub_8  : SubRegIndex<8>;
-
-// Note: Code depends on these having consecutive numbers.
-def qqsub : SubRegIndex<256, 256>;
-
-def qsub_0 : SubRegIndex<128>;
-def qsub_1 : SubRegIndex<128, 128>;
-def qsub_2 : ComposedSubRegIndex<qqsub, qsub_0>;
-def qsub_3 : ComposedSubRegIndex<qqsub, qsub_1>;
-
-def dsub_0 : SubRegIndex<64>;
-def dsub_1 : SubRegIndex<64, 64>;
-def dsub_2 : ComposedSubRegIndex<qsub_1, dsub_0>;
-def dsub_3 : ComposedSubRegIndex<qsub_1, dsub_1>;
-}
 
-// Registers are identified with 5-bit ID numbers.
-class AArch64Reg<bits<16> enc, string n> : Register<n> {
+class AArch64Reg<bits<16> enc, string n, list<Register> subregs = [],
+               list<string> altNames = []>
+        : Register<n, altNames> {
   let HWEncoding = enc;
   let Namespace = "AArch64";
+  let SubRegs = subregs;
 }
 
-class AArch64RegWithSubs<bits<16> enc, string n, list<Register> subregs = [],
-                         list<SubRegIndex> inds = []>
-      : AArch64Reg<enc, n> {
-  let SubRegs = subregs;
-  let SubRegIndices = inds;
+let Namespace = "AArch64" in {
+  def sub_32 : SubRegIndex<32>;
+
+  def bsub : SubRegIndex<8>;
+  def hsub : SubRegIndex<16>;
+  def ssub : SubRegIndex<32>;
+  def dsub : SubRegIndex<32>;
+  def qhisub : SubRegIndex<64>;
+  def qsub : SubRegIndex<64>;
+  // Note: Code depends on these having consecutive numbers
+  def dsub0 : SubRegIndex<64>;
+  def dsub1 : SubRegIndex<64>;
+  def dsub2 : SubRegIndex<64>;
+  def dsub3 : SubRegIndex<64>;
+  // Note: Code depends on these having consecutive numbers
+  def qsub0 : SubRegIndex<128>;
+  def qsub1 : SubRegIndex<128>;
+  def qsub2 : SubRegIndex<128>;
+  def qsub3 : SubRegIndex<128>;
+}
+
+let Namespace = "AArch64" in {
+  def vreg : RegAltNameIndex;
+  def vlist1 : RegAltNameIndex;
 }
 
 //===----------------------------------------------------------------------===//
-//  Integer registers: w0-w30, wzr, wsp, x0-x30, xzr, sp
+// Registers
 //===----------------------------------------------------------------------===//
-
-foreach Index = 0-30 in {
-  def W#Index : AArch64Reg< Index, "w"#Index>, DwarfRegNum<[Index]>;
+def W0    : AArch64Reg<0,   "w0" >, DwarfRegNum<[0]>;
+def W1    : AArch64Reg<1,   "w1" >, DwarfRegNum<[1]>;
+def W2    : AArch64Reg<2,   "w2" >, DwarfRegNum<[2]>;
+def W3    : AArch64Reg<3,   "w3" >, DwarfRegNum<[3]>;
+def W4    : AArch64Reg<4,   "w4" >, DwarfRegNum<[4]>;
+def W5    : AArch64Reg<5,   "w5" >, DwarfRegNum<[5]>;
+def W6    : AArch64Reg<6,   "w6" >, DwarfRegNum<[6]>;
+def W7    : AArch64Reg<7,   "w7" >, DwarfRegNum<[7]>;
+def W8    : AArch64Reg<8,   "w8" >, DwarfRegNum<[8]>;
+def W9    : AArch64Reg<9,   "w9" >, DwarfRegNum<[9]>;
+def W10   : AArch64Reg<10, "w10">, DwarfRegNum<[10]>;
+def W11   : AArch64Reg<11, "w11">, DwarfRegNum<[11]>;
+def W12   : AArch64Reg<12, "w12">, DwarfRegNum<[12]>;
+def W13   : AArch64Reg<13, "w13">, DwarfRegNum<[13]>;
+def W14   : AArch64Reg<14, "w14">, DwarfRegNum<[14]>;
+def W15   : AArch64Reg<15, "w15">, DwarfRegNum<[15]>;
+def W16   : AArch64Reg<16, "w16">, DwarfRegNum<[16]>;
+def W17   : AArch64Reg<17, "w17">, DwarfRegNum<[17]>;
+def W18   : AArch64Reg<18, "w18">, DwarfRegNum<[18]>;
+def W19   : AArch64Reg<19, "w19">, DwarfRegNum<[19]>;
+def W20   : AArch64Reg<20, "w20">, DwarfRegNum<[20]>;
+def W21   : AArch64Reg<21, "w21">, DwarfRegNum<[21]>;
+def W22   : AArch64Reg<22, "w22">, DwarfRegNum<[22]>;
+def W23   : AArch64Reg<23, "w23">, DwarfRegNum<[23]>;
+def W24   : AArch64Reg<24, "w24">, DwarfRegNum<[24]>;
+def W25   : AArch64Reg<25, "w25">, DwarfRegNum<[25]>;
+def W26   : AArch64Reg<26, "w26">, DwarfRegNum<[26]>;
+def W27   : AArch64Reg<27, "w27">, DwarfRegNum<[27]>;
+def W28   : AArch64Reg<28, "w28">, DwarfRegNum<[28]>;
+def W29   : AArch64Reg<29, "w29">, DwarfRegNum<[29]>;
+def W30   : AArch64Reg<30, "w30">, DwarfRegNum<[30]>;
+def WSP   : AArch64Reg<31, "wsp">, DwarfRegNum<[31]>;
+def WZR   : AArch64Reg<31, "wzr">, DwarfRegAlias<WSP>;
+
+let SubRegIndices = [sub_32] in {
+def X0    : AArch64Reg<0,   "x0",  [W0]>, DwarfRegAlias<W0>;
+def X1    : AArch64Reg<1,   "x1",  [W1]>, DwarfRegAlias<W1>;
+def X2    : AArch64Reg<2,   "x2",  [W2]>, DwarfRegAlias<W2>;
+def X3    : AArch64Reg<3,   "x3",  [W3]>, DwarfRegAlias<W3>;
+def X4    : AArch64Reg<4,   "x4",  [W4]>, DwarfRegAlias<W4>;
+def X5    : AArch64Reg<5,   "x5",  [W5]>, DwarfRegAlias<W5>;
+def X6    : AArch64Reg<6,   "x6",  [W6]>, DwarfRegAlias<W6>;
+def X7    : AArch64Reg<7,   "x7",  [W7]>, DwarfRegAlias<W7>;
+def X8    : AArch64Reg<8,   "x8",  [W8]>, DwarfRegAlias<W8>;
+def X9    : AArch64Reg<9,   "x9",  [W9]>, DwarfRegAlias<W9>;
+def X10   : AArch64Reg<10, "x10", [W10]>, DwarfRegAlias<W10>;
+def X11   : AArch64Reg<11, "x11", [W11]>, DwarfRegAlias<W11>;
+def X12   : AArch64Reg<12, "x12", [W12]>, DwarfRegAlias<W12>;
+def X13   : AArch64Reg<13, "x13", [W13]>, DwarfRegAlias<W13>;
+def X14   : AArch64Reg<14, "x14", [W14]>, DwarfRegAlias<W14>;
+def X15   : AArch64Reg<15, "x15", [W15]>, DwarfRegAlias<W15>;
+def X16   : AArch64Reg<16, "x16", [W16]>, DwarfRegAlias<W16>;
+def X17   : AArch64Reg<17, "x17", [W17]>, DwarfRegAlias<W17>;
+def X18   : AArch64Reg<18, "x18", [W18]>, DwarfRegAlias<W18>;
+def X19   : AArch64Reg<19, "x19", [W19]>, DwarfRegAlias<W19>;
+def X20   : AArch64Reg<20, "x20", [W20]>, DwarfRegAlias<W20>;
+def X21   : AArch64Reg<21, "x21", [W21]>, DwarfRegAlias<W21>;
+def X22   : AArch64Reg<22, "x22", [W22]>, DwarfRegAlias<W22>;
+def X23   : AArch64Reg<23, "x23", [W23]>, DwarfRegAlias<W23>;
+def X24   : AArch64Reg<24, "x24", [W24]>, DwarfRegAlias<W24>;
+def X25   : AArch64Reg<25, "x25", [W25]>, DwarfRegAlias<W25>;
+def X26   : AArch64Reg<26, "x26", [W26]>, DwarfRegAlias<W26>;
+def X27   : AArch64Reg<27, "x27", [W27]>, DwarfRegAlias<W27>;
+def X28   : AArch64Reg<28, "x28", [W28]>, DwarfRegAlias<W28>;
+def FP    : AArch64Reg<29, "x29", [W29]>, DwarfRegAlias<W29>;
+def LR    : AArch64Reg<30, "x30", [W30]>, DwarfRegAlias<W30>;
+def SP    : AArch64Reg<31, "sp",  [WSP]>, DwarfRegAlias<WSP>;
+def XZR   : AArch64Reg<31, "xzr", [WZR]>, DwarfRegAlias<WSP>;
 }
 
-def WSP : AArch64Reg<31, "wsp">, DwarfRegNum<[31]>;
-def WZR : AArch64Reg<31, "wzr">;
+// Condition code register.
+def NZCV  : AArch64Reg<0, "nzcv">;
 
-// Could be combined with previous loop, but this way leaves w and x registers
-// consecutive as LLVM register numbers, which makes for easier debugging.
-foreach Index = 0-30 in {
-  def X#Index : AArch64RegWithSubs<Index, "x"#Index,
-                                   [!cast<Register>("W"#Index)], [sub_32]>,
-                DwarfRegNum<[Index]>;
+// GPR register classes with the intersections of GPR32/GPR32sp and
+// GPR64/GPR64sp for use by the coalescer.
+def GPR32common : RegisterClass<"AArch64", [i32], 32, (sequence "W%u", 0, 30)> {
+  let AltOrders = [(rotl GPR32common, 8)];
+  let AltOrderSelect = [{ return 1; }];
+}
+def GPR64common : RegisterClass<"AArch64", [i64], 64,
+                                (add (sequence "X%u", 0, 28), FP, LR)> {
+  let AltOrders = [(rotl GPR64common, 8)];
+  let AltOrderSelect = [{ return 1; }];
+}
+// GPR register classes which exclude SP/WSP.
+def GPR32 : RegisterClass<"AArch64", [i32], 32, (add GPR32common, WZR)> {
+  let AltOrders = [(rotl GPR32, 8)];
+  let AltOrderSelect = [{ return 1; }];
+}
+def GPR64 : RegisterClass<"AArch64", [i64], 64, (add GPR64common, XZR)> {
+  let AltOrders = [(rotl GPR64, 8)];
+  let AltOrderSelect = [{ return 1; }];
 }
 
-def XSP : AArch64RegWithSubs<31, "sp", [WSP], [sub_32]>, DwarfRegNum<[31]>;
-def XZR : AArch64RegWithSubs<31, "xzr", [WZR], [sub_32]>;
+// GPR register classes which include SP/WSP.
+def GPR32sp : RegisterClass<"AArch64", [i32], 32, (add GPR32common, WSP)> {
+  let AltOrders = [(rotl GPR32sp, 8)];
+  let AltOrderSelect = [{ return 1; }];
+}
+def GPR64sp : RegisterClass<"AArch64", [i64], 64, (add GPR64common, SP)> {
+  let AltOrders = [(rotl GPR64sp, 8)];
+  let AltOrderSelect = [{ return 1; }];
+}
 
-// Most instructions treat register 31 as zero for reads and a black-hole for
-// writes.
+def GPR32sponly : RegisterClass<"AArch64", [i32], 32, (add WSP)>;
+def GPR64sponly : RegisterClass<"AArch64", [i64], 64, (add SP)>;
 
-// Note that the order of registers is important for the Disassembler here:
-// tablegen uses it to form MCRegisterClass::getRegister, which we assume can
-// take an encoding value.
-def GPR32 : RegisterClass<"AArch64", [i32], 32,
-                          (add (sequence "W%u", 0, 30), WZR)> {
+def GPR64spPlus0Operand : AsmOperandClass {
+  let Name = "GPR64sp0";
+  let RenderMethod = "addRegOperands";
+  let ParserMethod = "tryParseGPR64sp0Operand";
 }
 
-def GPR64 : RegisterClass<"AArch64", [i64], 64,
-                          (add (sequence "X%u", 0, 30), XZR)> {
+def GPR64sp0 : RegisterOperand<GPR64sp> {
+  let ParserMatchClass = GPR64spPlus0Operand;
 }
 
-def GPR32nowzr : RegisterClass<"AArch64", [i32], 32,
-                               (sequence "W%u", 0, 30)> {
+// GPR register classes which include WZR/XZR AND SP/WSP. This is not a
+// constraint used by any instructions, it is used as a common super-class.
+def GPR32all : RegisterClass<"AArch64", [i32], 32, (add GPR32common, WZR, WSP)>;
+def GPR64all : RegisterClass<"AArch64", [i64], 64, (add GPR64common, XZR, SP)>;
+
+// For tail calls, we can't use callee-saved registers, as they are restored
+// to the saved value before the tail call, which would clobber a call address.
+// This is for indirect tail calls to store the address of the destination.
+def tcGPR64 : RegisterClass<"AArch64", [i64], 64, (sub GPR64common, X19, X20, X21,
+                                                     X22, X23, X24, X25, X26,
+                                                     X27, X28)>;
+
+// GPR register classes for post increment amount of vector load/store that
+// has alternate printing when Rm=31 and prints a constant immediate value
+// equal to the total number of bytes transferred.
+
+// FIXME: TableGen *should* be able to do these itself now. There appears to be
+// a bug in counting how many operands a Post-indexed MCInst should have which
+// means the aliases don't trigger.
+def GPR64pi1  : RegisterOperand<GPR64, "printPostIncOperand<1>">;
+def GPR64pi2  : RegisterOperand<GPR64, "printPostIncOperand<2>">;
+def GPR64pi3  : RegisterOperand<GPR64, "printPostIncOperand<3>">;
+def GPR64pi4  : RegisterOperand<GPR64, "printPostIncOperand<4>">;
+def GPR64pi6  : RegisterOperand<GPR64, "printPostIncOperand<6>">;
+def GPR64pi8  : RegisterOperand<GPR64, "printPostIncOperand<8>">;
+def GPR64pi12 : RegisterOperand<GPR64, "printPostIncOperand<12>">;
+def GPR64pi16 : RegisterOperand<GPR64, "printPostIncOperand<16>">;
+def GPR64pi24 : RegisterOperand<GPR64, "printPostIncOperand<24>">;
+def GPR64pi32 : RegisterOperand<GPR64, "printPostIncOperand<32>">;
+def GPR64pi48 : RegisterOperand<GPR64, "printPostIncOperand<48>">;
+def GPR64pi64 : RegisterOperand<GPR64, "printPostIncOperand<64>">;
+
+// Condition code regclass.
+def CCR : RegisterClass<"AArch64", [i32], 32, (add NZCV)> {
+  let CopyCost = -1;  // Don't allow copying of status registers.
+
+  // CCR is not allocatable.
+  let isAllocatable = 0;
 }
 
-def GPR64noxzr : RegisterClass<"AArch64", [i64], 64,
-                               (sequence "X%u", 0, 30)> {
-}
+//===----------------------------------------------------------------------===//
+// Floating Point Scalar Registers
+//===----------------------------------------------------------------------===//
 
-// For tail calls, we can't use callee-saved registers or the structure-return
-// register, as they are supposed to be live across function calls and may be
-// clobbered by the epilogue.
-def tcGPR64 : RegisterClass<"AArch64", [i64], 64,
-                            (add (sequence "X%u", 0, 7),
-                                 (sequence "X%u", 9, 18))> {
+def B0    : AArch64Reg<0,   "b0">, DwarfRegNum<[64]>;
+def B1    : AArch64Reg<1,   "b1">, DwarfRegNum<[65]>;
+def B2    : AArch64Reg<2,   "b2">, DwarfRegNum<[66]>;
+def B3    : AArch64Reg<3,   "b3">, DwarfRegNum<[67]>;
+def B4    : AArch64Reg<4,   "b4">, DwarfRegNum<[68]>;
+def B5    : AArch64Reg<5,   "b5">, DwarfRegNum<[69]>;
+def B6    : AArch64Reg<6,   "b6">, DwarfRegNum<[70]>;
+def B7    : AArch64Reg<7,   "b7">, DwarfRegNum<[71]>;
+def B8    : AArch64Reg<8,   "b8">, DwarfRegNum<[72]>;
+def B9    : AArch64Reg<9,   "b9">, DwarfRegNum<[73]>;
+def B10   : AArch64Reg<10, "b10">, DwarfRegNum<[74]>;
+def B11   : AArch64Reg<11, "b11">, DwarfRegNum<[75]>;
+def B12   : AArch64Reg<12, "b12">, DwarfRegNum<[76]>;
+def B13   : AArch64Reg<13, "b13">, DwarfRegNum<[77]>;
+def B14   : AArch64Reg<14, "b14">, DwarfRegNum<[78]>;
+def B15   : AArch64Reg<15, "b15">, DwarfRegNum<[79]>;
+def B16   : AArch64Reg<16, "b16">, DwarfRegNum<[80]>;
+def B17   : AArch64Reg<17, "b17">, DwarfRegNum<[81]>;
+def B18   : AArch64Reg<18, "b18">, DwarfRegNum<[82]>;
+def B19   : AArch64Reg<19, "b19">, DwarfRegNum<[83]>;
+def B20   : AArch64Reg<20, "b20">, DwarfRegNum<[84]>;
+def B21   : AArch64Reg<21, "b21">, DwarfRegNum<[85]>;
+def B22   : AArch64Reg<22, "b22">, DwarfRegNum<[86]>;
+def B23   : AArch64Reg<23, "b23">, DwarfRegNum<[87]>;
+def B24   : AArch64Reg<24, "b24">, DwarfRegNum<[88]>;
+def B25   : AArch64Reg<25, "b25">, DwarfRegNum<[89]>;
+def B26   : AArch64Reg<26, "b26">, DwarfRegNum<[90]>;
+def B27   : AArch64Reg<27, "b27">, DwarfRegNum<[91]>;
+def B28   : AArch64Reg<28, "b28">, DwarfRegNum<[92]>;
+def B29   : AArch64Reg<29, "b29">, DwarfRegNum<[93]>;
+def B30   : AArch64Reg<30, "b30">, DwarfRegNum<[94]>;
+def B31   : AArch64Reg<31, "b31">, DwarfRegNum<[95]>;
+
+let SubRegIndices = [bsub] in {
+def H0    : AArch64Reg<0,   "h0", [B0]>, DwarfRegAlias<B0>;
+def H1    : AArch64Reg<1,   "h1", [B1]>, DwarfRegAlias<B1>;
+def H2    : AArch64Reg<2,   "h2", [B2]>, DwarfRegAlias<B2>;
+def H3    : AArch64Reg<3,   "h3", [B3]>, DwarfRegAlias<B3>;
+def H4    : AArch64Reg<4,   "h4", [B4]>, DwarfRegAlias<B4>;
+def H5    : AArch64Reg<5,   "h5", [B5]>, DwarfRegAlias<B5>;
+def H6    : AArch64Reg<6,   "h6", [B6]>, DwarfRegAlias<B6>;
+def H7    : AArch64Reg<7,   "h7", [B7]>, DwarfRegAlias<B7>;
+def H8    : AArch64Reg<8,   "h8", [B8]>, DwarfRegAlias<B8>;
+def H9    : AArch64Reg<9,   "h9", [B9]>, DwarfRegAlias<B9>;
+def H10   : AArch64Reg<10, "h10", [B10]>, DwarfRegAlias<B10>;
+def H11   : AArch64Reg<11, "h11", [B11]>, DwarfRegAlias<B11>;
+def H12   : AArch64Reg<12, "h12", [B12]>, DwarfRegAlias<B12>;
+def H13   : AArch64Reg<13, "h13", [B13]>, DwarfRegAlias<B13>;
+def H14   : AArch64Reg<14, "h14", [B14]>, DwarfRegAlias<B14>;
+def H15   : AArch64Reg<15, "h15", [B15]>, DwarfRegAlias<B15>;
+def H16   : AArch64Reg<16, "h16", [B16]>, DwarfRegAlias<B16>;
+def H17   : AArch64Reg<17, "h17", [B17]>, DwarfRegAlias<B17>;
+def H18   : AArch64Reg<18, "h18", [B18]>, DwarfRegAlias<B18>;
+def H19   : AArch64Reg<19, "h19", [B19]>, DwarfRegAlias<B19>;
+def H20   : AArch64Reg<20, "h20", [B20]>, DwarfRegAlias<B20>;
+def H21   : AArch64Reg<21, "h21", [B21]>, DwarfRegAlias<B21>;
+def H22   : AArch64Reg<22, "h22", [B22]>, DwarfRegAlias<B22>;
+def H23   : AArch64Reg<23, "h23", [B23]>, DwarfRegAlias<B23>;
+def H24   : AArch64Reg<24, "h24", [B24]>, DwarfRegAlias<B24>;
+def H25   : AArch64Reg<25, "h25", [B25]>, DwarfRegAlias<B25>;
+def H26   : AArch64Reg<26, "h26", [B26]>, DwarfRegAlias<B26>;
+def H27   : AArch64Reg<27, "h27", [B27]>, DwarfRegAlias<B27>;
+def H28   : AArch64Reg<28, "h28", [B28]>, DwarfRegAlias<B28>;
+def H29   : AArch64Reg<29, "h29", [B29]>, DwarfRegAlias<B29>;
+def H30   : AArch64Reg<30, "h30", [B30]>, DwarfRegAlias<B30>;
+def H31   : AArch64Reg<31, "h31", [B31]>, DwarfRegAlias<B31>;
 }
 
+let SubRegIndices = [hsub] in {
+def S0    : AArch64Reg<0,   "s0", [H0]>, DwarfRegAlias<B0>;
+def S1    : AArch64Reg<1,   "s1", [H1]>, DwarfRegAlias<B1>;
+def S2    : AArch64Reg<2,   "s2", [H2]>, DwarfRegAlias<B2>;
+def S3    : AArch64Reg<3,   "s3", [H3]>, DwarfRegAlias<B3>;
+def S4    : AArch64Reg<4,   "s4", [H4]>, DwarfRegAlias<B4>;
+def S5    : AArch64Reg<5,   "s5", [H5]>, DwarfRegAlias<B5>;
+def S6    : AArch64Reg<6,   "s6", [H6]>, DwarfRegAlias<B6>;
+def S7    : AArch64Reg<7,   "s7", [H7]>, DwarfRegAlias<B7>;
+def S8    : AArch64Reg<8,   "s8", [H8]>, DwarfRegAlias<B8>;
+def S9    : AArch64Reg<9,   "s9", [H9]>, DwarfRegAlias<B9>;
+def S10   : AArch64Reg<10, "s10", [H10]>, DwarfRegAlias<B10>;
+def S11   : AArch64Reg<11, "s11", [H11]>, DwarfRegAlias<B11>;
+def S12   : AArch64Reg<12, "s12", [H12]>, DwarfRegAlias<B12>;
+def S13   : AArch64Reg<13, "s13", [H13]>, DwarfRegAlias<B13>;
+def S14   : AArch64Reg<14, "s14", [H14]>, DwarfRegAlias<B14>;
+def S15   : AArch64Reg<15, "s15", [H15]>, DwarfRegAlias<B15>;
+def S16   : AArch64Reg<16, "s16", [H16]>, DwarfRegAlias<B16>;
+def S17   : AArch64Reg<17, "s17", [H17]>, DwarfRegAlias<B17>;
+def S18   : AArch64Reg<18, "s18", [H18]>, DwarfRegAlias<B18>;
+def S19   : AArch64Reg<19, "s19", [H19]>, DwarfRegAlias<B19>;
+def S20   : AArch64Reg<20, "s20", [H20]>, DwarfRegAlias<B20>;
+def S21   : AArch64Reg<21, "s21", [H21]>, DwarfRegAlias<B21>;
+def S22   : AArch64Reg<22, "s22", [H22]>, DwarfRegAlias<B22>;
+def S23   : AArch64Reg<23, "s23", [H23]>, DwarfRegAlias<B23>;
+def S24   : AArch64Reg<24, "s24", [H24]>, DwarfRegAlias<B24>;
+def S25   : AArch64Reg<25, "s25", [H25]>, DwarfRegAlias<B25>;
+def S26   : AArch64Reg<26, "s26", [H26]>, DwarfRegAlias<B26>;
+def S27   : AArch64Reg<27, "s27", [H27]>, DwarfRegAlias<B27>;
+def S28   : AArch64Reg<28, "s28", [H28]>, DwarfRegAlias<B28>;
+def S29   : AArch64Reg<29, "s29", [H29]>, DwarfRegAlias<B29>;
+def S30   : AArch64Reg<30, "s30", [H30]>, DwarfRegAlias<B30>;
+def S31   : AArch64Reg<31, "s31", [H31]>, DwarfRegAlias<B31>;
+}
 
-// Certain addressing-useful instructions accept sp directly. Again the order of
-// registers is important to the Disassembler.
-def GPR32wsp : RegisterClass<"AArch64", [i32], 32,
-                             (add (sequence "W%u", 0, 30), WSP)> {
+let SubRegIndices = [ssub], RegAltNameIndices = [vreg, vlist1] in {
+def D0    : AArch64Reg<0,   "d0", [S0], ["v0", ""]>, DwarfRegAlias<B0>;
+def D1    : AArch64Reg<1,   "d1", [S1], ["v1", ""]>, DwarfRegAlias<B1>;
+def D2    : AArch64Reg<2,   "d2", [S2], ["v2", ""]>, DwarfRegAlias<B2>;
+def D3    : AArch64Reg<3,   "d3", [S3], ["v3", ""]>, DwarfRegAlias<B3>;
+def D4    : AArch64Reg<4,   "d4", [S4], ["v4", ""]>, DwarfRegAlias<B4>;
+def D5    : AArch64Reg<5,   "d5", [S5], ["v5", ""]>, DwarfRegAlias<B5>;
+def D6    : AArch64Reg<6,   "d6", [S6], ["v6", ""]>, DwarfRegAlias<B6>;
+def D7    : AArch64Reg<7,   "d7", [S7], ["v7", ""]>, DwarfRegAlias<B7>;
+def D8    : AArch64Reg<8,   "d8", [S8], ["v8", ""]>, DwarfRegAlias<B8>;
+def D9    : AArch64Reg<9,   "d9", [S9], ["v9", ""]>, DwarfRegAlias<B9>;
+def D10   : AArch64Reg<10, "d10", [S10], ["v10", ""]>, DwarfRegAlias<B10>;
+def D11   : AArch64Reg<11, "d11", [S11], ["v11", ""]>, DwarfRegAlias<B11>;
+def D12   : AArch64Reg<12, "d12", [S12], ["v12", ""]>, DwarfRegAlias<B12>;
+def D13   : AArch64Reg<13, "d13", [S13], ["v13", ""]>, DwarfRegAlias<B13>;
+def D14   : AArch64Reg<14, "d14", [S14], ["v14", ""]>, DwarfRegAlias<B14>;
+def D15   : AArch64Reg<15, "d15", [S15], ["v15", ""]>, DwarfRegAlias<B15>;
+def D16   : AArch64Reg<16, "d16", [S16], ["v16", ""]>, DwarfRegAlias<B16>;
+def D17   : AArch64Reg<17, "d17", [S17], ["v17", ""]>, DwarfRegAlias<B17>;
+def D18   : AArch64Reg<18, "d18", [S18], ["v18", ""]>, DwarfRegAlias<B18>;
+def D19   : AArch64Reg<19, "d19", [S19], ["v19", ""]>, DwarfRegAlias<B19>;
+def D20   : AArch64Reg<20, "d20", [S20], ["v20", ""]>, DwarfRegAlias<B20>;
+def D21   : AArch64Reg<21, "d21", [S21], ["v21", ""]>, DwarfRegAlias<B21>;
+def D22   : AArch64Reg<22, "d22", [S22], ["v22", ""]>, DwarfRegAlias<B22>;
+def D23   : AArch64Reg<23, "d23", [S23], ["v23", ""]>, DwarfRegAlias<B23>;
+def D24   : AArch64Reg<24, "d24", [S24], ["v24", ""]>, DwarfRegAlias<B24>;
+def D25   : AArch64Reg<25, "d25", [S25], ["v25", ""]>, DwarfRegAlias<B25>;
+def D26   : AArch64Reg<26, "d26", [S26], ["v26", ""]>, DwarfRegAlias<B26>;
+def D27   : AArch64Reg<27, "d27", [S27], ["v27", ""]>, DwarfRegAlias<B27>;
+def D28   : AArch64Reg<28, "d28", [S28], ["v28", ""]>, DwarfRegAlias<B28>;
+def D29   : AArch64Reg<29, "d29", [S29], ["v29", ""]>, DwarfRegAlias<B29>;
+def D30   : AArch64Reg<30, "d30", [S30], ["v30", ""]>, DwarfRegAlias<B30>;
+def D31   : AArch64Reg<31, "d31", [S31], ["v31", ""]>, DwarfRegAlias<B31>;
 }
 
-def GPR64xsp : RegisterClass<"AArch64", [i64], 64,
-                             (add (sequence "X%u", 0, 30), XSP)> {
+let SubRegIndices = [dsub], RegAltNameIndices = [vreg, vlist1] in {
+def Q0    : AArch64Reg<0,   "q0", [D0], ["v0", ""]>, DwarfRegAlias<B0>;
+def Q1    : AArch64Reg<1,   "q1", [D1], ["v1", ""]>, DwarfRegAlias<B1>;
+def Q2    : AArch64Reg<2,   "q2", [D2], ["v2", ""]>, DwarfRegAlias<B2>;
+def Q3    : AArch64Reg<3,   "q3", [D3], ["v3", ""]>, DwarfRegAlias<B3>;
+def Q4    : AArch64Reg<4,   "q4", [D4], ["v4", ""]>, DwarfRegAlias<B4>;
+def Q5    : AArch64Reg<5,   "q5", [D5], ["v5", ""]>, DwarfRegAlias<B5>;
+def Q6    : AArch64Reg<6,   "q6", [D6], ["v6", ""]>, DwarfRegAlias<B6>;
+def Q7    : AArch64Reg<7,   "q7", [D7], ["v7", ""]>, DwarfRegAlias<B7>;
+def Q8    : AArch64Reg<8,   "q8", [D8], ["v8", ""]>, DwarfRegAlias<B8>;
+def Q9    : AArch64Reg<9,   "q9", [D9], ["v9", ""]>, DwarfRegAlias<B9>;
+def Q10   : AArch64Reg<10, "q10", [D10], ["v10", ""]>, DwarfRegAlias<B10>;
+def Q11   : AArch64Reg<11, "q11", [D11], ["v11", ""]>, DwarfRegAlias<B11>;
+def Q12   : AArch64Reg<12, "q12", [D12], ["v12", ""]>, DwarfRegAlias<B12>;
+def Q13   : AArch64Reg<13, "q13", [D13], ["v13", ""]>, DwarfRegAlias<B13>;
+def Q14   : AArch64Reg<14, "q14", [D14], ["v14", ""]>, DwarfRegAlias<B14>;
+def Q15   : AArch64Reg<15, "q15", [D15], ["v15", ""]>, DwarfRegAlias<B15>;
+def Q16   : AArch64Reg<16, "q16", [D16], ["v16", ""]>, DwarfRegAlias<B16>;
+def Q17   : AArch64Reg<17, "q17", [D17], ["v17", ""]>, DwarfRegAlias<B17>;
+def Q18   : AArch64Reg<18, "q18", [D18], ["v18", ""]>, DwarfRegAlias<B18>;
+def Q19   : AArch64Reg<19, "q19", [D19], ["v19", ""]>, DwarfRegAlias<B19>;
+def Q20   : AArch64Reg<20, "q20", [D20], ["v20", ""]>, DwarfRegAlias<B20>;
+def Q21   : AArch64Reg<21, "q21", [D21], ["v21", ""]>, DwarfRegAlias<B21>;
+def Q22   : AArch64Reg<22, "q22", [D22], ["v22", ""]>, DwarfRegAlias<B22>;
+def Q23   : AArch64Reg<23, "q23", [D23], ["v23", ""]>, DwarfRegAlias<B23>;
+def Q24   : AArch64Reg<24, "q24", [D24], ["v24", ""]>, DwarfRegAlias<B24>;
+def Q25   : AArch64Reg<25, "q25", [D25], ["v25", ""]>, DwarfRegAlias<B25>;
+def Q26   : AArch64Reg<26, "q26", [D26], ["v26", ""]>, DwarfRegAlias<B26>;
+def Q27   : AArch64Reg<27, "q27", [D27], ["v27", ""]>, DwarfRegAlias<B27>;
+def Q28   : AArch64Reg<28, "q28", [D28], ["v28", ""]>, DwarfRegAlias<B28>;
+def Q29   : AArch64Reg<29, "q29", [D29], ["v29", ""]>, DwarfRegAlias<B29>;
+def Q30   : AArch64Reg<30, "q30", [D30], ["v30", ""]>, DwarfRegAlias<B30>;
+def Q31   : AArch64Reg<31, "q31", [D31], ["v31", ""]>, DwarfRegAlias<B31>;
 }
 
-// Some aliases *only* apply to SP (e.g. MOV uses different encoding for SP and
-// non-SP variants). We can't use a bare register in those patterns because
-// TableGen doesn't like it, so we need a class containing just stack registers
-def Rxsp : RegisterClass<"AArch64", [i64], 64,
-                         (add XSP)> {
+def FPR8  : RegisterClass<"AArch64", [untyped], 8, (sequence "B%u", 0, 31)> {
+  let Size = 8;
 }
+def FPR16 : RegisterClass<"AArch64", [f16], 16, (sequence "H%u", 0, 31)> {
+  let Size = 16;
+}
+def FPR32 : RegisterClass<"AArch64", [f32, i32], 32,(sequence "S%u", 0, 31)>;
+def FPR64 : RegisterClass<"AArch64", [f64, i64, v2f32, v1f64, v8i8, v4i16, v2i32,
+                                    v1i64],
+                                    64, (sequence "D%u", 0, 31)>;
+// We don't (yet) have an f128 legal type, so don't use that here. We
+// normalize 128-bit vectors to v2f64 for arg passing and such, so use
+// that here.
+def FPR128 : RegisterClass<"AArch64",
+                           [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64, f128],
+                           128, (sequence "Q%u", 0, 31)>;
 
-def Rwsp : RegisterClass<"AArch64", [i32], 32,
-                         (add WSP)> {
+// The lower 16 vector registers.  Some instructions can only take registers
+// in this range.
+def FPR128_lo : RegisterClass<"AArch64",
+                              [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
+                              128, (trunc FPR128, 16)>;
+
+// Pairs, triples, and quads of 64-bit vector registers.
+def DSeqPairs : RegisterTuples<[dsub0, dsub1], [(rotl FPR64, 0), (rotl FPR64, 1)]>;
+def DSeqTriples : RegisterTuples<[dsub0, dsub1, dsub2],
+                                 [(rotl FPR64, 0), (rotl FPR64, 1),
+                                  (rotl FPR64, 2)]>;
+def DSeqQuads : RegisterTuples<[dsub0, dsub1, dsub2, dsub3],
+                               [(rotl FPR64, 0), (rotl FPR64, 1),
+                                (rotl FPR64, 2), (rotl FPR64, 3)]>;
+def DD   : RegisterClass<"AArch64", [untyped], 64, (add DSeqPairs)> {
+  let Size = 128;
+}
+def DDD  : RegisterClass<"AArch64", [untyped], 64, (add DSeqTriples)> {
+  let Size = 196;
+}
+def DDDD : RegisterClass<"AArch64", [untyped], 64, (add DSeqQuads)> {
+  let Size = 256;
 }
 
-//===----------------------------------------------------------------------===//
-//  Scalar registers in the vector unit:
-//  b0-b31, h0-h31, s0-s31, d0-d31, q0-q31
-//===----------------------------------------------------------------------===//
+// Pairs, triples, and quads of 128-bit vector registers.
+def QSeqPairs : RegisterTuples<[qsub0, qsub1], [(rotl FPR128, 0), (rotl FPR128, 1)]>;
+def QSeqTriples : RegisterTuples<[qsub0, qsub1, qsub2],
+                                 [(rotl FPR128, 0), (rotl FPR128, 1),
+                                  (rotl FPR128, 2)]>;
+def QSeqQuads : RegisterTuples<[qsub0, qsub1, qsub2, qsub3],
+                               [(rotl FPR128, 0), (rotl FPR128, 1),
+                                (rotl FPR128, 2), (rotl FPR128, 3)]>;
+def QQ   : RegisterClass<"AArch64", [untyped], 128, (add QSeqPairs)> {
+  let Size = 256;
+}
+def QQQ  : RegisterClass<"AArch64", [untyped], 128, (add QSeqTriples)> {
+  let Size = 384;
+}
+def QQQQ : RegisterClass<"AArch64", [untyped], 128, (add QSeqQuads)> {
+  let Size = 512;
+}
 
-foreach Index = 0-31 in {
-  def B # Index : AArch64Reg< Index, "b" # Index>,
-                  DwarfRegNum<[!add(Index, 64)]>;
 
-  def H # Index : AArch64RegWithSubs<Index, "h" # Index,
-                                     [!cast<Register>("B" # Index)], [sub_8]>,
-                  DwarfRegNum<[!add(Index, 64)]>;
+// Vector operand versions of the FP registers. Alternate name printing and
+// assmebler matching.
+def VectorReg64AsmOperand : AsmOperandClass {
+  let Name = "VectorReg64";
+  let PredicateMethod = "isVectorReg";
+}
+def VectorReg128AsmOperand : AsmOperandClass {
+  let Name = "VectorReg128";
+  let PredicateMethod = "isVectorReg";
+}
 
-  def S # Index : AArch64RegWithSubs<Index, "s" # Index,
-                                     [!cast<Register>("H" # Index)], [sub_16]>,
-                  DwarfRegNum<[!add(Index, 64)]>;
+def V64  : RegisterOperand<FPR64, "printVRegOperand"> {
+  let ParserMatchClass = VectorReg64AsmOperand;
+}
 
-  def D # Index : AArch64RegWithSubs<Index, "d" # Index,
-                                     [!cast<Register>("S" # Index)], [sub_32]>,
-                  DwarfRegNum<[!add(Index, 64)]>;
+def V128 : RegisterOperand<FPR128, "printVRegOperand"> {
+  let ParserMatchClass = VectorReg128AsmOperand;
+}
 
-  def Q # Index : AArch64RegWithSubs<Index, "q" # Index,
-                                     [!cast<Register>("D" # Index)], [sub_64]>,
-                  DwarfRegNum<[!add(Index, 64)]>;
+def VectorRegLoAsmOperand : AsmOperandClass { let Name = "VectorRegLo"; }
+def V128_lo : RegisterOperand<FPR128_lo, "printVRegOperand"> {
+  let ParserMatchClass = VectorRegLoAsmOperand;
 }
 
+class TypedVecListAsmOperand<int count, int regsize, int lanes, string kind>
+    : AsmOperandClass {
+  let Name = "TypedVectorList" # count # "_" # lanes # kind;
 
-def FPR8 : RegisterClass<"AArch64", [v1i8], 8,
-                          (sequence "B%u", 0, 31)> {
+  let PredicateMethod
+      = "isTypedVectorList<" # count # ", " # lanes # ", '" # kind # "'>";
+  let RenderMethod = "addVectorList" # regsize # "Operands<" # count # ">";
 }
 
-def FPR16 : RegisterClass<"AArch64", [f16, v1i16], 16,
-                          (sequence "H%u", 0, 31)> {
-}
+class TypedVecListRegOperand<RegisterClass Reg, int lanes, string kind>
+    : RegisterOperand<Reg, "printTypedVectorList<" # lanes # ", '"
+                                                   # kind # "'>">;
 
-def FPR32 : RegisterClass<"AArch64", [f32, v1i32], 32,
-                          (sequence "S%u", 0, 31)> {
-}
+multiclass VectorList<int count, RegisterClass Reg64, RegisterClass Reg128> {
+  // With implicit types (probably on instruction instead). E.g. { v0, v1 }
+  def _64AsmOperand : AsmOperandClass {
+    let Name = NAME # "64";
+    let PredicateMethod = "isImplicitlyTypedVectorList<" # count # ">";
+    let RenderMethod = "addVectorList64Operands<" # count # ">";
+  }
 
-def FPR64 : RegisterClass<"AArch64",
-                          [f64, v2f32, v2i32, v4i16, v8i8, v1i64, v1f64],
-                          64, (sequence "D%u", 0, 31)>;
+  def "64" : RegisterOperand<Reg64, "printImplicitlyTypedVectorList"> {
+    let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_64AsmOperand");
+  }
 
-def FPR128 : RegisterClass<"AArch64",
-                           [f128, v2f64, v2i64, v4f32, v4i32, v8i16, v16i8],
-                           128, (sequence "Q%u", 0, 31)>;
+  def _128AsmOperand : AsmOperandClass {
+    let Name = NAME # "128";
+    let PredicateMethod = "isImplicitlyTypedVectorList<" # count # ">";
+    let RenderMethod = "addVectorList128Operands<" # count # ">";
+  }
+
+  def "128" : RegisterOperand<Reg128, "printImplicitlyTypedVectorList"> {
+    let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_128AsmOperand");
+  }
 
-def FPR64Lo : RegisterClass<"AArch64",
-                            [f64, v2f32, v2i32, v4i16, v8i8, v1i64, v1f64],
-                            64, (sequence "D%u", 0, 15)>;
+  // 64-bit register lists with explicit type.
 
-def FPR128Lo : RegisterClass<"AArch64",
-                             [f128, v2f64, v2i64, v4f32, v4i32, v8i16, v16i8],
-                             128, (sequence "Q%u", 0, 15)>;
+  // { v0.8b, v1.8b }
+  def _8bAsmOperand : TypedVecListAsmOperand<count, 64, 8, "b">;
+  def "8b" : TypedVecListRegOperand<Reg64, 8, "b"> {
+    let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_8bAsmOperand");
+  }
 
-//===----------------------------------------------------------------------===//
-//  Vector registers:
-//===----------------------------------------------------------------------===//
+  // { v0.4h, v1.4h }
+  def _4hAsmOperand : TypedVecListAsmOperand<count, 64, 4, "h">;
+  def "4h" : TypedVecListRegOperand<Reg64, 4, "h"> {
+    let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_4hAsmOperand");
+  }
 
-def VPR64AsmOperand : AsmOperandClass {
-  let Name = "VPR";
-  let PredicateMethod = "isReg";
-  let RenderMethod = "addRegOperands";
-}
+  // { v0.2s, v1.2s }
+  def _2sAsmOperand : TypedVecListAsmOperand<count, 64, 2, "s">;
+  def "2s" : TypedVecListRegOperand<Reg64, 2, "s"> {
+    let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_2sAsmOperand");
+  }
+
+  // { v0.1d, v1.1d }
+  def _1dAsmOperand : TypedVecListAsmOperand<count, 64, 1, "d">;
+  def "1d" : TypedVecListRegOperand<Reg64, 1, "d"> {
+    let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_1dAsmOperand");
+  }
 
-def VPR64 : RegisterOperand<FPR64, "printVPRRegister">;
+  // 128-bit register lists with explicit type
 
-def VPR128 : RegisterOperand<FPR128, "printVPRRegister">;
+  // { v0.16b, v1.16b }
+  def _16bAsmOperand : TypedVecListAsmOperand<count, 128, 16, "b">;
+  def "16b" : TypedVecListRegOperand<Reg128, 16, "b"> {
+    let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_16bAsmOperand");
+  }
 
-def VPR64Lo : RegisterOperand<FPR64Lo, "printVPRRegister">;
+  // { v0.8h, v1.8h }
+  def _8hAsmOperand : TypedVecListAsmOperand<count, 128, 8, "h">;
+  def "8h" : TypedVecListRegOperand<Reg128, 8, "h"> {
+    let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_8hAsmOperand");
+  }
 
-def VPR128Lo : RegisterOperand<FPR128Lo, "printVPRRegister">;
+  // { v0.4s, v1.4s }
+  def _4sAsmOperand : TypedVecListAsmOperand<count, 128, 4, "s">;
+  def "4s" : TypedVecListRegOperand<Reg128, 4, "s"> {
+    let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_4sAsmOperand");
+  }
 
-// Flags register
-def NZCV : Register<"nzcv"> {
-  let Namespace = "AArch64";
-}
+  // { v0.2d, v1.2d }
+  def _2dAsmOperand : TypedVecListAsmOperand<count, 128, 2, "d">;
+  def "2d" : TypedVecListRegOperand<Reg128, 2, "d"> {
+    let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_2dAsmOperand");
+  }
 
-def FlagClass : RegisterClass<"AArch64", [i32], 32, (add NZCV)> {
-  let CopyCost = -1;
-  let isAllocatable = 0;
-}
+  // { v0.b, v1.b }
+  def _bAsmOperand : TypedVecListAsmOperand<count, 128, 0, "b">;
+  def "b" : TypedVecListRegOperand<Reg128, 0, "b"> {
+    let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_bAsmOperand");
+  }
 
-//===----------------------------------------------------------------------===//
-//  Consecutive vector registers
-//===----------------------------------------------------------------------===//
-// 2 Consecutive 64-bit registers: D0_D1, D1_D2, ..., D31_D0
-def Tuples2D : RegisterTuples<[dsub_0, dsub_1],
-                              [(rotl FPR64, 0), (rotl FPR64, 1)]>;
-                              
-// 3 Consecutive 64-bit registers: D0_D1_D2, ..., D31_D0_D1
-def Tuples3D : RegisterTuples<[dsub_0, dsub_1, dsub_2],
-                              [(rotl FPR64, 0), (rotl FPR64, 1),
-                               (rotl FPR64, 2)]>;
-                               
-// 4 Consecutive 64-bit registers: D0_D1_D2_D3, ..., D31_D0_D1_D2
-def Tuples4D : RegisterTuples<[dsub_0, dsub_1, dsub_2, dsub_3],
-                              [(rotl FPR64, 0), (rotl FPR64, 1),
-                               (rotl FPR64, 2), (rotl FPR64, 3)]>;
-
-// 2 Consecutive 128-bit registers: Q0_Q1, Q1_Q2, ..., Q30_Q31
-def Tuples2Q : RegisterTuples<[qsub_0, qsub_1],
-                              [(rotl FPR128, 0), (rotl FPR128, 1)]>;
-
-// 3 Consecutive 128-bit registers: Q0_Q1_Q2, ..., Q31_Q0_Q1
-def Tuples3Q : RegisterTuples<[qsub_0, qsub_1, qsub_2],
-                              [(rotl FPR128, 0), (rotl FPR128, 1),
-                               (rotl FPR128, 2)]>;
-                               
-// 4 Consecutive 128-bit registers: Q0_Q1_Q2_Q3, ..., Q31_Q0_Q1_Q2
-def Tuples4Q : RegisterTuples<[qsub_0, qsub_1, qsub_2, qsub_3],
-                              [(rotl FPR128, 0), (rotl FPR128, 1),
-                               (rotl FPR128, 2), (rotl FPR128, 3)]>;
-
-// The followings are super register classes to model 2/3/4 consecutive
-// 64-bit/128-bit registers.
-
-def DPair : RegisterClass<"AArch64", [v2i64], 64, (add Tuples2D)>;
-
-def DTriple : RegisterClass<"AArch64", [untyped], 64, (add Tuples3D)> {
-  let Size = 192; // 3 x 64 bits, we have no predefined type of that size.
-}
-
-def DQuad : RegisterClass<"AArch64", [v4i64], 64, (add Tuples4D)>;
-
-def QPair : RegisterClass<"AArch64", [v4i64], 128, (add Tuples2Q)>;
-
-def QTriple : RegisterClass<"AArch64", [untyped], 128, (add Tuples3Q)> {
-  let Size = 384; // 3 x 128 bits, we have no predefined type of that size.
-}
-
-def QQuad : RegisterClass<"AArch64", [v8i64], 128, (add Tuples4Q)>;
-
-
-// The followings are vector list operands
-multiclass VectorList_operands<string PREFIX, string LAYOUT, int Count,
-                               RegisterClass RegList> {
-  def _asmoperand : AsmOperandClass {
-    let Name = PREFIX # LAYOUT # Count;
-    let RenderMethod = "addVectorListOperands";
-    let PredicateMethod = 
-        "isVectorList<A64Layout::VL_" # LAYOUT # ", " # Count # ">";
-    let ParserMethod = "ParseVectorList";
+  // { v0.h, v1.h }
+  def _hAsmOperand : TypedVecListAsmOperand<count, 128, 0, "h">;
+  def "h" : TypedVecListRegOperand<Reg128, 0, "h"> {
+    let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_hAsmOperand");
   }
 
-  def _operand : RegisterOperand<RegList,
-        "printVectorList<A64Layout::VL_" # LAYOUT # ", " # Count # ">"> {
-    let ParserMatchClass =
-      !cast<AsmOperandClass>(PREFIX # LAYOUT # "_asmoperand");
+  // { v0.s, v1.s }
+  def _sAsmOperand : TypedVecListAsmOperand<count, 128, 0, "s">;
+  def "s" : TypedVecListRegOperand<Reg128, 0, "s"> {
+    let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_sAsmOperand");
   }
-}
 
-multiclass VectorList_BHSD<string PREFIX, int Count, RegisterClass DRegList,
-                           RegisterClass QRegList> {
-  defm 8B : VectorList_operands<PREFIX, "8B", Count, DRegList>;
-  defm 4H : VectorList_operands<PREFIX, "4H", Count, DRegList>;
-  defm 2S : VectorList_operands<PREFIX, "2S", Count, DRegList>;
-  defm 1D : VectorList_operands<PREFIX, "1D", Count, DRegList>;
-  defm 16B : VectorList_operands<PREFIX, "16B", Count, QRegList>;
-  defm 8H : VectorList_operands<PREFIX, "8H", Count, QRegList>;
-  defm 4S : VectorList_operands<PREFIX, "4S", Count, QRegList>;
-  defm 2D : VectorList_operands<PREFIX, "2D", Count, QRegList>;
+  // { v0.d, v1.d }
+  def _dAsmOperand : TypedVecListAsmOperand<count, 128, 0, "d">;
+  def "d" : TypedVecListRegOperand<Reg128, 0, "d"> {
+    let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_dAsmOperand");
+  }
+
+
 }
 
-// Vector list operand with 1/2/3/4 registers: VOne8B_operand,..., VQuad2D_operand
-defm VOne : VectorList_BHSD<"VOne", 1, FPR64, FPR128>;
-defm VPair : VectorList_BHSD<"VPair", 2, DPair, QPair>;
-defm VTriple : VectorList_BHSD<"VTriple", 3, DTriple, QTriple>;
-defm VQuad : VectorList_BHSD<"VQuad", 4, DQuad, QQuad>;
+defm VecListOne   : VectorList<1, FPR64, FPR128>;
+defm VecListTwo   : VectorList<2, DD,    QQ>;
+defm VecListThree : VectorList<3, DDD,   QQQ>;
+defm VecListFour  : VectorList<4, DDDD,  QQQQ>;
+
+
+// Register operand versions of the scalar FP registers.
+def FPR16Op : RegisterOperand<FPR16, "printOperand">;
+def FPR32Op : RegisterOperand<FPR32, "printOperand">;
+def FPR64Op : RegisterOperand<FPR64, "printOperand">;
+def FPR128Op : RegisterOperand<FPR128, "printOperand">;
diff --git a/lib/Target/AArch64/AArch64SchedA53.td b/lib/Target/AArch64/AArch64SchedA53.td
new file mode 100644
index 0000000..0c3949e
--- /dev/null
+++ b/lib/Target/AArch64/AArch64SchedA53.td
@@ -0,0 +1,291 @@
+//==- AArch64SchedA53.td - Cortex-A53 Scheduling Definitions -*- tablegen -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the itinerary class data for the ARM Cortex A53 processors.
+//
+//===----------------------------------------------------------------------===//
+
+// ===---------------------------------------------------------------------===//
+// The following definitions describe the simpler per-operand machine model.
+// This works with MachineScheduler. See MCSchedModel.h for details.
+
+// Cortex-A53 machine model for scheduling and other instruction cost heuristics.
+def CortexA53Model : SchedMachineModel {
+  let MicroOpBufferSize = 0; // Explicitly set to zero since A53 is in-order.
+  let IssueWidth = 2;        // 2 micro-ops are dispatched per cycle.
+  let MinLatency = 1 ;       // OperandCycles are interpreted as MinLatency.
+  let LoadLatency = 3;       // Optimistic load latency assuming bypass.
+                             // This is overriden by OperandCycles if the
+                             // Itineraries are queried instead.
+  let MispredictPenalty = 9; // Based on "Cortex-A53 Software Optimisation
+                             // Specification - Instruction Timings"
+                             // v 1.0 Spreadsheet
+}
+
+
+//===----------------------------------------------------------------------===//
+// Define each kind of processor resource and number available.
+
+// Modeling each pipeline as a ProcResource using the BufferSize = 0 since
+// Cortex-A53 is in-order.
+
+def A53UnitALU    : ProcResource<2> { let BufferSize = 0; } // Int ALU
+def A53UnitMAC    : ProcResource<1> { let BufferSize = 0; } // Int MAC
+def A53UnitDiv    : ProcResource<1> { let BufferSize = 0; } // Int Division
+def A53UnitLdSt   : ProcResource<1> { let BufferSize = 0; } // Load/Store
+def A53UnitB      : ProcResource<1> { let BufferSize = 0; } // Branch
+def A53UnitFPALU  : ProcResource<1> { let BufferSize = 0; } // FP ALU
+def A53UnitFPMDS  : ProcResource<1> { let BufferSize = 0; } // FP Mult/Div/Sqrt
+
+
+//===----------------------------------------------------------------------===//
+// Subtarget-specific SchedWrite types which both map the ProcResources and
+// set the latency.
+
+let SchedModel = CortexA53Model in {
+
+// ALU - Despite having a full latency of 4, most of the ALU instructions can
+//       forward a cycle earlier and then two cycles earlier in the case of a
+//       shift-only instruction. These latencies will be incorrect when the
+//       result cannot be forwarded, but modeling isn't rocket surgery.
+def : WriteRes<WriteImm, [A53UnitALU]> { let Latency = 3; }
+def : WriteRes<WriteI, [A53UnitALU]> { let Latency = 3; }
+def : WriteRes<WriteISReg, [A53UnitALU]> { let Latency = 3; }
+def : WriteRes<WriteIEReg, [A53UnitALU]> { let Latency = 3; }
+def : WriteRes<WriteIS, [A53UnitALU]> { let Latency = 2; }
+def : WriteRes<WriteExtr, [A53UnitALU]> { let Latency = 3; }
+
+// MAC
+def : WriteRes<WriteIM32, [A53UnitMAC]> { let Latency = 4; }
+def : WriteRes<WriteIM64, [A53UnitMAC]> { let Latency = 4; }
+
+// Div
+def : WriteRes<WriteID32, [A53UnitDiv]> { let Latency = 4; }
+def : WriteRes<WriteID64, [A53UnitDiv]> { let Latency = 4; }
+
+// Load
+def : WriteRes<WriteLD, [A53UnitLdSt]> { let Latency = 4; }
+def : WriteRes<WriteLDIdx, [A53UnitLdSt]> { let Latency = 4; }
+def : WriteRes<WriteLDHi, [A53UnitLdSt]> { let Latency = 4; }
+
+// Vector Load - Vector loads take 1-5 cycles to issue. For the WriteVecLd
+//               below, choosing the median of 3 which makes the latency 6.
+//               May model this more carefully in the future. The remaining
+//               A53WriteVLD# types represent the 1-5 cycle issues explicitly.
+def : WriteRes<WriteVLD, [A53UnitLdSt]> { let Latency = 6;
+                                          let ResourceCycles = [3]; }
+def A53WriteVLD1 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 4; }
+def A53WriteVLD2 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 5;
+                                                  let ResourceCycles = [2]; }
+def A53WriteVLD3 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 6;
+                                                  let ResourceCycles = [3]; }
+def A53WriteVLD4 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 7;
+                                                  let ResourceCycles = [4]; }
+def A53WriteVLD5 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 8;
+                                                  let ResourceCycles = [5]; }
+
+// Pre/Post Indexing - Performed as part of address generation which is already
+//                     accounted for in the WriteST* latencies below
+def : WriteRes<WriteAdr, []> { let Latency = 0; }
+
+// Store
+def : WriteRes<WriteST, [A53UnitLdSt]> { let Latency = 4; }
+def : WriteRes<WriteSTP, [A53UnitLdSt]> { let Latency = 4; }
+def : WriteRes<WriteSTIdx, [A53UnitLdSt]> { let Latency = 4; }
+def : WriteRes<WriteSTX, [A53UnitLdSt]> { let Latency = 4; }
+
+// Vector Store - Similar to vector loads, can take 1-3 cycles to issue.
+def : WriteRes<WriteVST, [A53UnitLdSt]> { let Latency = 5;
+                                          let ResourceCycles = [2];}
+def A53WriteVST1 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 4; }
+def A53WriteVST2 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 5;
+                                                  let ResourceCycles = [2]; }
+def A53WriteVST3 : SchedWriteRes<[A53UnitLdSt]> { let Latency = 6;
+                                                  let ResourceCycles = [3]; }
+
+// Branch
+def : WriteRes<WriteBr, [A53UnitB]>;
+def : WriteRes<WriteBrReg, [A53UnitB]>;
+def : WriteRes<WriteSys, [A53UnitB]>;
+def : WriteRes<WriteBarrier, [A53UnitB]>;
+def : WriteRes<WriteHint, [A53UnitB]>;
+
+// FP ALU
+def : WriteRes<WriteF, [A53UnitFPALU]> { let Latency = 6; }
+def : WriteRes<WriteFCmp, [A53UnitFPALU]> { let Latency = 6; }
+def : WriteRes<WriteFCvt, [A53UnitFPALU]> { let Latency = 6; }
+def : WriteRes<WriteFCopy, [A53UnitFPALU]> { let Latency = 6; }
+def : WriteRes<WriteFImm, [A53UnitFPALU]> { let Latency = 6; }
+def : WriteRes<WriteV, [A53UnitFPALU]> { let Latency = 6; }
+
+// FP Mul, Div, Sqrt
+def : WriteRes<WriteFMul, [A53UnitFPMDS]> { let Latency = 6; }
+def : WriteRes<WriteFDiv, [A53UnitFPMDS]> { let Latency = 33;
+                                            let ResourceCycles = [29]; }
+def A53WriteFMAC : SchedWriteRes<[A53UnitFPMDS]> { let Latency = 10; }
+def A53WriteFDivSP : SchedWriteRes<[A53UnitFPMDS]> { let Latency = 18;
+                                                     let ResourceCycles = [14]; }
+def A53WriteFDivDP : SchedWriteRes<[A53UnitFPMDS]> { let Latency = 33;
+                                                     let ResourceCycles = [29]; }
+def A53WriteFSqrtSP : SchedWriteRes<[A53UnitFPMDS]> { let Latency = 17;
+                                                      let ResourceCycles = [13]; }
+def A53WriteFSqrtDP : SchedWriteRes<[A53UnitFPMDS]> { let Latency = 32;
+                                                      let ResourceCycles = [28]; }
+
+//===----------------------------------------------------------------------===//
+// Subtarget-specific SchedRead types.
+
+// No forwarding for these reads.
+def : ReadAdvance<ReadExtrHi, 0>;
+def : ReadAdvance<ReadAdrBase, 0>;
+def : ReadAdvance<ReadVLD, 0>;
+
+// ALU - Most operands in the ALU pipes are not needed for two cycles. Shiftable
+//       operands are needed one cycle later if and only if they are to be
+//       shifted. Otherwise, they too are needed two cycle later. This same
+//       ReadAdvance applies to Extended registers as well, even though there is
+//       a seperate SchedPredicate for them.
+def : ReadAdvance<ReadI, 2, [WriteImm,WriteI,
+                             WriteISReg, WriteIEReg,WriteIS,
+                             WriteID32,WriteID64,
+                             WriteIM32,WriteIM64]>;
+def A53ReadShifted : SchedReadAdvance<1, [WriteImm,WriteI,
+                                          WriteISReg, WriteIEReg,WriteIS,
+                                          WriteID32,WriteID64,
+                                          WriteIM32,WriteIM64]>;
+def A53ReadNotShifted : SchedReadAdvance<2, [WriteImm,WriteI,
+                                             WriteISReg, WriteIEReg,WriteIS,
+                                             WriteID32,WriteID64,
+                                             WriteIM32,WriteIM64]>;
+def A53ReadISReg : SchedReadVariant<[
+	SchedVar<RegShiftedPred, [A53ReadShifted]>,
+	SchedVar<NoSchedPred, [A53ReadNotShifted]>]>;
+def : SchedAlias<ReadISReg, A53ReadISReg>;
+
+def A53ReadIEReg : SchedReadVariant<[
+	SchedVar<RegExtendedPred, [A53ReadShifted]>,
+	SchedVar<NoSchedPred, [A53ReadNotShifted]>]>;
+def : SchedAlias<ReadIEReg, A53ReadIEReg>;
+
+// MAC - Operands are generally needed one cycle later in the MAC pipe.
+//       Accumulator operands are needed two cycles later.
+def : ReadAdvance<ReadIM, 1, [WriteImm,WriteI,
+                              WriteISReg, WriteIEReg,WriteIS,
+                              WriteID32,WriteID64,
+                              WriteIM32,WriteIM64]>;
+def : ReadAdvance<ReadIMA, 2, [WriteImm,WriteI,
+                               WriteISReg, WriteIEReg,WriteIS,
+                               WriteID32,WriteID64,
+                               WriteIM32,WriteIM64]>;
+
+// Div
+def : ReadAdvance<ReadID, 1, [WriteImm,WriteI,
+                              WriteISReg, WriteIEReg,WriteIS,
+                              WriteID32,WriteID64,
+                              WriteIM32,WriteIM64]>;
+
+//===----------------------------------------------------------------------===//
+// Subtarget-specific InstRWs.
+
+//---
+// Miscellaneous
+//---
+def : InstRW<[WriteI], (instrs COPY)>;
+
+//---
+// Vector Loads
+//---
+def : InstRW<[A53WriteVLD1], (instregex "LD1i(8|16|32|64)$")>;
+def : InstRW<[A53WriteVLD1], (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[A53WriteVLD1], (instregex "LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[A53WriteVLD2], (instregex "LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[A53WriteVLD3], (instregex "LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[A53WriteVLD4], (instregex "LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[A53WriteVLD1, WriteAdr], (instregex "LD1i(8|16|32|64)_POST$")>;
+def : InstRW<[A53WriteVLD1, WriteAdr], (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[A53WriteVLD1, WriteAdr], (instregex "LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[A53WriteVLD2, WriteAdr], (instregex "LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[A53WriteVLD3, WriteAdr], (instregex "LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[A53WriteVLD4, WriteAdr], (instregex "LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+def : InstRW<[A53WriteVLD1], (instregex "LD2i(8|16|32|64)$")>;
+def : InstRW<[A53WriteVLD1], (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[A53WriteVLD2], (instregex "LD2Twov(8b|4h|2s)$")>;
+def : InstRW<[A53WriteVLD4], (instregex "LD2Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[A53WriteVLD1, WriteAdr], (instregex "LD2i(8|16|32|64)(_POST)?$")>;
+def : InstRW<[A53WriteVLD1, WriteAdr], (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)(_POST)?$")>;
+def : InstRW<[A53WriteVLD2, WriteAdr], (instregex "LD2Twov(8b|4h|2s)(_POST)?$")>;
+def : InstRW<[A53WriteVLD4, WriteAdr], (instregex "LD2Twov(16b|8h|4s|2d)(_POST)?$")>;
+
+def : InstRW<[A53WriteVLD2], (instregex "LD3i(8|16|32|64)$")>;
+def : InstRW<[A53WriteVLD2], (instregex "LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[A53WriteVLD4], (instregex "LD3Threev(8b|4h|2s|1d|16b|8h|4s)$")>;
+def : InstRW<[A53WriteVLD3], (instregex "LD3Threev(2d)$")>;
+def : InstRW<[A53WriteVLD2, WriteAdr], (instregex "LD3i(8|16|32|64)_POST$")>;
+def : InstRW<[A53WriteVLD2, WriteAdr], (instregex "LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[A53WriteVLD4, WriteAdr], (instregex "LD3Threev(8b|4h|2s|1d|16b|8h|4s)_POST$")>;
+def : InstRW<[A53WriteVLD3, WriteAdr], (instregex "LD3Threev(2d)_POST$")>;
+
+def : InstRW<[A53WriteVLD2], (instregex "LD4i(8|16|32|64)$")>;
+def : InstRW<[A53WriteVLD2], (instregex "LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[A53WriteVLD5], (instregex "LD4Fourv(8b|4h|2s|1d|16b|8h|4s)$")>;
+def : InstRW<[A53WriteVLD4], (instregex "LD4Fourv(2d)$")>;
+def : InstRW<[A53WriteVLD2, WriteAdr], (instregex "LD4i(8|16|32|64)_POST$")>;
+def : InstRW<[A53WriteVLD2, WriteAdr], (instregex "LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[A53WriteVLD5, WriteAdr], (instregex "LD4Fourv(8b|4h|2s|1d|16b|8h|4s)_POST$")>;
+def : InstRW<[A53WriteVLD4, WriteAdr], (instregex "LD4Fourv(2d)_POST$")>;
+
+//---
+// Vector Stores
+//---
+def : InstRW<[A53WriteVST1], (instregex "ST1i(8|16|32|64)$")>;
+def : InstRW<[A53WriteVST1], (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[A53WriteVST1], (instregex "ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[A53WriteVST2], (instregex "ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[A53WriteVST2], (instregex "ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[A53WriteVST1, WriteAdr], (instregex "ST1i(8|16|32|64)_POST$")>;
+def : InstRW<[A53WriteVST1, WriteAdr], (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[A53WriteVST1, WriteAdr], (instregex "ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[A53WriteVST2, WriteAdr], (instregex "ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[A53WriteVST2, WriteAdr], (instregex "ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+def : InstRW<[A53WriteVST1], (instregex "ST2i(8|16|32|64)$")>;
+def : InstRW<[A53WriteVST1], (instregex "ST2Twov(8b|4h|2s)$")>;
+def : InstRW<[A53WriteVST2], (instregex "ST2Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[A53WriteVST1, WriteAdr], (instregex "ST2i(8|16|32|64)_POST$")>;
+def : InstRW<[A53WriteVST1, WriteAdr], (instregex "ST2Twov(8b|4h|2s)_POST$")>;
+def : InstRW<[A53WriteVST2, WriteAdr], (instregex "ST2Twov(16b|8h|4s|2d)_POST$")>;
+
+def : InstRW<[A53WriteVST2], (instregex "ST3i(8|16|32|64)$")>;
+def : InstRW<[A53WriteVST3], (instregex "ST3Threev(8b|4h|2s|1d|16b|8h|4s)$")>;
+def : InstRW<[A53WriteVST2], (instregex "ST3Threev(2d)$")>;
+def : InstRW<[A53WriteVST2, WriteAdr], (instregex "ST3i(8|16|32|64)_POST$")>;
+def : InstRW<[A53WriteVST3, WriteAdr], (instregex "ST3Threev(8b|4h|2s|1d|16b|8h|4s)_POST$")>;
+def : InstRW<[A53WriteVST2, WriteAdr], (instregex "ST3Threev(2d)_POST$")>;
+
+def : InstRW<[A53WriteVST2], (instregex "ST4i(8|16|32|64)$")>;
+def : InstRW<[A53WriteVST3], (instregex "ST4Fourv(8b|4h|2s|1d|16b|8h|4s)$")>;
+def : InstRW<[A53WriteVST2], (instregex "ST4Fourv(2d)$")>;
+def : InstRW<[A53WriteVST2, WriteAdr], (instregex "ST4i(8|16|32|64)_POST$")>;
+def : InstRW<[A53WriteVST3, WriteAdr], (instregex "ST4Fourv(8b|4h|2s|1d|16b|8h|4s)_POST$")>;
+def : InstRW<[A53WriteVST2, WriteAdr], (instregex "ST4Fourv(2d)_POST$")>;
+
+//---
+// Floating Point MAC, DIV, SQRT
+//---
+def : InstRW<[A53WriteFMAC], (instregex "^FN?M(ADD|SUB).*")>;
+def : InstRW<[A53WriteFMAC], (instregex "^FML(A|S).*")>;
+def : InstRW<[A53WriteFDivSP], (instrs FDIVSrr)>;
+def : InstRW<[A53WriteFDivDP], (instrs FDIVDrr)>;
+def : InstRW<[A53WriteFDivSP], (instregex "^FDIVv.*32$")>;
+def : InstRW<[A53WriteFDivDP], (instregex "^FDIVv.*64$")>;
+def : InstRW<[A53WriteFSqrtSP], (instregex "^.*SQRT.*32$")>;
+def : InstRW<[A53WriteFSqrtDP], (instregex "^.*SQRT.*64$")>;
+
+}
diff --git a/lib/Target/ARM64/ARM64SchedCyclone.td b/lib/Target/AArch64/AArch64SchedCyclone.td
index 65c68b3..a2a1802 100644
--- a/lib/Target/ARM64/ARM64SchedCyclone.td
+++ b/lib/Target/AArch64/AArch64SchedCyclone.td
@@ -1,4 +1,4 @@
-//=- ARMSchedCyclone.td - ARM64 Cyclone Scheduling Defs ------*- tablegen -*-=//
+//=- ARMSchedCyclone.td - AArch64 Cyclone Scheduling Defs ----*- tablegen -*-=//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,7 +7,7 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This file defines the machine model for ARM64 Cyclone to support
+// This file defines the machine model for AArch64 Cyclone to support
 // instruction scheduling and other instruction cost heuristics.
 //
 //===----------------------------------------------------------------------===//
@@ -239,13 +239,13 @@ def : WriteRes<WriteST, [CyUnitLS]> {
 def CyWriteLDIdx : SchedWriteVariant<[
   SchedVar<ScaledIdxPred, [WriteIS, WriteLD]>, // Load from scaled register.
   SchedVar<NoSchedPred,   [WriteLD]>]>;        // Load from register offset.
-def : SchedAlias<WriteLDIdx, CyWriteLDIdx>;    // Map ARM64->Cyclone type.
+def : SchedAlias<WriteLDIdx, CyWriteLDIdx>;    // Map AArch64->Cyclone type.
 
 // EXAMPLE: STR Xn, Xm [, lsl 3]
 def CyWriteSTIdx : SchedWriteVariant<[
   SchedVar<ScaledIdxPred, [WriteIS, WriteST]>, // Store to scaled register.
   SchedVar<NoSchedPred,   [WriteST]>]>;        // Store to register offset.
-def : SchedAlias<WriteSTIdx, CyWriteSTIdx>;    // Map ARM64->Cyclone type.
+def : SchedAlias<WriteSTIdx, CyWriteSTIdx>;    // Map AArch64->Cyclone type.
 
 // Read the (unshifted) base register Xn in the second micro-op one cycle later.
 // EXAMPLE: LDR Xn, Xm [, lsl 3]
@@ -253,7 +253,7 @@ def ReadBaseRS : SchedReadAdvance<1>;
 def CyReadAdrBase : SchedReadVariant<[
   SchedVar<ScaledIdxPred, [ReadBaseRS]>, // Read base reg after shifting offset.
   SchedVar<NoSchedPred,   [ReadDefault]>]>;   // Read base reg with no shift.
-def : SchedAlias<ReadAdrBase, CyReadAdrBase>; // Map ARM64->Cyclone type.
+def : SchedAlias<ReadAdrBase, CyReadAdrBase>; // Map AArch64->Cyclone type.
 
 //---
 // 7.8.9,7.8.11. Load/Store, paired
@@ -342,7 +342,9 @@ def : InstRW<[WriteVMov], (instrs ORRv16i8)>;
 // INS V[x],V[y] is a WriteV.
 
 // FMOVWSr,FMOVXDr,FMOVXDHighr
-def : SchedAlias<WriteFCopy, WriteVLD>;
+def : WriteRes<WriteFCopy, [CyUnitLS]> {
+  let Latency = 5;
+}
 
 // FMOVSWr,FMOVDXr
 def : InstRW<[WriteLD], (instrs FMOVSWr,FMOVDXr,FMOVDXHighr)>;
@@ -849,4 +851,15 @@ def : InstRW<[WriteAdr, WriteVSTPairShuffle], (instregex "ST4i(8|16|32)_POST")>;
 def : InstRW<[WriteVSTShuffle, WriteVSTShuffle],          (instrs ST4i64)>;
 def : InstRW<[WriteAdr, WriteVSTShuffle, WriteVSTShuffle],(instrs ST4i64_POST)>;
 
+//---
+// Unused SchedRead types
+//---
+
+def : ReadAdvance<ReadI, 0>;
+def : ReadAdvance<ReadISReg, 0>;
+def : ReadAdvance<ReadIEReg, 0>;
+def : ReadAdvance<ReadIM, 0>;
+def : ReadAdvance<ReadIMA, 0>;
+def : ReadAdvance<ReadID, 0>;
+
 } // SchedModel = CycloneModel
diff --git a/lib/Target/AArch64/AArch64Schedule.td b/lib/Target/AArch64/AArch64Schedule.td
index ec8450b..eaa9110 100644
--- a/lib/Target/AArch64/AArch64Schedule.td
+++ b/lib/Target/AArch64/AArch64Schedule.td
@@ -1,4 +1,4 @@
-//===- AArch64Schedule.td - AArch64 Scheduling Definitions -*- tablegen -*-===//
+//==-- AArch64Schedule.td - AArch64 Scheduling Definitions -*- tablegen -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,74 +7,98 @@
 //
 //===----------------------------------------------------------------------===//
 
-//===----------------------------------------------------------------------===//
-// Generic processor itineraries for legacy compatibility.
-
-def GenericItineraries : ProcessorItineraries<[], [], []>;
-
-
-//===----------------------------------------------------------------------===//
-// Base SchedReadWrite types
-
-// Basic ALU
-def WriteALU : SchedWrite;  // Generic: may contain shift and/or ALU operation
-def WriteALUs : SchedWrite; // Shift only with no ALU operation
-def ReadALU : SchedRead;    // Operand not needed for shifting
-def ReadALUs : SchedRead;   // Operand needed for shifting
-
-// Multiply with optional accumulate
-def WriteMAC : SchedWrite;
-def ReadMAC : SchedRead;
-
-// Compares
-def WriteCMP : SchedWrite;
-def ReadCMP : SchedRead;
-
-// Division
-def WriteDiv : SchedWrite;
-def ReadDiv : SchedRead;
-
-// Loads
-def WriteLd : SchedWrite;
-def WritePreLd : SchedWrite;
-def WriteVecLd : SchedWrite;
-def ReadLd : SchedRead;
-def ReadPreLd : SchedRead;
-def ReadVecLd : SchedRead;
-
-// Stores
-def WriteSt : SchedWrite;
-def WriteVecSt : SchedWrite;
-def ReadSt : SchedRead;
-def ReadVecSt : SchedRead;
-
-// Branches
-def WriteBr : SchedWrite;
-def WriteBrL : SchedWrite;
-def ReadBr : SchedRead;
-
-// Floating Point ALU
-def WriteFPALU : SchedWrite;
-def ReadFPALU : SchedRead;
-
-// Floating Point MAC, Mul, Div, Sqrt
-//   Most processors will simply send all of these down a dedicated pipe, but
-//   they're explicitly seperated here for flexibility of modeling later. May
-//   consider consolidating them into a single WriteFPXXXX type in the future.
-def WriteFPMAC : SchedWrite;
-def WriteFPMul : SchedWrite;
-def WriteFPDiv : SchedWrite;
-def WriteFPSqrt : SchedWrite;
-def ReadFPMAC : SchedRead;
-def ReadFPMul : SchedRead;
-def ReadFPDiv : SchedRead;
-def ReadFPSqrt : SchedRead;
-
-// Noop
-def WriteNoop : SchedWrite;
-
-
-//===----------------------------------------------------------------------===//
-// Subtarget specific Machine Models.
-
-include "AArch64ScheduleA53.td"
+// Define TII for use in SchedVariant Predicates.
+// const MachineInstr *MI and const TargetSchedModel *SchedModel
+// are defined by default.
+def : PredicateProlog<[{
+  const AArch64InstrInfo *TII =
+    static_cast<const AArch64InstrInfo*>(SchedModel->getInstrInfo());
+  (void)TII;
+}]>;
+
+// AArch64 Scheduler Definitions
+
+def WriteImm       : SchedWrite; // MOVN, MOVZ
+// TODO: Provide variants for MOV32/64imm Pseudos that dynamically
+// select the correct sequence of WriteImms.
+
+def WriteI         : SchedWrite; // ALU
+def WriteISReg     : SchedWrite; // ALU of Shifted-Reg
+def WriteIEReg     : SchedWrite; // ALU of Extended-Reg
+def ReadI          : SchedRead;  // ALU
+def ReadISReg      : SchedRead;  // ALU of Shifted-Reg
+def ReadIEReg      : SchedRead;  // ALU of Extended-Reg
+def WriteExtr      : SchedWrite; // EXTR shifts a reg pair
+def ReadExtrHi     : SchedRead;  // Read the high reg of the EXTR pair
+def WriteIS        : SchedWrite; // Shift/Scale
+def WriteID32      : SchedWrite; // 32-bit Divide
+def WriteID64      : SchedWrite; // 64-bit Divide
+def ReadID         : SchedRead;  // 32/64-bit Divide
+def WriteIM32      : SchedWrite; // 32-bit Multiply
+def WriteIM64      : SchedWrite; // 64-bit Multiply
+def ReadIM         : SchedRead;  // 32/64-bit Multiply
+def ReadIMA        : SchedRead;  // 32/64-bit Multiply Accumulate
+def WriteBr        : SchedWrite; // Branch
+def WriteBrReg     : SchedWrite; // Indirect Branch
+
+def WriteLD        : SchedWrite; // Load from base addr plus immediate offset
+def WriteST        : SchedWrite; // Store to base addr plus immediate offset
+def WriteSTP       : SchedWrite; // Store a register pair.
+def WriteAdr       : SchedWrite; // Address pre/post increment.
+
+def WriteLDIdx : SchedWrite; // Load from a register index (maybe scaled).
+def WriteSTIdx : SchedWrite; // Store to a register index (maybe scaled).
+def ReadAdrBase : SchedRead; // Read the base resister of a reg-offset LD/ST.
+
+// Predicate for determining when a shiftable register is shifted.
+def RegShiftedPred : SchedPredicate<[{TII->hasShiftedReg(MI)}]>;
+
+// Predicate for determining when a extendedable register is extended.
+def RegExtendedPred : SchedPredicate<[{TII->hasExtendedReg(MI)}]>;
+
+// ScaledIdxPred is true if a WriteLDIdx operand will be
+// scaled. Subtargets can use this to dynamically select resources and
+// latency for WriteLDIdx and ReadAdrBase.
+def ScaledIdxPred : SchedPredicate<[{TII->isScaledAddr(MI)}]>;
+
+// Serialized two-level address load.
+// EXAMPLE: LOADGot
+def WriteLDAdr : WriteSequence<[WriteAdr, WriteLD]>;
+
+// Serialized two-level address lookup.
+// EXAMPLE: MOVaddr...
+def WriteAdrAdr : WriteSequence<[WriteAdr, WriteAdr]>;
+
+// The second register of a load-pair.
+// LDP,LDPSW,LDNP,LDXP,LDAXP
+def WriteLDHi : SchedWrite;
+
+// Store-exclusive is a store followed by a dependent load.
+def WriteSTX : WriteSequence<[WriteST, WriteLD]>;
+
+def WriteSys     : SchedWrite; // Long, variable latency system ops.
+def WriteBarrier : SchedWrite; // Memory barrier.
+def WriteHint    : SchedWrite; // Hint instruction.
+
+def WriteF       : SchedWrite; // General floating-point ops.
+def WriteFCmp    : SchedWrite; // Floating-point compare.
+def WriteFCvt    : SchedWrite; // Float conversion.
+def WriteFCopy   : SchedWrite; // Float-int register copy.
+def WriteFImm    : SchedWrite; // Floating-point immediate.
+def WriteFMul    : SchedWrite; // Floating-point multiply.
+def WriteFDiv    : SchedWrite; // Floating-point division.
+
+def WriteV   : SchedWrite; // Vector ops.
+def WriteVLD : SchedWrite; // Vector loads.
+def WriteVST : SchedWrite; // Vector stores.
+
+// Read the unwritten lanes of the VLD's destination registers.
+def ReadVLD : SchedRead;
+
+// Sequential vector load and shuffle.
+def WriteVLDShuffle     : WriteSequence<[WriteVLD, WriteV]>;
+def WriteVLDPairShuffle : WriteSequence<[WriteVLD, WriteV, WriteV]>;
+
+// Store a shuffled vector.
+def WriteVSTShuffle : WriteSequence<[WriteV, WriteVST]>;
+def WriteVSTPairShuffle : WriteSequence<[WriteV, WriteV, WriteVST]>;
diff --git a/lib/Target/AArch64/AArch64ScheduleA53.td b/lib/Target/AArch64/AArch64ScheduleA53.td
deleted file mode 100644
index 20a14e7..0000000
--- a/lib/Target/AArch64/AArch64ScheduleA53.td
+++ /dev/null
@@ -1,144 +0,0 @@
-//=- AArch64ScheduleA53.td - ARM Cortex-A53 Scheduling Definitions -*- tablegen -*-=//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines the itinerary class data for the ARM Cortex A53 processors.
-//
-//===----------------------------------------------------------------------===//
-
-// ===---------------------------------------------------------------------===//
-// The following definitions describe the simpler per-operand machine model.
-// This works with MachineScheduler. See MCSchedModel.h for details.
-
-// Cortex-A53 machine model for scheduling and other instruction cost heuristics.
-def CortexA53Model : SchedMachineModel {
-  let IssueWidth = 2;  // 2 micro-ops are dispatched per cycle.
-  let MinLatency = 1 ; // OperandCycles are interpreted as MinLatency.
-  let LoadLatency = 2; // Optimistic load latency assuming bypass.
-                       // This is overriden by OperandCycles if the
-                       // Itineraries are queried instead.
-  let MispredictPenalty = 9; // Based on "Cortex-A53 Software Optimisation
-                             // Specification - Instruction Timings"
-                             // v 1.0 Spreadsheet
-}
-
-
-//===----------------------------------------------------------------------===//
-// Define each kind of processor resource and number available.
-
-// Modeling each pipeline as a ProcResource using the default BufferSize = -1.
-// Cortex-A53 is in-order and therefore should be using BufferSize = 0. The
-// current configuration performs better with the basic latencies provided so
-// far. Will revisit BufferSize once the latency information is more accurate.
-
-let SchedModel = CortexA53Model in {
-
-def A53UnitALU    : ProcResource<2>;                        // Int ALU
-def A53UnitMAC    : ProcResource<1>;                        // Int MAC
-def A53UnitDiv    : ProcResource<1>;                        // Int Division
-def A53UnitLdSt   : ProcResource<1>;                        // Load/Store
-def A53UnitB      : ProcResource<1>;                        // Branch
-def A53UnitFPALU  : ProcResource<1>;                        // FP ALU
-def A53UnitFPMDS  : ProcResource<1>;                        // FP Mult/Div/Sqrt
-
-
-//===----------------------------------------------------------------------===//
-// Subtarget-specific SchedWrite types which both map the ProcResources and
-// set the latency.
-
-// Issue - Every instruction must consume an A53WriteIssue. Optionally,
-//         instructions that cannot be dual-issued will also include the
-//         A53WriteIssue2nd in their SchedRW list. That second WriteRes will
-//         ensure that a second issue slot is consumed.
-def A53WriteIssue : SchedWriteRes<[]>;
-def A53WriteIssue2nd : SchedWriteRes<[]> { let Latency = 0; }
-
-// ALU - These are reduced to 1 despite a true latency of 4 in order to easily
-//       model forwarding logic. Once forwarding is properly modelled, then
-//       they'll be corrected.
-def : WriteRes<WriteALU, [A53UnitALU]> { let Latency = 1; }
-def : WriteRes<WriteALUs, [A53UnitALU]> { let Latency = 1; }
-def : WriteRes<WriteCMP, [A53UnitALU]> { let Latency = 1; }
-
-// MAC
-def : WriteRes<WriteMAC, [A53UnitMAC]> { let Latency = 4; }
-
-// Div
-def : WriteRes<WriteDiv, [A53UnitDiv]> { let Latency = 4; }
-
-// Load - Note: Vector loads take 1-5 cycles to issue. For the WriteVecLd below,
-//        choosing the median of 3 which makes the latency 6. May model this more
-//        carefully in the future.
-def : WriteRes<WriteLd, [A53UnitLdSt]> { let Latency = 4; }
-def : WriteRes<WritePreLd, [A53UnitLdSt]> { let Latency = 4; }
-def : WriteRes<WriteVecLd, [A53UnitLdSt]> { let Latency = 6; }
-
-// Store - Note: Vector stores take 1-3 cycles to issue. For the ReadVecSt below,
-//         choosing the median of 2 which makes the latency 5. May model this more
-//         carefully in the future.
-def : WriteRes<WriteSt, [A53UnitLdSt]> { let Latency = 4; }
-def : WriteRes<WriteVecSt, [A53UnitLdSt]> { let Latency = 5; }
-
-// Branch
-def : WriteRes<WriteBr, [A53UnitB]>;
-def : WriteRes<WriteBrL, [A53UnitB]>;
-
-// FP ALU
-def : WriteRes<WriteFPALU, [A53UnitFPALU]> {let Latency = 6; }
-
-// FP MAC, Mul, Div, Sqrt
-//   Using Double Precision numbers for now as a worst case. Additionally, not
-//   modeling the exact hazard but instead treating the whole pipe as a hazard.
-//   As an example VMUL, VMLA, and others are actually pipelined. VDIV and VSQRT
-//   have a total latency of 33 and 32 respectively but only a hazard of 29 and
-//   28 (double-prescion example).
-def : WriteRes<WriteFPMAC, [A53UnitFPMDS]> { let Latency = 10; }
-def : WriteRes<WriteFPMul, [A53UnitFPMDS]> { let Latency = 6; }
-def : WriteRes<WriteFPDiv, [A53UnitFPMDS]> { let Latency = 33;
-                                             let ResourceCycles = [29]; }
-def : WriteRes<WriteFPSqrt, [A53UnitFPMDS]> { let Latency = 32;
-                                              let ResourceCycles = [28]; }
-
-
-//===----------------------------------------------------------------------===//
-// Subtarget-specific SchedRead types.
-
-// No forwarding defined for ReadALU yet.
-def : ReadAdvance<ReadALU, 0>;
-
-// No forwarding defined for ReadCMP yet.
-def : ReadAdvance<ReadCMP, 0>;
-
-// No forwarding defined for ReadBr yet.
-def : ReadAdvance<ReadBr, 0>;
-
-// No forwarding defined for ReadMAC yet.
-def : ReadAdvance<ReadMAC, 0>;
-
-// No forwarding defined for ReadDiv yet.
-def : ReadAdvance<ReadDiv, 0>;
-
-// No forwarding defined for ReadLd, ReadPreLd, ReadVecLd yet.
-def : ReadAdvance<ReadLd, 0>;
-def : ReadAdvance<ReadPreLd, 0>;
-def : ReadAdvance<ReadVecLd, 0>;
-
-// No forwarding defined for ReadSt and ReadVecSt yet.
-def : ReadAdvance<ReadSt, 0>;
-def : ReadAdvance<ReadVecSt, 0>;
-
-// No forwarding defined for ReadFPALU yet.
-def : ReadAdvance<ReadFPALU, 0>;
-
-// No forwarding defined for ReadFPMAC/Mul/Div/Sqrt yet.
-def : ReadAdvance<ReadFPMAC, 0>;
-def : ReadAdvance<ReadFPMul, 0>;
-def : ReadAdvance<ReadFPDiv, 0>;
-def : ReadAdvance<ReadFPSqrt, 0>;
-
-}
diff --git a/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
index 6bbe075..5c65b75 100644
--- a/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
+++ b/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
@@ -11,15 +11,49 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "arm-selectiondag-info"
 #include "AArch64TargetMachine.h"
-#include "llvm/CodeGen/SelectionDAG.h"
 using namespace llvm;
 
-AArch64SelectionDAGInfo::AArch64SelectionDAGInfo(const AArch64TargetMachine &TM)
-  : TargetSelectionDAGInfo(TM),
-    Subtarget(&TM.getSubtarget<AArch64Subtarget>()) {
-}
+#define DEBUG_TYPE "aarch64-selectiondag-info"
+
+AArch64SelectionDAGInfo::AArch64SelectionDAGInfo(const TargetMachine &TM)
+    : TargetSelectionDAGInfo(TM),
+      Subtarget(&TM.getSubtarget<AArch64Subtarget>()) {}
+
+AArch64SelectionDAGInfo::~AArch64SelectionDAGInfo() {}
+
+SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemset(
+    SelectionDAG &DAG, SDLoc dl, SDValue Chain, SDValue Dst, SDValue Src,
+    SDValue Size, unsigned Align, bool isVolatile,
+    MachinePointerInfo DstPtrInfo) const {
+  // Check to see if there is a specialized entry-point for memory zeroing.
+  ConstantSDNode *V = dyn_cast<ConstantSDNode>(Src);
+  ConstantSDNode *SizeValue = dyn_cast<ConstantSDNode>(Size);
+  const char *bzeroEntry =
+      (V && V->isNullValue()) ? Subtarget->getBZeroEntry() : nullptr;
+  // For small size (< 256), it is not beneficial to use bzero
+  // instead of memset.
+  if (bzeroEntry && (!SizeValue || SizeValue->getZExtValue() > 256)) {
+    const AArch64TargetLowering &TLI =
+        *static_cast<const AArch64TargetLowering *>(
+            DAG.getTarget().getTargetLowering());
 
-AArch64SelectionDAGInfo::~AArch64SelectionDAGInfo() {
+    EVT IntPtr = TLI.getPointerTy();
+    Type *IntPtrTy = getDataLayout()->getIntPtrType(*DAG.getContext());
+    TargetLowering::ArgListTy Args;
+    TargetLowering::ArgListEntry Entry;
+    Entry.Node = Dst;
+    Entry.Ty = IntPtrTy;
+    Args.push_back(Entry);
+    Entry.Node = Size;
+    Args.push_back(Entry);
+    TargetLowering::CallLoweringInfo CLI(DAG);
+    CLI.setDebugLoc(dl).setChain(Chain)
+      .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()),
+                 DAG.getExternalSymbol(bzeroEntry, IntPtr), &Args, 0)
+      .setDiscardResult();
+    std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
+    return CallResult.second;
+  }
+  return SDValue();
 }
diff --git a/lib/Target/AArch64/AArch64SelectionDAGInfo.h b/lib/Target/AArch64/AArch64SelectionDAGInfo.h
index d412ed2..8381f99 100644
--- a/lib/Target/AArch64/AArch64SelectionDAGInfo.h
+++ b/lib/Target/AArch64/AArch64SelectionDAGInfo.h
@@ -11,22 +11,27 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_AARCH64SELECTIONDAGINFO_H
-#define LLVM_AARCH64SELECTIONDAGINFO_H
+#ifndef AArch64SELECTIONDAGINFO_H
+#define AArch64SELECTIONDAGINFO_H
 
 #include "llvm/Target/TargetSelectionDAGInfo.h"
 
 namespace llvm {
 
-class AArch64TargetMachine;
-
 class AArch64SelectionDAGInfo : public TargetSelectionDAGInfo {
+  /// Subtarget - Keep a pointer to the ARMSubtarget around so that we can
+  /// make the right decision when generating code for different targets.
   const AArch64Subtarget *Subtarget;
+
 public:
-  explicit AArch64SelectionDAGInfo(const AArch64TargetMachine &TM);
+  explicit AArch64SelectionDAGInfo(const TargetMachine &TM);
   ~AArch64SelectionDAGInfo();
-};
 
+  SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, SDValue Chain,
+                                  SDValue Dst, SDValue Src, SDValue Size,
+                                  unsigned Align, bool isVolatile,
+                                  MachinePointerInfo DstPtrInfo) const override;
+};
 }
 
 #endif
diff --git a/lib/Target/ARM64/ARM64StorePairSuppress.cpp b/lib/Target/AArch64/AArch64StorePairSuppress.cpp
index 6521d13..45f8ddb 100644
--- a/lib/Target/ARM64/ARM64StorePairSuppress.cpp
+++ b/lib/Target/AArch64/AArch64StorePairSuppress.cpp
@@ -1,4 +1,4 @@
-//===---- ARM64StorePairSuppress.cpp --- Suppress store pair formation ----===//
+//===--- AArch64StorePairSuppress.cpp --- Suppress store pair formation ---===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -11,22 +11,23 @@
 // store pairs. Later we may do the same for floating point loads.
 // ===---------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "arm64-stp-suppress"
-#include "ARM64InstrInfo.h"
+#include "AArch64InstrInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineTraceMetrics.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/CodeGen/TargetSchedule.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
 
 using namespace llvm;
 
+#define DEBUG_TYPE "aarch64-stp-suppress"
+
 namespace {
-class ARM64StorePairSuppress : public MachineFunctionPass {
-  const ARM64InstrInfo *TII;
+class AArch64StorePairSuppress : public MachineFunctionPass {
+  const AArch64InstrInfo *TII;
   const TargetRegisterInfo *TRI;
   const MachineRegisterInfo *MRI;
   MachineFunction *MF;
@@ -36,10 +37,10 @@ class ARM64StorePairSuppress : public MachineFunctionPass {
 
 public:
   static char ID;
-  ARM64StorePairSuppress() : MachineFunctionPass(ID) {}
+  AArch64StorePairSuppress() : MachineFunctionPass(ID) {}
 
   virtual const char *getPassName() const override {
-    return "ARM64 Store Pair Suppression";
+    return "AArch64 Store Pair Suppression";
   }
 
   bool runOnMachineFunction(MachineFunction &F) override;
@@ -56,11 +57,11 @@ private:
     MachineFunctionPass::getAnalysisUsage(AU);
   }
 };
-char ARM64StorePairSuppress::ID = 0;
+char AArch64StorePairSuppress::ID = 0;
 } // anonymous
 
-FunctionPass *llvm::createARM64StorePairSuppressPass() {
-  return new ARM64StorePairSuppress();
+FunctionPass *llvm::createAArch64StorePairSuppressPass() {
+  return new AArch64StorePairSuppress();
 }
 
 /// Return true if an STP can be added to this block without increasing the
@@ -69,7 +70,7 @@ FunctionPass *llvm::createARM64StorePairSuppressPass() {
 /// critical path. If the critical path is longer than the resource height, the
 /// extra vector ops can limit physreg renaming. Otherwise, it could simply
 /// oversaturate the vector units.
-bool ARM64StorePairSuppress::shouldAddSTPToBlock(const MachineBasicBlock *BB) {
+bool AArch64StorePairSuppress::shouldAddSTPToBlock(const MachineBasicBlock *BB) {
   if (!MinInstr)
     MinInstr = Traces->getEnsemble(MachineTraceMetrics::TS_MinInstrCount);
 
@@ -78,7 +79,7 @@ bool ARM64StorePairSuppress::shouldAddSTPToBlock(const MachineBasicBlock *BB) {
 
   // Get the machine model's scheduling class for STPQi.
   // Bypass TargetSchedule's SchedClass resolution since we only have an opcode.
-  unsigned SCIdx = TII->get(ARM64::STPDi).getSchedClass();
+  unsigned SCIdx = TII->get(AArch64::STPDi).getSchedClass();
   const MCSchedClassDesc *SCDesc =
       SchedModel.getMCSchedModel()->getSchedClassDesc(SCIdx);
 
@@ -102,22 +103,22 @@ bool ARM64StorePairSuppress::shouldAddSTPToBlock(const MachineBasicBlock *BB) {
 /// tell us if it's profitable with no cpu knowledge here.
 ///
 /// FIXME: We plan to develop a decent Target abstraction for simple loads and
-/// stores. Until then use a nasty switch similar to ARM64LoadStoreOptimizer.
-bool ARM64StorePairSuppress::isNarrowFPStore(const MachineInstr &MI) {
+/// stores. Until then use a nasty switch similar to AArch64LoadStoreOptimizer.
+bool AArch64StorePairSuppress::isNarrowFPStore(const MachineInstr &MI) {
   switch (MI.getOpcode()) {
   default:
     return false;
-  case ARM64::STRSui:
-  case ARM64::STRDui:
-  case ARM64::STURSi:
-  case ARM64::STURDi:
+  case AArch64::STRSui:
+  case AArch64::STRDui:
+  case AArch64::STURSi:
+  case AArch64::STURDi:
     return true;
   }
 }
 
-bool ARM64StorePairSuppress::runOnMachineFunction(MachineFunction &mf) {
+bool AArch64StorePairSuppress::runOnMachineFunction(MachineFunction &mf) {
   MF = &mf;
-  TII = static_cast<const ARM64InstrInfo *>(MF->getTarget().getInstrInfo());
+  TII = static_cast<const AArch64InstrInfo *>(MF->getTarget().getInstrInfo());
   TRI = MF->getTarget().getRegisterInfo();
   MRI = &MF->getRegInfo();
   const TargetSubtargetInfo &ST =
@@ -125,7 +126,7 @@ bool ARM64StorePairSuppress::runOnMachineFunction(MachineFunction &mf) {
   SchedModel.init(*ST.getSchedModel(), &ST, TII);
 
   Traces = &getAnalysis<MachineTraceMetrics>();
-  MinInstr = 0;
+  MinInstr = nullptr;
 
   DEBUG(dbgs() << "*** " << getPassName() << ": " << MF->getName() << '\n');
 
@@ -138,10 +139,10 @@ bool ARM64StorePairSuppress::runOnMachineFunction(MachineFunction &mf) {
   // precisely determine whether a store pair can be formed. But we do want to
   // filter out most situations where we can't form store pairs to avoid
   // computing trace metrics in those cases.
-  for (auto &MBB: *MF) {
+  for (auto &MBB : *MF) {
     bool SuppressSTP = false;
     unsigned PrevBaseReg = 0;
-    for (auto &MI: MBB) {
+    for (auto &MI : MBB) {
       if (!isNarrowFPStore(MI))
         continue;
       unsigned BaseReg;
diff --git a/lib/Target/AArch64/AArch64Subtarget.cpp b/lib/Target/AArch64/AArch64Subtarget.cpp
index 9140bbd..cd69994 100644
--- a/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -1,4 +1,4 @@
-//===-- AArch64Subtarget.cpp - AArch64 Subtarget Information --------------===//
+//===-- AArch64Subtarget.cpp - AArch64 Subtarget Information ----*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,57 +7,110 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This file implements the AArch64 specific subclass of TargetSubtargetInfo.
+// This file implements the AArch64 specific subclass of TargetSubtarget.
 //
 //===----------------------------------------------------------------------===//
 
+#include "AArch64InstrInfo.h"
 #include "AArch64Subtarget.h"
-#include "AArch64RegisterInfo.h"
-#include "MCTargetDesc/AArch64MCTargetDesc.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineScheduler.h"
 #include "llvm/IR/GlobalValue.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Target/TargetSubtargetInfo.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-subtarget"
 
-#define GET_SUBTARGETINFO_TARGET_DESC
 #define GET_SUBTARGETINFO_CTOR
+#define GET_SUBTARGETINFO_TARGET_DESC
 #include "AArch64GenSubtargetInfo.inc"
 
-using namespace llvm;
-
-// Pin the vtable to this file.
-void AArch64Subtarget::anchor() {}
+static cl::opt<bool>
+EnableEarlyIfConvert("aarch64-early-ifcvt", cl::desc("Enable the early if "
+                     "converter pass"), cl::init(true), cl::Hidden);
 
-AArch64Subtarget::AArch64Subtarget(StringRef TT, StringRef CPU, StringRef FS,
-                                   bool LittleEndian)
+AArch64Subtarget::AArch64Subtarget(const std::string &TT,
+                                   const std::string &CPU,
+                                   const std::string &FS, bool LittleEndian)
     : AArch64GenSubtargetInfo(TT, CPU, FS), ARMProcFamily(Others),
-      HasFPARMv8(false), HasNEON(false), HasCrypto(false), TargetTriple(TT),
-      CPUString(CPU), IsLittleEndian(LittleEndian) {
+      HasFPARMv8(false), HasNEON(false), HasCrypto(false), HasCRC(false),
+      HasZeroCycleRegMove(false), HasZeroCycleZeroing(false), CPUString(CPU),
+      TargetTriple(TT), IsLittleEndian(LittleEndian) {
+  // Determine default and user-specified characteristics
+
+  if (CPUString.empty())
+    CPUString = "generic";
 
-  initializeSubtargetFeatures(CPU, FS);
+  ParseSubtargetFeatures(CPUString, FS);
 }
 
-void AArch64Subtarget::initializeSubtargetFeatures(StringRef CPU,
-                                                   StringRef FS) {
-  if (CPU.empty())
-    CPUString = "generic";
+/// ClassifyGlobalReference - Find the target operand flags that describe
+/// how a global value should be referenced for the current subtarget.
+unsigned char
+AArch64Subtarget::ClassifyGlobalReference(const GlobalValue *GV,
+                                        const TargetMachine &TM) const {
+
+  // Determine whether this is a reference to a definition or a declaration.
+  // Materializable GVs (in JIT lazy compilation mode) do not require an extra
+  // load from stub.
+  bool isDecl = GV->hasAvailableExternallyLinkage();
+  if (GV->isDeclaration() && !GV->isMaterializable())
+    isDecl = true;
+
+  // MachO large model always goes via a GOT, simply to get a single 8-byte
+  // absolute relocation on all global addresses.
+  if (TM.getCodeModel() == CodeModel::Large && isTargetMachO())
+    return AArch64II::MO_GOT;
+
+  // The small code mode's direct accesses use ADRP, which cannot necessarily
+  // produce the value 0 (if the code is above 4GB). Therefore they must use the
+  // GOT.
+  if (TM.getCodeModel() == CodeModel::Small && GV->isWeakForLinker() && isDecl)
+    return AArch64II::MO_GOT;
+
+  // If symbol visibility is hidden, the extra load is not needed if
+  // the symbol is definitely defined in the current translation unit.
 
-  std::string FullFS = FS;
-  if (CPUString == "generic") {
-    // Enable FP by default.
-    if (FullFS.empty())
-      FullFS = "+fp-armv8";
+  // The handling of non-hidden symbols in PIC mode is rather target-dependent:
+  //   + On MachO, if the symbol is defined in this module the GOT can be
+  //     skipped.
+  //   + On ELF, the R_AARCH64_COPY relocation means that even symbols actually
+  //     defined could end up in unexpected places. Use a GOT.
+  if (TM.getRelocationModel() != Reloc::Static && GV->hasDefaultVisibility()) {
+    if (isTargetMachO())
+      return (isDecl || GV->isWeakForLinker()) ? AArch64II::MO_GOT
+                                               : AArch64II::MO_NO_FLAG;
     else
-      FullFS = "+fp-armv8," + FullFS;
+      // No need to go through the GOT for local symbols on ELF.
+      return GV->hasLocalLinkage() ? AArch64II::MO_NO_FLAG : AArch64II::MO_GOT;
   }
 
-  ParseSubtargetFeatures(CPU, FullFS);
+  return AArch64II::MO_NO_FLAG;
 }
 
-bool AArch64Subtarget::GVIsIndirectSymbol(const GlobalValue *GV,
-                                          Reloc::Model RelocM) const {
-  if (RelocM == Reloc::Static)
-    return false;
+/// This function returns the name of a function which has an interface
+/// like the non-standard bzero function, if such a function exists on
+/// the current subtarget and it is considered prefereable over
+/// memset with zero passed as the second argument. Otherwise it
+/// returns null.
+const char *AArch64Subtarget::getBZeroEntry() const {
+  // Prefer bzero on Darwin only.
+  if(isTargetDarwin())
+    return "bzero";
+
+  return nullptr;
+}
+
+void AArch64Subtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
+                                         MachineInstr *begin, MachineInstr *end,
+                                         unsigned NumRegionInstrs) const {
+  // LNT run (at least on Cyclone) showed reasonably significant gains for
+  // bi-directional scheduling. 253.perlbmk.
+  Policy.OnlyTopDown = false;
+  Policy.OnlyBottomUp = false;
+}
 
-  return !GV->hasLocalLinkage() && !GV->hasHiddenVisibility();
+bool AArch64Subtarget::enableEarlyIfConversion() const {
+  return EnableEarlyIfConvert;
 }
diff --git a/lib/Target/AArch64/AArch64Subtarget.h b/lib/Target/AArch64/AArch64Subtarget.h
index 68c6c4b..590ea05 100644
--- a/lib/Target/AArch64/AArch64Subtarget.h
+++ b/lib/Target/AArch64/AArch64Subtarget.h
@@ -1,4 +1,4 @@
-//==-- AArch64Subtarget.h - Define Subtarget for the AArch64 ---*- C++ -*--===//
+//===--- AArch64Subtarget.h - Define Subtarget for the AArch64 -*- C++ -*--===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,29 +7,27 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This file declares the AArch64 specific subclass of TargetSubtargetInfo.
+// This file declares the AArch64 specific subclass of TargetSubtarget.
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_TARGET_AARCH64_SUBTARGET_H
-#define LLVM_TARGET_AARCH64_SUBTARGET_H
+#ifndef AArch64SUBTARGET_H
+#define AArch64SUBTARGET_H
 
-#include "llvm/ADT/Triple.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
+#include "AArch64RegisterInfo.h"
+#include <string>
 
 #define GET_SUBTARGETINFO_HEADER
 #include "AArch64GenSubtargetInfo.inc"
 
-#include <string>
-
 namespace llvm {
-class StringRef;
 class GlobalValue;
+class StringRef;
 
 class AArch64Subtarget : public AArch64GenSubtargetInfo {
-  virtual void anchor();
 protected:
-  enum ARMProcFamilyEnum {Others, CortexA53, CortexA57};
+  enum ARMProcFamilyEnum {Others, CortexA53, CortexA57, Cyclone};
 
   /// ARMProcFamily - ARM processor family: Cortex-A53, Cortex-A57, and others.
   ARMProcFamilyEnum ARMProcFamily;
@@ -37,47 +35,76 @@ protected:
   bool HasFPARMv8;
   bool HasNEON;
   bool HasCrypto;
+  bool HasCRC;
 
-  /// TargetTriple - What processor and OS we're targeting.
-  Triple TargetTriple;
+  // HasZeroCycleRegMove - Has zero-cycle register mov instructions.
+  bool HasZeroCycleRegMove;
+
+  // HasZeroCycleZeroing - Has zero-cycle zeroing instructions.
+  bool HasZeroCycleZeroing;
 
   /// CPUString - String name of used CPU.
   std::string CPUString;
 
-  /// IsLittleEndian - The target is Little Endian
-  bool IsLittleEndian;
+  /// TargetTriple - What processor and OS we're targeting.
+  Triple TargetTriple;
 
-private:
-  void initializeSubtargetFeatures(StringRef CPU, StringRef FS);
+  /// IsLittleEndian - Is the target little endian?
+  bool IsLittleEndian;
 
 public:
   /// This constructor initializes the data members to match that
   /// of the specified triple.
-  ///
-  AArch64Subtarget(StringRef TT, StringRef CPU, StringRef FS,
-                   bool LittleEndian);
+  AArch64Subtarget(const std::string &TT, const std::string &CPU,
+                 const std::string &FS, bool LittleEndian);
 
-  virtual bool enableMachineScheduler() const {
-    return true;
-  }
-
-  /// ParseSubtargetFeatures - Parses features string setting specified
-  /// subtarget options.  Definition of function is auto generated by tblgen.
-  void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+  bool enableMachineScheduler() const override { return true; }
 
-  bool GVIsIndirectSymbol(const GlobalValue *GV, Reloc::Model RelocM) const;
+  bool hasZeroCycleRegMove() const { return HasZeroCycleRegMove; }
 
-  bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); }
-  bool isTargetLinux() const { return TargetTriple.isOSLinux(); }
+  bool hasZeroCycleZeroing() const { return HasZeroCycleZeroing; }
 
   bool hasFPARMv8() const { return HasFPARMv8; }
   bool hasNEON() const { return HasNEON; }
   bool hasCrypto() const { return HasCrypto; }
+  bool hasCRC() const { return HasCRC; }
+
+  bool isLittleEndian() const { return IsLittleEndian; }
+
+  bool isTargetDarwin() const { return TargetTriple.isOSDarwin(); }
+
+  bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); }
+
+  bool isTargetMachO() const { return TargetTriple.isOSBinFormatMachO(); }
+
+  bool isCyclone() const { return CPUString == "cyclone"; }
+
+  /// getMaxInlineSizeThreshold - Returns the maximum memset / memcpy size
+  /// that still makes it profitable to inline the call.
+  unsigned getMaxInlineSizeThreshold() const { return 64; }
+
+  /// ParseSubtargetFeatures - Parses features string setting specified
+  /// subtarget options.  Definition of function is auto generated by tblgen.
+  void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+
+  /// ClassifyGlobalReference - Find the target operand flags that describe
+  /// how a global value should be referenced for the current subtarget.
+  unsigned char ClassifyGlobalReference(const GlobalValue *GV,
+                                        const TargetMachine &TM) const;
+
+  /// This function returns the name of a function which has an interface
+  /// like the non-standard bzero function, if such a function exists on
+  /// the current subtarget and it is considered prefereable over
+  /// memset with zero passed as the second argument. Otherwise it
+  /// returns null.
+  const char *getBZeroEntry() const;
 
-  bool isLittle() const { return IsLittleEndian; }
+  void overrideSchedPolicy(MachineSchedPolicy &Policy, MachineInstr *begin,
+                           MachineInstr *end,
+                           unsigned NumRegionInstrs) const override;
 
-  const std::string & getCPUString() const { return CPUString; }
+  bool enableEarlyIfConversion() const override;
 };
 } // End llvm namespace
 
-#endif  // LLVM_TARGET_AARCH64_SUBTARGET_H
+#endif // AArch64SUBTARGET_H
diff --git a/lib/Target/AArch64/AArch64TargetMachine.cpp b/lib/Target/AArch64/AArch64TargetMachine.cpp
index d9c990d..0b5dd2f 100644
--- a/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -7,41 +7,80 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This file contains the implementation of the AArch64TargetMachine
-// methods. Principally just setting up the passes needed to generate correct
-// code on this architecture.
 //
 //===----------------------------------------------------------------------===//
 
 #include "AArch64.h"
 #include "AArch64TargetMachine.h"
-#include "MCTargetDesc/AArch64MCTargetDesc.h"
-#include "llvm/CodeGen/Passes.h"
 #include "llvm/PassManager.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/TargetRegistry.h"
-
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Transforms/Scalar.h"
 using namespace llvm;
 
+static cl::opt<bool>
+EnableCCMP("aarch64-ccmp", cl::desc("Enable the CCMP formation pass"),
+           cl::init(true), cl::Hidden);
+
+static cl::opt<bool>
+EnableStPairSuppress("aarch64-stp-suppress", cl::desc("Suppress STP for AArch64"),
+                     cl::init(true), cl::Hidden);
+
+static cl::opt<bool>
+EnableAdvSIMDScalar("aarch64-simd-scalar", cl::desc("Enable use of AdvSIMD scalar"
+                    " integer instructions"), cl::init(false), cl::Hidden);
+
+static cl::opt<bool>
+EnablePromoteConstant("aarch64-promote-const", cl::desc("Enable the promote "
+                      "constant pass"), cl::init(true), cl::Hidden);
+
+static cl::opt<bool>
+EnableCollectLOH("aarch64-collect-loh", cl::desc("Enable the pass that emits the"
+                 " linker optimization hints (LOH)"), cl::init(true),
+                 cl::Hidden);
+
+static cl::opt<bool>
+EnableDeadRegisterElimination("aarch64-dead-def-elimination", cl::Hidden,
+                              cl::desc("Enable the pass that removes dead"
+                                       " definitons and replaces stores to"
+                                       " them with stores to the zero"
+                                       " register"),
+                              cl::init(true));
+
+static cl::opt<bool>
+EnableLoadStoreOpt("aarch64-load-store-opt", cl::desc("Enable the load/store pair"
+                   " optimization pass"), cl::init(true), cl::Hidden);
+
 extern "C" void LLVMInitializeAArch64Target() {
+  // Register the target.
   RegisterTargetMachine<AArch64leTargetMachine> X(TheAArch64leTarget);
   RegisterTargetMachine<AArch64beTargetMachine> Y(TheAArch64beTarget);
+
+  RegisterTargetMachine<AArch64leTargetMachine> Z(TheARM64leTarget);
+  RegisterTargetMachine<AArch64beTargetMachine> W(TheARM64beTarget);
 }
 
+/// TargetMachine ctor - Create an AArch64 architecture model.
+///
 AArch64TargetMachine::AArch64TargetMachine(const Target &T, StringRef TT,
                                            StringRef CPU, StringRef FS,
                                            const TargetOptions &Options,
                                            Reloc::Model RM, CodeModel::Model CM,
                                            CodeGenOpt::Level OL,
                                            bool LittleEndian)
-  : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
-    Subtarget(TT, CPU, FS, LittleEndian),
-    InstrInfo(Subtarget),
-    DL(LittleEndian ?
-       "e-m:e-i64:64-i128:128-n32:64-S128" :
-       "E-m:e-i64:64-i128:128-n32:64-S128"),
-    TLInfo(*this),
-    TSInfo(*this),
-    FrameLowering(Subtarget) {
+    : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
+      Subtarget(TT, CPU, FS, LittleEndian),
+      // This nested ternary is horrible, but DL needs to be properly
+      // initialized
+      // before TLInfo is constructed.
+      DL(Subtarget.isTargetMachO()
+             ? "e-m:o-i64:64-i128:128-n32:64-S128"
+             : (LittleEndian ? "e-m:e-i64:64-i128:128-n32:64-S128"
+                             : "E-m:e-i64:64-i128:128-n32:64-S128")),
+      InstrInfo(Subtarget), TLInfo(*this), FrameLowering(*this, Subtarget),
+      TSInfo(*this) {
   initAsmInfo();
 }
 
@@ -63,50 +102,107 @@ AArch64beTargetMachine(const Target &T, StringRef TT,
                        CodeGenOpt::Level OL)
   : AArch64TargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {}
 
-void AArch64TargetMachine::addAnalysisPasses(PassManagerBase &PM) {
-  // Add first the target-independent BasicTTI pass, then our AArch64 pass. This
-  // allows the AArch64 pass to delegate to the target independent layer when
-  // appropriate.
-  PM.add(createBasicTargetTransformInfoPass(this));
-  PM.add(createAArch64TargetTransformInfoPass(this));
-}
-
 namespace {
 /// AArch64 Code Generator Pass Configuration Options.
 class AArch64PassConfig : public TargetPassConfig {
 public:
   AArch64PassConfig(AArch64TargetMachine *TM, PassManagerBase &PM)
-    : TargetPassConfig(TM, PM) {}
+      : TargetPassConfig(TM, PM) {}
 
   AArch64TargetMachine &getAArch64TargetMachine() const {
     return getTM<AArch64TargetMachine>();
   }
 
-  const AArch64Subtarget &getAArch64Subtarget() const {
-    return *getAArch64TargetMachine().getSubtargetImpl();
-  }
-
-  virtual bool addInstSelector();
-  virtual bool addPreEmitPass();
+  bool addPreISel() override;
+  bool addInstSelector() override;
+  bool addILPOpts() override;
+  bool addPreRegAlloc() override;
+  bool addPostRegAlloc() override;
+  bool addPreSched2() override;
+  bool addPreEmitPass() override;
 };
 } // namespace
 
+void AArch64TargetMachine::addAnalysisPasses(PassManagerBase &PM) {
+  // Add first the target-independent BasicTTI pass, then our AArch64 pass. This
+  // allows the AArch64 pass to delegate to the target independent layer when
+  // appropriate.
+  PM.add(createBasicTargetTransformInfoPass(this));
+  PM.add(createAArch64TargetTransformInfoPass(this));
+}
+
 TargetPassConfig *AArch64TargetMachine::createPassConfig(PassManagerBase &PM) {
   return new AArch64PassConfig(this, PM);
 }
 
-bool AArch64PassConfig::addPreEmitPass() {
-  addPass(&UnpackMachineBundlesID);
-  addPass(createAArch64BranchFixupPass());
-  return true;
+// Pass Pipeline Configuration
+bool AArch64PassConfig::addPreISel() {
+  // Run promote constant before global merge, so that the promoted constants
+  // get a chance to be merged
+  if (TM->getOptLevel() != CodeGenOpt::None && EnablePromoteConstant)
+    addPass(createAArch64PromoteConstantPass());
+  if (TM->getOptLevel() != CodeGenOpt::None)
+    addPass(createGlobalMergePass(TM));
+  if (TM->getOptLevel() != CodeGenOpt::None)
+    addPass(createAArch64AddressTypePromotionPass());
+
+  // Always expand atomic operations, we don't deal with atomicrmw or cmpxchg
+  // ourselves.
+  addPass(createAtomicExpandLoadLinkedPass(TM));
+
+  return false;
 }
 
 bool AArch64PassConfig::addInstSelector() {
-  addPass(createAArch64ISelDAG(getAArch64TargetMachine(), getOptLevel()));
+  addPass(createAArch64ISelDag(getAArch64TargetMachine(), getOptLevel()));
 
-  // For ELF, cleanup any local-dynamic TLS accesses.
-  if (getAArch64Subtarget().isTargetELF() && getOptLevel() != CodeGenOpt::None)
+  // For ELF, cleanup any local-dynamic TLS accesses (i.e. combine as many
+  // references to _TLS_MODULE_BASE_ as possible.
+  if (TM->getSubtarget<AArch64Subtarget>().isTargetELF() &&
+      getOptLevel() != CodeGenOpt::None)
     addPass(createAArch64CleanupLocalDynamicTLSPass());
 
   return false;
 }
+
+bool AArch64PassConfig::addILPOpts() {
+  if (EnableCCMP)
+    addPass(createAArch64ConditionalCompares());
+  addPass(&EarlyIfConverterID);
+  if (EnableStPairSuppress)
+    addPass(createAArch64StorePairSuppressPass());
+  return true;
+}
+
+bool AArch64PassConfig::addPreRegAlloc() {
+  // Use AdvSIMD scalar instructions whenever profitable.
+  if (TM->getOptLevel() != CodeGenOpt::None && EnableAdvSIMDScalar)
+    addPass(createAArch64AdvSIMDScalar());
+  return true;
+}
+
+bool AArch64PassConfig::addPostRegAlloc() {
+  // Change dead register definitions to refer to the zero register.
+  if (TM->getOptLevel() != CodeGenOpt::None && EnableDeadRegisterElimination)
+    addPass(createAArch64DeadRegisterDefinitions());
+  return true;
+}
+
+bool AArch64PassConfig::addPreSched2() {
+  // Expand some pseudo instructions to allow proper scheduling.
+  addPass(createAArch64ExpandPseudoPass());
+  // Use load/store pair instructions when possible.
+  if (TM->getOptLevel() != CodeGenOpt::None && EnableLoadStoreOpt)
+    addPass(createAArch64LoadStoreOptimizationPass());
+  return true;
+}
+
+bool AArch64PassConfig::addPreEmitPass() {
+  // Relax conditional branch instructions if they're otherwise out of
+  // range of their destination.
+  addPass(createAArch64BranchRelaxation());
+  if (TM->getOptLevel() != CodeGenOpt::None && EnableCollectLOH &&
+      TM->getSubtarget<AArch64Subtarget>().isTargetMachO())
+    addPass(createAArch64CollectLOHPass());
+  return true;
+}
diff --git a/lib/Target/AArch64/AArch64TargetMachine.h b/lib/Target/AArch64/AArch64TargetMachine.h
index 4297c92..079b19b 100644
--- a/lib/Target/AArch64/AArch64TargetMachine.h
+++ b/lib/Target/AArch64/AArch64TargetMachine.h
@@ -1,4 +1,4 @@
-//=== AArch64TargetMachine.h - Define TargetMachine for AArch64 -*- C++ -*-===//
+//==-- AArch64TargetMachine.h - Define TargetMachine for AArch64 -*- C++ -*-==//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -11,60 +11,60 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_AARCH64TARGETMACHINE_H
-#define LLVM_AARCH64TARGETMACHINE_H
+#ifndef AArch64TARGETMACHINE_H
+#define AArch64TARGETMACHINE_H
 
-#include "AArch64FrameLowering.h"
-#include "AArch64ISelLowering.h"
 #include "AArch64InstrInfo.h"
-#include "AArch64SelectionDAGInfo.h"
+#include "AArch64ISelLowering.h"
 #include "AArch64Subtarget.h"
+#include "AArch64FrameLowering.h"
+#include "AArch64SelectionDAGInfo.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/Target/TargetMachine.h"
+#include "llvm/MC/MCStreamer.h"
 
 namespace llvm {
 
 class AArch64TargetMachine : public LLVMTargetMachine {
-  AArch64Subtarget          Subtarget;
-  AArch64InstrInfo          InstrInfo;
-  const DataLayout          DL;
-  AArch64TargetLowering     TLInfo;
-  AArch64SelectionDAGInfo   TSInfo;
-  AArch64FrameLowering      FrameLowering;
+protected:
+  AArch64Subtarget Subtarget;
+
+private:
+  const DataLayout DL;
+  AArch64InstrInfo InstrInfo;
+  AArch64TargetLowering TLInfo;
+  AArch64FrameLowering FrameLowering;
+  AArch64SelectionDAGInfo TSInfo;
 
 public:
   AArch64TargetMachine(const Target &T, StringRef TT, StringRef CPU,
                        StringRef FS, const TargetOptions &Options,
                        Reloc::Model RM, CodeModel::Model CM,
-                       CodeGenOpt::Level OL,
-                       bool LittleEndian);
+                       CodeGenOpt::Level OL, bool IsLittleEndian);
 
-  const AArch64InstrInfo *getInstrInfo() const {
-    return &InstrInfo;
+  const AArch64Subtarget *getSubtargetImpl() const override {
+    return &Subtarget;
   }
-
-  const AArch64FrameLowering *getFrameLowering() const {
+  const AArch64TargetLowering *getTargetLowering() const override {
+    return &TLInfo;
+  }
+  const DataLayout *getDataLayout() const override { return &DL; }
+  const AArch64FrameLowering *getFrameLowering() const override {
     return &FrameLowering;
   }
-
-  const AArch64TargetLowering *getTargetLowering() const {
-    return &TLInfo;
+  const AArch64InstrInfo *getInstrInfo() const override { return &InstrInfo; }
+  const AArch64RegisterInfo *getRegisterInfo() const override {
+    return &InstrInfo.getRegisterInfo();
   }
-
-  const AArch64SelectionDAGInfo *getSelectionDAGInfo() const {
+  const AArch64SelectionDAGInfo *getSelectionDAGInfo() const override {
     return &TSInfo;
   }
 
-  const AArch64Subtarget *getSubtargetImpl() const { return &Subtarget; }
-
-  const DataLayout *getDataLayout() const { return &DL; }
-
-  const TargetRegisterInfo *getRegisterInfo() const {
-    return &InstrInfo.getRegisterInfo();
-  }
-  TargetPassConfig *createPassConfig(PassManagerBase &PM);
+  // Pass Pipeline Configuration
+  TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
 
-  virtual void addAnalysisPasses(PassManagerBase &PM);
+  /// \brief Register AArch64 analysis passes with a pass manager.
+  void addAnalysisPasses(PassManagerBase &PM) override;
 };
 
 // AArch64leTargetMachine - AArch64 little endian target machine.
@@ -72,8 +72,8 @@ public:
 class AArch64leTargetMachine : public AArch64TargetMachine {
   virtual void anchor();
 public:
-  AArch64leTargetMachine(const Target &T, StringRef TT,
-                         StringRef CPU, StringRef FS, const TargetOptions &Options,
+  AArch64leTargetMachine(const Target &T, StringRef TT, StringRef CPU,
+                         StringRef FS, const TargetOptions &Options,
                          Reloc::Model RM, CodeModel::Model CM,
                          CodeGenOpt::Level OL);
 };
@@ -83,12 +83,12 @@ public:
 class AArch64beTargetMachine : public AArch64TargetMachine {
   virtual void anchor();
 public:
-  AArch64beTargetMachine(const Target &T, StringRef TT,
-                         StringRef CPU, StringRef FS, const TargetOptions &Options,
+  AArch64beTargetMachine(const Target &T, StringRef TT, StringRef CPU,
+                         StringRef FS, const TargetOptions &Options,
                          Reloc::Model RM, CodeModel::Model CM,
                          CodeGenOpt::Level OL);
 };
 
-} // End llvm namespace
+} // end namespace llvm
 
 #endif
diff --git a/lib/Target/AArch64/AArch64TargetObjectFile.cpp b/lib/Target/AArch64/AArch64TargetObjectFile.cpp
index 663d619..4069038 100644
--- a/lib/Target/AArch64/AArch64TargetObjectFile.cpp
+++ b/lib/Target/AArch64/AArch64TargetObjectFile.cpp
@@ -6,19 +6,47 @@
 // License. See LICENSE.TXT for details.
 //
 //===----------------------------------------------------------------------===//
-//
-// This file deals with any AArch64 specific requirements on object files.
-//
-//===----------------------------------------------------------------------===//
-
 
 #include "AArch64TargetObjectFile.h"
-
+#include "AArch64TargetMachine.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/Dwarf.h"
 using namespace llvm;
+using namespace dwarf;
 
-void
-AArch64ElfTargetObjectFile::Initialize(MCContext &Ctx,
-                                       const TargetMachine &TM) {
+void AArch64_ELFTargetObjectFile::Initialize(MCContext &Ctx,
+                                             const TargetMachine &TM) {
   TargetLoweringObjectFileELF::Initialize(Ctx, TM);
   InitializeELF(TM.Options.UseInitArray);
 }
+
+const MCExpr *AArch64_MachoTargetObjectFile::getTTypeGlobalReference(
+    const GlobalValue *GV, unsigned Encoding, Mangler &Mang,
+    const TargetMachine &TM, MachineModuleInfo *MMI,
+    MCStreamer &Streamer) const {
+  // On Darwin, we can reference dwarf symbols with foo@GOT-., which
+  // is an indirect pc-relative reference. The default implementation
+  // won't reference using the GOT, so we need this target-specific
+  // version.
+  if (Encoding & (DW_EH_PE_indirect | DW_EH_PE_pcrel)) {
+    const MCSymbol *Sym = TM.getSymbol(GV, Mang);
+    const MCExpr *Res =
+        MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_GOT, getContext());
+    MCSymbol *PCSym = getContext().CreateTempSymbol();
+    Streamer.EmitLabel(PCSym);
+    const MCExpr *PC = MCSymbolRefExpr::Create(PCSym, getContext());
+    return MCBinaryExpr::CreateSub(Res, PC, getContext());
+  }
+
+  return TargetLoweringObjectFileMachO::getTTypeGlobalReference(
+      GV, Encoding, Mang, TM, MMI, Streamer);
+}
+
+MCSymbol *AArch64_MachoTargetObjectFile::getCFIPersonalitySymbol(
+    const GlobalValue *GV, Mangler &Mang, const TargetMachine &TM,
+    MachineModuleInfo *MMI) const {
+  return TM.getSymbol(GV, Mang);
+}
diff --git a/lib/Target/AArch64/AArch64TargetObjectFile.h b/lib/Target/AArch64/AArch64TargetObjectFile.h
index 0f00a78..de63cb4 100644
--- a/lib/Target/AArch64/AArch64TargetObjectFile.h
+++ b/lib/Target/AArch64/AArch64TargetObjectFile.h
@@ -1,4 +1,4 @@
-//===-- AArch64TargetObjectFile.h - AArch64 Object Info ---------*- C++ -*-===//
+//===-- AArch64TargetObjectFile.h - AArch64 Object Info -*- C++ ---------*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -6,25 +6,34 @@
 // License. See LICENSE.TXT for details.
 //
 //===----------------------------------------------------------------------===//
-//
-// This file deals with any AArch64 specific requirements on object files.
-//
-//===----------------------------------------------------------------------===//
 
-#ifndef LLVM_TARGET_AARCH64_TARGETOBJECTFILE_H
-#define LLVM_TARGET_AARCH64_TARGETOBJECTFILE_H
+#ifndef LLVM_TARGET_AArch64_TARGETOBJECTFILE_H
+#define LLVM_TARGET_AArch64_TARGETOBJECTFILE_H
 
 #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
 #include "llvm/Target/TargetLoweringObjectFile.h"
-#include "llvm/Target/TargetMachine.h"
 
 namespace llvm {
+class AArch64TargetMachine;
+
+/// This implementation is used for AArch64 ELF targets (Linux in particular).
+class AArch64_ELFTargetObjectFile : public TargetLoweringObjectFileELF {
+  void Initialize(MCContext &Ctx, const TargetMachine &TM) override;
+};
+
+/// AArch64_MachoTargetObjectFile - This TLOF implementation is used for Darwin.
+class AArch64_MachoTargetObjectFile : public TargetLoweringObjectFileMachO {
+public:
+  const MCExpr *getTTypeGlobalReference(const GlobalValue *GV,
+                                        unsigned Encoding, Mangler &Mang,
+                                        const TargetMachine &TM,
+                                        MachineModuleInfo *MMI,
+                                        MCStreamer &Streamer) const override;
 
-  /// AArch64ElfTargetObjectFile - This implementation is used for ELF
-  /// AArch64 targets.
-  class AArch64ElfTargetObjectFile : public TargetLoweringObjectFileELF {
-    virtual void Initialize(MCContext &Ctx, const TargetMachine &TM);
-  };
+  MCSymbol *getCFIPersonalitySymbol(const GlobalValue *GV, Mangler &Mang,
+                                    const TargetMachine &TM,
+                                    MachineModuleInfo *MMI) const override;
+};
 
 } // end namespace llvm
 
diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index e2a1647..33e482a 100644
--- a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -1,4 +1,4 @@
-//===- AArch64TargetTransformInfo.cpp - AArch64 specific TTI pass ---------===//
+//===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI pass --------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -14,15 +14,18 @@
 ///
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "aarch64tti"
 #include "AArch64.h"
 #include "AArch64TargetMachine.h"
+#include "MCTargetDesc/AArch64AddressingModes.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Target/CostTable.h"
 #include "llvm/Target/TargetLowering.h"
+#include <algorithm>
 using namespace llvm;
 
+#define DEBUG_TYPE "aarch64tti"
+
 // Declare the pass initialization routine locally as target-specific passes
 // don't have a target-wide initialization entry point, and so we rely on the
 // pass constructor initialization.
@@ -33,25 +36,28 @@ void initializeAArch64TTIPass(PassRegistry &);
 namespace {
 
 class AArch64TTI final : public ImmutablePass, public TargetTransformInfo {
+  const AArch64TargetMachine *TM;
   const AArch64Subtarget *ST;
   const AArch64TargetLowering *TLI;
 
+  /// Estimate the overhead of scalarizing an instruction. Insert and Extract
+  /// are set if the result needs to be inserted and/or extracted from vectors.
+  unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const;
+
 public:
-  AArch64TTI() : ImmutablePass(ID), ST(0), TLI(0) {
+  AArch64TTI() : ImmutablePass(ID), TM(nullptr), ST(nullptr), TLI(nullptr) {
     llvm_unreachable("This pass cannot be directly constructed");
   }
 
   AArch64TTI(const AArch64TargetMachine *TM)
-      : ImmutablePass(ID), ST(TM->getSubtargetImpl()),
+      : ImmutablePass(ID), TM(TM), ST(TM->getSubtargetImpl()),
         TLI(TM->getTargetLowering()) {
     initializeAArch64TTIPass(*PassRegistry::getPassRegistry());
   }
 
-  virtual void initializePass() override {
-    pushTTIStack(this);
-  }
+  void initializePass() override { pushTTIStack(this); }
 
-  virtual void getAnalysisUsage(AnalysisUsage &AU) const override {
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
     TargetTransformInfo::getAnalysisUsage(AU);
   }
 
@@ -59,31 +65,37 @@ public:
   static char ID;
 
   /// Provide necessary pointer adjustments for the two base classes.
-  virtual void *getAdjustedAnalysisPointer(const void *ID) override {
+  void *getAdjustedAnalysisPointer(const void *ID) override {
     if (ID == &TargetTransformInfo::ID)
-      return (TargetTransformInfo*)this;
+      return (TargetTransformInfo *)this;
     return this;
   }
 
   /// \name Scalar TTI Implementations
   /// @{
+  unsigned getIntImmCost(int64_t Val) const;
+  unsigned getIntImmCost(const APInt &Imm, Type *Ty) const override;
+  unsigned getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
+                         Type *Ty) const override;
+  unsigned getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
+                         Type *Ty) const override;
+  PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override;
 
   /// @}
 
-
   /// \name Vector TTI Implementations
   /// @{
 
-  unsigned getNumberOfRegisters(bool Vector) const {
+  unsigned getNumberOfRegisters(bool Vector) const override {
     if (Vector) {
       if (ST->hasNEON())
         return 32;
       return 0;
     }
-    return 32;
+    return 31;
   }
 
-  unsigned getRegisterBitWidth(bool Vector) const {
+  unsigned getRegisterBitWidth(bool Vector) const override {
     if (Vector) {
       if (ST->hasNEON())
         return 128;
@@ -92,6 +104,26 @@ public:
     return 64;
   }
 
+  unsigned getMaximumUnrollFactor() const override { return 2; }
+
+  unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const
+      override;
+
+  unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) const
+      override;
+
+  unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty,
+                                  OperandValueKind Opd1Info = OK_AnyValue,
+                                  OperandValueKind Opd2Info = OK_AnyValue) const
+      override;
+
+  unsigned getAddressComputationCost(Type *Ty, bool IsComplex) const override;
+
+  unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) const
+      override;
+
+  unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
+                           unsigned AddressSpace) const override;
   /// @}
 };
 
@@ -105,3 +137,328 @@ ImmutablePass *
 llvm::createAArch64TargetTransformInfoPass(const AArch64TargetMachine *TM) {
   return new AArch64TTI(TM);
 }
+
+/// \brief Calculate the cost of materializing a 64-bit value. This helper
+/// method might only calculate a fraction of a larger immediate. Therefore it
+/// is valid to return a cost of ZERO.
+unsigned AArch64TTI::getIntImmCost(int64_t Val) const {
+  // Check if the immediate can be encoded within an instruction.
+  if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64))
+    return 0;
+
+  if (Val < 0)
+    Val = ~Val;
+
+  // Calculate how many moves we will need to materialize this constant.
+  unsigned LZ = countLeadingZeros((uint64_t)Val);
+  return (64 - LZ + 15) / 16;
+}
+
+/// \brief Calculate the cost of materializing the given constant.
+unsigned AArch64TTI::getIntImmCost(const APInt &Imm, Type *Ty) const {
+  assert(Ty->isIntegerTy());
+
+  unsigned BitSize = Ty->getPrimitiveSizeInBits();
+  if (BitSize == 0)
+    return ~0U;
+
+  // Sign-extend all constants to a multiple of 64-bit.
+  APInt ImmVal = Imm;
+  if (BitSize & 0x3f)
+    ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);
+
+  // Split the constant into 64-bit chunks and calculate the cost for each
+  // chunk.
+  unsigned Cost = 0;
+  for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
+    APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
+    int64_t Val = Tmp.getSExtValue();
+    Cost += getIntImmCost(Val);
+  }
+  // We need at least one instruction to materialze the constant.
+  return std::max(1U, Cost);
+}
+
+unsigned AArch64TTI::getIntImmCost(unsigned Opcode, unsigned Idx,
+                                 const APInt &Imm, Type *Ty) const {
+  assert(Ty->isIntegerTy());
+
+  unsigned BitSize = Ty->getPrimitiveSizeInBits();
+  // There is no cost model for constants with a bit size of 0. Return TCC_Free
+  // here, so that constant hoisting will ignore this constant.
+  if (BitSize == 0)
+    return TCC_Free;
+
+  unsigned ImmIdx = ~0U;
+  switch (Opcode) {
+  default:
+    return TCC_Free;
+  case Instruction::GetElementPtr:
+    // Always hoist the base address of a GetElementPtr.
+    if (Idx == 0)
+      return 2 * TCC_Basic;
+    return TCC_Free;
+  case Instruction::Store:
+    ImmIdx = 0;
+    break;
+  case Instruction::Add:
+  case Instruction::Sub:
+  case Instruction::Mul:
+  case Instruction::UDiv:
+  case Instruction::SDiv:
+  case Instruction::URem:
+  case Instruction::SRem:
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor:
+  case Instruction::ICmp:
+    ImmIdx = 1;
+    break;
+  // Always return TCC_Free for the shift value of a shift instruction.
+  case Instruction::Shl:
+  case Instruction::LShr:
+  case Instruction::AShr:
+    if (Idx == 1)
+      return TCC_Free;
+    break;
+  case Instruction::Trunc:
+  case Instruction::ZExt:
+  case Instruction::SExt:
+  case Instruction::IntToPtr:
+  case Instruction::PtrToInt:
+  case Instruction::BitCast:
+  case Instruction::PHI:
+  case Instruction::Call:
+  case Instruction::Select:
+  case Instruction::Ret:
+  case Instruction::Load:
+    break;
+  }
+
+  if (Idx == ImmIdx) {
+    unsigned NumConstants = (BitSize + 63) / 64;
+    unsigned Cost = AArch64TTI::getIntImmCost(Imm, Ty);
+    return (Cost <= NumConstants * TCC_Basic)
+      ? static_cast<unsigned>(TCC_Free) : Cost;
+  }
+  return AArch64TTI::getIntImmCost(Imm, Ty);
+}
+
+unsigned AArch64TTI::getIntImmCost(Intrinsic::ID IID, unsigned Idx,
+                                 const APInt &Imm, Type *Ty) const {
+  assert(Ty->isIntegerTy());
+
+  unsigned BitSize = Ty->getPrimitiveSizeInBits();
+  // There is no cost model for constants with a bit size of 0. Return TCC_Free
+  // here, so that constant hoisting will ignore this constant.
+  if (BitSize == 0)
+    return TCC_Free;
+
+  switch (IID) {
+  default:
+    return TCC_Free;
+  case Intrinsic::sadd_with_overflow:
+  case Intrinsic::uadd_with_overflow:
+  case Intrinsic::ssub_with_overflow:
+  case Intrinsic::usub_with_overflow:
+  case Intrinsic::smul_with_overflow:
+  case Intrinsic::umul_with_overflow:
+    if (Idx == 1) {
+      unsigned NumConstants = (BitSize + 63) / 64;
+      unsigned Cost = AArch64TTI::getIntImmCost(Imm, Ty);
+      return (Cost <= NumConstants * TCC_Basic)
+        ? static_cast<unsigned>(TCC_Free) : Cost;
+    }
+    break;
+  case Intrinsic::experimental_stackmap:
+    if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
+      return TCC_Free;
+    break;
+  case Intrinsic::experimental_patchpoint_void:
+  case Intrinsic::experimental_patchpoint_i64:
+    if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
+      return TCC_Free;
+    break;
+  }
+  return AArch64TTI::getIntImmCost(Imm, Ty);
+}
+
+AArch64TTI::PopcntSupportKind
+AArch64TTI::getPopcntSupport(unsigned TyWidth) const {
+  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
+  if (TyWidth == 32 || TyWidth == 64)
+    return PSK_FastHardware;
+  // TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount.
+  return PSK_Software;
+}
+
+unsigned AArch64TTI::getCastInstrCost(unsigned Opcode, Type *Dst,
+                                    Type *Src) const {
+  int ISD = TLI->InstructionOpcodeToISD(Opcode);
+  assert(ISD && "Invalid opcode");
+
+  EVT SrcTy = TLI->getValueType(Src);
+  EVT DstTy = TLI->getValueType(Dst);
+
+  if (!SrcTy.isSimple() || !DstTy.isSimple())
+    return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src);
+
+  static const TypeConversionCostTblEntry<MVT> ConversionTbl[] = {
+    // LowerVectorINT_TO_FP:
+    { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
+    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 1 },
+    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 1 },
+    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 1 },
+    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
+    { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
+    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 1 },
+    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 1 },
+    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 1 },
+    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
+    // LowerVectorFP_TO_INT
+    { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
+    { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 },
+    { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
+    { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 },
+    { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 1 },
+    { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 1 },
+    { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 4 },
+    { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 4 },
+    { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 4 },
+    { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 4 },
+    { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 4 },
+    { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 4 },
+  };
+
+  int Idx = ConvertCostTableLookup<MVT>(
+      ConversionTbl, array_lengthof(ConversionTbl), ISD, DstTy.getSimpleVT(),
+      SrcTy.getSimpleVT());
+  if (Idx != -1)
+    return ConversionTbl[Idx].Cost;
+
+  return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src);
+}
+
+unsigned AArch64TTI::getVectorInstrCost(unsigned Opcode, Type *Val,
+                                      unsigned Index) const {
+  assert(Val->isVectorTy() && "This must be a vector type");
+
+  if (Index != -1U) {
+    // Legalize the type.
+    std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Val);
+
+    // This type is legalized to a scalar type.
+    if (!LT.second.isVector())
+      return 0;
+
+    // The type may be split. Normalize the index to the new type.
+    unsigned Width = LT.second.getVectorNumElements();
+    Index = Index % Width;
+
+    // The element at index zero is already inside the vector.
+    if (Index == 0)
+      return 0;
+  }
+
+  // All other insert/extracts cost this much.
+  return 2;
+}
+
+unsigned AArch64TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
+                                          OperandValueKind Opd1Info,
+                                          OperandValueKind Opd2Info) const {
+  // Legalize the type.
+  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Ty);
+
+  int ISD = TLI->InstructionOpcodeToISD(Opcode);
+
+  switch (ISD) {
+  default:
+    return TargetTransformInfo::getArithmeticInstrCost(Opcode, Ty, Opd1Info,
+                                                       Opd2Info);
+  case ISD::ADD:
+  case ISD::MUL:
+  case ISD::XOR:
+  case ISD::OR:
+  case ISD::AND:
+    // These nodes are marked as 'custom' for combining purposes only.
+    // We know that they are legal. See LowerAdd in ISelLowering.
+    return 1 * LT.first;
+  }
+}
+
+unsigned AArch64TTI::getAddressComputationCost(Type *Ty, bool IsComplex) const {
+  // Address computations in vectorized code with non-consecutive addresses will
+  // likely result in more instructions compared to scalar code where the
+  // computation can more often be merged into the index mode. The resulting
+  // extra micro-ops can significantly decrease throughput.
+  unsigned NumVectorInstToHideOverhead = 10;
+
+  if (Ty->isVectorTy() && IsComplex)
+    return NumVectorInstToHideOverhead;
+
+  // In many cases the address computation is not merged into the instruction
+  // addressing mode.
+  return 1;
+}
+
+unsigned AArch64TTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
+                                      Type *CondTy) const {
+
+  int ISD = TLI->InstructionOpcodeToISD(Opcode);
+  // We don't lower vector selects well that are wider than the register width.
+  if (ValTy->isVectorTy() && ISD == ISD::SELECT) {
+    // We would need this many instructions to hide the scalarization happening.
+    unsigned AmortizationCost = 20;
+    static const TypeConversionCostTblEntry<MVT::SimpleValueType>
+    VectorSelectTbl[] = {
+      { ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 * AmortizationCost },
+      { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 * AmortizationCost },
+      { ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 * AmortizationCost },
+      { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost },
+      { ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost },
+      { ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost }
+    };
+
+    EVT SelCondTy = TLI->getValueType(CondTy);
+    EVT SelValTy = TLI->getValueType(ValTy);
+    if (SelCondTy.isSimple() && SelValTy.isSimple()) {
+      int Idx =
+          ConvertCostTableLookup(VectorSelectTbl, ISD, SelCondTy.getSimpleVT(),
+                                 SelValTy.getSimpleVT());
+      if (Idx != -1)
+        return VectorSelectTbl[Idx].Cost;
+    }
+  }
+  return TargetTransformInfo::getCmpSelInstrCost(Opcode, ValTy, CondTy);
+}
+
+unsigned AArch64TTI::getMemoryOpCost(unsigned Opcode, Type *Src,
+                                   unsigned Alignment,
+                                   unsigned AddressSpace) const {
+  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Src);
+
+  if (Opcode == Instruction::Store && Src->isVectorTy() && Alignment != 16 &&
+      Src->getVectorElementType()->isIntegerTy(64)) {
+    // Unaligned stores are extremely inefficient. We don't split
+    // unaligned v2i64 stores because the negative impact that has shown in
+    // practice on inlined memcpy code.
+    // We make v2i64 stores expensive so that we will only vectorize if there
+    // are 6 other instructions getting vectorized.
+    unsigned AmortizationCost = 6;
+
+    return LT.first * 2 * AmortizationCost;
+  }
+
+  if (Src->isVectorTy() && Src->getVectorElementType()->isIntegerTy(8) &&
+      Src->getVectorNumElements() < 8) {
+    // We scalarize the loads/stores because there is not v.4b register and we
+    // have to promote the elements to v.4h.
+    unsigned NumVecElts = Src->getVectorNumElements();
+    unsigned NumVectorizableInstsToAmortize = NumVecElts * 2;
+    // We generate 2 instructions per vector element.
+    return NumVectorizableInstsToAmortize * NumVecElts * 2;
+  }
+
+  return LT.first;
+}
diff --git a/lib/Target/AArch64/Android.mk b/lib/Target/AArch64/Android.mk
index 144c2d3..d0a50da 100644
--- a/lib/Target/AArch64/Android.mk
+++ b/lib/Target/AArch64/Android.mk
@@ -3,31 +3,41 @@ LOCAL_PATH := $(call my-dir)
 arm64_codegen_TBLGEN_TABLES := \
   AArch64GenRegisterInfo.inc \
   AArch64GenInstrInfo.inc \
-  AArch64GenCodeEmitter.inc \
-  AArch64GenMCCodeEmitter.inc \
-  AArch64GenMCPseudoLowering.inc \
   AArch64GenAsmWriter.inc \
-  AArch64GenAsmMatcher.inc \
+  AArch64GenAsmWriter1.inc \
   AArch64GenDAGISel.inc \
-  AArch64GenFastISel.inc \
   AArch64GenCallingConv.inc \
+  AArch64GenAsmMatcher.inc \
   AArch64GenSubtargetInfo.inc \
-  AArch64GenDisassemblerTables.inc
+  AArch64GenMCCodeEmitter.inc \
+  AArch64GenFastISel.inc \
+  AArch64GenDisassemblerTables.inc \
+  AArch64GenMCPseudoLowering.inc \
 
 arm64_codegen_SRC_FILES := \
+  AArch64AddressTypePromotion.cpp \
+  AArch64AdvSIMDScalarPass.cpp \
   AArch64AsmPrinter.cpp \
+  AArch64BranchRelaxation.cpp \
+  AArch64CleanupLocalDynamicTLSPass.cpp \
+  AArch64CollectLOH.cpp \
+  AArch64ConditionalCompares.cpp \
+  AArch64DeadRegisterDefinitionsPass.cpp \
+  AArch64ExpandPseudoInsts.cpp \
+  AArch64FastISel.cpp \
   AArch64FrameLowering.cpp \
-  AArch64ISelDAGToDAG.cpp \
-  AArch64MachineFunctionInfo.cpp \
-  AArch64RegisterInfo.cpp \
-  AArch64Subtarget.cpp \
-  AArch64TargetObjectFile.cpp \
-  AArch64BranchFixupPass.cpp \
   AArch64InstrInfo.cpp \
+  AArch64ISelDAGToDAG.cpp \
   AArch64ISelLowering.cpp \
+  AArch64LoadStoreOptimizer.cpp \
   AArch64MCInstLower.cpp \
+  AArch64PromoteConstant.cpp \
+  AArch64RegisterInfo.cpp \
   AArch64SelectionDAGInfo.cpp \
+  AArch64StorePairSuppress.cpp \
+  AArch64Subtarget.cpp \
   AArch64TargetMachine.cpp \
+  AArch64TargetObjectFile.cpp \
   AArch64TargetTransformInfo.cpp
 
 # For the host
diff --git a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
index e933ec1..65b77c5 100644
--- a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
+++ b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
@@ -6,34 +6,31 @@
 // License. See LICENSE.TXT for details.
 //
 //===----------------------------------------------------------------------===//
-//
-// This file contains the (GNU-style) assembly parser for the AArch64
-// architecture.
-//
-//===----------------------------------------------------------------------===//
 
-
-#include "MCTargetDesc/AArch64MCTargetDesc.h"
+#include "MCTargetDesc/AArch64AddressingModes.h"
 #include "MCTargetDesc/AArch64MCExpr.h"
 #include "Utils/AArch64BaseInfo.h"
-#include "llvm/ADT/APFloat.h"
-#include "llvm/ADT/APInt.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/StringSwitch.h"
-#include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCParser/MCAsmLexer.h"
 #include "llvm/MC/MCParser/MCAsmParser.h"
 #include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
 #include "llvm/MC/MCTargetAsmParser.h"
-#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/SourceMgr.h"
 #include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
-
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/Twine.h"
+#include <cstdio>
 using namespace llvm;
 
 namespace {
@@ -41,21 +38,74 @@ namespace {
 class AArch64Operand;
 
 class AArch64AsmParser : public MCTargetAsmParser {
+public:
+  typedef SmallVectorImpl<MCParsedAsmOperand *> OperandVector;
+
+private:
+  StringRef Mnemonic; ///< Instruction mnemonic.
   MCSubtargetInfo &STI;
   MCAsmParser &Parser;
 
+  MCAsmParser &getParser() const { return Parser; }
+  MCAsmLexer &getLexer() const { return Parser.getLexer(); }
+
+  SMLoc getLoc() const { return Parser.getTok().getLoc(); }
+
+  bool parseSysAlias(StringRef Name, SMLoc NameLoc, OperandVector &Operands);
+  AArch64CC::CondCode parseCondCodeString(StringRef Cond);
+  bool parseCondCode(OperandVector &Operands, bool invertCondCode);
+  int tryParseRegister();
+  int tryMatchVectorRegister(StringRef &Kind, bool expected);
+  bool parseRegister(OperandVector &Operands);
+  bool parseSymbolicImmVal(const MCExpr *&ImmVal);
+  bool parseVectorList(OperandVector &Operands);
+  bool parseOperand(OperandVector &Operands, bool isCondCode,
+                    bool invertCondCode);
+
+  void Warning(SMLoc L, const Twine &Msg) { Parser.Warning(L, Msg); }
+  bool Error(SMLoc L, const Twine &Msg) { return Parser.Error(L, Msg); }
+  bool showMatchError(SMLoc Loc, unsigned ErrCode);
+
+  bool parseDirectiveWord(unsigned Size, SMLoc L);
+  bool parseDirectiveTLSDescCall(SMLoc L);
+
+  bool parseDirectiveLOH(StringRef LOH, SMLoc L);
+
+  bool validateInstruction(MCInst &Inst, SmallVectorImpl<SMLoc> &Loc);
+  bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+                               OperandVector &Operands, MCStreamer &Out,
+                               unsigned &ErrorInfo,
+                               bool MatchingInlineAsm) override;
+/// @name Auto-generated Match Functions
+/// {
+
 #define GET_ASSEMBLER_HEADER
 #include "AArch64GenAsmMatcher.inc"
 
+  /// }
+
+  OperandMatchResultTy tryParseOptionalShiftExtend(OperandVector &Operands);
+  OperandMatchResultTy tryParseBarrierOperand(OperandVector &Operands);
+  OperandMatchResultTy tryParseMRSSystemRegister(OperandVector &Operands);
+  OperandMatchResultTy tryParseSysReg(OperandVector &Operands);
+  OperandMatchResultTy tryParseSysCROperand(OperandVector &Operands);
+  OperandMatchResultTy tryParsePrefetch(OperandVector &Operands);
+  OperandMatchResultTy tryParseAdrpLabel(OperandVector &Operands);
+  OperandMatchResultTy tryParseAdrLabel(OperandVector &Operands);
+  OperandMatchResultTy tryParseFPImm(OperandVector &Operands);
+  OperandMatchResultTy tryParseAddSubImm(OperandVector &Operands);
+  OperandMatchResultTy tryParseGPR64sp0Operand(OperandVector &Operands);
+  bool tryParseVectorRegister(OperandVector &Operands);
+
 public:
   enum AArch64MatchResultTy {
-    Match_FirstAArch64 = FIRST_TARGET_MATCH_RESULT_TY,
+    Match_InvalidSuffix = FIRST_TARGET_MATCH_RESULT_TY,
 #define GET_OPERAND_DIAGNOSTIC_TYPES
 #include "AArch64GenAsmMatcher.inc"
   };
-
   AArch64AsmParser(MCSubtargetInfo &_STI, MCAsmParser &_Parser,
-                   const MCInstrInfo &MII)
+                 const MCInstrInfo &MII,
+                 const MCTargetOptions &Options)
       : MCTargetAsmParser(), STI(_STI), Parser(_Parser) {
     MCAsmParserExtension::Initialize(_Parser);
 
@@ -63,191 +113,197 @@ public:
     setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
   }
 
-  // These are the public interface of the MCTargetAsmParser
-  bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc);
   bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
-                        SMLoc NameLoc,
-                        SmallVectorImpl<MCParsedAsmOperand*> &Operands);
-
-  bool ParseDirective(AsmToken DirectiveID);
-  bool ParseDirectiveTLSDescCall(SMLoc L);
-  bool ParseDirectiveWord(unsigned Size, SMLoc L);
-
-  bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
-                               SmallVectorImpl<MCParsedAsmOperand*> &Operands,
-                               MCStreamer&Out, unsigned &ErrorInfo,
-                               bool MatchingInlineAsm);
-
-  // The rest of the sub-parsers have more freedom over interface: they return
-  // an OperandMatchResultTy because it's less ambiguous than true/false or
-  // -1/0/1 even if it is more verbose
-  OperandMatchResultTy
-  ParseOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
-               StringRef Mnemonic);
-
-  OperandMatchResultTy ParseImmediate(const MCExpr *&ExprVal);
-
-  OperandMatchResultTy ParseRelocPrefix(AArch64MCExpr::VariantKind &RefKind);
-
-  OperandMatchResultTy
-  ParseNEONLane(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
-                uint32_t NumLanes);
-
-  OperandMatchResultTy
-  ParseRegister(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
-                uint32_t &NumLanes);
-
-  OperandMatchResultTy
-  ParseImmWithLSLOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands);
-
-  OperandMatchResultTy
-  ParseCondCodeOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands);
-
-  OperandMatchResultTy
-  ParseCRxOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands);
-
-  OperandMatchResultTy
-  ParseFPImmOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands);
-
-  OperandMatchResultTy
-  ParseFPImm0AndImm0Operand( SmallVectorImpl<MCParsedAsmOperand*> &Operands);
-
-  template<typename SomeNamedImmMapper> OperandMatchResultTy
-  ParseNamedImmOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
-    return ParseNamedImmOperand(SomeNamedImmMapper(), Operands);
-  }
-
-  OperandMatchResultTy
-  ParseNamedImmOperand(const NamedImmMapper &Mapper,
-                       SmallVectorImpl<MCParsedAsmOperand*> &Operands);
-
-  OperandMatchResultTy
-  ParseLSXAddressOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands);
-
-  OperandMatchResultTy
-  ParseShiftExtend(SmallVectorImpl<MCParsedAsmOperand*> &Operands);
-
-  OperandMatchResultTy
-  ParseSysRegOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands);
-
-  bool TryParseVector(uint32_t &RegNum, SMLoc &RegEndLoc, StringRef &Layout,
-                      SMLoc &LayoutLoc);
-
-  OperandMatchResultTy ParseVectorList(SmallVectorImpl<MCParsedAsmOperand *> &);
-
-  bool validateInstruction(MCInst &Inst,
-                          const SmallVectorImpl<MCParsedAsmOperand*> &Operands);
-
-  /// Scan the next token (which had better be an identifier) and determine
-  /// whether it represents a general-purpose or vector register. It returns
-  /// true if an identifier was found and populates its reference arguments. It
-  /// does not consume the token.
-  bool
-  IdentifyRegister(unsigned &RegNum, SMLoc &RegEndLoc, StringRef &LayoutSpec,
-                   SMLoc &LayoutLoc) const;
-
+                        SMLoc NameLoc, OperandVector &Operands) override;
+  bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override;
+  bool ParseDirective(AsmToken DirectiveID) override;
+  unsigned validateTargetOperandClass(MCParsedAsmOperand *Op,
+                                      unsigned Kind) override;
+
+  static bool classifySymbolRef(const MCExpr *Expr,
+                                AArch64MCExpr::VariantKind &ELFRefKind,
+                                MCSymbolRefExpr::VariantKind &DarwinRefKind,
+                                int64_t &Addend);
 };
-
-}
+} // end anonymous namespace
 
 namespace {
 
-/// Instances of this class represent a parsed AArch64 machine instruction.
+/// AArch64Operand - Instances of this class represent a parsed AArch64 machine
+/// instruction.
 class AArch64Operand : public MCParsedAsmOperand {
 private:
   enum KindTy {
-    k_ImmWithLSL,     // #uimm {, LSL #amt }
-    k_CondCode,       // eq/ne/...
-    k_FPImmediate,    // Limited-precision floating-point imm
-    k_Immediate,      // Including expressions referencing symbols
+    k_Immediate,
+    k_ShiftedImm,
+    k_CondCode,
     k_Register,
+    k_VectorList,
+    k_VectorIndex,
+    k_Token,
+    k_SysReg,
+    k_SysCR,
+    k_Prefetch,
     k_ShiftExtend,
-    k_VectorList,     // A sequential list of 1 to 4 registers.
-    k_SysReg,         // The register operand of MRS and MSR instructions
-    k_Token,          // The mnemonic; other raw tokens the auto-generated
-    k_WrappedRegister // Load/store exclusive permit a wrapped register.
+    k_FPImm,
+    k_Barrier
   } Kind;
 
   SMLoc StartLoc, EndLoc;
 
-  struct ImmWithLSLOp {
-    const MCExpr *Val;
-    unsigned ShiftAmount;
-    bool ImplicitAmount;
+  struct TokOp {
+    const char *Data;
+    unsigned Length;
+    bool IsSuffix; // Is the operand actually a suffix on the mnemonic.
   };
 
-  struct CondCodeOp {
-    A64CC::CondCodes Code;
+  struct RegOp {
+    unsigned RegNum;
+    bool isVector;
   };
 
-  struct FPImmOp {
-    double Val;
+  struct VectorListOp {
+    unsigned RegNum;
+    unsigned Count;
+    unsigned NumElements;
+    unsigned ElementKind;
+  };
+
+  struct VectorIndexOp {
+    unsigned Val;
   };
 
   struct ImmOp {
     const MCExpr *Val;
   };
 
-  struct RegOp {
-    unsigned RegNum;
+  struct ShiftedImmOp {
+    const MCExpr *Val;
+    unsigned ShiftAmount;
   };
 
-  struct ShiftExtendOp {
-    A64SE::ShiftExtSpecifiers ShiftType;
-    unsigned Amount;
-    bool ImplicitAmount;
+  struct CondCodeOp {
+    AArch64CC::CondCode Code;
   };
 
-  // A vector register list is a sequential list of 1 to 4 registers.
-  struct VectorListOp {
-    unsigned RegNum;
-    unsigned Count;
-    A64Layout::VectorLayout Layout;
+  struct FPImmOp {
+    unsigned Val; // Encoded 8-bit representation.
+  };
+
+  struct BarrierOp {
+    unsigned Val; // Not the enum since not all values have names.
   };
 
   struct SysRegOp {
     const char *Data;
     unsigned Length;
+    uint64_t FeatureBits; // We need to pass through information about which
+                          // core we are compiling for so that the SysReg
+                          // Mappers can appropriately conditionalize.
   };
 
-  struct TokOp {
-    const char *Data;
-    unsigned Length;
+  struct SysCRImmOp {
+    unsigned Val;
+  };
+
+  struct PrefetchOp {
+    unsigned Val;
+  };
+
+  struct ShiftExtendOp {
+    AArch64_AM::ShiftExtendType Type;
+    unsigned Amount;
+    bool HasExplicitAmount;
+  };
+
+  struct ExtendOp {
+    unsigned Val;
   };
 
   union {
-    struct ImmWithLSLOp ImmWithLSL;
-    struct CondCodeOp CondCode;
-    struct FPImmOp FPImm;
-    struct ImmOp Imm;
+    struct TokOp Tok;
     struct RegOp Reg;
-    struct ShiftExtendOp ShiftExtend;
     struct VectorListOp VectorList;
+    struct VectorIndexOp VectorIndex;
+    struct ImmOp Imm;
+    struct ShiftedImmOp ShiftedImm;
+    struct CondCodeOp CondCode;
+    struct FPImmOp FPImm;
+    struct BarrierOp Barrier;
     struct SysRegOp SysReg;
-    struct TokOp Tok;
+    struct SysCRImmOp SysCRImm;
+    struct PrefetchOp Prefetch;
+    struct ShiftExtendOp ShiftExtend;
   };
 
-  AArch64Operand(KindTy K, SMLoc S, SMLoc E)
-    : MCParsedAsmOperand(), Kind(K), StartLoc(S), EndLoc(E) {}
+  // Keep the MCContext around as the MCExprs may need manipulated during
+  // the add<>Operands() calls.
+  MCContext &Ctx;
+
+  AArch64Operand(KindTy K, MCContext &_Ctx)
+      : MCParsedAsmOperand(), Kind(K), Ctx(_Ctx) {}
 
 public:
-  AArch64Operand(const AArch64Operand &o) : MCParsedAsmOperand() {
+  AArch64Operand(const AArch64Operand &o) : MCParsedAsmOperand(), Ctx(o.Ctx) {
+    Kind = o.Kind;
+    StartLoc = o.StartLoc;
+    EndLoc = o.EndLoc;
+    switch (Kind) {
+    case k_Token:
+      Tok = o.Tok;
+      break;
+    case k_Immediate:
+      Imm = o.Imm;
+      break;
+    case k_ShiftedImm:
+      ShiftedImm = o.ShiftedImm;
+      break;
+    case k_CondCode:
+      CondCode = o.CondCode;
+      break;
+    case k_FPImm:
+      FPImm = o.FPImm;
+      break;
+    case k_Barrier:
+      Barrier = o.Barrier;
+      break;
+    case k_Register:
+      Reg = o.Reg;
+      break;
+    case k_VectorList:
+      VectorList = o.VectorList;
+      break;
+    case k_VectorIndex:
+      VectorIndex = o.VectorIndex;
+      break;
+    case k_SysReg:
+      SysReg = o.SysReg;
+      break;
+    case k_SysCR:
+      SysCRImm = o.SysCRImm;
+      break;
+    case k_Prefetch:
+      Prefetch = o.Prefetch;
+      break;
+    case k_ShiftExtend:
+      ShiftExtend = o.ShiftExtend;
+      break;
+    }
   }
 
-  SMLoc getStartLoc() const { return StartLoc; }
-  SMLoc getEndLoc() const { return EndLoc; }
-  void print(raw_ostream&) const;
-  void dump() const;
+  /// getStartLoc - Get the location of the first token of this operand.
+  SMLoc getStartLoc() const override { return StartLoc; }
+  /// getEndLoc - Get the location of the last token of this operand.
+  SMLoc getEndLoc() const override { return EndLoc; }
 
   StringRef getToken() const {
     assert(Kind == k_Token && "Invalid access!");
     return StringRef(Tok.Data, Tok.Length);
   }
 
-  unsigned getReg() const {
-    assert((Kind == k_Register || Kind == k_WrappedRegister)
-           && "Invalid access!");
-    return Reg.RegNum;
+  bool isTokenSuffix() const {
+    assert(Kind == k_Token && "Invalid access!");
+    return Tok.IsSuffix;
   }
 
   const MCExpr *getImm() const {
@@ -255,1234 +311,1778 @@ public:
     return Imm.Val;
   }
 
-  A64CC::CondCodes getCondCode() const {
-    assert(Kind == k_CondCode && "Invalid access!");
-    return CondCode.Code;
+  const MCExpr *getShiftedImmVal() const {
+    assert(Kind == k_ShiftedImm && "Invalid access!");
+    return ShiftedImm.Val;
   }
 
-  static bool isNonConstantExpr(const MCExpr *E,
-                                AArch64MCExpr::VariantKind &Variant) {
-    if (const AArch64MCExpr *A64E = dyn_cast<AArch64MCExpr>(E)) {
-      Variant = A64E->getKind();
-      return true;
-    } else if (!isa<MCConstantExpr>(E)) {
-      Variant = AArch64MCExpr::VK_AARCH64_None;
-      return true;
-    }
-
-    return false;
+  unsigned getShiftedImmShift() const {
+    assert(Kind == k_ShiftedImm && "Invalid access!");
+    return ShiftedImm.ShiftAmount;
   }
 
-  bool isCondCode() const { return Kind == k_CondCode; }
-  bool isToken() const { return Kind == k_Token; }
-  bool isReg() const { return Kind == k_Register; }
-  bool isImm() const { return Kind == k_Immediate; }
-  bool isMem() const { return false; }
-  bool isFPImm() const { return Kind == k_FPImmediate; }
-  bool isShiftOrExtend() const { return Kind == k_ShiftExtend; }
-  bool isSysReg() const { return Kind == k_SysReg; }
-  bool isImmWithLSL() const { return Kind == k_ImmWithLSL; }
-  bool isWrappedReg() const { return Kind == k_WrappedRegister; }
-
-  bool isAddSubImmLSL0() const {
-    if (!isImmWithLSL()) return false;
-    if (ImmWithLSL.ShiftAmount != 0) return false;
-
-    AArch64MCExpr::VariantKind Variant;
-    if (isNonConstantExpr(ImmWithLSL.Val, Variant)) {
-      return Variant == AArch64MCExpr::VK_AARCH64_LO12
-          || Variant == AArch64MCExpr::VK_AARCH64_DTPREL_LO12
-          || Variant == AArch64MCExpr::VK_AARCH64_DTPREL_LO12_NC
-          || Variant == AArch64MCExpr::VK_AARCH64_TPREL_LO12
-          || Variant == AArch64MCExpr::VK_AARCH64_TPREL_LO12_NC
-          || Variant == AArch64MCExpr::VK_AARCH64_TLSDESC_LO12;
-    }
-
-    // Otherwise it should be a real immediate in range:
-    const MCConstantExpr *CE = cast<MCConstantExpr>(ImmWithLSL.Val);
-    return CE->getValue() >= 0 && CE->getValue() <= 0xfff;
+  AArch64CC::CondCode getCondCode() const {
+    assert(Kind == k_CondCode && "Invalid access!");
+    return CondCode.Code;
   }
 
-  bool isAddSubImmLSL12() const {
-    if (!isImmWithLSL()) return false;
-    if (ImmWithLSL.ShiftAmount != 12) return false;
-
-    AArch64MCExpr::VariantKind Variant;
-    if (isNonConstantExpr(ImmWithLSL.Val, Variant)) {
-      return Variant == AArch64MCExpr::VK_AARCH64_DTPREL_HI12
-          || Variant == AArch64MCExpr::VK_AARCH64_TPREL_HI12;
-    }
-
-    // Otherwise it should be a real immediate in range:
-    const MCConstantExpr *CE = cast<MCConstantExpr>(ImmWithLSL.Val);
-    return CE->getValue() >= 0 && CE->getValue() <= 0xfff;
+  unsigned getFPImm() const {
+    assert(Kind == k_FPImm && "Invalid access!");
+    return FPImm.Val;
   }
 
-  template<unsigned MemSize, unsigned RmSize> bool isAddrRegExtend() const {
-    if (!isShiftOrExtend()) return false;
-
-    A64SE::ShiftExtSpecifiers Ext = ShiftExtend.ShiftType;
-    if (RmSize == 32 && !(Ext == A64SE::UXTW || Ext == A64SE::SXTW))
-      return false;
-
-    if (RmSize == 64 && !(Ext == A64SE::LSL || Ext == A64SE::SXTX))
-      return false;
-
-    return ShiftExtend.Amount == Log2_32(MemSize) || ShiftExtend.Amount == 0;
+  unsigned getBarrier() const {
+    assert(Kind == k_Barrier && "Invalid access!");
+    return Barrier.Val;
   }
 
-  bool isAdrpLabel() const {
-    if (!isImm()) return false;
-
-    AArch64MCExpr::VariantKind Variant;
-    if (isNonConstantExpr(getImm(), Variant)) {
-      return Variant == AArch64MCExpr::VK_AARCH64_None
-        || Variant == AArch64MCExpr::VK_AARCH64_GOT
-        || Variant == AArch64MCExpr::VK_AARCH64_GOTTPREL
-        || Variant == AArch64MCExpr::VK_AARCH64_TLSDESC;
-    }
-
-    return isLabel<21, 4096>();
+  unsigned getReg() const override {
+    assert(Kind == k_Register && "Invalid access!");
+    return Reg.RegNum;
   }
 
-  template<unsigned RegWidth>  bool isBitfieldWidth() const {
-    if (!isImm()) return false;
-
-    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
-    if (!CE) return false;
-
-    return CE->getValue() >= 1 && CE->getValue() <= RegWidth;
+  unsigned getVectorListStart() const {
+    assert(Kind == k_VectorList && "Invalid access!");
+    return VectorList.RegNum;
   }
 
-  template<int RegWidth>
-  bool isCVTFixedPos() const {
-    if (!isImm()) return false;
-
-    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
-    if (!CE) return false;
-
-    return CE->getValue() >= 1 && CE->getValue() <= RegWidth;
+  unsigned getVectorListCount() const {
+    assert(Kind == k_VectorList && "Invalid access!");
+    return VectorList.Count;
   }
 
-  bool isFMOVImm() const {
-    if (!isFPImm()) return false;
-
-    APFloat RealVal(FPImm.Val);
-    uint32_t ImmVal;
-    return A64Imms::isFPImm(RealVal, ImmVal);
+  unsigned getVectorIndex() const {
+    assert(Kind == k_VectorIndex && "Invalid access!");
+    return VectorIndex.Val;
   }
 
-  bool isFPZero() const {
-    if (!isFPImm()) return false;
+  StringRef getSysReg() const {
+    assert(Kind == k_SysReg && "Invalid access!");
+    return StringRef(SysReg.Data, SysReg.Length);
+  }
 
-    APFloat RealVal(FPImm.Val);
-    return RealVal.isPosZero();
+  uint64_t getSysRegFeatureBits() const {
+    assert(Kind == k_SysReg && "Invalid access!");
+    return SysReg.FeatureBits;
   }
 
-  template<unsigned field_width, unsigned scale>
-  bool isLabel() const {
-    if (!isImm()) return false;
+  unsigned getSysCR() const {
+    assert(Kind == k_SysCR && "Invalid access!");
+    return SysCRImm.Val;
+  }
 
-    if (dyn_cast<MCSymbolRefExpr>(Imm.Val)) {
-      return true;
-    } else if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Imm.Val)) {
-      int64_t Val = CE->getValue();
-      int64_t Min = - (scale * (1LL << (field_width - 1)));
-      int64_t Max = scale * ((1LL << (field_width - 1)) - 1);
-      return (Val % scale) == 0 && Val >= Min && Val <= Max;
-    }
+  unsigned getPrefetch() const {
+    assert(Kind == k_Prefetch && "Invalid access!");
+    return Prefetch.Val;
+  }
 
-    // N.b. this disallows explicit relocation specifications via an
-    // AArch64MCExpr. Users needing that behaviour
-    return false;
+  AArch64_AM::ShiftExtendType getShiftExtendType() const {
+    assert(Kind == k_ShiftExtend && "Invalid access!");
+    return ShiftExtend.Type;
   }
 
-  bool isLane1() const {
-    if (!isImm()) return false;
+  unsigned getShiftExtendAmount() const {
+    assert(Kind == k_ShiftExtend && "Invalid access!");
+    return ShiftExtend.Amount;
+  }
 
-    // Because it's come through custom assembly parsing, it must always be a
-    // constant expression.
-    return cast<MCConstantExpr>(getImm())->getValue() == 1;
+  bool hasShiftExtendAmount() const {
+    assert(Kind == k_ShiftExtend && "Invalid access!");
+    return ShiftExtend.HasExplicitAmount;
   }
 
-  bool isLoadLitLabel() const {
-    if (!isImm()) return false;
+  bool isImm() const override { return Kind == k_Immediate; }
+  bool isMem() const override { return false; }
+  bool isSImm9() const {
+    if (!isImm())
+      return false;
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    if (!MCE)
+      return false;
+    int64_t Val = MCE->getValue();
+    return (Val >= -256 && Val < 256);
+  }
+  bool isSImm7s4() const {
+    if (!isImm())
+      return false;
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    if (!MCE)
+      return false;
+    int64_t Val = MCE->getValue();
+    return (Val >= -256 && Val <= 252 && (Val & 3) == 0);
+  }
+  bool isSImm7s8() const {
+    if (!isImm())
+      return false;
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    if (!MCE)
+      return false;
+    int64_t Val = MCE->getValue();
+    return (Val >= -512 && Val <= 504 && (Val & 7) == 0);
+  }
+  bool isSImm7s16() const {
+    if (!isImm())
+      return false;
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    if (!MCE)
+      return false;
+    int64_t Val = MCE->getValue();
+    return (Val >= -1024 && Val <= 1008 && (Val & 15) == 0);
+  }
+
+  bool isSymbolicUImm12Offset(const MCExpr *Expr, unsigned Scale) const {
+    AArch64MCExpr::VariantKind ELFRefKind;
+    MCSymbolRefExpr::VariantKind DarwinRefKind;
+    int64_t Addend;
+    if (!AArch64AsmParser::classifySymbolRef(Expr, ELFRefKind, DarwinRefKind,
+                                           Addend)) {
+      // If we don't understand the expression, assume the best and
+      // let the fixup and relocation code deal with it.
+      return true;
+    }
 
-    AArch64MCExpr::VariantKind Variant;
-    if (isNonConstantExpr(getImm(), Variant)) {
-      return Variant == AArch64MCExpr::VK_AARCH64_None
-          || Variant == AArch64MCExpr::VK_AARCH64_GOTTPREL;
+    if (DarwinRefKind == MCSymbolRefExpr::VK_PAGEOFF ||
+        ELFRefKind == AArch64MCExpr::VK_LO12 ||
+        ELFRefKind == AArch64MCExpr::VK_GOT_LO12 ||
+        ELFRefKind == AArch64MCExpr::VK_DTPREL_LO12 ||
+        ELFRefKind == AArch64MCExpr::VK_DTPREL_LO12_NC ||
+        ELFRefKind == AArch64MCExpr::VK_TPREL_LO12 ||
+        ELFRefKind == AArch64MCExpr::VK_TPREL_LO12_NC ||
+        ELFRefKind == AArch64MCExpr::VK_GOTTPREL_LO12_NC ||
+        ELFRefKind == AArch64MCExpr::VK_TLSDESC_LO12) {
+      // Note that we don't range-check the addend. It's adjusted modulo page
+      // size when converted, so there is no "out of range" condition when using
+      // @pageoff.
+      return Addend >= 0 && (Addend % Scale) == 0;
+    } else if (DarwinRefKind == MCSymbolRefExpr::VK_GOTPAGEOFF ||
+               DarwinRefKind == MCSymbolRefExpr::VK_TLVPPAGEOFF) {
+      // @gotpageoff/@tlvppageoff can only be used directly, not with an addend.
+      return Addend == 0;
     }
 
-    return isLabel<19, 4>();
+    return false;
   }
 
-  template<unsigned RegWidth> bool isLogicalImm() const {
-    if (!isImm()) return false;
+  template <int Scale> bool isUImm12Offset() const {
+    if (!isImm())
+      return false;
 
-    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Imm.Val);
-    if (!CE) return false;
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    if (!MCE)
+      return isSymbolicUImm12Offset(getImm(), Scale);
 
-    uint32_t Bits;
-    return A64Imms::isLogicalImm(RegWidth, CE->getValue(), Bits);
+    int64_t Val = MCE->getValue();
+    return (Val % Scale) == 0 && Val >= 0 && (Val / Scale) < 0x1000;
   }
 
-  template<unsigned RegWidth> bool isLogicalImmMOV() const {
-    if (!isLogicalImm<RegWidth>()) return false;
-
-    const MCConstantExpr *CE = cast<MCConstantExpr>(Imm.Val);
-
-    // The move alias for ORR is only valid if the immediate cannot be
-    // represented with a move (immediate) instruction; they take priority.
-    int UImm16, Shift;
-    return !A64Imms::isMOVZImm(RegWidth, CE->getValue(), UImm16, Shift)
-      && !A64Imms::isMOVNImm(RegWidth, CE->getValue(), UImm16, Shift);
+  bool isImm0_7() const {
+    if (!isImm())
+      return false;
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    if (!MCE)
+      return false;
+    int64_t Val = MCE->getValue();
+    return (Val >= 0 && Val < 8);
+  }
+  bool isImm1_8() const {
+    if (!isImm())
+      return false;
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    if (!MCE)
+      return false;
+    int64_t Val = MCE->getValue();
+    return (Val > 0 && Val < 9);
+  }
+  bool isImm0_15() const {
+    if (!isImm())
+      return false;
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    if (!MCE)
+      return false;
+    int64_t Val = MCE->getValue();
+    return (Val >= 0 && Val < 16);
+  }
+  bool isImm1_16() const {
+    if (!isImm())
+      return false;
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    if (!MCE)
+      return false;
+    int64_t Val = MCE->getValue();
+    return (Val > 0 && Val < 17);
+  }
+  bool isImm0_31() const {
+    if (!isImm())
+      return false;
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    if (!MCE)
+      return false;
+    int64_t Val = MCE->getValue();
+    return (Val >= 0 && Val < 32);
+  }
+  bool isImm1_31() const {
+    if (!isImm())
+      return false;
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    if (!MCE)
+      return false;
+    int64_t Val = MCE->getValue();
+    return (Val >= 1 && Val < 32);
+  }
+  bool isImm1_32() const {
+    if (!isImm())
+      return false;
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    if (!MCE)
+      return false;
+    int64_t Val = MCE->getValue();
+    return (Val >= 1 && Val < 33);
+  }
+  bool isImm0_63() const {
+    if (!isImm())
+      return false;
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    if (!MCE)
+      return false;
+    int64_t Val = MCE->getValue();
+    return (Val >= 0 && Val < 64);
+  }
+  bool isImm1_63() const {
+    if (!isImm())
+      return false;
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    if (!MCE)
+      return false;
+    int64_t Val = MCE->getValue();
+    return (Val >= 1 && Val < 64);
+  }
+  bool isImm1_64() const {
+    if (!isImm())
+      return false;
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    if (!MCE)
+      return false;
+    int64_t Val = MCE->getValue();
+    return (Val >= 1 && Val < 65);
+  }
+  bool isImm0_127() const {
+    if (!isImm())
+      return false;
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    if (!MCE)
+      return false;
+    int64_t Val = MCE->getValue();
+    return (Val >= 0 && Val < 128);
+  }
+  bool isImm0_255() const {
+    if (!isImm())
+      return false;
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    if (!MCE)
+      return false;
+    int64_t Val = MCE->getValue();
+    return (Val >= 0 && Val < 256);
+  }
+  bool isImm0_65535() const {
+    if (!isImm())
+      return false;
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    if (!MCE)
+      return false;
+    int64_t Val = MCE->getValue();
+    return (Val >= 0 && Val < 65536);
+  }
+  bool isImm32_63() const {
+    if (!isImm())
+      return false;
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    if (!MCE)
+      return false;
+    int64_t Val = MCE->getValue();
+    return (Val >= 32 && Val < 64);
+  }
+  bool isLogicalImm32() const {
+    if (!isImm())
+      return false;
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    if (!MCE)
+      return false;
+    return AArch64_AM::isLogicalImmediate(MCE->getValue(), 32);
   }
+  bool isLogicalImm64() const {
+    if (!isImm())
+      return false;
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    if (!MCE)
+      return false;
+    return AArch64_AM::isLogicalImmediate(MCE->getValue(), 64);
+  }
+  bool isShiftedImm() const { return Kind == k_ShiftedImm; }
+  bool isAddSubImm() const {
+    if (!isShiftedImm() && !isImm())
+      return false;
 
-  template<int MemSize>
-  bool isOffsetUImm12() const {
-    if (!isImm()) return false;
+    const MCExpr *Expr;
 
-    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    // An ADD/SUB shifter is either 'lsl #0' or 'lsl #12'.
+    if (isShiftedImm()) {
+      unsigned Shift = ShiftedImm.ShiftAmount;
+      Expr = ShiftedImm.Val;
+      if (Shift != 0 && Shift != 12)
+        return false;
+    } else {
+      Expr = getImm();
+    }
 
-    // Assume they know what they're doing for now if they've given us a
-    // non-constant expression. In principle we could check for ridiculous
-    // things that can't possibly work or relocations that would almost
-    // certainly break resulting code.
-    if (!CE)
+    AArch64MCExpr::VariantKind ELFRefKind;
+    MCSymbolRefExpr::VariantKind DarwinRefKind;
+    int64_t Addend;
+    if (AArch64AsmParser::classifySymbolRef(Expr, ELFRefKind,
+                                          DarwinRefKind, Addend)) {
+      return DarwinRefKind == MCSymbolRefExpr::VK_PAGEOFF
+          || DarwinRefKind == MCSymbolRefExpr::VK_TLVPPAGEOFF
+          || (DarwinRefKind == MCSymbolRefExpr::VK_GOTPAGEOFF && Addend == 0)
+          || ELFRefKind == AArch64MCExpr::VK_LO12
+          || ELFRefKind == AArch64MCExpr::VK_DTPREL_HI12
+          || ELFRefKind == AArch64MCExpr::VK_DTPREL_LO12
+          || ELFRefKind == AArch64MCExpr::VK_DTPREL_LO12_NC
+          || ELFRefKind == AArch64MCExpr::VK_TPREL_HI12
+          || ELFRefKind == AArch64MCExpr::VK_TPREL_LO12
+          || ELFRefKind == AArch64MCExpr::VK_TPREL_LO12_NC
+          || ELFRefKind == AArch64MCExpr::VK_TLSDESC_LO12;
+    }
+
+    // Otherwise it should be a real immediate in range:
+    const MCConstantExpr *CE = cast<MCConstantExpr>(Expr);
+    return CE->getValue() >= 0 && CE->getValue() <= 0xfff;
+  }
+  bool isCondCode() const { return Kind == k_CondCode; }
+  bool isSIMDImmType10() const {
+    if (!isImm())
+      return false;
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    if (!MCE)
+      return false;
+    return AArch64_AM::isAdvSIMDModImmType10(MCE->getValue());
+  }
+  bool isBranchTarget26() const {
+    if (!isImm())
+      return false;
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    if (!MCE)
+      return true;
+    int64_t Val = MCE->getValue();
+    if (Val & 0x3)
+      return false;
+    return (Val >= -(0x2000000 << 2) && Val <= (0x1ffffff << 2));
+  }
+  bool isPCRelLabel19() const {
+    if (!isImm())
+      return false;
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    if (!MCE)
+      return true;
+    int64_t Val = MCE->getValue();
+    if (Val & 0x3)
+      return false;
+    return (Val >= -(0x40000 << 2) && Val <= (0x3ffff << 2));
+  }
+  bool isBranchTarget14() const {
+    if (!isImm())
+      return false;
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    if (!MCE)
       return true;
+    int64_t Val = MCE->getValue();
+    if (Val & 0x3)
+      return false;
+    return (Val >= -(0x2000 << 2) && Val <= (0x1fff << 2));
+  }
 
-    int64_t Val = CE->getValue();
+  bool
+  isMovWSymbol(ArrayRef<AArch64MCExpr::VariantKind> AllowedModifiers) const {
+    if (!isImm())
+      return false;
 
-    // Must be a multiple of the access size in bytes.
-    if ((Val & (MemSize - 1)) != 0) return false;
+    AArch64MCExpr::VariantKind ELFRefKind;
+    MCSymbolRefExpr::VariantKind DarwinRefKind;
+    int64_t Addend;
+    if (!AArch64AsmParser::classifySymbolRef(getImm(), ELFRefKind,
+                                             DarwinRefKind, Addend)) {
+      return false;
+    }
+    if (DarwinRefKind != MCSymbolRefExpr::VK_None)
+      return false;
 
-    // Must be 12-bit unsigned
-    return Val >= 0 && Val <= 0xfff * MemSize;
-  }
+    for (unsigned i = 0; i != AllowedModifiers.size(); ++i) {
+      if (ELFRefKind == AllowedModifiers[i])
+        return Addend == 0;
+    }
 
-  template<A64SE::ShiftExtSpecifiers SHKind, bool is64Bit>
-  bool isShift() const {
-    if (!isShiftOrExtend()) return false;
+    return false;
+  }
 
-    if (ShiftExtend.ShiftType != SHKind)
-      return false;
+  bool isMovZSymbolG3() const {
+    static AArch64MCExpr::VariantKind Variants[] = { AArch64MCExpr::VK_ABS_G3 };
+    return isMovWSymbol(Variants);
+  }
 
-    return is64Bit ? ShiftExtend.Amount <= 63 : ShiftExtend.Amount <= 31;
+  bool isMovZSymbolG2() const {
+    static AArch64MCExpr::VariantKind Variants[] = {
+        AArch64MCExpr::VK_ABS_G2, AArch64MCExpr::VK_ABS_G2_S,
+        AArch64MCExpr::VK_TPREL_G2, AArch64MCExpr::VK_DTPREL_G2};
+    return isMovWSymbol(Variants);
   }
 
-  bool isMOVN32Imm() const {
-    static const AArch64MCExpr::VariantKind PermittedModifiers[] = {
-      AArch64MCExpr::VK_AARCH64_SABS_G0,
-      AArch64MCExpr::VK_AARCH64_SABS_G1,
-      AArch64MCExpr::VK_AARCH64_DTPREL_G1,
-      AArch64MCExpr::VK_AARCH64_DTPREL_G0,
-      AArch64MCExpr::VK_AARCH64_GOTTPREL_G1,
-      AArch64MCExpr::VK_AARCH64_TPREL_G1,
-      AArch64MCExpr::VK_AARCH64_TPREL_G0,
+  bool isMovZSymbolG1() const {
+    static AArch64MCExpr::VariantKind Variants[] = {
+        AArch64MCExpr::VK_ABS_G1,      AArch64MCExpr::VK_ABS_G1_S,
+        AArch64MCExpr::VK_GOTTPREL_G1, AArch64MCExpr::VK_TPREL_G1,
+        AArch64MCExpr::VK_DTPREL_G1,
     };
-    const unsigned NumModifiers = llvm::array_lengthof(PermittedModifiers);
-
-    return isMoveWideImm(32, PermittedModifiers, NumModifiers);
-  }
-
-  bool isMOVN64Imm() const {
-    static const AArch64MCExpr::VariantKind PermittedModifiers[] = {
-      AArch64MCExpr::VK_AARCH64_SABS_G0,
-      AArch64MCExpr::VK_AARCH64_SABS_G1,
-      AArch64MCExpr::VK_AARCH64_SABS_G2,
-      AArch64MCExpr::VK_AARCH64_DTPREL_G2,
-      AArch64MCExpr::VK_AARCH64_DTPREL_G1,
-      AArch64MCExpr::VK_AARCH64_DTPREL_G0,
-      AArch64MCExpr::VK_AARCH64_GOTTPREL_G1,
-      AArch64MCExpr::VK_AARCH64_TPREL_G2,
-      AArch64MCExpr::VK_AARCH64_TPREL_G1,
-      AArch64MCExpr::VK_AARCH64_TPREL_G0,
-    };
-    const unsigned NumModifiers = llvm::array_lengthof(PermittedModifiers);
-
-    return isMoveWideImm(64, PermittedModifiers, NumModifiers);
+    return isMovWSymbol(Variants);
   }
 
+  bool isMovZSymbolG0() const {
+    static AArch64MCExpr::VariantKind Variants[] = {
+        AArch64MCExpr::VK_ABS_G0, AArch64MCExpr::VK_ABS_G0_S,
+        AArch64MCExpr::VK_TPREL_G0, AArch64MCExpr::VK_DTPREL_G0};
+    return isMovWSymbol(Variants);
+  }
 
-  bool isMOVZ32Imm() const {
-    static const AArch64MCExpr::VariantKind PermittedModifiers[] = {
-      AArch64MCExpr::VK_AARCH64_ABS_G0,
-      AArch64MCExpr::VK_AARCH64_ABS_G1,
-      AArch64MCExpr::VK_AARCH64_SABS_G0,
-      AArch64MCExpr::VK_AARCH64_SABS_G1,
-      AArch64MCExpr::VK_AARCH64_DTPREL_G1,
-      AArch64MCExpr::VK_AARCH64_DTPREL_G0,
-      AArch64MCExpr::VK_AARCH64_GOTTPREL_G1,
-      AArch64MCExpr::VK_AARCH64_TPREL_G1,
-      AArch64MCExpr::VK_AARCH64_TPREL_G0,
-    };
-    const unsigned NumModifiers = llvm::array_lengthof(PermittedModifiers);
-
-    return isMoveWideImm(32, PermittedModifiers, NumModifiers);
-  }
-
-  bool isMOVZ64Imm() const {
-    static const AArch64MCExpr::VariantKind PermittedModifiers[] = {
-      AArch64MCExpr::VK_AARCH64_ABS_G0,
-      AArch64MCExpr::VK_AARCH64_ABS_G1,
-      AArch64MCExpr::VK_AARCH64_ABS_G2,
-      AArch64MCExpr::VK_AARCH64_ABS_G3,
-      AArch64MCExpr::VK_AARCH64_SABS_G0,
-      AArch64MCExpr::VK_AARCH64_SABS_G1,
-      AArch64MCExpr::VK_AARCH64_SABS_G2,
-      AArch64MCExpr::VK_AARCH64_DTPREL_G2,
-      AArch64MCExpr::VK_AARCH64_DTPREL_G1,
-      AArch64MCExpr::VK_AARCH64_DTPREL_G0,
-      AArch64MCExpr::VK_AARCH64_GOTTPREL_G1,
-      AArch64MCExpr::VK_AARCH64_TPREL_G2,
-      AArch64MCExpr::VK_AARCH64_TPREL_G1,
-      AArch64MCExpr::VK_AARCH64_TPREL_G0,
-    };
-    const unsigned NumModifiers = llvm::array_lengthof(PermittedModifiers);
+  bool isMovKSymbolG3() const {
+    static AArch64MCExpr::VariantKind Variants[] = { AArch64MCExpr::VK_ABS_G3 };
+    return isMovWSymbol(Variants);
+  }
 
-    return isMoveWideImm(64, PermittedModifiers, NumModifiers);
+  bool isMovKSymbolG2() const {
+    static AArch64MCExpr::VariantKind Variants[] = {
+        AArch64MCExpr::VK_ABS_G2_NC};
+    return isMovWSymbol(Variants);
   }
 
-  bool isMOVK32Imm() const {
-    static const AArch64MCExpr::VariantKind PermittedModifiers[] = {
-      AArch64MCExpr::VK_AARCH64_ABS_G0_NC,
-      AArch64MCExpr::VK_AARCH64_ABS_G1_NC,
-      AArch64MCExpr::VK_AARCH64_DTPREL_G1_NC,
-      AArch64MCExpr::VK_AARCH64_DTPREL_G0_NC,
-      AArch64MCExpr::VK_AARCH64_GOTTPREL_G0_NC,
-      AArch64MCExpr::VK_AARCH64_TPREL_G1_NC,
-      AArch64MCExpr::VK_AARCH64_TPREL_G0_NC,
-    };
-    const unsigned NumModifiers = llvm::array_lengthof(PermittedModifiers);
-
-    return isMoveWideImm(32, PermittedModifiers, NumModifiers);
-  }
-
-  bool isMOVK64Imm() const {
-    static const AArch64MCExpr::VariantKind PermittedModifiers[] = {
-      AArch64MCExpr::VK_AARCH64_ABS_G0_NC,
-      AArch64MCExpr::VK_AARCH64_ABS_G1_NC,
-      AArch64MCExpr::VK_AARCH64_ABS_G2_NC,
-      AArch64MCExpr::VK_AARCH64_ABS_G3,
-      AArch64MCExpr::VK_AARCH64_DTPREL_G1_NC,
-      AArch64MCExpr::VK_AARCH64_DTPREL_G0_NC,
-      AArch64MCExpr::VK_AARCH64_GOTTPREL_G0_NC,
-      AArch64MCExpr::VK_AARCH64_TPREL_G1_NC,
-      AArch64MCExpr::VK_AARCH64_TPREL_G0_NC,
+  bool isMovKSymbolG1() const {
+    static AArch64MCExpr::VariantKind Variants[] = {
+      AArch64MCExpr::VK_ABS_G1_NC, AArch64MCExpr::VK_TPREL_G1_NC,
+      AArch64MCExpr::VK_DTPREL_G1_NC
     };
-    const unsigned NumModifiers = llvm::array_lengthof(PermittedModifiers);
-
-    return isMoveWideImm(64, PermittedModifiers, NumModifiers);
+    return isMovWSymbol(Variants);
   }
 
-  bool isMoveWideImm(unsigned RegWidth,
-                     const AArch64MCExpr::VariantKind *PermittedModifiers,
-                     unsigned NumModifiers) const {
-    if (!isImmWithLSL()) return false;
+  bool isMovKSymbolG0() const {
+    static AArch64MCExpr::VariantKind Variants[] = {
+      AArch64MCExpr::VK_ABS_G0_NC,   AArch64MCExpr::VK_GOTTPREL_G0_NC,
+      AArch64MCExpr::VK_TPREL_G0_NC, AArch64MCExpr::VK_DTPREL_G0_NC
+    };
+    return isMovWSymbol(Variants);
+  }
 
-    if (ImmWithLSL.ShiftAmount % 16 != 0) return false;
-    if (ImmWithLSL.ShiftAmount >= RegWidth) return false;
+  template<int RegWidth, int Shift>
+  bool isMOVZMovAlias() const {
+    if (!isImm()) return false;
 
-    AArch64MCExpr::VariantKind Modifier;
-    if (isNonConstantExpr(ImmWithLSL.Val, Modifier)) {
-      // E.g. "#:abs_g0:sym, lsl #16" makes no sense.
-      if (!ImmWithLSL.ImplicitAmount) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    uint64_t Value = CE->getValue();
 
-      for (unsigned i = 0; i < NumModifiers; ++i)
-        if (PermittedModifiers[i] == Modifier) return true;
+    if (RegWidth == 32)
+      Value &= 0xffffffffULL;
 
+    // "lsl #0" takes precedence: in practice this only affects "#0, lsl #0".
+    if (Value == 0 && Shift != 0)
       return false;
-    }
 
-    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(ImmWithLSL.Val);
-    return CE && CE->getValue() >= 0  && CE->getValue() <= 0xffff;
+    return (Value & ~(0xffffULL << Shift)) == 0;
   }
 
-  template<int RegWidth, bool (*isValidImm)(int, uint64_t, int&, int&)>
-  bool isMoveWideMovAlias() const {
+  template<int RegWidth, int Shift>
+  bool isMOVNMovAlias() const {
     if (!isImm()) return false;
 
     const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
     if (!CE) return false;
-
-    int UImm16, Shift;
     uint64_t Value = CE->getValue();
 
-    // If this is a 32-bit instruction then all bits above 32 should be the
-    // same: either of these is fine because signed/unsigned values should be
-    // permitted.
-    if (RegWidth == 32) {
-      if ((Value >> 32) != 0 && (Value >> 32) != 0xffffffff)
+    // MOVZ takes precedence over MOVN.
+    for (int MOVZShift = 0; MOVZShift <= 48; MOVZShift += 16)
+      if ((Value & ~(0xffffULL << MOVZShift)) == 0)
         return false;
 
+    Value = ~Value;
+    if (RegWidth == 32)
       Value &= 0xffffffffULL;
-    }
 
-    return isValidImm(RegWidth, Value, UImm16, Shift);
+    return (Value & ~(0xffffULL << Shift)) == 0;
   }
 
-  bool isMSRWithReg() const {
+  bool isFPImm() const { return Kind == k_FPImm; }
+  bool isBarrier() const { return Kind == k_Barrier; }
+  bool isSysReg() const { return Kind == k_SysReg; }
+  bool isMRSSystemRegister() const {
     if (!isSysReg()) return false;
 
     bool IsKnownRegister;
-    StringRef Name(SysReg.Data, SysReg.Length);
-    A64SysReg::MSRMapper().fromString(Name, IsKnownRegister);
+    auto Mapper = AArch64SysReg::MRSMapper(getSysRegFeatureBits());
+    Mapper.fromString(getSysReg(), IsKnownRegister);
 
     return IsKnownRegister;
   }
-
-  bool isMSRPState() const {
+  bool isMSRSystemRegister() const {
     if (!isSysReg()) return false;
 
     bool IsKnownRegister;
-    StringRef Name(SysReg.Data, SysReg.Length);
-    A64PState::PStateMapper().fromString(Name, IsKnownRegister);
+    auto Mapper = AArch64SysReg::MSRMapper(getSysRegFeatureBits());
+    Mapper.fromString(getSysReg(), IsKnownRegister);
 
     return IsKnownRegister;
   }
-
-  bool isMRS() const {
+  bool isSystemPStateField() const {
     if (!isSysReg()) return false;
 
-    // First check against specific MSR-only (write-only) registers
     bool IsKnownRegister;
-    StringRef Name(SysReg.Data, SysReg.Length);
-    A64SysReg::MRSMapper().fromString(Name, IsKnownRegister);
+    AArch64PState::PStateMapper().fromString(getSysReg(), IsKnownRegister);
 
     return IsKnownRegister;
   }
+  bool isReg() const override { return Kind == k_Register && !Reg.isVector; }
+  bool isVectorReg() const { return Kind == k_Register && Reg.isVector; }
+  bool isVectorRegLo() const {
+    return Kind == k_Register && Reg.isVector &&
+           AArch64MCRegisterClasses[AArch64::FPR128_loRegClassID].contains(
+               Reg.RegNum);
+  }
+  bool isGPR32as64() const {
+    return Kind == k_Register && !Reg.isVector &&
+      AArch64MCRegisterClasses[AArch64::GPR64RegClassID].contains(Reg.RegNum);
+  }
 
-  bool isPRFM() const {
-    if (!isImm()) return false;
+  bool isGPR64sp0() const {
+    return Kind == k_Register && !Reg.isVector &&
+      AArch64MCRegisterClasses[AArch64::GPR64spRegClassID].contains(Reg.RegNum);
+  }
 
-    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+  /// Is this a vector list with the type implicit (presumably attached to the
+  /// instruction itself)?
+  template <unsigned NumRegs> bool isImplicitlyTypedVectorList() const {
+    return Kind == k_VectorList && VectorList.Count == NumRegs &&
+           !VectorList.ElementKind;
+  }
 
-    if (!CE)
+  template <unsigned NumRegs, unsigned NumElements, char ElementKind>
+  bool isTypedVectorList() const {
+    if (Kind != k_VectorList)
       return false;
-
-    return CE->getValue() >= 0 && CE->getValue() <= 31;
+    if (VectorList.Count != NumRegs)
+      return false;
+    if (VectorList.ElementKind != ElementKind)
+      return false;
+    return VectorList.NumElements == NumElements;
   }
 
-  template<A64SE::ShiftExtSpecifiers SHKind> bool isRegExtend() const {
-    if (!isShiftOrExtend()) return false;
-
-    if (ShiftExtend.ShiftType != SHKind)
+  bool isVectorIndex1() const {
+    return Kind == k_VectorIndex && VectorIndex.Val == 1;
+  }
+  bool isVectorIndexB() const {
+    return Kind == k_VectorIndex && VectorIndex.Val < 16;
+  }
+  bool isVectorIndexH() const {
+    return Kind == k_VectorIndex && VectorIndex.Val < 8;
+  }
+  bool isVectorIndexS() const {
+    return Kind == k_VectorIndex && VectorIndex.Val < 4;
+  }
+  bool isVectorIndexD() const {
+    return Kind == k_VectorIndex && VectorIndex.Val < 2;
+  }
+  bool isToken() const override { return Kind == k_Token; }
+  bool isTokenEqual(StringRef Str) const {
+    return Kind == k_Token && getToken() == Str;
+  }
+  bool isSysCR() const { return Kind == k_SysCR; }
+  bool isPrefetch() const { return Kind == k_Prefetch; }
+  bool isShiftExtend() const { return Kind == k_ShiftExtend; }
+  bool isShifter() const {
+    if (!isShiftExtend())
       return false;
 
-    return ShiftExtend.Amount <= 4;
+    AArch64_AM::ShiftExtendType ST = getShiftExtendType();
+    return (ST == AArch64_AM::LSL || ST == AArch64_AM::LSR ||
+            ST == AArch64_AM::ASR || ST == AArch64_AM::ROR ||
+            ST == AArch64_AM::MSL);
   }
-
-  bool isRegExtendLSL() const {
-    if (!isShiftOrExtend()) return false;
-
-    if (ShiftExtend.ShiftType != A64SE::LSL)
+  bool isExtend() const {
+    if (!isShiftExtend())
       return false;
 
-    return !ShiftExtend.ImplicitAmount && ShiftExtend.Amount <= 4;
+    AArch64_AM::ShiftExtendType ET = getShiftExtendType();
+    return (ET == AArch64_AM::UXTB || ET == AArch64_AM::SXTB ||
+            ET == AArch64_AM::UXTH || ET == AArch64_AM::SXTH ||
+            ET == AArch64_AM::UXTW || ET == AArch64_AM::SXTW ||
+            ET == AArch64_AM::UXTX || ET == AArch64_AM::SXTX ||
+            ET == AArch64_AM::LSL) &&
+           getShiftExtendAmount() <= 4;
   }
 
-  // if 0 < value <= w, return true
-  bool isShrFixedWidth(int w) const {
-    if (!isImm())
+  bool isExtend64() const {
+    if (!isExtend())
       return false;
-    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
-    if (!CE)
+    // UXTX and SXTX require a 64-bit source register (the ExtendLSL64 class).
+    AArch64_AM::ShiftExtendType ET = getShiftExtendType();
+    return ET != AArch64_AM::UXTX && ET != AArch64_AM::SXTX;
+  }
+  bool isExtendLSL64() const {
+    if (!isExtend())
       return false;
-    int64_t Value = CE->getValue();
-    return Value > 0 && Value <= w;
+    AArch64_AM::ShiftExtendType ET = getShiftExtendType();
+    return (ET == AArch64_AM::UXTX || ET == AArch64_AM::SXTX ||
+            ET == AArch64_AM::LSL) &&
+           getShiftExtendAmount() <= 4;
   }
 
-  bool isShrImm8() const { return isShrFixedWidth(8); }
-
-  bool isShrImm16() const { return isShrFixedWidth(16); }
-
-  bool isShrImm32() const { return isShrFixedWidth(32); }
-
-  bool isShrImm64() const { return isShrFixedWidth(64); }
-
-  // if 0 <= value < w, return true
-  bool isShlFixedWidth(int w) const {
-    if (!isImm())
-      return false;
-    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
-    if (!CE)
+  template<int Width> bool isMemXExtend() const {
+    if (!isExtend())
       return false;
-    int64_t Value = CE->getValue();
-    return Value >= 0 && Value < w;
+    AArch64_AM::ShiftExtendType ET = getShiftExtendType();
+    return (ET == AArch64_AM::LSL || ET == AArch64_AM::SXTX) &&
+           (getShiftExtendAmount() == Log2_32(Width / 8) ||
+            getShiftExtendAmount() == 0);
   }
 
-  bool isShlImm8() const { return isShlFixedWidth(8); }
-
-  bool isShlImm16() const { return isShlFixedWidth(16); }
-
-  bool isShlImm32() const { return isShlFixedWidth(32); }
-
-  bool isShlImm64() const { return isShlFixedWidth(64); }
-
-  bool isNeonMovImmShiftLSL() const {
-    if (!isShiftOrExtend())
+  template<int Width> bool isMemWExtend() const {
+    if (!isExtend())
       return false;
+    AArch64_AM::ShiftExtendType ET = getShiftExtendType();
+    return (ET == AArch64_AM::UXTW || ET == AArch64_AM::SXTW) &&
+           (getShiftExtendAmount() == Log2_32(Width / 8) ||
+            getShiftExtendAmount() == 0);
+  }
 
-    if (ShiftExtend.ShiftType != A64SE::LSL)
+  template <unsigned width>
+  bool isArithmeticShifter() const {
+    if (!isShifter())
       return false;
 
-    // Valid shift amount is 0, 8, 16 and 24.
-    return ShiftExtend.Amount % 8 == 0 && ShiftExtend.Amount <= 24;
+    // An arithmetic shifter is LSL, LSR, or ASR.
+    AArch64_AM::ShiftExtendType ST = getShiftExtendType();
+    return (ST == AArch64_AM::LSL || ST == AArch64_AM::LSR ||
+            ST == AArch64_AM::ASR) && getShiftExtendAmount() < width;
   }
 
-  bool isNeonMovImmShiftLSLH() const {
-    if (!isShiftOrExtend())
+  template <unsigned width>
+  bool isLogicalShifter() const {
+    if (!isShifter())
       return false;
 
-    if (ShiftExtend.ShiftType != A64SE::LSL)
+    // A logical shifter is LSL, LSR, ASR or ROR.
+    AArch64_AM::ShiftExtendType ST = getShiftExtendType();
+    return (ST == AArch64_AM::LSL || ST == AArch64_AM::LSR ||
+            ST == AArch64_AM::ASR || ST == AArch64_AM::ROR) &&
+           getShiftExtendAmount() < width;
+  }
+
+  bool isMovImm32Shifter() const {
+    if (!isShifter())
       return false;
 
-    // Valid shift amount is 0 and 8.
-    return ShiftExtend.Amount == 0 || ShiftExtend.Amount == 8;
+    // A MOVi shifter is LSL of 0, 16, 32, or 48.
+    AArch64_AM::ShiftExtendType ST = getShiftExtendType();
+    if (ST != AArch64_AM::LSL)
+      return false;
+    uint64_t Val = getShiftExtendAmount();
+    return (Val == 0 || Val == 16);
   }
 
-  bool isNeonMovImmShiftMSL() const {
-    if (!isShiftOrExtend())
+  bool isMovImm64Shifter() const {
+    if (!isShifter())
       return false;
 
-    if (ShiftExtend.ShiftType != A64SE::MSL)
+    // A MOVi shifter is LSL of 0 or 16.
+    AArch64_AM::ShiftExtendType ST = getShiftExtendType();
+    if (ST != AArch64_AM::LSL)
       return false;
-
-    // Valid shift amount is 8 and 16.
-    return ShiftExtend.Amount == 8 || ShiftExtend.Amount == 16;
+    uint64_t Val = getShiftExtendAmount();
+    return (Val == 0 || Val == 16 || Val == 32 || Val == 48);
   }
 
-  template <A64Layout::VectorLayout Layout, unsigned Count>
-  bool isVectorList() const {
-    return Kind == k_VectorList && VectorList.Layout == Layout &&
-           VectorList.Count == Count;
+  bool isLogicalVecShifter() const {
+    if (!isShifter())
+      return false;
+
+    // A logical vector shifter is a left shift by 0, 8, 16, or 24.
+    unsigned Shift = getShiftExtendAmount();
+    return getShiftExtendType() == AArch64_AM::LSL &&
+           (Shift == 0 || Shift == 8 || Shift == 16 || Shift == 24);
   }
 
-  template <int MemSize> bool isSImm7Scaled() const {
-    if (!isImm())
+  bool isLogicalVecHalfWordShifter() const {
+    if (!isLogicalVecShifter())
       return false;
 
-    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
-    if (!CE) return false;
+    // A logical vector shifter is a left shift by 0 or 8.
+    unsigned Shift = getShiftExtendAmount();
+    return getShiftExtendType() == AArch64_AM::LSL &&
+           (Shift == 0 || Shift == 8);
+  }
 
-    int64_t Val = CE->getValue();
-    if (Val % MemSize != 0) return false;
+  bool isMoveVecShifter() const {
+    if (!isShiftExtend())
+      return false;
 
-    Val /= MemSize;
+    // A logical vector shifter is a left shift by 8 or 16.
+    unsigned Shift = getShiftExtendAmount();
+    return getShiftExtendType() == AArch64_AM::MSL &&
+           (Shift == 8 || Shift == 16);
+  }
 
-    return Val >= -64 && Val < 64;
+  // Fallback unscaled operands are for aliases of LDR/STR that fall back
+  // to LDUR/STUR when the offset is not legal for the former but is for
+  // the latter. As such, in addition to checking for being a legal unscaled
+  // address, also check that it is not a legal scaled address. This avoids
+  // ambiguity in the matcher.
+  template<int Width>
+  bool isSImm9OffsetFB() const {
+    return isSImm9() && !isUImm12Offset<Width / 8>();
   }
 
-  template<int BitWidth>
-  bool isSImm() const {
-    if (!isImm()) return false;
+  bool isAdrpLabel() const {
+    // Validation was handled during parsing, so we just sanity check that
+    // something didn't go haywire.
+    if (!isImm())
+        return false;
 
-    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
-    if (!CE) return false;
+    if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Imm.Val)) {
+      int64_t Val = CE->getValue();
+      int64_t Min = - (4096 * (1LL << (21 - 1)));
+      int64_t Max = 4096 * ((1LL << (21 - 1)) - 1);
+      return (Val % 4096) == 0 && Val >= Min && Val <= Max;
+    }
 
-    return CE->getValue() >= -(1LL << (BitWidth - 1))
-      && CE->getValue() < (1LL << (BitWidth - 1));
+    return true;
   }
 
-  template<int bitWidth>
-  bool isUImm() const {
-    if (!isImm()) return false;
+  bool isAdrLabel() const {
+    // Validation was handled during parsing, so we just sanity check that
+    // something didn't go haywire.
+    if (!isImm())
+        return false;
 
-    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
-    if (!CE) return false;
+    if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Imm.Val)) {
+      int64_t Val = CE->getValue();
+      int64_t Min = - (1LL << (21 - 1));
+      int64_t Max = ((1LL << (21 - 1)) - 1);
+      return Val >= Min && Val <= Max;
+    }
 
-    return CE->getValue() >= 0 && CE->getValue() < (1LL << bitWidth);
+    return true;
   }
 
-  bool isUImm() const {
-    if (!isImm()) return false;
+  void addExpr(MCInst &Inst, const MCExpr *Expr) const {
+    // Add as immediates when possible.  Null MCExpr = 0.
+    if (!Expr)
+      Inst.addOperand(MCOperand::CreateImm(0));
+    else if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr))
+      Inst.addOperand(MCOperand::CreateImm(CE->getValue()));
+    else
+      Inst.addOperand(MCOperand::CreateExpr(Expr));
+  }
 
-    return isa<MCConstantExpr>(getImm());
+  void addRegOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateReg(getReg()));
   }
 
-  bool isNeonUImm64Mask() const {
-    if (!isImm())
-      return false;
+  void addGPR32as64Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    assert(
+        AArch64MCRegisterClasses[AArch64::GPR64RegClassID].contains(getReg()));
 
-    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
-    if (!CE)
-      return false;
+    const MCRegisterInfo *RI = Ctx.getRegisterInfo();
+    uint32_t Reg = RI->getRegClass(AArch64::GPR32RegClassID).getRegister(
+        RI->getEncodingValue(getReg()));
 
-    uint64_t Value = CE->getValue();
+    Inst.addOperand(MCOperand::CreateReg(Reg));
+  }
 
-    // i64 value with each byte being either 0x00 or 0xff.
-    for (unsigned i = 0; i < 8; ++i, Value >>= 8)
-      if ((Value & 0xff) != 0 && (Value & 0xff) != 0xff)
-        return false;
-    return true;
+  void addVectorReg64Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    assert(
+        AArch64MCRegisterClasses[AArch64::FPR128RegClassID].contains(getReg()));
+    Inst.addOperand(MCOperand::CreateReg(AArch64::D0 + getReg() - AArch64::Q0));
   }
 
-  // if value == N, return true
-  template<int N>
-  bool isExactImm() const {
-    if (!isImm()) return false;
+  void addVectorReg128Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    assert(
+        AArch64MCRegisterClasses[AArch64::FPR128RegClassID].contains(getReg()));
+    Inst.addOperand(MCOperand::CreateReg(getReg()));
+  }
 
-    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
-    if (!CE) return false;
+  void addVectorRegLoOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateReg(getReg()));
+  }
 
-    return CE->getValue() == N;
+  template <unsigned NumRegs>
+  void addVectorList64Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    static unsigned FirstRegs[] = { AArch64::D0,       AArch64::D0_D1,
+                                    AArch64::D0_D1_D2, AArch64::D0_D1_D2_D3 };
+    unsigned FirstReg = FirstRegs[NumRegs - 1];
+
+    Inst.addOperand(
+        MCOperand::CreateReg(FirstReg + getVectorListStart() - AArch64::Q0));
   }
 
-  bool isFPZeroIZero() const {
-    return isFPZero();
+  template <unsigned NumRegs>
+  void addVectorList128Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    static unsigned FirstRegs[] = { AArch64::Q0,       AArch64::Q0_Q1,
+                                    AArch64::Q0_Q1_Q2, AArch64::Q0_Q1_Q2_Q3 };
+    unsigned FirstReg = FirstRegs[NumRegs - 1];
+
+    Inst.addOperand(
+        MCOperand::CreateReg(FirstReg + getVectorListStart() - AArch64::Q0));
   }
 
-  static AArch64Operand *CreateImmWithLSL(const MCExpr *Val,
-                                          unsigned ShiftAmount,
-                                          bool ImplicitAmount,
-										  SMLoc S,SMLoc E) {
-    AArch64Operand *Op = new AArch64Operand(k_ImmWithLSL, S, E);
-    Op->ImmWithLSL.Val = Val;
-    Op->ImmWithLSL.ShiftAmount = ShiftAmount;
-    Op->ImmWithLSL.ImplicitAmount = ImplicitAmount;
-    return Op;
+  void addVectorIndex1Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateImm(getVectorIndex()));
   }
 
-  static AArch64Operand *CreateCondCode(A64CC::CondCodes Code,
-                                        SMLoc S, SMLoc E) {
-    AArch64Operand *Op = new AArch64Operand(k_CondCode, S, E);
-    Op->CondCode.Code = Code;
-    return Op;
+  void addVectorIndexBOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateImm(getVectorIndex()));
   }
 
-  static AArch64Operand *CreateFPImm(double Val,
-                                     SMLoc S, SMLoc E) {
-    AArch64Operand *Op = new AArch64Operand(k_FPImmediate, S, E);
-    Op->FPImm.Val = Val;
-    return Op;
+  void addVectorIndexHOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateImm(getVectorIndex()));
   }
 
-  static AArch64Operand *CreateImm(const MCExpr *Val, SMLoc S, SMLoc E) {
-    AArch64Operand *Op = new AArch64Operand(k_Immediate, S, E);
-    Op->Imm.Val = Val;
-    return Op;
+  void addVectorIndexSOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateImm(getVectorIndex()));
   }
 
-  static AArch64Operand *CreateReg(unsigned RegNum, SMLoc S, SMLoc E) {
-    AArch64Operand *Op = new AArch64Operand(k_Register, S, E);
-    Op->Reg.RegNum = RegNum;
-    return Op;
+  void addVectorIndexDOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateImm(getVectorIndex()));
   }
 
-  static AArch64Operand *CreateWrappedReg(unsigned RegNum, SMLoc S, SMLoc E) {
-    AArch64Operand *Op = new AArch64Operand(k_WrappedRegister, S, E);
-    Op->Reg.RegNum = RegNum;
-    return Op;
+  void addImmOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    // If this is a pageoff symrefexpr with an addend, adjust the addend
+    // to be only the page-offset portion. Otherwise, just add the expr
+    // as-is.
+    addExpr(Inst, getImm());
   }
 
-  static AArch64Operand *CreateShiftExtend(A64SE::ShiftExtSpecifiers ShiftTyp,
-                                           unsigned Amount,
-                                           bool ImplicitAmount,
-                                           SMLoc S, SMLoc E) {
-    AArch64Operand *Op = new AArch64Operand(k_ShiftExtend, S, E);
-    Op->ShiftExtend.ShiftType = ShiftTyp;
-    Op->ShiftExtend.Amount = Amount;
-    Op->ShiftExtend.ImplicitAmount = ImplicitAmount;
-    return Op;
+  void addAddSubImmOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+    if (isShiftedImm()) {
+      addExpr(Inst, getShiftedImmVal());
+      Inst.addOperand(MCOperand::CreateImm(getShiftedImmShift()));
+    } else {
+      addExpr(Inst, getImm());
+      Inst.addOperand(MCOperand::CreateImm(0));
+    }
   }
 
-  static AArch64Operand *CreateSysReg(StringRef Str, SMLoc S) {
-    AArch64Operand *Op = new AArch64Operand(k_SysReg, S, S);
-    Op->Tok.Data = Str.data();
-    Op->Tok.Length = Str.size();
-    return Op;
+  void addCondCodeOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateImm(getCondCode()));
   }
 
-  static AArch64Operand *CreateVectorList(unsigned RegNum, unsigned Count,
-                                          A64Layout::VectorLayout Layout,
-                                          SMLoc S, SMLoc E) {
-    AArch64Operand *Op = new AArch64Operand(k_VectorList, S, E);
-    Op->VectorList.RegNum = RegNum;
-    Op->VectorList.Count = Count;
-    Op->VectorList.Layout = Layout;
-    Op->StartLoc = S;
-    Op->EndLoc = E;
-    return Op;
+  void addAdrpLabelOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    if (!MCE)
+      addExpr(Inst, getImm());
+    else
+      Inst.addOperand(MCOperand::CreateImm(MCE->getValue() >> 12));
   }
 
-  static AArch64Operand *CreateToken(StringRef Str, SMLoc S) {
-    AArch64Operand *Op = new AArch64Operand(k_Token, S, S);
-    Op->Tok.Data = Str.data();
-    Op->Tok.Length = Str.size();
-    return Op;
+  void addAdrLabelOperands(MCInst &Inst, unsigned N) const {
+    addImmOperands(Inst, N);
   }
 
+  template<int Scale>
+  void addUImm12OffsetOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
 
-  void addExpr(MCInst &Inst, const MCExpr *Expr) const {
-    // Add as immediates when possible.
-    if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr))
-      Inst.addOperand(MCOperand::CreateImm(CE->getValue()));
-    else
-      Inst.addOperand(MCOperand::CreateExpr(Expr));
+    if (!MCE) {
+      Inst.addOperand(MCOperand::CreateExpr(getImm()));
+      return;
+    }
+    Inst.addOperand(MCOperand::CreateImm(MCE->getValue() / Scale));
   }
 
-  template<unsigned RegWidth>
-  void addBFILSBOperands(MCInst &Inst, unsigned N) const {
+  void addSImm9Operands(MCInst &Inst, unsigned N) const {
     assert(N == 1 && "Invalid number of operands!");
-    const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
-    unsigned EncodedVal = (RegWidth - CE->getValue()) % RegWidth;
-    Inst.addOperand(MCOperand::CreateImm(EncodedVal));
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    assert(MCE && "Invalid constant immediate operand!");
+    Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
   }
 
-  void addBFIWidthOperands(MCInst &Inst, unsigned N) const {
+  void addSImm7s4Operands(MCInst &Inst, unsigned N) const {
     assert(N == 1 && "Invalid number of operands!");
-    const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
-    Inst.addOperand(MCOperand::CreateImm(CE->getValue() - 1));
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    assert(MCE && "Invalid constant immediate operand!");
+    Inst.addOperand(MCOperand::CreateImm(MCE->getValue() / 4));
   }
 
-  void addBFXWidthOperands(MCInst &Inst, unsigned N) const {
+  void addSImm7s8Operands(MCInst &Inst, unsigned N) const {
     assert(N == 1 && "Invalid number of operands!");
-
-    uint64_t LSB = Inst.getOperand(Inst.getNumOperands()-1).getImm();
-    const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
-
-    Inst.addOperand(MCOperand::CreateImm(LSB + CE->getValue() - 1));
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    assert(MCE && "Invalid constant immediate operand!");
+    Inst.addOperand(MCOperand::CreateImm(MCE->getValue() / 8));
   }
 
-  void addCondCodeOperands(MCInst &Inst, unsigned N) const {
+  void addSImm7s16Operands(MCInst &Inst, unsigned N) const {
     assert(N == 1 && "Invalid number of operands!");
-    Inst.addOperand(MCOperand::CreateImm(getCondCode()));
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    assert(MCE && "Invalid constant immediate operand!");
+    Inst.addOperand(MCOperand::CreateImm(MCE->getValue() / 16));
   }
 
-  void addCVTFixedPosOperands(MCInst &Inst, unsigned N) const {
+  void addImm0_7Operands(MCInst &Inst, unsigned N) const {
     assert(N == 1 && "Invalid number of operands!");
-
-    const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
-    Inst.addOperand(MCOperand::CreateImm(64 - CE->getValue()));
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    assert(MCE && "Invalid constant immediate operand!");
+    Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
   }
 
-  void addFMOVImmOperands(MCInst &Inst, unsigned N) const {
+  void addImm1_8Operands(MCInst &Inst, unsigned N) const {
     assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    assert(MCE && "Invalid constant immediate operand!");
+    Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
+  }
 
-    APFloat RealVal(FPImm.Val);
-    uint32_t ImmVal;
-    A64Imms::isFPImm(RealVal, ImmVal);
+  void addImm0_15Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    assert(MCE && "Invalid constant immediate operand!");
+    Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
+  }
 
-    Inst.addOperand(MCOperand::CreateImm(ImmVal));
+  void addImm1_16Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    assert(MCE && "Invalid constant immediate operand!");
+    Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
   }
 
-  void addFPZeroOperands(MCInst &Inst, unsigned N) const {
-    assert(N == 1 && "Invalid number of operands");
-    Inst.addOperand(MCOperand::CreateImm(0));
+  void addImm0_31Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    assert(MCE && "Invalid constant immediate operand!");
+    Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
   }
 
-  void addFPZeroIZeroOperands(MCInst &Inst, unsigned N) const {
-    addFPZeroOperands(Inst, N);
+  void addImm1_31Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    assert(MCE && "Invalid constant immediate operand!");
+    Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
   }
 
-  void addInvCondCodeOperands(MCInst &Inst, unsigned N) const {
+  void addImm1_32Operands(MCInst &Inst, unsigned N) const {
     assert(N == 1 && "Invalid number of operands!");
-    unsigned Encoded = A64InvertCondCode(getCondCode());
-    Inst.addOperand(MCOperand::CreateImm(Encoded));
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    assert(MCE && "Invalid constant immediate operand!");
+    Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
   }
 
-  void addRegOperands(MCInst &Inst, unsigned N) const {
+  void addImm0_63Operands(MCInst &Inst, unsigned N) const {
     assert(N == 1 && "Invalid number of operands!");
-    Inst.addOperand(MCOperand::CreateReg(getReg()));
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    assert(MCE && "Invalid constant immediate operand!");
+    Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
   }
 
-  void addImmOperands(MCInst &Inst, unsigned N) const {
+  void addImm1_63Operands(MCInst &Inst, unsigned N) const {
     assert(N == 1 && "Invalid number of operands!");
-    addExpr(Inst, getImm());
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    assert(MCE && "Invalid constant immediate operand!");
+    Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
   }
 
-  template<int MemSize>
-  void addSImm7ScaledOperands(MCInst &Inst, unsigned N) const {
+  void addImm1_64Operands(MCInst &Inst, unsigned N) const {
     assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    assert(MCE && "Invalid constant immediate operand!");
+    Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
+  }
 
-    const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
-    uint64_t Val = CE->getValue() / MemSize;
-    Inst.addOperand(MCOperand::CreateImm(Val  & 0x7f));
+  void addImm0_127Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    assert(MCE && "Invalid constant immediate operand!");
+    Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
   }
 
-  template<int BitWidth>
-  void addSImmOperands(MCInst &Inst, unsigned N) const {
+  void addImm0_255Operands(MCInst &Inst, unsigned N) const {
     assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    assert(MCE && "Invalid constant immediate operand!");
+    Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
+  }
 
-    const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
-    uint64_t Val = CE->getValue();
-    Inst.addOperand(MCOperand::CreateImm(Val  & ((1ULL << BitWidth) - 1)));
+  void addImm0_65535Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    assert(MCE && "Invalid constant immediate operand!");
+    Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
   }
 
-  void addImmWithLSLOperands(MCInst &Inst, unsigned N) const {
-    assert (N == 1 && "Invalid number of operands!");
+  void addImm32_63Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    assert(MCE && "Invalid constant immediate operand!");
+    Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
+  }
 
-    addExpr(Inst, ImmWithLSL.Val);
+  void addLogicalImm32Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    assert(MCE && "Invalid logical immediate operand!");
+    uint64_t encoding = AArch64_AM::encodeLogicalImmediate(MCE->getValue(), 32);
+    Inst.addOperand(MCOperand::CreateImm(encoding));
   }
 
-  template<unsigned field_width, unsigned scale>
-  void addLabelOperands(MCInst &Inst, unsigned N) const {
+  void addLogicalImm64Operands(MCInst &Inst, unsigned N) const {
     assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    assert(MCE && "Invalid logical immediate operand!");
+    uint64_t encoding = AArch64_AM::encodeLogicalImmediate(MCE->getValue(), 64);
+    Inst.addOperand(MCOperand::CreateImm(encoding));
+  }
 
-    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Imm.Val);
+  void addSIMDImmType10Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    assert(MCE && "Invalid immediate operand!");
+    uint64_t encoding = AArch64_AM::encodeAdvSIMDModImmType10(MCE->getValue());
+    Inst.addOperand(MCOperand::CreateImm(encoding));
+  }
 
-    if (!CE) {
-      addExpr(Inst, Imm.Val);
+  void addBranchTarget26Operands(MCInst &Inst, unsigned N) const {
+    // Branch operands don't encode the low bits, so shift them off
+    // here. If it's a label, however, just put it on directly as there's
+    // not enough information now to do anything.
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    if (!MCE) {
+      addExpr(Inst, getImm());
       return;
     }
-
-    int64_t Val = CE->getValue();
-    assert(Val % scale == 0 && "Unaligned immediate in instruction");
-    Val /= scale;
-
-    Inst.addOperand(MCOperand::CreateImm(Val & ((1LL << field_width) - 1)));
+    assert(MCE && "Invalid constant immediate operand!");
+    Inst.addOperand(MCOperand::CreateImm(MCE->getValue() >> 2));
   }
 
-  template<int MemSize>
-  void addOffsetUImm12Operands(MCInst &Inst, unsigned N) const {
+  void addPCRelLabel19Operands(MCInst &Inst, unsigned N) const {
+    // Branch operands don't encode the low bits, so shift them off
+    // here. If it's a label, however, just put it on directly as there's
+    // not enough information now to do anything.
     assert(N == 1 && "Invalid number of operands!");
-
-    if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm())) {
-      Inst.addOperand(MCOperand::CreateImm(CE->getValue() / MemSize));
-    } else {
-      Inst.addOperand(MCOperand::CreateExpr(getImm()));
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    if (!MCE) {
+      addExpr(Inst, getImm());
+      return;
     }
+    assert(MCE && "Invalid constant immediate operand!");
+    Inst.addOperand(MCOperand::CreateImm(MCE->getValue() >> 2));
   }
 
-  template<unsigned RegWidth>
-  void addLogicalImmOperands(MCInst &Inst, unsigned N) const {
-    assert(N == 1 && "Invalid number of operands");
-    const MCConstantExpr *CE = cast<MCConstantExpr>(Imm.Val);
+  void addBranchTarget14Operands(MCInst &Inst, unsigned N) const {
+    // Branch operands don't encode the low bits, so shift them off
+    // here. If it's a label, however, just put it on directly as there's
+    // not enough information now to do anything.
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
+    if (!MCE) {
+      addExpr(Inst, getImm());
+      return;
+    }
+    assert(MCE && "Invalid constant immediate operand!");
+    Inst.addOperand(MCOperand::CreateImm(MCE->getValue() >> 2));
+  }
 
-    uint32_t Bits;
-    A64Imms::isLogicalImm(RegWidth, CE->getValue(), Bits);
+  void addFPImmOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateImm(getFPImm()));
+  }
 
-    Inst.addOperand(MCOperand::CreateImm(Bits));
+  void addBarrierOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateImm(getBarrier()));
   }
 
-  void addMRSOperands(MCInst &Inst, unsigned N) const {
+  void addMRSSystemRegisterOperands(MCInst &Inst, unsigned N) const {
     assert(N == 1 && "Invalid number of operands!");
 
     bool Valid;
-    StringRef Name(SysReg.Data, SysReg.Length);
-    uint32_t Bits = A64SysReg::MRSMapper().fromString(Name, Valid);
+    auto Mapper = AArch64SysReg::MRSMapper(getSysRegFeatureBits());
+    uint32_t Bits = Mapper.fromString(getSysReg(), Valid);
 
     Inst.addOperand(MCOperand::CreateImm(Bits));
   }
 
-  void addMSRWithRegOperands(MCInst &Inst, unsigned N) const {
+  void addMSRSystemRegisterOperands(MCInst &Inst, unsigned N) const {
     assert(N == 1 && "Invalid number of operands!");
 
     bool Valid;
-    StringRef Name(SysReg.Data, SysReg.Length);
-    uint32_t Bits = A64SysReg::MSRMapper().fromString(Name, Valid);
+    auto Mapper = AArch64SysReg::MSRMapper(getSysRegFeatureBits());
+    uint32_t Bits = Mapper.fromString(getSysReg(), Valid);
 
     Inst.addOperand(MCOperand::CreateImm(Bits));
   }
 
-  void addMSRPStateOperands(MCInst &Inst, unsigned N) const {
+  void addSystemPStateFieldOperands(MCInst &Inst, unsigned N) const {
     assert(N == 1 && "Invalid number of operands!");
 
     bool Valid;
-    StringRef Name(SysReg.Data, SysReg.Length);
-    uint32_t Bits = A64PState::PStateMapper().fromString(Name, Valid);
+    uint32_t Bits =
+        AArch64PState::PStateMapper().fromString(getSysReg(), Valid);
 
     Inst.addOperand(MCOperand::CreateImm(Bits));
   }
 
-  void addMoveWideImmOperands(MCInst &Inst, unsigned N) const {
-    assert(N == 2 && "Invalid number of operands!");
-
-    addExpr(Inst, ImmWithLSL.Val);
-
-    AArch64MCExpr::VariantKind Variant;
-    if (!isNonConstantExpr(ImmWithLSL.Val, Variant)) {
-      Inst.addOperand(MCOperand::CreateImm(ImmWithLSL.ShiftAmount / 16));
-      return;
-    }
-
-    // We know it's relocated
-    switch (Variant) {
-    case AArch64MCExpr::VK_AARCH64_ABS_G0:
-    case AArch64MCExpr::VK_AARCH64_ABS_G0_NC:
-    case AArch64MCExpr::VK_AARCH64_SABS_G0:
-    case AArch64MCExpr::VK_AARCH64_DTPREL_G0:
-    case AArch64MCExpr::VK_AARCH64_DTPREL_G0_NC:
-    case AArch64MCExpr::VK_AARCH64_GOTTPREL_G0_NC:
-    case AArch64MCExpr::VK_AARCH64_TPREL_G0:
-    case AArch64MCExpr::VK_AARCH64_TPREL_G0_NC:
-      Inst.addOperand(MCOperand::CreateImm(0));
-      break;
-    case AArch64MCExpr::VK_AARCH64_ABS_G1:
-    case AArch64MCExpr::VK_AARCH64_ABS_G1_NC:
-    case AArch64MCExpr::VK_AARCH64_SABS_G1:
-    case AArch64MCExpr::VK_AARCH64_DTPREL_G1:
-    case AArch64MCExpr::VK_AARCH64_DTPREL_G1_NC:
-    case AArch64MCExpr::VK_AARCH64_GOTTPREL_G1:
-    case AArch64MCExpr::VK_AARCH64_TPREL_G1:
-    case AArch64MCExpr::VK_AARCH64_TPREL_G1_NC:
-      Inst.addOperand(MCOperand::CreateImm(1));
-      break;
-    case AArch64MCExpr::VK_AARCH64_ABS_G2:
-    case AArch64MCExpr::VK_AARCH64_ABS_G2_NC:
-    case AArch64MCExpr::VK_AARCH64_SABS_G2:
-    case AArch64MCExpr::VK_AARCH64_DTPREL_G2:
-    case AArch64MCExpr::VK_AARCH64_TPREL_G2:
-      Inst.addOperand(MCOperand::CreateImm(2));
-      break;
-    case AArch64MCExpr::VK_AARCH64_ABS_G3:
-      Inst.addOperand(MCOperand::CreateImm(3));
-      break;
-    default: llvm_unreachable("Inappropriate move wide relocation");
-    }
+  void addSysCROperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateImm(getSysCR()));
   }
 
-  template<int RegWidth, bool isValidImm(int, uint64_t, int&, int&)>
-  void addMoveWideMovAliasOperands(MCInst &Inst, unsigned N) const {
-    assert(N == 2 && "Invalid number of operands!");
-    int UImm16, Shift;
-
-    const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
-    uint64_t Value = CE->getValue();
-
-    if (RegWidth == 32) {
-      Value &= 0xffffffffULL;
-    }
+  void addPrefetchOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateImm(getPrefetch()));
+  }
 
-    bool Valid = isValidImm(RegWidth, Value, UImm16, Shift);
-    (void)Valid;
-    assert(Valid && "Invalid immediates should have been weeded out by now");
+  void addShifterOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    unsigned Imm =
+        AArch64_AM::getShifterImm(getShiftExtendType(), getShiftExtendAmount());
+    Inst.addOperand(MCOperand::CreateImm(Imm));
+  }
 
-    Inst.addOperand(MCOperand::CreateImm(UImm16));
-    Inst.addOperand(MCOperand::CreateImm(Shift));
+  void addExtendOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    AArch64_AM::ShiftExtendType ET = getShiftExtendType();
+    if (ET == AArch64_AM::LSL) ET = AArch64_AM::UXTW;
+    unsigned Imm = AArch64_AM::getArithExtendImm(ET, getShiftExtendAmount());
+    Inst.addOperand(MCOperand::CreateImm(Imm));
   }
 
-  void addPRFMOperands(MCInst &Inst, unsigned N) const {
+  void addExtend64Operands(MCInst &Inst, unsigned N) const {
     assert(N == 1 && "Invalid number of operands!");
+    AArch64_AM::ShiftExtendType ET = getShiftExtendType();
+    if (ET == AArch64_AM::LSL) ET = AArch64_AM::UXTX;
+    unsigned Imm = AArch64_AM::getArithExtendImm(ET, getShiftExtendAmount());
+    Inst.addOperand(MCOperand::CreateImm(Imm));
+  }
 
-    const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
-    assert(CE->getValue() >= 0 && CE->getValue() <= 31
-           && "PRFM operand should be 5-bits");
+  void addMemExtendOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+    AArch64_AM::ShiftExtendType ET = getShiftExtendType();
+    bool IsSigned = ET == AArch64_AM::SXTW || ET == AArch64_AM::SXTX;
+    Inst.addOperand(MCOperand::CreateImm(IsSigned));
+    Inst.addOperand(MCOperand::CreateImm(getShiftExtendAmount() != 0));
+  }
 
-    Inst.addOperand(MCOperand::CreateImm(CE->getValue()));
+  // For 8-bit load/store instructions with a register offset, both the
+  // "DoShift" and "NoShift" variants have a shift of 0. Because of this,
+  // they're disambiguated by whether the shift was explicit or implicit rather
+  // than its size.
+  void addMemExtend8Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 2 && "Invalid number of operands!");
+    AArch64_AM::ShiftExtendType ET = getShiftExtendType();
+    bool IsSigned = ET == AArch64_AM::SXTW || ET == AArch64_AM::SXTX;
+    Inst.addOperand(MCOperand::CreateImm(IsSigned));
+    Inst.addOperand(MCOperand::CreateImm(hasShiftExtendAmount()));
   }
 
-  // For Add-sub (extended register) operands.
-  void addRegExtendOperands(MCInst &Inst, unsigned N) const {
+  template<int Shift>
+  void addMOVZMovAliasOperands(MCInst &Inst, unsigned N) const {
     assert(N == 1 && "Invalid number of operands!");
 
-    Inst.addOperand(MCOperand::CreateImm(ShiftExtend.Amount));
+    const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
+    uint64_t Value = CE->getValue();
+    Inst.addOperand(MCOperand::CreateImm((Value >> Shift) & 0xffff));
   }
 
-  // For Vector Immediates shifted imm operands.
-  void addNeonMovImmShiftLSLOperands(MCInst &Inst, unsigned N) const {
+  template<int Shift>
+  void addMOVNMovAliasOperands(MCInst &Inst, unsigned N) const {
     assert(N == 1 && "Invalid number of operands!");
 
-    if (ShiftExtend.Amount % 8 != 0 || ShiftExtend.Amount > 24)
-      llvm_unreachable("Invalid shift amount for vector immediate inst.");
-
-    // Encode LSL shift amount 0, 8, 16, 24 as 0, 1, 2, 3.
-    int64_t Imm = ShiftExtend.Amount / 8;
-    Inst.addOperand(MCOperand::CreateImm(Imm));
+    const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
+    uint64_t Value = CE->getValue();
+    Inst.addOperand(MCOperand::CreateImm((~Value >> Shift) & 0xffff));
   }
 
-  void addNeonMovImmShiftLSLHOperands(MCInst &Inst, unsigned N) const {
-    assert(N == 1 && "Invalid number of operands!");
+  void print(raw_ostream &OS) const override;
 
-    if (ShiftExtend.Amount != 0 && ShiftExtend.Amount != 8)
-      llvm_unreachable("Invalid shift amount for vector immediate inst.");
-
-    // Encode LSLH shift amount 0, 8  as 0, 1.
-    int64_t Imm = ShiftExtend.Amount / 8;
-    Inst.addOperand(MCOperand::CreateImm(Imm));
+  static AArch64Operand *CreateToken(StringRef Str, bool IsSuffix, SMLoc S,
+                                   MCContext &Ctx) {
+    AArch64Operand *Op = new AArch64Operand(k_Token, Ctx);
+    Op->Tok.Data = Str.data();
+    Op->Tok.Length = Str.size();
+    Op->Tok.IsSuffix = IsSuffix;
+    Op->StartLoc = S;
+    Op->EndLoc = S;
+    return Op;
   }
 
-  void addNeonMovImmShiftMSLOperands(MCInst &Inst, unsigned N) const {
-    assert(N == 1 && "Invalid number of operands!");
-
-    if (ShiftExtend.Amount != 8 && ShiftExtend.Amount != 16)
-      llvm_unreachable("Invalid shift amount for vector immediate inst.");
+  static AArch64Operand *CreateReg(unsigned RegNum, bool isVector, SMLoc S,
+                                 SMLoc E, MCContext &Ctx) {
+    AArch64Operand *Op = new AArch64Operand(k_Register, Ctx);
+    Op->Reg.RegNum = RegNum;
+    Op->Reg.isVector = isVector;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
 
-    // Encode MSL shift amount 8, 16  as 0, 1.
-    int64_t Imm = ShiftExtend.Amount / 8 - 1;
-    Inst.addOperand(MCOperand::CreateImm(Imm));
+  static AArch64Operand *CreateVectorList(unsigned RegNum, unsigned Count,
+                                        unsigned NumElements, char ElementKind,
+                                        SMLoc S, SMLoc E, MCContext &Ctx) {
+    AArch64Operand *Op = new AArch64Operand(k_VectorList, Ctx);
+    Op->VectorList.RegNum = RegNum;
+    Op->VectorList.Count = Count;
+    Op->VectorList.NumElements = NumElements;
+    Op->VectorList.ElementKind = ElementKind;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
   }
 
-  // For the extend in load-store (register offset) instructions.
-  template<unsigned MemSize>
-  void addAddrRegExtendOperands(MCInst &Inst, unsigned N) const {
-    addAddrRegExtendOperands(Inst, N, MemSize);
+  static AArch64Operand *CreateVectorIndex(unsigned Idx, SMLoc S, SMLoc E,
+                                         MCContext &Ctx) {
+    AArch64Operand *Op = new AArch64Operand(k_VectorIndex, Ctx);
+    Op->VectorIndex.Val = Idx;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
   }
 
-  void addAddrRegExtendOperands(MCInst &Inst, unsigned N,
-                                unsigned MemSize) const {
-    assert(N == 1 && "Invalid number of operands!");
+  static AArch64Operand *CreateImm(const MCExpr *Val, SMLoc S, SMLoc E,
+                                 MCContext &Ctx) {
+    AArch64Operand *Op = new AArch64Operand(k_Immediate, Ctx);
+    Op->Imm.Val = Val;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
 
-    // First bit of Option is set in instruction classes, the high two bits are
-    // as follows:
-    unsigned OptionHi = 0;
-    switch (ShiftExtend.ShiftType) {
-    case A64SE::UXTW:
-    case A64SE::LSL:
-      OptionHi = 1;
-      break;
-    case A64SE::SXTW:
-    case A64SE::SXTX:
-      OptionHi = 3;
-      break;
-    default:
-      llvm_unreachable("Invalid extend type for register offset");
-    }
+  static AArch64Operand *CreateShiftedImm(const MCExpr *Val,
+                                          unsigned ShiftAmount, SMLoc S,
+                                          SMLoc E, MCContext &Ctx) {
+    AArch64Operand *Op = new AArch64Operand(k_ShiftedImm, Ctx);
+    Op->ShiftedImm .Val = Val;
+    Op->ShiftedImm.ShiftAmount = ShiftAmount;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
 
-    unsigned S = 0;
-    if (MemSize == 1 && !ShiftExtend.ImplicitAmount)
-      S = 1;
-    else if (MemSize != 1 && ShiftExtend.Amount != 0)
-      S = 1;
+  static AArch64Operand *CreateCondCode(AArch64CC::CondCode Code, SMLoc S,
+                                        SMLoc E, MCContext &Ctx) {
+    AArch64Operand *Op = new AArch64Operand(k_CondCode, Ctx);
+    Op->CondCode.Code = Code;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
 
-    Inst.addOperand(MCOperand::CreateImm((OptionHi << 1) | S));
+  static AArch64Operand *CreateFPImm(unsigned Val, SMLoc S, MCContext &Ctx) {
+    AArch64Operand *Op = new AArch64Operand(k_FPImm, Ctx);
+    Op->FPImm.Val = Val;
+    Op->StartLoc = S;
+    Op->EndLoc = S;
+    return Op;
   }
-  void addShiftOperands(MCInst &Inst, unsigned N) const {
-    assert(N == 1 && "Invalid number of operands!");
 
-    Inst.addOperand(MCOperand::CreateImm(ShiftExtend.Amount));
+  static AArch64Operand *CreateBarrier(unsigned Val, SMLoc S, MCContext &Ctx) {
+    AArch64Operand *Op = new AArch64Operand(k_Barrier, Ctx);
+    Op->Barrier.Val = Val;
+    Op->StartLoc = S;
+    Op->EndLoc = S;
+    return Op;
   }
 
-  void addNeonUImm64MaskOperands(MCInst &Inst, unsigned N) const {
-    assert(N == 1 && "Invalid number of operands!");
+  static AArch64Operand *CreateSysReg(StringRef Str, SMLoc S,
+                                    uint64_t FeatureBits, MCContext &Ctx) {
+    AArch64Operand *Op = new AArch64Operand(k_SysReg, Ctx);
+    Op->SysReg.Data = Str.data();
+    Op->SysReg.Length = Str.size();
+    Op->SysReg.FeatureBits = FeatureBits;
+    Op->StartLoc = S;
+    Op->EndLoc = S;
+    return Op;
+  }
 
-    // A bit from each byte in the constant forms the encoded immediate
-    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
-    uint64_t Value = CE->getValue();
+  static AArch64Operand *CreateSysCR(unsigned Val, SMLoc S, SMLoc E,
+                                   MCContext &Ctx) {
+    AArch64Operand *Op = new AArch64Operand(k_SysCR, Ctx);
+    Op->SysCRImm.Val = Val;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
+  }
 
-    unsigned Imm = 0;
-    for (unsigned i = 0; i < 8; ++i, Value >>= 8) {
-      Imm |= (Value & 1) << i;
-    }
-    Inst.addOperand(MCOperand::CreateImm(Imm));
+  static AArch64Operand *CreatePrefetch(unsigned Val, SMLoc S, MCContext &Ctx) {
+    AArch64Operand *Op = new AArch64Operand(k_Prefetch, Ctx);
+    Op->Prefetch.Val = Val;
+    Op->StartLoc = S;
+    Op->EndLoc = S;
+    return Op;
   }
 
-  void addVectorListOperands(MCInst &Inst, unsigned N) const {
-    assert(N == 1 && "Invalid number of operands!");
-    Inst.addOperand(MCOperand::CreateReg(VectorList.RegNum));
+  static AArch64Operand *CreateShiftExtend(AArch64_AM::ShiftExtendType ShOp,
+                                         unsigned Val, bool HasExplicitAmount,
+                                         SMLoc S, SMLoc E, MCContext &Ctx) {
+    AArch64Operand *Op = new AArch64Operand(k_ShiftExtend, Ctx);
+    Op->ShiftExtend.Type = ShOp;
+    Op->ShiftExtend.Amount = Val;
+    Op->ShiftExtend.HasExplicitAmount = HasExplicitAmount;
+    Op->StartLoc = S;
+    Op->EndLoc = E;
+    return Op;
   }
 };
 
 } // end anonymous namespace.
 
-AArch64AsmParser::OperandMatchResultTy
-AArch64AsmParser::ParseOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
-                               StringRef Mnemonic) {
+void AArch64Operand::print(raw_ostream &OS) const {
+  switch (Kind) {
+  case k_FPImm:
+    OS << "<fpimm " << getFPImm() << "("
+       << AArch64_AM::getFPImmFloat(getFPImm()) << ") >";
+    break;
+  case k_Barrier: {
+    bool Valid;
+    StringRef Name = AArch64DB::DBarrierMapper().toString(getBarrier(), Valid);
+    if (Valid)
+      OS << "<barrier " << Name << ">";
+    else
+      OS << "<barrier invalid #" << getBarrier() << ">";
+    break;
+  }
+  case k_Immediate:
+    getImm()->print(OS);
+    break;
+  case k_ShiftedImm: {
+    unsigned Shift = getShiftedImmShift();
+    OS << "<shiftedimm ";
+    getShiftedImmVal()->print(OS);
+    OS << ", lsl #" << AArch64_AM::getShiftValue(Shift) << ">";
+    break;
+  }
+  case k_CondCode:
+    OS << "<condcode " << getCondCode() << ">";
+    break;
+  case k_Register:
+    OS << "<register " << getReg() << ">";
+    break;
+  case k_VectorList: {
+    OS << "<vectorlist ";
+    unsigned Reg = getVectorListStart();
+    for (unsigned i = 0, e = getVectorListCount(); i != e; ++i)
+      OS << Reg + i << " ";
+    OS << ">";
+    break;
+  }
+  case k_VectorIndex:
+    OS << "<vectorindex " << getVectorIndex() << ">";
+    break;
+  case k_SysReg:
+    OS << "<sysreg: " << getSysReg() << '>';
+    break;
+  case k_Token:
+    OS << "'" << getToken() << "'";
+    break;
+  case k_SysCR:
+    OS << "c" << getSysCR();
+    break;
+  case k_Prefetch: {
+    bool Valid;
+    StringRef Name = AArch64PRFM::PRFMMapper().toString(getPrefetch(), Valid);
+    if (Valid)
+      OS << "<prfop " << Name << ">";
+    else
+      OS << "<prfop invalid #" << getPrefetch() << ">";
+    break;
+  }
+  case k_ShiftExtend: {
+    OS << "<" << AArch64_AM::getShiftExtendName(getShiftExtendType()) << " #"
+       << getShiftExtendAmount();
+    if (!hasShiftExtendAmount())
+      OS << "<imp>";
+    OS << '>';
+    break;
+  }
+  }
+}
 
-  // See if the operand has a custom parser
-  OperandMatchResultTy ResTy = MatchOperandParserImpl(Operands, Mnemonic);
+/// @name Auto-generated Match Functions
+/// {
 
-  // It could either succeed, fail or just not care.
-  if (ResTy != MatchOperand_NoMatch)
-    return ResTy;
+static unsigned MatchRegisterName(StringRef Name);
 
-  switch (getLexer().getKind()) {
-  default:
-    Error(Parser.getTok().getLoc(), "unexpected token in operand");
-    return MatchOperand_ParseFail;
-  case AsmToken::Identifier: {
-    // It might be in the LSL/UXTB family ...
-    OperandMatchResultTy GotShift = ParseShiftExtend(Operands);
+/// }
+
+static unsigned matchVectorRegName(StringRef Name) {
+  return StringSwitch<unsigned>(Name)
+      .Case("v0", AArch64::Q0)
+      .Case("v1", AArch64::Q1)
+      .Case("v2", AArch64::Q2)
+      .Case("v3", AArch64::Q3)
+      .Case("v4", AArch64::Q4)
+      .Case("v5", AArch64::Q5)
+      .Case("v6", AArch64::Q6)
+      .Case("v7", AArch64::Q7)
+      .Case("v8", AArch64::Q8)
+      .Case("v9", AArch64::Q9)
+      .Case("v10", AArch64::Q10)
+      .Case("v11", AArch64::Q11)
+      .Case("v12", AArch64::Q12)
+      .Case("v13", AArch64::Q13)
+      .Case("v14", AArch64::Q14)
+      .Case("v15", AArch64::Q15)
+      .Case("v16", AArch64::Q16)
+      .Case("v17", AArch64::Q17)
+      .Case("v18", AArch64::Q18)
+      .Case("v19", AArch64::Q19)
+      .Case("v20", AArch64::Q20)
+      .Case("v21", AArch64::Q21)
+      .Case("v22", AArch64::Q22)
+      .Case("v23", AArch64::Q23)
+      .Case("v24", AArch64::Q24)
+      .Case("v25", AArch64::Q25)
+      .Case("v26", AArch64::Q26)
+      .Case("v27", AArch64::Q27)
+      .Case("v28", AArch64::Q28)
+      .Case("v29", AArch64::Q29)
+      .Case("v30", AArch64::Q30)
+      .Case("v31", AArch64::Q31)
+      .Default(0);
+}
 
-    // We can only continue if no tokens were eaten.
-    if (GotShift != MatchOperand_NoMatch)
-      return GotShift;
+static bool isValidVectorKind(StringRef Name) {
+  return StringSwitch<bool>(Name.lower())
+      .Case(".8b", true)
+      .Case(".16b", true)
+      .Case(".4h", true)
+      .Case(".8h", true)
+      .Case(".2s", true)
+      .Case(".4s", true)
+      .Case(".1d", true)
+      .Case(".2d", true)
+      .Case(".1q", true)
+      // Accept the width neutral ones, too, for verbose syntax. If those
+      // aren't used in the right places, the token operand won't match so
+      // all will work out.
+      .Case(".b", true)
+      .Case(".h", true)
+      .Case(".s", true)
+      .Case(".d", true)
+      .Default(false);
+}
 
-    // ... or it might be a register ...
-    uint32_t NumLanes = 0;
-    OperandMatchResultTy GotReg = ParseRegister(Operands, NumLanes);
-    assert(GotReg != MatchOperand_ParseFail
-           && "register parsing shouldn't partially succeed");
-
-    if (GotReg == MatchOperand_Success) {
-      if (Parser.getTok().is(AsmToken::LBrac))
-        return ParseNEONLane(Operands, NumLanes);
-      else
-        return MatchOperand_Success;
-    }
-    // ... or it might be a symbolish thing
-  }
-    // Fall through
-  case AsmToken::LParen:  // E.g. (strcmp-4)
-  case AsmToken::Integer: // 1f, 2b labels
-  case AsmToken::String:  // quoted labels
-  case AsmToken::Dot:     // . is Current location
-  case AsmToken::Dollar:  // $ is PC
-  case AsmToken::Colon: {
-    SMLoc StartLoc  = Parser.getTok().getLoc();
-    SMLoc EndLoc;
-    const MCExpr *ImmVal = 0;
-
-    if (ParseImmediate(ImmVal) != MatchOperand_Success)
-      return MatchOperand_ParseFail;
+static void parseValidVectorKind(StringRef Name, unsigned &NumElements,
+                                 char &ElementKind) {
+  assert(isValidVectorKind(Name));
 
-    EndLoc = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
-    Operands.push_back(AArch64Operand::CreateImm(ImmVal, StartLoc, EndLoc));
-    return MatchOperand_Success;
-  }
-  case AsmToken::Hash: {   // Immediates
-    SMLoc StartLoc = Parser.getTok().getLoc();
-    SMLoc EndLoc;
-    const MCExpr *ImmVal = 0;
-    Parser.Lex();
+  ElementKind = Name.lower()[Name.size() - 1];
+  NumElements = 0;
 
-    if (ParseImmediate(ImmVal) != MatchOperand_Success)
-      return MatchOperand_ParseFail;
+  if (Name.size() == 2)
+    return;
 
-    EndLoc = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
-    Operands.push_back(AArch64Operand::CreateImm(ImmVal, StartLoc, EndLoc));
-    return MatchOperand_Success;
+  // Parse the lane count
+  Name = Name.drop_front();
+  while (isdigit(Name.front())) {
+    NumElements = 10 * NumElements + (Name.front() - '0');
+    Name = Name.drop_front();
   }
-  case AsmToken::LBrac: {
-    SMLoc Loc = Parser.getTok().getLoc();
-    Operands.push_back(AArch64Operand::CreateToken("[", Loc));
-    Parser.Lex(); // Eat '['
+}
 
-    // There's no comma after a '[', so we can parse the next operand
-    // immediately.
-    return ParseOperand(Operands, Mnemonic);
-  }
-  // The following will likely be useful later, but not in very early cases
-  case AsmToken::LCurly: // SIMD vector list is not parsed here
-    llvm_unreachable("Don't know how to deal with '{' in operand");
-    return MatchOperand_ParseFail;
+bool AArch64AsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
+                                     SMLoc &EndLoc) {
+  StartLoc = getLoc();
+  RegNo = tryParseRegister();
+  EndLoc = SMLoc::getFromPointer(getLoc().getPointer() - 1);
+  return (RegNo == (unsigned)-1);
+}
+
+/// tryParseRegister - Try to parse a register name. The token must be an
+/// Identifier when called, and if it is a register name the token is eaten and
+/// the register is added to the operand list.
+int AArch64AsmParser::tryParseRegister() {
+  const AsmToken &Tok = Parser.getTok();
+  assert(Tok.is(AsmToken::Identifier) && "Token is not an Identifier");
+
+  std::string lowerCase = Tok.getString().lower();
+  unsigned RegNum = MatchRegisterName(lowerCase);
+  // Also handle a few aliases of registers.
+  if (RegNum == 0)
+    RegNum = StringSwitch<unsigned>(lowerCase)
+                 .Case("fp",  AArch64::FP)
+                 .Case("lr",  AArch64::LR)
+                 .Case("x31", AArch64::XZR)
+                 .Case("w31", AArch64::WZR)
+                 .Default(0);
+
+  if (RegNum == 0)
+    return -1;
+
+  Parser.Lex(); // Eat identifier token.
+  return RegNum;
+}
+
+/// tryMatchVectorRegister - Try to parse a vector register name with optional
+/// kind specifier. If it is a register specifier, eat the token and return it.
+int AArch64AsmParser::tryMatchVectorRegister(StringRef &Kind, bool expected) {
+  if (Parser.getTok().isNot(AsmToken::Identifier)) {
+    TokError("vector register expected");
+    return -1;
+  }
+
+  StringRef Name = Parser.getTok().getString();
+  // If there is a kind specifier, it's separated from the register name by
+  // a '.'.
+  size_t Start = 0, Next = Name.find('.');
+  StringRef Head = Name.slice(Start, Next);
+  unsigned RegNum = matchVectorRegName(Head);
+  if (RegNum) {
+    if (Next != StringRef::npos) {
+      Kind = Name.slice(Next, StringRef::npos);
+      if (!isValidVectorKind(Kind)) {
+        TokError("invalid vector kind qualifier");
+        return -1;
+      }
+    }
+    Parser.Lex(); // Eat the register token.
+    return RegNum;
   }
+
+  if (expected)
+    TokError("vector register expected");
+  return -1;
 }
 
+/// tryParseSysCROperand - Try to parse a system instruction CR operand name.
 AArch64AsmParser::OperandMatchResultTy
-AArch64AsmParser::ParseImmediate(const MCExpr *&ExprVal) {
-  if (getLexer().is(AsmToken::Colon)) {
-    AArch64MCExpr::VariantKind RefKind;
+AArch64AsmParser::tryParseSysCROperand(OperandVector &Operands) {
+  SMLoc S = getLoc();
 
-    OperandMatchResultTy ResTy = ParseRelocPrefix(RefKind);
-    if (ResTy != MatchOperand_Success)
-      return ResTy;
+  if (Parser.getTok().isNot(AsmToken::Identifier)) {
+    Error(S, "Expected cN operand where 0 <= N <= 15");
+    return MatchOperand_ParseFail;
+  }
 
-    const MCExpr *SubExprVal;
-    if (getParser().parseExpression(SubExprVal))
-      return MatchOperand_ParseFail;
+  StringRef Tok = Parser.getTok().getIdentifier();
+  if (Tok[0] != 'c' && Tok[0] != 'C') {
+    Error(S, "Expected cN operand where 0 <= N <= 15");
+    return MatchOperand_ParseFail;
+  }
 
-    ExprVal = AArch64MCExpr::Create(RefKind, SubExprVal, getContext());
-    return MatchOperand_Success;
+  uint32_t CRNum;
+  bool BadNum = Tok.drop_front().getAsInteger(10, CRNum);
+  if (BadNum || CRNum > 15) {
+    Error(S, "Expected cN operand where 0 <= N <= 15");
+    return MatchOperand_ParseFail;
   }
 
-  // No weird AArch64MCExpr prefix
-  return getParser().parseExpression(ExprVal)
-    ? MatchOperand_ParseFail : MatchOperand_Success;
+  Parser.Lex(); // Eat identifier token.
+  Operands.push_back(
+      AArch64Operand::CreateSysCR(CRNum, S, getLoc(), getContext()));
+  return MatchOperand_Success;
 }
 
-// A lane attached to a NEON register. "[N]", which should yield three tokens:
-// '[', N, ']'. A hash is not allowed to precede the immediate here.
+/// tryParsePrefetch - Try to parse a prefetch operand.
 AArch64AsmParser::OperandMatchResultTy
-AArch64AsmParser::ParseNEONLane(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
-                                uint32_t NumLanes) {
-  SMLoc Loc = Parser.getTok().getLoc();
+AArch64AsmParser::tryParsePrefetch(OperandVector &Operands) {
+  SMLoc S = getLoc();
+  const AsmToken &Tok = Parser.getTok();
+  // Either an identifier for named values or a 5-bit immediate.
+  bool Hash = Tok.is(AsmToken::Hash);
+  if (Hash || Tok.is(AsmToken::Integer)) {
+    if (Hash)
+      Parser.Lex(); // Eat hash token.
+    const MCExpr *ImmVal;
+    if (getParser().parseExpression(ImmVal))
+      return MatchOperand_ParseFail;
 
-  assert(Parser.getTok().is(AsmToken::LBrac) && "inappropriate operand");
-  Operands.push_back(AArch64Operand::CreateToken("[", Loc));
-  Parser.Lex(); // Eat '['
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(ImmVal);
+    if (!MCE) {
+      TokError("immediate value expected for prefetch operand");
+      return MatchOperand_ParseFail;
+    }
+    unsigned prfop = MCE->getValue();
+    if (prfop > 31) {
+      TokError("prefetch operand out of range, [0,31] expected");
+      return MatchOperand_ParseFail;
+    }
 
-  if (Parser.getTok().isNot(AsmToken::Integer)) {
-    Error(Parser.getTok().getLoc(), "expected lane number");
+    Operands.push_back(AArch64Operand::CreatePrefetch(prfop, S, getContext()));
+    return MatchOperand_Success;
+  }
+
+  if (Tok.isNot(AsmToken::Identifier)) {
+    TokError("pre-fetch hint expected");
     return MatchOperand_ParseFail;
   }
 
-  if (Parser.getTok().getIntVal() >= NumLanes) {
-    Error(Parser.getTok().getLoc(), "lane number incompatible with layout");
+  bool Valid;
+  unsigned prfop = AArch64PRFM::PRFMMapper().fromString(Tok.getString(), Valid);
+  if (!Valid) {
+    TokError("pre-fetch hint expected");
     return MatchOperand_ParseFail;
   }
 
-  const MCExpr *Lane = MCConstantExpr::Create(Parser.getTok().getIntVal(),
-                                              getContext());
-  SMLoc S = Parser.getTok().getLoc();
-  Parser.Lex(); // Eat actual lane
-  SMLoc E = Parser.getTok().getLoc();
-  Operands.push_back(AArch64Operand::CreateImm(Lane, S, E));
+  Parser.Lex(); // Eat identifier token.
+  Operands.push_back(AArch64Operand::CreatePrefetch(prfop, S, getContext()));
+  return MatchOperand_Success;
+}
 
+/// tryParseAdrpLabel - Parse and validate a source label for the ADRP
+/// instruction.
+AArch64AsmParser::OperandMatchResultTy
+AArch64AsmParser::tryParseAdrpLabel(OperandVector &Operands) {
+  SMLoc S = getLoc();
+  const MCExpr *Expr;
 
-  if (Parser.getTok().isNot(AsmToken::RBrac)) {
-    Error(Parser.getTok().getLoc(), "expected ']' after lane");
+  if (Parser.getTok().is(AsmToken::Hash)) {
+    Parser.Lex(); // Eat hash token.
+  }
+
+  if (parseSymbolicImmVal(Expr))
     return MatchOperand_ParseFail;
+
+  AArch64MCExpr::VariantKind ELFRefKind;
+  MCSymbolRefExpr::VariantKind DarwinRefKind;
+  int64_t Addend;
+  if (classifySymbolRef(Expr, ELFRefKind, DarwinRefKind, Addend)) {
+    if (DarwinRefKind == MCSymbolRefExpr::VK_None &&
+        ELFRefKind == AArch64MCExpr::VK_INVALID) {
+      // No modifier was specified at all; this is the syntax for an ELF basic
+      // ADRP relocation (unfortunately).
+      Expr =
+          AArch64MCExpr::Create(Expr, AArch64MCExpr::VK_ABS_PAGE, getContext());
+    } else if ((DarwinRefKind == MCSymbolRefExpr::VK_GOTPAGE ||
+                DarwinRefKind == MCSymbolRefExpr::VK_TLVPPAGE) &&
+               Addend != 0) {
+      Error(S, "gotpage label reference not allowed an addend");
+      return MatchOperand_ParseFail;
+    } else if (DarwinRefKind != MCSymbolRefExpr::VK_PAGE &&
+               DarwinRefKind != MCSymbolRefExpr::VK_GOTPAGE &&
+               DarwinRefKind != MCSymbolRefExpr::VK_TLVPPAGE &&
+               ELFRefKind != AArch64MCExpr::VK_GOT_PAGE &&
+               ELFRefKind != AArch64MCExpr::VK_GOTTPREL_PAGE &&
+               ELFRefKind != AArch64MCExpr::VK_TLSDESC_PAGE) {
+      // The operand must be an @page or @gotpage qualified symbolref.
+      Error(S, "page or gotpage label reference expected");
+      return MatchOperand_ParseFail;
+    }
   }
 
-  Operands.push_back(AArch64Operand::CreateToken("]", Loc));
-  Parser.Lex(); // Eat ']'
+  // We have either a label reference possibly with addend or an immediate. The
+  // addend is a raw value here. The linker will adjust it to only reference the
+  // page.
+  SMLoc E = SMLoc::getFromPointer(getLoc().getPointer() - 1);
+  Operands.push_back(AArch64Operand::CreateImm(Expr, S, E, getContext()));
 
   return MatchOperand_Success;
 }
 
+/// tryParseAdrLabel - Parse and validate a source label for the ADR
+/// instruction.
 AArch64AsmParser::OperandMatchResultTy
-AArch64AsmParser::ParseRelocPrefix(AArch64MCExpr::VariantKind &RefKind) {
-  assert(getLexer().is(AsmToken::Colon) && "expected a ':'");
-  Parser.Lex();
+AArch64AsmParser::tryParseAdrLabel(OperandVector &Operands) {
+  SMLoc S = getLoc();
+  const MCExpr *Expr;
 
-  if (getLexer().isNot(AsmToken::Identifier)) {
-    Error(Parser.getTok().getLoc(),
-          "expected relocation specifier in operand after ':'");
-    return MatchOperand_ParseFail;
+  if (Parser.getTok().is(AsmToken::Hash)) {
+    Parser.Lex(); // Eat hash token.
   }
 
-  std::string LowerCase = Parser.getTok().getIdentifier().lower();
-  RefKind = StringSwitch<AArch64MCExpr::VariantKind>(LowerCase)
-    .Case("got",              AArch64MCExpr::VK_AARCH64_GOT)
-    .Case("got_lo12",         AArch64MCExpr::VK_AARCH64_GOT_LO12)
-    .Case("lo12",             AArch64MCExpr::VK_AARCH64_LO12)
-    .Case("abs_g0",           AArch64MCExpr::VK_AARCH64_ABS_G0)
-    .Case("abs_g0_nc",        AArch64MCExpr::VK_AARCH64_ABS_G0_NC)
-    .Case("abs_g1",           AArch64MCExpr::VK_AARCH64_ABS_G1)
-    .Case("abs_g1_nc",        AArch64MCExpr::VK_AARCH64_ABS_G1_NC)
-    .Case("abs_g2",           AArch64MCExpr::VK_AARCH64_ABS_G2)
-    .Case("abs_g2_nc",        AArch64MCExpr::VK_AARCH64_ABS_G2_NC)
-    .Case("abs_g3",           AArch64MCExpr::VK_AARCH64_ABS_G3)
-    .Case("abs_g0_s",         AArch64MCExpr::VK_AARCH64_SABS_G0)
-    .Case("abs_g1_s",         AArch64MCExpr::VK_AARCH64_SABS_G1)
-    .Case("abs_g2_s",         AArch64MCExpr::VK_AARCH64_SABS_G2)
-    .Case("dtprel_g2",        AArch64MCExpr::VK_AARCH64_DTPREL_G2)
-    .Case("dtprel_g1",        AArch64MCExpr::VK_AARCH64_DTPREL_G1)
-    .Case("dtprel_g1_nc",     AArch64MCExpr::VK_AARCH64_DTPREL_G1_NC)
-    .Case("dtprel_g0",        AArch64MCExpr::VK_AARCH64_DTPREL_G0)
-    .Case("dtprel_g0_nc",     AArch64MCExpr::VK_AARCH64_DTPREL_G0_NC)
-    .Case("dtprel_hi12",      AArch64MCExpr::VK_AARCH64_DTPREL_HI12)
-    .Case("dtprel_lo12",      AArch64MCExpr::VK_AARCH64_DTPREL_LO12)
-    .Case("dtprel_lo12_nc",   AArch64MCExpr::VK_AARCH64_DTPREL_LO12_NC)
-    .Case("gottprel_g1",      AArch64MCExpr::VK_AARCH64_GOTTPREL_G1)
-    .Case("gottprel_g0_nc",   AArch64MCExpr::VK_AARCH64_GOTTPREL_G0_NC)
-    .Case("gottprel",         AArch64MCExpr::VK_AARCH64_GOTTPREL)
-    .Case("gottprel_lo12",    AArch64MCExpr::VK_AARCH64_GOTTPREL_LO12)
-    .Case("tprel_g2",         AArch64MCExpr::VK_AARCH64_TPREL_G2)
-    .Case("tprel_g1",         AArch64MCExpr::VK_AARCH64_TPREL_G1)
-    .Case("tprel_g1_nc",      AArch64MCExpr::VK_AARCH64_TPREL_G1_NC)
-    .Case("tprel_g0",         AArch64MCExpr::VK_AARCH64_TPREL_G0)
-    .Case("tprel_g0_nc",      AArch64MCExpr::VK_AARCH64_TPREL_G0_NC)
-    .Case("tprel_hi12",       AArch64MCExpr::VK_AARCH64_TPREL_HI12)
-    .Case("tprel_lo12",       AArch64MCExpr::VK_AARCH64_TPREL_LO12)
-    .Case("tprel_lo12_nc",    AArch64MCExpr::VK_AARCH64_TPREL_LO12_NC)
-    .Case("tlsdesc",          AArch64MCExpr::VK_AARCH64_TLSDESC)
-    .Case("tlsdesc_lo12",     AArch64MCExpr::VK_AARCH64_TLSDESC_LO12)
-    .Default(AArch64MCExpr::VK_AARCH64_None);
-
-  if (RefKind == AArch64MCExpr::VK_AARCH64_None) {
-    Error(Parser.getTok().getLoc(),
-          "expected relocation specifier in operand after ':'");
+  if (getParser().parseExpression(Expr))
     return MatchOperand_ParseFail;
-  }
-  Parser.Lex(); // Eat identifier
 
-  if (getLexer().isNot(AsmToken::Colon)) {
-    Error(Parser.getTok().getLoc(),
-          "expected ':' after relocation specifier");
-    return MatchOperand_ParseFail;
-  }
-  Parser.Lex();
+  SMLoc E = SMLoc::getFromPointer(getLoc().getPointer() - 1);
+  Operands.push_back(AArch64Operand::CreateImm(Expr, S, E, getContext()));
+
   return MatchOperand_Success;
 }
 
+/// tryParseFPImm - A floating point immediate expression operand.
 AArch64AsmParser::OperandMatchResultTy
-AArch64AsmParser::ParseImmWithLSLOperand(
-                               SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+AArch64AsmParser::tryParseFPImm(OperandVector &Operands) {
+  SMLoc S = getLoc();
 
-  SMLoc S = Parser.getTok().getLoc();
+  bool Hash = false;
+  if (Parser.getTok().is(AsmToken::Hash)) {
+    Parser.Lex(); // Eat '#'
+    Hash = true;
+  }
+
+  // Handle negation, as that still comes through as a separate token.
+  bool isNegative = false;
+  if (Parser.getTok().is(AsmToken::Minus)) {
+    isNegative = true;
+    Parser.Lex();
+  }
+  const AsmToken &Tok = Parser.getTok();
+  if (Tok.is(AsmToken::Real)) {
+    APFloat RealVal(APFloat::IEEEdouble, Tok.getString());
+    uint64_t IntVal = RealVal.bitcastToAPInt().getZExtValue();
+    // If we had a '-' in front, toggle the sign bit.
+    IntVal ^= (uint64_t)isNegative << 63;
+    int Val = AArch64_AM::getFP64Imm(APInt(64, IntVal));
+    Parser.Lex(); // Eat the token.
+    // Check for out of range values. As an exception, we let Zero through,
+    // as we handle that special case in post-processing before matching in
+    // order to use the zero register for it.
+    if (Val == -1 && !RealVal.isZero()) {
+      TokError("expected compatible register or floating-point constant");
+      return MatchOperand_ParseFail;
+    }
+    Operands.push_back(AArch64Operand::CreateFPImm(Val, S, getContext()));
+    return MatchOperand_Success;
+  }
+  if (Tok.is(AsmToken::Integer)) {
+    int64_t Val;
+    if (!isNegative && Tok.getString().startswith("0x")) {
+      Val = Tok.getIntVal();
+      if (Val > 255 || Val < 0) {
+        TokError("encoded floating point value out of range");
+        return MatchOperand_ParseFail;
+      }
+    } else {
+      APFloat RealVal(APFloat::IEEEdouble, Tok.getString());
+      uint64_t IntVal = RealVal.bitcastToAPInt().getZExtValue();
+      // If we had a '-' in front, toggle the sign bit.
+      IntVal ^= (uint64_t)isNegative << 63;
+      Val = AArch64_AM::getFP64Imm(APInt(64, IntVal));
+    }
+    Parser.Lex(); // Eat the token.
+    Operands.push_back(AArch64Operand::CreateFPImm(Val, S, getContext()));
+    return MatchOperand_Success;
+  }
+
+  if (!Hash)
+    return MatchOperand_NoMatch;
+
+  TokError("invalid floating point immediate");
+  return MatchOperand_ParseFail;
+}
+
+/// tryParseAddSubImm - Parse ADD/SUB shifted immediate operand
+AArch64AsmParser::OperandMatchResultTy
+AArch64AsmParser::tryParseAddSubImm(OperandVector &Operands) {
+  SMLoc S = getLoc();
 
   if (Parser.getTok().is(AsmToken::Hash))
     Parser.Lex(); // Eat '#'
@@ -1491,11 +2091,21 @@ AArch64AsmParser::ParseImmWithLSLOperand(
     return MatchOperand_NoMatch;
 
   const MCExpr *Imm;
-  if (ParseImmediate(Imm) != MatchOperand_Success)
+  if (parseSymbolicImmVal(Imm))
     return MatchOperand_ParseFail;
   else if (Parser.getTok().isNot(AsmToken::Comma)) {
+    uint64_t ShiftAmount = 0;
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(Imm);
+    if (MCE) {
+      int64_t Val = MCE->getValue();
+      if (Val > 0xfff && (Val & 0xfff) == 0) {
+        Imm = MCConstantExpr::Create(Val >> 12, getContext());
+        ShiftAmount = 12;
+      }
+    }
     SMLoc E = Parser.getTok().getLoc();
-    Operands.push_back(AArch64Operand::CreateImmWithLSL(Imm, 0, true, S, E));
+    Operands.push_back(AArch64Operand::CreateShiftedImm(Imm, ShiftAmount, S, E,
+                                                        getContext()));
     return MatchOperand_Success;
   }
 
@@ -1503,18 +2113,22 @@ AArch64AsmParser::ParseImmWithLSLOperand(
   Parser.Lex();
 
   // The optional operand must be "lsl #N" where N is non-negative.
-  if (Parser.getTok().is(AsmToken::Identifier)
-      && Parser.getTok().getIdentifier().equals_lower("lsl")) {
-    Parser.Lex();
+  if (!Parser.getTok().is(AsmToken::Identifier) ||
+      !Parser.getTok().getIdentifier().equals_lower("lsl")) {
+    Error(Parser.getTok().getLoc(), "only 'lsl #+N' valid after immediate");
+    return MatchOperand_ParseFail;
+  }
 
-    if (Parser.getTok().is(AsmToken::Hash)) {
-      Parser.Lex();
+  // Eat 'lsl'
+  Parser.Lex();
 
-      if (Parser.getTok().isNot(AsmToken::Integer)) {
-        Error(Parser.getTok().getLoc(), "only 'lsl #+N' valid after immediate");
-        return MatchOperand_ParseFail;
-      }
-    }
+  if (Parser.getTok().is(AsmToken::Hash)) {
+    Parser.Lex();
+  }
+
+  if (Parser.getTok().isNot(AsmToken::Integer)) {
+    Error(Parser.getTok().getLoc(), "only 'lsl #+N' valid after immediate");
+    return MatchOperand_ParseFail;
   }
 
   int64_t ShiftAmount = Parser.getTok().getIntVal();
@@ -1526,791 +2140,977 @@ AArch64AsmParser::ParseImmWithLSLOperand(
   Parser.Lex(); // Eat the number
 
   SMLoc E = Parser.getTok().getLoc();
-  Operands.push_back(AArch64Operand::CreateImmWithLSL(Imm, ShiftAmount,
-                                                      false, S, E));
+  Operands.push_back(AArch64Operand::CreateShiftedImm(Imm, ShiftAmount,
+                                                      S, E, getContext()));
   return MatchOperand_Success;
 }
 
+/// parseCondCodeString - Parse a Condition Code string.
+AArch64CC::CondCode AArch64AsmParser::parseCondCodeString(StringRef Cond) {
+  AArch64CC::CondCode CC = StringSwitch<AArch64CC::CondCode>(Cond.lower())
+                    .Case("eq", AArch64CC::EQ)
+                    .Case("ne", AArch64CC::NE)
+                    .Case("cs", AArch64CC::HS)
+                    .Case("hs", AArch64CC::HS)
+                    .Case("cc", AArch64CC::LO)
+                    .Case("lo", AArch64CC::LO)
+                    .Case("mi", AArch64CC::MI)
+                    .Case("pl", AArch64CC::PL)
+                    .Case("vs", AArch64CC::VS)
+                    .Case("vc", AArch64CC::VC)
+                    .Case("hi", AArch64CC::HI)
+                    .Case("ls", AArch64CC::LS)
+                    .Case("ge", AArch64CC::GE)
+                    .Case("lt", AArch64CC::LT)
+                    .Case("gt", AArch64CC::GT)
+                    .Case("le", AArch64CC::LE)
+                    .Case("al", AArch64CC::AL)
+                    .Case("nv", AArch64CC::NV)
+                    .Default(AArch64CC::Invalid);
+  return CC;
+}
+
+/// parseCondCode - Parse a Condition Code operand.
+bool AArch64AsmParser::parseCondCode(OperandVector &Operands,
+                                     bool invertCondCode) {
+  SMLoc S = getLoc();
+  const AsmToken &Tok = Parser.getTok();
+  assert(Tok.is(AsmToken::Identifier) && "Token is not an Identifier");
+
+  StringRef Cond = Tok.getString();
+  AArch64CC::CondCode CC = parseCondCodeString(Cond);
+  if (CC == AArch64CC::Invalid)
+    return TokError("invalid condition code");
+  Parser.Lex(); // Eat identifier token.
+
+  if (invertCondCode)
+    CC = AArch64CC::getInvertedCondCode(AArch64CC::CondCode(CC));
+
+  Operands.push_back(
+      AArch64Operand::CreateCondCode(CC, S, getLoc(), getContext()));
+  return false;
+}
 
+/// tryParseOptionalShift - Some operands take an optional shift argument. Parse
+/// them if present.
 AArch64AsmParser::OperandMatchResultTy
-AArch64AsmParser::ParseCondCodeOperand(
-                               SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
-  if (Parser.getTok().isNot(AsmToken::Identifier))
+AArch64AsmParser::tryParseOptionalShiftExtend(OperandVector &Operands) {
+  const AsmToken &Tok = Parser.getTok();
+  std::string LowerID = Tok.getString().lower();
+  AArch64_AM::ShiftExtendType ShOp =
+      StringSwitch<AArch64_AM::ShiftExtendType>(LowerID)
+          .Case("lsl", AArch64_AM::LSL)
+          .Case("lsr", AArch64_AM::LSR)
+          .Case("asr", AArch64_AM::ASR)
+          .Case("ror", AArch64_AM::ROR)
+          .Case("msl", AArch64_AM::MSL)
+          .Case("uxtb", AArch64_AM::UXTB)
+          .Case("uxth", AArch64_AM::UXTH)
+          .Case("uxtw", AArch64_AM::UXTW)
+          .Case("uxtx", AArch64_AM::UXTX)
+          .Case("sxtb", AArch64_AM::SXTB)
+          .Case("sxth", AArch64_AM::SXTH)
+          .Case("sxtw", AArch64_AM::SXTW)
+          .Case("sxtx", AArch64_AM::SXTX)
+          .Default(AArch64_AM::InvalidShiftExtend);
+
+  if (ShOp == AArch64_AM::InvalidShiftExtend)
     return MatchOperand_NoMatch;
 
-  StringRef Tok = Parser.getTok().getIdentifier();
-  A64CC::CondCodes CondCode = A64StringToCondCode(Tok);
+  SMLoc S = Tok.getLoc();
+  Parser.Lex();
 
-  if (CondCode == A64CC::Invalid)
-    return MatchOperand_NoMatch;
+  bool Hash = getLexer().is(AsmToken::Hash);
+  if (!Hash && getLexer().isNot(AsmToken::Integer)) {
+    if (ShOp == AArch64_AM::LSL || ShOp == AArch64_AM::LSR ||
+        ShOp == AArch64_AM::ASR || ShOp == AArch64_AM::ROR ||
+        ShOp == AArch64_AM::MSL) {
+      // We expect a number here.
+      TokError("expected #imm after shift specifier");
+      return MatchOperand_ParseFail;
+    }
 
-  SMLoc S = Parser.getTok().getLoc();
-  Parser.Lex(); // Eat condition code
-  SMLoc E = Parser.getTok().getLoc();
+    // "extend" type operatoins don't need an immediate, #0 is implicit.
+    SMLoc E = SMLoc::getFromPointer(getLoc().getPointer() - 1);
+    Operands.push_back(
+        AArch64Operand::CreateShiftExtend(ShOp, 0, false, S, E, getContext()));
+    return MatchOperand_Success;
+  }
 
-  Operands.push_back(AArch64Operand::CreateCondCode(CondCode, S, E));
-  return MatchOperand_Success;
-}
+  if (Hash)
+    Parser.Lex(); // Eat the '#'.
 
-AArch64AsmParser::OperandMatchResultTy
-AArch64AsmParser::ParseCRxOperand(
-                               SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
-  SMLoc S = Parser.getTok().getLoc();
-  if (Parser.getTok().isNot(AsmToken::Identifier)) {
-    Error(S, "Expected cN operand where 0 <= N <= 15");
+  // Make sure we do actually have a number
+  if (!Parser.getTok().is(AsmToken::Integer)) {
+    Error(Parser.getTok().getLoc(),
+          "expected integer shift amount");
     return MatchOperand_ParseFail;
   }
 
-  StringRef Tok = Parser.getTok().getIdentifier();
-  if (Tok[0] != 'c' && Tok[0] != 'C') {
-    Error(S, "Expected cN operand where 0 <= N <= 15");
+  const MCExpr *ImmVal;
+  if (getParser().parseExpression(ImmVal))
     return MatchOperand_ParseFail;
-  }
 
-  uint32_t CRNum;
-  bool BadNum = Tok.drop_front().getAsInteger(10, CRNum);
-  if (BadNum || CRNum > 15) {
-    Error(S, "Expected cN operand where 0 <= N <= 15");
+  const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(ImmVal);
+  if (!MCE) {
+    TokError("expected #imm after shift specifier");
     return MatchOperand_ParseFail;
   }
 
-  const MCExpr *CRImm = MCConstantExpr::Create(CRNum, getContext());
-
-  Parser.Lex();
-  SMLoc E = Parser.getTok().getLoc();
-
-  Operands.push_back(AArch64Operand::CreateImm(CRImm, S, E));
+  SMLoc E = SMLoc::getFromPointer(getLoc().getPointer() - 1);
+  Operands.push_back(AArch64Operand::CreateShiftExtend(
+      ShOp, MCE->getValue(), true, S, E, getContext()));
   return MatchOperand_Success;
 }
 
-AArch64AsmParser::OperandMatchResultTy
-AArch64AsmParser::ParseFPImmOperand(
-                               SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+/// parseSysAlias - The IC, DC, AT, and TLBI instructions are simple aliases for
+/// the SYS instruction. Parse them specially so that we create a SYS MCInst.
+bool AArch64AsmParser::parseSysAlias(StringRef Name, SMLoc NameLoc,
+                                   OperandVector &Operands) {
+  if (Name.find('.') != StringRef::npos)
+    return TokError("invalid operand");
 
-  SMLoc S = Parser.getTok().getLoc();
+  Mnemonic = Name;
+  Operands.push_back(
+      AArch64Operand::CreateToken("sys", false, NameLoc, getContext()));
 
-  bool Hash = false;
-  if (Parser.getTok().is(AsmToken::Hash)) {
-    Parser.Lex(); // Eat '#'
-    Hash = true;
-  }
+  const AsmToken &Tok = Parser.getTok();
+  StringRef Op = Tok.getString();
+  SMLoc S = Tok.getLoc();
 
-  bool Negative = false;
-  if (Parser.getTok().is(AsmToken::Minus)) {
-    Negative = true;
-    Parser.Lex(); // Eat '-'
-  } else if (Parser.getTok().is(AsmToken::Plus)) {
-    Parser.Lex(); // Eat '+'
+  const MCExpr *Expr = nullptr;
+
+#define SYS_ALIAS(op1, Cn, Cm, op2)                                            \
+  do {                                                                         \
+    Expr = MCConstantExpr::Create(op1, getContext());                          \
+    Operands.push_back(                                                        \
+        AArch64Operand::CreateImm(Expr, S, getLoc(), getContext()));           \
+    Operands.push_back(                                                        \
+        AArch64Operand::CreateSysCR(Cn, S, getLoc(), getContext()));           \
+    Operands.push_back(                                                        \
+        AArch64Operand::CreateSysCR(Cm, S, getLoc(), getContext()));           \
+    Expr = MCConstantExpr::Create(op2, getContext());                          \
+    Operands.push_back(                                                        \
+        AArch64Operand::CreateImm(Expr, S, getLoc(), getContext()));           \
+  } while (0)
+
+  if (Mnemonic == "ic") {
+    if (!Op.compare_lower("ialluis")) {
+      // SYS #0, C7, C1, #0
+      SYS_ALIAS(0, 7, 1, 0);
+    } else if (!Op.compare_lower("iallu")) {
+      // SYS #0, C7, C5, #0
+      SYS_ALIAS(0, 7, 5, 0);
+    } else if (!Op.compare_lower("ivau")) {
+      // SYS #3, C7, C5, #1
+      SYS_ALIAS(3, 7, 5, 1);
+    } else {
+      return TokError("invalid operand for IC instruction");
+    }
+  } else if (Mnemonic == "dc") {
+    if (!Op.compare_lower("zva")) {
+      // SYS #3, C7, C4, #1
+      SYS_ALIAS(3, 7, 4, 1);
+    } else if (!Op.compare_lower("ivac")) {
+      // SYS #3, C7, C6, #1
+      SYS_ALIAS(0, 7, 6, 1);
+    } else if (!Op.compare_lower("isw")) {
+      // SYS #0, C7, C6, #2
+      SYS_ALIAS(0, 7, 6, 2);
+    } else if (!Op.compare_lower("cvac")) {
+      // SYS #3, C7, C10, #1
+      SYS_ALIAS(3, 7, 10, 1);
+    } else if (!Op.compare_lower("csw")) {
+      // SYS #0, C7, C10, #2
+      SYS_ALIAS(0, 7, 10, 2);
+    } else if (!Op.compare_lower("cvau")) {
+      // SYS #3, C7, C11, #1
+      SYS_ALIAS(3, 7, 11, 1);
+    } else if (!Op.compare_lower("civac")) {
+      // SYS #3, C7, C14, #1
+      SYS_ALIAS(3, 7, 14, 1);
+    } else if (!Op.compare_lower("cisw")) {
+      // SYS #0, C7, C14, #2
+      SYS_ALIAS(0, 7, 14, 2);
+    } else {
+      return TokError("invalid operand for DC instruction");
+    }
+  } else if (Mnemonic == "at") {
+    if (!Op.compare_lower("s1e1r")) {
+      // SYS #0, C7, C8, #0
+      SYS_ALIAS(0, 7, 8, 0);
+    } else if (!Op.compare_lower("s1e2r")) {
+      // SYS #4, C7, C8, #0
+      SYS_ALIAS(4, 7, 8, 0);
+    } else if (!Op.compare_lower("s1e3r")) {
+      // SYS #6, C7, C8, #0
+      SYS_ALIAS(6, 7, 8, 0);
+    } else if (!Op.compare_lower("s1e1w")) {
+      // SYS #0, C7, C8, #1
+      SYS_ALIAS(0, 7, 8, 1);
+    } else if (!Op.compare_lower("s1e2w")) {
+      // SYS #4, C7, C8, #1
+      SYS_ALIAS(4, 7, 8, 1);
+    } else if (!Op.compare_lower("s1e3w")) {
+      // SYS #6, C7, C8, #1
+      SYS_ALIAS(6, 7, 8, 1);
+    } else if (!Op.compare_lower("s1e0r")) {
+      // SYS #0, C7, C8, #3
+      SYS_ALIAS(0, 7, 8, 2);
+    } else if (!Op.compare_lower("s1e0w")) {
+      // SYS #0, C7, C8, #3
+      SYS_ALIAS(0, 7, 8, 3);
+    } else if (!Op.compare_lower("s12e1r")) {
+      // SYS #4, C7, C8, #4
+      SYS_ALIAS(4, 7, 8, 4);
+    } else if (!Op.compare_lower("s12e1w")) {
+      // SYS #4, C7, C8, #5
+      SYS_ALIAS(4, 7, 8, 5);
+    } else if (!Op.compare_lower("s12e0r")) {
+      // SYS #4, C7, C8, #6
+      SYS_ALIAS(4, 7, 8, 6);
+    } else if (!Op.compare_lower("s12e0w")) {
+      // SYS #4, C7, C8, #7
+      SYS_ALIAS(4, 7, 8, 7);
+    } else {
+      return TokError("invalid operand for AT instruction");
+    }
+  } else if (Mnemonic == "tlbi") {
+    if (!Op.compare_lower("vmalle1is")) {
+      // SYS #0, C8, C3, #0
+      SYS_ALIAS(0, 8, 3, 0);
+    } else if (!Op.compare_lower("alle2is")) {
+      // SYS #4, C8, C3, #0
+      SYS_ALIAS(4, 8, 3, 0);
+    } else if (!Op.compare_lower("alle3is")) {
+      // SYS #6, C8, C3, #0
+      SYS_ALIAS(6, 8, 3, 0);
+    } else if (!Op.compare_lower("vae1is")) {
+      // SYS #0, C8, C3, #1
+      SYS_ALIAS(0, 8, 3, 1);
+    } else if (!Op.compare_lower("vae2is")) {
+      // SYS #4, C8, C3, #1
+      SYS_ALIAS(4, 8, 3, 1);
+    } else if (!Op.compare_lower("vae3is")) {
+      // SYS #6, C8, C3, #1
+      SYS_ALIAS(6, 8, 3, 1);
+    } else if (!Op.compare_lower("aside1is")) {
+      // SYS #0, C8, C3, #2
+      SYS_ALIAS(0, 8, 3, 2);
+    } else if (!Op.compare_lower("vaae1is")) {
+      // SYS #0, C8, C3, #3
+      SYS_ALIAS(0, 8, 3, 3);
+    } else if (!Op.compare_lower("alle1is")) {
+      // SYS #4, C8, C3, #4
+      SYS_ALIAS(4, 8, 3, 4);
+    } else if (!Op.compare_lower("vale1is")) {
+      // SYS #0, C8, C3, #5
+      SYS_ALIAS(0, 8, 3, 5);
+    } else if (!Op.compare_lower("vaale1is")) {
+      // SYS #0, C8, C3, #7
+      SYS_ALIAS(0, 8, 3, 7);
+    } else if (!Op.compare_lower("vmalle1")) {
+      // SYS #0, C8, C7, #0
+      SYS_ALIAS(0, 8, 7, 0);
+    } else if (!Op.compare_lower("alle2")) {
+      // SYS #4, C8, C7, #0
+      SYS_ALIAS(4, 8, 7, 0);
+    } else if (!Op.compare_lower("vale2is")) {
+      // SYS #4, C8, C3, #5
+      SYS_ALIAS(4, 8, 3, 5);
+    } else if (!Op.compare_lower("vale3is")) {
+      // SYS #6, C8, C3, #5
+      SYS_ALIAS(6, 8, 3, 5);
+    } else if (!Op.compare_lower("alle3")) {
+      // SYS #6, C8, C7, #0
+      SYS_ALIAS(6, 8, 7, 0);
+    } else if (!Op.compare_lower("vae1")) {
+      // SYS #0, C8, C7, #1
+      SYS_ALIAS(0, 8, 7, 1);
+    } else if (!Op.compare_lower("vae2")) {
+      // SYS #4, C8, C7, #1
+      SYS_ALIAS(4, 8, 7, 1);
+    } else if (!Op.compare_lower("vae3")) {
+      // SYS #6, C8, C7, #1
+      SYS_ALIAS(6, 8, 7, 1);
+    } else if (!Op.compare_lower("aside1")) {
+      // SYS #0, C8, C7, #2
+      SYS_ALIAS(0, 8, 7, 2);
+    } else if (!Op.compare_lower("vaae1")) {
+      // SYS #0, C8, C7, #3
+      SYS_ALIAS(0, 8, 7, 3);
+    } else if (!Op.compare_lower("alle1")) {
+      // SYS #4, C8, C7, #4
+      SYS_ALIAS(4, 8, 7, 4);
+    } else if (!Op.compare_lower("vale1")) {
+      // SYS #0, C8, C7, #5
+      SYS_ALIAS(0, 8, 7, 5);
+    } else if (!Op.compare_lower("vale2")) {
+      // SYS #4, C8, C7, #5
+      SYS_ALIAS(4, 8, 7, 5);
+    } else if (!Op.compare_lower("vale3")) {
+      // SYS #6, C8, C7, #5
+      SYS_ALIAS(6, 8, 7, 5);
+    } else if (!Op.compare_lower("vaale1")) {
+      // SYS #0, C8, C7, #7
+      SYS_ALIAS(0, 8, 7, 7);
+    } else if (!Op.compare_lower("ipas2e1")) {
+      // SYS #4, C8, C4, #1
+      SYS_ALIAS(4, 8, 4, 1);
+    } else if (!Op.compare_lower("ipas2le1")) {
+      // SYS #4, C8, C4, #5
+      SYS_ALIAS(4, 8, 4, 5);
+    } else if (!Op.compare_lower("ipas2e1is")) {
+      // SYS #4, C8, C4, #1
+      SYS_ALIAS(4, 8, 0, 1);
+    } else if (!Op.compare_lower("ipas2le1is")) {
+      // SYS #4, C8, C4, #5
+      SYS_ALIAS(4, 8, 0, 5);
+    } else if (!Op.compare_lower("vmalls12e1")) {
+      // SYS #4, C8, C7, #6
+      SYS_ALIAS(4, 8, 7, 6);
+    } else if (!Op.compare_lower("vmalls12e1is")) {
+      // SYS #4, C8, C3, #6
+      SYS_ALIAS(4, 8, 3, 6);
+    } else {
+      return TokError("invalid operand for TLBI instruction");
+    }
   }
 
-  if (Parser.getTok().isNot(AsmToken::Real)) {
-    if (!Hash)
-      return MatchOperand_NoMatch;
-    Error(S, "Expected floating-point immediate");
-    return MatchOperand_ParseFail;
-  }
+#undef SYS_ALIAS
 
-  APFloat RealVal(APFloat::IEEEdouble, Parser.getTok().getString());
-  if (Negative) RealVal.changeSign();
-  double DblVal = RealVal.convertToDouble();
+  Parser.Lex(); // Eat operand.
 
-  Parser.Lex(); // Eat real number
-  SMLoc E = Parser.getTok().getLoc();
+  bool ExpectRegister = (Op.lower().find("all") == StringRef::npos);
+  bool HasRegister = false;
 
-  Operands.push_back(AArch64Operand::CreateFPImm(DblVal, S, E));
-  return MatchOperand_Success;
-}
+  // Check for the optional register operand.
+  if (getLexer().is(AsmToken::Comma)) {
+    Parser.Lex(); // Eat comma.
 
-AArch64AsmParser::OperandMatchResultTy
-AArch64AsmParser::ParseFPImm0AndImm0Operand(
-                               SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
+    if (Tok.isNot(AsmToken::Identifier) || parseRegister(Operands))
+      return TokError("expected register operand");
 
-  SMLoc S = Parser.getTok().getLoc();
+    HasRegister = true;
+  }
 
-  bool Hash = false;
-  if (Parser.getTok().is(AsmToken::Hash)) {
-    Parser.Lex(); // Eat '#'
-    Hash = true;
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    Parser.eatToEndOfStatement();
+    return TokError("unexpected token in argument list");
+  }
+
+  if (ExpectRegister && !HasRegister) {
+    return TokError("specified " + Mnemonic + " op requires a register");
+  }
+  else if (!ExpectRegister && HasRegister) {
+    return TokError("specified " + Mnemonic + " op does not use a register");
   }
 
-  APFloat RealVal(0.0);
-  if (Parser.getTok().is(AsmToken::Real)) {
-    if(Parser.getTok().getString() != "0.0") {
-      Error(S, "only #0.0 is acceptable as immediate");
+  Parser.Lex(); // Consume the EndOfStatement
+  return false;
+}
+
+AArch64AsmParser::OperandMatchResultTy
+AArch64AsmParser::tryParseBarrierOperand(OperandVector &Operands) {
+  const AsmToken &Tok = Parser.getTok();
+
+  // Can be either a #imm style literal or an option name
+  bool Hash = Tok.is(AsmToken::Hash);
+  if (Hash || Tok.is(AsmToken::Integer)) {
+    // Immediate operand.
+    if (Hash)
+      Parser.Lex(); // Eat the '#'
+    const MCExpr *ImmVal;
+    SMLoc ExprLoc = getLoc();
+    if (getParser().parseExpression(ImmVal))
+      return MatchOperand_ParseFail;
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(ImmVal);
+    if (!MCE) {
+      Error(ExprLoc, "immediate value expected for barrier operand");
       return MatchOperand_ParseFail;
     }
-  }
-  else if (Parser.getTok().is(AsmToken::Integer)) {
-    if(Parser.getTok().getIntVal() != 0) {
-      Error(S, "only #0.0 is acceptable as immediate");
+    if (MCE->getValue() < 0 || MCE->getValue() > 15) {
+      Error(ExprLoc, "barrier operand out of range");
       return MatchOperand_ParseFail;
     }
+    Operands.push_back(
+        AArch64Operand::CreateBarrier(MCE->getValue(), ExprLoc, getContext()));
+    return MatchOperand_Success;
   }
-  else {
-    if (!Hash)
-      return MatchOperand_NoMatch;
-    Error(S, "only #0.0 is acceptable as immediate");
+
+  if (Tok.isNot(AsmToken::Identifier)) {
+    TokError("invalid operand for instruction");
     return MatchOperand_ParseFail;
   }
 
-  Parser.Lex(); // Eat real number
-  SMLoc E = Parser.getTok().getLoc();
+  bool Valid;
+  unsigned Opt = AArch64DB::DBarrierMapper().fromString(Tok.getString(), Valid);
+  if (!Valid) {
+    TokError("invalid barrier option name");
+    return MatchOperand_ParseFail;
+  }
+
+  // The only valid named option for ISB is 'sy'
+  if (Mnemonic == "isb" && Opt != AArch64DB::SY) {
+    TokError("'sy' or #imm operand expected");
+    return MatchOperand_ParseFail;
+  }
+
+  Operands.push_back(
+      AArch64Operand::CreateBarrier(Opt, getLoc(), getContext()));
+  Parser.Lex(); // Consume the option
 
-  Operands.push_back(AArch64Operand::CreateFPImm(0.0, S, E));
   return MatchOperand_Success;
 }
 
-// Automatically generated
-static unsigned MatchRegisterName(StringRef Name);
-
-bool
-AArch64AsmParser::IdentifyRegister(unsigned &RegNum, SMLoc &RegEndLoc,
-                                   StringRef &Layout,
-                                   SMLoc &LayoutLoc) const {
+AArch64AsmParser::OperandMatchResultTy
+AArch64AsmParser::tryParseSysReg(OperandVector &Operands) {
   const AsmToken &Tok = Parser.getTok();
 
   if (Tok.isNot(AsmToken::Identifier))
-    return false;
+    return MatchOperand_NoMatch;
 
-  std::string LowerReg = Tok.getString().lower();
-  size_t DotPos = LowerReg.find('.');
+  Operands.push_back(AArch64Operand::CreateSysReg(Tok.getString(), getLoc(),
+                     STI.getFeatureBits(), getContext()));
+  Parser.Lex(); // Eat identifier
 
-  bool IsVec128 = false;
-  SMLoc S = Tok.getLoc();
-  RegEndLoc = SMLoc::getFromPointer(S.getPointer() + DotPos);
+  return MatchOperand_Success;
+}
 
-  if (DotPos == std::string::npos) {
-    Layout = StringRef();
-  } else {
-    // Everything afterwards needs to be a literal token, expected to be
-    // '.2d','.b' etc for vector registers.
-
-    // This StringSwitch validates the input and (perhaps more importantly)
-    // gives us a permanent string to use in the token (a pointer into LowerReg
-    // would go out of scope when we return).
-    LayoutLoc = SMLoc::getFromPointer(S.getPointer() + DotPos + 1);
-    StringRef LayoutText = StringRef(LowerReg).substr(DotPos);
-
-    // See if it's a 128-bit layout first.
-    Layout = StringSwitch<const char *>(LayoutText)
-      .Case(".q", ".q").Case(".1q", ".1q")
-      .Case(".d", ".d").Case(".2d", ".2d")
-      .Case(".s", ".s").Case(".4s", ".4s")
-      .Case(".h", ".h").Case(".8h", ".8h")
-      .Case(".b", ".b").Case(".16b", ".16b")
-      .Default("");
-
-    if (Layout.size() != 0)
-      IsVec128 = true;
-    else {
-      Layout = StringSwitch<const char *>(LayoutText)
-                   .Case(".1d", ".1d")
-                   .Case(".2s", ".2s")
-                   .Case(".4h", ".4h")
-                   .Case(".8b", ".8b")
-                   .Default("");
+/// tryParseVectorRegister - Parse a vector register operand.
+bool AArch64AsmParser::tryParseVectorRegister(OperandVector &Operands) {
+  if (Parser.getTok().isNot(AsmToken::Identifier))
+    return true;
+
+  SMLoc S = getLoc();
+  // Check for a vector register specifier first.
+  StringRef Kind;
+  int64_t Reg = tryMatchVectorRegister(Kind, false);
+  if (Reg == -1)
+    return true;
+  Operands.push_back(
+      AArch64Operand::CreateReg(Reg, true, S, getLoc(), getContext()));
+  // If there was an explicit qualifier, that goes on as a literal text
+  // operand.
+  if (!Kind.empty())
+    Operands.push_back(
+        AArch64Operand::CreateToken(Kind, false, S, getContext()));
+
+  // If there is an index specifier following the register, parse that too.
+  if (Parser.getTok().is(AsmToken::LBrac)) {
+    SMLoc SIdx = getLoc();
+    Parser.Lex(); // Eat left bracket token.
+
+    const MCExpr *ImmVal;
+    if (getParser().parseExpression(ImmVal))
+      return false;
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(ImmVal);
+    if (!MCE) {
+      TokError("immediate value expected for vector index");
+      return false;
     }
 
-    if (Layout.size() == 0) {
-      // If we've still not pinned it down the register is malformed.
+    SMLoc E = getLoc();
+    if (Parser.getTok().isNot(AsmToken::RBrac)) {
+      Error(E, "']' expected");
       return false;
     }
-  }
 
-  RegNum = MatchRegisterName(LowerReg.substr(0, DotPos));
-  if (RegNum == AArch64::NoRegister) {
-    RegNum = StringSwitch<unsigned>(LowerReg.substr(0, DotPos))
-      .Case("ip0", AArch64::X16)
-      .Case("ip1", AArch64::X17)
-      .Case("fp", AArch64::X29)
-      .Case("lr", AArch64::X30)
-      .Case("v0", IsVec128 ? AArch64::Q0 : AArch64::D0)
-      .Case("v1", IsVec128 ? AArch64::Q1 : AArch64::D1)
-      .Case("v2", IsVec128 ? AArch64::Q2 : AArch64::D2)
-      .Case("v3", IsVec128 ? AArch64::Q3 : AArch64::D3)
-      .Case("v4", IsVec128 ? AArch64::Q4 : AArch64::D4)
-      .Case("v5", IsVec128 ? AArch64::Q5 : AArch64::D5)
-      .Case("v6", IsVec128 ? AArch64::Q6 : AArch64::D6)
-      .Case("v7", IsVec128 ? AArch64::Q7 : AArch64::D7)
-      .Case("v8", IsVec128 ? AArch64::Q8 : AArch64::D8)
-      .Case("v9", IsVec128 ? AArch64::Q9 : AArch64::D9)
-      .Case("v10", IsVec128 ? AArch64::Q10 : AArch64::D10)
-      .Case("v11", IsVec128 ? AArch64::Q11 : AArch64::D11)
-      .Case("v12", IsVec128 ? AArch64::Q12 : AArch64::D12)
-      .Case("v13", IsVec128 ? AArch64::Q13 : AArch64::D13)
-      .Case("v14", IsVec128 ? AArch64::Q14 : AArch64::D14)
-      .Case("v15", IsVec128 ? AArch64::Q15 : AArch64::D15)
-      .Case("v16", IsVec128 ? AArch64::Q16 : AArch64::D16)
-      .Case("v17", IsVec128 ? AArch64::Q17 : AArch64::D17)
-      .Case("v18", IsVec128 ? AArch64::Q18 : AArch64::D18)
-      .Case("v19", IsVec128 ? AArch64::Q19 : AArch64::D19)
-      .Case("v20", IsVec128 ? AArch64::Q20 : AArch64::D20)
-      .Case("v21", IsVec128 ? AArch64::Q21 : AArch64::D21)
-      .Case("v22", IsVec128 ? AArch64::Q22 : AArch64::D22)
-      .Case("v23", IsVec128 ? AArch64::Q23 : AArch64::D23)
-      .Case("v24", IsVec128 ? AArch64::Q24 : AArch64::D24)
-      .Case("v25", IsVec128 ? AArch64::Q25 : AArch64::D25)
-      .Case("v26", IsVec128 ? AArch64::Q26 : AArch64::D26)
-      .Case("v27", IsVec128 ? AArch64::Q27 : AArch64::D27)
-      .Case("v28", IsVec128 ? AArch64::Q28 : AArch64::D28)
-      .Case("v29", IsVec128 ? AArch64::Q29 : AArch64::D29)
-      .Case("v30", IsVec128 ? AArch64::Q30 : AArch64::D30)
-      .Case("v31", IsVec128 ? AArch64::Q31 : AArch64::D31)
-      .Default(AArch64::NoRegister);
-  }
-  if (RegNum == AArch64::NoRegister)
-    return false;
+    Parser.Lex(); // Eat right bracket token.
 
-  return true;
+    Operands.push_back(AArch64Operand::CreateVectorIndex(MCE->getValue(), SIdx,
+                                                         E, getContext()));
+  }
+
+  return false;
 }
 
-AArch64AsmParser::OperandMatchResultTy
-AArch64AsmParser::ParseRegister(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
-                                uint32_t &NumLanes) {
-  unsigned RegNum;
-  StringRef Layout;
-  SMLoc RegEndLoc, LayoutLoc;
-  SMLoc S = Parser.getTok().getLoc();
-
-  if (!IdentifyRegister(RegNum, RegEndLoc, Layout, LayoutLoc))
-    return MatchOperand_NoMatch;
+/// parseRegister - Parse a non-vector register operand.
+bool AArch64AsmParser::parseRegister(OperandVector &Operands) {
+  SMLoc S = getLoc();
+  // Try for a vector register.
+  if (!tryParseVectorRegister(Operands))
+    return false;
 
-  Operands.push_back(AArch64Operand::CreateReg(RegNum, S, RegEndLoc));
+  // Try for a scalar register.
+  int64_t Reg = tryParseRegister();
+  if (Reg == -1)
+    return true;
+  Operands.push_back(
+      AArch64Operand::CreateReg(Reg, false, S, getLoc(), getContext()));
 
-  if (Layout.size() != 0) {
-    unsigned long long TmpLanes = 0;
-    llvm::getAsUnsignedInteger(Layout.substr(1), 10, TmpLanes);
-    if (TmpLanes != 0) {
-      NumLanes = TmpLanes;
-    } else {
-      // If the number of lanes isn't specified explicitly, a valid instruction
-      // will have an element specifier and be capable of acting on the entire
-      // vector register.
-      switch (Layout.back()) {
-      default: llvm_unreachable("Invalid layout specifier");
-      case 'b': NumLanes = 16; break;
-      case 'h': NumLanes = 8; break;
-      case 's': NumLanes = 4; break;
-      case 'd': NumLanes = 2; break;
-      case 'q': NumLanes = 1; break;
+  // A small number of instructions (FMOVXDhighr, for example) have "[1]"
+  // as a string token in the instruction itself.
+  if (getLexer().getKind() == AsmToken::LBrac) {
+    SMLoc LBracS = getLoc();
+    Parser.Lex();
+    const AsmToken &Tok = Parser.getTok();
+    if (Tok.is(AsmToken::Integer)) {
+      SMLoc IntS = getLoc();
+      int64_t Val = Tok.getIntVal();
+      if (Val == 1) {
+        Parser.Lex();
+        if (getLexer().getKind() == AsmToken::RBrac) {
+          SMLoc RBracS = getLoc();
+          Parser.Lex();
+          Operands.push_back(
+              AArch64Operand::CreateToken("[", false, LBracS, getContext()));
+          Operands.push_back(
+              AArch64Operand::CreateToken("1", false, IntS, getContext()));
+          Operands.push_back(
+              AArch64Operand::CreateToken("]", false, RBracS, getContext()));
+          return false;
+        }
       }
     }
-
-    Operands.push_back(AArch64Operand::CreateToken(Layout, LayoutLoc));
   }
 
-  Parser.Lex();
-  return MatchOperand_Success;
-}
-
-bool
-AArch64AsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
-                                SMLoc &EndLoc) {
-  // This callback is used for things like DWARF frame directives in
-  // assembly. They don't care about things like NEON layouts or lanes, they
-  // just want to be able to produce the DWARF register number.
-  StringRef LayoutSpec;
-  SMLoc RegEndLoc, LayoutLoc;
-  StartLoc = Parser.getTok().getLoc();
-
-  if (!IdentifyRegister(RegNo, RegEndLoc, LayoutSpec, LayoutLoc))
-    return true;
-
-  Parser.Lex();
-  EndLoc = Parser.getTok().getLoc();
-
   return false;
 }
 
-AArch64AsmParser::OperandMatchResultTy
-AArch64AsmParser::ParseNamedImmOperand(const NamedImmMapper &Mapper,
-                               SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
-  // Since these operands occur in very limited circumstances, without
-  // alternatives, we actually signal an error if there is no match. If relaxing
-  // this, beware of unintended consequences: an immediate will be accepted
-  // during matching, no matter how it gets into the AArch64Operand.
-  const AsmToken &Tok = Parser.getTok();
-  SMLoc S = Tok.getLoc();
+bool AArch64AsmParser::parseSymbolicImmVal(const MCExpr *&ImmVal) {
+  bool HasELFModifier = false;
+  AArch64MCExpr::VariantKind RefKind;
 
-  if (Tok.is(AsmToken::Identifier)) {
-    bool ValidName;
-    uint32_t Code = Mapper.fromString(Tok.getString().lower(), ValidName);
+  if (Parser.getTok().is(AsmToken::Colon)) {
+    Parser.Lex(); // Eat ':"
+    HasELFModifier = true;
 
-    if (!ValidName) {
-      Error(S, "operand specifier not recognised");
-      return MatchOperand_ParseFail;
+    if (Parser.getTok().isNot(AsmToken::Identifier)) {
+      Error(Parser.getTok().getLoc(),
+            "expect relocation specifier in operand after ':'");
+      return true;
     }
 
-    Parser.Lex(); // We're done with the identifier. Eat it
-
-    SMLoc E = Parser.getTok().getLoc();
-    const MCExpr *Imm = MCConstantExpr::Create(Code, getContext());
-    Operands.push_back(AArch64Operand::CreateImm(Imm, S, E));
-    return MatchOperand_Success;
-  } else if (Tok.is(AsmToken::Hash)) {
-    Parser.Lex();
+    std::string LowerCase = Parser.getTok().getIdentifier().lower();
+    RefKind = StringSwitch<AArch64MCExpr::VariantKind>(LowerCase)
+                  .Case("lo12", AArch64MCExpr::VK_LO12)
+                  .Case("abs_g3", AArch64MCExpr::VK_ABS_G3)
+                  .Case("abs_g2", AArch64MCExpr::VK_ABS_G2)
+                  .Case("abs_g2_s", AArch64MCExpr::VK_ABS_G2_S)
+                  .Case("abs_g2_nc", AArch64MCExpr::VK_ABS_G2_NC)
+                  .Case("abs_g1", AArch64MCExpr::VK_ABS_G1)
+                  .Case("abs_g1_s", AArch64MCExpr::VK_ABS_G1_S)
+                  .Case("abs_g1_nc", AArch64MCExpr::VK_ABS_G1_NC)
+                  .Case("abs_g0", AArch64MCExpr::VK_ABS_G0)
+                  .Case("abs_g0_s", AArch64MCExpr::VK_ABS_G0_S)
+                  .Case("abs_g0_nc", AArch64MCExpr::VK_ABS_G0_NC)
+                  .Case("dtprel_g2", AArch64MCExpr::VK_DTPREL_G2)
+                  .Case("dtprel_g1", AArch64MCExpr::VK_DTPREL_G1)
+                  .Case("dtprel_g1_nc", AArch64MCExpr::VK_DTPREL_G1_NC)
+                  .Case("dtprel_g0", AArch64MCExpr::VK_DTPREL_G0)
+                  .Case("dtprel_g0_nc", AArch64MCExpr::VK_DTPREL_G0_NC)
+                  .Case("dtprel_hi12", AArch64MCExpr::VK_DTPREL_HI12)
+                  .Case("dtprel_lo12", AArch64MCExpr::VK_DTPREL_LO12)
+                  .Case("dtprel_lo12_nc", AArch64MCExpr::VK_DTPREL_LO12_NC)
+                  .Case("tprel_g2", AArch64MCExpr::VK_TPREL_G2)
+                  .Case("tprel_g1", AArch64MCExpr::VK_TPREL_G1)
+                  .Case("tprel_g1_nc", AArch64MCExpr::VK_TPREL_G1_NC)
+                  .Case("tprel_g0", AArch64MCExpr::VK_TPREL_G0)
+                  .Case("tprel_g0_nc", AArch64MCExpr::VK_TPREL_G0_NC)
+                  .Case("tprel_hi12", AArch64MCExpr::VK_TPREL_HI12)
+                  .Case("tprel_lo12", AArch64MCExpr::VK_TPREL_LO12)
+                  .Case("tprel_lo12_nc", AArch64MCExpr::VK_TPREL_LO12_NC)
+                  .Case("tlsdesc_lo12", AArch64MCExpr::VK_TLSDESC_LO12)
+                  .Case("got", AArch64MCExpr::VK_GOT_PAGE)
+                  .Case("got_lo12", AArch64MCExpr::VK_GOT_LO12)
+                  .Case("gottprel", AArch64MCExpr::VK_GOTTPREL_PAGE)
+                  .Case("gottprel_lo12", AArch64MCExpr::VK_GOTTPREL_LO12_NC)
+                  .Case("gottprel_g1", AArch64MCExpr::VK_GOTTPREL_G1)
+                  .Case("gottprel_g0_nc", AArch64MCExpr::VK_GOTTPREL_G0_NC)
+                  .Case("tlsdesc", AArch64MCExpr::VK_TLSDESC_PAGE)
+                  .Default(AArch64MCExpr::VK_INVALID);
+
+    if (RefKind == AArch64MCExpr::VK_INVALID) {
+      Error(Parser.getTok().getLoc(),
+            "expect relocation specifier in operand after ':'");
+      return true;
+    }
 
-    const MCExpr *ImmVal;
-    if (ParseImmediate(ImmVal) != MatchOperand_Success)
-      return MatchOperand_ParseFail;
+    Parser.Lex(); // Eat identifier
 
-    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(ImmVal);
-    if (!CE || CE->getValue() < 0 || !Mapper.validImm(CE->getValue())) {
-      Error(S, "Invalid immediate for instruction");
-      return MatchOperand_ParseFail;
+    if (Parser.getTok().isNot(AsmToken::Colon)) {
+      Error(Parser.getTok().getLoc(), "expect ':' after relocation specifier");
+      return true;
     }
-
-    SMLoc E = Parser.getTok().getLoc();
-    Operands.push_back(AArch64Operand::CreateImm(ImmVal, S, E));
-    return MatchOperand_Success;
+    Parser.Lex(); // Eat ':'
   }
 
-  Error(S, "unexpected operand for instruction");
-  return MatchOperand_ParseFail;
-}
-
-AArch64AsmParser::OperandMatchResultTy
-AArch64AsmParser::ParseSysRegOperand(
-                               SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
-  const AsmToken &Tok = Parser.getTok();
-
-  // Any MSR/MRS operand will be an identifier, and we want to store it as some
-  // kind of string: SPSel is valid for two different forms of MSR with two
-  // different encodings. There's no collision at the moment, but the potential
-  // is there.
-  if (!Tok.is(AsmToken::Identifier)) {
-    return MatchOperand_NoMatch;
-  }
+  if (getParser().parseExpression(ImmVal))
+    return true;
 
-  SMLoc S = Tok.getLoc();
-  Operands.push_back(AArch64Operand::CreateSysReg(Tok.getString(), S));
-  Parser.Lex(); // Eat identifier
+  if (HasELFModifier)
+    ImmVal = AArch64MCExpr::Create(ImmVal, RefKind, getContext());
 
-  return MatchOperand_Success;
+  return false;
 }
 
-AArch64AsmParser::OperandMatchResultTy
-AArch64AsmParser::ParseLSXAddressOperand(
-                               SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
-  SMLoc S = Parser.getTok().getLoc();
-
-  unsigned RegNum;
-  SMLoc RegEndLoc, LayoutLoc;
-  StringRef Layout;
-  if(!IdentifyRegister(RegNum, RegEndLoc, Layout, LayoutLoc)
-     || !AArch64MCRegisterClasses[AArch64::GPR64xspRegClassID].contains(RegNum)
-     || Layout.size() != 0) {
-    // Check Layout.size because we don't want to let "x3.4s" or similar
-    // through.
-    return MatchOperand_NoMatch;
-  }
-  Parser.Lex(); // Eat register
+/// parseVectorList - Parse a vector list operand for AdvSIMD instructions.
+bool AArch64AsmParser::parseVectorList(OperandVector &Operands) {
+  assert(Parser.getTok().is(AsmToken::LCurly) && "Token is not a Left Bracket");
+  SMLoc S = getLoc();
+  Parser.Lex(); // Eat left bracket token.
+  StringRef Kind;
+  int64_t FirstReg = tryMatchVectorRegister(Kind, true);
+  if (FirstReg == -1)
+    return true;
+  int64_t PrevReg = FirstReg;
+  unsigned Count = 1;
 
-  if (Parser.getTok().is(AsmToken::RBrac)) {
-    // We're done
-    SMLoc E = Parser.getTok().getLoc();
-    Operands.push_back(AArch64Operand::CreateWrappedReg(RegNum, S, E));
-    return MatchOperand_Success;
-  }
+  if (Parser.getTok().is(AsmToken::Minus)) {
+    Parser.Lex(); // Eat the minus.
 
-  // Otherwise, only ", #0" is valid
+    SMLoc Loc = getLoc();
+    StringRef NextKind;
+    int64_t Reg = tryMatchVectorRegister(NextKind, true);
+    if (Reg == -1)
+      return true;
+    // Any Kind suffices must match on all regs in the list.
+    if (Kind != NextKind)
+      return Error(Loc, "mismatched register size suffix");
 
-  if (Parser.getTok().isNot(AsmToken::Comma)) {
-    Error(Parser.getTok().getLoc(), "expected ',' or ']' after register");
-    return MatchOperand_ParseFail;
-  }
-  Parser.Lex(); // Eat ','
+    unsigned Space = (PrevReg < Reg) ? (Reg - PrevReg) : (Reg + 32 - PrevReg);
 
-  if (Parser.getTok().isNot(AsmToken::Hash)) {
-    Error(Parser.getTok().getLoc(), "expected '#0'");
-    return MatchOperand_ParseFail;
-  }
-  Parser.Lex(); // Eat '#'
+    if (Space == 0 || Space > 3) {
+      return Error(Loc, "invalid number of vectors");
+    }
 
-  if (Parser.getTok().isNot(AsmToken::Integer)
-      || Parser.getTok().getIntVal() != 0 ) {
-    Error(Parser.getTok().getLoc(), "expected '#0'");
-    return MatchOperand_ParseFail;
+    Count += Space;
   }
-  Parser.Lex(); // Eat '0'
-
-  SMLoc E = Parser.getTok().getLoc();
-  Operands.push_back(AArch64Operand::CreateWrappedReg(RegNum, S, E));
-  return MatchOperand_Success;
-}
+  else {
+    while (Parser.getTok().is(AsmToken::Comma)) {
+      Parser.Lex(); // Eat the comma token.
 
-AArch64AsmParser::OperandMatchResultTy
-AArch64AsmParser::ParseShiftExtend(
-                               SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
-  StringRef IDVal = Parser.getTok().getIdentifier();
-  std::string LowerID = IDVal.lower();
-
-  A64SE::ShiftExtSpecifiers Spec =
-      StringSwitch<A64SE::ShiftExtSpecifiers>(LowerID)
-        .Case("lsl", A64SE::LSL)
-	.Case("msl", A64SE::MSL)
-	.Case("lsr", A64SE::LSR)
-	.Case("asr", A64SE::ASR)
-	.Case("ror", A64SE::ROR)
-	.Case("uxtb", A64SE::UXTB)
-	.Case("uxth", A64SE::UXTH)
-	.Case("uxtw", A64SE::UXTW)
-	.Case("uxtx", A64SE::UXTX)
-	.Case("sxtb", A64SE::SXTB)
-	.Case("sxth", A64SE::SXTH)
-	.Case("sxtw", A64SE::SXTW)
-	.Case("sxtx", A64SE::SXTX)
-	.Default(A64SE::Invalid);
-
-  if (Spec == A64SE::Invalid)
-    return MatchOperand_NoMatch;
+      SMLoc Loc = getLoc();
+      StringRef NextKind;
+      int64_t Reg = tryMatchVectorRegister(NextKind, true);
+      if (Reg == -1)
+        return true;
+      // Any Kind suffices must match on all regs in the list.
+      if (Kind != NextKind)
+        return Error(Loc, "mismatched register size suffix");
 
-  // Eat the shift
-  SMLoc S, E;
-  S = Parser.getTok().getLoc();
-  Parser.Lex();
+      // Registers must be incremental (with wraparound at 31)
+      if (getContext().getRegisterInfo()->getEncodingValue(Reg) !=
+          (getContext().getRegisterInfo()->getEncodingValue(PrevReg) + 1) % 32)
+       return Error(Loc, "registers must be sequential");
 
-  if (Spec != A64SE::LSL && Spec != A64SE::LSR && Spec != A64SE::ASR &&
-      Spec != A64SE::ROR && Spec != A64SE::MSL) {
-    // The shift amount can be omitted for the extending versions, but not real
-    // shifts:
-    //     add x0, x0, x0, uxtb
-    // is valid, and equivalent to
-    //     add x0, x0, x0, uxtb #0
-
-    if (Parser.getTok().is(AsmToken::Comma) ||
-        Parser.getTok().is(AsmToken::EndOfStatement) ||
-        Parser.getTok().is(AsmToken::RBrac)) {
-      Operands.push_back(AArch64Operand::CreateShiftExtend(Spec, 0, true,
-                                                           S, E));
-      return MatchOperand_Success;
+      PrevReg = Reg;
+      ++Count;
     }
   }
 
-  // Eat # at beginning of immediate
-  if (!Parser.getTok().is(AsmToken::Hash)) {
-    Error(Parser.getTok().getLoc(),
-          "expected #imm after shift specifier");
-    return MatchOperand_ParseFail;
-  }
-  Parser.Lex();
-
-  // Make sure we do actually have a number
-  if (!Parser.getTok().is(AsmToken::Integer)) {
-    Error(Parser.getTok().getLoc(),
-          "expected integer shift amount");
-    return MatchOperand_ParseFail;
-  }
-  unsigned Amount = Parser.getTok().getIntVal();
-  Parser.Lex();
-  E = Parser.getTok().getLoc();
+  if (Parser.getTok().isNot(AsmToken::RCurly))
+    return Error(getLoc(), "'}' expected");
+  Parser.Lex(); // Eat the '}' token.
 
-  Operands.push_back(AArch64Operand::CreateShiftExtend(Spec, Amount, false,
-                                                       S, E));
+  if (Count > 4)
+    return Error(S, "invalid number of vectors");
 
-  return MatchOperand_Success;
-}
+  unsigned NumElements = 0;
+  char ElementKind = 0;
+  if (!Kind.empty())
+    parseValidVectorKind(Kind, NumElements, ElementKind);
 
-/// Try to parse a vector register token, If it is a vector register,
-/// the token is eaten and return true. Otherwise return false.
-bool AArch64AsmParser::TryParseVector(uint32_t &RegNum, SMLoc &RegEndLoc,
-                                      StringRef &Layout, SMLoc &LayoutLoc) {
-  bool IsVector = true;
-
-  if (!IdentifyRegister(RegNum, RegEndLoc, Layout, LayoutLoc))
-    IsVector = false;
-  else if (!AArch64MCRegisterClasses[AArch64::FPR64RegClassID]
-                .contains(RegNum) &&
-           !AArch64MCRegisterClasses[AArch64::FPR128RegClassID]
-                .contains(RegNum))
-    IsVector = false;
-  else if (Layout.size() == 0)
-    IsVector = false;
-
-  if (!IsVector)
-    Error(Parser.getTok().getLoc(), "expected vector type register");
-
-  Parser.Lex(); // Eat this token.
-  return IsVector;
-}
+  Operands.push_back(AArch64Operand::CreateVectorList(
+      FirstReg, Count, NumElements, ElementKind, S, getLoc(), getContext()));
 
+  // If there is an index specifier following the list, parse that too.
+  if (Parser.getTok().is(AsmToken::LBrac)) {
+    SMLoc SIdx = getLoc();
+    Parser.Lex(); // Eat left bracket token.
 
-// A vector list contains 1-4 consecutive registers.
-// Now there are two kinds of vector list when number of vector > 1:
-//   (1) {Vn.layout, Vn+1.layout, ... , Vm.layout}
-//   (2) {Vn.layout - Vm.layout}
-// If the layout is like .b/.h/.s/.d, also parse the lane.
-AArch64AsmParser::OperandMatchResultTy AArch64AsmParser::ParseVectorList(
-    SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
-  if (Parser.getTok().isNot(AsmToken::LCurly)) {
-    Error(Parser.getTok().getLoc(), "'{' expected");
-    return MatchOperand_ParseFail;
-  }
-  SMLoc SLoc = Parser.getTok().getLoc();
-  Parser.Lex(); // Eat '{' token.
+    const MCExpr *ImmVal;
+    if (getParser().parseExpression(ImmVal))
+      return false;
+    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(ImmVal);
+    if (!MCE) {
+      TokError("immediate value expected for vector index");
+      return false;
+    }
 
-  unsigned Reg, Count = 1;
-  StringRef LayoutStr;
-  SMLoc RegEndLoc, LayoutLoc;
-  if (!TryParseVector(Reg, RegEndLoc, LayoutStr, LayoutLoc))
-    return MatchOperand_ParseFail;
+    SMLoc E = getLoc();
+    if (Parser.getTok().isNot(AsmToken::RBrac)) {
+      Error(E, "']' expected");
+      return false;
+    }
 
-  if (Parser.getTok().is(AsmToken::Minus)) {
-    Parser.Lex(); // Eat the minus.
+    Parser.Lex(); // Eat right bracket token.
 
-    unsigned Reg2;
-    StringRef LayoutStr2;
-    SMLoc RegEndLoc2, LayoutLoc2;
-    SMLoc RegLoc2 = Parser.getTok().getLoc();
+    Operands.push_back(AArch64Operand::CreateVectorIndex(MCE->getValue(), SIdx,
+                                                         E, getContext()));
+  }
+  return false;
+}
 
-    if (!TryParseVector(Reg2, RegEndLoc2, LayoutStr2, LayoutLoc2))
-      return MatchOperand_ParseFail;
-    unsigned Space = (Reg < Reg2) ? (Reg2 - Reg) : (Reg2 + 32 - Reg);
+AArch64AsmParser::OperandMatchResultTy
+AArch64AsmParser::tryParseGPR64sp0Operand(OperandVector &Operands) {
+  const AsmToken &Tok = Parser.getTok();
+  if (!Tok.is(AsmToken::Identifier))
+    return MatchOperand_NoMatch;
 
-    if (LayoutStr != LayoutStr2) {
-      Error(LayoutLoc2, "expected the same vector layout");
-      return MatchOperand_ParseFail;
-    }
-    if (Space == 0 || Space > 3) {
-      Error(RegLoc2, "invalid number of vectors");
-      return MatchOperand_ParseFail;
-    }
+  unsigned RegNum = MatchRegisterName(Tok.getString().lower());
 
-    Count += Space;
-  } else {
-    unsigned LastReg = Reg;
-    while (Parser.getTok().is(AsmToken::Comma)) {
-      Parser.Lex(); // Eat the comma.
-      unsigned Reg2;
-      StringRef LayoutStr2;
-      SMLoc RegEndLoc2, LayoutLoc2;
-      SMLoc RegLoc2 = Parser.getTok().getLoc();
+  MCContext &Ctx = getContext();
+  const MCRegisterInfo *RI = Ctx.getRegisterInfo();
+  if (!RI->getRegClass(AArch64::GPR64spRegClassID).contains(RegNum))
+    return MatchOperand_NoMatch;
 
-      if (!TryParseVector(Reg2, RegEndLoc2, LayoutStr2, LayoutLoc2))
-        return MatchOperand_ParseFail;
-      unsigned Space = (LastReg < Reg2) ? (Reg2 - LastReg)
-                                        : (Reg2 + 32 - LastReg);
-      Count++;
-
-      // The space between two vectors should be 1. And they should have the same layout.
-      // Total count shouldn't be great than 4
-      if (Space != 1) {
-        Error(RegLoc2, "invalid space between two vectors");
-        return MatchOperand_ParseFail;
-      }
-      if (LayoutStr != LayoutStr2) {
-        Error(LayoutLoc2, "expected the same vector layout");
-        return MatchOperand_ParseFail;
-      }
-      if (Count > 4) {
-        Error(RegLoc2, "invalid number of vectors");
-        return MatchOperand_ParseFail;
-      }
+  SMLoc S = getLoc();
+  Parser.Lex(); // Eat register
 
-      LastReg = Reg2;
-    }
+  if (Parser.getTok().isNot(AsmToken::Comma)) {
+    Operands.push_back(
+        AArch64Operand::CreateReg(RegNum, false, S, getLoc(), Ctx));
+    return MatchOperand_Success;
   }
+  Parser.Lex(); // Eat comma.
 
-  if (Parser.getTok().isNot(AsmToken::RCurly)) {
-    Error(Parser.getTok().getLoc(), "'}' expected");
+  if (Parser.getTok().is(AsmToken::Hash))
+    Parser.Lex(); // Eat hash
+
+  if (Parser.getTok().isNot(AsmToken::Integer)) {
+    Error(getLoc(), "index must be absent or #0");
     return MatchOperand_ParseFail;
   }
-  SMLoc ELoc = Parser.getTok().getLoc();
-  Parser.Lex(); // Eat '}' token.
 
-  A64Layout::VectorLayout Layout = A64StringToVectorLayout(LayoutStr);
-  if (Count > 1) { // If count > 1, create vector list using super register.
-    bool IsVec64 = (Layout < A64Layout::VL_16B);
-    static unsigned SupRegIDs[3][2] = {
-      { AArch64::QPairRegClassID, AArch64::DPairRegClassID },
-      { AArch64::QTripleRegClassID, AArch64::DTripleRegClassID },
-      { AArch64::QQuadRegClassID, AArch64::DQuadRegClassID }
-    };
-    unsigned SupRegID = SupRegIDs[Count - 2][static_cast<int>(IsVec64)];
-    unsigned Sub0 = IsVec64 ? AArch64::dsub_0 : AArch64::qsub_0;
-    const MCRegisterInfo *MRI = getContext().getRegisterInfo();
-    Reg = MRI->getMatchingSuperReg(Reg, Sub0,
-                                   &AArch64MCRegisterClasses[SupRegID]);
+  const MCExpr *ImmVal;
+  if (Parser.parseExpression(ImmVal) || !isa<MCConstantExpr>(ImmVal) ||
+      cast<MCConstantExpr>(ImmVal)->getValue() != 0) {
+    Error(getLoc(), "index must be absent or #0");
+    return MatchOperand_ParseFail;
   }
-  Operands.push_back(
-      AArch64Operand::CreateVectorList(Reg, Count, Layout, SLoc, ELoc));
 
-  if (Parser.getTok().is(AsmToken::LBrac)) {
-    uint32_t NumLanes = 0;
-    switch(Layout) {
-    case A64Layout::VL_B : NumLanes = 16; break;
-    case A64Layout::VL_H : NumLanes = 8; break;
-    case A64Layout::VL_S : NumLanes = 4; break;
-    case A64Layout::VL_D : NumLanes = 2; break;
-    default:
-      SMLoc Loc = getLexer().getLoc();
-      Error(Loc, "expected comma before next operand");
-      return MatchOperand_ParseFail;
-    }
-    return ParseNEONLane(Operands, NumLanes);
-  } else {
-    return MatchOperand_Success;
-  }
+  Operands.push_back(
+      AArch64Operand::CreateReg(RegNum, false, S, getLoc(), Ctx));
+  return MatchOperand_Success;
 }
 
-// FIXME: We would really like to be able to tablegen'erate this.
-bool AArch64AsmParser::
-validateInstruction(MCInst &Inst,
-                    const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
-  switch (Inst.getOpcode()) {
-  case AArch64::BFIwwii:
-  case AArch64::BFIxxii:
-  case AArch64::SBFIZwwii:
-  case AArch64::SBFIZxxii:
-  case AArch64::UBFIZwwii:
-  case AArch64::UBFIZxxii:  {
-    unsigned ImmOps = Inst.getNumOperands() - 2;
-    int64_t ImmR = Inst.getOperand(ImmOps).getImm();
-    int64_t ImmS = Inst.getOperand(ImmOps+1).getImm();
-
-    if (ImmR != 0 && ImmS >= ImmR) {
-      return Error(Operands[4]->getStartLoc(),
-                   "requested insert overflows register");
-    }
+/// parseOperand - Parse a arm instruction operand.  For now this parses the
+/// operand regardless of the mnemonic.
+bool AArch64AsmParser::parseOperand(OperandVector &Operands, bool isCondCode,
+                                  bool invertCondCode) {
+  // Check if the current operand has a custom associated parser, if so, try to
+  // custom parse the operand, or fallback to the general approach.
+  OperandMatchResultTy ResTy = MatchOperandParserImpl(Operands, Mnemonic);
+  if (ResTy == MatchOperand_Success)
     return false;
-  }
-  case AArch64::BFXILwwii:
-  case AArch64::BFXILxxii:
-  case AArch64::SBFXwwii:
-  case AArch64::SBFXxxii:
-  case AArch64::UBFXwwii:
-  case AArch64::UBFXxxii: {
-    unsigned ImmOps = Inst.getNumOperands() - 2;
-    int64_t ImmR = Inst.getOperand(ImmOps).getImm();
-    int64_t ImmS = Inst.getOperand(ImmOps+1).getImm();
-    int64_t RegWidth = 0;
-    switch (Inst.getOpcode()) {
-    case AArch64::SBFXxxii: case AArch64::UBFXxxii: case AArch64::BFXILxxii:
-      RegWidth = 64;
-      break;
-    case AArch64::SBFXwwii: case AArch64::UBFXwwii: case AArch64::BFXILwwii:
-      RegWidth = 32;
-      break;
-    }
+  // If there wasn't a custom match, try the generic matcher below. Otherwise,
+  // there was a match, but an error occurred, in which case, just return that
+  // the operand parsing failed.
+  if (ResTy == MatchOperand_ParseFail)
+    return true;
 
-    if (ImmS >= RegWidth || ImmS < ImmR) {
-      return Error(Operands[4]->getStartLoc(),
-                   "requested extract overflows register");
-    }
+  // Nothing custom, so do general case parsing.
+  SMLoc S, E;
+  switch (getLexer().getKind()) {
+  default: {
+    SMLoc S = getLoc();
+    const MCExpr *Expr;
+    if (parseSymbolicImmVal(Expr))
+      return Error(S, "invalid operand");
+
+    SMLoc E = SMLoc::getFromPointer(getLoc().getPointer() - 1);
+    Operands.push_back(AArch64Operand::CreateImm(Expr, S, E, getContext()));
     return false;
   }
-  case AArch64::ICix: {
-    int64_t ImmVal = Inst.getOperand(0).getImm();
-    A64IC::ICValues ICOp = static_cast<A64IC::ICValues>(ImmVal);
-    if (!A64IC::NeedsRegister(ICOp)) {
-      return Error(Operands[1]->getStartLoc(),
-                   "specified IC op does not use a register");
-    }
-    return false;
+  case AsmToken::LBrac: {
+    SMLoc Loc = Parser.getTok().getLoc();
+    Operands.push_back(AArch64Operand::CreateToken("[", false, Loc,
+                                                   getContext()));
+    Parser.Lex(); // Eat '['
+
+    // There's no comma after a '[', so we can parse the next operand
+    // immediately.
+    return parseOperand(Operands, false, false);
   }
-  case AArch64::ICi: {
-    int64_t ImmVal = Inst.getOperand(0).getImm();
-    A64IC::ICValues ICOp = static_cast<A64IC::ICValues>(ImmVal);
-    if (A64IC::NeedsRegister(ICOp)) {
-      return Error(Operands[1]->getStartLoc(),
-                   "specified IC op requires a register");
-    }
+  case AsmToken::LCurly:
+    return parseVectorList(Operands);
+  case AsmToken::Identifier: {
+    // If we're expecting a Condition Code operand, then just parse that.
+    if (isCondCode)
+      return parseCondCode(Operands, invertCondCode);
+
+    // If it's a register name, parse it.
+    if (!parseRegister(Operands))
+      return false;
+
+    // This could be an optional "shift" or "extend" operand.
+    OperandMatchResultTy GotShift = tryParseOptionalShiftExtend(Operands);
+    // We can only continue if no tokens were eaten.
+    if (GotShift != MatchOperand_NoMatch)
+      return GotShift;
+
+    // This was not a register so parse other operands that start with an
+    // identifier (like labels) as expressions and create them as immediates.
+    const MCExpr *IdVal;
+    S = getLoc();
+    if (getParser().parseExpression(IdVal))
+      return true;
+
+    E = SMLoc::getFromPointer(getLoc().getPointer() - 1);
+    Operands.push_back(AArch64Operand::CreateImm(IdVal, S, E, getContext()));
     return false;
   }
-  case AArch64::TLBIix: {
-    int64_t ImmVal = Inst.getOperand(0).getImm();
-    A64TLBI::TLBIValues TLBIOp = static_cast<A64TLBI::TLBIValues>(ImmVal);
-    if (!A64TLBI::NeedsRegister(TLBIOp)) {
-      return Error(Operands[1]->getStartLoc(),
-                   "specified TLBI op does not use a register");
+  case AsmToken::Integer:
+  case AsmToken::Real:
+  case AsmToken::Hash: {
+    // #42 -> immediate.
+    S = getLoc();
+    if (getLexer().is(AsmToken::Hash))
+      Parser.Lex();
+
+    // Parse a negative sign
+    bool isNegative = false;
+    if (Parser.getTok().is(AsmToken::Minus)) {
+      isNegative = true;
+      // We need to consume this token only when we have a Real, otherwise
+      // we let parseSymbolicImmVal take care of it
+      if (Parser.getLexer().peekTok().is(AsmToken::Real))
+        Parser.Lex();
     }
-    return false;
-  }
-  case AArch64::TLBIi: {
-    int64_t ImmVal = Inst.getOperand(0).getImm();
-    A64TLBI::TLBIValues TLBIOp = static_cast<A64TLBI::TLBIValues>(ImmVal);
-    if (A64TLBI::NeedsRegister(TLBIOp)) {
-      return Error(Operands[1]->getStartLoc(),
-                   "specified TLBI op requires a register");
+
+    // The only Real that should come through here is a literal #0.0 for
+    // the fcmp[e] r, #0.0 instructions. They expect raw token operands,
+    // so convert the value.
+    const AsmToken &Tok = Parser.getTok();
+    if (Tok.is(AsmToken::Real)) {
+      APFloat RealVal(APFloat::IEEEdouble, Tok.getString());
+      uint64_t IntVal = RealVal.bitcastToAPInt().getZExtValue();
+      if (Mnemonic != "fcmp" && Mnemonic != "fcmpe" && Mnemonic != "fcmeq" &&
+          Mnemonic != "fcmge" && Mnemonic != "fcmgt" && Mnemonic != "fcmle" &&
+          Mnemonic != "fcmlt")
+        return TokError("unexpected floating point literal");
+      else if (IntVal != 0 || isNegative)
+        return TokError("expected floating-point constant #0.0");
+      Parser.Lex(); // Eat the token.
+
+      Operands.push_back(
+          AArch64Operand::CreateToken("#0", false, S, getContext()));
+      Operands.push_back(
+          AArch64Operand::CreateToken(".0", false, S, getContext()));
+      return false;
     }
+
+    const MCExpr *ImmVal;
+    if (parseSymbolicImmVal(ImmVal))
+      return true;
+
+    E = SMLoc::getFromPointer(getLoc().getPointer() - 1);
+    Operands.push_back(AArch64Operand::CreateImm(ImmVal, S, E, getContext()));
     return false;
   }
   }
-
-  return false;
 }
 
-
-// Parses the instruction *together with* all operands, appending each parsed
-// operand to the "Operands" list
+/// ParseInstruction - Parse an AArch64 instruction mnemonic followed by its
+/// operands.
 bool AArch64AsmParser::ParseInstruction(ParseInstructionInfo &Info,
                                         StringRef Name, SMLoc NameLoc,
-                               SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
-  StringRef PatchedName = StringSwitch<StringRef>(Name.lower())
-    .Case("beq", "b.eq")
-    .Case("bne", "b.ne")
-    .Case("bhs", "b.hs")
-    .Case("bcs", "b.cs")
-    .Case("blo", "b.lo")
-    .Case("bcc", "b.cc")
-    .Case("bmi", "b.mi")
-    .Case("bpl", "b.pl")
-    .Case("bvs", "b.vs")
-    .Case("bvc", "b.vc")
-    .Case("bhi", "b.hi")
-    .Case("bls", "b.ls")
-    .Case("bge", "b.ge")
-    .Case("blt", "b.lt")
-    .Case("bgt", "b.gt")
-    .Case("ble", "b.le")
-    .Case("bal", "b.al")
-    .Case("bnv", "b.nv")
-    .Default(Name);
-
-  size_t CondCodePos = PatchedName.find('.');
-
-  StringRef Mnemonic = PatchedName.substr(0, CondCodePos);
-  Operands.push_back(AArch64Operand::CreateToken(Mnemonic, NameLoc));
-
-  if (CondCodePos != StringRef::npos) {
-    // We have a condition code
-    SMLoc S = SMLoc::getFromPointer(NameLoc.getPointer() + CondCodePos + 1);
-    StringRef CondStr = PatchedName.substr(CondCodePos + 1, StringRef::npos);
-    A64CC::CondCodes Code;
-
-    Code = A64StringToCondCode(CondStr);
-
-    if (Code == A64CC::Invalid) {
-      Error(S, "invalid condition code");
+                                        OperandVector &Operands) {
+  Name = StringSwitch<StringRef>(Name.lower())
+             .Case("beq", "b.eq")
+             .Case("bne", "b.ne")
+             .Case("bhs", "b.hs")
+             .Case("bcs", "b.cs")
+             .Case("blo", "b.lo")
+             .Case("bcc", "b.cc")
+             .Case("bmi", "b.mi")
+             .Case("bpl", "b.pl")
+             .Case("bvs", "b.vs")
+             .Case("bvc", "b.vc")
+             .Case("bhi", "b.hi")
+             .Case("bls", "b.ls")
+             .Case("bge", "b.ge")
+             .Case("blt", "b.lt")
+             .Case("bgt", "b.gt")
+             .Case("ble", "b.le")
+             .Case("bal", "b.al")
+             .Case("bnv", "b.nv")
+             .Default(Name);
+
+  // Create the leading tokens for the mnemonic, split by '.' characters.
+  size_t Start = 0, Next = Name.find('.');
+  StringRef Head = Name.slice(Start, Next);
+
+  // IC, DC, AT, and TLBI instructions are aliases for the SYS instruction.
+  if (Head == "ic" || Head == "dc" || Head == "at" || Head == "tlbi") {
+    bool IsError = parseSysAlias(Head, NameLoc, Operands);
+    if (IsError && getLexer().isNot(AsmToken::EndOfStatement))
       Parser.eatToEndOfStatement();
-      return true;
-    }
-
-    SMLoc DotL = SMLoc::getFromPointer(NameLoc.getPointer() + CondCodePos);
-
-    Operands.push_back(AArch64Operand::CreateToken(".",  DotL));
-    SMLoc E = SMLoc::getFromPointer(NameLoc.getPointer() + CondCodePos + 3);
-    Operands.push_back(AArch64Operand::CreateCondCode(Code, S, E));
+    return IsError;
   }
 
-  // Now we parse the operands of this instruction
+  Operands.push_back(
+      AArch64Operand::CreateToken(Head, false, NameLoc, getContext()));
+  Mnemonic = Head;
+
+  // Handle condition codes for a branch mnemonic
+  if (Head == "b" && Next != StringRef::npos) {
+    Start = Next;
+    Next = Name.find('.', Start + 1);
+    Head = Name.slice(Start + 1, Next);
+
+    SMLoc SuffixLoc = SMLoc::getFromPointer(NameLoc.getPointer() +
+                                            (Head.data() - Name.data()));
+    AArch64CC::CondCode CC = parseCondCodeString(Head);
+    if (CC == AArch64CC::Invalid)
+      return Error(SuffixLoc, "invalid condition code");
+    Operands.push_back(
+        AArch64Operand::CreateToken(".", true, SuffixLoc, getContext()));
+    Operands.push_back(
+        AArch64Operand::CreateCondCode(CC, NameLoc, NameLoc, getContext()));
+  }
+
+  // Add the remaining tokens in the mnemonic.
+  while (Next != StringRef::npos) {
+    Start = Next;
+    Next = Name.find('.', Start + 1);
+    Head = Name.slice(Start, Next);
+    SMLoc SuffixLoc = SMLoc::getFromPointer(NameLoc.getPointer() +
+                                            (Head.data() - Name.data()) + 1);
+    Operands.push_back(
+        AArch64Operand::CreateToken(Head, true, SuffixLoc, getContext()));
+  }
+
+  // Conditional compare instructions have a Condition Code operand, which needs
+  // to be parsed and an immediate operand created.
+  bool condCodeFourthOperand =
+      (Head == "ccmp" || Head == "ccmn" || Head == "fccmp" ||
+       Head == "fccmpe" || Head == "fcsel" || Head == "csel" ||
+       Head == "csinc" || Head == "csinv" || Head == "csneg");
+
+  // These instructions are aliases to some of the conditional select
+  // instructions. However, the condition code is inverted in the aliased
+  // instruction.
+  //
+  // FIXME: Is this the correct way to handle these? Or should the parser
+  //        generate the aliased instructions directly?
+  bool condCodeSecondOperand = (Head == "cset" || Head == "csetm");
+  bool condCodeThirdOperand =
+      (Head == "cinc" || Head == "cinv" || Head == "cneg");
+
+  // Read the remaining operands.
   if (getLexer().isNot(AsmToken::EndOfStatement)) {
     // Read the first operand.
-    if (ParseOperand(Operands, Mnemonic)) {
+    if (parseOperand(Operands, false, false)) {
       Parser.eatToEndOfStatement();
       return true;
     }
 
+    unsigned N = 2;
     while (getLexer().is(AsmToken::Comma)) {
-      Parser.Lex();  // Eat the comma.
+      Parser.Lex(); // Eat the comma.
 
       // Parse and remember the operand.
-      if (ParseOperand(Operands, Mnemonic)) {
+      if (parseOperand(Operands, (N == 4 && condCodeFourthOperand) ||
+                                     (N == 3 && condCodeThirdOperand) ||
+                                     (N == 2 && condCodeSecondOperand),
+                       condCodeSecondOperand || condCodeThirdOperand)) {
         Parser.eatToEndOfStatement();
         return true;
       }
 
-
       // After successfully parsing some operands there are two special cases to
       // consider (i.e. notional operands not separated by commas). Both are due
       // to memory specifiers:
@@ -2321,52 +3121,716 @@ bool AArch64AsmParser::ParseInstruction(ParseInstructionInfo &Info,
       // in the given context!
       if (Parser.getTok().is(AsmToken::RBrac)) {
         SMLoc Loc = Parser.getTok().getLoc();
-        Operands.push_back(AArch64Operand::CreateToken("]", Loc));
+        Operands.push_back(AArch64Operand::CreateToken("]", false, Loc,
+                                                       getContext()));
         Parser.Lex();
       }
 
       if (Parser.getTok().is(AsmToken::Exclaim)) {
         SMLoc Loc = Parser.getTok().getLoc();
-        Operands.push_back(AArch64Operand::CreateToken("!", Loc));
+        Operands.push_back(AArch64Operand::CreateToken("!", false, Loc,
+                                                       getContext()));
         Parser.Lex();
       }
+
+      ++N;
     }
   }
 
   if (getLexer().isNot(AsmToken::EndOfStatement)) {
-    SMLoc Loc = getLexer().getLoc();
+    SMLoc Loc = Parser.getTok().getLoc();
     Parser.eatToEndOfStatement();
-    return Error(Loc, "expected comma before next operand");
+    return Error(Loc, "unexpected token in argument list");
   }
 
-  // Eat the EndOfStatement
-  Parser.Lex();
-
+  Parser.Lex(); // Consume the EndOfStatement
   return false;
 }
 
+// FIXME: This entire function is a giant hack to provide us with decent
+// operand range validation/diagnostics until TableGen/MC can be extended
+// to support autogeneration of this kind of validation.
+bool AArch64AsmParser::validateInstruction(MCInst &Inst,
+                                         SmallVectorImpl<SMLoc> &Loc) {
+  const MCRegisterInfo *RI = getContext().getRegisterInfo();
+  // Check for indexed addressing modes w/ the base register being the
+  // same as a destination/source register or pair load where
+  // the Rt == Rt2. All of those are undefined behaviour.
+  switch (Inst.getOpcode()) {
+  case AArch64::LDPSWpre:
+  case AArch64::LDPWpost:
+  case AArch64::LDPWpre:
+  case AArch64::LDPXpost:
+  case AArch64::LDPXpre: {
+    unsigned Rt = Inst.getOperand(1).getReg();
+    unsigned Rt2 = Inst.getOperand(2).getReg();
+    unsigned Rn = Inst.getOperand(3).getReg();
+    if (RI->isSubRegisterEq(Rn, Rt))
+      return Error(Loc[0], "unpredictable LDP instruction, writeback base "
+                           "is also a destination");
+    if (RI->isSubRegisterEq(Rn, Rt2))
+      return Error(Loc[1], "unpredictable LDP instruction, writeback base "
+                           "is also a destination");
+    // FALLTHROUGH
+  }
+  case AArch64::LDPDi:
+  case AArch64::LDPQi:
+  case AArch64::LDPSi:
+  case AArch64::LDPSWi:
+  case AArch64::LDPWi:
+  case AArch64::LDPXi: {
+    unsigned Rt = Inst.getOperand(0).getReg();
+    unsigned Rt2 = Inst.getOperand(1).getReg();
+    if (Rt == Rt2)
+      return Error(Loc[1], "unpredictable LDP instruction, Rt2==Rt");
+    break;
+  }
+  case AArch64::LDPDpost:
+  case AArch64::LDPDpre:
+  case AArch64::LDPQpost:
+  case AArch64::LDPQpre:
+  case AArch64::LDPSpost:
+  case AArch64::LDPSpre:
+  case AArch64::LDPSWpost: {
+    unsigned Rt = Inst.getOperand(1).getReg();
+    unsigned Rt2 = Inst.getOperand(2).getReg();
+    if (Rt == Rt2)
+      return Error(Loc[1], "unpredictable LDP instruction, Rt2==Rt");
+    break;
+  }
+  case AArch64::STPDpost:
+  case AArch64::STPDpre:
+  case AArch64::STPQpost:
+  case AArch64::STPQpre:
+  case AArch64::STPSpost:
+  case AArch64::STPSpre:
+  case AArch64::STPWpost:
+  case AArch64::STPWpre:
+  case AArch64::STPXpost:
+  case AArch64::STPXpre: {
+    unsigned Rt = Inst.getOperand(1).getReg();
+    unsigned Rt2 = Inst.getOperand(2).getReg();
+    unsigned Rn = Inst.getOperand(3).getReg();
+    if (RI->isSubRegisterEq(Rn, Rt))
+      return Error(Loc[0], "unpredictable STP instruction, writeback base "
+                           "is also a source");
+    if (RI->isSubRegisterEq(Rn, Rt2))
+      return Error(Loc[1], "unpredictable STP instruction, writeback base "
+                           "is also a source");
+    break;
+  }
+  case AArch64::LDRBBpre:
+  case AArch64::LDRBpre:
+  case AArch64::LDRHHpre:
+  case AArch64::LDRHpre:
+  case AArch64::LDRSBWpre:
+  case AArch64::LDRSBXpre:
+  case AArch64::LDRSHWpre:
+  case AArch64::LDRSHXpre:
+  case AArch64::LDRSWpre:
+  case AArch64::LDRWpre:
+  case AArch64::LDRXpre:
+  case AArch64::LDRBBpost:
+  case AArch64::LDRBpost:
+  case AArch64::LDRHHpost:
+  case AArch64::LDRHpost:
+  case AArch64::LDRSBWpost:
+  case AArch64::LDRSBXpost:
+  case AArch64::LDRSHWpost:
+  case AArch64::LDRSHXpost:
+  case AArch64::LDRSWpost:
+  case AArch64::LDRWpost:
+  case AArch64::LDRXpost: {
+    unsigned Rt = Inst.getOperand(1).getReg();
+    unsigned Rn = Inst.getOperand(2).getReg();
+    if (RI->isSubRegisterEq(Rn, Rt))
+      return Error(Loc[0], "unpredictable LDR instruction, writeback base "
+                           "is also a source");
+    break;
+  }
+  case AArch64::STRBBpost:
+  case AArch64::STRBpost:
+  case AArch64::STRHHpost:
+  case AArch64::STRHpost:
+  case AArch64::STRWpost:
+  case AArch64::STRXpost:
+  case AArch64::STRBBpre:
+  case AArch64::STRBpre:
+  case AArch64::STRHHpre:
+  case AArch64::STRHpre:
+  case AArch64::STRWpre:
+  case AArch64::STRXpre: {
+    unsigned Rt = Inst.getOperand(1).getReg();
+    unsigned Rn = Inst.getOperand(2).getReg();
+    if (RI->isSubRegisterEq(Rn, Rt))
+      return Error(Loc[0], "unpredictable STR instruction, writeback base "
+                           "is also a source");
+    break;
+  }
+  }
+
+  // Now check immediate ranges. Separate from the above as there is overlap
+  // in the instructions being checked and this keeps the nested conditionals
+  // to a minimum.
+  switch (Inst.getOpcode()) {
+  case AArch64::ADDSWri:
+  case AArch64::ADDSXri:
+  case AArch64::ADDWri:
+  case AArch64::ADDXri:
+  case AArch64::SUBSWri:
+  case AArch64::SUBSXri:
+  case AArch64::SUBWri:
+  case AArch64::SUBXri: {
+    // Annoyingly we can't do this in the isAddSubImm predicate, so there is
+    // some slight duplication here.
+    if (Inst.getOperand(2).isExpr()) {
+      const MCExpr *Expr = Inst.getOperand(2).getExpr();
+      AArch64MCExpr::VariantKind ELFRefKind;
+      MCSymbolRefExpr::VariantKind DarwinRefKind;
+      int64_t Addend;
+      if (!classifySymbolRef(Expr, ELFRefKind, DarwinRefKind, Addend)) {
+        return Error(Loc[2], "invalid immediate expression");
+      }
+
+      // Only allow these with ADDXri.
+      if ((DarwinRefKind == MCSymbolRefExpr::VK_PAGEOFF ||
+          DarwinRefKind == MCSymbolRefExpr::VK_TLVPPAGEOFF) &&
+          Inst.getOpcode() == AArch64::ADDXri)
+        return false;
+
+      // Only allow these with ADDXri/ADDWri
+      if ((ELFRefKind == AArch64MCExpr::VK_LO12 ||
+          ELFRefKind == AArch64MCExpr::VK_DTPREL_HI12 ||
+          ELFRefKind == AArch64MCExpr::VK_DTPREL_LO12 ||
+          ELFRefKind == AArch64MCExpr::VK_DTPREL_LO12_NC ||
+          ELFRefKind == AArch64MCExpr::VK_TPREL_HI12 ||
+          ELFRefKind == AArch64MCExpr::VK_TPREL_LO12 ||
+          ELFRefKind == AArch64MCExpr::VK_TPREL_LO12_NC ||
+          ELFRefKind == AArch64MCExpr::VK_TLSDESC_LO12) &&
+          (Inst.getOpcode() == AArch64::ADDXri ||
+          Inst.getOpcode() == AArch64::ADDWri))
+        return false;
+
+      // Don't allow expressions in the immediate field otherwise
+      return Error(Loc[2], "invalid immediate expression");
+    }
+    return false;
+  }
+  default:
+    return false;
+  }
+}
+
+bool AArch64AsmParser::showMatchError(SMLoc Loc, unsigned ErrCode) {
+  switch (ErrCode) {
+  case Match_MissingFeature:
+    return Error(Loc,
+                 "instruction requires a CPU feature not currently enabled");
+  case Match_InvalidOperand:
+    return Error(Loc, "invalid operand for instruction");
+  case Match_InvalidSuffix:
+    return Error(Loc, "invalid type suffix for instruction");
+  case Match_InvalidCondCode:
+    return Error(Loc, "expected AArch64 condition code");
+  case Match_AddSubRegExtendSmall:
+    return Error(Loc,
+      "expected '[su]xt[bhw]' or 'lsl' with optional integer in range [0, 4]");
+  case Match_AddSubRegExtendLarge:
+    return Error(Loc,
+      "expected 'sxtx' 'uxtx' or 'lsl' with optional integer in range [0, 4]");
+  case Match_AddSubSecondSource:
+    return Error(Loc,
+      "expected compatible register, symbol or integer in range [0, 4095]");
+  case Match_LogicalSecondSource:
+    return Error(Loc, "expected compatible register or logical immediate");
+  case Match_InvalidMovImm32Shift:
+    return Error(Loc, "expected 'lsl' with optional integer 0 or 16");
+  case Match_InvalidMovImm64Shift:
+    return Error(Loc, "expected 'lsl' with optional integer 0, 16, 32 or 48");
+  case Match_AddSubRegShift32:
+    return Error(Loc,
+       "expected 'lsl', 'lsr' or 'asr' with optional integer in range [0, 31]");
+  case Match_AddSubRegShift64:
+    return Error(Loc,
+       "expected 'lsl', 'lsr' or 'asr' with optional integer in range [0, 63]");
+  case Match_InvalidFPImm:
+    return Error(Loc,
+                 "expected compatible register or floating-point constant");
+  case Match_InvalidMemoryIndexedSImm9:
+    return Error(Loc, "index must be an integer in range [-256, 255].");
+  case Match_InvalidMemoryIndexed4SImm7:
+    return Error(Loc, "index must be a multiple of 4 in range [-256, 252].");
+  case Match_InvalidMemoryIndexed8SImm7:
+    return Error(Loc, "index must be a multiple of 8 in range [-512, 504].");
+  case Match_InvalidMemoryIndexed16SImm7:
+    return Error(Loc, "index must be a multiple of 16 in range [-1024, 1008].");
+  case Match_InvalidMemoryWExtend8:
+    return Error(Loc,
+                 "expected 'uxtw' or 'sxtw' with optional shift of #0");
+  case Match_InvalidMemoryWExtend16:
+    return Error(Loc,
+                 "expected 'uxtw' or 'sxtw' with optional shift of #0 or #1");
+  case Match_InvalidMemoryWExtend32:
+    return Error(Loc,
+                 "expected 'uxtw' or 'sxtw' with optional shift of #0 or #2");
+  case Match_InvalidMemoryWExtend64:
+    return Error(Loc,
+                 "expected 'uxtw' or 'sxtw' with optional shift of #0 or #3");
+  case Match_InvalidMemoryWExtend128:
+    return Error(Loc,
+                 "expected 'uxtw' or 'sxtw' with optional shift of #0 or #4");
+  case Match_InvalidMemoryXExtend8:
+    return Error(Loc,
+                 "expected 'lsl' or 'sxtx' with optional shift of #0");
+  case Match_InvalidMemoryXExtend16:
+    return Error(Loc,
+                 "expected 'lsl' or 'sxtx' with optional shift of #0 or #1");
+  case Match_InvalidMemoryXExtend32:
+    return Error(Loc,
+                 "expected 'lsl' or 'sxtx' with optional shift of #0 or #2");
+  case Match_InvalidMemoryXExtend64:
+    return Error(Loc,
+                 "expected 'lsl' or 'sxtx' with optional shift of #0 or #3");
+  case Match_InvalidMemoryXExtend128:
+    return Error(Loc,
+                 "expected 'lsl' or 'sxtx' with optional shift of #0 or #4");
+  case Match_InvalidMemoryIndexed1:
+    return Error(Loc, "index must be an integer in range [0, 4095].");
+  case Match_InvalidMemoryIndexed2:
+    return Error(Loc, "index must be a multiple of 2 in range [0, 8190].");
+  case Match_InvalidMemoryIndexed4:
+    return Error(Loc, "index must be a multiple of 4 in range [0, 16380].");
+  case Match_InvalidMemoryIndexed8:
+    return Error(Loc, "index must be a multiple of 8 in range [0, 32760].");
+  case Match_InvalidMemoryIndexed16:
+    return Error(Loc, "index must be a multiple of 16 in range [0, 65520].");
+  case Match_InvalidImm0_7:
+    return Error(Loc, "immediate must be an integer in range [0, 7].");
+  case Match_InvalidImm0_15:
+    return Error(Loc, "immediate must be an integer in range [0, 15].");
+  case Match_InvalidImm0_31:
+    return Error(Loc, "immediate must be an integer in range [0, 31].");
+  case Match_InvalidImm0_63:
+    return Error(Loc, "immediate must be an integer in range [0, 63].");
+  case Match_InvalidImm0_127:
+    return Error(Loc, "immediate must be an integer in range [0, 127].");
+  case Match_InvalidImm0_65535:
+    return Error(Loc, "immediate must be an integer in range [0, 65535].");
+  case Match_InvalidImm1_8:
+    return Error(Loc, "immediate must be an integer in range [1, 8].");
+  case Match_InvalidImm1_16:
+    return Error(Loc, "immediate must be an integer in range [1, 16].");
+  case Match_InvalidImm1_32:
+    return Error(Loc, "immediate must be an integer in range [1, 32].");
+  case Match_InvalidImm1_64:
+    return Error(Loc, "immediate must be an integer in range [1, 64].");
+  case Match_InvalidIndex1:
+    return Error(Loc, "expected lane specifier '[1]'");
+  case Match_InvalidIndexB:
+    return Error(Loc, "vector lane must be an integer in range [0, 15].");
+  case Match_InvalidIndexH:
+    return Error(Loc, "vector lane must be an integer in range [0, 7].");
+  case Match_InvalidIndexS:
+    return Error(Loc, "vector lane must be an integer in range [0, 3].");
+  case Match_InvalidIndexD:
+    return Error(Loc, "vector lane must be an integer in range [0, 1].");
+  case Match_InvalidLabel:
+    return Error(Loc, "expected label or encodable integer pc offset");
+  case Match_MRS:
+    return Error(Loc, "expected readable system register");
+  case Match_MSR:
+    return Error(Loc, "expected writable system register or pstate");
+  case Match_MnemonicFail:
+    return Error(Loc, "unrecognized instruction mnemonic");
+  default:
+    assert(0 && "unexpected error code!");
+    return Error(Loc, "invalid instruction format");
+  }
+}
+
+static const char *getSubtargetFeatureName(unsigned Val);
+
+bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+                                               OperandVector &Operands,
+                                               MCStreamer &Out,
+                                               unsigned &ErrorInfo,
+                                               bool MatchingInlineAsm) {
+  assert(!Operands.empty() && "Unexpect empty operand list!");
+  AArch64Operand *Op = static_cast<AArch64Operand *>(Operands[0]);
+  assert(Op->isToken() && "Leading operand should always be a mnemonic!");
+
+  StringRef Tok = Op->getToken();
+  unsigned NumOperands = Operands.size();
+
+  if (NumOperands == 4 && Tok == "lsl") {
+    AArch64Operand *Op2 = static_cast<AArch64Operand *>(Operands[2]);
+    AArch64Operand *Op3 = static_cast<AArch64Operand *>(Operands[3]);
+    if (Op2->isReg() && Op3->isImm()) {
+      const MCConstantExpr *Op3CE = dyn_cast<MCConstantExpr>(Op3->getImm());
+      if (Op3CE) {
+        uint64_t Op3Val = Op3CE->getValue();
+        uint64_t NewOp3Val = 0;
+        uint64_t NewOp4Val = 0;
+        if (AArch64MCRegisterClasses[AArch64::GPR32allRegClassID].contains(
+                Op2->getReg())) {
+          NewOp3Val = (32 - Op3Val) & 0x1f;
+          NewOp4Val = 31 - Op3Val;
+        } else {
+          NewOp3Val = (64 - Op3Val) & 0x3f;
+          NewOp4Val = 63 - Op3Val;
+        }
+
+        const MCExpr *NewOp3 = MCConstantExpr::Create(NewOp3Val, getContext());
+        const MCExpr *NewOp4 = MCConstantExpr::Create(NewOp4Val, getContext());
+
+        Operands[0] = AArch64Operand::CreateToken(
+            "ubfm", false, Op->getStartLoc(), getContext());
+        Operands[3] = AArch64Operand::CreateImm(NewOp3, Op3->getStartLoc(),
+                                                Op3->getEndLoc(), getContext());
+        Operands.push_back(AArch64Operand::CreateImm(
+            NewOp4, Op3->getStartLoc(), Op3->getEndLoc(), getContext()));
+        delete Op3;
+        delete Op;
+      }
+    }
+  } else if (NumOperands == 5) {
+    // FIXME: Horrible hack to handle the BFI -> BFM, SBFIZ->SBFM, and
+    // UBFIZ -> UBFM aliases.
+    if (Tok == "bfi" || Tok == "sbfiz" || Tok == "ubfiz") {
+      AArch64Operand *Op1 = static_cast<AArch64Operand *>(Operands[1]);
+      AArch64Operand *Op3 = static_cast<AArch64Operand *>(Operands[3]);
+      AArch64Operand *Op4 = static_cast<AArch64Operand *>(Operands[4]);
+
+      if (Op1->isReg() && Op3->isImm() && Op4->isImm()) {
+        const MCConstantExpr *Op3CE = dyn_cast<MCConstantExpr>(Op3->getImm());
+        const MCConstantExpr *Op4CE = dyn_cast<MCConstantExpr>(Op4->getImm());
+
+        if (Op3CE && Op4CE) {
+          uint64_t Op3Val = Op3CE->getValue();
+          uint64_t Op4Val = Op4CE->getValue();
+
+          uint64_t RegWidth = 0;
+          if (AArch64MCRegisterClasses[AArch64::GPR64allRegClassID].contains(
+              Op1->getReg()))
+            RegWidth = 64;
+          else
+            RegWidth = 32;
+
+          if (Op3Val >= RegWidth)
+            return Error(Op3->getStartLoc(),
+                         "expected integer in range [0, 31]");
+          if (Op4Val < 1 || Op4Val > RegWidth)
+            return Error(Op4->getStartLoc(),
+                         "expected integer in range [1, 32]");
+
+          uint64_t NewOp3Val = 0;
+          if (AArch64MCRegisterClasses[AArch64::GPR32allRegClassID].contains(
+                  Op1->getReg()))
+            NewOp3Val = (32 - Op3Val) & 0x1f;
+          else
+            NewOp3Val = (64 - Op3Val) & 0x3f;
+
+          uint64_t NewOp4Val = Op4Val - 1;
+
+          if (NewOp3Val != 0 && NewOp4Val >= NewOp3Val)
+            return Error(Op4->getStartLoc(),
+                         "requested insert overflows register");
+
+          const MCExpr *NewOp3 =
+              MCConstantExpr::Create(NewOp3Val, getContext());
+          const MCExpr *NewOp4 =
+              MCConstantExpr::Create(NewOp4Val, getContext());
+          Operands[3] = AArch64Operand::CreateImm(
+              NewOp3, Op3->getStartLoc(), Op3->getEndLoc(), getContext());
+          Operands[4] = AArch64Operand::CreateImm(
+              NewOp4, Op4->getStartLoc(), Op4->getEndLoc(), getContext());
+          if (Tok == "bfi")
+            Operands[0] = AArch64Operand::CreateToken(
+                "bfm", false, Op->getStartLoc(), getContext());
+          else if (Tok == "sbfiz")
+            Operands[0] = AArch64Operand::CreateToken(
+                "sbfm", false, Op->getStartLoc(), getContext());
+          else if (Tok == "ubfiz")
+            Operands[0] = AArch64Operand::CreateToken(
+                "ubfm", false, Op->getStartLoc(), getContext());
+          else
+            llvm_unreachable("No valid mnemonic for alias?");
+
+          delete Op;
+          delete Op3;
+          delete Op4;
+        }
+      }
+
+      // FIXME: Horrible hack to handle the BFXIL->BFM, SBFX->SBFM, and
+      // UBFX -> UBFM aliases.
+    } else if (NumOperands == 5 &&
+               (Tok == "bfxil" || Tok == "sbfx" || Tok == "ubfx")) {
+      AArch64Operand *Op1 = static_cast<AArch64Operand *>(Operands[1]);
+      AArch64Operand *Op3 = static_cast<AArch64Operand *>(Operands[3]);
+      AArch64Operand *Op4 = static_cast<AArch64Operand *>(Operands[4]);
+
+      if (Op1->isReg() && Op3->isImm() && Op4->isImm()) {
+        const MCConstantExpr *Op3CE = dyn_cast<MCConstantExpr>(Op3->getImm());
+        const MCConstantExpr *Op4CE = dyn_cast<MCConstantExpr>(Op4->getImm());
+
+        if (Op3CE && Op4CE) {
+          uint64_t Op3Val = Op3CE->getValue();
+          uint64_t Op4Val = Op4CE->getValue();
+
+          uint64_t RegWidth = 0;
+          if (AArch64MCRegisterClasses[AArch64::GPR64allRegClassID].contains(
+              Op1->getReg()))
+            RegWidth = 64;
+          else
+            RegWidth = 32;
+
+          if (Op3Val >= RegWidth)
+            return Error(Op3->getStartLoc(),
+                         "expected integer in range [0, 31]");
+          if (Op4Val < 1 || Op4Val > RegWidth)
+            return Error(Op4->getStartLoc(),
+                         "expected integer in range [1, 32]");
+
+          uint64_t NewOp4Val = Op3Val + Op4Val - 1;
+
+          if (NewOp4Val >= RegWidth || NewOp4Val < Op3Val)
+            return Error(Op4->getStartLoc(),
+                         "requested extract overflows register");
+
+          const MCExpr *NewOp4 =
+              MCConstantExpr::Create(NewOp4Val, getContext());
+          Operands[4] = AArch64Operand::CreateImm(
+              NewOp4, Op4->getStartLoc(), Op4->getEndLoc(), getContext());
+          if (Tok == "bfxil")
+            Operands[0] = AArch64Operand::CreateToken(
+                "bfm", false, Op->getStartLoc(), getContext());
+          else if (Tok == "sbfx")
+            Operands[0] = AArch64Operand::CreateToken(
+                "sbfm", false, Op->getStartLoc(), getContext());
+          else if (Tok == "ubfx")
+            Operands[0] = AArch64Operand::CreateToken(
+                "ubfm", false, Op->getStartLoc(), getContext());
+          else
+            llvm_unreachable("No valid mnemonic for alias?");
+
+          delete Op;
+          delete Op4;
+        }
+      }
+    }
+  }
+  // FIXME: Horrible hack for sxtw and uxtw with Wn src and Xd dst operands.
+  //        InstAlias can't quite handle this since the reg classes aren't
+  //        subclasses.
+  if (NumOperands == 3 && (Tok == "sxtw" || Tok == "uxtw")) {
+    // The source register can be Wn here, but the matcher expects a
+    // GPR64. Twiddle it here if necessary.
+    AArch64Operand *Op = static_cast<AArch64Operand *>(Operands[2]);
+    if (Op->isReg()) {
+      unsigned Reg = getXRegFromWReg(Op->getReg());
+      Operands[2] = AArch64Operand::CreateReg(Reg, false, Op->getStartLoc(),
+                                              Op->getEndLoc(), getContext());
+      delete Op;
+    }
+  }
+  // FIXME: Likewise for sxt[bh] with a Xd dst operand
+  else if (NumOperands == 3 && (Tok == "sxtb" || Tok == "sxth")) {
+    AArch64Operand *Op = static_cast<AArch64Operand *>(Operands[1]);
+    if (Op->isReg() &&
+        AArch64MCRegisterClasses[AArch64::GPR64allRegClassID].contains(
+            Op->getReg())) {
+      // The source register can be Wn here, but the matcher expects a
+      // GPR64. Twiddle it here if necessary.
+      AArch64Operand *Op = static_cast<AArch64Operand *>(Operands[2]);
+      if (Op->isReg()) {
+        unsigned Reg = getXRegFromWReg(Op->getReg());
+        Operands[2] = AArch64Operand::CreateReg(Reg, false, Op->getStartLoc(),
+                                                Op->getEndLoc(), getContext());
+        delete Op;
+      }
+    }
+  }
+  // FIXME: Likewise for uxt[bh] with a Xd dst operand
+  else if (NumOperands == 3 && (Tok == "uxtb" || Tok == "uxth")) {
+    AArch64Operand *Op = static_cast<AArch64Operand *>(Operands[1]);
+    if (Op->isReg() &&
+        AArch64MCRegisterClasses[AArch64::GPR64allRegClassID].contains(
+            Op->getReg())) {
+      // The source register can be Wn here, but the matcher expects a
+      // GPR32. Twiddle it here if necessary.
+      AArch64Operand *Op = static_cast<AArch64Operand *>(Operands[1]);
+      if (Op->isReg()) {
+        unsigned Reg = getWRegFromXReg(Op->getReg());
+        Operands[1] = AArch64Operand::CreateReg(Reg, false, Op->getStartLoc(),
+                                                Op->getEndLoc(), getContext());
+        delete Op;
+      }
+    }
+  }
+
+  // Yet another horrible hack to handle FMOV Rd, #0.0 using [WX]ZR.
+  if (NumOperands == 3 && Tok == "fmov") {
+    AArch64Operand *RegOp = static_cast<AArch64Operand *>(Operands[1]);
+    AArch64Operand *ImmOp = static_cast<AArch64Operand *>(Operands[2]);
+    if (RegOp->isReg() && ImmOp->isFPImm() &&
+        ImmOp->getFPImm() == (unsigned)-1) {
+      unsigned zreg =
+          AArch64MCRegisterClasses[AArch64::FPR32RegClassID].contains(
+              RegOp->getReg())
+              ? AArch64::WZR
+              : AArch64::XZR;
+      Operands[2] = AArch64Operand::CreateReg(zreg, false, Op->getStartLoc(),
+                                              Op->getEndLoc(), getContext());
+      delete ImmOp;
+    }
+  }
+
+  MCInst Inst;
+  // First try to match against the secondary set of tables containing the
+  // short-form NEON instructions (e.g. "fadd.2s v0, v1, v2").
+  unsigned MatchResult =
+      MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm, 1);
+
+  // If that fails, try against the alternate table containing long-form NEON:
+  // "fadd v0.2s, v1.2s, v2.2s"
+  if (MatchResult != Match_Success)
+    MatchResult =
+        MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm, 0);
+
+  switch (MatchResult) {
+  case Match_Success: {
+    // Perform range checking and other semantic validations
+    SmallVector<SMLoc, 8> OperandLocs;
+    NumOperands = Operands.size();
+    for (unsigned i = 1; i < NumOperands; ++i)
+      OperandLocs.push_back(Operands[i]->getStartLoc());
+    if (validateInstruction(Inst, OperandLocs))
+      return true;
+
+    Inst.setLoc(IDLoc);
+    Out.EmitInstruction(Inst, STI);
+    return false;
+  }
+  case Match_MissingFeature: {
+    assert(ErrorInfo && "Unknown missing feature!");
+    // Special case the error message for the very common case where only
+    // a single subtarget feature is missing (neon, e.g.).
+    std::string Msg = "instruction requires:";
+    unsigned Mask = 1;
+    for (unsigned i = 0; i < (sizeof(ErrorInfo)*8-1); ++i) {
+      if (ErrorInfo & Mask) {
+        Msg += " ";
+        Msg += getSubtargetFeatureName(ErrorInfo & Mask);
+      }
+      Mask <<= 1;
+    }
+    return Error(IDLoc, Msg);
+  }
+  case Match_MnemonicFail:
+    return showMatchError(IDLoc, MatchResult);
+  case Match_InvalidOperand: {
+    SMLoc ErrorLoc = IDLoc;
+    if (ErrorInfo != ~0U) {
+      if (ErrorInfo >= Operands.size())
+        return Error(IDLoc, "too few operands for instruction");
+
+      ErrorLoc = ((AArch64Operand *)Operands[ErrorInfo])->getStartLoc();
+      if (ErrorLoc == SMLoc())
+        ErrorLoc = IDLoc;
+    }
+    // If the match failed on a suffix token operand, tweak the diagnostic
+    // accordingly.
+    if (((AArch64Operand *)Operands[ErrorInfo])->isToken() &&
+        ((AArch64Operand *)Operands[ErrorInfo])->isTokenSuffix())
+      MatchResult = Match_InvalidSuffix;
+
+    return showMatchError(ErrorLoc, MatchResult);
+  }
+  case Match_InvalidMemoryIndexed1:
+  case Match_InvalidMemoryIndexed2:
+  case Match_InvalidMemoryIndexed4:
+  case Match_InvalidMemoryIndexed8:
+  case Match_InvalidMemoryIndexed16:
+  case Match_InvalidCondCode:
+  case Match_AddSubRegExtendSmall:
+  case Match_AddSubRegExtendLarge:
+  case Match_AddSubSecondSource:
+  case Match_LogicalSecondSource:
+  case Match_AddSubRegShift32:
+  case Match_AddSubRegShift64:
+  case Match_InvalidMovImm32Shift:
+  case Match_InvalidMovImm64Shift:
+  case Match_InvalidFPImm:
+  case Match_InvalidMemoryWExtend8:
+  case Match_InvalidMemoryWExtend16:
+  case Match_InvalidMemoryWExtend32:
+  case Match_InvalidMemoryWExtend64:
+  case Match_InvalidMemoryWExtend128:
+  case Match_InvalidMemoryXExtend8:
+  case Match_InvalidMemoryXExtend16:
+  case Match_InvalidMemoryXExtend32:
+  case Match_InvalidMemoryXExtend64:
+  case Match_InvalidMemoryXExtend128:
+  case Match_InvalidMemoryIndexed4SImm7:
+  case Match_InvalidMemoryIndexed8SImm7:
+  case Match_InvalidMemoryIndexed16SImm7:
+  case Match_InvalidMemoryIndexedSImm9:
+  case Match_InvalidImm0_7:
+  case Match_InvalidImm0_15:
+  case Match_InvalidImm0_31:
+  case Match_InvalidImm0_63:
+  case Match_InvalidImm0_127:
+  case Match_InvalidImm0_65535:
+  case Match_InvalidImm1_8:
+  case Match_InvalidImm1_16:
+  case Match_InvalidImm1_32:
+  case Match_InvalidImm1_64:
+  case Match_InvalidIndex1:
+  case Match_InvalidIndexB:
+  case Match_InvalidIndexH:
+  case Match_InvalidIndexS:
+  case Match_InvalidIndexD:
+  case Match_InvalidLabel:
+  case Match_MSR:
+  case Match_MRS: {
+    // Any time we get here, there's nothing fancy to do. Just get the
+    // operand SMLoc and display the diagnostic.
+    SMLoc ErrorLoc = ((AArch64Operand *)Operands[ErrorInfo])->getStartLoc();
+    if (ErrorLoc == SMLoc())
+      ErrorLoc = IDLoc;
+    return showMatchError(ErrorLoc, MatchResult);
+  }
+  }
+
+  llvm_unreachable("Implement any new match types added!");
+  return true;
+}
+
+/// ParseDirective parses the arm specific directives
 bool AArch64AsmParser::ParseDirective(AsmToken DirectiveID) {
   StringRef IDVal = DirectiveID.getIdentifier();
+  SMLoc Loc = DirectiveID.getLoc();
   if (IDVal == ".hword")
-    return ParseDirectiveWord(2, DirectiveID.getLoc());
-  else if (IDVal == ".word")
-    return ParseDirectiveWord(4, DirectiveID.getLoc());
-  else if (IDVal == ".xword")
-    return ParseDirectiveWord(8, DirectiveID.getLoc());
-  else if (IDVal == ".tlsdesccall")
-    return ParseDirectiveTLSDescCall(DirectiveID.getLoc());
-
-  return true;
+    return parseDirectiveWord(2, Loc);
+  if (IDVal == ".word")
+    return parseDirectiveWord(4, Loc);
+  if (IDVal == ".xword")
+    return parseDirectiveWord(8, Loc);
+  if (IDVal == ".tlsdesccall")
+    return parseDirectiveTLSDescCall(Loc);
+
+  return parseDirectiveLOH(IDVal, Loc);
 }
 
 /// parseDirectiveWord
 ///  ::= .word [ expression (, expression)* ]
-bool AArch64AsmParser::ParseDirectiveWord(unsigned Size, SMLoc L) {
+bool AArch64AsmParser::parseDirectiveWord(unsigned Size, SMLoc L) {
   if (getLexer().isNot(AsmToken::EndOfStatement)) {
     for (;;) {
       const MCExpr *Value;
       if (getParser().parseExpression(Value))
-        return false;
+        return true;
 
       getParser().getStreamer().EmitValue(Value, Size);
 
@@ -2374,10 +3838,8 @@ bool AArch64AsmParser::ParseDirectiveWord(unsigned Size, SMLoc L) {
         break;
 
       // FIXME: Improve diagnostic.
-      if (getLexer().isNot(AsmToken::Comma)) {
-        Error(L, "unexpected token in directive");
-        return false;
-      }
+      if (getLexer().isNot(AsmToken::Comma))
+        return Error(L, "unexpected token in directive");
       Parser.Lex();
     }
   }
@@ -2388,15 +3850,14 @@ bool AArch64AsmParser::ParseDirectiveWord(unsigned Size, SMLoc L) {
 
 // parseDirectiveTLSDescCall:
 //   ::= .tlsdesccall symbol
-bool AArch64AsmParser::ParseDirectiveTLSDescCall(SMLoc L) {
+bool AArch64AsmParser::parseDirectiveTLSDescCall(SMLoc L) {
   StringRef Name;
-  if (getParser().parseIdentifier(Name)) {
-    Error(L, "expected symbol after directive");
-    return false;
-  }
+  if (getParser().parseIdentifier(Name))
+    return Error(L, "expected symbol after directive");
 
   MCSymbol *Sym = getContext().GetOrCreateSymbol(Name);
-  const MCSymbolRefExpr *Expr = MCSymbolRefExpr::Create(Sym, getContext());
+  const MCExpr *Expr = MCSymbolRefExpr::Create(Sym, getContext());
+  Expr = AArch64MCExpr::Create(Expr, AArch64MCExpr::VK_TLSDESC, getContext());
 
   MCInst Inst;
   Inst.setOpcode(AArch64::TLSDESCCALL);
@@ -2406,271 +3867,181 @@ bool AArch64AsmParser::ParseDirectiveTLSDescCall(SMLoc L) {
   return false;
 }
 
+/// ::= .loh <lohName | lohId> label1, ..., labelN
+/// The number of arguments depends on the loh identifier.
+bool AArch64AsmParser::parseDirectiveLOH(StringRef IDVal, SMLoc Loc) {
+  if (IDVal != MCLOHDirectiveName())
+    return true;
+  MCLOHType Kind;
+  if (getParser().getTok().isNot(AsmToken::Identifier)) {
+    if (getParser().getTok().isNot(AsmToken::Integer))
+      return TokError("expected an identifier or a number in directive");
+    // We successfully get a numeric value for the identifier.
+    // Check if it is valid.
+    int64_t Id = getParser().getTok().getIntVal();
+    Kind = (MCLOHType)Id;
+    // Check that Id does not overflow MCLOHType.
+    if (!isValidMCLOHType(Kind) || Id != Kind)
+      return TokError("invalid numeric identifier in directive");
+  } else {
+    StringRef Name = getTok().getIdentifier();
+    // We successfully parse an identifier.
+    // Check if it is a recognized one.
+    int Id = MCLOHNameToId(Name);
+
+    if (Id == -1)
+      return TokError("invalid identifier in directive");
+    Kind = (MCLOHType)Id;
+  }
+  // Consume the identifier.
+  Lex();
+  // Get the number of arguments of this LOH.
+  int NbArgs = MCLOHIdToNbArgs(Kind);
+
+  assert(NbArgs != -1 && "Invalid number of arguments");
+
+  SmallVector<MCSymbol *, 3> Args;
+  for (int Idx = 0; Idx < NbArgs; ++Idx) {
+    StringRef Name;
+    if (getParser().parseIdentifier(Name))
+      return TokError("expected identifier in directive");
+    Args.push_back(getContext().GetOrCreateSymbol(Name));
+
+    if (Idx + 1 == NbArgs)
+      break;
+    if (getLexer().isNot(AsmToken::Comma))
+      return TokError("unexpected token in '" + Twine(IDVal) + "' directive");
+    Lex();
+  }
+  if (getLexer().isNot(AsmToken::EndOfStatement))
+    return TokError("unexpected token in '" + Twine(IDVal) + "' directive");
 
-bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
-                                 SmallVectorImpl<MCParsedAsmOperand*> &Operands,
-                                 MCStreamer &Out, unsigned &ErrorInfo,
-                                 bool MatchingInlineAsm) {
-  MCInst Inst;
-  unsigned MatchResult;
-  MatchResult = MatchInstructionImpl(Operands, Inst, ErrorInfo,
-                                     MatchingInlineAsm);
+  getStreamer().EmitLOHDirective((MCLOHType)Kind, Args);
+  return false;
+}
 
-  if (ErrorInfo != ~0U && ErrorInfo >= Operands.size())
-    return Error(IDLoc, "too few operands for instruction");
+bool
+AArch64AsmParser::classifySymbolRef(const MCExpr *Expr,
+                                    AArch64MCExpr::VariantKind &ELFRefKind,
+                                    MCSymbolRefExpr::VariantKind &DarwinRefKind,
+                                    int64_t &Addend) {
+  ELFRefKind = AArch64MCExpr::VK_INVALID;
+  DarwinRefKind = MCSymbolRefExpr::VK_None;
+  Addend = 0;
+
+  if (const AArch64MCExpr *AE = dyn_cast<AArch64MCExpr>(Expr)) {
+    ELFRefKind = AE->getKind();
+    Expr = AE->getSubExpr();
+  }
+
+  const MCSymbolRefExpr *SE = dyn_cast<MCSymbolRefExpr>(Expr);
+  if (SE) {
+    // It's a simple symbol reference with no addend.
+    DarwinRefKind = SE->getKind();
+    return true;
+  }
 
-  switch (MatchResult) {
-  default: break;
-  case Match_Success:
-    if (validateInstruction(Inst, Operands))
-      return true;
+  const MCBinaryExpr *BE = dyn_cast<MCBinaryExpr>(Expr);
+  if (!BE)
+    return false;
 
-    Out.EmitInstruction(Inst, STI);
+  SE = dyn_cast<MCSymbolRefExpr>(BE->getLHS());
+  if (!SE)
     return false;
-  case Match_MissingFeature:
-    Error(IDLoc, "instruction requires a CPU feature not currently enabled");
-    return true;
-  case Match_InvalidOperand: {
-    SMLoc ErrorLoc = IDLoc;
-    if (ErrorInfo != ~0U) {
-      ErrorLoc = ((AArch64Operand*)Operands[ErrorInfo])->getStartLoc();
-      if (ErrorLoc == SMLoc()) ErrorLoc = IDLoc;
-    }
+  DarwinRefKind = SE->getKind();
 
-    return Error(ErrorLoc, "invalid operand for instruction");
-  }
-  case Match_MnemonicFail:
-    return Error(IDLoc, "invalid instruction");
+  if (BE->getOpcode() != MCBinaryExpr::Add &&
+      BE->getOpcode() != MCBinaryExpr::Sub)
+    return false;
 
-  case Match_AddSubRegExtendSmall:
-    return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
-      "expected '[su]xt[bhw]' or 'lsl' with optional integer in range [0, 4]");
-  case Match_AddSubRegExtendLarge:
-    return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
-      "expected 'sxtx' 'uxtx' or 'lsl' with optional integer in range [0, 4]");
-  case Match_AddSubRegShift32:
-    return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
-       "expected 'lsl', 'lsr' or 'asr' with optional integer in range [0, 31]");
-  case Match_AddSubRegShift64:
-    return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
-       "expected 'lsl', 'lsr' or 'asr' with optional integer in range [0, 63]");
-  case Match_AddSubSecondSource:
-      return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
-          "expected compatible register, symbol or integer in range [0, 4095]");
-  case Match_CVTFixedPos32:
-    return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
-                 "expected integer in range [1, 32]");
-  case Match_CVTFixedPos64:
-    return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
-                 "expected integer in range [1, 64]");
-  case Match_CondCode:
-    return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
-                 "expected AArch64 condition code");
-  case Match_FPImm:
-    // Any situation which allows a nontrivial floating-point constant also
-    // allows a register.
-    return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
-                 "expected compatible register or floating-point constant");
-  case Match_FPZero:
-    return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
-                 "expected floating-point constant #0.0 or invalid register type");
-  case Match_Label:
-    return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
-                 "expected label or encodable integer pc offset");
-  case Match_Lane1:
-    return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
-                 "expected lane specifier '[1]'");
-  case Match_LoadStoreExtend32_1:
-    return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
-                 "expected 'uxtw' or 'sxtw' with optional shift of #0");
-  case Match_LoadStoreExtend32_2:
-    return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
-                 "expected 'uxtw' or 'sxtw' with optional shift of #0 or #1");
-  case Match_LoadStoreExtend32_4:
-    return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
-                 "expected 'uxtw' or 'sxtw' with optional shift of #0 or #2");
-  case Match_LoadStoreExtend32_8:
-    return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
-                 "expected 'uxtw' or 'sxtw' with optional shift of #0 or #3");
-  case Match_LoadStoreExtend32_16:
-    return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
-                 "expected 'lsl' or 'sxtw' with optional shift of #0 or #4");
-  case Match_LoadStoreExtend64_1:
-    return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
-                 "expected 'lsl' or 'sxtx' with optional shift of #0");
-  case Match_LoadStoreExtend64_2:
-    return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
-                 "expected 'lsl' or 'sxtx' with optional shift of #0 or #1");
-  case Match_LoadStoreExtend64_4:
-    return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
-                 "expected 'lsl' or 'sxtx' with optional shift of #0 or #2");
-  case Match_LoadStoreExtend64_8:
-    return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
-                 "expected 'lsl' or 'sxtx' with optional shift of #0 or #3");
-  case Match_LoadStoreExtend64_16:
-    return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
-                 "expected 'lsl' or 'sxtx' with optional shift of #0 or #4");
-  case Match_LoadStoreSImm7_4:
-    return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
-                 "expected integer multiple of 4 in range [-256, 252]");
-  case Match_LoadStoreSImm7_8:
-    return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
-                 "expected integer multiple of 8 in range [-512, 504]");
-  case Match_LoadStoreSImm7_16:
-    return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
-                 "expected integer multiple of 16 in range [-1024, 1008]");
-  case Match_LoadStoreSImm9:
-    return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
-                 "expected integer in range [-256, 255]");
-  case Match_LoadStoreUImm12_1:
-    return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
-                 "expected symbolic reference or integer in range [0, 4095]");
-  case Match_LoadStoreUImm12_2:
-    return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
-                 "expected symbolic reference or integer in range [0, 8190]");
-  case Match_LoadStoreUImm12_4:
-    return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
-                 "expected symbolic reference or integer in range [0, 16380]");
-  case Match_LoadStoreUImm12_8:
-    return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
-                 "expected symbolic reference or integer in range [0, 32760]");
-  case Match_LoadStoreUImm12_16:
-    return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
-                 "expected symbolic reference or integer in range [0, 65520]");
-  case Match_LogicalSecondSource:
-    return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
-                 "expected compatible register or logical immediate");
-  case Match_MOVWUImm16:
-    return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
-                 "expected relocated symbol or integer in range [0, 65535]");
-  case Match_MRS:
-    return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
-                 "expected readable system register");
-  case Match_MSR:
-    return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
-                 "expected writable system register or pstate");
-  case Match_NamedImm_at:
-    return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
-                "expected symbolic 'at' operand: s1e[0-3][rw] or s12e[01][rw]");
-  case Match_NamedImm_dbarrier:
-    return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
-             "expected integer in range [0, 15] or symbolic barrier operand");
-  case Match_NamedImm_dc:
-    return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
-                 "expected symbolic 'dc' operand");
-  case Match_NamedImm_ic:
-    return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
-                 "expected 'ic' operand: 'ialluis', 'iallu' or 'ivau'");
-  case Match_NamedImm_isb:
-    return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
-                 "expected integer in range [0, 15] or 'sy'");
-  case Match_NamedImm_prefetch:
-    return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
-                 "expected prefetch hint: p(ld|st|i)l[123](strm|keep)");
-  case Match_NamedImm_tlbi:
-    return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
-                 "expected translation buffer invalidation operand");
-  case Match_UImm16:
-    return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
-                 "expected integer in range [0, 65535]");
-  case Match_UImm3:
-    return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
-                 "expected integer in range [0, 7]");
-  case Match_UImm4:
-    return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
-                 "expected integer in range [0, 15]");
-  case Match_UImm5:
-    return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
-                 "expected integer in range [0, 31]");
-  case Match_UImm6:
-    return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
-                 "expected integer in range [0, 63]");
-  case Match_UImm7:
-    return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
-                 "expected integer in range [0, 127]");
-  case Match_Width32:
-    return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
-                 "expected integer in range [<lsb>, 31]");
-  case Match_Width64:
-    return Error(((AArch64Operand*)Operands[ErrorInfo])->getStartLoc(),
-                 "expected integer in range [<lsb>, 63]");
-  case Match_ShrImm8:
-    return Error(((AArch64Operand *)Operands[ErrorInfo])->getStartLoc(),
-                 "expected integer in range [1, 8]");
-  case Match_ShrImm16:
-    return Error(((AArch64Operand *)Operands[ErrorInfo])->getStartLoc(),
-                 "expected integer in range [1, 16]");
-  case Match_ShrImm32:
-    return Error(((AArch64Operand *)Operands[ErrorInfo])->getStartLoc(),
-                 "expected integer in range [1, 32]");
-  case Match_ShrImm64:
-    return Error(((AArch64Operand *)Operands[ErrorInfo])->getStartLoc(),
-                 "expected integer in range [1, 64]");
-  case Match_ShlImm8:
-    return Error(((AArch64Operand *)Operands[ErrorInfo])->getStartLoc(),
-                 "expected integer in range [0, 7]");
-  case Match_ShlImm16:
-    return Error(((AArch64Operand *)Operands[ErrorInfo])->getStartLoc(),
-                 "expected integer in range [0, 15]");
-  case Match_ShlImm32:
-    return Error(((AArch64Operand *)Operands[ErrorInfo])->getStartLoc(),
-                 "expected integer in range [0, 31]");
-  case Match_ShlImm64:
-    return Error(((AArch64Operand *)Operands[ErrorInfo])->getStartLoc(),
-                 "expected integer in range [0, 63]");
-  }
+  // See if the addend is is a constant, otherwise there's more going
+  // on here than we can deal with.
+  auto AddendExpr = dyn_cast<MCConstantExpr>(BE->getRHS());
+  if (!AddendExpr)
+    return false;
 
-  llvm_unreachable("Implement any new match types added!");
-  return true;
+  Addend = AddendExpr->getValue();
+  if (BE->getOpcode() == MCBinaryExpr::Sub)
+    Addend = -Addend;
+
+  // It's some symbol reference + a constant addend, but really
+  // shouldn't use both Darwin and ELF syntax.
+  return ELFRefKind == AArch64MCExpr::VK_INVALID ||
+         DarwinRefKind == MCSymbolRefExpr::VK_None;
 }
 
-void AArch64Operand::print(raw_ostream &OS) const {
+/// Force static initialization.
+extern "C" void LLVMInitializeAArch64AsmParser() {
+  RegisterMCAsmParser<AArch64AsmParser> X(TheAArch64leTarget);
+  RegisterMCAsmParser<AArch64AsmParser> Y(TheAArch64beTarget);
+
+  RegisterMCAsmParser<AArch64AsmParser> Z(TheARM64leTarget);
+  RegisterMCAsmParser<AArch64AsmParser> W(TheARM64beTarget);
+}
+
+#define GET_REGISTER_MATCHER
+#define GET_SUBTARGET_FEATURE_NAME
+#define GET_MATCHER_IMPLEMENTATION
+#include "AArch64GenAsmMatcher.inc"
+
+// Define this matcher function after the auto-generated include so we
+// have the match class enum definitions.
+unsigned AArch64AsmParser::validateTargetOperandClass(MCParsedAsmOperand *AsmOp,
+                                                      unsigned Kind) {
+  AArch64Operand *Op = static_cast<AArch64Operand *>(AsmOp);
+  // If the kind is a token for a literal immediate, check if our asm
+  // operand matches. This is for InstAliases which have a fixed-value
+  // immediate in the syntax.
+  int64_t ExpectedVal;
   switch (Kind) {
-  case k_CondCode:
-    OS << "<CondCode: " << CondCode.Code << ">";
+  default:
+    return Match_InvalidOperand;
+  case MCK__35_0:
+    ExpectedVal = 0;
     break;
-  case k_FPImmediate:
-    OS << "<fpimm: " << FPImm.Val << ">";
+  case MCK__35_1:
+    ExpectedVal = 1;
     break;
-  case k_ImmWithLSL:
-    OS << "<immwithlsl: imm=" << ImmWithLSL.Val
-       << ", shift=" << ImmWithLSL.ShiftAmount << ">";
+  case MCK__35_12:
+    ExpectedVal = 12;
     break;
-  case k_Immediate:
-    getImm()->print(OS);
+  case MCK__35_16:
+    ExpectedVal = 16;
     break;
-  case k_Register:
-    OS << "<register " << getReg() << '>';
+  case MCK__35_2:
+    ExpectedVal = 2;
     break;
-  case k_Token:
-    OS << '\'' << getToken() << '\'';
+  case MCK__35_24:
+    ExpectedVal = 24;
     break;
-  case k_ShiftExtend:
-    OS << "<shift: type=" << ShiftExtend.ShiftType
-       << ", amount=" << ShiftExtend.Amount << ">";
+  case MCK__35_3:
+    ExpectedVal = 3;
     break;
-  case k_SysReg: {
-    StringRef Name(SysReg.Data, SysReg.Length);
-    OS << "<sysreg: " << Name << '>';
+  case MCK__35_32:
+    ExpectedVal = 32;
     break;
-  }
-  default:
-    llvm_unreachable("No idea how to print this kind of operand");
+  case MCK__35_4:
+    ExpectedVal = 4;
+    break;
+  case MCK__35_48:
+    ExpectedVal = 48;
+    break;
+  case MCK__35_6:
+    ExpectedVal = 6;
+    break;
+  case MCK__35_64:
+    ExpectedVal = 64;
+    break;
+  case MCK__35_8:
+    ExpectedVal = 8;
     break;
   }
+  if (!Op->isImm())
+    return Match_InvalidOperand;
+  const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Op->getImm());
+  if (!CE)
+    return Match_InvalidOperand;
+  if (CE->getValue() == ExpectedVal)
+    return Match_Success;
+  return Match_InvalidOperand;
 }
-
-void AArch64Operand::dump() const {
-  print(errs());
-}
-
-
-/// Force static initialization.
-extern "C" void LLVMInitializeAArch64AsmParser() {
-  RegisterMCAsmParser<AArch64AsmParser> X(TheAArch64leTarget);
-  RegisterMCAsmParser<AArch64AsmParser> Y(TheAArch64beTarget);
-}
-
-#define GET_REGISTER_MATCHER
-#define GET_MATCHER_IMPLEMENTATION
-#include "AArch64GenAsmMatcher.inc"
diff --git a/lib/Target/AArch64/AsmParser/CMakeLists.txt b/lib/Target/AArch64/AsmParser/CMakeLists.txt
index e81ec70..cc0a9d8 100644
--- a/lib/Target/AArch64/AsmParser/CMakeLists.txt
+++ b/lib/Target/AArch64/AsmParser/CMakeLists.txt
@@ -1,3 +1,6 @@
+include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
+
 add_llvm_library(LLVMAArch64AsmParser
   AArch64AsmParser.cpp
   )
+
diff --git a/lib/Target/AArch64/AsmParser/LLVMBuild.txt b/lib/Target/AArch64/AsmParser/LLVMBuild.txt
index 2d8f632..11eb9d5 100644
--- a/lib/Target/AArch64/AsmParser/LLVMBuild.txt
+++ b/lib/Target/AArch64/AsmParser/LLVMBuild.txt
@@ -1,4 +1,4 @@
-;===- ./lib/Target/AArch64/AsmParser/LLVMBuild.txt -------------*- Conf -*--===;
+;===- ./lib/Target/AArch64/AsmParser/LLVMBuild.txt ---------------*- Conf -*--===;
 ;
 ;                     The LLVM Compiler Infrastructure
 ;
diff --git a/lib/Target/AArch64/AsmParser/Makefile b/lib/Target/AArch64/AsmParser/Makefile
index 56c9ef5..00268c7 100644
--- a/lib/Target/AArch64/AsmParser/Makefile
+++ b/lib/Target/AArch64/AsmParser/Makefile
@@ -9,7 +9,7 @@
 LEVEL = ../../../..
 LIBRARYNAME = LLVMAArch64AsmParser
 
-# Hack: we need to include 'main' target directory to grab private headers
+# Hack: we need to include 'main' ARM target directory to grab private headers
 CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
 
 include $(LEVEL)/Makefile.common
diff --git a/lib/Target/AArch64/CMakeLists.txt b/lib/Target/AArch64/CMakeLists.txt
index dfc10af..789d549 100644
--- a/lib/Target/AArch64/CMakeLists.txt
+++ b/lib/Target/AArch64/CMakeLists.txt
@@ -1,37 +1,51 @@
 set(LLVM_TARGET_DEFINITIONS AArch64.td)
 
-tablegen(LLVM AArch64GenAsmMatcher.inc -gen-asm-matcher)
-tablegen(LLVM AArch64GenAsmWriter.inc -gen-asm-writer)
-tablegen(LLVM AArch64GenCallingConv.inc -gen-callingconv)
-tablegen(LLVM AArch64GenDisassemblerTables.inc -gen-disassembler)
+tablegen(LLVM AArch64GenRegisterInfo.inc -gen-register-info)
 tablegen(LLVM AArch64GenInstrInfo.inc -gen-instr-info)
 tablegen(LLVM AArch64GenMCCodeEmitter.inc -gen-emitter -mc-emitter)
 tablegen(LLVM AArch64GenMCPseudoLowering.inc -gen-pseudo-lowering)
-tablegen(LLVM AArch64GenRegisterInfo.inc -gen-register-info)
+tablegen(LLVM AArch64GenAsmWriter.inc -gen-asm-writer)
+tablegen(LLVM AArch64GenAsmWriter1.inc -gen-asm-writer -asmwriternum=1)
+tablegen(LLVM AArch64GenAsmMatcher.inc -gen-asm-matcher)
 tablegen(LLVM AArch64GenDAGISel.inc -gen-dag-isel)
+tablegen(LLVM AArch64GenFastISel.inc -gen-fast-isel)
+tablegen(LLVM AArch64GenCallingConv.inc -gen-callingconv)
 tablegen(LLVM AArch64GenSubtargetInfo.inc -gen-subtarget)
+tablegen(LLVM AArch64GenDisassemblerTables.inc -gen-disassembler)
 add_public_tablegen_target(AArch64CommonTableGen)
 
 add_llvm_target(AArch64CodeGen
+  AArch64AddressTypePromotion.cpp
+  AArch64AdvSIMDScalarPass.cpp
   AArch64AsmPrinter.cpp
-  AArch64BranchFixupPass.cpp
+  AArch64BranchRelaxation.cpp
+  AArch64CleanupLocalDynamicTLSPass.cpp
+  AArch64CollectLOH.cpp
+  AArch64ConditionalCompares.cpp
+  AArch64DeadRegisterDefinitionsPass.cpp
+  AArch64ExpandPseudoInsts.cpp
+  AArch64FastISel.cpp
   AArch64FrameLowering.cpp
   AArch64ISelDAGToDAG.cpp
   AArch64ISelLowering.cpp
   AArch64InstrInfo.cpp
-  AArch64MachineFunctionInfo.cpp
+  AArch64LoadStoreOptimizer.cpp
   AArch64MCInstLower.cpp
+  AArch64PromoteConstant.cpp
   AArch64RegisterInfo.cpp
   AArch64SelectionDAGInfo.cpp
+  AArch64StorePairSuppress.cpp
   AArch64Subtarget.cpp
   AArch64TargetMachine.cpp
   AArch64TargetObjectFile.cpp
   AArch64TargetTransformInfo.cpp
-  )
+)
 
+add_dependencies(LLVMAArch64CodeGen intrinsics_gen)
+
+add_subdirectory(TargetInfo)
 add_subdirectory(AsmParser)
 add_subdirectory(Disassembler)
 add_subdirectory(InstPrinter)
 add_subdirectory(MCTargetDesc)
-add_subdirectory(TargetInfo)
 add_subdirectory(Utils)
diff --git a/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp b/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
index 9bd363a..6de27d6 100644
--- a/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
+++ b/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
@@ -1,4 +1,4 @@
-//===- AArch64Disassembler.cpp - Disassembler for AArch64 ISA -------------===//
+//===- AArch64Disassembler.cpp - Disassembler for AArch64 -------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,244 +7,169 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This file contains the functions necessary to decode AArch64 instruction
-// bitpatterns into MCInsts (with the help of TableGenerated information from
-// the instruction definitions).
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "arm-disassembler"
-
-#include "AArch64.h"
-#include "AArch64RegisterInfo.h"
+#include "AArch64Disassembler.h"
+#include "AArch64ExternalSymbolizer.h"
 #include "AArch64Subtarget.h"
+#include "MCTargetDesc/AArch64AddressingModes.h"
 #include "Utils/AArch64BaseInfo.h"
-#include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCDisassembler.h"
-#include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCFixedLenDisassembler.h"
 #include "llvm/MC/MCInst.h"
-#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCFixedLenDisassembler.h"
 #include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/MemoryObject.h"
 #include "llvm/Support/TargetRegistry.h"
-#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/ErrorHandling.h"
 
 using namespace llvm;
 
-typedef MCDisassembler::DecodeStatus DecodeStatus;
-
-namespace {
-/// AArch64 disassembler for all AArch64 platforms.
-class AArch64Disassembler : public MCDisassembler {
-  OwningPtr<const MCRegisterInfo> RegInfo;
-public:
-  /// Initializes the disassembler.
-  ///
-  AArch64Disassembler(const MCSubtargetInfo &STI, const MCRegisterInfo *Info)
-    : MCDisassembler(STI), RegInfo(Info) {
-  }
-
-  ~AArch64Disassembler() {}
+#define DEBUG_TYPE "aarch64-disassembler"
 
-  /// See MCDisassembler.
-  DecodeStatus getInstruction(MCInst &instr,
-                              uint64_t &size,
-                              const MemoryObject &region,
-                              uint64_t address,
-                              raw_ostream &vStream,
-                              raw_ostream &cStream) const;
+// Pull DecodeStatus and its enum values into the global namespace.
+typedef llvm::MCDisassembler::DecodeStatus DecodeStatus;
 
-  const MCRegisterInfo *getRegInfo() const { return RegInfo.get(); }
-};
-
-}
-
-// Forward-declarations used in the auto-generated files.
-static DecodeStatus DecodeGPR64RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
-                                         uint64_t Address, const void *Decoder);
-static DecodeStatus
-DecodeGPR64xspRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
-                            uint64_t Address, const void *Decoder);
-
-static DecodeStatus DecodeGPR32RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
-                                         uint64_t Address, const void *Decoder);
-static DecodeStatus
-DecodeGPR32wspRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
-                            uint64_t Address, const void *Decoder);
-
-static DecodeStatus DecodeFPR8RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
-                                         uint64_t Address, const void *Decoder);
-static DecodeStatus DecodeFPR16RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
-                                         uint64_t Address, const void *Decoder);
-static DecodeStatus DecodeFPR32RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
-                                         uint64_t Address, const void *Decoder);
-static DecodeStatus DecodeFPR64RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
-                                         uint64_t Address, const void *Decoder);
-static DecodeStatus DecodeFPR64LoRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
-                                         uint64_t Address, const void *Decoder);
+// Forward declare these because the autogenerated code will reference them.
+// Definitions are further down.
 static DecodeStatus DecodeFPR128RegisterClass(llvm::MCInst &Inst,
                                               unsigned RegNo, uint64_t Address,
                                               const void *Decoder);
-static DecodeStatus DecodeFPR128LoRegisterClass(llvm::MCInst &Inst,
-                                                unsigned RegNo, uint64_t Address,
-                                                const void *Decoder);
-
-static DecodeStatus DecodeGPR64noxzrRegisterClass(llvm::MCInst &Inst,
-                                                  unsigned RegNo,
-                                                  uint64_t Address,
-                                                  const void *Decoder);
-
-static DecodeStatus DecodeDPairRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+static DecodeStatus DecodeFPR128_loRegisterClass(llvm::MCInst &Inst,
+                                                 unsigned RegNo,
+                                                 uint64_t Address,
+                                                 const void *Decoder);
+static DecodeStatus DecodeFPR64RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
                                              uint64_t Address,
                                              const void *Decoder);
-static DecodeStatus DecodeQPairRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+static DecodeStatus DecodeFPR32RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
                                              uint64_t Address,
                                              const void *Decoder);
-static DecodeStatus DecodeDTripleRegisterClass(llvm::MCInst &Inst,
-                                               unsigned RegNo, uint64_t Address,
-                                               const void *Decoder);
-static DecodeStatus DecodeQTripleRegisterClass(llvm::MCInst &Inst,
-                                               unsigned RegNo, uint64_t Address,
-                                               const void *Decoder);
-static DecodeStatus DecodeDQuadRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+static DecodeStatus DecodeFPR16RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
                                              uint64_t Address,
                                              const void *Decoder);
-static DecodeStatus DecodeQQuadRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+static DecodeStatus DecodeFPR8RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+                                            uint64_t Address,
+                                            const void *Decoder);
+static DecodeStatus DecodeGPR64RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
                                              uint64_t Address,
                                              const void *Decoder);
-
-static DecodeStatus DecodeAddrRegExtendOperand(llvm::MCInst &Inst,
-                                               unsigned OptionHiS,
-                                               uint64_t Address,
-                                               const void *Decoder);
-
-
-static DecodeStatus DecodeBitfield32ImmOperand(llvm::MCInst &Inst,
-                                               unsigned Imm6Bits,
-                                               uint64_t Address,
+static DecodeStatus DecodeGPR64spRegisterClass(llvm::MCInst &Inst,
+                                               unsigned RegNo, uint64_t Address,
                                                const void *Decoder);
-
-static DecodeStatus DecodeCVT32FixedPosOperand(llvm::MCInst &Inst,
-                                               unsigned Imm6Bits,
-                                               uint64_t Address,
+static DecodeStatus DecodeGPR32RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+                                             uint64_t Address,
+                                             const void *Decoder);
+static DecodeStatus DecodeGPR32spRegisterClass(llvm::MCInst &Inst,
+                                               unsigned RegNo, uint64_t Address,
                                                const void *Decoder);
-
-static DecodeStatus DecodeFPZeroOperand(llvm::MCInst &Inst,
-                                        unsigned RmBits,
-                                        uint64_t Address,
-                                        const void *Decoder);
-
-static DecodeStatus DecodeShiftRightImm8(MCInst &Inst, unsigned Val,
-                                         uint64_t Address, const void *Decoder);
-static DecodeStatus DecodeShiftRightImm16(MCInst &Inst, unsigned Val,
-                                          uint64_t Address,
-                                          const void *Decoder);
-static DecodeStatus DecodeShiftRightImm32(MCInst &Inst, unsigned Val,
-                                          uint64_t Address,
-                                          const void *Decoder);
-static DecodeStatus DecodeShiftRightImm64(MCInst &Inst, unsigned Val,
+static DecodeStatus DecodeQQRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
                                           uint64_t Address,
                                           const void *Decoder);
-
-static DecodeStatus DecodeShiftLeftImm8(MCInst &Inst, unsigned Val,
-                                        uint64_t Address, const void *Decoder);
-static DecodeStatus DecodeShiftLeftImm16(MCInst &Inst, unsigned Val,
-                                         uint64_t Address,
-                                         const void *Decoder);
-static DecodeStatus DecodeShiftLeftImm32(MCInst &Inst, unsigned Val,
-                                         uint64_t Address,
-                                         const void *Decoder);
-static DecodeStatus DecodeShiftLeftImm64(MCInst &Inst, unsigned Val,
-                                         uint64_t Address,
-                                         const void *Decoder);
-
-template<int RegWidth>
-static DecodeStatus DecodeMoveWideImmOperand(llvm::MCInst &Inst,
-                                             unsigned FullImm,
-                                             uint64_t Address,
-                                             const void *Decoder);
-
-template<int RegWidth>
-static DecodeStatus DecodeLogicalImmOperand(llvm::MCInst &Inst,
-                                            unsigned Bits,
+static DecodeStatus DecodeQQQRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+                                           uint64_t Address,
+                                           const void *Decoder);
+static DecodeStatus DecodeQQQQRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
                                             uint64_t Address,
                                             const void *Decoder);
-
-static DecodeStatus DecodeRegExtendOperand(llvm::MCInst &Inst,
-                                           unsigned ShiftAmount,
+static DecodeStatus DecodeDDRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
+                                          uint64_t Address,
+                                          const void *Decoder);
+static DecodeStatus DecodeDDDRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
                                            uint64_t Address,
                                            const void *Decoder);
-template <A64SE::ShiftExtSpecifiers Ext, bool IsHalf>
-static DecodeStatus
-DecodeNeonMovImmShiftOperand(llvm::MCInst &Inst, unsigned ShiftAmount,
-                             uint64_t Address, const void *Decoder);
-
-static DecodeStatus Decode32BitShiftOperand(llvm::MCInst &Inst,
-                                            unsigned ShiftAmount,
+static DecodeStatus DecodeDDDDRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
                                             uint64_t Address,
                                             const void *Decoder);
-static DecodeStatus DecodeBitfieldInstruction(llvm::MCInst &Inst, unsigned Insn,
+
+static DecodeStatus DecodeFixedPointScaleImm32(llvm::MCInst &Inst, unsigned Imm,
+                                               uint64_t Address,
+                                               const void *Decoder);
+static DecodeStatus DecodeFixedPointScaleImm64(llvm::MCInst &Inst, unsigned Imm,
+                                               uint64_t Address,
+                                               const void *Decoder);
+static DecodeStatus DecodePCRelLabel19(llvm::MCInst &Inst, unsigned Imm,
+                                       uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeMemExtend(llvm::MCInst &Inst, unsigned Imm,
+                                    uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeMRSSystemRegister(llvm::MCInst &Inst, unsigned Imm,
+                                            uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeMSRSystemRegister(llvm::MCInst &Inst, unsigned Imm,
+                                            uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeThreeAddrSRegInstruction(llvm::MCInst &Inst,
+                                                   uint32_t insn,
+                                                   uint64_t Address,
+                                                   const void *Decoder);
+static DecodeStatus DecodeMoveImmInstruction(llvm::MCInst &Inst, uint32_t insn,
+                                             uint64_t Address,
+                                             const void *Decoder);
+static DecodeStatus DecodeUnsignedLdStInstruction(llvm::MCInst &Inst,
+                                                  uint32_t insn,
+                                                  uint64_t Address,
+                                                  const void *Decoder);
+static DecodeStatus DecodeSignedLdStInstruction(llvm::MCInst &Inst,
+                                                uint32_t insn, uint64_t Address,
+                                                const void *Decoder);
+static DecodeStatus DecodeExclusiveLdStInstruction(llvm::MCInst &Inst,
+                                                   uint32_t insn,
+                                                   uint64_t Address,
+                                                   const void *Decoder);
+static DecodeStatus DecodePairLdStInstruction(llvm::MCInst &Inst, uint32_t insn,
                                               uint64_t Address,
                                               const void *Decoder);
-
-static DecodeStatus DecodeFMOVLaneInstruction(llvm::MCInst &Inst, unsigned Insn,
+static DecodeStatus DecodeAddSubERegInstruction(llvm::MCInst &Inst,
+                                                uint32_t insn, uint64_t Address,
+                                                const void *Decoder);
+static DecodeStatus DecodeLogicalImmInstruction(llvm::MCInst &Inst,
+                                                uint32_t insn, uint64_t Address,
+                                                const void *Decoder);
+static DecodeStatus DecodeModImmInstruction(llvm::MCInst &Inst, uint32_t insn,
+                                            uint64_t Address,
+                                            const void *Decoder);
+static DecodeStatus DecodeModImmTiedInstruction(llvm::MCInst &Inst,
+                                                uint32_t insn, uint64_t Address,
+                                                const void *Decoder);
+static DecodeStatus DecodeAdrInstruction(llvm::MCInst &Inst, uint32_t insn,
+                                         uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeBaseAddSubImm(llvm::MCInst &Inst, uint32_t insn,
+                                        uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeUnconditionalBranch(llvm::MCInst &Inst, uint32_t insn,
                                               uint64_t Address,
                                               const void *Decoder);
+static DecodeStatus DecodeSystemPStateInstruction(llvm::MCInst &Inst,
+                                                  uint32_t insn,
+                                                  uint64_t Address,
+                                                  const void *Decoder);
+static DecodeStatus DecodeTestAndBranch(llvm::MCInst &Inst, uint32_t insn,
+                                        uint64_t Address, const void *Decoder);
 
-static DecodeStatus DecodeLDSTPairInstruction(llvm::MCInst &Inst,
-                                              unsigned Insn,
+static DecodeStatus DecodeFMOVLaneInstruction(llvm::MCInst &Inst, unsigned Insn,
                                               uint64_t Address,
                                               const void *Decoder);
-
-static DecodeStatus DecodeLoadPairExclusiveInstruction(llvm::MCInst &Inst,
-                                                       unsigned Val,
-                                                       uint64_t Address,
-                                                       const void *Decoder);
-
-template<typename SomeNamedImmMapper>
-static DecodeStatus DecodeNamedImmOperand(llvm::MCInst &Inst,
-                                          unsigned Val,
-                                          uint64_t Address,
-                                          const void *Decoder);
-
-static DecodeStatus
-DecodeSysRegOperand(const A64SysReg::SysRegMapper &InstMapper,
-                    llvm::MCInst &Inst, unsigned Val,
-                    uint64_t Address, const void *Decoder);
-
-static DecodeStatus DecodeMRSOperand(llvm::MCInst &Inst,
-                                     unsigned Val,
-                                     uint64_t Address,
-                                     const void *Decoder);
-
-static DecodeStatus DecodeMSROperand(llvm::MCInst &Inst,
-                                     unsigned Val,
-                                     uint64_t Address,
-                                     const void *Decoder);
-
-
-static DecodeStatus DecodeSingleIndexedInstruction(llvm::MCInst &Inst,
-                                                   unsigned Val,
-                                                   uint64_t Address,
-                                                   const void *Decoder);
-
-static DecodeStatus DecodeVLDSTPostInstruction(MCInst &Inst, unsigned Val,
-                                               uint64_t Address,
+static DecodeStatus DecodeVecShiftR64Imm(llvm::MCInst &Inst, unsigned Imm,
+                                         uint64_t Addr, const void *Decoder);
+static DecodeStatus DecodeVecShiftR64ImmNarrow(llvm::MCInst &Inst, unsigned Imm,
+                                               uint64_t Addr,
                                                const void *Decoder);
-
-static DecodeStatus DecodeVLDSTLanePostInstruction(MCInst &Inst, unsigned Insn,
-                                                   uint64_t Address,
-                                                   const void *Decoder);
-
-static DecodeStatus DecodeSHLLInstruction(MCInst &Inst, unsigned Insn,
-                                          uint64_t Address,
-                                          const void *Decoder);
-
-static bool Check(DecodeStatus &Out, DecodeStatus In);
-
-#include "AArch64GenDisassemblerTables.inc"
+static DecodeStatus DecodeVecShiftR32Imm(llvm::MCInst &Inst, unsigned Imm,
+                                         uint64_t Addr, const void *Decoder);
+static DecodeStatus DecodeVecShiftR32ImmNarrow(llvm::MCInst &Inst, unsigned Imm,
+                                               uint64_t Addr,
+                                               const void *Decoder);
+static DecodeStatus DecodeVecShiftR16Imm(llvm::MCInst &Inst, unsigned Imm,
+                                         uint64_t Addr, const void *Decoder);
+static DecodeStatus DecodeVecShiftR16ImmNarrow(llvm::MCInst &Inst, unsigned Imm,
+                                               uint64_t Addr,
+                                               const void *Decoder);
+static DecodeStatus DecodeVecShiftR8Imm(llvm::MCInst &Inst, unsigned Imm,
+                                        uint64_t Addr, const void *Decoder);
+static DecodeStatus DecodeVecShiftL64Imm(llvm::MCInst &Inst, unsigned Imm,
+                                         uint64_t Addr, const void *Decoder);
+static DecodeStatus DecodeVecShiftL32Imm(llvm::MCInst &Inst, unsigned Imm,
+                                         uint64_t Addr, const void *Decoder);
+static DecodeStatus DecodeVecShiftL16Imm(llvm::MCInst &Inst, unsigned Imm,
+                                         uint64_t Addr, const void *Decoder);
+static DecodeStatus DecodeVecShiftL8Imm(llvm::MCInst &Inst, unsigned Imm,
+                                        uint64_t Addr, const void *Decoder);
 
 static bool Check(DecodeStatus &Out, DecodeStatus In) {
   switch (In) {
@@ -261,486 +186,479 @@ static bool Check(DecodeStatus &Out, DecodeStatus In) {
   llvm_unreachable("Invalid DecodeStatus!");
 }
 
+#include "AArch64GenDisassemblerTables.inc"
+#include "AArch64GenInstrInfo.inc"
+
+#define Success llvm::MCDisassembler::Success
+#define Fail llvm::MCDisassembler::Fail
+#define SoftFail llvm::MCDisassembler::SoftFail
+
+static MCDisassembler *createAArch64Disassembler(const Target &T,
+                                               const MCSubtargetInfo &STI,
+                                               MCContext &Ctx) {
+  return new AArch64Disassembler(STI, Ctx);
+}
+
 DecodeStatus AArch64Disassembler::getInstruction(MCInst &MI, uint64_t &Size,
-                                                 const MemoryObject &Region,
-                                                 uint64_t Address,
-                                                 raw_ostream &os,
-                                                 raw_ostream &cs) const {
+                                               const MemoryObject &Region,
+                                               uint64_t Address,
+                                               raw_ostream &os,
+                                               raw_ostream &cs) const {
   CommentStream = &cs;
 
   uint8_t bytes[4];
 
+  Size = 0;
   // We want to read exactly 4 bytes of data.
-  if (Region.readBytes(Address, 4, bytes) == -1) {
-    Size = 0;
-    return MCDisassembler::Fail;
-  }
+  if (Region.readBytes(Address, 4, (uint8_t *)bytes) == -1)
+    return Fail;
+  Size = 4;
 
   // Encoded as a small-endian 32-bit word in the stream.
-  uint32_t insn = (bytes[3] << 24) |
-    (bytes[2] << 16) |
-    (bytes[1] <<  8) |
-    (bytes[0] <<  0);
+  uint32_t insn =
+      (bytes[3] << 24) | (bytes[2] << 16) | (bytes[1] << 8) | (bytes[0] << 0);
 
   // Calling the auto-generated decoder function.
-  DecodeStatus result = decodeInstruction(DecoderTableA6432, MI, insn, Address,
-                                          this, STI);
-  if (result != MCDisassembler::Fail) {
-    Size = 4;
-    return result;
-  }
-
-  MI.clear();
-  Size = 0;
-  return MCDisassembler::Fail;
+  return decodeInstruction(DecoderTable32, MI, insn, Address, this, STI);
 }
 
-static unsigned getReg(const void *D, unsigned RC, unsigned RegNo) {
-  const AArch64Disassembler *Dis = static_cast<const AArch64Disassembler*>(D);
-  return Dis->getRegInfo()->getRegClass(RC).getRegister(RegNo);
+static MCSymbolizer *
+createAArch64ExternalSymbolizer(StringRef TT, LLVMOpInfoCallback GetOpInfo,
+                              LLVMSymbolLookupCallback SymbolLookUp,
+                              void *DisInfo, MCContext *Ctx,
+                              MCRelocationInfo *RelInfo) {
+  return new llvm::AArch64ExternalSymbolizer(
+                                     *Ctx,
+                                     std::unique_ptr<MCRelocationInfo>(RelInfo),
+                                     GetOpInfo, SymbolLookUp, DisInfo);
 }
 
-static DecodeStatus DecodeGPR64RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
-                                        uint64_t Address, const void *Decoder) {
-  if (RegNo > 31)
-    return MCDisassembler::Fail;
+extern "C" void LLVMInitializeAArch64Disassembler() {
+  TargetRegistry::RegisterMCDisassembler(TheAArch64leTarget,
+                                         createAArch64Disassembler);
+  TargetRegistry::RegisterMCDisassembler(TheAArch64beTarget,
+                                         createAArch64Disassembler);
+  TargetRegistry::RegisterMCSymbolizer(TheAArch64leTarget,
+                                       createAArch64ExternalSymbolizer);
+  TargetRegistry::RegisterMCSymbolizer(TheAArch64beTarget,
+                                       createAArch64ExternalSymbolizer);
 
-  uint16_t Register = getReg(Decoder, AArch64::GPR64RegClassID, RegNo);
-  Inst.addOperand(MCOperand::CreateReg(Register));
-  return MCDisassembler::Success;
+  TargetRegistry::RegisterMCDisassembler(TheARM64leTarget,
+                                         createAArch64Disassembler);
+  TargetRegistry::RegisterMCDisassembler(TheARM64beTarget,
+                                         createAArch64Disassembler);
+  TargetRegistry::RegisterMCSymbolizer(TheARM64leTarget,
+                                       createAArch64ExternalSymbolizer);
+  TargetRegistry::RegisterMCSymbolizer(TheARM64beTarget,
+                                       createAArch64ExternalSymbolizer);
 }
 
-static DecodeStatus
-DecodeGPR64xspRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
-                            uint64_t Address, const void *Decoder) {
-  if (RegNo > 31)
-    return MCDisassembler::Fail;
-
-  uint16_t Register = getReg(Decoder, AArch64::GPR64xspRegClassID, RegNo);
-  Inst.addOperand(MCOperand::CreateReg(Register));
-  return MCDisassembler::Success;
-}
+static const unsigned FPR128DecoderTable[] = {
+    AArch64::Q0,  AArch64::Q1,  AArch64::Q2,  AArch64::Q3,  AArch64::Q4,
+    AArch64::Q5,  AArch64::Q6,  AArch64::Q7,  AArch64::Q8,  AArch64::Q9,
+    AArch64::Q10, AArch64::Q11, AArch64::Q12, AArch64::Q13, AArch64::Q14,
+    AArch64::Q15, AArch64::Q16, AArch64::Q17, AArch64::Q18, AArch64::Q19,
+    AArch64::Q20, AArch64::Q21, AArch64::Q22, AArch64::Q23, AArch64::Q24,
+    AArch64::Q25, AArch64::Q26, AArch64::Q27, AArch64::Q28, AArch64::Q29,
+    AArch64::Q30, AArch64::Q31
+};
 
-static DecodeStatus DecodeGPR32RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
-                                             uint64_t Address,
-                                             const void *Decoder) {
+static DecodeStatus DecodeFPR128RegisterClass(MCInst &Inst, unsigned RegNo,
+                                              uint64_t Addr,
+                                              const void *Decoder) {
   if (RegNo > 31)
-    return MCDisassembler::Fail;
+    return Fail;
 
-  uint16_t Register = getReg(Decoder, AArch64::GPR32RegClassID, RegNo);
+  unsigned Register = FPR128DecoderTable[RegNo];
   Inst.addOperand(MCOperand::CreateReg(Register));
-  return MCDisassembler::Success;
+  return Success;
 }
 
-static DecodeStatus
-DecodeGPR32wspRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
-                            uint64_t Address, const void *Decoder) {
-  if (RegNo > 31)
-    return MCDisassembler::Fail;
-
-  uint16_t Register = getReg(Decoder, AArch64::GPR32wspRegClassID, RegNo);
-  Inst.addOperand(MCOperand::CreateReg(Register));
-  return MCDisassembler::Success;
+static DecodeStatus DecodeFPR128_loRegisterClass(MCInst &Inst, unsigned RegNo,
+                                                 uint64_t Addr,
+                                                 const void *Decoder) {
+  if (RegNo > 15)
+    return Fail;
+  return DecodeFPR128RegisterClass(Inst, RegNo, Addr, Decoder);
 }
 
-static DecodeStatus
-DecodeFPR8RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
-                            uint64_t Address, const void *Decoder) {
-  if (RegNo > 31)
-    return MCDisassembler::Fail;
-
-  uint16_t Register = getReg(Decoder, AArch64::FPR8RegClassID, RegNo);
-  Inst.addOperand(MCOperand::CreateReg(Register));
-  return MCDisassembler::Success;
-}
+static const unsigned FPR64DecoderTable[] = {
+    AArch64::D0,  AArch64::D1,  AArch64::D2,  AArch64::D3,  AArch64::D4,
+    AArch64::D5,  AArch64::D6,  AArch64::D7,  AArch64::D8,  AArch64::D9,
+    AArch64::D10, AArch64::D11, AArch64::D12, AArch64::D13, AArch64::D14,
+    AArch64::D15, AArch64::D16, AArch64::D17, AArch64::D18, AArch64::D19,
+    AArch64::D20, AArch64::D21, AArch64::D22, AArch64::D23, AArch64::D24,
+    AArch64::D25, AArch64::D26, AArch64::D27, AArch64::D28, AArch64::D29,
+    AArch64::D30, AArch64::D31
+};
 
-static DecodeStatus
-DecodeFPR16RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
-                            uint64_t Address, const void *Decoder) {
+static DecodeStatus DecodeFPR64RegisterClass(MCInst &Inst, unsigned RegNo,
+                                             uint64_t Addr,
+                                             const void *Decoder) {
   if (RegNo > 31)
-    return MCDisassembler::Fail;
+    return Fail;
 
-  uint16_t Register = getReg(Decoder, AArch64::FPR16RegClassID, RegNo);
+  unsigned Register = FPR64DecoderTable[RegNo];
   Inst.addOperand(MCOperand::CreateReg(Register));
-  return MCDisassembler::Success;
+  return Success;
 }
 
+static const unsigned FPR32DecoderTable[] = {
+    AArch64::S0,  AArch64::S1,  AArch64::S2,  AArch64::S3,  AArch64::S4,
+    AArch64::S5,  AArch64::S6,  AArch64::S7,  AArch64::S8,  AArch64::S9,
+    AArch64::S10, AArch64::S11, AArch64::S12, AArch64::S13, AArch64::S14,
+    AArch64::S15, AArch64::S16, AArch64::S17, AArch64::S18, AArch64::S19,
+    AArch64::S20, AArch64::S21, AArch64::S22, AArch64::S23, AArch64::S24,
+    AArch64::S25, AArch64::S26, AArch64::S27, AArch64::S28, AArch64::S29,
+    AArch64::S30, AArch64::S31
+};
 
-static DecodeStatus
-DecodeFPR32RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
-                            uint64_t Address, const void *Decoder) {
+static DecodeStatus DecodeFPR32RegisterClass(MCInst &Inst, unsigned RegNo,
+                                             uint64_t Addr,
+                                             const void *Decoder) {
   if (RegNo > 31)
-    return MCDisassembler::Fail;
+    return Fail;
 
-  uint16_t Register = getReg(Decoder, AArch64::FPR32RegClassID, RegNo);
+  unsigned Register = FPR32DecoderTable[RegNo];
   Inst.addOperand(MCOperand::CreateReg(Register));
-  return MCDisassembler::Success;
+  return Success;
 }
 
-static DecodeStatus
-DecodeFPR64RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
-                            uint64_t Address, const void *Decoder) {
+static const unsigned FPR16DecoderTable[] = {
+    AArch64::H0,  AArch64::H1,  AArch64::H2,  AArch64::H3,  AArch64::H4,
+    AArch64::H5,  AArch64::H6,  AArch64::H7,  AArch64::H8,  AArch64::H9,
+    AArch64::H10, AArch64::H11, AArch64::H12, AArch64::H13, AArch64::H14,
+    AArch64::H15, AArch64::H16, AArch64::H17, AArch64::H18, AArch64::H19,
+    AArch64::H20, AArch64::H21, AArch64::H22, AArch64::H23, AArch64::H24,
+    AArch64::H25, AArch64::H26, AArch64::H27, AArch64::H28, AArch64::H29,
+    AArch64::H30, AArch64::H31
+};
+
+static DecodeStatus DecodeFPR16RegisterClass(MCInst &Inst, unsigned RegNo,
+                                             uint64_t Addr,
+                                             const void *Decoder) {
   if (RegNo > 31)
-    return MCDisassembler::Fail;
+    return Fail;
 
-  uint16_t Register = getReg(Decoder, AArch64::FPR64RegClassID, RegNo);
+  unsigned Register = FPR16DecoderTable[RegNo];
   Inst.addOperand(MCOperand::CreateReg(Register));
-  return MCDisassembler::Success;
+  return Success;
 }
 
-static DecodeStatus
-DecodeFPR64LoRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
-                            uint64_t Address, const void *Decoder) {
-  if (RegNo > 15)
-    return MCDisassembler::Fail;
-
-  return DecodeFPR64RegisterClass(Inst, RegNo, Address, Decoder);
-}
+static const unsigned FPR8DecoderTable[] = {
+    AArch64::B0,  AArch64::B1,  AArch64::B2,  AArch64::B3,  AArch64::B4,
+    AArch64::B5,  AArch64::B6,  AArch64::B7,  AArch64::B8,  AArch64::B9,
+    AArch64::B10, AArch64::B11, AArch64::B12, AArch64::B13, AArch64::B14,
+    AArch64::B15, AArch64::B16, AArch64::B17, AArch64::B18, AArch64::B19,
+    AArch64::B20, AArch64::B21, AArch64::B22, AArch64::B23, AArch64::B24,
+    AArch64::B25, AArch64::B26, AArch64::B27, AArch64::B28, AArch64::B29,
+    AArch64::B30, AArch64::B31
+};
 
-static DecodeStatus
-DecodeFPR128RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
-                            uint64_t Address, const void *Decoder) {
+static DecodeStatus DecodeFPR8RegisterClass(MCInst &Inst, unsigned RegNo,
+                                            uint64_t Addr,
+                                            const void *Decoder) {
   if (RegNo > 31)
-    return MCDisassembler::Fail;
+    return Fail;
 
-  uint16_t Register = getReg(Decoder, AArch64::FPR128RegClassID, RegNo);
+  unsigned Register = FPR8DecoderTable[RegNo];
   Inst.addOperand(MCOperand::CreateReg(Register));
-  return MCDisassembler::Success;
+  return Success;
 }
 
-static DecodeStatus
-DecodeFPR128LoRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
-                            uint64_t Address, const void *Decoder) {
-  if (RegNo > 15)
-    return MCDisassembler::Fail;
-
-  return DecodeFPR128RegisterClass(Inst, RegNo, Address, Decoder);
-}
+static const unsigned GPR64DecoderTable[] = {
+    AArch64::X0,  AArch64::X1,  AArch64::X2,  AArch64::X3,  AArch64::X4,
+    AArch64::X5,  AArch64::X6,  AArch64::X7,  AArch64::X8,  AArch64::X9,
+    AArch64::X10, AArch64::X11, AArch64::X12, AArch64::X13, AArch64::X14,
+    AArch64::X15, AArch64::X16, AArch64::X17, AArch64::X18, AArch64::X19,
+    AArch64::X20, AArch64::X21, AArch64::X22, AArch64::X23, AArch64::X24,
+    AArch64::X25, AArch64::X26, AArch64::X27, AArch64::X28, AArch64::FP,
+    AArch64::LR,  AArch64::XZR
+};
 
-static DecodeStatus DecodeGPR64noxzrRegisterClass(llvm::MCInst &Inst,
-                                                  unsigned RegNo,
-                                                  uint64_t Address,
-                                                  const void *Decoder) {
-  if (RegNo > 30)
-    return MCDisassembler::Fail;
+static DecodeStatus DecodeGPR64RegisterClass(MCInst &Inst, unsigned RegNo,
+                                             uint64_t Addr,
+                                             const void *Decoder) {
+  if (RegNo > 31)
+    return Fail;
 
-  uint16_t Register = getReg(Decoder, AArch64::GPR64noxzrRegClassID, RegNo);
+  unsigned Register = GPR64DecoderTable[RegNo];
   Inst.addOperand(MCOperand::CreateReg(Register));
-  return MCDisassembler::Success;
+  return Success;
 }
 
-static DecodeStatus DecodeRegisterClassByID(llvm::MCInst &Inst, unsigned RegNo,
-                                            unsigned RegID,
-                                            const void *Decoder) {
+static DecodeStatus DecodeGPR64spRegisterClass(MCInst &Inst, unsigned RegNo,
+                                               uint64_t Addr,
+                                               const void *Decoder) {
   if (RegNo > 31)
-    return MCDisassembler::Fail;
-
-  uint16_t Register = getReg(Decoder, RegID, RegNo);
+    return Fail;
+  unsigned Register = GPR64DecoderTable[RegNo];
+  if (Register == AArch64::XZR)
+    Register = AArch64::SP;
   Inst.addOperand(MCOperand::CreateReg(Register));
-  return MCDisassembler::Success;
+  return Success;
 }
 
-static DecodeStatus DecodeDPairRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
-                                             uint64_t Address,
-                                             const void *Decoder) {
-  return DecodeRegisterClassByID(Inst, RegNo, AArch64::DPairRegClassID,
-                                 Decoder);
-}
+static const unsigned GPR32DecoderTable[] = {
+    AArch64::W0,  AArch64::W1,  AArch64::W2,  AArch64::W3,  AArch64::W4,
+    AArch64::W5,  AArch64::W6,  AArch64::W7,  AArch64::W8,  AArch64::W9,
+    AArch64::W10, AArch64::W11, AArch64::W12, AArch64::W13, AArch64::W14,
+    AArch64::W15, AArch64::W16, AArch64::W17, AArch64::W18, AArch64::W19,
+    AArch64::W20, AArch64::W21, AArch64::W22, AArch64::W23, AArch64::W24,
+    AArch64::W25, AArch64::W26, AArch64::W27, AArch64::W28, AArch64::W29,
+    AArch64::W30, AArch64::WZR
+};
 
-static DecodeStatus DecodeQPairRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
-                                             uint64_t Address,
+static DecodeStatus DecodeGPR32RegisterClass(MCInst &Inst, unsigned RegNo,
+                                             uint64_t Addr,
                                              const void *Decoder) {
-  return DecodeRegisterClassByID(Inst, RegNo, AArch64::QPairRegClassID,
-                                 Decoder);
-}
+  if (RegNo > 31)
+    return Fail;
 
-static DecodeStatus DecodeDTripleRegisterClass(llvm::MCInst &Inst,
-                                               unsigned RegNo, uint64_t Address,
-                                               const void *Decoder) {
-  return DecodeRegisterClassByID(Inst, RegNo, AArch64::DTripleRegClassID,
-                                 Decoder);
+  unsigned Register = GPR32DecoderTable[RegNo];
+  Inst.addOperand(MCOperand::CreateReg(Register));
+  return Success;
 }
 
-static DecodeStatus DecodeQTripleRegisterClass(llvm::MCInst &Inst,
-                                               unsigned RegNo, uint64_t Address,
+static DecodeStatus DecodeGPR32spRegisterClass(MCInst &Inst, unsigned RegNo,
+                                               uint64_t Addr,
                                                const void *Decoder) {
-  return DecodeRegisterClassByID(Inst, RegNo, AArch64::QTripleRegClassID,
-                                 Decoder);
-}
+  if (RegNo > 31)
+    return Fail;
 
-static DecodeStatus DecodeDQuadRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
-                                             uint64_t Address,
-                                             const void *Decoder) {
-  return DecodeRegisterClassByID(Inst, RegNo, AArch64::DQuadRegClassID,
-                                 Decoder);
+  unsigned Register = GPR32DecoderTable[RegNo];
+  if (Register == AArch64::WZR)
+    Register = AArch64::WSP;
+  Inst.addOperand(MCOperand::CreateReg(Register));
+  return Success;
 }
 
-static DecodeStatus DecodeQQuadRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
-                                             uint64_t Address,
-                                             const void *Decoder) {
-  return DecodeRegisterClassByID(Inst, RegNo, AArch64::QQuadRegClassID,
-                                 Decoder);
-}
+static const unsigned VectorDecoderTable[] = {
+    AArch64::Q0,  AArch64::Q1,  AArch64::Q2,  AArch64::Q3,  AArch64::Q4,
+    AArch64::Q5,  AArch64::Q6,  AArch64::Q7,  AArch64::Q8,  AArch64::Q9,
+    AArch64::Q10, AArch64::Q11, AArch64::Q12, AArch64::Q13, AArch64::Q14,
+    AArch64::Q15, AArch64::Q16, AArch64::Q17, AArch64::Q18, AArch64::Q19,
+    AArch64::Q20, AArch64::Q21, AArch64::Q22, AArch64::Q23, AArch64::Q24,
+    AArch64::Q25, AArch64::Q26, AArch64::Q27, AArch64::Q28, AArch64::Q29,
+    AArch64::Q30, AArch64::Q31
+};
 
-static DecodeStatus DecodeAddrRegExtendOperand(llvm::MCInst &Inst,
-                                               unsigned OptionHiS,
-                                               uint64_t Address,
-                                               const void *Decoder) {
-  // Option{1} must be 1. OptionHiS is made up of {Option{2}, Option{1},
-  // S}. Hence we want to check bit 1.
-  if (!(OptionHiS & 2))
-    return MCDisassembler::Fail;
+static DecodeStatus DecodeVectorRegisterClass(MCInst &Inst, unsigned RegNo,
+                                              uint64_t Addr,
+                                              const void *Decoder) {
+  if (RegNo > 31)
+    return Fail;
 
-  Inst.addOperand(MCOperand::CreateImm(OptionHiS));
-  return MCDisassembler::Success;
+  unsigned Register = VectorDecoderTable[RegNo];
+  Inst.addOperand(MCOperand::CreateReg(Register));
+  return Success;
 }
 
-static DecodeStatus DecodeBitfield32ImmOperand(llvm::MCInst &Inst,
-                                               unsigned Imm6Bits,
-                                               uint64_t Address,
-                                               const void *Decoder) {
-  // In the 32-bit variant, bit 6 must be zero. I.e. the immediate must be
-  // between 0 and 31.
-  if (Imm6Bits > 31)
-    return MCDisassembler::Fail;
+static const unsigned QQDecoderTable[] = {
+  AArch64::Q0_Q1,   AArch64::Q1_Q2,   AArch64::Q2_Q3,   AArch64::Q3_Q4,
+  AArch64::Q4_Q5,   AArch64::Q5_Q6,   AArch64::Q6_Q7,   AArch64::Q7_Q8,
+  AArch64::Q8_Q9,   AArch64::Q9_Q10,  AArch64::Q10_Q11, AArch64::Q11_Q12,
+  AArch64::Q12_Q13, AArch64::Q13_Q14, AArch64::Q14_Q15, AArch64::Q15_Q16,
+  AArch64::Q16_Q17, AArch64::Q17_Q18, AArch64::Q18_Q19, AArch64::Q19_Q20,
+  AArch64::Q20_Q21, AArch64::Q21_Q22, AArch64::Q22_Q23, AArch64::Q23_Q24,
+  AArch64::Q24_Q25, AArch64::Q25_Q26, AArch64::Q26_Q27, AArch64::Q27_Q28,
+  AArch64::Q28_Q29, AArch64::Q29_Q30, AArch64::Q30_Q31, AArch64::Q31_Q0
+};
 
-  Inst.addOperand(MCOperand::CreateImm(Imm6Bits));
-  return MCDisassembler::Success;
+static DecodeStatus DecodeQQRegisterClass(MCInst &Inst, unsigned RegNo,
+                                          uint64_t Addr, const void *Decoder) {
+  if (RegNo > 31)
+    return Fail;
+  unsigned Register = QQDecoderTable[RegNo];
+  Inst.addOperand(MCOperand::CreateReg(Register));
+  return Success;
 }
 
-static DecodeStatus DecodeCVT32FixedPosOperand(llvm::MCInst &Inst,
-                                               unsigned Imm6Bits,
-                                               uint64_t Address,
-                                               const void *Decoder) {
-  // 1 <= Imm <= 32. Encoded as 64 - Imm so: 63 >= Encoded >= 32.
-  if (Imm6Bits < 32)
-    return MCDisassembler::Fail;
-
-  Inst.addOperand(MCOperand::CreateImm(Imm6Bits));
-  return MCDisassembler::Success;
-}
+static const unsigned QQQDecoderTable[] = {
+  AArch64::Q0_Q1_Q2,    AArch64::Q1_Q2_Q3,    AArch64::Q2_Q3_Q4,
+  AArch64::Q3_Q4_Q5,    AArch64::Q4_Q5_Q6,    AArch64::Q5_Q6_Q7,
+  AArch64::Q6_Q7_Q8,    AArch64::Q7_Q8_Q9,    AArch64::Q8_Q9_Q10,
+  AArch64::Q9_Q10_Q11,  AArch64::Q10_Q11_Q12, AArch64::Q11_Q12_Q13,
+  AArch64::Q12_Q13_Q14, AArch64::Q13_Q14_Q15, AArch64::Q14_Q15_Q16,
+  AArch64::Q15_Q16_Q17, AArch64::Q16_Q17_Q18, AArch64::Q17_Q18_Q19,
+  AArch64::Q18_Q19_Q20, AArch64::Q19_Q20_Q21, AArch64::Q20_Q21_Q22,
+  AArch64::Q21_Q22_Q23, AArch64::Q22_Q23_Q24, AArch64::Q23_Q24_Q25,
+  AArch64::Q24_Q25_Q26, AArch64::Q25_Q26_Q27, AArch64::Q26_Q27_Q28,
+  AArch64::Q27_Q28_Q29, AArch64::Q28_Q29_Q30, AArch64::Q29_Q30_Q31,
+  AArch64::Q30_Q31_Q0,  AArch64::Q31_Q0_Q1
+};
 
-static DecodeStatus DecodeFPZeroOperand(llvm::MCInst &Inst,
-                                        unsigned RmBits,
-                                        uint64_t Address,
-                                        const void *Decoder) {
-  // Any bits are valid in the instruction (they're architecturally ignored),
-  // but a code generator should insert 0.
-  Inst.addOperand(MCOperand::CreateImm(0));
-  return MCDisassembler::Success;
+static DecodeStatus DecodeQQQRegisterClass(MCInst &Inst, unsigned RegNo,
+                                           uint64_t Addr, const void *Decoder) {
+  if (RegNo > 31)
+    return Fail;
+  unsigned Register = QQQDecoderTable[RegNo];
+  Inst.addOperand(MCOperand::CreateReg(Register));
+  return Success;
 }
 
-static DecodeStatus DecodeShiftRightImm8(MCInst &Inst, unsigned Val,
-                                         uint64_t Address,
-                                         const void *Decoder) {
-  Inst.addOperand(MCOperand::CreateImm(8 - Val));
-  return MCDisassembler::Success;
-}
+static const unsigned QQQQDecoderTable[] = {
+  AArch64::Q0_Q1_Q2_Q3,     AArch64::Q1_Q2_Q3_Q4,     AArch64::Q2_Q3_Q4_Q5,
+  AArch64::Q3_Q4_Q5_Q6,     AArch64::Q4_Q5_Q6_Q7,     AArch64::Q5_Q6_Q7_Q8,
+  AArch64::Q6_Q7_Q8_Q9,     AArch64::Q7_Q8_Q9_Q10,    AArch64::Q8_Q9_Q10_Q11,
+  AArch64::Q9_Q10_Q11_Q12,  AArch64::Q10_Q11_Q12_Q13, AArch64::Q11_Q12_Q13_Q14,
+  AArch64::Q12_Q13_Q14_Q15, AArch64::Q13_Q14_Q15_Q16, AArch64::Q14_Q15_Q16_Q17,
+  AArch64::Q15_Q16_Q17_Q18, AArch64::Q16_Q17_Q18_Q19, AArch64::Q17_Q18_Q19_Q20,
+  AArch64::Q18_Q19_Q20_Q21, AArch64::Q19_Q20_Q21_Q22, AArch64::Q20_Q21_Q22_Q23,
+  AArch64::Q21_Q22_Q23_Q24, AArch64::Q22_Q23_Q24_Q25, AArch64::Q23_Q24_Q25_Q26,
+  AArch64::Q24_Q25_Q26_Q27, AArch64::Q25_Q26_Q27_Q28, AArch64::Q26_Q27_Q28_Q29,
+  AArch64::Q27_Q28_Q29_Q30, AArch64::Q28_Q29_Q30_Q31, AArch64::Q29_Q30_Q31_Q0,
+  AArch64::Q30_Q31_Q0_Q1,   AArch64::Q31_Q0_Q1_Q2
+};
 
-static DecodeStatus DecodeShiftRightImm16(MCInst &Inst, unsigned Val,
-                                          uint64_t Address,
-                                          const void *Decoder) {
-  Inst.addOperand(MCOperand::CreateImm(16 - Val));
-  return MCDisassembler::Success;
+static DecodeStatus DecodeQQQQRegisterClass(MCInst &Inst, unsigned RegNo,
+                                            uint64_t Addr,
+                                            const void *Decoder) {
+  if (RegNo > 31)
+    return Fail;
+  unsigned Register = QQQQDecoderTable[RegNo];
+  Inst.addOperand(MCOperand::CreateReg(Register));
+  return Success;
 }
 
-static DecodeStatus DecodeShiftRightImm32(MCInst &Inst, unsigned Val,
-                                          uint64_t Address,
-                                          const void *Decoder) {
-  Inst.addOperand(MCOperand::CreateImm(32 - Val));
-  return MCDisassembler::Success;
-}
+static const unsigned DDDecoderTable[] = {
+  AArch64::D0_D1,   AArch64::D1_D2,   AArch64::D2_D3,   AArch64::D3_D4,
+  AArch64::D4_D5,   AArch64::D5_D6,   AArch64::D6_D7,   AArch64::D7_D8,
+  AArch64::D8_D9,   AArch64::D9_D10,  AArch64::D10_D11, AArch64::D11_D12,
+  AArch64::D12_D13, AArch64::D13_D14, AArch64::D14_D15, AArch64::D15_D16,
+  AArch64::D16_D17, AArch64::D17_D18, AArch64::D18_D19, AArch64::D19_D20,
+  AArch64::D20_D21, AArch64::D21_D22, AArch64::D22_D23, AArch64::D23_D24,
+  AArch64::D24_D25, AArch64::D25_D26, AArch64::D26_D27, AArch64::D27_D28,
+  AArch64::D28_D29, AArch64::D29_D30, AArch64::D30_D31, AArch64::D31_D0
+};
 
-static DecodeStatus DecodeShiftRightImm64(MCInst &Inst, unsigned Val,
-                                          uint64_t Address,
-                                          const void *Decoder) {
-  Inst.addOperand(MCOperand::CreateImm(64 - Val));
-  return MCDisassembler::Success;
+static DecodeStatus DecodeDDRegisterClass(MCInst &Inst, unsigned RegNo,
+                                          uint64_t Addr, const void *Decoder) {
+  if (RegNo > 31)
+    return Fail;
+  unsigned Register = DDDecoderTable[RegNo];
+  Inst.addOperand(MCOperand::CreateReg(Register));
+  return Success;
 }
 
-static DecodeStatus DecodeShiftLeftImm8(MCInst &Inst, unsigned Val,
-                                        uint64_t Address,
-                                        const void *Decoder) {
-  if (Val > 7)
-    return MCDisassembler::Fail;
+static const unsigned DDDDecoderTable[] = {
+  AArch64::D0_D1_D2,    AArch64::D1_D2_D3,    AArch64::D2_D3_D4,
+  AArch64::D3_D4_D5,    AArch64::D4_D5_D6,    AArch64::D5_D6_D7,
+  AArch64::D6_D7_D8,    AArch64::D7_D8_D9,    AArch64::D8_D9_D10,
+  AArch64::D9_D10_D11,  AArch64::D10_D11_D12, AArch64::D11_D12_D13,
+  AArch64::D12_D13_D14, AArch64::D13_D14_D15, AArch64::D14_D15_D16,
+  AArch64::D15_D16_D17, AArch64::D16_D17_D18, AArch64::D17_D18_D19,
+  AArch64::D18_D19_D20, AArch64::D19_D20_D21, AArch64::D20_D21_D22,
+  AArch64::D21_D22_D23, AArch64::D22_D23_D24, AArch64::D23_D24_D25,
+  AArch64::D24_D25_D26, AArch64::D25_D26_D27, AArch64::D26_D27_D28,
+  AArch64::D27_D28_D29, AArch64::D28_D29_D30, AArch64::D29_D30_D31,
+  AArch64::D30_D31_D0,  AArch64::D31_D0_D1
+};
 
-  Inst.addOperand(MCOperand::CreateImm(Val));
-  return MCDisassembler::Success;
+static DecodeStatus DecodeDDDRegisterClass(MCInst &Inst, unsigned RegNo,
+                                           uint64_t Addr, const void *Decoder) {
+  if (RegNo > 31)
+    return Fail;
+  unsigned Register = DDDDecoderTable[RegNo];
+  Inst.addOperand(MCOperand::CreateReg(Register));
+  return Success;
 }
 
-static DecodeStatus DecodeShiftLeftImm16(MCInst &Inst, unsigned Val,
-                                         uint64_t Address,
-                                         const void *Decoder) {
-  if (Val > 15)
-    return MCDisassembler::Fail;
+static const unsigned DDDDDecoderTable[] = {
+  AArch64::D0_D1_D2_D3,     AArch64::D1_D2_D3_D4,     AArch64::D2_D3_D4_D5,
+  AArch64::D3_D4_D5_D6,     AArch64::D4_D5_D6_D7,     AArch64::D5_D6_D7_D8,
+  AArch64::D6_D7_D8_D9,     AArch64::D7_D8_D9_D10,    AArch64::D8_D9_D10_D11,
+  AArch64::D9_D10_D11_D12,  AArch64::D10_D11_D12_D13, AArch64::D11_D12_D13_D14,
+  AArch64::D12_D13_D14_D15, AArch64::D13_D14_D15_D16, AArch64::D14_D15_D16_D17,
+  AArch64::D15_D16_D17_D18, AArch64::D16_D17_D18_D19, AArch64::D17_D18_D19_D20,
+  AArch64::D18_D19_D20_D21, AArch64::D19_D20_D21_D22, AArch64::D20_D21_D22_D23,
+  AArch64::D21_D22_D23_D24, AArch64::D22_D23_D24_D25, AArch64::D23_D24_D25_D26,
+  AArch64::D24_D25_D26_D27, AArch64::D25_D26_D27_D28, AArch64::D26_D27_D28_D29,
+  AArch64::D27_D28_D29_D30, AArch64::D28_D29_D30_D31, AArch64::D29_D30_D31_D0,
+  AArch64::D30_D31_D0_D1,   AArch64::D31_D0_D1_D2
+};
 
-  Inst.addOperand(MCOperand::CreateImm(Val));
-  return MCDisassembler::Success;
+static DecodeStatus DecodeDDDDRegisterClass(MCInst &Inst, unsigned RegNo,
+                                            uint64_t Addr,
+                                            const void *Decoder) {
+  if (RegNo > 31)
+    return Fail;
+  unsigned Register = DDDDDecoderTable[RegNo];
+  Inst.addOperand(MCOperand::CreateReg(Register));
+  return Success;
 }
 
-static DecodeStatus DecodeShiftLeftImm32(MCInst &Inst, unsigned Val,
-                                         uint64_t Address,
-                                         const void *Decoder) {
-  if (Val > 31)
-    return MCDisassembler::Fail;
-
-  Inst.addOperand(MCOperand::CreateImm(Val));
-  return MCDisassembler::Success;
+static DecodeStatus DecodeFixedPointScaleImm32(llvm::MCInst &Inst, unsigned Imm,
+                                               uint64_t Addr,
+                                               const void *Decoder) {
+  // scale{5} is asserted as 1 in tblgen.
+  Imm |= 0x20;  
+  Inst.addOperand(MCOperand::CreateImm(64 - Imm));
+  return Success;
 }
 
-static DecodeStatus DecodeShiftLeftImm64(MCInst &Inst, unsigned Val,
-                                         uint64_t Address,
-                                         const void *Decoder) {
-  if (Val > 63)
-    return MCDisassembler::Fail;
-
-  Inst.addOperand(MCOperand::CreateImm(Val));
-  return MCDisassembler::Success;
+static DecodeStatus DecodeFixedPointScaleImm64(llvm::MCInst &Inst, unsigned Imm,
+                                               uint64_t Addr,
+                                               const void *Decoder) {
+  Inst.addOperand(MCOperand::CreateImm(64 - Imm));
+  return Success;
 }
 
-template<int RegWidth>
-static DecodeStatus DecodeMoveWideImmOperand(llvm::MCInst &Inst,
-                                             unsigned FullImm,
-                                             uint64_t Address,
-                                             const void *Decoder) {
-  unsigned Imm16 = FullImm & 0xffff;
-  unsigned Shift = FullImm >> 16;
+static DecodeStatus DecodePCRelLabel19(llvm::MCInst &Inst, unsigned Imm,
+                                       uint64_t Addr, const void *Decoder) {
+  int64_t ImmVal = Imm;
+  const AArch64Disassembler *Dis =
+      static_cast<const AArch64Disassembler *>(Decoder);
+
+  // Sign-extend 19-bit immediate.
+  if (ImmVal & (1 << (19 - 1)))
+    ImmVal |= ~((1LL << 19) - 1);
 
-  if (RegWidth == 32 && Shift > 1) return MCDisassembler::Fail;
+  if (!Dis->tryAddingSymbolicOperand(Inst, ImmVal << 2, Addr,
+                                     Inst.getOpcode() != AArch64::LDRXl, 0, 4))
+    Inst.addOperand(MCOperand::CreateImm(ImmVal));
+  return Success;
+}
 
-  Inst.addOperand(MCOperand::CreateImm(Imm16));
-  Inst.addOperand(MCOperand::CreateImm(Shift));
-  return MCDisassembler::Success;
+static DecodeStatus DecodeMemExtend(llvm::MCInst &Inst, unsigned Imm,
+                                    uint64_t Address, const void *Decoder) {
+  Inst.addOperand(MCOperand::CreateImm((Imm  >> 1) & 1));
+  Inst.addOperand(MCOperand::CreateImm(Imm & 1));
+  return Success;
 }
 
-template<int RegWidth>
-static DecodeStatus DecodeLogicalImmOperand(llvm::MCInst &Inst,
-                                            unsigned Bits,
+static DecodeStatus DecodeMRSSystemRegister(llvm::MCInst &Inst, unsigned Imm,
                                             uint64_t Address,
                                             const void *Decoder) {
-  uint64_t Imm;
-  if (!A64Imms::isLogicalImmBits(RegWidth, Bits, Imm))
-    return MCDisassembler::Fail;
-
-  Inst.addOperand(MCOperand::CreateImm(Bits));
-  return MCDisassembler::Success;
-}
+  const AArch64Disassembler *Dis =
+      static_cast<const AArch64Disassembler *>(Decoder);
+  const MCSubtargetInfo &STI = Dis->getSubtargetInfo();
 
+  Imm |= 0x8000;
+  Inst.addOperand(MCOperand::CreateImm(Imm));
 
-static DecodeStatus DecodeRegExtendOperand(llvm::MCInst &Inst,
-                                           unsigned ShiftAmount,
-                                           uint64_t Address,
-                                           const void *Decoder) {
-  // Only values 0-4 are valid for this 3-bit field
-  if (ShiftAmount > 4)
-    return MCDisassembler::Fail;
+  bool ValidNamed;
+  (void)AArch64SysReg::MRSMapper(STI.getFeatureBits())
+      .toString(Imm, ValidNamed);
 
-  Inst.addOperand(MCOperand::CreateImm(ShiftAmount));
-  return MCDisassembler::Success;
+  return ValidNamed ? Success : Fail;
 }
 
-static DecodeStatus Decode32BitShiftOperand(llvm::MCInst &Inst,
-                                            unsigned ShiftAmount,
+static DecodeStatus DecodeMSRSystemRegister(llvm::MCInst &Inst, unsigned Imm,
                                             uint64_t Address,
                                             const void *Decoder) {
-  // Only values below 32 are valid for a 32-bit register
-  if (ShiftAmount > 31)
-    return MCDisassembler::Fail;
-
-  Inst.addOperand(MCOperand::CreateImm(ShiftAmount));
-  return MCDisassembler::Success;
-}
-
-static DecodeStatus DecodeBitfieldInstruction(llvm::MCInst &Inst, unsigned Insn,
-                                              uint64_t Address,
-                                              const void *Decoder) {
-  unsigned Rd = fieldFromInstruction(Insn, 0, 5);
-  unsigned Rn = fieldFromInstruction(Insn, 5, 5);
-  unsigned ImmS = fieldFromInstruction(Insn, 10, 6);
-  unsigned ImmR = fieldFromInstruction(Insn, 16, 6);
-  unsigned SF = fieldFromInstruction(Insn, 31, 1);
-
-  // Undef for 0b11 just in case it occurs. Don't want the compiler to optimise
-  // out assertions that it thinks should never be hit.
-  enum OpcTypes { SBFM = 0, BFM, UBFM, Undef } Opc;
-  Opc = (OpcTypes)fieldFromInstruction(Insn, 29, 2);
-
-  if (!SF) {
-    // ImmR and ImmS must be between 0 and 31 for 32-bit instructions.
-    if (ImmR > 31 || ImmS > 31)
-      return MCDisassembler::Fail;
-  }
+  const AArch64Disassembler *Dis =
+      static_cast<const AArch64Disassembler *>(Decoder);
+  const MCSubtargetInfo &STI = Dis->getSubtargetInfo();
 
-  if (SF) {
-    DecodeGPR64RegisterClass(Inst, Rd, Address, Decoder);
-    // BFM MCInsts use Rd as a source too.
-    if (Opc == BFM) DecodeGPR64RegisterClass(Inst, Rd, Address, Decoder);
-    DecodeGPR64RegisterClass(Inst, Rn, Address, Decoder);
-  } else {
-    DecodeGPR32RegisterClass(Inst, Rd, Address, Decoder);
-    // BFM MCInsts use Rd as a source too.
-    if (Opc == BFM) DecodeGPR32RegisterClass(Inst, Rd, Address, Decoder);
-    DecodeGPR32RegisterClass(Inst, Rn, Address, Decoder);
-  }
-
-  // ASR and LSR have more specific patterns so they won't get here:
-  assert(!(ImmS == 31 && !SF && Opc != BFM)
-         && "shift should have used auto decode");
-  assert(!(ImmS == 63 && SF && Opc != BFM)
-         && "shift should have used auto decode");
-
-  // Extension instructions similarly:
-  if (Opc == SBFM && ImmR == 0) {
-    assert((ImmS != 7 && ImmS != 15) && "extension got here");
-    assert((ImmS != 31 || SF == 0) && "extension got here");
-  } else if (Opc == UBFM && ImmR == 0) {
-    assert((SF != 0 || (ImmS != 7 && ImmS != 15)) && "extension got here");
-  }
+  Imm |= 0x8000;
+  Inst.addOperand(MCOperand::CreateImm(Imm));
 
-  if (Opc == UBFM) {
-    // It might be a LSL instruction, which actually takes the shift amount
-    // itself as an MCInst operand.
-    if (SF && (ImmS + 1) % 64 == ImmR) {
-      Inst.setOpcode(AArch64::LSLxxi);
-      Inst.addOperand(MCOperand::CreateImm(63 - ImmS));
-      return MCDisassembler::Success;
-    } else if (!SF && (ImmS + 1) % 32 == ImmR) {
-      Inst.setOpcode(AArch64::LSLwwi);
-      Inst.addOperand(MCOperand::CreateImm(31 - ImmS));
-      return MCDisassembler::Success;
-    }
-  }
-
-  // Otherwise it's definitely either an extract or an insert depending on which
-  // of ImmR or ImmS is larger.
-  unsigned ExtractOp, InsertOp;
-  switch (Opc) {
-  default: llvm_unreachable("unexpected instruction trying to decode bitfield");
-  case SBFM:
-    ExtractOp = SF ? AArch64::SBFXxxii : AArch64::SBFXwwii;
-    InsertOp = SF ? AArch64::SBFIZxxii : AArch64::SBFIZwwii;
-    break;
-  case BFM:
-    ExtractOp = SF ? AArch64::BFXILxxii : AArch64::BFXILwwii;
-    InsertOp = SF ? AArch64::BFIxxii : AArch64::BFIwwii;
-    break;
-  case UBFM:
-    ExtractOp = SF ? AArch64::UBFXxxii : AArch64::UBFXwwii;
-    InsertOp = SF ? AArch64::UBFIZxxii : AArch64::UBFIZwwii;
-    break;
-  }
-
-  // Otherwise it's a boring insert or extract
-  Inst.addOperand(MCOperand::CreateImm(ImmR));
-  Inst.addOperand(MCOperand::CreateImm(ImmS));
-
-
-  if (ImmS < ImmR)
-    Inst.setOpcode(InsertOp);
-  else
-    Inst.setOpcode(ExtractOp);
+  bool ValidNamed;
+  (void)AArch64SysReg::MSRMapper(STI.getFeatureBits())
+      .toString(Imm, ValidNamed);
 
-  return MCDisassembler::Success;
+  return ValidNamed ? Success : Fail;
 }
 
 static DecodeStatus DecodeFMOVLaneInstruction(llvm::MCInst &Inst, unsigned Insn,
@@ -763,811 +681,879 @@ static DecodeStatus DecodeFMOVLaneInstruction(llvm::MCInst &Inst, unsigned Insn,
   // Add the lane
   Inst.addOperand(MCOperand::CreateImm(1));
 
-  return MCDisassembler::Success;
+  return Success;
 }
 
+static DecodeStatus DecodeVecShiftRImm(llvm::MCInst &Inst, unsigned Imm,
+                                       unsigned Add) {
+  Inst.addOperand(MCOperand::CreateImm(Add - Imm));
+  return Success;
+}
 
-static DecodeStatus DecodeLDSTPairInstruction(llvm::MCInst &Inst,
-                                              unsigned Insn,
-                                              uint64_t Address,
-                                              const void *Decoder) {
-  DecodeStatus Result = MCDisassembler::Success;
-  unsigned Rt = fieldFromInstruction(Insn, 0, 5);
-  unsigned Rn = fieldFromInstruction(Insn, 5, 5);
-  unsigned Rt2 = fieldFromInstruction(Insn, 10, 5);
-  unsigned SImm7 = fieldFromInstruction(Insn, 15, 7);
-  unsigned L = fieldFromInstruction(Insn, 22, 1);
-  unsigned V = fieldFromInstruction(Insn, 26, 1);
-  unsigned Opc = fieldFromInstruction(Insn, 30, 2);
-
-  // Not an official name, but it turns out that bit 23 distinguishes indexed
-  // from non-indexed operations.
-  unsigned Indexed = fieldFromInstruction(Insn, 23, 1);
-
-  if (Indexed && L == 0) {
-    // The MCInst for an indexed store has an out operand and 4 ins:
-    //    Rn_wb, Rt, Rt2, Rn, Imm
-    DecodeGPR64xspRegisterClass(Inst, Rn, Address, Decoder);
-  }
-
-  // You shouldn't load to the same register twice in an instruction...
-  if (L && Rt == Rt2)
-    Result = MCDisassembler::SoftFail;
-
-  // ... or do any operation that writes-back to a transfer register. But note
-  // that "stp xzr, xzr, [sp], #4" is fine because xzr and sp are different.
-  if (Indexed && V == 0 && Rn != 31 && (Rt == Rn || Rt2 == Rn))
-    Result = MCDisassembler::SoftFail;
-
-  // Exactly how we decode the MCInst's registers depends on the Opc and V
-  // fields of the instruction. These also obviously determine the size of the
-  // operation so we can fill in that information while we're at it.
-  if (V) {
-    // The instruction operates on the FP/SIMD registers
-    switch (Opc) {
-    default: return MCDisassembler::Fail;
-    case 0:
-      DecodeFPR32RegisterClass(Inst, Rt, Address, Decoder);
-      DecodeFPR32RegisterClass(Inst, Rt2, Address, Decoder);
-      break;
-    case 1:
-      DecodeFPR64RegisterClass(Inst, Rt, Address, Decoder);
-      DecodeFPR64RegisterClass(Inst, Rt2, Address, Decoder);
-      break;
-    case 2:
-      DecodeFPR128RegisterClass(Inst, Rt, Address, Decoder);
-      DecodeFPR128RegisterClass(Inst, Rt2, Address, Decoder);
-      break;
-    }
-  } else {
-    switch (Opc) {
-    default: return MCDisassembler::Fail;
-    case 0:
-      DecodeGPR32RegisterClass(Inst, Rt, Address, Decoder);
-      DecodeGPR32RegisterClass(Inst, Rt2, Address, Decoder);
-      break;
-    case 1:
-      assert(L && "unexpected \"store signed\" attempt");
-      DecodeGPR64RegisterClass(Inst, Rt, Address, Decoder);
-      DecodeGPR64RegisterClass(Inst, Rt2, Address, Decoder);
-      break;
-    case 2:
-      DecodeGPR64RegisterClass(Inst, Rt, Address, Decoder);
-      DecodeGPR64RegisterClass(Inst, Rt2, Address, Decoder);
-      break;
-    }
-  }
-
-  if (Indexed && L == 1) {
-    // The MCInst for an indexed load has 3 out operands and an 3 ins:
-    //    Rt, Rt2, Rn_wb, Rt2, Rn, Imm
-    DecodeGPR64xspRegisterClass(Inst, Rn, Address, Decoder);
-  }
-
-
-  DecodeGPR64xspRegisterClass(Inst, Rn, Address, Decoder);
-  Inst.addOperand(MCOperand::CreateImm(SImm7));
+static DecodeStatus DecodeVecShiftLImm(llvm::MCInst &Inst, unsigned Imm,
+                                       unsigned Add) {
+  Inst.addOperand(MCOperand::CreateImm((Imm + Add) & (Add - 1)));
+  return Success;
+}
 
-  return Result;
+static DecodeStatus DecodeVecShiftR64Imm(llvm::MCInst &Inst, unsigned Imm,
+                                         uint64_t Addr, const void *Decoder) {
+  return DecodeVecShiftRImm(Inst, Imm, 64);
 }
 
-static DecodeStatus DecodeLoadPairExclusiveInstruction(llvm::MCInst &Inst,
-                                                       uint32_t Val,
-                                                       uint64_t Address,
-                                                       const void *Decoder) {
-  unsigned Rt = fieldFromInstruction(Val, 0, 5);
-  unsigned Rn = fieldFromInstruction(Val, 5, 5);
-  unsigned Rt2 = fieldFromInstruction(Val, 10, 5);
-  unsigned MemSize = fieldFromInstruction(Val, 30, 2);
-
-  DecodeStatus S = MCDisassembler::Success;
-  if (Rt == Rt2) S = MCDisassembler::SoftFail;
-
-  switch (MemSize) {
-    case 2:
-      if (!Check(S, DecodeGPR32RegisterClass(Inst, Rt, Address, Decoder)))
-        return MCDisassembler::Fail;
-      if (!Check(S, DecodeGPR32RegisterClass(Inst, Rt2, Address, Decoder)))
-        return MCDisassembler::Fail;
-      break;
-    case 3:
-      if (!Check(S, DecodeGPR64RegisterClass(Inst, Rt, Address, Decoder)))
-        return MCDisassembler::Fail;
-      if (!Check(S, DecodeGPR64RegisterClass(Inst, Rt2, Address, Decoder)))
-        return MCDisassembler::Fail;
-      break;
-    default:
-      llvm_unreachable("Invalid MemSize in DecodeLoadPairExclusiveInstruction");
-  }
+static DecodeStatus DecodeVecShiftR64ImmNarrow(llvm::MCInst &Inst, unsigned Imm,
+                                               uint64_t Addr,
+                                               const void *Decoder) {
+  return DecodeVecShiftRImm(Inst, Imm | 0x20, 64);
+}
 
-  if (!Check(S, DecodeGPR64xspRegisterClass(Inst, Rn, Address, Decoder)))
-    return MCDisassembler::Fail;
+static DecodeStatus DecodeVecShiftR32Imm(llvm::MCInst &Inst, unsigned Imm,
+                                         uint64_t Addr, const void *Decoder) {
+  return DecodeVecShiftRImm(Inst, Imm, 32);
+}
 
-  return S;
+static DecodeStatus DecodeVecShiftR32ImmNarrow(llvm::MCInst &Inst, unsigned Imm,
+                                               uint64_t Addr,
+                                               const void *Decoder) {
+  return DecodeVecShiftRImm(Inst, Imm | 0x10, 32);
 }
 
-template<typename SomeNamedImmMapper>
-static DecodeStatus DecodeNamedImmOperand(llvm::MCInst &Inst,
-                                          unsigned Val,
-                                          uint64_t Address,
-                                          const void *Decoder) {
-  SomeNamedImmMapper Mapper;
-  bool ValidNamed;
-  Mapper.toString(Val, ValidNamed);
-  if (ValidNamed || Mapper.validImm(Val)) {
-    Inst.addOperand(MCOperand::CreateImm(Val));
-    return MCDisassembler::Success;
-  }
+static DecodeStatus DecodeVecShiftR16Imm(llvm::MCInst &Inst, unsigned Imm,
+                                         uint64_t Addr, const void *Decoder) {
+  return DecodeVecShiftRImm(Inst, Imm, 16);
+}
 
-  return MCDisassembler::Fail;
+static DecodeStatus DecodeVecShiftR16ImmNarrow(llvm::MCInst &Inst, unsigned Imm,
+                                               uint64_t Addr,
+                                               const void *Decoder) {
+  return DecodeVecShiftRImm(Inst, Imm | 0x8, 16);
 }
 
-static DecodeStatus DecodeSysRegOperand(const A64SysReg::SysRegMapper &Mapper,
-                                        llvm::MCInst &Inst,
-                                        unsigned Val,
-                                        uint64_t Address,
-                                        const void *Decoder) {
-  bool ValidNamed;
-  Mapper.toString(Val, ValidNamed);
+static DecodeStatus DecodeVecShiftR8Imm(llvm::MCInst &Inst, unsigned Imm,
+                                        uint64_t Addr, const void *Decoder) {
+  return DecodeVecShiftRImm(Inst, Imm, 8);
+}
 
-  Inst.addOperand(MCOperand::CreateImm(Val));
+static DecodeStatus DecodeVecShiftL64Imm(llvm::MCInst &Inst, unsigned Imm,
+                                         uint64_t Addr, const void *Decoder) {
+  return DecodeVecShiftLImm(Inst, Imm, 64);
+}
 
-  return ValidNamed ? MCDisassembler::Success : MCDisassembler::Fail;
+static DecodeStatus DecodeVecShiftL32Imm(llvm::MCInst &Inst, unsigned Imm,
+                                         uint64_t Addr, const void *Decoder) {
+  return DecodeVecShiftLImm(Inst, Imm, 32);
 }
 
-static DecodeStatus DecodeMRSOperand(llvm::MCInst &Inst,
-                                     unsigned Val,
-                                     uint64_t Address,
-                                     const void *Decoder) {
-  return DecodeSysRegOperand(A64SysReg::MRSMapper(), Inst, Val, Address,
-                             Decoder);
+static DecodeStatus DecodeVecShiftL16Imm(llvm::MCInst &Inst, unsigned Imm,
+                                         uint64_t Addr, const void *Decoder) {
+  return DecodeVecShiftLImm(Inst, Imm, 16);
 }
 
-static DecodeStatus DecodeMSROperand(llvm::MCInst &Inst,
-                                     unsigned Val,
-                                     uint64_t Address,
-                                     const void *Decoder) {
-  return DecodeSysRegOperand(A64SysReg::MSRMapper(), Inst, Val, Address,
-                             Decoder);
+static DecodeStatus DecodeVecShiftL8Imm(llvm::MCInst &Inst, unsigned Imm,
+                                        uint64_t Addr, const void *Decoder) {
+  return DecodeVecShiftLImm(Inst, Imm, 8);
 }
 
-static DecodeStatus DecodeSingleIndexedInstruction(llvm::MCInst &Inst,
-                                                   unsigned Insn,
-                                                   uint64_t Address,
+static DecodeStatus DecodeThreeAddrSRegInstruction(llvm::MCInst &Inst,
+                                                   uint32_t insn, uint64_t Addr,
                                                    const void *Decoder) {
-  unsigned Rt = fieldFromInstruction(Insn, 0, 5);
-  unsigned Rn = fieldFromInstruction(Insn, 5, 5);
-  unsigned Imm9 = fieldFromInstruction(Insn, 12, 9);
-
-  unsigned Opc = fieldFromInstruction(Insn, 22, 2);
-  unsigned V = fieldFromInstruction(Insn, 26, 1);
-  unsigned Size = fieldFromInstruction(Insn, 30, 2);
-
-  if (Opc == 0 || (V == 1 && Opc == 2)) {
-    // It's a store, the MCInst gets: Rn_wb, Rt, Rn, Imm
-    DecodeGPR64xspRegisterClass(Inst, Rn, Address, Decoder);
+  unsigned Rd = fieldFromInstruction(insn, 0, 5);
+  unsigned Rn = fieldFromInstruction(insn, 5, 5);
+  unsigned Rm = fieldFromInstruction(insn, 16, 5);
+  unsigned shiftHi = fieldFromInstruction(insn, 22, 2);
+  unsigned shiftLo = fieldFromInstruction(insn, 10, 6);
+  unsigned shift = (shiftHi << 6) | shiftLo;
+  switch (Inst.getOpcode()) {
+  default:
+    return Fail;
+  case AArch64::ADDWrs:
+  case AArch64::ADDSWrs:
+  case AArch64::SUBWrs:
+  case AArch64::SUBSWrs:
+    // if shift == '11' then ReservedValue()
+    if (shiftHi == 0x3)
+      return Fail;
+    // Deliberate fallthrough
+  case AArch64::ANDWrs:
+  case AArch64::ANDSWrs:
+  case AArch64::BICWrs:
+  case AArch64::BICSWrs:
+  case AArch64::ORRWrs:
+  case AArch64::ORNWrs:
+  case AArch64::EORWrs:
+  case AArch64::EONWrs: {
+    // if sf == '0' and imm6<5> == '1' then ReservedValue()
+    if (shiftLo >> 5 == 1)
+      return Fail;
+    DecodeGPR32RegisterClass(Inst, Rd, Addr, Decoder);
+    DecodeGPR32RegisterClass(Inst, Rn, Addr, Decoder);
+    DecodeGPR32RegisterClass(Inst, Rm, Addr, Decoder);
+    break;
   }
-
-  if (V == 0 && (Opc == 2 || Size == 3)) {
-    DecodeGPR64RegisterClass(Inst, Rt, Address, Decoder);
-  } else if (V == 0) {
-    DecodeGPR32RegisterClass(Inst, Rt, Address, Decoder);
-  } else if (V == 1 && (Opc & 2)) {
-    DecodeFPR128RegisterClass(Inst, Rt, Address, Decoder);
-  } else {
-    switch (Size) {
-    case 0:
-      DecodeFPR8RegisterClass(Inst, Rt, Address, Decoder);
-      break;
-    case 1:
-      DecodeFPR16RegisterClass(Inst, Rt, Address, Decoder);
-      break;
-    case 2:
-      DecodeFPR32RegisterClass(Inst, Rt, Address, Decoder);
-      break;
-    case 3:
-      DecodeFPR64RegisterClass(Inst, Rt, Address, Decoder);
-      break;
-    }
+  case AArch64::ADDXrs:
+  case AArch64::ADDSXrs:
+  case AArch64::SUBXrs:
+  case AArch64::SUBSXrs:
+    // if shift == '11' then ReservedValue()
+    if (shiftHi == 0x3)
+      return Fail;
+    // Deliberate fallthrough
+  case AArch64::ANDXrs:
+  case AArch64::ANDSXrs:
+  case AArch64::BICXrs:
+  case AArch64::BICSXrs:
+  case AArch64::ORRXrs:
+  case AArch64::ORNXrs:
+  case AArch64::EORXrs:
+  case AArch64::EONXrs:
+    DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder);
+    DecodeGPR64RegisterClass(Inst, Rn, Addr, Decoder);
+    DecodeGPR64RegisterClass(Inst, Rm, Addr, Decoder);
+    break;
   }
 
-  if (Opc != 0 && (V != 1 || Opc != 2)) {
-    // It's a load, the MCInst gets: Rt, Rn_wb, Rn, Imm
-    DecodeGPR64xspRegisterClass(Inst, Rn, Address, Decoder);
-  }
+  Inst.addOperand(MCOperand::CreateImm(shift));
+  return Success;
+}
 
-  DecodeGPR64xspRegisterClass(Inst, Rn, Address, Decoder);
+static DecodeStatus DecodeMoveImmInstruction(llvm::MCInst &Inst, uint32_t insn,
+                                             uint64_t Addr,
+                                             const void *Decoder) {
+  unsigned Rd = fieldFromInstruction(insn, 0, 5);
+  unsigned imm = fieldFromInstruction(insn, 5, 16);
+  unsigned shift = fieldFromInstruction(insn, 21, 2);
+  shift <<= 4;
+  switch (Inst.getOpcode()) {
+  default:
+    return Fail;
+  case AArch64::MOVZWi:
+  case AArch64::MOVNWi:
+  case AArch64::MOVKWi:
+    if (shift & (1U << 5))
+      return Fail;
+    DecodeGPR32RegisterClass(Inst, Rd, Addr, Decoder);
+    break;
+  case AArch64::MOVZXi:
+  case AArch64::MOVNXi:
+  case AArch64::MOVKXi:
+    DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder);
+    break;
+  }
 
-  Inst.addOperand(MCOperand::CreateImm(Imm9));
+  if (Inst.getOpcode() == AArch64::MOVKWi ||
+      Inst.getOpcode() == AArch64::MOVKXi)
+    Inst.addOperand(Inst.getOperand(0));
 
-  // N.b. The official documentation says undpredictable if Rt == Rn, but this
-  // takes place at the architectural rather than encoding level:
-  //
-  // "STR xzr, [sp], #4" is perfectly valid.
-  if (V == 0 && Rt == Rn && Rn != 31)
-    return MCDisassembler::SoftFail;
-  else
-    return MCDisassembler::Success;
+  Inst.addOperand(MCOperand::CreateImm(imm));
+  Inst.addOperand(MCOperand::CreateImm(shift));
+  return Success;
 }
 
-static MCDisassembler *createAArch64Disassembler(const Target &T,
-                                                 const MCSubtargetInfo &STI) {
-  return new AArch64Disassembler(STI, T.createMCRegInfo(""));
-}
+static DecodeStatus DecodeUnsignedLdStInstruction(llvm::MCInst &Inst,
+                                                  uint32_t insn, uint64_t Addr,
+                                                  const void *Decoder) {
+  unsigned Rt = fieldFromInstruction(insn, 0, 5);
+  unsigned Rn = fieldFromInstruction(insn, 5, 5);
+  unsigned offset = fieldFromInstruction(insn, 10, 12);
+  const AArch64Disassembler *Dis =
+      static_cast<const AArch64Disassembler *>(Decoder);
 
-extern "C" void LLVMInitializeAArch64Disassembler() {
-  TargetRegistry::RegisterMCDisassembler(TheAArch64leTarget,
-                                         createAArch64Disassembler);
-  TargetRegistry::RegisterMCDisassembler(TheAArch64beTarget,
-                                         createAArch64Disassembler);
-}
+  switch (Inst.getOpcode()) {
+  default:
+    return Fail;
+  case AArch64::PRFMui:
+    // Rt is an immediate in prefetch.
+    Inst.addOperand(MCOperand::CreateImm(Rt));
+    break;
+  case AArch64::STRBBui:
+  case AArch64::LDRBBui:
+  case AArch64::LDRSBWui:
+  case AArch64::STRHHui:
+  case AArch64::LDRHHui:
+  case AArch64::LDRSHWui:
+  case AArch64::STRWui:
+  case AArch64::LDRWui:
+    DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder);
+    break;
+  case AArch64::LDRSBXui:
+  case AArch64::LDRSHXui:
+  case AArch64::LDRSWui:
+  case AArch64::STRXui:
+  case AArch64::LDRXui:
+    DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder);
+    break;
+  case AArch64::LDRQui:
+  case AArch64::STRQui:
+    DecodeFPR128RegisterClass(Inst, Rt, Addr, Decoder);
+    break;
+  case AArch64::LDRDui:
+  case AArch64::STRDui:
+    DecodeFPR64RegisterClass(Inst, Rt, Addr, Decoder);
+    break;
+  case AArch64::LDRSui:
+  case AArch64::STRSui:
+    DecodeFPR32RegisterClass(Inst, Rt, Addr, Decoder);
+    break;
+  case AArch64::LDRHui:
+  case AArch64::STRHui:
+    DecodeFPR16RegisterClass(Inst, Rt, Addr, Decoder);
+    break;
+  case AArch64::LDRBui:
+  case AArch64::STRBui:
+    DecodeFPR8RegisterClass(Inst, Rt, Addr, Decoder);
+    break;
+  }
 
-template <A64SE::ShiftExtSpecifiers Ext, bool IsHalf>
-static DecodeStatus
-DecodeNeonMovImmShiftOperand(llvm::MCInst &Inst, unsigned ShiftAmount,
-                             uint64_t Address, const void *Decoder) {
-  bool IsLSL = false;
-  if (Ext == A64SE::LSL)
-    IsLSL = true;
-  else if (Ext != A64SE::MSL)
-    return MCDisassembler::Fail;
-
-  // MSL and LSLH accepts encoded shift amount 0 or 1.
-  if ((!IsLSL || (IsLSL && IsHalf)) && ShiftAmount != 0 && ShiftAmount != 1)
-    return MCDisassembler::Fail;
-
-  // LSL  accepts encoded shift amount 0, 1, 2 or 3.
-  if (IsLSL && ShiftAmount > 3)
-    return MCDisassembler::Fail;
-
-  Inst.addOperand(MCOperand::CreateImm(ShiftAmount));
-  return MCDisassembler::Success;
+  DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder);
+  if (!Dis->tryAddingSymbolicOperand(Inst, offset, Addr, Fail, 0, 4))
+    Inst.addOperand(MCOperand::CreateImm(offset));
+  return Success;
 }
 
-// Decode post-index vector load/store instructions.
-// This is necessary as we need to decode Rm: if Rm == 0b11111, the last
-// operand is an immediate equal the the length of vector list in bytes,
-// or Rm is decoded to a GPR64noxzr register.
-static DecodeStatus DecodeVLDSTPostInstruction(MCInst &Inst, unsigned Insn,
-                                               uint64_t Address,
-                                               const void *Decoder) {
-  unsigned Rt = fieldFromInstruction(Insn, 0, 5);
-  unsigned Rn = fieldFromInstruction(Insn, 5, 5);
-  unsigned Rm = fieldFromInstruction(Insn, 16, 5);
-  unsigned Opcode = fieldFromInstruction(Insn, 12, 4);
-  unsigned IsLoad = fieldFromInstruction(Insn, 22, 1);
-  // 0 for 64bit vector list, 1 for 128bit vector list
-  unsigned Is128BitVec = fieldFromInstruction(Insn, 30, 1);
+static DecodeStatus DecodeSignedLdStInstruction(llvm::MCInst &Inst,
+                                                uint32_t insn, uint64_t Addr,
+                                                const void *Decoder) {
+  unsigned Rt = fieldFromInstruction(insn, 0, 5);
+  unsigned Rn = fieldFromInstruction(insn, 5, 5);
+  int64_t offset = fieldFromInstruction(insn, 12, 9);
 
-  unsigned NumVecs;
-  switch (Opcode) {
-  case 0: // ld4/st4
-  case 2: // ld1/st1 with 4 vectors
-    NumVecs = 4; break;
-  case 4: // ld3/st3
-  case 6: // ld1/st1 with 3 vectors
-    NumVecs = 3; break;
-  case 7: // ld1/st1 with 1 vector
-    NumVecs = 1; break;
-  case 8:  // ld2/st2
-  case 10: // ld1/st1 with 2 vectors
-    NumVecs = 2; break;
+  // offset is a 9-bit signed immediate, so sign extend it to
+  // fill the unsigned.
+  if (offset & (1 << (9 - 1)))
+    offset |= ~((1LL << 9) - 1);
+
+  // First operand is always the writeback to the address register, if needed.
+  switch (Inst.getOpcode()) {
   default:
-    llvm_unreachable("Invalid opcode for post-index load/store instructions");
+    break;
+  case AArch64::LDRSBWpre:
+  case AArch64::LDRSHWpre:
+  case AArch64::STRBBpre:
+  case AArch64::LDRBBpre:
+  case AArch64::STRHHpre:
+  case AArch64::LDRHHpre:
+  case AArch64::STRWpre:
+  case AArch64::LDRWpre:
+  case AArch64::LDRSBWpost:
+  case AArch64::LDRSHWpost:
+  case AArch64::STRBBpost:
+  case AArch64::LDRBBpost:
+  case AArch64::STRHHpost:
+  case AArch64::LDRHHpost:
+  case AArch64::STRWpost:
+  case AArch64::LDRWpost:
+  case AArch64::LDRSBXpre:
+  case AArch64::LDRSHXpre:
+  case AArch64::STRXpre:
+  case AArch64::LDRSWpre:
+  case AArch64::LDRXpre:
+  case AArch64::LDRSBXpost:
+  case AArch64::LDRSHXpost:
+  case AArch64::STRXpost:
+  case AArch64::LDRSWpost:
+  case AArch64::LDRXpost:
+  case AArch64::LDRQpre:
+  case AArch64::STRQpre:
+  case AArch64::LDRQpost:
+  case AArch64::STRQpost:
+  case AArch64::LDRDpre:
+  case AArch64::STRDpre:
+  case AArch64::LDRDpost:
+  case AArch64::STRDpost:
+  case AArch64::LDRSpre:
+  case AArch64::STRSpre:
+  case AArch64::LDRSpost:
+  case AArch64::STRSpost:
+  case AArch64::LDRHpre:
+  case AArch64::STRHpre:
+  case AArch64::LDRHpost:
+  case AArch64::STRHpost:
+  case AArch64::LDRBpre:
+  case AArch64::STRBpre:
+  case AArch64::LDRBpost:
+  case AArch64::STRBpost:
+    DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder);
+    break;
   }
 
-  // Decode vector list of 1/2/3/4 vectors for load instructions.
-  if (IsLoad) {
-    switch (NumVecs) {
-    case 1:
-      Is128BitVec ? DecodeFPR128RegisterClass(Inst, Rt, Address, Decoder)
-                  : DecodeFPR64RegisterClass(Inst, Rt, Address, Decoder);
-      break;
-    case 2:
-      Is128BitVec ? DecodeQPairRegisterClass(Inst, Rt, Address, Decoder)
-                  : DecodeDPairRegisterClass(Inst, Rt, Address, Decoder);
-      break;
-    case 3:
-      Is128BitVec ? DecodeQTripleRegisterClass(Inst, Rt, Address, Decoder)
-                  : DecodeDTripleRegisterClass(Inst, Rt, Address, Decoder);
-      break;
-    case 4:
-      Is128BitVec ? DecodeQQuadRegisterClass(Inst, Rt, Address, Decoder)
-                  : DecodeDQuadRegisterClass(Inst, Rt, Address, Decoder);
-      break;
-    }
+  switch (Inst.getOpcode()) {
+  default:
+    return Fail;
+  case AArch64::PRFUMi:
+    // Rt is an immediate in prefetch.
+    Inst.addOperand(MCOperand::CreateImm(Rt));
+    break;
+  case AArch64::STURBBi:
+  case AArch64::LDURBBi:
+  case AArch64::LDURSBWi:
+  case AArch64::STURHHi:
+  case AArch64::LDURHHi:
+  case AArch64::LDURSHWi:
+  case AArch64::STURWi:
+  case AArch64::LDURWi:
+  case AArch64::LDTRSBWi:
+  case AArch64::LDTRSHWi:
+  case AArch64::STTRWi:
+  case AArch64::LDTRWi:
+  case AArch64::STTRHi:
+  case AArch64::LDTRHi:
+  case AArch64::LDTRBi:
+  case AArch64::STTRBi:
+  case AArch64::LDRSBWpre:
+  case AArch64::LDRSHWpre:
+  case AArch64::STRBBpre:
+  case AArch64::LDRBBpre:
+  case AArch64::STRHHpre:
+  case AArch64::LDRHHpre:
+  case AArch64::STRWpre:
+  case AArch64::LDRWpre:
+  case AArch64::LDRSBWpost:
+  case AArch64::LDRSHWpost:
+  case AArch64::STRBBpost:
+  case AArch64::LDRBBpost:
+  case AArch64::STRHHpost:
+  case AArch64::LDRHHpost:
+  case AArch64::STRWpost:
+  case AArch64::LDRWpost:
+    DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder);
+    break;
+  case AArch64::LDURSBXi:
+  case AArch64::LDURSHXi:
+  case AArch64::LDURSWi:
+  case AArch64::STURXi:
+  case AArch64::LDURXi:
+  case AArch64::LDTRSBXi:
+  case AArch64::LDTRSHXi:
+  case AArch64::LDTRSWi:
+  case AArch64::STTRXi:
+  case AArch64::LDTRXi:
+  case AArch64::LDRSBXpre:
+  case AArch64::LDRSHXpre:
+  case AArch64::STRXpre:
+  case AArch64::LDRSWpre:
+  case AArch64::LDRXpre:
+  case AArch64::LDRSBXpost:
+  case AArch64::LDRSHXpost:
+  case AArch64::STRXpost:
+  case AArch64::LDRSWpost:
+  case AArch64::LDRXpost:
+    DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder);
+    break;
+  case AArch64::LDURQi:
+  case AArch64::STURQi:
+  case AArch64::LDRQpre:
+  case AArch64::STRQpre:
+  case AArch64::LDRQpost:
+  case AArch64::STRQpost:
+    DecodeFPR128RegisterClass(Inst, Rt, Addr, Decoder);
+    break;
+  case AArch64::LDURDi:
+  case AArch64::STURDi:
+  case AArch64::LDRDpre:
+  case AArch64::STRDpre:
+  case AArch64::LDRDpost:
+  case AArch64::STRDpost:
+    DecodeFPR64RegisterClass(Inst, Rt, Addr, Decoder);
+    break;
+  case AArch64::LDURSi:
+  case AArch64::STURSi:
+  case AArch64::LDRSpre:
+  case AArch64::STRSpre:
+  case AArch64::LDRSpost:
+  case AArch64::STRSpost:
+    DecodeFPR32RegisterClass(Inst, Rt, Addr, Decoder);
+    break;
+  case AArch64::LDURHi:
+  case AArch64::STURHi:
+  case AArch64::LDRHpre:
+  case AArch64::STRHpre:
+  case AArch64::LDRHpost:
+  case AArch64::STRHpost:
+    DecodeFPR16RegisterClass(Inst, Rt, Addr, Decoder);
+    break;
+  case AArch64::LDURBi:
+  case AArch64::STURBi:
+  case AArch64::LDRBpre:
+  case AArch64::STRBpre:
+  case AArch64::LDRBpost:
+  case AArch64::STRBpost:
+    DecodeFPR8RegisterClass(Inst, Rt, Addr, Decoder);
+    break;
   }
 
-  // Decode write back register, which is equal to Rn.
-  DecodeGPR64xspRegisterClass(Inst, Rn, Address, Decoder);
-  DecodeGPR64xspRegisterClass(Inst, Rn, Address, Decoder);
-
-  if (Rm == 31) // If Rm is 0x11111, add the vector list length in byte
-    Inst.addOperand(MCOperand::CreateImm(NumVecs * (Is128BitVec ? 16 : 8)));
-  else // Decode Rm
-    DecodeGPR64noxzrRegisterClass(Inst, Rm, Address, Decoder);
-
-  // Decode vector list of 1/2/3/4 vectors for load instructions.
-  if (!IsLoad) {
-    switch (NumVecs) {
-    case 1:
-      Is128BitVec ? DecodeFPR128RegisterClass(Inst, Rt, Address, Decoder)
-                  : DecodeFPR64RegisterClass(Inst, Rt, Address, Decoder);
-      break;
-    case 2:
-      Is128BitVec ? DecodeQPairRegisterClass(Inst, Rt, Address, Decoder)
-                  : DecodeDPairRegisterClass(Inst, Rt, Address, Decoder);
-      break;
-    case 3:
-      Is128BitVec ? DecodeQTripleRegisterClass(Inst, Rt, Address, Decoder)
-                  : DecodeDTripleRegisterClass(Inst, Rt, Address, Decoder);
-      break;
-    case 4:
-      Is128BitVec ? DecodeQQuadRegisterClass(Inst, Rt, Address, Decoder)
-                  : DecodeDQuadRegisterClass(Inst, Rt, Address, Decoder);
-      break;
-    }
-  }
+  DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder);
+  Inst.addOperand(MCOperand::CreateImm(offset));
 
-  return MCDisassembler::Success;
+  bool IsLoad = fieldFromInstruction(insn, 22, 1);
+  bool IsIndexed = fieldFromInstruction(insn, 10, 2) != 0;
+  bool IsFP = fieldFromInstruction(insn, 26, 1);
+
+  // Cannot write back to a transfer register (but xzr != sp).
+  if (IsLoad && IsIndexed && !IsFP && Rn != 31 && Rt == Rn)
+    return SoftFail;
+
+  return Success;
 }
 
-// Decode post-index vector load/store lane instructions.
-// This is necessary as we need to decode Rm: if Rm == 0b11111, the last
-// operand is an immediate equal the the length of the changed bytes,
-// or Rm is decoded to a GPR64noxzr register.
-static DecodeStatus DecodeVLDSTLanePostInstruction(MCInst &Inst, unsigned Insn,
-                                                   uint64_t Address,
+static DecodeStatus DecodeExclusiveLdStInstruction(llvm::MCInst &Inst,
+                                                   uint32_t insn, uint64_t Addr,
                                                    const void *Decoder) {
-  bool Is64bitVec = false;
-  bool IsLoadDup = false;
-  bool IsLoad = false;
-  // The total number of bytes transferred.
-  // TransferBytes = NumVecs * OneLaneBytes
-  unsigned TransferBytes = 0;
-  unsigned NumVecs = 0;
-  unsigned Opc = Inst.getOpcode();
-  switch (Opc) {
-  case AArch64::LD1R_WB_8B_fixed: case AArch64::LD1R_WB_8B_register:
-  case AArch64::LD1R_WB_4H_fixed: case AArch64::LD1R_WB_4H_register:
-  case AArch64::LD1R_WB_2S_fixed: case AArch64::LD1R_WB_2S_register:
-  case AArch64::LD1R_WB_1D_fixed: case AArch64::LD1R_WB_1D_register: {
-    switch (Opc) {
-    case AArch64::LD1R_WB_8B_fixed: case AArch64::LD1R_WB_8B_register:
-      TransferBytes = 1; break;
-    case AArch64::LD1R_WB_4H_fixed: case AArch64::LD1R_WB_4H_register:
-      TransferBytes = 2; break;
-    case AArch64::LD1R_WB_2S_fixed: case AArch64::LD1R_WB_2S_register:
-      TransferBytes = 4; break;
-    case AArch64::LD1R_WB_1D_fixed: case AArch64::LD1R_WB_1D_register:
-      TransferBytes = 8; break;
-    }
-    Is64bitVec = true;
-    IsLoadDup = true;
-    NumVecs = 1;
-    break;
-  }
+  unsigned Rt = fieldFromInstruction(insn, 0, 5);
+  unsigned Rn = fieldFromInstruction(insn, 5, 5);
+  unsigned Rt2 = fieldFromInstruction(insn, 10, 5);
+  unsigned Rs = fieldFromInstruction(insn, 16, 5);
 
-  case AArch64::LD1R_WB_16B_fixed: case AArch64::LD1R_WB_16B_register:
-  case AArch64::LD1R_WB_8H_fixed: case AArch64::LD1R_WB_8H_register:
-  case AArch64::LD1R_WB_4S_fixed: case AArch64::LD1R_WB_4S_register:
-  case AArch64::LD1R_WB_2D_fixed: case AArch64::LD1R_WB_2D_register: {
-    switch (Opc) {
-    case AArch64::LD1R_WB_16B_fixed: case AArch64::LD1R_WB_16B_register:
-      TransferBytes = 1; break;
-    case AArch64::LD1R_WB_8H_fixed: case AArch64::LD1R_WB_8H_register:
-      TransferBytes = 2; break;
-    case AArch64::LD1R_WB_4S_fixed: case AArch64::LD1R_WB_4S_register:
-      TransferBytes = 4; break;
-    case AArch64::LD1R_WB_2D_fixed: case AArch64::LD1R_WB_2D_register:
-      TransferBytes = 8; break;
-    }
-    IsLoadDup = true;
-    NumVecs = 1;
+  unsigned Opcode = Inst.getOpcode();
+  switch (Opcode) {
+  default:
+    return Fail;
+  case AArch64::STLXRW:
+  case AArch64::STLXRB:
+  case AArch64::STLXRH:
+  case AArch64::STXRW:
+  case AArch64::STXRB:
+  case AArch64::STXRH:
+    DecodeGPR32RegisterClass(Inst, Rs, Addr, Decoder);
+  // FALLTHROUGH
+  case AArch64::LDARW:
+  case AArch64::LDARB:
+  case AArch64::LDARH:
+  case AArch64::LDAXRW:
+  case AArch64::LDAXRB:
+  case AArch64::LDAXRH:
+  case AArch64::LDXRW:
+  case AArch64::LDXRB:
+  case AArch64::LDXRH:
+  case AArch64::STLRW:
+  case AArch64::STLRB:
+  case AArch64::STLRH:
+    DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder);
     break;
-  }
-
-  case AArch64::LD2R_WB_8B_fixed: case AArch64::LD2R_WB_8B_register:
-  case AArch64::LD2R_WB_4H_fixed: case AArch64::LD2R_WB_4H_register:
-  case AArch64::LD2R_WB_2S_fixed: case AArch64::LD2R_WB_2S_register:
-  case AArch64::LD2R_WB_1D_fixed: case AArch64::LD2R_WB_1D_register: {
-    switch (Opc) {
-    case AArch64::LD2R_WB_8B_fixed: case AArch64::LD2R_WB_8B_register:
-      TransferBytes = 2; break;
-    case AArch64::LD2R_WB_4H_fixed: case AArch64::LD2R_WB_4H_register:
-      TransferBytes = 4; break;
-    case AArch64::LD2R_WB_2S_fixed: case AArch64::LD2R_WB_2S_register:
-      TransferBytes = 8; break;
-    case AArch64::LD2R_WB_1D_fixed: case AArch64::LD2R_WB_1D_register:
-      TransferBytes = 16; break;
-    }
-    Is64bitVec = true;
-    IsLoadDup = true;
-    NumVecs = 2;
+  case AArch64::STLXRX:
+  case AArch64::STXRX:
+    DecodeGPR32RegisterClass(Inst, Rs, Addr, Decoder);
+  // FALLTHROUGH
+  case AArch64::LDARX:
+  case AArch64::LDAXRX:
+  case AArch64::LDXRX:
+  case AArch64::STLRX:
+    DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder);
     break;
-  }
-
-  case AArch64::LD2R_WB_16B_fixed: case AArch64::LD2R_WB_16B_register:
-  case AArch64::LD2R_WB_8H_fixed: case AArch64::LD2R_WB_8H_register:
-  case AArch64::LD2R_WB_4S_fixed: case AArch64::LD2R_WB_4S_register:
-  case AArch64::LD2R_WB_2D_fixed: case AArch64::LD2R_WB_2D_register: {
-    switch (Opc) {
-    case AArch64::LD2R_WB_16B_fixed: case AArch64::LD2R_WB_16B_register:
-      TransferBytes = 2; break;
-    case AArch64::LD2R_WB_8H_fixed: case AArch64::LD2R_WB_8H_register:
-      TransferBytes = 4; break;
-    case AArch64::LD2R_WB_4S_fixed: case AArch64::LD2R_WB_4S_register:
-      TransferBytes = 8; break;
-    case AArch64::LD2R_WB_2D_fixed: case AArch64::LD2R_WB_2D_register:
-      TransferBytes = 16; break;
-    }
-    IsLoadDup = true;
-    NumVecs = 2;
+  case AArch64::STLXPW:
+  case AArch64::STXPW:
+    DecodeGPR32RegisterClass(Inst, Rs, Addr, Decoder);
+  // FALLTHROUGH
+  case AArch64::LDAXPW:
+  case AArch64::LDXPW:
+    DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder);
+    DecodeGPR32RegisterClass(Inst, Rt2, Addr, Decoder);
     break;
-  }
-
-  case AArch64::LD3R_WB_8B_fixed: case AArch64::LD3R_WB_8B_register:
-  case AArch64::LD3R_WB_4H_fixed: case AArch64::LD3R_WB_4H_register:
-  case AArch64::LD3R_WB_2S_fixed: case AArch64::LD3R_WB_2S_register:
-  case AArch64::LD3R_WB_1D_fixed: case AArch64::LD3R_WB_1D_register: {
-    switch (Opc) {
-    case AArch64::LD3R_WB_8B_fixed: case AArch64::LD3R_WB_8B_register:
-      TransferBytes = 3; break;
-    case AArch64::LD3R_WB_4H_fixed: case AArch64::LD3R_WB_4H_register:
-      TransferBytes = 6; break;
-    case AArch64::LD3R_WB_2S_fixed: case AArch64::LD3R_WB_2S_register:
-      TransferBytes = 12; break;
-    case AArch64::LD3R_WB_1D_fixed: case AArch64::LD3R_WB_1D_register:
-      TransferBytes = 24; break;
-    }
-    Is64bitVec = true;
-    IsLoadDup = true;
-    NumVecs = 3;
+  case AArch64::STLXPX:
+  case AArch64::STXPX:
+    DecodeGPR32RegisterClass(Inst, Rs, Addr, Decoder);
+  // FALLTHROUGH
+  case AArch64::LDAXPX:
+  case AArch64::LDXPX:
+    DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder);
+    DecodeGPR64RegisterClass(Inst, Rt2, Addr, Decoder);
     break;
   }
 
-  case AArch64::LD3R_WB_16B_fixed: case AArch64::LD3R_WB_16B_register:
-  case AArch64::LD3R_WB_4S_fixed: case AArch64::LD3R_WB_8H_register:
-  case AArch64::LD3R_WB_8H_fixed: case AArch64::LD3R_WB_4S_register:
-  case AArch64::LD3R_WB_2D_fixed: case AArch64::LD3R_WB_2D_register: {
-    switch (Opc) {
-    case AArch64::LD3R_WB_16B_fixed: case AArch64::LD3R_WB_16B_register:
-      TransferBytes = 3; break;
-    case AArch64::LD3R_WB_8H_fixed: case AArch64::LD3R_WB_8H_register:
-      TransferBytes = 6; break;
-    case AArch64::LD3R_WB_4S_fixed: case AArch64::LD3R_WB_4S_register:
-      TransferBytes = 12; break;
-    case AArch64::LD3R_WB_2D_fixed: case AArch64::LD3R_WB_2D_register:
-      TransferBytes = 24; break;
-    }
-    IsLoadDup = true;
-    NumVecs = 3;
-    break;
-  }
+  DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder);
 
-  case AArch64::LD4R_WB_8B_fixed: case AArch64::LD4R_WB_8B_register:
-  case AArch64::LD4R_WB_4H_fixed: case AArch64::LD4R_WB_4H_register:
-  case AArch64::LD4R_WB_2S_fixed: case AArch64::LD4R_WB_2S_register:
-  case AArch64::LD4R_WB_1D_fixed: case AArch64::LD4R_WB_1D_register: {
-    switch (Opc) {
-    case AArch64::LD4R_WB_8B_fixed: case AArch64::LD4R_WB_8B_register:
-      TransferBytes = 4; break;
-    case AArch64::LD4R_WB_4H_fixed: case AArch64::LD4R_WB_4H_register:
-      TransferBytes = 8; break;
-    case AArch64::LD4R_WB_2S_fixed: case AArch64::LD4R_WB_2S_register:
-      TransferBytes = 16; break;
-    case AArch64::LD4R_WB_1D_fixed: case AArch64::LD4R_WB_1D_register:
-      TransferBytes = 32; break;
-    }
-    Is64bitVec = true;
-    IsLoadDup = true;
-    NumVecs = 4;
-    break;
-  }
+  // You shouldn't load to the same register twice in an instruction...
+  if ((Opcode == AArch64::LDAXPW || Opcode == AArch64::LDXPW ||
+       Opcode == AArch64::LDAXPX || Opcode == AArch64::LDXPX) &&
+      Rt == Rt2)
+    return SoftFail;
 
-  case AArch64::LD4R_WB_16B_fixed: case AArch64::LD4R_WB_16B_register:
-  case AArch64::LD4R_WB_4S_fixed: case AArch64::LD4R_WB_8H_register:
-  case AArch64::LD4R_WB_8H_fixed: case AArch64::LD4R_WB_4S_register:
-  case AArch64::LD4R_WB_2D_fixed: case AArch64::LD4R_WB_2D_register: {
-    switch (Opc) {
-    case AArch64::LD4R_WB_16B_fixed: case AArch64::LD4R_WB_16B_register:
-      TransferBytes = 4; break;
-    case AArch64::LD4R_WB_8H_fixed: case AArch64::LD4R_WB_8H_register:
-      TransferBytes = 8; break;
-    case AArch64::LD4R_WB_4S_fixed: case AArch64::LD4R_WB_4S_register:
-      TransferBytes = 16; break;
-    case AArch64::LD4R_WB_2D_fixed: case AArch64::LD4R_WB_2D_register:
-      TransferBytes = 32; break;
-    }
-    IsLoadDup = true;
-    NumVecs = 4;
-    break;
-  }
+  return Success;
+}
 
-  case AArch64::LD1LN_WB_B_fixed: case AArch64::LD1LN_WB_B_register:
-  case AArch64::LD1LN_WB_H_fixed: case AArch64::LD1LN_WB_H_register:
-  case AArch64::LD1LN_WB_S_fixed: case AArch64::LD1LN_WB_S_register:
-  case AArch64::LD1LN_WB_D_fixed: case AArch64::LD1LN_WB_D_register: {
-    switch (Opc) {
-    case AArch64::LD1LN_WB_B_fixed: case AArch64::LD1LN_WB_B_register:
-      TransferBytes = 1; break;
-    case AArch64::LD1LN_WB_H_fixed: case AArch64::LD1LN_WB_H_register:
-      TransferBytes = 2; break;
-    case AArch64::LD1LN_WB_S_fixed: case AArch64::LD1LN_WB_S_register:
-      TransferBytes = 4; break;
-    case AArch64::LD1LN_WB_D_fixed: case AArch64::LD1LN_WB_D_register:
-      TransferBytes = 8; break;
-    }
-    IsLoad = true;
-    NumVecs = 1;
-    break;
-  }
+static DecodeStatus DecodePairLdStInstruction(llvm::MCInst &Inst, uint32_t insn,
+                                              uint64_t Addr,
+                                              const void *Decoder) {
+  unsigned Rt = fieldFromInstruction(insn, 0, 5);
+  unsigned Rn = fieldFromInstruction(insn, 5, 5);
+  unsigned Rt2 = fieldFromInstruction(insn, 10, 5);
+  int64_t offset = fieldFromInstruction(insn, 15, 7);
+  bool IsLoad = fieldFromInstruction(insn, 22, 1);
 
-  case AArch64::LD2LN_WB_B_fixed: case AArch64::LD2LN_WB_B_register:
-  case AArch64::LD2LN_WB_H_fixed: case AArch64::LD2LN_WB_H_register:
-  case AArch64::LD2LN_WB_S_fixed: case AArch64::LD2LN_WB_S_register:
-  case AArch64::LD2LN_WB_D_fixed: case AArch64::LD2LN_WB_D_register: {
-    switch (Opc) {
-    case AArch64::LD2LN_WB_B_fixed: case AArch64::LD2LN_WB_B_register:
-      TransferBytes = 2; break;
-    case AArch64::LD2LN_WB_H_fixed: case AArch64::LD2LN_WB_H_register:
-      TransferBytes = 4; break;
-    case AArch64::LD2LN_WB_S_fixed: case AArch64::LD2LN_WB_S_register:
-      TransferBytes = 8; break;
-    case AArch64::LD2LN_WB_D_fixed: case AArch64::LD2LN_WB_D_register:
-      TransferBytes = 16; break;
-    }
-    IsLoad = true;
-    NumVecs = 2;
-    break;
-  }
+  // offset is a 7-bit signed immediate, so sign extend it to
+  // fill the unsigned.
+  if (offset & (1 << (7 - 1)))
+    offset |= ~((1LL << 7) - 1);
 
-  case AArch64::LD3LN_WB_B_fixed: case AArch64::LD3LN_WB_B_register:
-  case AArch64::LD3LN_WB_H_fixed: case AArch64::LD3LN_WB_H_register:
-  case AArch64::LD3LN_WB_S_fixed: case AArch64::LD3LN_WB_S_register:
-  case AArch64::LD3LN_WB_D_fixed: case AArch64::LD3LN_WB_D_register: {
-    switch (Opc) {
-    case AArch64::LD3LN_WB_B_fixed: case AArch64::LD3LN_WB_B_register:
-      TransferBytes = 3; break;
-    case AArch64::LD3LN_WB_H_fixed: case AArch64::LD3LN_WB_H_register:
-      TransferBytes = 6; break;
-    case AArch64::LD3LN_WB_S_fixed: case AArch64::LD3LN_WB_S_register:
-      TransferBytes = 12; break;
-    case AArch64::LD3LN_WB_D_fixed: case AArch64::LD3LN_WB_D_register:
-      TransferBytes = 24; break;
-    }
-    IsLoad = true;
-    NumVecs = 3;
-    break;
-  }
+  unsigned Opcode = Inst.getOpcode();
+  bool NeedsDisjointWritebackTransfer = false;
 
-  case AArch64::LD4LN_WB_B_fixed: case AArch64::LD4LN_WB_B_register:
-  case AArch64::LD4LN_WB_H_fixed: case AArch64::LD4LN_WB_H_register:
-  case AArch64::LD4LN_WB_S_fixed: case AArch64::LD4LN_WB_S_register:
-  case AArch64::LD4LN_WB_D_fixed: case AArch64::LD4LN_WB_D_register: {
-    switch (Opc) {
-    case AArch64::LD4LN_WB_B_fixed: case AArch64::LD4LN_WB_B_register:
-      TransferBytes = 4; break;
-    case AArch64::LD4LN_WB_H_fixed: case AArch64::LD4LN_WB_H_register:
-      TransferBytes = 8; break;
-    case AArch64::LD4LN_WB_S_fixed: case AArch64::LD4LN_WB_S_register:
-      TransferBytes = 16; break;
-    case AArch64::LD4LN_WB_D_fixed: case AArch64::LD4LN_WB_D_register:
-      TransferBytes = 32; break;
-    }
-    IsLoad = true;
-    NumVecs = 4;
+  // First operand is always writeback of base register.
+  switch (Opcode) {
+  default:
     break;
-  }
-
-  case AArch64::ST1LN_WB_B_fixed: case AArch64::ST1LN_WB_B_register:
-  case AArch64::ST1LN_WB_H_fixed: case AArch64::ST1LN_WB_H_register:
-  case AArch64::ST1LN_WB_S_fixed: case AArch64::ST1LN_WB_S_register:
-  case AArch64::ST1LN_WB_D_fixed: case AArch64::ST1LN_WB_D_register: {
-    switch (Opc) {
-    case AArch64::ST1LN_WB_B_fixed: case AArch64::ST1LN_WB_B_register:
-      TransferBytes = 1; break;
-    case AArch64::ST1LN_WB_H_fixed: case AArch64::ST1LN_WB_H_register:
-      TransferBytes = 2; break;
-    case AArch64::ST1LN_WB_S_fixed: case AArch64::ST1LN_WB_S_register:
-      TransferBytes = 4; break;
-    case AArch64::ST1LN_WB_D_fixed: case AArch64::ST1LN_WB_D_register:
-      TransferBytes = 8; break;
-    }
-    NumVecs = 1;
+  case AArch64::LDPXpost:
+  case AArch64::STPXpost:
+  case AArch64::LDPSWpost:
+  case AArch64::LDPXpre:
+  case AArch64::STPXpre:
+  case AArch64::LDPSWpre:
+  case AArch64::LDPWpost:
+  case AArch64::STPWpost:
+  case AArch64::LDPWpre:
+  case AArch64::STPWpre:
+  case AArch64::LDPQpost:
+  case AArch64::STPQpost:
+  case AArch64::LDPQpre:
+  case AArch64::STPQpre:
+  case AArch64::LDPDpost:
+  case AArch64::STPDpost:
+  case AArch64::LDPDpre:
+  case AArch64::STPDpre:
+  case AArch64::LDPSpost:
+  case AArch64::STPSpost:
+  case AArch64::LDPSpre:
+  case AArch64::STPSpre:
+    DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder);
     break;
   }
 
-  case AArch64::ST2LN_WB_B_fixed: case AArch64::ST2LN_WB_B_register:
-  case AArch64::ST2LN_WB_H_fixed: case AArch64::ST2LN_WB_H_register:
-  case AArch64::ST2LN_WB_S_fixed: case AArch64::ST2LN_WB_S_register:
-  case AArch64::ST2LN_WB_D_fixed: case AArch64::ST2LN_WB_D_register: {
-    switch (Opc) {
-    case AArch64::ST2LN_WB_B_fixed: case AArch64::ST2LN_WB_B_register:
-      TransferBytes = 2; break;
-    case AArch64::ST2LN_WB_H_fixed: case AArch64::ST2LN_WB_H_register:
-      TransferBytes = 4; break;
-    case AArch64::ST2LN_WB_S_fixed: case AArch64::ST2LN_WB_S_register:
-      TransferBytes = 8; break;
-    case AArch64::ST2LN_WB_D_fixed: case AArch64::ST2LN_WB_D_register:
-      TransferBytes = 16; break;
-    }
-    NumVecs = 2;
+  switch (Opcode) {
+  default:
+    return Fail;
+  case AArch64::LDPXpost:
+  case AArch64::STPXpost:
+  case AArch64::LDPSWpost:
+  case AArch64::LDPXpre:
+  case AArch64::STPXpre:
+  case AArch64::LDPSWpre:
+    NeedsDisjointWritebackTransfer = true;
+    // Fallthrough
+  case AArch64::LDNPXi:
+  case AArch64::STNPXi:
+  case AArch64::LDPXi:
+  case AArch64::STPXi:
+  case AArch64::LDPSWi:
+    DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder);
+    DecodeGPR64RegisterClass(Inst, Rt2, Addr, Decoder);
     break;
-  }
-
-  case AArch64::ST3LN_WB_B_fixed: case AArch64::ST3LN_WB_B_register:
-  case AArch64::ST3LN_WB_H_fixed: case AArch64::ST3LN_WB_H_register:
-  case AArch64::ST3LN_WB_S_fixed: case AArch64::ST3LN_WB_S_register:
-  case AArch64::ST3LN_WB_D_fixed: case AArch64::ST3LN_WB_D_register: {
-    switch (Opc) {
-    case AArch64::ST3LN_WB_B_fixed: case AArch64::ST3LN_WB_B_register:
-      TransferBytes = 3; break;
-    case AArch64::ST3LN_WB_H_fixed: case AArch64::ST3LN_WB_H_register:
-      TransferBytes = 6; break;
-    case AArch64::ST3LN_WB_S_fixed: case AArch64::ST3LN_WB_S_register:
-      TransferBytes = 12; break;
-    case AArch64::ST3LN_WB_D_fixed: case AArch64::ST3LN_WB_D_register:
-      TransferBytes = 24; break;
-    }
-    NumVecs = 3;
+  case AArch64::LDPWpost:
+  case AArch64::STPWpost:
+  case AArch64::LDPWpre:
+  case AArch64::STPWpre:
+    NeedsDisjointWritebackTransfer = true;
+    // Fallthrough
+  case AArch64::LDNPWi:
+  case AArch64::STNPWi:
+  case AArch64::LDPWi:
+  case AArch64::STPWi:
+    DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder);
+    DecodeGPR32RegisterClass(Inst, Rt2, Addr, Decoder);
     break;
-  }
-
-  case AArch64::ST4LN_WB_B_fixed: case AArch64::ST4LN_WB_B_register:
-  case AArch64::ST4LN_WB_H_fixed: case AArch64::ST4LN_WB_H_register:
-  case AArch64::ST4LN_WB_S_fixed: case AArch64::ST4LN_WB_S_register:
-  case AArch64::ST4LN_WB_D_fixed: case AArch64::ST4LN_WB_D_register: {
-    switch (Opc) {
-    case AArch64::ST4LN_WB_B_fixed: case AArch64::ST4LN_WB_B_register:
-      TransferBytes = 4; break;
-    case AArch64::ST4LN_WB_H_fixed: case AArch64::ST4LN_WB_H_register:
-      TransferBytes = 8; break;
-    case AArch64::ST4LN_WB_S_fixed: case AArch64::ST4LN_WB_S_register:
-      TransferBytes = 16; break;
-    case AArch64::ST4LN_WB_D_fixed: case AArch64::ST4LN_WB_D_register:
-      TransferBytes = 32; break;
-    }
-    NumVecs = 4;
+  case AArch64::LDNPQi:
+  case AArch64::STNPQi:
+  case AArch64::LDPQpost:
+  case AArch64::STPQpost:
+  case AArch64::LDPQi:
+  case AArch64::STPQi:
+  case AArch64::LDPQpre:
+  case AArch64::STPQpre:
+    DecodeFPR128RegisterClass(Inst, Rt, Addr, Decoder);
+    DecodeFPR128RegisterClass(Inst, Rt2, Addr, Decoder);
+    break;
+  case AArch64::LDNPDi:
+  case AArch64::STNPDi:
+  case AArch64::LDPDpost:
+  case AArch64::STPDpost:
+  case AArch64::LDPDi:
+  case AArch64::STPDi:
+  case AArch64::LDPDpre:
+  case AArch64::STPDpre:
+    DecodeFPR64RegisterClass(Inst, Rt, Addr, Decoder);
+    DecodeFPR64RegisterClass(Inst, Rt2, Addr, Decoder);
+    break;
+  case AArch64::LDNPSi:
+  case AArch64::STNPSi:
+  case AArch64::LDPSpost:
+  case AArch64::STPSpost:
+  case AArch64::LDPSi:
+  case AArch64::STPSi:
+  case AArch64::LDPSpre:
+  case AArch64::STPSpre:
+    DecodeFPR32RegisterClass(Inst, Rt, Addr, Decoder);
+    DecodeFPR32RegisterClass(Inst, Rt2, Addr, Decoder);
     break;
   }
 
-  default:
-    return MCDisassembler::Fail;
-  } // End of switch (Opc)
+  DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder);
+  Inst.addOperand(MCOperand::CreateImm(offset));
 
-  unsigned Rt = fieldFromInstruction(Insn, 0, 5);
-  unsigned Rn = fieldFromInstruction(Insn, 5, 5);
-  unsigned Rm = fieldFromInstruction(Insn, 16, 5);
-
-  // Decode post-index of load duplicate lane
-  if (IsLoadDup) {
-    switch (NumVecs) {
-    case 1:
-      Is64bitVec ? DecodeFPR64RegisterClass(Inst, Rt, Address, Decoder)
-                 : DecodeFPR128RegisterClass(Inst, Rt, Address, Decoder);
-      break;
-    case 2:
-      Is64bitVec ? DecodeDPairRegisterClass(Inst, Rt, Address, Decoder)
-                 : DecodeQPairRegisterClass(Inst, Rt, Address, Decoder);
-      break;
-    case 3:
-      Is64bitVec ? DecodeDTripleRegisterClass(Inst, Rt, Address, Decoder)
-                 : DecodeQTripleRegisterClass(Inst, Rt, Address, Decoder);
-      break;
-    case 4:
-      Is64bitVec ? DecodeDQuadRegisterClass(Inst, Rt, Address, Decoder)
-                 : DecodeQQuadRegisterClass(Inst, Rt, Address, Decoder);
-    }
-
-    // Decode write back register, which is equal to Rn.
-    DecodeGPR64xspRegisterClass(Inst, Rn, Address, Decoder);
-    DecodeGPR64xspRegisterClass(Inst, Rn, Address, Decoder);
-
-    if (Rm == 31) // If Rm is 0x11111, add the number of transferred bytes
-      Inst.addOperand(MCOperand::CreateImm(TransferBytes));
-    else // Decode Rm
-      DecodeGPR64noxzrRegisterClass(Inst, Rm, Address, Decoder);
-
-    return MCDisassembler::Success;
-  }
+  // You shouldn't load to the same register twice in an instruction...
+  if (IsLoad && Rt == Rt2)
+    return SoftFail;
 
-  // Decode post-index of load/store lane
-  // Loads have a vector list as output.
-  if (IsLoad) {
-    switch (NumVecs) {
-    case 1:
-      DecodeFPR128RegisterClass(Inst, Rt, Address, Decoder);
-      break;
-    case 2:
-      DecodeQPairRegisterClass(Inst, Rt, Address, Decoder);
-      break;
-    case 3:
-      DecodeQTripleRegisterClass(Inst, Rt, Address, Decoder);
-      break;
-    case 4:
-      DecodeQQuadRegisterClass(Inst, Rt, Address, Decoder);
-    }
-  }
+  // ... or do any operation that writes-back to a transfer register. But note
+  // that "stp xzr, xzr, [sp], #4" is fine because xzr and sp are different.
+  if (NeedsDisjointWritebackTransfer && Rn != 31 && (Rt == Rn || Rt2 == Rn))
+    return SoftFail;
+
+  return Success;
+}
 
-  // Decode write back register, which is equal to Rn.
-  DecodeGPR64xspRegisterClass(Inst, Rn, Address, Decoder);
-  DecodeGPR64xspRegisterClass(Inst, Rn, Address, Decoder);
+static DecodeStatus DecodeAddSubERegInstruction(llvm::MCInst &Inst,
+                                                uint32_t insn, uint64_t Addr,
+                                                const void *Decoder) {
+  unsigned Rd = fieldFromInstruction(insn, 0, 5);
+  unsigned Rn = fieldFromInstruction(insn, 5, 5);
+  unsigned Rm = fieldFromInstruction(insn, 16, 5);
+  unsigned extend = fieldFromInstruction(insn, 10, 6);
 
-  if (Rm == 31) // If Rm is 0x11111, add the number of transferred bytes
-    Inst.addOperand(MCOperand::CreateImm(TransferBytes));
-  else // Decode Rm
-    DecodeGPR64noxzrRegisterClass(Inst, Rm, Address, Decoder);
+  unsigned shift = extend & 0x7;
+  if (shift > 4)
+    return Fail;
 
-  // Decode the source vector list.
-  switch (NumVecs) {
-  case 1:
-    DecodeFPR128RegisterClass(Inst, Rt, Address, Decoder);
-    break;
-  case 2:
-    DecodeQPairRegisterClass(Inst, Rt, Address, Decoder);
+  switch (Inst.getOpcode()) {
+  default:
+    return Fail;
+  case AArch64::ADDWrx:
+  case AArch64::SUBWrx:
+    DecodeGPR32spRegisterClass(Inst, Rd, Addr, Decoder);
+    DecodeGPR32spRegisterClass(Inst, Rn, Addr, Decoder);
+    DecodeGPR32RegisterClass(Inst, Rm, Addr, Decoder);
     break;
-  case 3:
-    DecodeQTripleRegisterClass(Inst, Rt, Address, Decoder);
+  case AArch64::ADDSWrx:
+  case AArch64::SUBSWrx:
+    DecodeGPR32RegisterClass(Inst, Rd, Addr, Decoder);
+    DecodeGPR32spRegisterClass(Inst, Rn, Addr, Decoder);
+    DecodeGPR32RegisterClass(Inst, Rm, Addr, Decoder);
     break;
-  case 4:
-    DecodeQQuadRegisterClass(Inst, Rt, Address, Decoder);
-  }
-
-  // Decode lane
-  unsigned Q = fieldFromInstruction(Insn, 30, 1);
-  unsigned S = fieldFromInstruction(Insn, 10, 3);
-  unsigned lane = 0;
-  // Calculate the number of lanes by number of vectors and transferred bytes.
-  // NumLanes = 16 bytes / bytes of each lane
-  unsigned NumLanes = 16 / (TransferBytes / NumVecs);
-  switch (NumLanes) {
-  case 16: // A vector has 16 lanes, each lane is 1 bytes.
-    lane = (Q << 3) | S;
+  case AArch64::ADDXrx:
+  case AArch64::SUBXrx:
+    DecodeGPR64spRegisterClass(Inst, Rd, Addr, Decoder);
+    DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder);
+    DecodeGPR32RegisterClass(Inst, Rm, Addr, Decoder);
     break;
-  case 8:
-    lane = (Q << 2) | (S >> 1);
+  case AArch64::ADDSXrx:
+  case AArch64::SUBSXrx:
+    DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder);
+    DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder);
+    DecodeGPR32RegisterClass(Inst, Rm, Addr, Decoder);
     break;
-  case 4:
-    lane = (Q << 1) | (S >> 2);
+  case AArch64::ADDXrx64:
+  case AArch64::SUBXrx64:
+    DecodeGPR64spRegisterClass(Inst, Rd, Addr, Decoder);
+    DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder);
+    DecodeGPR64RegisterClass(Inst, Rm, Addr, Decoder);
     break;
-  case 2:
-    lane = Q;
+  case AArch64::SUBSXrx64:
+  case AArch64::ADDSXrx64:
+    DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder);
+    DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder);
+    DecodeGPR64RegisterClass(Inst, Rm, Addr, Decoder);
     break;
   }
-  Inst.addOperand(MCOperand::CreateImm(lane));
 
-  return MCDisassembler::Success;
+  Inst.addOperand(MCOperand::CreateImm(extend));
+  return Success;
 }
 
-static DecodeStatus DecodeSHLLInstruction(MCInst &Inst, unsigned Insn,
-                                          uint64_t Address,
-                                          const void *Decoder) {
-  unsigned Rd = fieldFromInstruction(Insn, 0, 5);
-  unsigned Rn = fieldFromInstruction(Insn, 5, 5);
-  unsigned size = fieldFromInstruction(Insn, 22, 2);
-  unsigned Q = fieldFromInstruction(Insn, 30, 1);
+static DecodeStatus DecodeLogicalImmInstruction(llvm::MCInst &Inst,
+                                                uint32_t insn, uint64_t Addr,
+                                                const void *Decoder) {
+  unsigned Rd = fieldFromInstruction(insn, 0, 5);
+  unsigned Rn = fieldFromInstruction(insn, 5, 5);
+  unsigned Datasize = fieldFromInstruction(insn, 31, 1);
+  unsigned imm;
+
+  if (Datasize) {
+    if (Inst.getOpcode() == AArch64::ANDSXri)
+      DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder);
+    else
+      DecodeGPR64spRegisterClass(Inst, Rd, Addr, Decoder);
+    DecodeGPR64RegisterClass(Inst, Rn, Addr, Decoder);
+    imm = fieldFromInstruction(insn, 10, 13);
+    if (!AArch64_AM::isValidDecodeLogicalImmediate(imm, 64))
+      return Fail;
+  } else {
+    if (Inst.getOpcode() == AArch64::ANDSWri)
+      DecodeGPR32RegisterClass(Inst, Rd, Addr, Decoder);
+    else
+      DecodeGPR32spRegisterClass(Inst, Rd, Addr, Decoder);
+    DecodeGPR32RegisterClass(Inst, Rn, Addr, Decoder);
+    imm = fieldFromInstruction(insn, 10, 12);
+    if (!AArch64_AM::isValidDecodeLogicalImmediate(imm, 32))
+      return Fail;
+  }
+  Inst.addOperand(MCOperand::CreateImm(imm));
+  return Success;
+}
 
-  DecodeFPR128RegisterClass(Inst, Rd, Address, Decoder);
+static DecodeStatus DecodeModImmInstruction(llvm::MCInst &Inst, uint32_t insn,
+                                            uint64_t Addr,
+                                            const void *Decoder) {
+  unsigned Rd = fieldFromInstruction(insn, 0, 5);
+  unsigned cmode = fieldFromInstruction(insn, 12, 4);
+  unsigned imm = fieldFromInstruction(insn, 16, 3) << 5;
+  imm |= fieldFromInstruction(insn, 5, 5);
 
-  if(Q)
-    DecodeFPR128RegisterClass(Inst, Rn, Address, Decoder);
+  if (Inst.getOpcode() == AArch64::MOVID)
+    DecodeFPR64RegisterClass(Inst, Rd, Addr, Decoder);
   else
-    DecodeFPR64RegisterClass(Inst, Rn, Address, Decoder);
+    DecodeVectorRegisterClass(Inst, Rd, Addr, Decoder);
+
+  Inst.addOperand(MCOperand::CreateImm(imm));
 
-  switch (size) {
-  case 0:
-    Inst.addOperand(MCOperand::CreateImm(8));
+  switch (Inst.getOpcode()) {
+  default:
     break;
-  case 1:
-    Inst.addOperand(MCOperand::CreateImm(16));
+  case AArch64::MOVIv4i16:
+  case AArch64::MOVIv8i16:
+  case AArch64::MVNIv4i16:
+  case AArch64::MVNIv8i16:
+  case AArch64::MOVIv2i32:
+  case AArch64::MOVIv4i32:
+  case AArch64::MVNIv2i32:
+  case AArch64::MVNIv4i32:
+    Inst.addOperand(MCOperand::CreateImm((cmode & 6) << 2));
     break;
-  case 2:
-    Inst.addOperand(MCOperand::CreateImm(32));
+  case AArch64::MOVIv2s_msl:
+  case AArch64::MOVIv4s_msl:
+  case AArch64::MVNIv2s_msl:
+  case AArch64::MVNIv4s_msl:
+    Inst.addOperand(MCOperand::CreateImm(cmode & 1 ? 0x110 : 0x108));
     break;
-  default :
-    return MCDisassembler::Fail;
   }
-  return MCDisassembler::Success;
+
+  return Success;
 }
 
+static DecodeStatus DecodeModImmTiedInstruction(llvm::MCInst &Inst,
+                                                uint32_t insn, uint64_t Addr,
+                                                const void *Decoder) {
+  unsigned Rd = fieldFromInstruction(insn, 0, 5);
+  unsigned cmode = fieldFromInstruction(insn, 12, 4);
+  unsigned imm = fieldFromInstruction(insn, 16, 3) << 5;
+  imm |= fieldFromInstruction(insn, 5, 5);
+
+  // Tied operands added twice.
+  DecodeVectorRegisterClass(Inst, Rd, Addr, Decoder);
+  DecodeVectorRegisterClass(Inst, Rd, Addr, Decoder);
+
+  Inst.addOperand(MCOperand::CreateImm(imm));
+  Inst.addOperand(MCOperand::CreateImm((cmode & 6) << 2));
+
+  return Success;
+}
+
+static DecodeStatus DecodeAdrInstruction(llvm::MCInst &Inst, uint32_t insn,
+                                         uint64_t Addr, const void *Decoder) {
+  unsigned Rd = fieldFromInstruction(insn, 0, 5);
+  int64_t imm = fieldFromInstruction(insn, 5, 19) << 2;
+  imm |= fieldFromInstruction(insn, 29, 2);
+  const AArch64Disassembler *Dis =
+      static_cast<const AArch64Disassembler *>(Decoder);
+
+  // Sign-extend the 21-bit immediate.
+  if (imm & (1 << (21 - 1)))
+    imm |= ~((1LL << 21) - 1);
+
+  DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder);
+  if (!Dis->tryAddingSymbolicOperand(Inst, imm, Addr, Fail, 0, 4))
+    Inst.addOperand(MCOperand::CreateImm(imm));
+
+  return Success;
+}
+
+static DecodeStatus DecodeBaseAddSubImm(llvm::MCInst &Inst, uint32_t insn,
+                                        uint64_t Addr, const void *Decoder) {
+  unsigned Rd = fieldFromInstruction(insn, 0, 5);
+  unsigned Rn = fieldFromInstruction(insn, 5, 5);
+  unsigned Imm = fieldFromInstruction(insn, 10, 14);
+  unsigned S = fieldFromInstruction(insn, 29, 1);
+  unsigned Datasize = fieldFromInstruction(insn, 31, 1);
+
+  unsigned ShifterVal = (Imm >> 12) & 3;
+  unsigned ImmVal = Imm & 0xFFF;
+  const AArch64Disassembler *Dis =
+      static_cast<const AArch64Disassembler *>(Decoder);
+
+  if (ShifterVal != 0 && ShifterVal != 1)
+    return Fail;
+
+  if (Datasize) {
+    if (Rd == 31 && !S)
+      DecodeGPR64spRegisterClass(Inst, Rd, Addr, Decoder);
+    else
+      DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder);
+    DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder);
+  } else {
+    if (Rd == 31 && !S)
+      DecodeGPR32spRegisterClass(Inst, Rd, Addr, Decoder);
+    else
+      DecodeGPR32RegisterClass(Inst, Rd, Addr, Decoder);
+    DecodeGPR32spRegisterClass(Inst, Rn, Addr, Decoder);
+  }
+
+  if (!Dis->tryAddingSymbolicOperand(Inst, Imm, Addr, Fail, 0, 4))
+    Inst.addOperand(MCOperand::CreateImm(ImmVal));
+  Inst.addOperand(MCOperand::CreateImm(12 * ShifterVal));
+  return Success;
+}
+
+static DecodeStatus DecodeUnconditionalBranch(llvm::MCInst &Inst, uint32_t insn,
+                                              uint64_t Addr,
+                                              const void *Decoder) {
+  int64_t imm = fieldFromInstruction(insn, 0, 26);
+  const AArch64Disassembler *Dis =
+      static_cast<const AArch64Disassembler *>(Decoder);
+
+  // Sign-extend the 26-bit immediate.
+  if (imm & (1 << (26 - 1)))
+    imm |= ~((1LL << 26) - 1);
+
+  if (!Dis->tryAddingSymbolicOperand(Inst, imm << 2, Addr, true, 0, 4))
+    Inst.addOperand(MCOperand::CreateImm(imm));
+
+  return Success;
+}
+
+static DecodeStatus DecodeSystemPStateInstruction(llvm::MCInst &Inst,
+                                                  uint32_t insn, uint64_t Addr,
+                                                  const void *Decoder) {
+  uint64_t op1 = fieldFromInstruction(insn, 16, 3);
+  uint64_t op2 = fieldFromInstruction(insn, 5, 3);
+  uint64_t crm = fieldFromInstruction(insn, 8, 4);
+
+  uint64_t pstate_field = (op1 << 3) | op2;
+
+  Inst.addOperand(MCOperand::CreateImm(pstate_field));
+  Inst.addOperand(MCOperand::CreateImm(crm));
+
+  bool ValidNamed;
+  (void)AArch64PState::PStateMapper().toString(pstate_field, ValidNamed);
+  
+  return ValidNamed ? Success : Fail;
+}
+
+static DecodeStatus DecodeTestAndBranch(llvm::MCInst &Inst, uint32_t insn,
+                                        uint64_t Addr, const void *Decoder) {
+  uint64_t Rt = fieldFromInstruction(insn, 0, 5);
+  uint64_t bit = fieldFromInstruction(insn, 31, 1) << 5;
+  bit |= fieldFromInstruction(insn, 19, 5);
+  int64_t dst = fieldFromInstruction(insn, 5, 14);
+  const AArch64Disassembler *Dis =
+      static_cast<const AArch64Disassembler *>(Decoder);
+
+  // Sign-extend 14-bit immediate.
+  if (dst & (1 << (14 - 1)))
+    dst |= ~((1LL << 14) - 1);
+
+  if (fieldFromInstruction(insn, 31, 1) == 0)
+    DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder);
+  else
+    DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder);
+  Inst.addOperand(MCOperand::CreateImm(bit));
+  if (!Dis->tryAddingSymbolicOperand(Inst, dst << 2, Addr, true, 0, 4))
+    Inst.addOperand(MCOperand::CreateImm(dst));
+
+  return Success;
+}
diff --git a/lib/Target/AArch64/Disassembler/AArch64Disassembler.h b/lib/Target/AArch64/Disassembler/AArch64Disassembler.h
new file mode 100644
index 0000000..68d4867
--- /dev/null
+++ b/lib/Target/AArch64/Disassembler/AArch64Disassembler.h
@@ -0,0 +1,40 @@
+//===- AArch64Disassembler.h - Disassembler for AArch64 ---------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef AArch64DISASSEMBLER_H
+#define AArch64DISASSEMBLER_H
+
+#include "llvm/MC/MCDisassembler.h"
+
+namespace llvm {
+
+class MCInst;
+class MemoryObject;
+class raw_ostream;
+
+class AArch64Disassembler : public MCDisassembler {
+public:
+  AArch64Disassembler(const MCSubtargetInfo &STI, MCContext &Ctx)
+    : MCDisassembler(STI, Ctx) {}
+
+  ~AArch64Disassembler() {}
+
+  /// getInstruction - See MCDisassembler.
+  MCDisassembler::DecodeStatus
+  getInstruction(MCInst &instr, uint64_t &size, const MemoryObject &region,
+                 uint64_t address, raw_ostream &vStream,
+                 raw_ostream &cStream) const override;
+};
+
+} // namespace llvm
+
+#endif
diff --git a/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp b/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp
new file mode 100644
index 0000000..2466368
--- /dev/null
+++ b/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.cpp
@@ -0,0 +1,221 @@
+//===- AArch64ExternalSymbolizer.cpp - Symbolizer for AArch64 ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64ExternalSymbolizer.h"
+#include "AArch64Subtarget.h"
+#include "MCTargetDesc/AArch64AddressingModes.h"
+#include "Utils/AArch64BaseInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-disassembler"
+
+static MCSymbolRefExpr::VariantKind
+getVariant(uint64_t LLVMDisassembler_VariantKind) {
+  switch (LLVMDisassembler_VariantKind) {
+  case LLVMDisassembler_VariantKind_None:
+    return MCSymbolRefExpr::VK_None;
+  case LLVMDisassembler_VariantKind_ARM64_PAGE:
+    return MCSymbolRefExpr::VK_PAGE;
+  case LLVMDisassembler_VariantKind_ARM64_PAGEOFF:
+    return MCSymbolRefExpr::VK_PAGEOFF;
+  case LLVMDisassembler_VariantKind_ARM64_GOTPAGE:
+    return MCSymbolRefExpr::VK_GOTPAGE;
+  case LLVMDisassembler_VariantKind_ARM64_GOTPAGEOFF:
+    return MCSymbolRefExpr::VK_GOTPAGEOFF;
+  case LLVMDisassembler_VariantKind_ARM64_TLVP:
+  case LLVMDisassembler_VariantKind_ARM64_TLVOFF:
+  default:
+    assert(0 && "bad LLVMDisassembler_VariantKind");
+    return MCSymbolRefExpr::VK_None;
+  }
+}
+
+/// tryAddingSymbolicOperand - tryAddingSymbolicOperand trys to add a symbolic
+/// operand in place of the immediate Value in the MCInst.  The immediate
+/// Value has not had any PC adjustment made by the caller. If the instruction
+/// is a branch that adds the PC to the immediate Value then isBranch is
+/// Success, else Fail. If GetOpInfo is non-null, then it is called to get any
+/// symbolic information at the Address for this instrution.  If that returns
+/// non-zero then the symbolic information it returns is used to create an
+/// MCExpr and that is added as an operand to the MCInst.  If GetOpInfo()
+/// returns zero and isBranch is Success then a symbol look up for
+/// Address + Value is done and if a symbol is found an MCExpr is created with
+/// that, else an MCExpr with Address + Value is created.  If GetOpInfo()
+/// returns zero and isBranch is Fail then the the Opcode of the MCInst is
+/// tested and for ADRP an other instructions that help to load of pointers
+/// a symbol look up is done to see it is returns a specific reference type
+/// to add to the comment stream.  This function returns Success if it adds
+/// an operand to the MCInst and Fail otherwise.
+bool AArch64ExternalSymbolizer::tryAddingSymbolicOperand(
+    MCInst &MI, raw_ostream &CommentStream, int64_t Value, uint64_t Address,
+    bool IsBranch, uint64_t Offset, uint64_t InstSize) {
+  // FIXME: This method shares a lot of code with
+  //        MCExternalSymbolizer::tryAddingSymbolicOperand. It may be possible
+  //        refactor the MCExternalSymbolizer interface to allow more of this
+  //        implementation to be shared.
+  //
+  struct LLVMOpInfo1 SymbolicOp;
+  memset(&SymbolicOp, '\0', sizeof(struct LLVMOpInfo1));
+  SymbolicOp.Value = Value;
+  uint64_t ReferenceType;
+  const char *ReferenceName;
+  if (!GetOpInfo ||
+      !GetOpInfo(DisInfo, Address, 0 /* Offset */, InstSize, 1, &SymbolicOp)) {
+    if (IsBranch) {
+      ReferenceType = LLVMDisassembler_ReferenceType_In_Branch;
+      const char *Name = SymbolLookUp(DisInfo, Address + Value, &ReferenceType,
+                                      Address, &ReferenceName);
+      if (Name) {
+        SymbolicOp.AddSymbol.Name = Name;
+        SymbolicOp.AddSymbol.Present = true;
+        SymbolicOp.Value = 0;
+      } else {
+        SymbolicOp.Value = Address + Value;
+      }
+      if (ReferenceType == LLVMDisassembler_ReferenceType_Out_SymbolStub)
+        CommentStream << "symbol stub for: " << ReferenceName;
+      else if (ReferenceType ==
+               LLVMDisassembler_ReferenceType_Out_Objc_Message)
+        CommentStream << "Objc message: " << ReferenceName;
+    } else if (MI.getOpcode() == AArch64::ADRP) {
+        ReferenceType = LLVMDisassembler_ReferenceType_In_ARM64_ADRP;
+        // otool expects the fully encoded ADRP instruction to be passed in as
+        // the value here, so reconstruct it:
+        const MCRegisterInfo &MCRI = *Ctx.getRegisterInfo();
+        uint32_t EncodedInst = 0x90000000;
+        EncodedInst |= (Value & 0x3) << 29; // immlo
+        EncodedInst |= ((Value >> 2) & 0x7FFFF) << 5; // immhi
+        EncodedInst |= MCRI.getEncodingValue(MI.getOperand(0).getReg()); // reg
+        SymbolLookUp(DisInfo, EncodedInst, &ReferenceType, Address,
+                     &ReferenceName);
+        CommentStream << format("0x%llx",
+                                0xfffffffffffff000LL & (Address + Value));
+    } else if (MI.getOpcode() == AArch64::ADDXri ||
+               MI.getOpcode() == AArch64::LDRXui ||
+               MI.getOpcode() == AArch64::LDRXl ||
+               MI.getOpcode() == AArch64::ADR) {
+      if (MI.getOpcode() == AArch64::ADDXri)
+        ReferenceType = LLVMDisassembler_ReferenceType_In_ARM64_ADDXri;
+      else if (MI.getOpcode() == AArch64::LDRXui)
+        ReferenceType = LLVMDisassembler_ReferenceType_In_ARM64_LDRXui;
+      if (MI.getOpcode() == AArch64::LDRXl) {
+        ReferenceType = LLVMDisassembler_ReferenceType_In_ARM64_LDRXl;
+        SymbolLookUp(DisInfo, Address + Value, &ReferenceType, Address,
+                     &ReferenceName);
+      } else if (MI.getOpcode() == AArch64::ADR) {
+        ReferenceType = LLVMDisassembler_ReferenceType_In_ARM64_ADR;
+        SymbolLookUp(DisInfo, Address + Value, &ReferenceType, Address,
+                            &ReferenceName);
+      } else {
+        const MCRegisterInfo &MCRI = *Ctx.getRegisterInfo();
+        // otool expects the fully encoded ADD/LDR instruction to be passed in
+        // as the value here, so reconstruct it:
+        unsigned EncodedInst =
+          MI.getOpcode() == AArch64::ADDXri ? 0x91000000: 0xF9400000;
+        EncodedInst |= Value << 10; // imm12 [+ shift:2 for ADD]
+        EncodedInst |=
+          MCRI.getEncodingValue(MI.getOperand(1).getReg()) << 5; // Rn
+        EncodedInst |= MCRI.getEncodingValue(MI.getOperand(0).getReg()); // Rd
+
+        SymbolLookUp(DisInfo, EncodedInst, &ReferenceType, Address,
+                     &ReferenceName);
+      }
+      if (ReferenceType == LLVMDisassembler_ReferenceType_Out_LitPool_SymAddr)
+        CommentStream << "literal pool symbol address: " << ReferenceName;
+      else if (ReferenceType ==
+               LLVMDisassembler_ReferenceType_Out_LitPool_CstrAddr)
+        CommentStream << "literal pool for: \"" << ReferenceName << "\"";
+      else if (ReferenceType ==
+               LLVMDisassembler_ReferenceType_Out_Objc_CFString_Ref)
+        CommentStream << "Objc cfstring ref: @\"" << ReferenceName << "\"";
+      else if (ReferenceType ==
+               LLVMDisassembler_ReferenceType_Out_Objc_Message)
+        CommentStream << "Objc message: " << ReferenceName;
+      else if (ReferenceType ==
+               LLVMDisassembler_ReferenceType_Out_Objc_Message_Ref)
+        CommentStream << "Objc message ref: " << ReferenceName;
+      else if (ReferenceType ==
+               LLVMDisassembler_ReferenceType_Out_Objc_Selector_Ref)
+        CommentStream << "Objc selector ref: " << ReferenceName;
+      else if (ReferenceType ==
+               LLVMDisassembler_ReferenceType_Out_Objc_Class_Ref)
+        CommentStream << "Objc class ref: " << ReferenceName;
+      // For these instructions, the SymbolLookUp() above is just to get the
+      // ReferenceType and ReferenceName.  We want to make sure not to
+      // fall through so we don't build an MCExpr to leave the disassembly
+      // of the immediate values of these instructions to the InstPrinter.
+      return false;
+    } else {
+      return false;
+    }
+  }
+
+  const MCExpr *Add = nullptr;
+  if (SymbolicOp.AddSymbol.Present) {
+    if (SymbolicOp.AddSymbol.Name) {
+      StringRef Name(SymbolicOp.AddSymbol.Name);
+      MCSymbol *Sym = Ctx.GetOrCreateSymbol(Name);
+      MCSymbolRefExpr::VariantKind Variant = getVariant(SymbolicOp.VariantKind);
+      if (Variant != MCSymbolRefExpr::VK_None)
+        Add = MCSymbolRefExpr::Create(Sym, Variant, Ctx);
+      else
+        Add = MCSymbolRefExpr::Create(Sym, Ctx);
+    } else {
+      Add = MCConstantExpr::Create(SymbolicOp.AddSymbol.Value, Ctx);
+    }
+  }
+
+  const MCExpr *Sub = nullptr;
+  if (SymbolicOp.SubtractSymbol.Present) {
+    if (SymbolicOp.SubtractSymbol.Name) {
+      StringRef Name(SymbolicOp.SubtractSymbol.Name);
+      MCSymbol *Sym = Ctx.GetOrCreateSymbol(Name);
+      Sub = MCSymbolRefExpr::Create(Sym, Ctx);
+    } else {
+      Sub = MCConstantExpr::Create(SymbolicOp.SubtractSymbol.Value, Ctx);
+    }
+  }
+
+  const MCExpr *Off = nullptr;
+  if (SymbolicOp.Value != 0)
+    Off = MCConstantExpr::Create(SymbolicOp.Value, Ctx);
+
+  const MCExpr *Expr;
+  if (Sub) {
+    const MCExpr *LHS;
+    if (Add)
+      LHS = MCBinaryExpr::CreateSub(Add, Sub, Ctx);
+    else
+      LHS = MCUnaryExpr::CreateMinus(Sub, Ctx);
+    if (Off)
+      Expr = MCBinaryExpr::CreateAdd(LHS, Off, Ctx);
+    else
+      Expr = LHS;
+  } else if (Add) {
+    if (Off)
+      Expr = MCBinaryExpr::CreateAdd(Add, Off, Ctx);
+    else
+      Expr = Add;
+  } else {
+    if (Off)
+      Expr = Off;
+    else
+      Expr = MCConstantExpr::Create(0, Ctx);
+  }
+
+  MI.addOperand(MCOperand::CreateExpr(Expr));
+
+  return true;
+}
diff --git a/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.h b/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.h
new file mode 100644
index 0000000..171d31c
--- /dev/null
+++ b/lib/Target/AArch64/Disassembler/AArch64ExternalSymbolizer.h
@@ -0,0 +1,38 @@
+//===- AArch64ExternalSymbolizer.h - Symbolizer for AArch64 -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Symbolize AArch64 assembly code during disassembly using callbacks.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef AArch64EXTERNALSYMBOLIZER_H
+#define AArch64EXTERNALSYMBOLIZER_H
+
+#include "llvm/MC/MCExternalSymbolizer.h"
+
+namespace llvm {
+
+class AArch64ExternalSymbolizer : public MCExternalSymbolizer {
+public:
+  AArch64ExternalSymbolizer(MCContext &Ctx,
+                            std::unique_ptr<MCRelocationInfo> RelInfo,
+                            LLVMOpInfoCallback GetOpInfo,
+                            LLVMSymbolLookupCallback SymbolLookUp,
+                            void *DisInfo)
+      : MCExternalSymbolizer(Ctx, std::move(RelInfo), GetOpInfo, SymbolLookUp,
+                             DisInfo) {}
+
+  bool tryAddingSymbolicOperand(MCInst &MI, raw_ostream &CommentStream,
+                                int64_t Value, uint64_t Address, bool IsBranch,
+                                uint64_t Offset, uint64_t InstSize) override;
+};
+
+} // namespace llvm
+
+#endif
diff --git a/lib/Target/AArch64/Disassembler/Android.mk b/lib/Target/AArch64/Disassembler/Android.mk
index fcc53ad..b89538d 100644
--- a/lib/Target/AArch64/Disassembler/Android.mk
+++ b/lib/Target/AArch64/Disassembler/Android.mk
@@ -7,7 +7,8 @@ arm64_disassembler_TBLGEN_TABLES := \
   AArch64GenRegisterInfo.inc
 
 arm64_disassembler_SRC_FILES := \
-  AArch64Disassembler.cpp
+  AArch64Disassembler.cpp \
+  AArch64ExternalSymbolizer.cpp
 
 # For the device
 # =====================================================
diff --git a/lib/Target/AArch64/Disassembler/CMakeLists.txt b/lib/Target/AArch64/Disassembler/CMakeLists.txt
index 21baf25..be4ccad 100644
--- a/lib/Target/AArch64/Disassembler/CMakeLists.txt
+++ b/lib/Target/AArch64/Disassembler/CMakeLists.txt
@@ -1,3 +1,14 @@
+include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
+
 add_llvm_library(LLVMAArch64Disassembler
   AArch64Disassembler.cpp
+  AArch64ExternalSymbolizer.cpp
   )
+# workaround for hanging compilation on MSVC8, 9 and 10
+#if( MSVC_VERSION EQUAL 1400 OR MSVC_VERSION EQUAL 1500 OR MSVC_VERSION EQUAL 1600 )
+#set_property(
+#  SOURCE ARMDisassembler.cpp
+#  PROPERTY COMPILE_FLAGS "/Od"
+#  )
+#endif()
+add_dependencies(LLVMAArch64Disassembler AArch64CommonTableGen)
diff --git a/lib/Target/AArch64/Disassembler/LLVMBuild.txt b/lib/Target/AArch64/Disassembler/LLVMBuild.txt
index 05c4ed1..a4224f4 100644
--- a/lib/Target/AArch64/Disassembler/LLVMBuild.txt
+++ b/lib/Target/AArch64/Disassembler/LLVMBuild.txt
@@ -1,4 +1,4 @@
-;===- ./lib/Target/AArch64/Disassembler/LLVMBuild.txt ----------*- Conf -*--===;
+;===- ./lib/Target/AArch64/Disassembler/LLVMBuild.txt ------------*- Conf -*--===;
 ;
 ;                     The LLVM Compiler Infrastructure
 ;
diff --git a/lib/Target/AArch64/Disassembler/Makefile b/lib/Target/AArch64/Disassembler/Makefile
index 5c86120..741bb81 100644
--- a/lib/Target/AArch64/Disassembler/Makefile
+++ b/lib/Target/AArch64/Disassembler/Makefile
@@ -10,7 +10,7 @@
 LEVEL = ../../../..
 LIBRARYNAME = LLVMAArch64Disassembler
 
-# Hack: we need to include 'main' target directory to grab private headers
+# Hack: we need to include 'main' arm target directory to grab private headers
 CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
 
 include $(LEVEL)/Makefile.common
diff --git a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp
index fd3f009..f484a5b 100644
--- a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp
+++ b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp
@@ -11,529 +11,1306 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "asm-printer"
 #include "AArch64InstPrinter.h"
-#include "MCTargetDesc/AArch64MCTargetDesc.h"
+#include "MCTargetDesc/AArch64AddressingModes.h"
 #include "Utils/AArch64BaseInfo.h"
-#include "llvm/MC/MCExpr.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringExtras.h"
 #include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCRegisterInfo.h"
-#include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/Format.h"
 #include "llvm/Support/raw_ostream.h"
-
 using namespace llvm;
 
+#define DEBUG_TYPE "asm-printer"
+
 #define GET_INSTRUCTION_NAME
 #define PRINT_ALIAS_INSTR
 #include "AArch64GenAsmWriter.inc"
-
-static int64_t unpackSignedImm(int BitWidth, uint64_t Value) {
-  assert(!(Value & ~((1ULL << BitWidth)-1)) && "immediate not n-bit");
-  if (Value & (1ULL <<  (BitWidth - 1)))
-    return static_cast<int64_t>(Value) - (1LL << BitWidth);
-  else
-    return Value;
-}
+#define GET_INSTRUCTION_NAME
+#define PRINT_ALIAS_INSTR
+#include "AArch64GenAsmWriter1.inc"
 
 AArch64InstPrinter::AArch64InstPrinter(const MCAsmInfo &MAI,
                                        const MCInstrInfo &MII,
                                        const MCRegisterInfo &MRI,
-                                       const MCSubtargetInfo &STI) :
-  MCInstPrinter(MAI, MII, MRI) {
+                                       const MCSubtargetInfo &STI)
+    : MCInstPrinter(MAI, MII, MRI) {
   // Initialize the set of available features.
   setAvailableFeatures(STI.getFeatureBits());
 }
 
+AArch64AppleInstPrinter::AArch64AppleInstPrinter(const MCAsmInfo &MAI,
+                                                 const MCInstrInfo &MII,
+                                                 const MCRegisterInfo &MRI,
+                                                 const MCSubtargetInfo &STI)
+    : AArch64InstPrinter(MAI, MII, MRI, STI) {}
+
 void AArch64InstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
+  // This is for .cfi directives.
   OS << getRegisterName(RegNo);
 }
 
-void
-AArch64InstPrinter::printOffsetSImm9Operand(const MCInst *MI,
-                                              unsigned OpNum, raw_ostream &O) {
-  const MCOperand &MOImm = MI->getOperand(OpNum);
-  int32_t Imm = unpackSignedImm(9, MOImm.getImm());
-
-  O << '#' << Imm;
-}
-
-void
-AArch64InstPrinter::printAddrRegExtendOperand(const MCInst *MI, unsigned OpNum,
-                                          raw_ostream &O, unsigned MemSize,
-                                          unsigned RmSize) {
-  unsigned ExtImm = MI->getOperand(OpNum).getImm();
-  unsigned OptionHi = ExtImm >> 1;
-  unsigned S = ExtImm & 1;
-  bool IsLSL = OptionHi == 1 && RmSize == 64;
-
-  const char *Ext;
-  switch (OptionHi) {
-  case 1:
-    Ext = (RmSize == 32) ? "uxtw" : "lsl";
-    break;
-  case 3:
-    Ext = (RmSize == 32) ? "sxtw" : "sxtx";
-    break;
-  default:
-    llvm_unreachable("Incorrect Option on load/store (reg offset)");
-  }
-  O << Ext;
+void AArch64InstPrinter::printInst(const MCInst *MI, raw_ostream &O,
+                                   StringRef Annot) {
+  // Check for special encodings and print the canonical alias instead.
 
-  if (S) {
-    unsigned ShiftAmt = Log2_32(MemSize);
-    O << " #" << ShiftAmt;
-  } else if (IsLSL) {
-    O << " #0";
-  }
-}
+  unsigned Opcode = MI->getOpcode();
 
-void
-AArch64InstPrinter::printAddSubImmLSL0Operand(const MCInst *MI,
-                                              unsigned OpNum, raw_ostream &O) {
-  const MCOperand &Imm12Op = MI->getOperand(OpNum);
+  if (Opcode == AArch64::SYSxt)
+    if (printSysAlias(MI, O)) {
+      printAnnotation(O, Annot);
+      return;
+    }
 
-  if (Imm12Op.isImm()) {
-    int64_t Imm12 = Imm12Op.getImm();
-    assert(Imm12 >= 0 && "Invalid immediate for add/sub imm");
-    O << "#" << Imm12;
-  } else {
-    assert(Imm12Op.isExpr() && "Unexpected shift operand type");
-    O << "#" << *Imm12Op.getExpr();
-  }
-}
+  // SBFM/UBFM should print to a nicer aliased form if possible.
+  if (Opcode == AArch64::SBFMXri || Opcode == AArch64::SBFMWri ||
+      Opcode == AArch64::UBFMXri || Opcode == AArch64::UBFMWri) {
+    const MCOperand &Op0 = MI->getOperand(0);
+    const MCOperand &Op1 = MI->getOperand(1);
+    const MCOperand &Op2 = MI->getOperand(2);
+    const MCOperand &Op3 = MI->getOperand(3);
+
+    bool IsSigned = (Opcode == AArch64::SBFMXri || Opcode == AArch64::SBFMWri);
+    bool Is64Bit = (Opcode == AArch64::SBFMXri || Opcode == AArch64::UBFMXri);
+    if (Op2.isImm() && Op2.getImm() == 0 && Op3.isImm()) {
+      const char *AsmMnemonic = nullptr;
+
+      switch (Op3.getImm()) {
+      default:
+        break;
+      case 7:
+        if (IsSigned)
+          AsmMnemonic = "sxtb";
+        else if (!Is64Bit)
+          AsmMnemonic = "uxtb";
+        break;
+      case 15:
+        if (IsSigned)
+          AsmMnemonic = "sxth";
+        else if (!Is64Bit)
+          AsmMnemonic = "uxth";
+        break;
+      case 31:
+        // *xtw is only valid for signed 64-bit operations.
+        if (Is64Bit && IsSigned)
+          AsmMnemonic = "sxtw";
+        break;
+      }
+
+      if (AsmMnemonic) {
+        O << '\t' << AsmMnemonic << '\t' << getRegisterName(Op0.getReg())
+          << ", " << getRegisterName(getWRegFromXReg(Op1.getReg()));
+        printAnnotation(O, Annot);
+        return;
+      }
+    }
 
-void
-AArch64InstPrinter::printAddSubImmLSL12Operand(const MCInst *MI, unsigned OpNum,
-                                               raw_ostream &O) {
+    // All immediate shifts are aliases, implemented using the Bitfield
+    // instruction. In all cases the immediate shift amount shift must be in
+    // the range 0 to (reg.size -1).
+    if (Op2.isImm() && Op3.isImm()) {
+      const char *AsmMnemonic = nullptr;
+      int shift = 0;
+      int64_t immr = Op2.getImm();
+      int64_t imms = Op3.getImm();
+      if (Opcode == AArch64::UBFMWri && imms != 0x1F && ((imms + 1) == immr)) {
+        AsmMnemonic = "lsl";
+        shift = 31 - imms;
+      } else if (Opcode == AArch64::UBFMXri && imms != 0x3f &&
+                 ((imms + 1 == immr))) {
+        AsmMnemonic = "lsl";
+        shift = 63 - imms;
+      } else if (Opcode == AArch64::UBFMWri && imms == 0x1f) {
+        AsmMnemonic = "lsr";
+        shift = immr;
+      } else if (Opcode == AArch64::UBFMXri && imms == 0x3f) {
+        AsmMnemonic = "lsr";
+        shift = immr;
+      } else if (Opcode == AArch64::SBFMWri && imms == 0x1f) {
+        AsmMnemonic = "asr";
+        shift = immr;
+      } else if (Opcode == AArch64::SBFMXri && imms == 0x3f) {
+        AsmMnemonic = "asr";
+        shift = immr;
+      }
+      if (AsmMnemonic) {
+        O << '\t' << AsmMnemonic << '\t' << getRegisterName(Op0.getReg())
+          << ", " << getRegisterName(Op1.getReg()) << ", #" << shift;
+        printAnnotation(O, Annot);
+        return;
+      }
+    }
 
-  printAddSubImmLSL0Operand(MI, OpNum, O);
+    // SBFIZ/UBFIZ aliases
+    if (Op2.getImm() > Op3.getImm()) {
+      O << '\t' << (IsSigned ? "sbfiz" : "ubfiz") << '\t'
+        << getRegisterName(Op0.getReg()) << ", " << getRegisterName(Op1.getReg())
+        << ", #" << (Is64Bit ? 64 : 32) - Op2.getImm() << ", #" << Op3.getImm() + 1;
+      printAnnotation(O, Annot);
+      return;
+    }
 
-  O << ", lsl #12";
-}
+    // Otherwise SBFX/UBFX is the preferred form
+    O << '\t' << (IsSigned ? "sbfx" : "ubfx") << '\t'
+      << getRegisterName(Op0.getReg()) << ", " << getRegisterName(Op1.getReg())
+      << ", #" << Op2.getImm() << ", #" << Op3.getImm() - Op2.getImm() + 1;
+    printAnnotation(O, Annot);
+    return;
+  }
 
-void
-AArch64InstPrinter::printBareImmOperand(const MCInst *MI, unsigned OpNum,
-                                        raw_ostream &O) {
-  const MCOperand &MO = MI->getOperand(OpNum);
-  O << MO.getImm();
-}
+  if (Opcode == AArch64::BFMXri || Opcode == AArch64::BFMWri) {
+    const MCOperand &Op0 = MI->getOperand(0); // Op1 == Op0
+    const MCOperand &Op2 = MI->getOperand(2);
+    int ImmR = MI->getOperand(3).getImm();
+    int ImmS = MI->getOperand(4).getImm();
+
+    // BFI alias
+    if (ImmS < ImmR) {
+      int BitWidth = Opcode == AArch64::BFMXri ? 64 : 32;
+      int LSB = (BitWidth - ImmR) % BitWidth;
+      int Width = ImmS + 1;
+      O << "\tbfi\t" << getRegisterName(Op0.getReg()) << ", "
+        << getRegisterName(Op2.getReg()) << ", #" << LSB << ", #" << Width;
+      printAnnotation(O, Annot);
+      return;
+    }
 
-template<unsigned RegWidth> void
-AArch64InstPrinter::printBFILSBOperand(const MCInst *MI, unsigned OpNum,
-                                       raw_ostream &O) {
-  const MCOperand &ImmROp = MI->getOperand(OpNum);
-  unsigned LSB = ImmROp.getImm() == 0 ? 0 : RegWidth - ImmROp.getImm();
+    int LSB = ImmR;
+    int Width = ImmS - ImmR + 1;
+    // Otherwise BFXIL the preferred form
+    O << "\tbfxil\t"
+      << getRegisterName(Op0.getReg()) << ", " << getRegisterName(Op2.getReg())
+      << ", #" << LSB << ", #" << Width;
+    printAnnotation(O, Annot);
+    return;
+  }
 
-  O << '#' << LSB;
-}
+  // Symbolic operands for MOVZ, MOVN and MOVK already imply a shift
+  // (e.g. :gottprel_g1: is always going to be "lsl #16") so it should not be
+  // printed.
+  if ((Opcode == AArch64::MOVZXi || Opcode == AArch64::MOVZWi ||
+       Opcode == AArch64::MOVNXi || Opcode == AArch64::MOVNWi) &&
+      MI->getOperand(1).isExpr()) {
+    if (Opcode == AArch64::MOVZXi || Opcode == AArch64::MOVZWi)
+      O << "\tmovz\t";
+    else
+      O << "\tmovn\t";
 
-void AArch64InstPrinter::printBFIWidthOperand(const MCInst *MI, unsigned OpNum,
-                                              raw_ostream &O) {
-  const MCOperand &ImmSOp = MI->getOperand(OpNum);
-  unsigned Width = ImmSOp.getImm() + 1;
+    O << getRegisterName(MI->getOperand(0).getReg()) << ", #"
+      << *MI->getOperand(1).getExpr();
+    return;
+  }
 
-  O << '#' << Width;
-}
+  if ((Opcode == AArch64::MOVKXi || Opcode == AArch64::MOVKWi) &&
+      MI->getOperand(2).isExpr()) {
+    O << "\tmovk\t" << getRegisterName(MI->getOperand(0).getReg()) << ", #"
+      << *MI->getOperand(2).getExpr();
+    return;
+  }
 
-void
-AArch64InstPrinter::printBFXWidthOperand(const MCInst *MI, unsigned OpNum,
-                                         raw_ostream &O) {
-  const MCOperand &ImmSOp = MI->getOperand(OpNum);
-  const MCOperand &ImmROp = MI->getOperand(OpNum - 1);
+  if (!printAliasInstr(MI, O))
+    printInstruction(MI, O);
 
-  unsigned ImmR = ImmROp.getImm();
-  unsigned ImmS = ImmSOp.getImm();
+  printAnnotation(O, Annot);
+}
 
-  assert(ImmS >= ImmR && "Invalid ImmR, ImmS combination for bitfield extract");
+static bool isTblTbxInstruction(unsigned Opcode, StringRef &Layout,
+                                bool &IsTbx) {
+  switch (Opcode) {
+  case AArch64::TBXv8i8One:
+  case AArch64::TBXv8i8Two:
+  case AArch64::TBXv8i8Three:
+  case AArch64::TBXv8i8Four:
+    IsTbx = true;
+    Layout = ".8b";
+    return true;
+  case AArch64::TBLv8i8One:
+  case AArch64::TBLv8i8Two:
+  case AArch64::TBLv8i8Three:
+  case AArch64::TBLv8i8Four:
+    IsTbx = false;
+    Layout = ".8b";
+    return true;
+  case AArch64::TBXv16i8One:
+  case AArch64::TBXv16i8Two:
+  case AArch64::TBXv16i8Three:
+  case AArch64::TBXv16i8Four:
+    IsTbx = true;
+    Layout = ".16b";
+    return true;
+  case AArch64::TBLv16i8One:
+  case AArch64::TBLv16i8Two:
+  case AArch64::TBLv16i8Three:
+  case AArch64::TBLv16i8Four:
+    IsTbx = false;
+    Layout = ".16b";
+    return true;
+  default:
+    return false;
+  }
+}
 
-  O << '#' << (ImmS - ImmR + 1);
+struct LdStNInstrDesc {
+  unsigned Opcode;
+  const char *Mnemonic;
+  const char *Layout;
+  int ListOperand;
+  bool HasLane;
+  int NaturalOffset;
+};
+
+static LdStNInstrDesc LdStNInstInfo[] = {
+  { AArch64::LD1i8,             "ld1",  ".b",     1, true,  0  },
+  { AArch64::LD1i16,            "ld1",  ".h",     1, true,  0  },
+  { AArch64::LD1i32,            "ld1",  ".s",     1, true,  0  },
+  { AArch64::LD1i64,            "ld1",  ".d",     1, true,  0  },
+  { AArch64::LD1i8_POST,        "ld1",  ".b",     2, true,  1  },
+  { AArch64::LD1i16_POST,       "ld1",  ".h",     2, true,  2  },
+  { AArch64::LD1i32_POST,       "ld1",  ".s",     2, true,  4  },
+  { AArch64::LD1i64_POST,       "ld1",  ".d",     2, true,  8  },
+  { AArch64::LD1Rv16b,          "ld1r", ".16b",   0, false, 0  },
+  { AArch64::LD1Rv8h,           "ld1r", ".8h",    0, false, 0  },
+  { AArch64::LD1Rv4s,           "ld1r", ".4s",    0, false, 0  },
+  { AArch64::LD1Rv2d,           "ld1r", ".2d",    0, false, 0  },
+  { AArch64::LD1Rv8b,           "ld1r", ".8b",    0, false, 0  },
+  { AArch64::LD1Rv4h,           "ld1r", ".4h",    0, false, 0  },
+  { AArch64::LD1Rv2s,           "ld1r", ".2s",    0, false, 0  },
+  { AArch64::LD1Rv1d,           "ld1r", ".1d",    0, false, 0  },
+  { AArch64::LD1Rv16b_POST,     "ld1r", ".16b",   1, false, 1  },
+  { AArch64::LD1Rv8h_POST,      "ld1r", ".8h",    1, false, 2  },
+  { AArch64::LD1Rv4s_POST,      "ld1r", ".4s",    1, false, 4  },
+  { AArch64::LD1Rv2d_POST,      "ld1r", ".2d",    1, false, 8  },
+  { AArch64::LD1Rv8b_POST,      "ld1r", ".8b",    1, false, 1  },
+  { AArch64::LD1Rv4h_POST,      "ld1r", ".4h",    1, false, 2  },
+  { AArch64::LD1Rv2s_POST,      "ld1r", ".2s",    1, false, 4  },
+  { AArch64::LD1Rv1d_POST,      "ld1r", ".1d",    1, false, 8  },
+  { AArch64::LD1Onev16b,        "ld1",  ".16b",   0, false, 0  },
+  { AArch64::LD1Onev8h,         "ld1",  ".8h",    0, false, 0  },
+  { AArch64::LD1Onev4s,         "ld1",  ".4s",    0, false, 0  },
+  { AArch64::LD1Onev2d,         "ld1",  ".2d",    0, false, 0  },
+  { AArch64::LD1Onev8b,         "ld1",  ".8b",    0, false, 0  },
+  { AArch64::LD1Onev4h,         "ld1",  ".4h",    0, false, 0  },
+  { AArch64::LD1Onev2s,         "ld1",  ".2s",    0, false, 0  },
+  { AArch64::LD1Onev1d,         "ld1",  ".1d",    0, false, 0  },
+  { AArch64::LD1Onev16b_POST,   "ld1",  ".16b",   1, false, 16 },
+  { AArch64::LD1Onev8h_POST,    "ld1",  ".8h",    1, false, 16 },
+  { AArch64::LD1Onev4s_POST,    "ld1",  ".4s",    1, false, 16 },
+  { AArch64::LD1Onev2d_POST,    "ld1",  ".2d",    1, false, 16 },
+  { AArch64::LD1Onev8b_POST,    "ld1",  ".8b",    1, false, 8  },
+  { AArch64::LD1Onev4h_POST,    "ld1",  ".4h",    1, false, 8  },
+  { AArch64::LD1Onev2s_POST,    "ld1",  ".2s",    1, false, 8  },
+  { AArch64::LD1Onev1d_POST,    "ld1",  ".1d",    1, false, 8  },
+  { AArch64::LD1Twov16b,        "ld1",  ".16b",   0, false, 0  },
+  { AArch64::LD1Twov8h,         "ld1",  ".8h",    0, false, 0  },
+  { AArch64::LD1Twov4s,         "ld1",  ".4s",    0, false, 0  },
+  { AArch64::LD1Twov2d,         "ld1",  ".2d",    0, false, 0  },
+  { AArch64::LD1Twov8b,         "ld1",  ".8b",    0, false, 0  },
+  { AArch64::LD1Twov4h,         "ld1",  ".4h",    0, false, 0  },
+  { AArch64::LD1Twov2s,         "ld1",  ".2s",    0, false, 0  },
+  { AArch64::LD1Twov1d,         "ld1",  ".1d",    0, false, 0  },
+  { AArch64::LD1Twov16b_POST,   "ld1",  ".16b",   1, false, 32 },
+  { AArch64::LD1Twov8h_POST,    "ld1",  ".8h",    1, false, 32 },
+  { AArch64::LD1Twov4s_POST,    "ld1",  ".4s",    1, false, 32 },
+  { AArch64::LD1Twov2d_POST,    "ld1",  ".2d",    1, false, 32 },
+  { AArch64::LD1Twov8b_POST,    "ld1",  ".8b",    1, false, 16 },
+  { AArch64::LD1Twov4h_POST,    "ld1",  ".4h",    1, false, 16 },
+  { AArch64::LD1Twov2s_POST,    "ld1",  ".2s",    1, false, 16 },
+  { AArch64::LD1Twov1d_POST,    "ld1",  ".1d",    1, false, 16 },
+  { AArch64::LD1Threev16b,      "ld1",  ".16b",   0, false, 0  },
+  { AArch64::LD1Threev8h,       "ld1",  ".8h",    0, false, 0  },
+  { AArch64::LD1Threev4s,       "ld1",  ".4s",    0, false, 0  },
+  { AArch64::LD1Threev2d,       "ld1",  ".2d",    0, false, 0  },
+  { AArch64::LD1Threev8b,       "ld1",  ".8b",    0, false, 0  },
+  { AArch64::LD1Threev4h,       "ld1",  ".4h",    0, false, 0  },
+  { AArch64::LD1Threev2s,       "ld1",  ".2s",    0, false, 0  },
+  { AArch64::LD1Threev1d,       "ld1",  ".1d",    0, false, 0  },
+  { AArch64::LD1Threev16b_POST, "ld1",  ".16b",   1, false, 48 },
+  { AArch64::LD1Threev8h_POST,  "ld1",  ".8h",    1, false, 48 },
+  { AArch64::LD1Threev4s_POST,  "ld1",  ".4s",    1, false, 48 },
+  { AArch64::LD1Threev2d_POST,  "ld1",  ".2d",    1, false, 48 },
+  { AArch64::LD1Threev8b_POST,  "ld1",  ".8b",    1, false, 24 },
+  { AArch64::LD1Threev4h_POST,  "ld1",  ".4h",    1, false, 24 },
+  { AArch64::LD1Threev2s_POST,  "ld1",  ".2s",    1, false, 24 },
+  { AArch64::LD1Threev1d_POST,  "ld1",  ".1d",    1, false, 24 },
+  { AArch64::LD1Fourv16b,       "ld1",  ".16b",   0, false, 0  },
+  { AArch64::LD1Fourv8h,        "ld1",  ".8h",    0, false, 0  },
+  { AArch64::LD1Fourv4s,        "ld1",  ".4s",    0, false, 0  },
+  { AArch64::LD1Fourv2d,        "ld1",  ".2d",    0, false, 0  },
+  { AArch64::LD1Fourv8b,        "ld1",  ".8b",    0, false, 0  },
+  { AArch64::LD1Fourv4h,        "ld1",  ".4h",    0, false, 0  },
+  { AArch64::LD1Fourv2s,        "ld1",  ".2s",    0, false, 0  },
+  { AArch64::LD1Fourv1d,        "ld1",  ".1d",    0, false, 0  },
+  { AArch64::LD1Fourv16b_POST,  "ld1",  ".16b",   1, false, 64 },
+  { AArch64::LD1Fourv8h_POST,   "ld1",  ".8h",    1, false, 64 },
+  { AArch64::LD1Fourv4s_POST,   "ld1",  ".4s",    1, false, 64 },
+  { AArch64::LD1Fourv2d_POST,   "ld1",  ".2d",    1, false, 64 },
+  { AArch64::LD1Fourv8b_POST,   "ld1",  ".8b",    1, false, 32 },
+  { AArch64::LD1Fourv4h_POST,   "ld1",  ".4h",    1, false, 32 },
+  { AArch64::LD1Fourv2s_POST,   "ld1",  ".2s",    1, false, 32 },
+  { AArch64::LD1Fourv1d_POST,   "ld1",  ".1d",    1, false, 32 },
+  { AArch64::LD2i8,             "ld2",  ".b",     1, true,  0  },
+  { AArch64::LD2i16,            "ld2",  ".h",     1, true,  0  },
+  { AArch64::LD2i32,            "ld2",  ".s",     1, true,  0  },
+  { AArch64::LD2i64,            "ld2",  ".d",     1, true,  0  },
+  { AArch64::LD2i8_POST,        "ld2",  ".b",     2, true,  2  },
+  { AArch64::LD2i16_POST,       "ld2",  ".h",     2, true,  4  },
+  { AArch64::LD2i32_POST,       "ld2",  ".s",     2, true,  8  },
+  { AArch64::LD2i64_POST,       "ld2",  ".d",     2, true,  16  },
+  { AArch64::LD2Rv16b,          "ld2r", ".16b",   0, false, 0  },
+  { AArch64::LD2Rv8h,           "ld2r", ".8h",    0, false, 0  },
+  { AArch64::LD2Rv4s,           "ld2r", ".4s",    0, false, 0  },
+  { AArch64::LD2Rv2d,           "ld2r", ".2d",    0, false, 0  },
+  { AArch64::LD2Rv8b,           "ld2r", ".8b",    0, false, 0  },
+  { AArch64::LD2Rv4h,           "ld2r", ".4h",    0, false, 0  },
+  { AArch64::LD2Rv2s,           "ld2r", ".2s",    0, false, 0  },
+  { AArch64::LD2Rv1d,           "ld2r", ".1d",    0, false, 0  },
+  { AArch64::LD2Rv16b_POST,     "ld2r", ".16b",   1, false, 2  },
+  { AArch64::LD2Rv8h_POST,      "ld2r", ".8h",    1, false, 4  },
+  { AArch64::LD2Rv4s_POST,      "ld2r", ".4s",    1, false, 8  },
+  { AArch64::LD2Rv2d_POST,      "ld2r", ".2d",    1, false, 16 },
+  { AArch64::LD2Rv8b_POST,      "ld2r", ".8b",    1, false, 2  },
+  { AArch64::LD2Rv4h_POST,      "ld2r", ".4h",    1, false, 4  },
+  { AArch64::LD2Rv2s_POST,      "ld2r", ".2s",    1, false, 8  },
+  { AArch64::LD2Rv1d_POST,      "ld2r", ".1d",    1, false, 16 },
+  { AArch64::LD2Twov16b,        "ld2",  ".16b",   0, false, 0  },
+  { AArch64::LD2Twov8h,         "ld2",  ".8h",    0, false, 0  },
+  { AArch64::LD2Twov4s,         "ld2",  ".4s",    0, false, 0  },
+  { AArch64::LD2Twov2d,         "ld2",  ".2d",    0, false, 0  },
+  { AArch64::LD2Twov8b,         "ld2",  ".8b",    0, false, 0  },
+  { AArch64::LD2Twov4h,         "ld2",  ".4h",    0, false, 0  },
+  { AArch64::LD2Twov2s,         "ld2",  ".2s",    0, false, 0  },
+  { AArch64::LD2Twov16b_POST,   "ld2",  ".16b",   1, false, 32 },
+  { AArch64::LD2Twov8h_POST,    "ld2",  ".8h",    1, false, 32 },
+  { AArch64::LD2Twov4s_POST,    "ld2",  ".4s",    1, false, 32 },
+  { AArch64::LD2Twov2d_POST,    "ld2",  ".2d",    1, false, 32 },
+  { AArch64::LD2Twov8b_POST,    "ld2",  ".8b",    1, false, 16 },
+  { AArch64::LD2Twov4h_POST,    "ld2",  ".4h",    1, false, 16 },
+  { AArch64::LD2Twov2s_POST,    "ld2",  ".2s",    1, false, 16 },
+  { AArch64::LD3i8,             "ld3",  ".b",     1, true,  0  },
+  { AArch64::LD3i16,            "ld3",  ".h",     1, true,  0  },
+  { AArch64::LD3i32,            "ld3",  ".s",     1, true,  0  },
+  { AArch64::LD3i64,            "ld3",  ".d",     1, true,  0  },
+  { AArch64::LD3i8_POST,        "ld3",  ".b",     2, true,  3  },
+  { AArch64::LD3i16_POST,       "ld3",  ".h",     2, true,  6  },
+  { AArch64::LD3i32_POST,       "ld3",  ".s",     2, true,  12  },
+  { AArch64::LD3i64_POST,       "ld3",  ".d",     2, true,  24  },
+  { AArch64::LD3Rv16b,          "ld3r", ".16b",   0, false, 0  },
+  { AArch64::LD3Rv8h,           "ld3r", ".8h",    0, false, 0  },
+  { AArch64::LD3Rv4s,           "ld3r", ".4s",    0, false, 0  },
+  { AArch64::LD3Rv2d,           "ld3r", ".2d",    0, false, 0  },
+  { AArch64::LD3Rv8b,           "ld3r", ".8b",    0, false, 0  },
+  { AArch64::LD3Rv4h,           "ld3r", ".4h",    0, false, 0  },
+  { AArch64::LD3Rv2s,           "ld3r", ".2s",    0, false, 0  },
+  { AArch64::LD3Rv1d,           "ld3r", ".1d",    0, false, 0  },
+  { AArch64::LD3Rv16b_POST,     "ld3r", ".16b",   1, false, 3  },
+  { AArch64::LD3Rv8h_POST,      "ld3r", ".8h",    1, false, 6  },
+  { AArch64::LD3Rv4s_POST,      "ld3r", ".4s",    1, false, 12 },
+  { AArch64::LD3Rv2d_POST,      "ld3r", ".2d",    1, false, 24 },
+  { AArch64::LD3Rv8b_POST,      "ld3r", ".8b",    1, false, 3  },
+  { AArch64::LD3Rv4h_POST,      "ld3r", ".4h",    1, false, 6  },
+  { AArch64::LD3Rv2s_POST,      "ld3r", ".2s",    1, false, 12 },
+  { AArch64::LD3Rv1d_POST,      "ld3r", ".1d",    1, false, 24 },
+  { AArch64::LD3Threev16b,      "ld3",  ".16b",   0, false, 0  },
+  { AArch64::LD3Threev8h,       "ld3",  ".8h",    0, false, 0  },
+  { AArch64::LD3Threev4s,       "ld3",  ".4s",    0, false, 0  },
+  { AArch64::LD3Threev2d,       "ld3",  ".2d",    0, false, 0  },
+  { AArch64::LD3Threev8b,       "ld3",  ".8b",    0, false, 0  },
+  { AArch64::LD3Threev4h,       "ld3",  ".4h",    0, false, 0  },
+  { AArch64::LD3Threev2s,       "ld3",  ".2s",    0, false, 0  },
+  { AArch64::LD3Threev16b_POST, "ld3",  ".16b",   1, false, 48 },
+  { AArch64::LD3Threev8h_POST,  "ld3",  ".8h",    1, false, 48 },
+  { AArch64::LD3Threev4s_POST,  "ld3",  ".4s",    1, false, 48 },
+  { AArch64::LD3Threev2d_POST,  "ld3",  ".2d",    1, false, 48 },
+  { AArch64::LD3Threev8b_POST,  "ld3",  ".8b",    1, false, 24 },
+  { AArch64::LD3Threev4h_POST,  "ld3",  ".4h",    1, false, 24 },
+  { AArch64::LD3Threev2s_POST,  "ld3",  ".2s",    1, false, 24 },
+  { AArch64::LD4i8,             "ld4",  ".b",     1, true,  0  },
+  { AArch64::LD4i16,            "ld4",  ".h",     1, true,  0  },
+  { AArch64::LD4i32,            "ld4",  ".s",     1, true,  0  },
+  { AArch64::LD4i64,            "ld4",  ".d",     1, true,  0  },
+  { AArch64::LD4i8_POST,        "ld4",  ".b",     2, true,  4  },
+  { AArch64::LD4i16_POST,       "ld4",  ".h",     2, true,  8  },
+  { AArch64::LD4i32_POST,       "ld4",  ".s",     2, true,  16 },
+  { AArch64::LD4i64_POST,       "ld4",  ".d",     2, true,  32 },
+  { AArch64::LD4Rv16b,          "ld4r", ".16b",   0, false, 0  },
+  { AArch64::LD4Rv8h,           "ld4r", ".8h",    0, false, 0  },
+  { AArch64::LD4Rv4s,           "ld4r", ".4s",    0, false, 0  },
+  { AArch64::LD4Rv2d,           "ld4r", ".2d",    0, false, 0  },
+  { AArch64::LD4Rv8b,           "ld4r", ".8b",    0, false, 0  },
+  { AArch64::LD4Rv4h,           "ld4r", ".4h",    0, false, 0  },
+  { AArch64::LD4Rv2s,           "ld4r", ".2s",    0, false, 0  },
+  { AArch64::LD4Rv1d,           "ld4r", ".1d",    0, false, 0  },
+  { AArch64::LD4Rv16b_POST,     "ld4r", ".16b",   1, false, 4  },
+  { AArch64::LD4Rv8h_POST,      "ld4r", ".8h",    1, false, 8  },
+  { AArch64::LD4Rv4s_POST,      "ld4r", ".4s",    1, false, 16 },
+  { AArch64::LD4Rv2d_POST,      "ld4r", ".2d",    1, false, 32 },
+  { AArch64::LD4Rv8b_POST,      "ld4r", ".8b",    1, false, 4  },
+  { AArch64::LD4Rv4h_POST,      "ld4r", ".4h",    1, false, 8  },
+  { AArch64::LD4Rv2s_POST,      "ld4r", ".2s",    1, false, 16 },
+  { AArch64::LD4Rv1d_POST,      "ld4r", ".1d",    1, false, 32 },
+  { AArch64::LD4Fourv16b,       "ld4",  ".16b",   0, false, 0  },
+  { AArch64::LD4Fourv8h,        "ld4",  ".8h",    0, false, 0  },
+  { AArch64::LD4Fourv4s,        "ld4",  ".4s",    0, false, 0  },
+  { AArch64::LD4Fourv2d,        "ld4",  ".2d",    0, false, 0  },
+  { AArch64::LD4Fourv8b,        "ld4",  ".8b",    0, false, 0  },
+  { AArch64::LD4Fourv4h,        "ld4",  ".4h",    0, false, 0  },
+  { AArch64::LD4Fourv2s,        "ld4",  ".2s",    0, false, 0  },
+  { AArch64::LD4Fourv16b_POST,  "ld4",  ".16b",   1, false, 64 },
+  { AArch64::LD4Fourv8h_POST,   "ld4",  ".8h",    1, false, 64 },
+  { AArch64::LD4Fourv4s_POST,   "ld4",  ".4s",    1, false, 64 },
+  { AArch64::LD4Fourv2d_POST,   "ld4",  ".2d",    1, false, 64 },
+  { AArch64::LD4Fourv8b_POST,   "ld4",  ".8b",    1, false, 32 },
+  { AArch64::LD4Fourv4h_POST,   "ld4",  ".4h",    1, false, 32 },
+  { AArch64::LD4Fourv2s_POST,   "ld4",  ".2s",    1, false, 32 },
+  { AArch64::ST1i8,             "st1",  ".b",     0, true,  0  },
+  { AArch64::ST1i16,            "st1",  ".h",     0, true,  0  },
+  { AArch64::ST1i32,            "st1",  ".s",     0, true,  0  },
+  { AArch64::ST1i64,            "st1",  ".d",     0, true,  0  },
+  { AArch64::ST1i8_POST,        "st1",  ".b",     1, true,  1  },
+  { AArch64::ST1i16_POST,       "st1",  ".h",     1, true,  2  },
+  { AArch64::ST1i32_POST,       "st1",  ".s",     1, true,  4  },
+  { AArch64::ST1i64_POST,       "st1",  ".d",     1, true,  8  },
+  { AArch64::ST1Onev16b,        "st1",  ".16b",   0, false, 0  },
+  { AArch64::ST1Onev8h,         "st1",  ".8h",    0, false, 0  },
+  { AArch64::ST1Onev4s,         "st1",  ".4s",    0, false, 0  },
+  { AArch64::ST1Onev2d,         "st1",  ".2d",    0, false, 0  },
+  { AArch64::ST1Onev8b,         "st1",  ".8b",    0, false, 0  },
+  { AArch64::ST1Onev4h,         "st1",  ".4h",    0, false, 0  },
+  { AArch64::ST1Onev2s,         "st1",  ".2s",    0, false, 0  },
+  { AArch64::ST1Onev1d,         "st1",  ".1d",    0, false, 0  },
+  { AArch64::ST1Onev16b_POST,   "st1",  ".16b",   1, false, 16 },
+  { AArch64::ST1Onev8h_POST,    "st1",  ".8h",    1, false, 16 },
+  { AArch64::ST1Onev4s_POST,    "st1",  ".4s",    1, false, 16 },
+  { AArch64::ST1Onev2d_POST,    "st1",  ".2d",    1, false, 16 },
+  { AArch64::ST1Onev8b_POST,    "st1",  ".8b",    1, false, 8  },
+  { AArch64::ST1Onev4h_POST,    "st1",  ".4h",    1, false, 8  },
+  { AArch64::ST1Onev2s_POST,    "st1",  ".2s",    1, false, 8  },
+  { AArch64::ST1Onev1d_POST,    "st1",  ".1d",    1, false, 8  },
+  { AArch64::ST1Twov16b,        "st1",  ".16b",   0, false, 0  },
+  { AArch64::ST1Twov8h,         "st1",  ".8h",    0, false, 0  },
+  { AArch64::ST1Twov4s,         "st1",  ".4s",    0, false, 0  },
+  { AArch64::ST1Twov2d,         "st1",  ".2d",    0, false, 0  },
+  { AArch64::ST1Twov8b,         "st1",  ".8b",    0, false, 0  },
+  { AArch64::ST1Twov4h,         "st1",  ".4h",    0, false, 0  },
+  { AArch64::ST1Twov2s,         "st1",  ".2s",    0, false, 0  },
+  { AArch64::ST1Twov1d,         "st1",  ".1d",    0, false, 0  },
+  { AArch64::ST1Twov16b_POST,   "st1",  ".16b",   1, false, 32 },
+  { AArch64::ST1Twov8h_POST,    "st1",  ".8h",    1, false, 32 },
+  { AArch64::ST1Twov4s_POST,    "st1",  ".4s",    1, false, 32 },
+  { AArch64::ST1Twov2d_POST,    "st1",  ".2d",    1, false, 32 },
+  { AArch64::ST1Twov8b_POST,    "st1",  ".8b",    1, false, 16 },
+  { AArch64::ST1Twov4h_POST,    "st1",  ".4h",    1, false, 16 },
+  { AArch64::ST1Twov2s_POST,    "st1",  ".2s",    1, false, 16 },
+  { AArch64::ST1Twov1d_POST,    "st1",  ".1d",    1, false, 16 },
+  { AArch64::ST1Threev16b,      "st1",  ".16b",   0, false, 0  },
+  { AArch64::ST1Threev8h,       "st1",  ".8h",    0, false, 0  },
+  { AArch64::ST1Threev4s,       "st1",  ".4s",    0, false, 0  },
+  { AArch64::ST1Threev2d,       "st1",  ".2d",    0, false, 0  },
+  { AArch64::ST1Threev8b,       "st1",  ".8b",    0, false, 0  },
+  { AArch64::ST1Threev4h,       "st1",  ".4h",    0, false, 0  },
+  { AArch64::ST1Threev2s,       "st1",  ".2s",    0, false, 0  },
+  { AArch64::ST1Threev1d,       "st1",  ".1d",    0, false, 0  },
+  { AArch64::ST1Threev16b_POST, "st1",  ".16b",   1, false, 48 },
+  { AArch64::ST1Threev8h_POST,  "st1",  ".8h",    1, false, 48 },
+  { AArch64::ST1Threev4s_POST,  "st1",  ".4s",    1, false, 48 },
+  { AArch64::ST1Threev2d_POST,  "st1",  ".2d",    1, false, 48 },
+  { AArch64::ST1Threev8b_POST,  "st1",  ".8b",    1, false, 24 },
+  { AArch64::ST1Threev4h_POST,  "st1",  ".4h",    1, false, 24 },
+  { AArch64::ST1Threev2s_POST,  "st1",  ".2s",    1, false, 24 },
+  { AArch64::ST1Threev1d_POST,  "st1",  ".1d",    1, false, 24 },
+  { AArch64::ST1Fourv16b,       "st1",  ".16b",   0, false, 0  },
+  { AArch64::ST1Fourv8h,        "st1",  ".8h",    0, false, 0  },
+  { AArch64::ST1Fourv4s,        "st1",  ".4s",    0, false, 0  },
+  { AArch64::ST1Fourv2d,        "st1",  ".2d",    0, false, 0  },
+  { AArch64::ST1Fourv8b,        "st1",  ".8b",    0, false, 0  },
+  { AArch64::ST1Fourv4h,        "st1",  ".4h",    0, false, 0  },
+  { AArch64::ST1Fourv2s,        "st1",  ".2s",    0, false, 0  },
+  { AArch64::ST1Fourv1d,        "st1",  ".1d",    0, false, 0  },
+  { AArch64::ST1Fourv16b_POST,  "st1",  ".16b",   1, false, 64 },
+  { AArch64::ST1Fourv8h_POST,   "st1",  ".8h",    1, false, 64 },
+  { AArch64::ST1Fourv4s_POST,   "st1",  ".4s",    1, false, 64 },
+  { AArch64::ST1Fourv2d_POST,   "st1",  ".2d",    1, false, 64 },
+  { AArch64::ST1Fourv8b_POST,   "st1",  ".8b",    1, false, 32 },
+  { AArch64::ST1Fourv4h_POST,   "st1",  ".4h",    1, false, 32 },
+  { AArch64::ST1Fourv2s_POST,   "st1",  ".2s",    1, false, 32 },
+  { AArch64::ST1Fourv1d_POST,   "st1",  ".1d",    1, false, 32 },
+  { AArch64::ST2i8,             "st2",  ".b",     0, true,  0  },
+  { AArch64::ST2i16,            "st2",  ".h",     0, true,  0  },
+  { AArch64::ST2i32,            "st2",  ".s",     0, true,  0  },
+  { AArch64::ST2i64,            "st2",  ".d",     0, true,  0  },
+  { AArch64::ST2i8_POST,        "st2",  ".b",     1, true,  2  },
+  { AArch64::ST2i16_POST,       "st2",  ".h",     1, true,  4  },
+  { AArch64::ST2i32_POST,       "st2",  ".s",     1, true,  8  },
+  { AArch64::ST2i64_POST,       "st2",  ".d",     1, true,  16 },
+  { AArch64::ST2Twov16b,        "st2",  ".16b",   0, false, 0  },
+  { AArch64::ST2Twov8h,         "st2",  ".8h",    0, false, 0  },
+  { AArch64::ST2Twov4s,         "st2",  ".4s",    0, false, 0  },
+  { AArch64::ST2Twov2d,         "st2",  ".2d",    0, false, 0  },
+  { AArch64::ST2Twov8b,         "st2",  ".8b",    0, false, 0  },
+  { AArch64::ST2Twov4h,         "st2",  ".4h",    0, false, 0  },
+  { AArch64::ST2Twov2s,         "st2",  ".2s",    0, false, 0  },
+  { AArch64::ST2Twov16b_POST,   "st2",  ".16b",   1, false, 32 },
+  { AArch64::ST2Twov8h_POST,    "st2",  ".8h",    1, false, 32 },
+  { AArch64::ST2Twov4s_POST,    "st2",  ".4s",    1, false, 32 },
+  { AArch64::ST2Twov2d_POST,    "st2",  ".2d",    1, false, 32 },
+  { AArch64::ST2Twov8b_POST,    "st2",  ".8b",    1, false, 16 },
+  { AArch64::ST2Twov4h_POST,    "st2",  ".4h",    1, false, 16 },
+  { AArch64::ST2Twov2s_POST,    "st2",  ".2s",    1, false, 16 },
+  { AArch64::ST3i8,             "st3",  ".b",     0, true,  0  },
+  { AArch64::ST3i16,            "st3",  ".h",     0, true,  0  },
+  { AArch64::ST3i32,            "st3",  ".s",     0, true,  0  },
+  { AArch64::ST3i64,            "st3",  ".d",     0, true,  0  },
+  { AArch64::ST3i8_POST,        "st3",  ".b",     1, true,  3  },
+  { AArch64::ST3i16_POST,       "st3",  ".h",     1, true,  6  },
+  { AArch64::ST3i32_POST,       "st3",  ".s",     1, true,  12 },
+  { AArch64::ST3i64_POST,       "st3",  ".d",     1, true,  24 },
+  { AArch64::ST3Threev16b,      "st3",  ".16b",   0, false, 0  },
+  { AArch64::ST3Threev8h,       "st3",  ".8h",    0, false, 0  },
+  { AArch64::ST3Threev4s,       "st3",  ".4s",    0, false, 0  },
+  { AArch64::ST3Threev2d,       "st3",  ".2d",    0, false, 0  },
+  { AArch64::ST3Threev8b,       "st3",  ".8b",    0, false, 0  },
+  { AArch64::ST3Threev4h,       "st3",  ".4h",    0, false, 0  },
+  { AArch64::ST3Threev2s,       "st3",  ".2s",    0, false, 0  },
+  { AArch64::ST3Threev16b_POST, "st3",  ".16b",   1, false, 48 },
+  { AArch64::ST3Threev8h_POST,  "st3",  ".8h",    1, false, 48 },
+  { AArch64::ST3Threev4s_POST,  "st3",  ".4s",    1, false, 48 },
+  { AArch64::ST3Threev2d_POST,  "st3",  ".2d",    1, false, 48 },
+  { AArch64::ST3Threev8b_POST,  "st3",  ".8b",    1, false, 24 },
+  { AArch64::ST3Threev4h_POST,  "st3",  ".4h",    1, false, 24 },
+  { AArch64::ST3Threev2s_POST,  "st3",  ".2s",    1, false, 24 },
+  { AArch64::ST4i8,             "st4",  ".b",     0, true,  0  },
+  { AArch64::ST4i16,            "st4",  ".h",     0, true,  0  },
+  { AArch64::ST4i32,            "st4",  ".s",     0, true,  0  },
+  { AArch64::ST4i64,            "st4",  ".d",     0, true,  0  },
+  { AArch64::ST4i8_POST,        "st4",  ".b",     1, true,  4  },
+  { AArch64::ST4i16_POST,       "st4",  ".h",     1, true,  8  },
+  { AArch64::ST4i32_POST,       "st4",  ".s",     1, true,  16 },
+  { AArch64::ST4i64_POST,       "st4",  ".d",     1, true,  32 },
+  { AArch64::ST4Fourv16b,       "st4",  ".16b",   0, false, 0  },
+  { AArch64::ST4Fourv8h,        "st4",  ".8h",    0, false, 0  },
+  { AArch64::ST4Fourv4s,        "st4",  ".4s",    0, false, 0  },
+  { AArch64::ST4Fourv2d,        "st4",  ".2d",    0, false, 0  },
+  { AArch64::ST4Fourv8b,        "st4",  ".8b",    0, false, 0  },
+  { AArch64::ST4Fourv4h,        "st4",  ".4h",    0, false, 0  },
+  { AArch64::ST4Fourv2s,        "st4",  ".2s",    0, false, 0  },
+  { AArch64::ST4Fourv16b_POST,  "st4",  ".16b",   1, false, 64 },
+  { AArch64::ST4Fourv8h_POST,   "st4",  ".8h",    1, false, 64 },
+  { AArch64::ST4Fourv4s_POST,   "st4",  ".4s",    1, false, 64 },
+  { AArch64::ST4Fourv2d_POST,   "st4",  ".2d",    1, false, 64 },
+  { AArch64::ST4Fourv8b_POST,   "st4",  ".8b",    1, false, 32 },
+  { AArch64::ST4Fourv4h_POST,   "st4",  ".4h",    1, false, 32 },
+  { AArch64::ST4Fourv2s_POST,   "st4",  ".2s",    1, false, 32 },
+};
+
+static LdStNInstrDesc *getLdStNInstrDesc(unsigned Opcode) {
+  unsigned Idx;
+  for (Idx = 0; Idx != array_lengthof(LdStNInstInfo); ++Idx)
+    if (LdStNInstInfo[Idx].Opcode == Opcode)
+      return &LdStNInstInfo[Idx];
+
+  return nullptr;
 }
 
-void
-AArch64InstPrinter::printCRxOperand(const MCInst *MI, unsigned OpNum,
-                                    raw_ostream &O) {
-    const MCOperand &CRx = MI->getOperand(OpNum);
+void AArch64AppleInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
+                                        StringRef Annot) {
+  unsigned Opcode = MI->getOpcode();
+  StringRef Layout, Mnemonic;
 
-    O << 'c' << CRx.getImm();
-}
+  bool IsTbx;
+  if (isTblTbxInstruction(MI->getOpcode(), Layout, IsTbx)) {
+    O << "\t" << (IsTbx ? "tbx" : "tbl") << Layout << '\t'
+      << getRegisterName(MI->getOperand(0).getReg(), AArch64::vreg) << ", ";
 
+    unsigned ListOpNum = IsTbx ? 2 : 1;
+    printVectorList(MI, ListOpNum, O, "");
 
-void
-AArch64InstPrinter::printCVTFixedPosOperand(const MCInst *MI, unsigned OpNum,
-                                            raw_ostream &O) {
-    const MCOperand &ScaleOp = MI->getOperand(OpNum);
+    O << ", "
+      << getRegisterName(MI->getOperand(ListOpNum + 1).getReg(), AArch64::vreg);
+    printAnnotation(O, Annot);
+    return;
+  }
 
-    O << '#' << (64 - ScaleOp.getImm());
-}
+  if (LdStNInstrDesc *LdStDesc = getLdStNInstrDesc(Opcode)) {
+    O << "\t" << LdStDesc->Mnemonic << LdStDesc->Layout << '\t';
+
+    // Now onto the operands: first a vector list with possible lane
+    // specifier. E.g. { v0 }[2]
+    int OpNum = LdStDesc->ListOperand;
+    printVectorList(MI, OpNum++, O, "");
+
+    if (LdStDesc->HasLane)
+      O << '[' << MI->getOperand(OpNum++).getImm() << ']';
+
+    // Next the address: [xN]
+    unsigned AddrReg = MI->getOperand(OpNum++).getReg();
+    O << ", [" << getRegisterName(AddrReg) << ']';
+
+    // Finally, there might be a post-indexed offset.
+    if (LdStDesc->NaturalOffset != 0) {
+      unsigned Reg = MI->getOperand(OpNum++).getReg();
+      if (Reg != AArch64::XZR)
+        O << ", " << getRegisterName(Reg);
+      else {
+        assert(LdStDesc->NaturalOffset && "no offset on post-inc instruction?");
+        O << ", #" << LdStDesc->NaturalOffset;
+      }
+    }
 
+    printAnnotation(O, Annot);
+    return;
+  }
 
-void AArch64InstPrinter::printFPImmOperand(const MCInst *MI, unsigned OpNum,
-                                           raw_ostream &o) {
-  const MCOperand &MOImm8 = MI->getOperand(OpNum);
+  AArch64InstPrinter::printInst(MI, O, Annot);
+}
 
-  assert(MOImm8.isImm()
-         && "Immediate operand required for floating-point immediate inst");
+bool AArch64InstPrinter::printSysAlias(const MCInst *MI, raw_ostream &O) {
+#ifndef NDEBUG
+  unsigned Opcode = MI->getOpcode();
+  assert(Opcode == AArch64::SYSxt && "Invalid opcode for SYS alias!");
+#endif
+
+  const char *Asm = nullptr;
+  const MCOperand &Op1 = MI->getOperand(0);
+  const MCOperand &Cn = MI->getOperand(1);
+  const MCOperand &Cm = MI->getOperand(2);
+  const MCOperand &Op2 = MI->getOperand(3);
+
+  unsigned Op1Val = Op1.getImm();
+  unsigned CnVal = Cn.getImm();
+  unsigned CmVal = Cm.getImm();
+  unsigned Op2Val = Op2.getImm();
+
+  if (CnVal == 7) {
+    switch (CmVal) {
+    default:
+      break;
+
+    // IC aliases
+    case 1:
+      if (Op1Val == 0 && Op2Val == 0)
+        Asm = "ic\tialluis";
+      break;
+    case 5:
+      if (Op1Val == 0 && Op2Val == 0)
+        Asm = "ic\tiallu";
+      else if (Op1Val == 3 && Op2Val == 1)
+        Asm = "ic\tivau";
+      break;
+
+    // DC aliases
+    case 4:
+      if (Op1Val == 3 && Op2Val == 1)
+        Asm = "dc\tzva";
+      break;
+    case 6:
+      if (Op1Val == 0 && Op2Val == 1)
+        Asm = "dc\tivac";
+      if (Op1Val == 0 && Op2Val == 2)
+        Asm = "dc\tisw";
+      break;
+    case 10:
+      if (Op1Val == 3 && Op2Val == 1)
+        Asm = "dc\tcvac";
+      else if (Op1Val == 0 && Op2Val == 2)
+        Asm = "dc\tcsw";
+      break;
+    case 11:
+      if (Op1Val == 3 && Op2Val == 1)
+        Asm = "dc\tcvau";
+      break;
+    case 14:
+      if (Op1Val == 3 && Op2Val == 1)
+        Asm = "dc\tcivac";
+      else if (Op1Val == 0 && Op2Val == 2)
+        Asm = "dc\tcisw";
+      break;
+
+    // AT aliases
+    case 8:
+      switch (Op1Val) {
+      default:
+        break;
+      case 0:
+        switch (Op2Val) {
+        default:
+          break;
+        case 0: Asm = "at\ts1e1r"; break;
+        case 1: Asm = "at\ts1e1w"; break;
+        case 2: Asm = "at\ts1e0r"; break;
+        case 3: Asm = "at\ts1e0w"; break;
+        }
+        break;
+      case 4:
+        switch (Op2Val) {
+        default:
+          break;
+        case 0: Asm = "at\ts1e2r"; break;
+        case 1: Asm = "at\ts1e2w"; break;
+        case 4: Asm = "at\ts12e1r"; break;
+        case 5: Asm = "at\ts12e1w"; break;
+        case 6: Asm = "at\ts12e0r"; break;
+        case 7: Asm = "at\ts12e0w"; break;
+        }
+        break;
+      case 6:
+        switch (Op2Val) {
+        default:
+          break;
+        case 0: Asm = "at\ts1e3r"; break;
+        case 1: Asm = "at\ts1e3w"; break;
+        }
+        break;
+      }
+      break;
+    }
+  } else if (CnVal == 8) {
+    // TLBI aliases
+    switch (CmVal) {
+    default:
+      break;
+    case 3:
+      switch (Op1Val) {
+      default:
+        break;
+      case 0:
+        switch (Op2Val) {
+        default:
+          break;
+        case 0: Asm = "tlbi\tvmalle1is"; break;
+        case 1: Asm = "tlbi\tvae1is"; break;
+        case 2: Asm = "tlbi\taside1is"; break;
+        case 3: Asm = "tlbi\tvaae1is"; break;
+        case 5: Asm = "tlbi\tvale1is"; break;
+        case 7: Asm = "tlbi\tvaale1is"; break;
+        }
+        break;
+      case 4:
+        switch (Op2Val) {
+        default:
+          break;
+        case 0: Asm = "tlbi\talle2is"; break;
+        case 1: Asm = "tlbi\tvae2is"; break;
+        case 4: Asm = "tlbi\talle1is"; break;
+        case 5: Asm = "tlbi\tvale2is"; break;
+        case 6: Asm = "tlbi\tvmalls12e1is"; break;
+        }
+        break;
+      case 6:
+        switch (Op2Val) {
+        default:
+          break;
+        case 0: Asm = "tlbi\talle3is"; break;
+        case 1: Asm = "tlbi\tvae3is"; break;
+        case 5: Asm = "tlbi\tvale3is"; break;
+        }
+        break;
+      }
+      break;
+    case 0:
+      switch (Op1Val) {
+      default:
+        break;
+      case 4:
+        switch (Op2Val) {
+        default:
+          break;
+        case 1: Asm = "tlbi\tipas2e1is"; break;
+        case 5: Asm = "tlbi\tipas2le1is"; break;
+        }
+        break;
+      }
+      break;
+    case 4:
+      switch (Op1Val) {
+      default:
+        break;
+      case 4:
+        switch (Op2Val) {
+        default:
+          break;
+        case 1: Asm = "tlbi\tipas2e1"; break;
+        case 5: Asm = "tlbi\tipas2le1"; break;
+        }
+        break;
+      }
+      break;
+    case 7:
+      switch (Op1Val) {
+      default:
+        break;
+      case 0:
+        switch (Op2Val) {
+        default:
+          break;
+        case 0: Asm = "tlbi\tvmalle1"; break;
+        case 1: Asm = "tlbi\tvae1"; break;
+        case 2: Asm = "tlbi\taside1"; break;
+        case 3: Asm = "tlbi\tvaae1"; break;
+        case 5: Asm = "tlbi\tvale1"; break;
+        case 7: Asm = "tlbi\tvaale1"; break;
+        }
+        break;
+      case 4:
+        switch (Op2Val) {
+        default:
+          break;
+        case 0: Asm = "tlbi\talle2"; break;
+        case 1: Asm = "tlbi\tvae2"; break;
+        case 4: Asm = "tlbi\talle1"; break;
+        case 5: Asm = "tlbi\tvale2"; break;
+        case 6: Asm = "tlbi\tvmalls12e1"; break;
+        }
+        break;
+      case 6:
+        switch (Op2Val) {
+        default:
+          break;
+        case 0: Asm = "tlbi\talle3"; break;
+        case 1: Asm = "tlbi\tvae3";  break;
+        case 5: Asm = "tlbi\tvale3"; break;
+        }
+        break;
+      }
+      break;
+    }
+  }
+
+  if (Asm) {
+    unsigned Reg = MI->getOperand(4).getReg();
 
-  uint32_t Imm8 = MOImm8.getImm();
-  uint32_t Fraction = Imm8 & 0xf;
-  uint32_t Exponent = (Imm8 >> 4) & 0x7;
-  uint32_t Negative = (Imm8 >> 7) & 0x1;
+    O << '\t' << Asm;
+    if (StringRef(Asm).lower().find("all") == StringRef::npos)
+      O << ", " << getRegisterName(Reg);
+  }
 
-  float Val = 1.0f + Fraction / 16.0f;
+  return Asm != nullptr;
+}
 
-  // That is:
-  // 000 -> 2^1,  001 -> 2^2,  010 -> 2^3,  011 -> 2^4,
-  // 100 -> 2^-3, 101 -> 2^-2, 110 -> 2^-1, 111 -> 2^0
-  if (Exponent & 0x4) {
-    Val /= 1 << (7 - Exponent);
+void AArch64InstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
+                                      raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  if (Op.isReg()) {
+    unsigned Reg = Op.getReg();
+    O << getRegisterName(Reg);
+  } else if (Op.isImm()) {
+    O << '#' << Op.getImm();
   } else {
-    Val *= 1 << (Exponent + 1);
+    assert(Op.isExpr() && "unknown operand kind in printOperand");
+    O << *Op.getExpr();
   }
+}
 
-  Val = Negative ? -Val : Val;
-
-  o << '#' << format("%.8f", Val);
+void AArch64InstPrinter::printHexImm(const MCInst *MI, unsigned OpNo,
+                                     raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  O << format("#%#llx", Op.getImm());
 }
 
-void AArch64InstPrinter::printFPZeroOperand(const MCInst *MI, unsigned OpNum,
-                                            raw_ostream &o) {
-  o << "#0.0";
+void AArch64InstPrinter::printPostIncOperand(const MCInst *MI, unsigned OpNo,
+                                             unsigned Imm, raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  if (Op.isReg()) {
+    unsigned Reg = Op.getReg();
+    if (Reg == AArch64::XZR)
+      O << "#" << Imm;
+    else
+      O << getRegisterName(Reg);
+  } else
+    assert(0 && "unknown operand kind in printPostIncOperand64");
 }
 
-void
-AArch64InstPrinter::printCondCodeOperand(const MCInst *MI, unsigned OpNum,
-                                         raw_ostream &O) {
-  const MCOperand &MO = MI->getOperand(OpNum);
+void AArch64InstPrinter::printVRegOperand(const MCInst *MI, unsigned OpNo,
+                                          raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  assert(Op.isReg() && "Non-register vreg operand!");
+  unsigned Reg = Op.getReg();
+  O << getRegisterName(Reg, AArch64::vreg);
+}
 
-  O << A64CondCodeToString(static_cast<A64CC::CondCodes>(MO.getImm()));
+void AArch64InstPrinter::printSysCROperand(const MCInst *MI, unsigned OpNo,
+                                           raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNo);
+  assert(Op.isImm() && "System instruction C[nm] operands must be immediates!");
+  O << "c" << Op.getImm();
 }
 
-template <unsigned field_width, unsigned scale> void
-AArch64InstPrinter::printLabelOperand(const MCInst *MI, unsigned OpNum,
-                                            raw_ostream &O) {
+void AArch64InstPrinter::printAddSubImm(const MCInst *MI, unsigned OpNum,
+                                        raw_ostream &O) {
   const MCOperand &MO = MI->getOperand(OpNum);
-
-  if (!MO.isImm()) {
-    printOperand(MI, OpNum, O);
-    return;
+  if (MO.isImm()) {
+    unsigned Val = (MO.getImm() & 0xfff);
+    assert(Val == MO.getImm() && "Add/sub immediate out of range!");
+    unsigned Shift =
+        AArch64_AM::getShiftValue(MI->getOperand(OpNum + 1).getImm());
+    O << '#' << Val;
+    if (Shift != 0)
+      printShifter(MI, OpNum + 1, O);
+
+    if (CommentStream)
+      *CommentStream << '=' << (Val << Shift) << '\n';
+  } else {
+    assert(MO.isExpr() && "Unexpected operand type!");
+    O << *MO.getExpr();
+    printShifter(MI, OpNum + 1, O);
   }
+}
 
-  // The immediate of LDR (lit) instructions is a signed 19-bit immediate, which
-  // is multiplied by 4 (because all A64 instructions are 32-bits wide).
-  uint64_t UImm = MO.getImm();
-  uint64_t Sign = UImm & (1LL << (field_width - 1));
-  int64_t SImm = scale * ((UImm & ~Sign) - Sign);
-
-  O << "#" << SImm;
+void AArch64InstPrinter::printLogicalImm32(const MCInst *MI, unsigned OpNum,
+                                           raw_ostream &O) {
+  uint64_t Val = MI->getOperand(OpNum).getImm();
+  O << "#0x";
+  O.write_hex(AArch64_AM::decodeLogicalImmediate(Val, 32));
 }
 
-template<unsigned RegWidth> void
-AArch64InstPrinter::printLogicalImmOperand(const MCInst *MI, unsigned OpNum,
+void AArch64InstPrinter::printLogicalImm64(const MCInst *MI, unsigned OpNum,
                                            raw_ostream &O) {
-  const MCOperand &MO = MI->getOperand(OpNum);
-  uint64_t Val;
-  A64Imms::isLogicalImmBits(RegWidth, MO.getImm(), Val);
+  uint64_t Val = MI->getOperand(OpNum).getImm();
   O << "#0x";
-  O.write_hex(Val);
+  O.write_hex(AArch64_AM::decodeLogicalImmediate(Val, 64));
 }
 
-void
-AArch64InstPrinter::printOffsetUImm12Operand(const MCInst *MI, unsigned OpNum,
-                                               raw_ostream &O, int MemSize) {
-  const MCOperand &MOImm = MI->getOperand(OpNum);
+void AArch64InstPrinter::printShifter(const MCInst *MI, unsigned OpNum,
+                                      raw_ostream &O) {
+  unsigned Val = MI->getOperand(OpNum).getImm();
+  // LSL #0 should not be printed.
+  if (AArch64_AM::getShiftType(Val) == AArch64_AM::LSL &&
+      AArch64_AM::getShiftValue(Val) == 0)
+    return;
+  O << ", " << AArch64_AM::getShiftExtendName(AArch64_AM::getShiftType(Val))
+    << " #" << AArch64_AM::getShiftValue(Val);
+}
 
-  if (MOImm.isImm()) {
-    uint32_t Imm = MOImm.getImm() * MemSize;
+void AArch64InstPrinter::printShiftedRegister(const MCInst *MI, unsigned OpNum,
+                                              raw_ostream &O) {
+  O << getRegisterName(MI->getOperand(OpNum).getReg());
+  printShifter(MI, OpNum + 1, O);
+}
 
-    O << "#" << Imm;
-  } else {
-    O << "#" << *MOImm.getExpr();
+void AArch64InstPrinter::printExtendedRegister(const MCInst *MI, unsigned OpNum,
+                                               raw_ostream &O) {
+  O << getRegisterName(MI->getOperand(OpNum).getReg());
+  printArithExtend(MI, OpNum + 1, O);
+}
+
+void AArch64InstPrinter::printArithExtend(const MCInst *MI, unsigned OpNum,
+                                          raw_ostream &O) {
+  unsigned Val = MI->getOperand(OpNum).getImm();
+  AArch64_AM::ShiftExtendType ExtType = AArch64_AM::getArithExtendType(Val);
+  unsigned ShiftVal = AArch64_AM::getArithShiftValue(Val);
+
+  // If the destination or first source register operand is [W]SP, print
+  // UXTW/UXTX as LSL, and if the shift amount is also zero, print nothing at
+  // all.
+  if (ExtType == AArch64_AM::UXTW || ExtType == AArch64_AM::UXTX) {
+    unsigned Dest = MI->getOperand(0).getReg();
+    unsigned Src1 = MI->getOperand(1).getReg();
+    if ( ((Dest == AArch64::SP || Src1 == AArch64::SP) &&
+          ExtType == AArch64_AM::UXTX) ||
+         ((Dest == AArch64::WSP || Src1 == AArch64::WSP) &&
+          ExtType == AArch64_AM::UXTW) ) {
+      if (ShiftVal != 0)
+        O << ", lsl #" << ShiftVal;
+      return;
+    }
   }
+  O << ", " << AArch64_AM::getShiftExtendName(ExtType);
+  if (ShiftVal != 0)
+    O << " #" << ShiftVal;
 }
 
-void
-AArch64InstPrinter::printShiftOperand(const MCInst *MI,  unsigned OpNum,
-                                      raw_ostream &O,
-                                      A64SE::ShiftExtSpecifiers Shift) {
-    const MCOperand &MO = MI->getOperand(OpNum);
+void AArch64InstPrinter::printMemExtend(const MCInst *MI, unsigned OpNum,
+                                        raw_ostream &O, char SrcRegKind,
+                                        unsigned Width) {
+  unsigned SignExtend = MI->getOperand(OpNum).getImm();
+  unsigned DoShift = MI->getOperand(OpNum + 1).getImm();
 
-    // LSL #0 is not printed
-    if (Shift == A64SE::LSL && MO.isImm() && MO.getImm() == 0)
-        return;
+  // sxtw, sxtx, uxtw or lsl (== uxtx)
+  bool IsLSL = !SignExtend && SrcRegKind == 'x';
+  if (IsLSL)
+    O << "lsl";
+  else
+    O << (SignExtend ? 's' : 'u') << "xt" << SrcRegKind;
 
-    switch (Shift) {
-    case A64SE::LSL: O << "lsl"; break;
-    case A64SE::LSR: O << "lsr"; break;
-    case A64SE::ASR: O << "asr"; break;
-    case A64SE::ROR: O << "ror"; break;
-    default: llvm_unreachable("Invalid shift specifier in logical instruction");
-    }
+  if (DoShift || IsLSL)
+    O << " #" << Log2_32(Width / 8);
+}
 
-  O << " #" << MO.getImm();
+void AArch64InstPrinter::printCondCode(const MCInst *MI, unsigned OpNum,
+                                       raw_ostream &O) {
+  AArch64CC::CondCode CC = (AArch64CC::CondCode)MI->getOperand(OpNum).getImm();
+  O << AArch64CC::getCondCodeName(CC);
 }
 
-void
-AArch64InstPrinter::printMoveWideImmOperand(const MCInst *MI,  unsigned OpNum,
-                                            raw_ostream &O) {
-  const MCOperand &UImm16MO = MI->getOperand(OpNum);
-  const MCOperand &ShiftMO = MI->getOperand(OpNum + 1);
+void AArch64InstPrinter::printInverseCondCode(const MCInst *MI, unsigned OpNum,
+                                              raw_ostream &O) {
+  AArch64CC::CondCode CC = (AArch64CC::CondCode)MI->getOperand(OpNum).getImm();
+  O << AArch64CC::getCondCodeName(AArch64CC::getInvertedCondCode(CC));
+}
 
-  if (UImm16MO.isImm()) {
-    O << '#' << UImm16MO.getImm();
+void AArch64InstPrinter::printAMNoIndex(const MCInst *MI, unsigned OpNum,
+                                        raw_ostream &O) {
+  O << '[' << getRegisterName(MI->getOperand(OpNum).getReg()) << ']';
+}
 
-    if (ShiftMO.getImm() != 0)
-      O << ", lsl #" << (ShiftMO.getImm() * 16);
+template<int Scale>
+void AArch64InstPrinter::printImmScale(const MCInst *MI, unsigned OpNum,
+                                       raw_ostream &O) {
+  O << '#' << Scale * MI->getOperand(OpNum).getImm();
+}
 
-    return;
+void AArch64InstPrinter::printUImm12Offset(const MCInst *MI, unsigned OpNum,
+                                           unsigned Scale, raw_ostream &O) {
+  const MCOperand MO = MI->getOperand(OpNum);
+  if (MO.isImm()) {
+    O << "#" << (MO.getImm() * Scale);
+  } else {
+    assert(MO.isExpr() && "Unexpected operand type!");
+    O << *MO.getExpr();
   }
-
-  O << "#" << *UImm16MO.getExpr();
 }
 
-void AArch64InstPrinter::printNamedImmOperand(const NamedImmMapper &Mapper,
-                                              const MCInst *MI, unsigned OpNum,
-                                              raw_ostream &O) {
-  bool ValidName;
-  const MCOperand &MO = MI->getOperand(OpNum);
-  StringRef Name = Mapper.toString(MO.getImm(), ValidName);
+void AArch64InstPrinter::printAMIndexedWB(const MCInst *MI, unsigned OpNum,
+                                          unsigned Scale, raw_ostream &O) {
+  const MCOperand MO1 = MI->getOperand(OpNum + 1);
+  O << '[' << getRegisterName(MI->getOperand(OpNum).getReg());
+  if (MO1.isImm()) {
+      O << ", #" << (MO1.getImm() * Scale);
+  } else {
+    assert(MO1.isExpr() && "Unexpected operand type!");
+    O << ", " << *MO1.getExpr();
+  }
+  O << ']';
+}
 
-  if (ValidName)
+void AArch64InstPrinter::printPrefetchOp(const MCInst *MI, unsigned OpNum,
+                                         raw_ostream &O) {
+  unsigned prfop = MI->getOperand(OpNum).getImm();
+  bool Valid;
+  StringRef Name = AArch64PRFM::PRFMMapper().toString(prfop, Valid);
+  if (Valid)
     O << Name;
   else
-    O << '#' << MO.getImm();
+    O << '#' << prfop;
 }
 
-void
-AArch64InstPrinter::printSysRegOperand(const A64SysReg::SysRegMapper &Mapper,
-                                       const MCInst *MI, unsigned OpNum,
-                                       raw_ostream &O) {
+void AArch64InstPrinter::printFPImmOperand(const MCInst *MI, unsigned OpNum,
+                                           raw_ostream &O) {
   const MCOperand &MO = MI->getOperand(OpNum);
+  float FPImm =
+      MO.isFPImm() ? MO.getFPImm() : AArch64_AM::getFPImmFloat(MO.getImm());
 
-  bool ValidName;
-  std::string Name = Mapper.toString(MO.getImm(), ValidName);
-  if (ValidName) {
-    O << Name;
-    return;
-  }
+  // 8 decimal places are enough to perfectly represent permitted floats.
+  O << format("#%.8f", FPImm);
 }
 
+static unsigned getNextVectorRegister(unsigned Reg, unsigned Stride = 1) {
+  while (Stride--) {
+    switch (Reg) {
+    default:
+      assert(0 && "Vector register expected!");
+    case AArch64::Q0:  Reg = AArch64::Q1;  break;
+    case AArch64::Q1:  Reg = AArch64::Q2;  break;
+    case AArch64::Q2:  Reg = AArch64::Q3;  break;
+    case AArch64::Q3:  Reg = AArch64::Q4;  break;
+    case AArch64::Q4:  Reg = AArch64::Q5;  break;
+    case AArch64::Q5:  Reg = AArch64::Q6;  break;
+    case AArch64::Q6:  Reg = AArch64::Q7;  break;
+    case AArch64::Q7:  Reg = AArch64::Q8;  break;
+    case AArch64::Q8:  Reg = AArch64::Q9;  break;
+    case AArch64::Q9:  Reg = AArch64::Q10; break;
+    case AArch64::Q10: Reg = AArch64::Q11; break;
+    case AArch64::Q11: Reg = AArch64::Q12; break;
+    case AArch64::Q12: Reg = AArch64::Q13; break;
+    case AArch64::Q13: Reg = AArch64::Q14; break;
+    case AArch64::Q14: Reg = AArch64::Q15; break;
+    case AArch64::Q15: Reg = AArch64::Q16; break;
+    case AArch64::Q16: Reg = AArch64::Q17; break;
+    case AArch64::Q17: Reg = AArch64::Q18; break;
+    case AArch64::Q18: Reg = AArch64::Q19; break;
+    case AArch64::Q19: Reg = AArch64::Q20; break;
+    case AArch64::Q20: Reg = AArch64::Q21; break;
+    case AArch64::Q21: Reg = AArch64::Q22; break;
+    case AArch64::Q22: Reg = AArch64::Q23; break;
+    case AArch64::Q23: Reg = AArch64::Q24; break;
+    case AArch64::Q24: Reg = AArch64::Q25; break;
+    case AArch64::Q25: Reg = AArch64::Q26; break;
+    case AArch64::Q26: Reg = AArch64::Q27; break;
+    case AArch64::Q27: Reg = AArch64::Q28; break;
+    case AArch64::Q28: Reg = AArch64::Q29; break;
+    case AArch64::Q29: Reg = AArch64::Q30; break;
+    case AArch64::Q30: Reg = AArch64::Q31; break;
+    // Vector lists can wrap around.
+    case AArch64::Q31:
+      Reg = AArch64::Q0;
+      break;
+    }
+  }
+  return Reg;
+}
 
-void AArch64InstPrinter::printRegExtendOperand(const MCInst *MI,
-                                               unsigned OpNum,
-                                               raw_ostream &O,
-                                               A64SE::ShiftExtSpecifiers Ext) {
-  // FIXME: In principle TableGen should be able to detect this itself far more
-  // easily. We will only accumulate more of these hacks.
-  unsigned Reg0 = MI->getOperand(0).getReg();
-  unsigned Reg1 = MI->getOperand(1).getReg();
-
-  if (isStackReg(Reg0) || isStackReg(Reg1)) {
-    A64SE::ShiftExtSpecifiers LSLEquiv;
-
-    if (Reg0 == AArch64::XSP || Reg1 == AArch64::XSP)
-      LSLEquiv = A64SE::UXTX;
-    else
-      LSLEquiv = A64SE::UXTW;
+void AArch64InstPrinter::printVectorList(const MCInst *MI, unsigned OpNum,
+                                         raw_ostream &O,
+                                         StringRef LayoutSuffix) {
+  unsigned Reg = MI->getOperand(OpNum).getReg();
 
-    if (Ext == LSLEquiv) {
-      O << "lsl #" << MI->getOperand(OpNum).getImm();
-      return;
-    }
+  O << "{ ";
+
+  // Work out how many registers there are in the list (if there is an actual
+  // list).
+  unsigned NumRegs = 1;
+  if (MRI.getRegClass(AArch64::DDRegClassID).contains(Reg) ||
+      MRI.getRegClass(AArch64::QQRegClassID).contains(Reg))
+    NumRegs = 2;
+  else if (MRI.getRegClass(AArch64::DDDRegClassID).contains(Reg) ||
+           MRI.getRegClass(AArch64::QQQRegClassID).contains(Reg))
+    NumRegs = 3;
+  else if (MRI.getRegClass(AArch64::DDDDRegClassID).contains(Reg) ||
+           MRI.getRegClass(AArch64::QQQQRegClassID).contains(Reg))
+    NumRegs = 4;
+
+  // Now forget about the list and find out what the first register is.
+  if (unsigned FirstReg = MRI.getSubReg(Reg, AArch64::dsub0))
+    Reg = FirstReg;
+  else if (unsigned FirstReg = MRI.getSubReg(Reg, AArch64::qsub0))
+    Reg = FirstReg;
+
+  // If it's a D-reg, we need to promote it to the equivalent Q-reg before
+  // printing (otherwise getRegisterName fails).
+  if (MRI.getRegClass(AArch64::FPR64RegClassID).contains(Reg)) {
+    const MCRegisterClass &FPR128RC =
+        MRI.getRegClass(AArch64::FPR128RegClassID);
+    Reg = MRI.getMatchingSuperReg(Reg, AArch64::dsub, &FPR128RC);
   }
 
-  switch (Ext) {
-  case A64SE::UXTB: O << "uxtb"; break;
-  case A64SE::UXTH: O << "uxth"; break;
-  case A64SE::UXTW: O << "uxtw"; break;
-  case A64SE::UXTX: O << "uxtx"; break;
-  case A64SE::SXTB: O << "sxtb"; break;
-  case A64SE::SXTH: O << "sxth"; break;
-  case A64SE::SXTW: O << "sxtw"; break;
-  case A64SE::SXTX: O << "sxtx"; break;
-  default: llvm_unreachable("Unexpected shift type for printing");
+  for (unsigned i = 0; i < NumRegs; ++i, Reg = getNextVectorRegister(Reg)) {
+    O << getRegisterName(Reg, AArch64::vreg) << LayoutSuffix;
+    if (i + 1 != NumRegs)
+      O << ", ";
   }
 
-  const MCOperand &MO = MI->getOperand(OpNum);
-  if (MO.getImm() != 0)
-    O << " #" << MO.getImm();
+  O << " }";
 }
 
-template<int MemScale> void
-AArch64InstPrinter::printSImm7ScaledOperand(const MCInst *MI, unsigned OpNum,
-                                      raw_ostream &O) {
-  const MCOperand &MOImm = MI->getOperand(OpNum);
-  int32_t Imm = unpackSignedImm(7, MOImm.getImm());
+void AArch64InstPrinter::printImplicitlyTypedVectorList(const MCInst *MI,
+                                                        unsigned OpNum,
+                                                        raw_ostream &O) {
+  printVectorList(MI, OpNum, O, "");
+}
+
+template <unsigned NumLanes, char LaneKind>
+void AArch64InstPrinter::printTypedVectorList(const MCInst *MI, unsigned OpNum,
+                                              raw_ostream &O) {
+  std::string Suffix(".");
+  if (NumLanes)
+    Suffix += itostr(NumLanes) + LaneKind;
+  else
+    Suffix += LaneKind;
 
-  O << "#" << (Imm * MemScale);
+  printVectorList(MI, OpNum, O, Suffix);
 }
 
-void AArch64InstPrinter::printVPRRegister(const MCInst *MI, unsigned OpNo,
+void AArch64InstPrinter::printVectorIndex(const MCInst *MI, unsigned OpNum,
                                           raw_ostream &O) {
-  unsigned Reg = MI->getOperand(OpNo).getReg();
-  std::string Name = getRegisterName(Reg);
-  Name[0] = 'v';
-  O << Name;
+  O << "[" << MI->getOperand(OpNum).getImm() << "]";
 }
 
-void AArch64InstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
-                                      raw_ostream &O) {
-  const MCOperand &Op = MI->getOperand(OpNo);
-  if (Op.isReg()) {
-    unsigned Reg = Op.getReg();
-    O << getRegisterName(Reg);
-  } else if (Op.isImm()) {
-    O << '#' << Op.getImm();
+void AArch64InstPrinter::printAlignedLabel(const MCInst *MI, unsigned OpNum,
+                                           raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNum);
+
+  // If the label has already been resolved to an immediate offset (say, when
+  // we're running the disassembler), just print the immediate.
+  if (Op.isImm()) {
+    O << "#" << (Op.getImm() << 2);
+    return;
+  }
+
+  // If the branch target is simply an address then print it in hex.
+  const MCConstantExpr *BranchTarget =
+      dyn_cast<MCConstantExpr>(MI->getOperand(OpNum).getExpr());
+  int64_t Address;
+  if (BranchTarget && BranchTarget->EvaluateAsAbsolute(Address)) {
+    O << "0x";
+    O.write_hex(Address);
   } else {
-    assert(Op.isExpr() && "unknown operand kind in printOperand");
-    // If a symbolic branch target was added as a constant expression then print
-    // that address in hex.
-    const MCConstantExpr *BranchTarget = dyn_cast<MCConstantExpr>(Op.getExpr());
-    int64_t Address;
-    if (BranchTarget && BranchTarget->EvaluateAsAbsolute(Address)) {
-      O << "0x";
-      O.write_hex(Address);
-    }
-    else {
-      // Otherwise, just print the expression.
-      O << *Op.getExpr();
-    }
+    // Otherwise, just print the expression.
+    O << *MI->getOperand(OpNum).getExpr();
   }
 }
 
+void AArch64InstPrinter::printAdrpLabel(const MCInst *MI, unsigned OpNum,
+                                        raw_ostream &O) {
+  const MCOperand &Op = MI->getOperand(OpNum);
 
-void AArch64InstPrinter::printInst(const MCInst *MI, raw_ostream &O,
-                                   StringRef Annot) {
-  if (MI->getOpcode() == AArch64::TLSDESCCALL) {
-    // This is a special assembler directive which applies an
-    // R_AARCH64_TLSDESC_CALL to the following (BLR) instruction. It has a fixed
-    // form outside the normal TableGenerated scheme.
-    O << "\t.tlsdesccall " << *MI->getOperand(0).getExpr();
-  } else if (!printAliasInstr(MI, O))
-    printInstruction(MI, O);
+  // If the label has already been resolved to an immediate offset (say, when
+  // we're running the disassembler), just print the immediate.
+  if (Op.isImm()) {
+    O << "#" << (Op.getImm() << 12);
+    return;
+  }
 
-  printAnnotation(O, Annot);
+  // Otherwise, just print the expression.
+  O << *MI->getOperand(OpNum).getExpr();
 }
 
-template <A64SE::ShiftExtSpecifiers Ext, bool isHalf>
-void AArch64InstPrinter::printNeonMovImmShiftOperand(const MCInst *MI,
-                                                     unsigned OpNum,
-                                                     raw_ostream &O) {
-  const MCOperand &MO = MI->getOperand(OpNum);
-
-  assert(MO.isImm() &&
-         "Immediate operand required for Neon vector immediate inst.");
-
-  bool IsLSL = false;
-  if (Ext == A64SE::LSL)
-    IsLSL = true;
-  else if (Ext != A64SE::MSL)
-    llvm_unreachable("Invalid shift specifier in movi instruction");
-
-  int64_t Imm = MO.getImm();
-
-  // MSL and LSLH accepts encoded shift amount 0 or 1.
-  if ((!IsLSL || (IsLSL && isHalf)) && Imm != 0 && Imm != 1)
-    llvm_unreachable("Invalid shift amount in movi instruction");
-
-  // LSH accepts encoded shift amount 0, 1, 2 or 3.
-  if (IsLSL && (Imm < 0 || Imm > 3))
-    llvm_unreachable("Invalid shift amount in movi instruction");
-
-  // Print shift amount as multiple of 8 with MSL encoded shift amount
-  // 0 and 1 printed as 8 and 16.
-  if (!IsLSL)
-    Imm++;
-  Imm *= 8;
-
-  // LSL #0 is not printed
-  if (IsLSL) {
-    if (Imm == 0)
-      return;
-    O << ", lsl";
-  } else
-    O << ", msl";
-
-  O << " #" << Imm;
-}
+void AArch64InstPrinter::printBarrierOption(const MCInst *MI, unsigned OpNo,
+                                            raw_ostream &O) {
+  unsigned Val = MI->getOperand(OpNo).getImm();
+  unsigned Opcode = MI->getOpcode();
 
-void AArch64InstPrinter::printNeonUImm0Operand(const MCInst *MI, unsigned OpNum,
-                                               raw_ostream &o) {
-  o << "#0x0";
+  bool Valid;
+  StringRef Name;
+  if (Opcode == AArch64::ISB)
+    Name = AArch64ISB::ISBMapper().toString(Val, Valid);
+  else
+    Name = AArch64DB::DBarrierMapper().toString(Val, Valid);
+  if (Valid)
+    O << Name;
+  else
+    O << "#" << Val;
 }
 
-void AArch64InstPrinter::printUImmHexOperand(const MCInst *MI, unsigned OpNum,
-                                             raw_ostream &O) {
-  const MCOperand &MOUImm = MI->getOperand(OpNum);
-
-  assert(MOUImm.isImm() &&
-         "Immediate operand required for Neon vector immediate inst.");
+void AArch64InstPrinter::printMRSSystemRegister(const MCInst *MI, unsigned OpNo,
+                                                raw_ostream &O) {
+  unsigned Val = MI->getOperand(OpNo).getImm();
 
-  unsigned Imm = MOUImm.getImm();
+  bool Valid;
+  auto Mapper = AArch64SysReg::MRSMapper(getAvailableFeatures());
+  std::string Name = Mapper.toString(Val, Valid);
 
-  O << "#0x";
-  O.write_hex(Imm);
+  if (Valid)
+    O << StringRef(Name).upper();
 }
 
-void AArch64InstPrinter::printUImmBareOperand(const MCInst *MI,
-                                              unsigned OpNum,
-                                              raw_ostream &O) {
-  const MCOperand &MOUImm = MI->getOperand(OpNum);
+void AArch64InstPrinter::printMSRSystemRegister(const MCInst *MI, unsigned OpNo,
+                                                raw_ostream &O) {
+  unsigned Val = MI->getOperand(OpNo).getImm();
 
-  assert(MOUImm.isImm()
-         && "Immediate operand required for Neon vector immediate inst.");
+  bool Valid;
+  auto Mapper = AArch64SysReg::MSRMapper(getAvailableFeatures());
+  std::string Name = Mapper.toString(Val, Valid);
 
-  unsigned Imm = MOUImm.getImm();
-  O << Imm;
+  if (Valid)
+    O << StringRef(Name).upper();
 }
 
-void AArch64InstPrinter::printNeonUImm64MaskOperand(const MCInst *MI,
-                                                    unsigned OpNum,
-                                                    raw_ostream &O) {
-  const MCOperand &MOUImm8 = MI->getOperand(OpNum);
-
-  assert(MOUImm8.isImm() &&
-         "Immediate operand required for Neon vector immediate bytemask inst.");
+void AArch64InstPrinter::printSystemPStateField(const MCInst *MI, unsigned OpNo,
+                                                raw_ostream &O) {
+  unsigned Val = MI->getOperand(OpNo).getImm();
 
-  uint32_t UImm8 = MOUImm8.getImm();
-  uint64_t Mask = 0;
-
-  // Replicates 0x00 or 0xff byte in a 64-bit vector
-  for (unsigned ByteNum = 0; ByteNum < 8; ++ByteNum) {
-    if ((UImm8 >> ByteNum) & 1)
-      Mask |= (uint64_t)0xff << (8 * ByteNum);
-  }
-
-  O << "#0x";
-  O.write_hex(Mask);
+  bool Valid;
+  StringRef Name = AArch64PState::PStateMapper().toString(Val, Valid);
+  if (Valid)
+    O << StringRef(Name.str()).upper();
+  else
+    O << "#" << Val;
 }
 
-// If Count > 1, there are two valid kinds of vector list:
-//   (1) {Vn.layout, Vn+1.layout, ... , Vm.layout}
-//   (2) {Vn.layout - Vm.layout}
-// We choose the first kind as output.
-template <A64Layout::VectorLayout Layout, unsigned Count>
-void AArch64InstPrinter::printVectorList(const MCInst *MI, unsigned OpNum,
-                                         raw_ostream &O) {
-  assert(Count >= 1 && Count <= 4 && "Invalid Number of Vectors");
-
-  unsigned Reg = MI->getOperand(OpNum).getReg();
-  std::string LayoutStr = A64VectorLayoutToString(Layout);
-  O << "{";
-  if (Count > 1) { // Print sub registers separately
-    bool IsVec64 = (Layout < A64Layout::VL_16B);
-    unsigned SubRegIdx = IsVec64 ? AArch64::dsub_0 : AArch64::qsub_0;
-    for (unsigned I = 0; I < Count; I++) {
-      std::string Name = getRegisterName(MRI.getSubReg(Reg, SubRegIdx++));
-      Name[0] = 'v';
-      O << Name << LayoutStr;
-      if (I != Count - 1)
-        O << ", ";
-    }
-  } else { // Print the register directly when NumVecs is 1.
-    std::string Name = getRegisterName(Reg);
-    Name[0] = 'v';
-    O << Name << LayoutStr;
-  }
-  O << "}";
+void AArch64InstPrinter::printSIMDType10Operand(const MCInst *MI, unsigned OpNo,
+                                                raw_ostream &O) {
+  unsigned RawVal = MI->getOperand(OpNo).getImm();
+  uint64_t Val = AArch64_AM::decodeAdvSIMDModImmType10(RawVal);
+  O << format("#%#016llx", Val);
 }
diff --git a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h
index 37b7273..fe7666e 100644
--- a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h
+++ b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h
@@ -11,11 +11,11 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_AARCH64INSTPRINTER_H
-#define LLVM_AARCH64INSTPRINTER_H
+#ifndef AArch64INSTPRINTER_H
+#define AArch64INSTPRINTER_H
 
 #include "MCTargetDesc/AArch64MCTargetDesc.h"
-#include "Utils/AArch64BaseInfo.h"
+#include "llvm/ADT/StringRef.h"
 #include "llvm/MC/MCInstPrinter.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 
@@ -28,154 +28,112 @@ public:
   AArch64InstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
                      const MCRegisterInfo &MRI, const MCSubtargetInfo &STI);
 
-  // Autogenerated by tblgen
-  void printInstruction(const MCInst *MI, raw_ostream &O);
-  bool printAliasInstr(const MCInst *MI, raw_ostream &O);
-  static const char *getRegisterName(unsigned RegNo);
-  static const char *getInstructionName(unsigned Opcode);
+  void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot) override;
+  void printRegName(raw_ostream &OS, unsigned RegNo) const override;
 
-  void printRegName(raw_ostream &O, unsigned RegNum) const;
-
-  template<unsigned MemSize, unsigned RmSize>
-  void printAddrRegExtendOperand(const MCInst *MI, unsigned OpNum,
-                                 raw_ostream &O) {
-    printAddrRegExtendOperand(MI, OpNum, O, MemSize, RmSize);
+  // Autogenerated by tblgen.
+  virtual void printInstruction(const MCInst *MI, raw_ostream &O);
+  virtual bool printAliasInstr(const MCInst *MI, raw_ostream &O);
+  virtual void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx,
+                                       unsigned PrintMethodIdx, raw_ostream &O);
+  virtual StringRef getRegName(unsigned RegNo) const {
+    return getRegisterName(RegNo);
   }
+  static const char *getRegisterName(unsigned RegNo,
+                                     unsigned AltIdx = AArch64::NoRegAltName);
 
-
-  void printAddrRegExtendOperand(const MCInst *MI, unsigned OpNum,
-                                 raw_ostream &O, unsigned MemSize,
-                                 unsigned RmSize);
-
-  void printAddSubImmLSL0Operand(const MCInst *MI,
-                                 unsigned OpNum, raw_ostream &O);
-  void printAddSubImmLSL12Operand(const MCInst *MI,
-                                  unsigned OpNum, raw_ostream &O);
-
-  void printBareImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
-
-  template<unsigned RegWidth>
-  void printBFILSBOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
-  void printBFIWidthOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
-  void printBFXWidthOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
-
-
-  void printCondCodeOperand(const MCInst *MI, unsigned OpNum,
-                            raw_ostream &O);
-
-  void printCRxOperand(const MCInst *MI, unsigned OpNum,
-                       raw_ostream &O);
-
-  void printCVTFixedPosOperand(const MCInst *MI, unsigned OpNum,
-                               raw_ostream &O);
-
-  void printFPImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &o);
-
-  void printFPZeroOperand(const MCInst *MI, unsigned OpNum, raw_ostream &o);
-
-  template<int MemScale>
-  void printOffsetUImm12Operand(const MCInst *MI,
-                                  unsigned OpNum, raw_ostream &o) {
-    printOffsetUImm12Operand(MI, OpNum, o, MemScale);
+protected:
+  bool printSysAlias(const MCInst *MI, raw_ostream &O);
+  // Operand printers
+  void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printHexImm(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printPostIncOperand(const MCInst *MI, unsigned OpNo, unsigned Imm,
+                           raw_ostream &O);
+  template<int Amount>
+  void printPostIncOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
+    printPostIncOperand(MI, OpNo, Amount, O);
   }
 
-  void printOffsetUImm12Operand(const MCInst *MI, unsigned OpNum,
-                                  raw_ostream &o, int MemScale);
-
-  template<unsigned field_width, unsigned scale>
-  void printLabelOperand(const MCInst *MI, unsigned OpNum,
-                         raw_ostream &O);
-
-  template<unsigned RegWidth>
-  void printLogicalImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
-
-  template<typename SomeNamedImmMapper>
-  void printNamedImmOperand(const MCInst *MI, unsigned OpNum,
-                            raw_ostream &O) {
-    printNamedImmOperand(SomeNamedImmMapper(), MI, OpNum, O);
+  void printVRegOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printSysCROperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printAddSubImm(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printLogicalImm32(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printLogicalImm64(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printShifter(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printShiftedRegister(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printExtendedRegister(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printArithExtend(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+
+  void printMemExtend(const MCInst *MI, unsigned OpNum, raw_ostream &O,
+                      char SrcRegKind, unsigned Width);
+  template <char SrcRegKind, unsigned Width>
+  void printMemExtend(const MCInst *MI, unsigned OpNum, raw_ostream &O) {
+    printMemExtend(MI, OpNum, O, SrcRegKind, Width);
   }
 
-  void printNamedImmOperand(const NamedImmMapper &Mapper,
-                            const MCInst *MI, unsigned OpNum,
-                            raw_ostream &O);
-
-  void printSysRegOperand(const A64SysReg::SysRegMapper &Mapper,
-                          const MCInst *MI, unsigned OpNum,
-                          raw_ostream &O);
+  void printCondCode(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printInverseCondCode(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printAlignedLabel(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printUImm12Offset(const MCInst *MI, unsigned OpNum, unsigned Scale,
+                         raw_ostream &O);
+  void printAMIndexedWB(const MCInst *MI, unsigned OpNum, unsigned Scale,
+                        raw_ostream &O);
 
-  void printMRSOperand(const MCInst *MI, unsigned OpNum,
-                       raw_ostream &O) {
-    printSysRegOperand(A64SysReg::MRSMapper(), MI, OpNum, O);
+  template<int Scale>
+  void printUImm12Offset(const MCInst *MI, unsigned OpNum, raw_ostream &O) {
+    printUImm12Offset(MI, OpNum, Scale, O);
   }
 
-  void printMSROperand(const MCInst *MI, unsigned OpNum,
-                       raw_ostream &O) {
-    printSysRegOperand(A64SysReg::MSRMapper(), MI, OpNum, O);
+  template<int BitWidth>
+  void printAMIndexedWB(const MCInst *MI, unsigned OpNum, raw_ostream &O) {
+    printAMIndexedWB(MI, OpNum, BitWidth / 8, O);
   }
 
-  void printShiftOperand(const char *name, const MCInst *MI,
-                         unsigned OpIdx, raw_ostream &O);
-
-  void printLSLOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printAMNoIndex(const MCInst *MI, unsigned OpNum, raw_ostream &O);
 
-  void printLSROperand(const MCInst *MI, unsigned OpNum, raw_ostream &O) {
-    printShiftOperand("lsr", MI, OpNum, O);
-  }
-  void printASROperand(const MCInst *MI, unsigned OpNum, raw_ostream &O) {
-    printShiftOperand("asr", MI, OpNum, O);
-  }
-  void printROROperand(const MCInst *MI, unsigned OpNum, raw_ostream &O) {
-    printShiftOperand("ror", MI, OpNum, O);
-  }
+  template<int Scale>
+  void printImmScale(const MCInst *MI, unsigned OpNum, raw_ostream &O);
 
-  template<A64SE::ShiftExtSpecifiers Shift>
-  void printShiftOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O) {
-    printShiftOperand(MI, OpNum, O, Shift);
-  }
+  void printPrefetchOp(const MCInst *MI, unsigned OpNum, raw_ostream &O);
 
-  void printShiftOperand(const MCInst *MI, unsigned OpNum,
-                         raw_ostream &O, A64SE::ShiftExtSpecifiers Sh);
+  void printFPImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
 
+  void printVectorList(const MCInst *MI, unsigned OpNum, raw_ostream &O,
+                       StringRef LayoutSuffix);
 
-  void printMoveWideImmOperand(const  MCInst *MI, unsigned OpNum,
-                               raw_ostream &O);
+  /// Print a list of vector registers where the type suffix is implicit
+  /// (i.e. attached to the instruction rather than the registers).
+  void printImplicitlyTypedVectorList(const MCInst *MI, unsigned OpNum,
+                                      raw_ostream &O);
 
-  template<int MemSize> void
-  printSImm7ScaledOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  template <unsigned NumLanes, char LaneKind>
+  void printTypedVectorList(const MCInst *MI, unsigned OpNum, raw_ostream &O);
 
-  void printOffsetSImm9Operand(const MCInst *MI, unsigned OpNum,
-                               raw_ostream &O);
-
-  void printPRFMOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
-
-  template<A64SE::ShiftExtSpecifiers EXT>
-  void printRegExtendOperand(const MCInst *MI, unsigned OpNum,
-                             raw_ostream &O) {
-    printRegExtendOperand(MI, OpNum, O, EXT);
-  }
+  void printVectorIndex(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printAdrpLabel(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printBarrierOption(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printMSRSystemRegister(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printMRSSystemRegister(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printSystemPStateField(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  void printSIMDType10Operand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+};
 
-  void printRegExtendOperand(const MCInst *MI, unsigned OpNum,
-                             raw_ostream &O, A64SE::ShiftExtSpecifiers Ext);
+class AArch64AppleInstPrinter : public AArch64InstPrinter {
+public:
+  AArch64AppleInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
+                        const MCRegisterInfo &MRI, const MCSubtargetInfo &STI);
 
-  void printVPRRegister(const MCInst *MI, unsigned OpNo, raw_ostream &O);
-  void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
-  virtual void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot);
+  void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot) override;
 
-  bool isStackReg(unsigned RegNo) {
-    return RegNo == AArch64::XSP || RegNo == AArch64::WSP;
+  void printInstruction(const MCInst *MI, raw_ostream &O) override;
+  bool printAliasInstr(const MCInst *MI, raw_ostream &O) override;
+  virtual void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx,
+                                       unsigned PrintMethodIdx, raw_ostream &O);
+  StringRef getRegName(unsigned RegNo) const override {
+    return getRegisterName(RegNo);
   }
-
-  template <A64SE::ShiftExtSpecifiers Ext, bool IsHalf>
-  void printNeonMovImmShiftOperand(const MCInst *MI, unsigned OpNum,
-                                   raw_ostream &O);
-  void printNeonUImm0Operand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
-  void printUImmHexOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
-  void printUImmBareOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
-  void printNeonUImm64MaskOperand(const MCInst *MI, unsigned OpNum,
-                                  raw_ostream &O);
-
-  template <A64Layout::VectorLayout Layout, unsigned Count>
-  void printVectorList(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+  static const char *getRegisterName(unsigned RegNo,
+                                     unsigned AltIdx = AArch64::NoRegAltName);
 };
 }
 
diff --git a/lib/Target/AArch64/InstPrinter/Android.mk b/lib/Target/AArch64/InstPrinter/Android.mk
index ac9b0df..de6aa89 100644
--- a/lib/Target/AArch64/InstPrinter/Android.mk
+++ b/lib/Target/AArch64/InstPrinter/Android.mk
@@ -2,6 +2,7 @@ LOCAL_PATH := $(call my-dir)
 
 arm64_asm_printer_TBLGEN_TABLES := \
   AArch64GenAsmWriter.inc \
+  AArch64GenAsmWriter1.inc \
   AArch64GenRegisterInfo.inc \
   AArch64GenSubtargetInfo.inc \
   AArch64GenInstrInfo.inc
diff --git a/lib/Target/AArch64/InstPrinter/CMakeLists.txt b/lib/Target/AArch64/InstPrinter/CMakeLists.txt
index 3db56e4..363f502 100644
--- a/lib/Target/AArch64/InstPrinter/CMakeLists.txt
+++ b/lib/Target/AArch64/InstPrinter/CMakeLists.txt
@@ -1,3 +1,7 @@
+include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
+
 add_llvm_library(LLVMAArch64AsmPrinter
   AArch64InstPrinter.cpp
   )
+
+add_dependencies(LLVMAArch64AsmPrinter AArch64CommonTableGen)
diff --git a/lib/Target/AArch64/InstPrinter/LLVMBuild.txt b/lib/Target/AArch64/InstPrinter/LLVMBuild.txt
index 4836c7c..a13e842 100644
--- a/lib/Target/AArch64/InstPrinter/LLVMBuild.txt
+++ b/lib/Target/AArch64/InstPrinter/LLVMBuild.txt
@@ -1,4 +1,4 @@
-;===- ./lib/Target/AArch64/InstPrinter/LLVMBuild.txt -----------*- Conf -*--===;
+;===- ./lib/Target/AArch64/InstPrinter/LLVMBuild.txt -------------*- Conf -*--===;
 ;
 ;                     The LLVM Compiler Infrastructure
 ;
diff --git a/lib/Target/AArch64/InstPrinter/Makefile b/lib/Target/AArch64/InstPrinter/Makefile
index 1c36a8d..b17e8d0 100644
--- a/lib/Target/AArch64/InstPrinter/Makefile
+++ b/lib/Target/AArch64/InstPrinter/Makefile
@@ -9,7 +9,7 @@
 LEVEL = ../../../..
 LIBRARYNAME = LLVMAArch64AsmPrinter
 
-# Hack: we need to include 'main' target directory to grab private headers
+# Hack: we need to include 'main' arm target directory to grab private headers
 CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
 
 include $(LEVEL)/Makefile.common
diff --git a/lib/Target/AArch64/LLVMBuild.txt b/lib/Target/AArch64/LLVMBuild.txt
index 4c8f101..642c183 100644
--- a/lib/Target/AArch64/LLVMBuild.txt
+++ b/lib/Target/AArch64/LLVMBuild.txt
@@ -1,4 +1,4 @@
-;===- ./lib/Target/AArch64/LLVMBuild.txt -----------------------*- Conf -*--===;
+;===- ./lib/Target/AArch64/LLVMBuild.txt -------------------------*- Conf -*--===;
 ;
 ;                     The LLVM Compiler Infrastructure
 ;
@@ -31,5 +31,5 @@ has_jit = 1
 type = Library
 name = AArch64CodeGen
 parent = AArch64
-required_libraries = AArch64AsmPrinter AArch64Desc AArch64Info AArch64Utils AsmPrinter CodeGen Core MC SelectionDAG Support Target
+required_libraries = AArch64AsmPrinter AArch64Desc AArch64Info AArch64Utils Analysis AsmPrinter CodeGen Core MC Scalar SelectionDAG Support Target
 add_to_library_groups = AArch64
diff --git a/lib/Target/ARM64/MCTargetDesc/ARM64AddressingModes.h b/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h
index 7717743..8b1e44e 100644
--- a/lib/Target/ARM64/MCTargetDesc/ARM64AddressingModes.h
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h
@@ -1,4 +1,4 @@
-//===- ARM64AddressingModes.h - ARM64 Addressing Modes ----------*- C++ -*-===//
+//===- AArch64AddressingModes.h - AArch64 Addressing Modes ------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,12 +7,12 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This file contains the ARM64 addressing mode implementation stuff.
+// This file contains the AArch64 addressing mode implementation stuff.
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_TARGET_ARM64_ARM64ADDRESSINGMODES_H
-#define LLVM_TARGET_ARM64_ARM64ADDRESSINGMODES_H
+#ifndef LLVM_TARGET_AArch64_AArch64ADDRESSINGMODES_H
+#define LLVM_TARGET_AArch64_AArch64ADDRESSINGMODES_H
 
 #include "llvm/ADT/APFloat.h"
 #include "llvm/ADT/APInt.h"
@@ -22,38 +22,63 @@
 
 namespace llvm {
 
-/// ARM64_AM - ARM64 Addressing Mode Stuff
-namespace ARM64_AM {
+/// AArch64_AM - AArch64 Addressing Mode Stuff
+namespace AArch64_AM {
 
 //===----------------------------------------------------------------------===//
 // Shifts
 //
 
-enum ShiftType {
-  InvalidShift = -1,
+enum ShiftExtendType {
+  InvalidShiftExtend = -1,
   LSL = 0,
-  LSR = 1,
-  ASR = 2,
-  ROR = 3,
-  MSL = 4
+  LSR,
+  ASR,
+  ROR,
+  MSL,
+
+  UXTB,
+  UXTH,
+  UXTW,
+  UXTX,
+
+  SXTB,
+  SXTH,
+  SXTW,
+  SXTX,
 };
 
 /// getShiftName - Get the string encoding for the shift type.
-static inline const char *getShiftName(ARM64_AM::ShiftType ST) {
+static inline const char *getShiftExtendName(AArch64_AM::ShiftExtendType ST) {
   switch (ST) {
   default: assert(false && "unhandled shift type!");
-  case ARM64_AM::LSL: return "lsl";
-  case ARM64_AM::LSR: return "lsr";
-  case ARM64_AM::ASR: return "asr";
-  case ARM64_AM::ROR: return "ror";
-  case ARM64_AM::MSL: return "msl";
+  case AArch64_AM::LSL: return "lsl";
+  case AArch64_AM::LSR: return "lsr";
+  case AArch64_AM::ASR: return "asr";
+  case AArch64_AM::ROR: return "ror";
+  case AArch64_AM::MSL: return "msl";
+  case AArch64_AM::UXTB: return "uxtb";
+  case AArch64_AM::UXTH: return "uxth";
+  case AArch64_AM::UXTW: return "uxtw";
+  case AArch64_AM::UXTX: return "uxtx";
+  case AArch64_AM::SXTB: return "sxtb";
+  case AArch64_AM::SXTH: return "sxth";
+  case AArch64_AM::SXTW: return "sxtw";
+  case AArch64_AM::SXTX: return "sxtx";
   }
-  return 0;
+  return nullptr;
 }
 
 /// getShiftType - Extract the shift type.
-static inline ARM64_AM::ShiftType getShiftType(unsigned Imm) {
-  return ARM64_AM::ShiftType((Imm >> 6) & 0x7);
+static inline AArch64_AM::ShiftExtendType getShiftType(unsigned Imm) {
+  switch ((Imm >> 6) & 0x7) {
+  default: return AArch64_AM::InvalidShiftExtend;
+  case 0: return AArch64_AM::LSL;
+  case 1: return AArch64_AM::LSR;
+  case 2: return AArch64_AM::ASR;
+  case 3: return AArch64_AM::ROR;
+  case 4: return AArch64_AM::MSL;
+  }
 }
 
 /// getShiftValue - Extract the shift value.
@@ -70,56 +95,51 @@ static inline unsigned getShiftValue(unsigned Imm) {
 ///            100 ==> msl
 ///   {8-6}  = shifter
 ///   {5-0}  = imm
-static inline unsigned getShifterImm(ARM64_AM::ShiftType ST, unsigned Imm) {
+static inline unsigned getShifterImm(AArch64_AM::ShiftExtendType ST,
+                                     unsigned Imm) {
   assert((Imm & 0x3f) == Imm && "Illegal shifted immedate value!");
-  return (unsigned(ST) << 6) | (Imm & 0x3f);
+  unsigned STEnc = 0;
+  switch (ST) {
+  default:  llvm_unreachable("Invalid shift requested");
+  case AArch64_AM::LSL: STEnc = 0; break;
+  case AArch64_AM::LSR: STEnc = 1; break;
+  case AArch64_AM::ASR: STEnc = 2; break;
+  case AArch64_AM::ROR: STEnc = 3; break;
+  case AArch64_AM::MSL: STEnc = 4; break;
+  }
+  return (STEnc << 6) | (Imm & 0x3f);
 }
 
 //===----------------------------------------------------------------------===//
 // Extends
 //
 
-enum ExtendType {
-  InvalidExtend = -1,
-  UXTB = 0,
-  UXTH = 1,
-  UXTW = 2,
-  UXTX = 3,
-  SXTB = 4,
-  SXTH = 5,
-  SXTW = 6,
-  SXTX = 7
-};
-
-/// getExtendName - Get the string encoding for the extend type.
-static inline const char *getExtendName(ARM64_AM::ExtendType ET) {
-  switch (ET) {
-  default: assert(false && "unhandled extend type!");
-  case ARM64_AM::UXTB: return "uxtb";
-  case ARM64_AM::UXTH: return "uxth";
-  case ARM64_AM::UXTW: return "uxtw";
-  case ARM64_AM::UXTX: return "uxtx";
-  case ARM64_AM::SXTB: return "sxtb";
-  case ARM64_AM::SXTH: return "sxth";
-  case ARM64_AM::SXTW: return "sxtw";
-  case ARM64_AM::SXTX: return "sxtx";
-  }
-  return 0;
-}
-
 /// getArithShiftValue - get the arithmetic shift value.
 static inline unsigned getArithShiftValue(unsigned Imm) {
   return Imm & 0x7;
 }
 
 /// getExtendType - Extract the extend type for operands of arithmetic ops.
-static inline ARM64_AM::ExtendType getArithExtendType(unsigned Imm) {
-  return ARM64_AM::ExtendType((Imm >> 3) & 0x7);
+static inline AArch64_AM::ShiftExtendType getExtendType(unsigned Imm) {
+  assert((Imm & 0x7) == Imm && "invalid immediate!");
+  switch (Imm) {
+  default: llvm_unreachable("Compiler bug!");
+  case 0: return AArch64_AM::UXTB;
+  case 1: return AArch64_AM::UXTH;
+  case 2: return AArch64_AM::UXTW;
+  case 3: return AArch64_AM::UXTX;
+  case 4: return AArch64_AM::SXTB;
+  case 5: return AArch64_AM::SXTH;
+  case 6: return AArch64_AM::SXTW;
+  case 7: return AArch64_AM::SXTX;
+  }
 }
 
-/// getArithExtendImm - Encode the extend type and shift amount for an
-///                     arithmetic instruction:
-///   imm:     3-bit extend amount
+static inline AArch64_AM::ShiftExtendType getArithExtendType(unsigned Imm) {
+  return getExtendType((Imm >> 3) & 0x7);
+}
+
+/// Mapping from extend bits to required operation:
 ///   shifter: 000 ==> uxtb
 ///            001 ==> uxth
 ///            010 ==> uxtw
@@ -128,12 +148,29 @@ static inline ARM64_AM::ExtendType getArithExtendType(unsigned Imm) {
 ///            101 ==> sxth
 ///            110 ==> sxtw
 ///            111 ==> sxtx
+inline unsigned getExtendEncoding(AArch64_AM::ShiftExtendType ET) {
+  switch (ET) {
+  default: llvm_unreachable("Invalid extend type requested");
+  case AArch64_AM::UXTB: return 0; break;
+  case AArch64_AM::UXTH: return 1; break;
+  case AArch64_AM::UXTW: return 2; break;
+  case AArch64_AM::UXTX: return 3; break;
+  case AArch64_AM::SXTB: return 4; break;
+  case AArch64_AM::SXTH: return 5; break;
+  case AArch64_AM::SXTW: return 6; break;
+  case AArch64_AM::SXTX: return 7; break;
+  }
+}
+
+/// getArithExtendImm - Encode the extend type and shift amount for an
+///                     arithmetic instruction:
+///   imm:     3-bit extend amount
 ///   {5-3}  = shifter
 ///   {2-0}  = imm3
-static inline unsigned getArithExtendImm(ARM64_AM::ExtendType ET,
+static inline unsigned getArithExtendImm(AArch64_AM::ShiftExtendType ET,
                                          unsigned Imm) {
   assert((Imm & 0x7) == Imm && "Illegal shifted immedate value!");
-  return (unsigned(ET) << 3) | (Imm & 0x7);
+  return (getExtendEncoding(ET) << 3) | (Imm & 0x7);
 }
 
 /// getMemDoShift - Extract the "do shift" flag value for load/store
@@ -144,8 +181,8 @@ static inline bool getMemDoShift(unsigned Imm) {
 
 /// getExtendType - Extract the extend type for the offset operand of
 /// loads/stores.
-static inline ARM64_AM::ExtendType getMemExtendType(unsigned Imm) {
-  return ARM64_AM::ExtendType((Imm >> 1) & 0x7);
+static inline AArch64_AM::ShiftExtendType getMemExtendType(unsigned Imm) {
+  return getExtendType((Imm >> 1) & 0x7);
 }
 
 /// getExtendImm - Encode the extend type and amount for a load/store inst:
@@ -160,66 +197,9 @@ static inline ARM64_AM::ExtendType getMemExtendType(unsigned Imm) {
 ///            111 ==> sxtx
 ///   {3-1}  = shifter
 ///   {0}  = doshift
-static inline unsigned getMemExtendImm(ARM64_AM::ExtendType ET, bool DoShift) {
-  return (unsigned(ET) << 1) | unsigned(DoShift);
-}
-
-//===----------------------------------------------------------------------===//
-// Prefetch
-//
-
-/// Pre-fetch operator names.
-/// The enum values match the encoding values:
-///   prfop<4:3> 00=preload data, 10=prepare for store
-///   prfop<2:1> 00=target L1 cache, 01=target L2 cache, 10=target L3 cache,
-///   prfop<0> 0=non-streaming (temporal), 1=streaming (non-temporal)
-enum PrefetchOp {
-  InvalidPrefetchOp = -1,
-  PLDL1KEEP = 0x00,
-  PLDL1STRM = 0x01,
-  PLDL2KEEP = 0x02,
-  PLDL2STRM = 0x03,
-  PLDL3KEEP = 0x04,
-  PLDL3STRM = 0x05,
-  PSTL1KEEP = 0x10,
-  PSTL1STRM = 0x11,
-  PSTL2KEEP = 0x12,
-  PSTL2STRM = 0x13,
-  PSTL3KEEP = 0x14,
-  PSTL3STRM = 0x15
-};
-
-/// isNamedPrefetchOp - Check if the prefetch-op 5-bit value has a name.
-static inline bool isNamedPrefetchOp(unsigned prfop) {
-  switch (prfop) {
-  default: return false;
-  case ARM64_AM::PLDL1KEEP: case ARM64_AM::PLDL1STRM: case ARM64_AM::PLDL2KEEP:
-  case ARM64_AM::PLDL2STRM: case ARM64_AM::PLDL3KEEP: case ARM64_AM::PLDL3STRM:
-  case ARM64_AM::PSTL1KEEP: case ARM64_AM::PSTL1STRM: case ARM64_AM::PSTL2KEEP:
-  case ARM64_AM::PSTL2STRM: case ARM64_AM::PSTL3KEEP: case ARM64_AM::PSTL3STRM:
-    return true;
-  }
-}
-
-
-/// getPrefetchOpName - Get the string encoding for the prefetch operator.
-static inline const char *getPrefetchOpName(ARM64_AM::PrefetchOp prfop) {
-  switch (prfop) {
-  default: assert(false && "unhandled prefetch-op type!");
-  case ARM64_AM::PLDL1KEEP: return "pldl1keep";
-  case ARM64_AM::PLDL1STRM: return "pldl1strm";
-  case ARM64_AM::PLDL2KEEP: return "pldl2keep";
-  case ARM64_AM::PLDL2STRM: return "pldl2strm";
-  case ARM64_AM::PLDL3KEEP: return "pldl3keep";
-  case ARM64_AM::PLDL3STRM: return "pldl3strm";
-  case ARM64_AM::PSTL1KEEP: return "pstl1keep";
-  case ARM64_AM::PSTL1STRM: return "pstl1strm";
-  case ARM64_AM::PSTL2KEEP: return "pstl2keep";
-  case ARM64_AM::PSTL2STRM: return "pstl2strm";
-  case ARM64_AM::PSTL3KEEP: return "pstl3keep";
-  case ARM64_AM::PSTL3STRM: return "pstl3strm";
-  }
-  return 0;
+static inline unsigned getMemExtendImm(AArch64_AM::ShiftExtendType ET,
+                                       bool DoShift) {
+  return (getExtendEncoding(ET) << 1) | unsigned(DoShift);
 }
 
 static inline uint64_t ror(uint64_t elt, unsigned size) {
@@ -751,7 +731,7 @@ static inline uint64_t decodeAdvSIMDModImmType12(uint8_t Imm) {
   return (EncVal << 32) | EncVal;
 }
 
-} // end namespace ARM64_AM
+} // end namespace AArch64_AM
 
 } // end namespace llvm
 
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
index f1452ab..d8900d4 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
@@ -6,168 +6,57 @@
 // License. See LICENSE.TXT for details.
 //
 //===----------------------------------------------------------------------===//
-//
-// This file contains the AArch64 implementation of the MCAsmBackend class,
-// which is principally concerned with relaxation of the various fixup kinds.
-//
-//===----------------------------------------------------------------------===//
 
+#include "AArch64.h"
+#include "AArch64RegisterInfo.h"
 #include "MCTargetDesc/AArch64FixupKinds.h"
-#include "MCTargetDesc/AArch64MCTargetDesc.h"
+#include "llvm/ADT/Triple.h"
 #include "llvm/MC/MCAsmBackend.h"
-#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCDirectives.h"
 #include "llvm/MC/MCFixupKindInfo.h"
 #include "llvm/MC/MCObjectWriter.h"
-#include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/Support/ELF.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCSectionELF.h"
 #include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/MachO.h"
 using namespace llvm;
 
 namespace {
-class AArch64AsmBackend : public MCAsmBackend {
-  const MCSubtargetInfo* STI;
-public:
-  AArch64AsmBackend(const Target &T, const StringRef TT)
-    : MCAsmBackend(),
-      STI(AArch64_MC::createAArch64MCSubtargetInfo(TT, "", ""))
-    {}
-
-
-  ~AArch64AsmBackend() {
-    delete STI;
-  }
-
-  bool writeNopData(uint64_t Count, MCObjectWriter *OW) const;
-
-  virtual void processFixupValue(const MCAssembler &Asm,
-                                 const MCAsmLayout &Layout,
-                                 const MCFixup &Fixup, const MCFragment *DF,
-                                 const MCValue &Target, uint64_t &Value,
-                                 bool &IsResolved);
-};
-} // end anonymous namespace
-
-void AArch64AsmBackend::processFixupValue(const MCAssembler &Asm,
-                                          const MCAsmLayout &Layout,
-                                          const MCFixup &Fixup,
-                                          const MCFragment *DF,
-                                          const MCValue &Target,
-                                          uint64_t &Value, bool &IsResolved) {
-  // The ADRP instruction adds some multiple of 0x1000 to the current PC &
-  // ~0xfff. This means that the required offset to reach a symbol can vary by
-  // up to one step depending on where the ADRP is in memory. For example:
-  //
-  //     ADRP x0, there
-  //  there:
-  //
-  // If the ADRP occurs at address 0xffc then "there" will be at 0x1000 and
-  // we'll need that as an offset. At any other address "there" will be in the
-  // same page as the ADRP and the instruction should encode 0x0. Assuming the
-  // section isn't 0x1000-aligned, we therefore need to delegate this decision
-  // to the linker -- a relocation!
-  if ((uint32_t)Fixup.getKind() == AArch64::fixup_a64_adr_prel_page ||
-      (uint32_t)Fixup.getKind() == AArch64::fixup_a64_adr_prel_got_page ||
-      (uint32_t)Fixup.getKind() == AArch64::fixup_a64_adr_gottprel_page ||
-      (uint32_t)Fixup.getKind() == AArch64::fixup_a64_tlsdesc_adr_page)
-    IsResolved = false;
-}
-
 
-static uint64_t adjustFixupValue(unsigned Kind, uint64_t Value);
-
-namespace {
+class AArch64AsmBackend : public MCAsmBackend {
+  static const unsigned PCRelFlagVal =
+      MCFixupKindInfo::FKF_IsAlignedDownTo32Bits | MCFixupKindInfo::FKF_IsPCRel;
 
-class ELFAArch64AsmBackend : public AArch64AsmBackend {
-  uint8_t OSABI;
-  bool IsLittle; // Big or little endian
 public:
-  ELFAArch64AsmBackend(const Target &T, const StringRef TT,
-                       uint8_t _OSABI, bool isLittle)
-    : AArch64AsmBackend(T, TT), OSABI(_OSABI), IsLittle(isLittle) { }
+  AArch64AsmBackend(const Target &T) : MCAsmBackend() {}
 
-  bool fixupNeedsRelaxation(const MCFixup &Fixup,
-                            uint64_t Value,
-                            const MCRelaxableFragment *DF,
-                            const MCAsmLayout &Layout) const;
-
-  unsigned int getNumFixupKinds() const {
+  unsigned getNumFixupKinds() const override {
     return AArch64::NumTargetFixupKinds;
   }
 
-  const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const {
+  const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override {
     const static MCFixupKindInfo Infos[AArch64::NumTargetFixupKinds] = {
-// This table *must* be in the order that the fixup_* kinds are defined in
-// AArch64FixupKinds.h.
-//
-// Name                   Offset (bits)    Size (bits)    Flags
-{ "fixup_a64_ld_prel",               0,    32, MCFixupKindInfo::FKF_IsPCRel },
-{ "fixup_a64_adr_prel",              0,    32, MCFixupKindInfo::FKF_IsPCRel },
-{ "fixup_a64_adr_prel_page",         0,    32, MCFixupKindInfo::FKF_IsPCRel },
-{ "fixup_a64_add_lo12",              0,    32,             0 },
-{ "fixup_a64_ldst8_lo12",            0,    32,             0 },
-{ "fixup_a64_ldst16_lo12",           0,    32,             0 },
-{ "fixup_a64_ldst32_lo12",           0,    32,             0 },
-{ "fixup_a64_ldst64_lo12",           0,    32,             0 },
-{ "fixup_a64_ldst128_lo12",          0,    32,             0 },
-{ "fixup_a64_tstbr",                 0,    32, MCFixupKindInfo::FKF_IsPCRel },
-{ "fixup_a64_condbr",                0,    32, MCFixupKindInfo::FKF_IsPCRel },
-{ "fixup_a64_uncondbr",              0,    32, MCFixupKindInfo::FKF_IsPCRel },
-{ "fixup_a64_call",                  0,    32, MCFixupKindInfo::FKF_IsPCRel },
-{ "fixup_a64_movw_uabs_g0",          0,    32,             0 },
-{ "fixup_a64_movw_uabs_g0_nc",       0,    32,             0 },
-{ "fixup_a64_movw_uabs_g1",          0,    32,             0 },
-{ "fixup_a64_movw_uabs_g1_nc",       0,    32,             0 },
-{ "fixup_a64_movw_uabs_g2",          0,    32,             0 },
-{ "fixup_a64_movw_uabs_g2_nc",       0,    32,             0 },
-{ "fixup_a64_movw_uabs_g3",          0,    32,             0 },
-{ "fixup_a64_movw_sabs_g0",          0,    32,             0 },
-{ "fixup_a64_movw_sabs_g1",          0,    32,             0 },
-{ "fixup_a64_movw_sabs_g2",          0,    32,             0 },
-{ "fixup_a64_adr_prel_got_page",     0,    32, MCFixupKindInfo::FKF_IsPCRel },
-{ "fixup_a64_ld64_got_lo12_nc",      0,    32,             0 },
-{ "fixup_a64_movw_dtprel_g2",        0,    32,             0 },
-{ "fixup_a64_movw_dtprel_g1",        0,    32,             0 },
-{ "fixup_a64_movw_dtprel_g1_nc",     0,    32,             0 },
-{ "fixup_a64_movw_dtprel_g0",        0,    32,             0 },
-{ "fixup_a64_movw_dtprel_g0_nc",     0,    32,             0 },
-{ "fixup_a64_add_dtprel_hi12",       0,    32,             0 },
-{ "fixup_a64_add_dtprel_lo12",       0,    32,             0 },
-{ "fixup_a64_add_dtprel_lo12_nc",    0,    32,             0 },
-{ "fixup_a64_ldst8_dtprel_lo12",     0,    32,             0 },
-{ "fixup_a64_ldst8_dtprel_lo12_nc",  0,    32,             0 },
-{ "fixup_a64_ldst16_dtprel_lo12",    0,    32,             0 },
-{ "fixup_a64_ldst16_dtprel_lo12_nc", 0,    32,             0 },
-{ "fixup_a64_ldst32_dtprel_lo12",    0,    32,             0 },
-{ "fixup_a64_ldst32_dtprel_lo12_nc", 0,    32,             0 },
-{ "fixup_a64_ldst64_dtprel_lo12",    0,    32,             0 },
-{ "fixup_a64_ldst64_dtprel_lo12_nc", 0,    32,             0 },
-{ "fixup_a64_movw_gottprel_g1",      0,    32,             0 },
-{ "fixup_a64_movw_gottprel_g0_nc",   0,    32,             0 },
-{ "fixup_a64_adr_gottprel_page",     0,    32, MCFixupKindInfo::FKF_IsPCRel },
-{ "fixup_a64_ld64_gottprel_lo12_nc", 0,    32,             0 },
-{ "fixup_a64_ld_gottprel_prel19",    0,    32, MCFixupKindInfo::FKF_IsPCRel },
-{ "fixup_a64_movw_tprel_g2",         0,    32,             0 },
-{ "fixup_a64_movw_tprel_g1",         0,    32,             0 },
-{ "fixup_a64_movw_tprel_g1_nc",      0,    32,             0 },
-{ "fixup_a64_movw_tprel_g0",         0,    32,             0 },
-{ "fixup_a64_movw_tprel_g0_nc",      0,    32,             0 },
-{ "fixup_a64_add_tprel_hi12",        0,    32,             0 },
-{ "fixup_a64_add_tprel_lo12",        0,    32,             0 },
-{ "fixup_a64_add_tprel_lo12_nc",     0,    32,             0 },
-{ "fixup_a64_ldst8_tprel_lo12",      0,    32,             0 },
-{ "fixup_a64_ldst8_tprel_lo12_nc",   0,    32,             0 },
-{ "fixup_a64_ldst16_tprel_lo12",     0,    32,             0 },
-{ "fixup_a64_ldst16_tprel_lo12_nc",  0,    32,             0 },
-{ "fixup_a64_ldst32_tprel_lo12",     0,    32,             0 },
-{ "fixup_a64_ldst32_tprel_lo12_nc",  0,    32,             0 },
-{ "fixup_a64_ldst64_tprel_lo12",     0,    32,             0 },
-{ "fixup_a64_ldst64_tprel_lo12_nc",  0,    32,             0 },
-{ "fixup_a64_tlsdesc_adr_page",      0,    32, MCFixupKindInfo::FKF_IsPCRel },
-{ "fixup_a64_tlsdesc_ld64_lo12_nc",  0,    32,             0 },
-{ "fixup_a64_tlsdesc_add_lo12_nc",   0,    32,             0 },
-{ "fixup_a64_tlsdesc_call",          0,     0,             0 }
+      // This table *must* be in the order that the fixup_* kinds are defined in
+      // AArch64FixupKinds.h.
+      //
+      // Name                           Offset (bits) Size (bits)     Flags
+      { "fixup_aarch64_pcrel_adr_imm21", 0, 32, PCRelFlagVal },
+      { "fixup_aarch64_pcrel_adrp_imm21", 0, 32, PCRelFlagVal },
+      { "fixup_aarch64_add_imm12", 10, 12, 0 },
+      { "fixup_aarch64_ldst_imm12_scale1", 10, 12, 0 },
+      { "fixup_aarch64_ldst_imm12_scale2", 10, 12, 0 },
+      { "fixup_aarch64_ldst_imm12_scale4", 10, 12, 0 },
+      { "fixup_aarch64_ldst_imm12_scale8", 10, 12, 0 },
+      { "fixup_aarch64_ldst_imm12_scale16", 10, 12, 0 },
+      { "fixup_aarch64_ldr_pcrel_imm19", 5, 19, PCRelFlagVal },
+      { "fixup_aarch64_movw", 5, 16, 0 },
+      { "fixup_aarch64_pcrel_branch14", 5, 14, PCRelFlagVal },
+      { "fixup_aarch64_pcrel_branch19", 5, 19, PCRelFlagVal },
+      { "fixup_aarch64_pcrel_branch26", 0, 26, PCRelFlagVal },
+      { "fixup_aarch64_pcrel_call26", 0, 26, PCRelFlagVal },
+      { "fixup_aarch64_tlsdesc_call", 0, 0, 0 }
     };
+
     if (Kind < FirstTargetFixupKind)
       return MCAsmBackend::getFixupKindInfo(Kind);
 
@@ -177,417 +66,501 @@ public:
   }
 
   void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
-                  uint64_t Value, bool IsPCRel) const {
-    unsigned NumBytes = getFixupKindInfo(Fixup.getKind()).TargetSize / 8;
-    Value = adjustFixupValue(Fixup.getKind(), Value);
-    if (!Value) return;           // Doesn't change encoding.
-
-    unsigned Offset = Fixup.getOffset();
-    assert(Offset + NumBytes <= DataSize && "Invalid fixup offset!");
-
-    // For each byte of the fragment that the fixup touches, mask in the bits
-    // from the fixup value.
-    for (unsigned i = 0; i != NumBytes; ++i) {
-      Data[Offset + i] |= uint8_t((Value >> (i * 8)) & 0xff);
-    }
-  }
+                  uint64_t Value, bool IsPCRel) const override;
 
-  bool mayNeedRelaxation(const MCInst&) const {
-    return false;
-  }
+  bool mayNeedRelaxation(const MCInst &Inst) const override;
+  bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
+                            const MCRelaxableFragment *DF,
+                            const MCAsmLayout &Layout) const override;
+  void relaxInstruction(const MCInst &Inst, MCInst &Res) const override;
+  bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override;
 
-  void relaxInstruction(const MCInst&, llvm::MCInst&) const {
-    llvm_unreachable("Cannot relax instructions");
-  }
+  void HandleAssemblerFlag(MCAssemblerFlag Flag) {}
 
-  MCObjectWriter *createObjectWriter(raw_ostream &OS) const {
-    return createAArch64ELFObjectWriter(OS, OSABI, IsLittle);
-  }
+  unsigned getPointerSize() const { return 8; }
 };
 
 } // end anonymous namespace
 
-bool
-ELFAArch64AsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup,
-                                           uint64_t Value,
-                                           const MCRelaxableFragment *DF,
-                                           const MCAsmLayout &Layout) const {
-  // Correct for now. With all instructions 32-bit only very low-level
-  // considerations could make you select something which may fail.
-  return false;
-}
+/// \brief The number of bytes the fixup may change.
+static unsigned getFixupKindNumBytes(unsigned Kind) {
+  switch (Kind) {
+  default:
+    assert(0 && "Unknown fixup kind!");
 
+  case AArch64::fixup_aarch64_tlsdesc_call:
+    return 0;
 
-bool AArch64AsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
-  // Can't emit NOP with size not multiple of 32-bits
-  if (Count % 4 != 0)
-    return false;
+  case FK_Data_1:
+    return 1;
 
-  uint64_t NumNops = Count / 4;
-  for (uint64_t i = 0; i != NumNops; ++i)
-    OW->Write32(0xd503201f);
+  case FK_Data_2:
+  case AArch64::fixup_aarch64_movw:
+    return 2;
+
+  case AArch64::fixup_aarch64_pcrel_branch14:
+  case AArch64::fixup_aarch64_add_imm12:
+  case AArch64::fixup_aarch64_ldst_imm12_scale1:
+  case AArch64::fixup_aarch64_ldst_imm12_scale2:
+  case AArch64::fixup_aarch64_ldst_imm12_scale4:
+  case AArch64::fixup_aarch64_ldst_imm12_scale8:
+  case AArch64::fixup_aarch64_ldst_imm12_scale16:
+  case AArch64::fixup_aarch64_ldr_pcrel_imm19:
+  case AArch64::fixup_aarch64_pcrel_branch19:
+    return 3;
+
+  case AArch64::fixup_aarch64_pcrel_adr_imm21:
+  case AArch64::fixup_aarch64_pcrel_adrp_imm21:
+  case AArch64::fixup_aarch64_pcrel_branch26:
+  case AArch64::fixup_aarch64_pcrel_call26:
+  case FK_Data_4:
+    return 4;
 
-  return true;
+  case FK_Data_8:
+    return 8;
+  }
 }
 
-static unsigned ADRImmBits(unsigned Value) {
+static unsigned AdrImmBits(unsigned Value) {
   unsigned lo2 = Value & 0x3;
-  unsigned hi19 = (Value & 0x1fffff) >> 2;
-
+  unsigned hi19 = (Value & 0x1ffffc) >> 2;
   return (hi19 << 5) | (lo2 << 29);
 }
 
 static uint64_t adjustFixupValue(unsigned Kind, uint64_t Value) {
+  int64_t SignedValue = static_cast<int64_t>(Value);
   switch (Kind) {
   default:
-    llvm_unreachable("Unknown fixup kind!");
-  case FK_Data_2:
-    assert((int64_t)Value >= -32768 &&
-           (int64_t)Value <= 65536 &&
-           "Out of range ABS16 fixup");
+    assert(false && "Unknown fixup kind!");
+  case AArch64::fixup_aarch64_pcrel_adr_imm21:
+    if (SignedValue > 2097151 || SignedValue < -2097152)
+      report_fatal_error("fixup value out of range");
+    return AdrImmBits(Value & 0x1fffffULL);
+  case AArch64::fixup_aarch64_pcrel_adrp_imm21:
+    return AdrImmBits((Value & 0x1fffff000ULL) >> 12);
+  case AArch64::fixup_aarch64_ldr_pcrel_imm19:
+  case AArch64::fixup_aarch64_pcrel_branch19:
+    // Signed 21-bit immediate
+    if (SignedValue > 2097151 || SignedValue < -2097152)
+      report_fatal_error("fixup value out of range");
+    // Low two bits are not encoded.
+    return (Value >> 2) & 0x7ffff;
+  case AArch64::fixup_aarch64_add_imm12:
+  case AArch64::fixup_aarch64_ldst_imm12_scale1:
+    // Unsigned 12-bit immediate
+    if (Value >= 0x1000)
+      report_fatal_error("invalid imm12 fixup value");
     return Value;
-  case FK_Data_4:
-    assert((int64_t)Value >= -(1LL << 31) &&
-           (int64_t)Value <= (1LL << 32) - 1 &&
-           "Out of range ABS32 fixup");
+  case AArch64::fixup_aarch64_ldst_imm12_scale2:
+    // Unsigned 12-bit immediate which gets multiplied by 2
+    if (Value & 1 || Value >= 0x2000)
+      report_fatal_error("invalid imm12 fixup value");
+    return Value >> 1;
+  case AArch64::fixup_aarch64_ldst_imm12_scale4:
+    // Unsigned 12-bit immediate which gets multiplied by 4
+    if (Value & 3 || Value >= 0x4000)
+      report_fatal_error("invalid imm12 fixup value");
+    return Value >> 2;
+  case AArch64::fixup_aarch64_ldst_imm12_scale8:
+    // Unsigned 12-bit immediate which gets multiplied by 8
+    if (Value & 7 || Value >= 0x8000)
+      report_fatal_error("invalid imm12 fixup value");
+    return Value >> 3;
+  case AArch64::fixup_aarch64_ldst_imm12_scale16:
+    // Unsigned 12-bit immediate which gets multiplied by 16
+    if (Value & 15 || Value >= 0x10000)
+      report_fatal_error("invalid imm12 fixup value");
+    return Value >> 4;
+  case AArch64::fixup_aarch64_movw:
+    report_fatal_error("no resolvable MOVZ/MOVK fixups supported yet");
     return Value;
+  case AArch64::fixup_aarch64_pcrel_branch14:
+    // Signed 16-bit immediate
+    if (SignedValue > 32767 || SignedValue < -32768)
+      report_fatal_error("fixup value out of range");
+    // Low two bits are not encoded (4-byte alignment assumed).
+    if (Value & 0x3)
+      report_fatal_error("fixup not sufficiently aligned");
+    return (Value >> 2) & 0x3fff;
+  case AArch64::fixup_aarch64_pcrel_branch26:
+  case AArch64::fixup_aarch64_pcrel_call26:
+    // Signed 28-bit immediate
+    if (SignedValue > 134217727 || SignedValue < -134217728)
+      report_fatal_error("fixup value out of range");
+    // Low two bits are not encoded (4-byte alignment assumed).
+    if (Value & 0x3)
+      report_fatal_error("fixup not sufficiently aligned");
+    return (Value >> 2) & 0x3ffffff;
+  case FK_Data_1:
+  case FK_Data_2:
+  case FK_Data_4:
   case FK_Data_8:
     return Value;
+  }
+}
 
-  case AArch64::fixup_a64_ld_gottprel_prel19:
-    // R_AARCH64_LD_GOTTPREL_PREL19: Set a load-literal immediate to bits 1F
-    // FFFC of G(TPREL(S+A)) - P; check -2^20 <= X < 2^20.
-  case AArch64::fixup_a64_ld_prel:
-    // R_AARCH64_LD_PREL_LO19: Sets a load-literal (immediate) value to bits
-    // 1F FFFC of S+A-P, checking that -2^20 <= S+A-P < 2^20.
-    assert((int64_t)Value >= -(1LL << 20) &&
-           (int64_t)Value < (1LL << 20) && "Out of range LDR (lit) fixup");
-    return (Value & 0x1ffffc) << 3;
-
-  case AArch64::fixup_a64_adr_prel:
-    // R_AARCH64_ADR_PREL_LO21: Sets an ADR immediate value to bits 1F FFFF of
-    // the result of S+A-P, checking that -2^20 <= S+A-P < 2^20.
-    assert((int64_t)Value >= -(1LL << 20) &&
-           (int64_t)Value < (1LL << 20) && "Out of range ADR fixup");
-    return ADRImmBits(Value & 0x1fffff);
-
-  case AArch64::fixup_a64_adr_prel_page:
-    // R_AARCH64_ADR_PREL_PG_HI21: Sets an ADRP immediate value to bits 1 FFFF
-    // F000 of the result of the operation, checking that -2^32 <= result <
-    // 2^32.
-    assert((int64_t)Value >= -(1LL << 32) &&
-           (int64_t)Value < (1LL << 32) && "Out of range ADRP fixup");
-    return ADRImmBits((Value & 0x1fffff000ULL) >> 12);
-
-  case AArch64::fixup_a64_add_dtprel_hi12:
-    // R_AARCH64_TLSLD_ADD_DTPREL_LO12: Set an ADD immediate field to bits
-    // FF F000 of DTPREL(S+A), check 0 <= X < 2^24.
-  case AArch64::fixup_a64_add_tprel_hi12:
-    // R_AARCH64_TLSLD_ADD_TPREL_LO12: Set an ADD immediate field to bits
-    // FF F000 of TPREL(S+A), check 0 <= X < 2^24.
-    assert((int64_t)Value >= 0 &&
-           (int64_t)Value < (1LL << 24) && "Out of range ADD fixup");
-    return (Value & 0xfff000) >> 2;
-
-  case AArch64::fixup_a64_add_dtprel_lo12:
-    // R_AARCH64_TLSLD_ADD_DTPREL_LO12: Set an ADD immediate field to bits
-    // FFF of DTPREL(S+A), check 0 <= X < 2^12.
-  case AArch64::fixup_a64_add_tprel_lo12:
-    // R_AARCH64_TLSLD_ADD_TPREL_LO12: Set an ADD immediate field to bits
-    // FFF of TPREL(S+A), check 0 <= X < 2^12.
-    assert((int64_t)Value >= 0 &&
-           (int64_t)Value < (1LL << 12) && "Out of range ADD fixup");
-    // ... fallthrough to no-checking versions ...
-  case AArch64::fixup_a64_add_dtprel_lo12_nc:
-    // R_AARCH64_TLSLD_ADD_DTPREL_LO12_NC: Set an ADD immediate field to bits
-    // FFF of DTPREL(S+A) with no overflow check.
-  case AArch64::fixup_a64_add_tprel_lo12_nc:
-    // R_AARCH64_TLSLD_ADD_TPREL_LO12_NC: Set an ADD immediate field to bits
-    // FFF of TPREL(S+A) with no overflow check.
-  case AArch64::fixup_a64_tlsdesc_add_lo12_nc:
-    // R_AARCH64_TLSDESC_ADD_LO12_NC: Set an ADD immediate field to bits
-    // FFF of G(TLSDESC(S+A)), with no overflow check.
-  case AArch64::fixup_a64_add_lo12:
-    // R_AARCH64_ADD_ABS_LO12_NC: Sets an ADD immediate value to bits FFF of
-    // S+A, with no overflow check.
-    return (Value & 0xfff) << 10;
-
-  case AArch64::fixup_a64_ldst8_dtprel_lo12:
-    // R_AARCH64_TLSLD_LDST8_DTPREL_LO12: Set an LD/ST offset field to bits FFF
-    // of DTPREL(S+A), check 0 <= X < 2^12.
-  case AArch64::fixup_a64_ldst8_tprel_lo12:
-    // R_AARCH64_TLSLE_LDST8_TPREL_LO12: Set an LD/ST offset field to bits FFF
-    // of DTPREL(S+A), check 0 <= X < 2^12.
-    assert((int64_t) Value >= 0 &&
-           (int64_t) Value < (1LL << 12) && "Out of range LD/ST fixup");
-    // ... fallthrough to no-checking versions ...
-  case AArch64::fixup_a64_ldst8_dtprel_lo12_nc:
-    // R_AARCH64_TLSLD_LDST8_DTPREL_LO12: Set an LD/ST offset field to bits FFF
-    // of DTPREL(S+A), with no overflow check.
-  case AArch64::fixup_a64_ldst8_tprel_lo12_nc:
-    // R_AARCH64_TLSLD_LDST8_TPREL_LO12: Set an LD/ST offset field to bits FFF
-    // of TPREL(S+A), with no overflow check.
-  case AArch64::fixup_a64_ldst8_lo12:
-    // R_AARCH64_LDST8_ABS_LO12_NC: Sets an LD/ST immediate value to bits FFF
-    // of S+A, with no overflow check.
-    return (Value & 0xfff) << 10;
-
-  case AArch64::fixup_a64_ldst16_dtprel_lo12:
-    // R_AARCH64_TLSLD_LDST16_DTPREL_LO12: Set an LD/ST offset field to bits FFE
-    // of DTPREL(S+A), check 0 <= X < 2^12.
-  case AArch64::fixup_a64_ldst16_tprel_lo12:
-    // R_AARCH64_TLSLE_LDST16_TPREL_LO12: Set an LD/ST offset field to bits FFE
-    // of DTPREL(S+A), check 0 <= X < 2^12.
-    assert((int64_t) Value >= 0 &&
-           (int64_t) Value < (1LL << 12) && "Out of range LD/ST fixup");
-    // ... fallthrough to no-checking versions ...
-  case AArch64::fixup_a64_ldst16_dtprel_lo12_nc:
-    // R_AARCH64_TLSLD_LDST16_DTPREL_LO12: Set an LD/ST offset field to bits FFE
-    // of DTPREL(S+A), with no overflow check.
-  case AArch64::fixup_a64_ldst16_tprel_lo12_nc:
-    // R_AARCH64_TLSLD_LDST16_TPREL_LO12: Set an LD/ST offset field to bits FFE
-    // of TPREL(S+A), with no overflow check.
-  case AArch64::fixup_a64_ldst16_lo12:
-    // R_AARCH64_LDST16_ABS_LO12_NC: Sets an LD/ST immediate value to bits FFE
-    // of S+A, with no overflow check.
-    return (Value & 0xffe) << 9;
-
-  case AArch64::fixup_a64_ldst32_dtprel_lo12:
-    // R_AARCH64_TLSLD_LDST32_DTPREL_LO12: Set an LD/ST offset field to bits FFC
-    // of DTPREL(S+A), check 0 <= X < 2^12.
-  case AArch64::fixup_a64_ldst32_tprel_lo12:
-    // R_AARCH64_TLSLE_LDST32_TPREL_LO12: Set an LD/ST offset field to bits FFC
-    // of DTPREL(S+A), check 0 <= X < 2^12.
-    assert((int64_t) Value >= 0 &&
-           (int64_t) Value < (1LL << 12) && "Out of range LD/ST fixup");
-    // ... fallthrough to no-checking versions ...
-  case AArch64::fixup_a64_ldst32_dtprel_lo12_nc:
-    // R_AARCH64_TLSLD_LDST32_DTPREL_LO12: Set an LD/ST offset field to bits FFC
-    // of DTPREL(S+A), with no overflow check.
-  case AArch64::fixup_a64_ldst32_tprel_lo12_nc:
-    // R_AARCH64_TLSLD_LDST32_TPREL_LO12: Set an LD/ST offset field to bits FFC
-    // of TPREL(S+A), with no overflow check.
-  case AArch64::fixup_a64_ldst32_lo12:
-    // R_AARCH64_LDST32_ABS_LO12_NC: Sets an LD/ST immediate value to bits FFC
-    // of S+A, with no overflow check.
-    return (Value & 0xffc) << 8;
-
-  case AArch64::fixup_a64_ldst64_dtprel_lo12:
-    // R_AARCH64_TLSLD_LDST64_DTPREL_LO12: Set an LD/ST offset field to bits FF8
-    // of DTPREL(S+A), check 0 <= X < 2^12.
-  case AArch64::fixup_a64_ldst64_tprel_lo12:
-    // R_AARCH64_TLSLE_LDST64_TPREL_LO12: Set an LD/ST offset field to bits FF8
-    // of DTPREL(S+A), check 0 <= X < 2^12.
-    assert((int64_t) Value >= 0 &&
-           (int64_t) Value < (1LL << 12) && "Out of range LD/ST fixup");
-    // ... fallthrough to no-checking versions ...
-  case AArch64::fixup_a64_ldst64_dtprel_lo12_nc:
-    // R_AARCH64_TLSLD_LDST64_DTPREL_LO12: Set an LD/ST offset field to bits FF8
-    // of DTPREL(S+A), with no overflow check.
-  case AArch64::fixup_a64_ldst64_tprel_lo12_nc:
-    // R_AARCH64_TLSLD_LDST64_TPREL_LO12: Set an LD/ST offset field to bits FF8
-    // of TPREL(S+A), with no overflow check.
-  case AArch64::fixup_a64_ldst64_lo12:
-    // R_AARCH64_LDST64_ABS_LO12_NC: Sets an LD/ST immediate value to bits FF8
-    // of S+A, with no overflow check.
-    return (Value & 0xff8) << 7;
-
-  case AArch64::fixup_a64_ldst128_lo12:
-    // R_AARCH64_LDST128_ABS_LO12_NC: Sets an LD/ST immediate value to bits FF0
-    // of S+A, with no overflow check.
-    return (Value & 0xff0) << 6;
-
-  case AArch64::fixup_a64_movw_uabs_g0:
-    // R_AARCH64_MOVW_UABS_G0: Sets a MOVZ immediate field to bits FFFF of S+A
-    // with a check that S+A < 2^16
-    assert(Value <= 0xffff && "Out of range move wide fixup");
-    return (Value & 0xffff) << 5;
-
-  case AArch64::fixup_a64_movw_dtprel_g0_nc:
-    // R_AARCH64_TLSLD_MOVW_DTPREL_G0_NC: Sets a MOVK immediate field to bits
-    // FFFF of DTPREL(S+A) with no overflow check.
-  case AArch64::fixup_a64_movw_gottprel_g0_nc:
-    // R_AARCH64_TLSIE_MOVW_GOTTPREL_G0_NC: Sets a MOVK immediate field to bits
-    // FFFF of G(TPREL(S+A)) - GOT with no overflow check.
-  case AArch64::fixup_a64_movw_tprel_g0_nc:
-    // R_AARCH64_TLSLE_MOVW_TPREL_G0_NC: Sets a MOVK immediate field to bits
-    // FFFF of TPREL(S+A) with no overflow check.
-  case AArch64::fixup_a64_movw_uabs_g0_nc:
-    // R_AARCH64_MOVW_UABS_G0_NC: Sets a MOVK immediate field to bits FFFF of
-    // S+A with no overflow check.
-    return (Value & 0xffff) << 5;
-
-  case AArch64::fixup_a64_movw_uabs_g1:
-    // R_AARCH64_MOVW_UABS_G1: Sets a MOVZ immediate field to bits FFFF0000 of
-    // S+A with a check that S+A < 2^32
-    assert(Value <= 0xffffffffull && "Out of range move wide fixup");
-    return ((Value >> 16) & 0xffff) << 5;
-
-  case AArch64::fixup_a64_movw_dtprel_g1_nc:
-    // R_AARCH64_TLSLD_MOVW_DTPREL_G1_NC: Set a MOVK immediate field
-    // to bits FFFF0000 of DTPREL(S+A), with no overflow check.
-  case AArch64::fixup_a64_movw_tprel_g1_nc:
-    // R_AARCH64_TLSLD_MOVW_TPREL_G1_NC: Set a MOVK immediate field
-    // to bits FFFF0000 of TPREL(S+A), with no overflow check.
-  case AArch64::fixup_a64_movw_uabs_g1_nc:
-    // R_AARCH64_MOVW_UABS_G1_NC: Sets a MOVK immediate field to bits
-    // FFFF0000 of S+A with no overflow check.
-    return ((Value >> 16) & 0xffff) << 5;
-
-  case AArch64::fixup_a64_movw_uabs_g2:
-    // R_AARCH64_MOVW_UABS_G2: Sets a MOVZ immediate field to bits FFFF 0000
-    // 0000 of S+A with a check that S+A < 2^48
-    assert(Value <= 0xffffffffffffull && "Out of range move wide fixup");
-    return ((Value >> 32) & 0xffff) << 5;
-
-  case AArch64::fixup_a64_movw_uabs_g2_nc:
-    // R_AARCH64_MOVW_UABS_G2: Sets a MOVK immediate field to bits FFFF 0000
-    // 0000 of S+A with no overflow check.
-    return ((Value >> 32) & 0xffff) << 5;
-
-  case AArch64::fixup_a64_movw_uabs_g3:
-    // R_AARCH64_MOVW_UABS_G3: Sets a MOVZ immediate field to bits FFFF 0000
-    // 0000 0000 of S+A (no overflow check needed)
-    return ((Value >> 48) & 0xffff) << 5;
-
-  case AArch64::fixup_a64_movw_dtprel_g0:
-    // R_AARCH64_TLSLD_MOVW_DTPREL_G0: Set a MOV[NZ] immediate field
-    // to bits FFFF of DTPREL(S+A).
-  case AArch64::fixup_a64_movw_tprel_g0:
-    // R_AARCH64_TLSLE_MOVW_TPREL_G0: Set a MOV[NZ] immediate field to
-    // bits FFFF of TPREL(S+A).
-  case AArch64::fixup_a64_movw_sabs_g0: {
-    // R_AARCH64_MOVW_SABS_G0: Sets MOV[NZ] immediate field using bits FFFF of
-    // S+A (see notes below); check -2^16 <= S+A < 2^16. (notes say that we
-    // should convert between MOVN and MOVZ to achieve our goals).
-    int64_t Signed = Value;
-    assert(Signed >= -(1LL << 16) && Signed < (1LL << 16)
-           && "Out of range move wide fixup");
-    if (Signed >= 0) {
-      Value = (Value & 0xffff) << 5;
-      // Bit 30 converts the MOVN encoding into a MOVZ
-      Value |= 1 << 30;
-    } else {
-      // MCCodeEmitter should have encoded a MOVN, which is fine.
-      Value = (~Value & 0xffff) << 5;
-    }
-    return Value;
+void AArch64AsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
+                                   unsigned DataSize, uint64_t Value,
+                                   bool IsPCRel) const {
+  unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind());
+  if (!Value)
+    return; // Doesn't change encoding.
+  MCFixupKindInfo Info = getFixupKindInfo(Fixup.getKind());
+  // Apply any target-specific value adjustments.
+  Value = adjustFixupValue(Fixup.getKind(), Value);
+
+  // Shift the value into position.
+  Value <<= Info.TargetOffset;
+
+  unsigned Offset = Fixup.getOffset();
+  assert(Offset + NumBytes <= DataSize && "Invalid fixup offset!");
+
+  // For each byte of the fragment that the fixup touches, mask in the
+  // bits from the fixup value.
+  for (unsigned i = 0; i != NumBytes; ++i)
+    Data[Offset + i] |= uint8_t((Value >> (i * 8)) & 0xff);
+}
+
+bool AArch64AsmBackend::mayNeedRelaxation(const MCInst &Inst) const {
+  return false;
+}
+
+bool AArch64AsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup,
+                                             uint64_t Value,
+                                             const MCRelaxableFragment *DF,
+                                             const MCAsmLayout &Layout) const {
+  // FIXME:  This isn't correct for AArch64. Just moving the "generic" logic
+  // into the targets for now.
+  //
+  // Relax if the value is too big for a (signed) i8.
+  return int64_t(Value) != int64_t(int8_t(Value));
+}
+
+void AArch64AsmBackend::relaxInstruction(const MCInst &Inst,
+                                         MCInst &Res) const {
+  assert(false && "AArch64AsmBackend::relaxInstruction() unimplemented");
+}
+
+bool AArch64AsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
+  // If the count is not 4-byte aligned, we must be writing data into the text
+  // section (otherwise we have unaligned instructions, and thus have far
+  // bigger problems), so just write zeros instead.
+  if ((Count & 3) != 0) {
+    for (uint64_t i = 0, e = (Count & 3); i != e; ++i)
+      OW->Write8(0);
   }
 
-  case AArch64::fixup_a64_movw_dtprel_g1:
-    // R_AARCH64_TLSLD_MOVW_DTPREL_G1: Set a MOV[NZ] immediate field
-    // to bits FFFF0000 of DTPREL(S+A).
-  case AArch64::fixup_a64_movw_gottprel_g1:
-    // R_AARCH64_TLSIE_MOVW_GOTTPREL_G1: Set a MOV[NZ] immediate field
-    // to bits FFFF0000 of G(TPREL(S+A)) - GOT.
-  case AArch64::fixup_a64_movw_tprel_g1:
-    // R_AARCH64_TLSLE_MOVW_TPREL_G1: Set a MOV[NZ] immediate field to
-    // bits FFFF0000 of TPREL(S+A).
-  case AArch64::fixup_a64_movw_sabs_g1: {
-    // R_AARCH64_MOVW_SABS_G1: Sets MOV[NZ] immediate field using bits FFFF 0000
-    // of S+A (see notes below); check -2^32 <= S+A < 2^32. (notes say that we
-    // should convert between MOVN and MOVZ to achieve our goals).
-    int64_t Signed = Value;
-    assert(Signed >= -(1LL << 32) && Signed < (1LL << 32)
-           && "Out of range move wide fixup");
-    if (Signed >= 0) {
-      Value = ((Value >> 16) & 0xffff) << 5;
-      // Bit 30 converts the MOVN encoding into a MOVZ
-      Value |= 1 << 30;
-    } else {
-      Value = ((~Value >> 16) & 0xffff) << 5;
-    }
-    return Value;
+  // We are properly aligned, so write NOPs as requested.
+  Count /= 4;
+  for (uint64_t i = 0; i != Count; ++i)
+    OW->Write32(0xd503201f);
+  return true;
+}
+
+namespace {
+
+namespace CU {
+
+/// \brief Compact unwind encoding values.
+enum CompactUnwindEncodings {
+  /// \brief A "frameless" leaf function, where no non-volatile registers are
+  /// saved. The return remains in LR throughout the function.
+  UNWIND_AArch64_MODE_FRAMELESS = 0x02000000,
+
+  /// \brief No compact unwind encoding available. Instead the low 23-bits of
+  /// the compact unwind encoding is the offset of the DWARF FDE in the
+  /// __eh_frame section. This mode is never used in object files. It is only
+  /// generated by the linker in final linked images, which have only DWARF info
+  /// for a function.
+  UNWIND_AArch64_MODE_DWARF = 0x03000000,
+
+  /// \brief This is a standard arm64 prologue where FP/LR are immediately
+  /// pushed on the stack, then SP is copied to FP. If there are any
+  /// non-volatile register saved, they are copied into the stack fame in pairs
+  /// in a contiguous ranger right below the saved FP/LR pair. Any subset of the
+  /// five X pairs and four D pairs can be saved, but the memory layout must be
+  /// in register number order.
+  UNWIND_AArch64_MODE_FRAME = 0x04000000,
+
+  /// \brief Frame register pair encodings.
+  UNWIND_AArch64_FRAME_X19_X20_PAIR = 0x00000001,
+  UNWIND_AArch64_FRAME_X21_X22_PAIR = 0x00000002,
+  UNWIND_AArch64_FRAME_X23_X24_PAIR = 0x00000004,
+  UNWIND_AArch64_FRAME_X25_X26_PAIR = 0x00000008,
+  UNWIND_AArch64_FRAME_X27_X28_PAIR = 0x00000010,
+  UNWIND_AArch64_FRAME_D8_D9_PAIR = 0x00000100,
+  UNWIND_AArch64_FRAME_D10_D11_PAIR = 0x00000200,
+  UNWIND_AArch64_FRAME_D12_D13_PAIR = 0x00000400,
+  UNWIND_AArch64_FRAME_D14_D15_PAIR = 0x00000800
+};
+
+} // end CU namespace
+
+// FIXME: This should be in a separate file.
+class DarwinAArch64AsmBackend : public AArch64AsmBackend {
+  const MCRegisterInfo &MRI;
+
+  /// \brief Encode compact unwind stack adjustment for frameless functions.
+  /// See UNWIND_AArch64_FRAMELESS_STACK_SIZE_MASK in compact_unwind_encoding.h.
+  /// The stack size always needs to be 16 byte aligned.
+  uint32_t encodeStackAdjustment(uint32_t StackSize) const {
+    return (StackSize / 16) << 12;
+  }
+
+public:
+  DarwinAArch64AsmBackend(const Target &T, const MCRegisterInfo &MRI)
+      : AArch64AsmBackend(T), MRI(MRI) {}
+
+  MCObjectWriter *createObjectWriter(raw_ostream &OS) const override {
+    return createAArch64MachObjectWriter(OS, MachO::CPU_TYPE_ARM64,
+                                         MachO::CPU_SUBTYPE_ARM64_ALL);
+  }
+
+  bool doesSectionRequireSymbols(const MCSection &Section) const override {
+    // Any section for which the linker breaks things into atoms needs to
+    // preserve symbols, including assembler local symbols, to identify
+    // those atoms. These sections are:
+    // Sections of type:
+    //
+    //    S_CSTRING_LITERALS  (e.g. __cstring)
+    //    S_LITERAL_POINTERS  (e.g.  objc selector pointers)
+    //    S_16BYTE_LITERALS, S_8BYTE_LITERALS, S_4BYTE_LITERALS
+    //
+    // Sections named:
+    //
+    //    __TEXT,__eh_frame
+    //    __TEXT,__ustring
+    //    __DATA,__cfstring
+    //    __DATA,__objc_classrefs
+    //    __DATA,__objc_catlist
+    //
+    // FIXME: It would be better if the compiler used actual linker local
+    // symbols for each of these sections rather than preserving what
+    // are ostensibly assembler local symbols.
+    const MCSectionMachO &SMO = static_cast<const MCSectionMachO &>(Section);
+    return (SMO.getType() == MachO::S_CSTRING_LITERALS ||
+            SMO.getType() == MachO::S_4BYTE_LITERALS ||
+            SMO.getType() == MachO::S_8BYTE_LITERALS ||
+            SMO.getType() == MachO::S_16BYTE_LITERALS ||
+            SMO.getType() == MachO::S_LITERAL_POINTERS ||
+            (SMO.getSegmentName() == "__TEXT" &&
+             (SMO.getSectionName() == "__eh_frame" ||
+              SMO.getSectionName() == "__ustring")) ||
+            (SMO.getSegmentName() == "__DATA" &&
+             (SMO.getSectionName() == "__cfstring" ||
+              SMO.getSectionName() == "__objc_classrefs" ||
+              SMO.getSectionName() == "__objc_catlist")));
   }
 
-  case AArch64::fixup_a64_movw_dtprel_g2:
-    // R_AARCH64_TLSLD_MOVW_DTPREL_G2: Set a MOV[NZ] immediate field
-    // to bits FFFF 0000 0000 of DTPREL(S+A).
-  case AArch64::fixup_a64_movw_tprel_g2:
-    // R_AARCH64_TLSLE_MOVW_TPREL_G2: Set a MOV[NZ] immediate field to
-    // bits FFFF 0000 0000 of TPREL(S+A).
-  case AArch64::fixup_a64_movw_sabs_g2: {
-    // R_AARCH64_MOVW_SABS_G2: Sets MOV[NZ] immediate field using bits FFFF 0000
-    // 0000 of S+A (see notes below); check -2^48 <= S+A < 2^48. (notes say that
-    // we should convert between MOVN and MOVZ to achieve our goals).
-    int64_t Signed = Value;
-    assert(Signed >= -(1LL << 48) && Signed < (1LL << 48)
-           && "Out of range move wide fixup");
-    if (Signed >= 0) {
-      Value = ((Value >> 32) & 0xffff) << 5;
-      // Bit 30 converts the MOVN encoding into a MOVZ
-      Value |= 1 << 30;
-    } else {
-      Value = ((~Value >> 32) & 0xffff) << 5;
+  /// \brief Generate the compact unwind encoding from the CFI directives.
+  uint32_t generateCompactUnwindEncoding(
+                             ArrayRef<MCCFIInstruction> Instrs) const override {
+    if (Instrs.empty())
+      return CU::UNWIND_AArch64_MODE_FRAMELESS;
+
+    bool HasFP = false;
+    unsigned StackSize = 0;
+
+    uint32_t CompactUnwindEncoding = 0;
+    for (size_t i = 0, e = Instrs.size(); i != e; ++i) {
+      const MCCFIInstruction &Inst = Instrs[i];
+
+      switch (Inst.getOperation()) {
+      default:
+        // Cannot handle this directive:  bail out.
+        return CU::UNWIND_AArch64_MODE_DWARF;
+      case MCCFIInstruction::OpDefCfa: {
+        // Defines a frame pointer.
+        assert(getXRegFromWReg(MRI.getLLVMRegNum(Inst.getRegister(), true)) ==
+                   AArch64::FP &&
+               "Invalid frame pointer!");
+        assert(i + 2 < e && "Insufficient CFI instructions to define a frame!");
+
+        const MCCFIInstruction &LRPush = Instrs[++i];
+        assert(LRPush.getOperation() == MCCFIInstruction::OpOffset &&
+               "Link register not pushed!");
+        const MCCFIInstruction &FPPush = Instrs[++i];
+        assert(FPPush.getOperation() == MCCFIInstruction::OpOffset &&
+               "Frame pointer not pushed!");
+
+        unsigned LRReg = MRI.getLLVMRegNum(LRPush.getRegister(), true);
+        unsigned FPReg = MRI.getLLVMRegNum(FPPush.getRegister(), true);
+
+        LRReg = getXRegFromWReg(LRReg);
+        FPReg = getXRegFromWReg(FPReg);
+
+        assert(LRReg == AArch64::LR && FPReg == AArch64::FP &&
+               "Pushing invalid registers for frame!");
+
+        // Indicate that the function has a frame.
+        CompactUnwindEncoding |= CU::UNWIND_AArch64_MODE_FRAME;
+        HasFP = true;
+        break;
+      }
+      case MCCFIInstruction::OpDefCfaOffset: {
+        assert(StackSize == 0 && "We already have the CFA offset!");
+        StackSize = std::abs(Inst.getOffset());
+        break;
+      }
+      case MCCFIInstruction::OpOffset: {
+        // Registers are saved in pairs. We expect there to be two consecutive
+        // `.cfi_offset' instructions with the appropriate registers specified.
+        unsigned Reg1 = MRI.getLLVMRegNum(Inst.getRegister(), true);
+        if (i + 1 == e)
+          return CU::UNWIND_AArch64_MODE_DWARF;
+
+        const MCCFIInstruction &Inst2 = Instrs[++i];
+        if (Inst2.getOperation() != MCCFIInstruction::OpOffset)
+          return CU::UNWIND_AArch64_MODE_DWARF;
+        unsigned Reg2 = MRI.getLLVMRegNum(Inst2.getRegister(), true);
+
+        // N.B. The encodings must be in register number order, and the X
+        // registers before the D registers.
+
+        // X19/X20 pair = 0x00000001,
+        // X21/X22 pair = 0x00000002,
+        // X23/X24 pair = 0x00000004,
+        // X25/X26 pair = 0x00000008,
+        // X27/X28 pair = 0x00000010
+        Reg1 = getXRegFromWReg(Reg1);
+        Reg2 = getXRegFromWReg(Reg2);
+
+        if (Reg1 == AArch64::X19 && Reg2 == AArch64::X20 &&
+            (CompactUnwindEncoding & 0xF1E) == 0)
+          CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_X19_X20_PAIR;
+        else if (Reg1 == AArch64::X21 && Reg2 == AArch64::X22 &&
+                 (CompactUnwindEncoding & 0xF1C) == 0)
+          CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_X21_X22_PAIR;
+        else if (Reg1 == AArch64::X23 && Reg2 == AArch64::X24 &&
+                 (CompactUnwindEncoding & 0xF18) == 0)
+          CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_X23_X24_PAIR;
+        else if (Reg1 == AArch64::X25 && Reg2 == AArch64::X26 &&
+                 (CompactUnwindEncoding & 0xF10) == 0)
+          CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_X25_X26_PAIR;
+        else if (Reg1 == AArch64::X27 && Reg2 == AArch64::X28 &&
+                 (CompactUnwindEncoding & 0xF00) == 0)
+          CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_X27_X28_PAIR;
+        else {
+          Reg1 = getDRegFromBReg(Reg1);
+          Reg2 = getDRegFromBReg(Reg2);
+
+          // D8/D9 pair   = 0x00000100,
+          // D10/D11 pair = 0x00000200,
+          // D12/D13 pair = 0x00000400,
+          // D14/D15 pair = 0x00000800
+          if (Reg1 == AArch64::D8 && Reg2 == AArch64::D9 &&
+              (CompactUnwindEncoding & 0xE00) == 0)
+            CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_D8_D9_PAIR;
+          else if (Reg1 == AArch64::D10 && Reg2 == AArch64::D11 &&
+                   (CompactUnwindEncoding & 0xC00) == 0)
+            CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_D10_D11_PAIR;
+          else if (Reg1 == AArch64::D12 && Reg2 == AArch64::D13 &&
+                   (CompactUnwindEncoding & 0x800) == 0)
+            CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_D12_D13_PAIR;
+          else if (Reg1 == AArch64::D14 && Reg2 == AArch64::D15)
+            CompactUnwindEncoding |= CU::UNWIND_AArch64_FRAME_D14_D15_PAIR;
+          else
+            // A pair was pushed which we cannot handle.
+            return CU::UNWIND_AArch64_MODE_DWARF;
+        }
+
+        break;
+      }
+      }
     }
-    return Value;
+
+    if (!HasFP) {
+      // With compact unwind info we can only represent stack adjustments of up
+      // to 65520 bytes.
+      if (StackSize > 65520)
+        return CU::UNWIND_AArch64_MODE_DWARF;
+
+      CompactUnwindEncoding |= CU::UNWIND_AArch64_MODE_FRAMELESS;
+      CompactUnwindEncoding |= encodeStackAdjustment(StackSize);
+    }
+
+    return CompactUnwindEncoding;
   }
+};
 
-  case AArch64::fixup_a64_tstbr:
-    // R_AARCH64_TSTBR14: Sets the immediate field of a TBZ/TBNZ instruction to
-    // bits FFFC of S+A-P, checking -2^15 <= S+A-P < 2^15.
-    assert((int64_t)Value >= -(1LL << 15) &&
-           (int64_t)Value < (1LL << 15) && "Out of range TBZ/TBNZ fixup");
-    return (Value & 0xfffc) << (5 - 2);
-
-  case AArch64::fixup_a64_condbr:
-    // R_AARCH64_CONDBR19: Sets the immediate field of a conditional branch
-    // instruction to bits 1FFFFC of S+A-P, checking -2^20 <= S+A-P < 2^20.
-    assert((int64_t)Value >= -(1LL << 20) &&
-           (int64_t)Value < (1LL << 20) && "Out of range B.cond fixup");
-    return (Value & 0x1ffffc) << (5 - 2);
-
-  case AArch64::fixup_a64_uncondbr:
-    // R_AARCH64_JUMP26 same as below (except to a linker, possibly).
-  case AArch64::fixup_a64_call:
-    // R_AARCH64_CALL26: Sets a CALL immediate field to bits FFFFFFC of S+A-P,
-    // checking that -2^27 <= S+A-P < 2^27.
-    assert((int64_t)Value >= -(1LL << 27) &&
-           (int64_t)Value < (1LL << 27) && "Out of range branch fixup");
-    return (Value & 0xffffffc) >> 2;
-
-  case AArch64::fixup_a64_adr_gottprel_page:
-    // R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21: Set an ADRP immediate field to bits
-    // 1FFFFF000 of Page(G(TPREL(S+A))) - Page(P); check -2^32 <= X < 2^32.
-  case AArch64::fixup_a64_tlsdesc_adr_page:
-    // R_AARCH64_TLSDESC_ADR_PAGE: Set an ADRP immediate field to bits 1FFFFF000
-    // of Page(G(TLSDESC(S+A))) - Page(P); check -2^32 <= X < 2^32.
-  case AArch64::fixup_a64_adr_prel_got_page:
-    // R_AARCH64_ADR_GOT_PAGE: Sets the immediate value of an ADRP to bits
-    // 1FFFFF000 of the operation, checking that -2^32 < Page(G(S))-Page(GOT) <
-    // 2^32.
-    assert((int64_t)Value >= -(1LL << 32) &&
-           (int64_t)Value < (1LL << 32) && "Out of range ADRP fixup");
-    return ADRImmBits((Value & 0x1fffff000ULL) >> 12);
-
-  case AArch64::fixup_a64_ld64_gottprel_lo12_nc:
-    // R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC: Set an LD offset field to bits FF8
-    // of X, with no overflow check. Check that X & 7 == 0.
-  case AArch64::fixup_a64_tlsdesc_ld64_lo12_nc:
-    // R_AARCH64_TLSDESC_LD64_LO12_NC: Set an LD offset field to bits FF8 of
-    // G(TLSDESC(S+A)), with no overflow check. Check that X & 7 == 0.
-  case AArch64::fixup_a64_ld64_got_lo12_nc:
-    // R_AARCH64_LD64_GOT_LO12_NC: Sets the LD/ST immediate field to bits FF8 of
-    // G(S) with no overflow check. Check X & 7 == 0
-    assert(((int64_t)Value & 7) == 0 && "Misaligned fixup");
-    return (Value & 0xff8) << 7;
-
-  case AArch64::fixup_a64_tlsdesc_call:
-    // R_AARCH64_TLSDESC_CALL: For relaxation only.
-    return 0;
+} // end anonymous namespace
+
+namespace {
+
+class ELFAArch64AsmBackend : public AArch64AsmBackend {
+public:
+  uint8_t OSABI;
+  bool IsLittleEndian;
+
+  ELFAArch64AsmBackend(const Target &T, uint8_t OSABI, bool IsLittleEndian)
+    : AArch64AsmBackend(T), OSABI(OSABI), IsLittleEndian(IsLittleEndian) {}
+
+  MCObjectWriter *createObjectWriter(raw_ostream &OS) const override {
+    return createAArch64ELFObjectWriter(OS, OSABI, IsLittleEndian);
   }
+
+  void processFixupValue(const MCAssembler &Asm, const MCAsmLayout &Layout,
+                         const MCFixup &Fixup, const MCFragment *DF,
+                         const MCValue &Target, uint64_t &Value,
+                         bool &IsResolved) override;
+
+  void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
+                  uint64_t Value, bool IsPCRel) const override;
+};
+
+void ELFAArch64AsmBackend::processFixupValue(
+    const MCAssembler &Asm, const MCAsmLayout &Layout, const MCFixup &Fixup,
+    const MCFragment *DF, const MCValue &Target, uint64_t &Value,
+    bool &IsResolved) {
+  // The ADRP instruction adds some multiple of 0x1000 to the current PC &
+  // ~0xfff. This means that the required offset to reach a symbol can vary by
+  // up to one step depending on where the ADRP is in memory. For example:
+  //
+  //     ADRP x0, there
+  //  there:
+  //
+  // If the ADRP occurs at address 0xffc then "there" will be at 0x1000 and
+  // we'll need that as an offset. At any other address "there" will be in the
+  // same page as the ADRP and the instruction should encode 0x0. Assuming the
+  // section isn't 0x1000-aligned, we therefore need to delegate this decision
+  // to the linker -- a relocation!
+  if ((uint32_t)Fixup.getKind() == AArch64::fixup_aarch64_pcrel_adrp_imm21)
+    IsResolved = false;
+}
+
+void ELFAArch64AsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
+                                      unsigned DataSize, uint64_t Value,
+                                      bool IsPCRel) const {
+  // store fixups in .eh_frame section in big endian order
+  if (!IsLittleEndian && Fixup.getKind() == FK_Data_4) {
+    const MCSection *Sec = Fixup.getValue()->FindAssociatedSection();
+    const MCSectionELF *SecELF = static_cast<const MCSectionELF *>(Sec);
+    if (SecELF->getSectionName() == ".eh_frame")
+      Value = ByteSwap_32(unsigned(Value));
+  }
+  AArch64AsmBackend::applyFixup (Fixup, Data, DataSize, Value, IsPCRel);
+}
 }
 
-MCAsmBackend *
-llvm::createAArch64leAsmBackend(const Target &T, const MCRegisterInfo &MRI,
-                              StringRef TT, StringRef CPU) {
+MCAsmBackend *llvm::createAArch64leAsmBackend(const Target &T,
+                                            const MCRegisterInfo &MRI,
+                                            StringRef TT, StringRef CPU) {
   Triple TheTriple(TT);
-  return new ELFAArch64AsmBackend(T, TT, TheTriple.getOS(), /*isLittle*/ true);
+
+  if (TheTriple.isOSDarwin())
+    return new DarwinAArch64AsmBackend(T, MRI);
+
+  assert(TheTriple.isOSBinFormatELF() && "Expect either MachO or ELF target");
+  return new ELFAArch64AsmBackend(T, TheTriple.getOS(), /*IsLittleEndian=*/true);
 }
 
-MCAsmBackend *
-llvm::createAArch64beAsmBackend(const Target &T, const MCRegisterInfo &MRI,
-                              StringRef TT, StringRef CPU) {
+MCAsmBackend *llvm::createAArch64beAsmBackend(const Target &T,
+                                            const MCRegisterInfo &MRI,
+                                            StringRef TT, StringRef CPU) {
   Triple TheTriple(TT);
-  return new ELFAArch64AsmBackend(T, TT, TheTriple.getOS(), /*isLittle*/ false);
+
+  assert(TheTriple.isOSBinFormatELF() &&
+         "Big endian is only supported for ELF targets!");
+  return new ELFAArch64AsmBackend(T, TheTriple.getOS(),
+                                  /*IsLittleEndian=*/false);
 }
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp
index a5fe914..e05191e 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp
@@ -13,6 +13,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "MCTargetDesc/AArch64FixupKinds.h"
+#include "MCTargetDesc/AArch64MCExpr.h"
 #include "MCTargetDesc/AArch64MCTargetDesc.h"
 #include "llvm/MC/MCELFObjectWriter.h"
 #include "llvm/MC/MCValue.h"
@@ -35,257 +36,222 @@ private:
 };
 }
 
-AArch64ELFObjectWriter::AArch64ELFObjectWriter(uint8_t OSABI, bool IsLittleEndian)
-  : MCELFObjectTargetWriter(/*Is64Bit*/ true, OSABI, ELF::EM_AARCH64,
-                            /*HasRelocationAddend*/ true)
-{}
+AArch64ELFObjectWriter::AArch64ELFObjectWriter(uint8_t OSABI,
+                                               bool IsLittleEndian)
+    : MCELFObjectTargetWriter(/*Is64Bit*/ true, OSABI, ELF::EM_AARCH64,
+                              /*HasRelocationAddend*/ true) {}
 
-AArch64ELFObjectWriter::~AArch64ELFObjectWriter()
-{}
+AArch64ELFObjectWriter::~AArch64ELFObjectWriter() {}
 
 unsigned AArch64ELFObjectWriter::GetRelocType(const MCValue &Target,
-                                              const MCFixup &Fixup,
-                                              bool IsPCRel) const {
-  unsigned Type;
+                                            const MCFixup &Fixup,
+                                            bool IsPCRel) const {
+  AArch64MCExpr::VariantKind RefKind =
+      static_cast<AArch64MCExpr::VariantKind>(Target.getRefKind());
+  AArch64MCExpr::VariantKind SymLoc = AArch64MCExpr::getSymbolLoc(RefKind);
+  bool IsNC = AArch64MCExpr::isNotChecked(RefKind);
+
+  assert((!Target.getSymA() ||
+          Target.getSymA()->getKind() == MCSymbolRefExpr::VK_None) &&
+         "Should only be expression-level modifiers here");
+
+  assert((!Target.getSymB() ||
+          Target.getSymB()->getKind() == MCSymbolRefExpr::VK_None) &&
+         "Should only be expression-level modifiers here");
+
   if (IsPCRel) {
     switch ((unsigned)Fixup.getKind()) {
-    default:
-      llvm_unreachable("Unimplemented fixup -> relocation");
-    case FK_Data_8:
-      return ELF::R_AARCH64_PREL64;
-    case FK_Data_4:
-      return ELF::R_AARCH64_PREL32;
     case FK_Data_2:
       return ELF::R_AARCH64_PREL16;
-    case AArch64::fixup_a64_ld_prel:
-      Type = ELF::R_AARCH64_LD_PREL_LO19;
-      break;
-    case AArch64::fixup_a64_adr_prel:
-      Type = ELF::R_AARCH64_ADR_PREL_LO21;
-      break;
-    case AArch64::fixup_a64_adr_prel_page:
-      Type = ELF::R_AARCH64_ADR_PREL_PG_HI21;
-      break;
-    case AArch64::fixup_a64_adr_prel_got_page:
-      Type = ELF::R_AARCH64_ADR_GOT_PAGE;
-      break;
-    case AArch64::fixup_a64_tstbr:
-      Type = ELF::R_AARCH64_TSTBR14;
-      break;
-    case AArch64::fixup_a64_condbr:
-      Type = ELF::R_AARCH64_CONDBR19;
-      break;
-    case AArch64::fixup_a64_uncondbr:
-      Type = ELF::R_AARCH64_JUMP26;
-      break;
-    case AArch64::fixup_a64_call:
-      Type = ELF::R_AARCH64_CALL26;
-      break;
-    case AArch64::fixup_a64_adr_gottprel_page:
-      Type = ELF::R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21;
-      break;
-    case AArch64::fixup_a64_ld_gottprel_prel19:
-      Type =  ELF::R_AARCH64_TLSIE_LD_GOTTPREL_PREL19;
-      break;
-    case AArch64::fixup_a64_tlsdesc_adr_page:
-      Type = ELF::R_AARCH64_TLSDESC_ADR_PAGE;
-      break;
+    case FK_Data_4:
+      return ELF::R_AARCH64_PREL32;
+    case FK_Data_8:
+      return ELF::R_AARCH64_PREL64;
+    case AArch64::fixup_aarch64_pcrel_adr_imm21:
+      assert(SymLoc == AArch64MCExpr::VK_NONE && "unexpected ADR relocation");
+      return ELF::R_AARCH64_ADR_PREL_LO21;
+    case AArch64::fixup_aarch64_pcrel_adrp_imm21:
+      if (SymLoc == AArch64MCExpr::VK_ABS && !IsNC)
+        return ELF::R_AARCH64_ADR_PREL_PG_HI21;
+      if (SymLoc == AArch64MCExpr::VK_GOT && !IsNC)
+        return ELF::R_AARCH64_ADR_GOT_PAGE;
+      if (SymLoc == AArch64MCExpr::VK_GOTTPREL && !IsNC)
+        return ELF::R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21;
+      if (SymLoc == AArch64MCExpr::VK_TLSDESC && !IsNC)
+        return ELF::R_AARCH64_TLSDESC_ADR_PAGE;
+      llvm_unreachable("invalid symbol kind for ADRP relocation");
+    case AArch64::fixup_aarch64_pcrel_branch26:
+      return ELF::R_AARCH64_JUMP26;
+    case AArch64::fixup_aarch64_pcrel_call26:
+      return ELF::R_AARCH64_CALL26;
+    case AArch64::fixup_aarch64_ldr_pcrel_imm19:
+      if (SymLoc == AArch64MCExpr::VK_GOTTPREL)
+        return ELF::R_AARCH64_TLSIE_LD_GOTTPREL_PREL19;
+      return ELF::R_AARCH64_LD_PREL_LO19;
+    case AArch64::fixup_aarch64_pcrel_branch14:
+      return ELF::R_AARCH64_TSTBR14;
+    case AArch64::fixup_aarch64_pcrel_branch19:
+      return ELF::R_AARCH64_CONDBR19;
+    default:
+      llvm_unreachable("Unsupported pc-relative fixup kind");
     }
   } else {
     switch ((unsigned)Fixup.getKind()) {
-    default:
-      llvm_unreachable("Unimplemented fixup -> relocation");
-    case FK_Data_8:
-      return ELF::R_AARCH64_ABS64;
-    case FK_Data_4:
-      return ELF::R_AARCH64_ABS32;
     case FK_Data_2:
       return ELF::R_AARCH64_ABS16;
-    case AArch64::fixup_a64_add_lo12:
-      Type = ELF::R_AARCH64_ADD_ABS_LO12_NC;
-      break;
-    case AArch64::fixup_a64_ld64_got_lo12_nc:
-      Type = ELF::R_AARCH64_LD64_GOT_LO12_NC;
-      break;
-    case AArch64::fixup_a64_ldst8_lo12:
-      Type = ELF::R_AARCH64_LDST8_ABS_LO12_NC;
-      break;
-    case AArch64::fixup_a64_ldst16_lo12:
-      Type = ELF::R_AARCH64_LDST16_ABS_LO12_NC;
-      break;
-    case AArch64::fixup_a64_ldst32_lo12:
-      Type = ELF::R_AARCH64_LDST32_ABS_LO12_NC;
-      break;
-    case AArch64::fixup_a64_ldst64_lo12:
-      Type = ELF::R_AARCH64_LDST64_ABS_LO12_NC;
-      break;
-    case AArch64::fixup_a64_ldst128_lo12:
-      Type = ELF::R_AARCH64_LDST128_ABS_LO12_NC;
-      break;
-    case AArch64::fixup_a64_movw_uabs_g0:
-      Type = ELF::R_AARCH64_MOVW_UABS_G0;
-      break;
-    case AArch64::fixup_a64_movw_uabs_g0_nc:
-      Type = ELF::R_AARCH64_MOVW_UABS_G0_NC;
-      break;
-    case AArch64::fixup_a64_movw_uabs_g1:
-      Type = ELF::R_AARCH64_MOVW_UABS_G1;
-      break;
-    case AArch64::fixup_a64_movw_uabs_g1_nc:
-      Type = ELF::R_AARCH64_MOVW_UABS_G1_NC;
-      break;
-    case AArch64::fixup_a64_movw_uabs_g2:
-      Type = ELF::R_AARCH64_MOVW_UABS_G2;
-      break;
-    case AArch64::fixup_a64_movw_uabs_g2_nc:
-      Type = ELF::R_AARCH64_MOVW_UABS_G2_NC;
-      break;
-    case AArch64::fixup_a64_movw_uabs_g3:
-      Type = ELF::R_AARCH64_MOVW_UABS_G3;
-      break;
-    case AArch64::fixup_a64_movw_sabs_g0:
-      Type = ELF::R_AARCH64_MOVW_SABS_G0;
-      break;
-    case AArch64::fixup_a64_movw_sabs_g1:
-      Type = ELF::R_AARCH64_MOVW_SABS_G1;
-      break;
-    case AArch64::fixup_a64_movw_sabs_g2:
-      Type = ELF::R_AARCH64_MOVW_SABS_G2;
-      break;
+    case FK_Data_4:
+      return ELF::R_AARCH64_ABS32;
+    case FK_Data_8:
+      return ELF::R_AARCH64_ABS64;
+    case AArch64::fixup_aarch64_add_imm12:
+      if (RefKind == AArch64MCExpr::VK_DTPREL_HI12)
+        return ELF::R_AARCH64_TLSLD_ADD_DTPREL_HI12;
+      if (RefKind == AArch64MCExpr::VK_TPREL_HI12)
+        return ELF::R_AARCH64_TLSLE_ADD_TPREL_HI12;
+      if (RefKind == AArch64MCExpr::VK_DTPREL_LO12_NC)
+        return ELF::R_AARCH64_TLSLD_ADD_DTPREL_LO12_NC;
+      if (RefKind == AArch64MCExpr::VK_DTPREL_LO12)
+        return ELF::R_AARCH64_TLSLD_ADD_DTPREL_LO12;
+      if (RefKind == AArch64MCExpr::VK_TPREL_LO12_NC)
+        return ELF::R_AARCH64_TLSLE_ADD_TPREL_LO12_NC;
+      if (RefKind == AArch64MCExpr::VK_TPREL_LO12)
+        return ELF::R_AARCH64_TLSLE_ADD_TPREL_LO12;
+      if (RefKind == AArch64MCExpr::VK_TLSDESC_LO12)
+        return ELF::R_AARCH64_TLSDESC_ADD_LO12_NC;
+      if (SymLoc == AArch64MCExpr::VK_ABS && IsNC)
+        return ELF::R_AARCH64_ADD_ABS_LO12_NC;
 
-    // TLS Local-dynamic block
-    case AArch64::fixup_a64_movw_dtprel_g2:
-      Type = ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G2;
-      break;
-    case AArch64::fixup_a64_movw_dtprel_g1:
-      Type = ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G1;
-      break;
-    case AArch64::fixup_a64_movw_dtprel_g1_nc:
-      Type = ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G1_NC;
-      break;
-    case AArch64::fixup_a64_movw_dtprel_g0:
-      Type = ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G0;
-      break;
-    case AArch64::fixup_a64_movw_dtprel_g0_nc:
-      Type = ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G0_NC;
-      break;
-    case AArch64::fixup_a64_add_dtprel_hi12:
-      Type = ELF::R_AARCH64_TLSLD_ADD_DTPREL_HI12;
-      break;
-    case AArch64::fixup_a64_add_dtprel_lo12:
-      Type = ELF::R_AARCH64_TLSLD_ADD_DTPREL_LO12;
-      break;
-    case AArch64::fixup_a64_add_dtprel_lo12_nc:
-      Type = ELF::R_AARCH64_TLSLD_ADD_DTPREL_LO12_NC;
-      break;
-    case AArch64::fixup_a64_ldst8_dtprel_lo12:
-      Type = ELF::R_AARCH64_TLSLD_LDST8_DTPREL_LO12;
-      break;
-    case AArch64::fixup_a64_ldst8_dtprel_lo12_nc:
-      Type = ELF::R_AARCH64_TLSLD_LDST8_DTPREL_LO12_NC;
-      break;
-    case AArch64::fixup_a64_ldst16_dtprel_lo12:
-      Type = ELF::R_AARCH64_TLSLD_LDST16_DTPREL_LO12;
-      break;
-    case AArch64::fixup_a64_ldst16_dtprel_lo12_nc:
-      Type = ELF::R_AARCH64_TLSLD_LDST16_DTPREL_LO12_NC;
-      break;
-    case AArch64::fixup_a64_ldst32_dtprel_lo12:
-      Type = ELF::R_AARCH64_TLSLD_LDST32_DTPREL_LO12;
-      break;
-    case AArch64::fixup_a64_ldst32_dtprel_lo12_nc:
-      Type = ELF::R_AARCH64_TLSLD_LDST32_DTPREL_LO12_NC;
-      break;
-    case AArch64::fixup_a64_ldst64_dtprel_lo12:
-      Type = ELF::R_AARCH64_TLSLD_LDST64_DTPREL_LO12;
-      break;
-    case AArch64::fixup_a64_ldst64_dtprel_lo12_nc:
-      Type = ELF::R_AARCH64_TLSLD_LDST64_DTPREL_LO12_NC;
-      break;
+      report_fatal_error("invalid fixup for add (uimm12) instruction");
+      return 0;
+    case AArch64::fixup_aarch64_ldst_imm12_scale1:
+      if (SymLoc == AArch64MCExpr::VK_ABS && IsNC)
+        return ELF::R_AARCH64_LDST8_ABS_LO12_NC;
+      if (SymLoc == AArch64MCExpr::VK_DTPREL && !IsNC)
+        return ELF::R_AARCH64_TLSLD_LDST8_DTPREL_LO12;
+      if (SymLoc == AArch64MCExpr::VK_DTPREL && IsNC)
+        return ELF::R_AARCH64_TLSLD_LDST8_DTPREL_LO12_NC;
+      if (SymLoc == AArch64MCExpr::VK_TPREL && !IsNC)
+        return ELF::R_AARCH64_TLSLE_LDST8_TPREL_LO12;
+      if (SymLoc == AArch64MCExpr::VK_TPREL && IsNC)
+        return ELF::R_AARCH64_TLSLE_LDST8_TPREL_LO12_NC;
 
-    // TLS initial-exec block
-    case AArch64::fixup_a64_movw_gottprel_g1:
-      Type = ELF::R_AARCH64_TLSIE_MOVW_GOTTPREL_G1;
-      break;
-    case AArch64::fixup_a64_movw_gottprel_g0_nc:
-      Type = ELF::R_AARCH64_TLSIE_MOVW_GOTTPREL_G0_NC;
-      break;
-    case AArch64::fixup_a64_ld64_gottprel_lo12_nc:
-      Type = ELF::R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC;
-      break;
+      report_fatal_error("invalid fixup for 8-bit load/store instruction");
+      return 0;
+    case AArch64::fixup_aarch64_ldst_imm12_scale2:
+      if (SymLoc == AArch64MCExpr::VK_ABS && IsNC)
+        return ELF::R_AARCH64_LDST16_ABS_LO12_NC;
+      if (SymLoc == AArch64MCExpr::VK_DTPREL && !IsNC)
+        return ELF::R_AARCH64_TLSLD_LDST16_DTPREL_LO12;
+      if (SymLoc == AArch64MCExpr::VK_DTPREL && IsNC)
+        return ELF::R_AARCH64_TLSLD_LDST16_DTPREL_LO12_NC;
+      if (SymLoc == AArch64MCExpr::VK_TPREL && !IsNC)
+        return ELF::R_AARCH64_TLSLE_LDST16_TPREL_LO12;
+      if (SymLoc == AArch64MCExpr::VK_TPREL && IsNC)
+        return ELF::R_AARCH64_TLSLE_LDST16_TPREL_LO12_NC;
 
-    // TLS local-exec block
-    case AArch64::fixup_a64_movw_tprel_g2:
-      Type = ELF::R_AARCH64_TLSLE_MOVW_TPREL_G2;
-      break;
-    case AArch64::fixup_a64_movw_tprel_g1:
-      Type = ELF::R_AARCH64_TLSLE_MOVW_TPREL_G1;
-      break;
-    case AArch64::fixup_a64_movw_tprel_g1_nc:
-      Type = ELF::R_AARCH64_TLSLE_MOVW_TPREL_G1_NC;
-      break;
-    case AArch64::fixup_a64_movw_tprel_g0:
-      Type = ELF::R_AARCH64_TLSLE_MOVW_TPREL_G0;
-      break;
-    case AArch64::fixup_a64_movw_tprel_g0_nc:
-      Type = ELF::R_AARCH64_TLSLE_MOVW_TPREL_G0_NC;
-      break;
-    case AArch64::fixup_a64_add_tprel_hi12:
-      Type = ELF::R_AARCH64_TLSLE_ADD_TPREL_HI12;
-      break;
-    case AArch64::fixup_a64_add_tprel_lo12:
-      Type = ELF::R_AARCH64_TLSLE_ADD_TPREL_LO12;
-      break;
-    case AArch64::fixup_a64_add_tprel_lo12_nc:
-      Type = ELF::R_AARCH64_TLSLE_ADD_TPREL_LO12_NC;
-      break;
-    case AArch64::fixup_a64_ldst8_tprel_lo12:
-      Type = ELF::R_AARCH64_TLSLE_LDST8_TPREL_LO12;
-      break;
-    case AArch64::fixup_a64_ldst8_tprel_lo12_nc:
-      Type = ELF::R_AARCH64_TLSLE_LDST8_TPREL_LO12_NC;
-      break;
-    case AArch64::fixup_a64_ldst16_tprel_lo12:
-      Type = ELF::R_AARCH64_TLSLE_LDST16_TPREL_LO12;
-      break;
-    case AArch64::fixup_a64_ldst16_tprel_lo12_nc:
-      Type = ELF::R_AARCH64_TLSLE_LDST16_TPREL_LO12_NC;
-      break;
-    case AArch64::fixup_a64_ldst32_tprel_lo12:
-      Type = ELF::R_AARCH64_TLSLE_LDST32_TPREL_LO12;
-      break;
-    case AArch64::fixup_a64_ldst32_tprel_lo12_nc:
-      Type = ELF::R_AARCH64_TLSLE_LDST32_TPREL_LO12_NC;
-      break;
-    case AArch64::fixup_a64_ldst64_tprel_lo12:
-      Type = ELF::R_AARCH64_TLSLE_LDST64_TPREL_LO12;
-      break;
-    case AArch64::fixup_a64_ldst64_tprel_lo12_nc:
-      Type = ELF::R_AARCH64_TLSLE_LDST64_TPREL_LO12_NC;
-      break;
+      report_fatal_error("invalid fixup for 16-bit load/store instruction");
+      return 0;
+    case AArch64::fixup_aarch64_ldst_imm12_scale4:
+      if (SymLoc == AArch64MCExpr::VK_ABS && IsNC)
+        return ELF::R_AARCH64_LDST32_ABS_LO12_NC;
+      if (SymLoc == AArch64MCExpr::VK_DTPREL && !IsNC)
+        return ELF::R_AARCH64_TLSLD_LDST32_DTPREL_LO12;
+      if (SymLoc == AArch64MCExpr::VK_DTPREL && IsNC)
+        return ELF::R_AARCH64_TLSLD_LDST32_DTPREL_LO12_NC;
+      if (SymLoc == AArch64MCExpr::VK_TPREL && !IsNC)
+        return ELF::R_AARCH64_TLSLE_LDST32_TPREL_LO12;
+      if (SymLoc == AArch64MCExpr::VK_TPREL && IsNC)
+        return ELF::R_AARCH64_TLSLE_LDST32_TPREL_LO12_NC;
 
-    // TLS general-dynamic block
-    case AArch64::fixup_a64_tlsdesc_adr_page:
-      Type = ELF::R_AARCH64_TLSDESC_ADR_PAGE;
-      break;
-    case AArch64::fixup_a64_tlsdesc_ld64_lo12_nc:
-      Type = ELF::R_AARCH64_TLSDESC_LD64_LO12_NC;
-      break;
-    case AArch64::fixup_a64_tlsdesc_add_lo12_nc:
-      Type = ELF::R_AARCH64_TLSDESC_ADD_LO12_NC;
-      break;
-    case AArch64::fixup_a64_tlsdesc_call:
-      Type = ELF::R_AARCH64_TLSDESC_CALL;
-      break;
+      report_fatal_error("invalid fixup for 32-bit load/store instruction");
+      return 0;
+    case AArch64::fixup_aarch64_ldst_imm12_scale8:
+      if (SymLoc == AArch64MCExpr::VK_ABS && IsNC)
+        return ELF::R_AARCH64_LDST64_ABS_LO12_NC;
+      if (SymLoc == AArch64MCExpr::VK_GOT && IsNC)
+        return ELF::R_AARCH64_LD64_GOT_LO12_NC;
+      if (SymLoc == AArch64MCExpr::VK_DTPREL && !IsNC)
+        return ELF::R_AARCH64_TLSLD_LDST64_DTPREL_LO12;
+      if (SymLoc == AArch64MCExpr::VK_DTPREL && IsNC)
+        return ELF::R_AARCH64_TLSLD_LDST64_DTPREL_LO12_NC;
+      if (SymLoc == AArch64MCExpr::VK_TPREL && !IsNC)
+        return ELF::R_AARCH64_TLSLE_LDST64_TPREL_LO12;
+      if (SymLoc == AArch64MCExpr::VK_TPREL && IsNC)
+        return ELF::R_AARCH64_TLSLE_LDST64_TPREL_LO12_NC;
+      if (SymLoc == AArch64MCExpr::VK_GOTTPREL && IsNC)
+        return ELF::R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC;
+      if (SymLoc == AArch64MCExpr::VK_TLSDESC && IsNC)
+        return ELF::R_AARCH64_TLSDESC_LD64_LO12_NC;
+
+      report_fatal_error("invalid fixup for 64-bit load/store instruction");
+      return 0;
+    case AArch64::fixup_aarch64_ldst_imm12_scale16:
+      if (SymLoc == AArch64MCExpr::VK_ABS && IsNC)
+        return ELF::R_AARCH64_LDST128_ABS_LO12_NC;
+
+      report_fatal_error("invalid fixup for 128-bit load/store instruction");
+      return 0;
+    case AArch64::fixup_aarch64_movw:
+      if (RefKind == AArch64MCExpr::VK_ABS_G3)
+        return ELF::R_AARCH64_MOVW_UABS_G3;
+      if (RefKind == AArch64MCExpr::VK_ABS_G2)
+        return ELF::R_AARCH64_MOVW_UABS_G2;
+      if (RefKind == AArch64MCExpr::VK_ABS_G2_S)
+        return ELF::R_AARCH64_MOVW_SABS_G2;
+      if (RefKind == AArch64MCExpr::VK_ABS_G2_NC)
+        return ELF::R_AARCH64_MOVW_UABS_G2_NC;
+      if (RefKind == AArch64MCExpr::VK_ABS_G1)
+        return ELF::R_AARCH64_MOVW_UABS_G1;
+      if (RefKind == AArch64MCExpr::VK_ABS_G1_S)
+        return ELF::R_AARCH64_MOVW_SABS_G1;
+      if (RefKind == AArch64MCExpr::VK_ABS_G1_NC)
+        return ELF::R_AARCH64_MOVW_UABS_G1_NC;
+      if (RefKind == AArch64MCExpr::VK_ABS_G0)
+        return ELF::R_AARCH64_MOVW_UABS_G0;
+      if (RefKind == AArch64MCExpr::VK_ABS_G0_S)
+        return ELF::R_AARCH64_MOVW_SABS_G0;
+      if (RefKind == AArch64MCExpr::VK_ABS_G0_NC)
+        return ELF::R_AARCH64_MOVW_UABS_G0_NC;
+      if (RefKind == AArch64MCExpr::VK_DTPREL_G2)
+        return ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G2;
+      if (RefKind == AArch64MCExpr::VK_DTPREL_G1)
+        return ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G1;
+      if (RefKind == AArch64MCExpr::VK_DTPREL_G1_NC)
+        return ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G1_NC;
+      if (RefKind == AArch64MCExpr::VK_DTPREL_G0)
+        return ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G0;
+      if (RefKind == AArch64MCExpr::VK_DTPREL_G0_NC)
+        return ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G0_NC;
+      if (RefKind == AArch64MCExpr::VK_TPREL_G2)
+        return ELF::R_AARCH64_TLSLE_MOVW_TPREL_G2;
+      if (RefKind == AArch64MCExpr::VK_TPREL_G1)
+        return ELF::R_AARCH64_TLSLE_MOVW_TPREL_G1;
+      if (RefKind == AArch64MCExpr::VK_TPREL_G1_NC)
+        return ELF::R_AARCH64_TLSLE_MOVW_TPREL_G1_NC;
+      if (RefKind == AArch64MCExpr::VK_TPREL_G0)
+        return ELF::R_AARCH64_TLSLE_MOVW_TPREL_G0;
+      if (RefKind == AArch64MCExpr::VK_TPREL_G0_NC)
+        return ELF::R_AARCH64_TLSLE_MOVW_TPREL_G0_NC;
+      if (RefKind == AArch64MCExpr::VK_GOTTPREL_G1)
+        return ELF::R_AARCH64_TLSIE_MOVW_GOTTPREL_G1;
+      if (RefKind == AArch64MCExpr::VK_GOTTPREL_G0_NC)
+        return ELF::R_AARCH64_TLSIE_MOVW_GOTTPREL_G0_NC;
+      report_fatal_error("invalid fixup for movz/movk instruction");
+      return 0;
+    case AArch64::fixup_aarch64_tlsdesc_call:
+      return ELF::R_AARCH64_TLSDESC_CALL;
+    default:
+      llvm_unreachable("Unknown ELF relocation type");
     }
   }
 
-  return Type;
+  llvm_unreachable("Unimplemented fixup -> relocation");
 }
 
 MCObjectWriter *llvm::createAArch64ELFObjectWriter(raw_ostream &OS,
-                                                   uint8_t OSABI,
-                                                   bool IsLittleEndian) {
-  MCELFObjectTargetWriter *MOTW = new AArch64ELFObjectWriter(OSABI, IsLittleEndian);
-  return createELFObjectWriter(MOTW, OS,  IsLittleEndian);
+                                                 uint8_t OSABI,
+                                                 bool IsLittleEndian) {
+  MCELFObjectTargetWriter *MOTW =
+      new AArch64ELFObjectWriter(OSABI, IsLittleEndian);
+  return createELFObjectWriter(MOTW, OS, IsLittleEndian);
 }
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
index 473b7dd..a79406d 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
@@ -56,14 +56,14 @@ namespace {
 class AArch64ELFStreamer : public MCELFStreamer {
 public:
   AArch64ELFStreamer(MCContext &Context, MCAsmBackend &TAB, raw_ostream &OS,
-                     MCCodeEmitter *Emitter)
+                   MCCodeEmitter *Emitter)
       : MCELFStreamer(Context, TAB, OS, Emitter), MappingSymbolCounter(0),
         LastEMS(EMS_None) {}
 
   ~AArch64ELFStreamer() {}
 
-  virtual void ChangeSection(const MCSection *Section,
-                             const MCExpr *Subsection) {
+  void ChangeSection(const MCSection *Section,
+                     const MCExpr *Subsection) override {
     // We have to keep track of the mapping symbol state of any sections we
     // use. Each one should start off as EMS_None, which is provided as the
     // default constructor by DenseMap::lookup.
@@ -76,7 +76,8 @@ public:
   /// This function is the one used to emit instruction data into the ELF
   /// streamer. We override it to add the appropriate mapping symbol if
   /// necessary.
-  virtual void EmitInstruction(const MCInst& Inst, const MCSubtargetInfo &STI) {
+  void EmitInstruction(const MCInst &Inst,
+                       const MCSubtargetInfo &STI) override {
     EmitA64MappingSymbol();
     MCELFStreamer::EmitInstruction(Inst, STI);
   }
@@ -84,7 +85,7 @@ public:
   /// This is one of the functions used to emit data into an ELF section, so the
   /// AArch64 streamer overrides it to add the appropriate mapping symbol ($d)
   /// if necessary.
-  virtual void EmitBytes(StringRef Data) {
+  void EmitBytes(StringRef Data) override {
     EmitDataMappingSymbol();
     MCELFStreamer::EmitBytes(Data);
   }
@@ -92,7 +93,8 @@ public:
   /// This is one of the functions used to emit data into an ELF section, so the
   /// AArch64 streamer overrides it to add the appropriate mapping symbol ($d)
   /// if necessary.
-  virtual void EmitValueImpl(const MCExpr *Value, unsigned Size) {
+  void EmitValueImpl(const MCExpr *Value, unsigned Size,
+                     const SMLoc &Loc) override {
     EmitDataMappingSymbol();
     MCELFStreamer::EmitValueImpl(Value, Size);
   }
@@ -105,13 +107,15 @@ private:
   };
 
   void EmitDataMappingSymbol() {
-    if (LastEMS == EMS_Data) return;
+    if (LastEMS == EMS_Data)
+      return;
     EmitMappingSymbol("$d");
     LastEMS = EMS_Data;
   }
 
   void EmitA64MappingSymbol() {
-    if (LastEMS == EMS_A64) return;
+    if (LastEMS == EMS_A64)
+      return;
     EmitMappingSymbol("$x");
     LastEMS = EMS_A64;
   }
@@ -120,15 +124,14 @@ private:
     MCSymbol *Start = getContext().CreateTempSymbol();
     EmitLabel(Start);
 
-    MCSymbol *Symbol =
-      getContext().GetOrCreateSymbol(Name + "." +
-                                     Twine(MappingSymbolCounter++));
+    MCSymbol *Symbol = getContext().GetOrCreateSymbol(
+        Name + "." + Twine(MappingSymbolCounter++));
 
     MCSymbolData &SD = getAssembler().getOrCreateSymbolData(*Symbol);
     MCELF::SetType(SD, ELF::STT_NOTYPE);
     MCELF::SetBinding(SD, ELF::STB_LOCAL);
     SD.setExternal(false);
-    AssignSection(Symbol, getCurrentSection().first);
+    Symbol->setSection(*getCurrentSection().first);
 
     const MCExpr *Value = MCSymbolRefExpr::Create(Start, getContext());
     Symbol->setVariableValue(Value);
@@ -144,16 +147,14 @@ private:
 }
 
 namespace llvm {
-  MCELFStreamer* createAArch64ELFStreamer(MCContext &Context, MCAsmBackend &TAB,
-                                      raw_ostream &OS, MCCodeEmitter *Emitter,
-                                      bool RelaxAll, bool NoExecStack) {
-    AArch64ELFStreamer *S = new AArch64ELFStreamer(Context, TAB, OS, Emitter);
-    if (RelaxAll)
-      S->getAssembler().setRelaxAll(true);
-    if (NoExecStack)
-      S->getAssembler().setNoExecStack(true);
-    return S;
-  }
+MCELFStreamer *createAArch64ELFStreamer(MCContext &Context, MCAsmBackend &TAB,
+                                        raw_ostream &OS, MCCodeEmitter *Emitter,
+                                        bool RelaxAll, bool NoExecStack) {
+  AArch64ELFStreamer *S = new AArch64ELFStreamer(Context, TAB, OS, Emitter);
+  if (RelaxAll)
+    S->getAssembler().setRelaxAll(true);
+  if (NoExecStack)
+    S->getAssembler().setNoExecStack(true);
+  return S;
+}
 }
-
-
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.h b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.h
index 5a89ca5..bc6973b 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.h
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.h
@@ -18,10 +18,9 @@
 
 namespace llvm {
 
-  MCELFStreamer* createAArch64ELFStreamer(MCContext &Context, MCAsmBackend &TAB,
-                                          raw_ostream &OS,
-                                          MCCodeEmitter *Emitter,
-                                          bool RelaxAll, bool NoExecStack);
+MCELFStreamer *createAArch64ELFStreamer(MCContext &Context, MCAsmBackend &TAB,
+                                        raw_ostream &OS, MCCodeEmitter *Emitter,
+                                        bool RelaxAll, bool NoExecStack);
 }
 
 #endif // AArch64_ELF_STREAMER_H
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h b/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h
index eeb122d..bf405fb 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64FixupKinds.h
@@ -1,4 +1,4 @@
-//=- AArch64/AArch64FixupKinds.h - AArch64 Specific Fixup Entries -*- C++ -*-=//
+//===-- AArch64FixupKinds.h - AArch64 Specific Fixup Entries ----*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -6,108 +6,71 @@
 // License. See LICENSE.TXT for details.
 //
 //===----------------------------------------------------------------------===//
-//
-// This file describes the LLVM fixups applied to MCInsts in the AArch64
-// backend.
-//
-//===----------------------------------------------------------------------===//
 
-#ifndef LLVM_AARCH64_AARCH64FIXUPKINDS_H
-#define LLVM_AARCH64_AARCH64FIXUPKINDS_H
+#ifndef LLVM_AArch64FIXUPKINDS_H
+#define LLVM_AArch64FIXUPKINDS_H
 
 #include "llvm/MC/MCFixup.h"
 
 namespace llvm {
-  namespace AArch64 {
-    enum Fixups {
-      fixup_a64_ld_prel = FirstTargetFixupKind,
-      fixup_a64_adr_prel,
-      fixup_a64_adr_prel_page,
-
-      fixup_a64_add_lo12,
-
-      fixup_a64_ldst8_lo12,
-      fixup_a64_ldst16_lo12,
-      fixup_a64_ldst32_lo12,
-      fixup_a64_ldst64_lo12,
-      fixup_a64_ldst128_lo12,
-
-      fixup_a64_tstbr,
-      fixup_a64_condbr,
-      fixup_a64_uncondbr,
-      fixup_a64_call,
-
-      fixup_a64_movw_uabs_g0,
-      fixup_a64_movw_uabs_g0_nc,
-      fixup_a64_movw_uabs_g1,
-      fixup_a64_movw_uabs_g1_nc,
-      fixup_a64_movw_uabs_g2,
-      fixup_a64_movw_uabs_g2_nc,
-      fixup_a64_movw_uabs_g3,
-
-      fixup_a64_movw_sabs_g0,
-      fixup_a64_movw_sabs_g1,
-      fixup_a64_movw_sabs_g2,
-
-      fixup_a64_adr_prel_got_page,
-      fixup_a64_ld64_got_lo12_nc,
-
-      // Produce offsets relative to the module's dynamic TLS area.
-      fixup_a64_movw_dtprel_g2,
-      fixup_a64_movw_dtprel_g1,
-      fixup_a64_movw_dtprel_g1_nc,
-      fixup_a64_movw_dtprel_g0,
-      fixup_a64_movw_dtprel_g0_nc,
-      fixup_a64_add_dtprel_hi12,
-      fixup_a64_add_dtprel_lo12,
-      fixup_a64_add_dtprel_lo12_nc,
-      fixup_a64_ldst8_dtprel_lo12,
-      fixup_a64_ldst8_dtprel_lo12_nc,
-      fixup_a64_ldst16_dtprel_lo12,
-      fixup_a64_ldst16_dtprel_lo12_nc,
-      fixup_a64_ldst32_dtprel_lo12,
-      fixup_a64_ldst32_dtprel_lo12_nc,
-      fixup_a64_ldst64_dtprel_lo12,
-      fixup_a64_ldst64_dtprel_lo12_nc,
-
-      // Produce the GOT entry containing a variable's address in TLS's
-      // initial-exec mode.
-      fixup_a64_movw_gottprel_g1,
-      fixup_a64_movw_gottprel_g0_nc,
-      fixup_a64_adr_gottprel_page,
-      fixup_a64_ld64_gottprel_lo12_nc,
-      fixup_a64_ld_gottprel_prel19,
-
-      // Produce offsets relative to the thread pointer: TPIDR_EL0.
-      fixup_a64_movw_tprel_g2,
-      fixup_a64_movw_tprel_g1,
-      fixup_a64_movw_tprel_g1_nc,
-      fixup_a64_movw_tprel_g0,
-      fixup_a64_movw_tprel_g0_nc,
-      fixup_a64_add_tprel_hi12,
-      fixup_a64_add_tprel_lo12,
-      fixup_a64_add_tprel_lo12_nc,
-      fixup_a64_ldst8_tprel_lo12,
-      fixup_a64_ldst8_tprel_lo12_nc,
-      fixup_a64_ldst16_tprel_lo12,
-      fixup_a64_ldst16_tprel_lo12_nc,
-      fixup_a64_ldst32_tprel_lo12,
-      fixup_a64_ldst32_tprel_lo12_nc,
-      fixup_a64_ldst64_tprel_lo12,
-      fixup_a64_ldst64_tprel_lo12_nc,
-
-      // Produce the special fixups used by the general-dynamic TLS model.
-      fixup_a64_tlsdesc_adr_page,
-      fixup_a64_tlsdesc_ld64_lo12_nc,
-      fixup_a64_tlsdesc_add_lo12_nc,
-      fixup_a64_tlsdesc_call,
-
-
-      // Marker
-      LastTargetFixupKind,
-      NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind
-    };
-  }
-}
+namespace AArch64 {
+
+enum Fixups {
+  // fixup_aarch64_pcrel_adr_imm21 - A 21-bit pc-relative immediate inserted into
+  // an ADR instruction.
+  fixup_aarch64_pcrel_adr_imm21 = FirstTargetFixupKind,
+
+  // fixup_aarch64_pcrel_adrp_imm21 - A 21-bit pc-relative immediate inserted into
+  // an ADRP instruction.
+  fixup_aarch64_pcrel_adrp_imm21,
+
+  // fixup_aarch64_imm12 - 12-bit fixup for add/sub instructions.
+  //     No alignment adjustment. All value bits are encoded.
+  fixup_aarch64_add_imm12,
+
+  // fixup_aarch64_ldst_imm12_* - unsigned 12-bit fixups for load and
+  // store instructions.
+  fixup_aarch64_ldst_imm12_scale1,
+  fixup_aarch64_ldst_imm12_scale2,
+  fixup_aarch64_ldst_imm12_scale4,
+  fixup_aarch64_ldst_imm12_scale8,
+  fixup_aarch64_ldst_imm12_scale16,
+
+  // fixup_aarch64_ldr_pcrel_imm19 - The high 19 bits of a 21-bit pc-relative
+  // immediate. Same encoding as fixup_aarch64_pcrel_adrhi, except this is used by
+  // pc-relative loads and generates relocations directly when necessary.
+  fixup_aarch64_ldr_pcrel_imm19,
+
+  // FIXME: comment
+  fixup_aarch64_movw,
+
+  // fixup_aarch64_pcrel_imm14 - The high 14 bits of a 21-bit pc-relative
+  // immediate.
+  fixup_aarch64_pcrel_branch14,
+
+  // fixup_aarch64_pcrel_branch19 - The high 19 bits of a 21-bit pc-relative
+  // immediate. Same encoding as fixup_aarch64_pcrel_adrhi, except this is use by
+  // b.cc and generates relocations directly when necessary.
+  fixup_aarch64_pcrel_branch19,
+
+  // fixup_aarch64_pcrel_branch26 - The high 26 bits of a 28-bit pc-relative
+  // immediate.
+  fixup_aarch64_pcrel_branch26,
+
+  // fixup_aarch64_pcrel_call26 - The high 26 bits of a 28-bit pc-relative
+  // immediate. Distinguished from branch26 only on ELF.
+  fixup_aarch64_pcrel_call26,
+
+  // fixup_aarch64_tlsdesc_call - zero-space placeholder for the ELF
+  // R_AARCH64_TLSDESC_CALL relocation.
+  fixup_aarch64_tlsdesc_call,
+
+  // Marker
+  LastTargetFixupKind,
+  NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind
+};
+
+} // end namespace AArch64
+} // end namespace llvm
 
 #endif
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
index b090a55..dc4a8bf 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
@@ -13,26 +13,82 @@
 
 #include "AArch64MCAsmInfo.h"
 #include "llvm/ADT/Triple.h"
-
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/CommandLine.h"
 using namespace llvm;
 
-AArch64ELFMCAsmInfo::AArch64ELFMCAsmInfo(StringRef TT) {
-  Triple TheTriple(TT);
-  if (TheTriple.getArch() == Triple::aarch64_be)
+enum AsmWriterVariantTy {
+  Default = -1,
+  Generic = 0,
+  Apple = 1
+};
+
+static cl::opt<AsmWriterVariantTy> AsmWriterVariant(
+    "aarch64-neon-syntax", cl::init(Default),
+    cl::desc("Choose style of NEON code to emit from AArch64 backend:"),
+    cl::values(clEnumValN(Generic, "generic", "Emit generic NEON assembly"),
+               clEnumValN(Apple, "apple", "Emit Apple-style NEON assembly"),
+               clEnumValEnd));
+
+AArch64MCAsmInfoDarwin::AArch64MCAsmInfoDarwin() {
+  // We prefer NEON instructions to be printed in the short form.
+  AssemblerDialect = AsmWriterVariant == Default ? 1 : AsmWriterVariant;
+
+  PrivateGlobalPrefix = "L";
+  SeparatorString = "%%";
+  CommentString = ";";
+  PointerSize = CalleeSaveStackSlotSize = 8;
+
+  AlignmentIsInBytes = false;
+  UsesELFSectionDirectiveForBSS = true;
+  SupportsDebugInformation = true;
+  UseDataRegionDirectives = true;
+
+  ExceptionsType = ExceptionHandling::DwarfCFI;
+}
+
+const MCExpr *AArch64MCAsmInfoDarwin::getExprForPersonalitySymbol(
+    const MCSymbol *Sym, unsigned Encoding, MCStreamer &Streamer) const {
+  // On Darwin, we can reference dwarf symbols with foo@GOT-., which
+  // is an indirect pc-relative reference. The default implementation
+  // won't reference using the GOT, so we need this target-specific
+  // version.
+  MCContext &Context = Streamer.getContext();
+  const MCExpr *Res =
+      MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_GOT, Context);
+  MCSymbol *PCSym = Context.CreateTempSymbol();
+  Streamer.EmitLabel(PCSym);
+  const MCExpr *PC = MCSymbolRefExpr::Create(PCSym, Context);
+  return MCBinaryExpr::CreateSub(Res, PC, Context);
+}
+
+AArch64MCAsmInfoELF::AArch64MCAsmInfoELF(StringRef TT) {
+  Triple T(TT);
+  if (T.getArch() == Triple::arm64_be || T.getArch() == Triple::aarch64_be)
     IsLittleEndian = false;
 
+  // We prefer NEON instructions to be printed in the short form.
+  AssemblerDialect = AsmWriterVariant == Default ? 0 : AsmWriterVariant;
+
   PointerSize = 8;
 
   // ".comm align is in bytes but .align is pow-2."
   AlignmentIsInBytes = false;
 
   CommentString = "//";
+  PrivateGlobalPrefix = ".L";
   Code32Directive = ".code\t32";
 
   Data16bitsDirective = "\t.hword\t";
   Data32bitsDirective = "\t.word\t";
   Data64bitsDirective = "\t.xword\t";
 
+  UseDataRegionDirectives = false;
+
+  WeakRefDirective = "\t.weak\t";
+
   HasLEB128 = true;
   SupportsDebugInformation = true;
 
@@ -41,6 +97,3 @@ AArch64ELFMCAsmInfo::AArch64ELFMCAsmInfo(StringRef TT) {
 
   UseIntegratedAssembler = true;
 }
-
-// Pin the vtable to this file.
-void AArch64ELFMCAsmInfo::anchor() {}
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h
index 43c0e47..42a031d 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.h
@@ -1,4 +1,4 @@
-//==-- AArch64MCAsmInfo.h - AArch64 asm properties -------------*- C++ -*--===//
+//=====-- AArch64MCAsmInfo.h - AArch64 asm properties ---------*- C++ -*--====//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -11,17 +11,24 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_AARCH64TARGETASMINFO_H
-#define LLVM_AARCH64TARGETASMINFO_H
+#ifndef AArch64TARGETASMINFO_H
+#define AArch64TARGETASMINFO_H
 
-#include "llvm/MC/MCAsmInfoELF.h"
+#include "llvm/MC/MCAsmInfoDarwin.h"
 
 namespace llvm {
+class Target;
+class StringRef;
+class MCStreamer;
+struct AArch64MCAsmInfoDarwin : public MCAsmInfoDarwin {
+  explicit AArch64MCAsmInfoDarwin();
+  const MCExpr *
+  getExprForPersonalitySymbol(const MCSymbol *Sym, unsigned Encoding,
+                              MCStreamer &Streamer) const override;
+};
 
-struct AArch64ELFMCAsmInfo : public MCAsmInfoELF {
-  explicit AArch64ELFMCAsmInfo(StringRef TT);
-private:
-  virtual void anchor();
+struct AArch64MCAsmInfoELF : public MCAsmInfo {
+  explicit AArch64MCAsmInfoELF(StringRef TT);
 };
 
 } // namespace llvm
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
index b9a61ef..464a18c 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
@@ -1,4 +1,4 @@
-//=- AArch64/AArch64MCCodeEmitter.cpp - Convert AArch64 code to machine code =//
+//=- AArch64/AArch64MCCodeEmitter.cpp - Convert AArch64 code to machine code-=//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -11,10 +11,9 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "mccodeemitter"
+#include "MCTargetDesc/AArch64AddressingModes.h"
 #include "MCTargetDesc/AArch64FixupKinds.h"
 #include "MCTargetDesc/AArch64MCExpr.h"
-#include "MCTargetDesc/AArch64MCTargetDesc.h"
 #include "Utils/AArch64BaseInfo.h"
 #include "llvm/MC/MCCodeEmitter.h"
 #include "llvm/MC/MCContext.h"
@@ -22,524 +21,562 @@
 #include "llvm/MC/MCInstrInfo.h"
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/Support/ErrorHandling.h"
+#include "llvm/ADT/Statistic.h"
 #include "llvm/Support/raw_ostream.h"
-
 using namespace llvm;
 
+#define DEBUG_TYPE "mccodeemitter"
+
+STATISTIC(MCNumEmitted, "Number of MC instructions emitted.");
+STATISTIC(MCNumFixups, "Number of MC fixups created.");
+
 namespace {
+
 class AArch64MCCodeEmitter : public MCCodeEmitter {
-  AArch64MCCodeEmitter(const AArch64MCCodeEmitter &) LLVM_DELETED_FUNCTION;
-  void operator=(const AArch64MCCodeEmitter &) LLVM_DELETED_FUNCTION;
   MCContext &Ctx;
 
+  AArch64MCCodeEmitter(const AArch64MCCodeEmitter &); // DO NOT IMPLEMENT
+  void operator=(const AArch64MCCodeEmitter &);     // DO NOT IMPLEMENT
 public:
-  AArch64MCCodeEmitter(MCContext &ctx) : Ctx(ctx) {}
+  AArch64MCCodeEmitter(const MCInstrInfo &mcii, const MCSubtargetInfo &sti,
+                     MCContext &ctx)
+      : Ctx(ctx) {}
 
   ~AArch64MCCodeEmitter() {}
 
-  unsigned getAddSubImmOpValue(const MCInst &MI, unsigned OpIdx,
-                               SmallVectorImpl<MCFixup> &Fixups,
-                               const MCSubtargetInfo &STI) const;
+  // getBinaryCodeForInstr - TableGen'erated function for getting the
+  // binary encoding for an instruction.
+  uint64_t getBinaryCodeForInstr(const MCInst &MI,
+                                 SmallVectorImpl<MCFixup> &Fixups,
+                                 const MCSubtargetInfo &STI) const;
+
+  /// getMachineOpValue - Return binary encoding of operand. If the machine
+  /// operand requires relocation, record the relocation and return zero.
+  unsigned getMachineOpValue(const MCInst &MI, const MCOperand &MO,
+                             SmallVectorImpl<MCFixup> &Fixups,
+                             const MCSubtargetInfo &STI) const;
 
-  unsigned getAdrpLabelOpValue(const MCInst &MI, unsigned OpIdx,
+  /// getLdStUImm12OpValue - Return encoding info for 12-bit unsigned immediate
+  /// attached to a load, store or prfm instruction. If operand requires a
+  /// relocation, record it and return zero in that part of the encoding.
+  template <uint32_t FixupKind>
+  uint32_t getLdStUImm12OpValue(const MCInst &MI, unsigned OpIdx,
+                                SmallVectorImpl<MCFixup> &Fixups,
+                                const MCSubtargetInfo &STI) const;
+
+  /// getAdrLabelOpValue - Return encoding info for 21-bit immediate ADR label
+  /// target.
+  uint32_t getAdrLabelOpValue(const MCInst &MI, unsigned OpIdx,
+                              SmallVectorImpl<MCFixup> &Fixups,
+                              const MCSubtargetInfo &STI) const;
+
+  /// getAddSubImmOpValue - Return encoding for the 12-bit immediate value and
+  /// the 2-bit shift field.
+  uint32_t getAddSubImmOpValue(const MCInst &MI, unsigned OpIdx,
                                SmallVectorImpl<MCFixup> &Fixups,
                                const MCSubtargetInfo &STI) const;
 
-  template<int MemSize>
-  unsigned getOffsetUImm12OpValue(const MCInst &MI, unsigned OpIdx,
-                                    SmallVectorImpl<MCFixup> &Fixups,
-                                    const MCSubtargetInfo &STI) const {
-    return getOffsetUImm12OpValue(MI, OpIdx, Fixups, STI, MemSize);
-  }
+  /// getCondBranchTargetOpValue - Return the encoded value for a conditional
+  /// branch target.
+  uint32_t getCondBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                                      SmallVectorImpl<MCFixup> &Fixups,
+                                      const MCSubtargetInfo &STI) const;
 
-  unsigned getOffsetUImm12OpValue(const MCInst &MI, unsigned OpIdx,
-                                    SmallVectorImpl<MCFixup> &Fixups,
-                                    const MCSubtargetInfo &STI,
-                                    int MemSize) const;
+  /// getLoadLiteralOpValue - Return the encoded value for a load-literal
+  /// pc-relative address.
+  uint32_t getLoadLiteralOpValue(const MCInst &MI, unsigned OpIdx,
+                                 SmallVectorImpl<MCFixup> &Fixups,
+                                 const MCSubtargetInfo &STI) const;
 
-  unsigned getBitfield32LSLOpValue(const MCInst &MI, unsigned OpIdx,
-                                   SmallVectorImpl<MCFixup> &Fixups,
-                                   const MCSubtargetInfo &STI) const;
-  unsigned getBitfield64LSLOpValue(const MCInst &MI, unsigned OpIdx,
-                                   SmallVectorImpl<MCFixup> &Fixups,
-                                   const MCSubtargetInfo &STI) const;
+  /// getMemExtendOpValue - Return the encoded value for a reg-extend load/store
+  /// instruction: bit 0 is whether a shift is present, bit 1 is whether the
+  /// operation is a sign extend (as opposed to a zero extend).
+  uint32_t getMemExtendOpValue(const MCInst &MI, unsigned OpIdx,
+                               SmallVectorImpl<MCFixup> &Fixups,
+                               const MCSubtargetInfo &STI) const;
 
-  unsigned getShiftRightImm8(const MCInst &MI, unsigned Op,
-                             SmallVectorImpl<MCFixup> &Fixups,
-                             const MCSubtargetInfo &STI) const;
-  unsigned getShiftRightImm16(const MCInst &MI, unsigned Op,
-                              SmallVectorImpl<MCFixup> &Fixups,
-                              const MCSubtargetInfo &STI) const;
-  unsigned getShiftRightImm32(const MCInst &MI, unsigned Op,
-                              SmallVectorImpl<MCFixup> &Fixups,
-                              const MCSubtargetInfo &STI) const;
-  unsigned getShiftRightImm64(const MCInst &MI, unsigned Op,
-                              SmallVectorImpl<MCFixup> &Fixups,
-                              const MCSubtargetInfo &STI) const;
+  /// getTestBranchTargetOpValue - Return the encoded value for a test-bit-and-
+  /// branch target.
+  uint32_t getTestBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                                      SmallVectorImpl<MCFixup> &Fixups,
+                                      const MCSubtargetInfo &STI) const;
 
-  unsigned getShiftLeftImm8(const MCInst &MI, unsigned Op,
-                            SmallVectorImpl<MCFixup> &Fixups,
-                            const MCSubtargetInfo &STI) const;
-  unsigned getShiftLeftImm16(const MCInst &MI, unsigned Op,
-                             SmallVectorImpl<MCFixup> &Fixups,
-                             const MCSubtargetInfo &STI) const;
-  unsigned getShiftLeftImm32(const MCInst &MI, unsigned Op,
-                             SmallVectorImpl<MCFixup> &Fixups,
-                             const MCSubtargetInfo &STI) const;
-  unsigned getShiftLeftImm64(const MCInst &MI, unsigned Op,
-                             SmallVectorImpl<MCFixup> &Fixups,
-                             const MCSubtargetInfo &STI) const;
+  /// getBranchTargetOpValue - Return the encoded value for an unconditional
+  /// branch target.
+  uint32_t getBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                                  SmallVectorImpl<MCFixup> &Fixups,
+                                  const MCSubtargetInfo &STI) const;
 
-  // Labels are handled mostly the same way: a symbol is needed, and
-  // just gets some fixup attached.
-  template<AArch64::Fixups fixupDesired>
-  unsigned getLabelOpValue(const MCInst &MI, unsigned OpIdx,
-                           SmallVectorImpl<MCFixup> &Fixups,
-                           const MCSubtargetInfo &STI) const;
+  /// getMoveWideImmOpValue - Return the encoded value for the immediate operand
+  /// of a MOVZ or MOVK instruction.
+  uint32_t getMoveWideImmOpValue(const MCInst &MI, unsigned OpIdx,
+                                 SmallVectorImpl<MCFixup> &Fixups,
+                                 const MCSubtargetInfo &STI) const;
 
-  unsigned  getLoadLitLabelOpValue(const MCInst &MI, unsigned OpIdx,
-                                   SmallVectorImpl<MCFixup> &Fixups,
-                                   const MCSubtargetInfo &STI) const;
+  /// getVecShifterOpValue - Return the encoded value for the vector shifter.
+  uint32_t getVecShifterOpValue(const MCInst &MI, unsigned OpIdx,
+                                SmallVectorImpl<MCFixup> &Fixups,
+                                const MCSubtargetInfo &STI) const;
+
+  /// getMoveVecShifterOpValue - Return the encoded value for the vector move
+  /// shifter (MSL).
+  uint32_t getMoveVecShifterOpValue(const MCInst &MI, unsigned OpIdx,
+                                    SmallVectorImpl<MCFixup> &Fixups,
+                                    const MCSubtargetInfo &STI) const;
 
+  /// getFixedPointScaleOpValue - Return the encoded value for the
+  // FP-to-fixed-point scale factor.
+  uint32_t getFixedPointScaleOpValue(const MCInst &MI, unsigned OpIdx,
+                                     SmallVectorImpl<MCFixup> &Fixups,
+                                     const MCSubtargetInfo &STI) const;
 
-  unsigned getMoveWideImmOpValue(const MCInst &MI, unsigned OpIdx,
+  uint32_t getVecShiftR64OpValue(const MCInst &MI, unsigned OpIdx,
                                  SmallVectorImpl<MCFixup> &Fixups,
                                  const MCSubtargetInfo &STI) const;
+  uint32_t getVecShiftR32OpValue(const MCInst &MI, unsigned OpIdx,
+                                 SmallVectorImpl<MCFixup> &Fixups,
+                                 const MCSubtargetInfo &STI) const;
+  uint32_t getVecShiftR16OpValue(const MCInst &MI, unsigned OpIdx,
+                                 SmallVectorImpl<MCFixup> &Fixups,
+                                 const MCSubtargetInfo &STI) const;
+  uint32_t getVecShiftR8OpValue(const MCInst &MI, unsigned OpIdx,
+                                SmallVectorImpl<MCFixup> &Fixups,
+                                const MCSubtargetInfo &STI) const;
+  uint32_t getVecShiftL64OpValue(const MCInst &MI, unsigned OpIdx,
+                                 SmallVectorImpl<MCFixup> &Fixups,
+                                 const MCSubtargetInfo &STI) const;
+  uint32_t getVecShiftL32OpValue(const MCInst &MI, unsigned OpIdx,
+                                 SmallVectorImpl<MCFixup> &Fixups,
+                                 const MCSubtargetInfo &STI) const;
+  uint32_t getVecShiftL16OpValue(const MCInst &MI, unsigned OpIdx,
+                                 SmallVectorImpl<MCFixup> &Fixups,
+                                 const MCSubtargetInfo &STI) const;
+  uint32_t getVecShiftL8OpValue(const MCInst &MI, unsigned OpIdx,
+                                SmallVectorImpl<MCFixup> &Fixups,
+                                const MCSubtargetInfo &STI) const;
 
+  /// getSIMDShift64OpValue - Return the encoded value for the
+  // shift-by-immediate AdvSIMD instructions.
+  uint32_t getSIMDShift64OpValue(const MCInst &MI, unsigned OpIdx,
+                                 SmallVectorImpl<MCFixup> &Fixups,
+                                 const MCSubtargetInfo &STI) const;
 
-  unsigned getAddressWithFixup(const MCOperand &MO,
-                               unsigned FixupKind,
-                               SmallVectorImpl<MCFixup> &Fixups,
-                               const MCSubtargetInfo &STI) const;
-
+  uint32_t getSIMDShift64_32OpValue(const MCInst &MI, unsigned OpIdx,
+                                    SmallVectorImpl<MCFixup> &Fixups,
+                                    const MCSubtargetInfo &STI) const;
 
-  // getBinaryCodeForInstr - TableGen'erated function for getting the
-  // binary encoding for an instruction.
-  uint64_t getBinaryCodeForInstr(const MCInst &MI,
+  uint32_t getSIMDShift32OpValue(const MCInst &MI, unsigned OpIdx,
                                  SmallVectorImpl<MCFixup> &Fixups,
                                  const MCSubtargetInfo &STI) const;
 
-  /// getMachineOpValue - Return binary encoding of operand. If the machine
-  /// operand requires relocation, record the relocation and return zero.
-  unsigned getMachineOpValue(const MCInst &MI,const MCOperand &MO,
-                             SmallVectorImpl<MCFixup> &Fixups,
-                             const MCSubtargetInfo &STI) const;
+  uint32_t getSIMDShift16OpValue(const MCInst &MI, unsigned OpIdx,
+                                 SmallVectorImpl<MCFixup> &Fixups,
+                                 const MCSubtargetInfo &STI) const;
 
+  unsigned fixMOVZ(const MCInst &MI, unsigned EncodedValue,
+                   const MCSubtargetInfo &STI) const;
 
-  void EmitByte(unsigned char C, raw_ostream &OS) const {
-    OS << (char)C;
-  }
+  void EmitByte(unsigned char C, raw_ostream &OS) const { OS << (char)C; }
 
-  void EmitInstruction(uint32_t Val, raw_ostream &OS) const {
+  void EmitConstant(uint64_t Val, unsigned Size, raw_ostream &OS) const {
     // Output the constant in little endian byte order.
-    for (unsigned i = 0; i != 4; ++i) {
-      EmitByte(Val & 0xff, OS);
+    for (unsigned i = 0; i != Size; ++i) {
+      EmitByte(Val & 255, OS);
       Val >>= 8;
     }
   }
 
-
   void EncodeInstruction(const MCInst &MI, raw_ostream &OS,
                          SmallVectorImpl<MCFixup> &Fixups,
-                         const MCSubtargetInfo &STI) const;
-
-  template<int hasRs, int hasRt2> unsigned
-  fixLoadStoreExclusive(const MCInst &MI, unsigned EncodedValue,
-                        const MCSubtargetInfo &STI) const;
-
-  unsigned fixMOVZ(const MCInst &MI, unsigned EncodedValue,
-                   const MCSubtargetInfo &STI) const;
+                         const MCSubtargetInfo &STI) const override;
 
   unsigned fixMulHigh(const MCInst &MI, unsigned EncodedValue,
                       const MCSubtargetInfo &STI) const;
 
+  template<int hasRs, int hasRt2> unsigned
+  fixLoadStoreExclusive(const MCInst &MI, unsigned EncodedValue,
+                        const MCSubtargetInfo &STI) const;
 
+  unsigned fixOneOperandFPComparison(const MCInst &MI, unsigned EncodedValue,
+                                     const MCSubtargetInfo &STI) const;
 };
 
 } // end anonymous namespace
 
-unsigned AArch64MCCodeEmitter::getAddressWithFixup(const MCOperand &MO,
-                                       unsigned FixupKind,
-                                       SmallVectorImpl<MCFixup> &Fixups,
-                                       const MCSubtargetInfo &STI) const {
-  if (!MO.isExpr()) {
-    // This can occur for manually decoded or constructed MCInsts, but neither
-    // the assembly-parser nor instruction selection will currently produce an
-    // MCInst that's not a symbol reference.
-    assert(MO.isImm() && "Unexpected address requested");
-    return MO.getImm();
-  }
+MCCodeEmitter *llvm::createAArch64MCCodeEmitter(const MCInstrInfo &MCII,
+                                                const MCRegisterInfo &MRI,
+                                                const MCSubtargetInfo &STI,
+                                                MCContext &Ctx) {
+  return new AArch64MCCodeEmitter(MCII, STI, Ctx);
+}
 
-  const MCExpr *Expr = MO.getExpr();
-  MCFixupKind Kind = MCFixupKind(FixupKind);
-  Fixups.push_back(MCFixup::Create(0, Expr, Kind));
+/// getMachineOpValue - Return binary encoding of operand. If the machine
+/// operand requires relocation, record the relocation and return zero.
+unsigned
+AArch64MCCodeEmitter::getMachineOpValue(const MCInst &MI, const MCOperand &MO,
+                                        SmallVectorImpl<MCFixup> &Fixups,
+                                        const MCSubtargetInfo &STI) const {
+  if (MO.isReg())
+    return Ctx.getRegisterInfo()->getEncodingValue(MO.getReg());
+  else {
+    assert(MO.isImm() && "did not expect relocated expression");
+    return static_cast<unsigned>(MO.getImm());
+  }
 
+  assert(0 && "Unable to encode MCOperand!");
   return 0;
 }
 
-unsigned AArch64MCCodeEmitter::
-getOffsetUImm12OpValue(const MCInst &MI, unsigned OpIdx,
-                       SmallVectorImpl<MCFixup> &Fixups,
-                       const MCSubtargetInfo &STI,
-                       int MemSize) const {
-  const MCOperand &ImmOp = MI.getOperand(OpIdx);
-  if (ImmOp.isImm())
-    return ImmOp.getImm();
-
-  assert(ImmOp.isExpr() && "Unexpected operand type");
-  const AArch64MCExpr *Expr = cast<AArch64MCExpr>(ImmOp.getExpr());
-  unsigned FixupKind;
-
-
-  switch (Expr->getKind()) {
-  default: llvm_unreachable("Unexpected operand modifier");
-  case AArch64MCExpr::VK_AARCH64_LO12: {
-    static const unsigned FixupsBySize[] = { AArch64::fixup_a64_ldst8_lo12,
-                                             AArch64::fixup_a64_ldst16_lo12,
-                                             AArch64::fixup_a64_ldst32_lo12,
-                                             AArch64::fixup_a64_ldst64_lo12,
-                                AArch64::fixup_a64_ldst128_lo12 };
-    assert(MemSize <= 16 && "Invalid fixup for operation");
-    FixupKind = FixupsBySize[Log2_32(MemSize)];
-    break;
-  }
-  case AArch64MCExpr::VK_AARCH64_GOT_LO12:
-    assert(MemSize == 8 && "Invalid fixup for operation");
-    FixupKind = AArch64::fixup_a64_ld64_got_lo12_nc;
-    break;
-  case AArch64MCExpr::VK_AARCH64_DTPREL_LO12:  {
-    static const unsigned FixupsBySize[] = {
-      AArch64::fixup_a64_ldst8_dtprel_lo12,
-      AArch64::fixup_a64_ldst16_dtprel_lo12,
-      AArch64::fixup_a64_ldst32_dtprel_lo12,
-      AArch64::fixup_a64_ldst64_dtprel_lo12
-    };
-    assert(MemSize <= 8 && "Invalid fixup for operation");
-    FixupKind = FixupsBySize[Log2_32(MemSize)];
-    break;
-  }
-  case AArch64MCExpr::VK_AARCH64_DTPREL_LO12_NC: {
-    static const unsigned FixupsBySize[] = {
-      AArch64::fixup_a64_ldst8_dtprel_lo12_nc,
-      AArch64::fixup_a64_ldst16_dtprel_lo12_nc,
-      AArch64::fixup_a64_ldst32_dtprel_lo12_nc,
-      AArch64::fixup_a64_ldst64_dtprel_lo12_nc
-    };
-    assert(MemSize <= 8 && "Invalid fixup for operation");
-    FixupKind = FixupsBySize[Log2_32(MemSize)];
-    break;
-  }
-  case AArch64MCExpr::VK_AARCH64_GOTTPREL_LO12:
-    assert(MemSize == 8 && "Invalid fixup for operation");
-    FixupKind = AArch64::fixup_a64_ld64_gottprel_lo12_nc;
-    break;
-  case AArch64MCExpr::VK_AARCH64_TPREL_LO12:{
-    static const unsigned FixupsBySize[] = {
-      AArch64::fixup_a64_ldst8_tprel_lo12,
-      AArch64::fixup_a64_ldst16_tprel_lo12,
-      AArch64::fixup_a64_ldst32_tprel_lo12,
-      AArch64::fixup_a64_ldst64_tprel_lo12
-    };
-    assert(MemSize <= 8 && "Invalid fixup for operation");
-    FixupKind = FixupsBySize[Log2_32(MemSize)];
-    break;
-  }
-  case AArch64MCExpr::VK_AARCH64_TPREL_LO12_NC: {
-    static const unsigned FixupsBySize[] = {
-      AArch64::fixup_a64_ldst8_tprel_lo12_nc,
-      AArch64::fixup_a64_ldst16_tprel_lo12_nc,
-      AArch64::fixup_a64_ldst32_tprel_lo12_nc,
-      AArch64::fixup_a64_ldst64_tprel_lo12_nc
-    };
-    assert(MemSize <= 8 && "Invalid fixup for operation");
-    FixupKind = FixupsBySize[Log2_32(MemSize)];
-    break;
-  }
-  case AArch64MCExpr::VK_AARCH64_TLSDESC_LO12:
-    assert(MemSize == 8 && "Invalid fixup for operation");
-    FixupKind = AArch64::fixup_a64_tlsdesc_ld64_lo12_nc;
-    break;
+template<unsigned FixupKind> uint32_t
+AArch64MCCodeEmitter::getLdStUImm12OpValue(const MCInst &MI, unsigned OpIdx,
+                                           SmallVectorImpl<MCFixup> &Fixups,
+                                           const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  uint32_t ImmVal = 0;
+
+  if (MO.isImm())
+    ImmVal = static_cast<uint32_t>(MO.getImm());
+  else {
+    assert(MO.isExpr() && "unable to encode load/store imm operand");
+    MCFixupKind Kind = MCFixupKind(FixupKind);
+    Fixups.push_back(MCFixup::Create(0, MO.getExpr(), Kind, MI.getLoc()));
+    ++MCNumFixups;
   }
 
-  return getAddressWithFixup(ImmOp, FixupKind, Fixups, STI);
+  return ImmVal;
 }
 
-unsigned
-AArch64MCCodeEmitter::getAddSubImmOpValue(const MCInst &MI, unsigned OpIdx,
-                                       SmallVectorImpl<MCFixup> &Fixups,
-                                       const MCSubtargetInfo &STI) const {
+/// getAdrLabelOpValue - Return encoding info for 21-bit immediate ADR label
+/// target.
+uint32_t
+AArch64MCCodeEmitter::getAdrLabelOpValue(const MCInst &MI, unsigned OpIdx,
+                                         SmallVectorImpl<MCFixup> &Fixups,
+                                         const MCSubtargetInfo &STI) const {
   const MCOperand &MO = MI.getOperand(OpIdx);
+
+  // If the destination is an immediate, we have nothing to do.
   if (MO.isImm())
-    return static_cast<unsigned>(MO.getImm());
+    return MO.getImm();
+  assert(MO.isExpr() && "Unexpected target type!");
+  const MCExpr *Expr = MO.getExpr();
 
-  assert(MO.isExpr());
-
-  unsigned FixupKind = 0;
-  switch(cast<AArch64MCExpr>(MO.getExpr())->getKind()) {
-  default: llvm_unreachable("Invalid expression modifier");
-  case AArch64MCExpr::VK_AARCH64_LO12:
-    FixupKind = AArch64::fixup_a64_add_lo12; break;
-  case AArch64MCExpr::VK_AARCH64_DTPREL_HI12:
-    FixupKind = AArch64::fixup_a64_add_dtprel_hi12; break;
-  case AArch64MCExpr::VK_AARCH64_DTPREL_LO12:
-    FixupKind = AArch64::fixup_a64_add_dtprel_lo12; break;
-  case AArch64MCExpr::VK_AARCH64_DTPREL_LO12_NC:
-    FixupKind = AArch64::fixup_a64_add_dtprel_lo12_nc; break;
-  case AArch64MCExpr::VK_AARCH64_TPREL_HI12:
-    FixupKind = AArch64::fixup_a64_add_tprel_hi12; break;
-  case AArch64MCExpr::VK_AARCH64_TPREL_LO12:
-    FixupKind = AArch64::fixup_a64_add_tprel_lo12; break;
-  case AArch64MCExpr::VK_AARCH64_TPREL_LO12_NC:
-    FixupKind = AArch64::fixup_a64_add_tprel_lo12_nc; break;
-  case AArch64MCExpr::VK_AARCH64_TLSDESC_LO12:
-    FixupKind = AArch64::fixup_a64_tlsdesc_add_lo12_nc; break;
-  }
+  MCFixupKind Kind = MI.getOpcode() == AArch64::ADR
+                         ? MCFixupKind(AArch64::fixup_aarch64_pcrel_adr_imm21)
+                         : MCFixupKind(AArch64::fixup_aarch64_pcrel_adrp_imm21);
+  Fixups.push_back(MCFixup::Create(0, Expr, Kind, MI.getLoc()));
 
-  return getAddressWithFixup(MO, FixupKind, Fixups, STI);
-}
+  MCNumFixups += 1;
 
-unsigned
-AArch64MCCodeEmitter::getAdrpLabelOpValue(const MCInst &MI, unsigned OpIdx,
-                                       SmallVectorImpl<MCFixup> &Fixups,
-                                       const MCSubtargetInfo &STI) const {
+  // All of the information is in the fixup.
+  return 0;
+}
 
+/// getAddSubImmOpValue - Return encoding for the 12-bit immediate value and
+/// the 2-bit shift field.  The shift field is stored in bits 13-14 of the
+/// return value.
+uint32_t
+AArch64MCCodeEmitter::getAddSubImmOpValue(const MCInst &MI, unsigned OpIdx,
+                                          SmallVectorImpl<MCFixup> &Fixups,
+                                          const MCSubtargetInfo &STI) const {
+  // Suboperands are [imm, shifter].
   const MCOperand &MO = MI.getOperand(OpIdx);
+  const MCOperand &MO1 = MI.getOperand(OpIdx + 1);
+  assert(AArch64_AM::getShiftType(MO1.getImm()) == AArch64_AM::LSL &&
+         "unexpected shift type for add/sub immediate");
+  unsigned ShiftVal = AArch64_AM::getShiftValue(MO1.getImm());
+  assert((ShiftVal == 0 || ShiftVal == 12) &&
+         "unexpected shift value for add/sub immediate");
   if (MO.isImm())
-    return static_cast<unsigned>(MO.getImm());
-
-  assert(MO.isExpr());
+    return MO.getImm() | (ShiftVal == 0 ? 0 : (1 << 12));
+  assert(MO.isExpr() && "Unable to encode MCOperand!");
+  const MCExpr *Expr = MO.getExpr();
 
-  unsigned Modifier = AArch64MCExpr::VK_AARCH64_None;
-  if (const AArch64MCExpr *Expr = dyn_cast<AArch64MCExpr>(MO.getExpr()))
-    Modifier = Expr->getKind();
+  // Encode the 12 bits of the fixup.
+  MCFixupKind Kind = MCFixupKind(AArch64::fixup_aarch64_add_imm12);
+  Fixups.push_back(MCFixup::Create(0, Expr, Kind, MI.getLoc()));
 
-  unsigned FixupKind = 0;
-  switch(Modifier) {
-  case AArch64MCExpr::VK_AARCH64_None:
-    FixupKind = AArch64::fixup_a64_adr_prel_page;
-    break;
-  case AArch64MCExpr::VK_AARCH64_GOT:
-    FixupKind = AArch64::fixup_a64_adr_prel_got_page;
-    break;
-  case AArch64MCExpr::VK_AARCH64_GOTTPREL:
-    FixupKind = AArch64::fixup_a64_adr_gottprel_page;
-    break;
-  case AArch64MCExpr::VK_AARCH64_TLSDESC:
-    FixupKind = AArch64::fixup_a64_tlsdesc_adr_page;
-    break;
-  default:
-    llvm_unreachable("Unknown symbol reference kind for ADRP instruction");
-  }
+  ++MCNumFixups;
 
-  return getAddressWithFixup(MO, FixupKind, Fixups, STI);
+  return 0;
 }
 
-unsigned
-AArch64MCCodeEmitter::getBitfield32LSLOpValue(const MCInst &MI, unsigned OpIdx,
-                                       SmallVectorImpl<MCFixup> &Fixups,
-                                       const MCSubtargetInfo &STI) const {
-
+/// getCondBranchTargetOpValue - Return the encoded value for a conditional
+/// branch target.
+uint32_t AArch64MCCodeEmitter::getCondBranchTargetOpValue(
+    const MCInst &MI, unsigned OpIdx, SmallVectorImpl<MCFixup> &Fixups,
+    const MCSubtargetInfo &STI) const {
   const MCOperand &MO = MI.getOperand(OpIdx);
-  assert(MO.isImm() && "Only immediate expected for shift");
 
-  return ((32 - MO.getImm()) & 0x1f) | (31 - MO.getImm()) << 6;
-}
+  // If the destination is an immediate, we have nothing to do.
+  if (MO.isImm())
+    return MO.getImm();
+  assert(MO.isExpr() && "Unexpected target type!");
 
-unsigned
-AArch64MCCodeEmitter::getBitfield64LSLOpValue(const MCInst &MI, unsigned OpIdx,
-                                       SmallVectorImpl<MCFixup> &Fixups,
-                                       const MCSubtargetInfo &STI) const {
+  MCFixupKind Kind = MCFixupKind(AArch64::fixup_aarch64_pcrel_branch19);
+  Fixups.push_back(MCFixup::Create(0, MO.getExpr(), Kind, MI.getLoc()));
 
-  const MCOperand &MO = MI.getOperand(OpIdx);
-  assert(MO.isImm() && "Only immediate expected for shift");
+  ++MCNumFixups;
 
-  return ((64 - MO.getImm()) & 0x3f) | (63 - MO.getImm()) << 6;
+  // All of the information is in the fixup.
+  return 0;
 }
 
-unsigned AArch64MCCodeEmitter::getShiftRightImm8(
-    const MCInst &MI, unsigned Op, SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const {
-  return 8 - MI.getOperand(Op).getImm();
-}
+/// getLoadLiteralOpValue - Return the encoded value for a load-literal
+/// pc-relative address.
+uint32_t
+AArch64MCCodeEmitter::getLoadLiteralOpValue(const MCInst &MI, unsigned OpIdx,
+                                            SmallVectorImpl<MCFixup> &Fixups,
+                                            const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpIdx);
 
-unsigned AArch64MCCodeEmitter::getShiftRightImm16(
-    const MCInst &MI, unsigned Op, SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const {
-  return 16 - MI.getOperand(Op).getImm();
-}
+  // If the destination is an immediate, we have nothing to do.
+  if (MO.isImm())
+    return MO.getImm();
+  assert(MO.isExpr() && "Unexpected target type!");
 
-unsigned AArch64MCCodeEmitter::getShiftRightImm32(
-    const MCInst &MI, unsigned Op, SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const {
-  return 32 - MI.getOperand(Op).getImm();
-}
+  MCFixupKind Kind = MCFixupKind(AArch64::fixup_aarch64_ldr_pcrel_imm19);
+  Fixups.push_back(MCFixup::Create(0, MO.getExpr(), Kind, MI.getLoc()));
 
-unsigned AArch64MCCodeEmitter::getShiftRightImm64(
-    const MCInst &MI, unsigned Op, SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const {
-  return 64 - MI.getOperand(Op).getImm();
-}
+  ++MCNumFixups;
 
-unsigned AArch64MCCodeEmitter::getShiftLeftImm8(
-    const MCInst &MI, unsigned Op, SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const {
-  return MI.getOperand(Op).getImm() - 8;
+  // All of the information is in the fixup.
+  return 0;
 }
 
-unsigned AArch64MCCodeEmitter::getShiftLeftImm16(
-    const MCInst &MI, unsigned Op, SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const {
-  return MI.getOperand(Op).getImm() - 16;
+uint32_t
+AArch64MCCodeEmitter::getMemExtendOpValue(const MCInst &MI, unsigned OpIdx,
+                                          SmallVectorImpl<MCFixup> &Fixups,
+                                          const MCSubtargetInfo &STI) const {
+  unsigned SignExtend = MI.getOperand(OpIdx).getImm();
+  unsigned DoShift = MI.getOperand(OpIdx + 1).getImm();
+  return (SignExtend << 1) | DoShift;
 }
 
-unsigned AArch64MCCodeEmitter::getShiftLeftImm32(
-    const MCInst &MI, unsigned Op, SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const {
-  return MI.getOperand(Op).getImm() - 32;
-}
+uint32_t
+AArch64MCCodeEmitter::getMoveWideImmOpValue(const MCInst &MI, unsigned OpIdx,
+                                            SmallVectorImpl<MCFixup> &Fixups,
+                                            const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpIdx);
 
-unsigned AArch64MCCodeEmitter::getShiftLeftImm64(
-    const MCInst &MI, unsigned Op, SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const {
-  return MI.getOperand(Op).getImm() - 64;
+  if (MO.isImm())
+    return MO.getImm();
+  assert(MO.isExpr() && "Unexpected movz/movk immediate");
+
+  Fixups.push_back(MCFixup::Create(
+      0, MO.getExpr(), MCFixupKind(AArch64::fixup_aarch64_movw), MI.getLoc()));
+
+  ++MCNumFixups;
+
+  return 0;
 }
 
-template<AArch64::Fixups fixupDesired> unsigned
-AArch64MCCodeEmitter::getLabelOpValue(const MCInst &MI,
-                                      unsigned OpIdx,
-                                      SmallVectorImpl<MCFixup> &Fixups,
-                                      const MCSubtargetInfo &STI) const {
+/// getTestBranchTargetOpValue - Return the encoded value for a test-bit-and-
+/// branch target.
+uint32_t AArch64MCCodeEmitter::getTestBranchTargetOpValue(
+    const MCInst &MI, unsigned OpIdx, SmallVectorImpl<MCFixup> &Fixups,
+    const MCSubtargetInfo &STI) const {
   const MCOperand &MO = MI.getOperand(OpIdx);
 
-  if (MO.isExpr())
-    return getAddressWithFixup(MO, fixupDesired, Fixups, STI);
+  // If the destination is an immediate, we have nothing to do.
+  if (MO.isImm())
+    return MO.getImm();
+  assert(MO.isExpr() && "Unexpected ADR target type!");
 
-  assert(MO.isImm());
-  return MO.getImm();
+  MCFixupKind Kind = MCFixupKind(AArch64::fixup_aarch64_pcrel_branch14);
+  Fixups.push_back(MCFixup::Create(0, MO.getExpr(), Kind, MI.getLoc()));
+
+  ++MCNumFixups;
+
+  // All of the information is in the fixup.
+  return 0;
 }
 
-unsigned
-AArch64MCCodeEmitter::getLoadLitLabelOpValue(const MCInst &MI,
-                                       unsigned OpIdx,
-                                       SmallVectorImpl<MCFixup> &Fixups,
-                                       const MCSubtargetInfo &STI) const {
+/// getBranchTargetOpValue - Return the encoded value for an unconditional
+/// branch target.
+uint32_t
+AArch64MCCodeEmitter::getBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
+                                             SmallVectorImpl<MCFixup> &Fixups,
+                                             const MCSubtargetInfo &STI) const {
   const MCOperand &MO = MI.getOperand(OpIdx);
 
+  // If the destination is an immediate, we have nothing to do.
   if (MO.isImm())
     return MO.getImm();
+  assert(MO.isExpr() && "Unexpected ADR target type!");
 
-  assert(MO.isExpr());
+  MCFixupKind Kind = MI.getOpcode() == AArch64::BL
+                         ? MCFixupKind(AArch64::fixup_aarch64_pcrel_call26)
+                         : MCFixupKind(AArch64::fixup_aarch64_pcrel_branch26);
+  Fixups.push_back(MCFixup::Create(0, MO.getExpr(), Kind, MI.getLoc()));
 
-  unsigned FixupKind;
-  if (isa<AArch64MCExpr>(MO.getExpr())) {
-    assert(dyn_cast<AArch64MCExpr>(MO.getExpr())->getKind()
-           == AArch64MCExpr::VK_AARCH64_GOTTPREL
-           && "Invalid symbol modifier for literal load");
-    FixupKind = AArch64::fixup_a64_ld_gottprel_prel19;
-  } else {
-    FixupKind = AArch64::fixup_a64_ld_prel;
-  }
+  ++MCNumFixups;
 
-  return getAddressWithFixup(MO, FixupKind, Fixups, STI);
+  // All of the information is in the fixup.
+  return 0;
 }
 
+/// getVecShifterOpValue - Return the encoded value for the vector shifter:
+///
+///   00 -> 0
+///   01 -> 8
+///   10 -> 16
+///   11 -> 24
+uint32_t
+AArch64MCCodeEmitter::getVecShifterOpValue(const MCInst &MI, unsigned OpIdx,
+                                           SmallVectorImpl<MCFixup> &Fixups,
+                                           const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  assert(MO.isImm() && "Expected an immediate value for the shift amount!");
 
-unsigned
-AArch64MCCodeEmitter::getMachineOpValue(const MCInst &MI,
-                                       const MCOperand &MO,
-                                       SmallVectorImpl<MCFixup> &Fixups,
-                                       const MCSubtargetInfo &STI) const {
-  if (MO.isReg()) {
-    return Ctx.getRegisterInfo()->getEncodingValue(MO.getReg());
-  } else if (MO.isImm()) {
-    return static_cast<unsigned>(MO.getImm());
+  switch (MO.getImm()) {
+  default:
+    break;
+  case 0:
+    return 0;
+  case 8:
+    return 1;
+  case 16:
+    return 2;
+  case 24:
+    return 3;
   }
 
-  llvm_unreachable("Unable to encode MCOperand!");
+  assert(false && "Invalid value for vector shift amount!");
   return 0;
 }
 
-unsigned
-AArch64MCCodeEmitter::getMoveWideImmOpValue(const MCInst &MI, unsigned OpIdx,
-                                       SmallVectorImpl<MCFixup> &Fixups,
-                                       const MCSubtargetInfo &STI) const {
-  const MCOperand &UImm16MO = MI.getOperand(OpIdx);
-  const MCOperand &ShiftMO = MI.getOperand(OpIdx + 1);
+uint32_t
+AArch64MCCodeEmitter::getSIMDShift64OpValue(const MCInst &MI, unsigned OpIdx,
+                                            SmallVectorImpl<MCFixup> &Fixups,
+                                            const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  assert(MO.isImm() && "Expected an immediate value for the shift amount!");
+  return 64 - (MO.getImm());
+}
 
-  unsigned Result = static_cast<unsigned>(ShiftMO.getImm()) << 16;
+uint32_t AArch64MCCodeEmitter::getSIMDShift64_32OpValue(
+    const MCInst &MI, unsigned OpIdx, SmallVectorImpl<MCFixup> &Fixups,
+    const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  assert(MO.isImm() && "Expected an immediate value for the shift amount!");
+  return 64 - (MO.getImm() | 32);
+}
 
-  if (UImm16MO.isImm()) {
-    Result |= UImm16MO.getImm();
-    return Result;
-  }
+uint32_t
+AArch64MCCodeEmitter::getSIMDShift32OpValue(const MCInst &MI, unsigned OpIdx,
+                                            SmallVectorImpl<MCFixup> &Fixups,
+                                            const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  assert(MO.isImm() && "Expected an immediate value for the shift amount!");
+  return 32 - (MO.getImm() | 16);
+}
 
-  const AArch64MCExpr *A64E = cast<AArch64MCExpr>(UImm16MO.getExpr());
-  AArch64::Fixups requestedFixup;
-  switch (A64E->getKind()) {
-  default: llvm_unreachable("unexpected expression modifier");
-  case AArch64MCExpr::VK_AARCH64_ABS_G0:
-    requestedFixup = AArch64::fixup_a64_movw_uabs_g0; break;
-  case AArch64MCExpr::VK_AARCH64_ABS_G0_NC:
-    requestedFixup = AArch64::fixup_a64_movw_uabs_g0_nc; break;
-  case AArch64MCExpr::VK_AARCH64_ABS_G1:
-    requestedFixup = AArch64::fixup_a64_movw_uabs_g1; break;
-  case AArch64MCExpr::VK_AARCH64_ABS_G1_NC:
-    requestedFixup = AArch64::fixup_a64_movw_uabs_g1_nc; break;
-  case AArch64MCExpr::VK_AARCH64_ABS_G2:
-    requestedFixup = AArch64::fixup_a64_movw_uabs_g2; break;
-  case AArch64MCExpr::VK_AARCH64_ABS_G2_NC:
-    requestedFixup = AArch64::fixup_a64_movw_uabs_g2_nc; break;
-  case AArch64MCExpr::VK_AARCH64_ABS_G3:
-    requestedFixup = AArch64::fixup_a64_movw_uabs_g3; break;
-  case AArch64MCExpr::VK_AARCH64_SABS_G0:
-    requestedFixup = AArch64::fixup_a64_movw_sabs_g0; break;
-  case AArch64MCExpr::VK_AARCH64_SABS_G1:
-    requestedFixup = AArch64::fixup_a64_movw_sabs_g1; break;
-  case AArch64MCExpr::VK_AARCH64_SABS_G2:
-    requestedFixup = AArch64::fixup_a64_movw_sabs_g2; break;
-  case AArch64MCExpr::VK_AARCH64_DTPREL_G2:
-    requestedFixup = AArch64::fixup_a64_movw_dtprel_g2; break;
-  case AArch64MCExpr::VK_AARCH64_DTPREL_G1:
-    requestedFixup = AArch64::fixup_a64_movw_dtprel_g1; break;
-  case AArch64MCExpr::VK_AARCH64_DTPREL_G1_NC:
-    requestedFixup = AArch64::fixup_a64_movw_dtprel_g1_nc; break;
-  case AArch64MCExpr::VK_AARCH64_DTPREL_G0:
-    requestedFixup = AArch64::fixup_a64_movw_dtprel_g0; break;
-  case AArch64MCExpr::VK_AARCH64_DTPREL_G0_NC:
-    requestedFixup = AArch64::fixup_a64_movw_dtprel_g0_nc; break;
-  case AArch64MCExpr::VK_AARCH64_GOTTPREL_G1:
-    requestedFixup = AArch64::fixup_a64_movw_gottprel_g1; break;
-  case AArch64MCExpr::VK_AARCH64_GOTTPREL_G0_NC:
-    requestedFixup = AArch64::fixup_a64_movw_gottprel_g0_nc; break;
-  case AArch64MCExpr::VK_AARCH64_TPREL_G2:
-    requestedFixup = AArch64::fixup_a64_movw_tprel_g2; break;
-  case AArch64MCExpr::VK_AARCH64_TPREL_G1:
-    requestedFixup = AArch64::fixup_a64_movw_tprel_g1; break;
-  case AArch64MCExpr::VK_AARCH64_TPREL_G1_NC:
-    requestedFixup = AArch64::fixup_a64_movw_tprel_g1_nc; break;
-  case AArch64MCExpr::VK_AARCH64_TPREL_G0:
-    requestedFixup = AArch64::fixup_a64_movw_tprel_g0; break;
-  case AArch64MCExpr::VK_AARCH64_TPREL_G0_NC:
-    requestedFixup = AArch64::fixup_a64_movw_tprel_g0_nc; break;
-  }
+uint32_t
+AArch64MCCodeEmitter::getSIMDShift16OpValue(const MCInst &MI, unsigned OpIdx,
+                                            SmallVectorImpl<MCFixup> &Fixups,
+                                            const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  assert(MO.isImm() && "Expected an immediate value for the shift amount!");
+  return 16 - (MO.getImm() | 8);
+}
 
-  return Result | getAddressWithFixup(UImm16MO, requestedFixup, Fixups, STI);
+/// getFixedPointScaleOpValue - Return the encoded value for the
+// FP-to-fixed-point scale factor.
+uint32_t AArch64MCCodeEmitter::getFixedPointScaleOpValue(
+    const MCInst &MI, unsigned OpIdx, SmallVectorImpl<MCFixup> &Fixups,
+    const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  assert(MO.isImm() && "Expected an immediate value for the scale amount!");
+  return 64 - MO.getImm();
 }
 
-template<int hasRs, int hasRt2> unsigned
-AArch64MCCodeEmitter::fixLoadStoreExclusive(const MCInst &MI,
-                                            unsigned EncodedValue,
+uint32_t
+AArch64MCCodeEmitter::getVecShiftR64OpValue(const MCInst &MI, unsigned OpIdx,
+                                            SmallVectorImpl<MCFixup> &Fixups,
                                             const MCSubtargetInfo &STI) const {
-  if (!hasRs) EncodedValue |= 0x001F0000;
-  if (!hasRt2) EncodedValue |= 0x00007C00;
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  assert(MO.isImm() && "Expected an immediate value for the scale amount!");
+  return 64 - MO.getImm();
+}
 
-  return EncodedValue;
+uint32_t
+AArch64MCCodeEmitter::getVecShiftR32OpValue(const MCInst &MI, unsigned OpIdx,
+                                            SmallVectorImpl<MCFixup> &Fixups,
+                                            const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  assert(MO.isImm() && "Expected an immediate value for the scale amount!");
+  return 32 - MO.getImm();
 }
 
-unsigned
-AArch64MCCodeEmitter::fixMOVZ(const MCInst &MI, unsigned EncodedValue,
-                              const MCSubtargetInfo &STI) const {
+uint32_t
+AArch64MCCodeEmitter::getVecShiftR16OpValue(const MCInst &MI, unsigned OpIdx,
+                                            SmallVectorImpl<MCFixup> &Fixups,
+                                            const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  assert(MO.isImm() && "Expected an immediate value for the scale amount!");
+  return 16 - MO.getImm();
+}
+
+uint32_t
+AArch64MCCodeEmitter::getVecShiftR8OpValue(const MCInst &MI, unsigned OpIdx,
+                                           SmallVectorImpl<MCFixup> &Fixups,
+                                           const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  assert(MO.isImm() && "Expected an immediate value for the scale amount!");
+  return 8 - MO.getImm();
+}
+
+uint32_t
+AArch64MCCodeEmitter::getVecShiftL64OpValue(const MCInst &MI, unsigned OpIdx,
+                                            SmallVectorImpl<MCFixup> &Fixups,
+                                            const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  assert(MO.isImm() && "Expected an immediate value for the scale amount!");
+  return MO.getImm() - 64;
+}
+
+uint32_t
+AArch64MCCodeEmitter::getVecShiftL32OpValue(const MCInst &MI, unsigned OpIdx,
+                                            SmallVectorImpl<MCFixup> &Fixups,
+                                            const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  assert(MO.isImm() && "Expected an immediate value for the scale amount!");
+  return MO.getImm() - 32;
+}
+
+uint32_t
+AArch64MCCodeEmitter::getVecShiftL16OpValue(const MCInst &MI, unsigned OpIdx,
+                                            SmallVectorImpl<MCFixup> &Fixups,
+                                            const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  assert(MO.isImm() && "Expected an immediate value for the scale amount!");
+  return MO.getImm() - 16;
+}
+
+uint32_t
+AArch64MCCodeEmitter::getVecShiftL8OpValue(const MCInst &MI, unsigned OpIdx,
+                                           SmallVectorImpl<MCFixup> &Fixups,
+                                           const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  assert(MO.isImm() && "Expected an immediate value for the scale amount!");
+  return MO.getImm() - 8;
+}
+
+/// getMoveVecShifterOpValue - Return the encoded value for the vector move
+/// shifter (MSL).
+uint32_t AArch64MCCodeEmitter::getMoveVecShifterOpValue(
+    const MCInst &MI, unsigned OpIdx, SmallVectorImpl<MCFixup> &Fixups,
+    const MCSubtargetInfo &STI) const {
+  const MCOperand &MO = MI.getOperand(OpIdx);
+  assert(MO.isImm() &&
+         "Expected an immediate value for the move shift amount!");
+  unsigned ShiftVal = AArch64_AM::getShiftValue(MO.getImm());
+  assert((ShiftVal == 8 || ShiftVal == 16) && "Invalid shift amount!");
+  return ShiftVal == 8 ? 0 : 1;
+}
+
+unsigned AArch64MCCodeEmitter::fixMOVZ(const MCInst &MI, unsigned EncodedValue,
+                                       const MCSubtargetInfo &STI) const {
   // If one of the signed fixup kinds is applied to a MOVZ instruction, the
   // eventual result could be either a MOVZ or a MOVN. It's the MCCodeEmitter's
   // job to ensure that any bits possibly affected by this are 0. This means we
@@ -552,23 +589,38 @@ AArch64MCCodeEmitter::fixMOVZ(const MCInst &MI, unsigned EncodedValue,
 
   const AArch64MCExpr *A64E = cast<AArch64MCExpr>(UImm16MO.getExpr());
   switch (A64E->getKind()) {
-  case AArch64MCExpr::VK_AARCH64_SABS_G0:
-  case AArch64MCExpr::VK_AARCH64_SABS_G1:
-  case AArch64MCExpr::VK_AARCH64_SABS_G2:
-  case AArch64MCExpr::VK_AARCH64_DTPREL_G2:
-  case AArch64MCExpr::VK_AARCH64_DTPREL_G1:
-  case AArch64MCExpr::VK_AARCH64_DTPREL_G0:
-  case AArch64MCExpr::VK_AARCH64_GOTTPREL_G1:
-  case AArch64MCExpr::VK_AARCH64_TPREL_G2:
-  case AArch64MCExpr::VK_AARCH64_TPREL_G1:
-  case AArch64MCExpr::VK_AARCH64_TPREL_G0:
+  case AArch64MCExpr::VK_DTPREL_G2:
+  case AArch64MCExpr::VK_DTPREL_G1:
+  case AArch64MCExpr::VK_DTPREL_G0:
+  case AArch64MCExpr::VK_GOTTPREL_G1:
+  case AArch64MCExpr::VK_TPREL_G2:
+  case AArch64MCExpr::VK_TPREL_G1:
+  case AArch64MCExpr::VK_TPREL_G0:
     return EncodedValue & ~(1u << 30);
   default:
     // Nothing to do for an unsigned fixup.
     return EncodedValue;
   }
 
-  llvm_unreachable("Should have returned by now");
+
+  return EncodedValue & ~(1u << 30);
+}
+
+void AArch64MCCodeEmitter::EncodeInstruction(const MCInst &MI, raw_ostream &OS,
+                                             SmallVectorImpl<MCFixup> &Fixups,
+                                             const MCSubtargetInfo &STI) const {
+  if (MI.getOpcode() == AArch64::TLSDESCCALL) {
+    // This is a directive which applies an R_AARCH64_TLSDESC_CALL to the
+    // following (BLR) instruction. It doesn't emit any code itself so it
+    // doesn't go through the normal TableGenerated channels.
+    MCFixupKind Fixup = MCFixupKind(AArch64::fixup_aarch64_tlsdesc_call);
+    Fixups.push_back(MCFixup::Create(0, MI.getOperand(0).getExpr(), Fixup));
+    return;
+  }
+
+  uint64_t Binary = getBinaryCodeForInstr(MI, Fixups, STI);
+  EmitConstant(Binary, 4, OS);
+  ++MCNumEmitted; // Keep track of the # of mi's emitted.
 }
 
 unsigned
@@ -581,32 +633,22 @@ AArch64MCCodeEmitter::fixMulHigh(const MCInst &MI,
   return EncodedValue;
 }
 
-MCCodeEmitter *llvm::createAArch64MCCodeEmitter(const MCInstrInfo &MCII,
-                                                const MCRegisterInfo &MRI,
-                                                const MCSubtargetInfo &STI,
-                                                MCContext &Ctx) {
-  return new AArch64MCCodeEmitter(Ctx);
-}
-
-void AArch64MCCodeEmitter::
-EncodeInstruction(const MCInst &MI, raw_ostream &OS,
-                  SmallVectorImpl<MCFixup> &Fixups,
-                  const MCSubtargetInfo &STI) const {
-  if (MI.getOpcode() == AArch64::TLSDESCCALL) {
-    // This is a directive which applies an R_AARCH64_TLSDESC_CALL to the
-    // following (BLR) instruction. It doesn't emit any code itself so it
-    // doesn't go through the normal TableGenerated channels.
-    MCFixupKind Fixup = MCFixupKind(AArch64::fixup_a64_tlsdesc_call);
-    const MCExpr *Expr;
-    Expr = AArch64MCExpr::CreateTLSDesc(MI.getOperand(0).getExpr(), Ctx);
-    Fixups.push_back(MCFixup::Create(0, Expr, Fixup));
-    return;
-  }
-
-  uint32_t Binary = getBinaryCodeForInstr(MI, Fixups, STI);
+template<int hasRs, int hasRt2> unsigned
+AArch64MCCodeEmitter::fixLoadStoreExclusive(const MCInst &MI,
+                                            unsigned EncodedValue,
+                                            const MCSubtargetInfo &STI) const {
+  if (!hasRs) EncodedValue |= 0x001F0000;
+  if (!hasRt2) EncodedValue |= 0x00007C00;
 
-  EmitInstruction(Binary, OS);
+  return EncodedValue;
 }
 
+unsigned AArch64MCCodeEmitter::fixOneOperandFPComparison(
+    const MCInst &MI, unsigned EncodedValue, const MCSubtargetInfo &STI) const {
+  // The Rm field of FCMP and friends is unused - it should be assembled
+  // as 0, but is ignored by the processor.
+  EncodedValue &= ~(0x1f << 16);
+  return EncodedValue;
+}
 
 #include "AArch64GenMCCodeEmitter.inc"
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
index c7ccaee..85c3ec7 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.cpp
@@ -12,74 +12,121 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "aarch64mcexpr"
 #include "AArch64MCExpr.h"
 #include "llvm/MC/MCAssembler.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCELF.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCValue.h"
 #include "llvm/Object/ELF.h"
+#include "llvm/Support/ErrorHandling.h"
 
 using namespace llvm;
 
-const AArch64MCExpr*
-AArch64MCExpr::Create(VariantKind Kind, const MCExpr *Expr,
-                      MCContext &Ctx) {
-  return new (Ctx) AArch64MCExpr(Kind, Expr);
+#define DEBUG_TYPE "aarch64symbolrefexpr"
+
+const AArch64MCExpr *AArch64MCExpr::Create(const MCExpr *Expr, VariantKind Kind,
+                                       MCContext &Ctx) {
+  return new (Ctx) AArch64MCExpr(Expr, Kind);
+}
+
+StringRef AArch64MCExpr::getVariantKindName() const {
+  switch (static_cast<uint32_t>(getKind())) {
+  case VK_CALL:                return "";
+  case VK_LO12:                return ":lo12:";
+  case VK_ABS_G3:              return ":abs_g3:";
+  case VK_ABS_G2:              return ":abs_g2:";
+  case VK_ABS_G2_S:            return ":abs_g2_s:";
+  case VK_ABS_G2_NC:           return ":abs_g2_nc:";
+  case VK_ABS_G1:              return ":abs_g1:";
+  case VK_ABS_G1_S:            return ":abs_g1_s:";
+  case VK_ABS_G1_NC:           return ":abs_g1_nc:";
+  case VK_ABS_G0:              return ":abs_g0:";
+  case VK_ABS_G0_S:            return ":abs_g0_s:";
+  case VK_ABS_G0_NC:           return ":abs_g0_nc:";
+  case VK_DTPREL_G2:           return ":dtprel_g2:";
+  case VK_DTPREL_G1:           return ":dtprel_g1:";
+  case VK_DTPREL_G1_NC:        return ":dtprel_g1_nc:";
+  case VK_DTPREL_G0:           return ":dtprel_g0:";
+  case VK_DTPREL_G0_NC:        return ":dtprel_g0_nc:";
+  case VK_DTPREL_HI12:         return ":dtprel_hi12:";
+  case VK_DTPREL_LO12:         return ":dtprel_lo12:";
+  case VK_DTPREL_LO12_NC:      return ":dtprel_lo12_nc:";
+  case VK_TPREL_G2:            return ":tprel_g2:";
+  case VK_TPREL_G1:            return ":tprel_g1:";
+  case VK_TPREL_G1_NC:         return ":tprel_g1_nc:";
+  case VK_TPREL_G0:            return ":tprel_g0:";
+  case VK_TPREL_G0_NC:         return ":tprel_g0_nc:";
+  case VK_TPREL_HI12:          return ":tprel_hi12:";
+  case VK_TPREL_LO12:          return ":tprel_lo12:";
+  case VK_TPREL_LO12_NC:       return ":tprel_lo12_nc:";
+  case VK_TLSDESC_LO12:        return ":tlsdesc_lo12:";
+  case VK_ABS_PAGE:            return "";
+  case VK_GOT_PAGE:            return ":got:";
+  case VK_GOT_LO12:            return ":got_lo12:";
+  case VK_GOTTPREL_PAGE:       return ":gottprel:";
+  case VK_GOTTPREL_LO12_NC:    return ":gottprel_lo12:";
+  case VK_GOTTPREL_G1:         return ":gottprel_g1:";
+  case VK_GOTTPREL_G0_NC:      return ":gottprel_g0_nc:";
+  case VK_TLSDESC:             return "";
+  case VK_TLSDESC_PAGE:        return ":tlsdesc:";
+  default:
+    llvm_unreachable("Invalid ELF symbol kind");
+  }
 }
 
 void AArch64MCExpr::PrintImpl(raw_ostream &OS) const {
-  switch (Kind) {
-  default: llvm_unreachable("Invalid kind!");
-  case VK_AARCH64_GOT:              OS << ":got:"; break;
-  case VK_AARCH64_GOT_LO12:         OS << ":got_lo12:"; break;
-  case VK_AARCH64_LO12:             OS << ":lo12:"; break;
-  case VK_AARCH64_ABS_G0:           OS << ":abs_g0:"; break;
-  case VK_AARCH64_ABS_G0_NC:        OS << ":abs_g0_nc:"; break;
-  case VK_AARCH64_ABS_G1:           OS << ":abs_g1:"; break;
-  case VK_AARCH64_ABS_G1_NC:        OS << ":abs_g1_nc:"; break;
-  case VK_AARCH64_ABS_G2:           OS << ":abs_g2:"; break;
-  case VK_AARCH64_ABS_G2_NC:        OS << ":abs_g2_nc:"; break;
-  case VK_AARCH64_ABS_G3:           OS << ":abs_g3:"; break;
-  case VK_AARCH64_SABS_G0:          OS << ":abs_g0_s:"; break;
-  case VK_AARCH64_SABS_G1:          OS << ":abs_g1_s:"; break;
-  case VK_AARCH64_SABS_G2:          OS << ":abs_g2_s:"; break;
-  case VK_AARCH64_DTPREL_G2:        OS << ":dtprel_g2:"; break;
-  case VK_AARCH64_DTPREL_G1:        OS << ":dtprel_g1:"; break;
-  case VK_AARCH64_DTPREL_G1_NC:     OS << ":dtprel_g1_nc:"; break;
-  case VK_AARCH64_DTPREL_G0:        OS << ":dtprel_g0:"; break;
-  case VK_AARCH64_DTPREL_G0_NC:     OS << ":dtprel_g0_nc:"; break;
-  case VK_AARCH64_DTPREL_HI12:      OS << ":dtprel_hi12:"; break;
-  case VK_AARCH64_DTPREL_LO12:      OS << ":dtprel_lo12:"; break;
-  case VK_AARCH64_DTPREL_LO12_NC:   OS << ":dtprel_lo12_nc:"; break;
-  case VK_AARCH64_GOTTPREL_G1:      OS << ":gottprel_g1:"; break;
-  case VK_AARCH64_GOTTPREL_G0_NC:   OS << ":gottprel_g0_nc:"; break;
-  case VK_AARCH64_GOTTPREL:         OS << ":gottprel:"; break;
-  case VK_AARCH64_GOTTPREL_LO12:    OS << ":gottprel_lo12:"; break;
-  case VK_AARCH64_TPREL_G2:         OS << ":tprel_g2:"; break;
-  case VK_AARCH64_TPREL_G1:         OS << ":tprel_g1:"; break;
-  case VK_AARCH64_TPREL_G1_NC:      OS << ":tprel_g1_nc:"; break;
-  case VK_AARCH64_TPREL_G0:         OS << ":tprel_g0:"; break;
-  case VK_AARCH64_TPREL_G0_NC:      OS << ":tprel_g0_nc:"; break;
-  case VK_AARCH64_TPREL_HI12:       OS << ":tprel_hi12:"; break;
-  case VK_AARCH64_TPREL_LO12:       OS << ":tprel_lo12:"; break;
-  case VK_AARCH64_TPREL_LO12_NC:    OS << ":tprel_lo12_nc:"; break;
-  case VK_AARCH64_TLSDESC:          OS << ":tlsdesc:"; break;
-  case VK_AARCH64_TLSDESC_LO12:     OS << ":tlsdesc_lo12:"; break;
+  if (getKind() != VK_NONE)
+    OS << getVariantKindName();
+  OS << *Expr;
+}
+
+// FIXME: This basically copies MCObjectStreamer::AddValueSymbols. Perhaps
+// that method should be made public?
+// FIXME: really do above: now that two backends are using it.
+static void AddValueSymbolsImpl(const MCExpr *Value, MCAssembler *Asm) {
+  switch (Value->getKind()) {
+  case MCExpr::Target:
+    llvm_unreachable("Can't handle nested target expr!");
+    break;
+
+  case MCExpr::Constant:
+    break;
 
+  case MCExpr::Binary: {
+    const MCBinaryExpr *BE = cast<MCBinaryExpr>(Value);
+    AddValueSymbolsImpl(BE->getLHS(), Asm);
+    AddValueSymbolsImpl(BE->getRHS(), Asm);
+    break;
+  }
+
+  case MCExpr::SymbolRef:
+    Asm->getOrCreateSymbolData(cast<MCSymbolRefExpr>(Value)->getSymbol());
+    break;
+
+  case MCExpr::Unary:
+    AddValueSymbolsImpl(cast<MCUnaryExpr>(Value)->getSubExpr(), Asm);
+    break;
   }
+}
+
+void AArch64MCExpr::AddValueSymbols(MCAssembler *Asm) const {
+  AddValueSymbolsImpl(getSubExpr(), Asm);
+}
 
-  const MCExpr *Expr = getSubExpr();
-  if (Expr->getKind() != MCExpr::SymbolRef)
-    OS << '(';
-  Expr->print(OS);
-  if (Expr->getKind() != MCExpr::SymbolRef)
-    OS << ')';
+const MCSection *AArch64MCExpr::FindAssociatedSection() const {
+  llvm_unreachable("FIXME: what goes here?");
 }
 
-bool
-AArch64MCExpr::EvaluateAsRelocatableImpl(MCValue &Res,
-                                         const MCAsmLayout *Layout) const {
-  return getSubExpr()->EvaluateAsRelocatable(Res, Layout);
+bool AArch64MCExpr::EvaluateAsRelocatableImpl(MCValue &Res,
+                                            const MCAsmLayout *Layout) const {
+  if (!getSubExpr()->EvaluateAsRelocatable(Res, Layout))
+    return false;
+
+  Res =
+      MCValue::get(Res.getSymA(), Res.getSymB(), Res.getConstant(), getKind());
+
+  return true;
 }
 
 static void fixELFSymbolsInTLSFixupsImpl(const MCExpr *Expr, MCAssembler &Asm) {
@@ -113,66 +160,15 @@ static void fixELFSymbolsInTLSFixupsImpl(const MCExpr *Expr, MCAssembler &Asm) {
 }
 
 void AArch64MCExpr::fixELFSymbolsInTLSFixups(MCAssembler &Asm) const {
-  switch (getKind()) {
+  switch (getSymbolLoc(Kind)) {
   default:
     return;
-  case VK_AARCH64_DTPREL_G2:
-  case VK_AARCH64_DTPREL_G1:
-  case VK_AARCH64_DTPREL_G1_NC:
-  case VK_AARCH64_DTPREL_G0:
-  case VK_AARCH64_DTPREL_G0_NC:
-  case VK_AARCH64_DTPREL_HI12:
-  case VK_AARCH64_DTPREL_LO12:
-  case VK_AARCH64_DTPREL_LO12_NC:
-  case VK_AARCH64_GOTTPREL_G1:
-  case VK_AARCH64_GOTTPREL_G0_NC:
-  case VK_AARCH64_GOTTPREL:
-  case VK_AARCH64_GOTTPREL_LO12:
-  case VK_AARCH64_TPREL_G2:
-  case VK_AARCH64_TPREL_G1:
-  case VK_AARCH64_TPREL_G1_NC:
-  case VK_AARCH64_TPREL_G0:
-  case VK_AARCH64_TPREL_G0_NC:
-  case VK_AARCH64_TPREL_HI12:
-  case VK_AARCH64_TPREL_LO12:
-  case VK_AARCH64_TPREL_LO12_NC:
-  case VK_AARCH64_TLSDESC:
-  case VK_AARCH64_TLSDESC_LO12:
+  case VK_DTPREL:
+  case VK_GOTTPREL:
+  case VK_TPREL:
+  case VK_TLSDESC:
     break;
   }
 
   fixELFSymbolsInTLSFixupsImpl(getSubExpr(), Asm);
 }
-
-// FIXME: This basically copies MCObjectStreamer::AddValueSymbols. Perhaps
-// that method should be made public?
-// FIXME: really do above: now that two backends are using it.
-static void AddValueSymbolsImpl(const MCExpr *Value, MCAssembler *Asm) {
-  switch (Value->getKind()) {
-  case MCExpr::Target:
-    llvm_unreachable("Can't handle nested target expr!");
-    break;
-
-  case MCExpr::Constant:
-    break;
-
-  case MCExpr::Binary: {
-    const MCBinaryExpr *BE = cast<MCBinaryExpr>(Value);
-    AddValueSymbolsImpl(BE->getLHS(), Asm);
-    AddValueSymbolsImpl(BE->getRHS(), Asm);
-    break;
-  }
-
-  case MCExpr::SymbolRef:
-    Asm->getOrCreateSymbolData(cast<MCSymbolRefExpr>(Value)->getSymbol());
-    break;
-
-  case MCExpr::Unary:
-    AddValueSymbolsImpl(cast<MCUnaryExpr>(Value)->getSubExpr(), Asm);
-    break;
-  }
-}
-
-void AArch64MCExpr::AddValueSymbols(MCAssembler *Asm) const {
-  AddValueSymbolsImpl(getSubExpr(), Asm);
-}
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h b/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h
index d9798ae..e869ed0 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCExpr.h
@@ -1,4 +1,4 @@
-//==- AArch64MCExpr.h - AArch64 specific MC expression classes --*- C++ -*-===//
+//=--- AArch64MCExpr.h - AArch64 specific MC expression classes ---*- C++ -*-=//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -12,168 +12,149 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_AARCH64MCEXPR_H
-#define LLVM_AARCH64MCEXPR_H
+#ifndef LLVM_AArch64MCEXPR_H
+#define LLVM_AArch64MCEXPR_H
 
 #include "llvm/MC/MCExpr.h"
+#include "llvm/Support/ErrorHandling.h"
 
 namespace llvm {
 
 class AArch64MCExpr : public MCTargetExpr {
 public:
   enum VariantKind {
-    VK_AARCH64_None,
-    VK_AARCH64_GOT,      // :got: modifier in assembly
-    VK_AARCH64_GOT_LO12, // :got_lo12:
-    VK_AARCH64_LO12,     // :lo12:
-
-    VK_AARCH64_ABS_G0, // :abs_g0:
-    VK_AARCH64_ABS_G0_NC, // :abs_g0_nc:
-    VK_AARCH64_ABS_G1,
-    VK_AARCH64_ABS_G1_NC,
-    VK_AARCH64_ABS_G2,
-    VK_AARCH64_ABS_G2_NC,
-    VK_AARCH64_ABS_G3,
-
-    VK_AARCH64_SABS_G0, // :abs_g0_s:
-    VK_AARCH64_SABS_G1,
-    VK_AARCH64_SABS_G2,
-
-    VK_AARCH64_DTPREL_G2, // :dtprel_g2:
-    VK_AARCH64_DTPREL_G1,
-    VK_AARCH64_DTPREL_G1_NC,
-    VK_AARCH64_DTPREL_G0,
-    VK_AARCH64_DTPREL_G0_NC,
-    VK_AARCH64_DTPREL_HI12,
-    VK_AARCH64_DTPREL_LO12,
-    VK_AARCH64_DTPREL_LO12_NC,
-
-    VK_AARCH64_GOTTPREL_G1, // :gottprel:
-    VK_AARCH64_GOTTPREL_G0_NC,
-    VK_AARCH64_GOTTPREL,
-    VK_AARCH64_GOTTPREL_LO12,
-
-    VK_AARCH64_TPREL_G2, // :tprel:
-    VK_AARCH64_TPREL_G1,
-    VK_AARCH64_TPREL_G1_NC,
-    VK_AARCH64_TPREL_G0,
-    VK_AARCH64_TPREL_G0_NC,
-    VK_AARCH64_TPREL_HI12,
-    VK_AARCH64_TPREL_LO12,
-    VK_AARCH64_TPREL_LO12_NC,
-
-    VK_AARCH64_TLSDESC, // :tlsdesc:
-    VK_AARCH64_TLSDESC_LO12
+    VK_NONE     = 0x000,
+
+    // Symbol locations specifying (roughly speaking) what calculation should be
+    // performed to construct the final address for the relocated
+    // symbol. E.g. direct, via the GOT, ...
+    VK_ABS      = 0x001,
+    VK_SABS     = 0x002,
+    VK_GOT      = 0x003,
+    VK_DTPREL   = 0x004,
+    VK_GOTTPREL = 0x005,
+    VK_TPREL    = 0x006,
+    VK_TLSDESC  = 0x007,
+    VK_SymLocBits = 0x00f,
+
+    // Variants specifying which part of the final address calculation is
+    // used. E.g. the low 12 bits for an ADD/LDR, the middle 16 bits for a
+    // MOVZ/MOVK.
+    VK_PAGE     = 0x010,
+    VK_PAGEOFF  = 0x020,
+    VK_HI12     = 0x030,
+    VK_G0       = 0x040,
+    VK_G1       = 0x050,
+    VK_G2       = 0x060,
+    VK_G3       = 0x070,
+    VK_AddressFragBits = 0x0f0,
+
+    // Whether the final relocation is a checked one (where a linker should
+    // perform a range-check on the final address) or not. Note that this field
+    // is unfortunately sometimes omitted from the assembly syntax. E.g. :lo12:
+    // on its own is a non-checked relocation. We side with ELF on being
+    // explicit about this!
+    VK_NC       = 0x100,
+
+    // Convenience definitions for referring to specific textual representations
+    // of relocation specifiers. Note that this means the "_NC" is sometimes
+    // omitted in line with assembly syntax here (VK_LO12 rather than VK_LO12_NC
+    // since a user would write ":lo12:").
+    VK_CALL              = VK_ABS,
+    VK_ABS_PAGE          = VK_ABS      | VK_PAGE,
+    VK_ABS_G3            = VK_ABS      | VK_G3,
+    VK_ABS_G2            = VK_ABS      | VK_G2,
+    VK_ABS_G2_S          = VK_SABS     | VK_G2,
+    VK_ABS_G2_NC         = VK_ABS      | VK_G2      | VK_NC,
+    VK_ABS_G1            = VK_ABS      | VK_G1,
+    VK_ABS_G1_S          = VK_SABS     | VK_G1,
+    VK_ABS_G1_NC         = VK_ABS      | VK_G1      | VK_NC,
+    VK_ABS_G0            = VK_ABS      | VK_G0,
+    VK_ABS_G0_S          = VK_SABS     | VK_G0,
+    VK_ABS_G0_NC         = VK_ABS      | VK_G0      | VK_NC,
+    VK_LO12              = VK_ABS      | VK_PAGEOFF | VK_NC,
+    VK_GOT_LO12          = VK_GOT      | VK_PAGEOFF | VK_NC,
+    VK_GOT_PAGE          = VK_GOT      | VK_PAGE,
+    VK_DTPREL_G2         = VK_DTPREL   | VK_G2,
+    VK_DTPREL_G1         = VK_DTPREL   | VK_G1,
+    VK_DTPREL_G1_NC      = VK_DTPREL   | VK_G1      | VK_NC,
+    VK_DTPREL_G0         = VK_DTPREL   | VK_G0,
+    VK_DTPREL_G0_NC      = VK_DTPREL   | VK_G0      | VK_NC,
+    VK_DTPREL_HI12       = VK_DTPREL   | VK_HI12,
+    VK_DTPREL_LO12       = VK_DTPREL   | VK_PAGEOFF,
+    VK_DTPREL_LO12_NC    = VK_DTPREL   | VK_PAGEOFF | VK_NC,
+    VK_GOTTPREL_PAGE     = VK_GOTTPREL | VK_PAGE,
+    VK_GOTTPREL_LO12_NC  = VK_GOTTPREL | VK_PAGEOFF | VK_NC,
+    VK_GOTTPREL_G1       = VK_GOTTPREL | VK_G1,
+    VK_GOTTPREL_G0_NC    = VK_GOTTPREL | VK_G0      | VK_NC,
+    VK_TPREL_G2          = VK_TPREL    | VK_G2,
+    VK_TPREL_G1          = VK_TPREL    | VK_G1,
+    VK_TPREL_G1_NC       = VK_TPREL    | VK_G1      | VK_NC,
+    VK_TPREL_G0          = VK_TPREL    | VK_G0,
+    VK_TPREL_G0_NC       = VK_TPREL    | VK_G0      | VK_NC,
+    VK_TPREL_HI12        = VK_TPREL    | VK_HI12,
+    VK_TPREL_LO12        = VK_TPREL    | VK_PAGEOFF,
+    VK_TPREL_LO12_NC     = VK_TPREL    | VK_PAGEOFF | VK_NC,
+    VK_TLSDESC_LO12      = VK_TLSDESC  | VK_PAGEOFF | VK_NC,
+    VK_TLSDESC_PAGE      = VK_TLSDESC  | VK_PAGE,
+
+    VK_INVALID  = 0xfff
   };
 
 private:
-  const VariantKind Kind;
   const MCExpr *Expr;
+  const VariantKind Kind;
 
-  explicit AArch64MCExpr(VariantKind _Kind, const MCExpr *_Expr)
-    : Kind(_Kind), Expr(_Expr) {}
+  explicit AArch64MCExpr(const MCExpr *Expr, VariantKind Kind)
+    : Expr(Expr), Kind(Kind) {}
 
 public:
   /// @name Construction
   /// @{
 
-  static const AArch64MCExpr *Create(VariantKind Kind, const MCExpr *Expr,
-                                     MCContext &Ctx);
-
-  static const AArch64MCExpr *CreateLo12(const MCExpr *Expr, MCContext &Ctx) {
-    return Create(VK_AARCH64_LO12, Expr, Ctx);
-  }
-
-  static const AArch64MCExpr *CreateGOT(const MCExpr *Expr, MCContext &Ctx) {
-    return Create(VK_AARCH64_GOT, Expr, Ctx);
-  }
-
-  static const AArch64MCExpr *CreateGOTLo12(const MCExpr *Expr,
-                                            MCContext &Ctx) {
-    return Create(VK_AARCH64_GOT_LO12, Expr, Ctx);
-  }
-
-  static const AArch64MCExpr *CreateDTPREL_G1(const MCExpr *Expr,
-                                             MCContext &Ctx) {
-    return Create(VK_AARCH64_DTPREL_G1, Expr, Ctx);
-  }
-
-  static const AArch64MCExpr *CreateDTPREL_G0_NC(const MCExpr *Expr,
-                                                MCContext &Ctx) {
-    return Create(VK_AARCH64_DTPREL_G0_NC, Expr, Ctx);
-  }
-
-  static const AArch64MCExpr *CreateGOTTPREL(const MCExpr *Expr,
-                                             MCContext &Ctx) {
-    return Create(VK_AARCH64_GOTTPREL, Expr, Ctx);
-  }
-
-  static const AArch64MCExpr *CreateGOTTPRELLo12(const MCExpr *Expr,
-                                                 MCContext &Ctx) {
-    return Create(VK_AARCH64_GOTTPREL_LO12, Expr, Ctx);
-  }
-
-  static const AArch64MCExpr *CreateTLSDesc(const MCExpr *Expr,
-                                            MCContext &Ctx) {
-    return Create(VK_AARCH64_TLSDESC, Expr, Ctx);
-  }
+  static const AArch64MCExpr *Create(const MCExpr *Expr, VariantKind Kind,
+                                   MCContext &Ctx);
 
-  static const AArch64MCExpr *CreateTLSDescLo12(const MCExpr *Expr,
-                                                MCContext &Ctx) {
-    return Create(VK_AARCH64_TLSDESC_LO12, Expr, Ctx);
-  }
+  /// @}
+  /// @name Accessors
+  /// @{
 
-  static const AArch64MCExpr *CreateTPREL_G1(const MCExpr *Expr,
-                                             MCContext &Ctx) {
-    return Create(VK_AARCH64_TPREL_G1, Expr, Ctx);
-  }
+  /// Get the kind of this expression.
+  VariantKind getKind() const { return static_cast<VariantKind>(Kind); }
 
-  static const AArch64MCExpr *CreateTPREL_G0_NC(const MCExpr *Expr,
-                                                MCContext &Ctx) {
-    return Create(VK_AARCH64_TPREL_G0_NC, Expr, Ctx);
-  }
+  /// Get the expression this modifier applies to.
+  const MCExpr *getSubExpr() const { return Expr; }
 
-  static const AArch64MCExpr *CreateABS_G3(const MCExpr *Expr,
-                                           MCContext &Ctx) {
-    return Create(VK_AARCH64_ABS_G3, Expr, Ctx);
-  }
+  /// @}
+  /// @name VariantKind information extractors.
+  /// @{
 
-  static const AArch64MCExpr *CreateABS_G2_NC(const MCExpr *Expr,
-                                           MCContext &Ctx) {
-    return Create(VK_AARCH64_ABS_G2_NC, Expr, Ctx);
+  static VariantKind getSymbolLoc(VariantKind Kind) {
+    return static_cast<VariantKind>(Kind & VK_SymLocBits);
   }
 
-  static const AArch64MCExpr *CreateABS_G1_NC(const MCExpr *Expr,
-                                           MCContext &Ctx) {
-    return Create(VK_AARCH64_ABS_G1_NC, Expr, Ctx);
+  static VariantKind getAddressFrag(VariantKind Kind) {
+    return static_cast<VariantKind>(Kind & VK_AddressFragBits);
   }
 
-  static const AArch64MCExpr *CreateABS_G0_NC(const MCExpr *Expr,
-                                           MCContext &Ctx) {
-    return Create(VK_AARCH64_ABS_G0_NC, Expr, Ctx);
-  }
+  static bool isNotChecked(VariantKind Kind) { return Kind & VK_NC; }
 
   /// @}
-  /// @name Accessors
-  /// @{
 
-  /// getOpcode - Get the kind of this expression.
-  VariantKind getKind() const { return Kind; }
+  /// Convert the variant kind into an ELF-appropriate modifier
+  /// (e.g. ":got:", ":lo12:").
+  StringRef getVariantKindName() const;
 
-  /// getSubExpr - Get the child of this expression.
-  const MCExpr *getSubExpr() const { return Expr; }
+  void PrintImpl(raw_ostream &OS) const override;
 
-  /// @}
+  void AddValueSymbols(MCAssembler *) const override;
+
+  const MCSection *FindAssociatedSection() const override;
 
-  void PrintImpl(raw_ostream &OS) const;
   bool EvaluateAsRelocatableImpl(MCValue &Res,
-                                 const MCAsmLayout *Layout) const;
-  void AddValueSymbols(MCAssembler *) const;
-  const MCSection *FindAssociatedSection() const {
-    return getSubExpr()->FindAssociatedSection();
-  }
+                                 const MCAsmLayout *Layout) const override;
 
-  void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const;
+  void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const override;
 
   static bool classof(const MCExpr *E) {
     return E->getKind() == MCExpr::Target;
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
index 3d19e42..ae698c5 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
@@ -1,4 +1,4 @@
-//===-- AArch64MCTargetDesc.cpp - AArch64 Target Descriptions -------------===//
+//===-- AArch64MCTargetDesc.cpp - AArch64 Target Descriptions ---*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -15,9 +15,7 @@
 #include "AArch64ELFStreamer.h"
 #include "AArch64MCAsmInfo.h"
 #include "InstPrinter/AArch64InstPrinter.h"
-#include "llvm/ADT/APInt.h"
 #include "llvm/MC/MCCodeGenInfo.h"
-#include "llvm/MC/MCInstrAnalysis.h"
 #include "llvm/MC/MCInstrInfo.h"
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCStreamer.h"
@@ -25,8 +23,7 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/TargetRegistry.h"
 
-#define GET_REGINFO_MC_DESC
-#include "AArch64GenRegisterInfo.inc"
+using namespace llvm;
 
 #define GET_INSTRINFO_MC_DESC
 #include "AArch64GenInstrInfo.inc"
@@ -34,26 +31,29 @@
 #define GET_SUBTARGETINFO_MC_DESC
 #include "AArch64GenSubtargetInfo.inc"
 
-using namespace llvm;
+#define GET_REGINFO_MC_DESC
+#include "AArch64GenRegisterInfo.inc"
 
-MCSubtargetInfo *AArch64_MC::createAArch64MCSubtargetInfo(StringRef TT,
-                                                          StringRef CPU,
-                                                          StringRef FS) {
-  MCSubtargetInfo *X = new MCSubtargetInfo();
-  InitAArch64MCSubtargetInfo(X, TT, CPU, FS);
+static MCInstrInfo *createAArch64MCInstrInfo() {
+  MCInstrInfo *X = new MCInstrInfo();
+  InitAArch64MCInstrInfo(X);
   return X;
 }
 
+static MCSubtargetInfo *
+createAArch64MCSubtargetInfo(StringRef TT, StringRef CPU, StringRef FS) {
+  MCSubtargetInfo *X = new MCSubtargetInfo();
 
-static MCInstrInfo *createAArch64MCInstrInfo() {
-  MCInstrInfo *X = new MCInstrInfo();
-  InitAArch64MCInstrInfo(X);
+  if (CPU.empty())
+    CPU = "generic";
+
+  InitAArch64MCSubtargetInfo(X, TT, CPU, FS);
   return X;
 }
 
 static MCRegisterInfo *createAArch64MCRegisterInfo(StringRef Triple) {
   MCRegisterInfo *X = new MCRegisterInfo();
-  InitAArch64MCRegisterInfo(X, AArch64::X30);
+  InitAArch64MCRegisterInfo(X, AArch64::LR);
   return X;
 }
 
@@ -61,9 +61,17 @@ static MCAsmInfo *createAArch64MCAsmInfo(const MCRegisterInfo &MRI,
                                          StringRef TT) {
   Triple TheTriple(TT);
 
-  MCAsmInfo *MAI = new AArch64ELFMCAsmInfo(TT);
-  unsigned Reg = MRI.getDwarfRegNum(AArch64::XSP, true);
-  MCCFIInstruction Inst = MCCFIInstruction::createDefCfa(0, Reg, 0);
+  MCAsmInfo *MAI;
+  if (TheTriple.isOSDarwin())
+    MAI = new AArch64MCAsmInfoDarwin();
+  else {
+    assert(TheTriple.isOSBinFormatELF() && "Only expect Darwin or ELF");
+    MAI = new AArch64MCAsmInfoELF(TT);
+  }
+
+  // Initial state of the frame pointer is SP.
+  unsigned Reg = MRI.getDwarfRegNum(AArch64::SP, true);
+  MCCFIInstruction Inst = MCCFIInstruction::createDefCfa(nullptr, Reg, 0);
   MAI->addInitialFrameState(Inst);
 
   return MAI;
@@ -72,40 +80,35 @@ static MCAsmInfo *createAArch64MCAsmInfo(const MCRegisterInfo &MRI,
 static MCCodeGenInfo *createAArch64MCCodeGenInfo(StringRef TT, Reloc::Model RM,
                                                  CodeModel::Model CM,
                                                  CodeGenOpt::Level OL) {
-  MCCodeGenInfo *X = new MCCodeGenInfo();
-  if (RM == Reloc::Default || RM == Reloc::DynamicNoPIC) {
-    // On ELF platforms the default static relocation model has a smart enough
-    // linker to cope with referencing external symbols defined in a shared
-    // library. Hence DynamicNoPIC doesn't need to be promoted to PIC.
-    RM = Reloc::Static;
-  }
+  Triple TheTriple(TT);
+  assert((TheTriple.isOSBinFormatELF() || TheTriple.isOSBinFormatMachO()) &&
+         "Only expect Darwin and ELF targets");
 
   if (CM == CodeModel::Default)
     CM = CodeModel::Small;
-  else if (CM == CodeModel::JITDefault) {
-    // The default MCJIT memory managers make no guarantees about where they can
-    // find an executable page; JITed code needs to be able to refer to globals
-    // no matter how far away they are.
+  // The default MCJIT memory managers make no guarantees about where they can
+  // find an executable page; JITed code needs to be able to refer to globals
+  // no matter how far away they are.
+  else if (CM == CodeModel::JITDefault)
     CM = CodeModel::Large;
-  }
+  else if (CM != CodeModel::Small && CM != CodeModel::Large)
+    report_fatal_error(
+        "Only small and large code models are allowed on AArch64");
+
+  // AArch64 Darwin is always PIC.
+  if (TheTriple.isOSDarwin())
+    RM = Reloc::PIC_;
+  // On ELF platforms the default static relocation model has a smart enough
+  // linker to cope with referencing external symbols defined in a shared
+  // library. Hence DynamicNoPIC doesn't need to be promoted to PIC.
+  else if (RM == Reloc::Default || RM == Reloc::DynamicNoPIC)
+    RM = Reloc::Static;
 
+  MCCodeGenInfo *X = new MCCodeGenInfo();
   X->InitMCCodeGenInfo(RM, CM, OL);
   return X;
 }
 
-static MCStreamer *createMCStreamer(const Target &T, StringRef TT,
-                                    MCContext &Ctx, MCAsmBackend &MAB,
-                                    raw_ostream &OS,
-                                    MCCodeEmitter *Emitter,
-                                    const MCSubtargetInfo &STI,
-                                    bool RelaxAll,
-                                    bool NoExecStack) {
-  Triple TheTriple(TT);
-
-  return createAArch64ELFStreamer(Ctx, MAB, OS, Emitter, RelaxAll, NoExecStack);
-}
-
-
 static MCInstPrinter *createAArch64MCInstPrinter(const Target &T,
                                                  unsigned SyntaxVariant,
                                                  const MCAsmInfo &MAI,
@@ -114,108 +117,109 @@ static MCInstPrinter *createAArch64MCInstPrinter(const Target &T,
                                                  const MCSubtargetInfo &STI) {
   if (SyntaxVariant == 0)
     return new AArch64InstPrinter(MAI, MII, MRI, STI);
-  return 0;
-}
-
-namespace {
-
-class AArch64MCInstrAnalysis : public MCInstrAnalysis {
-public:
-  AArch64MCInstrAnalysis(const MCInstrInfo *Info) : MCInstrAnalysis(Info) {}
-
-  virtual bool isUnconditionalBranch(const MCInst &Inst) const {
-    if (Inst.getOpcode() == AArch64::Bcc
-        && Inst.getOperand(0).getImm() == A64CC::AL)
-      return true;
-    return MCInstrAnalysis::isUnconditionalBranch(Inst);
-  }
-
-  virtual bool isConditionalBranch(const MCInst &Inst) const {
-    if (Inst.getOpcode() == AArch64::Bcc
-        && Inst.getOperand(0).getImm() == A64CC::AL)
-      return false;
-    return MCInstrAnalysis::isConditionalBranch(Inst);
-  }
-
-  bool evaluateBranch(const MCInst &Inst, uint64_t Addr,
-                      uint64_t Size, uint64_t &Target) const {
-    unsigned LblOperand = Inst.getOpcode() == AArch64::Bcc ? 1 : 0;
-    // FIXME: We only handle PCRel branches for now.
-    if (Info->get(Inst.getOpcode()).OpInfo[LblOperand].OperandType
-        != MCOI::OPERAND_PCREL)
-      return false;
-
-    int64_t Imm = Inst.getOperand(LblOperand).getImm();
-    Target = Addr + Imm;
-    return true;
-  }
-};
+  if (SyntaxVariant == 1)
+    return new AArch64AppleInstPrinter(MAI, MII, MRI, STI);
 
+  return nullptr;
 }
 
-static MCInstrAnalysis *createAArch64MCInstrAnalysis(const MCInstrInfo *Info) {
-  return new AArch64MCInstrAnalysis(Info);
-}
+static MCStreamer *createMCStreamer(const Target &T, StringRef TT,
+                                    MCContext &Ctx, MCAsmBackend &TAB,
+                                    raw_ostream &OS, MCCodeEmitter *Emitter,
+                                    const MCSubtargetInfo &STI, bool RelaxAll,
+                                    bool NoExecStack) {
+  Triple TheTriple(TT);
 
+  if (TheTriple.isOSDarwin())
+    return createMachOStreamer(Ctx, TAB, OS, Emitter, RelaxAll,
+                               /*LabelSections*/ true);
 
+  return createAArch64ELFStreamer(Ctx, TAB, OS, Emitter, RelaxAll, NoExecStack);
+}
 
+// Force static initialization.
 extern "C" void LLVMInitializeAArch64TargetMC() {
   // Register the MC asm info.
-  RegisterMCAsmInfoFn A(TheAArch64leTarget, createAArch64MCAsmInfo);
-  RegisterMCAsmInfoFn B(TheAArch64beTarget, createAArch64MCAsmInfo);
+  RegisterMCAsmInfoFn X(TheAArch64leTarget, createAArch64MCAsmInfo);
+  RegisterMCAsmInfoFn Y(TheAArch64beTarget, createAArch64MCAsmInfo);
+  RegisterMCAsmInfoFn Z(TheARM64leTarget, createAArch64MCAsmInfo);
+  RegisterMCAsmInfoFn W(TheARM64beTarget, createAArch64MCAsmInfo);
 
   // Register the MC codegen info.
   TargetRegistry::RegisterMCCodeGenInfo(TheAArch64leTarget,
                                         createAArch64MCCodeGenInfo);
   TargetRegistry::RegisterMCCodeGenInfo(TheAArch64beTarget,
                                         createAArch64MCCodeGenInfo);
+  TargetRegistry::RegisterMCCodeGenInfo(TheARM64leTarget,
+                                        createAArch64MCCodeGenInfo);
+  TargetRegistry::RegisterMCCodeGenInfo(TheARM64beTarget,
+                                        createAArch64MCCodeGenInfo);
 
   // Register the MC instruction info.
   TargetRegistry::RegisterMCInstrInfo(TheAArch64leTarget,
                                       createAArch64MCInstrInfo);
   TargetRegistry::RegisterMCInstrInfo(TheAArch64beTarget,
                                       createAArch64MCInstrInfo);
+  TargetRegistry::RegisterMCInstrInfo(TheARM64leTarget,
+                                      createAArch64MCInstrInfo);
+  TargetRegistry::RegisterMCInstrInfo(TheARM64beTarget,
+                                      createAArch64MCInstrInfo);
 
   // Register the MC register info.
   TargetRegistry::RegisterMCRegInfo(TheAArch64leTarget,
                                     createAArch64MCRegisterInfo);
   TargetRegistry::RegisterMCRegInfo(TheAArch64beTarget,
                                     createAArch64MCRegisterInfo);
+  TargetRegistry::RegisterMCRegInfo(TheARM64leTarget,
+                                    createAArch64MCRegisterInfo);
+  TargetRegistry::RegisterMCRegInfo(TheARM64beTarget,
+                                    createAArch64MCRegisterInfo);
 
   // Register the MC subtarget info.
-  using AArch64_MC::createAArch64MCSubtargetInfo;
   TargetRegistry::RegisterMCSubtargetInfo(TheAArch64leTarget,
                                           createAArch64MCSubtargetInfo);
   TargetRegistry::RegisterMCSubtargetInfo(TheAArch64beTarget,
                                           createAArch64MCSubtargetInfo);
+  TargetRegistry::RegisterMCSubtargetInfo(TheARM64leTarget,
+                                          createAArch64MCSubtargetInfo);
+  TargetRegistry::RegisterMCSubtargetInfo(TheARM64beTarget,
+                                          createAArch64MCSubtargetInfo);
 
-  // Register the MC instruction analyzer.
-  TargetRegistry::RegisterMCInstrAnalysis(TheAArch64leTarget,
-                                          createAArch64MCInstrAnalysis);
-  TargetRegistry::RegisterMCInstrAnalysis(TheAArch64beTarget,
-                                          createAArch64MCInstrAnalysis);
+  // Register the asm backend.
+  TargetRegistry::RegisterMCAsmBackend(TheAArch64leTarget,
+                                       createAArch64leAsmBackend);
+  TargetRegistry::RegisterMCAsmBackend(TheAArch64beTarget,
+                                       createAArch64beAsmBackend);
+  TargetRegistry::RegisterMCAsmBackend(TheARM64leTarget,
+                                       createAArch64leAsmBackend);
+  TargetRegistry::RegisterMCAsmBackend(TheARM64beTarget,
+                                       createAArch64beAsmBackend);
 
   // Register the MC Code Emitter
   TargetRegistry::RegisterMCCodeEmitter(TheAArch64leTarget,
                                         createAArch64MCCodeEmitter);
   TargetRegistry::RegisterMCCodeEmitter(TheAArch64beTarget,
                                         createAArch64MCCodeEmitter);
-
-  // Register the asm backend.
-  TargetRegistry::RegisterMCAsmBackend(TheAArch64leTarget,
-                                       createAArch64leAsmBackend);
-  TargetRegistry::RegisterMCAsmBackend(TheAArch64beTarget,
-                                       createAArch64beAsmBackend);
+  TargetRegistry::RegisterMCCodeEmitter(TheARM64leTarget,
+                                        createAArch64MCCodeEmitter);
+  TargetRegistry::RegisterMCCodeEmitter(TheARM64beTarget,
+                                        createAArch64MCCodeEmitter);
 
   // Register the object streamer.
   TargetRegistry::RegisterMCObjectStreamer(TheAArch64leTarget,
                                            createMCStreamer);
   TargetRegistry::RegisterMCObjectStreamer(TheAArch64beTarget,
                                            createMCStreamer);
+  TargetRegistry::RegisterMCObjectStreamer(TheARM64leTarget, createMCStreamer);
+  TargetRegistry::RegisterMCObjectStreamer(TheARM64beTarget, createMCStreamer);
 
   // Register the MCInstPrinter.
   TargetRegistry::RegisterMCInstPrinter(TheAArch64leTarget,
                                         createAArch64MCInstPrinter);
   TargetRegistry::RegisterMCInstPrinter(TheAArch64beTarget,
                                         createAArch64MCInstPrinter);
+  TargetRegistry::RegisterMCInstPrinter(TheARM64leTarget,
+                                        createAArch64MCInstPrinter);
+  TargetRegistry::RegisterMCInstPrinter(TheARM64beTarget,
+                                        createAArch64MCInstPrinter);
 }
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h
index bd8beaf..d886ea2 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.h
@@ -11,18 +11,19 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_AARCH64MCTARGETDESC_H
-#define LLVM_AARCH64MCTARGETDESC_H
+#ifndef AArch64MCTARGETDESC_H
+#define AArch64MCTARGETDESC_H
 
 #include "llvm/Support/DataTypes.h"
+#include <string>
 
 namespace llvm {
 class MCAsmBackend;
 class MCCodeEmitter;
 class MCContext;
 class MCInstrInfo;
-class MCObjectWriter;
 class MCRegisterInfo;
+class MCObjectWriter;
 class MCSubtargetInfo;
 class StringRef;
 class Target;
@@ -30,28 +31,25 @@ class raw_ostream;
 
 extern Target TheAArch64leTarget;
 extern Target TheAArch64beTarget;
-
-namespace AArch64_MC {
-  MCSubtargetInfo *createAArch64MCSubtargetInfo(StringRef TT, StringRef CPU,
-                                                StringRef FS);
-}
+extern Target TheARM64leTarget;
+extern Target TheARM64beTarget;
 
 MCCodeEmitter *createAArch64MCCodeEmitter(const MCInstrInfo &MCII,
-                                          const MCRegisterInfo &MRI,
-                                          const MCSubtargetInfo &STI,
-                                          MCContext &Ctx);
+                                        const MCRegisterInfo &MRI,
+                                        const MCSubtargetInfo &STI,
+                                        MCContext &Ctx);
+MCAsmBackend *createAArch64leAsmBackend(const Target &T,
+                                        const MCRegisterInfo &MRI, StringRef TT,
+                                        StringRef CPU);
+MCAsmBackend *createAArch64beAsmBackend(const Target &T,
+                                        const MCRegisterInfo &MRI, StringRef TT,
+                                        StringRef CPU);
 
-MCObjectWriter *createAArch64ELFObjectWriter(raw_ostream &OS,
-                                             uint8_t OSABI,
+MCObjectWriter *createAArch64ELFObjectWriter(raw_ostream &OS, uint8_t OSABI,
                                              bool IsLittleEndian);
 
-MCAsmBackend *createAArch64leAsmBackend(const Target &T,
-                                        const MCRegisterInfo &MRI,
-                                        StringRef TT, StringRef CPU);
-
-MCAsmBackend *createAArch64beAsmBackend(const Target &T,
-                                        const MCRegisterInfo &MRI,
-                                        StringRef TT, StringRef CPU);
+MCObjectWriter *createAArch64MachObjectWriter(raw_ostream &OS, uint32_t CPUType,
+                                            uint32_t CPUSubtype);
 
 } // End llvm namespace
 
diff --git a/lib/Target/ARM64/MCTargetDesc/ARM64MachObjectWriter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp
index 1733dc5..5c86189 100644
--- a/lib/Target/ARM64/MCTargetDesc/ARM64MachObjectWriter.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp
@@ -1,4 +1,4 @@
-//===-- ARMMachObjectWriter.cpp - ARM Mach Object Writer ------------------===//
+//===-- AArch64MachObjectWriter.cpp - ARM Mach Object Writer --------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "MCTargetDesc/ARM64FixupKinds.h"
-#include "MCTargetDesc/ARM64MCTargetDesc.h"
+#include "MCTargetDesc/AArch64FixupKinds.h"
+#include "MCTargetDesc/AArch64MCTargetDesc.h"
 #include "llvm/MC/MCAssembler.h"
 #include "llvm/MC/MCAsmLayout.h"
 #include "llvm/MC/MCContext.h"
@@ -23,24 +23,24 @@
 using namespace llvm;
 
 namespace {
-class ARM64MachObjectWriter : public MCMachObjectTargetWriter {
-  bool getARM64FixupKindMachOInfo(const MCFixup &Fixup, unsigned &RelocType,
+class AArch64MachObjectWriter : public MCMachObjectTargetWriter {
+  bool getAArch64FixupKindMachOInfo(const MCFixup &Fixup, unsigned &RelocType,
                                   const MCSymbolRefExpr *Sym,
                                   unsigned &Log2Size, const MCAssembler &Asm);
 
 public:
-  ARM64MachObjectWriter(uint32_t CPUType, uint32_t CPUSubtype)
+  AArch64MachObjectWriter(uint32_t CPUType, uint32_t CPUSubtype)
       : MCMachObjectTargetWriter(true /* is64Bit */, CPUType, CPUSubtype,
                                  /*UseAggressiveSymbolFolding=*/true) {}
 
   void RecordRelocation(MachObjectWriter *Writer, const MCAssembler &Asm,
                         const MCAsmLayout &Layout, const MCFragment *Fragment,
                         const MCFixup &Fixup, MCValue Target,
-                        uint64_t &FixedValue);
+                        uint64_t &FixedValue) override;
 };
 }
 
-bool ARM64MachObjectWriter::getARM64FixupKindMachOInfo(
+bool AArch64MachObjectWriter::getAArch64FixupKindMachOInfo(
     const MCFixup &Fixup, unsigned &RelocType, const MCSymbolRefExpr *Sym,
     unsigned &Log2Size, const MCAssembler &Asm) {
   RelocType = unsigned(MachO::ARM64_RELOC_UNSIGNED);
@@ -66,12 +66,12 @@ bool ARM64MachObjectWriter::getARM64FixupKindMachOInfo(
     if (Sym->getKind() == MCSymbolRefExpr::VK_GOT)
       RelocType = unsigned(MachO::ARM64_RELOC_POINTER_TO_GOT);
     return true;
-  case ARM64::fixup_arm64_add_imm12:
-  case ARM64::fixup_arm64_ldst_imm12_scale1:
-  case ARM64::fixup_arm64_ldst_imm12_scale2:
-  case ARM64::fixup_arm64_ldst_imm12_scale4:
-  case ARM64::fixup_arm64_ldst_imm12_scale8:
-  case ARM64::fixup_arm64_ldst_imm12_scale16:
+  case AArch64::fixup_aarch64_add_imm12:
+  case AArch64::fixup_aarch64_ldst_imm12_scale1:
+  case AArch64::fixup_aarch64_ldst_imm12_scale2:
+  case AArch64::fixup_aarch64_ldst_imm12_scale4:
+  case AArch64::fixup_aarch64_ldst_imm12_scale8:
+  case AArch64::fixup_aarch64_ldst_imm12_scale16:
     Log2Size = llvm::Log2_32(4);
     switch (Sym->getKind()) {
     default:
@@ -86,7 +86,7 @@ bool ARM64MachObjectWriter::getARM64FixupKindMachOInfo(
       RelocType = unsigned(MachO::ARM64_RELOC_TLVP_LOAD_PAGEOFF12);
       return true;
     }
-  case ARM64::fixup_arm64_pcrel_adrp_imm21:
+  case AArch64::fixup_aarch64_pcrel_adrp_imm21:
     Log2Size = llvm::Log2_32(4);
     // This encompasses the relocation for the whole 21-bit value.
     switch (Sym->getKind()) {
@@ -104,15 +104,15 @@ bool ARM64MachObjectWriter::getARM64FixupKindMachOInfo(
       return true;
     }
     return true;
-  case ARM64::fixup_arm64_pcrel_branch26:
-  case ARM64::fixup_arm64_pcrel_call26:
+  case AArch64::fixup_aarch64_pcrel_branch26:
+  case AArch64::fixup_aarch64_pcrel_call26:
     Log2Size = llvm::Log2_32(4);
     RelocType = unsigned(MachO::ARM64_RELOC_BRANCH26);
     return true;
   }
 }
 
-void ARM64MachObjectWriter::RecordRelocation(
+void AArch64MachObjectWriter::RecordRelocation(
     MachObjectWriter *Writer, const MCAssembler &Asm, const MCAsmLayout &Layout,
     const MCFragment *Fragment, const MCFixup &Fixup, MCValue Target,
     uint64_t &FixedValue) {
@@ -129,21 +129,20 @@ void ARM64MachObjectWriter::RecordRelocation(
 
   FixupOffset += Fixup.getOffset();
 
-  // ARM64 pcrel relocation addends do not include the section offset.
+  // AArch64 pcrel relocation addends do not include the section offset.
   if (IsPCRel)
     FixedValue += FixupOffset;
 
   // ADRP fixups use relocations for the whole symbol value and only
   // put the addend in the instruction itself. Clear out any value the
   // generic code figured out from the sybmol definition.
-  if (Kind == ARM64::fixup_arm64_pcrel_adrp_imm21 ||
-      Kind == ARM64::fixup_arm64_pcrel_imm19)
+  if (Kind == AArch64::fixup_aarch64_pcrel_adrp_imm21)
     FixedValue = 0;
 
   // imm19 relocations are for conditional branches, which require
   // assembler local symbols. If we got here, that's not what we have,
   // so complain loudly.
-  if (Kind == ARM64::fixup_arm64_pcrel_imm19) {
+  if (Kind == AArch64::fixup_aarch64_pcrel_branch19) {
     Asm.getContext().FatalError(Fixup.getLoc(),
                                 "conditional branch requires assembler-local"
                                 " label. '" +
@@ -154,15 +153,15 @@ void ARM64MachObjectWriter::RecordRelocation(
 
   // 14-bit branch relocations should only target internal labels, and so
   // should never get here.
-  if (Kind == ARM64::fixup_arm64_pcrel_branch14) {
+  if (Kind == AArch64::fixup_aarch64_pcrel_branch14) {
     Asm.getContext().FatalError(Fixup.getLoc(),
                                 "Invalid relocation on conditional branch!");
     return;
   }
 
-  if (!getARM64FixupKindMachOInfo(Fixup, Type, Target.getSymA(), Log2Size,
+  if (!getAArch64FixupKindMachOInfo(Fixup, Type, Target.getSymA(), Log2Size,
                                   Asm)) {
-    Asm.getContext().FatalError(Fixup.getLoc(), "unknown ARM64 fixup kind!");
+    Asm.getContext().FatalError(Fixup.getLoc(), "unknown AArch64 fixup kind!");
     return;
   }
 
@@ -184,11 +183,11 @@ void ARM64MachObjectWriter::RecordRelocation(
     }
   } else if (Target.getSymB()) { // A - B + constant
     const MCSymbol *A = &Target.getSymA()->getSymbol();
-    MCSymbolData &A_SD = Asm.getSymbolData(*A);
+    const MCSymbolData &A_SD = Asm.getSymbolData(*A);
     const MCSymbolData *A_Base = Asm.getAtom(&A_SD);
 
     const MCSymbol *B = &Target.getSymB()->getSymbol();
-    MCSymbolData &B_SD = Asm.getSymbolData(*B);
+    const MCSymbolData &B_SD = Asm.getSymbolData(*B);
     const MCSymbolData *B_Base = Asm.getAtom(&B_SD);
 
     // Check for "_foo@got - .", which comes through here as:
@@ -221,8 +220,8 @@ void ARM64MachObjectWriter::RecordRelocation(
                                   "unsupported pc-relative relocation of "
                                   "difference");
 
-    // ARM64 always uses external relocations. If there is no symbol to use as
-    // a base address (a local symbol with no preceeding non-local symbol),
+    // AArch64 always uses external relocations. If there is no symbol to use as
+    // a base address (a local symbol with no preceding non-local symbol),
     // error out.
     //
     // FIXME: We should probably just synthesize an external symbol and use
@@ -242,14 +241,14 @@ void ARM64MachObjectWriter::RecordRelocation(
       Asm.getContext().FatalError(Fixup.getLoc(),
                                   "unsupported relocation with identical base");
 
-    Value += (A_SD.getFragment() == NULL ? 0 : Writer->getSymbolAddress(
-                                                   &A_SD, Layout)) -
-             (A_Base == NULL || A_Base->getFragment() == NULL
+    Value += (!A_SD.getFragment() ? 0
+                                  : Writer->getSymbolAddress(&A_SD, Layout)) -
+             (!A_Base || !A_Base->getFragment()
                   ? 0
                   : Writer->getSymbolAddress(A_Base, Layout));
-    Value -= (B_SD.getFragment() == NULL ? 0 : Writer->getSymbolAddress(
-                                                   &B_SD, Layout)) -
-             (B_Base == NULL || B_Base->getFragment() == NULL
+    Value -= (!B_SD.getFragment() ? 0
+                                  : Writer->getSymbolAddress(&B_SD, Layout)) -
+             (!B_Base || !B_Base->getFragment()
                   ? 0
                   : Writer->getSymbolAddress(B_Base, Layout));
 
@@ -268,7 +267,7 @@ void ARM64MachObjectWriter::RecordRelocation(
     Type = MachO::ARM64_RELOC_SUBTRACTOR;
   } else { // A + constant
     const MCSymbol *Symbol = &Target.getSymA()->getSymbol();
-    MCSymbolData &SD = Asm.getSymbolData(*Symbol);
+    const MCSymbolData &SD = Asm.getSymbolData(*Symbol);
     const MCSymbolData *Base = Asm.getAtom(&SD);
     const MCSectionMachO &Section = static_cast<const MCSectionMachO &>(
         Fragment->getParent()->getSection());
@@ -303,12 +302,12 @@ void ARM64MachObjectWriter::RecordRelocation(
     // have already been fixed up.
     if (Symbol->isInSection()) {
       if (Section.hasAttribute(MachO::S_ATTR_DEBUG))
-        Base = 0;
+        Base = nullptr;
     }
 
-    // ARM64 uses external relocations as much as possible. For debug sections,
-    // and for pointer-sized relocations (.quad), we allow section relocations.
-    // It's code sections that run into trouble.
+    // AArch64 uses external relocations as much as possible. For debug
+    // sections, and for pointer-sized relocations (.quad), we allow section
+    // relocations.  It's code sections that run into trouble.
     if (Base) {
       Index = Base->getIndex();
       IsExtern = 1;
@@ -388,9 +387,10 @@ void ARM64MachObjectWriter::RecordRelocation(
   Writer->addRelocation(Fragment->getParent(), MRE);
 }
 
-MCObjectWriter *llvm::createARM64MachObjectWriter(raw_ostream &OS,
+MCObjectWriter *llvm::createAArch64MachObjectWriter(raw_ostream &OS,
                                                   uint32_t CPUType,
                                                   uint32_t CPUSubtype) {
-  return createMachObjectWriter(new ARM64MachObjectWriter(CPUType, CPUSubtype),
-                                OS, /*IsLittleEndian=*/true);
+  return createMachObjectWriter(
+      new AArch64MachObjectWriter(CPUType, CPUSubtype), OS,
+      /*IsLittleEndian=*/true);
 }
diff --git a/lib/Target/AArch64/MCTargetDesc/Android.mk b/lib/Target/AArch64/MCTargetDesc/Android.mk
index edcf1f2..c0cdb2b 100644
--- a/lib/Target/AArch64/MCTargetDesc/Android.mk
+++ b/lib/Target/AArch64/MCTargetDesc/Android.mk
@@ -10,6 +10,7 @@ arm64_mc_desc_SRC_FILES := \
   AArch64AsmBackend.cpp \
   AArch64ELFObjectWriter.cpp \
   AArch64ELFStreamer.cpp \
+  AArch64MachObjectWriter.cpp \
   AArch64MCAsmInfo.cpp \
   AArch64MCCodeEmitter.cpp \
   AArch64MCExpr.cpp \
diff --git a/lib/Target/AArch64/MCTargetDesc/CMakeLists.txt b/lib/Target/AArch64/MCTargetDesc/CMakeLists.txt
index 54c4465..7d5bced 100644
--- a/lib/Target/AArch64/MCTargetDesc/CMakeLists.txt
+++ b/lib/Target/AArch64/MCTargetDesc/CMakeLists.txt
@@ -6,4 +6,9 @@ add_llvm_library(LLVMAArch64Desc
   AArch64MCCodeEmitter.cpp
   AArch64MCExpr.cpp
   AArch64MCTargetDesc.cpp
-  )
+  AArch64MachObjectWriter.cpp
+)
+add_dependencies(LLVMAArch64Desc AArch64CommonTableGen)
+
+# Hack: we need to include 'main' target directory to grab private headers
+include_directories(${CMAKE_CURRENT_SOURCE_DIR}/.. ${CMAKE_CURRENT_BINARY_DIR}/..)
diff --git a/lib/Target/AArch64/MCTargetDesc/LLVMBuild.txt b/lib/Target/AArch64/MCTargetDesc/LLVMBuild.txt
index 37c8035..70cff0b 100644
--- a/lib/Target/AArch64/MCTargetDesc/LLVMBuild.txt
+++ b/lib/Target/AArch64/MCTargetDesc/LLVMBuild.txt
@@ -1,4 +1,4 @@
-;===- ./lib/Target/AArch64/MCTargetDesc/LLVMBuild.txt ----------*- Conf -*--===;
+;===- ./lib/Target/AArch64/MCTargetDesc/LLVMBuild.txt ------------*- Conf -*--===;
 ;
 ;                     The LLVM Compiler Infrastructure
 ;
diff --git a/lib/Target/AArch64/Makefile b/lib/Target/AArch64/Makefile
index 641bb83..f356c58 100644
--- a/lib/Target/AArch64/Makefile
+++ b/lib/Target/AArch64/Makefile
@@ -12,19 +12,14 @@ LIBRARYNAME = LLVMAArch64CodeGen
 TARGET = AArch64
 
 # Make sure that tblgen is run, first thing.
-BUILT_SOURCES = AArch64GenAsmMatcher.inc \
-   AArch64GenAsmWriter.inc \
-   AArch64GenCallingConv.inc \
-   AArch64GenDAGISel.inc \
-   AArch64GenDisassemblerTables.inc \
-   AArch64GenInstrInfo.inc \
-   AArch64GenMCCodeEmitter.inc \
-   AArch64GenMCPseudoLowering.inc \
-   AArch64GenRegisterInfo.inc \
-   AArch64GenSubtargetInfo.inc
+BUILT_SOURCES = AArch64GenRegisterInfo.inc AArch64GenInstrInfo.inc \
+		AArch64GenAsmWriter.inc AArch64GenAsmWriter1.inc \
+		AArch64GenDAGISel.inc \
+		AArch64GenCallingConv.inc AArch64GenAsmMatcher.inc \
+		AArch64GenSubtargetInfo.inc AArch64GenMCCodeEmitter.inc \
+		AArch64GenFastISel.inc AArch64GenDisassemblerTables.inc \
+		AArch64GenMCPseudoLowering.inc
 
-DIRS = InstPrinter AsmParser Disassembler TargetInfo MCTargetDesc Utils
+DIRS = TargetInfo InstPrinter AsmParser Disassembler MCTargetDesc Utils
 
 include $(LEVEL)/Makefile.common
-
-
diff --git a/lib/Target/AArch64/README.txt b/lib/Target/AArch64/README.txt
deleted file mode 100644
index 601990f..0000000
--- a/lib/Target/AArch64/README.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-This file will contain changes that need to be made before AArch64 can become an
-officially supported target. Currently a placeholder.
diff --git a/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp b/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp
index 9281e4e..3a382c1 100644
--- a/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp
+++ b/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp
@@ -1,4 +1,4 @@
-//===-- AArch64TargetInfo.cpp - AArch64 Target Implementation -------------===//
+//===-- AArch64TargetInfo.cpp - AArch64 Target Implementation -----------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -6,22 +6,26 @@
 // License. See LICENSE.TXT for details.
 //
 //===----------------------------------------------------------------------===//
-//
-// This file contains the key registration step for the architecture.
-//
-//===----------------------------------------------------------------------===//
 
-#include "AArch64.h"
-#include "llvm/IR/Module.h"
+#include "llvm/ADT/Triple.h"
 #include "llvm/Support/TargetRegistry.h"
 using namespace llvm;
 
-Target llvm::TheAArch64leTarget;
-Target llvm::TheAArch64beTarget;
+namespace llvm {
+Target TheAArch64leTarget;
+Target TheAArch64beTarget;
+Target TheARM64leTarget;
+Target TheARM64beTarget;
+} // end namespace llvm
 
 extern "C" void LLVMInitializeAArch64TargetInfo() {
-    RegisterTarget<Triple::aarch64, /*HasJIT=*/true>
-    X(TheAArch64leTarget, "aarch64", "AArch64 (ARM 64-bit little endian target)");
-    RegisterTarget<Triple::aarch64_be, /*HasJIT=*/true>
-    Y(TheAArch64beTarget, "aarch64_be", "AArch64 (ARM 64-bit big endian target)");
+  RegisterTarget<Triple::arm64, /*HasJIT=*/true> X(TheARM64leTarget, "arm64",
+                                                   "AArch64 (little endian)");
+  RegisterTarget<Triple::arm64_be, /*HasJIT=*/true> Y(TheARM64beTarget, "arm64_be",
+                                                      "AArch64 (big endian)");
+
+  RegisterTarget<Triple::aarch64, /*HasJIT=*/true> Z(
+      TheAArch64leTarget, "aarch64", "AArch64 (little endian)");
+  RegisterTarget<Triple::aarch64_be, /*HasJIT=*/true> W(
+      TheAArch64beTarget, "aarch64_be", "AArch64 (big endian)");
 }
diff --git a/lib/Target/AArch64/TargetInfo/CMakeLists.txt b/lib/Target/AArch64/TargetInfo/CMakeLists.txt
index ee734c6..e236eed 100644
--- a/lib/Target/AArch64/TargetInfo/CMakeLists.txt
+++ b/lib/Target/AArch64/TargetInfo/CMakeLists.txt
@@ -1,3 +1,7 @@
+include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
+
 add_llvm_library(LLVMAArch64Info
   AArch64TargetInfo.cpp
   )
+
+add_dependencies(LLVMAArch64Info AArch64CommonTableGen)
diff --git a/lib/Target/AArch64/TargetInfo/LLVMBuild.txt b/lib/Target/AArch64/TargetInfo/LLVMBuild.txt
index 6429172..93c5407 100644
--- a/lib/Target/AArch64/TargetInfo/LLVMBuild.txt
+++ b/lib/Target/AArch64/TargetInfo/LLVMBuild.txt
@@ -1,4 +1,4 @@
-;===- ./lib/Target/AArch64/TargetInfo/LLVMBuild.txt ------------*- Conf -*--===;
+;===- ./lib/Target/AArch64/TargetInfo/LLVMBuild.txt --------------*- Conf -*--===;
 ;
 ;                     The LLVM Compiler Infrastructure
 ;
diff --git a/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp b/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp
index 2a97cd6..3c24bb3 100644
--- a/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp
+++ b/lib/Target/AArch64/Utils/AArch64BaseInfo.cpp
@@ -18,7 +18,7 @@
 
 using namespace llvm;
 
-StringRef NamedImmMapper::toString(uint32_t Value, bool &Valid) const {
+StringRef AArch64NamedImmMapper::toString(uint32_t Value, bool &Valid) const {
   for (unsigned i = 0; i < NumPairs; ++i) {
     if (Pairs[i].Value == Value) {
       Valid = true;
@@ -30,7 +30,7 @@ StringRef NamedImmMapper::toString(uint32_t Value, bool &Valid) const {
   return StringRef();
 }
 
-uint32_t NamedImmMapper::fromString(StringRef Name, bool &Valid) const {
+uint32_t AArch64NamedImmMapper::fromString(StringRef Name, bool &Valid) const {
   std::string LowerCaseName = Name.lower();
   for (unsigned i = 0; i < NumPairs; ++i) {
     if (Pairs[i].Name == LowerCaseName) {
@@ -43,11 +43,11 @@ uint32_t NamedImmMapper::fromString(StringRef Name, bool &Valid) const {
   return -1;
 }
 
-bool NamedImmMapper::validImm(uint32_t Value) const {
+bool AArch64NamedImmMapper::validImm(uint32_t Value) const {
   return Value < TooBigImm;
 }
 
-const NamedImmMapper::Mapping A64AT::ATMapper::ATPairs[] = {
+const AArch64NamedImmMapper::Mapping AArch64AT::ATMapper::ATPairs[] = {
   {"s1e1r", S1E1R},
   {"s1e2r", S1E2R},
   {"s1e3r", S1E3R},
@@ -62,10 +62,10 @@ const NamedImmMapper::Mapping A64AT::ATMapper::ATPairs[] = {
   {"s12e0w", S12E0W},
 };
 
-A64AT::ATMapper::ATMapper()
-  : NamedImmMapper(ATPairs, 0) {}
+AArch64AT::ATMapper::ATMapper()
+  : AArch64NamedImmMapper(ATPairs, 0) {}
 
-const NamedImmMapper::Mapping A64DB::DBarrierMapper::DBarrierPairs[] = {
+const AArch64NamedImmMapper::Mapping AArch64DB::DBarrierMapper::DBarrierPairs[] = {
   {"oshld", OSHLD},
   {"oshst", OSHST},
   {"osh", OSH},
@@ -80,10 +80,10 @@ const NamedImmMapper::Mapping A64DB::DBarrierMapper::DBarrierPairs[] = {
   {"sy", SY}
 };
 
-A64DB::DBarrierMapper::DBarrierMapper()
-  : NamedImmMapper(DBarrierPairs, 16u) {}
+AArch64DB::DBarrierMapper::DBarrierMapper()
+  : AArch64NamedImmMapper(DBarrierPairs, 16u) {}
 
-const NamedImmMapper::Mapping A64DC::DCMapper::DCPairs[] = {
+const AArch64NamedImmMapper::Mapping AArch64DC::DCMapper::DCPairs[] = {
   {"zva", ZVA},
   {"ivac", IVAC},
   {"isw", ISW},
@@ -94,26 +94,26 @@ const NamedImmMapper::Mapping A64DC::DCMapper::DCPairs[] = {
   {"cisw", CISW}
 };
 
-A64DC::DCMapper::DCMapper()
-  : NamedImmMapper(DCPairs, 0) {}
+AArch64DC::DCMapper::DCMapper()
+  : AArch64NamedImmMapper(DCPairs, 0) {}
 
-const NamedImmMapper::Mapping A64IC::ICMapper::ICPairs[] = {
+const AArch64NamedImmMapper::Mapping AArch64IC::ICMapper::ICPairs[] = {
   {"ialluis",  IALLUIS},
   {"iallu", IALLU},
   {"ivau", IVAU}
 };
 
-A64IC::ICMapper::ICMapper()
-  : NamedImmMapper(ICPairs, 0) {}
+AArch64IC::ICMapper::ICMapper()
+  : AArch64NamedImmMapper(ICPairs, 0) {}
 
-const NamedImmMapper::Mapping A64ISB::ISBMapper::ISBPairs[] = {
+const AArch64NamedImmMapper::Mapping AArch64ISB::ISBMapper::ISBPairs[] = {
   {"sy",  SY},
 };
 
-A64ISB::ISBMapper::ISBMapper()
-  : NamedImmMapper(ISBPairs, 16) {}
+AArch64ISB::ISBMapper::ISBMapper()
+  : AArch64NamedImmMapper(ISBPairs, 16) {}
 
-const NamedImmMapper::Mapping A64PRFM::PRFMMapper::PRFMPairs[] = {
+const AArch64NamedImmMapper::Mapping AArch64PRFM::PRFMMapper::PRFMPairs[] = {
   {"pldl1keep", PLDL1KEEP},
   {"pldl1strm", PLDL1STRM},
   {"pldl2keep", PLDL2KEEP},
@@ -134,19 +134,19 @@ const NamedImmMapper::Mapping A64PRFM::PRFMMapper::PRFMPairs[] = {
   {"pstl3strm", PSTL3STRM}
 };
 
-A64PRFM::PRFMMapper::PRFMMapper()
-  : NamedImmMapper(PRFMPairs, 32) {}
+AArch64PRFM::PRFMMapper::PRFMMapper()
+  : AArch64NamedImmMapper(PRFMPairs, 32) {}
 
-const NamedImmMapper::Mapping A64PState::PStateMapper::PStatePairs[] = {
+const AArch64NamedImmMapper::Mapping AArch64PState::PStateMapper::PStatePairs[] = {
   {"spsel", SPSel},
   {"daifset", DAIFSet},
   {"daifclr", DAIFClr}
 };
 
-A64PState::PStateMapper::PStateMapper()
-  : NamedImmMapper(PStatePairs, 0) {}
+AArch64PState::PStateMapper::PStateMapper()
+  : AArch64NamedImmMapper(PStatePairs, 0) {}
 
-const NamedImmMapper::Mapping A64SysReg::MRSMapper::MRSPairs[] = {
+const AArch64NamedImmMapper::Mapping AArch64SysReg::MRSMapper::MRSPairs[] = {
   {"mdccsr_el0", MDCCSR_EL0},
   {"dbgdtrrx_el0", DBGDTRRX_EL0},
   {"mdrar_el1", MDRAR_EL1},
@@ -176,16 +176,16 @@ const NamedImmMapper::Mapping A64SysReg::MRSMapper::MRSPairs[] = {
   {"id_isar3_el1", ID_ISAR3_EL1},
   {"id_isar4_el1", ID_ISAR4_EL1},
   {"id_isar5_el1", ID_ISAR5_EL1},
-  {"id_aa64pfr0_el1", ID_AA64PFR0_EL1},
-  {"id_aa64pfr1_el1", ID_AA64PFR1_EL1},
-  {"id_aa64dfr0_el1", ID_AA64DFR0_EL1},
-  {"id_aa64dfr1_el1", ID_AA64DFR1_EL1},
-  {"id_aa64afr0_el1", ID_AA64AFR0_EL1},
-  {"id_aa64afr1_el1", ID_AA64AFR1_EL1},
-  {"id_aa64isar0_el1", ID_AA64ISAR0_EL1},
-  {"id_aa64isar1_el1", ID_AA64ISAR1_EL1},
-  {"id_aa64mmfr0_el1", ID_AA64MMFR0_EL1},
-  {"id_aa64mmfr1_el1", ID_AA64MMFR1_EL1},
+  {"id_aa64pfr0_el1", ID_A64PFR0_EL1},
+  {"id_aa64pfr1_el1", ID_A64PFR1_EL1},
+  {"id_aa64dfr0_el1", ID_A64DFR0_EL1},
+  {"id_aa64dfr1_el1", ID_A64DFR1_EL1},
+  {"id_aa64afr0_el1", ID_A64AFR0_EL1},
+  {"id_aa64afr1_el1", ID_A64AFR1_EL1},
+  {"id_aa64isar0_el1", ID_A64ISAR0_EL1},
+  {"id_aa64isar1_el1", ID_A64ISAR1_EL1},
+  {"id_aa64mmfr0_el1", ID_A64MMFR0_EL1},
+  {"id_aa64mmfr1_el1", ID_A64MMFR1_EL1},
   {"mvfr0_el1", MVFR0_EL1},
   {"mvfr1_el1", MVFR1_EL1},
   {"mvfr2_el1", MVFR2_EL1},
@@ -245,12 +245,13 @@ const NamedImmMapper::Mapping A64SysReg::MRSMapper::MRSPairs[] = {
   {"ich_elsr_el2", ICH_ELSR_EL2}
 };
 
-A64SysReg::MRSMapper::MRSMapper() {
+AArch64SysReg::MRSMapper::MRSMapper(uint64_t FeatureBits)
+  : SysRegMapper(FeatureBits) {
     InstPairs = &MRSPairs[0];
     NumInstPairs = llvm::array_lengthof(MRSPairs);
 }
 
-const NamedImmMapper::Mapping A64SysReg::MSRMapper::MSRPairs[] = {
+const AArch64NamedImmMapper::Mapping AArch64SysReg::MSRMapper::MSRPairs[] = {
   {"dbgdtrtx_el0", DBGDTRTX_EL0},
   {"oslar_el1", OSLAR_EL1},
   {"pmswinc_el0", PMSWINC_EL0},
@@ -268,13 +269,14 @@ const NamedImmMapper::Mapping A64SysReg::MSRMapper::MSRPairs[] = {
   {"icc_sgi0r_el1", ICC_SGI0R_EL1}
 };
 
-A64SysReg::MSRMapper::MSRMapper() {
+AArch64SysReg::MSRMapper::MSRMapper(uint64_t FeatureBits)
+  : SysRegMapper(FeatureBits) {
     InstPairs = &MSRPairs[0];
     NumInstPairs = llvm::array_lengthof(MSRPairs);
 }
 
 
-const NamedImmMapper::Mapping A64SysReg::SysRegMapper::SysRegPairs[] = {
+const AArch64NamedImmMapper::Mapping AArch64SysReg::SysRegMapper::SysRegPairs[] = {
   {"osdtrrx_el1", OSDTRRX_EL1},
   {"osdtrtx_el1",  OSDTRTX_EL1},
   {"teecr32_el1", TEECR32_EL1},
@@ -753,10 +755,16 @@ const NamedImmMapper::Mapping A64SysReg::SysRegMapper::SysRegPairs[] = {
   {"ich_lr15_el2", ICH_LR15_EL2}
 };
 
+const AArch64NamedImmMapper::Mapping
+AArch64SysReg::SysRegMapper::CycloneSysRegPairs[] = {
+  {"cpm_ioacc_ctl_el3", CPM_IOACC_CTL_EL3}
+};
+
 uint32_t
-A64SysReg::SysRegMapper::fromString(StringRef Name, bool &Valid) const {
-  // First search the registers shared by all
+AArch64SysReg::SysRegMapper::fromString(StringRef Name, bool &Valid) const {
   std::string NameLower = Name.lower();
+
+  // First search the registers shared by all
   for (unsigned i = 0; i < array_lengthof(SysRegPairs); ++i) {
     if (SysRegPairs[i].Name == NameLower) {
       Valid = true;
@@ -764,6 +772,16 @@ A64SysReg::SysRegMapper::fromString(StringRef Name, bool &Valid) const {
     }
   }
 
+  // Next search for target specific registers
+  if (FeatureBits & AArch64::ProcCyclone) {
+    for (unsigned i = 0; i < array_lengthof(CycloneSysRegPairs); ++i) {
+      if (CycloneSysRegPairs[i].Name == NameLower) {
+        Valid = true;
+        return CycloneSysRegPairs[i].Value;
+      }
+    }
+  }
+
   // Now try the instruction-specific registers (either read-only or
   // write-only).
   for (unsigned i = 0; i < NumInstPairs; ++i) {
@@ -796,7 +814,8 @@ A64SysReg::SysRegMapper::fromString(StringRef Name, bool &Valid) const {
 }
 
 std::string
-A64SysReg::SysRegMapper::toString(uint32_t Bits, bool &Valid) const {
+AArch64SysReg::SysRegMapper::toString(uint32_t Bits, bool &Valid) const {
+  // First search the registers shared by all
   for (unsigned i = 0; i < array_lengthof(SysRegPairs); ++i) {
     if (SysRegPairs[i].Value == Bits) {
       Valid = true;
@@ -804,6 +823,18 @@ A64SysReg::SysRegMapper::toString(uint32_t Bits, bool &Valid) const {
     }
   }
 
+  // Next search for target specific registers
+  if (FeatureBits & AArch64::ProcCyclone) {
+    for (unsigned i = 0; i < array_lengthof(CycloneSysRegPairs); ++i) {
+      if (CycloneSysRegPairs[i].Value == Bits) {
+        Valid = true;
+        return CycloneSysRegPairs[i].Name;
+      }
+    }
+  }
+
+  // Now try the instruction-specific registers (either read-only or
+  // write-only).
   for (unsigned i = 0; i < NumInstPairs; ++i) {
     if (InstPairs[i].Value == Bits) {
       Valid = true;
@@ -831,7 +862,7 @@ A64SysReg::SysRegMapper::toString(uint32_t Bits, bool &Valid) const {
                + "_c" + utostr(CRm) + "_" + utostr(Op2);
 }
 
-const NamedImmMapper::Mapping A64TLBI::TLBIMapper::TLBIPairs[] = {
+const AArch64NamedImmMapper::Mapping AArch64TLBI::TLBIMapper::TLBIPairs[] = {
   {"ipas2e1is", IPAS2E1IS},
   {"ipas2le1is", IPAS2LE1IS},
   {"vmalle1is", VMALLE1IS},
@@ -866,308 +897,5 @@ const NamedImmMapper::Mapping A64TLBI::TLBIMapper::TLBIPairs[] = {
   {"vaale1", VAALE1}
 };
 
-A64TLBI::TLBIMapper::TLBIMapper()
-  : NamedImmMapper(TLBIPairs, 0) {}
-
-bool A64Imms::isFPImm(const APFloat &Val, uint32_t &Imm8Bits) {
-  const fltSemantics &Sem = Val.getSemantics();
-  unsigned FracBits = APFloat::semanticsPrecision(Sem) - 1;
-
-  uint32_t ExpMask;
-  switch (FracBits) {
-  case 10: // IEEE half-precision
-    ExpMask = 0x1f;
-    break;
-  case 23: // IEEE single-precision
-    ExpMask = 0xff;
-    break;
-  case 52: // IEEE double-precision
-    ExpMask = 0x7ff;
-    break;
-  case 112: // IEEE quad-precision
-    // No immediates are valid for double precision.
-    return false;
-  default:
-    llvm_unreachable("Only half, single and double precision supported");
-  }
-
-  uint32_t ExpStart = FracBits;
-  uint64_t FracMask = (1ULL << FracBits) - 1;
-
-  uint32_t Sign = Val.isNegative();
-
-  uint64_t Bits= Val.bitcastToAPInt().getLimitedValue();
-  uint64_t Fraction = Bits & FracMask;
-  int32_t Exponent = ((Bits >> ExpStart) & ExpMask);
-  Exponent -= ExpMask >> 1;
-
-  // S[d] = imm8<7>:NOT(imm8<6>):Replicate(imm8<6>, 5):imm8<5:0>:Zeros(19)
-  // D[d] = imm8<7>:NOT(imm8<6>):Replicate(imm8<6>, 8):imm8<5:0>:Zeros(48)
-  // This translates to: only 4 bits of fraction; -3 <= exp <= 4.
-  uint64_t A64FracStart = FracBits - 4;
-  uint64_t A64FracMask = 0xf;
-
-  // Are there too many fraction bits?
-  if (Fraction & ~(A64FracMask << A64FracStart))
-    return false;
-
-  if (Exponent < -3 || Exponent > 4)
-    return false;
-
-  uint32_t PackedFraction = (Fraction >> A64FracStart) & A64FracMask;
-  uint32_t PackedExp = (Exponent + 7) & 0x7;
-
-  Imm8Bits = (Sign << 7) | (PackedExp << 4) | PackedFraction;
-  return true;
-}
-
-// Encoding of the immediate for logical (immediate) instructions:
-//
-// | N | imms   | immr   | size | R            | S            |
-// |---+--------+--------+------+--------------+--------------|
-// | 1 | ssssss | rrrrrr |   64 | UInt(rrrrrr) | UInt(ssssss) |
-// | 0 | 0sssss | xrrrrr |   32 | UInt(rrrrr)  | UInt(sssss)  |
-// | 0 | 10ssss | xxrrrr |   16 | UInt(rrrr)   | UInt(ssss)   |
-// | 0 | 110sss | xxxrrr |    8 | UInt(rrr)    | UInt(sss)    |
-// | 0 | 1110ss | xxxxrr |    4 | UInt(rr)     | UInt(ss)     |
-// | 0 | 11110s | xxxxxr |    2 | UInt(r)      | UInt(s)      |
-// | 0 | 11111x | -      |      | UNALLOCATED  |              |
-//
-// Columns 'R', 'S' and 'size' specify a "bitmask immediate" of size bits in
-// which the lower S+1 bits are ones and the remaining bits are zero, then
-// rotated right by R bits, which is then replicated across the datapath.
-//
-// + Values of 'N', 'imms' and 'immr' which do not match the above table are
-//   RESERVED.
-// + If all 's' bits in the imms field are set then the instruction is
-//   RESERVED.
-// + The 'x' bits in the 'immr' field are IGNORED.
-
-bool A64Imms::isLogicalImm(unsigned RegWidth, uint64_t Imm, uint32_t &Bits) {
-  int RepeatWidth;
-  int Rotation = 0;
-  int Num1s = 0;
-
-  // Because there are S+1 ones in the replicated mask, an immediate of all
-  // zeros is not allowed. Filtering it here is probably more efficient.
-  if (Imm == 0) return false;
-
-  for (RepeatWidth = RegWidth; RepeatWidth > 1; RepeatWidth /= 2) {
-    uint64_t RepeatMask = RepeatWidth == 64 ? -1 : (1ULL << RepeatWidth) - 1;
-    uint64_t ReplicatedMask = Imm & RepeatMask;
-
-    if (ReplicatedMask == 0) continue;
-
-    // First we have to make sure the mask is actually repeated in each slot for
-    // this width-specifier.
-    bool IsReplicatedMask = true;
-    for (unsigned i = RepeatWidth; i < RegWidth; i += RepeatWidth) {
-      if (((Imm >> i) & RepeatMask) != ReplicatedMask) {
-        IsReplicatedMask = false;
-        break;
-      }
-    }
-    if (!IsReplicatedMask) continue;
-
-    // Now we have to work out the amount of rotation needed. The first part of
-    // this calculation is actually independent of RepeatWidth, but the complex
-    // case will depend on it.
-    Rotation = countTrailingZeros(Imm);
-    if (Rotation == 0) {
-      // There were no leading zeros, which means it's either in place or there
-      // are 1s at each end (e.g. 0x8003 needs rotating).
-      Rotation = RegWidth == 64 ? CountLeadingOnes_64(Imm)
-                                : CountLeadingOnes_32(Imm);
-      Rotation = RepeatWidth - Rotation;
-    }
-
-    uint64_t ReplicatedOnes = ReplicatedMask;
-    if (Rotation != 0 && Rotation != 64)
-      ReplicatedOnes = (ReplicatedMask >> Rotation)
-        | ((ReplicatedMask << (RepeatWidth - Rotation)) & RepeatMask);
-
-    // Of course, they may not actually be ones, so we have to check that:
-    if (!isMask_64(ReplicatedOnes))
-      continue;
-
-    Num1s = CountTrailingOnes_64(ReplicatedOnes);
-
-    // We know we've got an almost valid encoding (certainly, if this is invalid
-    // no other parameters would work).
-    break;
-  }
-
-  // The encodings which would produce all 1s are RESERVED.
-  if (RepeatWidth == 1 || Num1s == RepeatWidth) return false;
-
-  uint32_t N = RepeatWidth == 64;
-  uint32_t ImmR = RepeatWidth - Rotation;
-  uint32_t ImmS = Num1s - 1;
-
-  switch (RepeatWidth) {
-  default: break; // No action required for other valid rotations.
-  case 16: ImmS |= 0x20; break; // 10ssss
-  case 8: ImmS |= 0x30; break;  // 110sss
-  case 4: ImmS |= 0x38; break;  // 1110ss
-  case 2: ImmS |= 0x3c; break;  // 11110s
-  }
-
-  Bits = ImmS | (ImmR << 6) | (N << 12);
-
-  return true;
-}
-
-
-bool A64Imms::isLogicalImmBits(unsigned RegWidth, uint32_t Bits,
-                               uint64_t &Imm) {
-  uint32_t N = Bits >> 12;
-  uint32_t ImmR = (Bits >> 6) & 0x3f;
-  uint32_t ImmS = Bits & 0x3f;
-
-  // N=1 encodes a 64-bit replication and is invalid for the 32-bit
-  // instructions.
-  if (RegWidth == 32 && N != 0) return false;
-
-  int Width = 0;
-  if (N == 1)
-    Width = 64;
-  else if ((ImmS & 0x20) == 0)
-    Width = 32;
-  else if ((ImmS & 0x10) == 0)
-    Width = 16;
-  else if ((ImmS & 0x08) == 0)
-    Width = 8;
-  else if ((ImmS & 0x04) == 0)
-    Width = 4;
-  else if ((ImmS & 0x02) == 0)
-    Width = 2;
-  else {
-    // ImmS  is 0b11111x: UNALLOCATED
-    return false;
-  }
-
-  int Num1s = (ImmS & (Width - 1)) + 1;
-
-  // All encodings which would map to -1 (signed) are RESERVED.
-  if (Num1s == Width) return false;
-
-  int Rotation = (ImmR & (Width - 1));
-  uint64_t Mask = (1ULL << Num1s) - 1;
-  uint64_t WidthMask = Width == 64 ? -1 : (1ULL << Width) - 1;
-  if (Rotation != 0 && Rotation != 64)
-    Mask = (Mask >> Rotation)
-      | ((Mask << (Width - Rotation)) & WidthMask);
-
-  Imm = Mask;
-  for (unsigned i = 1; i < RegWidth / Width; ++i) {
-    Mask <<= Width;
-    Imm |= Mask;
-  }
-
-  return true;
-}
-
-bool A64Imms::isMOVZImm(int RegWidth, uint64_t Value, int &UImm16, int &Shift) {
-  // If high bits are set then a 32-bit MOVZ can't possibly work.
-  if (RegWidth == 32 && (Value & ~0xffffffffULL))
-    return false;
-
-  for (int i = 0; i < RegWidth; i += 16) {
-    // If the value is 0 when we mask out all the bits that could be set with
-    // the current LSL value then it's representable.
-    if ((Value & ~(0xffffULL << i)) == 0) {
-      Shift = i / 16;
-      UImm16 = (Value >> i) & 0xffff;
-      return true;
-    }
-  }
-  return false;
-}
-
-bool A64Imms::isMOVNImm(int RegWidth, uint64_t Value, int &UImm16, int &Shift) {
-  // MOVN is defined to set its register to NOT(LSL(imm16, shift)).
-
-  // We have to be a little careful about a 32-bit register: 0xffff_1234 *is*
-  // representable, but ~0xffff_1234 == 0xffff_ffff_0000_edcb which is not
-  // a valid input for isMOVZImm.
-  if (RegWidth == 32 && (Value & ~0xffffffffULL))
-    return false;
-
-  uint64_t MOVZEquivalent = RegWidth == 32 ? ~Value & 0xffffffff : ~Value;
-
-  return isMOVZImm(RegWidth, MOVZEquivalent, UImm16, Shift);
-}
-
-bool A64Imms::isOnlyMOVNImm(int RegWidth, uint64_t Value,
-                            int &UImm16, int &Shift) {
-  if (isMOVZImm(RegWidth, Value, UImm16, Shift))
-    return false;
-
-  return isMOVNImm(RegWidth, Value, UImm16, Shift);
-}
-
-// decodeNeonModShiftImm - Decode a Neon OpCmode value into the
-// the shift amount and the shift type (shift zeros or ones in) and
-// returns whether the OpCmode value implies a shift operation.
-bool A64Imms::decodeNeonModShiftImm(unsigned OpCmode, unsigned &ShiftImm,
-                                    unsigned &ShiftOnesIn) {
-  ShiftImm = 0;
-  ShiftOnesIn = false;
-  bool HasShift = true;
-
-  if (OpCmode == 0xe) {
-    // movi byte
-    HasShift = false;
-  } else if (OpCmode == 0x1e) {
-    // movi 64-bit bytemask
-    HasShift = false;
-  } else if ((OpCmode & 0xc) == 0x8) {
-    // shift zeros, per halfword
-    ShiftImm = ((OpCmode & 0x2) >> 1);
-  } else if ((OpCmode & 0x8) == 0) {
-    // shift zeros, per word
-    ShiftImm = ((OpCmode & 0x6) >> 1);
-  } else if ((OpCmode & 0xe) == 0xc) {
-    // shift ones, per word
-    ShiftOnesIn = true;
-    ShiftImm = (OpCmode & 0x1);
-  } else {
-    // per byte, per bytemask
-    llvm_unreachable("Unsupported Neon modified immediate");
-  }
-
-  return HasShift;
-}
-
-// decodeNeonModImm - Decode a NEON modified immediate and OpCmode values
-// into the element value and the element size in bits.
-uint64_t A64Imms::decodeNeonModImm(unsigned Val, unsigned OpCmode,
-                                   unsigned &EltBits) {
-  uint64_t DecodedVal = Val;
-  EltBits = 0;
-
-  if (OpCmode == 0xe) {
-    // movi byte
-    EltBits = 8;
-  } else if (OpCmode == 0x1e) {
-    // movi 64-bit bytemask
-    DecodedVal = 0;
-    for (unsigned ByteNum = 0; ByteNum < 8; ++ByteNum) {
-      if ((Val >> ByteNum) & 1)
-        DecodedVal |= (uint64_t)0xff << (8 * ByteNum);
-    }
-    EltBits = 64;
-  } else if ((OpCmode & 0xc) == 0x8) {
-    // shift zeros, per halfword
-    EltBits = 16;
-  } else if ((OpCmode & 0x8) == 0) {
-    // shift zeros, per word
-    EltBits = 32;
-  } else if ((OpCmode & 0xe) == 0xc) {
-    // shift ones, per word
-    EltBits = 32;
-  } else {
-    llvm_unreachable("Unsupported Neon modified immediate");
-  }
-  return DecodedVal;
-}
+AArch64TLBI::TLBIMapper::TLBIMapper()
+  : AArch64NamedImmMapper(TLBIPairs, 0) {}
diff --git a/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/lib/Target/AArch64/Utils/AArch64BaseInfo.h
index 39b042b..9e4c389 100644
--- a/lib/Target/AArch64/Utils/AArch64BaseInfo.h
+++ b/lib/Target/AArch64/Utils/AArch64BaseInfo.h
@@ -1,4 +1,4 @@
-//===-- AArch64BaseInfo.h - Top level definitions for AArch64- --*- C++ -*-===//
+//===-- AArch64BaseInfo.h - Top level definitions for AArch64 ---*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -14,96 +14,271 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_AARCH64_BASEINFO_H
-#define LLVM_AARCH64_BASEINFO_H
+#ifndef AArch64BASEINFO_H
+#define AArch64BASEINFO_H
 
+// FIXME: Is it easiest to fix this layering violation by moving the .inc
+// #includes from AArch64MCTargetDesc.h to here?
+#include "MCTargetDesc/AArch64MCTargetDesc.h" // For AArch64::X0 and friends.
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/Support/ErrorHandling.h"
 
 namespace llvm {
 
-// // Enums corresponding to AArch64 condition codes
-namespace A64CC {
-  // The CondCodes constants map directly to the 4-bit encoding of the
-  // condition field for predicated instructions.
-  enum CondCodes {   // Meaning (integer)          Meaning (floating-point)
-    EQ = 0,        // Equal                      Equal
-    NE,            // Not equal                  Not equal, or unordered
-    HS,            // Unsigned higher or same    >, ==, or unordered
-    LO,            // Unsigned lower or same     Less than
-    MI,            // Minus, negative            Less than
-    PL,            // Plus, positive or zero     >, ==, or unordered
-    VS,            // Overflow                   Unordered
-    VC,            // No overflow                Ordered
-    HI,            // Unsigned higher            Greater than, or unordered
-    LS,            // Unsigned lower or same     Less than or equal
-    GE,            // Greater than or equal      Greater than or equal
-    LT,            // Less than                  Less than, or unordered
-    GT,            // Signed greater than        Greater than
-    LE,            // Signed less than or equal  <, ==, or unordered
-    AL,            // Always (unconditional)     Always (unconditional)
-    NV,             // Always (unconditional)     Always (unconditional)
-    // Note the NV exists purely to disassemble 0b1111. Execution
-    // is "always".
-    Invalid
-  };
+inline static unsigned getWRegFromXReg(unsigned Reg) {
+  switch (Reg) {
+  case AArch64::X0: return AArch64::W0;
+  case AArch64::X1: return AArch64::W1;
+  case AArch64::X2: return AArch64::W2;
+  case AArch64::X3: return AArch64::W3;
+  case AArch64::X4: return AArch64::W4;
+  case AArch64::X5: return AArch64::W5;
+  case AArch64::X6: return AArch64::W6;
+  case AArch64::X7: return AArch64::W7;
+  case AArch64::X8: return AArch64::W8;
+  case AArch64::X9: return AArch64::W9;
+  case AArch64::X10: return AArch64::W10;
+  case AArch64::X11: return AArch64::W11;
+  case AArch64::X12: return AArch64::W12;
+  case AArch64::X13: return AArch64::W13;
+  case AArch64::X14: return AArch64::W14;
+  case AArch64::X15: return AArch64::W15;
+  case AArch64::X16: return AArch64::W16;
+  case AArch64::X17: return AArch64::W17;
+  case AArch64::X18: return AArch64::W18;
+  case AArch64::X19: return AArch64::W19;
+  case AArch64::X20: return AArch64::W20;
+  case AArch64::X21: return AArch64::W21;
+  case AArch64::X22: return AArch64::W22;
+  case AArch64::X23: return AArch64::W23;
+  case AArch64::X24: return AArch64::W24;
+  case AArch64::X25: return AArch64::W25;
+  case AArch64::X26: return AArch64::W26;
+  case AArch64::X27: return AArch64::W27;
+  case AArch64::X28: return AArch64::W28;
+  case AArch64::FP: return AArch64::W29;
+  case AArch64::LR: return AArch64::W30;
+  case AArch64::SP: return AArch64::WSP;
+  case AArch64::XZR: return AArch64::WZR;
+  }
+  // For anything else, return it unchanged.
+  return Reg;
+}
 
-} // namespace A64CC
+inline static unsigned getXRegFromWReg(unsigned Reg) {
+  switch (Reg) {
+  case AArch64::W0: return AArch64::X0;
+  case AArch64::W1: return AArch64::X1;
+  case AArch64::W2: return AArch64::X2;
+  case AArch64::W3: return AArch64::X3;
+  case AArch64::W4: return AArch64::X4;
+  case AArch64::W5: return AArch64::X5;
+  case AArch64::W6: return AArch64::X6;
+  case AArch64::W7: return AArch64::X7;
+  case AArch64::W8: return AArch64::X8;
+  case AArch64::W9: return AArch64::X9;
+  case AArch64::W10: return AArch64::X10;
+  case AArch64::W11: return AArch64::X11;
+  case AArch64::W12: return AArch64::X12;
+  case AArch64::W13: return AArch64::X13;
+  case AArch64::W14: return AArch64::X14;
+  case AArch64::W15: return AArch64::X15;
+  case AArch64::W16: return AArch64::X16;
+  case AArch64::W17: return AArch64::X17;
+  case AArch64::W18: return AArch64::X18;
+  case AArch64::W19: return AArch64::X19;
+  case AArch64::W20: return AArch64::X20;
+  case AArch64::W21: return AArch64::X21;
+  case AArch64::W22: return AArch64::X22;
+  case AArch64::W23: return AArch64::X23;
+  case AArch64::W24: return AArch64::X24;
+  case AArch64::W25: return AArch64::X25;
+  case AArch64::W26: return AArch64::X26;
+  case AArch64::W27: return AArch64::X27;
+  case AArch64::W28: return AArch64::X28;
+  case AArch64::W29: return AArch64::FP;
+  case AArch64::W30: return AArch64::LR;
+  case AArch64::WSP: return AArch64::SP;
+  case AArch64::WZR: return AArch64::XZR;
+  }
+  // For anything else, return it unchanged.
+  return Reg;
+}
 
-inline static const char *A64CondCodeToString(A64CC::CondCodes CC) {
-  switch (CC) {
-  default: llvm_unreachable("Unknown condition code");
-  case A64CC::EQ:  return "eq";
-  case A64CC::NE:  return "ne";
-  case A64CC::HS:  return "hs";
-  case A64CC::LO:  return "lo";
-  case A64CC::MI:  return "mi";
-  case A64CC::PL:  return "pl";
-  case A64CC::VS:  return "vs";
-  case A64CC::VC:  return "vc";
-  case A64CC::HI:  return "hi";
-  case A64CC::LS:  return "ls";
-  case A64CC::GE:  return "ge";
-  case A64CC::LT:  return "lt";
-  case A64CC::GT:  return "gt";
-  case A64CC::LE:  return "le";
-  case A64CC::AL:  return "al";
-  case A64CC::NV:  return "nv";
+static inline unsigned getBRegFromDReg(unsigned Reg) {
+  switch (Reg) {
+  case AArch64::D0:  return AArch64::B0;
+  case AArch64::D1:  return AArch64::B1;
+  case AArch64::D2:  return AArch64::B2;
+  case AArch64::D3:  return AArch64::B3;
+  case AArch64::D4:  return AArch64::B4;
+  case AArch64::D5:  return AArch64::B5;
+  case AArch64::D6:  return AArch64::B6;
+  case AArch64::D7:  return AArch64::B7;
+  case AArch64::D8:  return AArch64::B8;
+  case AArch64::D9:  return AArch64::B9;
+  case AArch64::D10: return AArch64::B10;
+  case AArch64::D11: return AArch64::B11;
+  case AArch64::D12: return AArch64::B12;
+  case AArch64::D13: return AArch64::B13;
+  case AArch64::D14: return AArch64::B14;
+  case AArch64::D15: return AArch64::B15;
+  case AArch64::D16: return AArch64::B16;
+  case AArch64::D17: return AArch64::B17;
+  case AArch64::D18: return AArch64::B18;
+  case AArch64::D19: return AArch64::B19;
+  case AArch64::D20: return AArch64::B20;
+  case AArch64::D21: return AArch64::B21;
+  case AArch64::D22: return AArch64::B22;
+  case AArch64::D23: return AArch64::B23;
+  case AArch64::D24: return AArch64::B24;
+  case AArch64::D25: return AArch64::B25;
+  case AArch64::D26: return AArch64::B26;
+  case AArch64::D27: return AArch64::B27;
+  case AArch64::D28: return AArch64::B28;
+  case AArch64::D29: return AArch64::B29;
+  case AArch64::D30: return AArch64::B30;
+  case AArch64::D31: return AArch64::B31;
   }
+  // For anything else, return it unchanged.
+  return Reg;
 }
 
-inline static A64CC::CondCodes A64StringToCondCode(StringRef CondStr) {
-  return StringSwitch<A64CC::CondCodes>(CondStr.lower())
-             .Case("eq", A64CC::EQ)
-             .Case("ne", A64CC::NE)
-             .Case("ne", A64CC::NE)
-             .Case("hs", A64CC::HS)
-             .Case("cs", A64CC::HS)
-             .Case("lo", A64CC::LO)
-             .Case("cc", A64CC::LO)
-             .Case("mi", A64CC::MI)
-             .Case("pl", A64CC::PL)
-             .Case("vs", A64CC::VS)
-             .Case("vc", A64CC::VC)
-             .Case("hi", A64CC::HI)
-             .Case("ls", A64CC::LS)
-             .Case("ge", A64CC::GE)
-             .Case("lt", A64CC::LT)
-             .Case("gt", A64CC::GT)
-             .Case("le", A64CC::LE)
-             .Case("al", A64CC::AL)
-             .Case("nv", A64CC::NV)
-             .Default(A64CC::Invalid);
+
+static inline unsigned getDRegFromBReg(unsigned Reg) {
+  switch (Reg) {
+  case AArch64::B0:  return AArch64::D0;
+  case AArch64::B1:  return AArch64::D1;
+  case AArch64::B2:  return AArch64::D2;
+  case AArch64::B3:  return AArch64::D3;
+  case AArch64::B4:  return AArch64::D4;
+  case AArch64::B5:  return AArch64::D5;
+  case AArch64::B6:  return AArch64::D6;
+  case AArch64::B7:  return AArch64::D7;
+  case AArch64::B8:  return AArch64::D8;
+  case AArch64::B9:  return AArch64::D9;
+  case AArch64::B10: return AArch64::D10;
+  case AArch64::B11: return AArch64::D11;
+  case AArch64::B12: return AArch64::D12;
+  case AArch64::B13: return AArch64::D13;
+  case AArch64::B14: return AArch64::D14;
+  case AArch64::B15: return AArch64::D15;
+  case AArch64::B16: return AArch64::D16;
+  case AArch64::B17: return AArch64::D17;
+  case AArch64::B18: return AArch64::D18;
+  case AArch64::B19: return AArch64::D19;
+  case AArch64::B20: return AArch64::D20;
+  case AArch64::B21: return AArch64::D21;
+  case AArch64::B22: return AArch64::D22;
+  case AArch64::B23: return AArch64::D23;
+  case AArch64::B24: return AArch64::D24;
+  case AArch64::B25: return AArch64::D25;
+  case AArch64::B26: return AArch64::D26;
+  case AArch64::B27: return AArch64::D27;
+  case AArch64::B28: return AArch64::D28;
+  case AArch64::B29: return AArch64::D29;
+  case AArch64::B30: return AArch64::D30;
+  case AArch64::B31: return AArch64::D31;
+  }
+  // For anything else, return it unchanged.
+  return Reg;
 }
 
-inline static A64CC::CondCodes A64InvertCondCode(A64CC::CondCodes CC) {
-  // It turns out that the condition codes have been designed so that in order
-  // to reverse the intent of the condition you only have to invert the low bit:
+namespace AArch64CC {
+
+// The CondCodes constants map directly to the 4-bit encoding of the condition
+// field for predicated instructions.
+enum CondCode {  // Meaning (integer)          Meaning (floating-point)
+  EQ = 0x0,      // Equal                      Equal
+  NE = 0x1,      // Not equal                  Not equal, or unordered
+  HS = 0x2,      // Unsigned higher or same    >, ==, or unordered
+  LO = 0x3,      // Unsigned lower             Less than
+  MI = 0x4,      // Minus, negative            Less than
+  PL = 0x5,      // Plus, positive or zero     >, ==, or unordered
+  VS = 0x6,      // Overflow                   Unordered
+  VC = 0x7,      // No overflow                Not unordered
+  HI = 0x8,      // Unsigned higher            Greater than, or unordered
+  LS = 0x9,      // Unsigned lower or same     Less than or equal
+  GE = 0xa,      // Greater than or equal      Greater than or equal
+  LT = 0xb,      // Less than                  Less than, or unordered
+  GT = 0xc,      // Greater than               Greater than
+  LE = 0xd,      // Less than or equal         <, ==, or unordered
+  AL = 0xe,      // Always (unconditional)     Always (unconditional)
+  NV = 0xf,      // Always (unconditional)     Always (unconditional)
+  // Note the NV exists purely to disassemble 0b1111. Execution is "always".
+  Invalid
+};
 
-  return static_cast<A64CC::CondCodes>(static_cast<unsigned>(CC) ^ 0x1);
+inline static const char *getCondCodeName(CondCode Code) {
+  switch (Code) {
+  default: llvm_unreachable("Unknown condition code");
+  case EQ:  return "eq";
+  case NE:  return "ne";
+  case HS:  return "hs";
+  case LO:  return "lo";
+  case MI:  return "mi";
+  case PL:  return "pl";
+  case VS:  return "vs";
+  case VC:  return "vc";
+  case HI:  return "hi";
+  case LS:  return "ls";
+  case GE:  return "ge";
+  case LT:  return "lt";
+  case GT:  return "gt";
+  case LE:  return "le";
+  case AL:  return "al";
+  case NV:  return "nv";
+  }
+}
+
+inline static CondCode getInvertedCondCode(CondCode Code) {
+  switch (Code) {
+  default: llvm_unreachable("Unknown condition code");
+  case EQ:  return NE;
+  case NE:  return EQ;
+  case HS:  return LO;
+  case LO:  return HS;
+  case MI:  return PL;
+  case PL:  return MI;
+  case VS:  return VC;
+  case VC:  return VS;
+  case HI:  return LS;
+  case LS:  return HI;
+  case GE:  return LT;
+  case LT:  return GE;
+  case GT:  return LE;
+  case LE:  return GT;
+  }
 }
 
+/// Given a condition code, return NZCV flags that would satisfy that condition.
+/// The flag bits are in the format expected by the ccmp instructions.
+/// Note that many different flag settings can satisfy a given condition code,
+/// this function just returns one of them.
+inline static unsigned getNZCVToSatisfyCondCode(CondCode Code) {
+  // NZCV flags encoded as expected by ccmp instructions, ARMv8 ISA 5.5.7.
+  enum { N = 8, Z = 4, C = 2, V = 1 };
+  switch (Code) {
+  default: llvm_unreachable("Unknown condition code");
+  case EQ: return Z; // Z == 1
+  case NE: return 0; // Z == 0
+  case HS: return C; // C == 1
+  case LO: return 0; // C == 0
+  case MI: return N; // N == 1
+  case PL: return 0; // N == 0
+  case VS: return V; // V == 1
+  case VC: return 0; // V == 0
+  case HI: return C; // C == 1 && Z == 0
+  case LS: return 0; // C == 0 || Z == 1
+  case GE: return 0; // N == V
+  case LT: return N; // N != V
+  case GT: return 0; // Z == 0 && N == V
+  case LE: return Z; // Z == 1 || N != V
+  }
+}
+} // end namespace AArch64CC
+
 /// Instances of this class can perform bidirectional mapping from random
 /// identifier strings to operand encodings. For example "MSR" takes a named
 /// system-register which must be encoded somehow and decoded for printing. This
@@ -115,14 +290,14 @@ inline static A64CC::CondCodes A64InvertCondCode(A64CC::CondCodes CC) {
 /// out just how often these instructions are emitted before working on it. It
 /// might even be optimal to just reorder the tables for the common instructions
 /// rather than changing the algorithm.
-struct NamedImmMapper {
+struct AArch64NamedImmMapper {
   struct Mapping {
     const char *Name;
     uint32_t Value;
   };
 
   template<int N>
-  NamedImmMapper(const Mapping (&Pairs)[N], uint32_t TooBigImm)
+  AArch64NamedImmMapper(const Mapping (&Pairs)[N], uint32_t TooBigImm)
     : Pairs(&Pairs[0]), NumPairs(N), TooBigImm(TooBigImm) {}
 
   StringRef toString(uint32_t Value, bool &Valid) const;
@@ -138,7 +313,7 @@ protected:
   uint32_t TooBigImm;
 };
 
-namespace A64AT {
+namespace AArch64AT {
   enum ATValues {
     Invalid = -1,    // Op0 Op1  CRn   CRm   Op2
     S1E1R = 0x43c0,  // 01  000  0111  1000  000
@@ -155,14 +330,14 @@ namespace A64AT {
     S12E0W = 0x63c7  // 01  100  0111  1000  111
   };
 
-  struct ATMapper : NamedImmMapper {
+  struct ATMapper : AArch64NamedImmMapper {
     const static Mapping ATPairs[];
 
     ATMapper();
   };
 
 }
-namespace A64DB {
+namespace AArch64DB {
   enum DBValues {
     Invalid = -1,
     OSHLD = 0x1,
@@ -179,14 +354,14 @@ namespace A64DB {
     SY =    0xf
   };
 
-  struct DBarrierMapper : NamedImmMapper {
+  struct DBarrierMapper : AArch64NamedImmMapper {
     const static Mapping DBarrierPairs[];
 
     DBarrierMapper();
   };
 }
 
-namespace  A64DC {
+namespace  AArch64DC {
   enum DCValues {
     Invalid = -1,   // Op1  CRn   CRm   Op2
     ZVA   = 0x5ba1, // 01  011  0111  0100  001
@@ -199,7 +374,7 @@ namespace  A64DC {
     CISW  = 0x43f2  // 01  000  0111  1110  010
   };
 
-  struct DCMapper : NamedImmMapper {
+  struct DCMapper : AArch64NamedImmMapper {
     const static Mapping DCPairs[];
 
     DCMapper();
@@ -207,7 +382,7 @@ namespace  A64DC {
 
 }
 
-namespace  A64IC {
+namespace  AArch64IC {
   enum ICValues {
     Invalid = -1,     // Op1  CRn   CRm   Op2
     IALLUIS = 0x0388, // 000  0111  0001  000
@@ -216,7 +391,7 @@ namespace  A64IC {
   };
 
 
-  struct ICMapper : NamedImmMapper {
+  struct ICMapper : AArch64NamedImmMapper {
     const static Mapping ICPairs[];
 
     ICMapper();
@@ -227,19 +402,19 @@ namespace  A64IC {
   }
 }
 
-namespace  A64ISB {
+namespace  AArch64ISB {
   enum ISBValues {
     Invalid = -1,
     SY = 0xf
   };
-  struct ISBMapper : NamedImmMapper {
+  struct ISBMapper : AArch64NamedImmMapper {
     const static Mapping ISBPairs[];
 
     ISBMapper();
   };
 }
 
-namespace A64PRFM {
+namespace AArch64PRFM {
   enum PRFMValues {
     Invalid = -1,
     PLDL1KEEP = 0x00,
@@ -262,14 +437,14 @@ namespace A64PRFM {
     PSTL3STRM = 0x15
   };
 
-  struct PRFMMapper : NamedImmMapper {
+  struct PRFMMapper : AArch64NamedImmMapper {
     const static Mapping PRFMPairs[];
 
     PRFMMapper();
   };
 }
 
-namespace A64PState {
+namespace AArch64PState {
   enum PStateValues {
     Invalid = -1,
     SPSel = 0x05,
@@ -277,7 +452,7 @@ namespace A64PState {
     DAIFClr = 0x1f
   };
 
-  struct PStateMapper : NamedImmMapper {
+  struct PStateMapper : AArch64NamedImmMapper {
     const static Mapping PStatePairs[];
 
     PStateMapper();
@@ -285,7 +460,7 @@ namespace A64PState {
 
 }
 
-namespace A64SE {
+namespace AArch64SE {
     enum ShiftExtSpecifiers {
         Invalid = -1,
         LSL,
@@ -306,7 +481,7 @@ namespace A64SE {
     };
 }
 
-namespace A64Layout {
+namespace AArch64Layout {
     enum VectorLayout {
         Invalid = -1,
         VL_8B,
@@ -329,43 +504,43 @@ namespace A64Layout {
 }
 
 inline static const char *
-A64VectorLayoutToString(A64Layout::VectorLayout Layout) {
+AArch64VectorLayoutToString(AArch64Layout::VectorLayout Layout) {
   switch (Layout) {
-  case A64Layout::VL_8B:  return ".8b";
-  case A64Layout::VL_4H:  return ".4h";
-  case A64Layout::VL_2S:  return ".2s";
-  case A64Layout::VL_1D:  return ".1d";
-  case A64Layout::VL_16B:  return ".16b";
-  case A64Layout::VL_8H:  return ".8h";
-  case A64Layout::VL_4S:  return ".4s";
-  case A64Layout::VL_2D:  return ".2d";
-  case A64Layout::VL_B:  return ".b";
-  case A64Layout::VL_H:  return ".h";
-  case A64Layout::VL_S:  return ".s";
-  case A64Layout::VL_D:  return ".d";
+  case AArch64Layout::VL_8B:  return ".8b";
+  case AArch64Layout::VL_4H:  return ".4h";
+  case AArch64Layout::VL_2S:  return ".2s";
+  case AArch64Layout::VL_1D:  return ".1d";
+  case AArch64Layout::VL_16B:  return ".16b";
+  case AArch64Layout::VL_8H:  return ".8h";
+  case AArch64Layout::VL_4S:  return ".4s";
+  case AArch64Layout::VL_2D:  return ".2d";
+  case AArch64Layout::VL_B:  return ".b";
+  case AArch64Layout::VL_H:  return ".h";
+  case AArch64Layout::VL_S:  return ".s";
+  case AArch64Layout::VL_D:  return ".d";
   default: llvm_unreachable("Unknown Vector Layout");
   }
 }
 
-inline static A64Layout::VectorLayout
-A64StringToVectorLayout(StringRef LayoutStr) {
-  return StringSwitch<A64Layout::VectorLayout>(LayoutStr)
-             .Case(".8b", A64Layout::VL_8B)
-             .Case(".4h", A64Layout::VL_4H)
-             .Case(".2s", A64Layout::VL_2S)
-             .Case(".1d", A64Layout::VL_1D)
-             .Case(".16b", A64Layout::VL_16B)
-             .Case(".8h", A64Layout::VL_8H)
-             .Case(".4s", A64Layout::VL_4S)
-             .Case(".2d", A64Layout::VL_2D)
-             .Case(".b", A64Layout::VL_B)
-             .Case(".h", A64Layout::VL_H)
-             .Case(".s", A64Layout::VL_S)
-             .Case(".d", A64Layout::VL_D)
-             .Default(A64Layout::Invalid);
+inline static AArch64Layout::VectorLayout
+AArch64StringToVectorLayout(StringRef LayoutStr) {
+  return StringSwitch<AArch64Layout::VectorLayout>(LayoutStr)
+             .Case(".8b", AArch64Layout::VL_8B)
+             .Case(".4h", AArch64Layout::VL_4H)
+             .Case(".2s", AArch64Layout::VL_2S)
+             .Case(".1d", AArch64Layout::VL_1D)
+             .Case(".16b", AArch64Layout::VL_16B)
+             .Case(".8h", AArch64Layout::VL_8H)
+             .Case(".4s", AArch64Layout::VL_4S)
+             .Case(".2d", AArch64Layout::VL_2D)
+             .Case(".b", AArch64Layout::VL_B)
+             .Case(".h", AArch64Layout::VL_H)
+             .Case(".s", AArch64Layout::VL_S)
+             .Case(".d", AArch64Layout::VL_D)
+             .Default(AArch64Layout::Invalid);
 }
 
-namespace A64SysReg {
+namespace AArch64SysReg {
   enum SysRegROValues {
     MDCCSR_EL0        = 0x9808, // 10  011  0000  0001  000
     DBGDTRRX_EL0      = 0x9828, // 10  011  0000  0101  000
@@ -396,16 +571,16 @@ namespace A64SysReg {
     ID_ISAR3_EL1      = 0xc013, // 11  000  0000  0010  011
     ID_ISAR4_EL1      = 0xc014, // 11  000  0000  0010  100
     ID_ISAR5_EL1      = 0xc015, // 11  000  0000  0010  101
-    ID_AA64PFR0_EL1   = 0xc020, // 11  000  0000  0100  000
-    ID_AA64PFR1_EL1   = 0xc021, // 11  000  0000  0100  001
-    ID_AA64DFR0_EL1   = 0xc028, // 11  000  0000  0101  000
-    ID_AA64DFR1_EL1   = 0xc029, // 11  000  0000  0101  001
-    ID_AA64AFR0_EL1   = 0xc02c, // 11  000  0000  0101  100
-    ID_AA64AFR1_EL1   = 0xc02d, // 11  000  0000  0101  101
-    ID_AA64ISAR0_EL1  = 0xc030, // 11  000  0000  0110  000
-    ID_AA64ISAR1_EL1  = 0xc031, // 11  000  0000  0110  001
-    ID_AA64MMFR0_EL1  = 0xc038, // 11  000  0000  0111  000
-    ID_AA64MMFR1_EL1  = 0xc039, // 11  000  0000  0111  001
+    ID_A64PFR0_EL1    = 0xc020, // 11  000  0000  0100  000
+    ID_A64PFR1_EL1    = 0xc021, // 11  000  0000  0100  001
+    ID_A64DFR0_EL1    = 0xc028, // 11  000  0000  0101  000
+    ID_A64DFR1_EL1    = 0xc029, // 11  000  0000  0101  001
+    ID_A64AFR0_EL1    = 0xc02c, // 11  000  0000  0101  100
+    ID_A64AFR1_EL1    = 0xc02d, // 11  000  0000  0101  101
+    ID_A64ISAR0_EL1   = 0xc030, // 11  000  0000  0110  000
+    ID_A64ISAR1_EL1   = 0xc031, // 11  000  0000  0110  001
+    ID_A64MMFR0_EL1   = 0xc038, // 11  000  0000  0111  000
+    ID_A64MMFR1_EL1   = 0xc039, // 11  000  0000  0111  001
     MVFR0_EL1         = 0xc018, // 11  000  0000  0011  000
     MVFR1_EL1         = 0xc019, // 11  000  0000  0011  001
     MVFR2_EL1         = 0xc01a, // 11  000  0000  0011  010
@@ -960,38 +1135,45 @@ namespace A64SysReg {
     ICH_LR12_EL2      = 0xe66c, // 11  100  1100  1101  100
     ICH_LR13_EL2      = 0xe66d, // 11  100  1100  1101  101
     ICH_LR14_EL2      = 0xe66e, // 11  100  1100  1101  110
-    ICH_LR15_EL2      = 0xe66f  // 11  100  1100  1101  111
+    ICH_LR15_EL2      = 0xe66f, // 11  100  1100  1101  111
+  };
+
+  // Cyclone specific system registers
+  enum CycloneSysRegValues {
+    CPM_IOACC_CTL_EL3 = 0xff90
   };
 
-  // Note that these do not inherit from NamedImmMapper. This class is
+  // Note that these do not inherit from AArch64NamedImmMapper. This class is
   // sufficiently different in its behaviour that I don't believe it's worth
-  // burdening the common NamedImmMapper with abstractions only needed in
+  // burdening the common AArch64NamedImmMapper with abstractions only needed in
   // this one case.
   struct SysRegMapper {
-    static const NamedImmMapper::Mapping SysRegPairs[];
+    static const AArch64NamedImmMapper::Mapping SysRegPairs[];
+    static const AArch64NamedImmMapper::Mapping CycloneSysRegPairs[];
 
-    const NamedImmMapper::Mapping *InstPairs;
+    const AArch64NamedImmMapper::Mapping *InstPairs;
     size_t NumInstPairs;
+    uint64_t FeatureBits;
 
-    SysRegMapper() {}
+    SysRegMapper(uint64_t FeatureBits) : FeatureBits(FeatureBits) { }
     uint32_t fromString(StringRef Name, bool &Valid) const;
     std::string toString(uint32_t Bits, bool &Valid) const;
   };
 
   struct MSRMapper : SysRegMapper {
-    static const NamedImmMapper::Mapping MSRPairs[];
-    MSRMapper();
+    static const AArch64NamedImmMapper::Mapping MSRPairs[];
+    MSRMapper(uint64_t FeatureBits);
   };
 
   struct MRSMapper : SysRegMapper {
-    static const NamedImmMapper::Mapping MRSPairs[];
-    MRSMapper();
+    static const AArch64NamedImmMapper::Mapping MRSPairs[];
+    MRSMapper(uint64_t FeatureBits);
   };
 
   uint32_t ParseGenericRegister(StringRef Name, bool &Valid);
 }
 
-namespace A64TLBI {
+namespace AArch64TLBI {
   enum TLBIValues {
     Invalid = -1,          // Op0 Op1  CRn   CRm   Op2
     IPAS2E1IS    = 0x6401, // 01  100  1000  0000  001
@@ -1028,7 +1210,7 @@ namespace A64TLBI {
     VAALE1       = 0x443f  // 01  000  1000  0111  111
   };
 
-  struct TLBIMapper : NamedImmMapper {
+  struct TLBIMapper : AArch64NamedImmMapper {
     const static Mapping TLBIPairs[];
 
     TLBIMapper();
@@ -1051,88 +1233,62 @@ namespace A64TLBI {
       return true;
     }
   }
-}
+} 
 
 namespace AArch64II {
-
+  /// Target Operand Flag enum.
   enum TOF {
-    //===--------------------------------------------------------------===//
+    //===------------------------------------------------------------------===//
     // AArch64 Specific MachineOperand flags.
 
     MO_NO_FLAG,
 
-    // MO_GOT - Represents a relocation referring to the GOT entry of a given
-    // symbol. Used in adrp.
-    MO_GOT,
-
-    // MO_GOT_LO12 - Represents a relocation referring to the low 12 bits of the
-    // GOT entry of a given symbol. Used in ldr only.
-    MO_GOT_LO12,
-
-    // MO_DTPREL_* - Represents a relocation referring to the offset from a
-    // module's dynamic thread pointer. Used in the local-dynamic TLS access
-    // model.
-    MO_DTPREL_G1,
-    MO_DTPREL_G0_NC,
-
-    // MO_GOTTPREL_* - Represents a relocation referring to a GOT entry
-    // providing the offset of a variable from the thread-pointer. Used in
-    // initial-exec TLS model where this offset is assigned in the static thread
-    // block and thus known by the dynamic linker.
-    MO_GOTTPREL,
-    MO_GOTTPREL_LO12,
-
-    // MO_TLSDESC_* - Represents a relocation referring to a GOT entry providing
-    // a TLS descriptor chosen by the dynamic linker. Used for the
-    // general-dynamic and local-dynamic TLS access models where very littls is
-    // known at link-time.
-    MO_TLSDESC,
-    MO_TLSDESC_LO12,
-
-    // MO_TPREL_* - Represents a relocation referring to the offset of a
-    // variable from the thread pointer itself. Used in the local-exec TLS
-    // access model.
-    MO_TPREL_G1,
-    MO_TPREL_G0_NC,
-
-    // MO_LO12 - On a symbol operand, this represents a relocation containing
-    // lower 12 bits of the address. Used in add/sub/ldr/str.
-    MO_LO12,
-
-    // MO_ABS_G* - Represent the 16-bit granules of an absolute reference using
-    // movz/movk instructions.
-    MO_ABS_G3,
-    MO_ABS_G2_NC,
-    MO_ABS_G1_NC,
-    MO_ABS_G0_NC
+    MO_FRAGMENT = 0x7,
+
+    /// MO_PAGE - A symbol operand with this flag represents the pc-relative
+    /// offset of the 4K page containing the symbol.  This is used with the
+    /// ADRP instruction.
+    MO_PAGE = 1,
+
+    /// MO_PAGEOFF - A symbol operand with this flag represents the offset of
+    /// that symbol within a 4K page.  This offset is added to the page address
+    /// to produce the complete address.
+    MO_PAGEOFF = 2,
+
+    /// MO_G3 - A symbol operand with this flag (granule 3) represents the high
+    /// 16-bits of a 64-bit address, used in a MOVZ or MOVK instruction
+    MO_G3 = 3,
+
+    /// MO_G2 - A symbol operand with this flag (granule 2) represents the bits
+    /// 32-47 of a 64-bit address, used in a MOVZ or MOVK instruction
+    MO_G2 = 4,
+
+    /// MO_G1 - A symbol operand with this flag (granule 1) represents the bits
+    /// 16-31 of a 64-bit address, used in a MOVZ or MOVK instruction
+    MO_G1 = 5,
+
+    /// MO_G0 - A symbol operand with this flag (granule 0) represents the bits
+    /// 0-15 of a 64-bit address, used in a MOVZ or MOVK instruction
+    MO_G0 = 6,
+
+    /// MO_GOT - This flag indicates that a symbol operand represents the
+    /// address of the GOT entry for the symbol, rather than the address of
+    /// the symbol itself.
+    MO_GOT = 8,
+
+    /// MO_NC - Indicates whether the linker is expected to check the symbol
+    /// reference for overflow. For example in an ADRP/ADD pair of relocations
+    /// the ADRP usually does check, but not the ADD.
+    MO_NC = 0x10,
+
+    /// MO_TLS - Indicates that the operand being accessed is some kind of
+    /// thread-local symbol. On Darwin, only one type of thread-local access
+    /// exists (pre linker-relaxation), but on ELF the TLSModel used for the
+    /// referee will affect interpretation.
+    MO_TLS = 0x20
   };
-}
-
-class APFloat;
-
-namespace A64Imms {
-  bool isFPImm(const APFloat &Val, uint32_t &Imm8Bits);
-
-  inline bool isFPImm(const APFloat &Val) {
-    uint32_t Imm8;
-    return isFPImm(Val, Imm8);
-  }
-
-  bool isLogicalImm(unsigned RegWidth, uint64_t Imm, uint32_t &Bits);
-  bool isLogicalImmBits(unsigned RegWidth, uint32_t Bits, uint64_t &Imm);
-
-  bool isMOVZImm(int RegWidth, uint64_t Value, int &UImm16, int &Shift);
-  bool isMOVNImm(int RegWidth, uint64_t Value, int &UImm16, int &Shift);
-
-  // We sometimes want to know whether the immediate is representable with a
-  // MOVN but *not* with a MOVZ (because that would take priority).
-  bool isOnlyMOVNImm(int RegWidth, uint64_t Value, int &UImm16, int &Shift);
-
-  uint64_t decodeNeonModImm(unsigned Val, unsigned OpCmode, unsigned &EltBits);
-  bool decodeNeonModShiftImm(unsigned OpCmode, unsigned &ShiftImm,
-                             unsigned &ShiftOnesIn);
-  }
+} // end namespace AArch64II
 
-} // end namespace llvm;
+} // end namespace llvm
 
 #endif
diff --git a/lib/Target/AArch64/Utils/Android.mk b/lib/Target/AArch64/Utils/Android.mk
index b8bf795..3c1d194 100644
--- a/lib/Target/AArch64/Utils/Android.mk
+++ b/lib/Target/AArch64/Utils/Android.mk
@@ -1,5 +1,10 @@
 LOCAL_PATH := $(call my-dir)
 
+arm64_utils_TBLGEN_TABLES := \
+  AArch64GenRegisterInfo.inc \
+  AArch64GenInstrInfo.inc \
+  AArch64GenSubtargetInfo.inc
+
 arm64_utils_SRC_FILES := \
   AArch64BaseInfo.cpp
 
@@ -16,7 +21,12 @@ LOCAL_MODULE:= libLLVMARM64Utils
 
 LOCAL_MODULE_TAGS := optional
 
+TBLGEN_TD_DIR := $(LOCAL_PATH)/..
+TBLGEN_TABLES := $(arm64_utils_TBLGEN_TABLES)
+
 include $(LLVM_DEVICE_BUILD_MK)
+include $(LLVM_TBLGEN_RULES_MK)
+include $(LLVM_GEN_INTRINSICS_MK)
 include $(BUILD_STATIC_LIBRARY)
 endif
 
@@ -32,5 +42,10 @@ LOCAL_MODULE:= libLLVMARM64Utils
 
 LOCAL_MODULE_TAGS := optional
 
+TBLGEN_TD_DIR := $(LOCAL_PATH)/..
+TBLGEN_TABLES := $(arm64_utils_TBLGEN_TABLES)
+
 include $(LLVM_HOST_BUILD_MK)
+include $(LLVM_TBLGEN_RULES_MK)
+include $(LLVM_GEN_INTRINSICS_MK)
 include $(BUILD_HOST_STATIC_LIBRARY)
diff --git a/lib/Target/AArch64/Utils/LLVMBuild.txt b/lib/Target/AArch64/Utils/LLVMBuild.txt
index 4acecc9..bcefeb6 100644
--- a/lib/Target/AArch64/Utils/LLVMBuild.txt
+++ b/lib/Target/AArch64/Utils/LLVMBuild.txt
@@ -1,4 +1,4 @@
-;===- ./lib/Target/AArch646/Utils/LLVMBuild.txt ----------------*- Conf -*--===;
+;===- ./lib/Target/AArch64/Utils/LLVMBuild.txt ----------------*- Conf -*--===;
 ;
 ;                     The LLVM Compiler Infrastructure
 ;
diff --git a/lib/Target/AArch64/Utils/Makefile b/lib/Target/AArch64/Utils/Makefile
index 0f4a645..0b80f82 100644
--- a/lib/Target/AArch64/Utils/Makefile
+++ b/lib/Target/AArch64/Utils/Makefile
@@ -9,7 +9,8 @@
 LEVEL = ../../../..
 LIBRARYNAME = LLVMAArch64Utils
 
-# Hack: we need to include 'main' AArch64 target directory to grab private headers
-#CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
+# Hack: we need to include 'main' AArch64 target directory to grab private
+# headers
+CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
 
 include $(LEVEL)/Makefile.common
diff --git a/lib/Target/ARM/A15SDOptimizer.cpp b/lib/Target/ARM/A15SDOptimizer.cpp
index 28ea879..94faf6f 100644
--- a/lib/Target/ARM/A15SDOptimizer.cpp
+++ b/lib/Target/ARM/A15SDOptimizer.cpp
@@ -24,7 +24,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "a15-sd-optimizer"
 #include "ARM.h"
 #include "ARMBaseInstrInfo.h"
 #include "ARMBaseRegisterInfo.h"
@@ -39,6 +38,8 @@
 
 using namespace llvm;
 
+#define DEBUG_TYPE "a15-sd-optimizer"
+
 namespace {
   struct A15SDOptimizer : public MachineFunctionPass {
     static char ID;
@@ -90,7 +91,7 @@ namespace {
     unsigned createImplicitDef(MachineBasicBlock &MBB,
                                MachineBasicBlock::iterator InsertBefore,
                                DebugLoc DL);
-    
+
     //
     // Various property checkers
     //
@@ -259,7 +260,7 @@ unsigned A15SDOptimizer::optimizeSDPattern(MachineInstr *MI) {
       if (DPRMI && SPRMI) {
         // See if the first operand of this insert_subreg is IMPLICIT_DEF
         MachineInstr *ECDef = elideCopies(DPRMI);
-        if (ECDef != 0 && ECDef->isImplicitDef()) {
+        if (ECDef && ECDef->isImplicitDef()) {
           // Another corner case - if we're inserting something that is purely
           // a subreg copy of a DPR, just use that DPR.
 
@@ -348,10 +349,10 @@ MachineInstr *A15SDOptimizer::elideCopies(MachineInstr *MI) {
   if (!MI->isFullCopy())
     return MI;
   if (!TRI->isVirtualRegister(MI->getOperand(1).getReg()))
-    return NULL;
+    return nullptr;
   MachineInstr *Def = MRI->getVRegDef(MI->getOperand(1).getReg());
   if (!Def)
-    return NULL;
+    return nullptr;
   return elideCopies(Def);
 }
 
@@ -435,7 +436,7 @@ A15SDOptimizer::createDupLane(MachineBasicBlock &MBB,
                          Out)
                    .addReg(Reg)
                    .addImm(Lane));
- 
+
   return Out;
 }
 
@@ -601,7 +602,7 @@ bool A15SDOptimizer::runOnInstruction(MachineInstr *MI) {
   //   * INSERT_SUBREG: * If the SPR value was originally in another DPR/QPR
   //                      lane, and the other lane(s) of the DPR/QPR register
   //                      that we are inserting in are undefined, use the
-  //                      original DPR/QPR value. 
+  //                      original DPR/QPR value.
   //                    * Otherwise, fall back on the same stategy as COPY.
   //
   //   * REG_SEQUENCE:  * If all except one of the input operands are
@@ -693,7 +694,7 @@ bool A15SDOptimizer::runOnMachineFunction(MachineFunction &Fn) {
       MI != ME;) {
       Modified |= runOnInstruction(MI++);
     }
- 
+
   }
 
   for (std::set<MachineInstr *>::iterator I = DeadInstr.begin(),
diff --git a/lib/Target/ARM/ARM.h b/lib/Target/ARM/ARM.h
index 4412b45..55df29c 100644
--- a/lib/Target/ARM/ARM.h
+++ b/lib/Target/ARM/ARM.h
@@ -49,8 +49,6 @@ FunctionPass *createThumb2SizeReductionPass();
 /// \brief Creates an ARM-specific Target Transformation Info pass.
 ImmutablePass *createARMTargetTransformInfoPass(const ARMBaseTargetMachine *TM);
 
-FunctionPass *createARMAtomicExpandPass(const TargetMachine *TM);
-
 void LowerARMMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
                                   ARMAsmPrinter &AP);
 
diff --git a/lib/Target/ARM/ARMAsmPrinter.cpp b/lib/Target/ARM/ARMAsmPrinter.cpp
index 0fa865f..55e9fe5 100644
--- a/lib/Target/ARM/ARMAsmPrinter.cpp
+++ b/lib/Target/ARM/ARMAsmPrinter.cpp
@@ -12,7 +12,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "asm-printer"
 #include "ARMAsmPrinter.h"
 #include "ARM.h"
 #include "ARMConstantPoolValue.h"
@@ -45,6 +44,7 @@
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/Support/ARMBuildAttributes.h"
+#include "llvm/Support/COFF.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ELF.h"
@@ -55,6 +55,8 @@
 #include <cctype>
 using namespace llvm;
 
+#define DEBUG_TYPE "asm-printer"
+
 void ARMAsmPrinter::EmitFunctionBodyEnd() {
   // Make sure to terminate any constant pools that were at the end
   // of the function.
@@ -85,7 +87,7 @@ void ARMAsmPrinter::EmitXXStructor(const Constant *CV) {
                                              ? MCSymbolRefExpr::VK_ARM_TARGET1
                                              : MCSymbolRefExpr::VK_None),
                                             OutContext);
-  
+
   OutStreamer.EmitValue(E, Size);
 }
 
@@ -96,7 +98,28 @@ bool ARMAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
   AFI = MF.getInfo<ARMFunctionInfo>();
   MCP = MF.getConstantPool();
 
-  return AsmPrinter::runOnMachineFunction(MF);
+  SetupMachineFunction(MF);
+
+  if (Subtarget->isTargetCOFF()) {
+    bool Internal = MF.getFunction()->hasInternalLinkage();
+    COFF::SymbolStorageClass Scl = Internal ? COFF::IMAGE_SYM_CLASS_STATIC
+                                            : COFF::IMAGE_SYM_CLASS_EXTERNAL;
+    int Type = COFF::IMAGE_SYM_DTYPE_FUNCTION << COFF::SCT_COMPLEX_TYPE_SHIFT;
+
+    OutStreamer.BeginCOFFSymbolDef(CurrentFnSym);
+    OutStreamer.EmitCOFFSymbolStorageClass(Scl);
+    OutStreamer.EmitCOFFSymbolType(Type);
+    OutStreamer.EndCOFFSymbolDef();
+  }
+
+  // Have common code print out the function header with linkage info etc.
+  EmitFunctionHeader();
+
+  // Emit the rest of the function body.
+  EmitFunctionBody();
+
+  // We didn't modify anything.
+  return false;
 }
 
 void ARMAsmPrinter::printOperand(const MachineInstr *MI, int OpNum,
@@ -239,7 +262,7 @@ bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
       if (ARM::GPRPairRegClass.contains(RegBegin)) {
         const TargetRegisterInfo *TRI = MF->getTarget().getRegisterInfo();
         unsigned Reg0 = TRI->getSubReg(RegBegin, ARM::gsub_0);
-        O << ARMInstPrinter::getRegisterName(Reg0) << ", ";;
+        O << ARMInstPrinter::getRegisterName(Reg0) << ", ";
         RegBegin = TRI->getSubReg(RegBegin, ARM::gsub_1);
       }
       O << ARMInstPrinter::getRegisterName(RegBegin);
@@ -383,7 +406,7 @@ void ARMAsmPrinter::emitInlineAsmEnd(const MCSubtargetInfo &StartInfo,
   // If either end mode is unknown (EndInfo == NULL) or different than
   // the start mode, then restore the start mode.
   const bool WasThumb = isThumb(StartInfo);
-  if (EndInfo == NULL || WasThumb != isThumb(*EndInfo)) {
+  if (!EndInfo || WasThumb != isThumb(*EndInfo)) {
     OutStreamer.EmitAssemblerFlag(WasThumb ? MCAF_Code16 : MCAF_Code32);
   }
 }
@@ -456,6 +479,29 @@ void ARMAsmPrinter::EmitStartOfAsmFile(Module &M) {
     emitAttributes();
 }
 
+static void
+emitNonLazySymbolPointer(MCStreamer &OutStreamer, MCSymbol *StubLabel,
+                         MachineModuleInfoImpl::StubValueTy &MCSym) {
+  // L_foo$stub:
+  OutStreamer.EmitLabel(StubLabel);
+  //   .indirect_symbol _foo
+  OutStreamer.EmitSymbolAttribute(MCSym.getPointer(), MCSA_IndirectSymbol);
+
+  if (MCSym.getInt())
+    // External to current translation unit.
+    OutStreamer.EmitIntValue(0, 4/*size*/);
+  else
+    // Internal to current translation unit.
+    //
+    // When we place the LSDA into the TEXT section, the type info
+    // pointers need to be indirect and pc-rel. We accomplish this by
+    // using NLPs; however, sometimes the types are local to the file.
+    // We need to fill in the value for the NLP in those cases.
+    OutStreamer.EmitValue(
+        MCSymbolRefExpr::Create(MCSym.getPointer(), OutStreamer.getContext()),
+        4 /*size*/);
+}
+
 
 void ARMAsmPrinter::EmitEndOfAsmFile(Module &M) {
   if (Subtarget->isTargetMachO()) {
@@ -472,27 +518,9 @@ void ARMAsmPrinter::EmitEndOfAsmFile(Module &M) {
       // Switch with ".non_lazy_symbol_pointer" directive.
       OutStreamer.SwitchSection(TLOFMacho.getNonLazySymbolPointerSection());
       EmitAlignment(2);
-      for (unsigned i = 0, e = Stubs.size(); i != e; ++i) {
-        // L_foo$stub:
-        OutStreamer.EmitLabel(Stubs[i].first);
-        //   .indirect_symbol _foo
-        MachineModuleInfoImpl::StubValueTy &MCSym = Stubs[i].second;
-        OutStreamer.EmitSymbolAttribute(MCSym.getPointer(),MCSA_IndirectSymbol);
-
-        if (MCSym.getInt())
-          // External to current translation unit.
-          OutStreamer.EmitIntValue(0, 4/*size*/);
-        else
-          // Internal to current translation unit.
-          //
-          // When we place the LSDA into the TEXT section, the type info
-          // pointers need to be indirect and pc-rel. We accomplish this by
-          // using NLPs; however, sometimes the types are local to the file.
-          // We need to fill in the value for the NLP in those cases.
-          OutStreamer.EmitValue(MCSymbolRefExpr::Create(MCSym.getPointer(),
-                                                        OutContext),
-                                4/*size*/);
-      }
+
+      for (auto &Stub : Stubs)
+        emitNonLazySymbolPointer(OutStreamer, Stub.first, Stub.second);
 
       Stubs.clear();
       OutStreamer.AddBlankLine();
@@ -500,17 +528,11 @@ void ARMAsmPrinter::EmitEndOfAsmFile(Module &M) {
 
     Stubs = MMIMacho.GetHiddenGVStubList();
     if (!Stubs.empty()) {
-      OutStreamer.SwitchSection(getObjFileLowering().getDataSection());
+      OutStreamer.SwitchSection(TLOFMacho.getNonLazySymbolPointerSection());
       EmitAlignment(2);
-      for (unsigned i = 0, e = Stubs.size(); i != e; ++i) {
-        // L_foo$stub:
-        OutStreamer.EmitLabel(Stubs[i].first);
-        //   .long _foo
-        OutStreamer.EmitValue(MCSymbolRefExpr::
-                              Create(Stubs[i].second.getPointer(),
-                                     OutContext),
-                              4/*size*/);
-      }
+
+      for (auto &Stub : Stubs)
+        emitNonLazySymbolPointer(OutStreamer, Stub.first, Stub.second);
 
       Stubs.clear();
       OutStreamer.AddBlankLine();
@@ -523,6 +545,28 @@ void ARMAsmPrinter::EmitEndOfAsmFile(Module &M) {
     // generates code that does this, it is always safe to set.
     OutStreamer.EmitAssemblerFlag(MCAF_SubsectionsViaSymbols);
   }
+
+  // Emit a .data.rel section containing any stubs that were created.
+  if (Subtarget->isTargetELF()) {
+    const TargetLoweringObjectFileELF &TLOFELF =
+      static_cast<const TargetLoweringObjectFileELF &>(getObjFileLowering());
+
+    MachineModuleInfoELF &MMIELF = MMI->getObjFileInfo<MachineModuleInfoELF>();
+
+    // Output stubs for external and common global variables.
+    MachineModuleInfoELF::SymbolListTy Stubs = MMIELF.GetGVStubList();
+    if (!Stubs.empty()) {
+      OutStreamer.SwitchSection(TLOFELF.getDataRelSection());
+      const DataLayout *TD = TM.getDataLayout();
+
+      for (auto &stub: Stubs) {
+        OutStreamer.EmitLabel(stub.first);
+        OutStreamer.EmitSymbolValue(stub.second.getPointer(),
+                                    TD->getPointerSize(0));
+      }
+      Stubs.clear();
+    }
+  }
 }
 
 //===----------------------------------------------------------------------===//
@@ -575,7 +619,7 @@ void ARMAsmPrinter::emitAttributes() {
                     getArchForCPU(CPUString, Subtarget));
 
   // Tag_CPU_arch_profile must have the default value of 0 when "Architecture
-  // profile is not applicable (e.g. pre v7, or cross-profile code)". 
+  // profile is not applicable (e.g. pre v7, or cross-profile code)".
   if (Subtarget->hasV7Ops()) {
     if (Subtarget->isAClass()) {
       ATS.emitAttribute(ARMBuildAttrs::CPU_arch_profile,
@@ -627,6 +671,20 @@ void ARMAsmPrinter::emitAttributes() {
       ATS.emitFPU(ARM::VFPV2);
   }
 
+  if (TM.getRelocationModel() == Reloc::PIC_) {
+    // PIC specific attributes.
+    ATS.emitAttribute(ARMBuildAttrs::ABI_PCS_RW_data,
+                      ARMBuildAttrs::AddressRWPCRel);
+    ATS.emitAttribute(ARMBuildAttrs::ABI_PCS_RO_data,
+                      ARMBuildAttrs::AddressROPCRel);
+    ATS.emitAttribute(ARMBuildAttrs::ABI_PCS_GOT_use,
+                      ARMBuildAttrs::AddressGOT);
+  } else {
+    // Allow direct addressing of imported data for all other relocation models.
+    ATS.emitAttribute(ARMBuildAttrs::ABI_PCS_GOT_use,
+                      ARMBuildAttrs::AddressDirect);
+  }
+
   // Signal various FP modes.
   if (!TM.Options.UnsafeFPMath) {
     ATS.emitAttribute(ARMBuildAttrs::ABI_FP_denormal, ARMBuildAttrs::Allowed);
@@ -723,7 +781,7 @@ MCSymbol *ARMAsmPrinter::GetARMGVSymbol(const GlobalValue *GV,
   MachineModuleInfoImpl::StubValueTy &StubSym =
     GV->hasHiddenVisibility() ? MMIMachO.getHiddenGVStubEntry(MCSym) :
     MMIMachO.getGVStubEntry(MCSym);
-  if (StubSym.getPointer() == 0)
+  if (!StubSym.getPointer())
     StubSym = MachineModuleInfoImpl::
       StubValueTy(getSymbol(GV), !GV->hasInternalLinkage());
   return MCSym;
@@ -971,7 +1029,8 @@ void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) {
       RegList.push_back(SrcReg);
       break;
     }
-    ATS.emitRegSave(RegList, Opc == ARM::VSTMDDB_UPD);
+    if (MAI->getExceptionHandlingType() == ExceptionHandling::ARM)
+      ATS.emitRegSave(RegList, Opc == ARM::VSTMDDB_UPD);
   } else {
     // Changes of stack / frame pointer.
     if (SrcReg == ARM::SP) {
@@ -1016,18 +1075,20 @@ void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) {
       }
       }
 
-      if (DstReg == FramePtr && FramePtr != ARM::SP)
-        // Set-up of the frame pointer. Positive values correspond to "add"
-        // instruction.
-        ATS.emitSetFP(FramePtr, ARM::SP, -Offset);
-      else if (DstReg == ARM::SP) {
-        // Change of SP by an offset. Positive values correspond to "sub"
-        // instruction.
-        ATS.emitPad(Offset);
-      } else {
-        // Move of SP to a register.  Positive values correspond to an "add"
-        // instruction.
-        ATS.emitMovSP(DstReg, -Offset);
+      if (MAI->getExceptionHandlingType() == ExceptionHandling::ARM) {
+        if (DstReg == FramePtr && FramePtr != ARM::SP)
+          // Set-up of the frame pointer. Positive values correspond to "add"
+          // instruction.
+          ATS.emitSetFP(FramePtr, ARM::SP, -Offset);
+        else if (DstReg == ARM::SP) {
+          // Change of SP by an offset. Positive values correspond to "sub"
+          // instruction.
+          ATS.emitPad(Offset);
+        } else {
+          // Move of SP to a register.  Positive values correspond to an "add"
+          // instruction.
+          ATS.emitMovSP(DstReg, -Offset);
+        }
       }
     } else if (DstReg == ARM::SP) {
       MI->dump();
diff --git a/lib/Target/ARM/ARMAsmPrinter.h b/lib/Target/ARM/ARMAsmPrinter.h
index 46c2626..7c103c6 100644
--- a/lib/Target/ARM/ARMAsmPrinter.h
+++ b/lib/Target/ARM/ARMAsmPrinter.h
@@ -47,16 +47,17 @@ class LLVM_LIBRARY_VISIBILITY ARMAsmPrinter : public AsmPrinter {
   bool InConstantPool;
 public:
   explicit ARMAsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
-    : AsmPrinter(TM, Streamer), AFI(NULL), MCP(NULL), InConstantPool(false) {
-      Subtarget = &TM.getSubtarget<ARMSubtarget>();
-    }
+    : AsmPrinter(TM, Streamer), AFI(nullptr), MCP(nullptr),
+      InConstantPool(false) {
+    Subtarget = &TM.getSubtarget<ARMSubtarget>();
+  }
 
   const char *getPassName() const override {
     return "ARM Assembly / Object Emitter";
   }
 
   void printOperand(const MachineInstr *MI, int OpNum, raw_ostream &O,
-                    const char *Modifier = 0);
+                    const char *Modifier = nullptr);
 
   bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
                        unsigned AsmVariant, const char *ExtraCode,
diff --git a/lib/Target/ARM/ARMAtomicExpandPass.cpp b/lib/Target/ARM/ARMAtomicExpandPass.cpp
deleted file mode 100644
index 18e0783..0000000
--- a/lib/Target/ARM/ARMAtomicExpandPass.cpp
+++ /dev/null
@@ -1,406 +0,0 @@
-//===-- ARMAtomicExpandPass.cpp - Expand atomic instructions --------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains a pass (at IR level) to replace atomic instructions with
-// appropriate (intrinsic-based) ldrex/strex loops.
-//
-//===----------------------------------------------------------------------===//
-
-#define DEBUG_TYPE "arm-atomic-expand"
-#include "ARM.h"
-#include "llvm/CodeGen/Passes.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Target/TargetLowering.h"
-#include "llvm/Target/TargetMachine.h"
-using namespace llvm;
-
-namespace {
-  class ARMAtomicExpandPass : public FunctionPass {
-    const TargetLowering *TLI;
-  public:
-    static char ID; // Pass identification, replacement for typeid
-    explicit ARMAtomicExpandPass(const TargetMachine *TM = 0)
-      : FunctionPass(ID), TLI(TM->getTargetLowering()) {}
-
-    bool runOnFunction(Function &F) override;
-    bool expandAtomicInsts(Function &F);
-
-    bool expandAtomicLoad(LoadInst *LI);
-    bool expandAtomicStore(StoreInst *LI);
-    bool expandAtomicRMW(AtomicRMWInst *AI);
-    bool expandAtomicCmpXchg(AtomicCmpXchgInst *CI);
-
-    AtomicOrdering insertLeadingFence(IRBuilder<> &Builder, AtomicOrdering Ord);
-    void insertTrailingFence(IRBuilder<> &Builder, AtomicOrdering Ord);
-
-    /// Perform a load-linked operation on Addr, returning a "Value *" with the
-    /// corresponding pointee type. This may entail some non-trivial operations
-    /// to truncate or reconstruct illegal types since intrinsics must be legal
-    Value *loadLinked(IRBuilder<> &Builder, Value *Addr, AtomicOrdering Ord);
-
-    /// Perform a store-conditional operation to Addr. Return the status of the
-    /// store: 0 if the it succeeded, non-zero otherwise.
-    Value *storeConditional(IRBuilder<> &Builder, Value *Val, Value *Addr,
-                            AtomicOrdering Ord);
-
-    /// Return true if the given (atomic) instruction should be expanded by this
-    /// pass.
-    bool shouldExpandAtomic(Instruction *Inst);
-  };
-}
-
-char ARMAtomicExpandPass::ID = 0;
-
-FunctionPass *llvm::createARMAtomicExpandPass(const TargetMachine *TM) {
-  return new ARMAtomicExpandPass(TM);
-}
-
-bool ARMAtomicExpandPass::runOnFunction(Function &F) {
-  SmallVector<Instruction *, 1> AtomicInsts;
-
-  // Changing control-flow while iterating through it is a bad idea, so gather a
-  // list of all atomic instructions before we start.
-  for (BasicBlock &BB : F)
-    for (Instruction &Inst : BB) {
-      if (isa<AtomicRMWInst>(&Inst) || isa<AtomicCmpXchgInst>(&Inst) ||
-          (isa<LoadInst>(&Inst) && cast<LoadInst>(&Inst)->isAtomic()) ||
-          (isa<StoreInst>(&Inst) && cast<StoreInst>(&Inst)->isAtomic()))
-        AtomicInsts.push_back(&Inst);
-    }
-
-  bool MadeChange = false;
-  for (Instruction *Inst : AtomicInsts) {
-    if (!shouldExpandAtomic(Inst))
-      continue;
-
-    if (AtomicRMWInst *AI = dyn_cast<AtomicRMWInst>(Inst))
-      MadeChange |= expandAtomicRMW(AI);
-    else if (AtomicCmpXchgInst *CI = dyn_cast<AtomicCmpXchgInst>(Inst))
-      MadeChange |= expandAtomicCmpXchg(CI);
-    else if (LoadInst *LI = dyn_cast<LoadInst>(Inst))
-      MadeChange |= expandAtomicLoad(LI);
-    else if (StoreInst *SI = dyn_cast<StoreInst>(Inst))
-      MadeChange |= expandAtomicStore(SI);
-    else
-      llvm_unreachable("Unknown atomic instruction");
-  }
-
-  return MadeChange;
-}
-
-bool ARMAtomicExpandPass::expandAtomicLoad(LoadInst *LI) {
-  // Load instructions don't actually need a leading fence, even in the
-  // SequentiallyConsistent case.
-  AtomicOrdering MemOpOrder =
-    TLI->getInsertFencesForAtomic() ? Monotonic : LI->getOrdering();
-
-  // The only 64-bit load guaranteed to be single-copy atomic by the ARM ARM is
-  // an ldrexd (A3.5.3).
-  IRBuilder<> Builder(LI);
-  Value *Val = loadLinked(Builder, LI->getPointerOperand(), MemOpOrder);
-
-  insertTrailingFence(Builder, LI->getOrdering());
-
-  LI->replaceAllUsesWith(Val);
-  LI->eraseFromParent();
-
-  return true;
-}
-
-bool ARMAtomicExpandPass::expandAtomicStore(StoreInst *SI) {
-  // The only atomic 64-bit store on ARM is an strexd that succeeds, which means
-  // we need a loop and the entire instruction is essentially an "atomicrmw
-  // xchg" that ignores the value loaded.
-  IRBuilder<> Builder(SI);
-  AtomicRMWInst *AI =
-      Builder.CreateAtomicRMW(AtomicRMWInst::Xchg, SI->getPointerOperand(),
-                              SI->getValueOperand(), SI->getOrdering());
-  SI->eraseFromParent();
-
-  // Now we have an appropriate swap instruction, lower it as usual.
-  return expandAtomicRMW(AI);
-}
-
-bool ARMAtomicExpandPass::expandAtomicRMW(AtomicRMWInst *AI) {
-  AtomicOrdering Order = AI->getOrdering();
-  Value *Addr = AI->getPointerOperand();
-  BasicBlock *BB = AI->getParent();
-  Function *F = BB->getParent();
-  LLVMContext &Ctx = F->getContext();
-
-  // Given: atomicrmw some_op iN* %addr, iN %incr ordering
-  //
-  // The standard expansion we produce is:
-  //     [...]
-  //     fence?
-  // atomicrmw.start:
-  //     %loaded = @load.linked(%addr)
-  //     %new = some_op iN %loaded, %incr
-  //     %stored = @store_conditional(%new, %addr)
-  //     %try_again = icmp i32 ne %stored, 0
-  //     br i1 %try_again, label %loop, label %atomicrmw.end
-  // atomicrmw.end:
-  //     fence?
-  //     [...]
-  BasicBlock *ExitBB = BB->splitBasicBlock(AI, "atomicrmw.end");
-  BasicBlock *LoopBB =  BasicBlock::Create(Ctx, "atomicrmw.start", F, ExitBB);
-
-  // This grabs the DebugLoc from AI.
-  IRBuilder<> Builder(AI);
-
-  // The split call above "helpfully" added a branch at the end of BB (to the
-  // wrong place), but we might want a fence too. It's easiest to just remove
-  // the branch entirely.
-  std::prev(BB->end())->eraseFromParent();
-  Builder.SetInsertPoint(BB);
-  AtomicOrdering MemOpOrder = insertLeadingFence(Builder, Order);
-  Builder.CreateBr(LoopBB);
-
-  // Start the main loop block now that we've taken care of the preliminaries.
-  Builder.SetInsertPoint(LoopBB);
-  Value *Loaded = loadLinked(Builder, Addr, MemOpOrder);
-
-  Value *NewVal;
-  switch (AI->getOperation()) {
-  case AtomicRMWInst::Xchg:
-    NewVal = AI->getValOperand();
-    break;
-  case AtomicRMWInst::Add:
-    NewVal = Builder.CreateAdd(Loaded, AI->getValOperand(), "new");
-    break;
-  case AtomicRMWInst::Sub:
-    NewVal = Builder.CreateSub(Loaded, AI->getValOperand(), "new");
-    break;
-  case AtomicRMWInst::And:
-    NewVal = Builder.CreateAnd(Loaded, AI->getValOperand(), "new");
-    break;
-  case AtomicRMWInst::Nand:
-    NewVal = Builder.CreateAnd(Loaded, Builder.CreateNot(AI->getValOperand()),
-                               "new");
-    break;
-  case AtomicRMWInst::Or:
-    NewVal = Builder.CreateOr(Loaded, AI->getValOperand(), "new");
-    break;
-  case AtomicRMWInst::Xor:
-    NewVal = Builder.CreateXor(Loaded, AI->getValOperand(), "new");
-    break;
-  case AtomicRMWInst::Max:
-    NewVal = Builder.CreateICmpSGT(Loaded, AI->getValOperand());
-    NewVal = Builder.CreateSelect(NewVal, Loaded, AI->getValOperand(), "new");
-    break;
-  case AtomicRMWInst::Min:
-    NewVal = Builder.CreateICmpSLE(Loaded, AI->getValOperand());
-    NewVal = Builder.CreateSelect(NewVal, Loaded, AI->getValOperand(), "new");
-    break;
-  case AtomicRMWInst::UMax:
-    NewVal = Builder.CreateICmpUGT(Loaded, AI->getValOperand());
-    NewVal = Builder.CreateSelect(NewVal, Loaded, AI->getValOperand(), "new");
-    break;
-  case AtomicRMWInst::UMin:
-    NewVal = Builder.CreateICmpULE(Loaded, AI->getValOperand());
-    NewVal = Builder.CreateSelect(NewVal, Loaded, AI->getValOperand(), "new");
-    break;
-  default:
-    llvm_unreachable("Unknown atomic op");
-  }
-
-  Value *StoreSuccess = storeConditional(Builder, NewVal, Addr, MemOpOrder);
-  Value *TryAgain = Builder.CreateICmpNE(
-      StoreSuccess, ConstantInt::get(IntegerType::get(Ctx, 32), 0), "tryagain");
-  Builder.CreateCondBr(TryAgain, LoopBB, ExitBB);
-
-  Builder.SetInsertPoint(ExitBB, ExitBB->begin());
-  insertTrailingFence(Builder, Order);
-
-  AI->replaceAllUsesWith(Loaded);
-  AI->eraseFromParent();
-
-  return true;
-}
-
-bool ARMAtomicExpandPass::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
-  AtomicOrdering SuccessOrder = CI->getSuccessOrdering();
-  AtomicOrdering FailureOrder = CI->getFailureOrdering();
-  Value *Addr = CI->getPointerOperand();
-  BasicBlock *BB = CI->getParent();
-  Function *F = BB->getParent();
-  LLVMContext &Ctx = F->getContext();
-
-  // Given: cmpxchg some_op iN* %addr, iN %desired, iN %new success_ord fail_ord
-  //
-  // The full expansion we produce is:
-  //     [...]
-  //     fence?
-  // cmpxchg.start:
-  //     %loaded = @load.linked(%addr)
-  //     %should_store = icmp eq %loaded, %desired
-  //     br i1 %should_store, label %cmpxchg.trystore,
-  //                          label %cmpxchg.end/%cmpxchg.barrier
-  // cmpxchg.trystore:
-  //     %stored = @store_conditional(%new, %addr)
-  //     %try_again = icmp i32 ne %stored, 0
-  //     br i1 %try_again, label %loop, label %cmpxchg.end
-  // cmpxchg.barrier:
-  //     fence?
-  //     br label %cmpxchg.end
-  // cmpxchg.end:
-  //     [...]
-  BasicBlock *ExitBB = BB->splitBasicBlock(CI, "cmpxchg.end");
-  auto BarrierBB = BasicBlock::Create(Ctx, "cmpxchg.trystore", F, ExitBB);
-  auto TryStoreBB = BasicBlock::Create(Ctx, "cmpxchg.barrier", F, BarrierBB);
-  auto LoopBB = BasicBlock::Create(Ctx, "cmpxchg.start", F, TryStoreBB);
-
-  // This grabs the DebugLoc from CI
-  IRBuilder<> Builder(CI);
-
-  // The split call above "helpfully" added a branch at the end of BB (to the
-  // wrong place), but we might want a fence too. It's easiest to just remove
-  // the branch entirely.
-  std::prev(BB->end())->eraseFromParent();
-  Builder.SetInsertPoint(BB);
-  AtomicOrdering MemOpOrder = insertLeadingFence(Builder, SuccessOrder);
-  Builder.CreateBr(LoopBB);
-
-  // Start the main loop block now that we've taken care of the preliminaries.
-  Builder.SetInsertPoint(LoopBB);
-  Value *Loaded = loadLinked(Builder, Addr, MemOpOrder);
-  Value *ShouldStore =
-      Builder.CreateICmpEQ(Loaded, CI->getCompareOperand(), "should_store");
-
-  // If the the cmpxchg doesn't actually need any ordering when it fails, we can
-  // jump straight past that fence instruction (if it exists).
-  BasicBlock *FailureBB = FailureOrder == Monotonic ? ExitBB : BarrierBB;
-  Builder.CreateCondBr(ShouldStore, TryStoreBB, FailureBB);
-
-  Builder.SetInsertPoint(TryStoreBB);
-  Value *StoreSuccess =
-      storeConditional(Builder, CI->getNewValOperand(), Addr, MemOpOrder);
-  Value *TryAgain = Builder.CreateICmpNE(
-      StoreSuccess, ConstantInt::get(Type::getInt32Ty(Ctx), 0), "success");
-  Builder.CreateCondBr(TryAgain, LoopBB, BarrierBB);
-
-  // Finally, make sure later instructions don't get reordered with a fence if
-  // necessary.
-  Builder.SetInsertPoint(BarrierBB);
-  insertTrailingFence(Builder, SuccessOrder);
-  Builder.CreateBr(ExitBB);
-
-  CI->replaceAllUsesWith(Loaded);
-  CI->eraseFromParent();
-
-  return true;
-}
-
-Value *ARMAtomicExpandPass::loadLinked(IRBuilder<> &Builder, Value *Addr,
-                                          AtomicOrdering Ord) {
-  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
-  Type *ValTy = cast<PointerType>(Addr->getType())->getElementType();
-  bool IsAcquire =
-      Ord == Acquire || Ord == AcquireRelease || Ord == SequentiallyConsistent;
-
-  // Since i64 isn't legal and intrinsics don't get type-lowered, the ldrexd
-  // intrinsic must return {i32, i32} and we have to recombine them into a
-  // single i64 here.
-  if (ValTy->getPrimitiveSizeInBits() == 64) {
-    Intrinsic::ID Int =
-        IsAcquire ? Intrinsic::arm_ldaexd : Intrinsic::arm_ldrexd;
-    Function *Ldrex = llvm::Intrinsic::getDeclaration(M, Int);
-
-    Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
-    Value *LoHi = Builder.CreateCall(Ldrex, Addr, "lohi");
-
-    Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
-    Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
-    Lo = Builder.CreateZExt(Lo, ValTy, "lo64");
-    Hi = Builder.CreateZExt(Hi, ValTy, "hi64");
-    return Builder.CreateOr(
-        Lo, Builder.CreateShl(Hi, ConstantInt::get(ValTy, 32)), "val64");
-  }
-
-  Type *Tys[] = { Addr->getType() };
-  Intrinsic::ID Int = IsAcquire ? Intrinsic::arm_ldaex : Intrinsic::arm_ldrex;
-  Function *Ldrex = llvm::Intrinsic::getDeclaration(M, Int, Tys);
-
-  return Builder.CreateTruncOrBitCast(
-      Builder.CreateCall(Ldrex, Addr),
-      cast<PointerType>(Addr->getType())->getElementType());
-}
-
-Value *ARMAtomicExpandPass::storeConditional(IRBuilder<> &Builder, Value *Val,
-                                           Value *Addr, AtomicOrdering Ord) {
-  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
-  bool IsRelease =
-      Ord == Release || Ord == AcquireRelease || Ord == SequentiallyConsistent;
-
-  // Since the intrinsics must have legal type, the i64 intrinsics take two
-  // parameters: "i32, i32". We must marshal Val into the appropriate form
-  // before the call.
-  if (Val->getType()->getPrimitiveSizeInBits() == 64) {
-    Intrinsic::ID Int =
-        IsRelease ? Intrinsic::arm_stlexd : Intrinsic::arm_strexd;
-    Function *Strex = Intrinsic::getDeclaration(M, Int);
-    Type *Int32Ty = Type::getInt32Ty(M->getContext());
-
-    Value *Lo = Builder.CreateTrunc(Val, Int32Ty, "lo");
-    Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 32), Int32Ty, "hi");
-    Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
-    return Builder.CreateCall3(Strex, Lo, Hi, Addr);
-  }
-
-  Intrinsic::ID Int = IsRelease ? Intrinsic::arm_stlex : Intrinsic::arm_strex;
-  Type *Tys[] = { Addr->getType() };
-  Function *Strex = Intrinsic::getDeclaration(M, Int, Tys);
-
-  return Builder.CreateCall2(
-      Strex, Builder.CreateZExtOrBitCast(
-                 Val, Strex->getFunctionType()->getParamType(0)),
-      Addr);
-}
-
-AtomicOrdering ARMAtomicExpandPass::insertLeadingFence(IRBuilder<> &Builder,
-                                                       AtomicOrdering Ord) {
-  if (!TLI->getInsertFencesForAtomic())
-    return Ord;
-
-  if (Ord == Release || Ord == AcquireRelease || Ord == SequentiallyConsistent)
-    Builder.CreateFence(Release);
-
-  // The exclusive operations don't need any barrier if we're adding separate
-  // fences.
-  return Monotonic;
-}
-
-void ARMAtomicExpandPass::insertTrailingFence(IRBuilder<> &Builder,
-                                              AtomicOrdering Ord) {
-  if (!TLI->getInsertFencesForAtomic())
-    return;
-
-  if (Ord == Acquire || Ord == AcquireRelease)
-    Builder.CreateFence(Acquire);
-  else if (Ord == SequentiallyConsistent)
-    Builder.CreateFence(SequentiallyConsistent);
-}
-
-bool ARMAtomicExpandPass::shouldExpandAtomic(Instruction *Inst) {
-  // Loads and stores less than 64-bits are already atomic; ones above that
-  // are doomed anyway, so defer to the default libcall and blame the OS when
-  // things go wrong:
-  if (StoreInst *SI = dyn_cast<StoreInst>(Inst))
-    return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 64;
-  else if (LoadInst *LI = dyn_cast<LoadInst>(Inst))
-    return LI->getType()->getPrimitiveSizeInBits() == 64;
-
-  // For the real atomic operations, we have ldrex/strex up to 64 bits.
-  return Inst->getType()->getPrimitiveSizeInBits() <= 64;
-}
diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp
index 47f5bf9..bc266e8 100644
--- a/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -37,11 +37,13 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 
+using namespace llvm;
+
+#define DEBUG_TYPE "arm-instrinfo"
+
 #define GET_INSTRINFO_CTOR_DTOR
 #include "ARMGenInstrInfo.inc"
 
-using namespace llvm;
-
 static cl::opt<bool>
 EnableARM3Addr("enable-arm-3-addr-conv", cl::Hidden,
                cl::desc("Enable ARM 2-addr to 3-addr conv"));
@@ -125,14 +127,14 @@ ARMBaseInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
   // FIXME: Thumb2 support.
 
   if (!EnableARM3Addr)
-    return NULL;
+    return nullptr;
 
   MachineInstr *MI = MBBI;
   MachineFunction &MF = *MI->getParent()->getParent();
   uint64_t TSFlags = MI->getDesc().TSFlags;
   bool isPre = false;
   switch ((TSFlags & ARMII::IndexModeMask) >> ARMII::IndexModeShift) {
-  default: return NULL;
+  default: return nullptr;
   case ARMII::IndexModePre:
     isPre = true;
     break;
@@ -144,10 +146,10 @@ ARMBaseInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
   // operation.
   unsigned MemOpc = getUnindexedOpcode(MI->getOpcode());
   if (MemOpc == 0)
-    return NULL;
+    return nullptr;
 
-  MachineInstr *UpdateMI = NULL;
-  MachineInstr *MemMI = NULL;
+  MachineInstr *UpdateMI = nullptr;
+  MachineInstr *MemMI = nullptr;
   unsigned AddrMode = (TSFlags & ARMII::AddrModeMask);
   const MCInstrDesc &MCID = MI->getDesc();
   unsigned NumOps = MCID.getNumOperands();
@@ -169,7 +171,7 @@ ARMBaseInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
       if (ARM_AM::getSOImmVal(Amt) == -1)
         // Can't encode it in a so_imm operand. This transformation will
         // add more than 1 instruction. Abandon!
-        return NULL;
+        return nullptr;
       UpdateMI = BuildMI(MF, MI->getDebugLoc(),
                          get(isSub ? ARM::SUBri : ARM::ADDri), WBReg)
         .addReg(BaseReg).addImm(Amt)
@@ -273,8 +275,8 @@ ARMBaseInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB,
                                 MachineBasicBlock *&FBB,
                                 SmallVectorImpl<MachineOperand> &Cond,
                                 bool AllowModify) const {
-  TBB = 0;
-  FBB = 0;
+  TBB = nullptr;
+  FBB = nullptr;
 
   MachineBasicBlock::iterator I = MBB.end();
   if (I == MBB.begin())
@@ -331,7 +333,7 @@ ARMBaseInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB,
            I->isReturn())) {
       // Forget any previous condition branch information - it no longer applies.
       Cond.clear();
-      FBB = 0;
+      FBB = nullptr;
 
       // If we can modify the function, delete everything below this
       // unconditional branch.
@@ -405,7 +407,7 @@ ARMBaseInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
   assert((Cond.size() == 2 || Cond.size() == 0) &&
          "ARM branch conditions have two components!");
 
-  if (FBB == 0) {
+  if (!FBB) {
     if (Cond.empty()) { // Unconditional branch?
       if (isThumb)
         BuildMI(&MBB, DL, get(BOpc)).addMBB(TBB).addImm(ARMCC::AL).addReg(0);
@@ -535,7 +537,8 @@ bool ARMBaseInstrInfo::isPredicable(MachineInstr *MI) const {
   return true;
 }
 
-template<> bool IsCPSRDead<MachineInstr>(MachineInstr* MI) {
+namespace llvm {
+template <> bool IsCPSRDead<MachineInstr>(MachineInstr *MI) {
   for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
     const MachineOperand &MO = MI->getOperand(i);
     if (!MO.isReg() || MO.isUndef() || MO.isUse())
@@ -548,6 +551,7 @@ template<> bool IsCPSRDead<MachineInstr>(MachineInstr* MI) {
   // all definitions of CPSR are dead
   return true;
 }
+}
 
 /// FIXME: Works around a gcc miscompilation with -fstrict-aliasing.
 LLVM_ATTRIBUTE_NOINLINE
@@ -620,7 +624,7 @@ unsigned ARMBaseInstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const {
       MI->getOperand(NumOps - (MI->isPredicable() ? 3 : 2));
     unsigned JTI = JTOP.getIndex();
     const MachineJumpTableInfo *MJTI = MF->getJumpTableInfo();
-    assert(MJTI != 0);
+    assert(MJTI != nullptr);
     const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables();
     assert(JTI < JT.size());
     // Thumb instructions are 2 byte aligned, but JT entries are 4 byte
@@ -1248,7 +1252,7 @@ static unsigned duplicateCPV(MachineFunction &MF, unsigned &CPI) {
     static_cast<ARMConstantPoolValue*>(MCPE.Val.MachineCPVal);
 
   unsigned PCLabelId = AFI->createPICLabelUId();
-  ARMConstantPoolValue *NewCPV = 0;
+  ARMConstantPoolValue *NewCPV = nullptr;
 
   // FIXME: The below assumes PIC relocation model and that the function
   // is Thumb mode (t1 or t2). PCAdjustment would be 8 for ARM mode PIC, and
@@ -1659,10 +1663,10 @@ ARMBaseInstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const {
     ARMCC::CondCodes CC = getInstrPredicate(MI, PredReg);
     // MOVCC AL can't be inverted. Shouldn't happen.
     if (CC == ARMCC::AL || PredReg != ARM::CPSR)
-      return NULL;
+      return nullptr;
     MI = TargetInstrInfo::commuteInstruction(MI, NewMI);
     if (!MI)
-      return NULL;
+      return nullptr;
     // After swapping the MOVCC operands, also invert the condition.
     MI->getOperand(MI->findFirstPredOperandIdx())
       .setImm(ARMCC::getOppositeCondition(CC));
@@ -1678,35 +1682,36 @@ static MachineInstr *canFoldIntoMOVCC(unsigned Reg,
                                       const MachineRegisterInfo &MRI,
                                       const TargetInstrInfo *TII) {
   if (!TargetRegisterInfo::isVirtualRegister(Reg))
-    return 0;
+    return nullptr;
   if (!MRI.hasOneNonDBGUse(Reg))
-    return 0;
+    return nullptr;
   MachineInstr *MI = MRI.getVRegDef(Reg);
   if (!MI)
-    return 0;
+    return nullptr;
   // MI is folded into the MOVCC by predicating it.
   if (!MI->isPredicable())
-    return 0;
+    return nullptr;
   // Check if MI has any non-dead defs or physreg uses. This also detects
   // predicated instructions which will be reading CPSR.
   for (unsigned i = 1, e = MI->getNumOperands(); i != e; ++i) {
     const MachineOperand &MO = MI->getOperand(i);
     // Reject frame index operands, PEI can't handle the predicated pseudos.
     if (MO.isFI() || MO.isCPI() || MO.isJTI())
-      return 0;
+      return nullptr;
     if (!MO.isReg())
       continue;
     // MI can't have any tied operands, that would conflict with predication.
     if (MO.isTied())
-      return 0;
+      return nullptr;
     if (TargetRegisterInfo::isPhysicalRegister(MO.getReg()))
-      return 0;
+      return nullptr;
     if (MO.isDef() && !MO.isDead())
-      return 0;
+      return nullptr;
   }
   bool DontMoveAcrossStores = true;
-  if (!MI->isSafeToMove(TII, /* AliasAnalysis = */ 0, DontMoveAcrossStores))
-    return 0;
+  if (!MI->isSafeToMove(TII, /* AliasAnalysis = */ nullptr,
+                        DontMoveAcrossStores))
+    return nullptr;
   return MI;
 }
 
@@ -1741,14 +1746,14 @@ MachineInstr *ARMBaseInstrInfo::optimizeSelect(MachineInstr *MI,
   if (!DefMI)
     DefMI = canFoldIntoMOVCC(MI->getOperand(1).getReg(), MRI, this);
   if (!DefMI)
-    return 0;
+    return nullptr;
 
   // Find new register class to use.
   MachineOperand FalseReg = MI->getOperand(Invert ? 2 : 1);
   unsigned       DestReg  = MI->getOperand(0).getReg();
   const TargetRegisterClass *PreviousClass = MRI.getRegClass(FalseReg.getReg());
   if (!MRI.constrainRegClass(DestReg, PreviousClass))
-    return 0;
+    return nullptr;
 
   // Create a new predicated version of DefMI.
   // Rfalse is the first use.
@@ -2254,7 +2259,7 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2,
   // Masked compares sometimes use the same register as the corresponding 'and'.
   if (CmpMask != ~0) {
     if (!isSuitableForMask(MI, SrcReg, CmpMask, false) || isPredicated(MI)) {
-      MI = 0;
+      MI = nullptr;
       for (MachineRegisterInfo::use_instr_iterator
            UI = MRI->use_instr_begin(SrcReg), UE = MRI->use_instr_end();
            UI != UE; ++UI) {
@@ -2281,17 +2286,17 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2,
   // One is MI, the other is a SUB instruction.
   // For CMPrr(r1,r2), we are looking for SUB(r1,r2) or SUB(r2,r1).
   // For CMPri(r1, CmpValue), we are looking for SUBri(r1, CmpValue).
-  MachineInstr *Sub = NULL;
+  MachineInstr *Sub = nullptr;
   if (SrcReg2 != 0)
     // MI is not a candidate for CMPrr.
-    MI = NULL;
+    MI = nullptr;
   else if (MI->getParent() != CmpInstr->getParent() || CmpValue != 0) {
     // Conservatively refuse to convert an instruction which isn't in the same
     // BB as the comparison.
     // For CMPri, we need to check Sub, thus we can't return here.
     if (CmpInstr->getOpcode() == ARM::CMPri ||
        CmpInstr->getOpcode() == ARM::t2CMPri)
-      MI = NULL;
+      MI = nullptr;
     else
       return false;
   }
@@ -3295,7 +3300,7 @@ static const MachineInstr *getBundledUseMI(const TargetRegisterInfo *TRI,
 
   if (Idx == -1) {
     Dist = 0;
-    return 0;
+    return nullptr;
   }
 
   UseIdx = Idx;
diff --git a/lib/Target/ARM/ARMBaseInstrInfo.h b/lib/Target/ARM/ARMBaseInstrInfo.h
index 3ddddcb..4b3e740 100644
--- a/lib/Target/ARM/ARMBaseInstrInfo.h
+++ b/lib/Target/ARM/ARMBaseInstrInfo.h
@@ -261,7 +261,7 @@ private:
 
   unsigned getInstrLatency(const InstrItineraryData *ItinData,
                            const MachineInstr *MI,
-                           unsigned *PredCost = 0) const override;
+                           unsigned *PredCost = nullptr) const override;
 
   int getInstrLatency(const InstrItineraryData *ItinData,
                       SDNode *Node) const override;
diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/lib/Target/ARM/ARMBaseRegisterInfo.cpp
index 8130a2d..a2eee9f 100644
--- a/lib/Target/ARM/ARMBaseRegisterInfo.cpp
+++ b/lib/Target/ARM/ARMBaseRegisterInfo.cpp
@@ -44,14 +44,18 @@
 using namespace llvm;
 
 ARMBaseRegisterInfo::ARMBaseRegisterInfo(const ARMSubtarget &sti)
-  : ARMGenRegisterInfo(ARM::LR, 0, 0, ARM::PC), STI(sti),
-    FramePtr((STI.isTargetMachO() || STI.isThumb()) ? ARM::R7 : ARM::R11),
-    BasePtr(ARM::R6) {
+    : ARMGenRegisterInfo(ARM::LR, 0, 0, ARM::PC), STI(sti), BasePtr(ARM::R6) {
+  if (STI.isTargetMachO())
+    FramePtr = ARM::R7;
+  else if (STI.isTargetWindows())
+    FramePtr = ARM::R11;
+  else // ARM EABI
+    FramePtr = STI.isThumb() ? ARM::R7 : ARM::R11;
 }
 
-const uint16_t*
+const MCPhysReg*
 ARMBaseRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
-  const uint16_t *RegList = (STI.isTargetIOS() && !STI.isAAPCS_ABI())
+  const MCPhysReg *RegList = (STI.isTargetIOS() && !STI.isAAPCS_ABI())
                                 ? CSR_iOS_SaveList
                                 : CSR_AAPCS_SaveList;
 
@@ -107,7 +111,7 @@ ARMBaseRegisterInfo::getThisReturnPreservedMask(CallingConv::ID CC) const {
   // should return NULL
   if (CC == CallingConv::GHC)
     // This is academic becase all GHC calls are (supposed to be) tail calls
-    return NULL;
+    return nullptr;
   return (STI.isTargetIOS() && !STI.isAAPCS_ABI())
     ? CSR_iOS_ThisReturn_RegMask : CSR_AAPCS_ThisReturn_RegMask;
 }
@@ -173,7 +177,7 @@ ARMBaseRegisterInfo::getPointerRegClass(const MachineFunction &MF, unsigned Kind
 const TargetRegisterClass *
 ARMBaseRegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const {
   if (RC == &ARM::CCRRegClass)
-    return 0;  // Can't copy CCR registers.
+    return nullptr;  // Can't copy CCR registers.
   return RC;
 }
 
diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.h b/lib/Target/ARM/ARMBaseRegisterInfo.h
index 66b3c82..91df565 100644
--- a/lib/Target/ARM/ARMBaseRegisterInfo.h
+++ b/lib/Target/ARM/ARMBaseRegisterInfo.h
@@ -100,8 +100,8 @@ protected:
 
 public:
   /// Code Generation virtual methods...
-  const uint16_t *
-  getCalleeSavedRegs(const MachineFunction *MF = 0) const override;
+  const MCPhysReg *
+  getCalleeSavedRegs(const MachineFunction *MF = nullptr) const override;
   const uint32_t *getCallPreservedMask(CallingConv::ID) const override;
   const uint32_t *getNoPreservedMask() const;
 
@@ -186,7 +186,7 @@ public:
 
   void eliminateFrameIndex(MachineBasicBlock::iterator II,
                            int SPAdj, unsigned FIOperandNum,
-                           RegScavenger *RS = NULL) const override;
+                           RegScavenger *RS = nullptr) const override;
 };
 
 } // end namespace llvm
diff --git a/lib/Target/ARM/ARMCallingConv.h b/lib/Target/ARM/ARMCallingConv.h
index 4f94ad2..dc41c1c 100644
--- a/lib/Target/ARM/ARMCallingConv.h
+++ b/lib/Target/ARM/ARMCallingConv.h
@@ -28,7 +28,7 @@ namespace llvm {
 static bool f64AssignAPCS(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
                           CCValAssign::LocInfo &LocInfo,
                           CCState &State, bool CanFail) {
-  static const uint16_t RegList[] = { ARM::R0, ARM::R1, ARM::R2, ARM::R3 };
+  static const MCPhysReg RegList[] = { ARM::R0, ARM::R1, ARM::R2, ARM::R3 };
 
   // Try to get the first register.
   if (unsigned Reg = State.AllocateReg(RegList, 4))
@@ -71,10 +71,10 @@ static bool CC_ARM_APCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
 static bool f64AssignAAPCS(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
                            CCValAssign::LocInfo &LocInfo,
                            CCState &State, bool CanFail) {
-  static const uint16_t HiRegList[] = { ARM::R0, ARM::R2 };
-  static const uint16_t LoRegList[] = { ARM::R1, ARM::R3 };
-  static const uint16_t ShadowRegList[] = { ARM::R0, ARM::R1 };
-  static const uint16_t GPRArgRegs[] = { ARM::R0, ARM::R1, ARM::R2, ARM::R3 };
+  static const MCPhysReg HiRegList[] = { ARM::R0, ARM::R2 };
+  static const MCPhysReg LoRegList[] = { ARM::R1, ARM::R3 };
+  static const MCPhysReg ShadowRegList[] = { ARM::R0, ARM::R1 };
+  static const MCPhysReg GPRArgRegs[] = { ARM::R0, ARM::R1, ARM::R2, ARM::R3 };
 
   unsigned Reg = State.AllocateReg(HiRegList, ShadowRegList, 2);
   if (Reg == 0) {
@@ -123,8 +123,8 @@ static bool CC_ARM_AAPCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
 
 static bool f64RetAssign(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
                          CCValAssign::LocInfo &LocInfo, CCState &State) {
-  static const uint16_t HiRegList[] = { ARM::R0, ARM::R2 };
-  static const uint16_t LoRegList[] = { ARM::R1, ARM::R3 };
+  static const MCPhysReg HiRegList[] = { ARM::R0, ARM::R2 };
+  static const MCPhysReg LoRegList[] = { ARM::R1, ARM::R3 };
 
   unsigned Reg = State.AllocateReg(HiRegList, LoRegList, 2);
   if (Reg == 0)
@@ -160,6 +160,105 @@ static bool RetCC_ARM_AAPCS_Custom_f64(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
                                    State);
 }
 
+static const uint16_t SRegList[] = { ARM::S0,  ARM::S1,  ARM::S2,  ARM::S3,
+                                     ARM::S4,  ARM::S5,  ARM::S6,  ARM::S7,
+                                     ARM::S8,  ARM::S9,  ARM::S10, ARM::S11,
+                                     ARM::S12, ARM::S13, ARM::S14,  ARM::S15 };
+static const uint16_t DRegList[] = { ARM::D0, ARM::D1, ARM::D2, ARM::D3,
+                                     ARM::D4, ARM::D5, ARM::D6, ARM::D7 };
+static const uint16_t QRegList[] = { ARM::Q0, ARM::Q1, ARM::Q2, ARM::Q3 };
+
+// Allocate part of an AAPCS HFA or HVA. We assume that each member of the HA
+// has InConsecutiveRegs set, and that the last member also has
+// InConsecutiveRegsLast set. We must process all members of the HA before
+// we can allocate it, as we need to know the total number of registers that
+// will be needed in order to (attempt to) allocate a contiguous block.
+static bool CC_ARM_AAPCS_Custom_HA(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
+                                   CCValAssign::LocInfo &LocInfo,
+                                   ISD::ArgFlagsTy &ArgFlags, CCState &State) {
+  SmallVectorImpl<CCValAssign> &PendingHAMembers = State.getPendingLocs();
+  // AAPCS HFAs must have 1-4 elements, all of the same type
+  assert(PendingHAMembers.size() < 8);
+  if (PendingHAMembers.size() > 0)
+    assert(PendingHAMembers[0].getLocVT() == LocVT);
+
+  // Add the argument to the list to be allocated once we know the size of the
+  // HA
+  PendingHAMembers.push_back(
+      CCValAssign::getPending(ValNo, ValVT, LocVT, LocInfo));
+
+  if (ArgFlags.isInConsecutiveRegsLast()) {
+    assert(PendingHAMembers.size() > 0 && PendingHAMembers.size() <= 8 &&
+           "Homogeneous aggregates must have between 1 and 4 members");
+
+    // Try to allocate a contiguous block of registers, each of the correct
+    // size to hold one member.
+    const uint16_t *RegList;
+    unsigned NumRegs;
+    switch (LocVT.SimpleTy) {
+    case MVT::i32:
+    case MVT::f32:
+      RegList = SRegList;
+      NumRegs = 16;
+      break;
+    case MVT::f64:
+      RegList = DRegList;
+      NumRegs = 8;
+      break;
+    case MVT::v2f64:
+      RegList = QRegList;
+      NumRegs = 4;
+      break;
+    default:
+      llvm_unreachable("Unexpected member type for HA");
+      break;
+    }
+
+    unsigned RegResult =
+        State.AllocateRegBlock(RegList, NumRegs, PendingHAMembers.size());
+
+    if (RegResult) {
+      for (SmallVectorImpl<CCValAssign>::iterator It = PendingHAMembers.begin();
+           It != PendingHAMembers.end(); ++It) {
+        It->convertToReg(RegResult);
+        State.addLoc(*It);
+        ++RegResult;
+      }
+      PendingHAMembers.clear();
+      return true;
+    }
+
+    // Register allocation failed, fall back to the stack
+
+    // Mark all VFP regs as unavailable (AAPCS rule C.2.vfp)
+    for (unsigned regNo = 0; regNo < 16; ++regNo)
+      State.AllocateReg(SRegList[regNo]);
+
+    unsigned Size = LocVT.getSizeInBits() / 8;
+    unsigned Align = Size;
+
+    if (LocVT.SimpleTy == MVT::v2f64 || LocVT.SimpleTy == MVT::i32) {
+      // Vectors are always aligned to 8 bytes. If we've seen an i32 here
+      // it's because it's been split from a larger type, also with align 8.
+      Align = 8;
+    }
+
+    for (auto It : PendingHAMembers) {
+      It.convertToMem(State.AllocateStack(Size, Align));
+      State.addLoc(It);
+
+      // Only the first member needs to be aligned.
+      Align = 1;
+    }
+
+    // All pending members have now been allocated
+    PendingHAMembers.clear();
+  }
+
+  // This will be allocated by the last member of the HA
+  return true;
+}
+
 } // End llvm namespace
 
 #endif
diff --git a/lib/Target/ARM/ARMCallingConv.td b/lib/Target/ARM/ARMCallingConv.td
index 7cffd82..526089b 100644
--- a/lib/Target/ARM/ARMCallingConv.td
+++ b/lib/Target/ARM/ARMCallingConv.td
@@ -174,6 +174,9 @@ def CC_ARM_AAPCS_VFP : CallingConv<[
   CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
   CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
 
+  // HFAs are passed in a contiguous block of registers, or on the stack
+  CCIfConsecutiveRegs<CCCustom<"CC_ARM_AAPCS_Custom_HA">>,
+
   CCIfType<[v2f64], CCAssignToReg<[Q0, Q1, Q2, Q3]>>,
   CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>,
   CCIfType<[f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7, S8,
diff --git a/lib/Target/ARM/ARMCodeEmitter.cpp b/lib/Target/ARM/ARMCodeEmitter.cpp
index 7359a11..2fd7edd 100644
--- a/lib/Target/ARM/ARMCodeEmitter.cpp
+++ b/lib/Target/ARM/ARMCodeEmitter.cpp
@@ -12,7 +12,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "jit"
 #include "ARM.h"
 #include "ARMBaseInstrInfo.h"
 #include "ARMConstantPoolValue.h"
@@ -40,6 +39,8 @@
 #endif
 using namespace llvm;
 
+#define DEBUG_TYPE "jit"
+
 STATISTIC(NumEmitted, "Number of machine instructions emitted");
 
 namespace {
@@ -65,10 +66,10 @@ namespace {
     static char ID;
   public:
     ARMCodeEmitter(TargetMachine &tm, JITCodeEmitter &mce)
-      : MachineFunctionPass(ID), JTI(0),
+      : MachineFunctionPass(ID), JTI(nullptr),
         II((const ARMBaseInstrInfo *)tm.getInstrInfo()),
         TD(tm.getDataLayout()), TM(tm),
-        MCE(mce), MCPEs(0), MJTEs(0),
+        MCE(mce), MCPEs(nullptr), MJTEs(nullptr),
         IsPIC(TM.getRelocationModel() == Reloc::PIC_), IsThumb(false) {}
 
     /// getBinaryCodeForInstr - This function, generated by the
@@ -373,7 +374,7 @@ bool ARMCodeEmitter::runOnMachineFunction(MachineFunction &MF) {
 
   Subtarget = &TM.getSubtarget<ARMSubtarget>();
   MCPEs = &MF.getConstantPool()->getConstants();
-  MJTEs = 0;
+  MJTEs = nullptr;
   if (MF.getJumpTableInfo()) MJTEs = &MF.getJumpTableInfo()->getJumpTables();
   IsPIC = TM.getRelocationModel() == Reloc::PIC_;
   IsThumb = MF.getInfo<ARMFunctionInfo>()->isThumbFunction();
diff --git a/lib/Target/ARM/ARMConstantIslandPass.cpp b/lib/Target/ARM/ARMConstantIslandPass.cpp
index ba05171..ce264ee 100644
--- a/lib/Target/ARM/ARMConstantIslandPass.cpp
+++ b/lib/Target/ARM/ARMConstantIslandPass.cpp
@@ -13,7 +13,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "arm-cp-islands"
 #include "ARM.h"
 #include "ARMMachineFunctionInfo.h"
 #include "MCTargetDesc/ARMAddressingModes.h"
@@ -36,6 +35,8 @@
 #include <algorithm>
 using namespace llvm;
 
+#define DEBUG_TYPE "arm-cp-islands"
+
 STATISTIC(NumCPEs,       "Number of constpool entries");
 STATISTIC(NumSplit,      "Number of uncond branches inserted");
 STATISTIC(NumCBrFixed,   "Number of cond branches fixed");
@@ -593,7 +594,7 @@ ARMConstantIslands::CPEntry
     if (CPEs[i].CPEMI == CPEMI)
       return &CPEs[i];
   }
-  return NULL;
+  return nullptr;
 }
 
 /// getCPELogAlign - Returns the required alignment of the constant pool entry
@@ -1102,7 +1103,7 @@ bool ARMConstantIslands::decrementCPEReferenceCount(unsigned CPI,
   assert(CPE && "Unexpected!");
   if (--CPE->RefCount == 0) {
     removeDeadCPEMI(CPEMI);
-    CPE->CPEMI = NULL;
+    CPE->CPEMI = nullptr;
     --NumCPEs;
     return true;
   }
@@ -1135,7 +1136,7 @@ int ARMConstantIslands::findInRangeCPEntry(CPUser& U, unsigned UserOffset)
     if (CPEs[i].CPEMI == CPEMI)
       continue;
     // Removing CPEs can leave empty entries, skip
-    if (CPEs[i].CPEMI == NULL)
+    if (CPEs[i].CPEMI == nullptr)
       continue;
     if (isCPEntryInRange(UserMI, UserOffset, CPEs[i].CPEMI, U.getMaxDisp(),
                      U.NegOk)) {
@@ -1317,7 +1318,7 @@ void ARMConstantIslands::createNewWater(unsigned CPUserIndex,
   ++MI;
   unsigned CPUIndex = CPUserIndex+1;
   unsigned NumCPUsers = CPUsers.size();
-  MachineInstr *LastIT = 0;
+  MachineInstr *LastIT = nullptr;
   for (unsigned Offset = UserOffset+TII->GetInstSizeInBytes(UserMI);
        Offset < BaseInsertOffset;
        Offset += TII->GetInstSizeInBytes(MI), MI = std::next(MI)) {
@@ -1491,7 +1492,7 @@ bool ARMConstantIslands::removeUnusedCPEntries() {
       for (unsigned j = 0, ee = CPEs.size(); j != ee; ++j) {
         if (CPEs[j].RefCount == 0 && CPEs[j].CPEMI) {
           removeDeadCPEMI(CPEs[j].CPEMI);
-          CPEs[j].CPEMI = NULL;
+          CPEs[j].CPEMI = nullptr;
           MadeChange = true;
         }
       }
@@ -1844,7 +1845,7 @@ bool ARMConstantIslands::optimizeThumb2JumpTables() {
   // FIXME: After the tables are shrunk, can we get rid some of the
   // constantpool tables?
   MachineJumpTableInfo *MJTI = MF->getJumpTableInfo();
-  if (MJTI == 0) return false;
+  if (!MJTI) return false;
 
   const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables();
   for (unsigned i = 0, e = T2JumpTables.size(); i != e; ++i) {
@@ -1970,7 +1971,7 @@ bool ARMConstantIslands::reorderThumb2JumpTables() {
   bool MadeChange = false;
 
   MachineJumpTableInfo *MJTI = MF->getJumpTableInfo();
-  if (MJTI == 0) return false;
+  if (!MJTI) return false;
 
   const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables();
   for (unsigned i = 0, e = T2JumpTables.size(); i != e; ++i) {
@@ -2012,7 +2013,7 @@ adjustJTTargetBlockForward(MachineBasicBlock *BB, MachineBasicBlock *JTBB) {
   // try to move it; otherwise, create a new block following the jump
   // table that branches back to the actual target. This is a very simple
   // heuristic. FIXME: We can definitely improve it.
-  MachineBasicBlock *TBB = 0, *FBB = 0;
+  MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
   SmallVector<MachineOperand, 4> Cond;
   SmallVector<MachineOperand, 4> CondPrior;
   MachineFunction::iterator BBi = BB;
@@ -2032,7 +2033,7 @@ adjustJTTargetBlockForward(MachineBasicBlock *BB, MachineBasicBlock *JTBB) {
     // Update numbering to account for the block being moved.
     MF->RenumberBlocks();
     ++NumJTMoved;
-    return NULL;
+    return nullptr;
   }
 
   // Create a new MBB for the code after the jump BB.
diff --git a/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/lib/Target/ARM/ARMExpandPseudoInsts.cpp
index bd4ee44..6045738 100644
--- a/lib/Target/ARM/ARMExpandPseudoInsts.cpp
+++ b/lib/Target/ARM/ARMExpandPseudoInsts.cpp
@@ -14,7 +14,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "arm-pseudo"
 #include "ARM.h"
 #include "ARMBaseInstrInfo.h"
 #include "ARMBaseRegisterInfo.h"
@@ -23,6 +22,7 @@
 #include "MCTargetDesc/ARMAddressingModes.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBundle.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/IR/GlobalValue.h"
 #include "llvm/Support/CommandLine.h"
@@ -31,6 +31,8 @@
 #include "llvm/Target/TargetRegisterInfo.h"
 using namespace llvm;
 
+#define DEBUG_TYPE "arm-pseudo"
+
 static cl::opt<bool>
 VerifyARMPseudo("verify-arm-pseudo-expand", cl::Hidden,
                 cl::desc("Verify machine code after expanding ARM pseudos"));
@@ -345,7 +347,7 @@ static const NEONLdStTableEntry *LookupNEONLdSt(unsigned Opcode) {
     std::lower_bound(NEONLdStTable, NEONLdStTable + NumEntries, Opcode);
   if (I != NEONLdStTable + NumEntries && I->PseudoOpc == Opcode)
     return I;
-  return NULL;
+  return nullptr;
 }
 
 /// GetDSubRegs - Get 4 D subregisters of a Q, QQ, or QQQQ register,
@@ -614,6 +616,39 @@ void ARMExpandPseudo::ExpandVTBL(MachineBasicBlock::iterator &MBBI,
   MI.eraseFromParent();
 }
 
+static bool IsAnAddressOperand(const MachineOperand &MO) {
+  // This check is overly conservative.  Unless we are certain that the machine
+  // operand is not a symbol reference, we return that it is a symbol reference.
+  // This is important as the load pair may not be split up Windows.
+  switch (MO.getType()) {
+  case MachineOperand::MO_Register:
+  case MachineOperand::MO_Immediate:
+  case MachineOperand::MO_CImmediate:
+  case MachineOperand::MO_FPImmediate:
+    return false;
+  case MachineOperand::MO_MachineBasicBlock:
+    return true;
+  case MachineOperand::MO_FrameIndex:
+    return false;
+  case MachineOperand::MO_ConstantPoolIndex:
+  case MachineOperand::MO_TargetIndex:
+  case MachineOperand::MO_JumpTableIndex:
+  case MachineOperand::MO_ExternalSymbol:
+  case MachineOperand::MO_GlobalAddress:
+  case MachineOperand::MO_BlockAddress:
+    return true;
+  case MachineOperand::MO_RegisterMask:
+  case MachineOperand::MO_RegisterLiveOut:
+    return false;
+  case MachineOperand::MO_Metadata:
+  case MachineOperand::MO_MCSymbol:
+    return true;
+  case MachineOperand::MO_CFIIndex:
+    return false;
+  }
+  llvm_unreachable("unhandled machine operand type");
+}
+
 void ARMExpandPseudo::ExpandMOV32BitImm(MachineBasicBlock &MBB,
                                         MachineBasicBlock::iterator &MBBI) {
   MachineInstr &MI = *MBBI;
@@ -624,10 +659,14 @@ void ARMExpandPseudo::ExpandMOV32BitImm(MachineBasicBlock &MBB,
   bool DstIsDead = MI.getOperand(0).isDead();
   bool isCC = Opcode == ARM::MOVCCi32imm || Opcode == ARM::t2MOVCCi32imm;
   const MachineOperand &MO = MI.getOperand(isCC ? 2 : 1);
+  bool RequiresBundling = STI->isTargetWindows() && IsAnAddressOperand(MO);
   MachineInstrBuilder LO16, HI16;
 
   if (!STI->hasV6T2Ops() &&
       (Opcode == ARM::MOVi32imm || Opcode == ARM::MOVCCi32imm)) {
+    // FIXME Windows CE supports older ARM CPUs
+    assert(!STI->isTargetWindows() && "Windows on ARM requires ARMv7+");
+
     // Expand into a movi + orr.
     LO16 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::MOVi), DstReg);
     HI16 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::ORRri))
@@ -664,17 +703,29 @@ void ARMExpandPseudo::ExpandMOV32BitImm(MachineBasicBlock &MBB,
     .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
     .addReg(DstReg);
 
-  if (MO.isImm()) {
+  switch (MO.getType()) {
+  case MachineOperand::MO_Immediate: {
     unsigned Imm = MO.getImm();
     unsigned Lo16 = Imm & 0xffff;
     unsigned Hi16 = (Imm >> 16) & 0xffff;
     LO16 = LO16.addImm(Lo16);
     HI16 = HI16.addImm(Hi16);
-  } else {
+    break;
+  }
+  case MachineOperand::MO_ExternalSymbol: {
+    const char *ES = MO.getSymbolName();
+    unsigned TF = MO.getTargetFlags();
+    LO16 = LO16.addExternalSymbol(ES, TF | ARMII::MO_LO16);
+    HI16 = HI16.addExternalSymbol(ES, TF | ARMII::MO_HI16);
+    break;
+  }
+  default: {
     const GlobalValue *GV = MO.getGlobal();
     unsigned TF = MO.getTargetFlags();
     LO16 = LO16.addGlobalAddress(GV, MO.getOffset(), TF | ARMII::MO_LO16);
     HI16 = HI16.addGlobalAddress(GV, MO.getOffset(), TF | ARMII::MO_HI16);
+    break;
+  }
   }
 
   LO16->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
@@ -682,6 +733,9 @@ void ARMExpandPseudo::ExpandMOV32BitImm(MachineBasicBlock &MBB,
   LO16.addImm(Pred).addReg(PredReg);
   HI16.addImm(Pred).addReg(PredReg);
 
+  if (RequiresBundling)
+    finalizeBundle(MBB, &*LO16, &*MBBI);
+
   TransferImpOps(MI, LO16, HI16);
   MI.eraseFromParent();
 }
diff --git a/lib/Target/ARM/ARMFastISel.cpp b/lib/Target/ARM/ARMFastISel.cpp
index c442444..6f8fb1a 100644
--- a/lib/Target/ARM/ARMFastISel.cpp
+++ b/lib/Target/ARM/ARMFastISel.cpp
@@ -166,8 +166,6 @@ class ARMFastISel final : public FastISel {
 
     // Utility routines.
   private:
-    unsigned constrainOperandRegClass(const MCInstrDesc &II, unsigned OpNum,
-                                      unsigned Op);
     bool isTypeLegal(Type *Ty, MVT &VT);
     bool isLoadTypeLegal(Type *Ty, MVT &VT);
     bool ARMEmitCmp(const Value *Src1Value, const Value *Src2Value,
@@ -191,6 +189,8 @@ class ARMFastISel final : public FastISel {
     unsigned ARMSelectCallOp(bool UseReg);
     unsigned ARMLowerPICELF(const GlobalValue *GV, unsigned Align, MVT VT);
 
+    const TargetLowering *getTargetLowering() { return TM.getTargetLowering(); }
+
     // Call handling routines.
   private:
     CCAssignFn *CCAssignFnForCall(CallingConv::ID CC,
@@ -283,23 +283,6 @@ ARMFastISel::AddOptionalDefs(const MachineInstrBuilder &MIB) {
   return MIB;
 }
 
-unsigned ARMFastISel::constrainOperandRegClass(const MCInstrDesc &II,
-                                               unsigned Op, unsigned OpNum) {
-  if (TargetRegisterInfo::isVirtualRegister(Op)) {
-    const TargetRegisterClass *RegClass =
-        TII.getRegClass(II, OpNum, &TRI, *FuncInfo.MF);
-    if (!MRI.constrainRegClass(Op, RegClass)) {
-      // If it's not legal to COPY between the register classes, something
-      // has gone very wrong before we got here.
-      unsigned NewOp = createResultReg(RegClass);
-      AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
-                              TII.get(TargetOpcode::COPY), NewOp).addReg(Op));
-      return NewOp;
-    }
-  }
-  return Op;
-}
-
 unsigned ARMFastISel::FastEmitInst_r(unsigned MachineInstOpcode,
                                      const TargetRegisterClass *RC,
                                      unsigned Op0, bool Op0IsKill) {
@@ -769,7 +752,7 @@ bool ARMFastISel::isLoadTypeLegal(Type *Ty, MVT &VT) {
 // Computes the address to get to an object.
 bool ARMFastISel::ARMComputeAddress(const Value *Obj, Address &Addr) {
   // Some boilerplate from the X86 FastISel.
-  const User *U = NULL;
+  const User *U = nullptr;
   unsigned Opcode = Instruction::UserOp1;
   if (const Instruction *I = dyn_cast<Instruction>(Obj)) {
     // Don't walk into other basic blocks unless the object is an alloca from
@@ -1400,7 +1383,7 @@ bool ARMFastISel::ARMEmitCmp(const Value *Src1Value, const Value *Src2Value,
       const APInt &CIVal = ConstInt->getValue();
       Imm = (isZExt) ? (int)CIVal.getZExtValue() : (int)CIVal.getSExtValue();
       // For INT_MIN/LONG_MIN (i.e., 0x80000000) we need to use a cmp, rather
-      // then a cmn, because there is no way to represent 2147483648 as a 
+      // then a cmn, because there is no way to represent 2147483648 as a
       // signed 32-bit int.
       if (Imm < 0 && Imm != (int)0x80000000) {
         isNegativeImm = true;
@@ -2182,7 +2165,8 @@ unsigned ARMFastISel::getLibcallReg(const Twine &Name) {
   if (!LCREVT.isSimple()) return 0;
 
   GlobalValue *GV = new GlobalVariable(M, Type::getInt32Ty(*Context), false,
-                                       GlobalValue::ExternalLinkage, 0, Name);
+                                       GlobalValue::ExternalLinkage, nullptr,
+                                       Name);
   assert(GV->getType() == GVTy && "We miscomputed the type for the global!");
   return ARMMaterializeGV(GV, LCREVT.getSimpleVT());
 }
@@ -2286,7 +2270,7 @@ bool ARMFastISel::ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call) {
 }
 
 bool ARMFastISel::SelectCall(const Instruction *I,
-                             const char *IntrMemName = 0) {
+                             const char *IntrMemName = nullptr) {
   const CallInst *CI = cast<CallInst>(I);
   const Value *Callee = CI->getCalledValue();
 
@@ -3092,6 +3076,6 @@ namespace llvm {
       TM.Options.NoFramePointerElim = true;
       return new ARMFastISel(funcInfo, libInfo);
     }
-    return 0;
+    return nullptr;
   }
 }
diff --git a/lib/Target/ARM/ARMFeatures.h b/lib/Target/ARM/ARMFeatures.h
index a30f4cd..e191a3c 100644
--- a/lib/Target/ARM/ARMFeatures.h
+++ b/lib/Target/ARM/ARMFeatures.h
@@ -1,4 +1,4 @@
-//===-- ARMFeatures.h - Checks for ARM instruction features ------*- C++ -*-===//
+//===-- ARMFeatures.h - Checks for ARM instruction features -----*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -16,11 +16,11 @@
 
 #include "MCTargetDesc/ARMMCTargetDesc.h"
 
+namespace llvm {
+
 template<typename InstrType> // could be MachineInstr or MCInst
 bool IsCPSRDead(InstrType *Instr);
 
-namespace llvm {
-
 template<typename InstrType> // could be MachineInstr or MCInst
 inline bool isV8EligibleForIT(InstrType *Instr) {
   switch (Instr->getOpcode()) {
diff --git a/lib/Target/ARM/ARMFrameLowering.cpp b/lib/Target/ARM/ARMFrameLowering.cpp
index 36ecfca..0caf4bf 100644
--- a/lib/Target/ARM/ARMFrameLowering.cpp
+++ b/lib/Target/ARM/ARMFrameLowering.cpp
@@ -87,7 +87,7 @@ ARMFrameLowering::canSimplifyCallFramePseudos(const MachineFunction &MF) const {
 
 static bool isCSRestore(MachineInstr *MI,
                         const ARMBaseInstrInfo &TII,
-                        const uint16_t *CSRegs) {
+                        const MCPhysReg *CSRegs) {
   // Integer spill area is handled with "pop".
   if (isPopOpcode(MI->getOpcode())) {
     // The first two operands are predicates. The last two are
@@ -142,6 +142,14 @@ static int sizeOfSPAdjustment(const MachineInstr *MI) {
   return count;
 }
 
+static bool WindowsRequiresStackProbe(const MachineFunction &MF,
+                                      size_t StackSizeInBytes) {
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  if (MFI->getStackProtectorIndex() > 0)
+    return StackSizeInBytes >= 4080;
+  return StackSizeInBytes >= 4096;
+}
+
 void ARMFrameLowering::emitPrologue(MachineFunction &MF) const {
   MachineBasicBlock &MBB = MF.front();
   MachineBasicBlock::iterator MBBI = MBB.begin();
@@ -149,15 +157,16 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF) const {
   ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
   MachineModuleInfo &MMI = MF.getMMI();
   MCContext &Context = MMI.getContext();
+  const TargetMachine &TM = MF.getTarget();
   const MCRegisterInfo *MRI = Context.getRegisterInfo();
   const ARMBaseRegisterInfo *RegInfo =
-    static_cast<const ARMBaseRegisterInfo*>(MF.getTarget().getRegisterInfo());
+    static_cast<const ARMBaseRegisterInfo*>(TM.getRegisterInfo());
   const ARMBaseInstrInfo &TII =
-    *static_cast<const ARMBaseInstrInfo*>(MF.getTarget().getInstrInfo());
+    *static_cast<const ARMBaseInstrInfo*>(TM.getInstrInfo());
   assert(!AFI->isThumb1OnlyFunction() &&
          "This emitPrologue does not support Thumb1!");
   bool isARM = !AFI->isThumbFunction();
-  unsigned Align = MF.getTarget().getFrameLowering()->getStackAlignment();
+  unsigned Align = TM.getFrameLowering()->getStackAlignment();
   unsigned ArgRegsSaveSize = AFI->getArgRegsSaveSize(Align);
   unsigned NumBytes = MFI->getStackSize();
   const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
@@ -187,7 +196,8 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF) const {
         .addCFIIndex(CFIIndex);
   }
 
-  if (!AFI->hasStackFrame()) {
+  if (!AFI->hasStackFrame() &&
+      (!STI.isTargetWindows() || !WindowsRequiresStackProbe(MF, NumBytes))) {
     if (NumBytes - ArgRegsSaveSize != 0) {
       emitSPUpdate(isARM, MBB, MBBI, dl, TII, -(NumBytes - ArgRegsSaveSize),
                    MachineInstr::FrameSetup);
@@ -284,6 +294,51 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF) const {
   } else
     NumBytes = DPRCSOffset;
 
+  if (STI.isTargetWindows() && WindowsRequiresStackProbe(MF, NumBytes)) {
+    uint32_t NumWords = NumBytes >> 2;
+
+    if (NumWords < 65536)
+      AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::t2MOVi16), ARM::R4)
+                     .addImm(NumWords)
+                     .setMIFlags(MachineInstr::FrameSetup));
+    else
+      BuildMI(MBB, MBBI, dl, TII.get(ARM::t2MOVi32imm), ARM::R4)
+        .addImm(NumWords)
+        .setMIFlags(MachineInstr::FrameSetup);
+
+    switch (TM.getCodeModel()) {
+    case CodeModel::Small:
+    case CodeModel::Medium:
+    case CodeModel::Default:
+    case CodeModel::Kernel:
+      BuildMI(MBB, MBBI, dl, TII.get(ARM::tBL))
+        .addImm((unsigned)ARMCC::AL).addReg(0)
+        .addExternalSymbol("__chkstk")
+        .addReg(ARM::R4, RegState::Implicit)
+        .setMIFlags(MachineInstr::FrameSetup);
+      break;
+    case CodeModel::Large:
+    case CodeModel::JITDefault:
+      BuildMI(MBB, MBBI, dl, TII.get(ARM::t2MOVi32imm), ARM::R12)
+        .addExternalSymbol("__chkstk")
+        .setMIFlags(MachineInstr::FrameSetup);
+
+      BuildMI(MBB, MBBI, dl, TII.get(ARM::tBLXr))
+        .addImm((unsigned)ARMCC::AL).addReg(0)
+        .addReg(ARM::R12, RegState::Kill)
+        .addReg(ARM::R4, RegState::Implicit)
+        .setMIFlags(MachineInstr::FrameSetup);
+      break;
+    }
+
+    AddDefaultCC(AddDefaultPred(BuildMI(MBB, MBBI, dl, TII.get(ARM::t2SUBrr),
+                                        ARM::SP)
+                                .addReg(ARM::SP, RegState::Define)
+                                .addReg(ARM::R4, RegState::Kill)
+                                .setMIFlags(MachineInstr::FrameSetup)));
+    NumBytes = 0;
+  }
+
   unsigned adjustedGPRCS1Size = GPRCS1Size;
   if (NumBytes) {
     // Adjust SP after all the callee-save spills.
@@ -316,10 +371,9 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF) const {
     MachineBasicBlock::iterator Pos = ++GPRCS1Push;
     BuildMI(MBB, Pos, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
         .addCFIIndex(CFIIndex);
-    for (std::vector<CalleeSavedInfo>::const_iterator I = CSI.begin(),
-           E = CSI.end(); I != E; ++I) {
-      unsigned Reg = I->getReg();
-      int FI = I->getFrameIdx();
+    for (const auto &Entry : CSI) {
+      unsigned Reg = Entry.getReg();
+      int FI = Entry.getFrameIdx();
       switch (Reg) {
       case ARM::R8:
       case ARM::R9:
@@ -382,10 +436,9 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF) const {
       BuildMI(MBB, Pos, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
           .addCFIIndex(CFIIndex);
     }
-    for (std::vector<CalleeSavedInfo>::const_iterator I = CSI.begin(),
-           E = CSI.end(); I != E; ++I) {
-      unsigned Reg = I->getReg();
-      int FI = I->getFrameIdx();
+    for (const auto &Entry : CSI) {
+      unsigned Reg = Entry.getReg();
+      int FI = Entry.getFrameIdx();
       switch (Reg) {
       case ARM::R8:
       case ARM::R9:
@@ -411,7 +464,7 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF) const {
     do {
       MachineBasicBlock::iterator Push = DPRCSPush++;
       if (!HasFP) {
-        CFAOffset -= sizeOfSPAdjustment(Push);;
+        CFAOffset -= sizeOfSPAdjustment(Push);
         unsigned CFIIndex = MMI.addFrameInst(
             MCCFIInstruction::createDefCfaOffset(nullptr, CFAOffset));
         BuildMI(MBB, DPRCSPush, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
@@ -419,10 +472,9 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF) const {
       }
     } while (DPRCSPush->getOpcode() == ARM::VSTMDDB_UPD);
 
-    for (std::vector<CalleeSavedInfo>::const_iterator I = CSI.begin(),
-           E = CSI.end(); I != E; ++I) {
-      unsigned Reg = I->getReg();
-      int FI = I->getFrameIdx();
+    for (const auto &Entry : CSI) {
+      unsigned Reg = Entry.getReg();
+      int FI = Entry.getFrameIdx();
       if ((Reg >= ARM::D0 && Reg <= ARM::D31) &&
           (Reg < ARM::D8 || Reg >= ARM::D8 + AFI->getNumAlignedDPRCS2Regs())) {
         unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true);
@@ -540,7 +592,7 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF,
       emitSPUpdate(isARM, MBB, MBBI, dl, TII, NumBytes - ArgRegsSaveSize);
   } else {
     // Unwind MBBI to point to first LDR / VLDRD.
-    const uint16_t *CSRegs = RegInfo->getCalleeSavedRegs(&MF);
+    const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF);
     if (MBBI != MBB.begin()) {
       do {
         --MBBI;
@@ -1205,12 +1257,9 @@ bool ARMFrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
 static unsigned GetFunctionSizeInBytes(const MachineFunction &MF,
                                        const ARMBaseInstrInfo &TII) {
   unsigned FnSize = 0;
-  for (MachineFunction::const_iterator MBBI = MF.begin(), E = MF.end();
-       MBBI != E; ++MBBI) {
-    const MachineBasicBlock &MBB = *MBBI;
-    for (MachineBasicBlock::const_iterator I = MBB.begin(),E = MBB.end();
-         I != E; ++I)
-      FnSize += TII.GetInstSizeInBytes(I);
+  for (auto &MBB : MF) {
+    for (auto &MI : MBB)
+      FnSize += TII.GetInstSizeInBytes(&MI);
   }
   return FnSize;
 }
@@ -1223,21 +1272,21 @@ static unsigned estimateRSStackSizeLimit(MachineFunction &MF,
                                          const TargetFrameLowering *TFI) {
   const ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
   unsigned Limit = (1 << 12) - 1;
-  for (MachineFunction::iterator BB = MF.begin(),E = MF.end(); BB != E; ++BB) {
-    for (MachineBasicBlock::iterator I = BB->begin(), E = BB->end();
-         I != E; ++I) {
-      for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
-        if (!I->getOperand(i).isFI()) continue;
+  for (auto &MBB : MF) {
+    for (auto &MI : MBB) {
+      for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+        if (!MI.getOperand(i).isFI())
+          continue;
 
         // When using ADDri to get the address of a stack object, 255 is the
         // largest offset guaranteed to fit in the immediate offset.
-        if (I->getOpcode() == ARM::ADDri) {
+        if (MI.getOpcode() == ARM::ADDri) {
           Limit = std::min(Limit, (1U << 8) - 1);
           break;
         }
 
         // Otherwise check the addressing mode.
-        switch (I->getDesc().TSFlags & ARMII::AddrModeMask) {
+        switch (MI.getDesc().TSFlags & ARMII::AddrModeMask) {
         case ARMII::AddrMode3:
         case ARMII::AddrModeT2_i8:
           Limit = std::min(Limit, (1U << 8) - 1);
@@ -1374,7 +1423,7 @@ ARMFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
 
   // Don't spill FP if the frame can be eliminated. This is determined
   // by scanning the callee-save registers to see if any is used.
-  const uint16_t *CSRegs = RegInfo->getCalleeSavedRegs(&MF);
+  const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF);
   for (unsigned i = 0; CSRegs[i]; ++i) {
     unsigned Reg = CSRegs[i];
     bool Spilled = false;
@@ -1486,6 +1535,10 @@ ARMFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
 
     if (hasFP(MF)) {
       MRI.setPhysRegUsed(FramePtr);
+      auto FPPos = std::find(UnspilledCS1GPRs.begin(), UnspilledCS1GPRs.end(),
+                             FramePtr);
+      if (FPPos != UnspilledCS1GPRs.end())
+        UnspilledCS1GPRs.erase(FPPos);
       NumGPRSpills++;
     }
 
@@ -1681,7 +1734,7 @@ void ARMFrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const {
   if (MF.getFunction()->isVarArg())
     report_fatal_error("Segmented stacks do not support vararg functions.");
   if (!ST->isTargetAndroid() && !ST->isTargetLinux())
-    report_fatal_error("Segmented stacks not supported on this platfrom.");
+    report_fatal_error("Segmented stacks not supported on this platform.");
 
   MachineBasicBlock &prologueMBB = MF.front();
   MachineFrameInfo *MFI = MF.getFrameInfo();
@@ -1693,6 +1746,12 @@ void ARMFrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const {
   ARMFunctionInfo *ARMFI = MF.getInfo<ARMFunctionInfo>();
   DebugLoc DL;
 
+  uint64_t StackSize = MFI->getStackSize();
+
+  // Do not generate a prologue for functions with a stack of size zero
+  if (StackSize == 0)
+    return;
+
   // Use R4 and R5 as scratch registers.
   // We save R4 and R5 before use and restore them before leaving the function.
   unsigned ScratchReg0 = ARM::R4;
@@ -1722,8 +1781,6 @@ void ARMFrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const {
   MF.push_front(PrevStackMBB);
 
   // The required stack size that is aligned to ARM constant criterion.
-  uint64_t StackSize = MFI->getStackSize();
-
   AlignedStackSize = alignToARMConstant(StackSize);
 
   // When the frame size is less than 256 we just compare the stack
diff --git a/lib/Target/ARM/ARMFrameLowering.h b/lib/Target/ARM/ARMFrameLowering.h
index 524ee36..981d320 100644
--- a/lib/Target/ARM/ARMFrameLowering.h
+++ b/lib/Target/ARM/ARMFrameLowering.h
@@ -57,7 +57,7 @@ public:
   void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
                                             RegScavenger *RS) const override;
 
-  void adjustForSegmentedStacks(MachineFunction &MF) const;
+  void adjustForSegmentedStacks(MachineFunction &MF) const override;
 
  private:
   void emitPushInst(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
diff --git a/lib/Target/ARM/ARMHazardRecognizer.cpp b/lib/Target/ARM/ARMHazardRecognizer.cpp
index 61d4e12..0885c4e 100644
--- a/lib/Target/ARM/ARMHazardRecognizer.cpp
+++ b/lib/Target/ARM/ARMHazardRecognizer.cpp
@@ -77,7 +77,7 @@ ARMHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
 }
 
 void ARMHazardRecognizer::Reset() {
-  LastMI = 0;
+  LastMI = nullptr;
   FpMLxStalls = 0;
   ScoreboardHazardRecognizer::Reset();
 }
@@ -95,7 +95,7 @@ void ARMHazardRecognizer::EmitInstruction(SUnit *SU) {
 void ARMHazardRecognizer::AdvanceCycle() {
   if (FpMLxStalls && --FpMLxStalls == 0)
     // Stalled for 4 cycles but still can't schedule any other instructions.
-    LastMI = 0;
+    LastMI = nullptr;
   ScoreboardHazardRecognizer::AdvanceCycle();
 }
 
diff --git a/lib/Target/ARM/ARMHazardRecognizer.h b/lib/Target/ARM/ARMHazardRecognizer.h
index e88cd0d..a8198e2 100644
--- a/lib/Target/ARM/ARMHazardRecognizer.h
+++ b/lib/Target/ARM/ARMHazardRecognizer.h
@@ -35,7 +35,7 @@ public:
   ARMHazardRecognizer(const InstrItineraryData *ItinData,
                       const ScheduleDAG *DAG)
     : ScoreboardHazardRecognizer(ItinData, DAG, "post-RA-sched"),
-      LastMI(0) {}
+      LastMI(nullptr) {}
 
   HazardType getHazardType(SUnit *SU, int Stalls) override;
   void Reset() override;
diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp
index 70e11c5..08d598d 100644
--- a/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -11,7 +11,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "arm-isel"
 #include "ARM.h"
 #include "ARMBaseInstrInfo.h"
 #include "ARMTargetMachine.h"
@@ -37,6 +36,8 @@
 
 using namespace llvm;
 
+#define DEBUG_TYPE "arm-isel"
+
 static cl::opt<bool>
 DisableShifterOp("disable-shifter-op", cl::Hidden,
   cl::desc("Disable isel of shifter-op"),
@@ -72,6 +73,13 @@ public:
       Subtarget(&TM.getSubtarget<ARMSubtarget>()) {
   }
 
+  bool runOnMachineFunction(MachineFunction &MF) override {
+    // Reset the subtarget each time through.
+    Subtarget = &TM.getSubtarget<ARMSubtarget>();
+    SelectionDAGISel::runOnMachineFunction(MF);
+    return true;
+  }
+
   const char *getPassName() const override {
     return "ARM Instruction Selection";
   }
@@ -397,7 +405,7 @@ void ARMDAGToDAGISel::PreprocessISelDAG() {
     N1 = CurDAG->getNode(ISD::SHL, SDLoc(N1), MVT::i32,
                          N1, CurDAG->getConstant(TZ, MVT::i32));
     CurDAG->UpdateNodeOperands(N, N0, N1);
-  }  
+  }
 }
 
 /// hasNoVMLxHazardUse - Return true if it's desirable to select a FP MLA / MLS
@@ -1440,7 +1448,7 @@ SDNode *ARMDAGToDAGISel::SelectARMIndexedLoad(SDNode *N) {
   LoadSDNode *LD = cast<LoadSDNode>(N);
   ISD::MemIndexedMode AM = LD->getAddressingMode();
   if (AM == ISD::UNINDEXED)
-    return NULL;
+    return nullptr;
 
   EVT LoadedVT = LD->getMemoryVT();
   SDValue Offset, AMOpc;
@@ -1506,14 +1514,14 @@ SDNode *ARMDAGToDAGISel::SelectARMIndexedLoad(SDNode *N) {
     }
   }
 
-  return NULL;
+  return nullptr;
 }
 
 SDNode *ARMDAGToDAGISel::SelectT2IndexedLoad(SDNode *N) {
   LoadSDNode *LD = cast<LoadSDNode>(N);
   ISD::MemIndexedMode AM = LD->getAddressingMode();
   if (AM == ISD::UNINDEXED)
-    return NULL;
+    return nullptr;
 
   EVT LoadedVT = LD->getMemoryVT();
   bool isSExtLd = LD->getExtensionType() == ISD::SEXTLOAD;
@@ -1540,7 +1548,7 @@ SDNode *ARMDAGToDAGISel::SelectT2IndexedLoad(SDNode *N) {
         Opcode = isPre ? ARM::t2LDRB_PRE : ARM::t2LDRB_POST;
       break;
     default:
-      return NULL;
+      return nullptr;
     }
     Match = true;
   }
@@ -1554,7 +1562,7 @@ SDNode *ARMDAGToDAGISel::SelectT2IndexedLoad(SDNode *N) {
                                   MVT::Other, Ops);
   }
 
-  return NULL;
+  return nullptr;
 }
 
 /// \brief Form a GPRPair pseudo register from a pair of GPR regs.
@@ -1699,10 +1707,10 @@ static bool isVSTfixed(unsigned Opc)
   case ARM::VST1d16wb_fixed : return true;
   case ARM::VST1d32wb_fixed : return true;
   case ARM::VST1d64wb_fixed : return true;
-  case ARM::VST1q8wb_fixed : return true; 
-  case ARM::VST1q16wb_fixed : return true; 
-  case ARM::VST1q32wb_fixed : return true; 
-  case ARM::VST1q64wb_fixed : return true; 
+  case ARM::VST1q8wb_fixed : return true;
+  case ARM::VST1q16wb_fixed : return true;
+  case ARM::VST1q32wb_fixed : return true;
+  case ARM::VST1q64wb_fixed : return true;
   case ARM::VST1d64TPseudoWB_fixed : return true;
   case ARM::VST1d64QPseudoWB_fixed : return true;
   case ARM::VST2d8wb_fixed : return true;
@@ -1776,7 +1784,7 @@ SDNode *ARMDAGToDAGISel::SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs,
   SDValue MemAddr, Align;
   unsigned AddrOpIdx = isUpdating ? 1 : 2;
   if (!SelectAddrMode6(N, N->getOperand(AddrOpIdx), MemAddr, Align))
-    return NULL;
+    return nullptr;
 
   SDValue Chain = N->getOperand(0);
   EVT VT = N->getValueType(0);
@@ -1895,7 +1903,7 @@ SDNode *ARMDAGToDAGISel::SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs,
   ReplaceUses(SDValue(N, NumVecs), SDValue(VLd, 1));
   if (isUpdating)
     ReplaceUses(SDValue(N, NumVecs + 1), SDValue(VLd, 2));
-  return NULL;
+  return nullptr;
 }
 
 SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs,
@@ -1909,7 +1917,7 @@ SDNode *ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs,
   unsigned AddrOpIdx = isUpdating ? 1 : 2;
   unsigned Vec0Idx = 3; // AddrOpIdx + (isUpdating ? 2 : 1)
   if (!SelectAddrMode6(N, N->getOperand(AddrOpIdx), MemAddr, Align))
-    return NULL;
+    return nullptr;
 
   MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
   MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
@@ -2055,7 +2063,7 @@ SDNode *ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad,
   unsigned AddrOpIdx = isUpdating ? 1 : 2;
   unsigned Vec0Idx = 3; // AddrOpIdx + (isUpdating ? 2 : 1)
   if (!SelectAddrMode6(N, N->getOperand(AddrOpIdx), MemAddr, Align))
-    return NULL;
+    return nullptr;
 
   MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
   MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
@@ -2160,7 +2168,7 @@ SDNode *ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad,
   ReplaceUses(SDValue(N, NumVecs), SDValue(VLdLn, 1));
   if (isUpdating)
     ReplaceUses(SDValue(N, NumVecs + 1), SDValue(VLdLn, 2));
-  return NULL;
+  return nullptr;
 }
 
 SDNode *ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool isUpdating,
@@ -2171,7 +2179,7 @@ SDNode *ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool isUpdating,
 
   SDValue MemAddr, Align;
   if (!SelectAddrMode6(N, N->getOperand(1), MemAddr, Align))
-    return NULL;
+    return nullptr;
 
   MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
   MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
@@ -2243,7 +2251,7 @@ SDNode *ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool isUpdating,
   ReplaceUses(SDValue(N, NumVecs), SDValue(VLdDup, 1));
   if (isUpdating)
     ReplaceUses(SDValue(N, NumVecs + 1), SDValue(VLdDup, 2));
-  return NULL;
+  return nullptr;
 }
 
 SDNode *ARMDAGToDAGISel::SelectVTBL(SDNode *N, bool IsExt, unsigned NumVecs,
@@ -2282,7 +2290,7 @@ SDNode *ARMDAGToDAGISel::SelectVTBL(SDNode *N, bool IsExt, unsigned NumVecs,
 SDNode *ARMDAGToDAGISel::SelectV6T2BitfieldExtractOp(SDNode *N,
                                                      bool isSigned) {
   if (!Subtarget->hasV6T2Ops())
-    return NULL;
+    return nullptr;
 
   unsigned Opc = isSigned
     ? (Subtarget->isThumb() ? ARM::t2SBFX : ARM::SBFX)
@@ -2295,7 +2303,7 @@ SDNode *ARMDAGToDAGISel::SelectV6T2BitfieldExtractOp(SDNode *N,
 
       // The immediate is a mask of the low bits iff imm & (imm+1) == 0
       if (And_imm & (And_imm + 1))
-        return NULL;
+        return nullptr;
 
       unsigned Srl_imm = 0;
       if (isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::SRL,
@@ -2315,7 +2323,7 @@ SDNode *ARMDAGToDAGISel::SelectV6T2BitfieldExtractOp(SDNode *N,
             SDValue Ops[] = { N->getOperand(0).getOperand(0),
                               CurDAG->getTargetConstant(LSB, MVT::i32),
                               getAL(CurDAG), Reg0, Reg0 };
-            return CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops, 5);
+            return CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops);
           }
 
           // ARM models shift instructions as MOVsi with shifter operand.
@@ -2325,17 +2333,17 @@ SDNode *ARMDAGToDAGISel::SelectV6T2BitfieldExtractOp(SDNode *N,
                                       MVT::i32);
           SDValue Ops[] = { N->getOperand(0).getOperand(0), ShOpc,
                             getAL(CurDAG), Reg0, Reg0 };
-          return CurDAG->SelectNodeTo(N, ARM::MOVsi, MVT::i32, Ops, 5);
+          return CurDAG->SelectNodeTo(N, ARM::MOVsi, MVT::i32, Ops);
         }
 
         SDValue Ops[] = { N->getOperand(0).getOperand(0),
                           CurDAG->getTargetConstant(LSB, MVT::i32),
                           CurDAG->getTargetConstant(Width, MVT::i32),
-          getAL(CurDAG), Reg0 };
-        return CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops, 5);
+                          getAL(CurDAG), Reg0 };
+        return CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops);
       }
     }
-    return NULL;
+    return nullptr;
   }
 
   // Otherwise, we're looking for a shift of a shift
@@ -2349,16 +2357,16 @@ SDNode *ARMDAGToDAGISel::SelectV6T2BitfieldExtractOp(SDNode *N,
       unsigned Width = 32 - Srl_imm - 1;
       int LSB = Srl_imm - Shl_imm;
       if (LSB < 0)
-        return NULL;
+        return nullptr;
       SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
       SDValue Ops[] = { N->getOperand(0).getOperand(0),
                         CurDAG->getTargetConstant(LSB, MVT::i32),
                         CurDAG->getTargetConstant(Width, MVT::i32),
                         getAL(CurDAG), Reg0 };
-      return CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops, 5);
+      return CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops);
     }
   }
-  return NULL;
+  return nullptr;
 }
 
 /// Target-specific DAG combining for ISD::XOR.
@@ -2377,10 +2385,10 @@ SDNode *ARMDAGToDAGISel::SelectABSOp(SDNode *N){
   EVT VT = N->getValueType(0);
 
   if (Subtarget->isThumb1Only())
-    return NULL;
+    return nullptr;
 
   if (XORSrc0.getOpcode() != ISD::ADD || XORSrc1.getOpcode() != ISD::SRA)
-    return NULL;
+    return nullptr;
 
   SDValue ADDSrc0 = XORSrc0.getOperand(0);
   SDValue ADDSrc1 = XORSrc0.getOperand(1);
@@ -2391,13 +2399,13 @@ SDNode *ARMDAGToDAGISel::SelectABSOp(SDNode *N){
   unsigned Size = XType.getSizeInBits() - 1;
 
   if (ADDSrc1 == XORSrc1 && ADDSrc0 == SRASrc0 &&
-      XType.isInteger() && SRAConstant != NULL &&
+      XType.isInteger() && SRAConstant != nullptr &&
       Size == SRAConstant->getZExtValue()) {
     unsigned Opcode = Subtarget->isThumb2() ? ARM::t2ABS : ARM::ABS;
     return CurDAG->SelectNodeTo(N, Opcode, VT, ADDSrc0);
   }
 
-  return NULL;
+  return nullptr;
 }
 
 SDNode *ARMDAGToDAGISel::SelectConcatVector(SDNode *N) {
@@ -2414,7 +2422,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
 
   if (N->isMachineOpcode()) {
     N->setNodeId(-1);
-    return NULL;   // Already selected.
+    return nullptr;   // Already selected.
   }
 
   switch (N->getOpcode()) {
@@ -2478,7 +2486,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
                                        Ops);
       }
       ReplaceUses(SDValue(N, 0), SDValue(ResNode, 0));
-      return NULL;
+      return nullptr;
     }
 
     // Other cases are autogenerated.
@@ -2492,14 +2500,14 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
     if (Subtarget->isThumb1Only()) {
       SDValue Ops[] = { TFI, CurDAG->getTargetConstant(0, MVT::i32),
                         getAL(CurDAG), CurDAG->getRegister(0, MVT::i32) };
-      return CurDAG->SelectNodeTo(N, ARM::tADDrSPi, MVT::i32, Ops, 4);
+      return CurDAG->SelectNodeTo(N, ARM::tADDrSPi, MVT::i32, Ops);
     } else {
       unsigned Opc = ((Subtarget->isThumb() && Subtarget->hasThumb2()) ?
                       ARM::t2ADDri : ARM::ADDri);
       SDValue Ops[] = { TFI, CurDAG->getTargetConstant(0, MVT::i32),
                         getAL(CurDAG), CurDAG->getRegister(0, MVT::i32),
                         CurDAG->getRegister(0, MVT::i32) };
-      return CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops, 5);
+      return CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops);
     }
   }
   case ISD::SRL:
@@ -2526,10 +2534,10 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
         SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
         if (Subtarget->isThumb()) {
           SDValue Ops[] = { V, V, ShImmOp, getAL(CurDAG), Reg0, Reg0 };
-          return CurDAG->SelectNodeTo(N, ARM::t2ADDrs, MVT::i32, Ops, 6);
+          return CurDAG->SelectNodeTo(N, ARM::t2ADDrs, MVT::i32, Ops);
         } else {
           SDValue Ops[] = { V, V, Reg0, ShImmOp, getAL(CurDAG), Reg0, Reg0 };
-          return CurDAG->SelectNodeTo(N, ARM::ADDrsi, MVT::i32, Ops, 7);
+          return CurDAG->SelectNodeTo(N, ARM::ADDrsi, MVT::i32, Ops);
         }
       }
       if (isPowerOf2_32(RHSV+1)) {  // 2^n-1?
@@ -2542,10 +2550,10 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
         SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
         if (Subtarget->isThumb()) {
           SDValue Ops[] = { V, V, ShImmOp, getAL(CurDAG), Reg0, Reg0 };
-          return CurDAG->SelectNodeTo(N, ARM::t2RSBrs, MVT::i32, Ops, 6);
+          return CurDAG->SelectNodeTo(N, ARM::t2RSBrs, MVT::i32, Ops);
         } else {
           SDValue Ops[] = { V, V, Reg0, ShImmOp, getAL(CurDAG), Reg0, Reg0 };
-          return CurDAG->SelectNodeTo(N, ARM::RSBrsi, MVT::i32, Ops, 7);
+          return CurDAG->SelectNodeTo(N, ARM::RSBrsi, MVT::i32, Ops);
         }
       }
     }
@@ -2660,7 +2668,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
     }
   }
   case ISD::LOAD: {
-    SDNode *ResNode = 0;
+    SDNode *ResNode = nullptr;
     if (Subtarget->isThumb() && Subtarget->hasThumb2())
       ResNode = SelectT2IndexedLoad(N);
     else
@@ -2707,13 +2715,13 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
     }
     ReplaceUses(SDValue(N, 0),
                 SDValue(Chain.getNode(), Chain.getResNo()));
-    return NULL;
+    return nullptr;
   }
   case ARMISD::VZIP: {
     unsigned Opc = 0;
     EVT VT = N->getValueType(0);
     switch (VT.getSimpleVT().SimpleTy) {
-    default: return NULL;
+    default: return nullptr;
     case MVT::v8i8:  Opc = ARM::VZIPd8; break;
     case MVT::v4i16: Opc = ARM::VZIPd16; break;
     case MVT::v2f32:
@@ -2733,7 +2741,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
     unsigned Opc = 0;
     EVT VT = N->getValueType(0);
     switch (VT.getSimpleVT().SimpleTy) {
-    default: return NULL;
+    default: return nullptr;
     case MVT::v8i8:  Opc = ARM::VUZPd8; break;
     case MVT::v4i16: Opc = ARM::VUZPd16; break;
     case MVT::v2f32:
@@ -2753,7 +2761,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
     unsigned Opc = 0;
     EVT VT = N->getValueType(0);
     switch (VT.getSimpleVT().SimpleTy) {
-    default: return NULL;
+    default: return nullptr;
     case MVT::v8i8:  Opc = ARM::VTRNd8; break;
     case MVT::v4i16: Opc = ARM::VTRNd16; break;
     case MVT::v2f32:
@@ -2834,7 +2842,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
                                          ARM::VLD1q16wb_fixed,
                                          ARM::VLD1q32wb_fixed,
                                          ARM::VLD1q64wb_fixed };
-    return SelectVLD(N, true, 1, DOpcodes, QOpcodes, 0);
+    return SelectVLD(N, true, 1, DOpcodes, QOpcodes, nullptr);
   }
 
   case ARMISD::VLD2_UPD: {
@@ -2845,7 +2853,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
     static const uint16_t QOpcodes[] = { ARM::VLD2q8PseudoWB_fixed,
                                          ARM::VLD2q16PseudoWB_fixed,
                                          ARM::VLD2q32PseudoWB_fixed };
-    return SelectVLD(N, true, 2, DOpcodes, QOpcodes, 0);
+    return SelectVLD(N, true, 2, DOpcodes, QOpcodes, nullptr);
   }
 
   case ARMISD::VLD3_UPD: {
@@ -2912,7 +2920,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
                                          ARM::VST1q16wb_fixed,
                                          ARM::VST1q32wb_fixed,
                                          ARM::VST1q64wb_fixed };
-    return SelectVST(N, true, 1, DOpcodes, QOpcodes, 0);
+    return SelectVST(N, true, 1, DOpcodes, QOpcodes, nullptr);
   }
 
   case ARMISD::VST2_UPD: {
@@ -2923,7 +2931,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
     static const uint16_t QOpcodes[] = { ARM::VST2q8PseudoWB_fixed,
                                          ARM::VST2q16PseudoWB_fixed,
                                          ARM::VST2q32PseudoWB_fixed };
-    return SelectVST(N, true, 2, DOpcodes, QOpcodes, 0);
+    return SelectVST(N, true, 2, DOpcodes, QOpcodes, nullptr);
   }
 
   case ARMISD::VST3_UPD: {
@@ -3047,7 +3055,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
         ReplaceUses(SDValue(N, 1), Result);
       }
       ReplaceUses(SDValue(N, 2), OutChain);
-      return NULL;
+      return nullptr;
     }
     case Intrinsic::arm_stlexd:
     case Intrinsic::arm_strexd: {
@@ -3093,7 +3101,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
                                            ARM::VLD1d32, ARM::VLD1d64 };
       static const uint16_t QOpcodes[] = { ARM::VLD1q8, ARM::VLD1q16,
                                            ARM::VLD1q32, ARM::VLD1q64};
-      return SelectVLD(N, false, 1, DOpcodes, QOpcodes, 0);
+      return SelectVLD(N, false, 1, DOpcodes, QOpcodes, nullptr);
     }
 
     case Intrinsic::arm_neon_vld2: {
@@ -3101,7 +3109,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
                                            ARM::VLD2d32, ARM::VLD1q64 };
       static const uint16_t QOpcodes[] = { ARM::VLD2q8Pseudo, ARM::VLD2q16Pseudo,
                                            ARM::VLD2q32Pseudo };
-      return SelectVLD(N, false, 2, DOpcodes, QOpcodes, 0);
+      return SelectVLD(N, false, 2, DOpcodes, QOpcodes, nullptr);
     }
 
     case Intrinsic::arm_neon_vld3: {
@@ -3164,7 +3172,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
                                            ARM::VST1d32, ARM::VST1d64 };
       static const uint16_t QOpcodes[] = { ARM::VST1q8, ARM::VST1q16,
                                            ARM::VST1q32, ARM::VST1q64 };
-      return SelectVST(N, false, 1, DOpcodes, QOpcodes, 0);
+      return SelectVST(N, false, 1, DOpcodes, QOpcodes, nullptr);
     }
 
     case Intrinsic::arm_neon_vst2: {
@@ -3172,7 +3180,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
                                            ARM::VST2d32, ARM::VST1q64 };
       static uint16_t QOpcodes[] = { ARM::VST2q8Pseudo, ARM::VST2q16Pseudo,
                                      ARM::VST2q32Pseudo };
-      return SelectVST(N, false, 2, DOpcodes, QOpcodes, 0);
+      return SelectVST(N, false, 2, DOpcodes, QOpcodes, nullptr);
     }
 
     case Intrinsic::arm_neon_vst3: {
@@ -3306,7 +3314,8 @@ SDNode *ARMDAGToDAGISel::SelectInlineAsm(SDNode *N){
   // them into a GPRPair.
 
   SDLoc dl(N);
-  SDValue Glue = N->getGluedNode() ? N->getOperand(NumOps-1) : SDValue(0,0);
+  SDValue Glue = N->getGluedNode() ? N->getOperand(NumOps-1)
+                                   : SDValue(nullptr,0);
 
   SmallVector<bool, 8> OpChanged;
   // Glue node will be appended late.
@@ -3388,7 +3397,7 @@ SDNode *ARMDAGToDAGISel::SelectInlineAsm(SDNode *N){
       // Update the original glue user.
       std::vector<SDValue> Ops(GU->op_begin(), GU->op_end()-1);
       Ops.push_back(T1.getValue(1));
-      CurDAG->UpdateNodeOperands(GU, &Ops[0], Ops.size());
+      CurDAG->UpdateNodeOperands(GU, Ops);
       GU = T1.getNode();
     }
     else {
@@ -3435,11 +3444,10 @@ SDNode *ARMDAGToDAGISel::SelectInlineAsm(SDNode *N){
   if (Glue.getNode())
     AsmNodeOperands.push_back(Glue);
   if (!Changed)
-    return NULL;
+    return nullptr;
 
   SDValue New = CurDAG->getNode(ISD::INLINEASM, SDLoc(N),
-      CurDAG->getVTList(MVT::Other, MVT::Glue), &AsmNodeOperands[0],
-                        AsmNodeOperands.size());
+      CurDAG->getVTList(MVT::Other, MVT::Glue), AsmNodeOperands);
   New->setNodeId(-1);
   return New.getNode();
 }
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index 2ebad8e..00d07e8 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -12,7 +12,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "arm-isel"
 #include "ARMISelLowering.h"
 #include "ARMCallingConv.h"
 #include "ARMConstantPoolValue.h"
@@ -37,18 +36,22 @@
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/Type.h"
 #include "llvm/MC/MCSectionMachO.h"
 #include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Target/TargetOptions.h"
 #include <utility>
 using namespace llvm;
 
+#define DEBUG_TYPE "arm-isel"
+
 STATISTIC(NumTailCalls, "Number of tail calls");
 STATISTIC(NumMovwMovt, "Number of GAs materialized with movw + movt");
 STATISTIC(NumLoopByVals, "Number of loops generated for byval arguments");
@@ -79,7 +82,7 @@ namespace {
 }
 
 // The APCS parameter registers.
-static const uint16_t GPRArgRegs[] = {
+static const MCPhysReg GPRArgRegs[] = {
   ARM::R0, ARM::R1, ARM::R2, ARM::R3
 };
 
@@ -155,7 +158,8 @@ void ARMTargetLowering::addQRTypeForNEON(MVT VT) {
 static TargetLoweringObjectFile *createTLOF(TargetMachine &TM) {
   if (TM.getSubtarget<ARMSubtarget>().isTargetMachO())
     return new TargetLoweringObjectFileMachO();
-
+  if (TM.getSubtarget<ARMSubtarget>().isTargetWindows())
+    return new TargetLoweringObjectFileCOFF();
   return new ARMElfTargetObjectFile();
 }
 
@@ -170,7 +174,7 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
   if (Subtarget->isTargetMachO()) {
     // Uses VFP for Thumb libfuncs if available.
     if (Subtarget->isThumb() && Subtarget->hasVFP2() &&
-        Subtarget->hasARMOps()) {
+        Subtarget->hasARMOps() && !TM.Options.UseSoftFloat) {
       // Single-precision floating-point arithmetic.
       setLibcallName(RTLIB::ADD_F32, "__addsf3vfp");
       setLibcallName(RTLIB::SUB_F32, "__subsf3vfp");
@@ -246,173 +250,134 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
   }
 
   // These libcalls are not available in 32-bit.
-  setLibcallName(RTLIB::SHL_I128, 0);
-  setLibcallName(RTLIB::SRL_I128, 0);
-  setLibcallName(RTLIB::SRA_I128, 0);
+  setLibcallName(RTLIB::SHL_I128, nullptr);
+  setLibcallName(RTLIB::SRL_I128, nullptr);
+  setLibcallName(RTLIB::SRA_I128, nullptr);
 
   if (Subtarget->isAAPCS_ABI() && !Subtarget->isTargetMachO() &&
       !Subtarget->isTargetWindows()) {
-    // Double-precision floating-point arithmetic helper functions
-    // RTABI chapter 4.1.2, Table 2
-    setLibcallName(RTLIB::ADD_F64, "__aeabi_dadd");
-    setLibcallName(RTLIB::DIV_F64, "__aeabi_ddiv");
-    setLibcallName(RTLIB::MUL_F64, "__aeabi_dmul");
-    setLibcallName(RTLIB::SUB_F64, "__aeabi_dsub");
-    setLibcallCallingConv(RTLIB::ADD_F64, CallingConv::ARM_AAPCS);
-    setLibcallCallingConv(RTLIB::DIV_F64, CallingConv::ARM_AAPCS);
-    setLibcallCallingConv(RTLIB::MUL_F64, CallingConv::ARM_AAPCS);
-    setLibcallCallingConv(RTLIB::SUB_F64, CallingConv::ARM_AAPCS);
-
-    // Double-precision floating-point comparison helper functions
-    // RTABI chapter 4.1.2, Table 3
-    setLibcallName(RTLIB::OEQ_F64, "__aeabi_dcmpeq");
-    setCmpLibcallCC(RTLIB::OEQ_F64, ISD::SETNE);
-    setLibcallName(RTLIB::UNE_F64, "__aeabi_dcmpeq");
-    setCmpLibcallCC(RTLIB::UNE_F64, ISD::SETEQ);
-    setLibcallName(RTLIB::OLT_F64, "__aeabi_dcmplt");
-    setCmpLibcallCC(RTLIB::OLT_F64, ISD::SETNE);
-    setLibcallName(RTLIB::OLE_F64, "__aeabi_dcmple");
-    setCmpLibcallCC(RTLIB::OLE_F64, ISD::SETNE);
-    setLibcallName(RTLIB::OGE_F64, "__aeabi_dcmpge");
-    setCmpLibcallCC(RTLIB::OGE_F64, ISD::SETNE);
-    setLibcallName(RTLIB::OGT_F64, "__aeabi_dcmpgt");
-    setCmpLibcallCC(RTLIB::OGT_F64, ISD::SETNE);
-    setLibcallName(RTLIB::UO_F64,  "__aeabi_dcmpun");
-    setCmpLibcallCC(RTLIB::UO_F64,  ISD::SETNE);
-    setLibcallName(RTLIB::O_F64,   "__aeabi_dcmpun");
-    setCmpLibcallCC(RTLIB::O_F64,   ISD::SETEQ);
-    setLibcallCallingConv(RTLIB::OEQ_F64, CallingConv::ARM_AAPCS);
-    setLibcallCallingConv(RTLIB::UNE_F64, CallingConv::ARM_AAPCS);
-    setLibcallCallingConv(RTLIB::OLT_F64, CallingConv::ARM_AAPCS);
-    setLibcallCallingConv(RTLIB::OLE_F64, CallingConv::ARM_AAPCS);
-    setLibcallCallingConv(RTLIB::OGE_F64, CallingConv::ARM_AAPCS);
-    setLibcallCallingConv(RTLIB::OGT_F64, CallingConv::ARM_AAPCS);
-    setLibcallCallingConv(RTLIB::UO_F64, CallingConv::ARM_AAPCS);
-    setLibcallCallingConv(RTLIB::O_F64, CallingConv::ARM_AAPCS);
-
-    // Single-precision floating-point arithmetic helper functions
-    // RTABI chapter 4.1.2, Table 4
-    setLibcallName(RTLIB::ADD_F32, "__aeabi_fadd");
-    setLibcallName(RTLIB::DIV_F32, "__aeabi_fdiv");
-    setLibcallName(RTLIB::MUL_F32, "__aeabi_fmul");
-    setLibcallName(RTLIB::SUB_F32, "__aeabi_fsub");
-    setLibcallCallingConv(RTLIB::ADD_F32, CallingConv::ARM_AAPCS);
-    setLibcallCallingConv(RTLIB::DIV_F32, CallingConv::ARM_AAPCS);
-    setLibcallCallingConv(RTLIB::MUL_F32, CallingConv::ARM_AAPCS);
-    setLibcallCallingConv(RTLIB::SUB_F32, CallingConv::ARM_AAPCS);
-
-    // Single-precision floating-point comparison helper functions
-    // RTABI chapter 4.1.2, Table 5
-    setLibcallName(RTLIB::OEQ_F32, "__aeabi_fcmpeq");
-    setCmpLibcallCC(RTLIB::OEQ_F32, ISD::SETNE);
-    setLibcallName(RTLIB::UNE_F32, "__aeabi_fcmpeq");
-    setCmpLibcallCC(RTLIB::UNE_F32, ISD::SETEQ);
-    setLibcallName(RTLIB::OLT_F32, "__aeabi_fcmplt");
-    setCmpLibcallCC(RTLIB::OLT_F32, ISD::SETNE);
-    setLibcallName(RTLIB::OLE_F32, "__aeabi_fcmple");
-    setCmpLibcallCC(RTLIB::OLE_F32, ISD::SETNE);
-    setLibcallName(RTLIB::OGE_F32, "__aeabi_fcmpge");
-    setCmpLibcallCC(RTLIB::OGE_F32, ISD::SETNE);
-    setLibcallName(RTLIB::OGT_F32, "__aeabi_fcmpgt");
-    setCmpLibcallCC(RTLIB::OGT_F32, ISD::SETNE);
-    setLibcallName(RTLIB::UO_F32,  "__aeabi_fcmpun");
-    setCmpLibcallCC(RTLIB::UO_F32,  ISD::SETNE);
-    setLibcallName(RTLIB::O_F32,   "__aeabi_fcmpun");
-    setCmpLibcallCC(RTLIB::O_F32,   ISD::SETEQ);
-    setLibcallCallingConv(RTLIB::OEQ_F32, CallingConv::ARM_AAPCS);
-    setLibcallCallingConv(RTLIB::UNE_F32, CallingConv::ARM_AAPCS);
-    setLibcallCallingConv(RTLIB::OLT_F32, CallingConv::ARM_AAPCS);
-    setLibcallCallingConv(RTLIB::OLE_F32, CallingConv::ARM_AAPCS);
-    setLibcallCallingConv(RTLIB::OGE_F32, CallingConv::ARM_AAPCS);
-    setLibcallCallingConv(RTLIB::OGT_F32, CallingConv::ARM_AAPCS);
-    setLibcallCallingConv(RTLIB::UO_F32, CallingConv::ARM_AAPCS);
-    setLibcallCallingConv(RTLIB::O_F32, CallingConv::ARM_AAPCS);
-
-    // Floating-point to integer conversions.
-    // RTABI chapter 4.1.2, Table 6
-    setLibcallName(RTLIB::FPTOSINT_F64_I32, "__aeabi_d2iz");
-    setLibcallName(RTLIB::FPTOUINT_F64_I32, "__aeabi_d2uiz");
-    setLibcallName(RTLIB::FPTOSINT_F64_I64, "__aeabi_d2lz");
-    setLibcallName(RTLIB::FPTOUINT_F64_I64, "__aeabi_d2ulz");
-    setLibcallName(RTLIB::FPTOSINT_F32_I32, "__aeabi_f2iz");
-    setLibcallName(RTLIB::FPTOUINT_F32_I32, "__aeabi_f2uiz");
-    setLibcallName(RTLIB::FPTOSINT_F32_I64, "__aeabi_f2lz");
-    setLibcallName(RTLIB::FPTOUINT_F32_I64, "__aeabi_f2ulz");
-    setLibcallCallingConv(RTLIB::FPTOSINT_F64_I32, CallingConv::ARM_AAPCS);
-    setLibcallCallingConv(RTLIB::FPTOUINT_F64_I32, CallingConv::ARM_AAPCS);
-    setLibcallCallingConv(RTLIB::FPTOSINT_F64_I64, CallingConv::ARM_AAPCS);
-    setLibcallCallingConv(RTLIB::FPTOUINT_F64_I64, CallingConv::ARM_AAPCS);
-    setLibcallCallingConv(RTLIB::FPTOSINT_F32_I32, CallingConv::ARM_AAPCS);
-    setLibcallCallingConv(RTLIB::FPTOUINT_F32_I32, CallingConv::ARM_AAPCS);
-    setLibcallCallingConv(RTLIB::FPTOSINT_F32_I64, CallingConv::ARM_AAPCS);
-    setLibcallCallingConv(RTLIB::FPTOUINT_F32_I64, CallingConv::ARM_AAPCS);
-
-    // Conversions between floating types.
-    // RTABI chapter 4.1.2, Table 7
-    setLibcallName(RTLIB::FPROUND_F64_F32, "__aeabi_d2f");
-    setLibcallName(RTLIB::FPEXT_F32_F64,   "__aeabi_f2d");
-    setLibcallCallingConv(RTLIB::FPROUND_F64_F32, CallingConv::ARM_AAPCS);
-    setLibcallCallingConv(RTLIB::FPEXT_F32_F64, CallingConv::ARM_AAPCS);
-
-    // Integer to floating-point conversions.
-    // RTABI chapter 4.1.2, Table 8
-    setLibcallName(RTLIB::SINTTOFP_I32_F64, "__aeabi_i2d");
-    setLibcallName(RTLIB::UINTTOFP_I32_F64, "__aeabi_ui2d");
-    setLibcallName(RTLIB::SINTTOFP_I64_F64, "__aeabi_l2d");
-    setLibcallName(RTLIB::UINTTOFP_I64_F64, "__aeabi_ul2d");
-    setLibcallName(RTLIB::SINTTOFP_I32_F32, "__aeabi_i2f");
-    setLibcallName(RTLIB::UINTTOFP_I32_F32, "__aeabi_ui2f");
-    setLibcallName(RTLIB::SINTTOFP_I64_F32, "__aeabi_l2f");
-    setLibcallName(RTLIB::UINTTOFP_I64_F32, "__aeabi_ul2f");
-    setLibcallCallingConv(RTLIB::SINTTOFP_I32_F64, CallingConv::ARM_AAPCS);
-    setLibcallCallingConv(RTLIB::UINTTOFP_I32_F64, CallingConv::ARM_AAPCS);
-    setLibcallCallingConv(RTLIB::SINTTOFP_I64_F64, CallingConv::ARM_AAPCS);
-    setLibcallCallingConv(RTLIB::UINTTOFP_I64_F64, CallingConv::ARM_AAPCS);
-    setLibcallCallingConv(RTLIB::SINTTOFP_I32_F32, CallingConv::ARM_AAPCS);
-    setLibcallCallingConv(RTLIB::UINTTOFP_I32_F32, CallingConv::ARM_AAPCS);
-    setLibcallCallingConv(RTLIB::SINTTOFP_I64_F32, CallingConv::ARM_AAPCS);
-    setLibcallCallingConv(RTLIB::UINTTOFP_I64_F32, CallingConv::ARM_AAPCS);
-
-    // Long long helper functions
-    // RTABI chapter 4.2, Table 9
-    setLibcallName(RTLIB::MUL_I64,  "__aeabi_lmul");
-    setLibcallName(RTLIB::SHL_I64, "__aeabi_llsl");
-    setLibcallName(RTLIB::SRL_I64, "__aeabi_llsr");
-    setLibcallName(RTLIB::SRA_I64, "__aeabi_lasr");
-    setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::ARM_AAPCS);
-    setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::ARM_AAPCS);
-    setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::ARM_AAPCS);
-    setLibcallCallingConv(RTLIB::SHL_I64, CallingConv::ARM_AAPCS);
-    setLibcallCallingConv(RTLIB::SRL_I64, CallingConv::ARM_AAPCS);
-    setLibcallCallingConv(RTLIB::SRA_I64, CallingConv::ARM_AAPCS);
-
-    // Integer division functions
-    // RTABI chapter 4.3.1
-    setLibcallName(RTLIB::SDIV_I8,  "__aeabi_idiv");
-    setLibcallName(RTLIB::SDIV_I16, "__aeabi_idiv");
-    setLibcallName(RTLIB::SDIV_I32, "__aeabi_idiv");
-    setLibcallName(RTLIB::SDIV_I64, "__aeabi_ldivmod");
-    setLibcallName(RTLIB::UDIV_I8,  "__aeabi_uidiv");
-    setLibcallName(RTLIB::UDIV_I16, "__aeabi_uidiv");
-    setLibcallName(RTLIB::UDIV_I32, "__aeabi_uidiv");
-    setLibcallName(RTLIB::UDIV_I64, "__aeabi_uldivmod");
-    setLibcallCallingConv(RTLIB::SDIV_I8, CallingConv::ARM_AAPCS);
-    setLibcallCallingConv(RTLIB::SDIV_I16, CallingConv::ARM_AAPCS);
-    setLibcallCallingConv(RTLIB::SDIV_I32, CallingConv::ARM_AAPCS);
-    setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::ARM_AAPCS);
-    setLibcallCallingConv(RTLIB::UDIV_I8, CallingConv::ARM_AAPCS);
-    setLibcallCallingConv(RTLIB::UDIV_I16, CallingConv::ARM_AAPCS);
-    setLibcallCallingConv(RTLIB::UDIV_I32, CallingConv::ARM_AAPCS);
-    setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::ARM_AAPCS);
-
-    // Memory operations
-    // RTABI chapter 4.3.4
-    setLibcallName(RTLIB::MEMCPY,  "__aeabi_memcpy");
-    setLibcallName(RTLIB::MEMMOVE, "__aeabi_memmove");
-    setLibcallName(RTLIB::MEMSET,  "__aeabi_memset");
-    setLibcallCallingConv(RTLIB::MEMCPY, CallingConv::ARM_AAPCS);
-    setLibcallCallingConv(RTLIB::MEMMOVE, CallingConv::ARM_AAPCS);
-    setLibcallCallingConv(RTLIB::MEMSET, CallingConv::ARM_AAPCS);
+    static const struct {
+      const RTLIB::Libcall Op;
+      const char * const Name;
+      const CallingConv::ID CC;
+      const ISD::CondCode Cond;
+    } LibraryCalls[] = {
+      // Double-precision floating-point arithmetic helper functions
+      // RTABI chapter 4.1.2, Table 2
+      { RTLIB::ADD_F64, "__aeabi_dadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::DIV_F64, "__aeabi_ddiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::MUL_F64, "__aeabi_dmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::SUB_F64, "__aeabi_dsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+
+      // Double-precision floating-point comparison helper functions
+      // RTABI chapter 4.1.2, Table 3
+      { RTLIB::OEQ_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE },
+      { RTLIB::UNE_F64, "__aeabi_dcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ },
+      { RTLIB::OLT_F64, "__aeabi_dcmplt", CallingConv::ARM_AAPCS, ISD::SETNE },
+      { RTLIB::OLE_F64, "__aeabi_dcmple", CallingConv::ARM_AAPCS, ISD::SETNE },
+      { RTLIB::OGE_F64, "__aeabi_dcmpge", CallingConv::ARM_AAPCS, ISD::SETNE },
+      { RTLIB::OGT_F64, "__aeabi_dcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE },
+      { RTLIB::UO_F64,  "__aeabi_dcmpun", CallingConv::ARM_AAPCS, ISD::SETNE },
+      { RTLIB::O_F64,   "__aeabi_dcmpun", CallingConv::ARM_AAPCS, ISD::SETEQ },
+
+      // Single-precision floating-point arithmetic helper functions
+      // RTABI chapter 4.1.2, Table 4
+      { RTLIB::ADD_F32, "__aeabi_fadd", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::DIV_F32, "__aeabi_fdiv", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::MUL_F32, "__aeabi_fmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::SUB_F32, "__aeabi_fsub", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+
+      // Single-precision floating-point comparison helper functions
+      // RTABI chapter 4.1.2, Table 5
+      { RTLIB::OEQ_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETNE },
+      { RTLIB::UNE_F32, "__aeabi_fcmpeq", CallingConv::ARM_AAPCS, ISD::SETEQ },
+      { RTLIB::OLT_F32, "__aeabi_fcmplt", CallingConv::ARM_AAPCS, ISD::SETNE },
+      { RTLIB::OLE_F32, "__aeabi_fcmple", CallingConv::ARM_AAPCS, ISD::SETNE },
+      { RTLIB::OGE_F32, "__aeabi_fcmpge", CallingConv::ARM_AAPCS, ISD::SETNE },
+      { RTLIB::OGT_F32, "__aeabi_fcmpgt", CallingConv::ARM_AAPCS, ISD::SETNE },
+      { RTLIB::UO_F32,  "__aeabi_fcmpun", CallingConv::ARM_AAPCS, ISD::SETNE },
+      { RTLIB::O_F32,   "__aeabi_fcmpun", CallingConv::ARM_AAPCS, ISD::SETEQ },
+
+      // Floating-point to integer conversions.
+      // RTABI chapter 4.1.2, Table 6
+      { RTLIB::FPTOSINT_F64_I32, "__aeabi_d2iz",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::FPTOUINT_F64_I32, "__aeabi_d2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::FPTOSINT_F64_I64, "__aeabi_d2lz",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::FPTOUINT_F64_I64, "__aeabi_d2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::FPTOSINT_F32_I32, "__aeabi_f2iz",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::FPTOUINT_F32_I32, "__aeabi_f2uiz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::FPTOSINT_F32_I64, "__aeabi_f2lz",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::FPTOUINT_F32_I64, "__aeabi_f2ulz", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+
+      // Conversions between floating types.
+      // RTABI chapter 4.1.2, Table 7
+      { RTLIB::FPROUND_F64_F32, "__aeabi_d2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::FPEXT_F32_F64,   "__aeabi_f2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+
+      // Integer to floating-point conversions.
+      // RTABI chapter 4.1.2, Table 8
+      { RTLIB::SINTTOFP_I32_F64, "__aeabi_i2d",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::UINTTOFP_I32_F64, "__aeabi_ui2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::SINTTOFP_I64_F64, "__aeabi_l2d",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::UINTTOFP_I64_F64, "__aeabi_ul2d", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::SINTTOFP_I32_F32, "__aeabi_i2f",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::UINTTOFP_I32_F32, "__aeabi_ui2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::SINTTOFP_I64_F32, "__aeabi_l2f",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::UINTTOFP_I64_F32, "__aeabi_ul2f", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+
+      // Long long helper functions
+      // RTABI chapter 4.2, Table 9
+      { RTLIB::MUL_I64, "__aeabi_lmul", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::SHL_I64, "__aeabi_llsl", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::SRL_I64, "__aeabi_llsr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::SRA_I64, "__aeabi_lasr", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+
+      // Integer division functions
+      // RTABI chapter 4.3.1
+      { RTLIB::SDIV_I8,  "__aeabi_idiv",     CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::SDIV_I16, "__aeabi_idiv",     CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::SDIV_I32, "__aeabi_idiv",     CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::SDIV_I64, "__aeabi_ldivmod",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::UDIV_I8,  "__aeabi_uidiv",    CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::UDIV_I16, "__aeabi_uidiv",    CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::UDIV_I32, "__aeabi_uidiv",    CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::UDIV_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+
+      // Memory operations
+      // RTABI chapter 4.3.4
+      { RTLIB::MEMCPY,  "__aeabi_memcpy",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::MEMMOVE, "__aeabi_memmove", CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+      { RTLIB::MEMSET,  "__aeabi_memset",  CallingConv::ARM_AAPCS, ISD::SETCC_INVALID },
+    };
+
+    for (const auto &LC : LibraryCalls) {
+      setLibcallName(LC.Op, LC.Name);
+      setLibcallCallingConv(LC.Op, LC.CC);
+      if (LC.Cond != ISD::SETCC_INVALID)
+        setCmpLibcallCC(LC.Op, LC.Cond);
+    }
+  }
+
+  if (Subtarget->isTargetWindows()) {
+    static const struct {
+      const RTLIB::Libcall Op;
+      const char * const Name;
+      const CallingConv::ID CC;
+    } LibraryCalls[] = {
+      { RTLIB::FPTOSINT_F32_I64, "__stoi64", CallingConv::ARM_AAPCS_VFP },
+      { RTLIB::FPTOSINT_F64_I64, "__dtoi64", CallingConv::ARM_AAPCS_VFP },
+      { RTLIB::FPTOUINT_F32_I64, "__stou64", CallingConv::ARM_AAPCS_VFP },
+      { RTLIB::FPTOUINT_F64_I64, "__dtou64", CallingConv::ARM_AAPCS_VFP },
+      { RTLIB::SINTTOFP_I64_F32, "__i64tos", CallingConv::ARM_AAPCS_VFP },
+      { RTLIB::SINTTOFP_I64_F64, "__i64tod", CallingConv::ARM_AAPCS_VFP },
+      { RTLIB::UINTTOFP_I64_F32, "__u64tos", CallingConv::ARM_AAPCS_VFP },
+      { RTLIB::UINTTOFP_I64_F64, "__u64tod", CallingConv::ARM_AAPCS_VFP },
+    };
+
+    for (const auto &LC : LibraryCalls) {
+      setLibcallName(LC.Op, LC.Name);
+      setLibcallCallingConv(LC.Op, LC.CC);
+    }
   }
 
   // Use divmod compiler-rt calls for iOS 5.0 and later.
@@ -444,6 +409,13 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
     setLoadExtAction(ISD::SEXTLOAD, (MVT::SimpleValueType)VT, Expand);
     setLoadExtAction(ISD::ZEXTLOAD, (MVT::SimpleValueType)VT, Expand);
     setLoadExtAction(ISD::EXTLOAD, (MVT::SimpleValueType)VT, Expand);
+
+    setOperationAction(ISD::MULHS, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::SMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::MULHU, (MVT::SimpleValueType)VT, Expand);
+    setOperationAction(ISD::UMUL_LOHI, (MVT::SimpleValueType)VT, Expand);
+
+    setOperationAction(ISD::BSWAP, (MVT::SimpleValueType)VT, Expand);
   }
 
   setOperationAction(ISD::ConstantFP, MVT::f32, Custom);
@@ -631,6 +603,11 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
     }
   }
 
+  setOperationAction(ISD::SADDO, MVT::i32, Custom);
+  setOperationAction(ISD::UADDO, MVT::i32, Custom);
+  setOperationAction(ISD::SSUBO, MVT::i32, Custom);
+  setOperationAction(ISD::USUBO, MVT::i32, Custom);
+
   // i64 operation support.
   setOperationAction(ISD::MUL,     MVT::i64, Expand);
   setOperationAction(ISD::MULHU,   MVT::i32, Expand);
@@ -850,7 +827,7 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
       setOperationAction(ISD::FP32_TO_FP16, MVT::i32, Expand);
     }
   }
-      
+
   // Combine sin / cos into one node or libcall if possible.
   if (Subtarget->hasSinCos()) {
     setLibcallName(RTLIB::SINCOS_F32, "sincosf");
@@ -913,7 +890,7 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
 // and extractions.
 std::pair<const TargetRegisterClass*, uint8_t>
 ARMTargetLowering::findRepresentativeClass(MVT VT) const{
-  const TargetRegisterClass *RRC = 0;
+  const TargetRegisterClass *RRC = nullptr;
   uint8_t Cost = 1;
   switch (VT.SimpleTy) {
   default:
@@ -950,7 +927,7 @@ ARMTargetLowering::findRepresentativeClass(MVT VT) const{
 
 const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
   switch (Opcode) {
-  default: return 0;
+  default: return nullptr;
   case ARMISD::Wrapper:       return "ARMISD::Wrapper";
   case ARMISD::WrapperPIC:    return "ARMISD::WrapperPIC";
   case ARMISD::WrapperJT:     return "ARMISD::WrapperJT";
@@ -1204,40 +1181,58 @@ static void FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
 
 #include "ARMGenCallingConv.inc"
 
-/// CCAssignFnForNode - Selects the correct CCAssignFn for a the
-/// given CallingConvention value.
-CCAssignFn *ARMTargetLowering::CCAssignFnForNode(CallingConv::ID CC,
-                                                 bool Return,
-                                                 bool isVarArg) const {
+/// getEffectiveCallingConv - Get the effective calling convention, taking into
+/// account presence of floating point hardware and calling convention
+/// limitations, such as support for variadic functions.
+CallingConv::ID
+ARMTargetLowering::getEffectiveCallingConv(CallingConv::ID CC,
+                                           bool isVarArg) const {
   switch (CC) {
   default:
     llvm_unreachable("Unsupported calling convention");
-  case CallingConv::Fast:
-    if (Subtarget->hasVFP2() && !isVarArg) {
-      if (!Subtarget->isAAPCS_ABI())
-        return (Return ? RetFastCC_ARM_APCS : FastCC_ARM_APCS);
-      // For AAPCS ABI targets, just use VFP variant of the calling convention.
-      return (Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP);
-    }
-    // Fallthrough
-  case CallingConv::C: {
-    // Use target triple & subtarget features to do actual dispatch.
+  case CallingConv::ARM_AAPCS:
+  case CallingConv::ARM_APCS:
+  case CallingConv::GHC:
+    return CC;
+  case CallingConv::ARM_AAPCS_VFP:
+    return isVarArg ? CallingConv::ARM_AAPCS : CallingConv::ARM_AAPCS_VFP;
+  case CallingConv::C:
     if (!Subtarget->isAAPCS_ABI())
-      return (Return ? RetCC_ARM_APCS : CC_ARM_APCS);
+      return CallingConv::ARM_APCS;
     else if (Subtarget->hasVFP2() &&
              getTargetMachine().Options.FloatABIType == FloatABI::Hard &&
              !isVarArg)
-      return (Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP);
-    return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
+      return CallingConv::ARM_AAPCS_VFP;
+    else
+      return CallingConv::ARM_AAPCS;
+  case CallingConv::Fast:
+    if (!Subtarget->isAAPCS_ABI()) {
+      if (Subtarget->hasVFP2() && !isVarArg)
+        return CallingConv::Fast;
+      return CallingConv::ARM_APCS;
+    } else if (Subtarget->hasVFP2() && !isVarArg)
+      return CallingConv::ARM_AAPCS_VFP;
+    else
+      return CallingConv::ARM_AAPCS;
   }
-  case CallingConv::ARM_AAPCS_VFP:
-    if (!isVarArg)
-      return (Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP);
-    // Fallthrough
-  case CallingConv::ARM_AAPCS:
-    return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
+}
+
+/// CCAssignFnForNode - Selects the correct CCAssignFn for the given
+/// CallingConvention.
+CCAssignFn *ARMTargetLowering::CCAssignFnForNode(CallingConv::ID CC,
+                                                 bool Return,
+                                                 bool isVarArg) const {
+  switch (getEffectiveCallingConv(CC, isVarArg)) {
+  default:
+    llvm_unreachable("Unsupported calling convention");
   case CallingConv::ARM_APCS:
     return (Return ? RetCC_ARM_APCS : CC_ARM_APCS);
+  case CallingConv::ARM_AAPCS:
+    return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
+  case CallingConv::ARM_AAPCS_VFP:
+    return (Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP);
+  case CallingConv::Fast:
+    return (Return ? RetFastCC_ARM_APCS : FastCC_ARM_APCS);
   case CallingConv::GHC:
     return (Return ? RetCC_ARM_APCS : CC_ARM_APCS_GHC);
   }
@@ -1286,6 +1281,8 @@ ARMTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
                                       InFlag);
       Chain = Hi.getValue(1);
       InFlag = Hi.getValue(2);
+      if (!Subtarget->isLittle())
+        std::swap (Lo, Hi);
       Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
 
       if (VA.getLocVT() == MVT::v2f64) {
@@ -1301,6 +1298,8 @@ ARMTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
         Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InFlag);
         Chain = Hi.getValue(1);
         InFlag = Hi.getValue(2);
+        if (!Subtarget->isLittle())
+          std::swap (Lo, Hi);
         Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
         Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
                           DAG.getConstant(1, MVT::i32));
@@ -1351,16 +1350,17 @@ void ARMTargetLowering::PassF64ArgInRegs(SDLoc dl, SelectionDAG &DAG,
 
   SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
                               DAG.getVTList(MVT::i32, MVT::i32), Arg);
-  RegsToPass.push_back(std::make_pair(VA.getLocReg(), fmrrd));
+  unsigned id = Subtarget->isLittle() ? 0 : 1;
+  RegsToPass.push_back(std::make_pair(VA.getLocReg(), fmrrd.getValue(id)));
 
   if (NextVA.isRegLoc())
-    RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), fmrrd.getValue(1)));
+    RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), fmrrd.getValue(1-id)));
   else {
     assert(NextVA.isMemLoc());
-    if (StackPtr.getNode() == 0)
+    if (!StackPtr.getNode())
       StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy());
 
-    MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, fmrrd.getValue(1),
+    MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, fmrrd.getValue(1-id),
                                            dl, DAG, NextVA,
                                            Flags));
   }
@@ -1398,6 +1398,9 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
     isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
                     isVarArg, isStructRet, MF.getFunction()->hasStructRetAttr(),
                                                    Outs, OutVals, Ins, DAG);
+    if (!isTailCall && CLI.CS && CLI.CS->isMustTailCall())
+      report_fatal_error("failed to perform tail call elimination on a call "
+                         "site marked musttail");
     // We don't support GuaranteedTailCallOpt for ARM, only automatically
     // detected sibcalls.
     if (isTailCall) {
@@ -1542,7 +1545,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
         SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
         SDValue Ops[] = { Chain, Dst, Src, SizeNode, AlignNode};
         MemOpChains.push_back(DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs,
-                                          Ops, array_lengthof(Ops)));
+                                          Ops));
       }
     } else if (!isSibCall) {
       assert(VA.isMemLoc());
@@ -1553,8 +1556,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   }
 
   if (!MemOpChains.empty())
-    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
-                        &MemOpChains[0], MemOpChains.size());
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
 
   // Build a sequence of copy-to-reg nodes chained together with token chain
   // and flag operands which copy the outgoing args into the appropriate regs.
@@ -1741,10 +1743,10 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
 
   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
   if (isTailCall)
-    return DAG.getNode(ARMISD::TC_RETURN, dl, NodeTys, &Ops[0], Ops.size());
+    return DAG.getNode(ARMISD::TC_RETURN, dl, NodeTys, Ops);
 
   // Returns a chain and a flag for retval copy to use.
-  Chain = DAG.getNode(CallOpc, dl, NodeTys, &Ops[0], Ops.size());
+  Chain = DAG.getNode(CallOpc, dl, NodeTys, Ops);
   InFlag = Chain.getValue(1);
 
   Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
@@ -2049,8 +2051,7 @@ static SDValue LowerInterruptReturn(SmallVectorImpl<SDValue> &RetOps,
 
   RetOps.insert(RetOps.begin() + 1, DAG.getConstant(LROffset, MVT::i32, false));
 
-  return DAG.getNode(ARMISD::INTRET_FLAG, DL, MVT::Other,
-                     RetOps.data(), RetOps.size());
+  return DAG.getNode(ARMISD::INTRET_FLAG, DL, MVT::Other, RetOps);
 }
 
 SDValue
@@ -2074,6 +2075,7 @@ ARMTargetLowering::LowerReturn(SDValue Chain,
   SDValue Flag;
   SmallVector<SDValue, 4> RetOps;
   RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
+  bool isLittleEndian = Subtarget->isLittle();
 
   // Copy the result values into the output registers.
   for (unsigned i = 0, realRVLocIdx = 0;
@@ -2100,12 +2102,15 @@ ARMTargetLowering::LowerReturn(SDValue Chain,
         SDValue HalfGPRs = DAG.getNode(ARMISD::VMOVRRD, dl,
                                        DAG.getVTList(MVT::i32, MVT::i32), Half);
 
-        Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), HalfGPRs, Flag);
+        Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
+                                 HalfGPRs.getValue(isLittleEndian ? 0 : 1),
+                                 Flag);
         Flag = Chain.getValue(1);
         RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
         VA = RVLocs[++i]; // skip ahead to next loc
         Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
-                                 HalfGPRs.getValue(1), Flag);
+                                 HalfGPRs.getValue(isLittleEndian ? 1 : 0),
+                                 Flag);
         Flag = Chain.getValue(1);
         RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
         VA = RVLocs[++i]; // skip ahead to next loc
@@ -2117,12 +2122,15 @@ ARMTargetLowering::LowerReturn(SDValue Chain,
       // Legalize ret f64 -> ret 2 x i32.  We always have fmrrd if f64 is
       // available.
       SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
-                                  DAG.getVTList(MVT::i32, MVT::i32), &Arg, 1);
-      Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), fmrrd, Flag);
+                                  DAG.getVTList(MVT::i32, MVT::i32), Arg);
+      Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
+                               fmrrd.getValue(isLittleEndian ? 0 : 1),
+                               Flag);
       Flag = Chain.getValue(1);
       RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
       VA = RVLocs[++i]; // skip ahead to next loc
-      Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), fmrrd.getValue(1),
+      Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
+                               fmrrd.getValue(isLittleEndian ? 1 : 0),
                                Flag);
     } else
       Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag);
@@ -2151,8 +2159,7 @@ ARMTargetLowering::LowerReturn(SDValue Chain,
     return LowerInterruptReturn(RetOps, dl, DAG);
   }
 
-  return DAG.getNode(ARMISD::RET_FLAG, dl, MVT::Other,
-                     RetOps.data(), RetOps.size());
+  return DAG.getNode(ARMISD::RET_FLAG, dl, MVT::Other, RetOps);
 }
 
 bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
@@ -2314,13 +2321,13 @@ ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
   Entry.Node = Argument;
   Entry.Ty = (Type *) Type::getInt32Ty(*DAG.getContext());
   Args.push_back(Entry);
+
   // FIXME: is there useful debug info available here?
-  TargetLowering::CallLoweringInfo CLI(Chain,
-                (Type *) Type::getInt32Ty(*DAG.getContext()),
-                false, false, false, false,
-                0, CallingConv::C, /*isTailCall=*/false,
-                /*doesNotRet=*/false, /*isReturnValueUsed=*/true,
-                DAG.getExternalSymbol("__tls_get_addr", PtrVT), Args, DAG, dl);
+  TargetLowering::CallLoweringInfo CLI(DAG);
+  CLI.setDebugLoc(dl).setChain(Chain)
+    .setCallee(CallingConv::C, Type::getInt32Ty(*DAG.getContext()),
+               DAG.getExternalSymbol("__tls_get_addr", PtrVT), &Args, 0);
+
   std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
   return CallResult.first;
 }
@@ -2466,6 +2473,23 @@ SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op,
   return Result;
 }
 
+SDValue ARMTargetLowering::LowerGlobalAddressWindows(SDValue Op,
+                                                     SelectionDAG &DAG) const {
+  assert(Subtarget->isTargetWindows() && "non-Windows COFF is not supported");
+  assert(Subtarget->useMovt() && "Windows on ARM expects to use movw/movt");
+
+  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
+  EVT PtrVT = getPointerTy();
+  SDLoc DL(Op);
+
+  ++NumMovwMovt;
+
+  // FIXME: Once remat is capable of dealing with instructions with register
+  // operands, expand this into two nodes.
+  return DAG.getNode(ARMISD::Wrapper, DL, PtrVT,
+                     DAG.getTargetGlobalAddress(GV, DL, PtrVT));
+}
+
 SDValue ARMTargetLowering::LowerGLOBAL_OFFSET_TABLE(SDValue Op,
                                                     SelectionDAG &DAG) const {
   assert(Subtarget->isTargetELF() &&
@@ -2654,7 +2678,8 @@ ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA,
     Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
     ArgValue2 = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);
   }
-
+  if (!Subtarget->isLittle())
+    std::swap (ArgValue, ArgValue2);
   return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, ArgValue, ArgValue2);
 }
 
@@ -2803,8 +2828,7 @@ ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG,
     AFI->setArgRegsSaveSize(ArgRegsSaveSize + AFI->getArgRegsSaveSize());
 
     if (!MemOps.empty())
-      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
-                          &MemOps[0], MemOps.size());
+      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
     return FrameIndex;
   } else {
     if (ArgSize == 0) {
@@ -2834,8 +2858,9 @@ ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG,
   // If there is no regs to be stored, just point address after last
   // argument passed via stack.
   int FrameIndex =
-    StoreByValRegs(CCInfo, DAG, dl, Chain, 0, CCInfo.getInRegsParamsCount(),
-                   0, ArgOffset, 0, ForceMutable, 0, TotalArgRegsSaveSize);
+    StoreByValRegs(CCInfo, DAG, dl, Chain, nullptr,
+                   CCInfo.getInRegsParamsCount(), 0, ArgOffset, 0, ForceMutable,
+                   0, TotalArgRegsSaveSize);
 
   AFI->setVarArgsFrameIndex(FrameIndex);
 }
@@ -3166,11 +3191,96 @@ ARMTargetLowering::duplicateCmp(SDValue Cmp, SelectionDAG &DAG) const {
   return DAG.getNode(ARMISD::FMSTAT, DL, MVT::Glue, Cmp);
 }
 
+std::pair<SDValue, SDValue>
+ARMTargetLowering::getARMXALUOOp(SDValue Op, SelectionDAG &DAG,
+                                 SDValue &ARMcc) const {
+  assert(Op.getValueType() == MVT::i32 &&  "Unsupported value type");
+
+  SDValue Value, OverflowCmp;
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+
+
+  // FIXME: We are currently always generating CMPs because we don't support
+  // generating CMN through the backend. This is not as good as the natural
+  // CMP case because it causes a register dependency and cannot be folded
+  // later.
+
+  switch (Op.getOpcode()) {
+  default:
+    llvm_unreachable("Unknown overflow instruction!");
+  case ISD::SADDO:
+    ARMcc = DAG.getConstant(ARMCC::VC, MVT::i32);
+    Value = DAG.getNode(ISD::ADD, SDLoc(Op), Op.getValueType(), LHS, RHS);
+    OverflowCmp = DAG.getNode(ARMISD::CMP, SDLoc(Op), MVT::Glue, Value, LHS);
+    break;
+  case ISD::UADDO:
+    ARMcc = DAG.getConstant(ARMCC::HS, MVT::i32);
+    Value = DAG.getNode(ISD::ADD, SDLoc(Op), Op.getValueType(), LHS, RHS);
+    OverflowCmp = DAG.getNode(ARMISD::CMP, SDLoc(Op), MVT::Glue, Value, LHS);
+    break;
+  case ISD::SSUBO:
+    ARMcc = DAG.getConstant(ARMCC::VC, MVT::i32);
+    Value = DAG.getNode(ISD::SUB, SDLoc(Op), Op.getValueType(), LHS, RHS);
+    OverflowCmp = DAG.getNode(ARMISD::CMP, SDLoc(Op), MVT::Glue, LHS, RHS);
+    break;
+  case ISD::USUBO:
+    ARMcc = DAG.getConstant(ARMCC::HS, MVT::i32);
+    Value = DAG.getNode(ISD::SUB, SDLoc(Op), Op.getValueType(), LHS, RHS);
+    OverflowCmp = DAG.getNode(ARMISD::CMP, SDLoc(Op), MVT::Glue, LHS, RHS);
+    break;
+  } // switch (...)
+
+  return std::make_pair(Value, OverflowCmp);
+}
+
+
+SDValue
+ARMTargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const {
+  // Let legalize expand this if it isn't a legal type yet.
+  if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
+    return SDValue();
+
+  SDValue Value, OverflowCmp;
+  SDValue ARMcc;
+  std::tie(Value, OverflowCmp) = getARMXALUOOp(Op, DAG, ARMcc);
+  SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
+  // We use 0 and 1 as false and true values.
+  SDValue TVal = DAG.getConstant(1, MVT::i32);
+  SDValue FVal = DAG.getConstant(0, MVT::i32);
+  EVT VT = Op.getValueType();
+
+  SDValue Overflow = DAG.getNode(ARMISD::CMOV, SDLoc(Op), VT, TVal, FVal,
+                                 ARMcc, CCR, OverflowCmp);
+
+  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
+  return DAG.getNode(ISD::MERGE_VALUES, SDLoc(Op), VTs, Value, Overflow);
+}
+
+
 SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
   SDValue Cond = Op.getOperand(0);
   SDValue SelectTrue = Op.getOperand(1);
   SDValue SelectFalse = Op.getOperand(2);
   SDLoc dl(Op);
+  unsigned Opc = Cond.getOpcode();
+
+  if (Cond.getResNo() == 1 &&
+      (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
+       Opc == ISD::USUBO)) {
+    if (!DAG.getTargetLoweringInfo().isTypeLegal(Cond->getValueType(0)))
+      return SDValue();
+
+    SDValue Value, OverflowCmp;
+    SDValue ARMcc;
+    std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc);
+    SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
+    EVT VT = Op.getValueType();
+
+    return DAG.getNode(ARMISD::CMOV, SDLoc(Op), VT, SelectTrue, SelectFalse,
+                       ARMcc, CCR, OverflowCmp);
+
+  }
 
   // Convert:
   //
@@ -3472,7 +3582,7 @@ ARMTargetLowering::OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const {
     ARMcc = DAG.getConstant(CondCode, MVT::i32);
     SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue);
     SDValue Ops[] = { Chain, ARMcc, LHS1, LHS2, RHS1, RHS2, Dest };
-    return DAG.getNode(ARMISD::BCC_i64, dl, VTList, Ops, 7);
+    return DAG.getNode(ARMISD::BCC_i64, dl, VTList, Ops);
   }
 
   return SDValue();
@@ -3512,11 +3622,11 @@ SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
   SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
   SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue);
   SDValue Ops[] = { Chain, Dest, ARMcc, CCR, Cmp };
-  SDValue Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops, 5);
+  SDValue Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops);
   if (CondCode2 != ARMCC::AL) {
     ARMcc = DAG.getConstant(CondCode2, MVT::i32);
     SDValue Ops[] = { Res, Dest, ARMcc, CCR, Res.getValue(1) };
-    Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops, 5);
+    Res = DAG.getNode(ARMISD::BRCOND, dl, VTList, Ops);
   }
   return Res;
 }
@@ -3713,7 +3823,7 @@ SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
   // Bitcast operand 1 to i32.
   if (SrcVT == MVT::f64)
     Tmp1 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),
-                       &Tmp1, 1).getValue(1);
+                       Tmp1).getValue(1);
   Tmp1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp1);
 
   // Or in the signbit with integer operations.
@@ -3729,7 +3839,7 @@ SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
 
   // f64: Or the high part with signbit and then combine two parts.
   Tmp0 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),
-                     &Tmp0, 1);
+                     Tmp0);
   SDValue Lo = Tmp0.getValue(0);
   SDValue Hi = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp0.getValue(1), Mask2);
   Hi = DAG.getNode(ISD::OR, dl, MVT::i32, Hi, Tmp1);
@@ -3761,14 +3871,16 @@ SDValue ARMTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{
 }
 
 SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
-  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+  const ARMBaseRegisterInfo &ARI =
+    *static_cast<const ARMBaseRegisterInfo*>(RegInfo);
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo *MFI = MF.getFrameInfo();
   MFI->setFrameAddressIsTaken(true);
 
   EVT VT = Op.getValueType();
   SDLoc dl(Op);  // FIXME probably not meaningful
   unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
-  unsigned FrameReg = (Subtarget->isThumb() || Subtarget->isTargetMachO())
-    ? ARM::R7 : ARM::R11;
+  unsigned FrameReg = ARI.getFrameRegister(MF);
   SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
   while (Depth--)
     FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
@@ -3777,6 +3889,18 @@ SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
   return FrameAddr;
 }
 
+// FIXME? Maybe this could be a TableGen attribute on some registers and
+// this table could be generated automatically from RegInfo.
+unsigned ARMTargetLowering::getRegisterByName(const char* RegName,
+                                              EVT VT) const {
+  unsigned Reg = StringSwitch<unsigned>(RegName)
+                       .Case("sp", ARM::SP)
+                       .Default(0);
+  if (Reg)
+    return Reg;
+  report_fatal_error("Invalid register name global variable");
+}
+
 /// ExpandBITCAST - If the target supports VFP, this function is called to
 /// expand a bit convert where either the source or destination type is i64 to
 /// use a VMOVDRR or VMOVRRD node.  This should not be done when the non-i64
@@ -3806,8 +3930,15 @@ static SDValue ExpandBITCAST(SDNode *N, SelectionDAG &DAG) {
 
   // Turn f64->i64 into VMOVRRD.
   if (DstVT == MVT::i64 && TLI.isTypeLegal(SrcVT)) {
-    SDValue Cvt = DAG.getNode(ARMISD::VMOVRRD, dl,
-                              DAG.getVTList(MVT::i32, MVT::i32), &Op, 1);
+    SDValue Cvt;
+    if (TLI.isBigEndian() && SrcVT.isVector() &&
+        SrcVT.getVectorNumElements() > 1)
+      Cvt = DAG.getNode(ARMISD::VMOVRRD, dl,
+                        DAG.getVTList(MVT::i32, MVT::i32),
+                        DAG.getNode(ARMISD::VREV64, dl, SrcVT, Op));
+    else
+      Cvt = DAG.getNode(ARMISD::VMOVRRD, dl,
+                        DAG.getVTList(MVT::i32, MVT::i32), Op);
     // Merge the pieces into a single i64 value.
     return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Cvt, Cvt.getValue(1));
   }
@@ -3863,7 +3994,7 @@ SDValue ARMTargetLowering::LowerShiftRightParts(SDValue Op,
                            CCR, Cmp);
 
   SDValue Ops[2] = { Lo, Hi };
-  return DAG.getMergeValues(Ops, 2, dl);
+  return DAG.getMergeValues(Ops, dl);
 }
 
 /// LowerShiftLeftParts - Lower SHL_PARTS, which returns two
@@ -3897,7 +4028,7 @@ SDValue ARMTargetLowering::LowerShiftLeftParts(SDValue Op,
                            CCR, Cmp);
 
   SDValue Ops[2] = { Lo, Hi };
-  return DAG.getMergeValues(Ops, 2, dl);
+  return DAG.getMergeValues(Ops, dl);
 }
 
 SDValue ARMTargetLowering::LowerFLT_ROUNDS_(SDValue Op,
@@ -4102,7 +4233,7 @@ static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG,
   // First, build a SRA_FLAG/SRL_FLAG op, which shifts the top part by one and
   // captures the result into a carry flag.
   unsigned Opc = N->getOpcode() == ISD::SRL ? ARMISD::SRL_FLAG:ARMISD::SRA_FLAG;
-  Hi = DAG.getNode(Opc, dl, DAG.getVTList(MVT::i32, MVT::Glue), &Hi, 1);
+  Hi = DAG.getNode(Opc, dl, DAG.getVTList(MVT::i32, MVT::Glue), Hi);
 
   // The low part is an ARMISD::RRX operand, which shifts the carry in.
   Lo = DAG.getNode(ARMISD::RRX, dl, MVT::i32, Lo, Hi.getValue(1));
@@ -4859,7 +4990,7 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
           Ops.push_back(N);
           Ops.push_back(Op.getOperand(I));
           Ops.push_back(DAG.getConstant(I, MVT::i32));
-          N = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, &Ops[0], 3);
+          N = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ops);
         }
       }
       return N;
@@ -4870,7 +5001,7 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
         Ops.push_back(DAG.getNode(ISD::BITCAST, dl, MVT::i32,
                                   Op.getOperand(i)));
       EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
-      SDValue Val = DAG.getNode(ISD::BUILD_VECTOR, dl, VecVT, &Ops[0], NumElts);
+      SDValue Val = DAG.getNode(ISD::BUILD_VECTOR, dl, VecVT, Ops);
       Val = LowerBUILD_VECTOR(Val, DAG, ST);
       if (Val.getNode())
         return DAG.getNode(ISD::BITCAST, dl, VT, Val);
@@ -4906,7 +5037,7 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
     SmallVector<SDValue, 8> Ops;
     for (unsigned i = 0; i < NumElts; ++i)
       Ops.push_back(DAG.getNode(ISD::BITCAST, dl, EltVT, Op.getOperand(i)));
-    SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, &Ops[0],NumElts);
+    SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops);
     return DAG.getNode(ISD::BITCAST, dl, VT, Val);
   }
 
@@ -5213,12 +5344,10 @@ static SDValue LowerVECTOR_SHUFFLEv8i8(SDValue Op,
 
   if (V2.getNode()->getOpcode() == ISD::UNDEF)
     return DAG.getNode(ARMISD::VTBL1, DL, MVT::v8i8, V1,
-                       DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i8,
-                                   &VTBLMask[0], 8));
+                       DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i8, VTBLMask));
 
   return DAG.getNode(ARMISD::VTBL2, DL, MVT::v8i8, V1, V2,
-                     DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i8,
-                                 &VTBLMask[0], 8));
+                     DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v8i8, VTBLMask));
 }
 
 static SDValue LowerReverse_VECTOR_SHUFFLEv16i8_v8i16(SDValue Op,
@@ -5371,7 +5500,7 @@ static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) {
                                   DAG.getConstant(ShuffleMask[i] & (NumElts-1),
                                                   MVT::i32)));
     }
-    SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, &Ops[0],NumElts);
+    SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops);
     return DAG.getNode(ISD::BITCAST, dl, VT, Val);
   }
 
@@ -5608,7 +5737,7 @@ static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG) {
     Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), MVT::i32));
   }
   return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N),
-                     MVT::getVectorVT(TruncVT, NumElts), Ops.data(), NumElts);
+                     MVT::getVectorVT(TruncVT, NumElts), Ops);
 }
 
 static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) {
@@ -5946,12 +6075,12 @@ SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
   ? "__sincos_stret" : "__sincosf_stret";
   SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy());
 
-  TargetLowering::
-  CallLoweringInfo CLI(DAG.getEntryNode(), Type::getVoidTy(*DAG.getContext()),
-                       false, false, false, false, 0,
-                       CallingConv::C, /*isTaillCall=*/false,
-                       /*doesNotRet=*/false, /*isReturnValueUsed*/false,
-                       Callee, Args, DAG, dl);
+  TargetLowering::CallLoweringInfo CLI(DAG);
+  CLI.setDebugLoc(dl).setChain(DAG.getEntryNode())
+    .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()), Callee,
+               &Args, 0)
+    .setDiscardResult();
+
   std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
 
   SDValue LoadSin = DAG.getLoad(ArgVT, dl, CallResult.second, SRet,
@@ -5998,8 +6127,7 @@ static void ReplaceREADCYCLECOUNTER(SDNode *N,
     };
 
     Cycles32 = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
-                           DAG.getVTList(MVT::i32, MVT::Other), &Ops[0],
-                           array_lengthof(Ops));
+                           DAG.getVTList(MVT::i32, MVT::Other), Ops);
     OutChain = Cycles32.getValue(1);
   } else {
     // Intrinsic is defined to return 0 on unsupported platforms. Technically
@@ -6022,8 +6150,15 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::ConstantPool:  return LowerConstantPool(Op, DAG);
   case ISD::BlockAddress:  return LowerBlockAddress(Op, DAG);
   case ISD::GlobalAddress:
-    return Subtarget->isTargetMachO() ? LowerGlobalAddressDarwin(Op, DAG) :
-      LowerGlobalAddressELF(Op, DAG);
+    switch (Subtarget->getTargetTriple().getObjectFormat()) {
+    default: llvm_unreachable("unknown object format");
+    case Triple::COFF:
+      return LowerGlobalAddressWindows(Op, DAG);
+    case Triple::ELF:
+      return LowerGlobalAddressELF(Op, DAG);
+    case Triple::MachO:
+      return LowerGlobalAddressDarwin(Op, DAG);
+    }
   case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
   case ISD::SELECT:        return LowerSELECT(Op, DAG);
   case ISD::SELECT_CC:     return LowerSELECT_CC(Op, DAG);
@@ -6068,6 +6203,11 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::ADDE:
   case ISD::SUBC:
   case ISD::SUBE:          return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
+  case ISD::SADDO:
+  case ISD::UADDO:
+  case ISD::SSUBO:
+  case ISD::USUBO:
+    return LowerXALUO(Op, DAG);
   case ISD::ATOMIC_LOAD:
   case ISD::ATOMIC_STORE:  return LowerAtomicLoadStore(Op, DAG);
   case ISD::FSINCOS:       return LowerFSINCOS(Op, DAG);
@@ -6558,7 +6698,7 @@ EmitSjLjDispatchBlock(MachineInstr *MI, MachineBasicBlock *MBB) const {
   }
 
   // N.B. the order the invoke BBs are processed in doesn't matter here.
-  const uint16_t *SavedRegs = RI.getCalleeSavedRegs(MF);
+  const MCPhysReg *SavedRegs = RI.getCalleeSavedRegs(MF);
   SmallVector<MachineBasicBlock*, 64> MBBLPads;
   for (SmallPtrSet<MachineBasicBlock*, 64>::iterator
          I = InvokeBBs.begin(), E = InvokeBBs.end(); I != E; ++I) {
@@ -6755,8 +6895,8 @@ ARMTargetLowering::EmitStructByval(MachineInstr *MI,
   MachineFunction *MF = BB->getParent();
   MachineRegisterInfo &MRI = MF->getRegInfo();
   unsigned UnitSize = 0;
-  const TargetRegisterClass *TRC = 0;
-  const TargetRegisterClass *VecTRC = 0;
+  const TargetRegisterClass *TRC = nullptr;
+  const TargetRegisterClass *VecTRC = nullptr;
 
   bool IsThumb1 = Subtarget->isThumb1Only();
   bool IsThumb2 = Subtarget->isThumb2();
@@ -6790,7 +6930,7 @@ ARMTargetLowering::EmitStructByval(MachineInstr *MI,
                  ? (const TargetRegisterClass *)&ARM::DPairRegClass
                  : UnitSize == 8
                        ? (const TargetRegisterClass *)&ARM::DPRRegClass
-                       : 0;
+                       : nullptr;
 
   unsigned BytesLeft = SizeVal % UnitSize;
   unsigned LoopSize = SizeVal - BytesLeft;
@@ -7520,8 +7660,7 @@ static SDValue AddCombineToVPADDL(SDNode *N, SDValue N0, SDValue N1,
       llvm_unreachable("Invalid vector element type for padd optimization.");
   }
 
-  SDValue tmp = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N),
-                            widenType, &Ops[0], Ops.size());
+  SDValue tmp = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), widenType, Ops);
   unsigned ExtOp = VT.bitsGT(tmp.getValueType()) ? ISD::ANY_EXTEND : ISD::TRUNCATE;
   return DAG.getNode(ExtOp, SDLoc(N), VT, tmp);
 }
@@ -7581,7 +7720,7 @@ static SDValue AddCombineTo64bitMLAL(SDNode *AddcNode,
 
   // Look for the glued ADDE.
   SDNode* AddeNode = AddcNode->getGluedUser();
-  if (AddeNode == NULL)
+  if (!AddeNode)
     return SDValue();
 
   // Make sure it is really an ADDE.
@@ -7616,9 +7755,9 @@ static SDValue AddCombineTo64bitMLAL(SDNode *AddcNode,
 
   // Figure out the high and low input values to the MLAL node.
   SDValue* HiMul = &MULOp;
-  SDValue* HiAdd = NULL;
-  SDValue* LoMul = NULL;
-  SDValue* LowAdd = NULL;
+  SDValue* HiAdd = nullptr;
+  SDValue* LoMul = nullptr;
+  SDValue* LowAdd = nullptr;
 
   if (IsLeftOperandMUL)
     HiAdd = &AddeOp1;
@@ -7635,7 +7774,7 @@ static SDValue AddCombineTo64bitMLAL(SDNode *AddcNode,
     LowAdd = &AddcOp0;
   }
 
-  if (LoMul == NULL)
+  if (!LoMul)
     return SDValue();
 
   if (LoMul->getNode() != HiMul->getNode())
@@ -7652,8 +7791,7 @@ static SDValue AddCombineTo64bitMLAL(SDNode *AddcNode,
   Ops.push_back(*HiAdd);
 
   SDValue MLALNode =  DAG.getNode(FinalOpc, SDLoc(AddcNode),
-                                 DAG.getVTList(MVT::i32, MVT::i32),
-                                 &Ops[0], Ops.size());
+                                 DAG.getVTList(MVT::i32, MVT::i32), Ops);
 
   // Replace the ADDs' nodes uses by the MLA node's values.
   SDValue HiMLALResult(MLALNode.getNode(), 1);
@@ -8290,8 +8428,7 @@ static SDValue PerformSTORECombine(SDNode *N,
                             Increment);
       Chains.push_back(Ch);
     }
-    return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, &Chains[0],
-                       Chains.size());
+    return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
   }
 
   if (!ISD::isNormalStore(St))
@@ -8302,16 +8439,18 @@ static SDValue PerformSTORECombine(SDNode *N,
   if (StVal.getNode()->getOpcode() == ARMISD::VMOVDRR &&
       StVal.getNode()->hasOneUse()) {
     SelectionDAG  &DAG = DCI.DAG;
+    bool isBigEndian = DAG.getTargetLoweringInfo().isBigEndian();
     SDLoc DL(St);
     SDValue BasePtr = St->getBasePtr();
     SDValue NewST1 = DAG.getStore(St->getChain(), DL,
-                                  StVal.getNode()->getOperand(0), BasePtr,
-                                  St->getPointerInfo(), St->isVolatile(),
+                                  StVal.getNode()->getOperand(isBigEndian ? 1 : 0 ),
+                                  BasePtr, St->getPointerInfo(), St->isVolatile(),
                                   St->isNonTemporal(), St->getAlignment());
 
     SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
                                     DAG.getConstant(4, MVT::i32));
-    return DAG.getStore(NewST1.getValue(0), DL, StVal.getNode()->getOperand(1),
+    return DAG.getStore(NewST1.getValue(0), DL,
+                        StVal.getNode()->getOperand(isBigEndian ? 0 : 1),
                         OffsetPtr, St->getPointerInfo(), St->isVolatile(),
                         St->isNonTemporal(),
                         std::min(4U, St->getAlignment() / 2));
@@ -8387,7 +8526,7 @@ static SDValue PerformBUILD_VECTORCombine(SDNode *N,
     DCI.AddToWorklist(V.getNode());
   }
   EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, NumElts);
-  SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, FloatVT, Ops.data(), NumElts);
+  SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, FloatVT, Ops);
   return DAG.getNode(ISD::BITCAST, dl, VT, BV);
 }
 
@@ -8470,7 +8609,7 @@ PerformARMBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
       // Fold obvious case.
       V = V.getOperand(0);
     else {
-      V = DAG.getNode(ISD::BITCAST, SDLoc(V), MVT::i32, V); 
+      V = DAG.getNode(ISD::BITCAST, SDLoc(V), MVT::i32, V);
       // Make the DAGCombiner fold the bitcasts.
       DCI.AddToWorklist(V.getNode());
     }
@@ -8666,7 +8805,7 @@ static SDValue CombineBaseUpdate(SDNode *N,
       Tys[n] = VecTy;
     Tys[n++] = MVT::i32;
     Tys[n] = MVT::Other;
-    SDVTList SDTys = DAG.getVTList(Tys, NumResultVecs+2);
+    SDVTList SDTys = DAG.getVTList(ArrayRef<EVT>(Tys, NumResultVecs+2));
     SmallVector<SDValue, 8> Ops;
     Ops.push_back(N->getOperand(0)); // incoming chain
     Ops.push_back(N->getOperand(AddrOpIdx));
@@ -8676,8 +8815,7 @@ static SDValue CombineBaseUpdate(SDNode *N,
     }
     MemIntrinsicSDNode *MemInt = cast<MemIntrinsicSDNode>(N);
     SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, SDLoc(N), SDTys,
-                                           Ops.data(), Ops.size(),
-                                           MemInt->getMemoryVT(),
+                                           Ops, MemInt->getMemoryVT(),
                                            MemInt->getMemOperand());
 
     // Update the uses.
@@ -8746,11 +8884,11 @@ static bool CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
   for (n = 0; n < NumVecs; ++n)
     Tys[n] = VT;
   Tys[n] = MVT::Other;
-  SDVTList SDTys = DAG.getVTList(Tys, NumVecs+1);
+  SDVTList SDTys = DAG.getVTList(ArrayRef<EVT>(Tys, NumVecs+1));
   SDValue Ops[] = { VLD->getOperand(0), VLD->getOperand(2) };
   MemIntrinsicSDNode *VLDMemInt = cast<MemIntrinsicSDNode>(VLD);
   SDValue VLDDup = DAG.getMemIntrinsicNode(NewOpc, SDLoc(VLD), SDTys,
-                                           Ops, 2, VLDMemInt->getMemoryVT(),
+                                           Ops, VLDMemInt->getMemoryVT(),
                                            VLDMemInt->getMemOperand());
 
   // Update the uses.
@@ -9348,7 +9486,7 @@ ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const {
 
   if (Res.getNode()) {
     APInt KnownZero, KnownOne;
-    DAG.ComputeMaskedBits(SDValue(N,0), KnownZero, KnownOne);
+    DAG.computeKnownBits(SDValue(N,0), KnownZero, KnownOne);
     // Capture demanded bits information that would be otherwise lost.
     if (KnownZero == 0xfffffffe)
       Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
@@ -9935,11 +10073,11 @@ bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op,
   return true;
 }
 
-void ARMTargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
-                                                       APInt &KnownZero,
-                                                       APInt &KnownOne,
-                                                       const SelectionDAG &DAG,
-                                                       unsigned Depth) const {
+void ARMTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
+                                                      APInt &KnownZero,
+                                                      APInt &KnownOne,
+                                                      const SelectionDAG &DAG,
+                                                      unsigned Depth) const {
   unsigned BitWidth = KnownOne.getBitWidth();
   KnownZero = KnownOne = APInt(BitWidth, 0);
   switch (Op.getOpcode()) {
@@ -9955,11 +10093,11 @@ void ARMTargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
     break;
   case ARMISD::CMOV: {
     // Bits are known zero/one if known on the LHS and RHS.
-    DAG.ComputeMaskedBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1);
+    DAG.computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1);
     if (KnownZero == 0 && KnownOne == 0) return;
 
     APInt KnownZeroRHS, KnownOneRHS;
-    DAG.ComputeMaskedBits(Op.getOperand(1), KnownZeroRHS, KnownOneRHS, Depth+1);
+    DAG.computeKnownBits(Op.getOperand(1), KnownZeroRHS, KnownOneRHS, Depth+1);
     KnownZero &= KnownZeroRHS;
     KnownOne  &= KnownOneRHS;
     return;
@@ -10053,7 +10191,7 @@ ARMTargetLowering::getSingleConstraintMatchWeight(
   Value *CallOperandVal = info.CallOperandVal;
     // If we don't have a value, we can't do a match,
     // but allow it at the lowest weight.
-  if (CallOperandVal == NULL)
+  if (!CallOperandVal)
     return CW_Default;
   Type *type = CallOperandVal->getType();
   // Look at the constraint type.
@@ -10132,7 +10270,7 @@ void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
                                                      std::string &Constraint,
                                                      std::vector<SDValue>&Ops,
                                                      SelectionDAG &DAG) const {
-  SDValue Result(0, 0);
+  SDValue Result;
 
   // Currently only support length 1 constraints.
   if (Constraint.length() != 1) return;
@@ -10331,13 +10469,12 @@ SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const {
   Type *RetTy = (Type*)StructType::get(Ty, Ty, NULL);
 
   SDLoc dl(Op);
-  TargetLowering::
-  CallLoweringInfo CLI(InChain, RetTy, isSigned, !isSigned, false, true,
-                    0, getLibcallCallingConv(LC), /*isTailCall=*/false,
-                    /*doesNotReturn=*/false, /*isReturnValueUsed=*/true,
-                    Callee, Args, DAG, dl);
-  std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
+  TargetLowering::CallLoweringInfo CLI(DAG);
+  CLI.setDebugLoc(dl).setChain(InChain)
+    .setCallee(getLibcallCallingConv(LC), RetTy, Callee, &Args, 0)
+    .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned);
 
+  std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
   return CallInfo.first;
 }
 
@@ -10494,3 +10631,160 @@ bool ARMTargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
     return false;
   return true;
 }
+
+bool ARMTargetLowering::shouldExpandAtomicInIR(Instruction *Inst) const {
+  // Loads and stores less than 64-bits are already atomic; ones above that
+  // are doomed anyway, so defer to the default libcall and blame the OS when
+  // things go wrong:
+  if (StoreInst *SI = dyn_cast<StoreInst>(Inst))
+    return SI->getValueOperand()->getType()->getPrimitiveSizeInBits() == 64;
+  else if (LoadInst *LI = dyn_cast<LoadInst>(Inst))
+    return LI->getType()->getPrimitiveSizeInBits() == 64;
+
+  // For the real atomic operations, we have ldrex/strex up to 64 bits.
+  return Inst->getType()->getPrimitiveSizeInBits() <= 64;
+}
+
+Value *ARMTargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
+                                         AtomicOrdering Ord) const {
+  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
+  Type *ValTy = cast<PointerType>(Addr->getType())->getElementType();
+  bool IsAcquire =
+      Ord == Acquire || Ord == AcquireRelease || Ord == SequentiallyConsistent;
+
+  // Since i64 isn't legal and intrinsics don't get type-lowered, the ldrexd
+  // intrinsic must return {i32, i32} and we have to recombine them into a
+  // single i64 here.
+  if (ValTy->getPrimitiveSizeInBits() == 64) {
+    Intrinsic::ID Int =
+        IsAcquire ? Intrinsic::arm_ldaexd : Intrinsic::arm_ldrexd;
+    Function *Ldrex = llvm::Intrinsic::getDeclaration(M, Int);
+
+    Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
+    Value *LoHi = Builder.CreateCall(Ldrex, Addr, "lohi");
+
+    Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
+    Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
+    if (!Subtarget->isLittle())
+      std::swap (Lo, Hi);
+    Lo = Builder.CreateZExt(Lo, ValTy, "lo64");
+    Hi = Builder.CreateZExt(Hi, ValTy, "hi64");
+    return Builder.CreateOr(
+        Lo, Builder.CreateShl(Hi, ConstantInt::get(ValTy, 32)), "val64");
+  }
+
+  Type *Tys[] = { Addr->getType() };
+  Intrinsic::ID Int = IsAcquire ? Intrinsic::arm_ldaex : Intrinsic::arm_ldrex;
+  Function *Ldrex = llvm::Intrinsic::getDeclaration(M, Int, Tys);
+
+  return Builder.CreateTruncOrBitCast(
+      Builder.CreateCall(Ldrex, Addr),
+      cast<PointerType>(Addr->getType())->getElementType());
+}
+
+Value *ARMTargetLowering::emitStoreConditional(IRBuilder<> &Builder, Value *Val,
+                                               Value *Addr,
+                                               AtomicOrdering Ord) const {
+  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
+  bool IsRelease =
+      Ord == Release || Ord == AcquireRelease || Ord == SequentiallyConsistent;
+
+  // Since the intrinsics must have legal type, the i64 intrinsics take two
+  // parameters: "i32, i32". We must marshal Val into the appropriate form
+  // before the call.
+  if (Val->getType()->getPrimitiveSizeInBits() == 64) {
+    Intrinsic::ID Int =
+        IsRelease ? Intrinsic::arm_stlexd : Intrinsic::arm_strexd;
+    Function *Strex = Intrinsic::getDeclaration(M, Int);
+    Type *Int32Ty = Type::getInt32Ty(M->getContext());
+
+    Value *Lo = Builder.CreateTrunc(Val, Int32Ty, "lo");
+    Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 32), Int32Ty, "hi");
+    if (!Subtarget->isLittle())
+      std::swap (Lo, Hi);
+    Addr = Builder.CreateBitCast(Addr, Type::getInt8PtrTy(M->getContext()));
+    return Builder.CreateCall3(Strex, Lo, Hi, Addr);
+  }
+
+  Intrinsic::ID Int = IsRelease ? Intrinsic::arm_stlex : Intrinsic::arm_strex;
+  Type *Tys[] = { Addr->getType() };
+  Function *Strex = Intrinsic::getDeclaration(M, Int, Tys);
+
+  return Builder.CreateCall2(
+      Strex, Builder.CreateZExtOrBitCast(
+                 Val, Strex->getFunctionType()->getParamType(0)),
+      Addr);
+}
+
+enum HABaseType {
+  HA_UNKNOWN = 0,
+  HA_FLOAT,
+  HA_DOUBLE,
+  HA_VECT64,
+  HA_VECT128
+};
+
+static bool isHomogeneousAggregate(Type *Ty, HABaseType &Base,
+                                   uint64_t &Members) {
+  if (const StructType *ST = dyn_cast<StructType>(Ty)) {
+    for (unsigned i = 0; i < ST->getNumElements(); ++i) {
+      uint64_t SubMembers = 0;
+      if (!isHomogeneousAggregate(ST->getElementType(i), Base, SubMembers))
+        return false;
+      Members += SubMembers;
+    }
+  } else if (const ArrayType *AT = dyn_cast<ArrayType>(Ty)) {
+    uint64_t SubMembers = 0;
+    if (!isHomogeneousAggregate(AT->getElementType(), Base, SubMembers))
+      return false;
+    Members += SubMembers * AT->getNumElements();
+  } else if (Ty->isFloatTy()) {
+    if (Base != HA_UNKNOWN && Base != HA_FLOAT)
+      return false;
+    Members = 1;
+    Base = HA_FLOAT;
+  } else if (Ty->isDoubleTy()) {
+    if (Base != HA_UNKNOWN && Base != HA_DOUBLE)
+      return false;
+    Members = 1;
+    Base = HA_DOUBLE;
+  } else if (const VectorType *VT = dyn_cast<VectorType>(Ty)) {
+    Members = 1;
+    switch (Base) {
+    case HA_FLOAT:
+    case HA_DOUBLE:
+      return false;
+    case HA_VECT64:
+      return VT->getBitWidth() == 64;
+    case HA_VECT128:
+      return VT->getBitWidth() == 128;
+    case HA_UNKNOWN:
+      switch (VT->getBitWidth()) {
+      case 64:
+        Base = HA_VECT64;
+        return true;
+      case 128:
+        Base = HA_VECT128;
+        return true;
+      default:
+        return false;
+      }
+    }
+  }
+
+  return (Members > 0 && Members <= 4);
+}
+
+/// \brief Return true if a type is an AAPCS-VFP homogeneous aggregate.
+bool ARMTargetLowering::functionArgumentNeedsConsecutiveRegisters(
+    Type *Ty, CallingConv::ID CallConv, bool isVarArg) const {
+  if (getEffectiveCallingConv(CallConv, isVarArg) !=
+      CallingConv::ARM_AAPCS_VFP)
+    return false;
+
+  HABaseType Base = HA_UNKNOWN;
+  uint64_t Members = 0;
+  bool result = isHomogeneousAggregate(Ty, Base, Members);
+  DEBUG(dbgs() << "isHA: " << result << " "; Ty->dump(); dbgs() << "\n");
+  return result;
+}
diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h
index f33e6db..c15305c 100644
--- a/lib/Target/ARM/ARMISelLowering.h
+++ b/lib/Target/ARM/ARMISelLowering.h
@@ -313,10 +313,10 @@ namespace llvm {
                                     SDValue &Offset, ISD::MemIndexedMode &AM,
                                     SelectionDAG &DAG) const override;
 
-    void computeMaskedBitsForTargetNode(const SDValue Op, APInt &KnownZero,
-                                        APInt &KnownOne,
-                                        const SelectionDAG &DAG,
-                                        unsigned Depth) const override;
+    void computeKnownBitsForTargetNode(const SDValue Op, APInt &KnownZero,
+                                       APInt &KnownOne,
+                                       const SelectionDAG &DAG,
+                                       unsigned Depth) const override;
 
 
     bool ExpandInlineAsm(CallInst *CI) const override;
@@ -384,6 +384,18 @@ namespace llvm {
     bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
                                            Type *Ty) const override;
 
+    /// \brief Returns true if an argument of type Ty needs to be passed in a
+    /// contiguous block of registers in calling convention CallConv.
+    bool functionArgumentNeedsConsecutiveRegisters(
+        Type *Ty, CallingConv::ID CallConv, bool isVarArg) const override;
+
+    Value *emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
+                          AtomicOrdering Ord) const override;
+    Value *emitStoreConditional(IRBuilder<> &Builder, Value *Val,
+                                Value *Addr, AtomicOrdering Ord) const override;
+
+    bool shouldExpandAtomicInIR(Instruction *Inst) const override;
+
   protected:
     std::pair<const TargetRegisterClass*, uint8_t>
     findRepresentativeClass(MVT VT) const override;
@@ -404,6 +416,7 @@ namespace llvm {
     void addTypeForNEON(MVT VT, MVT PromotedLdStVT, MVT PromotedBitwiseVT);
     void addDRTypeForNEON(MVT VT);
     void addQRTypeForNEON(MVT VT);
+    std::pair<SDValue, SDValue> getARMXALUOOp(SDValue Op, SelectionDAG &DAG, SDValue &ARMcc) const;
 
     typedef SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPassVector;
     void PassF64ArgInRegs(SDLoc dl, SelectionDAG &DAG,
@@ -417,6 +430,8 @@ namespace llvm {
                                  SDValue &Root, SelectionDAG &DAG,
                                  SDLoc dl) const;
 
+    CallingConv::ID getEffectiveCallingConv(CallingConv::ID CC,
+                                            bool isVarArg) const;
     CCAssignFn *CCAssignFnForNode(CallingConv::ID CC, bool Return,
                                   bool isVarArg) const;
     SDValue LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, SDValue Arg,
@@ -430,6 +445,7 @@ namespace llvm {
     SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerGlobalAddressDarwin(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerGlobalAddressELF(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerGlobalAddressWindows(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
                                             SelectionDAG &DAG) const;
@@ -438,6 +454,7 @@ namespace llvm {
                                  TLSModel::Model model) const;
     SDValue LowerGLOBAL_OFFSET_TABLE(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerBR_JT(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) const;
@@ -454,6 +471,8 @@ namespace llvm {
     SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerDivRem(SDValue Op, SelectionDAG &DAG) const;
 
+    unsigned getRegisterByName(const char* RegName, EVT VT) const override;
+
     /// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster
     /// than a pair of fmul and fadd instructions. fmuladd intrinsics will be
     /// expanded to FMAs when this method returns true, otherwise fmuladd is
@@ -567,7 +586,6 @@ namespace llvm {
     OtherModImm
   };
 
-
   namespace ARM {
     FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
                              const TargetLibraryInfo *libInfo);
diff --git a/lib/Target/ARM/ARMInstrFormats.td b/lib/Target/ARM/ARMInstrFormats.td
index aafff98..59e9260 100644
--- a/lib/Target/ARM/ARMInstrFormats.td
+++ b/lib/Target/ARM/ARMInstrFormats.td
@@ -2029,7 +2029,7 @@ class N2V<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16,
 // Same as N2V but not predicated.
 class N2Vnp<bits<2> op19_18, bits<2> op17_16, bits<3> op10_8, bit op7, bit op6,
             dag oops, dag iops, InstrItinClass itin, string OpcodeStr,
-            string Dt, ValueType ResTy, ValueType OpTy, list<dag> pattern>
+            string Dt, list<dag> pattern>
    : NeonInp<oops, iops, AddrModeNone, IndexModeNone, N2RegFrm, itin,
              OpcodeStr, Dt, "$Vd, $Vm", "", pattern> {
   bits<5> Vd;
@@ -2138,8 +2138,7 @@ class N3V<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6, bit op4,
 
 class N3Vnp<bits<5> op27_23, bits<2> op21_20, bits<4> op11_8, bit op6,
                 bit op4, dag oops, dag iops,Format f, InstrItinClass itin,
-                string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy,
-                SDPatternOperator IntOp, bit Commutable, list<dag> pattern>
+                string OpcodeStr, string Dt, list<dag> pattern>
   : NeonInp<oops, iops, AddrModeNone, IndexModeNone, f, itin, OpcodeStr,
             Dt, "$Vd, $Vn, $Vm", "", pattern> {
   bits<5> Vd;
diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td
index 75a109e..718d5da 100644
--- a/lib/Target/ARM/ARMInstrInfo.td
+++ b/lib/Target/ARM/ARMInstrInfo.td
@@ -991,6 +991,81 @@ def addrmode6oneL32 : Operand<i32>,
   let EncoderMethod = "getAddrMode6OneLane32AddressOpValue";
 }
 
+// Base class for addrmode6 with specific alignment restrictions.
+class AddrMode6Align : Operand<i32>,
+                ComplexPattern<i32, 2, "SelectAddrMode6", [], [SDNPWantParent]>{
+  let PrintMethod = "printAddrMode6Operand";
+  let MIOperandInfo = (ops GPR:$addr, i32imm:$align);
+  let EncoderMethod = "getAddrMode6AddressOpValue";
+  let DecoderMethod = "DecodeAddrMode6Operand";
+}
+
+// Special version of addrmode6 to handle no allowed alignment encoding for
+// VLD/VST instructions and checking the alignment is not specified.
+def AddrMode6AlignNoneAsmOperand : AsmOperandClass {
+  let Name = "AlignedMemoryNone";
+  let DiagnosticType = "AlignedMemoryRequiresNone";
+}
+def addrmode6alignNone : AddrMode6Align {
+  // The alignment specifier can only be omitted.
+  let ParserMatchClass = AddrMode6AlignNoneAsmOperand;
+}
+
+// Special version of addrmode6 to handle 16-bit alignment encoding for
+// VLD/VST instructions and checking the alignment value.
+def AddrMode6Align16AsmOperand : AsmOperandClass {
+  let Name = "AlignedMemory16";
+  let DiagnosticType = "AlignedMemoryRequires16";
+}
+def addrmode6align16 : AddrMode6Align {
+  // The alignment specifier can only be 16 or omitted.
+  let ParserMatchClass = AddrMode6Align16AsmOperand;
+}
+
+// Special version of addrmode6 to handle 32-bit alignment encoding for
+// VLD/VST instructions and checking the alignment value.
+def AddrMode6Align32AsmOperand : AsmOperandClass {
+  let Name = "AlignedMemory32";
+  let DiagnosticType = "AlignedMemoryRequires32";
+}
+def addrmode6align32 : AddrMode6Align {
+  // The alignment specifier can only be 32 or omitted.
+  let ParserMatchClass = AddrMode6Align32AsmOperand;
+}
+
+// Special version of addrmode6 to handle 64-bit alignment encoding for
+// VLD/VST instructions and checking the alignment value.
+def AddrMode6Align64AsmOperand : AsmOperandClass {
+  let Name = "AlignedMemory64";
+  let DiagnosticType = "AlignedMemoryRequires64";
+}
+def addrmode6align64 : AddrMode6Align {
+  // The alignment specifier can only be 64 or omitted.
+  let ParserMatchClass = AddrMode6Align64AsmOperand;
+}
+
+// Special version of addrmode6 to handle 64-bit or 128-bit alignment encoding
+// for VLD/VST instructions and checking the alignment value.
+def AddrMode6Align64or128AsmOperand : AsmOperandClass {
+  let Name = "AlignedMemory64or128";
+  let DiagnosticType = "AlignedMemoryRequires64or128";
+}
+def addrmode6align64or128 : AddrMode6Align {
+  // The alignment specifier can only be 64, 128 or omitted.
+  let ParserMatchClass = AddrMode6Align64or128AsmOperand;
+}
+
+// Special version of addrmode6 to handle 64-bit, 128-bit or 256-bit alignment
+// encoding for VLD/VST instructions and checking the alignment value.
+def AddrMode6Align64or128or256AsmOperand : AsmOperandClass {
+  let Name = "AlignedMemory64or128or256";
+  let DiagnosticType = "AlignedMemoryRequires64or128or256";
+}
+def addrmode6align64or128or256 : AddrMode6Align {
+  // The alignment specifier can only be 64, 128, 256 or omitted.
+  let ParserMatchClass = AddrMode6Align64or128or256AsmOperand;
+}
+
 // Special version of addrmode6 to handle alignment encoding for VLD-dup
 // instructions, specifically VLD4-dup.
 def addrmode6dup : Operand<i32>,
@@ -1003,6 +1078,69 @@ def addrmode6dup : Operand<i32>,
   let ParserMatchClass = AddrMode6AsmOperand;
 }
 
+// Base class for addrmode6dup with specific alignment restrictions.
+class AddrMode6DupAlign : Operand<i32>,
+                ComplexPattern<i32, 2, "SelectAddrMode6", [], [SDNPWantParent]>{
+  let PrintMethod = "printAddrMode6Operand";
+  let MIOperandInfo = (ops GPR:$addr, i32imm);
+  let EncoderMethod = "getAddrMode6DupAddressOpValue";
+}
+
+// Special version of addrmode6 to handle no allowed alignment encoding for
+// VLD-dup instruction and checking the alignment is not specified.
+def AddrMode6dupAlignNoneAsmOperand : AsmOperandClass {
+  let Name = "DupAlignedMemoryNone";
+  let DiagnosticType = "DupAlignedMemoryRequiresNone";
+}
+def addrmode6dupalignNone : AddrMode6DupAlign {
+  // The alignment specifier can only be omitted.
+  let ParserMatchClass = AddrMode6dupAlignNoneAsmOperand;
+}
+
+// Special version of addrmode6 to handle 16-bit alignment encoding for VLD-dup
+// instruction and checking the alignment value.
+def AddrMode6dupAlign16AsmOperand : AsmOperandClass {
+  let Name = "DupAlignedMemory16";
+  let DiagnosticType = "DupAlignedMemoryRequires16";
+}
+def addrmode6dupalign16 : AddrMode6DupAlign {
+  // The alignment specifier can only be 16 or omitted.
+  let ParserMatchClass = AddrMode6dupAlign16AsmOperand;
+}
+
+// Special version of addrmode6 to handle 32-bit alignment encoding for VLD-dup
+// instruction and checking the alignment value.
+def AddrMode6dupAlign32AsmOperand : AsmOperandClass {
+  let Name = "DupAlignedMemory32";
+  let DiagnosticType = "DupAlignedMemoryRequires32";
+}
+def addrmode6dupalign32 : AddrMode6DupAlign {
+  // The alignment specifier can only be 32 or omitted.
+  let ParserMatchClass = AddrMode6dupAlign32AsmOperand;
+}
+
+// Special version of addrmode6 to handle 64-bit alignment encoding for VLD
+// instructions and checking the alignment value.
+def AddrMode6dupAlign64AsmOperand : AsmOperandClass {
+  let Name = "DupAlignedMemory64";
+  let DiagnosticType = "DupAlignedMemoryRequires64";
+}
+def addrmode6dupalign64 : AddrMode6DupAlign {
+  // The alignment specifier can only be 64 or omitted.
+  let ParserMatchClass = AddrMode6dupAlign64AsmOperand;
+}
+
+// Special version of addrmode6 to handle 64-bit or 128-bit alignment encoding
+// for VLD instructions and checking the alignment value.
+def AddrMode6dupAlign64or128AsmOperand : AsmOperandClass {
+  let Name = "DupAlignedMemory64or128";
+  let DiagnosticType = "DupAlignedMemoryRequires64or128";
+}
+def addrmode6dupalign64or128 : AddrMode6DupAlign {
+  // The alignment specifier can only be 64, 128 or omitted.
+  let ParserMatchClass = AddrMode6dupAlign64or128AsmOperand;
+}
+
 // addrmodepc := pc + reg
 //
 def addrmodepc : Operand<i32>,
@@ -1689,7 +1827,8 @@ PseudoInst<(outs), (ins i32imm:$amt, pred:$p), NoItinerary,
 }
 
 def HINT : AI<(outs), (ins imm0_239:$imm), MiscFrm, NoItinerary,
-              "hint", "\t$imm", []>, Requires<[IsARM, HasV6]> {
+              "hint", "\t$imm", [(int_arm_hint imm0_239:$imm)]>,
+           Requires<[IsARM, HasV6]> {
   bits<8> imm;
   let Inst{27-8} = 0b00110010000011110000;
   let Inst{7-0} = imm;
@@ -1702,8 +1841,6 @@ def : InstAlias<"wfi$p", (HINT 3, pred:$p)>, Requires<[IsARM, HasV6T2]>;
 def : InstAlias<"sev$p", (HINT 4, pred:$p)>, Requires<[IsARM, HasV6T2]>;
 def : InstAlias<"sevl$p", (HINT 5, pred:$p)>, Requires<[IsARM, HasV8]>;
 
-def : Pat<(int_arm_sevl), (HINT 5)>;
-
 def SEL : AI<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), DPFrm, NoItinerary, "sel",
              "\t$Rd, $Rn, $Rm", []>, Requires<[IsARM, HasV6]> {
   bits<4> Rd;
@@ -1830,6 +1967,18 @@ def DBG : AI<(outs), (ins imm0_15:$opt), MiscFrm, NoItinerary, "dbg", "\t$opt",
   let Inst{3-0} = opt;
 }
 
+// A8.8.247  UDF - Undefined (Encoding A1)
+def UDF : AInoP<(outs), (ins imm0_65535:$imm16), MiscFrm, NoItinerary,
+                "udf", "\t$imm16", [(int_arm_undefined imm0_65535:$imm16)]> {
+  bits<16> imm16;
+  let Inst{31-28} = 0b1110; // AL
+  let Inst{27-25} = 0b011;
+  let Inst{24-20} = 0b11111;
+  let Inst{19-8} = imm16{15-4};
+  let Inst{7-4} = 0b1111;
+  let Inst{3-0} = imm16{3-0};
+}
+
 /*
  * A5.4 Permanently UNDEFINED instructions.
  *
@@ -2282,12 +2431,6 @@ let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1 in {
   def LDRD : AI3ld<0b1101, 0, (outs GPR:$Rt, GPR:$Rt2), (ins addrmode3:$addr),
                    LdMiscFrm, IIC_iLoad_d_r, "ldrd", "\t$Rt, $Rt2, $addr", []>,
              Requires<[IsARM, HasV5TE]>;
-
-  // GNU Assembler extension (compatibility)
-  let isAsmParserOnly = 1 in
-    def LDRD_PAIR : AI3ld<0b1101, 0, (outs GPRPairOp:$Rt), (ins addrmode3:$addr),
-                          LdMiscFrm, IIC_iLoad_d_r, "ldrd", "\t$Rt, $addr", []>,
-                    Requires<[IsARM, HasV5TE]>;
 }
 
 def LDA : AIldracq<0b00, (outs GPR:$Rt), (ins addr_offset_none:$addr),
@@ -2557,14 +2700,6 @@ let mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1 in {
              Requires<[IsARM, HasV5TE]> {
     let Inst{21} = 0;
   }
-
-  // GNU Assembler extension (compatibility)
-  let isAsmParserOnly = 1 in
-    def STRD_PAIR : AI3str<0b1111, (outs), (ins GPRPairOp:$Rt, addrmode3:$addr),
-                           StMiscFrm, IIC_iStore_d_r, "strd", "\t$Rt, $addr", []>,
-                    Requires<[IsARM, HasV5TE]> {
-      let Inst{21} = 0;
-    }
 }
 
 // Indexed stores
@@ -3999,6 +4134,11 @@ def REV16 : AMiscA1I<0b01101011, 0b1011, (outs GPR:$Rd), (ins GPR:$Rm),
                Requires<[IsARM, HasV6]>,
            Sched<[WriteALU]>;
 
+def : ARMV6Pat<(srl (bswap (extloadi16 addrmode3:$addr)), (i32 16)),
+              (REV16 (LDRH addrmode3:$addr))>;
+def : ARMV6Pat<(truncstorei16 (srl (bswap GPR:$Rn), (i32 16)), addrmode3:$addr),
+               (STRH (REV16 GPR:$Rn), addrmode3:$addr)>;
+
 let AddedComplexity = 5 in
 def REVSH : AMiscA1I<0b01101111, 0b1011, (outs GPR:$Rd), (ins GPR:$Rm),
                IIC_iUNAr, "revsh", "\t$Rd, $Rm",
@@ -4816,7 +4956,7 @@ def MCR2 : MovRCopro2<"mcr2", 0 /* from ARM core register to coprocessor */,
                       [(int_arm_mcr2 imm:$cop, imm:$opc1, GPR:$Rt, imm:$CRn,
                                      imm:$CRm, imm:$opc2)]>,
                       Requires<[PreV8]>;
-def : ARMInstAlias<"mcr2$ $cop, $opc1, $Rt, $CRn, $CRm",
+def : ARMInstAlias<"mcr2 $cop, $opc1, $Rt, $CRn, $CRm",
                    (MCR2 p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn,
                          c_imm:$CRm, 0)>;
 def MRC2 : MovRCopro2<"mrc2", 1 /* from coprocessor to ARM core register */,
@@ -4824,7 +4964,7 @@ def MRC2 : MovRCopro2<"mrc2", 1 /* from coprocessor to ARM core register */,
                       (ins p_imm:$cop, imm0_7:$opc1, c_imm:$CRn, c_imm:$CRm,
                            imm0_7:$opc2), []>,
                       Requires<[PreV8]>;
-def : ARMInstAlias<"mrc2$ $cop, $opc1, $Rt, $CRn, $CRm",
+def : ARMInstAlias<"mrc2 $cop, $opc1, $Rt, $CRn, $CRm",
                    (MRC2 GPRwithAPSR:$Rt, p_imm:$cop, imm0_7:$opc1, c_imm:$CRn,
                          c_imm:$CRm, 0)>;
 
diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td
index 0d46c49..b32b5d2 100644
--- a/lib/Target/ARM/ARMInstrNEON.td
+++ b/lib/Target/ARM/ARMInstrNEON.td
@@ -39,6 +39,49 @@ def nImmVMOVI32 : Operand<i32> {
   let PrintMethod = "printNEONModImmOperand";
   let ParserMatchClass = nImmVMOVI32AsmOperand;
 }
+
+def nImmVMOVI16AsmOperandByteReplicate :
+  AsmOperandClass {
+  let Name = "NEONi16vmovByteReplicate";
+  let PredicateMethod = "isNEONi16ByteReplicate";
+  let RenderMethod = "addNEONvmovByteReplicateOperands";
+}
+def nImmVMOVI32AsmOperandByteReplicate :
+  AsmOperandClass {
+  let Name = "NEONi32vmovByteReplicate";
+  let PredicateMethod = "isNEONi32ByteReplicate";
+  let RenderMethod = "addNEONvmovByteReplicateOperands";
+}
+def nImmVMVNI16AsmOperandByteReplicate :
+  AsmOperandClass {
+  let Name = "NEONi16invByteReplicate";
+  let PredicateMethod = "isNEONi16ByteReplicate";
+  let RenderMethod = "addNEONinvByteReplicateOperands";
+}
+def nImmVMVNI32AsmOperandByteReplicate :
+  AsmOperandClass {
+  let Name = "NEONi32invByteReplicate";
+  let PredicateMethod = "isNEONi32ByteReplicate";
+  let RenderMethod = "addNEONinvByteReplicateOperands";
+}
+
+def nImmVMOVI16ByteReplicate : Operand<i32> {
+  let PrintMethod = "printNEONModImmOperand";
+  let ParserMatchClass = nImmVMOVI16AsmOperandByteReplicate;
+}
+def nImmVMOVI32ByteReplicate : Operand<i32> {
+  let PrintMethod = "printNEONModImmOperand";
+  let ParserMatchClass = nImmVMOVI32AsmOperandByteReplicate;
+}
+def nImmVMVNI16ByteReplicate : Operand<i32> {
+  let PrintMethod = "printNEONModImmOperand";
+  let ParserMatchClass = nImmVMVNI16AsmOperandByteReplicate;
+}
+def nImmVMVNI32ByteReplicate : Operand<i32> {
+  let PrintMethod = "printNEONModImmOperand";
+  let ParserMatchClass = nImmVMVNI32AsmOperandByteReplicate;
+}
+
 def nImmVMOVI32NegAsmOperand : AsmOperandClass { let Name = "NEONi32vmovNeg"; }
 def nImmVMOVI32Neg : Operand<i32> {
   let PrintMethod = "printNEONModImmOperand";
@@ -617,37 +660,37 @@ class VLDQQQQWBPseudo<InstrItinClass itin>
 let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1 in {
 
 //   VLD1     : Vector Load (multiple single elements)
-class VLD1D<bits<4> op7_4, string Dt>
+class VLD1D<bits<4> op7_4, string Dt, Operand AddrMode>
   : NLdSt<0,0b10,0b0111,op7_4, (outs VecListOneD:$Vd),
-          (ins addrmode6:$Rn), IIC_VLD1,
+          (ins AddrMode:$Rn), IIC_VLD1,
           "vld1", Dt, "$Vd, $Rn", "", []> {
   let Rm = 0b1111;
   let Inst{4} = Rn{4};
   let DecoderMethod = "DecodeVLDST1Instruction";
 }
-class VLD1Q<bits<4> op7_4, string Dt>
+class VLD1Q<bits<4> op7_4, string Dt, Operand AddrMode>
   : NLdSt<0,0b10,0b1010,op7_4, (outs VecListDPair:$Vd),
-          (ins addrmode6:$Rn), IIC_VLD1x2,
+          (ins AddrMode:$Rn), IIC_VLD1x2,
           "vld1", Dt, "$Vd, $Rn", "", []> {
   let Rm = 0b1111;
   let Inst{5-4} = Rn{5-4};
   let DecoderMethod = "DecodeVLDST1Instruction";
 }
 
-def  VLD1d8   : VLD1D<{0,0,0,?}, "8">;
-def  VLD1d16  : VLD1D<{0,1,0,?}, "16">;
-def  VLD1d32  : VLD1D<{1,0,0,?}, "32">;
-def  VLD1d64  : VLD1D<{1,1,0,?}, "64">;
+def  VLD1d8   : VLD1D<{0,0,0,?}, "8",  addrmode6align64>;
+def  VLD1d16  : VLD1D<{0,1,0,?}, "16", addrmode6align64>;
+def  VLD1d32  : VLD1D<{1,0,0,?}, "32", addrmode6align64>;
+def  VLD1d64  : VLD1D<{1,1,0,?}, "64", addrmode6align64>;
 
-def  VLD1q8   : VLD1Q<{0,0,?,?}, "8">;
-def  VLD1q16  : VLD1Q<{0,1,?,?}, "16">;
-def  VLD1q32  : VLD1Q<{1,0,?,?}, "32">;
-def  VLD1q64  : VLD1Q<{1,1,?,?}, "64">;
+def  VLD1q8   : VLD1Q<{0,0,?,?}, "8",  addrmode6align64or128>;
+def  VLD1q16  : VLD1Q<{0,1,?,?}, "16", addrmode6align64or128>;
+def  VLD1q32  : VLD1Q<{1,0,?,?}, "32", addrmode6align64or128>;
+def  VLD1q64  : VLD1Q<{1,1,?,?}, "64", addrmode6align64or128>;
 
 // ...with address register writeback:
-multiclass VLD1DWB<bits<4> op7_4, string Dt> {
+multiclass VLD1DWB<bits<4> op7_4, string Dt, Operand AddrMode> {
   def _fixed : NLdSt<0,0b10, 0b0111,op7_4, (outs VecListOneD:$Vd, GPR:$wb),
-                     (ins addrmode6:$Rn), IIC_VLD1u,
+                     (ins AddrMode:$Rn), IIC_VLD1u,
                      "vld1", Dt, "$Vd, $Rn!",
                      "$Rn.addr = $wb", []> {
     let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
@@ -655,16 +698,16 @@ multiclass VLD1DWB<bits<4> op7_4, string Dt> {
     let DecoderMethod = "DecodeVLDST1Instruction";
   }
   def _register : NLdSt<0,0b10,0b0111,op7_4, (outs VecListOneD:$Vd, GPR:$wb),
-                        (ins addrmode6:$Rn, rGPR:$Rm), IIC_VLD1u,
+                        (ins AddrMode:$Rn, rGPR:$Rm), IIC_VLD1u,
                         "vld1", Dt, "$Vd, $Rn, $Rm",
                         "$Rn.addr = $wb", []> {
     let Inst{4} = Rn{4};
     let DecoderMethod = "DecodeVLDST1Instruction";
   }
 }
-multiclass VLD1QWB<bits<4> op7_4, string Dt> {
+multiclass VLD1QWB<bits<4> op7_4, string Dt, Operand AddrMode> {
   def _fixed : NLdSt<0,0b10,0b1010,op7_4, (outs VecListDPair:$Vd, GPR:$wb),
-                    (ins addrmode6:$Rn), IIC_VLD1x2u,
+                    (ins AddrMode:$Rn), IIC_VLD1x2u,
                      "vld1", Dt, "$Vd, $Rn!",
                      "$Rn.addr = $wb", []> {
     let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
@@ -672,7 +715,7 @@ multiclass VLD1QWB<bits<4> op7_4, string Dt> {
     let DecoderMethod = "DecodeVLDST1Instruction";
   }
   def _register : NLdSt<0,0b10,0b1010,op7_4, (outs VecListDPair:$Vd, GPR:$wb),
-                        (ins addrmode6:$Rn, rGPR:$Rm), IIC_VLD1x2u,
+                        (ins AddrMode:$Rn, rGPR:$Rm), IIC_VLD1x2u,
                         "vld1", Dt, "$Vd, $Rn, $Rm",
                         "$Rn.addr = $wb", []> {
     let Inst{5-4} = Rn{5-4};
@@ -680,27 +723,27 @@ multiclass VLD1QWB<bits<4> op7_4, string Dt> {
   }
 }
 
-defm VLD1d8wb  : VLD1DWB<{0,0,0,?}, "8">;
-defm VLD1d16wb : VLD1DWB<{0,1,0,?}, "16">;
-defm VLD1d32wb : VLD1DWB<{1,0,0,?}, "32">;
-defm VLD1d64wb : VLD1DWB<{1,1,0,?}, "64">;
-defm VLD1q8wb  : VLD1QWB<{0,0,?,?}, "8">;
-defm VLD1q16wb : VLD1QWB<{0,1,?,?}, "16">;
-defm VLD1q32wb : VLD1QWB<{1,0,?,?}, "32">;
-defm VLD1q64wb : VLD1QWB<{1,1,?,?}, "64">;
+defm VLD1d8wb  : VLD1DWB<{0,0,0,?}, "8",  addrmode6align64>;
+defm VLD1d16wb : VLD1DWB<{0,1,0,?}, "16", addrmode6align64>;
+defm VLD1d32wb : VLD1DWB<{1,0,0,?}, "32", addrmode6align64>;
+defm VLD1d64wb : VLD1DWB<{1,1,0,?}, "64", addrmode6align64>;
+defm VLD1q8wb  : VLD1QWB<{0,0,?,?}, "8",  addrmode6align64or128>;
+defm VLD1q16wb : VLD1QWB<{0,1,?,?}, "16", addrmode6align64or128>;
+defm VLD1q32wb : VLD1QWB<{1,0,?,?}, "32", addrmode6align64or128>;
+defm VLD1q64wb : VLD1QWB<{1,1,?,?}, "64", addrmode6align64or128>;
 
 // ...with 3 registers
-class VLD1D3<bits<4> op7_4, string Dt>
+class VLD1D3<bits<4> op7_4, string Dt, Operand AddrMode>
   : NLdSt<0,0b10,0b0110,op7_4, (outs VecListThreeD:$Vd),
-          (ins addrmode6:$Rn), IIC_VLD1x3, "vld1", Dt,
+          (ins AddrMode:$Rn), IIC_VLD1x3, "vld1", Dt,
           "$Vd, $Rn", "", []> {
   let Rm = 0b1111;
   let Inst{4} = Rn{4};
   let DecoderMethod = "DecodeVLDST1Instruction";
 }
-multiclass VLD1D3WB<bits<4> op7_4, string Dt> {
+multiclass VLD1D3WB<bits<4> op7_4, string Dt, Operand AddrMode> {
   def _fixed : NLdSt<0,0b10,0b0110, op7_4, (outs VecListThreeD:$Vd, GPR:$wb),
-                    (ins addrmode6:$Rn), IIC_VLD1x2u,
+                    (ins AddrMode:$Rn), IIC_VLD1x2u,
                      "vld1", Dt, "$Vd, $Rn!",
                      "$Rn.addr = $wb", []> {
     let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
@@ -708,7 +751,7 @@ multiclass VLD1D3WB<bits<4> op7_4, string Dt> {
     let DecoderMethod = "DecodeVLDST1Instruction";
   }
   def _register : NLdSt<0,0b10,0b0110,op7_4, (outs VecListThreeD:$Vd, GPR:$wb),
-                        (ins addrmode6:$Rn, rGPR:$Rm), IIC_VLD1x2u,
+                        (ins AddrMode:$Rn, rGPR:$Rm), IIC_VLD1x2u,
                         "vld1", Dt, "$Vd, $Rn, $Rm",
                         "$Rn.addr = $wb", []> {
     let Inst{4} = Rn{4};
@@ -716,32 +759,32 @@ multiclass VLD1D3WB<bits<4> op7_4, string Dt> {
   }
 }
 
-def VLD1d8T      : VLD1D3<{0,0,0,?}, "8">;
-def VLD1d16T     : VLD1D3<{0,1,0,?}, "16">;
-def VLD1d32T     : VLD1D3<{1,0,0,?}, "32">;
-def VLD1d64T     : VLD1D3<{1,1,0,?}, "64">;
+def VLD1d8T      : VLD1D3<{0,0,0,?}, "8",  addrmode6align64>;
+def VLD1d16T     : VLD1D3<{0,1,0,?}, "16", addrmode6align64>;
+def VLD1d32T     : VLD1D3<{1,0,0,?}, "32", addrmode6align64>;
+def VLD1d64T     : VLD1D3<{1,1,0,?}, "64", addrmode6align64>;
 
-defm VLD1d8Twb  : VLD1D3WB<{0,0,0,?}, "8">;
-defm VLD1d16Twb : VLD1D3WB<{0,1,0,?}, "16">;
-defm VLD1d32Twb : VLD1D3WB<{1,0,0,?}, "32">;
-defm VLD1d64Twb : VLD1D3WB<{1,1,0,?}, "64">;
+defm VLD1d8Twb  : VLD1D3WB<{0,0,0,?}, "8",  addrmode6align64>;
+defm VLD1d16Twb : VLD1D3WB<{0,1,0,?}, "16", addrmode6align64>;
+defm VLD1d32Twb : VLD1D3WB<{1,0,0,?}, "32", addrmode6align64>;
+defm VLD1d64Twb : VLD1D3WB<{1,1,0,?}, "64", addrmode6align64>;
 
 def VLD1d64TPseudo : VLDQQPseudo<IIC_VLD1x3>;
 def VLD1d64TPseudoWB_fixed : VLDQQWBfixedPseudo<IIC_VLD1x3>;
 def VLD1d64TPseudoWB_register : VLDQQWBregisterPseudo<IIC_VLD1x3>;
 
 // ...with 4 registers
-class VLD1D4<bits<4> op7_4, string Dt>
+class VLD1D4<bits<4> op7_4, string Dt, Operand AddrMode>
   : NLdSt<0, 0b10, 0b0010, op7_4, (outs VecListFourD:$Vd),
-          (ins addrmode6:$Rn), IIC_VLD1x4, "vld1", Dt,
+          (ins AddrMode:$Rn), IIC_VLD1x4, "vld1", Dt,
           "$Vd, $Rn", "", []> {
   let Rm = 0b1111;
   let Inst{5-4} = Rn{5-4};
   let DecoderMethod = "DecodeVLDST1Instruction";
 }
-multiclass VLD1D4WB<bits<4> op7_4, string Dt> {
+multiclass VLD1D4WB<bits<4> op7_4, string Dt, Operand AddrMode> {
   def _fixed : NLdSt<0,0b10,0b0010, op7_4, (outs VecListFourD:$Vd, GPR:$wb),
-                    (ins addrmode6:$Rn), IIC_VLD1x2u,
+                    (ins AddrMode:$Rn), IIC_VLD1x2u,
                      "vld1", Dt, "$Vd, $Rn!",
                      "$Rn.addr = $wb", []> {
     let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
@@ -749,7 +792,7 @@ multiclass VLD1D4WB<bits<4> op7_4, string Dt> {
     let DecoderMethod = "DecodeVLDST1Instruction";
   }
   def _register : NLdSt<0,0b10,0b0010,op7_4, (outs VecListFourD:$Vd, GPR:$wb),
-                        (ins addrmode6:$Rn, rGPR:$Rm), IIC_VLD1x2u,
+                        (ins AddrMode:$Rn, rGPR:$Rm), IIC_VLD1x2u,
                         "vld1", Dt, "$Vd, $Rn, $Rm",
                         "$Rn.addr = $wb", []> {
     let Inst{5-4} = Rn{5-4};
@@ -757,15 +800,15 @@ multiclass VLD1D4WB<bits<4> op7_4, string Dt> {
   }
 }
 
-def VLD1d8Q      : VLD1D4<{0,0,?,?}, "8">;
-def VLD1d16Q     : VLD1D4<{0,1,?,?}, "16">;
-def VLD1d32Q     : VLD1D4<{1,0,?,?}, "32">;
-def VLD1d64Q     : VLD1D4<{1,1,?,?}, "64">;
+def VLD1d8Q      : VLD1D4<{0,0,?,?}, "8",  addrmode6align64or128or256>;
+def VLD1d16Q     : VLD1D4<{0,1,?,?}, "16", addrmode6align64or128or256>;
+def VLD1d32Q     : VLD1D4<{1,0,?,?}, "32", addrmode6align64or128or256>;
+def VLD1d64Q     : VLD1D4<{1,1,?,?}, "64", addrmode6align64or128or256>;
 
-defm VLD1d8Qwb   : VLD1D4WB<{0,0,?,?}, "8">;
-defm VLD1d16Qwb  : VLD1D4WB<{0,1,?,?}, "16">;
-defm VLD1d32Qwb  : VLD1D4WB<{1,0,?,?}, "32">;
-defm VLD1d64Qwb  : VLD1D4WB<{1,1,?,?}, "64">;
+defm VLD1d8Qwb   : VLD1D4WB<{0,0,?,?}, "8",  addrmode6align64or128or256>;
+defm VLD1d16Qwb  : VLD1D4WB<{0,1,?,?}, "16", addrmode6align64or128or256>;
+defm VLD1d32Qwb  : VLD1D4WB<{1,0,?,?}, "32", addrmode6align64or128or256>;
+defm VLD1d64Qwb  : VLD1D4WB<{1,1,?,?}, "64", addrmode6align64or128or256>;
 
 def VLD1d64QPseudo : VLDQQPseudo<IIC_VLD1x4>;
 def VLD1d64QPseudoWB_fixed : VLDQQWBfixedPseudo<IIC_VLD1x4>;
@@ -773,22 +816,28 @@ def VLD1d64QPseudoWB_register : VLDQQWBregisterPseudo<IIC_VLD1x4>;
 
 //   VLD2     : Vector Load (multiple 2-element structures)
 class VLD2<bits<4> op11_8, bits<4> op7_4, string Dt, RegisterOperand VdTy,
-           InstrItinClass itin>
+           InstrItinClass itin, Operand AddrMode>
   : NLdSt<0, 0b10, op11_8, op7_4, (outs VdTy:$Vd),
-          (ins addrmode6:$Rn), itin,
+          (ins AddrMode:$Rn), itin,
           "vld2", Dt, "$Vd, $Rn", "", []> {
   let Rm = 0b1111;
   let Inst{5-4} = Rn{5-4};
   let DecoderMethod = "DecodeVLDST2Instruction";
 }
 
-def  VLD2d8   : VLD2<0b1000, {0,0,?,?}, "8", VecListDPair, IIC_VLD2>;
-def  VLD2d16  : VLD2<0b1000, {0,1,?,?}, "16", VecListDPair, IIC_VLD2>;
-def  VLD2d32  : VLD2<0b1000, {1,0,?,?}, "32", VecListDPair, IIC_VLD2>;
+def  VLD2d8   : VLD2<0b1000, {0,0,?,?}, "8", VecListDPair, IIC_VLD2,
+                     addrmode6align64or128>;
+def  VLD2d16  : VLD2<0b1000, {0,1,?,?}, "16", VecListDPair, IIC_VLD2,
+                     addrmode6align64or128>;
+def  VLD2d32  : VLD2<0b1000, {1,0,?,?}, "32", VecListDPair, IIC_VLD2,
+                     addrmode6align64or128>;
 
-def  VLD2q8   : VLD2<0b0011, {0,0,?,?}, "8", VecListFourD, IIC_VLD2x2>;
-def  VLD2q16  : VLD2<0b0011, {0,1,?,?}, "16", VecListFourD, IIC_VLD2x2>;
-def  VLD2q32  : VLD2<0b0011, {1,0,?,?}, "32", VecListFourD, IIC_VLD2x2>;
+def  VLD2q8   : VLD2<0b0011, {0,0,?,?}, "8", VecListFourD, IIC_VLD2x2,
+                     addrmode6align64or128or256>;
+def  VLD2q16  : VLD2<0b0011, {0,1,?,?}, "16", VecListFourD, IIC_VLD2x2,
+                     addrmode6align64or128or256>;
+def  VLD2q32  : VLD2<0b0011, {1,0,?,?}, "32", VecListFourD, IIC_VLD2x2,
+                     addrmode6align64or128or256>;
 
 def  VLD2q8Pseudo  : VLDQQPseudo<IIC_VLD2x2>;
 def  VLD2q16Pseudo : VLDQQPseudo<IIC_VLD2x2>;
@@ -796,9 +845,9 @@ def  VLD2q32Pseudo : VLDQQPseudo<IIC_VLD2x2>;
 
 // ...with address register writeback:
 multiclass VLD2WB<bits<4> op11_8, bits<4> op7_4, string Dt,
-                  RegisterOperand VdTy, InstrItinClass itin> {
+                  RegisterOperand VdTy, InstrItinClass itin, Operand AddrMode> {
   def _fixed : NLdSt<0, 0b10, op11_8, op7_4, (outs VdTy:$Vd, GPR:$wb),
-                     (ins addrmode6:$Rn), itin,
+                     (ins AddrMode:$Rn), itin,
                      "vld2", Dt, "$Vd, $Rn!",
                      "$Rn.addr = $wb", []> {
     let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
@@ -806,7 +855,7 @@ multiclass VLD2WB<bits<4> op11_8, bits<4> op7_4, string Dt,
     let DecoderMethod = "DecodeVLDST2Instruction";
   }
   def _register : NLdSt<0, 0b10, op11_8, op7_4, (outs VdTy:$Vd, GPR:$wb),
-                        (ins addrmode6:$Rn, rGPR:$Rm), itin,
+                        (ins AddrMode:$Rn, rGPR:$Rm), itin,
                         "vld2", Dt, "$Vd, $Rn, $Rm",
                         "$Rn.addr = $wb", []> {
     let Inst{5-4} = Rn{5-4};
@@ -814,13 +863,19 @@ multiclass VLD2WB<bits<4> op11_8, bits<4> op7_4, string Dt,
   }
 }
 
-defm VLD2d8wb  : VLD2WB<0b1000, {0,0,?,?}, "8", VecListDPair, IIC_VLD2u>;
-defm VLD2d16wb : VLD2WB<0b1000, {0,1,?,?}, "16", VecListDPair, IIC_VLD2u>;
-defm VLD2d32wb : VLD2WB<0b1000, {1,0,?,?}, "32", VecListDPair, IIC_VLD2u>;
+defm VLD2d8wb  : VLD2WB<0b1000, {0,0,?,?}, "8", VecListDPair, IIC_VLD2u,
+                        addrmode6align64or128>;
+defm VLD2d16wb : VLD2WB<0b1000, {0,1,?,?}, "16", VecListDPair, IIC_VLD2u,
+                        addrmode6align64or128>;
+defm VLD2d32wb : VLD2WB<0b1000, {1,0,?,?}, "32", VecListDPair, IIC_VLD2u,
+                        addrmode6align64or128>;
 
-defm VLD2q8wb  : VLD2WB<0b0011, {0,0,?,?}, "8", VecListFourD, IIC_VLD2x2u>;
-defm VLD2q16wb : VLD2WB<0b0011, {0,1,?,?}, "16", VecListFourD, IIC_VLD2x2u>;
-defm VLD2q32wb : VLD2WB<0b0011, {1,0,?,?}, "32", VecListFourD, IIC_VLD2x2u>;
+defm VLD2q8wb  : VLD2WB<0b0011, {0,0,?,?}, "8", VecListFourD, IIC_VLD2x2u,
+                        addrmode6align64or128or256>;
+defm VLD2q16wb : VLD2WB<0b0011, {0,1,?,?}, "16", VecListFourD, IIC_VLD2x2u,
+                        addrmode6align64or128or256>;
+defm VLD2q32wb : VLD2WB<0b0011, {1,0,?,?}, "32", VecListFourD, IIC_VLD2x2u,
+                        addrmode6align64or128or256>;
 
 def VLD2q8PseudoWB_fixed     : VLDQQWBfixedPseudo<IIC_VLD2x2u>;
 def VLD2q16PseudoWB_fixed    : VLDQQWBfixedPseudo<IIC_VLD2x2u>;
@@ -830,12 +885,18 @@ def VLD2q16PseudoWB_register : VLDQQWBregisterPseudo<IIC_VLD2x2u>;
 def VLD2q32PseudoWB_register : VLDQQWBregisterPseudo<IIC_VLD2x2u>;
 
 // ...with double-spaced registers
-def  VLD2b8    : VLD2<0b1001, {0,0,?,?}, "8", VecListDPairSpaced, IIC_VLD2>;
-def  VLD2b16   : VLD2<0b1001, {0,1,?,?}, "16", VecListDPairSpaced, IIC_VLD2>;
-def  VLD2b32   : VLD2<0b1001, {1,0,?,?}, "32", VecListDPairSpaced, IIC_VLD2>;
-defm VLD2b8wb  : VLD2WB<0b1001, {0,0,?,?}, "8", VecListDPairSpaced, IIC_VLD2u>;
-defm VLD2b16wb : VLD2WB<0b1001, {0,1,?,?}, "16", VecListDPairSpaced, IIC_VLD2u>;
-defm VLD2b32wb : VLD2WB<0b1001, {1,0,?,?}, "32", VecListDPairSpaced, IIC_VLD2u>;
+def  VLD2b8    : VLD2<0b1001, {0,0,?,?}, "8", VecListDPairSpaced, IIC_VLD2,
+                      addrmode6align64or128>;
+def  VLD2b16   : VLD2<0b1001, {0,1,?,?}, "16", VecListDPairSpaced, IIC_VLD2,
+                      addrmode6align64or128>;
+def  VLD2b32   : VLD2<0b1001, {1,0,?,?}, "32", VecListDPairSpaced, IIC_VLD2,
+                      addrmode6align64or128>;
+defm VLD2b8wb  : VLD2WB<0b1001, {0,0,?,?}, "8", VecListDPairSpaced, IIC_VLD2u,
+                        addrmode6align64or128>;
+defm VLD2b16wb : VLD2WB<0b1001, {0,1,?,?}, "16", VecListDPairSpaced, IIC_VLD2u,
+                        addrmode6align64or128>;
+defm VLD2b32wb : VLD2WB<0b1001, {1,0,?,?}, "32", VecListDPairSpaced, IIC_VLD2u,
+                        addrmode6align64or128>;
 
 //   VLD3     : Vector Load (multiple 3-element structures)
 class VLD3D<bits<4> op11_8, bits<4> op7_4, string Dt>
@@ -1293,47 +1354,55 @@ def VLD4LNq32Pseudo_UPD : VLDQQQQLNWBPseudo<IIC_VLD4lnu>;
 } // mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1
 
 //   VLD1DUP  : Vector Load (single element to all lanes)
-class VLD1DUP<bits<4> op7_4, string Dt, ValueType Ty, PatFrag LoadOp>
+class VLD1DUP<bits<4> op7_4, string Dt, ValueType Ty, PatFrag LoadOp,
+              Operand AddrMode>
   : NLdSt<1, 0b10, 0b1100, op7_4, (outs VecListOneDAllLanes:$Vd),
-          (ins addrmode6dup:$Rn),
+          (ins AddrMode:$Rn),
           IIC_VLD1dup, "vld1", Dt, "$Vd, $Rn", "",
           [(set VecListOneDAllLanes:$Vd,
-                (Ty (NEONvdup (i32 (LoadOp addrmode6dup:$Rn)))))]> {
+                (Ty (NEONvdup (i32 (LoadOp AddrMode:$Rn)))))]> {
   let Rm = 0b1111;
   let Inst{4} = Rn{4};
   let DecoderMethod = "DecodeVLD1DupInstruction";
 }
-def VLD1DUPd8  : VLD1DUP<{0,0,0,?}, "8", v8i8, extloadi8>;
-def VLD1DUPd16 : VLD1DUP<{0,1,0,?}, "16", v4i16, extloadi16>;
-def VLD1DUPd32 : VLD1DUP<{1,0,0,?}, "32", v2i32, load>;
+def VLD1DUPd8  : VLD1DUP<{0,0,0,?}, "8", v8i8, extloadi8,
+                         addrmode6dupalignNone>;
+def VLD1DUPd16 : VLD1DUP<{0,1,0,?}, "16", v4i16, extloadi16,
+                         addrmode6dupalign16>;
+def VLD1DUPd32 : VLD1DUP<{1,0,0,?}, "32", v2i32, load,
+                         addrmode6dupalign32>;
 
 def : Pat<(v2f32 (NEONvdup (f32 (load addrmode6dup:$addr)))),
           (VLD1DUPd32 addrmode6:$addr)>;
 
-class VLD1QDUP<bits<4> op7_4, string Dt, ValueType Ty, PatFrag LoadOp>
+class VLD1QDUP<bits<4> op7_4, string Dt, ValueType Ty, PatFrag LoadOp,
+               Operand AddrMode>
   : NLdSt<1, 0b10, 0b1100, op7_4, (outs VecListDPairAllLanes:$Vd),
-          (ins addrmode6dup:$Rn), IIC_VLD1dup,
+          (ins AddrMode:$Rn), IIC_VLD1dup,
           "vld1", Dt, "$Vd, $Rn", "",
           [(set VecListDPairAllLanes:$Vd,
-                (Ty (NEONvdup (i32 (LoadOp addrmode6dup:$Rn)))))]> {
+                (Ty (NEONvdup (i32 (LoadOp AddrMode:$Rn)))))]> {
   let Rm = 0b1111;
   let Inst{4} = Rn{4};
   let DecoderMethod = "DecodeVLD1DupInstruction";
 }
 
-def VLD1DUPq8  : VLD1QDUP<{0,0,1,0}, "8", v16i8, extloadi8>;
-def VLD1DUPq16 : VLD1QDUP<{0,1,1,?}, "16", v8i16, extloadi16>;
-def VLD1DUPq32 : VLD1QDUP<{1,0,1,?}, "32", v4i32, load>;
+def VLD1DUPq8  : VLD1QDUP<{0,0,1,0}, "8", v16i8, extloadi8,
+                          addrmode6dupalignNone>;
+def VLD1DUPq16 : VLD1QDUP<{0,1,1,?}, "16", v8i16, extloadi16,
+                          addrmode6dupalign16>;
+def VLD1DUPq32 : VLD1QDUP<{1,0,1,?}, "32", v4i32, load,
+                          addrmode6dupalign32>;
 
 def : Pat<(v4f32 (NEONvdup (f32 (load addrmode6dup:$addr)))),
           (VLD1DUPq32 addrmode6:$addr)>;
 
 let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1 in {
 // ...with address register writeback:
-multiclass VLD1DUPWB<bits<4> op7_4, string Dt> {
+multiclass VLD1DUPWB<bits<4> op7_4, string Dt, Operand AddrMode> {
   def _fixed : NLdSt<1, 0b10, 0b1100, op7_4,
                      (outs VecListOneDAllLanes:$Vd, GPR:$wb),
-                     (ins addrmode6dup:$Rn), IIC_VLD1dupu,
+                     (ins AddrMode:$Rn), IIC_VLD1dupu,
                      "vld1", Dt, "$Vd, $Rn!",
                      "$Rn.addr = $wb", []> {
     let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
@@ -1342,17 +1411,17 @@ multiclass VLD1DUPWB<bits<4> op7_4, string Dt> {
   }
   def _register : NLdSt<1, 0b10, 0b1100, op7_4,
                         (outs VecListOneDAllLanes:$Vd, GPR:$wb),
-                        (ins addrmode6dup:$Rn, rGPR:$Rm), IIC_VLD1dupu,
+                        (ins AddrMode:$Rn, rGPR:$Rm), IIC_VLD1dupu,
                         "vld1", Dt, "$Vd, $Rn, $Rm",
                         "$Rn.addr = $wb", []> {
     let Inst{4} = Rn{4};
     let DecoderMethod = "DecodeVLD1DupInstruction";
   }
 }
-multiclass VLD1QDUPWB<bits<4> op7_4, string Dt> {
+multiclass VLD1QDUPWB<bits<4> op7_4, string Dt, Operand AddrMode> {
   def _fixed : NLdSt<1, 0b10, 0b1100, op7_4,
                      (outs VecListDPairAllLanes:$Vd, GPR:$wb),
-                     (ins addrmode6dup:$Rn), IIC_VLD1dupu,
+                     (ins AddrMode:$Rn), IIC_VLD1dupu,
                      "vld1", Dt, "$Vd, $Rn!",
                      "$Rn.addr = $wb", []> {
     let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
@@ -1361,7 +1430,7 @@ multiclass VLD1QDUPWB<bits<4> op7_4, string Dt> {
   }
   def _register : NLdSt<1, 0b10, 0b1100, op7_4,
                         (outs VecListDPairAllLanes:$Vd, GPR:$wb),
-                        (ins addrmode6dup:$Rn, rGPR:$Rm), IIC_VLD1dupu,
+                        (ins AddrMode:$Rn, rGPR:$Rm), IIC_VLD1dupu,
                         "vld1", Dt, "$Vd, $Rn, $Rm",
                         "$Rn.addr = $wb", []> {
     let Inst{4} = Rn{4};
@@ -1369,38 +1438,47 @@ multiclass VLD1QDUPWB<bits<4> op7_4, string Dt> {
   }
 }
 
-defm VLD1DUPd8wb  : VLD1DUPWB<{0,0,0,0}, "8">;
-defm VLD1DUPd16wb : VLD1DUPWB<{0,1,0,?}, "16">;
-defm VLD1DUPd32wb : VLD1DUPWB<{1,0,0,?}, "32">;
+defm VLD1DUPd8wb  : VLD1DUPWB<{0,0,0,0}, "8", addrmode6dupalignNone>;
+defm VLD1DUPd16wb : VLD1DUPWB<{0,1,0,?}, "16", addrmode6dupalign16>;
+defm VLD1DUPd32wb : VLD1DUPWB<{1,0,0,?}, "32", addrmode6dupalign32>;
 
-defm VLD1DUPq8wb  : VLD1QDUPWB<{0,0,1,0}, "8">;
-defm VLD1DUPq16wb : VLD1QDUPWB<{0,1,1,?}, "16">;
-defm VLD1DUPq32wb : VLD1QDUPWB<{1,0,1,?}, "32">;
+defm VLD1DUPq8wb  : VLD1QDUPWB<{0,0,1,0}, "8", addrmode6dupalignNone>;
+defm VLD1DUPq16wb : VLD1QDUPWB<{0,1,1,?}, "16", addrmode6dupalign16>;
+defm VLD1DUPq32wb : VLD1QDUPWB<{1,0,1,?}, "32", addrmode6dupalign32>;
 
 //   VLD2DUP  : Vector Load (single 2-element structure to all lanes)
-class VLD2DUP<bits<4> op7_4, string Dt, RegisterOperand VdTy>
+class VLD2DUP<bits<4> op7_4, string Dt, RegisterOperand VdTy, Operand AddrMode>
   : NLdSt<1, 0b10, 0b1101, op7_4, (outs VdTy:$Vd),
-          (ins addrmode6dup:$Rn), IIC_VLD2dup,
+          (ins AddrMode:$Rn), IIC_VLD2dup,
           "vld2", Dt, "$Vd, $Rn", "", []> {
   let Rm = 0b1111;
   let Inst{4} = Rn{4};
   let DecoderMethod = "DecodeVLD2DupInstruction";
 }
 
-def VLD2DUPd8  : VLD2DUP<{0,0,0,?}, "8",  VecListDPairAllLanes>;
-def VLD2DUPd16 : VLD2DUP<{0,1,0,?}, "16", VecListDPairAllLanes>;
-def VLD2DUPd32 : VLD2DUP<{1,0,0,?}, "32", VecListDPairAllLanes>;
+def VLD2DUPd8  : VLD2DUP<{0,0,0,?}, "8",  VecListDPairAllLanes,
+                         addrmode6dupalign16>;
+def VLD2DUPd16 : VLD2DUP<{0,1,0,?}, "16", VecListDPairAllLanes,
+                         addrmode6dupalign32>;
+def VLD2DUPd32 : VLD2DUP<{1,0,0,?}, "32", VecListDPairAllLanes,
+                         addrmode6dupalign64>;
 
+// HACK this one, VLD2DUPd8x2 must be changed at the same time with VLD2b8 or
+// "vld2.8 {d0[], d2[]}, [r4:32]" will become "vld2.8 {d0, d2}, [r4:32]".
 // ...with double-spaced registers
-def VLD2DUPd8x2  : VLD2DUP<{0,0,1,?}, "8",  VecListDPairSpacedAllLanes>;
-def VLD2DUPd16x2 : VLD2DUP<{0,1,1,?}, "16", VecListDPairSpacedAllLanes>;
-def VLD2DUPd32x2 : VLD2DUP<{1,0,1,?}, "32", VecListDPairSpacedAllLanes>;
+def VLD2DUPd8x2  : VLD2DUP<{0,0,1,?}, "8",  VecListDPairSpacedAllLanes,
+                           addrmode6dupalign16>;
+def VLD2DUPd16x2 : VLD2DUP<{0,1,1,?}, "16", VecListDPairSpacedAllLanes,
+                           addrmode6dupalign32>;
+def VLD2DUPd32x2 : VLD2DUP<{1,0,1,?}, "32", VecListDPairSpacedAllLanes,
+                           addrmode6dupalign64>;
 
 // ...with address register writeback:
-multiclass VLD2DUPWB<bits<4> op7_4, string Dt, RegisterOperand VdTy> {
+multiclass VLD2DUPWB<bits<4> op7_4, string Dt, RegisterOperand VdTy,
+                     Operand AddrMode> {
   def _fixed : NLdSt<1, 0b10, 0b1101, op7_4,
                      (outs VdTy:$Vd, GPR:$wb),
-                     (ins addrmode6dup:$Rn), IIC_VLD2dupu,
+                     (ins AddrMode:$Rn), IIC_VLD2dupu,
                      "vld2", Dt, "$Vd, $Rn!",
                      "$Rn.addr = $wb", []> {
     let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
@@ -1409,7 +1487,7 @@ multiclass VLD2DUPWB<bits<4> op7_4, string Dt, RegisterOperand VdTy> {
   }
   def _register : NLdSt<1, 0b10, 0b1101, op7_4,
                         (outs VdTy:$Vd, GPR:$wb),
-                        (ins addrmode6dup:$Rn, rGPR:$Rm), IIC_VLD2dupu,
+                        (ins AddrMode:$Rn, rGPR:$Rm), IIC_VLD2dupu,
                         "vld2", Dt, "$Vd, $Rn, $Rm",
                         "$Rn.addr = $wb", []> {
     let Inst{4} = Rn{4};
@@ -1417,13 +1495,19 @@ multiclass VLD2DUPWB<bits<4> op7_4, string Dt, RegisterOperand VdTy> {
   }
 }
 
-defm VLD2DUPd8wb    : VLD2DUPWB<{0,0,0,0}, "8",  VecListDPairAllLanes>;
-defm VLD2DUPd16wb   : VLD2DUPWB<{0,1,0,?}, "16", VecListDPairAllLanes>;
-defm VLD2DUPd32wb   : VLD2DUPWB<{1,0,0,?}, "32", VecListDPairAllLanes>;
+defm VLD2DUPd8wb    : VLD2DUPWB<{0,0,0,0}, "8",  VecListDPairAllLanes,
+                                addrmode6dupalign16>;
+defm VLD2DUPd16wb   : VLD2DUPWB<{0,1,0,?}, "16", VecListDPairAllLanes,
+                                addrmode6dupalign32>;
+defm VLD2DUPd32wb   : VLD2DUPWB<{1,0,0,?}, "32", VecListDPairAllLanes,
+                                addrmode6dupalign64>;
 
-defm VLD2DUPd8x2wb  : VLD2DUPWB<{0,0,1,0}, "8",  VecListDPairSpacedAllLanes>;
-defm VLD2DUPd16x2wb : VLD2DUPWB<{0,1,1,?}, "16", VecListDPairSpacedAllLanes>;
-defm VLD2DUPd32x2wb : VLD2DUPWB<{1,0,1,?}, "32", VecListDPairSpacedAllLanes>;
+defm VLD2DUPd8x2wb  : VLD2DUPWB<{0,0,1,0}, "8",  VecListDPairSpacedAllLanes,
+                                addrmode6dupalign16>;
+defm VLD2DUPd16x2wb : VLD2DUPWB<{0,1,1,?}, "16", VecListDPairSpacedAllLanes,
+                                addrmode6dupalign32>;
+defm VLD2DUPd32x2wb : VLD2DUPWB<{1,0,1,?}, "32", VecListDPairSpacedAllLanes,
+                                addrmode6dupalign64>;
 
 //   VLD3DUP  : Vector Load (single 3-element structure to all lanes)
 class VLD3DUP<bits<4> op7_4, string Dt>
@@ -1449,22 +1533,22 @@ def VLD3DUPq16 : VLD3DUP<{0,1,1,?}, "16">;
 def VLD3DUPq32 : VLD3DUP<{1,0,1,?}, "32">;
 
 // ...with address register writeback:
-class VLD3DUPWB<bits<4> op7_4, string Dt>
+class VLD3DUPWB<bits<4> op7_4, string Dt, Operand AddrMode>
   : NLdSt<1, 0b10, 0b1110, op7_4, (outs DPR:$Vd, DPR:$dst2, DPR:$dst3, GPR:$wb),
-          (ins addrmode6dup:$Rn, am6offset:$Rm), IIC_VLD3dupu,
+          (ins AddrMode:$Rn, am6offset:$Rm), IIC_VLD3dupu,
           "vld3", Dt, "\\{$Vd[], $dst2[], $dst3[]\\}, $Rn$Rm",
           "$Rn.addr = $wb", []> {
   let Inst{4} = 0;
   let DecoderMethod = "DecodeVLD3DupInstruction";
 }
 
-def VLD3DUPd8_UPD  : VLD3DUPWB<{0,0,0,0}, "8">;
-def VLD3DUPd16_UPD : VLD3DUPWB<{0,1,0,?}, "16">;
-def VLD3DUPd32_UPD : VLD3DUPWB<{1,0,0,?}, "32">;
+def VLD3DUPd8_UPD  : VLD3DUPWB<{0,0,0,0}, "8",  addrmode6dupalign64>;
+def VLD3DUPd16_UPD : VLD3DUPWB<{0,1,0,?}, "16", addrmode6dupalign64>;
+def VLD3DUPd32_UPD : VLD3DUPWB<{1,0,0,?}, "32", addrmode6dupalign64>;
 
-def VLD3DUPq8_UPD  : VLD3DUPWB<{0,0,1,0}, "8">;
-def VLD3DUPq16_UPD : VLD3DUPWB<{0,1,1,?}, "16">;
-def VLD3DUPq32_UPD : VLD3DUPWB<{1,0,1,?}, "32">;
+def VLD3DUPq8_UPD  : VLD3DUPWB<{0,0,1,0}, "8",  addrmode6dupalign64>;
+def VLD3DUPq16_UPD : VLD3DUPWB<{0,1,1,?}, "16", addrmode6dupalign64>;
+def VLD3DUPq32_UPD : VLD3DUPWB<{1,0,1,?}, "32", addrmode6dupalign64>;
 
 def VLD3DUPd8Pseudo_UPD  : VLDQQWBPseudo<IIC_VLD3dupu>;
 def VLD3DUPd16Pseudo_UPD : VLDQQWBPseudo<IIC_VLD3dupu>;
@@ -1560,35 +1644,35 @@ class VSTQQQQWBPseudo<InstrItinClass itin>
                 "$addr.addr = $wb">;
 
 //   VST1     : Vector Store (multiple single elements)
-class VST1D<bits<4> op7_4, string Dt>
-  : NLdSt<0,0b00,0b0111,op7_4, (outs), (ins addrmode6:$Rn, VecListOneD:$Vd),
+class VST1D<bits<4> op7_4, string Dt, Operand AddrMode>
+  : NLdSt<0,0b00,0b0111,op7_4, (outs), (ins AddrMode:$Rn, VecListOneD:$Vd),
           IIC_VST1, "vst1", Dt, "$Vd, $Rn", "", []> {
   let Rm = 0b1111;
   let Inst{4} = Rn{4};
   let DecoderMethod = "DecodeVLDST1Instruction";
 }
-class VST1Q<bits<4> op7_4, string Dt>
-  : NLdSt<0,0b00,0b1010,op7_4, (outs), (ins addrmode6:$Rn, VecListDPair:$Vd),
+class VST1Q<bits<4> op7_4, string Dt, Operand AddrMode>
+  : NLdSt<0,0b00,0b1010,op7_4, (outs), (ins AddrMode:$Rn, VecListDPair:$Vd),
           IIC_VST1x2, "vst1", Dt, "$Vd, $Rn", "", []> {
   let Rm = 0b1111;
   let Inst{5-4} = Rn{5-4};
   let DecoderMethod = "DecodeVLDST1Instruction";
 }
 
-def  VST1d8   : VST1D<{0,0,0,?}, "8">;
-def  VST1d16  : VST1D<{0,1,0,?}, "16">;
-def  VST1d32  : VST1D<{1,0,0,?}, "32">;
-def  VST1d64  : VST1D<{1,1,0,?}, "64">;
+def  VST1d8   : VST1D<{0,0,0,?}, "8",  addrmode6align64>;
+def  VST1d16  : VST1D<{0,1,0,?}, "16", addrmode6align64>;
+def  VST1d32  : VST1D<{1,0,0,?}, "32", addrmode6align64>;
+def  VST1d64  : VST1D<{1,1,0,?}, "64", addrmode6align64>;
 
-def  VST1q8   : VST1Q<{0,0,?,?}, "8">;
-def  VST1q16  : VST1Q<{0,1,?,?}, "16">;
-def  VST1q32  : VST1Q<{1,0,?,?}, "32">;
-def  VST1q64  : VST1Q<{1,1,?,?}, "64">;
+def  VST1q8   : VST1Q<{0,0,?,?}, "8",  addrmode6align64or128>;
+def  VST1q16  : VST1Q<{0,1,?,?}, "16", addrmode6align64or128>;
+def  VST1q32  : VST1Q<{1,0,?,?}, "32", addrmode6align64or128>;
+def  VST1q64  : VST1Q<{1,1,?,?}, "64", addrmode6align64or128>;
 
 // ...with address register writeback:
-multiclass VST1DWB<bits<4> op7_4, string Dt> {
+multiclass VST1DWB<bits<4> op7_4, string Dt, Operand AddrMode> {
   def _fixed : NLdSt<0,0b00, 0b0111,op7_4, (outs GPR:$wb),
-                     (ins addrmode6:$Rn, VecListOneD:$Vd), IIC_VLD1u,
+                     (ins AddrMode:$Rn, VecListOneD:$Vd), IIC_VLD1u,
                      "vst1", Dt, "$Vd, $Rn!",
                      "$Rn.addr = $wb", []> {
     let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
@@ -1596,7 +1680,7 @@ multiclass VST1DWB<bits<4> op7_4, string Dt> {
     let DecoderMethod = "DecodeVLDST1Instruction";
   }
   def _register : NLdSt<0,0b00,0b0111,op7_4, (outs GPR:$wb),
-                        (ins addrmode6:$Rn, rGPR:$Rm, VecListOneD:$Vd),
+                        (ins AddrMode:$Rn, rGPR:$Rm, VecListOneD:$Vd),
                         IIC_VLD1u,
                         "vst1", Dt, "$Vd, $Rn, $Rm",
                         "$Rn.addr = $wb", []> {
@@ -1604,9 +1688,9 @@ multiclass VST1DWB<bits<4> op7_4, string Dt> {
     let DecoderMethod = "DecodeVLDST1Instruction";
   }
 }
-multiclass VST1QWB<bits<4> op7_4, string Dt> {
+multiclass VST1QWB<bits<4> op7_4, string Dt, Operand AddrMode> {
   def _fixed : NLdSt<0,0b00,0b1010,op7_4, (outs GPR:$wb),
-                    (ins addrmode6:$Rn, VecListDPair:$Vd), IIC_VLD1x2u,
+                    (ins AddrMode:$Rn, VecListDPair:$Vd), IIC_VLD1x2u,
                      "vst1", Dt, "$Vd, $Rn!",
                      "$Rn.addr = $wb", []> {
     let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
@@ -1614,7 +1698,7 @@ multiclass VST1QWB<bits<4> op7_4, string Dt> {
     let DecoderMethod = "DecodeVLDST1Instruction";
   }
   def _register : NLdSt<0,0b00,0b1010,op7_4, (outs GPR:$wb),
-                        (ins addrmode6:$Rn, rGPR:$Rm, VecListDPair:$Vd),
+                        (ins AddrMode:$Rn, rGPR:$Rm, VecListDPair:$Vd),
                         IIC_VLD1x2u,
                         "vst1", Dt, "$Vd, $Rn, $Rm",
                         "$Rn.addr = $wb", []> {
@@ -1623,28 +1707,28 @@ multiclass VST1QWB<bits<4> op7_4, string Dt> {
   }
 }
 
-defm VST1d8wb  : VST1DWB<{0,0,0,?}, "8">;
-defm VST1d16wb : VST1DWB<{0,1,0,?}, "16">;
-defm VST1d32wb : VST1DWB<{1,0,0,?}, "32">;
-defm VST1d64wb : VST1DWB<{1,1,0,?}, "64">;
+defm VST1d8wb  : VST1DWB<{0,0,0,?}, "8",  addrmode6align64>;
+defm VST1d16wb : VST1DWB<{0,1,0,?}, "16", addrmode6align64>;
+defm VST1d32wb : VST1DWB<{1,0,0,?}, "32", addrmode6align64>;
+defm VST1d64wb : VST1DWB<{1,1,0,?}, "64", addrmode6align64>;
 
-defm VST1q8wb  : VST1QWB<{0,0,?,?}, "8">;
-defm VST1q16wb : VST1QWB<{0,1,?,?}, "16">;
-defm VST1q32wb : VST1QWB<{1,0,?,?}, "32">;
-defm VST1q64wb : VST1QWB<{1,1,?,?}, "64">;
+defm VST1q8wb  : VST1QWB<{0,0,?,?}, "8",  addrmode6align64or128>;
+defm VST1q16wb : VST1QWB<{0,1,?,?}, "16", addrmode6align64or128>;
+defm VST1q32wb : VST1QWB<{1,0,?,?}, "32", addrmode6align64or128>;
+defm VST1q64wb : VST1QWB<{1,1,?,?}, "64", addrmode6align64or128>;
 
 // ...with 3 registers
-class VST1D3<bits<4> op7_4, string Dt>
+class VST1D3<bits<4> op7_4, string Dt, Operand AddrMode>
   : NLdSt<0, 0b00, 0b0110, op7_4, (outs),
-          (ins addrmode6:$Rn, VecListThreeD:$Vd),
+          (ins AddrMode:$Rn, VecListThreeD:$Vd),
           IIC_VST1x3, "vst1", Dt, "$Vd, $Rn", "", []> {
   let Rm = 0b1111;
   let Inst{4} = Rn{4};
   let DecoderMethod = "DecodeVLDST1Instruction";
 }
-multiclass VST1D3WB<bits<4> op7_4, string Dt> {
+multiclass VST1D3WB<bits<4> op7_4, string Dt, Operand AddrMode> {
   def _fixed : NLdSt<0,0b00,0b0110,op7_4, (outs GPR:$wb),
-                    (ins addrmode6:$Rn, VecListThreeD:$Vd), IIC_VLD1x3u,
+                    (ins AddrMode:$Rn, VecListThreeD:$Vd), IIC_VLD1x3u,
                      "vst1", Dt, "$Vd, $Rn!",
                      "$Rn.addr = $wb", []> {
     let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
@@ -1652,7 +1736,7 @@ multiclass VST1D3WB<bits<4> op7_4, string Dt> {
     let DecoderMethod = "DecodeVLDST1Instruction";
   }
   def _register : NLdSt<0,0b00,0b0110,op7_4, (outs GPR:$wb),
-                        (ins addrmode6:$Rn, rGPR:$Rm, VecListThreeD:$Vd),
+                        (ins AddrMode:$Rn, rGPR:$Rm, VecListThreeD:$Vd),
                         IIC_VLD1x3u,
                         "vst1", Dt, "$Vd, $Rn, $Rm",
                         "$Rn.addr = $wb", []> {
@@ -1661,33 +1745,33 @@ multiclass VST1D3WB<bits<4> op7_4, string Dt> {
   }
 }
 
-def VST1d8T     : VST1D3<{0,0,0,?}, "8">;
-def VST1d16T    : VST1D3<{0,1,0,?}, "16">;
-def VST1d32T    : VST1D3<{1,0,0,?}, "32">;
-def VST1d64T    : VST1D3<{1,1,0,?}, "64">;
+def VST1d8T     : VST1D3<{0,0,0,?}, "8",  addrmode6align64>;
+def VST1d16T    : VST1D3<{0,1,0,?}, "16", addrmode6align64>;
+def VST1d32T    : VST1D3<{1,0,0,?}, "32", addrmode6align64>;
+def VST1d64T    : VST1D3<{1,1,0,?}, "64", addrmode6align64>;
 
-defm VST1d8Twb  : VST1D3WB<{0,0,0,?}, "8">;
-defm VST1d16Twb : VST1D3WB<{0,1,0,?}, "16">;
-defm VST1d32Twb : VST1D3WB<{1,0,0,?}, "32">;
-defm VST1d64Twb : VST1D3WB<{1,1,0,?}, "64">;
+defm VST1d8Twb  : VST1D3WB<{0,0,0,?}, "8",  addrmode6align64>;
+defm VST1d16Twb : VST1D3WB<{0,1,0,?}, "16", addrmode6align64>;
+defm VST1d32Twb : VST1D3WB<{1,0,0,?}, "32", addrmode6align64>;
+defm VST1d64Twb : VST1D3WB<{1,1,0,?}, "64", addrmode6align64>;
 
 def VST1d64TPseudo            : VSTQQPseudo<IIC_VST1x3>;
 def VST1d64TPseudoWB_fixed    : VSTQQWBfixedPseudo<IIC_VST1x3u>;
 def VST1d64TPseudoWB_register : VSTQQWBPseudo<IIC_VST1x3u>;
 
 // ...with 4 registers
-class VST1D4<bits<4> op7_4, string Dt>
+class VST1D4<bits<4> op7_4, string Dt, Operand AddrMode>
   : NLdSt<0, 0b00, 0b0010, op7_4, (outs),
-          (ins addrmode6:$Rn, VecListFourD:$Vd),
+          (ins AddrMode:$Rn, VecListFourD:$Vd),
           IIC_VST1x4, "vst1", Dt, "$Vd, $Rn", "",
           []> {
   let Rm = 0b1111;
   let Inst{5-4} = Rn{5-4};
   let DecoderMethod = "DecodeVLDST1Instruction";
 }
-multiclass VST1D4WB<bits<4> op7_4, string Dt> {
+multiclass VST1D4WB<bits<4> op7_4, string Dt, Operand AddrMode> {
   def _fixed : NLdSt<0,0b00,0b0010,op7_4, (outs GPR:$wb),
-                    (ins addrmode6:$Rn, VecListFourD:$Vd), IIC_VLD1x4u,
+                    (ins AddrMode:$Rn, VecListFourD:$Vd), IIC_VLD1x4u,
                      "vst1", Dt, "$Vd, $Rn!",
                      "$Rn.addr = $wb", []> {
     let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
@@ -1695,7 +1779,7 @@ multiclass VST1D4WB<bits<4> op7_4, string Dt> {
     let DecoderMethod = "DecodeVLDST1Instruction";
   }
   def _register : NLdSt<0,0b00,0b0010,op7_4, (outs GPR:$wb),
-                        (ins addrmode6:$Rn, rGPR:$Rm, VecListFourD:$Vd),
+                        (ins AddrMode:$Rn, rGPR:$Rm, VecListFourD:$Vd),
                         IIC_VLD1x4u,
                         "vst1", Dt, "$Vd, $Rn, $Rm",
                         "$Rn.addr = $wb", []> {
@@ -1704,15 +1788,15 @@ multiclass VST1D4WB<bits<4> op7_4, string Dt> {
   }
 }
 
-def VST1d8Q     : VST1D4<{0,0,?,?}, "8">;
-def VST1d16Q    : VST1D4<{0,1,?,?}, "16">;
-def VST1d32Q    : VST1D4<{1,0,?,?}, "32">;
-def VST1d64Q    : VST1D4<{1,1,?,?}, "64">;
+def VST1d8Q     : VST1D4<{0,0,?,?}, "8",  addrmode6align64or128or256>;
+def VST1d16Q    : VST1D4<{0,1,?,?}, "16", addrmode6align64or128or256>;
+def VST1d32Q    : VST1D4<{1,0,?,?}, "32", addrmode6align64or128or256>;
+def VST1d64Q    : VST1D4<{1,1,?,?}, "64", addrmode6align64or128or256>;
 
-defm VST1d8Qwb  : VST1D4WB<{0,0,?,?}, "8">;
-defm VST1d16Qwb : VST1D4WB<{0,1,?,?}, "16">;
-defm VST1d32Qwb : VST1D4WB<{1,0,?,?}, "32">;
-defm VST1d64Qwb : VST1D4WB<{1,1,?,?}, "64">;
+defm VST1d8Qwb  : VST1D4WB<{0,0,?,?}, "8",  addrmode6align64or128or256>;
+defm VST1d16Qwb : VST1D4WB<{0,1,?,?}, "16", addrmode6align64or128or256>;
+defm VST1d32Qwb : VST1D4WB<{1,0,?,?}, "32", addrmode6align64or128or256>;
+defm VST1d64Qwb : VST1D4WB<{1,1,?,?}, "64", addrmode6align64or128or256>;
 
 def VST1d64QPseudo            : VSTQQPseudo<IIC_VST1x4>;
 def VST1d64QPseudoWB_fixed    : VSTQQWBfixedPseudo<IIC_VST1x4u>;
@@ -1720,21 +1804,27 @@ def VST1d64QPseudoWB_register : VSTQQWBPseudo<IIC_VST1x4u>;
 
 //   VST2     : Vector Store (multiple 2-element structures)
 class VST2<bits<4> op11_8, bits<4> op7_4, string Dt, RegisterOperand VdTy,
-            InstrItinClass itin>
-  : NLdSt<0, 0b00, op11_8, op7_4, (outs), (ins addrmode6:$Rn, VdTy:$Vd),
+            InstrItinClass itin, Operand AddrMode>
+  : NLdSt<0, 0b00, op11_8, op7_4, (outs), (ins AddrMode:$Rn, VdTy:$Vd),
           itin, "vst2", Dt, "$Vd, $Rn", "", []> {
   let Rm = 0b1111;
   let Inst{5-4} = Rn{5-4};
   let DecoderMethod = "DecodeVLDST2Instruction";
 }
 
-def  VST2d8   : VST2<0b1000, {0,0,?,?}, "8",  VecListDPair, IIC_VST2>;
-def  VST2d16  : VST2<0b1000, {0,1,?,?}, "16", VecListDPair, IIC_VST2>;
-def  VST2d32  : VST2<0b1000, {1,0,?,?}, "32", VecListDPair, IIC_VST2>;
+def  VST2d8   : VST2<0b1000, {0,0,?,?}, "8",  VecListDPair, IIC_VST2,
+                     addrmode6align64or128>;
+def  VST2d16  : VST2<0b1000, {0,1,?,?}, "16", VecListDPair, IIC_VST2,
+                     addrmode6align64or128>;
+def  VST2d32  : VST2<0b1000, {1,0,?,?}, "32", VecListDPair, IIC_VST2,
+                     addrmode6align64or128>;
 
-def  VST2q8   : VST2<0b0011, {0,0,?,?}, "8",  VecListFourD, IIC_VST2x2>;
-def  VST2q16  : VST2<0b0011, {0,1,?,?}, "16", VecListFourD, IIC_VST2x2>;
-def  VST2q32  : VST2<0b0011, {1,0,?,?}, "32", VecListFourD, IIC_VST2x2>;
+def  VST2q8   : VST2<0b0011, {0,0,?,?}, "8",  VecListFourD, IIC_VST2x2,
+                     addrmode6align64or128or256>;
+def  VST2q16  : VST2<0b0011, {0,1,?,?}, "16", VecListFourD, IIC_VST2x2,
+                     addrmode6align64or128or256>;
+def  VST2q32  : VST2<0b0011, {1,0,?,?}, "32", VecListFourD, IIC_VST2x2,
+                     addrmode6align64or128or256>;
 
 def  VST2q8Pseudo  : VSTQQPseudo<IIC_VST2x2>;
 def  VST2q16Pseudo : VSTQQPseudo<IIC_VST2x2>;
@@ -1742,9 +1832,9 @@ def  VST2q32Pseudo : VSTQQPseudo<IIC_VST2x2>;
 
 // ...with address register writeback:
 multiclass VST2DWB<bits<4> op11_8, bits<4> op7_4, string Dt,
-                   RegisterOperand VdTy> {
+                   RegisterOperand VdTy, Operand AddrMode> {
   def _fixed : NLdSt<0, 0b00, op11_8, op7_4, (outs GPR:$wb),
-                     (ins addrmode6:$Rn, VdTy:$Vd), IIC_VLD1u,
+                     (ins AddrMode:$Rn, VdTy:$Vd), IIC_VLD1u,
                      "vst2", Dt, "$Vd, $Rn!",
                      "$Rn.addr = $wb", []> {
     let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
@@ -1752,16 +1842,16 @@ multiclass VST2DWB<bits<4> op11_8, bits<4> op7_4, string Dt,
     let DecoderMethod = "DecodeVLDST2Instruction";
   }
   def _register : NLdSt<0, 0b00, op11_8, op7_4, (outs GPR:$wb),
-                        (ins addrmode6:$Rn, rGPR:$Rm, VdTy:$Vd), IIC_VLD1u,
+                        (ins AddrMode:$Rn, rGPR:$Rm, VdTy:$Vd), IIC_VLD1u,
                         "vst2", Dt, "$Vd, $Rn, $Rm",
                         "$Rn.addr = $wb", []> {
     let Inst{5-4} = Rn{5-4};
     let DecoderMethod = "DecodeVLDST2Instruction";
   }
 }
-multiclass VST2QWB<bits<4> op7_4, string Dt> {
+multiclass VST2QWB<bits<4> op7_4, string Dt, Operand AddrMode> {
   def _fixed : NLdSt<0, 0b00, 0b0011, op7_4, (outs GPR:$wb),
-                     (ins addrmode6:$Rn, VecListFourD:$Vd), IIC_VLD1u,
+                     (ins AddrMode:$Rn, VecListFourD:$Vd), IIC_VLD1u,
                      "vst2", Dt, "$Vd, $Rn!",
                      "$Rn.addr = $wb", []> {
     let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
@@ -1769,7 +1859,7 @@ multiclass VST2QWB<bits<4> op7_4, string Dt> {
     let DecoderMethod = "DecodeVLDST2Instruction";
   }
   def _register : NLdSt<0, 0b00, 0b0011, op7_4, (outs GPR:$wb),
-                        (ins addrmode6:$Rn, rGPR:$Rm, VecListFourD:$Vd),
+                        (ins AddrMode:$Rn, rGPR:$Rm, VecListFourD:$Vd),
                         IIC_VLD1u,
                         "vst2", Dt, "$Vd, $Rn, $Rm",
                         "$Rn.addr = $wb", []> {
@@ -1778,13 +1868,16 @@ multiclass VST2QWB<bits<4> op7_4, string Dt> {
   }
 }
 
-defm VST2d8wb    : VST2DWB<0b1000, {0,0,?,?}, "8",  VecListDPair>;
-defm VST2d16wb   : VST2DWB<0b1000, {0,1,?,?}, "16", VecListDPair>;
-defm VST2d32wb   : VST2DWB<0b1000, {1,0,?,?}, "32", VecListDPair>;
+defm VST2d8wb    : VST2DWB<0b1000, {0,0,?,?}, "8",  VecListDPair,
+                           addrmode6align64or128>;
+defm VST2d16wb   : VST2DWB<0b1000, {0,1,?,?}, "16", VecListDPair,
+                           addrmode6align64or128>;
+defm VST2d32wb   : VST2DWB<0b1000, {1,0,?,?}, "32", VecListDPair,
+                           addrmode6align64or128>;
 
-defm VST2q8wb    : VST2QWB<{0,0,?,?}, "8">;
-defm VST2q16wb   : VST2QWB<{0,1,?,?}, "16">;
-defm VST2q32wb   : VST2QWB<{1,0,?,?}, "32">;
+defm VST2q8wb    : VST2QWB<{0,0,?,?}, "8", addrmode6align64or128or256>;
+defm VST2q16wb   : VST2QWB<{0,1,?,?}, "16", addrmode6align64or128or256>;
+defm VST2q32wb   : VST2QWB<{1,0,?,?}, "32", addrmode6align64or128or256>;
 
 def VST2q8PseudoWB_fixed     : VSTQQWBfixedPseudo<IIC_VST2x2u>;
 def VST2q16PseudoWB_fixed    : VSTQQWBfixedPseudo<IIC_VST2x2u>;
@@ -1794,12 +1887,18 @@ def VST2q16PseudoWB_register : VSTQQWBregisterPseudo<IIC_VST2x2u>;
 def VST2q32PseudoWB_register : VSTQQWBregisterPseudo<IIC_VST2x2u>;
 
 // ...with double-spaced registers
-def VST2b8      : VST2<0b1001, {0,0,?,?}, "8",  VecListDPairSpaced, IIC_VST2>;
-def VST2b16     : VST2<0b1001, {0,1,?,?}, "16", VecListDPairSpaced, IIC_VST2>;
-def VST2b32     : VST2<0b1001, {1,0,?,?}, "32", VecListDPairSpaced, IIC_VST2>;
-defm VST2b8wb   : VST2DWB<0b1001, {0,0,?,?}, "8",  VecListDPairSpaced>;
-defm VST2b16wb  : VST2DWB<0b1001, {0,1,?,?}, "16", VecListDPairSpaced>;
-defm VST2b32wb  : VST2DWB<0b1001, {1,0,?,?}, "32", VecListDPairSpaced>;
+def VST2b8      : VST2<0b1001, {0,0,?,?}, "8",  VecListDPairSpaced, IIC_VST2,
+                      addrmode6align64or128>;
+def VST2b16     : VST2<0b1001, {0,1,?,?}, "16", VecListDPairSpaced, IIC_VST2,
+                      addrmode6align64or128>;
+def VST2b32     : VST2<0b1001, {1,0,?,?}, "32", VecListDPairSpaced, IIC_VST2,
+                      addrmode6align64or128>;
+defm VST2b8wb   : VST2DWB<0b1001, {0,0,?,?}, "8",  VecListDPairSpaced,
+                          addrmode6align64or128>;
+defm VST2b16wb  : VST2DWB<0b1001, {0,1,?,?}, "16", VecListDPairSpaced,
+                          addrmode6align64or128>;
+defm VST2b32wb  : VST2DWB<0b1001, {1,0,?,?}, "32", VecListDPairSpaced,
+                          addrmode6align64or128>;
 
 //   VST3     : Vector Store (multiple 3-element structures)
 class VST3D<bits<4> op11_8, bits<4> op7_4, string Dt>
@@ -2267,9 +2366,9 @@ def : Pat<(v2f64 (dword_alignedload addrmode6:$addr)),
 def : Pat<(dword_alignedstore (v2f64 QPR:$value), addrmode6:$addr),
           (VST1q64 addrmode6:$addr, QPR:$value)>;
 def : Pat<(v2f64 (word_alignedload addrmode6:$addr)),
-          (VLD1q32 addrmode6:$addr)>;
+          (VLD1q32 addrmode6:$addr)>, Requires<[IsLE]>;
 def : Pat<(word_alignedstore (v2f64 QPR:$value), addrmode6:$addr),
-          (VST1q32 addrmode6:$addr, QPR:$value)>;
+          (VST1q32 addrmode6:$addr, QPR:$value)>, Requires<[IsLE]>;
 def : Pat<(v2f64 (hword_alignedload addrmode6:$addr)),
           (VLD1q16 addrmode6:$addr)>, Requires<[IsLE]>;
 def : Pat<(hword_alignedstore (v2f64 QPR:$value), addrmode6:$addr),
@@ -2357,14 +2456,14 @@ class N2VDIntnp<bits<2> op17_16, bits<3> op10_8, bit op7,
               InstrItinClass itin, string OpcodeStr, string Dt,
               ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
   : N2Vnp<0b10, op17_16, op10_8, op7, 0,  (outs DPR:$Vd), (ins DPR:$Vm),
-          itin, OpcodeStr, Dt, ResTy, OpTy,
+          itin, OpcodeStr, Dt,
           [(set DPR:$Vd, (ResTy (IntOp (OpTy DPR:$Vm))))]>;
 
 class N2VQIntnp<bits<2> op17_16, bits<3> op10_8, bit op7,
               InstrItinClass itin, string OpcodeStr, string Dt,
               ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
   : N2Vnp<0b10, op17_16, op10_8, op7, 1,  (outs QPR:$Vd), (ins QPR:$Vm),
-          itin, OpcodeStr, Dt, ResTy, OpTy,
+          itin, OpcodeStr, Dt,
           [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$Vm))))]>;
 
 // Similar to NV2VQIntnp with some more encoding bits exposed (crypto).
@@ -2372,7 +2471,7 @@ class N2VQIntXnp<bits<2> op19_18, bits<2> op17_16, bits<3> op10_8, bit op6,
               bit op7, InstrItinClass itin, string OpcodeStr, string Dt,
               ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
   : N2Vnp<op19_18, op17_16, op10_8, op7, op6,  (outs QPR:$Vd), (ins QPR:$Vm),
-          itin, OpcodeStr, Dt, ResTy, OpTy,
+          itin, OpcodeStr, Dt,
           [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$Vm))))]>;
 
 // Same as N2VQIntXnp but with Vd as a src register.
@@ -2381,7 +2480,7 @@ class N2VQIntX2np<bits<2> op19_18, bits<2> op17_16, bits<3> op10_8, bit op6,
               ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
   : N2Vnp<op19_18, op17_16, op10_8, op7, op6,
           (outs QPR:$Vd), (ins QPR:$src, QPR:$Vm),
-          itin, OpcodeStr, Dt, ResTy, OpTy,
+          itin, OpcodeStr, Dt,
           [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$src), (OpTy QPR:$Vm))))]> {
   let Constraints = "$src = $Vd";
 }
@@ -2555,7 +2654,6 @@ class N3VDIntnp<bits<5> op27_23, bits<2> op21_20, bits<4> op11_8, bit op6,
                 SDPatternOperator IntOp, bit Commutable>
   : N3Vnp<op27_23, op21_20, op11_8, op6, op4,
           (outs DPR:$Vd), (ins DPR:$Vn, DPR:$Vm), N3RegFrm, itin, OpcodeStr, Dt,
-          ResTy, OpTy, IntOp, Commutable,
           [(set DPR:$Vd, (ResTy (IntOp (OpTy DPR:$Vn), (OpTy DPR:$Vm))))]>;
 
 class N3VDIntSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
@@ -2609,7 +2707,6 @@ class N3VQIntnp<bits<5> op27_23, bits<2> op21_20, bits<4> op11_8, bit op6,
                 SDPatternOperator IntOp, bit Commutable>
   : N3Vnp<op27_23, op21_20, op11_8, op6, op4,
           (outs QPR:$Vd), (ins QPR:$Vn, QPR:$Vm), f, itin, OpcodeStr, Dt,
-          ResTy, OpTy, IntOp, Commutable,
           [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$Vn), (OpTy QPR:$Vm))))]>;
 
 // Same as N3VQIntnp but with Vd as a src register.
@@ -2618,8 +2715,8 @@ class N3VQInt3np<bits<5> op27_23, bits<2> op21_20, bits<4> op11_8, bit op6,
                 string Dt, ValueType ResTy, ValueType OpTy,
                 SDPatternOperator IntOp, bit Commutable>
   : N3Vnp<op27_23, op21_20, op11_8, op6, op4,
-          (outs QPR:$Vd), (ins QPR:$src, QPR:$Vn, QPR:$Vm), f, itin, OpcodeStr,
-          Dt, ResTy, OpTy, IntOp, Commutable,
+          (outs QPR:$Vd), (ins QPR:$src, QPR:$Vn, QPR:$Vm),
+          f, itin, OpcodeStr, Dt,
           [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$src), (OpTy QPR:$Vn),
                                        (OpTy QPR:$Vm))))]> {
   let Constraints = "$src = $Vd";
@@ -2939,7 +3036,6 @@ class N3VLIntnp<bits<5> op27_23, bits<2> op21_20, bits<4> op11_8, bit op6,
                 SDPatternOperator IntOp, bit Commutable>
   : N3Vnp<op27_23, op21_20, op11_8, op6, op4,
           (outs QPR:$Vd), (ins DPR:$Vn, DPR:$Vm), N3RegFrm, itin, OpcodeStr, Dt,
-          ResTy, OpTy, IntOp, Commutable,
           [(set QPR:$Vd, (ResTy (IntOp (OpTy DPR:$Vn), (OpTy DPR:$Vm))))]>;
 
 class N3VLIntSL<bit op24, bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
@@ -5245,6 +5341,35 @@ def VMOVv4f32 : N1ModImm<1, 0b000, 0b1111, 0, 1, 0, 1, (outs QPR:$Vd),
                          [(set QPR:$Vd, (v4f32 (NEONvmovFPImm timm:$SIMM)))]>;
 } // isReMaterializable
 
+// Add support for bytes replication feature, so it could be GAS compatible.
+// E.g. instructions below:
+// "vmov.i32 d0, 0xffffffff"
+// "vmov.i32 d0, 0xabababab"
+// "vmov.i16 d0, 0xabab"
+// are incorrect, but we could deal with such cases.
+// For last two instructions, for example, it should emit:
+// "vmov.i8 d0, 0xab"
+def : NEONInstAlias<"vmov${p}.i16 $Vd, $Vm",
+                    (VMOVv8i8 DPR:$Vd, nImmVMOVI16ByteReplicate:$Vm, pred:$p)>;
+def : NEONInstAlias<"vmov${p}.i32 $Vd, $Vm",
+                    (VMOVv8i8 DPR:$Vd, nImmVMOVI32ByteReplicate:$Vm, pred:$p)>;
+def : NEONInstAlias<"vmov${p}.i16 $Vd, $Vm",
+                    (VMOVv16i8 QPR:$Vd, nImmVMOVI16ByteReplicate:$Vm, pred:$p)>;
+def : NEONInstAlias<"vmov${p}.i32 $Vd, $Vm",
+                    (VMOVv16i8 QPR:$Vd, nImmVMOVI32ByteReplicate:$Vm, pred:$p)>;
+
+// Also add same support for VMVN instructions. So instruction:
+// "vmvn.i32 d0, 0xabababab"
+// actually means:
+// "vmov.i8 d0, 0x54"
+def : NEONInstAlias<"vmvn${p}.i16 $Vd, $Vm",
+                    (VMOVv8i8 DPR:$Vd, nImmVMVNI16ByteReplicate:$Vm, pred:$p)>;
+def : NEONInstAlias<"vmvn${p}.i32 $Vd, $Vm",
+                    (VMOVv8i8 DPR:$Vd, nImmVMVNI32ByteReplicate:$Vm, pred:$p)>;
+def : NEONInstAlias<"vmvn${p}.i16 $Vd, $Vm",
+                    (VMOVv16i8 QPR:$Vd, nImmVMVNI16ByteReplicate:$Vm, pred:$p)>;
+def : NEONInstAlias<"vmvn${p}.i32 $Vd, $Vm",
+                    (VMOVv16i8 QPR:$Vd, nImmVMVNI32ByteReplicate:$Vm, pred:$p)>;
 
 // On some CPUs the two instructions "vmov.i32 dD, #0" and "vmov.i32 qD, #0"
 // require zero cycles to execute so they should be used wherever possible for
@@ -5617,22 +5742,22 @@ def VCVTxu2fq : N2VCvtQ<1, 1, 0b1110, 0, 1, "vcvt", "f32.u32",
                         v4f32, v4i32, int_arm_neon_vcvtfxu2fp>;
 }
 
-def : NEONInstAlias<"vcvt${p}.s32.f32 $Dd, $Dm, #0", 
+def : NEONInstAlias<"vcvt${p}.s32.f32 $Dd, $Dm, #0",
                     (VCVTf2sd DPR:$Dd, DPR:$Dm, pred:$p)>;
-def : NEONInstAlias<"vcvt${p}.u32.f32 $Dd, $Dm, #0", 
+def : NEONInstAlias<"vcvt${p}.u32.f32 $Dd, $Dm, #0",
                     (VCVTf2ud DPR:$Dd, DPR:$Dm, pred:$p)>;
-def : NEONInstAlias<"vcvt${p}.f32.s32 $Dd, $Dm, #0", 
+def : NEONInstAlias<"vcvt${p}.f32.s32 $Dd, $Dm, #0",
                     (VCVTs2fd DPR:$Dd, DPR:$Dm, pred:$p)>;
-def : NEONInstAlias<"vcvt${p}.f32.u32 $Dd, $Dm, #0", 
+def : NEONInstAlias<"vcvt${p}.f32.u32 $Dd, $Dm, #0",
                     (VCVTu2fd DPR:$Dd, DPR:$Dm, pred:$p)>;
 
-def : NEONInstAlias<"vcvt${p}.s32.f32 $Qd, $Qm, #0", 
+def : NEONInstAlias<"vcvt${p}.s32.f32 $Qd, $Qm, #0",
                     (VCVTf2sq QPR:$Qd, QPR:$Qm, pred:$p)>;
-def : NEONInstAlias<"vcvt${p}.u32.f32 $Qd, $Qm, #0", 
+def : NEONInstAlias<"vcvt${p}.u32.f32 $Qd, $Qm, #0",
                     (VCVTf2uq QPR:$Qd, QPR:$Qm, pred:$p)>;
-def : NEONInstAlias<"vcvt${p}.f32.s32 $Qd, $Qm, #0", 
+def : NEONInstAlias<"vcvt${p}.f32.s32 $Qd, $Qm, #0",
                     (VCVTs2fq QPR:$Qd, QPR:$Qm, pred:$p)>;
-def : NEONInstAlias<"vcvt${p}.f32.u32 $Qd, $Qm, #0", 
+def : NEONInstAlias<"vcvt${p}.f32.u32 $Qd, $Qm, #0",
                     (VCVTu2fq QPR:$Qd, QPR:$Qm, pred:$p)>;
 
 
@@ -6051,67 +6176,145 @@ def : Pat<(f32 (bitconvert GPR:$a)),
 //===----------------------------------------------------------------------===//
 
 // bit_convert
-def : Pat<(v1i64 (bitconvert (v2i32 DPR:$src))), (v1i64 DPR:$src)>;
-def : Pat<(v1i64 (bitconvert (v4i16 DPR:$src))), (v1i64 DPR:$src)>;
-def : Pat<(v1i64 (bitconvert (v8i8  DPR:$src))), (v1i64 DPR:$src)>;
+let Predicates = [IsLE] in {
+  def : Pat<(v1i64 (bitconvert (v2i32 DPR:$src))), (v1i64 DPR:$src)>;
+  def : Pat<(v1i64 (bitconvert (v4i16 DPR:$src))), (v1i64 DPR:$src)>;
+  def : Pat<(v1i64 (bitconvert (v8i8  DPR:$src))), (v1i64 DPR:$src)>;
+}
 def : Pat<(v1i64 (bitconvert (f64   DPR:$src))), (v1i64 DPR:$src)>;
-def : Pat<(v1i64 (bitconvert (v2f32 DPR:$src))), (v1i64 DPR:$src)>;
-def : Pat<(v2i32 (bitconvert (v1i64 DPR:$src))), (v2i32 DPR:$src)>;
-def : Pat<(v2i32 (bitconvert (v4i16 DPR:$src))), (v2i32 DPR:$src)>;
-def : Pat<(v2i32 (bitconvert (v8i8  DPR:$src))), (v2i32 DPR:$src)>;
-def : Pat<(v2i32 (bitconvert (f64   DPR:$src))), (v2i32 DPR:$src)>;
+let Predicates = [IsLE] in {
+  def : Pat<(v1i64 (bitconvert (v2f32 DPR:$src))), (v1i64 DPR:$src)>;
+  def : Pat<(v2i32 (bitconvert (v1i64 DPR:$src))), (v2i32 DPR:$src)>;
+  def : Pat<(v2i32 (bitconvert (v4i16 DPR:$src))), (v2i32 DPR:$src)>;
+  def : Pat<(v2i32 (bitconvert (v8i8  DPR:$src))), (v2i32 DPR:$src)>;
+  def : Pat<(v2i32 (bitconvert (f64   DPR:$src))), (v2i32 DPR:$src)>;
+}
 def : Pat<(v2i32 (bitconvert (v2f32 DPR:$src))), (v2i32 DPR:$src)>;
-def : Pat<(v4i16 (bitconvert (v1i64 DPR:$src))), (v4i16 DPR:$src)>;
-def : Pat<(v4i16 (bitconvert (v2i32 DPR:$src))), (v4i16 DPR:$src)>;
-def : Pat<(v4i16 (bitconvert (v8i8  DPR:$src))), (v4i16 DPR:$src)>;
-def : Pat<(v4i16 (bitconvert (f64   DPR:$src))), (v4i16 DPR:$src)>;
-def : Pat<(v4i16 (bitconvert (v2f32 DPR:$src))), (v4i16 DPR:$src)>;
-def : Pat<(v8i8  (bitconvert (v1i64 DPR:$src))), (v8i8  DPR:$src)>;
-def : Pat<(v8i8  (bitconvert (v2i32 DPR:$src))), (v8i8  DPR:$src)>;
-def : Pat<(v8i8  (bitconvert (v4i16 DPR:$src))), (v8i8  DPR:$src)>;
-def : Pat<(v8i8  (bitconvert (f64   DPR:$src))), (v8i8  DPR:$src)>;
-def : Pat<(v8i8  (bitconvert (v2f32 DPR:$src))), (v8i8  DPR:$src)>;
+let Predicates = [IsLE] in {
+  def : Pat<(v4i16 (bitconvert (v1i64 DPR:$src))), (v4i16 DPR:$src)>;
+  def : Pat<(v4i16 (bitconvert (v2i32 DPR:$src))), (v4i16 DPR:$src)>;
+  def : Pat<(v4i16 (bitconvert (v8i8  DPR:$src))), (v4i16 DPR:$src)>;
+  def : Pat<(v4i16 (bitconvert (f64   DPR:$src))), (v4i16 DPR:$src)>;
+  def : Pat<(v4i16 (bitconvert (v2f32 DPR:$src))), (v4i16 DPR:$src)>;
+  def : Pat<(v8i8  (bitconvert (v1i64 DPR:$src))), (v8i8  DPR:$src)>;
+  def : Pat<(v8i8  (bitconvert (v2i32 DPR:$src))), (v8i8  DPR:$src)>;
+  def : Pat<(v8i8  (bitconvert (v4i16 DPR:$src))), (v8i8  DPR:$src)>;
+  def : Pat<(v8i8  (bitconvert (f64   DPR:$src))), (v8i8  DPR:$src)>;
+  def : Pat<(v8i8  (bitconvert (v2f32 DPR:$src))), (v8i8  DPR:$src)>;
+}
 def : Pat<(f64   (bitconvert (v1i64 DPR:$src))), (f64   DPR:$src)>;
-def : Pat<(f64   (bitconvert (v2i32 DPR:$src))), (f64   DPR:$src)>;
-def : Pat<(f64   (bitconvert (v4i16 DPR:$src))), (f64   DPR:$src)>;
-def : Pat<(f64   (bitconvert (v8i8  DPR:$src))), (f64   DPR:$src)>;
-def : Pat<(f64   (bitconvert (v2f32 DPR:$src))), (f64   DPR:$src)>;
-def : Pat<(v2f32 (bitconvert (f64   DPR:$src))), (v2f32 DPR:$src)>;
-def : Pat<(v2f32 (bitconvert (v1i64 DPR:$src))), (v2f32 DPR:$src)>;
+let Predicates = [IsLE] in {
+  def : Pat<(f64   (bitconvert (v2i32 DPR:$src))), (f64   DPR:$src)>;
+  def : Pat<(f64   (bitconvert (v4i16 DPR:$src))), (f64   DPR:$src)>;
+  def : Pat<(f64   (bitconvert (v8i8  DPR:$src))), (f64   DPR:$src)>;
+  def : Pat<(f64   (bitconvert (v2f32 DPR:$src))), (f64   DPR:$src)>;
+  def : Pat<(v2f32 (bitconvert (f64   DPR:$src))), (v2f32 DPR:$src)>;
+  def : Pat<(v2f32 (bitconvert (v1i64 DPR:$src))), (v2f32 DPR:$src)>;
+}
 def : Pat<(v2f32 (bitconvert (v2i32 DPR:$src))), (v2f32 DPR:$src)>;
-def : Pat<(v2f32 (bitconvert (v4i16 DPR:$src))), (v2f32 DPR:$src)>;
-def : Pat<(v2f32 (bitconvert (v8i8  DPR:$src))), (v2f32 DPR:$src)>;
+let Predicates = [IsLE] in {
+  def : Pat<(v2f32 (bitconvert (v4i16 DPR:$src))), (v2f32 DPR:$src)>;
+  def : Pat<(v2f32 (bitconvert (v8i8  DPR:$src))), (v2f32 DPR:$src)>;
+}
 
-def : Pat<(v2i64 (bitconvert (v4i32 QPR:$src))), (v2i64 QPR:$src)>;
-def : Pat<(v2i64 (bitconvert (v8i16 QPR:$src))), (v2i64 QPR:$src)>;
-def : Pat<(v2i64 (bitconvert (v16i8 QPR:$src))), (v2i64 QPR:$src)>;
+let Predicates = [IsLE] in {
+  def : Pat<(v2i64 (bitconvert (v4i32 QPR:$src))), (v2i64 QPR:$src)>;
+  def : Pat<(v2i64 (bitconvert (v8i16 QPR:$src))), (v2i64 QPR:$src)>;
+  def : Pat<(v2i64 (bitconvert (v16i8 QPR:$src))), (v2i64 QPR:$src)>;
+}
 def : Pat<(v2i64 (bitconvert (v2f64 QPR:$src))), (v2i64 QPR:$src)>;
-def : Pat<(v2i64 (bitconvert (v4f32 QPR:$src))), (v2i64 QPR:$src)>;
-def : Pat<(v4i32 (bitconvert (v2i64 QPR:$src))), (v4i32 QPR:$src)>;
-def : Pat<(v4i32 (bitconvert (v8i16 QPR:$src))), (v4i32 QPR:$src)>;
-def : Pat<(v4i32 (bitconvert (v16i8 QPR:$src))), (v4i32 QPR:$src)>;
-def : Pat<(v4i32 (bitconvert (v2f64 QPR:$src))), (v4i32 QPR:$src)>;
+let Predicates = [IsLE] in {
+  def : Pat<(v2i64 (bitconvert (v4f32 QPR:$src))), (v2i64 QPR:$src)>;
+  def : Pat<(v4i32 (bitconvert (v2i64 QPR:$src))), (v4i32 QPR:$src)>;
+  def : Pat<(v4i32 (bitconvert (v8i16 QPR:$src))), (v4i32 QPR:$src)>;
+  def : Pat<(v4i32 (bitconvert (v16i8 QPR:$src))), (v4i32 QPR:$src)>;
+  def : Pat<(v4i32 (bitconvert (v2f64 QPR:$src))), (v4i32 QPR:$src)>;
+}
 def : Pat<(v4i32 (bitconvert (v4f32 QPR:$src))), (v4i32 QPR:$src)>;
-def : Pat<(v8i16 (bitconvert (v2i64 QPR:$src))), (v8i16 QPR:$src)>;
-def : Pat<(v8i16 (bitconvert (v4i32 QPR:$src))), (v8i16 QPR:$src)>;
-def : Pat<(v8i16 (bitconvert (v16i8 QPR:$src))), (v8i16 QPR:$src)>;
-def : Pat<(v8i16 (bitconvert (v2f64 QPR:$src))), (v8i16 QPR:$src)>;
-def : Pat<(v8i16 (bitconvert (v4f32 QPR:$src))), (v8i16 QPR:$src)>;
-def : Pat<(v16i8 (bitconvert (v2i64 QPR:$src))), (v16i8 QPR:$src)>;
-def : Pat<(v16i8 (bitconvert (v4i32 QPR:$src))), (v16i8 QPR:$src)>;
-def : Pat<(v16i8 (bitconvert (v8i16 QPR:$src))), (v16i8 QPR:$src)>;
-def : Pat<(v16i8 (bitconvert (v2f64 QPR:$src))), (v16i8 QPR:$src)>;
-def : Pat<(v16i8 (bitconvert (v4f32 QPR:$src))), (v16i8 QPR:$src)>;
-def : Pat<(v4f32 (bitconvert (v2i64 QPR:$src))), (v4f32 QPR:$src)>;
+let Predicates = [IsLE] in {
+  def : Pat<(v8i16 (bitconvert (v2i64 QPR:$src))), (v8i16 QPR:$src)>;
+  def : Pat<(v8i16 (bitconvert (v4i32 QPR:$src))), (v8i16 QPR:$src)>;
+  def : Pat<(v8i16 (bitconvert (v16i8 QPR:$src))), (v8i16 QPR:$src)>;
+  def : Pat<(v8i16 (bitconvert (v2f64 QPR:$src))), (v8i16 QPR:$src)>;
+  def : Pat<(v8i16 (bitconvert (v4f32 QPR:$src))), (v8i16 QPR:$src)>;
+  def : Pat<(v16i8 (bitconvert (v2i64 QPR:$src))), (v16i8 QPR:$src)>;
+  def : Pat<(v16i8 (bitconvert (v4i32 QPR:$src))), (v16i8 QPR:$src)>;
+  def : Pat<(v16i8 (bitconvert (v8i16 QPR:$src))), (v16i8 QPR:$src)>;
+  def : Pat<(v16i8 (bitconvert (v2f64 QPR:$src))), (v16i8 QPR:$src)>;
+  def : Pat<(v16i8 (bitconvert (v4f32 QPR:$src))), (v16i8 QPR:$src)>;
+  def : Pat<(v4f32 (bitconvert (v2i64 QPR:$src))), (v4f32 QPR:$src)>;
+}
 def : Pat<(v4f32 (bitconvert (v4i32 QPR:$src))), (v4f32 QPR:$src)>;
-def : Pat<(v4f32 (bitconvert (v8i16 QPR:$src))), (v4f32 QPR:$src)>;
-def : Pat<(v4f32 (bitconvert (v16i8 QPR:$src))), (v4f32 QPR:$src)>;
-def : Pat<(v4f32 (bitconvert (v2f64 QPR:$src))), (v4f32 QPR:$src)>;
+let Predicates = [IsLE] in {
+  def : Pat<(v4f32 (bitconvert (v8i16 QPR:$src))), (v4f32 QPR:$src)>;
+  def : Pat<(v4f32 (bitconvert (v16i8 QPR:$src))), (v4f32 QPR:$src)>;
+  def : Pat<(v4f32 (bitconvert (v2f64 QPR:$src))), (v4f32 QPR:$src)>;
+}
 def : Pat<(v2f64 (bitconvert (v2i64 QPR:$src))), (v2f64 QPR:$src)>;
-def : Pat<(v2f64 (bitconvert (v4i32 QPR:$src))), (v2f64 QPR:$src)>;
-def : Pat<(v2f64 (bitconvert (v8i16 QPR:$src))), (v2f64 QPR:$src)>;
-def : Pat<(v2f64 (bitconvert (v16i8 QPR:$src))), (v2f64 QPR:$src)>;
-def : Pat<(v2f64 (bitconvert (v4f32 QPR:$src))), (v2f64 QPR:$src)>;
+let Predicates = [IsLE] in {
+  def : Pat<(v2f64 (bitconvert (v4i32 QPR:$src))), (v2f64 QPR:$src)>;
+  def : Pat<(v2f64 (bitconvert (v8i16 QPR:$src))), (v2f64 QPR:$src)>;
+  def : Pat<(v2f64 (bitconvert (v16i8 QPR:$src))), (v2f64 QPR:$src)>;
+  def : Pat<(v2f64 (bitconvert (v4f32 QPR:$src))), (v2f64 QPR:$src)>;
+}
+
+let Predicates = [IsBE] in {
+  // 64 bit conversions
+  def : Pat<(v1i64 (bitconvert (v2i32 DPR:$src))), (VREV64d32 DPR:$src)>;
+  def : Pat<(v1i64 (bitconvert (v4i16 DPR:$src))), (VREV64d16 DPR:$src)>;
+  def : Pat<(v1i64 (bitconvert (v8i8  DPR:$src))), (VREV64d8  DPR:$src)>;
+  def : Pat<(v1i64 (bitconvert (v2f32 DPR:$src))), (VREV64d32 DPR:$src)>;
+  def : Pat<(v2i32 (bitconvert (v1i64 DPR:$src))), (VREV64d32 DPR:$src)>;
+  def : Pat<(v2i32 (bitconvert (v4i16 DPR:$src))), (VREV32d16 DPR:$src)>;
+  def : Pat<(v2i32 (bitconvert (v8i8  DPR:$src))), (VREV32d8  DPR:$src)>;
+  def : Pat<(v2i32 (bitconvert (f64   DPR:$src))), (VREV64d32 DPR:$src)>;
+  def : Pat<(v4i16 (bitconvert (v1i64 DPR:$src))), (VREV64d16 DPR:$src)>;
+  def : Pat<(v4i16 (bitconvert (v2i32 DPR:$src))), (VREV32d16 DPR:$src)>;
+  def : Pat<(v4i16 (bitconvert (v8i8  DPR:$src))), (VREV16d8  DPR:$src)>;
+  def : Pat<(v4i16 (bitconvert (f64   DPR:$src))), (VREV64d16 DPR:$src)>;
+  def : Pat<(v4i16 (bitconvert (v2f32 DPR:$src))), (VREV32d16 DPR:$src)>;
+  def : Pat<(v8i8  (bitconvert (v1i64 DPR:$src))), (VREV64d8  DPR:$src)>;
+  def : Pat<(v8i8  (bitconvert (v2i32 DPR:$src))), (VREV32d8  DPR:$src)>;
+  def : Pat<(v8i8  (bitconvert (v4i16 DPR:$src))), (VREV16d8  DPR:$src)>;
+  def : Pat<(v8i8  (bitconvert (f64   DPR:$src))), (VREV64d8  DPR:$src)>;
+  def : Pat<(v8i8  (bitconvert (v2f32 DPR:$src))), (VREV32d8  DPR:$src)>;
+  def : Pat<(f64   (bitconvert (v2i32 DPR:$src))), (VREV64d32 DPR:$src)>;
+  def : Pat<(f64   (bitconvert (v4i16 DPR:$src))), (VREV64d16 DPR:$src)>;
+  def : Pat<(f64   (bitconvert (v8i8  DPR:$src))), (VREV64d8  DPR:$src)>;
+  def : Pat<(f64   (bitconvert (v2f32 DPR:$src))), (VREV64d32 DPR:$src)>;
+  def : Pat<(v2f32 (bitconvert (f64   DPR:$src))), (VREV64d32 DPR:$src)>;
+  def : Pat<(v2f32 (bitconvert (v1i64 DPR:$src))), (VREV64d32 DPR:$src)>;
+  def : Pat<(v2f32 (bitconvert (v4i16 DPR:$src))), (VREV32d16 DPR:$src)>;
+  def : Pat<(v2f32 (bitconvert (v8i8  DPR:$src))), (VREV32d8  DPR:$src)>;
+
+  // 128 bit conversions
+  def : Pat<(v2i64 (bitconvert (v4i32 QPR:$src))), (VREV64q32 QPR:$src)>;
+  def : Pat<(v2i64 (bitconvert (v8i16 QPR:$src))), (VREV64q16 QPR:$src)>;
+  def : Pat<(v2i64 (bitconvert (v16i8 QPR:$src))), (VREV64q8  QPR:$src)>;
+  def : Pat<(v2i64 (bitconvert (v4f32 QPR:$src))), (VREV64q32 QPR:$src)>;
+  def : Pat<(v4i32 (bitconvert (v2i64 QPR:$src))), (VREV64q32 QPR:$src)>;
+  def : Pat<(v4i32 (bitconvert (v8i16 QPR:$src))), (VREV32q16 QPR:$src)>;
+  def : Pat<(v4i32 (bitconvert (v16i8 QPR:$src))), (VREV32q8  QPR:$src)>;
+  def : Pat<(v4i32 (bitconvert (v2f64 QPR:$src))), (VREV64q32 QPR:$src)>;
+  def : Pat<(v8i16 (bitconvert (v2i64 QPR:$src))), (VREV64q16 QPR:$src)>;
+  def : Pat<(v8i16 (bitconvert (v4i32 QPR:$src))), (VREV32q16 QPR:$src)>;
+  def : Pat<(v8i16 (bitconvert (v16i8 QPR:$src))), (VREV16q8  QPR:$src)>;
+  def : Pat<(v8i16 (bitconvert (v2f64 QPR:$src))), (VREV64q16 QPR:$src)>;
+  def : Pat<(v8i16 (bitconvert (v4f32 QPR:$src))), (VREV32q16 QPR:$src)>;
+  def : Pat<(v16i8 (bitconvert (v2i64 QPR:$src))), (VREV64q8  QPR:$src)>;
+  def : Pat<(v16i8 (bitconvert (v4i32 QPR:$src))), (VREV32q8  QPR:$src)>;
+  def : Pat<(v16i8 (bitconvert (v8i16 QPR:$src))), (VREV16q8  QPR:$src)>;
+  def : Pat<(v16i8 (bitconvert (v2f64 QPR:$src))), (VREV64q8  QPR:$src)>;
+  def : Pat<(v16i8 (bitconvert (v4f32 QPR:$src))), (VREV32q8  QPR:$src)>;
+  def : Pat<(v4f32 (bitconvert (v2i64 QPR:$src))), (VREV64q32 QPR:$src)>;
+  def : Pat<(v4f32 (bitconvert (v8i16 QPR:$src))), (VREV32q16 QPR:$src)>;
+  def : Pat<(v4f32 (bitconvert (v16i8 QPR:$src))), (VREV32q8  QPR:$src)>;
+  def : Pat<(v4f32 (bitconvert (v2f64 QPR:$src))), (VREV64q32 QPR:$src)>;
+  def : Pat<(v2f64 (bitconvert (v4i32 QPR:$src))), (VREV64q32 QPR:$src)>;
+  def : Pat<(v2f64 (bitconvert (v8i16 QPR:$src))), (VREV64q16 QPR:$src)>;
+  def : Pat<(v2f64 (bitconvert (v16i8 QPR:$src))), (VREV64q8  QPR:$src)>;
+  def : Pat<(v2f64 (bitconvert (v4f32 QPR:$src))), (VREV64q32 QPR:$src)>;
+}
 
 // Fold extracting an element out of a v2i32 into a vfp register.
 def : Pat<(f32 (bitconvert (i32 (extractelt (v2i32 DPR:$src), imm:$lane)))),
@@ -6120,7 +6323,7 @@ def : Pat<(f32 (bitconvert (i32 (extractelt (v2i32 DPR:$src), imm:$lane)))),
 // Vector lengthening move with load, matching extending loads.
 
 // extload, zextload and sextload for a standard lengthening load. Example:
-// Lengthen_Single<"8", "i16", "8"> = 
+// Lengthen_Single<"8", "i16", "8"> =
 //     Pat<(v8i16 (extloadvi8 addrmode6:$addr))
 //         (VMOVLuv8i16 (VLD1d8 addrmode6:$addr,
 //                              (f64 (IMPLICIT_DEF)), (i32 0)))>;
@@ -6147,7 +6350,7 @@ multiclass Lengthen_Single<string DestLanes, string DestTy, string SrcTy> {
 // half the lanes available. Example:
 // Lengthen_HalfSingle<"4", "i16", "8", "i16", "i8"> =
 //     Pat<(v4i16 (extloadvi8 addrmode6oneL32:$addr)),
-//         (EXTRACT_SUBREG (VMOVLuv8i16 (VLD1LNd32 addrmode6oneL32:$addr, 
+//         (EXTRACT_SUBREG (VMOVLuv8i16 (VLD1LNd32 addrmode6oneL32:$addr,
 //                                      (f64 (IMPLICIT_DEF)), (i32 0))),
 //                         dsub_0)>;
 multiclass Lengthen_HalfSingle<string DestLanes, string DestTy, string SrcTy,
@@ -6257,7 +6460,7 @@ defm : Lengthen_Double<"2", "i64", "i16", "4", "i32", "2", "i64">;
 // Triple lengthening - v2i8 -> v2i16 -> v2i32 -> v2i64
 def : Pat<(v2i64 (extloadvi8 addrmode6:$addr)),
       (VMOVLuv2i64 (EXTRACT_SUBREG (VMOVLuv4i32 (EXTRACT_SUBREG (VMOVLuv8i16
-         (VLD1LNd16 addrmode6:$addr, 
+         (VLD1LNd16 addrmode6:$addr,
                     (f64 (IMPLICIT_DEF)), (i32 0))), dsub_0)), dsub_0))>;
 def : Pat<(v2i64 (zextloadvi8 addrmode6:$addr)),
       (VMOVLuv2i64 (EXTRACT_SUBREG (VMOVLuv4i32 (EXTRACT_SUBREG (VMOVLuv8i16
@@ -6311,379 +6514,442 @@ defm : NEONDTAnyInstAlias<"vorr${p}", "$Vdn, $Vm",
 // VLD1 single-lane pseudo-instructions. These need special handling for
 // the lane index that an InstAlias can't handle, so we use these instead.
 def VLD1LNdAsm_8 : NEONDataTypeAsmPseudoInst<"vld1${p}", ".8", "$list, $addr",
-                 (ins VecListOneDByteIndexed:$list, addrmode6:$addr, pred:$p)>;
+                 (ins VecListOneDByteIndexed:$list, addrmode6alignNone:$addr,
+                      pred:$p)>;
 def VLD1LNdAsm_16 : NEONDataTypeAsmPseudoInst<"vld1${p}", ".16", "$list, $addr",
-                 (ins VecListOneDHWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+                 (ins VecListOneDHWordIndexed:$list, addrmode6align16:$addr,
+                      pred:$p)>;
 def VLD1LNdAsm_32 : NEONDataTypeAsmPseudoInst<"vld1${p}", ".32", "$list, $addr",
-                 (ins VecListOneDWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+                 (ins VecListOneDWordIndexed:$list, addrmode6align32:$addr,
+                      pred:$p)>;
 
 def VLD1LNdWB_fixed_Asm_8 :
         NEONDataTypeAsmPseudoInst<"vld1${p}", ".8", "$list, $addr!",
-                 (ins VecListOneDByteIndexed:$list, addrmode6:$addr, pred:$p)>;
+                 (ins VecListOneDByteIndexed:$list, addrmode6alignNone:$addr,
+                      pred:$p)>;
 def VLD1LNdWB_fixed_Asm_16 :
         NEONDataTypeAsmPseudoInst<"vld1${p}", ".16", "$list, $addr!",
-                 (ins VecListOneDHWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+                 (ins VecListOneDHWordIndexed:$list, addrmode6align16:$addr,
+                      pred:$p)>;
 def VLD1LNdWB_fixed_Asm_32 :
         NEONDataTypeAsmPseudoInst<"vld1${p}", ".32", "$list, $addr!",
-                 (ins VecListOneDWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+                 (ins VecListOneDWordIndexed:$list, addrmode6align32:$addr,
+                      pred:$p)>;
 def VLD1LNdWB_register_Asm_8 :
         NEONDataTypeAsmPseudoInst<"vld1${p}", ".8", "$list, $addr, $Rm",
-                  (ins VecListOneDByteIndexed:$list, addrmode6:$addr,
+                  (ins VecListOneDByteIndexed:$list, addrmode6alignNone:$addr,
                        rGPR:$Rm, pred:$p)>;
 def VLD1LNdWB_register_Asm_16 :
         NEONDataTypeAsmPseudoInst<"vld1${p}", ".16", "$list, $addr, $Rm",
-                  (ins VecListOneDHWordIndexed:$list, addrmode6:$addr,
+                  (ins VecListOneDHWordIndexed:$list, addrmode6align16:$addr,
                        rGPR:$Rm, pred:$p)>;
 def VLD1LNdWB_register_Asm_32 :
         NEONDataTypeAsmPseudoInst<"vld1${p}", ".32", "$list, $addr, $Rm",
-                  (ins VecListOneDWordIndexed:$list, addrmode6:$addr,
+                  (ins VecListOneDWordIndexed:$list, addrmode6align32:$addr,
                        rGPR:$Rm, pred:$p)>;
 
 
 // VST1 single-lane pseudo-instructions. These need special handling for
 // the lane index that an InstAlias can't handle, so we use these instead.
 def VST1LNdAsm_8 : NEONDataTypeAsmPseudoInst<"vst1${p}", ".8", "$list, $addr",
-                 (ins VecListOneDByteIndexed:$list, addrmode6:$addr, pred:$p)>;
+                 (ins VecListOneDByteIndexed:$list, addrmode6alignNone:$addr,
+                      pred:$p)>;
 def VST1LNdAsm_16 : NEONDataTypeAsmPseudoInst<"vst1${p}", ".16", "$list, $addr",
-                 (ins VecListOneDHWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+                 (ins VecListOneDHWordIndexed:$list, addrmode6align16:$addr,
+                      pred:$p)>;
 def VST1LNdAsm_32 : NEONDataTypeAsmPseudoInst<"vst1${p}", ".32", "$list, $addr",
-                 (ins VecListOneDWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+                 (ins VecListOneDWordIndexed:$list, addrmode6align32:$addr,
+                      pred:$p)>;
 
 def VST1LNdWB_fixed_Asm_8 :
         NEONDataTypeAsmPseudoInst<"vst1${p}", ".8", "$list, $addr!",
-                 (ins VecListOneDByteIndexed:$list, addrmode6:$addr, pred:$p)>;
+                 (ins VecListOneDByteIndexed:$list, addrmode6alignNone:$addr,
+                      pred:$p)>;
 def VST1LNdWB_fixed_Asm_16 :
         NEONDataTypeAsmPseudoInst<"vst1${p}", ".16", "$list, $addr!",
-                 (ins VecListOneDHWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+                 (ins VecListOneDHWordIndexed:$list, addrmode6align16:$addr,
+                      pred:$p)>;
 def VST1LNdWB_fixed_Asm_32 :
         NEONDataTypeAsmPseudoInst<"vst1${p}", ".32", "$list, $addr!",
-                 (ins VecListOneDWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+                 (ins VecListOneDWordIndexed:$list, addrmode6align32:$addr,
+                      pred:$p)>;
 def VST1LNdWB_register_Asm_8 :
         NEONDataTypeAsmPseudoInst<"vst1${p}", ".8", "$list, $addr, $Rm",
-                  (ins VecListOneDByteIndexed:$list, addrmode6:$addr,
+                  (ins VecListOneDByteIndexed:$list, addrmode6alignNone:$addr,
                        rGPR:$Rm, pred:$p)>;
 def VST1LNdWB_register_Asm_16 :
         NEONDataTypeAsmPseudoInst<"vst1${p}", ".16", "$list, $addr, $Rm",
-                  (ins VecListOneDHWordIndexed:$list, addrmode6:$addr,
+                  (ins VecListOneDHWordIndexed:$list, addrmode6align16:$addr,
                        rGPR:$Rm, pred:$p)>;
 def VST1LNdWB_register_Asm_32 :
         NEONDataTypeAsmPseudoInst<"vst1${p}", ".32", "$list, $addr, $Rm",
-                  (ins VecListOneDWordIndexed:$list, addrmode6:$addr,
+                  (ins VecListOneDWordIndexed:$list, addrmode6align32:$addr,
                        rGPR:$Rm, pred:$p)>;
 
 // VLD2 single-lane pseudo-instructions. These need special handling for
 // the lane index that an InstAlias can't handle, so we use these instead.
 def VLD2LNdAsm_8 : NEONDataTypeAsmPseudoInst<"vld2${p}", ".8", "$list, $addr",
-                 (ins VecListTwoDByteIndexed:$list, addrmode6:$addr, pred:$p)>;
+                 (ins VecListTwoDByteIndexed:$list, addrmode6align16:$addr,
+                  pred:$p)>;
 def VLD2LNdAsm_16 : NEONDataTypeAsmPseudoInst<"vld2${p}", ".16", "$list, $addr",
-                 (ins VecListTwoDHWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+                 (ins VecListTwoDHWordIndexed:$list, addrmode6align32:$addr,
+                      pred:$p)>;
 def VLD2LNdAsm_32 : NEONDataTypeAsmPseudoInst<"vld2${p}", ".32", "$list, $addr",
-                 (ins VecListTwoDWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+                 (ins VecListTwoDWordIndexed:$list, addrmode6align64:$addr, pred:$p)>;
 def VLD2LNqAsm_16 : NEONDataTypeAsmPseudoInst<"vld2${p}", ".16", "$list, $addr",
-                 (ins VecListTwoQHWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+                 (ins VecListTwoQHWordIndexed:$list, addrmode6align32:$addr,
+                      pred:$p)>;
 def VLD2LNqAsm_32 : NEONDataTypeAsmPseudoInst<"vld2${p}", ".32", "$list, $addr",
-                 (ins VecListTwoQWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+                 (ins VecListTwoQWordIndexed:$list, addrmode6align64:$addr,
+                      pred:$p)>;
 
 def VLD2LNdWB_fixed_Asm_8 :
         NEONDataTypeAsmPseudoInst<"vld2${p}", ".8", "$list, $addr!",
-                 (ins VecListTwoDByteIndexed:$list, addrmode6:$addr, pred:$p)>;
+                 (ins VecListTwoDByteIndexed:$list, addrmode6align16:$addr,
+                      pred:$p)>;
 def VLD2LNdWB_fixed_Asm_16 :
         NEONDataTypeAsmPseudoInst<"vld2${p}", ".16", "$list, $addr!",
-                 (ins VecListTwoDHWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+                 (ins VecListTwoDHWordIndexed:$list, addrmode6align32:$addr,
+                      pred:$p)>;
 def VLD2LNdWB_fixed_Asm_32 :
         NEONDataTypeAsmPseudoInst<"vld2${p}", ".32", "$list, $addr!",
-                 (ins VecListTwoDWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+                 (ins VecListTwoDWordIndexed:$list, addrmode6align64:$addr,
+                      pred:$p)>;
 def VLD2LNqWB_fixed_Asm_16 :
         NEONDataTypeAsmPseudoInst<"vld2${p}", ".16", "$list, $addr!",
-                 (ins VecListTwoQHWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+                 (ins VecListTwoQHWordIndexed:$list, addrmode6align32:$addr,
+                      pred:$p)>;
 def VLD2LNqWB_fixed_Asm_32 :
         NEONDataTypeAsmPseudoInst<"vld2${p}", ".32", "$list, $addr!",
-                 (ins VecListTwoQWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+                 (ins VecListTwoQWordIndexed:$list, addrmode6align64:$addr,
+                      pred:$p)>;
 def VLD2LNdWB_register_Asm_8 :
         NEONDataTypeAsmPseudoInst<"vld2${p}", ".8", "$list, $addr, $Rm",
-                  (ins VecListTwoDByteIndexed:$list, addrmode6:$addr,
+                  (ins VecListTwoDByteIndexed:$list, addrmode6align16:$addr,
                        rGPR:$Rm, pred:$p)>;
 def VLD2LNdWB_register_Asm_16 :
         NEONDataTypeAsmPseudoInst<"vld2${p}", ".16", "$list, $addr, $Rm",
-                  (ins VecListTwoDHWordIndexed:$list, addrmode6:$addr,
+                  (ins VecListTwoDHWordIndexed:$list, addrmode6align32:$addr,
                        rGPR:$Rm, pred:$p)>;
 def VLD2LNdWB_register_Asm_32 :
         NEONDataTypeAsmPseudoInst<"vld2${p}", ".32", "$list, $addr, $Rm",
-                  (ins VecListTwoDWordIndexed:$list, addrmode6:$addr,
+                  (ins VecListTwoDWordIndexed:$list, addrmode6align64:$addr,
                        rGPR:$Rm, pred:$p)>;
 def VLD2LNqWB_register_Asm_16 :
         NEONDataTypeAsmPseudoInst<"vld2${p}", ".16", "$list, $addr, $Rm",
-                  (ins VecListTwoQHWordIndexed:$list, addrmode6:$addr,
+                  (ins VecListTwoQHWordIndexed:$list, addrmode6align32:$addr,
                        rGPR:$Rm, pred:$p)>;
 def VLD2LNqWB_register_Asm_32 :
         NEONDataTypeAsmPseudoInst<"vld2${p}", ".32", "$list, $addr, $Rm",
-                  (ins VecListTwoQWordIndexed:$list, addrmode6:$addr,
+                  (ins VecListTwoQWordIndexed:$list, addrmode6align64:$addr,
                        rGPR:$Rm, pred:$p)>;
 
 
 // VST2 single-lane pseudo-instructions. These need special handling for
 // the lane index that an InstAlias can't handle, so we use these instead.
 def VST2LNdAsm_8 : NEONDataTypeAsmPseudoInst<"vst2${p}", ".8", "$list, $addr",
-                 (ins VecListTwoDByteIndexed:$list, addrmode6:$addr, pred:$p)>;
+                 (ins VecListTwoDByteIndexed:$list, addrmode6align16:$addr,
+                      pred:$p)>;
 def VST2LNdAsm_16 : NEONDataTypeAsmPseudoInst<"vst2${p}", ".16", "$list, $addr",
-                 (ins VecListTwoDHWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+                 (ins VecListTwoDHWordIndexed:$list, addrmode6align32:$addr,
+                      pred:$p)>;
 def VST2LNdAsm_32 : NEONDataTypeAsmPseudoInst<"vst2${p}", ".32", "$list, $addr",
-                 (ins VecListTwoDWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+                 (ins VecListTwoDWordIndexed:$list, addrmode6align64:$addr,
+                      pred:$p)>;
 def VST2LNqAsm_16 : NEONDataTypeAsmPseudoInst<"vst2${p}", ".16", "$list, $addr",
-                 (ins VecListTwoQHWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+                 (ins VecListTwoQHWordIndexed:$list, addrmode6align32:$addr,
+                      pred:$p)>;
 def VST2LNqAsm_32 : NEONDataTypeAsmPseudoInst<"vst2${p}", ".32", "$list, $addr",
-                 (ins VecListTwoQWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+                 (ins VecListTwoQWordIndexed:$list, addrmode6align64:$addr,
+                      pred:$p)>;
 
 def VST2LNdWB_fixed_Asm_8 :
         NEONDataTypeAsmPseudoInst<"vst2${p}", ".8", "$list, $addr!",
-                 (ins VecListTwoDByteIndexed:$list, addrmode6:$addr, pred:$p)>;
+                 (ins VecListTwoDByteIndexed:$list, addrmode6align16:$addr,
+                      pred:$p)>;
 def VST2LNdWB_fixed_Asm_16 :
         NEONDataTypeAsmPseudoInst<"vst2${p}", ".16", "$list, $addr!",
-                 (ins VecListTwoDHWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+                 (ins VecListTwoDHWordIndexed:$list, addrmode6align32:$addr,
+                      pred:$p)>;
 def VST2LNdWB_fixed_Asm_32 :
         NEONDataTypeAsmPseudoInst<"vst2${p}", ".32", "$list, $addr!",
-                 (ins VecListTwoDWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+                 (ins VecListTwoDWordIndexed:$list, addrmode6align64:$addr,
+                      pred:$p)>;
 def VST2LNqWB_fixed_Asm_16 :
         NEONDataTypeAsmPseudoInst<"vst2${p}", ".16", "$list, $addr!",
-                 (ins VecListTwoQHWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+                 (ins VecListTwoQHWordIndexed:$list, addrmode6align32:$addr,
+                      pred:$p)>;
 def VST2LNqWB_fixed_Asm_32 :
         NEONDataTypeAsmPseudoInst<"vst2${p}", ".32", "$list, $addr!",
-                 (ins VecListTwoQWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+                 (ins VecListTwoQWordIndexed:$list, addrmode6align64:$addr,
+                      pred:$p)>;
 def VST2LNdWB_register_Asm_8 :
         NEONDataTypeAsmPseudoInst<"vst2${p}", ".8", "$list, $addr, $Rm",
-                  (ins VecListTwoDByteIndexed:$list, addrmode6:$addr,
+                  (ins VecListTwoDByteIndexed:$list, addrmode6align16:$addr,
                        rGPR:$Rm, pred:$p)>;
 def VST2LNdWB_register_Asm_16 :
         NEONDataTypeAsmPseudoInst<"vst2${p}", ".16","$list, $addr, $Rm",
-                  (ins VecListTwoDHWordIndexed:$list, addrmode6:$addr,
+                  (ins VecListTwoDHWordIndexed:$list, addrmode6align32:$addr,
                        rGPR:$Rm, pred:$p)>;
 def VST2LNdWB_register_Asm_32 :
         NEONDataTypeAsmPseudoInst<"vst2${p}", ".32", "$list, $addr, $Rm",
-                  (ins VecListTwoDWordIndexed:$list, addrmode6:$addr,
+                  (ins VecListTwoDWordIndexed:$list, addrmode6align64:$addr,
                        rGPR:$Rm, pred:$p)>;
 def VST2LNqWB_register_Asm_16 :
         NEONDataTypeAsmPseudoInst<"vst2${p}", ".16","$list, $addr, $Rm",
-                  (ins VecListTwoQHWordIndexed:$list, addrmode6:$addr,
+                  (ins VecListTwoQHWordIndexed:$list, addrmode6align32:$addr,
                        rGPR:$Rm, pred:$p)>;
 def VST2LNqWB_register_Asm_32 :
         NEONDataTypeAsmPseudoInst<"vst2${p}", ".32", "$list, $addr, $Rm",
-                  (ins VecListTwoQWordIndexed:$list, addrmode6:$addr,
+                  (ins VecListTwoQWordIndexed:$list, addrmode6align64:$addr,
                        rGPR:$Rm, pred:$p)>;
 
 // VLD3 all-lanes pseudo-instructions. These need special handling for
 // the lane index that an InstAlias can't handle, so we use these instead.
 def VLD3DUPdAsm_8 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr",
-               (ins VecListThreeDAllLanes:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListThreeDAllLanes:$list, addrmode6dupalignNone:$addr,
+                    pred:$p)>;
 def VLD3DUPdAsm_16: NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr",
-               (ins VecListThreeDAllLanes:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListThreeDAllLanes:$list, addrmode6dupalignNone:$addr,
+                    pred:$p)>;
 def VLD3DUPdAsm_32: NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr",
-               (ins VecListThreeDAllLanes:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListThreeDAllLanes:$list, addrmode6dupalignNone:$addr,
+                    pred:$p)>;
 def VLD3DUPqAsm_8 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr",
-               (ins VecListThreeQAllLanes:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListThreeQAllLanes:$list, addrmode6dupalignNone:$addr,
+                    pred:$p)>;
 def VLD3DUPqAsm_16: NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr",
-               (ins VecListThreeQAllLanes:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListThreeQAllLanes:$list, addrmode6dupalignNone:$addr,
+                    pred:$p)>;
 def VLD3DUPqAsm_32: NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr",
-               (ins VecListThreeQAllLanes:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListThreeQAllLanes:$list, addrmode6dupalignNone:$addr,
+                    pred:$p)>;
 
 def VLD3DUPdWB_fixed_Asm_8 :
         NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr!",
-               (ins VecListThreeDAllLanes:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListThreeDAllLanes:$list, addrmode6dupalignNone:$addr,
+                    pred:$p)>;
 def VLD3DUPdWB_fixed_Asm_16 :
         NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr!",
-               (ins VecListThreeDAllLanes:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListThreeDAllLanes:$list, addrmode6dupalignNone:$addr,
+                    pred:$p)>;
 def VLD3DUPdWB_fixed_Asm_32 :
         NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr!",
-               (ins VecListThreeDAllLanes:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListThreeDAllLanes:$list, addrmode6dupalignNone:$addr,
+                    pred:$p)>;
 def VLD3DUPqWB_fixed_Asm_8 :
         NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr!",
-               (ins VecListThreeQAllLanes:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListThreeQAllLanes:$list, addrmode6dupalignNone:$addr,
+                    pred:$p)>;
 def VLD3DUPqWB_fixed_Asm_16 :
         NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr!",
-               (ins VecListThreeQAllLanes:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListThreeQAllLanes:$list, addrmode6dupalignNone:$addr,
+                    pred:$p)>;
 def VLD3DUPqWB_fixed_Asm_32 :
         NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr!",
-               (ins VecListThreeQAllLanes:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListThreeQAllLanes:$list, addrmode6dupalignNone:$addr,
+                    pred:$p)>;
 def VLD3DUPdWB_register_Asm_8 :
         NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr, $Rm",
-                  (ins VecListThreeDAllLanes:$list, addrmode6:$addr,
+                  (ins VecListThreeDAllLanes:$list, addrmode6dupalignNone:$addr,
                        rGPR:$Rm, pred:$p)>;
 def VLD3DUPdWB_register_Asm_16 :
         NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr, $Rm",
-                  (ins VecListThreeDAllLanes:$list, addrmode6:$addr,
+                  (ins VecListThreeDAllLanes:$list, addrmode6dupalignNone:$addr,
                        rGPR:$Rm, pred:$p)>;
 def VLD3DUPdWB_register_Asm_32 :
         NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr, $Rm",
-                  (ins VecListThreeDAllLanes:$list, addrmode6:$addr,
+                  (ins VecListThreeDAllLanes:$list, addrmode6dupalignNone:$addr,
                        rGPR:$Rm, pred:$p)>;
 def VLD3DUPqWB_register_Asm_8 :
         NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr, $Rm",
-                  (ins VecListThreeQAllLanes:$list, addrmode6:$addr,
+                  (ins VecListThreeQAllLanes:$list, addrmode6dupalignNone:$addr,
                        rGPR:$Rm, pred:$p)>;
 def VLD3DUPqWB_register_Asm_16 :
         NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr, $Rm",
-                  (ins VecListThreeQAllLanes:$list, addrmode6:$addr,
+                  (ins VecListThreeQAllLanes:$list, addrmode6dupalignNone:$addr,
                        rGPR:$Rm, pred:$p)>;
 def VLD3DUPqWB_register_Asm_32 :
         NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr, $Rm",
-                  (ins VecListThreeQAllLanes:$list, addrmode6:$addr,
+                  (ins VecListThreeQAllLanes:$list, addrmode6dupalignNone:$addr,
                        rGPR:$Rm, pred:$p)>;
 
 
 // VLD3 single-lane pseudo-instructions. These need special handling for
 // the lane index that an InstAlias can't handle, so we use these instead.
 def VLD3LNdAsm_8 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr",
-               (ins VecListThreeDByteIndexed:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListThreeDByteIndexed:$list, addrmode6alignNone:$addr,
+                    pred:$p)>;
 def VLD3LNdAsm_16 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr",
-               (ins VecListThreeDHWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListThreeDHWordIndexed:$list, addrmode6alignNone:$addr,
+                    pred:$p)>;
 def VLD3LNdAsm_32 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr",
-               (ins VecListThreeDWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListThreeDWordIndexed:$list, addrmode6alignNone:$addr,
+                    pred:$p)>;
 def VLD3LNqAsm_16 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr",
-               (ins VecListThreeQHWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListThreeQHWordIndexed:$list, addrmode6alignNone:$addr,
+                    pred:$p)>;
 def VLD3LNqAsm_32 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr",
-               (ins VecListThreeQWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListThreeQWordIndexed:$list, addrmode6alignNone:$addr,
+                    pred:$p)>;
 
 def VLD3LNdWB_fixed_Asm_8 :
         NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr!",
-               (ins VecListThreeDByteIndexed:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListThreeDByteIndexed:$list, addrmode6alignNone:$addr,
+                    pred:$p)>;
 def VLD3LNdWB_fixed_Asm_16 :
         NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr!",
-               (ins VecListThreeDHWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListThreeDHWordIndexed:$list, addrmode6alignNone:$addr,
+                    pred:$p)>;
 def VLD3LNdWB_fixed_Asm_32 :
         NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr!",
-               (ins VecListThreeDWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListThreeDWordIndexed:$list, addrmode6alignNone:$addr,
+                    pred:$p)>;
 def VLD3LNqWB_fixed_Asm_16 :
         NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr!",
-               (ins VecListThreeQHWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListThreeQHWordIndexed:$list, addrmode6alignNone:$addr,
+                    pred:$p)>;
 def VLD3LNqWB_fixed_Asm_32 :
         NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr!",
-               (ins VecListThreeQWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListThreeQWordIndexed:$list, addrmode6alignNone:$addr,
+                    pred:$p)>;
 def VLD3LNdWB_register_Asm_8 :
         NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr, $Rm",
-                  (ins VecListThreeDByteIndexed:$list, addrmode6:$addr,
+                  (ins VecListThreeDByteIndexed:$list, addrmode6alignNone:$addr,
                        rGPR:$Rm, pred:$p)>;
 def VLD3LNdWB_register_Asm_16 :
         NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr, $Rm",
-                  (ins VecListThreeDHWordIndexed:$list, addrmode6:$addr,
-                       rGPR:$Rm, pred:$p)>;
+                  (ins VecListThreeDHWordIndexed:$list,
+                       addrmode6alignNone:$addr, rGPR:$Rm, pred:$p)>;
 def VLD3LNdWB_register_Asm_32 :
         NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr, $Rm",
-                  (ins VecListThreeDWordIndexed:$list, addrmode6:$addr,
+                  (ins VecListThreeDWordIndexed:$list, addrmode6alignNone:$addr,
                        rGPR:$Rm, pred:$p)>;
 def VLD3LNqWB_register_Asm_16 :
         NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr, $Rm",
-                  (ins VecListThreeQHWordIndexed:$list, addrmode6:$addr,
-                       rGPR:$Rm, pred:$p)>;
+                  (ins VecListThreeQHWordIndexed:$list,
+                       addrmode6alignNone:$addr, rGPR:$Rm, pred:$p)>;
 def VLD3LNqWB_register_Asm_32 :
         NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr, $Rm",
-                  (ins VecListThreeQWordIndexed:$list, addrmode6:$addr,
+                  (ins VecListThreeQWordIndexed:$list, addrmode6alignNone:$addr,
                        rGPR:$Rm, pred:$p)>;
 
 // VLD3 multiple structure pseudo-instructions. These need special handling for
 // the vector operands that the normal instructions don't yet model.
 // FIXME: Remove these when the register classes and instructions are updated.
 def VLD3dAsm_8 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr",
-               (ins VecListThreeD:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListThreeD:$list, addrmode6align64:$addr, pred:$p)>;
 def VLD3dAsm_16 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr",
-               (ins VecListThreeD:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListThreeD:$list, addrmode6align64:$addr, pred:$p)>;
 def VLD3dAsm_32 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr",
-               (ins VecListThreeD:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListThreeD:$list, addrmode6align64:$addr, pred:$p)>;
 def VLD3qAsm_8 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr",
-               (ins VecListThreeQ:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListThreeQ:$list, addrmode6align64:$addr, pred:$p)>;
 def VLD3qAsm_16 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr",
-               (ins VecListThreeQ:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListThreeQ:$list, addrmode6align64:$addr, pred:$p)>;
 def VLD3qAsm_32 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr",
-               (ins VecListThreeQ:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListThreeQ:$list, addrmode6align64:$addr, pred:$p)>;
 
 def VLD3dWB_fixed_Asm_8 :
         NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr!",
-               (ins VecListThreeD:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListThreeD:$list, addrmode6align64:$addr, pred:$p)>;
 def VLD3dWB_fixed_Asm_16 :
         NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr!",
-               (ins VecListThreeD:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListThreeD:$list, addrmode6align64:$addr, pred:$p)>;
 def VLD3dWB_fixed_Asm_32 :
         NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr!",
-               (ins VecListThreeD:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListThreeD:$list, addrmode6align64:$addr, pred:$p)>;
 def VLD3qWB_fixed_Asm_8 :
         NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr!",
-               (ins VecListThreeQ:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListThreeQ:$list, addrmode6align64:$addr, pred:$p)>;
 def VLD3qWB_fixed_Asm_16 :
         NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr!",
-               (ins VecListThreeQ:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListThreeQ:$list, addrmode6align64:$addr, pred:$p)>;
 def VLD3qWB_fixed_Asm_32 :
         NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr!",
-               (ins VecListThreeQ:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListThreeQ:$list, addrmode6align64:$addr, pred:$p)>;
 def VLD3dWB_register_Asm_8 :
         NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr, $Rm",
-                  (ins VecListThreeD:$list, addrmode6:$addr,
+                  (ins VecListThreeD:$list, addrmode6align64:$addr,
                        rGPR:$Rm, pred:$p)>;
 def VLD3dWB_register_Asm_16 :
         NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr, $Rm",
-                  (ins VecListThreeD:$list, addrmode6:$addr,
+                  (ins VecListThreeD:$list, addrmode6align64:$addr,
                        rGPR:$Rm, pred:$p)>;
 def VLD3dWB_register_Asm_32 :
         NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr, $Rm",
-                  (ins VecListThreeD:$list, addrmode6:$addr,
+                  (ins VecListThreeD:$list, addrmode6align64:$addr,
                        rGPR:$Rm, pred:$p)>;
 def VLD3qWB_register_Asm_8 :
         NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr, $Rm",
-                  (ins VecListThreeQ:$list, addrmode6:$addr,
+                  (ins VecListThreeQ:$list, addrmode6align64:$addr,
                        rGPR:$Rm, pred:$p)>;
 def VLD3qWB_register_Asm_16 :
         NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr, $Rm",
-                  (ins VecListThreeQ:$list, addrmode6:$addr,
+                  (ins VecListThreeQ:$list, addrmode6align64:$addr,
                        rGPR:$Rm, pred:$p)>;
 def VLD3qWB_register_Asm_32 :
         NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr, $Rm",
-                  (ins VecListThreeQ:$list, addrmode6:$addr,
+                  (ins VecListThreeQ:$list, addrmode6align64:$addr,
                        rGPR:$Rm, pred:$p)>;
 
 // VST3 single-lane pseudo-instructions. These need special handling for
 // the lane index that an InstAlias can't handle, so we use these instead.
 def VST3LNdAsm_8 : NEONDataTypeAsmPseudoInst<"vst3${p}", ".8", "$list, $addr",
-               (ins VecListThreeDByteIndexed:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListThreeDByteIndexed:$list, addrmode6alignNone:$addr,
+                    pred:$p)>;
 def VST3LNdAsm_16 : NEONDataTypeAsmPseudoInst<"vst3${p}", ".16", "$list, $addr",
-               (ins VecListThreeDHWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListThreeDHWordIndexed:$list, addrmode6alignNone:$addr,
+                    pred:$p)>;
 def VST3LNdAsm_32 : NEONDataTypeAsmPseudoInst<"vst3${p}", ".32", "$list, $addr",
-               (ins VecListThreeDWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListThreeDWordIndexed:$list, addrmode6alignNone:$addr,
+                    pred:$p)>;
 def VST3LNqAsm_16 : NEONDataTypeAsmPseudoInst<"vst3${p}", ".16", "$list, $addr",
-               (ins VecListThreeQHWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListThreeQHWordIndexed:$list, addrmode6alignNone:$addr,
+                    pred:$p)>;
 def VST3LNqAsm_32 : NEONDataTypeAsmPseudoInst<"vst3${p}", ".32", "$list, $addr",
-               (ins VecListThreeQWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListThreeQWordIndexed:$list, addrmode6alignNone:$addr,
+                    pred:$p)>;
 
 def VST3LNdWB_fixed_Asm_8 :
         NEONDataTypeAsmPseudoInst<"vst3${p}", ".8", "$list, $addr!",
-               (ins VecListThreeDByteIndexed:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListThreeDByteIndexed:$list, addrmode6alignNone:$addr,
+                    pred:$p)>;
 def VST3LNdWB_fixed_Asm_16 :
         NEONDataTypeAsmPseudoInst<"vst3${p}", ".16", "$list, $addr!",
-               (ins VecListThreeDHWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListThreeDHWordIndexed:$list, addrmode6alignNone:$addr,
+                    pred:$p)>;
 def VST3LNdWB_fixed_Asm_32 :
         NEONDataTypeAsmPseudoInst<"vst3${p}", ".32", "$list, $addr!",
-               (ins VecListThreeDWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListThreeDWordIndexed:$list, addrmode6alignNone:$addr,
+                    pred:$p)>;
 def VST3LNqWB_fixed_Asm_16 :
         NEONDataTypeAsmPseudoInst<"vst3${p}", ".16", "$list, $addr!",
-               (ins VecListThreeQHWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListThreeQHWordIndexed:$list, addrmode6alignNone:$addr,
+                    pred:$p)>;
 def VST3LNqWB_fixed_Asm_32 :
         NEONDataTypeAsmPseudoInst<"vst3${p}", ".32", "$list, $addr!",
-               (ins VecListThreeQWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListThreeQWordIndexed:$list, addrmode6alignNone:$addr,
+                    pred:$p)>;
 def VST3LNdWB_register_Asm_8 :
         NEONDataTypeAsmPseudoInst<"vst3${p}", ".8", "$list, $addr, $Rm",
-                  (ins VecListThreeDByteIndexed:$list, addrmode6:$addr,
+                  (ins VecListThreeDByteIndexed:$list, addrmode6alignNone:$addr,
                        rGPR:$Rm, pred:$p)>;
 def VST3LNdWB_register_Asm_16 :
         NEONDataTypeAsmPseudoInst<"vst3${p}", ".16", "$list, $addr, $Rm",
-                  (ins VecListThreeDHWordIndexed:$list, addrmode6:$addr,
-                       rGPR:$Rm, pred:$p)>;
+                  (ins VecListThreeDHWordIndexed:$list,
+                       addrmode6alignNone:$addr, rGPR:$Rm, pred:$p)>;
 def VST3LNdWB_register_Asm_32 :
         NEONDataTypeAsmPseudoInst<"vst3${p}", ".32", "$list, $addr, $Rm",
-                  (ins VecListThreeDWordIndexed:$list, addrmode6:$addr,
+                  (ins VecListThreeDWordIndexed:$list, addrmode6alignNone:$addr,
                        rGPR:$Rm, pred:$p)>;
 def VST3LNqWB_register_Asm_16 :
         NEONDataTypeAsmPseudoInst<"vst3${p}", ".16", "$list, $addr, $Rm",
-                  (ins VecListThreeQHWordIndexed:$list, addrmode6:$addr,
-                       rGPR:$Rm, pred:$p)>;
+                  (ins VecListThreeQHWordIndexed:$list,
+                       addrmode6alignNone:$addr, rGPR:$Rm, pred:$p)>;
 def VST3LNqWB_register_Asm_32 :
         NEONDataTypeAsmPseudoInst<"vst3${p}", ".32", "$list, $addr, $Rm",
-                  (ins VecListThreeQWordIndexed:$list, addrmode6:$addr,
+                  (ins VecListThreeQWordIndexed:$list, addrmode6alignNone:$addr,
                        rGPR:$Rm, pred:$p)>;
 
 
@@ -6691,168 +6957,190 @@ def VST3LNqWB_register_Asm_32 :
 // the vector operands that the normal instructions don't yet model.
 // FIXME: Remove these when the register classes and instructions are updated.
 def VST3dAsm_8 : NEONDataTypeAsmPseudoInst<"vst3${p}", ".8", "$list, $addr",
-               (ins VecListThreeD:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListThreeD:$list, addrmode6align64:$addr, pred:$p)>;
 def VST3dAsm_16 : NEONDataTypeAsmPseudoInst<"vst3${p}", ".16", "$list, $addr",
-               (ins VecListThreeD:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListThreeD:$list, addrmode6align64:$addr, pred:$p)>;
 def VST3dAsm_32 : NEONDataTypeAsmPseudoInst<"vst3${p}", ".32", "$list, $addr",
-               (ins VecListThreeD:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListThreeD:$list, addrmode6align64:$addr, pred:$p)>;
 def VST3qAsm_8 : NEONDataTypeAsmPseudoInst<"vst3${p}", ".8", "$list, $addr",
-               (ins VecListThreeQ:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListThreeQ:$list, addrmode6align64:$addr, pred:$p)>;
 def VST3qAsm_16 : NEONDataTypeAsmPseudoInst<"vst3${p}", ".16", "$list, $addr",
-               (ins VecListThreeQ:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListThreeQ:$list, addrmode6align64:$addr, pred:$p)>;
 def VST3qAsm_32 : NEONDataTypeAsmPseudoInst<"vst3${p}", ".32", "$list, $addr",
-               (ins VecListThreeQ:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListThreeQ:$list, addrmode6align64:$addr, pred:$p)>;
 
 def VST3dWB_fixed_Asm_8 :
         NEONDataTypeAsmPseudoInst<"vst3${p}", ".8", "$list, $addr!",
-               (ins VecListThreeD:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListThreeD:$list, addrmode6align64:$addr, pred:$p)>;
 def VST3dWB_fixed_Asm_16 :
         NEONDataTypeAsmPseudoInst<"vst3${p}", ".16", "$list, $addr!",
-               (ins VecListThreeD:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListThreeD:$list, addrmode6align64:$addr, pred:$p)>;
 def VST3dWB_fixed_Asm_32 :
         NEONDataTypeAsmPseudoInst<"vst3${p}", ".32", "$list, $addr!",
-               (ins VecListThreeD:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListThreeD:$list, addrmode6align64:$addr, pred:$p)>;
 def VST3qWB_fixed_Asm_8 :
         NEONDataTypeAsmPseudoInst<"vst3${p}", ".8", "$list, $addr!",
-               (ins VecListThreeQ:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListThreeQ:$list, addrmode6align64:$addr, pred:$p)>;
 def VST3qWB_fixed_Asm_16 :
         NEONDataTypeAsmPseudoInst<"vst3${p}", ".16", "$list, $addr!",
-               (ins VecListThreeQ:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListThreeQ:$list, addrmode6align64:$addr, pred:$p)>;
 def VST3qWB_fixed_Asm_32 :
         NEONDataTypeAsmPseudoInst<"vst3${p}", ".32", "$list, $addr!",
-               (ins VecListThreeQ:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListThreeQ:$list, addrmode6align64:$addr, pred:$p)>;
 def VST3dWB_register_Asm_8 :
         NEONDataTypeAsmPseudoInst<"vst3${p}", ".8", "$list, $addr, $Rm",
-                  (ins VecListThreeD:$list, addrmode6:$addr,
+                  (ins VecListThreeD:$list, addrmode6align64:$addr,
                        rGPR:$Rm, pred:$p)>;
 def VST3dWB_register_Asm_16 :
         NEONDataTypeAsmPseudoInst<"vst3${p}", ".16", "$list, $addr, $Rm",
-                  (ins VecListThreeD:$list, addrmode6:$addr,
+                  (ins VecListThreeD:$list, addrmode6align64:$addr,
                        rGPR:$Rm, pred:$p)>;
 def VST3dWB_register_Asm_32 :
         NEONDataTypeAsmPseudoInst<"vst3${p}", ".32", "$list, $addr, $Rm",
-                  (ins VecListThreeD:$list, addrmode6:$addr,
+                  (ins VecListThreeD:$list, addrmode6align64:$addr,
                        rGPR:$Rm, pred:$p)>;
 def VST3qWB_register_Asm_8 :
         NEONDataTypeAsmPseudoInst<"vst3${p}", ".8", "$list, $addr, $Rm",
-                  (ins VecListThreeQ:$list, addrmode6:$addr,
+                  (ins VecListThreeQ:$list, addrmode6align64:$addr,
                        rGPR:$Rm, pred:$p)>;
 def VST3qWB_register_Asm_16 :
         NEONDataTypeAsmPseudoInst<"vst3${p}", ".16", "$list, $addr, $Rm",
-                  (ins VecListThreeQ:$list, addrmode6:$addr,
+                  (ins VecListThreeQ:$list, addrmode6align64:$addr,
                        rGPR:$Rm, pred:$p)>;
 def VST3qWB_register_Asm_32 :
         NEONDataTypeAsmPseudoInst<"vst3${p}", ".32", "$list, $addr, $Rm",
-                  (ins VecListThreeQ:$list, addrmode6:$addr,
+                  (ins VecListThreeQ:$list, addrmode6align64:$addr,
                        rGPR:$Rm, pred:$p)>;
 
 // VLD4 all-lanes pseudo-instructions. These need special handling for
 // the lane index that an InstAlias can't handle, so we use these instead.
 def VLD4DUPdAsm_8 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr",
-               (ins VecListFourDAllLanes:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListFourDAllLanes:$list, addrmode6dupalign32:$addr,
+                    pred:$p)>;
 def VLD4DUPdAsm_16: NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr",
-               (ins VecListFourDAllLanes:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListFourDAllLanes:$list, addrmode6dupalign64:$addr,
+                    pred:$p)>;
 def VLD4DUPdAsm_32: NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr",
-               (ins VecListFourDAllLanes:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListFourDAllLanes:$list, addrmode6dupalign64or128:$addr,
+                    pred:$p)>;
 def VLD4DUPqAsm_8 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr",
-               (ins VecListFourQAllLanes:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListFourQAllLanes:$list, addrmode6dupalign32:$addr,
+                    pred:$p)>;
 def VLD4DUPqAsm_16: NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr",
-               (ins VecListFourQAllLanes:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListFourQAllLanes:$list, addrmode6dupalign64:$addr,
+                    pred:$p)>;
 def VLD4DUPqAsm_32: NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr",
-               (ins VecListFourQAllLanes:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListFourQAllLanes:$list, addrmode6dupalign64or128:$addr,
+                    pred:$p)>;
 
 def VLD4DUPdWB_fixed_Asm_8 :
         NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr!",
-               (ins VecListFourDAllLanes:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListFourDAllLanes:$list, addrmode6dupalign32:$addr,
+                    pred:$p)>;
 def VLD4DUPdWB_fixed_Asm_16 :
         NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr!",
-               (ins VecListFourDAllLanes:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListFourDAllLanes:$list, addrmode6dupalign64:$addr,
+                    pred:$p)>;
 def VLD4DUPdWB_fixed_Asm_32 :
         NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr!",
-               (ins VecListFourDAllLanes:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListFourDAllLanes:$list, addrmode6dupalign64or128:$addr,
+                    pred:$p)>;
 def VLD4DUPqWB_fixed_Asm_8 :
         NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr!",
-               (ins VecListFourQAllLanes:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListFourQAllLanes:$list, addrmode6dupalign32:$addr,
+                    pred:$p)>;
 def VLD4DUPqWB_fixed_Asm_16 :
         NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr!",
-               (ins VecListFourQAllLanes:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListFourQAllLanes:$list, addrmode6dupalign64:$addr,
+                    pred:$p)>;
 def VLD4DUPqWB_fixed_Asm_32 :
         NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr!",
-               (ins VecListFourQAllLanes:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListFourQAllLanes:$list, addrmode6dupalign64or128:$addr,
+                    pred:$p)>;
 def VLD4DUPdWB_register_Asm_8 :
         NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr, $Rm",
-                  (ins VecListFourDAllLanes:$list, addrmode6:$addr,
+                  (ins VecListFourDAllLanes:$list, addrmode6dupalign32:$addr,
                        rGPR:$Rm, pred:$p)>;
 def VLD4DUPdWB_register_Asm_16 :
         NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr, $Rm",
-                  (ins VecListFourDAllLanes:$list, addrmode6:$addr,
+                  (ins VecListFourDAllLanes:$list, addrmode6dupalign64:$addr,
                        rGPR:$Rm, pred:$p)>;
 def VLD4DUPdWB_register_Asm_32 :
         NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr, $Rm",
-                  (ins VecListFourDAllLanes:$list, addrmode6:$addr,
-                       rGPR:$Rm, pred:$p)>;
+                  (ins VecListFourDAllLanes:$list,
+                       addrmode6dupalign64or128:$addr, rGPR:$Rm, pred:$p)>;
 def VLD4DUPqWB_register_Asm_8 :
         NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr, $Rm",
-                  (ins VecListFourQAllLanes:$list, addrmode6:$addr,
+                  (ins VecListFourQAllLanes:$list, addrmode6dupalign32:$addr,
                        rGPR:$Rm, pred:$p)>;
 def VLD4DUPqWB_register_Asm_16 :
         NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr, $Rm",
-                  (ins VecListFourQAllLanes:$list, addrmode6:$addr,
+                  (ins VecListFourQAllLanes:$list, addrmode6dupalign64:$addr,
                        rGPR:$Rm, pred:$p)>;
 def VLD4DUPqWB_register_Asm_32 :
         NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr, $Rm",
-                  (ins VecListFourQAllLanes:$list, addrmode6:$addr,
-                       rGPR:$Rm, pred:$p)>;
+                  (ins VecListFourQAllLanes:$list,
+                       addrmode6dupalign64or128:$addr, rGPR:$Rm, pred:$p)>;
 
 
 // VLD4 single-lane pseudo-instructions. These need special handling for
 // the lane index that an InstAlias can't handle, so we use these instead.
 def VLD4LNdAsm_8 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr",
-               (ins VecListFourDByteIndexed:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListFourDByteIndexed:$list, addrmode6align32:$addr,
+                    pred:$p)>;
 def VLD4LNdAsm_16 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr",
-               (ins VecListFourDHWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListFourDHWordIndexed:$list, addrmode6align64:$addr,
+                    pred:$p)>;
 def VLD4LNdAsm_32 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr",
-               (ins VecListFourDWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListFourDWordIndexed:$list, addrmode6align64or128:$addr,
+                    pred:$p)>;
 def VLD4LNqAsm_16 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr",
-               (ins VecListFourQHWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListFourQHWordIndexed:$list, addrmode6align64:$addr,
+                    pred:$p)>;
 def VLD4LNqAsm_32 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr",
-               (ins VecListFourQWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListFourQWordIndexed:$list, addrmode6align64or128:$addr,
+                    pred:$p)>;
 
 def VLD4LNdWB_fixed_Asm_8 :
         NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr!",
-               (ins VecListFourDByteIndexed:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListFourDByteIndexed:$list, addrmode6align32:$addr,
+                    pred:$p)>;
 def VLD4LNdWB_fixed_Asm_16 :
         NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr!",
-               (ins VecListFourDHWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListFourDHWordIndexed:$list, addrmode6align64:$addr,
+                    pred:$p)>;
 def VLD4LNdWB_fixed_Asm_32 :
         NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr!",
-               (ins VecListFourDWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListFourDWordIndexed:$list, addrmode6align64or128:$addr,
+                    pred:$p)>;
 def VLD4LNqWB_fixed_Asm_16 :
         NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr!",
-               (ins VecListFourQHWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListFourQHWordIndexed:$list, addrmode6align64:$addr,
+                    pred:$p)>;
 def VLD4LNqWB_fixed_Asm_32 :
         NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr!",
-               (ins VecListFourQWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListFourQWordIndexed:$list, addrmode6align64or128:$addr,
+                    pred:$p)>;
 def VLD4LNdWB_register_Asm_8 :
         NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr, $Rm",
-                  (ins VecListFourDByteIndexed:$list, addrmode6:$addr,
+                  (ins VecListFourDByteIndexed:$list, addrmode6align32:$addr,
                        rGPR:$Rm, pred:$p)>;
 def VLD4LNdWB_register_Asm_16 :
         NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr, $Rm",
-                  (ins VecListFourDHWordIndexed:$list, addrmode6:$addr,
+                  (ins VecListFourDHWordIndexed:$list, addrmode6align64:$addr,
                        rGPR:$Rm, pred:$p)>;
 def VLD4LNdWB_register_Asm_32 :
         NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr, $Rm",
-                  (ins VecListFourDWordIndexed:$list, addrmode6:$addr,
-                       rGPR:$Rm, pred:$p)>;
+                  (ins VecListFourDWordIndexed:$list,
+                       addrmode6align64or128:$addr, rGPR:$Rm, pred:$p)>;
 def VLD4LNqWB_register_Asm_16 :
         NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr, $Rm",
-                  (ins VecListFourQHWordIndexed:$list, addrmode6:$addr,
+                  (ins VecListFourQHWordIndexed:$list, addrmode6align64:$addr,
                        rGPR:$Rm, pred:$p)>;
 def VLD4LNqWB_register_Asm_32 :
         NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr, $Rm",
-                  (ins VecListFourQWordIndexed:$list, addrmode6:$addr,
-                       rGPR:$Rm, pred:$p)>;
+                  (ins VecListFourQWordIndexed:$list,
+                       addrmode6align64or128:$addr, rGPR:$Rm, pred:$p)>;
 
 
 
@@ -6860,168 +7148,202 @@ def VLD4LNqWB_register_Asm_32 :
 // the vector operands that the normal instructions don't yet model.
 // FIXME: Remove these when the register classes and instructions are updated.
 def VLD4dAsm_8 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr",
-               (ins VecListFourD:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListFourD:$list, addrmode6align64or128or256:$addr,
+                pred:$p)>;
 def VLD4dAsm_16 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr",
-               (ins VecListFourD:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListFourD:$list, addrmode6align64or128or256:$addr,
+                pred:$p)>;
 def VLD4dAsm_32 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr",
-               (ins VecListFourD:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListFourD:$list, addrmode6align64or128or256:$addr,
+                pred:$p)>;
 def VLD4qAsm_8 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr",
-               (ins VecListFourQ:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListFourQ:$list, addrmode6align64or128or256:$addr,
+                pred:$p)>;
 def VLD4qAsm_16 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr",
-               (ins VecListFourQ:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListFourQ:$list, addrmode6align64or128or256:$addr,
+                pred:$p)>;
 def VLD4qAsm_32 : NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr",
-               (ins VecListFourQ:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListFourQ:$list, addrmode6align64or128or256:$addr,
+                pred:$p)>;
 
 def VLD4dWB_fixed_Asm_8 :
         NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr!",
-               (ins VecListFourD:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListFourD:$list, addrmode6align64or128or256:$addr,
+                pred:$p)>;
 def VLD4dWB_fixed_Asm_16 :
         NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr!",
-               (ins VecListFourD:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListFourD:$list, addrmode6align64or128or256:$addr,
+                pred:$p)>;
 def VLD4dWB_fixed_Asm_32 :
         NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr!",
-               (ins VecListFourD:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListFourD:$list, addrmode6align64or128or256:$addr,
+                pred:$p)>;
 def VLD4qWB_fixed_Asm_8 :
         NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr!",
-               (ins VecListFourQ:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListFourQ:$list, addrmode6align64or128or256:$addr,
+                pred:$p)>;
 def VLD4qWB_fixed_Asm_16 :
         NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr!",
-               (ins VecListFourQ:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListFourQ:$list, addrmode6align64or128or256:$addr,
+                pred:$p)>;
 def VLD4qWB_fixed_Asm_32 :
         NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr!",
-               (ins VecListFourQ:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListFourQ:$list, addrmode6align64or128or256:$addr,
+                pred:$p)>;
 def VLD4dWB_register_Asm_8 :
         NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr, $Rm",
-                  (ins VecListFourD:$list, addrmode6:$addr,
+                  (ins VecListFourD:$list, addrmode6align64or128or256:$addr,
                        rGPR:$Rm, pred:$p)>;
 def VLD4dWB_register_Asm_16 :
         NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr, $Rm",
-                  (ins VecListFourD:$list, addrmode6:$addr,
+                  (ins VecListFourD:$list, addrmode6align64or128or256:$addr,
                        rGPR:$Rm, pred:$p)>;
 def VLD4dWB_register_Asm_32 :
         NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr, $Rm",
-                  (ins VecListFourD:$list, addrmode6:$addr,
+                  (ins VecListFourD:$list, addrmode6align64or128or256:$addr,
                        rGPR:$Rm, pred:$p)>;
 def VLD4qWB_register_Asm_8 :
         NEONDataTypeAsmPseudoInst<"vld4${p}", ".8", "$list, $addr, $Rm",
-                  (ins VecListFourQ:$list, addrmode6:$addr,
+                  (ins VecListFourQ:$list, addrmode6align64or128or256:$addr,
                        rGPR:$Rm, pred:$p)>;
 def VLD4qWB_register_Asm_16 :
         NEONDataTypeAsmPseudoInst<"vld4${p}", ".16", "$list, $addr, $Rm",
-                  (ins VecListFourQ:$list, addrmode6:$addr,
+                  (ins VecListFourQ:$list, addrmode6align64or128or256:$addr,
                        rGPR:$Rm, pred:$p)>;
 def VLD4qWB_register_Asm_32 :
         NEONDataTypeAsmPseudoInst<"vld4${p}", ".32", "$list, $addr, $Rm",
-                  (ins VecListFourQ:$list, addrmode6:$addr,
+                  (ins VecListFourQ:$list, addrmode6align64or128or256:$addr,
                        rGPR:$Rm, pred:$p)>;
 
 // VST4 single-lane pseudo-instructions. These need special handling for
 // the lane index that an InstAlias can't handle, so we use these instead.
 def VST4LNdAsm_8 : NEONDataTypeAsmPseudoInst<"vst4${p}", ".8", "$list, $addr",
-               (ins VecListFourDByteIndexed:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListFourDByteIndexed:$list, addrmode6align32:$addr,
+                    pred:$p)>;
 def VST4LNdAsm_16 : NEONDataTypeAsmPseudoInst<"vst4${p}", ".16", "$list, $addr",
-               (ins VecListFourDHWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListFourDHWordIndexed:$list, addrmode6align64:$addr,
+                    pred:$p)>;
 def VST4LNdAsm_32 : NEONDataTypeAsmPseudoInst<"vst4${p}", ".32", "$list, $addr",
-               (ins VecListFourDWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListFourDWordIndexed:$list, addrmode6align64or128:$addr,
+                    pred:$p)>;
 def VST4LNqAsm_16 : NEONDataTypeAsmPseudoInst<"vst4${p}", ".16", "$list, $addr",
-               (ins VecListFourQHWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListFourQHWordIndexed:$list, addrmode6align64:$addr,
+                    pred:$p)>;
 def VST4LNqAsm_32 : NEONDataTypeAsmPseudoInst<"vst4${p}", ".32", "$list, $addr",
-               (ins VecListFourQWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListFourQWordIndexed:$list, addrmode6align64or128:$addr,
+                    pred:$p)>;
 
 def VST4LNdWB_fixed_Asm_8 :
         NEONDataTypeAsmPseudoInst<"vst4${p}", ".8", "$list, $addr!",
-               (ins VecListFourDByteIndexed:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListFourDByteIndexed:$list, addrmode6align32:$addr,
+                    pred:$p)>;
 def VST4LNdWB_fixed_Asm_16 :
         NEONDataTypeAsmPseudoInst<"vst4${p}", ".16", "$list, $addr!",
-               (ins VecListFourDHWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListFourDHWordIndexed:$list, addrmode6align64:$addr,
+                    pred:$p)>;
 def VST4LNdWB_fixed_Asm_32 :
         NEONDataTypeAsmPseudoInst<"vst4${p}", ".32", "$list, $addr!",
-               (ins VecListFourDWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListFourDWordIndexed:$list, addrmode6align64or128:$addr,
+                    pred:$p)>;
 def VST4LNqWB_fixed_Asm_16 :
         NEONDataTypeAsmPseudoInst<"vst4${p}", ".16", "$list, $addr!",
-               (ins VecListFourQHWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListFourQHWordIndexed:$list, addrmode6align64:$addr,
+                    pred:$p)>;
 def VST4LNqWB_fixed_Asm_32 :
         NEONDataTypeAsmPseudoInst<"vst4${p}", ".32", "$list, $addr!",
-               (ins VecListFourQWordIndexed:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListFourQWordIndexed:$list, addrmode6align64or128:$addr,
+                    pred:$p)>;
 def VST4LNdWB_register_Asm_8 :
         NEONDataTypeAsmPseudoInst<"vst4${p}", ".8", "$list, $addr, $Rm",
-                  (ins VecListFourDByteIndexed:$list, addrmode6:$addr,
+                  (ins VecListFourDByteIndexed:$list, addrmode6align32:$addr,
                        rGPR:$Rm, pred:$p)>;
 def VST4LNdWB_register_Asm_16 :
         NEONDataTypeAsmPseudoInst<"vst4${p}", ".16", "$list, $addr, $Rm",
-                  (ins VecListFourDHWordIndexed:$list, addrmode6:$addr,
+                  (ins VecListFourDHWordIndexed:$list, addrmode6align64:$addr,
                        rGPR:$Rm, pred:$p)>;
 def VST4LNdWB_register_Asm_32 :
         NEONDataTypeAsmPseudoInst<"vst4${p}", ".32", "$list, $addr, $Rm",
-                  (ins VecListFourDWordIndexed:$list, addrmode6:$addr,
-                       rGPR:$Rm, pred:$p)>;
+                  (ins VecListFourDWordIndexed:$list,
+                       addrmode6align64or128:$addr, rGPR:$Rm, pred:$p)>;
 def VST4LNqWB_register_Asm_16 :
         NEONDataTypeAsmPseudoInst<"vst4${p}", ".16", "$list, $addr, $Rm",
-                  (ins VecListFourQHWordIndexed:$list, addrmode6:$addr,
+                  (ins VecListFourQHWordIndexed:$list, addrmode6align64:$addr,
                        rGPR:$Rm, pred:$p)>;
 def VST4LNqWB_register_Asm_32 :
         NEONDataTypeAsmPseudoInst<"vst4${p}", ".32", "$list, $addr, $Rm",
-                  (ins VecListFourQWordIndexed:$list, addrmode6:$addr,
-                       rGPR:$Rm, pred:$p)>;
+                  (ins VecListFourQWordIndexed:$list,
+                       addrmode6align64or128:$addr, rGPR:$Rm, pred:$p)>;
 
 
 // VST4 multiple structure pseudo-instructions. These need special handling for
 // the vector operands that the normal instructions don't yet model.
 // FIXME: Remove these when the register classes and instructions are updated.
 def VST4dAsm_8 : NEONDataTypeAsmPseudoInst<"vst4${p}", ".8", "$list, $addr",
-               (ins VecListFourD:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListFourD:$list, addrmode6align64or128or256:$addr,
+                    pred:$p)>;
 def VST4dAsm_16 : NEONDataTypeAsmPseudoInst<"vst4${p}", ".16", "$list, $addr",
-               (ins VecListFourD:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListFourD:$list, addrmode6align64or128or256:$addr,
+                    pred:$p)>;
 def VST4dAsm_32 : NEONDataTypeAsmPseudoInst<"vst4${p}", ".32", "$list, $addr",
-               (ins VecListFourD:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListFourD:$list, addrmode6align64or128or256:$addr,
+                    pred:$p)>;
 def VST4qAsm_8 : NEONDataTypeAsmPseudoInst<"vst4${p}", ".8", "$list, $addr",
-               (ins VecListFourQ:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListFourQ:$list, addrmode6align64or128or256:$addr,
+                    pred:$p)>;
 def VST4qAsm_16 : NEONDataTypeAsmPseudoInst<"vst4${p}", ".16", "$list, $addr",
-               (ins VecListFourQ:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListFourQ:$list, addrmode6align64or128or256:$addr,
+                    pred:$p)>;
 def VST4qAsm_32 : NEONDataTypeAsmPseudoInst<"vst4${p}", ".32", "$list, $addr",
-               (ins VecListFourQ:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListFourQ:$list, addrmode6align64or128or256:$addr,
+                    pred:$p)>;
 
 def VST4dWB_fixed_Asm_8 :
         NEONDataTypeAsmPseudoInst<"vst4${p}", ".8", "$list, $addr!",
-               (ins VecListFourD:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListFourD:$list, addrmode6align64or128or256:$addr,
+                    pred:$p)>;
 def VST4dWB_fixed_Asm_16 :
         NEONDataTypeAsmPseudoInst<"vst4${p}", ".16", "$list, $addr!",
-               (ins VecListFourD:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListFourD:$list, addrmode6align64or128or256:$addr,
+                    pred:$p)>;
 def VST4dWB_fixed_Asm_32 :
         NEONDataTypeAsmPseudoInst<"vst4${p}", ".32", "$list, $addr!",
-               (ins VecListFourD:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListFourD:$list, addrmode6align64or128or256:$addr,
+                    pred:$p)>;
 def VST4qWB_fixed_Asm_8 :
         NEONDataTypeAsmPseudoInst<"vst4${p}", ".8", "$list, $addr!",
-               (ins VecListFourQ:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListFourQ:$list, addrmode6align64or128or256:$addr,
+                    pred:$p)>;
 def VST4qWB_fixed_Asm_16 :
         NEONDataTypeAsmPseudoInst<"vst4${p}", ".16", "$list, $addr!",
-               (ins VecListFourQ:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListFourQ:$list, addrmode6align64or128or256:$addr,
+                    pred:$p)>;
 def VST4qWB_fixed_Asm_32 :
         NEONDataTypeAsmPseudoInst<"vst4${p}", ".32", "$list, $addr!",
-               (ins VecListFourQ:$list, addrmode6:$addr, pred:$p)>;
+               (ins VecListFourQ:$list, addrmode6align64or128or256:$addr,
+                    pred:$p)>;
 def VST4dWB_register_Asm_8 :
         NEONDataTypeAsmPseudoInst<"vst4${p}", ".8", "$list, $addr, $Rm",
-                  (ins VecListFourD:$list, addrmode6:$addr,
+                  (ins VecListFourD:$list, addrmode6align64or128or256:$addr,
                        rGPR:$Rm, pred:$p)>;
 def VST4dWB_register_Asm_16 :
         NEONDataTypeAsmPseudoInst<"vst4${p}", ".16", "$list, $addr, $Rm",
-                  (ins VecListFourD:$list, addrmode6:$addr,
+                  (ins VecListFourD:$list, addrmode6align64or128or256:$addr,
                        rGPR:$Rm, pred:$p)>;
 def VST4dWB_register_Asm_32 :
         NEONDataTypeAsmPseudoInst<"vst4${p}", ".32", "$list, $addr, $Rm",
-                  (ins VecListFourD:$list, addrmode6:$addr,
+                  (ins VecListFourD:$list, addrmode6align64or128or256:$addr,
                        rGPR:$Rm, pred:$p)>;
 def VST4qWB_register_Asm_8 :
         NEONDataTypeAsmPseudoInst<"vst4${p}", ".8", "$list, $addr, $Rm",
-                  (ins VecListFourQ:$list, addrmode6:$addr,
+                  (ins VecListFourQ:$list, addrmode6align64or128or256:$addr,
                        rGPR:$Rm, pred:$p)>;
 def VST4qWB_register_Asm_16 :
         NEONDataTypeAsmPseudoInst<"vst4${p}", ".16", "$list, $addr, $Rm",
-                  (ins VecListFourQ:$list, addrmode6:$addr,
+                  (ins VecListFourQ:$list, addrmode6align64or128or256:$addr,
                        rGPR:$Rm, pred:$p)>;
 def VST4qWB_register_Asm_32 :
         NEONDataTypeAsmPseudoInst<"vst4${p}", ".32", "$list, $addr, $Rm",
-                  (ins VecListFourQ:$list, addrmode6:$addr,
+                  (ins VecListFourQ:$list, addrmode6align64or128or256:$addr,
                        rGPR:$Rm, pred:$p)>;
 
 // VMOV/VMVN takes an optional datatype suffix
diff --git a/lib/Target/ARM/ARMInstrThumb.td b/lib/Target/ARM/ARMInstrThumb.td
index 754295f..e17f73a 100644
--- a/lib/Target/ARM/ARMInstrThumb.td
+++ b/lib/Target/ARM/ARMInstrThumb.td
@@ -269,7 +269,8 @@ class T1SystemEncoding<bits<8> opc>
   let Inst{7-0} = opc;
 }
 
-def tHINT : T1pI<(outs), (ins imm0_15:$imm), NoItinerary, "hint", "\t$imm", []>,
+def tHINT : T1pI<(outs), (ins imm0_15:$imm), NoItinerary, "hint", "\t$imm",
+                 [(int_arm_hint imm0_15:$imm)]>,
             T1SystemEncoding<0x00>,
             Requires<[IsThumb, HasV6M]> {
   bits<4> imm;
@@ -288,7 +289,6 @@ def : tHintAlias<"sev$p", (tHINT 4, pred:$p)>; // A8.6.157
 def : tInstAlias<"sevl$p", (tHINT 5, pred:$p)> {
   let Predicates = [IsThumb2, HasV8];
 }
-def : T2Pat<(int_arm_sevl), (tHINT 5)>;
 
 // The imm operand $val can be used by a debugger to store more information
 // about the breakpoint.
@@ -1193,6 +1193,15 @@ def tTST :                      // A8.6.230
                [(ARMcmpZ (and_su tGPR:$Rn, tGPR:$Rm), 0)]>,
                Sched<[WriteALU]>;
 
+// A8.8.247  UDF - Undefined (Encoding T1)
+def tUDF : TI<(outs), (ins imm0_255:$imm8), IIC_Br, "udf\t$imm8",
+              [(int_arm_undefined imm0_255:$imm8)]>, Encoding16 {
+  bits<8> imm8;
+  let Inst{15-12} = 0b1101;
+  let Inst{11-8} = 0b1110;
+  let Inst{7-0} = imm8;
+}
+
 // Zero-extend byte
 def tUXTB :                     // A8.6.262
   T1pIMiscEncode<{0,0,1,0,1,1,?}, (outs tGPR:$Rd), (ins tGPR:$Rm),
@@ -1308,6 +1317,18 @@ def : T1Pat<(addc   tGPR:$lhs, imm8_255_neg:$rhs),
 def : T1Pat<(subc   tGPR:$lhs, tGPR:$rhs),
             (tSUBrr tGPR:$lhs, tGPR:$rhs)>;
 
+// Bswap 16 with load/store
+def : T1Pat<(srl (bswap (extloadi16 t_addrmode_rrs2:$addr)), (i32 16)),
+            (tREV16 (tLDRHr t_addrmode_rrs2:$addr))>;
+def : T1Pat<(srl (bswap (extloadi16 t_addrmode_is2:$addr)), (i32 16)),
+            (tREV16 (tLDRHi t_addrmode_is2:$addr))>;
+def : T1Pat<(truncstorei16 (srl (bswap tGPR:$Rn), (i32 16)),
+                           t_addrmode_rrs2:$addr),
+            (tSTRHr (tREV16 tGPR:$Rn), t_addrmode_rrs2:$addr)>;
+def : T1Pat<(truncstorei16 (srl (bswap tGPR:$Rn), (i32 16)),
+                           t_addrmode_is2:$addr),
+            (tSTRHi(tREV16 tGPR:$Rn), t_addrmode_is2:$addr)>;
+
 // ConstantPool
 def : T1Pat<(ARMWrapper  tconstpool  :$dst), (tLEApcrel tconstpool  :$dst)>;
 
diff --git a/lib/Target/ARM/ARMInstrThumb2.td b/lib/Target/ARM/ARMInstrThumb2.td
index 387bd60..c30d6ab 100644
--- a/lib/Target/ARM/ARMInstrThumb2.td
+++ b/lib/Target/ARM/ARMInstrThumb2.td
@@ -1445,7 +1445,7 @@ defm t2STRH:T2I_st<0b01,"strh", IIC_iStore_bh_i, IIC_iStore_bh_si,
 // Store doubleword
 let mayStore = 1, neverHasSideEffects = 1, hasExtraSrcRegAllocReq = 1 in
 def t2STRDi8 : T2Ii8s4<1, 0, 0, (outs),
-                       (ins GPR:$Rt, GPR:$Rt2, t2addrmode_imm8s4:$addr),
+                       (ins rGPR:$Rt, rGPR:$Rt2, t2addrmode_imm8s4:$addr),
                IIC_iStore_d_r, "strd", "\t$Rt, $Rt2, $addr", "", []>;
 
 // Indexed stores
@@ -1676,7 +1676,7 @@ defm t2PLI    : T2Ipl<0, 1, "pli">,  Requires<[IsThumb2,HasV7]>;
 // pci variant is very similar to i12, but supports negative offsets
 // from the PC. Only PLD and PLI have pci variants (not PLDW)
 class T2Iplpci<bits<1> inst, string opc> : T2Iso<(outs), (ins t2ldrlabel:$addr),
-               IIC_Preload, opc, "\t$addr", 
+               IIC_Preload, opc, "\t$addr",
                [(ARMPreload (ARMWrapper tconstpool:$addr),
                 (i32 0), (i32 inst))]>, Sched<[WritePreLd]> {
   let Inst{31-25} = 0b1111100;
@@ -1918,7 +1918,7 @@ def t2MOVi16 : T2I<(outs rGPR:$Rd), (ins imm0_65535_expr:$imm), IIC_iMOVi,
   let DecoderMethod = "DecodeT2MOVTWInstruction";
 }
 
-def : t2InstAlias<"mov${p} $Rd, $imm", 
+def : t2InstAlias<"mov${p} $Rd, $imm",
                   (t2MOVi16 rGPR:$Rd, imm256_65535_expr:$imm, pred:$p)>;
 
 def t2MOVi16_ga_pcrel : PseudoInst<(outs rGPR:$Rd),
@@ -2407,6 +2407,19 @@ def t2UBFX: T2TwoRegBitFI<
   let Inst{15} = 0;
 }
 
+// A8.8.247  UDF - Undefined (Encoding T2)
+def t2UDF : T2XI<(outs), (ins imm0_65535:$imm16), IIC_Br, "udf.w\t$imm16",
+                 [(int_arm_undefined imm0_65535:$imm16)]> {
+  bits<16> imm16;
+  let Inst{31-29} = 0b111;
+  let Inst{28-27} = 0b10;
+  let Inst{26-20} = 0b1111111;
+  let Inst{19-16} = imm16{15-12};
+  let Inst{15} = 0b1;
+  let Inst{14-12} = 0b010;
+  let Inst{11-0} = imm16{11-0};
+}
+
 // A8.6.18  BFI - Bitfield insert (Encoding T1)
 let Constraints = "$src = $Rd" in {
   def t2BFI : T2TwoRegBitFI<(outs rGPR:$Rd),
@@ -3495,8 +3508,8 @@ def t2B   : T2I<(outs), (ins uncondbrtarget:$target), IIC_Br,
   let Inst{25-16} = target{20-11};
   let Inst{10-0} = target{10-0};
   let DecoderMethod = "DecodeT2BInstruction";
-  let AsmMatchConverter = "cvtThumbBranches"; 
-} 
+  let AsmMatchConverter = "cvtThumbBranches";
+}
 
 let isNotDuplicable = 1, isIndirectBranch = 1 in {
 def t2BR_JT : t2PseudoInst<(outs),
@@ -3671,7 +3684,8 @@ def : t2InstAlias<"cps.w $mode", (t2CPS1p imm0_31:$mode), 0>;
 
 // A6.3.4 Branches and miscellaneous control
 // Table A6-14 Change Processor State, and hint instructions
-def t2HINT : T2I<(outs), (ins imm0_239:$imm), NoItinerary, "hint", ".w\t$imm",[]> {
+def t2HINT : T2I<(outs), (ins imm0_239:$imm), NoItinerary, "hint", ".w\t$imm",
+                  [(int_arm_hint imm0_239:$imm)]> {
   bits<8> imm;
   let Inst{31-3} = 0b11110011101011111000000000000;
   let Inst{7-0} = imm;
@@ -3698,7 +3712,7 @@ def t2DBG : T2I<(outs), (ins imm0_15:$opt), NoItinerary, "dbg", "\t$opt", []> {
 
 // Secure Monitor Call is a system instruction.
 // Option = Inst{19-16}
-def t2SMC : T2I<(outs), (ins imm0_15:$opt), NoItinerary, "smc", "\t$opt", 
+def t2SMC : T2I<(outs), (ins imm0_15:$opt), NoItinerary, "smc", "\t$opt",
                 []>, Requires<[IsThumb2, HasTrustZone]> {
   let Inst{31-27} = 0b11110;
   let Inst{26-20} = 0b1111111;
@@ -4278,7 +4292,7 @@ def : t2InstAlias<"sbc${s}${p} $Rd, $Rn, $ShiftedRm",
 
 // Aliases for ADD without the ".w" optional width specifier.
 def : t2InstAlias<"add${s}${p} $Rd, $Rn, $imm",
-        (t2ADDri GPRnopc:$Rd, GPRnopc:$Rn, t2_so_imm:$imm, pred:$p, 
+        (t2ADDri GPRnopc:$Rd, GPRnopc:$Rn, t2_so_imm:$imm, pred:$p,
          cc_out:$s)>;
 def : t2InstAlias<"add${p} $Rd, $Rn, $imm",
            (t2ADDri12 GPRnopc:$Rd, GPR:$Rn, imm0_4095:$imm, pred:$p)>;
diff --git a/lib/Target/ARM/ARMJITInfo.cpp b/lib/Target/ARM/ARMJITInfo.cpp
index 73c6eb7..8821c2d 100644
--- a/lib/Target/ARM/ARMJITInfo.cpp
+++ b/lib/Target/ARM/ARMJITInfo.cpp
@@ -11,7 +11,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "jit"
 #include "ARMJITInfo.h"
 #include "ARMConstantPoolValue.h"
 #include "ARMRelocations.h"
@@ -25,6 +24,8 @@
 #include <cstdlib>
 using namespace llvm;
 
+#define DEBUG_TYPE "jit"
+
 void ARMJITInfo::replaceMachineCodeForFunction(void *Old, void *New) {
   report_fatal_error("ARMJITInfo::replaceMachineCodeForFunction");
 }
@@ -319,13 +320,13 @@ void ARMJITInfo::relocate(void *Function, MachineRelocation *MR,
       break;
     }
     case ARM::reloc_arm_movw: {
-      ResultPtr = ResultPtr & 0xFFFF; 
+      ResultPtr = ResultPtr & 0xFFFF;
       *((intptr_t*)RelocPos) |= ResultPtr & 0xFFF;
       *((intptr_t*)RelocPos) |= ((ResultPtr >> 12) & 0xF) << 16;
       break;
     }
     case ARM::reloc_arm_movt: {
-      ResultPtr = (ResultPtr >> 16) & 0xFFFF; 
+      ResultPtr = (ResultPtr >> 16) & 0xFFFF;
       *((intptr_t*)RelocPos) |= ResultPtr & 0xFFF;
       *((intptr_t*)RelocPos) |= ((ResultPtr >> 12) & 0xF) << 16;
       break;
diff --git a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
index 48e0bd7..ee7df54 100644
--- a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
+++ b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
@@ -12,13 +12,14 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "arm-ldst-opt"
 #include "ARM.h"
 #include "ARMBaseInstrInfo.h"
 #include "ARMBaseRegisterInfo.h"
+#include "ARMISelLowering.h"
 #include "ARMMachineFunctionInfo.h"
 #include "ARMSubtarget.h"
 #include "MCTargetDesc/ARMAddressingModes.h"
+#include "Thumb1RegisterInfo.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallPtrSet.h"
@@ -42,6 +43,8 @@
 #include "llvm/Target/TargetRegisterInfo.h"
 using namespace llvm;
 
+#define DEBUG_TYPE "arm-ldst-opt"
+
 STATISTIC(NumLDMGened , "Number of ldm instructions generated");
 STATISTIC(NumSTMGened , "Number of stm instructions generated");
 STATISTIC(NumVLDMGened, "Number of vldm instructions generated");
@@ -65,9 +68,10 @@ namespace {
     const TargetInstrInfo *TII;
     const TargetRegisterInfo *TRI;
     const ARMSubtarget *STI;
+    const TargetLowering *TL;
     ARMFunctionInfo *AFI;
     RegScavenger *RS;
-    bool isThumb2;
+    bool isThumb1, isThumb2;
 
     bool runOnMachineFunction(MachineFunction &Fn) override;
 
@@ -93,7 +97,10 @@ namespace {
     void findUsesOfImpDef(SmallVectorImpl<MachineOperand *> &UsesOfImpDefs,
                           const MemOpQueue &MemOps, unsigned DefReg,
                           unsigned RangeBegin, unsigned RangeEnd);
-
+    void UpdateBaseRegUses(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator MBBI,
+                           DebugLoc dl, unsigned Base, unsigned WordOffset,
+                           ARMCC::CondCodes Pred, unsigned PredReg);
     bool MergeOps(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
                   int Offset, unsigned Base, bool BaseKill, int Opcode,
                   ARMCC::CondCodes Pred, unsigned PredReg, unsigned Scratch,
@@ -119,7 +126,6 @@ namespace {
                       ARMCC::CondCodes Pred, unsigned PredReg,
                       unsigned Scratch, MemOpQueue &MemOps,
                       SmallVectorImpl<MachineBasicBlock::iterator> &Merges);
-
     void AdvanceRS(MachineBasicBlock &MBB, MemOpQueue &MemOps);
     bool FixInvalidRegPairOp(MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator &MBBI);
@@ -159,6 +165,21 @@ static int getLoadStoreMultipleOpcode(int Opcode, ARM_AM::AMSubMode Mode) {
     case ARM_AM::db: return ARM::STMDB;
     case ARM_AM::ib: return ARM::STMIB;
     }
+  case ARM::tLDRi:
+    // tLDMIA is writeback-only - unless the base register is in the input
+    // reglist.
+    ++NumLDMGened;
+    switch (Mode) {
+    default: llvm_unreachable("Unhandled submode!");
+    case ARM_AM::ia: return ARM::tLDMIA;
+    }
+  case ARM::tSTRi:
+    // There is no non-writeback tSTMIA either.
+    ++NumSTMGened;
+    switch (Mode) {
+    default: llvm_unreachable("Unhandled submode!");
+    case ARM_AM::ia: return ARM::tSTMIA_UPD;
+    }
   case ARM::t2LDRi8:
   case ARM::t2LDRi12:
     ++NumLDMGened;
@@ -217,6 +238,9 @@ AMSubMode getLoadStoreMultipleSubMode(int Opcode) {
   case ARM::LDMIA_UPD:
   case ARM::STMIA:
   case ARM::STMIA_UPD:
+  case ARM::tLDMIA:
+  case ARM::tLDMIA_UPD:
+  case ARM::tSTMIA_UPD:
   case ARM::t2LDMIA_RET:
   case ARM::t2LDMIA:
   case ARM::t2LDMIA_UPD:
@@ -263,12 +287,20 @@ AMSubMode getLoadStoreMultipleSubMode(int Opcode) {
   } // end namespace ARM_AM
 } // end namespace llvm
 
+static bool isT1i32Load(unsigned Opc) {
+  return Opc == ARM::tLDRi;
+}
+
 static bool isT2i32Load(unsigned Opc) {
   return Opc == ARM::t2LDRi12 || Opc == ARM::t2LDRi8;
 }
 
 static bool isi32Load(unsigned Opc) {
-  return Opc == ARM::LDRi12 || isT2i32Load(Opc);
+  return Opc == ARM::LDRi12 || isT1i32Load(Opc) || isT2i32Load(Opc) ;
+}
+
+static bool isT1i32Store(unsigned Opc) {
+  return Opc == ARM::tSTRi;
 }
 
 static bool isT2i32Store(unsigned Opc) {
@@ -276,7 +308,102 @@ static bool isT2i32Store(unsigned Opc) {
 }
 
 static bool isi32Store(unsigned Opc) {
-  return Opc == ARM::STRi12 || isT2i32Store(Opc);
+  return Opc == ARM::STRi12 || isT1i32Store(Opc) || isT2i32Store(Opc);
+}
+
+static unsigned getImmScale(unsigned Opc) {
+  switch (Opc) {
+  default: llvm_unreachable("Unhandled opcode!");
+  case ARM::tLDRi:
+  case ARM::tSTRi:
+    return 1;
+  case ARM::tLDRHi:
+  case ARM::tSTRHi:
+    return 2;
+  case ARM::tLDRBi:
+  case ARM::tSTRBi:
+    return 4;
+  }
+}
+
+/// Update future uses of the base register with the offset introduced
+/// due to writeback. This function only works on Thumb1.
+void
+ARMLoadStoreOpt::UpdateBaseRegUses(MachineBasicBlock &MBB,
+                                   MachineBasicBlock::iterator MBBI,
+                                   DebugLoc dl, unsigned Base,
+                                   unsigned WordOffset,
+                                   ARMCC::CondCodes Pred, unsigned PredReg) {
+  assert(isThumb1 && "Can only update base register uses for Thumb1!");
+
+  // Start updating any instructions with immediate offsets. Insert a sub before
+  // the first non-updateable instruction (if any).
+  for (; MBBI != MBB.end(); ++MBBI) {
+    if (MBBI->readsRegister(Base)) {
+      unsigned Opc = MBBI->getOpcode();
+      int Offset;
+      bool InsertSub = false;
+
+      if (Opc == ARM::tLDRi  || Opc == ARM::tSTRi  ||
+          Opc == ARM::tLDRHi || Opc == ARM::tSTRHi ||
+          Opc == ARM::tLDRBi || Opc == ARM::tSTRBi) {
+        // Loads and stores with immediate offsets can be updated, but only if
+        // the new offset isn't negative.
+        // The MachineOperand containing the offset immediate is the last one
+        // before predicates.
+        MachineOperand &MO =
+          MBBI->getOperand(MBBI->getDesc().getNumOperands() - 3);
+        // The offsets are scaled by 1, 2 or 4 depending on the Opcode
+        Offset = MO.getImm() - WordOffset * getImmScale(Opc);
+        if (Offset >= 0)
+          MO.setImm(Offset);
+        else
+          InsertSub = true;
+
+      } else if (Opc == ARM::tSUBi8 || Opc == ARM::tADDi8) {
+        // SUB/ADD using this register. Merge it with the update.
+        // If the merged offset is too large, insert a new sub instead.
+        MachineOperand &MO =
+          MBBI->getOperand(MBBI->getDesc().getNumOperands() - 3);
+        Offset = (Opc == ARM::tSUBi8) ?
+          MO.getImm() + WordOffset * 4 :
+          MO.getImm() - WordOffset * 4 ;
+        if (TL->isLegalAddImmediate(Offset)) {
+          MO.setImm(Offset);
+          // The base register has now been reset, so exit early.
+          return;
+        } else {
+          InsertSub = true;
+        }
+
+      } else {
+        // Can't update the instruction.
+        InsertSub = true;
+      }
+
+      if (InsertSub) {
+        // An instruction above couldn't be updated, so insert a sub.
+        AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII->get(ARM::tSUBi8), Base))
+          .addReg(Base, getKillRegState(true)).addImm(WordOffset * 4)
+          .addImm(Pred).addReg(PredReg);
+        return;
+      }
+    }
+
+    if (MBBI->killsRegister(Base))
+      // Register got killed. Stop updating.
+      return;
+  }
+
+  // The end of the block was reached. This means register liveness escapes the
+  // block, and it's necessary to insert a sub before the last instruction.
+  if (MBB.succ_size() > 0)
+    // But only insert the SUB if there is actually a successor block.
+    // FIXME: Check more carefully if register is live at this point, e.g. by
+    // also examining the successor block's register liveness information.
+    AddDefaultT1CC(BuildMI(MBB, --MBBI, dl, TII->get(ARM::tSUBi8), Base))
+      .addReg(Base, getKillRegState(true)).addImm(WordOffset * 4)
+      .addImm(Pred).addReg(PredReg);
 }
 
 /// MergeOps - Create and insert a LDM or STM with Base as base register and
@@ -296,18 +423,19 @@ ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB,
     return false;
 
   ARM_AM::AMSubMode Mode = ARM_AM::ia;
-  // VFP and Thumb2 do not support IB or DA modes.
+  // VFP and Thumb2 do not support IB or DA modes. Thumb1 only supports IA.
   bool isNotVFP = isi32Load(Opcode) || isi32Store(Opcode);
-  bool haveIBAndDA = isNotVFP && !isThumb2;
-  if (Offset == 4 && haveIBAndDA)
+  bool haveIBAndDA = isNotVFP && !isThumb2 && !isThumb1;
+
+  if (Offset == 4 && haveIBAndDA) {
     Mode = ARM_AM::ib;
-  else if (Offset == -4 * (int)NumRegs + 4 && haveIBAndDA)
+  } else if (Offset == -4 * (int)NumRegs + 4 && haveIBAndDA) {
     Mode = ARM_AM::da;
-  else if (Offset == -4 * (int)NumRegs && isNotVFP)
+  } else if (Offset == -4 * (int)NumRegs && isNotVFP && !isThumb1) {
     // VLDM/VSTM do not support DB mode without also updating the base reg.
     Mode = ARM_AM::db;
-  else if (Offset != 0) {
-    // Check if this is a supported opcode before we insert instructions to
+  } else if (Offset != 0) {
+    // Check if this is a supported opcode before inserting instructions to
     // calculate a new base register.
     if (!getLoadStoreMultipleOpcode(Opcode, Mode)) return false;
 
@@ -318,41 +446,98 @@ ARMLoadStoreOpt::MergeOps(MachineBasicBlock &MBB,
       return false;
 
     unsigned NewBase;
-    if (isi32Load(Opcode))
+    if (isi32Load(Opcode)) {
       // If it is a load, then just use one of the destination register to
       // use as the new base.
       NewBase = Regs[NumRegs-1].first;
-    else {
+    } else {
       // Use the scratch register to use as a new base.
       NewBase = Scratch;
       if (NewBase == 0)
         return false;
     }
-    int BaseOpc = !isThumb2 ? ARM::ADDri : ARM::t2ADDri;
+
+    int BaseOpc =
+      isThumb2 ? ARM::t2ADDri :
+      isThumb1 ? ARM::tADDi8  : ARM::ADDri;
+
     if (Offset < 0) {
-      BaseOpc = !isThumb2 ? ARM::SUBri : ARM::t2SUBri;
+      BaseOpc =
+        isThumb2 ? ARM::t2SUBri :
+        isThumb1 ? ARM::tSUBi8  : ARM::SUBri;
       Offset = - Offset;
     }
-    int ImmedOffset = isThumb2
-      ? ARM_AM::getT2SOImmVal(Offset) : ARM_AM::getSOImmVal(Offset);
-    if (ImmedOffset == -1)
-      // FIXME: Try t2ADDri12 or t2SUBri12?
-      return false;  // Probably not worth it then.
-
-    BuildMI(MBB, MBBI, dl, TII->get(BaseOpc), NewBase)
-      .addReg(Base, getKillRegState(BaseKill)).addImm(Offset)
-      .addImm(Pred).addReg(PredReg).addReg(0);
+
+    if (!TL->isLegalAddImmediate(Offset))
+      // FIXME: Try add with register operand?
+      return false; // Probably not worth it then.
+
+    if (isThumb1) {
+      if (Base != NewBase) {
+        // Need to insert a MOV to the new base first.
+        // FIXME: If the immediate fits in 3 bits, use ADD instead.
+        BuildMI(MBB, MBBI, dl, TII->get(ARM::tMOVr), NewBase)
+          .addReg(Base, getKillRegState(BaseKill))
+          .addImm(Pred).addReg(PredReg);
+      }
+      AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII->get(BaseOpc), NewBase))
+        .addReg(NewBase, getKillRegState(true)).addImm(Offset)
+        .addImm(Pred).addReg(PredReg);
+    } else {
+      BuildMI(MBB, MBBI, dl, TII->get(BaseOpc), NewBase)
+        .addReg(Base, getKillRegState(BaseKill)).addImm(Offset)
+        .addImm(Pred).addReg(PredReg).addReg(0);
+    }
+
     Base = NewBase;
-    BaseKill = true;  // New base is always killed right its use.
+    BaseKill = true; // New base is always killed straight away.
   }
 
   bool isDef = (isi32Load(Opcode) || Opcode == ARM::VLDRS ||
                 Opcode == ARM::VLDRD);
+
+  // Get LS multiple opcode. Note that for Thumb1 this might be an opcode with
+  // base register writeback.
   Opcode = getLoadStoreMultipleOpcode(Opcode, Mode);
   if (!Opcode) return false;
-  MachineInstrBuilder MIB = BuildMI(MBB, MBBI, dl, TII->get(Opcode))
-    .addReg(Base, getKillRegState(BaseKill))
-    .addImm(Pred).addReg(PredReg);
+
+  bool Writeback = isThumb1; // Thumb1 LDM/STM have base reg writeback.
+
+  // Exception: If the base register is in the input reglist, Thumb1 LDM is
+  // non-writeback. Check for this.
+  if (Opcode == ARM::tLDRi && isThumb1)
+    for (unsigned I = 0; I < NumRegs; ++I)
+      if (Base == Regs[I].first) {
+        Writeback = false;
+        break;
+      }
+
+  MachineInstrBuilder MIB;
+
+  if (Writeback) {
+    if (Opcode == ARM::tLDMIA)
+      // Update tLDMIA with writeback if necessary.
+      Opcode = ARM::tLDMIA_UPD;
+
+    // The base isn't dead after a merged instruction with writeback. Update
+    // future uses of the base with the added offset (if possible), or reset
+    // the base register as necessary.
+    if (!BaseKill)
+      UpdateBaseRegUses(MBB, MBBI, dl, Base, NumRegs, Pred, PredReg);
+
+    MIB = BuildMI(MBB, MBBI, dl, TII->get(Opcode));
+
+    // Thumb1: we might need to set base writeback when building the MI.
+    MIB.addReg(Base, getDefRegState(true))
+       .addReg(Base, getKillRegState(BaseKill));
+  } else {
+    // No writeback, simply build the MachineInstr.
+    MIB = BuildMI(MBB, MBBI, dl, TII->get(Opcode));
+    MIB.addReg(Base, getKillRegState(BaseKill));
+  }
+
+  MIB.addImm(Pred).addReg(PredReg);
+
   for (unsigned i = 0; i != NumRegs; ++i)
     MIB = MIB.addReg(Regs[i].first, getDefRegState(isDef)
                      | getKillRegState(Regs[i].second));
@@ -492,7 +677,7 @@ void ARMLoadStoreOpt::MergeOpsUpdate(MachineBasicBlock &MBB,
   // affected uses.
   for (SmallVectorImpl<MachineOperand *>::iterator I = UsesOfImpDefs.begin(),
                                                    E = UsesOfImpDefs.end();
-       I != E; ++I)
+                                                   I != E; ++I)
     (*I)->setIsUndef();
 
   for (unsigned i = memOpsBegin; i < memOpsEnd; ++i) {
@@ -589,7 +774,6 @@ ARMLoadStoreOpt::MergeLDR_STR(MachineBasicBlock &MBB, unsigned SIndex,
   bool BaseKill = Loc->findRegisterUseOperandIdx(Base, true) != -1;
   MergeOpsUpdate(MBB, MemOps, SIndex, MemOps.size(), insertAfter, SOffset,
                  Base, BaseKill, Opcode, Pred, PredReg, Scratch, dl, Merges);
-  return;
 }
 
 static bool definesCPSR(MachineInstr *MI) {
@@ -616,6 +800,7 @@ static bool isMatchingDecrement(MachineInstr *MI, unsigned Base,
   bool CheckCPSRDef = false;
   switch (MI->getOpcode()) {
   default: return false;
+  case ARM::tSUBi8:
   case ARM::t2SUBri:
   case ARM::SUBri:
     CheckCPSRDef = true;
@@ -628,10 +813,11 @@ static bool isMatchingDecrement(MachineInstr *MI, unsigned Base,
   if (Bytes == 0 || (Limit && Bytes >= Limit))
     return false;
 
-  unsigned Scale = (MI->getOpcode() == ARM::tSUBspi) ? 4 : 1; // FIXME
+  unsigned Scale = (MI->getOpcode() == ARM::tSUBspi ||
+                    MI->getOpcode() == ARM::tSUBi8) ? 4 : 1; // FIXME
   if (!(MI->getOperand(0).getReg() == Base &&
         MI->getOperand(1).getReg() == Base &&
-        (MI->getOperand(2).getImm()*Scale) == Bytes &&
+        (MI->getOperand(2).getImm() * Scale) == Bytes &&
         getInstrPredicate(MI, MyPredReg) == Pred &&
         MyPredReg == PredReg))
     return false;
@@ -649,6 +835,7 @@ static bool isMatchingIncrement(MachineInstr *MI, unsigned Base,
   bool CheckCPSRDef = false;
   switch (MI->getOpcode()) {
   default: return false;
+  case ARM::tADDi8:
   case ARM::t2ADDri:
   case ARM::ADDri:
     CheckCPSRDef = true;
@@ -661,10 +848,11 @@ static bool isMatchingIncrement(MachineInstr *MI, unsigned Base,
     // Make sure the offset fits in 8 bits.
     return false;
 
-  unsigned Scale = (MI->getOpcode() == ARM::tADDspi) ? 4 : 1; // FIXME
+  unsigned Scale = (MI->getOpcode() == ARM::tADDspi ||
+                    MI->getOpcode() == ARM::tADDi8) ? 4 : 1; // FIXME
   if (!(MI->getOperand(0).getReg() == Base &&
         MI->getOperand(1).getReg() == Base &&
-        (MI->getOperand(2).getImm()*Scale) == Bytes &&
+        (MI->getOperand(2).getImm() * Scale) == Bytes &&
         getInstrPredicate(MI, MyPredReg) == Pred &&
         MyPredReg == PredReg))
     return false;
@@ -677,6 +865,8 @@ static inline unsigned getLSMultipleTransferSize(MachineInstr *MI) {
   default: return 0;
   case ARM::LDRi12:
   case ARM::STRi12:
+  case ARM::tLDRi:
+  case ARM::tSTRi:
   case ARM::t2LDRi8:
   case ARM::t2LDRi12:
   case ARM::t2STRi8:
@@ -695,6 +885,9 @@ static inline unsigned getLSMultipleTransferSize(MachineInstr *MI) {
   case ARM::STMDA:
   case ARM::STMDB:
   case ARM::STMIB:
+  case ARM::tLDMIA:
+  case ARM::tLDMIA_UPD:
+  case ARM::tSTMIA_UPD:
   case ARM::t2LDMIA:
   case ARM::t2LDMDB:
   case ARM::t2STMIA:
@@ -791,6 +984,9 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineBasicBlock &MBB,
                                                MachineBasicBlock::iterator MBBI,
                                                bool &Advance,
                                                MachineBasicBlock::iterator &I) {
+  // Thumb1 is already using updating loads/stores.
+  if (isThumb1) return false;
+
   MachineInstr *MI = MBBI;
   unsigned Base = MI->getOperand(0).getReg();
   bool BaseKill = MI->getOperand(0).isKill();
@@ -927,6 +1123,10 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineBasicBlock &MBB,
                                                const TargetInstrInfo *TII,
                                                bool &Advance,
                                                MachineBasicBlock::iterator &I) {
+  // Thumb1 doesn't have updating LDR/STR.
+  // FIXME: Use LDM/STM with single register instead.
+  if (isThumb1) return false;
+
   MachineInstr *MI = MBBI;
   unsigned Base = MI->getOperand(1).getReg();
   bool BaseKill = MI->getOperand(1).isKill();
@@ -1002,7 +1202,7 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineBasicBlock &MBB,
     return false;
 
   if (isAM5) {
-    // VLDM[SD}_UPD, VSTM[SD]_UPD
+    // VLDM[SD]_UPD, VSTM[SD]_UPD
     // (There are no base-updating versions of VLDR/VSTR instructions, but the
     // updating load/store-multiple instructions can be used with only one
     // register.)
@@ -1100,6 +1300,8 @@ static bool isMemoryOp(const MachineInstr *MI) {
     return MI->getOperand(1).isReg();
   case ARM::LDRi12:
   case ARM::STRi12:
+  case ARM::tLDRi:
+  case ARM::tSTRi:
   case ARM::t2LDRi8:
   case ARM::t2LDRi12:
   case ARM::t2STRi8:
@@ -1137,6 +1339,10 @@ static int getMemoryOpOffset(const MachineInstr *MI) {
       Opcode == ARM::LDRi12   || Opcode == ARM::STRi12)
     return OffField;
 
+  // Thumb1 immediate offsets are scaled by 4
+  if (Opcode == ARM::tLDRi || Opcode == ARM::tSTRi)
+    return OffField * 4;
+
   int Offset = isAM3 ? ARM_AM::getAM3Offset(OffField)
     : ARM_AM::getAM5Offset(OffField) * 4;
   if (isAM3) {
@@ -1408,16 +1614,20 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) {
       if (MBBI == E)
         // Reach the end of the block, try merging the memory instructions.
         TryMerge = true;
-    } else
+    } else {
       TryMerge = true;
+    }
 
     if (TryMerge) {
       if (NumMemOps > 1) {
         // Try to find a free register to use as a new base in case it's needed.
         // First advance to the instruction just before the start of the chain.
         AdvanceRS(MBB, MemOps);
+
         // Find a scratch register.
-        unsigned Scratch = RS->FindUnusedReg(&ARM::GPRRegClass);
+        unsigned Scratch =
+          RS->FindUnusedReg(isThumb1 ? &ARM::tGPRRegClass : &ARM::GPRRegClass);
+
         // Process the load / store instructions.
         RS->forward(std::prev(MBBI));
 
@@ -1483,6 +1693,8 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) {
 /// =>
 ///   ldmfd sp!, {..., pc}
 bool ARMLoadStoreOpt::MergeReturnIntoLDM(MachineBasicBlock &MBB) {
+  // Thumb1 LDM doesn't allow high registers.
+  if (isThumb1) return false;
   if (MBB.empty()) return false;
 
   MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
@@ -1513,12 +1725,14 @@ bool ARMLoadStoreOpt::MergeReturnIntoLDM(MachineBasicBlock &MBB) {
 
 bool ARMLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
   const TargetMachine &TM = Fn.getTarget();
+  TL = TM.getTargetLowering();
   AFI = Fn.getInfo<ARMFunctionInfo>();
   TII = TM.getInstrInfo();
   TRI = TM.getRegisterInfo();
   STI = &TM.getSubtarget<ARMSubtarget>();
   RS = new RegScavenger();
   isThumb2 = AFI->isThumb2Function();
+  isThumb1 = AFI->isThumbFunction() && !isThumb2;
 
   bool Modified = false;
   for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
@@ -1666,11 +1880,11 @@ ARMPreAllocLoadStoreOpt::CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1,
   // FIXME: VLDRS / VSTRS -> VLDRD / VSTRD
   unsigned Scale = 1;
   unsigned Opcode = Op0->getOpcode();
-  if (Opcode == ARM::LDRi12)
+  if (Opcode == ARM::LDRi12) {
     NewOpc = ARM::LDRD;
-  else if (Opcode == ARM::STRi12)
+  } else if (Opcode == ARM::STRi12) {
     NewOpc = ARM::STRD;
-  else if (Opcode == ARM::t2LDRi8 || Opcode == ARM::t2LDRi12) {
+  } else if (Opcode == ARM::t2LDRi8 || Opcode == ARM::t2LDRi12) {
     NewOpc = ARM::t2LDRDi8;
     Scale = 4;
     isT2 = true;
@@ -1678,8 +1892,9 @@ ARMPreAllocLoadStoreOpt::CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1,
     NewOpc = ARM::t2STRDi8;
     Scale = 4;
     isT2 = true;
-  } else
+  } else {
     return false;
+  }
 
   // Make sure the base address satisfies i64 ld / st alignment requirement.
   // At the moment, we ignore the memoryoperand's value.
@@ -1746,8 +1961,8 @@ bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB,
   while (Ops.size() > 1) {
     unsigned FirstLoc = ~0U;
     unsigned LastLoc = 0;
-    MachineInstr *FirstOp = 0;
-    MachineInstr *LastOp = 0;
+    MachineInstr *FirstOp = nullptr;
+    MachineInstr *LastOp = nullptr;
     int LastOffset = 0;
     unsigned LastOpcode = 0;
     unsigned LastBytes = 0;
diff --git a/lib/Target/ARM/ARMOptimizeBarriersPass.cpp b/lib/Target/ARM/ARMOptimizeBarriersPass.cpp
index 20619fa..2a49255 100644
--- a/lib/Target/ARM/ARMOptimizeBarriersPass.cpp
+++ b/lib/Target/ARM/ARMOptimizeBarriersPass.cpp
@@ -8,8 +8,6 @@
 //
 //===------------------------------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "double barriers"
-
 #include "ARM.h"
 #include "ARMMachineFunctionInfo.h"
 #include "ARMInstrInfo.h"
@@ -17,6 +15,8 @@
 #include "llvm/CodeGen/MachineFunctionPass.h"
 using namespace llvm;
 
+#define DEBUG_TYPE "double barriers"
+
 STATISTIC(NumDMBsRemoved, "Number of DMBs removed");
 
 namespace {
@@ -25,9 +25,9 @@ public:
   static char ID;
   ARMOptimizeBarriersPass() : MachineFunctionPass(ID) {}
 
-  virtual bool runOnMachineFunction(MachineFunction &Fn);
+  bool runOnMachineFunction(MachineFunction &Fn) override;
 
-  virtual const char *getPassName() const {
+  const char *getPassName() const override {
     return "optimise barriers pass";
   }
 
diff --git a/lib/Target/ARM/ARMRegisterInfo.td b/lib/Target/ARM/ARMRegisterInfo.td
index 7f0fe05..b290e7f 100644
--- a/lib/Target/ARM/ARMRegisterInfo.td
+++ b/lib/Target/ARM/ARMRegisterInfo.td
@@ -116,13 +116,13 @@ def D15 : ARMReg<15, "d15", [S30, S31]>, DwarfRegNum<[271]>;
 }
 
 // VFP3 defines 16 additional double registers
-def D16 : ARMFReg<16, "d16">, DwarfRegNum<[272]>; 
+def D16 : ARMFReg<16, "d16">, DwarfRegNum<[272]>;
 def D17 : ARMFReg<17, "d17">, DwarfRegNum<[273]>;
 def D18 : ARMFReg<18, "d18">, DwarfRegNum<[274]>;
 def D19 : ARMFReg<19, "d19">, DwarfRegNum<[275]>;
 def D20 : ARMFReg<20, "d20">, DwarfRegNum<[276]>;
 def D21 : ARMFReg<21, "d21">, DwarfRegNum<[277]>;
-def D22 : ARMFReg<22, "d22">, DwarfRegNum<[278]>; 
+def D22 : ARMFReg<22, "d22">, DwarfRegNum<[278]>;
 def D23 : ARMFReg<23, "d23">, DwarfRegNum<[279]>;
 def D24 : ARMFReg<24, "d24">, DwarfRegNum<[280]>;
 def D25 : ARMFReg<25, "d25">, DwarfRegNum<[281]>;
@@ -158,11 +158,11 @@ def Q15 : ARMReg<15, "q15", [D30, D31]>;
 // Current Program Status Register.
 // We model fpscr with two registers: FPSCR models the control bits and will be
 // reserved. FPSCR_NZCV models the flag bits and will be unreserved. APSR_NZCV
-// models the APSR when it's accessed by some special instructions. In such cases 
+// models the APSR when it's accessed by some special instructions. In such cases
 // it has the same encoding as PC.
 def CPSR       : ARMReg<0,  "cpsr">;
 def APSR       : ARMReg<1,  "apsr">;
-def APSR_NZCV  : ARMReg<15, "apsr_nzcv">; 
+def APSR_NZCV  : ARMReg<15, "apsr_nzcv">;
 def SPSR       : ARMReg<2,  "spsr">;
 def FPSCR      : ARMReg<3,  "fpscr">;
 def FPSCR_NZCV : ARMReg<3,  "fpscr_nzcv"> {
diff --git a/lib/Target/ARM/ARMScheduleV6.td b/lib/Target/ARM/ARMScheduleV6.td
index 0ace9bc..57d0bfb 100644
--- a/lib/Target/ARM/ARMScheduleV6.td
+++ b/lib/Target/ARM/ARMScheduleV6.td
@@ -93,7 +93,7 @@ def ARMV6Itineraries : ProcessorItineraries<
   InstrItinData<IIC_iMAC32   , [InstrStage<2, [V6_Pipe]>], [5, 1, 1, 2]>,
   InstrItinData<IIC_iMUL64   , [InstrStage<3, [V6_Pipe]>], [6, 1, 1]>,
   InstrItinData<IIC_iMAC64   , [InstrStage<3, [V6_Pipe]>], [6, 1, 1, 2]>,
-  
+
   // Integer load pipeline
   //
   // Immediate offset
@@ -181,7 +181,7 @@ def ARMV6Itineraries : ProcessorItineraries<
   //
   // Store multiple + update
   InstrItinData<IIC_iStore_mu , [InstrStage<3, [V6_Pipe]>], [2]>,
-  
+
   // Branch
   //
   // no delay slots, so the latency of a branch is unimportant
diff --git a/lib/Target/ARM/ARMSelectionDAGInfo.cpp b/lib/Target/ARM/ARMSelectionDAGInfo.cpp
index ba3cf4d..008ad64 100644
--- a/lib/Target/ARM/ARMSelectionDAGInfo.cpp
+++ b/lib/Target/ARM/ARMSelectionDAGInfo.cpp
@@ -11,12 +11,13 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "arm-selectiondag-info"
 #include "ARMTargetMachine.h"
 #include "llvm/CodeGen/SelectionDAG.h"
 #include "llvm/IR/DerivedTypes.h"
 using namespace llvm;
 
+#define DEBUG_TYPE "arm-selectiondag-info"
+
 ARMSelectionDAGInfo::ARMSelectionDAGInfo(const TargetMachine &TM)
   : TargetSelectionDAGInfo(TM),
     Subtarget(&TM.getSubtarget<ARMSubtarget>()) {
@@ -52,9 +53,10 @@ ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl,
   EVT VT = MVT::i32;
   unsigned VTSize = 4;
   unsigned i = 0;
-  const unsigned MAX_LOADS_IN_LDM = 6;
-  SDValue TFOps[MAX_LOADS_IN_LDM];
-  SDValue Loads[MAX_LOADS_IN_LDM];
+  // Emit a maximum of 4 loads in Thumb1 since we have fewer registers
+  const unsigned MAX_LOADS_IN_LDM = Subtarget->isThumb1Only() ? 4 : 6;
+  SDValue TFOps[6];
+  SDValue Loads[6];
   uint64_t SrcOff = 0, DstOff = 0;
 
   // Emit up to MAX_LOADS_IN_LDM loads, then a TokenFactor barrier, then the
@@ -71,7 +73,8 @@ ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl,
       TFOps[i] = Loads[i].getValue(1);
       SrcOff += VTSize;
     }
-    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &TFOps[0], i);
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+                        makeArrayRef(TFOps, i));
 
     for (i = 0;
          i < MAX_LOADS_IN_LDM && EmittedNumMemOps + i < NumMemOps; ++i) {
@@ -82,7 +85,8 @@ ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl,
                               isVolatile, false, 0);
       DstOff += VTSize;
     }
-    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &TFOps[0], i);
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+                        makeArrayRef(TFOps, i));
 
     EmittedNumMemOps += i;
   }
@@ -112,7 +116,8 @@ ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl,
     SrcOff += VTSize;
     BytesLeft -= VTSize;
   }
-  Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &TFOps[0], i);
+  Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+                      makeArrayRef(TFOps, i));
 
   i = 0;
   BytesLeft = BytesLeftSave;
@@ -133,7 +138,8 @@ ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl,
     DstOff += VTSize;
     BytesLeft -= VTSize;
   }
-  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &TFOps[0], i);
+  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+                     makeArrayRef(TFOps, i));
 }
 
 // Adjust parameters for memset, EABI uses format (ptr, size, value),
@@ -146,7 +152,8 @@ EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl,
                         unsigned Align, bool isVolatile,
                         MachinePointerInfo DstPtrInfo) const {
   // Use default for non-AAPCS (or MachO) subtargets
-  if (!Subtarget->isAAPCS_ABI() || Subtarget->isTargetMachO())
+  if (!Subtarget->isAAPCS_ABI() || Subtarget->isTargetMachO() ||
+      Subtarget->isTargetWindows())
     return SDValue();
 
   const ARMTargetLowering &TLI =
@@ -179,22 +186,14 @@ EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl,
   Args.push_back(Entry);
 
   // Emit __eabi_memset call
-  TargetLowering::CallLoweringInfo CLI(Chain,
-                    Type::getVoidTy(*DAG.getContext()), // return type
-                    false, // return sign ext
-                    false, // return zero ext
-                    false, // is var arg
-                    false, // is in regs
-                    0,     // number of fixed arguments
-                    TLI.getLibcallCallingConv(RTLIB::MEMSET), // call conv
-                    false, // is tail call
-                    false, // does not return
-                    false, // is return val used
-                    DAG.getExternalSymbol(TLI.getLibcallName(RTLIB::MEMSET),
-                                          TLI.getPointerTy()), // callee
-                    Args, DAG, dl);
-  std::pair<SDValue,SDValue> CallResult =
-    TLI.LowerCallTo(CLI);
-
+  TargetLowering::CallLoweringInfo CLI(DAG);
+  CLI.setDebugLoc(dl).setChain(Chain)
+    .setCallee(TLI.getLibcallCallingConv(RTLIB::MEMSET),
+               Type::getVoidTy(*DAG.getContext()),
+               DAG.getExternalSymbol(TLI.getLibcallName(RTLIB::MEMSET),
+                                     TLI.getPointerTy()), &Args, 0)
+    .setDiscardResult();
+
+  std::pair<SDValue,SDValue> CallResult = TLI.LowerCallTo(CLI);
   return CallResult.second;
 }
diff --git a/lib/Target/ARM/ARMSubtarget.cpp b/lib/Target/ARM/ARMSubtarget.cpp
index 73e2018..5b204f6 100644
--- a/lib/Target/ARM/ARMSubtarget.cpp
+++ b/lib/Target/ARM/ARMSubtarget.cpp
@@ -21,12 +21,14 @@
 #include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetOptions.h"
 
+using namespace llvm;
+
+#define DEBUG_TYPE "arm-subtarget"
+
 #define GET_SUBTARGETINFO_TARGET_DESC
 #define GET_SUBTARGETINFO_CTOR
 #include "ARMGenSubtargetInfo.inc"
 
-using namespace llvm;
-
 static cl::opt<bool>
 ReserveR9("arm-reserve-r9", cl::Hidden,
           cl::desc("Reserve R9, making it unavailable as GPR"));
diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h
index 3855419..38536b2 100644
--- a/lib/Target/ARM/ARMSubtarget.h
+++ b/lib/Target/ARM/ARMSubtarget.h
@@ -31,7 +31,7 @@ class TargetOptions;
 class ARMSubtarget : public ARMGenSubtargetInfo {
 protected:
   enum ARMProcFamilyEnum {
-    Others, CortexA5, CortexA7, CortexA8, CortexA9, CortexA12, CortexA15, 
+    Others, CortexA5, CortexA7, CortexA8, CortexA9, CortexA12, CortexA15,
     CortexR5, Swift, CortexA53, CortexA57, Krait
   };
   enum ARMProcClassEnum {
@@ -242,9 +242,7 @@ protected:
   /// getMaxInlineSizeThreshold - Returns the maximum memset / memcpy size
   /// that still makes it profitable to inline the call.
   unsigned getMaxInlineSizeThreshold() const {
-    // FIXME: For now, we don't lower memcpy's to loads / stores for Thumb1.
-    // Change this once Thumb1 ldmia / stmia support is added.
-    return isThumb1Only() ? 0 : 64;
+    return 64;
   }
   /// ParseSubtargetFeatures - Parses features string setting specified
   /// subtarget options.  Definition of function is auto generated by tblgen.
@@ -396,7 +394,7 @@ public:
   bool isLittle() const { return IsLittle; }
 
   unsigned getMispredictionPenalty() const;
-  
+
   /// This function returns true if the target has sincos() routine in its
   /// compiler runtime or math libraries.
   bool hasSinCos() const;
diff --git a/lib/Target/ARM/ARMTargetMachine.cpp b/lib/Target/ARM/ARMTargetMachine.cpp
index 4ae539a..8876227 100644
--- a/lib/Target/ARM/ARMTargetMachine.cpp
+++ b/lib/Target/ARM/ARMTargetMachine.cpp
@@ -228,7 +228,7 @@ TargetPassConfig *ARMBaseTargetMachine::createPassConfig(PassManagerBase &PM) {
 bool ARMPassConfig::addPreISel() {
   const ARMSubtarget *Subtarget = &getARMSubtarget();
   if (Subtarget->hasAnyDataBarrier() && !Subtarget->isThumb1Only())
-    addPass(createARMAtomicExpandPass(TM));
+    addPass(createAtomicExpandLoadLinkedPass(TM));
 
   if (TM->getOptLevel() != CodeGenOpt::None)
     addPass(createGlobalMergePass(TM));
@@ -247,8 +247,7 @@ bool ARMPassConfig::addInstSelector() {
 }
 
 bool ARMPassConfig::addPreRegAlloc() {
-  // FIXME: temporarily disabling load / store optimization pass for Thumb1.
-  if (getOptLevel() != CodeGenOpt::None && !getARMSubtarget().isThumb1Only())
+  if (getOptLevel() != CodeGenOpt::None)
     addPass(createARMLoadStoreOptimizationPass(true));
   if (getOptLevel() != CodeGenOpt::None && getARMSubtarget().isCortexA9())
     addPass(createMLxExpansionPass());
@@ -262,12 +261,10 @@ bool ARMPassConfig::addPreRegAlloc() {
 }
 
 bool ARMPassConfig::addPreSched2() {
-  // FIXME: temporarily disabling load / store optimization pass for Thumb1.
   if (getOptLevel() != CodeGenOpt::None) {
-    if (!getARMSubtarget().isThumb1Only()) {
-      addPass(createARMLoadStoreOptimizationPass());
-      printAndVerify("After ARM load / store optimizer");
-    }
+    addPass(createARMLoadStoreOptimizationPass());
+    printAndVerify("After ARM load / store optimizer");
+
     if (getARMSubtarget().hasNEON())
       addPass(createExecutionDependencyFixPass(&ARM::DPRRegClass));
   }
diff --git a/lib/Target/ARM/ARMTargetMachine.h b/lib/Target/ARM/ARMTargetMachine.h
index 0c80a95..664c992 100644
--- a/lib/Target/ARM/ARMTargetMachine.h
+++ b/lib/Target/ARM/ARMTargetMachine.h
@@ -23,7 +23,6 @@
 #include "Thumb1FrameLowering.h"
 #include "Thumb1InstrInfo.h"
 #include "Thumb2InstrInfo.h"
-#include "llvm/ADT/OwningPtr.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/Target/TargetMachine.h"
@@ -102,7 +101,7 @@ class ARMTargetMachine : public ARMBaseTargetMachine {
 /// ARMLETargetMachine - ARM little endian target machine.
 ///
 class ARMLETargetMachine : public ARMTargetMachine {
-  virtual void anchor();
+  void anchor() override;
 public:
   ARMLETargetMachine(const Target &T, StringRef TT,
                      StringRef CPU, StringRef FS, const TargetOptions &Options,
@@ -113,7 +112,7 @@ public:
 /// ARMBETargetMachine - ARM big endian target machine.
 ///
 class ARMBETargetMachine : public ARMTargetMachine {
-  virtual void anchor();
+  void anchor() override;
 public:
   ARMBETargetMachine(const Target &T, StringRef TT,
                      StringRef CPU, StringRef FS, const TargetOptions &Options,
@@ -128,12 +127,12 @@ public:
 class ThumbTargetMachine : public ARMBaseTargetMachine {
   virtual void anchor();
   // Either Thumb1InstrInfo or Thumb2InstrInfo.
-  OwningPtr<ARMBaseInstrInfo> InstrInfo;
+  std::unique_ptr<ARMBaseInstrInfo> InstrInfo;
   const DataLayout    DL;   // Calculates type size & alignment
   ARMTargetLowering   TLInfo;
   ARMSelectionDAGInfo TSInfo;
   // Either Thumb1FrameLowering or ARMFrameLowering.
-  OwningPtr<ARMFrameLowering> FrameLowering;
+  std::unique_ptr<ARMFrameLowering> FrameLowering;
 public:
   ThumbTargetMachine(const Target &T, StringRef TT,
                      StringRef CPU, StringRef FS,
@@ -169,7 +168,7 @@ public:
 /// ThumbLETargetMachine - Thumb little endian target machine.
 ///
 class ThumbLETargetMachine : public ThumbTargetMachine {
-  virtual void anchor();
+  void anchor() override;
 public:
   ThumbLETargetMachine(const Target &T, StringRef TT,
                      StringRef CPU, StringRef FS, const TargetOptions &Options,
@@ -180,10 +179,10 @@ public:
 /// ThumbBETargetMachine - Thumb big endian target machine.
 ///
 class ThumbBETargetMachine : public ThumbTargetMachine {
-  virtual void anchor();
+  void anchor() override;
 public:
-  ThumbBETargetMachine(const Target &T, StringRef TT,
-                       StringRef CPU, StringRef FS, const TargetOptions &Options,
+  ThumbBETargetMachine(const Target &T, StringRef TT, StringRef CPU,
+                       StringRef FS, const TargetOptions &Options,
                        Reloc::Model RM, CodeModel::Model CM,
                        CodeGenOpt::Level OL);
 };
diff --git a/lib/Target/ARM/ARMTargetObjectFile.cpp b/lib/Target/ARM/ARMTargetObjectFile.cpp
index 3379f85..48238bf 100644
--- a/lib/Target/ARM/ARMTargetObjectFile.cpp
+++ b/lib/Target/ARM/ARMTargetObjectFile.cpp
@@ -11,6 +11,7 @@
 #include "ARMSubtarget.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/IR/Mangler.h"
+#include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCSectionELF.h"
@@ -31,7 +32,7 @@ void ARMElfTargetObjectFile::Initialize(MCContext &Ctx,
   InitializeELF(isAAPCS_ABI);
 
   if (isAAPCS_ABI) {
-    LSDASection = NULL;
+    LSDASection = nullptr;
   }
 
   AttributesSection =
@@ -45,6 +46,10 @@ const MCExpr *ARMElfTargetObjectFile::getTTypeGlobalReference(
     const GlobalValue *GV, unsigned Encoding, Mangler &Mang,
     const TargetMachine &TM, MachineModuleInfo *MMI,
     MCStreamer &Streamer) const {
+  if (TM.getMCAsmInfo()->getExceptionHandlingType() != ExceptionHandling::ARM)
+    return TargetLoweringObjectFileELF::getTTypeGlobalReference(
+        GV, Encoding, Mang, TM, MMI, Streamer);
+
   assert(Encoding == DW_EH_PE_absptr && "Can handle absptr encoding only");
 
   return MCSymbolRefExpr::Create(TM.getSymbol(GV, Mang),
diff --git a/lib/Target/ARM/ARMTargetObjectFile.h b/lib/Target/ARM/ARMTargetObjectFile.h
index 5f8d612..c926421 100644
--- a/lib/Target/ARM/ARMTargetObjectFile.h
+++ b/lib/Target/ARM/ARMTargetObjectFile.h
@@ -23,7 +23,7 @@ protected:
 public:
   ARMElfTargetObjectFile() :
     TargetLoweringObjectFileELF(),
-    AttributesSection(NULL)
+    AttributesSection(nullptr)
   {}
 
   void Initialize(MCContext &Ctx, const TargetMachine &TM) override;
diff --git a/lib/Target/ARM/ARMTargetTransformInfo.cpp b/lib/Target/ARM/ARMTargetTransformInfo.cpp
index d3b43cd..57df7da 100644
--- a/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -14,7 +14,6 @@
 ///
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "armtti"
 #include "ARM.h"
 #include "ARMTargetMachine.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
@@ -23,8 +22,10 @@
 #include "llvm/Target/TargetLowering.h"
 using namespace llvm;
 
+#define DEBUG_TYPE "armtti"
+
 // Declare the pass initialization routine locally as target-specific passes
-// don't havve a target-wide initialization entry point, and so we rely on the
+// don't have a target-wide initialization entry point, and so we rely on the
 // pass constructor initialization.
 namespace llvm {
 void initializeARMTTIPass(PassRegistry &);
@@ -42,7 +43,7 @@ class ARMTTI final : public ImmutablePass, public TargetTransformInfo {
   unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const;
 
 public:
-  ARMTTI() : ImmutablePass(ID), TM(0), ST(0), TLI(0) {
+  ARMTTI() : ImmutablePass(ID), TM(nullptr), ST(nullptr), TLI(nullptr) {
     llvm_unreachable("This pass cannot be directly constructed");
   }
 
diff --git a/lib/Target/ARM/Android.mk b/lib/Target/ARM/Android.mk
index 4be95aa..095955b 100644
--- a/lib/Target/ARM/Android.mk
+++ b/lib/Target/ARM/Android.mk
@@ -17,7 +17,6 @@ arm_codegen_TBLGEN_TABLES := \
 arm_codegen_SRC_FILES := \
   A15SDOptimizer.cpp \
   ARMAsmPrinter.cpp \
-  ARMAtomicExpandPass.cpp \
   ARMBaseInstrInfo.cpp \
   ARMBaseRegisterInfo.cpp \
   ARMCodeEmitter.cpp \
diff --git a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
index 9c57a24..5cdf394 100644
--- a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
+++ b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@@ -13,7 +13,6 @@
 #include "MCTargetDesc/ARMArchName.h"
 #include "MCTargetDesc/ARMBaseInfo.h"
 #include "MCTargetDesc/ARMMCExpr.h"
-#include "llvm/ADT/OwningPtr.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringExtras.h"
@@ -23,9 +22,7 @@
 #include "llvm/MC/MCAssembler.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCDisassembler.h"
-#include "llvm/MC/MCELF.h"
 #include "llvm/MC/MCELFStreamer.h"
-#include "llvm/MC/MCELFSymbolFlags.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCInstrDesc.h"
@@ -345,7 +342,8 @@ public:
   };
 
   ARMAsmParser(MCSubtargetInfo &_STI, MCAsmParser &_Parser,
-               const MCInstrInfo &MII)
+               const MCInstrInfo &MII,
+               const MCTargetOptions &Options)
       : MCTargetAsmParser(), STI(_STI), Parser(_Parser), MII(MII), UC(_Parser) {
     MCAsmParserExtension::Initialize(_Parser);
 
@@ -416,7 +414,7 @@ class ARMOperand : public MCParsedAsmOperand {
     k_Token
   } Kind;
 
-  SMLoc StartLoc, EndLoc;
+  SMLoc StartLoc, EndLoc, AlignmentLoc;
   SmallVector<unsigned, 8> Registers;
 
   struct CCOp {
@@ -633,6 +631,12 @@ public:
   /// operand.
   SMRange getLocRange() const { return SMRange(StartLoc, EndLoc); }
 
+  /// getAlignmentLoc - Get the location of the Alignment token of this operand.
+  SMLoc getAlignmentLoc() const {
+    assert(Kind == k_Memory && "Invalid access!");
+    return AlignmentLoc;
+  }
+
   ARMCC::CondCodes getCondCode() const {
     assert(Kind == k_CondCode && "Invalid access!");
     return CC.Val;
@@ -1089,12 +1093,12 @@ public:
   bool isPostIdxReg() const {
     return Kind == k_PostIndexRegister && PostIdxReg.ShiftTy ==ARM_AM::no_shift;
   }
-  bool isMemNoOffset(bool alignOK = false) const {
+  bool isMemNoOffset(bool alignOK = false, unsigned Alignment = 0) const {
     if (!isMem())
       return false;
     // No offset of any kind.
-    return Memory.OffsetRegNum == 0 && Memory.OffsetImm == 0 &&
-     (alignOK || Memory.Alignment == 0);
+    return Memory.OffsetRegNum == 0 && Memory.OffsetImm == nullptr &&
+     (alignOK || Memory.Alignment == Alignment);
   }
   bool isMemPCRelImm12() const {
     if (!isMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
@@ -1110,6 +1114,65 @@ public:
   bool isAlignedMemory() const {
     return isMemNoOffset(true);
   }
+  bool isAlignedMemoryNone() const {
+    return isMemNoOffset(false, 0);
+  }
+  bool isDupAlignedMemoryNone() const {
+    return isMemNoOffset(false, 0);
+  }
+  bool isAlignedMemory16() const {
+    if (isMemNoOffset(false, 2)) // alignment in bytes for 16-bits is 2.
+      return true;
+    return isMemNoOffset(false, 0);
+  }
+  bool isDupAlignedMemory16() const {
+    if (isMemNoOffset(false, 2)) // alignment in bytes for 16-bits is 2.
+      return true;
+    return isMemNoOffset(false, 0);
+  }
+  bool isAlignedMemory32() const {
+    if (isMemNoOffset(false, 4)) // alignment in bytes for 32-bits is 4.
+      return true;
+    return isMemNoOffset(false, 0);
+  }
+  bool isDupAlignedMemory32() const {
+    if (isMemNoOffset(false, 4)) // alignment in bytes for 32-bits is 4.
+      return true;
+    return isMemNoOffset(false, 0);
+  }
+  bool isAlignedMemory64() const {
+    if (isMemNoOffset(false, 8)) // alignment in bytes for 64-bits is 8.
+      return true;
+    return isMemNoOffset(false, 0);
+  }
+  bool isDupAlignedMemory64() const {
+    if (isMemNoOffset(false, 8)) // alignment in bytes for 64-bits is 8.
+      return true;
+    return isMemNoOffset(false, 0);
+  }
+  bool isAlignedMemory64or128() const {
+    if (isMemNoOffset(false, 8)) // alignment in bytes for 64-bits is 8.
+      return true;
+    if (isMemNoOffset(false, 16)) // alignment in bytes for 128-bits is 16.
+      return true;
+    return isMemNoOffset(false, 0);
+  }
+  bool isDupAlignedMemory64or128() const {
+    if (isMemNoOffset(false, 8)) // alignment in bytes for 64-bits is 8.
+      return true;
+    if (isMemNoOffset(false, 16)) // alignment in bytes for 128-bits is 16.
+      return true;
+    return isMemNoOffset(false, 0);
+  }
+  bool isAlignedMemory64or128or256() const {
+    if (isMemNoOffset(false, 8)) // alignment in bytes for 64-bits is 8.
+      return true;
+    if (isMemNoOffset(false, 16)) // alignment in bytes for 128-bits is 16.
+      return true;
+    if (isMemNoOffset(false, 32)) // alignment in bytes for 256-bits is 32.
+      return true;
+    return isMemNoOffset(false, 0);
+  }
   bool isAddrMode2() const {
     if (!isMem() || Memory.Alignment != 0) return false;
     // Check for register offset.
@@ -1545,7 +1608,10 @@ public:
   }
 
   bool isNEONi16splat() const {
-    if (!isImm()) return false;
+    if (isNEONByteReplicate(2))
+      return false; // Leave that for bytes replication and forbid by default.
+    if (!isImm())
+      return false;
     const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
     // Must be a constant.
     if (!CE) return false;
@@ -1555,7 +1621,10 @@ public:
   }
 
   bool isNEONi32splat() const {
-    if (!isImm()) return false;
+    if (isNEONByteReplicate(4))
+      return false; // Leave that for bytes replication and forbid by default.
+    if (!isImm())
+      return false;
     const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
     // Must be a constant.
     if (!CE) return false;
@@ -1567,11 +1636,36 @@ public:
       (Value >= 0x01000000 && Value <= 0xff000000);
   }
 
+  bool isNEONByteReplicate(unsigned NumBytes) const {
+    if (!isImm())
+      return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    // Must be a constant.
+    if (!CE)
+      return false;
+    int64_t Value = CE->getValue();
+    if (!Value)
+      return false; // Don't bother with zero.
+
+    unsigned char B = Value & 0xff;
+    for (unsigned i = 1; i < NumBytes; ++i) {
+      Value >>= 8;
+      if ((Value & 0xff) != B)
+        return false;
+    }
+    return true;
+  }
+  bool isNEONi16ByteReplicate() const { return isNEONByteReplicate(2); }
+  bool isNEONi32ByteReplicate() const { return isNEONByteReplicate(4); }
   bool isNEONi32vmov() const {
-    if (!isImm()) return false;
+    if (isNEONByteReplicate(4))
+      return false; // Let it to be classified as byte-replicate case.
+    if (!isImm())
+      return false;
     const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
     // Must be a constant.
-    if (!CE) return false;
+    if (!CE)
+      return false;
     int64_t Value = CE->getValue();
     // i32 value with set bits only in one byte X000, 0X00, 00X0, or 000X,
     // for VMOV/VMVN only, 00Xf or 0Xff are also accepted.
@@ -1612,7 +1706,7 @@ public:
 
   void addExpr(MCInst &Inst, const MCExpr *Expr) const {
     // Add as immediates when possible.  Null MCExpr = 0.
-    if (Expr == 0)
+    if (!Expr)
       Inst.addOperand(MCOperand::CreateImm(0));
     else if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr))
       Inst.addOperand(MCOperand::CreateImm(CE->getValue()));
@@ -1926,6 +2020,50 @@ public:
     Inst.addOperand(MCOperand::CreateImm(Memory.Alignment));
   }
 
+  void addDupAlignedMemoryNoneOperands(MCInst &Inst, unsigned N) const {
+    addAlignedMemoryOperands(Inst, N);
+  }
+
+  void addAlignedMemoryNoneOperands(MCInst &Inst, unsigned N) const {
+    addAlignedMemoryOperands(Inst, N);
+  }
+
+  void addAlignedMemory16Operands(MCInst &Inst, unsigned N) const {
+    addAlignedMemoryOperands(Inst, N);
+  }
+
+  void addDupAlignedMemory16Operands(MCInst &Inst, unsigned N) const {
+    addAlignedMemoryOperands(Inst, N);
+  }
+
+  void addAlignedMemory32Operands(MCInst &Inst, unsigned N) const {
+    addAlignedMemoryOperands(Inst, N);
+  }
+
+  void addDupAlignedMemory32Operands(MCInst &Inst, unsigned N) const {
+    addAlignedMemoryOperands(Inst, N);
+  }
+
+  void addAlignedMemory64Operands(MCInst &Inst, unsigned N) const {
+    addAlignedMemoryOperands(Inst, N);
+  }
+
+  void addDupAlignedMemory64Operands(MCInst &Inst, unsigned N) const {
+    addAlignedMemoryOperands(Inst, N);
+  }
+
+  void addAlignedMemory64or128Operands(MCInst &Inst, unsigned N) const {
+    addAlignedMemoryOperands(Inst, N);
+  }
+
+  void addDupAlignedMemory64or128Operands(MCInst &Inst, unsigned N) const {
+    addAlignedMemoryOperands(Inst, N);
+  }
+
+  void addAlignedMemory64or128or256Operands(MCInst &Inst, unsigned N) const {
+    addAlignedMemoryOperands(Inst, N);
+  }
+
   void addAddrMode2Operands(MCInst &Inst, unsigned N) const {
     assert(N == 3 && "Invalid number of operands!");
     int32_t Val = Memory.OffsetImm ? Memory.OffsetImm->getValue() : 0;
@@ -2275,6 +2413,19 @@ public:
     Inst.addOperand(MCOperand::CreateImm(Value));
   }
 
+  void addNEONinvByteReplicateOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    // The immediate encodes the type of constant as well as the value.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    unsigned Value = CE->getValue();
+    assert((Inst.getOpcode() == ARM::VMOVv8i8 ||
+            Inst.getOpcode() == ARM::VMOVv16i8) &&
+           "All vmvn instructions that wants to replicate non-zero byte "
+           "always must be replaced with VMOVv8i8 or VMOVv16i8.");
+    unsigned B = ((~Value) & 0xff);
+    B |= 0xe00; // cmode = 0b1110
+    Inst.addOperand(MCOperand::CreateImm(B));
+  }
   void addNEONi32vmovOperands(MCInst &Inst, unsigned N) const {
     assert(N == 1 && "Invalid number of operands!");
     // The immediate encodes the type of constant as well as the value.
@@ -2289,6 +2440,19 @@ public:
     Inst.addOperand(MCOperand::CreateImm(Value));
   }
 
+  void addNEONvmovByteReplicateOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    // The immediate encodes the type of constant as well as the value.
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    unsigned Value = CE->getValue();
+    assert((Inst.getOpcode() == ARM::VMOVv8i8 ||
+            Inst.getOpcode() == ARM::VMOVv16i8) &&
+           "All instructions that wants to replicate non-zero byte "
+           "always must be replaced with VMOVv8i8 or VMOVv16i8.");
+    unsigned B = Value & 0xff;
+    B |= 0xe00; // cmode = 0b1110
+    Inst.addOperand(MCOperand::CreateImm(B));
+  }
   void addNEONi32vmovNegOperands(MCInst &Inst, unsigned N) const {
     assert(N == 1 && "Invalid number of operands!");
     // The immediate encodes the type of constant as well as the value.
@@ -2523,7 +2687,8 @@ public:
                                unsigned ShiftImm,
                                unsigned Alignment,
                                bool isNegative,
-                               SMLoc S, SMLoc E) {
+                               SMLoc S, SMLoc E,
+                               SMLoc AlignmentLoc = SMLoc()) {
     ARMOperand *Op = new ARMOperand(k_Memory);
     Op->Memory.BaseRegNum = BaseRegNum;
     Op->Memory.OffsetImm = OffsetImm;
@@ -2534,6 +2699,7 @@ public:
     Op->Memory.isNegative = isNegative;
     Op->StartLoc = S;
     Op->EndLoc = E;
+    Op->AlignmentLoc = AlignmentLoc;
     return Op;
   }
 
@@ -2806,7 +2972,7 @@ int ARMAsmParser::tryParseShiftRegister(
   // The source register for the shift has already been added to the
   // operand list, so we need to pop it off and combine it into the shifted
   // register operand instead.
-  OwningPtr<ARMOperand> PrevOp((ARMOperand*)Operands.pop_back_val());
+  std::unique_ptr<ARMOperand> PrevOp((ARMOperand*)Operands.pop_back_val());
   if (!PrevOp->isReg())
     return Error(PrevOp->getStartLoc(), "shift must be of a register");
   int SrcReg = PrevOp->getReg();
@@ -2825,7 +2991,7 @@ int ARMAsmParser::tryParseShiftRegister(
         Parser.getTok().is(AsmToken::Dollar)) {
       Parser.Lex(); // Eat hash.
       SMLoc ImmLoc = Parser.getTok().getLoc();
-      const MCExpr *ShiftExpr = 0;
+      const MCExpr *ShiftExpr = nullptr;
       if (getParser().parseExpression(ShiftExpr, EndLoc)) {
         Error(ImmLoc, "invalid immediate shift value");
         return -1;
@@ -2855,12 +3021,12 @@ int ARMAsmParser::tryParseShiftRegister(
       EndLoc = Parser.getTok().getEndLoc();
       ShiftReg = tryParseRegister();
       if (ShiftReg == -1) {
-        Error (L, "expected immediate or register in shift operand");
+        Error(L, "expected immediate or register in shift operand");
         return -1;
       }
     } else {
-      Error (Parser.getTok().getLoc(),
-                    "expected immediate or register in shift operand");
+      Error(Parser.getTok().getLoc(),
+            "expected immediate or register in shift operand");
       return -1;
     }
   }
@@ -4323,8 +4489,9 @@ parseMemory(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
     E = Tok.getEndLoc();
     Parser.Lex(); // Eat right bracket token.
 
-    Operands.push_back(ARMOperand::CreateMem(BaseRegNum, 0, 0, ARM_AM::no_shift,
-                                             0, 0, false, S, E));
+    Operands.push_back(ARMOperand::CreateMem(BaseRegNum, nullptr, 0,
+                                             ARM_AM::no_shift, 0, 0, false,
+                                             S, E));
 
     // If there's a pre-indexing writeback marker, '!', just add it as a token
     // operand. It's rather odd, but syntactically valid.
@@ -4346,6 +4513,7 @@ parseMemory(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
   if (Parser.getTok().is(AsmToken::Colon)) {
     Parser.Lex(); // Eat the ':'.
     E = Parser.getTok().getLoc();
+    SMLoc AlignmentLoc = Tok.getLoc();
 
     const MCExpr *Expr;
     if (getParser().parseExpression(Expr))
@@ -4378,9 +4546,9 @@ parseMemory(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
 
     // Don't worry about range checking the value here. That's handled by
     // the is*() predicates.
-    Operands.push_back(ARMOperand::CreateMem(BaseRegNum, 0, 0,
+    Operands.push_back(ARMOperand::CreateMem(BaseRegNum, nullptr, 0,
                                              ARM_AM::no_shift, 0, Align,
-                                             false, S, E));
+                                             false, S, E, AlignmentLoc));
 
     // If there's a pre-indexing writeback marker, '!', just add it as a token
     // operand.
@@ -4471,7 +4639,7 @@ parseMemory(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
   E = Parser.getTok().getEndLoc();
   Parser.Lex(); // Eat right bracket token.
 
-  Operands.push_back(ARMOperand::CreateMem(BaseRegNum, 0, OffsetRegNum,
+  Operands.push_back(ARMOperand::CreateMem(BaseRegNum, nullptr, OffsetRegNum,
                                            ShiftType, ShiftImm, 0, isNegative,
                                            S, E));
 
@@ -4926,8 +5094,9 @@ getMnemonicAcceptInfo(StringRef Mnemonic, StringRef FullInst,
 
   if (Mnemonic == "bkpt" || Mnemonic == "cbnz" || Mnemonic == "setend" ||
       Mnemonic == "cps" ||  Mnemonic == "it" ||  Mnemonic == "cbz" ||
-      Mnemonic == "trap" || Mnemonic == "hlt" || Mnemonic.startswith("crc32") ||
-      Mnemonic.startswith("cps") || Mnemonic.startswith("vsel") ||
+      Mnemonic == "trap" || Mnemonic == "hlt" || Mnemonic == "udf" ||
+      Mnemonic.startswith("crc32") || Mnemonic.startswith("cps") ||
+      Mnemonic.startswith("vsel") ||
       Mnemonic == "vmaxnm" || Mnemonic == "vminnm" || Mnemonic == "vcvta" ||
       Mnemonic == "vcvtn" || Mnemonic == "vcvtp" || Mnemonic == "vcvtm" ||
       Mnemonic == "vrinta" || Mnemonic == "vrintn" || Mnemonic == "vrintp" ||
@@ -5404,21 +5573,24 @@ bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
   }
 
   // GNU Assembler extension (compatibility)
-  if ((Mnemonic == "ldrd" || Mnemonic == "strd") && !isThumb() &&
-      Operands.size() == 4) {
-    ARMOperand *Op = static_cast<ARMOperand *>(Operands[2]);
-    assert(Op->isReg() && "expected register argument");
+  if ((Mnemonic == "ldrd" || Mnemonic == "strd")) {
+    ARMOperand *Op2 = static_cast<ARMOperand *>(Operands[2]);
+    ARMOperand *Op3 = static_cast<ARMOperand *>(Operands[3]);
+    if (Op3->isMem()) {
+      assert(Op2->isReg() && "expected register argument");
 
-    unsigned SuperReg = MRI->getMatchingSuperReg(
-        Op->getReg(), ARM::gsub_0, &MRI->getRegClass(ARM::GPRPairRegClassID));
+      unsigned SuperReg = MRI->getMatchingSuperReg(
+          Op2->getReg(), ARM::gsub_0, &MRI->getRegClass(ARM::GPRPairRegClassID));
 
-    assert(SuperReg && "expected register pair");
+      assert(SuperReg && "expected register pair");
 
-    unsigned PairedReg = MRI->getSubReg(SuperReg, ARM::gsub_1);
+      unsigned PairedReg = MRI->getSubReg(SuperReg, ARM::gsub_1);
 
-    Operands.insert(Operands.begin() + 3,
-                    ARMOperand::CreateReg(PairedReg, Op->getStartLoc(),
-                                          Op->getEndLoc()));
+      Operands.insert(Operands.begin() + 3,
+                      ARMOperand::CreateReg(PairedReg,
+                                            Op2->getStartLoc(),
+                                            Op2->getEndLoc()));
+    }
   }
 
   // FIXME: As said above, this is all a pretty gross hack.  This instruction
@@ -5748,6 +5920,30 @@ validateInstruction(MCInst &Inst,
       return Error(Operands[Op]->getStartLoc(), "branch target out of range");
     break;
   }
+  case ARM::MOVi16:
+  case ARM::t2MOVi16:
+  case ARM::t2MOVTi16:
+    {
+    // We want to avoid misleadingly allowing something like "mov r0, <symbol>"
+    // especially when we turn it into a movw and the expression <symbol> does
+    // not have a :lower16: or :upper16 as part of the expression.  We don't
+    // want the behavior of silently truncating, which can be unexpected and
+    // lead to bugs that are difficult to find since this is an easy mistake
+    // to make.
+    int i = (Operands[3]->isImm()) ? 3 : 4;
+    ARMOperand *Op = static_cast<ARMOperand*>(Operands[i]);
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Op->getImm());
+    if (CE) break;
+    const MCExpr *E = dyn_cast<MCExpr>(Op->getImm());
+    if (!E) break;
+    const ARMMCExpr *ARM16Expr = dyn_cast<ARMMCExpr>(E);
+    if (!ARM16Expr || (ARM16Expr->getKind() != ARMMCExpr::VK_ARM_HI16 &&
+                       ARM16Expr->getKind() != ARMMCExpr::VK_ARM_LO16)) {
+      return Error(Op->getStartLoc(),
+	     "immediate expression for mov requires :lower16: or :upper16");
+      break;
+    }
+    }
   }
 
   return false;
@@ -5898,7 +6094,7 @@ static unsigned getRealVLDOpcode(unsigned Opc, unsigned &Spacing) {
   case ARM::VLD3DUPdWB_fixed_Asm_16: Spacing = 1; return ARM::VLD3DUPd16_UPD;
   case ARM::VLD3DUPdWB_fixed_Asm_32: Spacing = 1; return ARM::VLD3DUPd32_UPD;
   case ARM::VLD3DUPqWB_fixed_Asm_8: Spacing = 1; return ARM::VLD3DUPq8_UPD;
-  case ARM::VLD3DUPqWB_fixed_Asm_16: Spacing = 1; return ARM::VLD3DUPq16_UPD;
+  case ARM::VLD3DUPqWB_fixed_Asm_16: Spacing = 2; return ARM::VLD3DUPq16_UPD;
   case ARM::VLD3DUPqWB_fixed_Asm_32: Spacing = 2; return ARM::VLD3DUPq32_UPD;
   case ARM::VLD3DUPdWB_register_Asm_8:  Spacing = 1; return ARM::VLD3DUPd8_UPD;
   case ARM::VLD3DUPdWB_register_Asm_16: Spacing = 1; return ARM::VLD3DUPd16_UPD;
@@ -7860,9 +8056,11 @@ unsigned ARMAsmParser::checkTargetMatchPredicate(MCInst &Inst) {
   return Match_Success;
 }
 
-template<> inline bool IsCPSRDead<MCInst>(MCInst* Instr) {
+namespace llvm {
+template <> inline bool IsCPSRDead<MCInst>(MCInst *Instr) {
   return true; // In an assembly source, no need to second-guess
 }
+}
 
 static const char *getSubtargetFeatureName(unsigned Val);
 bool ARMAsmParser::
@@ -7965,6 +8163,42 @@ MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
     if (ErrorLoc == SMLoc()) ErrorLoc = IDLoc;
     return Error(ErrorLoc, "immediate operand must be in the range [0,239]");
   }
+  case Match_AlignedMemoryRequiresNone:
+  case Match_DupAlignedMemoryRequiresNone:
+  case Match_AlignedMemoryRequires16:
+  case Match_DupAlignedMemoryRequires16:
+  case Match_AlignedMemoryRequires32:
+  case Match_DupAlignedMemoryRequires32:
+  case Match_AlignedMemoryRequires64:
+  case Match_DupAlignedMemoryRequires64:
+  case Match_AlignedMemoryRequires64or128:
+  case Match_DupAlignedMemoryRequires64or128:
+  case Match_AlignedMemoryRequires64or128or256:
+  {
+    SMLoc ErrorLoc = ((ARMOperand*)Operands[ErrorInfo])->getAlignmentLoc();
+    if (ErrorLoc == SMLoc()) ErrorLoc = IDLoc;
+    switch (MatchResult) {
+      default:
+        llvm_unreachable("Missing Match_Aligned type");
+      case Match_AlignedMemoryRequiresNone:
+      case Match_DupAlignedMemoryRequiresNone:
+        return Error(ErrorLoc, "alignment must be omitted");
+      case Match_AlignedMemoryRequires16:
+      case Match_DupAlignedMemoryRequires16:
+        return Error(ErrorLoc, "alignment must be 16 or omitted");
+      case Match_AlignedMemoryRequires32:
+      case Match_DupAlignedMemoryRequires32:
+        return Error(ErrorLoc, "alignment must be 32 or omitted");
+      case Match_AlignedMemoryRequires64:
+      case Match_DupAlignedMemoryRequires64:
+        return Error(ErrorLoc, "alignment must be 64 or omitted");
+      case Match_AlignedMemoryRequires64or128:
+      case Match_DupAlignedMemoryRequires64or128:
+        return Error(ErrorLoc, "alignment must be 64, 128 or omitted");
+      case Match_AlignedMemoryRequires64or128or256:
+        return Error(ErrorLoc, "alignment must be 64, 128, 256 or omitted");
+    }
+  }
   }
 
   llvm_unreachable("Implement any new match types added!");
@@ -7972,6 +8206,10 @@ MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
 
 /// parseDirective parses the arm specific directives
 bool ARMAsmParser::ParseDirective(AsmToken DirectiveID) {
+  const MCObjectFileInfo::Environment Format =
+    getContext().getObjectFileInfo()->getObjectFileType();
+  bool IsMachO = Format == MCObjectFileInfo::IsMachO;
+
   StringRef IDVal = DirectiveID.getIdentifier();
   if (IDVal == ".word")
     return parseLiteralValues(4, DirectiveID.getLoc());
@@ -7989,16 +8227,6 @@ bool ARMAsmParser::ParseDirective(AsmToken DirectiveID) {
     return parseDirectiveSyntax(DirectiveID.getLoc());
   else if (IDVal == ".unreq")
     return parseDirectiveUnreq(DirectiveID.getLoc());
-  else if (IDVal == ".arch")
-    return parseDirectiveArch(DirectiveID.getLoc());
-  else if (IDVal == ".eabi_attribute")
-    return parseDirectiveEabiAttr(DirectiveID.getLoc());
-  else if (IDVal == ".cpu")
-    return parseDirectiveCPU(DirectiveID.getLoc());
-  else if (IDVal == ".fpu")
-    return parseDirectiveFPU(DirectiveID.getLoc());
-  else if (IDVal == ".fnstart")
-    return parseDirectiveFnStart(DirectiveID.getLoc());
   else if (IDVal == ".fnend")
     return parseDirectiveFnEnd(DirectiveID.getLoc());
   else if (IDVal == ".cantunwind")
@@ -8015,12 +8243,6 @@ bool ARMAsmParser::ParseDirective(AsmToken DirectiveID) {
     return parseDirectiveRegSave(DirectiveID.getLoc(), false);
   else if (IDVal == ".vsave")
     return parseDirectiveRegSave(DirectiveID.getLoc(), true);
-  else if (IDVal == ".inst")
-    return parseDirectiveInst(DirectiveID.getLoc());
-  else if (IDVal == ".inst.n")
-    return parseDirectiveInst(DirectiveID.getLoc(), 'n');
-  else if (IDVal == ".inst.w")
-    return parseDirectiveInst(DirectiveID.getLoc(), 'w');
   else if (IDVal == ".ltorg" || IDVal == ".pool")
     return parseDirectiveLtorg(DirectiveID.getLoc());
   else if (IDVal == ".even")
@@ -8029,18 +8251,38 @@ bool ARMAsmParser::ParseDirective(AsmToken DirectiveID) {
     return parseDirectivePersonalityIndex(DirectiveID.getLoc());
   else if (IDVal == ".unwind_raw")
     return parseDirectiveUnwindRaw(DirectiveID.getLoc());
-  else if (IDVal == ".tlsdescseq")
-    return parseDirectiveTLSDescSeq(DirectiveID.getLoc());
   else if (IDVal == ".movsp")
     return parseDirectiveMovSP(DirectiveID.getLoc());
-  else if (IDVal == ".object_arch")
-    return parseDirectiveObjectArch(DirectiveID.getLoc());
   else if (IDVal == ".arch_extension")
     return parseDirectiveArchExtension(DirectiveID.getLoc());
   else if (IDVal == ".align")
     return parseDirectiveAlign(DirectiveID.getLoc());
   else if (IDVal == ".thumb_set")
     return parseDirectiveThumbSet(DirectiveID.getLoc());
+
+  if (!IsMachO) {
+    if (IDVal == ".arch")
+      return parseDirectiveArch(DirectiveID.getLoc());
+    else if (IDVal == ".cpu")
+      return parseDirectiveCPU(DirectiveID.getLoc());
+    else if (IDVal == ".eabi_attribute")
+      return parseDirectiveEabiAttr(DirectiveID.getLoc());
+    else if (IDVal == ".fpu")
+      return parseDirectiveFPU(DirectiveID.getLoc());
+    else if (IDVal == ".fnstart")
+      return parseDirectiveFnStart(DirectiveID.getLoc());
+    else if (IDVal == ".inst")
+      return parseDirectiveInst(DirectiveID.getLoc());
+    else if (IDVal == ".inst.n")
+      return parseDirectiveInst(DirectiveID.getLoc(), 'n');
+    else if (IDVal == ".inst.w")
+      return parseDirectiveInst(DirectiveID.getLoc(), 'w');
+    else if (IDVal == ".object_arch")
+      return parseDirectiveObjectArch(DirectiveID.getLoc());
+    else if (IDVal == ".tlsdescseq")
+      return parseDirectiveTLSDescSeq(DirectiveID.getLoc());
+  }
+
   return true;
 }
 
@@ -8121,32 +8363,6 @@ void ARMAsmParser::onLabelParsed(MCSymbol *Symbol) {
   if (NextSymbolIsThumb) {
     getParser().getStreamer().EmitThumbFunc(Symbol);
     NextSymbolIsThumb = false;
-    return;
-  }
-
-  if (!isThumb())
-    return;
-
-  const MCObjectFileInfo::Environment Format =
-    getContext().getObjectFileInfo()->getObjectFileType();
-  switch (Format) {
-  case MCObjectFileInfo::IsCOFF: {
-    const MCSymbolData &SD =
-      getParser().getStreamer().getOrCreateSymbolData(Symbol);
-    char Type = COFF::IMAGE_SYM_DTYPE_FUNCTION << COFF::SCT_COMPLEX_TYPE_SHIFT;
-    if (SD.getFlags() & (Type << COFF::SF_TypeShift))
-      getParser().getStreamer().EmitThumbFunc(Symbol);
-    break;
-  }
-  case MCObjectFileInfo::IsELF: {
-    const MCSymbolData &SD =
-      getParser().getStreamer().getOrCreateSymbolData(Symbol);
-    if (MCELF::GetType(SD) & (ELF::STT_FUNC << ELF_STT_Shift))
-      getParser().getStreamer().EmitThumbFunc(Symbol);
-    break;
-  }
-  case MCObjectFileInfo::IsMachO:
-    break;
   }
 }
 
@@ -8303,14 +8519,6 @@ bool ARMAsmParser::parseDirectiveUnreq(SMLoc L) {
 /// parseDirectiveArch
 ///  ::= .arch token
 bool ARMAsmParser::parseDirectiveArch(SMLoc L) {
-  const MCAsmInfo *MAI = getParser().getStreamer().getContext().getAsmInfo();
-  bool isMachO = MAI->hasSubsectionsViaSymbols();
-  if (isMachO) {
-    Error(L, ".arch directive not valid for Mach-O");
-    Parser.eatToEndOfStatement();
-    return false;
-  }
-
   StringRef Arch = getParser().parseStringToEndOfStatement().trim();
 
   unsigned ID = StringSwitch<unsigned>(Arch)
@@ -8334,14 +8542,6 @@ bool ARMAsmParser::parseDirectiveArch(SMLoc L) {
 ///  ::= .eabi_attribute int, int [, "str"]
 ///  ::= .eabi_attribute Tag_name, int [, "str"]
 bool ARMAsmParser::parseDirectiveEabiAttr(SMLoc L) {
-  const MCAsmInfo *MAI = getParser().getStreamer().getContext().getAsmInfo();
-  bool isMachO = MAI->hasSubsectionsViaSymbols();
-  if (isMachO) {
-    Error(L, ".eabi_attribute directive not valid for Mach-O");
-    Parser.eatToEndOfStatement();
-    return false;
-  }
-
   int64_t Tag;
   SMLoc TagLoc;
   TagLoc = Parser.getTok().getLoc();
@@ -8447,14 +8647,6 @@ bool ARMAsmParser::parseDirectiveEabiAttr(SMLoc L) {
 /// parseDirectiveCPU
 ///  ::= .cpu str
 bool ARMAsmParser::parseDirectiveCPU(SMLoc L) {
-  const MCAsmInfo *MAI = getParser().getStreamer().getContext().getAsmInfo();
-  bool isMachO = MAI->hasSubsectionsViaSymbols();
-  if (isMachO) {
-    Error(L, ".cpu directive not valid for Mach-O");
-    Parser.eatToEndOfStatement();
-    return false;
-  }
-
   StringRef CPU = getParser().parseStringToEndOfStatement().trim();
   getTargetStreamer().emitTextAttribute(ARMBuildAttrs::CPU_name, CPU);
   return false;
@@ -8463,14 +8655,6 @@ bool ARMAsmParser::parseDirectiveCPU(SMLoc L) {
 /// parseDirectiveFPU
 ///  ::= .fpu str
 bool ARMAsmParser::parseDirectiveFPU(SMLoc L) {
-  const MCAsmInfo *MAI = getParser().getStreamer().getContext().getAsmInfo();
-  bool isMachO = MAI->hasSubsectionsViaSymbols();
-  if (isMachO) {
-    Error(L, ".fpu directive not valid for Mach-O");
-    Parser.eatToEndOfStatement();
-    return false;
-  }
-
   StringRef FPU = getParser().parseStringToEndOfStatement().trim();
 
   unsigned ID = StringSwitch<unsigned>(FPU)
@@ -8490,14 +8674,6 @@ bool ARMAsmParser::parseDirectiveFPU(SMLoc L) {
 /// parseDirectiveFnStart
 ///  ::= .fnstart
 bool ARMAsmParser::parseDirectiveFnStart(SMLoc L) {
-  const MCAsmInfo *MAI = getParser().getStreamer().getContext().getAsmInfo();
-  bool isMachO = MAI->hasSubsectionsViaSymbols();
-  if (isMachO) {
-    Error(L, ".fnstart directive not valid for Mach-O");
-    Parser.eatToEndOfStatement();
-    return false;
-  }
-
   if (UC.hasFnStart()) {
     Error(L, ".fnstart starts before the end of previous one");
     UC.emitFnStartLocNotes();
@@ -8777,14 +8953,6 @@ bool ARMAsmParser::parseDirectiveRegSave(SMLoc L, bool IsVector) {
 ///  ::= .inst.n opcode [, ...]
 ///  ::= .inst.w opcode [, ...]
 bool ARMAsmParser::parseDirectiveInst(SMLoc Loc, char Suffix) {
-  const MCAsmInfo *MAI = getParser().getStreamer().getContext().getAsmInfo();
-  bool isMachO = MAI->hasSubsectionsViaSymbols();
-  if (isMachO) {
-    Error(Loc, ".inst directive not valid for Mach-O");
-    Parser.eatToEndOfStatement();
-    return false;
-  }
-
   int Width;
 
   if (isThumb()) {
@@ -9033,14 +9201,6 @@ bool ARMAsmParser::parseDirectiveUnwindRaw(SMLoc L) {
 /// parseDirectiveTLSDescSeq
 ///   ::= .tlsdescseq tls-variable
 bool ARMAsmParser::parseDirectiveTLSDescSeq(SMLoc L) {
-  const MCAsmInfo *MAI = getParser().getStreamer().getContext().getAsmInfo();
-  bool isMachO = MAI->hasSubsectionsViaSymbols();
-  if (isMachO) {
-    Error(L, ".tlsdescseq directive not valid for Mach-O");
-    Parser.eatToEndOfStatement();
-    return false;
-  }
-
   if (getLexer().isNot(AsmToken::Identifier)) {
     TokError("expected variable after '.tlsdescseq' directive");
     Parser.eatToEndOfStatement();
@@ -9128,14 +9288,6 @@ bool ARMAsmParser::parseDirectiveMovSP(SMLoc L) {
 /// parseDirectiveObjectArch
 ///   ::= .object_arch name
 bool ARMAsmParser::parseDirectiveObjectArch(SMLoc L) {
-  const MCAsmInfo *MAI = getParser().getStreamer().getContext().getAsmInfo();
-  bool isMachO = MAI->hasSubsectionsViaSymbols();
-  if (isMachO) {
-    Error(L, ".object_arch directive not valid for Mach-O");
-    Parser.eatToEndOfStatement();
-    return false;
-  }
-
   if (getLexer().isNot(AsmToken::Identifier)) {
     Error(getLexer().getLoc(), "unexpected token");
     Parser.eatToEndOfStatement();
@@ -9221,36 +9373,7 @@ bool ARMAsmParser::parseDirectiveThumbSet(SMLoc L) {
   Lex();
 
   MCSymbol *Alias = getContext().GetOrCreateSymbol(Name);
-  if (const MCSymbolRefExpr *SRE = dyn_cast<MCSymbolRefExpr>(Value)) {
-    MCSymbol *Sym = getContext().LookupSymbol(SRE->getSymbol().getName());
-    if (!Sym->isDefined()) {
-      getStreamer().EmitSymbolAttribute(Sym, MCSA_Global);
-      getStreamer().EmitAssignment(Alias, Value);
-      return false;
-    }
-
-    const MCObjectFileInfo::Environment Format =
-      getContext().getObjectFileInfo()->getObjectFileType();
-    switch (Format) {
-    case MCObjectFileInfo::IsCOFF: {
-      char Type = COFF::IMAGE_SYM_DTYPE_FUNCTION << COFF::SCT_COMPLEX_TYPE_SHIFT;
-      getStreamer().EmitCOFFSymbolType(Type);
-      // .set values are always local in COFF
-      getStreamer().EmitSymbolAttribute(Alias, MCSA_Local);
-      break;
-    }
-    case MCObjectFileInfo::IsELF:
-      getStreamer().EmitSymbolAttribute(Alias, MCSA_ELF_TypeFunction);
-      break;
-    case MCObjectFileInfo::IsMachO:
-      break;
-    }
-  }
-
-  // FIXME: set the function as being a thumb function via the assembler
-  getStreamer().EmitThumbFunc(Alias);
-  getStreamer().EmitAssignment(Alias, Value);
-
+  getTargetStreamer().emitThumbSet(Alias, Value);
   return false;
 }
 
@@ -9365,8 +9488,8 @@ unsigned ARMAsmParser::validateTargetOperandClass(MCParsedAsmOperand *AsmOp,
       int64_t Value;
       if (!SOExpr->EvaluateAsAbsolute(Value))
         return Match_Success;
-      assert((Value >= INT32_MIN && Value <= INT32_MAX) &&
-             "expression value must be representiable in 32 bits");
+      assert((Value >= INT32_MIN && Value <= UINT32_MAX) &&
+             "expression value must be representable in 32 bits");
     }
     break;
   case MCK_GPRPair:
diff --git a/lib/Target/ARM/CMakeLists.txt b/lib/Target/ARM/CMakeLists.txt
index 8e14883..9b5fa75 100644
--- a/lib/Target/ARM/CMakeLists.txt
+++ b/lib/Target/ARM/CMakeLists.txt
@@ -17,7 +17,6 @@ add_public_tablegen_target(ARMCommonTableGen)
 add_llvm_target(ARMCodeGen
   A15SDOptimizer.cpp
   ARMAsmPrinter.cpp
-  ARMAtomicExpandPass.cpp
   ARMBaseInstrInfo.cpp
   ARMBaseRegisterInfo.cpp
   ARMCodeEmitter.cpp
diff --git a/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
index 9e40381..4d4038d 100644
--- a/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
+++ b/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
@@ -7,8 +7,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "arm-disassembler"
-
 #include "llvm/MC/MCDisassembler.h"
 #include "MCTargetDesc/ARMAddressingModes.h"
 #include "MCTargetDesc/ARMBaseInfo.h"
@@ -29,6 +27,8 @@
 
 using namespace llvm;
 
+#define DEBUG_TYPE "arm-disassembler"
+
 typedef MCDisassembler::DecodeStatus DecodeStatus;
 
 namespace {
@@ -90,8 +90,8 @@ class ARMDisassembler : public MCDisassembler {
 public:
   /// Constructor     - Initializes the disassembler.
   ///
-  ARMDisassembler(const MCSubtargetInfo &STI) :
-    MCDisassembler(STI) {
+  ARMDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx) :
+    MCDisassembler(STI, Ctx) {
   }
 
   ~ARMDisassembler() {
@@ -109,8 +109,8 @@ class ThumbDisassembler : public MCDisassembler {
 public:
   /// Constructor     - Initializes the disassembler.
   ///
-  ThumbDisassembler(const MCSubtargetInfo &STI) :
-    MCDisassembler(STI) {
+  ThumbDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx) :
+    MCDisassembler(STI, Ctx) {
   }
 
   ~ThumbDisassembler() {
@@ -400,12 +400,16 @@ static DecodeStatus DecodeMRRC2(llvm::MCInst &Inst, unsigned Val,
                                 uint64_t Address, const void *Decoder);
 #include "ARMGenDisassemblerTables.inc"
 
-static MCDisassembler *createARMDisassembler(const Target &T, const MCSubtargetInfo &STI) {
-  return new ARMDisassembler(STI);
+static MCDisassembler *createARMDisassembler(const Target &T,
+                                             const MCSubtargetInfo &STI,
+                                             MCContext &Ctx) {
+  return new ARMDisassembler(STI, Ctx);
 }
 
-static MCDisassembler *createThumbDisassembler(const Target &T, const MCSubtargetInfo &STI) {
-  return new ThumbDisassembler(STI);
+static MCDisassembler *createThumbDisassembler(const Target &T,
+                                               const MCSubtargetInfo &STI,
+                                               MCContext &Ctx) {
+  return new ThumbDisassembler(STI, Ctx);
 }
 
 DecodeStatus ARMDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
diff --git a/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp b/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp
index da3fe01..e4b785d 100644
--- a/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp
+++ b/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp
@@ -11,7 +11,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "asm-printer"
 #include "ARMInstPrinter.h"
 #include "MCTargetDesc/ARMAddressingModes.h"
 #include "MCTargetDesc/ARMBaseInfo.h"
@@ -23,6 +22,8 @@
 #include "llvm/Support/raw_ostream.h"
 using namespace llvm;
 
+#define DEBUG_TYPE "asm-printer"
+
 #include "ARMGenAsmWriter.inc"
 
 /// translateShiftImm - Convert shift immediate from 0-31 to 1-32 for printing.
diff --git a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
index 1db517f..7acd9cc 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
@@ -306,8 +306,36 @@ bool ARMAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
   return true;
 }
 
+static uint32_t swapHalfWords(uint32_t Value, bool IsLittleEndian) {
+  if (IsLittleEndian) {
+    // Note that the halfwords are stored high first and low second in thumb;
+    // so we need to swap the fixup value here to map properly.
+    uint32_t Swapped = (Value & 0xFFFF0000) >> 16;
+    Swapped |= (Value & 0x0000FFFF) << 16;
+    return Swapped;
+  }
+  else
+    return Value;
+}
+
+static uint32_t joinHalfWords(uint32_t FirstHalf, uint32_t SecondHalf,
+                              bool IsLittleEndian) {
+  uint32_t Value;
+
+  if (IsLittleEndian) {
+    Value = (SecondHalf & 0xFFFF) << 16;
+    Value |= (FirstHalf & 0xFFFF);
+  } else {
+    Value = (SecondHalf & 0xFFFF);
+    Value |= (FirstHalf & 0xFFFF) << 16;
+  }
+
+  return Value;
+}
+
 static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
-                                 bool IsPCRel, MCContext *Ctx) {
+                                 bool IsPCRel, MCContext *Ctx,
+                                 bool IsLittleEndian) {
   unsigned Kind = Fixup.getKind();
   switch (Kind) {
   default:
@@ -316,6 +344,10 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
   case FK_Data_2:
   case FK_Data_4:
     return Value;
+  case FK_SecRel_2:
+    return Value;
+  case FK_SecRel_4:
+    return Value;
   case ARM::fixup_arm_movt_hi16:
     if (!IsPCRel)
       Value >>= 16;
@@ -342,9 +374,7 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
     // inst{14-12} = Mid3;
     // inst{7-0} = Lo8;
     Value = (Hi4 << 16) | (i << 26) | (Mid3 << 12) | (Lo8);
-    uint64_t swapped = (Value & 0xFFFF0000) >> 16;
-    swapped |= (Value & 0x0000FFFF) << 16;
-    return swapped;
+    return swapHalfWords(Value, IsLittleEndian);
   }
   case ARM::fixup_arm_ldst_pcrel_12:
     // ARM PC-relative values are offset by 8.
@@ -364,11 +394,8 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
 
     // Same addressing mode as fixup_arm_pcrel_10,
     // but with 16-bit halfwords swapped.
-    if (Kind == ARM::fixup_t2_ldst_pcrel_12) {
-      uint64_t swapped = (Value & 0xFFFF0000) >> 16;
-      swapped |= (Value & 0x0000FFFF) << 16;
-      return swapped;
-    }
+    if (Kind == ARM::fixup_t2_ldst_pcrel_12)
+      return swapHalfWords(Value, IsLittleEndian);
 
     return Value;
   }
@@ -401,9 +428,7 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
     out |= (Value & 0x700) << 4;
     out |= (Value & 0x0FF);
 
-    uint64_t swapped = (out & 0xFFFF0000) >> 16;
-    swapped |= (out & 0x0000FFFF) << 16;
-    return swapped;
+    return swapHalfWords(out, IsLittleEndian);
   }
 
   case ARM::fixup_arm_condbranch:
@@ -434,9 +459,7 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
     out |= (Value & 0x1FF800)  << 5; // imm6 field
     out |= (Value & 0x0007FF);        // imm11 field
 
-    uint64_t swapped = (out & 0xFFFF0000) >> 16;
-    swapped |= (out & 0x0000FFFF) << 16;
-    return swapped;
+    return swapHalfWords(out, IsLittleEndian);
   }
   case ARM::fixup_t2_condbranch: {
     Value = Value - 4;
@@ -449,9 +472,7 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
     out |= (Value & 0x1F800) << 5; // imm6 field
     out |= (Value & 0x007FF);      // imm11 field
 
-    uint32_t swapped = (out & 0xFFFF0000) >> 16;
-    swapped |= (out & 0x0000FFFF) << 16;
-    return swapped;
+    return swapHalfWords(out, IsLittleEndian);
   }
   case ARM::fixup_arm_thumb_bl: {
     // The value doesn't encode the low bit (always zero) and is offset by
@@ -475,13 +496,10 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
     uint32_t imm10Bits = (offset & 0x1FF800) >> 11;
     uint32_t imm11Bits = (offset & 0x000007FF);
 
-    uint32_t Binary = 0;
-    uint32_t firstHalf = (((uint16_t)signBit << 10) | (uint16_t)imm10Bits);
-    uint32_t secondHalf = (((uint16_t)J1Bit << 13) | ((uint16_t)J2Bit << 11) |
+    uint32_t FirstHalf = (((uint16_t)signBit << 10) | (uint16_t)imm10Bits);
+    uint32_t SecondHalf = (((uint16_t)J1Bit << 13) | ((uint16_t)J2Bit << 11) |
                           (uint16_t)imm11Bits);
-    Binary |= secondHalf << 16;
-    Binary |= firstHalf;
-    return Binary;
+    return joinHalfWords(FirstHalf, SecondHalf, IsLittleEndian);
   }
   case ARM::fixup_arm_thumb_blx: {
     // The value doesn't encode the low two bits (always zero) and is offset by
@@ -508,13 +526,10 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
     uint32_t imm10HBits = (offset & 0xFFC00) >> 10;
     uint32_t imm10LBits = (offset & 0x3FF);
 
-    uint32_t Binary = 0;
-    uint32_t firstHalf = (((uint16_t)signBit << 10) | (uint16_t)imm10HBits);
-    uint32_t secondHalf = (((uint16_t)J1Bit << 13) | ((uint16_t)J2Bit << 11) |
+    uint32_t FirstHalf = (((uint16_t)signBit << 10) | (uint16_t)imm10HBits);
+    uint32_t SecondHalf = (((uint16_t)J1Bit << 13) | ((uint16_t)J2Bit << 11) |
                           ((uint16_t)imm10LBits) << 1);
-    Binary |= secondHalf << 16;
-    Binary |= firstHalf;
-    return Binary;
+    return joinHalfWords(FirstHalf, SecondHalf, IsLittleEndian);
   }
   case ARM::fixup_arm_thumb_cp:
     // Offset by 4, and don't encode the low two bits. Two bytes of that
@@ -566,11 +581,8 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
 
     // Same addressing mode as fixup_arm_pcrel_10, but with 16-bit halfwords
     // swapped.
-    if (Kind == ARM::fixup_t2_pcrel_10) {
-      uint32_t swapped = (Value & 0xFFFF0000) >> 16;
-      swapped |= (Value & 0x0000FFFF) << 16;
-      return swapped;
-    }
+    if (Kind == ARM::fixup_t2_pcrel_10)
+      return swapHalfWords(Value, IsLittleEndian);
 
     return Value;
   }
@@ -603,7 +615,7 @@ void ARMAsmBackend::processFixupValue(const MCAssembler &Asm,
   // the offset when the destination has the same MCFragment.
   if (A && (unsigned)Fixup.getKind() == ARM::fixup_arm_thumb_bl) {
     const MCSymbol &Sym = A->getSymbol().AliasedSymbol();
-    MCSymbolData &SymData = Asm.getSymbolData(Sym);
+    const MCSymbolData &SymData = Asm.getSymbolData(Sym);
     IsResolved = (SymData.getFragment() == DF);
   }
   // We must always generate a relocation for BL/BLX instructions if we have
@@ -618,7 +630,8 @@ void ARMAsmBackend::processFixupValue(const MCAssembler &Asm,
   // Try to get the encoded value for the fixup as-if we're mapping it into
   // the instruction. This allows adjustFixupValue() to issue a diagnostic
   // if the value aren't invalid.
-  (void)adjustFixupValue(Fixup, Value, false, &Asm.getContext());
+  (void)adjustFixupValue(Fixup, Value, false, &Asm.getContext(),
+                         IsLittleEndian);
 }
 
 /// getFixupKindNumBytes - The number of bytes the fixup may change.
@@ -662,6 +675,11 @@ static unsigned getFixupKindNumBytes(unsigned Kind) {
   case ARM::fixup_t2_movt_hi16:
   case ARM::fixup_t2_movw_lo16:
     return 4;
+
+  case FK_SecRel_2:
+    return 2;
+  case FK_SecRel_4:
+    return 4;
   }
 }
 
@@ -716,7 +734,7 @@ void ARMAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
                                unsigned DataSize, uint64_t Value,
                                bool IsPCRel) const {
   unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind());
-  Value = adjustFixupValue(Fixup, Value, IsPCRel, nullptr);
+  Value = adjustFixupValue(Fixup, Value, IsPCRel, nullptr, IsLittleEndian);
   if (!Value) return;           // Doesn't change encoding.
 
   unsigned Offset = Fixup.getOffset();
@@ -724,8 +742,11 @@ void ARMAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
 
   // Used to point to big endian bytes.
   unsigned FullSizeBytes;
-  if (!IsLittleEndian)
+  if (!IsLittleEndian) {
     FullSizeBytes = getFixupKindContainerSizeBytes(Fixup.getKind());
+    assert((Offset + FullSizeBytes) <= DataSize && "Invalid fixup size!");
+    assert(NumBytes <= FullSizeBytes && "Invalid fixup size!");
+  }
 
   // For each byte of the fragment that the fixup touches, mask in the bits from
   // the fixup value. The Value has been "split up" into the appropriate
@@ -737,6 +758,15 @@ void ARMAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
 }
 
 namespace {
+// FIXME: This should be in a separate file.
+class ARMWinCOFFAsmBackend : public ARMAsmBackend {
+public:
+  ARMWinCOFFAsmBackend(const Target &T, const StringRef &Triple)
+    : ARMAsmBackend(T, Triple, true) { }
+  MCObjectWriter *createObjectWriter(raw_ostream &OS) const override {
+    return createARMWinCOFFObjectWriter(OS, /*Is64Bit=*/false);
+  }
+};
 
 // FIXME: This should be in a separate file.
 // ELF is an ELF of course...
@@ -777,7 +807,9 @@ MCAsmBackend *llvm::createARMAsmBackend(const Target &T,
                                         bool isLittle) {
   Triple TheTriple(TT);
 
-  if (TheTriple.isOSBinFormatMachO()) {
+  switch (TheTriple.getObjectFormat()) {
+  default: llvm_unreachable("unsupported object format");
+  case Triple::MachO: {
     MachO::CPUSubTypeARM CS =
       StringSwitch<MachO::CPUSubTypeARM>(TheTriple.getArchName())
       .Cases("armv4t", "thumbv4t", MachO::CPU_SUBTYPE_ARM_V4T)
@@ -792,15 +824,14 @@ MCAsmBackend *llvm::createARMAsmBackend(const Target &T,
 
     return new DarwinARMAsmBackend(T, TT, CS);
   }
-
-#if 0
-  // FIXME: Introduce yet another checker but assert(0).
-  if (TheTriple.isOSBinFormatCOFF())
-    assert(0 && "Windows not supported on ARM");
-#endif
-
-  uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(Triple(TT).getOS());
-  return new ELFARMAsmBackend(T, TT, OSABI, isLittle);
+  case Triple::COFF:
+    assert(TheTriple.isOSWindows() && "non-Windows ARM COFF is not supported");
+    return new ARMWinCOFFAsmBackend(T, TT);
+  case Triple::ELF:
+    assert(TheTriple.isOSBinFormatELF() && "using ELF for non-ELF target");
+    uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(Triple(TT).getOS());
+    return new ELFARMAsmBackend(T, TT, OSABI, isLittle);
+  }
 }
 
 MCAsmBackend *llvm::createARMLEAsmBackend(const Target &T,
diff --git a/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp b/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
index a4661b1..1c84263 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
@@ -74,7 +74,7 @@ unsigned ARMELFObjectWriter::GetRelocType(const MCValue &Target,
 unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target,
                                                const MCFixup &Fixup,
                                                bool IsPCRel) const  {
-  MCSymbolRefExpr::VariantKind Modifier = Fixup.getAccessVariant();
+  MCSymbolRefExpr::VariantKind Modifier = Target.getAccessVariant();
 
   unsigned Type = 0;
   if (IsPCRel) {
@@ -91,6 +91,9 @@ unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target,
       case MCSymbolRefExpr::VK_GOTTPOFF:
         Type = ELF::R_ARM_TLS_IE32;
         break;
+      case MCSymbolRefExpr::VK_GOTPCREL:
+        Type = ELF::R_ARM_GOT_PREL;
+        break;
       }
       break;
     case ARM::fixup_arm_blx:
@@ -167,6 +170,9 @@ unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target,
       case MCSymbolRefExpr::VK_GOTOFF:
         Type = ELF::R_ARM_GOTOFF32;
         break;
+      case MCSymbolRefExpr::VK_GOTPCREL:
+        Type = ELF::R_ARM_GOT_PREL;
+        break;
       case MCSymbolRefExpr::VK_ARM_TARGET1:
         Type = ELF::R_ARM_TARGET1;
         break;
diff --git a/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp b/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
index 5a01d26..a4d13ed 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
@@ -30,6 +30,7 @@
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCInstPrinter.h"
+#include "llvm/MC/MCObjectFileInfo.h"
 #include "llvm/MC/MCObjectStreamer.h"
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCSection.h"
@@ -62,7 +63,7 @@ static const char *GetFPUName(unsigned ID) {
 #define ARM_FPU_NAME(NAME, ID) case ARM::ID: return NAME;
 #include "ARMFPUName.def"
   }
-  return NULL;
+  return nullptr;
 }
 
 static const char *GetArchName(unsigned ID) {
@@ -75,7 +76,7 @@ static const char *GetArchName(unsigned ID) {
 #define ARM_ARCH_ALIAS(NAME, ID) /* empty */
 #include "ARMArchName.def"
   }
-  return NULL;
+  return nullptr;
 }
 
 static const char *GetArchDefaultCPUName(unsigned ID) {
@@ -88,7 +89,7 @@ static const char *GetArchDefaultCPUName(unsigned ID) {
 #define ARM_ARCH_ALIAS(NAME, ID) /* empty */
 #include "ARMArchName.def"
   }
-  return NULL;
+  return nullptr;
 }
 
 static unsigned GetArchDefaultCPUArch(unsigned ID) {
@@ -139,6 +140,7 @@ class ARMTargetAsmStreamer : public ARMTargetStreamer {
   void finishAttributeSection() override;
 
   void AnnotateTLSDescriptorSequence(const MCSymbolRefExpr *SRE) override;
+  void emitThumbSet(MCSymbol *Symbol, const MCExpr *Value) override;
 
 public:
   ARMTargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS,
@@ -260,6 +262,10 @@ ARMTargetAsmStreamer::AnnotateTLSDescriptorSequence(const MCSymbolRefExpr *S) {
   OS << "\t.tlsdescseq\t" << S->getSymbol().getName();
 }
 
+void ARMTargetAsmStreamer::emitThumbSet(MCSymbol *Symbol, const MCExpr *Value) {
+  OS << "\t.thumb_set\t" << *Symbol << ", " << *Value << '\n';
+}
+
 void ARMTargetAsmStreamer::emitInst(uint32_t Inst, char Suffix) {
   OS << "\t.inst";
   if (Suffix)
@@ -310,7 +316,7 @@ private:
     for (size_t i = 0; i < Contents.size(); ++i)
       if (Contents[i].Tag == Attribute)
         return &Contents[i];
-    return 0;
+    return nullptr;
   }
 
   void setAttributeItem(unsigned Attribute, unsigned Value,
@@ -406,8 +412,10 @@ private:
   void emitFPU(unsigned FPU) override;
   void emitInst(uint32_t Inst, char Suffix = '\0') override;
   void finishAttributeSection() override;
+  void emitLabel(MCSymbol *Symbol) override;
 
   void AnnotateTLSDescriptorSequence(const MCSymbolRefExpr *SRE) override;
+  void emitThumbSet(MCSymbol *Symbol, const MCExpr *Value) override;
 
   size_t calculateContentSize() const;
 
@@ -415,7 +423,7 @@ public:
   ARMTargetELFStreamer(MCStreamer &S)
     : ARMTargetStreamer(S), CurrentVendor("aeabi"), FPU(ARM::INVALID_FPU),
       Arch(ARM::INVALID_ARCH), EmittedArch(ARM::INVALID_ARCH),
-      AttributeSection(0) {}
+      AttributeSection(nullptr) {}
 };
 
 /// Extend the generic ELFStreamer class so that it can emit mapping symbols at
@@ -531,7 +539,8 @@ public:
   /// This is one of the functions used to emit data into an ELF section, so the
   /// ARM streamer overrides it to add the appropriate mapping symbol ($d) if
   /// necessary.
-  void EmitValueImpl(const MCExpr *Value, unsigned Size) override {
+  void EmitValueImpl(const MCExpr *Value, unsigned Size,
+                     const SMLoc &Loc) override {
     EmitDataMappingSymbol();
     MCELFStreamer::EmitValueImpl(Value, Size);
   }
@@ -600,12 +609,8 @@ private:
   }
 
   void EmitThumbFunc(MCSymbol *Func) override {
-    // FIXME: Anything needed here to flag the function as thumb?
-
     getAssembler().setIsThumbFunc(Func);
-
-    MCSymbolData &SD = getAssembler().getOrCreateSymbolData(*Func);
-    SD.setFlags(SD.getFlags() | ELF_Other_ThumbFunc);
+    EmitSymbolAttribute(Func, MCSA_ELF_TypeFunction);
   }
 
   // Helper functions for ARM exception handling directives
@@ -980,10 +985,35 @@ void ARMTargetELFStreamer::finishAttributeSection() {
   Contents.clear();
   FPU = ARM::INVALID_FPU;
 }
+
+void ARMTargetELFStreamer::emitLabel(MCSymbol *Symbol) {
+  ARMELFStreamer &Streamer = getStreamer();
+  if (!Streamer.IsThumb)
+    return;
+
+  const MCSymbolData &SD = Streamer.getOrCreateSymbolData(Symbol);
+  if (MCELF::GetType(SD) & (ELF::STT_FUNC << ELF_STT_Shift))
+    Streamer.EmitThumbFunc(Symbol);
+}
+
 void
 ARMTargetELFStreamer::AnnotateTLSDescriptorSequence(const MCSymbolRefExpr *S) {
   getStreamer().EmitFixup(S, FK_Data_4);
 }
+
+void ARMTargetELFStreamer::emitThumbSet(MCSymbol *Symbol, const MCExpr *Value) {
+  if (const MCSymbolRefExpr *SRE = dyn_cast<MCSymbolRefExpr>(Value)) {
+    const MCSymbol &Sym = SRE->getSymbol();
+    if (!Sym.isDefined()) {
+      getStreamer().EmitAssignment(Symbol, Value);
+      return;
+    }
+  }
+
+  getStreamer().EmitThumbFunc(Symbol);
+  getStreamer().EmitAssignment(Symbol, Value);
+}
+
 void ARMTargetELFStreamer::emitInst(uint32_t Inst, char Suffix) {
   getStreamer().emitInst(Inst, Suffix);
 }
@@ -1012,7 +1042,7 @@ inline void ARMELFStreamer::SwitchToEHSection(const char *Prefix,
   }
 
   // Get .ARM.extab or .ARM.exidx section
-  const MCSectionELF *EHSection = NULL;
+  const MCSectionELF *EHSection = nullptr;
   if (const MCSymbol *Group = FnSection.getGroup()) {
     EHSection = getContext().getELFSection(
       EHSecName, Type, Flags | ELF::SHF_GROUP, Kind,
@@ -1049,9 +1079,9 @@ void ARMELFStreamer::EmitFixup(const MCExpr *Expr, MCFixupKind Kind) {
 }
 
 void ARMELFStreamer::Reset() {
-  ExTab = NULL;
-  FnStart = NULL;
-  Personality = NULL;
+  ExTab = nullptr;
+  FnStart = nullptr;
+  Personality = nullptr;
   PersonalityIndex = ARM::EHABI::NUM_PERSONALITY_INDEX;
   FPReg = ARM::SP;
   FPOffset = 0;
@@ -1065,7 +1095,7 @@ void ARMELFStreamer::Reset() {
 }
 
 void ARMELFStreamer::emitFnStart() {
-  assert(FnStart == 0);
+  assert(FnStart == nullptr);
   FnStart = getContext().CreateTempSymbol();
   EmitLabel(FnStart);
 }
@@ -1104,11 +1134,14 @@ void ARMELFStreamer::emitFnEnd() {
     // the second word of exception index table entry.  The size of the unwind
     // opcodes should always be 4 bytes.
     assert(PersonalityIndex == ARM::EHABI::AEABI_UNWIND_CPP_PR0 &&
-           "Compact model must use __aeabi_cpp_unwind_pr0 as personality");
+           "Compact model must use __aeabi_unwind_cpp_pr0 as personality");
     assert(Opcodes.size() == 4u &&
-           "Unwind opcode size for __aeabi_cpp_unwind_pr0 must be equal to 4");
-    EmitBytes(StringRef(reinterpret_cast<const char*>(Opcodes.data()),
-                        Opcodes.size()));
+           "Unwind opcode size for __aeabi_unwind_cpp_pr0 must be equal to 4");
+    uint64_t Intval = Opcodes[0] |
+                      Opcodes[1] << 8 |
+                      Opcodes[2] << 16 |
+                      Opcodes[3] << 24;
+    EmitIntValue(Intval, Opcodes.size());
   }
 
   // Switch to the section containing FnStart
@@ -1180,8 +1213,15 @@ void ARMELFStreamer::FlushUnwindOpcodes(bool NoHandlerData) {
   }
 
   // Emit unwind opcodes
-  EmitBytes(StringRef(reinterpret_cast<const char *>(Opcodes.data()),
-                      Opcodes.size()));
+  assert((Opcodes.size() % 4) == 0 &&
+         "Unwind opcode size for __aeabi_cpp_unwind_pr0 must be multiple of 4");
+  for (unsigned I = 0; I != Opcodes.size(); I += 4) {
+    uint64_t Intval = Opcodes[I] |
+                      Opcodes[I + 1] << 8 |
+                      Opcodes[I + 2] << 16 |
+                      Opcodes[I + 3] << 24;
+    EmitIntValue(Intval, 4);
+  }
 
   // According to ARM EHABI section 9.2, if the __aeabi_unwind_cpp_pr1() or
   // __aeabi_unwind_cpp_pr2() is used, then the handler data must be emitted
@@ -1283,13 +1323,11 @@ void ARMELFStreamer::emitUnwindRaw(int64_t Offset,
 namespace llvm {
 
 MCStreamer *createMCAsmStreamer(MCContext &Ctx, formatted_raw_ostream &OS,
-                                bool isVerboseAsm, bool useCFI,
-                                bool useDwarfDirectory,
+                                bool isVerboseAsm, bool useDwarfDirectory,
                                 MCInstPrinter *InstPrint, MCCodeEmitter *CE,
                                 MCAsmBackend *TAB, bool ShowInst) {
-  MCStreamer *S =
-      llvm::createAsmStreamer(Ctx, OS, isVerboseAsm, useCFI, useDwarfDirectory,
-                              InstPrint, CE, TAB, ShowInst);
+  MCStreamer *S = llvm::createAsmStreamer(
+      Ctx, OS, isVerboseAsm, useDwarfDirectory, InstPrint, CE, TAB, ShowInst);
   new ARMTargetAsmStreamer(*S, OS, *InstPrint, isVerboseAsm);
   return S;
 }
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp
index b7f96e0..7a19208 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp
@@ -25,7 +25,7 @@ ARMMCAsmInfoDarwin::ARMMCAsmInfoDarwin(StringRef TT) {
       (TheTriple.getArch() == Triple::thumbeb))
     IsLittleEndian = false;
 
-  Data64bitsDirective = 0;
+  Data64bitsDirective = nullptr;
   CommentString = "@";
   Code16Directive = ".code\t16";
   Code32Directive = ".code\t32";
@@ -50,7 +50,7 @@ ARMELFMCAsmInfo::ARMELFMCAsmInfo(StringRef TT) {
   // ".comm align is in bytes but .align is pow-2."
   AlignmentIsInBytes = false;
 
-  Data64bitsDirective = 0;
+  Data64bitsDirective = nullptr;
   CommentString = "@";
   Code16Directive = ".code\t16";
   Code32Directive = ".code\t32";
@@ -59,7 +59,14 @@ ARMELFMCAsmInfo::ARMELFMCAsmInfo(StringRef TT) {
   SupportsDebugInformation = true;
 
   // Exceptions handling
-  ExceptionsType = ExceptionHandling::ARM;
+  switch (TheTriple.getOS()) {
+  case Triple::NetBSD:
+    ExceptionsType = ExceptionHandling::DwarfCFI;
+    break;
+  default:
+    ExceptionsType = ExceptionHandling::ARM;
+    break;
+  }
 
   // foo(plt) instead of foo@plt
   UseParensForSymbolVariant = true;
@@ -89,6 +96,7 @@ void ARMCOFFMCAsmInfoGNU::anchor() { }
 
 ARMCOFFMCAsmInfoGNU::ARMCOFFMCAsmInfoGNU() {
   AlignmentIsInBytes = false;
+  HasSingleParameterDotFile = true;
 
   CommentString = "@";
   Code16Directive = ".code\t16";
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h b/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h
index beaf6a4..51cfa0a 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h
@@ -35,13 +35,13 @@ namespace llvm {
   };
 
   class ARMCOFFMCAsmInfoMicrosoft : public MCAsmInfoMicrosoft {
-    void anchor();
+    void anchor() override;
   public:
     explicit ARMCOFFMCAsmInfoMicrosoft();
   };
 
   class ARMCOFFMCAsmInfoGNU : public MCAsmInfoGNUCOFF {
-    void anchor();
+    void anchor() override;
   public:
     explicit ARMCOFFMCAsmInfoGNU();
   };
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
index 5564e0a..5b51a52 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
@@ -11,7 +11,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "mccodeemitter"
 #include "MCTargetDesc/ARMMCTargetDesc.h"
 #include "MCTargetDesc/ARMAddressingModes.h"
 #include "MCTargetDesc/ARMBaseInfo.h"
@@ -31,6 +30,8 @@
 
 using namespace llvm;
 
+#define DEBUG_TYPE "mccodeemitter"
+
 STATISTIC(MCNumEmitted, "Number of MC instructions emitted.");
 STATISTIC(MCNumCPRelocations, "Number of constant pool relocations created.");
 
@@ -1036,16 +1037,17 @@ ARMMCCodeEmitter::getHiLo16ImmOpValue(const MCInst &MI, unsigned OpIdx,
                                        : ARM::fixup_arm_movw_lo16);
       break;
     }
+
     Fixups.push_back(MCFixup::Create(0, E, Kind, MI.getLoc()));
     return 0;
   }
   // If the expression doesn't have :upper16: or :lower16: on it,
-  // it's just a plain immediate expression, and those evaluate to
+  // it's just a plain immediate expression, previously those evaluated to
   // the lower 16 bits of the expression regardless of whether
-  // we have a movt or a movw.
-  Kind = MCFixupKind(isThumb2(STI) ? ARM::fixup_t2_movw_lo16
-                                   : ARM::fixup_arm_movw_lo16);
-  Fixups.push_back(MCFixup::Create(0, E, Kind, MI.getLoc()));
+  // we have a movt or a movw, but that led to misleadingly results.
+  // This is now disallowed in the the AsmParser in validateInstruction()
+  // so this should never happen.
+  assert(0 && "expression without :upper16: or :lower16:");
   return 0;
 }
 
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCExpr.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCExpr.cpp
index fc8505b..87ea875 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCExpr.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCExpr.cpp
@@ -7,12 +7,13 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "armmcexpr"
 #include "ARMMCExpr.h"
 #include "llvm/MC/MCAssembler.h"
 #include "llvm/MC/MCContext.h"
 using namespace llvm;
 
+#define DEBUG_TYPE "armmcexpr"
+
 const ARMMCExpr*
 ARMMCExpr::Create(VariantKind Kind, const MCExpr *Expr,
                        MCContext &Ctx) {
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
index 949a3d5..04d63a7 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
@@ -21,6 +21,7 @@
 #include "llvm/MC/MCInstrAnalysis.h"
 #include "llvm/MC/MCInstrInfo.h"
 #include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/TargetRegistry.h"
@@ -106,9 +107,11 @@ std::string ARM_MC::ParseARMTriple(StringRef TT, StringRef CPU) {
     unsigned SubVer = TT[Idx];
     if (SubVer == '8') {
       if (NoCPU)
-        // v8a: FeatureDB, FeatureFPARMv8, FeatureNEON, FeatureDSPThumb2, FeatureMP,
-        //      FeatureHWDiv, FeatureHWDivARM, FeatureTrustZone, FeatureT2XtPk, FeatureCrypto, FeatureCRC
-        ARMArchFeature = "+v8,+db,+fp-armv8,+neon,+t2dsp,+mp,+hwdiv,+hwdiv-arm,+trustzone,+t2xtpk,+crypto,+crc";
+        // v8a: FeatureDB, FeatureFPARMv8, FeatureNEON, FeatureDSPThumb2,
+        //      FeatureMP, FeatureHWDiv, FeatureHWDivARM, FeatureTrustZone,
+        //      FeatureT2XtPk, FeatureCrypto, FeatureCRC
+        ARMArchFeature = "+v8,+db,+fp-armv8,+neon,+t2dsp,+mp,+hwdiv,+hwdiv-arm,"
+                         "+trustzone,+t2xtpk,+crypto,+crc";
       else
         // Use CPU to figure out the exact features
         ARMArchFeature = "+v8";
@@ -245,7 +248,7 @@ static MCAsmInfo *createARMMCAsmInfo(const MCRegisterInfo &MRI, StringRef TT) {
   }
 
   unsigned Reg = MRI.getDwarfRegNum(ARM::SP, true);
-  MAI->addInitialFrameState(MCCFIInstruction::createDefCfa(0, Reg, 0));
+  MAI->addInitialFrameState(MCCFIInstruction::createDefCfa(nullptr, Reg, 0));
 
   return MAI;
 }
@@ -273,18 +276,20 @@ static MCStreamer *createMCStreamer(const Target &T, StringRef TT,
                                     bool NoExecStack) {
   Triple TheTriple(TT);
 
-  if (TheTriple.isOSBinFormatMachO()) {
+  switch (TheTriple.getObjectFormat()) {
+  default: llvm_unreachable("unsupported object format");
+  case Triple::MachO: {
     MCStreamer *S = createMachOStreamer(Ctx, MAB, OS, Emitter, false);
     new ARMTargetStreamer(*S);
     return S;
   }
-
-  if (TheTriple.isOSWindows()) {
-    llvm_unreachable("ARM does not support Windows COFF format");
+  case Triple::COFF:
+    assert(TheTriple.isOSWindows() && "non-Windows ARM COFF is not supported");
+    return createARMWinCOFFStreamer(Ctx, MAB, *Emitter, OS);
+  case Triple::ELF:
+    return createARMELFStreamer(Ctx, MAB, OS, Emitter, false, NoExecStack,
+                                TheTriple.getArch() == Triple::thumb);
   }
-
-  return createARMELFStreamer(Ctx, MAB, OS, Emitter, false, NoExecStack,
-                              TheTriple.getArch() == Triple::thumb);
 }
 
 static MCInstPrinter *createARMMCInstPrinter(const Target &T,
@@ -295,7 +300,7 @@ static MCInstPrinter *createARMMCInstPrinter(const Target &T,
                                              const MCSubtargetInfo &STI) {
   if (SyntaxVariant == 0)
     return new ARMInstPrinter(MAI, MII, MRI, STI);
-  return 0;
+  return nullptr;
 }
 
 static MCRelocationInfo *createARMMCRelocationInfo(StringRef TT,
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h
index e81876f..8853a8c 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h
@@ -47,8 +47,7 @@ namespace ARM_MC {
 }
 
 MCStreamer *createMCAsmStreamer(MCContext &Ctx, formatted_raw_ostream &OS,
-                                bool isVerboseAsm, bool useCFI,
-                                bool useDwarfDirectory,
+                                bool isVerboseAsm, bool useDwarfDirectory,
                                 MCInstPrinter *InstPrint, MCCodeEmitter *CE,
                                 MCAsmBackend *TAB, bool ShowInst);
 
@@ -78,6 +77,11 @@ MCAsmBackend *createThumbLEAsmBackend(const Target &T, const MCRegisterInfo &MRI
 MCAsmBackend *createThumbBEAsmBackend(const Target &T, const MCRegisterInfo &MRI,
                                       StringRef TT, StringRef CPU);
 
+/// createARMWinCOFFStreamer - Construct a PE/COFF machine code streamer which
+/// will generate a PE/COFF object file.
+MCStreamer *createARMWinCOFFStreamer(MCContext &Context, MCAsmBackend &MAB,
+                                     MCCodeEmitter &Emitter, raw_ostream &OS);
+
 /// createARMELFObjectWriter - Construct an ELF Mach-O object writer.
 MCObjectWriter *createARMELFObjectWriter(raw_ostream &OS,
                                          uint8_t OSABI,
@@ -89,6 +93,8 @@ MCObjectWriter *createARMMachObjectWriter(raw_ostream &OS,
                                           uint32_t CPUType,
                                           uint32_t CPUSubtype);
 
+/// createARMWinCOFFObjectWriter - Construct an ARM PE/COFF object writer.
+MCObjectWriter *createARMWinCOFFObjectWriter(raw_ostream &OS, bool Is64Bit);
 
 /// createARMMachORelocationInfo - Construct ARM Mach-O relocation info.
 MCRelocationInfo *createARMMachORelocationInfo(MCContext &Ctx);
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp b/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp
index 3bf5cf1..ecfa4e5 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMMachObjectWriter.cpp
@@ -156,7 +156,7 @@ RecordARMScatteredHalfRelocation(MachObjectWriter *Writer,
 
   // See <reloc.h>.
   const MCSymbol *A = &Target.getSymA()->getSymbol();
-  MCSymbolData *A_SD = &Asm.getSymbolData(*A);
+  const MCSymbolData *A_SD = &Asm.getSymbolData(*A);
 
   if (!A_SD->getFragment())
     Asm.getContext().FatalError(Fixup.getLoc(),
@@ -170,7 +170,7 @@ RecordARMScatteredHalfRelocation(MachObjectWriter *Writer,
   FixedValue += SecAddr;
 
   if (const MCSymbolRefExpr *B = Target.getSymB()) {
-    MCSymbolData *B_SD = &Asm.getSymbolData(B->getSymbol());
+    const MCSymbolData *B_SD = &Asm.getSymbolData(B->getSymbol());
 
     if (!B_SD->getFragment())
       Asm.getContext().FatalError(Fixup.getLoc(),
@@ -206,11 +206,11 @@ RecordARMScatteredHalfRelocation(MachObjectWriter *Writer,
     // The thumb bit shouldn't be set in the 'other-half' bit of the
     // relocation, but it will be set in FixedValue if the base symbol
     // is a thumb function. Clear it out here.
-    if (A_SD->getFlags() & SF_ThumbFunc)
+    if (Asm.isThumbFunc(A))
       FixedValue &= 0xfffffffe;
     break;
   case ARM::fixup_t2_movt_hi16:
-    if (A_SD->getFlags() & SF_ThumbFunc)
+    if (Asm.isThumbFunc(A))
       FixedValue &= 0xfffffffe;
     MovtBit = 1;
     // Fallthrough
@@ -259,7 +259,7 @@ void ARMMachObjectWriter::RecordARMScatteredRelocation(MachObjectWriter *Writer,
 
   // See <reloc.h>.
   const MCSymbol *A = &Target.getSymA()->getSymbol();
-  MCSymbolData *A_SD = &Asm.getSymbolData(*A);
+  const MCSymbolData *A_SD = &Asm.getSymbolData(*A);
 
   if (!A_SD->getFragment())
     Asm.getContext().FatalError(Fixup.getLoc(),
@@ -272,7 +272,7 @@ void ARMMachObjectWriter::RecordARMScatteredRelocation(MachObjectWriter *Writer,
   uint32_t Value2 = 0;
 
   if (const MCSymbolRefExpr *B = Target.getSymB()) {
-    MCSymbolData *B_SD = &Asm.getSymbolData(B->getSymbol());
+    const MCSymbolData *B_SD = &Asm.getSymbolData(B->getSymbol());
 
     if (!B_SD->getFragment())
       Asm.getContext().FatalError(Fixup.getLoc(),
@@ -378,7 +378,7 @@ void ARMMachObjectWriter::RecordRelocation(MachObjectWriter *Writer,
   }
 
   // Get the symbol data, if any.
-  MCSymbolData *SD = 0;
+  const MCSymbolData *SD = nullptr;
   if (Target.getSymA())
     SD = &Asm.getSymbolData(Target.getSymA()->getSymbol());
 
diff --git a/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp b/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp
index fdc0ed7..e3cfb05 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp
@@ -109,7 +109,7 @@ ConstantPool *
 AssemblerConstantPools::getConstantPool(const MCSection *Section) {
   ConstantPoolMapTy::iterator CP = ConstantPools.find(Section);
   if (CP == ConstantPools.end())
-    return 0;
+    return nullptr;
 
   return &CP->second;
 }
@@ -246,3 +246,7 @@ void ARMTargetStreamer::AnnotateTLSDescriptorSequence(
     const MCSymbolRefExpr *SRE) {
   llvm_unreachable("unimplemented");
 }
+
+void ARMTargetStreamer::emitThumbSet(MCSymbol *Symbol, const MCExpr *Value) {
+  llvm_unreachable("unimplemented");
+}
diff --git a/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp b/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp
new file mode 100644
index 0000000..d31f1f4
--- /dev/null
+++ b/lib/Target/ARM/MCTargetDesc/ARMWinCOFFObjectWriter.cpp
@@ -0,0 +1,82 @@
+//===-- ARMWinCOFFObjectWriter.cpp - ARM Windows COFF Object Writer -- C++ -==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/ARMFixupKinds.h"
+#include "llvm/MC/MCFixup.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/MC/MCWinCOFFObjectWriter.h"
+#include "llvm/Support/COFF.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+namespace {
+class ARMWinCOFFObjectWriter : public MCWinCOFFObjectTargetWriter {
+public:
+  ARMWinCOFFObjectWriter(bool Is64Bit)
+    : MCWinCOFFObjectTargetWriter(COFF::IMAGE_FILE_MACHINE_ARMNT) {
+    assert(!Is64Bit && "AArch64 support not yet implemented");
+  }
+  virtual ~ARMWinCOFFObjectWriter() { }
+
+  unsigned getRelocType(const MCValue &Target, const MCFixup &Fixup,
+                        bool IsCrossSection) const override;
+
+  bool recordRelocation(const MCFixup &) const override;
+};
+
+unsigned ARMWinCOFFObjectWriter::getRelocType(const MCValue &Target,
+                                              const MCFixup &Fixup,
+                                              bool IsCrossSection) const {
+  assert(getMachine() == COFF::IMAGE_FILE_MACHINE_ARMNT &&
+         "AArch64 support not yet implemented");
+
+  MCSymbolRefExpr::VariantKind Modifier =
+    Target.isAbsolute() ? MCSymbolRefExpr::VK_None : Target.getSymA()->getKind();
+
+  switch (static_cast<unsigned>(Fixup.getKind())) {
+  default: llvm_unreachable("unsupported relocation type");
+  case FK_Data_4:
+    switch (Modifier) {
+    case MCSymbolRefExpr::VK_COFF_IMGREL32:
+      return COFF::IMAGE_REL_ARM_ADDR32NB;
+    case MCSymbolRefExpr::VK_SECREL:
+      return COFF::IMAGE_REL_ARM_SECREL;
+    default:
+      return COFF::IMAGE_REL_ARM_ADDR32;
+    }
+  case FK_SecRel_2:
+    return COFF::IMAGE_REL_ARM_SECTION;
+  case FK_SecRel_4:
+    return COFF::IMAGE_REL_ARM_SECREL;
+  case ARM::fixup_t2_condbranch:
+    return COFF::IMAGE_REL_ARM_BRANCH20T;
+  case ARM::fixup_t2_uncondbranch:
+    return COFF::IMAGE_REL_ARM_BRANCH24T;
+  case ARM::fixup_arm_thumb_bl:
+  case ARM::fixup_arm_thumb_blx:
+    return COFF::IMAGE_REL_ARM_BLX23T;
+  case ARM::fixup_t2_movw_lo16:
+  case ARM::fixup_t2_movt_hi16:
+    return COFF::IMAGE_REL_ARM_MOV32T;
+  }
+}
+
+bool ARMWinCOFFObjectWriter::recordRelocation(const MCFixup &Fixup) const {
+  return static_cast<unsigned>(Fixup.getKind()) != ARM::fixup_t2_movt_hi16;
+}
+}
+
+namespace llvm {
+MCObjectWriter *createARMWinCOFFObjectWriter(raw_ostream &OS, bool Is64Bit) {
+  MCWinCOFFObjectTargetWriter *MOTW = new ARMWinCOFFObjectWriter(Is64Bit);
+  return createWinCOFFObjectWriter(MOTW, OS);
+}
+}
+
diff --git a/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp b/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp
new file mode 100644
index 0000000..b344ced
--- /dev/null
+++ b/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp
@@ -0,0 +1,46 @@
+//===-- ARMWinCOFFStreamer.cpp - ARM Target WinCOFF Streamer ----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARMMCTargetDesc.h"
+#include "llvm/MC/MCWinCOFFStreamer.h"
+
+using namespace llvm;
+
+namespace {
+class ARMWinCOFFStreamer : public MCWinCOFFStreamer {
+public:
+  ARMWinCOFFStreamer(MCContext &C, MCAsmBackend &AB, MCCodeEmitter &CE,
+                     raw_ostream &OS)
+    : MCWinCOFFStreamer(C, AB, CE, OS) { }
+
+  void EmitAssemblerFlag(MCAssemblerFlag Flag) override;
+  void EmitThumbFunc(MCSymbol *Symbol) override;
+};
+
+void ARMWinCOFFStreamer::EmitAssemblerFlag(MCAssemblerFlag Flag) {
+  switch (Flag) {
+  default: llvm_unreachable("not implemented");
+  case MCAF_SyntaxUnified:
+  case MCAF_Code16:
+    break;
+  }
+}
+
+void ARMWinCOFFStreamer::EmitThumbFunc(MCSymbol *Symbol) {
+  getAssembler().setIsThumbFunc(Symbol);
+}
+}
+
+namespace llvm {
+MCStreamer *createARMWinCOFFStreamer(MCContext &Context, MCAsmBackend &MAB,
+                                     MCCodeEmitter &Emitter, raw_ostream &OS) {
+  return new ARMWinCOFFStreamer(Context, MAB, Emitter, OS);
+}
+}
+
diff --git a/lib/Target/ARM/MCTargetDesc/Android.mk b/lib/Target/ARM/MCTargetDesc/Android.mk
index 074d29e..a5827f7 100644
--- a/lib/Target/ARM/MCTargetDesc/Android.mk
+++ b/lib/Target/ARM/MCTargetDesc/Android.mk
@@ -17,7 +17,9 @@ arm_mc_desc_SRC_FILES := \
   ARMMachObjectWriter.cpp \
   ARMMachORelocationInfo.cpp \
   ARMTargetStreamer.cpp \
-  ARMUnwindOpAsm.cpp
+  ARMUnwindOpAsm.cpp \
+  ARMWinCOFFObjectWriter.cpp \
+  ARMWinCOFFStreamer.cpp \
 
 # For the host
 # =====================================================
diff --git a/lib/Target/ARM/MCTargetDesc/CMakeLists.txt b/lib/Target/ARM/MCTargetDesc/CMakeLists.txt
index 06812d4..9582e8c 100644
--- a/lib/Target/ARM/MCTargetDesc/CMakeLists.txt
+++ b/lib/Target/ARM/MCTargetDesc/CMakeLists.txt
@@ -1,14 +1,16 @@
 add_llvm_library(LLVMARMDesc
   ARMAsmBackend.cpp
   ARMELFObjectWriter.cpp
+  ARMELFObjectWriter.cpp
   ARMELFStreamer.cpp
+  ARMMachObjectWriter.cpp
+  ARMMachORelocationInfo.cpp
   ARMMCAsmInfo.cpp
   ARMMCCodeEmitter.cpp
   ARMMCExpr.cpp
   ARMMCTargetDesc.cpp
-  ARMMachObjectWriter.cpp
-  ARMELFObjectWriter.cpp
   ARMTargetStreamer.cpp
   ARMUnwindOpAsm.cpp
-  ARMMachORelocationInfo.cpp
+  ARMWinCOFFObjectWriter.cpp
+  ARMWinCOFFStreamer.cpp
   )
diff --git a/lib/Target/ARM/MLxExpansionPass.cpp b/lib/Target/ARM/MLxExpansionPass.cpp
index 80af859..f6d24e9 100644
--- a/lib/Target/ARM/MLxExpansionPass.cpp
+++ b/lib/Target/ARM/MLxExpansionPass.cpp
@@ -12,7 +12,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "mlx-expansion"
 #include "ARM.h"
 #include "ARMBaseInstrInfo.h"
 #include "ARMSubtarget.h"
@@ -28,6 +27,8 @@
 #include "llvm/Target/TargetRegisterInfo.h"
 using namespace llvm;
 
+#define DEBUG_TYPE "mlx-expansion"
+
 static cl::opt<bool>
 ForceExapnd("expand-all-fp-mlx", cl::init(false), cl::Hidden);
 static cl::opt<unsigned>
@@ -73,7 +74,7 @@ namespace {
 }
 
 void MLxExpansion::clearStack() {
-  std::fill(LastMIs, LastMIs + 4, (MachineInstr*)0);
+  std::fill(LastMIs, LastMIs + 4, nullptr);
   MIIdx = 0;
 }
 
@@ -88,7 +89,7 @@ MachineInstr *MLxExpansion::getAccDefMI(MachineInstr *MI) const {
   // real definition MI. This is important for _sfp instructions.
   unsigned Reg = MI->getOperand(1).getReg();
   if (TargetRegisterInfo::isPhysicalRegister(Reg))
-    return 0;
+    return nullptr;
 
   MachineBasicBlock *MBB = MI->getParent();
   MachineInstr *DefMI = MRI->getVRegDef(Reg);
@@ -352,7 +353,7 @@ bool MLxExpansion::ExpandFPMLxInstructions(MachineBasicBlock &MBB) {
     if (Domain == ARMII::DomainGeneral) {
       if (++Skip == 2)
         // Assume dual issues of non-VFP / NEON instructions.
-        pushStack(0);
+        pushStack(nullptr);
     } else {
       Skip = 0;
 
diff --git a/lib/Target/ARM/README-Thumb.txt b/lib/Target/ARM/README-Thumb.txt
index a64707e..f4d9be3 100644
--- a/lib/Target/ARM/README-Thumb.txt
+++ b/lib/Target/ARM/README-Thumb.txt
@@ -215,10 +215,6 @@ etc. Almost all Thumb instructions clobber condition code.
 
 //===---------------------------------------------------------------------===//
 
-Add ldmia, stmia support.
-
-//===---------------------------------------------------------------------===//
-
 Thumb load / store address mode offsets are scaled. The values kept in the
 instruction operands are pre-scale values. This probably ought to be changed
 to avoid extra work when we convert Thumb2 instructions to Thumb1 instructions.
diff --git a/lib/Target/ARM/Thumb1FrameLowering.cpp b/lib/Target/ARM/Thumb1FrameLowering.cpp
index 2224652..be29dc5 100644
--- a/lib/Target/ARM/Thumb1FrameLowering.cpp
+++ b/lib/Target/ARM/Thumb1FrameLowering.cpp
@@ -293,7 +293,7 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF) const {
     AFI->setShouldRestoreSPFromFP(true);
 }
 
-static bool isCSRestore(MachineInstr *MI, const uint16_t *CSRegs) {
+static bool isCSRestore(MachineInstr *MI, const MCPhysReg *CSRegs) {
   if (MI->getOpcode() == ARM::tLDRspi &&
       MI->getOperand(1).isFI() &&
       isCalleeSavedRegister(MI->getOperand(0).getReg(), CSRegs))
@@ -328,7 +328,7 @@ void Thumb1FrameLowering::emitEpilogue(MachineFunction &MF,
   int NumBytes = (int)MFI->getStackSize();
   assert((unsigned)NumBytes >= ArgRegsSaveSize &&
          "ArgRegsSaveSize is included in NumBytes");
-  const uint16_t *CSRegs = RegInfo->getCalleeSavedRegs();
+  const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs();
   unsigned FramePtr = RegInfo->getFrameRegister(MF);
 
   if (!AFI->hasStackFrame()) {
diff --git a/lib/Target/ARM/Thumb1RegisterInfo.h b/lib/Target/ARM/Thumb1RegisterInfo.h
index 93e2b5a..0c0abbe 100644
--- a/lib/Target/ARM/Thumb1RegisterInfo.h
+++ b/lib/Target/ARM/Thumb1RegisterInfo.h
@@ -56,7 +56,7 @@ public:
                              unsigned Reg) const override;
   void eliminateFrameIndex(MachineBasicBlock::iterator II,
                            int SPAdj, unsigned FIOperandNum,
-                           RegScavenger *RS = NULL) const override;
+                           RegScavenger *RS = nullptr) const override;
 };
 }
 
diff --git a/lib/Target/ARM/Thumb2ITBlockPass.cpp b/lib/Target/ARM/Thumb2ITBlockPass.cpp
index 406dbe0..edb9ff3 100644
--- a/lib/Target/ARM/Thumb2ITBlockPass.cpp
+++ b/lib/Target/ARM/Thumb2ITBlockPass.cpp
@@ -7,7 +7,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "thumb2-it"
 #include "ARM.h"
 #include "ARMMachineFunctionInfo.h"
 #include "Thumb2InstrInfo.h"
@@ -19,6 +18,8 @@
 #include "llvm/CodeGen/MachineInstrBundle.h"
 using namespace llvm;
 
+#define DEBUG_TYPE "thumb2-it"
+
 STATISTIC(NumITs,        "Number of IT blocks inserted");
 STATISTIC(NumMovedInsts, "Number of predicated instructions moved");
 
diff --git a/lib/Target/ARM/Thumb2SizeReduction.cpp b/lib/Target/ARM/Thumb2SizeReduction.cpp
index 04b83fb..6267ecf 100644
--- a/lib/Target/ARM/Thumb2SizeReduction.cpp
+++ b/lib/Target/ARM/Thumb2SizeReduction.cpp
@@ -7,7 +7,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "t2-reduce-size"
 #include "ARM.h"
 #include "ARMBaseInstrInfo.h"
 #include "ARMSubtarget.h"
@@ -25,6 +24,8 @@
 #include "llvm/Target/TargetMachine.h"
 using namespace llvm;
 
+#define DEBUG_TYPE "t2-reduce-size"
+
 STATISTIC(NumNarrows,  "Number of 32-bit instrs reduced to 16-bit ones");
 STATISTIC(Num2Addrs,   "Number of 32-bit instrs reduced to 2addr 16-bit ones");
 STATISTIC(NumLdSts,    "Number of 32-bit load / store reduced to 16-bit ones");
@@ -915,15 +916,14 @@ bool Thumb2SizeReduce::ReduceMBB(MachineBasicBlock &MBB) {
 
   // Yes, CPSR could be livein.
   bool LiveCPSR = MBB.isLiveIn(ARM::CPSR);
-  MachineInstr *BundleMI = 0;
+  MachineInstr *BundleMI = nullptr;
 
-  CPSRDef = 0;
+  CPSRDef = nullptr;
   HighLatencyCPSR = false;
 
   // Check predecessors for the latest CPSRDef.
-  for (MachineBasicBlock::pred_iterator
-       I = MBB.pred_begin(), E = MBB.pred_end(); I != E; ++I) {
-    const MBBInfo &PInfo = BlockInfo[(*I)->getNumber()];
+  for (auto *Pred : MBB.predecessors()) {
+    const MBBInfo &PInfo = BlockInfo[Pred->getNumber()];
     if (!PInfo.Visited) {
       // Since blocks are visited in RPO, this must be a back-edge.
       continue;
@@ -984,7 +984,7 @@ bool Thumb2SizeReduce::ReduceMBB(MachineBasicBlock &MBB) {
     LiveCPSR = UpdateCPSRDef(*MI, LiveCPSR, DefCPSR);
     if (MI->isCall()) {
       // Calls don't really set CPSR.
-      CPSRDef = 0;
+      CPSRDef = nullptr;
       HighLatencyCPSR = false;
       IsSelfLoop = false;
     } else if (DefCPSR) {
diff --git a/lib/Target/ARM64/ARM64.h b/lib/Target/ARM64/ARM64.h
deleted file mode 100644
index f2c5e60..0000000
--- a/lib/Target/ARM64/ARM64.h
+++ /dev/null
@@ -1,48 +0,0 @@
-//===-- ARM64.h - Top-level interface for ARM64 representation --*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains the entry points for global functions defined in the LLVM
-// ARM64 back-end.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef TARGET_ARM64_H
-#define TARGET_ARM64_H
-
-#include "MCTargetDesc/ARM64BaseInfo.h"
-#include "MCTargetDesc/ARM64MCTargetDesc.h"
-#include "llvm/Target/TargetMachine.h"
-#include "llvm/Support/DataTypes.h"
-
-namespace llvm {
-
-class ARM64TargetMachine;
-class FunctionPass;
-class MachineFunctionPass;
-
-FunctionPass *createARM64DeadRegisterDefinitions();
-FunctionPass *createARM64ConditionalCompares();
-FunctionPass *createARM64AdvSIMDScalar();
-FunctionPass *createARM64BranchRelaxation();
-FunctionPass *createARM64ISelDag(ARM64TargetMachine &TM,
-                                 CodeGenOpt::Level OptLevel);
-FunctionPass *createARM64StorePairSuppressPass();
-FunctionPass *createARM64ExpandPseudoPass();
-FunctionPass *createARM64LoadStoreOptimizationPass();
-ModulePass *createARM64PromoteConstantPass();
-FunctionPass *createARM64AddressTypePromotionPass();
-/// \brief Creates an ARM-specific Target Transformation Info pass.
-ImmutablePass *createARM64TargetTransformInfoPass(const ARM64TargetMachine *TM);
-
-FunctionPass *createARM64CleanupLocalDynamicTLSPass();
-
-FunctionPass *createARM64CollectLOHPass();
-} // end namespace llvm
-
-#endif
diff --git a/lib/Target/ARM64/ARM64.td b/lib/Target/ARM64/ARM64.td
deleted file mode 100644
index 3eef8b2..0000000
--- a/lib/Target/ARM64/ARM64.td
+++ /dev/null
@@ -1,95 +0,0 @@
-//===- ARM64.td - Describe the ARM64 Target Machine --------*- tablegen -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-//
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// Target-independent interfaces which we are implementing
-//===----------------------------------------------------------------------===//
-
-include "llvm/Target/Target.td"
-
-//===----------------------------------------------------------------------===//
-// ARM64 Subtarget features.
-//
-
-/// Cyclone has register move instructions which are "free".
-def FeatureZCRegMove : SubtargetFeature<"zcm", "HasZeroCycleRegMove", "true",
-                                        "Has zereo-cycle register moves">;
-
-/// Cyclone has instructions which zero registers for "free".
-def FeatureZCZeroing : SubtargetFeature<"zcz", "HasZeroCycleZeroing", "true",
-                                        "Has zero-cycle zeroing instructions">;
-
-//===----------------------------------------------------------------------===//
-// Register File Description
-//===----------------------------------------------------------------------===//
-
-include "ARM64RegisterInfo.td"
-include "ARM64CallingConvention.td"
-
-//===----------------------------------------------------------------------===//
-// Instruction Descriptions
-//===----------------------------------------------------------------------===//
-
-include "ARM64Schedule.td"
-include "ARM64InstrInfo.td"
-
-def ARM64InstrInfo : InstrInfo;
-
-//===----------------------------------------------------------------------===//
-// ARM64 Processors supported.
-//
-include "ARM64SchedCyclone.td"
-
-def : ProcessorModel<"arm64-generic", NoSchedModel, []>;
-
-def : ProcessorModel<"cyclone", CycloneModel, [FeatureZCRegMove, FeatureZCZeroing]>;
-
-//===----------------------------------------------------------------------===//
-// Assembly parser
-//===----------------------------------------------------------------------===//
-
-def GenericAsmParserVariant : AsmParserVariant {
-  int Variant = 0;
-  string Name = "generic";
-}
-
-def AppleAsmParserVariant : AsmParserVariant {
-  int Variant = 1;
-  string Name = "apple-neon";
-}
-
-//===----------------------------------------------------------------------===//
-// Assembly printer
-//===----------------------------------------------------------------------===//
-// ARM64 Uses the MC printer for asm output, so make sure the TableGen
-// AsmWriter bits get associated with the correct class.
-def GenericAsmWriter : AsmWriter {
-  string AsmWriterClassName  = "InstPrinter";
-  int Variant = 0;
-  bit isMCAsmWriter = 1;
-}
-
-def AppleAsmWriter : AsmWriter {
-  let AsmWriterClassName = "AppleInstPrinter";
-  int Variant = 1;
-  int isMCAsmWriter = 1;
-}
-
-//===----------------------------------------------------------------------===//
-// Target Declaration
-//===----------------------------------------------------------------------===//
-
-def ARM64 : Target {
-  let InstructionSet = ARM64InstrInfo;
-  let AssemblyParserVariants = [GenericAsmParserVariant, AppleAsmParserVariant];
-  let AssemblyWriters = [GenericAsmWriter, AppleAsmWriter];
-}
diff --git a/lib/Target/ARM64/ARM64AsmPrinter.cpp b/lib/Target/ARM64/ARM64AsmPrinter.cpp
deleted file mode 100644
index d0aa6af..0000000
--- a/lib/Target/ARM64/ARM64AsmPrinter.cpp
+++ /dev/null
@@ -1,563 +0,0 @@
-//===-- ARM64AsmPrinter.cpp - ARM64 LLVM assembly writer ------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains a printer that converts from our internal representation
-// of machine-dependent LLVM code to the ARM64 assembly language.
-//
-//===----------------------------------------------------------------------===//
-
-#define DEBUG_TYPE "asm-printer"
-#include "ARM64.h"
-#include "ARM64MachineFunctionInfo.h"
-#include "ARM64MCInstLower.h"
-#include "ARM64RegisterInfo.h"
-#include "InstPrinter/ARM64InstPrinter.h"
-#include "llvm/ADT/SmallString.h"
-#include "llvm/ADT/StringSwitch.h"
-#include "llvm/ADT/Twine.h"
-#include "llvm/CodeGen/AsmPrinter.h"
-#include "llvm/CodeGen/MachineInstr.h"
-#include "llvm/CodeGen/StackMaps.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DebugInfo.h"
-#include "llvm/MC/MCAsmInfo.h"
-#include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCInst.h"
-#include "llvm/MC/MCInstBuilder.h"
-#include "llvm/MC/MCLinkerOptimizationHint.h"
-#include "llvm/MC/MCStreamer.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/TargetRegistry.h"
-using namespace llvm;
-
-namespace {
-
-class ARM64AsmPrinter : public AsmPrinter {
-  ARM64MCInstLower MCInstLowering;
-  StackMaps SM;
-
-public:
-  ARM64AsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
-      : AsmPrinter(TM, Streamer), MCInstLowering(OutContext, *Mang, *this),
-        SM(*this), ARM64FI(NULL), LOHLabelCounter(0) {}
-
-  virtual const char *getPassName() const { return "ARM64 Assembly Printer"; }
-
-  /// \brief Wrapper for MCInstLowering.lowerOperand() for the
-  /// tblgen'erated pseudo lowering.
-  bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp) const {
-    return MCInstLowering.lowerOperand(MO, MCOp);
-  }
-
-  void LowerSTACKMAP(MCStreamer &OutStreamer, StackMaps &SM,
-                     const MachineInstr &MI);
-  void LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM,
-                       const MachineInstr &MI);
-  /// \brief tblgen'erated driver function for lowering simple MI->MC
-  /// pseudo instructions.
-  bool emitPseudoExpansionLowering(MCStreamer &OutStreamer,
-                                   const MachineInstr *MI);
-
-  void EmitInstruction(const MachineInstr *MI);
-
-  void getAnalysisUsage(AnalysisUsage &AU) const {
-    AsmPrinter::getAnalysisUsage(AU);
-    AU.setPreservesAll();
-  }
-
-  bool runOnMachineFunction(MachineFunction &F) {
-    ARM64FI = F.getInfo<ARM64FunctionInfo>();
-    return AsmPrinter::runOnMachineFunction(F);
-  }
-
-private:
-  MachineLocation getDebugValueLocation(const MachineInstr *MI) const;
-  void printOperand(const MachineInstr *MI, unsigned OpNum, raw_ostream &O);
-  bool printAsmMRegister(const MachineOperand &MO, char Mode, raw_ostream &O);
-  bool printAsmRegInClass(const MachineOperand &MO,
-                          const TargetRegisterClass *RC, bool isVector,
-                          raw_ostream &O);
-
-  bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
-                       unsigned AsmVariant, const char *ExtraCode,
-                       raw_ostream &O);
-  bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNum,
-                             unsigned AsmVariant, const char *ExtraCode,
-                             raw_ostream &O);
-
-  void PrintDebugValueComment(const MachineInstr *MI, raw_ostream &OS);
-
-  void EmitFunctionBodyEnd();
-
-  MCSymbol *GetCPISymbol(unsigned CPID) const;
-  void EmitEndOfAsmFile(Module &M);
-  ARM64FunctionInfo *ARM64FI;
-
-  /// \brief Emit the LOHs contained in ARM64FI.
-  void EmitLOHs();
-
-  typedef std::map<const MachineInstr *, MCSymbol *> MInstToMCSymbol;
-  MInstToMCSymbol LOHInstToLabel;
-  unsigned LOHLabelCounter;
-};
-
-} // end of anonymous namespace
-
-//===----------------------------------------------------------------------===//
-
-void ARM64AsmPrinter::EmitEndOfAsmFile(Module &M) {
-  // Funny Darwin hack: This flag tells the linker that no global symbols
-  // contain code that falls through to other global symbols (e.g. the obvious
-  // implementation of multiple entry points).  If this doesn't occur, the
-  // linker can safely perform dead code stripping.  Since LLVM never
-  // generates code that does this, it is always safe to set.
-  OutStreamer.EmitAssemblerFlag(MCAF_SubsectionsViaSymbols);
-  SM.serializeToStackMapSection();
-}
-
-MachineLocation
-ARM64AsmPrinter::getDebugValueLocation(const MachineInstr *MI) const {
-  MachineLocation Location;
-  assert(MI->getNumOperands() == 4 && "Invalid no. of machine operands!");
-  // Frame address.  Currently handles register +- offset only.
-  if (MI->getOperand(0).isReg() && MI->getOperand(1).isImm())
-    Location.set(MI->getOperand(0).getReg(), MI->getOperand(1).getImm());
-  else {
-    DEBUG(dbgs() << "DBG_VALUE instruction ignored! " << *MI << "\n");
-  }
-  return Location;
-}
-
-void ARM64AsmPrinter::EmitLOHs() {
-  SmallVector<MCSymbol *, 3> MCArgs;
-
-  for (const auto &D : ARM64FI->getLOHContainer()) {
-    for (const MachineInstr *MI : D.getArgs()) {
-      MInstToMCSymbol::iterator LabelIt = LOHInstToLabel.find(MI);
-      assert(LabelIt != LOHInstToLabel.end() &&
-             "Label hasn't been inserted for LOH related instruction");
-      MCArgs.push_back(LabelIt->second);
-    }
-    OutStreamer.EmitLOHDirective(D.getKind(), MCArgs);
-    MCArgs.clear();
-  }
-}
-
-void ARM64AsmPrinter::EmitFunctionBodyEnd() {
-  if (!ARM64FI->getLOHRelated().empty())
-    EmitLOHs();
-}
-
-/// GetCPISymbol - Return the symbol for the specified constant pool entry.
-MCSymbol *ARM64AsmPrinter::GetCPISymbol(unsigned CPID) const {
-  // Darwin uses a linker-private symbol name for constant-pools (to
-  // avoid addends on the relocation?), ELF has no such concept and
-  // uses a normal private symbol.
-  if (getDataLayout().getLinkerPrivateGlobalPrefix()[0])
-    return OutContext.GetOrCreateSymbol(
-        Twine(getDataLayout().getLinkerPrivateGlobalPrefix()) + "CPI" +
-        Twine(getFunctionNumber()) + "_" + Twine(CPID));
-
-  return OutContext.GetOrCreateSymbol(
-      Twine(getDataLayout().getPrivateGlobalPrefix()) + "CPI" +
-      Twine(getFunctionNumber()) + "_" + Twine(CPID));
-}
-
-void ARM64AsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNum,
-                                   raw_ostream &O) {
-  const MachineOperand &MO = MI->getOperand(OpNum);
-  switch (MO.getType()) {
-  default:
-    assert(0 && "<unknown operand type>");
-  case MachineOperand::MO_Register: {
-    unsigned Reg = MO.getReg();
-    assert(TargetRegisterInfo::isPhysicalRegister(Reg));
-    assert(!MO.getSubReg() && "Subregs should be eliminated!");
-    O << ARM64InstPrinter::getRegisterName(Reg);
-    break;
-  }
-  case MachineOperand::MO_Immediate: {
-    int64_t Imm = MO.getImm();
-    O << '#' << Imm;
-    break;
-  }
-  }
-}
-
-bool ARM64AsmPrinter::printAsmMRegister(const MachineOperand &MO, char Mode,
-                                        raw_ostream &O) {
-  unsigned Reg = MO.getReg();
-  switch (Mode) {
-  default:
-    return true; // Unknown mode.
-  case 'w':
-    Reg = getWRegFromXReg(Reg);
-    break;
-  case 'x':
-    Reg = getXRegFromWReg(Reg);
-    break;
-  }
-
-  O << ARM64InstPrinter::getRegisterName(Reg);
-  return false;
-}
-
-// Prints the register in MO using class RC using the offset in the
-// new register class. This should not be used for cross class
-// printing.
-bool ARM64AsmPrinter::printAsmRegInClass(const MachineOperand &MO,
-                                         const TargetRegisterClass *RC,
-                                         bool isVector, raw_ostream &O) {
-  assert(MO.isReg() && "Should only get here with a register!");
-  const ARM64RegisterInfo *RI =
-      static_cast<const ARM64RegisterInfo *>(TM.getRegisterInfo());
-  unsigned Reg = MO.getReg();
-  unsigned RegToPrint = RC->getRegister(RI->getEncodingValue(Reg));
-  assert(RI->regsOverlap(RegToPrint, Reg));
-  O << ARM64InstPrinter::getRegisterName(
-           RegToPrint, isVector ? ARM64::vreg : ARM64::NoRegAltName);
-  return false;
-}
-
-bool ARM64AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
-                                      unsigned AsmVariant,
-                                      const char *ExtraCode, raw_ostream &O) {
-  const MachineOperand &MO = MI->getOperand(OpNum);
-  // Does this asm operand have a single letter operand modifier?
-  if (ExtraCode && ExtraCode[0]) {
-    if (ExtraCode[1] != 0)
-      return true; // Unknown modifier.
-
-    switch (ExtraCode[0]) {
-    default:
-      return true; // Unknown modifier.
-    case 'w':      // Print W register
-    case 'x':      // Print X register
-      if (MO.isReg())
-        return printAsmMRegister(MO, ExtraCode[0], O);
-      if (MO.isImm() && MO.getImm() == 0) {
-        unsigned Reg = ExtraCode[0] == 'w' ? ARM64::WZR : ARM64::XZR;
-        O << ARM64InstPrinter::getRegisterName(Reg);
-        return false;
-      }
-      printOperand(MI, OpNum, O);
-      return false;
-    case 'b': // Print B register.
-    case 'h': // Print H register.
-    case 's': // Print S register.
-    case 'd': // Print D register.
-    case 'q': // Print Q register.
-      if (MO.isReg()) {
-        const TargetRegisterClass *RC;
-        switch (ExtraCode[0]) {
-        case 'b':
-          RC = &ARM64::FPR8RegClass;
-          break;
-        case 'h':
-          RC = &ARM64::FPR16RegClass;
-          break;
-        case 's':
-          RC = &ARM64::FPR32RegClass;
-          break;
-        case 'd':
-          RC = &ARM64::FPR64RegClass;
-          break;
-        case 'q':
-          RC = &ARM64::FPR128RegClass;
-          break;
-        default:
-          return true;
-        }
-        return printAsmRegInClass(MO, RC, false /* vector */, O);
-      }
-      printOperand(MI, OpNum, O);
-      return false;
-    }
-  }
-
-  // According to ARM, we should emit x and v registers unless we have a
-  // modifier.
-  if (MO.isReg()) {
-    unsigned Reg = MO.getReg();
-
-    // If this is a w or x register, print an x register.
-    if (ARM64::GPR32allRegClass.contains(Reg) ||
-        ARM64::GPR64allRegClass.contains(Reg))
-      return printAsmMRegister(MO, 'x', O);
-
-    // If this is a b, h, s, d, or q register, print it as a v register.
-    return printAsmRegInClass(MO, &ARM64::FPR128RegClass, true /* vector */, O);
-  }
-
-  printOperand(MI, OpNum, O);
-  return false;
-}
-
-bool ARM64AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
-                                            unsigned OpNum, unsigned AsmVariant,
-                                            const char *ExtraCode,
-                                            raw_ostream &O) {
-  if (ExtraCode && ExtraCode[0])
-    return true; // Unknown modifier.
-
-  const MachineOperand &MO = MI->getOperand(OpNum);
-  assert(MO.isReg() && "unexpected inline asm memory operand");
-  O << "[" << ARM64InstPrinter::getRegisterName(MO.getReg()) << "]";
-  return false;
-}
-
-void ARM64AsmPrinter::PrintDebugValueComment(const MachineInstr *MI,
-                                             raw_ostream &OS) {
-  unsigned NOps = MI->getNumOperands();
-  assert(NOps == 4);
-  OS << '\t' << MAI->getCommentString() << "DEBUG_VALUE: ";
-  // cast away const; DIetc do not take const operands for some reason.
-  DIVariable V(const_cast<MDNode *>(MI->getOperand(NOps - 1).getMetadata()));
-  OS << V.getName();
-  OS << " <- ";
-  // Frame address.  Currently handles register +- offset only.
-  assert(MI->getOperand(0).isReg() && MI->getOperand(1).isImm());
-  OS << '[';
-  printOperand(MI, 0, OS);
-  OS << '+';
-  printOperand(MI, 1, OS);
-  OS << ']';
-  OS << "+";
-  printOperand(MI, NOps - 2, OS);
-}
-
-void ARM64AsmPrinter::LowerSTACKMAP(MCStreamer &OutStreamer, StackMaps &SM,
-                                    const MachineInstr &MI) {
-  unsigned NumNOPBytes = MI.getOperand(1).getImm();
-
-  SM.recordStackMap(MI);
-  // Emit padding.
-  assert(NumNOPBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
-  for (unsigned i = 0; i < NumNOPBytes; i += 4)
-    EmitToStreamer(OutStreamer, MCInstBuilder(ARM64::HINT).addImm(0));
-}
-
-// Lower a patchpoint of the form:
-// [<def>], <id>, <numBytes>, <target>, <numArgs>
-void ARM64AsmPrinter::LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM,
-                                      const MachineInstr &MI) {
-  SM.recordPatchPoint(MI);
-
-  PatchPointOpers Opers(&MI);
-
-  int64_t CallTarget = Opers.getMetaOper(PatchPointOpers::TargetPos).getImm();
-  unsigned EncodedBytes = 0;
-  if (CallTarget) {
-    assert((CallTarget & 0xFFFFFFFFFFFF) == CallTarget &&
-           "High 16 bits of call target should be zero.");
-    unsigned ScratchReg = MI.getOperand(Opers.getNextScratchIdx()).getReg();
-    EncodedBytes = 16;
-    // Materialize the jump address:
-    EmitToStreamer(OutStreamer, MCInstBuilder(ARM64::MOVZWi)
-                                    .addReg(ScratchReg)
-                                    .addImm((CallTarget >> 32) & 0xFFFF)
-                                    .addImm(32));
-    EmitToStreamer(OutStreamer, MCInstBuilder(ARM64::MOVKWi)
-                                    .addReg(ScratchReg)
-                                    .addReg(ScratchReg)
-                                    .addImm((CallTarget >> 16) & 0xFFFF)
-                                    .addImm(16));
-    EmitToStreamer(OutStreamer, MCInstBuilder(ARM64::MOVKWi)
-                                    .addReg(ScratchReg)
-                                    .addReg(ScratchReg)
-                                    .addImm(CallTarget & 0xFFFF)
-                                    .addImm(0));
-    EmitToStreamer(OutStreamer, MCInstBuilder(ARM64::BLR).addReg(ScratchReg));
-  }
-  // Emit padding.
-  unsigned NumBytes = Opers.getMetaOper(PatchPointOpers::NBytesPos).getImm();
-  assert(NumBytes >= EncodedBytes &&
-         "Patchpoint can't request size less than the length of a call.");
-  assert((NumBytes - EncodedBytes) % 4 == 0 &&
-         "Invalid number of NOP bytes requested!");
-  for (unsigned i = EncodedBytes; i < NumBytes; i += 4)
-    EmitToStreamer(OutStreamer, MCInstBuilder(ARM64::HINT).addImm(0));
-}
-
-// Simple pseudo-instructions have their lowering (with expansion to real
-// instructions) auto-generated.
-#include "ARM64GenMCPseudoLowering.inc"
-
-static unsigned getRealIndexedOpcode(unsigned Opc) {
-  switch (Opc) {
-  case ARM64::LDRXpre_isel:    return ARM64::LDRXpre;
-  case ARM64::LDRWpre_isel:    return ARM64::LDRWpre;
-  case ARM64::LDRDpre_isel:    return ARM64::LDRDpre;
-  case ARM64::LDRSpre_isel:    return ARM64::LDRSpre;
-  case ARM64::LDRBBpre_isel:   return ARM64::LDRBBpre;
-  case ARM64::LDRHHpre_isel:   return ARM64::LDRHHpre;
-  case ARM64::LDRSBWpre_isel:  return ARM64::LDRSBWpre;
-  case ARM64::LDRSBXpre_isel:  return ARM64::LDRSBXpre;
-  case ARM64::LDRSHWpre_isel:  return ARM64::LDRSHWpre;
-  case ARM64::LDRSHXpre_isel:  return ARM64::LDRSHXpre;
-  case ARM64::LDRSWpre_isel:   return ARM64::LDRSWpre;
-
-  case ARM64::LDRDpost_isel:   return ARM64::LDRDpost;
-  case ARM64::LDRSpost_isel:   return ARM64::LDRSpost;
-  case ARM64::LDRXpost_isel:   return ARM64::LDRXpost;
-  case ARM64::LDRWpost_isel:   return ARM64::LDRWpost;
-  case ARM64::LDRHHpost_isel:  return ARM64::LDRHHpost;
-  case ARM64::LDRBBpost_isel:  return ARM64::LDRBBpost;
-  case ARM64::LDRSWpost_isel:  return ARM64::LDRSWpost;
-  case ARM64::LDRSHWpost_isel: return ARM64::LDRSHWpost;
-  case ARM64::LDRSHXpost_isel: return ARM64::LDRSHXpost;
-  case ARM64::LDRSBWpost_isel: return ARM64::LDRSBWpost;
-  case ARM64::LDRSBXpost_isel: return ARM64::LDRSBXpost;
-
-  case ARM64::STRXpre_isel:    return ARM64::STRXpre;
-  case ARM64::STRWpre_isel:    return ARM64::STRWpre;
-  case ARM64::STRHHpre_isel:   return ARM64::STRHHpre;
-  case ARM64::STRBBpre_isel:   return ARM64::STRBBpre;
-  case ARM64::STRDpre_isel:    return ARM64::STRDpre;
-  case ARM64::STRSpre_isel:    return ARM64::STRSpre;
-  }
-  llvm_unreachable("Unexpected pre-indexed opcode!");
-}
-
-void ARM64AsmPrinter::EmitInstruction(const MachineInstr *MI) {
-  // Do any auto-generated pseudo lowerings.
-  if (emitPseudoExpansionLowering(OutStreamer, MI))
-    return;
-
-  if (ARM64FI->getLOHRelated().count(MI)) {
-    // Generate a label for LOH related instruction
-    MCSymbol *LOHLabel = GetTempSymbol("loh", LOHLabelCounter++);
-    // Associate the instruction with the label
-    LOHInstToLabel[MI] = LOHLabel;
-    OutStreamer.EmitLabel(LOHLabel);
-  }
-
-  // Do any manual lowerings.
-  switch (MI->getOpcode()) {
-  default:
-    break;
-  case ARM64::DBG_VALUE: {
-    if (isVerbose() && OutStreamer.hasRawTextSupport()) {
-      SmallString<128> TmpStr;
-      raw_svector_ostream OS(TmpStr);
-      PrintDebugValueComment(MI, OS);
-      OutStreamer.EmitRawText(StringRef(OS.str()));
-    }
-    return;
-  }
-  // Indexed loads and stores use a pseudo to handle complex operand
-  // tricks and writeback to the base register. We strip off the writeback
-  // operand and switch the opcode here. Post-indexed stores were handled by the
-  // tablegen'erated pseudos above. (The complex operand <--> simple
-  // operand isel is beyond tablegen's ability, so we do these manually).
-  case ARM64::LDRHHpre_isel:
-  case ARM64::LDRBBpre_isel:
-  case ARM64::LDRXpre_isel:
-  case ARM64::LDRWpre_isel:
-  case ARM64::LDRDpre_isel:
-  case ARM64::LDRSpre_isel:
-  case ARM64::LDRSBWpre_isel:
-  case ARM64::LDRSBXpre_isel:
-  case ARM64::LDRSHWpre_isel:
-  case ARM64::LDRSHXpre_isel:
-  case ARM64::LDRSWpre_isel:
-  case ARM64::LDRDpost_isel:
-  case ARM64::LDRSpost_isel:
-  case ARM64::LDRXpost_isel:
-  case ARM64::LDRWpost_isel:
-  case ARM64::LDRHHpost_isel:
-  case ARM64::LDRBBpost_isel:
-  case ARM64::LDRSWpost_isel:
-  case ARM64::LDRSHWpost_isel:
-  case ARM64::LDRSHXpost_isel:
-  case ARM64::LDRSBWpost_isel:
-  case ARM64::LDRSBXpost_isel: {
-    MCInst TmpInst;
-    // For loads, the writeback operand to be skipped is the second.
-    TmpInst.setOpcode(getRealIndexedOpcode(MI->getOpcode()));
-    TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(0).getReg()));
-    TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(2).getReg()));
-    TmpInst.addOperand(MCOperand::CreateImm(MI->getOperand(3).getImm()));
-    EmitToStreamer(OutStreamer, TmpInst);
-    return;
-  }
-  case ARM64::STRXpre_isel:
-  case ARM64::STRWpre_isel:
-  case ARM64::STRHHpre_isel:
-  case ARM64::STRBBpre_isel:
-  case ARM64::STRDpre_isel:
-  case ARM64::STRSpre_isel: {
-    MCInst TmpInst;
-    // For loads, the writeback operand to be skipped is the first.
-    TmpInst.setOpcode(getRealIndexedOpcode(MI->getOpcode()));
-    TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(1).getReg()));
-    TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(2).getReg()));
-    TmpInst.addOperand(MCOperand::CreateImm(MI->getOperand(3).getImm()));
-    EmitToStreamer(OutStreamer, TmpInst);
-    return;
-  }
-
-  // Tail calls use pseudo instructions so they have the proper code-gen
-  // attributes (isCall, isReturn, etc.). We lower them to the real
-  // instruction here.
-  case ARM64::TCRETURNri: {
-    MCInst TmpInst;
-    TmpInst.setOpcode(ARM64::BR);
-    TmpInst.addOperand(MCOperand::CreateReg(MI->getOperand(0).getReg()));
-    EmitToStreamer(OutStreamer, TmpInst);
-    return;
-  }
-  case ARM64::TCRETURNdi: {
-    MCOperand Dest;
-    MCInstLowering.lowerOperand(MI->getOperand(0), Dest);
-    MCInst TmpInst;
-    TmpInst.setOpcode(ARM64::B);
-    TmpInst.addOperand(Dest);
-    EmitToStreamer(OutStreamer, TmpInst);
-    return;
-  }
-  case ARM64::TLSDESC_BLR: {
-    MCOperand Callee, Sym;
-    MCInstLowering.lowerOperand(MI->getOperand(0), Callee);
-    MCInstLowering.lowerOperand(MI->getOperand(1), Sym);
-
-    // First emit a relocation-annotation. This expands to no code, but requests
-    // the following instruction gets an R_AARCH64_TLSDESC_CALL.
-    MCInst TLSDescCall;
-    TLSDescCall.setOpcode(ARM64::TLSDESCCALL);
-    TLSDescCall.addOperand(Sym);
-    EmitToStreamer(OutStreamer, TLSDescCall);
-
-    // Other than that it's just a normal indirect call to the function loaded
-    // from the descriptor.
-    MCInst BLR;
-    BLR.setOpcode(ARM64::BLR);
-    BLR.addOperand(Callee);
-    EmitToStreamer(OutStreamer, BLR);
-
-    return;
-  }
-
-  case TargetOpcode::STACKMAP:
-    return LowerSTACKMAP(OutStreamer, SM, *MI);
-
-  case TargetOpcode::PATCHPOINT:
-    return LowerPATCHPOINT(OutStreamer, SM, *MI);
-  }
-
-  // Finally, do the automated lowerings for everything else.
-  MCInst TmpInst;
-  MCInstLowering.Lower(MI, TmpInst);
-  EmitToStreamer(OutStreamer, TmpInst);
-}
-
-// Force static initialization.
-extern "C" void LLVMInitializeARM64AsmPrinter() {
-  RegisterAsmPrinter<ARM64AsmPrinter> X(TheARM64Target);
-}
diff --git a/lib/Target/ARM64/ARM64CallingConv.h b/lib/Target/ARM64/ARM64CallingConv.h
deleted file mode 100644
index 0128236..0000000
--- a/lib/Target/ARM64/ARM64CallingConv.h
+++ /dev/null
@@ -1,94 +0,0 @@
-//=== ARM64CallingConv.h - Custom Calling Convention Routines -*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains the custom routines for the ARM64 Calling Convention that
-// aren't done by tablegen.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef ARM64CALLINGCONV_H
-#define ARM64CALLINGCONV_H
-
-#include "ARM64InstrInfo.h"
-#include "llvm/IR/CallingConv.h"
-#include "llvm/CodeGen/CallingConvLower.h"
-#include "llvm/Target/TargetInstrInfo.h"
-
-namespace llvm {
-
-/// CC_ARM64_Custom_i1i8i16_Reg - customized handling of passing i1/i8/i16 via
-/// register. Here, ValVT can be i1/i8/i16 or i32 depending on whether the
-/// argument is already promoted and LocVT is i1/i8/i16. We only promote the
-/// argument to i32 if we are sure this argument will be passed in register.
-static bool CC_ARM64_Custom_i1i8i16_Reg(unsigned ValNo, MVT ValVT, MVT LocVT,
-                                        CCValAssign::LocInfo LocInfo,
-                                        ISD::ArgFlagsTy ArgFlags,
-                                        CCState &State,
-                                        bool IsWebKitJS = false) {
-  static const uint16_t RegList1[] = { ARM64::W0, ARM64::W1, ARM64::W2,
-                                       ARM64::W3, ARM64::W4, ARM64::W5,
-                                       ARM64::W6, ARM64::W7 };
-  static const uint16_t RegList2[] = { ARM64::X0, ARM64::X1, ARM64::X2,
-                                       ARM64::X3, ARM64::X4, ARM64::X5,
-                                       ARM64::X6, ARM64::X7 };
-  static const uint16_t WebKitRegList1[] = { ARM64::W0 };
-  static const uint16_t WebKitRegList2[] = { ARM64::X0 };
-
-  const uint16_t *List1 = IsWebKitJS ? WebKitRegList1 : RegList1;
-  const uint16_t *List2 = IsWebKitJS ? WebKitRegList2 : RegList2;
-
-  if (unsigned Reg = State.AllocateReg(List1, List2, 8)) {
-    // Customized extra section for handling i1/i8/i16:
-    // We need to promote the argument to i32 if it is not done already.
-    if (ValVT != MVT::i32) {
-      if (ArgFlags.isSExt())
-        LocInfo = CCValAssign::SExt;
-      else if (ArgFlags.isZExt())
-        LocInfo = CCValAssign::ZExt;
-      else
-        LocInfo = CCValAssign::AExt;
-      ValVT = MVT::i32;
-    }
-    // Set LocVT to i32 as well if passing via register.
-    LocVT = MVT::i32;
-    State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
-    return true;
-  }
-  return false;
-}
-
-/// CC_ARM64_WebKit_JS_i1i8i16_Reg - customized handling of passing i1/i8/i16
-/// via register. This behaves the same as CC_ARM64_Custom_i1i8i16_Reg, but only
-/// uses the first register.
-static bool CC_ARM64_WebKit_JS_i1i8i16_Reg(unsigned ValNo, MVT ValVT, MVT LocVT,
-                                           CCValAssign::LocInfo LocInfo,
-                                           ISD::ArgFlagsTy ArgFlags,
-                                           CCState &State) {
-  return CC_ARM64_Custom_i1i8i16_Reg(ValNo, ValVT, LocVT, LocInfo, ArgFlags,
-                                     State, true);
-}
-
-/// CC_ARM64_Custom_i1i8i16_Stack: customized handling of passing i1/i8/i16 on
-/// stack. Here, ValVT can be i1/i8/i16 or i32 depending on whether the argument
-/// is already promoted and LocVT is i1/i8/i16. If ValVT is already promoted,
-/// it will be truncated back to i1/i8/i16.
-static bool CC_ARM64_Custom_i1i8i16_Stack(unsigned ValNo, MVT ValVT, MVT LocVT,
-                                          CCValAssign::LocInfo LocInfo,
-                                          ISD::ArgFlagsTy ArgFlags,
-                                          CCState &State) {
-  unsigned Space = ((LocVT == MVT::i1 || LocVT == MVT::i8) ? 1 : 2);
-  unsigned Offset12 = State.AllocateStack(Space, Space);
-  ValVT = LocVT;
-  State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset12, LocVT, LocInfo));
-  return true;
-}
-
-} // End llvm namespace
-
-#endif
diff --git a/lib/Target/ARM64/ARM64DeadRegisterDefinitionsPass.cpp b/lib/Target/ARM64/ARM64DeadRegisterDefinitionsPass.cpp
deleted file mode 100644
index 3e410e5..0000000
--- a/lib/Target/ARM64/ARM64DeadRegisterDefinitionsPass.cpp
+++ /dev/null
@@ -1,104 +0,0 @@
-//===-- ARM64DeadRegisterDefinitions.cpp - Replace dead defs w/ zero reg --===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-// When allowed by the instruction, replace a dead definition of a GPR with
-// the zero register. This makes the code a bit friendlier towards the
-// hardware's register renamer.
-//===----------------------------------------------------------------------===//
-
-#define DEBUG_TYPE "arm64-dead-defs"
-#include "ARM64.h"
-#include "ARM64RegisterInfo.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineInstr.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-using namespace llvm;
-
-STATISTIC(NumDeadDefsReplaced, "Number of dead definitions replaced");
-
-namespace {
-class ARM64DeadRegisterDefinitions : public MachineFunctionPass {
-private:
-  bool processMachineBasicBlock(MachineBasicBlock *MBB);
-
-public:
-  static char ID; // Pass identification, replacement for typeid.
-  explicit ARM64DeadRegisterDefinitions() : MachineFunctionPass(ID) {}
-
-  virtual bool runOnMachineFunction(MachineFunction &F);
-
-  const char *getPassName() const { return "Dead register definitions"; }
-
-  virtual void getAnalysisUsage(AnalysisUsage &AU) const {
-    AU.setPreservesCFG();
-    MachineFunctionPass::getAnalysisUsage(AU);
-  }
-};
-char ARM64DeadRegisterDefinitions::ID = 0;
-} // end anonymous namespace
-
-bool
-ARM64DeadRegisterDefinitions::processMachineBasicBlock(MachineBasicBlock *MBB) {
-  bool Changed = false;
-  for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;
-       ++I) {
-    MachineInstr *MI = I;
-    for (int i = 0, e = MI->getDesc().getNumDefs(); i != e; ++i) {
-      MachineOperand &MO = MI->getOperand(i);
-      if (MO.isReg() && MO.isDead() && MO.isDef()) {
-        assert(!MO.isImplicit() && "Unexpected implicit def!");
-        DEBUG(dbgs() << "  Dead def operand #" << i << " in:\n    ";
-              MI->print(dbgs()));
-        // Be careful not to change the register if it's a tied operand.
-        if (MI->isRegTiedToUseOperand(i)) {
-          DEBUG(dbgs() << "    Ignoring, def is tied operand.\n");
-          continue;
-        }
-        // Make sure the instruction take a register class that contains
-        // the zero register and replace it if so.
-        unsigned NewReg;
-        switch (MI->getDesc().OpInfo[i].RegClass) {
-        default:
-          DEBUG(dbgs() << "    Ignoring, register is not a GPR.\n");
-          continue;
-        case ARM64::GPR32RegClassID:
-          NewReg = ARM64::WZR;
-          break;
-        case ARM64::GPR64RegClassID:
-          NewReg = ARM64::XZR;
-          break;
-        }
-        DEBUG(dbgs() << "    Replacing with zero register. New:\n      ");
-        MO.setReg(NewReg);
-        DEBUG(MI->print(dbgs()));
-        ++NumDeadDefsReplaced;
-      }
-    }
-  }
-  return Changed;
-}
-
-// Scan the function for instructions that have a dead definition of a
-// register. Replace that register with the zero register when possible.
-bool ARM64DeadRegisterDefinitions::runOnMachineFunction(MachineFunction &mf) {
-  MachineFunction *MF = &mf;
-  bool Changed = false;
-  DEBUG(dbgs() << "***** ARM64DeadRegisterDefinitions *****\n");
-
-  for (MachineFunction::iterator I = MF->begin(), E = MF->end(); I != E; ++I)
-    if (processMachineBasicBlock(I))
-      Changed = true;
-  return Changed;
-}
-
-FunctionPass *llvm::createARM64DeadRegisterDefinitions() {
-  return new ARM64DeadRegisterDefinitions();
-}
diff --git a/lib/Target/ARM64/ARM64FrameLowering.cpp b/lib/Target/ARM64/ARM64FrameLowering.cpp
deleted file mode 100644
index 798986c..0000000
--- a/lib/Target/ARM64/ARM64FrameLowering.cpp
+++ /dev/null
@@ -1,816 +0,0 @@
-//===- ARM64FrameLowering.cpp - ARM64 Frame Lowering -----------*- C++ -*-====//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains the ARM64 implementation of TargetFrameLowering class.
-//
-//===----------------------------------------------------------------------===//
-
-#define DEBUG_TYPE "frame-info"
-#include "ARM64FrameLowering.h"
-#include "ARM64InstrInfo.h"
-#include "ARM64MachineFunctionInfo.h"
-#include "ARM64Subtarget.h"
-#include "ARM64TargetMachine.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/Function.h"
-#include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineModuleInfo.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/RegisterScavenging.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/raw_ostream.h"
-
-using namespace llvm;
-
-static cl::opt<bool> EnableRedZone("arm64-redzone",
-                                   cl::desc("enable use of redzone on ARM64"),
-                                   cl::init(false), cl::Hidden);
-
-STATISTIC(NumRedZoneFunctions, "Number of functions using red zone");
-
-static unsigned estimateStackSize(MachineFunction &MF) {
-  const MachineFrameInfo *FFI = MF.getFrameInfo();
-  int Offset = 0;
-  for (int i = FFI->getObjectIndexBegin(); i != 0; ++i) {
-    int FixedOff = -FFI->getObjectOffset(i);
-    if (FixedOff > Offset)
-      Offset = FixedOff;
-  }
-  for (unsigned i = 0, e = FFI->getObjectIndexEnd(); i != e; ++i) {
-    if (FFI->isDeadObjectIndex(i))
-      continue;
-    Offset += FFI->getObjectSize(i);
-    unsigned Align = FFI->getObjectAlignment(i);
-    // Adjust to alignment boundary
-    Offset = (Offset + Align - 1) / Align * Align;
-  }
-  // This does not include the 16 bytes used for fp and lr.
-  return (unsigned)Offset;
-}
-
-bool ARM64FrameLowering::canUseRedZone(const MachineFunction &MF) const {
-  if (!EnableRedZone)
-    return false;
-  // Don't use the red zone if the function explicitly asks us not to.
-  // This is typically used for kernel code.
-  if (MF.getFunction()->getAttributes().hasAttribute(
-          AttributeSet::FunctionIndex, Attribute::NoRedZone))
-    return false;
-
-  const MachineFrameInfo *MFI = MF.getFrameInfo();
-  const ARM64FunctionInfo *AFI = MF.getInfo<ARM64FunctionInfo>();
-  unsigned NumBytes = AFI->getLocalStackSize();
-
-  // Note: currently hasFP() is always true for hasCalls(), but that's an
-  // implementation detail of the current code, not a strict requirement,
-  // so stay safe here and check both.
-  if (MFI->hasCalls() || hasFP(MF) || NumBytes > 128)
-    return false;
-  return true;
-}
-
-/// hasFP - Return true if the specified function should have a dedicated frame
-/// pointer register.
-bool ARM64FrameLowering::hasFP(const MachineFunction &MF) const {
-  const MachineFrameInfo *MFI = MF.getFrameInfo();
-
-#ifndef NDEBUG
-  const TargetRegisterInfo *RegInfo = MF.getTarget().getRegisterInfo();
-  assert(!RegInfo->needsStackRealignment(MF) &&
-         "No stack realignment on ARM64!");
-#endif
-
-  return (MFI->hasCalls() || MFI->hasVarSizedObjects() ||
-          MFI->isFrameAddressTaken());
-}
-
-/// hasReservedCallFrame - Under normal circumstances, when a frame pointer is
-/// not required, we reserve argument space for call sites in the function
-/// immediately on entry to the current function.  This eliminates the need for
-/// add/sub sp brackets around call sites.  Returns true if the call frame is
-/// included as part of the stack frame.
-bool ARM64FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
-  return !MF.getFrameInfo()->hasVarSizedObjects();
-}
-
-void ARM64FrameLowering::eliminateCallFramePseudoInstr(
-    MachineFunction &MF, MachineBasicBlock &MBB,
-    MachineBasicBlock::iterator I) const {
-  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
-  const ARM64InstrInfo *TII =
-      static_cast<const ARM64InstrInfo *>(MF.getTarget().getInstrInfo());
-  if (!TFI->hasReservedCallFrame(MF)) {
-    // If we have alloca, convert as follows:
-    // ADJCALLSTACKDOWN -> sub, sp, sp, amount
-    // ADJCALLSTACKUP   -> add, sp, sp, amount
-    MachineInstr *Old = I;
-    DebugLoc DL = Old->getDebugLoc();
-    unsigned Amount = Old->getOperand(0).getImm();
-    if (Amount != 0) {
-      // We need to keep the stack aligned properly.  To do this, we round the
-      // amount of space needed for the outgoing arguments up to the next
-      // alignment boundary.
-      unsigned Align = TFI->getStackAlignment();
-      Amount = (Amount + Align - 1) / Align * Align;
-
-      // Replace the pseudo instruction with a new instruction...
-      unsigned Opc = Old->getOpcode();
-      if (Opc == ARM64::ADJCALLSTACKDOWN) {
-        emitFrameOffset(MBB, I, DL, ARM64::SP, ARM64::SP, -Amount, TII);
-      } else {
-        assert(Opc == ARM64::ADJCALLSTACKUP && "expected ADJCALLSTACKUP");
-        emitFrameOffset(MBB, I, DL, ARM64::SP, ARM64::SP, Amount, TII);
-      }
-    }
-  }
-  MBB.erase(I);
-}
-
-void
-ARM64FrameLowering::emitCalleeSavedFrameMoves(MachineBasicBlock &MBB,
-                                              MachineBasicBlock::iterator MBBI,
-                                              unsigned FramePtr) const {
-  MachineFunction &MF = *MBB.getParent();
-  MachineFrameInfo *MFI = MF.getFrameInfo();
-  MachineModuleInfo &MMI = MF.getMMI();
-  const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo();
-  const ARM64InstrInfo *TII = TM.getInstrInfo();
-  DebugLoc DL = MBB.findDebugLoc(MBBI);
-
-  // Add callee saved registers to move list.
-  const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
-  if (CSI.empty())
-    return;
-
-  const DataLayout *TD = MF.getTarget().getDataLayout();
-  bool HasFP = hasFP(MF);
-
-  // Calculate amount of bytes used for return address storing.
-  int stackGrowth = -TD->getPointerSize(0);
-
-  // Calculate offsets.
-  int64_t saveAreaOffset = (HasFP ? 2 : 1) * stackGrowth;
-  unsigned TotalSkipped = 0;
-  for (const auto &Info : CSI) {
-    unsigned Reg = Info.getReg();
-    int64_t Offset = MFI->getObjectOffset(Info.getFrameIdx()) -
-                     getOffsetOfLocalArea() + saveAreaOffset;
-
-    // Don't output a new CFI directive if we're re-saving the frame pointer or
-    // link register. This happens when the PrologEpilogInserter has inserted an
-    // extra "STP" of the frame pointer and link register -- the "emitPrologue"
-    // method automatically generates the directives when frame pointers are
-    // used. If we generate CFI directives for the extra "STP"s, the linker will
-    // lose track of the correct values for the frame pointer and link register.
-    if (HasFP && (FramePtr == Reg || Reg == ARM64::LR)) {
-      TotalSkipped += stackGrowth;
-      continue;
-    }
-
-    unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true);
-    unsigned CFIIndex = MMI.addFrameInst(MCCFIInstruction::createOffset(
-        nullptr, DwarfReg, Offset - TotalSkipped));
-    BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
-        .addCFIIndex(CFIIndex);
-  }
-}
-
-void ARM64FrameLowering::emitPrologue(MachineFunction &MF) const {
-  MachineBasicBlock &MBB = MF.front(); // Prologue goes in entry BB.
-  MachineBasicBlock::iterator MBBI = MBB.begin();
-  const MachineFrameInfo *MFI = MF.getFrameInfo();
-  const Function *Fn = MF.getFunction();
-  const ARM64RegisterInfo *RegInfo = TM.getRegisterInfo();
-  const ARM64InstrInfo *TII = TM.getInstrInfo();
-  MachineModuleInfo &MMI = MF.getMMI();
-  ARM64FunctionInfo *AFI = MF.getInfo<ARM64FunctionInfo>();
-  bool needsFrameMoves = MMI.hasDebugInfo() || Fn->needsUnwindTableEntry();
-  bool HasFP = hasFP(MF);
-  DebugLoc DL = MBB.findDebugLoc(MBBI);
-
-  int NumBytes = (int)MFI->getStackSize();
-  if (!AFI->hasStackFrame()) {
-    assert(!HasFP && "unexpected function without stack frame but with FP");
-
-    // All of the stack allocation is for locals.
-    AFI->setLocalStackSize(NumBytes);
-
-    // Label used to tie together the PROLOG_LABEL and the MachineMoves.
-    MCSymbol *FrameLabel = MMI.getContext().CreateTempSymbol();
-
-    // REDZONE: If the stack size is less than 128 bytes, we don't need
-    // to actually allocate.
-    if (NumBytes && !canUseRedZone(MF)) {
-      emitFrameOffset(MBB, MBBI, DL, ARM64::SP, ARM64::SP, -NumBytes, TII,
-                      MachineInstr::FrameSetup);
-
-      // Encode the stack size of the leaf function.
-      unsigned CFIIndex = MMI.addFrameInst(
-          MCCFIInstruction::createDefCfaOffset(FrameLabel, -NumBytes));
-      BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
-          .addCFIIndex(CFIIndex);
-    } else if (NumBytes) {
-      ++NumRedZoneFunctions;
-    }
-
-    return;
-  }
-
-  // Only set up FP if we actually need to.
-  int FPOffset = 0;
-  if (HasFP) {
-    // First instruction must a) allocate the stack  and b) have an immediate
-    // that is a multiple of -2.
-    assert((MBBI->getOpcode() == ARM64::STPXpre ||
-            MBBI->getOpcode() == ARM64::STPDpre) &&
-           MBBI->getOperand(2).getReg() == ARM64::SP &&
-           MBBI->getOperand(3).getImm() < 0 &&
-           (MBBI->getOperand(3).getImm() & 1) == 0);
-
-    // Frame pointer is fp = sp - 16. Since the  STPXpre subtracts the space
-    // required for the callee saved register area we get the frame pointer
-    // by addding that offset - 16 = -getImm()*8 - 2*8 = -(getImm() + 2) * 8.
-    FPOffset = -(MBBI->getOperand(3).getImm() + 2) * 8;
-    assert(FPOffset >= 0 && "Bad Framepointer Offset");
-  }
-
-  // Move past the saves of the callee-saved registers.
-  while (MBBI->getOpcode() == ARM64::STPXi ||
-         MBBI->getOpcode() == ARM64::STPDi ||
-         MBBI->getOpcode() == ARM64::STPXpre ||
-         MBBI->getOpcode() == ARM64::STPDpre) {
-    ++MBBI;
-    NumBytes -= 16;
-  }
-  assert(NumBytes >= 0 && "Negative stack allocation size!?");
-  if (HasFP) {
-    // Issue    sub fp, sp, FPOffset or
-    //          mov fp,sp          when FPOffset is zero.
-    // Note: All stores of callee-saved registers are marked as "FrameSetup".
-    // This code marks the instruction(s) that set the FP also.
-    emitFrameOffset(MBB, MBBI, DL, ARM64::FP, ARM64::SP, FPOffset, TII,
-                    MachineInstr::FrameSetup);
-  }
-
-  // All of the remaining stack allocations are for locals.
-  AFI->setLocalStackSize(NumBytes);
-
-  // Allocate space for the rest of the frame.
-  if (NumBytes) {
-    // If we're a leaf function, try using the red zone.
-    if (!canUseRedZone(MF))
-      emitFrameOffset(MBB, MBBI, DL, ARM64::SP, ARM64::SP, -NumBytes, TII,
-                      MachineInstr::FrameSetup);
-  }
-
-  // If we need a base pointer, set it up here. It's whatever the value of the
-  // stack pointer is at this point. Any variable size objects will be allocated
-  // after this, so we can still use the base pointer to reference locals.
-  //
-  // FIXME: Clarify FrameSetup flags here.
-  // Note: Use emitFrameOffset() like above for FP if the FrameSetup flag is
-  // needed.
-  //
-  if (RegInfo->hasBasePointer(MF))
-    TII->copyPhysReg(MBB, MBBI, DL, ARM64::X19, ARM64::SP, false);
-
-  if (needsFrameMoves) {
-    const DataLayout *TD = MF.getTarget().getDataLayout();
-    const int StackGrowth = -TD->getPointerSize(0);
-    unsigned FramePtr = RegInfo->getFrameRegister(MF);
-
-    // An example of the prologue:
-    //
-    //     .globl __foo
-    //     .align 2
-    //  __foo:
-    // Ltmp0:
-    //     .cfi_startproc
-    //     .cfi_personality 155, ___gxx_personality_v0
-    // Leh_func_begin:
-    //     .cfi_lsda 16, Lexception33
-    //
-    //     stp  xa,bx, [sp, -#offset]!
-    //     ...
-    //     stp  x28, x27, [sp, #offset-32]
-    //     stp  fp, lr, [sp, #offset-16]
-    //     add  fp, sp, #offset - 16
-    //     sub  sp, sp, #1360
-    //
-    // The Stack:
-    //       +-------------------------------------------+
-    // 10000 | ........ | ........ | ........ | ........ |
-    // 10004 | ........ | ........ | ........ | ........ |
-    //       +-------------------------------------------+
-    // 10008 | ........ | ........ | ........ | ........ |
-    // 1000c | ........ | ........ | ........ | ........ |
-    //       +===========================================+
-    // 10010 |                X28 Register               |
-    // 10014 |                X28 Register               |
-    //       +-------------------------------------------+
-    // 10018 |                X27 Register               |
-    // 1001c |                X27 Register               |
-    //       +===========================================+
-    // 10020 |                Frame Pointer              |
-    // 10024 |                Frame Pointer              |
-    //       +-------------------------------------------+
-    // 10028 |                Link Register              |
-    // 1002c |                Link Register              |
-    //       +===========================================+
-    // 10030 | ........ | ........ | ........ | ........ |
-    // 10034 | ........ | ........ | ........ | ........ |
-    //       +-------------------------------------------+
-    // 10038 | ........ | ........ | ........ | ........ |
-    // 1003c | ........ | ........ | ........ | ........ |
-    //       +-------------------------------------------+
-    //
-    //     [sp] = 10030        ::    >>initial value<<
-    //     sp = 10020          ::  stp fp, lr, [sp, #-16]!
-    //     fp = sp == 10020    ::  mov fp, sp
-    //     [sp] == 10020       ::  stp x28, x27, [sp, #-16]!
-    //     sp == 10010         ::    >>final value<<
-    //
-    // The frame pointer (w29) points to address 10020. If we use an offset of
-    // '16' from 'w29', we get the CFI offsets of -8 for w30, -16 for w29, -24
-    // for w27, and -32 for w28:
-    //
-    //  Ltmp1:
-    //     .cfi_def_cfa w29, 16
-    //  Ltmp2:
-    //     .cfi_offset w30, -8
-    //  Ltmp3:
-    //     .cfi_offset w29, -16
-    //  Ltmp4:
-    //     .cfi_offset w27, -24
-    //  Ltmp5:
-    //     .cfi_offset w28, -32
-
-    if (HasFP) {
-      // Define the current CFA rule to use the provided FP.
-      unsigned Reg = RegInfo->getDwarfRegNum(FramePtr, true);
-      unsigned CFIIndex = MMI.addFrameInst(
-          MCCFIInstruction::createDefCfa(nullptr, Reg, 2 * StackGrowth));
-      BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
-          .addCFIIndex(CFIIndex);
-
-      // Record the location of the stored LR
-      unsigned LR = RegInfo->getDwarfRegNum(ARM64::LR, true);
-      CFIIndex = MMI.addFrameInst(
-          MCCFIInstruction::createOffset(nullptr, LR, StackGrowth));
-      BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
-          .addCFIIndex(CFIIndex);
-
-      // Record the location of the stored FP
-      CFIIndex = MMI.addFrameInst(
-          MCCFIInstruction::createOffset(nullptr, Reg, 2 * StackGrowth));
-      BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
-          .addCFIIndex(CFIIndex);
-    } else {
-      // Encode the stack size of the leaf function.
-      unsigned CFIIndex = MMI.addFrameInst(
-          MCCFIInstruction::createDefCfaOffset(nullptr, -MFI->getStackSize()));
-      BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
-          .addCFIIndex(CFIIndex);
-    }
-
-    // Now emit the moves for whatever callee saved regs we have.
-    emitCalleeSavedFrameMoves(MBB, MBBI, FramePtr);
-  }
-}
-
-static bool isCalleeSavedRegister(unsigned Reg, const uint16_t *CSRegs) {
-  for (unsigned i = 0; CSRegs[i]; ++i)
-    if (Reg == CSRegs[i])
-      return true;
-  return false;
-}
-
-static bool isCSRestore(MachineInstr *MI, const uint16_t *CSRegs) {
-  if (MI->getOpcode() == ARM64::LDPXpost ||
-      MI->getOpcode() == ARM64::LDPDpost || MI->getOpcode() == ARM64::LDPXi ||
-      MI->getOpcode() == ARM64::LDPDi) {
-    if (!isCalleeSavedRegister(MI->getOperand(0).getReg(), CSRegs) ||
-        !isCalleeSavedRegister(MI->getOperand(1).getReg(), CSRegs) ||
-        MI->getOperand(2).getReg() != ARM64::SP)
-      return false;
-    return true;
-  }
-
-  return false;
-}
-
-void ARM64FrameLowering::emitEpilogue(MachineFunction &MF,
-                                      MachineBasicBlock &MBB) const {
-  MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
-  assert(MBBI->isReturn() && "Can only insert epilog into returning blocks");
-  MachineFrameInfo *MFI = MF.getFrameInfo();
-  const ARM64InstrInfo *TII =
-      static_cast<const ARM64InstrInfo *>(MF.getTarget().getInstrInfo());
-  const ARM64RegisterInfo *RegInfo =
-      static_cast<const ARM64RegisterInfo *>(MF.getTarget().getRegisterInfo());
-  DebugLoc DL = MBBI->getDebugLoc();
-
-  int NumBytes = MFI->getStackSize();
-  unsigned NumRestores = 0;
-  // Move past the restores of the callee-saved registers.
-  MachineBasicBlock::iterator LastPopI = MBBI;
-  const uint16_t *CSRegs = RegInfo->getCalleeSavedRegs(&MF);
-  if (LastPopI != MBB.begin()) {
-    do {
-      ++NumRestores;
-      --LastPopI;
-    } while (LastPopI != MBB.begin() && isCSRestore(LastPopI, CSRegs));
-    if (!isCSRestore(LastPopI, CSRegs)) {
-      ++LastPopI;
-      --NumRestores;
-    }
-  }
-  NumBytes -= NumRestores * 16;
-  assert(NumBytes >= 0 && "Negative stack allocation size!?");
-
-  if (!hasFP(MF)) {
-    // If this was a redzone leaf function, we don't need to restore the
-    // stack pointer.
-    if (!canUseRedZone(MF))
-      emitFrameOffset(MBB, LastPopI, DL, ARM64::SP, ARM64::SP, NumBytes, TII);
-    return;
-  }
-
-  // Restore the original stack pointer.
-  // FIXME: Rather than doing the math here, we should instead just use
-  // non-post-indexed loads for the restores if we aren't actually going to
-  // be able to save any instructions.
-  if (NumBytes || MFI->hasVarSizedObjects())
-    emitFrameOffset(MBB, LastPopI, DL, ARM64::SP, ARM64::FP,
-                    -(NumRestores - 1) * 16, TII, MachineInstr::NoFlags);
-}
-
-/// getFrameIndexOffset - Returns the displacement from the frame register to
-/// the stack frame of the specified index.
-int ARM64FrameLowering::getFrameIndexOffset(const MachineFunction &MF,
-                                            int FI) const {
-  unsigned FrameReg;
-  return getFrameIndexReference(MF, FI, FrameReg);
-}
-
-/// getFrameIndexReference - Provide a base+offset reference to an FI slot for
-/// debug info.  It's the same as what we use for resolving the code-gen
-/// references for now.  FIXME: This can go wrong when references are
-/// SP-relative and simple call frames aren't used.
-int ARM64FrameLowering::getFrameIndexReference(const MachineFunction &MF,
-                                               int FI,
-                                               unsigned &FrameReg) const {
-  return resolveFrameIndexReference(MF, FI, FrameReg);
-}
-
-int ARM64FrameLowering::resolveFrameIndexReference(const MachineFunction &MF,
-                                                   int FI, unsigned &FrameReg,
-                                                   bool PreferFP) const {
-  const MachineFrameInfo *MFI = MF.getFrameInfo();
-  const ARM64RegisterInfo *RegInfo =
-      static_cast<const ARM64RegisterInfo *>(MF.getTarget().getRegisterInfo());
-  const ARM64FunctionInfo *AFI = MF.getInfo<ARM64FunctionInfo>();
-  int FPOffset = MFI->getObjectOffset(FI) + 16;
-  int Offset = MFI->getObjectOffset(FI) + MFI->getStackSize();
-  bool isFixed = MFI->isFixedObjectIndex(FI);
-
-  // Use frame pointer to reference fixed objects. Use it for locals if
-  // there are VLAs (and thus the SP isn't reliable as a base).
-  // Make sure useFPForScavengingIndex() does the right thing for the emergency
-  // spill slot.
-  bool UseFP = false;
-  if (AFI->hasStackFrame()) {
-    // Note: Keeping the following as multiple 'if' statements rather than
-    // merging to a single expression for readability.
-    //
-    // Argument access should always use the FP.
-    if (isFixed) {
-      UseFP = hasFP(MF);
-    } else if (hasFP(MF) && !RegInfo->hasBasePointer(MF)) {
-      // Use SP or FP, whichever gives us the best chance of the offset
-      // being in range for direct access. If the FPOffset is positive,
-      // that'll always be best, as the SP will be even further away.
-      // If the FPOffset is negative, we have to keep in mind that the
-      // available offset range for negative offsets is smaller than for
-      // positive ones. If we have variable sized objects, we're stuck with
-      // using the FP regardless, though, as the SP offset is unknown
-      // and we don't have a base pointer available. If an offset is
-      // available via the FP and the SP, use whichever is closest.
-      if (PreferFP || MFI->hasVarSizedObjects() || FPOffset >= 0 ||
-          (FPOffset >= -256 && Offset > -FPOffset))
-        UseFP = true;
-    }
-  }
-
-  if (UseFP) {
-    FrameReg = RegInfo->getFrameRegister(MF);
-    return FPOffset;
-  }
-
-  // Use the base pointer if we have one.
-  if (RegInfo->hasBasePointer(MF))
-    FrameReg = RegInfo->getBaseRegister();
-  else {
-    FrameReg = ARM64::SP;
-    // If we're using the red zone for this function, the SP won't actually
-    // be adjusted, so the offsets will be negative. They're also all
-    // within range of the signed 9-bit immediate instructions.
-    if (canUseRedZone(MF))
-      Offset -= AFI->getLocalStackSize();
-  }
-
-  return Offset;
-}
-
-static unsigned getPrologueDeath(MachineFunction &MF, unsigned Reg) {
-  if (Reg != ARM64::LR)
-    return getKillRegState(true);
-
-  // LR maybe referred to later by an @llvm.returnaddress intrinsic.
-  bool LRLiveIn = MF.getRegInfo().isLiveIn(ARM64::LR);
-  bool LRKill = !(LRLiveIn && MF.getFrameInfo()->isReturnAddressTaken());
-  return getKillRegState(LRKill);
-}
-
-bool ARM64FrameLowering::spillCalleeSavedRegisters(
-    MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
-    const std::vector<CalleeSavedInfo> &CSI,
-    const TargetRegisterInfo *TRI) const {
-  MachineFunction &MF = *MBB.getParent();
-  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
-  unsigned Count = CSI.size();
-  DebugLoc DL;
-  assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!");
-
-  if (MI != MBB.end())
-    DL = MI->getDebugLoc();
-
-  for (unsigned i = 0; i < Count; i += 2) {
-    unsigned idx = Count - i - 2;
-    unsigned Reg1 = CSI[idx].getReg();
-    unsigned Reg2 = CSI[idx + 1].getReg();
-    // GPRs and FPRs are saved in pairs of 64-bit regs. We expect the CSI
-    // list to come in sorted by frame index so that we can issue the store
-    // pair instructions directly. Assert if we see anything otherwise.
-    //
-    // The order of the registers in the list is controlled by
-    // getCalleeSavedRegs(), so they will always be in-order, as well.
-    assert(CSI[idx].getFrameIdx() + 1 == CSI[idx + 1].getFrameIdx() &&
-           "Out of order callee saved regs!");
-    unsigned StrOpc;
-    assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!");
-    assert((i & 1) == 0 && "Odd index for callee-saved reg spill!");
-    // Issue sequence of non-sp increment and pi sp spills for cs regs. The
-    // first spill is a pre-increment that allocates the stack.
-    // For example:
-    //    stp     x22, x21, [sp, #-48]!   // addImm(-6)
-    //    stp     x20, x19, [sp, #16]    // addImm(+2)
-    //    stp     fp, lr, [sp, #32]      // addImm(+4)
-    // Rationale: This sequence saves uop updates compared to a sequence of
-    // pre-increment spills like stp xi,xj,[sp,#-16]!
-    // Note: Similar rational and sequence for restores in epilog.
-    if (ARM64::GPR64RegClass.contains(Reg1)) {
-      assert(ARM64::GPR64RegClass.contains(Reg2) &&
-             "Expected GPR64 callee-saved register pair!");
-      // For first spill use pre-increment store.
-      if (i == 0)
-        StrOpc = ARM64::STPXpre;
-      else
-        StrOpc = ARM64::STPXi;
-    } else if (ARM64::FPR64RegClass.contains(Reg1)) {
-      assert(ARM64::FPR64RegClass.contains(Reg2) &&
-             "Expected FPR64 callee-saved register pair!");
-      // For first spill use pre-increment store.
-      if (i == 0)
-        StrOpc = ARM64::STPDpre;
-      else
-        StrOpc = ARM64::STPDi;
-    } else
-      llvm_unreachable("Unexpected callee saved register!");
-    DEBUG(dbgs() << "CSR spill: (" << TRI->getName(Reg1) << ", "
-                 << TRI->getName(Reg2) << ") -> fi#(" << CSI[idx].getFrameIdx()
-                 << ", " << CSI[idx + 1].getFrameIdx() << ")\n");
-    // Compute offset: i = 0 => offset = -Count;
-    //                 i = 2 => offset = -(Count - 2) + Count = 2 = i; etc.
-    const int Offset = (i == 0) ? -Count : i;
-    assert((Offset >= -64 && Offset <= 63) &&
-           "Offset out of bounds for STP immediate");
-    BuildMI(MBB, MI, DL, TII.get(StrOpc))
-        .addReg(Reg2, getPrologueDeath(MF, Reg2))
-        .addReg(Reg1, getPrologueDeath(MF, Reg1))
-        .addReg(ARM64::SP)
-        .addImm(Offset) // [sp, #offset * 8], where factor * 8 is implicit
-        .setMIFlag(MachineInstr::FrameSetup);
-  }
-  return true;
-}
-
-bool ARM64FrameLowering::restoreCalleeSavedRegisters(
-    MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
-    const std::vector<CalleeSavedInfo> &CSI,
-    const TargetRegisterInfo *TRI) const {
-  MachineFunction &MF = *MBB.getParent();
-  const TargetInstrInfo &TII = *MF.getTarget().getInstrInfo();
-  unsigned Count = CSI.size();
-  DebugLoc DL;
-  assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!");
-
-  if (MI != MBB.end())
-    DL = MI->getDebugLoc();
-
-  for (unsigned i = 0; i < Count; i += 2) {
-    unsigned Reg1 = CSI[i].getReg();
-    unsigned Reg2 = CSI[i + 1].getReg();
-    // GPRs and FPRs are saved in pairs of 64-bit regs. We expect the CSI
-    // list to come in sorted by frame index so that we can issue the store
-    // pair instructions directly. Assert if we see anything otherwise.
-    assert(CSI[i].getFrameIdx() + 1 == CSI[i + 1].getFrameIdx() &&
-           "Out of order callee saved regs!");
-    // Issue sequence of non-sp increment and sp-pi restores for cs regs. Only
-    // the last load is sp-pi post-increment and de-allocates the stack:
-    // For example:
-    //    ldp     fp, lr, [sp, #32]       // addImm(+4)
-    //    ldp     x20, x19, [sp, #16]     // addImm(+2)
-    //    ldp     x22, x21, [sp], #48     // addImm(+6)
-    // Note: see comment in spillCalleeSavedRegisters()
-    unsigned LdrOpc;
-
-    assert((Count & 1) == 0 && "Odd number of callee-saved regs to spill!");
-    assert((i & 1) == 0 && "Odd index for callee-saved reg spill!");
-    if (ARM64::GPR64RegClass.contains(Reg1)) {
-      assert(ARM64::GPR64RegClass.contains(Reg2) &&
-             "Expected GPR64 callee-saved register pair!");
-      if (i == Count - 2)
-        LdrOpc = ARM64::LDPXpost;
-      else
-        LdrOpc = ARM64::LDPXi;
-    } else if (ARM64::FPR64RegClass.contains(Reg1)) {
-      assert(ARM64::FPR64RegClass.contains(Reg2) &&
-             "Expected FPR64 callee-saved register pair!");
-      if (i == Count - 2)
-        LdrOpc = ARM64::LDPDpost;
-      else
-        LdrOpc = ARM64::LDPDi;
-    } else
-      llvm_unreachable("Unexpected callee saved register!");
-    DEBUG(dbgs() << "CSR restore: (" << TRI->getName(Reg1) << ", "
-                 << TRI->getName(Reg2) << ") -> fi#(" << CSI[i].getFrameIdx()
-                 << ", " << CSI[i + 1].getFrameIdx() << ")\n");
-
-    // Compute offset: i = 0 => offset = Count - 2; i = 2 => offset = Count - 4;
-    // etc.
-    const int Offset = (i == Count - 2) ? Count : Count - i - 2;
-    assert((Offset >= -64 && Offset <= 63) &&
-           "Offset out of bounds for LDP immediate");
-    BuildMI(MBB, MI, DL, TII.get(LdrOpc))
-        .addReg(Reg2, getDefRegState(true))
-        .addReg(Reg1, getDefRegState(true))
-        .addReg(ARM64::SP)
-        .addImm(Offset); // [sp], #offset * 8  or [sp, #offset * 8]
-                         // where the factor * 8 is implicit
-  }
-  return true;
-}
-
-void ARM64FrameLowering::processFunctionBeforeCalleeSavedScan(
-    MachineFunction &MF, RegScavenger *RS) const {
-  const ARM64RegisterInfo *RegInfo =
-      static_cast<const ARM64RegisterInfo *>(MF.getTarget().getRegisterInfo());
-  ARM64FunctionInfo *AFI = MF.getInfo<ARM64FunctionInfo>();
-  MachineRegisterInfo *MRI = &MF.getRegInfo();
-  SmallVector<unsigned, 4> UnspilledCSGPRs;
-  SmallVector<unsigned, 4> UnspilledCSFPRs;
-
-  // The frame record needs to be created by saving the appropriate registers
-  if (hasFP(MF)) {
-    MRI->setPhysRegUsed(ARM64::FP);
-    MRI->setPhysRegUsed(ARM64::LR);
-  }
-
-  // Spill the BasePtr if it's used. Do this first thing so that the
-  // getCalleeSavedRegs() below will get the right answer.
-  if (RegInfo->hasBasePointer(MF))
-    MRI->setPhysRegUsed(RegInfo->getBaseRegister());
-
-  // If any callee-saved registers are used, the frame cannot be eliminated.
-  unsigned NumGPRSpilled = 0;
-  unsigned NumFPRSpilled = 0;
-  bool ExtraCSSpill = false;
-  bool CanEliminateFrame = true;
-  DEBUG(dbgs() << "*** processFunctionBeforeCalleeSavedScan\nUsed CSRs:");
-  const uint16_t *CSRegs = RegInfo->getCalleeSavedRegs(&MF);
-
-  // Check pairs of consecutive callee-saved registers.
-  for (unsigned i = 0; CSRegs[i]; i += 2) {
-    assert(CSRegs[i + 1] && "Odd number of callee-saved registers!");
-
-    const unsigned OddReg = CSRegs[i];
-    const unsigned EvenReg = CSRegs[i + 1];
-    assert((ARM64::GPR64RegClass.contains(OddReg) &&
-            ARM64::GPR64RegClass.contains(EvenReg)) ^
-               (ARM64::FPR64RegClass.contains(OddReg) &&
-                ARM64::FPR64RegClass.contains(EvenReg)) &&
-           "Register class mismatch!");
-
-    const bool OddRegUsed = MRI->isPhysRegUsed(OddReg);
-    const bool EvenRegUsed = MRI->isPhysRegUsed(EvenReg);
-
-    // Early exit if none of the registers in the register pair is actually
-    // used.
-    if (!OddRegUsed && !EvenRegUsed) {
-      if (ARM64::GPR64RegClass.contains(OddReg)) {
-        UnspilledCSGPRs.push_back(OddReg);
-        UnspilledCSGPRs.push_back(EvenReg);
-      } else {
-        UnspilledCSFPRs.push_back(OddReg);
-        UnspilledCSFPRs.push_back(EvenReg);
-      }
-      continue;
-    }
-
-    unsigned Reg = ARM64::NoRegister;
-    // If only one of the registers of the register pair is used, make sure to
-    // mark the other one as used as well.
-    if (OddRegUsed ^ EvenRegUsed) {
-      // Find out which register is the additional spill.
-      Reg = OddRegUsed ? EvenReg : OddReg;
-      MRI->setPhysRegUsed(Reg);
-    }
-
-    DEBUG(dbgs() << ' ' << PrintReg(OddReg, RegInfo));
-    DEBUG(dbgs() << ' ' << PrintReg(EvenReg, RegInfo));
-
-    assert(((OddReg == ARM64::LR && EvenReg == ARM64::FP) ||
-            (RegInfo->getEncodingValue(OddReg) + 1 ==
-             RegInfo->getEncodingValue(EvenReg))) &&
-           "Register pair of non-adjacent registers!");
-    if (ARM64::GPR64RegClass.contains(OddReg)) {
-      NumGPRSpilled += 2;
-      // If it's not a reserved register, we can use it in lieu of an
-      // emergency spill slot for the register scavenger.
-      // FIXME: It would be better to instead keep looking and choose another
-      // unspilled register that isn't reserved, if there is one.
-      if (Reg != ARM64::NoRegister && !RegInfo->isReservedReg(MF, Reg))
-        ExtraCSSpill = true;
-    } else
-      NumFPRSpilled += 2;
-
-    CanEliminateFrame = false;
-  }
-
-  // FIXME: Set BigStack if any stack slot references may be out of range.
-  // For now, just conservatively guestimate based on unscaled indexing
-  // range. We'll end up allocating an unnecessary spill slot a lot, but
-  // realistically that's not a big deal at this stage of the game.
-  // The CSR spill slots have not been allocated yet, so estimateStackSize
-  // won't include them.
-  MachineFrameInfo *MFI = MF.getFrameInfo();
-  unsigned CFSize = estimateStackSize(MF) + 8 * (NumGPRSpilled + NumFPRSpilled);
-  DEBUG(dbgs() << "Estimated stack frame size: " << CFSize << " bytes.\n");
-  bool BigStack = (CFSize >= 256);
-  if (BigStack || !CanEliminateFrame || RegInfo->cannotEliminateFrame(MF))
-    AFI->setHasStackFrame(true);
-
-  // Estimate if we might need to scavenge a register at some point in order
-  // to materialize a stack offset. If so, either spill one additional
-  // callee-saved register or reserve a special spill slot to facilitate
-  // register scavenging. If we already spilled an extra callee-saved register
-  // above to keep the number of spills even, we don't need to do anything else
-  // here.
-  if (BigStack && !ExtraCSSpill) {
-
-    // If we're adding a register to spill here, we have to add two of them
-    // to keep the number of regs to spill even.
-    assert(((UnspilledCSGPRs.size() & 1) == 0) && "Odd number of registers!");
-    unsigned Count = 0;
-    while (!UnspilledCSGPRs.empty() && Count < 2) {
-      unsigned Reg = UnspilledCSGPRs.back();
-      UnspilledCSGPRs.pop_back();
-      DEBUG(dbgs() << "Spilling " << PrintReg(Reg, RegInfo)
-                   << " to get a scratch register.\n");
-      MRI->setPhysRegUsed(Reg);
-      ExtraCSSpill = true;
-      ++Count;
-    }
-
-    // If we didn't find an extra callee-saved register to spill, create
-    // an emergency spill slot.
-    if (!ExtraCSSpill) {
-      const TargetRegisterClass *RC = &ARM64::GPR64RegClass;
-      int FI = MFI->CreateStackObject(RC->getSize(), RC->getAlignment(), false);
-      RS->addScavengingFrameIndex(FI);
-      DEBUG(dbgs() << "No available CS registers, allocated fi#" << FI
-                   << " as the emergency spill slot.\n");
-    }
-  }
-}
diff --git a/lib/Target/ARM64/ARM64FrameLowering.h b/lib/Target/ARM64/ARM64FrameLowering.h
deleted file mode 100644
index 02edcdb..0000000
--- a/lib/Target/ARM64/ARM64FrameLowering.h
+++ /dev/null
@@ -1,75 +0,0 @@
-//===-- ARM64FrameLowering.h - TargetFrameLowering for ARM64 ----*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-//
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef ARM64_FRAMELOWERING_H
-#define ARM64_FRAMELOWERING_H
-
-#include "llvm/Target/TargetFrameLowering.h"
-
-namespace llvm {
-
-class ARM64Subtarget;
-class ARM64TargetMachine;
-
-class ARM64FrameLowering : public TargetFrameLowering {
-  const ARM64TargetMachine &TM;
-
-public:
-  explicit ARM64FrameLowering(const ARM64TargetMachine &TM,
-                              const ARM64Subtarget &STI)
-      : TargetFrameLowering(StackGrowsDown, 16, 0, 16,
-                            false /*StackRealignable*/),
-        TM(TM) {}
-
-  void emitCalleeSavedFrameMoves(MachineBasicBlock &MBB,
-                                 MachineBasicBlock::iterator MBBI,
-                                 unsigned FramePtr) const;
-
-  void eliminateCallFramePseudoInstr(MachineFunction &MF,
-                                     MachineBasicBlock &MBB,
-                                     MachineBasicBlock::iterator I) const;
-
-  /// emitProlog/emitEpilog - These methods insert prolog and epilog code into
-  /// the function.
-  void emitPrologue(MachineFunction &MF) const;
-  void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;
-
-  int getFrameIndexOffset(const MachineFunction &MF, int FI) const;
-  int getFrameIndexReference(const MachineFunction &MF, int FI,
-                             unsigned &FrameReg) const;
-  int resolveFrameIndexReference(const MachineFunction &MF, int FI,
-                                 unsigned &FrameReg,
-                                 bool PreferFP = false) const;
-  bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
-                                 MachineBasicBlock::iterator MI,
-                                 const std::vector<CalleeSavedInfo> &CSI,
-                                 const TargetRegisterInfo *TRI) const;
-
-  bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
-                                   MachineBasicBlock::iterator MI,
-                                   const std::vector<CalleeSavedInfo> &CSI,
-                                   const TargetRegisterInfo *TRI) const;
-
-  /// \brief Can this function use the red zone for local allocations.
-  bool canUseRedZone(const MachineFunction &MF) const;
-
-  bool hasFP(const MachineFunction &MF) const;
-  bool hasReservedCallFrame(const MachineFunction &MF) const;
-
-  void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
-                                            RegScavenger *RS) const;
-};
-
-} // End llvm namespace
-
-#endif
diff --git a/lib/Target/ARM64/ARM64ISelDAGToDAG.cpp b/lib/Target/ARM64/ARM64ISelDAGToDAG.cpp
deleted file mode 100644
index 2e234c9..0000000
--- a/lib/Target/ARM64/ARM64ISelDAGToDAG.cpp
+++ /dev/null
@@ -1,2381 +0,0 @@
-//===-- ARM64ISelDAGToDAG.cpp - A dag to dag inst selector for ARM64 ------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines an instruction selector for the ARM64 target.
-//
-//===----------------------------------------------------------------------===//
-
-#define DEBUG_TYPE "arm64-isel"
-#include "ARM64TargetMachine.h"
-#include "MCTargetDesc/ARM64AddressingModes.h"
-#include "llvm/CodeGen/SelectionDAGISel.h"
-#include "llvm/IR/Function.h" // To access function attributes.
-#include "llvm/IR/GlobalValue.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/MathExtras.h"
-#include "llvm/Support/raw_ostream.h"
-
-using namespace llvm;
-
-//===--------------------------------------------------------------------===//
-/// ARM64DAGToDAGISel - ARM64 specific code to select ARM64 machine
-/// instructions for SelectionDAG operations.
-///
-namespace {
-
-class ARM64DAGToDAGISel : public SelectionDAGISel {
-  ARM64TargetMachine &TM;
-
-  /// Subtarget - Keep a pointer to the ARM64Subtarget around so that we can
-  /// make the right decision when generating code for different targets.
-  const ARM64Subtarget *Subtarget;
-
-  bool ForCodeSize;
-
-public:
-  explicit ARM64DAGToDAGISel(ARM64TargetMachine &tm, CodeGenOpt::Level OptLevel)
-      : SelectionDAGISel(tm, OptLevel), TM(tm),
-        Subtarget(&TM.getSubtarget<ARM64Subtarget>()), ForCodeSize(false) {}
-
-  virtual const char *getPassName() const {
-    return "ARM64 Instruction Selection";
-  }
-
-  virtual bool runOnMachineFunction(MachineFunction &MF) {
-    AttributeSet FnAttrs = MF.getFunction()->getAttributes();
-    ForCodeSize =
-        FnAttrs.hasAttribute(AttributeSet::FunctionIndex,
-                             Attribute::OptimizeForSize) ||
-        FnAttrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::MinSize);
-    return SelectionDAGISel::runOnMachineFunction(MF);
-  }
-
-  SDNode *Select(SDNode *Node);
-
-  /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
-  /// inline asm expressions.
-  virtual bool SelectInlineAsmMemoryOperand(const SDValue &Op,
-                                            char ConstraintCode,
-                                            std::vector<SDValue> &OutOps);
-
-  SDNode *SelectMLAV64LaneV128(SDNode *N);
-  SDNode *SelectMULLV64LaneV128(unsigned IntNo, SDNode *N);
-  bool SelectArithExtendedRegister(SDValue N, SDValue &Reg, SDValue &Shift);
-  bool SelectArithImmed(SDValue N, SDValue &Val, SDValue &Shift);
-  bool SelectNegArithImmed(SDValue N, SDValue &Val, SDValue &Shift);
-  bool SelectArithShiftedRegister(SDValue N, SDValue &Reg, SDValue &Shift) {
-    return SelectShiftedRegister(N, false, Reg, Shift);
-  }
-  bool SelectLogicalShiftedRegister(SDValue N, SDValue &Reg, SDValue &Shift) {
-    return SelectShiftedRegister(N, true, Reg, Shift);
-  }
-  bool SelectAddrModeIndexed8(SDValue N, SDValue &Base, SDValue &OffImm) {
-    return SelectAddrModeIndexed(N, 1, Base, OffImm);
-  }
-  bool SelectAddrModeIndexed16(SDValue N, SDValue &Base, SDValue &OffImm) {
-    return SelectAddrModeIndexed(N, 2, Base, OffImm);
-  }
-  bool SelectAddrModeIndexed32(SDValue N, SDValue &Base, SDValue &OffImm) {
-    return SelectAddrModeIndexed(N, 4, Base, OffImm);
-  }
-  bool SelectAddrModeIndexed64(SDValue N, SDValue &Base, SDValue &OffImm) {
-    return SelectAddrModeIndexed(N, 8, Base, OffImm);
-  }
-  bool SelectAddrModeIndexed128(SDValue N, SDValue &Base, SDValue &OffImm) {
-    return SelectAddrModeIndexed(N, 16, Base, OffImm);
-  }
-  bool SelectAddrModeUnscaled8(SDValue N, SDValue &Base, SDValue &OffImm) {
-    return SelectAddrModeUnscaled(N, 1, Base, OffImm);
-  }
-  bool SelectAddrModeUnscaled16(SDValue N, SDValue &Base, SDValue &OffImm) {
-    return SelectAddrModeUnscaled(N, 2, Base, OffImm);
-  }
-  bool SelectAddrModeUnscaled32(SDValue N, SDValue &Base, SDValue &OffImm) {
-    return SelectAddrModeUnscaled(N, 4, Base, OffImm);
-  }
-  bool SelectAddrModeUnscaled64(SDValue N, SDValue &Base, SDValue &OffImm) {
-    return SelectAddrModeUnscaled(N, 8, Base, OffImm);
-  }
-  bool SelectAddrModeUnscaled128(SDValue N, SDValue &Base, SDValue &OffImm) {
-    return SelectAddrModeUnscaled(N, 16, Base, OffImm);
-  }
-
-  bool SelectAddrModeRO8(SDValue N, SDValue &Base, SDValue &Offset,
-                         SDValue &Imm) {
-    return SelectAddrModeRO(N, 1, Base, Offset, Imm);
-  }
-  bool SelectAddrModeRO16(SDValue N, SDValue &Base, SDValue &Offset,
-                          SDValue &Imm) {
-    return SelectAddrModeRO(N, 2, Base, Offset, Imm);
-  }
-  bool SelectAddrModeRO32(SDValue N, SDValue &Base, SDValue &Offset,
-                          SDValue &Imm) {
-    return SelectAddrModeRO(N, 4, Base, Offset, Imm);
-  }
-  bool SelectAddrModeRO64(SDValue N, SDValue &Base, SDValue &Offset,
-                          SDValue &Imm) {
-    return SelectAddrModeRO(N, 8, Base, Offset, Imm);
-  }
-  bool SelectAddrModeRO128(SDValue N, SDValue &Base, SDValue &Offset,
-                           SDValue &Imm) {
-    return SelectAddrModeRO(N, 16, Base, Offset, Imm);
-  }
-  bool SelectAddrModeNoIndex(SDValue N, SDValue &Val);
-
-  /// Form sequences of consecutive 64/128-bit registers for use in NEON
-  /// instructions making use of a vector-list (e.g. ldN, tbl). Vecs must have
-  /// between 1 and 4 elements. If it contains a single element that is returned
-  /// unchanged; otherwise a REG_SEQUENCE value is returned.
-  SDValue createDTuple(ArrayRef<SDValue> Vecs);
-  SDValue createQTuple(ArrayRef<SDValue> Vecs);
-
-  /// Generic helper for the createDTuple/createQTuple
-  /// functions. Those should almost always be called instead.
-  SDValue createTuple(ArrayRef<SDValue> Vecs, unsigned RegClassIDs[],
-                      unsigned SubRegs[]);
-
-  SDNode *SelectTable(SDNode *N, unsigned NumVecs, unsigned Opc, bool isExt);
-
-  SDNode *SelectIndexedLoad(SDNode *N, bool &Done);
-
-  SDNode *SelectLoad(SDNode *N, unsigned NumVecs, unsigned Opc,
-                     unsigned SubRegIdx);
-  SDNode *SelectLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc);
-
-  SDNode *SelectStore(SDNode *N, unsigned NumVecs, unsigned Opc);
-  SDNode *SelectStoreLane(SDNode *N, unsigned NumVecs, unsigned Opc);
-
-  SDNode *SelectSIMDAddSubNarrowing(unsigned IntNo, SDNode *Node);
-  SDNode *SelectSIMDXtnNarrowing(unsigned IntNo, SDNode *Node);
-
-  SDNode *SelectAtomic(SDNode *Node, unsigned Op8, unsigned Op16, unsigned Op32,
-                       unsigned Op64);
-
-  SDNode *SelectBitfieldExtractOp(SDNode *N);
-  SDNode *SelectBitfieldInsertOp(SDNode *N);
-
-  SDNode *SelectLIBM(SDNode *N);
-
-// Include the pieces autogenerated from the target description.
-#include "ARM64GenDAGISel.inc"
-
-private:
-  bool SelectShiftedRegister(SDValue N, bool AllowROR, SDValue &Reg,
-                             SDValue &Shift);
-  bool SelectAddrModeIndexed(SDValue N, unsigned Size, SDValue &Base,
-                             SDValue &OffImm);
-  bool SelectAddrModeUnscaled(SDValue N, unsigned Size, SDValue &Base,
-                              SDValue &OffImm);
-  bool SelectAddrModeRO(SDValue N, unsigned Size, SDValue &Base,
-                        SDValue &Offset, SDValue &Imm);
-  bool isWorthFolding(SDValue V) const;
-  bool SelectExtendedSHL(SDValue N, unsigned Size, SDValue &Offset,
-                         SDValue &Imm);
-};
-} // end anonymous namespace
-
-/// isIntImmediate - This method tests to see if the node is a constant
-/// operand. If so Imm will receive the 32-bit value.
-static bool isIntImmediate(const SDNode *N, uint64_t &Imm) {
-  if (const ConstantSDNode *C = dyn_cast<const ConstantSDNode>(N)) {
-    Imm = C->getZExtValue();
-    return true;
-  }
-  return false;
-}
-
-// isIntImmediate - This method tests to see if a constant operand.
-// If so Imm will receive the value.
-static bool isIntImmediate(SDValue N, uint64_t &Imm) {
-  return isIntImmediate(N.getNode(), Imm);
-}
-
-// isOpcWithIntImmediate - This method tests to see if the node is a specific
-// opcode and that it has a immediate integer right operand.
-// If so Imm will receive the 32 bit value.
-static bool isOpcWithIntImmediate(const SDNode *N, unsigned Opc,
-                                  uint64_t &Imm) {
-  return N->getOpcode() == Opc &&
-         isIntImmediate(N->getOperand(1).getNode(), Imm);
-}
-
-bool ARM64DAGToDAGISel::SelectAddrModeNoIndex(SDValue N, SDValue &Val) {
-  EVT ValTy = N.getValueType();
-  if (ValTy != MVT::i64)
-    return false;
-  Val = N;
-  return true;
-}
-
-bool ARM64DAGToDAGISel::SelectInlineAsmMemoryOperand(
-    const SDValue &Op, char ConstraintCode, std::vector<SDValue> &OutOps) {
-  assert(ConstraintCode == 'm' && "unexpected asm memory constraint");
-  // Require the address to be in a register.  That is safe for all ARM64
-  // variants and it is hard to do anything much smarter without knowing
-  // how the operand is used.
-  OutOps.push_back(Op);
-  return false;
-}
-
-/// SelectArithImmed - Select an immediate value that can be represented as
-/// a 12-bit value shifted left by either 0 or 12.  If so, return true with
-/// Val set to the 12-bit value and Shift set to the shifter operand.
-bool ARM64DAGToDAGISel::SelectArithImmed(SDValue N, SDValue &Val,
-                                         SDValue &Shift) {
-  // This function is called from the addsub_shifted_imm ComplexPattern,
-  // which lists [imm] as the list of opcode it's interested in, however
-  // we still need to check whether the operand is actually an immediate
-  // here because the ComplexPattern opcode list is only used in
-  // root-level opcode matching.
-  if (!isa<ConstantSDNode>(N.getNode()))
-    return false;
-
-  uint64_t Immed = cast<ConstantSDNode>(N.getNode())->getZExtValue();
-  unsigned ShiftAmt;
-
-  if (Immed >> 12 == 0) {
-    ShiftAmt = 0;
-  } else if ((Immed & 0xfff) == 0 && Immed >> 24 == 0) {
-    ShiftAmt = 12;
-    Immed = Immed >> 12;
-  } else
-    return false;
-
-  unsigned ShVal = ARM64_AM::getShifterImm(ARM64_AM::LSL, ShiftAmt);
-  Val = CurDAG->getTargetConstant(Immed, MVT::i32);
-  Shift = CurDAG->getTargetConstant(ShVal, MVT::i32);
-  return true;
-}
-
-/// SelectNegArithImmed - As above, but negates the value before trying to
-/// select it.
-bool ARM64DAGToDAGISel::SelectNegArithImmed(SDValue N, SDValue &Val,
-                                            SDValue &Shift) {
-  // This function is called from the addsub_shifted_imm ComplexPattern,
-  // which lists [imm] as the list of opcode it's interested in, however
-  // we still need to check whether the operand is actually an immediate
-  // here because the ComplexPattern opcode list is only used in
-  // root-level opcode matching.
-  if (!isa<ConstantSDNode>(N.getNode()))
-    return false;
-
-  // The immediate operand must be a 24-bit zero-extended immediate.
-  uint64_t Immed = cast<ConstantSDNode>(N.getNode())->getZExtValue();
-
-  // This negation is almost always valid, but "cmp wN, #0" and "cmn wN, #0"
-  // have the opposite effect on the C flag, so this pattern mustn't match under
-  // those circumstances.
-  if (Immed == 0)
-    return false;
-
-  if (N.getValueType() == MVT::i32)
-    Immed = ~((uint32_t)Immed) + 1;
-  else
-    Immed = ~Immed + 1ULL;
-  if (Immed & 0xFFFFFFFFFF000000ULL)
-    return false;
-
-  Immed &= 0xFFFFFFULL;
-  return SelectArithImmed(CurDAG->getConstant(Immed, MVT::i32), Val, Shift);
-}
-
-/// getShiftTypeForNode - Translate a shift node to the corresponding
-/// ShiftType value.
-static ARM64_AM::ShiftType getShiftTypeForNode(SDValue N) {
-  switch (N.getOpcode()) {
-  default:
-    return ARM64_AM::InvalidShift;
-  case ISD::SHL:
-    return ARM64_AM::LSL;
-  case ISD::SRL:
-    return ARM64_AM::LSR;
-  case ISD::SRA:
-    return ARM64_AM::ASR;
-  case ISD::ROTR:
-    return ARM64_AM::ROR;
-  }
-}
-
-/// \brief Determine wether it is worth to fold V into an extended register.
-bool ARM64DAGToDAGISel::isWorthFolding(SDValue V) const {
-  // it hurts if the a value is used at least twice, unless we are optimizing
-  // for code size.
-  if (ForCodeSize || V.hasOneUse())
-    return true;
-  return false;
-}
-
-/// SelectShiftedRegister - Select a "shifted register" operand.  If the value
-/// is not shifted, set the Shift operand to default of "LSL 0".  The logical
-/// instructions allow the shifted register to be rotated, but the arithmetic
-/// instructions do not.  The AllowROR parameter specifies whether ROR is
-/// supported.
-bool ARM64DAGToDAGISel::SelectShiftedRegister(SDValue N, bool AllowROR,
-                                              SDValue &Reg, SDValue &Shift) {
-  ARM64_AM::ShiftType ShType = getShiftTypeForNode(N);
-  if (ShType == ARM64_AM::InvalidShift)
-    return false;
-  if (!AllowROR && ShType == ARM64_AM::ROR)
-    return false;
-
-  if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
-    unsigned BitSize = N.getValueType().getSizeInBits();
-    unsigned Val = RHS->getZExtValue() & (BitSize - 1);
-    unsigned ShVal = ARM64_AM::getShifterImm(ShType, Val);
-
-    Reg = N.getOperand(0);
-    Shift = CurDAG->getTargetConstant(ShVal, MVT::i32);
-    return isWorthFolding(N);
-  }
-
-  return false;
-}
-
-/// getExtendTypeForNode - Translate an extend node to the corresponding
-/// ExtendType value.
-static ARM64_AM::ExtendType getExtendTypeForNode(SDValue N,
-                                                 bool IsLoadStore = false) {
-  if (N.getOpcode() == ISD::SIGN_EXTEND ||
-      N.getOpcode() == ISD::SIGN_EXTEND_INREG) {
-    EVT SrcVT;
-    if (N.getOpcode() == ISD::SIGN_EXTEND_INREG)
-      SrcVT = cast<VTSDNode>(N.getOperand(1))->getVT();
-    else
-      SrcVT = N.getOperand(0).getValueType();
-
-    if (!IsLoadStore && SrcVT == MVT::i8)
-      return ARM64_AM::SXTB;
-    else if (!IsLoadStore && SrcVT == MVT::i16)
-      return ARM64_AM::SXTH;
-    else if (SrcVT == MVT::i32)
-      return ARM64_AM::SXTW;
-    else if (SrcVT == MVT::i64)
-      return ARM64_AM::SXTX;
-
-    return ARM64_AM::InvalidExtend;
-  } else if (N.getOpcode() == ISD::ZERO_EXTEND ||
-             N.getOpcode() == ISD::ANY_EXTEND) {
-    EVT SrcVT = N.getOperand(0).getValueType();
-    if (!IsLoadStore && SrcVT == MVT::i8)
-      return ARM64_AM::UXTB;
-    else if (!IsLoadStore && SrcVT == MVT::i16)
-      return ARM64_AM::UXTH;
-    else if (SrcVT == MVT::i32)
-      return ARM64_AM::UXTW;
-    else if (SrcVT == MVT::i64)
-      return ARM64_AM::UXTX;
-
-    return ARM64_AM::InvalidExtend;
-  } else if (N.getOpcode() == ISD::AND) {
-    ConstantSDNode *CSD = dyn_cast<ConstantSDNode>(N.getOperand(1));
-    if (!CSD)
-      return ARM64_AM::InvalidExtend;
-    uint64_t AndMask = CSD->getZExtValue();
-
-    switch (AndMask) {
-    default:
-      return ARM64_AM::InvalidExtend;
-    case 0xFF:
-      return !IsLoadStore ? ARM64_AM::UXTB : ARM64_AM::InvalidExtend;
-    case 0xFFFF:
-      return !IsLoadStore ? ARM64_AM::UXTH : ARM64_AM::InvalidExtend;
-    case 0xFFFFFFFF:
-      return ARM64_AM::UXTW;
-    }
-  }
-
-  return ARM64_AM::InvalidExtend;
-}
-
-// Helper for SelectMLAV64LaneV128 - Recognize high lane extracts.
-static bool checkHighLaneIndex(SDNode *DL, SDValue &LaneOp, int &LaneIdx) {
-  if (DL->getOpcode() != ARM64ISD::DUPLANE16 &&
-      DL->getOpcode() != ARM64ISD::DUPLANE32)
-    return false;
-
-  SDValue SV = DL->getOperand(0);
-  if (SV.getOpcode() != ISD::INSERT_SUBVECTOR)
-    return false;
-
-  SDValue EV = SV.getOperand(1);
-  if (EV.getOpcode() != ISD::EXTRACT_SUBVECTOR)
-    return false;
-
-  ConstantSDNode *DLidx = cast<ConstantSDNode>(DL->getOperand(1).getNode());
-  ConstantSDNode *EVidx = cast<ConstantSDNode>(EV.getOperand(1).getNode());
-  LaneIdx = DLidx->getSExtValue() + EVidx->getSExtValue();
-  LaneOp = EV.getOperand(0);
-
-  return true;
-}
-
-// Helper for SelectOpcV64LaneV128 - Recogzine operatinos where one operand is a
-// high lane extract.
-static bool checkV64LaneV128(SDValue Op0, SDValue Op1, SDValue &StdOp,
-                             SDValue &LaneOp, int &LaneIdx) {
-
-  if (!checkHighLaneIndex(Op0.getNode(), LaneOp, LaneIdx)) {
-    std::swap(Op0, Op1);
-    if (!checkHighLaneIndex(Op0.getNode(), LaneOp, LaneIdx))
-      return false;
-  }
-  StdOp = Op1;
-  return true;
-}
-
-/// SelectMLAV64LaneV128 - ARM64 supports 64-bit vector MLAs (v4i16 and v2i32)
-/// where one multiplicand is a lane in the upper half of a 128-bit vector.
-/// Recognize and select this so that we don't emit unnecessary lane extracts.
-SDNode *ARM64DAGToDAGISel::SelectMLAV64LaneV128(SDNode *N) {
-  SDValue Op0 = N->getOperand(0);
-  SDValue Op1 = N->getOperand(1);
-  SDValue MLAOp1;   // Will hold ordinary multiplicand for MLA.
-  SDValue MLAOp2;   // Will hold lane-accessed multiplicand for MLA.
-  int LaneIdx = -1; // Will hold the lane index.
-
-  if (Op1.getOpcode() != ISD::MUL ||
-      !checkV64LaneV128(Op1.getOperand(0), Op1.getOperand(1), MLAOp1, MLAOp2,
-                        LaneIdx)) {
-    std::swap(Op0, Op1);
-    if (Op1.getOpcode() != ISD::MUL ||
-        !checkV64LaneV128(Op1.getOperand(0), Op1.getOperand(1), MLAOp1, MLAOp2,
-                          LaneIdx))
-      return 0;
-  }
-
-  SDValue LaneIdxVal = CurDAG->getTargetConstant(LaneIdx, MVT::i64);
-
-  SDValue Ops[] = { Op0, MLAOp1, MLAOp2, LaneIdxVal };
-
-  unsigned MLAOpc = ~0U;
-
-  switch (N->getSimpleValueType(0).SimpleTy) {
-  default:
-    llvm_unreachable("Unrecognized MLA.");
-  case MVT::v4i16:
-    MLAOpc = ARM64::MLAv4i16_indexed;
-    break;
-  case MVT::v2i32:
-    MLAOpc = ARM64::MLAv2i32_indexed;
-    break;
-  }
-
-  return CurDAG->getMachineNode(MLAOpc, SDLoc(N), N->getValueType(0), Ops);
-}
-
-SDNode *ARM64DAGToDAGISel::SelectMULLV64LaneV128(unsigned IntNo, SDNode *N) {
-  SDValue SMULLOp0;
-  SDValue SMULLOp1;
-  int LaneIdx;
-
-  if (!checkV64LaneV128(N->getOperand(1), N->getOperand(2), SMULLOp0, SMULLOp1,
-                        LaneIdx))
-    return 0;
-
-  SDValue LaneIdxVal = CurDAG->getTargetConstant(LaneIdx, MVT::i64);
-
-  SDValue Ops[] = { SMULLOp0, SMULLOp1, LaneIdxVal };
-
-  unsigned SMULLOpc = ~0U;
-
-  if (IntNo == Intrinsic::arm64_neon_smull) {
-    switch (N->getSimpleValueType(0).SimpleTy) {
-    default:
-      llvm_unreachable("Unrecognized SMULL.");
-    case MVT::v4i32:
-      SMULLOpc = ARM64::SMULLv4i16_indexed;
-      break;
-    case MVT::v2i64:
-      SMULLOpc = ARM64::SMULLv2i32_indexed;
-      break;
-    }
-  } else if (IntNo == Intrinsic::arm64_neon_umull) {
-    switch (N->getSimpleValueType(0).SimpleTy) {
-    default:
-      llvm_unreachable("Unrecognized SMULL.");
-    case MVT::v4i32:
-      SMULLOpc = ARM64::UMULLv4i16_indexed;
-      break;
-    case MVT::v2i64:
-      SMULLOpc = ARM64::UMULLv2i32_indexed;
-      break;
-    }
-  } else
-    llvm_unreachable("Unrecognized intrinsic.");
-
-  return CurDAG->getMachineNode(SMULLOpc, SDLoc(N), N->getValueType(0), Ops);
-}
-
-/// SelectArithExtendedRegister - Select a "extended register" operand.  This
-/// operand folds in an extend followed by an optional left shift.
-bool ARM64DAGToDAGISel::SelectArithExtendedRegister(SDValue N, SDValue &Reg,
-                                                    SDValue &Shift) {
-  unsigned ShiftVal = 0;
-  ARM64_AM::ExtendType Ext;
-
-  if (N.getOpcode() == ISD::SHL) {
-    ConstantSDNode *CSD = dyn_cast<ConstantSDNode>(N.getOperand(1));
-    if (!CSD)
-      return false;
-    ShiftVal = CSD->getZExtValue();
-    if ((ShiftVal & 0x3) != ShiftVal)
-      return false;
-
-    Ext = getExtendTypeForNode(N.getOperand(0));
-    if (Ext == ARM64_AM::InvalidExtend)
-      return false;
-
-    Reg = N.getOperand(0).getOperand(0);
-  } else {
-    Ext = getExtendTypeForNode(N);
-    if (Ext == ARM64_AM::InvalidExtend)
-      return false;
-
-    Reg = N.getOperand(0);
-  }
-
-  // ARM64 mandates that the RHS of the operation must use the smallest
-  // register classs that could contain the size being extended from.  Thus,
-  // if we're folding a (sext i8), we need the RHS to be a GPR32, even though
-  // there might not be an actual 32-bit value in the program.  We can
-  // (harmlessly) synthesize one by injected an EXTRACT_SUBREG here.
-  if (Reg.getValueType() == MVT::i64 && Ext != ARM64_AM::UXTX &&
-      Ext != ARM64_AM::SXTX) {
-    SDValue SubReg = CurDAG->getTargetConstant(ARM64::sub_32, MVT::i32);
-    MachineSDNode *Node = CurDAG->getMachineNode(
-        TargetOpcode::EXTRACT_SUBREG, SDLoc(N), MVT::i32, Reg, SubReg);
-    Reg = SDValue(Node, 0);
-  }
-
-  Shift = CurDAG->getTargetConstant(getArithExtendImm(Ext, ShiftVal), MVT::i32);
-  return isWorthFolding(N);
-}
-
-/// SelectAddrModeIndexed - Select a "register plus scaled unsigned 12-bit
-/// immediate" address.  The "Size" argument is the size in bytes of the memory
-/// reference, which determines the scale.
-bool ARM64DAGToDAGISel::SelectAddrModeIndexed(SDValue N, unsigned Size,
-                                              SDValue &Base, SDValue &OffImm) {
-  const TargetLowering *TLI = getTargetLowering();
-  if (N.getOpcode() == ISD::FrameIndex) {
-    int FI = cast<FrameIndexSDNode>(N)->getIndex();
-    Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy());
-    OffImm = CurDAG->getTargetConstant(0, MVT::i64);
-    return true;
-  }
-
-  if (N.getOpcode() == ARM64ISD::ADDlow) {
-    GlobalAddressSDNode *GAN =
-        dyn_cast<GlobalAddressSDNode>(N.getOperand(1).getNode());
-    Base = N.getOperand(0);
-    OffImm = N.getOperand(1);
-    if (!GAN)
-      return true;
-
-    const GlobalValue *GV = GAN->getGlobal();
-    unsigned Alignment = GV->getAlignment();
-    const DataLayout *DL = TLI->getDataLayout();
-    if (Alignment == 0 && !Subtarget->isTargetDarwin())
-      Alignment = DL->getABITypeAlignment(GV->getType()->getElementType());
-
-    if (Alignment >= Size)
-      return true;
-  }
-
-  if (CurDAG->isBaseWithConstantOffset(N)) {
-    if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
-      int64_t RHSC = (int64_t)RHS->getZExtValue();
-      unsigned Scale = Log2_32(Size);
-      if ((RHSC & (Size - 1)) == 0 && RHSC >= 0 && RHSC < (0x1000 << Scale)) {
-        Base = N.getOperand(0);
-        if (Base.getOpcode() == ISD::FrameIndex) {
-          int FI = cast<FrameIndexSDNode>(Base)->getIndex();
-          Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy());
-        }
-        OffImm = CurDAG->getTargetConstant(RHSC >> Scale, MVT::i64);
-        return true;
-      }
-    }
-  }
-
-  // Before falling back to our general case, check if the unscaled
-  // instructions can handle this. If so, that's preferable.
-  if (SelectAddrModeUnscaled(N, Size, Base, OffImm))
-    return false;
-
-  // Base only. The address will be materialized into a register before
-  // the memory is accessed.
-  //    add x0, Xbase, #offset
-  //    ldr x0, [x0]
-  Base = N;
-  OffImm = CurDAG->getTargetConstant(0, MVT::i64);
-  return true;
-}
-
-/// SelectAddrModeUnscaled - Select a "register plus unscaled signed 9-bit
-/// immediate" address.  This should only match when there is an offset that
-/// is not valid for a scaled immediate addressing mode.  The "Size" argument
-/// is the size in bytes of the memory reference, which is needed here to know
-/// what is valid for a scaled immediate.
-bool ARM64DAGToDAGISel::SelectAddrModeUnscaled(SDValue N, unsigned Size,
-                                               SDValue &Base, SDValue &OffImm) {
-  if (!CurDAG->isBaseWithConstantOffset(N))
-    return false;
-  if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
-    int64_t RHSC = RHS->getSExtValue();
-    // If the offset is valid as a scaled immediate, don't match here.
-    if ((RHSC & (Size - 1)) == 0 && RHSC >= 0 &&
-        RHSC < (0x1000 << Log2_32(Size)))
-      return false;
-    if (RHSC >= -256 && RHSC < 256) {
-      Base = N.getOperand(0);
-      if (Base.getOpcode() == ISD::FrameIndex) {
-        int FI = cast<FrameIndexSDNode>(Base)->getIndex();
-        const TargetLowering *TLI = getTargetLowering();
-        Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy());
-      }
-      OffImm = CurDAG->getTargetConstant(RHSC, MVT::i64);
-      return true;
-    }
-  }
-  return false;
-}
-
-static SDValue Widen(SelectionDAG *CurDAG, SDValue N) {
-  SDValue SubReg = CurDAG->getTargetConstant(ARM64::sub_32, MVT::i32);
-  SDValue ImpDef = SDValue(
-      CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, SDLoc(N), MVT::i64),
-      0);
-  MachineSDNode *Node = CurDAG->getMachineNode(
-      TargetOpcode::INSERT_SUBREG, SDLoc(N), MVT::i64, ImpDef, N, SubReg);
-  return SDValue(Node, 0);
-}
-
-static SDValue WidenIfNeeded(SelectionDAG *CurDAG, SDValue N) {
-  if (N.getValueType() == MVT::i32) {
-    return Widen(CurDAG, N);
-  }
-
-  return N;
-}
-
-/// \brief Check if the given SHL node (\p N), can be used to form an
-/// extended register for an addressing mode.
-bool ARM64DAGToDAGISel::SelectExtendedSHL(SDValue N, unsigned Size,
-                                          SDValue &Offset, SDValue &Imm) {
-  assert(N.getOpcode() == ISD::SHL && "Invalid opcode.");
-  ConstantSDNode *CSD = dyn_cast<ConstantSDNode>(N.getOperand(1));
-  if (CSD && (CSD->getZExtValue() & 0x7) == CSD->getZExtValue()) {
-
-    ARM64_AM::ExtendType Ext = getExtendTypeForNode(N.getOperand(0), true);
-    if (Ext == ARM64_AM::InvalidExtend) {
-      Ext = ARM64_AM::UXTX;
-      Offset = WidenIfNeeded(CurDAG, N.getOperand(0));
-    } else {
-      Offset = WidenIfNeeded(CurDAG, N.getOperand(0).getOperand(0));
-    }
-
-    unsigned LegalShiftVal = Log2_32(Size);
-    unsigned ShiftVal = CSD->getZExtValue();
-
-    if (ShiftVal != 0 && ShiftVal != LegalShiftVal)
-      return false;
-
-    Imm = CurDAG->getTargetConstant(
-        ARM64_AM::getMemExtendImm(Ext, ShiftVal != 0), MVT::i32);
-    if (isWorthFolding(N))
-      return true;
-  }
-  return false;
-}
-
-bool ARM64DAGToDAGISel::SelectAddrModeRO(SDValue N, unsigned Size,
-                                         SDValue &Base, SDValue &Offset,
-                                         SDValue &Imm) {
-  if (N.getOpcode() != ISD::ADD)
-    return false;
-  SDValue LHS = N.getOperand(0);
-  SDValue RHS = N.getOperand(1);
-
-  // We don't want to match immediate adds here, because they are better lowered
-  // to the register-immediate addressing modes.
-  if (isa<ConstantSDNode>(LHS) || isa<ConstantSDNode>(RHS))
-    return false;
-
-  // Check if this particular node is reused in any non-memory related
-  // operation.  If yes, do not try to fold this node into the address
-  // computation, since the computation will be kept.
-  const SDNode *Node = N.getNode();
-  for (SDNode::use_iterator UI = Node->use_begin(), UE = Node->use_end();
-       UI != UE; ++UI) {
-    if (!isa<MemSDNode>(*UI))
-      return false;
-  }
-
-  // Remember if it is worth folding N when it produces extended register.
-  bool IsExtendedRegisterWorthFolding = isWorthFolding(N);
-
-  // Try to match a shifted extend on the RHS.
-  if (IsExtendedRegisterWorthFolding && RHS.getOpcode() == ISD::SHL &&
-      SelectExtendedSHL(RHS, Size, Offset, Imm)) {
-    Base = LHS;
-    return true;
-  }
-
-  // Try to match a shifted extend on the LHS.
-  if (IsExtendedRegisterWorthFolding && LHS.getOpcode() == ISD::SHL &&
-      SelectExtendedSHL(LHS, Size, Offset, Imm)) {
-    Base = RHS;
-    return true;
-  }
-
-  ARM64_AM::ExtendType Ext = ARM64_AM::UXTX;
-  // Try to match an unshifted extend on the LHS.
-  if (IsExtendedRegisterWorthFolding &&
-      (Ext = getExtendTypeForNode(LHS, true)) != ARM64_AM::InvalidExtend) {
-    Base = RHS;
-    Offset = WidenIfNeeded(CurDAG, LHS.getOperand(0));
-    Imm = CurDAG->getTargetConstant(ARM64_AM::getMemExtendImm(Ext, false),
-                                    MVT::i32);
-    if (isWorthFolding(LHS))
-      return true;
-  }
-
-  // Try to match an unshifted extend on the RHS.
-  if (IsExtendedRegisterWorthFolding &&
-      (Ext = getExtendTypeForNode(RHS, true)) != ARM64_AM::InvalidExtend) {
-    Base = LHS;
-    Offset = WidenIfNeeded(CurDAG, RHS.getOperand(0));
-    Imm = CurDAG->getTargetConstant(ARM64_AM::getMemExtendImm(Ext, false),
-                                    MVT::i32);
-    if (isWorthFolding(RHS))
-      return true;
-  }
-
-  // Match any non-shifted, non-extend, non-immediate add expression.
-  Base = LHS;
-  Offset = WidenIfNeeded(CurDAG, RHS);
-  Ext = ARM64_AM::UXTX;
-  Imm = CurDAG->getTargetConstant(ARM64_AM::getMemExtendImm(Ext, false),
-                                  MVT::i32);
-  // Reg1 + Reg2 is free: no check needed.
-  return true;
-}
-
-SDValue ARM64DAGToDAGISel::createDTuple(ArrayRef<SDValue> Regs) {
-  static unsigned RegClassIDs[] = { ARM64::DDRegClassID, ARM64::DDDRegClassID,
-                                    ARM64::DDDDRegClassID };
-  static unsigned SubRegs[] = { ARM64::dsub0, ARM64::dsub1,
-                                ARM64::dsub2, ARM64::dsub3 };
-
-  return createTuple(Regs, RegClassIDs, SubRegs);
-}
-
-SDValue ARM64DAGToDAGISel::createQTuple(ArrayRef<SDValue> Regs) {
-  static unsigned RegClassIDs[] = { ARM64::QQRegClassID, ARM64::QQQRegClassID,
-                                    ARM64::QQQQRegClassID };
-  static unsigned SubRegs[] = { ARM64::qsub0, ARM64::qsub1,
-                                ARM64::qsub2, ARM64::qsub3 };
-
-  return createTuple(Regs, RegClassIDs, SubRegs);
-}
-
-SDValue ARM64DAGToDAGISel::createTuple(ArrayRef<SDValue> Regs,
-                                       unsigned RegClassIDs[],
-                                       unsigned SubRegs[]) {
-  // There's no special register-class for a vector-list of 1 element: it's just
-  // a vector.
-  if (Regs.size() == 1)
-    return Regs[0];
-
-  assert(Regs.size() >= 2 && Regs.size() <= 4);
-
-  SDLoc DL(Regs[0].getNode());
-
-  SmallVector<SDValue, 4> Ops;
-
-  // First operand of REG_SEQUENCE is the desired RegClass.
-  Ops.push_back(
-      CurDAG->getTargetConstant(RegClassIDs[Regs.size() - 2], MVT::i32));
-
-  // Then we get pairs of source & subregister-position for the components.
-  for (unsigned i = 0; i < Regs.size(); ++i) {
-    Ops.push_back(Regs[i]);
-    Ops.push_back(CurDAG->getTargetConstant(SubRegs[i], MVT::i32));
-  }
-
-  SDNode *N =
-      CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, MVT::Untyped, Ops);
-  return SDValue(N, 0);
-}
-
-SDNode *ARM64DAGToDAGISel::SelectTable(SDNode *N, unsigned NumVecs,
-                                       unsigned Opc, bool isExt) {
-  SDLoc dl(N);
-  EVT VT = N->getValueType(0);
-
-  unsigned ExtOff = isExt;
-
-  // Form a REG_SEQUENCE to force register allocation.
-  unsigned Vec0Off = ExtOff + 1;
-  SmallVector<SDValue, 4> Regs(N->op_begin() + Vec0Off,
-                               N->op_begin() + Vec0Off + NumVecs);
-  SDValue RegSeq = createQTuple(Regs);
-
-  SmallVector<SDValue, 6> Ops;
-  if (isExt)
-    Ops.push_back(N->getOperand(1));
-  Ops.push_back(RegSeq);
-  Ops.push_back(N->getOperand(NumVecs + ExtOff + 1));
-  return CurDAG->getMachineNode(Opc, dl, VT, Ops);
-}
-
-SDNode *ARM64DAGToDAGISel::SelectIndexedLoad(SDNode *N, bool &Done) {
-  LoadSDNode *LD = cast<LoadSDNode>(N);
-  if (LD->isUnindexed())
-    return NULL;
-  EVT VT = LD->getMemoryVT();
-  EVT DstVT = N->getValueType(0);
-  ISD::MemIndexedMode AM = LD->getAddressingMode();
-  bool IsPre = AM == ISD::PRE_INC || AM == ISD::PRE_DEC;
-
-  // We're not doing validity checking here. That was done when checking
-  // if we should mark the load as indexed or not. We're just selecting
-  // the right instruction.
-  unsigned Opcode = 0;
-
-  ISD::LoadExtType ExtType = LD->getExtensionType();
-  bool InsertTo64 = false;
-  if (VT == MVT::i64)
-    Opcode = IsPre ? ARM64::LDRXpre_isel : ARM64::LDRXpost_isel;
-  else if (VT == MVT::i32) {
-    if (ExtType == ISD::NON_EXTLOAD)
-      Opcode = IsPre ? ARM64::LDRWpre_isel : ARM64::LDRWpost_isel;
-    else if (ExtType == ISD::SEXTLOAD)
-      Opcode = IsPre ? ARM64::LDRSWpre_isel : ARM64::LDRSWpost_isel;
-    else {
-      Opcode = IsPre ? ARM64::LDRWpre_isel : ARM64::LDRWpost_isel;
-      InsertTo64 = true;
-      // The result of the load is only i32. It's the subreg_to_reg that makes
-      // it into an i64.
-      DstVT = MVT::i32;
-    }
-  } else if (VT == MVT::i16) {
-    if (ExtType == ISD::SEXTLOAD) {
-      if (DstVT == MVT::i64)
-        Opcode = IsPre ? ARM64::LDRSHXpre_isel : ARM64::LDRSHXpost_isel;
-      else
-        Opcode = IsPre ? ARM64::LDRSHWpre_isel : ARM64::LDRSHWpost_isel;
-    } else {
-      Opcode = IsPre ? ARM64::LDRHHpre_isel : ARM64::LDRHHpost_isel;
-      InsertTo64 = DstVT == MVT::i64;
-      // The result of the load is only i32. It's the subreg_to_reg that makes
-      // it into an i64.
-      DstVT = MVT::i32;
-    }
-  } else if (VT == MVT::i8) {
-    if (ExtType == ISD::SEXTLOAD) {
-      if (DstVT == MVT::i64)
-        Opcode = IsPre ? ARM64::LDRSBXpre_isel : ARM64::LDRSBXpost_isel;
-      else
-        Opcode = IsPre ? ARM64::LDRSBWpre_isel : ARM64::LDRSBWpost_isel;
-    } else {
-      Opcode = IsPre ? ARM64::LDRBBpre_isel : ARM64::LDRBBpost_isel;
-      InsertTo64 = DstVT == MVT::i64;
-      // The result of the load is only i32. It's the subreg_to_reg that makes
-      // it into an i64.
-      DstVT = MVT::i32;
-    }
-  } else if (VT == MVT::f32) {
-    Opcode = IsPre ? ARM64::LDRSpre_isel : ARM64::LDRSpost_isel;
-  } else if (VT == MVT::f64) {
-    Opcode = IsPre ? ARM64::LDRDpre_isel : ARM64::LDRDpost_isel;
-  } else
-    return NULL;
-  SDValue Chain = LD->getChain();
-  SDValue Base = LD->getBasePtr();
-  ConstantSDNode *OffsetOp = cast<ConstantSDNode>(LD->getOffset());
-  int OffsetVal = (int)OffsetOp->getZExtValue();
-  SDValue Offset = CurDAG->getTargetConstant(OffsetVal, MVT::i64);
-  SDValue Ops[] = { Base, Offset, Chain };
-  SDNode *Res = CurDAG->getMachineNode(Opcode, SDLoc(N), DstVT, MVT::i64,
-                                       MVT::Other, Ops);
-  // Either way, we're replacing the node, so tell the caller that.
-  Done = true;
-  if (InsertTo64) {
-    SDValue SubReg = CurDAG->getTargetConstant(ARM64::sub_32, MVT::i32);
-    SDNode *Sub = CurDAG->getMachineNode(
-        ARM64::SUBREG_TO_REG, SDLoc(N), MVT::i64,
-        CurDAG->getTargetConstant(0, MVT::i64), SDValue(Res, 0), SubReg);
-    ReplaceUses(SDValue(N, 0), SDValue(Sub, 0));
-    ReplaceUses(SDValue(N, 1), SDValue(Res, 1));
-    ReplaceUses(SDValue(N, 2), SDValue(Res, 2));
-    return 0;
-  }
-  return Res;
-}
-
-SDNode *ARM64DAGToDAGISel::SelectLoad(SDNode *N, unsigned NumVecs, unsigned Opc,
-                                      unsigned SubRegIdx) {
-  SDLoc dl(N);
-  EVT VT = N->getValueType(0);
-  SDValue Chain = N->getOperand(0);
-
-  SmallVector<SDValue, 6> Ops;
-  Ops.push_back(N->getOperand(2)); // Mem operand;
-  Ops.push_back(Chain);
-
-  std::vector<EVT> ResTys;
-  ResTys.push_back(MVT::Untyped);
-  ResTys.push_back(MVT::Other);
-
-  SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
-  SDValue SuperReg = SDValue(Ld, 0);
-
-  // MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
-  // MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
-  // cast<MachineSDNode>(Ld)->setMemRefs(MemOp, MemOp + 1);
-
-  switch (NumVecs) {
-  case 4:
-    ReplaceUses(SDValue(N, 3), CurDAG->getTargetExtractSubreg(SubRegIdx + 3, dl,
-                                                              VT, SuperReg));
-  // FALLTHROUGH
-  case 3:
-    ReplaceUses(SDValue(N, 2), CurDAG->getTargetExtractSubreg(SubRegIdx + 2, dl,
-                                                              VT, SuperReg));
-  // FALLTHROUGH
-  case 2:
-    ReplaceUses(SDValue(N, 1), CurDAG->getTargetExtractSubreg(SubRegIdx + 1, dl,
-                                                              VT, SuperReg));
-    ReplaceUses(SDValue(N, 0),
-                CurDAG->getTargetExtractSubreg(SubRegIdx, dl, VT, SuperReg));
-    break;
-  case 1:
-    ReplaceUses(SDValue(N, 0), SuperReg);
-    break;
-  }
-
-  ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 1));
-
-  return 0;
-}
-
-SDNode *ARM64DAGToDAGISel::SelectStore(SDNode *N, unsigned NumVecs,
-                                       unsigned Opc) {
-  SDLoc dl(N);
-  EVT VT = N->getOperand(2)->getValueType(0);
-
-  // Form a REG_SEQUENCE to force register allocation.
-  bool Is128Bit = VT.getSizeInBits() == 128;
-  SmallVector<SDValue, 4> Regs(N->op_begin() + 2, N->op_begin() + 2 + NumVecs);
-  SDValue RegSeq = Is128Bit ? createQTuple(Regs) : createDTuple(Regs);
-
-  SmallVector<SDValue, 6> Ops;
-  Ops.push_back(RegSeq);
-  Ops.push_back(N->getOperand(NumVecs + 2));
-  Ops.push_back(N->getOperand(0));
-  SDNode *St = CurDAG->getMachineNode(Opc, dl, N->getValueType(0), Ops);
-
-  return St;
-}
-
-/// WidenVector - Given a value in the V64 register class, produce the
-/// equivalent value in the V128 register class.
-class WidenVector {
-  SelectionDAG &DAG;
-
-public:
-  WidenVector(SelectionDAG &DAG) : DAG(DAG) {}
-
-  SDValue operator()(SDValue V64Reg) {
-    EVT VT = V64Reg.getValueType();
-    unsigned NarrowSize = VT.getVectorNumElements();
-    MVT EltTy = VT.getVectorElementType().getSimpleVT();
-    MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize);
-    SDLoc DL(V64Reg);
-
-    SDValue Undef =
-        SDValue(DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, WideTy), 0);
-    return DAG.getTargetInsertSubreg(ARM64::dsub, DL, WideTy, Undef, V64Reg);
-  }
-};
-
-/// NarrowVector - Given a value in the V128 register class, produce the
-/// equivalent value in the V64 register class.
-static SDValue NarrowVector(SDValue V128Reg, SelectionDAG &DAG) {
-  EVT VT = V128Reg.getValueType();
-  unsigned WideSize = VT.getVectorNumElements();
-  MVT EltTy = VT.getVectorElementType().getSimpleVT();
-  MVT NarrowTy = MVT::getVectorVT(EltTy, WideSize / 2);
-
-  return DAG.getTargetExtractSubreg(ARM64::dsub, SDLoc(V128Reg), NarrowTy,
-                                    V128Reg);
-}
-
-SDNode *ARM64DAGToDAGISel::SelectLoadLane(SDNode *N, unsigned NumVecs,
-                                          unsigned Opc) {
-  SDLoc dl(N);
-  EVT VT = N->getValueType(0);
-  bool Narrow = VT.getSizeInBits() == 64;
-
-  // Form a REG_SEQUENCE to force register allocation.
-  SmallVector<SDValue, 4> Regs(N->op_begin() + 2, N->op_begin() + 2 + NumVecs);
-
-  if (Narrow)
-    std::transform(Regs.begin(), Regs.end(), Regs.begin(),
-                   WidenVector(*CurDAG));
-
-  SDValue RegSeq = createQTuple(Regs);
-
-  std::vector<EVT> ResTys;
-  ResTys.push_back(MVT::Untyped);
-  ResTys.push_back(MVT::Other);
-
-  unsigned LaneNo =
-      cast<ConstantSDNode>(N->getOperand(NumVecs + 2))->getZExtValue();
-
-  SmallVector<SDValue, 6> Ops;
-  Ops.push_back(RegSeq);
-  Ops.push_back(CurDAG->getTargetConstant(LaneNo, MVT::i64));
-  Ops.push_back(N->getOperand(NumVecs + 3));
-  Ops.push_back(N->getOperand(0));
-  SDNode *Ld = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
-  SDValue SuperReg = SDValue(Ld, 0);
-
-  EVT WideVT = RegSeq.getOperand(1)->getValueType(0);
-  switch (NumVecs) {
-  case 4: {
-    SDValue NV3 =
-        CurDAG->getTargetExtractSubreg(ARM64::qsub3, dl, WideVT, SuperReg);
-    if (Narrow)
-      ReplaceUses(SDValue(N, 3), NarrowVector(NV3, *CurDAG));
-    else
-      ReplaceUses(SDValue(N, 3), NV3);
-  }
-  // FALLTHROUGH
-  case 3: {
-    SDValue NV2 =
-        CurDAG->getTargetExtractSubreg(ARM64::qsub2, dl, WideVT, SuperReg);
-    if (Narrow)
-      ReplaceUses(SDValue(N, 2), NarrowVector(NV2, *CurDAG));
-    else
-      ReplaceUses(SDValue(N, 2), NV2);
-  }
-  // FALLTHROUGH
-  case 2: {
-    SDValue NV1 =
-        CurDAG->getTargetExtractSubreg(ARM64::qsub1, dl, WideVT, SuperReg);
-    SDValue NV0 =
-        CurDAG->getTargetExtractSubreg(ARM64::qsub0, dl, WideVT, SuperReg);
-    if (Narrow) {
-      ReplaceUses(SDValue(N, 1), NarrowVector(NV1, *CurDAG));
-      ReplaceUses(SDValue(N, 0), NarrowVector(NV0, *CurDAG));
-    } else {
-      ReplaceUses(SDValue(N, 1), NV1);
-      ReplaceUses(SDValue(N, 0), NV0);
-    }
-    break;
-  }
-  }
-
-  ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 1));
-
-  return Ld;
-}
-
-SDNode *ARM64DAGToDAGISel::SelectStoreLane(SDNode *N, unsigned NumVecs,
-                                           unsigned Opc) {
-  SDLoc dl(N);
-  EVT VT = N->getOperand(2)->getValueType(0);
-  bool Narrow = VT.getSizeInBits() == 64;
-
-  // Form a REG_SEQUENCE to force register allocation.
-  SmallVector<SDValue, 4> Regs(N->op_begin() + 2, N->op_begin() + 2 + NumVecs);
-
-  if (Narrow)
-    std::transform(Regs.begin(), Regs.end(), Regs.begin(),
-                   WidenVector(*CurDAG));
-
-  SDValue RegSeq = createQTuple(Regs);
-
-  unsigned LaneNo =
-      cast<ConstantSDNode>(N->getOperand(NumVecs + 2))->getZExtValue();
-
-  SmallVector<SDValue, 6> Ops;
-  Ops.push_back(RegSeq);
-  Ops.push_back(CurDAG->getTargetConstant(LaneNo, MVT::i64));
-  Ops.push_back(N->getOperand(NumVecs + 3));
-  Ops.push_back(N->getOperand(0));
-  SDNode *St = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
-
-  // Transfer memoperands.
-  MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
-  MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
-  cast<MachineSDNode>(St)->setMemRefs(MemOp, MemOp + 1);
-
-  return St;
-}
-
-SDNode *ARM64DAGToDAGISel::SelectAtomic(SDNode *Node, unsigned Op8,
-                                        unsigned Op16, unsigned Op32,
-                                        unsigned Op64) {
-  // Mostly direct translation to the given operations, except that we preserve
-  // the AtomicOrdering for use later on.
-  AtomicSDNode *AN = cast<AtomicSDNode>(Node);
-  EVT VT = AN->getMemoryVT();
-
-  unsigned Op;
-  if (VT == MVT::i8)
-    Op = Op8;
-  else if (VT == MVT::i16)
-    Op = Op16;
-  else if (VT == MVT::i32)
-    Op = Op32;
-  else if (VT == MVT::i64)
-    Op = Op64;
-  else
-    llvm_unreachable("Unexpected atomic operation");
-
-  SmallVector<SDValue, 4> Ops;
-  for (unsigned i = 1; i < AN->getNumOperands(); ++i)
-    Ops.push_back(AN->getOperand(i));
-
-  Ops.push_back(CurDAG->getTargetConstant(AN->getOrdering(), MVT::i32));
-  Ops.push_back(AN->getOperand(0)); // Chain moves to the end
-
-  return CurDAG->SelectNodeTo(Node, Op, AN->getValueType(0), MVT::Other,
-                              &Ops[0], Ops.size());
-}
-
-static bool isBitfieldExtractOpFromAnd(SelectionDAG *CurDAG, SDNode *N,
-                                       unsigned &Opc, SDValue &Opd0,
-                                       unsigned &LSB, unsigned &MSB,
-                                       unsigned NumberOfIgnoredLowBits,
-                                       bool BiggerPattern) {
-  assert(N->getOpcode() == ISD::AND &&
-         "N must be a AND operation to call this function");
-
-  EVT VT = N->getValueType(0);
-
-  // Here we can test the type of VT and return false when the type does not
-  // match, but since it is done prior to that call in the current context
-  // we turned that into an assert to avoid redundant code.
-  assert((VT == MVT::i32 || VT == MVT::i64) &&
-         "Type checking must have been done before calling this function");
-
-  // FIXME: simplify-demanded-bits in DAGCombine will probably have
-  // changed the AND node to a 32-bit mask operation. We'll have to
-  // undo that as part of the transform here if we want to catch all
-  // the opportunities.
-  // Currently the NumberOfIgnoredLowBits argument helps to recover
-  // form these situations when matching bigger pattern (bitfield insert).
-
-  // For unsigned extracts, check for a shift right and mask
-  uint64_t And_imm = 0;
-  if (!isOpcWithIntImmediate(N, ISD::AND, And_imm))
-    return false;
-
-  const SDNode *Op0 = N->getOperand(0).getNode();
-
-  // Because of simplify-demanded-bits in DAGCombine, the mask may have been
-  // simplified. Try to undo that
-  And_imm |= (1 << NumberOfIgnoredLowBits) - 1;
-
-  // The immediate is a mask of the low bits iff imm & (imm+1) == 0
-  if (And_imm & (And_imm + 1))
-    return false;
-
-  bool ClampMSB = false;
-  uint64_t Srl_imm = 0;
-  // Handle the SRL + ANY_EXTEND case.
-  if (VT == MVT::i64 && Op0->getOpcode() == ISD::ANY_EXTEND &&
-      isOpcWithIntImmediate(Op0->getOperand(0).getNode(), ISD::SRL, Srl_imm)) {
-    // Extend the incoming operand of the SRL to 64-bit.
-    Opd0 = Widen(CurDAG, Op0->getOperand(0).getOperand(0));
-    // Make sure to clamp the MSB so that we preserve the semantics of the
-    // original operations.
-    ClampMSB = true;
-  } else if (isOpcWithIntImmediate(Op0, ISD::SRL, Srl_imm)) {
-    Opd0 = Op0->getOperand(0);
-  } else if (BiggerPattern) {
-    // Let's pretend a 0 shift right has been performed.
-    // The resulting code will be at least as good as the original one
-    // plus it may expose more opportunities for bitfield insert pattern.
-    // FIXME: Currently we limit this to the bigger pattern, because
-    // some optimizations expect AND and not UBFM
-    Opd0 = N->getOperand(0);
-  } else
-    return false;
-
-  assert((BiggerPattern || (Srl_imm > 0 && Srl_imm < VT.getSizeInBits())) &&
-         "bad amount in shift node!");
-
-  LSB = Srl_imm;
-  MSB = Srl_imm + (VT == MVT::i32 ? CountTrailingOnes_32(And_imm)
-                                  : CountTrailingOnes_64(And_imm)) -
-        1;
-  if (ClampMSB)
-    // Since we're moving the extend before the right shift operation, we need
-    // to clamp the MSB to make sure we don't shift in undefined bits instead of
-    // the zeros which would get shifted in with the original right shift
-    // operation.
-    MSB = MSB > 31 ? 31 : MSB;
-
-  Opc = VT == MVT::i32 ? ARM64::UBFMWri : ARM64::UBFMXri;
-  return true;
-}
-
-static bool isOneBitExtractOpFromShr(SDNode *N, unsigned &Opc, SDValue &Opd0,
-                                     unsigned &LSB, unsigned &MSB) {
-  // We are looking for the following pattern which basically extracts a single
-  // bit from the source value and places it in the LSB of the destination
-  // value, all other bits of the destination value or set to zero:
-  //
-  // Value2 = AND Value, MaskImm
-  // SRL Value2, ShiftImm
-  //
-  // with MaskImm >> ShiftImm == 1.
-  //
-  // This gets selected into a single UBFM:
-  //
-  // UBFM Value, ShiftImm, ShiftImm
-  //
-
-  if (N->getOpcode() != ISD::SRL)
-    return false;
-
-  uint64_t And_mask = 0;
-  if (!isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::AND, And_mask))
-    return false;
-
-  Opd0 = N->getOperand(0).getOperand(0);
-
-  uint64_t Srl_imm = 0;
-  if (!isIntImmediate(N->getOperand(1), Srl_imm))
-    return false;
-
-  // Check whether we really have a one bit extract here.
-  if (And_mask >> Srl_imm == 0x1) {
-    if (N->getValueType(0) == MVT::i32)
-      Opc = ARM64::UBFMWri;
-    else
-      Opc = ARM64::UBFMXri;
-
-    LSB = MSB = Srl_imm;
-
-    return true;
-  }
-
-  return false;
-}
-
-static bool isBitfieldExtractOpFromShr(SDNode *N, unsigned &Opc, SDValue &Opd0,
-                                       unsigned &LSB, unsigned &MSB,
-                                       bool BiggerPattern) {
-  assert((N->getOpcode() == ISD::SRA || N->getOpcode() == ISD::SRL) &&
-         "N must be a SHR/SRA operation to call this function");
-
-  EVT VT = N->getValueType(0);
-
-  // Here we can test the type of VT and return false when the type does not
-  // match, but since it is done prior to that call in the current context
-  // we turned that into an assert to avoid redundant code.
-  assert((VT == MVT::i32 || VT == MVT::i64) &&
-         "Type checking must have been done before calling this function");
-
-  // Check for AND + SRL doing a one bit extract.
-  if (isOneBitExtractOpFromShr(N, Opc, Opd0, LSB, MSB))
-    return true;
-
-  // we're looking for a shift of a shift
-  uint64_t Shl_imm = 0;
-  if (isOpcWithIntImmediate(N->getOperand(0).getNode(), ISD::SHL, Shl_imm)) {
-    Opd0 = N->getOperand(0).getOperand(0);
-  } else if (BiggerPattern) {
-    // Let's pretend a 0 shift left has been performed.
-    // FIXME: Currently we limit this to the bigger pattern case,
-    // because some optimizations expect AND and not UBFM
-    Opd0 = N->getOperand(0);
-  } else
-    return false;
-
-  assert(Shl_imm < VT.getSizeInBits() && "bad amount in shift node!");
-  uint64_t Srl_imm = 0;
-  if (!isIntImmediate(N->getOperand(1), Srl_imm))
-    return false;
-
-  assert(Srl_imm > 0 && Srl_imm < VT.getSizeInBits() &&
-         "bad amount in shift node!");
-  // Note: The width operand is encoded as width-1.
-  unsigned Width = VT.getSizeInBits() - Srl_imm - 1;
-  int sLSB = Srl_imm - Shl_imm;
-  if (sLSB < 0)
-    return false;
-  LSB = sLSB;
-  MSB = LSB + Width;
-  // SRA requires a signed extraction
-  if (VT == MVT::i32)
-    Opc = N->getOpcode() == ISD::SRA ? ARM64::SBFMWri : ARM64::UBFMWri;
-  else
-    Opc = N->getOpcode() == ISD::SRA ? ARM64::SBFMXri : ARM64::UBFMXri;
-  return true;
-}
-
-static bool isBitfieldExtractOp(SelectionDAG *CurDAG, SDNode *N, unsigned &Opc,
-                                SDValue &Opd0, unsigned &LSB, unsigned &MSB,
-                                unsigned NumberOfIgnoredLowBits = 0,
-                                bool BiggerPattern = false) {
-  if (N->getValueType(0) != MVT::i32 && N->getValueType(0) != MVT::i64)
-    return false;
-
-  switch (N->getOpcode()) {
-  default:
-    if (!N->isMachineOpcode())
-      return false;
-    break;
-  case ISD::AND:
-    return isBitfieldExtractOpFromAnd(CurDAG, N, Opc, Opd0, LSB, MSB,
-                                      NumberOfIgnoredLowBits, BiggerPattern);
-  case ISD::SRL:
-  case ISD::SRA:
-    return isBitfieldExtractOpFromShr(N, Opc, Opd0, LSB, MSB, BiggerPattern);
-  }
-
-  unsigned NOpc = N->getMachineOpcode();
-  switch (NOpc) {
-  default:
-    return false;
-  case ARM64::SBFMWri:
-  case ARM64::UBFMWri:
-  case ARM64::SBFMXri:
-  case ARM64::UBFMXri:
-    Opc = NOpc;
-    Opd0 = N->getOperand(0);
-    LSB = cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
-    MSB = cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
-    return true;
-  }
-  // Unreachable
-  return false;
-}
-
-SDNode *ARM64DAGToDAGISel::SelectBitfieldExtractOp(SDNode *N) {
-  unsigned Opc, LSB, MSB;
-  SDValue Opd0;
-  if (!isBitfieldExtractOp(CurDAG, N, Opc, Opd0, LSB, MSB))
-    return NULL;
-
-  EVT VT = N->getValueType(0);
-  SDValue Ops[] = { Opd0, CurDAG->getTargetConstant(LSB, VT),
-                    CurDAG->getTargetConstant(MSB, VT) };
-  return CurDAG->SelectNodeTo(N, Opc, VT, Ops, 3);
-}
-
-// Is mask a i32 or i64 binary sequence 1..10..0 and
-// CountTrailingZeros(mask) == ExpectedTrailingZeros
-static bool isHighMask(uint64_t Mask, unsigned ExpectedTrailingZeros,
-                       unsigned NumberOfIgnoredHighBits, EVT VT) {
-  assert((VT == MVT::i32 || VT == MVT::i64) &&
-         "i32 or i64 mask type expected!");
-
-  uint64_t ExpectedMask;
-  if (VT == MVT::i32) {
-    uint32_t ExpectedMaski32 = ~0 << ExpectedTrailingZeros;
-    ExpectedMask = ExpectedMaski32;
-    if (NumberOfIgnoredHighBits) {
-      uint32_t highMask = ~0 << (32 - NumberOfIgnoredHighBits);
-      Mask |= highMask;
-    }
-  } else {
-    ExpectedMask = ((uint64_t) ~0) << ExpectedTrailingZeros;
-    if (NumberOfIgnoredHighBits)
-      Mask |= ((uint64_t) ~0) << (64 - NumberOfIgnoredHighBits);
-  }
-
-  return Mask == ExpectedMask;
-}
-
-// Look for bits that will be useful for later uses.
-// A bit is consider useless as soon as it is dropped and never used
-// before it as been dropped.
-// E.g., looking for useful bit of x
-// 1. y = x & 0x7
-// 2. z = y >> 2
-// After #1, x useful bits are 0x7, then the useful bits of x, live through
-// y.
-// After #2, the useful bits of x are 0x4.
-// However, if x is used on an unpredicatable instruction, then all its bits
-// are useful.
-// E.g.
-// 1. y = x & 0x7
-// 2. z = y >> 2
-// 3. str x, [@x]
-static void getUsefulBits(SDValue Op, APInt &UsefulBits, unsigned Depth = 0);
-
-static void getUsefulBitsFromAndWithImmediate(SDValue Op, APInt &UsefulBits,
-                                              unsigned Depth) {
-  uint64_t Imm =
-      cast<const ConstantSDNode>(Op.getOperand(1).getNode())->getZExtValue();
-  Imm = ARM64_AM::decodeLogicalImmediate(Imm, UsefulBits.getBitWidth());
-  UsefulBits &= APInt(UsefulBits.getBitWidth(), Imm);
-  getUsefulBits(Op, UsefulBits, Depth + 1);
-}
-
-static void getUsefulBitsFromBitfieldMoveOpd(SDValue Op, APInt &UsefulBits,
-                                             uint64_t Imm, uint64_t MSB,
-                                             unsigned Depth) {
-  // inherit the bitwidth value
-  APInt OpUsefulBits(UsefulBits);
-  OpUsefulBits = 1;
-
-  if (MSB >= Imm) {
-    OpUsefulBits = OpUsefulBits.shl(MSB - Imm + 1);
-    --OpUsefulBits;
-    // The interesting part will be in the lower part of the result
-    getUsefulBits(Op, OpUsefulBits, Depth + 1);
-    // The interesting part was starting at Imm in the argument
-    OpUsefulBits = OpUsefulBits.shl(Imm);
-  } else {
-    OpUsefulBits = OpUsefulBits.shl(MSB + 1);
-    --OpUsefulBits;
-    // The interesting part will be shifted in the result
-    OpUsefulBits = OpUsefulBits.shl(OpUsefulBits.getBitWidth() - Imm);
-    getUsefulBits(Op, OpUsefulBits, Depth + 1);
-    // The interesting part was at zero in the argument
-    OpUsefulBits = OpUsefulBits.lshr(OpUsefulBits.getBitWidth() - Imm);
-  }
-
-  UsefulBits &= OpUsefulBits;
-}
-
-static void getUsefulBitsFromUBFM(SDValue Op, APInt &UsefulBits,
-                                  unsigned Depth) {
-  uint64_t Imm =
-      cast<const ConstantSDNode>(Op.getOperand(1).getNode())->getZExtValue();
-  uint64_t MSB =
-      cast<const ConstantSDNode>(Op.getOperand(2).getNode())->getZExtValue();
-
-  getUsefulBitsFromBitfieldMoveOpd(Op, UsefulBits, Imm, MSB, Depth);
-}
-
-static void getUsefulBitsFromOrWithShiftedReg(SDValue Op, APInt &UsefulBits,
-                                              unsigned Depth) {
-  uint64_t ShiftTypeAndValue =
-      cast<const ConstantSDNode>(Op.getOperand(2).getNode())->getZExtValue();
-  APInt Mask(UsefulBits);
-  Mask.clearAllBits();
-  Mask.flipAllBits();
-
-  if (ARM64_AM::getShiftType(ShiftTypeAndValue) == ARM64_AM::LSL) {
-    // Shift Left
-    uint64_t ShiftAmt = ARM64_AM::getShiftValue(ShiftTypeAndValue);
-    Mask = Mask.shl(ShiftAmt);
-    getUsefulBits(Op, Mask, Depth + 1);
-    Mask = Mask.lshr(ShiftAmt);
-  } else if (ARM64_AM::getShiftType(ShiftTypeAndValue) == ARM64_AM::LSR) {
-    // Shift Right
-    // We do not handle ARM64_AM::ASR, because the sign will change the
-    // number of useful bits
-    uint64_t ShiftAmt = ARM64_AM::getShiftValue(ShiftTypeAndValue);
-    Mask = Mask.lshr(ShiftAmt);
-    getUsefulBits(Op, Mask, Depth + 1);
-    Mask = Mask.shl(ShiftAmt);
-  } else
-    return;
-
-  UsefulBits &= Mask;
-}
-
-static void getUsefulBitsFromBFM(SDValue Op, SDValue Orig, APInt &UsefulBits,
-                                 unsigned Depth) {
-  uint64_t Imm =
-      cast<const ConstantSDNode>(Op.getOperand(2).getNode())->getZExtValue();
-  uint64_t MSB =
-      cast<const ConstantSDNode>(Op.getOperand(3).getNode())->getZExtValue();
-
-  if (Op.getOperand(1) == Orig)
-    return getUsefulBitsFromBitfieldMoveOpd(Op, UsefulBits, Imm, MSB, Depth);
-
-  APInt OpUsefulBits(UsefulBits);
-  OpUsefulBits = 1;
-
-  if (MSB >= Imm) {
-    OpUsefulBits = OpUsefulBits.shl(MSB - Imm + 1);
-    --OpUsefulBits;
-    UsefulBits &= ~OpUsefulBits;
-    getUsefulBits(Op, UsefulBits, Depth + 1);
-  } else {
-    OpUsefulBits = OpUsefulBits.shl(MSB + 1);
-    --OpUsefulBits;
-    UsefulBits = ~(OpUsefulBits.shl(OpUsefulBits.getBitWidth() - Imm));
-    getUsefulBits(Op, UsefulBits, Depth + 1);
-  }
-}
-
-static void getUsefulBitsForUse(SDNode *UserNode, APInt &UsefulBits,
-                                SDValue Orig, unsigned Depth) {
-
-  // Users of this node should have already been instruction selected
-  // FIXME: Can we turn that into an assert?
-  if (!UserNode->isMachineOpcode())
-    return;
-
-  switch (UserNode->getMachineOpcode()) {
-  default:
-    return;
-  case ARM64::ANDSWri:
-  case ARM64::ANDSXri:
-  case ARM64::ANDWri:
-  case ARM64::ANDXri:
-    // We increment Depth only when we call the getUsefulBits
-    return getUsefulBitsFromAndWithImmediate(SDValue(UserNode, 0), UsefulBits,
-                                             Depth);
-  case ARM64::UBFMWri:
-  case ARM64::UBFMXri:
-    return getUsefulBitsFromUBFM(SDValue(UserNode, 0), UsefulBits, Depth);
-
-  case ARM64::ORRWrs:
-  case ARM64::ORRXrs:
-    if (UserNode->getOperand(1) != Orig)
-      return;
-    return getUsefulBitsFromOrWithShiftedReg(SDValue(UserNode, 0), UsefulBits,
-                                             Depth);
-  case ARM64::BFMWri:
-  case ARM64::BFMXri:
-    return getUsefulBitsFromBFM(SDValue(UserNode, 0), Orig, UsefulBits, Depth);
-  }
-}
-
-static void getUsefulBits(SDValue Op, APInt &UsefulBits, unsigned Depth) {
-  if (Depth >= 6)
-    return;
-  // Initialize UsefulBits
-  if (!Depth) {
-    unsigned Bitwidth = Op.getValueType().getScalarType().getSizeInBits();
-    // At the beginning, assume every produced bits is useful
-    UsefulBits = APInt(Bitwidth, 0);
-    UsefulBits.flipAllBits();
-  }
-  APInt UsersUsefulBits(UsefulBits.getBitWidth(), 0);
-
-  for (SDNode::use_iterator UseIt = Op.getNode()->use_begin(),
-                            UseEnd = Op.getNode()->use_end();
-       UseIt != UseEnd; ++UseIt) {
-    // A use cannot produce useful bits
-    APInt UsefulBitsForUse = APInt(UsefulBits);
-    getUsefulBitsForUse(*UseIt, UsefulBitsForUse, Op, Depth);
-    UsersUsefulBits |= UsefulBitsForUse;
-  }
-  // UsefulBits contains the produced bits that are meaningful for the
-  // current definition, thus a user cannot make a bit meaningful at
-  // this point
-  UsefulBits &= UsersUsefulBits;
-}
-
-// Given a OR operation, check if we have the following pattern
-// ubfm c, b, imm, imm2 (or something that does the same jobs, see
-//                       isBitfieldExtractOp)
-// d = e & mask2 ; where mask is a binary sequence of 1..10..0 and
-//                 countTrailingZeros(mask2) == imm2 - imm + 1
-// f = d | c
-// if yes, given reference arguments will be update so that one can replace
-// the OR instruction with:
-// f = Opc Opd0, Opd1, LSB, MSB ; where Opc is a BFM, LSB = imm, and MSB = imm2
-static bool isBitfieldInsertOpFromOr(SDNode *N, unsigned &Opc, SDValue &Opd0,
-                                     SDValue &Opd1, unsigned &LSB,
-                                     unsigned &MSB, SelectionDAG *CurDAG) {
-  assert(N->getOpcode() == ISD::OR && "Expect a OR operation");
-
-  // Set Opc
-  EVT VT = N->getValueType(0);
-  if (VT == MVT::i32)
-    Opc = ARM64::BFMWri;
-  else if (VT == MVT::i64)
-    Opc = ARM64::BFMXri;
-  else
-    return false;
-
-  // Because of simplify-demanded-bits in DAGCombine, involved masks may not
-  // have the expected shape. Try to undo that.
-  APInt UsefulBits;
-  getUsefulBits(SDValue(N, 0), UsefulBits);
-
-  unsigned NumberOfIgnoredLowBits = UsefulBits.countTrailingZeros();
-  unsigned NumberOfIgnoredHighBits = UsefulBits.countLeadingZeros();
-
-  // OR is commutative, check both possibilities (does llvm provide a
-  // way to do that directely, e.g., via code matcher?)
-  SDValue OrOpd1Val = N->getOperand(1);
-  SDNode *OrOpd0 = N->getOperand(0).getNode();
-  SDNode *OrOpd1 = N->getOperand(1).getNode();
-  for (int i = 0; i < 2;
-       ++i, std::swap(OrOpd0, OrOpd1), OrOpd1Val = N->getOperand(0)) {
-    unsigned BFXOpc;
-    // Set Opd1, LSB and MSB arguments by looking for
-    // c = ubfm b, imm, imm2
-    if (!isBitfieldExtractOp(CurDAG, OrOpd0, BFXOpc, Opd1, LSB, MSB,
-                             NumberOfIgnoredLowBits, true))
-      continue;
-
-    // Check that the returned opcode is compatible with the pattern,
-    // i.e., same type and zero extended (U and not S)
-    if ((BFXOpc != ARM64::UBFMXri && VT == MVT::i64) ||
-        (BFXOpc != ARM64::UBFMWri && VT == MVT::i32))
-      continue;
-
-    // Compute the width of the bitfield insertion
-    int sMSB = MSB - LSB + 1;
-    // FIXME: This constraints is to catch bitfield insertion we may
-    // want to widen the pattern if we want to grab general bitfied
-    // move case
-    if (sMSB <= 0)
-      continue;
-
-    // Check the second part of the pattern
-    EVT VT = OrOpd1->getValueType(0);
-    if (VT != MVT::i32 && VT != MVT::i64)
-      continue;
-
-    // Compute the Known Zero for the candidate of the first operand.
-    // This allows to catch more general case than just looking for
-    // AND with imm. Indeed, simplify-demanded-bits may have removed
-    // the AND instruction because it proves it was useless.
-    APInt KnownZero, KnownOne;
-    CurDAG->ComputeMaskedBits(OrOpd1Val, KnownZero, KnownOne);
-
-    // Check if there is enough room for the second operand to appear
-    // in the first one
-    if (KnownZero.countTrailingOnes() < (unsigned)sMSB)
-      continue;
-
-    // Set the first operand
-    uint64_t Imm;
-    if (isOpcWithIntImmediate(OrOpd1, ISD::AND, Imm) &&
-        isHighMask(Imm, sMSB, NumberOfIgnoredHighBits, VT))
-      // In that case, we can eliminate the AND
-      Opd0 = OrOpd1->getOperand(0);
-    else
-      // Maybe the AND has been removed by simplify-demanded-bits
-      // or is useful because it discards more bits
-      Opd0 = OrOpd1Val;
-
-    // both parts match
-    return true;
-  }
-
-  return false;
-}
-
-SDNode *ARM64DAGToDAGISel::SelectBitfieldInsertOp(SDNode *N) {
-  if (N->getOpcode() != ISD::OR)
-    return NULL;
-
-  unsigned Opc;
-  unsigned LSB, MSB;
-  SDValue Opd0, Opd1;
-
-  if (!isBitfieldInsertOpFromOr(N, Opc, Opd0, Opd1, LSB, MSB, CurDAG))
-    return NULL;
-
-  EVT VT = N->getValueType(0);
-  SDValue Ops[] = { Opd0,
-                    Opd1,
-                    CurDAG->getTargetConstant(LSB, VT),
-                    CurDAG->getTargetConstant(MSB, VT) };
-  return CurDAG->SelectNodeTo(N, Opc, VT, Ops, 4);
-}
-
-SDNode *ARM64DAGToDAGISel::SelectLIBM(SDNode *N) {
-  EVT VT = N->getValueType(0);
-  unsigned Variant;
-  unsigned Opc;
-  unsigned FRINTXOpcs[] = { ARM64::FRINTXSr, ARM64::FRINTXDr };
-
-  if (VT == MVT::f32) {
-    Variant = 0;
-  } else if (VT == MVT::f64) {
-    Variant = 1;
-  } else
-    return 0; // Unrecognized argument type. Fall back on default codegen.
-
-  // Pick the FRINTX variant needed to set the flags.
-  unsigned FRINTXOpc = FRINTXOpcs[Variant];
-
-  switch (N->getOpcode()) {
-  default:
-    return 0; // Unrecognized libm ISD node. Fall back on default codegen.
-  case ISD::FCEIL: {
-    unsigned FRINTPOpcs[] = { ARM64::FRINTPSr, ARM64::FRINTPDr };
-    Opc = FRINTPOpcs[Variant];
-    break;
-  }
-  case ISD::FFLOOR: {
-    unsigned FRINTMOpcs[] = { ARM64::FRINTMSr, ARM64::FRINTMDr };
-    Opc = FRINTMOpcs[Variant];
-    break;
-  }
-  case ISD::FTRUNC: {
-    unsigned FRINTZOpcs[] = { ARM64::FRINTZSr, ARM64::FRINTZDr };
-    Opc = FRINTZOpcs[Variant];
-    break;
-  }
-  case ISD::FROUND: {
-    unsigned FRINTAOpcs[] = { ARM64::FRINTASr, ARM64::FRINTADr };
-    Opc = FRINTAOpcs[Variant];
-    break;
-  }
-  }
-
-  SDLoc dl(N);
-  SDValue In = N->getOperand(0);
-  SmallVector<SDValue, 2> Ops;
-  Ops.push_back(In);
-
-  if (!TM.Options.UnsafeFPMath) {
-    SDNode *FRINTX = CurDAG->getMachineNode(FRINTXOpc, dl, VT, MVT::Glue, In);
-    Ops.push_back(SDValue(FRINTX, 1));
-  }
-
-  return CurDAG->getMachineNode(Opc, dl, VT, Ops);
-}
-
-SDNode *ARM64DAGToDAGISel::Select(SDNode *Node) {
-  // Dump information about the Node being selected
-  DEBUG(errs() << "Selecting: ");
-  DEBUG(Node->dump(CurDAG));
-  DEBUG(errs() << "\n");
-
-  // If we have a custom node, we already have selected!
-  if (Node->isMachineOpcode()) {
-    DEBUG(errs() << "== "; Node->dump(CurDAG); errs() << "\n");
-    Node->setNodeId(-1);
-    return NULL;
-  }
-
-  // Few custom selection stuff.
-  SDNode *ResNode = 0;
-  EVT VT = Node->getValueType(0);
-
-  switch (Node->getOpcode()) {
-  default:
-    break;
-
-  case ISD::ADD:
-    if (SDNode *I = SelectMLAV64LaneV128(Node))
-      return I;
-    break;
-
-  case ISD::ATOMIC_LOAD_ADD:
-    return SelectAtomic(Node, ARM64::ATOMIC_LOAD_ADD_I8,
-                        ARM64::ATOMIC_LOAD_ADD_I16, ARM64::ATOMIC_LOAD_ADD_I32,
-                        ARM64::ATOMIC_LOAD_ADD_I64);
-  case ISD::ATOMIC_LOAD_SUB:
-    return SelectAtomic(Node, ARM64::ATOMIC_LOAD_SUB_I8,
-                        ARM64::ATOMIC_LOAD_SUB_I16, ARM64::ATOMIC_LOAD_SUB_I32,
-                        ARM64::ATOMIC_LOAD_SUB_I64);
-  case ISD::ATOMIC_LOAD_AND:
-    return SelectAtomic(Node, ARM64::ATOMIC_LOAD_AND_I8,
-                        ARM64::ATOMIC_LOAD_AND_I16, ARM64::ATOMIC_LOAD_AND_I32,
-                        ARM64::ATOMIC_LOAD_AND_I64);
-  case ISD::ATOMIC_LOAD_OR:
-    return SelectAtomic(Node, ARM64::ATOMIC_LOAD_OR_I8,
-                        ARM64::ATOMIC_LOAD_OR_I16, ARM64::ATOMIC_LOAD_OR_I32,
-                        ARM64::ATOMIC_LOAD_OR_I64);
-  case ISD::ATOMIC_LOAD_XOR:
-    return SelectAtomic(Node, ARM64::ATOMIC_LOAD_XOR_I8,
-                        ARM64::ATOMIC_LOAD_XOR_I16, ARM64::ATOMIC_LOAD_XOR_I32,
-                        ARM64::ATOMIC_LOAD_XOR_I64);
-  case ISD::ATOMIC_LOAD_NAND:
-    return SelectAtomic(
-        Node, ARM64::ATOMIC_LOAD_NAND_I8, ARM64::ATOMIC_LOAD_NAND_I16,
-        ARM64::ATOMIC_LOAD_NAND_I32, ARM64::ATOMIC_LOAD_NAND_I64);
-  case ISD::ATOMIC_LOAD_MIN:
-    return SelectAtomic(Node, ARM64::ATOMIC_LOAD_MIN_I8,
-                        ARM64::ATOMIC_LOAD_MIN_I16, ARM64::ATOMIC_LOAD_MIN_I32,
-                        ARM64::ATOMIC_LOAD_MIN_I64);
-  case ISD::ATOMIC_LOAD_MAX:
-    return SelectAtomic(Node, ARM64::ATOMIC_LOAD_MAX_I8,
-                        ARM64::ATOMIC_LOAD_MAX_I16, ARM64::ATOMIC_LOAD_MAX_I32,
-                        ARM64::ATOMIC_LOAD_MAX_I64);
-  case ISD::ATOMIC_LOAD_UMIN:
-    return SelectAtomic(
-        Node, ARM64::ATOMIC_LOAD_UMIN_I8, ARM64::ATOMIC_LOAD_UMIN_I16,
-        ARM64::ATOMIC_LOAD_UMIN_I32, ARM64::ATOMIC_LOAD_UMIN_I64);
-  case ISD::ATOMIC_LOAD_UMAX:
-    return SelectAtomic(
-        Node, ARM64::ATOMIC_LOAD_UMAX_I8, ARM64::ATOMIC_LOAD_UMAX_I16,
-        ARM64::ATOMIC_LOAD_UMAX_I32, ARM64::ATOMIC_LOAD_UMAX_I64);
-  case ISD::ATOMIC_SWAP:
-    return SelectAtomic(Node, ARM64::ATOMIC_SWAP_I8, ARM64::ATOMIC_SWAP_I16,
-                        ARM64::ATOMIC_SWAP_I32, ARM64::ATOMIC_SWAP_I64);
-  case ISD::ATOMIC_CMP_SWAP:
-    return SelectAtomic(Node, ARM64::ATOMIC_CMP_SWAP_I8,
-                        ARM64::ATOMIC_CMP_SWAP_I16, ARM64::ATOMIC_CMP_SWAP_I32,
-                        ARM64::ATOMIC_CMP_SWAP_I64);
-
-  case ISD::LOAD: {
-    // Try to select as an indexed load. Fall through to normal processing
-    // if we can't.
-    bool Done = false;
-    SDNode *I = SelectIndexedLoad(Node, Done);
-    if (Done)
-      return I;
-    break;
-  }
-
-  case ISD::SRL:
-  case ISD::AND:
-  case ISD::SRA:
-    if (SDNode *I = SelectBitfieldExtractOp(Node))
-      return I;
-    break;
-
-  case ISD::OR:
-    if (SDNode *I = SelectBitfieldInsertOp(Node))
-      return I;
-    break;
-
-  case ISD::EXTRACT_VECTOR_ELT: {
-    // Extracting lane zero is a special case where we can just use a plain
-    // EXTRACT_SUBREG instruction, which will become FMOV. This is easier for
-    // the rest of the compiler, especially the register allocator and copyi
-    // propagation, to reason about, so is preferred when it's possible to
-    // use it.
-    ConstantSDNode *LaneNode = cast<ConstantSDNode>(Node->getOperand(1));
-    // Bail and use the default Select() for non-zero lanes.
-    if (LaneNode->getZExtValue() != 0)
-      break;
-    // If the element type is not the same as the result type, likewise
-    // bail and use the default Select(), as there's more to do than just
-    // a cross-class COPY. This catches extracts of i8 and i16 elements
-    // since they will need an explicit zext.
-    if (VT != Node->getOperand(0).getValueType().getVectorElementType())
-      break;
-    unsigned SubReg;
-    switch (Node->getOperand(0)
-                .getValueType()
-                .getVectorElementType()
-                .getSizeInBits()) {
-    default:
-      assert(0 && "Unexpected vector element type!");
-    case 64:
-      SubReg = ARM64::dsub;
-      break;
-    case 32:
-      SubReg = ARM64::ssub;
-      break;
-    case 16: // FALLTHROUGH
-    case 8:
-      llvm_unreachable("unexpected zext-requiring extract element!");
-    }
-    SDValue Extract = CurDAG->getTargetExtractSubreg(SubReg, SDLoc(Node), VT,
-                                                     Node->getOperand(0));
-    DEBUG(dbgs() << "ISEL: Custom selection!\n=> ");
-    DEBUG(Extract->dumpr(CurDAG));
-    DEBUG(dbgs() << "\n");
-    return Extract.getNode();
-  }
-  case ISD::Constant: {
-    // Materialize zero constants as copies from WZR/XZR.  This allows
-    // the coalescer to propagate these into other instructions.
-    ConstantSDNode *ConstNode = cast<ConstantSDNode>(Node);
-    if (ConstNode->isNullValue()) {
-      if (VT == MVT::i32)
-        return CurDAG->getCopyFromReg(CurDAG->getEntryNode(), SDLoc(Node),
-                                      ARM64::WZR, MVT::i32).getNode();
-      else if (VT == MVT::i64)
-        return CurDAG->getCopyFromReg(CurDAG->getEntryNode(), SDLoc(Node),
-                                      ARM64::XZR, MVT::i64).getNode();
-    }
-    break;
-  }
-
-  case ISD::FrameIndex: {
-    // Selects to ADDXri FI, 0 which in turn will become ADDXri SP, imm.
-    int FI = cast<FrameIndexSDNode>(Node)->getIndex();
-    unsigned Shifter = ARM64_AM::getShifterImm(ARM64_AM::LSL, 0);
-    const TargetLowering *TLI = getTargetLowering();
-    SDValue TFI = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy());
-    SDValue Ops[] = { TFI, CurDAG->getTargetConstant(0, MVT::i32),
-                      CurDAG->getTargetConstant(Shifter, MVT::i32) };
-    return CurDAG->SelectNodeTo(Node, ARM64::ADDXri, MVT::i64, Ops, 3);
-  }
-  case ISD::INTRINSIC_W_CHAIN: {
-    unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
-    switch (IntNo) {
-    default:
-      break;
-    case Intrinsic::arm64_ldxp: {
-      SDValue MemAddr = Node->getOperand(2);
-      SDLoc DL(Node);
-      SDValue Chain = Node->getOperand(0);
-
-      SDNode *Ld = CurDAG->getMachineNode(ARM64::LDXPX, DL, MVT::i64, MVT::i64,
-                                          MVT::Other, MemAddr, Chain);
-
-      // Transfer memoperands.
-      MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
-      MemOp[0] = cast<MemIntrinsicSDNode>(Node)->getMemOperand();
-      cast<MachineSDNode>(Ld)->setMemRefs(MemOp, MemOp + 1);
-      return Ld;
-    }
-    case Intrinsic::arm64_stxp: {
-      SDLoc DL(Node);
-      SDValue Chain = Node->getOperand(0);
-      SDValue ValLo = Node->getOperand(2);
-      SDValue ValHi = Node->getOperand(3);
-      SDValue MemAddr = Node->getOperand(4);
-
-      // Place arguments in the right order.
-      SmallVector<SDValue, 7> Ops;
-      Ops.push_back(ValLo);
-      Ops.push_back(ValHi);
-      Ops.push_back(MemAddr);
-      Ops.push_back(Chain);
-
-      SDNode *St =
-          CurDAG->getMachineNode(ARM64::STXPX, DL, MVT::i32, MVT::Other, Ops);
-      // Transfer memoperands.
-      MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
-      MemOp[0] = cast<MemIntrinsicSDNode>(Node)->getMemOperand();
-      cast<MachineSDNode>(St)->setMemRefs(MemOp, MemOp + 1);
-
-      return St;
-    }
-    case Intrinsic::arm64_neon_ld1x2:
-      if (VT == MVT::v8i8)
-        return SelectLoad(Node, 2, ARM64::LD1Twov8b, ARM64::dsub0);
-      else if (VT == MVT::v16i8)
-        return SelectLoad(Node, 2, ARM64::LD1Twov16b, ARM64::qsub0);
-      else if (VT == MVT::v4i16)
-        return SelectLoad(Node, 2, ARM64::LD1Twov4h, ARM64::dsub0);
-      else if (VT == MVT::v8i16)
-        return SelectLoad(Node, 2, ARM64::LD1Twov8h, ARM64::qsub0);
-      else if (VT == MVT::v2i32 || VT == MVT::v2f32)
-        return SelectLoad(Node, 2, ARM64::LD1Twov2s, ARM64::dsub0);
-      else if (VT == MVT::v4i32 || VT == MVT::v4f32)
-        return SelectLoad(Node, 2, ARM64::LD1Twov4s, ARM64::qsub0);
-      else if (VT == MVT::v1i64 || VT == MVT::v1f64)
-        return SelectLoad(Node, 2, ARM64::LD1Twov1d, ARM64::dsub0);
-      else if (VT == MVT::v2i64 || VT == MVT::v2f64)
-        return SelectLoad(Node, 2, ARM64::LD1Twov2d, ARM64::qsub0);
-      break;
-    case Intrinsic::arm64_neon_ld1x3:
-      if (VT == MVT::v8i8)
-        return SelectLoad(Node, 3, ARM64::LD1Threev8b, ARM64::dsub0);
-      else if (VT == MVT::v16i8)
-        return SelectLoad(Node, 3, ARM64::LD1Threev16b, ARM64::qsub0);
-      else if (VT == MVT::v4i16)
-        return SelectLoad(Node, 3, ARM64::LD1Threev4h, ARM64::dsub0);
-      else if (VT == MVT::v8i16)
-        return SelectLoad(Node, 3, ARM64::LD1Threev8h, ARM64::qsub0);
-      else if (VT == MVT::v2i32 || VT == MVT::v2f32)
-        return SelectLoad(Node, 3, ARM64::LD1Threev2s, ARM64::dsub0);
-      else if (VT == MVT::v4i32 || VT == MVT::v4f32)
-        return SelectLoad(Node, 3, ARM64::LD1Threev4s, ARM64::qsub0);
-      else if (VT == MVT::v1i64 || VT == MVT::v1f64)
-        return SelectLoad(Node, 3, ARM64::LD1Threev1d, ARM64::dsub0);
-      else if (VT == MVT::v2i64 || VT == MVT::v2f64)
-        return SelectLoad(Node, 3, ARM64::LD1Threev2d, ARM64::qsub0);
-      break;
-    case Intrinsic::arm64_neon_ld1x4:
-      if (VT == MVT::v8i8)
-        return SelectLoad(Node, 4, ARM64::LD1Fourv8b, ARM64::dsub0);
-      else if (VT == MVT::v16i8)
-        return SelectLoad(Node, 4, ARM64::LD1Fourv16b, ARM64::qsub0);
-      else if (VT == MVT::v4i16)
-        return SelectLoad(Node, 4, ARM64::LD1Fourv4h, ARM64::dsub0);
-      else if (VT == MVT::v8i16)
-        return SelectLoad(Node, 4, ARM64::LD1Fourv8h, ARM64::qsub0);
-      else if (VT == MVT::v2i32 || VT == MVT::v2f32)
-        return SelectLoad(Node, 4, ARM64::LD1Fourv2s, ARM64::dsub0);
-      else if (VT == MVT::v4i32 || VT == MVT::v4f32)
-        return SelectLoad(Node, 4, ARM64::LD1Fourv4s, ARM64::qsub0);
-      else if (VT == MVT::v1i64 || VT == MVT::v1f64)
-        return SelectLoad(Node, 4, ARM64::LD1Fourv1d, ARM64::dsub0);
-      else if (VT == MVT::v2i64 || VT == MVT::v2f64)
-        return SelectLoad(Node, 4, ARM64::LD1Fourv2d, ARM64::qsub0);
-      break;
-    case Intrinsic::arm64_neon_ld2:
-      if (VT == MVT::v8i8)
-        return SelectLoad(Node, 2, ARM64::LD2Twov8b, ARM64::dsub0);
-      else if (VT == MVT::v16i8)
-        return SelectLoad(Node, 2, ARM64::LD2Twov16b, ARM64::qsub0);
-      else if (VT == MVT::v4i16)
-        return SelectLoad(Node, 2, ARM64::LD2Twov4h, ARM64::dsub0);
-      else if (VT == MVT::v8i16)
-        return SelectLoad(Node, 2, ARM64::LD2Twov8h, ARM64::qsub0);
-      else if (VT == MVT::v2i32 || VT == MVT::v2f32)
-        return SelectLoad(Node, 2, ARM64::LD2Twov2s, ARM64::dsub0);
-      else if (VT == MVT::v4i32 || VT == MVT::v4f32)
-        return SelectLoad(Node, 2, ARM64::LD2Twov4s, ARM64::qsub0);
-      else if (VT == MVT::v1i64 || VT == MVT::v1f64)
-        return SelectLoad(Node, 2, ARM64::LD1Twov1d, ARM64::dsub0);
-      else if (VT == MVT::v2i64 || VT == MVT::v2f64)
-        return SelectLoad(Node, 2, ARM64::LD2Twov2d, ARM64::qsub0);
-      break;
-    case Intrinsic::arm64_neon_ld3:
-      if (VT == MVT::v8i8)
-        return SelectLoad(Node, 3, ARM64::LD3Threev8b, ARM64::dsub0);
-      else if (VT == MVT::v16i8)
-        return SelectLoad(Node, 3, ARM64::LD3Threev16b, ARM64::qsub0);
-      else if (VT == MVT::v4i16)
-        return SelectLoad(Node, 3, ARM64::LD3Threev4h, ARM64::dsub0);
-      else if (VT == MVT::v8i16)
-        return SelectLoad(Node, 3, ARM64::LD3Threev8h, ARM64::qsub0);
-      else if (VT == MVT::v2i32 || VT == MVT::v2f32)
-        return SelectLoad(Node, 3, ARM64::LD3Threev2s, ARM64::dsub0);
-      else if (VT == MVT::v4i32 || VT == MVT::v4f32)
-        return SelectLoad(Node, 3, ARM64::LD3Threev4s, ARM64::qsub0);
-      else if (VT == MVT::v1i64 || VT == MVT::v1f64)
-        return SelectLoad(Node, 3, ARM64::LD1Threev1d, ARM64::dsub0);
-      else if (VT == MVT::v2i64 || VT == MVT::v2f64)
-        return SelectLoad(Node, 3, ARM64::LD3Threev2d, ARM64::qsub0);
-      break;
-    case Intrinsic::arm64_neon_ld4:
-      if (VT == MVT::v8i8)
-        return SelectLoad(Node, 4, ARM64::LD4Fourv8b, ARM64::dsub0);
-      else if (VT == MVT::v16i8)
-        return SelectLoad(Node, 4, ARM64::LD4Fourv16b, ARM64::qsub0);
-      else if (VT == MVT::v4i16)
-        return SelectLoad(Node, 4, ARM64::LD4Fourv4h, ARM64::dsub0);
-      else if (VT == MVT::v8i16)
-        return SelectLoad(Node, 4, ARM64::LD4Fourv8h, ARM64::qsub0);
-      else if (VT == MVT::v2i32 || VT == MVT::v2f32)
-        return SelectLoad(Node, 4, ARM64::LD4Fourv2s, ARM64::dsub0);
-      else if (VT == MVT::v4i32 || VT == MVT::v4f32)
-        return SelectLoad(Node, 4, ARM64::LD4Fourv4s, ARM64::qsub0);
-      else if (VT == MVT::v1i64 || VT == MVT::v1f64)
-        return SelectLoad(Node, 4, ARM64::LD1Fourv1d, ARM64::dsub0);
-      else if (VT == MVT::v2i64 || VT == MVT::v2f64)
-        return SelectLoad(Node, 4, ARM64::LD4Fourv2d, ARM64::qsub0);
-      break;
-    case Intrinsic::arm64_neon_ld2r:
-      if (VT == MVT::v8i8)
-        return SelectLoad(Node, 2, ARM64::LD2Rv8b, ARM64::dsub0);
-      else if (VT == MVT::v16i8)
-        return SelectLoad(Node, 2, ARM64::LD2Rv16b, ARM64::qsub0);
-      else if (VT == MVT::v4i16)
-        return SelectLoad(Node, 2, ARM64::LD2Rv4h, ARM64::dsub0);
-      else if (VT == MVT::v8i16)
-        return SelectLoad(Node, 2, ARM64::LD2Rv8h, ARM64::qsub0);
-      else if (VT == MVT::v2i32 || VT == MVT::v2f32)
-        return SelectLoad(Node, 2, ARM64::LD2Rv2s, ARM64::dsub0);
-      else if (VT == MVT::v4i32 || VT == MVT::v4f32)
-        return SelectLoad(Node, 2, ARM64::LD2Rv4s, ARM64::qsub0);
-      else if (VT == MVT::v1i64 || VT == MVT::v1f64)
-        return SelectLoad(Node, 2, ARM64::LD2Rv1d, ARM64::dsub0);
-      else if (VT == MVT::v2i64 || VT == MVT::v2f64)
-        return SelectLoad(Node, 2, ARM64::LD2Rv2d, ARM64::qsub0);
-      break;
-    case Intrinsic::arm64_neon_ld3r:
-      if (VT == MVT::v8i8)
-        return SelectLoad(Node, 3, ARM64::LD3Rv8b, ARM64::dsub0);
-      else if (VT == MVT::v16i8)
-        return SelectLoad(Node, 3, ARM64::LD3Rv16b, ARM64::qsub0);
-      else if (VT == MVT::v4i16)
-        return SelectLoad(Node, 3, ARM64::LD3Rv4h, ARM64::dsub0);
-      else if (VT == MVT::v8i16)
-        return SelectLoad(Node, 3, ARM64::LD3Rv8h, ARM64::qsub0);
-      else if (VT == MVT::v2i32 || VT == MVT::v2f32)
-        return SelectLoad(Node, 3, ARM64::LD3Rv2s, ARM64::dsub0);
-      else if (VT == MVT::v4i32 || VT == MVT::v4f32)
-        return SelectLoad(Node, 3, ARM64::LD3Rv4s, ARM64::qsub0);
-      else if (VT == MVT::v1i64 || VT == MVT::v1f64)
-        return SelectLoad(Node, 3, ARM64::LD3Rv1d, ARM64::dsub0);
-      else if (VT == MVT::v2i64 || VT == MVT::v2f64)
-        return SelectLoad(Node, 3, ARM64::LD3Rv2d, ARM64::qsub0);
-      break;
-    case Intrinsic::arm64_neon_ld4r:
-      if (VT == MVT::v8i8)
-        return SelectLoad(Node, 4, ARM64::LD4Rv8b, ARM64::dsub0);
-      else if (VT == MVT::v16i8)
-        return SelectLoad(Node, 4, ARM64::LD4Rv16b, ARM64::qsub0);
-      else if (VT == MVT::v4i16)
-        return SelectLoad(Node, 4, ARM64::LD4Rv4h, ARM64::dsub0);
-      else if (VT == MVT::v8i16)
-        return SelectLoad(Node, 4, ARM64::LD4Rv8h, ARM64::qsub0);
-      else if (VT == MVT::v2i32 || VT == MVT::v2f32)
-        return SelectLoad(Node, 4, ARM64::LD4Rv2s, ARM64::dsub0);
-      else if (VT == MVT::v4i32 || VT == MVT::v4f32)
-        return SelectLoad(Node, 4, ARM64::LD4Rv4s, ARM64::qsub0);
-      else if (VT == MVT::v1i64 || VT == MVT::v1f64)
-        return SelectLoad(Node, 4, ARM64::LD4Rv1d, ARM64::dsub0);
-      else if (VT == MVT::v2i64 || VT == MVT::v2f64)
-        return SelectLoad(Node, 4, ARM64::LD4Rv2d, ARM64::qsub0);
-      break;
-    case Intrinsic::arm64_neon_ld2lane:
-      if (VT == MVT::v16i8 || VT == MVT::v8i8)
-        return SelectLoadLane(Node, 2, ARM64::LD2i8);
-      else if (VT == MVT::v8i16 || VT == MVT::v4i16)
-        return SelectLoadLane(Node, 2, ARM64::LD2i16);
-      else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
-               VT == MVT::v2f32)
-        return SelectLoadLane(Node, 2, ARM64::LD2i32);
-      else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
-               VT == MVT::v1f64)
-        return SelectLoadLane(Node, 2, ARM64::LD2i64);
-      break;
-    case Intrinsic::arm64_neon_ld3lane:
-      if (VT == MVT::v16i8 || VT == MVT::v8i8)
-        return SelectLoadLane(Node, 3, ARM64::LD3i8);
-      else if (VT == MVT::v8i16 || VT == MVT::v4i16)
-        return SelectLoadLane(Node, 3, ARM64::LD3i16);
-      else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
-               VT == MVT::v2f32)
-        return SelectLoadLane(Node, 3, ARM64::LD3i32);
-      else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
-               VT == MVT::v1f64)
-        return SelectLoadLane(Node, 3, ARM64::LD3i64);
-      break;
-    case Intrinsic::arm64_neon_ld4lane:
-      if (VT == MVT::v16i8 || VT == MVT::v8i8)
-        return SelectLoadLane(Node, 4, ARM64::LD4i8);
-      else if (VT == MVT::v8i16 || VT == MVT::v4i16)
-        return SelectLoadLane(Node, 4, ARM64::LD4i16);
-      else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
-               VT == MVT::v2f32)
-        return SelectLoadLane(Node, 4, ARM64::LD4i32);
-      else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
-               VT == MVT::v1f64)
-        return SelectLoadLane(Node, 4, ARM64::LD4i64);
-      break;
-    }
-  } break;
-  case ISD::INTRINSIC_WO_CHAIN: {
-    unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(0))->getZExtValue();
-    switch (IntNo) {
-    default:
-      break;
-    case Intrinsic::arm64_neon_tbl2:
-      return SelectTable(Node, 2, VT == MVT::v8i8 ? ARM64::TBLv8i8Two
-                                                  : ARM64::TBLv16i8Two,
-                         false);
-    case Intrinsic::arm64_neon_tbl3:
-      return SelectTable(Node, 3, VT == MVT::v8i8 ? ARM64::TBLv8i8Three
-                                                  : ARM64::TBLv16i8Three,
-                         false);
-    case Intrinsic::arm64_neon_tbl4:
-      return SelectTable(Node, 4, VT == MVT::v8i8 ? ARM64::TBLv8i8Four
-                                                  : ARM64::TBLv16i8Four,
-                         false);
-    case Intrinsic::arm64_neon_tbx2:
-      return SelectTable(Node, 2, VT == MVT::v8i8 ? ARM64::TBXv8i8Two
-                                                  : ARM64::TBXv16i8Two,
-                         true);
-    case Intrinsic::arm64_neon_tbx3:
-      return SelectTable(Node, 3, VT == MVT::v8i8 ? ARM64::TBXv8i8Three
-                                                  : ARM64::TBXv16i8Three,
-                         true);
-    case Intrinsic::arm64_neon_tbx4:
-      return SelectTable(Node, 4, VT == MVT::v8i8 ? ARM64::TBXv8i8Four
-                                                  : ARM64::TBXv16i8Four,
-                         true);
-    case Intrinsic::arm64_neon_smull:
-    case Intrinsic::arm64_neon_umull:
-      if (SDNode *N = SelectMULLV64LaneV128(IntNo, Node))
-        return N;
-      break;
-    }
-    break;
-  }
-  case ISD::INTRINSIC_VOID: {
-    unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
-    if (Node->getNumOperands() >= 3)
-      VT = Node->getOperand(2)->getValueType(0);
-    switch (IntNo) {
-    default:
-      break;
-    case Intrinsic::arm64_neon_st1x2: {
-      if (VT == MVT::v8i8)
-        return SelectStore(Node, 2, ARM64::ST1Twov8b);
-      else if (VT == MVT::v16i8)
-        return SelectStore(Node, 2, ARM64::ST1Twov16b);
-      else if (VT == MVT::v4i16)
-        return SelectStore(Node, 2, ARM64::ST1Twov4h);
-      else if (VT == MVT::v8i16)
-        return SelectStore(Node, 2, ARM64::ST1Twov8h);
-      else if (VT == MVT::v2i32 || VT == MVT::v2f32)
-        return SelectStore(Node, 2, ARM64::ST1Twov2s);
-      else if (VT == MVT::v4i32 || VT == MVT::v4f32)
-        return SelectStore(Node, 2, ARM64::ST1Twov4s);
-      else if (VT == MVT::v2i64 || VT == MVT::v2f64)
-        return SelectStore(Node, 2, ARM64::ST1Twov2d);
-      else if (VT == MVT::v1i64 || VT == MVT::v1f64)
-        return SelectStore(Node, 2, ARM64::ST1Twov1d);
-      break;
-    }
-    case Intrinsic::arm64_neon_st1x3: {
-      if (VT == MVT::v8i8)
-        return SelectStore(Node, 3, ARM64::ST1Threev8b);
-      else if (VT == MVT::v16i8)
-        return SelectStore(Node, 3, ARM64::ST1Threev16b);
-      else if (VT == MVT::v4i16)
-        return SelectStore(Node, 3, ARM64::ST1Threev4h);
-      else if (VT == MVT::v8i16)
-        return SelectStore(Node, 3, ARM64::ST1Threev8h);
-      else if (VT == MVT::v2i32 || VT == MVT::v2f32)
-        return SelectStore(Node, 3, ARM64::ST1Threev2s);
-      else if (VT == MVT::v4i32 || VT == MVT::v4f32)
-        return SelectStore(Node, 3, ARM64::ST1Threev4s);
-      else if (VT == MVT::v2i64 || VT == MVT::v2f64)
-        return SelectStore(Node, 3, ARM64::ST1Threev2d);
-      else if (VT == MVT::v1i64 || VT == MVT::v1f64)
-        return SelectStore(Node, 3, ARM64::ST1Threev1d);
-      break;
-    }
-    case Intrinsic::arm64_neon_st1x4: {
-      if (VT == MVT::v8i8)
-        return SelectStore(Node, 4, ARM64::ST1Fourv8b);
-      else if (VT == MVT::v16i8)
-        return SelectStore(Node, 4, ARM64::ST1Fourv16b);
-      else if (VT == MVT::v4i16)
-        return SelectStore(Node, 4, ARM64::ST1Fourv4h);
-      else if (VT == MVT::v8i16)
-        return SelectStore(Node, 4, ARM64::ST1Fourv8h);
-      else if (VT == MVT::v2i32 || VT == MVT::v2f32)
-        return SelectStore(Node, 4, ARM64::ST1Fourv2s);
-      else if (VT == MVT::v4i32 || VT == MVT::v4f32)
-        return SelectStore(Node, 4, ARM64::ST1Fourv4s);
-      else if (VT == MVT::v2i64 || VT == MVT::v2f64)
-        return SelectStore(Node, 4, ARM64::ST1Fourv2d);
-      else if (VT == MVT::v1i64 || VT == MVT::v1f64)
-        return SelectStore(Node, 4, ARM64::ST1Fourv1d);
-      break;
-    }
-    case Intrinsic::arm64_neon_st2: {
-      if (VT == MVT::v8i8)
-        return SelectStore(Node, 2, ARM64::ST2Twov8b);
-      else if (VT == MVT::v16i8)
-        return SelectStore(Node, 2, ARM64::ST2Twov16b);
-      else if (VT == MVT::v4i16)
-        return SelectStore(Node, 2, ARM64::ST2Twov4h);
-      else if (VT == MVT::v8i16)
-        return SelectStore(Node, 2, ARM64::ST2Twov8h);
-      else if (VT == MVT::v2i32 || VT == MVT::v2f32)
-        return SelectStore(Node, 2, ARM64::ST2Twov2s);
-      else if (VT == MVT::v4i32 || VT == MVT::v4f32)
-        return SelectStore(Node, 2, ARM64::ST2Twov4s);
-      else if (VT == MVT::v2i64 || VT == MVT::v2f64)
-        return SelectStore(Node, 2, ARM64::ST2Twov2d);
-      else if (VT == MVT::v1i64 || VT == MVT::v1f64)
-        return SelectStore(Node, 2, ARM64::ST1Twov1d);
-      break;
-    }
-    case Intrinsic::arm64_neon_st3: {
-      if (VT == MVT::v8i8)
-        return SelectStore(Node, 3, ARM64::ST3Threev8b);
-      else if (VT == MVT::v16i8)
-        return SelectStore(Node, 3, ARM64::ST3Threev16b);
-      else if (VT == MVT::v4i16)
-        return SelectStore(Node, 3, ARM64::ST3Threev4h);
-      else if (VT == MVT::v8i16)
-        return SelectStore(Node, 3, ARM64::ST3Threev8h);
-      else if (VT == MVT::v2i32 || VT == MVT::v2f32)
-        return SelectStore(Node, 3, ARM64::ST3Threev2s);
-      else if (VT == MVT::v4i32 || VT == MVT::v4f32)
-        return SelectStore(Node, 3, ARM64::ST3Threev4s);
-      else if (VT == MVT::v2i64 || VT == MVT::v2f64)
-        return SelectStore(Node, 3, ARM64::ST3Threev2d);
-      else if (VT == MVT::v1i64 || VT == MVT::v1f64)
-        return SelectStore(Node, 3, ARM64::ST1Threev1d);
-      break;
-    }
-    case Intrinsic::arm64_neon_st4: {
-      if (VT == MVT::v8i8)
-        return SelectStore(Node, 4, ARM64::ST4Fourv8b);
-      else if (VT == MVT::v16i8)
-        return SelectStore(Node, 4, ARM64::ST4Fourv16b);
-      else if (VT == MVT::v4i16)
-        return SelectStore(Node, 4, ARM64::ST4Fourv4h);
-      else if (VT == MVT::v8i16)
-        return SelectStore(Node, 4, ARM64::ST4Fourv8h);
-      else if (VT == MVT::v2i32 || VT == MVT::v2f32)
-        return SelectStore(Node, 4, ARM64::ST4Fourv2s);
-      else if (VT == MVT::v4i32 || VT == MVT::v4f32)
-        return SelectStore(Node, 4, ARM64::ST4Fourv4s);
-      else if (VT == MVT::v2i64 || VT == MVT::v2f64)
-        return SelectStore(Node, 4, ARM64::ST4Fourv2d);
-      else if (VT == MVT::v1i64 || VT == MVT::v1f64)
-        return SelectStore(Node, 4, ARM64::ST1Fourv1d);
-      break;
-    }
-    case Intrinsic::arm64_neon_st2lane: {
-      if (VT == MVT::v16i8 || VT == MVT::v8i8)
-        return SelectStoreLane(Node, 2, ARM64::ST2i8);
-      else if (VT == MVT::v8i16 || VT == MVT::v4i16)
-        return SelectStoreLane(Node, 2, ARM64::ST2i16);
-      else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
-               VT == MVT::v2f32)
-        return SelectStoreLane(Node, 2, ARM64::ST2i32);
-      else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
-               VT == MVT::v1f64)
-        return SelectStoreLane(Node, 2, ARM64::ST2i64);
-      break;
-    }
-    case Intrinsic::arm64_neon_st3lane: {
-      if (VT == MVT::v16i8 || VT == MVT::v8i8)
-        return SelectStoreLane(Node, 3, ARM64::ST3i8);
-      else if (VT == MVT::v8i16 || VT == MVT::v4i16)
-        return SelectStoreLane(Node, 3, ARM64::ST3i16);
-      else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
-               VT == MVT::v2f32)
-        return SelectStoreLane(Node, 3, ARM64::ST3i32);
-      else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
-               VT == MVT::v1f64)
-        return SelectStoreLane(Node, 3, ARM64::ST3i64);
-      break;
-    }
-    case Intrinsic::arm64_neon_st4lane: {
-      if (VT == MVT::v16i8 || VT == MVT::v8i8)
-        return SelectStoreLane(Node, 4, ARM64::ST4i8);
-      else if (VT == MVT::v8i16 || VT == MVT::v4i16)
-        return SelectStoreLane(Node, 4, ARM64::ST4i16);
-      else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
-               VT == MVT::v2f32)
-        return SelectStoreLane(Node, 4, ARM64::ST4i32);
-      else if (VT == MVT::v2i64 || VT == MVT::v1i64 || VT == MVT::v2f64 ||
-               VT == MVT::v1f64)
-        return SelectStoreLane(Node, 4, ARM64::ST4i64);
-      break;
-    }
-    }
-  }
-
-  case ISD::FCEIL:
-  case ISD::FFLOOR:
-  case ISD::FTRUNC:
-  case ISD::FROUND:
-    if (SDNode *I = SelectLIBM(Node))
-      return I;
-    break;
-  }
-
-  // Select the default instruction
-  ResNode = SelectCode(Node);
-
-  DEBUG(errs() << "=> ");
-  if (ResNode == NULL || ResNode == Node)
-    DEBUG(Node->dump(CurDAG));
-  else
-    DEBUG(ResNode->dump(CurDAG));
-  DEBUG(errs() << "\n");
-
-  return ResNode;
-}
-
-/// createARM64ISelDag - This pass converts a legalized DAG into a
-/// ARM64-specific DAG, ready for instruction scheduling.
-FunctionPass *llvm::createARM64ISelDag(ARM64TargetMachine &TM,
-                                       CodeGenOpt::Level OptLevel) {
-  return new ARM64DAGToDAGISel(TM, OptLevel);
-}
diff --git a/lib/Target/ARM64/ARM64ISelLowering.cpp b/lib/Target/ARM64/ARM64ISelLowering.cpp
deleted file mode 100644
index 641f591..0000000
--- a/lib/Target/ARM64/ARM64ISelLowering.cpp
+++ /dev/null
@@ -1,7551 +0,0 @@
-//===-- ARM64ISelLowering.cpp - ARM64 DAG Lowering Implementation  --------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the ARM64TargetLowering class.
-//
-//===----------------------------------------------------------------------===//
-
-#define DEBUG_TYPE "arm64-lower"
-
-#include "ARM64ISelLowering.h"
-#include "ARM64PerfectShuffle.h"
-#include "ARM64Subtarget.h"
-#include "ARM64CallingConv.h"
-#include "ARM64MachineFunctionInfo.h"
-#include "ARM64TargetMachine.h"
-#include "ARM64TargetObjectFile.h"
-#include "MCTargetDesc/ARM64AddressingModes.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/CodeGen/CallingConvLower.h"
-#include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/Type.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetOptions.h"
-using namespace llvm;
-
-STATISTIC(NumTailCalls, "Number of tail calls");
-STATISTIC(NumShiftInserts, "Number of vector shift inserts");
-
-// This option should go away when tail calls fully work.
-static cl::opt<bool>
-EnableARM64TailCalls("arm64-tail-calls", cl::Hidden,
-                     cl::desc("Generate ARM64 tail calls (TEMPORARY OPTION)."),
-                     cl::init(true));
-
-static cl::opt<bool>
-StrictAlign("arm64-strict-align", cl::Hidden,
-            cl::desc("Disallow all unaligned memory accesses"));
-
-// Place holder until extr generation is tested fully.
-static cl::opt<bool>
-EnableARM64ExtrGeneration("arm64-extr-generation", cl::Hidden,
-                          cl::desc("Allow ARM64 (or (shift)(shift))->extract"),
-                          cl::init(true));
-
-static cl::opt<bool>
-EnableARM64SlrGeneration("arm64-shift-insert-generation", cl::Hidden,
-                         cl::desc("Allow ARM64 SLI/SRI formation"),
-                         cl::init(false));
-
-//===----------------------------------------------------------------------===//
-// ARM64 Lowering public interface.
-//===----------------------------------------------------------------------===//
-static TargetLoweringObjectFile *createTLOF(TargetMachine &TM) {
-  if (TM.getSubtarget<ARM64Subtarget>().isTargetDarwin())
-    return new ARM64_MachoTargetObjectFile();
-
-  return new ARM64_ELFTargetObjectFile();
-}
-
-ARM64TargetLowering::ARM64TargetLowering(ARM64TargetMachine &TM)
-    : TargetLowering(TM, createTLOF(TM)) {
-  Subtarget = &TM.getSubtarget<ARM64Subtarget>();
-
-  // ARM64 doesn't have comparisons which set GPRs or setcc instructions, so
-  // we have to make something up. Arbitrarily, choose ZeroOrOne.
-  setBooleanContents(ZeroOrOneBooleanContent);
-  // When comparing vectors the result sets the different elements in the
-  // vector to all-one or all-zero.
-  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
-
-  // Set up the register classes.
-  addRegisterClass(MVT::i32, &ARM64::GPR32allRegClass);
-  addRegisterClass(MVT::i64, &ARM64::GPR64allRegClass);
-  addRegisterClass(MVT::f32, &ARM64::FPR32RegClass);
-  addRegisterClass(MVT::f64, &ARM64::FPR64RegClass);
-  addRegisterClass(MVT::f128, &ARM64::FPR128RegClass);
-  addRegisterClass(MVT::v16i8, &ARM64::FPR8RegClass);
-  addRegisterClass(MVT::v8i16, &ARM64::FPR16RegClass);
-
-  // Someone set us up the NEON.
-  addDRTypeForNEON(MVT::v2f32);
-  addDRTypeForNEON(MVT::v8i8);
-  addDRTypeForNEON(MVT::v4i16);
-  addDRTypeForNEON(MVT::v2i32);
-  addDRTypeForNEON(MVT::v1i64);
-  addDRTypeForNEON(MVT::v1f64);
-
-  addQRTypeForNEON(MVT::v4f32);
-  addQRTypeForNEON(MVT::v2f64);
-  addQRTypeForNEON(MVT::v16i8);
-  addQRTypeForNEON(MVT::v8i16);
-  addQRTypeForNEON(MVT::v4i32);
-  addQRTypeForNEON(MVT::v2i64);
-
-  // Compute derived properties from the register classes
-  computeRegisterProperties();
-
-  // Provide all sorts of operation actions
-  setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
-  setOperationAction(ISD::GlobalTLSAddress, MVT::i64, Custom);
-  setOperationAction(ISD::SETCC, MVT::i32, Custom);
-  setOperationAction(ISD::SETCC, MVT::i64, Custom);
-  setOperationAction(ISD::SETCC, MVT::f32, Custom);
-  setOperationAction(ISD::SETCC, MVT::f64, Custom);
-  setOperationAction(ISD::BRCOND, MVT::Other, Expand);
-  setOperationAction(ISD::BR_CC, MVT::i32, Custom);
-  setOperationAction(ISD::BR_CC, MVT::i64, Custom);
-  setOperationAction(ISD::BR_CC, MVT::f32, Custom);
-  setOperationAction(ISD::BR_CC, MVT::f64, Custom);
-  setOperationAction(ISD::SELECT, MVT::i32, Custom);
-  setOperationAction(ISD::SELECT, MVT::i64, Custom);
-  setOperationAction(ISD::SELECT, MVT::f32, Custom);
-  setOperationAction(ISD::SELECT, MVT::f64, Custom);
-  setOperationAction(ISD::SELECT_CC, MVT::i32, Custom);
-  setOperationAction(ISD::SELECT_CC, MVT::i64, Custom);
-  setOperationAction(ISD::SELECT_CC, MVT::f32, Custom);
-  setOperationAction(ISD::SELECT_CC, MVT::f64, Custom);
-  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
-  setOperationAction(ISD::JumpTable, MVT::i64, Custom);
-
-  setOperationAction(ISD::SHL_PARTS, MVT::i64, Custom);
-  setOperationAction(ISD::SRA_PARTS, MVT::i64, Custom);
-  setOperationAction(ISD::SRL_PARTS, MVT::i64, Custom);
-
-  setOperationAction(ISD::FREM, MVT::f32, Expand);
-  setOperationAction(ISD::FREM, MVT::f64, Expand);
-  setOperationAction(ISD::FREM, MVT::f80, Expand);
-
-  // FIXME: v1f64 shouldn't be legal if we can avoid it, because it leads to
-  // silliness like this:
-  setOperationAction(ISD::FABS, MVT::v1f64, Expand);
-  setOperationAction(ISD::FADD, MVT::v1f64, Expand);
-  setOperationAction(ISD::FCEIL, MVT::v1f64, Expand);
-  setOperationAction(ISD::FCOPYSIGN, MVT::v1f64, Expand);
-  setOperationAction(ISD::FCOS, MVT::v1f64, Expand);
-  setOperationAction(ISD::FDIV, MVT::v1f64, Expand);
-  setOperationAction(ISD::FFLOOR, MVT::v1f64, Expand);
-  setOperationAction(ISD::FMA, MVT::v1f64, Expand);
-  setOperationAction(ISD::FMUL, MVT::v1f64, Expand);
-  setOperationAction(ISD::FNEARBYINT, MVT::v1f64, Expand);
-  setOperationAction(ISD::FNEG, MVT::v1f64, Expand);
-  setOperationAction(ISD::FPOW, MVT::v1f64, Expand);
-  setOperationAction(ISD::FREM, MVT::v1f64, Expand);
-  setOperationAction(ISD::FROUND, MVT::v1f64, Expand);
-  setOperationAction(ISD::FRINT, MVT::v1f64, Expand);
-  setOperationAction(ISD::FSIN, MVT::v1f64, Expand);
-  setOperationAction(ISD::FSINCOS, MVT::v1f64, Expand);
-  setOperationAction(ISD::FSQRT, MVT::v1f64, Expand);
-  setOperationAction(ISD::FSUB, MVT::v1f64, Expand);
-  setOperationAction(ISD::FTRUNC, MVT::v1f64, Expand);
-  setOperationAction(ISD::SETCC, MVT::v1f64, Expand);
-  setOperationAction(ISD::BR_CC, MVT::v1f64, Expand);
-  setOperationAction(ISD::SELECT, MVT::v1f64, Expand);
-  setOperationAction(ISD::SELECT_CC, MVT::v1f64, Expand);
-  setOperationAction(ISD::FP_EXTEND, MVT::v1f64, Expand);
-
-  setOperationAction(ISD::FP_TO_SINT, MVT::v1i64, Expand);
-  setOperationAction(ISD::FP_TO_UINT, MVT::v1i64, Expand);
-  setOperationAction(ISD::SINT_TO_FP, MVT::v1i64, Expand);
-  setOperationAction(ISD::UINT_TO_FP, MVT::v1i64, Expand);
-  setOperationAction(ISD::FP_ROUND, MVT::v1f64, Expand);
-
-  // Custom lowering hooks are needed for XOR
-  // to fold it into CSINC/CSINV.
-  setOperationAction(ISD::XOR, MVT::i32, Custom);
-  setOperationAction(ISD::XOR, MVT::i64, Custom);
-
-  // Virtually no operation on f128 is legal, but LLVM can't expand them when
-  // there's a valid register class, so we need custom operations in most cases.
-  setOperationAction(ISD::FABS, MVT::f128, Expand);
-  setOperationAction(ISD::FADD, MVT::f128, Custom);
-  setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand);
-  setOperationAction(ISD::FCOS, MVT::f128, Expand);
-  setOperationAction(ISD::FDIV, MVT::f128, Custom);
-  setOperationAction(ISD::FMA, MVT::f128, Expand);
-  setOperationAction(ISD::FMUL, MVT::f128, Custom);
-  setOperationAction(ISD::FNEG, MVT::f128, Expand);
-  setOperationAction(ISD::FPOW, MVT::f128, Expand);
-  setOperationAction(ISD::FREM, MVT::f128, Expand);
-  setOperationAction(ISD::FRINT, MVT::f128, Expand);
-  setOperationAction(ISD::FSIN, MVT::f128, Expand);
-  setOperationAction(ISD::FSINCOS, MVT::f128, Expand);
-  setOperationAction(ISD::FSQRT, MVT::f128, Expand);
-  setOperationAction(ISD::FSUB, MVT::f128, Custom);
-  setOperationAction(ISD::FTRUNC, MVT::f128, Expand);
-  setOperationAction(ISD::SETCC, MVT::f128, Custom);
-  setOperationAction(ISD::BR_CC, MVT::f128, Custom);
-  setOperationAction(ISD::SELECT, MVT::f128, Custom);
-  setOperationAction(ISD::SELECT_CC, MVT::f128, Custom);
-  setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
-
-  // Lowering for many of the conversions is actually specified by the non-f128
-  // type. The LowerXXX function will be trivial when f128 isn't involved.
-  setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
-  setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
-  setOperationAction(ISD::FP_TO_SINT, MVT::i128, Custom);
-  setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
-  setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
-  setOperationAction(ISD::FP_TO_UINT, MVT::i128, Custom);
-  setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
-  setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
-  setOperationAction(ISD::SINT_TO_FP, MVT::i128, Custom);
-  setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
-  setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
-  setOperationAction(ISD::UINT_TO_FP, MVT::i128, Custom);
-  setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
-  setOperationAction(ISD::FP_ROUND, MVT::f64, Custom);
-
-  // 128-bit atomics
-  setOperationAction(ISD::ATOMIC_SWAP, MVT::i128, Custom);
-  setOperationAction(ISD::ATOMIC_LOAD_OR, MVT::i128, Custom);
-  setOperationAction(ISD::ATOMIC_CMP_SWAP, MVT::i128, Custom);
-  setOperationAction(ISD::ATOMIC_LOAD_ADD, MVT::i128, Custom);
-  setOperationAction(ISD::ATOMIC_LOAD_SUB, MVT::i128, Custom);
-  setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i128, Custom);
-  setOperationAction(ISD::ATOMIC_LOAD_XOR, MVT::i128, Custom);
-  setOperationAction(ISD::ATOMIC_LOAD_MIN, MVT::i128, Custom);
-  setOperationAction(ISD::ATOMIC_LOAD_MAX, MVT::i128, Custom);
-  setOperationAction(ISD::ATOMIC_LOAD_NAND, MVT::i128, Custom);
-  setOperationAction(ISD::ATOMIC_LOAD_UMIN, MVT::i128, Custom);
-  setOperationAction(ISD::ATOMIC_LOAD_UMAX, MVT::i128, Custom);
-  // These are surprisingly difficult. The only single-copy atomic 128-bit
-  // instruction on AArch64 is stxp (when it succeeds). So a store can safely
-  // become a simple swap, but a load can only be determined to have been atomic
-  // if storing the same value back succeeds.
-  setOperationAction(ISD::ATOMIC_LOAD, MVT::i128, Custom);
-  setOperationAction(ISD::ATOMIC_STORE, MVT::i128, Expand);
-
-  // Variable arguments.
-  setOperationAction(ISD::VASTART, MVT::Other, Custom);
-  setOperationAction(ISD::VAARG, MVT::Other, Custom);
-  setOperationAction(ISD::VACOPY, MVT::Other, Custom);
-  setOperationAction(ISD::VAEND, MVT::Other, Expand);
-
-  // Variable-sized objects.
-  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
-  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
-  setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Expand);
-
-  // Exception handling.
-  // FIXME: These are guesses. Has this been defined yet?
-  setExceptionPointerRegister(ARM64::X0);
-  setExceptionSelectorRegister(ARM64::X1);
-
-  // Constant pool entries
-  setOperationAction(ISD::ConstantPool, MVT::i64, Custom);
-
-  // BlockAddress
-  setOperationAction(ISD::BlockAddress, MVT::i64, Custom);
-
-  // Add/Sub overflow ops with MVT::Glues are lowered to CPSR dependences.
-  setOperationAction(ISD::ADDC, MVT::i32, Custom);
-  setOperationAction(ISD::ADDE, MVT::i32, Custom);
-  setOperationAction(ISD::SUBC, MVT::i32, Custom);
-  setOperationAction(ISD::SUBE, MVT::i32, Custom);
-  setOperationAction(ISD::ADDC, MVT::i64, Custom);
-  setOperationAction(ISD::ADDE, MVT::i64, Custom);
-  setOperationAction(ISD::SUBC, MVT::i64, Custom);
-  setOperationAction(ISD::SUBE, MVT::i64, Custom);
-
-  // ARM64 lacks both left-rotate and popcount instructions.
-  setOperationAction(ISD::ROTL, MVT::i32, Expand);
-  setOperationAction(ISD::ROTL, MVT::i64, Expand);
-
-  // ARM64 doesn't have a direct vector ->f32 conversion instructions for
-  // elements smaller than i32, so promote the input to i32 first.
-  setOperationAction(ISD::UINT_TO_FP, MVT::v4i8, Promote);
-  setOperationAction(ISD::SINT_TO_FP, MVT::v4i8, Promote);
-  setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Promote);
-  setOperationAction(ISD::SINT_TO_FP, MVT::v4i16, Promote);
-  // Similarly, there is no direct i32 -> f64 vector conversion instruction.
-  setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
-  setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);
-  setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Custom);
-  setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Custom);
-
-  // ARM64 doesn't have {U|S}MUL_LOHI.
-  setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
-  setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
-
-  // ARM64 doesn't have MUL.2d:
-  setOperationAction(ISD::MUL, MVT::v2i64, Expand);
-
-  // Expand the undefined-at-zero variants to cttz/ctlz to their defined-at-zero
-  // counterparts, which ARM64 supports directly.
-  setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Expand);
-  setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Expand);
-  setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Expand);
-  setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Expand);
-
-  setOperationAction(ISD::CTPOP, MVT::i32, Custom);
-  setOperationAction(ISD::CTPOP, MVT::i64, Custom);
-
-  setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
-  setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
-  setOperationAction(ISD::SREM, MVT::i32, Expand);
-  setOperationAction(ISD::SREM, MVT::i64, Expand);
-  setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
-  setOperationAction(ISD::UDIVREM, MVT::i64, Expand);
-  setOperationAction(ISD::UREM, MVT::i32, Expand);
-  setOperationAction(ISD::UREM, MVT::i64, Expand);
-
-  // Custom lower Add/Sub/Mul with overflow.
-  setOperationAction(ISD::SADDO, MVT::i32, Custom);
-  setOperationAction(ISD::SADDO, MVT::i64, Custom);
-  setOperationAction(ISD::UADDO, MVT::i32, Custom);
-  setOperationAction(ISD::UADDO, MVT::i64, Custom);
-  setOperationAction(ISD::SSUBO, MVT::i32, Custom);
-  setOperationAction(ISD::SSUBO, MVT::i64, Custom);
-  setOperationAction(ISD::USUBO, MVT::i32, Custom);
-  setOperationAction(ISD::USUBO, MVT::i64, Custom);
-  setOperationAction(ISD::SMULO, MVT::i32, Custom);
-  setOperationAction(ISD::SMULO, MVT::i64, Custom);
-  setOperationAction(ISD::UMULO, MVT::i32, Custom);
-  setOperationAction(ISD::UMULO, MVT::i64, Custom);
-
-  setOperationAction(ISD::FSIN, MVT::f32, Expand);
-  setOperationAction(ISD::FSIN, MVT::f64, Expand);
-  setOperationAction(ISD::FCOS, MVT::f32, Expand);
-  setOperationAction(ISD::FCOS, MVT::f64, Expand);
-  setOperationAction(ISD::FPOW, MVT::f32, Expand);
-  setOperationAction(ISD::FPOW, MVT::f64, Expand);
-  setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
-  setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
-
-  // ARM64 has implementations of a lot of rounding-like FP operations.
-  static MVT RoundingTypes[] = { MVT::f32,   MVT::f64,  MVT::v2f32,
-                                 MVT::v4f32, MVT::v2f64 };
-  for (unsigned I = 0; I < array_lengthof(RoundingTypes); ++I) {
-    MVT Ty = RoundingTypes[I];
-    setOperationAction(ISD::FFLOOR, Ty, Legal);
-    setOperationAction(ISD::FNEARBYINT, Ty, Legal);
-    setOperationAction(ISD::FCEIL, Ty, Legal);
-    setOperationAction(ISD::FRINT, Ty, Legal);
-    setOperationAction(ISD::FTRUNC, Ty, Legal);
-    setOperationAction(ISD::FROUND, Ty, Legal);
-  }
-
-  setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
-
-  if (Subtarget->isTargetMachO()) {
-    // For iOS, we don't want to the normal expansion of a libcall to
-    // sincos. We want to issue a libcall to __sincos_stret to avoid memory
-    // traffic.
-    setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
-    setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
-  } else {
-    setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
-    setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
-  }
-
-  // ARM64 does not have floating-point extending loads, i1 sign-extending load,
-  // floating-point truncating stores, or v2i32->v2i16 truncating store.
-  setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
-  setLoadExtAction(ISD::EXTLOAD, MVT::f64, Expand);
-  setLoadExtAction(ISD::EXTLOAD, MVT::f80, Expand);
-  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Expand);
-  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
-  setTruncStoreAction(MVT::f128, MVT::f80, Expand);
-  setTruncStoreAction(MVT::f128, MVT::f64, Expand);
-  setTruncStoreAction(MVT::f128, MVT::f32, Expand);
-  setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
-  // Indexed loads and stores are supported.
-  for (unsigned im = (unsigned)ISD::PRE_INC;
-       im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
-    setIndexedLoadAction(im, MVT::i8, Legal);
-    setIndexedLoadAction(im, MVT::i16, Legal);
-    setIndexedLoadAction(im, MVT::i32, Legal);
-    setIndexedLoadAction(im, MVT::i64, Legal);
-    setIndexedLoadAction(im, MVT::f64, Legal);
-    setIndexedLoadAction(im, MVT::f32, Legal);
-    setIndexedStoreAction(im, MVT::i8, Legal);
-    setIndexedStoreAction(im, MVT::i16, Legal);
-    setIndexedStoreAction(im, MVT::i32, Legal);
-    setIndexedStoreAction(im, MVT::i64, Legal);
-    setIndexedStoreAction(im, MVT::f64, Legal);
-    setIndexedStoreAction(im, MVT::f32, Legal);
-  }
-
-  // Likewise, narrowing and extending vector loads/stores aren't handled
-  // directly.
-  for (unsigned VT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
-       VT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++VT) {
-
-    setOperationAction(ISD::SIGN_EXTEND_INREG, (MVT::SimpleValueType)VT,
-                       Expand);
-
-    for (unsigned InnerVT = (unsigned)MVT::FIRST_VECTOR_VALUETYPE;
-         InnerVT <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++InnerVT)
-      setTruncStoreAction((MVT::SimpleValueType)VT,
-                          (MVT::SimpleValueType)InnerVT, Expand);
-    setLoadExtAction(ISD::SEXTLOAD, (MVT::SimpleValueType)VT, Expand);
-    setLoadExtAction(ISD::ZEXTLOAD, (MVT::SimpleValueType)VT, Expand);
-    setLoadExtAction(ISD::EXTLOAD, (MVT::SimpleValueType)VT, Expand);
-  }
-
-  // Trap.
-  setOperationAction(ISD::TRAP, MVT::Other, Legal);
-  setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Legal);
-
-  // We combine OR nodes for bitfield operations.
-  setTargetDAGCombine(ISD::OR);
-
-  // Vector add and sub nodes may conceal a high-half opportunity.
-  // Also, try to fold ADD into CSINC/CSINV..
-  setTargetDAGCombine(ISD::ADD);
-  setTargetDAGCombine(ISD::SUB);
-
-  setTargetDAGCombine(ISD::XOR);
-  setTargetDAGCombine(ISD::SINT_TO_FP);
-  setTargetDAGCombine(ISD::UINT_TO_FP);
-
-  setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
-
-  setTargetDAGCombine(ISD::ANY_EXTEND);
-  setTargetDAGCombine(ISD::ZERO_EXTEND);
-  setTargetDAGCombine(ISD::SIGN_EXTEND);
-  setTargetDAGCombine(ISD::BITCAST);
-  setTargetDAGCombine(ISD::CONCAT_VECTORS);
-  setTargetDAGCombine(ISD::STORE);
-
-  setTargetDAGCombine(ISD::MUL);
-
-  MaxStoresPerMemset = MaxStoresPerMemsetOptSize = 8;
-  MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 4;
-  MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = 4;
-
-  setStackPointerRegisterToSaveRestore(ARM64::SP);
-
-  setSchedulingPreference(Sched::Hybrid);
-
-  // Enable TBZ/TBNZ
-  MaskAndBranchFoldingIsLegal = true;
-
-  setMinFunctionAlignment(2);
-
-  RequireStrictAlign = StrictAlign;
-}
-
-void ARM64TargetLowering::addTypeForNEON(EVT VT, EVT PromotedBitwiseVT) {
-  if (VT == MVT::v2f32) {
-    setOperationAction(ISD::LOAD, VT.getSimpleVT(), Promote);
-    AddPromotedToType(ISD::LOAD, VT.getSimpleVT(), MVT::v2i32);
-
-    setOperationAction(ISD::STORE, VT.getSimpleVT(), Promote);
-    AddPromotedToType(ISD::STORE, VT.getSimpleVT(), MVT::v2i32);
-  } else if (VT == MVT::v2f64 || VT == MVT::v4f32) {
-    setOperationAction(ISD::LOAD, VT.getSimpleVT(), Promote);
-    AddPromotedToType(ISD::LOAD, VT.getSimpleVT(), MVT::v2i64);
-
-    setOperationAction(ISD::STORE, VT.getSimpleVT(), Promote);
-    AddPromotedToType(ISD::STORE, VT.getSimpleVT(), MVT::v2i64);
-  }
-
-  // Mark vector float intrinsics as expand.
-  if (VT == MVT::v2f32 || VT == MVT::v4f32 || VT == MVT::v2f64) {
-    setOperationAction(ISD::FSIN, VT.getSimpleVT(), Expand);
-    setOperationAction(ISD::FCOS, VT.getSimpleVT(), Expand);
-    setOperationAction(ISD::FPOWI, VT.getSimpleVT(), Expand);
-    setOperationAction(ISD::FPOW, VT.getSimpleVT(), Expand);
-    setOperationAction(ISD::FLOG, VT.getSimpleVT(), Expand);
-    setOperationAction(ISD::FLOG2, VT.getSimpleVT(), Expand);
-    setOperationAction(ISD::FLOG10, VT.getSimpleVT(), Expand);
-    setOperationAction(ISD::FEXP, VT.getSimpleVT(), Expand);
-    setOperationAction(ISD::FEXP2, VT.getSimpleVT(), Expand);
-  }
-
-  setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT.getSimpleVT(), Custom);
-  setOperationAction(ISD::INSERT_VECTOR_ELT, VT.getSimpleVT(), Custom);
-  setOperationAction(ISD::SCALAR_TO_VECTOR, VT.getSimpleVT(), Custom);
-  setOperationAction(ISD::BUILD_VECTOR, VT.getSimpleVT(), Custom);
-  setOperationAction(ISD::VECTOR_SHUFFLE, VT.getSimpleVT(), Custom);
-  setOperationAction(ISD::EXTRACT_SUBVECTOR, VT.getSimpleVT(), Custom);
-  setOperationAction(ISD::SRA, VT.getSimpleVT(), Custom);
-  setOperationAction(ISD::SRL, VT.getSimpleVT(), Custom);
-  setOperationAction(ISD::SHL, VT.getSimpleVT(), Custom);
-  setOperationAction(ISD::AND, VT.getSimpleVT(), Custom);
-  setOperationAction(ISD::OR, VT.getSimpleVT(), Custom);
-  setOperationAction(ISD::SETCC, VT.getSimpleVT(), Custom);
-  setOperationAction(ISD::CONCAT_VECTORS, VT.getSimpleVT(), Legal);
-
-  setOperationAction(ISD::SELECT, VT.getSimpleVT(), Expand);
-  setOperationAction(ISD::SELECT_CC, VT.getSimpleVT(), Expand);
-  setOperationAction(ISD::VSELECT, VT.getSimpleVT(), Expand);
-  setLoadExtAction(ISD::EXTLOAD, VT.getSimpleVT(), Expand);
-
-  setOperationAction(ISD::UDIV, VT.getSimpleVT(), Expand);
-  setOperationAction(ISD::SDIV, VT.getSimpleVT(), Expand);
-  setOperationAction(ISD::UREM, VT.getSimpleVT(), Expand);
-  setOperationAction(ISD::SREM, VT.getSimpleVT(), Expand);
-  setOperationAction(ISD::FREM, VT.getSimpleVT(), Expand);
-
-  setOperationAction(ISD::FP_TO_SINT, VT.getSimpleVT(), Custom);
-  setOperationAction(ISD::FP_TO_UINT, VT.getSimpleVT(), Custom);
-}
-
-void ARM64TargetLowering::addDRTypeForNEON(MVT VT) {
-  addRegisterClass(VT, &ARM64::FPR64RegClass);
-  addTypeForNEON(VT, MVT::v2i32);
-}
-
-void ARM64TargetLowering::addQRTypeForNEON(MVT VT) {
-  addRegisterClass(VT, &ARM64::FPR128RegClass);
-  addTypeForNEON(VT, MVT::v4i32);
-}
-
-EVT ARM64TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
-  if (!VT.isVector())
-    return MVT::i32;
-  return VT.changeVectorElementTypeToInteger();
-}
-
-/// computeMaskedBitsForTargetNode - Determine which of the bits specified in
-/// Mask are known to be either zero or one and return them in the
-/// KnownZero/KnownOne bitsets.
-void ARM64TargetLowering::computeMaskedBitsForTargetNode(
-    const SDValue Op, APInt &KnownZero, APInt &KnownOne,
-    const SelectionDAG &DAG, unsigned Depth) const {
-  switch (Op.getOpcode()) {
-  default:
-    break;
-  case ARM64ISD::CSEL: {
-    APInt KnownZero2, KnownOne2;
-    DAG.ComputeMaskedBits(Op->getOperand(0), KnownZero, KnownOne, Depth + 1);
-    DAG.ComputeMaskedBits(Op->getOperand(1), KnownZero2, KnownOne2, Depth + 1);
-    KnownZero &= KnownZero2;
-    KnownOne &= KnownOne2;
-    break;
-  }
-  case ISD::INTRINSIC_W_CHAIN:
-    break;
-  case ISD::INTRINSIC_WO_CHAIN:
-  case ISD::INTRINSIC_VOID: {
-    unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
-    switch (IntNo) {
-    default:
-      break;
-    case Intrinsic::arm64_neon_umaxv:
-    case Intrinsic::arm64_neon_uminv: {
-      // Figure out the datatype of the vector operand. The UMINV instruction
-      // will zero extend the result, so we can mark as known zero all the
-      // bits larger than the element datatype. 32-bit or larget doesn't need
-      // this as those are legal types and will be handled by isel directly.
-      MVT VT = Op.getOperand(1).getValueType().getSimpleVT();
-      unsigned BitWidth = KnownZero.getBitWidth();
-      if (VT == MVT::v8i8 || VT == MVT::v16i8) {
-        assert(BitWidth >= 8 && "Unexpected width!");
-        APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 8);
-        KnownZero |= Mask;
-      } else if (VT == MVT::v4i16 || VT == MVT::v8i16) {
-        assert(BitWidth >= 16 && "Unexpected width!");
-        APInt Mask = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
-        KnownZero |= Mask;
-      }
-      break;
-    } break;
-    }
-  }
-  }
-}
-
-MVT ARM64TargetLowering::getScalarShiftAmountTy(EVT LHSTy) const {
-  return MVT::i64;
-}
-
-unsigned ARM64TargetLowering::getMaximalGlobalOffset() const {
-  // FIXME: On ARM64, this depends on the type.
-  // Basically, the addressable offsets are o to 4095 * Ty.getSizeInBytes().
-  // and the offset has to be a multiple of the related size in bytes.
-  return 4095;
-}
-
-FastISel *
-ARM64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
-                                    const TargetLibraryInfo *libInfo) const {
-  return ARM64::createFastISel(funcInfo, libInfo);
-}
-
-const char *ARM64TargetLowering::getTargetNodeName(unsigned Opcode) const {
-  switch (Opcode) {
-  default:
-    return 0;
-  case ARM64ISD::CALL:              return "ARM64ISD::CALL";
-  case ARM64ISD::ADRP:              return "ARM64ISD::ADRP";
-  case ARM64ISD::ADDlow:            return "ARM64ISD::ADDlow";
-  case ARM64ISD::LOADgot:           return "ARM64ISD::LOADgot";
-  case ARM64ISD::RET_FLAG:          return "ARM64ISD::RET_FLAG";
-  case ARM64ISD::BRCOND:            return "ARM64ISD::BRCOND";
-  case ARM64ISD::CSEL:              return "ARM64ISD::CSEL";
-  case ARM64ISD::FCSEL:             return "ARM64ISD::FCSEL";
-  case ARM64ISD::CSINV:             return "ARM64ISD::CSINV";
-  case ARM64ISD::CSNEG:             return "ARM64ISD::CSNEG";
-  case ARM64ISD::CSINC:             return "ARM64ISD::CSINC";
-  case ARM64ISD::THREAD_POINTER:    return "ARM64ISD::THREAD_POINTER";
-  case ARM64ISD::TLSDESC_CALL:      return "ARM64ISD::TLSDESC_CALL";
-  case ARM64ISD::ADC:               return "ARM64ISD::ADC";
-  case ARM64ISD::SBC:               return "ARM64ISD::SBC";
-  case ARM64ISD::ADDS:              return "ARM64ISD::ADDS";
-  case ARM64ISD::SUBS:              return "ARM64ISD::SUBS";
-  case ARM64ISD::ADCS:              return "ARM64ISD::ADCS";
-  case ARM64ISD::SBCS:              return "ARM64ISD::SBCS";
-  case ARM64ISD::ANDS:              return "ARM64ISD::ANDS";
-  case ARM64ISD::FCMP:              return "ARM64ISD::FCMP";
-  case ARM64ISD::FMIN:              return "ARM64ISD::FMIN";
-  case ARM64ISD::FMAX:              return "ARM64ISD::FMAX";
-  case ARM64ISD::DUP:               return "ARM64ISD::DUP";
-  case ARM64ISD::DUPLANE8:          return "ARM64ISD::DUPLANE8";
-  case ARM64ISD::DUPLANE16:         return "ARM64ISD::DUPLANE16";
-  case ARM64ISD::DUPLANE32:         return "ARM64ISD::DUPLANE32";
-  case ARM64ISD::DUPLANE64:         return "ARM64ISD::DUPLANE64";
-  case ARM64ISD::MOVI:              return "ARM64ISD::MOVI";
-  case ARM64ISD::MOVIshift:         return "ARM64ISD::MOVIshift";
-  case ARM64ISD::MOVIedit:          return "ARM64ISD::MOVIedit";
-  case ARM64ISD::MOVImsl:           return "ARM64ISD::MOVImsl";
-  case ARM64ISD::FMOV:              return "ARM64ISD::FMOV";
-  case ARM64ISD::MVNIshift:         return "ARM64ISD::MVNIshift";
-  case ARM64ISD::MVNImsl:           return "ARM64ISD::MVNImsl";
-  case ARM64ISD::BICi:              return "ARM64ISD::BICi";
-  case ARM64ISD::ORRi:              return "ARM64ISD::ORRi";
-  case ARM64ISD::NEG:               return "ARM64ISD::NEG";
-  case ARM64ISD::EXTR:              return "ARM64ISD::EXTR";
-  case ARM64ISD::ZIP1:              return "ARM64ISD::ZIP1";
-  case ARM64ISD::ZIP2:              return "ARM64ISD::ZIP2";
-  case ARM64ISD::UZP1:              return "ARM64ISD::UZP1";
-  case ARM64ISD::UZP2:              return "ARM64ISD::UZP2";
-  case ARM64ISD::TRN1:              return "ARM64ISD::TRN1";
-  case ARM64ISD::TRN2:              return "ARM64ISD::TRN2";
-  case ARM64ISD::REV16:             return "ARM64ISD::REV16";
-  case ARM64ISD::REV32:             return "ARM64ISD::REV32";
-  case ARM64ISD::REV64:             return "ARM64ISD::REV64";
-  case ARM64ISD::EXT:               return "ARM64ISD::EXT";
-  case ARM64ISD::VSHL:              return "ARM64ISD::VSHL";
-  case ARM64ISD::VLSHR:             return "ARM64ISD::VLSHR";
-  case ARM64ISD::VASHR:             return "ARM64ISD::VASHR";
-  case ARM64ISD::CMEQ:              return "ARM64ISD::CMEQ";
-  case ARM64ISD::CMGE:              return "ARM64ISD::CMGE";
-  case ARM64ISD::CMGT:              return "ARM64ISD::CMGT";
-  case ARM64ISD::CMHI:              return "ARM64ISD::CMHI";
-  case ARM64ISD::CMHS:              return "ARM64ISD::CMHS";
-  case ARM64ISD::FCMEQ:             return "ARM64ISD::FCMEQ";
-  case ARM64ISD::FCMGE:             return "ARM64ISD::FCMGE";
-  case ARM64ISD::FCMGT:             return "ARM64ISD::FCMGT";
-  case ARM64ISD::CMEQz:             return "ARM64ISD::CMEQz";
-  case ARM64ISD::CMGEz:             return "ARM64ISD::CMGEz";
-  case ARM64ISD::CMGTz:             return "ARM64ISD::CMGTz";
-  case ARM64ISD::CMLEz:             return "ARM64ISD::CMLEz";
-  case ARM64ISD::CMLTz:             return "ARM64ISD::CMLTz";
-  case ARM64ISD::FCMEQz:            return "ARM64ISD::FCMEQz";
-  case ARM64ISD::FCMGEz:            return "ARM64ISD::FCMGEz";
-  case ARM64ISD::FCMGTz:            return "ARM64ISD::FCMGTz";
-  case ARM64ISD::FCMLEz:            return "ARM64ISD::FCMLEz";
-  case ARM64ISD::FCMLTz:            return "ARM64ISD::FCMLTz";
-  case ARM64ISD::NOT:               return "ARM64ISD::NOT";
-  case ARM64ISD::BIT:               return "ARM64ISD::BIT";
-  case ARM64ISD::CBZ:               return "ARM64ISD::CBZ";
-  case ARM64ISD::CBNZ:              return "ARM64ISD::CBNZ";
-  case ARM64ISD::TBZ:               return "ARM64ISD::TBZ";
-  case ARM64ISD::TBNZ:              return "ARM64ISD::TBNZ";
-  case ARM64ISD::TC_RETURN:         return "ARM64ISD::TC_RETURN";
-  case ARM64ISD::SITOF:             return "ARM64ISD::SITOF";
-  case ARM64ISD::UITOF:             return "ARM64ISD::UITOF";
-  case ARM64ISD::SQSHL_I:           return "ARM64ISD::SQSHL_I";
-  case ARM64ISD::UQSHL_I:           return "ARM64ISD::UQSHL_I";
-  case ARM64ISD::SRSHR_I:           return "ARM64ISD::SRSHR_I";
-  case ARM64ISD::URSHR_I:           return "ARM64ISD::URSHR_I";
-  case ARM64ISD::SQSHLU_I:          return "ARM64ISD::SQSHLU_I";
-  case ARM64ISD::WrapperLarge:      return "ARM64ISD::WrapperLarge";
-  }
-}
-
-static void getExclusiveOperation(unsigned Size, AtomicOrdering Ord,
-                                  unsigned &LdrOpc, unsigned &StrOpc) {
-  static unsigned LoadBares[] = { ARM64::LDXRB, ARM64::LDXRH, ARM64::LDXRW,
-                                  ARM64::LDXRX, ARM64::LDXPX };
-  static unsigned LoadAcqs[] = { ARM64::LDAXRB, ARM64::LDAXRH, ARM64::LDAXRW,
-                                 ARM64::LDAXRX, ARM64::LDAXPX };
-  static unsigned StoreBares[] = { ARM64::STXRB, ARM64::STXRH, ARM64::STXRW,
-                                   ARM64::STXRX, ARM64::STXPX };
-  static unsigned StoreRels[] = { ARM64::STLXRB, ARM64::STLXRH, ARM64::STLXRW,
-                                  ARM64::STLXRX, ARM64::STLXPX };
-
-  unsigned *LoadOps, *StoreOps;
-  if (Ord == Acquire || Ord == AcquireRelease || Ord == SequentiallyConsistent)
-    LoadOps = LoadAcqs;
-  else
-    LoadOps = LoadBares;
-
-  if (Ord == Release || Ord == AcquireRelease || Ord == SequentiallyConsistent)
-    StoreOps = StoreRels;
-  else
-    StoreOps = StoreBares;
-
-  assert(isPowerOf2_32(Size) && Size <= 16 &&
-         "unsupported size for atomic binary op!");
-
-  LdrOpc = LoadOps[Log2_32(Size)];
-  StrOpc = StoreOps[Log2_32(Size)];
-}
-
-MachineBasicBlock *ARM64TargetLowering::EmitAtomicCmpSwap(MachineInstr *MI,
-                                                          MachineBasicBlock *BB,
-                                                          unsigned Size) const {
-  unsigned dest = MI->getOperand(0).getReg();
-  unsigned ptr = MI->getOperand(1).getReg();
-  unsigned oldval = MI->getOperand(2).getReg();
-  unsigned newval = MI->getOperand(3).getReg();
-  AtomicOrdering Ord = static_cast<AtomicOrdering>(MI->getOperand(4).getImm());
-  unsigned scratch = BB->getParent()->getRegInfo().createVirtualRegister(
-      &ARM64::GPR32RegClass);
-  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
-  DebugLoc dl = MI->getDebugLoc();
-
-  // FIXME: We currently always generate a seq_cst operation; we should
-  // be able to relax this in some cases.
-  unsigned ldrOpc, strOpc;
-  getExclusiveOperation(Size, Ord, ldrOpc, strOpc);
-
-  MachineFunction *MF = BB->getParent();
-  const BasicBlock *LLVM_BB = BB->getBasicBlock();
-  MachineFunction::iterator It = BB;
-  ++It; // insert the new blocks after the current block
-
-  MachineBasicBlock *loop1MBB = MF->CreateMachineBasicBlock(LLVM_BB);
-  MachineBasicBlock *loop2MBB = MF->CreateMachineBasicBlock(LLVM_BB);
-  MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
-  MF->insert(It, loop1MBB);
-  MF->insert(It, loop2MBB);
-  MF->insert(It, exitMBB);
-
-  // Transfer the remainder of BB and its successor edges to exitMBB.
-  exitMBB->splice(exitMBB->begin(), BB,
-                  std::next(MachineBasicBlock::iterator(MI)), BB->end());
-  exitMBB->transferSuccessorsAndUpdatePHIs(BB);
-
-  //  thisMBB:
-  //   ...
-  //   fallthrough --> loop1MBB
-  BB->addSuccessor(loop1MBB);
-
-  // loop1MBB:
-  //   ldrex dest, [ptr]
-  //   cmp dest, oldval
-  //   bne exitMBB
-  BB = loop1MBB;
-  BuildMI(BB, dl, TII->get(ldrOpc), dest).addReg(ptr);
-  BuildMI(BB, dl, TII->get(Size == 8 ? ARM64::SUBSXrr : ARM64::SUBSWrr))
-      .addReg(Size == 8 ? ARM64::XZR : ARM64::WZR, RegState::Define)
-      .addReg(dest)
-      .addReg(oldval);
-  BuildMI(BB, dl, TII->get(ARM64::Bcc)).addImm(ARM64CC::NE).addMBB(exitMBB);
-  BB->addSuccessor(loop2MBB);
-  BB->addSuccessor(exitMBB);
-
-  // loop2MBB:
-  //   strex scratch, newval, [ptr]
-  //   cmp scratch, #0
-  //   bne loop1MBB
-  BB = loop2MBB;
-  BuildMI(BB, dl, TII->get(strOpc), scratch).addReg(newval).addReg(ptr);
-  BuildMI(BB, dl, TII->get(ARM64::CBNZW)).addReg(scratch).addMBB(loop1MBB);
-  BB->addSuccessor(loop1MBB);
-  BB->addSuccessor(exitMBB);
-
-  //  exitMBB:
-  //   ...
-  BB = exitMBB;
-
-  MI->eraseFromParent(); // The instruction is gone now.
-
-  return BB;
-}
-
-MachineBasicBlock *
-ARM64TargetLowering::EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
-                                      unsigned Size, unsigned BinOpcode) const {
-  // This also handles ATOMIC_SWAP, indicated by BinOpcode==0.
-  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
-
-  const BasicBlock *LLVM_BB = BB->getBasicBlock();
-  MachineFunction *MF = BB->getParent();
-  MachineFunction::iterator It = BB;
-  ++It;
-
-  unsigned dest = MI->getOperand(0).getReg();
-  unsigned ptr = MI->getOperand(1).getReg();
-  unsigned incr = MI->getOperand(2).getReg();
-  AtomicOrdering Ord = static_cast<AtomicOrdering>(MI->getOperand(3).getImm());
-  DebugLoc dl = MI->getDebugLoc();
-
-  unsigned ldrOpc, strOpc;
-  getExclusiveOperation(Size, Ord, ldrOpc, strOpc);
-
-  MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
-  MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
-  MF->insert(It, loopMBB);
-  MF->insert(It, exitMBB);
-
-  // Transfer the remainder of BB and its successor edges to exitMBB.
-  exitMBB->splice(exitMBB->begin(), BB,
-                  std::next(MachineBasicBlock::iterator(MI)), BB->end());
-  exitMBB->transferSuccessorsAndUpdatePHIs(BB);
-
-  MachineRegisterInfo &RegInfo = MF->getRegInfo();
-  unsigned scratch = RegInfo.createVirtualRegister(&ARM64::GPR32RegClass);
-  unsigned scratch2 =
-      (!BinOpcode)
-          ? incr
-          : RegInfo.createVirtualRegister(Size == 8 ? &ARM64::GPR64RegClass
-                                                    : &ARM64::GPR32RegClass);
-
-  //  thisMBB:
-  //   ...
-  //   fallthrough --> loopMBB
-  BB->addSuccessor(loopMBB);
-
-  //  loopMBB:
-  //   ldxr dest, ptr
-  //   <binop> scratch2, dest, incr
-  //   stxr scratch, scratch2, ptr
-  //   cbnz scratch, loopMBB
-  //   fallthrough --> exitMBB
-  BB = loopMBB;
-  BuildMI(BB, dl, TII->get(ldrOpc), dest).addReg(ptr);
-  if (BinOpcode) {
-    // operand order needs to go the other way for NAND
-    if (BinOpcode == ARM64::BICWrr || BinOpcode == ARM64::BICXrr)
-      BuildMI(BB, dl, TII->get(BinOpcode), scratch2).addReg(incr).addReg(dest);
-    else
-      BuildMI(BB, dl, TII->get(BinOpcode), scratch2).addReg(dest).addReg(incr);
-  }
-
-  BuildMI(BB, dl, TII->get(strOpc), scratch).addReg(scratch2).addReg(ptr);
-  BuildMI(BB, dl, TII->get(ARM64::CBNZW)).addReg(scratch).addMBB(loopMBB);
-
-  BB->addSuccessor(loopMBB);
-  BB->addSuccessor(exitMBB);
-
-  //  exitMBB:
-  //   ...
-  BB = exitMBB;
-
-  MI->eraseFromParent(); // The instruction is gone now.
-
-  return BB;
-}
-
-MachineBasicBlock *ARM64TargetLowering::EmitAtomicBinary128(
-    MachineInstr *MI, MachineBasicBlock *BB, unsigned BinOpcodeLo,
-    unsigned BinOpcodeHi) const {
-  // This also handles ATOMIC_SWAP, indicated by BinOpcode==0.
-  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
-
-  const BasicBlock *LLVM_BB = BB->getBasicBlock();
-  MachineFunction *MF = BB->getParent();
-  MachineFunction::iterator It = BB;
-  ++It;
-
-  unsigned DestLo = MI->getOperand(0).getReg();
-  unsigned DestHi = MI->getOperand(1).getReg();
-  unsigned Ptr = MI->getOperand(2).getReg();
-  unsigned IncrLo = MI->getOperand(3).getReg();
-  unsigned IncrHi = MI->getOperand(4).getReg();
-  AtomicOrdering Ord = static_cast<AtomicOrdering>(MI->getOperand(5).getImm());
-  DebugLoc DL = MI->getDebugLoc();
-
-  unsigned LdrOpc, StrOpc;
-  getExclusiveOperation(16, Ord, LdrOpc, StrOpc);
-
-  MachineBasicBlock *LoopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
-  MachineBasicBlock *ExitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
-  MF->insert(It, LoopMBB);
-  MF->insert(It, ExitMBB);
-
-  // Transfer the remainder of BB and its successor edges to exitMBB.
-  ExitMBB->splice(ExitMBB->begin(), BB,
-                  std::next(MachineBasicBlock::iterator(MI)), BB->end());
-  ExitMBB->transferSuccessorsAndUpdatePHIs(BB);
-
-  MachineRegisterInfo &RegInfo = MF->getRegInfo();
-  unsigned ScratchRes = RegInfo.createVirtualRegister(&ARM64::GPR32RegClass);
-  unsigned ScratchLo = IncrLo, ScratchHi = IncrHi;
-  if (BinOpcodeLo) {
-    assert(BinOpcodeHi && "Expect neither or both opcodes to be defined");
-    ScratchLo = RegInfo.createVirtualRegister(&ARM64::GPR64RegClass);
-    ScratchHi = RegInfo.createVirtualRegister(&ARM64::GPR64RegClass);
-  }
-
-  //  ThisMBB:
-  //   ...
-  //   fallthrough --> LoopMBB
-  BB->addSuccessor(LoopMBB);
-
-  //  LoopMBB:
-  //   ldxp DestLo, DestHi, Ptr
-  //   <binoplo> ScratchLo, DestLo, IncrLo
-  //   <binophi> ScratchHi, DestHi, IncrHi
-  //   stxp ScratchRes, ScratchLo, ScratchHi, ptr
-  //   cbnz ScratchRes, LoopMBB
-  //   fallthrough --> ExitMBB
-  BB = LoopMBB;
-  BuildMI(BB, DL, TII->get(LdrOpc), DestLo)
-      .addReg(DestHi, RegState::Define)
-      .addReg(Ptr);
-  if (BinOpcodeLo) {
-    // operand order needs to go the other way for NAND
-    if (BinOpcodeLo == ARM64::BICXrr) {
-      std::swap(IncrLo, DestLo);
-      std::swap(IncrHi, DestHi);
-    }
-
-    BuildMI(BB, DL, TII->get(BinOpcodeLo), ScratchLo).addReg(DestLo).addReg(
-        IncrLo);
-    BuildMI(BB, DL, TII->get(BinOpcodeHi), ScratchHi).addReg(DestHi).addReg(
-        IncrHi);
-  }
-
-  BuildMI(BB, DL, TII->get(StrOpc), ScratchRes)
-      .addReg(ScratchLo)
-      .addReg(ScratchHi)
-      .addReg(Ptr);
-  BuildMI(BB, DL, TII->get(ARM64::CBNZW)).addReg(ScratchRes).addMBB(LoopMBB);
-
-  BB->addSuccessor(LoopMBB);
-  BB->addSuccessor(ExitMBB);
-
-  //  ExitMBB:
-  //   ...
-  BB = ExitMBB;
-
-  MI->eraseFromParent(); // The instruction is gone now.
-
-  return BB;
-}
-
-MachineBasicBlock *
-ARM64TargetLowering::EmitAtomicCmpSwap128(MachineInstr *MI,
-                                          MachineBasicBlock *BB) const {
-  unsigned DestLo = MI->getOperand(0).getReg();
-  unsigned DestHi = MI->getOperand(1).getReg();
-  unsigned Ptr = MI->getOperand(2).getReg();
-  unsigned OldValLo = MI->getOperand(3).getReg();
-  unsigned OldValHi = MI->getOperand(4).getReg();
-  unsigned NewValLo = MI->getOperand(5).getReg();
-  unsigned NewValHi = MI->getOperand(6).getReg();
-  AtomicOrdering Ord = static_cast<AtomicOrdering>(MI->getOperand(7).getImm());
-  unsigned ScratchRes = BB->getParent()->getRegInfo().createVirtualRegister(
-      &ARM64::GPR32RegClass);
-  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
-  DebugLoc DL = MI->getDebugLoc();
-
-  unsigned LdrOpc, StrOpc;
-  getExclusiveOperation(16, Ord, LdrOpc, StrOpc);
-
-  MachineFunction *MF = BB->getParent();
-  const BasicBlock *LLVM_BB = BB->getBasicBlock();
-  MachineFunction::iterator It = BB;
-  ++It; // insert the new blocks after the current block
-
-  MachineBasicBlock *Loop1MBB = MF->CreateMachineBasicBlock(LLVM_BB);
-  MachineBasicBlock *Loop2MBB = MF->CreateMachineBasicBlock(LLVM_BB);
-  MachineBasicBlock *ExitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
-  MF->insert(It, Loop1MBB);
-  MF->insert(It, Loop2MBB);
-  MF->insert(It, ExitMBB);
-
-  // Transfer the remainder of BB and its successor edges to exitMBB.
-  ExitMBB->splice(ExitMBB->begin(), BB,
-                  std::next(MachineBasicBlock::iterator(MI)), BB->end());
-  ExitMBB->transferSuccessorsAndUpdatePHIs(BB);
-
-  //  ThisMBB:
-  //   ...
-  //   fallthrough --> Loop1MBB
-  BB->addSuccessor(Loop1MBB);
-
-  // Loop1MBB:
-  //   ldxp DestLo, DestHi, [Ptr]
-  //   cmp DestLo, OldValLo
-  //   sbc xzr, DestHi, OldValHi
-  //   bne ExitMBB
-  BB = Loop1MBB;
-  BuildMI(BB, DL, TII->get(LdrOpc), DestLo)
-      .addReg(DestHi, RegState::Define)
-      .addReg(Ptr);
-  BuildMI(BB, DL, TII->get(ARM64::SUBSXrr), ARM64::XZR).addReg(DestLo).addReg(
-      OldValLo);
-  BuildMI(BB, DL, TII->get(ARM64::SBCXr), ARM64::XZR).addReg(DestHi).addReg(
-      OldValHi);
-
-  BuildMI(BB, DL, TII->get(ARM64::Bcc)).addImm(ARM64CC::NE).addMBB(ExitMBB);
-  BB->addSuccessor(Loop2MBB);
-  BB->addSuccessor(ExitMBB);
-
-  // Loop2MBB:
-  //   stxp ScratchRes, NewValLo, NewValHi, [Ptr]
-  //   cbnz ScratchRes, Loop1MBB
-  BB = Loop2MBB;
-  BuildMI(BB, DL, TII->get(StrOpc), ScratchRes)
-      .addReg(NewValLo)
-      .addReg(NewValHi)
-      .addReg(Ptr);
-  BuildMI(BB, DL, TII->get(ARM64::CBNZW)).addReg(ScratchRes).addMBB(Loop1MBB);
-  BB->addSuccessor(Loop1MBB);
-  BB->addSuccessor(ExitMBB);
-
-  //  ExitMBB:
-  //   ...
-  BB = ExitMBB;
-
-  MI->eraseFromParent(); // The instruction is gone now.
-
-  return BB;
-}
-
-MachineBasicBlock *ARM64TargetLowering::EmitAtomicMinMax128(
-    MachineInstr *MI, MachineBasicBlock *BB, unsigned CondCode) const {
-  // This also handles ATOMIC_SWAP, indicated by BinOpcode==0.
-  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
-
-  const BasicBlock *LLVM_BB = BB->getBasicBlock();
-  MachineFunction *MF = BB->getParent();
-  MachineFunction::iterator It = BB;
-  ++It;
-
-  unsigned DestLo = MI->getOperand(0).getReg();
-  unsigned DestHi = MI->getOperand(1).getReg();
-  unsigned Ptr = MI->getOperand(2).getReg();
-  unsigned IncrLo = MI->getOperand(3).getReg();
-  unsigned IncrHi = MI->getOperand(4).getReg();
-  AtomicOrdering Ord = static_cast<AtomicOrdering>(MI->getOperand(5).getImm());
-  DebugLoc DL = MI->getDebugLoc();
-
-  unsigned LdrOpc, StrOpc;
-  getExclusiveOperation(16, Ord, LdrOpc, StrOpc);
-
-  MachineBasicBlock *LoopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
-  MachineBasicBlock *ExitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
-  MF->insert(It, LoopMBB);
-  MF->insert(It, ExitMBB);
-
-  // Transfer the remainder of BB and its successor edges to exitMBB.
-  ExitMBB->splice(ExitMBB->begin(), BB,
-                  std::next(MachineBasicBlock::iterator(MI)), BB->end());
-  ExitMBB->transferSuccessorsAndUpdatePHIs(BB);
-
-  MachineRegisterInfo &RegInfo = MF->getRegInfo();
-  unsigned ScratchRes = RegInfo.createVirtualRegister(&ARM64::GPR32RegClass);
-  unsigned ScratchLo = RegInfo.createVirtualRegister(&ARM64::GPR64RegClass);
-  unsigned ScratchHi = RegInfo.createVirtualRegister(&ARM64::GPR64RegClass);
-
-  //  ThisMBB:
-  //   ...
-  //   fallthrough --> LoopMBB
-  BB->addSuccessor(LoopMBB);
-
-  //  LoopMBB:
-  //   ldxp DestLo, DestHi, Ptr
-  //   cmp ScratchLo, DestLo, IncrLo
-  //   sbc xzr, ScratchHi, DestHi, IncrHi
-  //   csel ScratchLo, DestLo, IncrLo, <cmp-op>
-  //   csel ScratchHi, DestHi, IncrHi, <cmp-op>
-  //   stxp ScratchRes, ScratchLo, ScratchHi, ptr
-  //   cbnz ScratchRes, LoopMBB
-  //   fallthrough --> ExitMBB
-  BB = LoopMBB;
-  BuildMI(BB, DL, TII->get(LdrOpc), DestLo)
-      .addReg(DestHi, RegState::Define)
-      .addReg(Ptr);
-
-  BuildMI(BB, DL, TII->get(ARM64::SUBSXrr), ARM64::XZR).addReg(DestLo).addReg(
-      IncrLo);
-  BuildMI(BB, DL, TII->get(ARM64::SBCXr), ARM64::XZR).addReg(DestHi).addReg(
-      IncrHi);
-
-  BuildMI(BB, DL, TII->get(ARM64::CSELXr), ScratchLo)
-      .addReg(DestLo)
-      .addReg(IncrLo)
-      .addImm(CondCode);
-  BuildMI(BB, DL, TII->get(ARM64::CSELXr), ScratchHi)
-      .addReg(DestHi)
-      .addReg(IncrHi)
-      .addImm(CondCode);
-
-  BuildMI(BB, DL, TII->get(StrOpc), ScratchRes)
-      .addReg(ScratchLo)
-      .addReg(ScratchHi)
-      .addReg(Ptr);
-  BuildMI(BB, DL, TII->get(ARM64::CBNZW)).addReg(ScratchRes).addMBB(LoopMBB);
-
-  BB->addSuccessor(LoopMBB);
-  BB->addSuccessor(ExitMBB);
-
-  //  ExitMBB:
-  //   ...
-  BB = ExitMBB;
-
-  MI->eraseFromParent(); // The instruction is gone now.
-
-  return BB;
-}
-
-MachineBasicBlock *
-ARM64TargetLowering::EmitF128CSEL(MachineInstr *MI,
-                                  MachineBasicBlock *MBB) const {
-  // We materialise the F128CSEL pseudo-instruction as some control flow and a
-  // phi node:
-
-  // OrigBB:
-  //     [... previous instrs leading to comparison ...]
-  //     b.ne TrueBB
-  //     b EndBB
-  // TrueBB:
-  //     ; Fallthrough
-  // EndBB:
-  //     Dest = PHI [IfTrue, TrueBB], [IfFalse, OrigBB]
-
-  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
-  MachineFunction *MF = MBB->getParent();
-  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
-  DebugLoc DL = MI->getDebugLoc();
-  MachineFunction::iterator It = MBB;
-  ++It;
-
-  unsigned DestReg = MI->getOperand(0).getReg();
-  unsigned IfTrueReg = MI->getOperand(1).getReg();
-  unsigned IfFalseReg = MI->getOperand(2).getReg();
-  unsigned CondCode = MI->getOperand(3).getImm();
-  bool CPSRKilled = MI->getOperand(4).isKill();
-
-  MachineBasicBlock *TrueBB = MF->CreateMachineBasicBlock(LLVM_BB);
-  MachineBasicBlock *EndBB = MF->CreateMachineBasicBlock(LLVM_BB);
-  MF->insert(It, TrueBB);
-  MF->insert(It, EndBB);
-
-  // Transfer rest of current basic-block to EndBB
-  EndBB->splice(EndBB->begin(), MBB, std::next(MachineBasicBlock::iterator(MI)),
-                MBB->end());
-  EndBB->transferSuccessorsAndUpdatePHIs(MBB);
-
-  BuildMI(MBB, DL, TII->get(ARM64::Bcc)).addImm(CondCode).addMBB(TrueBB);
-  BuildMI(MBB, DL, TII->get(ARM64::B)).addMBB(EndBB);
-  MBB->addSuccessor(TrueBB);
-  MBB->addSuccessor(EndBB);
-
-  // TrueBB falls through to the end.
-  TrueBB->addSuccessor(EndBB);
-
-  if (!CPSRKilled) {
-    TrueBB->addLiveIn(ARM64::CPSR);
-    EndBB->addLiveIn(ARM64::CPSR);
-  }
-
-  BuildMI(*EndBB, EndBB->begin(), DL, TII->get(ARM64::PHI), DestReg)
-      .addReg(IfTrueReg)
-      .addMBB(TrueBB)
-      .addReg(IfFalseReg)
-      .addMBB(MBB);
-
-  MI->eraseFromParent();
-  return EndBB;
-}
-
-MachineBasicBlock *
-ARM64TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
-                                                 MachineBasicBlock *BB) const {
-  switch (MI->getOpcode()) {
-  default:
-#ifndef NDEBUG
-    MI->dump();
-#endif
-    assert(0 && "Unexpected instruction for custom inserter!");
-    break;
-
-  case ARM64::ATOMIC_LOAD_ADD_I8:
-    return EmitAtomicBinary(MI, BB, 1, ARM64::ADDWrr);
-  case ARM64::ATOMIC_LOAD_ADD_I16:
-    return EmitAtomicBinary(MI, BB, 2, ARM64::ADDWrr);
-  case ARM64::ATOMIC_LOAD_ADD_I32:
-    return EmitAtomicBinary(MI, BB, 4, ARM64::ADDWrr);
-  case ARM64::ATOMIC_LOAD_ADD_I64:
-    return EmitAtomicBinary(MI, BB, 8, ARM64::ADDXrr);
-  case ARM64::ATOMIC_LOAD_ADD_I128:
-    return EmitAtomicBinary128(MI, BB, ARM64::ADDSXrr, ARM64::ADCXr);
-
-  case ARM64::ATOMIC_LOAD_AND_I8:
-    return EmitAtomicBinary(MI, BB, 1, ARM64::ANDWrr);
-  case ARM64::ATOMIC_LOAD_AND_I16:
-    return EmitAtomicBinary(MI, BB, 2, ARM64::ANDWrr);
-  case ARM64::ATOMIC_LOAD_AND_I32:
-    return EmitAtomicBinary(MI, BB, 4, ARM64::ANDWrr);
-  case ARM64::ATOMIC_LOAD_AND_I64:
-    return EmitAtomicBinary(MI, BB, 8, ARM64::ANDXrr);
-  case ARM64::ATOMIC_LOAD_AND_I128:
-    return EmitAtomicBinary128(MI, BB, ARM64::ANDXrr, ARM64::ANDXrr);
-
-  case ARM64::ATOMIC_LOAD_OR_I8:
-    return EmitAtomicBinary(MI, BB, 1, ARM64::ORRWrr);
-  case ARM64::ATOMIC_LOAD_OR_I16:
-    return EmitAtomicBinary(MI, BB, 2, ARM64::ORRWrr);
-  case ARM64::ATOMIC_LOAD_OR_I32:
-    return EmitAtomicBinary(MI, BB, 4, ARM64::ORRWrr);
-  case ARM64::ATOMIC_LOAD_OR_I64:
-    return EmitAtomicBinary(MI, BB, 8, ARM64::ORRXrr);
-  case ARM64::ATOMIC_LOAD_OR_I128:
-    return EmitAtomicBinary128(MI, BB, ARM64::ORRXrr, ARM64::ORRXrr);
-
-  case ARM64::ATOMIC_LOAD_XOR_I8:
-    return EmitAtomicBinary(MI, BB, 1, ARM64::EORWrr);
-  case ARM64::ATOMIC_LOAD_XOR_I16:
-    return EmitAtomicBinary(MI, BB, 2, ARM64::EORWrr);
-  case ARM64::ATOMIC_LOAD_XOR_I32:
-    return EmitAtomicBinary(MI, BB, 4, ARM64::EORWrr);
-  case ARM64::ATOMIC_LOAD_XOR_I64:
-    return EmitAtomicBinary(MI, BB, 8, ARM64::EORXrr);
-  case ARM64::ATOMIC_LOAD_XOR_I128:
-    return EmitAtomicBinary128(MI, BB, ARM64::EORXrr, ARM64::EORXrr);
-
-  case ARM64::ATOMIC_LOAD_NAND_I8:
-    return EmitAtomicBinary(MI, BB, 1, ARM64::BICWrr);
-  case ARM64::ATOMIC_LOAD_NAND_I16:
-    return EmitAtomicBinary(MI, BB, 2, ARM64::BICWrr);
-  case ARM64::ATOMIC_LOAD_NAND_I32:
-    return EmitAtomicBinary(MI, BB, 4, ARM64::BICWrr);
-  case ARM64::ATOMIC_LOAD_NAND_I64:
-    return EmitAtomicBinary(MI, BB, 8, ARM64::BICXrr);
-  case ARM64::ATOMIC_LOAD_NAND_I128:
-    return EmitAtomicBinary128(MI, BB, ARM64::BICXrr, ARM64::BICXrr);
-
-  case ARM64::ATOMIC_LOAD_SUB_I8:
-    return EmitAtomicBinary(MI, BB, 1, ARM64::SUBWrr);
-  case ARM64::ATOMIC_LOAD_SUB_I16:
-    return EmitAtomicBinary(MI, BB, 2, ARM64::SUBWrr);
-  case ARM64::ATOMIC_LOAD_SUB_I32:
-    return EmitAtomicBinary(MI, BB, 4, ARM64::SUBWrr);
-  case ARM64::ATOMIC_LOAD_SUB_I64:
-    return EmitAtomicBinary(MI, BB, 8, ARM64::SUBXrr);
-  case ARM64::ATOMIC_LOAD_SUB_I128:
-    return EmitAtomicBinary128(MI, BB, ARM64::SUBSXrr, ARM64::SBCXr);
-
-  case ARM64::ATOMIC_LOAD_MIN_I128:
-    return EmitAtomicMinMax128(MI, BB, ARM64CC::LT);
-
-  case ARM64::ATOMIC_LOAD_MAX_I128:
-    return EmitAtomicMinMax128(MI, BB, ARM64CC::GT);
-
-  case ARM64::ATOMIC_LOAD_UMIN_I128:
-    return EmitAtomicMinMax128(MI, BB, ARM64CC::CC);
-
-  case ARM64::ATOMIC_LOAD_UMAX_I128:
-    return EmitAtomicMinMax128(MI, BB, ARM64CC::HI);
-
-  case ARM64::ATOMIC_SWAP_I8:
-    return EmitAtomicBinary(MI, BB, 1, 0);
-  case ARM64::ATOMIC_SWAP_I16:
-    return EmitAtomicBinary(MI, BB, 2, 0);
-  case ARM64::ATOMIC_SWAP_I32:
-    return EmitAtomicBinary(MI, BB, 4, 0);
-  case ARM64::ATOMIC_SWAP_I64:
-    return EmitAtomicBinary(MI, BB, 8, 0);
-  case ARM64::ATOMIC_SWAP_I128:
-    return EmitAtomicBinary128(MI, BB, 0, 0);
-
-  case ARM64::ATOMIC_CMP_SWAP_I8:
-    return EmitAtomicCmpSwap(MI, BB, 1);
-  case ARM64::ATOMIC_CMP_SWAP_I16:
-    return EmitAtomicCmpSwap(MI, BB, 2);
-  case ARM64::ATOMIC_CMP_SWAP_I32:
-    return EmitAtomicCmpSwap(MI, BB, 4);
-  case ARM64::ATOMIC_CMP_SWAP_I64:
-    return EmitAtomicCmpSwap(MI, BB, 8);
-  case ARM64::ATOMIC_CMP_SWAP_I128:
-    return EmitAtomicCmpSwap128(MI, BB);
-
-  case ARM64::F128CSEL:
-    return EmitF128CSEL(MI, BB);
-
-  case TargetOpcode::STACKMAP:
-  case TargetOpcode::PATCHPOINT:
-    return emitPatchPoint(MI, BB);
-  }
-  llvm_unreachable("Unexpected instruction for custom inserter!");
-}
-
-//===----------------------------------------------------------------------===//
-// ARM64 Lowering private implementation.
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// Lowering Code
-//===----------------------------------------------------------------------===//
-
-/// changeIntCCToARM64CC - Convert a DAG integer condition code to an ARM64 CC
-static ARM64CC::CondCode changeIntCCToARM64CC(ISD::CondCode CC) {
-  switch (CC) {
-  default:
-    llvm_unreachable("Unknown condition code!");
-  case ISD::SETNE:
-    return ARM64CC::NE;
-  case ISD::SETEQ:
-    return ARM64CC::EQ;
-  case ISD::SETGT:
-    return ARM64CC::GT;
-  case ISD::SETGE:
-    return ARM64CC::GE;
-  case ISD::SETLT:
-    return ARM64CC::LT;
-  case ISD::SETLE:
-    return ARM64CC::LE;
-  case ISD::SETUGT:
-    return ARM64CC::HI;
-  case ISD::SETUGE:
-    return ARM64CC::CS;
-  case ISD::SETULT:
-    return ARM64CC::CC;
-  case ISD::SETULE:
-    return ARM64CC::LS;
-  }
-}
-
-/// changeFPCCToARM64CC - Convert a DAG fp condition code to an ARM64 CC.
-static void changeFPCCToARM64CC(ISD::CondCode CC, ARM64CC::CondCode &CondCode,
-                                ARM64CC::CondCode &CondCode2) {
-  CondCode2 = ARM64CC::AL;
-  switch (CC) {
-  default:
-    llvm_unreachable("Unknown FP condition!");
-  case ISD::SETEQ:
-  case ISD::SETOEQ:
-    CondCode = ARM64CC::EQ;
-    break;
-  case ISD::SETGT:
-  case ISD::SETOGT:
-    CondCode = ARM64CC::GT;
-    break;
-  case ISD::SETGE:
-  case ISD::SETOGE:
-    CondCode = ARM64CC::GE;
-    break;
-  case ISD::SETOLT:
-    CondCode = ARM64CC::MI;
-    break;
-  case ISD::SETOLE:
-    CondCode = ARM64CC::LS;
-    break;
-  case ISD::SETONE:
-    CondCode = ARM64CC::MI;
-    CondCode2 = ARM64CC::GT;
-    break;
-  case ISD::SETO:
-    CondCode = ARM64CC::VC;
-    break;
-  case ISD::SETUO:
-    CondCode = ARM64CC::VS;
-    break;
-  case ISD::SETUEQ:
-    CondCode = ARM64CC::EQ;
-    CondCode2 = ARM64CC::VS;
-    break;
-  case ISD::SETUGT:
-    CondCode = ARM64CC::HI;
-    break;
-  case ISD::SETUGE:
-    CondCode = ARM64CC::PL;
-    break;
-  case ISD::SETLT:
-  case ISD::SETULT:
-    CondCode = ARM64CC::LT;
-    break;
-  case ISD::SETLE:
-  case ISD::SETULE:
-    CondCode = ARM64CC::LE;
-    break;
-  case ISD::SETNE:
-  case ISD::SETUNE:
-    CondCode = ARM64CC::NE;
-    break;
-  }
-}
-
-static bool isLegalArithImmed(uint64_t C) {
-  // Matches ARM64DAGToDAGISel::SelectArithImmed().
-  return (C >> 12 == 0) || ((C & 0xFFFULL) == 0 && C >> 24 == 0);
-}
-
-static SDValue emitComparison(SDValue LHS, SDValue RHS, SDLoc dl,
-                              SelectionDAG &DAG) {
-  EVT VT = LHS.getValueType();
-
-  if (VT.isFloatingPoint())
-    return DAG.getNode(ARM64ISD::FCMP, dl, VT, LHS, RHS);
-
-  // The CMP instruction is just an alias for SUBS, and representing it as
-  // SUBS means that it's possible to get CSE with subtract operations.
-  // A later phase can perform the optimization of setting the destination
-  // register to WZR/XZR if it ends up being unused.
-  return DAG.getNode(ARM64ISD::SUBS, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS)
-      .getValue(1);
-}
-
-static SDValue getARM64Cmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
-                           SDValue &ARM64cc, SelectionDAG &DAG, SDLoc dl) {
-  if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
-    EVT VT = RHS.getValueType();
-    uint64_t C = RHSC->getZExtValue();
-    if (!isLegalArithImmed(C)) {
-      // Constant does not fit, try adjusting it by one?
-      switch (CC) {
-      default:
-        break;
-      case ISD::SETLT:
-      case ISD::SETGE:
-        if ((VT == MVT::i32 && C != 0x80000000 &&
-             isLegalArithImmed((uint32_t)(C - 1))) ||
-            (VT == MVT::i64 && C != 0x80000000ULL &&
-             isLegalArithImmed(C - 1ULL))) {
-          CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
-          C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
-          RHS = DAG.getConstant(C, VT);
-        }
-        break;
-      case ISD::SETULT:
-      case ISD::SETUGE:
-        if ((VT == MVT::i32 && C != 0 &&
-             isLegalArithImmed((uint32_t)(C - 1))) ||
-            (VT == MVT::i64 && C != 0ULL && isLegalArithImmed(C - 1ULL))) {
-          CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
-          C = (VT == MVT::i32) ? (uint32_t)(C - 1) : C - 1;
-          RHS = DAG.getConstant(C, VT);
-        }
-        break;
-      case ISD::SETLE:
-      case ISD::SETGT:
-        if ((VT == MVT::i32 && C != 0x7fffffff &&
-             isLegalArithImmed((uint32_t)(C + 1))) ||
-            (VT == MVT::i64 && C != 0x7ffffffffffffffULL &&
-             isLegalArithImmed(C + 1ULL))) {
-          CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
-          C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
-          RHS = DAG.getConstant(C, VT);
-        }
-        break;
-      case ISD::SETULE:
-      case ISD::SETUGT:
-        if ((VT == MVT::i32 && C != 0xffffffff &&
-             isLegalArithImmed((uint32_t)(C + 1))) ||
-            (VT == MVT::i64 && C != 0xfffffffffffffffULL &&
-             isLegalArithImmed(C + 1ULL))) {
-          CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
-          C = (VT == MVT::i32) ? (uint32_t)(C + 1) : C + 1;
-          RHS = DAG.getConstant(C, VT);
-        }
-        break;
-      }
-    }
-  }
-
-  SDValue Cmp = emitComparison(LHS, RHS, dl, DAG);
-  ARM64CC::CondCode ARM64CC = changeIntCCToARM64CC(CC);
-  ARM64cc = DAG.getConstant(ARM64CC, MVT::i32);
-  return Cmp;
-}
-
-static std::pair<SDValue, SDValue>
-getARM64XALUOOp(ARM64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) {
-  assert((Op.getValueType() == MVT::i32 || Op.getValueType() == MVT::i64) &&
-         "Unsupported value type");
-  SDValue Value, Overflow;
-  SDLoc DL(Op);
-  SDValue LHS = Op.getOperand(0);
-  SDValue RHS = Op.getOperand(1);
-  unsigned Opc = 0;
-  switch (Op.getOpcode()) {
-  default:
-    llvm_unreachable("Unknown overflow instruction!");
-  case ISD::SADDO:
-    Opc = ARM64ISD::ADDS;
-    CC = ARM64CC::VS;
-    break;
-  case ISD::UADDO:
-    Opc = ARM64ISD::ADDS;
-    CC = ARM64CC::CS;
-    break;
-  case ISD::SSUBO:
-    Opc = ARM64ISD::SUBS;
-    CC = ARM64CC::VS;
-    break;
-  case ISD::USUBO:
-    Opc = ARM64ISD::SUBS;
-    CC = ARM64CC::CC;
-    break;
-  // Multiply needs a little bit extra work.
-  case ISD::SMULO:
-  case ISD::UMULO: {
-    CC = ARM64CC::NE;
-    bool IsSigned = (Op.getOpcode() == ISD::SMULO) ? true : false;
-    if (Op.getValueType() == MVT::i32) {
-      unsigned ExtendOpc = IsSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
-      // For a 32 bit multiply with overflow check we want the instruction
-      // selector to generate a widening multiply (SMADDL/UMADDL). For that we
-      // need to generate the following pattern:
-      // (i64 add 0, (i64 mul (i64 sext|zext i32 %a), (i64 sext|zext i32 %b))
-      LHS = DAG.getNode(ExtendOpc, DL, MVT::i64, LHS);
-      RHS = DAG.getNode(ExtendOpc, DL, MVT::i64, RHS);
-      SDValue Mul = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
-      SDValue Add = DAG.getNode(ISD::ADD, DL, MVT::i64, Mul,
-                                DAG.getConstant(0, MVT::i64));
-      // On ARM64 the upper 32 bits are always zero extended for a 32 bit
-      // operation. We need to clear out the upper 32 bits, because we used a
-      // widening multiply that wrote all 64 bits. In the end this should be a
-      // noop.
-      Value = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Add);
-      if (IsSigned) {
-        // The signed overflow check requires more than just a simple check for
-        // any bit set in the upper 32 bits of the result. These bits could be
-        // just the sign bits of a negative number. To perform the overflow
-        // check we have to arithmetic shift right the 32nd bit of the result by
-        // 31 bits. Then we compare the result to the upper 32 bits.
-        SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Add,
-                                        DAG.getConstant(32, MVT::i64));
-        UpperBits = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, UpperBits);
-        SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i32, Value,
-                                        DAG.getConstant(31, MVT::i64));
-        // It is important that LowerBits is last, otherwise the arithmetic
-        // shift will not be folded into the compare (SUBS).
-        SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32);
-        Overflow = DAG.getNode(ARM64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
-                       .getValue(1);
-      } else {
-        // The overflow check for unsigned multiply is easy. We only need to
-        // check if any of the upper 32 bits are set. This can be done with a
-        // CMP (shifted register). For that we need to generate the following
-        // pattern:
-        // (i64 ARM64ISD::SUBS i64 0, (i64 srl i64 %Mul, i64 32)
-        SDValue UpperBits = DAG.getNode(ISD::SRL, DL, MVT::i64, Mul,
-                                        DAG.getConstant(32, MVT::i64));
-        SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
-        Overflow =
-            DAG.getNode(ARM64ISD::SUBS, DL, VTs, DAG.getConstant(0, MVT::i64),
-                        UpperBits).getValue(1);
-      }
-      break;
-    }
-    assert(Op.getValueType() == MVT::i64 && "Expected an i64 value type");
-    // For the 64 bit multiply
-    Value = DAG.getNode(ISD::MUL, DL, MVT::i64, LHS, RHS);
-    if (IsSigned) {
-      SDValue UpperBits = DAG.getNode(ISD::MULHS, DL, MVT::i64, LHS, RHS);
-      SDValue LowerBits = DAG.getNode(ISD::SRA, DL, MVT::i64, Value,
-                                      DAG.getConstant(63, MVT::i64));
-      // It is important that LowerBits is last, otherwise the arithmetic
-      // shift will not be folded into the compare (SUBS).
-      SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
-      Overflow = DAG.getNode(ARM64ISD::SUBS, DL, VTs, UpperBits, LowerBits)
-                     .getValue(1);
-    } else {
-      SDValue UpperBits = DAG.getNode(ISD::MULHU, DL, MVT::i64, LHS, RHS);
-      SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i32);
-      Overflow =
-          DAG.getNode(ARM64ISD::SUBS, DL, VTs, DAG.getConstant(0, MVT::i64),
-                      UpperBits).getValue(1);
-    }
-    break;
-  }
-  } // switch (...)
-
-  if (Opc) {
-    SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
-
-    // Emit the ARM64 operation with overflow check.
-    Value = DAG.getNode(Opc, DL, VTs, LHS, RHS);
-    Overflow = Value.getValue(1);
-  }
-  return std::make_pair(Value, Overflow);
-}
-
-SDValue ARM64TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG,
-                                           RTLIB::Libcall Call) const {
-  SmallVector<SDValue, 2> Ops;
-  for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i)
-    Ops.push_back(Op.getOperand(i));
-
-  return makeLibCall(DAG, Call, MVT::f128, &Ops[0], Ops.size(), false,
-                     SDLoc(Op)).first;
-}
-
-static SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) {
-  SDValue Sel = Op.getOperand(0);
-  SDValue Other = Op.getOperand(1);
-
-  // If neither operand is a SELECT_CC, give up.
-  if (Sel.getOpcode() != ISD::SELECT_CC)
-    std::swap(Sel, Other);
-  if (Sel.getOpcode() != ISD::SELECT_CC)
-    return Op;
-
-  // The folding we want to perform is:
-  // (xor x, (select_cc a, b, cc, 0, -1) )
-  //   -->
-  // (csel x, (xor x, -1), cc ...)
-  //
-  // The latter will get matched to a CSINV instruction.
-
-  ISD::CondCode CC = cast<CondCodeSDNode>(Sel.getOperand(4))->get();
-  SDValue LHS = Sel.getOperand(0);
-  SDValue RHS = Sel.getOperand(1);
-  SDValue TVal = Sel.getOperand(2);
-  SDValue FVal = Sel.getOperand(3);
-  SDLoc dl(Sel);
-
-  // FIXME: This could be generalized to non-integer comparisons.
-  if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
-    return Op;
-
-  ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
-  ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
-
-  // The the values aren't constants, this isn't the pattern we're looking for.
-  if (!CFVal || !CTVal)
-    return Op;
-
-  // We can commute the SELECT_CC by inverting the condition.  This
-  // might be needed to make this fit into a CSINV pattern.
-  if (CTVal->isAllOnesValue() && CFVal->isNullValue()) {
-    std::swap(TVal, FVal);
-    std::swap(CTVal, CFVal);
-    CC = ISD::getSetCCInverse(CC, true);
-  }
-
-  // If the constants line up, perform the transform!
-  if (CTVal->isNullValue() && CFVal->isAllOnesValue()) {
-    SDValue CCVal;
-    SDValue Cmp = getARM64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
-
-    FVal = Other;
-    TVal = DAG.getNode(ISD::XOR, dl, Other.getValueType(), Other,
-                       DAG.getConstant(-1ULL, Other.getValueType()));
-
-    return DAG.getNode(ARM64ISD::CSEL, dl, Sel.getValueType(), FVal, TVal,
-                       CCVal, Cmp);
-  }
-
-  return Op;
-}
-
-static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
-  EVT VT = Op.getValueType();
-
-  // Let legalize expand this if it isn't a legal type yet.
-  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
-    return SDValue();
-
-  SDVTList VTs = DAG.getVTList(VT, MVT::i32);
-
-  unsigned Opc;
-  bool ExtraOp = false;
-  switch (Op.getOpcode()) {
-  default:
-    assert(0 && "Invalid code");
-  case ISD::ADDC:
-    Opc = ARM64ISD::ADDS;
-    break;
-  case ISD::SUBC:
-    Opc = ARM64ISD::SUBS;
-    break;
-  case ISD::ADDE:
-    Opc = ARM64ISD::ADCS;
-    ExtraOp = true;
-    break;
-  case ISD::SUBE:
-    Opc = ARM64ISD::SBCS;
-    ExtraOp = true;
-    break;
-  }
-
-  if (!ExtraOp)
-    return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1));
-  return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0), Op.getOperand(1),
-                     Op.getOperand(2));
-}
-
-static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
-  // Let legalize expand this if it isn't a legal type yet.
-  if (!DAG.getTargetLoweringInfo().isTypeLegal(Op.getValueType()))
-    return SDValue();
-
-  ARM64CC::CondCode CC;
-  // The actual operation that sets the overflow or carry flag.
-  SDValue Value, Overflow;
-  std::tie(Value, Overflow) = getARM64XALUOOp(CC, Op, DAG);
-
-  // We use 0 and 1 as false and true values.
-  SDValue TVal = DAG.getConstant(1, MVT::i32);
-  SDValue FVal = DAG.getConstant(0, MVT::i32);
-
-  // We use an inverted condition, because the conditional select is inverted
-  // too. This will allow it to be selected to a single instruction:
-  // CSINC Wd, WZR, WZR, invert(cond).
-  SDValue CCVal = DAG.getConstant(getInvertedCondCode(CC), MVT::i32);
-  Overflow = DAG.getNode(ARM64ISD::CSEL, SDLoc(Op), MVT::i32, FVal, TVal, CCVal,
-                         Overflow);
-
-  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
-  return DAG.getNode(ISD::MERGE_VALUES, SDLoc(Op), VTs, Value, Overflow);
-}
-
-// Prefetch operands are:
-// 1: Address to prefetch
-// 2: bool isWrite
-// 3: int locality (0 = no locality ... 3 = extreme locality)
-// 4: bool isDataCache
-static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) {
-  SDLoc DL(Op);
-  unsigned IsWrite = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
-  unsigned Locality = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
-  // The data thing is not used.
-  // unsigned isData = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
-
-  bool IsStream = !Locality;
-  // When the locality number is set
-  if (Locality) {
-    // The front-end should have filtered out the out-of-range values
-    assert(Locality <= 3 && "Prefetch locality out-of-range");
-    // The locality degree is the opposite of the cache speed.
-    // Put the number the other way around.
-    // The encoding starts at 0 for level 1
-    Locality = 3 - Locality;
-  }
-
-  // built the mask value encoding the expected behavior.
-  unsigned PrfOp = (IsWrite << 4) |     // Load/Store bit
-                   (Locality << 1) |    // Cache level bits
-                   (unsigned)IsStream;  // Stream bit
-  return DAG.getNode(ARM64ISD::PREFETCH, DL, MVT::Other, Op.getOperand(0),
-                     DAG.getConstant(PrfOp, MVT::i32), Op.getOperand(1));
-}
-
-SDValue ARM64TargetLowering::LowerFP_EXTEND(SDValue Op,
-                                            SelectionDAG &DAG) const {
-  assert(Op.getValueType() == MVT::f128 && "Unexpected lowering");
-
-  RTLIB::Libcall LC;
-  LC = RTLIB::getFPEXT(Op.getOperand(0).getValueType(), Op.getValueType());
-
-  return LowerF128Call(Op, DAG, LC);
-}
-
-SDValue ARM64TargetLowering::LowerFP_ROUND(SDValue Op,
-                                           SelectionDAG &DAG) const {
-  if (Op.getOperand(0).getValueType() != MVT::f128) {
-    // It's legal except when f128 is involved
-    return Op;
-  }
-
-  RTLIB::Libcall LC;
-  LC = RTLIB::getFPROUND(Op.getOperand(0).getValueType(), Op.getValueType());
-
-  // FP_ROUND node has a second operand indicating whether it is known to be
-  // precise. That doesn't take part in the LibCall so we can't directly use
-  // LowerF128Call.
-  SDValue SrcVal = Op.getOperand(0);
-  return makeLibCall(DAG, LC, Op.getValueType(), &SrcVal, 1,
-                     /*isSigned*/ false, SDLoc(Op)).first;
-}
-
-static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG) {
-  // Warning: We maintain cost tables in ARM64TargetTransformInfo.cpp.
-  // Any additional optimization in this function should be recorded
-  // in the cost tables.
-  EVT InVT = Op.getOperand(0).getValueType();
-  EVT VT = Op.getValueType();
-
-  // FP_TO_XINT conversion from the same type are legal.
-  if (VT.getSizeInBits() == InVT.getSizeInBits())
-    return Op;
-
-  if (InVT == MVT::v2f64) {
-    SDLoc dl(Op);
-    SDValue Cv = DAG.getNode(Op.getOpcode(), dl, MVT::v2i64, Op.getOperand(0));
-    return DAG.getNode(ISD::TRUNCATE, dl, VT, Cv);
-  }
-
-  // Type changing conversions are illegal.
-  return SDValue();
-}
-
-SDValue ARM64TargetLowering::LowerFP_TO_INT(SDValue Op,
-                                            SelectionDAG &DAG) const {
-  if (Op.getOperand(0).getValueType().isVector())
-    return LowerVectorFP_TO_INT(Op, DAG);
-
-  if (Op.getOperand(0).getValueType() != MVT::f128) {
-    // It's legal except when f128 is involved
-    return Op;
-  }
-
-  RTLIB::Libcall LC;
-  if (Op.getOpcode() == ISD::FP_TO_SINT)
-    LC = RTLIB::getFPTOSINT(Op.getOperand(0).getValueType(), Op.getValueType());
-  else
-    LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), Op.getValueType());
-
-  SmallVector<SDValue, 2> Ops;
-  for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i)
-    Ops.push_back(Op.getOperand(i));
-
-  return makeLibCall(DAG, LC, Op.getValueType(), &Ops[0], Ops.size(), false,
-                     SDLoc(Op)).first;
-}
-
-static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
-  // Warning: We maintain cost tables in ARM64TargetTransformInfo.cpp.
-  // Any additional optimization in this function should be recorded
-  // in the cost tables.
-  EVT VT = Op.getValueType();
-  SDLoc dl(Op);
-  SDValue In = Op.getOperand(0);
-  EVT InVT = In.getValueType();
-
-  // v2i32 to v2f32 is legal.
-  if (VT == MVT::v2f32 && InVT == MVT::v2i32)
-    return Op;
-
-  // This function only handles v2f64 outputs.
-  if (VT == MVT::v2f64) {
-    // Extend the input argument to a v2i64 that we can feed into the
-    // floating point conversion. Zero or sign extend based on whether
-    // we're doing a signed or unsigned float conversion.
-    unsigned Opc =
-        Op.getOpcode() == ISD::UINT_TO_FP ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
-    assert(Op.getNumOperands() == 1 && "FP conversions take one argument");
-    SDValue Promoted = DAG.getNode(Opc, dl, MVT::v2i64, Op.getOperand(0));
-    return DAG.getNode(Op.getOpcode(), dl, Op.getValueType(), Promoted);
-  }
-
-  // Scalarize v2i64 to v2f32 conversions.
-  std::vector<SDValue> BuildVectorOps;
-  for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
-    SDValue Sclr = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, In,
-                               DAG.getConstant(i, MVT::i64));
-    Sclr = DAG.getNode(Op->getOpcode(), dl, MVT::f32, Sclr);
-    BuildVectorOps.push_back(Sclr);
-  }
-
-  return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &BuildVectorOps[0],
-                     BuildVectorOps.size());
-}
-
-SDValue ARM64TargetLowering::LowerINT_TO_FP(SDValue Op,
-                                            SelectionDAG &DAG) const {
-  if (Op.getValueType().isVector())
-    return LowerVectorINT_TO_FP(Op, DAG);
-
-  // i128 conversions are libcalls.
-  if (Op.getOperand(0).getValueType() == MVT::i128)
-    return SDValue();
-
-  // Other conversions are legal, unless it's to the completely software-based
-  // fp128.
-  if (Op.getValueType() != MVT::f128)
-    return Op;
-
-  RTLIB::Libcall LC;
-  if (Op.getOpcode() == ISD::SINT_TO_FP)
-    LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType());
-  else
-    LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType());
-
-  return LowerF128Call(Op, DAG, LC);
-}
-
-SDValue ARM64TargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
-  // For iOS, we want to call an alternative entry point: __sincos_stret,
-  // which returns the values in two S / D registers.
-  SDLoc dl(Op);
-  SDValue Arg = Op.getOperand(0);
-  EVT ArgVT = Arg.getValueType();
-  Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
-
-  ArgListTy Args;
-  ArgListEntry Entry;
-
-  Entry.Node = Arg;
-  Entry.Ty = ArgTy;
-  Entry.isSExt = false;
-  Entry.isZExt = false;
-  Args.push_back(Entry);
-
-  const char *LibcallName =
-      (ArgVT == MVT::f64) ? "__sincos_stret" : "__sincosf_stret";
-  SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy());
-
-  StructType *RetTy = StructType::get(ArgTy, ArgTy, NULL);
-  TargetLowering::CallLoweringInfo CLI(
-      DAG.getEntryNode(), RetTy, false, false, false, false, 0,
-      CallingConv::Fast, /*isTaillCall=*/false,
-      /*doesNotRet=*/false, /*isReturnValueUsed*/ true, Callee, Args, DAG, dl);
-  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
-  return CallResult.first;
-}
-
-SDValue ARM64TargetLowering::LowerOperation(SDValue Op,
-                                            SelectionDAG &DAG) const {
-  switch (Op.getOpcode()) {
-  default:
-    llvm_unreachable("unimplemented operand");
-    return SDValue();
-  case ISD::GlobalAddress:
-    return LowerGlobalAddress(Op, DAG);
-  case ISD::GlobalTLSAddress:
-    return LowerGlobalTLSAddress(Op, DAG);
-  case ISD::SETCC:
-    return LowerSETCC(Op, DAG);
-  case ISD::BR_CC:
-    return LowerBR_CC(Op, DAG);
-  case ISD::SELECT:
-    return LowerSELECT(Op, DAG);
-  case ISD::SELECT_CC:
-    return LowerSELECT_CC(Op, DAG);
-  case ISD::JumpTable:
-    return LowerJumpTable(Op, DAG);
-  case ISD::ConstantPool:
-    return LowerConstantPool(Op, DAG);
-  case ISD::BlockAddress:
-    return LowerBlockAddress(Op, DAG);
-  case ISD::VASTART:
-    return LowerVASTART(Op, DAG);
-  case ISD::VACOPY:
-    return LowerVACOPY(Op, DAG);
-  case ISD::VAARG:
-    return LowerVAARG(Op, DAG);
-  case ISD::ADDC:
-  case ISD::ADDE:
-  case ISD::SUBC:
-  case ISD::SUBE:
-    return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
-  case ISD::SADDO:
-  case ISD::UADDO:
-  case ISD::SSUBO:
-  case ISD::USUBO:
-  case ISD::SMULO:
-  case ISD::UMULO:
-    return LowerXALUO(Op, DAG);
-  case ISD::FADD:
-    return LowerF128Call(Op, DAG, RTLIB::ADD_F128);
-  case ISD::FSUB:
-    return LowerF128Call(Op, DAG, RTLIB::SUB_F128);
-  case ISD::FMUL:
-    return LowerF128Call(Op, DAG, RTLIB::MUL_F128);
-  case ISD::FDIV:
-    return LowerF128Call(Op, DAG, RTLIB::DIV_F128);
-  case ISD::FP_ROUND:
-    return LowerFP_ROUND(Op, DAG);
-  case ISD::FP_EXTEND:
-    return LowerFP_EXTEND(Op, DAG);
-  case ISD::FRAMEADDR:
-    return LowerFRAMEADDR(Op, DAG);
-  case ISD::RETURNADDR:
-    return LowerRETURNADDR(Op, DAG);
-  case ISD::INSERT_VECTOR_ELT:
-    return LowerINSERT_VECTOR_ELT(Op, DAG);
-  case ISD::EXTRACT_VECTOR_ELT:
-    return LowerEXTRACT_VECTOR_ELT(Op, DAG);
-  case ISD::SCALAR_TO_VECTOR:
-    return LowerSCALAR_TO_VECTOR(Op, DAG);
-  case ISD::BUILD_VECTOR:
-    return LowerBUILD_VECTOR(Op, DAG);
-  case ISD::VECTOR_SHUFFLE:
-    return LowerVECTOR_SHUFFLE(Op, DAG);
-  case ISD::EXTRACT_SUBVECTOR:
-    return LowerEXTRACT_SUBVECTOR(Op, DAG);
-  case ISD::SRA:
-  case ISD::SRL:
-  case ISD::SHL:
-    return LowerVectorSRA_SRL_SHL(Op, DAG);
-  case ISD::SHL_PARTS:
-    return LowerShiftLeftParts(Op, DAG);
-  case ISD::SRL_PARTS:
-  case ISD::SRA_PARTS:
-    return LowerShiftRightParts(Op, DAG);
-  case ISD::CTPOP:
-    return LowerCTPOP(Op, DAG);
-  case ISD::FCOPYSIGN:
-    return LowerFCOPYSIGN(Op, DAG);
-  case ISD::AND:
-    return LowerVectorAND(Op, DAG);
-  case ISD::OR:
-    return LowerVectorOR(Op, DAG);
-  case ISD::XOR:
-    return LowerXOR(Op, DAG);
-  case ISD::PREFETCH:
-    return LowerPREFETCH(Op, DAG);
-  case ISD::SINT_TO_FP:
-  case ISD::UINT_TO_FP:
-    return LowerINT_TO_FP(Op, DAG);
-  case ISD::FP_TO_SINT:
-  case ISD::FP_TO_UINT:
-    return LowerFP_TO_INT(Op, DAG);
-  case ISD::FSINCOS:
-    return LowerFSINCOS(Op, DAG);
-  }
-}
-
-/// getFunctionAlignment - Return the Log2 alignment of this function.
-unsigned ARM64TargetLowering::getFunctionAlignment(const Function *F) const {
-  return 2;
-}
-
-//===----------------------------------------------------------------------===//
-//                      Calling Convention Implementation
-//===----------------------------------------------------------------------===//
-
-#include "ARM64GenCallingConv.inc"
-
-/// Selects the correct CCAssignFn for a the given CallingConvention
-/// value.
-CCAssignFn *ARM64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
-                                                   bool IsVarArg) const {
-  switch (CC) {
-  default:
-    llvm_unreachable("Unsupported calling convention.");
-  case CallingConv::WebKit_JS:
-    return CC_ARM64_WebKit_JS;
-  case CallingConv::C:
-  case CallingConv::Fast:
-    if (!Subtarget->isTargetDarwin())
-      return CC_ARM64_AAPCS;
-    return IsVarArg ? CC_ARM64_DarwinPCS_VarArg : CC_ARM64_DarwinPCS;
-  }
-}
-
-SDValue ARM64TargetLowering::LowerFormalArguments(
-    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
-    const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL, SelectionDAG &DAG,
-    SmallVectorImpl<SDValue> &InVals) const {
-  MachineFunction &MF = DAG.getMachineFunction();
-  MachineFrameInfo *MFI = MF.getFrameInfo();
-
-  // Assign locations to all of the incoming arguments.
-  SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
-                 getTargetMachine(), ArgLocs, *DAG.getContext());
-
-  // At this point, Ins[].VT may already be promoted to i32. To correctly
-  // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
-  // i8 to CC_ARM64_AAPCS with i32 being ValVT and i8 being LocVT.
-  // Since AnalyzeFormalArguments uses Ins[].VT for both ValVT and LocVT, here
-  // we use a special version of AnalyzeFormalArguments to pass in ValVT and
-  // LocVT.
-  unsigned NumArgs = Ins.size();
-  Function::const_arg_iterator CurOrigArg = MF.getFunction()->arg_begin();
-  unsigned CurArgIdx = 0;
-  for (unsigned i = 0; i != NumArgs; ++i) {
-    MVT ValVT = Ins[i].VT;
-    std::advance(CurOrigArg, Ins[i].OrigArgIndex - CurArgIdx);
-    CurArgIdx = Ins[i].OrigArgIndex;
-
-    // Get type of the original argument.
-    EVT ActualVT = getValueType(CurOrigArg->getType(), /*AllowUnknown*/ true);
-    MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : MVT::Other;
-    // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
-    MVT LocVT = ValVT;
-    if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
-      LocVT = MVT::i8;
-    else if (ActualMVT == MVT::i16)
-      LocVT = MVT::i16;
-
-    CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false);
-    bool Res =
-        AssignFn(i, ValVT, LocVT, CCValAssign::Full, Ins[i].Flags, CCInfo);
-    assert(!Res && "Call operand has unhandled type");
-    (void)Res;
-  }
-
-  SmallVector<SDValue, 16> ArgValues;
-  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
-    CCValAssign &VA = ArgLocs[i];
-
-    // Arguments stored in registers.
-    if (VA.isRegLoc()) {
-      EVT RegVT = VA.getLocVT();
-
-      SDValue ArgValue;
-      const TargetRegisterClass *RC;
-
-      if (RegVT == MVT::i32)
-        RC = &ARM64::GPR32RegClass;
-      else if (RegVT == MVT::i64)
-        RC = &ARM64::GPR64RegClass;
-      else if (RegVT == MVT::f32)
-        RC = &ARM64::FPR32RegClass;
-      else if (RegVT == MVT::f64 || RegVT == MVT::v1i64 ||
-               RegVT == MVT::v1f64 || RegVT == MVT::v2i32 ||
-               RegVT == MVT::v4i16 || RegVT == MVT::v8i8)
-        RC = &ARM64::FPR64RegClass;
-      else if (RegVT == MVT::v2i64 || RegVT == MVT::v4i32 ||
-               RegVT == MVT::v8i16 || RegVT == MVT::v16i8)
-        RC = &ARM64::FPR128RegClass;
-      else
-        llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
-
-      // Transform the arguments in physical registers into virtual ones.
-      unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
-      ArgValue = DAG.getCopyFromReg(Chain, DL, Reg, RegVT);
-
-      // If this is an 8, 16 or 32-bit value, it is really passed promoted
-      // to 64 bits.  Insert an assert[sz]ext to capture this, then
-      // truncate to the right size.
-      switch (VA.getLocInfo()) {
-      default:
-        llvm_unreachable("Unknown loc info!");
-      case CCValAssign::Full:
-        break;
-      case CCValAssign::BCvt:
-        ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue);
-        break;
-      case CCValAssign::SExt:
-        ArgValue = DAG.getNode(ISD::AssertSext, DL, RegVT, ArgValue,
-                               DAG.getValueType(VA.getValVT()));
-        ArgValue = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), ArgValue);
-        break;
-      case CCValAssign::ZExt:
-        ArgValue = DAG.getNode(ISD::AssertZext, DL, RegVT, ArgValue,
-                               DAG.getValueType(VA.getValVT()));
-        ArgValue = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), ArgValue);
-        break;
-      }
-
-      InVals.push_back(ArgValue);
-
-    } else { // VA.isRegLoc()
-      assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem");
-      unsigned ArgOffset = VA.getLocMemOffset();
-      unsigned ArgSize = VA.getLocVT().getSizeInBits() / 8;
-      int FI = MFI->CreateFixedObject(ArgSize, ArgOffset, true);
-
-      // Create load nodes to retrieve arguments from the stack.
-      SDValue FIN = DAG.getFrameIndex(FI, getPointerTy());
-      InVals.push_back(DAG.getLoad(VA.getValVT(), DL, Chain, FIN,
-                                   MachinePointerInfo::getFixedStack(FI), false,
-                                   false, false, 0));
-    }
-  }
-
-  // varargs
-  if (isVarArg) {
-    if (!Subtarget->isTargetDarwin()) {
-      // The AAPCS variadic function ABI is identical to the non-variadic
-      // one. As a result there may be more arguments in registers and we should
-      // save them for future reference.
-      saveVarArgRegisters(CCInfo, DAG, DL, Chain);
-    }
-
-    ARM64FunctionInfo *AFI = MF.getInfo<ARM64FunctionInfo>();
-    // This will point to the next argument passed via stack.
-    unsigned StackOffset = CCInfo.getNextStackOffset();
-    // We currently pass all varargs at 8-byte alignment.
-    StackOffset = ((StackOffset + 7) & ~7);
-    AFI->setVarArgsStackIndex(MFI->CreateFixedObject(4, StackOffset, true));
-  }
-
-  return Chain;
-}
-
-void ARM64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
-                                              SelectionDAG &DAG, SDLoc DL,
-                                              SDValue &Chain) const {
-  MachineFunction &MF = DAG.getMachineFunction();
-  MachineFrameInfo *MFI = MF.getFrameInfo();
-  ARM64FunctionInfo *FuncInfo = MF.getInfo<ARM64FunctionInfo>();
-
-  SmallVector<SDValue, 8> MemOps;
-
-  static const uint16_t GPRArgRegs[] = { ARM64::X0, ARM64::X1, ARM64::X2,
-                                         ARM64::X3, ARM64::X4, ARM64::X5,
-                                         ARM64::X6, ARM64::X7 };
-  static const unsigned NumGPRArgRegs = array_lengthof(GPRArgRegs);
-  unsigned FirstVariadicGPR =
-      CCInfo.getFirstUnallocated(GPRArgRegs, NumGPRArgRegs);
-
-  static const uint16_t FPRArgRegs[] = { ARM64::Q0, ARM64::Q1, ARM64::Q2,
-                                         ARM64::Q3, ARM64::Q4, ARM64::Q5,
-                                         ARM64::Q6, ARM64::Q7 };
-  static const unsigned NumFPRArgRegs = array_lengthof(FPRArgRegs);
-  unsigned FirstVariadicFPR =
-      CCInfo.getFirstUnallocated(FPRArgRegs, NumFPRArgRegs);
-
-  unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR);
-  int GPRIdx = 0;
-  if (GPRSaveSize != 0) {
-    GPRIdx = MFI->CreateStackObject(GPRSaveSize, 8, false);
-
-    SDValue FIN = DAG.getFrameIndex(GPRIdx, getPointerTy());
-
-    for (unsigned i = FirstVariadicGPR; i < NumGPRArgRegs; ++i) {
-      unsigned VReg = MF.addLiveIn(GPRArgRegs[i], &ARM64::GPR64RegClass);
-      SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
-      SDValue Store =
-          DAG.getStore(Val.getValue(1), DL, Val, FIN,
-                       MachinePointerInfo::getStack(i * 8), false, false, 0);
-      MemOps.push_back(Store);
-      FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), FIN,
-                        DAG.getConstant(8, getPointerTy()));
-    }
-  }
-
-  unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
-  int FPRIdx = 0;
-  if (FPRSaveSize != 0) {
-    FPRIdx = MFI->CreateStackObject(FPRSaveSize, 16, false);
-
-    SDValue FIN = DAG.getFrameIndex(FPRIdx, getPointerTy());
-
-    for (unsigned i = FirstVariadicFPR; i < NumFPRArgRegs; ++i) {
-      unsigned VReg = MF.addLiveIn(FPRArgRegs[i], &ARM64::FPR128RegClass);
-      SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::v2i64);
-      SDValue Store =
-          DAG.getStore(Val.getValue(1), DL, Val, FIN,
-                       MachinePointerInfo::getStack(i * 16), false, false, 0);
-      MemOps.push_back(Store);
-      FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), FIN,
-                        DAG.getConstant(16, getPointerTy()));
-    }
-  }
-
-  FuncInfo->setVarArgsGPRIndex(GPRIdx);
-  FuncInfo->setVarArgsGPRSize(GPRSaveSize);
-  FuncInfo->setVarArgsFPRIndex(FPRIdx);
-  FuncInfo->setVarArgsFPRSize(FPRSaveSize);
-
-  if (!MemOps.empty()) {
-    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, &MemOps[0],
-                        MemOps.size());
-  }
-}
-
-/// LowerCallResult - Lower the result values of a call into the
-/// appropriate copies out of appropriate physical registers.
-SDValue ARM64TargetLowering::LowerCallResult(
-    SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
-    const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL, SelectionDAG &DAG,
-    SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
-    SDValue ThisVal) const {
-  CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS ? RetCC_ARM64_WebKit_JS
-                                                         : RetCC_ARM64_AAPCS;
-  // Assign locations to each value returned by this call.
-  SmallVector<CCValAssign, 16> RVLocs;
-  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
-                 getTargetMachine(), RVLocs, *DAG.getContext());
-  CCInfo.AnalyzeCallResult(Ins, RetCC);
-
-  // Copy all of the result registers out of their specified physreg.
-  for (unsigned i = 0; i != RVLocs.size(); ++i) {
-    CCValAssign VA = RVLocs[i];
-
-    // Pass 'this' value directly from the argument to return value, to avoid
-    // reg unit interference
-    if (i == 0 && isThisReturn) {
-      assert(!VA.needsCustom() && VA.getLocVT() == MVT::i64 &&
-             "unexpected return calling convention register assignment");
-      InVals.push_back(ThisVal);
-      continue;
-    }
-
-    SDValue Val =
-        DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InFlag);
-    Chain = Val.getValue(1);
-    InFlag = Val.getValue(2);
-
-    switch (VA.getLocInfo()) {
-    default:
-      llvm_unreachable("Unknown loc info!");
-    case CCValAssign::Full:
-      break;
-    case CCValAssign::BCvt:
-      Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
-      break;
-    }
-
-    InVals.push_back(Val);
-  }
-
-  return Chain;
-}
-
-bool ARM64TargetLowering::isEligibleForTailCallOptimization(
-    SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
-    bool isCalleeStructRet, bool isCallerStructRet,
-    const SmallVectorImpl<ISD::OutputArg> &Outs,
-    const SmallVectorImpl<SDValue> &OutVals,
-    const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
-  // Look for obvious safe cases to perform tail call optimization that do not
-  // require ABI changes. This is what gcc calls sibcall.
-
-  // Do not sibcall optimize vararg calls unless the call site is not passing
-  // any arguments.
-  if (isVarArg && !Outs.empty())
-    return false;
-
-  // Also avoid sibcall optimization if either caller or callee uses struct
-  // return semantics.
-  if (isCalleeStructRet || isCallerStructRet)
-    return false;
-
-  // Note that currently ARM64 "C" calling convention and "Fast" calling
-  // convention are compatible. If/when that ever changes, we'll need to
-  // add checks here to make sure any interactions are OK.
-
-  // If the callee takes no arguments then go on to check the results of the
-  // call.
-  if (!Outs.empty()) {
-    // Check if stack adjustment is needed. For now, do not do this if any
-    // argument is passed on the stack.
-    SmallVector<CCValAssign, 16> ArgLocs;
-    CCState CCInfo(CalleeCC, isVarArg, DAG.getMachineFunction(),
-                   getTargetMachine(), ArgLocs, *DAG.getContext());
-    CCAssignFn *AssignFn = CCAssignFnForCall(CalleeCC, /*IsVarArg=*/false);
-    CCInfo.AnalyzeCallOperands(Outs, AssignFn);
-    if (CCInfo.getNextStackOffset()) {
-      // Check if the arguments are already laid out in the right way as
-      // the caller's fixed stack objects.
-      for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e;
-           ++i, ++realArgIdx) {
-        CCValAssign &VA = ArgLocs[i];
-        if (VA.getLocInfo() == CCValAssign::Indirect)
-          return false;
-        if (VA.needsCustom()) {
-          // Just don't handle anything that needs custom adjustments for now.
-          // If need be, we can revisit later, but we shouldn't ever end up
-          // here.
-          return false;
-        } else if (!VA.isRegLoc()) {
-          // Likewise, don't try to handle stack based arguments for the
-          // time being.
-          return false;
-        }
-      }
-    }
-  }
-
-  return true;
-}
-/// LowerCall - Lower a call to a callseq_start + CALL + callseq_end chain,
-/// and add input and output parameter nodes.
-SDValue ARM64TargetLowering::LowerCall(CallLoweringInfo &CLI,
-                                       SmallVectorImpl<SDValue> &InVals) const {
-  SelectionDAG &DAG = CLI.DAG;
-  SDLoc &DL = CLI.DL;
-  SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
-  SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
-  SmallVector<ISD::InputArg, 32> &Ins = CLI.Ins;
-  SDValue Chain = CLI.Chain;
-  SDValue Callee = CLI.Callee;
-  bool &IsTailCall = CLI.IsTailCall;
-  CallingConv::ID CallConv = CLI.CallConv;
-  bool IsVarArg = CLI.IsVarArg;
-
-  MachineFunction &MF = DAG.getMachineFunction();
-  bool IsStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
-  bool IsThisReturn = false;
-
-  // If tail calls are explicitly disabled, make sure not to use them.
-  if (!EnableARM64TailCalls)
-    IsTailCall = false;
-
-  if (IsTailCall) {
-    // Check if it's really possible to do a tail call.
-    IsTailCall = isEligibleForTailCallOptimization(
-        Callee, CallConv, IsVarArg, IsStructRet,
-        MF.getFunction()->hasStructRetAttr(), Outs, OutVals, Ins, DAG);
-    // We don't support GuaranteedTailCallOpt, only automatically
-    // detected sibcalls.
-    // FIXME: Re-evaluate. Is this true? Should it be true?
-    if (IsTailCall)
-      ++NumTailCalls;
-  }
-
-  // Analyze operands of the call, assigning locations to each operand.
-  SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(),
-                 getTargetMachine(), ArgLocs, *DAG.getContext());
-
-  if (IsVarArg) {
-    // Handle fixed and variable vector arguments differently.
-    // Variable vector arguments always go into memory.
-    unsigned NumArgs = Outs.size();
-
-    for (unsigned i = 0; i != NumArgs; ++i) {
-      MVT ArgVT = Outs[i].VT;
-      ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
-      CCAssignFn *AssignFn = CCAssignFnForCall(CallConv,
-                                               /*IsVarArg=*/ !Outs[i].IsFixed);
-      bool Res = AssignFn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo);
-      assert(!Res && "Call operand has unhandled type");
-      (void)Res;
-    }
-  } else {
-    // At this point, Outs[].VT may already be promoted to i32. To correctly
-    // handle passing i8 as i8 instead of i32 on stack, we pass in both i32 and
-    // i8 to CC_ARM64_AAPCS with i32 being ValVT and i8 being LocVT.
-    // Since AnalyzeCallOperands uses Ins[].VT for both ValVT and LocVT, here
-    // we use a special version of AnalyzeCallOperands to pass in ValVT and
-    // LocVT.
-    unsigned NumArgs = Outs.size();
-    for (unsigned i = 0; i != NumArgs; ++i) {
-      MVT ValVT = Outs[i].VT;
-      // Get type of the original argument.
-      EVT ActualVT = getValueType(CLI.Args[Outs[i].OrigArgIndex].Ty,
-                                  /*AllowUnknown*/ true);
-      MVT ActualMVT = ActualVT.isSimple() ? ActualVT.getSimpleVT() : ValVT;
-      ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
-      // If ActualMVT is i1/i8/i16, we should set LocVT to i8/i8/i16.
-      MVT LocVT = ValVT;
-      if (ActualMVT == MVT::i1 || ActualMVT == MVT::i8)
-        LocVT = MVT::i8;
-      else if (ActualMVT == MVT::i16)
-        LocVT = MVT::i16;
-
-      CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, /*IsVarArg=*/false);
-      bool Res = AssignFn(i, ValVT, LocVT, CCValAssign::Full, ArgFlags, CCInfo);
-      assert(!Res && "Call operand has unhandled type");
-      (void)Res;
-    }
-  }
-
-  // Get a count of how many bytes are to be pushed on the stack.
-  unsigned NumBytes = CCInfo.getNextStackOffset();
-
-  // Adjust the stack pointer for the new arguments...
-  // These operations are automatically eliminated by the prolog/epilog pass
-  if (!IsTailCall)
-    Chain =
-        DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, true), DL);
-
-  SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, ARM64::SP, getPointerTy());
-
-  SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
-  SmallVector<SDValue, 8> MemOpChains;
-
-  // Walk the register/memloc assignments, inserting copies/loads.
-  for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e;
-       ++i, ++realArgIdx) {
-    CCValAssign &VA = ArgLocs[i];
-    SDValue Arg = OutVals[realArgIdx];
-    ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
-
-    // Promote the value if needed.
-    switch (VA.getLocInfo()) {
-    default:
-      llvm_unreachable("Unknown loc info!");
-    case CCValAssign::Full:
-      break;
-    case CCValAssign::SExt:
-      Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
-      break;
-    case CCValAssign::ZExt:
-      Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
-      break;
-    case CCValAssign::AExt:
-      Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
-      break;
-    case CCValAssign::BCvt:
-      Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
-      break;
-    case CCValAssign::FPExt:
-      Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
-      break;
-    }
-
-    if (VA.isRegLoc()) {
-      if (realArgIdx == 0 && Flags.isReturned() && Outs[0].VT == MVT::i64) {
-        assert(VA.getLocVT() == MVT::i64 &&
-               "unexpected calling convention register assignment");
-        assert(!Ins.empty() && Ins[0].VT == MVT::i64 &&
-               "unexpected use of 'returned'");
-        IsThisReturn = true;
-      }
-      RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
-    } else {
-      assert(VA.isMemLoc());
-      // There's no reason we can't support stack args w/ tailcall, but
-      // we currently don't, so assert if we see one.
-      assert(!IsTailCall && "stack argument with tail call!?");
-      unsigned LocMemOffset = VA.getLocMemOffset();
-      SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset);
-      PtrOff = DAG.getNode(ISD::ADD, DL, getPointerTy(), StackPtr, PtrOff);
-
-      // Since we pass i1/i8/i16 as i1/i8/i16 on stack and Arg is already
-      // promoted to a legal register type i32, we should truncate Arg back to
-      // i1/i8/i16.
-      if (Arg.getValueType().isSimple() &&
-          Arg.getValueType().getSimpleVT() == MVT::i32 &&
-          (VA.getLocVT() == MVT::i1 || VA.getLocVT() == MVT::i8 ||
-           VA.getLocVT() == MVT::i16))
-        Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getLocVT(), Arg);
-
-      SDValue Store = DAG.getStore(Chain, DL, Arg, PtrOff,
-                                   MachinePointerInfo::getStack(LocMemOffset),
-                                   false, false, 0);
-      MemOpChains.push_back(Store);
-    }
-  }
-
-  if (!MemOpChains.empty())
-    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, &MemOpChains[0],
-                        MemOpChains.size());
-
-  // Build a sequence of copy-to-reg nodes chained together with token chain
-  // and flag operands which copy the outgoing args into the appropriate regs.
-  SDValue InFlag;
-  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
-    Chain = DAG.getCopyToReg(Chain, DL, RegsToPass[i].first,
-                             RegsToPass[i].second, InFlag);
-    InFlag = Chain.getValue(1);
-  }
-
-  // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
-  // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
-  // node so that legalize doesn't hack it.
-  if (getTargetMachine().getCodeModel() == CodeModel::Large &&
-      Subtarget->isTargetMachO()) {
-    if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
-      const GlobalValue *GV = G->getGlobal();
-      bool InternalLinkage = GV->hasInternalLinkage();
-      if (InternalLinkage)
-        Callee = DAG.getTargetGlobalAddress(GV, DL, getPointerTy(), 0, 0);
-      else {
-        Callee = DAG.getTargetGlobalAddress(GV, DL, getPointerTy(), 0,
-                                            ARM64II::MO_GOT);
-        Callee = DAG.getNode(ARM64ISD::LOADgot, DL, getPointerTy(), Callee);
-      }
-    } else if (ExternalSymbolSDNode *S =
-                   dyn_cast<ExternalSymbolSDNode>(Callee)) {
-      const char *Sym = S->getSymbol();
-      Callee =
-          DAG.getTargetExternalSymbol(Sym, getPointerTy(), ARM64II::MO_GOT);
-      Callee = DAG.getNode(ARM64ISD::LOADgot, DL, getPointerTy(), Callee);
-    }
-  } else if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
-    const GlobalValue *GV = G->getGlobal();
-    Callee = DAG.getTargetGlobalAddress(GV, DL, getPointerTy(), 0, 0);
-  } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
-    const char *Sym = S->getSymbol();
-    Callee = DAG.getTargetExternalSymbol(Sym, getPointerTy(), 0);
-  }
-
-  std::vector<SDValue> Ops;
-  Ops.push_back(Chain);
-  Ops.push_back(Callee);
-
-  // Add argument registers to the end of the list so that they are known live
-  // into the call.
-  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
-    Ops.push_back(DAG.getRegister(RegsToPass[i].first,
-                                  RegsToPass[i].second.getValueType()));
-
-  // Add a register mask operand representing the call-preserved registers.
-  const uint32_t *Mask;
-  const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
-  const ARM64RegisterInfo *ARI = static_cast<const ARM64RegisterInfo *>(TRI);
-  if (IsThisReturn) {
-    // For 'this' returns, use the X0-preserving mask if applicable
-    Mask = ARI->getThisReturnPreservedMask(CallConv);
-    if (!Mask) {
-      IsThisReturn = false;
-      Mask = ARI->getCallPreservedMask(CallConv);
-    }
-  } else
-    Mask = ARI->getCallPreservedMask(CallConv);
-
-  assert(Mask && "Missing call preserved mask for calling convention");
-  Ops.push_back(DAG.getRegisterMask(Mask));
-
-  if (InFlag.getNode())
-    Ops.push_back(InFlag);
-
-  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
-
-  // If we're doing a tall call, use a TC_RETURN here rather than an
-  // actual call instruction.
-  if (IsTailCall)
-    return DAG.getNode(ARM64ISD::TC_RETURN, DL, NodeTys, &Ops[0], Ops.size());
-
-  // Returns a chain and a flag for retval copy to use.
-  Chain = DAG.getNode(ARM64ISD::CALL, DL, NodeTys, &Ops[0], Ops.size());
-  InFlag = Chain.getValue(1);
-
-  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(NumBytes, true),
-                             DAG.getIntPtrConstant(0, true), InFlag, DL);
-  if (!Ins.empty())
-    InFlag = Chain.getValue(1);
-
-  // Handle result values, copying them out of physregs into vregs that we
-  // return.
-  return LowerCallResult(Chain, InFlag, CallConv, IsVarArg, Ins, DL, DAG,
-                         InVals, IsThisReturn,
-                         IsThisReturn ? OutVals[0] : SDValue());
-}
-
-bool ARM64TargetLowering::CanLowerReturn(
-    CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
-    const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
-  CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS ? RetCC_ARM64_WebKit_JS
-                                                         : RetCC_ARM64_AAPCS;
-  SmallVector<CCValAssign, 16> RVLocs;
-  CCState CCInfo(CallConv, isVarArg, MF, getTargetMachine(), RVLocs, Context);
-  return CCInfo.CheckReturn(Outs, RetCC);
-}
-
-SDValue
-ARM64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
-                                 bool isVarArg,
-                                 const SmallVectorImpl<ISD::OutputArg> &Outs,
-                                 const SmallVectorImpl<SDValue> &OutVals,
-                                 SDLoc DL, SelectionDAG &DAG) const {
-  CCAssignFn *RetCC = CallConv == CallingConv::WebKit_JS ? RetCC_ARM64_WebKit_JS
-                                                         : RetCC_ARM64_AAPCS;
-  SmallVector<CCValAssign, 16> RVLocs;
-  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(),
-                 getTargetMachine(), RVLocs, *DAG.getContext());
-  CCInfo.AnalyzeReturn(Outs, RetCC);
-
-  // Copy the result values into the output registers.
-  SDValue Flag;
-  SmallVector<SDValue, 4> RetOps(1, Chain);
-  for (unsigned i = 0, realRVLocIdx = 0; i != RVLocs.size();
-       ++i, ++realRVLocIdx) {
-    CCValAssign &VA = RVLocs[i];
-    assert(VA.isRegLoc() && "Can only return in registers!");
-    SDValue Arg = OutVals[realRVLocIdx];
-
-    switch (VA.getLocInfo()) {
-    default:
-      llvm_unreachable("Unknown loc info!");
-    case CCValAssign::Full:
-      break;
-    case CCValAssign::BCvt:
-      Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
-      break;
-    }
-
-    Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Flag);
-    Flag = Chain.getValue(1);
-    RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
-  }
-
-  RetOps[0] = Chain; // Update chain.
-
-  // Add the flag if we have it.
-  if (Flag.getNode())
-    RetOps.push_back(Flag);
-
-  return DAG.getNode(ARM64ISD::RET_FLAG, DL, MVT::Other, &RetOps[0],
-                     RetOps.size());
-}
-
-//===----------------------------------------------------------------------===//
-//  Other Lowering Code
-//===----------------------------------------------------------------------===//
-
-SDValue ARM64TargetLowering::LowerGlobalAddress(SDValue Op,
-                                                SelectionDAG &DAG) const {
-  EVT PtrVT = getPointerTy();
-  SDLoc DL(Op);
-  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
-  unsigned char OpFlags =
-      Subtarget->ClassifyGlobalReference(GV, getTargetMachine());
-
-  assert(cast<GlobalAddressSDNode>(Op)->getOffset() == 0 &&
-         "unexpected offset in global node");
-
-  // This also catched the large code model case for Darwin.
-  if ((OpFlags & ARM64II::MO_GOT) != 0) {
-    SDValue GotAddr = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags);
-    // FIXME: Once remat is capable of dealing with instructions with register
-    // operands, expand this into two nodes instead of using a wrapper node.
-    return DAG.getNode(ARM64ISD::LOADgot, DL, PtrVT, GotAddr);
-  }
-
-  if (getTargetMachine().getCodeModel() == CodeModel::Large) {
-    const unsigned char MO_NC = ARM64II::MO_NC;
-    return DAG.getNode(
-        ARM64ISD::WrapperLarge, DL, PtrVT,
-        DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, ARM64II::MO_G3),
-        DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, ARM64II::MO_G2 | MO_NC),
-        DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, ARM64II::MO_G1 | MO_NC),
-        DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, ARM64II::MO_G0 | MO_NC));
-  } else {
-    // Use ADRP/ADD or ADRP/LDR for everything else: the small model on ELF and
-    // the only correct model on Darwin.
-    SDValue Hi = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0,
-                                            OpFlags | ARM64II::MO_PAGE);
-    unsigned char LoFlags = OpFlags | ARM64II::MO_PAGEOFF | ARM64II::MO_NC;
-    SDValue Lo = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, LoFlags);
-
-    SDValue ADRP = DAG.getNode(ARM64ISD::ADRP, DL, PtrVT, Hi);
-    return DAG.getNode(ARM64ISD::ADDlow, DL, PtrVT, ADRP, Lo);
-  }
-}
-
-/// \brief Convert a TLS address reference into the correct sequence of loads
-/// and calls to compute the variable's address (for Darwin, currently) and
-/// return an SDValue containing the final node.
-
-/// Darwin only has one TLS scheme which must be capable of dealing with the
-/// fully general situation, in the worst case. This means:
-///     + "extern __thread" declaration.
-///     + Defined in a possibly unknown dynamic library.
-///
-/// The general system is that each __thread variable has a [3 x i64] descriptor
-/// which contains information used by the runtime to calculate the address. The
-/// only part of this the compiler needs to know about is the first xword, which
-/// contains a function pointer that must be called with the address of the
-/// entire descriptor in "x0".
-///
-/// Since this descriptor may be in a different unit, in general even the
-/// descriptor must be accessed via an indirect load. The "ideal" code sequence
-/// is:
-///     adrp x0, _var@TLVPPAGE
-///     ldr x0, [x0, _var@TLVPPAGEOFF]   ; x0 now contains address of descriptor
-///     ldr x1, [x0]                     ; x1 contains 1st entry of descriptor,
-///                                      ; the function pointer
-///     blr x1                           ; Uses descriptor address in x0
-///     ; Address of _var is now in x0.
-///
-/// If the address of _var's descriptor *is* known to the linker, then it can
-/// change the first "ldr" instruction to an appropriate "add x0, x0, #imm" for
-/// a slight efficiency gain.
-SDValue
-ARM64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op,
-                                                 SelectionDAG &DAG) const {
-  assert(Subtarget->isTargetDarwin() && "TLS only supported on Darwin");
-
-  SDLoc DL(Op);
-  MVT PtrVT = getPointerTy();
-  const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
-
-  SDValue TLVPAddr =
-      DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, ARM64II::MO_TLS);
-  SDValue DescAddr = DAG.getNode(ARM64ISD::LOADgot, DL, PtrVT, TLVPAddr);
-
-  // The first entry in the descriptor is a function pointer that we must call
-  // to obtain the address of the variable.
-  SDValue Chain = DAG.getEntryNode();
-  SDValue FuncTLVGet =
-      DAG.getLoad(MVT::i64, DL, Chain, DescAddr, MachinePointerInfo::getGOT(),
-                  false, true, true, 8);
-  Chain = FuncTLVGet.getValue(1);
-
-  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
-  MFI->setAdjustsStack(true);
-
-  // TLS calls preserve all registers except those that absolutely must be
-  // trashed: X0 (it takes an argument), LR (it's a call) and CPSR (let's not be
-  // silly).
-  const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
-  const ARM64RegisterInfo *ARI = static_cast<const ARM64RegisterInfo *>(TRI);
-  const uint32_t *Mask = ARI->getTLSCallPreservedMask();
-
-  // Finally, we can make the call. This is just a degenerate version of a
-  // normal ARM64 call node: x0 takes the address of the descriptor, and returns
-  // the address of the variable in this thread.
-  Chain = DAG.getCopyToReg(Chain, DL, ARM64::X0, DescAddr, SDValue());
-  Chain = DAG.getNode(ARM64ISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
-                      Chain, FuncTLVGet, DAG.getRegister(ARM64::X0, MVT::i64),
-                      DAG.getRegisterMask(Mask), Chain.getValue(1));
-  return DAG.getCopyFromReg(Chain, DL, ARM64::X0, PtrVT, Chain.getValue(1));
-}
-
-/// When accessing thread-local variables under either the general-dynamic or
-/// local-dynamic system, we make a "TLS-descriptor" call. The variable will
-/// have a descriptor, accessible via a PC-relative ADRP, and whose first entry
-/// is a function pointer to carry out the resolution. This function takes the
-/// address of the descriptor in X0 and returns the TPIDR_EL0 offset in X0. All
-/// other registers (except LR, CPSR) are preserved.
-///
-/// Thus, the ideal call sequence on AArch64 is:
-///
-///     adrp x0, :tlsdesc:thread_var
-///     ldr x8, [x0, :tlsdesc_lo12:thread_var]
-///     add x0, x0, :tlsdesc_lo12:thread_var
-///     .tlsdesccall thread_var
-///     blr x8
-///     (TPIDR_EL0 offset now in x0).
-///
-/// The ".tlsdesccall" directive instructs the assembler to insert a particular
-/// relocation to help the linker relax this sequence if it turns out to be too
-/// conservative.
-///
-/// FIXME: we currently produce an extra, duplicated, ADRP instruction, but this
-/// is harmless.
-SDValue ARM64TargetLowering::LowerELFTLSDescCall(SDValue SymAddr,
-                                                 SDValue DescAddr, SDLoc DL,
-                                                 SelectionDAG &DAG) const {
-  EVT PtrVT = getPointerTy();
-
-  // The function we need to call is simply the first entry in the GOT for this
-  // descriptor, load it in preparation.
-  SDValue Func = DAG.getNode(ARM64ISD::LOADgot, DL, PtrVT, SymAddr);
-
-  // TLS calls preserve all registers except those that absolutely must be
-  // trashed: X0 (it takes an argument), LR (it's a call) and CPSR (let's not be
-  // silly).
-  const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
-  const ARM64RegisterInfo *ARI = static_cast<const ARM64RegisterInfo *>(TRI);
-  const uint32_t *Mask = ARI->getTLSCallPreservedMask();
-
-  // The function takes only one argument: the address of the descriptor itself
-  // in X0.
-  SDValue Glue, Chain;
-  Chain = DAG.getCopyToReg(DAG.getEntryNode(), DL, ARM64::X0, DescAddr, Glue);
-  Glue = Chain.getValue(1);
-
-  // We're now ready to populate the argument list, as with a normal call:
-  SmallVector<SDValue, 6> Ops;
-  Ops.push_back(Chain);
-  Ops.push_back(Func);
-  Ops.push_back(SymAddr);
-  Ops.push_back(DAG.getRegister(ARM64::X0, PtrVT));
-  Ops.push_back(DAG.getRegisterMask(Mask));
-  Ops.push_back(Glue);
-
-  SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
-  Chain = DAG.getNode(ARM64ISD::TLSDESC_CALL, DL, NodeTys, &Ops[0], Ops.size());
-  Glue = Chain.getValue(1);
-
-  return DAG.getCopyFromReg(Chain, DL, ARM64::X0, PtrVT, Glue);
-}
-
-SDValue ARM64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op,
-                                                      SelectionDAG &DAG) const {
-  assert(Subtarget->isTargetELF() && "This function expects an ELF target");
-  assert(getTargetMachine().getCodeModel() == CodeModel::Small &&
-         "ELF TLS only supported in small memory model");
-  const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
-
-  TLSModel::Model Model = getTargetMachine().getTLSModel(GA->getGlobal());
-
-  SDValue TPOff;
-  EVT PtrVT = getPointerTy();
-  SDLoc DL(Op);
-  const GlobalValue *GV = GA->getGlobal();
-
-  SDValue ThreadBase = DAG.getNode(ARM64ISD::THREAD_POINTER, DL, PtrVT);
-
-  if (Model == TLSModel::LocalExec) {
-    SDValue HiVar = DAG.getTargetGlobalAddress(
-        GV, DL, PtrVT, 0, ARM64II::MO_TLS | ARM64II::MO_G1);
-    SDValue LoVar = DAG.getTargetGlobalAddress(
-        GV, DL, PtrVT, 0, ARM64II::MO_TLS | ARM64II::MO_G0 | ARM64II::MO_NC);
-
-    TPOff = SDValue(DAG.getMachineNode(ARM64::MOVZXi, DL, PtrVT, HiVar,
-                                       DAG.getTargetConstant(16, MVT::i32)),
-                    0);
-    TPOff = SDValue(DAG.getMachineNode(ARM64::MOVKXi, DL, PtrVT, TPOff, LoVar,
-                                       DAG.getTargetConstant(0, MVT::i32)),
-                    0);
-  } else if (Model == TLSModel::InitialExec) {
-    TPOff = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, ARM64II::MO_TLS);
-    TPOff = DAG.getNode(ARM64ISD::LOADgot, DL, PtrVT, TPOff);
-  } else if (Model == TLSModel::LocalDynamic) {
-    // Local-dynamic accesses proceed in two phases. A general-dynamic TLS
-    // descriptor call against the special symbol _TLS_MODULE_BASE_ to calculate
-    // the beginning of the module's TLS region, followed by a DTPREL offset
-    // calculation.
-
-    // These accesses will need deduplicating if there's more than one.
-    ARM64FunctionInfo *MFI =
-        DAG.getMachineFunction().getInfo<ARM64FunctionInfo>();
-    MFI->incNumLocalDynamicTLSAccesses();
-
-    // Accesses used in this sequence go via the TLS descriptor which lives in
-    // the GOT. Prepare an address we can use to handle this.
-    SDValue HiDesc = DAG.getTargetExternalSymbol(
-        "_TLS_MODULE_BASE_", PtrVT, ARM64II::MO_TLS | ARM64II::MO_PAGE);
-    SDValue LoDesc = DAG.getTargetExternalSymbol(
-        "_TLS_MODULE_BASE_", PtrVT,
-        ARM64II::MO_TLS | ARM64II::MO_PAGEOFF | ARM64II::MO_NC);
-
-    // First argument to the descriptor call is the address of the descriptor
-    // itself.
-    SDValue DescAddr = DAG.getNode(ARM64ISD::ADRP, DL, PtrVT, HiDesc);
-    DescAddr = DAG.getNode(ARM64ISD::ADDlow, DL, PtrVT, DescAddr, LoDesc);
-
-    // The call needs a relocation too for linker relaxation. It doesn't make
-    // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
-    // the address.
-    SDValue SymAddr = DAG.getTargetExternalSymbol("_TLS_MODULE_BASE_", PtrVT,
-                                                  ARM64II::MO_TLS);
-
-    // Now we can calculate the offset from TPIDR_EL0 to this module's
-    // thread-local area.
-    TPOff = LowerELFTLSDescCall(SymAddr, DescAddr, DL, DAG);
-
-    // Now use :dtprel_whatever: operations to calculate this variable's offset
-    // in its thread-storage area.
-    SDValue HiVar = DAG.getTargetGlobalAddress(
-        GV, DL, MVT::i64, 0, ARM64II::MO_TLS | ARM64II::MO_G1);
-    SDValue LoVar = DAG.getTargetGlobalAddress(
-        GV, DL, MVT::i64, 0, ARM64II::MO_TLS | ARM64II::MO_G0 | ARM64II::MO_NC);
-
-    SDValue DTPOff =
-        SDValue(DAG.getMachineNode(ARM64::MOVZXi, DL, PtrVT, HiVar,
-                                   DAG.getTargetConstant(16, MVT::i32)),
-                0);
-    DTPOff = SDValue(DAG.getMachineNode(ARM64::MOVKXi, DL, PtrVT, DTPOff, LoVar,
-                                        DAG.getTargetConstant(0, MVT::i32)),
-                     0);
-
-    TPOff = DAG.getNode(ISD::ADD, DL, PtrVT, TPOff, DTPOff);
-  } else if (Model == TLSModel::GeneralDynamic) {
-    // Accesses used in this sequence go via the TLS descriptor which lives in
-    // the GOT. Prepare an address we can use to handle this.
-    SDValue HiDesc = DAG.getTargetGlobalAddress(
-        GV, DL, PtrVT, 0, ARM64II::MO_TLS | ARM64II::MO_PAGE);
-    SDValue LoDesc = DAG.getTargetGlobalAddress(
-        GV, DL, PtrVT, 0,
-        ARM64II::MO_TLS | ARM64II::MO_PAGEOFF | ARM64II::MO_NC);
-
-    // First argument to the descriptor call is the address of the descriptor
-    // itself.
-    SDValue DescAddr = DAG.getNode(ARM64ISD::ADRP, DL, PtrVT, HiDesc);
-    DescAddr = DAG.getNode(ARM64ISD::ADDlow, DL, PtrVT, DescAddr, LoDesc);
-
-    // The call needs a relocation too for linker relaxation. It doesn't make
-    // sense to call it MO_PAGE or MO_PAGEOFF though so we need another copy of
-    // the address.
-    SDValue SymAddr =
-        DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, ARM64II::MO_TLS);
-
-    // Finally we can make a call to calculate the offset from tpidr_el0.
-    TPOff = LowerELFTLSDescCall(SymAddr, DescAddr, DL, DAG);
-  } else
-    llvm_unreachable("Unsupported ELF TLS access model");
-
-  return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
-}
-
-SDValue ARM64TargetLowering::LowerGlobalTLSAddress(SDValue Op,
-                                                   SelectionDAG &DAG) const {
-  if (Subtarget->isTargetDarwin())
-    return LowerDarwinGlobalTLSAddress(Op, DAG);
-  else if (Subtarget->isTargetELF())
-    return LowerELFGlobalTLSAddress(Op, DAG);
-
-  llvm_unreachable("Unexpected platform trying to use TLS");
-}
-SDValue ARM64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
-  SDValue Chain = Op.getOperand(0);
-  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
-  SDValue LHS = Op.getOperand(2);
-  SDValue RHS = Op.getOperand(3);
-  SDValue Dest = Op.getOperand(4);
-  SDLoc dl(Op);
-
-  // Handle f128 first, since lowering it will result in comparing the return
-  // value of a libcall against zero, which is just what the rest of LowerBR_CC
-  // is expecting to deal with.
-  if (LHS.getValueType() == MVT::f128) {
-    softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl);
-
-    // If softenSetCCOperands returned a scalar, we need to compare the result
-    // against zero to select between true and false values.
-    if (RHS.getNode() == 0) {
-      RHS = DAG.getConstant(0, LHS.getValueType());
-      CC = ISD::SETNE;
-    }
-  }
-
-  // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
-  // instruction.
-  unsigned Opc = LHS.getOpcode();
-  if (LHS.getResNo() == 1 && isa<ConstantSDNode>(RHS) &&
-      cast<ConstantSDNode>(RHS)->isOne() &&
-      (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
-       Opc == ISD::USUBO || Opc == ISD::SMULO || Opc == ISD::UMULO)) {
-    assert((CC == ISD::SETEQ || CC == ISD::SETNE) &&
-           "Unexpected condition code.");
-    // Only lower legal XALUO ops.
-    if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0)))
-      return SDValue();
-
-    // The actual operation with overflow check.
-    ARM64CC::CondCode OFCC;
-    SDValue Value, Overflow;
-    std::tie(Value, Overflow) = getARM64XALUOOp(OFCC, LHS.getValue(0), DAG);
-
-    if (CC == ISD::SETNE)
-      OFCC = getInvertedCondCode(OFCC);
-    SDValue CCVal = DAG.getConstant(OFCC, MVT::i32);
-
-    return DAG.getNode(ARM64ISD::BRCOND, SDLoc(LHS), MVT::Other, Chain, Dest,
-                       CCVal, Overflow);
-  }
-
-  if (LHS.getValueType().isInteger()) {
-    assert((LHS.getValueType() == RHS.getValueType()) &&
-           (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
-
-    // If the RHS of the comparison is zero, we can potentially fold this
-    // to a specialized branch.
-    const ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
-    if (RHSC && RHSC->getZExtValue() == 0) {
-      if (CC == ISD::SETEQ) {
-        // See if we can use a TBZ to fold in an AND as well.
-        // TBZ has a smaller branch displacement than CBZ.  If the offset is
-        // out of bounds, a late MI-layer pass rewrites branches.
-        // 403.gcc is an example that hits this case.
-        if (LHS.getOpcode() == ISD::AND &&
-            isa<ConstantSDNode>(LHS.getOperand(1)) &&
-            isPowerOf2_64(LHS.getConstantOperandVal(1))) {
-          SDValue Test = LHS.getOperand(0);
-          uint64_t Mask = LHS.getConstantOperandVal(1);
-
-          // TBZ only operates on i64's, but the ext should be free.
-          if (Test.getValueType() == MVT::i32)
-            Test = DAG.getAnyExtOrTrunc(Test, dl, MVT::i64);
-
-          return DAG.getNode(ARM64ISD::TBZ, dl, MVT::Other, Chain, Test,
-                             DAG.getConstant(Log2_64(Mask), MVT::i64), Dest);
-        }
-
-        return DAG.getNode(ARM64ISD::CBZ, dl, MVT::Other, Chain, LHS, Dest);
-      } else if (CC == ISD::SETNE) {
-        // See if we can use a TBZ to fold in an AND as well.
-        // TBZ has a smaller branch displacement than CBZ.  If the offset is
-        // out of bounds, a late MI-layer pass rewrites branches.
-        // 403.gcc is an example that hits this case.
-        if (LHS.getOpcode() == ISD::AND &&
-            isa<ConstantSDNode>(LHS.getOperand(1)) &&
-            isPowerOf2_64(LHS.getConstantOperandVal(1))) {
-          SDValue Test = LHS.getOperand(0);
-          uint64_t Mask = LHS.getConstantOperandVal(1);
-
-          // TBNZ only operates on i64's, but the ext should be free.
-          if (Test.getValueType() == MVT::i32)
-            Test = DAG.getAnyExtOrTrunc(Test, dl, MVT::i64);
-
-          return DAG.getNode(ARM64ISD::TBNZ, dl, MVT::Other, Chain, Test,
-                             DAG.getConstant(Log2_64(Mask), MVT::i64), Dest);
-        }
-
-        return DAG.getNode(ARM64ISD::CBNZ, dl, MVT::Other, Chain, LHS, Dest);
-      }
-    }
-
-    SDValue CCVal;
-    SDValue Cmp = getARM64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
-    return DAG.getNode(ARM64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
-                       Cmp);
-  }
-
-  assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
-
-  // Unfortunately, the mapping of LLVM FP CC's onto ARM64 CC's isn't totally
-  // clean.  Some of them require two branches to implement.
-  SDValue Cmp = emitComparison(LHS, RHS, dl, DAG);
-  ARM64CC::CondCode CC1, CC2;
-  changeFPCCToARM64CC(CC, CC1, CC2);
-  SDValue CC1Val = DAG.getConstant(CC1, MVT::i32);
-  SDValue BR1 =
-      DAG.getNode(ARM64ISD::BRCOND, dl, MVT::Other, Chain, Dest, CC1Val, Cmp);
-  if (CC2 != ARM64CC::AL) {
-    SDValue CC2Val = DAG.getConstant(CC2, MVT::i32);
-    return DAG.getNode(ARM64ISD::BRCOND, dl, MVT::Other, BR1, Dest, CC2Val,
-                       Cmp);
-  }
-
-  return BR1;
-}
-
-SDValue ARM64TargetLowering::LowerFCOPYSIGN(SDValue Op,
-                                            SelectionDAG &DAG) const {
-  EVT VT = Op.getValueType();
-  SDLoc DL(Op);
-
-  SDValue In1 = Op.getOperand(0);
-  SDValue In2 = Op.getOperand(1);
-  EVT SrcVT = In2.getValueType();
-  if (SrcVT != VT) {
-    if (SrcVT == MVT::f32 && VT == MVT::f64)
-      In2 = DAG.getNode(ISD::FP_EXTEND, DL, VT, In2);
-    else if (SrcVT == MVT::f64 && VT == MVT::f32)
-      In2 = DAG.getNode(ISD::FP_ROUND, DL, VT, In2, DAG.getIntPtrConstant(0));
-    else
-      // FIXME: Src type is different, bail out for now. Can VT really be a
-      // vector type?
-      return SDValue();
-  }
-
-  EVT VecVT;
-  EVT EltVT;
-  SDValue EltMask, VecVal1, VecVal2;
-  if (VT == MVT::f32 || VT == MVT::v2f32 || VT == MVT::v4f32) {
-    EltVT = MVT::i32;
-    VecVT = MVT::v4i32;
-    EltMask = DAG.getConstant(0x80000000ULL, EltVT);
-
-    if (!VT.isVector()) {
-      VecVal1 = DAG.getTargetInsertSubreg(ARM64::ssub, DL, VecVT,
-                                          DAG.getUNDEF(VecVT), In1);
-      VecVal2 = DAG.getTargetInsertSubreg(ARM64::ssub, DL, VecVT,
-                                          DAG.getUNDEF(VecVT), In2);
-    } else {
-      VecVal1 = DAG.getNode(ISD::BITCAST, DL, VecVT, In1);
-      VecVal2 = DAG.getNode(ISD::BITCAST, DL, VecVT, In2);
-    }
-  } else if (VT == MVT::f64 || VT == MVT::v2f64) {
-    EltVT = MVT::i64;
-    VecVT = MVT::v2i64;
-
-    // We want to materialize a mask with the the high bit set, but the AdvSIMD
-    // immediate moves cannot materialize that in a single instruction for
-    // 64-bit elements. Instead, materialize zero and then negate it.
-    EltMask = DAG.getConstant(0, EltVT);
-
-    if (!VT.isVector()) {
-      VecVal1 = DAG.getTargetInsertSubreg(ARM64::dsub, DL, VecVT,
-                                          DAG.getUNDEF(VecVT), In1);
-      VecVal2 = DAG.getTargetInsertSubreg(ARM64::dsub, DL, VecVT,
-                                          DAG.getUNDEF(VecVT), In2);
-    } else {
-      VecVal1 = DAG.getNode(ISD::BITCAST, DL, VecVT, In1);
-      VecVal2 = DAG.getNode(ISD::BITCAST, DL, VecVT, In2);
-    }
-  } else {
-    llvm_unreachable("Invalid type for copysign!");
-  }
-
-  std::vector<SDValue> BuildVectorOps;
-  for (unsigned i = 0; i < VecVT.getVectorNumElements(); ++i)
-    BuildVectorOps.push_back(EltMask);
-
-  SDValue BuildVec = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT,
-                                 &BuildVectorOps[0], BuildVectorOps.size());
-
-  // If we couldn't materialize the mask above, then the mask vector will be
-  // the zero vector, and we need to negate it here.
-  if (VT == MVT::f64 || VT == MVT::v2f64) {
-    BuildVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2f64, BuildVec);
-    BuildVec = DAG.getNode(ISD::FNEG, DL, MVT::v2f64, BuildVec);
-    BuildVec = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, BuildVec);
-  }
-
-  SDValue Sel =
-      DAG.getNode(ARM64ISD::BIT, DL, VecVT, VecVal1, VecVal2, BuildVec);
-
-  if (VT == MVT::f32)
-    return DAG.getTargetExtractSubreg(ARM64::ssub, DL, VT, Sel);
-  else if (VT == MVT::f64)
-    return DAG.getTargetExtractSubreg(ARM64::dsub, DL, VT, Sel);
-  else
-    return DAG.getNode(ISD::BITCAST, DL, VT, Sel);
-}
-
-SDValue ARM64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const {
-  if (DAG.getMachineFunction().getFunction()->getAttributes().hasAttribute(
-          AttributeSet::FunctionIndex, Attribute::NoImplicitFloat))
-    return SDValue();
-
-  // While there is no integer popcount instruction, it can
-  // be more efficiently lowered to the following sequence that uses
-  // AdvSIMD registers/instructions as long as the copies to/from
-  // the AdvSIMD registers are cheap.
-  //  FMOV    D0, X0        // copy 64-bit int to vector, high bits zero'd
-  //  CNT     V0.8B, V0.8B  // 8xbyte pop-counts
-  //  ADDV    B0, V0.8B     // sum 8xbyte pop-counts
-  //  UMOV    X0, V0.B[0]   // copy byte result back to integer reg
-  SDValue Val = Op.getOperand(0);
-  SDLoc DL(Op);
-  EVT VT = Op.getValueType();
-  SDValue ZeroVec = DAG.getUNDEF(MVT::v8i8);
-
-  SDValue VecVal;
-  if (VT == MVT::i32) {
-    VecVal = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Val);
-    VecVal =
-        DAG.getTargetInsertSubreg(ARM64::ssub, DL, MVT::v8i8, ZeroVec, VecVal);
-  } else {
-    VecVal = DAG.getNode(ISD::BITCAST, DL, MVT::v8i8, Val);
-  }
-
-  SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v8i8, VecVal);
-  SDValue UaddLV = DAG.getNode(
-      ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32,
-      DAG.getConstant(Intrinsic::arm64_neon_uaddlv, MVT::i32), CtPop);
-
-  if (VT == MVT::i64)
-    UaddLV = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, UaddLV);
-  return UaddLV;
-}
-
-SDValue ARM64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
-
-  if (Op.getValueType().isVector())
-    return LowerVSETCC(Op, DAG);
-
-  SDValue LHS = Op.getOperand(0);
-  SDValue RHS = Op.getOperand(1);
-  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
-  SDLoc dl(Op);
-
-  // We chose ZeroOrOneBooleanContents, so use zero and one.
-  EVT VT = Op.getValueType();
-  SDValue TVal = DAG.getConstant(1, VT);
-  SDValue FVal = DAG.getConstant(0, VT);
-
-  // Handle f128 first, since one possible outcome is a normal integer
-  // comparison which gets picked up by the next if statement.
-  if (LHS.getValueType() == MVT::f128) {
-    softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl);
-
-    // If softenSetCCOperands returned a scalar, use it.
-    if (RHS.getNode() == 0) {
-      assert(LHS.getValueType() == Op.getValueType() &&
-             "Unexpected setcc expansion!");
-      return LHS;
-    }
-  }
-
-  if (LHS.getValueType().isInteger()) {
-    SDValue CCVal;
-    SDValue Cmp =
-        getARM64Cmp(LHS, RHS, ISD::getSetCCInverse(CC, true), CCVal, DAG, dl);
-
-    // Note that we inverted the condition above, so we reverse the order of
-    // the true and false operands here.  This will allow the setcc to be
-    // matched to a single CSINC instruction.
-    return DAG.getNode(ARM64ISD::CSEL, dl, VT, FVal, TVal, CCVal, Cmp);
-  }
-
-  // Now we know we're dealing with FP values.
-  assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
-
-  // If that fails, we'll need to perform an FCMP + CSEL sequence.  Go ahead
-  // and do the comparison.
-  SDValue Cmp = emitComparison(LHS, RHS, dl, DAG);
-
-  ARM64CC::CondCode CC1, CC2;
-  changeFPCCToARM64CC(CC, CC1, CC2);
-  if (CC2 == ARM64CC::AL) {
-    changeFPCCToARM64CC(ISD::getSetCCInverse(CC, false), CC1, CC2);
-    SDValue CC1Val = DAG.getConstant(CC1, MVT::i32);
-
-    // Note that we inverted the condition above, so we reverse the order of
-    // the true and false operands here.  This will allow the setcc to be
-    // matched to a single CSINC instruction.
-    return DAG.getNode(ARM64ISD::CSEL, dl, VT, FVal, TVal, CC1Val, Cmp);
-  } else {
-    // Unfortunately, the mapping of LLVM FP CC's onto ARM64 CC's isn't totally
-    // clean.  Some of them require two CSELs to implement.  As is in this case,
-    // we emit the first CSEL and then emit a second using the output of the
-    // first as the RHS.  We're effectively OR'ing the two CC's together.
-
-    // FIXME: It would be nice if we could match the two CSELs to two CSINCs.
-    SDValue CC1Val = DAG.getConstant(CC1, MVT::i32);
-    SDValue CS1 = DAG.getNode(ARM64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);
-
-    SDValue CC2Val = DAG.getConstant(CC2, MVT::i32);
-    return DAG.getNode(ARM64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
-  }
-}
-
-/// A SELECT_CC operation is really some kind of max or min if both values being
-/// compared are, in some sense, equal to the results in either case. However,
-/// it is permissible to compare f32 values and produce directly extended f64
-/// values.
-///
-/// Extending the comparison operands would also be allowed, but is less likely
-/// to happen in practice since their use is right here. Note that truncate
-/// operations would *not* be semantically equivalent.
-static bool selectCCOpsAreFMaxCompatible(SDValue Cmp, SDValue Result) {
-  if (Cmp == Result)
-    return true;
-
-  ConstantFPSDNode *CCmp = dyn_cast<ConstantFPSDNode>(Cmp);
-  ConstantFPSDNode *CResult = dyn_cast<ConstantFPSDNode>(Result);
-  if (CCmp && CResult && Cmp.getValueType() == MVT::f32 &&
-      Result.getValueType() == MVT::f64) {
-    bool Lossy;
-    APFloat CmpVal = CCmp->getValueAPF();
-    CmpVal.convert(APFloat::IEEEdouble, APFloat::rmNearestTiesToEven, &Lossy);
-    return CResult->getValueAPF().bitwiseIsEqual(CmpVal);
-  }
-
-  return Result->getOpcode() == ISD::FP_EXTEND && Result->getOperand(0) == Cmp;
-}
-
-SDValue ARM64TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
-  SDValue CC = Op->getOperand(0);
-  SDValue TVal = Op->getOperand(1);
-  SDValue FVal = Op->getOperand(2);
-  SDLoc DL(Op);
-
-  unsigned Opc = CC.getOpcode();
-  // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select
-  // instruction.
-  if (CC.getResNo() == 1 &&
-      (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
-       Opc == ISD::USUBO || Opc == ISD::SMULO || Opc == ISD::UMULO)) {
-    // Only lower legal XALUO ops.
-    if (!DAG.getTargetLoweringInfo().isTypeLegal(CC->getValueType(0)))
-      return SDValue();
-
-    ARM64CC::CondCode OFCC;
-    SDValue Value, Overflow;
-    std::tie(Value, Overflow) = getARM64XALUOOp(OFCC, CC.getValue(0), DAG);
-    SDValue CCVal = DAG.getConstant(OFCC, MVT::i32);
-
-    return DAG.getNode(ARM64ISD::CSEL, DL, Op.getValueType(), TVal, FVal, CCVal,
-                       Overflow);
-  }
-
-  if (CC.getOpcode() == ISD::SETCC)
-    return DAG.getSelectCC(DL, CC.getOperand(0), CC.getOperand(1), TVal, FVal,
-                           cast<CondCodeSDNode>(CC.getOperand(2))->get());
-  else
-    return DAG.getSelectCC(DL, CC, DAG.getConstant(0, CC.getValueType()), TVal,
-                           FVal, ISD::SETNE);
-}
-
-SDValue ARM64TargetLowering::LowerSELECT_CC(SDValue Op,
-                                            SelectionDAG &DAG) const {
-  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
-  SDValue LHS = Op.getOperand(0);
-  SDValue RHS = Op.getOperand(1);
-  SDValue TVal = Op.getOperand(2);
-  SDValue FVal = Op.getOperand(3);
-  SDLoc dl(Op);
-
-  // Handle f128 first, because it will result in a comparison of some RTLIB
-  // call result against zero.
-  if (LHS.getValueType() == MVT::f128) {
-    softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl);
-
-    // If softenSetCCOperands returned a scalar, we need to compare the result
-    // against zero to select between true and false values.
-    if (RHS.getNode() == 0) {
-      RHS = DAG.getConstant(0, LHS.getValueType());
-      CC = ISD::SETNE;
-    }
-  }
-
-  // Handle integers first.
-  if (LHS.getValueType().isInteger()) {
-    assert((LHS.getValueType() == RHS.getValueType()) &&
-           (LHS.getValueType() == MVT::i32 || LHS.getValueType() == MVT::i64));
-
-    unsigned Opcode = ARM64ISD::CSEL;
-
-    // If both the TVal and the FVal are constants, see if we can swap them in
-    // order to for a CSINV or CSINC out of them.
-    ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FVal);
-    ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TVal);
-
-    if (CTVal && CFVal && CTVal->isAllOnesValue() && CFVal->isNullValue()) {
-      std::swap(TVal, FVal);
-      std::swap(CTVal, CFVal);
-      CC = ISD::getSetCCInverse(CC, true);
-    } else if (CTVal && CFVal && CTVal->isOne() && CFVal->isNullValue()) {
-      std::swap(TVal, FVal);
-      std::swap(CTVal, CFVal);
-      CC = ISD::getSetCCInverse(CC, true);
-    } else if (TVal.getOpcode() == ISD::XOR) {
-      // If TVal is a NOT we want to swap TVal and FVal so that we can match
-      // with a CSINV rather than a CSEL.
-      ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(TVal.getOperand(1));
-
-      if (CVal && CVal->isAllOnesValue()) {
-        std::swap(TVal, FVal);
-        std::swap(CTVal, CFVal);
-        CC = ISD::getSetCCInverse(CC, true);
-      }
-    } else if (TVal.getOpcode() == ISD::SUB) {
-      // If TVal is a negation (SUB from 0) we want to swap TVal and FVal so
-      // that we can match with a CSNEG rather than a CSEL.
-      ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(TVal.getOperand(0));
-
-      if (CVal && CVal->isNullValue()) {
-        std::swap(TVal, FVal);
-        std::swap(CTVal, CFVal);
-        CC = ISD::getSetCCInverse(CC, true);
-      }
-    } else if (CTVal && CFVal) {
-      const int64_t TrueVal = CTVal->getSExtValue();
-      const int64_t FalseVal = CFVal->getSExtValue();
-      bool Swap = false;
-
-      // If both TVal and FVal are constants, see if FVal is the
-      // inverse/negation/increment of TVal and generate a CSINV/CSNEG/CSINC
-      // instead of a CSEL in that case.
-      if (TrueVal == ~FalseVal) {
-        Opcode = ARM64ISD::CSINV;
-      } else if (TrueVal == -FalseVal) {
-        Opcode = ARM64ISD::CSNEG;
-      } else if (TVal.getValueType() == MVT::i32) {
-        // If our operands are only 32-bit wide, make sure we use 32-bit
-        // arithmetic for the check whether we can use CSINC. This ensures that
-        // the addition in the check will wrap around properly in case there is
-        // an overflow (which would not be the case if we do the check with
-        // 64-bit arithmetic).
-        const uint32_t TrueVal32 = CTVal->getZExtValue();
-        const uint32_t FalseVal32 = CFVal->getZExtValue();
-
-        if ((TrueVal32 == FalseVal32 + 1) || (TrueVal32 + 1 == FalseVal32)) {
-          Opcode = ARM64ISD::CSINC;
-
-          if (TrueVal32 > FalseVal32) {
-            Swap = true;
-          }
-        }
-        // 64-bit check whether we can use CSINC.
-      } else if ((TrueVal == FalseVal + 1) || (TrueVal + 1 == FalseVal)) {
-        Opcode = ARM64ISD::CSINC;
-
-        if (TrueVal > FalseVal) {
-          Swap = true;
-        }
-      }
-
-      // Swap TVal and FVal if necessary.
-      if (Swap) {
-        std::swap(TVal, FVal);
-        std::swap(CTVal, CFVal);
-        CC = ISD::getSetCCInverse(CC, true);
-      }
-
-      if (Opcode != ARM64ISD::CSEL) {
-        // Drop FVal since we can get its value by simply inverting/negating
-        // TVal.
-        FVal = TVal;
-      }
-    }
-
-    SDValue CCVal;
-    SDValue Cmp = getARM64Cmp(LHS, RHS, CC, CCVal, DAG, dl);
-
-    EVT VT = Op.getValueType();
-    return DAG.getNode(Opcode, dl, VT, TVal, FVal, CCVal, Cmp);
-  }
-
-  // Now we know we're dealing with FP values.
-  assert(LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
-  assert(LHS.getValueType() == RHS.getValueType());
-  EVT VT = Op.getValueType();
-
-  // Try to match this select into a max/min operation, which have dedicated
-  // opcode in the instruction set.
-  // NOTE: This is not correct in the presence of NaNs, so we only enable this
-  // in no-NaNs mode.
-  if (getTargetMachine().Options.NoNaNsFPMath) {
-    if (selectCCOpsAreFMaxCompatible(LHS, FVal) &&
-        selectCCOpsAreFMaxCompatible(RHS, TVal)) {
-      CC = ISD::getSetCCSwappedOperands(CC);
-      std::swap(TVal, FVal);
-    }
-
-    if (selectCCOpsAreFMaxCompatible(LHS, TVal) &&
-        selectCCOpsAreFMaxCompatible(RHS, FVal)) {
-      switch (CC) {
-      default:
-        break;
-      case ISD::SETGT:
-      case ISD::SETGE:
-      case ISD::SETUGT:
-      case ISD::SETUGE:
-      case ISD::SETOGT:
-      case ISD::SETOGE:
-        return DAG.getNode(ARM64ISD::FMAX, dl, VT, TVal, FVal);
-        break;
-      case ISD::SETLT:
-      case ISD::SETLE:
-      case ISD::SETULT:
-      case ISD::SETULE:
-      case ISD::SETOLT:
-      case ISD::SETOLE:
-        return DAG.getNode(ARM64ISD::FMIN, dl, VT, TVal, FVal);
-        break;
-      }
-    }
-  }
-
-  // If that fails, we'll need to perform an FCMP + CSEL sequence.  Go ahead
-  // and do the comparison.
-  SDValue Cmp = emitComparison(LHS, RHS, dl, DAG);
-
-  // Unfortunately, the mapping of LLVM FP CC's onto ARM64 CC's isn't totally
-  // clean.  Some of them require two CSELs to implement.
-  ARM64CC::CondCode CC1, CC2;
-  changeFPCCToARM64CC(CC, CC1, CC2);
-  SDValue CC1Val = DAG.getConstant(CC1, MVT::i32);
-  SDValue CS1 = DAG.getNode(ARM64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);
-
-  // If we need a second CSEL, emit it, using the output of the first as the
-  // RHS.  We're effectively OR'ing the two CC's together.
-  if (CC2 != ARM64CC::AL) {
-    SDValue CC2Val = DAG.getConstant(CC2, MVT::i32);
-    return DAG.getNode(ARM64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
-  }
-
-  // Otherwise, return the output of the first CSEL.
-  return CS1;
-}
-
-SDValue ARM64TargetLowering::LowerJumpTable(SDValue Op,
-                                            SelectionDAG &DAG) const {
-  // Jump table entries as PC relative offsets. No additional tweaking
-  // is necessary here. Just get the address of the jump table.
-  JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
-  EVT PtrVT = getPointerTy();
-  SDLoc DL(Op);
-
-  SDValue Hi = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, ARM64II::MO_PAGE);
-  SDValue Lo = DAG.getTargetJumpTable(JT->getIndex(), PtrVT,
-                                      ARM64II::MO_PAGEOFF | ARM64II::MO_NC);
-  SDValue ADRP = DAG.getNode(ARM64ISD::ADRP, DL, PtrVT, Hi);
-  return DAG.getNode(ARM64ISD::ADDlow, DL, PtrVT, ADRP, Lo);
-}
-
-SDValue ARM64TargetLowering::LowerConstantPool(SDValue Op,
-                                               SelectionDAG &DAG) const {
-  ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
-  EVT PtrVT = getPointerTy();
-  SDLoc DL(Op);
-
-  if (getTargetMachine().getCodeModel() == CodeModel::Large) {
-    // Use the GOT for the large code model on iOS.
-    if (Subtarget->isTargetMachO()) {
-      SDValue GotAddr = DAG.getTargetConstantPool(
-          CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(),
-          ARM64II::MO_GOT);
-      return DAG.getNode(ARM64ISD::LOADgot, DL, PtrVT, GotAddr);
-    }
-
-    const unsigned char MO_NC = ARM64II::MO_NC;
-    return DAG.getNode(
-        ARM64ISD::WrapperLarge, DL, PtrVT,
-        DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(),
-                                  CP->getOffset(), ARM64II::MO_G3),
-        DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(),
-                                  CP->getOffset(), ARM64II::MO_G2 | MO_NC),
-        DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(),
-                                  CP->getOffset(), ARM64II::MO_G1 | MO_NC),
-        DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(),
-                                  CP->getOffset(), ARM64II::MO_G0 | MO_NC));
-  } else {
-    // Use ADRP/ADD or ADRP/LDR for everything else: the small memory model on
-    // ELF, the only valid one on Darwin.
-    SDValue Hi =
-        DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlignment(),
-                                  CP->getOffset(), ARM64II::MO_PAGE);
-    SDValue Lo = DAG.getTargetConstantPool(
-        CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(),
-        ARM64II::MO_PAGEOFF | ARM64II::MO_NC);
-
-    SDValue ADRP = DAG.getNode(ARM64ISD::ADRP, DL, PtrVT, Hi);
-    return DAG.getNode(ARM64ISD::ADDlow, DL, PtrVT, ADRP, Lo);
-  }
-}
-
-SDValue ARM64TargetLowering::LowerBlockAddress(SDValue Op,
-                                               SelectionDAG &DAG) const {
-  const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
-  EVT PtrVT = getPointerTy();
-  SDLoc DL(Op);
-  if (getTargetMachine().getCodeModel() == CodeModel::Large &&
-      !Subtarget->isTargetMachO()) {
-    const unsigned char MO_NC = ARM64II::MO_NC;
-    return DAG.getNode(
-        ARM64ISD::WrapperLarge, DL, PtrVT,
-        DAG.getTargetBlockAddress(BA, PtrVT, 0, ARM64II::MO_G3),
-        DAG.getTargetBlockAddress(BA, PtrVT, 0, ARM64II::MO_G2 | MO_NC),
-        DAG.getTargetBlockAddress(BA, PtrVT, 0, ARM64II::MO_G1 | MO_NC),
-        DAG.getTargetBlockAddress(BA, PtrVT, 0, ARM64II::MO_G0 | MO_NC));
-  } else {
-    SDValue Hi = DAG.getTargetBlockAddress(BA, PtrVT, 0, ARM64II::MO_PAGE);
-    SDValue Lo = DAG.getTargetBlockAddress(BA, PtrVT, 0, ARM64II::MO_PAGEOFF |
-                                                             ARM64II::MO_NC);
-    SDValue ADRP = DAG.getNode(ARM64ISD::ADRP, DL, PtrVT, Hi);
-    return DAG.getNode(ARM64ISD::ADDlow, DL, PtrVT, ADRP, Lo);
-  }
-}
-
-SDValue ARM64TargetLowering::LowerDarwin_VASTART(SDValue Op,
-                                                 SelectionDAG &DAG) const {
-  ARM64FunctionInfo *FuncInfo =
-      DAG.getMachineFunction().getInfo<ARM64FunctionInfo>();
-
-  SDLoc DL(Op);
-  SDValue FR =
-      DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), getPointerTy());
-  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
-  return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
-                      MachinePointerInfo(SV), false, false, 0);
-}
-
-SDValue ARM64TargetLowering::LowerAAPCS_VASTART(SDValue Op,
-                                                SelectionDAG &DAG) const {
-  // The layout of the va_list struct is specified in the AArch64 Procedure Call
-  // Standard, section B.3.
-  MachineFunction &MF = DAG.getMachineFunction();
-  ARM64FunctionInfo *FuncInfo = MF.getInfo<ARM64FunctionInfo>();
-  SDLoc DL(Op);
-
-  SDValue Chain = Op.getOperand(0);
-  SDValue VAList = Op.getOperand(1);
-  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
-  SmallVector<SDValue, 4> MemOps;
-
-  // void *__stack at offset 0
-  SDValue Stack =
-      DAG.getFrameIndex(FuncInfo->getVarArgsStackIndex(), getPointerTy());
-  MemOps.push_back(DAG.getStore(Chain, DL, Stack, VAList,
-                                MachinePointerInfo(SV), false, false, 8));
-
-  // void *__gr_top at offset 8
-  int GPRSize = FuncInfo->getVarArgsGPRSize();
-  if (GPRSize > 0) {
-    SDValue GRTop, GRTopAddr;
-
-    GRTopAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList,
-                            DAG.getConstant(8, getPointerTy()));
-
-    GRTop = DAG.getFrameIndex(FuncInfo->getVarArgsGPRIndex(), getPointerTy());
-    GRTop = DAG.getNode(ISD::ADD, DL, getPointerTy(), GRTop,
-                        DAG.getConstant(GPRSize, getPointerTy()));
-
-    MemOps.push_back(DAG.getStore(Chain, DL, GRTop, GRTopAddr,
-                                  MachinePointerInfo(SV, 8), false, false, 8));
-  }
-
-  // void *__vr_top at offset 16
-  int FPRSize = FuncInfo->getVarArgsFPRSize();
-  if (FPRSize > 0) {
-    SDValue VRTop, VRTopAddr;
-    VRTopAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList,
-                            DAG.getConstant(16, getPointerTy()));
-
-    VRTop = DAG.getFrameIndex(FuncInfo->getVarArgsFPRIndex(), getPointerTy());
-    VRTop = DAG.getNode(ISD::ADD, DL, getPointerTy(), VRTop,
-                        DAG.getConstant(FPRSize, getPointerTy()));
-
-    MemOps.push_back(DAG.getStore(Chain, DL, VRTop, VRTopAddr,
-                                  MachinePointerInfo(SV, 16), false, false, 8));
-  }
-
-  // int __gr_offs at offset 24
-  SDValue GROffsAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList,
-                                   DAG.getConstant(24, getPointerTy()));
-  MemOps.push_back(DAG.getStore(Chain, DL, DAG.getConstant(-GPRSize, MVT::i32),
-                                GROffsAddr, MachinePointerInfo(SV, 24), false,
-                                false, 4));
-
-  // int __vr_offs at offset 28
-  SDValue VROffsAddr = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList,
-                                   DAG.getConstant(28, getPointerTy()));
-  MemOps.push_back(DAG.getStore(Chain, DL, DAG.getConstant(-FPRSize, MVT::i32),
-                                VROffsAddr, MachinePointerInfo(SV, 28), false,
-                                false, 4));
-
-  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, &MemOps[0],
-                     MemOps.size());
-}
-
-SDValue ARM64TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
-  return Subtarget->isTargetDarwin() ? LowerDarwin_VASTART(Op, DAG)
-                                     : LowerAAPCS_VASTART(Op, DAG);
-}
-
-SDValue ARM64TargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const {
-  // AAPCS has three pointers and two ints (= 32 bytes), Darwin has single
-  // pointer.
-  unsigned VaListSize = Subtarget->isTargetDarwin() ? 8 : 32;
-  const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
-  const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
-
-  return DAG.getMemcpy(Op.getOperand(0), SDLoc(Op), Op.getOperand(1),
-                       Op.getOperand(2), DAG.getConstant(VaListSize, MVT::i32),
-                       8, false, false, MachinePointerInfo(DestSV),
-                       MachinePointerInfo(SrcSV));
-}
-
-SDValue ARM64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
-  assert(Subtarget->isTargetDarwin() &&
-         "automatic va_arg instruction only works on Darwin");
-
-  const Value *V = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
-  EVT VT = Op.getValueType();
-  SDLoc DL(Op);
-  SDValue Chain = Op.getOperand(0);
-  SDValue Addr = Op.getOperand(1);
-  unsigned Align = Op.getConstantOperandVal(3);
-
-  SDValue VAList = DAG.getLoad(getPointerTy(), DL, Chain, Addr,
-                               MachinePointerInfo(V), false, false, false, 0);
-  Chain = VAList.getValue(1);
-
-  if (Align > 8) {
-    assert(((Align & (Align - 1)) == 0) && "Expected Align to be a power of 2");
-    VAList = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList,
-                         DAG.getConstant(Align - 1, getPointerTy()));
-    VAList = DAG.getNode(ISD::AND, DL, getPointerTy(), VAList,
-                         DAG.getConstant(-(int64_t)Align, getPointerTy()));
-  }
-
-  Type *ArgTy = VT.getTypeForEVT(*DAG.getContext());
-  uint64_t ArgSize = getDataLayout()->getTypeAllocSize(ArgTy);
-
-  // Scalar integer and FP values smaller than 64 bits are implicitly extended
-  // up to 64 bits.  At the very least, we have to increase the striding of the
-  // vaargs list to match this, and for FP values we need to introduce
-  // FP_ROUND nodes as well.
-  if (VT.isInteger() && !VT.isVector())
-    ArgSize = 8;
-  bool NeedFPTrunc = false;
-  if (VT.isFloatingPoint() && !VT.isVector() && VT != MVT::f64) {
-    ArgSize = 8;
-    NeedFPTrunc = true;
-  }
-
-  // Increment the pointer, VAList, to the next vaarg
-  SDValue VANext = DAG.getNode(ISD::ADD, DL, getPointerTy(), VAList,
-                               DAG.getConstant(ArgSize, getPointerTy()));
-  // Store the incremented VAList to the legalized pointer
-  SDValue APStore = DAG.getStore(Chain, DL, VANext, Addr, MachinePointerInfo(V),
-                                 false, false, 0);
-
-  // Load the actual argument out of the pointer VAList
-  if (NeedFPTrunc) {
-    // Load the value as an f64.
-    SDValue WideFP = DAG.getLoad(MVT::f64, DL, APStore, VAList,
-                                 MachinePointerInfo(), false, false, false, 0);
-    // Round the value down to an f32.
-    SDValue NarrowFP = DAG.getNode(ISD::FP_ROUND, DL, VT, WideFP.getValue(0),
-                                   DAG.getIntPtrConstant(1));
-    SDValue Ops[] = { NarrowFP, WideFP.getValue(1) };
-    // Merge the rounded value with the chain output of the load.
-    return DAG.getMergeValues(Ops, 2, DL);
-  }
-
-  return DAG.getLoad(VT, DL, APStore, VAList, MachinePointerInfo(), false,
-                     false, false, 0);
-}
-
-SDValue ARM64TargetLowering::LowerFRAMEADDR(SDValue Op,
-                                            SelectionDAG &DAG) const {
-  MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
-  MFI->setFrameAddressIsTaken(true);
-
-  EVT VT = Op.getValueType();
-  SDLoc DL(Op);
-  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
-  SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), DL, ARM64::FP, VT);
-  while (Depth--)
-    FrameAddr = DAG.getLoad(VT, DL, DAG.getEntryNode(), FrameAddr,
-                            MachinePointerInfo(), false, false, false, 0);
-  return FrameAddr;
-}
-
-SDValue ARM64TargetLowering::LowerRETURNADDR(SDValue Op,
-                                             SelectionDAG &DAG) const {
-  MachineFunction &MF = DAG.getMachineFunction();
-  MachineFrameInfo *MFI = MF.getFrameInfo();
-  MFI->setReturnAddressIsTaken(true);
-
-  EVT VT = Op.getValueType();
-  SDLoc DL(Op);
-  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
-  if (Depth) {
-    SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
-    SDValue Offset = DAG.getConstant(8, getPointerTy());
-    return DAG.getLoad(VT, DL, DAG.getEntryNode(),
-                       DAG.getNode(ISD::ADD, DL, VT, FrameAddr, Offset),
-                       MachinePointerInfo(), false, false, false, 0);
-  }
-
-  // Return LR, which contains the return address. Mark it an implicit live-in.
-  unsigned Reg = MF.addLiveIn(ARM64::LR, &ARM64::GPR64RegClass);
-  return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
-}
-
-/// LowerShiftRightParts - Lower SRA_PARTS, which returns two
-/// i64 values and take a 2 x i64 value to shift plus a shift amount.
-SDValue ARM64TargetLowering::LowerShiftRightParts(SDValue Op,
-                                                  SelectionDAG &DAG) const {
-  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
-  EVT VT = Op.getValueType();
-  unsigned VTBits = VT.getSizeInBits();
-  SDLoc dl(Op);
-  SDValue ShOpLo = Op.getOperand(0);
-  SDValue ShOpHi = Op.getOperand(1);
-  SDValue ShAmt = Op.getOperand(2);
-  SDValue ARMcc;
-  unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
-
-  assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
-
-  SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64,
-                                 DAG.getConstant(VTBits, MVT::i64), ShAmt);
-  SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
-  SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt,
-                                   DAG.getConstant(VTBits, MVT::i64));
-  SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
-
-  SDValue Cmp =
-      emitComparison(ExtraShAmt, DAG.getConstant(0, MVT::i64), dl, DAG);
-  SDValue CCVal = DAG.getConstant(ARM64CC::GE, MVT::i32);
-
-  SDValue FalseValLo = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
-  SDValue TrueValLo = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
-  SDValue Lo =
-      DAG.getNode(ARM64ISD::CSEL, dl, VT, TrueValLo, FalseValLo, CCVal, Cmp);
-
-  // ARM64 shifts larger than the register width are wrapped rather than
-  // clamped, so we can't just emit "hi >> x".
-  SDValue FalseValHi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
-  SDValue TrueValHi = Opc == ISD::SRA
-                          ? DAG.getNode(Opc, dl, VT, ShOpHi,
-                                        DAG.getConstant(VTBits - 1, MVT::i64))
-                          : DAG.getConstant(0, VT);
-  SDValue Hi =
-      DAG.getNode(ARM64ISD::CSEL, dl, VT, TrueValHi, FalseValHi, CCVal, Cmp);
-
-  SDValue Ops[2] = { Lo, Hi };
-  return DAG.getMergeValues(Ops, 2, dl);
-}
-
-/// LowerShiftLeftParts - Lower SHL_PARTS, which returns two
-/// i64 values and take a 2 x i64 value to shift plus a shift amount.
-SDValue ARM64TargetLowering::LowerShiftLeftParts(SDValue Op,
-                                                 SelectionDAG &DAG) const {
-  assert(Op.getNumOperands() == 3 && "Not a double-shift!");
-  EVT VT = Op.getValueType();
-  unsigned VTBits = VT.getSizeInBits();
-  SDLoc dl(Op);
-  SDValue ShOpLo = Op.getOperand(0);
-  SDValue ShOpHi = Op.getOperand(1);
-  SDValue ShAmt = Op.getOperand(2);
-  SDValue ARMcc;
-
-  assert(Op.getOpcode() == ISD::SHL_PARTS);
-  SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64,
-                                 DAG.getConstant(VTBits, MVT::i64), ShAmt);
-  SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
-  SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i64, ShAmt,
-                                   DAG.getConstant(VTBits, MVT::i64));
-  SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
-  SDValue Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
-
-  SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
-
-  SDValue Cmp =
-      emitComparison(ExtraShAmt, DAG.getConstant(0, MVT::i64), dl, DAG);
-  SDValue CCVal = DAG.getConstant(ARM64CC::GE, MVT::i32);
-  SDValue Hi = DAG.getNode(ARM64ISD::CSEL, dl, VT, Tmp3, FalseVal, CCVal, Cmp);
-
-  // ARM64 shifts of larger than register sizes are wrapped rather than clamped,
-  // so we can't just emit "lo << a" if a is too big.
-  SDValue TrueValLo = DAG.getConstant(0, VT);
-  SDValue FalseValLo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
-  SDValue Lo =
-      DAG.getNode(ARM64ISD::CSEL, dl, VT, TrueValLo, FalseValLo, CCVal, Cmp);
-
-  SDValue Ops[2] = { Lo, Hi };
-  return DAG.getMergeValues(Ops, 2, dl);
-}
-
-bool
-ARM64TargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
-  // The ARM64 target doesn't support folding offsets into global addresses.
-  return false;
-}
-
-bool ARM64TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
-  // We can materialize #0.0 as fmov $Rd, XZR for 64-bit and 32-bit cases.
-  // FIXME: We should be able to handle f128 as well with a clever lowering.
-  if (Imm.isPosZero() && (VT == MVT::f64 || VT == MVT::f32))
-    return true;
-
-  if (VT == MVT::f64)
-    return ARM64_AM::getFP64Imm(Imm) != -1;
-  else if (VT == MVT::f32)
-    return ARM64_AM::getFP32Imm(Imm) != -1;
-  return false;
-}
-
-//===----------------------------------------------------------------------===//
-//                          ARM64 Optimization Hooks
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-//                          ARM64 Inline Assembly Support
-//===----------------------------------------------------------------------===//
-
-// Table of Constraints
-// TODO: This is the current set of constraints supported by ARM for the
-// compiler, not all of them may make sense, e.g. S may be difficult to support.
-//
-// r - A general register
-// w - An FP/SIMD register of some size in the range v0-v31
-// x - An FP/SIMD register of some size in the range v0-v15
-// I - Constant that can be used with an ADD instruction
-// J - Constant that can be used with a SUB instruction
-// K - Constant that can be used with a 32-bit logical instruction
-// L - Constant that can be used with a 64-bit logical instruction
-// M - Constant that can be used as a 32-bit MOV immediate
-// N - Constant that can be used as a 64-bit MOV immediate
-// Q - A memory reference with base register and no offset
-// S - A symbolic address
-// Y - Floating point constant zero
-// Z - Integer constant zero
-//
-//   Note that general register operands will be output using their 64-bit x
-// register name, whatever the size of the variable, unless the asm operand
-// is prefixed by the %w modifier. Floating-point and SIMD register operands
-// will be output with the v prefix unless prefixed by the %b, %h, %s, %d or
-// %q modifier.
-
-/// getConstraintType - Given a constraint letter, return the type of
-/// constraint it is for this target.
-ARM64TargetLowering::ConstraintType
-ARM64TargetLowering::getConstraintType(const std::string &Constraint) const {
-  if (Constraint.size() == 1) {
-    switch (Constraint[0]) {
-    default:
-      break;
-    case 'z':
-      return C_Other;
-    case 'x':
-    case 'w':
-      return C_RegisterClass;
-    // An address with a single base register. Due to the way we
-    // currently handle addresses it is the same as 'r'.
-    case 'Q':
-      return C_Memory;
-    }
-  }
-  return TargetLowering::getConstraintType(Constraint);
-}
-
-/// Examine constraint type and operand type and determine a weight value.
-/// This object must already have been set up with the operand type
-/// and the current alternative constraint selected.
-TargetLowering::ConstraintWeight
-ARM64TargetLowering::getSingleConstraintMatchWeight(
-    AsmOperandInfo &info, const char *constraint) const {
-  ConstraintWeight weight = CW_Invalid;
-  Value *CallOperandVal = info.CallOperandVal;
-  // If we don't have a value, we can't do a match,
-  // but allow it at the lowest weight.
-  if (CallOperandVal == NULL)
-    return CW_Default;
-  Type *type = CallOperandVal->getType();
-  // Look at the constraint type.
-  switch (*constraint) {
-  default:
-    weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
-    break;
-  case 'x':
-  case 'w':
-    if (type->isFloatingPointTy() || type->isVectorTy())
-      weight = CW_Register;
-    break;
-  case 'z':
-    weight = CW_Constant;
-    break;
-  }
-  return weight;
-}
-
-std::pair<unsigned, const TargetRegisterClass *>
-ARM64TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
-                                                  MVT VT) const {
-  if (Constraint.size() == 1) {
-    switch (Constraint[0]) {
-    case 'r':
-      if (VT.getSizeInBits() == 64)
-        return std::make_pair(0U, &ARM64::GPR64commonRegClass);
-      return std::make_pair(0U, &ARM64::GPR32commonRegClass);
-    case 'w':
-      if (VT == MVT::f32)
-        return std::make_pair(0U, &ARM64::FPR32RegClass);
-      if (VT.getSizeInBits() == 64)
-        return std::make_pair(0U, &ARM64::FPR64RegClass);
-      if (VT.getSizeInBits() == 128)
-        return std::make_pair(0U, &ARM64::FPR128RegClass);
-      break;
-    // The instructions that this constraint is designed for can
-    // only take 128-bit registers so just use that regclass.
-    case 'x':
-      if (VT.getSizeInBits() == 128)
-        return std::make_pair(0U, &ARM64::FPR128_loRegClass);
-      break;
-    }
-  }
-  if (StringRef("{cc}").equals_lower(Constraint))
-    return std::make_pair(unsigned(ARM64::CPSR), &ARM64::CCRRegClass);
-
-  // Use the default implementation in TargetLowering to convert the register
-  // constraint into a member of a register class.
-  std::pair<unsigned, const TargetRegisterClass *> Res;
-  Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
-
-  // Not found as a standard register?
-  if (Res.second == 0) {
-    unsigned Size = Constraint.size();
-    if ((Size == 4 || Size == 5) && Constraint[0] == '{' &&
-        tolower(Constraint[1]) == 'v' && Constraint[Size - 1] == '}') {
-      const std::string Reg =
-          std::string(&Constraint[2], &Constraint[Size - 1]);
-      int RegNo = atoi(Reg.c_str());
-      if (RegNo >= 0 && RegNo <= 31) {
-        // v0 - v31 are aliases of q0 - q31.
-        // By default we'll emit v0-v31 for this unless there's a modifier where
-        // we'll emit the correct register as well.
-        Res.first = ARM64::FPR128RegClass.getRegister(RegNo);
-        Res.second = &ARM64::FPR128RegClass;
-      }
-    }
-  }
-
-  return Res;
-}
-
-/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
-/// vector.  If it is invalid, don't add anything to Ops.
-void ARM64TargetLowering::LowerAsmOperandForConstraint(
-    SDValue Op, std::string &Constraint, std::vector<SDValue> &Ops,
-    SelectionDAG &DAG) const {
-  SDValue Result(0, 0);
-
-  // Currently only support length 1 constraints.
-  if (Constraint.length() != 1)
-    return;
-
-  char ConstraintLetter = Constraint[0];
-  switch (ConstraintLetter) {
-  default:
-    break;
-
-  // This set of constraints deal with valid constants for various instructions.
-  // Validate and return a target constant for them if we can.
-  case 'z': {
-    // 'z' maps to xzr or wzr so it needs an input of 0.
-    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
-    if (!C || C->getZExtValue() != 0)
-      return;
-
-    if (Op.getValueType() == MVT::i64)
-      Result = DAG.getRegister(ARM64::XZR, MVT::i64);
-    else
-      Result = DAG.getRegister(ARM64::WZR, MVT::i32);
-    break;
-  }
-
-  case 'I':
-  case 'J':
-  case 'K':
-  case 'L':
-  case 'M':
-  case 'N':
-    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
-    if (!C)
-      return;
-
-    // Grab the value and do some validation.
-    uint64_t CVal = C->getZExtValue();
-    switch (ConstraintLetter) {
-    // The I constraint applies only to simple ADD or SUB immediate operands:
-    // i.e. 0 to 4095 with optional shift by 12
-    // The J constraint applies only to ADD or SUB immediates that would be
-    // valid when negated, i.e. if [an add pattern] were to be output as a SUB
-    // instruction [or vice versa], in other words -1 to -4095 with optional
-    // left shift by 12.
-    case 'I':
-      if (isUInt<12>(CVal) || isShiftedUInt<12, 12>(CVal))
-        break;
-      return;
-    case 'J': {
-      uint64_t NVal = -C->getSExtValue();
-      if (isUInt<12>(NVal) || isShiftedUInt<12, 12>(NVal))
-        break;
-      return;
-    }
-    // The K and L constraints apply *only* to logical immediates, including
-    // what used to be the MOVI alias for ORR (though the MOVI alias has now
-    // been removed and MOV should be used). So these constraints have to
-    // distinguish between bit patterns that are valid 32-bit or 64-bit
-    // "bitmask immediates": for example 0xaaaaaaaa is a valid bimm32 (K), but
-    // not a valid bimm64 (L) where 0xaaaaaaaaaaaaaaaa would be valid, and vice
-    // versa.
-    case 'K':
-      if (ARM64_AM::isLogicalImmediate(CVal, 32))
-        break;
-      return;
-    case 'L':
-      if (ARM64_AM::isLogicalImmediate(CVal, 64))
-        break;
-      return;
-    // The M and N constraints are a superset of K and L respectively, for use
-    // with the MOV (immediate) alias. As well as the logical immediates they
-    // also match 32 or 64-bit immediates that can be loaded either using a
-    // *single* MOVZ or MOVN , such as 32-bit 0x12340000, 0x00001234, 0xffffedca
-    // (M) or 64-bit 0x1234000000000000 (N) etc.
-    // As a note some of this code is liberally stolen from the asm parser.
-    case 'M': {
-      if (!isUInt<32>(CVal))
-        return;
-      if (ARM64_AM::isLogicalImmediate(CVal, 32))
-        break;
-      if ((CVal & 0xFFFF) == CVal)
-        break;
-      if ((CVal & 0xFFFF0000ULL) == CVal)
-        break;
-      uint64_t NCVal = ~(uint32_t)CVal;
-      if ((NCVal & 0xFFFFULL) == NCVal)
-        break;
-      if ((NCVal & 0xFFFF0000ULL) == NCVal)
-        break;
-      return;
-    }
-    case 'N': {
-      if (ARM64_AM::isLogicalImmediate(CVal, 64))
-        break;
-      if ((CVal & 0xFFFFULL) == CVal)
-        break;
-      if ((CVal & 0xFFFF0000ULL) == CVal)
-        break;
-      if ((CVal & 0xFFFF00000000ULL) == CVal)
-        break;
-      if ((CVal & 0xFFFF000000000000ULL) == CVal)
-        break;
-      uint64_t NCVal = ~CVal;
-      if ((NCVal & 0xFFFFULL) == NCVal)
-        break;
-      if ((NCVal & 0xFFFF0000ULL) == NCVal)
-        break;
-      if ((NCVal & 0xFFFF00000000ULL) == NCVal)
-        break;
-      if ((NCVal & 0xFFFF000000000000ULL) == NCVal)
-        break;
-      return;
-    }
-    default:
-      return;
-    }
-
-    // All assembler immediates are 64-bit integers.
-    Result = DAG.getTargetConstant(CVal, MVT::i64);
-    break;
-  }
-
-  if (Result.getNode()) {
-    Ops.push_back(Result);
-    return;
-  }
-
-  return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
-}
-
-//===----------------------------------------------------------------------===//
-//                     ARM64 Advanced SIMD Support
-//===----------------------------------------------------------------------===//
-
-/// WidenVector - Given a value in the V64 register class, produce the
-/// equivalent value in the V128 register class.
-static SDValue WidenVector(SDValue V64Reg, SelectionDAG &DAG) {
-  EVT VT = V64Reg.getValueType();
-  unsigned NarrowSize = VT.getVectorNumElements();
-  MVT EltTy = VT.getVectorElementType().getSimpleVT();
-  MVT WideTy = MVT::getVectorVT(EltTy, 2 * NarrowSize);
-  SDLoc DL(V64Reg);
-
-  return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideTy, DAG.getUNDEF(WideTy),
-                     V64Reg, DAG.getConstant(0, MVT::i32));
-}
-
-/// getExtFactor - Determine the adjustment factor for the position when
-/// generating an "extract from vector registers" instruction.
-static unsigned getExtFactor(SDValue &V) {
-  EVT EltType = V.getValueType().getVectorElementType();
-  return EltType.getSizeInBits() / 8;
-}
-
-/// NarrowVector - Given a value in the V128 register class, produce the
-/// equivalent value in the V64 register class.
-static SDValue NarrowVector(SDValue V128Reg, SelectionDAG &DAG) {
-  EVT VT = V128Reg.getValueType();
-  unsigned WideSize = VT.getVectorNumElements();
-  MVT EltTy = VT.getVectorElementType().getSimpleVT();
-  MVT NarrowTy = MVT::getVectorVT(EltTy, WideSize / 2);
-  SDLoc DL(V128Reg);
-
-  return DAG.getTargetExtractSubreg(ARM64::dsub, DL, NarrowTy, V128Reg);
-}
-
-// Gather data to see if the operation can be modelled as a
-// shuffle in combination with VEXTs.
-SDValue ARM64TargetLowering::ReconstructShuffle(SDValue Op,
-                                                SelectionDAG &DAG) const {
-  SDLoc dl(Op);
-  EVT VT = Op.getValueType();
-  unsigned NumElts = VT.getVectorNumElements();
-
-  SmallVector<SDValue, 2> SourceVecs;
-  SmallVector<unsigned, 2> MinElts;
-  SmallVector<unsigned, 2> MaxElts;
-
-  for (unsigned i = 0; i < NumElts; ++i) {
-    SDValue V = Op.getOperand(i);
-    if (V.getOpcode() == ISD::UNDEF)
-      continue;
-    else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) {
-      // A shuffle can only come from building a vector from various
-      // elements of other vectors.
-      return SDValue();
-    }
-
-    // Record this extraction against the appropriate vector if possible...
-    SDValue SourceVec = V.getOperand(0);
-    unsigned EltNo = cast<ConstantSDNode>(V.getOperand(1))->getZExtValue();
-    bool FoundSource = false;
-    for (unsigned j = 0; j < SourceVecs.size(); ++j) {
-      if (SourceVecs[j] == SourceVec) {
-        if (MinElts[j] > EltNo)
-          MinElts[j] = EltNo;
-        if (MaxElts[j] < EltNo)
-          MaxElts[j] = EltNo;
-        FoundSource = true;
-        break;
-      }
-    }
-
-    // Or record a new source if not...
-    if (!FoundSource) {
-      SourceVecs.push_back(SourceVec);
-      MinElts.push_back(EltNo);
-      MaxElts.push_back(EltNo);
-    }
-  }
-
-  // Currently only do something sane when at most two source vectors
-  // involved.
-  if (SourceVecs.size() > 2)
-    return SDValue();
-
-  SDValue ShuffleSrcs[2] = { DAG.getUNDEF(VT), DAG.getUNDEF(VT) };
-  int VEXTOffsets[2] = { 0, 0 };
-
-  // This loop extracts the usage patterns of the source vectors
-  // and prepares appropriate SDValues for a shuffle if possible.
-  for (unsigned i = 0; i < SourceVecs.size(); ++i) {
-    if (SourceVecs[i].getValueType() == VT) {
-      // No VEXT necessary
-      ShuffleSrcs[i] = SourceVecs[i];
-      VEXTOffsets[i] = 0;
-      continue;
-    } else if (SourceVecs[i].getValueType().getVectorNumElements() < NumElts) {
-      // It probably isn't worth padding out a smaller vector just to
-      // break it down again in a shuffle.
-      return SDValue();
-    }
-
-    // Don't attempt to extract subvectors from BUILD_VECTOR sources
-    // that expand or trunc the original value.
-    // TODO: We can try to bitcast and ANY_EXTEND the result but
-    // we need to consider the cost of vector ANY_EXTEND, and the
-    // legality of all the types.
-    if (SourceVecs[i].getValueType().getVectorElementType() !=
-        VT.getVectorElementType())
-      return SDValue();
-
-    // Since only 64-bit and 128-bit vectors are legal on ARM and
-    // we've eliminated the other cases...
-    assert(SourceVecs[i].getValueType().getVectorNumElements() == 2 * NumElts &&
-           "unexpected vector sizes in ReconstructShuffle");
-
-    if (MaxElts[i] - MinElts[i] >= NumElts) {
-      // Span too large for a VEXT to cope
-      return SDValue();
-    }
-
-    if (MinElts[i] >= NumElts) {
-      // The extraction can just take the second half
-      VEXTOffsets[i] = NumElts;
-      ShuffleSrcs[i] =
-          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SourceVecs[i],
-                      DAG.getIntPtrConstant(NumElts));
-    } else if (MaxElts[i] < NumElts) {
-      // The extraction can just take the first half
-      VEXTOffsets[i] = 0;
-      ShuffleSrcs[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
-                                   SourceVecs[i], DAG.getIntPtrConstant(0));
-    } else {
-      // An actual VEXT is needed
-      VEXTOffsets[i] = MinElts[i];
-      SDValue VEXTSrc1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
-                                     SourceVecs[i], DAG.getIntPtrConstant(0));
-      SDValue VEXTSrc2 =
-          DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, SourceVecs[i],
-                      DAG.getIntPtrConstant(NumElts));
-      unsigned Imm = VEXTOffsets[i] * getExtFactor(VEXTSrc1);
-      ShuffleSrcs[i] = DAG.getNode(ARM64ISD::EXT, dl, VT, VEXTSrc1, VEXTSrc2,
-                                   DAG.getConstant(Imm, MVT::i32));
-    }
-  }
-
-  SmallVector<int, 8> Mask;
-
-  for (unsigned i = 0; i < NumElts; ++i) {
-    SDValue Entry = Op.getOperand(i);
-    if (Entry.getOpcode() == ISD::UNDEF) {
-      Mask.push_back(-1);
-      continue;
-    }
-
-    SDValue ExtractVec = Entry.getOperand(0);
-    int ExtractElt =
-        cast<ConstantSDNode>(Op.getOperand(i).getOperand(1))->getSExtValue();
-    if (ExtractVec == SourceVecs[0]) {
-      Mask.push_back(ExtractElt - VEXTOffsets[0]);
-    } else {
-      Mask.push_back(ExtractElt + NumElts - VEXTOffsets[1]);
-    }
-  }
-
-  // Final check before we try to produce nonsense...
-  if (isShuffleMaskLegal(Mask, VT))
-    return DAG.getVectorShuffle(VT, dl, ShuffleSrcs[0], ShuffleSrcs[1],
-                                &Mask[0]);
-
-  return SDValue();
-}
-
-// check if an EXT instruction can handle the shuffle mask when the
-// vector sources of the shuffle are the same.
-static bool isSingletonEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
-  unsigned NumElts = VT.getVectorNumElements();
-
-  // Assume that the first shuffle index is not UNDEF.  Fail if it is.
-  if (M[0] < 0)
-    return false;
-
-  Imm = M[0];
-
-  // If this is a VEXT shuffle, the immediate value is the index of the first
-  // element.  The other shuffle indices must be the successive elements after
-  // the first one.
-  unsigned ExpectedElt = Imm;
-  for (unsigned i = 1; i < NumElts; ++i) {
-    // Increment the expected index.  If it wraps around, just follow it
-    // back to index zero and keep going.
-    ++ExpectedElt;
-    if (ExpectedElt == NumElts)
-      ExpectedElt = 0;
-
-    if (M[i] < 0)
-      continue; // ignore UNDEF indices
-    if (ExpectedElt != static_cast<unsigned>(M[i]))
-      return false;
-  }
-
-  return true;
-}
-
-// check if an EXT instruction can handle the shuffle mask when the
-// vector sources of the shuffle are different.
-static bool isEXTMask(ArrayRef<int> M, EVT VT, bool &ReverseEXT,
-                      unsigned &Imm) {
-  unsigned NumElts = VT.getVectorNumElements();
-  ReverseEXT = false;
-
-  // Assume that the first shuffle index is not UNDEF.  Fail if it is.
-  if (M[0] < 0)
-    return false;
-
-  Imm = M[0];
-
-  // If this is a VEXT shuffle, the immediate value is the index of the first
-  // element.  The other shuffle indices must be the successive elements after
-  // the first one.
-  unsigned ExpectedElt = Imm;
-  for (unsigned i = 1; i < NumElts; ++i) {
-    // Increment the expected index.  If it wraps around, it may still be
-    // a VEXT but the source vectors must be swapped.
-    ExpectedElt += 1;
-    if (ExpectedElt == NumElts * 2) {
-      ExpectedElt = 0;
-      ReverseEXT = true;
-    }
-
-    if (M[i] < 0)
-      continue; // ignore UNDEF indices
-    if (ExpectedElt != static_cast<unsigned>(M[i]))
-      return false;
-  }
-
-  // Adjust the index value if the source operands will be swapped.
-  if (ReverseEXT)
-    Imm -= NumElts;
-
-  return true;
-}
-
-/// isREVMask - Check if a vector shuffle corresponds to a REV
-/// instruction with the specified blocksize.  (The order of the elements
-/// within each block of the vector is reversed.)
-static bool isREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) {
-  assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
-         "Only possible block sizes for REV are: 16, 32, 64");
-
-  unsigned EltSz = VT.getVectorElementType().getSizeInBits();
-  if (EltSz == 64)
-    return false;
-
-  unsigned NumElts = VT.getVectorNumElements();
-  unsigned BlockElts = M[0] + 1;
-  // If the first shuffle index is UNDEF, be optimistic.
-  if (M[0] < 0)
-    BlockElts = BlockSize / EltSz;
-
-  if (BlockSize <= EltSz || BlockSize != BlockElts * EltSz)
-    return false;
-
-  for (unsigned i = 0; i < NumElts; ++i) {
-    if (M[i] < 0)
-      continue; // ignore UNDEF indices
-    if ((unsigned)M[i] != (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts))
-      return false;
-  }
-
-  return true;
-}
-
-static bool isZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
-  unsigned NumElts = VT.getVectorNumElements();
-  WhichResult = (M[0] == 0 ? 0 : 1);
-  unsigned Idx = WhichResult * NumElts / 2;
-  for (unsigned i = 0; i != NumElts; i += 2) {
-    if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
-        (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx + NumElts))
-      return false;
-    Idx += 1;
-  }
-
-  return true;
-}
-
-static bool isUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
-  unsigned NumElts = VT.getVectorNumElements();
-  WhichResult = (M[0] == 0 ? 0 : 1);
-  for (unsigned i = 0; i != NumElts; ++i) {
-    if (M[i] < 0)
-      continue; // ignore UNDEF indices
-    if ((unsigned)M[i] != 2 * i + WhichResult)
-      return false;
-  }
-
-  return true;
-}
-
-static bool isTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
-  unsigned NumElts = VT.getVectorNumElements();
-  WhichResult = (M[0] == 0 ? 0 : 1);
-  for (unsigned i = 0; i < NumElts; i += 2) {
-    if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
-        (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + NumElts + WhichResult))
-      return false;
-  }
-  return true;
-}
-
-/// isZIP_v_undef_Mask - Special case of isZIPMask for canonical form of
-/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
-/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
-static bool isZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
-  unsigned NumElts = VT.getVectorNumElements();
-  WhichResult = (M[0] == 0 ? 0 : 1);
-  unsigned Idx = WhichResult * NumElts / 2;
-  for (unsigned i = 0; i != NumElts; i += 2) {
-    if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
-        (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx))
-      return false;
-    Idx += 1;
-  }
-
-  return true;
-}
-
-/// isUZP_v_undef_Mask - Special case of isUZPMask for canonical form of
-/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
-/// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>,
-static bool isUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
-  unsigned Half = VT.getVectorNumElements() / 2;
-  WhichResult = (M[0] == 0 ? 0 : 1);
-  for (unsigned j = 0; j != 2; ++j) {
-    unsigned Idx = WhichResult;
-    for (unsigned i = 0; i != Half; ++i) {
-      int MIdx = M[i + j * Half];
-      if (MIdx >= 0 && (unsigned)MIdx != Idx)
-        return false;
-      Idx += 2;
-    }
-  }
-
-  return true;
-}
-
-/// isTRN_v_undef_Mask - Special case of isTRNMask for canonical form of
-/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
-/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
-static bool isTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
-  unsigned NumElts = VT.getVectorNumElements();
-  WhichResult = (M[0] == 0 ? 0 : 1);
-  for (unsigned i = 0; i < NumElts; i += 2) {
-    if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
-        (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + WhichResult))
-      return false;
-  }
-  return true;
-}
-
-/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
-/// the specified operations to build the shuffle.
-static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
-                                      SDValue RHS, SelectionDAG &DAG,
-                                      SDLoc dl) {
-  unsigned OpNum = (PFEntry >> 26) & 0x0F;
-  unsigned LHSID = (PFEntry >> 13) & ((1 << 13) - 1);
-  unsigned RHSID = (PFEntry >> 0) & ((1 << 13) - 1);
-
-  enum {
-    OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
-    OP_VREV,
-    OP_VDUP0,
-    OP_VDUP1,
-    OP_VDUP2,
-    OP_VDUP3,
-    OP_VEXT1,
-    OP_VEXT2,
-    OP_VEXT3,
-    OP_VUZPL, // VUZP, left result
-    OP_VUZPR, // VUZP, right result
-    OP_VZIPL, // VZIP, left result
-    OP_VZIPR, // VZIP, right result
-    OP_VTRNL, // VTRN, left result
-    OP_VTRNR  // VTRN, right result
-  };
-
-  if (OpNum == OP_COPY) {
-    if (LHSID == (1 * 9 + 2) * 9 + 3)
-      return LHS;
-    assert(LHSID == ((4 * 9 + 5) * 9 + 6) * 9 + 7 && "Illegal OP_COPY!");
-    return RHS;
-  }
-
-  SDValue OpLHS, OpRHS;
-  OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
-  OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl);
-  EVT VT = OpLHS.getValueType();
-
-  switch (OpNum) {
-  default:
-    llvm_unreachable("Unknown shuffle opcode!");
-  case OP_VREV:
-    // VREV divides the vector in half and swaps within the half.
-    if (VT.getVectorElementType() == MVT::i32 ||
-        VT.getVectorElementType() == MVT::f32)
-      return DAG.getNode(ARM64ISD::REV64, dl, VT, OpLHS);
-    // vrev <4 x i16> -> REV32
-    if (VT.getVectorElementType() == MVT::i16)
-      return DAG.getNode(ARM64ISD::REV32, dl, VT, OpLHS);
-    // vrev <4 x i8> -> REV16
-    assert(VT.getVectorElementType() == MVT::i8);
-    return DAG.getNode(ARM64ISD::REV16, dl, VT, OpLHS);
-  case OP_VDUP0:
-  case OP_VDUP1:
-  case OP_VDUP2:
-  case OP_VDUP3: {
-    EVT EltTy = VT.getVectorElementType();
-    unsigned Opcode;
-    if (EltTy == MVT::i8)
-      Opcode = ARM64ISD::DUPLANE8;
-    else if (EltTy == MVT::i16)
-      Opcode = ARM64ISD::DUPLANE16;
-    else if (EltTy == MVT::i32 || EltTy == MVT::f32)
-      Opcode = ARM64ISD::DUPLANE32;
-    else if (EltTy == MVT::i64 || EltTy == MVT::f64)
-      Opcode = ARM64ISD::DUPLANE64;
-    else
-      llvm_unreachable("Invalid vector element type?");
-
-    if (VT.getSizeInBits() == 64)
-      OpLHS = WidenVector(OpLHS, DAG);
-    SDValue Lane = DAG.getConstant(OpNum - OP_VDUP0, MVT::i64);
-    return DAG.getNode(Opcode, dl, VT, OpLHS, Lane);
-  }
-  case OP_VEXT1:
-  case OP_VEXT2:
-  case OP_VEXT3: {
-    unsigned Imm = (OpNum - OP_VEXT1 + 1) * getExtFactor(OpLHS);
-    return DAG.getNode(ARM64ISD::EXT, dl, VT, OpLHS, OpRHS,
-                       DAG.getConstant(Imm, MVT::i32));
-  }
-  case OP_VUZPL:
-    return DAG.getNode(ARM64ISD::UZP1, dl, DAG.getVTList(VT, VT), OpLHS, OpRHS);
-  case OP_VUZPR:
-    return DAG.getNode(ARM64ISD::UZP2, dl, DAG.getVTList(VT, VT), OpLHS, OpRHS);
-  case OP_VZIPL:
-    return DAG.getNode(ARM64ISD::ZIP1, dl, DAG.getVTList(VT, VT), OpLHS, OpRHS);
-  case OP_VZIPR:
-    return DAG.getNode(ARM64ISD::ZIP2, dl, DAG.getVTList(VT, VT), OpLHS, OpRHS);
-  case OP_VTRNL:
-    return DAG.getNode(ARM64ISD::TRN1, dl, DAG.getVTList(VT, VT), OpLHS, OpRHS);
-  case OP_VTRNR:
-    return DAG.getNode(ARM64ISD::TRN2, dl, DAG.getVTList(VT, VT), OpLHS, OpRHS);
-  }
-}
-
-static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask,
-                           SelectionDAG &DAG) {
-  // Check to see if we can use the TBL instruction.
-  SDValue V1 = Op.getOperand(0);
-  SDValue V2 = Op.getOperand(1);
-  SDLoc DL(Op);
-
-  EVT EltVT = Op.getValueType().getVectorElementType();
-  unsigned BytesPerElt = EltVT.getSizeInBits() / 8;
-
-  SmallVector<SDValue, 8> TBLMask;
-  for (int Val : ShuffleMask) {
-    for (unsigned Byte = 0; Byte < BytesPerElt; ++Byte) {
-      unsigned Offset = Byte + Val * BytesPerElt;
-      TBLMask.push_back(DAG.getConstant(Offset, MVT::i32));
-    }
-  }
-
-  MVT IndexVT = MVT::v8i8;
-  unsigned IndexLen = 8;
-  if (Op.getValueType().getSizeInBits() == 128) {
-    IndexVT = MVT::v16i8;
-    IndexLen = 16;
-  }
-
-  SDValue V1Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V1);
-  SDValue V2Cst = DAG.getNode(ISD::BITCAST, DL, IndexVT, V2);
-
-  SDValue Shuffle;
-  if (V2.getNode()->getOpcode() == ISD::UNDEF) {
-    if (IndexLen == 8)
-      V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V1Cst);
-    Shuffle = DAG.getNode(
-        ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
-        DAG.getConstant(Intrinsic::arm64_neon_tbl1, MVT::i32), V1Cst,
-        DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT, &TBLMask[0], IndexLen));
-  } else {
-    if (IndexLen == 8) {
-      V1Cst = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, V1Cst, V2Cst);
-      Shuffle = DAG.getNode(
-          ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
-          DAG.getConstant(Intrinsic::arm64_neon_tbl1, MVT::i32), V1Cst,
-          DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT, &TBLMask[0], IndexLen));
-    } else {
-      // FIXME: We cannot, for the moment, emit a TBL2 instruction because we
-      // cannot currently represent the register constraints on the input
-      // table registers.
-      //  Shuffle = DAG.getNode(ARM64ISD::TBL2, DL, IndexVT, V1Cst, V2Cst,
-      //                   DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT,
-      //                               &TBLMask[0], IndexLen));
-      Shuffle = DAG.getNode(
-          ISD::INTRINSIC_WO_CHAIN, DL, IndexVT,
-          DAG.getConstant(Intrinsic::arm64_neon_tbl2, MVT::i32), V1Cst, V2Cst,
-          DAG.getNode(ISD::BUILD_VECTOR, DL, IndexVT, &TBLMask[0], IndexLen));
-    }
-  }
-  return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Shuffle);
-}
-
-static unsigned getDUPLANEOp(EVT EltType) {
-  if (EltType == MVT::i8)
-    return ARM64ISD::DUPLANE8;
-  if (EltType == MVT::i16)
-    return ARM64ISD::DUPLANE16;
-  if (EltType == MVT::i32 || EltType == MVT::f32)
-    return ARM64ISD::DUPLANE32;
-  if (EltType == MVT::i64 || EltType == MVT::f64)
-    return ARM64ISD::DUPLANE64;
-
-  llvm_unreachable("Invalid vector element type?");
-}
-
-SDValue ARM64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
-                                                 SelectionDAG &DAG) const {
-  SDLoc dl(Op);
-  EVT VT = Op.getValueType();
-
-  ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
-
-  // Convert shuffles that are directly supported on NEON to target-specific
-  // DAG nodes, instead of keeping them as shuffles and matching them again
-  // during code selection.  This is more efficient and avoids the possibility
-  // of inconsistencies between legalization and selection.
-  ArrayRef<int> ShuffleMask = SVN->getMask();
-
-  SDValue V1 = Op.getOperand(0);
-  SDValue V2 = Op.getOperand(1);
-
-  if (ShuffleVectorSDNode::isSplatMask(&ShuffleMask[0],
-                                       V1.getValueType().getSimpleVT())) {
-    int Lane = SVN->getSplatIndex();
-    // If this is undef splat, generate it via "just" vdup, if possible.
-    if (Lane == -1)
-      Lane = 0;
-
-    if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR)
-      return DAG.getNode(ARM64ISD::DUP, dl, V1.getValueType(),
-                         V1.getOperand(0));
-    // Test if V1 is a BUILD_VECTOR and the lane being referenced is a non-
-    // constant. If so, we can just reference the lane's definition directly.
-    if (V1.getOpcode() == ISD::BUILD_VECTOR &&
-        !isa<ConstantSDNode>(V1.getOperand(Lane)))
-      return DAG.getNode(ARM64ISD::DUP, dl, VT, V1.getOperand(Lane));
-
-    // Otherwise, duplicate from the lane of the input vector.
-    unsigned Opcode = getDUPLANEOp(V1.getValueType().getVectorElementType());
-
-    // SelectionDAGBuilder may have "helpfully" already extracted or conatenated
-    // to make a vector of the same size as this SHUFFLE. We can ignore the
-    // extract entirely, and canonicalise the concat using WidenVector.
-    if (V1.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
-      Lane += cast<ConstantSDNode>(V1.getOperand(1))->getZExtValue();
-      V1 = V1.getOperand(0);
-    } else if (V1.getOpcode() == ISD::CONCAT_VECTORS) {
-      unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2;
-      Lane -= Idx * VT.getVectorNumElements() / 2;
-      V1 = WidenVector(V1.getOperand(Idx), DAG);
-    } else if (VT.getSizeInBits() == 64)
-      V1 = WidenVector(V1, DAG);
-
-    return DAG.getNode(Opcode, dl, VT, V1, DAG.getConstant(Lane, MVT::i64));
-  }
-
-  if (isREVMask(ShuffleMask, VT, 64))
-    return DAG.getNode(ARM64ISD::REV64, dl, V1.getValueType(), V1, V2);
-  if (isREVMask(ShuffleMask, VT, 32))
-    return DAG.getNode(ARM64ISD::REV32, dl, V1.getValueType(), V1, V2);
-  if (isREVMask(ShuffleMask, VT, 16))
-    return DAG.getNode(ARM64ISD::REV16, dl, V1.getValueType(), V1, V2);
-
-  bool ReverseEXT = false;
-  unsigned Imm;
-  if (isEXTMask(ShuffleMask, VT, ReverseEXT, Imm)) {
-    if (ReverseEXT)
-      std::swap(V1, V2);
-    Imm *= getExtFactor(V1);
-    return DAG.getNode(ARM64ISD::EXT, dl, V1.getValueType(), V1, V2,
-                       DAG.getConstant(Imm, MVT::i32));
-  } else if (V2->getOpcode() == ISD::UNDEF &&
-             isSingletonEXTMask(ShuffleMask, VT, Imm)) {
-    Imm *= getExtFactor(V1);
-    return DAG.getNode(ARM64ISD::EXT, dl, V1.getValueType(), V1, V1,
-                       DAG.getConstant(Imm, MVT::i32));
-  }
-
-  unsigned WhichResult;
-  if (isZIPMask(ShuffleMask, VT, WhichResult)) {
-    unsigned Opc = (WhichResult == 0) ? ARM64ISD::ZIP1 : ARM64ISD::ZIP2;
-    return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
-  }
-  if (isUZPMask(ShuffleMask, VT, WhichResult)) {
-    unsigned Opc = (WhichResult == 0) ? ARM64ISD::UZP1 : ARM64ISD::UZP2;
-    return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
-  }
-  if (isTRNMask(ShuffleMask, VT, WhichResult)) {
-    unsigned Opc = (WhichResult == 0) ? ARM64ISD::TRN1 : ARM64ISD::TRN2;
-    return DAG.getNode(Opc, dl, V1.getValueType(), V1, V2);
-  }
-
-  if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
-    unsigned Opc = (WhichResult == 0) ? ARM64ISD::ZIP1 : ARM64ISD::ZIP2;
-    return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
-  }
-  if (isUZP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
-    unsigned Opc = (WhichResult == 0) ? ARM64ISD::UZP1 : ARM64ISD::UZP2;
-    return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
-  }
-  if (isTRN_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
-    unsigned Opc = (WhichResult == 0) ? ARM64ISD::TRN1 : ARM64ISD::TRN2;
-    return DAG.getNode(Opc, dl, V1.getValueType(), V1, V1);
-  }
-
-  // If the shuffle is not directly supported and it has 4 elements, use
-  // the PerfectShuffle-generated table to synthesize it from other shuffles.
-  unsigned NumElts = VT.getVectorNumElements();
-  if (NumElts == 4) {
-    unsigned PFIndexes[4];
-    for (unsigned i = 0; i != 4; ++i) {
-      if (ShuffleMask[i] < 0)
-        PFIndexes[i] = 8;
-      else
-        PFIndexes[i] = ShuffleMask[i];
-    }
-
-    // Compute the index in the perfect shuffle table.
-    unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
-                            PFIndexes[2] * 9 + PFIndexes[3];
-    unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
-    unsigned Cost = (PFEntry >> 30);
-
-    if (Cost <= 4)
-      return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
-  }
-
-  return GenerateTBL(Op, ShuffleMask, DAG);
-}
-
-static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits,
-                               APInt &UndefBits) {
-  EVT VT = BVN->getValueType(0);
-  APInt SplatBits, SplatUndef;
-  unsigned SplatBitSize;
-  bool HasAnyUndefs;
-  if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
-    unsigned NumSplats = VT.getSizeInBits() / SplatBitSize;
-
-    for (unsigned i = 0; i < NumSplats; ++i) {
-      CnstBits <<= SplatBitSize;
-      UndefBits <<= SplatBitSize;
-      CnstBits |= SplatBits.zextOrTrunc(VT.getSizeInBits());
-      UndefBits |= (SplatBits ^ SplatUndef).zextOrTrunc(VT.getSizeInBits());
-    }
-
-    return true;
-  }
-
-  return false;
-}
-
-SDValue ARM64TargetLowering::LowerVectorAND(SDValue Op,
-                                            SelectionDAG &DAG) const {
-  BuildVectorSDNode *BVN =
-      dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode());
-  SDValue LHS = Op.getOperand(0);
-  SDLoc dl(Op);
-  EVT VT = Op.getValueType();
-
-  if (!BVN)
-    return Op;
-
-  APInt CnstBits(VT.getSizeInBits(), 0);
-  APInt UndefBits(VT.getSizeInBits(), 0);
-  if (resolveBuildVector(BVN, CnstBits, UndefBits)) {
-    // We only have BIC vector immediate instruction, which is and-not.
-    CnstBits = ~CnstBits;
-
-    // We make use of a little bit of goto ickiness in order to avoid having to
-    // duplicate the immediate matching logic for the undef toggled case.
-    bool SecondTry = false;
-  AttemptModImm:
-
-    if (CnstBits.getHiBits(64) == CnstBits.getLoBits(64)) {
-      CnstBits = CnstBits.zextOrTrunc(64);
-      uint64_t CnstVal = CnstBits.getZExtValue();
-
-      if (ARM64_AM::isAdvSIMDModImmType1(CnstVal)) {
-        CnstVal = ARM64_AM::encodeAdvSIMDModImmType1(CnstVal);
-        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
-        SDValue Mov = DAG.getNode(ARM64ISD::BICi, dl, MovTy, LHS,
-                                  DAG.getConstant(CnstVal, MVT::i32),
-                                  DAG.getConstant(0, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
-      }
-
-      if (ARM64_AM::isAdvSIMDModImmType2(CnstVal)) {
-        CnstVal = ARM64_AM::encodeAdvSIMDModImmType2(CnstVal);
-        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
-        SDValue Mov = DAG.getNode(ARM64ISD::BICi, dl, MovTy, LHS,
-                                  DAG.getConstant(CnstVal, MVT::i32),
-                                  DAG.getConstant(8, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
-      }
-
-      if (ARM64_AM::isAdvSIMDModImmType3(CnstVal)) {
-        CnstVal = ARM64_AM::encodeAdvSIMDModImmType3(CnstVal);
-        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
-        SDValue Mov = DAG.getNode(ARM64ISD::BICi, dl, MovTy, LHS,
-                                  DAG.getConstant(CnstVal, MVT::i32),
-                                  DAG.getConstant(16, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
-      }
-
-      if (ARM64_AM::isAdvSIMDModImmType4(CnstVal)) {
-        CnstVal = ARM64_AM::encodeAdvSIMDModImmType4(CnstVal);
-        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
-        SDValue Mov = DAG.getNode(ARM64ISD::BICi, dl, MovTy, LHS,
-                                  DAG.getConstant(CnstVal, MVT::i32),
-                                  DAG.getConstant(24, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
-      }
-
-      if (ARM64_AM::isAdvSIMDModImmType5(CnstVal)) {
-        CnstVal = ARM64_AM::encodeAdvSIMDModImmType5(CnstVal);
-        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
-        SDValue Mov = DAG.getNode(ARM64ISD::BICi, dl, MovTy, LHS,
-                                  DAG.getConstant(CnstVal, MVT::i32),
-                                  DAG.getConstant(0, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
-      }
-
-      if (ARM64_AM::isAdvSIMDModImmType6(CnstVal)) {
-        CnstVal = ARM64_AM::encodeAdvSIMDModImmType6(CnstVal);
-        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
-        SDValue Mov = DAG.getNode(ARM64ISD::BICi, dl, MovTy, LHS,
-                                  DAG.getConstant(CnstVal, MVT::i32),
-                                  DAG.getConstant(8, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
-      }
-    }
-
-    if (SecondTry)
-      goto FailedModImm;
-    SecondTry = true;
-    CnstBits = ~UndefBits;
-    goto AttemptModImm;
-  }
-
-// We can always fall back to a non-immediate AND.
-FailedModImm:
-  return Op;
-}
-
-// Specialized code to quickly find if PotentialBVec is a BuildVector that
-// consists of only the same constant int value, returned in reference arg
-// ConstVal
-static bool isAllConstantBuildVector(const SDValue &PotentialBVec,
-                                     uint64_t &ConstVal) {
-  BuildVectorSDNode *Bvec = dyn_cast<BuildVectorSDNode>(PotentialBVec);
-  if (!Bvec)
-    return false;
-  ConstantSDNode *FirstElt = dyn_cast<ConstantSDNode>(Bvec->getOperand(0));
-  if (!FirstElt)
-    return false;
-  EVT VT = Bvec->getValueType(0);
-  unsigned NumElts = VT.getVectorNumElements();
-  for (unsigned i = 1; i < NumElts; ++i)
-    if (dyn_cast<ConstantSDNode>(Bvec->getOperand(i)) != FirstElt)
-      return false;
-  ConstVal = FirstElt->getZExtValue();
-  return true;
-}
-
-static unsigned getIntrinsicID(const SDNode *N) {
-  unsigned Opcode = N->getOpcode();
-  switch (Opcode) {
-  default:
-    return Intrinsic::not_intrinsic;
-  case ISD::INTRINSIC_WO_CHAIN: {
-    unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
-    if (IID < Intrinsic::num_intrinsics)
-      return IID;
-    return Intrinsic::not_intrinsic;
-  }
-  }
-}
-
-// Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)),
-// to (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a
-// BUILD_VECTORs with constant element C1, C2 is a constant, and C1 == ~C2.
-// Also, logical shift right -> sri, with the same structure.
-static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
-  EVT VT = N->getValueType(0);
-
-  if (!VT.isVector())
-    return SDValue();
-
-  SDLoc DL(N);
-
-  // Is the first op an AND?
-  const SDValue And = N->getOperand(0);
-  if (And.getOpcode() != ISD::AND)
-    return SDValue();
-
-  // Is the second op an shl or lshr?
-  SDValue Shift = N->getOperand(1);
-  // This will have been turned into: ARM64ISD::VSHL vector, #shift
-  // or ARM64ISD::VLSHR vector, #shift
-  unsigned ShiftOpc = Shift.getOpcode();
-  if ((ShiftOpc != ARM64ISD::VSHL && ShiftOpc != ARM64ISD::VLSHR))
-    return SDValue();
-  bool IsShiftRight = ShiftOpc == ARM64ISD::VLSHR;
-
-  // Is the shift amount constant?
-  ConstantSDNode *C2node = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
-  if (!C2node)
-    return SDValue();
-
-  // Is the and mask vector all constant?
-  uint64_t C1;
-  if (!isAllConstantBuildVector(And.getOperand(1), C1))
-    return SDValue();
-
-  // Is C1 == ~C2, taking into account how much one can shift elements of a
-  // particular size?
-  uint64_t C2 = C2node->getZExtValue();
-  unsigned ElemSizeInBits = VT.getVectorElementType().getSizeInBits();
-  if (C2 > ElemSizeInBits)
-    return SDValue();
-  unsigned ElemMask = (1 << ElemSizeInBits) - 1;
-  if ((C1 & ElemMask) != (~C2 & ElemMask))
-    return SDValue();
-
-  SDValue X = And.getOperand(0);
-  SDValue Y = Shift.getOperand(0);
-
-  unsigned Intrin =
-      IsShiftRight ? Intrinsic::arm64_neon_vsri : Intrinsic::arm64_neon_vsli;
-  SDValue ResultSLI =
-      DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
-                  DAG.getConstant(Intrin, MVT::i32), X, Y, Shift.getOperand(1));
-
-  DEBUG(dbgs() << "arm64-lower: transformed: \n");
-  DEBUG(N->dump(&DAG));
-  DEBUG(dbgs() << "into: \n");
-  DEBUG(ResultSLI->dump(&DAG));
-
-  ++NumShiftInserts;
-  return ResultSLI;
-}
-
-SDValue ARM64TargetLowering::LowerVectorOR(SDValue Op,
-                                           SelectionDAG &DAG) const {
-  // Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2))
-  if (EnableARM64SlrGeneration) {
-    SDValue Res = tryLowerToSLI(Op.getNode(), DAG);
-    if (Res.getNode())
-      return Res;
-  }
-
-  BuildVectorSDNode *BVN =
-      dyn_cast<BuildVectorSDNode>(Op.getOperand(0).getNode());
-  SDValue LHS = Op.getOperand(1);
-  SDLoc dl(Op);
-  EVT VT = Op.getValueType();
-
-  // OR commutes, so try swapping the operands.
-  if (!BVN) {
-    LHS = Op.getOperand(0);
-    BVN = dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode());
-  }
-  if (!BVN)
-    return Op;
-
-  APInt CnstBits(VT.getSizeInBits(), 0);
-  APInt UndefBits(VT.getSizeInBits(), 0);
-  if (resolveBuildVector(BVN, CnstBits, UndefBits)) {
-    // We make use of a little bit of goto ickiness in order to avoid having to
-    // duplicate the immediate matching logic for the undef toggled case.
-    bool SecondTry = false;
-  AttemptModImm:
-
-    if (CnstBits.getHiBits(64) == CnstBits.getLoBits(64)) {
-      CnstBits = CnstBits.zextOrTrunc(64);
-      uint64_t CnstVal = CnstBits.getZExtValue();
-
-      if (ARM64_AM::isAdvSIMDModImmType1(CnstVal)) {
-        CnstVal = ARM64_AM::encodeAdvSIMDModImmType1(CnstVal);
-        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
-        SDValue Mov = DAG.getNode(ARM64ISD::ORRi, dl, MovTy, LHS,
-                                  DAG.getConstant(CnstVal, MVT::i32),
-                                  DAG.getConstant(0, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
-      }
-
-      if (ARM64_AM::isAdvSIMDModImmType2(CnstVal)) {
-        CnstVal = ARM64_AM::encodeAdvSIMDModImmType2(CnstVal);
-        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
-        SDValue Mov = DAG.getNode(ARM64ISD::ORRi, dl, MovTy, LHS,
-                                  DAG.getConstant(CnstVal, MVT::i32),
-                                  DAG.getConstant(8, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
-      }
-
-      if (ARM64_AM::isAdvSIMDModImmType3(CnstVal)) {
-        CnstVal = ARM64_AM::encodeAdvSIMDModImmType3(CnstVal);
-        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
-        SDValue Mov = DAG.getNode(ARM64ISD::ORRi, dl, MovTy, LHS,
-                                  DAG.getConstant(CnstVal, MVT::i32),
-                                  DAG.getConstant(16, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
-      }
-
-      if (ARM64_AM::isAdvSIMDModImmType4(CnstVal)) {
-        CnstVal = ARM64_AM::encodeAdvSIMDModImmType4(CnstVal);
-        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
-        SDValue Mov = DAG.getNode(ARM64ISD::ORRi, dl, MovTy, LHS,
-                                  DAG.getConstant(CnstVal, MVT::i32),
-                                  DAG.getConstant(24, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
-      }
-
-      if (ARM64_AM::isAdvSIMDModImmType5(CnstVal)) {
-        CnstVal = ARM64_AM::encodeAdvSIMDModImmType5(CnstVal);
-        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
-        SDValue Mov = DAG.getNode(ARM64ISD::ORRi, dl, MovTy, LHS,
-                                  DAG.getConstant(CnstVal, MVT::i32),
-                                  DAG.getConstant(0, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
-      }
-
-      if (ARM64_AM::isAdvSIMDModImmType6(CnstVal)) {
-        CnstVal = ARM64_AM::encodeAdvSIMDModImmType6(CnstVal);
-        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
-        SDValue Mov = DAG.getNode(ARM64ISD::ORRi, dl, MovTy, LHS,
-                                  DAG.getConstant(CnstVal, MVT::i32),
-                                  DAG.getConstant(8, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
-      }
-    }
-
-    if (SecondTry)
-      goto FailedModImm;
-    SecondTry = true;
-    CnstBits = UndefBits;
-    goto AttemptModImm;
-  }
-
-// We can always fall back to a non-immediate OR.
-FailedModImm:
-  return Op;
-}
-
-SDValue ARM64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
-                                               SelectionDAG &DAG) const {
-  BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
-  SDLoc dl(Op);
-  EVT VT = Op.getValueType();
-
-  APInt CnstBits(VT.getSizeInBits(), 0);
-  APInt UndefBits(VT.getSizeInBits(), 0);
-  if (resolveBuildVector(BVN, CnstBits, UndefBits)) {
-    // We make use of a little bit of goto ickiness in order to avoid having to
-    // duplicate the immediate matching logic for the undef toggled case.
-    bool SecondTry = false;
-  AttemptModImm:
-
-    if (CnstBits.getHiBits(64) == CnstBits.getLoBits(64)) {
-      CnstBits = CnstBits.zextOrTrunc(64);
-      uint64_t CnstVal = CnstBits.getZExtValue();
-
-      // Certain magic vector constants (used to express things like NOT
-      // and NEG) are passed through unmodified.  This allows codegen patterns
-      // for these operations to match.  Special-purpose patterns will lower
-      // these immediates to MOVIs if it proves necessary.
-      if (VT.isInteger() && (CnstVal == 0 || CnstVal == ~0ULL))
-        return Op;
-
-      // The many faces of MOVI...
-      if (ARM64_AM::isAdvSIMDModImmType10(CnstVal)) {
-        CnstVal = ARM64_AM::encodeAdvSIMDModImmType10(CnstVal);
-        if (VT.getSizeInBits() == 128) {
-          SDValue Mov = DAG.getNode(ARM64ISD::MOVIedit, dl, MVT::v2i64,
-                                    DAG.getConstant(CnstVal, MVT::i32));
-          return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
-        }
-
-        // Support the V64 version via subregister insertion.
-        SDValue Mov = DAG.getNode(ARM64ISD::MOVIedit, dl, MVT::f64,
-                                  DAG.getConstant(CnstVal, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
-      }
-
-      if (ARM64_AM::isAdvSIMDModImmType1(CnstVal)) {
-        CnstVal = ARM64_AM::encodeAdvSIMDModImmType1(CnstVal);
-        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
-        SDValue Mov = DAG.getNode(ARM64ISD::MOVIshift, dl, MovTy,
-                                  DAG.getConstant(CnstVal, MVT::i32),
-                                  DAG.getConstant(0, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
-      }
-
-      if (ARM64_AM::isAdvSIMDModImmType2(CnstVal)) {
-        CnstVal = ARM64_AM::encodeAdvSIMDModImmType2(CnstVal);
-        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
-        SDValue Mov = DAG.getNode(ARM64ISD::MOVIshift, dl, MovTy,
-                                  DAG.getConstant(CnstVal, MVT::i32),
-                                  DAG.getConstant(8, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
-      }
-
-      if (ARM64_AM::isAdvSIMDModImmType3(CnstVal)) {
-        CnstVal = ARM64_AM::encodeAdvSIMDModImmType3(CnstVal);
-        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
-        SDValue Mov = DAG.getNode(ARM64ISD::MOVIshift, dl, MovTy,
-                                  DAG.getConstant(CnstVal, MVT::i32),
-                                  DAG.getConstant(16, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
-      }
-
-      if (ARM64_AM::isAdvSIMDModImmType4(CnstVal)) {
-        CnstVal = ARM64_AM::encodeAdvSIMDModImmType4(CnstVal);
-        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
-        SDValue Mov = DAG.getNode(ARM64ISD::MOVIshift, dl, MovTy,
-                                  DAG.getConstant(CnstVal, MVT::i32),
-                                  DAG.getConstant(24, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
-      }
-
-      if (ARM64_AM::isAdvSIMDModImmType5(CnstVal)) {
-        CnstVal = ARM64_AM::encodeAdvSIMDModImmType5(CnstVal);
-        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
-        SDValue Mov = DAG.getNode(ARM64ISD::MOVIshift, dl, MovTy,
-                                  DAG.getConstant(CnstVal, MVT::i32),
-                                  DAG.getConstant(0, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
-      }
-
-      if (ARM64_AM::isAdvSIMDModImmType6(CnstVal)) {
-        CnstVal = ARM64_AM::encodeAdvSIMDModImmType6(CnstVal);
-        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
-        SDValue Mov = DAG.getNode(ARM64ISD::MOVIshift, dl, MovTy,
-                                  DAG.getConstant(CnstVal, MVT::i32),
-                                  DAG.getConstant(8, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
-      }
-
-      if (ARM64_AM::isAdvSIMDModImmType7(CnstVal)) {
-        CnstVal = ARM64_AM::encodeAdvSIMDModImmType7(CnstVal);
-        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
-        SDValue Mov = DAG.getNode(ARM64ISD::MOVImsl, dl, MovTy,
-                                  DAG.getConstant(CnstVal, MVT::i32),
-                                  DAG.getConstant(264, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
-      }
-
-      if (ARM64_AM::isAdvSIMDModImmType8(CnstVal)) {
-        CnstVal = ARM64_AM::encodeAdvSIMDModImmType8(CnstVal);
-        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
-        SDValue Mov = DAG.getNode(ARM64ISD::MOVImsl, dl, MovTy,
-                                  DAG.getConstant(CnstVal, MVT::i32),
-                                  DAG.getConstant(272, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
-      }
-
-      if (ARM64_AM::isAdvSIMDModImmType9(CnstVal)) {
-        CnstVal = ARM64_AM::encodeAdvSIMDModImmType9(CnstVal);
-        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v16i8 : MVT::v8i8;
-        SDValue Mov = DAG.getNode(ARM64ISD::MOVI, dl, MovTy,
-                                  DAG.getConstant(CnstVal, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
-      }
-
-      // The few faces of FMOV...
-      if (ARM64_AM::isAdvSIMDModImmType11(CnstVal)) {
-        CnstVal = ARM64_AM::encodeAdvSIMDModImmType11(CnstVal);
-        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4f32 : MVT::v2f32;
-        SDValue Mov = DAG.getNode(ARM64ISD::FMOV, dl, MovTy,
-                                  DAG.getConstant(CnstVal, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
-      }
-
-      if (ARM64_AM::isAdvSIMDModImmType12(CnstVal) &&
-          VT.getSizeInBits() == 128) {
-        CnstVal = ARM64_AM::encodeAdvSIMDModImmType12(CnstVal);
-        SDValue Mov = DAG.getNode(ARM64ISD::FMOV, dl, MVT::v2f64,
-                                  DAG.getConstant(CnstVal, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
-      }
-
-      // The many faces of MVNI...
-      CnstVal = ~CnstVal;
-      if (ARM64_AM::isAdvSIMDModImmType1(CnstVal)) {
-        CnstVal = ARM64_AM::encodeAdvSIMDModImmType1(CnstVal);
-        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
-        SDValue Mov = DAG.getNode(ARM64ISD::MVNIshift, dl, MovTy,
-                                  DAG.getConstant(CnstVal, MVT::i32),
-                                  DAG.getConstant(0, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
-      }
-
-      if (ARM64_AM::isAdvSIMDModImmType2(CnstVal)) {
-        CnstVal = ARM64_AM::encodeAdvSIMDModImmType2(CnstVal);
-        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
-        SDValue Mov = DAG.getNode(ARM64ISD::MVNIshift, dl, MovTy,
-                                  DAG.getConstant(CnstVal, MVT::i32),
-                                  DAG.getConstant(8, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
-      }
-
-      if (ARM64_AM::isAdvSIMDModImmType3(CnstVal)) {
-        CnstVal = ARM64_AM::encodeAdvSIMDModImmType3(CnstVal);
-        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
-        SDValue Mov = DAG.getNode(ARM64ISD::MVNIshift, dl, MovTy,
-                                  DAG.getConstant(CnstVal, MVT::i32),
-                                  DAG.getConstant(16, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
-      }
-
-      if (ARM64_AM::isAdvSIMDModImmType4(CnstVal)) {
-        CnstVal = ARM64_AM::encodeAdvSIMDModImmType4(CnstVal);
-        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
-        SDValue Mov = DAG.getNode(ARM64ISD::MVNIshift, dl, MovTy,
-                                  DAG.getConstant(CnstVal, MVT::i32),
-                                  DAG.getConstant(24, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
-      }
-
-      if (ARM64_AM::isAdvSIMDModImmType5(CnstVal)) {
-        CnstVal = ARM64_AM::encodeAdvSIMDModImmType5(CnstVal);
-        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
-        SDValue Mov = DAG.getNode(ARM64ISD::MVNIshift, dl, MovTy,
-                                  DAG.getConstant(CnstVal, MVT::i32),
-                                  DAG.getConstant(0, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
-      }
-
-      if (ARM64_AM::isAdvSIMDModImmType6(CnstVal)) {
-        CnstVal = ARM64_AM::encodeAdvSIMDModImmType6(CnstVal);
-        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v8i16 : MVT::v4i16;
-        SDValue Mov = DAG.getNode(ARM64ISD::MVNIshift, dl, MovTy,
-                                  DAG.getConstant(CnstVal, MVT::i32),
-                                  DAG.getConstant(8, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
-      }
-
-      if (ARM64_AM::isAdvSIMDModImmType7(CnstVal)) {
-        CnstVal = ARM64_AM::encodeAdvSIMDModImmType7(CnstVal);
-        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
-        SDValue Mov = DAG.getNode(ARM64ISD::MVNImsl, dl, MovTy,
-                                  DAG.getConstant(CnstVal, MVT::i32),
-                                  DAG.getConstant(264, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
-      }
-
-      if (ARM64_AM::isAdvSIMDModImmType8(CnstVal)) {
-        CnstVal = ARM64_AM::encodeAdvSIMDModImmType8(CnstVal);
-        MVT MovTy = (VT.getSizeInBits() == 128) ? MVT::v4i32 : MVT::v2i32;
-        SDValue Mov = DAG.getNode(ARM64ISD::MVNImsl, dl, MovTy,
-                                  DAG.getConstant(CnstVal, MVT::i32),
-                                  DAG.getConstant(272, MVT::i32));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Mov);
-      }
-    }
-
-    if (SecondTry)
-      goto FailedModImm;
-    SecondTry = true;
-    CnstBits = UndefBits;
-    goto AttemptModImm;
-  }
-FailedModImm:
-
-  // Scan through the operands to find some interesting properties we can
-  // exploit:
-  //   1) If only one value is used, we can use a DUP, or
-  //   2) if only the low element is not undef, we can just insert that, or
-  //   3) if only one constant value is used (w/ some non-constant lanes),
-  //      we can splat the constant value into the whole vector then fill
-  //      in the non-constant lanes.
-  //   4) FIXME: If different constant values are used, but we can intelligently
-  //             select the values we'll be overwriting for the non-constant
-  //             lanes such that we can directly materialize the vector
-  //             some other way (MOVI, e.g.), we can be sneaky.
-  unsigned NumElts = VT.getVectorNumElements();
-  bool isOnlyLowElement = true;
-  bool usesOnlyOneValue = true;
-  bool usesOnlyOneConstantValue = true;
-  bool isConstant = true;
-  unsigned NumConstantLanes = 0;
-  SDValue Value;
-  SDValue ConstantValue;
-  for (unsigned i = 0; i < NumElts; ++i) {
-    SDValue V = Op.getOperand(i);
-    if (V.getOpcode() == ISD::UNDEF)
-      continue;
-    if (i > 0)
-      isOnlyLowElement = false;
-    if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V))
-      isConstant = false;
-
-    if (isa<ConstantSDNode>(V)) {
-      ++NumConstantLanes;
-      if (!ConstantValue.getNode())
-        ConstantValue = V;
-      else if (ConstantValue != V)
-        usesOnlyOneConstantValue = false;
-    }
-
-    if (!Value.getNode())
-      Value = V;
-    else if (V != Value)
-      usesOnlyOneValue = false;
-  }
-
-  if (!Value.getNode())
-    return DAG.getUNDEF(VT);
-
-  if (isOnlyLowElement)
-    return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);
-
-  // Use DUP for non-constant splats.  For f32 constant splats, reduce to
-  // i32 and try again.
-  if (usesOnlyOneValue) {
-    if (!isConstant) {
-      if (Value.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
-          Value.getValueType() != VT)
-        return DAG.getNode(ARM64ISD::DUP, dl, VT, Value);
-
-      // This is actually a DUPLANExx operation, which keeps everything vectory.
-
-      // DUPLANE works on 128-bit vectors, widen it if necessary.
-      SDValue Lane = Value.getOperand(1);
-      Value = Value.getOperand(0);
-      if (Value.getValueType().getSizeInBits() == 64)
-        Value = WidenVector(Value, DAG);
-
-      unsigned Opcode = getDUPLANEOp(VT.getVectorElementType());
-      return DAG.getNode(Opcode, dl, VT, Value, Lane);
-    }
-
-    if (VT.getVectorElementType().isFloatingPoint()) {
-      SmallVector<SDValue, 8> Ops;
-      MVT NewType =
-          (VT.getVectorElementType() == MVT::f32) ? MVT::i32 : MVT::i64;
-      for (unsigned i = 0; i < NumElts; ++i)
-        Ops.push_back(DAG.getNode(ISD::BITCAST, dl, NewType, Op.getOperand(i)));
-      EVT VecVT = EVT::getVectorVT(*DAG.getContext(), NewType, NumElts);
-      SDValue Val = DAG.getNode(ISD::BUILD_VECTOR, dl, VecVT, &Ops[0], NumElts);
-      Val = LowerBUILD_VECTOR(Val, DAG);
-      if (Val.getNode())
-        return DAG.getNode(ISD::BITCAST, dl, VT, Val);
-    }
-  }
-
-  // If there was only one constant value used and for more than one lane,
-  // start by splatting that value, then replace the non-constant lanes. This
-  // is better than the default, which will perform a separate initialization
-  // for each lane.
-  if (NumConstantLanes > 0 && usesOnlyOneConstantValue) {
-    SDValue Val = DAG.getNode(ARM64ISD::DUP, dl, VT, ConstantValue);
-    // Now insert the non-constant lanes.
-    for (unsigned i = 0; i < NumElts; ++i) {
-      SDValue V = Op.getOperand(i);
-      SDValue LaneIdx = DAG.getConstant(i, MVT::i64);
-      if (!isa<ConstantSDNode>(V)) {
-        // Note that type legalization likely mucked about with the VT of the
-        // source operand, so we may have to convert it here before inserting.
-        Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Val, V, LaneIdx);
-      }
-    }
-    return Val;
-  }
-
-  // If all elements are constants and the case above didn't get hit, fall back
-  // to the default expansion, which will generate a load from the constant
-  // pool.
-  if (isConstant)
-    return SDValue();
-
-  // Empirical tests suggest this is rarely worth it for vectors of length <= 2.
-  if (NumElts >= 4) {
-    SDValue shuffle = ReconstructShuffle(Op, DAG);
-    if (shuffle != SDValue())
-      return shuffle;
-  }
-
-  // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
-  // know the default expansion would otherwise fall back on something even
-  // worse. For a vector with one or two non-undef values, that's
-  // scalar_to_vector for the elements followed by a shuffle (provided the
-  // shuffle is valid for the target) and materialization element by element
-  // on the stack followed by a load for everything else.
-  if (!isConstant && !usesOnlyOneValue) {
-    SDValue Vec = DAG.getUNDEF(VT);
-    SDValue Op0 = Op.getOperand(0);
-    unsigned ElemSize = VT.getVectorElementType().getSizeInBits();
-    unsigned i = 0;
-    // For 32 and 64 bit types, use INSERT_SUBREG for lane zero to
-    // a) Avoid a RMW dependency on the full vector register, and
-    // b) Allow the register coalescer to fold away the copy if the
-    //    value is already in an S or D register.
-    if (Op0.getOpcode() != ISD::UNDEF && (ElemSize == 32 || ElemSize == 64)) {
-      unsigned SubIdx = ElemSize == 32 ? ARM64::ssub : ARM64::dsub;
-      MachineSDNode *N =
-          DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, dl, VT, Vec, Op0,
-                             DAG.getTargetConstant(SubIdx, MVT::i32));
-      Vec = SDValue(N, 0);
-      ++i;
-    }
-    for (; i < NumElts; ++i) {
-      SDValue V = Op.getOperand(i);
-      if (V.getOpcode() == ISD::UNDEF)
-        continue;
-      SDValue LaneIdx = DAG.getConstant(i, MVT::i64);
-      Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
-    }
-    return Vec;
-  }
-
-  // Just use the default expansion. We failed to find a better alternative.
-  return SDValue();
-}
-
-SDValue ARM64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
-                                                    SelectionDAG &DAG) const {
-  assert(Op.getOpcode() == ISD::INSERT_VECTOR_ELT && "Unknown opcode!");
-
-  // Check for non-constant lane.
-  if (!isa<ConstantSDNode>(Op.getOperand(2)))
-    return SDValue();
-
-  EVT VT = Op.getOperand(0).getValueType();
-
-  // Insertion/extraction are legal for V128 types.
-  if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
-      VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64)
-    return Op;
-
-  if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
-      VT != MVT::v1i64 && VT != MVT::v2f32)
-    return SDValue();
-
-  // For V64 types, we perform insertion by expanding the value
-  // to a V128 type and perform the insertion on that.
-  SDLoc DL(Op);
-  SDValue WideVec = WidenVector(Op.getOperand(0), DAG);
-  EVT WideTy = WideVec.getValueType();
-
-  SDValue Node = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, WideTy, WideVec,
-                             Op.getOperand(1), Op.getOperand(2));
-  // Re-narrow the resultant vector.
-  return NarrowVector(Node, DAG);
-}
-
-SDValue ARM64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
-                                                     SelectionDAG &DAG) const {
-  assert(Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unknown opcode!");
-
-  // Check for non-constant lane.
-  if (!isa<ConstantSDNode>(Op.getOperand(1)))
-    return SDValue();
-
-  EVT VT = Op.getOperand(0).getValueType();
-
-  // Insertion/extraction are legal for V128 types.
-  if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
-      VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64)
-    return Op;
-
-  if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
-      VT != MVT::v1i64 && VT != MVT::v2f32)
-    return SDValue();
-
-  // For V64 types, we perform extraction by expanding the value
-  // to a V128 type and perform the extraction on that.
-  SDLoc DL(Op);
-  SDValue WideVec = WidenVector(Op.getOperand(0), DAG);
-  EVT WideTy = WideVec.getValueType();
-
-  EVT ExtrTy = WideTy.getVectorElementType();
-  if (ExtrTy == MVT::i16 || ExtrTy == MVT::i8)
-    ExtrTy = MVT::i32;
-
-  // For extractions, we just return the result directly.
-  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtrTy, WideVec,
-                     Op.getOperand(1));
-}
-
-SDValue ARM64TargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op,
-                                                   SelectionDAG &DAG) const {
-  assert(Op.getOpcode() == ISD::SCALAR_TO_VECTOR && "Unknown opcode!");
-  // Some AdvSIMD intrinsics leave their results in the scalar B/H/S/D
-  // registers. The default lowering will copy those to a GPR then back
-  // to a vector register. Instead, just recognize those cases and reference
-  // the vector register they're already a subreg of.
-  SDValue Op0 = Op->getOperand(0);
-  if (Op0->getOpcode() != ISD::INTRINSIC_WO_CHAIN)
-    return Op;
-  unsigned IID = getIntrinsicID(Op0.getNode());
-  // The below list of intrinsics isn't exhaustive. Add cases as-needed.
-  // FIXME: Even better would be if there were an attribute on the node
-  // that we could query and set in the intrinsics definition or something.
-  unsigned SubIdx;
-  switch (IID) {
-  default:
-    // Early exit if this isn't one of the intrinsics we handle.
-    return Op;
-  case Intrinsic::arm64_neon_uaddv:
-  case Intrinsic::arm64_neon_saddv:
-  case Intrinsic::arm64_neon_uaddlv:
-  case Intrinsic::arm64_neon_saddlv:
-    switch (Op0.getValueType().getSizeInBits()) {
-    default:
-      llvm_unreachable("Illegal result size from ARM64 vector intrinsic!");
-    case 8:
-      SubIdx = ARM64::bsub;
-      break;
-    case 16:
-      SubIdx = ARM64::hsub;
-      break;
-    case 32:
-      SubIdx = ARM64::ssub;
-      break;
-    case 64:
-      SubIdx = ARM64::dsub;
-      break;
-    }
-  }
-  MachineSDNode *N =
-      DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, SDLoc(Op),
-                         Op.getValueType(), DAG.getUNDEF(Op.getValueType()),
-                         Op0, DAG.getTargetConstant(SubIdx, MVT::i32));
-  return SDValue(N, 0);
-}
-
-SDValue ARM64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
-                                                    SelectionDAG &DAG) const {
-  EVT VT = Op.getOperand(0).getValueType();
-  SDLoc dl(Op);
-  // Just in case...
-  if (!VT.isVector())
-    return SDValue();
-
-  ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(1));
-  if (!Cst)
-    return SDValue();
-  unsigned Val = Cst->getZExtValue();
-
-  unsigned Size = Op.getValueType().getSizeInBits();
-  if (Val == 0) {
-    switch (Size) {
-    case 8:
-      return DAG.getTargetExtractSubreg(ARM64::bsub, dl, Op.getValueType(),
-                                        Op.getOperand(0));
-    case 16:
-      return DAG.getTargetExtractSubreg(ARM64::hsub, dl, Op.getValueType(),
-                                        Op.getOperand(0));
-    case 32:
-      return DAG.getTargetExtractSubreg(ARM64::ssub, dl, Op.getValueType(),
-                                        Op.getOperand(0));
-    case 64:
-      return DAG.getTargetExtractSubreg(ARM64::dsub, dl, Op.getValueType(),
-                                        Op.getOperand(0));
-    default:
-      llvm_unreachable("Unexpected vector type in extract_subvector!");
-    }
-  }
-  // If this is extracting the upper 64-bits of a 128-bit vector, we match
-  // that directly.
-  if (Size == 64 && Val * VT.getVectorElementType().getSizeInBits() == 64)
-    return Op;
-
-  return SDValue();
-}
-
-bool ARM64TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
-                                             EVT VT) const {
-  if (VT.getVectorNumElements() == 4 &&
-      (VT.is128BitVector() || VT.is64BitVector())) {
-    unsigned PFIndexes[4];
-    for (unsigned i = 0; i != 4; ++i) {
-      if (M[i] < 0)
-        PFIndexes[i] = 8;
-      else
-        PFIndexes[i] = M[i];
-    }
-
-    // Compute the index in the perfect shuffle table.
-    unsigned PFTableIndex = PFIndexes[0] * 9 * 9 * 9 + PFIndexes[1] * 9 * 9 +
-                            PFIndexes[2] * 9 + PFIndexes[3];
-    unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
-    unsigned Cost = (PFEntry >> 30);
-
-    if (Cost <= 4)
-      return true;
-  }
-
-  bool ReverseVEXT;
-  unsigned Imm, WhichResult;
-
-  return (ShuffleVectorSDNode::isSplatMask(&M[0], VT) || isREVMask(M, VT, 64) ||
-          isREVMask(M, VT, 32) || isREVMask(M, VT, 16) ||
-          isEXTMask(M, VT, ReverseVEXT, Imm) ||
-          // isTBLMask(M, VT) || // FIXME: Port TBL support from ARM.
-          isTRNMask(M, VT, WhichResult) || isUZPMask(M, VT, WhichResult) ||
-          isZIPMask(M, VT, WhichResult) ||
-          isTRN_v_undef_Mask(M, VT, WhichResult) ||
-          isUZP_v_undef_Mask(M, VT, WhichResult) ||
-          isZIP_v_undef_Mask(M, VT, WhichResult));
-}
-
-/// getVShiftImm - Check if this is a valid build_vector for the immediate
-/// operand of a vector shift operation, where all the elements of the
-/// build_vector must have the same constant integer value.
-static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
-  // Ignore bit_converts.
-  while (Op.getOpcode() == ISD::BITCAST)
-    Op = Op.getOperand(0);
-  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
-  APInt SplatBits, SplatUndef;
-  unsigned SplatBitSize;
-  bool HasAnyUndefs;
-  if (!BVN || !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize,
-                                    HasAnyUndefs, ElementBits) ||
-      SplatBitSize > ElementBits)
-    return false;
-  Cnt = SplatBits.getSExtValue();
-  return true;
-}
-
-/// isVShiftLImm - Check if this is a valid build_vector for the immediate
-/// operand of a vector shift left operation.  That value must be in the range:
-///   0 <= Value < ElementBits for a left shift; or
-///   0 <= Value <= ElementBits for a long left shift.
-static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
-  assert(VT.isVector() && "vector shift count is not a vector type");
-  unsigned ElementBits = VT.getVectorElementType().getSizeInBits();
-  if (!getVShiftImm(Op, ElementBits, Cnt))
-    return false;
-  return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
-}
-
-/// isVShiftRImm - Check if this is a valid build_vector for the immediate
-/// operand of a vector shift right operation.  For a shift opcode, the value
-/// is positive, but for an intrinsic the value count must be negative. The
-/// absolute value must be in the range:
-///   1 <= |Value| <= ElementBits for a right shift; or
-///   1 <= |Value| <= ElementBits/2 for a narrow right shift.
-static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic,
-                         int64_t &Cnt) {
-  assert(VT.isVector() && "vector shift count is not a vector type");
-  unsigned ElementBits = VT.getVectorElementType().getSizeInBits();
-  if (!getVShiftImm(Op, ElementBits, Cnt))
-    return false;
-  if (isIntrinsic)
-    Cnt = -Cnt;
-  return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
-}
-
-SDValue ARM64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
-                                                    SelectionDAG &DAG) const {
-  EVT VT = Op.getValueType();
-  SDLoc DL(Op);
-  int64_t Cnt;
-
-  if (!Op.getOperand(1).getValueType().isVector())
-    return Op;
-  unsigned EltSize = VT.getVectorElementType().getSizeInBits();
-
-  switch (Op.getOpcode()) {
-  default:
-    llvm_unreachable("unexpected shift opcode");
-
-  case ISD::SHL:
-    if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize)
-      return DAG.getNode(ARM64ISD::VSHL, SDLoc(Op), VT, Op.getOperand(0),
-                         DAG.getConstant(Cnt, MVT::i32));
-    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
-                       DAG.getConstant(Intrinsic::arm64_neon_ushl, MVT::i32),
-                       Op.getOperand(0), Op.getOperand(1));
-  case ISD::SRA:
-  case ISD::SRL:
-    // Right shift immediate
-    if (isVShiftRImm(Op.getOperand(1), VT, false, false, Cnt) &&
-        Cnt < EltSize) {
-      unsigned Opc =
-          (Op.getOpcode() == ISD::SRA) ? ARM64ISD::VASHR : ARM64ISD::VLSHR;
-      return DAG.getNode(Opc, SDLoc(Op), VT, Op.getOperand(0),
-                         DAG.getConstant(Cnt, MVT::i32));
-    }
-
-    // Right shift register.  Note, there is not a shift right register
-    // instruction, but the shift left register instruction takes a signed
-    // value, where negative numbers specify a right shift.
-    unsigned Opc = (Op.getOpcode() == ISD::SRA) ? Intrinsic::arm64_neon_sshl
-                                                : Intrinsic::arm64_neon_ushl;
-    // negate the shift amount
-    SDValue NegShift = DAG.getNode(ARM64ISD::NEG, DL, VT, Op.getOperand(1));
-    SDValue NegShiftLeft =
-        DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
-                    DAG.getConstant(Opc, MVT::i32), Op.getOperand(0), NegShift);
-    return NegShiftLeft;
-  }
-
-  return SDValue();
-}
-
-static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS,
-                                    ARM64CC::CondCode CC, bool NoNans, EVT VT,
-                                    SDLoc dl, SelectionDAG &DAG) {
-  EVT SrcVT = LHS.getValueType();
-
-  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode());
-  APInt CnstBits(VT.getSizeInBits(), 0);
-  APInt UndefBits(VT.getSizeInBits(), 0);
-  bool IsCnst = BVN && resolveBuildVector(BVN, CnstBits, UndefBits);
-  bool IsZero = IsCnst && (CnstBits == 0);
-
-  if (SrcVT.getVectorElementType().isFloatingPoint()) {
-    switch (CC) {
-    default:
-      return SDValue();
-    case ARM64CC::NE: {
-      SDValue Fcmeq;
-      if (IsZero)
-        Fcmeq = DAG.getNode(ARM64ISD::FCMEQz, dl, VT, LHS);
-      else
-        Fcmeq = DAG.getNode(ARM64ISD::FCMEQ, dl, VT, LHS, RHS);
-      return DAG.getNode(ARM64ISD::NOT, dl, VT, Fcmeq);
-    }
-    case ARM64CC::EQ:
-      if (IsZero)
-        return DAG.getNode(ARM64ISD::FCMEQz, dl, VT, LHS);
-      return DAG.getNode(ARM64ISD::FCMEQ, dl, VT, LHS, RHS);
-    case ARM64CC::GE:
-      if (IsZero)
-        return DAG.getNode(ARM64ISD::FCMGEz, dl, VT, LHS);
-      return DAG.getNode(ARM64ISD::FCMGE, dl, VT, LHS, RHS);
-    case ARM64CC::GT:
-      if (IsZero)
-        return DAG.getNode(ARM64ISD::FCMGTz, dl, VT, LHS);
-      return DAG.getNode(ARM64ISD::FCMGT, dl, VT, LHS, RHS);
-    case ARM64CC::LS:
-      if (IsZero)
-        return DAG.getNode(ARM64ISD::FCMLEz, dl, VT, LHS);
-      return DAG.getNode(ARM64ISD::FCMGE, dl, VT, RHS, LHS);
-    case ARM64CC::LT:
-      if (!NoNans)
-        return SDValue();
-    // If we ignore NaNs then we can use to the MI implementation.
-    // Fallthrough.
-    case ARM64CC::MI:
-      if (IsZero)
-        return DAG.getNode(ARM64ISD::FCMLTz, dl, VT, LHS);
-      return DAG.getNode(ARM64ISD::FCMGT, dl, VT, RHS, LHS);
-    }
-  }
-
-  switch (CC) {
-  default:
-    return SDValue();
-  case ARM64CC::NE: {
-    SDValue Cmeq;
-    if (IsZero)
-      Cmeq = DAG.getNode(ARM64ISD::CMEQz, dl, VT, LHS);
-    else
-      Cmeq = DAG.getNode(ARM64ISD::CMEQ, dl, VT, LHS, RHS);
-    return DAG.getNode(ARM64ISD::NOT, dl, VT, Cmeq);
-  }
-  case ARM64CC::EQ:
-    if (IsZero)
-      return DAG.getNode(ARM64ISD::CMEQz, dl, VT, LHS);
-    return DAG.getNode(ARM64ISD::CMEQ, dl, VT, LHS, RHS);
-  case ARM64CC::GE:
-    if (IsZero)
-      return DAG.getNode(ARM64ISD::CMGEz, dl, VT, LHS);
-    return DAG.getNode(ARM64ISD::CMGE, dl, VT, LHS, RHS);
-  case ARM64CC::GT:
-    if (IsZero)
-      return DAG.getNode(ARM64ISD::CMGTz, dl, VT, LHS);
-    return DAG.getNode(ARM64ISD::CMGT, dl, VT, LHS, RHS);
-  case ARM64CC::LE:
-    if (IsZero)
-      return DAG.getNode(ARM64ISD::CMLEz, dl, VT, LHS);
-    return DAG.getNode(ARM64ISD::CMGE, dl, VT, RHS, LHS);
-  case ARM64CC::LS:
-    return DAG.getNode(ARM64ISD::CMHS, dl, VT, RHS, LHS);
-  case ARM64CC::CC:
-    return DAG.getNode(ARM64ISD::CMHI, dl, VT, RHS, LHS);
-  case ARM64CC::LT:
-    if (IsZero)
-      return DAG.getNode(ARM64ISD::CMLTz, dl, VT, LHS);
-    return DAG.getNode(ARM64ISD::CMGT, dl, VT, RHS, LHS);
-  case ARM64CC::HI:
-    return DAG.getNode(ARM64ISD::CMHI, dl, VT, LHS, RHS);
-  case ARM64CC::CS:
-    return DAG.getNode(ARM64ISD::CMHS, dl, VT, LHS, RHS);
-  }
-}
-
-SDValue ARM64TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const {
-  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
-  SDValue LHS = Op.getOperand(0);
-  SDValue RHS = Op.getOperand(1);
-  SDLoc dl(Op);
-
-  if (LHS.getValueType().getVectorElementType().isInteger()) {
-    assert(LHS.getValueType() == RHS.getValueType());
-    ARM64CC::CondCode ARM64CC = changeIntCCToARM64CC(CC);
-    return EmitVectorComparison(LHS, RHS, ARM64CC, false, Op.getValueType(), dl,
-                                DAG);
-  }
-
-  assert(LHS.getValueType().getVectorElementType() == MVT::f32 ||
-         LHS.getValueType().getVectorElementType() == MVT::f64);
-
-  // Unfortunately, the mapping of LLVM FP CC's onto ARM64 CC's isn't totally
-  // clean.  Some of them require two branches to implement.
-  ARM64CC::CondCode CC1, CC2;
-  changeFPCCToARM64CC(CC, CC1, CC2);
-
-  bool NoNaNs = getTargetMachine().Options.NoNaNsFPMath;
-  SDValue Cmp1 =
-      EmitVectorComparison(LHS, RHS, CC1, NoNaNs, Op.getValueType(), dl, DAG);
-  if (!Cmp1.getNode())
-    return SDValue();
-
-  if (CC2 != ARM64CC::AL) {
-    SDValue Cmp2 =
-        EmitVectorComparison(LHS, RHS, CC2, NoNaNs, Op.getValueType(), dl, DAG);
-    if (!Cmp2.getNode())
-      return SDValue();
-
-    return DAG.getNode(ISD::OR, dl, Cmp1.getValueType(), Cmp1, Cmp2);
-  }
-
-  return Cmp1;
-}
-
-/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
-/// MemIntrinsicNodes.  The associated MachineMemOperands record the alignment
-/// specified in the intrinsic calls.
-bool ARM64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
-                                             const CallInst &I,
-                                             unsigned Intrinsic) const {
-  switch (Intrinsic) {
-  case Intrinsic::arm64_neon_ld2:
-  case Intrinsic::arm64_neon_ld3:
-  case Intrinsic::arm64_neon_ld4:
-  case Intrinsic::arm64_neon_ld2lane:
-  case Intrinsic::arm64_neon_ld3lane:
-  case Intrinsic::arm64_neon_ld4lane:
-  case Intrinsic::arm64_neon_ld2r:
-  case Intrinsic::arm64_neon_ld3r:
-  case Intrinsic::arm64_neon_ld4r: {
-    Info.opc = ISD::INTRINSIC_W_CHAIN;
-    // Conservatively set memVT to the entire set of vectors loaded.
-    uint64_t NumElts = getDataLayout()->getTypeAllocSize(I.getType()) / 8;
-    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
-    Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1);
-    Info.offset = 0;
-    Info.align = 0;
-    Info.vol = false; // volatile loads with NEON intrinsics not supported
-    Info.readMem = true;
-    Info.writeMem = false;
-    return true;
-  }
-  case Intrinsic::arm64_neon_st2:
-  case Intrinsic::arm64_neon_st3:
-  case Intrinsic::arm64_neon_st4:
-  case Intrinsic::arm64_neon_st2lane:
-  case Intrinsic::arm64_neon_st3lane:
-  case Intrinsic::arm64_neon_st4lane: {
-    Info.opc = ISD::INTRINSIC_VOID;
-    // Conservatively set memVT to the entire set of vectors stored.
-    unsigned NumElts = 0;
-    for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) {
-      Type *ArgTy = I.getArgOperand(ArgI)->getType();
-      if (!ArgTy->isVectorTy())
-        break;
-      NumElts += getDataLayout()->getTypeAllocSize(ArgTy) / 8;
-    }
-    Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
-    Info.ptrVal = I.getArgOperand(I.getNumArgOperands() - 1);
-    Info.offset = 0;
-    Info.align = 0;
-    Info.vol = false; // volatile stores with NEON intrinsics not supported
-    Info.readMem = false;
-    Info.writeMem = true;
-    return true;
-  }
-  case Intrinsic::arm64_ldxr: {
-    PointerType *PtrTy = cast<PointerType>(I.getArgOperand(0)->getType());
-    Info.opc = ISD::INTRINSIC_W_CHAIN;
-    Info.memVT = MVT::getVT(PtrTy->getElementType());
-    Info.ptrVal = I.getArgOperand(0);
-    Info.offset = 0;
-    Info.align = getDataLayout()->getABITypeAlignment(PtrTy->getElementType());
-    Info.vol = true;
-    Info.readMem = true;
-    Info.writeMem = false;
-    return true;
-  }
-  case Intrinsic::arm64_stxr: {
-    PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType());
-    Info.opc = ISD::INTRINSIC_W_CHAIN;
-    Info.memVT = MVT::getVT(PtrTy->getElementType());
-    Info.ptrVal = I.getArgOperand(1);
-    Info.offset = 0;
-    Info.align = getDataLayout()->getABITypeAlignment(PtrTy->getElementType());
-    Info.vol = true;
-    Info.readMem = false;
-    Info.writeMem = true;
-    return true;
-  }
-  case Intrinsic::arm64_ldxp: {
-    Info.opc = ISD::INTRINSIC_W_CHAIN;
-    Info.memVT = MVT::i128;
-    Info.ptrVal = I.getArgOperand(0);
-    Info.offset = 0;
-    Info.align = 16;
-    Info.vol = true;
-    Info.readMem = true;
-    Info.writeMem = false;
-    return true;
-  }
-  case Intrinsic::arm64_stxp: {
-    Info.opc = ISD::INTRINSIC_W_CHAIN;
-    Info.memVT = MVT::i128;
-    Info.ptrVal = I.getArgOperand(2);
-    Info.offset = 0;
-    Info.align = 16;
-    Info.vol = true;
-    Info.readMem = false;
-    Info.writeMem = true;
-    return true;
-  }
-  default:
-    break;
-  }
-
-  return false;
-}
-
-// Truncations from 64-bit GPR to 32-bit GPR is free.
-bool ARM64TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
-  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
-    return false;
-  unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
-  unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
-  if (NumBits1 <= NumBits2)
-    return false;
-  return true;
-}
-bool ARM64TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
-  if (!VT1.isInteger() || !VT2.isInteger())
-    return false;
-  unsigned NumBits1 = VT1.getSizeInBits();
-  unsigned NumBits2 = VT2.getSizeInBits();
-  if (NumBits1 <= NumBits2)
-    return false;
-  return true;
-}
-
-// All 32-bit GPR operations implicitly zero the high-half of the corresponding
-// 64-bit GPR.
-bool ARM64TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
-  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
-    return false;
-  unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
-  unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
-  if (NumBits1 == 32 && NumBits2 == 64)
-    return true;
-  return false;
-}
-bool ARM64TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
-  if (!VT1.isInteger() || !VT2.isInteger())
-    return false;
-  unsigned NumBits1 = VT1.getSizeInBits();
-  unsigned NumBits2 = VT2.getSizeInBits();
-  if (NumBits1 == 32 && NumBits2 == 64)
-    return true;
-  return false;
-}
-
-bool ARM64TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
-  EVT VT1 = Val.getValueType();
-  if (isZExtFree(VT1, VT2)) {
-    return true;
-  }
-
-  if (Val.getOpcode() != ISD::LOAD)
-    return false;
-
-  // 8-, 16-, and 32-bit integer loads all implicitly zero-extend.
-  return (VT1.isSimple() && VT1.isInteger() && VT2.isSimple() &&
-          VT2.isInteger() && VT1.getSizeInBits() <= 32);
-}
-
-bool ARM64TargetLowering::hasPairedLoad(Type *LoadedType,
-                                        unsigned &RequiredAligment) const {
-  if (!LoadedType->isIntegerTy() && !LoadedType->isFloatTy())
-    return false;
-  // Cyclone supports unaligned accesses.
-  RequiredAligment = 0;
-  unsigned NumBits = LoadedType->getPrimitiveSizeInBits();
-  return NumBits == 32 || NumBits == 64;
-}
-
-bool ARM64TargetLowering::hasPairedLoad(EVT LoadedType,
-                                        unsigned &RequiredAligment) const {
-  if (!LoadedType.isSimple() ||
-      (!LoadedType.isInteger() && !LoadedType.isFloatingPoint()))
-    return false;
-  // Cyclone supports unaligned accesses.
-  RequiredAligment = 0;
-  unsigned NumBits = LoadedType.getSizeInBits();
-  return NumBits == 32 || NumBits == 64;
-}
-
-static bool memOpAlign(unsigned DstAlign, unsigned SrcAlign,
-                       unsigned AlignCheck) {
-  return ((SrcAlign == 0 || SrcAlign % AlignCheck == 0) &&
-          (DstAlign == 0 || DstAlign % AlignCheck == 0));
-}
-
-EVT ARM64TargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
-                                             unsigned SrcAlign, bool IsMemset,
-                                             bool ZeroMemset, bool MemcpyStrSrc,
-                                             MachineFunction &MF) const {
-  // Don't use AdvSIMD to implement 16-byte memset. It would have taken one
-  // instruction to materialize the v2i64 zero and one store (with restrictive
-  // addressing mode). Just do two i64 store of zero-registers.
-  bool Fast;
-  const Function *F = MF.getFunction();
-  if (!IsMemset && Size >= 16 &&
-      !F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
-                                       Attribute::NoImplicitFloat) &&
-      (memOpAlign(SrcAlign, DstAlign, 16) ||
-       (allowsUnalignedMemoryAccesses(MVT::v2i64, 0, &Fast) && Fast)))
-    return MVT::v2i64;
-
-  return Size >= 8 ? MVT::i64 : MVT::i32;
-}
-
-// 12-bit optionally shifted immediates are legal for adds.
-bool ARM64TargetLowering::isLegalAddImmediate(int64_t Immed) const {
-  if ((Immed >> 12) == 0 || ((Immed & 0xfff) == 0 && Immed >> 24 == 0))
-    return true;
-  return false;
-}
-
-// Integer comparisons are implemented with ADDS/SUBS, so the range of valid
-// immediates is the same as for an add or a sub.
-bool ARM64TargetLowering::isLegalICmpImmediate(int64_t Immed) const {
-  if (Immed < 0)
-    Immed *= -1;
-  return isLegalAddImmediate(Immed);
-}
-
-/// isLegalAddressingMode - Return true if the addressing mode represented
-/// by AM is legal for this target, for a load/store of the specified type.
-bool ARM64TargetLowering::isLegalAddressingMode(const AddrMode &AM,
-                                                Type *Ty) const {
-  // ARM64 has five basic addressing modes:
-  //  reg
-  //  reg + 9-bit signed offset
-  //  reg + SIZE_IN_BYTES * 12-bit unsigned offset
-  //  reg1 + reg2
-  //  reg + SIZE_IN_BYTES * reg
-
-  // No global is ever allowed as a base.
-  if (AM.BaseGV)
-    return false;
-
-  // No reg+reg+imm addressing.
-  if (AM.HasBaseReg && AM.BaseOffs && AM.Scale)
-    return false;
-
-  // check reg + imm case:
-  // i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12
-  uint64_t NumBytes = 0;
-  if (Ty->isSized()) {
-    uint64_t NumBits = getDataLayout()->getTypeSizeInBits(Ty);
-    NumBytes = NumBits / 8;
-    if (!isPowerOf2_64(NumBits))
-      NumBytes = 0;
-  }
-
-  if (!AM.Scale) {
-    int64_t Offset = AM.BaseOffs;
-
-    // 9-bit signed offset
-    if (Offset >= -(1LL << 9) && Offset <= (1LL << 9) - 1)
-      return true;
-
-    // 12-bit unsigned offset
-    unsigned shift = Log2_64(NumBytes);
-    if (NumBytes && Offset > 0 && (Offset / NumBytes) <= (1LL << 12) - 1 &&
-        // Must be a multiple of NumBytes (NumBytes is a power of 2)
-        (Offset >> shift) << shift == Offset)
-      return true;
-    return false;
-  }
-
-  // Check reg1 + SIZE_IN_BYTES * reg2 and reg1 + reg2
-
-  if (!AM.Scale || AM.Scale == 1 ||
-      (AM.Scale > 0 && (uint64_t)AM.Scale == NumBytes))
-    return true;
-  return false;
-}
-
-int ARM64TargetLowering::getScalingFactorCost(const AddrMode &AM,
-                                              Type *Ty) const {
-  // Scaling factors are not free at all.
-  // Operands                     | Rt Latency
-  // -------------------------------------------
-  // Rt, [Xn, Xm]                 | 4
-  // -------------------------------------------
-  // Rt, [Xn, Xm, lsl #imm]       | Rn: 4 Rm: 5
-  // Rt, [Xn, Wm, <extend> #imm]  |
-  if (isLegalAddressingMode(AM, Ty))
-    // Scale represents reg2 * scale, thus account for 1 if
-    // it is not equal to 0 or 1.
-    return AM.Scale != 0 && AM.Scale != 1;
-  return -1;
-}
-
-bool ARM64TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
-  VT = VT.getScalarType();
-
-  if (!VT.isSimple())
-    return false;
-
-  switch (VT.getSimpleVT().SimpleTy) {
-  case MVT::f32:
-  case MVT::f64:
-    return true;
-  default:
-    break;
-  }
-
-  return false;
-}
-
-const uint16_t *
-ARM64TargetLowering::getScratchRegisters(CallingConv::ID) const {
-  // LR is a callee-save register, but we must treat it as clobbered by any call
-  // site. Hence we include LR in the scratch registers, which are in turn added
-  // as implicit-defs for stackmaps and patchpoints.
-  static const uint16_t ScratchRegs[] = {
-    ARM64::X16, ARM64::X17, ARM64::LR, 0
-  };
-  return ScratchRegs;
-}
-
-bool ARM64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
-                                                            Type *Ty) const {
-  assert(Ty->isIntegerTy());
-
-  unsigned BitSize = Ty->getPrimitiveSizeInBits();
-  if (BitSize == 0)
-    return false;
-
-  int64_t Val = Imm.getSExtValue();
-  if (Val == 0 || ARM64_AM::isLogicalImmediate(Val, BitSize))
-    return true;
-
-  if ((int64_t)Val < 0)
-    Val = ~Val;
-  if (BitSize == 32)
-    Val &= (1LL << 32) - 1;
-
-  unsigned LZ = countLeadingZeros((uint64_t)Val);
-  unsigned Shift = (63 - LZ) / 16;
-  // MOVZ is free so return true for one or fewer MOVK.
-  return (Shift < 3) ? true : false;
-}
-
-// Generate SUBS and CSEL for integer abs.
-static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) {
-  EVT VT = N->getValueType(0);
-
-  SDValue N0 = N->getOperand(0);
-  SDValue N1 = N->getOperand(1);
-  SDLoc DL(N);
-
-  // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1)
-  // and change it to SUB and CSEL.
-  if (VT.isInteger() && N->getOpcode() == ISD::XOR &&
-      N0.getOpcode() == ISD::ADD && N0.getOperand(1) == N1 &&
-      N1.getOpcode() == ISD::SRA && N1.getOperand(0) == N0.getOperand(0))
-    if (ConstantSDNode *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1)))
-      if (Y1C->getAPIntValue() == VT.getSizeInBits() - 1) {
-        SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, VT),
-                                  N0.getOperand(0));
-        // Generate SUBS & CSEL.
-        SDValue Cmp =
-            DAG.getNode(ARM64ISD::SUBS, DL, DAG.getVTList(VT, MVT::i32),
-                        N0.getOperand(0), DAG.getConstant(0, VT));
-        return DAG.getNode(ARM64ISD::CSEL, DL, VT, N0.getOperand(0), Neg,
-                           DAG.getConstant(ARM64CC::PL, MVT::i32),
-                           SDValue(Cmp.getNode(), 1));
-      }
-  return SDValue();
-}
-
-// performXorCombine - Attempts to handle integer ABS.
-static SDValue performXorCombine(SDNode *N, SelectionDAG &DAG,
-                                 TargetLowering::DAGCombinerInfo &DCI,
-                                 const ARM64Subtarget *Subtarget) {
-  if (DCI.isBeforeLegalizeOps())
-    return SDValue();
-
-  return performIntegerAbsCombine(N, DAG);
-}
-
-static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
-                                 TargetLowering::DAGCombinerInfo &DCI,
-                                 const ARM64Subtarget *Subtarget) {
-  if (DCI.isBeforeLegalizeOps())
-    return SDValue();
-
-  // Multiplication of a power of two plus/minus one can be done more
-  // cheaply as as shift+add/sub. For now, this is true unilaterally. If
-  // future CPUs have a cheaper MADD instruction, this may need to be
-  // gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and
-  // 64-bit is 5 cycles, so this is always a win.
-  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
-    APInt Value = C->getAPIntValue();
-    EVT VT = N->getValueType(0);
-    APInt VP1 = Value + 1;
-    if (VP1.isPowerOf2()) {
-      // Multiplying by one less than a power of two, replace with a shift
-      // and a subtract.
-      SDValue ShiftedVal =
-          DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0),
-                      DAG.getConstant(VP1.logBase2(), MVT::i64));
-      return DAG.getNode(ISD::SUB, SDLoc(N), VT, ShiftedVal, N->getOperand(0));
-    }
-    APInt VM1 = Value - 1;
-    if (VM1.isPowerOf2()) {
-      // Multiplying by one more than a power of two, replace with a shift
-      // and an add.
-      SDValue ShiftedVal =
-          DAG.getNode(ISD::SHL, SDLoc(N), VT, N->getOperand(0),
-                      DAG.getConstant(VM1.logBase2(), MVT::i64));
-      return DAG.getNode(ISD::ADD, SDLoc(N), VT, ShiftedVal, N->getOperand(0));
-    }
-  }
-  return SDValue();
-}
-
-static SDValue performIntToFpCombine(SDNode *N, SelectionDAG &DAG) {
-  EVT VT = N->getValueType(0);
-  if (VT != MVT::f32 && VT != MVT::f64)
-    return SDValue();
-  // Only optimize when the source and destination types have the same width.
-  if (VT.getSizeInBits() != N->getOperand(0).getValueType().getSizeInBits())
-    return SDValue();
-
-  // If the result of an integer load is only used by an integer-to-float
-  // conversion, use a fp load instead and a AdvSIMD scalar {S|U}CVTF instead.
-  // This eliminates an "integer-to-vector-move UOP and improve throughput.
-  SDValue N0 = N->getOperand(0);
-  if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
-      // Do not change the width of a volatile load.
-      !cast<LoadSDNode>(N0)->isVolatile()) {
-    LoadSDNode *LN0 = cast<LoadSDNode>(N0);
-    SDValue Load = DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(),
-                               LN0->getPointerInfo(), LN0->isVolatile(),
-                               LN0->isNonTemporal(), LN0->isInvariant(),
-                               LN0->getAlignment());
-
-    // Make sure successors of the original load stay after it by updating them
-    // to use the new Chain.
-    DAG.ReplaceAllUsesOfValueWith(SDValue(LN0, 1), Load.getValue(1));
-
-    unsigned Opcode =
-        (N->getOpcode() == ISD::SINT_TO_FP) ? ARM64ISD::SITOF : ARM64ISD::UITOF;
-    return DAG.getNode(Opcode, SDLoc(N), VT, Load);
-  }
-
-  return SDValue();
-}
-
-/// An EXTR instruction is made up of two shifts, ORed together. This helper
-/// searches for and classifies those shifts.
-static bool findEXTRHalf(SDValue N, SDValue &Src, uint32_t &ShiftAmount,
-                         bool &FromHi) {
-  if (N.getOpcode() == ISD::SHL)
-    FromHi = false;
-  else if (N.getOpcode() == ISD::SRL)
-    FromHi = true;
-  else
-    return false;
-
-  if (!isa<ConstantSDNode>(N.getOperand(1)))
-    return false;
-
-  ShiftAmount = N->getConstantOperandVal(1);
-  Src = N->getOperand(0);
-  return true;
-}
-
-/// EXTR instruction extracts a contiguous chunk of bits from two existing
-/// registers viewed as a high/low pair. This function looks for the pattern:
-/// (or (shl VAL1, #N), (srl VAL2, #RegWidth-N)) and replaces it with an
-/// EXTR. Can't quite be done in TableGen because the two immediates aren't
-/// independent.
-static SDValue tryCombineToEXTR(SDNode *N,
-                                TargetLowering::DAGCombinerInfo &DCI) {
-  SelectionDAG &DAG = DCI.DAG;
-  SDLoc DL(N);
-  EVT VT = N->getValueType(0);
-
-  assert(N->getOpcode() == ISD::OR && "Unexpected root");
-
-  if (VT != MVT::i32 && VT != MVT::i64)
-    return SDValue();
-
-  SDValue LHS;
-  uint32_t ShiftLHS = 0;
-  bool LHSFromHi = 0;
-  if (!findEXTRHalf(N->getOperand(0), LHS, ShiftLHS, LHSFromHi))
-    return SDValue();
-
-  SDValue RHS;
-  uint32_t ShiftRHS = 0;
-  bool RHSFromHi = 0;
-  if (!findEXTRHalf(N->getOperand(1), RHS, ShiftRHS, RHSFromHi))
-    return SDValue();
-
-  // If they're both trying to come from the high part of the register, they're
-  // not really an EXTR.
-  if (LHSFromHi == RHSFromHi)
-    return SDValue();
-
-  if (ShiftLHS + ShiftRHS != VT.getSizeInBits())
-    return SDValue();
-
-  if (LHSFromHi) {
-    std::swap(LHS, RHS);
-    std::swap(ShiftLHS, ShiftRHS);
-  }
-
-  return DAG.getNode(ARM64ISD::EXTR, DL, VT, LHS, RHS,
-                     DAG.getConstant(ShiftRHS, MVT::i64));
-}
-
-static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
-                                const ARM64Subtarget *Subtarget) {
-  // Attempt to form an EXTR from (or (shl VAL1, #N), (srl VAL2, #RegWidth-N))
-  if (!EnableARM64ExtrGeneration)
-    return SDValue();
-  SelectionDAG &DAG = DCI.DAG;
-  EVT VT = N->getValueType(0);
-
-  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
-    return SDValue();
-
-  SDValue Res = tryCombineToEXTR(N, DCI);
-  if (Res.getNode())
-    return Res;
-
-  return SDValue();
-}
-
-static SDValue performBitcastCombine(SDNode *N,
-                                     TargetLowering::DAGCombinerInfo &DCI,
-                                     SelectionDAG &DAG) {
-  // Wait 'til after everything is legalized to try this. That way we have
-  // legal vector types and such.
-  if (DCI.isBeforeLegalizeOps())
-    return SDValue();
-
-  // Remove extraneous bitcasts around an extract_subvector.
-  // For example,
-  //    (v4i16 (bitconvert
-  //             (extract_subvector (v2i64 (bitconvert (v8i16 ...)), (i64 1)))))
-  //  becomes
-  //    (extract_subvector ((v8i16 ...), (i64 4)))
-
-  // Only interested in 64-bit vectors as the ultimate result.
-  EVT VT = N->getValueType(0);
-  if (!VT.isVector())
-    return SDValue();
-  if (VT.getSimpleVT().getSizeInBits() != 64)
-    return SDValue();
-  // Is the operand an extract_subvector starting at the beginning or halfway
-  // point of the vector? A low half may also come through as an
-  // EXTRACT_SUBREG, so look for that, too.
-  SDValue Op0 = N->getOperand(0);
-  if (Op0->getOpcode() != ISD::EXTRACT_SUBVECTOR &&
-      !(Op0->isMachineOpcode() &&
-        Op0->getMachineOpcode() == ARM64::EXTRACT_SUBREG))
-    return SDValue();
-  uint64_t idx = cast<ConstantSDNode>(Op0->getOperand(1))->getZExtValue();
-  if (Op0->getOpcode() == ISD::EXTRACT_SUBVECTOR) {
-    if (Op0->getValueType(0).getVectorNumElements() != idx && idx != 0)
-      return SDValue();
-  } else if (Op0->getMachineOpcode() == ARM64::EXTRACT_SUBREG) {
-    if (idx != ARM64::dsub)
-      return SDValue();
-    // The dsub reference is equivalent to a lane zero subvector reference.
-    idx = 0;
-  }
-  // Look through the bitcast of the input to the extract.
-  if (Op0->getOperand(0)->getOpcode() != ISD::BITCAST)
-    return SDValue();
-  SDValue Source = Op0->getOperand(0)->getOperand(0);
-  // If the source type has twice the number of elements as our destination
-  // type, we know this is an extract of the high or low half of the vector.
-  EVT SVT = Source->getValueType(0);
-  if (SVT.getVectorNumElements() != VT.getVectorNumElements() * 2)
-    return SDValue();
-
-  DEBUG(dbgs() << "arm64-lower: bitcast extract_subvector simplification\n");
-
-  // Create the simplified form to just extract the low or high half of the
-  // vector directly rather than bothering with the bitcasts.
-  SDLoc dl(N);
-  unsigned NumElements = VT.getVectorNumElements();
-  if (idx) {
-    SDValue HalfIdx = DAG.getConstant(NumElements, MVT::i64);
-    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Source, HalfIdx);
-  } else {
-    SDValue SubReg = DAG.getTargetConstant(ARM64::dsub, MVT::i32);
-    return SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl, VT,
-                                      Source, SubReg),
-                   0);
-  }
-}
-
-static SDValue performConcatVectorsCombine(SDNode *N,
-                                           TargetLowering::DAGCombinerInfo &DCI,
-                                           SelectionDAG &DAG) {
-  // Wait 'til after everything is legalized to try this. That way we have
-  // legal vector types and such.
-  if (DCI.isBeforeLegalizeOps())
-    return SDValue();
-
-  SDLoc dl(N);
-  EVT VT = N->getValueType(0);
-
-  // If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector
-  // splat. The indexed instructions are going to be expecting a DUPLANE64, so
-  // canonicalise to that.
-  if (N->getOperand(0) == N->getOperand(1) && VT.getVectorNumElements() == 2) {
-    assert(VT.getVectorElementType().getSizeInBits() == 64);
-    return DAG.getNode(ARM64ISD::DUPLANE64, dl, VT,
-                       WidenVector(N->getOperand(0), DAG),
-                       DAG.getConstant(0, MVT::i64));
-  }
-
-  // Canonicalise concat_vectors so that the right-hand vector has as few
-  // bit-casts as possible before its real operation. The primary matching
-  // destination for these operations will be the narrowing "2" instructions,
-  // which depend on the operation being performed on this right-hand vector.
-  // For example,
-  //    (concat_vectors LHS,  (v1i64 (bitconvert (v4i16 RHS))))
-  // becomes
-  //    (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS))
-
-  SDValue Op1 = N->getOperand(1);
-  if (Op1->getOpcode() != ISD::BITCAST)
-    return SDValue();
-  SDValue RHS = Op1->getOperand(0);
-  MVT RHSTy = RHS.getValueType().getSimpleVT();
-  // If the RHS is not a vector, this is not the pattern we're looking for.
-  if (!RHSTy.isVector())
-    return SDValue();
-
-  DEBUG(dbgs() << "arm64-lower: concat_vectors bitcast simplification\n");
-
-  MVT ConcatTy = MVT::getVectorVT(RHSTy.getVectorElementType(),
-                                  RHSTy.getVectorNumElements() * 2);
-  return DAG.getNode(
-      ISD::BITCAST, dl, VT,
-      DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatTy,
-                  DAG.getNode(ISD::BITCAST, dl, RHSTy, N->getOperand(0)), RHS));
-}
-
-static SDValue tryCombineFixedPointConvert(SDNode *N,
-                                           TargetLowering::DAGCombinerInfo &DCI,
-                                           SelectionDAG &DAG) {
-  // Wait 'til after everything is legalized to try this. That way we have
-  // legal vector types and such.
-  if (DCI.isBeforeLegalizeOps())
-    return SDValue();
-  // Transform a scalar conversion of a value from a lane extract into a
-  // lane extract of a vector conversion. E.g., from foo1 to foo2:
-  // double foo1(int64x2_t a) { return vcvtd_n_f64_s64(a[1], 9); }
-  // double foo2(int64x2_t a) { return vcvtq_n_f64_s64(a, 9)[1]; }
-  //
-  // The second form interacts better with instruction selection and the
-  // register allocator to avoid cross-class register copies that aren't
-  // coalescable due to a lane reference.
-
-  // Check the operand and see if it originates from a lane extract.
-  SDValue Op1 = N->getOperand(1);
-  if (Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
-    // Yep, no additional predication needed. Perform the transform.
-    SDValue IID = N->getOperand(0);
-    SDValue Shift = N->getOperand(2);
-    SDValue Vec = Op1.getOperand(0);
-    SDValue Lane = Op1.getOperand(1);
-    EVT ResTy = N->getValueType(0);
-    EVT VecResTy;
-    SDLoc DL(N);
-
-    // The vector width should be 128 bits by the time we get here, even
-    // if it started as 64 bits (the extract_vector handling will have
-    // done so).
-    assert(Vec.getValueType().getSizeInBits() == 128 &&
-           "unexpected vector size on extract_vector_elt!");
-    if (Vec.getValueType() == MVT::v4i32)
-      VecResTy = MVT::v4f32;
-    else if (Vec.getValueType() == MVT::v2i64)
-      VecResTy = MVT::v2f64;
-    else
-      assert(0 && "unexpected vector type!");
-
-    SDValue Convert =
-        DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VecResTy, IID, Vec, Shift);
-    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ResTy, Convert, Lane);
-  }
-  return SDValue();
-}
-
-// AArch64 high-vector "long" operations are formed by performing the non-high
-// version on an extract_subvector of each operand which gets the high half:
-//
-//  (longop2 LHS, RHS) == (longop (extract_high LHS), (extract_high RHS))
-//
-// However, there are cases which don't have an extract_high explicitly, but
-// have another operation that can be made compatible with one for free. For
-// example:
-//
-//  (dupv64 scalar) --> (extract_high (dup128 scalar))
-//
-// This routine does the actual conversion of such DUPs, once outer routines
-// have determined that everything else is in order.
-static SDValue tryExtendDUPToExtractHigh(SDValue N, SelectionDAG &DAG) {
-  // We can handle most types of duplicate, but the lane ones have an extra
-  // operand saying *which* lane, so we need to know.
-  bool IsDUPLANE;
-  switch (N.getOpcode()) {
-  case ARM64ISD::DUP:
-    IsDUPLANE = false;
-    break;
-  case ARM64ISD::DUPLANE8:
-  case ARM64ISD::DUPLANE16:
-  case ARM64ISD::DUPLANE32:
-  case ARM64ISD::DUPLANE64:
-    IsDUPLANE = true;
-    break;
-  default:
-    return SDValue();
-  }
-
-  MVT NarrowTy = N.getSimpleValueType();
-  if (!NarrowTy.is64BitVector())
-    return SDValue();
-
-  MVT ElementTy = NarrowTy.getVectorElementType();
-  unsigned NumElems = NarrowTy.getVectorNumElements();
-  MVT NewDUPVT = MVT::getVectorVT(ElementTy, NumElems * 2);
-
-  SDValue NewDUP;
-  if (IsDUPLANE)
-    NewDUP = DAG.getNode(N.getOpcode(), SDLoc(N), NewDUPVT, N.getOperand(0),
-                         N.getOperand(1));
-  else
-    NewDUP = DAG.getNode(ARM64ISD::DUP, SDLoc(N), NewDUPVT, N.getOperand(0));
-
-  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N.getNode()), NarrowTy,
-                     NewDUP, DAG.getConstant(NumElems, MVT::i64));
-}
-
-static bool isEssentiallyExtractSubvector(SDValue N) {
-  if (N.getOpcode() == ISD::EXTRACT_SUBVECTOR)
-    return true;
-
-  return N.getOpcode() == ISD::BITCAST &&
-         N.getOperand(0).getOpcode() == ISD::EXTRACT_SUBVECTOR;
-}
-
-/// \brief Helper structure to keep track of ISD::SET_CC operands.
-struct GenericSetCCInfo {
-  const SDValue *Opnd0;
-  const SDValue *Opnd1;
-  ISD::CondCode CC;
-};
-
-/// \brief Helper structure to keep track of a SET_CC lowered into ARM64 code.
-struct ARM64SetCCInfo {
-  const SDValue *Cmp;
-  ARM64CC::CondCode CC;
-};
-
-/// \brief Helper structure to keep track of SetCC information.
-union SetCCInfo {
-  GenericSetCCInfo Generic;
-  ARM64SetCCInfo ARM64;
-};
-
-/// \brief Helper structure to be able to read SetCC information.
-/// If set to true, IsARM64 field, Info is a ARM64SetCCInfo, otherwise Info is
-/// a GenericSetCCInfo.
-struct SetCCInfoAndKind {
-  SetCCInfo Info;
-  bool IsARM64;
-};
-
-/// \brief Check whether or not \p Op is a SET_CC operation, either a generic or
-/// an
-/// ARM64 lowered one.
-/// \p SetCCInfo is filled accordingly.
-/// \post SetCCInfo is meanginfull only when this function returns true.
-/// \return True when Op is a kind of SET_CC operation.
-static bool isSetCC(SDValue Op, SetCCInfoAndKind &SetCCInfo) {
-  // If this is a setcc, this is straight forward.
-  if (Op.getOpcode() == ISD::SETCC) {
-    SetCCInfo.Info.Generic.Opnd0 = &Op.getOperand(0);
-    SetCCInfo.Info.Generic.Opnd1 = &Op.getOperand(1);
-    SetCCInfo.Info.Generic.CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
-    SetCCInfo.IsARM64 = false;
-    return true;
-  }
-  // Otherwise, check if this is a matching csel instruction.
-  // In other words:
-  // - csel 1, 0, cc
-  // - csel 0, 1, !cc
-  if (Op.getOpcode() != ARM64ISD::CSEL)
-    return false;
-  // Set the information about the operands.
-  // TODO: we want the operands of the Cmp not the csel
-  SetCCInfo.Info.ARM64.Cmp = &Op.getOperand(3);
-  SetCCInfo.IsARM64 = true;
-  SetCCInfo.Info.ARM64.CC = static_cast<ARM64CC::CondCode>(
-      cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
-
-  // Check that the operands matches the constraints:
-  // (1) Both operands must be constants.
-  // (2) One must be 1 and the other must be 0.
-  ConstantSDNode *TValue = dyn_cast<ConstantSDNode>(Op.getOperand(0));
-  ConstantSDNode *FValue = dyn_cast<ConstantSDNode>(Op.getOperand(1));
-
-  // Check (1).
-  if (!TValue || !FValue)
-    return false;
-
-  // Check (2).
-  if (!TValue->isOne()) {
-    // Update the comparison when we are interested in !cc.
-    std::swap(TValue, FValue);
-    SetCCInfo.Info.ARM64.CC =
-        ARM64CC::getInvertedCondCode(SetCCInfo.Info.ARM64.CC);
-  }
-  return TValue->isOne() && FValue->isNullValue();
-}
-
-// The folding we want to perform is:
-// (add x, (setcc cc ...) )
-//   -->
-// (csel x, (add x, 1), !cc ...)
-//
-// The latter will get matched to a CSINC instruction.
-static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG) {
-  assert(Op && Op->getOpcode() == ISD::ADD && "Unexpected operation!");
-  SDValue LHS = Op->getOperand(0);
-  SDValue RHS = Op->getOperand(1);
-  SetCCInfoAndKind InfoAndKind;
-
-  // If neither operand is a SET_CC, give up.
-  if (!isSetCC(LHS, InfoAndKind)) {
-    std::swap(LHS, RHS);
-    if (!isSetCC(LHS, InfoAndKind))
-      return SDValue();
-  }
-
-  // FIXME: This could be generatized to work for FP comparisons.
-  EVT CmpVT = InfoAndKind.IsARM64
-                  ? InfoAndKind.Info.ARM64.Cmp->getOperand(0).getValueType()
-                  : InfoAndKind.Info.Generic.Opnd0->getValueType();
-  if (CmpVT != MVT::i32 && CmpVT != MVT::i64)
-    return SDValue();
-
-  SDValue CCVal;
-  SDValue Cmp;
-  SDLoc dl(Op);
-  if (InfoAndKind.IsARM64) {
-    CCVal = DAG.getConstant(
-        ARM64CC::getInvertedCondCode(InfoAndKind.Info.ARM64.CC), MVT::i32);
-    Cmp = *InfoAndKind.Info.ARM64.Cmp;
-  } else
-    Cmp = getARM64Cmp(*InfoAndKind.Info.Generic.Opnd0,
-                      *InfoAndKind.Info.Generic.Opnd1,
-                      ISD::getSetCCInverse(InfoAndKind.Info.Generic.CC, true),
-                      CCVal, DAG, dl);
-
-  EVT VT = Op->getValueType(0);
-  LHS = DAG.getNode(ISD::ADD, dl, VT, RHS, DAG.getConstant(1, VT));
-  return DAG.getNode(ARM64ISD::CSEL, dl, VT, RHS, LHS, CCVal, Cmp);
-}
-
-// The basic add/sub long vector instructions have variants with "2" on the end
-// which act on the high-half of their inputs. They are normally matched by
-// patterns like:
-//
-// (add (zeroext (extract_high LHS)),
-//      (zeroext (extract_high RHS)))
-// -> uaddl2 vD, vN, vM
-//
-// However, if one of the extracts is something like a duplicate, this
-// instruction can still be used profitably. This function puts the DAG into a
-// more appropriate form for those patterns to trigger.
-static SDValue performAddSubLongCombine(SDNode *N,
-                                        TargetLowering::DAGCombinerInfo &DCI,
-                                        SelectionDAG &DAG) {
-  if (DCI.isBeforeLegalizeOps())
-    return SDValue();
-
-  MVT VT = N->getSimpleValueType(0);
-  if (!VT.is128BitVector()) {
-    if (N->getOpcode() == ISD::ADD)
-      return performSetccAddFolding(N, DAG);
-    return SDValue();
-  }
-
-  // Make sure both branches are extended in the same way.
-  SDValue LHS = N->getOperand(0);
-  SDValue RHS = N->getOperand(1);
-  if ((LHS.getOpcode() != ISD::ZERO_EXTEND &&
-       LHS.getOpcode() != ISD::SIGN_EXTEND) ||
-      LHS.getOpcode() != RHS.getOpcode())
-    return SDValue();
-
-  unsigned ExtType = LHS.getOpcode();
-
-  // It's not worth doing if at least one of the inputs isn't already an
-  // extract, but we don't know which it'll be so we have to try both.
-  if (isEssentiallyExtractSubvector(LHS.getOperand(0))) {
-    RHS = tryExtendDUPToExtractHigh(RHS.getOperand(0), DAG);
-    if (!RHS.getNode())
-      return SDValue();
-
-    RHS = DAG.getNode(ExtType, SDLoc(N), VT, RHS);
-  } else if (isEssentiallyExtractSubvector(RHS.getOperand(0))) {
-    LHS = tryExtendDUPToExtractHigh(LHS.getOperand(0), DAG);
-    if (!LHS.getNode())
-      return SDValue();
-
-    LHS = DAG.getNode(ExtType, SDLoc(N), VT, LHS);
-  }
-
-  return DAG.getNode(N->getOpcode(), SDLoc(N), VT, LHS, RHS);
-}
-
-// Massage DAGs which we can use the high-half "long" operations on into
-// something isel will recognize better. E.g.
-//
-// (arm64_neon_umull (extract_high vec) (dupv64 scalar)) -->
-//   (arm64_neon_umull (extract_high (v2i64 vec)))
-//                     (extract_high (v2i64 (dup128 scalar)))))
-//
-static SDValue tryCombineLongOpWithDup(unsigned IID, SDNode *N,
-                                       TargetLowering::DAGCombinerInfo &DCI,
-                                       SelectionDAG &DAG) {
-  if (DCI.isBeforeLegalizeOps())
-    return SDValue();
-
-  SDValue LHS = N->getOperand(1);
-  SDValue RHS = N->getOperand(2);
-  assert(LHS.getValueType().is64BitVector() &&
-         RHS.getValueType().is64BitVector() &&
-         "unexpected shape for long operation");
-
-  // Either node could be a DUP, but it's not worth doing both of them (you'd
-  // just as well use the non-high version) so look for a corresponding extract
-  // operation on the other "wing".
-  if (isEssentiallyExtractSubvector(LHS)) {
-    RHS = tryExtendDUPToExtractHigh(RHS, DAG);
-    if (!RHS.getNode())
-      return SDValue();
-  } else if (isEssentiallyExtractSubvector(RHS)) {
-    LHS = tryExtendDUPToExtractHigh(LHS, DAG);
-    if (!LHS.getNode())
-      return SDValue();
-  }
-
-  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), N->getValueType(0),
-                     N->getOperand(0), LHS, RHS);
-}
-
-static SDValue tryCombineShiftImm(unsigned IID, SDNode *N, SelectionDAG &DAG) {
-  MVT ElemTy = N->getSimpleValueType(0).getScalarType();
-  unsigned ElemBits = ElemTy.getSizeInBits();
-
-  int64_t ShiftAmount;
-  if (BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(2))) {
-    APInt SplatValue, SplatUndef;
-    unsigned SplatBitSize;
-    bool HasAnyUndefs;
-    if (!BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
-                              HasAnyUndefs, ElemBits) ||
-        SplatBitSize != ElemBits)
-      return SDValue();
-
-    ShiftAmount = SplatValue.getSExtValue();
-  } else if (ConstantSDNode *CVN = dyn_cast<ConstantSDNode>(N->getOperand(2))) {
-    ShiftAmount = CVN->getSExtValue();
-  } else
-    return SDValue();
-
-  unsigned Opcode;
-  bool IsRightShift;
-  switch (IID) {
-  default:
-    llvm_unreachable("Unknown shift intrinsic");
-  case Intrinsic::arm64_neon_sqshl:
-    Opcode = ARM64ISD::SQSHL_I;
-    IsRightShift = false;
-    break;
-  case Intrinsic::arm64_neon_uqshl:
-    Opcode = ARM64ISD::UQSHL_I;
-    IsRightShift = false;
-    break;
-  case Intrinsic::arm64_neon_srshl:
-    Opcode = ARM64ISD::SRSHR_I;
-    IsRightShift = true;
-    break;
-  case Intrinsic::arm64_neon_urshl:
-    Opcode = ARM64ISD::URSHR_I;
-    IsRightShift = true;
-    break;
-  case Intrinsic::arm64_neon_sqshlu:
-    Opcode = ARM64ISD::SQSHLU_I;
-    IsRightShift = false;
-    break;
-  }
-
-  if (IsRightShift && ShiftAmount <= -1 && ShiftAmount >= -(int)ElemBits)
-    return DAG.getNode(Opcode, SDLoc(N), N->getValueType(0), N->getOperand(1),
-                       DAG.getConstant(-ShiftAmount, MVT::i32));
-  else if (!IsRightShift && ShiftAmount >= 0 && ShiftAmount <= ElemBits)
-    return DAG.getNode(Opcode, SDLoc(N), N->getValueType(0), N->getOperand(1),
-                       DAG.getConstant(ShiftAmount, MVT::i32));
-
-  return SDValue();
-}
-
-// The CRC32[BH] instructions ignore the high bits of their data operand. Since
-// the intrinsics must be legal and take an i32, this means there's almost
-// certainly going to be a zext in the DAG which we can eliminate.
-static SDValue tryCombineCRC32(unsigned Mask, SDNode *N, SelectionDAG &DAG) {
-  SDValue AndN = N->getOperand(2);
-  if (AndN.getOpcode() != ISD::AND)
-    return SDValue();
-
-  ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(AndN.getOperand(1));
-  if (!CMask || CMask->getZExtValue() != Mask)
-    return SDValue();
-
-  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N), MVT::i32,
-                     N->getOperand(0), N->getOperand(1), AndN.getOperand(0));
-}
-
-static SDValue performIntrinsicCombine(SDNode *N,
-                                       TargetLowering::DAGCombinerInfo &DCI,
-                                       const ARM64Subtarget *Subtarget) {
-  SelectionDAG &DAG = DCI.DAG;
-  unsigned IID = getIntrinsicID(N);
-  switch (IID) {
-  default:
-    break;
-  case Intrinsic::arm64_neon_vcvtfxs2fp:
-  case Intrinsic::arm64_neon_vcvtfxu2fp:
-    return tryCombineFixedPointConvert(N, DCI, DAG);
-    break;
-  case Intrinsic::arm64_neon_fmax:
-    return DAG.getNode(ARM64ISD::FMAX, SDLoc(N), N->getValueType(0),
-                       N->getOperand(1), N->getOperand(2));
-  case Intrinsic::arm64_neon_fmin:
-    return DAG.getNode(ARM64ISD::FMIN, SDLoc(N), N->getValueType(0),
-                       N->getOperand(1), N->getOperand(2));
-  case Intrinsic::arm64_neon_smull:
-  case Intrinsic::arm64_neon_umull:
-  case Intrinsic::arm64_neon_pmull:
-  case Intrinsic::arm64_neon_sqdmull:
-    return tryCombineLongOpWithDup(IID, N, DCI, DAG);
-  case Intrinsic::arm64_neon_sqshl:
-  case Intrinsic::arm64_neon_uqshl:
-  case Intrinsic::arm64_neon_sqshlu:
-  case Intrinsic::arm64_neon_srshl:
-  case Intrinsic::arm64_neon_urshl:
-    return tryCombineShiftImm(IID, N, DAG);
-  case Intrinsic::arm64_crc32b:
-  case Intrinsic::arm64_crc32cb:
-    return tryCombineCRC32(0xff, N, DAG);
-  case Intrinsic::arm64_crc32h:
-  case Intrinsic::arm64_crc32ch:
-    return tryCombineCRC32(0xffff, N, DAG);
-  }
-  return SDValue();
-}
-
-static SDValue performExtendCombine(SDNode *N,
-                                    TargetLowering::DAGCombinerInfo &DCI,
-                                    SelectionDAG &DAG) {
-  // If we see something like (zext (sabd (extract_high ...), (DUP ...))) then
-  // we can convert that DUP into another extract_high (of a bigger DUP), which
-  // helps the backend to decide that an sabdl2 would be useful, saving a real
-  // extract_high operation.
-  if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ZERO_EXTEND &&
-      N->getOperand(0).getOpcode() == ISD::INTRINSIC_WO_CHAIN) {
-    SDNode *ABDNode = N->getOperand(0).getNode();
-    unsigned IID = getIntrinsicID(ABDNode);
-    if (IID == Intrinsic::arm64_neon_sabd ||
-        IID == Intrinsic::arm64_neon_uabd) {
-      SDValue NewABD = tryCombineLongOpWithDup(IID, ABDNode, DCI, DAG);
-      if (!NewABD.getNode())
-        return SDValue();
-
-      return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0),
-                         NewABD);
-    }
-  }
-
-  // This is effectively a custom type legalization for ARM64.
-  //
-  // Type legalization will split an extend of a small, legal, type to a larger
-  // illegal type by first splitting the destination type, often creating
-  // illegal source types, which then get legalized in isel-confusing ways,
-  // leading to really terrible codegen. E.g.,
-  //   %result = v8i32 sext v8i8 %value
-  // becomes
-  //   %losrc = extract_subreg %value, ...
-  //   %hisrc = extract_subreg %value, ...
-  //   %lo = v4i32 sext v4i8 %losrc
-  //   %hi = v4i32 sext v4i8 %hisrc
-  // Things go rapidly downhill from there.
-  //
-  // For ARM64, the [sz]ext vector instructions can only go up one element
-  // size, so we can, e.g., extend from i8 to i16, but to go from i8 to i32
-  // take two instructions.
-  //
-  // This implies that the most efficient way to do the extend from v8i8
-  // to two v4i32 values is to first extend the v8i8 to v8i16, then do
-  // the normal splitting to happen for the v8i16->v8i32.
-
-  // This is pre-legalization to catch some cases where the default
-  // type legalization will create ill-tempered code.
-  if (!DCI.isBeforeLegalizeOps())
-    return SDValue();
-
-  // We're only interested in cleaning things up for non-legal vector types
-  // here. If both the source and destination are legal, things will just
-  // work naturally without any fiddling.
-  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-  EVT ResVT = N->getValueType(0);
-  if (!ResVT.isVector() || TLI.isTypeLegal(ResVT))
-    return SDValue();
-  // If the vector type isn't a simple VT, it's beyond the scope of what
-  // we're  worried about here. Let legalization do its thing and hope for
-  // the best.
-  if (!ResVT.isSimple())
-    return SDValue();
-
-  SDValue Src = N->getOperand(0);
-  MVT SrcVT = Src->getValueType(0).getSimpleVT();
-  // If the source VT is a 64-bit vector, we can play games and get the
-  // better results we want.
-  if (SrcVT.getSizeInBits() != 64)
-    return SDValue();
-
-  unsigned SrcEltSize = SrcVT.getVectorElementType().getSizeInBits();
-  unsigned ElementCount = SrcVT.getVectorNumElements();
-  SrcVT = MVT::getVectorVT(MVT::getIntegerVT(SrcEltSize * 2), ElementCount);
-  SDLoc DL(N);
-  Src = DAG.getNode(N->getOpcode(), DL, SrcVT, Src);
-
-  // Now split the rest of the operation into two halves, each with a 64
-  // bit source.
-  EVT LoVT, HiVT;
-  SDValue Lo, Hi;
-  unsigned NumElements = ResVT.getVectorNumElements();
-  assert(!(NumElements & 1) && "Splitting vector, but not in half!");
-  LoVT = HiVT = EVT::getVectorVT(*DAG.getContext(),
-                                 ResVT.getVectorElementType(), NumElements / 2);
-
-  EVT InNVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getVectorElementType(),
-                               LoVT.getVectorNumElements());
-  Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src,
-                   DAG.getIntPtrConstant(0));
-  Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src,
-                   DAG.getIntPtrConstant(InNVT.getVectorNumElements()));
-  Lo = DAG.getNode(N->getOpcode(), DL, LoVT, Lo);
-  Hi = DAG.getNode(N->getOpcode(), DL, HiVT, Hi);
-
-  // Now combine the parts back together so we still have a single result
-  // like the combiner expects.
-  return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Lo, Hi);
-}
-
-/// Replace a splat of a scalar to a vector store by scalar stores of the scalar
-/// value. The load store optimizer pass will merge them to store pair stores.
-/// This has better performance than a splat of the scalar followed by a split
-/// vector store. Even if the stores are not merged it is four stores vs a dup,
-/// followed by an ext.b and two stores.
-static SDValue replaceSplatVectorStore(SelectionDAG &DAG, StoreSDNode *St) {
-  SDValue StVal = St->getValue();
-  EVT VT = StVal.getValueType();
-
-  // Don't replace floating point stores, they possibly won't be transformed to
-  // stp because of the store pair suppress pass.
-  if (VT.isFloatingPoint())
-    return SDValue();
-
-  // Check for insert vector elements.
-  if (StVal.getOpcode() != ISD::INSERT_VECTOR_ELT)
-    return SDValue();
-
-  // We can express a splat as store pair(s) for 2 or 4 elements.
-  unsigned NumVecElts = VT.getVectorNumElements();
-  if (NumVecElts != 4 && NumVecElts != 2)
-    return SDValue();
-  SDValue SplatVal = StVal.getOperand(1);
-  unsigned RemainInsertElts = NumVecElts - 1;
-
-  // Check that this is a splat.
-  while (--RemainInsertElts) {
-    SDValue NextInsertElt = StVal.getOperand(0);
-    if (NextInsertElt.getOpcode() != ISD::INSERT_VECTOR_ELT)
-      return SDValue();
-    if (NextInsertElt.getOperand(1) != SplatVal)
-      return SDValue();
-    StVal = NextInsertElt;
-  }
-  unsigned OrigAlignment = St->getAlignment();
-  unsigned EltOffset = NumVecElts == 4 ? 4 : 8;
-  unsigned Alignment = std::min(OrigAlignment, EltOffset);
-
-  // Create scalar stores. This is at least as good as the code sequence for a
-  // split unaligned store wich is a dup.s, ext.b, and two stores.
-  // Most of the time the three stores should be replaced by store pair
-  // instructions (stp).
-  SDLoc DL(St);
-  SDValue BasePtr = St->getBasePtr();
-  SDValue NewST1 =
-      DAG.getStore(St->getChain(), DL, SplatVal, BasePtr, St->getPointerInfo(),
-                   St->isVolatile(), St->isNonTemporal(), St->getAlignment());
-
-  unsigned Offset = EltOffset;
-  while (--NumVecElts) {
-    SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
-                                    DAG.getConstant(Offset, MVT::i64));
-    NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr,
-                          St->getPointerInfo(), St->isVolatile(),
-                          St->isNonTemporal(), Alignment);
-    Offset += EltOffset;
-  }
-  return NewST1;
-}
-
-static SDValue performSTORECombine(SDNode *N,
-                                   TargetLowering::DAGCombinerInfo &DCI,
-                                   SelectionDAG &DAG,
-                                   const ARM64Subtarget *Subtarget) {
-  if (!DCI.isBeforeLegalize())
-    return SDValue();
-
-  StoreSDNode *S = cast<StoreSDNode>(N);
-  if (S->isVolatile())
-    return SDValue();
-
-  // Cyclone has bad performance on unaligned 16B stores when crossing line and
-  // page boundries. We want to split such stores.
-  if (!Subtarget->isCyclone())
-    return SDValue();
-
-  // Don't split at Oz.
-  MachineFunction &MF = DAG.getMachineFunction();
-  bool IsMinSize = MF.getFunction()->getAttributes().hasAttribute(
-      AttributeSet::FunctionIndex, Attribute::MinSize);
-  if (IsMinSize)
-    return SDValue();
-
-  SDValue StVal = S->getValue();
-  EVT VT = StVal.getValueType();
-
-  // Don't split v2i64 vectors. Memcpy lowering produces those and splitting
-  // those up regresses performance on micro-benchmarks and olden/bh.
-  if (!VT.isVector() || VT.getVectorNumElements() < 2 || VT == MVT::v2i64)
-    return SDValue();
-
-  // Split unaligned 16B stores. They are terrible for performance.
-  // Don't split stores with alignment of 1 or 2. Code that uses clang vector
-  // extensions can use this to mark that it does not want splitting to happen
-  // (by underspecifying alignment to be 1 or 2). Furthermore, the chance of
-  // eliminating alignment hazards is only 1 in 8 for alignment of 2.
-  if (VT.getSizeInBits() != 128 || S->getAlignment() >= 16 ||
-      S->getAlignment() <= 2)
-    return SDValue();
-
-  // If we get a splat of a scalar convert this vector store to a store of
-  // scalars. They will be merged into store pairs thereby removing two
-  // instructions.
-  SDValue ReplacedSplat = replaceSplatVectorStore(DAG, S);
-  if (ReplacedSplat != SDValue())
-    return ReplacedSplat;
-
-  SDLoc DL(S);
-  unsigned NumElts = VT.getVectorNumElements() / 2;
-  // Split VT into two.
-  EVT HalfVT =
-      EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(), NumElts);
-  SDValue SubVector0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
-                                   DAG.getIntPtrConstant(0));
-  SDValue SubVector1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, StVal,
-                                   DAG.getIntPtrConstant(NumElts));
-  SDValue BasePtr = S->getBasePtr();
-  SDValue NewST1 =
-      DAG.getStore(S->getChain(), DL, SubVector0, BasePtr, S->getPointerInfo(),
-                   S->isVolatile(), S->isNonTemporal(), S->getAlignment());
-  SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
-                                  DAG.getConstant(8, MVT::i64));
-  return DAG.getStore(NewST1.getValue(0), DL, SubVector1, OffsetPtr,
-                      S->getPointerInfo(), S->isVolatile(), S->isNonTemporal(),
-                      S->getAlignment());
-}
-
-// Optimize compare with zero and branch.
-static SDValue performBRCONDCombine(SDNode *N,
-                                    TargetLowering::DAGCombinerInfo &DCI,
-                                    SelectionDAG &DAG) {
-  SDValue Chain = N->getOperand(0);
-  SDValue Dest = N->getOperand(1);
-  SDValue CCVal = N->getOperand(2);
-  SDValue Cmp = N->getOperand(3);
-
-  assert(isa<ConstantSDNode>(CCVal) && "Expected a ConstantSDNode here!");
-  unsigned CC = cast<ConstantSDNode>(CCVal)->getZExtValue();
-  if (CC != ARM64CC::EQ && CC != ARM64CC::NE)
-    return SDValue();
-
-  unsigned CmpOpc = Cmp.getOpcode();
-  if (CmpOpc != ARM64ISD::ADDS && CmpOpc != ARM64ISD::SUBS)
-    return SDValue();
-
-  // Only attempt folding if there is only one use of the flag and no use of the
-  // value.
-  if (!Cmp->hasNUsesOfValue(0, 0) || !Cmp->hasNUsesOfValue(1, 1))
-    return SDValue();
-
-  SDValue LHS = Cmp.getOperand(0);
-  SDValue RHS = Cmp.getOperand(1);
-
-  assert(LHS.getValueType() == RHS.getValueType() &&
-         "Expected the value type to be the same for both operands!");
-  if (LHS.getValueType() != MVT::i32 && LHS.getValueType() != MVT::i64)
-    return SDValue();
-
-  if (isa<ConstantSDNode>(LHS) && cast<ConstantSDNode>(LHS)->isNullValue())
-    std::swap(LHS, RHS);
-
-  if (!isa<ConstantSDNode>(RHS) || !cast<ConstantSDNode>(RHS)->isNullValue())
-    return SDValue();
-
-  if (LHS.getOpcode() == ISD::SHL || LHS.getOpcode() == ISD::SRA ||
-      LHS.getOpcode() == ISD::SRL)
-    return SDValue();
-
-  // Fold the compare into the branch instruction.
-  SDValue BR;
-  if (CC == ARM64CC::EQ)
-    BR = DAG.getNode(ARM64ISD::CBZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
-  else
-    BR = DAG.getNode(ARM64ISD::CBNZ, SDLoc(N), MVT::Other, Chain, LHS, Dest);
-
-  // Do not add new nodes to DAG combiner worklist.
-  DCI.CombineTo(N, BR, false);
-
-  return SDValue();
-}
-
-SDValue ARM64TargetLowering::PerformDAGCombine(SDNode *N,
-                                               DAGCombinerInfo &DCI) const {
-  SelectionDAG &DAG = DCI.DAG;
-  switch (N->getOpcode()) {
-  default:
-    break;
-  case ISD::ADD:
-  case ISD::SUB:
-    return performAddSubLongCombine(N, DCI, DAG);
-  case ISD::XOR:
-    return performXorCombine(N, DAG, DCI, Subtarget);
-  case ISD::MUL:
-    return performMulCombine(N, DAG, DCI, Subtarget);
-  case ISD::SINT_TO_FP:
-  case ISD::UINT_TO_FP:
-    return performIntToFpCombine(N, DAG);
-  case ISD::OR:
-    return performORCombine(N, DCI, Subtarget);
-  case ISD::INTRINSIC_WO_CHAIN:
-    return performIntrinsicCombine(N, DCI, Subtarget);
-  case ISD::ANY_EXTEND:
-  case ISD::ZERO_EXTEND:
-  case ISD::SIGN_EXTEND:
-    return performExtendCombine(N, DCI, DAG);
-  case ISD::BITCAST:
-    return performBitcastCombine(N, DCI, DAG);
-  case ISD::CONCAT_VECTORS:
-    return performConcatVectorsCombine(N, DCI, DAG);
-  case ISD::STORE:
-    return performSTORECombine(N, DCI, DAG, Subtarget);
-  case ARM64ISD::BRCOND:
-    return performBRCONDCombine(N, DCI, DAG);
-  }
-  return SDValue();
-}
-
-// Check if the return value is used as only a return value, as otherwise
-// we can't perform a tail-call. In particular, we need to check for
-// target ISD nodes that are returns and any other "odd" constructs
-// that the generic analysis code won't necessarily catch.
-bool ARM64TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
-  if (N->getNumValues() != 1)
-    return false;
-  if (!N->hasNUsesOfValue(1, 0))
-    return false;
-
-  SDValue TCChain = Chain;
-  SDNode *Copy = *N->use_begin();
-  if (Copy->getOpcode() == ISD::CopyToReg) {
-    // If the copy has a glue operand, we conservatively assume it isn't safe to
-    // perform a tail call.
-    if (Copy->getOperand(Copy->getNumOperands() - 1).getValueType() ==
-        MVT::Glue)
-      return false;
-    TCChain = Copy->getOperand(0);
-  } else if (Copy->getOpcode() != ISD::FP_EXTEND)
-    return false;
-
-  bool HasRet = false;
-  for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
-       UI != UE; ++UI) {
-    if (UI->getOpcode() != ARM64ISD::RET_FLAG)
-      return false;
-    HasRet = true;
-  }
-
-  if (!HasRet)
-    return false;
-
-  Chain = TCChain;
-  return true;
-}
-
-// Return whether the an instruction can potentially be optimized to a tail
-// call. This will cause the optimizers to attempt to move, or duplicate,
-// return instructions to help enable tail call optimizations for this
-// instruction.
-bool ARM64TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
-  if (!EnableARM64TailCalls)
-    return false;
-
-  if (!CI->isTailCall())
-    return false;
-
-  return true;
-}
-
-bool ARM64TargetLowering::getIndexedAddressParts(SDNode *Op, SDValue &Base,
-                                                 SDValue &Offset,
-                                                 ISD::MemIndexedMode &AM,
-                                                 bool &IsInc,
-                                                 SelectionDAG &DAG) const {
-  if (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB)
-    return false;
-
-  Base = Op->getOperand(0);
-  // All of the indexed addressing mode instructions take a signed
-  // 9 bit immediate offset.
-  if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1))) {
-    int64_t RHSC = (int64_t)RHS->getZExtValue();
-    if (RHSC >= 256 || RHSC <= -256)
-      return false;
-    IsInc = (Op->getOpcode() == ISD::ADD);
-    Offset = Op->getOperand(1);
-    return true;
-  }
-  return false;
-}
-
-bool ARM64TargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
-                                                    SDValue &Offset,
-                                                    ISD::MemIndexedMode &AM,
-                                                    SelectionDAG &DAG) const {
-  EVT VT;
-  SDValue Ptr;
-  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
-    VT = LD->getMemoryVT();
-    Ptr = LD->getBasePtr();
-  } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
-    VT = ST->getMemoryVT();
-    Ptr = ST->getBasePtr();
-  } else
-    return false;
-
-  bool IsInc;
-  if (!getIndexedAddressParts(Ptr.getNode(), Base, Offset, AM, IsInc, DAG))
-    return false;
-  AM = IsInc ? ISD::PRE_INC : ISD::PRE_DEC;
-  return true;
-}
-
-bool ARM64TargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op,
-                                                     SDValue &Base,
-                                                     SDValue &Offset,
-                                                     ISD::MemIndexedMode &AM,
-                                                     SelectionDAG &DAG) const {
-  EVT VT;
-  SDValue Ptr;
-  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
-    VT = LD->getMemoryVT();
-    Ptr = LD->getBasePtr();
-  } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
-    VT = ST->getMemoryVT();
-    Ptr = ST->getBasePtr();
-  } else
-    return false;
-
-  bool IsInc;
-  if (!getIndexedAddressParts(Op, Base, Offset, AM, IsInc, DAG))
-    return false;
-  // Post-indexing updates the base, so it's not a valid transform
-  // if that's not the same as the load's pointer.
-  if (Ptr != Base)
-    return false;
-  AM = IsInc ? ISD::POST_INC : ISD::POST_DEC;
-  return true;
-}
-
-/// The only 128-bit atomic operation is an stxp that succeeds. In particular
-/// neither ldp nor ldxp are atomic. So the canonical sequence for an atomic
-/// load is:
-///     loop:
-///         ldxp x0, x1, [x8]
-///         stxp w2, x0, x1, [x8]
-///         cbnz w2, loop
-/// If the stxp succeeds then the ldxp managed to get both halves without an
-/// intervening stxp from a different thread and the read was atomic.
-static void ReplaceATOMIC_LOAD_128(SDNode *N, SmallVectorImpl<SDValue> &Results,
-                                   SelectionDAG &DAG) {
-  SDLoc DL(N);
-  AtomicSDNode *AN = cast<AtomicSDNode>(N);
-  EVT VT = AN->getMemoryVT();
-  SDValue Zero = DAG.getConstant(0, VT);
-
-  // FIXME: Really want ATOMIC_LOAD_NOP but that doesn't fit into the existing
-  // scheme very well. Given the complexity of what we're already generating, an
-  // extra couple of ORRs probably won't make much difference.
-  SDValue Result = DAG.getAtomic(ISD::ATOMIC_LOAD_OR, DL, AN->getMemoryVT(),
-                                 N->getOperand(0), N->getOperand(1), Zero,
-                                 AN->getMemOperand(), AN->getOrdering(),
-                                 AN->getSynchScope());
-
-  Results.push_back(Result.getValue(0)); // Value
-  Results.push_back(Result.getValue(1)); // Chain
-}
-
-static void ReplaceATOMIC_OP_128(SDNode *N, SmallVectorImpl<SDValue> &Results,
-                                 SelectionDAG &DAG, unsigned NewOp) {
-  SDLoc DL(N);
-  AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering();
-  assert(N->getValueType(0) == MVT::i128 &&
-         "Only know how to expand i128 atomics");
-
-  SmallVector<SDValue, 6> Ops;
-  Ops.push_back(N->getOperand(1)); // Ptr
-  // Low part of Val1
-  Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i64,
-                            N->getOperand(2), DAG.getIntPtrConstant(0)));
-  // High part of Val1
-  Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i64,
-                            N->getOperand(2), DAG.getIntPtrConstant(1)));
-  if (NewOp == ARM64::ATOMIC_CMP_SWAP_I128) {
-    // Low part of Val2
-    Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i64,
-                              N->getOperand(3), DAG.getIntPtrConstant(0)));
-    // High part of Val2
-    Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i64,
-                              N->getOperand(3), DAG.getIntPtrConstant(1)));
-  }
-
-  Ops.push_back(DAG.getTargetConstant(Ordering, MVT::i32));
-  Ops.push_back(N->getOperand(0)); // Chain
-
-  SDVTList Tys = DAG.getVTList(MVT::i64, MVT::i64, MVT::Other);
-  SDNode *Result = DAG.getMachineNode(NewOp, DL, Tys, Ops);
-  SDValue OpsF[] = { SDValue(Result, 0), SDValue(Result, 1) };
-  Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128, OpsF, 2));
-  Results.push_back(SDValue(Result, 2));
-}
-
-void ARM64TargetLowering::ReplaceNodeResults(SDNode *N,
-                                             SmallVectorImpl<SDValue> &Results,
-                                             SelectionDAG &DAG) const {
-  switch (N->getOpcode()) {
-  default:
-    llvm_unreachable("Don't know how to custom expand this");
-  case ISD::ATOMIC_LOAD:
-    ReplaceATOMIC_LOAD_128(N, Results, DAG);
-    return;
-  case ISD::ATOMIC_LOAD_ADD:
-    ReplaceATOMIC_OP_128(N, Results, DAG, ARM64::ATOMIC_LOAD_ADD_I128);
-    return;
-  case ISD::ATOMIC_LOAD_SUB:
-    ReplaceATOMIC_OP_128(N, Results, DAG, ARM64::ATOMIC_LOAD_SUB_I128);
-    return;
-  case ISD::ATOMIC_LOAD_AND:
-    ReplaceATOMIC_OP_128(N, Results, DAG, ARM64::ATOMIC_LOAD_AND_I128);
-    return;
-  case ISD::ATOMIC_LOAD_OR:
-    ReplaceATOMIC_OP_128(N, Results, DAG, ARM64::ATOMIC_LOAD_OR_I128);
-    return;
-  case ISD::ATOMIC_LOAD_XOR:
-    ReplaceATOMIC_OP_128(N, Results, DAG, ARM64::ATOMIC_LOAD_XOR_I128);
-    return;
-  case ISD::ATOMIC_LOAD_NAND:
-    ReplaceATOMIC_OP_128(N, Results, DAG, ARM64::ATOMIC_LOAD_NAND_I128);
-    return;
-  case ISD::ATOMIC_SWAP:
-    ReplaceATOMIC_OP_128(N, Results, DAG, ARM64::ATOMIC_SWAP_I128);
-    return;
-  case ISD::ATOMIC_LOAD_MIN:
-    ReplaceATOMIC_OP_128(N, Results, DAG, ARM64::ATOMIC_LOAD_MIN_I128);
-    return;
-  case ISD::ATOMIC_LOAD_MAX:
-    ReplaceATOMIC_OP_128(N, Results, DAG, ARM64::ATOMIC_LOAD_MAX_I128);
-    return;
-  case ISD::ATOMIC_LOAD_UMIN:
-    ReplaceATOMIC_OP_128(N, Results, DAG, ARM64::ATOMIC_LOAD_UMIN_I128);
-    return;
-  case ISD::ATOMIC_LOAD_UMAX:
-    ReplaceATOMIC_OP_128(N, Results, DAG, ARM64::ATOMIC_LOAD_UMAX_I128);
-    return;
-  case ISD::ATOMIC_CMP_SWAP:
-    ReplaceATOMIC_OP_128(N, Results, DAG, ARM64::ATOMIC_CMP_SWAP_I128);
-    return;
-  case ISD::FP_TO_UINT:
-  case ISD::FP_TO_SINT:
-    assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion");
-    // Let normal code take care of it by not adding anything to Results.
-    return;
-  }
-}
diff --git a/lib/Target/ARM64/ARM64ISelLowering.h b/lib/Target/ARM64/ARM64ISelLowering.h
deleted file mode 100644
index a4664ac..0000000
--- a/lib/Target/ARM64/ARM64ISelLowering.h
+++ /dev/null
@@ -1,422 +0,0 @@
-//==-- ARM64ISelLowering.h - ARM64 DAG Lowering Interface --------*- C++ -*-==//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines the interfaces that ARM64 uses to lower LLVM code into a
-// selection DAG.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_TARGET_ARM64_ISELLOWERING_H
-#define LLVM_TARGET_ARM64_ISELLOWERING_H
-
-#include "llvm/CodeGen/CallingConvLower.h"
-#include "llvm/CodeGen/SelectionDAG.h"
-#include "llvm/IR/CallingConv.h"
-#include "llvm/Target/TargetLowering.h"
-
-namespace llvm {
-
-namespace ARM64ISD {
-
-enum {
-  FIRST_NUMBER = ISD::BUILTIN_OP_END,
-  WrapperLarge, // 4-instruction MOVZ/MOVK sequence for 64-bit addresses.
-  CALL,         // Function call.
-
-  // Almost the same as a normal call node, except that a TLSDesc relocation is
-  // needed so the linker can relax it correctly if possible.
-  TLSDESC_CALL,
-  ADRP,     // Page address of a TargetGlobalAddress operand.
-  ADDlow,   // Add the low 12 bits of a TargetGlobalAddress operand.
-  LOADgot,  // Load from automatically generated descriptor (e.g. Global
-            // Offset Table, TLS record).
-  RET_FLAG, // Return with a flag operand. Operand 0 is the chain operand.
-  BRCOND,   // Conditional branch instruction; "b.cond".
-  CSEL,
-  FCSEL, // Conditional move instruction.
-  CSINV, // Conditional select invert.
-  CSNEG, // Conditional select negate.
-  CSINC, // Conditional select increment.
-
-  // Pointer to the thread's local storage area. Materialised from TPIDR_EL0 on
-  // ELF.
-  THREAD_POINTER,
-  ADC,
-  SBC, // adc, sbc instructions
-
-  // Arithmetic instructions which write flags.
-  ADDS,
-  SUBS,
-  ADCS,
-  SBCS,
-  ANDS,
-
-  // Floating point comparison
-  FCMP,
-
-  // Floating point max and min instructions.
-  FMAX,
-  FMIN,
-
-  // Scalar extract
-  EXTR,
-
-  // Scalar-to-vector duplication
-  DUP,
-  DUPLANE8,
-  DUPLANE16,
-  DUPLANE32,
-  DUPLANE64,
-
-  // Vector immedate moves
-  MOVI,
-  MOVIshift,
-  MOVIedit,
-  MOVImsl,
-  FMOV,
-  MVNIshift,
-  MVNImsl,
-
-  // Vector immediate ops
-  BICi,
-  ORRi,
-
-  // Vector arithmetic negation
-  NEG,
-
-  // Vector shuffles
-  ZIP1,
-  ZIP2,
-  UZP1,
-  UZP2,
-  TRN1,
-  TRN2,
-  REV16,
-  REV32,
-  REV64,
-  EXT,
-
-  // Vector shift by scalar
-  VSHL,
-  VLSHR,
-  VASHR,
-
-  // Vector shift by scalar (again)
-  SQSHL_I,
-  UQSHL_I,
-  SQSHLU_I,
-  SRSHR_I,
-  URSHR_I,
-
-  // Vector comparisons
-  CMEQ,
-  CMGE,
-  CMGT,
-  CMHI,
-  CMHS,
-  FCMEQ,
-  FCMGE,
-  FCMGT,
-
-  // Vector zero comparisons
-  CMEQz,
-  CMGEz,
-  CMGTz,
-  CMLEz,
-  CMLTz,
-  FCMEQz,
-  FCMGEz,
-  FCMGTz,
-  FCMLEz,
-  FCMLTz,
-
-  // Vector bitwise negation
-  NOT,
-
-  // Vector bitwise selection
-  BIT,
-
-  // Compare-and-branch
-  CBZ,
-  CBNZ,
-  TBZ,
-  TBNZ,
-
-  // Tail calls
-  TC_RETURN,
-
-  // Custom prefetch handling
-  PREFETCH,
-
-  // {s|u}int to FP within a FP register.
-  SITOF,
-  UITOF
-};
-
-} // end namespace ARM64ISD
-
-class ARM64Subtarget;
-class ARM64TargetMachine;
-
-class ARM64TargetLowering : public TargetLowering {
-  bool RequireStrictAlign;
-
-public:
-  explicit ARM64TargetLowering(ARM64TargetMachine &TM);
-
-  /// Selects the correct CCAssignFn for a the given CallingConvention
-  /// value.
-  CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg) const;
-
-  /// computeMaskedBitsForTargetNode - Determine which of the bits specified in
-  /// Mask are known to be either zero or one and return them in the
-  /// KnownZero/KnownOne bitsets.
-  void computeMaskedBitsForTargetNode(const SDValue Op, APInt &KnownZero,
-                                      APInt &KnownOne, const SelectionDAG &DAG,
-                                      unsigned Depth = 0) const;
-
-  MVT getScalarShiftAmountTy(EVT LHSTy) const override;
-
-  /// allowsUnalignedMemoryAccesses - Returns true if the target allows
-  /// unaligned memory accesses. of the specified type.
-  bool allowsUnalignedMemoryAccesses(EVT VT, unsigned AddrSpace = 0,
-                                     bool *Fast = 0) const override {
-    if (RequireStrictAlign)
-      return false;
-    // FIXME: True for Cyclone, but not necessary others.
-    if (Fast)
-      *Fast = true;
-    return true;
-  }
-
-  /// LowerOperation - Provide custom lowering hooks for some operations.
-  SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
-
-  const char *getTargetNodeName(unsigned Opcode) const override;
-
-  SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
-
-  /// getFunctionAlignment - Return the Log2 alignment of this function.
-  unsigned getFunctionAlignment(const Function *F) const;
-
-  /// getMaximalGlobalOffset - Returns the maximal possible offset which can
-  /// be used for loads / stores from the global.
-  unsigned getMaximalGlobalOffset() const override;
-
-  /// Returns true if a cast between SrcAS and DestAS is a noop.
-  bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override {
-    // Addrspacecasts are always noops.
-    return true;
-  }
-
-  /// createFastISel - This method returns a target specific FastISel object,
-  /// or null if the target does not support "fast" ISel.
-  FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
-                           const TargetLibraryInfo *libInfo) const override;
-
-  bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override;
-
-  bool isFPImmLegal(const APFloat &Imm, EVT VT) const override;
-
-  /// isShuffleMaskLegal - Return true if the given shuffle mask can be
-  /// codegen'd directly, or if it should be stack expanded.
-  bool isShuffleMaskLegal(const SmallVectorImpl<int> &M, EVT VT) const override;
-
-  /// getSetCCResultType - Return the ISD::SETCC ValueType
-  EVT getSetCCResultType(LLVMContext &Context, EVT VT) const override;
-
-  SDValue ReconstructShuffle(SDValue Op, SelectionDAG &DAG) const;
-
-  MachineBasicBlock *EmitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
-                                      unsigned Size, unsigned BinOpcode) const;
-  MachineBasicBlock *EmitAtomicCmpSwap(MachineInstr *MI, MachineBasicBlock *BB,
-                                       unsigned Size) const;
-  MachineBasicBlock *EmitAtomicBinary128(MachineInstr *MI,
-                                         MachineBasicBlock *BB,
-                                         unsigned BinOpcodeLo,
-                                         unsigned BinOpcodeHi) const;
-  MachineBasicBlock *EmitAtomicCmpSwap128(MachineInstr *MI,
-                                          MachineBasicBlock *BB) const;
-  MachineBasicBlock *EmitAtomicMinMax128(MachineInstr *MI,
-                                         MachineBasicBlock *BB,
-                                         unsigned CondCode) const;
-  MachineBasicBlock *EmitF128CSEL(MachineInstr *MI,
-                                  MachineBasicBlock *BB) const;
-
-  MachineBasicBlock *
-  EmitInstrWithCustomInserter(MachineInstr *MI,
-                              MachineBasicBlock *MBB) const override;
-
-  bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I,
-                          unsigned Intrinsic) const override;
-
-  bool isTruncateFree(Type *Ty1, Type *Ty2) const override;
-  bool isTruncateFree(EVT VT1, EVT VT2) const override;
-
-  bool isZExtFree(Type *Ty1, Type *Ty2) const override;
-  bool isZExtFree(EVT VT1, EVT VT2) const override;
-  bool isZExtFree(SDValue Val, EVT VT2) const override;
-
-  bool hasPairedLoad(Type *LoadedType,
-                     unsigned &RequiredAligment) const override;
-  bool hasPairedLoad(EVT LoadedType, unsigned &RequiredAligment) const override;
-
-  bool isLegalAddImmediate(int64_t) const override;
-  bool isLegalICmpImmediate(int64_t) const override;
-
-  EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign,
-                          bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc,
-                          MachineFunction &MF) const override;
-
-  /// isLegalAddressingMode - Return true if the addressing mode represented
-  /// by AM is legal for this target, for a load/store of the specified type.
-  bool isLegalAddressingMode(const AddrMode &AM, Type *Ty) const override;
-
-  /// \brief Return the cost of the scaling factor used in the addressing
-  /// mode represented by AM for this target, for a load/store
-  /// of the specified type.
-  /// If the AM is supported, the return value must be >= 0.
-  /// If the AM is not supported, it returns a negative value.
-  int getScalingFactorCost(const AddrMode &AM, Type *Ty) const override;
-
-  /// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster
-  /// than a pair of fmul and fadd instructions. fmuladd intrinsics will be
-  /// expanded to FMAs when this method returns true, otherwise fmuladd is
-  /// expanded to fmul + fadd.
-  bool isFMAFasterThanFMulAndFAdd(EVT VT) const override;
-
-  const uint16_t *getScratchRegisters(CallingConv::ID CC) const override;
-
-  bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
-                                         Type *Ty) const override;
-
-private:
-  /// Subtarget - Keep a pointer to the ARM64Subtarget around so that we can
-  /// make the right decision when generating code for different targets.
-  const ARM64Subtarget *Subtarget;
-
-  void addTypeForNEON(EVT VT, EVT PromotedBitwiseVT);
-  void addDRTypeForNEON(MVT VT);
-  void addQRTypeForNEON(MVT VT);
-
-  SDValue
-  LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
-                       const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL,
-                       SelectionDAG &DAG,
-                       SmallVectorImpl<SDValue> &InVals) const override;
-
-  SDValue LowerCall(CallLoweringInfo & /*CLI*/,
-                    SmallVectorImpl<SDValue> &InVals) const override;
-
-  SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
-                          CallingConv::ID CallConv, bool isVarArg,
-                          const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc DL,
-                          SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
-                          bool isThisReturn, SDValue ThisVal) const;
-
-  bool isEligibleForTailCallOptimization(
-      SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
-      bool isCalleeStructRet, bool isCallerStructRet,
-      const SmallVectorImpl<ISD::OutputArg> &Outs,
-      const SmallVectorImpl<SDValue> &OutVals,
-      const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const;
-
-  void saveVarArgRegisters(CCState &CCInfo, SelectionDAG &DAG, SDLoc DL,
-                           SDValue &Chain) const;
-
-  bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
-                      bool isVarArg,
-                      const SmallVectorImpl<ISD::OutputArg> &Outs,
-                      LLVMContext &Context) const override;
-
-  SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
-                      const SmallVectorImpl<ISD::OutputArg> &Outs,
-                      const SmallVectorImpl<SDValue> &OutVals, SDLoc DL,
-                      SelectionDAG &DAG) const override;
-
-  SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerDarwinGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerELFGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerELFTLSDescCall(SDValue SymAddr, SDValue DescAddr, SDLoc DL,
-                              SelectionDAG &DAG) const;
-  SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerAAPCS_VASTART(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerDarwin_VASTART(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerVACOPY(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerVectorSRA_SRL_SHL(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerShiftRightParts(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerCTPOP(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerF128Call(SDValue Op, SelectionDAG &DAG,
-                        RTLIB::Libcall Call) const;
-  SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerVectorAND(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerVectorOR(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const;
-
-  ConstraintType getConstraintType(const std::string &Constraint) const;
-
-  /// Examine constraint string and operand type and determine a weight value.
-  /// The operand object must already have been set up with the operand type.
-  ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &info,
-                                                  const char *constraint) const;
-
-  std::pair<unsigned, const TargetRegisterClass *>
-  getRegForInlineAsmConstraint(const std::string &Constraint, MVT VT) const;
-  void LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint,
-                                    std::vector<SDValue> &Ops,
-                                    SelectionDAG &DAG) const;
-
-  bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const;
-  bool mayBeEmittedAsTailCall(CallInst *CI) const;
-  bool getIndexedAddressParts(SDNode *Op, SDValue &Base, SDValue &Offset,
-                              ISD::MemIndexedMode &AM, bool &IsInc,
-                              SelectionDAG &DAG) const;
-  bool getPreIndexedAddressParts(SDNode *N, SDValue &Base, SDValue &Offset,
-                                 ISD::MemIndexedMode &AM,
-                                 SelectionDAG &DAG) const;
-  bool getPostIndexedAddressParts(SDNode *N, SDNode *Op, SDValue &Base,
-                                  SDValue &Offset, ISD::MemIndexedMode &AM,
-                                  SelectionDAG &DAG) const;
-
-  void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
-                          SelectionDAG &DAG) const;
-};
-
-namespace ARM64 {
-FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
-                         const TargetLibraryInfo *libInfo);
-} // end namespace ARM64
-
-} // end namespace llvm
-
-#endif // LLVM_TARGET_ARM64_ISELLOWERING_H
diff --git a/lib/Target/ARM64/ARM64InstrAtomics.td b/lib/Target/ARM64/ARM64InstrAtomics.td
deleted file mode 100644
index 0d36e06..0000000
--- a/lib/Target/ARM64/ARM64InstrAtomics.td
+++ /dev/null
@@ -1,293 +0,0 @@
-//===- ARM64InstrAtomics.td - ARM64 Atomic codegen support -*- tablegen -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// ARM64 Atomic operand code-gen constructs.
-//
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------
-// Atomic fences
-//===----------------------------------
-def : Pat<(atomic_fence (i64 4), (imm)), (DMB (i32 0x9))>;
-def : Pat<(atomic_fence (imm), (imm)), (DMB (i32 0xb))>;
-
-//===----------------------------------
-// Atomic loads
-//===----------------------------------
-
-// When they're actually atomic, only one addressing mode (GPR64sp) is
-// supported, but when they're relaxed and anything can be used, all the
-// standard modes would be valid and may give efficiency gains.
-
-// A atomic load operation that actually needs acquire semantics.
-class acquiring_load<PatFrag base>
-  : PatFrag<(ops node:$ptr), (base node:$ptr), [{
-  AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering();
-  assert(Ordering != AcquireRelease && "unexpected load ordering");
-  return Ordering == Acquire || Ordering == SequentiallyConsistent;
-}]>;
-
-// An atomic load operation that does not need either acquire or release
-// semantics.
-class relaxed_load<PatFrag base>
-  : PatFrag<(ops node:$ptr), (base node:$ptr), [{
-  AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering();
-  return Ordering == Monotonic || Ordering == Unordered;
-}]>;
-
-// 8-bit loads
-def : Pat<(acquiring_load<atomic_load_8>  GPR64sp:$ptr), (LDARB GPR64sp:$ptr)>;
-def : Pat<(relaxed_load<atomic_load_8> ro_indexed8:$addr),
-          (LDRBBro ro_indexed8:$addr)>;
-def : Pat<(relaxed_load<atomic_load_8> am_indexed8:$addr),
-          (LDRBBui am_indexed8:$addr)>;
-def : Pat<(relaxed_load<atomic_load_8> am_unscaled8:$addr),
-          (LDURBBi am_unscaled8:$addr)>;
-
-// 16-bit loads
-def : Pat<(acquiring_load<atomic_load_16> GPR64sp:$ptr), (LDARH GPR64sp:$ptr)>;
-def : Pat<(relaxed_load<atomic_load_16> ro_indexed16:$addr),
-          (LDRHHro ro_indexed16:$addr)>;
-def : Pat<(relaxed_load<atomic_load_16> am_indexed16:$addr),
-          (LDRHHui am_indexed16:$addr)>;
-def : Pat<(relaxed_load<atomic_load_16> am_unscaled16:$addr),
-          (LDURHHi am_unscaled16:$addr)>;
-
-// 32-bit loads
-def : Pat<(acquiring_load<atomic_load_32> GPR64sp:$ptr), (LDARW GPR64sp:$ptr)>;
-def : Pat<(relaxed_load<atomic_load_32> ro_indexed32:$addr),
-          (LDRWro ro_indexed32:$addr)>;
-def : Pat<(relaxed_load<atomic_load_32> am_indexed32:$addr),
-          (LDRWui am_indexed32:$addr)>;
-def : Pat<(relaxed_load<atomic_load_32> am_unscaled32:$addr),
-          (LDURWi am_unscaled32:$addr)>;
-
-// 64-bit loads
-def : Pat<(acquiring_load<atomic_load_64> GPR64sp:$ptr), (LDARX GPR64sp:$ptr)>;
-def : Pat<(relaxed_load<atomic_load_64> ro_indexed64:$addr),
-          (LDRXro ro_indexed64:$addr)>;
-def : Pat<(relaxed_load<atomic_load_64> am_indexed64:$addr),
-          (LDRXui am_indexed64:$addr)>;
-def : Pat<(relaxed_load<atomic_load_64> am_unscaled64:$addr),
-          (LDURXi am_unscaled64:$addr)>;
-
-//===----------------------------------
-// Atomic stores
-//===----------------------------------
-
-// When they're actually atomic, only one addressing mode (GPR64sp) is
-// supported, but when they're relaxed and anything can be used, all the
-// standard modes would be valid and may give efficiency gains.
-
-// A store operation that actually needs release semantics.
-class releasing_store<PatFrag base>
-  : PatFrag<(ops node:$ptr, node:$val), (base node:$ptr, node:$val), [{
-  AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering();
-  assert(Ordering != AcquireRelease && "unexpected store ordering");
-  return Ordering == Release || Ordering == SequentiallyConsistent;
-}]>;
-
-// An atomic store operation that doesn't actually need to be atomic on ARM64.
-class relaxed_store<PatFrag base>
-  : PatFrag<(ops node:$ptr, node:$val), (base node:$ptr, node:$val), [{
-  AtomicOrdering Ordering = cast<AtomicSDNode>(N)->getOrdering();
-  return Ordering == Monotonic || Ordering == Unordered;
-}]>;
-
-// 8-bit stores
-def : Pat<(releasing_store<atomic_store_8> GPR64sp:$ptr, GPR32:$val),
-          (STLRB GPR32:$val, GPR64sp:$ptr)>;
-def : Pat<(relaxed_store<atomic_store_8> ro_indexed8:$ptr, GPR32:$val),
-          (STRBBro GPR32:$val, ro_indexed8:$ptr)>;
-def : Pat<(relaxed_store<atomic_store_8> am_indexed8:$ptr, GPR32:$val),
-          (STRBBui GPR32:$val, am_indexed8:$ptr)>;
-def : Pat<(relaxed_store<atomic_store_8> am_unscaled8:$ptr, GPR32:$val),
-          (STURBBi GPR32:$val, am_unscaled8:$ptr)>;
-
-// 16-bit stores
-def : Pat<(releasing_store<atomic_store_16> GPR64sp:$ptr, GPR32:$val),
-          (STLRH GPR32:$val, GPR64sp:$ptr)>;
-def : Pat<(relaxed_store<atomic_store_16> ro_indexed16:$ptr, GPR32:$val),
-          (STRHHro GPR32:$val, ro_indexed16:$ptr)>;
-def : Pat<(relaxed_store<atomic_store_16> am_indexed16:$ptr, GPR32:$val),
-          (STRHHui GPR32:$val, am_indexed16:$ptr)>;
-def : Pat<(relaxed_store<atomic_store_16> am_unscaled16:$ptr, GPR32:$val),
-          (STURHHi GPR32:$val, am_unscaled16:$ptr)>;
-
-// 32-bit stores
-def : Pat<(releasing_store<atomic_store_32> GPR64sp:$ptr, GPR32:$val),
-          (STLRW GPR32:$val, GPR64sp:$ptr)>;
-def : Pat<(relaxed_store<atomic_store_32> ro_indexed32:$ptr, GPR32:$val),
-          (STRWro GPR32:$val, ro_indexed32:$ptr)>;
-def : Pat<(relaxed_store<atomic_store_32> am_indexed32:$ptr, GPR32:$val),
-          (STRWui GPR32:$val, am_indexed32:$ptr)>;
-def : Pat<(relaxed_store<atomic_store_32> am_unscaled32:$ptr, GPR32:$val),
-          (STURWi GPR32:$val, am_unscaled32:$ptr)>;
-
-// 64-bit stores
-def : Pat<(releasing_store<atomic_store_64> GPR64sp:$ptr, GPR64:$val),
-          (STLRX GPR64:$val, GPR64sp:$ptr)>;
-def : Pat<(relaxed_store<atomic_store_64> ro_indexed64:$ptr, GPR64:$val),
-          (STRXro GPR64:$val, ro_indexed64:$ptr)>;
-def : Pat<(relaxed_store<atomic_store_64> am_indexed64:$ptr, GPR64:$val),
-          (STRXui GPR64:$val, am_indexed64:$ptr)>;
-def : Pat<(relaxed_store<atomic_store_64> am_unscaled64:$ptr, GPR64:$val),
-          (STURXi GPR64:$val, am_unscaled64:$ptr)>;
-
-//===----------------------------------
-// Atomic read-modify-write operations
-//===----------------------------------
-
-// More complicated operations need lots of C++ support, so we just create
-// skeletons here for the C++ code to refer to.
-
-let usesCustomInserter = 1, hasCtrlDep = 1, mayLoad = 1, mayStore = 1 in {
-multiclass AtomicSizes {
-  def _I8 : Pseudo<(outs GPR32:$dst),
-                   (ins GPR64sp:$ptr, GPR32:$incr, i32imm:$ordering), []>;
-  def _I16 : Pseudo<(outs GPR32:$dst),
-                    (ins GPR64sp:$ptr, GPR32:$incr, i32imm:$ordering), []>;
-  def _I32 : Pseudo<(outs GPR32:$dst),
-                    (ins GPR64sp:$ptr, GPR32:$incr, i32imm:$ordering), []>;
-  def _I64 : Pseudo<(outs GPR64:$dst),
-                    (ins GPR64sp:$ptr, GPR64:$incr, i32imm:$ordering), []>;
-  def _I128 : Pseudo<(outs GPR64:$dstlo, GPR64:$dsthi),
-                     (ins GPR64sp:$ptr, GPR64:$incrlo, GPR64:$incrhi,
-                          i32imm:$ordering), []>;
-}
-}
-
-defm ATOMIC_LOAD_ADD  : AtomicSizes;
-defm ATOMIC_LOAD_SUB  : AtomicSizes;
-defm ATOMIC_LOAD_AND  : AtomicSizes;
-defm ATOMIC_LOAD_OR   : AtomicSizes;
-defm ATOMIC_LOAD_XOR  : AtomicSizes;
-defm ATOMIC_LOAD_NAND : AtomicSizes;
-defm ATOMIC_SWAP      : AtomicSizes;
-let Defs = [CPSR] in {
-  // These operations need a CMP to calculate the correct value
-  defm ATOMIC_LOAD_MIN  : AtomicSizes;
-  defm ATOMIC_LOAD_MAX  : AtomicSizes;
-  defm ATOMIC_LOAD_UMIN : AtomicSizes;
-  defm ATOMIC_LOAD_UMAX : AtomicSizes;
-}
-
-class AtomicCmpSwap<RegisterClass GPRData>
-  : Pseudo<(outs GPRData:$dst),
-           (ins GPR64sp:$ptr, GPRData:$old, GPRData:$new,
-                i32imm:$ordering), []> {
-  let usesCustomInserter = 1;
-  let hasCtrlDep = 1;
-  let mayLoad = 1;
-  let mayStore = 1;
-  let Defs = [CPSR];
-}
-
-def ATOMIC_CMP_SWAP_I8  : AtomicCmpSwap<GPR32>;
-def ATOMIC_CMP_SWAP_I16 : AtomicCmpSwap<GPR32>;
-def ATOMIC_CMP_SWAP_I32 : AtomicCmpSwap<GPR32>;
-def ATOMIC_CMP_SWAP_I64 : AtomicCmpSwap<GPR64>;
-
-def ATOMIC_CMP_SWAP_I128
-  : Pseudo<(outs GPR64:$dstlo, GPR64:$dsthi),
-           (ins GPR64sp:$ptr, GPR64:$oldlo, GPR64:$oldhi,
-                GPR64:$newlo, GPR64:$newhi, i32imm:$ordering), []> {
-  let usesCustomInserter = 1;
-  let hasCtrlDep = 1;
-  let mayLoad = 1;
-  let mayStore = 1;
-  let Defs = [CPSR];
-}
-
-//===----------------------------------
-// Low-level exclusive operations
-//===----------------------------------
-
-// Load-exclusives.
-
-def ldxr_1 : PatFrag<(ops node:$ptr), (int_arm64_ldxr node:$ptr), [{
-  return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i8;
-}]>;
-
-def ldxr_2 : PatFrag<(ops node:$ptr), (int_arm64_ldxr node:$ptr), [{
-  return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i16;
-}]>;
-
-def ldxr_4 : PatFrag<(ops node:$ptr), (int_arm64_ldxr node:$ptr), [{
-  return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i32;
-}]>;
-
-def ldxr_8 : PatFrag<(ops node:$ptr), (int_arm64_ldxr node:$ptr), [{
-  return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i64;
-}]>;
-
-def : Pat<(ldxr_1 am_noindex:$addr),
-          (SUBREG_TO_REG (i64 0), (LDXRB am_noindex:$addr), sub_32)>;
-def : Pat<(ldxr_2 am_noindex:$addr),
-          (SUBREG_TO_REG (i64 0), (LDXRH am_noindex:$addr), sub_32)>;
-def : Pat<(ldxr_4 am_noindex:$addr),
-          (SUBREG_TO_REG (i64 0), (LDXRW am_noindex:$addr), sub_32)>;
-def : Pat<(ldxr_8 am_noindex:$addr), (LDXRX am_noindex:$addr)>;
-
-def : Pat<(and (ldxr_1 am_noindex:$addr), 0xff),
-          (SUBREG_TO_REG (i64 0), (LDXRB am_noindex:$addr), sub_32)>;
-def : Pat<(and (ldxr_2 am_noindex:$addr), 0xffff),
-          (SUBREG_TO_REG (i64 0), (LDXRH am_noindex:$addr), sub_32)>;
-def : Pat<(and (ldxr_4 am_noindex:$addr), 0xffffffff),
-          (SUBREG_TO_REG (i64 0), (LDXRW am_noindex:$addr), sub_32)>;
-
-// Store-exclusives.
-
-def stxr_1 : PatFrag<(ops node:$val, node:$ptr),
-                     (int_arm64_stxr node:$val, node:$ptr), [{
-  return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i8;
-}]>;
-
-def stxr_2 : PatFrag<(ops node:$val, node:$ptr),
-                     (int_arm64_stxr node:$val, node:$ptr), [{
-  return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i16;
-}]>;
-
-def stxr_4 : PatFrag<(ops node:$val, node:$ptr),
-                     (int_arm64_stxr node:$val, node:$ptr), [{
-  return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i32;
-}]>;
-
-def stxr_8 : PatFrag<(ops node:$val, node:$ptr),
-                     (int_arm64_stxr node:$val, node:$ptr), [{
-  return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i64;
-}]>;
-
-def : Pat<(stxr_1 GPR64:$val, am_noindex:$addr),
-          (STXRB (EXTRACT_SUBREG GPR64:$val, sub_32), am_noindex:$addr)>;
-def : Pat<(stxr_2 GPR64:$val, am_noindex:$addr),
-          (STXRH (EXTRACT_SUBREG GPR64:$val, sub_32), am_noindex:$addr)>;
-def : Pat<(stxr_4 GPR64:$val, am_noindex:$addr),
-          (STXRW (EXTRACT_SUBREG GPR64:$val, sub_32), am_noindex:$addr)>;
-def : Pat<(stxr_8 GPR64:$val, am_noindex:$addr),
-          (STXRX GPR64:$val, am_noindex:$addr)>;
-
-def : Pat<(stxr_1 (zext (and GPR32:$val, 0xff)), am_noindex:$addr),
-          (STXRB GPR32:$val, am_noindex:$addr)>;
-def : Pat<(stxr_2 (zext (and GPR32:$val, 0xffff)), am_noindex:$addr),
-          (STXRH GPR32:$val, am_noindex:$addr)>;
-def : Pat<(stxr_4 (zext GPR32:$val), am_noindex:$addr),
-          (STXRW GPR32:$val, am_noindex:$addr)>;
-
-def : Pat<(stxr_1 (and GPR64:$val, 0xff), am_noindex:$addr),
-          (STXRB (EXTRACT_SUBREG GPR64:$val, sub_32), am_noindex:$addr)>;
-def : Pat<(stxr_2 (and GPR64:$val, 0xffff), am_noindex:$addr),
-          (STXRH (EXTRACT_SUBREG GPR64:$val, sub_32), am_noindex:$addr)>;
-def : Pat<(stxr_4 (and GPR64:$val, 0xffffffff), am_noindex:$addr),
-          (STXRW (EXTRACT_SUBREG GPR64:$val, sub_32), am_noindex:$addr)>;
-
-
-// And clear exclusive.
-
-def : Pat<(int_arm64_clrex), (CLREX 0xf)>;
diff --git a/lib/Target/ARM64/ARM64InstrFormats.td b/lib/Target/ARM64/ARM64InstrFormats.td
deleted file mode 100644
index 38406f8..0000000
--- a/lib/Target/ARM64/ARM64InstrFormats.td
+++ /dev/null
@@ -1,8193 +0,0 @@
-//===- ARM64InstrFormats.td - ARM64 Instruction Formats ------*- tblgen -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-//  Describe ARM64 instructions format here
-//
-
-// Format specifies the encoding used by the instruction.  This is part of the
-// ad-hoc solution used to emit machine instruction encodings by our machine
-// code emitter.
-class Format<bits<2> val> {
-  bits<2> Value = val;
-}
-
-def PseudoFrm   : Format<0>;
-def NormalFrm   : Format<1>; // Do we need any others?
-
-// ARM64 Instruction Format
-class ARM64Inst<Format f, string cstr> : Instruction {
-  field bits<32> Inst; // Instruction encoding.
-  // Mask of bits that cause an encoding to be UNPREDICTABLE.
-  // If a bit is set, then if the corresponding bit in the
-  // target encoding differs from its value in the "Inst" field,
-  // the instruction is UNPREDICTABLE (SoftFail in abstract parlance).
-  field bits<32> Unpredictable = 0;
-  // SoftFail is the generic name for this field, but we alias it so
-  // as to make it more obvious what it means in ARM-land.
-  field bits<32> SoftFail = Unpredictable;
-  let Namespace   = "ARM64";
-  Format F        = f;
-  bits<2> Form    = F.Value;
-  let Pattern     = [];
-  let Constraints = cstr;
-}
-
-// Pseudo instructions (don't have encoding information)
-class Pseudo<dag oops, dag iops, list<dag> pattern, string cstr = "">
-    : ARM64Inst<PseudoFrm, cstr> {
-  dag OutOperandList = oops;
-  dag InOperandList  = iops;
-  let Pattern        = pattern;
-  let isCodeGenOnly  = 1;
-}
-
-// Real instructions (have encoding information)
-class EncodedI<string cstr, list<dag> pattern> : ARM64Inst<NormalFrm, cstr> {
-  let Pattern = pattern;
-  let Size = 4;
-}
-
-// Normal instructions
-class I<dag oops, dag iops, string asm, string operands, string cstr,
-        list<dag> pattern>
-    : EncodedI<cstr, pattern> {
-  dag OutOperandList = oops;
-  dag InOperandList  = iops;
-  let AsmString      = !strconcat(asm, operands);
-}
-
-class TriOpFrag<dag res> : PatFrag<(ops node:$LHS, node:$MHS, node:$RHS), res>;
-class BinOpFrag<dag res> : PatFrag<(ops node:$LHS, node:$RHS), res>;
-class UnOpFrag<dag res>  : PatFrag<(ops node:$LHS), res>;
-
-// Helper fragment for an extract of the high portion of a 128-bit vector.
-def extract_high_v16i8 :
-   UnOpFrag<(extract_subvector (v16i8 node:$LHS), (i64 8))>;
-def extract_high_v8i16 :
-   UnOpFrag<(extract_subvector (v8i16 node:$LHS), (i64 4))>;
-def extract_high_v4i32 :
-   UnOpFrag<(extract_subvector (v4i32 node:$LHS), (i64 2))>;
-def extract_high_v2i64 :
-   UnOpFrag<(extract_subvector (v2i64 node:$LHS), (i64 1))>;
-
-//===----------------------------------------------------------------------===//
-// Asm Operand Classes.
-//
-
-// Shifter operand for arithmetic shifted encodings.
-def ShifterOperand : AsmOperandClass {
-  let Name = "Shifter";
-}
-
-// Shifter operand for mov immediate encodings.
-def MovImm32ShifterOperand : AsmOperandClass {
-  let SuperClasses = [ShifterOperand];
-  let Name = "MovImm32Shifter";
-}
-def MovImm64ShifterOperand : AsmOperandClass {
-  let SuperClasses = [ShifterOperand];
-  let Name = "MovImm64Shifter";
-}
-
-// Shifter operand for arithmetic register shifted encodings.
-def ArithmeticShifterOperand : AsmOperandClass {
-  let SuperClasses = [ShifterOperand];
-  let Name = "ArithmeticShifter";
-}
-
-// Shifter operand for arithmetic shifted encodings for ADD/SUB instructions.
-def AddSubShifterOperand : AsmOperandClass {
-  let SuperClasses = [ArithmeticShifterOperand];
-  let Name = "AddSubShifter";
-}
-
-// Shifter operand for logical vector 128/64-bit shifted encodings.
-def LogicalVecShifterOperand : AsmOperandClass {
-  let SuperClasses = [ShifterOperand];
-  let Name = "LogicalVecShifter";
-}
-def LogicalVecHalfWordShifterOperand : AsmOperandClass {
-  let SuperClasses = [LogicalVecShifterOperand];
-  let Name = "LogicalVecHalfWordShifter";
-}
-
-// The "MSL" shifter on the vector MOVI instruction.
-def MoveVecShifterOperand : AsmOperandClass {
-  let SuperClasses = [ShifterOperand];
-  let Name = "MoveVecShifter";
-}
-
-// Extend operand for arithmetic encodings.
-def ExtendOperand : AsmOperandClass { let Name = "Extend"; }
-def ExtendOperand64 : AsmOperandClass {
-  let SuperClasses = [ExtendOperand];
-  let Name = "Extend64";
-}
-// 'extend' that's a lsl of a 64-bit register.
-def ExtendOperandLSL64 : AsmOperandClass {
-  let SuperClasses = [ExtendOperand];
-  let Name = "ExtendLSL64";
-}
-
-// 8-bit floating-point immediate encodings.
-def FPImmOperand : AsmOperandClass {
-  let Name = "FPImm";
-  let ParserMethod = "tryParseFPImm";
-}
-
-// 8-bit immediate for AdvSIMD where 64-bit values of the form:
-// aaaaaaaa bbbbbbbb cccccccc dddddddd eeeeeeee ffffffff gggggggg hhhhhhhh
-// are encoded as the eight bit value 'abcdefgh'.
-def SIMDImmType10Operand : AsmOperandClass { let Name = "SIMDImmType10"; }
-
-
-//===----------------------------------------------------------------------===//
-// Operand Definitions.
-//
-
-// ADR[P] instruction labels.
-def AdrpOperand : AsmOperandClass {
-  let Name = "AdrpLabel";
-  let ParserMethod = "tryParseAdrpLabel";
-}
-def adrplabel : Operand<i64> {
-  let EncoderMethod = "getAdrLabelOpValue";
-  let PrintMethod = "printAdrpLabel";
-  let ParserMatchClass = AdrpOperand;
-}
-
-def AdrOperand : AsmOperandClass {
-  let Name = "AdrLabel";
-  let ParserMethod = "tryParseAdrLabel";
-}
-def adrlabel : Operand<i64> {
-  let EncoderMethod = "getAdrLabelOpValue";
-  let ParserMatchClass = AdrOperand;
-}
-
-// simm9 predicate - True if the immediate is in the range [-256, 255].
-def SImm9Operand : AsmOperandClass {
-  let Name = "SImm9";
-  let DiagnosticType = "InvalidMemoryIndexedSImm9";
-}
-def simm9 : Operand<i64>, ImmLeaf<i64, [{ return Imm >= -256 && Imm < 256; }]> {
-  let ParserMatchClass = SImm9Operand;
-}
-
-// simm7s4 predicate - True if the immediate is a multiple of 4 in the range
-// [-256, 252].
-def SImm7s4Operand : AsmOperandClass {
-  let Name = "SImm7s4";
-  let DiagnosticType = "InvalidMemoryIndexed32SImm7";
-}
-def simm7s4 : Operand<i32> {
-  let ParserMatchClass = SImm7s4Operand;
-  let PrintMethod = "printImmScale4";
-}
-
-// simm7s8 predicate - True if the immediate is a multiple of 8 in the range
-// [-512, 504].
-def SImm7s8Operand : AsmOperandClass {
-  let Name = "SImm7s8";
-  let DiagnosticType = "InvalidMemoryIndexed64SImm7";
-}
-def simm7s8 : Operand<i32> {
-  let ParserMatchClass = SImm7s8Operand;
-  let PrintMethod = "printImmScale8";
-}
-
-// simm7s16 predicate - True if the immediate is a multiple of 16 in the range
-// [-1024, 1008].
-def SImm7s16Operand : AsmOperandClass {
-  let Name = "SImm7s16";
-  let DiagnosticType = "InvalidMemoryIndexed64SImm7";
-}
-def simm7s16 : Operand<i32> {
-  let ParserMatchClass = SImm7s16Operand;
-  let PrintMethod = "printImmScale16";
-}
-
-// imm0_65535 predicate - True if the immediate is in the range [0,65535].
-def Imm0_65535Operand : AsmOperandClass { let Name = "Imm0_65535"; }
-def imm0_65535 : Operand<i32>, ImmLeaf<i32, [{
-  return ((uint32_t)Imm) < 65536;
-}]> {
-  let ParserMatchClass = Imm0_65535Operand;
-}
-
-def Imm1_8Operand : AsmOperandClass {
-  let Name = "Imm1_8";
-  let DiagnosticType = "InvalidImm1_8";
-}
-def Imm1_16Operand : AsmOperandClass {
-  let Name = "Imm1_16";
-  let DiagnosticType = "InvalidImm1_16";
-}
-def Imm1_32Operand : AsmOperandClass {
-  let Name = "Imm1_32";
-  let DiagnosticType = "InvalidImm1_32";
-}
-def Imm1_64Operand : AsmOperandClass {
-  let Name = "Imm1_64";
-  let DiagnosticType = "InvalidImm1_64";
-}
-
-def MovZSymbolG3AsmOperand : AsmOperandClass {
-  let Name = "MovZSymbolG3";
-  let RenderMethod = "addImmOperands";
-}
-
-def movz_symbol_g3 : Operand<i32> {
-  let ParserMatchClass = MovZSymbolG3AsmOperand;
-}
-
-def MovZSymbolG2AsmOperand : AsmOperandClass {
-  let Name = "MovZSymbolG2";
-  let RenderMethod = "addImmOperands";
-}
-
-def movz_symbol_g2 : Operand<i32> {
-  let ParserMatchClass = MovZSymbolG2AsmOperand;
-}
-
-def MovZSymbolG1AsmOperand : AsmOperandClass {
-  let Name = "MovZSymbolG1";
-  let RenderMethod = "addImmOperands";
-}
-
-def movz_symbol_g1 : Operand<i32> {
-  let ParserMatchClass = MovZSymbolG1AsmOperand;
-}
-
-def MovZSymbolG0AsmOperand : AsmOperandClass {
-  let Name = "MovZSymbolG0";
-  let RenderMethod = "addImmOperands";
-}
-
-def movz_symbol_g0 : Operand<i32> {
-  let ParserMatchClass = MovZSymbolG0AsmOperand;
-}
-
-def MovKSymbolG2AsmOperand : AsmOperandClass {
-  let Name = "MovKSymbolG2";
-  let RenderMethod = "addImmOperands";
-}
-
-def movk_symbol_g2 : Operand<i32> {
-  let ParserMatchClass = MovKSymbolG2AsmOperand;
-}
-
-def MovKSymbolG1AsmOperand : AsmOperandClass {
-  let Name = "MovKSymbolG1";
-  let RenderMethod = "addImmOperands";
-}
-
-def movk_symbol_g1 : Operand<i32> {
-  let ParserMatchClass = MovKSymbolG1AsmOperand;
-}
-
-def MovKSymbolG0AsmOperand : AsmOperandClass {
-  let Name = "MovKSymbolG0";
-  let RenderMethod = "addImmOperands";
-}
-
-def movk_symbol_g0 : Operand<i32> {
-  let ParserMatchClass = MovKSymbolG0AsmOperand;
-}
-
-def fixedpoint32 : Operand<i32> {
-  let EncoderMethod = "getFixedPointScaleOpValue";
-  let DecoderMethod = "DecodeFixedPointScaleImm";
-  let ParserMatchClass = Imm1_32Operand;
-}
-def fixedpoint64 : Operand<i64> {
-  let EncoderMethod = "getFixedPointScaleOpValue";
-  let DecoderMethod = "DecodeFixedPointScaleImm";
-  let ParserMatchClass = Imm1_64Operand;
-}
-
-def vecshiftR8 : Operand<i32>, ImmLeaf<i32, [{
-  return (((uint32_t)Imm) > 0) && (((uint32_t)Imm) < 9);
-}]> {
-  let EncoderMethod = "getVecShiftR8OpValue";
-  let DecoderMethod = "DecodeVecShiftR8Imm";
-  let ParserMatchClass = Imm1_8Operand;
-}
-def vecshiftR16 : Operand<i32>, ImmLeaf<i32, [{
-  return (((uint32_t)Imm) > 0) && (((uint32_t)Imm) < 17);
-}]> {
-  let EncoderMethod = "getVecShiftR16OpValue";
-  let DecoderMethod = "DecodeVecShiftR16Imm";
-  let ParserMatchClass = Imm1_16Operand;
-}
-def vecshiftR16Narrow : Operand<i32>, ImmLeaf<i32, [{
-  return (((uint32_t)Imm) > 0) && (((uint32_t)Imm) < 9);
-}]> {
-  let EncoderMethod = "getVecShiftR16OpValue";
-  let DecoderMethod = "DecodeVecShiftR16ImmNarrow";
-  let ParserMatchClass = Imm1_8Operand;
-}
-def vecshiftR32 : Operand<i32>, ImmLeaf<i32, [{
-  return (((uint32_t)Imm) > 0) && (((uint32_t)Imm) < 33);
-}]> {
-  let EncoderMethod = "getVecShiftR32OpValue";
-  let DecoderMethod = "DecodeVecShiftR32Imm";
-  let ParserMatchClass = Imm1_32Operand;
-}
-def vecshiftR32Narrow : Operand<i32>, ImmLeaf<i32, [{
-  return (((uint32_t)Imm) > 0) && (((uint32_t)Imm) < 17);
-}]> {
-  let EncoderMethod = "getVecShiftR32OpValue";
-  let DecoderMethod = "DecodeVecShiftR32ImmNarrow";
-  let ParserMatchClass = Imm1_16Operand;
-}
-def vecshiftR64 : Operand<i32>, ImmLeaf<i32, [{
-  return (((uint32_t)Imm) > 0) && (((uint32_t)Imm) < 65);
-}]> {
-  let EncoderMethod = "getVecShiftR64OpValue";
-  let DecoderMethod = "DecodeVecShiftR64Imm";
-  let ParserMatchClass = Imm1_64Operand;
-}
-def vecshiftR64Narrow : Operand<i32>, ImmLeaf<i32, [{
-  return (((uint32_t)Imm) > 0) && (((uint32_t)Imm) < 33);
-}]> {
-  let EncoderMethod = "getVecShiftR64OpValue";
-  let DecoderMethod = "DecodeVecShiftR64ImmNarrow";
-  let ParserMatchClass = Imm1_32Operand;
-}
-
-def Imm0_7Operand : AsmOperandClass { let Name = "Imm0_7"; }
-def Imm0_15Operand : AsmOperandClass { let Name = "Imm0_15"; }
-def Imm0_31Operand : AsmOperandClass { let Name = "Imm0_31"; }
-def Imm0_63Operand : AsmOperandClass { let Name = "Imm0_63"; }
-
-def vecshiftL8 : Operand<i32>, ImmLeaf<i32, [{
-  return (((uint32_t)Imm) < 8);
-}]> {
-  let EncoderMethod = "getVecShiftL8OpValue";
-  let DecoderMethod = "DecodeVecShiftL8Imm";
-  let ParserMatchClass = Imm0_7Operand;
-}
-def vecshiftL16 : Operand<i32>, ImmLeaf<i32, [{
-  return (((uint32_t)Imm) < 16);
-}]> {
-  let EncoderMethod = "getVecShiftL16OpValue";
-  let DecoderMethod = "DecodeVecShiftL16Imm";
-  let ParserMatchClass = Imm0_15Operand;
-}
-def vecshiftL32 : Operand<i32>, ImmLeaf<i32, [{
-  return (((uint32_t)Imm) < 32);
-}]> {
-  let EncoderMethod = "getVecShiftL32OpValue";
-  let DecoderMethod = "DecodeVecShiftL32Imm";
-  let ParserMatchClass = Imm0_31Operand;
-}
-def vecshiftL64 : Operand<i32>, ImmLeaf<i32, [{
-  return (((uint32_t)Imm) < 64);
-}]> {
-  let EncoderMethod = "getVecShiftL64OpValue";
-  let DecoderMethod = "DecodeVecShiftL64Imm";
-  let ParserMatchClass = Imm0_63Operand;
-}
-
-
-// Crazy immediate formats used by 32-bit and 64-bit logical immediate
-// instructions for splatting repeating bit patterns across the immediate.
-def logical_imm32_XFORM : SDNodeXForm<imm, [{
-  uint64_t enc = ARM64_AM::encodeLogicalImmediate(N->getZExtValue(), 32);
-  return CurDAG->getTargetConstant(enc, MVT::i32);
-}]>;
-def logical_imm64_XFORM : SDNodeXForm<imm, [{
-  uint64_t enc = ARM64_AM::encodeLogicalImmediate(N->getZExtValue(), 64);
-  return CurDAG->getTargetConstant(enc, MVT::i32);
-}]>;
-
-def LogicalImm32Operand : AsmOperandClass { let Name = "LogicalImm32"; }
-def LogicalImm64Operand : AsmOperandClass { let Name = "LogicalImm64"; }
-def logical_imm32 : Operand<i32>, PatLeaf<(imm), [{
-  return ARM64_AM::isLogicalImmediate(N->getZExtValue(), 32);
-}], logical_imm32_XFORM> {
-  let PrintMethod = "printLogicalImm32";
-  let ParserMatchClass = LogicalImm32Operand;
-}
-def logical_imm64 : Operand<i64>, PatLeaf<(imm), [{
-  return ARM64_AM::isLogicalImmediate(N->getZExtValue(), 64);
-}], logical_imm64_XFORM> {
-  let PrintMethod = "printLogicalImm64";
-  let ParserMatchClass = LogicalImm64Operand;
-}
-
-// imm0_255 predicate - True if the immediate is in the range [0,255].
-def Imm0_255Operand : AsmOperandClass { let Name = "Imm0_255"; }
-def imm0_255 : Operand<i32>, ImmLeaf<i32, [{
-  return ((uint32_t)Imm) < 256;
-}]> {
-  let ParserMatchClass = Imm0_255Operand;
-}
-
-// imm0_127 predicate - True if the immediate is in the range [0,127]
-def Imm0_127Operand : AsmOperandClass { let Name = "Imm0_127"; }
-def imm0_127 : Operand<i32>, ImmLeaf<i32, [{
-  return ((uint32_t)Imm) < 128;
-}]> {
-  let ParserMatchClass = Imm0_127Operand;
-}
-
-// NOTE: These imm0_N operands have to be of type i64 because i64 is the size
-// for all shift-amounts.
-
-// imm0_63 predicate - True if the immediate is in the range [0,63]
-def imm0_63 : Operand<i64>, ImmLeaf<i64, [{
-  return ((uint64_t)Imm) < 64;
-}]> {
-  let ParserMatchClass = Imm0_63Operand;
-}
-
-// imm0_31 predicate - True if the immediate is in the range [0,31]
-def imm0_31 : Operand<i64>, ImmLeaf<i64, [{
-  return ((uint64_t)Imm) < 32;
-}]> {
-  let ParserMatchClass = Imm0_31Operand;
-}
-
-// imm0_15 predicate - True if the immediate is in the range [0,15]
-def imm0_15 : Operand<i64>, ImmLeaf<i64, [{
-  return ((uint64_t)Imm) < 16;
-}]> {
-  let ParserMatchClass = Imm0_15Operand;
-}
-
-// imm0_7 predicate - True if the immediate is in the range [0,7]
-def imm0_7 : Operand<i64>, ImmLeaf<i64, [{
-  return ((uint64_t)Imm) < 8;
-}]> {
-  let ParserMatchClass = Imm0_7Operand;
-}
-
-// An arithmetic shifter operand:
-//  {7-6} - shift type: 00 = lsl, 01 = lsr, 10 = asr
-//  {5-0} - imm6
-def arith_shift : Operand<i32> {
-  let PrintMethod = "printShifter";
-  let ParserMatchClass = ArithmeticShifterOperand;
-}
-
-class arith_shifted_reg<ValueType Ty, RegisterClass regclass>
-    : Operand<Ty>,
-      ComplexPattern<Ty, 2, "SelectArithShiftedRegister", []> {
-  let PrintMethod = "printShiftedRegister";
-  let MIOperandInfo = (ops regclass, arith_shift);
-}
-
-def arith_shifted_reg32 : arith_shifted_reg<i32, GPR32>;
-def arith_shifted_reg64 : arith_shifted_reg<i64, GPR64>;
-
-// An arithmetic shifter operand:
-//  {7-6} - shift type: 00 = lsl, 01 = lsr, 10 = asr, 11 = ror
-//  {5-0} - imm6
-def logical_shift : Operand<i32> {
-  let PrintMethod = "printShifter";
-  let ParserMatchClass = ShifterOperand;
-}
-
-class logical_shifted_reg<ValueType Ty, RegisterClass regclass>
-    : Operand<Ty>,
-      ComplexPattern<Ty, 2, "SelectLogicalShiftedRegister", []> {
-  let PrintMethod = "printShiftedRegister";
-  let MIOperandInfo = (ops regclass, logical_shift);
-}
-
-def logical_shifted_reg32 : logical_shifted_reg<i32, GPR32>;
-def logical_shifted_reg64 : logical_shifted_reg<i64, GPR64>;
-
-// A logical vector shifter operand:
-//  {7-6} - shift type: 00 = lsl
-//  {5-0} - imm6: #0, #8, #16, or #24
-def logical_vec_shift : Operand<i32> {
-  let PrintMethod = "printShifter";
-  let EncoderMethod = "getVecShifterOpValue";
-  let ParserMatchClass = LogicalVecShifterOperand;
-}
-
-// A logical vector half-word shifter operand:
-//  {7-6} - shift type: 00 = lsl
-//  {5-0} - imm6: #0 or #8
-def logical_vec_hw_shift : Operand<i32> {
-  let PrintMethod = "printShifter";
-  let EncoderMethod = "getVecShifterOpValue";
-  let ParserMatchClass = LogicalVecHalfWordShifterOperand;
-}
-
-// A vector move shifter operand:
-//  {0} - imm1: #8 or #16
-def move_vec_shift : Operand<i32> {
-  let PrintMethod = "printShifter";
-  let EncoderMethod = "getMoveVecShifterOpValue";
-  let ParserMatchClass = MoveVecShifterOperand;
-}
-
-// An ADD/SUB immediate shifter operand:
-//  {7-6} - shift type: 00 = lsl
-//  {5-0} - imm6: #0 or #12
-def addsub_shift : Operand<i32> {
-  let ParserMatchClass = AddSubShifterOperand;
-}
-
-class addsub_shifted_imm<ValueType Ty>
-    : Operand<Ty>, ComplexPattern<Ty, 2, "SelectArithImmed", [imm]> {
-  let PrintMethod = "printAddSubImm";
-  let EncoderMethod = "getAddSubImmOpValue";
-  let MIOperandInfo = (ops i32imm, addsub_shift);
-}
-
-def addsub_shifted_imm32 : addsub_shifted_imm<i32>;
-def addsub_shifted_imm64 : addsub_shifted_imm<i64>;
-
-class neg_addsub_shifted_imm<ValueType Ty>
-    : Operand<Ty>, ComplexPattern<Ty, 2, "SelectNegArithImmed", [imm]> {
-  let PrintMethod = "printAddSubImm";
-  let EncoderMethod = "getAddSubImmOpValue";
-  let MIOperandInfo = (ops i32imm, addsub_shift);
-}
-
-def neg_addsub_shifted_imm32 : neg_addsub_shifted_imm<i32>;
-def neg_addsub_shifted_imm64 : neg_addsub_shifted_imm<i64>;
-
-// An extend operand:
-//  {5-3} - extend type
-//  {2-0} - imm3
-def arith_extend : Operand<i32> {
-  let PrintMethod = "printExtend";
-  let ParserMatchClass = ExtendOperand;
-}
-def arith_extend64 : Operand<i32> {
-  let PrintMethod = "printExtend";
-  let ParserMatchClass = ExtendOperand64;
-}
-
-// 'extend' that's a lsl of a 64-bit register.
-def arith_extendlsl64 : Operand<i32> {
-  let PrintMethod = "printExtend";
-  let ParserMatchClass = ExtendOperandLSL64;
-}
-
-class arith_extended_reg32<ValueType Ty> : Operand<Ty>,
-                    ComplexPattern<Ty, 2, "SelectArithExtendedRegister", []> {
-  let PrintMethod = "printExtendedRegister";
-  let MIOperandInfo = (ops GPR32, arith_extend);
-}
-
-class arith_extended_reg32to64<ValueType Ty> : Operand<Ty>,
-                    ComplexPattern<Ty, 2, "SelectArithExtendedRegister", []> {
-  let PrintMethod = "printExtendedRegister";
-  let MIOperandInfo = (ops GPR32, arith_extend64);
-}
-
-// Floating-point immediate.
-def fpimm32 : Operand<f32>,
-              PatLeaf<(f32 fpimm), [{
-      return ARM64_AM::getFP32Imm(N->getValueAPF()) != -1;
-    }], SDNodeXForm<fpimm, [{
-      APFloat InVal = N->getValueAPF();
-      uint32_t enc = ARM64_AM::getFP32Imm(InVal);
-      return CurDAG->getTargetConstant(enc, MVT::i32);
-    }]>> {
-  let ParserMatchClass = FPImmOperand;
-  let PrintMethod = "printFPImmOperand";
-}
-def fpimm64 : Operand<f64>,
-              PatLeaf<(f64 fpimm), [{
-      return ARM64_AM::getFP64Imm(N->getValueAPF()) != -1;
-    }], SDNodeXForm<fpimm, [{
-      APFloat InVal = N->getValueAPF();
-      uint32_t enc = ARM64_AM::getFP64Imm(InVal);
-      return CurDAG->getTargetConstant(enc, MVT::i32);
-    }]>> {
-  let ParserMatchClass = FPImmOperand;
-  let PrintMethod = "printFPImmOperand";
-}
-
-def fpimm8 : Operand<i32> {
-  let ParserMatchClass = FPImmOperand;
-  let PrintMethod = "printFPImmOperand";
-}
-
-def fpimm0 : PatLeaf<(fpimm), [{
-  return N->isExactlyValue(+0.0);
-}]>;
-
-// 8-bit immediate for AdvSIMD where 64-bit values of the form:
-// aaaaaaaa bbbbbbbb cccccccc dddddddd eeeeeeee ffffffff gggggggg hhhhhhhh
-// are encoded as the eight bit value 'abcdefgh'.
-def simdimmtype10 : Operand<i32>,
-                    PatLeaf<(f64 fpimm), [{
-      return ARM64_AM::isAdvSIMDModImmType10(N->getValueAPF()
-                                               .bitcastToAPInt()
-                                               .getZExtValue());
-    }], SDNodeXForm<fpimm, [{
-      APFloat InVal = N->getValueAPF();
-      uint32_t enc = ARM64_AM::encodeAdvSIMDModImmType10(N->getValueAPF()
-                                                           .bitcastToAPInt()
-                                                           .getZExtValue());
-      return CurDAG->getTargetConstant(enc, MVT::i32);
-    }]>> {
-  let ParserMatchClass = SIMDImmType10Operand;
-  let PrintMethod = "printSIMDType10Operand";
-}
-
-
-//---
-// Sytem management
-//---
-
-// Base encoding for system instruction operands.
-let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in
-class BaseSystemI<bit L, dag oops, dag iops, string asm, string operands>
-    : I<oops, iops, asm, operands, "", []> {
-  let Inst{31-22} = 0b1101010100;
-  let Inst{21}    = L;
-}
-
-// System instructions which do not have an Rt register.
-class SimpleSystemI<bit L, dag iops, string asm, string operands>
-    : BaseSystemI<L, (outs), iops, asm, operands> {
-  let Inst{4-0} = 0b11111;
-}
-
-// System instructions which have an Rt register.
-class RtSystemI<bit L, dag oops, dag iops, string asm, string operands>
-    : BaseSystemI<L, oops, iops, asm, operands>,
-      Sched<[WriteSys]> {
-  bits<5> Rt;
-  let Inst{4-0} = Rt;
-}
-
-// Hint instructions that take both a CRm and a 3-bit immediate.
-class HintI<string mnemonic>
-    : SimpleSystemI<0, (ins imm0_127:$imm), mnemonic#" $imm", "">,
-      Sched<[WriteHint]> {
-  bits <7> imm;
-  let Inst{20-12} = 0b000110010;
-  let Inst{11-5} = imm;
-}
-
-// System instructions taking a single literal operand which encodes into
-// CRm. op2 differentiates the opcodes.
-def BarrierAsmOperand : AsmOperandClass {
-  let Name = "Barrier";
-  let ParserMethod = "tryParseBarrierOperand";
-}
-def barrier_op : Operand<i32> {
-  let PrintMethod = "printBarrierOption";
-  let ParserMatchClass = BarrierAsmOperand;
-}
-class CRmSystemI<Operand crmtype, bits<3> opc, string asm>
-    : SimpleSystemI<0, (ins crmtype:$CRm), asm, "\t$CRm">,
-      Sched<[WriteBarrier]> {
-  bits<4> CRm;
-  let Inst{20-12} = 0b000110011;
-  let Inst{11-8} = CRm;
-  let Inst{7-5} = opc;
-}
-
-// MRS/MSR system instructions.
-def SystemRegisterOperand : AsmOperandClass {
-  let Name = "SystemRegister";
-  let ParserMethod = "tryParseSystemRegister";
-}
-// concatenation of 1, op0, op1, CRn, CRm, op2. 16-bit immediate.
-def sysreg_op : Operand<i32> {
-  let ParserMatchClass = SystemRegisterOperand;
-  let DecoderMethod = "DecodeSystemRegister";
-  let PrintMethod = "printSystemRegister";
-}
-
-class MRSI : RtSystemI<1, (outs GPR64:$Rt), (ins sysreg_op:$systemreg),
-                       "mrs", "\t$Rt, $systemreg"> {
-  bits<15> systemreg;
-  let Inst{20} = 1;
-  let Inst{19-5} = systemreg;
-}
-
-// FIXME: Some of these def CPSR, others don't. Best way to model that?
-// Explicitly modeling each of the system register as a register class
-// would do it, but feels like overkill at this point.
-class MSRI : RtSystemI<0, (outs), (ins sysreg_op:$systemreg, GPR64:$Rt),
-                       "msr", "\t$systemreg, $Rt"> {
-  bits<15> systemreg;
-  let Inst{20} = 1;
-  let Inst{19-5} = systemreg;
-}
-
-def SystemCPSRFieldOperand : AsmOperandClass {
-  let Name = "SystemCPSRField";
-  let ParserMethod = "tryParseCPSRField";
-}
-def cpsrfield_op : Operand<i32> {
-  let ParserMatchClass = SystemCPSRFieldOperand;
-  let PrintMethod = "printSystemCPSRField";
-}
-
-let Defs = [CPSR] in
-class MSRcpsrI : SimpleSystemI<0, (ins cpsrfield_op:$cpsr_field, imm0_15:$imm),
-                               "msr", "\t$cpsr_field, $imm">,
-                 Sched<[WriteSys]> {
-  bits<6> cpsrfield;
-  bits<4> imm;
-  let Inst{20-19} = 0b00;
-  let Inst{18-16} = cpsrfield{5-3};
-  let Inst{15-12} = 0b0100;
-  let Inst{11-8} = imm;
-  let Inst{7-5} = cpsrfield{2-0};
-
-  let DecoderMethod = "DecodeSystemCPSRInstruction";
-}
-
-// SYS and SYSL generic system instructions.
-def SysCRAsmOperand : AsmOperandClass {
-  let Name = "SysCR";
-  let ParserMethod = "tryParseSysCROperand";
-}
-
-def sys_cr_op : Operand<i32> {
-  let PrintMethod = "printSysCROperand";
-  let ParserMatchClass = SysCRAsmOperand;
-}
-
-class SystemI<bit L, string asm>
-  : SimpleSystemI<L,
-                  (ins imm0_7:$op1, sys_cr_op:$Cn, sys_cr_op:$Cm, imm0_7:$op2),
-                   asm, "\t$op1, $Cn, $Cm, $op2">,
-    Sched<[WriteSys]> {
-  bits<3> op1;
-  bits<4> Cn;
-  bits<4> Cm;
-  bits<3> op2;
-  let Inst{20-19} = 0b01;
-  let Inst{18-16} = op1;
-  let Inst{15-12} = Cn;
-  let Inst{11-8}  = Cm;
-  let Inst{7-5}   = op2;
-}
-
-class SystemXtI<bit L, string asm>
-  : RtSystemI<L, (outs),
-       (ins imm0_7:$op1, sys_cr_op:$Cn, sys_cr_op:$Cm, imm0_7:$op2, GPR64:$Rt),
-       asm, "\t$op1, $Cn, $Cm, $op2, $Rt"> {
-  bits<3> op1;
-  bits<4> Cn;
-  bits<4> Cm;
-  bits<3> op2;
-  let Inst{20-19} = 0b01;
-  let Inst{18-16} = op1;
-  let Inst{15-12} = Cn;
-  let Inst{11-8}  = Cm;
-  let Inst{7-5}   = op2;
-}
-
-class SystemLXtI<bit L, string asm>
-  : RtSystemI<L, (outs),
-       (ins GPR64:$Rt, imm0_7:$op1, sys_cr_op:$Cn, sys_cr_op:$Cm, imm0_7:$op2),
-       asm, "\t$Rt, $op1, $Cn, $Cm, $op2"> {
-  bits<3> op1;
-  bits<4> Cn;
-  bits<4> Cm;
-  bits<3> op2;
-  let Inst{20-19} = 0b01;
-  let Inst{18-16} = op1;
-  let Inst{15-12} = Cn;
-  let Inst{11-8}  = Cm;
-  let Inst{7-5}   = op2;
-}
-
-
-// Branch (register) instructions:
-//
-//  case opc of
-//    0001 blr
-//    0000 br
-//    0101 dret
-//    0100 eret
-//    0010 ret
-//    otherwise UNDEFINED
-class BaseBranchReg<bits<4> opc, dag oops, dag iops, string asm,
-                    string operands, list<dag> pattern>
-    : I<oops, iops, asm, operands, "", pattern>, Sched<[WriteBrReg]> {
-  let Inst{31-25} = 0b1101011;
-  let Inst{24-21} = opc;
-  let Inst{20-16} = 0b11111;
-  let Inst{15-10} = 0b000000;
-  let Inst{4-0}   = 0b00000;
-}
-
-class BranchReg<bits<4> opc, string asm, list<dag> pattern>
-    : BaseBranchReg<opc, (outs), (ins GPR64:$Rn), asm, "\t$Rn", pattern> {
-  bits<5> Rn;
-  let Inst{9-5} = Rn;
-}
-
-let mayLoad = 0, mayStore = 0, hasSideEffects = 1, isReturn = 1 in
-class SpecialReturn<bits<4> opc, string asm>
-    : BaseBranchReg<opc, (outs), (ins), asm, "", []> {
-  let Inst{9-5} = 0b11111;
-}
-
-//---
-// Conditional branch instruction.
-//---
-// Branch condition code.
-// 4-bit immediate. Pretty-printed as .<cc>
-def dotCcode : Operand<i32> {
-  let PrintMethod = "printDotCondCode";
-}
-
-// Conditional branch target. 19-bit immediate. The low two bits of the target
-// offset are implied zero and so are not part of the immediate.
-def BranchTarget19Operand : AsmOperandClass {
-  let Name = "BranchTarget19";
-}
-def am_brcond : Operand<OtherVT> {
-  let EncoderMethod = "getCondBranchTargetOpValue";
-  let DecoderMethod = "DecodeCondBranchTarget";
-  let PrintMethod = "printAlignedBranchTarget";
-  let ParserMatchClass = BranchTarget19Operand;
-}
-
-class BranchCond : I<(outs), (ins dotCcode:$cond, am_brcond:$target),
-                     "b", "$cond\t$target", "",
-                     [(ARM64brcond bb:$target, imm:$cond, CPSR)]>,
-                   Sched<[WriteBr]> {
-  let isBranch = 1;
-  let isTerminator = 1;
-  let Uses = [CPSR];
-
-  bits<4> cond;
-  bits<19> target;
-  let Inst{31-24} = 0b01010100;
-  let Inst{23-5} = target;
-  let Inst{4} = 0;
-  let Inst{3-0} = cond;
-}
-
-//---
-// Compare-and-branch instructions.
-//---
-class BaseCmpBranch<RegisterClass regtype, bit op, string asm, SDNode node>
-    : I<(outs), (ins regtype:$Rt, am_brcond:$target),
-         asm, "\t$Rt, $target", "",
-         [(node regtype:$Rt, bb:$target)]>,
-      Sched<[WriteBr]> {
-  let isBranch = 1;
-  let isTerminator = 1;
-
-  bits<5> Rt;
-  bits<19> target;
-  let Inst{30-25} = 0b011010;
-  let Inst{24}    = op;
-  let Inst{23-5}  = target;
-  let Inst{4-0}   = Rt;
-}
-
-multiclass CmpBranch<bit op, string asm, SDNode node> {
-  def W : BaseCmpBranch<GPR32, op, asm, node> {
-    let Inst{31} = 0;
-  }
-  def X : BaseCmpBranch<GPR64, op, asm, node> {
-    let Inst{31} = 1;
-  }
-}
-
-//---
-// Test-bit-and-branch instructions.
-//---
-// Test-and-branch target. 14-bit sign-extended immediate. The low two bits of
-// the target offset are implied zero and so are not part of the immediate.
-def BranchTarget14Operand : AsmOperandClass {
-  let Name = "BranchTarget14";
-}
-def am_tbrcond : Operand<OtherVT> {
-  let EncoderMethod = "getTestBranchTargetOpValue";
-  let PrintMethod = "printAlignedBranchTarget";
-  let ParserMatchClass = BranchTarget14Operand;
-}
-
-class TestBranch<bit op, string asm, SDNode node>
-    : I<(outs), (ins GPR64:$Rt, imm0_63:$bit_off, am_tbrcond:$target),
-       asm, "\t$Rt, $bit_off, $target", "",
-       [(node GPR64:$Rt, imm0_63:$bit_off, bb:$target)]>,
-      Sched<[WriteBr]> {
-  let isBranch = 1;
-  let isTerminator = 1;
-
-  bits<5> Rt;
-  bits<6> bit_off;
-  bits<14> target;
-
-  let Inst{31}    = bit_off{5};
-  let Inst{30-25} = 0b011011;
-  let Inst{24}    = op;
-  let Inst{23-19} = bit_off{4-0};
-  let Inst{18-5}  = target;
-  let Inst{4-0}   = Rt;
-
-  let DecoderMethod = "DecodeTestAndBranch";
-}
-
-//---
-// Unconditional branch (immediate) instructions.
-//---
-def BranchTarget26Operand : AsmOperandClass {
-  let Name = "BranchTarget26";
-}
-def am_b_target : Operand<OtherVT> {
-  let EncoderMethod = "getBranchTargetOpValue";
-  let PrintMethod = "printAlignedBranchTarget";
-  let ParserMatchClass = BranchTarget26Operand;
-}
-def am_bl_target : Operand<i64> {
-  let EncoderMethod = "getBranchTargetOpValue";
-  let PrintMethod = "printAlignedBranchTarget";
-  let ParserMatchClass = BranchTarget26Operand;
-}
-
-class BImm<bit op, dag iops, string asm, list<dag> pattern>
-    : I<(outs), iops, asm, "\t$addr", "", pattern>, Sched<[WriteBr]> {
-  bits<26> addr;
-  let Inst{31}    = op;
-  let Inst{30-26} = 0b00101;
-  let Inst{25-0}  = addr;
-
-  let DecoderMethod = "DecodeUnconditionalBranch";
-}
-
-class BranchImm<bit op, string asm, list<dag> pattern>
-    : BImm<op, (ins am_b_target:$addr), asm, pattern>;
-class CallImm<bit op, string asm, list<dag> pattern>
-    : BImm<op, (ins am_bl_target:$addr), asm, pattern>;
-
-//---
-// Basic one-operand data processing instructions.
-//---
-
-let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
-class BaseOneOperandData<bits<3> opc, RegisterClass regtype, string asm,
-                         SDPatternOperator node>
-  : I<(outs regtype:$Rd), (ins regtype:$Rn), asm, "\t$Rd, $Rn", "",
-      [(set regtype:$Rd, (node regtype:$Rn))]>,
-    Sched<[WriteI]> {
-  bits<5> Rd;
-  bits<5> Rn;
-
-  let Inst{30-13} = 0b101101011000000000;
-  let Inst{12-10} = opc;
-  let Inst{9-5}   = Rn;
-  let Inst{4-0}   = Rd;
-}
-
-let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
-multiclass OneOperandData<bits<3> opc, string asm,
-                          SDPatternOperator node = null_frag> {
-  def Wr : BaseOneOperandData<opc, GPR32, asm, node> {
-    let Inst{31} = 0;
-  }
-
-  def Xr : BaseOneOperandData<opc, GPR64, asm, node> {
-    let Inst{31} = 1;
-  }
-}
-
-class OneWRegData<bits<3> opc, string asm, SDPatternOperator node>
-    : BaseOneOperandData<opc, GPR32, asm, node> {
-  let Inst{31} = 0;
-}
-
-class OneXRegData<bits<3> opc, string asm, SDPatternOperator node>
-    : BaseOneOperandData<opc, GPR64, asm, node> {
-  let Inst{31} = 1;
-}
-
-//---
-// Basic two-operand data processing instructions.
-//---
-class BaseBaseAddSubCarry<bit isSub, RegisterClass regtype, string asm,
-                          list<dag> pattern>
-    : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm),
-        asm, "\t$Rd, $Rn, $Rm", "", pattern>,
-      Sched<[WriteI]> {
-  let Uses = [CPSR];
-  bits<5> Rd;
-  bits<5> Rn;
-  bits<5> Rm;
-  let Inst{30}    = isSub;
-  let Inst{28-21} = 0b11010000;
-  let Inst{20-16} = Rm;
-  let Inst{15-10} = 0;
-  let Inst{9-5}   = Rn;
-  let Inst{4-0}   = Rd;
-}
-
-class BaseAddSubCarry<bit isSub, RegisterClass regtype, string asm,
-                      SDNode OpNode>
-    : BaseBaseAddSubCarry<isSub, regtype, asm,
-        [(set regtype:$Rd, (OpNode regtype:$Rn, regtype:$Rm, CPSR))]>;
-
-class BaseAddSubCarrySetFlags<bit isSub, RegisterClass regtype, string asm,
-                              SDNode OpNode>
-    : BaseBaseAddSubCarry<isSub, regtype, asm,
-        [(set regtype:$Rd, (OpNode regtype:$Rn, regtype:$Rm, CPSR)),
-         (implicit CPSR)]> {
-  let Defs = [CPSR];
-}
-
-multiclass AddSubCarry<bit isSub, string asm, string asm_setflags,
-                       SDNode OpNode, SDNode OpNode_setflags> {
-  def Wr : BaseAddSubCarry<isSub, GPR32, asm, OpNode> {
-    let Inst{31} = 0;
-    let Inst{29} = 0;
-  }
-  def Xr : BaseAddSubCarry<isSub, GPR64, asm, OpNode> {
-    let Inst{31} = 1;
-    let Inst{29} = 0;
-  }
-
-  // Sets flags.
-  def SWr : BaseAddSubCarrySetFlags<isSub, GPR32, asm_setflags,
-                                    OpNode_setflags> {
-    let Inst{31} = 0;
-    let Inst{29} = 1;
-  }
-  def SXr : BaseAddSubCarrySetFlags<isSub, GPR64, asm_setflags,
-                                    OpNode_setflags> {
-    let Inst{31} = 1;
-    let Inst{29} = 1;
-  }
-}
-
-class BaseTwoOperand<bits<4> opc, RegisterClass regtype, string asm,
-                     SDPatternOperator OpNode>
-  : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm),
-      asm, "\t$Rd, $Rn, $Rm", "",
-      [(set regtype:$Rd, (OpNode regtype:$Rn, regtype:$Rm))]> {
-  bits<5> Rd;
-  bits<5> Rn;
-  bits<5> Rm;
-  let Inst{30-21} = 0b0011010110;
-  let Inst{20-16} = Rm;
-  let Inst{15-14} = 0b00;
-  let Inst{13-10} = opc;
-  let Inst{9-5}   = Rn;
-  let Inst{4-0}   = Rd;
-}
-
-class BaseDiv<bit isSigned, RegisterClass regtype, string asm,
-              SDPatternOperator OpNode>
-    : BaseTwoOperand<{0,0,1,?}, regtype, asm, OpNode> {
-  let Inst{10}    = isSigned;
-}
-
-multiclass Div<bit isSigned, string asm, SDPatternOperator OpNode> {
-  def Wr : BaseDiv<isSigned, GPR32, asm, OpNode>,
-           Sched<[WriteID32]> {
-    let Inst{31} = 0;
-  }
-  def Xr : BaseDiv<isSigned, GPR64, asm, OpNode>,
-           Sched<[WriteID64]> {
-    let Inst{31} = 1;
-  }
-}
-
-class BaseShift<bits<2> shift_type, RegisterClass regtype, string asm,
-                SDPatternOperator OpNode = null_frag>
-  : BaseTwoOperand<{1,0,?,?}, regtype, asm, OpNode>,
-    Sched<[WriteIS]> {
-  let Inst{11-10} = shift_type;
-}
-
-multiclass Shift<bits<2> shift_type, string asm, SDNode OpNode> {
-  def Wr : BaseShift<shift_type, GPR32, asm> {
-    let Inst{31} = 0;
-  }
-
-  def Xr : BaseShift<shift_type, GPR64, asm, OpNode> {
-    let Inst{31} = 1;
-  }
-
-  def : Pat<(i32 (OpNode GPR32:$Rn, i64:$Rm)),
-            (!cast<Instruction>(NAME # "Wr") GPR32:$Rn,
-                                             (EXTRACT_SUBREG i64:$Rm, sub_32))>;
-
-  def : Pat<(i32 (OpNode GPR32:$Rn, (i64 (zext GPR32:$Rm)))),
-            (!cast<Instruction>(NAME # "Wr") GPR32:$Rn, GPR32:$Rm)>;
-
-  def : Pat<(i32 (OpNode GPR32:$Rn, (i64 (anyext GPR32:$Rm)))),
-            (!cast<Instruction>(NAME # "Wr") GPR32:$Rn, GPR32:$Rm)>;
-
-  def : Pat<(i32 (OpNode GPR32:$Rn, (i64 (sext GPR32:$Rm)))),
-            (!cast<Instruction>(NAME # "Wr") GPR32:$Rn, GPR32:$Rm)>;
-}
-
-class ShiftAlias<string asm, Instruction inst, RegisterClass regtype>
-    : InstAlias<asm#" $dst, $src1, $src2",
-                (inst regtype:$dst, regtype:$src1, regtype:$src2)>;
-
-class BaseMulAccum<bit isSub, bits<3> opc, RegisterClass multype,
-                       RegisterClass addtype, string asm,
-                       list<dag> pattern>
-  : I<(outs addtype:$Rd), (ins multype:$Rn, multype:$Rm, addtype:$Ra),
-      asm, "\t$Rd, $Rn, $Rm, $Ra", "", pattern> {
-  bits<5> Rd;
-  bits<5> Rn;
-  bits<5> Rm;
-  bits<5> Ra;
-  let Inst{30-24} = 0b0011011;
-  let Inst{23-21} = opc;
-  let Inst{20-16} = Rm;
-  let Inst{15}    = isSub;
-  let Inst{14-10} = Ra;
-  let Inst{9-5}   = Rn;
-  let Inst{4-0}   = Rd;
-}
-
-multiclass MulAccum<bit isSub, string asm, SDNode AccNode> {
-  def Wrrr : BaseMulAccum<isSub, 0b000, GPR32, GPR32, asm,
-      [(set GPR32:$Rd, (AccNode GPR32:$Ra, (mul GPR32:$Rn, GPR32:$Rm)))]>,
-      Sched<[WriteIM32]> {
-    let Inst{31} = 0;
-  }
-
-  def Xrrr : BaseMulAccum<isSub, 0b000, GPR64, GPR64, asm,
-      [(set GPR64:$Rd, (AccNode GPR64:$Ra, (mul GPR64:$Rn, GPR64:$Rm)))]>,
-      Sched<[WriteIM64]> {
-    let Inst{31} = 1;
-  }
-}
-
-class WideMulAccum<bit isSub, bits<3> opc, string asm,
-                   SDNode AccNode, SDNode ExtNode>
-  : BaseMulAccum<isSub, opc, GPR32, GPR64, asm,
-    [(set GPR64:$Rd, (AccNode GPR64:$Ra,
-                            (mul (ExtNode GPR32:$Rn), (ExtNode GPR32:$Rm))))]>,
-    Sched<[WriteIM32]> {
-  let Inst{31} = 1;
-}
-
-class MulHi<bits<3> opc, string asm, SDNode OpNode>
-  : I<(outs GPR64:$Rd), (ins GPR64:$Rn, GPR64:$Rm),
-      asm, "\t$Rd, $Rn, $Rm", "",
-      [(set GPR64:$Rd, (OpNode GPR64:$Rn, GPR64:$Rm))]>,
-    Sched<[WriteIM64]> {
-  bits<5> Rd;
-  bits<5> Rn;
-  bits<5> Rm;
-  let Inst{31-24} = 0b10011011;
-  let Inst{23-21} = opc;
-  let Inst{20-16} = Rm;
-  let Inst{15-10} = 0b011111;
-  let Inst{9-5}   = Rn;
-  let Inst{4-0}   = Rd;
-}
-
-class MulAccumWAlias<string asm, Instruction inst>
-    : InstAlias<asm#" $dst, $src1, $src2",
-                (inst GPR32:$dst, GPR32:$src1, GPR32:$src2, WZR)>;
-class MulAccumXAlias<string asm, Instruction inst>
-    : InstAlias<asm#" $dst, $src1, $src2",
-                (inst GPR64:$dst, GPR64:$src1, GPR64:$src2, XZR)>;
-class WideMulAccumAlias<string asm, Instruction inst>
-    : InstAlias<asm#" $dst, $src1, $src2",
-                (inst GPR64:$dst, GPR32:$src1, GPR32:$src2, XZR)>;
-
-class BaseCRC32<bit sf, bits<2> sz, bit C, RegisterClass StreamReg,
-              SDPatternOperator OpNode, string asm>
-  : I<(outs GPR32:$Rd), (ins GPR32:$Rn, StreamReg:$Rm),
-      asm, "\t$Rd, $Rn, $Rm", "",
-      [(set GPR32:$Rd, (OpNode GPR32:$Rn, StreamReg:$Rm))]>,
-    Sched<[WriteISReg]> {
-  bits<5> Rd;
-  bits<5> Rn;
-  bits<5> Rm;
-
-  let Inst{31} = sf;
-  let Inst{30-21} = 0b0011010110;
-  let Inst{20-16} = Rm;
-  let Inst{15-13} = 0b010;
-  let Inst{12} = C;
-  let Inst{11-10} = sz;
-  let Inst{9-5} = Rn;
-  let Inst{4-0} = Rd;
-}
-
-//---
-// Address generation.
-//---
-
-class ADRI<bit page, string asm, Operand adr, list<dag> pattern>
-    : I<(outs GPR64:$Xd), (ins adr:$label), asm, "\t$Xd, $label", "",
-        pattern>,
-      Sched<[WriteI]> {
-  bits<5>  Xd;
-  bits<21> label;
-  let Inst{31}    = page;
-  let Inst{30-29} = label{1-0};
-  let Inst{28-24} = 0b10000;
-  let Inst{23-5}  = label{20-2};
-  let Inst{4-0}   = Xd;
-
-  let DecoderMethod = "DecodeAdrInstruction";
-}
-
-//---
-// Move immediate.
-//---
-
-def movimm32_imm : Operand<i32> {
-  let ParserMatchClass = Imm0_65535Operand;
-  let EncoderMethod = "getMoveWideImmOpValue";
-}
-def movimm32_shift : Operand<i32> {
-  let PrintMethod = "printShifter";
-  let ParserMatchClass = MovImm32ShifterOperand;
-}
-def movimm64_shift : Operand<i32> {
-  let PrintMethod = "printShifter";
-  let ParserMatchClass = MovImm64ShifterOperand;
-}
-let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
-class BaseMoveImmediate<bits<2> opc, RegisterClass regtype, Operand shifter,
-                        string asm>
-  : I<(outs regtype:$Rd), (ins movimm32_imm:$imm, shifter:$shift),
-       asm, "\t$Rd, $imm$shift", "", []>,
-    Sched<[WriteImm]> {
-  bits<5> Rd;
-  bits<16> imm;
-  bits<6> shift;
-  let Inst{30-29} = opc;
-  let Inst{28-23} = 0b100101;
-  let Inst{22-21} = shift{5-4};
-  let Inst{20-5}  = imm;
-  let Inst{4-0}   = Rd;
-
-  let DecoderMethod = "DecodeMoveImmInstruction";
-}
-
-multiclass MoveImmediate<bits<2> opc, string asm> {
-  def Wi : BaseMoveImmediate<opc, GPR32, movimm32_shift, asm> {
-    let Inst{31} = 0;
-  }
-
-  def Xi : BaseMoveImmediate<opc, GPR64, movimm64_shift, asm> {
-    let Inst{31} = 1;
-  }
-}
-
-let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
-class BaseInsertImmediate<bits<2> opc, RegisterClass regtype, Operand shifter,
-                          string asm>
-  : I<(outs regtype:$Rd),
-      (ins regtype:$src, movimm32_imm:$imm, shifter:$shift),
-       asm, "\t$Rd, $imm$shift", "$src = $Rd", []>,
-    Sched<[WriteI]> {
-  bits<5> Rd;
-  bits<16> imm;
-  bits<6> shift;
-  let Inst{30-29} = opc;
-  let Inst{28-23} = 0b100101;
-  let Inst{22-21} = shift{5-4};
-  let Inst{20-5}  = imm;
-  let Inst{4-0}   = Rd;
-
-  let DecoderMethod = "DecodeMoveImmInstruction";
-}
-
-multiclass InsertImmediate<bits<2> opc, string asm> {
-  def Wi : BaseInsertImmediate<opc, GPR32, movimm32_shift, asm> {
-    let Inst{31} = 0;
-  }
-
-  def Xi : BaseInsertImmediate<opc, GPR64, movimm64_shift, asm> {
-    let Inst{31} = 1;
-  }
-}
-
-//---
-// Add/Subtract
-//---
-
-class BaseAddSubImm<bit isSub, bit setFlags, RegisterClass dstRegtype,
-                    RegisterClass srcRegtype, addsub_shifted_imm immtype,
-                    string asm, SDPatternOperator OpNode>
-    : I<(outs dstRegtype:$Rd), (ins srcRegtype:$Rn, immtype:$imm),
-        asm, "\t$Rd, $Rn, $imm", "",
-        [(set dstRegtype:$Rd, (OpNode srcRegtype:$Rn, immtype:$imm))]>,
-      Sched<[WriteI]>  {
-  bits<5>  Rd;
-  bits<5>  Rn;
-  bits<14> imm;
-  let Inst{30}    = isSub;
-  let Inst{29}    = setFlags;
-  let Inst{28-24} = 0b10001;
-  let Inst{23-22} = imm{13-12}; // '00' => lsl #0, '01' => lsl #12
-  let Inst{21-10} = imm{11-0};
-  let Inst{9-5}   = Rn;
-  let Inst{4-0}   = Rd;
-  let DecoderMethod = "DecodeBaseAddSubImm";
-}
-
-class BaseAddSubRegPseudo<RegisterClass regtype,
-                          SDPatternOperator OpNode>
-    : Pseudo<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm),
-             [(set regtype:$Rd, (OpNode regtype:$Rn, regtype:$Rm))]>,
-      Sched<[WriteI]>;
-
-class BaseAddSubSReg<bit isSub, bit setFlags, RegisterClass regtype,
-                     arith_shifted_reg shifted_regtype, string asm,
-                     SDPatternOperator OpNode>
-    : I<(outs regtype:$Rd), (ins regtype:$Rn, shifted_regtype:$Rm),
-        asm, "\t$Rd, $Rn, $Rm", "",
-        [(set regtype:$Rd, (OpNode regtype:$Rn, shifted_regtype:$Rm))]>,
-      Sched<[WriteISReg]> {
-  // The operands are in order to match the 'addr' MI operands, so we
-  // don't need an encoder method and by-name matching. Just use the default
-  // in-order handling. Since we're using by-order, make sure the names
-  // do not match.
-  bits<5> dst;
-  bits<5> src1;
-  bits<5> src2;
-  bits<8> shift;
-  let Inst{30}    = isSub;
-  let Inst{29}    = setFlags;
-  let Inst{28-24} = 0b01011;
-  let Inst{23-22} = shift{7-6};
-  let Inst{21}    = 0;
-  let Inst{20-16} = src2;
-  let Inst{15-10} = shift{5-0};
-  let Inst{9-5}   = src1;
-  let Inst{4-0}   = dst;
-
-  let DecoderMethod = "DecodeThreeAddrSRegInstruction";
-}
-
-class BaseAddSubEReg<bit isSub, bit setFlags, RegisterClass dstRegtype,
-                     RegisterClass src1Regtype, Operand src2Regtype,
-                     string asm, SDPatternOperator OpNode>
-    : I<(outs dstRegtype:$R1),
-        (ins src1Regtype:$R2, src2Regtype:$R3),
-        asm, "\t$R1, $R2, $R3", "",
-        [(set dstRegtype:$R1, (OpNode src1Regtype:$R2, src2Regtype:$R3))]>,
-      Sched<[WriteIEReg]> {
-  bits<5> Rd;
-  bits<5> Rn;
-  bits<5> Rm;
-  bits<6> ext;
-  let Inst{30}    = isSub;
-  let Inst{29}    = setFlags;
-  let Inst{28-24} = 0b01011;
-  let Inst{23-21} = 0b001;
-  let Inst{20-16} = Rm;
-  let Inst{15-13} = ext{5-3};
-  let Inst{12-10} = ext{2-0};
-  let Inst{9-5}   = Rn;
-  let Inst{4-0}   = Rd;
-
-  let DecoderMethod = "DecodeAddSubERegInstruction";
-}
-
-let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
-class BaseAddSubEReg64<bit isSub, bit setFlags, RegisterClass dstRegtype,
-                       RegisterClass src1Regtype, RegisterClass src2Regtype,
-                       Operand ext_op, string asm>
-    : I<(outs dstRegtype:$Rd),
-        (ins src1Regtype:$Rn, src2Regtype:$Rm, ext_op:$ext),
-        asm, "\t$Rd, $Rn, $Rm$ext", "", []>,
-      Sched<[WriteIEReg]> {
-  bits<5> Rd;
-  bits<5> Rn;
-  bits<5> Rm;
-  bits<6> ext;
-  let Inst{30}    = isSub;
-  let Inst{29}    = setFlags;
-  let Inst{28-24} = 0b01011;
-  let Inst{23-21} = 0b001;
-  let Inst{20-16} = Rm;
-  let Inst{15}    = ext{5};
-  let Inst{12-10} = ext{2-0};
-  let Inst{9-5}   = Rn;
-  let Inst{4-0}   = Rd;
-
-  let DecoderMethod = "DecodeAddSubERegInstruction";
-}
-
-// Aliases for register+register add/subtract.
-class AddSubRegAlias<string asm, Instruction inst, RegisterClass dstRegtype,
-                     RegisterClass src1Regtype, RegisterClass src2Regtype,
-                     int shiftExt>
-    : InstAlias<asm#" $dst, $src1, $src2",
-                (inst dstRegtype:$dst, src1Regtype:$src1, src2Regtype:$src2,
-                      shiftExt)>;
-
-multiclass AddSub<bit isSub, string mnemonic,
-                  SDPatternOperator OpNode = null_frag> {
-  let hasSideEffects = 0 in {
-  // Add/Subtract immediate
-  def Wri  : BaseAddSubImm<isSub, 0, GPR32sp, GPR32sp, addsub_shifted_imm32,
-                           mnemonic, OpNode> {
-    let Inst{31} = 0;
-  }
-  def Xri  : BaseAddSubImm<isSub, 0, GPR64sp, GPR64sp, addsub_shifted_imm64,
-                           mnemonic, OpNode> {
-    let Inst{31} = 1;
-  }
-
-  // Add/Subtract register - Only used for CodeGen
-  def Wrr : BaseAddSubRegPseudo<GPR32, OpNode>;
-  def Xrr : BaseAddSubRegPseudo<GPR64, OpNode>;
-
-  // Add/Subtract shifted register
-  def Wrs : BaseAddSubSReg<isSub, 0, GPR32, arith_shifted_reg32, mnemonic,
-                           OpNode> {
-    let Inst{31} = 0;
-  }
-  def Xrs : BaseAddSubSReg<isSub, 0, GPR64, arith_shifted_reg64, mnemonic,
-                           OpNode> {
-    let Inst{31} = 1;
-  }
-  }
-
-  // Add/Subtract extended register
-  let AddedComplexity = 1, hasSideEffects = 0 in {
-  def Wrx : BaseAddSubEReg<isSub, 0, GPR32sp, GPR32sp,
-                           arith_extended_reg32<i32>, mnemonic, OpNode> {
-    let Inst{31} = 0;
-  }
-  def Xrx : BaseAddSubEReg<isSub, 0, GPR64sp, GPR64sp,
-                           arith_extended_reg32to64<i64>, mnemonic, OpNode> {
-    let Inst{31} = 1;
-  }
-  }
-
-  def Xrx64 : BaseAddSubEReg64<isSub, 0, GPR64sp, GPR64sp, GPR64,
-                               arith_extendlsl64, mnemonic> {
-    // UXTX and SXTX only.
-    let Inst{14-13} = 0b11;
-    let Inst{31} = 1;
-  }
-
-  // Register/register aliases with no shift when SP is not used.
-  def : AddSubRegAlias<mnemonic, !cast<Instruction>(NAME#"Wrs"),
-                       GPR32, GPR32, GPR32, 0>;
-  def : AddSubRegAlias<mnemonic, !cast<Instruction>(NAME#"Xrs"),
-                       GPR64, GPR64, GPR64, 0>;
-
-  // Register/register aliases with no shift when either the destination or
-  // first source register is SP.  This relies on the shifted register aliases
-  // above matching first in the case when SP is not used.
-  def : AddSubRegAlias<mnemonic, !cast<Instruction>(NAME#"Wrx"),
-                       GPR32sp, GPR32sp, GPR32, 16>; // UXTW #0
-  def : AddSubRegAlias<mnemonic,
-                       !cast<Instruction>(NAME#"Xrx64"),
-                       GPR64sp, GPR64sp, GPR64, 24>; // UXTX #0
-}
-
-multiclass AddSubS<bit isSub, string mnemonic, SDNode OpNode> {
-  let isCompare = 1, Defs = [CPSR] in {
-  // Add/Subtract immediate
-  def Wri  : BaseAddSubImm<isSub, 1, GPR32, GPR32sp, addsub_shifted_imm32,
-                           mnemonic, OpNode> {
-    let Inst{31} = 0;
-  }
-  def Xri  : BaseAddSubImm<isSub, 1, GPR64, GPR64sp, addsub_shifted_imm64,
-                           mnemonic, OpNode> {
-    let Inst{31} = 1;
-  }
-
-  // Add/Subtract register
-  def Wrr : BaseAddSubRegPseudo<GPR32, OpNode>;
-  def Xrr : BaseAddSubRegPseudo<GPR64, OpNode>;
-
-  // Add/Subtract shifted register
-  def Wrs : BaseAddSubSReg<isSub, 1, GPR32, arith_shifted_reg32, mnemonic,
-                           OpNode> {
-    let Inst{31} = 0;
-  }
-  def Xrs : BaseAddSubSReg<isSub, 1, GPR64, arith_shifted_reg64, mnemonic,
-                           OpNode> {
-    let Inst{31} = 1;
-  }
-
-  // Add/Subtract extended register
-  let AddedComplexity = 1 in {
-  def Wrx : BaseAddSubEReg<isSub, 1, GPR32, GPR32sp,
-                           arith_extended_reg32<i32>, mnemonic, OpNode> {
-    let Inst{31} = 0;
-  }
-  def Xrx : BaseAddSubEReg<isSub, 1, GPR64, GPR64sp,
-                           arith_extended_reg32<i64>, mnemonic, OpNode> {
-    let Inst{31} = 1;
-  }
-  }
-
-  def Xrx64 : BaseAddSubEReg64<isSub, 1, GPR64, GPR64sp, GPR64,
-                               arith_extendlsl64, mnemonic> {
-    // UXTX and SXTX only.
-    let Inst{14-13} = 0b11;
-    let Inst{31} = 1;
-  }
-  } // Defs = [CPSR]
-
-  // Register/register aliases with no shift when SP is not used.
-  def : AddSubRegAlias<mnemonic, !cast<Instruction>(NAME#"Wrs"),
-                       GPR32, GPR32, GPR32, 0>;
-  def : AddSubRegAlias<mnemonic, !cast<Instruction>(NAME#"Xrs"),
-                       GPR64, GPR64, GPR64, 0>;
-
-  // Register/register aliases with no shift when the first source register
-  // is SP.  This relies on the shifted register aliases above matching first
-  // in the case when SP is not used.
-  def : AddSubRegAlias<mnemonic, !cast<Instruction>(NAME#"Wrx"),
-                       GPR32, GPR32sp, GPR32, 16>; // UXTW #0
-  def : AddSubRegAlias<mnemonic,
-                       !cast<Instruction>(NAME#"Xrx64"),
-                       GPR64, GPR64sp, GPR64, 24>; // UXTX #0
-}
-
-//---
-// Extract
-//---
-def SDTA64EXTR : SDTypeProfile<1, 3, [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>,
-                                      SDTCisPtrTy<3>]>;
-def ARM64Extr : SDNode<"ARM64ISD::EXTR", SDTA64EXTR>;
-
-class BaseExtractImm<RegisterClass regtype, Operand imm_type, string asm,
-                     list<dag> patterns>
-    : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, imm_type:$imm),
-         asm, "\t$Rd, $Rn, $Rm, $imm", "", patterns>,
-      Sched<[WriteExtr, ReadExtrHi]> {
-  bits<5> Rd;
-  bits<5> Rn;
-  bits<5> Rm;
-  bits<6> imm;
-
-  let Inst{30-23} = 0b00100111;
-  let Inst{21}    = 0;
-  let Inst{20-16} = Rm;
-  let Inst{15-10} = imm;
-  let Inst{9-5}   = Rn;
-  let Inst{4-0}   = Rd;
-}
-
-multiclass ExtractImm<string asm> {
-  def Wrri : BaseExtractImm<GPR32, imm0_31, asm,
-                      [(set GPR32:$Rd,
-                        (ARM64Extr GPR32:$Rn, GPR32:$Rm, imm0_31:$imm))]> {
-    let Inst{31} = 0;
-    let Inst{22} = 0;
-  }
-  def Xrri : BaseExtractImm<GPR64, imm0_63, asm,
-                      [(set GPR64:$Rd,
-                        (ARM64Extr GPR64:$Rn, GPR64:$Rm, imm0_63:$imm))]> {
-
-    let Inst{31} = 1;
-    let Inst{22} = 1;
-  }
-}
-
-//---
-// Bitfield
-//---
-
-let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
-class BaseBitfieldImm<bits<2> opc,
-                      RegisterClass regtype, Operand imm_type, string asm>
-    : I<(outs regtype:$Rd), (ins regtype:$Rn, imm_type:$immr, imm_type:$imms),
-         asm, "\t$Rd, $Rn, $immr, $imms", "", []>,
-      Sched<[WriteIS]> {
-  bits<5> Rd;
-  bits<5> Rn;
-  bits<6> immr;
-  bits<6> imms;
-
-  let Inst{30-29} = opc;
-  let Inst{28-23} = 0b100110;
-  let Inst{21-16} = immr;
-  let Inst{15-10} = imms;
-  let Inst{9-5}   = Rn;
-  let Inst{4-0}   = Rd;
-}
-
-multiclass BitfieldImm<bits<2> opc, string asm> {
-  def Wri : BaseBitfieldImm<opc, GPR32, imm0_31, asm> {
-    let Inst{31} = 0;
-    let Inst{22} = 0;
-  }
-  def Xri : BaseBitfieldImm<opc, GPR64, imm0_63, asm> {
-    let Inst{31} = 1;
-    let Inst{22} = 1;
-  }
-}
-
-let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
-class BaseBitfieldImmWith2RegArgs<bits<2> opc,
-                      RegisterClass regtype, Operand imm_type, string asm>
-    : I<(outs regtype:$Rd), (ins regtype:$src, regtype:$Rn, imm_type:$immr,
-                             imm_type:$imms),
-         asm, "\t$Rd, $Rn, $immr, $imms", "$src = $Rd", []>,
-      Sched<[WriteIS]> {
-  bits<5> Rd;
-  bits<5> Rn;
-  bits<6> immr;
-  bits<6> imms;
-
-  let Inst{30-29} = opc;
-  let Inst{28-23} = 0b100110;
-  let Inst{21-16} = immr;
-  let Inst{15-10} = imms;
-  let Inst{9-5}   = Rn;
-  let Inst{4-0}   = Rd;
-}
-
-multiclass BitfieldImmWith2RegArgs<bits<2> opc, string asm> {
-  def Wri : BaseBitfieldImmWith2RegArgs<opc, GPR32, imm0_31, asm> {
-    let Inst{31} = 0;
-    let Inst{22} = 0;
-  }
-  def Xri : BaseBitfieldImmWith2RegArgs<opc, GPR64, imm0_63, asm> {
-    let Inst{31} = 1;
-    let Inst{22} = 1;
-  }
-}
-
-//---
-// Logical
-//---
-
-// Logical (immediate)
-class BaseLogicalImm<bits<2> opc, RegisterClass dregtype,
-                     RegisterClass sregtype, Operand imm_type, string asm,
-                     list<dag> pattern>
-    : I<(outs dregtype:$Rd), (ins sregtype:$Rn, imm_type:$imm),
-         asm, "\t$Rd, $Rn, $imm", "", pattern>,
-      Sched<[WriteI]> {
-  bits<5>  Rd;
-  bits<5>  Rn;
-  bits<13> imm;
-  let Inst{30-29} = opc;
-  let Inst{28-23} = 0b100100;
-  let Inst{22}    = imm{12};
-  let Inst{21-16} = imm{11-6};
-  let Inst{15-10} = imm{5-0};
-  let Inst{9-5}   = Rn;
-  let Inst{4-0}   = Rd;
-
-  let DecoderMethod = "DecodeLogicalImmInstruction";
-}
-
-// Logical (shifted register)
-class BaseLogicalSReg<bits<2> opc, bit N, RegisterClass regtype,
-                      logical_shifted_reg shifted_regtype, string asm,
-                      list<dag> pattern>
-    : I<(outs regtype:$Rd), (ins regtype:$Rn, shifted_regtype:$Rm),
-        asm, "\t$Rd, $Rn, $Rm", "", pattern>,
-      Sched<[WriteISReg]> {
-  // The operands are in order to match the 'addr' MI operands, so we
-  // don't need an encoder method and by-name matching. Just use the default
-  // in-order handling. Since we're using by-order, make sure the names
-  // do not match.
-  bits<5> dst;
-  bits<5> src1;
-  bits<5> src2;
-  bits<8> shift;
-  let Inst{30-29} = opc;
-  let Inst{28-24} = 0b01010;
-  let Inst{23-22} = shift{7-6};
-  let Inst{21}    = N;
-  let Inst{20-16} = src2;
-  let Inst{15-10} = shift{5-0};
-  let Inst{9-5}   = src1;
-  let Inst{4-0}   = dst;
-
-  let DecoderMethod = "DecodeThreeAddrSRegInstruction";
-}
-
-// Aliases for register+register logical instructions.
-class LogicalRegAlias<string asm, Instruction inst, RegisterClass regtype>
-    : InstAlias<asm#" $dst, $src1, $src2",
-                (inst regtype:$dst, regtype:$src1, regtype:$src2, 0)>;
-
-let AddedComplexity = 6 in
-multiclass LogicalImm<bits<2> opc, string mnemonic, SDNode OpNode> {
-  def Wri : BaseLogicalImm<opc, GPR32sp, GPR32, logical_imm32, mnemonic,
-                           [(set GPR32sp:$Rd, (OpNode GPR32:$Rn,
-                                               logical_imm32:$imm))]> {
-    let Inst{31} = 0;
-    let Inst{22} = 0; // 64-bit version has an additional bit of immediate.
-  }
-  def Xri : BaseLogicalImm<opc, GPR64sp, GPR64, logical_imm64, mnemonic,
-                           [(set GPR64sp:$Rd, (OpNode GPR64:$Rn,
-                                               logical_imm64:$imm))]> {
-    let Inst{31} = 1;
-  }
-}
-
-multiclass LogicalImmS<bits<2> opc, string mnemonic, SDNode OpNode> {
-  let isCompare = 1, Defs = [CPSR] in {
-  def Wri  : BaseLogicalImm<opc, GPR32, GPR32, logical_imm32, mnemonic,
-      [(set GPR32:$Rd, (OpNode GPR32:$Rn, logical_imm32:$imm))]> {
-    let Inst{31} = 0;
-    let Inst{22} = 0; // 64-bit version has an additional bit of immediate.
-  }
-  def Xri  : BaseLogicalImm<opc, GPR64, GPR64, logical_imm64, mnemonic,
-      [(set GPR64:$Rd, (OpNode GPR64:$Rn, logical_imm64:$imm))]> {
-    let Inst{31} = 1;
-  }
-  } // end Defs = [CPSR]
-}
-
-class BaseLogicalRegPseudo<RegisterClass regtype, SDPatternOperator OpNode>
-    : Pseudo<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm),
-             [(set regtype:$Rd, (OpNode regtype:$Rn, regtype:$Rm))]>,
-      Sched<[WriteI]>;
-
-// Split from LogicalImm as not all instructions have both.
-multiclass LogicalReg<bits<2> opc, bit N, string mnemonic,
-                      SDPatternOperator OpNode> {
-  def Wrr : BaseLogicalRegPseudo<GPR32, OpNode>;
-  def Xrr : BaseLogicalRegPseudo<GPR64, OpNode>;
-
-  def Wrs : BaseLogicalSReg<opc, N, GPR32, logical_shifted_reg32, mnemonic,
-                            [(set GPR32:$Rd, (OpNode GPR32:$Rn,
-                                                 logical_shifted_reg32:$Rm))]> {
-    let Inst{31} = 0;
-  }
-  def Xrs : BaseLogicalSReg<opc, N, GPR64, logical_shifted_reg64, mnemonic,
-                            [(set GPR64:$Rd, (OpNode GPR64:$Rn,
-                                                 logical_shifted_reg64:$Rm))]> {
-    let Inst{31} = 1;
-  }
-
-  def : LogicalRegAlias<mnemonic,
-                        !cast<Instruction>(NAME#"Wrs"), GPR32>;
-  def : LogicalRegAlias<mnemonic,
-                        !cast<Instruction>(NAME#"Xrs"), GPR64>;
-}
-
-// Split from LogicalReg to allow setting CPSR Defs
-multiclass LogicalRegS<bits<2> opc, bit N, string mnemonic> {
-  let Defs = [CPSR], mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
-  def Wrs : BaseLogicalSReg<opc, N, GPR32, logical_shifted_reg32, mnemonic, []>{
-    let Inst{31} = 0;
-  }
-  def Xrs : BaseLogicalSReg<opc, N, GPR64, logical_shifted_reg64, mnemonic, []>{
-    let Inst{31} = 1;
-  }
-  } // Defs = [CPSR]
-
-  def : LogicalRegAlias<mnemonic,
-                        !cast<Instruction>(NAME#"Wrs"), GPR32>;
-  def : LogicalRegAlias<mnemonic,
-                        !cast<Instruction>(NAME#"Xrs"), GPR64>;
-}
-
-//---
-// Conditionally set flags
-//---
-
-// Condition code.
-// 4-bit immediate. Pretty-printed as <cc>
-def ccode : Operand<i32> {
-  let PrintMethod = "printCondCode";
-}
-
-let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
-class BaseCondSetFlagsImm<bit op, RegisterClass regtype, string asm>
-    : I<(outs), (ins regtype:$Rn, imm0_31:$imm, imm0_15:$nzcv, ccode:$cond),
-         asm, "\t$Rn, $imm, $nzcv, $cond", "", []>,
-      Sched<[WriteI]> {
-  let Uses = [CPSR];
-  let Defs = [CPSR];
-
-  bits<5> Rn;
-  bits<5> imm;
-  bits<4> nzcv;
-  bits<4> cond;
-
-  let Inst{30}    = op;
-  let Inst{29-21} = 0b111010010;
-  let Inst{20-16} = imm;
-  let Inst{15-12} = cond;
-  let Inst{11-10} = 0b10;
-  let Inst{9-5}   = Rn;
-  let Inst{4}     = 0b0;
-  let Inst{3-0}   = nzcv;
-}
-
-multiclass CondSetFlagsImm<bit op, string asm> {
-  def Wi : BaseCondSetFlagsImm<op, GPR32, asm> {
-    let Inst{31} = 0;
-  }
-  def Xi : BaseCondSetFlagsImm<op, GPR64, asm> {
-    let Inst{31} = 1;
-  }
-}
-
-let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
-class BaseCondSetFlagsReg<bit op, RegisterClass regtype, string asm>
-    : I<(outs), (ins regtype:$Rn, regtype:$Rm, imm0_15:$nzcv, ccode:$cond),
-         asm, "\t$Rn, $Rm, $nzcv, $cond", "", []>,
-      Sched<[WriteI]> {
-  let Uses = [CPSR];
-  let Defs = [CPSR];
-
-  bits<5> Rn;
-  bits<5> Rm;
-  bits<4> nzcv;
-  bits<4> cond;
-
-  let Inst{30}    = op;
-  let Inst{29-21} = 0b111010010;
-  let Inst{20-16} = Rm;
-  let Inst{15-12} = cond;
-  let Inst{11-10} = 0b00;
-  let Inst{9-5}   = Rn;
-  let Inst{4}     = 0b0;
-  let Inst{3-0}   = nzcv;
-}
-
-multiclass CondSetFlagsReg<bit op, string asm> {
-  def Wr : BaseCondSetFlagsReg<op, GPR32, asm> {
-    let Inst{31} = 0;
-  }
-  def Xr : BaseCondSetFlagsReg<op, GPR64, asm> {
-    let Inst{31} = 1;
-  }
-}
-
-//---
-// Conditional select
-//---
-
-class BaseCondSelect<bit op, bits<2> op2, RegisterClass regtype, string asm>
-    : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, ccode:$cond),
-         asm, "\t$Rd, $Rn, $Rm, $cond", "",
-         [(set regtype:$Rd,
-               (ARM64csel regtype:$Rn, regtype:$Rm, (i32 imm:$cond), CPSR))]>,
-      Sched<[WriteI]> {
-  let Uses = [CPSR];
-
-  bits<5> Rd;
-  bits<5> Rn;
-  bits<5> Rm;
-  bits<4> cond;
-
-  let Inst{30}    = op;
-  let Inst{29-21} = 0b011010100;
-  let Inst{20-16} = Rm;
-  let Inst{15-12} = cond;
-  let Inst{11-10} = op2;
-  let Inst{9-5}   = Rn;
-  let Inst{4-0}   = Rd;
-}
-
-multiclass CondSelect<bit op, bits<2> op2, string asm> {
-  def Wr : BaseCondSelect<op, op2, GPR32, asm> {
-    let Inst{31} = 0;
-  }
-  def Xr : BaseCondSelect<op, op2, GPR64, asm> {
-    let Inst{31} = 1;
-  }
-}
-
-class BaseCondSelectOp<bit op, bits<2> op2, RegisterClass regtype, string asm,
-                       PatFrag frag>
-    : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, ccode:$cond),
-         asm, "\t$Rd, $Rn, $Rm, $cond", "",
-         [(set regtype:$Rd,
-               (ARM64csel regtype:$Rn, (frag regtype:$Rm),
-               (i32 imm:$cond), CPSR))]>,
-      Sched<[WriteI]> {
-  let Uses = [CPSR];
-
-  bits<5> Rd;
-  bits<5> Rn;
-  bits<5> Rm;
-  bits<4> cond;
-
-  let Inst{30}    = op;
-  let Inst{29-21} = 0b011010100;
-  let Inst{20-16} = Rm;
-  let Inst{15-12} = cond;
-  let Inst{11-10} = op2;
-  let Inst{9-5}   = Rn;
-  let Inst{4-0}   = Rd;
-}
-
-multiclass CondSelectOp<bit op, bits<2> op2, string asm, PatFrag frag> {
-  def Wr : BaseCondSelectOp<op, op2, GPR32, asm, frag> {
-    let Inst{31} = 0;
-  }
-  def Xr : BaseCondSelectOp<op, op2, GPR64, asm, frag> {
-    let Inst{31} = 1;
-  }
-}
-
-//---
-// Special Mask Value
-//---
-def maski8_or_more : Operand<i32>,
-  ImmLeaf<i32, [{ return (Imm & 0xff) == 0xff; }]> {
-}
-def maski16_or_more : Operand<i32>,
-  ImmLeaf<i32, [{ return (Imm & 0xffff) == 0xffff; }]> {
-}
-
-
-//---
-// Load/store
-//---
-
-// (unsigned immediate)
-// Indexed for 8-bit registers. offset is in range [0,4095].
-def MemoryIndexed8Operand : AsmOperandClass {
-  let Name = "MemoryIndexed8";
-  let DiagnosticType = "InvalidMemoryIndexed8";
-}
-def am_indexed8 : Operand<i64>,
-                  ComplexPattern<i64, 2, "SelectAddrModeIndexed8", []> {
-  let PrintMethod = "printAMIndexed8";
-  let EncoderMethod
-      = "getAMIndexed8OpValue<ARM64::fixup_arm64_ldst_imm12_scale1>";
-  let ParserMatchClass = MemoryIndexed8Operand;
-  let MIOperandInfo = (ops GPR64sp:$base, i64imm:$offset);
-}
-
-// Indexed for 16-bit registers. offset is multiple of 2 in range [0,8190],
-// stored as immval/2 (the 12-bit literal that encodes directly into the insn).
-def MemoryIndexed16Operand : AsmOperandClass {
-  let Name = "MemoryIndexed16";
-  let DiagnosticType = "InvalidMemoryIndexed16";
-}
-def am_indexed16 : Operand<i64>,
-                   ComplexPattern<i64, 2, "SelectAddrModeIndexed16", []> {
-  let PrintMethod = "printAMIndexed16";
-  let EncoderMethod
-      = "getAMIndexed8OpValue<ARM64::fixup_arm64_ldst_imm12_scale2>";
-  let ParserMatchClass = MemoryIndexed16Operand;
-  let MIOperandInfo = (ops GPR64sp:$base, i64imm:$offset);
-}
-
-// Indexed for 32-bit registers. offset is multiple of 4 in range [0,16380],
-// stored as immval/4 (the 12-bit literal that encodes directly into the insn).
-def MemoryIndexed32Operand : AsmOperandClass {
-  let Name = "MemoryIndexed32";
-  let DiagnosticType = "InvalidMemoryIndexed32";
-}
-def am_indexed32 : Operand<i64>,
-                   ComplexPattern<i64, 2, "SelectAddrModeIndexed32", []> {
-  let PrintMethod = "printAMIndexed32";
-  let EncoderMethod
-      = "getAMIndexed8OpValue<ARM64::fixup_arm64_ldst_imm12_scale4>";
-  let ParserMatchClass = MemoryIndexed32Operand;
-  let MIOperandInfo = (ops GPR64sp:$base, i64imm:$offset);
-}
-
-// Indexed for 64-bit registers. offset is multiple of 8 in range [0,32760],
-// stored as immval/8 (the 12-bit literal that encodes directly into the insn).
-def MemoryIndexed64Operand : AsmOperandClass {
-  let Name = "MemoryIndexed64";
-  let DiagnosticType = "InvalidMemoryIndexed64";
-}
-def am_indexed64 : Operand<i64>,
-                   ComplexPattern<i64, 2, "SelectAddrModeIndexed64", []> {
-  let PrintMethod = "printAMIndexed64";
-  let EncoderMethod
-      = "getAMIndexed8OpValue<ARM64::fixup_arm64_ldst_imm12_scale8>";
-  let ParserMatchClass = MemoryIndexed64Operand;
-  let MIOperandInfo = (ops GPR64sp:$base, i64imm:$offset);
-}
-
-// Indexed for 128-bit registers. offset is multiple of 16 in range [0,65520],
-// stored as immval/16 (the 12-bit literal that encodes directly into the insn).
-def MemoryIndexed128Operand : AsmOperandClass {
-  let Name = "MemoryIndexed128";
-  let DiagnosticType = "InvalidMemoryIndexed128";
-}
-def am_indexed128 : Operand<i64>,
-                   ComplexPattern<i64, 2, "SelectAddrModeIndexed128", []> {
-  let PrintMethod = "printAMIndexed128";
-  let EncoderMethod
-      = "getAMIndexed8OpValue<ARM64::fixup_arm64_ldst_imm12_scale16>";
-  let ParserMatchClass = MemoryIndexed128Operand;
-  let MIOperandInfo = (ops GPR64sp:$base, i64imm:$offset);
-}
-
-// No offset.
-def MemoryNoIndexOperand : AsmOperandClass { let Name = "MemoryNoIndex"; }
-def am_noindex : Operand<i64>,
-                 ComplexPattern<i64, 1, "SelectAddrModeNoIndex", []> {
-  let PrintMethod = "printAMNoIndex";
-  let ParserMatchClass = MemoryNoIndexOperand;
-  let MIOperandInfo = (ops GPR64sp:$base);
-}
-
-class BaseLoadStoreUI<bits<2> sz, bit V, bits<2> opc, dag oops, dag iops,
-                      string asm, list<dag> pattern>
-    : I<oops, iops, asm, "\t$Rt, $addr", "", pattern> {
-  bits<5> dst;
-
-  bits<17> addr;
-  bits<5> base = addr{4-0};
-  bits<12> offset = addr{16-5};
-
-  let Inst{31-30} = sz;
-  let Inst{29-27} = 0b111;
-  let Inst{26}    = V;
-  let Inst{25-24} = 0b01;
-  let Inst{23-22} = opc;
-  let Inst{21-10} = offset;
-  let Inst{9-5}   = base;
-  let Inst{4-0}   = dst;
-
-  let DecoderMethod = "DecodeUnsignedLdStInstruction";
-}
-
-let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
-class LoadUI<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
-             Operand indextype, string asm, list<dag> pattern>
-    : BaseLoadStoreUI<sz, V, opc,
-                      (outs regtype:$Rt), (ins indextype:$addr), asm, pattern>,
-      Sched<[WriteLD]>;
-
-let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in
-class StoreUI<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
-             Operand indextype, string asm, list<dag> pattern>
-    : BaseLoadStoreUI<sz, V, opc,
-                      (outs), (ins regtype:$Rt, indextype:$addr), asm, pattern>,
-      Sched<[WriteST]>;
-
-def PrefetchOperand : AsmOperandClass {
-  let Name = "Prefetch";
-  let ParserMethod = "tryParsePrefetch";
-}
-def prfop : Operand<i32> {
-  let PrintMethod = "printPrefetchOp";
-  let ParserMatchClass = PrefetchOperand;
-}
-
-let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in
-class PrefetchUI<bits<2> sz, bit V, bits<2> opc, string asm, list<dag> pat>
-    : BaseLoadStoreUI<sz, V, opc,
-                      (outs), (ins prfop:$Rt, am_indexed64:$addr), asm, pat>,
-      Sched<[WriteLD]>;
-
-//---
-// Load literal
-//---
-
-let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
-class LoadLiteral<bits<2> opc, bit V, RegisterClass regtype, string asm>
-    : I<(outs regtype:$Rt), (ins am_brcond:$label),
-        asm, "\t$Rt, $label", "", []>,
-      Sched<[WriteLD]> {
-  bits<5> Rt;
-  bits<19> label;
-  let Inst{31-30} = opc;
-  let Inst{29-27} = 0b011;
-  let Inst{26}    = V;
-  let Inst{25-24} = 0b00;
-  let Inst{23-5}  = label;
-  let Inst{4-0}   = Rt;
-}
-
-let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in
-class PrefetchLiteral<bits<2> opc, bit V, string asm, list<dag> pat>
-    : I<(outs), (ins prfop:$Rt, am_brcond:$label),
-        asm, "\t$Rt, $label", "", pat>,
-      Sched<[WriteLD]> {
-  bits<5> Rt;
-  bits<19> label;
-  let Inst{31-30} = opc;
-  let Inst{29-27} = 0b011;
-  let Inst{26}    = V;
-  let Inst{25-24} = 0b00;
-  let Inst{23-5}  = label;
-  let Inst{4-0}   = Rt;
-}
-
-//---
-// Load/store register offset
-//---
-
-class MemROAsmOperand<int sz> : AsmOperandClass {
-  let Name = "MemoryRegisterOffset"#sz;
-}
-
-def MemROAsmOperand8 : MemROAsmOperand<8>;
-def MemROAsmOperand16 : MemROAsmOperand<16>;
-def MemROAsmOperand32 : MemROAsmOperand<32>;
-def MemROAsmOperand64 : MemROAsmOperand<64>;
-def MemROAsmOperand128 : MemROAsmOperand<128>;
-
-class ro_indexed<int sz> : Operand<i64> { // ComplexPattern<...>
-  let PrintMethod = "printMemoryRegOffset"#sz;
-  let MIOperandInfo = (ops GPR64sp:$base, GPR64:$offset, i32imm:$extend);
-}
-
-def ro_indexed8 : ro_indexed<8>, ComplexPattern<i64, 3, "SelectAddrModeRO8", []> {
-  let ParserMatchClass = MemROAsmOperand8;
-}
-
-def ro_indexed16 : ro_indexed<16>, ComplexPattern<i64, 3, "SelectAddrModeRO16", []> {
-  let ParserMatchClass = MemROAsmOperand16;
-}
-
-def ro_indexed32 : ro_indexed<32>, ComplexPattern<i64, 3, "SelectAddrModeRO32", []> {
-  let ParserMatchClass = MemROAsmOperand32;
-}
-
-def ro_indexed64 : ro_indexed<64>, ComplexPattern<i64, 3, "SelectAddrModeRO64", []> {
-  let ParserMatchClass = MemROAsmOperand64;
-}
-
-def ro_indexed128 : ro_indexed<128>, ComplexPattern<i64, 3, "SelectAddrModeRO128", []> {
-  let ParserMatchClass = MemROAsmOperand128;
-}
-
-class LoadStore8RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
-                      string asm, dag ins, dag outs, list<dag> pat>
-    : I<ins, outs, asm, "\t$Rt, $addr", "", pat> {
-  // The operands are in order to match the 'addr' MI operands, so we
-  // don't need an encoder method and by-name matching. Just use the default
-  // in-order handling. Since we're using by-order, make sure the names
-  // do not match.
-  bits<5> dst;
-  bits<5> base;
-  bits<5> offset;
-  bits<4> extend;
-  let Inst{31-30} = sz;
-  let Inst{29-27} = 0b111;
-  let Inst{26}    = V;
-  let Inst{25-24} = 0b00;
-  let Inst{23-22} = opc;
-  let Inst{21}    = 1;
-  let Inst{20-16} = offset;
-  let Inst{15-13} = extend{3-1};
-
-  let Inst{12}    = extend{0};
-  let Inst{11-10} = 0b10;
-  let Inst{9-5}   = base;
-  let Inst{4-0}   = dst;
-
-  let DecoderMethod = "DecodeRegOffsetLdStInstruction";
-}
-
-class Load8RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
-             string asm, list<dag> pat>
-  : LoadStore8RO<sz, V, opc, regtype, asm,
-                 (outs regtype:$Rt), (ins ro_indexed8:$addr), pat>,
-    Sched<[WriteLDIdx, ReadAdrBase]>;
-
-class Store8RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
-             string asm, list<dag> pat>
-  : LoadStore8RO<sz, V, opc, regtype, asm,
-                 (outs), (ins regtype:$Rt, ro_indexed8:$addr), pat>,
-    Sched<[WriteSTIdx, ReadAdrBase]>;
-
-class LoadStore16RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
-                      string asm, dag ins, dag outs, list<dag> pat>
-    : I<ins, outs, asm, "\t$Rt, $addr", "", pat> {
-  // The operands are in order to match the 'addr' MI operands, so we
-  // don't need an encoder method and by-name matching. Just use the default
-  // in-order handling. Since we're using by-order, make sure the names
-  // do not match.
-  bits<5> dst;
-  bits<5> base;
-  bits<5> offset;
-  bits<4> extend;
-  let Inst{31-30} = sz;
-  let Inst{29-27} = 0b111;
-  let Inst{26}    = V;
-  let Inst{25-24} = 0b00;
-  let Inst{23-22} = opc;
-  let Inst{21}    = 1;
-  let Inst{20-16} = offset;
-  let Inst{15-13} = extend{3-1};
-
-  let Inst{12}    = extend{0};
-  let Inst{11-10} = 0b10;
-  let Inst{9-5}   = base;
-  let Inst{4-0}   = dst;
-
-  let DecoderMethod = "DecodeRegOffsetLdStInstruction";
-}
-
-class Load16RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
-             string asm, list<dag> pat>
-  : LoadStore16RO<sz, V, opc, regtype, asm,
-                 (outs regtype:$Rt), (ins ro_indexed16:$addr), pat>,
-    Sched<[WriteLDIdx, ReadAdrBase]>;
-
-class Store16RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
-             string asm, list<dag> pat>
-  : LoadStore16RO<sz, V, opc, regtype, asm,
-                 (outs), (ins regtype:$Rt, ro_indexed16:$addr), pat>,
-    Sched<[WriteSTIdx, ReadAdrBase]>;
-
-class LoadStore32RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
-                      string asm, dag ins, dag outs, list<dag> pat>
-    : I<ins, outs, asm, "\t$Rt, $addr", "", pat> {
-  // The operands are in order to match the 'addr' MI operands, so we
-  // don't need an encoder method and by-name matching. Just use the default
-  // in-order handling. Since we're using by-order, make sure the names
-  // do not match.
-  bits<5> dst;
-  bits<5> base;
-  bits<5> offset;
-  bits<4> extend;
-  let Inst{31-30} = sz;
-  let Inst{29-27} = 0b111;
-  let Inst{26}    = V;
-  let Inst{25-24} = 0b00;
-  let Inst{23-22} = opc;
-  let Inst{21}    = 1;
-  let Inst{20-16} = offset;
-  let Inst{15-13} = extend{3-1};
-
-  let Inst{12}    = extend{0};
-  let Inst{11-10} = 0b10;
-  let Inst{9-5}   = base;
-  let Inst{4-0}   = dst;
-
-  let DecoderMethod = "DecodeRegOffsetLdStInstruction";
-}
-
-class Load32RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
-             string asm, list<dag> pat>
-  : LoadStore32RO<sz, V, opc, regtype, asm,
-                 (outs regtype:$Rt), (ins ro_indexed32:$addr), pat>,
-    Sched<[WriteLDIdx, ReadAdrBase]>;
-
-class Store32RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
-             string asm, list<dag> pat>
-  : LoadStore32RO<sz, V, opc, regtype, asm,
-                 (outs), (ins regtype:$Rt, ro_indexed32:$addr), pat>,
-    Sched<[WriteSTIdx, ReadAdrBase]>;
-
-class LoadStore64RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
-                      string asm, dag ins, dag outs, list<dag> pat>
-    : I<ins, outs, asm, "\t$Rt, $addr", "", pat> {
-  // The operands are in order to match the 'addr' MI operands, so we
-  // don't need an encoder method and by-name matching. Just use the default
-  // in-order handling. Since we're using by-order, make sure the names
-  // do not match.
-  bits<5> dst;
-  bits<5> base;
-  bits<5> offset;
-  bits<4> extend;
-  let Inst{31-30} = sz;
-  let Inst{29-27} = 0b111;
-  let Inst{26}    = V;
-  let Inst{25-24} = 0b00;
-  let Inst{23-22} = opc;
-  let Inst{21}    = 1;
-  let Inst{20-16} = offset;
-  let Inst{15-13} = extend{3-1};
-
-  let Inst{12}    = extend{0};
-  let Inst{11-10} = 0b10;
-  let Inst{9-5}   = base;
-  let Inst{4-0}   = dst;
-
-  let DecoderMethod = "DecodeRegOffsetLdStInstruction";
-}
-
-let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
-class Load64RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
-             string asm, list<dag> pat>
-  : LoadStore64RO<sz, V, opc, regtype, asm,
-                 (outs regtype:$Rt), (ins ro_indexed64:$addr), pat>,
-    Sched<[WriteLDIdx, ReadAdrBase]>;
-
-let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in
-class Store64RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
-             string asm, list<dag> pat>
-  : LoadStore64RO<sz, V, opc, regtype, asm,
-                 (outs), (ins regtype:$Rt, ro_indexed64:$addr), pat>,
-    Sched<[WriteSTIdx, ReadAdrBase]>;
-
-
-class LoadStore128RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
-                      string asm, dag ins, dag outs, list<dag> pat>
-    : I<ins, outs, asm, "\t$Rt, $addr", "", pat> {
-  // The operands are in order to match the 'addr' MI operands, so we
-  // don't need an encoder method and by-name matching. Just use the default
-  // in-order handling. Since we're using by-order, make sure the names
-  // do not match.
-  bits<5> dst;
-  bits<5> base;
-  bits<5> offset;
-  bits<4> extend;
-  let Inst{31-30} = sz;
-  let Inst{29-27} = 0b111;
-  let Inst{26}    = V;
-  let Inst{25-24} = 0b00;
-  let Inst{23-22} = opc;
-  let Inst{21}    = 1;
-  let Inst{20-16} = offset;
-  let Inst{15-13} = extend{3-1};
-
-  let Inst{12}    = extend{0};
-  let Inst{11-10} = 0b10;
-  let Inst{9-5}   = base;
-  let Inst{4-0}   = dst;
-
-  let DecoderMethod = "DecodeRegOffsetLdStInstruction";
-}
-
-let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
-class Load128RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
-             string asm, list<dag> pat>
-  : LoadStore128RO<sz, V, opc, regtype, asm,
-                 (outs regtype:$Rt), (ins ro_indexed128:$addr), pat>,
-    Sched<[WriteLDIdx, ReadAdrBase]>;
-
-let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in
-class Store128RO<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
-             string asm, list<dag> pat>
-  : LoadStore128RO<sz, V, opc, regtype, asm,
-                 (outs), (ins regtype:$Rt, ro_indexed128:$addr), pat>,
-    Sched<[WriteSTIdx, ReadAdrBase]>;
-
-let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in
-class PrefetchRO<bits<2> sz, bit V, bits<2> opc, string asm, list<dag> pat>
-    : I<(outs), (ins prfop:$Rt, ro_indexed64:$addr), asm,
-         "\t$Rt, $addr", "", pat>,
-      Sched<[WriteLD]> {
-  // The operands are in order to match the 'addr' MI operands, so we
-  // don't need an encoder method and by-name matching. Just use the default
-  // in-order handling. Since we're using by-order, make sure the names
-  // do not match.
-  bits<5> dst;
-  bits<5> base;
-  bits<5> offset;
-  bits<4> extend;
-  let Inst{31-30} = sz;
-  let Inst{29-27} = 0b111;
-  let Inst{26}    = V;
-  let Inst{25-24} = 0b00;
-  let Inst{23-22} = opc;
-  let Inst{21}    = 1;
-  let Inst{20-16} = offset;
-  let Inst{15-13} = extend{3-1};
-
-  let Inst{12}    = extend{0};
-  let Inst{11-10} = 0b10;
-  let Inst{9-5}   = base;
-  let Inst{4-0}   = dst;
-
-  let DecoderMethod = "DecodeRegOffsetLdStInstruction";
-}
-
-//---
-// Load/store unscaled immediate
-//---
-
-def MemoryUnscaledOperand : AsmOperandClass {
-  let Name = "MemoryUnscaled";
-  let DiagnosticType = "InvalidMemoryIndexedSImm9";
-}
-class am_unscaled_operand : Operand<i64> {
-  let PrintMethod = "printAMUnscaled";
-  let ParserMatchClass = MemoryUnscaledOperand;
-  let MIOperandInfo = (ops GPR64sp:$base, i64imm:$offset);
-}
-def am_unscaled   : am_unscaled_operand;
-def am_unscaled8  : am_unscaled_operand,
-                    ComplexPattern<i64, 2, "SelectAddrModeUnscaled8", []>;
-def am_unscaled16 : am_unscaled_operand,
-                    ComplexPattern<i64, 2, "SelectAddrModeUnscaled16", []>;
-def am_unscaled32 : am_unscaled_operand,
-                    ComplexPattern<i64, 2, "SelectAddrModeUnscaled32", []>;
-def am_unscaled64 : am_unscaled_operand,
-                    ComplexPattern<i64, 2, "SelectAddrModeUnscaled64", []>;
-def am_unscaled128 : am_unscaled_operand,
-                    ComplexPattern<i64, 2, "SelectAddrModeUnscaled128", []>;
-
-class BaseLoadStoreUnscale<bits<2> sz, bit V, bits<2> opc, dag oops, dag iops,
-                           string asm, list<dag> pattern>
-    : I<oops, iops, asm, "\t$Rt, $addr", "", pattern> {
-  // The operands are in order to match the 'addr' MI operands, so we
-  // don't need an encoder method and by-name matching. Just use the default
-  // in-order handling. Since we're using by-order, make sure the names
-  // do not match.
-  bits<5> dst;
-  bits<5> base;
-  bits<9> offset;
-  let Inst{31-30} = sz;
-  let Inst{29-27} = 0b111;
-  let Inst{26}    = V;
-  let Inst{25-24} = 0b00;
-  let Inst{23-22} = opc;
-  let Inst{21}    = 0;
-  let Inst{20-12} = offset;
-  let Inst{11-10} = 0b00;
-  let Inst{9-5}   = base;
-  let Inst{4-0}   = dst;
-
-  let DecoderMethod = "DecodeSignedLdStInstruction";
-}
-
-let AddedComplexity = 1 in // try this before LoadUI
-class LoadUnscaled<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
-                   Operand amtype, string asm, list<dag> pattern>
-    : BaseLoadStoreUnscale<sz, V, opc, (outs regtype:$Rt),
-                           (ins amtype:$addr), asm, pattern>,
-      Sched<[WriteLD]>;
-
-let AddedComplexity = 1 in // try this before StoreUI
-class StoreUnscaled<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
-                    Operand amtype, string asm, list<dag> pattern>
-    : BaseLoadStoreUnscale<sz, V, opc, (outs),
-                           (ins regtype:$Rt, amtype:$addr), asm, pattern>,
-      Sched<[WriteST]>;
-
-let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in
-class PrefetchUnscaled<bits<2> sz, bit V, bits<2> opc, string asm, list<dag> pat>
-    : BaseLoadStoreUnscale<sz, V, opc, (outs),
-                           (ins prfop:$Rt, am_unscaled:$addr), asm, pat>,
-      Sched<[WriteLD]>;
-
-//---
-// Load/store unscaled immediate, unprivileged
-//---
-
-class BaseLoadStoreUnprivileged<bits<2> sz, bit V, bits<2> opc,
-                                dag oops, dag iops, string asm>
-    : I<oops, iops, asm, "\t$Rt, $addr", "", []> {
-  // The operands are in order to match the 'addr' MI operands, so we
-  // don't need an encoder method and by-name matching. Just use the default
-  // in-order handling. Since we're using by-order, make sure the names
-  // do not match.
-  bits<5> dst;
-  bits<5> base;
-  bits<9> offset;
-  let Inst{31-30} = sz;
-  let Inst{29-27} = 0b111;
-  let Inst{26}    = V;
-  let Inst{25-24} = 0b00;
-  let Inst{23-22} = opc;
-  let Inst{21}    = 0;
-  let Inst{20-12} = offset;
-  let Inst{11-10} = 0b10;
-  let Inst{9-5}   = base;
-  let Inst{4-0}   = dst;
-
-  let DecoderMethod = "DecodeSignedLdStInstruction";
-}
-
-let mayStore = 0, mayLoad = 1, hasSideEffects = 0 in {
-class LoadUnprivileged<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
-                   string asm>
-    : BaseLoadStoreUnprivileged<sz, V, opc,
-                      (outs regtype:$Rt), (ins am_unscaled:$addr), asm>,
-      Sched<[WriteLD]>;
-}
-
-let mayStore = 1, mayLoad = 0, hasSideEffects = 0 in {
-class StoreUnprivileged<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
-                    string asm>
-    : BaseLoadStoreUnprivileged<sz, V, opc,
-                      (outs), (ins regtype:$Rt, am_unscaled:$addr), asm>,
-      Sched<[WriteST]>;
-}
-
-//---
-// Load/store pre-indexed
-//---
-
-class BaseLoadStorePreIdx<bits<2> sz, bit V, bits<2> opc, dag oops, dag iops,
-                          string asm, string cstr>
-    : I<oops, iops, asm, "\t$Rt, $addr!", cstr, []> {
-  // The operands are in order to match the 'addr' MI operands, so we
-  // don't need an encoder method and by-name matching. Just use the default
-  // in-order handling.
-  bits<5> dst;
-  bits<5> base;
-  bits<9> offset;
-  let Inst{31-30} = sz;
-  let Inst{29-27} = 0b111;
-  let Inst{26}    = V;
-  let Inst{25-24} = 0;
-  let Inst{23-22} = opc;
-  let Inst{21}    = 0;
-  let Inst{20-12} = offset;
-  let Inst{11-10} = 0b11;
-  let Inst{9-5}   = base;
-  let Inst{4-0}   = dst;
-
-  let DecoderMethod = "DecodeSignedLdStInstruction";
-}
-
-let hasSideEffects = 0 in {
-let mayStore = 0, mayLoad = 1 in
-// FIXME: Modeling the write-back of these instructions for isel is tricky.
-//        we need the complex addressing mode for the memory reference, but
-//        we also need the write-back specified as a tied operand to the
-//        base register. That combination does not play nicely with
-//        the asm matcher and friends.
-class LoadPreIdx<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
-             string asm>
-    : BaseLoadStorePreIdx<sz, V, opc,
-                     (outs regtype:$Rt/*, GPR64sp:$wback*/),
-                     (ins am_unscaled:$addr), asm, ""/*"$addr.base = $wback"*/>,
-      Sched<[WriteLD, WriteAdr]>;
-
-let mayStore = 1, mayLoad = 0 in
-class StorePreIdx<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
-             string asm>
-    : BaseLoadStorePreIdx<sz, V, opc,
-                      (outs/* GPR64sp:$wback*/),
-                      (ins regtype:$Rt, am_unscaled:$addr),
-                       asm, ""/*"$addr.base = $wback"*/>,
-      Sched<[WriteAdr, WriteST]>;
-} // hasSideEffects = 0
-
-// ISel pseudo-instructions which have the tied operands. When the MC lowering
-// logic finally gets smart enough to strip off tied operands that are just
-// for isel convenience, we can get rid of these pseudos and just reference
-// the real instructions directly.
-//
-// Ironically, also because of the writeback operands, we can't put the
-// matcher pattern directly on the instruction, but need to define it
-// separately.
-//
-// Loads aren't matched with patterns here at all, but rather in C++
-// custom lowering.
-let mayStore = 0, mayLoad = 1, hasSideEffects = 0 in {
-class LoadPreIdxPseudo<RegisterClass regtype>
-    : Pseudo<(outs regtype:$Rt, GPR64sp:$wback),
-             (ins am_noindex:$addr, simm9:$offset), [],
-              "$addr.base = $wback,@earlyclobber $wback">,
-      Sched<[WriteLD, WriteAdr]>;
-class LoadPostIdxPseudo<RegisterClass regtype>
-    : Pseudo<(outs regtype:$Rt, GPR64sp:$wback),
-             (ins am_noindex:$addr, simm9:$offset), [],
-              "$addr.base = $wback,@earlyclobber $wback">,
-      Sched<[WriteLD, WriteI]>;
-}
-multiclass StorePreIdxPseudo<RegisterClass regtype, ValueType Ty,
-                             SDPatternOperator OpNode> {
-  let mayStore = 1, mayLoad = 0, hasSideEffects = 0 in
-  def _isel: Pseudo<(outs GPR64sp:$wback),
-                    (ins regtype:$Rt, am_noindex:$addr, simm9:$offset), [],
-                    "$addr.base = $wback,@earlyclobber $wback">,
-      Sched<[WriteAdr, WriteST]>;
-
-  def : Pat<(OpNode (Ty regtype:$Rt), am_noindex:$addr, simm9:$offset),
-            (!cast<Instruction>(NAME#_isel) regtype:$Rt, am_noindex:$addr,
-                                            simm9:$offset)>;
-}
-
-//---
-// Load/store post-indexed
-//---
-
-// (pre-index) load/stores.
-class BaseLoadStorePostIdx<bits<2> sz, bit V, bits<2> opc, dag oops, dag iops,
-                          string asm, string cstr>
-    : I<oops, iops, asm, "\t$Rt, $addr, $idx", cstr, []> {
-  // The operands are in order to match the 'addr' MI operands, so we
-  // don't need an encoder method and by-name matching. Just use the default
-  // in-order handling.
-  bits<5> dst;
-  bits<5> base;
-  bits<9> offset;
-  let Inst{31-30} = sz;
-  let Inst{29-27} = 0b111;
-  let Inst{26}    = V;
-  let Inst{25-24} = 0b00;
-  let Inst{23-22} = opc;
-  let Inst{21}    = 0b0;
-  let Inst{20-12} = offset;
-  let Inst{11-10} = 0b01;
-  let Inst{9-5}   = base;
-  let Inst{4-0}   = dst;
-
-  let DecoderMethod = "DecodeSignedLdStInstruction";
-}
-
-let hasSideEffects = 0 in {
-let mayStore = 0, mayLoad = 1 in
-// FIXME: Modeling the write-back of these instructions for isel is tricky.
-//        we need the complex addressing mode for the memory reference, but
-//        we also need the write-back specified as a tied operand to the
-//        base register. That combination does not play nicely with
-//        the asm matcher and friends.
-class LoadPostIdx<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
-             string asm>
-    : BaseLoadStorePostIdx<sz, V, opc,
-                      (outs regtype:$Rt/*, GPR64sp:$wback*/),
-                      (ins am_noindex:$addr, simm9:$idx),
-                      asm, ""/*"$addr.base = $wback"*/>,
-      Sched<[WriteLD, WriteI]>;
-
-let mayStore = 1, mayLoad = 0 in
-class StorePostIdx<bits<2> sz, bit V, bits<2> opc, RegisterClass regtype,
-             string asm>
-    : BaseLoadStorePostIdx<sz, V, opc,
-                      (outs/* GPR64sp:$wback*/),
-                      (ins regtype:$Rt, am_noindex:$addr, simm9:$idx),
-                       asm, ""/*"$addr.base = $wback"*/>,
-    Sched<[WriteAdr, WriteST, ReadAdrBase]>;
-} // hasSideEffects = 0
-
-// ISel pseudo-instructions which have the tied operands. When the MC lowering
-// logic finally gets smart enough to strip off tied operands that are just
-// for isel convenience, we can get rid of these pseudos and just reference
-// the real instructions directly.
-//
-// Ironically, also because of the writeback operands, we can't put the
-// matcher pattern directly on the instruction, but need to define it
-// separately.
-multiclass StorePostIdxPseudo<RegisterClass regtype, ValueType Ty,
-                              SDPatternOperator OpNode, Instruction Insn> {
-  let mayStore = 1, mayLoad = 0, hasSideEffects = 0 in
-  def _isel: Pseudo<(outs GPR64sp:$wback),
-                    (ins regtype:$Rt, am_noindex:$addr, simm9:$idx), [],
-                    "$addr.base = $wback,@earlyclobber $wback">,
-      PseudoInstExpansion<(Insn regtype:$Rt, am_noindex:$addr, simm9:$idx)>,
-      Sched<[WriteAdr, WriteST, ReadAdrBase]>;
-
-  def : Pat<(OpNode (Ty regtype:$Rt), am_noindex:$addr, simm9:$idx),
-            (!cast<Instruction>(NAME#_isel) regtype:$Rt, am_noindex:$addr,
-                                            simm9:$idx)>;
-}
-
-//---
-// Load/store pair
-//---
-
-// (indexed, offset)
-
-class BaseLoadStorePairOffset<bits<2> opc, bit V, bit L, dag oops, dag iops,
-                              string asm>
-    : I<oops, iops, asm, "\t$Rt, $Rt2, $addr", "", []> {
-  // The operands are in order to match the 'addr' MI operands, so we
-  // don't need an encoder method and by-name matching. Just use the default
-  // in-order handling. Since we're using by-order, make sure the names
-  // do not match.
-  bits<5> dst;
-  bits<5> dst2;
-  bits<5> base;
-  bits<7> offset;
-  let Inst{31-30} = opc;
-  let Inst{29-27} = 0b101;
-  let Inst{26}    = V;
-  let Inst{25-23} = 0b010;
-  let Inst{22}    = L;
-  let Inst{21-15} = offset;
-  let Inst{14-10} = dst2;
-  let Inst{9-5}   = base;
-  let Inst{4-0}   = dst;
-
-  let DecoderMethod = "DecodePairLdStInstruction";
-}
-
-let hasSideEffects = 0 in {
-let mayStore = 0, mayLoad = 1 in
-class LoadPairOffset<bits<2> opc, bit V, RegisterClass regtype,
-                     Operand indextype, string asm>
-    : BaseLoadStorePairOffset<opc, V, 1,
-                              (outs regtype:$Rt, regtype:$Rt2),
-                              (ins indextype:$addr), asm>,
-      Sched<[WriteLD, WriteLDHi]>;
-
-let mayLoad = 0, mayStore = 1 in
-class StorePairOffset<bits<2> opc, bit V, RegisterClass regtype,
-                      Operand indextype, string asm>
-    : BaseLoadStorePairOffset<opc, V, 0, (outs),
-                             (ins regtype:$Rt, regtype:$Rt2, indextype:$addr),
-                             asm>,
-      Sched<[WriteSTP]>;
-} // hasSideEffects = 0
-
-// (pre-indexed)
-
-def MemoryIndexed32SImm7 : AsmOperandClass {
-  let Name = "MemoryIndexed32SImm7";
-  let DiagnosticType = "InvalidMemoryIndexed32SImm7";
-}
-def am_indexed32simm7 : Operand<i32> { // ComplexPattern<...>
-  let PrintMethod = "printAMIndexed32";
-  let ParserMatchClass = MemoryIndexed32SImm7;
-  let MIOperandInfo = (ops GPR64sp:$base, i32imm:$offset);
-}
-
-def MemoryIndexed64SImm7 : AsmOperandClass {
-  let Name = "MemoryIndexed64SImm7";
-  let DiagnosticType = "InvalidMemoryIndexed64SImm7";
-}
-def am_indexed64simm7 : Operand<i32> { // ComplexPattern<...>
-  let PrintMethod = "printAMIndexed64";
-  let ParserMatchClass = MemoryIndexed64SImm7;
-  let MIOperandInfo = (ops GPR64sp:$base, i32imm:$offset);
-}
-
-def MemoryIndexed128SImm7 : AsmOperandClass {
-  let Name = "MemoryIndexed128SImm7";
-  let DiagnosticType = "InvalidMemoryIndexed128SImm7";
-}
-def am_indexed128simm7 : Operand<i32> { // ComplexPattern<...>
-  let PrintMethod = "printAMIndexed128";
-  let ParserMatchClass = MemoryIndexed128SImm7;
-  let MIOperandInfo = (ops GPR64sp:$base, i32imm:$offset);
-}
-
-class BaseLoadStorePairPreIdx<bits<2> opc, bit V, bit L, dag oops, dag iops,
-                              string asm>
-    : I<oops, iops, asm, "\t$Rt, $Rt2, $addr!", "", []> {
-  // The operands are in order to match the 'addr' MI operands, so we
-  // don't need an encoder method and by-name matching. Just use the default
-  // in-order handling. Since we're using by-order, make sure the names
-  // do not match.
-  bits<5> dst;
-  bits<5> dst2;
-  bits<5> base;
-  bits<7> offset;
-  let Inst{31-30} = opc;
-  let Inst{29-27} = 0b101;
-  let Inst{26}    = V;
-  let Inst{25-23} = 0b011;
-  let Inst{22}    = L;
-  let Inst{21-15} = offset;
-  let Inst{14-10} = dst2;
-  let Inst{9-5}   = base;
-  let Inst{4-0}   = dst;
-
-  let DecoderMethod = "DecodePairLdStInstruction";
-}
-
-let hasSideEffects = 0 in {
-let mayStore = 0, mayLoad = 1 in
-class LoadPairPreIdx<bits<2> opc, bit V, RegisterClass regtype,
-                     Operand addrmode, string asm>
-    : BaseLoadStorePairPreIdx<opc, V, 1,
-                              (outs regtype:$Rt, regtype:$Rt2),
-                              (ins addrmode:$addr), asm>,
-      Sched<[WriteLD, WriteLDHi, WriteAdr]>;
-
-let mayStore = 1, mayLoad = 0 in
-class StorePairPreIdx<bits<2> opc, bit V, RegisterClass regtype,
-                      Operand addrmode, string asm>
-    : BaseLoadStorePairPreIdx<opc, V, 0, (outs),
-                             (ins regtype:$Rt, regtype:$Rt2, addrmode:$addr),
-                             asm>,
-      Sched<[WriteAdr, WriteSTP]>;
-} // hasSideEffects = 0
-
-// (post-indexed)
-
-class BaseLoadStorePairPostIdx<bits<2> opc, bit V, bit L, dag oops, dag iops,
-                              string asm>
-    : I<oops, iops, asm, "\t$Rt, $Rt2, $addr, $idx", "", []> {
-  // The operands are in order to match the 'addr' MI operands, so we
-  // don't need an encoder method and by-name matching. Just use the default
-  // in-order handling. Since we're using by-order, make sure the names
-  // do not match.
-  bits<5> dst;
-  bits<5> dst2;
-  bits<5> base;
-  bits<7> offset;
-  let Inst{31-30} = opc;
-  let Inst{29-27} = 0b101;
-  let Inst{26}    = V;
-  let Inst{25-23} = 0b001;
-  let Inst{22}    = L;
-  let Inst{21-15} = offset;
-  let Inst{14-10} = dst2;
-  let Inst{9-5}   = base;
-  let Inst{4-0}   = dst;
-
-  let DecoderMethod = "DecodePairLdStInstruction";
-}
-
-let hasSideEffects = 0 in {
-let mayStore = 0, mayLoad = 1 in
-class LoadPairPostIdx<bits<2> opc, bit V, RegisterClass regtype,
-                      Operand idxtype, string asm>
-    : BaseLoadStorePairPostIdx<opc, V, 1,
-                              (outs regtype:$Rt, regtype:$Rt2),
-                              (ins am_noindex:$addr, idxtype:$idx), asm>,
-      Sched<[WriteLD, WriteLDHi, WriteAdr]>;
-
-let mayStore = 1, mayLoad = 0 in
-class StorePairPostIdx<bits<2> opc, bit V, RegisterClass regtype,
-                       Operand idxtype, string asm>
-    : BaseLoadStorePairPostIdx<opc, V, 0, (outs),
-                             (ins regtype:$Rt, regtype:$Rt2,
-                                  am_noindex:$addr, idxtype:$idx),
-                             asm>,
-      Sched<[WriteAdr, WriteSTP]>;
-} // hasSideEffects = 0
-
-//  (no-allocate)
-
-class BaseLoadStorePairNoAlloc<bits<2> opc, bit V, bit L, dag oops, dag iops,
-                              string asm>
-    : I<oops, iops, asm, "\t$Rt, $Rt2, $addr", "", []> {
-  // The operands are in order to match the 'addr' MI operands, so we
-  // don't need an encoder method and by-name matching. Just use the default
-  // in-order handling. Since we're using by-order, make sure the names
-  // do not match.
-  bits<5> dst;
-  bits<5> dst2;
-  bits<5> base;
-  bits<7> offset;
-  let Inst{31-30} = opc;
-  let Inst{29-27} = 0b101;
-  let Inst{26}    = V;
-  let Inst{25-23} = 0b000;
-  let Inst{22}    = L;
-  let Inst{21-15} = offset;
-  let Inst{14-10} = dst2;
-  let Inst{9-5}   = base;
-  let Inst{4-0}   = dst;
-
-  let DecoderMethod = "DecodePairLdStInstruction";
-}
-
-let hasSideEffects = 0 in {
-let mayStore = 0, mayLoad = 1 in
-class LoadPairNoAlloc<bits<2> opc, bit V, RegisterClass regtype,
-                     Operand indextype, string asm>
-    : BaseLoadStorePairNoAlloc<opc, V, 1,
-                              (outs regtype:$Rt, regtype:$Rt2),
-                              (ins indextype:$addr), asm>,
-      Sched<[WriteLD, WriteLDHi]>;
-
-let mayStore = 1, mayLoad = 0 in
-class StorePairNoAlloc<bits<2> opc, bit V, RegisterClass regtype,
-                      Operand indextype, string asm>
-    : BaseLoadStorePairNoAlloc<opc, V, 0, (outs),
-                             (ins regtype:$Rt, regtype:$Rt2, indextype:$addr),
-                             asm>,
-      Sched<[WriteSTP]>;
-} // hasSideEffects = 0
-
-//---
-// Load/store exclusive
-//---
-
-// True exclusive operations write to and/or read from the system's exclusive
-// monitors, which as far as a compiler is concerned can be modelled as a
-// random shared memory address. Hence LoadExclusive mayStore.
-let hasSideEffects = 1, mayLoad = 1, mayStore = 1 in
-class BaseLoadStoreExclusive<bits<2> sz, bit o2, bit L, bit o1, bit o0,
-                             dag oops, dag iops, string asm, string operands>
-    : I<oops, iops, asm, operands, "", []> {
-  let Inst{31-30} = sz;
-  let Inst{29-24} = 0b001000;
-  let Inst{23}    = o2;
-  let Inst{22}    = L;
-  let Inst{21}    = o1;
-  let Inst{15}    = o0;
-
-  let DecoderMethod = "DecodeExclusiveLdStInstruction";
-}
-
-// Neither Rs nor Rt2 operands.
-class LoadStoreExclusiveSimple<bits<2> sz, bit o2, bit L, bit o1, bit o0,
-                               dag oops, dag iops, string asm, string operands>
-    : BaseLoadStoreExclusive<sz, o2, L, o1, o0, oops, iops, asm, operands> {
-  bits<5> reg;
-  bits<5> base;
-  let Inst{20-16} = 0b11111;
-  let Inst{14-10} = 0b11111;
-  let Inst{9-5} = base;
-  let Inst{4-0} = reg;
-}
-
-// Simple load acquires don't set the exclusive monitor
-let mayLoad = 1, mayStore = 0 in
-class LoadAcquire<bits<2> sz, bit o2, bit L, bit o1, bit o0,
-                  RegisterClass regtype, string asm>
-    : LoadStoreExclusiveSimple<sz, o2, L, o1, o0, (outs regtype:$Rt),
-                               (ins am_noindex:$addr), asm, "\t$Rt, $addr">,
-      Sched<[WriteLD]>;
-
-class LoadExclusive<bits<2> sz, bit o2, bit L, bit o1, bit o0,
-                    RegisterClass regtype, string asm>
-    : LoadStoreExclusiveSimple<sz, o2, L, o1, o0, (outs regtype:$Rt),
-                               (ins am_noindex:$addr), asm, "\t$Rt, $addr">,
-      Sched<[WriteLD]>;
-
-class LoadExclusivePair<bits<2> sz, bit o2, bit L, bit o1, bit o0,
-                       RegisterClass regtype, string asm>
-    : BaseLoadStoreExclusive<sz, o2, L, o1, o0,
-                             (outs regtype:$Rt, regtype:$Rt2),
-                             (ins am_noindex:$addr), asm,
-                             "\t$Rt, $Rt2, $addr">,
-      Sched<[WriteLD, WriteLDHi]> {
-  bits<5> dst1;
-  bits<5> dst2;
-  bits<5> base;
-  let Inst{20-16} = 0b11111;
-  let Inst{14-10} = dst2;
-  let Inst{9-5} = base;
-  let Inst{4-0} = dst1;
-}
-
-// Simple store release operations do not check the exclusive monitor.
-let mayLoad = 0, mayStore = 1 in
-class StoreRelease<bits<2> sz, bit o2, bit L, bit o1, bit o0,
-                   RegisterClass regtype, string asm>
-    : LoadStoreExclusiveSimple<sz, o2, L, o1, o0, (outs),
-                               (ins regtype:$Rt, am_noindex:$addr),
-                               asm, "\t$Rt, $addr">,
-      Sched<[WriteST]>;
-
-let mayLoad = 1, mayStore = 1 in
-class StoreExclusive<bits<2> sz, bit o2, bit L, bit o1, bit o0,
-                     RegisterClass regtype, string asm>
-    : BaseLoadStoreExclusive<sz, o2, L, o1, o0, (outs GPR32:$Ws),
-                             (ins regtype:$Rt, am_noindex:$addr),
-                             asm, "\t$Ws, $Rt, $addr">,
-      Sched<[WriteSTX]> {
-  bits<5> status;
-  bits<5> reg;
-  bits<5> base;
-  let Inst{20-16} = status;
-  let Inst{14-10} = 0b11111;
-  let Inst{9-5} = base;
-  let Inst{4-0} = reg;
-
-  let Constraints = "@earlyclobber $Ws";
-}
-
-class StoreExclusivePair<bits<2> sz, bit o2, bit L, bit o1, bit o0,
-                         RegisterClass regtype, string asm>
-    : BaseLoadStoreExclusive<sz, o2, L, o1, o0,
-                             (outs GPR32:$Ws),
-                             (ins regtype:$Rt, regtype:$Rt2, am_noindex:$addr),
-                              asm, "\t$Ws, $Rt, $Rt2, $addr">,
-      Sched<[WriteSTX]> {
-  bits<5> status;
-  bits<5> dst1;
-  bits<5> dst2;
-  bits<5> base;
-  let Inst{20-16} = status;
-  let Inst{14-10} = dst2;
-  let Inst{9-5} = base;
-  let Inst{4-0} = dst1;
-
-  let Constraints = "@earlyclobber $Ws";
-}
-
-//---
-// Exception generation
-//---
-
-let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in
-class ExceptionGeneration<bits<3> op1, bits<2> ll, string asm>
-    : I<(outs), (ins imm0_65535:$imm), asm, "\t$imm", "", []>,
-      Sched<[WriteSys]> {
-  bits<16> imm;
-  let Inst{31-24} = 0b11010100;
-  let Inst{23-21} = op1;
-  let Inst{20-5}  = imm;
-  let Inst{4-2}   = 0b000;
-  let Inst{1-0}   = ll;
-}
-
-//---
-// Floating point to integer conversion
-//---
-
-class BaseFPToIntegerUnscaled<bits<2> type, bits<2> rmode, bits<3> opcode,
-                      RegisterClass srcType, RegisterClass dstType,
-                      string asm, list<dag> pattern>
-    : I<(outs dstType:$Rd), (ins srcType:$Rn),
-         asm, "\t$Rd, $Rn", "", pattern>,
-      Sched<[WriteFCvt]> {
-  bits<5> Rd;
-  bits<5> Rn;
-  let Inst{30}    = 0;
-  let Inst{28-24} = 0b11110;
-  let Inst{23-22} = type;
-  let Inst{21}    = 1;
-  let Inst{20-19} = rmode;
-  let Inst{18-16} = opcode;
-  let Inst{15-10} = 0;
-  let Inst{9-5}   = Rn;
-  let Inst{4-0}   = Rd;
-}
-
-let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
-class BaseFPToInteger<bits<2> type, bits<2> rmode, bits<3> opcode,
-                      RegisterClass srcType, RegisterClass dstType,
-                      Operand immType, string asm>
-    : I<(outs dstType:$Rd), (ins srcType:$Rn, immType:$scale),
-         asm, "\t$Rd, $Rn, $scale", "", []>,
-      Sched<[WriteFCvt]> {
-  bits<5> Rd;
-  bits<5> Rn;
-  bits<6> scale;
-  let Inst{30}    = 0;
-  let Inst{28-24} = 0b11110;
-  let Inst{23-22} = type;
-  let Inst{21}    = 0;
-  let Inst{20-19} = rmode;
-  let Inst{18-16} = opcode;
-  let Inst{15-10} = scale;
-  let Inst{9-5}   = Rn;
-  let Inst{4-0}   = Rd;
-}
-
-multiclass FPToInteger<bits<2> rmode, bits<3> opcode, string asm, SDPatternOperator OpN> {
-  // Unscaled single-precision to 32-bit
-  def UWSr : BaseFPToIntegerUnscaled<0b00, rmode, opcode, FPR32, GPR32, asm,
-                                     [(set GPR32:$Rd, (OpN FPR32:$Rn))]> {
-    let Inst{31} = 0; // 32-bit GPR flag
-  }
-
-  // Unscaled single-precision to 64-bit
-  def UXSr : BaseFPToIntegerUnscaled<0b00, rmode, opcode, FPR32, GPR64, asm,
-                                     [(set GPR64:$Rd, (OpN FPR32:$Rn))]> {
-    let Inst{31} = 1; // 64-bit GPR flag
-  }
-
-  // Unscaled double-precision to 32-bit
-  def UWDr : BaseFPToIntegerUnscaled<0b01, rmode, opcode, FPR64, GPR32, asm,
-                                     [(set GPR32:$Rd, (OpN (f64 FPR64:$Rn)))]> {
-    let Inst{31} = 0; // 32-bit GPR flag
-  }
-
-  // Unscaled double-precision to 64-bit
-  def UXDr : BaseFPToIntegerUnscaled<0b01, rmode, opcode, FPR64, GPR64, asm,
-                                     [(set GPR64:$Rd, (OpN (f64 FPR64:$Rn)))]> {
-    let Inst{31} = 1; // 64-bit GPR flag
-  }
-
-  // Scaled single-precision to 32-bit
-  def SWSri : BaseFPToInteger<0b00, rmode, opcode, FPR32, GPR32,
-                              fixedpoint32, asm> {
-    let Inst{31} = 0; // 32-bit GPR flag
-  }
-
-  // Scaled single-precision to 64-bit
-  def SXSri : BaseFPToInteger<0b00, rmode, opcode, FPR32, GPR64,
-                              fixedpoint64, asm> {
-    let Inst{31} = 1; // 64-bit GPR flag
-  }
-
-  // Scaled double-precision to 32-bit
-  def SWDri : BaseFPToInteger<0b01, rmode, opcode, FPR64, GPR32,
-                              fixedpoint32, asm> {
-    let Inst{31} = 0; // 32-bit GPR flag
-  }
-
-  // Scaled double-precision to 64-bit
-  def SXDri : BaseFPToInteger<0b01, rmode, opcode, FPR64, GPR64,
-                              fixedpoint64, asm> {
-    let Inst{31} = 1; // 64-bit GPR flag
-  }
-}
-
-//---
-// Integer to floating point conversion
-//---
-
-let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in
-class BaseIntegerToFP<bit isUnsigned,
-                      RegisterClass srcType, RegisterClass dstType,
-                      Operand immType, string asm>
-    : I<(outs dstType:$Rd), (ins srcType:$Rn, immType:$scale),
-         asm, "\t$Rd, $Rn, $scale", "", []>,
-      Sched<[WriteFCvt]> {
-  bits<5> Rd;
-  bits<5> Rn;
-  bits<6> scale;
-  let Inst{30-23} = 0b00111100;
-  let Inst{21-17} = 0b00001;
-  let Inst{16}    = isUnsigned;
-  let Inst{15-10} = scale;
-  let Inst{9-5}   = Rn;
-  let Inst{4-0}   = Rd;
-}
-
-class BaseIntegerToFPUnscaled<bit isUnsigned,
-                      RegisterClass srcType, RegisterClass dstType,
-                      ValueType dvt, string asm, SDNode node>
-    : I<(outs dstType:$Rd), (ins srcType:$Rn),
-         asm, "\t$Rd, $Rn", "", [(set (dvt dstType:$Rd), (node srcType:$Rn))]>,
-      Sched<[WriteFCvt]> {
-  bits<5> Rd;
-  bits<5> Rn;
-  bits<6> scale;
-  let Inst{30-23} = 0b00111100;
-  let Inst{21-17} = 0b10001;
-  let Inst{16}    = isUnsigned;
-  let Inst{15-10} = 0b000000;
-  let Inst{9-5}   = Rn;
-  let Inst{4-0}   = Rd;
-}
-
-multiclass IntegerToFP<bit isUnsigned, string asm, SDNode node> {
-  // Unscaled
-  def UWSri: BaseIntegerToFPUnscaled<isUnsigned, GPR32, FPR32, f32, asm, node> {
-    let Inst{31} = 0; // 32-bit GPR flag
-    let Inst{22} = 0; // 32-bit FPR flag
-  }
-
-  def UWDri: BaseIntegerToFPUnscaled<isUnsigned, GPR32, FPR64, f64, asm, node> {
-    let Inst{31} = 0; // 32-bit GPR flag
-    let Inst{22} = 1; // 64-bit FPR flag
-  }
-
-  def UXSri: BaseIntegerToFPUnscaled<isUnsigned, GPR64, FPR32, f32, asm, node> {
-    let Inst{31} = 1; // 64-bit GPR flag
-    let Inst{22} = 0; // 32-bit FPR flag
-  }
-
-  def UXDri: BaseIntegerToFPUnscaled<isUnsigned, GPR64, FPR64, f64, asm, node> {
-    let Inst{31} = 1; // 64-bit GPR flag
-    let Inst{22} = 1; // 64-bit FPR flag
-  }
-
-  // Scaled
-  def SWSri: BaseIntegerToFP<isUnsigned, GPR32, FPR32, fixedpoint32, asm> {
-    let Inst{31} = 0; // 32-bit GPR flag
-    let Inst{22} = 0; // 32-bit FPR flag
-  }
-
-  def SWDri: BaseIntegerToFP<isUnsigned, GPR32, FPR64, fixedpoint32, asm> {
-    let Inst{31} = 0; // 32-bit GPR flag
-    let Inst{22} = 1; // 64-bit FPR flag
-  }
-
-  def SXSri: BaseIntegerToFP<isUnsigned, GPR64, FPR32, fixedpoint64, asm> {
-    let Inst{31} = 1; // 64-bit GPR flag
-    let Inst{22} = 0; // 32-bit FPR flag
-  }
-
-  def SXDri: BaseIntegerToFP<isUnsigned, GPR64, FPR64, fixedpoint64, asm> {
-    let Inst{31} = 1; // 64-bit GPR flag
-    let Inst{22} = 1; // 64-bit FPR flag
-  }
-}
-
-//---
-// Unscaled integer <-> floating point conversion (i.e. FMOV)
-//---
-
-let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
-class BaseUnscaledConversion<bits<2> rmode, bits<3> opcode,
-                      RegisterClass srcType, RegisterClass dstType,
-                      string asm>
-    : I<(outs dstType:$Rd), (ins srcType:$Rn), asm, "\t$Rd, $Rn", "",
-        // We use COPY_TO_REGCLASS for these bitconvert operations.
-        // copyPhysReg() expands the resultant COPY instructions after
-        // regalloc is done. This gives greater freedom for the allocator
-        // and related passes (coalescing, copy propagation, et. al.) to
-        // be more effective.
-        [/*(set (dvt dstType:$Rd), (bitconvert (svt srcType:$Rn)))*/]>,
-      Sched<[WriteFCopy]> {
-  bits<5> Rd;
-  bits<5> Rn;
-  let Inst{30-23} = 0b00111100;
-  let Inst{21}    = 1;
-  let Inst{20-19} = rmode;
-  let Inst{18-16} = opcode;
-  let Inst{15-10} = 0b000000;
-  let Inst{9-5}   = Rn;
-  let Inst{4-0}   = Rd;
-}
-
-let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
-class BaseUnscaledConversionToHigh<bits<2> rmode, bits<3> opcode,
-                     RegisterClass srcType, RegisterOperand dstType, string asm>
-    : I<(outs dstType:$Rd), (ins srcType:$Rn), asm, "\t$Rd[1], $Rn", "", []>,
-      Sched<[WriteFCopy]> {
-  bits<5> Rd;
-  bits<5> Rn;
-  let Inst{30-23} = 0b00111101;
-  let Inst{21}    = 1;
-  let Inst{20-19} = rmode;
-  let Inst{18-16} = opcode;
-  let Inst{15-10} = 0b000000;
-  let Inst{9-5}   = Rn;
-  let Inst{4-0}   = Rd;
-}
-
-let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
-class BaseUnscaledConversionFromHigh<bits<2> rmode, bits<3> opcode,
-                     RegisterOperand srcType, RegisterClass dstType, string asm>
-    : I<(outs dstType:$Rd), (ins srcType:$Rn), asm, "\t$Rd, $Rn[1]", "", []>,
-      Sched<[WriteFCopy]> {
-  bits<5> Rd;
-  bits<5> Rn;
-  let Inst{30-23} = 0b00111101;
-  let Inst{21}    = 1;
-  let Inst{20-19} = rmode;
-  let Inst{18-16} = opcode;
-  let Inst{15-10} = 0b000000;
-  let Inst{9-5}   = Rn;
-  let Inst{4-0}   = Rd;
-}
-
-
-
-multiclass UnscaledConversion<string asm> {
-  def WSr : BaseUnscaledConversion<0b00, 0b111, GPR32, FPR32, asm> {
-    let Inst{31} = 0; // 32-bit GPR flag
-    let Inst{22} = 0; // 32-bit FPR flag
-  }
-
-  def XDr : BaseUnscaledConversion<0b00, 0b111, GPR64, FPR64, asm> {
-    let Inst{31} = 1; // 64-bit GPR flag
-    let Inst{22} = 1; // 64-bit FPR flag
-  }
-
-  def SWr : BaseUnscaledConversion<0b00, 0b110, FPR32, GPR32, asm> {
-    let Inst{31} = 0; // 32-bit GPR flag
-    let Inst{22} = 0; // 32-bit FPR flag
-  }
-
-  def DXr : BaseUnscaledConversion<0b00, 0b110, FPR64, GPR64, asm> {
-    let Inst{31} = 1; // 64-bit GPR flag
-    let Inst{22} = 1; // 64-bit FPR flag
-  }
-
-  def XDHighr : BaseUnscaledConversionToHigh<0b01, 0b111, GPR64, V128,
-                                             asm#".d"> {
-    let Inst{31} = 1;
-    let Inst{22} = 0;
-  }
-
-  def DXHighr : BaseUnscaledConversionFromHigh<0b01, 0b110, V128, GPR64,
-                                               asm#".d"> {
-    let Inst{31} = 1;
-    let Inst{22} = 0;
-  }
-
-  def : InstAlias<asm#"$Vd.d[1], $Rn",
-                  (!cast<Instruction>(NAME#XDHighr) V128:$Vd, GPR64:$Rn), 0>;
-  def : InstAlias<asm#"$Rd, $Vn.d[1]",
-                  (!cast<Instruction>(NAME#DXHighr) GPR64:$Rd, V128:$Vn), 0>;
-}
-
-//---
-// Floating point conversion
-//---
-
-class BaseFPConversion<bits<2> type, bits<2> opcode, RegisterClass dstType,
-                       RegisterClass srcType, string asm, list<dag> pattern>
-    : I<(outs dstType:$Rd), (ins srcType:$Rn), asm, "\t$Rd, $Rn", "", pattern>,
-      Sched<[WriteFCvt]> {
-  bits<5> Rd;
-  bits<5> Rn;
-  let Inst{31-24} = 0b00011110;
-  let Inst{23-22} = type;
-  let Inst{21-17} = 0b10001;
-  let Inst{16-15} = opcode;
-  let Inst{14-10} = 0b10000;
-  let Inst{9-5}   = Rn;
-  let Inst{4-0}   = Rd;
-}
-
-multiclass FPConversion<string asm> {
-  // Double-precision to Half-precision
-  let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
-  def HDr : BaseFPConversion<0b01, 0b11, FPR16, FPR64, asm, []>;
-
-  // Double-precision to Single-precision
-  def SDr : BaseFPConversion<0b01, 0b00, FPR32, FPR64, asm,
-                             [(set FPR32:$Rd, (fround FPR64:$Rn))]>;
-
-  // Half-precision to Double-precision
-  let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
-  def DHr : BaseFPConversion<0b11, 0b01, FPR64, FPR16, asm, []>;
-
-  // Half-precision to Single-precision
-  let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
-  def SHr : BaseFPConversion<0b11, 0b00, FPR32, FPR16, asm, []>;
-
-  // Single-precision to Double-precision
-  def DSr : BaseFPConversion<0b00, 0b01, FPR64, FPR32, asm,
-                             [(set FPR64:$Rd, (fextend FPR32:$Rn))]>;
-
-  // Single-precision to Half-precision
-  let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
-  def HSr : BaseFPConversion<0b00, 0b11, FPR16, FPR32, asm, []>;
-}
-
-//---
-// Single operand floating point data processing
-//---
-
-let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
-class BaseSingleOperandFPData<bits<4> opcode, RegisterClass regtype,
-                              ValueType vt, string asm, SDPatternOperator node>
-    : I<(outs regtype:$Rd), (ins regtype:$Rn), asm, "\t$Rd, $Rn", "",
-         [(set (vt regtype:$Rd), (node (vt regtype:$Rn)))]>,
-      Sched<[WriteF]> {
-  bits<5> Rd;
-  bits<5> Rn;
-  let Inst{31-23} = 0b000111100;
-  let Inst{21-19} = 0b100;
-  let Inst{18-15} = opcode;
-  let Inst{14-10} = 0b10000;
-  let Inst{9-5}   = Rn;
-  let Inst{4-0}   = Rd;
-}
-
-multiclass SingleOperandFPData<bits<4> opcode, string asm,
-                               SDPatternOperator node = null_frag> {
-  def Sr : BaseSingleOperandFPData<opcode, FPR32, f32, asm, node> {
-    let Inst{22} = 0; // 32-bit size flag
-  }
-
-  def Dr : BaseSingleOperandFPData<opcode, FPR64, f64, asm, node> {
-    let Inst{22} = 1; // 64-bit size flag
-  }
-}
-
-//---
-// Two operand floating point data processing
-//---
-
-let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
-class BaseTwoOperandFPData<bits<4> opcode, RegisterClass regtype,
-                           string asm, list<dag> pat>
-    : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm),
-         asm, "\t$Rd, $Rn, $Rm", "", pat>,
-      Sched<[WriteF]> {
-  bits<5> Rd;
-  bits<5> Rn;
-  bits<5> Rm;
-  let Inst{31-23} = 0b000111100;
-  let Inst{21}    = 1;
-  let Inst{20-16} = Rm;
-  let Inst{15-12} = opcode;
-  let Inst{11-10} = 0b10;
-  let Inst{9-5}   = Rn;
-  let Inst{4-0}   = Rd;
-}
-
-multiclass TwoOperandFPData<bits<4> opcode, string asm,
-                            SDPatternOperator node = null_frag> {
-  def Srr : BaseTwoOperandFPData<opcode, FPR32, asm,
-                         [(set (f32 FPR32:$Rd),
-                               (node (f32 FPR32:$Rn), (f32 FPR32:$Rm)))]> {
-    let Inst{22} = 0; // 32-bit size flag
-  }
-
-  def Drr : BaseTwoOperandFPData<opcode, FPR64, asm,
-                         [(set (f64 FPR64:$Rd),
-                               (node (f64 FPR64:$Rn), (f64 FPR64:$Rm)))]> {
-    let Inst{22} = 1; // 64-bit size flag
-  }
-}
-
-multiclass TwoOperandFPDataNeg<bits<4> opcode, string asm, SDNode node> {
-  def Srr : BaseTwoOperandFPData<opcode, FPR32, asm,
-                  [(set FPR32:$Rd, (fneg (node FPR32:$Rn, (f32 FPR32:$Rm))))]> {
-    let Inst{22} = 0; // 32-bit size flag
-  }
-
-  def Drr : BaseTwoOperandFPData<opcode, FPR64, asm,
-                  [(set FPR64:$Rd, (fneg (node FPR64:$Rn, (f64 FPR64:$Rm))))]> {
-    let Inst{22} = 1; // 64-bit size flag
-  }
-}
-
-
-//---
-// Three operand floating point data processing
-//---
-
-class BaseThreeOperandFPData<bit isNegated, bit isSub,
-                             RegisterClass regtype, string asm, list<dag> pat>
-    : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, regtype: $Ra),
-         asm, "\t$Rd, $Rn, $Rm, $Ra", "", pat>,
-      Sched<[WriteFMul]> {
-  bits<5> Rd;
-  bits<5> Rn;
-  bits<5> Rm;
-  bits<5> Ra;
-  let Inst{31-23} = 0b000111110;
-  let Inst{21}    = isNegated;
-  let Inst{20-16} = Rm;
-  let Inst{15}    = isSub;
-  let Inst{14-10} = Ra;
-  let Inst{9-5}   = Rn;
-  let Inst{4-0}   = Rd;
-}
-
-multiclass ThreeOperandFPData<bit isNegated, bit isSub,string asm,
-                              SDPatternOperator node> {
-  def Srrr : BaseThreeOperandFPData<isNegated, isSub, FPR32, asm,
-            [(set FPR32:$Rd,
-                  (node (f32 FPR32:$Rn), (f32 FPR32:$Rm), (f32 FPR32:$Ra)))]> {
-    let Inst{22} = 0; // 32-bit size flag
-  }
-
-  def Drrr : BaseThreeOperandFPData<isNegated, isSub, FPR64, asm,
-            [(set FPR64:$Rd,
-                  (node (f64 FPR64:$Rn), (f64 FPR64:$Rm), (f64 FPR64:$Ra)))]> {
-    let Inst{22} = 1; // 64-bit size flag
-  }
-}
-
-//---
-// Floating point data comparisons
-//---
-
-let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
-class BaseOneOperandFPComparison<bit signalAllNans,
-                                 RegisterClass regtype, string asm,
-                                 list<dag> pat>
-    : I<(outs), (ins regtype:$Rn), asm, "\t$Rn, #0.0", "", pat>,
-      Sched<[WriteFCmp]> {
-  bits<5> Rn;
-  let Inst{31-23} = 0b000111100;
-  let Inst{21}    = 1;
-
-  let Inst{20-16} = 0b00000;
-  let Inst{15-10} = 0b001000;
-  let Inst{9-5}   = Rn;
-  let Inst{4}     = signalAllNans;
-  let Inst{3-0}   = 0b1000;
-}
-
-let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
-class BaseTwoOperandFPComparison<bit signalAllNans, RegisterClass regtype,
-                                string asm, list<dag> pat>
-    : I<(outs), (ins regtype:$Rn, regtype:$Rm), asm, "\t$Rn, $Rm", "", pat>,
-      Sched<[WriteFCmp]> {
-  bits<5> Rm;
-  bits<5> Rn;
-  let Inst{31-23} = 0b000111100;
-  let Inst{21}    = 1;
-  let Inst{20-16} = Rm;
-  let Inst{15-10} = 0b001000;
-  let Inst{9-5}   = Rn;
-  let Inst{4}     = signalAllNans;
-  let Inst{3-0}   = 0b0000;
-}
-
-multiclass FPComparison<bit signalAllNans, string asm,
-                        SDPatternOperator OpNode = null_frag> {
-  let Defs = [CPSR] in {
-  def Srr : BaseTwoOperandFPComparison<signalAllNans, FPR32, asm,
-      [(OpNode FPR32:$Rn, (f32 FPR32:$Rm)), (implicit CPSR)]> {
-    let Inst{22} = 0;
-  }
-
-  def Sri : BaseOneOperandFPComparison<signalAllNans, FPR32, asm,
-      [(OpNode (f32 FPR32:$Rn), fpimm0), (implicit CPSR)]> {
-    let Inst{22} = 0;
-  }
-
-  def Drr : BaseTwoOperandFPComparison<signalAllNans, FPR64, asm,
-      [(OpNode FPR64:$Rn, (f64 FPR64:$Rm)), (implicit CPSR)]> {
-    let Inst{22} = 1;
-  }
-
-  def Dri : BaseOneOperandFPComparison<signalAllNans, FPR64, asm,
-      [(OpNode (f64 FPR64:$Rn), fpimm0), (implicit CPSR)]> {
-    let Inst{22} = 1;
-  }
-  } // Defs = [CPSR]
-}
-
-//---
-// Floating point conditional comparisons
-//---
-
-let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
-class BaseFPCondComparison<bit signalAllNans,
-                              RegisterClass regtype, string asm>
-    : I<(outs), (ins regtype:$Rn, regtype:$Rm, imm0_15:$nzcv, ccode:$cond),
-         asm, "\t$Rn, $Rm, $nzcv, $cond", "", []>,
-      Sched<[WriteFCmp]> {
-  bits<5> Rn;
-  bits<5> Rm;
-  bits<4> nzcv;
-  bits<4> cond;
-
-  let Inst{31-23} = 0b000111100;
-  let Inst{21}    = 1;
-  let Inst{20-16} = Rm;
-  let Inst{15-12} = cond;
-  let Inst{11-10} = 0b01;
-  let Inst{9-5}   = Rn;
-  let Inst{4}     = signalAllNans;
-  let Inst{3-0}   = nzcv;
-}
-
-multiclass FPCondComparison<bit signalAllNans, string asm> {
-  let Defs = [CPSR], Uses = [CPSR] in {
-  def Srr : BaseFPCondComparison<signalAllNans, FPR32, asm> {
-    let Inst{22} = 0;
-  }
-
-  def Drr : BaseFPCondComparison<signalAllNans, FPR64, asm> {
-    let Inst{22} = 1;
-  }
-  } // Defs = [CPSR], Uses = [CPSR]
-}
-
-//---
-// Floating point conditional select
-//---
-
-class BaseFPCondSelect<RegisterClass regtype, ValueType vt, string asm>
-    : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, ccode:$cond),
-         asm, "\t$Rd, $Rn, $Rm, $cond", "",
-         [(set regtype:$Rd,
-               (ARM64csel (vt regtype:$Rn), regtype:$Rm,
-                          (i32 imm:$cond), CPSR))]>,
-      Sched<[WriteF]> {
-  bits<5> Rd;
-  bits<5> Rn;
-  bits<5> Rm;
-  bits<4> cond;
-
-  let Inst{31-23} = 0b000111100;
-  let Inst{21}    = 1;
-  let Inst{20-16} = Rm;
-  let Inst{15-12} = cond;
-  let Inst{11-10} = 0b11;
-  let Inst{9-5}   = Rn;
-  let Inst{4-0}   = Rd;
-}
-
-multiclass FPCondSelect<string asm> {
-  let Uses = [CPSR] in {
-  def Srrr : BaseFPCondSelect<FPR32, f32, asm> {
-    let Inst{22} = 0;
-  }
-
-  def Drrr : BaseFPCondSelect<FPR64, f64, asm> {
-    let Inst{22} = 1;
-  }
-  } // Uses = [CPSR]
-}
-
-//---
-// Floating move immediate
-//---
-
-class BaseFPMoveImmediate<RegisterClass regtype, Operand fpimmtype, string asm>
-  : I<(outs regtype:$Rd), (ins fpimmtype:$imm), asm, "\t$Rd, $imm", "",
-      [(set regtype:$Rd, fpimmtype:$imm)]>,
-    Sched<[WriteFImm]> {
-  bits<5> Rd;
-  bits<8> imm;
-  let Inst{31-23} = 0b000111100;
-  let Inst{21}    = 1;
-  let Inst{20-13} = imm;
-  let Inst{12-5}  = 0b10000000;
-  let Inst{4-0}   = Rd;
-}
-
-multiclass FPMoveImmediate<string asm> {
-  def Si : BaseFPMoveImmediate<FPR32, fpimm32, asm> {
-    let Inst{22} = 0;
-  }
-
-  def Di : BaseFPMoveImmediate<FPR64, fpimm64, asm> {
-    let Inst{22} = 1;
-  }
-}
-
-//----------------------------------------------------------------------------
-// AdvSIMD
-//----------------------------------------------------------------------------
-
-def VectorIndexBOperand : AsmOperandClass { let Name = "VectorIndexB"; }
-def VectorIndexHOperand : AsmOperandClass { let Name = "VectorIndexH"; }
-def VectorIndexSOperand : AsmOperandClass { let Name = "VectorIndexS"; }
-def VectorIndexDOperand : AsmOperandClass { let Name = "VectorIndexD"; }
-def VectorIndexB : Operand<i64>, ImmLeaf<i64, [{
-  return ((uint64_t)Imm) < 16;
-}]> {
-  let ParserMatchClass = VectorIndexBOperand;
-  let PrintMethod = "printVectorIndex";
-  let MIOperandInfo = (ops i64imm);
-}
-def VectorIndexH : Operand<i64>, ImmLeaf<i64, [{
-  return ((uint64_t)Imm) < 8;
-}]> {
-  let ParserMatchClass = VectorIndexHOperand;
-  let PrintMethod = "printVectorIndex";
-  let MIOperandInfo = (ops i64imm);
-}
-def VectorIndexS : Operand<i64>, ImmLeaf<i64, [{
-  return ((uint64_t)Imm) < 4;
-}]> {
-  let ParserMatchClass = VectorIndexSOperand;
-  let PrintMethod = "printVectorIndex";
-  let MIOperandInfo = (ops i64imm);
-}
-def VectorIndexD : Operand<i64>, ImmLeaf<i64, [{
-  return ((uint64_t)Imm) < 2;
-}]> {
-  let ParserMatchClass = VectorIndexDOperand;
-  let PrintMethod = "printVectorIndex";
-  let MIOperandInfo = (ops i64imm);
-}
-
-//----------------------------------------------------------------------------
-// AdvSIMD three register vector instructions
-//----------------------------------------------------------------------------
-
-let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
-class BaseSIMDThreeSameVector<bit Q, bit U, bits<2> size, bits<5> opcode,
-                        RegisterOperand regtype, string asm, string kind,
-                        list<dag> pattern>
-  : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), asm,
-      "{\t$Rd" # kind # ", $Rn" # kind # ", $Rm" # kind #
-      "|" # kind # "\t$Rd, $Rn, $Rm|}", "", pattern>,
-    Sched<[WriteV]> {
-  bits<5> Rd;
-  bits<5> Rn;
-  bits<5> Rm;
-  let Inst{31}    = 0;
-  let Inst{30}    = Q;
-  let Inst{29}    = U;
-  let Inst{28-24} = 0b01110;
-  let Inst{23-22} = size;
-  let Inst{21}    = 1;
-  let Inst{20-16} = Rm;
-  let Inst{15-11} = opcode;
-  let Inst{10}    = 1;
-  let Inst{9-5}   = Rn;
-  let Inst{4-0}   = Rd;
-}
-
-let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
-class BaseSIMDThreeSameVectorTied<bit Q, bit U, bits<2> size, bits<5> opcode,
-                        RegisterOperand regtype, string asm, string kind,
-                        list<dag> pattern>
-  : I<(outs regtype:$dst), (ins regtype:$Rd, regtype:$Rn, regtype:$Rm), asm,
-      "{\t$Rd" # kind # ", $Rn" # kind # ", $Rm" # kind #
-      "|" # kind # "\t$Rd, $Rn, $Rm}", "$Rd = $dst", pattern>,
-    Sched<[WriteV]> {
-  bits<5> Rd;
-  bits<5> Rn;
-  bits<5> Rm;
-  let Inst{31}    = 0;
-  let Inst{30}    = Q;
-  let Inst{29}    = U;
-  let Inst{28-24} = 0b01110;
-  let Inst{23-22} = size;
-  let Inst{21}    = 1;
-  let Inst{20-16} = Rm;
-  let Inst{15-11} = opcode;
-  let Inst{10}    = 1;
-  let Inst{9-5}   = Rn;
-  let Inst{4-0}   = Rd;
-}
-
-// All operand sizes distinguished in the encoding.
-multiclass SIMDThreeSameVector<bit U, bits<5> opc, string asm,
-                               SDPatternOperator OpNode> {
-  def v8i8  : BaseSIMDThreeSameVector<0, U, 0b00, opc, V64,
-                                      asm, ".8b",
-         [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>;
-  def v16i8 : BaseSIMDThreeSameVector<1, U, 0b00, opc, V128,
-                                      asm, ".16b",
-         [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn), (v16i8 V128:$Rm)))]>;
-  def v4i16 : BaseSIMDThreeSameVector<0, U, 0b01, opc, V64,
-                                      asm, ".4h",
-         [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>;
-  def v8i16 : BaseSIMDThreeSameVector<1, U, 0b01, opc, V128,
-                                      asm, ".8h",
-         [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn), (v8i16 V128:$Rm)))]>;
-  def v2i32 : BaseSIMDThreeSameVector<0, U, 0b10, opc, V64,
-                                      asm, ".2s",
-         [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>;
-  def v4i32 : BaseSIMDThreeSameVector<1, U, 0b10, opc, V128,
-                                      asm, ".4s",
-         [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn), (v4i32 V128:$Rm)))]>;
-  def v2i64 : BaseSIMDThreeSameVector<1, U, 0b11, opc, V128,
-                                      asm, ".2d",
-         [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn), (v2i64 V128:$Rm)))]>;
-}
-
-// As above, but D sized elements unsupported.
-multiclass SIMDThreeSameVectorBHS<bit U, bits<5> opc, string asm,
-                                  SDPatternOperator OpNode> {
-  def v8i8  : BaseSIMDThreeSameVector<0, U, 0b00, opc, V64,
-                                      asm, ".8b",
-        [(set V64:$Rd, (v8i8 (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm))))]>;
-  def v16i8 : BaseSIMDThreeSameVector<1, U, 0b00, opc, V128,
-                                      asm, ".16b",
-        [(set V128:$Rd, (v16i8 (OpNode (v16i8 V128:$Rn), (v16i8 V128:$Rm))))]>;
-  def v4i16 : BaseSIMDThreeSameVector<0, U, 0b01, opc, V64,
-                                      asm, ".4h",
-        [(set V64:$Rd, (v4i16 (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm))))]>;
-  def v8i16 : BaseSIMDThreeSameVector<1, U, 0b01, opc, V128,
-                                      asm, ".8h",
-        [(set V128:$Rd, (v8i16 (OpNode (v8i16 V128:$Rn), (v8i16 V128:$Rm))))]>;
-  def v2i32 : BaseSIMDThreeSameVector<0, U, 0b10, opc, V64,
-                                      asm, ".2s",
-        [(set V64:$Rd, (v2i32 (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm))))]>;
-  def v4i32 : BaseSIMDThreeSameVector<1, U, 0b10, opc, V128,
-                                      asm, ".4s",
-        [(set V128:$Rd, (v4i32 (OpNode (v4i32 V128:$Rn), (v4i32 V128:$Rm))))]>;
-}
-
-multiclass SIMDThreeSameVectorBHSTied<bit U, bits<5> opc, string asm,
-                                  SDPatternOperator OpNode> {
-  def v8i8  : BaseSIMDThreeSameVectorTied<0, U, 0b00, opc, V64,
-                                      asm, ".8b",
-      [(set (v8i8 V64:$dst),
-            (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>;
-  def v16i8 : BaseSIMDThreeSameVectorTied<1, U, 0b00, opc, V128,
-                                      asm, ".16b",
-      [(set (v16i8 V128:$dst),
-            (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn), (v16i8 V128:$Rm)))]>;
-  def v4i16 : BaseSIMDThreeSameVectorTied<0, U, 0b01, opc, V64,
-                                      asm, ".4h",
-      [(set (v4i16 V64:$dst),
-            (OpNode (v4i16 V64:$Rd), (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>;
-  def v8i16 : BaseSIMDThreeSameVectorTied<1, U, 0b01, opc, V128,
-                                      asm, ".8h",
-      [(set (v8i16 V128:$dst),
-            (OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn), (v8i16 V128:$Rm)))]>;
-  def v2i32 : BaseSIMDThreeSameVectorTied<0, U, 0b10, opc, V64,
-                                      asm, ".2s",
-      [(set (v2i32 V64:$dst),
-            (OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>;
-  def v4i32 : BaseSIMDThreeSameVectorTied<1, U, 0b10, opc, V128,
-                                      asm, ".4s",
-      [(set (v4i32 V128:$dst),
-            (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn), (v4i32 V128:$Rm)))]>;
-}
-
-// As above, but only B sized elements supported.
-multiclass SIMDThreeSameVectorB<bit U, bits<5> opc, string asm,
-                                SDPatternOperator OpNode> {
-  def v8i8  : BaseSIMDThreeSameVector<0, U, 0b00, opc, V64,
-                                      asm, ".8b",
-    [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>;
-  def v16i8 : BaseSIMDThreeSameVector<1, U, 0b00, opc, V128,
-                                      asm, ".16b",
-    [(set (v16i8 V128:$Rd),
-          (OpNode (v16i8 V128:$Rn), (v16i8 V128:$Rm)))]>;
-}
-
-// As above, but only S and D sized floating point elements supported.
-multiclass SIMDThreeSameVectorFP<bit U, bit S, bits<5> opc,
-                                 string asm, SDPatternOperator OpNode> {
-  def v2f32 : BaseSIMDThreeSameVector<0, U, {S,0}, opc, V64,
-                                      asm, ".2s",
-        [(set (v2f32 V64:$Rd), (OpNode (v2f32 V64:$Rn), (v2f32 V64:$Rm)))]>;
-  def v4f32 : BaseSIMDThreeSameVector<1, U, {S,0}, opc, V128,
-                                      asm, ".4s",
-        [(set (v4f32 V128:$Rd), (OpNode (v4f32 V128:$Rn), (v4f32 V128:$Rm)))]>;
-  def v2f64 : BaseSIMDThreeSameVector<1, U, {S,1}, opc, V128,
-                                      asm, ".2d",
-        [(set (v2f64 V128:$Rd), (OpNode (v2f64 V128:$Rn), (v2f64 V128:$Rm)))]>;
-}
-
-multiclass SIMDThreeSameVectorFPCmp<bit U, bit S, bits<5> opc,
-                                    string asm,
-                                    SDPatternOperator OpNode> {
-  def v2f32 : BaseSIMDThreeSameVector<0, U, {S,0}, opc, V64,
-                                      asm, ".2s",
-        [(set (v2i32 V64:$Rd), (OpNode (v2f32 V64:$Rn), (v2f32 V64:$Rm)))]>;
-  def v4f32 : BaseSIMDThreeSameVector<1, U, {S,0}, opc, V128,
-                                      asm, ".4s",
-        [(set (v4i32 V128:$Rd), (OpNode (v4f32 V128:$Rn), (v4f32 V128:$Rm)))]>;
-  def v2f64 : BaseSIMDThreeSameVector<1, U, {S,1}, opc, V128,
-                                      asm, ".2d",
-        [(set (v2i64 V128:$Rd), (OpNode (v2f64 V128:$Rn), (v2f64 V128:$Rm)))]>;
-}
-
-multiclass SIMDThreeSameVectorFPTied<bit U, bit S, bits<5> opc,
-                                 string asm, SDPatternOperator OpNode> {
-  def v2f32 : BaseSIMDThreeSameVectorTied<0, U, {S,0}, opc, V64,
-                                      asm, ".2s",
-     [(set (v2f32 V64:$dst),
-           (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn), (v2f32 V64:$Rm)))]>;
-  def v4f32 : BaseSIMDThreeSameVectorTied<1, U, {S,0}, opc, V128,
-                                      asm, ".4s",
-     [(set (v4f32 V128:$dst),
-           (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn), (v4f32 V128:$Rm)))]>;
-  def v2f64 : BaseSIMDThreeSameVectorTied<1, U, {S,1}, opc, V128,
-                                      asm, ".2d",
-     [(set (v2f64 V128:$dst),
-           (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn), (v2f64 V128:$Rm)))]>;
-}
-
-// As above, but D and B sized elements unsupported.
-multiclass SIMDThreeSameVectorHS<bit U, bits<5> opc, string asm,
-                                SDPatternOperator OpNode> {
-  def v4i16 : BaseSIMDThreeSameVector<0, U, 0b01, opc, V64,
-                                      asm, ".4h",
-        [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>;
-  def v8i16 : BaseSIMDThreeSameVector<1, U, 0b01, opc, V128,
-                                      asm, ".8h",
-        [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn), (v8i16 V128:$Rm)))]>;
-  def v2i32 : BaseSIMDThreeSameVector<0, U, 0b10, opc, V64,
-                                      asm, ".2s",
-        [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>;
-  def v4i32 : BaseSIMDThreeSameVector<1, U, 0b10, opc, V128,
-                                      asm, ".4s",
-        [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn), (v4i32 V128:$Rm)))]>;
-}
-
-// Logical three vector ops share opcode bits, and only use B sized elements.
-multiclass SIMDLogicalThreeVector<bit U, bits<2> size, string asm,
-                                  SDPatternOperator OpNode = null_frag> {
-  def v8i8  : BaseSIMDThreeSameVector<0, U, size, 0b00011, V64,
-                                     asm, ".8b",
-                         [(set (v8i8 V64:$Rd), (OpNode V64:$Rn, V64:$Rm))]>;
-  def v16i8  : BaseSIMDThreeSameVector<1, U, size, 0b00011, V128,
-                                     asm, ".16b",
-                         [(set (v16i8 V128:$Rd), (OpNode V128:$Rn, V128:$Rm))]>;
-
-  def : Pat<(v4i16 (OpNode V64:$LHS, V64:$RHS)),
-          (!cast<Instruction>(NAME#"v8i8") V64:$LHS, V64:$RHS)>;
-  def : Pat<(v2i32 (OpNode V64:$LHS, V64:$RHS)),
-          (!cast<Instruction>(NAME#"v8i8") V64:$LHS, V64:$RHS)>;
-  def : Pat<(v1i64 (OpNode V64:$LHS, V64:$RHS)),
-          (!cast<Instruction>(NAME#"v8i8") V64:$LHS, V64:$RHS)>;
-
-  def : Pat<(v8i16 (OpNode V128:$LHS, V128:$RHS)),
-      (!cast<Instruction>(NAME#"v16i8") V128:$LHS, V128:$RHS)>;
-  def : Pat<(v4i32 (OpNode V128:$LHS, V128:$RHS)),
-      (!cast<Instruction>(NAME#"v16i8") V128:$LHS, V128:$RHS)>;
-  def : Pat<(v2i64 (OpNode V128:$LHS, V128:$RHS)),
-      (!cast<Instruction>(NAME#"v16i8") V128:$LHS, V128:$RHS)>;
-}
-
-multiclass SIMDLogicalThreeVectorTied<bit U, bits<2> size,
-                                  string asm, SDPatternOperator OpNode> {
-  def v8i8  : BaseSIMDThreeSameVectorTied<0, U, size, 0b00011, V64,
-                                     asm, ".8b",
-             [(set (v8i8 V64:$dst),
-                   (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>;
-  def v16i8  : BaseSIMDThreeSameVectorTied<1, U, size, 0b00011, V128,
-                                     asm, ".16b",
-             [(set (v16i8 V128:$dst),
-                   (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn),
-                           (v16i8 V128:$Rm)))]>;
-
-  def : Pat<(v4i16 (OpNode (v4i16 V64:$LHS), (v4i16 V64:$MHS),
-                           (v4i16 V64:$RHS))),
-          (!cast<Instruction>(NAME#"v8i8")
-            V64:$LHS, V64:$MHS, V64:$RHS)>;
-  def : Pat<(v2i32 (OpNode (v2i32 V64:$LHS), (v2i32 V64:$MHS),
-                           (v2i32 V64:$RHS))),
-          (!cast<Instruction>(NAME#"v8i8")
-            V64:$LHS, V64:$MHS, V64:$RHS)>;
-  def : Pat<(v1i64 (OpNode (v1i64 V64:$LHS), (v1i64 V64:$MHS),
-                           (v1i64 V64:$RHS))),
-          (!cast<Instruction>(NAME#"v8i8")
-            V64:$LHS, V64:$MHS, V64:$RHS)>;
-
-  def : Pat<(v8i16 (OpNode (v8i16 V128:$LHS), (v8i16 V128:$MHS),
-                           (v8i16 V128:$RHS))),
-      (!cast<Instruction>(NAME#"v16i8")
-        V128:$LHS, V128:$MHS, V128:$RHS)>;
-  def : Pat<(v4i32 (OpNode (v4i32 V128:$LHS), (v4i32 V128:$MHS),
-                           (v4i32 V128:$RHS))),
-      (!cast<Instruction>(NAME#"v16i8")
-        V128:$LHS, V128:$MHS, V128:$RHS)>;
-  def : Pat<(v2i64 (OpNode (v2i64 V128:$LHS), (v2i64 V128:$MHS),
-                           (v2i64 V128:$RHS))),
-      (!cast<Instruction>(NAME#"v16i8")
-        V128:$LHS, V128:$MHS, V128:$RHS)>;
-}
-
-
-//----------------------------------------------------------------------------
-// AdvSIMD two register vector instructions.
-//----------------------------------------------------------------------------
-
-let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
-class BaseSIMDTwoSameVector<bit Q, bit U, bits<2> size, bits<5> opcode,
-                        RegisterOperand regtype, string asm, string dstkind,
-                        string srckind, list<dag> pattern>
-  : I<(outs regtype:$Rd), (ins regtype:$Rn), asm,
-      "{\t$Rd" # dstkind # ", $Rn" # srckind #
-      "|" # dstkind # "\t$Rd, $Rn}", "", pattern>,
-    Sched<[WriteV]> {
-  bits<5> Rd;
-  bits<5> Rn;
-  let Inst{31}    = 0;
-  let Inst{30}    = Q;
-  let Inst{29}    = U;
-  let Inst{28-24} = 0b01110;
-  let Inst{23-22} = size;
-  let Inst{21-17} = 0b10000;
-  let Inst{16-12} = opcode;
-  let Inst{11-10} = 0b10;
-  let Inst{9-5}   = Rn;
-  let Inst{4-0}   = Rd;
-}
-
-let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
-class BaseSIMDTwoSameVectorTied<bit Q, bit U, bits<2> size, bits<5> opcode,
-                            RegisterOperand regtype, string asm, string dstkind,
-                            string srckind, list<dag> pattern>
-  : I<(outs regtype:$dst), (ins regtype:$Rd, regtype:$Rn), asm,
-      "{\t$Rd" # dstkind # ", $Rn" # srckind #
-      "|" # dstkind # "\t$Rd, $Rn}", "$Rd = $dst", pattern>,
-    Sched<[WriteV]> {
-  bits<5> Rd;
-  bits<5> Rn;
-  let Inst{31}    = 0;
-  let Inst{30}    = Q;
-  let Inst{29}    = U;
-  let Inst{28-24} = 0b01110;
-  let Inst{23-22} = size;
-  let Inst{21-17} = 0b10000;
-  let Inst{16-12} = opcode;
-  let Inst{11-10} = 0b10;
-  let Inst{9-5}   = Rn;
-  let Inst{4-0}   = Rd;
-}
-
-// Supports B, H, and S element sizes.
-multiclass SIMDTwoVectorBHS<bit U, bits<5> opc, string asm,
-                            SDPatternOperator OpNode> {
-  def v8i8  : BaseSIMDTwoSameVector<0, U, 0b00, opc, V64,
-                                      asm, ".8b", ".8b",
-                          [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>;
-  def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, V128,
-                                      asm, ".16b", ".16b",
-                          [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>;
-  def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, V64,
-                                      asm, ".4h", ".4h",
-                          [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn)))]>;
-  def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, V128,
-                                      asm, ".8h", ".8h",
-                          [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn)))]>;
-  def v2i32 : BaseSIMDTwoSameVector<0, U, 0b10, opc, V64,
-                                      asm, ".2s", ".2s",
-                          [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>;
-  def v4i32 : BaseSIMDTwoSameVector<1, U, 0b10, opc, V128,
-                                      asm, ".4s", ".4s",
-                          [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>;
-}
-
-class BaseSIMDVectorLShiftLongBySize<bit Q, bits<2> size,
-                            RegisterOperand regtype, string asm, string dstkind,
-                            string srckind, string amount>
-  : I<(outs V128:$Rd), (ins regtype:$Rn), asm,
-      "{\t$Rd" # dstkind # ", $Rn" # srckind # ", #" # amount #
-      "|" # dstkind # "\t$Rd, $Rn, #" #  amount # "}", "", []>,
-    Sched<[WriteV]> {
-  bits<5> Rd;
-  bits<5> Rn;
-  let Inst{31}    = 0;
-  let Inst{30}    = Q;
-  let Inst{29-24} = 0b101110;
-  let Inst{23-22} = size;
-  let Inst{21-10} = 0b100001001110;
-  let Inst{9-5}   = Rn;
-  let Inst{4-0}   = Rd;
-}
-
-multiclass SIMDVectorLShiftLongBySizeBHS {
-  let neverHasSideEffects = 1 in {
-  def v8i8  : BaseSIMDVectorLShiftLongBySize<0, 0b00, V64,
-                                             "shll", ".8h",  ".8b", "8">;
-  def v16i8 : BaseSIMDVectorLShiftLongBySize<1, 0b00, V128,
-                                             "shll2", ".8h", ".16b", "8">;
-  def v4i16 : BaseSIMDVectorLShiftLongBySize<0, 0b01, V64,
-                                             "shll", ".4s",  ".4h", "16">;
-  def v8i16 : BaseSIMDVectorLShiftLongBySize<1, 0b01, V128,
-                                             "shll2", ".4s", ".8h", "16">;
-  def v2i32 : BaseSIMDVectorLShiftLongBySize<0, 0b10, V64,
-                                             "shll", ".2d",  ".2s", "32">;
-  def v4i32 : BaseSIMDVectorLShiftLongBySize<1, 0b10, V128,
-                                             "shll2", ".2d", ".4s", "32">;
-  }
-}
-
-// Supports all element sizes.
-multiclass SIMDLongTwoVector<bit U, bits<5> opc, string asm,
-                             SDPatternOperator OpNode> {
-  def v8i8_v4i16  : BaseSIMDTwoSameVector<0, U, 0b00, opc, V64,
-                                      asm, ".4h", ".8b",
-               [(set (v4i16 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>;
-  def v16i8_v8i16 : BaseSIMDTwoSameVector<1, U, 0b00, opc, V128,
-                                      asm, ".8h", ".16b",
-               [(set (v8i16 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>;
-  def v4i16_v2i32 : BaseSIMDTwoSameVector<0, U, 0b01, opc, V64,
-                                      asm, ".2s", ".4h",
-               [(set (v2i32 V64:$Rd), (OpNode (v4i16 V64:$Rn)))]>;
-  def v8i16_v4i32 : BaseSIMDTwoSameVector<1, U, 0b01, opc, V128,
-                                      asm, ".4s", ".8h",
-               [(set (v4i32 V128:$Rd), (OpNode (v8i16 V128:$Rn)))]>;
-  def v2i32_v1i64 : BaseSIMDTwoSameVector<0, U, 0b10, opc, V64,
-                                      asm, ".1d", ".2s",
-               [(set (v1i64 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>;
-  def v4i32_v2i64 : BaseSIMDTwoSameVector<1, U, 0b10, opc, V128,
-                                      asm, ".2d", ".4s",
-               [(set (v2i64 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>;
-}
-
-multiclass SIMDLongTwoVectorTied<bit U, bits<5> opc, string asm,
-                                 SDPatternOperator OpNode> {
-  def v8i8_v4i16  : BaseSIMDTwoSameVectorTied<0, U, 0b00, opc, V64,
-                                          asm, ".4h", ".8b",
-      [(set (v4i16 V64:$dst), (OpNode (v4i16 V64:$Rd),
-                                      (v8i8 V64:$Rn)))]>;
-  def v16i8_v8i16 : BaseSIMDTwoSameVectorTied<1, U, 0b00, opc, V128,
-                                          asm, ".8h", ".16b",
-      [(set (v8i16 V128:$dst), (OpNode (v8i16 V128:$Rd),
-                                      (v16i8 V128:$Rn)))]>;
-  def v4i16_v2i32 : BaseSIMDTwoSameVectorTied<0, U, 0b01, opc, V64,
-                                          asm, ".2s", ".4h",
-      [(set (v2i32 V64:$dst), (OpNode (v2i32 V64:$Rd),
-                                      (v4i16 V64:$Rn)))]>;
-  def v8i16_v4i32 : BaseSIMDTwoSameVectorTied<1, U, 0b01, opc, V128,
-                                          asm, ".4s", ".8h",
-      [(set (v4i32 V128:$dst), (OpNode (v4i32 V128:$Rd),
-                                      (v8i16 V128:$Rn)))]>;
-  def v2i32_v1i64 : BaseSIMDTwoSameVectorTied<0, U, 0b10, opc, V64,
-                                          asm, ".1d", ".2s",
-      [(set (v1i64 V64:$dst), (OpNode (v1i64 V64:$Rd),
-                                      (v2i32 V64:$Rn)))]>;
-  def v4i32_v2i64 : BaseSIMDTwoSameVectorTied<1, U, 0b10, opc, V128,
-                                          asm, ".2d", ".4s",
-      [(set (v2i64 V128:$dst), (OpNode (v2i64 V128:$Rd),
-                                      (v4i32 V128:$Rn)))]>;
-}
-
-// Supports all element sizes, except 1xD.
-multiclass SIMDTwoVectorBHSDTied<bit U, bits<5> opc, string asm,
-                                  SDPatternOperator OpNode> {
-  def v8i8  : BaseSIMDTwoSameVectorTied<0, U, 0b00, opc, V64,
-                                    asm, ".8b", ".8b",
-    [(set (v8i8 V64:$dst), (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn)))]>;
-  def v16i8 : BaseSIMDTwoSameVectorTied<1, U, 0b00, opc, V128,
-                                    asm, ".16b", ".16b",
-    [(set (v16i8 V128:$dst), (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn)))]>;
-  def v4i16 : BaseSIMDTwoSameVectorTied<0, U, 0b01, opc, V64,
-                                    asm, ".4h", ".4h",
-    [(set (v4i16 V64:$dst), (OpNode (v4i16 V64:$Rd), (v4i16 V64:$Rn)))]>;
-  def v8i16 : BaseSIMDTwoSameVectorTied<1, U, 0b01, opc, V128,
-                                    asm, ".8h", ".8h",
-    [(set (v8i16 V128:$dst), (OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn)))]>;
-  def v2i32 : BaseSIMDTwoSameVectorTied<0, U, 0b10, opc, V64,
-                                    asm, ".2s", ".2s",
-    [(set (v2i32 V64:$dst), (OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn)))]>;
-  def v4i32 : BaseSIMDTwoSameVectorTied<1, U, 0b10, opc, V128,
-                                    asm, ".4s", ".4s",
-    [(set (v4i32 V128:$dst), (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn)))]>;
-  def v2i64 : BaseSIMDTwoSameVectorTied<1, U, 0b11, opc, V128,
-                                    asm, ".2d", ".2d",
-    [(set (v2i64 V128:$dst), (OpNode (v2i64 V128:$Rd), (v2i64 V128:$Rn)))]>;
-}
-
-multiclass SIMDTwoVectorBHSD<bit U, bits<5> opc, string asm,
-                             SDPatternOperator OpNode = null_frag> {
-  def v8i8  : BaseSIMDTwoSameVector<0, U, 0b00, opc, V64,
-                                asm, ".8b", ".8b",
-    [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>;
-  def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, V128,
-                                asm, ".16b", ".16b",
-    [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>;
-  def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, V64,
-                                asm, ".4h", ".4h",
-    [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn)))]>;
-  def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, V128,
-                                asm, ".8h", ".8h",
-    [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn)))]>;
-  def v2i32 : BaseSIMDTwoSameVector<0, U, 0b10, opc, V64,
-                                asm, ".2s", ".2s",
-    [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>;
-  def v4i32 : BaseSIMDTwoSameVector<1, U, 0b10, opc, V128,
-                                asm, ".4s", ".4s",
-    [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>;
-  def v2i64 : BaseSIMDTwoSameVector<1, U, 0b11, opc, V128,
-                                asm, ".2d", ".2d",
-    [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn)))]>;
-}
-
-
-// Supports only B element sizes.
-multiclass SIMDTwoVectorB<bit U, bits<2> size, bits<5> opc, string asm,
-                          SDPatternOperator OpNode> {
-  def v8i8  : BaseSIMDTwoSameVector<0, U, size, opc, V64,
-                                asm, ".8b", ".8b",
-                    [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn)))]>;
-  def v16i8 : BaseSIMDTwoSameVector<1, U, size, opc, V128,
-                                asm, ".16b", ".16b",
-                    [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>;
-
-}
-
-// Supports only B and H element sizes.
-multiclass SIMDTwoVectorBH<bit U, bits<5> opc, string asm,
-                                SDPatternOperator OpNode> {
-  def v8i8  : BaseSIMDTwoSameVector<0, U, 0b00, opc, V64,
-                                asm, ".8b", ".8b",
-                    [(set (v8i8 V64:$Rd), (OpNode V64:$Rn))]>;
-  def v16i8 : BaseSIMDTwoSameVector<1, U, 0b00, opc, V128,
-                                asm, ".16b", ".16b",
-                    [(set (v16i8 V128:$Rd), (OpNode V128:$Rn))]>;
-  def v4i16 : BaseSIMDTwoSameVector<0, U, 0b01, opc, V64,
-                                asm, ".4h", ".4h",
-                    [(set (v4i16 V64:$Rd), (OpNode V64:$Rn))]>;
-  def v8i16 : BaseSIMDTwoSameVector<1, U, 0b01, opc, V128,
-                                asm, ".8h", ".8h",
-                    [(set (v8i16 V128:$Rd), (OpNode V128:$Rn))]>;
-}
-
-// Supports only S and D element sizes, uses high bit of the size field
-// as an extra opcode bit.
-multiclass SIMDTwoVectorFP<bit U, bit S, bits<5> opc, string asm,
-                           SDPatternOperator OpNode> {
-  def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, V64,
-                                asm, ".2s", ".2s",
-                          [(set (v2f32 V64:$Rd), (OpNode (v2f32 V64:$Rn)))]>;
-  def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, V128,
-                                asm, ".4s", ".4s",
-                          [(set (v4f32 V128:$Rd), (OpNode (v4f32 V128:$Rn)))]>;
-  def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, V128,
-                                asm, ".2d", ".2d",
-                          [(set (v2f64 V128:$Rd), (OpNode (v2f64 V128:$Rn)))]>;
-}
-
-// Supports only S element size.
-multiclass SIMDTwoVectorS<bit U, bit S, bits<5> opc, string asm,
-                           SDPatternOperator OpNode> {
-  def v2i32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, V64,
-                                asm, ".2s", ".2s",
-                          [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>;
-  def v4i32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, V128,
-                                asm, ".4s", ".4s",
-                          [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>;
-}
-
-
-multiclass SIMDTwoVectorFPToInt<bit U, bit S, bits<5> opc, string asm,
-                           SDPatternOperator OpNode> {
-  def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, V64,
-                                asm, ".2s", ".2s",
-                          [(set (v2i32 V64:$Rd), (OpNode (v2f32 V64:$Rn)))]>;
-  def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, V128,
-                                asm, ".4s", ".4s",
-                          [(set (v4i32 V128:$Rd), (OpNode (v4f32 V128:$Rn)))]>;
-  def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, V128,
-                                asm, ".2d", ".2d",
-                          [(set (v2i64 V128:$Rd), (OpNode (v2f64 V128:$Rn)))]>;
-}
-
-multiclass SIMDTwoVectorIntToFP<bit U, bit S, bits<5> opc, string asm,
-                           SDPatternOperator OpNode> {
-  def v2f32 : BaseSIMDTwoSameVector<0, U, {S,0}, opc, V64,
-                                asm, ".2s", ".2s",
-                          [(set (v2f32 V64:$Rd), (OpNode (v2i32 V64:$Rn)))]>;
-  def v4f32 : BaseSIMDTwoSameVector<1, U, {S,0}, opc, V128,
-                                asm, ".4s", ".4s",
-                          [(set (v4f32 V128:$Rd), (OpNode (v4i32 V128:$Rn)))]>;
-  def v2f64 : BaseSIMDTwoSameVector<1, U, {S,1}, opc, V128,
-                                asm, ".2d", ".2d",
-                          [(set (v2f64 V128:$Rd), (OpNode (v2i64 V128:$Rn)))]>;
-}
-
-
-class BaseSIMDMixedTwoVector<bit Q, bit U, bits<2> size, bits<5> opcode,
-                           RegisterOperand inreg, RegisterOperand outreg,
-                           string asm, string outkind, string inkind,
-                           list<dag> pattern>
-  : I<(outs outreg:$Rd), (ins inreg:$Rn), asm,
-      "{\t$Rd" # outkind # ", $Rn" # inkind #
-      "|" # outkind # "\t$Rd, $Rn}", "", pattern>,
-    Sched<[WriteV]> {
-  bits<5> Rd;
-  bits<5> Rn;
-  let Inst{31}    = 0;
-  let Inst{30}    = Q;
-  let Inst{29}    = U;
-  let Inst{28-24} = 0b01110;
-  let Inst{23-22} = size;
-  let Inst{21-17} = 0b10000;
-  let Inst{16-12} = opcode;
-  let Inst{11-10} = 0b10;
-  let Inst{9-5}   = Rn;
-  let Inst{4-0}   = Rd;
-}
-
-class BaseSIMDMixedTwoVectorTied<bit Q, bit U, bits<2> size, bits<5> opcode,
-                           RegisterOperand inreg, RegisterOperand outreg,
-                           string asm, string outkind, string inkind,
-                           list<dag> pattern>
-  : I<(outs outreg:$dst), (ins outreg:$Rd, inreg:$Rn), asm,
-      "{\t$Rd" # outkind # ", $Rn" # inkind #
-      "|" # outkind # "\t$Rd, $Rn}", "$Rd = $dst", pattern>,
-    Sched<[WriteV]> {
-  bits<5> Rd;
-  bits<5> Rn;
-  let Inst{31}    = 0;
-  let Inst{30}    = Q;
-  let Inst{29}    = U;
-  let Inst{28-24} = 0b01110;
-  let Inst{23-22} = size;
-  let Inst{21-17} = 0b10000;
-  let Inst{16-12} = opcode;
-  let Inst{11-10} = 0b10;
-  let Inst{9-5}   = Rn;
-  let Inst{4-0}   = Rd;
-}
-
-multiclass SIMDMixedTwoVector<bit U, bits<5> opc, string asm,
-                              SDPatternOperator OpNode> {
-  def v8i8  : BaseSIMDMixedTwoVector<0, U, 0b00, opc, V128, V64,
-                                      asm, ".8b", ".8h",
-        [(set (v8i8 V64:$Rd), (OpNode (v8i16 V128:$Rn)))]>;
-  def v16i8 : BaseSIMDMixedTwoVectorTied<1, U, 0b00, opc, V128, V128,
-                                      asm#"2", ".16b", ".8h", []>;
-  def v4i16 : BaseSIMDMixedTwoVector<0, U, 0b01, opc, V128, V64,
-                                      asm, ".4h", ".4s",
-        [(set (v4i16 V64:$Rd), (OpNode (v4i32 V128:$Rn)))]>;
-  def v8i16 : BaseSIMDMixedTwoVectorTied<1, U, 0b01, opc, V128, V128,
-                                      asm#"2", ".8h", ".4s", []>;
-  def v2i32 : BaseSIMDMixedTwoVector<0, U, 0b10, opc, V128, V64,
-                                      asm, ".2s", ".2d",
-        [(set (v2i32 V64:$Rd), (OpNode (v2i64 V128:$Rn)))]>;
-  def v4i32 : BaseSIMDMixedTwoVectorTied<1, U, 0b10, opc, V128, V128,
-                                      asm#"2", ".4s", ".2d", []>;
-
-  def : Pat<(concat_vectors (v8i8 V64:$Rd), (OpNode (v8i16 V128:$Rn))),
-            (!cast<Instruction>(NAME # "v16i8")
-                (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>;
-  def : Pat<(concat_vectors (v4i16 V64:$Rd), (OpNode (v4i32 V128:$Rn))),
-            (!cast<Instruction>(NAME # "v8i16")
-                (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>;
-  def : Pat<(concat_vectors (v2i32 V64:$Rd), (OpNode (v2i64 V128:$Rn))),
-            (!cast<Instruction>(NAME # "v4i32")
-                (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>;
-}
-
-class BaseSIMDCmpTwoVector<bit Q, bit U, bits<2> size, bits<5> opcode,
-                           RegisterOperand regtype, string asm, string kind,
-                           ValueType dty, ValueType sty, SDNode OpNode>
-  : I<(outs regtype:$Rd), (ins regtype:$Rn), asm,
-      "{\t$Rd" # kind # ", $Rn" # kind # ", #0" #
-      "|" # kind # "\t$Rd, $Rn, #0}", "",
-      [(set (dty regtype:$Rd), (OpNode (sty regtype:$Rn)))]>,
-    Sched<[WriteV]> {
-  bits<5> Rd;
-  bits<5> Rn;
-  let Inst{31}    = 0;
-  let Inst{30}    = Q;
-  let Inst{29}    = U;
-  let Inst{28-24} = 0b01110;
-  let Inst{23-22} = size;
-  let Inst{21-17} = 0b10000;
-  let Inst{16-12} = opcode;
-  let Inst{11-10} = 0b10;
-  let Inst{9-5}   = Rn;
-  let Inst{4-0}   = Rd;
-}
-
-// Comparisons support all element sizes, except 1xD.
-multiclass SIMDCmpTwoVector<bit U, bits<5> opc, string asm,
-                            SDNode OpNode> {
-  def v8i8rz  : BaseSIMDCmpTwoVector<0, U, 0b00, opc, V64,
-                                     asm, ".8b",
-                                     v8i8, v8i8, OpNode>;
-  def v16i8rz : BaseSIMDCmpTwoVector<1, U, 0b00, opc, V128,
-                                     asm, ".16b",
-                                     v16i8, v16i8, OpNode>;
-  def v4i16rz : BaseSIMDCmpTwoVector<0, U, 0b01, opc, V64,
-                                     asm, ".4h",
-                                     v4i16, v4i16, OpNode>;
-  def v8i16rz : BaseSIMDCmpTwoVector<1, U, 0b01, opc, V128,
-                                     asm, ".8h",
-                                     v8i16, v8i16, OpNode>;
-  def v2i32rz : BaseSIMDCmpTwoVector<0, U, 0b10, opc, V64,
-                                     asm, ".2s",
-                                     v2i32, v2i32, OpNode>;
-  def v4i32rz : BaseSIMDCmpTwoVector<1, U, 0b10, opc, V128,
-                                     asm, ".4s",
-                                     v4i32, v4i32, OpNode>;
-  def v2i64rz : BaseSIMDCmpTwoVector<1, U, 0b11, opc, V128,
-                                     asm, ".2d",
-                                     v2i64, v2i64, OpNode>;
-}
-
-// FP Comparisons support only S and D element sizes.
-multiclass SIMDFPCmpTwoVector<bit U, bit S, bits<5> opc,
-                              string asm, SDNode OpNode> {
-  def v2i32rz : BaseSIMDCmpTwoVector<0, U, {S,0}, opc, V64,
-                                     asm, ".2s",
-                                     v2i32, v2f32, OpNode>;
-  def v4i32rz : BaseSIMDCmpTwoVector<1, U, {S,0}, opc, V128,
-                                     asm, ".4s",
-                                     v4i32, v4f32, OpNode>;
-  def v2i64rz : BaseSIMDCmpTwoVector<1, U, {S,1}, opc, V128,
-                                     asm, ".2d",
-                                     v2i64, v2f64, OpNode>;
-}
-
-let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
-class BaseSIMDFPCvtTwoVector<bit Q, bit U, bits<2> size, bits<5> opcode,
-                             RegisterOperand outtype, RegisterOperand intype,
-                             string asm, string VdTy, string VnTy,
-                             list<dag> pattern>
-  : I<(outs outtype:$Rd), (ins intype:$Rn), asm,
-      !strconcat("\t$Rd", VdTy, ", $Rn", VnTy), "", pattern>,
-    Sched<[WriteV]> {
-  bits<5> Rd;
-  bits<5> Rn;
-  let Inst{31}    = 0;
-  let Inst{30}    = Q;
-  let Inst{29}    = U;
-  let Inst{28-24} = 0b01110;
-  let Inst{23-22} = size;
-  let Inst{21-17} = 0b10000;
-  let Inst{16-12} = opcode;
-  let Inst{11-10} = 0b10;
-  let Inst{9-5}   = Rn;
-  let Inst{4-0}   = Rd;
-}
-
-class BaseSIMDFPCvtTwoVectorTied<bit Q, bit U, bits<2> size, bits<5> opcode,
-                             RegisterOperand outtype, RegisterOperand intype,
-                             string asm, string VdTy, string VnTy,
-                             list<dag> pattern>
-  : I<(outs outtype:$dst), (ins outtype:$Rd, intype:$Rn), asm,
-      !strconcat("\t$Rd", VdTy, ", $Rn", VnTy), "$Rd = $dst", pattern>,
-    Sched<[WriteV]> {
-  bits<5> Rd;
-  bits<5> Rn;
-  let Inst{31}    = 0;
-  let Inst{30}    = Q;
-  let Inst{29}    = U;
-  let Inst{28-24} = 0b01110;
-  let Inst{23-22} = size;
-  let Inst{21-17} = 0b10000;
-  let Inst{16-12} = opcode;
-  let Inst{11-10} = 0b10;
-  let Inst{9-5}   = Rn;
-  let Inst{4-0}   = Rd;
-}
-
-multiclass SIMDFPWidenTwoVector<bit U, bit S, bits<5> opc, string asm> {
-  def v4i16 : BaseSIMDFPCvtTwoVector<0, U, {S,0}, opc, V128, V64,
-                                    asm, ".4s", ".4h", []>;
-  def v8i16 : BaseSIMDFPCvtTwoVector<1, U, {S,0}, opc, V128, V128,
-                                    asm#"2", ".4s", ".8h", []>;
-  def v2i32 : BaseSIMDFPCvtTwoVector<0, U, {S,1}, opc, V128, V64,
-                                    asm, ".2d", ".2s", []>;
-  def v4i32 : BaseSIMDFPCvtTwoVector<1, U, {S,1}, opc, V128, V128,
-                                    asm#"2", ".2d", ".4s", []>;
-}
-
-multiclass SIMDFPNarrowTwoVector<bit U, bit S, bits<5> opc, string asm> {
-  def v4i16 : BaseSIMDFPCvtTwoVector<0, U, {S,0}, opc, V64, V128,
-                                    asm, ".4h", ".4s", []>;
-  def v8i16 : BaseSIMDFPCvtTwoVectorTied<1, U, {S,0}, opc, V128, V128,
-                                    asm#"2", ".8h", ".4s", []>;
-  def v2i32 : BaseSIMDFPCvtTwoVector<0, U, {S,1}, opc, V64, V128,
-                                    asm, ".2s", ".2d", []>;
-  def v4i32 : BaseSIMDFPCvtTwoVectorTied<1, U, {S,1}, opc, V128, V128,
-                                    asm#"2", ".4s", ".2d", []>;
-}
-
-multiclass SIMDFPInexactCvtTwoVector<bit U, bit S, bits<5> opc, string asm,
-                                     Intrinsic OpNode> {
-  def v2f32 : BaseSIMDFPCvtTwoVector<0, U, {S,1}, opc, V64, V128,
-                                     asm, ".2s", ".2d",
-                          [(set (v2f32 V64:$Rd), (OpNode (v2f64 V128:$Rn)))]>;
-  def v4f32 : BaseSIMDFPCvtTwoVectorTied<1, U, {S,1}, opc, V128, V128,
-                                    asm#"2", ".4s", ".2d", []>;
-
-  def : Pat<(concat_vectors (v2f32 V64:$Rd), (OpNode (v2f64 V128:$Rn))),
-            (!cast<Instruction>(NAME # "v4f32")
-                (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>;
-}
-
-//----------------------------------------------------------------------------
-// AdvSIMD three register different-size vector instructions.
-//----------------------------------------------------------------------------
-
-let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
-class BaseSIMDDifferentThreeVector<bit U, bits<3> size, bits<4> opcode,
-                      RegisterOperand outtype, RegisterOperand intype1,
-                      RegisterOperand intype2, string asm,
-                      string outkind, string inkind1, string inkind2,
-                      list<dag> pattern>
-  : I<(outs outtype:$Rd), (ins intype1:$Rn, intype2:$Rm), asm,
-      "{\t$Rd" # outkind # ", $Rn" # inkind1 # ", $Rm" # inkind2 #
-      "|" # outkind # "\t$Rd, $Rn, $Rm}", "", pattern>,
-    Sched<[WriteV]> {
-  bits<5> Rd;
-  bits<5> Rn;
-  bits<5> Rm;
-  let Inst{31}    = 0;
-  let Inst{30}    = size{0};
-  let Inst{29}    = U;
-  let Inst{28-24} = 0b01110;
-  let Inst{23-22} = size{2-1};
-  let Inst{21}    = 1;
-  let Inst{20-16} = Rm;
-  let Inst{15-12} = opcode;
-  let Inst{11-10} = 0b00;
-  let Inst{9-5}   = Rn;
-  let Inst{4-0}   = Rd;
-}
-
-let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
-class BaseSIMDDifferentThreeVectorTied<bit U, bits<3> size, bits<4> opcode,
-                      RegisterOperand outtype, RegisterOperand intype1,
-                      RegisterOperand intype2, string asm,
-                      string outkind, string inkind1, string inkind2,
-                      list<dag> pattern>
-  : I<(outs outtype:$dst), (ins outtype:$Rd, intype1:$Rn, intype2:$Rm), asm,
-      "{\t$Rd" # outkind # ", $Rn" # inkind1 # ", $Rm" # inkind2 #
-      "|" # outkind # "\t$Rd, $Rn, $Rm}", "$Rd = $dst", pattern>,
-    Sched<[WriteV]> {
-  bits<5> Rd;
-  bits<5> Rn;
-  bits<5> Rm;
-  let Inst{31}    = 0;
-  let Inst{30}    = size{0};
-  let Inst{29}    = U;
-  let Inst{28-24} = 0b01110;
-  let Inst{23-22} = size{2-1};
-  let Inst{21}    = 1;
-  let Inst{20-16} = Rm;
-  let Inst{15-12} = opcode;
-  let Inst{11-10} = 0b00;
-  let Inst{9-5}   = Rn;
-  let Inst{4-0}   = Rd;
-}
-
-// FIXME: TableGen doesn't know how to deal with expanded types that also
-//        change the element count (in this case, placing the results in
-//        the high elements of the result register rather than the low
-//        elements). Until that's fixed, we can't code-gen those.
-multiclass SIMDNarrowThreeVectorBHS<bit U, bits<4> opc, string asm,
-                                    Intrinsic IntOp> {
-  def v8i16_v8i8   : BaseSIMDDifferentThreeVector<U, 0b000, opc,
-                                                  V64, V128, V128,
-                                                  asm, ".8b", ".8h", ".8h",
-     [(set (v8i8 V64:$Rd), (IntOp (v8i16 V128:$Rn), (v8i16 V128:$Rm)))]>;
-  def v8i16_v16i8  : BaseSIMDDifferentThreeVectorTied<U, 0b001, opc,
-                                                  V128, V128, V128,
-                                                  asm#"2", ".16b", ".8h", ".8h",
-     []>;
-  def v4i32_v4i16  : BaseSIMDDifferentThreeVector<U, 0b010, opc,
-                                                  V64, V128, V128,
-                                                  asm, ".4h", ".4s", ".4s",
-     [(set (v4i16 V64:$Rd), (IntOp (v4i32 V128:$Rn), (v4i32 V128:$Rm)))]>;
-  def v4i32_v8i16  : BaseSIMDDifferentThreeVectorTied<U, 0b011, opc,
-                                                  V128, V128, V128,
-                                                  asm#"2", ".8h", ".4s", ".4s",
-     []>;
-  def v2i64_v2i32  : BaseSIMDDifferentThreeVector<U, 0b100, opc,
-                                                  V64, V128, V128,
-                                                  asm, ".2s", ".2d", ".2d",
-     [(set (v2i32 V64:$Rd), (IntOp (v2i64 V128:$Rn), (v2i64 V128:$Rm)))]>;
-  def v2i64_v4i32  : BaseSIMDDifferentThreeVectorTied<U, 0b101, opc,
-                                                  V128, V128, V128,
-                                                  asm#"2", ".4s", ".2d", ".2d",
-     []>;
-
-
-  // Patterns for the '2' variants involve INSERT_SUBREG, which you can't put in
-  // a version attached to an instruction.
-  def : Pat<(concat_vectors (v8i8 V64:$Rd), (IntOp (v8i16 V128:$Rn),
-                                                   (v8i16 V128:$Rm))),
-            (!cast<Instruction>(NAME # "v8i16_v16i8")
-                (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub),
-                V128:$Rn, V128:$Rm)>;
-  def : Pat<(concat_vectors (v4i16 V64:$Rd), (IntOp (v4i32 V128:$Rn),
-                                                    (v4i32 V128:$Rm))),
-            (!cast<Instruction>(NAME # "v4i32_v8i16")
-                (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub),
-                V128:$Rn, V128:$Rm)>;
-  def : Pat<(concat_vectors (v2i32 V64:$Rd), (IntOp (v2i64 V128:$Rn),
-                                                    (v2i64 V128:$Rm))),
-            (!cast<Instruction>(NAME # "v2i64_v4i32")
-                (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub),
-                V128:$Rn, V128:$Rm)>;
-}
-
-multiclass SIMDDifferentThreeVectorBD<bit U, bits<4> opc, string asm,
-                                      Intrinsic IntOp> {
-  def v8i8   : BaseSIMDDifferentThreeVector<U, 0b000, opc,
-                                            V128, V64, V64,
-                                            asm, ".8h", ".8b", ".8b",
-      [(set (v8i16 V128:$Rd), (IntOp (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>;
-  def v16i8  : BaseSIMDDifferentThreeVector<U, 0b001, opc,
-                                            V128, V128, V128,
-                                            asm#"2", ".8h", ".16b", ".16b", []>;
-  def v1i64  : BaseSIMDDifferentThreeVector<U, 0b110, opc,
-                                            V128, V64, V64,
-                                            asm, ".1q", ".1d", ".1d", []>;
-  def v2i64  : BaseSIMDDifferentThreeVector<U, 0b111, opc,
-                                            V128, V128, V128,
-                                            asm#"2", ".1q", ".2d", ".2d", []>;
-
-  def : Pat<(v8i16 (IntOp (v8i8 (extract_high_v16i8 V128:$Rn)),
-                          (v8i8 (extract_high_v16i8 V128:$Rm)))),
-      (!cast<Instruction>(NAME#"v16i8") V128:$Rn, V128:$Rm)>;
-}
-
-multiclass SIMDLongThreeVectorHS<bit U, bits<4> opc, string asm,
-                                 SDPatternOperator OpNode> {
-  def v4i16_v4i32  : BaseSIMDDifferentThreeVector<U, 0b010, opc,
-                                                  V128, V64, V64,
-                                                  asm, ".4s", ".4h", ".4h",
-      [(set (v4i32 V128:$Rd), (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>;
-  def v8i16_v4i32  : BaseSIMDDifferentThreeVector<U, 0b011, opc,
-                                                  V128, V128, V128,
-                                                  asm#"2", ".4s", ".8h", ".8h",
-      [(set (v4i32 V128:$Rd), (OpNode (extract_high_v8i16 V128:$Rn),
-                                      (extract_high_v8i16 V128:$Rm)))]>;
-  def v2i32_v2i64  : BaseSIMDDifferentThreeVector<U, 0b100, opc,
-                                                  V128, V64, V64,
-                                                  asm, ".2d", ".2s", ".2s",
-      [(set (v2i64 V128:$Rd), (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>;
-  def v4i32_v2i64  : BaseSIMDDifferentThreeVector<U, 0b101, opc,
-                                                  V128, V128, V128,
-                                                  asm#"2", ".2d", ".4s", ".4s",
-      [(set (v2i64 V128:$Rd), (OpNode (extract_high_v4i32 V128:$Rn),
-                                      (extract_high_v4i32 V128:$Rm)))]>;
-}
-
-multiclass SIMDLongThreeVectorBHSabdl<bit U, bits<4> opc, string asm,
-                                  SDPatternOperator OpNode = null_frag> {
-  def v8i8_v8i16   : BaseSIMDDifferentThreeVector<U, 0b000, opc,
-                                                  V128, V64, V64,
-                                                  asm, ".8h", ".8b", ".8b",
-      [(set (v8i16 V128:$Rd),
-            (zext (v8i8 (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm)))))]>;
-  def v16i8_v8i16  : BaseSIMDDifferentThreeVector<U, 0b001, opc,
-                                                 V128, V128, V128,
-                                                 asm#"2", ".8h", ".16b", ".16b",
-      [(set (v8i16 V128:$Rd),
-            (zext (v8i8 (OpNode (extract_high_v16i8 V128:$Rn),
-                                (extract_high_v16i8 V128:$Rm)))))]>;
-  def v4i16_v4i32  : BaseSIMDDifferentThreeVector<U, 0b010, opc,
-                                                  V128, V64, V64,
-                                                  asm, ".4s", ".4h", ".4h",
-      [(set (v4i32 V128:$Rd),
-            (zext (v4i16 (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm)))))]>;
-  def v8i16_v4i32  : BaseSIMDDifferentThreeVector<U, 0b011, opc,
-                                                  V128, V128, V128,
-                                                  asm#"2", ".4s", ".8h", ".8h",
-      [(set (v4i32 V128:$Rd),
-            (zext (v4i16 (OpNode (extract_high_v8i16 V128:$Rn),
-                                  (extract_high_v8i16 V128:$Rm)))))]>;
-  def v2i32_v2i64  : BaseSIMDDifferentThreeVector<U, 0b100, opc,
-                                                  V128, V64, V64,
-                                                  asm, ".2d", ".2s", ".2s",
-      [(set (v2i64 V128:$Rd),
-            (zext (v2i32 (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm)))))]>;
-  def v4i32_v2i64  : BaseSIMDDifferentThreeVector<U, 0b101, opc,
-                                                  V128, V128, V128,
-                                                  asm#"2", ".2d", ".4s", ".4s",
-      [(set (v2i64 V128:$Rd),
-            (zext (v2i32 (OpNode (extract_high_v4i32 V128:$Rn),
-                                 (extract_high_v4i32 V128:$Rm)))))]>;
-}
-
-multiclass SIMDLongThreeVectorTiedBHSabal<bit U, bits<4> opc,
-                                          string asm,
-                                          SDPatternOperator OpNode> {
-  def v8i8_v8i16   : BaseSIMDDifferentThreeVectorTied<U, 0b000, opc,
-                                                  V128, V64, V64,
-                                                  asm, ".8h", ".8b", ".8b",
-    [(set (v8i16 V128:$dst),
-          (add (v8i16 V128:$Rd),
-               (zext (v8i8 (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm))))))]>;
-  def v16i8_v8i16  : BaseSIMDDifferentThreeVectorTied<U, 0b001, opc,
-                                                 V128, V128, V128,
-                                                 asm#"2", ".8h", ".16b", ".16b",
-    [(set (v8i16 V128:$dst),
-          (add (v8i16 V128:$Rd),
-               (zext (v8i8 (OpNode (extract_high_v16i8 V128:$Rn),
-                                   (extract_high_v16i8 V128:$Rm))))))]>;
-  def v4i16_v4i32  : BaseSIMDDifferentThreeVectorTied<U, 0b010, opc,
-                                                  V128, V64, V64,
-                                                  asm, ".4s", ".4h", ".4h",
-    [(set (v4i32 V128:$dst),
-          (add (v4i32 V128:$Rd),
-               (zext (v4i16 (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm))))))]>;
-  def v8i16_v4i32  : BaseSIMDDifferentThreeVectorTied<U, 0b011, opc,
-                                                  V128, V128, V128,
-                                                  asm#"2", ".4s", ".8h", ".8h",
-    [(set (v4i32 V128:$dst),
-          (add (v4i32 V128:$Rd),
-               (zext (v4i16 (OpNode (extract_high_v8i16 V128:$Rn),
-                                    (extract_high_v8i16 V128:$Rm))))))]>;
-  def v2i32_v2i64  : BaseSIMDDifferentThreeVectorTied<U, 0b100, opc,
-                                                  V128, V64, V64,
-                                                  asm, ".2d", ".2s", ".2s",
-    [(set (v2i64 V128:$dst),
-          (add (v2i64 V128:$Rd),
-               (zext (v2i32 (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm))))))]>;
-  def v4i32_v2i64  : BaseSIMDDifferentThreeVectorTied<U, 0b101, opc,
-                                                  V128, V128, V128,
-                                                  asm#"2", ".2d", ".4s", ".4s",
-    [(set (v2i64 V128:$dst),
-          (add (v2i64 V128:$Rd),
-               (zext (v2i32 (OpNode (extract_high_v4i32 V128:$Rn),
-                                    (extract_high_v4i32 V128:$Rm))))))]>;
-}
-
-multiclass SIMDLongThreeVectorBHS<bit U, bits<4> opc, string asm,
-                                  SDPatternOperator OpNode = null_frag> {
-  def v8i8_v8i16   : BaseSIMDDifferentThreeVector<U, 0b000, opc,
-                                                  V128, V64, V64,
-                                                  asm, ".8h", ".8b", ".8b",
-      [(set (v8i16 V128:$Rd), (OpNode (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>;
-  def v16i8_v8i16  : BaseSIMDDifferentThreeVector<U, 0b001, opc,
-                                                 V128, V128, V128,
-                                                 asm#"2", ".8h", ".16b", ".16b",
-      [(set (v8i16 V128:$Rd), (OpNode (extract_high_v16i8 V128:$Rn),
-                                      (extract_high_v16i8 V128:$Rm)))]>;
-  def v4i16_v4i32  : BaseSIMDDifferentThreeVector<U, 0b010, opc,
-                                                  V128, V64, V64,
-                                                  asm, ".4s", ".4h", ".4h",
-      [(set (v4i32 V128:$Rd), (OpNode (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>;
-  def v8i16_v4i32  : BaseSIMDDifferentThreeVector<U, 0b011, opc,
-                                                  V128, V128, V128,
-                                                  asm#"2", ".4s", ".8h", ".8h",
-      [(set (v4i32 V128:$Rd), (OpNode (extract_high_v8i16 V128:$Rn),
-                                      (extract_high_v8i16 V128:$Rm)))]>;
-  def v2i32_v2i64  : BaseSIMDDifferentThreeVector<U, 0b100, opc,
-                                                  V128, V64, V64,
-                                                  asm, ".2d", ".2s", ".2s",
-      [(set (v2i64 V128:$Rd), (OpNode (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>;
-  def v4i32_v2i64  : BaseSIMDDifferentThreeVector<U, 0b101, opc,
-                                                  V128, V128, V128,
-                                                  asm#"2", ".2d", ".4s", ".4s",
-      [(set (v2i64 V128:$Rd), (OpNode (extract_high_v4i32 V128:$Rn),
-                                      (extract_high_v4i32 V128:$Rm)))]>;
-}
-
-multiclass SIMDLongThreeVectorTiedBHS<bit U, bits<4> opc,
-                                      string asm,
-                                      SDPatternOperator OpNode> {
-  def v8i8_v8i16   : BaseSIMDDifferentThreeVectorTied<U, 0b000, opc,
-                                                  V128, V64, V64,
-                                                  asm, ".8h", ".8b", ".8b",
-    [(set (v8i16 V128:$dst),
-          (OpNode (v8i16 V128:$Rd), (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>;
-  def v16i8_v8i16  : BaseSIMDDifferentThreeVectorTied<U, 0b001, opc,
-                                                 V128, V128, V128,
-                                                 asm#"2", ".8h", ".16b", ".16b",
-    [(set (v8i16 V128:$dst),
-          (OpNode (v8i16 V128:$Rd),
-                  (extract_high_v16i8 V128:$Rn),
-                  (extract_high_v16i8 V128:$Rm)))]>;
-  def v4i16_v4i32  : BaseSIMDDifferentThreeVectorTied<U, 0b010, opc,
-                                                  V128, V64, V64,
-                                                  asm, ".4s", ".4h", ".4h",
-    [(set (v4i32 V128:$dst),
-          (OpNode (v4i32 V128:$Rd), (v4i16 V64:$Rn), (v4i16 V64:$Rm)))]>;
-  def v8i16_v4i32  : BaseSIMDDifferentThreeVectorTied<U, 0b011, opc,
-                                                  V128, V128, V128,
-                                                  asm#"2", ".4s", ".8h", ".8h",
-    [(set (v4i32 V128:$dst),
-          (OpNode (v4i32 V128:$Rd),
-                  (extract_high_v8i16 V128:$Rn),
-                  (extract_high_v8i16 V128:$Rm)))]>;
-  def v2i32_v2i64  : BaseSIMDDifferentThreeVectorTied<U, 0b100, opc,
-                                                  V128, V64, V64,
-                                                  asm, ".2d", ".2s", ".2s",
-    [(set (v2i64 V128:$dst),
-          (OpNode (v2i64 V128:$Rd), (v2i32 V64:$Rn), (v2i32 V64:$Rm)))]>;
-  def v4i32_v2i64  : BaseSIMDDifferentThreeVectorTied<U, 0b101, opc,
-                                                  V128, V128, V128,
-                                                  asm#"2", ".2d", ".4s", ".4s",
-    [(set (v2i64 V128:$dst),
-          (OpNode (v2i64 V128:$Rd),
-                  (extract_high_v4i32 V128:$Rn),
-                  (extract_high_v4i32 V128:$Rm)))]>;
-}
-
-multiclass SIMDLongThreeVectorSQDMLXTiedHS<bit U, bits<4> opc, string asm,
-                                           SDPatternOperator Accum> {
-  def v4i16_v4i32  : BaseSIMDDifferentThreeVectorTied<U, 0b010, opc,
-                                                  V128, V64, V64,
-                                                  asm, ".4s", ".4h", ".4h",
-    [(set (v4i32 V128:$dst),
-          (Accum (v4i32 V128:$Rd),
-                 (v4i32 (int_arm64_neon_sqdmull (v4i16 V64:$Rn),
-                                                (v4i16 V64:$Rm)))))]>;
-  def v8i16_v4i32  : BaseSIMDDifferentThreeVectorTied<U, 0b011, opc,
-                                                  V128, V128, V128,
-                                                  asm#"2", ".4s", ".8h", ".8h",
-    [(set (v4i32 V128:$dst),
-          (Accum (v4i32 V128:$Rd),
-                 (v4i32 (int_arm64_neon_sqdmull (extract_high_v8i16 V128:$Rn),
-                                            (extract_high_v8i16 V128:$Rm)))))]>;
-  def v2i32_v2i64  : BaseSIMDDifferentThreeVectorTied<U, 0b100, opc,
-                                                  V128, V64, V64,
-                                                  asm, ".2d", ".2s", ".2s",
-    [(set (v2i64 V128:$dst),
-          (Accum (v2i64 V128:$Rd),
-                 (v2i64 (int_arm64_neon_sqdmull (v2i32 V64:$Rn),
-                                                (v2i32 V64:$Rm)))))]>;
-  def v4i32_v2i64  : BaseSIMDDifferentThreeVectorTied<U, 0b101, opc,
-                                                  V128, V128, V128,
-                                                  asm#"2", ".2d", ".4s", ".4s",
-    [(set (v2i64 V128:$dst),
-          (Accum (v2i64 V128:$Rd),
-                 (v2i64 (int_arm64_neon_sqdmull (extract_high_v4i32 V128:$Rn),
-                                            (extract_high_v4i32 V128:$Rm)))))]>;
-}
-
-multiclass SIMDWideThreeVectorBHS<bit U, bits<4> opc, string asm,
-                                  SDPatternOperator OpNode> {
-  def v8i8_v8i16   : BaseSIMDDifferentThreeVector<U, 0b000, opc,
-                                                  V128, V128, V64,
-                                                  asm, ".8h", ".8h", ".8b",
-       [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn), (v8i8 V64:$Rm)))]>;
-  def v16i8_v8i16  : BaseSIMDDifferentThreeVector<U, 0b001, opc,
-                                                  V128, V128, V128,
-                                                  asm#"2", ".8h", ".8h", ".16b",
-       [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn),
-                                       (extract_high_v16i8 V128:$Rm)))]>;
-  def v4i16_v4i32  : BaseSIMDDifferentThreeVector<U, 0b010, opc,
-                                                  V128, V128, V64,
-                                                  asm, ".4s", ".4s", ".4h",
-       [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn), (v4i16 V64:$Rm)))]>;
-  def v8i16_v4i32  : BaseSIMDDifferentThreeVector<U, 0b011, opc,
-                                                  V128, V128, V128,
-                                                  asm#"2", ".4s", ".4s", ".8h",
-       [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn),
-                                       (extract_high_v8i16 V128:$Rm)))]>;
-  def v2i32_v2i64  : BaseSIMDDifferentThreeVector<U, 0b100, opc,
-                                                  V128, V128, V64,
-                                                  asm, ".2d", ".2d", ".2s",
-       [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn), (v2i32 V64:$Rm)))]>;
-  def v4i32_v2i64  : BaseSIMDDifferentThreeVector<U, 0b101, opc,
-                                                  V128, V128, V128,
-                                                  asm#"2", ".2d", ".2d", ".4s",
-       [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn),
-                                       (extract_high_v4i32 V128:$Rm)))]>;
-}
-
-//----------------------------------------------------------------------------
-// AdvSIMD bitwise extract from vector
-//----------------------------------------------------------------------------
-
-class BaseSIMDBitwiseExtract<bit size, RegisterOperand regtype, ValueType vty,
-                             string asm, string kind>
-  : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm, i32imm:$imm), asm,
-      "{\t$Rd" # kind # ", $Rn" # kind # ", $Rm" # kind # ", $imm" #
-      "|" # kind # "\t$Rd, $Rn, $Rm, $imm}", "",
-      [(set (vty regtype:$Rd),
-            (ARM64ext regtype:$Rn, regtype:$Rm, (i32 imm:$imm)))]>,
-    Sched<[WriteV]> {
-  bits<5> Rd;
-  bits<5> Rn;
-  bits<5> Rm;
-  bits<4> imm;
-  let Inst{31}    = 0;
-  let Inst{30}    = size;
-  let Inst{29-21} = 0b101110000;
-  let Inst{20-16} = Rm;
-  let Inst{15}    = 0;
-  let Inst{14-11} = imm;
-  let Inst{10}    = 0;
-  let Inst{9-5}   = Rn;
-  let Inst{4-0}   = Rd;
-}
-
-
-multiclass SIMDBitwiseExtract<string asm> {
-  def v8i8  : BaseSIMDBitwiseExtract<0, V64, v8i8, asm, ".8b">;
-  def v16i8 : BaseSIMDBitwiseExtract<1, V128, v16i8, asm, ".16b">;
-}
-
-//----------------------------------------------------------------------------
-// AdvSIMD zip vector
-//----------------------------------------------------------------------------
-
-class BaseSIMDZipVector<bits<3> size, bits<3> opc, RegisterOperand regtype,
-                        string asm, string kind, SDNode OpNode, ValueType valty>
-  : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), asm,
-      "{\t$Rd" # kind # ", $Rn" # kind # ", $Rm" # kind #
-      "|" # kind # "\t$Rd, $Rn, $Rm}", "",
-      [(set (valty regtype:$Rd), (OpNode regtype:$Rn, regtype:$Rm))]>,
-    Sched<[WriteV]> {
-  bits<5> Rd;
-  bits<5> Rn;
-  bits<5> Rm;
-  let Inst{31}    = 0;
-  let Inst{30}    = size{0};
-  let Inst{29-24} = 0b001110;
-  let Inst{23-22} = size{2-1};
-  let Inst{21}    = 0;
-  let Inst{20-16} = Rm;
-  let Inst{15}    = 0;
-  let Inst{14-12} = opc;
-  let Inst{11-10} = 0b10;
-  let Inst{9-5}   = Rn;
-  let Inst{4-0}   = Rd;
-}
-
-multiclass SIMDZipVector<bits<3>opc, string asm,
-                         SDNode OpNode> {
-  def v8i8   : BaseSIMDZipVector<0b000, opc, V64,
-      asm, ".8b", OpNode, v8i8>;
-  def v16i8  : BaseSIMDZipVector<0b001, opc, V128,
-      asm, ".16b", OpNode, v16i8>;
-  def v4i16  : BaseSIMDZipVector<0b010, opc, V64,
-      asm, ".4h", OpNode, v4i16>;
-  def v8i16  : BaseSIMDZipVector<0b011, opc, V128,
-      asm, ".8h", OpNode, v8i16>;
-  def v2i32  : BaseSIMDZipVector<0b100, opc, V64,
-      asm, ".2s", OpNode, v2i32>;
-  def v4i32  : BaseSIMDZipVector<0b101, opc, V128,
-      asm, ".4s", OpNode, v4i32>;
-  def v2i64  : BaseSIMDZipVector<0b111, opc, V128,
-      asm, ".2d", OpNode, v2i64>;
-
-  def : Pat<(v2f32 (OpNode V64:$Rn, V64:$Rm)),
-        (!cast<Instruction>(NAME#"v2i32") V64:$Rn, V64:$Rm)>;
-  def : Pat<(v4f32 (OpNode V128:$Rn, V128:$Rm)),
-        (!cast<Instruction>(NAME#"v4i32") V128:$Rn, V128:$Rm)>;
-  def : Pat<(v2f64 (OpNode V128:$Rn, V128:$Rm)),
-        (!cast<Instruction>(NAME#"v2i64") V128:$Rn, V128:$Rm)>;
-}
-
-//----------------------------------------------------------------------------
-// AdvSIMD three register scalar instructions
-//----------------------------------------------------------------------------
-
-let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in
-class BaseSIMDThreeScalar<bit U, bits<2> size, bits<5> opcode,
-                        RegisterClass regtype, string asm,
-                        list<dag> pattern>
-  : I<(outs regtype:$Rd), (ins regtype:$Rn, regtype:$Rm), asm,
-      "\t$Rd, $Rn, $Rm", "", pattern>,
-    Sched<[WriteV]> {
-  bits<5> Rd;
-  bits<5> Rn;
-  bits<5> Rm;
-  let Inst{31-30} = 0b01;
-  let Inst{29}    = U;
-  let Inst{28-24} = 0b11110;
-  let Inst{23-22} = size;
-  let Inst{21}    = 1;
-  let Inst{20-16} = Rm;
-  let Inst{15-11} = opcode;
-  let Inst{10}    = 1;
-  let Inst{9-5}   = Rn;
-  let Inst{4-0}   = Rd;
-}
-
-multiclass SIMDThreeScalarD<bit U, bits<5> opc, string asm,
-                            SDPatternOperator OpNode> {
-  def v1i64  : BaseSIMDThreeScalar<U, 0b11, opc, FPR64, asm,
-    [(set (v1i64 FPR64:$Rd), (OpNode (v1i64 FPR64:$Rn), (v1i64 FPR64:$Rm)))]>;
-}
-
-multiclass SIMDThreeScalarBHSD<bit U, bits<5> opc, string asm,
-                               SDPatternOperator OpNode> {
-  def v1i64  : BaseSIMDThreeScalar<U, 0b11, opc, FPR64, asm,
-    [(set (v1i64 FPR64:$Rd), (OpNode (v1i64 FPR64:$Rn), (v1i64 FPR64:$Rm)))]>;
-  def v1i32  : BaseSIMDThreeScalar<U, 0b10, opc, FPR32, asm, []>;
-  def v1i16  : BaseSIMDThreeScalar<U, 0b01, opc, FPR16, asm, []>;
-  def v1i8   : BaseSIMDThreeScalar<U, 0b00, opc, FPR8 , asm, []>;
-
-  def : Pat<(i64 (OpNode (i64 FPR64:$Rn), (i64 FPR64:$Rm))),
-            (!cast<Instruction>(NAME#"v1i64") FPR64:$Rn, FPR64:$Rm)>;
-  def : Pat<(i32 (OpNode (i32 FPR32:$Rn), (i32 FPR32:$Rm))),
-            (!cast<Instruction>(NAME#"v1i32") FPR32:$Rn, FPR32:$Rm)>;
-}
-
-multiclass SIMDThreeScalarHS<bit U, bits<5> opc, string asm,
-                             SDPatternOperator OpNode> {
-  def v1i32  : BaseSIMDThreeScalar<U, 0b10, opc, FPR32, asm,
-                             [(set FPR32:$Rd, (OpNode FPR32:$Rn, FPR32:$Rm))]>;
-  def v1i16  : BaseSIMDThreeScalar<U, 0b01, opc, FPR16, asm, []>;
-}
-
-multiclass SIMDThreeScalarSD<bit U, bit S, bits<5> opc, string asm,
-                             SDPatternOperator OpNode = null_frag> {
-  let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
-    def #NAME#64 : BaseSIMDThreeScalar<U, {S,1}, opc, FPR64, asm,
-      [(set (f64 FPR64:$Rd), (OpNode (f64 FPR64:$Rn), (f64 FPR64:$Rm)))]>;
-    def #NAME#32 : BaseSIMDThreeScalar<U, {S,0}, opc, FPR32, asm,
-      [(set FPR32:$Rd, (OpNode FPR32:$Rn, FPR32:$Rm))]>;
-  }
-
-  def : Pat<(v1f64 (OpNode (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
-            (!cast<Instruction>(NAME # "64") FPR64:$Rn, FPR64:$Rm)>;
-}
-
-multiclass SIMDThreeScalarFPCmp<bit U, bit S, bits<5> opc, string asm,
-                                SDPatternOperator OpNode = null_frag> {
-  let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
-    def #NAME#64 : BaseSIMDThreeScalar<U, {S,1}, opc, FPR64, asm,
-      [(set (i64 FPR64:$Rd), (OpNode (f64 FPR64:$Rn), (f64 FPR64:$Rm)))]>;
-    def #NAME#32 : BaseSIMDThreeScalar<U, {S,0}, opc, FPR32, asm,
-      [(set (i32 FPR32:$Rd), (OpNode (f32 FPR32:$Rn), (f32 FPR32:$Rm)))]>;
-  }
-
-  def : Pat<(v1i64 (OpNode (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
-            (!cast<Instruction>(NAME # "64") FPR64:$Rn, FPR64:$Rm)>;
-}
-
-class BaseSIMDThreeScalarMixed<bit U, bits<2> size, bits<5> opcode,
-              dag oops, dag iops, string asm, string cstr, list<dag> pat>
-  : I<oops, iops, asm,
-      "\t$Rd, $Rn, $Rm", cstr, pat>,
-    Sched<[WriteV]> {
-  bits<5> Rd;
-  bits<5> Rn;
-  bits<5> Rm;
-  let Inst{31-30} = 0b01;
-  let Inst{29}    = U;
-  let Inst{28-24} = 0b11110;
-  let Inst{23-22} = size;
-  let Inst{21}    = 1;
-  let Inst{20-16} = Rm;
-  let Inst{15-11} = opcode;
-  let Inst{10}    = 0;
-  let Inst{9-5}   = Rn;
-  let Inst{4-0}   = Rd;
-}
-
-let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
-multiclass SIMDThreeScalarMixedHS<bit U, bits<5> opc, string asm,
-                                  SDPatternOperator OpNode = null_frag> {
-  def i16  : BaseSIMDThreeScalarMixed<U, 0b01, opc,
-                                      (outs FPR32:$Rd),
-                                      (ins FPR16:$Rn, FPR16:$Rm), asm, "", []>;
-  def i32  : BaseSIMDThreeScalarMixed<U, 0b10, opc,
-                                      (outs FPR64:$Rd),
-                                      (ins FPR32:$Rn, FPR32:$Rm), asm, "",
-            [(set (i64 FPR64:$Rd), (OpNode (i32 FPR32:$Rn), (i32 FPR32:$Rm)))]>;
-}
-
-let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
-multiclass SIMDThreeScalarMixedTiedHS<bit U, bits<5> opc, string asm,
-                                  SDPatternOperator OpNode = null_frag> {
-  def i16  : BaseSIMDThreeScalarMixed<U, 0b01, opc,
-                                      (outs FPR32:$dst),
-                                      (ins FPR32:$Rd, FPR16:$Rn, FPR16:$Rm),
-                                      asm, "$Rd = $dst", []>;
-  def i32  : BaseSIMDThreeScalarMixed<U, 0b10, opc,
-                                      (outs FPR64:$dst),
-                                      (ins FPR64:$Rd, FPR32:$Rn, FPR32:$Rm),
-                                      asm, "$Rd = $dst",
-            [(set (i64 FPR64:$dst),
-                  (OpNode (i64 FPR64:$Rd), (i32 FPR32:$Rn), (i32 FPR32:$Rm)))]>;
-}
-
-//----------------------------------------------------------------------------
-// AdvSIMD two register scalar instructions
-//----------------------------------------------------------------------------
-
-let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
-class BaseSIMDTwoScalar<bit U, bits<2> size, bits<5> opcode,
-                        RegisterClass regtype, RegisterClass regtype2,
-                        string asm, list<dag> pat>
-  : I<(outs regtype:$Rd), (ins regtype2:$Rn), asm,
-      "\t$Rd, $Rn", "", pat>,
-    Sched<[WriteV]> {
-  bits<5> Rd;
-  bits<5> Rn;
-  let Inst{31-30} = 0b01;
-  let Inst{29}    = U;
-  let Inst{28-24} = 0b11110;
-  let Inst{23-22} = size;
-  let Inst{21-17} = 0b10000;
-  let Inst{16-12} = opcode;
-  let Inst{11-10} = 0b10;
-  let Inst{9-5}   = Rn;
-  let Inst{4-0}   = Rd;
-}
-
-let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
-class BaseSIMDTwoScalarTied<bit U, bits<2> size, bits<5> opcode,
-                        RegisterClass regtype, RegisterClass regtype2,
-                        string asm, list<dag> pat>
-  : I<(outs regtype:$dst), (ins regtype:$Rd, regtype2:$Rn), asm,
-      "\t$Rd, $Rn", "$Rd = $dst", pat>,
-    Sched<[WriteV]> {
-  bits<5> Rd;
-  bits<5> Rn;
-  let Inst{31-30} = 0b01;
-  let Inst{29}    = U;
-  let Inst{28-24} = 0b11110;
-  let Inst{23-22} = size;
-  let Inst{21-17} = 0b10000;
-  let Inst{16-12} = opcode;
-  let Inst{11-10} = 0b10;
-  let Inst{9-5}   = Rn;
-  let Inst{4-0}   = Rd;
-}
-
-
-let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
-class BaseSIMDCmpTwoScalar<bit U, bits<2> size, bits<5> opcode,
-                        RegisterClass regtype, string asm>
-  : I<(outs regtype:$Rd), (ins regtype:$Rn), asm,
-      "\t$Rd, $Rn, #0", "", []>,
-    Sched<[WriteV]> {
-  bits<5> Rd;
-  bits<5> Rn;
-  let Inst{31-30} = 0b01;
-  let Inst{29}    = U;
-  let Inst{28-24} = 0b11110;
-  let Inst{23-22} = size;
-  let Inst{21-17} = 0b10000;
-  let Inst{16-12} = opcode;
-  let Inst{11-10} = 0b10;
-  let Inst{9-5}   = Rn;
-  let Inst{4-0}   = Rd;
-}
-
-class SIMDInexactCvtTwoScalar<bits<5> opcode, string asm>
-  : I<(outs FPR32:$Rd), (ins FPR64:$Rn), asm, "\t$Rd, $Rn", "",
-     [(set (f32 FPR32:$Rd), (int_arm64_sisd_fcvtxn (f64 FPR64:$Rn)))]>,
-    Sched<[WriteV]> {
-  bits<5> Rd;
-  bits<5> Rn;
-  let Inst{31-17} = 0b011111100110000;
-  let Inst{16-12} = opcode;
-  let Inst{11-10} = 0b10;
-  let Inst{9-5}   = Rn;
-  let Inst{4-0}   = Rd;
-}
-
-multiclass SIMDCmpTwoScalarD<bit U, bits<5> opc, string asm,
-                             SDPatternOperator OpNode> {
-  def v1i64rz  : BaseSIMDCmpTwoScalar<U, 0b11, opc, FPR64, asm>;
-
-  def : Pat<(v1i64 (OpNode FPR64:$Rn)),
-            (!cast<Instruction>(NAME # v1i64rz) FPR64:$Rn)>;
-}
-
-multiclass SIMDCmpTwoScalarSD<bit U, bit S, bits<5> opc, string asm,
-                              SDPatternOperator OpNode> {
-  def v1i64rz  : BaseSIMDCmpTwoScalar<U, {S,1}, opc, FPR64, asm>;
-  def v1i32rz  : BaseSIMDCmpTwoScalar<U, {S,0}, opc, FPR32, asm>;
-
-  def : Pat<(v1i64 (OpNode (v1f64 FPR64:$Rn))),
-            (!cast<Instruction>(NAME # v1i64rz) FPR64:$Rn)>;
-}
-
-multiclass SIMDTwoScalarD<bit U, bits<5> opc, string asm,
-                          SDPatternOperator OpNode = null_frag> {
-  def v1i64       : BaseSIMDTwoScalar<U, 0b11, opc, FPR64, FPR64, asm,
-    [(set (v1i64 FPR64:$Rd), (OpNode (v1i64 FPR64:$Rn)))]>;
-
-  def : Pat<(i64 (OpNode (i64 FPR64:$Rn))),
-            (!cast<Instruction>(NAME # "v1i64") FPR64:$Rn)>;
-}
-
-multiclass SIMDTwoScalarSD<bit U, bit S, bits<5> opc, string asm> {
-  def v1i64       : BaseSIMDTwoScalar<U, {S,1}, opc, FPR64, FPR64, asm,[]>;
-  def v1i32       : BaseSIMDTwoScalar<U, {S,0}, opc, FPR32, FPR32, asm,[]>;
-}
-
-multiclass SIMDTwoScalarCVTSD<bit U, bit S, bits<5> opc, string asm,
-                              SDPatternOperator OpNode> {
-  def v1i64 : BaseSIMDTwoScalar<U, {S,1}, opc, FPR64, FPR64, asm,
-                                [(set FPR64:$Rd, (OpNode (f64 FPR64:$Rn)))]>;
-  def v1i32 : BaseSIMDTwoScalar<U, {S,0}, opc, FPR32, FPR32, asm,
-                                [(set FPR32:$Rd, (OpNode (f32 FPR32:$Rn)))]>;
-}
-
-multiclass SIMDTwoScalarBHSD<bit U, bits<5> opc, string asm,
-                             SDPatternOperator OpNode = null_frag> {
-  let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
-    def v1i64  : BaseSIMDTwoScalar<U, 0b11, opc, FPR64, FPR64, asm,
-           [(set (i64 FPR64:$Rd), (OpNode (i64 FPR64:$Rn)))]>;
-    def v1i32  : BaseSIMDTwoScalar<U, 0b10, opc, FPR32, FPR32, asm,
-           [(set (i32 FPR32:$Rd), (OpNode (i32 FPR32:$Rn)))]>;
-    def v1i16  : BaseSIMDTwoScalar<U, 0b01, opc, FPR16, FPR16, asm, []>;
-    def v1i8   : BaseSIMDTwoScalar<U, 0b00, opc, FPR8 , FPR8 , asm, []>;
-  }
-
-  def : Pat<(v1i64 (OpNode (v1i64 FPR64:$Rn))),
-            (!cast<Instruction>(NAME # v1i64) FPR64:$Rn)>;
-}
-
-multiclass SIMDTwoScalarBHSDTied<bit U, bits<5> opc, string asm,
-                                 Intrinsic OpNode> {
-  let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
-    def v1i64  : BaseSIMDTwoScalarTied<U, 0b11, opc, FPR64, FPR64, asm,
-        [(set (i64 FPR64:$dst), (OpNode (i64 FPR64:$Rd), (i64 FPR64:$Rn)))]>;
-    def v1i32  : BaseSIMDTwoScalarTied<U, 0b10, opc, FPR32, FPR32, asm,
-        [(set (i32 FPR32:$dst), (OpNode (i32 FPR32:$Rd), (i32 FPR32:$Rn)))]>;
-    def v1i16  : BaseSIMDTwoScalarTied<U, 0b01, opc, FPR16, FPR16, asm, []>;
-    def v1i8   : BaseSIMDTwoScalarTied<U, 0b00, opc, FPR8 , FPR8 , asm, []>;
-  }
-
-  def : Pat<(v1i64 (OpNode (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn))),
-            (!cast<Instruction>(NAME # v1i64) FPR64:$Rd, FPR64:$Rn)>;
-}
-
-
-
-let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
-multiclass SIMDTwoScalarMixedBHS<bit U, bits<5> opc, string asm,
-                                 SDPatternOperator OpNode = null_frag> {
-  def v1i32  : BaseSIMDTwoScalar<U, 0b10, opc, FPR32, FPR64, asm,
-        [(set (i32 FPR32:$Rd), (OpNode (i64 FPR64:$Rn)))]>;
-  def v1i16  : BaseSIMDTwoScalar<U, 0b01, opc, FPR16, FPR32, asm, []>;
-  def v1i8   : BaseSIMDTwoScalar<U, 0b00, opc, FPR8 , FPR16, asm, []>;
-}
-
-//----------------------------------------------------------------------------
-// AdvSIMD scalar pairwise instructions
-//----------------------------------------------------------------------------
-
-let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
-class BaseSIMDPairwiseScalar<bit U, bits<2> size, bits<5> opcode,
-                        RegisterOperand regtype, RegisterOperand vectype,
-                        string asm, string kind>
-  : I<(outs regtype:$Rd), (ins vectype:$Rn), asm,
-      "{\t$Rd, $Rn" # kind # "|" # kind # "\t$Rd, $Rn}", "", []>,
-    Sched<[WriteV]> {
-  bits<5> Rd;
-  bits<5> Rn;
-  let Inst{31-30} = 0b01;
-  let Inst{29}    = U;
-  let Inst{28-24} = 0b11110;
-  let Inst{23-22} = size;
-  let Inst{21-17} = 0b11000;
-  let Inst{16-12} = opcode;
-  let Inst{11-10} = 0b10;
-  let Inst{9-5}   = Rn;
-  let Inst{4-0}   = Rd;
-}
-
-multiclass SIMDPairwiseScalarD<bit U, bits<5> opc, string asm> {
-  def v2i64p : BaseSIMDPairwiseScalar<U, 0b11, opc, FPR64Op, V128,
-                                      asm, ".2d">;
-}
-
-multiclass SIMDPairwiseScalarSD<bit U, bit S, bits<5> opc, string asm> {
-  def v2i32p : BaseSIMDPairwiseScalar<U, {S,0}, opc, FPR32Op, V64,
-                                      asm, ".2s">;
-  def v2i64p : BaseSIMDPairwiseScalar<U, {S,1}, opc, FPR64Op, V128,
-                                      asm, ".2d">;
-}
-
-//----------------------------------------------------------------------------
-// AdvSIMD across lanes instructions
-//----------------------------------------------------------------------------
-
-let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
-class BaseSIMDAcrossLanes<bit Q, bit U, bits<2> size, bits<5> opcode,
-                          RegisterClass regtype, RegisterOperand vectype,
-                          string asm, string kind, list<dag> pattern>
-  : I<(outs regtype:$Rd), (ins vectype:$Rn), asm,
-      "{\t$Rd, $Rn" # kind # "|" # kind # "\t$Rd, $Rn}", "", pattern>,
-    Sched<[WriteV]> {
-  bits<5> Rd;
-  bits<5> Rn;
-  let Inst{31}    = 0;
-  let Inst{30}    = Q;
-  let Inst{29}    = U;
-  let Inst{28-24} = 0b01110;
-  let Inst{23-22} = size;
-  let Inst{21-17} = 0b11000;
-  let Inst{16-12} = opcode;
-  let Inst{11-10} = 0b10;
-  let Inst{9-5}   = Rn;
-  let Inst{4-0}   = Rd;
-}
-
-multiclass SIMDAcrossLanesBHS<bit U, bits<5> opcode,
-                              string asm> {
-  def v8i8v  : BaseSIMDAcrossLanes<0, U, 0b00, opcode, FPR8,  V64,
-                                   asm, ".8b", []>;
-  def v16i8v : BaseSIMDAcrossLanes<1, U, 0b00, opcode, FPR8,  V128,
-                                   asm, ".16b", []>;
-  def v4i16v : BaseSIMDAcrossLanes<0, U, 0b01, opcode, FPR16, V64,
-                                   asm, ".4h", []>;
-  def v8i16v : BaseSIMDAcrossLanes<1, U, 0b01, opcode, FPR16, V128,
-                                   asm, ".8h", []>;
-  def v4i32v : BaseSIMDAcrossLanes<1, U, 0b10, opcode, FPR32, V128,
-                                   asm, ".4s", []>;
-}
-
-multiclass SIMDAcrossLanesHSD<bit U, bits<5> opcode, string asm> {
-  def v8i8v  : BaseSIMDAcrossLanes<0, U, 0b00, opcode, FPR16, V64,
-                                   asm, ".8b", []>;
-  def v16i8v : BaseSIMDAcrossLanes<1, U, 0b00, opcode, FPR16, V128,
-                                   asm, ".16b", []>;
-  def v4i16v : BaseSIMDAcrossLanes<0, U, 0b01, opcode, FPR32, V64,
-                                   asm, ".4h", []>;
-  def v8i16v : BaseSIMDAcrossLanes<1, U, 0b01, opcode, FPR32, V128,
-                                   asm, ".8h", []>;
-  def v4i32v : BaseSIMDAcrossLanes<1, U, 0b10, opcode, FPR64, V128,
-                                   asm, ".4s", []>;
-}
-
-multiclass SIMDAcrossLanesS<bits<5> opcode, bit sz1, string asm,
-                            Intrinsic intOp> {
-  def v4i32v : BaseSIMDAcrossLanes<1, 1, {sz1, 0}, opcode, FPR32, V128,
-                                   asm, ".4s",
-        [(set FPR32:$Rd, (intOp (v4f32 V128:$Rn)))]>;
-}
-
-//----------------------------------------------------------------------------
-// AdvSIMD INS/DUP instructions
-//----------------------------------------------------------------------------
-
-// FIXME: There has got to be a better way to factor these. ugh.
-
-class BaseSIMDInsDup<bit Q, bit op, dag outs, dag ins, string asm,
-                     string operands, string constraints, list<dag> pattern>
-  : I<outs, ins, asm, operands, constraints, pattern>,
-    Sched<[WriteV]> {
-  bits<5> Rd;
-  bits<5> Rn;
-  let Inst{31} = 0;
-  let Inst{30} = Q;
-  let Inst{29} = op;
-  let Inst{28-21} = 0b01110000;
-  let Inst{15} = 0;
-  let Inst{10} = 1;
-  let Inst{9-5} = Rn;
-  let Inst{4-0} = Rd;
-}
-
-class SIMDDupFromMain<bit Q, bits<5> imm5, string size, ValueType vectype,
-                      RegisterOperand vecreg, RegisterClass regtype>
-  : BaseSIMDInsDup<Q, 0, (outs vecreg:$Rd), (ins regtype:$Rn), "dup",
-                   "{\t$Rd" # size # ", $Rn" #
-                   "|" # size # "\t$Rd, $Rn}", "",
-                   [(set (vectype vecreg:$Rd), (ARM64dup regtype:$Rn))]> {
-  let Inst{20-16} = imm5;
-  let Inst{14-11} = 0b0001;
-}
-
-class SIMDDupFromElement<bit Q, string dstkind, string srckind,
-                         ValueType vectype, ValueType insreg,
-                         RegisterOperand vecreg, Operand idxtype,
-                         ValueType elttype, SDNode OpNode>
-  : BaseSIMDInsDup<Q, 0, (outs vecreg:$Rd), (ins V128:$Rn, idxtype:$idx), "dup",
-                   "{\t$Rd" # dstkind # ", $Rn" # srckind # "$idx" #
-                   "|" # dstkind # "\t$Rd, $Rn$idx}", "",
-                 [(set (vectype vecreg:$Rd),
-                       (OpNode (insreg V128:$Rn), idxtype:$idx))]> {
-  let Inst{14-11} = 0b0000;
-}
-
-class SIMDDup64FromElement
-  : SIMDDupFromElement<1, ".2d", ".d", v2i64, v2i64, V128,
-                       VectorIndexD, i64, ARM64duplane64> {
-  bits<1> idx;
-  let Inst{20} = idx;
-  let Inst{19-16} = 0b1000;
-}
-
-class SIMDDup32FromElement<bit Q, string size, ValueType vectype,
-                           RegisterOperand vecreg>
-  : SIMDDupFromElement<Q, size, ".s", vectype, v4i32, vecreg,
-                       VectorIndexS, i64, ARM64duplane32> {
-  bits<2> idx;
-  let Inst{20-19} = idx;
-  let Inst{18-16} = 0b100;
-}
-
-class SIMDDup16FromElement<bit Q, string size, ValueType vectype,
-                           RegisterOperand vecreg>
-  : SIMDDupFromElement<Q, size, ".h", vectype, v8i16, vecreg,
-                       VectorIndexH, i64, ARM64duplane16> {
-  bits<3> idx;
-  let Inst{20-18} = idx;
-  let Inst{17-16} = 0b10;
-}
-
-class SIMDDup8FromElement<bit Q, string size, ValueType vectype,
-                          RegisterOperand vecreg>
-  : SIMDDupFromElement<Q, size, ".b", vectype, v16i8, vecreg,
-                       VectorIndexB, i64, ARM64duplane8> {
-  bits<4> idx;
-  let Inst{20-17} = idx;
-  let Inst{16} = 1;
-}
-
-class BaseSIMDMov<bit Q, string size, bits<4> imm4, RegisterClass regtype,
-                  Operand idxtype, string asm, list<dag> pattern>
-  : BaseSIMDInsDup<Q, 0, (outs regtype:$Rd), (ins V128:$Rn, idxtype:$idx), asm,
-                   "{\t$Rd, $Rn" # size # "$idx" #
-                   "|" # size # "\t$Rd, $Rn$idx}", "", pattern> {
-  let Inst{14-11} = imm4;
-}
-
-class SIMDSMov<bit Q, string size, RegisterClass regtype,
-               Operand idxtype>
-  : BaseSIMDMov<Q, size, 0b0101, regtype, idxtype, "smov", []>;
-class SIMDUMov<bit Q, string size, ValueType vectype, RegisterClass regtype,
-               Operand idxtype>
-  : BaseSIMDMov<Q, size, 0b0111, regtype, idxtype, "umov",
-      [(set regtype:$Rd, (vector_extract (vectype V128:$Rn), idxtype:$idx))]>;
-
-class SIMDMovAlias<string asm, string size, Instruction inst,
-                   RegisterClass regtype, Operand idxtype>
-    : InstAlias<asm#"{\t$dst, $src"#size#"$idx" #
-                    "|" # size # "\t$dst, $src$idx}",
-                (inst regtype:$dst, V128:$src, idxtype:$idx)>;
-
-multiclass SMov {
-  def vi8to32 : SIMDSMov<0, ".b", GPR32, VectorIndexB> {
-    bits<4> idx;
-    let Inst{20-17} = idx;
-    let Inst{16} = 1;
-  }
-  def vi8to64 : SIMDSMov<1, ".b", GPR64, VectorIndexB> {
-    bits<4> idx;
-    let Inst{20-17} = idx;
-    let Inst{16} = 1;
-  }
-  def vi16to32 : SIMDSMov<0, ".h", GPR32, VectorIndexH> {
-    bits<3> idx;
-    let Inst{20-18} = idx;
-    let Inst{17-16} = 0b10;
-  }
-  def vi16to64 : SIMDSMov<1, ".h", GPR64, VectorIndexH> {
-    bits<3> idx;
-    let Inst{20-18} = idx;
-    let Inst{17-16} = 0b10;
-  }
-  def vi32to64 : SIMDSMov<1, ".s", GPR64, VectorIndexS> {
-    bits<2> idx;
-    let Inst{20-19} = idx;
-    let Inst{18-16} = 0b100;
-  }
-}
-
-multiclass UMov {
-  def vi8 : SIMDUMov<0, ".b", v16i8, GPR32, VectorIndexB> {
-    bits<4> idx;
-    let Inst{20-17} = idx;
-    let Inst{16} = 1;
-  }
-  def vi16 : SIMDUMov<0, ".h", v8i16, GPR32, VectorIndexH> {
-    bits<3> idx;
-    let Inst{20-18} = idx;
-    let Inst{17-16} = 0b10;
-  }
-  def vi32 : SIMDUMov<0, ".s", v4i32, GPR32, VectorIndexS> {
-    bits<2> idx;
-    let Inst{20-19} = idx;
-    let Inst{18-16} = 0b100;
-  }
-  def vi64 : SIMDUMov<1, ".d", v2i64, GPR64, VectorIndexD> {
-    bits<1> idx;
-    let Inst{20} = idx;
-    let Inst{19-16} = 0b1000;
-  }
-  def : SIMDMovAlias<"mov", ".s",
-                     !cast<Instruction>(NAME#"vi32"),
-                     GPR32, VectorIndexS>;
-  def : SIMDMovAlias<"mov", ".d",
-                     !cast<Instruction>(NAME#"vi64"),
-                     GPR64, VectorIndexD>;
-}
-
-class SIMDInsFromMain<string size, ValueType vectype,
-                      RegisterClass regtype, Operand idxtype>
-  : BaseSIMDInsDup<1, 0, (outs V128:$dst),
-                   (ins V128:$Rd, idxtype:$idx, regtype:$Rn), "ins",
-                   "{\t$Rd" # size # "$idx, $Rn" #
-                   "|" # size # "\t$Rd$idx, $Rn}",
-                   "$Rd = $dst",
-            [(set V128:$dst,
-              (vector_insert (vectype V128:$Rd), regtype:$Rn, idxtype:$idx))]> {
-  let Inst{14-11} = 0b0011;
-}
-
-class SIMDInsFromElement<string size, ValueType vectype,
-                         ValueType elttype, Operand idxtype>
-  : BaseSIMDInsDup<1, 1, (outs V128:$dst),
-                   (ins V128:$Rd, idxtype:$idx, V128:$Rn, idxtype:$idx2), "ins",
-                   "{\t$Rd" # size # "$idx, $Rn" # size # "$idx2" #
-                   "|" # size # "\t$Rd$idx, $Rn$idx2}",
-                   "$Rd = $dst",
-         [(set V128:$dst,
-               (vector_insert
-                 (vectype V128:$Rd),
-                 (elttype (vector_extract (vectype V128:$Rn), idxtype:$idx2)),
-                 idxtype:$idx))]>;
-
-class SIMDInsMainMovAlias<string size, Instruction inst,
-                          RegisterClass regtype, Operand idxtype>
-    : InstAlias<"mov" # "{\t$dst" # size # "$idx, $src" #
-                        "|" # size #"\t$dst$idx, $src}",
-                (inst V128:$dst, idxtype:$idx, regtype:$src)>;
-class SIMDInsElementMovAlias<string size, Instruction inst,
-                             Operand idxtype>
-    : InstAlias<"mov" # "{\t$dst" # size # "$idx, $src" # size # "$idx2" #
-                      # "|" # size #" $dst$idx, $src$idx2}",
-                (inst V128:$dst, idxtype:$idx, V128:$src, idxtype:$idx2)>;
-
-
-multiclass SIMDIns {
-  def vi8gpr : SIMDInsFromMain<".b", v16i8, GPR32, VectorIndexB> {
-    bits<4> idx;
-    let Inst{20-17} = idx;
-    let Inst{16} = 1;
-  }
-  def vi16gpr : SIMDInsFromMain<".h", v8i16, GPR32, VectorIndexH> {
-    bits<3> idx;
-    let Inst{20-18} = idx;
-    let Inst{17-16} = 0b10;
-  }
-  def vi32gpr : SIMDInsFromMain<".s", v4i32, GPR32, VectorIndexS> {
-    bits<2> idx;
-    let Inst{20-19} = idx;
-    let Inst{18-16} = 0b100;
-  }
-  def vi64gpr : SIMDInsFromMain<".d", v2i64, GPR64, VectorIndexD> {
-    bits<1> idx;
-    let Inst{20} = idx;
-    let Inst{19-16} = 0b1000;
-  }
-
-  def vi8lane : SIMDInsFromElement<".b", v16i8, i32, VectorIndexB> {
-    bits<4> idx;
-    bits<4> idx2;
-    let Inst{20-17} = idx;
-    let Inst{16} = 1;
-    let Inst{14-11} = idx2;
-  }
-  def vi16lane : SIMDInsFromElement<".h", v8i16, i32, VectorIndexH> {
-    bits<3> idx;
-    bits<3> idx2;
-    let Inst{20-18} = idx;
-    let Inst{17-16} = 0b10;
-    let Inst{14-12} = idx2;
-    let Inst{11} = 0;
-  }
-  def vi32lane : SIMDInsFromElement<".s", v4i32, i32, VectorIndexS> {
-    bits<2> idx;
-    bits<2> idx2;
-    let Inst{20-19} = idx;
-    let Inst{18-16} = 0b100;
-    let Inst{14-13} = idx2;
-    let Inst{12-11} = 0;
-  }
-  def vi64lane : SIMDInsFromElement<".d", v2i64, i64, VectorIndexD> {
-    bits<1> idx;
-    bits<1> idx2;
-    let Inst{20} = idx;
-    let Inst{19-16} = 0b1000;
-    let Inst{14} = idx2;
-    let Inst{13-11} = 0;
-  }
-
-  // For all forms of the INS instruction, the "mov" mnemonic is the
-  // preferred alias. Why they didn't just call the instruction "mov" in
-  // the first place is a very good question indeed...
-  def : SIMDInsMainMovAlias<".b", !cast<Instruction>(NAME#"vi8gpr"),
-                         GPR32, VectorIndexB>;
-  def : SIMDInsMainMovAlias<".h", !cast<Instruction>(NAME#"vi16gpr"),
-                         GPR32, VectorIndexH>;
-  def : SIMDInsMainMovAlias<".s", !cast<Instruction>(NAME#"vi32gpr"),
-                         GPR32, VectorIndexS>;
-  def : SIMDInsMainMovAlias<".d", !cast<Instruction>(NAME#"vi64gpr"),
-                         GPR64, VectorIndexD>;
-
-  def : SIMDInsElementMovAlias<".b", !cast<Instruction>(NAME#"vi8lane"),
-                         VectorIndexB>;
-  def : SIMDInsElementMovAlias<".h", !cast<Instruction>(NAME#"vi16lane"),
-                         VectorIndexH>;
-  def : SIMDInsElementMovAlias<".s", !cast<Instruction>(NAME#"vi32lane"),
-                         VectorIndexS>;
-  def : SIMDInsElementMovAlias<".d", !cast<Instruction>(NAME#"vi64lane"),
-                         VectorIndexD>;
-}
-
-//----------------------------------------------------------------------------
-// AdvSIMD TBL/TBX
-//----------------------------------------------------------------------------
-
-let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in
-class BaseSIMDTableLookup<bit Q, bits<2> len, bit op, RegisterOperand vectype,
-                          RegisterOperand listtype, string asm, string kind>
-  : I<(outs vectype:$Vd), (ins listtype:$Vn, vectype:$Vm), asm,
-       "\t$Vd" # kind # ", $Vn, $Vm" # kind, "", []>,
-    Sched<[WriteV]> {
-  bits<5> Vd;
-  bits<5> Vn;
-  bits<5> Vm;
-  let Inst{31}    = 0;
-  let Inst{30}    = Q;
-  let Inst{29-21} = 0b001110000;
-  let Inst{20-16} = Vm;
-  let Inst{15}    = 0;
-  let Inst{14-13} = len;
-  let Inst{12}    = op;
-  let Inst{11-10} = 0b00;
-  let Inst{9-5}   = Vn;
-  let Inst{4-0}   = Vd;
-}
-
-let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in
-class BaseSIMDTableLookupTied<bit Q, bits<2> len, bit op, RegisterOperand vectype,
-                          RegisterOperand listtype, string asm, string kind>
-  : I<(outs vectype:$dst), (ins vectype:$Vd, listtype:$Vn, vectype:$Vm), asm,
-       "\t$Vd" # kind # ", $Vn, $Vm" # kind, "$Vd = $dst", []>,
-    Sched<[WriteV]> {
-  bits<5> Vd;
-  bits<5> Vn;
-  bits<5> Vm;
-  let Inst{31}    = 0;
-  let Inst{30}    = Q;
-  let Inst{29-21} = 0b001110000;
-  let Inst{20-16} = Vm;
-  let Inst{15}    = 0;
-  let Inst{14-13} = len;
-  let Inst{12}    = op;
-  let Inst{11-10} = 0b00;
-  let Inst{9-5}   = Vn;
-  let Inst{4-0}   = Vd;
-}
-
-class SIMDTableLookupAlias<string asm, Instruction inst,
-                          RegisterOperand vectype, RegisterOperand listtype>
-    : InstAlias<!strconcat(asm, "\t$dst, $lst, $index"),
-                (inst vectype:$dst, listtype:$lst, vectype:$index), 0>;
-
-multiclass SIMDTableLookup<bit op, string asm> {
-  def v8i8One   : BaseSIMDTableLookup<0, 0b00, op, V64, VecListOne16b,
-                                      asm, ".8b">;
-  def v8i8Two   : BaseSIMDTableLookup<0, 0b01, op, V64, VecListTwo16b,
-                                      asm, ".8b">;
-  def v8i8Three : BaseSIMDTableLookup<0, 0b10, op, V64, VecListThree16b,
-                                      asm, ".8b">;
-  def v8i8Four  : BaseSIMDTableLookup<0, 0b11, op, V64, VecListFour16b,
-                                      asm, ".8b">;
-  def v16i8One  : BaseSIMDTableLookup<1, 0b00, op, V128, VecListOne16b,
-                                      asm, ".16b">;
-  def v16i8Two  : BaseSIMDTableLookup<1, 0b01, op, V128, VecListTwo16b,
-                                      asm, ".16b">;
-  def v16i8Three: BaseSIMDTableLookup<1, 0b10, op, V128, VecListThree16b,
-                                      asm, ".16b">;
-  def v16i8Four : BaseSIMDTableLookup<1, 0b11, op, V128, VecListFour16b,
-                                      asm, ".16b">;
-
-  def : SIMDTableLookupAlias<asm # ".8b",
-                         !cast<Instruction>(NAME#"v8i8One"),
-                         V64, VecListOne128>;
-  def : SIMDTableLookupAlias<asm # ".8b",
-                         !cast<Instruction>(NAME#"v8i8Two"),
-                         V64, VecListTwo128>;
-  def : SIMDTableLookupAlias<asm # ".8b",
-                         !cast<Instruction>(NAME#"v8i8Three"),
-                         V64, VecListThree128>;
-  def : SIMDTableLookupAlias<asm # ".8b",
-                         !cast<Instruction>(NAME#"v8i8Four"),
-                         V64, VecListFour128>;
-  def : SIMDTableLookupAlias<asm # ".16b",
-                         !cast<Instruction>(NAME#"v16i8One"),
-                         V128, VecListOne128>;
-  def : SIMDTableLookupAlias<asm # ".16b",
-                         !cast<Instruction>(NAME#"v16i8Two"),
-                         V128, VecListTwo128>;
-  def : SIMDTableLookupAlias<asm # ".16b",
-                         !cast<Instruction>(NAME#"v16i8Three"),
-                         V128, VecListThree128>;
-  def : SIMDTableLookupAlias<asm # ".16b",
-                         !cast<Instruction>(NAME#"v16i8Four"),
-                         V128, VecListFour128>;
-}
-
-multiclass SIMDTableLookupTied<bit op, string asm> {
-  def v8i8One   : BaseSIMDTableLookupTied<0, 0b00, op, V64, VecListOne16b,
-                                      asm, ".8b">;
-  def v8i8Two   : BaseSIMDTableLookupTied<0, 0b01, op, V64, VecListTwo16b,
-                                      asm, ".8b">;
-  def v8i8Three : BaseSIMDTableLookupTied<0, 0b10, op, V64, VecListThree16b,
-                                      asm, ".8b">;
-  def v8i8Four  : BaseSIMDTableLookupTied<0, 0b11, op, V64, VecListFour16b,
-                                      asm, ".8b">;
-  def v16i8One  : BaseSIMDTableLookupTied<1, 0b00, op, V128, VecListOne16b,
-                                      asm, ".16b">;
-  def v16i8Two  : BaseSIMDTableLookupTied<1, 0b01, op, V128, VecListTwo16b,
-                                      asm, ".16b">;
-  def v16i8Three: BaseSIMDTableLookupTied<1, 0b10, op, V128, VecListThree16b,
-                                      asm, ".16b">;
-  def v16i8Four : BaseSIMDTableLookupTied<1, 0b11, op, V128, VecListFour16b,
-                                      asm, ".16b">;
-
-  def : SIMDTableLookupAlias<asm # ".8b",
-                         !cast<Instruction>(NAME#"v8i8One"),
-                         V64, VecListOne128>;
-  def : SIMDTableLookupAlias<asm # ".8b",
-                         !cast<Instruction>(NAME#"v8i8Two"),
-                         V64, VecListTwo128>;
-  def : SIMDTableLookupAlias<asm # ".8b",
-                         !cast<Instruction>(NAME#"v8i8Three"),
-                         V64, VecListThree128>;
-  def : SIMDTableLookupAlias<asm # ".8b",
-                         !cast<Instruction>(NAME#"v8i8Four"),
-                         V64, VecListFour128>;
-  def : SIMDTableLookupAlias<asm # ".16b",
-                         !cast<Instruction>(NAME#"v16i8One"),
-                         V128, VecListOne128>;
-  def : SIMDTableLookupAlias<asm # ".16b",
-                         !cast<Instruction>(NAME#"v16i8Two"),
-                         V128, VecListTwo128>;
-  def : SIMDTableLookupAlias<asm # ".16b",
-                         !cast<Instruction>(NAME#"v16i8Three"),
-                         V128, VecListThree128>;
-  def : SIMDTableLookupAlias<asm # ".16b",
-                         !cast<Instruction>(NAME#"v16i8Four"),
-                         V128, VecListFour128>;
-}
-
-
-//----------------------------------------------------------------------------
-// AdvSIMD scalar CPY
-//----------------------------------------------------------------------------
-let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
-class BaseSIMDScalarCPY<RegisterClass regtype, RegisterOperand vectype,
-                        string kind, Operand idxtype>
-  : I<(outs regtype:$dst), (ins vectype:$src, idxtype:$idx), "mov",
-       "{\t$dst, $src" # kind # "$idx" #
-       "|\t$dst, $src$idx}", "", []>,
-    Sched<[WriteV]> {
-  bits<5> dst;
-  bits<5> src;
-  let Inst{31-21} = 0b01011110000;
-  let Inst{15-10} = 0b000001;
-  let Inst{9-5}   = src;
-  let Inst{4-0}   = dst;
-}
-
-class SIMDScalarCPYAlias<string asm, string size, Instruction inst,
-      RegisterClass regtype, RegisterOperand vectype, Operand idxtype>
-    : InstAlias<asm # "{\t$dst, $src" # size # "$index" #
-                    # "|\t$dst, $src$index}",
-                (inst regtype:$dst, vectype:$src, idxtype:$index)>;
-
-
-multiclass SIMDScalarCPY<string asm> {
-  def i8  : BaseSIMDScalarCPY<FPR8,  V128, ".b", VectorIndexB> {
-    bits<4> idx;
-    let Inst{20-17} = idx;
-    let Inst{16} = 1;
-  }
-  def i16 : BaseSIMDScalarCPY<FPR16, V128, ".h", VectorIndexH> {
-    bits<3> idx;
-    let Inst{20-18} = idx;
-    let Inst{17-16} = 0b10;
-  }
-  def i32 : BaseSIMDScalarCPY<FPR32, V128, ".s", VectorIndexS> {
-    bits<2> idx;
-    let Inst{20-19} = idx;
-    let Inst{18-16} = 0b100;
-  }
-  def i64 : BaseSIMDScalarCPY<FPR64, V128, ".d", VectorIndexD> {
-    bits<1> idx;
-    let Inst{20} = idx;
-    let Inst{19-16} = 0b1000;
-  }
-
-  // 'DUP' mnemonic aliases.
-  def : SIMDScalarCPYAlias<"dup", ".b",
-                           !cast<Instruction>(NAME#"i8"),
-                           FPR8, V128, VectorIndexB>;
-  def : SIMDScalarCPYAlias<"dup", ".h",
-                           !cast<Instruction>(NAME#"i16"),
-                           FPR16, V128, VectorIndexH>;
-  def : SIMDScalarCPYAlias<"dup", ".s",
-                           !cast<Instruction>(NAME#"i32"),
-                           FPR32, V128, VectorIndexS>;
-  def : SIMDScalarCPYAlias<"dup", ".d",
-                           !cast<Instruction>(NAME#"i64"),
-                           FPR64, V128, VectorIndexD>;
-}
-
-//----------------------------------------------------------------------------
-// AdvSIMD modified immediate instructions
-//----------------------------------------------------------------------------
-
-class BaseSIMDModifiedImm<bit Q, bit op, dag oops, dag iops,
-                          string asm, string op_string,
-                          string cstr, list<dag> pattern>
-  : I<oops, iops, asm, op_string, cstr, pattern>,
-    Sched<[WriteV]> {
-  bits<5> Rd;
-  bits<8> imm8;
-  let Inst{31}    = 0;
-  let Inst{30}    = Q;
-  let Inst{29}    = op;
-  let Inst{28-19} = 0b0111100000;
-  let Inst{18-16} = imm8{7-5};
-  let Inst{11-10} = 0b01;
-  let Inst{9-5}   = imm8{4-0};
-  let Inst{4-0}   = Rd;
-}
-
-class BaseSIMDModifiedImmVector<bit Q, bit op, RegisterOperand vectype,
-                                Operand immtype, dag opt_shift_iop,
-                                string opt_shift, string asm, string kind,
-                                list<dag> pattern>
-  : BaseSIMDModifiedImm<Q, op, (outs vectype:$Rd),
-                        !con((ins immtype:$imm8), opt_shift_iop), asm,
-                        "{\t$Rd" # kind # ", $imm8" # opt_shift #
-                        "|" # kind # "\t$Rd, $imm8" # opt_shift # "}",
-                        "", pattern> {
-  let DecoderMethod = "DecodeModImmInstruction";
-}
-
-class BaseSIMDModifiedImmVectorTied<bit Q, bit op, RegisterOperand vectype,
-                                Operand immtype, dag opt_shift_iop,
-                                string opt_shift, string asm, string kind,
-                                list<dag> pattern>
-  : BaseSIMDModifiedImm<Q, op, (outs vectype:$dst),
-                        !con((ins vectype:$Rd, immtype:$imm8), opt_shift_iop),
-                        asm, "{\t$Rd" # kind # ", $imm8" # opt_shift #
-                             "|" # kind # "\t$Rd, $imm8" # opt_shift # "}",
-                        "$Rd = $dst", pattern> {
-  let DecoderMethod = "DecodeModImmTiedInstruction";
-}
-
-class BaseSIMDModifiedImmVectorShift<bit Q, bit op, bits<2> b15_b12,
-                                     RegisterOperand vectype, string asm,
-                                     string kind, list<dag> pattern>
-  : BaseSIMDModifiedImmVector<Q, op, vectype, imm0_255,
-                              (ins logical_vec_shift:$shift),
-                              "$shift", asm, kind, pattern> {
-  bits<2> shift;
-  let Inst{15}    = b15_b12{1};
-  let Inst{14-13} = shift;
-  let Inst{12}    = b15_b12{0};
-}
-
-class BaseSIMDModifiedImmVectorShiftTied<bit Q, bit op, bits<2> b15_b12,
-                                     RegisterOperand vectype, string asm,
-                                     string kind, list<dag> pattern>
-  : BaseSIMDModifiedImmVectorTied<Q, op, vectype, imm0_255,
-                              (ins logical_vec_shift:$shift),
-                              "$shift", asm, kind, pattern> {
-  bits<2> shift;
-  let Inst{15}    = b15_b12{1};
-  let Inst{14-13} = shift;
-  let Inst{12}    = b15_b12{0};
-}
-
-
-class BaseSIMDModifiedImmVectorShiftHalf<bit Q, bit op, bits<2> b15_b12,
-                                         RegisterOperand vectype, string asm,
-                                         string kind, list<dag> pattern>
-  : BaseSIMDModifiedImmVector<Q, op, vectype, imm0_255,
-                              (ins logical_vec_hw_shift:$shift),
-                              "$shift", asm, kind, pattern> {
-  bits<2> shift;
-  let Inst{15} = b15_b12{1};
-  let Inst{14} = 0;
-  let Inst{13} = shift{0};
-  let Inst{12} = b15_b12{0};
-}
-
-class BaseSIMDModifiedImmVectorShiftHalfTied<bit Q, bit op, bits<2> b15_b12,
-                                         RegisterOperand vectype, string asm,
-                                         string kind, list<dag> pattern>
-  : BaseSIMDModifiedImmVectorTied<Q, op, vectype, imm0_255,
-                              (ins logical_vec_hw_shift:$shift),
-                              "$shift", asm, kind, pattern> {
-  bits<2> shift;
-  let Inst{15} = b15_b12{1};
-  let Inst{14} = 0;
-  let Inst{13} = shift{0};
-  let Inst{12} = b15_b12{0};
-}
-
-multiclass SIMDModifiedImmVectorShift<bit op, bits<2> hw_cmode, bits<2> w_cmode,
-                                      string asm> {
-  def v4i16 : BaseSIMDModifiedImmVectorShiftHalf<0, op, hw_cmode, V64,
-                                                 asm, ".4h", []>;
-  def v8i16 : BaseSIMDModifiedImmVectorShiftHalf<1, op, hw_cmode, V128,
-                                                 asm, ".8h", []>;
-
-  def v2i32 : BaseSIMDModifiedImmVectorShift<0, op, w_cmode, V64,
-                                             asm, ".2s", []>;
-  def v4i32 : BaseSIMDModifiedImmVectorShift<1, op, w_cmode, V128,
-                                             asm, ".4s", []>;
-}
-
-multiclass SIMDModifiedImmVectorShiftTied<bit op, bits<2> hw_cmode,
-                                      bits<2> w_cmode, string asm,
-                                      SDNode OpNode> {
-  def v4i16 : BaseSIMDModifiedImmVectorShiftHalfTied<0, op, hw_cmode, V64,
-                                                 asm, ".4h",
-             [(set (v4i16 V64:$dst), (OpNode V64:$Rd,
-                                             imm0_255:$imm8,
-                                             (i32 imm:$shift)))]>;
-  def v8i16 : BaseSIMDModifiedImmVectorShiftHalfTied<1, op, hw_cmode, V128,
-                                                 asm, ".8h",
-             [(set (v8i16 V128:$dst), (OpNode V128:$Rd,
-                                              imm0_255:$imm8,
-                                              (i32 imm:$shift)))]>;
-
-  def v2i32 : BaseSIMDModifiedImmVectorShiftTied<0, op, w_cmode, V64,
-                                             asm, ".2s",
-             [(set (v2i32 V64:$dst), (OpNode V64:$Rd,
-                                             imm0_255:$imm8,
-                                             (i32 imm:$shift)))]>;
-  def v4i32 : BaseSIMDModifiedImmVectorShiftTied<1, op, w_cmode, V128,
-                                             asm, ".4s",
-             [(set (v4i32 V128:$dst), (OpNode V128:$Rd,
-                                              imm0_255:$imm8,
-                                              (i32 imm:$shift)))]>;
-}
-
-class SIMDModifiedImmMoveMSL<bit Q, bit op, bits<4> cmode,
-                             RegisterOperand vectype, string asm,
-                             string kind, list<dag> pattern>
-  : BaseSIMDModifiedImmVector<Q, op, vectype, imm0_255,
-                              (ins move_vec_shift:$shift),
-                              "$shift", asm, kind, pattern> {
-  bits<1> shift;
-  let Inst{15-13} = cmode{3-1};
-  let Inst{12}    = shift;
-}
-
-class SIMDModifiedImmVectorNoShift<bit Q, bit op, bits<4> cmode,
-                                   RegisterOperand vectype,
-                                   Operand imm_type, string asm,
-                                   string kind, list<dag> pattern>
-  : BaseSIMDModifiedImmVector<Q, op, vectype, imm_type, (ins), "",
-                              asm, kind, pattern> {
-  let Inst{15-12} = cmode;
-}
-
-class SIMDModifiedImmScalarNoShift<bit Q, bit op, bits<4> cmode, string asm,
-                                   list<dag> pattern>
-  : BaseSIMDModifiedImm<Q, op, (outs FPR64:$Rd), (ins simdimmtype10:$imm8), asm,
-                        "\t$Rd, $imm8", "", pattern> {
-  let Inst{15-12} = cmode;
-  let DecoderMethod = "DecodeModImmInstruction";
-}
-
-//----------------------------------------------------------------------------
-// AdvSIMD indexed element
-//----------------------------------------------------------------------------
-
-let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
-class BaseSIMDIndexed<bit Q, bit U, bit Scalar, bits<2> size, bits<4> opc,
-                      RegisterOperand dst_reg, RegisterOperand lhs_reg,
-                      RegisterOperand rhs_reg, Operand vec_idx, string asm,
-                      string apple_kind, string dst_kind, string lhs_kind,
-                      string rhs_kind, list<dag> pattern>
-  : I<(outs dst_reg:$Rd), (ins lhs_reg:$Rn, rhs_reg:$Rm, vec_idx:$idx),
-      asm,
-      "{\t$Rd" # dst_kind # ", $Rn" # lhs_kind # ", $Rm" # rhs_kind # "$idx" #
-      "|" # apple_kind # "\t$Rd, $Rn, $Rm$idx}", "", pattern>,
-    Sched<[WriteV]> {
-  bits<5> Rd;
-  bits<5> Rn;
-  bits<5> Rm;
-
-  let Inst{31}    = 0;
-  let Inst{30}    = Q;
-  let Inst{29}    = U;
-  let Inst{28}    = Scalar;
-  let Inst{27-24} = 0b1111;
-  let Inst{23-22} = size;
-  // Bit 21 must be set by the derived class.
-  let Inst{20-16} = Rm;
-  let Inst{15-12} = opc;
-  // Bit 11 must be set by the derived class.
-  let Inst{10}    = 0;
-  let Inst{9-5}   = Rn;
-  let Inst{4-0}   = Rd;
-}
-
-let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
-class BaseSIMDIndexedTied<bit Q, bit U, bit Scalar, bits<2> size, bits<4> opc,
-                      RegisterOperand dst_reg, RegisterOperand lhs_reg,
-                      RegisterOperand rhs_reg, Operand vec_idx, string asm,
-                      string apple_kind, string dst_kind, string lhs_kind,
-                      string rhs_kind, list<dag> pattern>
-  : I<(outs dst_reg:$dst),
-      (ins dst_reg:$Rd, lhs_reg:$Rn, rhs_reg:$Rm, vec_idx:$idx), asm,
-      "{\t$Rd" # dst_kind # ", $Rn" # lhs_kind # ", $Rm" # rhs_kind # "$idx" #
-      "|" # apple_kind # "\t$Rd, $Rn, $Rm$idx}", "$Rd = $dst", pattern>,
-    Sched<[WriteV]> {
-  bits<5> Rd;
-  bits<5> Rn;
-  bits<5> Rm;
-
-  let Inst{31}    = 0;
-  let Inst{30}    = Q;
-  let Inst{29}    = U;
-  let Inst{28}    = Scalar;
-  let Inst{27-24} = 0b1111;
-  let Inst{23-22} = size;
-  // Bit 21 must be set by the derived class.
-  let Inst{20-16} = Rm;
-  let Inst{15-12} = opc;
-  // Bit 11 must be set by the derived class.
-  let Inst{10}    = 0;
-  let Inst{9-5}   = Rn;
-  let Inst{4-0}   = Rd;
-}
-
-multiclass SIMDFPIndexedSD<bit U, bits<4> opc, string asm,
-                           SDPatternOperator OpNode> {
-  def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc,
-                                      V64, V64,
-                                      V128, VectorIndexS,
-                                      asm, ".2s", ".2s", ".2s", ".s",
-    [(set (v2f32 V64:$Rd),
-        (OpNode (v2f32 V64:$Rn),
-         (v2f32 (ARM64duplane32 (v4f32 V128:$Rm), VectorIndexS:$idx))))]> {
-    bits<2> idx;
-    let Inst{11} = idx{1};
-    let Inst{21} = idx{0};
-  }
-
-  def v4i32_indexed : BaseSIMDIndexed<1, U, 0, 0b10, opc,
-                                      V128, V128,
-                                      V128, VectorIndexS,
-                                      asm, ".4s", ".4s", ".4s", ".s",
-    [(set (v4f32 V128:$Rd),
-        (OpNode (v4f32 V128:$Rn),
-         (v4f32 (ARM64duplane32 (v4f32 V128:$Rm), VectorIndexS:$idx))))]> {
-    bits<2> idx;
-    let Inst{11} = idx{1};
-    let Inst{21} = idx{0};
-  }
-
-  def v2i64_indexed : BaseSIMDIndexed<1, U, 0, 0b11, opc,
-                                      V128, V128,
-                                      V128, VectorIndexD,
-                                      asm, ".2d", ".2d", ".2d", ".d",
-    [(set (v2f64 V128:$Rd),
-        (OpNode (v2f64 V128:$Rn),
-         (v2f64 (ARM64duplane64 (v2f64 V128:$Rm), VectorIndexD:$idx))))]> {
-    bits<1> idx;
-    let Inst{11} = idx{0};
-    let Inst{21} = 0;
-  }
-
-  def v1i32_indexed : BaseSIMDIndexed<1, U, 1, 0b10, opc,
-                                      FPR32Op, FPR32Op, V128, VectorIndexS,
-                                      asm, ".s", "", "", ".s",
-    [(set (f32 FPR32Op:$Rd),
-          (OpNode (f32 FPR32Op:$Rn),
-                  (f32 (vector_extract (v4f32 V128:$Rm),
-                                       VectorIndexS:$idx))))]> {
-    bits<2> idx;
-    let Inst{11} = idx{1};
-    let Inst{21} = idx{0};
-  }
-
-  def v1i64_indexed : BaseSIMDIndexed<1, U, 1, 0b11, opc,
-                                      FPR64Op, FPR64Op, V128, VectorIndexD,
-                                      asm, ".d", "", "", ".d",
-    [(set (f64 FPR64Op:$Rd),
-          (OpNode (f64 FPR64Op:$Rn),
-                  (f64 (vector_extract (v2f64 V128:$Rm),
-                                       VectorIndexD:$idx))))]> {
-    bits<1> idx;
-    let Inst{11} = idx{0};
-    let Inst{21} = 0;
-  }
-}
-
-multiclass SIMDFPIndexedSDTiedPatterns<string INST, SDPatternOperator OpNode> {
-  // 2 variants for the .2s version: DUPLANE from 128-bit and DUP scalar.
-  def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn),
-                           (ARM64duplane32 (v4f32 V128:$Rm),
-                                           VectorIndexS:$idx))),
-            (!cast<Instruction>(INST # v2i32_indexed)
-                V64:$Rd, V64:$Rn, V128:$Rm, VectorIndexS:$idx)>;
-  def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn),
-                           (ARM64dup (f32 FPR32Op:$Rm)))),
-            (!cast<Instruction>(INST # "v2i32_indexed") V64:$Rd, V64:$Rn,
-                (SUBREG_TO_REG (i32 0), FPR32Op:$Rm, ssub), (i64 0))>;
-
-
-  // 2 variants for the .4s version: DUPLANE from 128-bit and DUP scalar.
-  def : Pat<(v4f32 (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn),
-                           (ARM64duplane32 (v4f32 V128:$Rm),
-                                           VectorIndexS:$idx))),
-            (!cast<Instruction>(INST # "v4i32_indexed")
-                V128:$Rd, V128:$Rn, V128:$Rm, VectorIndexS:$idx)>;
-  def : Pat<(v4f32 (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn),
-                           (ARM64dup (f32 FPR32Op:$Rm)))),
-            (!cast<Instruction>(INST # "v4i32_indexed") V128:$Rd, V128:$Rn,
-                (SUBREG_TO_REG (i32 0), FPR32Op:$Rm, ssub), (i64 0))>;
-
-  // 2 variants for the .2d version: DUPLANE from 128-bit and DUP scalar.
-  def : Pat<(v2f64 (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn),
-                           (ARM64duplane64 (v2f64 V128:$Rm),
-                                           VectorIndexD:$idx))),
-            (!cast<Instruction>(INST # "v2i64_indexed")
-                V128:$Rd, V128:$Rn, V128:$Rm, VectorIndexS:$idx)>;
-  def : Pat<(v2f64 (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn),
-                           (ARM64dup (f64 FPR64Op:$Rm)))),
-            (!cast<Instruction>(INST # "v2i64_indexed") V128:$Rd, V128:$Rn,
-                (SUBREG_TO_REG (i32 0), FPR64Op:$Rm, dsub), (i64 0))>;
-
-  // 2 variants for 32-bit scalar version: extract from .2s or from .4s
-  def : Pat<(f32 (OpNode (f32 FPR32:$Rd), (f32 FPR32:$Rn),
-                         (vector_extract (v4f32 V128:$Rm), VectorIndexS:$idx))),
-            (!cast<Instruction>(INST # "v1i32_indexed") FPR32:$Rd, FPR32:$Rn,
-                V128:$Rm, VectorIndexS:$idx)>;
-  def : Pat<(f32 (OpNode (f32 FPR32:$Rd), (f32 FPR32:$Rn),
-                         (vector_extract (v2f32 V64:$Rm), VectorIndexS:$idx))),
-            (!cast<Instruction>(INST # "v1i32_indexed") FPR32:$Rd, FPR32:$Rn,
-                (SUBREG_TO_REG (i32 0), V64:$Rm, dsub), VectorIndexS:$idx)>;
-
-  // 1 variant for 64-bit scalar version: extract from .1d or from .2d
-  def : Pat<(f64 (OpNode (f64 FPR64:$Rd), (f64 FPR64:$Rn),
-                         (vector_extract (v2f64 V128:$Rm), VectorIndexD:$idx))),
-            (!cast<Instruction>(INST # "v1i64_indexed") FPR64:$Rd, FPR64:$Rn,
-                V128:$Rm, VectorIndexD:$idx)>;
-}
-
-multiclass SIMDFPIndexedSDTied<bit U, bits<4> opc, string asm> {
-  def v2i32_indexed : BaseSIMDIndexedTied<0, U, 0, 0b10, opc, V64, V64,
-                                          V128, VectorIndexS,
-                                          asm, ".2s", ".2s", ".2s", ".s", []> {
-    bits<2> idx;
-    let Inst{11} = idx{1};
-    let Inst{21} = idx{0};
-  }
-
-  def v4i32_indexed : BaseSIMDIndexedTied<1, U, 0, 0b10, opc,
-                                      V128, V128,
-                                      V128, VectorIndexS,
-                                      asm, ".4s", ".4s", ".4s", ".s", []> {
-    bits<2> idx;
-    let Inst{11} = idx{1};
-    let Inst{21} = idx{0};
-  }
-
-  def v2i64_indexed : BaseSIMDIndexedTied<1, U, 0, 0b11, opc,
-                                      V128, V128,
-                                      V128, VectorIndexD,
-                                      asm, ".2d", ".2d", ".2d", ".d", []> {
-    bits<1> idx;
-    let Inst{11} = idx{0};
-    let Inst{21} = 0;
-  }
-
-
-  def v1i32_indexed : BaseSIMDIndexedTied<1, U, 1, 0b10, opc,
-                                      FPR32Op, FPR32Op, V128, VectorIndexS,
-                                      asm, ".s", "", "", ".s", []> {
-    bits<2> idx;
-    let Inst{11} = idx{1};
-    let Inst{21} = idx{0};
-  }
-
-  def v1i64_indexed : BaseSIMDIndexedTied<1, U, 1, 0b11, opc,
-                                      FPR64Op, FPR64Op, V128, VectorIndexD,
-                                      asm, ".d", "", "", ".d", []> {
-    bits<1> idx;
-    let Inst{11} = idx{0};
-    let Inst{21} = 0;
-  }
-}
-
-multiclass SIMDIndexedHS<bit U, bits<4> opc, string asm,
-                         SDPatternOperator OpNode> {
-  def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b01, opc, V64, V64,
-                                      V128_lo, VectorIndexH,
-                                      asm, ".4h", ".4h", ".4h", ".h",
-    [(set (v4i16 V64:$Rd),
-        (OpNode (v4i16 V64:$Rn),
-         (v4i16 (ARM64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
-    bits<3> idx;
-    let Inst{11} = idx{2};
-    let Inst{21} = idx{1};
-    let Inst{20} = idx{0};
-  }
-
-  def v8i16_indexed : BaseSIMDIndexed<1, U, 0, 0b01, opc,
-                                      V128, V128,
-                                      V128_lo, VectorIndexH,
-                                      asm, ".8h", ".8h", ".8h", ".h",
-    [(set (v8i16 V128:$Rd),
-       (OpNode (v8i16 V128:$Rn),
-         (v8i16 (ARM64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
-    bits<3> idx;
-    let Inst{11} = idx{2};
-    let Inst{21} = idx{1};
-    let Inst{20} = idx{0};
-  }
-
-  def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc,
-                                      V64, V64,
-                                      V128, VectorIndexS,
-                                      asm, ".2s", ".2s", ".2s",  ".s",
-    [(set (v2i32 V64:$Rd),
-       (OpNode (v2i32 V64:$Rn),
-          (v2i32 (ARM64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> {
-    bits<2> idx;
-    let Inst{11} = idx{1};
-    let Inst{21} = idx{0};
-  }
-
-  def v4i32_indexed : BaseSIMDIndexed<1, U, 0, 0b10, opc,
-                                      V128, V128,
-                                      V128, VectorIndexS,
-                                      asm, ".4s", ".4s", ".4s", ".s",
-    [(set (v4i32 V128:$Rd),
-       (OpNode (v4i32 V128:$Rn),
-          (v4i32 (ARM64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> {
-    bits<2> idx;
-    let Inst{11} = idx{1};
-    let Inst{21} = idx{0};
-  }
-
-  def v1i16_indexed : BaseSIMDIndexed<1, U, 1, 0b01, opc,
-                                      FPR16Op, FPR16Op, V128_lo, VectorIndexH,
-                                      asm, ".h", "", "", ".h", []> {
-    bits<3> idx;
-    let Inst{11} = idx{2};
-    let Inst{21} = idx{1};
-    let Inst{20} = idx{0};
-  }
-
-  def v1i32_indexed : BaseSIMDIndexed<1, U, 1, 0b10, opc,
-                                      FPR32Op, FPR32Op, V128, VectorIndexS,
-                                      asm, ".s", "", "", ".s",
-      [(set (i32 FPR32Op:$Rd),
-            (OpNode FPR32Op:$Rn,
-                    (i32 (vector_extract (v4i32 V128:$Rm),
-                                         VectorIndexS:$idx))))]> {
-    bits<2> idx;
-    let Inst{11} = idx{1};
-    let Inst{21} = idx{0};
-  }
-}
-
-multiclass SIMDVectorIndexedHS<bit U, bits<4> opc, string asm,
-                               SDPatternOperator OpNode> {
-  def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b01, opc,
-                                      V64, V64,
-                                      V128_lo, VectorIndexH,
-                                      asm, ".4h", ".4h", ".4h", ".h",
-    [(set (v4i16 V64:$Rd),
-        (OpNode (v4i16 V64:$Rn),
-         (v4i16 (ARM64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
-    bits<3> idx;
-    let Inst{11} = idx{2};
-    let Inst{21} = idx{1};
-    let Inst{20} = idx{0};
-  }
-
-  def v8i16_indexed : BaseSIMDIndexed<1, U, 0, 0b01, opc,
-                                      V128, V128,
-                                      V128_lo, VectorIndexH,
-                                      asm, ".8h", ".8h", ".8h", ".h",
-    [(set (v8i16 V128:$Rd),
-       (OpNode (v8i16 V128:$Rn),
-         (v8i16 (ARM64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
-    bits<3> idx;
-    let Inst{11} = idx{2};
-    let Inst{21} = idx{1};
-    let Inst{20} = idx{0};
-  }
-
-  def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc,
-                                      V64, V64,
-                                      V128, VectorIndexS,
-                                      asm, ".2s", ".2s", ".2s", ".s",
-    [(set (v2i32 V64:$Rd),
-       (OpNode (v2i32 V64:$Rn),
-          (v2i32 (ARM64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> {
-    bits<2> idx;
-    let Inst{11} = idx{1};
-    let Inst{21} = idx{0};
-  }
-
-  def v4i32_indexed : BaseSIMDIndexed<1, U, 0, 0b10, opc,
-                                      V128, V128,
-                                      V128, VectorIndexS,
-                                      asm, ".4s", ".4s", ".4s", ".s",
-    [(set (v4i32 V128:$Rd),
-       (OpNode (v4i32 V128:$Rn),
-          (v4i32 (ARM64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> {
-    bits<2> idx;
-    let Inst{11} = idx{1};
-    let Inst{21} = idx{0};
-  }
-}
-
-multiclass SIMDVectorIndexedHSTied<bit U, bits<4> opc, string asm,
-                                   SDPatternOperator OpNode> {
-  def v4i16_indexed : BaseSIMDIndexedTied<0, U, 0, 0b01, opc, V64, V64,
-                                          V128_lo, VectorIndexH,
-                                          asm, ".4h", ".4h", ".4h", ".h",
-    [(set (v4i16 V64:$dst),
-        (OpNode (v4i16 V64:$Rd),(v4i16 V64:$Rn),
-         (v4i16 (ARM64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
-    bits<3> idx;
-    let Inst{11} = idx{2};
-    let Inst{21} = idx{1};
-    let Inst{20} = idx{0};
-  }
-
-  def v8i16_indexed : BaseSIMDIndexedTied<1, U, 0, 0b01, opc,
-                                      V128, V128,
-                                      V128_lo, VectorIndexH,
-                                      asm, ".8h", ".8h", ".8h", ".h",
-    [(set (v8i16 V128:$dst),
-       (OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn),
-         (v8i16 (ARM64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
-    bits<3> idx;
-    let Inst{11} = idx{2};
-    let Inst{21} = idx{1};
-    let Inst{20} = idx{0};
-  }
-
-  def v2i32_indexed : BaseSIMDIndexedTied<0, U, 0, 0b10, opc,
-                                      V64, V64,
-                                      V128, VectorIndexS,
-                                      asm, ".2s", ".2s", ".2s", ".s",
-    [(set (v2i32 V64:$dst),
-       (OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn),
-          (v2i32 (ARM64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> {
-    bits<2> idx;
-    let Inst{11} = idx{1};
-    let Inst{21} = idx{0};
-  }
-
-  def v4i32_indexed : BaseSIMDIndexedTied<1, U, 0, 0b10, opc,
-                                      V128, V128,
-                                      V128, VectorIndexS,
-                                      asm, ".4s", ".4s", ".4s", ".s",
-    [(set (v4i32 V128:$dst),
-       (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn),
-          (v4i32 (ARM64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> {
-    bits<2> idx;
-    let Inst{11} = idx{1};
-    let Inst{21} = idx{0};
-  }
-}
-
-multiclass SIMDIndexedLongSD<bit U, bits<4> opc, string asm,
-                             SDPatternOperator OpNode> {
-  def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b01, opc,
-                                      V128, V64,
-                                      V128_lo, VectorIndexH,
-                                      asm, ".4s", ".4s", ".4h", ".h",
-    [(set (v4i32 V128:$Rd),
-        (OpNode (v4i16 V64:$Rn),
-         (v4i16 (ARM64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
-    bits<3> idx;
-    let Inst{11} = idx{2};
-    let Inst{21} = idx{1};
-    let Inst{20} = idx{0};
-  }
-
-  def v8i16_indexed : BaseSIMDIndexed<1, U, 0, 0b01, opc,
-                                      V128, V128,
-                                      V128_lo, VectorIndexH,
-                                      asm#"2", ".4s", ".4s", ".8h", ".h",
-    [(set (v4i32 V128:$Rd),
-          (OpNode (extract_high_v8i16 V128:$Rn),
-                  (extract_high_v8i16 (ARM64duplane16 (v8i16 V128_lo:$Rm),
-                                                      VectorIndexH:$idx))))]> {
-
-    bits<3> idx;
-    let Inst{11} = idx{2};
-    let Inst{21} = idx{1};
-    let Inst{20} = idx{0};
-  }
-
-  def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc,
-                                      V128, V64,
-                                      V128, VectorIndexS,
-                                      asm, ".2d", ".2d", ".2s", ".s",
-    [(set (v2i64 V128:$Rd),
-        (OpNode (v2i32 V64:$Rn),
-         (v2i32 (ARM64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> {
-    bits<2> idx;
-    let Inst{11} = idx{1};
-    let Inst{21} = idx{0};
-  }
-
-  def v4i32_indexed : BaseSIMDIndexed<1, U, 0, 0b10, opc,
-                                      V128, V128,
-                                      V128, VectorIndexS,
-                                      asm#"2", ".2d", ".2d", ".4s", ".s",
-    [(set (v2i64 V128:$Rd),
-          (OpNode (extract_high_v4i32 V128:$Rn),
-                  (extract_high_v4i32 (ARM64duplane32 (v4i32 V128:$Rm),
-                                                      VectorIndexS:$idx))))]> {
-    bits<2> idx;
-    let Inst{11} = idx{1};
-    let Inst{21} = idx{0};
-  }
-
-  def v1i32_indexed : BaseSIMDIndexed<1, U, 1, 0b01, opc,
-                                      FPR32Op, FPR16Op, V128_lo, VectorIndexH,
-                                      asm, ".h", "", "", ".h", []> {
-    bits<3> idx;
-    let Inst{11} = idx{2};
-    let Inst{21} = idx{1};
-    let Inst{20} = idx{0};
-  }
-
-  def v1i64_indexed : BaseSIMDIndexed<1, U, 1, 0b10, opc,
-                                      FPR64Op, FPR32Op, V128, VectorIndexS,
-                                      asm, ".s", "", "", ".s", []> {
-    bits<2> idx;
-    let Inst{11} = idx{1};
-    let Inst{21} = idx{0};
-  }
-}
-
-multiclass SIMDIndexedLongSQDMLXSDTied<bit U, bits<4> opc, string asm,
-                                       SDPatternOperator Accum> {
-  def v4i16_indexed : BaseSIMDIndexedTied<0, U, 0, 0b01, opc,
-                                      V128, V64,
-                                      V128_lo, VectorIndexH,
-                                      asm, ".4s", ".4s", ".4h", ".h",
-    [(set (v4i32 V128:$dst),
-          (Accum (v4i32 V128:$Rd),
-                 (v4i32 (int_arm64_neon_sqdmull
-                             (v4i16 V64:$Rn),
-                             (v4i16 (ARM64duplane16 (v8i16 V128_lo:$Rm),
-                                                    VectorIndexH:$idx))))))]> {
-    bits<3> idx;
-    let Inst{11} = idx{2};
-    let Inst{21} = idx{1};
-    let Inst{20} = idx{0};
-  }
-
-  // FIXME: it would be nice to use the scalar (v1i32) instruction here, but an
-  // intermediate EXTRACT_SUBREG would be untyped.
-  def : Pat<(i32 (Accum (i32 FPR32Op:$Rd),
-                (i32 (vector_extract (v4i32
-                         (int_arm64_neon_sqdmull (v4i16 V64:$Rn),
-                             (v4i16 (ARM64duplane16 (v8i16 V128_lo:$Rm),
-                                                    VectorIndexH:$idx)))),
-                         (i64 0))))),
-            (EXTRACT_SUBREG
-                (!cast<Instruction>(NAME # v4i16_indexed)
-                    (SUBREG_TO_REG (i32 0), FPR32Op:$Rd, ssub), V64:$Rn,
-                    V128_lo:$Rm, VectorIndexH:$idx),
-                ssub)>;
-
-  def v8i16_indexed : BaseSIMDIndexedTied<1, U, 0, 0b01, opc,
-                                      V128, V128,
-                                      V128_lo, VectorIndexH,
-                                      asm#"2", ".4s", ".4s", ".8h", ".h",
-    [(set (v4i32 V128:$dst),
-          (Accum (v4i32 V128:$Rd),
-                 (v4i32 (int_arm64_neon_sqdmull
-                            (extract_high_v8i16 V128:$Rn),
-                            (extract_high_v8i16
-                                (ARM64duplane16 (v8i16 V128_lo:$Rm),
-                                                VectorIndexH:$idx))))))]> {
-    bits<3> idx;
-    let Inst{11} = idx{2};
-    let Inst{21} = idx{1};
-    let Inst{20} = idx{0};
-  }
-
-  def v2i32_indexed : BaseSIMDIndexedTied<0, U, 0, 0b10, opc,
-                                      V128, V64,
-                                      V128, VectorIndexS,
-                                      asm, ".2d", ".2d", ".2s", ".s",
-    [(set (v2i64 V128:$dst),
-        (Accum (v2i64 V128:$Rd),
-               (v2i64 (int_arm64_neon_sqdmull
-                          (v2i32 V64:$Rn),
-                          (v2i32 (ARM64duplane32 (v4i32 V128:$Rm),
-                                                 VectorIndexS:$idx))))))]> {
-    bits<2> idx;
-    let Inst{11} = idx{1};
-    let Inst{21} = idx{0};
-  }
-
-  def v4i32_indexed : BaseSIMDIndexedTied<1, U, 0, 0b10, opc,
-                                      V128, V128,
-                                      V128, VectorIndexS,
-                                      asm#"2", ".2d", ".2d", ".4s", ".s",
-    [(set (v2i64 V128:$dst),
-          (Accum (v2i64 V128:$Rd),
-                 (v2i64 (int_arm64_neon_sqdmull
-                            (extract_high_v4i32 V128:$Rn),
-                            (extract_high_v4i32
-                                (ARM64duplane32 (v4i32 V128:$Rm),
-                                                VectorIndexS:$idx))))))]> {
-    bits<2> idx;
-    let Inst{11} = idx{1};
-    let Inst{21} = idx{0};
-  }
-
-  def v1i32_indexed : BaseSIMDIndexedTied<1, U, 1, 0b01, opc,
-                                      FPR32Op, FPR16Op, V128_lo, VectorIndexH,
-                                      asm, ".h", "", "", ".h", []> {
-    bits<3> idx;
-    let Inst{11} = idx{2};
-    let Inst{21} = idx{1};
-    let Inst{20} = idx{0};
-  }
-
-
-  def v1i64_indexed : BaseSIMDIndexedTied<1, U, 1, 0b10, opc,
-                                      FPR64Op, FPR32Op, V128, VectorIndexS,
-                                      asm, ".s", "", "", ".s",
-    [(set (i64 FPR64Op:$dst),
-          (Accum (i64 FPR64Op:$Rd),
-                 (i64 (int_arm64_neon_sqdmulls_scalar
-                            (i32 FPR32Op:$Rn),
-                            (i32 (vector_extract (v4i32 V128:$Rm),
-                                                 VectorIndexS:$idx))))))]> {
-
-    bits<2> idx;
-    let Inst{11} = idx{1};
-    let Inst{21} = idx{0};
-  }
-}
-
-multiclass SIMDVectorIndexedLongSD<bit U, bits<4> opc, string asm,
-                                   SDPatternOperator OpNode> {
-  let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
-  def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b01, opc,
-                                      V128, V64,
-                                      V128_lo, VectorIndexH,
-                                      asm, ".4s", ".4s", ".4h", ".h",
-    [(set (v4i32 V128:$Rd),
-        (OpNode (v4i16 V64:$Rn),
-         (v4i16 (ARM64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
-    bits<3> idx;
-    let Inst{11} = idx{2};
-    let Inst{21} = idx{1};
-    let Inst{20} = idx{0};
-  }
-
-  def v8i16_indexed : BaseSIMDIndexed<1, U, 0, 0b01, opc,
-                                      V128, V128,
-                                      V128_lo, VectorIndexH,
-                                      asm#"2", ".4s", ".4s", ".8h", ".h",
-    [(set (v4i32 V128:$Rd),
-          (OpNode (extract_high_v8i16 V128:$Rn),
-                  (extract_high_v8i16 (ARM64duplane16 (v8i16 V128_lo:$Rm),
-                                                      VectorIndexH:$idx))))]> {
-
-    bits<3> idx;
-    let Inst{11} = idx{2};
-    let Inst{21} = idx{1};
-    let Inst{20} = idx{0};
-  }
-
-  def v2i32_indexed : BaseSIMDIndexed<0, U, 0, 0b10, opc,
-                                      V128, V64,
-                                      V128, VectorIndexS,
-                                      asm, ".2d", ".2d", ".2s", ".s",
-    [(set (v2i64 V128:$Rd),
-        (OpNode (v2i32 V64:$Rn),
-         (v2i32 (ARM64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> {
-    bits<2> idx;
-    let Inst{11} = idx{1};
-    let Inst{21} = idx{0};
-  }
-
-  def v4i32_indexed : BaseSIMDIndexed<1, U, 0, 0b10, opc,
-                                      V128, V128,
-                                      V128, VectorIndexS,
-                                      asm#"2", ".2d", ".2d", ".4s", ".s",
-    [(set (v2i64 V128:$Rd),
-          (OpNode (extract_high_v4i32 V128:$Rn),
-                  (extract_high_v4i32 (ARM64duplane32 (v4i32 V128:$Rm),
-                                                      VectorIndexS:$idx))))]> {
-    bits<2> idx;
-    let Inst{11} = idx{1};
-    let Inst{21} = idx{0};
-  }
-  }
-}
-
-multiclass SIMDVectorIndexedLongSDTied<bit U, bits<4> opc, string asm,
-                                       SDPatternOperator OpNode> {
-  let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
-  def v4i16_indexed : BaseSIMDIndexedTied<0, U, 0, 0b01, opc,
-                                      V128, V64,
-                                      V128_lo, VectorIndexH,
-                                      asm, ".4s", ".4s", ".4h", ".h",
-    [(set (v4i32 V128:$dst),
-        (OpNode (v4i32 V128:$Rd), (v4i16 V64:$Rn),
-         (v4i16 (ARM64duplane16 (v8i16 V128_lo:$Rm), VectorIndexH:$idx))))]> {
-    bits<3> idx;
-    let Inst{11} = idx{2};
-    let Inst{21} = idx{1};
-    let Inst{20} = idx{0};
-  }
-
-  def v8i16_indexed : BaseSIMDIndexedTied<1, U, 0, 0b01, opc,
-                                      V128, V128,
-                                      V128_lo, VectorIndexH,
-                                      asm#"2", ".4s", ".4s", ".8h", ".h",
-    [(set (v4i32 V128:$dst),
-          (OpNode (v4i32 V128:$Rd),
-                  (extract_high_v8i16 V128:$Rn),
-                  (extract_high_v8i16 (ARM64duplane16 (v8i16 V128_lo:$Rm),
-                                                      VectorIndexH:$idx))))]> {
-    bits<3> idx;
-    let Inst{11} = idx{2};
-    let Inst{21} = idx{1};
-    let Inst{20} = idx{0};
-  }
-
-  def v2i32_indexed : BaseSIMDIndexedTied<0, U, 0, 0b10, opc,
-                                      V128, V64,
-                                      V128, VectorIndexS,
-                                      asm, ".2d", ".2d", ".2s", ".s",
-    [(set (v2i64 V128:$dst),
-        (OpNode (v2i64 V128:$Rd), (v2i32 V64:$Rn),
-         (v2i32 (ARM64duplane32 (v4i32 V128:$Rm), VectorIndexS:$idx))))]> {
-    bits<2> idx;
-    let Inst{11} = idx{1};
-    let Inst{21} = idx{0};
-  }
-
-  def v4i32_indexed : BaseSIMDIndexedTied<1, U, 0, 0b10, opc,
-                                      V128, V128,
-                                      V128, VectorIndexS,
-                                      asm#"2", ".2d", ".2d", ".4s", ".s",
-    [(set (v2i64 V128:$dst),
-          (OpNode (v2i64 V128:$Rd),
-                  (extract_high_v4i32 V128:$Rn),
-                  (extract_high_v4i32 (ARM64duplane32 (v4i32 V128:$Rm),
-                                                      VectorIndexS:$idx))))]> {
-    bits<2> idx;
-    let Inst{11} = idx{1};
-    let Inst{21} = idx{0};
-  }
-  }
-}
-
-//----------------------------------------------------------------------------
-// AdvSIMD scalar shift by immediate
-//----------------------------------------------------------------------------
-
-let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in
-class BaseSIMDScalarShift<bit U, bits<5> opc, bits<7> fixed_imm,
-                     RegisterClass regtype1, RegisterClass regtype2,
-                     Operand immtype, string asm, list<dag> pattern>
-  : I<(outs regtype1:$Rd), (ins regtype2:$Rn, immtype:$imm),
-      asm, "\t$Rd, $Rn, $imm", "", pattern>,
-    Sched<[WriteV]> {
-  bits<5> Rd;
-  bits<5> Rn;
-  bits<7> imm;
-  let Inst{31-30} = 0b01;
-  let Inst{29}    = U;
-  let Inst{28-23} = 0b111110;
-  let Inst{22-16} = fixed_imm;
-  let Inst{15-11} = opc;
-  let Inst{10}    = 1;
-  let Inst{9-5} = Rn;
-  let Inst{4-0} = Rd;
-}
-
-let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in
-class BaseSIMDScalarShiftTied<bit U, bits<5> opc, bits<7> fixed_imm,
-                     RegisterClass regtype1, RegisterClass regtype2,
-                     Operand immtype, string asm, list<dag> pattern>
-  : I<(outs regtype1:$dst), (ins regtype1:$Rd, regtype2:$Rn, immtype:$imm),
-      asm, "\t$Rd, $Rn, $imm", "$Rd = $dst", pattern>,
-    Sched<[WriteV]> {
-  bits<5> Rd;
-  bits<5> Rn;
-  bits<7> imm;
-  let Inst{31-30} = 0b01;
-  let Inst{29}    = U;
-  let Inst{28-23} = 0b111110;
-  let Inst{22-16} = fixed_imm;
-  let Inst{15-11} = opc;
-  let Inst{10}    = 1;
-  let Inst{9-5} = Rn;
-  let Inst{4-0} = Rd;
-}
-
-
-multiclass SIMDScalarRShiftSD<bit U, bits<5> opc, string asm> {
-  def s : BaseSIMDScalarShift<U, opc, {0,1,?,?,?,?,?},
-                              FPR32, FPR32, vecshiftR32, asm, []> {
-    let Inst{20-16} = imm{4-0};
-  }
-
-  def d : BaseSIMDScalarShift<U, opc, {1,?,?,?,?,?,?},
-                              FPR64, FPR64, vecshiftR64, asm, []> {
-    let Inst{21-16} = imm{5-0};
-  }
-}
-
-multiclass SIMDScalarRShiftD<bit U, bits<5> opc, string asm,
-                             SDPatternOperator OpNode> {
-  def d : BaseSIMDScalarShift<U, opc, {1,?,?,?,?,?,?},
-                              FPR64, FPR64, vecshiftR64, asm,
-  [(set (i64 FPR64:$Rd),
-     (OpNode (i64 FPR64:$Rn), (i32 vecshiftR64:$imm)))]> {
-    let Inst{21-16} = imm{5-0};
-  }
-
-  def : Pat<(v1i64 (OpNode (v1i64 FPR64:$Rn), (i32 vecshiftR64:$imm))),
-            (!cast<Instruction>(NAME # "d") FPR64:$Rn, vecshiftR64:$imm)>;
-}
-
-multiclass SIMDScalarRShiftDTied<bit U, bits<5> opc, string asm,
-                                 SDPatternOperator OpNode = null_frag> {
-  def d : BaseSIMDScalarShiftTied<U, opc, {1,?,?,?,?,?,?},
-                              FPR64, FPR64, vecshiftR64, asm,
-  [(set (i64 FPR64:$dst), (OpNode (i64 FPR64:$Rd), (i64 FPR64:$Rn),
-                                                   (i32 vecshiftR64:$imm)))]> {
-    let Inst{21-16} = imm{5-0};
-  }
-
-  def : Pat<(v1i64 (OpNode (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn),
-                           (i32 vecshiftR64:$imm))),
-            (!cast<Instruction>(NAME # "d") FPR64:$Rd, FPR64:$Rn,
-                                            vecshiftR64:$imm)>;
-}
-
-multiclass SIMDScalarLShiftD<bit U, bits<5> opc, string asm,
-                             SDPatternOperator OpNode> {
-  def d : BaseSIMDScalarShift<U, opc, {1,?,?,?,?,?,?},
-                              FPR64, FPR64, vecshiftL64, asm,
-    [(set (v1i64 FPR64:$Rd),
-       (OpNode (v1i64 FPR64:$Rn), (i32 vecshiftL64:$imm)))]> {
-    let Inst{21-16} = imm{5-0};
-  }
-}
-
-let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in
-multiclass SIMDScalarLShiftDTied<bit U, bits<5> opc, string asm> {
-  def d : BaseSIMDScalarShiftTied<U, opc, {1,?,?,?,?,?,?},
-                              FPR64, FPR64, vecshiftL64, asm, []> {
-    let Inst{21-16} = imm{5-0};
-  }
-}
-
-let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in
-multiclass SIMDScalarRShiftBHS<bit U, bits<5> opc, string asm,
-                               SDPatternOperator OpNode = null_frag> {
-  def b : BaseSIMDScalarShift<U, opc, {0,0,0,1,?,?,?},
-                              FPR8, FPR16, vecshiftR8, asm, []> {
-    let Inst{18-16} = imm{2-0};
-  }
-
-  def h : BaseSIMDScalarShift<U, opc, {0,0,1,?,?,?,?},
-                              FPR16, FPR32, vecshiftR16, asm, []> {
-    let Inst{19-16} = imm{3-0};
-  }
-
-  def s : BaseSIMDScalarShift<U, opc, {0,1,?,?,?,?,?},
-                              FPR32, FPR64, vecshiftR32, asm,
-    [(set (i32 FPR32:$Rd), (OpNode (i64 FPR64:$Rn), vecshiftR32:$imm))]> {
-    let Inst{20-16} = imm{4-0};
-  }
-}
-
-multiclass SIMDScalarLShiftBHSD<bit U, bits<5> opc, string asm,
-                                SDPatternOperator OpNode> {
-  def b : BaseSIMDScalarShift<U, opc, {0,0,0,1,?,?,?},
-                              FPR8, FPR8, vecshiftL8, asm, []> {
-    let Inst{18-16} = imm{2-0};
-  }
-
-  def h : BaseSIMDScalarShift<U, opc, {0,0,1,?,?,?,?},
-                              FPR16, FPR16, vecshiftL16, asm, []> {
-    let Inst{19-16} = imm{3-0};
-  }
-
-  def s : BaseSIMDScalarShift<U, opc, {0,1,?,?,?,?,?},
-                              FPR32, FPR32, vecshiftL32, asm,
-    [(set (i32 FPR32:$Rd), (OpNode (i32 FPR32:$Rn), (i32 vecshiftL32:$imm)))]> {
-    let Inst{20-16} = imm{4-0};
-  }
-
-  def d : BaseSIMDScalarShift<U, opc, {1,?,?,?,?,?,?},
-                              FPR64, FPR64, vecshiftL64, asm,
-    [(set (v1i64 FPR64:$Rd), (OpNode (v1i64 FPR64:$Rn),
-                                     (i32 vecshiftL64:$imm)))]> {
-    let Inst{21-16} = imm{5-0};
-  }
-}
-
-multiclass SIMDScalarRShiftBHSD<bit U, bits<5> opc, string asm> {
-  def b : BaseSIMDScalarShift<U, opc, {0,0,0,1,?,?,?},
-                              FPR8, FPR8, vecshiftR8, asm, []> {
-    let Inst{18-16} = imm{2-0};
-  }
-
-  def h : BaseSIMDScalarShift<U, opc, {0,0,1,?,?,?,?},
-                              FPR16, FPR16, vecshiftR16, asm, []> {
-    let Inst{19-16} = imm{3-0};
-  }
-
-  def s : BaseSIMDScalarShift<U, opc, {0,1,?,?,?,?,?},
-                              FPR32, FPR32, vecshiftR32, asm, []> {
-    let Inst{20-16} = imm{4-0};
-  }
-
-  def d : BaseSIMDScalarShift<U, opc, {1,?,?,?,?,?,?},
-                              FPR64, FPR64, vecshiftR64, asm, []> {
-    let Inst{21-16} = imm{5-0};
-  }
-}
-
-//----------------------------------------------------------------------------
-// AdvSIMD vector x indexed element
-//----------------------------------------------------------------------------
-
-let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in
-class BaseSIMDVectorShift<bit Q, bit U, bits<5> opc, bits<7> fixed_imm,
-                     RegisterOperand dst_reg, RegisterOperand src_reg,
-                     Operand immtype,
-                     string asm, string dst_kind, string src_kind,
-                     list<dag> pattern>
-  : I<(outs dst_reg:$Rd), (ins src_reg:$Rn, immtype:$imm),
-      asm, "{\t$Rd" # dst_kind # ", $Rn" # src_kind # ", $imm" #
-           "|" # dst_kind # "\t$Rd, $Rn, $imm}", "", pattern>,
-    Sched<[WriteV]> {
-  bits<5> Rd;
-  bits<5> Rn;
-  let Inst{31}    = 0;
-  let Inst{30}    = Q;
-  let Inst{29}    = U;
-  let Inst{28-23} = 0b011110;
-  let Inst{22-16} = fixed_imm;
-  let Inst{15-11} = opc;
-  let Inst{10}    = 1;
-  let Inst{9-5}   = Rn;
-  let Inst{4-0}   = Rd;
-}
-
-let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in
-class BaseSIMDVectorShiftTied<bit Q, bit U, bits<5> opc, bits<7> fixed_imm,
-                     RegisterOperand vectype1, RegisterOperand vectype2,
-                     Operand immtype,
-                     string asm, string dst_kind, string src_kind,
-                     list<dag> pattern>
-  : I<(outs vectype1:$dst), (ins vectype1:$Rd, vectype2:$Rn, immtype:$imm),
-      asm, "{\t$Rd" # dst_kind # ", $Rn" # src_kind # ", $imm" #
-           "|" # dst_kind # "\t$Rd, $Rn, $imm}", "$Rd = $dst", pattern>,
-    Sched<[WriteV]> {
-  bits<5> Rd;
-  bits<5> Rn;
-  let Inst{31}    = 0;
-  let Inst{30}    = Q;
-  let Inst{29}    = U;
-  let Inst{28-23} = 0b011110;
-  let Inst{22-16} = fixed_imm;
-  let Inst{15-11} = opc;
-  let Inst{10}    = 1;
-  let Inst{9-5}   = Rn;
-  let Inst{4-0}   = Rd;
-}
-
-multiclass SIMDVectorRShiftSD<bit U, bits<5> opc, string asm,
-                              Intrinsic OpNode> {
-  def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?},
-                                  V64, V64, vecshiftR32,
-                                  asm, ".2s", ".2s",
-      [(set (v2i32 V64:$Rd), (OpNode (v2f32 V64:$Rn), (i32 imm:$imm)))]> {
-    bits<5> imm;
-    let Inst{20-16} = imm;
-  }
-
-  def v4i32_shift : BaseSIMDVectorShift<1, U, opc, {0,1,?,?,?,?,?},
-                                  V128, V128, vecshiftR32,
-                                  asm, ".4s", ".4s",
-      [(set (v4i32 V128:$Rd), (OpNode (v4f32 V128:$Rn), (i32 imm:$imm)))]> {
-    bits<5> imm;
-    let Inst{20-16} = imm;
-  }
-
-  def v2i64_shift : BaseSIMDVectorShift<1, U, opc, {1,?,?,?,?,?,?},
-                                  V128, V128, vecshiftR64,
-                                  asm, ".2d", ".2d",
-      [(set (v2i64 V128:$Rd), (OpNode (v2f64 V128:$Rn), (i32 imm:$imm)))]> {
-    bits<6> imm;
-    let Inst{21-16} = imm;
-  }
-}
-
-multiclass SIMDVectorRShiftSDToFP<bit U, bits<5> opc, string asm,
-                                  Intrinsic OpNode> {
-  def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?},
-                                  V64, V64, vecshiftR32,
-                                  asm, ".2s", ".2s",
-      [(set (v2f32 V64:$Rd), (OpNode (v2i32 V64:$Rn), (i32 imm:$imm)))]> {
-    bits<5> imm;
-    let Inst{20-16} = imm;
-  }
-
-  def v4i32_shift : BaseSIMDVectorShift<1, U, opc, {0,1,?,?,?,?,?},
-                                  V128, V128, vecshiftR32,
-                                  asm, ".4s", ".4s",
-      [(set (v4f32 V128:$Rd), (OpNode (v4i32 V128:$Rn), (i32 imm:$imm)))]> {
-    bits<5> imm;
-    let Inst{20-16} = imm;
-  }
-
-  def v2i64_shift : BaseSIMDVectorShift<1, U, opc, {1,?,?,?,?,?,?},
-                                  V128, V128, vecshiftR64,
-                                  asm, ".2d", ".2d",
-      [(set (v2f64 V128:$Rd), (OpNode (v2i64 V128:$Rn), (i32 imm:$imm)))]> {
-    bits<6> imm;
-    let Inst{21-16} = imm;
-  }
-}
-
-multiclass SIMDVectorRShiftNarrowBHS<bit U, bits<5> opc, string asm,
-                                     SDPatternOperator OpNode> {
-  def v8i8_shift : BaseSIMDVectorShift<0, U, opc, {0,0,0,1,?,?,?},
-                                  V64, V128, vecshiftR16Narrow,
-                                  asm, ".8b", ".8h",
-      [(set (v8i8 V64:$Rd), (OpNode (v8i16 V128:$Rn), vecshiftR16Narrow:$imm))]> {
-    bits<3> imm;
-    let Inst{18-16} = imm;
-  }
-
-  def v16i8_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,0,1,?,?,?},
-                                  V128, V128, vecshiftR16Narrow,
-                                  asm#"2", ".16b", ".8h", []> {
-    bits<3> imm;
-    let Inst{18-16} = imm;
-    let hasSideEffects = 0;
-  }
-
-  def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?},
-                                  V64, V128, vecshiftR32Narrow,
-                                  asm, ".4h", ".4s",
-      [(set (v4i16 V64:$Rd), (OpNode (v4i32 V128:$Rn), vecshiftR32Narrow:$imm))]> {
-    bits<4> imm;
-    let Inst{19-16} = imm;
-  }
-
-  def v8i16_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,1,?,?,?,?},
-                                  V128, V128, vecshiftR32Narrow,
-                                  asm#"2", ".8h", ".4s", []> {
-    bits<4> imm;
-    let Inst{19-16} = imm;
-    let hasSideEffects = 0;
-  }
-
-  def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?},
-                                  V64, V128, vecshiftR64Narrow,
-                                  asm, ".2s", ".2d",
-      [(set (v2i32 V64:$Rd), (OpNode (v2i64 V128:$Rn), vecshiftR64Narrow:$imm))]> {
-    bits<5> imm;
-    let Inst{20-16} = imm;
-  }
-
-  def v4i32_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,1,?,?,?,?,?},
-                                  V128, V128, vecshiftR64Narrow,
-                                  asm#"2", ".4s", ".2d", []> {
-    bits<5> imm;
-    let Inst{20-16} = imm;
-    let hasSideEffects = 0;
-  }
-
-  // TableGen doesn't like patters w/ INSERT_SUBREG on the instructions
-  // themselves, so put them here instead.
-
-  // Patterns involving what's effectively an insert high and a normal
-  // intrinsic, represented by CONCAT_VECTORS.
-  def : Pat<(concat_vectors (v8i8 V64:$Rd),(OpNode (v8i16 V128:$Rn),
-                                                   vecshiftR16Narrow:$imm)),
-            (!cast<Instruction>(NAME # "v16i8_shift")
-                (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub),
-                V128:$Rn, vecshiftR16Narrow:$imm)>;
-  def : Pat<(concat_vectors (v4i16 V64:$Rd), (OpNode (v4i32 V128:$Rn),
-                                                     vecshiftR32Narrow:$imm)),
-            (!cast<Instruction>(NAME # "v8i16_shift")
-                (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub),
-                V128:$Rn, vecshiftR32Narrow:$imm)>;
-  def : Pat<(concat_vectors (v2i32 V64:$Rd), (OpNode (v2i64 V128:$Rn),
-                                                     vecshiftR64Narrow:$imm)),
-            (!cast<Instruction>(NAME # "v4i32_shift")
-                (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub),
-                V128:$Rn, vecshiftR64Narrow:$imm)>;
-}
-
-multiclass SIMDVectorLShiftBHSD<bit U, bits<5> opc, string asm,
-                                SDPatternOperator OpNode> {
-  def v8i8_shift : BaseSIMDVectorShift<0, U, opc, {0,0,0,1,?,?,?},
-                                  V64, V64, vecshiftL8,
-                                  asm, ".8b", ".8b",
-                 [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn),
-                       (i32 vecshiftL8:$imm)))]> {
-    bits<3> imm;
-    let Inst{18-16} = imm;
-  }
-
-  def v16i8_shift : BaseSIMDVectorShift<1, U, opc, {0,0,0,1,?,?,?},
-                                  V128, V128, vecshiftL8,
-                                  asm, ".16b", ".16b",
-             [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn),
-                   (i32 vecshiftL8:$imm)))]> {
-    bits<3> imm;
-    let Inst{18-16} = imm;
-  }
-
-  def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?},
-                                  V64, V64, vecshiftL16,
-                                  asm, ".4h", ".4h",
-              [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn),
-                    (i32 vecshiftL16:$imm)))]> {
-    bits<4> imm;
-    let Inst{19-16} = imm;
-  }
-
-  def v8i16_shift : BaseSIMDVectorShift<1, U, opc, {0,0,1,?,?,?,?},
-                                  V128, V128, vecshiftL16,
-                                  asm, ".8h", ".8h",
-            [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn),
-                  (i32 vecshiftL16:$imm)))]> {
-    bits<4> imm;
-    let Inst{19-16} = imm;
-  }
-
-  def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?},
-                                  V64, V64, vecshiftL32,
-                                  asm, ".2s", ".2s",
-              [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn),
-                    (i32 vecshiftL32:$imm)))]> {
-    bits<5> imm;
-    let Inst{20-16} = imm;
-  }
-
-  def v4i32_shift : BaseSIMDVectorShift<1, U, opc, {0,1,?,?,?,?,?},
-                                  V128, V128, vecshiftL32,
-                                  asm, ".4s", ".4s",
-            [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn),
-                  (i32 vecshiftL32:$imm)))]> {
-    bits<5> imm;
-    let Inst{20-16} = imm;
-  }
-
-  def v2i64_shift : BaseSIMDVectorShift<1, U, opc, {1,?,?,?,?,?,?},
-                                  V128, V128, vecshiftL64,
-                                  asm, ".2d", ".2d",
-            [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn),
-                  (i32 vecshiftL64:$imm)))]> {
-    bits<6> imm;
-    let Inst{21-16} = imm;
-  }
-}
-
-multiclass SIMDVectorRShiftBHSD<bit U, bits<5> opc, string asm,
-                                SDPatternOperator OpNode> {
-  def v8i8_shift : BaseSIMDVectorShift<0, U, opc, {0,0,0,1,?,?,?},
-                                  V64, V64, vecshiftR8,
-                                  asm, ".8b", ".8b",
-                 [(set (v8i8 V64:$Rd), (OpNode (v8i8 V64:$Rn),
-                       (i32 vecshiftR8:$imm)))]> {
-    bits<3> imm;
-    let Inst{18-16} = imm;
-  }
-
-  def v16i8_shift : BaseSIMDVectorShift<1, U, opc, {0,0,0,1,?,?,?},
-                                  V128, V128, vecshiftR8,
-                                  asm, ".16b", ".16b",
-             [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn),
-                   (i32 vecshiftR8:$imm)))]> {
-    bits<3> imm;
-    let Inst{18-16} = imm;
-  }
-
-  def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?},
-                                  V64, V64, vecshiftR16,
-                                  asm, ".4h", ".4h",
-              [(set (v4i16 V64:$Rd), (OpNode (v4i16 V64:$Rn),
-                    (i32 vecshiftR16:$imm)))]> {
-    bits<4> imm;
-    let Inst{19-16} = imm;
-  }
-
-  def v8i16_shift : BaseSIMDVectorShift<1, U, opc, {0,0,1,?,?,?,?},
-                                  V128, V128, vecshiftR16,
-                                  asm, ".8h", ".8h",
-            [(set (v8i16 V128:$Rd), (OpNode (v8i16 V128:$Rn),
-                  (i32 vecshiftR16:$imm)))]> {
-    bits<4> imm;
-    let Inst{19-16} = imm;
-  }
-
-  def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?},
-                                  V64, V64, vecshiftR32,
-                                  asm, ".2s", ".2s",
-              [(set (v2i32 V64:$Rd), (OpNode (v2i32 V64:$Rn),
-                    (i32 vecshiftR32:$imm)))]> {
-    bits<5> imm;
-    let Inst{20-16} = imm;
-  }
-
-  def v4i32_shift : BaseSIMDVectorShift<1, U, opc, {0,1,?,?,?,?,?},
-                                  V128, V128, vecshiftR32,
-                                  asm, ".4s", ".4s",
-            [(set (v4i32 V128:$Rd), (OpNode (v4i32 V128:$Rn),
-                  (i32 vecshiftR32:$imm)))]> {
-    bits<5> imm;
-    let Inst{20-16} = imm;
-  }
-
-  def v2i64_shift : BaseSIMDVectorShift<1, U, opc, {1,?,?,?,?,?,?},
-                                  V128, V128, vecshiftR64,
-                                  asm, ".2d", ".2d",
-            [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn),
-                  (i32 vecshiftR64:$imm)))]> {
-    bits<6> imm;
-    let Inst{21-16} = imm;
-  }
-}
-
-let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
-multiclass SIMDVectorRShiftBHSDTied<bit U, bits<5> opc, string asm,
-                                    SDPatternOperator OpNode = null_frag> {
-  def v8i8_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,0,0,1,?,?,?},
-                                  V64, V64, vecshiftR8, asm, ".8b", ".8b",
-                 [(set (v8i8 V64:$dst),
-                   (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn),
-                           (i32 vecshiftR8:$imm)))]> {
-    bits<3> imm;
-    let Inst{18-16} = imm;
-  }
-
-  def v16i8_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,0,1,?,?,?},
-                                  V128, V128, vecshiftR8, asm, ".16b", ".16b",
-             [(set (v16i8 V128:$dst),
-               (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn),
-                       (i32 vecshiftR8:$imm)))]> {
-    bits<3> imm;
-    let Inst{18-16} = imm;
-  }
-
-  def v4i16_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,0,1,?,?,?,?},
-                                  V64, V64, vecshiftR16, asm, ".4h", ".4h",
-              [(set (v4i16 V64:$dst),
-                (OpNode (v4i16 V64:$Rd), (v4i16 V64:$Rn),
-                        (i32 vecshiftR16:$imm)))]> {
-    bits<4> imm;
-    let Inst{19-16} = imm;
-  }
-
-  def v8i16_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,1,?,?,?,?},
-                                  V128, V128, vecshiftR16, asm, ".8h", ".8h",
-            [(set (v8i16 V128:$dst),
-              (OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn),
-                      (i32 vecshiftR16:$imm)))]> {
-    bits<4> imm;
-    let Inst{19-16} = imm;
-  }
-
-  def v2i32_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,1,?,?,?,?,?},
-                                  V64, V64, vecshiftR32, asm, ".2s", ".2s",
-              [(set (v2i32 V64:$dst),
-                (OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn),
-                        (i32 vecshiftR32:$imm)))]> {
-    bits<5> imm;
-    let Inst{20-16} = imm;
-  }
-
-  def v4i32_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,1,?,?,?,?,?},
-                                  V128, V128, vecshiftR32, asm, ".4s", ".4s",
-            [(set (v4i32 V128:$dst),
-              (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn),
-                      (i32 vecshiftR32:$imm)))]> {
-    bits<5> imm;
-    let Inst{20-16} = imm;
-  }
-
-  def v2i64_shift : BaseSIMDVectorShiftTied<1, U, opc, {1,?,?,?,?,?,?},
-                                  V128, V128, vecshiftR64,
-                                  asm, ".2d", ".2d", [(set (v2i64 V128:$dst),
-              (OpNode (v2i64 V128:$Rd), (v2i64 V128:$Rn),
-                      (i32 vecshiftR64:$imm)))]> {
-    bits<6> imm;
-    let Inst{21-16} = imm;
-  }
-}
-
-multiclass SIMDVectorLShiftBHSDTied<bit U, bits<5> opc, string asm,
-                                    SDPatternOperator OpNode = null_frag> {
-  def v8i8_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,0,0,1,?,?,?},
-                                  V64, V64, vecshiftL8,
-                                  asm, ".8b", ".8b",
-                    [(set (v8i8 V64:$dst),
-                          (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn),
-                                  (i32 vecshiftL8:$imm)))]> {
-    bits<3> imm;
-    let Inst{18-16} = imm;
-  }
-
-  def v16i8_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,0,1,?,?,?},
-                                  V128, V128, vecshiftL8,
-                                  asm, ".16b", ".16b",
-                    [(set (v16i8 V128:$dst),
-                          (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn),
-                                  (i32 vecshiftL8:$imm)))]> {
-    bits<3> imm;
-    let Inst{18-16} = imm;
-  }
-
-  def v4i16_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,0,1,?,?,?,?},
-                                  V64, V64, vecshiftL16,
-                                  asm, ".4h", ".4h",
-                    [(set (v4i16 V64:$dst),
-                           (OpNode (v4i16 V64:$Rd), (v4i16 V64:$Rn),
-                                   (i32 vecshiftL16:$imm)))]> {
-    bits<4> imm;
-    let Inst{19-16} = imm;
-  }
-
-  def v8i16_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,0,1,?,?,?,?},
-                                  V128, V128, vecshiftL16,
-                                  asm, ".8h", ".8h",
-                    [(set (v8i16 V128:$dst),
-                          (OpNode (v8i16 V128:$Rd), (v8i16 V128:$Rn),
-                                  (i32 vecshiftL16:$imm)))]> {
-    bits<4> imm;
-    let Inst{19-16} = imm;
-  }
-
-  def v2i32_shift : BaseSIMDVectorShiftTied<0, U, opc, {0,1,?,?,?,?,?},
-                                  V64, V64, vecshiftL32,
-                                  asm, ".2s", ".2s",
-                    [(set (v2i32 V64:$dst),
-                          (OpNode (v2i32 V64:$Rd), (v2i32 V64:$Rn),
-                                  (i32 vecshiftL32:$imm)))]> {
-    bits<5> imm;
-    let Inst{20-16} = imm;
-  }
-
-  def v4i32_shift : BaseSIMDVectorShiftTied<1, U, opc, {0,1,?,?,?,?,?},
-                                  V128, V128, vecshiftL32,
-                                  asm, ".4s", ".4s",
-                    [(set (v4i32 V128:$dst),
-                          (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn),
-                                  (i32 vecshiftL32:$imm)))]> {
-    bits<5> imm;
-    let Inst{20-16} = imm;
-  }
-
-  def v2i64_shift : BaseSIMDVectorShiftTied<1, U, opc, {1,?,?,?,?,?,?},
-                                  V128, V128, vecshiftL64,
-                                  asm, ".2d", ".2d",
-                    [(set (v2i64 V128:$dst),
-                          (OpNode (v2i64 V128:$Rd), (v2i64 V128:$Rn),
-                                  (i32 vecshiftL64:$imm)))]> {
-    bits<6> imm;
-    let Inst{21-16} = imm;
-  }
-}
-
-multiclass SIMDVectorLShiftLongBHSD<bit U, bits<5> opc, string asm,
-                                   SDPatternOperator OpNode> {
-  def v8i8_shift : BaseSIMDVectorShift<0, U, opc, {0,0,0,1,?,?,?},
-                                  V128, V64, vecshiftL8, asm, ".8h", ".8b",
-      [(set (v8i16 V128:$Rd), (OpNode (v8i8 V64:$Rn), vecshiftL8:$imm))]> {
-    bits<3> imm;
-    let Inst{18-16} = imm;
-  }
-
-  def v16i8_shift : BaseSIMDVectorShift<1, U, opc, {0,0,0,1,?,?,?},
-                                  V128, V128, vecshiftL8,
-                                  asm#"2", ".8h", ".16b",
-      [(set (v8i16 V128:$Rd),
-            (OpNode (extract_high_v16i8 V128:$Rn), vecshiftL8:$imm))]> {
-    bits<3> imm;
-    let Inst{18-16} = imm;
-  }
-
-  def v4i16_shift : BaseSIMDVectorShift<0, U, opc, {0,0,1,?,?,?,?},
-                                  V128, V64, vecshiftL16, asm, ".4s", ".4h",
-      [(set (v4i32 V128:$Rd), (OpNode (v4i16 V64:$Rn), vecshiftL16:$imm))]> {
-    bits<4> imm;
-    let Inst{19-16} = imm;
-  }
-
-  def v8i16_shift : BaseSIMDVectorShift<1, U, opc, {0,0,1,?,?,?,?},
-                                  V128, V128, vecshiftL16,
-                                  asm#"2", ".4s", ".8h",
-      [(set (v4i32 V128:$Rd),
-            (OpNode (extract_high_v8i16 V128:$Rn), vecshiftL16:$imm))]> {
-
-    bits<4> imm;
-    let Inst{19-16} = imm;
-  }
-
-  def v2i32_shift : BaseSIMDVectorShift<0, U, opc, {0,1,?,?,?,?,?},
-                                  V128, V64, vecshiftL32, asm, ".2d", ".2s",
-      [(set (v2i64 V128:$Rd), (OpNode (v2i32 V64:$Rn), vecshiftL32:$imm))]> {
-    bits<5> imm;
-    let Inst{20-16} = imm;
-  }
-
-  def v4i32_shift : BaseSIMDVectorShift<1, U, opc, {0,1,?,?,?,?,?},
-                                  V128, V128, vecshiftL32,
-                                  asm#"2", ".2d", ".4s",
-      [(set (v2i64 V128:$Rd),
-            (OpNode (extract_high_v4i32 V128:$Rn), vecshiftL32:$imm))]> {
-    bits<5> imm;
-    let Inst{20-16} = imm;
-  }
-}
-
-
-//---
-// Vector load/store
-//---
-// SIMD ldX/stX no-index memory references don't allow the optional
-// ", #0" constant and handle post-indexing explicitly, so we use
-// a more specialized parse method for them. Otherwise, it's the same as
-// the general am_noindex handling.
-def MemorySIMDNoIndexOperand : AsmOperandClass {
-  let Name = "MemorySIMDNoIndex";
-  let ParserMethod = "tryParseNoIndexMemory";
-}
-def am_simdnoindex : Operand<i64>,
-                     ComplexPattern<i64, 1, "SelectAddrModeNoIndex", []> {
-  let PrintMethod = "printAMNoIndex";
-  let ParserMatchClass = MemorySIMDNoIndexOperand;
-  let MIOperandInfo = (ops GPR64sp:$base);
-  let DecoderMethod = "DecodeGPR64spRegisterClass";
-}
-
-class BaseSIMDLdSt<bit Q, bit L, bits<4> opcode, bits<2> size,
-                   string asm, dag oops, dag iops, list<dag> pattern>
-  : I<oops, iops, asm, "\t$Vt, $vaddr", "", pattern> {
-  bits<5> Vt;
-  bits<5> vaddr;
-  let Inst{31} = 0;
-  let Inst{30} = Q;
-  let Inst{29-23} = 0b0011000;
-  let Inst{22} = L;
-  let Inst{21-16} = 0b000000;
-  let Inst{15-12} = opcode;
-  let Inst{11-10} = size;
-  let Inst{9-5} = vaddr;
-  let Inst{4-0} = Vt;
-}
-
-class BaseSIMDLdStPost<bit Q, bit L, bits<4> opcode, bits<2> size,
-                       string asm, dag oops, dag iops>
-  : I<oops, iops, asm, "\t$Vt, $vaddr, $Xm", "", []> {
-  bits<5> Vt;
-  bits<5> vaddr;
-  bits<5> Xm;
-  let Inst{31} = 0;
-  let Inst{30} = Q;
-  let Inst{29-23} = 0b0011001;
-  let Inst{22} = L;
-  let Inst{21} = 0;
-  let Inst{20-16} = Xm;
-  let Inst{15-12} = opcode;
-  let Inst{11-10} = size;
-  let Inst{9-5} = vaddr;
-  let Inst{4-0} = Vt;
-  let DecoderMethod = "DecodeSIMDLdStPost";
-}
-
-// The immediate form of AdvSIMD post-indexed addressing is encoded with
-// register post-index addressing from the zero register.
-multiclass SIMDLdStAliases<string asm, string layout, string Count,
-                           int Offset, int Size> {
-  // E.g. "ld1 { v0.8b, v1.8b }, [x1], #16"
-  //      "ld1\t$Vt, $vaddr, #16"
-  // may get mapped to
-  //      (LD1Twov8b_POST VecListTwo8b:$Vt, am_simdnoindex:$vaddr, XZR)
-  def : InstAlias<asm # "\t$Vt, $vaddr, #" # Offset,
-                  (!cast<Instruction>(NAME # Count # "v" # layout # "_POST")
-                      !cast<RegisterOperand>("VecList" # Count # layout):$Vt,
-                      am_simdnoindex:$vaddr, XZR), 1>;
-
-  // E.g. "ld1.8b { v0, v1 }, [x1], #16"
-  //      "ld1.8b\t$Vt, $vaddr, #16"
-  // may get mapped to
-  //      (LD1Twov8b_POST VecListTwo64:$Vt, am_simdnoindex:$vaddr, XZR)
-  def : InstAlias<asm # "." # layout # "\t$Vt, $vaddr, #" # Offset,
-                  (!cast<Instruction>(NAME # Count # "v" # layout # "_POST")
-                      !cast<RegisterOperand>("VecList" # Count # Size):$Vt,
-                      am_simdnoindex:$vaddr, XZR), 0>;
-
-  // E.g. "ld1.8b { v0, v1 }, [x1]"
-  //      "ld1\t$Vt, $vaddr"
-  // may get mapped to
-  //      (LD1Twov8b VecListTwo64:$Vt, am_simdnoindex:$vaddr)
-  def : InstAlias<asm # "." # layout # "\t$Vt, $vaddr",
-                  (!cast<Instruction>(NAME # Count # "v" # layout)
-                      !cast<RegisterOperand>("VecList" # Count # Size):$Vt,
-                      am_simdnoindex:$vaddr), 0>;
-
-  // E.g. "ld1.8b { v0, v1 }, [x1], x2"
-  //      "ld1\t$Vt, $vaddr, $Xm"
-  // may get mapped to
-  //      (LD1Twov8b_POST VecListTwo64:$Vt, am_simdnoindex:$vaddr, GPR64pi8:$Xm)
-  def : InstAlias<asm # "." # layout # "\t$Vt, $vaddr, $Xm",
-                  (!cast<Instruction>(NAME # Count # "v" # layout # "_POST")
-                      !cast<RegisterOperand>("VecList" # Count # Size):$Vt,
-                      am_simdnoindex:$vaddr,
-                      !cast<RegisterOperand>("GPR64pi" # Offset):$Xm), 0>;
-}
-
-multiclass BaseSIMDLdN<string Count, string asm, string veclist, int Offset128,
-                       int Offset64, bits<4> opcode> {
-  let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in {
-    def v16b: BaseSIMDLdSt<1, 1, opcode, 0b00, asm,
-                           (outs !cast<RegisterOperand>(veclist # "16b"):$Vt),
-                           (ins am_simdnoindex:$vaddr), []>;
-    def v8h : BaseSIMDLdSt<1, 1, opcode, 0b01, asm,
-                           (outs !cast<RegisterOperand>(veclist # "8h"):$Vt),
-                           (ins am_simdnoindex:$vaddr), []>;
-    def v4s : BaseSIMDLdSt<1, 1, opcode, 0b10, asm,
-                           (outs !cast<RegisterOperand>(veclist # "4s"):$Vt),
-                           (ins am_simdnoindex:$vaddr), []>;
-    def v2d : BaseSIMDLdSt<1, 1, opcode, 0b11, asm,
-                           (outs !cast<RegisterOperand>(veclist # "2d"):$Vt),
-                           (ins am_simdnoindex:$vaddr), []>;
-    def v8b : BaseSIMDLdSt<0, 1, opcode, 0b00, asm,
-                           (outs !cast<RegisterOperand>(veclist # "8b"):$Vt),
-                           (ins am_simdnoindex:$vaddr), []>;
-    def v4h : BaseSIMDLdSt<0, 1, opcode, 0b01, asm,
-                           (outs !cast<RegisterOperand>(veclist # "4h"):$Vt),
-                           (ins am_simdnoindex:$vaddr), []>;
-    def v2s : BaseSIMDLdSt<0, 1, opcode, 0b10, asm,
-                           (outs !cast<RegisterOperand>(veclist # "2s"):$Vt),
-                           (ins am_simdnoindex:$vaddr), []>;
-
-
-    def v16b_POST: BaseSIMDLdStPost<1, 1, opcode, 0b00, asm,
-                       (outs !cast<RegisterOperand>(veclist # "16b"):$Vt),
-                       (ins am_simdnoindex:$vaddr,
-                            !cast<RegisterOperand>("GPR64pi" # Offset128):$Xm)>;
-    def v8h_POST : BaseSIMDLdStPost<1, 1, opcode, 0b01, asm,
-                       (outs !cast<RegisterOperand>(veclist # "8h"):$Vt),
-                       (ins am_simdnoindex:$vaddr,
-                            !cast<RegisterOperand>("GPR64pi" # Offset128):$Xm)>;
-    def v4s_POST : BaseSIMDLdStPost<1, 1, opcode, 0b10, asm,
-                       (outs !cast<RegisterOperand>(veclist # "4s"):$Vt),
-                       (ins am_simdnoindex:$vaddr,
-                            !cast<RegisterOperand>("GPR64pi" # Offset128):$Xm)>;
-    def v2d_POST : BaseSIMDLdStPost<1, 1, opcode, 0b11, asm,
-                       (outs !cast<RegisterOperand>(veclist # "2d"):$Vt),
-                       (ins am_simdnoindex:$vaddr,
-                            !cast<RegisterOperand>("GPR64pi" # Offset128):$Xm)>;
-    def v8b_POST : BaseSIMDLdStPost<0, 1, opcode, 0b00, asm,
-                       (outs !cast<RegisterOperand>(veclist # "8b"):$Vt),
-                       (ins am_simdnoindex:$vaddr,
-                            !cast<RegisterOperand>("GPR64pi" # Offset64):$Xm)>;
-    def v4h_POST : BaseSIMDLdStPost<0, 1, opcode, 0b01, asm,
-                       (outs !cast<RegisterOperand>(veclist # "4h"):$Vt),
-                       (ins am_simdnoindex:$vaddr,
-                            !cast<RegisterOperand>("GPR64pi" # Offset64):$Xm)>;
-    def v2s_POST : BaseSIMDLdStPost<0, 1, opcode, 0b10, asm,
-                       (outs !cast<RegisterOperand>(veclist # "2s"):$Vt),
-                       (ins am_simdnoindex:$vaddr,
-                            !cast<RegisterOperand>("GPR64pi" # Offset64):$Xm)>;
-  }
-
-  defm : SIMDLdStAliases<asm, "16b", Count, Offset128, 128>;
-  defm : SIMDLdStAliases<asm, "8h", Count, Offset128, 128>;
-  defm : SIMDLdStAliases<asm, "4s", Count, Offset128, 128>;
-  defm : SIMDLdStAliases<asm, "2d", Count, Offset128, 128>;
-  defm : SIMDLdStAliases<asm, "8b", Count, Offset64, 64>;
-  defm : SIMDLdStAliases<asm, "4h", Count, Offset64, 64>;
-  defm : SIMDLdStAliases<asm, "2s", Count, Offset64, 64>;
-}
-
-// Only ld1/st1 has a v1d version.
-multiclass BaseSIMDStN<string Count, string asm, string veclist, int Offset128,
-                       int Offset64, bits<4> opcode> {
-  let hasSideEffects = 0, mayStore = 1, mayLoad = 0 in {
-    def v16b : BaseSIMDLdSt<1, 0, opcode, 0b00, asm, (outs),
-                            (ins !cast<RegisterOperand>(veclist # "16b"):$Vt,
-                                 am_simdnoindex:$vaddr), []>;
-    def v8h : BaseSIMDLdSt<1, 0, opcode, 0b01, asm, (outs),
-                           (ins !cast<RegisterOperand>(veclist # "8h"):$Vt,
-                                am_simdnoindex:$vaddr), []>;
-    def v4s : BaseSIMDLdSt<1, 0, opcode, 0b10, asm, (outs),
-                           (ins !cast<RegisterOperand>(veclist # "4s"):$Vt,
-                                am_simdnoindex:$vaddr), []>;
-    def v2d : BaseSIMDLdSt<1, 0, opcode, 0b11, asm, (outs),
-                           (ins !cast<RegisterOperand>(veclist # "2d"):$Vt,
-                                am_simdnoindex:$vaddr), []>;
-    def v8b : BaseSIMDLdSt<0, 0, opcode, 0b00, asm, (outs),
-                           (ins !cast<RegisterOperand>(veclist # "8b"):$Vt,
-                                am_simdnoindex:$vaddr), []>;
-    def v4h : BaseSIMDLdSt<0, 0, opcode, 0b01, asm, (outs),
-                           (ins !cast<RegisterOperand>(veclist # "4h"):$Vt,
-                                am_simdnoindex:$vaddr), []>;
-    def v2s : BaseSIMDLdSt<0, 0, opcode, 0b10, asm, (outs),
-                           (ins !cast<RegisterOperand>(veclist # "2s"):$Vt,
-                                am_simdnoindex:$vaddr), []>;
-
-    def v16b_POST : BaseSIMDLdStPost<1, 0, opcode, 0b00, asm, (outs),
-                       (ins !cast<RegisterOperand>(veclist # "16b"):$Vt,
-                            am_simdnoindex:$vaddr,
-                            !cast<RegisterOperand>("GPR64pi" # Offset128):$Xm)>;
-    def v8h_POST : BaseSIMDLdStPost<1, 0, opcode, 0b01, asm, (outs),
-                       (ins !cast<RegisterOperand>(veclist # "8h"):$Vt,
-                            am_simdnoindex:$vaddr,
-                            !cast<RegisterOperand>("GPR64pi" # Offset128):$Xm)>;
-    def v4s_POST : BaseSIMDLdStPost<1, 0, opcode, 0b10, asm, (outs),
-                       (ins !cast<RegisterOperand>(veclist # "4s"):$Vt,
-                            am_simdnoindex:$vaddr,
-                            !cast<RegisterOperand>("GPR64pi" # Offset128):$Xm)>;
-    def v2d_POST : BaseSIMDLdStPost<1, 0, opcode, 0b11, asm, (outs),
-                       (ins !cast<RegisterOperand>(veclist # "2d"):$Vt,
-                            am_simdnoindex:$vaddr,
-                            !cast<RegisterOperand>("GPR64pi" # Offset128):$Xm)>;
-    def v8b_POST : BaseSIMDLdStPost<0, 0, opcode, 0b00, asm, (outs),
-                       (ins !cast<RegisterOperand>(veclist # "8b"):$Vt,
-                            am_simdnoindex:$vaddr,
-                            !cast<RegisterOperand>("GPR64pi" # Offset64):$Xm)>;
-    def v4h_POST : BaseSIMDLdStPost<0, 0, opcode, 0b01, asm, (outs),
-                       (ins !cast<RegisterOperand>(veclist # "4h"):$Vt,
-                            am_simdnoindex:$vaddr,
-                            !cast<RegisterOperand>("GPR64pi" # Offset64):$Xm)>;
-    def v2s_POST : BaseSIMDLdStPost<0, 0, opcode, 0b10, asm, (outs),
-                       (ins !cast<RegisterOperand>(veclist # "2s"):$Vt,
-                            am_simdnoindex:$vaddr,
-                            !cast<RegisterOperand>("GPR64pi" # Offset64):$Xm)>;
-  }
-
-  defm : SIMDLdStAliases<asm, "16b", Count, Offset128, 128>;
-  defm : SIMDLdStAliases<asm, "8h", Count, Offset128, 128>;
-  defm : SIMDLdStAliases<asm, "4s", Count, Offset128, 128>;
-  defm : SIMDLdStAliases<asm, "2d", Count, Offset128, 128>;
-  defm : SIMDLdStAliases<asm, "8b", Count, Offset64, 64>;
-  defm : SIMDLdStAliases<asm, "4h", Count, Offset64, 64>;
-  defm : SIMDLdStAliases<asm, "2s", Count, Offset64, 64>;
-}
-
-multiclass BaseSIMDLd1<string Count, string asm, string veclist,
-                       int Offset128, int Offset64, bits<4> opcode>
-  : BaseSIMDLdN<Count, asm, veclist, Offset128, Offset64, opcode> {
-
-  // LD1 instructions have extra "1d" variants.
-  let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in {
-    def v1d : BaseSIMDLdSt<0, 1, opcode, 0b11, asm,
-                           (outs !cast<RegisterOperand>(veclist # "1d"):$Vt),
-                           (ins am_simdnoindex:$vaddr), []>;
-
-    def v1d_POST : BaseSIMDLdStPost<0, 1, opcode, 0b11, asm,
-                       (outs !cast<RegisterOperand>(veclist # "1d"):$Vt),
-                       (ins am_simdnoindex:$vaddr,
-                            !cast<RegisterOperand>("GPR64pi" # Offset64):$Xm)>;
-  }
-
-  defm : SIMDLdStAliases<asm, "1d", Count, Offset64, 64>;
-}
-
-multiclass BaseSIMDSt1<string Count, string asm, string veclist,
-                       int Offset128, int Offset64, bits<4> opcode>
-  : BaseSIMDStN<Count, asm, veclist, Offset128, Offset64, opcode> {
-
-  // ST1 instructions have extra "1d" variants.
-  let hasSideEffects = 0, mayLoad = 0, mayStore = 1 in {
-    def v1d : BaseSIMDLdSt<0, 0, opcode, 0b11, asm, (outs),
-                           (ins !cast<RegisterOperand>(veclist # "1d"):$Vt,
-                                am_simdnoindex:$vaddr), []>;
-
-    def v1d_POST : BaseSIMDLdStPost<0, 0, opcode, 0b11, asm, (outs),
-                       (ins !cast<RegisterOperand>(veclist # "1d"):$Vt,
-                            am_simdnoindex:$vaddr,
-                            !cast<RegisterOperand>("GPR64pi" # Offset64):$Xm)>;
-  }
-
-  defm : SIMDLdStAliases<asm, "1d", Count, Offset64, 64>;
-}
-
-multiclass SIMDLd1Multiple<string asm> {
-  defm One   : BaseSIMDLd1<"One", asm, "VecListOne", 16, 8,  0b0111>;
-  defm Two   : BaseSIMDLd1<"Two", asm, "VecListTwo", 32, 16, 0b1010>;
-  defm Three : BaseSIMDLd1<"Three", asm, "VecListThree", 48, 24, 0b0110>;
-  defm Four  : BaseSIMDLd1<"Four", asm, "VecListFour", 64, 32, 0b0010>;
-}
-
-multiclass SIMDSt1Multiple<string asm> {
-  defm One   : BaseSIMDSt1<"One", asm, "VecListOne", 16, 8,  0b0111>;
-  defm Two   : BaseSIMDSt1<"Two", asm, "VecListTwo", 32, 16, 0b1010>;
-  defm Three : BaseSIMDSt1<"Three", asm, "VecListThree", 48, 24, 0b0110>;
-  defm Four  : BaseSIMDSt1<"Four", asm, "VecListFour", 64, 32, 0b0010>;
-}
-
-multiclass SIMDLd2Multiple<string asm> {
-  defm Two : BaseSIMDLdN<"Two", asm, "VecListTwo", 32, 16, 0b1000>;
-}
-
-multiclass SIMDSt2Multiple<string asm> {
-  defm Two : BaseSIMDStN<"Two", asm, "VecListTwo", 32, 16, 0b1000>;
-}
-
-multiclass SIMDLd3Multiple<string asm> {
-  defm Three : BaseSIMDLdN<"Three", asm, "VecListThree", 48, 24, 0b0100>;
-}
-
-multiclass SIMDSt3Multiple<string asm> {
-  defm Three : BaseSIMDStN<"Three", asm, "VecListThree", 48, 24, 0b0100>;
-}
-
-multiclass SIMDLd4Multiple<string asm> {
-  defm Four : BaseSIMDLdN<"Four", asm, "VecListFour", 64, 32, 0b0000>;
-}
-
-multiclass SIMDSt4Multiple<string asm> {
-  defm Four : BaseSIMDStN<"Four", asm, "VecListFour", 64, 32, 0b0000>;
-}
-
-//---
-// AdvSIMD Load/store single-element
-//---
-
-class BaseSIMDLdStSingle<bit L, bit R, bits<3> opcode,
-                         string asm, string operands, dag oops, dag iops,
-                         list<dag> pattern>
-  : I<oops, iops, asm, operands, "", pattern> {
-  bits<5> Vt;
-  bits<5> vaddr;
-  let Inst{31} = 0;
-  let Inst{29-24} = 0b001101;
-  let Inst{22} = L;
-  let Inst{21} = R;
-  let Inst{15-13} = opcode;
-  let Inst{9-5} = vaddr;
-  let Inst{4-0} = Vt;
-  let DecoderMethod = "DecodeSIMDLdStSingle";
-}
-
-class BaseSIMDLdStSingleTied<bit L, bit R, bits<3> opcode,
-                         string asm, string operands, dag oops, dag iops,
-                         list<dag> pattern>
-  : I<oops, iops, asm, operands, "$Vt = $dst", pattern> {
-  bits<5> Vt;
-  bits<5> vaddr;
-  let Inst{31} = 0;
-  let Inst{29-24} = 0b001101;
-  let Inst{22} = L;
-  let Inst{21} = R;
-  let Inst{15-13} = opcode;
-  let Inst{9-5} = vaddr;
-  let Inst{4-0} = Vt;
-  let DecoderMethod = "DecodeSIMDLdStSingleTied";
-}
-
-
-let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
-class BaseSIMDLdR<bit Q, bit R, bits<3> opcode, bit S, bits<2> size, string asm,
-                  Operand listtype>
-  : BaseSIMDLdStSingle<1, R, opcode, asm, "\t$Vt, $vaddr",
-                       (outs listtype:$Vt), (ins am_simdnoindex:$vaddr), []> {
-  let Inst{30} = Q;
-  let Inst{23} = 0;
-  let Inst{20-16} = 0b00000;
-  let Inst{12} = S;
-  let Inst{11-10} = size;
-}
-let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
-class BaseSIMDLdRPost<bit Q, bit R, bits<3> opcode, bit S, bits<2> size,
-                      string asm, Operand listtype, Operand GPR64pi>
-  : BaseSIMDLdStSingle<1, R, opcode, asm, "\t$Vt, $vaddr, $Xm",
-                       (outs listtype:$Vt),
-                       (ins am_simdnoindex:$vaddr, GPR64pi:$Xm), []> {
-  bits<5> Xm;
-  let Inst{30} = Q;
-  let Inst{23} = 1;
-  let Inst{20-16} = Xm;
-  let Inst{12} = S;
-  let Inst{11-10} = size;
-}
-
-multiclass SIMDLdrAliases<string asm, string layout, string Count,
-                          int Offset, int Size> {
-  // E.g. "ld1r { v0.8b }, [x1], #1"
-  //      "ld1r.8b\t$Vt, $vaddr, #1"
-  // may get mapped to
-  //      (LD1Rv8b_POST VecListOne8b:$Vt, am_simdnoindex:$vaddr, XZR)
-  def : InstAlias<asm # "\t$Vt, $vaddr, #" # Offset,
-                  (!cast<Instruction>(NAME # "v" # layout # "_POST")
-                      !cast<RegisterOperand>("VecList" # Count # layout):$Vt,
-                      am_simdnoindex:$vaddr, XZR), 1>;
-
-  // E.g. "ld1r.8b { v0 }, [x1], #1"
-  //      "ld1r.8b\t$Vt, $vaddr, #1"
-  // may get mapped to
-  //      (LD1Rv8b_POST VecListOne64:$Vt, am_simdnoindex:$vaddr, XZR)
-  def : InstAlias<asm # "." # layout # "\t$Vt, $vaddr, #" # Offset,
-                  (!cast<Instruction>(NAME # "v" # layout # "_POST")
-                      !cast<RegisterOperand>("VecList" # Count # Size):$Vt,
-                      am_simdnoindex:$vaddr, XZR), 0>;
-
-  // E.g. "ld1r.8b { v0 }, [x1]"
-  //      "ld1r.8b\t$Vt, $vaddr"
-  // may get mapped to
-  //      (LD1Rv8b VecListOne64:$Vt, am_simdnoindex:$vaddr)
-  def : InstAlias<asm # "." # layout # "\t$Vt, $vaddr",
-                  (!cast<Instruction>(NAME # "v" # layout)
-                      !cast<RegisterOperand>("VecList" # Count # Size):$Vt,
-                      am_simdnoindex:$vaddr), 0>;
-
-  // E.g. "ld1r.8b { v0 }, [x1], x2"
-  //      "ld1r.8b\t$Vt, $vaddr, $Xm"
-  // may get mapped to
-  //      (LD1Rv8b_POST VecListOne64:$Vt, am_simdnoindex:$vaddr, GPR64pi1:$Xm)
-  def : InstAlias<asm # "." # layout # "\t$Vt, $vaddr, $Xm",
-                  (!cast<Instruction>(NAME # "v" # layout # "_POST")
-                      !cast<RegisterOperand>("VecList" # Count # Size):$Vt,
-                      am_simdnoindex:$vaddr,
-                      !cast<RegisterOperand>("GPR64pi" # Offset):$Xm), 0>;
-}
-
-multiclass SIMDLdR<bit R, bits<3> opcode, bit S, string asm, string Count,
-  int Offset1, int Offset2, int Offset4, int Offset8> {
-  def v8b : BaseSIMDLdR<0, R, opcode, S, 0b00, asm,
-                        !cast<Operand>("VecList" # Count # "8b")>;
-  def v16b: BaseSIMDLdR<1, R, opcode, S, 0b00, asm,
-                        !cast<Operand>("VecList" # Count #"16b")>;
-  def v4h : BaseSIMDLdR<0, R, opcode, S, 0b01, asm,
-                        !cast<Operand>("VecList" # Count #"4h")>;
-  def v8h : BaseSIMDLdR<1, R, opcode, S, 0b01, asm,
-                        !cast<Operand>("VecList" # Count #"8h")>;
-  def v2s : BaseSIMDLdR<0, R, opcode, S, 0b10, asm,
-                        !cast<Operand>("VecList" # Count #"2s")>;
-  def v4s : BaseSIMDLdR<1, R, opcode, S, 0b10, asm,
-                        !cast<Operand>("VecList" # Count #"4s")>;
-  def v1d : BaseSIMDLdR<0, R, opcode, S, 0b11, asm,
-                        !cast<Operand>("VecList" # Count #"1d")>;
-  def v2d : BaseSIMDLdR<1, R, opcode, S, 0b11, asm,
-                        !cast<Operand>("VecList" # Count #"2d")>;
-
-  def v8b_POST : BaseSIMDLdRPost<0, R, opcode, S, 0b00, asm,
-                                 !cast<Operand>("VecList" # Count # "8b"),
-                                 !cast<Operand>("GPR64pi" # Offset1)>;
-  def v16b_POST: BaseSIMDLdRPost<1, R, opcode, S, 0b00, asm,
-                                 !cast<Operand>("VecList" # Count # "16b"),
-                                 !cast<Operand>("GPR64pi" # Offset1)>;
-  def v4h_POST : BaseSIMDLdRPost<0, R, opcode, S, 0b01, asm,
-                                 !cast<Operand>("VecList" # Count # "4h"),
-                                 !cast<Operand>("GPR64pi" # Offset2)>;
-  def v8h_POST : BaseSIMDLdRPost<1, R, opcode, S, 0b01, asm,
-                                 !cast<Operand>("VecList" # Count # "8h"),
-                                 !cast<Operand>("GPR64pi" # Offset2)>;
-  def v2s_POST : BaseSIMDLdRPost<0, R, opcode, S, 0b10, asm,
-                                 !cast<Operand>("VecList" # Count # "2s"),
-                                 !cast<Operand>("GPR64pi" # Offset4)>;
-  def v4s_POST : BaseSIMDLdRPost<1, R, opcode, S, 0b10, asm,
-                                 !cast<Operand>("VecList" # Count # "4s"),
-                                 !cast<Operand>("GPR64pi" # Offset4)>;
-  def v1d_POST : BaseSIMDLdRPost<0, R, opcode, S, 0b11, asm,
-                                 !cast<Operand>("VecList" # Count # "1d"),
-                                 !cast<Operand>("GPR64pi" # Offset8)>;
-  def v2d_POST : BaseSIMDLdRPost<1, R, opcode, S, 0b11, asm,
-                                 !cast<Operand>("VecList" # Count # "2d"),
-                                 !cast<Operand>("GPR64pi" # Offset8)>;
-
-  defm : SIMDLdrAliases<asm, "8b",  Count, Offset1,  64>;
-  defm : SIMDLdrAliases<asm, "16b", Count, Offset1, 128>;
-  defm : SIMDLdrAliases<asm, "4h",  Count, Offset2,  64>;
-  defm : SIMDLdrAliases<asm, "8h",  Count, Offset2, 128>;
-  defm : SIMDLdrAliases<asm, "2s",  Count, Offset4,  64>;
-  defm : SIMDLdrAliases<asm, "4s",  Count, Offset4, 128>;
-  defm : SIMDLdrAliases<asm, "1d",  Count, Offset8,  64>;
-  defm : SIMDLdrAliases<asm, "2d",  Count, Offset8, 128>;
-}
-
-class SIMDLdStSingleB<bit L, bit R, bits<3> opcode, string asm,
-                      dag oops, dag iops, list<dag> pattern>
-  : BaseSIMDLdStSingle<L, R, opcode, asm, "\t$Vt$idx, $vaddr", oops, iops,
-                       pattern> {
-  // idx encoded in Q:S:size fields.
-  bits<4> idx;
-  let Inst{30} = idx{3};
-  let Inst{23} = 0;
-  let Inst{20-16} = 0b00000;
-  let Inst{12} = idx{2};
-  let Inst{11-10} = idx{1-0};
-}
-class SIMDLdStSingleBTied<bit L, bit R, bits<3> opcode, string asm,
-                      dag oops, dag iops, list<dag> pattern>
-  : BaseSIMDLdStSingleTied<L, R, opcode, asm, "\t$Vt$idx, $vaddr", oops, iops,
-                       pattern> {
-  // idx encoded in Q:S:size fields.
-  bits<4> idx;
-  let Inst{30} = idx{3};
-  let Inst{23} = 0;
-  let Inst{20-16} = 0b00000;
-  let Inst{12} = idx{2};
-  let Inst{11-10} = idx{1-0};
-}
-class SIMDLdStSingleBPost<bit L, bit R, bits<3> opcode, string asm,
-                          dag oops, dag iops>
-  : BaseSIMDLdStSingle<L, R, opcode, asm, "\t$Vt$idx, $vaddr, $Xm",
-                       oops, iops, []> {
-  // idx encoded in Q:S:size fields.
-  bits<4> idx;
-  bits<5> Xm;
-  let Inst{30} = idx{3};
-  let Inst{23} = 1;
-  let Inst{20-16} = Xm;
-  let Inst{12} = idx{2};
-  let Inst{11-10} = idx{1-0};
-}
-class SIMDLdStSingleBTiedPost<bit L, bit R, bits<3> opcode, string asm,
-                          dag oops, dag iops>
-  : BaseSIMDLdStSingleTied<L, R, opcode, asm, "\t$Vt$idx, $vaddr, $Xm",
-                       oops, iops, []> {
-  // idx encoded in Q:S:size fields.
-  bits<4> idx;
-  bits<5> Xm;
-  let Inst{30} = idx{3};
-  let Inst{23} = 1;
-  let Inst{20-16} = Xm;
-  let Inst{12} = idx{2};
-  let Inst{11-10} = idx{1-0};
-}
-
-class SIMDLdStSingleH<bit L, bit R, bits<3> opcode, bit size, string asm,
-                      dag oops, dag iops, list<dag> pattern>
-  : BaseSIMDLdStSingle<L, R, opcode, asm, "\t$Vt$idx, $vaddr", oops, iops,
-                       pattern> {
-  // idx encoded in Q:S:size<1> fields.
-  bits<3> idx;
-  let Inst{30} = idx{2};
-  let Inst{23} = 0;
-  let Inst{20-16} = 0b00000;
-  let Inst{12} = idx{1};
-  let Inst{11} = idx{0};
-  let Inst{10} = size;
-}
-class SIMDLdStSingleHTied<bit L, bit R, bits<3> opcode, bit size, string asm,
-                      dag oops, dag iops, list<dag> pattern>
-  : BaseSIMDLdStSingleTied<L, R, opcode, asm, "\t$Vt$idx, $vaddr", oops, iops,
-                       pattern> {
-  // idx encoded in Q:S:size<1> fields.
-  bits<3> idx;
-  let Inst{30} = idx{2};
-  let Inst{23} = 0;
-  let Inst{20-16} = 0b00000;
-  let Inst{12} = idx{1};
-  let Inst{11} = idx{0};
-  let Inst{10} = size;
-}
-
-class SIMDLdStSingleHPost<bit L, bit R, bits<3> opcode, bit size, string asm,
-                          dag oops, dag iops>
-  : BaseSIMDLdStSingle<L, R, opcode, asm, "\t$Vt$idx, $vaddr, $Xm",
-                       oops, iops, []> {
-  // idx encoded in Q:S:size<1> fields.
-  bits<3> idx;
-  bits<5> Xm;
-  let Inst{30} = idx{2};
-  let Inst{23} = 1;
-  let Inst{20-16} = Xm;
-  let Inst{12} = idx{1};
-  let Inst{11} = idx{0};
-  let Inst{10} = size;
-}
-class SIMDLdStSingleHTiedPost<bit L, bit R, bits<3> opcode, bit size, string asm,
-                          dag oops, dag iops>
-  : BaseSIMDLdStSingleTied<L, R, opcode, asm, "\t$Vt$idx, $vaddr, $Xm",
-                       oops, iops, []> {
-  // idx encoded in Q:S:size<1> fields.
-  bits<3> idx;
-  bits<5> Xm;
-  let Inst{30} = idx{2};
-  let Inst{23} = 1;
-  let Inst{20-16} = Xm;
-  let Inst{12} = idx{1};
-  let Inst{11} = idx{0};
-  let Inst{10} = size;
-}
-class SIMDLdStSingleS<bit L, bit R, bits<3> opcode, bits<2> size, string asm,
-                      dag oops, dag iops, list<dag> pattern>
-  : BaseSIMDLdStSingle<L, R, opcode, asm, "\t$Vt$idx, $vaddr", oops, iops,
-                       pattern> {
-  // idx encoded in Q:S fields.
-  bits<2> idx;
-  let Inst{30} = idx{1};
-  let Inst{23} = 0;
-  let Inst{20-16} = 0b00000;
-  let Inst{12} = idx{0};
-  let Inst{11-10} = size;
-}
-class SIMDLdStSingleSTied<bit L, bit R, bits<3> opcode, bits<2> size, string asm,
-                      dag oops, dag iops, list<dag> pattern>
-  : BaseSIMDLdStSingleTied<L, R, opcode, asm, "\t$Vt$idx, $vaddr", oops, iops,
-                       pattern> {
-  // idx encoded in Q:S fields.
-  bits<2> idx;
-  let Inst{30} = idx{1};
-  let Inst{23} = 0;
-  let Inst{20-16} = 0b00000;
-  let Inst{12} = idx{0};
-  let Inst{11-10} = size;
-}
-class SIMDLdStSingleSPost<bit L, bit R, bits<3> opcode, bits<2> size,
-                          string asm, dag oops, dag iops>
-  : BaseSIMDLdStSingle<L, R, opcode, asm, "\t$Vt$idx, $vaddr, $Xm",
-                       oops, iops, []> {
-  // idx encoded in Q:S fields.
-  bits<2> idx;
-  bits<5> Xm;
-  let Inst{30} = idx{1};
-  let Inst{23} = 1;
-  let Inst{20-16} = Xm;
-  let Inst{12} = idx{0};
-  let Inst{11-10} = size;
-}
-class SIMDLdStSingleSTiedPost<bit L, bit R, bits<3> opcode, bits<2> size,
-                          string asm, dag oops, dag iops>
-  : BaseSIMDLdStSingleTied<L, R, opcode, asm, "\t$Vt$idx, $vaddr, $Xm",
-                       oops, iops, []> {
-  // idx encoded in Q:S fields.
-  bits<2> idx;
-  bits<5> Xm;
-  let Inst{30} = idx{1};
-  let Inst{23} = 1;
-  let Inst{20-16} = Xm;
-  let Inst{12} = idx{0};
-  let Inst{11-10} = size;
-}
-class SIMDLdStSingleD<bit L, bit R, bits<3> opcode, bits<2> size, string asm,
-                      dag oops, dag iops, list<dag> pattern>
-  : BaseSIMDLdStSingle<L, R, opcode, asm, "\t$Vt$idx, $vaddr", oops, iops,
-                       pattern> {
-  // idx encoded in Q field.
-  bits<1> idx;
-  let Inst{30} = idx;
-  let Inst{23} = 0;
-  let Inst{20-16} = 0b00000;
-  let Inst{12} = 0;
-  let Inst{11-10} = size;
-}
-class SIMDLdStSingleDTied<bit L, bit R, bits<3> opcode, bits<2> size, string asm,
-                      dag oops, dag iops, list<dag> pattern>
-  : BaseSIMDLdStSingleTied<L, R, opcode, asm, "\t$Vt$idx, $vaddr", oops, iops,
-                       pattern> {
-  // idx encoded in Q field.
-  bits<1> idx;
-  let Inst{30} = idx;
-  let Inst{23} = 0;
-  let Inst{20-16} = 0b00000;
-  let Inst{12} = 0;
-  let Inst{11-10} = size;
-}
-class SIMDLdStSingleDPost<bit L, bit R, bits<3> opcode, bits<2> size,
-                          string asm, dag oops, dag iops>
-  : BaseSIMDLdStSingle<L, R, opcode, asm, "\t$Vt$idx, $vaddr, $Xm",
-                       oops, iops, []> {
-  // idx encoded in Q field.
-  bits<1> idx;
-  bits<5> Xm;
-  let Inst{30} = idx;
-  let Inst{23} = 1;
-  let Inst{20-16} = Xm;
-  let Inst{12} = 0;
-  let Inst{11-10} = size;
-}
-class SIMDLdStSingleDTiedPost<bit L, bit R, bits<3> opcode, bits<2> size,
-                          string asm, dag oops, dag iops>
-  : BaseSIMDLdStSingleTied<L, R, opcode, asm, "\t$Vt$idx, $vaddr, $Xm",
-                       oops, iops, []> {
-  // idx encoded in Q field.
-  bits<1> idx;
-  bits<5> Xm;
-  let Inst{30} = idx;
-  let Inst{23} = 1;
-  let Inst{20-16} = Xm;
-  let Inst{12} = 0;
-  let Inst{11-10} = size;
-}
-
-let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
-multiclass SIMDLdSingleBTied<bit R, bits<3> opcode, string asm,
-                         RegisterOperand listtype,
-                         RegisterOperand GPR64pi> {
-  def i8 : SIMDLdStSingleBTied<1, R, opcode, asm,
-                           (outs listtype:$dst),
-                           (ins listtype:$Vt, VectorIndexB:$idx,
-                                am_simdnoindex:$vaddr), []>;
-
-  def i8_POST : SIMDLdStSingleBTiedPost<1, R, opcode, asm,
-                            (outs listtype:$dst),
-                            (ins listtype:$Vt, VectorIndexB:$idx,
-                                 am_simdnoindex:$vaddr, GPR64pi:$Xm)>;
-}
-let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
-multiclass SIMDLdSingleHTied<bit R, bits<3> opcode, bit size, string asm,
-                         RegisterOperand listtype,
-                         RegisterOperand GPR64pi> {
-  def i16 : SIMDLdStSingleHTied<1, R, opcode, size, asm,
-                            (outs listtype:$dst),
-                            (ins listtype:$Vt, VectorIndexH:$idx,
-                                 am_simdnoindex:$vaddr), []>;
-
-  def i16_POST : SIMDLdStSingleHTiedPost<1, R, opcode, size, asm,
-                            (outs listtype:$dst),
-                            (ins listtype:$Vt, VectorIndexH:$idx,
-                                 am_simdnoindex:$vaddr, GPR64pi:$Xm)>;
-}
-let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
-multiclass SIMDLdSingleSTied<bit R, bits<3> opcode, bits<2> size,string asm,
-                         RegisterOperand listtype,
-                         RegisterOperand GPR64pi> {
-  def i32 : SIMDLdStSingleSTied<1, R, opcode, size, asm,
-                            (outs listtype:$dst),
-                            (ins listtype:$Vt, VectorIndexS:$idx,
-                                 am_simdnoindex:$vaddr), []>;
-
-  def i32_POST : SIMDLdStSingleSTiedPost<1, R, opcode, size, asm,
-                            (outs listtype:$dst),
-                            (ins listtype:$Vt, VectorIndexS:$idx,
-                                 am_simdnoindex:$vaddr, GPR64pi:$Xm)>;
-}
-let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
-multiclass SIMDLdSingleDTied<bit R, bits<3> opcode, bits<2> size, string asm,
-                         RegisterOperand listtype, RegisterOperand GPR64pi> {
-  def i64 : SIMDLdStSingleDTied<1, R, opcode, size, asm,
-                            (outs listtype:$dst),
-                            (ins listtype:$Vt, VectorIndexD:$idx,
-                                 am_simdnoindex:$vaddr), []>;
-
-  def i64_POST : SIMDLdStSingleDTiedPost<1, R, opcode, size, asm,
-                            (outs listtype:$dst),
-                            (ins listtype:$Vt, VectorIndexD:$idx,
-                                 am_simdnoindex:$vaddr, GPR64pi:$Xm)>;
-}
-let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in
-multiclass SIMDStSingleB<bit R, bits<3> opcode, string asm,
-                         RegisterOperand listtype, RegisterOperand GPR64pi> {
-  def i8 : SIMDLdStSingleB<0, R, opcode, asm,
-                           (outs), (ins listtype:$Vt, VectorIndexB:$idx,
-                                        am_simdnoindex:$vaddr), []>;
-
-  def i8_POST : SIMDLdStSingleBPost<0, R, opcode, asm,
-                            (outs), (ins listtype:$Vt, VectorIndexB:$idx,
-                                         am_simdnoindex:$vaddr, GPR64pi:$Xm)>;
-}
-let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in
-multiclass SIMDStSingleH<bit R, bits<3> opcode, bit size, string asm,
-                         RegisterOperand listtype, RegisterOperand GPR64pi> {
-  def i16 : SIMDLdStSingleH<0, R, opcode, size, asm,
-                            (outs), (ins listtype:$Vt, VectorIndexH:$idx,
-                                         am_simdnoindex:$vaddr), []>;
-
-  def i16_POST : SIMDLdStSingleHPost<0, R, opcode, size, asm,
-                            (outs), (ins listtype:$Vt, VectorIndexH:$idx,
-                                         am_simdnoindex:$vaddr, GPR64pi:$Xm)>;
-}
-let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in
-multiclass SIMDStSingleS<bit R, bits<3> opcode, bits<2> size,string asm,
-                         RegisterOperand listtype, RegisterOperand GPR64pi> {
-  def i32 : SIMDLdStSingleS<0, R, opcode, size, asm,
-                            (outs), (ins listtype:$Vt, VectorIndexS:$idx,
-                                         am_simdnoindex:$vaddr), []>;
-
-  def i32_POST : SIMDLdStSingleSPost<0, R, opcode, size, asm,
-                            (outs), (ins listtype:$Vt, VectorIndexS:$idx,
-                                         am_simdnoindex:$vaddr, GPR64pi:$Xm)>;
-}
-let mayLoad = 0, mayStore = 1, hasSideEffects = 0 in
-multiclass SIMDStSingleD<bit R, bits<3> opcode, bits<2> size, string asm,
-                         RegisterOperand listtype, RegisterOperand GPR64pi> {
-  def i64 : SIMDLdStSingleD<0, R, opcode, size, asm,
-                            (outs), (ins listtype:$Vt, VectorIndexD:$idx,
-                                         am_simdnoindex:$vaddr), []>;
-
-  def i64_POST : SIMDLdStSingleDPost<0, R, opcode, size, asm,
-                            (outs), (ins listtype:$Vt, VectorIndexD:$idx,
-                                         am_simdnoindex:$vaddr, GPR64pi:$Xm)>;
-}
-
-multiclass SIMDLdStSingleAliases<string asm, string layout, string Type,
-                                 string Count, int Offset, Operand idxtype> {
-  // E.g. "ld1 { v0.8b }[0], [x1], #1"
-  //      "ld1\t$Vt, $vaddr, #1"
-  // may get mapped to
-  //      (LD1Rv8b_POST VecListOne8b:$Vt, am_simdnoindex:$vaddr, XZR)
-  def : InstAlias<asm # "\t$Vt$idx, $vaddr, #" # Offset,
-                  (!cast<Instruction>(NAME # Type  # "_POST")
-                      !cast<RegisterOperand>("VecList" # Count # layout):$Vt,
-                      idxtype:$idx, am_simdnoindex:$vaddr, XZR), 1>;
-
-  // E.g. "ld1.8b { v0 }[0], [x1], #1"
-  //      "ld1.8b\t$Vt, $vaddr, #1"
-  // may get mapped to
-  //      (LD1Rv8b_POST VecListOne64:$Vt, am_simdnoindex:$vaddr, XZR)
-  def : InstAlias<asm # "." # layout # "\t$Vt$idx, $vaddr, #" # Offset,
-                  (!cast<Instruction>(NAME # Type # "_POST")
-                      !cast<RegisterOperand>("VecList" # Count # "128"):$Vt,
-                      idxtype:$idx, am_simdnoindex:$vaddr, XZR), 0>;
-
-  // E.g. "ld1.8b { v0 }[0], [x1]"
-  //      "ld1.8b\t$Vt, $vaddr"
-  // may get mapped to
-  //      (LD1Rv8b VecListOne64:$Vt, am_simdnoindex:$vaddr)
-  def : InstAlias<asm # "." # layout # "\t$Vt$idx, $vaddr",
-                      (!cast<Instruction>(NAME # Type)
-                         !cast<RegisterOperand>("VecList" # Count # "128"):$Vt,
-                         idxtype:$idx, am_simdnoindex:$vaddr), 0>;
-
-  // E.g. "ld1.8b { v0 }[0], [x1], x2"
-  //      "ld1.8b\t$Vt, $vaddr, $Xm"
-  // may get mapped to
-  //      (LD1Rv8b_POST VecListOne64:$Vt, am_simdnoindex:$vaddr, GPR64pi1:$Xm)
-  def : InstAlias<asm # "." # layout # "\t$Vt$idx, $vaddr, $Xm",
-                      (!cast<Instruction>(NAME # Type # "_POST")
-                         !cast<RegisterOperand>("VecList" # Count # "128"):$Vt,
-                         idxtype:$idx, am_simdnoindex:$vaddr,
-                         !cast<RegisterOperand>("GPR64pi" # Offset):$Xm), 0>;
-}
-
-multiclass SIMDLdSt1SingleAliases<string asm> {
-  defm : SIMDLdStSingleAliases<asm, "b", "i8",  "One", 1, VectorIndexB>;
-  defm : SIMDLdStSingleAliases<asm, "h", "i16", "One", 2, VectorIndexH>;
-  defm : SIMDLdStSingleAliases<asm, "s", "i32", "One", 4, VectorIndexS>;
-  defm : SIMDLdStSingleAliases<asm, "d", "i64", "One", 8, VectorIndexD>;
-}
-
-multiclass SIMDLdSt2SingleAliases<string asm> {
-  defm : SIMDLdStSingleAliases<asm, "b", "i8",  "Two", 2,  VectorIndexB>;
-  defm : SIMDLdStSingleAliases<asm, "h", "i16", "Two", 4,  VectorIndexH>;
-  defm : SIMDLdStSingleAliases<asm, "s", "i32", "Two", 8,  VectorIndexS>;
-  defm : SIMDLdStSingleAliases<asm, "d", "i64", "Two", 16, VectorIndexD>;
-}
-
-multiclass SIMDLdSt3SingleAliases<string asm> {
-  defm : SIMDLdStSingleAliases<asm, "b", "i8",  "Three", 3,  VectorIndexB>;
-  defm : SIMDLdStSingleAliases<asm, "h", "i16", "Three", 6,  VectorIndexH>;
-  defm : SIMDLdStSingleAliases<asm, "s", "i32", "Three", 12, VectorIndexS>;
-  defm : SIMDLdStSingleAliases<asm, "d", "i64", "Three", 24, VectorIndexD>;
-}
-
-multiclass SIMDLdSt4SingleAliases<string asm> {
-  defm : SIMDLdStSingleAliases<asm, "b", "i8",  "Four", 4,  VectorIndexB>;
-  defm : SIMDLdStSingleAliases<asm, "h", "i16", "Four", 8,  VectorIndexH>;
-  defm : SIMDLdStSingleAliases<asm, "s", "i32", "Four", 16, VectorIndexS>;
-  defm : SIMDLdStSingleAliases<asm, "d", "i64", "Four", 32, VectorIndexD>;
-}
-
-//----------------------------------------------------------------------------
-// Crypto extensions
-//----------------------------------------------------------------------------
-
-let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
-class AESBase<bits<4> opc, string asm, dag outs, dag ins, string cstr,
-              list<dag> pat>
-  : I<outs, ins, asm, "{\t$Rd.16b, $Rn.16b|.16b\t$Rd, $Rn}", cstr, pat>,
-    Sched<[WriteV]>{
-  bits<5> Rd;
-  bits<5> Rn;
-  let Inst{31-16} = 0b0100111000101000;
-  let Inst{15-12} = opc;
-  let Inst{11-10} = 0b10;
-  let Inst{9-5}   = Rn;
-  let Inst{4-0}   = Rd;
-}
-
-class AESInst<bits<4> opc, string asm, Intrinsic OpNode>
-  : AESBase<opc, asm, (outs V128:$Rd), (ins V128:$Rn), "",
-            [(set (v16i8 V128:$Rd), (OpNode (v16i8 V128:$Rn)))]>;
-
-class AESTiedInst<bits<4> opc, string asm, Intrinsic OpNode>
-  : AESBase<opc, asm, (outs V128:$dst), (ins V128:$Rd, V128:$Rn),
-            "$Rd = $dst",
-            [(set (v16i8 V128:$dst),
-                  (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn)))]>;
-
-let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
-class SHA3OpTiedInst<bits<3> opc, string asm, string dst_lhs_kind,
-                     dag oops, dag iops, list<dag> pat>
-  : I<oops, iops, asm,
-      "{\t$Rd" # dst_lhs_kind # ", $Rn" # dst_lhs_kind # ", $Rm.4s" #
-      "|.4s\t$Rd, $Rn, $Rm}", "$Rd = $dst", pat>,
-    Sched<[WriteV]>{
-  bits<5> Rd;
-  bits<5> Rn;
-  bits<5> Rm;
-  let Inst{31-21} = 0b01011110000;
-  let Inst{20-16} = Rm;
-  let Inst{15}    = 0;
-  let Inst{14-12} = opc;
-  let Inst{11-10} = 0b00;
-  let Inst{9-5}   = Rn;
-  let Inst{4-0}   = Rd;
-}
-
-class SHATiedInstQSV<bits<3> opc, string asm, Intrinsic OpNode>
-  : SHA3OpTiedInst<opc, asm, "", (outs FPR128:$dst),
-                   (ins FPR128:$Rd, FPR32:$Rn, V128:$Rm),
-                   [(set (v4i32 FPR128:$dst),
-                         (OpNode (v4i32 FPR128:$Rd), (i32 FPR32:$Rn),
-                                 (v4i32 V128:$Rm)))]>;
-
-class SHATiedInstVVV<bits<3> opc, string asm, Intrinsic OpNode>
-  : SHA3OpTiedInst<opc, asm, ".4s", (outs V128:$dst),
-                   (ins V128:$Rd, V128:$Rn, V128:$Rm),
-                   [(set (v4i32 V128:$dst),
-                         (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn),
-                                 (v4i32 V128:$Rm)))]>;
-
-class SHATiedInstQQV<bits<3> opc, string asm, Intrinsic OpNode>
-  : SHA3OpTiedInst<opc, asm, "", (outs FPR128:$dst),
-                   (ins FPR128:$Rd, FPR128:$Rn, V128:$Rm),
-                   [(set (v4i32 FPR128:$dst),
-                         (OpNode (v4i32 FPR128:$Rd), (v4i32 FPR128:$Rn),
-                                 (v4i32 V128:$Rm)))]>;
-
-let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
-class SHA2OpInst<bits<4> opc, string asm, string kind,
-                 string cstr, dag oops, dag iops,
-                 list<dag> pat>
-  : I<oops, iops, asm, "{\t$Rd" # kind # ", $Rn" # kind #
-                       "|" # kind # "\t$Rd, $Rn}", cstr, pat>,
-    Sched<[WriteV]>{
-  bits<5> Rd;
-  bits<5> Rn;
-  let Inst{31-16} = 0b0101111000101000;
-  let Inst{15-12} = opc;
-  let Inst{11-10} = 0b10;
-  let Inst{9-5}   = Rn;
-  let Inst{4-0}   = Rd;
-}
-
-class SHATiedInstVV<bits<4> opc, string asm, Intrinsic OpNode>
-  : SHA2OpInst<opc, asm, ".4s", "$Rd = $dst", (outs V128:$dst),
-               (ins V128:$Rd, V128:$Rn),
-               [(set (v4i32 V128:$dst),
-                     (OpNode (v4i32 V128:$Rd), (v4i32 V128:$Rn)))]>;
-
-class SHAInstSS<bits<4> opc, string asm, Intrinsic OpNode>
-  : SHA2OpInst<opc, asm, "", "", (outs FPR32:$Rd), (ins FPR32:$Rn),
-               [(set (i32 FPR32:$Rd), (OpNode (i32 FPR32:$Rn)))]>;
-
-// Allow the size specifier tokens to be upper case, not just lower.
-def : TokenAlias<".8B", ".8b">;
-def : TokenAlias<".4H", ".4h">;
-def : TokenAlias<".2S", ".2s">;
-def : TokenAlias<".1D", ".1d">;
-def : TokenAlias<".16B", ".16b">;
-def : TokenAlias<".8H", ".8h">;
-def : TokenAlias<".4S", ".4s">;
-def : TokenAlias<".2D", ".2d">;
-def : TokenAlias<".B", ".b">;
-def : TokenAlias<".H", ".h">;
-def : TokenAlias<".S", ".s">;
-def : TokenAlias<".D", ".d">;
diff --git a/lib/Target/ARM64/ARM64InstrInfo.cpp b/lib/Target/ARM64/ARM64InstrInfo.cpp
deleted file mode 100644
index 8f11757..0000000
--- a/lib/Target/ARM64/ARM64InstrInfo.cpp
+++ /dev/null
@@ -1,1864 +0,0 @@
-//===- ARM64InstrInfo.cpp - ARM64 Instruction Information -----------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains the ARM64 implementation of the TargetInstrInfo class.
-//
-//===----------------------------------------------------------------------===//
-
-#include "ARM64InstrInfo.h"
-#include "ARM64Subtarget.h"
-#include "MCTargetDesc/ARM64AddressingModes.h"
-#include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineMemOperand.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/PseudoSourceValue.h"
-#include "llvm/MC/MCInst.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/TargetRegistry.h"
-
-#define GET_INSTRINFO_CTOR_DTOR
-#include "ARM64GenInstrInfo.inc"
-
-using namespace llvm;
-
-ARM64InstrInfo::ARM64InstrInfo(const ARM64Subtarget &STI)
-    : ARM64GenInstrInfo(ARM64::ADJCALLSTACKDOWN, ARM64::ADJCALLSTACKUP),
-      RI(this, &STI), Subtarget(STI) {}
-
-/// GetInstSize - Return the number of bytes of code the specified
-/// instruction may be.  This returns the maximum number of bytes.
-unsigned ARM64InstrInfo::GetInstSizeInBytes(const MachineInstr *MI) const {
-  const MCInstrDesc &Desc = MI->getDesc();
-
-  switch (Desc.getOpcode()) {
-  default:
-    // Anything not explicitly designated otherwise is a nomal 4-byte insn.
-    return 4;
-  case TargetOpcode::DBG_VALUE:
-  case TargetOpcode::EH_LABEL:
-  case TargetOpcode::IMPLICIT_DEF:
-  case TargetOpcode::KILL:
-    return 0;
-  }
-
-  llvm_unreachable("GetInstSizeInBytes()- Unable to determin insn size");
-}
-
-static void parseCondBranch(MachineInstr *LastInst, MachineBasicBlock *&Target,
-                            SmallVectorImpl<MachineOperand> &Cond) {
-  // Block ends with fall-through condbranch.
-  switch (LastInst->getOpcode()) {
-  default:
-    llvm_unreachable("Unknown branch instruction?");
-  case ARM64::Bcc:
-    Target = LastInst->getOperand(1).getMBB();
-    Cond.push_back(LastInst->getOperand(0));
-    break;
-  case ARM64::CBZW:
-  case ARM64::CBZX:
-  case ARM64::CBNZW:
-  case ARM64::CBNZX:
-    Target = LastInst->getOperand(1).getMBB();
-    Cond.push_back(MachineOperand::CreateImm(-1));
-    Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
-    Cond.push_back(LastInst->getOperand(0));
-    break;
-  case ARM64::TBZ:
-  case ARM64::TBNZ:
-    Target = LastInst->getOperand(2).getMBB();
-    Cond.push_back(MachineOperand::CreateImm(-1));
-    Cond.push_back(MachineOperand::CreateImm(LastInst->getOpcode()));
-    Cond.push_back(LastInst->getOperand(0));
-    Cond.push_back(LastInst->getOperand(1));
-  }
-}
-
-// Branch analysis.
-bool ARM64InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
-                                   MachineBasicBlock *&TBB,
-                                   MachineBasicBlock *&FBB,
-                                   SmallVectorImpl<MachineOperand> &Cond,
-                                   bool AllowModify) const {
-  // If the block has no terminators, it just falls into the block after it.
-  MachineBasicBlock::iterator I = MBB.end();
-  if (I == MBB.begin())
-    return false;
-  --I;
-  while (I->isDebugValue()) {
-    if (I == MBB.begin())
-      return false;
-    --I;
-  }
-  if (!isUnpredicatedTerminator(I))
-    return false;
-
-  // Get the last instruction in the block.
-  MachineInstr *LastInst = I;
-
-  // If there is only one terminator instruction, process it.
-  unsigned LastOpc = LastInst->getOpcode();
-  if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) {
-    if (isUncondBranchOpcode(LastOpc)) {
-      TBB = LastInst->getOperand(0).getMBB();
-      return false;
-    }
-    if (isCondBranchOpcode(LastOpc)) {
-      // Block ends with fall-through condbranch.
-      parseCondBranch(LastInst, TBB, Cond);
-      return false;
-    }
-    return true; // Can't handle indirect branch.
-  }
-
-  // Get the instruction before it if it is a terminator.
-  MachineInstr *SecondLastInst = I;
-  unsigned SecondLastOpc = SecondLastInst->getOpcode();
-
-  // If AllowModify is true and the block ends with two or more unconditional
-  // branches, delete all but the first unconditional branch.
-  if (AllowModify && isUncondBranchOpcode(LastOpc)) {
-    while (isUncondBranchOpcode(SecondLastOpc)) {
-      LastInst->eraseFromParent();
-      LastInst = SecondLastInst;
-      LastOpc = LastInst->getOpcode();
-      if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) {
-        // Return now the only terminator is an unconditional branch.
-        TBB = LastInst->getOperand(0).getMBB();
-        return false;
-      } else {
-        SecondLastInst = I;
-        SecondLastOpc = SecondLastInst->getOpcode();
-      }
-    }
-  }
-
-  // If there are three terminators, we don't know what sort of block this is.
-  if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(--I))
-    return true;
-
-  // If the block ends with a B and a Bcc, handle it.
-  if (isCondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
-    parseCondBranch(SecondLastInst, TBB, Cond);
-    FBB = LastInst->getOperand(0).getMBB();
-    return false;
-  }
-
-  // If the block ends with two unconditional branches, handle it.  The second
-  // one is not executed, so remove it.
-  if (isUncondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
-    TBB = SecondLastInst->getOperand(0).getMBB();
-    I = LastInst;
-    if (AllowModify)
-      I->eraseFromParent();
-    return false;
-  }
-
-  // ...likewise if it ends with an indirect branch followed by an unconditional
-  // branch.
-  if (isIndirectBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
-    I = LastInst;
-    if (AllowModify)
-      I->eraseFromParent();
-    return true;
-  }
-
-  // Otherwise, can't handle this.
-  return true;
-}
-
-bool ARM64InstrInfo::ReverseBranchCondition(
-    SmallVectorImpl<MachineOperand> &Cond) const {
-  if (Cond[0].getImm() != -1) {
-    // Regular Bcc
-    ARM64CC::CondCode CC = (ARM64CC::CondCode)(int)Cond[0].getImm();
-    Cond[0].setImm(ARM64CC::getInvertedCondCode(CC));
-  } else {
-    // Folded compare-and-branch
-    switch (Cond[1].getImm()) {
-    default:
-      llvm_unreachable("Unknown conditional branch!");
-    case ARM64::CBZW:
-      Cond[1].setImm(ARM64::CBNZW);
-      break;
-    case ARM64::CBNZW:
-      Cond[1].setImm(ARM64::CBZW);
-      break;
-    case ARM64::CBZX:
-      Cond[1].setImm(ARM64::CBNZX);
-      break;
-    case ARM64::CBNZX:
-      Cond[1].setImm(ARM64::CBZX);
-      break;
-    case ARM64::TBZ:
-      Cond[1].setImm(ARM64::TBNZ);
-      break;
-    case ARM64::TBNZ:
-      Cond[1].setImm(ARM64::TBZ);
-      break;
-    }
-  }
-
-  return false;
-}
-
-unsigned ARM64InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
-  MachineBasicBlock::iterator I = MBB.end();
-  if (I == MBB.begin())
-    return 0;
-  --I;
-  while (I->isDebugValue()) {
-    if (I == MBB.begin())
-      return 0;
-    --I;
-  }
-  if (!isUncondBranchOpcode(I->getOpcode()) &&
-      !isCondBranchOpcode(I->getOpcode()))
-    return 0;
-
-  // Remove the branch.
-  I->eraseFromParent();
-
-  I = MBB.end();
-
-  if (I == MBB.begin())
-    return 1;
-  --I;
-  if (!isCondBranchOpcode(I->getOpcode()))
-    return 1;
-
-  // Remove the branch.
-  I->eraseFromParent();
-  return 2;
-}
-
-void ARM64InstrInfo::instantiateCondBranch(
-    MachineBasicBlock &MBB, DebugLoc DL, MachineBasicBlock *TBB,
-    const SmallVectorImpl<MachineOperand> &Cond) const {
-  if (Cond[0].getImm() != -1) {
-    // Regular Bcc
-    BuildMI(&MBB, DL, get(ARM64::Bcc)).addImm(Cond[0].getImm()).addMBB(TBB);
-  } else {
-    // Folded compare-and-branch
-    const MachineInstrBuilder MIB =
-        BuildMI(&MBB, DL, get(Cond[1].getImm())).addReg(Cond[2].getReg());
-    if (Cond.size() > 3)
-      MIB.addImm(Cond[3].getImm());
-    MIB.addMBB(TBB);
-  }
-}
-
-unsigned ARM64InstrInfo::InsertBranch(
-    MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB,
-    const SmallVectorImpl<MachineOperand> &Cond, DebugLoc DL) const {
-  // Shouldn't be a fall through.
-  assert(TBB && "InsertBranch must not be told to insert a fallthrough");
-
-  if (FBB == 0) {
-    if (Cond.empty()) // Unconditional branch?
-      BuildMI(&MBB, DL, get(ARM64::B)).addMBB(TBB);
-    else
-      instantiateCondBranch(MBB, DL, TBB, Cond);
-    return 1;
-  }
-
-  // Two-way conditional branch.
-  instantiateCondBranch(MBB, DL, TBB, Cond);
-  BuildMI(&MBB, DL, get(ARM64::B)).addMBB(FBB);
-  return 2;
-}
-
-// Find the original register that VReg is copied from.
-static unsigned removeCopies(const MachineRegisterInfo &MRI, unsigned VReg) {
-  while (TargetRegisterInfo::isVirtualRegister(VReg)) {
-    const MachineInstr *DefMI = MRI.getVRegDef(VReg);
-    if (!DefMI->isFullCopy())
-      return VReg;
-    VReg = DefMI->getOperand(1).getReg();
-  }
-  return VReg;
-}
-
-// Determine if VReg is defined by an instruction that can be folded into a
-// csel instruction. If so, return the folded opcode, and the replacement
-// register.
-static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg,
-                                unsigned *NewVReg = 0) {
-  VReg = removeCopies(MRI, VReg);
-  if (!TargetRegisterInfo::isVirtualRegister(VReg))
-    return 0;
-
-  bool Is64Bit = ARM64::GPR64allRegClass.hasSubClassEq(MRI.getRegClass(VReg));
-  const MachineInstr *DefMI = MRI.getVRegDef(VReg);
-  unsigned Opc = 0;
-  unsigned SrcOpNum = 0;
-  switch (DefMI->getOpcode()) {
-  case ARM64::ADDSXri:
-  case ARM64::ADDSWri:
-    // if CPSR is used, do not fold.
-    if (DefMI->findRegisterDefOperandIdx(ARM64::CPSR, true) == -1)
-      return 0;
-  // fall-through to ADDXri and ADDWri.
-  case ARM64::ADDXri:
-  case ARM64::ADDWri:
-    // add x, 1 -> csinc.
-    if (!DefMI->getOperand(2).isImm() || DefMI->getOperand(2).getImm() != 1 ||
-        DefMI->getOperand(3).getImm() != 0)
-      return 0;
-    SrcOpNum = 1;
-    Opc = Is64Bit ? ARM64::CSINCXr : ARM64::CSINCWr;
-    break;
-
-  case ARM64::ORNXrr:
-  case ARM64::ORNWrr: {
-    // not x -> csinv, represented as orn dst, xzr, src.
-    unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
-    if (ZReg != ARM64::XZR && ZReg != ARM64::WZR)
-      return 0;
-    SrcOpNum = 2;
-    Opc = Is64Bit ? ARM64::CSINVXr : ARM64::CSINVWr;
-    break;
-  }
-
-  case ARM64::SUBSXrr:
-  case ARM64::SUBSWrr:
-    // if CPSR is used, do not fold.
-    if (DefMI->findRegisterDefOperandIdx(ARM64::CPSR, true) == -1)
-      return 0;
-  // fall-through to SUBXrr and SUBWrr.
-  case ARM64::SUBXrr:
-  case ARM64::SUBWrr: {
-    // neg x -> csneg, represented as sub dst, xzr, src.
-    unsigned ZReg = removeCopies(MRI, DefMI->getOperand(1).getReg());
-    if (ZReg != ARM64::XZR && ZReg != ARM64::WZR)
-      return 0;
-    SrcOpNum = 2;
-    Opc = Is64Bit ? ARM64::CSNEGXr : ARM64::CSNEGWr;
-    break;
-  }
-  default:
-    return 0;
-  }
-  assert(Opc && SrcOpNum && "Missing parameters");
-
-  if (NewVReg)
-    *NewVReg = DefMI->getOperand(SrcOpNum).getReg();
-  return Opc;
-}
-
-bool ARM64InstrInfo::canInsertSelect(
-    const MachineBasicBlock &MBB, const SmallVectorImpl<MachineOperand> &Cond,
-    unsigned TrueReg, unsigned FalseReg, int &CondCycles, int &TrueCycles,
-    int &FalseCycles) const {
-  // Check register classes.
-  const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
-  const TargetRegisterClass *RC =
-      RI.getCommonSubClass(MRI.getRegClass(TrueReg), MRI.getRegClass(FalseReg));
-  if (!RC)
-    return false;
-
-  // Expanding cbz/tbz requires an extra cycle of latency on the condition.
-  unsigned ExtraCondLat = Cond.size() != 1;
-
-  // GPRs are handled by csel.
-  // FIXME: Fold in x+1, -x, and ~x when applicable.
-  if (ARM64::GPR64allRegClass.hasSubClassEq(RC) ||
-      ARM64::GPR32allRegClass.hasSubClassEq(RC)) {
-    // Single-cycle csel, csinc, csinv, and csneg.
-    CondCycles = 1 + ExtraCondLat;
-    TrueCycles = FalseCycles = 1;
-    if (canFoldIntoCSel(MRI, TrueReg))
-      TrueCycles = 0;
-    else if (canFoldIntoCSel(MRI, FalseReg))
-      FalseCycles = 0;
-    return true;
-  }
-
-  // Scalar floating point is handled by fcsel.
-  // FIXME: Form fabs, fmin, and fmax when applicable.
-  if (ARM64::FPR64RegClass.hasSubClassEq(RC) ||
-      ARM64::FPR32RegClass.hasSubClassEq(RC)) {
-    CondCycles = 5 + ExtraCondLat;
-    TrueCycles = FalseCycles = 2;
-    return true;
-  }
-
-  // Can't do vectors.
-  return false;
-}
-
-void ARM64InstrInfo::insertSelect(MachineBasicBlock &MBB,
-                                  MachineBasicBlock::iterator I, DebugLoc DL,
-                                  unsigned DstReg,
-                                  const SmallVectorImpl<MachineOperand> &Cond,
-                                  unsigned TrueReg, unsigned FalseReg) const {
-  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
-
-  // Parse the condition code, see parseCondBranch() above.
-  ARM64CC::CondCode CC;
-  switch (Cond.size()) {
-  default:
-    llvm_unreachable("Unknown condition opcode in Cond");
-  case 1: // b.cc
-    CC = ARM64CC::CondCode(Cond[0].getImm());
-    break;
-  case 3: { // cbz/cbnz
-    // We must insert a compare against 0.
-    bool Is64Bit;
-    switch (Cond[1].getImm()) {
-    default:
-      llvm_unreachable("Unknown branch opcode in Cond");
-    case ARM64::CBZW:
-      Is64Bit = 0;
-      CC = ARM64CC::EQ;
-      break;
-    case ARM64::CBZX:
-      Is64Bit = 1;
-      CC = ARM64CC::EQ;
-      break;
-    case ARM64::CBNZW:
-      Is64Bit = 0;
-      CC = ARM64CC::NE;
-      break;
-    case ARM64::CBNZX:
-      Is64Bit = 1;
-      CC = ARM64CC::NE;
-      break;
-    }
-    unsigned SrcReg = Cond[2].getReg();
-    if (Is64Bit) {
-      // cmp reg, #0 is actually subs xzr, reg, #0.
-      MRI.constrainRegClass(SrcReg, &ARM64::GPR64spRegClass);
-      BuildMI(MBB, I, DL, get(ARM64::SUBSXri), ARM64::XZR)
-          .addReg(SrcReg)
-          .addImm(0)
-          .addImm(0);
-    } else {
-      MRI.constrainRegClass(SrcReg, &ARM64::GPR32spRegClass);
-      BuildMI(MBB, I, DL, get(ARM64::SUBSWri), ARM64::WZR)
-          .addReg(SrcReg)
-          .addImm(0)
-          .addImm(0);
-    }
-    break;
-  }
-  case 4: { // tbz/tbnz
-    // We must insert a tst instruction.
-    switch (Cond[1].getImm()) {
-    default:
-      llvm_unreachable("Unknown branch opcode in Cond");
-    case ARM64::TBZ:
-      CC = ARM64CC::EQ;
-      break;
-    case ARM64::TBNZ:
-      CC = ARM64CC::NE;
-      break;
-    }
-    // cmp reg, #foo is actually ands xzr, reg, #1<<foo.
-    BuildMI(MBB, I, DL, get(ARM64::ANDSXri), ARM64::XZR)
-        .addReg(Cond[2].getReg())
-        .addImm(ARM64_AM::encodeLogicalImmediate(1ull << Cond[3].getImm(), 64));
-    break;
-  }
-  }
-
-  unsigned Opc = 0;
-  const TargetRegisterClass *RC = 0;
-  bool TryFold = false;
-  if (MRI.constrainRegClass(DstReg, &ARM64::GPR64RegClass)) {
-    RC = &ARM64::GPR64RegClass;
-    Opc = ARM64::CSELXr;
-    TryFold = true;
-  } else if (MRI.constrainRegClass(DstReg, &ARM64::GPR32RegClass)) {
-    RC = &ARM64::GPR32RegClass;
-    Opc = ARM64::CSELWr;
-    TryFold = true;
-  } else if (MRI.constrainRegClass(DstReg, &ARM64::FPR64RegClass)) {
-    RC = &ARM64::FPR64RegClass;
-    Opc = ARM64::FCSELDrrr;
-  } else if (MRI.constrainRegClass(DstReg, &ARM64::FPR32RegClass)) {
-    RC = &ARM64::FPR32RegClass;
-    Opc = ARM64::FCSELSrrr;
-  }
-  assert(RC && "Unsupported regclass");
-
-  // Try folding simple instructions into the csel.
-  if (TryFold) {
-    unsigned NewVReg = 0;
-    unsigned FoldedOpc = canFoldIntoCSel(MRI, TrueReg, &NewVReg);
-    if (FoldedOpc) {
-      // The folded opcodes csinc, csinc and csneg apply the operation to
-      // FalseReg, so we need to invert the condition.
-      CC = ARM64CC::getInvertedCondCode(CC);
-      TrueReg = FalseReg;
-    } else
-      FoldedOpc = canFoldIntoCSel(MRI, FalseReg, &NewVReg);
-
-    // Fold the operation. Leave any dead instructions for DCE to clean up.
-    if (FoldedOpc) {
-      FalseReg = NewVReg;
-      Opc = FoldedOpc;
-      // The extends the live range of NewVReg.
-      MRI.clearKillFlags(NewVReg);
-    }
-  }
-
-  // Pull all virtual register into the appropriate class.
-  MRI.constrainRegClass(TrueReg, RC);
-  MRI.constrainRegClass(FalseReg, RC);
-
-  // Insert the csel.
-  BuildMI(MBB, I, DL, get(Opc), DstReg).addReg(TrueReg).addReg(FalseReg).addImm(
-      CC);
-}
-
-bool ARM64InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
-                                           unsigned &SrcReg, unsigned &DstReg,
-                                           unsigned &SubIdx) const {
-  switch (MI.getOpcode()) {
-  default:
-    return false;
-  case ARM64::SBFMXri: // aka sxtw
-  case ARM64::UBFMXri: // aka uxtw
-    // Check for the 32 -> 64 bit extension case, these instructions can do
-    // much more.
-    if (MI.getOperand(2).getImm() != 0 || MI.getOperand(3).getImm() != 31)
-      return false;
-    // This is a signed or unsigned 32 -> 64 bit extension.
-    SrcReg = MI.getOperand(1).getReg();
-    DstReg = MI.getOperand(0).getReg();
-    SubIdx = ARM64::sub_32;
-    return true;
-  }
-}
-
-/// analyzeCompare - For a comparison instruction, return the source registers
-/// in SrcReg and SrcReg2, and the value it compares against in CmpValue.
-/// Return true if the comparison instruction can be analyzed.
-bool ARM64InstrInfo::analyzeCompare(const MachineInstr *MI, unsigned &SrcReg,
-                                    unsigned &SrcReg2, int &CmpMask,
-                                    int &CmpValue) const {
-  switch (MI->getOpcode()) {
-  default:
-    break;
-  case ARM64::SUBSWrr:
-  case ARM64::SUBSWrs:
-  case ARM64::SUBSWrx:
-  case ARM64::SUBSXrr:
-  case ARM64::SUBSXrs:
-  case ARM64::SUBSXrx:
-  case ARM64::ADDSWrr:
-  case ARM64::ADDSWrs:
-  case ARM64::ADDSWrx:
-  case ARM64::ADDSXrr:
-  case ARM64::ADDSXrs:
-  case ARM64::ADDSXrx:
-    // Replace SUBSWrr with SUBWrr if CPSR is not used.
-    SrcReg = MI->getOperand(1).getReg();
-    SrcReg2 = MI->getOperand(2).getReg();
-    CmpMask = ~0;
-    CmpValue = 0;
-    return true;
-  case ARM64::SUBSWri:
-  case ARM64::ADDSWri:
-  case ARM64::ANDSWri:
-  case ARM64::SUBSXri:
-  case ARM64::ADDSXri:
-  case ARM64::ANDSXri:
-    SrcReg = MI->getOperand(1).getReg();
-    SrcReg2 = 0;
-    CmpMask = ~0;
-    CmpValue = MI->getOperand(2).getImm();
-    return true;
-  }
-
-  return false;
-}
-
-static bool UpdateOperandRegClass(MachineInstr *Instr) {
-  MachineBasicBlock *MBB = Instr->getParent();
-  assert(MBB && "Can't get MachineBasicBlock here");
-  MachineFunction *MF = MBB->getParent();
-  assert(MF && "Can't get MachineFunction here");
-  const TargetMachine *TM = &MF->getTarget();
-  const TargetInstrInfo *TII = TM->getInstrInfo();
-  const TargetRegisterInfo *TRI = TM->getRegisterInfo();
-  MachineRegisterInfo *MRI = &MF->getRegInfo();
-
-  for (unsigned OpIdx = 0, EndIdx = Instr->getNumOperands(); OpIdx < EndIdx;
-       ++OpIdx) {
-    MachineOperand &MO = Instr->getOperand(OpIdx);
-    const TargetRegisterClass *OpRegCstraints =
-        Instr->getRegClassConstraint(OpIdx, TII, TRI);
-
-    // If there's no constraint, there's nothing to do.
-    if (!OpRegCstraints)
-      continue;
-    // If the operand is a frame index, there's nothing to do here.
-    // A frame index operand will resolve correctly during PEI.
-    if (MO.isFI())
-      continue;
-
-    assert(MO.isReg() &&
-           "Operand has register constraints without being a register!");
-
-    unsigned Reg = MO.getReg();
-    if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
-      if (!OpRegCstraints->contains(Reg))
-        return false;
-    } else if (!OpRegCstraints->hasSubClassEq(MRI->getRegClass(Reg)) &&
-               !MRI->constrainRegClass(Reg, OpRegCstraints))
-      return false;
-  }
-
-  return true;
-}
-
-/// optimizeCompareInstr - Convert the instruction supplying the argument to the
-/// comparison into one that sets the zero bit in the flags register.
-bool ARM64InstrInfo::optimizeCompareInstr(
-    MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2, int CmpMask,
-    int CmpValue, const MachineRegisterInfo *MRI) const {
-
-  // Replace SUBSWrr with SUBWrr if CPSR is not used.
-  int Cmp_CPSR = CmpInstr->findRegisterDefOperandIdx(ARM64::CPSR, true);
-  if (Cmp_CPSR != -1) {
-    unsigned NewOpc;
-    switch (CmpInstr->getOpcode()) {
-    default:
-      return false;
-    case ARM64::ADDSWrr:      NewOpc = ARM64::ADDWrr; break;
-    case ARM64::ADDSWri:      NewOpc = ARM64::ADDWri; break;
-    case ARM64::ADDSWrs:      NewOpc = ARM64::ADDWrs; break;
-    case ARM64::ADDSWrx:      NewOpc = ARM64::ADDWrx; break;
-    case ARM64::ADDSXrr:      NewOpc = ARM64::ADDXrr; break;
-    case ARM64::ADDSXri:      NewOpc = ARM64::ADDXri; break;
-    case ARM64::ADDSXrs:      NewOpc = ARM64::ADDXrs; break;
-    case ARM64::ADDSXrx:      NewOpc = ARM64::ADDXrx; break;
-    case ARM64::SUBSWrr:      NewOpc = ARM64::SUBWrr; break;
-    case ARM64::SUBSWri:      NewOpc = ARM64::SUBWri; break;
-    case ARM64::SUBSWrs:      NewOpc = ARM64::SUBWrs; break;
-    case ARM64::SUBSWrx:      NewOpc = ARM64::SUBWrx; break;
-    case ARM64::SUBSXrr:      NewOpc = ARM64::SUBXrr; break;
-    case ARM64::SUBSXri:      NewOpc = ARM64::SUBXri; break;
-    case ARM64::SUBSXrs:      NewOpc = ARM64::SUBXrs; break;
-    case ARM64::SUBSXrx:      NewOpc = ARM64::SUBXrx; break;
-    }
-
-    const MCInstrDesc &MCID = get(NewOpc);
-    CmpInstr->setDesc(MCID);
-    CmpInstr->RemoveOperand(Cmp_CPSR);
-    bool succeeded = UpdateOperandRegClass(CmpInstr);
-    (void)succeeded;
-    assert(succeeded && "Some operands reg class are incompatible!");
-    return true;
-  }
-
-  // Continue only if we have a "ri" where immediate is zero.
-  if (CmpValue != 0 || SrcReg2 != 0)
-    return false;
-
-  // CmpInstr is a Compare instruction if destination register is not used.
-  if (!MRI->use_nodbg_empty(CmpInstr->getOperand(0).getReg()))
-    return false;
-
-  // Get the unique definition of SrcReg.
-  MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg);
-  if (!MI)
-    return false;
-
-  // We iterate backward, starting from the instruction before CmpInstr and
-  // stop when reaching the definition of the source register or done with the
-  // basic block, to check whether CPSR is used or modified in between.
-  MachineBasicBlock::iterator I = CmpInstr, E = MI,
-                              B = CmpInstr->getParent()->begin();
-
-  // Early exit if CmpInstr is at the beginning of the BB.
-  if (I == B)
-    return false;
-
-  // Check whether the definition of SrcReg is in the same basic block as
-  // Compare. If not, we can't optimize away the Compare.
-  if (MI->getParent() != CmpInstr->getParent())
-    return false;
-
-  // Check that CPSR isn't set between the comparison instruction and the one we
-  // want to change.
-  const TargetRegisterInfo *TRI = &getRegisterInfo();
-  for (--I; I != E; --I) {
-    const MachineInstr &Instr = *I;
-
-    if (Instr.modifiesRegister(ARM64::CPSR, TRI) ||
-        Instr.readsRegister(ARM64::CPSR, TRI))
-      // This instruction modifies or uses CPSR after the one we want to
-      // change. We can't do this transformation.
-      return false;
-    if (I == B)
-      // The 'and' is below the comparison instruction.
-      return false;
-  }
-
-  unsigned NewOpc = MI->getOpcode();
-  switch (MI->getOpcode()) {
-  default:
-    return false;
-  case ARM64::ADDSWrr:
-  case ARM64::ADDSWri:
-  case ARM64::ADDSXrr:
-  case ARM64::ADDSXri:
-  case ARM64::SUBSWrr:
-  case ARM64::SUBSWri:
-  case ARM64::SUBSXrr:
-  case ARM64::SUBSXri:
-    break;
-  case ARM64::ADDWrr:    NewOpc = ARM64::ADDSWrr; break;
-  case ARM64::ADDWri:    NewOpc = ARM64::ADDSWri; break;
-  case ARM64::ADDXrr:    NewOpc = ARM64::ADDSXrr; break;
-  case ARM64::ADDXri:    NewOpc = ARM64::ADDSXri; break;
-  case ARM64::ADCWr:     NewOpc = ARM64::ADCSWr; break;
-  case ARM64::ADCXr:     NewOpc = ARM64::ADCSXr; break;
-  case ARM64::SUBWrr:    NewOpc = ARM64::SUBSWrr; break;
-  case ARM64::SUBWri:    NewOpc = ARM64::SUBSWri; break;
-  case ARM64::SUBXrr:    NewOpc = ARM64::SUBSXrr; break;
-  case ARM64::SUBXri:    NewOpc = ARM64::SUBSXri; break;
-  case ARM64::SBCWr:     NewOpc = ARM64::SBCSWr; break;
-  case ARM64::SBCXr:     NewOpc = ARM64::SBCSXr; break;
-  case ARM64::ANDWri:    NewOpc = ARM64::ANDSWri; break;
-  case ARM64::ANDXri:    NewOpc = ARM64::ANDSXri; break;
-  }
-
-  // Scan forward for the use of CPSR.
-  // When checking against MI: if it's a conditional code requires
-  // checking of V bit, then this is not safe to do.
-  // It is safe to remove CmpInstr if CPSR is redefined or killed.
-  // If we are done with the basic block, we need to check whether CPSR is
-  // live-out.
-  bool IsSafe = false;
-  for (MachineBasicBlock::iterator I = CmpInstr,
-                                   E = CmpInstr->getParent()->end();
-       !IsSafe && ++I != E;) {
-    const MachineInstr &Instr = *I;
-    for (unsigned IO = 0, EO = Instr.getNumOperands(); !IsSafe && IO != EO;
-         ++IO) {
-      const MachineOperand &MO = Instr.getOperand(IO);
-      if (MO.isRegMask() && MO.clobbersPhysReg(ARM64::CPSR)) {
-        IsSafe = true;
-        break;
-      }
-      if (!MO.isReg() || MO.getReg() != ARM64::CPSR)
-        continue;
-      if (MO.isDef()) {
-        IsSafe = true;
-        break;
-      }
-
-      // Decode the condition code.
-      unsigned Opc = Instr.getOpcode();
-      ARM64CC::CondCode CC;
-      switch (Opc) {
-      default:
-        return false;
-      case ARM64::Bcc:
-        CC = (ARM64CC::CondCode)Instr.getOperand(IO - 2).getImm();
-        break;
-      case ARM64::CSINVWr:
-      case ARM64::CSINVXr:
-      case ARM64::CSINCWr:
-      case ARM64::CSINCXr:
-      case ARM64::CSELWr:
-      case ARM64::CSELXr:
-      case ARM64::CSNEGWr:
-      case ARM64::CSNEGXr:
-        CC = (ARM64CC::CondCode)Instr.getOperand(IO - 1).getImm();
-        break;
-      }
-
-      // It is not safe to remove Compare instruction if Overflow(V) is used.
-      switch (CC) {
-      default:
-        // CPSR can be used multiple times, we should continue.
-        break;
-      case ARM64CC::VS:
-      case ARM64CC::VC:
-      case ARM64CC::GE:
-      case ARM64CC::LT:
-      case ARM64CC::GT:
-      case ARM64CC::LE:
-        return false;
-      }
-    }
-  }
-
-  // If CPSR is not killed nor re-defined, we should check whether it is
-  // live-out. If it is live-out, do not optimize.
-  if (!IsSafe) {
-    MachineBasicBlock *MBB = CmpInstr->getParent();
-    for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(),
-                                          SE = MBB->succ_end();
-         SI != SE; ++SI)
-      if ((*SI)->isLiveIn(ARM64::CPSR))
-        return false;
-  }
-
-  // Update the instruction to set CPSR.
-  MI->setDesc(get(NewOpc));
-  CmpInstr->eraseFromParent();
-  bool succeeded = UpdateOperandRegClass(MI);
-  (void)succeeded;
-  assert(succeeded && "Some operands reg class are incompatible!");
-  MI->addRegisterDefined(ARM64::CPSR, TRI);
-  return true;
-}
-
-// Return true if this instruction simply sets its single destination register
-// to zero. This is equivalent to a register rename of the zero-register.
-bool ARM64InstrInfo::isGPRZero(const MachineInstr *MI) const {
-  switch (MI->getOpcode()) {
-  default:
-    break;
-  case ARM64::MOVZWi:
-  case ARM64::MOVZXi: // movz Rd, #0 (LSL #0)
-    if (MI->getOperand(1).isImm() && MI->getOperand(1).getImm() == 0) {
-      assert(MI->getDesc().getNumOperands() == 3 &&
-             MI->getOperand(2).getImm() == 0 && "invalid MOVZi operands");
-      return true;
-    }
-    break;
-  case ARM64::ANDWri: // and Rd, Rzr, #imm
-    return MI->getOperand(1).getReg() == ARM64::WZR;
-  case ARM64::ANDXri:
-    return MI->getOperand(1).getReg() == ARM64::XZR;
-  case TargetOpcode::COPY:
-    return MI->getOperand(1).getReg() == ARM64::WZR;
-  }
-  return false;
-}
-
-// Return true if this instruction simply renames a general register without
-// modifying bits.
-bool ARM64InstrInfo::isGPRCopy(const MachineInstr *MI) const {
-  switch (MI->getOpcode()) {
-  default:
-    break;
-  case TargetOpcode::COPY: {
-    // GPR32 copies will by lowered to ORRXrs
-    unsigned DstReg = MI->getOperand(0).getReg();
-    return (ARM64::GPR32RegClass.contains(DstReg) ||
-            ARM64::GPR64RegClass.contains(DstReg));
-  }
-  case ARM64::ORRXrs: // orr Xd, Xzr, Xm (LSL #0)
-    if (MI->getOperand(1).getReg() == ARM64::XZR) {
-      assert(MI->getDesc().getNumOperands() == 4 &&
-             MI->getOperand(3).getImm() == 0 && "invalid ORRrs operands");
-      return true;
-    }
-  case ARM64::ADDXri: // add Xd, Xn, #0 (LSL #0)
-    if (MI->getOperand(2).getImm() == 0) {
-      assert(MI->getDesc().getNumOperands() == 4 &&
-             MI->getOperand(3).getImm() == 0 && "invalid ADDXri operands");
-      return true;
-    }
-  }
-  return false;
-}
-
-// Return true if this instruction simply renames a general register without
-// modifying bits.
-bool ARM64InstrInfo::isFPRCopy(const MachineInstr *MI) const {
-  switch (MI->getOpcode()) {
-  default:
-    break;
-  case TargetOpcode::COPY: {
-    // FPR64 copies will by lowered to ORR.16b
-    unsigned DstReg = MI->getOperand(0).getReg();
-    return (ARM64::FPR64RegClass.contains(DstReg) ||
-            ARM64::FPR128RegClass.contains(DstReg));
-  }
-  case ARM64::ORRv16i8:
-    if (MI->getOperand(1).getReg() == MI->getOperand(2).getReg()) {
-      assert(MI->getDesc().getNumOperands() == 3 && MI->getOperand(0).isReg() &&
-             "invalid ORRv16i8 operands");
-      return true;
-    }
-  }
-  return false;
-}
-
-unsigned ARM64InstrInfo::isLoadFromStackSlot(const MachineInstr *MI,
-                                             int &FrameIndex) const {
-  switch (MI->getOpcode()) {
-  default:
-    break;
-  case ARM64::LDRWui:
-  case ARM64::LDRXui:
-  case ARM64::LDRBui:
-  case ARM64::LDRHui:
-  case ARM64::LDRSui:
-  case ARM64::LDRDui:
-  case ARM64::LDRQui:
-    if (MI->getOperand(0).getSubReg() == 0 && MI->getOperand(1).isFI() &&
-        MI->getOperand(2).isImm() && MI->getOperand(2).getImm() == 0) {
-      FrameIndex = MI->getOperand(1).getIndex();
-      return MI->getOperand(0).getReg();
-    }
-    break;
-  }
-
-  return 0;
-}
-
-unsigned ARM64InstrInfo::isStoreToStackSlot(const MachineInstr *MI,
-                                            int &FrameIndex) const {
-  switch (MI->getOpcode()) {
-  default:
-    break;
-  case ARM64::STRWui:
-  case ARM64::STRXui:
-  case ARM64::STRBui:
-  case ARM64::STRHui:
-  case ARM64::STRSui:
-  case ARM64::STRDui:
-  case ARM64::STRQui:
-    if (MI->getOperand(0).getSubReg() == 0 && MI->getOperand(1).isFI() &&
-        MI->getOperand(2).isImm() && MI->getOperand(2).getImm() == 0) {
-      FrameIndex = MI->getOperand(1).getIndex();
-      return MI->getOperand(0).getReg();
-    }
-    break;
-  }
-  return 0;
-}
-
-/// Return true if this is load/store scales or extends its register offset.
-/// This refers to scaling a dynamic index as opposed to scaled immediates.
-/// MI should be a memory op that allows scaled addressing.
-bool ARM64InstrInfo::isScaledAddr(const MachineInstr *MI) const {
-  switch (MI->getOpcode()) {
-  default:
-    break;
-  case ARM64::LDRBBro:
-  case ARM64::LDRBro:
-  case ARM64::LDRDro:
-  case ARM64::LDRHHro:
-  case ARM64::LDRHro:
-  case ARM64::LDRQro:
-  case ARM64::LDRSBWro:
-  case ARM64::LDRSBXro:
-  case ARM64::LDRSHWro:
-  case ARM64::LDRSHXro:
-  case ARM64::LDRSWro:
-  case ARM64::LDRSro:
-  case ARM64::LDRWro:
-  case ARM64::LDRXro:
-  case ARM64::STRBBro:
-  case ARM64::STRBro:
-  case ARM64::STRDro:
-  case ARM64::STRHHro:
-  case ARM64::STRHro:
-  case ARM64::STRQro:
-  case ARM64::STRSro:
-  case ARM64::STRWro:
-  case ARM64::STRXro:
-    unsigned Val = MI->getOperand(3).getImm();
-    ARM64_AM::ExtendType ExtType = ARM64_AM::getMemExtendType(Val);
-    return (ExtType != ARM64_AM::UXTX) || ARM64_AM::getMemDoShift(Val);
-  }
-  return false;
-}
-
-/// Check all MachineMemOperands for a hint to suppress pairing.
-bool ARM64InstrInfo::isLdStPairSuppressed(const MachineInstr *MI) const {
-  assert(MOSuppressPair < (1 << MachineMemOperand::MOTargetNumBits) &&
-         "Too many target MO flags");
-  for (MachineInstr::mmo_iterator MM = MI->memoperands_begin(),
-                                  E = MI->memoperands_end();
-       MM != E; ++MM) {
-
-    if ((*MM)->getFlags() &
-        (MOSuppressPair << MachineMemOperand::MOTargetStartBit)) {
-      return true;
-    }
-  }
-  return false;
-}
-
-/// Set a flag on the first MachineMemOperand to suppress pairing.
-void ARM64InstrInfo::suppressLdStPair(MachineInstr *MI) const {
-  if (MI->memoperands_empty())
-    return;
-
-  assert(MOSuppressPair < (1 << MachineMemOperand::MOTargetNumBits) &&
-         "Too many target MO flags");
-  (*MI->memoperands_begin())
-      ->setFlags(MOSuppressPair << MachineMemOperand::MOTargetStartBit);
-}
-
-bool ARM64InstrInfo::getLdStBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg,
-                                          unsigned &Offset,
-                                          const TargetRegisterInfo *TRI) const {
-  switch (LdSt->getOpcode()) {
-  default:
-    return false;
-  case ARM64::STRSui:
-  case ARM64::STRDui:
-  case ARM64::STRQui:
-  case ARM64::STRXui:
-  case ARM64::STRWui:
-  case ARM64::LDRSui:
-  case ARM64::LDRDui:
-  case ARM64::LDRQui:
-  case ARM64::LDRXui:
-  case ARM64::LDRWui:
-    if (!LdSt->getOperand(1).isReg() || !LdSt->getOperand(2).isImm())
-      return false;
-    BaseReg = LdSt->getOperand(1).getReg();
-    MachineFunction &MF = *LdSt->getParent()->getParent();
-    unsigned Width = getRegClass(LdSt->getDesc(), 0, TRI, MF)->getSize();
-    Offset = LdSt->getOperand(2).getImm() * Width;
-    return true;
-  };
-}
-
-/// Detect opportunities for ldp/stp formation.
-///
-/// Only called for LdSt for which getLdStBaseRegImmOfs returns true.
-bool ARM64InstrInfo::shouldClusterLoads(MachineInstr *FirstLdSt,
-                                        MachineInstr *SecondLdSt,
-                                        unsigned NumLoads) const {
-  // Only cluster up to a single pair.
-  if (NumLoads > 1)
-    return false;
-  if (FirstLdSt->getOpcode() != SecondLdSt->getOpcode())
-    return false;
-  // getLdStBaseRegImmOfs guarantees that oper 2 isImm.
-  unsigned Ofs1 = FirstLdSt->getOperand(2).getImm();
-  // Allow 6 bits of positive range.
-  if (Ofs1 > 64)
-    return false;
-  // The caller should already have ordered First/SecondLdSt by offset.
-  unsigned Ofs2 = SecondLdSt->getOperand(2).getImm();
-  return Ofs1 + 1 == Ofs2;
-}
-
-bool ARM64InstrInfo::shouldScheduleAdjacent(MachineInstr *First,
-                                            MachineInstr *Second) const {
-  // Cyclone can fuse CMN, CMP followed by Bcc.
-
-  // FIXME: B0 can also fuse:
-  // AND, BIC, ORN, ORR, or EOR (optional S) followed by Bcc or CBZ or CBNZ.
-  if (Second->getOpcode() != ARM64::Bcc)
-    return false;
-  switch (First->getOpcode()) {
-  default:
-    return false;
-  case ARM64::SUBSWri:
-  case ARM64::ADDSWri:
-  case ARM64::ANDSWri:
-  case ARM64::SUBSXri:
-  case ARM64::ADDSXri:
-  case ARM64::ANDSXri:
-    return true;
-  }
-}
-
-MachineInstr *ARM64InstrInfo::emitFrameIndexDebugValue(MachineFunction &MF,
-                                                       int FrameIx,
-                                                       uint64_t Offset,
-                                                       const MDNode *MDPtr,
-                                                       DebugLoc DL) const {
-  MachineInstrBuilder MIB = BuildMI(MF, DL, get(ARM64::DBG_VALUE))
-                                .addFrameIndex(FrameIx)
-                                .addImm(0)
-                                .addImm(Offset)
-                                .addMetadata(MDPtr);
-  return &*MIB;
-}
-
-static const MachineInstrBuilder &AddSubReg(const MachineInstrBuilder &MIB,
-                                            unsigned Reg, unsigned SubIdx,
-                                            unsigned State,
-                                            const TargetRegisterInfo *TRI) {
-  if (!SubIdx)
-    return MIB.addReg(Reg, State);
-
-  if (TargetRegisterInfo::isPhysicalRegister(Reg))
-    return MIB.addReg(TRI->getSubReg(Reg, SubIdx), State);
-  return MIB.addReg(Reg, State, SubIdx);
-}
-
-static bool forwardCopyWillClobberTuple(unsigned DestReg, unsigned SrcReg,
-                                        unsigned NumRegs) {
-  // We really want the positive remainder mod 32 here, that happens to be
-  // easily obtainable with a mask.
-  return ((DestReg - SrcReg) & 0x1f) < NumRegs;
-}
-
-void ARM64InstrInfo::copyPhysRegTuple(MachineBasicBlock &MBB,
-                                      MachineBasicBlock::iterator I,
-                                      DebugLoc DL, unsigned DestReg,
-                                      unsigned SrcReg, bool KillSrc,
-                                      unsigned Opcode,
-                                      llvm::ArrayRef<unsigned> Indices) const {
-  const TargetRegisterInfo *TRI = &getRegisterInfo();
-  uint16_t DestEncoding = TRI->getEncodingValue(DestReg);
-  uint16_t SrcEncoding = TRI->getEncodingValue(SrcReg);
-  unsigned NumRegs = Indices.size();
-
-  int SubReg = 0, End = NumRegs, Incr = 1;
-  if (forwardCopyWillClobberTuple(DestEncoding, SrcEncoding, NumRegs)) {
-    SubReg = NumRegs - 1;
-    End = -1;
-    Incr = -1;
-  }
-
-  for (; SubReg != End; SubReg += Incr) {
-    const MachineInstrBuilder &MIB = BuildMI(MBB, I, DL, get(Opcode));
-    AddSubReg(MIB, DestReg, Indices[SubReg], RegState::Define, TRI);
-    AddSubReg(MIB, SrcReg, Indices[SubReg], 0, TRI);
-    AddSubReg(MIB, SrcReg, Indices[SubReg], getKillRegState(KillSrc), TRI);
-  }
-}
-
-void ARM64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
-                                 MachineBasicBlock::iterator I, DebugLoc DL,
-                                 unsigned DestReg, unsigned SrcReg,
-                                 bool KillSrc) const {
-  if (ARM64::GPR32spRegClass.contains(DestReg) &&
-      (ARM64::GPR32spRegClass.contains(SrcReg) || SrcReg == ARM64::WZR)) {
-    const TargetRegisterInfo *TRI = &getRegisterInfo();
-
-    if (DestReg == ARM64::WSP || SrcReg == ARM64::WSP) {
-      // If either operand is WSP, expand to ADD #0.
-      if (Subtarget.hasZeroCycleRegMove()) {
-        // Cyclone recognizes "ADD Xd, Xn, #0" as a zero-cycle register move.
-        unsigned DestRegX = TRI->getMatchingSuperReg(DestReg, ARM64::sub_32,
-                                                     &ARM64::GPR64spRegClass);
-        unsigned SrcRegX = TRI->getMatchingSuperReg(SrcReg, ARM64::sub_32,
-                                                    &ARM64::GPR64spRegClass);
-        // This instruction is reading and writing X registers.  This may upset
-        // the register scavenger and machine verifier, so we need to indicate
-        // that we are reading an undefined value from SrcRegX, but a proper
-        // value from SrcReg.
-        BuildMI(MBB, I, DL, get(ARM64::ADDXri), DestRegX)
-            .addReg(SrcRegX, RegState::Undef)
-            .addImm(0)
-            .addImm(ARM64_AM::getShifterImm(ARM64_AM::LSL, 0))
-            .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
-      } else {
-        BuildMI(MBB, I, DL, get(ARM64::ADDWri), DestReg)
-            .addReg(SrcReg, getKillRegState(KillSrc))
-            .addImm(0)
-            .addImm(ARM64_AM::getShifterImm(ARM64_AM::LSL, 0));
-      }
-    } else if (SrcReg == ARM64::WZR && Subtarget.hasZeroCycleZeroing()) {
-      BuildMI(MBB, I, DL, get(ARM64::MOVZWi), DestReg).addImm(0).addImm(
-          ARM64_AM::getShifterImm(ARM64_AM::LSL, 0));
-    } else {
-      if (Subtarget.hasZeroCycleRegMove()) {
-        // Cyclone recognizes "ORR Xd, XZR, Xm" as a zero-cycle register move.
-        unsigned DestRegX = TRI->getMatchingSuperReg(DestReg, ARM64::sub_32,
-                                                     &ARM64::GPR64spRegClass);
-        unsigned SrcRegX = TRI->getMatchingSuperReg(SrcReg, ARM64::sub_32,
-                                                    &ARM64::GPR64spRegClass);
-        // This instruction is reading and writing X registers.  This may upset
-        // the register scavenger and machine verifier, so we need to indicate
-        // that we are reading an undefined value from SrcRegX, but a proper
-        // value from SrcReg.
-        BuildMI(MBB, I, DL, get(ARM64::ORRXrr), DestRegX)
-            .addReg(ARM64::XZR)
-            .addReg(SrcRegX, RegState::Undef)
-            .addReg(SrcReg, RegState::Implicit | getKillRegState(KillSrc));
-      } else {
-        // Otherwise, expand to ORR WZR.
-        BuildMI(MBB, I, DL, get(ARM64::ORRWrr), DestReg)
-            .addReg(ARM64::WZR)
-            .addReg(SrcReg, getKillRegState(KillSrc));
-      }
-    }
-    return;
-  }
-
-  if (ARM64::GPR64spRegClass.contains(DestReg) &&
-      (ARM64::GPR64spRegClass.contains(SrcReg) || SrcReg == ARM64::XZR)) {
-    if (DestReg == ARM64::SP || SrcReg == ARM64::SP) {
-      // If either operand is SP, expand to ADD #0.
-      BuildMI(MBB, I, DL, get(ARM64::ADDXri), DestReg)
-          .addReg(SrcReg, getKillRegState(KillSrc))
-          .addImm(0)
-          .addImm(ARM64_AM::getShifterImm(ARM64_AM::LSL, 0));
-    } else if (SrcReg == ARM64::XZR && Subtarget.hasZeroCycleZeroing()) {
-      BuildMI(MBB, I, DL, get(ARM64::MOVZXi), DestReg).addImm(0).addImm(
-          ARM64_AM::getShifterImm(ARM64_AM::LSL, 0));
-    } else {
-      // Otherwise, expand to ORR XZR.
-      BuildMI(MBB, I, DL, get(ARM64::ORRXrr), DestReg)
-          .addReg(ARM64::XZR)
-          .addReg(SrcReg, getKillRegState(KillSrc));
-    }
-    return;
-  }
-
-  // Copy a DDDD register quad by copying the individual sub-registers.
-  if (ARM64::DDDDRegClass.contains(DestReg) &&
-      ARM64::DDDDRegClass.contains(SrcReg)) {
-    static const unsigned Indices[] = { ARM64::dsub0, ARM64::dsub1,
-                                        ARM64::dsub2, ARM64::dsub3 };
-    copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, ARM64::ORRv8i8,
-                     Indices);
-    return;
-  }
-
-  // Copy a DDD register triple by copying the individual sub-registers.
-  if (ARM64::DDDRegClass.contains(DestReg) &&
-      ARM64::DDDRegClass.contains(SrcReg)) {
-    static const unsigned Indices[] = { ARM64::dsub0, ARM64::dsub1,
-                                        ARM64::dsub2 };
-    copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, ARM64::ORRv8i8,
-                     Indices);
-    return;
-  }
-
-  // Copy a DD register pair by copying the individual sub-registers.
-  if (ARM64::DDRegClass.contains(DestReg) &&
-      ARM64::DDRegClass.contains(SrcReg)) {
-    static const unsigned Indices[] = { ARM64::dsub0, ARM64::dsub1 };
-    copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, ARM64::ORRv8i8,
-                     Indices);
-    return;
-  }
-
-  // Copy a QQQQ register quad by copying the individual sub-registers.
-  if (ARM64::QQQQRegClass.contains(DestReg) &&
-      ARM64::QQQQRegClass.contains(SrcReg)) {
-    static const unsigned Indices[] = { ARM64::qsub0, ARM64::qsub1,
-                                        ARM64::qsub2, ARM64::qsub3 };
-    copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, ARM64::ORRv16i8,
-                     Indices);
-    return;
-  }
-
-  // Copy a QQQ register triple by copying the individual sub-registers.
-  if (ARM64::QQQRegClass.contains(DestReg) &&
-      ARM64::QQQRegClass.contains(SrcReg)) {
-    static const unsigned Indices[] = { ARM64::qsub0, ARM64::qsub1,
-                                        ARM64::qsub2 };
-    copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, ARM64::ORRv16i8,
-                     Indices);
-    return;
-  }
-
-  // Copy a QQ register pair by copying the individual sub-registers.
-  if (ARM64::QQRegClass.contains(DestReg) &&
-      ARM64::QQRegClass.contains(SrcReg)) {
-    static const unsigned Indices[] = { ARM64::qsub0, ARM64::qsub1 };
-    copyPhysRegTuple(MBB, I, DL, DestReg, SrcReg, KillSrc, ARM64::ORRv16i8,
-                     Indices);
-    return;
-  }
-
-  if (ARM64::FPR128RegClass.contains(DestReg) &&
-      ARM64::FPR128RegClass.contains(SrcReg)) {
-    BuildMI(MBB, I, DL, get(ARM64::ORRv16i8), DestReg).addReg(SrcReg).addReg(
-        SrcReg, getKillRegState(KillSrc));
-    return;
-  }
-
-  if (ARM64::FPR64RegClass.contains(DestReg) &&
-      ARM64::FPR64RegClass.contains(SrcReg)) {
-    DestReg =
-        RI.getMatchingSuperReg(DestReg, ARM64::dsub, &ARM64::FPR128RegClass);
-    SrcReg =
-        RI.getMatchingSuperReg(SrcReg, ARM64::dsub, &ARM64::FPR128RegClass);
-    BuildMI(MBB, I, DL, get(ARM64::ORRv16i8), DestReg).addReg(SrcReg).addReg(
-        SrcReg, getKillRegState(KillSrc));
-    return;
-  }
-
-  if (ARM64::FPR32RegClass.contains(DestReg) &&
-      ARM64::FPR32RegClass.contains(SrcReg)) {
-    DestReg =
-        RI.getMatchingSuperReg(DestReg, ARM64::ssub, &ARM64::FPR128RegClass);
-    SrcReg =
-        RI.getMatchingSuperReg(SrcReg, ARM64::ssub, &ARM64::FPR128RegClass);
-    BuildMI(MBB, I, DL, get(ARM64::ORRv16i8), DestReg).addReg(SrcReg).addReg(
-        SrcReg, getKillRegState(KillSrc));
-    return;
-  }
-
-  if (ARM64::FPR16RegClass.contains(DestReg) &&
-      ARM64::FPR16RegClass.contains(SrcReg)) {
-    DestReg =
-        RI.getMatchingSuperReg(DestReg, ARM64::hsub, &ARM64::FPR128RegClass);
-    SrcReg =
-        RI.getMatchingSuperReg(SrcReg, ARM64::hsub, &ARM64::FPR128RegClass);
-    BuildMI(MBB, I, DL, get(ARM64::ORRv16i8), DestReg).addReg(SrcReg).addReg(
-        SrcReg, getKillRegState(KillSrc));
-    return;
-  }
-
-  if (ARM64::FPR8RegClass.contains(DestReg) &&
-      ARM64::FPR8RegClass.contains(SrcReg)) {
-    DestReg =
-        RI.getMatchingSuperReg(DestReg, ARM64::bsub, &ARM64::FPR128RegClass);
-    SrcReg =
-        RI.getMatchingSuperReg(SrcReg, ARM64::bsub, &ARM64::FPR128RegClass);
-    BuildMI(MBB, I, DL, get(ARM64::ORRv16i8), DestReg).addReg(SrcReg).addReg(
-        SrcReg, getKillRegState(KillSrc));
-    return;
-  }
-
-  // Copies between GPR64 and FPR64.
-  if (ARM64::FPR64RegClass.contains(DestReg) &&
-      ARM64::GPR64RegClass.contains(SrcReg)) {
-    BuildMI(MBB, I, DL, get(ARM64::FMOVXDr), DestReg)
-        .addReg(SrcReg, getKillRegState(KillSrc));
-    return;
-  }
-  if (ARM64::GPR64RegClass.contains(DestReg) &&
-      ARM64::FPR64RegClass.contains(SrcReg)) {
-    BuildMI(MBB, I, DL, get(ARM64::FMOVDXr), DestReg)
-        .addReg(SrcReg, getKillRegState(KillSrc));
-    return;
-  }
-  // Copies between GPR32 and FPR32.
-  if (ARM64::FPR32RegClass.contains(DestReg) &&
-      ARM64::GPR32RegClass.contains(SrcReg)) {
-    BuildMI(MBB, I, DL, get(ARM64::FMOVWSr), DestReg)
-        .addReg(SrcReg, getKillRegState(KillSrc));
-    return;
-  }
-  if (ARM64::GPR32RegClass.contains(DestReg) &&
-      ARM64::FPR32RegClass.contains(SrcReg)) {
-    BuildMI(MBB, I, DL, get(ARM64::FMOVSWr), DestReg)
-        .addReg(SrcReg, getKillRegState(KillSrc));
-    return;
-  }
-
-  assert(0 && "unimplemented reg-to-reg copy");
-}
-
-void ARM64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
-                                         MachineBasicBlock::iterator MBBI,
-                                         unsigned SrcReg, bool isKill, int FI,
-                                         const TargetRegisterClass *RC,
-                                         const TargetRegisterInfo *TRI) const {
-  DebugLoc DL;
-  if (MBBI != MBB.end())
-    DL = MBBI->getDebugLoc();
-  MachineFunction &MF = *MBB.getParent();
-  MachineFrameInfo &MFI = *MF.getFrameInfo();
-  unsigned Align = MFI.getObjectAlignment(FI);
-
-  MachinePointerInfo PtrInfo(PseudoSourceValue::getFixedStack(FI));
-  MachineMemOperand *MMO = MF.getMachineMemOperand(
-      PtrInfo, MachineMemOperand::MOStore, MFI.getObjectSize(FI), Align);
-  unsigned Opc = 0;
-  bool Offset = true;
-  switch (RC->getSize()) {
-  case 1:
-    if (ARM64::FPR8RegClass.hasSubClassEq(RC))
-      Opc = ARM64::STRBui;
-    break;
-  case 2:
-    if (ARM64::FPR16RegClass.hasSubClassEq(RC))
-      Opc = ARM64::STRHui;
-    break;
-  case 4:
-    if (ARM64::GPR32allRegClass.hasSubClassEq(RC)) {
-      Opc = ARM64::STRWui;
-      if (TargetRegisterInfo::isVirtualRegister(SrcReg))
-        MF.getRegInfo().constrainRegClass(SrcReg, &ARM64::GPR32RegClass);
-      else
-        assert(SrcReg != ARM64::WSP);
-    } else if (ARM64::FPR32RegClass.hasSubClassEq(RC))
-      Opc = ARM64::STRSui;
-    break;
-  case 8:
-    if (ARM64::GPR64allRegClass.hasSubClassEq(RC)) {
-      Opc = ARM64::STRXui;
-      if (TargetRegisterInfo::isVirtualRegister(SrcReg))
-        MF.getRegInfo().constrainRegClass(SrcReg, &ARM64::GPR64RegClass);
-      else
-        assert(SrcReg != ARM64::SP);
-    } else if (ARM64::FPR64RegClass.hasSubClassEq(RC))
-      Opc = ARM64::STRDui;
-    break;
-  case 16:
-    if (ARM64::FPR128RegClass.hasSubClassEq(RC))
-      Opc = ARM64::STRQui;
-    else if (ARM64::DDRegClass.hasSubClassEq(RC))
-      Opc = ARM64::ST1Twov1d, Offset = false;
-    break;
-  case 24:
-    if (ARM64::DDDRegClass.hasSubClassEq(RC))
-      Opc = ARM64::ST1Threev1d, Offset = false;
-    break;
-  case 32:
-    if (ARM64::DDDDRegClass.hasSubClassEq(RC))
-      Opc = ARM64::ST1Fourv1d, Offset = false;
-    else if (ARM64::QQRegClass.hasSubClassEq(RC))
-      Opc = ARM64::ST1Twov2d, Offset = false;
-    break;
-  case 48:
-    if (ARM64::QQQRegClass.hasSubClassEq(RC))
-      Opc = ARM64::ST1Threev2d, Offset = false;
-    break;
-  case 64:
-    if (ARM64::QQQQRegClass.hasSubClassEq(RC))
-      Opc = ARM64::ST1Fourv2d, Offset = false;
-    break;
-  }
-  assert(Opc && "Unknown register class");
-
-  const MachineInstrBuilder &MI = BuildMI(MBB, MBBI, DL, get(Opc))
-                                      .addReg(SrcReg, getKillRegState(isKill))
-                                      .addFrameIndex(FI);
-
-  if (Offset)
-    MI.addImm(0);
-  MI.addMemOperand(MMO);
-}
-
-void ARM64InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
-                                          MachineBasicBlock::iterator MBBI,
-                                          unsigned DestReg, int FI,
-                                          const TargetRegisterClass *RC,
-                                          const TargetRegisterInfo *TRI) const {
-  DebugLoc DL;
-  if (MBBI != MBB.end())
-    DL = MBBI->getDebugLoc();
-  MachineFunction &MF = *MBB.getParent();
-  MachineFrameInfo &MFI = *MF.getFrameInfo();
-  unsigned Align = MFI.getObjectAlignment(FI);
-  MachinePointerInfo PtrInfo(PseudoSourceValue::getFixedStack(FI));
-  MachineMemOperand *MMO = MF.getMachineMemOperand(
-      PtrInfo, MachineMemOperand::MOLoad, MFI.getObjectSize(FI), Align);
-
-  unsigned Opc = 0;
-  bool Offset = true;
-  switch (RC->getSize()) {
-  case 1:
-    if (ARM64::FPR8RegClass.hasSubClassEq(RC))
-      Opc = ARM64::LDRBui;
-    break;
-  case 2:
-    if (ARM64::FPR16RegClass.hasSubClassEq(RC))
-      Opc = ARM64::LDRHui;
-    break;
-  case 4:
-    if (ARM64::GPR32allRegClass.hasSubClassEq(RC)) {
-      Opc = ARM64::LDRWui;
-      if (TargetRegisterInfo::isVirtualRegister(DestReg))
-        MF.getRegInfo().constrainRegClass(DestReg, &ARM64::GPR32RegClass);
-      else
-        assert(DestReg != ARM64::WSP);
-    } else if (ARM64::FPR32RegClass.hasSubClassEq(RC))
-      Opc = ARM64::LDRSui;
-    break;
-  case 8:
-    if (ARM64::GPR64allRegClass.hasSubClassEq(RC)) {
-      Opc = ARM64::LDRXui;
-      if (TargetRegisterInfo::isVirtualRegister(DestReg))
-        MF.getRegInfo().constrainRegClass(DestReg, &ARM64::GPR64RegClass);
-      else
-        assert(DestReg != ARM64::SP);
-    } else if (ARM64::FPR64RegClass.hasSubClassEq(RC))
-      Opc = ARM64::LDRDui;
-    break;
-  case 16:
-    if (ARM64::FPR128RegClass.hasSubClassEq(RC))
-      Opc = ARM64::LDRQui;
-    else if (ARM64::DDRegClass.hasSubClassEq(RC))
-      Opc = ARM64::LD1Twov1d, Offset = false;
-    break;
-  case 24:
-    if (ARM64::DDDRegClass.hasSubClassEq(RC))
-      Opc = ARM64::LD1Threev1d, Offset = false;
-    break;
-  case 32:
-    if (ARM64::DDDDRegClass.hasSubClassEq(RC))
-      Opc = ARM64::LD1Fourv1d, Offset = false;
-    else if (ARM64::QQRegClass.hasSubClassEq(RC))
-      Opc = ARM64::LD1Twov2d, Offset = false;
-    break;
-  case 48:
-    if (ARM64::QQQRegClass.hasSubClassEq(RC))
-      Opc = ARM64::LD1Threev2d, Offset = false;
-    break;
-  case 64:
-    if (ARM64::QQQQRegClass.hasSubClassEq(RC))
-      Opc = ARM64::LD1Fourv2d, Offset = false;
-    break;
-  }
-  assert(Opc && "Unknown register class");
-
-  const MachineInstrBuilder &MI = BuildMI(MBB, MBBI, DL, get(Opc))
-                                      .addReg(DestReg, getDefRegState(true))
-                                      .addFrameIndex(FI);
-  if (Offset)
-    MI.addImm(0);
-  MI.addMemOperand(MMO);
-}
-
-void llvm::emitFrameOffset(MachineBasicBlock &MBB,
-                           MachineBasicBlock::iterator MBBI, DebugLoc DL,
-                           unsigned DestReg, unsigned SrcReg, int Offset,
-                           const ARM64InstrInfo *TII, MachineInstr::MIFlag Flag,
-                           bool SetCPSR) {
-  if (DestReg == SrcReg && Offset == 0)
-    return;
-
-  bool isSub = Offset < 0;
-  if (isSub)
-    Offset = -Offset;
-
-  // FIXME: If the offset won't fit in 24-bits, compute the offset into a
-  // scratch register.  If DestReg is a virtual register, use it as the
-  // scratch register; otherwise, create a new virtual register (to be
-  // replaced by the scavenger at the end of PEI).  That case can be optimized
-  // slightly if DestReg is SP which is always 16-byte aligned, so the scratch
-  // register can be loaded with offset%8 and the add/sub can use an extending
-  // instruction with LSL#3.
-  // Currently the function handles any offsets but generates a poor sequence
-  // of code.
-  //  assert(Offset < (1 << 24) && "unimplemented reg plus immediate");
-
-  unsigned Opc;
-  if (SetCPSR)
-    Opc = isSub ? ARM64::SUBSXri : ARM64::ADDSXri;
-  else
-    Opc = isSub ? ARM64::SUBXri : ARM64::ADDXri;
-  const unsigned MaxEncoding = 0xfff;
-  const unsigned ShiftSize = 12;
-  const unsigned MaxEncodableValue = MaxEncoding << ShiftSize;
-  while (((unsigned)Offset) >= (1 << ShiftSize)) {
-    unsigned ThisVal;
-    if (((unsigned)Offset) > MaxEncodableValue) {
-      ThisVal = MaxEncodableValue;
-    } else {
-      ThisVal = Offset & MaxEncodableValue;
-    }
-    assert((ThisVal >> ShiftSize) <= MaxEncoding &&
-           "Encoding cannot handle value that big");
-    BuildMI(MBB, MBBI, DL, TII->get(Opc), DestReg)
-        .addReg(SrcReg)
-        .addImm(ThisVal >> ShiftSize)
-        .addImm(ARM64_AM::getShifterImm(ARM64_AM::LSL, ShiftSize))
-        .setMIFlag(Flag);
-
-    SrcReg = DestReg;
-    Offset -= ThisVal;
-    if (Offset == 0)
-      return;
-  }
-  BuildMI(MBB, MBBI, DL, TII->get(Opc), DestReg)
-      .addReg(SrcReg)
-      .addImm(Offset)
-      .addImm(ARM64_AM::getShifterImm(ARM64_AM::LSL, 0))
-      .setMIFlag(Flag);
-}
-
-MachineInstr *
-ARM64InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI,
-                                      const SmallVectorImpl<unsigned> &Ops,
-                                      int FrameIndex) const {
-  // This is a bit of a hack. Consider this instruction:
-  //
-  //   %vreg0<def> = COPY %SP; GPR64all:%vreg0
-  //
-  // We explicitly chose GPR64all for the virtual register so such a copy might
-  // be eliminated by RegisterCoalescer. However, that may not be possible, and
-  // %vreg0 may even spill. We can't spill %SP, and since it is in the GPR64all
-  // register class, TargetInstrInfo::foldMemoryOperand() is going to try.
-  //
-  // To prevent that, we are going to constrain the %vreg0 register class here.
-  //
-  // <rdar://problem/11522048>
-  //
-  if (MI->isCopy()) {
-    unsigned DstReg = MI->getOperand(0).getReg();
-    unsigned SrcReg = MI->getOperand(1).getReg();
-    if (SrcReg == ARM64::SP && TargetRegisterInfo::isVirtualRegister(DstReg)) {
-      MF.getRegInfo().constrainRegClass(DstReg, &ARM64::GPR64RegClass);
-      return 0;
-    }
-    if (DstReg == ARM64::SP && TargetRegisterInfo::isVirtualRegister(SrcReg)) {
-      MF.getRegInfo().constrainRegClass(SrcReg, &ARM64::GPR64RegClass);
-      return 0;
-    }
-  }
-
-  // Cannot fold.
-  return 0;
-}
-
-int llvm::isARM64FrameOffsetLegal(const MachineInstr &MI, int &Offset,
-                                  bool *OutUseUnscaledOp,
-                                  unsigned *OutUnscaledOp,
-                                  int *EmittableOffset) {
-  int Scale = 1;
-  bool IsSigned = false;
-  // The ImmIdx should be changed case by case if it is not 2.
-  unsigned ImmIdx = 2;
-  unsigned UnscaledOp = 0;
-  // Set output values in case of early exit.
-  if (EmittableOffset)
-    *EmittableOffset = 0;
-  if (OutUseUnscaledOp)
-    *OutUseUnscaledOp = false;
-  if (OutUnscaledOp)
-    *OutUnscaledOp = 0;
-  switch (MI.getOpcode()) {
-  default:
-    assert(0 && "unhandled opcode in rewriteARM64FrameIndex");
-  // Vector spills/fills can't take an immediate offset.
-  case ARM64::LD1Twov2d:
-  case ARM64::LD1Threev2d:
-  case ARM64::LD1Fourv2d:
-  case ARM64::LD1Twov1d:
-  case ARM64::LD1Threev1d:
-  case ARM64::LD1Fourv1d:
-  case ARM64::ST1Twov2d:
-  case ARM64::ST1Threev2d:
-  case ARM64::ST1Fourv2d:
-  case ARM64::ST1Twov1d:
-  case ARM64::ST1Threev1d:
-  case ARM64::ST1Fourv1d:
-    return ARM64FrameOffsetCannotUpdate;
-  case ARM64::PRFMui:
-    Scale = 8;
-    UnscaledOp = ARM64::PRFUMi;
-    break;
-  case ARM64::LDRXui:
-    Scale = 8;
-    UnscaledOp = ARM64::LDURXi;
-    break;
-  case ARM64::LDRWui:
-    Scale = 4;
-    UnscaledOp = ARM64::LDURWi;
-    break;
-  case ARM64::LDRBui:
-    Scale = 1;
-    UnscaledOp = ARM64::LDURBi;
-    break;
-  case ARM64::LDRHui:
-    Scale = 2;
-    UnscaledOp = ARM64::LDURHi;
-    break;
-  case ARM64::LDRSui:
-    Scale = 4;
-    UnscaledOp = ARM64::LDURSi;
-    break;
-  case ARM64::LDRDui:
-    Scale = 8;
-    UnscaledOp = ARM64::LDURDi;
-    break;
-  case ARM64::LDRQui:
-    Scale = 16;
-    UnscaledOp = ARM64::LDURQi;
-    break;
-  case ARM64::LDRBBui:
-    Scale = 1;
-    UnscaledOp = ARM64::LDURBBi;
-    break;
-  case ARM64::LDRHHui:
-    Scale = 2;
-    UnscaledOp = ARM64::LDURHHi;
-    break;
-  case ARM64::LDRSBXui:
-    Scale = 1;
-    UnscaledOp = ARM64::LDURSBXi;
-    break;
-  case ARM64::LDRSBWui:
-    Scale = 1;
-    UnscaledOp = ARM64::LDURSBWi;
-    break;
-  case ARM64::LDRSHXui:
-    Scale = 2;
-    UnscaledOp = ARM64::LDURSHXi;
-    break;
-  case ARM64::LDRSHWui:
-    Scale = 2;
-    UnscaledOp = ARM64::LDURSHWi;
-    break;
-  case ARM64::LDRSWui:
-    Scale = 4;
-    UnscaledOp = ARM64::LDURSWi;
-    break;
-
-  case ARM64::STRXui:
-    Scale = 8;
-    UnscaledOp = ARM64::STURXi;
-    break;
-  case ARM64::STRWui:
-    Scale = 4;
-    UnscaledOp = ARM64::STURWi;
-    break;
-  case ARM64::STRBui:
-    Scale = 1;
-    UnscaledOp = ARM64::STURBi;
-    break;
-  case ARM64::STRHui:
-    Scale = 2;
-    UnscaledOp = ARM64::STURHi;
-    break;
-  case ARM64::STRSui:
-    Scale = 4;
-    UnscaledOp = ARM64::STURSi;
-    break;
-  case ARM64::STRDui:
-    Scale = 8;
-    UnscaledOp = ARM64::STURDi;
-    break;
-  case ARM64::STRQui:
-    Scale = 16;
-    UnscaledOp = ARM64::STURQi;
-    break;
-  case ARM64::STRBBui:
-    Scale = 1;
-    UnscaledOp = ARM64::STURBBi;
-    break;
-  case ARM64::STRHHui:
-    Scale = 2;
-    UnscaledOp = ARM64::STURHHi;
-    break;
-
-  case ARM64::LDPXi:
-  case ARM64::LDPDi:
-  case ARM64::STPXi:
-  case ARM64::STPDi:
-    IsSigned = true;
-    Scale = 8;
-    break;
-  case ARM64::LDPQi:
-  case ARM64::STPQi:
-    IsSigned = true;
-    Scale = 16;
-    break;
-  case ARM64::LDPWi:
-  case ARM64::LDPSi:
-  case ARM64::STPWi:
-  case ARM64::STPSi:
-    IsSigned = true;
-    Scale = 4;
-    break;
-
-  case ARM64::LDURXi:
-  case ARM64::LDURWi:
-  case ARM64::LDURBi:
-  case ARM64::LDURHi:
-  case ARM64::LDURSi:
-  case ARM64::LDURDi:
-  case ARM64::LDURQi:
-  case ARM64::LDURHHi:
-  case ARM64::LDURBBi:
-  case ARM64::LDURSBXi:
-  case ARM64::LDURSBWi:
-  case ARM64::LDURSHXi:
-  case ARM64::LDURSHWi:
-  case ARM64::LDURSWi:
-  case ARM64::STURXi:
-  case ARM64::STURWi:
-  case ARM64::STURBi:
-  case ARM64::STURHi:
-  case ARM64::STURSi:
-  case ARM64::STURDi:
-  case ARM64::STURQi:
-  case ARM64::STURBBi:
-  case ARM64::STURHHi:
-    Scale = 1;
-    break;
-  }
-
-  Offset += MI.getOperand(ImmIdx).getImm() * Scale;
-
-  bool useUnscaledOp = false;
-  // If the offset doesn't match the scale, we rewrite the instruction to
-  // use the unscaled instruction instead. Likewise, if we have a negative
-  // offset (and have an unscaled op to use).
-  if ((Offset & (Scale - 1)) != 0 || (Offset < 0 && UnscaledOp != 0))
-    useUnscaledOp = true;
-
-  // Use an unscaled addressing mode if the instruction has a negative offset
-  // (or if the instruction is already using an unscaled addressing mode).
-  unsigned MaskBits;
-  if (IsSigned) {
-    // ldp/stp instructions.
-    MaskBits = 7;
-    Offset /= Scale;
-  } else if (UnscaledOp == 0 || useUnscaledOp) {
-    MaskBits = 9;
-    IsSigned = true;
-    Scale = 1;
-  } else {
-    MaskBits = 12;
-    IsSigned = false;
-    Offset /= Scale;
-  }
-
-  // Attempt to fold address computation.
-  int MaxOff = (1 << (MaskBits - IsSigned)) - 1;
-  int MinOff = (IsSigned ? (-MaxOff - 1) : 0);
-  if (Offset >= MinOff && Offset <= MaxOff) {
-    if (EmittableOffset)
-      *EmittableOffset = Offset;
-    Offset = 0;
-  } else {
-    int NewOff = Offset < 0 ? MinOff : MaxOff;
-    if (EmittableOffset)
-      *EmittableOffset = NewOff;
-    Offset = (Offset - NewOff) * Scale;
-  }
-  if (OutUseUnscaledOp)
-    *OutUseUnscaledOp = useUnscaledOp;
-  if (OutUnscaledOp)
-    *OutUnscaledOp = UnscaledOp;
-  return ARM64FrameOffsetCanUpdate |
-         (Offset == 0 ? ARM64FrameOffsetIsLegal : 0);
-}
-
-bool llvm::rewriteARM64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
-                                  unsigned FrameReg, int &Offset,
-                                  const ARM64InstrInfo *TII) {
-  unsigned Opcode = MI.getOpcode();
-  unsigned ImmIdx = FrameRegIdx + 1;
-
-  if (Opcode == ARM64::ADDSXri || Opcode == ARM64::ADDXri) {
-    Offset += MI.getOperand(ImmIdx).getImm();
-    emitFrameOffset(*MI.getParent(), MI, MI.getDebugLoc(),
-                    MI.getOperand(0).getReg(), FrameReg, Offset, TII,
-                    MachineInstr::NoFlags, (Opcode == ARM64::ADDSXri));
-    MI.eraseFromParent();
-    Offset = 0;
-    return true;
-  }
-
-  int NewOffset;
-  unsigned UnscaledOp;
-  bool UseUnscaledOp;
-  int Status = isARM64FrameOffsetLegal(MI, Offset, &UseUnscaledOp, &UnscaledOp,
-                                       &NewOffset);
-  if (Status & ARM64FrameOffsetCanUpdate) {
-    if (Status & ARM64FrameOffsetIsLegal)
-      // Replace the FrameIndex with FrameReg.
-      MI.getOperand(FrameRegIdx).ChangeToRegister(FrameReg, false);
-    if (UseUnscaledOp)
-      MI.setDesc(TII->get(UnscaledOp));
-
-    MI.getOperand(ImmIdx).ChangeToImmediate(NewOffset);
-    return Offset == 0;
-  }
-
-  return false;
-}
-
-void ARM64InstrInfo::getNoopForMachoTarget(MCInst &NopInst) const {
-  NopInst.setOpcode(ARM64::HINT);
-  NopInst.addOperand(MCOperand::CreateImm(0));
-}
diff --git a/lib/Target/ARM64/ARM64InstrInfo.h b/lib/Target/ARM64/ARM64InstrInfo.h
deleted file mode 100644
index 2591ca0..0000000
--- a/lib/Target/ARM64/ARM64InstrInfo.h
+++ /dev/null
@@ -1,219 +0,0 @@
-//===- ARM64InstrInfo.h - ARM64 Instruction Information ---------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains the ARM64 implementation of the TargetInstrInfo class.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_TARGET_ARM64INSTRINFO_H
-#define LLVM_TARGET_ARM64INSTRINFO_H
-
-#include "ARM64.h"
-#include "ARM64RegisterInfo.h"
-#include "llvm/Target/TargetInstrInfo.h"
-
-#define GET_INSTRINFO_HEADER
-#include "ARM64GenInstrInfo.inc"
-
-namespace llvm {
-
-class ARM64Subtarget;
-class ARM64TargetMachine;
-
-class ARM64InstrInfo : public ARM64GenInstrInfo {
-  // Reserve bits in the MachineMemOperand target hint flags, starting at 1.
-  // They will be shifted into MOTargetHintStart when accessed.
-  enum TargetMemOperandFlags {
-    MOSuppressPair = 1
-  };
-
-  const ARM64RegisterInfo RI;
-  const ARM64Subtarget &Subtarget;
-
-public:
-  explicit ARM64InstrInfo(const ARM64Subtarget &STI);
-
-  /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info.  As
-  /// such, whenever a client has an instance of instruction info, it should
-  /// always be able to get register info as well (through this method).
-  const ARM64RegisterInfo &getRegisterInfo() const { return RI; }
-
-  unsigned GetInstSizeInBytes(const MachineInstr *MI) const;
-
-  bool isCoalescableExtInstr(const MachineInstr &MI, unsigned &SrcReg,
-                             unsigned &DstReg, unsigned &SubIdx) const override;
-
-  unsigned isLoadFromStackSlot(const MachineInstr *MI,
-                               int &FrameIndex) const override;
-  unsigned isStoreToStackSlot(const MachineInstr *MI,
-                              int &FrameIndex) const override;
-
-  /// \brief Does this instruction set its full destination register to zero?
-  bool isGPRZero(const MachineInstr *MI) const;
-
-  /// \brief Does this instruction rename a GPR without modifying bits?
-  bool isGPRCopy(const MachineInstr *MI) const;
-
-  /// \brief Does this instruction rename an FPR without modifying bits?
-  bool isFPRCopy(const MachineInstr *MI) const;
-
-  /// Return true if this is load/store scales or extends its register offset.
-  /// This refers to scaling a dynamic index as opposed to scaled immediates.
-  /// MI should be a memory op that allows scaled addressing.
-  bool isScaledAddr(const MachineInstr *MI) const;
-
-  /// Return true if pairing the given load or store is hinted to be
-  /// unprofitable.
-  bool isLdStPairSuppressed(const MachineInstr *MI) const;
-
-  /// Hint that pairing the given load or store is unprofitable.
-  void suppressLdStPair(MachineInstr *MI) const;
-
-  bool getLdStBaseRegImmOfs(MachineInstr *LdSt, unsigned &BaseReg,
-                            unsigned &Offset,
-                            const TargetRegisterInfo *TRI) const override;
-
-  bool enableClusterLoads() const override { return true; }
-
-  bool shouldClusterLoads(MachineInstr *FirstLdSt, MachineInstr *SecondLdSt,
-                          unsigned NumLoads) const override;
-
-  bool shouldScheduleAdjacent(MachineInstr *First,
-                              MachineInstr *Second) const override;
-
-  MachineInstr *emitFrameIndexDebugValue(MachineFunction &MF, int FrameIx,
-                                         uint64_t Offset, const MDNode *MDPtr,
-                                         DebugLoc DL) const;
-  void copyPhysRegTuple(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
-                        DebugLoc DL, unsigned DestReg, unsigned SrcReg,
-                        bool KillSrc, unsigned Opcode,
-                        llvm::ArrayRef<unsigned> Indices) const;
-  void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
-                   DebugLoc DL, unsigned DestReg, unsigned SrcReg,
-                   bool KillSrc) const override;
-
-  void storeRegToStackSlot(MachineBasicBlock &MBB,
-                           MachineBasicBlock::iterator MBBI, unsigned SrcReg,
-                           bool isKill, int FrameIndex,
-                           const TargetRegisterClass *RC,
-                           const TargetRegisterInfo *TRI) const override;
-
-  void loadRegFromStackSlot(MachineBasicBlock &MBB,
-                            MachineBasicBlock::iterator MBBI, unsigned DestReg,
-                            int FrameIndex, const TargetRegisterClass *RC,
-                            const TargetRegisterInfo *TRI) const override;
-
-  MachineInstr *
-  foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI,
-                        const SmallVectorImpl<unsigned> &Ops,
-                        int FrameIndex) const override;
-
-  bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
-                     MachineBasicBlock *&FBB,
-                     SmallVectorImpl<MachineOperand> &Cond,
-                     bool AllowModify = false) const override;
-  unsigned RemoveBranch(MachineBasicBlock &MBB) const override;
-  unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
-                        MachineBasicBlock *FBB,
-                        const SmallVectorImpl<MachineOperand> &Cond,
-                        DebugLoc DL) const override;
-  bool
-  ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override;
-  bool canInsertSelect(const MachineBasicBlock &,
-                       const SmallVectorImpl<MachineOperand> &Cond, unsigned,
-                       unsigned, int &, int &, int &) const override;
-  void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
-                    DebugLoc DL, unsigned DstReg,
-                    const SmallVectorImpl<MachineOperand> &Cond,
-                    unsigned TrueReg, unsigned FalseReg) const override;
-  void getNoopForMachoTarget(MCInst &NopInst) const override;
-
-  /// analyzeCompare - For a comparison instruction, return the source registers
-  /// in SrcReg and SrcReg2, and the value it compares against in CmpValue.
-  /// Return true if the comparison instruction can be analyzed.
-  bool analyzeCompare(const MachineInstr *MI, unsigned &SrcReg,
-                      unsigned &SrcReg2, int &CmpMask,
-                      int &CmpValue) const override;
-  /// optimizeCompareInstr - Convert the instruction supplying the argument to
-  /// the comparison into one that sets the zero bit in the flags register.
-  bool optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg,
-                            unsigned SrcReg2, int CmpMask, int CmpValue,
-                            const MachineRegisterInfo *MRI) const override;
-
-private:
-  void instantiateCondBranch(MachineBasicBlock &MBB, DebugLoc DL,
-                             MachineBasicBlock *TBB,
-                             const SmallVectorImpl<MachineOperand> &Cond) const;
-};
-
-/// emitFrameOffset - Emit instructions as needed to set DestReg to SrcReg
-/// plus Offset.  This is intended to be used from within the prolog/epilog
-/// insertion (PEI) pass, where a virtual scratch register may be allocated
-/// if necessary, to be replaced by the scavenger at the end of PEI.
-void emitFrameOffset(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
-                     DebugLoc DL, unsigned DestReg, unsigned SrcReg, int Offset,
-                     const ARM64InstrInfo *TII,
-                     MachineInstr::MIFlag = MachineInstr::NoFlags,
-                     bool SetCPSR = false);
-
-/// rewriteARM64FrameIndex - Rewrite MI to access 'Offset' bytes from the
-/// FP. Return false if the offset could not be handled directly in MI, and
-/// return the left-over portion by reference.
-bool rewriteARM64FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
-                            unsigned FrameReg, int &Offset,
-                            const ARM64InstrInfo *TII);
-
-/// \brief Use to report the frame offset status in isARM64FrameOffsetLegal.
-enum ARM64FrameOffsetStatus {
-  ARM64FrameOffsetCannotUpdate = 0x0, ///< Offset cannot apply.
-  ARM64FrameOffsetIsLegal = 0x1,      ///< Offset is legal.
-  ARM64FrameOffsetCanUpdate = 0x2     ///< Offset can apply, at least partly.
-};
-
-/// \brief Check if the @p Offset is a valid frame offset for @p MI.
-/// The returned value reports the validity of the frame offset for @p MI.
-/// It uses the values defined by ARM64FrameOffsetStatus for that.
-/// If result == ARM64FrameOffsetCannotUpdate, @p MI cannot be updated to
-/// use an offset.eq
-/// If result & ARM64FrameOffsetIsLegal, @p Offset can completely be
-/// rewriten in @p MI.
-/// If result & ARM64FrameOffsetCanUpdate, @p Offset contains the
-/// amount that is off the limit of the legal offset.
-/// If set, @p OutUseUnscaledOp will contain the whether @p MI should be
-/// turned into an unscaled operator, which opcode is in @p OutUnscaledOp.
-/// If set, @p EmittableOffset contains the amount that can be set in @p MI
-/// (possibly with @p OutUnscaledOp if OutUseUnscaledOp is true) and that
-/// is a legal offset.
-int isARM64FrameOffsetLegal(const MachineInstr &MI, int &Offset,
-                            bool *OutUseUnscaledOp = NULL,
-                            unsigned *OutUnscaledOp = NULL,
-                            int *EmittableOffset = NULL);
-
-static inline bool isUncondBranchOpcode(int Opc) { return Opc == ARM64::B; }
-
-static inline bool isCondBranchOpcode(int Opc) {
-  switch (Opc) {
-  case ARM64::Bcc:
-  case ARM64::CBZW:
-  case ARM64::CBZX:
-  case ARM64::CBNZW:
-  case ARM64::CBNZX:
-  case ARM64::TBZ:
-  case ARM64::TBNZ:
-    return true;
-  default:
-    return false;
-  }
-}
-
-static inline bool isIndirectBranchOpcode(int Opc) { return Opc == ARM64::BR; }
-
-} // end namespace llvm
-
-#endif
diff --git a/lib/Target/ARM64/ARM64InstrInfo.td b/lib/Target/ARM64/ARM64InstrInfo.td
deleted file mode 100644
index 2fe1720..0000000
--- a/lib/Target/ARM64/ARM64InstrInfo.td
+++ /dev/null
@@ -1,4458 +0,0 @@
-//===- ARM64InstrInfo.td - Describe the ARM64 Instructions -*- tablegen -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// ARM64 Instruction definitions.
-//
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// ARM64-specific DAG Nodes.
-//
-
-// SDTBinaryArithWithFlagsOut - RES1, FLAGS = op LHS, RHS
-def SDTBinaryArithWithFlagsOut : SDTypeProfile<2, 2,
-                                              [SDTCisSameAs<0, 2>,
-                                               SDTCisSameAs<0, 3>,
-                                               SDTCisInt<0>, SDTCisVT<1, i32>]>;
-
-// SDTBinaryArithWithFlagsIn - RES1, FLAGS = op LHS, RHS, FLAGS
-def SDTBinaryArithWithFlagsIn : SDTypeProfile<1, 3,
-                                            [SDTCisSameAs<0, 1>,
-                                             SDTCisSameAs<0, 2>,
-                                             SDTCisInt<0>,
-                                             SDTCisVT<3, i32>]>;
-
-// SDTBinaryArithWithFlagsInOut - RES1, FLAGS = op LHS, RHS, FLAGS
-def SDTBinaryArithWithFlagsInOut : SDTypeProfile<2, 3,
-                                            [SDTCisSameAs<0, 2>,
-                                             SDTCisSameAs<0, 3>,
-                                             SDTCisInt<0>,
-                                             SDTCisVT<1, i32>,
-                                             SDTCisVT<4, i32>]>;
-
-def SDT_ARM64Brcond  : SDTypeProfile<0, 3,
-                                     [SDTCisVT<0, OtherVT>, SDTCisVT<1, i32>,
-                                      SDTCisVT<2, i32>]>;
-def SDT_ARM64cbz : SDTypeProfile<0, 2, [SDTCisInt<0>, SDTCisVT<1, OtherVT>]>;
-def SDT_ARM64tbz : SDTypeProfile<0, 3, [SDTCisVT<0, i64>, SDTCisVT<1, i64>,
-                                        SDTCisVT<2, OtherVT>]>;
-
-
-def SDT_ARM64CSel  : SDTypeProfile<1, 4,
-                                   [SDTCisSameAs<0, 1>,
-                                    SDTCisSameAs<0, 2>,
-                                    SDTCisInt<3>,
-                                    SDTCisVT<4, i32>]>;
-def SDT_ARM64FCmp   : SDTypeProfile<0, 2,
-                                   [SDTCisFP<0>,
-                                    SDTCisSameAs<0, 1>]>;
-def SDT_ARM64Dup   : SDTypeProfile<1, 1, [SDTCisVec<0>]>;
-def SDT_ARM64DupLane   : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisInt<2>]>;
-def SDT_ARM64Zip   : SDTypeProfile<1, 2, [SDTCisVec<0>,
-                                          SDTCisSameAs<0, 1>,
-                                          SDTCisSameAs<0, 2>]>;
-def SDT_ARM64MOVIedit : SDTypeProfile<1, 1, [SDTCisInt<1>]>;
-def SDT_ARM64MOVIshift : SDTypeProfile<1, 2, [SDTCisInt<1>, SDTCisInt<2>]>;
-def SDT_ARM64vecimm : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
-                                           SDTCisInt<2>, SDTCisInt<3>]>;
-def SDT_ARM64UnaryVec: SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0,1>]>;
-def SDT_ARM64ExtVec: SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
-                                          SDTCisSameAs<0,2>, SDTCisInt<3>]>;
-def SDT_ARM64vshift : SDTypeProfile<1, 2, [SDTCisSameAs<0,1>, SDTCisInt<2>]>;
-
-def SDT_ARM64unvec : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0,1>]>;
-def SDT_ARM64fcmpz : SDTypeProfile<1, 1, []>;
-def SDT_ARM64fcmp  : SDTypeProfile<1, 2, [SDTCisSameAs<1,2>]>;
-def SDT_ARM64binvec : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
-                                           SDTCisSameAs<0,2>]>;
-def SDT_ARM64trivec : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
-                                           SDTCisSameAs<0,2>,
-                                           SDTCisSameAs<0,3>]>;
-def SDT_ARM64TCRET : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;
-def SDT_ARM64PREFETCH : SDTypeProfile<0, 2, [SDTCisVT<0, i32>, SDTCisPtrTy<1>]>;
-
-def SDT_ARM64ITOF  : SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisSameAs<0,1>]>;
-
-def SDT_ARM64TLSDescCall : SDTypeProfile<0, -2, [SDTCisPtrTy<0>,
-                                                 SDTCisPtrTy<1>]>;
-def SDT_ARM64WrapperLarge : SDTypeProfile<1, 4,
-                                        [SDTCisVT<0, i64>, SDTCisVT<1, i32>,
-                                         SDTCisSameAs<1, 2>, SDTCisSameAs<1, 3>,
-                                         SDTCisSameAs<1, 4>]>;
-
-
-// Node definitions.
-def ARM64adrp          : SDNode<"ARM64ISD::ADRP", SDTIntUnaryOp, []>;
-def ARM64addlow        : SDNode<"ARM64ISD::ADDlow", SDTIntBinOp, []>;
-def ARM64LOADgot       : SDNode<"ARM64ISD::LOADgot", SDTIntUnaryOp>;
-def ARM64callseq_start : SDNode<"ISD::CALLSEQ_START",
-                                SDCallSeqStart<[ SDTCisVT<0, i32> ]>,
-                                [SDNPHasChain, SDNPOutGlue]>;
-def ARM64callseq_end   : SDNode<"ISD::CALLSEQ_END",
-                                SDCallSeqEnd<[ SDTCisVT<0, i32>,
-                                               SDTCisVT<1, i32> ]>,
-                                [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
-def ARM64call          : SDNode<"ARM64ISD::CALL",
-                                SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>,
-                                [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
-                                 SDNPVariadic]>;
-def ARM64brcond        : SDNode<"ARM64ISD::BRCOND", SDT_ARM64Brcond,
-                                [SDNPHasChain]>;
-def ARM64cbz           : SDNode<"ARM64ISD::CBZ", SDT_ARM64cbz,
-                                [SDNPHasChain]>;
-def ARM64cbnz           : SDNode<"ARM64ISD::CBNZ", SDT_ARM64cbz,
-                                [SDNPHasChain]>;
-def ARM64tbz           : SDNode<"ARM64ISD::TBZ", SDT_ARM64tbz,
-                                [SDNPHasChain]>;
-def ARM64tbnz           : SDNode<"ARM64ISD::TBNZ", SDT_ARM64tbz,
-                                [SDNPHasChain]>;
-
-
-def ARM64csel          : SDNode<"ARM64ISD::CSEL", SDT_ARM64CSel>;
-def ARM64csinv         : SDNode<"ARM64ISD::CSINV", SDT_ARM64CSel>;
-def ARM64csneg         : SDNode<"ARM64ISD::CSNEG", SDT_ARM64CSel>;
-def ARM64csinc         : SDNode<"ARM64ISD::CSINC", SDT_ARM64CSel>;
-def ARM64retflag       : SDNode<"ARM64ISD::RET_FLAG", SDTNone,
-                                [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
-def ARM64adc       : SDNode<"ARM64ISD::ADC",  SDTBinaryArithWithFlagsIn >;
-def ARM64sbc       : SDNode<"ARM64ISD::SBC",  SDTBinaryArithWithFlagsIn>;
-def ARM64add_flag  : SDNode<"ARM64ISD::ADDS",  SDTBinaryArithWithFlagsOut,
-                            [SDNPCommutative]>;
-def ARM64sub_flag  : SDNode<"ARM64ISD::SUBS",  SDTBinaryArithWithFlagsOut>;
-def ARM64and_flag  : SDNode<"ARM64ISD::ANDS",  SDTBinaryArithWithFlagsOut>;
-def ARM64adc_flag  : SDNode<"ARM64ISD::ADCS",  SDTBinaryArithWithFlagsInOut>;
-def ARM64sbc_flag  : SDNode<"ARM64ISD::SBCS",  SDTBinaryArithWithFlagsInOut>;
-
-def ARM64threadpointer : SDNode<"ARM64ISD::THREAD_POINTER", SDTPtrLeaf>;
-
-def ARM64fcmp      : SDNode<"ARM64ISD::FCMP", SDT_ARM64FCmp>;
-
-def ARM64fmax      : SDNode<"ARM64ISD::FMAX", SDTFPBinOp>;
-def ARM64fmin      : SDNode<"ARM64ISD::FMIN", SDTFPBinOp>;
-
-def ARM64dup       : SDNode<"ARM64ISD::DUP", SDT_ARM64Dup>;
-def ARM64duplane8  : SDNode<"ARM64ISD::DUPLANE8", SDT_ARM64DupLane>;
-def ARM64duplane16 : SDNode<"ARM64ISD::DUPLANE16", SDT_ARM64DupLane>;
-def ARM64duplane32 : SDNode<"ARM64ISD::DUPLANE32", SDT_ARM64DupLane>;
-def ARM64duplane64 : SDNode<"ARM64ISD::DUPLANE64", SDT_ARM64DupLane>;
-
-def ARM64zip1      : SDNode<"ARM64ISD::ZIP1", SDT_ARM64Zip>;
-def ARM64zip2      : SDNode<"ARM64ISD::ZIP2", SDT_ARM64Zip>;
-def ARM64uzp1      : SDNode<"ARM64ISD::UZP1", SDT_ARM64Zip>;
-def ARM64uzp2      : SDNode<"ARM64ISD::UZP2", SDT_ARM64Zip>;
-def ARM64trn1      : SDNode<"ARM64ISD::TRN1", SDT_ARM64Zip>;
-def ARM64trn2      : SDNode<"ARM64ISD::TRN2", SDT_ARM64Zip>;
-
-def ARM64movi_edit : SDNode<"ARM64ISD::MOVIedit", SDT_ARM64MOVIedit>;
-def ARM64movi_shift : SDNode<"ARM64ISD::MOVIshift", SDT_ARM64MOVIshift>;
-def ARM64movi_msl : SDNode<"ARM64ISD::MOVImsl", SDT_ARM64MOVIshift>;
-def ARM64mvni_shift : SDNode<"ARM64ISD::MVNIshift", SDT_ARM64MOVIshift>;
-def ARM64mvni_msl : SDNode<"ARM64ISD::MVNImsl", SDT_ARM64MOVIshift>;
-def ARM64movi : SDNode<"ARM64ISD::MOVI", SDT_ARM64MOVIedit>;
-def ARM64fmov : SDNode<"ARM64ISD::FMOV", SDT_ARM64MOVIedit>;
-
-def ARM64rev16 : SDNode<"ARM64ISD::REV16", SDT_ARM64UnaryVec>;
-def ARM64rev32 : SDNode<"ARM64ISD::REV32", SDT_ARM64UnaryVec>;
-def ARM64rev64 : SDNode<"ARM64ISD::REV64", SDT_ARM64UnaryVec>;
-def ARM64ext : SDNode<"ARM64ISD::EXT", SDT_ARM64ExtVec>;
-
-def ARM64vashr : SDNode<"ARM64ISD::VASHR", SDT_ARM64vshift>;
-def ARM64vlshr : SDNode<"ARM64ISD::VLSHR", SDT_ARM64vshift>;
-def ARM64vshl : SDNode<"ARM64ISD::VSHL", SDT_ARM64vshift>;
-def ARM64sqshli : SDNode<"ARM64ISD::SQSHL_I", SDT_ARM64vshift>;
-def ARM64uqshli : SDNode<"ARM64ISD::UQSHL_I", SDT_ARM64vshift>;
-def ARM64sqshlui : SDNode<"ARM64ISD::SQSHLU_I", SDT_ARM64vshift>;
-def ARM64srshri : SDNode<"ARM64ISD::SRSHR_I", SDT_ARM64vshift>;
-def ARM64urshri : SDNode<"ARM64ISD::URSHR_I", SDT_ARM64vshift>;
-
-def ARM64not: SDNode<"ARM64ISD::NOT", SDT_ARM64unvec>;
-def ARM64bit: SDNode<"ARM64ISD::BIT", SDT_ARM64trivec>;
-
-def ARM64cmeq: SDNode<"ARM64ISD::CMEQ", SDT_ARM64binvec>;
-def ARM64cmge: SDNode<"ARM64ISD::CMGE", SDT_ARM64binvec>;
-def ARM64cmgt: SDNode<"ARM64ISD::CMGT", SDT_ARM64binvec>;
-def ARM64cmhi: SDNode<"ARM64ISD::CMHI", SDT_ARM64binvec>;
-def ARM64cmhs: SDNode<"ARM64ISD::CMHS", SDT_ARM64binvec>;
-
-def ARM64fcmeq: SDNode<"ARM64ISD::FCMEQ", SDT_ARM64fcmp>;
-def ARM64fcmge: SDNode<"ARM64ISD::FCMGE", SDT_ARM64fcmp>;
-def ARM64fcmgt: SDNode<"ARM64ISD::FCMGT", SDT_ARM64fcmp>;
-
-def ARM64cmeqz: SDNode<"ARM64ISD::CMEQz", SDT_ARM64unvec>;
-def ARM64cmgez: SDNode<"ARM64ISD::CMGEz", SDT_ARM64unvec>;
-def ARM64cmgtz: SDNode<"ARM64ISD::CMGTz", SDT_ARM64unvec>;
-def ARM64cmlez: SDNode<"ARM64ISD::CMLEz", SDT_ARM64unvec>;
-def ARM64cmltz: SDNode<"ARM64ISD::CMLTz", SDT_ARM64unvec>;
-def ARM64cmtst : PatFrag<(ops node:$LHS, node:$RHS),
-                         (ARM64not (ARM64cmeqz (and node:$LHS, node:$RHS)))>;
-
-def ARM64fcmeqz: SDNode<"ARM64ISD::FCMEQz", SDT_ARM64fcmpz>;
-def ARM64fcmgez: SDNode<"ARM64ISD::FCMGEz", SDT_ARM64fcmpz>;
-def ARM64fcmgtz: SDNode<"ARM64ISD::FCMGTz", SDT_ARM64fcmpz>;
-def ARM64fcmlez: SDNode<"ARM64ISD::FCMLEz", SDT_ARM64fcmpz>;
-def ARM64fcmltz: SDNode<"ARM64ISD::FCMLTz", SDT_ARM64fcmpz>;
-
-def ARM64bici: SDNode<"ARM64ISD::BICi", SDT_ARM64vecimm>;
-def ARM64orri: SDNode<"ARM64ISD::ORRi", SDT_ARM64vecimm>;
-
-def ARM64neg : SDNode<"ARM64ISD::NEG", SDT_ARM64unvec>;
-
-def ARM64tcret: SDNode<"ARM64ISD::TC_RETURN", SDT_ARM64TCRET,
-                  [SDNPHasChain,  SDNPOptInGlue, SDNPVariadic]>;
-
-def ARM64Prefetch        : SDNode<"ARM64ISD::PREFETCH", SDT_ARM64PREFETCH,
-                               [SDNPHasChain, SDNPSideEffect]>;
-
-def ARM64sitof: SDNode<"ARM64ISD::SITOF", SDT_ARM64ITOF>;
-def ARM64uitof: SDNode<"ARM64ISD::UITOF", SDT_ARM64ITOF>;
-
-def ARM64tlsdesc_call : SDNode<"ARM64ISD::TLSDESC_CALL", SDT_ARM64TLSDescCall,
-                               [SDNPInGlue, SDNPOutGlue, SDNPHasChain,
-                                SDNPVariadic]>;
-
-def ARM64WrapperLarge : SDNode<"ARM64ISD::WrapperLarge", SDT_ARM64WrapperLarge>;
-
-
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-
-// ARM64 Instruction Predicate Definitions.
-//
-def HasZCZ    : Predicate<"Subtarget->hasZeroCycleZeroing()">;
-def NoZCZ     : Predicate<"!Subtarget->hasZeroCycleZeroing()">;
-def IsDarwin  : Predicate<"Subtarget->isTargetDarwin()">;
-def IsNotDarwin: Predicate<"!Subtarget->isTargetDarwin()">;
-def ForCodeSize   : Predicate<"ForCodeSize">;
-def NotForCodeSize   : Predicate<"!ForCodeSize">;
-
-include "ARM64InstrFormats.td"
-
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// Miscellaneous instructions.
-//===----------------------------------------------------------------------===//
-
-let Defs = [SP], Uses = [SP], hasSideEffects = 1, isCodeGenOnly = 1 in {
-def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt),
-                              [(ARM64callseq_start timm:$amt)]>;
-def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
-                            [(ARM64callseq_end timm:$amt1, timm:$amt2)]>;
-} // Defs = [SP], Uses = [SP], hasSideEffects = 1, isCodeGenOnly = 1
-
-let isReMaterializable = 1, isCodeGenOnly = 1 in {
-// FIXME: The following pseudo instructions are only needed because remat
-// cannot handle multiple instructions.  When that changes, they can be
-// removed, along with the ARM64Wrapper node.
-
-let AddedComplexity = 10 in
-def LOADgot : Pseudo<(outs GPR64:$dst), (ins i64imm:$addr),
-                     [(set GPR64:$dst, (ARM64LOADgot tglobaladdr:$addr))]>,
-              Sched<[WriteLDAdr]>;
-
-// The MOVaddr instruction should match only when the add is not folded
-// into a load or store address.
-def MOVaddr
-    : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low),
-             [(set GPR64:$dst, (ARM64addlow (ARM64adrp tglobaladdr:$hi),
-                                            tglobaladdr:$low))]>,
-      Sched<[WriteAdrAdr]>;
-def MOVaddrJT
-    : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low),
-             [(set GPR64:$dst, (ARM64addlow (ARM64adrp tjumptable:$hi),
-                                             tjumptable:$low))]>,
-      Sched<[WriteAdrAdr]>;
-def MOVaddrCP
-    : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low),
-             [(set GPR64:$dst, (ARM64addlow (ARM64adrp tconstpool:$hi),
-                                             tconstpool:$low))]>,
-      Sched<[WriteAdrAdr]>;
-def MOVaddrBA
-    : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low),
-             [(set GPR64:$dst, (ARM64addlow (ARM64adrp tblockaddress:$hi),
-                                             tblockaddress:$low))]>,
-      Sched<[WriteAdrAdr]>;
-def MOVaddrTLS
-    : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low),
-             [(set GPR64:$dst, (ARM64addlow (ARM64adrp tglobaltlsaddr:$hi),
-                                            tglobaltlsaddr:$low))]>,
-      Sched<[WriteAdrAdr]>;
-def MOVaddrEXT
-    : Pseudo<(outs GPR64:$dst), (ins i64imm:$hi, i64imm:$low),
-             [(set GPR64:$dst, (ARM64addlow (ARM64adrp texternalsym:$hi),
-                                            texternalsym:$low))]>,
-      Sched<[WriteAdrAdr]>;
-
-} // isReMaterializable, isCodeGenOnly
-
-def : Pat<(ARM64LOADgot tglobaltlsaddr:$addr),
-          (LOADgot tglobaltlsaddr:$addr)>;
-
-def : Pat<(ARM64LOADgot texternalsym:$addr),
-          (LOADgot texternalsym:$addr)>;
-
-def : Pat<(ARM64LOADgot tconstpool:$addr),
-          (LOADgot tconstpool:$addr)>;
-
-//===----------------------------------------------------------------------===//
-// System instructions.
-//===----------------------------------------------------------------------===//
-
-def HINT  : HintI<"hint">;
-def : InstAlias<"nop",  (HINT 0b000)>;
-def : InstAlias<"yield",(HINT 0b001)>;
-def : InstAlias<"wfe",  (HINT 0b010)>;
-def : InstAlias<"wfi",  (HINT 0b011)>;
-def : InstAlias<"sev",  (HINT 0b100)>;
-def : InstAlias<"sevl", (HINT 0b101)>;
-
-  // As far as LLVM is concerned this writes to the system's exclusive monitors.
-let mayLoad = 1, mayStore = 1 in
-def CLREX : CRmSystemI<imm0_15, 0b010, "clrex">;
-
-def DMB   : CRmSystemI<barrier_op, 0b101, "dmb">;
-def DSB   : CRmSystemI<barrier_op, 0b100, "dsb">;
-def ISB   : CRmSystemI<barrier_op, 0b110, "isb">;
-def : InstAlias<"clrex", (CLREX 0xf)>;
-def : InstAlias<"isb", (ISB 0xf)>;
-
-def MRS    : MRSI;
-def MSR    : MSRI;
-def MSRcpsr: MSRcpsrI;
-
-// The thread pointer (on Linux, at least, where this has been implemented) is
-// TPIDR_EL0.
-def : Pat<(ARM64threadpointer), (MRS 0xde82)>;
-
-// Generic system instructions
-def SYS    : SystemI<0, "sys">;
-def SYSxt  : SystemXtI<0, "sys">;
-def SYSLxt : SystemLXtI<1, "sysl">;
-
-//===----------------------------------------------------------------------===//
-// Move immediate instructions.
-//===----------------------------------------------------------------------===//
-
-defm MOVK : InsertImmediate<0b11, "movk">;
-defm MOVN : MoveImmediate<0b00, "movn">;
-
-let PostEncoderMethod = "fixMOVZ" in
-defm MOVZ : MoveImmediate<0b10, "movz">;
-
-def : InstAlias<"movk $dst, $imm", (MOVKWi GPR32:$dst, imm0_65535:$imm, 0)>;
-def : InstAlias<"movk $dst, $imm", (MOVKXi GPR64:$dst, imm0_65535:$imm, 0)>;
-def : InstAlias<"movn $dst, $imm", (MOVNWi GPR32:$dst, imm0_65535:$imm, 0)>;
-def : InstAlias<"movn $dst, $imm", (MOVNXi GPR64:$dst, imm0_65535:$imm, 0)>;
-def : InstAlias<"movz $dst, $imm", (MOVZWi GPR32:$dst, imm0_65535:$imm, 0)>;
-def : InstAlias<"movz $dst, $imm", (MOVZXi GPR64:$dst, imm0_65535:$imm, 0)>;
-
-def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g3:$sym, 48)>;
-def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g2:$sym, 32)>;
-def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g1:$sym, 16)>;
-def : InstAlias<"movz $Rd, $sym", (MOVZXi GPR64:$Rd, movz_symbol_g0:$sym, 0)>;
-
-def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g3:$sym, 48)>;
-def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g2:$sym, 32)>;
-def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g1:$sym, 16)>;
-def : InstAlias<"movn $Rd, $sym", (MOVNXi GPR64:$Rd, movz_symbol_g0:$sym, 0)>;
-
-def : InstAlias<"movz $Rd, $sym", (MOVZWi GPR32:$Rd, movz_symbol_g3:$sym, 48)>;
-def : InstAlias<"movz $Rd, $sym", (MOVZWi GPR32:$Rd, movz_symbol_g2:$sym, 32)>;
-def : InstAlias<"movz $Rd, $sym", (MOVZWi GPR32:$Rd, movz_symbol_g1:$sym, 16)>;
-def : InstAlias<"movz $Rd, $sym", (MOVZWi GPR32:$Rd, movz_symbol_g0:$sym, 0)>;
-
-def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g2:$sym, 32)>;
-def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g1:$sym, 16)>;
-def : InstAlias<"movk $Rd, $sym", (MOVKXi GPR64:$Rd, movk_symbol_g0:$sym, 0)>;
-
-def : InstAlias<"movk $Rd, $sym", (MOVKWi GPR32:$Rd, movk_symbol_g2:$sym, 32)>;
-def : InstAlias<"movk $Rd, $sym", (MOVKWi GPR32:$Rd, movk_symbol_g1:$sym, 16)>;
-def : InstAlias<"movk $Rd, $sym", (MOVKWi GPR32:$Rd, movk_symbol_g0:$sym, 0)>;
-
-let isReMaterializable = 1, isCodeGenOnly = 1, isMoveImm = 1,
-    isAsCheapAsAMove = 1 in {
-// FIXME: The following pseudo instructions are only needed because remat
-// cannot handle multiple instructions.  When that changes, we can select
-// directly to the real instructions and get rid of these pseudos.
-
-def MOVi32imm
-    : Pseudo<(outs GPR32:$dst), (ins i32imm:$src),
-             [(set GPR32:$dst, imm:$src)]>,
-      Sched<[WriteImm]>;
-def MOVi64imm
-    : Pseudo<(outs GPR64:$dst), (ins i64imm:$src),
-             [(set GPR64:$dst, imm:$src)]>,
-      Sched<[WriteImm]>;
-} // isReMaterializable, isCodeGenOnly
-
-def : Pat<(ARM64WrapperLarge tglobaladdr:$g3, tglobaladdr:$g2,
-                             tglobaladdr:$g1, tglobaladdr:$g0),
-          (MOVKXi (MOVKXi (MOVKXi (MOVZXi tglobaladdr:$g3, 48),
-                                  tglobaladdr:$g2, 32),
-                          tglobaladdr:$g1, 16),
-                  tglobaladdr:$g0, 0)>;
-
-def : Pat<(ARM64WrapperLarge tblockaddress:$g3, tblockaddress:$g2,
-                             tblockaddress:$g1, tblockaddress:$g0),
-          (MOVKXi (MOVKXi (MOVKXi (MOVZXi tblockaddress:$g3, 48),
-                                  tblockaddress:$g2, 32),
-                          tblockaddress:$g1, 16),
-                  tblockaddress:$g0, 0)>;
-
-def : Pat<(ARM64WrapperLarge tconstpool:$g3, tconstpool:$g2,
-                             tconstpool:$g1, tconstpool:$g0),
-          (MOVKXi (MOVKXi (MOVKXi (MOVZXi tconstpool:$g3, 48),
-                                  tconstpool:$g2, 32),
-                          tconstpool:$g1, 16),
-                  tconstpool:$g0, 0)>;
-
-
-//===----------------------------------------------------------------------===//
-// Arithmetic instructions.
-//===----------------------------------------------------------------------===//
-
-// Add/subtract with carry.
-defm ADC : AddSubCarry<0, "adc", "adcs", ARM64adc, ARM64adc_flag>;
-defm SBC : AddSubCarry<1, "sbc", "sbcs", ARM64sbc, ARM64sbc_flag>;
-
-def : InstAlias<"ngc $dst, $src",  (SBCWr  GPR32:$dst, WZR, GPR32:$src)>;
-def : InstAlias<"ngc $dst, $src",  (SBCXr  GPR64:$dst, XZR, GPR64:$src)>;
-def : InstAlias<"ngcs $dst, $src", (SBCSWr GPR32:$dst, WZR, GPR32:$src)>;
-def : InstAlias<"ngcs $dst, $src", (SBCSXr GPR64:$dst, XZR, GPR64:$src)>;
-
-// Add/subtract
-defm ADD : AddSub<0, "add", add>;
-defm SUB : AddSub<1, "sub">;
-
-defm ADDS : AddSubS<0, "adds", ARM64add_flag>;
-defm SUBS : AddSubS<1, "subs", ARM64sub_flag>;
-
-// Use SUBS instead of SUB to enable CSE between SUBS and SUB.
-def : Pat<(sub GPR32sp:$Rn, addsub_shifted_imm32:$imm),
-          (SUBSWri GPR32sp:$Rn, addsub_shifted_imm32:$imm)>;
-def : Pat<(sub GPR64sp:$Rn, addsub_shifted_imm64:$imm),
-          (SUBSXri GPR64sp:$Rn, addsub_shifted_imm64:$imm)>;
-def : Pat<(sub GPR32:$Rn, GPR32:$Rm),
-          (SUBSWrr GPR32:$Rn, GPR32:$Rm)>;
-def : Pat<(sub GPR64:$Rn, GPR64:$Rm),
-          (SUBSXrr GPR64:$Rn, GPR64:$Rm)>;
-def : Pat<(sub GPR32:$Rn, arith_shifted_reg32:$Rm),
-          (SUBSWrs GPR32:$Rn, arith_shifted_reg32:$Rm)>;
-def : Pat<(sub GPR64:$Rn, arith_shifted_reg64:$Rm),
-          (SUBSXrs GPR64:$Rn, arith_shifted_reg64:$Rm)>;
-def : Pat<(sub GPR32sp:$R2, arith_extended_reg32<i32>:$R3),
-          (SUBSWrx GPR32sp:$R2, arith_extended_reg32<i32>:$R3)>;
-def : Pat<(sub GPR64sp:$R2, arith_extended_reg32to64<i64>:$R3),
-          (SUBSXrx GPR64sp:$R2, arith_extended_reg32to64<i64>:$R3)>;
-
-// Because of the immediate format for add/sub-imm instructions, the
-// expression (add x, -1) must be transformed to (SUB{W,X}ri x, 1).
-//  These patterns capture that transformation.
-let AddedComplexity = 1 in {
-def : Pat<(add GPR32:$Rn, neg_addsub_shifted_imm32:$imm),
-          (SUBSWri GPR32:$Rn, neg_addsub_shifted_imm32:$imm)>;
-def : Pat<(add GPR64:$Rn, neg_addsub_shifted_imm64:$imm),
-          (SUBSXri GPR64:$Rn, neg_addsub_shifted_imm64:$imm)>;
-def : Pat<(sub GPR32:$Rn, neg_addsub_shifted_imm32:$imm),
-          (ADDWri GPR32:$Rn, neg_addsub_shifted_imm32:$imm)>;
-def : Pat<(sub GPR64:$Rn, neg_addsub_shifted_imm64:$imm),
-          (ADDXri GPR64:$Rn, neg_addsub_shifted_imm64:$imm)>;
-}
-
-def : InstAlias<"neg $dst, $src", (SUBWrs GPR32:$dst, WZR, GPR32:$src, 0)>;
-def : InstAlias<"neg $dst, $src", (SUBXrs GPR64:$dst, XZR, GPR64:$src, 0)>;
-def : InstAlias<"neg $dst, $src, $shift",
-                (SUBWrs GPR32:$dst, WZR, GPR32:$src, arith_shift:$shift)>;
-def : InstAlias<"neg $dst, $src, $shift",
-                (SUBXrs GPR64:$dst, XZR, GPR64:$src, arith_shift:$shift)>;
-
-// Because of the immediate format for add/sub-imm instructions, the
-// expression (add x, -1) must be transformed to (SUB{W,X}ri x, 1).
-//  These patterns capture that transformation.
-let AddedComplexity = 1 in {
-def : Pat<(ARM64add_flag GPR32:$Rn, neg_addsub_shifted_imm32:$imm),
-          (SUBSWri GPR32:$Rn, neg_addsub_shifted_imm32:$imm)>;
-def : Pat<(ARM64add_flag GPR64:$Rn, neg_addsub_shifted_imm64:$imm),
-          (SUBSXri GPR64:$Rn, neg_addsub_shifted_imm64:$imm)>;
-def : Pat<(ARM64sub_flag GPR32:$Rn, neg_addsub_shifted_imm32:$imm),
-          (ADDSWri GPR32:$Rn, neg_addsub_shifted_imm32:$imm)>;
-def : Pat<(ARM64sub_flag GPR64:$Rn, neg_addsub_shifted_imm64:$imm),
-          (ADDSXri GPR64:$Rn, neg_addsub_shifted_imm64:$imm)>;
-}
-
-def : InstAlias<"negs $dst, $src", (SUBSWrs GPR32:$dst, WZR, GPR32:$src, 0)>;
-def : InstAlias<"negs $dst, $src", (SUBSXrs GPR64:$dst, XZR, GPR64:$src, 0)>;
-def : InstAlias<"negs $dst, $src, $shift",
-                (SUBSWrs GPR32:$dst, WZR, GPR32:$src, arith_shift:$shift)>;
-def : InstAlias<"negs $dst, $src, $shift",
-                (SUBSXrs GPR64:$dst, XZR, GPR64:$src, arith_shift:$shift)>;
-
-// Unsigned/Signed divide
-defm UDIV : Div<0, "udiv", udiv>;
-defm SDIV : Div<1, "sdiv", sdiv>;
-let isCodeGenOnly = 1 in {
-defm UDIV_Int : Div<0, "udiv", int_arm64_udiv>;
-defm SDIV_Int : Div<1, "sdiv", int_arm64_sdiv>;
-}
-
-// Variable shift
-defm ASRV : Shift<0b10, "asrv", sra>;
-defm LSLV : Shift<0b00, "lslv", shl>;
-defm LSRV : Shift<0b01, "lsrv", srl>;
-defm RORV : Shift<0b11, "rorv", rotr>;
-
-def : ShiftAlias<"asr", ASRVWr, GPR32>;
-def : ShiftAlias<"asr", ASRVXr, GPR64>;
-def : ShiftAlias<"lsl", LSLVWr, GPR32>;
-def : ShiftAlias<"lsl", LSLVXr, GPR64>;
-def : ShiftAlias<"lsr", LSRVWr, GPR32>;
-def : ShiftAlias<"lsr", LSRVXr, GPR64>;
-def : ShiftAlias<"ror", RORVWr, GPR32>;
-def : ShiftAlias<"ror", RORVXr, GPR64>;
-
-// Multiply-add
-let AddedComplexity = 7 in {
-defm MADD : MulAccum<0, "madd", add>;
-defm MSUB : MulAccum<1, "msub", sub>;
-
-def : Pat<(i32 (mul GPR32:$Rn, GPR32:$Rm)),
-          (MADDWrrr GPR32:$Rn, GPR32:$Rm, WZR)>;
-def : Pat<(i64 (mul GPR64:$Rn, GPR64:$Rm)),
-          (MADDXrrr GPR64:$Rn, GPR64:$Rm, XZR)>;
-
-def : Pat<(i32 (ineg (mul GPR32:$Rn, GPR32:$Rm))),
-          (MSUBWrrr GPR32:$Rn, GPR32:$Rm, WZR)>;
-def : Pat<(i64 (ineg (mul GPR64:$Rn, GPR64:$Rm))),
-          (MSUBXrrr GPR64:$Rn, GPR64:$Rm, XZR)>;
-} // AddedComplexity = 7
-
-let AddedComplexity = 5 in {
-def SMADDLrrr : WideMulAccum<0, 0b001, "smaddl", add, sext>;
-def SMSUBLrrr : WideMulAccum<1, 0b001, "smsubl", sub, sext>;
-def UMADDLrrr : WideMulAccum<0, 0b101, "umaddl", add, zext>;
-def UMSUBLrrr : WideMulAccum<1, 0b101, "umsubl", sub, zext>;
-
-def : Pat<(i64 (mul (sext GPR32:$Rn), (sext GPR32:$Rm))),
-          (SMADDLrrr GPR32:$Rn, GPR32:$Rm, XZR)>;
-def : Pat<(i64 (mul (zext GPR32:$Rn), (zext GPR32:$Rm))),
-          (UMADDLrrr GPR32:$Rn, GPR32:$Rm, XZR)>;
-
-def : Pat<(i64 (ineg (mul (sext GPR32:$Rn), (sext GPR32:$Rm)))),
-          (SMSUBLrrr GPR32:$Rn, GPR32:$Rm, XZR)>;
-def : Pat<(i64 (ineg (mul (zext GPR32:$Rn), (zext GPR32:$Rm)))),
-          (UMSUBLrrr GPR32:$Rn, GPR32:$Rm, XZR)>;
-} // AddedComplexity = 5
-
-def : MulAccumWAlias<"mul", MADDWrrr>;
-def : MulAccumXAlias<"mul", MADDXrrr>;
-def : MulAccumWAlias<"mneg", MSUBWrrr>;
-def : MulAccumXAlias<"mneg", MSUBXrrr>;
-def : WideMulAccumAlias<"smull", SMADDLrrr>;
-def : WideMulAccumAlias<"smnegl", SMSUBLrrr>;
-def : WideMulAccumAlias<"umull", UMADDLrrr>;
-def : WideMulAccumAlias<"umnegl", UMSUBLrrr>;
-
-// Multiply-high
-def SMULHrr : MulHi<0b010, "smulh", mulhs>;
-def UMULHrr : MulHi<0b110, "umulh", mulhu>;
-
-// CRC32
-def CRC32Brr : BaseCRC32<0, 0b00, 0, GPR32, int_arm64_crc32b, "crc32b">;
-def CRC32Hrr : BaseCRC32<0, 0b01, 0, GPR32, int_arm64_crc32h, "crc32h">;
-def CRC32Wrr : BaseCRC32<0, 0b10, 0, GPR32, int_arm64_crc32w, "crc32w">;
-def CRC32Xrr : BaseCRC32<1, 0b11, 0, GPR64, int_arm64_crc32x, "crc32x">;
-
-def CRC32CBrr : BaseCRC32<0, 0b00, 1, GPR32, int_arm64_crc32cb, "crc32cb">;
-def CRC32CHrr : BaseCRC32<0, 0b01, 1, GPR32, int_arm64_crc32ch, "crc32ch">;
-def CRC32CWrr : BaseCRC32<0, 0b10, 1, GPR32, int_arm64_crc32cw, "crc32cw">;
-def CRC32CXrr : BaseCRC32<1, 0b11, 1, GPR64, int_arm64_crc32cx, "crc32cx">;
-
-
-//===----------------------------------------------------------------------===//
-// Logical instructions.
-//===----------------------------------------------------------------------===//
-
-// (immediate)
-defm ANDS : LogicalImmS<0b11, "ands", ARM64and_flag>;
-defm AND  : LogicalImm<0b00, "and", and>;
-defm EOR  : LogicalImm<0b10, "eor", xor>;
-defm ORR  : LogicalImm<0b01, "orr", or>;
-
-def : InstAlias<"mov $dst, $imm", (ORRWri GPR32sp:$dst, WZR,
-                                          logical_imm32:$imm)>;
-def : InstAlias<"mov $dst, $imm", (ORRXri GPR64sp:$dst, XZR,
-                                          logical_imm64:$imm)>;
-
-
-// (register)
-defm ANDS : LogicalRegS<0b11, 0, "ands">;
-defm BICS : LogicalRegS<0b11, 1, "bics">;
-defm AND  : LogicalReg<0b00, 0, "and", and>;
-defm BIC  : LogicalReg<0b00, 1, "bic",
-                       BinOpFrag<(and node:$LHS, (not node:$RHS))>>;
-defm EON  : LogicalReg<0b10, 1, "eon",
-                       BinOpFrag<(xor node:$LHS, (not node:$RHS))>>;
-defm EOR  : LogicalReg<0b10, 0, "eor", xor>;
-defm ORN  : LogicalReg<0b01, 1, "orn",
-                       BinOpFrag<(or node:$LHS, (not node:$RHS))>>;
-defm ORR  : LogicalReg<0b01, 0, "orr", or>;
-
-def : InstAlias<"mov $dst, $src", (ORRWrs GPR32:$dst, WZR, GPR32:$src, 0)>;
-def : InstAlias<"mov $dst, $src",
-                (ADDWri GPR32sp:$dst, GPR32sp:$src, 0, 0)>;
-def : InstAlias<"mov $dst, $src", (ORRXrs GPR64:$dst, XZR, GPR64:$src, 0)>;
-def : InstAlias<"mov $dst, $src",
-                (ADDXri GPR64sp:$dst, GPR64sp:$src, 0, 0)>;
-
-def : InstAlias<"tst $src1, $src2",
-                (ANDSWri WZR, GPR32:$src1, logical_imm32:$src2)>;
-def : InstAlias<"tst $src1, $src2",
-                (ANDSXri XZR, GPR64:$src1, logical_imm64:$src2)>;
-
-def : InstAlias<"tst $src1, $src2",
-                (ANDSWrs WZR, GPR32:$src1, GPR32:$src2, 0)>;
-def : InstAlias<"tst $src1, $src2",
-                (ANDSXrs XZR, GPR64:$src1, GPR64:$src2, 0)>;
-
-def : InstAlias<"tst $src1, $src2, $sh",
-                (ANDSWrs WZR, GPR32:$src1, GPR32:$src2, logical_shift:$sh)>;
-def : InstAlias<"tst $src1, $src2, $sh",
-                (ANDSXrs XZR, GPR64:$src1, GPR64:$src2, logical_shift:$sh)>;
-
-def : InstAlias<"mvn $Wd, $Wm",
-                (ORNWrs GPR32:$Wd, WZR, GPR32:$Wm, 0)>;
-def : InstAlias<"mvn $Xd, $Xm",
-                (ORNXrs GPR64:$Xd, XZR, GPR64:$Xm, 0)>;
-
-def : Pat<(not GPR32:$Wm), (ORNWrr WZR, GPR32:$Wm)>;
-def : Pat<(not GPR64:$Xm), (ORNXrr XZR, GPR64:$Xm)>;
-
-
-//===----------------------------------------------------------------------===//
-// One operand data processing instructions.
-//===----------------------------------------------------------------------===//
-
-defm CLS    : OneOperandData<0b101, "cls">;
-defm CLZ    : OneOperandData<0b100, "clz", ctlz>;
-defm RBIT   : OneOperandData<0b000, "rbit">;
-def  REV16Wr : OneWRegData<0b001, "rev16",
-                                  UnOpFrag<(rotr (bswap node:$LHS), (i64 16))>>;
-def  REV16Xr : OneXRegData<0b001, "rev16",
-                                  UnOpFrag<(rotr (bswap node:$LHS), (i64 16))>>;
-
-def : Pat<(cttz GPR32:$Rn),
-          (CLZWr (RBITWr GPR32:$Rn))>;
-def : Pat<(cttz GPR64:$Rn),
-          (CLZXr (RBITXr GPR64:$Rn))>;
-
-// Unlike the other one operand instructions, the instructions with the "rev"
-// mnemonic do *not* just different in the size bit, but actually use different
-// opcode bits for the different sizes.
-def REVWr   : OneWRegData<0b010, "rev", bswap>;
-def REVXr   : OneXRegData<0b011, "rev", bswap>;
-def REV32Xr : OneXRegData<0b010, "rev32",
-                                 UnOpFrag<(rotr (bswap node:$LHS), (i64 32))>>;
-
-//===----------------------------------------------------------------------===//
-// Bitfield immediate extraction instruction.
-//===----------------------------------------------------------------------===//
-let neverHasSideEffects = 1 in
-defm EXTR : ExtractImm<"extr">;
-def : InstAlias<"ror $dst, $src, $shift",
-            (EXTRWrri GPR32:$dst, GPR32:$src, GPR32:$src, imm0_31:$shift)>;
-def : InstAlias<"ror $dst, $src, $shift",
-            (EXTRXrri GPR64:$dst, GPR64:$src, GPR64:$src, imm0_63:$shift)>;
-
-def : Pat<(rotr GPR32:$Rn, (i64 imm0_31:$imm)),
-          (EXTRWrri GPR32:$Rn, GPR32:$Rn, imm0_31:$imm)>;
-def : Pat<(rotr GPR64:$Rn, (i64 imm0_63:$imm)),
-          (EXTRXrri GPR64:$Rn, GPR64:$Rn, imm0_63:$imm)>;
-
-//===----------------------------------------------------------------------===//
-// Other bitfield immediate instructions.
-//===----------------------------------------------------------------------===//
-let neverHasSideEffects = 1 in {
-defm BFM  : BitfieldImmWith2RegArgs<0b01, "bfm">;
-defm SBFM : BitfieldImm<0b00, "sbfm">;
-defm UBFM : BitfieldImm<0b10, "ubfm">;
-}
-
-def i32shift_a : Operand<i64>, SDNodeXForm<imm, [{
-  uint64_t enc = (32 - N->getZExtValue()) & 0x1f;
-  return CurDAG->getTargetConstant(enc, MVT::i64);
-}]>;
-
-def i32shift_b : Operand<i64>, SDNodeXForm<imm, [{
-  uint64_t enc = 31 - N->getZExtValue();
-  return CurDAG->getTargetConstant(enc, MVT::i64);
-}]>;
-
-// min(7, 31 - shift_amt)
-def i32shift_sext_i8 : Operand<i64>, SDNodeXForm<imm, [{
-  uint64_t enc = 31 - N->getZExtValue();
-  enc = enc > 7 ? 7 : enc;
-  return CurDAG->getTargetConstant(enc, MVT::i64);
-}]>;
-
-// min(15, 31 - shift_amt)
-def i32shift_sext_i16 : Operand<i64>, SDNodeXForm<imm, [{
-  uint64_t enc = 31 - N->getZExtValue();
-  enc = enc > 15 ? 15 : enc;
-  return CurDAG->getTargetConstant(enc, MVT::i64);
-}]>;
-
-def i64shift_a : Operand<i64>, SDNodeXForm<imm, [{
-  uint64_t enc = (64 - N->getZExtValue()) & 0x3f;
-  return CurDAG->getTargetConstant(enc, MVT::i64);
-}]>;
-
-def i64shift_b : Operand<i64>, SDNodeXForm<imm, [{
-  uint64_t enc = 63 - N->getZExtValue();
-  return CurDAG->getTargetConstant(enc, MVT::i64);
-}]>;
-
-// min(7, 63 - shift_amt)
-def i64shift_sext_i8 : Operand<i64>, SDNodeXForm<imm, [{
-  uint64_t enc = 63 - N->getZExtValue();
-  enc = enc > 7 ? 7 : enc;
-  return CurDAG->getTargetConstant(enc, MVT::i64);
-}]>;
-
-// min(15, 63 - shift_amt)
-def i64shift_sext_i16 : Operand<i64>, SDNodeXForm<imm, [{
-  uint64_t enc = 63 - N->getZExtValue();
-  enc = enc > 15 ? 15 : enc;
-  return CurDAG->getTargetConstant(enc, MVT::i64);
-}]>;
-
-// min(31, 63 - shift_amt)
-def i64shift_sext_i32 : Operand<i64>, SDNodeXForm<imm, [{
-  uint64_t enc = 63 - N->getZExtValue();
-  enc = enc > 31 ? 31 : enc;
-  return CurDAG->getTargetConstant(enc, MVT::i64);
-}]>;
-
-def : Pat<(shl GPR32:$Rn, (i64 imm0_31:$imm)),
-          (UBFMWri GPR32:$Rn, (i64 (i32shift_a imm0_31:$imm)),
-                              (i64 (i32shift_b imm0_31:$imm)))>;
-def : Pat<(shl GPR64:$Rn, (i64 imm0_63:$imm)),
-          (UBFMXri GPR64:$Rn, (i64 (i64shift_a imm0_63:$imm)),
-                              (i64 (i64shift_b imm0_63:$imm)))>;
-
-let AddedComplexity = 10 in {
-def : Pat<(sra GPR32:$Rn, (i64 imm0_31:$imm)),
-          (SBFMWri GPR32:$Rn, imm0_31:$imm, 31)>;
-def : Pat<(sra GPR64:$Rn, (i64 imm0_63:$imm)),
-          (SBFMXri GPR64:$Rn, imm0_63:$imm, 63)>;
-}
-
-def : InstAlias<"asr $dst, $src, $shift",
-                (SBFMWri GPR32:$dst, GPR32:$src, imm0_31:$shift, 31)>;
-def : InstAlias<"asr $dst, $src, $shift",
-                (SBFMXri GPR64:$dst, GPR64:$src, imm0_63:$shift, 63)>;
-def : InstAlias<"sxtb $dst, $src", (SBFMWri GPR32:$dst, GPR32:$src, 0, 7)>;
-def : InstAlias<"sxtb $dst, $src", (SBFMXri GPR64:$dst, GPR64:$src, 0, 7)>;
-def : InstAlias<"sxth $dst, $src", (SBFMWri GPR32:$dst, GPR32:$src, 0, 15)>;
-def : InstAlias<"sxth $dst, $src", (SBFMXri GPR64:$dst, GPR64:$src, 0, 15)>;
-def : InstAlias<"sxtw $dst, $src", (SBFMXri GPR64:$dst, GPR64:$src, 0, 31)>;
-
-def : Pat<(srl GPR32:$Rn, (i64 imm0_31:$imm)),
-          (UBFMWri GPR32:$Rn, imm0_31:$imm, 31)>;
-def : Pat<(srl GPR64:$Rn, (i64 imm0_63:$imm)),
-          (UBFMXri GPR64:$Rn, imm0_63:$imm, 63)>;
-
-def : InstAlias<"lsr $dst, $src, $shift",
-                (UBFMWri GPR32:$dst, GPR32:$src, imm0_31:$shift, 31)>;
-def : InstAlias<"lsr $dst, $src, $shift",
-                (UBFMXri GPR64:$dst, GPR64:$src, imm0_63:$shift, 63)>;
-def : InstAlias<"uxtb $dst, $src", (UBFMWri GPR32:$dst, GPR32:$src, 0, 7)>;
-def : InstAlias<"uxtb $dst, $src", (UBFMXri GPR64:$dst, GPR64:$src, 0, 7)>;
-def : InstAlias<"uxth $dst, $src", (UBFMWri GPR32:$dst, GPR32:$src, 0, 15)>;
-def : InstAlias<"uxth $dst, $src", (UBFMXri GPR64:$dst, GPR64:$src, 0, 15)>;
-def : InstAlias<"uxtw $dst, $src", (UBFMXri GPR64:$dst, GPR64:$src, 0, 31)>;
-
-//===----------------------------------------------------------------------===//
-// Conditionally set flags instructions.
-//===----------------------------------------------------------------------===//
-defm CCMN : CondSetFlagsImm<0, "ccmn">;
-defm CCMP : CondSetFlagsImm<1, "ccmp">;
-
-defm CCMN : CondSetFlagsReg<0, "ccmn">;
-defm CCMP : CondSetFlagsReg<1, "ccmp">;
-
-//===----------------------------------------------------------------------===//
-// Conditional select instructions.
-//===----------------------------------------------------------------------===//
-defm CSEL  : CondSelect<0, 0b00, "csel">;
-
-def inc : PatFrag<(ops node:$in), (add node:$in, 1)>;
-defm CSINC : CondSelectOp<0, 0b01, "csinc", inc>;
-defm CSINV : CondSelectOp<1, 0b00, "csinv", not>;
-defm CSNEG : CondSelectOp<1, 0b01, "csneg", ineg>;
-
-def : Pat<(ARM64csinv GPR32:$tval, GPR32:$fval, (i32 imm:$cc), CPSR),
-          (CSINVWr GPR32:$tval, GPR32:$fval, (i32 imm:$cc))>;
-def : Pat<(ARM64csinv GPR64:$tval, GPR64:$fval, (i32 imm:$cc), CPSR),
-          (CSINVXr GPR64:$tval, GPR64:$fval, (i32 imm:$cc))>;
-def : Pat<(ARM64csneg GPR32:$tval, GPR32:$fval, (i32 imm:$cc), CPSR),
-          (CSNEGWr GPR32:$tval, GPR32:$fval, (i32 imm:$cc))>;
-def : Pat<(ARM64csneg GPR64:$tval, GPR64:$fval, (i32 imm:$cc), CPSR),
-          (CSNEGXr GPR64:$tval, GPR64:$fval, (i32 imm:$cc))>;
-def : Pat<(ARM64csinc GPR32:$tval, GPR32:$fval, (i32 imm:$cc), CPSR),
-          (CSINCWr GPR32:$tval, GPR32:$fval, (i32 imm:$cc))>;
-def : Pat<(ARM64csinc GPR64:$tval, GPR64:$fval, (i32 imm:$cc), CPSR),
-          (CSINCXr GPR64:$tval, GPR64:$fval, (i32 imm:$cc))>;
-
-def : Pat<(ARM64csel (i32 0), (i32 1), (i32 imm:$cc), CPSR),
-          (CSINCWr WZR, WZR, (i32 imm:$cc))>;
-def : Pat<(ARM64csel (i64 0), (i64 1), (i32 imm:$cc), CPSR),
-          (CSINCXr XZR, XZR, (i32 imm:$cc))>;
-def : Pat<(ARM64csel (i32 0), (i32 -1), (i32 imm:$cc), CPSR),
-          (CSINVWr WZR, WZR, (i32 imm:$cc))>;
-def : Pat<(ARM64csel (i64 0), (i64 -1), (i32 imm:$cc), CPSR),
-          (CSINVXr XZR, XZR, (i32 imm:$cc))>;
-
-// The inverse of the condition code from the alias instruction is what is used
-// in the aliased instruction. The parser all ready inverts the condition code
-// for these aliases.
-// FIXME: Is this the correct way to handle these aliases?
-def : InstAlias<"cset $dst, $cc", (CSINCWr GPR32:$dst, WZR, WZR, ccode:$cc)>;
-def : InstAlias<"cset $dst, $cc", (CSINCXr GPR64:$dst, XZR, XZR, ccode:$cc)>;
-
-def : InstAlias<"csetm $dst, $cc", (CSINVWr GPR32:$dst, WZR, WZR, ccode:$cc)>;
-def : InstAlias<"csetm $dst, $cc", (CSINVXr GPR64:$dst, XZR, XZR, ccode:$cc)>;
-
-def : InstAlias<"cinc $dst, $src, $cc",
-                (CSINCWr GPR32:$dst, GPR32:$src, GPR32:$src, ccode:$cc)>;
-def : InstAlias<"cinc $dst, $src, $cc",
-                (CSINCXr GPR64:$dst, GPR64:$src, GPR64:$src, ccode:$cc)>;
-
-def : InstAlias<"cinv $dst, $src, $cc",
-                (CSINVWr GPR32:$dst, GPR32:$src, GPR32:$src, ccode:$cc)>;
-def : InstAlias<"cinv $dst, $src, $cc",
-                (CSINVXr GPR64:$dst, GPR64:$src, GPR64:$src, ccode:$cc)>;
-
-def : InstAlias<"cneg $dst, $src, $cc",
-                (CSNEGWr GPR32:$dst, GPR32:$src, GPR32:$src, ccode:$cc)>;
-def : InstAlias<"cneg $dst, $src, $cc",
-                (CSNEGXr GPR64:$dst, GPR64:$src, GPR64:$src, ccode:$cc)>;
-
-//===----------------------------------------------------------------------===//
-// PC-relative instructions.
-//===----------------------------------------------------------------------===//
-let isReMaterializable = 1 in {
-let neverHasSideEffects = 1, mayStore = 0, mayLoad = 0 in {
-def ADR  : ADRI<0, "adr", adrlabel, []>;
-} // neverHasSideEffects = 1
-
-def ADRP : ADRI<1, "adrp", adrplabel,
-                [(set GPR64:$Xd, (ARM64adrp tglobaladdr:$label))]>;
-} // isReMaterializable = 1
-
-// page address of a constant pool entry, block address
-def : Pat<(ARM64adrp tconstpool:$cp), (ADRP tconstpool:$cp)>;
-def : Pat<(ARM64adrp tblockaddress:$cp), (ADRP tblockaddress:$cp)>;
-
-//===----------------------------------------------------------------------===//
-// Unconditional branch (register) instructions.
-//===----------------------------------------------------------------------===//
-
-let isReturn = 1, isTerminator = 1, isBarrier = 1 in {
-def RET  : BranchReg<0b0010, "ret", []>;
-def DRPS : SpecialReturn<0b0101, "drps">;
-def ERET : SpecialReturn<0b0100, "eret">;
-} // isReturn = 1, isTerminator = 1, isBarrier = 1
-
-// Default to the LR register.
-def : InstAlias<"ret", (RET LR)>;
-
-let isCall = 1, Defs = [LR], Uses = [SP] in {
-def BLR : BranchReg<0b0001, "blr", [(ARM64call GPR64:$Rn)]>;
-} // isCall
-
-let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
-def BR  : BranchReg<0b0000, "br", [(brind GPR64:$Rn)]>;
-} // isBranch, isTerminator, isBarrier, isIndirectBranch
-
-// Create a separate pseudo-instruction for codegen to use so that we don't
-// flag lr as used in every function. It'll be restored before the RET by the
-// epilogue if it's legitimately used.
-def RET_ReallyLR : Pseudo<(outs), (ins), [(ARM64retflag)]> {
-  let isTerminator = 1;
-  let isBarrier = 1;
-  let isReturn = 1;
-}
-
-// This is a directive-like pseudo-instruction. The purpose is to insert an
-// R_AARCH64_TLSDESC_CALL relocation at the offset of the following instruction
-// (which in the usual case is a BLR).
-let hasSideEffects = 1 in
-def TLSDESCCALL : Pseudo<(outs), (ins i64imm:$sym), []> {
-  let AsmString = ".tlsdesccall $sym";
-}
-
-// Pseudo-instruction representing a BLR with attached TLSDESC relocation. It
-// gets expanded to two MCInsts during lowering.
-let isCall = 1, Defs = [LR] in
-def TLSDESC_BLR
-    : Pseudo<(outs), (ins GPR64:$dest, i64imm:$sym),
-             [(ARM64tlsdesc_call GPR64:$dest, tglobaltlsaddr:$sym)]>;
-
-def : Pat<(ARM64tlsdesc_call GPR64:$dest, texternalsym:$sym),
-          (TLSDESC_BLR GPR64:$dest, texternalsym:$sym)>;
-//===----------------------------------------------------------------------===//
-// Conditional branch (immediate) instruction.
-//===----------------------------------------------------------------------===//
-def Bcc : BranchCond;
-
-//===----------------------------------------------------------------------===//
-// Compare-and-branch instructions.
-//===----------------------------------------------------------------------===//
-defm CBZ  : CmpBranch<0, "cbz", ARM64cbz>;
-defm CBNZ : CmpBranch<1, "cbnz", ARM64cbnz>;
-
-//===----------------------------------------------------------------------===//
-// Test-bit-and-branch instructions.
-//===----------------------------------------------------------------------===//
-def TBZ  : TestBranch<0, "tbz", ARM64tbz>;
-def TBNZ : TestBranch<1, "tbnz", ARM64tbnz>;
-
-//===----------------------------------------------------------------------===//
-// Unconditional branch (immediate) instructions.
-//===----------------------------------------------------------------------===//
-let isBranch = 1, isTerminator = 1, isBarrier = 1 in {
-def B  : BranchImm<0, "b", [(br bb:$addr)]>;
-} // isBranch, isTerminator, isBarrier
-
-let isCall = 1, Defs = [LR], Uses = [SP] in {
-def BL : CallImm<1, "bl", [(ARM64call tglobaladdr:$addr)]>;
-} // isCall
-def : Pat<(ARM64call texternalsym:$func), (BL texternalsym:$func)>;
-
-//===----------------------------------------------------------------------===//
-// Exception generation instructions.
-//===----------------------------------------------------------------------===//
-def BRK   : ExceptionGeneration<0b001, 0b00, "brk">;
-def DCPS1 : ExceptionGeneration<0b101, 0b01, "dcps1">;
-def DCPS2 : ExceptionGeneration<0b101, 0b10, "dcps2">;
-def DCPS3 : ExceptionGeneration<0b101, 0b11, "dcps3">;
-def HLT   : ExceptionGeneration<0b010, 0b00, "hlt">;
-def HVC   : ExceptionGeneration<0b000, 0b10, "hvc">;
-def SMC   : ExceptionGeneration<0b000, 0b11, "smc">;
-def SVC   : ExceptionGeneration<0b000, 0b01, "svc">;
-
-// DCPSn defaults to an immediate operand of zero if unspecified.
-def : InstAlias<"dcps1", (DCPS1 0)>;
-def : InstAlias<"dcps2", (DCPS2 0)>;
-def : InstAlias<"dcps3", (DCPS3 0)>;
-
-//===----------------------------------------------------------------------===//
-// Load instructions.
-//===----------------------------------------------------------------------===//
-
-// Pair (indexed, offset)
-def LDPWi : LoadPairOffset<0b00, 0, GPR32, am_indexed32simm7, "ldp">;
-def LDPXi : LoadPairOffset<0b10, 0, GPR64, am_indexed64simm7, "ldp">;
-def LDPSi : LoadPairOffset<0b00, 1, FPR32, am_indexed32simm7, "ldp">;
-def LDPDi : LoadPairOffset<0b01, 1, FPR64, am_indexed64simm7, "ldp">;
-def LDPQi : LoadPairOffset<0b10, 1, FPR128, am_indexed128simm7, "ldp">;
-
-def LDPSWi : LoadPairOffset<0b01, 0, GPR64, am_indexed32simm7, "ldpsw">;
-
-// Pair (pre-indexed)
-def LDPWpre : LoadPairPreIdx<0b00, 0, GPR32, am_indexed32simm7, "ldp">;
-def LDPXpre : LoadPairPreIdx<0b10, 0, GPR64, am_indexed64simm7, "ldp">;
-def LDPSpre : LoadPairPreIdx<0b00, 1, FPR32, am_indexed32simm7, "ldp">;
-def LDPDpre : LoadPairPreIdx<0b01, 1, FPR64, am_indexed64simm7, "ldp">;
-def LDPQpre : LoadPairPreIdx<0b10, 1, FPR128, am_indexed128simm7, "ldp">;
-
-def LDPSWpre : LoadPairPreIdx<0b01, 0, GPR64, am_indexed32simm7, "ldpsw">;
-
-// Pair (post-indexed)
-def LDPWpost : LoadPairPostIdx<0b00, 0, GPR32, simm7s4, "ldp">;
-def LDPXpost : LoadPairPostIdx<0b10, 0, GPR64, simm7s8, "ldp">;
-def LDPSpost : LoadPairPostIdx<0b00, 1, FPR32, simm7s4, "ldp">;
-def LDPDpost : LoadPairPostIdx<0b01, 1, FPR64, simm7s8, "ldp">;
-def LDPQpost : LoadPairPostIdx<0b10, 1, FPR128, simm7s16, "ldp">;
-
-def LDPSWpost : LoadPairPostIdx<0b01, 0, GPR64, simm7s4, "ldpsw">;
-
-
-// Pair (no allocate)
-def LDNPWi : LoadPairNoAlloc<0b00, 0, GPR32, am_indexed32simm7, "ldnp">;
-def LDNPXi : LoadPairNoAlloc<0b10, 0, GPR64, am_indexed64simm7, "ldnp">;
-def LDNPSi : LoadPairNoAlloc<0b00, 1, FPR32, am_indexed32simm7, "ldnp">;
-def LDNPDi : LoadPairNoAlloc<0b01, 1, FPR64, am_indexed64simm7, "ldnp">;
-def LDNPQi : LoadPairNoAlloc<0b10, 1, FPR128, am_indexed128simm7, "ldnp">;
-
-//---
-// (register offset)
-//---
-
-let AddedComplexity = 10 in {
-// Integer
-def LDRBBro : Load8RO<0b00,  0, 0b01, GPR32, "ldrb",
-                      [(set GPR32:$Rt, (zextloadi8 ro_indexed8:$addr))]>;
-def LDRHHro : Load16RO<0b01, 0, 0b01, GPR32, "ldrh",
-                      [(set GPR32:$Rt, (zextloadi16 ro_indexed16:$addr))]>;
-def LDRWro  : Load32RO<0b10,   0, 0b01, GPR32, "ldr",
-                      [(set GPR32:$Rt, (load ro_indexed32:$addr))]>;
-def LDRXro  : Load64RO<0b11,   0, 0b01, GPR64, "ldr",
-                      [(set GPR64:$Rt, (load ro_indexed64:$addr))]>;
-
-// Floating-point
-def LDRBro : Load8RO<0b00,   1, 0b01, FPR8,   "ldr",
-                      [(set FPR8:$Rt, (load ro_indexed8:$addr))]>;
-def LDRHro : Load16RO<0b01,  1, 0b01, FPR16,  "ldr",
-                      [(set FPR16:$Rt, (load ro_indexed16:$addr))]>;
-def LDRSro : Load32RO<0b10,    1, 0b01, FPR32,  "ldr",
-                      [(set (f32 FPR32:$Rt), (load ro_indexed32:$addr))]>;
-def LDRDro : Load64RO<0b11,    1, 0b01, FPR64,  "ldr",
-                      [(set (f64 FPR64:$Rt), (load ro_indexed64:$addr))]>;
-def LDRQro : Load128RO<0b00,    1, 0b11, FPR128, "ldr", []> {
-  let mayLoad = 1;
-}
-
-// For regular load, we do not have any alignment requirement.
-// Thus, it is safe to directly map the vector loads with interesting
-// addressing modes.
-// FIXME: We could do the same for bitconvert to floating point vectors.
-def : Pat <(v8i8 (scalar_to_vector (i32 (extloadi8 ro_indexed8:$addr)))),
-           (INSERT_SUBREG (v8i8 (IMPLICIT_DEF)),
-                          (LDRBro ro_indexed8:$addr), bsub)>;
-def : Pat <(v16i8 (scalar_to_vector (i32 (extloadi8 ro_indexed8:$addr)))),
-           (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
-                          (LDRBro ro_indexed8:$addr), bsub)>;
-def : Pat <(v4i16 (scalar_to_vector (i32 (extloadi16 ro_indexed16:$addr)))),
-           (INSERT_SUBREG (v4i16 (IMPLICIT_DEF)),
-                          (LDRHro ro_indexed16:$addr), hsub)>;
-def : Pat <(v8i16 (scalar_to_vector (i32 (extloadi16 ro_indexed16:$addr)))),
-           (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)),
-                          (LDRHro ro_indexed16:$addr), hsub)>;
-def : Pat <(v2i32 (scalar_to_vector (i32 (load ro_indexed32:$addr)))),
-           (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)),
-                          (LDRSro ro_indexed32:$addr), ssub)>;
-def : Pat <(v4i32 (scalar_to_vector (i32 (load ro_indexed32:$addr)))),
-           (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)),
-                          (LDRSro ro_indexed32:$addr), ssub)>;
-def : Pat <(v1i64 (scalar_to_vector (i64 (load ro_indexed64:$addr)))),
-           (LDRDro ro_indexed64:$addr)>;
-def : Pat <(v2i64 (scalar_to_vector (i64 (load ro_indexed64:$addr)))),
-           (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)),
-                          (LDRDro ro_indexed64:$addr), dsub)>;
-
-// Match all load 64 bits width whose type is compatible with FPR64
-def : Pat<(v2f32 (load ro_indexed64:$addr)), (LDRDro ro_indexed64:$addr)>;
-def : Pat<(v1f64 (load ro_indexed64:$addr)), (LDRDro ro_indexed64:$addr)>;
-def : Pat<(v8i8 (load ro_indexed64:$addr)), (LDRDro ro_indexed64:$addr)>;
-def : Pat<(v4i16 (load ro_indexed64:$addr)), (LDRDro ro_indexed64:$addr)>;
-def : Pat<(v2i32 (load ro_indexed64:$addr)), (LDRDro ro_indexed64:$addr)>;
-def : Pat<(v1i64 (load ro_indexed64:$addr)), (LDRDro ro_indexed64:$addr)>;
-
-// Match all load 128 bits width whose type is compatible with FPR128
-def : Pat<(v4f32 (load ro_indexed128:$addr)), (LDRQro ro_indexed128:$addr)>;
-def : Pat<(v2f64 (load ro_indexed128:$addr)), (LDRQro ro_indexed128:$addr)>;
-def : Pat<(v16i8 (load ro_indexed128:$addr)), (LDRQro ro_indexed128:$addr)>;
-def : Pat<(v8i16 (load ro_indexed128:$addr)), (LDRQro ro_indexed128:$addr)>;
-def : Pat<(v4i32 (load ro_indexed128:$addr)), (LDRQro ro_indexed128:$addr)>;
-def : Pat<(v2i64 (load ro_indexed128:$addr)), (LDRQro ro_indexed128:$addr)>;
-def : Pat<(f128  (load ro_indexed128:$addr)), (LDRQro ro_indexed128:$addr)>;
-
-// Load sign-extended half-word
-def LDRSHWro : Load16RO<0b01, 0, 0b11, GPR32, "ldrsh",
-                      [(set GPR32:$Rt, (sextloadi16 ro_indexed16:$addr))]>;
-def LDRSHXro : Load16RO<0b01, 0, 0b10, GPR64, "ldrsh",
-                      [(set GPR64:$Rt, (sextloadi16 ro_indexed16:$addr))]>;
-
-// Load sign-extended byte
-def LDRSBWro : Load8RO<0b00, 0, 0b11, GPR32, "ldrsb",
-                      [(set GPR32:$Rt, (sextloadi8 ro_indexed8:$addr))]>;
-def LDRSBXro : Load8RO<0b00, 0, 0b10, GPR64, "ldrsb",
-                      [(set GPR64:$Rt, (sextloadi8 ro_indexed8:$addr))]>;
-
-// Load sign-extended word
-def LDRSWro  : Load32RO<0b10, 0, 0b10, GPR64, "ldrsw",
-                      [(set GPR64:$Rt, (sextloadi32 ro_indexed32:$addr))]>;
-
-// Pre-fetch.
-def PRFMro : PrefetchRO<0b11, 0, 0b10, "prfm",
-                        [(ARM64Prefetch imm:$Rt, ro_indexed64:$addr)]>;
-
-// zextload -> i64
-def : Pat<(i64 (zextloadi8 ro_indexed8:$addr)),
-    (SUBREG_TO_REG (i64 0), (LDRBBro ro_indexed8:$addr), sub_32)>;
-def : Pat<(i64 (zextloadi16 ro_indexed16:$addr)),
-    (SUBREG_TO_REG (i64 0), (LDRHHro ro_indexed16:$addr), sub_32)>;
-
-// zextloadi1 -> zextloadi8
-def : Pat<(i32 (zextloadi1 ro_indexed8:$addr)), (LDRBBro ro_indexed8:$addr)>;
-def : Pat<(i64 (zextloadi1 ro_indexed8:$addr)),
-    (SUBREG_TO_REG (i64 0), (LDRBBro ro_indexed8:$addr), sub_32)>;
-
-// extload -> zextload
-def : Pat<(i32 (extloadi16 ro_indexed16:$addr)), (LDRHHro ro_indexed16:$addr)>;
-def : Pat<(i32 (extloadi8 ro_indexed8:$addr)), (LDRBBro ro_indexed8:$addr)>;
-def : Pat<(i32 (extloadi1 ro_indexed8:$addr)), (LDRBBro ro_indexed8:$addr)>;
-def : Pat<(i64 (extloadi32 ro_indexed32:$addr)),
-    (SUBREG_TO_REG (i64 0), (LDRWro ro_indexed32:$addr), sub_32)>;
-def : Pat<(i64 (extloadi16 ro_indexed16:$addr)),
-    (SUBREG_TO_REG (i64 0), (LDRHHro ro_indexed16:$addr), sub_32)>;
-def : Pat<(i64 (extloadi8 ro_indexed8:$addr)),
-    (SUBREG_TO_REG (i64 0), (LDRBBro ro_indexed8:$addr), sub_32)>;
-def : Pat<(i64 (extloadi1 ro_indexed8:$addr)),
-    (SUBREG_TO_REG (i64 0), (LDRBBro ro_indexed8:$addr), sub_32)>;
-
-} // AddedComplexity = 10
-
-//---
-// (unsigned immediate)
-//---
-def LDRXui : LoadUI<0b11, 0, 0b01, GPR64, am_indexed64, "ldr",
-                    [(set GPR64:$Rt, (load am_indexed64:$addr))]>;
-def LDRWui : LoadUI<0b10, 0, 0b01, GPR32, am_indexed32, "ldr",
-                    [(set GPR32:$Rt, (load am_indexed32:$addr))]>;
-def LDRBui : LoadUI<0b00, 1, 0b01, FPR8, am_indexed8, "ldr",
-                    [(set FPR8:$Rt, (load am_indexed8:$addr))]>;
-def LDRHui : LoadUI<0b01, 1, 0b01, FPR16, am_indexed16, "ldr",
-                    [(set FPR16:$Rt, (load am_indexed16:$addr))]>;
-def LDRSui : LoadUI<0b10, 1, 0b01, FPR32, am_indexed32, "ldr",
-                    [(set (f32 FPR32:$Rt), (load am_indexed32:$addr))]>;
-def LDRDui : LoadUI<0b11, 1, 0b01, FPR64, am_indexed64, "ldr",
-                    [(set (f64 FPR64:$Rt), (load am_indexed64:$addr))]>;
-def LDRQui : LoadUI<0b00, 1, 0b11, FPR128, am_indexed128, "ldr",
-                    [(set (f128 FPR128:$Rt), (load am_indexed128:$addr))]>;
-
-// For regular load, we do not have any alignment requirement.
-// Thus, it is safe to directly map the vector loads with interesting
-// addressing modes.
-// FIXME: We could do the same for bitconvert to floating point vectors.
-def : Pat <(v8i8 (scalar_to_vector (i32 (extloadi8 am_indexed8:$addr)))),
-           (INSERT_SUBREG (v8i8 (IMPLICIT_DEF)),
-                          (LDRBui am_indexed8:$addr), bsub)>;
-def : Pat <(v16i8 (scalar_to_vector (i32 (extloadi8 am_indexed8:$addr)))),
-           (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
-                          (LDRBui am_indexed8:$addr), bsub)>;
-def : Pat <(v4i16 (scalar_to_vector (i32 (extloadi16 am_indexed16:$addr)))),
-           (INSERT_SUBREG (v4i16 (IMPLICIT_DEF)),
-                          (LDRHui am_indexed16:$addr), hsub)>;
-def : Pat <(v8i16 (scalar_to_vector (i32 (extloadi16 am_indexed16:$addr)))),
-           (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)),
-                          (LDRHui am_indexed16:$addr), hsub)>;
-def : Pat <(v2i32 (scalar_to_vector (i32 (load am_indexed32:$addr)))),
-           (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)),
-                          (LDRSui am_indexed32:$addr), ssub)>;
-def : Pat <(v4i32 (scalar_to_vector (i32 (load am_indexed32:$addr)))),
-           (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)),
-                          (LDRSui am_indexed32:$addr), ssub)>;
-def : Pat <(v1i64 (scalar_to_vector (i64 (load am_indexed64:$addr)))),
-           (LDRDui am_indexed64:$addr)>;
-def : Pat <(v2i64 (scalar_to_vector (i64 (load am_indexed64:$addr)))),
-           (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)),
-                          (LDRDui am_indexed64:$addr), dsub)>;
-
-// Match all load 64 bits width whose type is compatible with FPR64
-def : Pat<(v2f32 (load am_indexed64:$addr)), (LDRDui am_indexed64:$addr)>;
-def : Pat<(v1f64 (load am_indexed64:$addr)), (LDRDui am_indexed64:$addr)>;
-def : Pat<(v8i8 (load am_indexed64:$addr)), (LDRDui am_indexed64:$addr)>;
-def : Pat<(v4i16 (load am_indexed64:$addr)), (LDRDui am_indexed64:$addr)>;
-def : Pat<(v2i32 (load am_indexed64:$addr)), (LDRDui am_indexed64:$addr)>;
-def : Pat<(v1i64 (load am_indexed64:$addr)), (LDRDui am_indexed64:$addr)>;
-
-// Match all load 128 bits width whose type is compatible with FPR128
-def : Pat<(v4f32 (load am_indexed128:$addr)), (LDRQui am_indexed128:$addr)>;
-def : Pat<(v2f64 (load am_indexed128:$addr)), (LDRQui am_indexed128:$addr)>;
-def : Pat<(v16i8 (load am_indexed128:$addr)), (LDRQui am_indexed128:$addr)>;
-def : Pat<(v8i16 (load am_indexed128:$addr)), (LDRQui am_indexed128:$addr)>;
-def : Pat<(v4i32 (load am_indexed128:$addr)), (LDRQui am_indexed128:$addr)>;
-def : Pat<(v2i64 (load am_indexed128:$addr)), (LDRQui am_indexed128:$addr)>;
-def : Pat<(f128  (load am_indexed128:$addr)), (LDRQui am_indexed128:$addr)>;
-
-def LDRHHui : LoadUI<0b01, 0, 0b01, GPR32, am_indexed16, "ldrh",
-                     [(set GPR32:$Rt, (zextloadi16 am_indexed16:$addr))]>;
-def LDRBBui : LoadUI<0b00, 0, 0b01, GPR32, am_indexed8, "ldrb",
-                     [(set GPR32:$Rt, (zextloadi8 am_indexed8:$addr))]>;
-// zextload -> i64
-def : Pat<(i64 (zextloadi8 am_indexed8:$addr)),
-    (SUBREG_TO_REG (i64 0), (LDRBBui am_indexed8:$addr), sub_32)>;
-def : Pat<(i64 (zextloadi16 am_indexed16:$addr)),
-    (SUBREG_TO_REG (i64 0), (LDRHHui am_indexed16:$addr), sub_32)>;
-
-// zextloadi1 -> zextloadi8
-def : Pat<(i32 (zextloadi1 am_indexed8:$addr)), (LDRBBui am_indexed8:$addr)>;
-def : Pat<(i64 (zextloadi1 am_indexed8:$addr)),
-    (SUBREG_TO_REG (i64 0), (LDRBBui am_indexed8:$addr), sub_32)>;
-
-// extload -> zextload
-def : Pat<(i32 (extloadi16 am_indexed16:$addr)), (LDRHHui am_indexed16:$addr)>;
-def : Pat<(i32 (extloadi8 am_indexed8:$addr)), (LDRBBui am_indexed8:$addr)>;
-def : Pat<(i32 (extloadi1 am_indexed8:$addr)), (LDRBBui am_indexed8:$addr)>;
-def : Pat<(i64 (extloadi32 am_indexed32:$addr)),
-    (SUBREG_TO_REG (i64 0), (LDRWui am_indexed32:$addr), sub_32)>;
-def : Pat<(i64 (extloadi16 am_indexed16:$addr)),
-    (SUBREG_TO_REG (i64 0), (LDRHHui am_indexed16:$addr), sub_32)>;
-def : Pat<(i64 (extloadi8 am_indexed8:$addr)),
-    (SUBREG_TO_REG (i64 0), (LDRBBui am_indexed8:$addr), sub_32)>;
-def : Pat<(i64 (extloadi1 am_indexed8:$addr)),
-    (SUBREG_TO_REG (i64 0), (LDRBBui am_indexed8:$addr), sub_32)>;
-
-// load sign-extended half-word
-def LDRSHWui : LoadUI<0b01, 0, 0b11, GPR32, am_indexed16, "ldrsh",
-                      [(set GPR32:$Rt, (sextloadi16 am_indexed16:$addr))]>;
-def LDRSHXui : LoadUI<0b01, 0, 0b10, GPR64, am_indexed16, "ldrsh",
-                      [(set GPR64:$Rt, (sextloadi16 am_indexed16:$addr))]>;
-
-// load sign-extended byte
-def LDRSBWui : LoadUI<0b00, 0, 0b11, GPR32, am_indexed8, "ldrsb",
-                      [(set GPR32:$Rt, (sextloadi8 am_indexed8:$addr))]>;
-def LDRSBXui : LoadUI<0b00, 0, 0b10, GPR64, am_indexed8, "ldrsb",
-                      [(set GPR64:$Rt, (sextloadi8 am_indexed8:$addr))]>;
-
-// load sign-extended word
-def LDRSWui  : LoadUI<0b10, 0, 0b10, GPR64, am_indexed32, "ldrsw",
-                      [(set GPR64:$Rt, (sextloadi32 am_indexed32:$addr))]>;
-
-// load zero-extended word
-def : Pat<(i64 (zextloadi32 am_indexed32:$addr)),
- (SUBREG_TO_REG (i64 0), (LDRWui am_indexed32:$addr), sub_32)>;
-
-// Pre-fetch.
-def PRFMui : PrefetchUI<0b11, 0, 0b10, "prfm",
-                        [(ARM64Prefetch imm:$Rt, am_indexed64:$addr)]>;
-
-//---
-// (literal)
-def LDRWl : LoadLiteral<0b00, 0, GPR32, "ldr">;
-def LDRXl : LoadLiteral<0b01, 0, GPR64, "ldr">;
-def LDRSl : LoadLiteral<0b00, 1, FPR32, "ldr">;
-def LDRDl : LoadLiteral<0b01, 1, FPR64, "ldr">;
-def LDRQl : LoadLiteral<0b10, 1, FPR128, "ldr">;
-
-// load sign-extended word
-def LDRSWl : LoadLiteral<0b10, 0, GPR64, "ldrsw">;
-
-// prefetch
-def PRFMl : PrefetchLiteral<0b11, 0, "prfm", []>;
-//                   [(ARM64Prefetch imm:$Rt, tglobaladdr:$label)]>;
-
-//---
-// (unscaled immediate)
-def LDURXi : LoadUnscaled<0b11, 0, 0b01, GPR64, am_unscaled64, "ldur",
-                          [(set GPR64:$Rt, (load am_unscaled64:$addr))]>;
-def LDURWi : LoadUnscaled<0b10, 0, 0b01, GPR32, am_unscaled32, "ldur",
-                          [(set GPR32:$Rt, (load am_unscaled32:$addr))]>;
-def LDURBi : LoadUnscaled<0b00, 1, 0b01, FPR8,  am_unscaled8, "ldur",
-                          [(set FPR8:$Rt, (load am_unscaled8:$addr))]>;
-def LDURHi : LoadUnscaled<0b01, 1, 0b01, FPR16, am_unscaled16, "ldur",
-                          [(set FPR16:$Rt, (load am_unscaled16:$addr))]>;
-def LDURSi : LoadUnscaled<0b10, 1, 0b01, FPR32, am_unscaled32, "ldur",
-                          [(set (f32 FPR32:$Rt), (load am_unscaled32:$addr))]>;
-def LDURDi : LoadUnscaled<0b11, 1, 0b01, FPR64, am_unscaled64, "ldur",
-                          [(set (f64 FPR64:$Rt), (load am_unscaled64:$addr))]>;
-def LDURQi : LoadUnscaled<0b00, 1, 0b11, FPR128, am_unscaled128, "ldur",
-                        [(set (v2f64 FPR128:$Rt), (load am_unscaled128:$addr))]>;
-
-def LDURHHi
-    : LoadUnscaled<0b01, 0, 0b01, GPR32, am_unscaled16, "ldurh",
-                   [(set GPR32:$Rt, (zextloadi16 am_unscaled16:$addr))]>;
-def LDURBBi
-    : LoadUnscaled<0b00, 0, 0b01, GPR32, am_unscaled8, "ldurb",
-                   [(set GPR32:$Rt, (zextloadi8 am_unscaled8:$addr))]>;
-
-// Match all load 64 bits width whose type is compatible with FPR64
-def : Pat<(v2f32 (load am_unscaled64:$addr)), (LDURDi am_unscaled64:$addr)>;
-def : Pat<(v1f64 (load am_unscaled64:$addr)), (LDURDi am_unscaled64:$addr)>;
-def : Pat<(v8i8 (load am_unscaled64:$addr)), (LDURDi am_unscaled64:$addr)>;
-def : Pat<(v4i16 (load am_unscaled64:$addr)), (LDURDi am_unscaled64:$addr)>;
-def : Pat<(v2i32 (load am_unscaled64:$addr)), (LDURDi am_unscaled64:$addr)>;
-def : Pat<(v1i64 (load am_unscaled64:$addr)), (LDURDi am_unscaled64:$addr)>;
-
-// Match all load 128 bits width whose type is compatible with FPR128
-def : Pat<(v4f32 (load am_unscaled128:$addr)), (LDURQi am_unscaled128:$addr)>;
-def : Pat<(v2f64 (load am_unscaled128:$addr)), (LDURQi am_unscaled128:$addr)>;
-def : Pat<(v16i8 (load am_unscaled128:$addr)), (LDURQi am_unscaled128:$addr)>;
-def : Pat<(v8i16 (load am_unscaled128:$addr)), (LDURQi am_unscaled128:$addr)>;
-def : Pat<(v4i32 (load am_unscaled128:$addr)), (LDURQi am_unscaled128:$addr)>;
-def : Pat<(v2i64 (load am_unscaled128:$addr)), (LDURQi am_unscaled128:$addr)>;
-def : Pat<(f128  (load am_unscaled128:$addr)), (LDURQi am_unscaled128:$addr)>;
-
-//  anyext -> zext
-def : Pat<(i32 (extloadi16 am_unscaled16:$addr)), (LDURHHi am_unscaled16:$addr)>;
-def : Pat<(i32 (extloadi8 am_unscaled8:$addr)), (LDURBBi am_unscaled8:$addr)>;
-def : Pat<(i32 (extloadi1 am_unscaled8:$addr)), (LDURBBi am_unscaled8:$addr)>;
-def : Pat<(i64 (extloadi32 am_unscaled32:$addr)),
-    (SUBREG_TO_REG (i64 0), (LDURWi am_unscaled32:$addr), sub_32)>;
-def : Pat<(i64 (extloadi16 am_unscaled16:$addr)),
-    (SUBREG_TO_REG (i64 0), (LDURHHi am_unscaled16:$addr), sub_32)>;
-def : Pat<(i64 (extloadi8 am_unscaled8:$addr)),
-    (SUBREG_TO_REG (i64 0), (LDURBBi am_unscaled8:$addr), sub_32)>;
-def : Pat<(i64 (extloadi1 am_unscaled8:$addr)),
-    (SUBREG_TO_REG (i64 0), (LDURBBi am_unscaled8:$addr), sub_32)>;
-// unscaled zext
-def : Pat<(i32 (zextloadi16 am_unscaled16:$addr)),
-    (LDURHHi am_unscaled16:$addr)>;
-def : Pat<(i32 (zextloadi8 am_unscaled8:$addr)),
-    (LDURBBi am_unscaled8:$addr)>;
-def : Pat<(i32 (zextloadi1 am_unscaled8:$addr)),
-    (LDURBBi am_unscaled8:$addr)>;
-def : Pat<(i64 (zextloadi32 am_unscaled32:$addr)),
-    (SUBREG_TO_REG (i64 0), (LDURWi am_unscaled32:$addr), sub_32)>;
-def : Pat<(i64 (zextloadi16 am_unscaled16:$addr)),
-    (SUBREG_TO_REG (i64 0), (LDURHHi am_unscaled16:$addr), sub_32)>;
-def : Pat<(i64 (zextloadi8 am_unscaled8:$addr)),
-    (SUBREG_TO_REG (i64 0), (LDURBBi am_unscaled8:$addr), sub_32)>;
-def : Pat<(i64 (zextloadi1 am_unscaled8:$addr)),
-    (SUBREG_TO_REG (i64 0), (LDURBBi am_unscaled8:$addr), sub_32)>;
-
-
-//---
-// LDR mnemonics fall back to LDUR for negative or unaligned offsets.
-
-// Define new assembler match classes as we want to only match these when
-// the don't otherwise match the scaled addressing mode for LDR/STR. Don't
-// associate a DiagnosticType either, as we want the diagnostic for the
-// canonical form (the scaled operand) to take precedence.
-def MemoryUnscaledFB8Operand : AsmOperandClass {
-  let Name = "MemoryUnscaledFB8";
-  let RenderMethod = "addMemoryUnscaledOperands";
-}
-def MemoryUnscaledFB16Operand : AsmOperandClass {
-  let Name = "MemoryUnscaledFB16";
-  let RenderMethod = "addMemoryUnscaledOperands";
-}
-def MemoryUnscaledFB32Operand : AsmOperandClass {
-  let Name = "MemoryUnscaledFB32";
-  let RenderMethod = "addMemoryUnscaledOperands";
-}
-def MemoryUnscaledFB64Operand : AsmOperandClass {
-  let Name = "MemoryUnscaledFB64";
-  let RenderMethod = "addMemoryUnscaledOperands";
-}
-def MemoryUnscaledFB128Operand : AsmOperandClass {
-  let Name = "MemoryUnscaledFB128";
-  let RenderMethod = "addMemoryUnscaledOperands";
-}
-def am_unscaled_fb8 : Operand<i64> {
-  let ParserMatchClass = MemoryUnscaledFB8Operand;
-  let MIOperandInfo = (ops GPR64sp:$base, i64imm:$offset);
-}
-def am_unscaled_fb16 : Operand<i64> {
-  let ParserMatchClass = MemoryUnscaledFB16Operand;
-  let MIOperandInfo = (ops GPR64sp:$base, i64imm:$offset);
-}
-def am_unscaled_fb32 : Operand<i64> {
-  let ParserMatchClass = MemoryUnscaledFB32Operand;
-  let MIOperandInfo = (ops GPR64sp:$base, i64imm:$offset);
-}
-def am_unscaled_fb64 : Operand<i64> {
-  let ParserMatchClass = MemoryUnscaledFB64Operand;
-  let MIOperandInfo = (ops GPR64sp:$base, i64imm:$offset);
-}
-def am_unscaled_fb128 : Operand<i64> {
-  let ParserMatchClass = MemoryUnscaledFB128Operand;
-  let MIOperandInfo = (ops GPR64sp:$base, i64imm:$offset);
-}
-def : InstAlias<"ldr $Rt, $addr", (LDURXi GPR64:$Rt, am_unscaled_fb64:$addr)>;
-def : InstAlias<"ldr $Rt, $addr", (LDURWi GPR32:$Rt, am_unscaled_fb32:$addr)>;
-def : InstAlias<"ldr $Rt, $addr", (LDURBi FPR8:$Rt, am_unscaled_fb8:$addr)>;
-def : InstAlias<"ldr $Rt, $addr", (LDURHi FPR16:$Rt, am_unscaled_fb16:$addr)>;
-def : InstAlias<"ldr $Rt, $addr", (LDURSi FPR32:$Rt, am_unscaled_fb32:$addr)>;
-def : InstAlias<"ldr $Rt, $addr", (LDURDi FPR64:$Rt, am_unscaled_fb64:$addr)>;
-def : InstAlias<"ldr $Rt, $addr", (LDURQi FPR128:$Rt, am_unscaled_fb128:$addr)>;
-
-// zextload -> i64
-def : Pat<(i64 (zextloadi8 am_unscaled8:$addr)),
-  (SUBREG_TO_REG (i64 0), (LDURBBi am_unscaled8:$addr), sub_32)>;
-def : Pat<(i64 (zextloadi16 am_unscaled16:$addr)),
-  (SUBREG_TO_REG (i64 0), (LDURHHi am_unscaled16:$addr), sub_32)>;
-
-// load sign-extended half-word
-def LDURSHWi
-    : LoadUnscaled<0b01, 0, 0b11, GPR32, am_unscaled16, "ldursh",
-                   [(set GPR32:$Rt, (sextloadi16 am_unscaled16:$addr))]>;
-def LDURSHXi
-    : LoadUnscaled<0b01, 0, 0b10, GPR64, am_unscaled16, "ldursh",
-                   [(set GPR64:$Rt, (sextloadi16 am_unscaled16:$addr))]>;
-
-// load sign-extended byte
-def LDURSBWi
-    : LoadUnscaled<0b00, 0, 0b11, GPR32, am_unscaled8, "ldursb",
-                   [(set GPR32:$Rt, (sextloadi8 am_unscaled8:$addr))]>;
-def LDURSBXi
-    : LoadUnscaled<0b00, 0, 0b10, GPR64, am_unscaled8, "ldursb",
-                   [(set GPR64:$Rt, (sextloadi8 am_unscaled8:$addr))]>;
-
-// load sign-extended word
-def LDURSWi
-    : LoadUnscaled<0b10, 0, 0b10, GPR64, am_unscaled32, "ldursw",
-                   [(set GPR64:$Rt, (sextloadi32 am_unscaled32:$addr))]>;
-
-// zero and sign extending aliases from generic LDR* mnemonics to LDUR*.
-def : InstAlias<"ldrb $Rt, $addr", (LDURBBi GPR32:$Rt, am_unscaled_fb8:$addr)>;
-def : InstAlias<"ldrh $Rt, $addr", (LDURHHi GPR32:$Rt, am_unscaled_fb16:$addr)>;
-def : InstAlias<"ldrsb $Rt, $addr", (LDURSBWi GPR32:$Rt, am_unscaled_fb8:$addr)>;
-def : InstAlias<"ldrsb $Rt, $addr", (LDURSBXi GPR64:$Rt, am_unscaled_fb8:$addr)>;
-def : InstAlias<"ldrsh $Rt, $addr", (LDURSHWi GPR32:$Rt, am_unscaled_fb16:$addr)>;
-def : InstAlias<"ldrsh $Rt, $addr", (LDURSHXi GPR64:$Rt, am_unscaled_fb16:$addr)>;
-def : InstAlias<"ldrsw $Rt, $addr", (LDURSWi GPR64:$Rt, am_unscaled_fb32:$addr)>;
-
-// Pre-fetch.
-def PRFUMi : PrefetchUnscaled<0b11, 0, 0b10, "prfum",
-                               [(ARM64Prefetch imm:$Rt, am_unscaled64:$addr)]>;
-
-//---
-// (unscaled immediate, unprivileged)
-def LDTRXi : LoadUnprivileged<0b11, 0, 0b01, GPR64, "ldtr">;
-def LDTRWi : LoadUnprivileged<0b10, 0, 0b01, GPR32, "ldtr">;
-
-def LDTRHi : LoadUnprivileged<0b01, 0, 0b01, GPR32, "ldtrh">;
-def LDTRBi : LoadUnprivileged<0b00, 0, 0b01, GPR32, "ldtrb">;
-
-// load sign-extended half-word
-def LDTRSHWi : LoadUnprivileged<0b01, 0, 0b11, GPR32, "ldtrsh">;
-def LDTRSHXi : LoadUnprivileged<0b01, 0, 0b10, GPR64, "ldtrsh">;
-
-// load sign-extended byte
-def LDTRSBWi : LoadUnprivileged<0b00, 0, 0b11, GPR32, "ldtrsb">;
-def LDTRSBXi : LoadUnprivileged<0b00, 0, 0b10, GPR64, "ldtrsb">;
-
-// load sign-extended word
-def LDTRSWi  : LoadUnprivileged<0b10, 0, 0b10, GPR64, "ldtrsw">;
-
-//---
-// (immediate pre-indexed)
-def LDRWpre : LoadPreIdx<0b10, 0, 0b01, GPR32, "ldr">;
-def LDRXpre : LoadPreIdx<0b11, 0, 0b01, GPR64, "ldr">;
-def LDRBpre : LoadPreIdx<0b00, 1, 0b01, FPR8,  "ldr">;
-def LDRHpre : LoadPreIdx<0b01, 1, 0b01, FPR16, "ldr">;
-def LDRSpre : LoadPreIdx<0b10, 1, 0b01, FPR32, "ldr">;
-def LDRDpre : LoadPreIdx<0b11, 1, 0b01, FPR64, "ldr">;
-def LDRQpre : LoadPreIdx<0b00, 1, 0b11, FPR128, "ldr">;
-
-// load sign-extended half-word
-def LDRSHWpre : LoadPreIdx<0b01, 0, 0b11, GPR32, "ldrsh">;
-def LDRSHXpre : LoadPreIdx<0b01, 0, 0b10, GPR64, "ldrsh">;
-
-// load sign-extended byte
-def LDRSBWpre : LoadPreIdx<0b00, 0, 0b11, GPR32, "ldrsb">;
-def LDRSBXpre : LoadPreIdx<0b00, 0, 0b10, GPR64, "ldrsb">;
-
-// load zero-extended byte
-def LDRBBpre : LoadPreIdx<0b00, 0, 0b01, GPR32, "ldrb">;
-def LDRHHpre : LoadPreIdx<0b01, 0, 0b01, GPR32, "ldrh">;
-
-// load sign-extended word
-def LDRSWpre : LoadPreIdx<0b10, 0, 0b10, GPR64, "ldrsw">;
-
-// ISel pseudos and patterns. See expanded comment on LoadPreIdxPseudo.
-def LDRDpre_isel  : LoadPreIdxPseudo<FPR64>;
-def LDRSpre_isel  : LoadPreIdxPseudo<FPR32>;
-def LDRXpre_isel  : LoadPreIdxPseudo<GPR64>;
-def LDRWpre_isel  : LoadPreIdxPseudo<GPR32>;
-def LDRHHpre_isel : LoadPreIdxPseudo<GPR32>;
-def LDRBBpre_isel : LoadPreIdxPseudo<GPR32>;
-
-def LDRSWpre_isel : LoadPreIdxPseudo<GPR64>;
-def LDRSHWpre_isel : LoadPreIdxPseudo<GPR32>;
-def LDRSHXpre_isel : LoadPreIdxPseudo<GPR64>;
-def LDRSBWpre_isel : LoadPreIdxPseudo<GPR32>;
-def LDRSBXpre_isel : LoadPreIdxPseudo<GPR64>;
-
-//---
-// (immediate post-indexed)
-def LDRWpost : LoadPostIdx<0b10, 0, 0b01, GPR32, "ldr">;
-def LDRXpost : LoadPostIdx<0b11, 0, 0b01, GPR64, "ldr">;
-def LDRBpost : LoadPostIdx<0b00, 1, 0b01, FPR8,  "ldr">;
-def LDRHpost : LoadPostIdx<0b01, 1, 0b01, FPR16, "ldr">;
-def LDRSpost : LoadPostIdx<0b10, 1, 0b01, FPR32, "ldr">;
-def LDRDpost : LoadPostIdx<0b11, 1, 0b01, FPR64, "ldr">;
-def LDRQpost : LoadPostIdx<0b00, 1, 0b11, FPR128, "ldr">;
-
-// load sign-extended half-word
-def LDRSHWpost : LoadPostIdx<0b01, 0, 0b11, GPR32, "ldrsh">;
-def LDRSHXpost : LoadPostIdx<0b01, 0, 0b10, GPR64, "ldrsh">;
-
-// load sign-extended byte
-def LDRSBWpost : LoadPostIdx<0b00, 0, 0b11, GPR32, "ldrsb">;
-def LDRSBXpost : LoadPostIdx<0b00, 0, 0b10, GPR64, "ldrsb">;
-
-// load zero-extended byte
-def LDRBBpost : LoadPostIdx<0b00, 0, 0b01, GPR32, "ldrb">;
-def LDRHHpost : LoadPostIdx<0b01, 0, 0b01, GPR32, "ldrh">;
-
-// load sign-extended word
-def LDRSWpost : LoadPostIdx<0b10, 0, 0b10, GPR64, "ldrsw">;
-
-// ISel pseudos and patterns. See expanded comment on LoadPostIdxPseudo.
-def LDRDpost_isel  : LoadPostIdxPseudo<FPR64>;
-def LDRSpost_isel  : LoadPostIdxPseudo<FPR32>;
-def LDRXpost_isel  : LoadPostIdxPseudo<GPR64>;
-def LDRWpost_isel  : LoadPostIdxPseudo<GPR32>;
-def LDRHHpost_isel : LoadPostIdxPseudo<GPR32>;
-def LDRBBpost_isel : LoadPostIdxPseudo<GPR32>;
-
-def LDRSWpost_isel : LoadPostIdxPseudo<GPR64>;
-def LDRSHWpost_isel : LoadPostIdxPseudo<GPR32>;
-def LDRSHXpost_isel : LoadPostIdxPseudo<GPR64>;
-def LDRSBWpost_isel : LoadPostIdxPseudo<GPR32>;
-def LDRSBXpost_isel : LoadPostIdxPseudo<GPR64>;
-
-//===----------------------------------------------------------------------===//
-// Store instructions.
-//===----------------------------------------------------------------------===//
-
-// Pair (indexed, offset)
-// FIXME: Use dedicated range-checked addressing mode operand here.
-def STPWi : StorePairOffset<0b00, 0, GPR32, am_indexed32simm7, "stp">;
-def STPXi : StorePairOffset<0b10, 0, GPR64, am_indexed64simm7, "stp">;
-def STPSi : StorePairOffset<0b00, 1, FPR32, am_indexed32simm7, "stp">;
-def STPDi : StorePairOffset<0b01, 1, FPR64, am_indexed64simm7, "stp">;
-def STPQi : StorePairOffset<0b10, 1, FPR128, am_indexed128simm7, "stp">;
-
-// Pair (pre-indexed)
-def STPWpre : StorePairPreIdx<0b00, 0, GPR32, am_indexed32simm7, "stp">;
-def STPXpre : StorePairPreIdx<0b10, 0, GPR64, am_indexed64simm7, "stp">;
-def STPSpre : StorePairPreIdx<0b00, 1, FPR32, am_indexed32simm7, "stp">;
-def STPDpre : StorePairPreIdx<0b01, 1, FPR64, am_indexed64simm7, "stp">;
-def STPQpre : StorePairPreIdx<0b10, 1, FPR128, am_indexed128simm7, "stp">;
-
-// Pair (pre-indexed)
-def STPWpost : StorePairPostIdx<0b00, 0, GPR32, simm7s4, "stp">;
-def STPXpost : StorePairPostIdx<0b10, 0, GPR64, simm7s8, "stp">;
-def STPSpost : StorePairPostIdx<0b00, 1, FPR32, simm7s4, "stp">;
-def STPDpost : StorePairPostIdx<0b01, 1, FPR64, simm7s8, "stp">;
-def STPQpost : StorePairPostIdx<0b10, 1, FPR128, simm7s16, "stp">;
-
-// Pair (no allocate)
-def STNPWi : StorePairNoAlloc<0b00, 0, GPR32, am_indexed32simm7, "stnp">;
-def STNPXi : StorePairNoAlloc<0b10, 0, GPR64, am_indexed64simm7, "stnp">;
-def STNPSi : StorePairNoAlloc<0b00, 1, FPR32, am_indexed32simm7, "stnp">;
-def STNPDi : StorePairNoAlloc<0b01, 1, FPR64, am_indexed64simm7, "stnp">;
-def STNPQi : StorePairNoAlloc<0b10, 1, FPR128, am_indexed128simm7, "stnp">;
-
-//---
-// (Register offset)
-
-let AddedComplexity = 10 in {
-
-// Integer
-def STRHHro : Store16RO<0b01, 0, 0b00, GPR32, "strh",
-                            [(truncstorei16 GPR32:$Rt, ro_indexed16:$addr)]>;
-def STRBBro : Store8RO<0b00,  0, 0b00, GPR32, "strb",
-                            [(truncstorei8 GPR32:$Rt, ro_indexed8:$addr)]>;
-def STRWro  : Store32RO<0b10,   0, 0b00, GPR32, "str",
-                            [(store GPR32:$Rt, ro_indexed32:$addr)]>;
-def STRXro  : Store64RO<0b11,   0, 0b00, GPR64, "str",
-                            [(store GPR64:$Rt, ro_indexed64:$addr)]>;
-
-// truncstore i64
-def : Pat<(truncstorei8 GPR64:$Rt, ro_indexed8:$addr),
-           (STRBBro (EXTRACT_SUBREG GPR64:$Rt, sub_32), ro_indexed8:$addr)>;
-def : Pat<(truncstorei16 GPR64:$Rt, ro_indexed16:$addr),
-           (STRHHro (EXTRACT_SUBREG GPR64:$Rt, sub_32), ro_indexed16:$addr)>;
-def : Pat<(truncstorei32 GPR64:$Rt, ro_indexed32:$addr),
-           (STRWro (EXTRACT_SUBREG GPR64:$Rt, sub_32), ro_indexed32:$addr)>;
-
-
-// Floating-point
-def STRBro : Store8RO<0b00,  1, 0b00, FPR8,  "str",
-                            [(store FPR8:$Rt, ro_indexed8:$addr)]>;
-def STRHro : Store16RO<0b01, 1, 0b00, FPR16, "str",
-                            [(store FPR16:$Rt, ro_indexed16:$addr)]>;
-def STRSro : Store32RO<0b10,   1, 0b00, FPR32, "str",
-                            [(store (f32 FPR32:$Rt), ro_indexed32:$addr)]>;
-def STRDro : Store64RO<0b11,   1, 0b00, FPR64, "str",
-                            [(store (f64 FPR64:$Rt), ro_indexed64:$addr)]>;
-def STRQro : Store128RO<0b00,   1, 0b10, FPR128, "str", []> {
-  let mayStore = 1;
-}
-
-// Match all store 64 bits width whose type is compatible with FPR64
-def : Pat<(store (v2f32 FPR64:$Rn), ro_indexed64:$addr),
-          (STRDro FPR64:$Rn, ro_indexed64:$addr)>;
-def : Pat<(store (v1f64 FPR64:$Rn), ro_indexed64:$addr),
-          (STRDro FPR64:$Rn, ro_indexed64:$addr)>;
-def : Pat<(store (v8i8 FPR64:$Rn), ro_indexed64:$addr),
-          (STRDro FPR64:$Rn, ro_indexed64:$addr)>;
-def : Pat<(store (v4i16 FPR64:$Rn), ro_indexed64:$addr),
-          (STRDro FPR64:$Rn, ro_indexed64:$addr)>;
-def : Pat<(store (v2i32 FPR64:$Rn), ro_indexed64:$addr),
-          (STRDro FPR64:$Rn, ro_indexed64:$addr)>;
-def : Pat<(store (v1i64 FPR64:$Rn), ro_indexed64:$addr),
-          (STRDro FPR64:$Rn, ro_indexed64:$addr)>;
-
-// Match all store 128 bits width whose type is compatible with FPR128
-def : Pat<(store (v4f32 FPR128:$Rn), ro_indexed128:$addr),
-          (STRQro FPR128:$Rn, ro_indexed128:$addr)>;
-def : Pat<(store (v2f64 FPR128:$Rn), ro_indexed128:$addr),
-          (STRQro FPR128:$Rn, ro_indexed128:$addr)>;
-def : Pat<(store (v16i8 FPR128:$Rn), ro_indexed128:$addr),
-          (STRQro FPR128:$Rn, ro_indexed128:$addr)>;
-def : Pat<(store (v8i16 FPR128:$Rn), ro_indexed128:$addr),
-          (STRQro FPR128:$Rn, ro_indexed128:$addr)>;
-def : Pat<(store (v4i32 FPR128:$Rn), ro_indexed128:$addr),
-          (STRQro FPR128:$Rn, ro_indexed128:$addr)>;
-def : Pat<(store (v2i64 FPR128:$Rn), ro_indexed128:$addr),
-          (STRQro FPR128:$Rn, ro_indexed128:$addr)>;
-def : Pat<(store (f128 FPR128:$Rn),  ro_indexed128:$addr),
-          (STRQro FPR128:$Rn, ro_indexed128:$addr)>;
-
-//---
-// (unsigned immediate)
-def STRXui : StoreUI<0b11, 0, 0b00, GPR64, am_indexed64, "str",
-                     [(store GPR64:$Rt, am_indexed64:$addr)]>;
-def STRWui : StoreUI<0b10, 0, 0b00, GPR32, am_indexed32, "str",
-                     [(store GPR32:$Rt, am_indexed32:$addr)]>;
-def STRBui : StoreUI<0b00, 1, 0b00, FPR8, am_indexed8, "str",
-                     [(store FPR8:$Rt, am_indexed8:$addr)]>;
-def STRHui : StoreUI<0b01, 1, 0b00, FPR16, am_indexed16, "str",
-                     [(store FPR16:$Rt, am_indexed16:$addr)]>;
-def STRSui : StoreUI<0b10, 1, 0b00, FPR32, am_indexed32, "str",
-                     [(store (f32 FPR32:$Rt), am_indexed32:$addr)]>;
-def STRDui : StoreUI<0b11, 1, 0b00, FPR64, am_indexed64, "str",
-                     [(store (f64 FPR64:$Rt), am_indexed64:$addr)]>;
-def STRQui : StoreUI<0b00, 1, 0b10, FPR128, am_indexed128, "str", []> {
-  let mayStore = 1;
-}
-
-// Match all store 64 bits width whose type is compatible with FPR64
-def : Pat<(store (v2f32 FPR64:$Rn), am_indexed64:$addr),
-          (STRDui FPR64:$Rn, am_indexed64:$addr)>;
-def : Pat<(store (v1f64 FPR64:$Rn), am_indexed64:$addr),
-          (STRDui FPR64:$Rn, am_indexed64:$addr)>;
-def : Pat<(store (v8i8 FPR64:$Rn), am_indexed64:$addr),
-          (STRDui FPR64:$Rn, am_indexed64:$addr)>;
-def : Pat<(store (v4i16 FPR64:$Rn), am_indexed64:$addr),
-          (STRDui FPR64:$Rn, am_indexed64:$addr)>;
-def : Pat<(store (v2i32 FPR64:$Rn), am_indexed64:$addr),
-          (STRDui FPR64:$Rn, am_indexed64:$addr)>;
-def : Pat<(store (v1i64 FPR64:$Rn), am_indexed64:$addr),
-          (STRDui FPR64:$Rn, am_indexed64:$addr)>;
-
-// Match all store 128 bits width whose type is compatible with FPR128
-def : Pat<(store (v4f32 FPR128:$Rn), am_indexed128:$addr),
-          (STRQui FPR128:$Rn, am_indexed128:$addr)>;
-def : Pat<(store (v2f64 FPR128:$Rn), am_indexed128:$addr),
-          (STRQui FPR128:$Rn, am_indexed128:$addr)>;
-def : Pat<(store (v16i8 FPR128:$Rn), am_indexed128:$addr),
-          (STRQui FPR128:$Rn, am_indexed128:$addr)>;
-def : Pat<(store (v8i16 FPR128:$Rn), am_indexed128:$addr),
-          (STRQui FPR128:$Rn, am_indexed128:$addr)>;
-def : Pat<(store (v4i32 FPR128:$Rn), am_indexed128:$addr),
-          (STRQui FPR128:$Rn, am_indexed128:$addr)>;
-def : Pat<(store (v2i64 FPR128:$Rn), am_indexed128:$addr),
-          (STRQui FPR128:$Rn, am_indexed128:$addr)>;
-def : Pat<(store (f128  FPR128:$Rn), am_indexed128:$addr),
-          (STRQui FPR128:$Rn, am_indexed128:$addr)>;
-
-def STRHHui : StoreUI<0b01, 0, 0b00, GPR32, am_indexed16, "strh",
-                      [(truncstorei16 GPR32:$Rt, am_indexed16:$addr)]>;
-def STRBBui : StoreUI<0b00, 0, 0b00, GPR32, am_indexed8,  "strb",
-                      [(truncstorei8 GPR32:$Rt, am_indexed8:$addr)]>;
-
-// truncstore i64
-def : Pat<(truncstorei32 GPR64:$Rt, am_indexed32:$addr),
-  (STRWui (EXTRACT_SUBREG GPR64:$Rt, sub_32), am_indexed32:$addr)>;
-def : Pat<(truncstorei16 GPR64:$Rt, am_indexed16:$addr),
-  (STRHHui (EXTRACT_SUBREG GPR64:$Rt, sub_32), am_indexed16:$addr)>;
-def : Pat<(truncstorei8 GPR64:$Rt, am_indexed8:$addr),
-  (STRBBui (EXTRACT_SUBREG GPR64:$Rt, sub_32), am_indexed8:$addr)>;
-
-} // AddedComplexity = 10
-
-//---
-// (unscaled immediate)
-def STURXi : StoreUnscaled<0b11, 0, 0b00, GPR64, am_unscaled64, "stur",
-                           [(store GPR64:$Rt, am_unscaled64:$addr)]>;
-def STURWi : StoreUnscaled<0b10, 0, 0b00, GPR32, am_unscaled32, "stur",
-                           [(store GPR32:$Rt, am_unscaled32:$addr)]>;
-def STURBi : StoreUnscaled<0b00, 1, 0b00, FPR8,  am_unscaled8, "stur",
-                           [(store FPR8:$Rt, am_unscaled8:$addr)]>;
-def STURHi : StoreUnscaled<0b01, 1, 0b00, FPR16, am_unscaled16, "stur",
-                           [(store FPR16:$Rt, am_unscaled16:$addr)]>;
-def STURSi : StoreUnscaled<0b10, 1, 0b00, FPR32, am_unscaled32, "stur",
-                           [(store (f32 FPR32:$Rt), am_unscaled32:$addr)]>;
-def STURDi : StoreUnscaled<0b11, 1, 0b00, FPR64, am_unscaled64, "stur",
-                           [(store (f64 FPR64:$Rt), am_unscaled64:$addr)]>;
-def STURQi : StoreUnscaled<0b00, 1, 0b10, FPR128, am_unscaled128, "stur",
-                           [(store (v2f64 FPR128:$Rt), am_unscaled128:$addr)]>;
-def STURHHi : StoreUnscaled<0b01, 0, 0b00, GPR32, am_unscaled16, "sturh",
-                            [(truncstorei16 GPR32:$Rt, am_unscaled16:$addr)]>;
-def STURBBi : StoreUnscaled<0b00, 0, 0b00, GPR32, am_unscaled8, "sturb",
-                            [(truncstorei8 GPR32:$Rt, am_unscaled8:$addr)]>;
-
-// Match all store 64 bits width whose type is compatible with FPR64
-def : Pat<(store (v2f32 FPR64:$Rn), am_unscaled64:$addr),
-          (STURDi FPR64:$Rn, am_unscaled64:$addr)>;
-def : Pat<(store (v1f64 FPR64:$Rn), am_unscaled64:$addr),
-          (STURDi FPR64:$Rn, am_unscaled64:$addr)>;
-def : Pat<(store (v8i8 FPR64:$Rn), am_unscaled64:$addr),
-          (STURDi FPR64:$Rn, am_unscaled64:$addr)>;
-def : Pat<(store (v4i16 FPR64:$Rn), am_unscaled64:$addr),
-          (STURDi FPR64:$Rn, am_unscaled64:$addr)>;
-def : Pat<(store (v2i32 FPR64:$Rn), am_unscaled64:$addr),
-          (STURDi FPR64:$Rn, am_unscaled64:$addr)>;
-def : Pat<(store (v1i64 FPR64:$Rn), am_unscaled64:$addr),
-          (STURDi FPR64:$Rn, am_unscaled64:$addr)>;
-
-// Match all store 128 bits width whose type is compatible with FPR128
-def : Pat<(store (v4f32 FPR128:$Rn), am_unscaled128:$addr),
-          (STURQi FPR128:$Rn, am_unscaled128:$addr)>;
-def : Pat<(store (v2f64 FPR128:$Rn), am_unscaled128:$addr),
-          (STURQi FPR128:$Rn, am_unscaled128:$addr)>;
-def : Pat<(store (v16i8 FPR128:$Rn), am_unscaled128:$addr),
-          (STURQi FPR128:$Rn, am_unscaled128:$addr)>;
-def : Pat<(store (v8i16 FPR128:$Rn), am_unscaled128:$addr),
-          (STURQi FPR128:$Rn, am_unscaled128:$addr)>;
-def : Pat<(store (v4i32 FPR128:$Rn), am_unscaled128:$addr),
-          (STURQi FPR128:$Rn, am_unscaled128:$addr)>;
-def : Pat<(store (v2i64 FPR128:$Rn), am_unscaled128:$addr),
-          (STURQi FPR128:$Rn, am_unscaled128:$addr)>;
-def : Pat<(store (f128  FPR128:$Rn), am_unscaled128:$addr),
-          (STURQi FPR128:$Rn, am_unscaled128:$addr)>;
-
-// unscaled i64 truncating stores
-def : Pat<(truncstorei32 GPR64:$Rt, am_unscaled32:$addr),
-  (STURWi (EXTRACT_SUBREG GPR64:$Rt, sub_32), am_unscaled32:$addr)>;
-def : Pat<(truncstorei16 GPR64:$Rt, am_unscaled16:$addr),
-  (STURHHi (EXTRACT_SUBREG GPR64:$Rt, sub_32), am_unscaled16:$addr)>;
-def : Pat<(truncstorei8 GPR64:$Rt, am_unscaled8:$addr),
-  (STURBBi (EXTRACT_SUBREG GPR64:$Rt, sub_32), am_unscaled8:$addr)>;
-
-//---
-// STR mnemonics fall back to STUR for negative or unaligned offsets.
-def : InstAlias<"str $Rt, $addr", (STURXi GPR64:$Rt, am_unscaled_fb64:$addr)>;
-def : InstAlias<"str $Rt, $addr", (STURWi GPR32:$Rt, am_unscaled_fb32:$addr)>;
-def : InstAlias<"str $Rt, $addr", (STURBi FPR8:$Rt, am_unscaled_fb8:$addr)>;
-def : InstAlias<"str $Rt, $addr", (STURHi FPR16:$Rt, am_unscaled_fb16:$addr)>;
-def : InstAlias<"str $Rt, $addr", (STURSi FPR32:$Rt, am_unscaled_fb32:$addr)>;
-def : InstAlias<"str $Rt, $addr", (STURDi FPR64:$Rt, am_unscaled_fb64:$addr)>;
-def : InstAlias<"str $Rt, $addr", (STURQi FPR128:$Rt, am_unscaled_fb128:$addr)>;
-
-def : InstAlias<"strb $Rt, $addr", (STURBBi GPR32:$Rt, am_unscaled_fb8:$addr)>;
-def : InstAlias<"strh $Rt, $addr", (STURHHi GPR32:$Rt, am_unscaled_fb16:$addr)>;
-
-//---
-// (unscaled immediate, unprivileged)
-def STTRWi : StoreUnprivileged<0b10, 0, 0b00, GPR32, "sttr">;
-def STTRXi : StoreUnprivileged<0b11, 0, 0b00, GPR64, "sttr">;
-
-def STTRHi : StoreUnprivileged<0b01, 0, 0b00, GPR32, "sttrh">;
-def STTRBi : StoreUnprivileged<0b00, 0, 0b00, GPR32, "sttrb">;
-
-//---
-// (immediate pre-indexed)
-def STRWpre : StorePreIdx<0b10, 0, 0b00, GPR32, "str">;
-def STRXpre : StorePreIdx<0b11, 0, 0b00, GPR64, "str">;
-def STRBpre : StorePreIdx<0b00, 1, 0b00, FPR8,  "str">;
-def STRHpre : StorePreIdx<0b01, 1, 0b00, FPR16, "str">;
-def STRSpre : StorePreIdx<0b10, 1, 0b00, FPR32, "str">;
-def STRDpre : StorePreIdx<0b11, 1, 0b00, FPR64, "str">;
-def STRQpre : StorePreIdx<0b00, 1, 0b10, FPR128, "str">;
-
-def STRBBpre : StorePreIdx<0b00, 0, 0b00, GPR32, "strb">;
-def STRHHpre : StorePreIdx<0b01, 0, 0b00, GPR32, "strh">;
-
-// ISel pseudos and patterns. See expanded comment on StorePreIdxPseudo.
-defm STRDpre : StorePreIdxPseudo<FPR64, f64, pre_store>;
-defm STRSpre : StorePreIdxPseudo<FPR32, f32, pre_store>;
-defm STRXpre : StorePreIdxPseudo<GPR64, i64, pre_store>;
-defm STRWpre : StorePreIdxPseudo<GPR32, i32, pre_store>;
-defm STRHHpre : StorePreIdxPseudo<GPR32, i32, pre_truncsti16>;
-defm STRBBpre : StorePreIdxPseudo<GPR32, i32, pre_truncsti8>;
-// truncstore i64
-def : Pat<(pre_truncsti32 GPR64:$Rt, am_noindex:$addr, simm9:$off),
-  (STRWpre_isel (EXTRACT_SUBREG GPR64:$Rt, sub_32), am_noindex:$addr,
-                  simm9:$off)>;
-def : Pat<(pre_truncsti16 GPR64:$Rt, am_noindex:$addr, simm9:$off),
-  (STRHHpre_isel (EXTRACT_SUBREG GPR64:$Rt, sub_32), am_noindex:$addr,
-                  simm9:$off)>;
-def : Pat<(pre_truncsti8 GPR64:$Rt, am_noindex:$addr, simm9:$off),
-  (STRBBpre_isel (EXTRACT_SUBREG GPR64:$Rt, sub_32), am_noindex:$addr,
-                  simm9:$off)>;
-
-//---
-// (immediate post-indexed)
-def STRWpost : StorePostIdx<0b10, 0, 0b00, GPR32, "str">;
-def STRXpost : StorePostIdx<0b11, 0, 0b00, GPR64, "str">;
-def STRBpost : StorePostIdx<0b00, 1, 0b00, FPR8,  "str">;
-def STRHpost : StorePostIdx<0b01, 1, 0b00, FPR16, "str">;
-def STRSpost : StorePostIdx<0b10, 1, 0b00, FPR32, "str">;
-def STRDpost : StorePostIdx<0b11, 1, 0b00, FPR64, "str">;
-def STRQpost : StorePostIdx<0b00, 1, 0b10, FPR128, "str">;
-
-def STRBBpost : StorePostIdx<0b00, 0, 0b00, GPR32, "strb">;
-def STRHHpost : StorePostIdx<0b01, 0, 0b00, GPR32, "strh">;
-
-// ISel pseudos and patterns. See expanded comment on StorePostIdxPseudo.
-defm STRDpost : StorePostIdxPseudo<FPR64, f64, post_store, STRDpost>;
-defm STRSpost : StorePostIdxPseudo<FPR32, f32, post_store, STRSpost>;
-defm STRXpost : StorePostIdxPseudo<GPR64, i64, post_store, STRXpost>;
-defm STRWpost : StorePostIdxPseudo<GPR32, i32, post_store, STRWpost>;
-defm STRHHpost : StorePostIdxPseudo<GPR32, i32, post_truncsti16, STRHHpost>;
-defm STRBBpost : StorePostIdxPseudo<GPR32, i32, post_truncsti8, STRBBpost>;
-// truncstore i64
-def : Pat<(post_truncsti32 GPR64:$Rt, am_noindex:$addr, simm9:$off),
-  (STRWpost_isel (EXTRACT_SUBREG GPR64:$Rt, sub_32), am_noindex:$addr,
-                  simm9:$off)>;
-def : Pat<(post_truncsti16 GPR64:$Rt, am_noindex:$addr, simm9:$off),
-  (STRHHpost_isel (EXTRACT_SUBREG GPR64:$Rt, sub_32), am_noindex:$addr,
-                  simm9:$off)>;
-def : Pat<(post_truncsti8 GPR64:$Rt, am_noindex:$addr, simm9:$off),
-  (STRBBpost_isel (EXTRACT_SUBREG GPR64:$Rt, sub_32), am_noindex:$addr,
-                  simm9:$off)>;
-
-
-//===----------------------------------------------------------------------===//
-// Load/store exclusive instructions.
-//===----------------------------------------------------------------------===//
-
-def LDARW  : LoadAcquire   <0b10, 1, 1, 0, 1, GPR32, "ldar">;
-def LDARX  : LoadAcquire   <0b11, 1, 1, 0, 1, GPR64, "ldar">;
-def LDARB  : LoadAcquire   <0b00, 1, 1, 0, 1, GPR32, "ldarb">;
-def LDARH  : LoadAcquire   <0b01, 1, 1, 0, 1, GPR32, "ldarh">;
-
-def LDAXRW : LoadExclusive <0b10, 0, 1, 0, 1, GPR32, "ldaxr">;
-def LDAXRX : LoadExclusive <0b11, 0, 1, 0, 1, GPR64, "ldaxr">;
-def LDAXRB : LoadExclusive <0b00, 0, 1, 0, 1, GPR32, "ldaxrb">;
-def LDAXRH : LoadExclusive <0b01, 0, 1, 0, 1, GPR32, "ldaxrh">;
-
-def LDXRW  : LoadExclusive <0b10, 0, 1, 0, 0, GPR32, "ldxr">;
-def LDXRX  : LoadExclusive <0b11, 0, 1, 0, 0, GPR64, "ldxr">;
-def LDXRB  : LoadExclusive <0b00, 0, 1, 0, 0, GPR32, "ldxrb">;
-def LDXRH  : LoadExclusive <0b01, 0, 1, 0, 0, GPR32, "ldxrh">;
-
-def STLRW  : StoreRelease  <0b10, 1, 0, 0, 1, GPR32, "stlr">;
-def STLRX  : StoreRelease  <0b11, 1, 0, 0, 1, GPR64, "stlr">;
-def STLRB  : StoreRelease  <0b00, 1, 0, 0, 1, GPR32, "stlrb">;
-def STLRH  : StoreRelease  <0b01, 1, 0, 0, 1, GPR32, "stlrh">;
-
-def STLXRW : StoreExclusive<0b10, 0, 0, 0, 1, GPR32, "stlxr">;
-def STLXRX : StoreExclusive<0b11, 0, 0, 0, 1, GPR64, "stlxr">;
-def STLXRB : StoreExclusive<0b00, 0, 0, 0, 1, GPR32, "stlxrb">;
-def STLXRH : StoreExclusive<0b01, 0, 0, 0, 1, GPR32, "stlxrh">;
-
-def STXRW  : StoreExclusive<0b10, 0, 0, 0, 0, GPR32, "stxr">;
-def STXRX  : StoreExclusive<0b11, 0, 0, 0, 0, GPR64, "stxr">;
-def STXRB  : StoreExclusive<0b00, 0, 0, 0, 0, GPR32, "stxrb">;
-def STXRH  : StoreExclusive<0b01, 0, 0, 0, 0, GPR32, "stxrh">;
-
-def LDAXPW : LoadExclusivePair<0b10, 0, 1, 1, 1, GPR32, "ldaxp">;
-def LDAXPX : LoadExclusivePair<0b11, 0, 1, 1, 1, GPR64, "ldaxp">;
-
-def LDXPW  : LoadExclusivePair<0b10, 0, 1, 1, 0, GPR32, "ldxp">;
-def LDXPX  : LoadExclusivePair<0b11, 0, 1, 1, 0, GPR64, "ldxp">;
-
-def STLXPW : StoreExclusivePair<0b10, 0, 0, 1, 1, GPR32, "stlxp">;
-def STLXPX : StoreExclusivePair<0b11, 0, 0, 1, 1, GPR64, "stlxp">;
-
-def STXPW  : StoreExclusivePair<0b10, 0, 0, 1, 0, GPR32, "stxp">;
-def STXPX  : StoreExclusivePair<0b11, 0, 0, 1, 0, GPR64, "stxp">;
-
-//===----------------------------------------------------------------------===//
-// Scaled floating point to integer conversion instructions.
-//===----------------------------------------------------------------------===//
-
-defm FCVTAS : FPToInteger<0b00, 0b100, "fcvtas", int_arm64_neon_fcvtas>;
-defm FCVTAU : FPToInteger<0b00, 0b101, "fcvtau", int_arm64_neon_fcvtau>;
-defm FCVTMS : FPToInteger<0b10, 0b000, "fcvtms", int_arm64_neon_fcvtms>;
-defm FCVTMU : FPToInteger<0b10, 0b001, "fcvtmu", int_arm64_neon_fcvtmu>;
-defm FCVTNS : FPToInteger<0b00, 0b000, "fcvtns", int_arm64_neon_fcvtns>;
-defm FCVTNU : FPToInteger<0b00, 0b001, "fcvtnu", int_arm64_neon_fcvtnu>;
-defm FCVTPS : FPToInteger<0b01, 0b000, "fcvtps", int_arm64_neon_fcvtps>;
-defm FCVTPU : FPToInteger<0b01, 0b001, "fcvtpu", int_arm64_neon_fcvtpu>;
-defm FCVTZS : FPToInteger<0b11, 0b000, "fcvtzs", fp_to_sint>;
-defm FCVTZU : FPToInteger<0b11, 0b001, "fcvtzu", fp_to_uint>;
-let isCodeGenOnly = 1 in {
-defm FCVTZS_Int : FPToInteger<0b11, 0b000, "fcvtzs", int_arm64_neon_fcvtzs>;
-defm FCVTZU_Int : FPToInteger<0b11, 0b001, "fcvtzu", int_arm64_neon_fcvtzu>;
-}
-
-//===----------------------------------------------------------------------===//
-// Scaled integer to floating point conversion instructions.
-//===----------------------------------------------------------------------===//
-
-defm SCVTF : IntegerToFP<0, "scvtf", sint_to_fp>;
-defm UCVTF : IntegerToFP<1, "ucvtf", uint_to_fp>;
-
-//===----------------------------------------------------------------------===//
-// Unscaled integer to floating point conversion instruction.
-//===----------------------------------------------------------------------===//
-
-defm FMOV : UnscaledConversion<"fmov">;
-
-def : Pat<(f32 (fpimm0)), (FMOVWSr WZR)>, Requires<[NoZCZ]>;
-def : Pat<(f64 (fpimm0)), (FMOVXDr XZR)>, Requires<[NoZCZ]>;
-
-def : Pat<(v8i8  (bitconvert GPR64:$Xn)), (FMOVXDr GPR64:$Xn)>;
-def : Pat<(v4i16 (bitconvert GPR64:$Xn)), (FMOVXDr GPR64:$Xn)>;
-def : Pat<(v2i32 (bitconvert GPR64:$Xn)), (FMOVXDr GPR64:$Xn)>;
-def : Pat<(v1i64 (bitconvert GPR64:$Xn)), (FMOVXDr GPR64:$Xn)>;
-def : Pat<(v2f32 (bitconvert GPR64:$Xn)), (FMOVXDr GPR64:$Xn)>;
-def : Pat<(v1f64 (bitconvert GPR64:$Xn)), (FMOVXDr GPR64:$Xn)>;
-def : Pat<(v1i64 (scalar_to_vector GPR64:$Xn)), (FMOVXDr GPR64:$Xn)>;
-def : Pat<(v1f64 (scalar_to_vector GPR64:$Xn)), (FMOVXDr GPR64:$Xn)>;
-def : Pat<(v1f64 (scalar_to_vector (f64 FPR64:$Xn))), (v1f64 FPR64:$Xn)>;
-
-def : Pat<(i64 (bitconvert (v8i8  V64:$Vn))), (FMOVDXr V64:$Vn)>;
-def : Pat<(i64 (bitconvert (v4i16 V64:$Vn))), (FMOVDXr V64:$Vn)>;
-def : Pat<(i64 (bitconvert (v2i32 V64:$Vn))), (FMOVDXr V64:$Vn)>;
-def : Pat<(i64 (bitconvert (v1i64 V64:$Vn))), (FMOVDXr V64:$Vn)>;
-def : Pat<(i64 (bitconvert (v2f32 V64:$Vn))), (FMOVDXr V64:$Vn)>;
-def : Pat<(i64 (bitconvert (v1f64 V64:$Vn))), (FMOVDXr V64:$Vn)>;
-
-def : Pat<(f32 (bitconvert (i32 GPR32:$Xn))), (COPY_TO_REGCLASS GPR32:$Xn,
-                                                                FPR32)>;
-def : Pat<(i32 (bitconvert (f32 FPR32:$Xn))), (COPY_TO_REGCLASS FPR32:$Xn,
-                                                                GPR32)>;
-def : Pat<(f64 (bitconvert (i64 GPR64:$Xn))), (COPY_TO_REGCLASS GPR64:$Xn,
-                                                                FPR64)>;
-def : Pat<(i64 (bitconvert (f64 FPR64:$Xn))), (COPY_TO_REGCLASS FPR64:$Xn,
-                                                                GPR64)>;
-
-//===----------------------------------------------------------------------===//
-// Floating point conversion instruction.
-//===----------------------------------------------------------------------===//
-
-defm FCVT : FPConversion<"fcvt">;
-
-def : Pat<(f32_to_f16 FPR32:$Rn),
-          (i32 (COPY_TO_REGCLASS
-                   (f32 (SUBREG_TO_REG (i32 0), (FCVTHSr FPR32:$Rn), hsub)),
-                   GPR32))>;
-
-def FCVTSHpseudo : Pseudo<(outs FPR32:$Rd), (ins FPR32:$Rn),
-                          [(set (f32 FPR32:$Rd), (f16_to_f32 i32:$Rn))]>;
-
-//===----------------------------------------------------------------------===//
-// Floating point single operand instructions.
-//===----------------------------------------------------------------------===//
-
-defm FABS   : SingleOperandFPData<0b0001, "fabs", fabs>;
-defm FMOV   : SingleOperandFPData<0b0000, "fmov">;
-defm FNEG   : SingleOperandFPData<0b0010, "fneg", fneg>;
-defm FRINTA : SingleOperandFPData<0b1100, "frinta", frnd>;
-defm FRINTI : SingleOperandFPData<0b1111, "frinti", fnearbyint>;
-defm FRINTM : SingleOperandFPData<0b1010, "frintm", ffloor>;
-defm FRINTN : SingleOperandFPData<0b1000, "frintn", int_arm64_neon_frintn>;
-defm FRINTP : SingleOperandFPData<0b1001, "frintp", fceil>;
-
-def : Pat<(v1f64 (int_arm64_neon_frintn (v1f64 FPR64:$Rn))),
-          (FRINTNDr FPR64:$Rn)>;
-
-// FRINTX is inserted to set the flags as required by FENV_ACCESS ON behavior
-// in the C spec. Setting hasSideEffects ensures it is not DCE'd.
-// <rdar://problem/13715968>
-// TODO: We should really model the FPSR flags correctly. This is really ugly.
-let hasSideEffects = 1 in {
-defm FRINTX : SingleOperandFPData<0b1110, "frintx", frint>;
-}
-
-defm FRINTZ : SingleOperandFPData<0b1011, "frintz", ftrunc>;
-
-let SchedRW = [WriteFDiv] in {
-defm FSQRT  : SingleOperandFPData<0b0011, "fsqrt", fsqrt>;
-}
-
-//===----------------------------------------------------------------------===//
-// Floating point two operand instructions.
-//===----------------------------------------------------------------------===//
-
-defm FADD   : TwoOperandFPData<0b0010, "fadd", fadd>;
-let SchedRW = [WriteFDiv] in {
-defm FDIV   : TwoOperandFPData<0b0001, "fdiv", fdiv>;
-}
-defm FMAXNM : TwoOperandFPData<0b0110, "fmaxnm", int_arm64_neon_fmaxnm>;
-defm FMAX   : TwoOperandFPData<0b0100, "fmax", ARM64fmax>;
-defm FMINNM : TwoOperandFPData<0b0111, "fminnm", int_arm64_neon_fminnm>;
-defm FMIN   : TwoOperandFPData<0b0101, "fmin", ARM64fmin>;
-let SchedRW = [WriteFMul] in {
-defm FMUL   : TwoOperandFPData<0b0000, "fmul", fmul>;
-defm FNMUL  : TwoOperandFPDataNeg<0b1000, "fnmul", fmul>;
-}
-defm FSUB   : TwoOperandFPData<0b0011, "fsub", fsub>;
-
-def : Pat<(v1f64 (ARM64fmax (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
-          (FMAXDrr FPR64:$Rn, FPR64:$Rm)>;
-def : Pat<(v1f64 (ARM64fmin (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
-          (FMINDrr FPR64:$Rn, FPR64:$Rm)>;
-def : Pat<(v1f64 (int_arm64_neon_fmaxnm (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
-          (FMAXNMDrr FPR64:$Rn, FPR64:$Rm)>;
-def : Pat<(v1f64 (int_arm64_neon_fminnm (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
-          (FMINNMDrr FPR64:$Rn, FPR64:$Rm)>;
-
-//===----------------------------------------------------------------------===//
-// Floating point three operand instructions.
-//===----------------------------------------------------------------------===//
-
-defm FMADD  : ThreeOperandFPData<0, 0, "fmadd", fma>;
-defm FMSUB  : ThreeOperandFPData<0, 1, "fmsub",
-     TriOpFrag<(fma node:$LHS, (fneg node:$MHS), node:$RHS)> >;
-defm FNMADD : ThreeOperandFPData<1, 0, "fnmadd",
-     TriOpFrag<(fneg (fma node:$LHS, node:$MHS, node:$RHS))> >;
-defm FNMSUB : ThreeOperandFPData<1, 1, "fnmsub",
-     TriOpFrag<(fma node:$LHS, node:$MHS, (fneg node:$RHS))> >;
-
-// The following def pats catch the case where the LHS of an FMA is negated.
-// The TriOpFrag above catches the case where the middle operand is negated.
-def : Pat<(f32 (fma (fneg FPR32:$Rn), FPR32:$Rm, FPR32:$Rd)),
-          (FMSUBSrrr FPR32:$Rd, FPR32:$Rn, FPR32:$Rm)>;
-
-def : Pat<(f64 (fma (fneg FPR64:$Rn), FPR64:$Rm, FPR64:$Rd)),
-          (FMSUBDrrr FPR64:$Rd, FPR64:$Rn, FPR64:$Rm)>;
-
-//===----------------------------------------------------------------------===//
-// Floating point comparison instructions.
-//===----------------------------------------------------------------------===//
-
-defm FCMPE : FPComparison<1, "fcmpe">;
-defm FCMP  : FPComparison<0, "fcmp", ARM64fcmp>;
-
-//===----------------------------------------------------------------------===//
-// Floating point conditional comparison instructions.
-//===----------------------------------------------------------------------===//
-
-defm FCCMPE : FPCondComparison<1, "fccmpe">;
-defm FCCMP  : FPCondComparison<0, "fccmp">;
-
-//===----------------------------------------------------------------------===//
-// Floating point conditional select instruction.
-//===----------------------------------------------------------------------===//
-
-defm FCSEL : FPCondSelect<"fcsel">;
-
-// CSEL instructions providing f128 types need to be handled by a
-// pseudo-instruction since the eventual code will need to introduce basic
-// blocks and control flow.
-def F128CSEL : Pseudo<(outs FPR128:$Rd),
-                      (ins FPR128:$Rn, FPR128:$Rm, ccode:$cond),
-                      [(set (f128 FPR128:$Rd),
-                            (ARM64csel FPR128:$Rn, FPR128:$Rm,
-                                       (i32 imm:$cond), CPSR))]> {
-  let Uses = [CPSR];
-  let usesCustomInserter = 1;
-}
-
-
-//===----------------------------------------------------------------------===//
-// Floating point immediate move.
-//===----------------------------------------------------------------------===//
-
-let isReMaterializable = 1 in {
-defm FMOV : FPMoveImmediate<"fmov">;
-}
-
-//===----------------------------------------------------------------------===//
-// Advanced SIMD two vector instructions.
-//===----------------------------------------------------------------------===//
-
-defm ABS    : SIMDTwoVectorBHSD<0, 0b01011, "abs", int_arm64_neon_abs>;
-defm CLS    : SIMDTwoVectorBHS<0, 0b00100, "cls", int_arm64_neon_cls>;
-defm CLZ    : SIMDTwoVectorBHS<1, 0b00100, "clz", ctlz>;
-defm CMEQ   : SIMDCmpTwoVector<0, 0b01001, "cmeq", ARM64cmeqz>;
-defm CMGE   : SIMDCmpTwoVector<1, 0b01000, "cmge", ARM64cmgez>;
-defm CMGT   : SIMDCmpTwoVector<0, 0b01000, "cmgt", ARM64cmgtz>;
-defm CMLE   : SIMDCmpTwoVector<1, 0b01001, "cmle", ARM64cmlez>;
-defm CMLT   : SIMDCmpTwoVector<0, 0b01010, "cmlt", ARM64cmltz>;
-defm CNT    : SIMDTwoVectorB<0, 0b00, 0b00101, "cnt", ctpop>;
-defm FABS   : SIMDTwoVectorFP<0, 1, 0b01111, "fabs", fabs>;
-
-defm FCMEQ  : SIMDFPCmpTwoVector<0, 1, 0b01101, "fcmeq", ARM64fcmeqz>;
-defm FCMGE  : SIMDFPCmpTwoVector<1, 1, 0b01100, "fcmge", ARM64fcmgez>;
-defm FCMGT  : SIMDFPCmpTwoVector<0, 1, 0b01100, "fcmgt", ARM64fcmgtz>;
-defm FCMLE  : SIMDFPCmpTwoVector<1, 1, 0b01101, "fcmle", ARM64fcmlez>;
-defm FCMLT  : SIMDFPCmpTwoVector<0, 1, 0b01110, "fcmlt", ARM64fcmltz>;
-defm FCVTAS : SIMDTwoVectorFPToInt<0,0,0b11100, "fcvtas",int_arm64_neon_fcvtas>;
-defm FCVTAU : SIMDTwoVectorFPToInt<1,0,0b11100, "fcvtau",int_arm64_neon_fcvtau>;
-defm FCVTL  : SIMDFPWidenTwoVector<0, 0, 0b10111, "fcvtl">;
-def : Pat<(v4f32 (int_arm64_neon_vcvthf2fp (v4i16 V64:$Rn))),
-          (FCVTLv4i16 V64:$Rn)>;
-def : Pat<(v4f32 (int_arm64_neon_vcvthf2fp (extract_subvector (v8i16 V128:$Rn),
-                                                              (i64 4)))),
-          (FCVTLv8i16 V128:$Rn)>;
-def : Pat<(v2f64 (fextend (v2f32 V64:$Rn))), (FCVTLv2i32 V64:$Rn)>;
-def : Pat<(v2f64 (fextend (v2f32 (extract_subvector (v4f32 V128:$Rn),
-                                                    (i64 2))))),
-          (FCVTLv4i32 V128:$Rn)>;
-
-defm FCVTMS : SIMDTwoVectorFPToInt<0,0,0b11011, "fcvtms",int_arm64_neon_fcvtms>;
-defm FCVTMU : SIMDTwoVectorFPToInt<1,0,0b11011, "fcvtmu",int_arm64_neon_fcvtmu>;
-defm FCVTNS : SIMDTwoVectorFPToInt<0,0,0b11010, "fcvtns",int_arm64_neon_fcvtns>;
-defm FCVTNU : SIMDTwoVectorFPToInt<1,0,0b11010, "fcvtnu",int_arm64_neon_fcvtnu>;
-defm FCVTN  : SIMDFPNarrowTwoVector<0, 0, 0b10110, "fcvtn">;
-def : Pat<(v4i16 (int_arm64_neon_vcvtfp2hf (v4f32 V128:$Rn))),
-          (FCVTNv4i16 V128:$Rn)>;
-def : Pat<(concat_vectors V64:$Rd,
-                          (v4i16 (int_arm64_neon_vcvtfp2hf (v4f32 V128:$Rn)))),
-          (FCVTNv8i16 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>;
-def : Pat<(v2f32 (fround (v2f64 V128:$Rn))), (FCVTNv2i32 V128:$Rn)>;
-def : Pat<(concat_vectors V64:$Rd, (v2f32 (fround (v2f64 V128:$Rn)))),
-          (FCVTNv4i32 (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), V128:$Rn)>;
-defm FCVTPS : SIMDTwoVectorFPToInt<0,1,0b11010, "fcvtps",int_arm64_neon_fcvtps>;
-defm FCVTPU : SIMDTwoVectorFPToInt<1,1,0b11010, "fcvtpu",int_arm64_neon_fcvtpu>;
-defm FCVTXN : SIMDFPInexactCvtTwoVector<1, 0, 0b10110, "fcvtxn",
-                                        int_arm64_neon_fcvtxn>;
-defm FCVTZS : SIMDTwoVectorFPToInt<0, 1, 0b11011, "fcvtzs", fp_to_sint>;
-defm FCVTZU : SIMDTwoVectorFPToInt<1, 1, 0b11011, "fcvtzu", fp_to_uint>;
-let isCodeGenOnly = 1 in {
-defm FCVTZS_Int : SIMDTwoVectorFPToInt<0, 1, 0b11011, "fcvtzs",
-                                       int_arm64_neon_fcvtzs>;
-defm FCVTZU_Int : SIMDTwoVectorFPToInt<1, 1, 0b11011, "fcvtzu",
-                                       int_arm64_neon_fcvtzu>;
-}
-defm FNEG   : SIMDTwoVectorFP<1, 1, 0b01111, "fneg", fneg>;
-defm FRECPE : SIMDTwoVectorFP<0, 1, 0b11101, "frecpe", int_arm64_neon_frecpe>;
-defm FRINTA : SIMDTwoVectorFP<1, 0, 0b11000, "frinta", frnd>;
-defm FRINTI : SIMDTwoVectorFP<1, 1, 0b11001, "frinti", fnearbyint>;
-defm FRINTM : SIMDTwoVectorFP<0, 0, 0b11001, "frintm", ffloor>;
-defm FRINTN : SIMDTwoVectorFP<0, 0, 0b11000, "frintn", int_arm64_neon_frintn>;
-defm FRINTP : SIMDTwoVectorFP<0, 1, 0b11000, "frintp", fceil>;
-defm FRINTX : SIMDTwoVectorFP<1, 0, 0b11001, "frintx", frint>;
-defm FRINTZ : SIMDTwoVectorFP<0, 1, 0b11001, "frintz", ftrunc>;
-defm FRSQRTE: SIMDTwoVectorFP<1, 1, 0b11101, "frsqrte", int_arm64_neon_frsqrte>;
-defm FSQRT  : SIMDTwoVectorFP<1, 1, 0b11111, "fsqrt", fsqrt>;
-defm NEG    : SIMDTwoVectorBHSD<1, 0b01011, "neg",
-                               UnOpFrag<(sub immAllZerosV, node:$LHS)> >;
-defm NOT    : SIMDTwoVectorB<1, 0b00, 0b00101, "not", vnot>;
-// Aliases for MVN -> NOT.
-def : InstAlias<"mvn.8b $Vd, $Vn", (NOTv8i8 V64:$Vd, V64:$Vn)>;
-def : InstAlias<"mvn.16b $Vd, $Vn", (NOTv16i8 V128:$Vd, V128:$Vn)>;
-def : InstAlias<"mvn $Vd.8b, $Vn.8b", (NOTv8i8 V64:$Vd, V64:$Vn)>;
-def : InstAlias<"mvn $Vd.16b, $Vn.16b", (NOTv16i8 V128:$Vd, V128:$Vn)>;
-
-def : Pat<(ARM64neg (v8i8  V64:$Rn)),  (NEGv8i8  V64:$Rn)>;
-def : Pat<(ARM64neg (v16i8 V128:$Rn)), (NEGv16i8 V128:$Rn)>;
-def : Pat<(ARM64neg (v4i16 V64:$Rn)),  (NEGv4i16 V64:$Rn)>;
-def : Pat<(ARM64neg (v8i16 V128:$Rn)), (NEGv8i16 V128:$Rn)>;
-def : Pat<(ARM64neg (v2i32 V64:$Rn)),  (NEGv2i32 V64:$Rn)>;
-def : Pat<(ARM64neg (v4i32 V128:$Rn)), (NEGv4i32 V128:$Rn)>;
-def : Pat<(ARM64neg (v2i64 V128:$Rn)), (NEGv2i64 V128:$Rn)>;
-
-def : Pat<(ARM64not (v8i8 V64:$Rn)),   (NOTv8i8  V64:$Rn)>;
-def : Pat<(ARM64not (v16i8 V128:$Rn)), (NOTv16i8 V128:$Rn)>;
-def : Pat<(ARM64not (v4i16 V64:$Rn)),  (NOTv8i8  V64:$Rn)>;
-def : Pat<(ARM64not (v8i16 V128:$Rn)), (NOTv16i8 V128:$Rn)>;
-def : Pat<(ARM64not (v2i32 V64:$Rn)),  (NOTv8i8  V64:$Rn)>;
-def : Pat<(ARM64not (v4i32 V128:$Rn)), (NOTv16i8 V128:$Rn)>;
-def : Pat<(ARM64not (v2i64 V128:$Rn)), (NOTv16i8 V128:$Rn)>;
-
-def : Pat<(vnot (v4i16 V64:$Rn)),  (NOTv8i8  V64:$Rn)>;
-def : Pat<(vnot (v8i16 V128:$Rn)), (NOTv16i8 V128:$Rn)>;
-def : Pat<(vnot (v2i32 V64:$Rn)),  (NOTv8i8  V64:$Rn)>;
-def : Pat<(vnot (v4i32 V128:$Rn)), (NOTv16i8 V128:$Rn)>;
-def : Pat<(vnot (v2i64 V128:$Rn)), (NOTv16i8 V128:$Rn)>;
-
-defm RBIT   : SIMDTwoVectorB<1, 0b01, 0b00101, "rbit", int_arm64_neon_rbit>;
-defm REV16  : SIMDTwoVectorB<0, 0b00, 0b00001, "rev16", ARM64rev16>;
-defm REV32  : SIMDTwoVectorBH<1, 0b00000, "rev32", ARM64rev32>;
-defm REV64  : SIMDTwoVectorBHS<0, 0b00000, "rev64", ARM64rev64>;
-defm SADALP : SIMDLongTwoVectorTied<0, 0b00110, "sadalp",
-       BinOpFrag<(add node:$LHS, (int_arm64_neon_saddlp node:$RHS))> >;
-defm SADDLP : SIMDLongTwoVector<0, 0b00010, "saddlp", int_arm64_neon_saddlp>;
-defm SCVTF  : SIMDTwoVectorIntToFP<0, 0, 0b11101, "scvtf", sint_to_fp>;
-defm SHLL   : SIMDVectorLShiftLongBySizeBHS;
-defm SQABS  : SIMDTwoVectorBHSD<0, 0b00111, "sqabs", int_arm64_neon_sqabs>;
-defm SQNEG  : SIMDTwoVectorBHSD<1, 0b00111, "sqneg", int_arm64_neon_sqneg>;
-defm SQXTN  : SIMDMixedTwoVector<0, 0b10100, "sqxtn", int_arm64_neon_sqxtn>;
-defm SQXTUN : SIMDMixedTwoVector<1, 0b10010, "sqxtun", int_arm64_neon_sqxtun>;
-defm SUQADD : SIMDTwoVectorBHSDTied<0, 0b00011, "suqadd",int_arm64_neon_suqadd>;
-defm UADALP : SIMDLongTwoVectorTied<1, 0b00110, "uadalp",
-       BinOpFrag<(add node:$LHS, (int_arm64_neon_uaddlp node:$RHS))> >;
-defm UADDLP : SIMDLongTwoVector<1, 0b00010, "uaddlp",
-                    int_arm64_neon_uaddlp>;
-defm UCVTF  : SIMDTwoVectorIntToFP<1, 0, 0b11101, "ucvtf", uint_to_fp>;
-defm UQXTN  : SIMDMixedTwoVector<1, 0b10100, "uqxtn", int_arm64_neon_uqxtn>;
-defm URECPE : SIMDTwoVectorS<0, 1, 0b11100, "urecpe", int_arm64_neon_urecpe>;
-defm URSQRTE: SIMDTwoVectorS<1, 1, 0b11100, "ursqrte", int_arm64_neon_ursqrte>;
-defm USQADD : SIMDTwoVectorBHSDTied<1, 0b00011, "usqadd",int_arm64_neon_usqadd>;
-defm XTN    : SIMDMixedTwoVector<0, 0b10010, "xtn", trunc>;
-
-def : Pat<(v2f32 (ARM64rev64 V64:$Rn)), (REV64v2i32 V64:$Rn)>;
-def : Pat<(v4f32 (ARM64rev64 V128:$Rn)), (REV64v4i32 V128:$Rn)>;
-
-// Patterns for vector long shift (by element width). These need to match all
-// three of zext, sext and anyext so it's easier to pull the patterns out of the
-// definition.
-multiclass SIMDVectorLShiftLongBySizeBHSPats<SDPatternOperator ext> {
-  def : Pat<(ARM64vshl (v8i16 (ext (v8i8 V64:$Rn))), (i32 8)),
-            (SHLLv8i8 V64:$Rn)>;
-  def : Pat<(ARM64vshl (v8i16 (ext (extract_high_v16i8 V128:$Rn))), (i32 8)),
-            (SHLLv16i8 V128:$Rn)>;
-  def : Pat<(ARM64vshl (v4i32 (ext (v4i16 V64:$Rn))), (i32 16)),
-            (SHLLv4i16 V64:$Rn)>;
-  def : Pat<(ARM64vshl (v4i32 (ext (extract_high_v8i16 V128:$Rn))), (i32 16)),
-            (SHLLv8i16 V128:$Rn)>;
-  def : Pat<(ARM64vshl (v2i64 (ext (v2i32 V64:$Rn))), (i32 32)),
-            (SHLLv2i32 V64:$Rn)>;
-  def : Pat<(ARM64vshl (v2i64 (ext (extract_high_v4i32 V128:$Rn))), (i32 32)),
-            (SHLLv4i32 V128:$Rn)>;
-}
-
-defm : SIMDVectorLShiftLongBySizeBHSPats<anyext>;
-defm : SIMDVectorLShiftLongBySizeBHSPats<zext>;
-defm : SIMDVectorLShiftLongBySizeBHSPats<sext>;
-
-//===----------------------------------------------------------------------===//
-// Advanced SIMD three vector instructions.
-//===----------------------------------------------------------------------===//
-
-defm ADD     : SIMDThreeSameVector<0, 0b10000, "add", add>;
-defm ADDP    : SIMDThreeSameVector<0, 0b10111, "addp", int_arm64_neon_addp>;
-defm CMEQ    : SIMDThreeSameVector<1, 0b10001, "cmeq", ARM64cmeq>;
-defm CMGE    : SIMDThreeSameVector<0, 0b00111, "cmge", ARM64cmge>;
-defm CMGT    : SIMDThreeSameVector<0, 0b00110, "cmgt", ARM64cmgt>;
-defm CMHI    : SIMDThreeSameVector<1, 0b00110, "cmhi", ARM64cmhi>;
-defm CMHS    : SIMDThreeSameVector<1, 0b00111, "cmhs", ARM64cmhs>;
-defm CMTST   : SIMDThreeSameVector<0, 0b10001, "cmtst", ARM64cmtst>;
-defm FABD    : SIMDThreeSameVectorFP<1,1,0b11010,"fabd", int_arm64_neon_fabd>;
-defm FACGE   : SIMDThreeSameVectorFPCmp<1,0,0b11101,"facge",int_arm64_neon_facge>;
-defm FACGT   : SIMDThreeSameVectorFPCmp<1,1,0b11101,"facgt",int_arm64_neon_facgt>;
-defm FADDP   : SIMDThreeSameVectorFP<1,0,0b11010,"faddp",int_arm64_neon_addp>;
-defm FADD    : SIMDThreeSameVectorFP<0,0,0b11010,"fadd", fadd>;
-defm FCMEQ   : SIMDThreeSameVectorFPCmp<0, 0, 0b11100, "fcmeq", ARM64fcmeq>;
-defm FCMGE   : SIMDThreeSameVectorFPCmp<1, 0, 0b11100, "fcmge", ARM64fcmge>;
-defm FCMGT   : SIMDThreeSameVectorFPCmp<1, 1, 0b11100, "fcmgt", ARM64fcmgt>;
-defm FDIV    : SIMDThreeSameVectorFP<1,0,0b11111,"fdiv", fdiv>;
-defm FMAXNMP : SIMDThreeSameVectorFP<1,0,0b11000,"fmaxnmp", int_arm64_neon_fmaxnmp>;
-defm FMAXNM  : SIMDThreeSameVectorFP<0,0,0b11000,"fmaxnm", int_arm64_neon_fmaxnm>;
-defm FMAXP   : SIMDThreeSameVectorFP<1,0,0b11110,"fmaxp", int_arm64_neon_fmaxp>;
-defm FMAX    : SIMDThreeSameVectorFP<0,0,0b11110,"fmax", ARM64fmax>;
-defm FMINNMP : SIMDThreeSameVectorFP<1,1,0b11000,"fminnmp", int_arm64_neon_fminnmp>;
-defm FMINNM  : SIMDThreeSameVectorFP<0,1,0b11000,"fminnm", int_arm64_neon_fminnm>;
-defm FMINP   : SIMDThreeSameVectorFP<1,1,0b11110,"fminp", int_arm64_neon_fminp>;
-defm FMIN    : SIMDThreeSameVectorFP<0,1,0b11110,"fmin", ARM64fmin>;
-
-// NOTE: The operands of the PatFrag are reordered on FMLA/FMLS because the
-// instruction expects the addend first, while the fma intrinsic puts it last.
-defm FMLA     : SIMDThreeSameVectorFPTied<0, 0, 0b11001, "fmla",
-            TriOpFrag<(fma node:$RHS, node:$MHS, node:$LHS)> >;
-defm FMLS     : SIMDThreeSameVectorFPTied<0, 1, 0b11001, "fmls",
-            TriOpFrag<(fma node:$MHS, (fneg node:$RHS), node:$LHS)> >;
-
-// The following def pats catch the case where the LHS of an FMA is negated.
-// The TriOpFrag above catches the case where the middle operand is negated.
-def : Pat<(v2f32 (fma (fneg V64:$Rn), V64:$Rm, V64:$Rd)),
-          (FMLSv2f32 V64:$Rd, V64:$Rn, V64:$Rm)>;
-
-def : Pat<(v4f32 (fma (fneg V128:$Rn), V128:$Rm, V128:$Rd)),
-          (FMLSv4f32 V128:$Rd, V128:$Rn, V128:$Rm)>;
-
-def : Pat<(v2f64 (fma (fneg V128:$Rn), V128:$Rm, V128:$Rd)),
-          (FMLSv2f64 V128:$Rd, V128:$Rn, V128:$Rm)>;
-
-defm FMULX    : SIMDThreeSameVectorFP<0,0,0b11011,"fmulx", int_arm64_neon_fmulx>;
-defm FMUL     : SIMDThreeSameVectorFP<1,0,0b11011,"fmul", fmul>;
-defm FRECPS   : SIMDThreeSameVectorFP<0,0,0b11111,"frecps", int_arm64_neon_frecps>;
-defm FRSQRTS  : SIMDThreeSameVectorFP<0,1,0b11111,"frsqrts", int_arm64_neon_frsqrts>;
-defm FSUB     : SIMDThreeSameVectorFP<0,1,0b11010,"fsub", fsub>;
-defm MLA      : SIMDThreeSameVectorBHSTied<0, 0b10010, "mla",
-                      TriOpFrag<(add node:$LHS, (mul node:$MHS, node:$RHS))> >;
-defm MLS      : SIMDThreeSameVectorBHSTied<1, 0b10010, "mls",
-                      TriOpFrag<(sub node:$LHS, (mul node:$MHS, node:$RHS))> >;
-defm MUL      : SIMDThreeSameVectorBHS<0, 0b10011, "mul", mul>;
-defm PMUL     : SIMDThreeSameVectorB<1, 0b10011, "pmul", int_arm64_neon_pmul>;
-defm SABA     : SIMDThreeSameVectorBHSTied<0, 0b01111, "saba",
-      TriOpFrag<(add node:$LHS, (int_arm64_neon_sabd node:$MHS, node:$RHS))> >;
-defm SABD     : SIMDThreeSameVectorBHS<0,0b01110,"sabd", int_arm64_neon_sabd>;
-defm SHADD    : SIMDThreeSameVectorBHS<0,0b00000,"shadd", int_arm64_neon_shadd>;
-defm SHSUB    : SIMDThreeSameVectorBHS<0,0b00100,"shsub", int_arm64_neon_shsub>;
-defm SMAXP    : SIMDThreeSameVectorBHS<0,0b10100,"smaxp", int_arm64_neon_smaxp>;
-defm SMAX     : SIMDThreeSameVectorBHS<0,0b01100,"smax", int_arm64_neon_smax>;
-defm SMINP    : SIMDThreeSameVectorBHS<0,0b10101,"sminp", int_arm64_neon_sminp>;
-defm SMIN     : SIMDThreeSameVectorBHS<0,0b01101,"smin", int_arm64_neon_smin>;
-defm SQADD    : SIMDThreeSameVector<0,0b00001,"sqadd", int_arm64_neon_sqadd>;
-defm SQDMULH  : SIMDThreeSameVectorHS<0,0b10110,"sqdmulh",int_arm64_neon_sqdmulh>;
-defm SQRDMULH : SIMDThreeSameVectorHS<1,0b10110,"sqrdmulh",int_arm64_neon_sqrdmulh>;
-defm SQRSHL   : SIMDThreeSameVector<0,0b01011,"sqrshl", int_arm64_neon_sqrshl>;
-defm SQSHL    : SIMDThreeSameVector<0,0b01001,"sqshl", int_arm64_neon_sqshl>;
-defm SQSUB    : SIMDThreeSameVector<0,0b00101,"sqsub", int_arm64_neon_sqsub>;
-defm SRHADD   : SIMDThreeSameVectorBHS<0,0b00010,"srhadd",int_arm64_neon_srhadd>;
-defm SRSHL    : SIMDThreeSameVector<0,0b01010,"srshl", int_arm64_neon_srshl>;
-defm SSHL     : SIMDThreeSameVector<0,0b01000,"sshl", int_arm64_neon_sshl>;
-defm SUB      : SIMDThreeSameVector<1,0b10000,"sub", sub>;
-defm UABA     : SIMDThreeSameVectorBHSTied<1, 0b01111, "uaba",
-      TriOpFrag<(add node:$LHS, (int_arm64_neon_uabd node:$MHS, node:$RHS))> >;
-defm UABD     : SIMDThreeSameVectorBHS<1,0b01110,"uabd", int_arm64_neon_uabd>;
-defm UHADD    : SIMDThreeSameVectorBHS<1,0b00000,"uhadd", int_arm64_neon_uhadd>;
-defm UHSUB    : SIMDThreeSameVectorBHS<1,0b00100,"uhsub", int_arm64_neon_uhsub>;
-defm UMAXP    : SIMDThreeSameVectorBHS<1,0b10100,"umaxp", int_arm64_neon_umaxp>;
-defm UMAX     : SIMDThreeSameVectorBHS<1,0b01100,"umax", int_arm64_neon_umax>;
-defm UMINP    : SIMDThreeSameVectorBHS<1,0b10101,"uminp", int_arm64_neon_uminp>;
-defm UMIN     : SIMDThreeSameVectorBHS<1,0b01101,"umin", int_arm64_neon_umin>;
-defm UQADD    : SIMDThreeSameVector<1,0b00001,"uqadd", int_arm64_neon_uqadd>;
-defm UQRSHL   : SIMDThreeSameVector<1,0b01011,"uqrshl", int_arm64_neon_uqrshl>;
-defm UQSHL    : SIMDThreeSameVector<1,0b01001,"uqshl", int_arm64_neon_uqshl>;
-defm UQSUB    : SIMDThreeSameVector<1,0b00101,"uqsub", int_arm64_neon_uqsub>;
-defm URHADD   : SIMDThreeSameVectorBHS<1,0b00010,"urhadd", int_arm64_neon_urhadd>;
-defm URSHL    : SIMDThreeSameVector<1,0b01010,"urshl", int_arm64_neon_urshl>;
-defm USHL     : SIMDThreeSameVector<1,0b01000,"ushl", int_arm64_neon_ushl>;
-
-defm AND : SIMDLogicalThreeVector<0, 0b00, "and", and>;
-defm BIC : SIMDLogicalThreeVector<0, 0b01, "bic",
-                                  BinOpFrag<(and node:$LHS, (vnot node:$RHS))> >;
-defm BIF : SIMDLogicalThreeVector<1, 0b11, "bif">;
-defm BIT : SIMDLogicalThreeVectorTied<1, 0b10, "bit", ARM64bit>;
-defm BSL : SIMDLogicalThreeVectorTied<1, 0b01, "bsl",
-    TriOpFrag<(or (and node:$LHS, node:$MHS), (and (vnot node:$LHS), node:$RHS))>>;
-defm EOR : SIMDLogicalThreeVector<1, 0b00, "eor", xor>;
-defm ORN : SIMDLogicalThreeVector<0, 0b11, "orn",
-                                  BinOpFrag<(or node:$LHS, (vnot node:$RHS))> >;
-defm ORR : SIMDLogicalThreeVector<0, 0b10, "orr", or>;
-
-// FIXME: the .16b and .8b variantes should be emitted by the
-// AsmWriter. TableGen's AsmWriter-generator doesn't deal with variant syntaxes
-// in aliases yet though.
-def : InstAlias<"mov{\t$dst.16b, $src.16b|.16b\t$dst, $src}",
-                (ORRv16i8 V128:$dst, V128:$src, V128:$src), 0>;
-def : InstAlias<"{mov\t$dst.8h, $src.8h|mov.8h\t$dst, $src}",
-                (ORRv16i8 V128:$dst, V128:$src, V128:$src), 0>;
-def : InstAlias<"{mov\t$dst.4s, $src.4s|mov.4s\t$dst, $src}",
-                (ORRv16i8 V128:$dst, V128:$src, V128:$src), 0>;
-def : InstAlias<"{mov\t$dst.2d, $src.2d|mov.2d\t$dst, $src}",
-                (ORRv16i8 V128:$dst, V128:$src, V128:$src), 0>;
-
-def : InstAlias<"{mov\t$dst.8b, $src.8b|mov.8b\t$dst, $src}",
-                (ORRv8i8 V64:$dst, V64:$src, V64:$src), 0>;
-def : InstAlias<"{mov\t$dst.4h, $src.4h|mov.4h\t$dst, $src}",
-                (ORRv8i8 V64:$dst, V64:$src, V64:$src), 0>;
-def : InstAlias<"{mov\t$dst.2s, $src.2s|mov.2s\t$dst, $src}",
-                (ORRv8i8 V64:$dst, V64:$src, V64:$src), 0>;
-def : InstAlias<"{mov\t$dst.1d, $src.1d|mov.1d\t$dst, $src}",
-                (ORRv8i8 V64:$dst, V64:$src, V64:$src), 0>;
-
-def : InstAlias<"{cmls\t$dst.8b, $src1.8b, $src2.8b" #
-                "|cmls.8b\t$dst, $src1, $src2}",
-                (CMHSv8i8 V64:$dst, V64:$src2, V64:$src1), 0>;
-def : InstAlias<"{cmls\t$dst.16b, $src1.16b, $src2.16b" #
-                "|cmls.16b\t$dst, $src1, $src2}",
-                (CMHSv16i8 V128:$dst, V128:$src2, V128:$src1), 0>;
-def : InstAlias<"{cmls\t$dst.4h, $src1.4h, $src2.4h" #
-                "|cmls.4h\t$dst, $src1, $src2}",
-                (CMHSv4i16 V64:$dst, V64:$src2, V64:$src1), 0>;
-def : InstAlias<"{cmls\t$dst.8h, $src1.8h, $src2.8h" #
-                "|cmls.8h\t$dst, $src1, $src2}",
-                (CMHSv8i16 V128:$dst, V128:$src2, V128:$src1), 0>;
-def : InstAlias<"{cmls\t$dst.2s, $src1.2s, $src2.2s" #
-                "|cmls.2s\t$dst, $src1, $src2}",
-                (CMHSv2i32 V64:$dst, V64:$src2, V64:$src1), 0>;
-def : InstAlias<"{cmls\t$dst.4s, $src1.4s, $src2.4s" #
-                "|cmls.4s\t$dst, $src1, $src2}",
-                (CMHSv4i32 V128:$dst, V128:$src2, V128:$src1), 0>;
-def : InstAlias<"{cmls\t$dst.2d, $src1.2d, $src2.2d" #
-                "|cmls.2d\t$dst, $src1, $src2}",
-                (CMHSv2i64 V128:$dst, V128:$src2, V128:$src1), 0>;
-
-def : InstAlias<"{cmlo\t$dst.8b, $src1.8b, $src2.8b" #
-                "|cmlo.8b\t$dst, $src1, $src2}",
-                (CMHIv8i8 V64:$dst, V64:$src2, V64:$src1), 0>;
-def : InstAlias<"{cmlo\t$dst.16b, $src1.16b, $src2.16b" #
-                "|cmlo.16b\t$dst, $src1, $src2}",
-                (CMHIv16i8 V128:$dst, V128:$src2, V128:$src1), 0>;
-def : InstAlias<"{cmlo\t$dst.4h, $src1.4h, $src2.4h" #
-                "|cmlo.4h\t$dst, $src1, $src2}",
-                (CMHIv4i16 V64:$dst, V64:$src2, V64:$src1), 0>;
-def : InstAlias<"{cmlo\t$dst.8h, $src1.8h, $src2.8h" #
-                "|cmlo.8h\t$dst, $src1, $src2}",
-                (CMHIv8i16 V128:$dst, V128:$src2, V128:$src1), 0>;
-def : InstAlias<"{cmlo\t$dst.2s, $src1.2s, $src2.2s" #
-                "|cmlo.2s\t$dst, $src1, $src2}",
-                (CMHIv2i32 V64:$dst, V64:$src2, V64:$src1), 0>;
-def : InstAlias<"{cmlo\t$dst.4s, $src1.4s, $src2.4s" #
-                "|cmlo.4s\t$dst, $src1, $src2}",
-                (CMHIv4i32 V128:$dst, V128:$src2, V128:$src1), 0>;
-def : InstAlias<"{cmlo\t$dst.2d, $src1.2d, $src2.2d" #
-                "|cmlo.2d\t$dst, $src1, $src2}",
-                (CMHIv2i64 V128:$dst, V128:$src2, V128:$src1), 0>;
-
-def : InstAlias<"{cmle\t$dst.8b, $src1.8b, $src2.8b" #
-                "|cmle.8b\t$dst, $src1, $src2}",
-                (CMGEv8i8 V64:$dst, V64:$src2, V64:$src1), 0>;
-def : InstAlias<"{cmle\t$dst.16b, $src1.16b, $src2.16b" #
-                "|cmle.16b\t$dst, $src1, $src2}",
-                (CMGEv16i8 V128:$dst, V128:$src2, V128:$src1), 0>;
-def : InstAlias<"{cmle\t$dst.4h, $src1.4h, $src2.4h" #
-                "|cmle.4h\t$dst, $src1, $src2}",
-                (CMGEv4i16 V64:$dst, V64:$src2, V64:$src1), 0>;
-def : InstAlias<"{cmle\t$dst.8h, $src1.8h, $src2.8h" #
-                "|cmle.8h\t$dst, $src1, $src2}",
-                (CMGEv8i16 V128:$dst, V128:$src2, V128:$src1), 0>;
-def : InstAlias<"{cmle\t$dst.2s, $src1.2s, $src2.2s" #
-                "|cmle.2s\t$dst, $src1, $src2}",
-                (CMGEv2i32 V64:$dst, V64:$src2, V64:$src1), 0>;
-def : InstAlias<"{cmle\t$dst.4s, $src1.4s, $src2.4s" #
-                "|cmle.4s\t$dst, $src1, $src2}",
-                (CMGEv4i32 V128:$dst, V128:$src2, V128:$src1), 0>;
-def : InstAlias<"{cmle\t$dst.2d, $src1.2d, $src2.2d" #
-                "|cmle.2d\t$dst, $src1, $src2}",
-                (CMGEv2i64 V128:$dst, V128:$src2, V128:$src1), 0>;
-
-def : InstAlias<"{cmlt\t$dst.8b, $src1.8b, $src2.8b" #
-                "|cmlt.8b\t$dst, $src1, $src2}",
-                (CMGTv8i8 V64:$dst, V64:$src2, V64:$src1), 0>;
-def : InstAlias<"{cmlt\t$dst.16b, $src1.16b, $src2.16b" #
-                "|cmlt.16b\t$dst, $src1, $src2}",
-                (CMGTv16i8 V128:$dst, V128:$src2, V128:$src1), 0>;
-def : InstAlias<"{cmlt\t$dst.4h, $src1.4h, $src2.4h" #
-                "|cmlt.4h\t$dst, $src1, $src2}",
-                (CMGTv4i16 V64:$dst, V64:$src2, V64:$src1), 0>;
-def : InstAlias<"{cmlt\t$dst.8h, $src1.8h, $src2.8h" #
-                "|cmlt.8h\t$dst, $src1, $src2}",
-                (CMGTv8i16 V128:$dst, V128:$src2, V128:$src1), 0>;
-def : InstAlias<"{cmlt\t$dst.2s, $src1.2s, $src2.2s" #
-                "|cmlt.2s\t$dst, $src1, $src2}",
-                (CMGTv2i32 V64:$dst, V64:$src2, V64:$src1), 0>;
-def : InstAlias<"{cmlt\t$dst.4s, $src1.4s, $src2.4s" #
-                "|cmlt.4s\t$dst, $src1, $src2}",
-                (CMGTv4i32 V128:$dst, V128:$src2, V128:$src1), 0>;
-def : InstAlias<"{cmlt\t$dst.2d, $src1.2d, $src2.2d" #
-                "|cmlt.2d\t$dst, $src1, $src2}",
-                (CMGTv2i64 V128:$dst, V128:$src2, V128:$src1), 0>;
-
-def : InstAlias<"{fcmle\t$dst.2s, $src1.2s, $src2.2s" #
-                "|fcmle.2s\t$dst, $src1, $src2}",
-                (FCMGEv2f32 V64:$dst, V64:$src2, V64:$src1), 0>;
-def : InstAlias<"{fcmle\t$dst.4s, $src1.4s, $src2.4s" #
-                "|fcmle.4s\t$dst, $src1, $src2}",
-                (FCMGEv4f32 V128:$dst, V128:$src2, V128:$src1), 0>;
-def : InstAlias<"{fcmle\t$dst.2d, $src1.2d, $src2.2d" #
-                "|fcmle.2d\t$dst, $src1, $src2}",
-                (FCMGEv2f64 V128:$dst, V128:$src2, V128:$src1), 0>;
-
-def : InstAlias<"{fcmlt\t$dst.2s, $src1.2s, $src2.2s" #
-                "|fcmlt.2s\t$dst, $src1, $src2}",
-                (FCMGTv2f32 V64:$dst, V64:$src2, V64:$src1), 0>;
-def : InstAlias<"{fcmlt\t$dst.4s, $src1.4s, $src2.4s" #
-                "|fcmlt.4s\t$dst, $src1, $src2}",
-                (FCMGTv4f32 V128:$dst, V128:$src2, V128:$src1), 0>;
-def : InstAlias<"{fcmlt\t$dst.2d, $src1.2d, $src2.2d" #
-                "|fcmlt.2d\t$dst, $src1, $src2}",
-                (FCMGTv2f64 V128:$dst, V128:$src2, V128:$src1), 0>;
-
-def : InstAlias<"{facle\t$dst.2s, $src1.2s, $src2.2s" #
-                "|facle.2s\t$dst, $src1, $src2}",
-                (FACGEv2f32 V64:$dst, V64:$src2, V64:$src1), 0>;
-def : InstAlias<"{facle\t$dst.4s, $src1.4s, $src2.4s" #
-                "|facle.4s\t$dst, $src1, $src2}",
-                (FACGEv4f32 V128:$dst, V128:$src2, V128:$src1), 0>;
-def : InstAlias<"{facle\t$dst.2d, $src1.2d, $src2.2d" #
-                "|facle.2d\t$dst, $src1, $src2}",
-                (FACGEv2f64 V128:$dst, V128:$src2, V128:$src1), 0>;
-
-def : InstAlias<"{faclt\t$dst.2s, $src1.2s, $src2.2s" #
-                "|faclt.2s\t$dst, $src1, $src2}",
-                (FACGTv2f32 V64:$dst, V64:$src2, V64:$src1), 0>;
-def : InstAlias<"{faclt\t$dst.4s, $src1.4s, $src2.4s" #
-                "|faclt.4s\t$dst, $src1, $src2}",
-                (FACGTv4f32 V128:$dst, V128:$src2, V128:$src1), 0>;
-def : InstAlias<"{faclt\t$dst.2d, $src1.2d, $src2.2d" #
-                "|faclt.2d\t$dst, $src1, $src2}",
-                (FACGTv2f64 V128:$dst, V128:$src2, V128:$src1), 0>;
-
-//===----------------------------------------------------------------------===//
-// Advanced SIMD three scalar instructions.
-//===----------------------------------------------------------------------===//
-
-defm ADD      : SIMDThreeScalarD<0, 0b10000, "add", add>;
-defm CMEQ     : SIMDThreeScalarD<1, 0b10001, "cmeq", ARM64cmeq>;
-defm CMGE     : SIMDThreeScalarD<0, 0b00111, "cmge", ARM64cmge>;
-defm CMGT     : SIMDThreeScalarD<0, 0b00110, "cmgt", ARM64cmgt>;
-defm CMHI     : SIMDThreeScalarD<1, 0b00110, "cmhi", ARM64cmhi>;
-defm CMHS     : SIMDThreeScalarD<1, 0b00111, "cmhs", ARM64cmhs>;
-defm CMTST    : SIMDThreeScalarD<0, 0b10001, "cmtst", ARM64cmtst>;
-defm FABD     : SIMDThreeScalarSD<1, 1, 0b11010, "fabd", int_arm64_sisd_fabd>;
-def : Pat<(v1f64 (int_arm64_neon_fabd (v1f64 FPR64:$Rn), (v1f64 FPR64:$Rm))),
-          (FABD64 FPR64:$Rn, FPR64:$Rm)>;
-defm FACGE    : SIMDThreeScalarFPCmp<1, 0, 0b11101, "facge",
-                                     int_arm64_neon_facge>;
-defm FACGT    : SIMDThreeScalarFPCmp<1, 1, 0b11101, "facgt",
-                                     int_arm64_neon_facgt>;
-defm FCMEQ    : SIMDThreeScalarFPCmp<0, 0, 0b11100, "fcmeq", ARM64fcmeq>;
-defm FCMGE    : SIMDThreeScalarFPCmp<1, 0, 0b11100, "fcmge", ARM64fcmge>;
-defm FCMGT    : SIMDThreeScalarFPCmp<1, 1, 0b11100, "fcmgt", ARM64fcmgt>;
-defm FMULX    : SIMDThreeScalarSD<0, 0, 0b11011, "fmulx", int_arm64_neon_fmulx>;
-defm FRECPS   : SIMDThreeScalarSD<0, 0, 0b11111, "frecps", int_arm64_neon_frecps>;
-defm FRSQRTS  : SIMDThreeScalarSD<0, 1, 0b11111, "frsqrts", int_arm64_neon_frsqrts>;
-defm SQADD    : SIMDThreeScalarBHSD<0, 0b00001, "sqadd", int_arm64_neon_sqadd>;
-defm SQDMULH  : SIMDThreeScalarHS<  0, 0b10110, "sqdmulh", int_arm64_neon_sqdmulh>;
-defm SQRDMULH : SIMDThreeScalarHS<  1, 0b10110, "sqrdmulh", int_arm64_neon_sqrdmulh>;
-defm SQRSHL   : SIMDThreeScalarBHSD<0, 0b01011, "sqrshl",int_arm64_neon_sqrshl>;
-defm SQSHL    : SIMDThreeScalarBHSD<0, 0b01001, "sqshl", int_arm64_neon_sqshl>;
-defm SQSUB    : SIMDThreeScalarBHSD<0, 0b00101, "sqsub", int_arm64_neon_sqsub>;
-defm SRSHL    : SIMDThreeScalarD<   0, 0b01010, "srshl", int_arm64_neon_srshl>;
-defm SSHL     : SIMDThreeScalarD<   0, 0b01000, "sshl", int_arm64_neon_sshl>;
-defm SUB      : SIMDThreeScalarD<   1, 0b10000, "sub", sub>;
-defm UQADD    : SIMDThreeScalarBHSD<1, 0b00001, "uqadd", int_arm64_neon_uqadd>;
-defm UQRSHL   : SIMDThreeScalarBHSD<1, 0b01011, "uqrshl",int_arm64_neon_uqrshl>;
-defm UQSHL    : SIMDThreeScalarBHSD<1, 0b01001, "uqshl", int_arm64_neon_uqshl>;
-defm UQSUB    : SIMDThreeScalarBHSD<1, 0b00101, "uqsub", int_arm64_neon_uqsub>;
-defm URSHL    : SIMDThreeScalarD<   1, 0b01010, "urshl", int_arm64_neon_urshl>;
-defm USHL     : SIMDThreeScalarD<   1, 0b01000, "ushl", int_arm64_neon_ushl>;
-
-def : InstAlias<"cmls $dst, $src1, $src2",
-                (CMHSv1i64 FPR64:$dst, FPR64:$src2, FPR64:$src1)>;
-def : InstAlias<"cmle $dst, $src1, $src2",
-                (CMGEv1i64 FPR64:$dst, FPR64:$src2, FPR64:$src1)>;
-def : InstAlias<"cmlo $dst, $src1, $src2",
-                (CMHIv1i64 FPR64:$dst, FPR64:$src2, FPR64:$src1)>;
-def : InstAlias<"cmlt $dst, $src1, $src2",
-                (CMGTv1i64 FPR64:$dst, FPR64:$src2, FPR64:$src1)>;
-def : InstAlias<"fcmle $dst, $src1, $src2",
-                (FCMGE32 FPR32:$dst, FPR32:$src2, FPR32:$src1)>;
-def : InstAlias<"fcmle $dst, $src1, $src2",
-                (FCMGE64 FPR64:$dst, FPR64:$src2, FPR64:$src1)>;
-def : InstAlias<"fcmlt $dst, $src1, $src2",
-                (FCMGT32 FPR32:$dst, FPR32:$src2, FPR32:$src1)>;
-def : InstAlias<"fcmlt $dst, $src1, $src2",
-                (FCMGT64 FPR64:$dst, FPR64:$src2, FPR64:$src1)>;
-def : InstAlias<"facle $dst, $src1, $src2",
-                (FACGE32 FPR32:$dst, FPR32:$src2, FPR32:$src1)>;
-def : InstAlias<"facle $dst, $src1, $src2",
-                (FACGE64 FPR64:$dst, FPR64:$src2, FPR64:$src1)>;
-def : InstAlias<"faclt $dst, $src1, $src2",
-                (FACGT32 FPR32:$dst, FPR32:$src2, FPR32:$src1)>;
-def : InstAlias<"faclt $dst, $src1, $src2",
-                (FACGT64 FPR64:$dst, FPR64:$src2, FPR64:$src1)>;
-
-//===----------------------------------------------------------------------===//
-// Advanced SIMD three scalar instructions (mixed operands).
-//===----------------------------------------------------------------------===//
-defm SQDMULL  : SIMDThreeScalarMixedHS<0, 0b11010, "sqdmull",
-                                       int_arm64_neon_sqdmulls_scalar>;
-defm SQDMLAL  : SIMDThreeScalarMixedTiedHS<0, 0b10010, "sqdmlal">;
-defm SQDMLSL  : SIMDThreeScalarMixedTiedHS<0, 0b10110, "sqdmlsl">;
-
-def : Pat<(i64 (int_arm64_neon_sqadd (i64 FPR64:$Rd),
-                   (i64 (int_arm64_neon_sqdmulls_scalar (i32 FPR32:$Rn),
-                                                        (i32 FPR32:$Rm))))),
-          (SQDMLALi32 FPR64:$Rd, FPR32:$Rn, FPR32:$Rm)>;
-def : Pat<(i64 (int_arm64_neon_sqsub (i64 FPR64:$Rd),
-                   (i64 (int_arm64_neon_sqdmulls_scalar (i32 FPR32:$Rn),
-                                                        (i32 FPR32:$Rm))))),
-          (SQDMLSLi32 FPR64:$Rd, FPR32:$Rn, FPR32:$Rm)>;
-
-//===----------------------------------------------------------------------===//
-// Advanced SIMD two scalar instructions.
-//===----------------------------------------------------------------------===//
-
-defm ABS    : SIMDTwoScalarD<    0, 0b01011, "abs", int_arm64_neon_abs>;
-defm CMEQ   : SIMDCmpTwoScalarD< 0, 0b01001, "cmeq", ARM64cmeqz>;
-defm CMGE   : SIMDCmpTwoScalarD< 1, 0b01000, "cmge", ARM64cmgez>;
-defm CMGT   : SIMDCmpTwoScalarD< 0, 0b01000, "cmgt", ARM64cmgtz>;
-defm CMLE   : SIMDCmpTwoScalarD< 1, 0b01001, "cmle", ARM64cmlez>;
-defm CMLT   : SIMDCmpTwoScalarD< 0, 0b01010, "cmlt", ARM64cmltz>;
-defm FCMEQ  : SIMDCmpTwoScalarSD<0, 1, 0b01101, "fcmeq", ARM64fcmeqz>;
-defm FCMGE  : SIMDCmpTwoScalarSD<1, 1, 0b01100, "fcmge", ARM64fcmgez>;
-defm FCMGT  : SIMDCmpTwoScalarSD<0, 1, 0b01100, "fcmgt", ARM64fcmgtz>;
-defm FCMLE  : SIMDCmpTwoScalarSD<1, 1, 0b01101, "fcmle", ARM64fcmlez>;
-defm FCMLT  : SIMDCmpTwoScalarSD<0, 1, 0b01110, "fcmlt", ARM64fcmltz>;
-defm FCVTAS : SIMDTwoScalarSD<   0, 0, 0b11100, "fcvtas">;
-defm FCVTAU : SIMDTwoScalarSD<   1, 0, 0b11100, "fcvtau">;
-defm FCVTMS : SIMDTwoScalarSD<   0, 0, 0b11011, "fcvtms">;
-defm FCVTMU : SIMDTwoScalarSD<   1, 0, 0b11011, "fcvtmu">;
-defm FCVTNS : SIMDTwoScalarSD<   0, 0, 0b11010, "fcvtns">;
-defm FCVTNU : SIMDTwoScalarSD<   1, 0, 0b11010, "fcvtnu">;
-defm FCVTPS : SIMDTwoScalarSD<   0, 1, 0b11010, "fcvtps">;
-defm FCVTPU : SIMDTwoScalarSD<   1, 1, 0b11010, "fcvtpu">;
-def  FCVTXNv1i64 : SIMDInexactCvtTwoScalar<0b10110, "fcvtxn">;
-defm FCVTZS : SIMDTwoScalarSD<   0, 1, 0b11011, "fcvtzs">;
-defm FCVTZU : SIMDTwoScalarSD<   1, 1, 0b11011, "fcvtzu">;
-defm FRECPE : SIMDTwoScalarSD<   0, 1, 0b11101, "frecpe">;
-defm FRECPX : SIMDTwoScalarSD<   0, 1, 0b11111, "frecpx">;
-defm FRSQRTE : SIMDTwoScalarSD<  1, 1, 0b11101, "frsqrte">;
-defm NEG    : SIMDTwoScalarD<    1, 0b01011, "neg",
-                                 UnOpFrag<(sub immAllZerosV, node:$LHS)> >;
-defm SCVTF  : SIMDTwoScalarCVTSD<   0, 0, 0b11101, "scvtf", ARM64sitof>;
-defm SQABS  : SIMDTwoScalarBHSD< 0, 0b00111, "sqabs", int_arm64_neon_sqabs>;
-defm SQNEG  : SIMDTwoScalarBHSD< 1, 0b00111, "sqneg", int_arm64_neon_sqneg>;
-defm SQXTN  : SIMDTwoScalarMixedBHS< 0, 0b10100, "sqxtn", int_arm64_neon_scalar_sqxtn>;
-defm SQXTUN : SIMDTwoScalarMixedBHS< 1, 0b10010, "sqxtun", int_arm64_neon_scalar_sqxtun>;
-defm SUQADD : SIMDTwoScalarBHSDTied< 0, 0b00011, "suqadd",
-                                     int_arm64_neon_suqadd>;
-defm UCVTF  : SIMDTwoScalarCVTSD<   1, 0, 0b11101, "ucvtf", ARM64uitof>;
-defm UQXTN  : SIMDTwoScalarMixedBHS<1, 0b10100, "uqxtn", int_arm64_neon_scalar_uqxtn>;
-defm USQADD : SIMDTwoScalarBHSDTied< 1, 0b00011, "usqadd",
-                                    int_arm64_neon_usqadd>;
-
-def : Pat<(v1i64 (int_arm64_neon_fcvtas (v1f64 FPR64:$Rn))),
-          (FCVTASv1i64 FPR64:$Rn)>;
-def : Pat<(v1i64 (int_arm64_neon_fcvtau (v1f64 FPR64:$Rn))),
-          (FCVTAUv1i64 FPR64:$Rn)>;
-def : Pat<(v1i64 (int_arm64_neon_fcvtms (v1f64 FPR64:$Rn))),
-          (FCVTMSv1i64 FPR64:$Rn)>;
-def : Pat<(v1i64 (int_arm64_neon_fcvtmu (v1f64 FPR64:$Rn))),
-          (FCVTMUv1i64 FPR64:$Rn)>;
-def : Pat<(v1i64 (int_arm64_neon_fcvtns (v1f64 FPR64:$Rn))),
-          (FCVTNSv1i64 FPR64:$Rn)>;
-def : Pat<(v1i64 (int_arm64_neon_fcvtnu (v1f64 FPR64:$Rn))),
-          (FCVTNUv1i64 FPR64:$Rn)>;
-def : Pat<(v1i64 (int_arm64_neon_fcvtps (v1f64 FPR64:$Rn))),
-          (FCVTPSv1i64 FPR64:$Rn)>;
-def : Pat<(v1i64 (int_arm64_neon_fcvtpu (v1f64 FPR64:$Rn))),
-          (FCVTPUv1i64 FPR64:$Rn)>;
-
-def : Pat<(f32 (int_arm64_neon_frecpe (f32 FPR32:$Rn))),
-          (FRECPEv1i32 FPR32:$Rn)>;
-def : Pat<(f64 (int_arm64_neon_frecpe (f64 FPR64:$Rn))),
-          (FRECPEv1i64 FPR64:$Rn)>;
-def : Pat<(v1f64 (int_arm64_neon_frecpe (v1f64 FPR64:$Rn))),
-          (FRECPEv1i64 FPR64:$Rn)>;
-
-def : Pat<(f32 (int_arm64_neon_frecpx (f32 FPR32:$Rn))),
-          (FRECPXv1i32 FPR32:$Rn)>;
-def : Pat<(f64 (int_arm64_neon_frecpx (f64 FPR64:$Rn))),
-          (FRECPXv1i64 FPR64:$Rn)>;
-
-def : Pat<(f32 (int_arm64_neon_frsqrte (f32 FPR32:$Rn))),
-          (FRSQRTEv1i32 FPR32:$Rn)>;
-def : Pat<(f64 (int_arm64_neon_frsqrte (f64 FPR64:$Rn))),
-          (FRSQRTEv1i64 FPR64:$Rn)>;
-def : Pat<(v1f64 (int_arm64_neon_frsqrte (v1f64 FPR64:$Rn))),
-          (FRSQRTEv1i64 FPR64:$Rn)>;
-
-// If an integer is about to be converted to a floating point value,
-// just load it on the floating point unit.
-// Here are the patterns for 8 and 16-bits to float.
-// 8-bits -> float.
-def : Pat <(f32 (uint_to_fp (i32 (zextloadi8 ro_indexed8:$addr)))),
-           (UCVTFv1i32 (INSERT_SUBREG (f32 (IMPLICIT_DEF)),
-                          (LDRBro ro_indexed8:$addr), bsub))>;
-def : Pat <(f32 (uint_to_fp (i32 (zextloadi8 am_indexed8:$addr)))),
-           (UCVTFv1i32 (INSERT_SUBREG (f32 (IMPLICIT_DEF)),
-                          (LDRBui am_indexed8:$addr), bsub))>;
-def : Pat <(f32 (uint_to_fp (i32 (zextloadi8 am_unscaled8:$addr)))),
-           (UCVTFv1i32 (INSERT_SUBREG (f32 (IMPLICIT_DEF)),
-                          (LDURBi am_unscaled8:$addr), bsub))>;
-// 16-bits -> float.
-def : Pat <(f32 (uint_to_fp (i32 (zextloadi16 ro_indexed16:$addr)))),
-           (UCVTFv1i32 (INSERT_SUBREG (f32 (IMPLICIT_DEF)),
-                          (LDRHro ro_indexed16:$addr), hsub))>;
-def : Pat <(f32 (uint_to_fp (i32 (zextloadi16 am_indexed16:$addr)))),
-           (UCVTFv1i32 (INSERT_SUBREG (f32 (IMPLICIT_DEF)),
-                          (LDRHui am_indexed16:$addr), hsub))>;
-def : Pat <(f32 (uint_to_fp (i32 (zextloadi16 am_unscaled16:$addr)))),
-           (UCVTFv1i32 (INSERT_SUBREG (f32 (IMPLICIT_DEF)),
-                          (LDURHi am_unscaled16:$addr), hsub))>;
-// 32-bits are handled in target specific dag combine:
-// performIntToFpCombine.
-// 64-bits integer to 32-bits floating point, not possible with
-// UCVTF on floating point registers (both source and destination
-// must have the same size).
-
-// Here are the patterns for 8, 16, 32, and 64-bits to double.
-// 8-bits -> double.
-def : Pat <(f64 (uint_to_fp (i32 (zextloadi8 ro_indexed8:$addr)))),
-           (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
-                          (LDRBro ro_indexed8:$addr), bsub))>;
-def : Pat <(f64 (uint_to_fp (i32 (zextloadi8 am_indexed8:$addr)))),
-           (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
-                          (LDRBui am_indexed8:$addr), bsub))>;
-def : Pat <(f64 (uint_to_fp (i32 (zextloadi8 am_unscaled8:$addr)))),
-           (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
-                          (LDURBi am_unscaled8:$addr), bsub))>;
-// 16-bits -> double.
-def : Pat <(f64 (uint_to_fp (i32 (zextloadi16 ro_indexed16:$addr)))),
-           (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
-                          (LDRHro ro_indexed16:$addr), hsub))>;
-def : Pat <(f64 (uint_to_fp (i32 (zextloadi16 am_indexed16:$addr)))),
-           (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
-                          (LDRHui am_indexed16:$addr), hsub))>;
-def : Pat <(f64 (uint_to_fp (i32 (zextloadi16 am_unscaled16:$addr)))),
-           (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
-                          (LDURHi am_unscaled16:$addr), hsub))>;
-// 32-bits -> double.
-def : Pat <(f64 (uint_to_fp (i32 (load ro_indexed32:$addr)))),
-           (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
-                          (LDRSro ro_indexed32:$addr), ssub))>;
-def : Pat <(f64 (uint_to_fp (i32 (load am_indexed32:$addr)))),
-           (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
-                          (LDRSui am_indexed32:$addr), ssub))>;
-def : Pat <(f64 (uint_to_fp (i32 (load am_unscaled32:$addr)))),
-           (UCVTFv1i64 (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
-                          (LDURSi am_unscaled32:$addr), ssub))>;
-// 64-bits -> double are handled in target specific dag combine:
-// performIntToFpCombine.
-
-//===----------------------------------------------------------------------===//
-// Advanced SIMD three different-sized vector instructions.
-//===----------------------------------------------------------------------===//
-
-defm ADDHN  : SIMDNarrowThreeVectorBHS<0,0b0100,"addhn", int_arm64_neon_addhn>;
-defm SUBHN  : SIMDNarrowThreeVectorBHS<0,0b0110,"subhn", int_arm64_neon_subhn>;
-defm RADDHN : SIMDNarrowThreeVectorBHS<1,0b0100,"raddhn",int_arm64_neon_raddhn>;
-defm RSUBHN : SIMDNarrowThreeVectorBHS<1,0b0110,"rsubhn",int_arm64_neon_rsubhn>;
-defm PMULL  : SIMDDifferentThreeVectorBD<0,0b1110,"pmull",int_arm64_neon_pmull>;
-defm SABAL  : SIMDLongThreeVectorTiedBHSabal<0,0b0101,"sabal",
-                                             int_arm64_neon_sabd>;
-defm SABDL   : SIMDLongThreeVectorBHSabdl<0, 0b0111, "sabdl",
-                                          int_arm64_neon_sabd>;
-defm SADDL   : SIMDLongThreeVectorBHS<   0, 0b0000, "saddl",
-            BinOpFrag<(add (sext node:$LHS), (sext node:$RHS))>>;
-defm SADDW   : SIMDWideThreeVectorBHS<   0, 0b0001, "saddw",
-                 BinOpFrag<(add node:$LHS, (sext node:$RHS))>>;
-defm SMLAL   : SIMDLongThreeVectorTiedBHS<0, 0b1000, "smlal",
-    TriOpFrag<(add node:$LHS, (int_arm64_neon_smull node:$MHS, node:$RHS))>>;
-defm SMLSL   : SIMDLongThreeVectorTiedBHS<0, 0b1010, "smlsl",
-    TriOpFrag<(sub node:$LHS, (int_arm64_neon_smull node:$MHS, node:$RHS))>>;
-defm SMULL   : SIMDLongThreeVectorBHS<0, 0b1100, "smull", int_arm64_neon_smull>;
-defm SQDMLAL : SIMDLongThreeVectorSQDMLXTiedHS<0, 0b1001, "sqdmlal",
-                                               int_arm64_neon_sqadd>;
-defm SQDMLSL : SIMDLongThreeVectorSQDMLXTiedHS<0, 0b1011, "sqdmlsl",
-                                               int_arm64_neon_sqsub>;
-defm SQDMULL : SIMDLongThreeVectorHS<0, 0b1101, "sqdmull",
-                                     int_arm64_neon_sqdmull>;
-defm SSUBL   : SIMDLongThreeVectorBHS<0, 0b0010, "ssubl",
-                 BinOpFrag<(sub (sext node:$LHS), (sext node:$RHS))>>;
-defm SSUBW   : SIMDWideThreeVectorBHS<0, 0b0011, "ssubw",
-                 BinOpFrag<(sub node:$LHS, (sext node:$RHS))>>;
-defm UABAL   : SIMDLongThreeVectorTiedBHSabal<1, 0b0101, "uabal",
-                                              int_arm64_neon_uabd>;
-defm UABDL   : SIMDLongThreeVectorBHSabdl<1, 0b0111, "uabdl",
-                                          int_arm64_neon_uabd>;
-defm UADDL   : SIMDLongThreeVectorBHS<1, 0b0000, "uaddl",
-                 BinOpFrag<(add (zext node:$LHS), (zext node:$RHS))>>;
-defm UADDW   : SIMDWideThreeVectorBHS<1, 0b0001, "uaddw",
-                 BinOpFrag<(add node:$LHS, (zext node:$RHS))>>;
-defm UMLAL   : SIMDLongThreeVectorTiedBHS<1, 0b1000, "umlal",
-    TriOpFrag<(add node:$LHS, (int_arm64_neon_umull node:$MHS, node:$RHS))>>;
-defm UMLSL   : SIMDLongThreeVectorTiedBHS<1, 0b1010, "umlsl",
-    TriOpFrag<(sub node:$LHS, (int_arm64_neon_umull node:$MHS, node:$RHS))>>;
-defm UMULL   : SIMDLongThreeVectorBHS<1, 0b1100, "umull", int_arm64_neon_umull>;
-defm USUBL   : SIMDLongThreeVectorBHS<1, 0b0010, "usubl",
-                 BinOpFrag<(sub (zext node:$LHS), (zext node:$RHS))>>;
-defm USUBW   : SIMDWideThreeVectorBHS<   1, 0b0011, "usubw",
-                 BinOpFrag<(sub node:$LHS, (zext node:$RHS))>>;
-
-// Patterns for 64-bit pmull
-def : Pat<(int_arm64_neon_pmull64 V64:$Rn, V64:$Rm),
-          (PMULLv1i64 V64:$Rn, V64:$Rm)>;
-def : Pat<(int_arm64_neon_pmull64 (vector_extract (v2i64 V128:$Rn), (i64 1)),
-                                  (vector_extract (v2i64 V128:$Rm), (i64 1))),
-          (PMULLv2i64 V128:$Rn, V128:$Rm)>;
-
-// CodeGen patterns for addhn and subhn instructions, which can actually be
-// written in LLVM IR without too much difficulty.
-
-// ADDHN
-def : Pat<(v8i8 (trunc (v8i16 (ARM64vlshr (add V128:$Rn, V128:$Rm), (i32 8))))),
-          (ADDHNv8i16_v8i8 V128:$Rn, V128:$Rm)>;
-def : Pat<(v4i16 (trunc (v4i32 (ARM64vlshr (add V128:$Rn, V128:$Rm),
-                                           (i32 16))))),
-          (ADDHNv4i32_v4i16 V128:$Rn, V128:$Rm)>;
-def : Pat<(v2i32 (trunc (v2i64 (ARM64vlshr (add V128:$Rn, V128:$Rm),
-                                           (i32 32))))),
-          (ADDHNv2i64_v2i32 V128:$Rn, V128:$Rm)>;
-def : Pat<(concat_vectors (v8i8 V64:$Rd),
-                          (trunc (v8i16 (ARM64vlshr (add V128:$Rn, V128:$Rm),
-                                                    (i32 8))))),
-          (ADDHNv8i16_v16i8 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub),
-                            V128:$Rn, V128:$Rm)>;
-def : Pat<(concat_vectors (v4i16 V64:$Rd),
-                          (trunc (v4i32 (ARM64vlshr (add V128:$Rn, V128:$Rm),
-                                                    (i32 16))))),
-          (ADDHNv4i32_v8i16 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub),
-                            V128:$Rn, V128:$Rm)>;
-def : Pat<(concat_vectors (v2i32 V64:$Rd),
-                          (trunc (v2i64 (ARM64vlshr (add V128:$Rn, V128:$Rm),
-                                                    (i32 32))))),
-          (ADDHNv2i64_v4i32 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub),
-                            V128:$Rn, V128:$Rm)>;
-
-// SUBHN
-def : Pat<(v8i8 (trunc (v8i16 (ARM64vlshr (sub V128:$Rn, V128:$Rm), (i32 8))))),
-          (SUBHNv8i16_v8i8 V128:$Rn, V128:$Rm)>;
-def : Pat<(v4i16 (trunc (v4i32 (ARM64vlshr (sub V128:$Rn, V128:$Rm),
-                                           (i32 16))))),
-          (SUBHNv4i32_v4i16 V128:$Rn, V128:$Rm)>;
-def : Pat<(v2i32 (trunc (v2i64 (ARM64vlshr (sub V128:$Rn, V128:$Rm),
-                                           (i32 32))))),
-          (SUBHNv2i64_v2i32 V128:$Rn, V128:$Rm)>;
-def : Pat<(concat_vectors (v8i8 V64:$Rd),
-                          (trunc (v8i16 (ARM64vlshr (sub V128:$Rn, V128:$Rm),
-                                                    (i32 8))))),
-          (SUBHNv8i16_v16i8 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub),
-                            V128:$Rn, V128:$Rm)>;
-def : Pat<(concat_vectors (v4i16 V64:$Rd),
-                          (trunc (v4i32 (ARM64vlshr (sub V128:$Rn, V128:$Rm),
-                                                    (i32 16))))),
-          (SUBHNv4i32_v8i16 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub),
-                            V128:$Rn, V128:$Rm)>;
-def : Pat<(concat_vectors (v2i32 V64:$Rd),
-                          (trunc (v2i64 (ARM64vlshr (sub V128:$Rn, V128:$Rm),
-                                                    (i32 32))))),
-          (SUBHNv2i64_v4i32 (SUBREG_TO_REG (i32 0), V64:$Rd, dsub),
-                            V128:$Rn, V128:$Rm)>;
-
-//----------------------------------------------------------------------------
-// AdvSIMD bitwise extract from vector instruction.
-//----------------------------------------------------------------------------
-
-defm EXT : SIMDBitwiseExtract<"ext">;
-
-def : Pat<(v4i16 (ARM64ext V64:$Rn, V64:$Rm, (i32 imm:$imm))),
-          (EXTv8i8 V64:$Rn, V64:$Rm, imm:$imm)>;
-def : Pat<(v8i16 (ARM64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))),
-          (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>;
-def : Pat<(v2i32 (ARM64ext V64:$Rn, V64:$Rm, (i32 imm:$imm))),
-          (EXTv8i8 V64:$Rn, V64:$Rm, imm:$imm)>;
-def : Pat<(v2f32 (ARM64ext V64:$Rn, V64:$Rm, (i32 imm:$imm))),
-          (EXTv8i8 V64:$Rn, V64:$Rm, imm:$imm)>;
-def : Pat<(v4i32 (ARM64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))),
-          (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>;
-def : Pat<(v4f32 (ARM64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))),
-          (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>;
-def : Pat<(v2i64 (ARM64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))),
-          (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>;
-def : Pat<(v2f64 (ARM64ext V128:$Rn, V128:$Rm, (i32 imm:$imm))),
-          (EXTv16i8 V128:$Rn, V128:$Rm, imm:$imm)>;
-
-// We use EXT to handle extract_subvector to copy the upper 64-bits of a
-// 128-bit vector.
-def : Pat<(v8i8  (extract_subvector V128:$Rn, (i64 8))),
-          (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>;
-def : Pat<(v4i16 (extract_subvector V128:$Rn, (i64 4))),
-          (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>;
-def : Pat<(v2i32 (extract_subvector V128:$Rn, (i64 2))),
-          (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>;
-def : Pat<(v1i64 (extract_subvector V128:$Rn, (i64 1))),
-          (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>;
-def : Pat<(v2f32 (extract_subvector V128:$Rn, (i64 2))),
-          (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>;
-def : Pat<(v1f64 (extract_subvector V128:$Rn, (i64 1))),
-          (EXTRACT_SUBREG (EXTv16i8 V128:$Rn, V128:$Rn, 8), dsub)>;
-
-
-//----------------------------------------------------------------------------
-// AdvSIMD zip vector
-//----------------------------------------------------------------------------
-
-defm TRN1 : SIMDZipVector<0b010, "trn1", ARM64trn1>;
-defm TRN2 : SIMDZipVector<0b110, "trn2", ARM64trn2>;
-defm UZP1 : SIMDZipVector<0b001, "uzp1", ARM64uzp1>;
-defm UZP2 : SIMDZipVector<0b101, "uzp2", ARM64uzp2>;
-defm ZIP1 : SIMDZipVector<0b011, "zip1", ARM64zip1>;
-defm ZIP2 : SIMDZipVector<0b111, "zip2", ARM64zip2>;
-
-//----------------------------------------------------------------------------
-// AdvSIMD TBL/TBX instructions
-//----------------------------------------------------------------------------
-
-defm TBL : SIMDTableLookup<    0, "tbl">;
-defm TBX : SIMDTableLookupTied<1, "tbx">;
-
-def : Pat<(v8i8 (int_arm64_neon_tbl1 (v16i8 VecListOne128:$Rn), (v8i8 V64:$Ri))),
-          (TBLv8i8One VecListOne128:$Rn, V64:$Ri)>;
-def : Pat<(v16i8 (int_arm64_neon_tbl1 (v16i8 V128:$Ri), (v16i8 V128:$Rn))),
-          (TBLv16i8One V128:$Ri, V128:$Rn)>;
-
-def : Pat<(v8i8 (int_arm64_neon_tbx1 (v8i8 V64:$Rd),
-                  (v16i8 VecListOne128:$Rn), (v8i8 V64:$Ri))),
-          (TBXv8i8One V64:$Rd, VecListOne128:$Rn, V64:$Ri)>;
-def : Pat<(v16i8 (int_arm64_neon_tbx1 (v16i8 V128:$Rd),
-                   (v16i8 V128:$Ri), (v16i8 V128:$Rn))),
-          (TBXv16i8One V128:$Rd, V128:$Ri, V128:$Rn)>;
-
-
-//----------------------------------------------------------------------------
-// AdvSIMD scalar CPY instruction
-//----------------------------------------------------------------------------
-
-defm CPY : SIMDScalarCPY<"cpy">;
-
-//----------------------------------------------------------------------------
-// AdvSIMD scalar pairwise instructions
-//----------------------------------------------------------------------------
-
-defm ADDP    : SIMDPairwiseScalarD<0, 0b11011, "addp">;
-defm FADDP   : SIMDPairwiseScalarSD<1, 0, 0b01101, "faddp">;
-defm FMAXNMP : SIMDPairwiseScalarSD<1, 0, 0b01100, "fmaxnmp">;
-defm FMAXP   : SIMDPairwiseScalarSD<1, 0, 0b01111, "fmaxp">;
-defm FMINNMP : SIMDPairwiseScalarSD<1, 1, 0b01100, "fminnmp">;
-defm FMINP   : SIMDPairwiseScalarSD<1, 1, 0b01111, "fminp">;
-def : Pat<(i64 (int_arm64_neon_saddv (v2i64 V128:$Rn))),
-          (ADDPv2i64p V128:$Rn)>;
-def : Pat<(i64 (int_arm64_neon_uaddv (v2i64 V128:$Rn))),
-          (ADDPv2i64p V128:$Rn)>;
-def : Pat<(f32 (int_arm64_neon_faddv (v2f32 V64:$Rn))),
-          (FADDPv2i32p V64:$Rn)>;
-def : Pat<(f32 (int_arm64_neon_faddv (v4f32 V128:$Rn))),
-          (FADDPv2i32p (EXTRACT_SUBREG (FADDPv4f32 V128:$Rn, V128:$Rn), dsub))>;
-def : Pat<(f64 (int_arm64_neon_faddv (v2f64 V128:$Rn))),
-          (FADDPv2i64p V128:$Rn)>;
-def : Pat<(f32 (int_arm64_neon_fmaxnmv (v2f32 V64:$Rn))),
-          (FMAXNMPv2i32p V64:$Rn)>;
-def : Pat<(f64 (int_arm64_neon_fmaxnmv (v2f64 V128:$Rn))),
-          (FMAXNMPv2i64p V128:$Rn)>;
-def : Pat<(f32 (int_arm64_neon_fmaxv (v2f32 V64:$Rn))),
-          (FMAXPv2i32p V64:$Rn)>;
-def : Pat<(f64 (int_arm64_neon_fmaxv (v2f64 V128:$Rn))),
-          (FMAXPv2i64p V128:$Rn)>;
-def : Pat<(f32 (int_arm64_neon_fminnmv (v2f32 V64:$Rn))),
-          (FMINNMPv2i32p V64:$Rn)>;
-def : Pat<(f64 (int_arm64_neon_fminnmv (v2f64 V128:$Rn))),
-          (FMINNMPv2i64p V128:$Rn)>;
-def : Pat<(f32 (int_arm64_neon_fminv (v2f32 V64:$Rn))),
-          (FMINPv2i32p V64:$Rn)>;
-def : Pat<(f64 (int_arm64_neon_fminv (v2f64 V128:$Rn))),
-          (FMINPv2i64p V128:$Rn)>;
-
-//----------------------------------------------------------------------------
-// AdvSIMD INS/DUP instructions
-//----------------------------------------------------------------------------
-
-def DUPv8i8gpr  : SIMDDupFromMain<0, 0b00001, ".8b", v8i8, V64, GPR32>;
-def DUPv16i8gpr : SIMDDupFromMain<1, 0b00001, ".16b", v16i8, V128, GPR32>;
-def DUPv4i16gpr : SIMDDupFromMain<0, 0b00010, ".4h", v4i16, V64, GPR32>;
-def DUPv8i16gpr : SIMDDupFromMain<1, 0b00010, ".8h", v8i16, V128, GPR32>;
-def DUPv2i32gpr : SIMDDupFromMain<0, 0b00100, ".2s", v2i32, V64, GPR32>;
-def DUPv4i32gpr : SIMDDupFromMain<1, 0b00100, ".4s", v4i32, V128, GPR32>;
-def DUPv2i64gpr : SIMDDupFromMain<1, 0b01000, ".2d", v2i64, V128, GPR64>;
-
-def DUPv2i64lane : SIMDDup64FromElement;
-def DUPv2i32lane : SIMDDup32FromElement<0, ".2s", v2i32, V64>;
-def DUPv4i32lane : SIMDDup32FromElement<1, ".4s", v4i32, V128>;
-def DUPv4i16lane : SIMDDup16FromElement<0, ".4h", v4i16, V64>;
-def DUPv8i16lane : SIMDDup16FromElement<1, ".8h", v8i16, V128>;
-def DUPv8i8lane  : SIMDDup8FromElement <0, ".8b", v8i8, V64>;
-def DUPv16i8lane : SIMDDup8FromElement <1, ".16b", v16i8, V128>;
-
-def : Pat<(v2f32 (ARM64dup (f32 FPR32:$Rn))),
-          (v2f32 (DUPv2i32lane
-            (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR32:$Rn, ssub),
-            (i64 0)))>;
-def : Pat<(v4f32 (ARM64dup (f32 FPR32:$Rn))),
-          (v4f32 (DUPv4i32lane
-            (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR32:$Rn, ssub),
-            (i64 0)))>;
-def : Pat<(v2f64 (ARM64dup (f64 FPR64:$Rn))),
-          (v2f64 (DUPv2i64lane
-            (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR64:$Rn, dsub),
-            (i64 0)))>;
-
-def : Pat<(v2f32 (ARM64duplane32 (v4f32 V128:$Rn), VectorIndexS:$imm)),
-          (DUPv2i32lane V128:$Rn, VectorIndexS:$imm)>;
-def : Pat<(v4f32 (ARM64duplane32 (v4f32 V128:$Rn), VectorIndexS:$imm)),
-         (DUPv4i32lane V128:$Rn, VectorIndexS:$imm)>;
-def : Pat<(v2f64 (ARM64duplane64 (v2f64 V128:$Rn), VectorIndexD:$imm)),
-          (DUPv2i64lane V128:$Rn, VectorIndexD:$imm)>;
-
-defm SMOV : SMov;
-defm UMOV : UMov;
-
-def : Pat<(sext_inreg (vector_extract (v16i8 V128:$Rn), VectorIndexB:$idx), i8),
-          (i32 (SMOVvi8to32 V128:$Rn, VectorIndexB:$idx))>;
-def : Pat<(sext_inreg (vector_extract (v16i8 V128:$Rn), VectorIndexB:$idx), i8),
-          (i64 (SMOVvi8to64 V128:$Rn, VectorIndexB:$idx))>;
-def : Pat<(sext_inreg (vector_extract (v8i16 V128:$Rn), VectorIndexH:$idx),i16),
-          (i32 (SMOVvi16to32 V128:$Rn, VectorIndexH:$idx))>;
-def : Pat<(sext_inreg (vector_extract (v8i16 V128:$Rn), VectorIndexH:$idx),i16),
-          (i64 (SMOVvi16to64 V128:$Rn, VectorIndexH:$idx))>;
-def : Pat<(sext_inreg (vector_extract (v8i16 V128:$Rn), VectorIndexH:$idx),i16),
-          (i32 (SMOVvi16to32 V128:$Rn, VectorIndexH:$idx))>;
-def : Pat<(sext (i32 (vector_extract (v4i32 V128:$Rn), VectorIndexS:$idx))),
-          (i64 (SMOVvi32to64 V128:$Rn, VectorIndexS:$idx))>;
-
-// Extracting i8 or i16 elements will have the zero-extend transformed to
-// an 'and' mask by type legalization since neither i8 nor i16 are legal types
-// for ARM64. Match these patterns here since UMOV already zeroes out the high
-// bits of the destination register.
-def : Pat<(and (vector_extract (v16i8 V128:$Rn), VectorIndexB:$idx),
-               (i32 0xff)),
-          (i32 (UMOVvi8 V128:$Rn, VectorIndexB:$idx))>;
-def : Pat<(and (vector_extract (v8i16 V128:$Rn), VectorIndexH:$idx),
-               (i32 0xffff)),
-          (i32 (UMOVvi16 V128:$Rn, VectorIndexH:$idx))>;
-
-defm INS : SIMDIns;
-
-def : Pat<(v16i8 (scalar_to_vector GPR32:$Rn)),
-          (INSvi8gpr (v16i8 (IMPLICIT_DEF)), (i64 0), GPR32:$Rn)>;
-def : Pat<(v8i8 (scalar_to_vector GPR32:$Rn)),
-          (EXTRACT_SUBREG
-            (INSvi8gpr (v16i8 (IMPLICIT_DEF)), (i64 0), GPR32:$Rn), dsub)>;
-
-def : Pat<(v8i16 (scalar_to_vector GPR32:$Rn)),
-          (INSvi16gpr (v8i16 (IMPLICIT_DEF)), (i64 0), GPR32:$Rn)>;
-def : Pat<(v4i16 (scalar_to_vector GPR32:$Rn)),
-          (EXTRACT_SUBREG
-            (INSvi16gpr (v8i16 (IMPLICIT_DEF)), (i64 0), GPR32:$Rn), dsub)>;
-
-def : Pat<(v2i32 (scalar_to_vector (i32 FPR32:$Rn))),
-            (v2i32 (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)),
-                                  (i32 FPR32:$Rn), ssub))>;
-def : Pat<(v4i32 (scalar_to_vector (i32 FPR32:$Rn))),
-            (v4i32 (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)),
-                                  (i32 FPR32:$Rn), ssub))>;
-def : Pat<(v2i64 (scalar_to_vector (i64 FPR64:$Rn))),
-            (v2i64 (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)),
-                                  (i64 FPR64:$Rn), dsub))>;
-
-def : Pat<(v4f32 (scalar_to_vector (f32 FPR32:$Rn))),
-          (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR32:$Rn, ssub)>;
-def : Pat<(v2f32 (scalar_to_vector (f32 FPR32:$Rn))),
-          (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)), FPR32:$Rn, ssub)>;
-def : Pat<(v2f64 (scalar_to_vector (f64 FPR64:$Rn))),
-          (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FPR64:$Rn, dsub)>;
-
-def : Pat<(v2f32 (vector_insert (v2f32 V64:$Rn),
-            (f32 FPR32:$Rm), (i64 VectorIndexS:$imm))),
-          (EXTRACT_SUBREG
-            (INSvi32lane
-              (v4f32 (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), V64:$Rn, dsub)),
-              VectorIndexS:$imm,
-              (v4f32 (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR32:$Rm, ssub)),
-              (i64 0)),
-            dsub)>;
-def : Pat<(v4f32 (vector_insert (v4f32 V128:$Rn),
-            (f32 FPR32:$Rm), (i64 VectorIndexS:$imm))),
-          (INSvi32lane
-            V128:$Rn, VectorIndexS:$imm,
-            (v4f32 (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR32:$Rm, ssub)),
-            (i64 0))>;
-def : Pat<(v2f64 (vector_insert (v2f64 V128:$Rn),
-            (f64 FPR64:$Rm), (i64 VectorIndexD:$imm))),
-          (INSvi64lane
-            V128:$Rn, VectorIndexD:$imm,
-            (v2f64 (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FPR64:$Rm, dsub)),
-            (i64 0))>;
-
-// Copy an element at a constant index in one vector into a constant indexed
-// element of another.
-// FIXME refactor to a shared class/dev parameterized on vector type, vector
-// index type and INS extension
-def : Pat<(v16i8 (int_arm64_neon_vcopy_lane
-                   (v16i8 V128:$Vd), VectorIndexB:$idx, (v16i8 V128:$Vs),
-                   VectorIndexB:$idx2)),
-          (v16i8 (INSvi8lane
-                   V128:$Vd, VectorIndexB:$idx, V128:$Vs, VectorIndexB:$idx2)
-          )>;
-def : Pat<(v8i16 (int_arm64_neon_vcopy_lane
-                   (v8i16 V128:$Vd), VectorIndexH:$idx, (v8i16 V128:$Vs),
-                   VectorIndexH:$idx2)),
-          (v8i16 (INSvi16lane
-                   V128:$Vd, VectorIndexH:$idx, V128:$Vs, VectorIndexH:$idx2)
-          )>;
-def : Pat<(v4i32 (int_arm64_neon_vcopy_lane
-                   (v4i32 V128:$Vd), VectorIndexS:$idx, (v4i32 V128:$Vs),
-                   VectorIndexS:$idx2)),
-          (v4i32 (INSvi32lane
-                   V128:$Vd, VectorIndexS:$idx, V128:$Vs, VectorIndexS:$idx2)
-          )>;
-def : Pat<(v2i64 (int_arm64_neon_vcopy_lane
-                   (v2i64 V128:$Vd), VectorIndexD:$idx, (v2i64 V128:$Vs),
-                   VectorIndexD:$idx2)),
-          (v2i64 (INSvi64lane
-                   V128:$Vd, VectorIndexD:$idx, V128:$Vs, VectorIndexD:$idx2)
-          )>;
-
-// Floating point vector extractions are codegen'd as either a sequence of
-// subregister extractions, possibly fed by an INS if the lane number is
-// anything other than zero.
-def : Pat<(vector_extract (v2f64 V128:$Rn), 0),
-          (f64 (EXTRACT_SUBREG V128:$Rn, dsub))>;
-def : Pat<(vector_extract (v4f32 V128:$Rn), 0),
-          (f32 (EXTRACT_SUBREG V128:$Rn, ssub))>;
-def : Pat<(vector_extract (v2f64 V128:$Rn), VectorIndexD:$idx),
-          (f64 (EXTRACT_SUBREG
-            (INSvi64lane (v2f64 (IMPLICIT_DEF)), 0,
-                         V128:$Rn, VectorIndexD:$idx),
-            dsub))>;
-def : Pat<(vector_extract (v4f32 V128:$Rn), VectorIndexS:$idx),
-          (f32 (EXTRACT_SUBREG
-            (INSvi32lane (v4f32 (IMPLICIT_DEF)), 0,
-                         V128:$Rn, VectorIndexS:$idx),
-            ssub))>;
-
-// All concat_vectors operations are canonicalised to act on i64 vectors for
-// ARM64. In the general case we need an instruction, which had just as well be
-// INS.
-class ConcatPat<ValueType DstTy, ValueType SrcTy>
-  : Pat<(DstTy (concat_vectors (SrcTy V64:$Rd), V64:$Rn)),
-        (INSvi64lane (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub), 1,
-                     (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rn, dsub), 0)>;
-
-def : ConcatPat<v2i64, v1i64>;
-def : ConcatPat<v2f64, v1f64>;
-def : ConcatPat<v4i32, v2i32>;
-def : ConcatPat<v4f32, v2f32>;
-def : ConcatPat<v8i16, v4i16>;
-def : ConcatPat<v16i8, v8i8>;
-
-// If the high lanes are undef, though, we can just ignore them:
-class ConcatUndefPat<ValueType DstTy, ValueType SrcTy>
-  : Pat<(DstTy (concat_vectors (SrcTy V64:$Rn), undef)),
-        (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rn, dsub)>;
-
-def : ConcatUndefPat<v2i64, v1i64>;
-def : ConcatUndefPat<v2f64, v1f64>;
-def : ConcatUndefPat<v4i32, v2i32>;
-def : ConcatUndefPat<v4f32, v2f32>;
-def : ConcatUndefPat<v8i16, v4i16>;
-def : ConcatUndefPat<v16i8, v8i8>;
-
-//----------------------------------------------------------------------------
-// AdvSIMD across lanes instructions
-//----------------------------------------------------------------------------
-
-defm ADDV    : SIMDAcrossLanesBHS<0, 0b11011, "addv">;
-defm SMAXV   : SIMDAcrossLanesBHS<0, 0b01010, "smaxv">;
-defm SMINV   : SIMDAcrossLanesBHS<0, 0b11010, "sminv">;
-defm UMAXV   : SIMDAcrossLanesBHS<1, 0b01010, "umaxv">;
-defm UMINV   : SIMDAcrossLanesBHS<1, 0b11010, "uminv">;
-defm SADDLV  : SIMDAcrossLanesHSD<0, 0b00011, "saddlv">;
-defm UADDLV  : SIMDAcrossLanesHSD<1, 0b00011, "uaddlv">;
-defm FMAXNMV : SIMDAcrossLanesS<0b01100, 0, "fmaxnmv", int_arm64_neon_fmaxnmv>;
-defm FMAXV   : SIMDAcrossLanesS<0b01111, 0, "fmaxv", int_arm64_neon_fmaxv>;
-defm FMINNMV : SIMDAcrossLanesS<0b01100, 1, "fminnmv", int_arm64_neon_fminnmv>;
-defm FMINV   : SIMDAcrossLanesS<0b01111, 1, "fminv", int_arm64_neon_fminv>;
-
-multiclass SIMDAcrossLanesSignedIntrinsic<string baseOpc, Intrinsic intOp> {
-// If there is a sign extension after this intrinsic, consume it as smov already
-// performed it
-  def : Pat<(i32 (sext_inreg (i32 (intOp (v8i8 V64:$Rn))), i8)),
-        (i32 (SMOVvi8to32
-          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
-            (!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), bsub),
-          (i64 0)))>;
-  def : Pat<(i32 (intOp (v8i8 V64:$Rn))),
-        (i32 (SMOVvi8to32
-          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
-            (!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), bsub),
-          (i64 0)))>;
-// If there is a sign extension after this intrinsic, consume it as smov already
-// performed it
-def : Pat<(i32 (sext_inreg (i32 (intOp (v16i8 V128:$Rn))), i8)),
-        (i32 (SMOVvi8to32
-          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
-           (!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn), bsub),
-          (i64 0)))>;
-def : Pat<(i32 (intOp (v16i8 V128:$Rn))),
-        (i32 (SMOVvi8to32
-          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
-           (!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn), bsub),
-          (i64 0)))>;
-// If there is a sign extension after this intrinsic, consume it as smov already
-// performed it
-def : Pat<(i32 (sext_inreg (i32 (intOp (v4i16 V64:$Rn))), i16)),
-          (i32 (SMOVvi16to32
-           (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
-            (!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn), hsub),
-           (i64 0)))>;
-def : Pat<(i32 (intOp (v4i16 V64:$Rn))),
-          (i32 (SMOVvi16to32
-           (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
-            (!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn), hsub),
-           (i64 0)))>;
-// If there is a sign extension after this intrinsic, consume it as smov already
-// performed it
-def : Pat<(i32 (sext_inreg (i32 (intOp (v8i16 V128:$Rn))), i16)),
-        (i32 (SMOVvi16to32
-          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
-           (!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn), hsub),
-          (i64 0)))>;
-def : Pat<(i32 (intOp (v8i16 V128:$Rn))),
-        (i32 (SMOVvi16to32
-          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
-           (!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn), hsub),
-          (i64 0)))>;
-
-def : Pat<(i32 (intOp (v4i32 V128:$Rn))),
-        (i32 (EXTRACT_SUBREG
-          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
-           (!cast<Instruction>(!strconcat(baseOpc, "v4i32v")) V128:$Rn), ssub),
-          ssub))>;
-}
-
-multiclass SIMDAcrossLanesUnsignedIntrinsic<string baseOpc, Intrinsic intOp> {
-// If there is a masking operation keeping only what has been actually
-// generated, consume it.
-  def : Pat<(i32 (and (i32 (intOp (v8i8 V64:$Rn))), maski8_or_more)),
-        (i32 (EXTRACT_SUBREG
-          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
-            (!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), bsub),
-          ssub))>;
-  def : Pat<(i32 (intOp (v8i8 V64:$Rn))),
-        (i32 (EXTRACT_SUBREG
-          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
-            (!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), bsub),
-          ssub))>;
-// If there is a masking operation keeping only what has been actually
-// generated, consume it.
-def : Pat<(i32 (and (i32 (intOp (v16i8 V128:$Rn))), maski8_or_more)),
-        (i32 (EXTRACT_SUBREG
-          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
-            (!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn), bsub),
-          ssub))>;
-def : Pat<(i32 (intOp (v16i8 V128:$Rn))),
-        (i32 (EXTRACT_SUBREG
-          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
-            (!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn), bsub),
-          ssub))>;
-
-// If there is a masking operation keeping only what has been actually
-// generated, consume it.
-def : Pat<(i32 (and (i32 (intOp (v4i16 V64:$Rn))), maski16_or_more)),
-          (i32 (EXTRACT_SUBREG
-            (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
-              (!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn), hsub),
-            ssub))>;
-def : Pat<(i32 (intOp (v4i16 V64:$Rn))),
-          (i32 (EXTRACT_SUBREG
-            (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
-              (!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn), hsub),
-            ssub))>;
-// If there is a masking operation keeping only what has been actually
-// generated, consume it.
-def : Pat<(i32 (and (i32 (intOp (v8i16 V128:$Rn))), maski16_or_more)),
-        (i32 (EXTRACT_SUBREG
-          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
-            (!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn), hsub),
-          ssub))>;
-def : Pat<(i32 (intOp (v8i16 V128:$Rn))),
-        (i32 (EXTRACT_SUBREG
-          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
-            (!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn), hsub),
-          ssub))>;
-
-def : Pat<(i32 (intOp (v4i32 V128:$Rn))),
-        (i32 (EXTRACT_SUBREG
-          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
-            (!cast<Instruction>(!strconcat(baseOpc, "v4i32v")) V128:$Rn), ssub),
-          ssub))>;
-
-}
-
-multiclass SIMDAcrossLanesSignedLongIntrinsic<string baseOpc, Intrinsic intOp> {
-  def : Pat<(i32 (intOp (v8i8 V64:$Rn))),
-        (i32 (SMOVvi16to32
-          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
-            (!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), hsub),
-          (i64 0)))>;
-def : Pat<(i32 (intOp (v16i8 V128:$Rn))),
-        (i32 (SMOVvi16to32
-          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
-           (!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn), hsub),
-          (i64 0)))>;
-
-def : Pat<(i32 (intOp (v4i16 V64:$Rn))),
-          (i32 (EXTRACT_SUBREG
-           (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
-            (!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn), ssub),
-           ssub))>;
-def : Pat<(i32 (intOp (v8i16 V128:$Rn))),
-        (i32 (EXTRACT_SUBREG
-          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
-           (!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn), ssub),
-          ssub))>;
-
-def : Pat<(i64 (intOp (v4i32 V128:$Rn))),
-        (i64 (EXTRACT_SUBREG
-          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
-           (!cast<Instruction>(!strconcat(baseOpc, "v4i32v")) V128:$Rn), dsub),
-          dsub))>;
-}
-
-multiclass SIMDAcrossLanesUnsignedLongIntrinsic<string baseOpc,
-                                                Intrinsic intOp> {
-  def : Pat<(i32 (intOp (v8i8 V64:$Rn))),
-        (i32 (EXTRACT_SUBREG
-          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
-            (!cast<Instruction>(!strconcat(baseOpc, "v8i8v")) V64:$Rn), hsub),
-          ssub))>;
-def : Pat<(i32 (intOp (v16i8 V128:$Rn))),
-        (i32 (EXTRACT_SUBREG
-          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
-            (!cast<Instruction>(!strconcat(baseOpc, "v16i8v")) V128:$Rn), hsub),
-          ssub))>;
-
-def : Pat<(i32 (intOp (v4i16 V64:$Rn))),
-          (i32 (EXTRACT_SUBREG
-            (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
-              (!cast<Instruction>(!strconcat(baseOpc, "v4i16v")) V64:$Rn), ssub),
-            ssub))>;
-def : Pat<(i32 (intOp (v8i16 V128:$Rn))),
-        (i32 (EXTRACT_SUBREG
-          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
-            (!cast<Instruction>(!strconcat(baseOpc, "v8i16v")) V128:$Rn), ssub),
-          ssub))>;
-
-def : Pat<(i64 (intOp (v4i32 V128:$Rn))),
-        (i64 (EXTRACT_SUBREG
-          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
-            (!cast<Instruction>(!strconcat(baseOpc, "v4i32v")) V128:$Rn), dsub),
-          dsub))>;
-}
-
-defm : SIMDAcrossLanesSignedIntrinsic<"ADDV",  int_arm64_neon_saddv>;
-// vaddv_[su]32 is special; -> ADDP Vd.2S,Vn.2S,Vm.2S; return Vd.s[0];Vn==Vm
-def : Pat<(i32 (int_arm64_neon_saddv (v2i32 V64:$Rn))),
-          (EXTRACT_SUBREG (ADDPv2i32 V64:$Rn, V64:$Rn), ssub)>;
-
-defm : SIMDAcrossLanesUnsignedIntrinsic<"ADDV",  int_arm64_neon_uaddv>;
-// vaddv_[su]32 is special; -> ADDP Vd.2S,Vn.2S,Vm.2S; return Vd.s[0];Vn==Vm
-def : Pat<(i32 (int_arm64_neon_uaddv (v2i32 V64:$Rn))),
-          (EXTRACT_SUBREG (ADDPv2i32 V64:$Rn, V64:$Rn), ssub)>;
-
-defm : SIMDAcrossLanesSignedIntrinsic<"SMAXV", int_arm64_neon_smaxv>;
-def : Pat<(i32 (int_arm64_neon_smaxv (v2i32 V64:$Rn))),
-           (EXTRACT_SUBREG (SMAXPv2i32 V64:$Rn, V64:$Rn), ssub)>;
-
-defm : SIMDAcrossLanesSignedIntrinsic<"SMINV", int_arm64_neon_sminv>;
-def : Pat<(i32 (int_arm64_neon_sminv (v2i32 V64:$Rn))),
-           (EXTRACT_SUBREG (SMINPv2i32 V64:$Rn, V64:$Rn), ssub)>;
-
-defm : SIMDAcrossLanesUnsignedIntrinsic<"UMAXV", int_arm64_neon_umaxv>;
-def : Pat<(i32 (int_arm64_neon_umaxv (v2i32 V64:$Rn))),
-           (EXTRACT_SUBREG (UMAXPv2i32 V64:$Rn, V64:$Rn), ssub)>;
-
-defm : SIMDAcrossLanesUnsignedIntrinsic<"UMINV", int_arm64_neon_uminv>;
-def : Pat<(i32 (int_arm64_neon_uminv (v2i32 V64:$Rn))),
-           (EXTRACT_SUBREG (UMINPv2i32 V64:$Rn, V64:$Rn), ssub)>;
-
-defm : SIMDAcrossLanesSignedLongIntrinsic<"SADDLV", int_arm64_neon_saddlv>;
-defm : SIMDAcrossLanesUnsignedLongIntrinsic<"UADDLV", int_arm64_neon_uaddlv>;
-
-// The vaddlv_s32 intrinsic gets mapped to SADDLP.
-def : Pat<(i64 (int_arm64_neon_saddlv (v2i32 V64:$Rn))),
-          (i64 (EXTRACT_SUBREG
-            (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
-              (SADDLPv2i32_v1i64 V64:$Rn), dsub),
-            dsub))>;
-// The vaddlv_u32 intrinsic gets mapped to UADDLP.
-def : Pat<(i64 (int_arm64_neon_uaddlv (v2i32 V64:$Rn))),
-          (i64 (EXTRACT_SUBREG
-            (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)),
-              (UADDLPv2i32_v1i64 V64:$Rn), dsub),
-            dsub))>;
-
-//------------------------------------------------------------------------------
-// AdvSIMD modified immediate instructions
-//------------------------------------------------------------------------------
-
-// AdvSIMD BIC
-defm BIC : SIMDModifiedImmVectorShiftTied<1, 0b11, 0b01, "bic", ARM64bici>;
-// AdvSIMD ORR
-defm ORR : SIMDModifiedImmVectorShiftTied<0, 0b11, 0b01, "orr", ARM64orri>;
-
-
-// AdvSIMD FMOV
-def FMOVv2f64_ns : SIMDModifiedImmVectorNoShift<1, 1, 0b1111, V128, fpimm8,
-                                              "fmov", ".2d",
-                       [(set (v2f64 V128:$Rd), (ARM64fmov imm0_255:$imm8))]>;
-def FMOVv2f32_ns : SIMDModifiedImmVectorNoShift<0, 0, 0b1111, V64,  fpimm8,
-                                              "fmov", ".2s",
-                       [(set (v2f32 V64:$Rd), (ARM64fmov imm0_255:$imm8))]>;
-def FMOVv4f32_ns : SIMDModifiedImmVectorNoShift<1, 0, 0b1111, V128, fpimm8,
-                                              "fmov", ".4s",
-                       [(set (v4f32 V128:$Rd), (ARM64fmov imm0_255:$imm8))]>;
-
-// AdvSIMD MOVI
-
-// EDIT byte mask: scalar
-let isReMaterializable = 1, isAsCheapAsAMove = 1 in
-def MOVID      : SIMDModifiedImmScalarNoShift<0, 1, 0b1110, "movi",
-                    [(set FPR64:$Rd, simdimmtype10:$imm8)]>;
-// The movi_edit node has the immediate value already encoded, so we use
-// a plain imm0_255 here.
-def : Pat<(f64 (ARM64movi_edit imm0_255:$shift)),
-          (MOVID imm0_255:$shift)>;
-
-def : Pat<(v1i64 immAllZerosV), (MOVID (i32 0))>;
-def : Pat<(v2i32 immAllZerosV), (MOVID (i32 0))>;
-def : Pat<(v4i16 immAllZerosV), (MOVID (i32 0))>;
-def : Pat<(v8i8  immAllZerosV), (MOVID (i32 0))>;
-
-def : Pat<(v1i64 immAllOnesV), (MOVID (i32 255))>;
-def : Pat<(v2i32 immAllOnesV), (MOVID (i32 255))>;
-def : Pat<(v4i16 immAllOnesV), (MOVID (i32 255))>;
-def : Pat<(v8i8  immAllOnesV), (MOVID (i32 255))>;
-
-// EDIT byte mask: 2d
-
-// The movi_edit node has the immediate value already encoded, so we use
-// a plain imm0_255 in the pattern
-let isReMaterializable = 1, isAsCheapAsAMove = 1 in
-def MOVIv2d_ns   : SIMDModifiedImmVectorNoShift<1, 1, 0b1110, V128,
-                                                simdimmtype10,
-                                                "movi", ".2d",
-                   [(set (v2i64 V128:$Rd), (ARM64movi_edit imm0_255:$imm8))]>;
-
-
-// Use movi.2d to materialize 0.0 if the HW does zero-cycle zeroing.
-// Complexity is added to break a tie with a plain MOVI.
-let AddedComplexity = 1 in {
-def : Pat<(f32   fpimm0),
-          (f32 (EXTRACT_SUBREG (v2i64 (MOVIv2d_ns (i32 0))), ssub))>,
-      Requires<[HasZCZ]>;
-def : Pat<(f64   fpimm0),
-          (f64 (EXTRACT_SUBREG (v2i64 (MOVIv2d_ns (i32 0))), dsub))>,
-      Requires<[HasZCZ]>;
-}
-
-def : Pat<(v2i64 immAllZerosV), (MOVIv2d_ns (i32 0))>;
-def : Pat<(v4i32 immAllZerosV), (MOVIv2d_ns (i32 0))>;
-def : Pat<(v8i16 immAllZerosV), (MOVIv2d_ns (i32 0))>;
-def : Pat<(v16i8 immAllZerosV), (MOVIv2d_ns (i32 0))>;
-
-def : Pat<(v2i64 immAllOnesV), (MOVIv2d_ns (i32 255))>;
-def : Pat<(v4i32 immAllOnesV), (MOVIv2d_ns (i32 255))>;
-def : Pat<(v8i16 immAllOnesV), (MOVIv2d_ns (i32 255))>;
-def : Pat<(v16i8 immAllOnesV), (MOVIv2d_ns (i32 255))>;
-
-// EDIT per word & halfword: 2s, 4h, 4s, & 8h
-defm MOVI      : SIMDModifiedImmVectorShift<0, 0b10, 0b00, "movi">;
-def : Pat<(v2i32 (ARM64movi_shift imm0_255:$imm8, (i32 imm:$shift))),
-          (MOVIv2i32 imm0_255:$imm8, imm:$shift)>;
-def : Pat<(v4i32 (ARM64movi_shift imm0_255:$imm8, (i32 imm:$shift))),
-          (MOVIv4i32 imm0_255:$imm8, imm:$shift)>;
-def : Pat<(v4i16 (ARM64movi_shift imm0_255:$imm8, (i32 imm:$shift))),
-          (MOVIv4i16 imm0_255:$imm8, imm:$shift)>;
-def : Pat<(v8i16 (ARM64movi_shift imm0_255:$imm8, (i32 imm:$shift))),
-          (MOVIv8i16 imm0_255:$imm8, imm:$shift)>;
-
-// EDIT per word: 2s & 4s with MSL shifter
-def MOVIv2s_msl  : SIMDModifiedImmMoveMSL<0, 0, {1,1,0,?}, V64, "movi", ".2s",
-                      [(set (v2i32 V64:$Rd),
-                            (ARM64movi_msl imm0_255:$imm8, (i32 imm:$shift)))]>;
-def MOVIv4s_msl  : SIMDModifiedImmMoveMSL<1, 0, {1,1,0,?}, V128, "movi", ".4s",
-                      [(set (v4i32 V128:$Rd),
-                            (ARM64movi_msl imm0_255:$imm8, (i32 imm:$shift)))]>;
-
-// Per byte: 8b & 16b
-def MOVIv8b_ns   : SIMDModifiedImmVectorNoShift<0, 0, 0b1110, V64,  imm0_255,
-                                                 "movi", ".8b",
-                       [(set (v8i8 V64:$Rd), (ARM64movi imm0_255:$imm8))]>;
-def MOVIv16b_ns  : SIMDModifiedImmVectorNoShift<1, 0, 0b1110, V128, imm0_255,
-                                                 "movi", ".16b",
-                       [(set (v16i8 V128:$Rd), (ARM64movi imm0_255:$imm8))]>;
-
-// AdvSIMD MVNI
-
-// EDIT per word & halfword: 2s, 4h, 4s, & 8h
-defm MVNI      : SIMDModifiedImmVectorShift<1, 0b10, 0b00, "mvni">;
-def : Pat<(v2i32 (ARM64mvni_shift imm0_255:$imm8, (i32 imm:$shift))),
-          (MVNIv2i32 imm0_255:$imm8, imm:$shift)>;
-def : Pat<(v4i32 (ARM64mvni_shift imm0_255:$imm8, (i32 imm:$shift))),
-          (MVNIv4i32 imm0_255:$imm8, imm:$shift)>;
-def : Pat<(v4i16 (ARM64mvni_shift imm0_255:$imm8, (i32 imm:$shift))),
-          (MVNIv4i16 imm0_255:$imm8, imm:$shift)>;
-def : Pat<(v8i16 (ARM64mvni_shift imm0_255:$imm8, (i32 imm:$shift))),
-          (MVNIv8i16 imm0_255:$imm8, imm:$shift)>;
-
-// EDIT per word: 2s & 4s with MSL shifter
-def MVNIv2s_msl   : SIMDModifiedImmMoveMSL<0, 1, {1,1,0,?}, V64, "mvni", ".2s",
-                      [(set (v2i32 V64:$Rd),
-                            (ARM64mvni_msl imm0_255:$imm8, (i32 imm:$shift)))]>;
-def MVNIv4s_msl   : SIMDModifiedImmMoveMSL<1, 1, {1,1,0,?}, V128, "mvni", ".4s",
-                      [(set (v4i32 V128:$Rd),
-                            (ARM64mvni_msl imm0_255:$imm8, (i32 imm:$shift)))]>;
-
-//----------------------------------------------------------------------------
-// AdvSIMD indexed element
-//----------------------------------------------------------------------------
-
-let neverHasSideEffects = 1 in {
-  defm FMLA  : SIMDFPIndexedSDTied<0, 0b0001, "fmla">;
-  defm FMLS  : SIMDFPIndexedSDTied<0, 0b0101, "fmls">;
-}
-
-// NOTE: Operands are reordered in the FMLA/FMLS PatFrags because the
-// instruction expects the addend first, while the intrinsic expects it last.
-
-// On the other hand, there are quite a few valid combinatorial options due to
-// the commutativity of multiplication and the fact that (-x) * y = x * (-y).
-defm : SIMDFPIndexedSDTiedPatterns<"FMLA",
-           TriOpFrag<(fma node:$RHS, node:$MHS, node:$LHS)>>;
-defm : SIMDFPIndexedSDTiedPatterns<"FMLA",
-           TriOpFrag<(fma node:$MHS, node:$RHS, node:$LHS)>>;
-
-defm : SIMDFPIndexedSDTiedPatterns<"FMLS",
-           TriOpFrag<(fma node:$MHS, (fneg node:$RHS), node:$LHS)> >;
-defm : SIMDFPIndexedSDTiedPatterns<"FMLS",
-           TriOpFrag<(fma node:$RHS, (fneg node:$MHS), node:$LHS)> >;
-defm : SIMDFPIndexedSDTiedPatterns<"FMLS",
-           TriOpFrag<(fma (fneg node:$RHS), node:$MHS, node:$LHS)> >;
-defm : SIMDFPIndexedSDTiedPatterns<"FMLS",
-           TriOpFrag<(fma (fneg node:$MHS), node:$RHS, node:$LHS)> >;
-
-multiclass FMLSIndexedAfterNegPatterns<SDPatternOperator OpNode> {
-  // 3 variants for the .2s version: DUPLANE from 128-bit, DUPLANE from 64-bit
-  // and DUP scalar.
-  def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn),
-                           (ARM64duplane32 (v4f32 (fneg V128:$Rm)),
-                                           VectorIndexS:$idx))),
-            (FMLSv2i32_indexed V64:$Rd, V64:$Rn, V128:$Rm, VectorIndexS:$idx)>;
-  def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn),
-                           (v2f32 (ARM64duplane32
-                                      (v4f32 (insert_subvector undef,
-                                                 (v2f32 (fneg V64:$Rm)),
-                                                 (i32 0))),
-                                      VectorIndexS:$idx)))),
-            (FMLSv2i32_indexed V64:$Rd, V64:$Rn,
-                               (SUBREG_TO_REG (i32 0), V64:$Rm, dsub),
-                               VectorIndexS:$idx)>;
-  def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn),
-                           (ARM64dup (f32 (fneg FPR32Op:$Rm))))),
-            (FMLSv2i32_indexed V64:$Rd, V64:$Rn,
-                (SUBREG_TO_REG (i32 0), FPR32Op:$Rm, ssub), (i64 0))>;
-
-  // 3 variants for the .4s version: DUPLANE from 128-bit, DUPLANE from 64-bit
-  // and DUP scalar.
-  def : Pat<(v4f32 (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn),
-                           (ARM64duplane32 (v4f32 (fneg V128:$Rm)),
-                                           VectorIndexS:$idx))),
-            (FMLSv4i32_indexed V128:$Rd, V128:$Rn, V128:$Rm,
-                               VectorIndexS:$idx)>;
-  def : Pat<(v4f32 (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn),
-                           (v4f32 (ARM64duplane32
-                                      (v4f32 (insert_subvector undef,
-                                                 (v2f32 (fneg V64:$Rm)),
-                                                 (i32 0))),
-                                      VectorIndexS:$idx)))),
-            (FMLSv4i32_indexed V128:$Rd, V128:$Rn,
-                               (SUBREG_TO_REG (i32 0), V64:$Rm, dsub),
-                               VectorIndexS:$idx)>;
-  def : Pat<(v4f32 (OpNode (v4f32 V128:$Rd), (v4f32 V128:$Rn),
-                           (ARM64dup (f32 (fneg FPR32Op:$Rm))))),
-            (FMLSv4i32_indexed V128:$Rd, V128:$Rn,
-                (SUBREG_TO_REG (i32 0), FPR32Op:$Rm, ssub), (i64 0))>;
-
-  // 2 variants for the .2d version: DUPLANE from 128-bit, and DUP scalar
-  // (DUPLANE from 64-bit would be trivial).
-  def : Pat<(v2f64 (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn),
-                           (ARM64duplane64 (v2f64 (fneg V128:$Rm)),
-                                           VectorIndexD:$idx))),
-            (FMLSv2i64_indexed
-                V128:$Rd, V128:$Rn, V128:$Rm, VectorIndexS:$idx)>;
-  def : Pat<(v2f64 (OpNode (v2f64 V128:$Rd), (v2f64 V128:$Rn),
-                           (ARM64dup (f64 (fneg FPR64Op:$Rm))))),
-            (FMLSv2i64_indexed V128:$Rd, V128:$Rn,
-                (SUBREG_TO_REG (i32 0), FPR64Op:$Rm, dsub), (i64 0))>;
-
-  // 2 variants for 32-bit scalar version: extract from .2s or from .4s
-  def : Pat<(f32 (OpNode (f32 FPR32:$Rd), (f32 FPR32:$Rn),
-                         (vector_extract (v4f32 (fneg V128:$Rm)),
-                                         VectorIndexS:$idx))),
-            (FMLSv1i32_indexed FPR32:$Rd, FPR32:$Rn,
-                V128:$Rm, VectorIndexS:$idx)>;
-  def : Pat<(f32 (OpNode (f32 FPR32:$Rd), (f32 FPR32:$Rn),
-                         (vector_extract (v2f32 (fneg V64:$Rm)),
-                                         VectorIndexS:$idx))),
-            (FMLSv1i32_indexed FPR32:$Rd, FPR32:$Rn,
-                (SUBREG_TO_REG (i32 0), V64:$Rm, dsub), VectorIndexS:$idx)>;
-
-  // 1 variant for 64-bit scalar version: extract from .1d or from .2d
-  def : Pat<(f64 (OpNode (f64 FPR64:$Rd), (f64 FPR64:$Rn),
-                         (vector_extract (v2f64 (fneg V128:$Rm)),
-                                         VectorIndexS:$idx))),
-            (FMLSv1i64_indexed FPR64:$Rd, FPR64:$Rn,
-                V128:$Rm, VectorIndexS:$idx)>;
-}
-
-defm : FMLSIndexedAfterNegPatterns<
-           TriOpFrag<(fma node:$RHS, node:$MHS, node:$LHS)> >;
-defm : FMLSIndexedAfterNegPatterns<
-           TriOpFrag<(fma node:$MHS, node:$RHS, node:$LHS)> >;
-
-defm FMULX : SIMDFPIndexedSD<1, 0b1001, "fmulx", int_arm64_neon_fmulx>;
-defm FMUL  : SIMDFPIndexedSD<0, 0b1001, "fmul", fmul>;
-
-def : Pat<(v2f32 (fmul V64:$Rn, (ARM64dup (f32 FPR32:$Rm)))),
-          (FMULv2i32_indexed V64:$Rn,
-            (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR32:$Rm, ssub),
-            (i64 0))>;
-def : Pat<(v4f32 (fmul V128:$Rn, (ARM64dup (f32 FPR32:$Rm)))),
-          (FMULv4i32_indexed V128:$Rn,
-            (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR32:$Rm, ssub),
-            (i64 0))>;
-def : Pat<(v2f64 (fmul V128:$Rn, (ARM64dup (f64 FPR64:$Rm)))),
-          (FMULv2i64_indexed V128:$Rn,
-            (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR64:$Rm, dsub),
-            (i64 0))>;
-
-defm SQDMULH : SIMDIndexedHS<0, 0b1100, "sqdmulh", int_arm64_neon_sqdmulh>;
-defm SQRDMULH : SIMDIndexedHS<0, 0b1101, "sqrdmulh", int_arm64_neon_sqrdmulh>;
-defm MLA   : SIMDVectorIndexedHSTied<1, 0b0000, "mla",
-              TriOpFrag<(add node:$LHS, (mul node:$MHS, node:$RHS))>>;
-defm MLS   : SIMDVectorIndexedHSTied<1, 0b0100, "mls",
-              TriOpFrag<(sub node:$LHS, (mul node:$MHS, node:$RHS))>>;
-defm MUL   : SIMDVectorIndexedHS<0, 0b1000, "mul", mul>;
-defm SMLAL : SIMDVectorIndexedLongSDTied<0, 0b0010, "smlal",
-    TriOpFrag<(add node:$LHS, (int_arm64_neon_smull node:$MHS, node:$RHS))>>;
-defm SMLSL : SIMDVectorIndexedLongSDTied<0, 0b0110, "smlsl",
-    TriOpFrag<(sub node:$LHS, (int_arm64_neon_smull node:$MHS, node:$RHS))>>;
-defm SMULL : SIMDVectorIndexedLongSD<0, 0b1010, "smull",
-                int_arm64_neon_smull>;
-defm SQDMLAL : SIMDIndexedLongSQDMLXSDTied<0, 0b0011, "sqdmlal",
-                                           int_arm64_neon_sqadd>;
-defm SQDMLSL : SIMDIndexedLongSQDMLXSDTied<0, 0b0111, "sqdmlsl",
-                                           int_arm64_neon_sqsub>;
-defm SQDMULL : SIMDIndexedLongSD<0, 0b1011, "sqdmull", int_arm64_neon_sqdmull>;
-defm UMLAL   : SIMDVectorIndexedLongSDTied<1, 0b0010, "umlal",
-    TriOpFrag<(add node:$LHS, (int_arm64_neon_umull node:$MHS, node:$RHS))>>;
-defm UMLSL   : SIMDVectorIndexedLongSDTied<1, 0b0110, "umlsl",
-    TriOpFrag<(sub node:$LHS, (int_arm64_neon_umull node:$MHS, node:$RHS))>>;
-defm UMULL   : SIMDVectorIndexedLongSD<1, 0b1010, "umull",
-                int_arm64_neon_umull>;
-
-// A scalar sqdmull with the second operand being a vector lane can be
-// handled directly with the indexed instruction encoding.
-def : Pat<(int_arm64_neon_sqdmulls_scalar (i32 FPR32:$Rn),
-                                          (vector_extract (v4i32 V128:$Vm),
-                                                           VectorIndexS:$idx)),
-          (SQDMULLv1i64_indexed FPR32:$Rn, V128:$Vm, VectorIndexS:$idx)>;
-
-//----------------------------------------------------------------------------
-// AdvSIMD scalar shift instructions
-//----------------------------------------------------------------------------
-defm FCVTZS : SIMDScalarRShiftSD<0, 0b11111, "fcvtzs">;
-defm FCVTZU : SIMDScalarRShiftSD<1, 0b11111, "fcvtzu">;
-defm SCVTF  : SIMDScalarRShiftSD<0, 0b11100, "scvtf">;
-defm UCVTF  : SIMDScalarRShiftSD<1, 0b11100, "ucvtf">;
-// Codegen patterns for the above. We don't put these directly on the
-// instructions because TableGen's type inference can't handle the truth.
-// Having the same base pattern for fp <--> int totally freaks it out.
-def : Pat<(int_arm64_neon_vcvtfp2fxs FPR32:$Rn, vecshiftR32:$imm),
-          (FCVTZSs FPR32:$Rn, vecshiftR32:$imm)>;
-def : Pat<(int_arm64_neon_vcvtfp2fxu FPR32:$Rn, vecshiftR32:$imm),
-          (FCVTZUs FPR32:$Rn, vecshiftR32:$imm)>;
-def : Pat<(i64 (int_arm64_neon_vcvtfp2fxs (f64 FPR64:$Rn), vecshiftR64:$imm)),
-          (FCVTZSd FPR64:$Rn, vecshiftR64:$imm)>;
-def : Pat<(i64 (int_arm64_neon_vcvtfp2fxu (f64 FPR64:$Rn), vecshiftR64:$imm)),
-          (FCVTZUd FPR64:$Rn, vecshiftR64:$imm)>;
-def : Pat<(v1i64 (int_arm64_neon_vcvtfp2fxs (v1f64 FPR64:$Rn),
-                                            vecshiftR64:$imm)),
-          (FCVTZSd FPR64:$Rn, vecshiftR64:$imm)>;
-def : Pat<(v1i64 (int_arm64_neon_vcvtfp2fxu (v1f64 FPR64:$Rn),
-                                            vecshiftR64:$imm)),
-          (FCVTZUd FPR64:$Rn, vecshiftR64:$imm)>;
-def : Pat<(int_arm64_neon_vcvtfxs2fp FPR32:$Rn, vecshiftR32:$imm),
-          (SCVTFs FPR32:$Rn, vecshiftR32:$imm)>;
-def : Pat<(int_arm64_neon_vcvtfxu2fp FPR32:$Rn, vecshiftR32:$imm),
-          (UCVTFs FPR32:$Rn, vecshiftR32:$imm)>;
-def : Pat<(f64 (int_arm64_neon_vcvtfxs2fp (i64 FPR64:$Rn), vecshiftR64:$imm)),
-          (SCVTFd FPR64:$Rn, vecshiftR64:$imm)>;
-def : Pat<(f64 (int_arm64_neon_vcvtfxu2fp (i64 FPR64:$Rn), vecshiftR64:$imm)),
-          (UCVTFd FPR64:$Rn, vecshiftR64:$imm)>;
-def : Pat<(v1f64 (int_arm64_neon_vcvtfxs2fp (v1i64 FPR64:$Rn),
-                                            vecshiftR64:$imm)),
-          (SCVTFd FPR64:$Rn, vecshiftR64:$imm)>;
-def : Pat<(v1f64 (int_arm64_neon_vcvtfxu2fp (v1i64 FPR64:$Rn),
-                                            vecshiftR64:$imm)),
-          (UCVTFd FPR64:$Rn, vecshiftR64:$imm)>;
-
-defm SHL      : SIMDScalarLShiftD<   0, 0b01010, "shl", ARM64vshl>;
-defm SLI      : SIMDScalarLShiftDTied<1, 0b01010, "sli">;
-defm SQRSHRN  : SIMDScalarRShiftBHS< 0, 0b10011, "sqrshrn",
-                                     int_arm64_neon_sqrshrn>;
-defm SQRSHRUN : SIMDScalarRShiftBHS< 1, 0b10001, "sqrshrun",
-                                     int_arm64_neon_sqrshrun>;
-defm SQSHLU   : SIMDScalarLShiftBHSD<1, 0b01100, "sqshlu", ARM64sqshlui>;
-defm SQSHL    : SIMDScalarLShiftBHSD<0, 0b01110, "sqshl", ARM64sqshli>;
-defm SQSHRN   : SIMDScalarRShiftBHS< 0, 0b10010, "sqshrn",
-                                     int_arm64_neon_sqshrn>;
-defm SQSHRUN  : SIMDScalarRShiftBHS< 1, 0b10000, "sqshrun",
-                                     int_arm64_neon_sqshrun>;
-defm SRI      : SIMDScalarRShiftDTied<   1, 0b01000, "sri">;
-defm SRSHR    : SIMDScalarRShiftD<   0, 0b00100, "srshr", ARM64srshri>;
-defm SRSRA    : SIMDScalarRShiftDTied<   0, 0b00110, "srsra",
-    TriOpFrag<(add node:$LHS,
-                   (ARM64srshri node:$MHS, node:$RHS))>>;
-defm SSHR     : SIMDScalarRShiftD<   0, 0b00000, "sshr", ARM64vashr>;
-defm SSRA     : SIMDScalarRShiftDTied<   0, 0b00010, "ssra",
-    TriOpFrag<(add node:$LHS,
-                   (ARM64vashr node:$MHS, node:$RHS))>>;
-defm UQRSHRN  : SIMDScalarRShiftBHS< 1, 0b10011, "uqrshrn",
-                                     int_arm64_neon_uqrshrn>;
-defm UQSHL    : SIMDScalarLShiftBHSD<1, 0b01110, "uqshl", ARM64uqshli>;
-defm UQSHRN   : SIMDScalarRShiftBHS< 1, 0b10010, "uqshrn",
-                                     int_arm64_neon_uqshrn>;
-defm URSHR    : SIMDScalarRShiftD<   1, 0b00100, "urshr", ARM64urshri>;
-defm URSRA    : SIMDScalarRShiftDTied<   1, 0b00110, "ursra",
-    TriOpFrag<(add node:$LHS,
-                   (ARM64urshri node:$MHS, node:$RHS))>>;
-defm USHR     : SIMDScalarRShiftD<   1, 0b00000, "ushr", ARM64vlshr>;
-defm USRA     : SIMDScalarRShiftDTied<   1, 0b00010, "usra",
-    TriOpFrag<(add node:$LHS,
-                   (ARM64vlshr node:$MHS, node:$RHS))>>;
-
-//----------------------------------------------------------------------------
-// AdvSIMD vector shift instructions
-//----------------------------------------------------------------------------
-defm FCVTZS:SIMDVectorRShiftSD<0, 0b11111, "fcvtzs", int_arm64_neon_vcvtfp2fxs>;
-defm FCVTZU:SIMDVectorRShiftSD<1, 0b11111, "fcvtzu", int_arm64_neon_vcvtfp2fxu>;
-defm SCVTF: SIMDVectorRShiftSDToFP<0, 0b11100, "scvtf",
-                                   int_arm64_neon_vcvtfxs2fp>;
-defm RSHRN   : SIMDVectorRShiftNarrowBHS<0, 0b10001, "rshrn",
-                                         int_arm64_neon_rshrn>;
-defm SHL     : SIMDVectorLShiftBHSD<0, 0b01010, "shl", ARM64vshl>;
-defm SHRN    : SIMDVectorRShiftNarrowBHS<0, 0b10000, "shrn",
-                          BinOpFrag<(trunc (ARM64vashr node:$LHS, node:$RHS))>>;
-defm SLI     : SIMDVectorLShiftBHSDTied<1, 0b01010, "sli", int_arm64_neon_vsli>;
-def : Pat<(v1i64 (int_arm64_neon_vsli (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn),
-                                      (i32 vecshiftL64:$imm))),
-          (SLId FPR64:$Rd, FPR64:$Rn, vecshiftL64:$imm)>;
-defm SQRSHRN : SIMDVectorRShiftNarrowBHS<0, 0b10011, "sqrshrn",
-                                         int_arm64_neon_sqrshrn>;
-defm SQRSHRUN: SIMDVectorRShiftNarrowBHS<1, 0b10001, "sqrshrun",
-                                         int_arm64_neon_sqrshrun>;
-defm SQSHLU : SIMDVectorLShiftBHSD<1, 0b01100, "sqshlu", ARM64sqshlui>;
-defm SQSHL  : SIMDVectorLShiftBHSD<0, 0b01110, "sqshl", ARM64sqshli>;
-defm SQSHRN  : SIMDVectorRShiftNarrowBHS<0, 0b10010, "sqshrn",
-                                         int_arm64_neon_sqshrn>;
-defm SQSHRUN : SIMDVectorRShiftNarrowBHS<1, 0b10000, "sqshrun",
-                                         int_arm64_neon_sqshrun>;
-defm SRI     : SIMDVectorRShiftBHSDTied<1, 0b01000, "sri", int_arm64_neon_vsri>;
-def : Pat<(v1i64 (int_arm64_neon_vsri (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn),
-                                      (i32 vecshiftR64:$imm))),
-          (SRId FPR64:$Rd, FPR64:$Rn, vecshiftR64:$imm)>;
-defm SRSHR   : SIMDVectorRShiftBHSD<0, 0b00100, "srshr", ARM64srshri>;
-defm SRSRA   : SIMDVectorRShiftBHSDTied<0, 0b00110, "srsra",
-                 TriOpFrag<(add node:$LHS,
-                                (ARM64srshri node:$MHS, node:$RHS))> >;
-defm SSHLL   : SIMDVectorLShiftLongBHSD<0, 0b10100, "sshll",
-                BinOpFrag<(ARM64vshl (sext node:$LHS), node:$RHS)>>;
-
-defm SSHR    : SIMDVectorRShiftBHSD<0, 0b00000, "sshr", ARM64vashr>;
-defm SSRA    : SIMDVectorRShiftBHSDTied<0, 0b00010, "ssra",
-                TriOpFrag<(add node:$LHS, (ARM64vashr node:$MHS, node:$RHS))>>;
-defm UCVTF   : SIMDVectorRShiftSDToFP<1, 0b11100, "ucvtf",
-                        int_arm64_neon_vcvtfxu2fp>;
-defm UQRSHRN : SIMDVectorRShiftNarrowBHS<1, 0b10011, "uqrshrn",
-                                         int_arm64_neon_uqrshrn>;
-defm UQSHL   : SIMDVectorLShiftBHSD<1, 0b01110, "uqshl", ARM64uqshli>;
-defm UQSHRN  : SIMDVectorRShiftNarrowBHS<1, 0b10010, "uqshrn",
-                                         int_arm64_neon_uqshrn>;
-defm URSHR   : SIMDVectorRShiftBHSD<1, 0b00100, "urshr", ARM64urshri>;
-defm URSRA   : SIMDVectorRShiftBHSDTied<1, 0b00110, "ursra",
-                TriOpFrag<(add node:$LHS,
-                               (ARM64urshri node:$MHS, node:$RHS))> >;
-defm USHLL   : SIMDVectorLShiftLongBHSD<1, 0b10100, "ushll",
-                BinOpFrag<(ARM64vshl (zext node:$LHS), node:$RHS)>>;
-defm USHR    : SIMDVectorRShiftBHSD<1, 0b00000, "ushr", ARM64vlshr>;
-defm USRA    : SIMDVectorRShiftBHSDTied<1, 0b00010, "usra",
-                TriOpFrag<(add node:$LHS, (ARM64vlshr node:$MHS, node:$RHS))> >;
-
-// SHRN patterns for when a logical right shift was used instead of arithmetic
-// (the immediate guarantees no sign bits actually end up in the result so it
-// doesn't matter).
-def : Pat<(v8i8 (trunc (ARM64vlshr (v8i16 V128:$Rn), vecshiftR16Narrow:$imm))),
-          (SHRNv8i8_shift V128:$Rn, vecshiftR16Narrow:$imm)>;
-def : Pat<(v4i16 (trunc (ARM64vlshr (v4i32 V128:$Rn), vecshiftR32Narrow:$imm))),
-          (SHRNv4i16_shift V128:$Rn, vecshiftR32Narrow:$imm)>;
-def : Pat<(v2i32 (trunc (ARM64vlshr (v2i64 V128:$Rn), vecshiftR64Narrow:$imm))),
-          (SHRNv2i32_shift V128:$Rn, vecshiftR64Narrow:$imm)>;
-
-def : Pat<(v16i8 (concat_vectors (v8i8 V64:$Rd),
-                                 (trunc (ARM64vlshr (v8i16 V128:$Rn),
-                                                    vecshiftR16Narrow:$imm)))),
-          (SHRNv16i8_shift (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub),
-                           V128:$Rn, vecshiftR16Narrow:$imm)>;
-def : Pat<(v8i16 (concat_vectors (v4i16 V64:$Rd),
-                                 (trunc (ARM64vlshr (v4i32 V128:$Rn),
-                                                    vecshiftR32Narrow:$imm)))),
-          (SHRNv8i16_shift (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub),
-                           V128:$Rn, vecshiftR32Narrow:$imm)>;
-def : Pat<(v4i32 (concat_vectors (v2i32 V64:$Rd),
-                                 (trunc (ARM64vlshr (v2i64 V128:$Rn),
-                                                    vecshiftR64Narrow:$imm)))),
-          (SHRNv4i32_shift (INSERT_SUBREG (IMPLICIT_DEF), V64:$Rd, dsub),
-                           V128:$Rn, vecshiftR32Narrow:$imm)>;
-
-// Vector sign and zero extensions are implemented with SSHLL and USSHLL.
-// Anyexts are implemented as zexts.
-def : Pat<(v8i16 (sext   (v8i8 V64:$Rn))),  (SSHLLv8i8_shift  V64:$Rn, (i32 0))>;
-def : Pat<(v8i16 (zext   (v8i8 V64:$Rn))),  (USHLLv8i8_shift  V64:$Rn, (i32 0))>;
-def : Pat<(v8i16 (anyext (v8i8 V64:$Rn))),  (USHLLv8i8_shift  V64:$Rn, (i32 0))>;
-def : Pat<(v4i32 (sext   (v4i16 V64:$Rn))), (SSHLLv4i16_shift V64:$Rn, (i32 0))>;
-def : Pat<(v4i32 (zext   (v4i16 V64:$Rn))), (USHLLv4i16_shift V64:$Rn, (i32 0))>;
-def : Pat<(v4i32 (anyext (v4i16 V64:$Rn))), (USHLLv4i16_shift V64:$Rn, (i32 0))>;
-def : Pat<(v2i64 (sext   (v2i32 V64:$Rn))), (SSHLLv2i32_shift V64:$Rn, (i32 0))>;
-def : Pat<(v2i64 (zext   (v2i32 V64:$Rn))), (USHLLv2i32_shift V64:$Rn, (i32 0))>;
-def : Pat<(v2i64 (anyext (v2i32 V64:$Rn))), (USHLLv2i32_shift V64:$Rn, (i32 0))>;
-// Also match an extend from the upper half of a 128 bit source register.
-def : Pat<(v8i16 (anyext (v8i8 (extract_subvector V128:$Rn, (i64 8)) ))),
-          (USHLLv16i8_shift V128:$Rn, (i32 0))>;
-def : Pat<(v8i16 (zext   (v8i8 (extract_subvector V128:$Rn, (i64 8)) ))),
-          (USHLLv16i8_shift V128:$Rn, (i32 0))>;
-def : Pat<(v8i16 (sext   (v8i8 (extract_subvector V128:$Rn, (i64 8)) ))),
-          (SSHLLv16i8_shift V128:$Rn, (i32 0))>;
-def : Pat<(v4i32 (anyext (v4i16 (extract_subvector V128:$Rn, (i64 4)) ))),
-          (USHLLv8i16_shift V128:$Rn, (i32 0))>;
-def : Pat<(v4i32 (zext   (v4i16 (extract_subvector V128:$Rn, (i64 4)) ))),
-          (USHLLv8i16_shift V128:$Rn, (i32 0))>;
-def : Pat<(v4i32 (sext   (v4i16 (extract_subvector V128:$Rn, (i64 4)) ))),
-          (SSHLLv8i16_shift V128:$Rn, (i32 0))>;
-def : Pat<(v2i64 (anyext (v2i32 (extract_subvector V128:$Rn, (i64 2)) ))),
-          (USHLLv4i32_shift V128:$Rn, (i32 0))>;
-def : Pat<(v2i64 (zext   (v2i32 (extract_subvector V128:$Rn, (i64 2)) ))),
-          (USHLLv4i32_shift V128:$Rn, (i32 0))>;
-def : Pat<(v2i64 (sext   (v2i32 (extract_subvector V128:$Rn, (i64 2)) ))),
-          (SSHLLv4i32_shift V128:$Rn, (i32 0))>;
-
-// Vector shift sxtl aliases
-def : InstAlias<"sxtl.8h $dst, $src1",
-                (SSHLLv8i8_shift V128:$dst, V64:$src1, 0)>;
-def : InstAlias<"sxtl $dst.8h, $src1.8b",
-                (SSHLLv8i8_shift V128:$dst, V64:$src1, 0)>;
-def : InstAlias<"sxtl.4s $dst, $src1",
-                (SSHLLv4i16_shift V128:$dst, V64:$src1, 0)>;
-def : InstAlias<"sxtl $dst.4s, $src1.4h",
-                (SSHLLv4i16_shift V128:$dst, V64:$src1, 0)>;
-def : InstAlias<"sxtl.2d $dst, $src1",
-                (SSHLLv2i32_shift V128:$dst, V64:$src1, 0)>;
-def : InstAlias<"sxtl $dst.2d, $src1.2s",
-                (SSHLLv2i32_shift V128:$dst, V64:$src1, 0)>;
-
-// Vector shift sxtl2 aliases
-def : InstAlias<"sxtl2.8h $dst, $src1",
-                (SSHLLv16i8_shift V128:$dst, V128:$src1, 0)>;
-def : InstAlias<"sxtl2 $dst.8h, $src1.16b",
-                (SSHLLv16i8_shift V128:$dst, V128:$src1, 0)>;
-def : InstAlias<"sxtl2.4s $dst, $src1",
-                (SSHLLv8i16_shift V128:$dst, V128:$src1, 0)>;
-def : InstAlias<"sxtl2 $dst.4s, $src1.8h",
-                (SSHLLv8i16_shift V128:$dst, V128:$src1, 0)>;
-def : InstAlias<"sxtl2.2d $dst, $src1",
-                (SSHLLv4i32_shift V128:$dst, V128:$src1, 0)>;
-def : InstAlias<"sxtl2 $dst.2d, $src1.4s",
-                (SSHLLv4i32_shift V128:$dst, V128:$src1, 0)>;
-
-// Vector shift uxtl aliases
-def : InstAlias<"uxtl.8h $dst, $src1",
-                (USHLLv8i8_shift V128:$dst, V64:$src1, 0)>;
-def : InstAlias<"uxtl $dst.8h, $src1.8b",
-                (USHLLv8i8_shift V128:$dst, V64:$src1, 0)>;
-def : InstAlias<"uxtl.4s $dst, $src1",
-                (USHLLv4i16_shift V128:$dst, V64:$src1, 0)>;
-def : InstAlias<"uxtl $dst.4s, $src1.4h",
-                (USHLLv4i16_shift V128:$dst, V64:$src1, 0)>;
-def : InstAlias<"uxtl.2d $dst, $src1",
-                (USHLLv2i32_shift V128:$dst, V64:$src1, 0)>;
-def : InstAlias<"uxtl $dst.2d, $src1.2s",
-                (USHLLv2i32_shift V128:$dst, V64:$src1, 0)>;
-
-// Vector shift uxtl2 aliases
-def : InstAlias<"uxtl2.8h $dst, $src1",
-                (USHLLv16i8_shift V128:$dst, V128:$src1, 0)>;
-def : InstAlias<"uxtl2 $dst.8h, $src1.16b",
-                (USHLLv16i8_shift V128:$dst, V128:$src1, 0)>;
-def : InstAlias<"uxtl2.4s $dst, $src1",
-                (USHLLv8i16_shift V128:$dst, V128:$src1, 0)>;
-def : InstAlias<"uxtl2 $dst.4s, $src1.8h",
-                (USHLLv8i16_shift V128:$dst, V128:$src1, 0)>;
-def : InstAlias<"uxtl2.2d $dst, $src1",
-                (USHLLv4i32_shift V128:$dst, V128:$src1, 0)>;
-def : InstAlias<"uxtl2 $dst.2d, $src1.4s",
-                (USHLLv4i32_shift V128:$dst, V128:$src1, 0)>;
-
-// If an integer is about to be converted to a floating point value,
-// just load it on the floating point unit.
-// These patterns are more complex because floating point loads do not
-// support sign extension.
-// The sign extension has to be explicitly added and is only supported for
-// one step: byte-to-half, half-to-word, word-to-doubleword.
-// SCVTF GPR -> FPR is 9 cycles.
-// SCVTF FPR -> FPR is 4 cyclces.
-// (sign extension with lengthen) SXTL FPR -> FPR is 2 cycles.
-// Therefore, we can do 2 sign extensions and one SCVTF FPR -> FPR
-// and still being faster.
-// However, this is not good for code size.
-// 8-bits -> float. 2 sizes step-up.
-def : Pat <(f32 (sint_to_fp (i32 (sextloadi8 ro_indexed8:$addr)))),
-           (SCVTFv1i32 (f32 (EXTRACT_SUBREG
-                              (SSHLLv4i16_shift
-                                (f64
-                                  (EXTRACT_SUBREG
-                                    (SSHLLv8i8_shift
-                                      (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
-                                                  (LDRBro ro_indexed8:$addr),
-                                                  bsub),
-                                     0),
-                                   dsub)),
-                               0),
-                           ssub)))>, Requires<[NotForCodeSize]>;
-def : Pat <(f32 (sint_to_fp (i32 (sextloadi8 am_indexed8:$addr)))),
-           (SCVTFv1i32 (f32 (EXTRACT_SUBREG
-                              (SSHLLv4i16_shift
-                                (f64
-                                  (EXTRACT_SUBREG
-                                    (SSHLLv8i8_shift
-                                      (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
-                                                  (LDRBui am_indexed8:$addr),
-                                                  bsub),
-                                     0),
-                                   dsub)),
-                               0),
-                           ssub)))>, Requires<[NotForCodeSize]>;
-def : Pat <(f32 (sint_to_fp (i32 (sextloadi8 am_unscaled8:$addr)))),
-           (SCVTFv1i32 (f32 (EXTRACT_SUBREG
-                              (SSHLLv4i16_shift
-                                (f64
-                                  (EXTRACT_SUBREG
-                                    (SSHLLv8i8_shift
-                                      (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
-                                                  (LDURBi am_unscaled8:$addr),
-                                                  bsub),
-                                     0),
-                                   dsub)),
-                               0),
-                           ssub)))>, Requires<[NotForCodeSize]>;
-// 16-bits -> float. 1 size step-up.
-def : Pat <(f32 (sint_to_fp (i32 (sextloadi16 ro_indexed16:$addr)))),
-           (SCVTFv1i32 (f32 (EXTRACT_SUBREG
-                              (SSHLLv4i16_shift
-                                (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
-                                               (LDRHro ro_indexed16:$addr),
-                                               hsub),
-                               0),
-                           ssub)))>, Requires<[NotForCodeSize]>;
-def : Pat <(f32 (sint_to_fp (i32 (sextloadi16 am_indexed16:$addr)))),
-           (SCVTFv1i32 (f32 (EXTRACT_SUBREG
-                              (SSHLLv4i16_shift
-                                (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
-                                               (LDRHui am_indexed16:$addr),
-                                               hsub),
-                               0),
-                           ssub)))>, Requires<[NotForCodeSize]>;
-def : Pat <(f32 (sint_to_fp (i32 (sextloadi16 am_unscaled16:$addr)))),
-           (SCVTFv1i32 (f32 (EXTRACT_SUBREG
-                              (SSHLLv4i16_shift
-                                (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
-                                               (LDURHi am_unscaled16:$addr),
-                                               hsub),
-                               0),
-                           ssub)))>, Requires<[NotForCodeSize]>;
-// 32-bits to 32-bits are handled in target specific dag combine:
-// performIntToFpCombine.
-// 64-bits integer to 32-bits floating point, not possible with
-// SCVTF on floating point registers (both source and destination
-// must have the same size).
-
-// Here are the patterns for 8, 16, 32, and 64-bits to double.
-// 8-bits -> double. 3 size step-up: give up.
-// 16-bits -> double. 2 size step.
-def : Pat <(f64 (sint_to_fp (i32 (sextloadi16 ro_indexed16:$addr)))),
-           (SCVTFv1i64 (f64 (EXTRACT_SUBREG
-                              (SSHLLv2i32_shift
-                                 (f64
-                                  (EXTRACT_SUBREG
-                                    (SSHLLv4i16_shift
-                                      (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
-                                                  (LDRHro ro_indexed16:$addr),
-                                                  hsub),
-                                     0),
-                                   dsub)),
-                               0),
-                             dsub)))>, Requires<[NotForCodeSize]>;
-def : Pat <(f64 (sint_to_fp (i32 (sextloadi16 am_indexed16:$addr)))),
-           (SCVTFv1i64  (f64 (EXTRACT_SUBREG
-                               (SSHLLv2i32_shift
-                                 (f64
-                                   (EXTRACT_SUBREG
-                                     (SSHLLv4i16_shift
-                                       (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
-                                                  (LDRHui am_indexed16:$addr),
-                                                  hsub),
-                                      0),
-                                    dsub)),
-                                 0),
-                              dsub)))>, Requires<[NotForCodeSize]>;
-def : Pat <(f64 (sint_to_fp (i32 (sextloadi16 am_unscaled16:$addr)))),
-           (SCVTFv1i64 (f64 (EXTRACT_SUBREG
-                              (SSHLLv2i32_shift
-                                (f64
-                                  (EXTRACT_SUBREG
-                                    (SSHLLv4i16_shift
-                                     (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
-                                                  (LDURHi am_unscaled16:$addr),
-                                                  hsub),
-                                      0),
-                                   dsub)),
-                               0),
-                             dsub)))>, Requires<[NotForCodeSize]>;
-// 32-bits -> double. 1 size step-up.
-def : Pat <(f64 (sint_to_fp (i32 (load ro_indexed32:$addr)))),
-           (SCVTFv1i64 (f64 (EXTRACT_SUBREG
-                              (SSHLLv2i32_shift
-                                (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
-                                               (LDRSro ro_indexed32:$addr),
-                                               ssub),
-                               0),
-                             dsub)))>, Requires<[NotForCodeSize]>;
-def : Pat <(f64 (sint_to_fp (i32 (load am_indexed32:$addr)))),
-           (SCVTFv1i64 (f64 (EXTRACT_SUBREG
-                              (SSHLLv2i32_shift
-                                (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
-                                               (LDRSui am_indexed32:$addr),
-                                               ssub),
-                               0),
-                             dsub)))>, Requires<[NotForCodeSize]>;
-def : Pat <(f64 (sint_to_fp (i32 (load am_unscaled32:$addr)))),
-           (SCVTFv1i64 (f64 (EXTRACT_SUBREG
-                              (SSHLLv2i32_shift
-                                (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
-                                               (LDURSi am_unscaled32:$addr),
-                                               ssub),
-                               0),
-                             dsub)))>, Requires<[NotForCodeSize]>;
-// 64-bits -> double are handled in target specific dag combine:
-// performIntToFpCombine.
-
-
-//----------------------------------------------------------------------------
-// AdvSIMD Load-Store Structure
-//----------------------------------------------------------------------------
-defm LD1 : SIMDLd1Multiple<"ld1">;
-defm LD2 : SIMDLd2Multiple<"ld2">;
-defm LD3 : SIMDLd3Multiple<"ld3">;
-defm LD4 : SIMDLd4Multiple<"ld4">;
-
-defm ST1 : SIMDSt1Multiple<"st1">;
-defm ST2 : SIMDSt2Multiple<"st2">;
-defm ST3 : SIMDSt3Multiple<"st3">;
-defm ST4 : SIMDSt4Multiple<"st4">;
-
-class Ld1Pat<ValueType ty, Instruction INST>
-  : Pat<(ty (load am_simdnoindex:$vaddr)), (INST am_simdnoindex:$vaddr)>;
-
-def : Ld1Pat<v16i8, LD1Onev16b>;
-def : Ld1Pat<v8i16, LD1Onev8h>;
-def : Ld1Pat<v4i32, LD1Onev4s>;
-def : Ld1Pat<v2i64, LD1Onev2d>;
-def : Ld1Pat<v8i8,  LD1Onev8b>;
-def : Ld1Pat<v4i16, LD1Onev4h>;
-def : Ld1Pat<v2i32, LD1Onev2s>;
-def : Ld1Pat<v1i64, LD1Onev1d>;
-
-class St1Pat<ValueType ty, Instruction INST>
-  : Pat<(store ty:$Vt, am_simdnoindex:$vaddr),
-        (INST ty:$Vt, am_simdnoindex:$vaddr)>;
-
-def : St1Pat<v16i8, ST1Onev16b>;
-def : St1Pat<v8i16, ST1Onev8h>;
-def : St1Pat<v4i32, ST1Onev4s>;
-def : St1Pat<v2i64, ST1Onev2d>;
-def : St1Pat<v8i8,  ST1Onev8b>;
-def : St1Pat<v4i16, ST1Onev4h>;
-def : St1Pat<v2i32, ST1Onev2s>;
-def : St1Pat<v1i64, ST1Onev1d>;
-
-//---
-// Single-element
-//---
-
-defm LD1R          : SIMDLdR<0, 0b110, 0, "ld1r", "One", 1, 2, 4, 8>;
-defm LD2R          : SIMDLdR<1, 0b110, 0, "ld2r", "Two", 2, 4, 8, 16>;
-defm LD3R          : SIMDLdR<0, 0b111, 0, "ld3r", "Three", 3, 6, 12, 24>;
-defm LD4R          : SIMDLdR<1, 0b111, 0, "ld4r", "Four", 4, 8, 16, 32>;
-let mayLoad = 1, neverHasSideEffects = 1 in {
-defm LD1 : SIMDLdSingleBTied<0, 0b000,       "ld1", VecListOneb,   GPR64pi1>;
-defm LD1 : SIMDLdSingleHTied<0, 0b010, 0,    "ld1", VecListOneh,   GPR64pi2>;
-defm LD1 : SIMDLdSingleSTied<0, 0b100, 0b00, "ld1", VecListOnes,   GPR64pi4>;
-defm LD1 : SIMDLdSingleDTied<0, 0b100, 0b01, "ld1", VecListOned,   GPR64pi8>;
-defm LD2 : SIMDLdSingleBTied<1, 0b000,       "ld2", VecListTwob,   GPR64pi2>;
-defm LD2 : SIMDLdSingleHTied<1, 0b010, 0,    "ld2", VecListTwoh,   GPR64pi4>;
-defm LD2 : SIMDLdSingleSTied<1, 0b100, 0b00, "ld2", VecListTwos,   GPR64pi8>;
-defm LD2 : SIMDLdSingleDTied<1, 0b100, 0b01, "ld2", VecListTwod,   GPR64pi16>;
-defm LD3 : SIMDLdSingleBTied<0, 0b001,       "ld3", VecListThreeb, GPR64pi3>;
-defm LD3 : SIMDLdSingleHTied<0, 0b011, 0,    "ld3", VecListThreeh, GPR64pi6>;
-defm LD3 : SIMDLdSingleSTied<0, 0b101, 0b00, "ld3", VecListThrees, GPR64pi12>;
-defm LD3 : SIMDLdSingleDTied<0, 0b101, 0b01, "ld3", VecListThreed, GPR64pi24>;
-defm LD4 : SIMDLdSingleBTied<1, 0b001,       "ld4", VecListFourb,  GPR64pi4>;
-defm LD4 : SIMDLdSingleHTied<1, 0b011, 0,    "ld4", VecListFourh,  GPR64pi8>;
-defm LD4 : SIMDLdSingleSTied<1, 0b101, 0b00, "ld4", VecListFours,  GPR64pi16>;
-defm LD4 : SIMDLdSingleDTied<1, 0b101, 0b01, "ld4", VecListFourd,  GPR64pi32>;
-}
-
-def : Pat<(v8i8 (ARM64dup (i32 (extloadi8 am_simdnoindex:$vaddr)))),
-          (LD1Rv8b am_simdnoindex:$vaddr)>;
-def : Pat<(v16i8 (ARM64dup (i32 (extloadi8 am_simdnoindex:$vaddr)))),
-          (LD1Rv16b am_simdnoindex:$vaddr)>;
-def : Pat<(v4i16 (ARM64dup (i32 (extloadi16 am_simdnoindex:$vaddr)))),
-          (LD1Rv4h am_simdnoindex:$vaddr)>;
-def : Pat<(v8i16 (ARM64dup (i32 (extloadi16 am_simdnoindex:$vaddr)))),
-          (LD1Rv8h am_simdnoindex:$vaddr)>;
-def : Pat<(v2i32 (ARM64dup (i32 (load am_simdnoindex:$vaddr)))),
-          (LD1Rv2s am_simdnoindex:$vaddr)>;
-def : Pat<(v4i32 (ARM64dup (i32 (load am_simdnoindex:$vaddr)))),
-          (LD1Rv4s am_simdnoindex:$vaddr)>;
-def : Pat<(v2i64 (ARM64dup (i64 (load am_simdnoindex:$vaddr)))),
-          (LD1Rv2d am_simdnoindex:$vaddr)>;
-def : Pat<(v1i64 (ARM64dup (i64 (load am_simdnoindex:$vaddr)))),
-          (LD1Rv1d am_simdnoindex:$vaddr)>;
-// Grab the floating point version too
-def : Pat<(v2f32 (ARM64dup (f32 (load am_simdnoindex:$vaddr)))),
-          (LD1Rv2s am_simdnoindex:$vaddr)>;
-def : Pat<(v4f32 (ARM64dup (f32 (load am_simdnoindex:$vaddr)))),
-          (LD1Rv4s am_simdnoindex:$vaddr)>;
-def : Pat<(v2f64 (ARM64dup (f64 (load am_simdnoindex:$vaddr)))),
-          (LD1Rv2d am_simdnoindex:$vaddr)>;
-def : Pat<(v1f64 (ARM64dup (f64 (load am_simdnoindex:$vaddr)))),
-          (LD1Rv1d am_simdnoindex:$vaddr)>;
-
-class Ld1Lane128Pat<SDPatternOperator scalar_load, Operand VecIndex,
-                    ValueType VTy, ValueType STy, Instruction LD1>
-  : Pat<(vector_insert (VTy VecListOne128:$Rd),
-           (STy (scalar_load am_simdnoindex:$vaddr)), VecIndex:$idx),
-        (LD1 VecListOne128:$Rd, VecIndex:$idx, am_simdnoindex:$vaddr)>;
-
-def : Ld1Lane128Pat<extloadi8,  VectorIndexB, v16i8, i32, LD1i8>;
-def : Ld1Lane128Pat<extloadi16, VectorIndexH, v8i16, i32, LD1i16>;
-def : Ld1Lane128Pat<load,       VectorIndexS, v4i32, i32, LD1i32>;
-def : Ld1Lane128Pat<load,       VectorIndexS, v4f32, f32, LD1i32>;
-def : Ld1Lane128Pat<load,       VectorIndexD, v2i64, i64, LD1i64>;
-def : Ld1Lane128Pat<load,       VectorIndexD, v2f64, f64, LD1i64>;
-
-class Ld1Lane64Pat<SDPatternOperator scalar_load, Operand VecIndex,
-                   ValueType VTy, ValueType STy, Instruction LD1>
-  : Pat<(vector_insert (VTy VecListOne64:$Rd),
-           (STy (scalar_load am_simdnoindex:$vaddr)), VecIndex:$idx),
-        (EXTRACT_SUBREG
-            (LD1 (SUBREG_TO_REG (i32 0), VecListOne64:$Rd, dsub),
-                          VecIndex:$idx, am_simdnoindex:$vaddr),
-            dsub)>;
-
-def : Ld1Lane64Pat<extloadi8,  VectorIndexB, v8i8,  i32, LD1i8>;
-def : Ld1Lane64Pat<extloadi16, VectorIndexH, v4i16, i32, LD1i16>;
-def : Ld1Lane64Pat<load,       VectorIndexS, v2i32, i32, LD1i32>;
-def : Ld1Lane64Pat<load,       VectorIndexS, v2f32, f32, LD1i32>;
-
-
-defm LD1 : SIMDLdSt1SingleAliases<"ld1">;
-defm LD2 : SIMDLdSt2SingleAliases<"ld2">;
-defm LD3 : SIMDLdSt3SingleAliases<"ld3">;
-defm LD4 : SIMDLdSt4SingleAliases<"ld4">;
-
-// Stores
-defm ST1 : SIMDStSingleB<0, 0b000,       "st1", VecListOneb, GPR64pi1>;
-defm ST1 : SIMDStSingleH<0, 0b010, 0,    "st1", VecListOneh, GPR64pi2>;
-defm ST1 : SIMDStSingleS<0, 0b100, 0b00, "st1", VecListOnes, GPR64pi4>;
-defm ST1 : SIMDStSingleD<0, 0b100, 0b01, "st1", VecListOned, GPR64pi8>;
-
-let AddedComplexity = 8 in
-class St1Lane128Pat<SDPatternOperator scalar_store, Operand VecIndex,
-                    ValueType VTy, ValueType STy, Instruction ST1>
-  : Pat<(scalar_store
-             (STy (vector_extract (VTy VecListOne128:$Vt), VecIndex:$idx)),
-             am_simdnoindex:$vaddr),
-        (ST1 VecListOne128:$Vt, VecIndex:$idx, am_simdnoindex:$vaddr)>;
-
-def : St1Lane128Pat<truncstorei8,  VectorIndexB, v16i8, i32, ST1i8>;
-def : St1Lane128Pat<truncstorei16, VectorIndexH, v8i16, i32, ST1i16>;
-def : St1Lane128Pat<store,         VectorIndexS, v4i32, i32, ST1i32>;
-def : St1Lane128Pat<store,         VectorIndexS, v4f32, f32, ST1i32>;
-def : St1Lane128Pat<store,         VectorIndexD, v2i64, i64, ST1i64>;
-def : St1Lane128Pat<store,         VectorIndexD, v2f64, f64, ST1i64>;
-
-let AddedComplexity = 8 in
-class St1Lane64Pat<SDPatternOperator scalar_store, Operand VecIndex,
-                   ValueType VTy, ValueType STy, Instruction ST1>
-  : Pat<(scalar_store
-             (STy (vector_extract (VTy VecListOne64:$Vt), VecIndex:$idx)),
-             am_simdnoindex:$vaddr),
-        (ST1 (SUBREG_TO_REG (i32 0), VecListOne64:$Vt, dsub),
-             VecIndex:$idx, am_simdnoindex:$vaddr)>;
-
-def : St1Lane64Pat<truncstorei8,  VectorIndexB, v8i8, i32, ST1i8>;
-def : St1Lane64Pat<truncstorei16, VectorIndexH, v4i16, i32, ST1i16>;
-def : St1Lane64Pat<store,         VectorIndexS, v2i32, i32, ST1i32>;
-def : St1Lane64Pat<store,         VectorIndexS, v2f32, f32, ST1i32>;
-
-let mayStore = 1, neverHasSideEffects = 1 in {
-defm ST2 : SIMDStSingleB<1, 0b000,       "st2", VecListTwob,   GPR64pi2>;
-defm ST2 : SIMDStSingleH<1, 0b010, 0,    "st2", VecListTwoh,   GPR64pi4>;
-defm ST2 : SIMDStSingleS<1, 0b100, 0b00, "st2", VecListTwos,   GPR64pi8>;
-defm ST2 : SIMDStSingleD<1, 0b100, 0b01, "st2", VecListTwod,   GPR64pi16>;
-defm ST3 : SIMDStSingleB<0, 0b001,       "st3", VecListThreeb, GPR64pi3>;
-defm ST3 : SIMDStSingleH<0, 0b011, 0,    "st3", VecListThreeh, GPR64pi6>;
-defm ST3 : SIMDStSingleS<0, 0b101, 0b00, "st3", VecListThrees, GPR64pi12>;
-defm ST3 : SIMDStSingleD<0, 0b101, 0b01, "st3", VecListThreed, GPR64pi24>;
-defm ST4 : SIMDStSingleB<1, 0b001,       "st4", VecListFourb,  GPR64pi4>;
-defm ST4 : SIMDStSingleH<1, 0b011, 0,    "st4", VecListFourh,  GPR64pi8>;
-defm ST4 : SIMDStSingleS<1, 0b101, 0b00, "st4", VecListFours,  GPR64pi16>;
-defm ST4 : SIMDStSingleD<1, 0b101, 0b01, "st4", VecListFourd,  GPR64pi32>;
-}
-
-defm ST1 : SIMDLdSt1SingleAliases<"st1">;
-defm ST2 : SIMDLdSt2SingleAliases<"st2">;
-defm ST3 : SIMDLdSt3SingleAliases<"st3">;
-defm ST4 : SIMDLdSt4SingleAliases<"st4">;
-
-//----------------------------------------------------------------------------
-// Crypto extensions
-//----------------------------------------------------------------------------
-
-def AESErr   : AESTiedInst<0b0100, "aese",   int_arm64_crypto_aese>;
-def AESDrr   : AESTiedInst<0b0101, "aesd",   int_arm64_crypto_aesd>;
-def AESMCrr  : AESInst<    0b0110, "aesmc",  int_arm64_crypto_aesmc>;
-def AESIMCrr : AESInst<    0b0111, "aesimc", int_arm64_crypto_aesimc>;
-
-def SHA1Crrr     : SHATiedInstQSV<0b000, "sha1c",   int_arm64_crypto_sha1c>;
-def SHA1Prrr     : SHATiedInstQSV<0b001, "sha1p",   int_arm64_crypto_sha1p>;
-def SHA1Mrrr     : SHATiedInstQSV<0b010, "sha1m",   int_arm64_crypto_sha1m>;
-def SHA1SU0rrr   : SHATiedInstVVV<0b011, "sha1su0", int_arm64_crypto_sha1su0>;
-def SHA256Hrrr   : SHATiedInstQQV<0b100, "sha256h", int_arm64_crypto_sha256h>;
-def SHA256H2rrr  : SHATiedInstQQV<0b101, "sha256h2",int_arm64_crypto_sha256h2>;
-def SHA256SU1rrr :SHATiedInstVVV<0b110, "sha256su1",int_arm64_crypto_sha256su1>;
-
-def SHA1Hrr     : SHAInstSS<    0b0000, "sha1h",    int_arm64_crypto_sha1h>;
-def SHA1SU1rr   : SHATiedInstVV<0b0001, "sha1su1",  int_arm64_crypto_sha1su1>;
-def SHA256SU0rr : SHATiedInstVV<0b0010, "sha256su0",int_arm64_crypto_sha256su0>;
-
-//----------------------------------------------------------------------------
-// Compiler-pseudos
-//----------------------------------------------------------------------------
-// FIXME: Like for X86, these should go in their own separate .td file.
-
-// Any instruction that defines a 32-bit result leaves the high half of the
-// register. Truncate can be lowered to EXTRACT_SUBREG. CopyFromReg may
-// be copying from a truncate. But any other 32-bit operation will zero-extend
-// up to 64 bits.
-// FIXME: X86 also checks for CMOV here. Do we need something similar?
-def def32 : PatLeaf<(i32 GPR32:$src), [{
-  return N->getOpcode() != ISD::TRUNCATE &&
-         N->getOpcode() != TargetOpcode::EXTRACT_SUBREG &&
-         N->getOpcode() != ISD::CopyFromReg;
-}]>;
-
-// In the case of a 32-bit def that is known to implicitly zero-extend,
-// we can use a SUBREG_TO_REG.
-def : Pat<(i64 (zext def32:$src)), (SUBREG_TO_REG (i64 0), GPR32:$src, sub_32)>;
-
-// For an anyext, we don't care what the high bits are, so we can perform an
-// INSERT_SUBREF into an IMPLICIT_DEF.
-def : Pat<(i64 (anyext GPR32:$src)),
-          (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$src, sub_32)>;
-
-// When we need to explicitly zero-extend, we use an unsigned bitfield move
-// instruction (UBFM) on the enclosing super-reg.
-def : Pat<(i64 (zext GPR32:$src)),
- (UBFMXri (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$src, sub_32), 0, 31)>;
-
-// To sign extend, we use a signed bitfield move instruction (SBFM) on the
-// containing super-reg.
-def : Pat<(i64 (sext GPR32:$src)),
-   (SBFMXri (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$src, sub_32), 0, 31)>;
-def : Pat<(i64 (sext_inreg GPR64:$src, i32)), (SBFMXri GPR64:$src, 0, 31)>;
-def : Pat<(i64 (sext_inreg GPR64:$src, i16)), (SBFMXri GPR64:$src, 0, 15)>;
-def : Pat<(i64 (sext_inreg GPR64:$src, i8)),  (SBFMXri GPR64:$src, 0, 7)>;
-def : Pat<(i64 (sext_inreg GPR64:$src, i1)),  (SBFMXri GPR64:$src, 0, 0)>;
-def : Pat<(i32 (sext_inreg GPR32:$src, i16)), (SBFMWri GPR32:$src, 0, 15)>;
-def : Pat<(i32 (sext_inreg GPR32:$src, i8)),  (SBFMWri GPR32:$src, 0, 7)>;
-def : Pat<(i32 (sext_inreg GPR32:$src, i1)),  (SBFMWri GPR32:$src, 0, 0)>;
-
-def : Pat<(shl (sext_inreg GPR32:$Rn, i8), (i64 imm0_31:$imm)),
-          (SBFMWri GPR32:$Rn, (i64 (i32shift_a       imm0_31:$imm)),
-                              (i64 (i32shift_sext_i8 imm0_31:$imm)))>;
-def : Pat<(shl (sext_inreg GPR64:$Rn, i8), (i64 imm0_63:$imm)),
-          (SBFMXri GPR64:$Rn, (i64 (i64shift_a imm0_63:$imm)),
-                              (i64 (i64shift_sext_i8 imm0_63:$imm)))>;
-
-def : Pat<(shl (sext_inreg GPR32:$Rn, i16), (i64 imm0_31:$imm)),
-          (SBFMWri GPR32:$Rn, (i64 (i32shift_a        imm0_31:$imm)),
-                              (i64 (i32shift_sext_i16 imm0_31:$imm)))>;
-def : Pat<(shl (sext_inreg GPR64:$Rn, i16), (i64 imm0_63:$imm)),
-          (SBFMXri GPR64:$Rn, (i64 (i64shift_a        imm0_63:$imm)),
-                              (i64 (i64shift_sext_i16 imm0_63:$imm)))>;
-
-def : Pat<(shl (i64 (sext GPR32:$Rn)), (i64 imm0_63:$imm)),
-          (SBFMXri (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$Rn, sub_32),
-                   (i64 (i64shift_a        imm0_63:$imm)),
-                   (i64 (i64shift_sext_i32 imm0_63:$imm)))>;
-
-// sra patterns have an AddedComplexity of 10, so make sure we have a higher
-// AddedComplexity for the following patterns since we want to match sext + sra
-// patterns before we attempt to match a single sra node.
-let AddedComplexity = 20 in {
-// We support all sext + sra combinations which preserve at least one bit of the
-// original value which is to be sign extended. E.g. we support shifts up to
-// bitwidth-1 bits.
-def : Pat<(sra (sext_inreg GPR32:$Rn, i8), (i64 imm0_7:$imm)),
-          (SBFMWri GPR32:$Rn, (i64 imm0_7:$imm), 7)>;
-def : Pat<(sra (sext_inreg GPR64:$Rn, i8), (i64 imm0_7:$imm)),
-          (SBFMXri GPR64:$Rn, (i64 imm0_7:$imm), 7)>;
-
-def : Pat<(sra (sext_inreg GPR32:$Rn, i16), (i64 imm0_15:$imm)),
-          (SBFMWri GPR32:$Rn, (i64 imm0_15:$imm), 15)>;
-def : Pat<(sra (sext_inreg GPR64:$Rn, i16), (i64 imm0_15:$imm)),
-          (SBFMXri GPR64:$Rn, (i64 imm0_15:$imm), 15)>;
-
-def : Pat<(sra (i64 (sext GPR32:$Rn)), (i64 imm0_31:$imm)),
-          (SBFMXri (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GPR32:$Rn, sub_32),
-                   (i64 imm0_31:$imm), 31)>;
-} // AddedComplexity = 20
-
-// To truncate, we can simply extract from a subregister.
-def : Pat<(i32 (trunc GPR64sp:$src)),
-          (i32 (EXTRACT_SUBREG GPR64sp:$src, sub_32))>;
-
-// __builtin_trap() uses the BRK instruction on ARM64.
-def : Pat<(trap), (BRK 1)>;
-
-// Conversions within AdvSIMD types in the same register size are free.
-
-def : Pat<(v1i64 (bitconvert (v2i32 FPR64:$src))), (v1i64 FPR64:$src)>;
-def : Pat<(v1i64 (bitconvert (v4i16 FPR64:$src))), (v1i64 FPR64:$src)>;
-def : Pat<(v1i64 (bitconvert (v8i8  FPR64:$src))), (v1i64 FPR64:$src)>;
-def : Pat<(v1i64 (bitconvert (f64   FPR64:$src))), (v1i64 FPR64:$src)>;
-def : Pat<(v1i64 (bitconvert (v2f32 FPR64:$src))), (v1i64 FPR64:$src)>;
-def : Pat<(v1i64 (bitconvert (v1f64 FPR64:$src))), (v1i64 FPR64:$src)>;
-
-def : Pat<(v2i32 (bitconvert (v1i64 FPR64:$src))), (v2i32 FPR64:$src)>;
-def : Pat<(v2i32 (bitconvert (v4i16 FPR64:$src))), (v2i32 FPR64:$src)>;
-def : Pat<(v2i32 (bitconvert (v8i8  FPR64:$src))), (v2i32 FPR64:$src)>;
-def : Pat<(v2i32 (bitconvert (f64   FPR64:$src))), (v2i32 FPR64:$src)>;
-def : Pat<(v2i32 (bitconvert (v2f32 FPR64:$src))), (v2i32 FPR64:$src)>;
-def : Pat<(v2i32 (bitconvert (v1f64 FPR64:$src))), (v2i32 FPR64:$src)>;
-
-def : Pat<(v4i16 (bitconvert (v1i64 FPR64:$src))), (v4i16 FPR64:$src)>;
-def : Pat<(v4i16 (bitconvert (v2i32 FPR64:$src))), (v4i16 FPR64:$src)>;
-def : Pat<(v4i16 (bitconvert (v8i8  FPR64:$src))), (v4i16 FPR64:$src)>;
-def : Pat<(v4i16 (bitconvert (f64   FPR64:$src))), (v4i16 FPR64:$src)>;
-def : Pat<(v4i16 (bitconvert (v2f32 FPR64:$src))), (v4i16 FPR64:$src)>;
-def : Pat<(v4i16 (bitconvert (v1f64 FPR64:$src))), (v4i16 FPR64:$src)>;
-
-def : Pat<(v8i8  (bitconvert (v1i64 FPR64:$src))), (v8i8  FPR64:$src)>;
-def : Pat<(v8i8  (bitconvert (v2i32 FPR64:$src))), (v8i8  FPR64:$src)>;
-def : Pat<(v8i8  (bitconvert (v4i16 FPR64:$src))), (v8i8  FPR64:$src)>;
-def : Pat<(v8i8  (bitconvert (f64   FPR64:$src))), (v8i8  FPR64:$src)>;
-def : Pat<(v8i8  (bitconvert (v2f32 FPR64:$src))), (v8i8  FPR64:$src)>;
-def : Pat<(v8i8  (bitconvert (v1f64 FPR64:$src))), (v8i8  FPR64:$src)>;
-
-def : Pat<(f64   (bitconvert (v1i64 FPR64:$src))), (f64   FPR64:$src)>;
-def : Pat<(f64   (bitconvert (v2i32 FPR64:$src))), (f64   FPR64:$src)>;
-def : Pat<(f64   (bitconvert (v4i16 FPR64:$src))), (f64   FPR64:$src)>;
-def : Pat<(f64   (bitconvert (v8i8  FPR64:$src))), (f64   FPR64:$src)>;
-def : Pat<(f64   (bitconvert (v2f32 FPR64:$src))), (f64   FPR64:$src)>;
-def : Pat<(f64   (bitconvert (v1f64 FPR64:$src))), (f64   FPR64:$src)>;
-
-def : Pat<(v1f64 (bitconvert (v1i64 FPR64:$src))), (v1f64 FPR64:$src)>;
-def : Pat<(v1f64 (bitconvert (v2i32 FPR64:$src))), (v1f64 FPR64:$src)>;
-def : Pat<(v1f64 (bitconvert (v4i16 FPR64:$src))), (v1f64 FPR64:$src)>;
-def : Pat<(v1f64 (bitconvert (v8i8  FPR64:$src))), (v1f64 FPR64:$src)>;
-def : Pat<(v1f64 (bitconvert (f64   FPR64:$src))), (v1f64 FPR64:$src)>;
-def : Pat<(v1f64 (bitconvert (v2f32 FPR64:$src))), (v1f64 FPR64:$src)>;
-
-def : Pat<(v2f32 (bitconvert (f64   FPR64:$src))), (v2f32 FPR64:$src)>;
-def : Pat<(v2f32 (bitconvert (v1i64 FPR64:$src))), (v2f32 FPR64:$src)>;
-def : Pat<(v2f32 (bitconvert (v2i32 FPR64:$src))), (v2f32 FPR64:$src)>;
-def : Pat<(v2f32 (bitconvert (v4i16 FPR64:$src))), (v2f32 FPR64:$src)>;
-def : Pat<(v2f32 (bitconvert (v8i8  FPR64:$src))), (v2f32 FPR64:$src)>;
-def : Pat<(v2f32 (bitconvert (v1f64 FPR64:$src))), (v2f32 FPR64:$src)>;
-
-
-def : Pat<(f128 (bitconvert (v2i64 FPR128:$src))), (f128 FPR128:$src)>;
-def : Pat<(f128 (bitconvert (v4i32 FPR128:$src))), (f128 FPR128:$src)>;
-def : Pat<(f128 (bitconvert (v8i16 FPR128:$src))), (f128 FPR128:$src)>;
-def : Pat<(f128 (bitconvert (v2f64 FPR128:$src))), (f128 FPR128:$src)>;
-def : Pat<(f128 (bitconvert (v4f32 FPR128:$src))), (f128 FPR128:$src)>;
-
-def : Pat<(v2f64 (bitconvert (f128  FPR128:$src))), (v2f64 FPR128:$src)>;
-def : Pat<(v2f64 (bitconvert (v4i32 FPR128:$src))), (v2f64 FPR128:$src)>;
-def : Pat<(v2f64 (bitconvert (v8i16 FPR128:$src))), (v2f64 FPR128:$src)>;
-def : Pat<(v2f64 (bitconvert (v16i8 FPR128:$src))), (v2f64 FPR128:$src)>;
-def : Pat<(v2f64 (bitconvert (v2i64 FPR128:$src))), (v2f64 FPR128:$src)>;
-def : Pat<(v2f64 (bitconvert (v4f32 FPR128:$src))), (v2f64 FPR128:$src)>;
-
-def : Pat<(v4f32 (bitconvert (f128  FPR128:$src))), (v4f32 FPR128:$src)>;
-def : Pat<(v4f32 (bitconvert (v4i32 FPR128:$src))), (v4f32 FPR128:$src)>;
-def : Pat<(v4f32 (bitconvert (v8i16 FPR128:$src))), (v4f32 FPR128:$src)>;
-def : Pat<(v4f32 (bitconvert (v16i8 FPR128:$src))), (v4f32 FPR128:$src)>;
-def : Pat<(v4f32 (bitconvert (v2i64 FPR128:$src))), (v4f32 FPR128:$src)>;
-def : Pat<(v4f32 (bitconvert (v2f64 FPR128:$src))), (v4f32 FPR128:$src)>;
-
-def : Pat<(v2i64 (bitconvert (f128  FPR128:$src))), (v2i64 FPR128:$src)>;
-def : Pat<(v2i64 (bitconvert (v4i32 FPR128:$src))), (v2i64 FPR128:$src)>;
-def : Pat<(v2i64 (bitconvert (v8i16 FPR128:$src))), (v2i64 FPR128:$src)>;
-def : Pat<(v2i64 (bitconvert (v16i8 FPR128:$src))), (v2i64 FPR128:$src)>;
-def : Pat<(v2i64 (bitconvert (v2f64 FPR128:$src))), (v2i64 FPR128:$src)>;
-def : Pat<(v2i64 (bitconvert (v4f32 FPR128:$src))), (v2i64 FPR128:$src)>;
-
-def : Pat<(v4i32 (bitconvert (f128  FPR128:$src))), (v4i32 FPR128:$src)>;
-def : Pat<(v4i32 (bitconvert (v2i64 FPR128:$src))), (v4i32 FPR128:$src)>;
-def : Pat<(v4i32 (bitconvert (v8i16 FPR128:$src))), (v4i32 FPR128:$src)>;
-def : Pat<(v4i32 (bitconvert (v16i8 FPR128:$src))), (v4i32 FPR128:$src)>;
-def : Pat<(v4i32 (bitconvert (v2f64 FPR128:$src))), (v4i32 FPR128:$src)>;
-def : Pat<(v4i32 (bitconvert (v4f32 FPR128:$src))), (v4i32 FPR128:$src)>;
-
-def : Pat<(v8i16 (bitconvert (f128  FPR128:$src))), (v8i16 FPR128:$src)>;
-def : Pat<(v8i16 (bitconvert (v2i64 FPR128:$src))), (v8i16 FPR128:$src)>;
-def : Pat<(v8i16 (bitconvert (v4i32 FPR128:$src))), (v8i16 FPR128:$src)>;
-def : Pat<(v8i16 (bitconvert (v16i8 FPR128:$src))), (v8i16 FPR128:$src)>;
-def : Pat<(v8i16 (bitconvert (v2f64 FPR128:$src))), (v8i16 FPR128:$src)>;
-def : Pat<(v8i16 (bitconvert (v4f32 FPR128:$src))), (v8i16 FPR128:$src)>;
-
-def : Pat<(v16i8 (bitconvert (f128  FPR128:$src))), (v16i8 FPR128:$src)>;
-def : Pat<(v16i8 (bitconvert (v2i64 FPR128:$src))), (v16i8 FPR128:$src)>;
-def : Pat<(v16i8 (bitconvert (v4i32 FPR128:$src))), (v16i8 FPR128:$src)>;
-def : Pat<(v16i8 (bitconvert (v8i16 FPR128:$src))), (v16i8 FPR128:$src)>;
-def : Pat<(v16i8 (bitconvert (v2f64 FPR128:$src))), (v16i8 FPR128:$src)>;
-def : Pat<(v16i8 (bitconvert (v4f32 FPR128:$src))), (v16i8 FPR128:$src)>;
-
-def : Pat<(v8i8 (extract_subvector (v16i8 FPR128:$Rn), (i64 1))),
-          (EXTRACT_SUBREG (DUPv2i64lane FPR128:$Rn, 1), dsub)>;
-def : Pat<(v4i16 (extract_subvector (v8i16 FPR128:$Rn), (i64 1))),
-          (EXTRACT_SUBREG (DUPv2i64lane FPR128:$Rn, 1), dsub)>;
-def : Pat<(v2i32 (extract_subvector (v4i32 FPR128:$Rn), (i64 1))),
-          (EXTRACT_SUBREG (DUPv2i64lane FPR128:$Rn, 1), dsub)>;
-def : Pat<(v1i64 (extract_subvector (v2i64 FPR128:$Rn), (i64 1))),
-          (EXTRACT_SUBREG (DUPv2i64lane FPR128:$Rn, 1), dsub)>;
-
-// A 64-bit subvector insert to the first 128-bit vector position
-// is a subregister copy that needs no instruction.
-def : Pat<(insert_subvector undef, (v1i64 FPR64:$src), (i32 0)),
-          (INSERT_SUBREG (v2i64 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
-def : Pat<(insert_subvector undef, (v1f64 FPR64:$src), (i32 0)),
-          (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
-def : Pat<(insert_subvector undef, (v2i32 FPR64:$src), (i32 0)),
-          (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
-def : Pat<(insert_subvector undef, (v2f32 FPR64:$src), (i32 0)),
-          (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
-def : Pat<(insert_subvector undef, (v4i16 FPR64:$src), (i32 0)),
-          (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
-def : Pat<(insert_subvector undef, (v8i8 FPR64:$src), (i32 0)),
-          (INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
-
-// Use pair-wise add instructions when summing up the lanes for v2f64, v2i64
-// or v2f32.
-def : Pat<(i64 (add (vector_extract (v2i64 FPR128:$Rn), (i64 0)),
-                    (vector_extract (v2i64 FPR128:$Rn), (i64 1)))),
-           (i64 (ADDPv2i64p (v2i64 FPR128:$Rn)))>;
-def : Pat<(f64 (fadd (vector_extract (v2f64 FPR128:$Rn), (i64 0)),
-                     (vector_extract (v2f64 FPR128:$Rn), (i64 1)))),
-           (f64 (FADDPv2i64p (v2f64 FPR128:$Rn)))>;
-    // vector_extract on 64-bit vectors gets promoted to a 128 bit vector,
-    // so we match on v4f32 here, not v2f32. This will also catch adding
-    // the low two lanes of a true v4f32 vector.
-def : Pat<(fadd (vector_extract (v4f32 FPR128:$Rn), (i64 0)),
-                (vector_extract (v4f32 FPR128:$Rn), (i64 1))),
-          (f32 (FADDPv2i32p (EXTRACT_SUBREG FPR128:$Rn, dsub)))>;
-
-// Scalar 64-bit shifts in FPR64 registers.
-def : Pat<(i64 (int_arm64_neon_sshl (i64 FPR64:$Rn), (i64 FPR64:$Rm))),
-          (SSHLv1i64 FPR64:$Rn, FPR64:$Rm)>;
-def : Pat<(i64 (int_arm64_neon_ushl (i64 FPR64:$Rn), (i64 FPR64:$Rm))),
-          (USHLv1i64 FPR64:$Rn, FPR64:$Rm)>;
-def : Pat<(i64 (int_arm64_neon_srshl (i64 FPR64:$Rn), (i64 FPR64:$Rm))),
-          (SRSHLv1i64 FPR64:$Rn, FPR64:$Rm)>;
-def : Pat<(i64 (int_arm64_neon_urshl (i64 FPR64:$Rn), (i64 FPR64:$Rm))),
-          (URSHLv1i64 FPR64:$Rn, FPR64:$Rm)>;
-
-// Tail call return handling. These are all compiler pseudo-instructions,
-// so no encoding information or anything like that.
-let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, Uses = [SP] in {
-  def TCRETURNdi : Pseudo<(outs), (ins i64imm:$dst), []>;
-  def TCRETURNri : Pseudo<(outs), (ins tcGPR64:$dst), []>;
-}
-
-def : Pat<(ARM64tcret tcGPR64:$dst), (TCRETURNri tcGPR64:$dst)>;
-def : Pat<(ARM64tcret (i64 tglobaladdr:$dst)), (TCRETURNdi texternalsym:$dst)>;
-def : Pat<(ARM64tcret (i64 texternalsym:$dst)), (TCRETURNdi texternalsym:$dst)>;
-
-include "ARM64InstrAtomics.td"
diff --git a/lib/Target/ARM64/ARM64MCInstLower.cpp b/lib/Target/ARM64/ARM64MCInstLower.cpp
deleted file mode 100644
index 01dc229..0000000
--- a/lib/Target/ARM64/ARM64MCInstLower.cpp
+++ /dev/null
@@ -1,201 +0,0 @@
-//===-- ARM64MCInstLower.cpp - Convert ARM64 MachineInstr to an MCInst---===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains code to lower ARM64 MachineInstrs to their corresponding
-// MCInst records.
-//
-//===----------------------------------------------------------------------===//
-
-#include "ARM64MCInstLower.h"
-#include "MCTargetDesc/ARM64BaseInfo.h"
-#include "MCTargetDesc/ARM64MCExpr.h"
-#include "llvm/CodeGen/AsmPrinter.h"
-#include "llvm/CodeGen/MachineBasicBlock.h"
-#include "llvm/CodeGen/MachineInstr.h"
-#include "llvm/IR/Mangler.h"
-#include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCInst.h"
-#include "llvm/Support/CodeGen.h"
-#include "llvm/Target/TargetMachine.h"
-using namespace llvm;
-
-ARM64MCInstLower::ARM64MCInstLower(MCContext &ctx, Mangler &mang,
-                                   AsmPrinter &printer)
-    : Ctx(ctx), Printer(printer), TargetTriple(printer.getTargetTriple()) {}
-
-MCSymbol *
-ARM64MCInstLower::GetGlobalAddressSymbol(const MachineOperand &MO) const {
-  return Printer.getSymbol(MO.getGlobal());
-}
-
-MCSymbol *
-ARM64MCInstLower::GetExternalSymbolSymbol(const MachineOperand &MO) const {
-  return Printer.GetExternalSymbolSymbol(MO.getSymbolName());
-}
-
-MCOperand ARM64MCInstLower::lowerSymbolOperandDarwin(const MachineOperand &MO,
-                                                     MCSymbol *Sym) const {
-  // FIXME: We would like an efficient form for this, so we don't have to do a
-  // lot of extra uniquing.
-  MCSymbolRefExpr::VariantKind RefKind = MCSymbolRefExpr::VK_None;
-  if ((MO.getTargetFlags() & ARM64II::MO_GOT) != 0) {
-    if ((MO.getTargetFlags() & ARM64II::MO_FRAGMENT) == ARM64II::MO_PAGE)
-      RefKind = MCSymbolRefExpr::VK_GOTPAGE;
-    else if ((MO.getTargetFlags() & ARM64II::MO_FRAGMENT) ==
-             ARM64II::MO_PAGEOFF)
-      RefKind = MCSymbolRefExpr::VK_GOTPAGEOFF;
-    else
-      assert(0 && "Unexpected target flags with MO_GOT on GV operand");
-  } else if ((MO.getTargetFlags() & ARM64II::MO_TLS) != 0) {
-    if ((MO.getTargetFlags() & ARM64II::MO_FRAGMENT) == ARM64II::MO_PAGE)
-      RefKind = MCSymbolRefExpr::VK_TLVPPAGE;
-    else if ((MO.getTargetFlags() & ARM64II::MO_FRAGMENT) ==
-             ARM64II::MO_PAGEOFF)
-      RefKind = MCSymbolRefExpr::VK_TLVPPAGEOFF;
-    else
-      llvm_unreachable("Unexpected target flags with MO_TLS on GV operand");
-  } else {
-    if ((MO.getTargetFlags() & ARM64II::MO_FRAGMENT) == ARM64II::MO_PAGE)
-      RefKind = MCSymbolRefExpr::VK_PAGE;
-    else if ((MO.getTargetFlags() & ARM64II::MO_FRAGMENT) ==
-             ARM64II::MO_PAGEOFF)
-      RefKind = MCSymbolRefExpr::VK_PAGEOFF;
-  }
-  const MCExpr *Expr = MCSymbolRefExpr::Create(Sym, RefKind, Ctx);
-  if (!MO.isJTI() && MO.getOffset())
-    Expr = MCBinaryExpr::CreateAdd(
-        Expr, MCConstantExpr::Create(MO.getOffset(), Ctx), Ctx);
-  return MCOperand::CreateExpr(Expr);
-}
-
-MCOperand ARM64MCInstLower::lowerSymbolOperandELF(const MachineOperand &MO,
-                                                  MCSymbol *Sym) const {
-  uint32_t RefFlags = 0;
-
-  if (MO.getTargetFlags() & ARM64II::MO_GOT)
-    RefFlags |= ARM64MCExpr::VK_GOT;
-  else if (MO.getTargetFlags() & ARM64II::MO_TLS) {
-    TLSModel::Model Model;
-    if (MO.isGlobal()) {
-      const GlobalValue *GV = MO.getGlobal();
-      Model = Printer.TM.getTLSModel(GV);
-    } else {
-      assert(MO.isSymbol() &&
-             StringRef(MO.getSymbolName()) == "_TLS_MODULE_BASE_" &&
-             "unexpected external TLS symbol");
-      Model = TLSModel::GeneralDynamic;
-    }
-    switch (Model) {
-    case TLSModel::InitialExec:
-      RefFlags |= ARM64MCExpr::VK_GOTTPREL;
-      break;
-    case TLSModel::LocalExec:
-      RefFlags |= ARM64MCExpr::VK_TPREL;
-      break;
-    case TLSModel::LocalDynamic:
-      RefFlags |= ARM64MCExpr::VK_DTPREL;
-      break;
-    case TLSModel::GeneralDynamic:
-      RefFlags |= ARM64MCExpr::VK_TLSDESC;
-      break;
-    }
-  } else {
-    // No modifier means this is a generic reference, classified as absolute for
-    // the cases where it matters (:abs_g0: etc).
-    RefFlags |= ARM64MCExpr::VK_ABS;
-  }
-
-  if ((MO.getTargetFlags() & ARM64II::MO_FRAGMENT) == ARM64II::MO_PAGE)
-    RefFlags |= ARM64MCExpr::VK_PAGE;
-  else if ((MO.getTargetFlags() & ARM64II::MO_FRAGMENT) == ARM64II::MO_PAGEOFF)
-    RefFlags |= ARM64MCExpr::VK_PAGEOFF;
-  else if ((MO.getTargetFlags() & ARM64II::MO_FRAGMENT) == ARM64II::MO_G3)
-    RefFlags |= ARM64MCExpr::VK_G3;
-  else if ((MO.getTargetFlags() & ARM64II::MO_FRAGMENT) == ARM64II::MO_G2)
-    RefFlags |= ARM64MCExpr::VK_G2;
-  else if ((MO.getTargetFlags() & ARM64II::MO_FRAGMENT) == ARM64II::MO_G1)
-    RefFlags |= ARM64MCExpr::VK_G1;
-  else if ((MO.getTargetFlags() & ARM64II::MO_FRAGMENT) == ARM64II::MO_G0)
-    RefFlags |= ARM64MCExpr::VK_G0;
-
-  if (MO.getTargetFlags() & ARM64II::MO_NC)
-    RefFlags |= ARM64MCExpr::VK_NC;
-
-  const MCExpr *Expr =
-      MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_None, Ctx);
-  if (!MO.isJTI() && MO.getOffset())
-    Expr = MCBinaryExpr::CreateAdd(
-        Expr, MCConstantExpr::Create(MO.getOffset(), Ctx), Ctx);
-
-  ARM64MCExpr::VariantKind RefKind;
-  RefKind = static_cast<ARM64MCExpr::VariantKind>(RefFlags);
-  Expr = ARM64MCExpr::Create(Expr, RefKind, Ctx);
-
-  return MCOperand::CreateExpr(Expr);
-}
-
-MCOperand ARM64MCInstLower::LowerSymbolOperand(const MachineOperand &MO,
-                                               MCSymbol *Sym) const {
-  if (TargetTriple.isOSDarwin())
-    return lowerSymbolOperandDarwin(MO, Sym);
-
-  assert(TargetTriple.isOSBinFormatELF() && "Expect Darwin or ELF target");
-  return lowerSymbolOperandELF(MO, Sym);
-}
-
-bool ARM64MCInstLower::lowerOperand(const MachineOperand &MO,
-                                    MCOperand &MCOp) const {
-  switch (MO.getType()) {
-  default:
-    assert(0 && "unknown operand type");
-  case MachineOperand::MO_Register:
-    // Ignore all implicit register operands.
-    if (MO.isImplicit())
-      return false;
-    MCOp = MCOperand::CreateReg(MO.getReg());
-    break;
-  case MachineOperand::MO_RegisterMask:
-    // Regmasks are like implicit defs.
-    return false;
-  case MachineOperand::MO_Immediate:
-    MCOp = MCOperand::CreateImm(MO.getImm());
-    break;
-  case MachineOperand::MO_MachineBasicBlock:
-    MCOp = MCOperand::CreateExpr(
-        MCSymbolRefExpr::Create(MO.getMBB()->getSymbol(), Ctx));
-    break;
-  case MachineOperand::MO_GlobalAddress:
-    MCOp = LowerSymbolOperand(MO, GetGlobalAddressSymbol(MO));
-    break;
-  case MachineOperand::MO_ExternalSymbol:
-    MCOp = LowerSymbolOperand(MO, GetExternalSymbolSymbol(MO));
-    break;
-  case MachineOperand::MO_JumpTableIndex:
-    MCOp = LowerSymbolOperand(MO, Printer.GetJTISymbol(MO.getIndex()));
-    break;
-  case MachineOperand::MO_ConstantPoolIndex:
-    MCOp = LowerSymbolOperand(MO, Printer.GetCPISymbol(MO.getIndex()));
-    break;
-  case MachineOperand::MO_BlockAddress:
-    MCOp = LowerSymbolOperand(
-        MO, Printer.GetBlockAddressSymbol(MO.getBlockAddress()));
-    break;
-  }
-  return true;
-}
-
-void ARM64MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
-  OutMI.setOpcode(MI->getOpcode());
-
-  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
-    MCOperand MCOp;
-    if (lowerOperand(MI->getOperand(i), MCOp))
-      OutMI.addOperand(MCOp);
-  }
-}
diff --git a/lib/Target/ARM64/ARM64MachineFunctionInfo.h b/lib/Target/ARM64/ARM64MachineFunctionInfo.h
deleted file mode 100644
index 02bf7cf..0000000
--- a/lib/Target/ARM64/ARM64MachineFunctionInfo.h
+++ /dev/null
@@ -1,139 +0,0 @@
-//===- ARM64MachineFuctionInfo.h - ARM64 machine function info --*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file declares ARM64-specific per-machine-function information.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef ARM64MACHINEFUNCTIONINFO_H
-#define ARM64MACHINEFUNCTIONINFO_H
-
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/MC/MCLinkerOptimizationHint.h"
-
-namespace llvm {
-
-/// ARM64FunctionInfo - This class is derived from MachineFunctionInfo and
-/// contains private ARM64-specific information for each MachineFunction.
-class ARM64FunctionInfo : public MachineFunctionInfo {
-
-  /// HasStackFrame - True if this function has a stack frame. Set by
-  /// processFunctionBeforeCalleeSavedScan().
-  bool HasStackFrame;
-
-  /// \brief Amount of stack frame size, not including callee-saved registers.
-  unsigned LocalStackSize;
-
-  /// \brief Number of TLS accesses using the special (combinable)
-  /// _TLS_MODULE_BASE_ symbol.
-  unsigned NumLocalDynamicTLSAccesses;
-
-  /// \brief FrameIndex for start of varargs area for arguments passed on the
-  /// stack.
-  int VarArgsStackIndex;
-
-  /// \brief FrameIndex for start of varargs area for arguments passed in
-  /// general purpose registers.
-  int VarArgsGPRIndex;
-
-  /// \brief Size of the varargs area for arguments passed in general purpose
-  /// registers.
-  unsigned VarArgsGPRSize;
-
-  /// \brief FrameIndex for start of varargs area for arguments passed in
-  /// floating-point registers.
-  int VarArgsFPRIndex;
-
-  /// \brief Size of the varargs area for arguments passed in floating-point
-  /// registers.
-  unsigned VarArgsFPRSize;
-
-public:
-  ARM64FunctionInfo()
-      : HasStackFrame(false), NumLocalDynamicTLSAccesses(0),
-        VarArgsStackIndex(0), VarArgsGPRIndex(0), VarArgsGPRSize(0),
-        VarArgsFPRIndex(0), VarArgsFPRSize(0) {}
-
-  explicit ARM64FunctionInfo(MachineFunction &MF)
-      : HasStackFrame(false), NumLocalDynamicTLSAccesses(0),
-        VarArgsStackIndex(0), VarArgsGPRIndex(0), VarArgsGPRSize(0),
-        VarArgsFPRIndex(0), VarArgsFPRSize(0) {
-    (void)MF;
-  }
-
-  bool hasStackFrame() const { return HasStackFrame; }
-  void setHasStackFrame(bool s) { HasStackFrame = s; }
-
-  void setLocalStackSize(unsigned Size) { LocalStackSize = Size; }
-  unsigned getLocalStackSize() const { return LocalStackSize; }
-
-  void incNumLocalDynamicTLSAccesses() { ++NumLocalDynamicTLSAccesses; }
-  unsigned getNumLocalDynamicTLSAccesses() const {
-    return NumLocalDynamicTLSAccesses;
-  }
-
-  int getVarArgsStackIndex() const { return VarArgsStackIndex; }
-  void setVarArgsStackIndex(int Index) { VarArgsStackIndex = Index; }
-
-  int getVarArgsGPRIndex() const { return VarArgsGPRIndex; }
-  void setVarArgsGPRIndex(int Index) { VarArgsGPRIndex = Index; }
-
-  unsigned getVarArgsGPRSize() const { return VarArgsGPRSize; }
-  void setVarArgsGPRSize(unsigned Size) { VarArgsGPRSize = Size; }
-
-  int getVarArgsFPRIndex() const { return VarArgsFPRIndex; }
-  void setVarArgsFPRIndex(int Index) { VarArgsFPRIndex = Index; }
-
-  unsigned getVarArgsFPRSize() const { return VarArgsFPRSize; }
-  void setVarArgsFPRSize(unsigned Size) { VarArgsFPRSize = Size; }
-
-  typedef SmallPtrSet<const MachineInstr *, 16> SetOfInstructions;
-
-  const SetOfInstructions &getLOHRelated() const { return LOHRelated; }
-
-  // Shortcuts for LOH related types.
-  class MILOHDirective {
-    MCLOHType Kind;
-
-    /// Arguments of this directive. Order matters.
-    SmallVector<const MachineInstr *, 3> Args;
-
-  public:
-    typedef SmallVectorImpl<const MachineInstr *> LOHArgs;
-
-    MILOHDirective(MCLOHType Kind, const LOHArgs &Args)
-        : Kind(Kind), Args(Args.begin(), Args.end()) {
-      assert(isValidMCLOHType(Kind) && "Invalid LOH directive type!");
-    }
-
-    MCLOHType getKind() const { return Kind; }
-    const LOHArgs &getArgs() const { return Args; }
-  };
-
-  typedef MILOHDirective::LOHArgs MILOHArgs;
-  typedef SmallVector<MILOHDirective, 32> MILOHContainer;
-
-  const MILOHContainer &getLOHContainer() const { return LOHContainerSet; }
-
-  /// Add a LOH directive of this @p Kind and this @p Args.
-  void addLOHDirective(MCLOHType Kind, const MILOHArgs &Args) {
-    LOHContainerSet.push_back(MILOHDirective(Kind, Args));
-    LOHRelated.insert(Args.begin(), Args.end());
-  }
-
-private:
-  // Hold the lists of LOHs.
-  MILOHContainer LOHContainerSet;
-  SetOfInstructions LOHRelated;
-};
-} // End llvm namespace
-
-#endif // ARM64MACHINEFUNCTIONINFO_H
diff --git a/lib/Target/ARM64/ARM64RegisterInfo.cpp b/lib/Target/ARM64/ARM64RegisterInfo.cpp
deleted file mode 100644
index 4c7fc8a..0000000
--- a/lib/Target/ARM64/ARM64RegisterInfo.cpp
+++ /dev/null
@@ -1,400 +0,0 @@
-//===- ARM64RegisterInfo.cpp - ARM64 Register Information -----------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains the ARM64 implementation of the TargetRegisterInfo class.
-//
-//===----------------------------------------------------------------------===//
-
-#include "ARM64RegisterInfo.h"
-#include "ARM64FrameLowering.h"
-#include "ARM64InstrInfo.h"
-#include "ARM64Subtarget.h"
-#include "MCTargetDesc/ARM64AddressingModes.h"
-#include "llvm/ADT/BitVector.h"
-#include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/RegisterScavenging.h"
-#include "llvm/IR/Function.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetFrameLowering.h"
-#include "llvm/Target/TargetOptions.h"
-
-#define GET_REGINFO_TARGET_DESC
-#include "ARM64GenRegisterInfo.inc"
-
-using namespace llvm;
-
-ARM64RegisterInfo::ARM64RegisterInfo(const ARM64InstrInfo *tii,
-                                     const ARM64Subtarget *sti)
-    : ARM64GenRegisterInfo(ARM64::LR), TII(tii), STI(sti) {}
-
-const uint16_t *
-ARM64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
-  assert(MF && "Invalid MachineFunction pointer.");
-  if (MF->getFunction()->getCallingConv() == CallingConv::AnyReg)
-    return CSR_ARM64_AllRegs_SaveList;
-  else
-    return CSR_ARM64_AAPCS_SaveList;
-}
-
-const uint32_t *
-ARM64RegisterInfo::getCallPreservedMask(CallingConv::ID CC) const {
-  if (CC == CallingConv::AnyReg)
-    return CSR_ARM64_AllRegs_RegMask;
-  else
-    return CSR_ARM64_AAPCS_RegMask;
-}
-
-const uint32_t *ARM64RegisterInfo::getTLSCallPreservedMask() const {
-  if (STI->isTargetDarwin())
-    return CSR_ARM64_TLS_Darwin_RegMask;
-
-  assert(STI->isTargetELF() && "only expect Darwin or ELF TLS");
-  return CSR_ARM64_TLS_ELF_RegMask;
-}
-
-const uint32_t *
-ARM64RegisterInfo::getThisReturnPreservedMask(CallingConv::ID) const {
-  // This should return a register mask that is the same as that returned by
-  // getCallPreservedMask but that additionally preserves the register used for
-  // the first i64 argument (which must also be the register used to return a
-  // single i64 return value)
-  //
-  // In case that the calling convention does not use the same register for
-  // both, the function should return NULL (does not currently apply)
-  return CSR_ARM64_AAPCS_ThisReturn_RegMask;
-}
-
-BitVector ARM64RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
-  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
-
-  // FIXME: avoid re-calculating this everytime.
-  BitVector Reserved(getNumRegs());
-  Reserved.set(ARM64::SP);
-  Reserved.set(ARM64::XZR);
-  Reserved.set(ARM64::WSP);
-  Reserved.set(ARM64::WZR);
-
-  if (TFI->hasFP(MF) || STI->isTargetDarwin()) {
-    Reserved.set(ARM64::FP);
-    Reserved.set(ARM64::W29);
-  }
-
-  if (STI->isTargetDarwin()) {
-    Reserved.set(ARM64::X18); // Platform register
-    Reserved.set(ARM64::W18);
-  }
-
-  if (hasBasePointer(MF)) {
-    Reserved.set(ARM64::X19);
-    Reserved.set(ARM64::W19);
-  }
-
-  return Reserved;
-}
-
-bool ARM64RegisterInfo::isReservedReg(const MachineFunction &MF,
-                                      unsigned Reg) const {
-  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
-
-  switch (Reg) {
-  default:
-    break;
-  case ARM64::SP:
-  case ARM64::XZR:
-  case ARM64::WSP:
-  case ARM64::WZR:
-    return true;
-  case ARM64::X18:
-  case ARM64::W18:
-    return STI->isTargetDarwin();
-  case ARM64::FP:
-  case ARM64::W29:
-    return TFI->hasFP(MF) || STI->isTargetDarwin();
-  case ARM64::W19:
-  case ARM64::X19:
-    return hasBasePointer(MF);
-  }
-
-  return false;
-}
-
-const TargetRegisterClass *
-ARM64RegisterInfo::getPointerRegClass(const MachineFunction &MF,
-                                      unsigned Kind) const {
-  return &ARM64::GPR64RegClass;
-}
-
-const TargetRegisterClass *
-ARM64RegisterInfo::getCrossCopyRegClass(const TargetRegisterClass *RC) const {
-  if (RC == &ARM64::CCRRegClass)
-    return NULL; // Can't copy CPSR.
-  return RC;
-}
-
-unsigned ARM64RegisterInfo::getBaseRegister() const { return ARM64::X19; }
-
-bool ARM64RegisterInfo::hasBasePointer(const MachineFunction &MF) const {
-  const MachineFrameInfo *MFI = MF.getFrameInfo();
-
-  // In the presence of variable sized objects, if the fixed stack size is
-  // large enough that referencing from the FP won't result in things being
-  // in range relatively often, we can use a base pointer to allow access
-  // from the other direction like the SP normally works.
-  if (MFI->hasVarSizedObjects()) {
-    // Conservatively estimate whether the negative offset from the frame
-    // pointer will be sufficient to reach. If a function has a smallish
-    // frame, it's less likely to have lots of spills and callee saved
-    // space, so it's all more likely to be within range of the frame pointer.
-    // If it's wrong, we'll materialize the constant and still get to the
-    // object; it's just suboptimal. Negative offsets use the unscaled
-    // load/store instructions, which have a 9-bit signed immediate.
-    if (MFI->getLocalFrameSize() < 256)
-      return false;
-    return true;
-  }
-
-  return false;
-}
-
-unsigned ARM64RegisterInfo::getFrameRegister(const MachineFunction &MF) const {
-  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
-
-  return TFI->hasFP(MF) ? ARM64::FP : ARM64::SP;
-}
-
-bool
-ARM64RegisterInfo::requiresRegisterScavenging(const MachineFunction &MF) const {
-  return true;
-}
-
-bool ARM64RegisterInfo::requiresVirtualBaseRegisters(const MachineFunction &MF)
-    const {
-  return true;
-}
-
-bool
-ARM64RegisterInfo::useFPForScavengingIndex(const MachineFunction &MF) const {
-  const MachineFrameInfo *MFI = MF.getFrameInfo();
-  // ARM64FrameLowering::resolveFrameIndexReference() can always fall back
-  // to the stack pointer, so only put the emergency spill slot next to the
-  // FP when there's no better way to access it (SP or base pointer).
-  return MFI->hasVarSizedObjects() && !hasBasePointer(MF);
-}
-
-bool ARM64RegisterInfo::requiresFrameIndexScavenging(const MachineFunction &MF)
-    const {
-  return true;
-}
-
-bool ARM64RegisterInfo::cannotEliminateFrame(const MachineFunction &MF) const {
-  const MachineFrameInfo *MFI = MF.getFrameInfo();
-  // Only consider eliminating leaf frames.
-  if (MFI->hasCalls() || (MF.getTarget().Options.DisableFramePointerElim(MF) &&
-                          MFI->adjustsStack()))
-    return true;
-  return MFI->hasVarSizedObjects() || MFI->isFrameAddressTaken();
-}
-
-/// needsFrameBaseReg - Returns true if the instruction's frame index
-/// reference would be better served by a base register other than FP
-/// or SP. Used by LocalStackFrameAllocation to determine which frame index
-/// references it should create new base registers for.
-bool ARM64RegisterInfo::needsFrameBaseReg(MachineInstr *MI,
-                                          int64_t Offset) const {
-  for (unsigned i = 0; !MI->getOperand(i).isFI(); ++i)
-    assert(i < MI->getNumOperands() &&
-           "Instr doesn't have FrameIndex operand!");
-
-  // It's the load/store FI references that cause issues, as it can be difficult
-  // to materialize the offset if it won't fit in the literal field. Estimate
-  // based on the size of the local frame and some conservative assumptions
-  // about the rest of the stack frame (note, this is pre-regalloc, so
-  // we don't know everything for certain yet) whether this offset is likely
-  // to be out of range of the immediate. Return true if so.
-
-  // We only generate virtual base registers for loads and stores, so
-  // return false for everything else.
-  if (!MI->mayLoad() && !MI->mayStore())
-    return false;
-
-  // Without a virtual base register, if the function has variable sized
-  // objects, all fixed-size local references will be via the frame pointer,
-  // Approximate the offset and see if it's legal for the instruction.
-  // Note that the incoming offset is based on the SP value at function entry,
-  // so it'll be negative.
-  MachineFunction &MF = *MI->getParent()->getParent();
-  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
-  MachineFrameInfo *MFI = MF.getFrameInfo();
-
-  // Estimate an offset from the frame pointer.
-  // Conservatively assume all GPR callee-saved registers get pushed.
-  // FP, LR, X19-X28, D8-D15. 64-bits each.
-  int64_t FPOffset = Offset - 16 * 20;
-  // Estimate an offset from the stack pointer.
-  // The incoming offset is relating to the SP at the start of the function,
-  // but when we access the local it'll be relative to the SP after local
-  // allocation, so adjust our SP-relative offset by that allocation size.
-  Offset += MFI->getLocalFrameSize();
-  // Assume that we'll have at least some spill slots allocated.
-  // FIXME: This is a total SWAG number. We should run some statistics
-  //        and pick a real one.
-  Offset += 128; // 128 bytes of spill slots
-
-  // If there is a frame pointer, try using it.
-  // The FP is only available if there is no dynamic realignment. We
-  // don't know for sure yet whether we'll need that, so we guess based
-  // on whether there are any local variables that would trigger it.
-  if (TFI->hasFP(MF) && isFrameOffsetLegal(MI, FPOffset))
-    return false;
-
-  // If we can reference via the stack pointer or base pointer, try that.
-  // FIXME: This (and the code that resolves the references) can be improved
-  //        to only disallow SP relative references in the live range of
-  //        the VLA(s). In practice, it's unclear how much difference that
-  //        would make, but it may be worth doing.
-  if (isFrameOffsetLegal(MI, Offset))
-    return false;
-
-  // The offset likely isn't legal; we want to allocate a virtual base register.
-  return true;
-}
-
-bool ARM64RegisterInfo::isFrameOffsetLegal(const MachineInstr *MI,
-                                           int64_t Offset) const {
-  assert(Offset <= INT_MAX && "Offset too big to fit in int.");
-  assert(MI && "Unable to get the legal offset for nil instruction.");
-  int SaveOffset = Offset;
-  return isARM64FrameOffsetLegal(*MI, SaveOffset) & ARM64FrameOffsetIsLegal;
-}
-
-/// Insert defining instruction(s) for BaseReg to be a pointer to FrameIdx
-/// at the beginning of the basic block.
-void ARM64RegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
-                                                     unsigned BaseReg,
-                                                     int FrameIdx,
-                                                     int64_t Offset) const {
-  MachineBasicBlock::iterator Ins = MBB->begin();
-  DebugLoc DL; // Defaults to "unknown"
-  if (Ins != MBB->end())
-    DL = Ins->getDebugLoc();
-
-  const MCInstrDesc &MCID = TII->get(ARM64::ADDXri);
-  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
-  const MachineFunction &MF = *MBB->getParent();
-  MRI.constrainRegClass(BaseReg, TII->getRegClass(MCID, 0, this, MF));
-  unsigned Shifter = ARM64_AM::getShifterImm(ARM64_AM::LSL, 0);
-
-  BuildMI(*MBB, Ins, DL, MCID, BaseReg)
-      .addFrameIndex(FrameIdx)
-      .addImm(Offset)
-      .addImm(Shifter);
-}
-
-void ARM64RegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
-                                          int64_t Offset) const {
-  int Off = Offset; // ARM doesn't need the general 64-bit offsets
-  unsigned i = 0;
-
-  while (!MI.getOperand(i).isFI()) {
-    ++i;
-    assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!");
-  }
-  bool Done = rewriteARM64FrameIndex(MI, i, BaseReg, Off, TII);
-  assert(Done && "Unable to resolve frame index!");
-  (void)Done;
-}
-
-void ARM64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
-                                            int SPAdj, unsigned FIOperandNum,
-                                            RegScavenger *RS) const {
-  assert(SPAdj == 0 && "Unexpected");
-
-  MachineInstr &MI = *II;
-  MachineBasicBlock &MBB = *MI.getParent();
-  MachineFunction &MF = *MBB.getParent();
-  const ARM64FrameLowering *TFI = static_cast<const ARM64FrameLowering *>(
-      MF.getTarget().getFrameLowering());
-
-  int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
-  unsigned FrameReg;
-  int Offset;
-
-  // Special handling of dbg_value, stackmap and patchpoint instructions.
-  if (MI.isDebugValue() || MI.getOpcode() == TargetOpcode::STACKMAP ||
-      MI.getOpcode() == TargetOpcode::PATCHPOINT) {
-    Offset = TFI->resolveFrameIndexReference(MF, FrameIndex, FrameReg,
-                                             /*PreferFP=*/true);
-    Offset += MI.getOperand(FIOperandNum + 1).getImm();
-    MI.getOperand(FIOperandNum).ChangeToRegister(FrameReg, false /*isDef*/);
-    MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset);
-    return;
-  }
-
-  // Modify MI as necessary to handle as much of 'Offset' as possible
-  Offset = TFI->resolveFrameIndexReference(MF, FrameIndex, FrameReg);
-  if (rewriteARM64FrameIndex(MI, FIOperandNum, FrameReg, Offset, TII))
-    return;
-
-  assert((!RS || !RS->isScavengingFrameIndex(FrameIndex)) &&
-         "Emergency spill slot is out of reach");
-
-  // If we get here, the immediate doesn't fit into the instruction.  We folded
-  // as much as possible above.  Handle the rest, providing a register that is
-  // SP+LargeImm.
-  unsigned ScratchReg =
-      MF.getRegInfo().createVirtualRegister(&ARM64::GPR64RegClass);
-  emitFrameOffset(MBB, II, MI.getDebugLoc(), ScratchReg, FrameReg, Offset, TII);
-  MI.getOperand(FIOperandNum).ChangeToRegister(ScratchReg, false, false, true);
-}
-
-namespace llvm {
-
-unsigned ARM64RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
-                                                MachineFunction &MF) const {
-  const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
-
-  switch (RC->getID()) {
-  default:
-    return 0;
-  case ARM64::GPR32RegClassID:
-  case ARM64::GPR32spRegClassID:
-  case ARM64::GPR32allRegClassID:
-  case ARM64::GPR64spRegClassID:
-  case ARM64::GPR64allRegClassID:
-  case ARM64::GPR64RegClassID:
-  case ARM64::GPR32commonRegClassID:
-  case ARM64::GPR64commonRegClassID:
-    return 32 - 1                                      // XZR/SP
-           - (TFI->hasFP(MF) || STI->isTargetDarwin()) // FP
-           - STI->isTargetDarwin() // X18 reserved as platform register
-           - hasBasePointer(MF);   // X19
-  case ARM64::FPR8RegClassID:
-  case ARM64::FPR16RegClassID:
-  case ARM64::FPR32RegClassID:
-  case ARM64::FPR64RegClassID:
-  case ARM64::FPR128RegClassID:
-    return 32;
-
-  case ARM64::DDRegClassID:
-  case ARM64::DDDRegClassID:
-  case ARM64::DDDDRegClassID:
-  case ARM64::QQRegClassID:
-  case ARM64::QQQRegClassID:
-  case ARM64::QQQQRegClassID:
-    return 32;
-
-  case ARM64::FPR128_loRegClassID:
-    return 16;
-  }
-}
-
-} // namespace llvm
diff --git a/lib/Target/ARM64/ARM64RegisterInfo.h b/lib/Target/ARM64/ARM64RegisterInfo.h
deleted file mode 100644
index 31d9242..0000000
--- a/lib/Target/ARM64/ARM64RegisterInfo.h
+++ /dev/null
@@ -1,101 +0,0 @@
-//===- ARM64RegisterInfo.h - ARM64 Register Information Impl ----*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains the ARM64 implementation of the MRegisterInfo class.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_TARGET_ARM64REGISTERINFO_H
-#define LLVM_TARGET_ARM64REGISTERINFO_H
-
-#define GET_REGINFO_HEADER
-#include "ARM64GenRegisterInfo.inc"
-
-namespace llvm {
-
-class ARM64InstrInfo;
-class ARM64Subtarget;
-class MachineFunction;
-class RegScavenger;
-class TargetRegisterClass;
-
-struct ARM64RegisterInfo : public ARM64GenRegisterInfo {
-private:
-  const ARM64InstrInfo *TII;
-  const ARM64Subtarget *STI;
-
-public:
-  ARM64RegisterInfo(const ARM64InstrInfo *tii, const ARM64Subtarget *sti);
-
-  bool isReservedReg(const MachineFunction &MF, unsigned Reg) const;
-
-  /// Code Generation virtual methods...
-  const uint16_t *
-  getCalleeSavedRegs(const MachineFunction *MF = 0) const override;
-  const uint32_t *getCallPreservedMask(CallingConv::ID) const override;
-
-  unsigned getCSRFirstUseCost() const {
-    // The cost will be compared against BlockFrequency where entry has the
-    // value of 1 << 14. A value of 5 will choose to spill or split really
-    // cold path instead of using a callee-saved register.
-    return 5;
-  }
-
-  // Calls involved in thread-local variable lookup save more registers than
-  // normal calls, so they need a different mask to represent this.
-  const uint32_t *getTLSCallPreservedMask() const;
-
-  /// getThisReturnPreservedMask - Returns a call preserved mask specific to the
-  /// case that 'returned' is on an i64 first argument if the calling convention
-  /// is one that can (partially) model this attribute with a preserved mask
-  /// (i.e. it is a calling convention that uses the same register for the first
-  /// i64 argument and an i64 return value)
-  ///
-  /// Should return NULL in the case that the calling convention does not have
-  /// this property
-  const uint32_t *getThisReturnPreservedMask(CallingConv::ID) const;
-
-  BitVector getReservedRegs(const MachineFunction &MF) const override;
-  const TargetRegisterClass *
-  getPointerRegClass(const MachineFunction &MF,
-                     unsigned Kind = 0) const override;
-  const TargetRegisterClass *
-  getCrossCopyRegClass(const TargetRegisterClass *RC) const override;
-
-  bool requiresRegisterScavenging(const MachineFunction &MF) const override;
-  bool useFPForScavengingIndex(const MachineFunction &MF) const override;
-  bool requiresFrameIndexScavenging(const MachineFunction &MF) const override;
-
-  bool needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const override;
-  bool isFrameOffsetLegal(const MachineInstr *MI,
-                          int64_t Offset) const override;
-  void materializeFrameBaseRegister(MachineBasicBlock *MBB, unsigned BaseReg,
-                                    int FrameIdx,
-                                    int64_t Offset) const override;
-  void resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
-                         int64_t Offset) const override;
-  void eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj,
-                           unsigned FIOperandNum,
-                           RegScavenger *RS = NULL) const override;
-  bool cannotEliminateFrame(const MachineFunction &MF) const;
-
-  bool requiresVirtualBaseRegisters(const MachineFunction &MF) const override;
-  bool hasBasePointer(const MachineFunction &MF) const;
-  unsigned getBaseRegister() const;
-
-  // Debug information queries.
-  unsigned getFrameRegister(const MachineFunction &MF) const override;
-
-  unsigned getRegPressureLimit(const TargetRegisterClass *RC,
-                               MachineFunction &MF) const override;
-};
-
-} // end namespace llvm
-
-#endif // LLVM_TARGET_ARM64REGISTERINFO_H
diff --git a/lib/Target/ARM64/ARM64RegisterInfo.td b/lib/Target/ARM64/ARM64RegisterInfo.td
deleted file mode 100644
index 96001c5..0000000
--- a/lib/Target/ARM64/ARM64RegisterInfo.td
+++ /dev/null
@@ -1,561 +0,0 @@
-//===- ARM64RegisterInfo.td - Describe the ARM64 Regisers --*- tablegen -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-//
-//===----------------------------------------------------------------------===//
-
-
-class ARM64Reg<bits<16> enc, string n, list<Register> subregs = [],
-               list<string> altNames = []>
-        : Register<n, altNames> {
-  let HWEncoding = enc;
-  let Namespace = "ARM64";
-  let SubRegs = subregs;
-}
-
-let Namespace = "ARM64" in {
-  def sub_32 : SubRegIndex<32>;
-
-  def bsub : SubRegIndex<8>;
-  def hsub : SubRegIndex<16>;
-  def ssub : SubRegIndex<32>;
-  def dsub : SubRegIndex<32>;
-  def qhisub : SubRegIndex<64>;
-  def qsub : SubRegIndex<64>;
-  // Note: Code depends on these having consecutive numbers
-  def dsub0 : SubRegIndex<64>;
-  def dsub1 : SubRegIndex<64>;
-  def dsub2 : SubRegIndex<64>;
-  def dsub3 : SubRegIndex<64>;
-  // Note: Code depends on these having consecutive numbers
-  def qsub0 : SubRegIndex<128>;
-  def qsub1 : SubRegIndex<128>;
-  def qsub2 : SubRegIndex<128>;
-  def qsub3 : SubRegIndex<128>;
-}
-
-let Namespace = "ARM64" in {
-  def vreg : RegAltNameIndex;
-  def vlist1 : RegAltNameIndex;
-}
-
-//===----------------------------------------------------------------------===//
-// Registers
-//===----------------------------------------------------------------------===//
-def W0    : ARM64Reg<0,   "w0" >, DwarfRegNum<[0]>;
-def W1    : ARM64Reg<1,   "w1" >, DwarfRegNum<[1]>;
-def W2    : ARM64Reg<2,   "w2" >, DwarfRegNum<[2]>;
-def W3    : ARM64Reg<3,   "w3" >, DwarfRegNum<[3]>;
-def W4    : ARM64Reg<4,   "w4" >, DwarfRegNum<[4]>;
-def W5    : ARM64Reg<5,   "w5" >, DwarfRegNum<[5]>;
-def W6    : ARM64Reg<6,   "w6" >, DwarfRegNum<[6]>;
-def W7    : ARM64Reg<7,   "w7" >, DwarfRegNum<[7]>;
-def W8    : ARM64Reg<8,   "w8" >, DwarfRegNum<[8]>;
-def W9    : ARM64Reg<9,   "w9" >, DwarfRegNum<[9]>;
-def W10   : ARM64Reg<10, "w10">, DwarfRegNum<[10]>;
-def W11   : ARM64Reg<11, "w11">, DwarfRegNum<[11]>;
-def W12   : ARM64Reg<12, "w12">, DwarfRegNum<[12]>;
-def W13   : ARM64Reg<13, "w13">, DwarfRegNum<[13]>;
-def W14   : ARM64Reg<14, "w14">, DwarfRegNum<[14]>;
-def W15   : ARM64Reg<15, "w15">, DwarfRegNum<[15]>;
-def W16   : ARM64Reg<16, "w16">, DwarfRegNum<[16]>;
-def W17   : ARM64Reg<17, "w17">, DwarfRegNum<[17]>;
-def W18   : ARM64Reg<18, "w18">, DwarfRegNum<[18]>;
-def W19   : ARM64Reg<19, "w19">, DwarfRegNum<[19]>;
-def W20   : ARM64Reg<20, "w20">, DwarfRegNum<[20]>;
-def W21   : ARM64Reg<21, "w21">, DwarfRegNum<[21]>;
-def W22   : ARM64Reg<22, "w22">, DwarfRegNum<[22]>;
-def W23   : ARM64Reg<23, "w23">, DwarfRegNum<[23]>;
-def W24   : ARM64Reg<24, "w24">, DwarfRegNum<[24]>;
-def W25   : ARM64Reg<25, "w25">, DwarfRegNum<[25]>;
-def W26   : ARM64Reg<26, "w26">, DwarfRegNum<[26]>;
-def W27   : ARM64Reg<27, "w27">, DwarfRegNum<[27]>;
-def W28   : ARM64Reg<28, "w28">, DwarfRegNum<[28]>;
-def W29   : ARM64Reg<29, "w29">, DwarfRegNum<[29]>;
-def W30   : ARM64Reg<30, "w30">, DwarfRegNum<[30]>;
-def WSP   : ARM64Reg<31, "wsp">, DwarfRegNum<[31]>;
-def WZR   : ARM64Reg<31, "wzr">, DwarfRegAlias<WSP>;
-
-let SubRegIndices = [sub_32] in {
-def X0    : ARM64Reg<0,   "x0",  [W0]>, DwarfRegAlias<W0>;
-def X1    : ARM64Reg<1,   "x1",  [W1]>, DwarfRegAlias<W1>;
-def X2    : ARM64Reg<2,   "x2",  [W2]>, DwarfRegAlias<W2>;
-def X3    : ARM64Reg<3,   "x3",  [W3]>, DwarfRegAlias<W3>;
-def X4    : ARM64Reg<4,   "x4",  [W4]>, DwarfRegAlias<W4>;
-def X5    : ARM64Reg<5,   "x5",  [W5]>, DwarfRegAlias<W5>;
-def X6    : ARM64Reg<6,   "x6",  [W6]>, DwarfRegAlias<W6>;
-def X7    : ARM64Reg<7,   "x7",  [W7]>, DwarfRegAlias<W7>;
-def X8    : ARM64Reg<8,   "x8",  [W8]>, DwarfRegAlias<W8>;
-def X9    : ARM64Reg<9,   "x9",  [W9]>, DwarfRegAlias<W9>;
-def X10   : ARM64Reg<10, "x10", [W10]>, DwarfRegAlias<W10>;
-def X11   : ARM64Reg<11, "x11", [W11]>, DwarfRegAlias<W11>;
-def X12   : ARM64Reg<12, "x12", [W12]>, DwarfRegAlias<W12>;
-def X13   : ARM64Reg<13, "x13", [W13]>, DwarfRegAlias<W13>;
-def X14   : ARM64Reg<14, "x14", [W14]>, DwarfRegAlias<W14>;
-def X15   : ARM64Reg<15, "x15", [W15]>, DwarfRegAlias<W15>;
-def X16   : ARM64Reg<16, "x16", [W16]>, DwarfRegAlias<W16>;
-def X17   : ARM64Reg<17, "x17", [W17]>, DwarfRegAlias<W17>;
-def X18   : ARM64Reg<18, "x18", [W18]>, DwarfRegAlias<W18>;
-def X19   : ARM64Reg<19, "x19", [W19]>, DwarfRegAlias<W19>;
-def X20   : ARM64Reg<20, "x20", [W20]>, DwarfRegAlias<W20>;
-def X21   : ARM64Reg<21, "x21", [W21]>, DwarfRegAlias<W21>;
-def X22   : ARM64Reg<22, "x22", [W22]>, DwarfRegAlias<W22>;
-def X23   : ARM64Reg<23, "x23", [W23]>, DwarfRegAlias<W23>;
-def X24   : ARM64Reg<24, "x24", [W24]>, DwarfRegAlias<W24>;
-def X25   : ARM64Reg<25, "x25", [W25]>, DwarfRegAlias<W25>;
-def X26   : ARM64Reg<26, "x26", [W26]>, DwarfRegAlias<W26>;
-def X27   : ARM64Reg<27, "x27", [W27]>, DwarfRegAlias<W27>;
-def X28   : ARM64Reg<28, "x28", [W28]>, DwarfRegAlias<W28>;
-def FP    : ARM64Reg<29, "fp",  [W29]>, DwarfRegAlias<W29>;
-def LR    : ARM64Reg<30, "lr",  [W30]>, DwarfRegAlias<W30>;
-def SP    : ARM64Reg<31, "sp",  [WSP]>, DwarfRegAlias<WSP>;
-def XZR   : ARM64Reg<31, "xzr", [WZR]>, DwarfRegAlias<WSP>;
-}
-
-// Condition code register.
-def CPSR  : ARM64Reg<0, "cpsr">;
-
-// GPR register classes with the intersections of GPR32/GPR32sp and
-// GPR64/GPR64sp for use by the coalescer.
-def GPR32common : RegisterClass<"ARM64", [i32], 32, (sequence "W%u", 0, 30)> {
-  let AltOrders = [(rotl GPR32common, 8)];
-  let AltOrderSelect = [{ return 1; }];
-}
-def GPR64common : RegisterClass<"ARM64", [i64], 64,
-                                (add (sequence "X%u", 0, 28), FP, LR)> {
-  let AltOrders = [(rotl GPR64common, 8)];
-  let AltOrderSelect = [{ return 1; }];
-}
-// GPR register classes which exclude SP/WSP.
-def GPR32 : RegisterClass<"ARM64", [i32], 32, (add GPR32common, WZR)> {
-  let AltOrders = [(rotl GPR32, 8)];
-  let AltOrderSelect = [{ return 1; }];
-}
-def GPR64 : RegisterClass<"ARM64", [i64], 64, (add GPR64common, XZR)> {
-  let AltOrders = [(rotl GPR64, 8)];
-  let AltOrderSelect = [{ return 1; }];
-}
-
-// GPR register classes which include SP/WSP.
-def GPR32sp : RegisterClass<"ARM64", [i32], 32, (add GPR32common, WSP)> {
-  let AltOrders = [(rotl GPR32sp, 8)];
-  let AltOrderSelect = [{ return 1; }];
-}
-def GPR64sp : RegisterClass<"ARM64", [i64], 64, (add GPR64common, SP)> {
-  let AltOrders = [(rotl GPR64sp, 8)];
-  let AltOrderSelect = [{ return 1; }];
-}
-
-// GPR register classes which include WZR/XZR AND SP/WSP. This is not a
-// constraint used by any instructions, it is used as a common super-class.
-def GPR32all : RegisterClass<"ARM64", [i32], 32, (add GPR32common, WZR, WSP)>;
-def GPR64all : RegisterClass<"ARM64", [i64], 64, (add GPR64common, XZR, SP)>;
-
-// For tail calls, we can't use callee-saved registers, as they are restored
-// to the saved value before the tail call, which would clobber a call address.
-// This is for indirect tail calls to store the address of the destination.
-def tcGPR64 : RegisterClass<"ARM64", [i64], 64, (sub GPR64common, X19, X20, X21,
-                                                     X22, X23, X24, X25, X26,
-                                                     X27, X28)>;
-
-// GPR register classes for post increment ammount of vector load/store that
-// has alternate printing when Rm=31 and prints a constant immediate value
-// equal to the total number of bytes transferred.
-def GPR64pi1  : RegisterOperand<GPR64, "printPostIncOperand1">;
-def GPR64pi2  : RegisterOperand<GPR64, "printPostIncOperand2">;
-def GPR64pi3  : RegisterOperand<GPR64, "printPostIncOperand3">;
-def GPR64pi4  : RegisterOperand<GPR64, "printPostIncOperand4">;
-def GPR64pi6  : RegisterOperand<GPR64, "printPostIncOperand6">;
-def GPR64pi8  : RegisterOperand<GPR64, "printPostIncOperand8">;
-def GPR64pi12 : RegisterOperand<GPR64, "printPostIncOperand12">;
-def GPR64pi16 : RegisterOperand<GPR64, "printPostIncOperand16">;
-def GPR64pi24 : RegisterOperand<GPR64, "printPostIncOperand24">;
-def GPR64pi32 : RegisterOperand<GPR64, "printPostIncOperand32">;
-def GPR64pi48 : RegisterOperand<GPR64, "printPostIncOperand48">;
-def GPR64pi64 : RegisterOperand<GPR64, "printPostIncOperand64">;
-
-// Condition code regclass.
-def CCR : RegisterClass<"ARM64", [i32], 32, (add CPSR)> {
-  let CopyCost = -1;  // Don't allow copying of status registers.
-
-  // CCR is not allocatable.
-  let isAllocatable = 0;
-}
-
-//===----------------------------------------------------------------------===//
-// Floating Point Scalar Registers
-//===----------------------------------------------------------------------===//
-
-def B0    : ARM64Reg<0,   "b0">, DwarfRegNum<[64]>;
-def B1    : ARM64Reg<1,   "b1">, DwarfRegNum<[65]>;
-def B2    : ARM64Reg<2,   "b2">, DwarfRegNum<[66]>;
-def B3    : ARM64Reg<3,   "b3">, DwarfRegNum<[67]>;
-def B4    : ARM64Reg<4,   "b4">, DwarfRegNum<[68]>;
-def B5    : ARM64Reg<5,   "b5">, DwarfRegNum<[69]>;
-def B6    : ARM64Reg<6,   "b6">, DwarfRegNum<[70]>;
-def B7    : ARM64Reg<7,   "b7">, DwarfRegNum<[71]>;
-def B8    : ARM64Reg<8,   "b8">, DwarfRegNum<[72]>;
-def B9    : ARM64Reg<9,   "b9">, DwarfRegNum<[73]>;
-def B10   : ARM64Reg<10, "b10">, DwarfRegNum<[74]>;
-def B11   : ARM64Reg<11, "b11">, DwarfRegNum<[75]>;
-def B12   : ARM64Reg<12, "b12">, DwarfRegNum<[76]>;
-def B13   : ARM64Reg<13, "b13">, DwarfRegNum<[77]>;
-def B14   : ARM64Reg<14, "b14">, DwarfRegNum<[78]>;
-def B15   : ARM64Reg<15, "b15">, DwarfRegNum<[79]>;
-def B16   : ARM64Reg<16, "b16">, DwarfRegNum<[80]>;
-def B17   : ARM64Reg<17, "b17">, DwarfRegNum<[81]>;
-def B18   : ARM64Reg<18, "b18">, DwarfRegNum<[82]>;
-def B19   : ARM64Reg<19, "b19">, DwarfRegNum<[83]>;
-def B20   : ARM64Reg<20, "b20">, DwarfRegNum<[84]>;
-def B21   : ARM64Reg<21, "b21">, DwarfRegNum<[85]>;
-def B22   : ARM64Reg<22, "b22">, DwarfRegNum<[86]>;
-def B23   : ARM64Reg<23, "b23">, DwarfRegNum<[87]>;
-def B24   : ARM64Reg<24, "b24">, DwarfRegNum<[88]>;
-def B25   : ARM64Reg<25, "b25">, DwarfRegNum<[89]>;
-def B26   : ARM64Reg<26, "b26">, DwarfRegNum<[90]>;
-def B27   : ARM64Reg<27, "b27">, DwarfRegNum<[91]>;
-def B28   : ARM64Reg<28, "b28">, DwarfRegNum<[92]>;
-def B29   : ARM64Reg<29, "b29">, DwarfRegNum<[93]>;
-def B30   : ARM64Reg<30, "b30">, DwarfRegNum<[94]>;
-def B31   : ARM64Reg<31, "b31">, DwarfRegNum<[95]>;
-
-let SubRegIndices = [bsub] in {
-def H0    : ARM64Reg<0,   "h0", [B0]>, DwarfRegAlias<B0>;
-def H1    : ARM64Reg<1,   "h1", [B1]>, DwarfRegAlias<B1>;
-def H2    : ARM64Reg<2,   "h2", [B2]>, DwarfRegAlias<B2>;
-def H3    : ARM64Reg<3,   "h3", [B3]>, DwarfRegAlias<B3>;
-def H4    : ARM64Reg<4,   "h4", [B4]>, DwarfRegAlias<B4>;
-def H5    : ARM64Reg<5,   "h5", [B5]>, DwarfRegAlias<B5>;
-def H6    : ARM64Reg<6,   "h6", [B6]>, DwarfRegAlias<B6>;
-def H7    : ARM64Reg<7,   "h7", [B7]>, DwarfRegAlias<B7>;
-def H8    : ARM64Reg<8,   "h8", [B8]>, DwarfRegAlias<B8>;
-def H9    : ARM64Reg<9,   "h9", [B9]>, DwarfRegAlias<B9>;
-def H10   : ARM64Reg<10, "h10", [B10]>, DwarfRegAlias<B10>;
-def H11   : ARM64Reg<11, "h11", [B11]>, DwarfRegAlias<B11>;
-def H12   : ARM64Reg<12, "h12", [B12]>, DwarfRegAlias<B12>;
-def H13   : ARM64Reg<13, "h13", [B13]>, DwarfRegAlias<B13>;
-def H14   : ARM64Reg<14, "h14", [B14]>, DwarfRegAlias<B14>;
-def H15   : ARM64Reg<15, "h15", [B15]>, DwarfRegAlias<B15>;
-def H16   : ARM64Reg<16, "h16", [B16]>, DwarfRegAlias<B16>;
-def H17   : ARM64Reg<17, "h17", [B17]>, DwarfRegAlias<B17>;
-def H18   : ARM64Reg<18, "h18", [B18]>, DwarfRegAlias<B18>;
-def H19   : ARM64Reg<19, "h19", [B19]>, DwarfRegAlias<B19>;
-def H20   : ARM64Reg<20, "h20", [B20]>, DwarfRegAlias<B20>;
-def H21   : ARM64Reg<21, "h21", [B21]>, DwarfRegAlias<B21>;
-def H22   : ARM64Reg<22, "h22", [B22]>, DwarfRegAlias<B22>;
-def H23   : ARM64Reg<23, "h23", [B23]>, DwarfRegAlias<B23>;
-def H24   : ARM64Reg<24, "h24", [B24]>, DwarfRegAlias<B24>;
-def H25   : ARM64Reg<25, "h25", [B25]>, DwarfRegAlias<B25>;
-def H26   : ARM64Reg<26, "h26", [B26]>, DwarfRegAlias<B26>;
-def H27   : ARM64Reg<27, "h27", [B27]>, DwarfRegAlias<B27>;
-def H28   : ARM64Reg<28, "h28", [B28]>, DwarfRegAlias<B28>;
-def H29   : ARM64Reg<29, "h29", [B29]>, DwarfRegAlias<B29>;
-def H30   : ARM64Reg<30, "h30", [B30]>, DwarfRegAlias<B30>;
-def H31   : ARM64Reg<31, "h31", [B31]>, DwarfRegAlias<B31>;
-}
-
-let SubRegIndices = [hsub] in {
-def S0    : ARM64Reg<0,   "s0", [H0]>, DwarfRegAlias<B0>;
-def S1    : ARM64Reg<1,   "s1", [H1]>, DwarfRegAlias<B1>;
-def S2    : ARM64Reg<2,   "s2", [H2]>, DwarfRegAlias<B2>;
-def S3    : ARM64Reg<3,   "s3", [H3]>, DwarfRegAlias<B3>;
-def S4    : ARM64Reg<4,   "s4", [H4]>, DwarfRegAlias<B4>;
-def S5    : ARM64Reg<5,   "s5", [H5]>, DwarfRegAlias<B5>;
-def S6    : ARM64Reg<6,   "s6", [H6]>, DwarfRegAlias<B6>;
-def S7    : ARM64Reg<7,   "s7", [H7]>, DwarfRegAlias<B7>;
-def S8    : ARM64Reg<8,   "s8", [H8]>, DwarfRegAlias<B8>;
-def S9    : ARM64Reg<9,   "s9", [H9]>, DwarfRegAlias<B9>;
-def S10   : ARM64Reg<10, "s10", [H10]>, DwarfRegAlias<B10>;
-def S11   : ARM64Reg<11, "s11", [H11]>, DwarfRegAlias<B11>;
-def S12   : ARM64Reg<12, "s12", [H12]>, DwarfRegAlias<B12>;
-def S13   : ARM64Reg<13, "s13", [H13]>, DwarfRegAlias<B13>;
-def S14   : ARM64Reg<14, "s14", [H14]>, DwarfRegAlias<B14>;
-def S15   : ARM64Reg<15, "s15", [H15]>, DwarfRegAlias<B15>;
-def S16   : ARM64Reg<16, "s16", [H16]>, DwarfRegAlias<B16>;
-def S17   : ARM64Reg<17, "s17", [H17]>, DwarfRegAlias<B17>;
-def S18   : ARM64Reg<18, "s18", [H18]>, DwarfRegAlias<B18>;
-def S19   : ARM64Reg<19, "s19", [H19]>, DwarfRegAlias<B19>;
-def S20   : ARM64Reg<20, "s20", [H20]>, DwarfRegAlias<B20>;
-def S21   : ARM64Reg<21, "s21", [H21]>, DwarfRegAlias<B21>;
-def S22   : ARM64Reg<22, "s22", [H22]>, DwarfRegAlias<B22>;
-def S23   : ARM64Reg<23, "s23", [H23]>, DwarfRegAlias<B23>;
-def S24   : ARM64Reg<24, "s24", [H24]>, DwarfRegAlias<B24>;
-def S25   : ARM64Reg<25, "s25", [H25]>, DwarfRegAlias<B25>;
-def S26   : ARM64Reg<26, "s26", [H26]>, DwarfRegAlias<B26>;
-def S27   : ARM64Reg<27, "s27", [H27]>, DwarfRegAlias<B27>;
-def S28   : ARM64Reg<28, "s28", [H28]>, DwarfRegAlias<B28>;
-def S29   : ARM64Reg<29, "s29", [H29]>, DwarfRegAlias<B29>;
-def S30   : ARM64Reg<30, "s30", [H30]>, DwarfRegAlias<B30>;
-def S31   : ARM64Reg<31, "s31", [H31]>, DwarfRegAlias<B31>;
-}
-
-let SubRegIndices = [ssub], RegAltNameIndices = [vreg, vlist1] in {
-def D0    : ARM64Reg<0,   "d0", [S0], ["v0", ""]>, DwarfRegAlias<B0>;
-def D1    : ARM64Reg<1,   "d1", [S1], ["v1", ""]>, DwarfRegAlias<B1>;
-def D2    : ARM64Reg<2,   "d2", [S2], ["v2", ""]>, DwarfRegAlias<B2>;
-def D3    : ARM64Reg<3,   "d3", [S3], ["v3", ""]>, DwarfRegAlias<B3>;
-def D4    : ARM64Reg<4,   "d4", [S4], ["v4", ""]>, DwarfRegAlias<B4>;
-def D5    : ARM64Reg<5,   "d5", [S5], ["v5", ""]>, DwarfRegAlias<B5>;
-def D6    : ARM64Reg<6,   "d6", [S6], ["v6", ""]>, DwarfRegAlias<B6>;
-def D7    : ARM64Reg<7,   "d7", [S7], ["v7", ""]>, DwarfRegAlias<B7>;
-def D8    : ARM64Reg<8,   "d8", [S8], ["v8", ""]>, DwarfRegAlias<B8>;
-def D9    : ARM64Reg<9,   "d9", [S9], ["v9", ""]>, DwarfRegAlias<B9>;
-def D10   : ARM64Reg<10, "d10", [S10], ["v10", ""]>, DwarfRegAlias<B10>;
-def D11   : ARM64Reg<11, "d11", [S11], ["v11", ""]>, DwarfRegAlias<B11>;
-def D12   : ARM64Reg<12, "d12", [S12], ["v12", ""]>, DwarfRegAlias<B12>;
-def D13   : ARM64Reg<13, "d13", [S13], ["v13", ""]>, DwarfRegAlias<B13>;
-def D14   : ARM64Reg<14, "d14", [S14], ["v14", ""]>, DwarfRegAlias<B14>;
-def D15   : ARM64Reg<15, "d15", [S15], ["v15", ""]>, DwarfRegAlias<B15>;
-def D16   : ARM64Reg<16, "d16", [S16], ["v16", ""]>, DwarfRegAlias<B16>;
-def D17   : ARM64Reg<17, "d17", [S17], ["v17", ""]>, DwarfRegAlias<B17>;
-def D18   : ARM64Reg<18, "d18", [S18], ["v18", ""]>, DwarfRegAlias<B18>;
-def D19   : ARM64Reg<19, "d19", [S19], ["v19", ""]>, DwarfRegAlias<B19>;
-def D20   : ARM64Reg<20, "d20", [S20], ["v20", ""]>, DwarfRegAlias<B20>;
-def D21   : ARM64Reg<21, "d21", [S21], ["v21", ""]>, DwarfRegAlias<B21>;
-def D22   : ARM64Reg<22, "d22", [S22], ["v22", ""]>, DwarfRegAlias<B22>;
-def D23   : ARM64Reg<23, "d23", [S23], ["v23", ""]>, DwarfRegAlias<B23>;
-def D24   : ARM64Reg<24, "d24", [S24], ["v24", ""]>, DwarfRegAlias<B24>;
-def D25   : ARM64Reg<25, "d25", [S25], ["v25", ""]>, DwarfRegAlias<B25>;
-def D26   : ARM64Reg<26, "d26", [S26], ["v26", ""]>, DwarfRegAlias<B26>;
-def D27   : ARM64Reg<27, "d27", [S27], ["v27", ""]>, DwarfRegAlias<B27>;
-def D28   : ARM64Reg<28, "d28", [S28], ["v28", ""]>, DwarfRegAlias<B28>;
-def D29   : ARM64Reg<29, "d29", [S29], ["v29", ""]>, DwarfRegAlias<B29>;
-def D30   : ARM64Reg<30, "d30", [S30], ["v30", ""]>, DwarfRegAlias<B30>;
-def D31   : ARM64Reg<31, "d31", [S31], ["v31", ""]>, DwarfRegAlias<B31>;
-}
-
-let SubRegIndices = [dsub], RegAltNameIndices = [vreg, vlist1] in {
-def Q0    : ARM64Reg<0,   "q0", [D0], ["v0", ""]>, DwarfRegAlias<B0>;
-def Q1    : ARM64Reg<1,   "q1", [D1], ["v1", ""]>, DwarfRegAlias<B1>;
-def Q2    : ARM64Reg<2,   "q2", [D2], ["v2", ""]>, DwarfRegAlias<B2>;
-def Q3    : ARM64Reg<3,   "q3", [D3], ["v3", ""]>, DwarfRegAlias<B3>;
-def Q4    : ARM64Reg<4,   "q4", [D4], ["v4", ""]>, DwarfRegAlias<B4>;
-def Q5    : ARM64Reg<5,   "q5", [D5], ["v5", ""]>, DwarfRegAlias<B5>;
-def Q6    : ARM64Reg<6,   "q6", [D6], ["v6", ""]>, DwarfRegAlias<B6>;
-def Q7    : ARM64Reg<7,   "q7", [D7], ["v7", ""]>, DwarfRegAlias<B7>;
-def Q8    : ARM64Reg<8,   "q8", [D8], ["v8", ""]>, DwarfRegAlias<B8>;
-def Q9    : ARM64Reg<9,   "q9", [D9], ["v9", ""]>, DwarfRegAlias<B9>;
-def Q10   : ARM64Reg<10, "q10", [D10], ["v10", ""]>, DwarfRegAlias<B10>;
-def Q11   : ARM64Reg<11, "q11", [D11], ["v11", ""]>, DwarfRegAlias<B11>;
-def Q12   : ARM64Reg<12, "q12", [D12], ["v12", ""]>, DwarfRegAlias<B12>;
-def Q13   : ARM64Reg<13, "q13", [D13], ["v13", ""]>, DwarfRegAlias<B13>;
-def Q14   : ARM64Reg<14, "q14", [D14], ["v14", ""]>, DwarfRegAlias<B14>;
-def Q15   : ARM64Reg<15, "q15", [D15], ["v15", ""]>, DwarfRegAlias<B15>;
-def Q16   : ARM64Reg<16, "q16", [D16], ["v16", ""]>, DwarfRegAlias<B16>;
-def Q17   : ARM64Reg<17, "q17", [D17], ["v17", ""]>, DwarfRegAlias<B17>;
-def Q18   : ARM64Reg<18, "q18", [D18], ["v18", ""]>, DwarfRegAlias<B18>;
-def Q19   : ARM64Reg<19, "q19", [D19], ["v19", ""]>, DwarfRegAlias<B19>;
-def Q20   : ARM64Reg<20, "q20", [D20], ["v20", ""]>, DwarfRegAlias<B20>;
-def Q21   : ARM64Reg<21, "q21", [D21], ["v21", ""]>, DwarfRegAlias<B21>;
-def Q22   : ARM64Reg<22, "q22", [D22], ["v22", ""]>, DwarfRegAlias<B22>;
-def Q23   : ARM64Reg<23, "q23", [D23], ["v23", ""]>, DwarfRegAlias<B23>;
-def Q24   : ARM64Reg<24, "q24", [D24], ["v24", ""]>, DwarfRegAlias<B24>;
-def Q25   : ARM64Reg<25, "q25", [D25], ["v25", ""]>, DwarfRegAlias<B25>;
-def Q26   : ARM64Reg<26, "q26", [D26], ["v26", ""]>, DwarfRegAlias<B26>;
-def Q27   : ARM64Reg<27, "q27", [D27], ["v27", ""]>, DwarfRegAlias<B27>;
-def Q28   : ARM64Reg<28, "q28", [D28], ["v28", ""]>, DwarfRegAlias<B28>;
-def Q29   : ARM64Reg<29, "q29", [D29], ["v29", ""]>, DwarfRegAlias<B29>;
-def Q30   : ARM64Reg<30, "q30", [D30], ["v30", ""]>, DwarfRegAlias<B30>;
-def Q31   : ARM64Reg<31, "q31", [D31], ["v31", ""]>, DwarfRegAlias<B31>;
-}
-
-def FPR8  : RegisterClass<"ARM64", [untyped], 8, (sequence "B%u", 0, 31)> {
-  let Size = 8;
-}
-def FPR16 : RegisterClass<"ARM64", [untyped], 16, (sequence "H%u", 0, 31)> {
-  let Size = 16;
-}
-def FPR32 : RegisterClass<"ARM64", [f32, i32], 32,(sequence "S%u", 0, 31)>;
-def FPR64 : RegisterClass<"ARM64", [f64, i64, v2f32, v1f64, v8i8, v4i16, v2i32,
-                                    v1i64],
-                                    64, (sequence "D%u", 0, 31)>;
-// We don't (yet) have an f128 legal type, so don't use that here. We
-// normalize 128-bit vectors to v2f64 for arg passing and such, so use
-// that here.
-def FPR128 : RegisterClass<"ARM64",
-                           [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64, f128],
-                           128, (sequence "Q%u", 0, 31)>;
-
-// The lower 16 vector registers.  Some instructions can only take registers
-// in this range.
-def FPR128_lo : RegisterClass<"ARM64",
-                              [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
-                              128, (trunc FPR128, 16)>;
-
-// Pairs, triples, and quads of 64-bit vector registers.
-def DSeqPairs : RegisterTuples<[dsub0, dsub1], [(rotl FPR64, 0), (rotl FPR64, 1)]>;
-def DSeqTriples : RegisterTuples<[dsub0, dsub1, dsub2],
-                                 [(rotl FPR64, 0), (rotl FPR64, 1),
-                                  (rotl FPR64, 2)]>;
-def DSeqQuads : RegisterTuples<[dsub0, dsub1, dsub2, dsub3],
-                               [(rotl FPR64, 0), (rotl FPR64, 1),
-                                (rotl FPR64, 2), (rotl FPR64, 3)]>;
-def DD   : RegisterClass<"ARM64", [untyped], 64, (add DSeqPairs)> {
-  let Size = 128;
-}
-def DDD  : RegisterClass<"ARM64", [untyped], 64, (add DSeqTriples)> {
-  let Size = 196;
-}
-def DDDD : RegisterClass<"ARM64", [untyped], 64, (add DSeqQuads)> {
-  let Size = 256;
-}
-
-// Pairs, triples, and quads of 128-bit vector registers.
-def QSeqPairs : RegisterTuples<[qsub0, qsub1], [(rotl FPR128, 0), (rotl FPR128, 1)]>;
-def QSeqTriples : RegisterTuples<[qsub0, qsub1, qsub2],
-                                 [(rotl FPR128, 0), (rotl FPR128, 1),
-                                  (rotl FPR128, 2)]>;
-def QSeqQuads : RegisterTuples<[qsub0, qsub1, qsub2, qsub3],
-                               [(rotl FPR128, 0), (rotl FPR128, 1),
-                                (rotl FPR128, 2), (rotl FPR128, 3)]>;
-def QQ   : RegisterClass<"ARM64", [untyped], 128, (add QSeqPairs)> {
-  let Size = 256;
-}
-def QQQ  : RegisterClass<"ARM64", [untyped], 128, (add QSeqTriples)> {
-  let Size = 384;
-}
-def QQQQ : RegisterClass<"ARM64", [untyped], 128, (add QSeqQuads)> {
-  let Size = 512;
-}
-
-
-// Vector operand versions of the FP registers. Alternate name printing and
-// assmebler matching.
-def VectorRegAsmOperand : AsmOperandClass { let Name = "VectorReg"; }
-let ParserMatchClass = VectorRegAsmOperand in {
-def V64  : RegisterOperand<FPR64, "printVRegOperand">;
-def V128 : RegisterOperand<FPR128, "printVRegOperand">;
-def V128_lo : RegisterOperand<FPR128_lo, "printVRegOperand">;
-}
-
-class TypedVecListAsmOperand<int count, int regsize, int lanes, string kind>
-    : AsmOperandClass {
-  let Name = "TypedVectorList" # count # "_" # lanes # kind;
-
-  let PredicateMethod
-      = "isTypedVectorList<" # count # ", " # lanes # ", '" # kind # "'>";
-  let RenderMethod = "addVectorList" # regsize # "Operands<" # count # ">";
-}
-
-class TypedVecListRegOperand<RegisterClass Reg, int lanes, string kind>
-    : RegisterOperand<Reg, "printTypedVectorList<" # lanes # ", '"
-                                                   # kind # "'>">;
-
-multiclass VectorList<int count, RegisterClass Reg64, RegisterClass Reg128> {
-  // With implicit types (probably on instruction instead). E.g. { v0, v1 }
-  def _64AsmOperand : AsmOperandClass {
-    let Name = NAME # "64";
-    let PredicateMethod = "isImplicitlyTypedVectorList<" # count # ">";
-    let RenderMethod = "addVectorList64Operands<" # count # ">";
-  }
-
-  def "64" : RegisterOperand<Reg64, "printImplicitlyTypedVectorList"> {
-    let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_64AsmOperand");
-  }
-
-  def _128AsmOperand : AsmOperandClass {
-    let Name = NAME # "128";
-    let PredicateMethod = "isImplicitlyTypedVectorList<" # count # ">";
-    let RenderMethod = "addVectorList128Operands<" # count # ">";
-  }
-
-  def "128" : RegisterOperand<Reg128, "printImplicitlyTypedVectorList"> {
-    let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_128AsmOperand");
-  }
-
-  // 64-bit register lists with explicit type.
-
-  // { v0.8b, v1.8b }
-  def _8bAsmOperand : TypedVecListAsmOperand<count, 64, 8, "b">;
-  def "8b" : TypedVecListRegOperand<Reg64, 8, "b"> {
-    let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_8bAsmOperand");
-  }
-
-  // { v0.4h, v1.4h }
-  def _4hAsmOperand : TypedVecListAsmOperand<count, 64, 4, "h">;
-  def "4h" : TypedVecListRegOperand<Reg64, 4, "h"> {
-    let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_4hAsmOperand");
-  }
-
-  // { v0.2s, v1.2s }
-  def _2sAsmOperand : TypedVecListAsmOperand<count, 64, 2, "s">;
-  def "2s" : TypedVecListRegOperand<Reg64, 2, "s"> {
-    let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_2sAsmOperand");
-  }
-
-  // { v0.1d, v1.1d }
-  def _1dAsmOperand : TypedVecListAsmOperand<count, 64, 1, "d">;
-  def "1d" : TypedVecListRegOperand<Reg64, 1, "d"> {
-    let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_1dAsmOperand");
-  }
-
-  // 128-bit register lists with explicit type
-
-  // { v0.16b, v1.16b }
-  def _16bAsmOperand : TypedVecListAsmOperand<count, 128, 16, "b">;
-  def "16b" : TypedVecListRegOperand<Reg128, 16, "b"> {
-    let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_16bAsmOperand");
-  }
-
-  // { v0.8h, v1.8h }
-  def _8hAsmOperand : TypedVecListAsmOperand<count, 128, 8, "h">;
-  def "8h" : TypedVecListRegOperand<Reg128, 8, "h"> {
-    let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_8hAsmOperand");
-  }
-
-  // { v0.4s, v1.4s }
-  def _4sAsmOperand : TypedVecListAsmOperand<count, 128, 4, "s">;
-  def "4s" : TypedVecListRegOperand<Reg128, 4, "s"> {
-    let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_4sAsmOperand");
-  }
-
-  // { v0.2d, v1.2d }
-  def _2dAsmOperand : TypedVecListAsmOperand<count, 128, 2, "d">;
-  def "2d" : TypedVecListRegOperand<Reg128, 2, "d"> {
-    let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_2dAsmOperand");
-  }
-
-  // { v0.b, v1.b }
-  def _bAsmOperand : TypedVecListAsmOperand<count, 128, 0, "b">;
-  def "b" : TypedVecListRegOperand<Reg128, 0, "b"> {
-    let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_bAsmOperand");
-  }
-
-  // { v0.h, v1.h }
-  def _hAsmOperand : TypedVecListAsmOperand<count, 128, 0, "h">;
-  def "h" : TypedVecListRegOperand<Reg128, 0, "h"> {
-    let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_hAsmOperand");
-  }
-
-  // { v0.s, v1.s }
-  def _sAsmOperand : TypedVecListAsmOperand<count, 128, 0, "s">;
-  def "s" : TypedVecListRegOperand<Reg128, 0, "s"> {
-    let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_sAsmOperand");
-  }
-
-  // { v0.d, v1.d }
-  def _dAsmOperand : TypedVecListAsmOperand<count, 128, 0, "d">;
-  def "d" : TypedVecListRegOperand<Reg128, 0, "d"> {
-    let ParserMatchClass = !cast<AsmOperandClass>(NAME # "_dAsmOperand");
-  }
-
-
-}
-
-defm VecListOne   : VectorList<1, FPR64, FPR128>;
-defm VecListTwo   : VectorList<2, DD,    QQ>;
-defm VecListThree : VectorList<3, DDD,   QQQ>;
-defm VecListFour  : VectorList<4, DDDD,  QQQQ>;
-
-
-// Register operand versions of the scalar FP registers.
-def FPR16Op : RegisterOperand<FPR16, "printOperand">;
-def FPR32Op : RegisterOperand<FPR32, "printOperand">;
-def FPR64Op : RegisterOperand<FPR64, "printOperand">;
-def FPR128Op : RegisterOperand<FPR128, "printOperand">;
diff --git a/lib/Target/ARM64/ARM64Schedule.td b/lib/Target/ARM64/ARM64Schedule.td
deleted file mode 100644
index 52f9262..0000000
--- a/lib/Target/ARM64/ARM64Schedule.td
+++ /dev/null
@@ -1,92 +0,0 @@
-//===-- ARMSchedule.td - ARM Scheduling Definitions --------*- tablegen -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-// Define TII for use in SchedVariant Predicates.
-// const MachineInstr *MI and const TargetSchedModel *SchedModel
-// are defined by default.
-def : PredicateProlog<[{
-  const ARM64InstrInfo *TII =
-    static_cast<const ARM64InstrInfo*>(SchedModel->getInstrInfo());
-  (void)TII;
-}]>;
-
-// ARM64 Scheduler Definitions
-
-def WriteImm       : SchedWrite; // MOVN, MOVZ
-// TODO: Provide variants for MOV32/64imm Pseudos that dynamically
-// select the correct sequence of WriteImms.
-
-def WriteI         : SchedWrite; // ALU
-def WriteISReg     : SchedWrite; // ALU of Shifted-Reg
-def WriteIEReg     : SchedWrite; // ALU of Extended-Reg
-def WriteExtr      : SchedWrite; // EXTR shifts a reg pair
-def ReadExtrHi     : SchedRead;  // Read the high reg of the EXTR pair
-def WriteIS        : SchedWrite; // Shift/Scale
-def WriteID32      : SchedWrite; // 32-bit Divide
-def WriteID64      : SchedWrite; // 64-bit Divide
-def WriteIM32      : SchedWrite; // 32-bit Multiply
-def WriteIM64      : SchedWrite; // 64-bit Multiply
-def WriteBr        : SchedWrite; // Branch
-def WriteBrReg     : SchedWrite; // Indirect Branch
-
-def WriteLD        : SchedWrite; // Load from base addr plus immediate offset
-def WriteST        : SchedWrite; // Store to base addr plus immediate offset
-def WriteSTP       : SchedWrite; // Store a register pair.
-def WriteAdr       : SchedWrite; // Address pre/post increment.
-
-def WriteLDIdx : SchedWrite; // Load from a register index (maybe scaled).
-def WriteSTIdx : SchedWrite; // Store to a register index (maybe scaled).
-def ReadAdrBase : SchedRead; // Read the base resister of a reg-offset LD/ST.
-
-// ScaledIdxPred is true if a WriteLDIdx operand will be
-// scaled. Subtargets can use this to dynamically select resources and
-// latency for WriteLDIdx and ReadAdrBase.
-def ScaledIdxPred : SchedPredicate<[{TII->isScaledAddr(MI)}]>;
-
-// Serialized two-level address load.
-// EXAMPLE: LOADGot
-def WriteLDAdr : WriteSequence<[WriteAdr, WriteLD]>;
-
-// Serialized two-level address lookup.
-// EXAMPLE: MOVaddr...
-def WriteAdrAdr : WriteSequence<[WriteAdr, WriteAdr]>;
-
-// The second register of a load-pair.
-// LDP,LDPSW,LDNP,LDXP,LDAXP
-def WriteLDHi : SchedWrite;
-
-// Store-exclusive is a store followed by a dependent load.
-def WriteSTX : WriteSequence<[WriteST, WriteLD]>;
-
-def WriteSys     : SchedWrite; // Long, variable latency system ops.
-def WriteBarrier : SchedWrite; // Memory barrier.
-def WriteHint    : SchedWrite; // Hint instruction.
-
-def WriteF       : SchedWrite; // General floating-point ops.
-def WriteFCmp    : SchedWrite; // Floating-point compare.
-def WriteFCvt    : SchedWrite; // Float conversion.
-def WriteFCopy   : SchedWrite; // Float-int register copy.
-def WriteFImm    : SchedWrite; // Floating-point immediate.
-def WriteFMul    : SchedWrite; // Floating-point multiply.
-def WriteFDiv    : SchedWrite; // Floating-point division.
-
-def WriteV   : SchedWrite; // Vector ops.
-def WriteVLD : SchedWrite; // Vector loads.
-def WriteVST : SchedWrite; // Vector stores.
-
-// Read the unwritten lanes of the VLD's destination registers.
-def ReadVLD : SchedRead;
-
-// Sequential vector load and shuffle.
-def WriteVLDShuffle     : WriteSequence<[WriteVLD, WriteV]>;
-def WriteVLDPairShuffle : WriteSequence<[WriteVLD, WriteV, WriteV]>;
-
-// Store a shuffled vector.
-def WriteVSTShuffle : WriteSequence<[WriteV, WriteVST]>;
-def WriteVSTPairShuffle : WriteSequence<[WriteV, WriteV, WriteVST]>;
diff --git a/lib/Target/ARM64/ARM64SelectionDAGInfo.cpp b/lib/Target/ARM64/ARM64SelectionDAGInfo.cpp
deleted file mode 100644
index 79d507f..0000000
--- a/lib/Target/ARM64/ARM64SelectionDAGInfo.cpp
+++ /dev/null
@@ -1,57 +0,0 @@
-//===-- ARM64SelectionDAGInfo.cpp - ARM64 SelectionDAG Info ---------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the ARM64SelectionDAGInfo class.
-//
-//===----------------------------------------------------------------------===//
-
-#define DEBUG_TYPE "arm64-selectiondag-info"
-#include "ARM64TargetMachine.h"
-using namespace llvm;
-
-ARM64SelectionDAGInfo::ARM64SelectionDAGInfo(const TargetMachine &TM)
-    : TargetSelectionDAGInfo(TM),
-      Subtarget(&TM.getSubtarget<ARM64Subtarget>()) {}
-
-ARM64SelectionDAGInfo::~ARM64SelectionDAGInfo() {}
-
-SDValue ARM64SelectionDAGInfo::EmitTargetCodeForMemset(
-    SelectionDAG &DAG, SDLoc dl, SDValue Chain, SDValue Dst, SDValue Src,
-    SDValue Size, unsigned Align, bool isVolatile,
-    MachinePointerInfo DstPtrInfo) const {
-  // Check to see if there is a specialized entry-point for memory zeroing.
-  ConstantSDNode *V = dyn_cast<ConstantSDNode>(Src);
-  ConstantSDNode *SizeValue = dyn_cast<ConstantSDNode>(Size);
-  const char *bzeroEntry =
-      (V && V->isNullValue()) ? Subtarget->getBZeroEntry() : 0;
-  // For small size (< 256), it is not beneficial to use bzero
-  // instead of memset.
-  if (bzeroEntry && (!SizeValue || SizeValue->getZExtValue() > 256)) {
-    const ARM64TargetLowering &TLI = *static_cast<const ARM64TargetLowering *>(
-                                          DAG.getTarget().getTargetLowering());
-
-    EVT IntPtr = TLI.getPointerTy();
-    Type *IntPtrTy = getDataLayout()->getIntPtrType(*DAG.getContext());
-    TargetLowering::ArgListTy Args;
-    TargetLowering::ArgListEntry Entry;
-    Entry.Node = Dst;
-    Entry.Ty = IntPtrTy;
-    Args.push_back(Entry);
-    Entry.Node = Size;
-    Args.push_back(Entry);
-    TargetLowering::CallLoweringInfo CLI(
-        Chain, Type::getVoidTy(*DAG.getContext()), false, false, false, false,
-        0, CallingConv::C, /*isTailCall=*/false,
-        /*doesNotRet=*/false, /*isReturnValueUsed=*/false,
-        DAG.getExternalSymbol(bzeroEntry, IntPtr), Args, DAG, dl);
-    std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
-    return CallResult.second;
-  }
-  return SDValue();
-}
diff --git a/lib/Target/ARM64/ARM64SelectionDAGInfo.h b/lib/Target/ARM64/ARM64SelectionDAGInfo.h
deleted file mode 100644
index 770775f..0000000
--- a/lib/Target/ARM64/ARM64SelectionDAGInfo.h
+++ /dev/null
@@ -1,37 +0,0 @@
-//===-- ARM64SelectionDAGInfo.h - ARM64 SelectionDAG Info -------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines the ARM64 subclass for TargetSelectionDAGInfo.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef ARM64SELECTIONDAGINFO_H
-#define ARM64SELECTIONDAGINFO_H
-
-#include "llvm/Target/TargetSelectionDAGInfo.h"
-
-namespace llvm {
-
-class ARM64SelectionDAGInfo : public TargetSelectionDAGInfo {
-  /// Subtarget - Keep a pointer to the ARMSubtarget around so that we can
-  /// make the right decision when generating code for different targets.
-  const ARM64Subtarget *Subtarget;
-
-public:
-  explicit ARM64SelectionDAGInfo(const TargetMachine &TM);
-  ~ARM64SelectionDAGInfo();
-
-  SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl, SDValue Chain,
-                                  SDValue Dst, SDValue Src, SDValue Size,
-                                  unsigned Align, bool isVolatile,
-                                  MachinePointerInfo DstPtrInfo) const override;
-};
-}
-
-#endif
diff --git a/lib/Target/ARM64/ARM64Subtarget.cpp b/lib/Target/ARM64/ARM64Subtarget.cpp
deleted file mode 100644
index 14b5444..0000000
--- a/lib/Target/ARM64/ARM64Subtarget.cpp
+++ /dev/null
@@ -1,100 +0,0 @@
-//===-- ARM64Subtarget.cpp - ARM64 Subtarget Information --------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the ARM64 specific subclass of TargetSubtarget.
-//
-//===----------------------------------------------------------------------===//
-
-#include "ARM64InstrInfo.h"
-#include "ARM64Subtarget.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/CodeGen/MachineScheduler.h"
-#include "llvm/IR/GlobalValue.h"
-#include "llvm/Support/TargetRegistry.h"
-
-#define GET_SUBTARGETINFO_CTOR
-#define GET_SUBTARGETINFO_TARGET_DESC
-#include "ARM64GenSubtargetInfo.inc"
-
-using namespace llvm;
-
-ARM64Subtarget::ARM64Subtarget(const std::string &TT, const std::string &CPU,
-                               const std::string &FS)
-    : ARM64GenSubtargetInfo(TT, CPU, FS), HasZeroCycleRegMove(false),
-      HasZeroCycleZeroing(false), CPUString(CPU), TargetTriple(TT) {
-  // Determine default and user-specified characteristics
-
-  if (CPUString.empty())
-    // We default to Cyclone for now.
-    CPUString = "cyclone";
-
-  ParseSubtargetFeatures(CPUString, FS);
-}
-
-/// ClassifyGlobalReference - Find the target operand flags that describe
-/// how a global value should be referenced for the current subtarget.
-unsigned char
-ARM64Subtarget::ClassifyGlobalReference(const GlobalValue *GV,
-                                        const TargetMachine &TM) const {
-
-  // Determine whether this is a reference to a definition or a declaration.
-  // Materializable GVs (in JIT lazy compilation mode) do not require an extra
-  // load from stub.
-  bool isDecl = GV->hasAvailableExternallyLinkage();
-  if (GV->isDeclaration() && !GV->isMaterializable())
-    isDecl = true;
-
-  // MachO large model always goes via a GOT, simply to get a single 8-byte
-  // absolute relocation on all global addresses.
-  if (TM.getCodeModel() == CodeModel::Large && isTargetMachO())
-    return ARM64II::MO_GOT;
-
-  // The small code mode's direct accesses use ADRP, which cannot necessarily
-  // produce the value 0 (if the code is above 4GB). Therefore they must use the
-  // GOT.
-  if (TM.getCodeModel() == CodeModel::Small && GV->isWeakForLinker() && isDecl)
-    return ARM64II::MO_GOT;
-
-  // If symbol visibility is hidden, the extra load is not needed if
-  // the symbol is definitely defined in the current translation unit.
-
-  // The handling of non-hidden symbols in PIC mode is rather target-dependent:
-  //   + On MachO, if the symbol is defined in this module the GOT can be
-  //     skipped.
-  //   + On ELF, the R_AARCH64_COPY relocation means that even symbols actually
-  //     defined could end up in unexpected places. Use a GOT.
-  if (TM.getRelocationModel() != Reloc::Static && GV->hasDefaultVisibility()) {
-    if (isTargetMachO())
-      return (isDecl || GV->isWeakForLinker()) ? ARM64II::MO_GOT
-                                               : ARM64II::MO_NO_FLAG;
-    else
-      return ARM64II::MO_GOT;
-  }
-
-  return ARM64II::MO_NO_FLAG;
-}
-
-/// This function returns the name of a function which has an interface
-/// like the non-standard bzero function, if such a function exists on
-/// the current subtarget and it is considered prefereable over
-/// memset with zero passed as the second argument. Otherwise it
-/// returns null.
-const char *ARM64Subtarget::getBZeroEntry() const {
-  // At the moment, always prefer bzero.
-  return "bzero";
-}
-
-void ARM64Subtarget::overrideSchedPolicy(MachineSchedPolicy &Policy,
-                                         MachineInstr *begin, MachineInstr *end,
-                                         unsigned NumRegionInstrs) const {
-  // LNT run (at least on Cyclone) showed reasonably significant gains for
-  // bi-directional scheduling. 253.perlbmk.
-  Policy.OnlyTopDown = false;
-  Policy.OnlyBottomUp = false;
-}
diff --git a/lib/Target/ARM64/ARM64Subtarget.h b/lib/Target/ARM64/ARM64Subtarget.h
deleted file mode 100644
index 1cbd79e..0000000
--- a/lib/Target/ARM64/ARM64Subtarget.h
+++ /dev/null
@@ -1,87 +0,0 @@
-//=====---- ARM64Subtarget.h - Define Subtarget for the ARM64 -*- C++ -*--====//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file declares the ARM64 specific subclass of TargetSubtarget.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef ARM64SUBTARGET_H
-#define ARM64SUBTARGET_H
-
-#include "llvm/Target/TargetSubtargetInfo.h"
-#include "ARM64RegisterInfo.h"
-#include <string>
-
-#define GET_SUBTARGETINFO_HEADER
-#include "ARM64GenSubtargetInfo.inc"
-
-namespace llvm {
-class GlobalValue;
-class StringRef;
-
-class ARM64Subtarget : public ARM64GenSubtargetInfo {
-protected:
-  // HasZeroCycleRegMove - Has zero-cycle register mov instructions.
-  bool HasZeroCycleRegMove;
-
-  // HasZeroCycleZeroing - Has zero-cycle zeroing instructions.
-  bool HasZeroCycleZeroing;
-
-  /// CPUString - String name of used CPU.
-  std::string CPUString;
-
-  /// TargetTriple - What processor and OS we're targeting.
-  Triple TargetTriple;
-
-public:
-  /// This constructor initializes the data members to match that
-  /// of the specified triple.
-  ARM64Subtarget(const std::string &TT, const std::string &CPU,
-                 const std::string &FS);
-
-  bool enableMachineScheduler() const override { return true; }
-
-  bool hasZeroCycleRegMove() const { return HasZeroCycleRegMove; }
-
-  bool hasZeroCycleZeroing() const { return HasZeroCycleZeroing; }
-
-  bool isTargetDarwin() const { return TargetTriple.isOSDarwin(); }
-
-  bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); }
-
-  bool isTargetMachO() const { return TargetTriple.isOSBinFormatMachO(); }
-
-  bool isCyclone() const { return CPUString == "cyclone"; }
-
-  /// getMaxInlineSizeThreshold - Returns the maximum memset / memcpy size
-  /// that still makes it profitable to inline the call.
-  unsigned getMaxInlineSizeThreshold() const { return 64; }
-
-  /// ParseSubtargetFeatures - Parses features string setting specified
-  /// subtarget options.  Definition of function is auto generated by tblgen.
-  void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
-
-  /// ClassifyGlobalReference - Find the target operand flags that describe
-  /// how a global value should be referenced for the current subtarget.
-  unsigned char ClassifyGlobalReference(const GlobalValue *GV,
-                                        const TargetMachine &TM) const;
-
-  /// This function returns the name of a function which has an interface
-  /// like the non-standard bzero function, if such a function exists on
-  /// the current subtarget and it is considered prefereable over
-  /// memset with zero passed as the second argument. Otherwise it
-  /// returns null.
-  const char *getBZeroEntry() const;
-
-  void overrideSchedPolicy(MachineSchedPolicy &Policy, MachineInstr *begin,
-                           MachineInstr *end, unsigned NumRegionInstrs) const;
-};
-} // End llvm namespace
-
-#endif // ARM64SUBTARGET_H
diff --git a/lib/Target/ARM64/ARM64TargetMachine.cpp b/lib/Target/ARM64/ARM64TargetMachine.cpp
deleted file mode 100644
index 101dc25..0000000
--- a/lib/Target/ARM64/ARM64TargetMachine.cpp
+++ /dev/null
@@ -1,157 +0,0 @@
-//===-- ARM64TargetMachine.cpp - Define TargetMachine for ARM64 -----------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-//
-//===----------------------------------------------------------------------===//
-
-#include "ARM64.h"
-#include "ARM64TargetMachine.h"
-#include "llvm/PassManager.h"
-#include "llvm/CodeGen/Passes.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/TargetRegistry.h"
-#include "llvm/Target/TargetOptions.h"
-#include "llvm/Transforms/Scalar.h"
-using namespace llvm;
-
-static cl::opt<bool> EnableCCMP("arm64-ccmp",
-                                cl::desc("Enable the CCMP formation pass"),
-                                cl::init(true));
-
-static cl::opt<bool> EnableStPairSuppress("arm64-stp-suppress", cl::Hidden,
-                                          cl::desc("Suppress STP for ARM64"),
-                                          cl::init(true));
-
-static cl::opt<bool>
-EnablePromoteConstant("arm64-promote-const", cl::Hidden,
-                      cl::desc("Enable the promote constant pass"),
-                      cl::init(true));
-
-static cl::opt<bool>
-EnableCollectLOH("arm64-collect-loh", cl::Hidden,
-                 cl::desc("Enable the pass that emits the linker"
-                          " optimization hints (LOH)"),
-                 cl::init(true));
-
-extern "C" void LLVMInitializeARM64Target() {
-  // Register the target.
-  RegisterTargetMachine<ARM64TargetMachine> X(TheARM64Target);
-}
-
-/// TargetMachine ctor - Create an ARM64 architecture model.
-///
-ARM64TargetMachine::ARM64TargetMachine(const Target &T, StringRef TT,
-                                       StringRef CPU, StringRef FS,
-                                       const TargetOptions &Options,
-                                       Reloc::Model RM, CodeModel::Model CM,
-                                       CodeGenOpt::Level OL)
-    : LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL),
-      Subtarget(TT, CPU, FS),
-      DL(Subtarget.isTargetMachO() ? "e-m:o-i64:64-i128:128-n32:64-S128"
-                                   : "e-m:e-i64:64-i128:128-n32:64-S128"),
-      InstrInfo(Subtarget), TLInfo(*this), FrameLowering(*this, Subtarget),
-      TSInfo(*this) {
-  initAsmInfo();
-}
-
-namespace {
-/// ARM64 Code Generator Pass Configuration Options.
-class ARM64PassConfig : public TargetPassConfig {
-public:
-  ARM64PassConfig(ARM64TargetMachine *TM, PassManagerBase &PM)
-      : TargetPassConfig(TM, PM) {}
-
-  ARM64TargetMachine &getARM64TargetMachine() const {
-    return getTM<ARM64TargetMachine>();
-  }
-
-  virtual bool addPreISel();
-  virtual bool addInstSelector();
-  virtual bool addILPOpts();
-  virtual bool addPreRegAlloc();
-  virtual bool addPostRegAlloc();
-  virtual bool addPreSched2();
-  virtual bool addPreEmitPass();
-};
-} // namespace
-
-void ARM64TargetMachine::addAnalysisPasses(PassManagerBase &PM) {
-  // Add first the target-independent BasicTTI pass, then our ARM64 pass. This
-  // allows the ARM64 pass to delegate to the target independent layer when
-  // appropriate.
-  PM.add(createBasicTargetTransformInfoPass(this));
-  PM.add(createARM64TargetTransformInfoPass(this));
-}
-
-TargetPassConfig *ARM64TargetMachine::createPassConfig(PassManagerBase &PM) {
-  return new ARM64PassConfig(this, PM);
-}
-
-// Pass Pipeline Configuration
-bool ARM64PassConfig::addPreISel() {
-  // Run promote constant before global merge, so that the promoted constants
-  // get a chance to be merged
-  if (TM->getOptLevel() != CodeGenOpt::None && EnablePromoteConstant)
-    addPass(createARM64PromoteConstantPass());
-  if (TM->getOptLevel() != CodeGenOpt::None)
-    addPass(createGlobalMergePass(TM));
-  if (TM->getOptLevel() != CodeGenOpt::None)
-    addPass(createARM64AddressTypePromotionPass());
-  return false;
-}
-
-bool ARM64PassConfig::addInstSelector() {
-  addPass(createARM64ISelDag(getARM64TargetMachine(), getOptLevel()));
-
-  // For ELF, cleanup any local-dynamic TLS accesses (i.e. combine as many
-  // references to _TLS_MODULE_BASE_ as possible.
-  if (TM->getSubtarget<ARM64Subtarget>().isTargetELF() &&
-      getOptLevel() != CodeGenOpt::None)
-    addPass(createARM64CleanupLocalDynamicTLSPass());
-
-  return false;
-}
-
-bool ARM64PassConfig::addILPOpts() {
-  if (EnableCCMP)
-    addPass(createARM64ConditionalCompares());
-  addPass(&EarlyIfConverterID);
-  if (EnableStPairSuppress)
-    addPass(createARM64StorePairSuppressPass());
-  return true;
-}
-
-bool ARM64PassConfig::addPreRegAlloc() {
-  // Use AdvSIMD scalar instructions whenever profitable.
-  addPass(createARM64AdvSIMDScalar());
-  return true;
-}
-
-bool ARM64PassConfig::addPostRegAlloc() {
-  // Change dead register definitions to refer to the zero register.
-  addPass(createARM64DeadRegisterDefinitions());
-  return true;
-}
-
-bool ARM64PassConfig::addPreSched2() {
-  // Expand some pseudo instructions to allow proper scheduling.
-  addPass(createARM64ExpandPseudoPass());
-  // Use load/store pair instructions when possible.
-  addPass(createARM64LoadStoreOptimizationPass());
-  return true;
-}
-
-bool ARM64PassConfig::addPreEmitPass() {
-  // Relax conditional branch instructions if they're otherwise out of
-  // range of their destination.
-  addPass(createARM64BranchRelaxation());
-  if (TM->getOptLevel() != CodeGenOpt::None && EnableCollectLOH)
-    addPass(createARM64CollectLOHPass());
-  return true;
-}
diff --git a/lib/Target/ARM64/ARM64TargetMachine.h b/lib/Target/ARM64/ARM64TargetMachine.h
deleted file mode 100644
index 8274550..0000000
--- a/lib/Target/ARM64/ARM64TargetMachine.h
+++ /dev/null
@@ -1,69 +0,0 @@
-//===-- ARM64TargetMachine.h - Define TargetMachine for ARM64 ---*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file declares the ARM64 specific subclass of TargetMachine.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef ARM64TARGETMACHINE_H
-#define ARM64TARGETMACHINE_H
-
-#include "ARM64InstrInfo.h"
-#include "ARM64ISelLowering.h"
-#include "ARM64Subtarget.h"
-#include "ARM64FrameLowering.h"
-#include "ARM64SelectionDAGInfo.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/Target/TargetMachine.h"
-#include "llvm/MC/MCStreamer.h"
-
-namespace llvm {
-
-class ARM64TargetMachine : public LLVMTargetMachine {
-protected:
-  ARM64Subtarget Subtarget;
-
-private:
-  const DataLayout DL;
-  ARM64InstrInfo InstrInfo;
-  ARM64TargetLowering TLInfo;
-  ARM64FrameLowering FrameLowering;
-  ARM64SelectionDAGInfo TSInfo;
-
-public:
-  ARM64TargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS,
-                     const TargetOptions &Options, Reloc::Model RM,
-                     CodeModel::Model CM, CodeGenOpt::Level OL);
-
-  const ARM64Subtarget *getSubtargetImpl() const override { return &Subtarget; }
-  const ARM64TargetLowering *getTargetLowering() const override {
-    return &TLInfo;
-  }
-  const DataLayout *getDataLayout() const override { return &DL; }
-  const ARM64FrameLowering *getFrameLowering() const override {
-    return &FrameLowering;
-  }
-  const ARM64InstrInfo *getInstrInfo() const override { return &InstrInfo; }
-  const ARM64RegisterInfo *getRegisterInfo() const override {
-    return &InstrInfo.getRegisterInfo();
-  }
-  const ARM64SelectionDAGInfo *getSelectionDAGInfo() const override {
-    return &TSInfo;
-  }
-
-  // Pass Pipeline Configuration
-  TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
-
-  /// \brief Register ARM64 analysis passes with a pass manager.
-  void addAnalysisPasses(PassManagerBase &PM) override;
-};
-
-} // end namespace llvm
-
-#endif
diff --git a/lib/Target/ARM64/ARM64TargetObjectFile.cpp b/lib/Target/ARM64/ARM64TargetObjectFile.cpp
deleted file mode 100644
index cde01e5..0000000
--- a/lib/Target/ARM64/ARM64TargetObjectFile.cpp
+++ /dev/null
@@ -1,52 +0,0 @@
-//===-- ARM64TargetObjectFile.cpp - ARM64 Object Info ---------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "ARM64TargetObjectFile.h"
-#include "ARM64TargetMachine.h"
-#include "llvm/IR/Mangler.h"
-#include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCStreamer.h"
-#include "llvm/Support/Dwarf.h"
-using namespace llvm;
-using namespace dwarf;
-
-void ARM64_ELFTargetObjectFile::Initialize(MCContext &Ctx,
-                                           const TargetMachine &TM) {
-  TargetLoweringObjectFileELF::Initialize(Ctx, TM);
-  InitializeELF(TM.Options.UseInitArray);
-}
-
-const MCExpr *ARM64_MachoTargetObjectFile::getTTypeGlobalReference(
-    const GlobalValue *GV, unsigned Encoding, Mangler &Mang,
-    const TargetMachine &TM, MachineModuleInfo *MMI,
-    MCStreamer &Streamer) const {
-  // On Darwin, we can reference dwarf symbols with foo@GOT-., which
-  // is an indirect pc-relative reference. The default implementation
-  // won't reference using the GOT, so we need this target-specific
-  // version.
-  if (Encoding & (DW_EH_PE_indirect | DW_EH_PE_pcrel)) {
-    const MCSymbol *Sym = TM.getSymbol(GV, Mang);
-    const MCExpr *Res =
-        MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_GOT, getContext());
-    MCSymbol *PCSym = getContext().CreateTempSymbol();
-    Streamer.EmitLabel(PCSym);
-    const MCExpr *PC = MCSymbolRefExpr::Create(PCSym, getContext());
-    return MCBinaryExpr::CreateSub(Res, PC, getContext());
-  }
-
-  return TargetLoweringObjectFileMachO::getTTypeGlobalReference(
-      GV, Encoding, Mang, TM, MMI, Streamer);
-}
-
-MCSymbol *ARM64_MachoTargetObjectFile::getCFIPersonalitySymbol(
-    const GlobalValue *GV, Mangler &Mang, const TargetMachine &TM,
-    MachineModuleInfo *MMI) const {
-  return TM.getSymbol(GV, Mang);
-}
diff --git a/lib/Target/ARM64/ARM64TargetObjectFile.h b/lib/Target/ARM64/ARM64TargetObjectFile.h
deleted file mode 100644
index 62446f9..0000000
--- a/lib/Target/ARM64/ARM64TargetObjectFile.h
+++ /dev/null
@@ -1,40 +0,0 @@
-//===-- ARM64TargetObjectFile.h - ARM64 Object Info -*- C++ -------------*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_TARGET_ARM64_TARGETOBJECTFILE_H
-#define LLVM_TARGET_ARM64_TARGETOBJECTFILE_H
-
-#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
-#include "llvm/Target/TargetLoweringObjectFile.h"
-
-namespace llvm {
-class ARM64TargetMachine;
-
-/// This implementation is used for AArch64 ELF targets (Linux in particular).
-class ARM64_ELFTargetObjectFile : public TargetLoweringObjectFileELF {
-  void Initialize(MCContext &Ctx, const TargetMachine &TM) override;
-};
-
-/// ARM64_MachoTargetObjectFile - This TLOF implementation is used for Darwin.
-class ARM64_MachoTargetObjectFile : public TargetLoweringObjectFileMachO {
-public:
-  const MCExpr *getTTypeGlobalReference(const GlobalValue *GV,
-                                        unsigned Encoding, Mangler &Mang,
-                                        const TargetMachine &TM,
-                                        MachineModuleInfo *MMI,
-                                        MCStreamer &Streamer) const override;
-
-  MCSymbol *getCFIPersonalitySymbol(const GlobalValue *GV, Mangler &Mang,
-                                    const TargetMachine &TM,
-                                    MachineModuleInfo *MMI) const override;
-};
-
-} // end namespace llvm
-
-#endif
diff --git a/lib/Target/ARM64/ARM64TargetTransformInfo.cpp b/lib/Target/ARM64/ARM64TargetTransformInfo.cpp
deleted file mode 100644
index 9b598d7..0000000
--- a/lib/Target/ARM64/ARM64TargetTransformInfo.cpp
+++ /dev/null
@@ -1,326 +0,0 @@
-//===-- ARM64TargetTransformInfo.cpp - ARM64 specific TTI pass ------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-/// \file
-/// This file implements a TargetTransformInfo analysis pass specific to the
-/// ARM64 target machine. It uses the target's detailed information to provide
-/// more precise answers to certain TTI queries, while letting the target
-/// independent and default TTI implementations handle the rest.
-///
-//===----------------------------------------------------------------------===//
-
-#define DEBUG_TYPE "arm64tti"
-#include "ARM64.h"
-#include "ARM64TargetMachine.h"
-#include "MCTargetDesc/ARM64AddressingModes.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Target/CostTable.h"
-#include "llvm/Target/TargetLowering.h"
-using namespace llvm;
-
-// Declare the pass initialization routine locally as target-specific passes
-// don't havve a target-wide initialization entry point, and so we rely on the
-// pass constructor initialization.
-namespace llvm {
-void initializeARM64TTIPass(PassRegistry &);
-}
-
-namespace {
-
-class ARM64TTI final : public ImmutablePass, public TargetTransformInfo {
-  const ARM64TargetMachine *TM;
-  const ARM64Subtarget *ST;
-  const ARM64TargetLowering *TLI;
-
-  /// Estimate the overhead of scalarizing an instruction. Insert and Extract
-  /// are set if the result needs to be inserted and/or extracted from vectors.
-  unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const;
-
-public:
-  ARM64TTI() : ImmutablePass(ID), TM(0), ST(0), TLI(0) {
-    llvm_unreachable("This pass cannot be directly constructed");
-  }
-
-  ARM64TTI(const ARM64TargetMachine *TM)
-      : ImmutablePass(ID), TM(TM), ST(TM->getSubtargetImpl()),
-        TLI(TM->getTargetLowering()) {
-    initializeARM64TTIPass(*PassRegistry::getPassRegistry());
-  }
-
-  void initializePass() override { pushTTIStack(this); }
-
-  void getAnalysisUsage(AnalysisUsage &AU) const override {
-    TargetTransformInfo::getAnalysisUsage(AU);
-  }
-
-  /// Pass identification.
-  static char ID;
-
-  /// Provide necessary pointer adjustments for the two base classes.
-  void *getAdjustedAnalysisPointer(const void *ID) override {
-    if (ID == &TargetTransformInfo::ID)
-      return (TargetTransformInfo *)this;
-    return this;
-  }
-
-  /// \name Scalar TTI Implementations
-  /// @{
-
-  unsigned getIntImmCost(const APInt &Imm, Type *Ty) const override;
-  PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override;
-
-  /// @}
-
-  /// \name Vector TTI Implementations
-  /// @{
-
-  unsigned getNumberOfRegisters(bool Vector) const override {
-    if (Vector)
-      return 32;
-
-    return 31;
-  }
-
-  unsigned getRegisterBitWidth(bool Vector) const override {
-    if (Vector)
-      return 128;
-
-    return 64;
-  }
-
-  unsigned getMaximumUnrollFactor() const override { return 2; }
-
-  unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const
-      override;
-
-  unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) const
-      override;
-
-  unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty,
-                                  OperandValueKind Opd1Info = OK_AnyValue,
-                                  OperandValueKind Opd2Info = OK_AnyValue) const
-      override;
-
-  unsigned getAddressComputationCost(Type *Ty, bool IsComplex) const override;
-
-  unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) const
-      override;
-
-  unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
-                           unsigned AddressSpace) const override;
-  /// @}
-};
-
-} // end anonymous namespace
-
-INITIALIZE_AG_PASS(ARM64TTI, TargetTransformInfo, "arm64tti",
-                   "ARM64 Target Transform Info", true, true, false)
-char ARM64TTI::ID = 0;
-
-ImmutablePass *
-llvm::createARM64TargetTransformInfoPass(const ARM64TargetMachine *TM) {
-  return new ARM64TTI(TM);
-}
-
-unsigned ARM64TTI::getIntImmCost(const APInt &Imm, Type *Ty) const {
-  assert(Ty->isIntegerTy());
-
-  unsigned BitSize = Ty->getPrimitiveSizeInBits();
-  if (BitSize == 0)
-    return ~0U;
-
-  int64_t Val = Imm.getSExtValue();
-  if (Val == 0 || ARM64_AM::isLogicalImmediate(Val, BitSize))
-    return 1;
-
-  if ((int64_t)Val < 0)
-    Val = ~Val;
-  if (BitSize == 32)
-    Val &= (1LL << 32) - 1;
-
-  unsigned LZ = countLeadingZeros((uint64_t)Val);
-  unsigned Shift = (63 - LZ) / 16;
-  // MOVZ is free so return true for one or fewer MOVK.
-  return (Shift == 0) ? 1 : Shift;
-}
-
-ARM64TTI::PopcntSupportKind ARM64TTI::getPopcntSupport(unsigned TyWidth) const {
-  assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
-  if (TyWidth == 32 || TyWidth == 64)
-    return PSK_FastHardware;
-  // TODO: ARM64TargetLowering::LowerCTPOP() supports 128bit popcount.
-  return PSK_Software;
-}
-
-unsigned ARM64TTI::getCastInstrCost(unsigned Opcode, Type *Dst,
-                                    Type *Src) const {
-  int ISD = TLI->InstructionOpcodeToISD(Opcode);
-  assert(ISD && "Invalid opcode");
-
-  EVT SrcTy = TLI->getValueType(Src);
-  EVT DstTy = TLI->getValueType(Dst);
-
-  if (!SrcTy.isSimple() || !DstTy.isSimple())
-    return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src);
-
-  static const TypeConversionCostTblEntry<MVT> ConversionTbl[] = {
-    // LowerVectorINT_TO_FP:
-    { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
-    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 1 },
-    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 1 },
-    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 1 },
-    { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
-    { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
-    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 1 },
-    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 1 },
-    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 1 },
-    { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
-    // LowerVectorFP_TO_INT
-    { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
-    { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 },
-    { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
-    { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 },
-    { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 1 },
-    { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 1 },
-    { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 4 },
-    { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 4 },
-  };
-
-  int Idx = ConvertCostTableLookup<MVT>(
-      ConversionTbl, array_lengthof(ConversionTbl), ISD, DstTy.getSimpleVT(),
-      SrcTy.getSimpleVT());
-  if (Idx != -1)
-    return ConversionTbl[Idx].Cost;
-
-  return TargetTransformInfo::getCastInstrCost(Opcode, Dst, Src);
-}
-
-unsigned ARM64TTI::getVectorInstrCost(unsigned Opcode, Type *Val,
-                                      unsigned Index) const {
-  assert(Val->isVectorTy() && "This must be a vector type");
-
-  if (Index != -1U) {
-    // Legalize the type.
-    std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Val);
-
-    // This type is legalized to a scalar type.
-    if (!LT.second.isVector())
-      return 0;
-
-    // The type may be split. Normalize the index to the new type.
-    unsigned Width = LT.second.getVectorNumElements();
-    Index = Index % Width;
-
-    // The element at index zero is already inside the vector.
-    if (Index == 0)
-      return 0;
-  }
-
-  // All other insert/extracts cost this much.
-  return 2;
-}
-
-unsigned ARM64TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
-                                          OperandValueKind Opd1Info,
-                                          OperandValueKind Opd2Info) const {
-  // Legalize the type.
-  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Ty);
-
-  int ISD = TLI->InstructionOpcodeToISD(Opcode);
-
-  switch (ISD) {
-  default:
-    return TargetTransformInfo::getArithmeticInstrCost(Opcode, Ty, Opd1Info,
-                                                       Opd2Info);
-  case ISD::ADD:
-  case ISD::MUL:
-  case ISD::XOR:
-  case ISD::OR:
-  case ISD::AND:
-    // These nodes are marked as 'custom' for combining purposes only.
-    // We know that they are legal. See LowerAdd in ISelLowering.
-    return 1 * LT.first;
-  }
-}
-
-unsigned ARM64TTI::getAddressComputationCost(Type *Ty, bool IsComplex) const {
-  // Address computations in vectorized code with non-consecutive addresses will
-  // likely result in more instructions compared to scalar code where the
-  // computation can more often be merged into the index mode. The resulting
-  // extra micro-ops can significantly decrease throughput.
-  unsigned NumVectorInstToHideOverhead = 10;
-
-  if (Ty->isVectorTy() && IsComplex)
-    return NumVectorInstToHideOverhead;
-
-  // In many cases the address computation is not merged into the instruction
-  // addressing mode.
-  return 1;
-}
-
-unsigned ARM64TTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
-                                      Type *CondTy) const {
-
-  int ISD = TLI->InstructionOpcodeToISD(Opcode);
-  // We don't lower vector selects well that are wider than the register width.
-  if (ValTy->isVectorTy() && ISD == ISD::SELECT) {
-    // We would need this many instructions to hide the scalarization happening.
-    unsigned AmortizationCost = 20;
-    static const TypeConversionCostTblEntry<MVT::SimpleValueType>
-    VectorSelectTbl[] = {
-      { ISD::SELECT, MVT::v16i1, MVT::v16i16, 16 * AmortizationCost },
-      { ISD::SELECT, MVT::v8i1, MVT::v8i32, 8 * AmortizationCost },
-      { ISD::SELECT, MVT::v16i1, MVT::v16i32, 16 * AmortizationCost },
-      { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost },
-      { ISD::SELECT, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost },
-      { ISD::SELECT, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost }
-    };
-
-    EVT SelCondTy = TLI->getValueType(CondTy);
-    EVT SelValTy = TLI->getValueType(ValTy);
-    if (SelCondTy.isSimple() && SelValTy.isSimple()) {
-      int Idx =
-          ConvertCostTableLookup(VectorSelectTbl, ISD, SelCondTy.getSimpleVT(),
-                                 SelValTy.getSimpleVT());
-      if (Idx != -1)
-        return VectorSelectTbl[Idx].Cost;
-    }
-  }
-  return TargetTransformInfo::getCmpSelInstrCost(Opcode, ValTy, CondTy);
-}
-
-unsigned ARM64TTI::getMemoryOpCost(unsigned Opcode, Type *Src,
-                                   unsigned Alignment,
-                                   unsigned AddressSpace) const {
-  std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(Src);
-
-  if (Opcode == Instruction::Store && Src->isVectorTy() && Alignment != 16 &&
-      Src->getVectorElementType()->isIntegerTy(64)) {
-    // Unaligned stores are extremely inefficient. We don't split
-    // unaligned v2i64 stores because the negative impact that has shown in
-    // practice on inlined memcpy code.
-    // We make v2i64 stores expensive so that we will only vectorize if there
-    // are 6 other instructions getting vectorized.
-    unsigned AmortizationCost = 6;
-
-    return LT.first * 2 * AmortizationCost;
-  }
-
-  if (Src->isVectorTy() && Src->getVectorElementType()->isIntegerTy(8) &&
-      Src->getVectorNumElements() < 8) {
-    // We scalarize the loads/stores because there is not v.4b register and we
-    // have to promote the elements to v.4h.
-    unsigned NumVecElts = Src->getVectorNumElements();
-    unsigned NumVectorizableInstsToAmortize = NumVecElts * 2;
-    // We generate 2 instructions per vector element.
-    return NumVectorizableInstsToAmortize * NumVecElts * 2;
-  }
-
-  return LT.first;
-}
diff --git a/lib/Target/ARM64/AsmParser/ARM64AsmParser.cpp b/lib/Target/ARM64/AsmParser/ARM64AsmParser.cpp
deleted file mode 100644
index 38a61d8..0000000
--- a/lib/Target/ARM64/AsmParser/ARM64AsmParser.cpp
+++ /dev/null
@@ -1,4832 +0,0 @@
-//===-- ARM64AsmParser.cpp - Parse ARM64 assembly to MCInst instructions --===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "MCTargetDesc/ARM64AddressingModes.h"
-#include "MCTargetDesc/ARM64BaseInfo.h"
-#include "MCTargetDesc/ARM64MCExpr.h"
-#include "llvm/MC/MCParser/MCAsmLexer.h"
-#include "llvm/MC/MCParser/MCAsmParser.h"
-#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
-#include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCInst.h"
-#include "llvm/MC/MCRegisterInfo.h"
-#include "llvm/MC/MCStreamer.h"
-#include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/MC/MCSymbol.h"
-#include "llvm/MC/MCTargetAsmParser.h"
-#include "llvm/Support/SourceMgr.h"
-#include "llvm/Support/TargetRegistry.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/ADT/SmallString.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/StringSwitch.h"
-#include "llvm/ADT/Twine.h"
-#include <cstdio>
-using namespace llvm;
-
-namespace {
-
-class ARM64Operand;
-
-class ARM64AsmParser : public MCTargetAsmParser {
-public:
-  typedef SmallVectorImpl<MCParsedAsmOperand *> OperandVector;
-
-private:
-  StringRef Mnemonic; ///< Instruction mnemonic.
-  MCSubtargetInfo &STI;
-  MCAsmParser &Parser;
-
-  MCAsmParser &getParser() const { return Parser; }
-  MCAsmLexer &getLexer() const { return Parser.getLexer(); }
-
-  SMLoc getLoc() const { return Parser.getTok().getLoc(); }
-
-  bool parseSysAlias(StringRef Name, SMLoc NameLoc, OperandVector &Operands);
-  unsigned parseCondCodeString(StringRef Cond);
-  bool parseCondCode(OperandVector &Operands, bool invertCondCode);
-  int tryParseRegister();
-  int tryMatchVectorRegister(StringRef &Kind);
-  bool parseOptionalShift(OperandVector &Operands);
-  bool parseOptionalExtend(OperandVector &Operands);
-  bool parseRegister(OperandVector &Operands);
-  bool parseMemory(OperandVector &Operands);
-  bool parseSymbolicImmVal(const MCExpr *&ImmVal);
-  bool parseVectorList(OperandVector &Operands);
-  bool parseOperand(OperandVector &Operands, bool isCondCode,
-                    bool invertCondCode);
-
-  void Warning(SMLoc L, const Twine &Msg) { Parser.Warning(L, Msg); }
-  bool Error(SMLoc L, const Twine &Msg) { return Parser.Error(L, Msg); }
-  bool showMatchError(SMLoc Loc, unsigned ErrCode);
-
-  bool parseDirectiveWord(unsigned Size, SMLoc L);
-  bool parseDirectiveTLSDescCall(SMLoc L);
-
-  bool parseDirectiveLOH(StringRef LOH, SMLoc L);
-
-  bool validateInstruction(MCInst &Inst, SmallVectorImpl<SMLoc> &Loc);
-  bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
-                               OperandVector &Operands, MCStreamer &Out,
-                               unsigned &ErrorInfo, bool MatchingInlineAsm);
-/// @name Auto-generated Match Functions
-/// {
-
-#define GET_ASSEMBLER_HEADER
-#include "ARM64GenAsmMatcher.inc"
-
-  /// }
-
-  OperandMatchResultTy tryParseNoIndexMemory(OperandVector &Operands);
-  OperandMatchResultTy tryParseBarrierOperand(OperandVector &Operands);
-  OperandMatchResultTy tryParseSystemRegister(OperandVector &Operands);
-  OperandMatchResultTy tryParseCPSRField(OperandVector &Operands);
-  OperandMatchResultTy tryParseSysCROperand(OperandVector &Operands);
-  OperandMatchResultTy tryParsePrefetch(OperandVector &Operands);
-  OperandMatchResultTy tryParseAdrpLabel(OperandVector &Operands);
-  OperandMatchResultTy tryParseAdrLabel(OperandVector &Operands);
-  OperandMatchResultTy tryParseFPImm(OperandVector &Operands);
-  bool tryParseVectorRegister(OperandVector &Operands);
-
-public:
-  enum ARM64MatchResultTy {
-    Match_InvalidSuffix = FIRST_TARGET_MATCH_RESULT_TY,
-#define GET_OPERAND_DIAGNOSTIC_TYPES
-#include "ARM64GenAsmMatcher.inc"
-  };
-  ARM64AsmParser(MCSubtargetInfo &_STI, MCAsmParser &_Parser,
-                 const MCInstrInfo &MII)
-      : MCTargetAsmParser(), STI(_STI), Parser(_Parser) {
-    MCAsmParserExtension::Initialize(_Parser);
-  }
-
-  virtual bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
-                                SMLoc NameLoc, OperandVector &Operands);
-  virtual bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc);
-  virtual bool ParseDirective(AsmToken DirectiveID);
-  unsigned validateTargetOperandClass(MCParsedAsmOperand *Op, unsigned Kind);
-
-  static bool classifySymbolRef(const MCExpr *Expr,
-                                ARM64MCExpr::VariantKind &ELFRefKind,
-                                MCSymbolRefExpr::VariantKind &DarwinRefKind,
-                                const MCConstantExpr *&Addend);
-};
-} // end anonymous namespace
-
-namespace {
-
-/// ARM64Operand - Instances of this class represent a parsed ARM64 machine
-/// instruction.
-class ARM64Operand : public MCParsedAsmOperand {
-public:
-  enum MemIdxKindTy {
-    ImmediateOffset, // pre-indexed, no writeback
-    RegisterOffset   // register offset, with optional extend
-  };
-
-private:
-  enum KindTy {
-    k_Immediate,
-    k_Memory,
-    k_Register,
-    k_VectorList,
-    k_VectorIndex,
-    k_Token,
-    k_SysCR,
-    k_Prefetch,
-    k_Shifter,
-    k_Extend,
-    k_FPImm,
-    k_Barrier,
-    k_SystemRegister,
-    k_CPSRField
-  } Kind;
-
-  SMLoc StartLoc, EndLoc, OffsetLoc;
-
-  struct TokOp {
-    const char *Data;
-    unsigned Length;
-    bool IsSuffix; // Is the operand actually a suffix on the mnemonic.
-  };
-
-  struct RegOp {
-    unsigned RegNum;
-    bool isVector;
-  };
-
-  struct VectorListOp {
-    unsigned RegNum;
-    unsigned Count;
-    unsigned NumElements;
-    unsigned ElementKind;
-  };
-
-  struct VectorIndexOp {
-    unsigned Val;
-  };
-
-  struct ImmOp {
-    const MCExpr *Val;
-  };
-
-  struct FPImmOp {
-    unsigned Val; // Encoded 8-bit representation.
-  };
-
-  struct BarrierOp {
-    unsigned Val; // Not the enum since not all values have names.
-  };
-
-  struct SystemRegisterOp {
-    // 16-bit immediate, usually from the ARM64SYS::SystermRegister enum,
-    // but not limited to those values.
-    uint16_t Val;
-  };
-
-  struct CPSRFieldOp {
-    ARM64SYS::CPSRField Field;
-  };
-
-  struct SysCRImmOp {
-    unsigned Val;
-  };
-
-  struct PrefetchOp {
-    unsigned Val;
-  };
-
-  struct ShifterOp {
-    unsigned Val;
-  };
-
-  struct ExtendOp {
-    unsigned Val;
-  };
-
-  // This is for all forms of ARM64 address expressions
-  struct MemOp {
-    unsigned BaseRegNum, OffsetRegNum;
-    ARM64_AM::ExtendType ExtType;
-    unsigned ShiftVal;
-    bool ExplicitShift;
-    const MCExpr *OffsetImm;
-    MemIdxKindTy Mode;
-  };
-
-  union {
-    struct TokOp Tok;
-    struct RegOp Reg;
-    struct VectorListOp VectorList;
-    struct VectorIndexOp VectorIndex;
-    struct ImmOp Imm;
-    struct FPImmOp FPImm;
-    struct BarrierOp Barrier;
-    struct SystemRegisterOp SystemRegister;
-    struct CPSRFieldOp CPSRField;
-    struct SysCRImmOp SysCRImm;
-    struct PrefetchOp Prefetch;
-    struct ShifterOp Shifter;
-    struct ExtendOp Extend;
-    struct MemOp Mem;
-  };
-
-  // Keep the MCContext around as the MCExprs may need manipulated during
-  // the add<>Operands() calls.
-  MCContext &Ctx;
-
-  ARM64Operand(KindTy K, MCContext &_Ctx)
-      : MCParsedAsmOperand(), Kind(K), Ctx(_Ctx) {}
-
-public:
-  ARM64Operand(const ARM64Operand &o) : MCParsedAsmOperand(), Ctx(o.Ctx) {
-    Kind = o.Kind;
-    StartLoc = o.StartLoc;
-    EndLoc = o.EndLoc;
-    switch (Kind) {
-    case k_Token:
-      Tok = o.Tok;
-      break;
-    case k_Immediate:
-      Imm = o.Imm;
-      break;
-    case k_FPImm:
-      FPImm = o.FPImm;
-      break;
-    case k_Barrier:
-      Barrier = o.Barrier;
-      break;
-    case k_SystemRegister:
-      SystemRegister = o.SystemRegister;
-      break;
-    case k_CPSRField:
-      CPSRField = o.CPSRField;
-      break;
-    case k_Register:
-      Reg = o.Reg;
-      break;
-    case k_VectorList:
-      VectorList = o.VectorList;
-      break;
-    case k_VectorIndex:
-      VectorIndex = o.VectorIndex;
-      break;
-    case k_SysCR:
-      SysCRImm = o.SysCRImm;
-      break;
-    case k_Prefetch:
-      Prefetch = o.Prefetch;
-      break;
-    case k_Memory:
-      Mem = o.Mem;
-      break;
-    case k_Shifter:
-      Shifter = o.Shifter;
-      break;
-    case k_Extend:
-      Extend = o.Extend;
-      break;
-    }
-  }
-
-  /// getStartLoc - Get the location of the first token of this operand.
-  SMLoc getStartLoc() const { return StartLoc; }
-  /// getEndLoc - Get the location of the last token of this operand.
-  SMLoc getEndLoc() const { return EndLoc; }
-  /// getOffsetLoc - Get the location of the offset of this memory operand.
-  SMLoc getOffsetLoc() const { return OffsetLoc; }
-
-  StringRef getToken() const {
-    assert(Kind == k_Token && "Invalid access!");
-    return StringRef(Tok.Data, Tok.Length);
-  }
-
-  bool isTokenSuffix() const {
-    assert(Kind == k_Token && "Invalid access!");
-    return Tok.IsSuffix;
-  }
-
-  const MCExpr *getImm() const {
-    assert(Kind == k_Immediate && "Invalid access!");
-    return Imm.Val;
-  }
-
-  unsigned getFPImm() const {
-    assert(Kind == k_FPImm && "Invalid access!");
-    return FPImm.Val;
-  }
-
-  unsigned getBarrier() const {
-    assert(Kind == k_Barrier && "Invalid access!");
-    return Barrier.Val;
-  }
-
-  uint16_t getSystemRegister() const {
-    assert(Kind == k_SystemRegister && "Invalid access!");
-    return SystemRegister.Val;
-  }
-
-  ARM64SYS::CPSRField getCPSRField() const {
-    assert(Kind == k_CPSRField && "Invalid access!");
-    return CPSRField.Field;
-  }
-
-  unsigned getReg() const {
-    assert(Kind == k_Register && "Invalid access!");
-    return Reg.RegNum;
-  }
-
-  unsigned getVectorListStart() const {
-    assert(Kind == k_VectorList && "Invalid access!");
-    return VectorList.RegNum;
-  }
-
-  unsigned getVectorListCount() const {
-    assert(Kind == k_VectorList && "Invalid access!");
-    return VectorList.Count;
-  }
-
-  unsigned getVectorIndex() const {
-    assert(Kind == k_VectorIndex && "Invalid access!");
-    return VectorIndex.Val;
-  }
-
-  unsigned getSysCR() const {
-    assert(Kind == k_SysCR && "Invalid access!");
-    return SysCRImm.Val;
-  }
-
-  unsigned getPrefetch() const {
-    assert(Kind == k_Prefetch && "Invalid access!");
-    return Prefetch.Val;
-  }
-
-  unsigned getShifter() const {
-    assert(Kind == k_Shifter && "Invalid access!");
-    return Shifter.Val;
-  }
-
-  unsigned getExtend() const {
-    assert(Kind == k_Extend && "Invalid access!");
-    return Extend.Val;
-  }
-
-  bool isImm() const { return Kind == k_Immediate; }
-  bool isSImm9() const {
-    if (!isImm())
-      return false;
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    if (!MCE)
-      return false;
-    int64_t Val = MCE->getValue();
-    return (Val >= -256 && Val < 256);
-  }
-  bool isSImm7s4() const {
-    if (!isImm())
-      return false;
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    if (!MCE)
-      return false;
-    int64_t Val = MCE->getValue();
-    return (Val >= -256 && Val <= 252 && (Val & 3) == 0);
-  }
-  bool isSImm7s8() const {
-    if (!isImm())
-      return false;
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    if (!MCE)
-      return false;
-    int64_t Val = MCE->getValue();
-    return (Val >= -512 && Val <= 504 && (Val & 7) == 0);
-  }
-  bool isSImm7s16() const {
-    if (!isImm())
-      return false;
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    if (!MCE)
-      return false;
-    int64_t Val = MCE->getValue();
-    return (Val >= -1024 && Val <= 1008 && (Val & 15) == 0);
-  }
-  bool isImm0_7() const {
-    if (!isImm())
-      return false;
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    if (!MCE)
-      return false;
-    int64_t Val = MCE->getValue();
-    return (Val >= 0 && Val < 8);
-  }
-  bool isImm1_8() const {
-    if (!isImm())
-      return false;
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    if (!MCE)
-      return false;
-    int64_t Val = MCE->getValue();
-    return (Val > 0 && Val < 9);
-  }
-  bool isImm0_15() const {
-    if (!isImm())
-      return false;
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    if (!MCE)
-      return false;
-    int64_t Val = MCE->getValue();
-    return (Val >= 0 && Val < 16);
-  }
-  bool isImm1_16() const {
-    if (!isImm())
-      return false;
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    if (!MCE)
-      return false;
-    int64_t Val = MCE->getValue();
-    return (Val > 0 && Val < 17);
-  }
-  bool isImm0_31() const {
-    if (!isImm())
-      return false;
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    if (!MCE)
-      return false;
-    int64_t Val = MCE->getValue();
-    return (Val >= 0 && Val < 32);
-  }
-  bool isImm1_31() const {
-    if (!isImm())
-      return false;
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    if (!MCE)
-      return false;
-    int64_t Val = MCE->getValue();
-    return (Val >= 1 && Val < 32);
-  }
-  bool isImm1_32() const {
-    if (!isImm())
-      return false;
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    if (!MCE)
-      return false;
-    int64_t Val = MCE->getValue();
-    return (Val >= 1 && Val < 33);
-  }
-  bool isImm0_63() const {
-    if (!isImm())
-      return false;
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    if (!MCE)
-      return false;
-    int64_t Val = MCE->getValue();
-    return (Val >= 0 && Val < 64);
-  }
-  bool isImm1_63() const {
-    if (!isImm())
-      return false;
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    if (!MCE)
-      return false;
-    int64_t Val = MCE->getValue();
-    return (Val >= 1 && Val < 64);
-  }
-  bool isImm1_64() const {
-    if (!isImm())
-      return false;
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    if (!MCE)
-      return false;
-    int64_t Val = MCE->getValue();
-    return (Val >= 1 && Val < 65);
-  }
-  bool isImm0_127() const {
-    if (!isImm())
-      return false;
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    if (!MCE)
-      return false;
-    int64_t Val = MCE->getValue();
-    return (Val >= 0 && Val < 128);
-  }
-  bool isImm0_255() const {
-    if (!isImm())
-      return false;
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    if (!MCE)
-      return false;
-    int64_t Val = MCE->getValue();
-    return (Val >= 0 && Val < 256);
-  }
-  bool isImm0_65535() const {
-    if (!isImm())
-      return false;
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    if (!MCE)
-      return false;
-    int64_t Val = MCE->getValue();
-    return (Val >= 0 && Val < 65536);
-  }
-  bool isLogicalImm32() const {
-    if (!isImm())
-      return false;
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    if (!MCE)
-      return false;
-    return ARM64_AM::isLogicalImmediate(MCE->getValue(), 32);
-  }
-  bool isLogicalImm64() const {
-    if (!isImm())
-      return false;
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    if (!MCE)
-      return false;
-    return ARM64_AM::isLogicalImmediate(MCE->getValue(), 64);
-  }
-  bool isSIMDImmType10() const {
-    if (!isImm())
-      return false;
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    if (!MCE)
-      return false;
-    return ARM64_AM::isAdvSIMDModImmType10(MCE->getValue());
-  }
-  bool isBranchTarget26() const {
-    if (!isImm())
-      return false;
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    if (!MCE)
-      return true;
-    int64_t Val = MCE->getValue();
-    if (Val & 0x3)
-      return false;
-    return (Val >= -(0x2000000 << 2) && Val <= (0x1ffffff << 2));
-  }
-  bool isBranchTarget19() const {
-    if (!isImm())
-      return false;
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    if (!MCE)
-      return true;
-    int64_t Val = MCE->getValue();
-    if (Val & 0x3)
-      return false;
-    return (Val >= -(0x40000 << 2) && Val <= (0x3ffff << 2));
-  }
-  bool isBranchTarget14() const {
-    if (!isImm())
-      return false;
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    if (!MCE)
-      return true;
-    int64_t Val = MCE->getValue();
-    if (Val & 0x3)
-      return false;
-    return (Val >= -(0x2000 << 2) && Val <= (0x1fff << 2));
-  }
-
-  bool isMovWSymbol(ArrayRef<ARM64MCExpr::VariantKind> AllowedModifiers) const {
-    if (!isImm())
-      return false;
-
-    ARM64MCExpr::VariantKind ELFRefKind;
-    MCSymbolRefExpr::VariantKind DarwinRefKind;
-    const MCConstantExpr *Addend;
-    if (!ARM64AsmParser::classifySymbolRef(getImm(), ELFRefKind, DarwinRefKind,
-                                           Addend)) {
-      return false;
-    }
-    if (DarwinRefKind != MCSymbolRefExpr::VK_None)
-      return false;
-
-    for (unsigned i = 0; i != AllowedModifiers.size(); ++i) {
-      if (ELFRefKind == AllowedModifiers[i])
-        return Addend == 0;
-    }
-
-    return false;
-  }
-
-  bool isMovZSymbolG3() const {
-    static ARM64MCExpr::VariantKind Variants[] = { ARM64MCExpr::VK_ABS_G3 };
-    return isMovWSymbol(Variants);
-  }
-
-  bool isMovZSymbolG2() const {
-    static ARM64MCExpr::VariantKind Variants[] = { ARM64MCExpr::VK_ABS_G2,
-                                                   ARM64MCExpr::VK_TPREL_G2,
-                                                   ARM64MCExpr::VK_DTPREL_G2 };
-    return isMovWSymbol(Variants);
-  }
-
-  bool isMovZSymbolG1() const {
-    static ARM64MCExpr::VariantKind Variants[] = { ARM64MCExpr::VK_ABS_G1,
-                                                   ARM64MCExpr::VK_GOTTPREL_G1,
-                                                   ARM64MCExpr::VK_TPREL_G1,
-                                                   ARM64MCExpr::VK_DTPREL_G1, };
-    return isMovWSymbol(Variants);
-  }
-
-  bool isMovZSymbolG0() const {
-    static ARM64MCExpr::VariantKind Variants[] = { ARM64MCExpr::VK_ABS_G0,
-                                                   ARM64MCExpr::VK_TPREL_G0,
-                                                   ARM64MCExpr::VK_DTPREL_G0 };
-    return isMovWSymbol(Variants);
-  }
-
-  bool isMovKSymbolG2() const {
-    static ARM64MCExpr::VariantKind Variants[] = { ARM64MCExpr::VK_ABS_G2_NC };
-    return isMovWSymbol(Variants);
-  }
-
-  bool isMovKSymbolG1() const {
-    static ARM64MCExpr::VariantKind Variants[] = {
-      ARM64MCExpr::VK_ABS_G1_NC, ARM64MCExpr::VK_TPREL_G1_NC,
-      ARM64MCExpr::VK_DTPREL_G1_NC
-    };
-    return isMovWSymbol(Variants);
-  }
-
-  bool isMovKSymbolG0() const {
-    static ARM64MCExpr::VariantKind Variants[] = {
-      ARM64MCExpr::VK_ABS_G0_NC,   ARM64MCExpr::VK_GOTTPREL_G0_NC,
-      ARM64MCExpr::VK_TPREL_G0_NC, ARM64MCExpr::VK_DTPREL_G0_NC
-    };
-    return isMovWSymbol(Variants);
-  }
-
-  bool isFPImm() const { return Kind == k_FPImm; }
-  bool isBarrier() const { return Kind == k_Barrier; }
-  bool isSystemRegister() const {
-    if (Kind == k_SystemRegister)
-      return true;
-    // SPSel is legal for both the system register and the CPSR-field
-    // variants of MSR, so special case that. Fugly.
-    return (Kind == k_CPSRField && getCPSRField() == ARM64SYS::cpsr_SPSel);
-  }
-  bool isSystemCPSRField() const { return Kind == k_CPSRField; }
-  bool isReg() const { return Kind == k_Register && !Reg.isVector; }
-  bool isVectorReg() const { return Kind == k_Register && Reg.isVector; }
-
-  /// Is this a vector list with the type implicit (presumably attached to the
-  /// instruction itself)?
-  template <unsigned NumRegs> bool isImplicitlyTypedVectorList() const {
-    return Kind == k_VectorList && VectorList.Count == NumRegs &&
-           !VectorList.ElementKind;
-  }
-
-  template <unsigned NumRegs, unsigned NumElements, char ElementKind>
-  bool isTypedVectorList() const {
-    if (Kind != k_VectorList)
-      return false;
-    if (VectorList.Count != NumRegs)
-      return false;
-    if (VectorList.ElementKind != ElementKind)
-      return false;
-    return VectorList.NumElements == NumElements;
-  }
-
-  bool isVectorIndexB() const {
-    return Kind == k_VectorIndex && VectorIndex.Val < 16;
-  }
-  bool isVectorIndexH() const {
-    return Kind == k_VectorIndex && VectorIndex.Val < 8;
-  }
-  bool isVectorIndexS() const {
-    return Kind == k_VectorIndex && VectorIndex.Val < 4;
-  }
-  bool isVectorIndexD() const {
-    return Kind == k_VectorIndex && VectorIndex.Val < 2;
-  }
-  bool isToken() const { return Kind == k_Token; }
-  bool isTokenEqual(StringRef Str) const {
-    return Kind == k_Token && getToken() == Str;
-  }
-  bool isMem() const { return Kind == k_Memory; }
-  bool isSysCR() const { return Kind == k_SysCR; }
-  bool isPrefetch() const { return Kind == k_Prefetch; }
-  bool isShifter() const { return Kind == k_Shifter; }
-  bool isExtend() const {
-    // lsl is an alias for UXTX but will be a parsed as a k_Shifter operand.
-    if (isShifter()) {
-      ARM64_AM::ShiftType ST = ARM64_AM::getShiftType(Shifter.Val);
-      return ST == ARM64_AM::LSL;
-    }
-    return Kind == k_Extend;
-  }
-  bool isExtend64() const {
-    if (Kind != k_Extend)
-      return false;
-    // UXTX and SXTX require a 64-bit source register (the ExtendLSL64 class).
-    ARM64_AM::ExtendType ET = ARM64_AM::getArithExtendType(Extend.Val);
-    return ET != ARM64_AM::UXTX && ET != ARM64_AM::SXTX;
-  }
-  bool isExtendLSL64() const {
-    // lsl is an alias for UXTX but will be a parsed as a k_Shifter operand.
-    if (isShifter()) {
-      ARM64_AM::ShiftType ST = ARM64_AM::getShiftType(Shifter.Val);
-      return ST == ARM64_AM::LSL;
-    }
-    if (Kind != k_Extend)
-      return false;
-    ARM64_AM::ExtendType ET = ARM64_AM::getArithExtendType(Extend.Val);
-    return ET == ARM64_AM::UXTX || ET == ARM64_AM::SXTX;
-  }
-
-  bool isArithmeticShifter() const {
-    if (!isShifter())
-      return false;
-
-    // An arithmetic shifter is LSL, LSR, or ASR.
-    ARM64_AM::ShiftType ST = ARM64_AM::getShiftType(Shifter.Val);
-    return ST == ARM64_AM::LSL || ST == ARM64_AM::LSR || ST == ARM64_AM::ASR;
-  }
-
-  bool isMovImm32Shifter() const {
-    if (!isShifter())
-      return false;
-
-    // A MOVi shifter is LSL of 0, 16, 32, or 48.
-    ARM64_AM::ShiftType ST = ARM64_AM::getShiftType(Shifter.Val);
-    if (ST != ARM64_AM::LSL)
-      return false;
-    uint64_t Val = ARM64_AM::getShiftValue(Shifter.Val);
-    return (Val == 0 || Val == 16);
-  }
-
-  bool isMovImm64Shifter() const {
-    if (!isShifter())
-      return false;
-
-    // A MOVi shifter is LSL of 0 or 16.
-    ARM64_AM::ShiftType ST = ARM64_AM::getShiftType(Shifter.Val);
-    if (ST != ARM64_AM::LSL)
-      return false;
-    uint64_t Val = ARM64_AM::getShiftValue(Shifter.Val);
-    return (Val == 0 || Val == 16 || Val == 32 || Val == 48);
-  }
-
-  bool isAddSubShifter() const {
-    if (!isShifter())
-      return false;
-
-    // An ADD/SUB shifter is either 'lsl #0' or 'lsl #12'.
-    unsigned Val = Shifter.Val;
-    return ARM64_AM::getShiftType(Val) == ARM64_AM::LSL &&
-           (ARM64_AM::getShiftValue(Val) == 0 ||
-            ARM64_AM::getShiftValue(Val) == 12);
-  }
-
-  bool isLogicalVecShifter() const {
-    if (!isShifter())
-      return false;
-
-    // A logical vector shifter is a left shift by 0, 8, 16, or 24.
-    unsigned Val = Shifter.Val;
-    unsigned Shift = ARM64_AM::getShiftValue(Val);
-    return ARM64_AM::getShiftType(Val) == ARM64_AM::LSL &&
-           (Shift == 0 || Shift == 8 || Shift == 16 || Shift == 24);
-  }
-
-  bool isLogicalVecHalfWordShifter() const {
-    if (!isLogicalVecShifter())
-      return false;
-
-    // A logical vector shifter is a left shift by 0 or 8.
-    unsigned Val = Shifter.Val;
-    unsigned Shift = ARM64_AM::getShiftValue(Val);
-    return ARM64_AM::getShiftType(Val) == ARM64_AM::LSL &&
-           (Shift == 0 || Shift == 8);
-  }
-
-  bool isMoveVecShifter() const {
-    if (!isShifter())
-      return false;
-
-    // A logical vector shifter is a left shift by 8 or 16.
-    unsigned Val = Shifter.Val;
-    unsigned Shift = ARM64_AM::getShiftValue(Val);
-    return ARM64_AM::getShiftType(Val) == ARM64_AM::MSL &&
-           (Shift == 8 || Shift == 16);
-  }
-
-  bool isMemoryRegisterOffset8() const {
-    return isMem() && Mem.Mode == RegisterOffset && Mem.ShiftVal == 0;
-  }
-
-  bool isMemoryRegisterOffset16() const {
-    return isMem() && Mem.Mode == RegisterOffset &&
-           (Mem.ShiftVal == 0 || Mem.ShiftVal == 1);
-  }
-
-  bool isMemoryRegisterOffset32() const {
-    return isMem() && Mem.Mode == RegisterOffset &&
-           (Mem.ShiftVal == 0 || Mem.ShiftVal == 2);
-  }
-
-  bool isMemoryRegisterOffset64() const {
-    return isMem() && Mem.Mode == RegisterOffset &&
-           (Mem.ShiftVal == 0 || Mem.ShiftVal == 3);
-  }
-
-  bool isMemoryRegisterOffset128() const {
-    return isMem() && Mem.Mode == RegisterOffset &&
-           (Mem.ShiftVal == 0 || Mem.ShiftVal == 4);
-  }
-
-  bool isMemoryUnscaled() const {
-    if (!isMem())
-      return false;
-    if (Mem.Mode != ImmediateOffset)
-      return false;
-    if (!Mem.OffsetImm)
-      return true;
-    // Make sure the immediate value is valid.
-    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Mem.OffsetImm);
-    if (!CE)
-      return false;
-    // The offset must fit in a signed 9-bit unscaled immediate.
-    int64_t Value = CE->getValue();
-    return (Value >= -256 && Value < 256);
-  }
-  // Fallback unscaled operands are for aliases of LDR/STR that fall back
-  // to LDUR/STUR when the offset is not legal for the former but is for
-  // the latter. As such, in addition to checking for being a legal unscaled
-  // address, also check that it is not a legal scaled address. This avoids
-  // ambiguity in the matcher.
-  bool isMemoryUnscaledFB8() const {
-    return isMemoryUnscaled() && !isMemoryIndexed8();
-  }
-  bool isMemoryUnscaledFB16() const {
-    return isMemoryUnscaled() && !isMemoryIndexed16();
-  }
-  bool isMemoryUnscaledFB32() const {
-    return isMemoryUnscaled() && !isMemoryIndexed32();
-  }
-  bool isMemoryUnscaledFB64() const {
-    return isMemoryUnscaled() && !isMemoryIndexed64();
-  }
-  bool isMemoryUnscaledFB128() const {
-    return isMemoryUnscaled() && !isMemoryIndexed128();
-  }
-  bool isMemoryIndexed(unsigned Scale) const {
-    if (!isMem())
-      return false;
-    if (Mem.Mode != ImmediateOffset)
-      return false;
-    if (!Mem.OffsetImm)
-      return true;
-    // Make sure the immediate value is valid.
-    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Mem.OffsetImm);
-
-    if (CE) {
-      // The offset must be a positive multiple of the scale and in range of
-      // encoding with a 12-bit immediate.
-      int64_t Value = CE->getValue();
-      return (Value >= 0 && (Value % Scale) == 0 && Value <= (4095 * Scale));
-    }
-
-    // If it's not a constant, check for some expressions we know.
-    const MCExpr *Expr = Mem.OffsetImm;
-    ARM64MCExpr::VariantKind ELFRefKind;
-    MCSymbolRefExpr::VariantKind DarwinRefKind;
-    const MCConstantExpr *Addend;
-    if (!ARM64AsmParser::classifySymbolRef(Expr, ELFRefKind, DarwinRefKind,
-                                           Addend)) {
-      // If we don't understand the expression, assume the best and
-      // let the fixup and relocation code deal with it.
-      return true;
-    }
-
-    if (DarwinRefKind == MCSymbolRefExpr::VK_PAGEOFF ||
-        ELFRefKind == ARM64MCExpr::VK_LO12 ||
-        ELFRefKind == ARM64MCExpr::VK_GOT_LO12 ||
-        ELFRefKind == ARM64MCExpr::VK_DTPREL_LO12 ||
-        ELFRefKind == ARM64MCExpr::VK_DTPREL_LO12_NC ||
-        ELFRefKind == ARM64MCExpr::VK_TPREL_LO12 ||
-        ELFRefKind == ARM64MCExpr::VK_TPREL_LO12_NC ||
-        ELFRefKind == ARM64MCExpr::VK_GOTTPREL_LO12_NC ||
-        ELFRefKind == ARM64MCExpr::VK_TLSDESC_LO12) {
-      // Note that we don't range-check the addend. It's adjusted modulo page
-      // size when converted, so there is no "out of range" condition when using
-      // @pageoff.
-      int64_t Value = Addend ? Addend->getValue() : 0;
-      return Value >= 0 && (Value % Scale) == 0;
-    } else if (DarwinRefKind == MCSymbolRefExpr::VK_GOTPAGEOFF ||
-               DarwinRefKind == MCSymbolRefExpr::VK_TLVPPAGEOFF) {
-      // @gotpageoff/@tlvppageoff can only be used directly, not with an addend.
-      return Addend == 0;
-    }
-
-    return false;
-  }
-  bool isMemoryIndexed128() const { return isMemoryIndexed(16); }
-  bool isMemoryIndexed64() const { return isMemoryIndexed(8); }
-  bool isMemoryIndexed32() const { return isMemoryIndexed(4); }
-  bool isMemoryIndexed16() const { return isMemoryIndexed(2); }
-  bool isMemoryIndexed8() const { return isMemoryIndexed(1); }
-  bool isMemoryNoIndex() const {
-    if (!isMem())
-      return false;
-    if (Mem.Mode != ImmediateOffset)
-      return false;
-    if (!Mem.OffsetImm)
-      return true;
-
-    // Make sure the immediate value is valid. Only zero is allowed.
-    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Mem.OffsetImm);
-    if (!CE || CE->getValue() != 0)
-      return false;
-    return true;
-  }
-  bool isMemorySIMDNoIndex() const {
-    if (!isMem())
-      return false;
-    if (Mem.Mode != ImmediateOffset)
-      return false;
-    return Mem.OffsetImm == 0;
-  }
-  bool isMemoryIndexedSImm9() const {
-    if (!isMem() || Mem.Mode != ImmediateOffset)
-      return false;
-    if (!Mem.OffsetImm)
-      return true;
-    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Mem.OffsetImm);
-    assert(CE && "Non-constant pre-indexed offset!");
-    int64_t Value = CE->getValue();
-    return Value >= -256 && Value <= 255;
-  }
-  bool isMemoryIndexed32SImm7() const {
-    if (!isMem() || Mem.Mode != ImmediateOffset)
-      return false;
-    if (!Mem.OffsetImm)
-      return true;
-    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Mem.OffsetImm);
-    assert(CE && "Non-constant pre-indexed offset!");
-    int64_t Value = CE->getValue();
-    return ((Value % 4) == 0) && Value >= -256 && Value <= 252;
-  }
-  bool isMemoryIndexed64SImm7() const {
-    if (!isMem() || Mem.Mode != ImmediateOffset)
-      return false;
-    if (!Mem.OffsetImm)
-      return true;
-    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Mem.OffsetImm);
-    assert(CE && "Non-constant pre-indexed offset!");
-    int64_t Value = CE->getValue();
-    return ((Value % 8) == 0) && Value >= -512 && Value <= 504;
-  }
-  bool isMemoryIndexed128SImm7() const {
-    if (!isMem() || Mem.Mode != ImmediateOffset)
-      return false;
-    if (!Mem.OffsetImm)
-      return true;
-    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Mem.OffsetImm);
-    assert(CE && "Non-constant pre-indexed offset!");
-    int64_t Value = CE->getValue();
-    return ((Value % 16) == 0) && Value >= -1024 && Value <= 1008;
-  }
-
-  bool isAdrpLabel() const {
-    // Validation was handled during parsing, so we just sanity check that
-    // something didn't go haywire.
-    return isImm();
-  }
-
-  bool isAdrLabel() const {
-    // Validation was handled during parsing, so we just sanity check that
-    // something didn't go haywire.
-    return isImm();
-  }
-
-  void addExpr(MCInst &Inst, const MCExpr *Expr) const {
-    // Add as immediates when possible.  Null MCExpr = 0.
-    if (Expr == 0)
-      Inst.addOperand(MCOperand::CreateImm(0));
-    else if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr))
-      Inst.addOperand(MCOperand::CreateImm(CE->getValue()));
-    else
-      Inst.addOperand(MCOperand::CreateExpr(Expr));
-  }
-
-  void addRegOperands(MCInst &Inst, unsigned N) const {
-    assert(N == 1 && "Invalid number of operands!");
-    Inst.addOperand(MCOperand::CreateReg(getReg()));
-  }
-
-  void addVectorRegOperands(MCInst &Inst, unsigned N) const {
-    assert(N == 1 && "Invalid number of operands!");
-    Inst.addOperand(MCOperand::CreateReg(getReg()));
-  }
-
-  template <unsigned NumRegs>
-  void addVectorList64Operands(MCInst &Inst, unsigned N) const {
-    assert(N == 1 && "Invalid number of operands!");
-    static unsigned FirstRegs[] = { ARM64::D0,       ARM64::D0_D1,
-                                    ARM64::D0_D1_D2, ARM64::D0_D1_D2_D3 };
-    unsigned FirstReg = FirstRegs[NumRegs - 1];
-
-    Inst.addOperand(
-        MCOperand::CreateReg(FirstReg + getVectorListStart() - ARM64::Q0));
-  }
-
-  template <unsigned NumRegs>
-  void addVectorList128Operands(MCInst &Inst, unsigned N) const {
-    assert(N == 1 && "Invalid number of operands!");
-    static unsigned FirstRegs[] = { ARM64::Q0,       ARM64::Q0_Q1,
-                                    ARM64::Q0_Q1_Q2, ARM64::Q0_Q1_Q2_Q3 };
-    unsigned FirstReg = FirstRegs[NumRegs - 1];
-
-    Inst.addOperand(
-        MCOperand::CreateReg(FirstReg + getVectorListStart() - ARM64::Q0));
-  }
-
-  void addVectorIndexBOperands(MCInst &Inst, unsigned N) const {
-    assert(N == 1 && "Invalid number of operands!");
-    Inst.addOperand(MCOperand::CreateImm(getVectorIndex()));
-  }
-
-  void addVectorIndexHOperands(MCInst &Inst, unsigned N) const {
-    assert(N == 1 && "Invalid number of operands!");
-    Inst.addOperand(MCOperand::CreateImm(getVectorIndex()));
-  }
-
-  void addVectorIndexSOperands(MCInst &Inst, unsigned N) const {
-    assert(N == 1 && "Invalid number of operands!");
-    Inst.addOperand(MCOperand::CreateImm(getVectorIndex()));
-  }
-
-  void addVectorIndexDOperands(MCInst &Inst, unsigned N) const {
-    assert(N == 1 && "Invalid number of operands!");
-    Inst.addOperand(MCOperand::CreateImm(getVectorIndex()));
-  }
-
-  void addImmOperands(MCInst &Inst, unsigned N) const {
-    assert(N == 1 && "Invalid number of operands!");
-    // If this is a pageoff symrefexpr with an addend, adjust the addend
-    // to be only the page-offset portion. Otherwise, just add the expr
-    // as-is.
-    addExpr(Inst, getImm());
-  }
-
-  void addAdrpLabelOperands(MCInst &Inst, unsigned N) const {
-    addImmOperands(Inst, N);
-  }
-
-  void addAdrLabelOperands(MCInst &Inst, unsigned N) const {
-    addImmOperands(Inst, N);
-  }
-
-  void addSImm9Operands(MCInst &Inst, unsigned N) const {
-    assert(N == 1 && "Invalid number of operands!");
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    assert(MCE && "Invalid constant immediate operand!");
-    Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
-  }
-
-  void addSImm7s4Operands(MCInst &Inst, unsigned N) const {
-    assert(N == 1 && "Invalid number of operands!");
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    assert(MCE && "Invalid constant immediate operand!");
-    Inst.addOperand(MCOperand::CreateImm(MCE->getValue() / 4));
-  }
-
-  void addSImm7s8Operands(MCInst &Inst, unsigned N) const {
-    assert(N == 1 && "Invalid number of operands!");
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    assert(MCE && "Invalid constant immediate operand!");
-    Inst.addOperand(MCOperand::CreateImm(MCE->getValue() / 8));
-  }
-
-  void addSImm7s16Operands(MCInst &Inst, unsigned N) const {
-    assert(N == 1 && "Invalid number of operands!");
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    assert(MCE && "Invalid constant immediate operand!");
-    Inst.addOperand(MCOperand::CreateImm(MCE->getValue() / 16));
-  }
-
-  void addImm0_7Operands(MCInst &Inst, unsigned N) const {
-    assert(N == 1 && "Invalid number of operands!");
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    assert(MCE && "Invalid constant immediate operand!");
-    Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
-  }
-
-  void addImm1_8Operands(MCInst &Inst, unsigned N) const {
-    assert(N == 1 && "Invalid number of operands!");
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    assert(MCE && "Invalid constant immediate operand!");
-    Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
-  }
-
-  void addImm0_15Operands(MCInst &Inst, unsigned N) const {
-    assert(N == 1 && "Invalid number of operands!");
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    assert(MCE && "Invalid constant immediate operand!");
-    Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
-  }
-
-  void addImm1_16Operands(MCInst &Inst, unsigned N) const {
-    assert(N == 1 && "Invalid number of operands!");
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    assert(MCE && "Invalid constant immediate operand!");
-    Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
-  }
-
-  void addImm0_31Operands(MCInst &Inst, unsigned N) const {
-    assert(N == 1 && "Invalid number of operands!");
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    assert(MCE && "Invalid constant immediate operand!");
-    Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
-  }
-
-  void addImm1_31Operands(MCInst &Inst, unsigned N) const {
-    assert(N == 1 && "Invalid number of operands!");
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    assert(MCE && "Invalid constant immediate operand!");
-    Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
-  }
-
-  void addImm1_32Operands(MCInst &Inst, unsigned N) const {
-    assert(N == 1 && "Invalid number of operands!");
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    assert(MCE && "Invalid constant immediate operand!");
-    Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
-  }
-
-  void addImm0_63Operands(MCInst &Inst, unsigned N) const {
-    assert(N == 1 && "Invalid number of operands!");
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    assert(MCE && "Invalid constant immediate operand!");
-    Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
-  }
-
-  void addImm1_63Operands(MCInst &Inst, unsigned N) const {
-    assert(N == 1 && "Invalid number of operands!");
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    assert(MCE && "Invalid constant immediate operand!");
-    Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
-  }
-
-  void addImm1_64Operands(MCInst &Inst, unsigned N) const {
-    assert(N == 1 && "Invalid number of operands!");
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    assert(MCE && "Invalid constant immediate operand!");
-    Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
-  }
-
-  void addImm0_127Operands(MCInst &Inst, unsigned N) const {
-    assert(N == 1 && "Invalid number of operands!");
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    assert(MCE && "Invalid constant immediate operand!");
-    Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
-  }
-
-  void addImm0_255Operands(MCInst &Inst, unsigned N) const {
-    assert(N == 1 && "Invalid number of operands!");
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    assert(MCE && "Invalid constant immediate operand!");
-    Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
-  }
-
-  void addImm0_65535Operands(MCInst &Inst, unsigned N) const {
-    assert(N == 1 && "Invalid number of operands!");
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    assert(MCE && "Invalid constant immediate operand!");
-    Inst.addOperand(MCOperand::CreateImm(MCE->getValue()));
-  }
-
-  void addLogicalImm32Operands(MCInst &Inst, unsigned N) const {
-    assert(N == 1 && "Invalid number of operands!");
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    assert(MCE && "Invalid logical immediate operand!");
-    uint64_t encoding = ARM64_AM::encodeLogicalImmediate(MCE->getValue(), 32);
-    Inst.addOperand(MCOperand::CreateImm(encoding));
-  }
-
-  void addLogicalImm64Operands(MCInst &Inst, unsigned N) const {
-    assert(N == 1 && "Invalid number of operands!");
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    assert(MCE && "Invalid logical immediate operand!");
-    uint64_t encoding = ARM64_AM::encodeLogicalImmediate(MCE->getValue(), 64);
-    Inst.addOperand(MCOperand::CreateImm(encoding));
-  }
-
-  void addSIMDImmType10Operands(MCInst &Inst, unsigned N) const {
-    assert(N == 1 && "Invalid number of operands!");
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    assert(MCE && "Invalid immediate operand!");
-    uint64_t encoding = ARM64_AM::encodeAdvSIMDModImmType10(MCE->getValue());
-    Inst.addOperand(MCOperand::CreateImm(encoding));
-  }
-
-  void addBranchTarget26Operands(MCInst &Inst, unsigned N) const {
-    // Branch operands don't encode the low bits, so shift them off
-    // here. If it's a label, however, just put it on directly as there's
-    // not enough information now to do anything.
-    assert(N == 1 && "Invalid number of operands!");
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    if (!MCE) {
-      addExpr(Inst, getImm());
-      return;
-    }
-    assert(MCE && "Invalid constant immediate operand!");
-    Inst.addOperand(MCOperand::CreateImm(MCE->getValue() >> 2));
-  }
-
-  void addBranchTarget19Operands(MCInst &Inst, unsigned N) const {
-    // Branch operands don't encode the low bits, so shift them off
-    // here. If it's a label, however, just put it on directly as there's
-    // not enough information now to do anything.
-    assert(N == 1 && "Invalid number of operands!");
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    if (!MCE) {
-      addExpr(Inst, getImm());
-      return;
-    }
-    assert(MCE && "Invalid constant immediate operand!");
-    Inst.addOperand(MCOperand::CreateImm(MCE->getValue() >> 2));
-  }
-
-  void addBranchTarget14Operands(MCInst &Inst, unsigned N) const {
-    // Branch operands don't encode the low bits, so shift them off
-    // here. If it's a label, however, just put it on directly as there's
-    // not enough information now to do anything.
-    assert(N == 1 && "Invalid number of operands!");
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    if (!MCE) {
-      addExpr(Inst, getImm());
-      return;
-    }
-    assert(MCE && "Invalid constant immediate operand!");
-    Inst.addOperand(MCOperand::CreateImm(MCE->getValue() >> 2));
-  }
-
-  void addFPImmOperands(MCInst &Inst, unsigned N) const {
-    assert(N == 1 && "Invalid number of operands!");
-    Inst.addOperand(MCOperand::CreateImm(getFPImm()));
-  }
-
-  void addBarrierOperands(MCInst &Inst, unsigned N) const {
-    assert(N == 1 && "Invalid number of operands!");
-    Inst.addOperand(MCOperand::CreateImm(getBarrier()));
-  }
-
-  void addSystemRegisterOperands(MCInst &Inst, unsigned N) const {
-    assert(N == 1 && "Invalid number of operands!");
-    if (Kind == k_SystemRegister)
-      Inst.addOperand(MCOperand::CreateImm(getSystemRegister()));
-    else {
-      assert(Kind == k_CPSRField && getCPSRField() == ARM64SYS::cpsr_SPSel);
-      Inst.addOperand(MCOperand::CreateImm(ARM64SYS::SPSel));
-    }
-  }
-
-  void addSystemCPSRFieldOperands(MCInst &Inst, unsigned N) const {
-    assert(N == 1 && "Invalid number of operands!");
-    Inst.addOperand(MCOperand::CreateImm(getCPSRField()));
-  }
-
-  void addSysCROperands(MCInst &Inst, unsigned N) const {
-    assert(N == 1 && "Invalid number of operands!");
-    Inst.addOperand(MCOperand::CreateImm(getSysCR()));
-  }
-
-  void addPrefetchOperands(MCInst &Inst, unsigned N) const {
-    assert(N == 1 && "Invalid number of operands!");
-    Inst.addOperand(MCOperand::CreateImm(getPrefetch()));
-  }
-
-  void addShifterOperands(MCInst &Inst, unsigned N) const {
-    assert(N == 1 && "Invalid number of operands!");
-    Inst.addOperand(MCOperand::CreateImm(getShifter()));
-  }
-
-  void addArithmeticShifterOperands(MCInst &Inst, unsigned N) const {
-    assert(N == 1 && "Invalid number of operands!");
-    Inst.addOperand(MCOperand::CreateImm(getShifter()));
-  }
-
-  void addMovImm32ShifterOperands(MCInst &Inst, unsigned N) const {
-    assert(N == 1 && "Invalid number of operands!");
-    Inst.addOperand(MCOperand::CreateImm(getShifter()));
-  }
-
-  void addMovImm64ShifterOperands(MCInst &Inst, unsigned N) const {
-    assert(N == 1 && "Invalid number of operands!");
-    Inst.addOperand(MCOperand::CreateImm(getShifter()));
-  }
-
-  void addAddSubShifterOperands(MCInst &Inst, unsigned N) const {
-    assert(N == 1 && "Invalid number of operands!");
-    Inst.addOperand(MCOperand::CreateImm(getShifter()));
-  }
-
-  void addLogicalVecShifterOperands(MCInst &Inst, unsigned N) const {
-    assert(N == 1 && "Invalid number of operands!");
-    Inst.addOperand(MCOperand::CreateImm(getShifter()));
-  }
-
-  void addLogicalVecHalfWordShifterOperands(MCInst &Inst, unsigned N) const {
-    assert(N == 1 && "Invalid number of operands!");
-    Inst.addOperand(MCOperand::CreateImm(getShifter()));
-  }
-
-  void addMoveVecShifterOperands(MCInst &Inst, unsigned N) const {
-    assert(N == 1 && "Invalid number of operands!");
-    Inst.addOperand(MCOperand::CreateImm(getShifter()));
-  }
-
-  void addExtendOperands(MCInst &Inst, unsigned N) const {
-    assert(N == 1 && "Invalid number of operands!");
-    // lsl is an alias for UXTX but will be a parsed as a k_Shifter operand.
-    if (isShifter()) {
-      assert(ARM64_AM::getShiftType(getShifter()) == ARM64_AM::LSL);
-      unsigned imm = getArithExtendImm(ARM64_AM::UXTX,
-                                       ARM64_AM::getShiftValue(getShifter()));
-      Inst.addOperand(MCOperand::CreateImm(imm));
-    } else
-      Inst.addOperand(MCOperand::CreateImm(getExtend()));
-  }
-
-  void addExtend64Operands(MCInst &Inst, unsigned N) const {
-    assert(N == 1 && "Invalid number of operands!");
-    Inst.addOperand(MCOperand::CreateImm(getExtend()));
-  }
-
-  void addExtendLSL64Operands(MCInst &Inst, unsigned N) const {
-    assert(N == 1 && "Invalid number of operands!");
-    // lsl is an alias for UXTX but will be a parsed as a k_Shifter operand.
-    if (isShifter()) {
-      assert(ARM64_AM::getShiftType(getShifter()) == ARM64_AM::LSL);
-      unsigned imm = getArithExtendImm(ARM64_AM::UXTX,
-                                       ARM64_AM::getShiftValue(getShifter()));
-      Inst.addOperand(MCOperand::CreateImm(imm));
-    } else
-      Inst.addOperand(MCOperand::CreateImm(getExtend()));
-  }
-
-  void addMemoryRegisterOffsetOperands(MCInst &Inst, unsigned N, bool DoShift) {
-    assert(N == 3 && "Invalid number of operands!");
-
-    Inst.addOperand(MCOperand::CreateReg(Mem.BaseRegNum));
-    Inst.addOperand(MCOperand::CreateReg(Mem.OffsetRegNum));
-    unsigned ExtendImm = ARM64_AM::getMemExtendImm(Mem.ExtType, DoShift);
-    Inst.addOperand(MCOperand::CreateImm(ExtendImm));
-  }
-
-  void addMemoryRegisterOffset8Operands(MCInst &Inst, unsigned N) {
-    addMemoryRegisterOffsetOperands(Inst, N, Mem.ExplicitShift);
-  }
-
-  void addMemoryRegisterOffset16Operands(MCInst &Inst, unsigned N) {
-    addMemoryRegisterOffsetOperands(Inst, N, Mem.ShiftVal == 1);
-  }
-
-  void addMemoryRegisterOffset32Operands(MCInst &Inst, unsigned N) {
-    addMemoryRegisterOffsetOperands(Inst, N, Mem.ShiftVal == 2);
-  }
-
-  void addMemoryRegisterOffset64Operands(MCInst &Inst, unsigned N) {
-    addMemoryRegisterOffsetOperands(Inst, N, Mem.ShiftVal == 3);
-  }
-
-  void addMemoryRegisterOffset128Operands(MCInst &Inst, unsigned N) {
-    addMemoryRegisterOffsetOperands(Inst, N, Mem.ShiftVal == 4);
-  }
-
-  void addMemoryIndexedOperands(MCInst &Inst, unsigned N,
-                                unsigned Scale) const {
-    // Add the base register operand.
-    Inst.addOperand(MCOperand::CreateReg(Mem.BaseRegNum));
-
-    if (!Mem.OffsetImm) {
-      // There isn't an offset.
-      Inst.addOperand(MCOperand::CreateImm(0));
-      return;
-    }
-
-    // Add the offset operand.
-    if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Mem.OffsetImm)) {
-      assert(CE->getValue() % Scale == 0 &&
-             "Offset operand must be multiple of the scale!");
-
-      // The MCInst offset operand doesn't include the low bits (like the
-      // instruction encoding).
-      Inst.addOperand(MCOperand::CreateImm(CE->getValue() / Scale));
-    }
-
-    // If this is a pageoff symrefexpr with an addend, the linker will
-    // do the scaling of the addend.
-    //
-    // Otherwise we don't know what this is, so just add the scaling divide to
-    // the expression and let the MC fixup evaluation code deal with it.
-    const MCExpr *Expr = Mem.OffsetImm;
-    ARM64MCExpr::VariantKind ELFRefKind;
-    MCSymbolRefExpr::VariantKind DarwinRefKind;
-    const MCConstantExpr *Addend;
-    if (Scale > 1 &&
-        (!ARM64AsmParser::classifySymbolRef(Expr, ELFRefKind, DarwinRefKind,
-                                            Addend) ||
-         (Addend != 0 && DarwinRefKind != MCSymbolRefExpr::VK_PAGEOFF))) {
-      Expr = MCBinaryExpr::CreateDiv(Expr, MCConstantExpr::Create(Scale, Ctx),
-                                     Ctx);
-    }
-
-    Inst.addOperand(MCOperand::CreateExpr(Expr));
-  }
-
-  void addMemoryUnscaledOperands(MCInst &Inst, unsigned N) const {
-    assert(N == 2 && isMemoryUnscaled() && "Invalid number of operands!");
-    // Add the base register operand.
-    Inst.addOperand(MCOperand::CreateReg(Mem.BaseRegNum));
-
-    // Add the offset operand.
-    if (!Mem.OffsetImm)
-      Inst.addOperand(MCOperand::CreateImm(0));
-    else {
-      // Only constant offsets supported.
-      const MCConstantExpr *CE = cast<MCConstantExpr>(Mem.OffsetImm);
-      Inst.addOperand(MCOperand::CreateImm(CE->getValue()));
-    }
-  }
-
-  void addMemoryIndexed128Operands(MCInst &Inst, unsigned N) const {
-    assert(N == 2 && isMemoryIndexed128() && "Invalid number of operands!");
-    addMemoryIndexedOperands(Inst, N, 16);
-  }
-
-  void addMemoryIndexed64Operands(MCInst &Inst, unsigned N) const {
-    assert(N == 2 && isMemoryIndexed64() && "Invalid number of operands!");
-    addMemoryIndexedOperands(Inst, N, 8);
-  }
-
-  void addMemoryIndexed32Operands(MCInst &Inst, unsigned N) const {
-    assert(N == 2 && isMemoryIndexed32() && "Invalid number of operands!");
-    addMemoryIndexedOperands(Inst, N, 4);
-  }
-
-  void addMemoryIndexed16Operands(MCInst &Inst, unsigned N) const {
-    assert(N == 2 && isMemoryIndexed16() && "Invalid number of operands!");
-    addMemoryIndexedOperands(Inst, N, 2);
-  }
-
-  void addMemoryIndexed8Operands(MCInst &Inst, unsigned N) const {
-    assert(N == 2 && isMemoryIndexed8() && "Invalid number of operands!");
-    addMemoryIndexedOperands(Inst, N, 1);
-  }
-
-  void addMemoryNoIndexOperands(MCInst &Inst, unsigned N) const {
-    assert(N == 1 && isMemoryNoIndex() && "Invalid number of operands!");
-    // Add the base register operand (the offset is always zero, so ignore it).
-    Inst.addOperand(MCOperand::CreateReg(Mem.BaseRegNum));
-  }
-
-  void addMemorySIMDNoIndexOperands(MCInst &Inst, unsigned N) const {
-    assert(N == 1 && isMemorySIMDNoIndex() && "Invalid number of operands!");
-    // Add the base register operand (the offset is always zero, so ignore it).
-    Inst.addOperand(MCOperand::CreateReg(Mem.BaseRegNum));
-  }
-
-  void addMemoryWritebackIndexedOperands(MCInst &Inst, unsigned N,
-                                         unsigned Scale) const {
-    assert(N == 2 && "Invalid number of operands!");
-
-    // Add the base register operand.
-    Inst.addOperand(MCOperand::CreateReg(Mem.BaseRegNum));
-
-    // Add the offset operand.
-    int64_t Offset = 0;
-    if (Mem.OffsetImm) {
-      const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Mem.OffsetImm);
-      assert(CE && "Non-constant indexed offset operand!");
-      Offset = CE->getValue();
-    }
-
-    if (Scale != 1) {
-      assert(Offset % Scale == 0 &&
-             "Offset operand must be a multiple of the scale!");
-      Offset /= Scale;
-    }
-
-    Inst.addOperand(MCOperand::CreateImm(Offset));
-  }
-
-  void addMemoryIndexedSImm9Operands(MCInst &Inst, unsigned N) const {
-    addMemoryWritebackIndexedOperands(Inst, N, 1);
-  }
-
-  void addMemoryIndexed32SImm7Operands(MCInst &Inst, unsigned N) const {
-    addMemoryWritebackIndexedOperands(Inst, N, 4);
-  }
-
-  void addMemoryIndexed64SImm7Operands(MCInst &Inst, unsigned N) const {
-    addMemoryWritebackIndexedOperands(Inst, N, 8);
-  }
-
-  void addMemoryIndexed128SImm7Operands(MCInst &Inst, unsigned N) const {
-    addMemoryWritebackIndexedOperands(Inst, N, 16);
-  }
-
-  virtual void print(raw_ostream &OS) const;
-
-  static ARM64Operand *CreateToken(StringRef Str, bool IsSuffix, SMLoc S,
-                                   MCContext &Ctx) {
-    ARM64Operand *Op = new ARM64Operand(k_Token, Ctx);
-    Op->Tok.Data = Str.data();
-    Op->Tok.Length = Str.size();
-    Op->Tok.IsSuffix = IsSuffix;
-    Op->StartLoc = S;
-    Op->EndLoc = S;
-    return Op;
-  }
-
-  static ARM64Operand *CreateReg(unsigned RegNum, bool isVector, SMLoc S,
-                                 SMLoc E, MCContext &Ctx) {
-    ARM64Operand *Op = new ARM64Operand(k_Register, Ctx);
-    Op->Reg.RegNum = RegNum;
-    Op->Reg.isVector = isVector;
-    Op->StartLoc = S;
-    Op->EndLoc = E;
-    return Op;
-  }
-
-  static ARM64Operand *CreateVectorList(unsigned RegNum, unsigned Count,
-                                        unsigned NumElements, char ElementKind,
-                                        SMLoc S, SMLoc E, MCContext &Ctx) {
-    ARM64Operand *Op = new ARM64Operand(k_VectorList, Ctx);
-    Op->VectorList.RegNum = RegNum;
-    Op->VectorList.Count = Count;
-    Op->VectorList.NumElements = NumElements;
-    Op->VectorList.ElementKind = ElementKind;
-    Op->StartLoc = S;
-    Op->EndLoc = E;
-    return Op;
-  }
-
-  static ARM64Operand *CreateVectorIndex(unsigned Idx, SMLoc S, SMLoc E,
-                                         MCContext &Ctx) {
-    ARM64Operand *Op = new ARM64Operand(k_VectorIndex, Ctx);
-    Op->VectorIndex.Val = Idx;
-    Op->StartLoc = S;
-    Op->EndLoc = E;
-    return Op;
-  }
-
-  static ARM64Operand *CreateImm(const MCExpr *Val, SMLoc S, SMLoc E,
-                                 MCContext &Ctx) {
-    ARM64Operand *Op = new ARM64Operand(k_Immediate, Ctx);
-    Op->Imm.Val = Val;
-    Op->StartLoc = S;
-    Op->EndLoc = E;
-    return Op;
-  }
-
-  static ARM64Operand *CreateFPImm(unsigned Val, SMLoc S, MCContext &Ctx) {
-    ARM64Operand *Op = new ARM64Operand(k_FPImm, Ctx);
-    Op->FPImm.Val = Val;
-    Op->StartLoc = S;
-    Op->EndLoc = S;
-    return Op;
-  }
-
-  static ARM64Operand *CreateBarrier(unsigned Val, SMLoc S, MCContext &Ctx) {
-    ARM64Operand *Op = new ARM64Operand(k_Barrier, Ctx);
-    Op->Barrier.Val = Val;
-    Op->StartLoc = S;
-    Op->EndLoc = S;
-    return Op;
-  }
-
-  static ARM64Operand *CreateSystemRegister(uint16_t Val, SMLoc S,
-                                            MCContext &Ctx) {
-    ARM64Operand *Op = new ARM64Operand(k_SystemRegister, Ctx);
-    Op->SystemRegister.Val = Val;
-    Op->StartLoc = S;
-    Op->EndLoc = S;
-    return Op;
-  }
-
-  static ARM64Operand *CreateCPSRField(ARM64SYS::CPSRField Field, SMLoc S,
-                                       MCContext &Ctx) {
-    ARM64Operand *Op = new ARM64Operand(k_CPSRField, Ctx);
-    Op->CPSRField.Field = Field;
-    Op->StartLoc = S;
-    Op->EndLoc = S;
-    return Op;
-  }
-
-  static ARM64Operand *CreateMem(unsigned BaseRegNum, const MCExpr *Off,
-                                 SMLoc S, SMLoc E, SMLoc OffsetLoc,
-                                 MCContext &Ctx) {
-    ARM64Operand *Op = new ARM64Operand(k_Memory, Ctx);
-    Op->Mem.BaseRegNum = BaseRegNum;
-    Op->Mem.OffsetRegNum = 0;
-    Op->Mem.OffsetImm = Off;
-    Op->Mem.ExtType = ARM64_AM::UXTX;
-    Op->Mem.ShiftVal = 0;
-    Op->Mem.ExplicitShift = false;
-    Op->Mem.Mode = ImmediateOffset;
-    Op->OffsetLoc = OffsetLoc;
-    Op->StartLoc = S;
-    Op->EndLoc = E;
-    return Op;
-  }
-
-  static ARM64Operand *CreateRegOffsetMem(unsigned BaseReg, unsigned OffsetReg,
-                                          ARM64_AM::ExtendType ExtType,
-                                          unsigned ShiftVal, bool ExplicitShift,
-                                          SMLoc S, SMLoc E, MCContext &Ctx) {
-    ARM64Operand *Op = new ARM64Operand(k_Memory, Ctx);
-    Op->Mem.BaseRegNum = BaseReg;
-    Op->Mem.OffsetRegNum = OffsetReg;
-    Op->Mem.OffsetImm = 0;
-    Op->Mem.ExtType = ExtType;
-    Op->Mem.ShiftVal = ShiftVal;
-    Op->Mem.ExplicitShift = ExplicitShift;
-    Op->Mem.Mode = RegisterOffset;
-    Op->StartLoc = S;
-    Op->EndLoc = E;
-    return Op;
-  }
-
-  static ARM64Operand *CreateSysCR(unsigned Val, SMLoc S, SMLoc E,
-                                   MCContext &Ctx) {
-    ARM64Operand *Op = new ARM64Operand(k_SysCR, Ctx);
-    Op->SysCRImm.Val = Val;
-    Op->StartLoc = S;
-    Op->EndLoc = E;
-    return Op;
-  }
-
-  static ARM64Operand *CreatePrefetch(unsigned Val, SMLoc S, MCContext &Ctx) {
-    ARM64Operand *Op = new ARM64Operand(k_Prefetch, Ctx);
-    Op->Prefetch.Val = Val;
-    Op->StartLoc = S;
-    Op->EndLoc = S;
-    return Op;
-  }
-
-  static ARM64Operand *CreateShifter(ARM64_AM::ShiftType ShOp, unsigned Val,
-                                     SMLoc S, SMLoc E, MCContext &Ctx) {
-    ARM64Operand *Op = new ARM64Operand(k_Shifter, Ctx);
-    Op->Shifter.Val = ARM64_AM::getShifterImm(ShOp, Val);
-    Op->StartLoc = S;
-    Op->EndLoc = E;
-    return Op;
-  }
-
-  static ARM64Operand *CreateExtend(ARM64_AM::ExtendType ExtOp, unsigned Val,
-                                    SMLoc S, SMLoc E, MCContext &Ctx) {
-    ARM64Operand *Op = new ARM64Operand(k_Extend, Ctx);
-    Op->Extend.Val = ARM64_AM::getArithExtendImm(ExtOp, Val);
-    Op->StartLoc = S;
-    Op->EndLoc = E;
-    return Op;
-  }
-};
-
-} // end anonymous namespace.
-
-void ARM64Operand::print(raw_ostream &OS) const {
-  switch (Kind) {
-  case k_FPImm:
-    OS << "<fpimm " << getFPImm() << "(" << ARM64_AM::getFPImmFloat(getFPImm())
-       << ") >";
-    break;
-  case k_Barrier: {
-    const char *Name =
-        ARM64SYS::getBarrierOptName((ARM64SYS::BarrierOption)getBarrier());
-    OS << "<barrier ";
-    if (Name)
-      OS << Name;
-    else
-      OS << getBarrier();
-    OS << ">";
-    break;
-  }
-  case k_SystemRegister: {
-    const char *Name = ARM64SYS::getSystemRegisterName(
-        (ARM64SYS::SystemRegister)getSystemRegister());
-    OS << "<systemreg ";
-    if (Name)
-      OS << Name;
-    else
-      OS << "#" << getSystemRegister();
-    OS << ">";
-    break;
-  }
-  case k_CPSRField: {
-    const char *Name = ARM64SYS::getCPSRFieldName(getCPSRField());
-    OS << "<cpsrfield " << Name << ">";
-    break;
-  }
-  case k_Immediate:
-    getImm()->print(OS);
-    break;
-  case k_Memory:
-    OS << "<memory>";
-    break;
-  case k_Register:
-    OS << "<register " << getReg() << ">";
-    break;
-  case k_VectorList: {
-    OS << "<vectorlist ";
-    unsigned Reg = getVectorListStart();
-    for (unsigned i = 0, e = getVectorListCount(); i != e; ++i)
-      OS << Reg + i << " ";
-    OS << ">";
-    break;
-  }
-  case k_VectorIndex:
-    OS << "<vectorindex " << getVectorIndex() << ">";
-    break;
-  case k_Token:
-    OS << "'" << getToken() << "'";
-    break;
-  case k_SysCR:
-    OS << "c" << getSysCR();
-    break;
-  case k_Prefetch:
-    OS << "<prfop ";
-    if (ARM64_AM::isNamedPrefetchOp(getPrefetch()))
-      OS << ARM64_AM::getPrefetchOpName((ARM64_AM::PrefetchOp)getPrefetch());
-    else
-      OS << "#" << getPrefetch();
-    OS << ">";
-    break;
-  case k_Shifter: {
-    unsigned Val = getShifter();
-    OS << "<" << ARM64_AM::getShiftName(ARM64_AM::getShiftType(Val)) << " #"
-       << ARM64_AM::getShiftValue(Val) << ">";
-    break;
-  }
-  case k_Extend: {
-    unsigned Val = getExtend();
-    OS << "<" << ARM64_AM::getExtendName(ARM64_AM::getArithExtendType(Val))
-       << " #" << ARM64_AM::getArithShiftValue(Val) << ">";
-    break;
-  }
-  }
-}
-
-/// @name Auto-generated Match Functions
-/// {
-
-static unsigned MatchRegisterName(StringRef Name);
-
-/// }
-
-static unsigned matchVectorRegName(StringRef Name) {
-  return StringSwitch<unsigned>(Name)
-      .Case("v0", ARM64::Q0)
-      .Case("v1", ARM64::Q1)
-      .Case("v2", ARM64::Q2)
-      .Case("v3", ARM64::Q3)
-      .Case("v4", ARM64::Q4)
-      .Case("v5", ARM64::Q5)
-      .Case("v6", ARM64::Q6)
-      .Case("v7", ARM64::Q7)
-      .Case("v8", ARM64::Q8)
-      .Case("v9", ARM64::Q9)
-      .Case("v10", ARM64::Q10)
-      .Case("v11", ARM64::Q11)
-      .Case("v12", ARM64::Q12)
-      .Case("v13", ARM64::Q13)
-      .Case("v14", ARM64::Q14)
-      .Case("v15", ARM64::Q15)
-      .Case("v16", ARM64::Q16)
-      .Case("v17", ARM64::Q17)
-      .Case("v18", ARM64::Q18)
-      .Case("v19", ARM64::Q19)
-      .Case("v20", ARM64::Q20)
-      .Case("v21", ARM64::Q21)
-      .Case("v22", ARM64::Q22)
-      .Case("v23", ARM64::Q23)
-      .Case("v24", ARM64::Q24)
-      .Case("v25", ARM64::Q25)
-      .Case("v26", ARM64::Q26)
-      .Case("v27", ARM64::Q27)
-      .Case("v28", ARM64::Q28)
-      .Case("v29", ARM64::Q29)
-      .Case("v30", ARM64::Q30)
-      .Case("v31", ARM64::Q31)
-      .Default(0);
-}
-
-static bool isValidVectorKind(StringRef Name) {
-  return StringSwitch<bool>(Name.lower())
-      .Case(".8b", true)
-      .Case(".16b", true)
-      .Case(".4h", true)
-      .Case(".8h", true)
-      .Case(".2s", true)
-      .Case(".4s", true)
-      .Case(".1d", true)
-      .Case(".2d", true)
-      .Case(".1q", true)
-      // Accept the width neutral ones, too, for verbose syntax. If those
-      // aren't used in the right places, the token operand won't match so
-      // all will work out.
-      .Case(".b", true)
-      .Case(".h", true)
-      .Case(".s", true)
-      .Case(".d", true)
-      .Default(false);
-}
-
-static void parseValidVectorKind(StringRef Name, unsigned &NumElements,
-                                 char &ElementKind) {
-  assert(isValidVectorKind(Name));
-
-  ElementKind = Name.lower()[Name.size() - 1];
-  NumElements = 0;
-
-  if (Name.size() == 2)
-    return;
-
-  // Parse the lane count
-  Name = Name.drop_front();
-  while (isdigit(Name.front())) {
-    NumElements = 10 * NumElements + (Name.front() - '0');
-    Name = Name.drop_front();
-  }
-}
-
-bool ARM64AsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
-                                   SMLoc &EndLoc) {
-  StartLoc = getLoc();
-  RegNo = tryParseRegister();
-  EndLoc = SMLoc::getFromPointer(getLoc().getPointer() - 1);
-  return (RegNo == (unsigned)-1);
-}
-
-/// tryParseRegister - Try to parse a register name. The token must be an
-/// Identifier when called, and if it is a register name the token is eaten and
-/// the register is added to the operand list.
-int ARM64AsmParser::tryParseRegister() {
-  const AsmToken &Tok = Parser.getTok();
-  assert(Tok.is(AsmToken::Identifier) && "Token is not an Identifier");
-
-  std::string lowerCase = Tok.getString().lower();
-  unsigned RegNum = MatchRegisterName(lowerCase);
-  // Also handle a few aliases of registers.
-  if (RegNum == 0)
-    RegNum = StringSwitch<unsigned>(lowerCase)
-                 .Case("x29", ARM64::FP)
-                 .Case("x30", ARM64::LR)
-                 .Case("x31", ARM64::XZR)
-                 .Case("w31", ARM64::WZR)
-                 .Default(0);
-
-  if (RegNum == 0)
-    return -1;
-
-  Parser.Lex(); // Eat identifier token.
-  return RegNum;
-}
-
-/// tryMatchVectorRegister - Try to parse a vector register name with optional
-/// kind specifier. If it is a register specifier, eat the token and return it.
-int ARM64AsmParser::tryMatchVectorRegister(StringRef &Kind) {
-  if (Parser.getTok().isNot(AsmToken::Identifier)) {
-    TokError("vector register expected");
-    return -1;
-  }
-
-  StringRef Name = Parser.getTok().getString();
-  // If there is a kind specifier, it's separated from the register name by
-  // a '.'.
-  size_t Start = 0, Next = Name.find('.');
-  StringRef Head = Name.slice(Start, Next);
-  unsigned RegNum = matchVectorRegName(Head);
-  if (RegNum) {
-    if (Next != StringRef::npos) {
-      Kind = Name.slice(Next, StringRef::npos);
-      if (!isValidVectorKind(Kind)) {
-        TokError("invalid vector kind qualifier");
-        return -1;
-      }
-    }
-    Parser.Lex(); // Eat the register token.
-    return RegNum;
-  }
-  return -1;
-}
-
-static int MatchSysCRName(StringRef Name) {
-  // Use the same layout as the tablegen'erated register name matcher. Ugly,
-  // but efficient.
-  switch (Name.size()) {
-  default:
-    break;
-  case 2:
-    if (Name[0] != 'c' && Name[0] != 'C')
-      return -1;
-    switch (Name[1]) {
-    default:
-      return -1;
-    case '0':
-      return 0;
-    case '1':
-      return 1;
-    case '2':
-      return 2;
-    case '3':
-      return 3;
-    case '4':
-      return 4;
-    case '5':
-      return 5;
-    case '6':
-      return 6;
-    case '7':
-      return 7;
-    case '8':
-      return 8;
-    case '9':
-      return 9;
-    }
-    break;
-  case 3:
-    if ((Name[0] != 'c' && Name[0] != 'C') || Name[1] != '1')
-      return -1;
-    switch (Name[2]) {
-    default:
-      return -1;
-    case '0':
-      return 10;
-    case '1':
-      return 11;
-    case '2':
-      return 12;
-    case '3':
-      return 13;
-    case '4':
-      return 14;
-    case '5':
-      return 15;
-    }
-    break;
-  }
-
-  llvm_unreachable("Unhandled SysCR operand string!");
-  return -1;
-}
-
-/// tryParseSysCROperand - Try to parse a system instruction CR operand name.
-ARM64AsmParser::OperandMatchResultTy
-ARM64AsmParser::tryParseSysCROperand(OperandVector &Operands) {
-  SMLoc S = getLoc();
-  const AsmToken &Tok = Parser.getTok();
-  if (Tok.isNot(AsmToken::Identifier))
-    return MatchOperand_NoMatch;
-
-  int Num = MatchSysCRName(Tok.getString());
-  if (Num == -1)
-    return MatchOperand_NoMatch;
-
-  Parser.Lex(); // Eat identifier token.
-  Operands.push_back(ARM64Operand::CreateSysCR(Num, S, getLoc(), getContext()));
-  return MatchOperand_Success;
-}
-
-/// tryParsePrefetch - Try to parse a prefetch operand.
-ARM64AsmParser::OperandMatchResultTy
-ARM64AsmParser::tryParsePrefetch(OperandVector &Operands) {
-  SMLoc S = getLoc();
-  const AsmToken &Tok = Parser.getTok();
-  // Either an identifier for named values or a 5-bit immediate.
-  if (Tok.is(AsmToken::Hash)) {
-    Parser.Lex(); // Eat hash token.
-    const MCExpr *ImmVal;
-    if (getParser().parseExpression(ImmVal))
-      return MatchOperand_ParseFail;
-
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(ImmVal);
-    if (!MCE) {
-      TokError("immediate value expected for prefetch operand");
-      return MatchOperand_ParseFail;
-    }
-    unsigned prfop = MCE->getValue();
-    if (prfop > 31) {
-      TokError("prefetch operand out of range, [0,31] expected");
-      return MatchOperand_ParseFail;
-    }
-
-    Operands.push_back(ARM64Operand::CreatePrefetch(prfop, S, getContext()));
-    return MatchOperand_Success;
-  }
-
-  if (Tok.isNot(AsmToken::Identifier)) {
-    TokError("pre-fetch hint expected");
-    return MatchOperand_ParseFail;
-  }
-
-  unsigned prfop = StringSwitch<unsigned>(Tok.getString())
-                       .Case("pldl1keep", ARM64_AM::PLDL1KEEP)
-                       .Case("pldl1strm", ARM64_AM::PLDL1STRM)
-                       .Case("pldl2keep", ARM64_AM::PLDL2KEEP)
-                       .Case("pldl2strm", ARM64_AM::PLDL2STRM)
-                       .Case("pldl3keep", ARM64_AM::PLDL3KEEP)
-                       .Case("pldl3strm", ARM64_AM::PLDL3STRM)
-                       .Case("pstl1keep", ARM64_AM::PSTL1KEEP)
-                       .Case("pstl1strm", ARM64_AM::PSTL1STRM)
-                       .Case("pstl2keep", ARM64_AM::PSTL2KEEP)
-                       .Case("pstl2strm", ARM64_AM::PSTL2STRM)
-                       .Case("pstl3keep", ARM64_AM::PSTL3KEEP)
-                       .Case("pstl3strm", ARM64_AM::PSTL3STRM)
-                       .Default(0xff);
-  if (prfop == 0xff) {
-    TokError("pre-fetch hint expected");
-    return MatchOperand_ParseFail;
-  }
-
-  Parser.Lex(); // Eat identifier token.
-  Operands.push_back(ARM64Operand::CreatePrefetch(prfop, S, getContext()));
-  return MatchOperand_Success;
-}
-
-/// tryParseAdrpLabel - Parse and validate a source label for the ADRP
-/// instruction.
-ARM64AsmParser::OperandMatchResultTy
-ARM64AsmParser::tryParseAdrpLabel(OperandVector &Operands) {
-  SMLoc S = getLoc();
-  const MCExpr *Expr;
-  if (parseSymbolicImmVal(Expr))
-    return MatchOperand_ParseFail;
-
-  ARM64MCExpr::VariantKind ELFRefKind;
-  MCSymbolRefExpr::VariantKind DarwinRefKind;
-  const MCConstantExpr *Addend;
-  if (!classifySymbolRef(Expr, ELFRefKind, DarwinRefKind, Addend)) {
-    Error(S, "modified label reference + constant expected");
-    return MatchOperand_ParseFail;
-  }
-
-  if (DarwinRefKind == MCSymbolRefExpr::VK_None &&
-      ELFRefKind == ARM64MCExpr::VK_INVALID) {
-    // No modifier was specified at all; this is the syntax for an ELF basic
-    // ADRP relocation (unfortunately).
-    Expr = ARM64MCExpr::Create(Expr, ARM64MCExpr::VK_ABS_PAGE, getContext());
-  } else if ((DarwinRefKind == MCSymbolRefExpr::VK_GOTPAGE ||
-              DarwinRefKind == MCSymbolRefExpr::VK_TLVPPAGE) &&
-             Addend != 0) {
-    Error(S, "gotpage label reference not allowed an addend");
-    return MatchOperand_ParseFail;
-  } else if (DarwinRefKind != MCSymbolRefExpr::VK_PAGE &&
-             DarwinRefKind != MCSymbolRefExpr::VK_GOTPAGE &&
-             DarwinRefKind != MCSymbolRefExpr::VK_TLVPPAGE &&
-             ELFRefKind != ARM64MCExpr::VK_GOT_PAGE &&
-             ELFRefKind != ARM64MCExpr::VK_GOTTPREL_PAGE &&
-             ELFRefKind != ARM64MCExpr::VK_TLSDESC_PAGE) {
-    // The operand must be an @page or @gotpage qualified symbolref.
-    Error(S, "page or gotpage label reference expected");
-    return MatchOperand_ParseFail;
-  }
-
-  // We have a label reference possibly with addend. The addend is a raw value
-  // here. The linker will adjust it to only reference the page.
-  SMLoc E = SMLoc::getFromPointer(getLoc().getPointer() - 1);
-  Operands.push_back(ARM64Operand::CreateImm(Expr, S, E, getContext()));
-
-  return MatchOperand_Success;
-}
-
-/// tryParseAdrLabel - Parse and validate a source label for the ADR
-/// instruction.
-ARM64AsmParser::OperandMatchResultTy
-ARM64AsmParser::tryParseAdrLabel(OperandVector &Operands) {
-  SMLoc S = getLoc();
-  const MCExpr *Expr;
-  if (getParser().parseExpression(Expr))
-    return MatchOperand_ParseFail;
-
-  // The operand must be an un-qualified assembler local symbolref.
-  // FIXME: wrong for ELF.
-  if (const MCSymbolRefExpr *SRE = dyn_cast<const MCSymbolRefExpr>(Expr)) {
-    // FIXME: Should reference the MachineAsmInfo to get the private prefix.
-    bool isTemporary = SRE->getSymbol().getName().startswith("L");
-    if (!isTemporary || SRE->getKind() != MCSymbolRefExpr::VK_None) {
-      Error(S, "unqualified, assembler-local label name expected");
-      return MatchOperand_ParseFail;
-    }
-  }
-
-  SMLoc E = SMLoc::getFromPointer(getLoc().getPointer() - 1);
-  Operands.push_back(ARM64Operand::CreateImm(Expr, S, E, getContext()));
-
-  return MatchOperand_Success;
-}
-
-/// tryParseFPImm - A floating point immediate expression operand.
-ARM64AsmParser::OperandMatchResultTy
-ARM64AsmParser::tryParseFPImm(OperandVector &Operands) {
-  SMLoc S = getLoc();
-
-  if (Parser.getTok().isNot(AsmToken::Hash))
-    return MatchOperand_NoMatch;
-  Parser.Lex(); // Eat the '#'.
-
-  // Handle negation, as that still comes through as a separate token.
-  bool isNegative = false;
-  if (Parser.getTok().is(AsmToken::Minus)) {
-    isNegative = true;
-    Parser.Lex();
-  }
-  const AsmToken &Tok = Parser.getTok();
-  if (Tok.is(AsmToken::Real)) {
-    APFloat RealVal(APFloat::IEEEdouble, Tok.getString());
-    uint64_t IntVal = RealVal.bitcastToAPInt().getZExtValue();
-    // If we had a '-' in front, toggle the sign bit.
-    IntVal ^= (uint64_t)isNegative << 63;
-    int Val = ARM64_AM::getFP64Imm(APInt(64, IntVal));
-    Parser.Lex(); // Eat the token.
-    // Check for out of range values. As an exception, we let Zero through,
-    // as we handle that special case in post-processing before matching in
-    // order to use the zero register for it.
-    if (Val == -1 && !RealVal.isZero()) {
-      TokError("floating point value out of range");
-      return MatchOperand_ParseFail;
-    }
-    Operands.push_back(ARM64Operand::CreateFPImm(Val, S, getContext()));
-    return MatchOperand_Success;
-  }
-  if (Tok.is(AsmToken::Integer)) {
-    int64_t Val;
-    if (!isNegative && Tok.getString().startswith("0x")) {
-      Val = Tok.getIntVal();
-      if (Val > 255 || Val < 0) {
-        TokError("encoded floating point value out of range");
-        return MatchOperand_ParseFail;
-      }
-    } else {
-      APFloat RealVal(APFloat::IEEEdouble, Tok.getString());
-      uint64_t IntVal = RealVal.bitcastToAPInt().getZExtValue();
-      // If we had a '-' in front, toggle the sign bit.
-      IntVal ^= (uint64_t)isNegative << 63;
-      Val = ARM64_AM::getFP64Imm(APInt(64, IntVal));
-    }
-    Parser.Lex(); // Eat the token.
-    Operands.push_back(ARM64Operand::CreateFPImm(Val, S, getContext()));
-    return MatchOperand_Success;
-  }
-
-  TokError("invalid floating point immediate");
-  return MatchOperand_ParseFail;
-}
-
-/// parseCondCodeString - Parse a Condition Code string.
-unsigned ARM64AsmParser::parseCondCodeString(StringRef Cond) {
-  unsigned CC = StringSwitch<unsigned>(Cond)
-                    .Case("eq", ARM64CC::EQ)
-                    .Case("ne", ARM64CC::NE)
-                    .Case("cs", ARM64CC::CS)
-                    .Case("hs", ARM64CC::CS)
-                    .Case("cc", ARM64CC::CC)
-                    .Case("lo", ARM64CC::CC)
-                    .Case("mi", ARM64CC::MI)
-                    .Case("pl", ARM64CC::PL)
-                    .Case("vs", ARM64CC::VS)
-                    .Case("vc", ARM64CC::VC)
-                    .Case("hi", ARM64CC::HI)
-                    .Case("ls", ARM64CC::LS)
-                    .Case("ge", ARM64CC::GE)
-                    .Case("lt", ARM64CC::LT)
-                    .Case("gt", ARM64CC::GT)
-                    .Case("le", ARM64CC::LE)
-                    .Case("al", ARM64CC::AL)
-                // Upper case works too. Not mixed case, though.
-                    .Case("EQ", ARM64CC::EQ)
-                    .Case("NE", ARM64CC::NE)
-                    .Case("CS", ARM64CC::CS)
-                    .Case("HS", ARM64CC::CS)
-                    .Case("CC", ARM64CC::CC)
-                    .Case("LO", ARM64CC::CC)
-                    .Case("MI", ARM64CC::MI)
-                    .Case("PL", ARM64CC::PL)
-                    .Case("VS", ARM64CC::VS)
-                    .Case("VC", ARM64CC::VC)
-                    .Case("HI", ARM64CC::HI)
-                    .Case("LS", ARM64CC::LS)
-                    .Case("GE", ARM64CC::GE)
-                    .Case("LT", ARM64CC::LT)
-                    .Case("GT", ARM64CC::GT)
-                    .Case("LE", ARM64CC::LE)
-                    .Case("AL", ARM64CC::AL)
-                    .Default(~0U);
-  return CC;
-}
-
-/// parseCondCode - Parse a Condition Code operand.
-bool ARM64AsmParser::parseCondCode(OperandVector &Operands,
-                                   bool invertCondCode) {
-  SMLoc S = getLoc();
-  const AsmToken &Tok = Parser.getTok();
-  assert(Tok.is(AsmToken::Identifier) && "Token is not an Identifier");
-
-  StringRef Cond = Tok.getString();
-  unsigned CC = parseCondCodeString(Cond);
-  if (CC == ~0U)
-    return TokError("invalid condition code");
-  Parser.Lex(); // Eat identifier token.
-
-  if (invertCondCode)
-    CC = ARM64CC::getInvertedCondCode(ARM64CC::CondCode(CC));
-
-  const MCExpr *CCExpr = MCConstantExpr::Create(CC, getContext());
-  Operands.push_back(
-      ARM64Operand::CreateImm(CCExpr, S, getLoc(), getContext()));
-  return false;
-}
-
-/// ParseOptionalShift - Some operands take an optional shift argument. Parse
-/// them if present.
-bool ARM64AsmParser::parseOptionalShift(OperandVector &Operands) {
-  const AsmToken &Tok = Parser.getTok();
-  ARM64_AM::ShiftType ShOp = StringSwitch<ARM64_AM::ShiftType>(Tok.getString())
-                                 .Case("lsl", ARM64_AM::LSL)
-                                 .Case("lsr", ARM64_AM::LSR)
-                                 .Case("asr", ARM64_AM::ASR)
-                                 .Case("ror", ARM64_AM::ROR)
-                                 .Case("msl", ARM64_AM::MSL)
-                                 .Case("LSL", ARM64_AM::LSL)
-                                 .Case("LSR", ARM64_AM::LSR)
-                                 .Case("ASR", ARM64_AM::ASR)
-                                 .Case("ROR", ARM64_AM::ROR)
-                                 .Case("MSL", ARM64_AM::MSL)
-                                 .Default(ARM64_AM::InvalidShift);
-  if (ShOp == ARM64_AM::InvalidShift)
-    return true;
-
-  SMLoc S = Tok.getLoc();
-  Parser.Lex();
-
-  // We expect a number here.
-  if (getLexer().isNot(AsmToken::Hash))
-    return TokError("immediate value expected for shifter operand");
-  Parser.Lex(); // Eat the '#'.
-
-  SMLoc ExprLoc = getLoc();
-  const MCExpr *ImmVal;
-  if (getParser().parseExpression(ImmVal))
-    return true;
-
-  const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(ImmVal);
-  if (!MCE)
-    return TokError("immediate value expected for shifter operand");
-
-  if ((MCE->getValue() & 0x3f) != MCE->getValue())
-    return Error(ExprLoc, "immediate value too large for shifter operand");
-
-  SMLoc E = SMLoc::getFromPointer(getLoc().getPointer() - 1);
-  Operands.push_back(
-      ARM64Operand::CreateShifter(ShOp, MCE->getValue(), S, E, getContext()));
-  return false;
-}
-
-/// parseOptionalExtend - Some operands take an optional extend argument. Parse
-/// them if present.
-bool ARM64AsmParser::parseOptionalExtend(OperandVector &Operands) {
-  const AsmToken &Tok = Parser.getTok();
-  ARM64_AM::ExtendType ExtOp =
-      StringSwitch<ARM64_AM::ExtendType>(Tok.getString())
-          .Case("uxtb", ARM64_AM::UXTB)
-          .Case("uxth", ARM64_AM::UXTH)
-          .Case("uxtw", ARM64_AM::UXTW)
-          .Case("uxtx", ARM64_AM::UXTX)
-          .Case("lsl", ARM64_AM::UXTX) // Alias for UXTX
-          .Case("sxtb", ARM64_AM::SXTB)
-          .Case("sxth", ARM64_AM::SXTH)
-          .Case("sxtw", ARM64_AM::SXTW)
-          .Case("sxtx", ARM64_AM::SXTX)
-          .Case("UXTB", ARM64_AM::UXTB)
-          .Case("UXTH", ARM64_AM::UXTH)
-          .Case("UXTW", ARM64_AM::UXTW)
-          .Case("UXTX", ARM64_AM::UXTX)
-          .Case("LSL", ARM64_AM::UXTX) // Alias for UXTX
-          .Case("SXTB", ARM64_AM::SXTB)
-          .Case("SXTH", ARM64_AM::SXTH)
-          .Case("SXTW", ARM64_AM::SXTW)
-          .Case("SXTX", ARM64_AM::SXTX)
-          .Default(ARM64_AM::InvalidExtend);
-  if (ExtOp == ARM64_AM::InvalidExtend)
-    return true;
-
-  SMLoc S = Tok.getLoc();
-  Parser.Lex();
-
-  if (getLexer().is(AsmToken::EndOfStatement) ||
-      getLexer().is(AsmToken::Comma)) {
-    SMLoc E = SMLoc::getFromPointer(getLoc().getPointer() - 1);
-    Operands.push_back(
-        ARM64Operand::CreateExtend(ExtOp, 0, S, E, getContext()));
-    return false;
-  }
-
-  if (getLexer().isNot(AsmToken::Hash)) {
-    SMLoc E = SMLoc::getFromPointer(getLoc().getPointer() - 1);
-    Operands.push_back(
-        ARM64Operand::CreateExtend(ExtOp, 0, S, E, getContext()));
-    return false;
-  }
-
-  Parser.Lex(); // Eat the '#'.
-
-  const MCExpr *ImmVal;
-  if (getParser().parseExpression(ImmVal))
-    return true;
-
-  const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(ImmVal);
-  if (!MCE)
-    return TokError("immediate value expected for extend operand");
-
-  SMLoc E = SMLoc::getFromPointer(getLoc().getPointer() - 1);
-  Operands.push_back(
-      ARM64Operand::CreateExtend(ExtOp, MCE->getValue(), S, E, getContext()));
-  return false;
-}
-
-/// parseSysAlias - The IC, DC, AT, and TLBI instructions are simple aliases for
-/// the SYS instruction. Parse them specially so that we create a SYS MCInst.
-bool ARM64AsmParser::parseSysAlias(StringRef Name, SMLoc NameLoc,
-                                   OperandVector &Operands) {
-  if (Name.find('.') != StringRef::npos)
-    return TokError("invalid operand");
-
-  Mnemonic = Name;
-  Operands.push_back(
-      ARM64Operand::CreateToken("sys", false, NameLoc, getContext()));
-
-  const AsmToken &Tok = Parser.getTok();
-  StringRef Op = Tok.getString();
-  SMLoc S = Tok.getLoc();
-
-  const MCExpr *Expr = 0;
-
-#define SYS_ALIAS(op1, Cn, Cm, op2)                                            \
-  do {                                                                         \
-    Expr = MCConstantExpr::Create(op1, getContext());                          \
-    Operands.push_back(                                                        \
-        ARM64Operand::CreateImm(Expr, S, getLoc(), getContext()));             \
-    Operands.push_back(                                                        \
-        ARM64Operand::CreateSysCR(Cn, S, getLoc(), getContext()));             \
-    Operands.push_back(                                                        \
-        ARM64Operand::CreateSysCR(Cm, S, getLoc(), getContext()));             \
-    Expr = MCConstantExpr::Create(op2, getContext());                          \
-    Operands.push_back(                                                        \
-        ARM64Operand::CreateImm(Expr, S, getLoc(), getContext()));             \
-  } while (0)
-
-  if (Mnemonic == "ic") {
-    if (!Op.compare_lower("ialluis")) {
-      // SYS #0, C7, C1, #0
-      SYS_ALIAS(0, 7, 1, 0);
-    } else if (!Op.compare_lower("iallu")) {
-      // SYS #0, C7, C5, #0
-      SYS_ALIAS(0, 7, 5, 0);
-    } else if (!Op.compare_lower("ivau")) {
-      // SYS #3, C7, C5, #1
-      SYS_ALIAS(3, 7, 5, 1);
-    } else {
-      return TokError("invalid operand for IC instruction");
-    }
-  } else if (Mnemonic == "dc") {
-    if (!Op.compare_lower("zva")) {
-      // SYS #3, C7, C4, #1
-      SYS_ALIAS(3, 7, 4, 1);
-    } else if (!Op.compare_lower("ivac")) {
-      // SYS #3, C7, C6, #1
-      SYS_ALIAS(0, 7, 6, 1);
-    } else if (!Op.compare_lower("isw")) {
-      // SYS #0, C7, C6, #2
-      SYS_ALIAS(0, 7, 6, 2);
-    } else if (!Op.compare_lower("cvac")) {
-      // SYS #3, C7, C10, #1
-      SYS_ALIAS(3, 7, 10, 1);
-    } else if (!Op.compare_lower("csw")) {
-      // SYS #0, C7, C10, #2
-      SYS_ALIAS(0, 7, 10, 2);
-    } else if (!Op.compare_lower("cvau")) {
-      // SYS #3, C7, C11, #1
-      SYS_ALIAS(3, 7, 11, 1);
-    } else if (!Op.compare_lower("civac")) {
-      // SYS #3, C7, C14, #1
-      SYS_ALIAS(3, 7, 14, 1);
-    } else if (!Op.compare_lower("cisw")) {
-      // SYS #0, C7, C14, #2
-      SYS_ALIAS(0, 7, 14, 2);
-    } else {
-      return TokError("invalid operand for DC instruction");
-    }
-  } else if (Mnemonic == "at") {
-    if (!Op.compare_lower("s1e1r")) {
-      // SYS #0, C7, C8, #0
-      SYS_ALIAS(0, 7, 8, 0);
-    } else if (!Op.compare_lower("s1e2r")) {
-      // SYS #4, C7, C8, #0
-      SYS_ALIAS(4, 7, 8, 0);
-    } else if (!Op.compare_lower("s1e3r")) {
-      // SYS #6, C7, C8, #0
-      SYS_ALIAS(6, 7, 8, 0);
-    } else if (!Op.compare_lower("s1e1w")) {
-      // SYS #0, C7, C8, #1
-      SYS_ALIAS(0, 7, 8, 1);
-    } else if (!Op.compare_lower("s1e2w")) {
-      // SYS #4, C7, C8, #1
-      SYS_ALIAS(4, 7, 8, 1);
-    } else if (!Op.compare_lower("s1e3w")) {
-      // SYS #6, C7, C8, #1
-      SYS_ALIAS(6, 7, 8, 1);
-    } else if (!Op.compare_lower("s1e0r")) {
-      // SYS #0, C7, C8, #3
-      SYS_ALIAS(0, 7, 8, 2);
-    } else if (!Op.compare_lower("s1e0w")) {
-      // SYS #0, C7, C8, #3
-      SYS_ALIAS(0, 7, 8, 3);
-    } else if (!Op.compare_lower("s12e1r")) {
-      // SYS #4, C7, C8, #4
-      SYS_ALIAS(4, 7, 8, 4);
-    } else if (!Op.compare_lower("s12e1w")) {
-      // SYS #4, C7, C8, #5
-      SYS_ALIAS(4, 7, 8, 5);
-    } else if (!Op.compare_lower("s12e0r")) {
-      // SYS #4, C7, C8, #6
-      SYS_ALIAS(4, 7, 8, 6);
-    } else if (!Op.compare_lower("s12e0w")) {
-      // SYS #4, C7, C8, #7
-      SYS_ALIAS(4, 7, 8, 7);
-    } else {
-      return TokError("invalid operand for AT instruction");
-    }
-  } else if (Mnemonic == "tlbi") {
-    if (!Op.compare_lower("vmalle1is")) {
-      // SYS #0, C8, C3, #0
-      SYS_ALIAS(0, 8, 3, 0);
-    } else if (!Op.compare_lower("alle2is")) {
-      // SYS #4, C8, C3, #0
-      SYS_ALIAS(4, 8, 3, 0);
-    } else if (!Op.compare_lower("alle3is")) {
-      // SYS #6, C8, C3, #0
-      SYS_ALIAS(6, 8, 3, 0);
-    } else if (!Op.compare_lower("vae1is")) {
-      // SYS #0, C8, C3, #1
-      SYS_ALIAS(0, 8, 3, 1);
-    } else if (!Op.compare_lower("vae2is")) {
-      // SYS #4, C8, C3, #1
-      SYS_ALIAS(4, 8, 3, 1);
-    } else if (!Op.compare_lower("vae3is")) {
-      // SYS #6, C8, C3, #1
-      SYS_ALIAS(6, 8, 3, 1);
-    } else if (!Op.compare_lower("aside1is")) {
-      // SYS #0, C8, C3, #2
-      SYS_ALIAS(0, 8, 3, 2);
-    } else if (!Op.compare_lower("vaae1is")) {
-      // SYS #0, C8, C3, #3
-      SYS_ALIAS(0, 8, 3, 3);
-    } else if (!Op.compare_lower("alle1is")) {
-      // SYS #4, C8, C3, #4
-      SYS_ALIAS(4, 8, 3, 4);
-    } else if (!Op.compare_lower("vale1is")) {
-      // SYS #0, C8, C3, #5
-      SYS_ALIAS(0, 8, 3, 5);
-    } else if (!Op.compare_lower("vaale1is")) {
-      // SYS #0, C8, C3, #7
-      SYS_ALIAS(0, 8, 3, 7);
-    } else if (!Op.compare_lower("vmalle1")) {
-      // SYS #0, C8, C7, #0
-      SYS_ALIAS(0, 8, 7, 0);
-    } else if (!Op.compare_lower("alle2")) {
-      // SYS #4, C8, C7, #0
-      SYS_ALIAS(4, 8, 7, 0);
-    } else if (!Op.compare_lower("vale2is")) {
-      // SYS #4, C8, C3, #5
-      SYS_ALIAS(4, 8, 3, 5);
-    } else if (!Op.compare_lower("vale3is")) {
-      // SYS #6, C8, C3, #5
-      SYS_ALIAS(6, 8, 3, 5);
-    } else if (!Op.compare_lower("alle3")) {
-      // SYS #6, C8, C7, #0
-      SYS_ALIAS(6, 8, 7, 0);
-    } else if (!Op.compare_lower("vae1")) {
-      // SYS #0, C8, C7, #1
-      SYS_ALIAS(0, 8, 7, 1);
-    } else if (!Op.compare_lower("vae2")) {
-      // SYS #4, C8, C7, #1
-      SYS_ALIAS(4, 8, 7, 1);
-    } else if (!Op.compare_lower("vae3")) {
-      // SYS #6, C8, C7, #1
-      SYS_ALIAS(6, 8, 7, 1);
-    } else if (!Op.compare_lower("aside1")) {
-      // SYS #0, C8, C7, #2
-      SYS_ALIAS(0, 8, 7, 2);
-    } else if (!Op.compare_lower("vaae1")) {
-      // SYS #0, C8, C7, #3
-      SYS_ALIAS(0, 8, 7, 3);
-    } else if (!Op.compare_lower("alle1")) {
-      // SYS #4, C8, C7, #4
-      SYS_ALIAS(4, 8, 7, 4);
-    } else if (!Op.compare_lower("vale1")) {
-      // SYS #0, C8, C7, #5
-      SYS_ALIAS(0, 8, 7, 5);
-    } else if (!Op.compare_lower("vale2")) {
-      // SYS #4, C8, C7, #5
-      SYS_ALIAS(4, 8, 7, 5);
-    } else if (!Op.compare_lower("vale3")) {
-      // SYS #6, C8, C7, #5
-      SYS_ALIAS(6, 8, 7, 5);
-    } else if (!Op.compare_lower("vaale1")) {
-      // SYS #0, C8, C7, #7
-      SYS_ALIAS(0, 8, 7, 7);
-    } else if (!Op.compare_lower("ipas2e1")) {
-      // SYS #4, C8, C4, #1
-      SYS_ALIAS(4, 8, 4, 1);
-    } else if (!Op.compare_lower("ipas2le1")) {
-      // SYS #4, C8, C4, #5
-      SYS_ALIAS(4, 8, 4, 5);
-    } else if (!Op.compare_lower("vmalls12e1")) {
-      // SYS #4, C8, C7, #6
-      SYS_ALIAS(4, 8, 7, 6);
-    } else if (!Op.compare_lower("vmalls12e1is")) {
-      // SYS #4, C8, C3, #6
-      SYS_ALIAS(4, 8, 3, 6);
-    } else {
-      return TokError("invalid operand for TLBI instruction");
-    }
-  }
-
-#undef SYS_ALIAS
-
-  Parser.Lex(); // Eat operand.
-
-  // Check for the optional register operand.
-  if (getLexer().is(AsmToken::Comma)) {
-    Parser.Lex(); // Eat comma.
-
-    if (Tok.isNot(AsmToken::Identifier) || parseRegister(Operands))
-      return TokError("expected register operand");
-  }
-
-  if (getLexer().isNot(AsmToken::EndOfStatement)) {
-    Parser.eatToEndOfStatement();
-    return TokError("unexpected token in argument list");
-  }
-
-  Parser.Lex(); // Consume the EndOfStatement
-  return false;
-}
-
-ARM64AsmParser::OperandMatchResultTy
-ARM64AsmParser::tryParseBarrierOperand(OperandVector &Operands) {
-  const AsmToken &Tok = Parser.getTok();
-
-  // Can be either a #imm style literal or an option name
-  if (Tok.is(AsmToken::Hash)) {
-    // Immediate operand.
-    Parser.Lex(); // Eat the '#'
-    const MCExpr *ImmVal;
-    SMLoc ExprLoc = getLoc();
-    if (getParser().parseExpression(ImmVal))
-      return MatchOperand_ParseFail;
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(ImmVal);
-    if (!MCE) {
-      Error(ExprLoc, "immediate value expected for barrier operand");
-      return MatchOperand_ParseFail;
-    }
-    if (MCE->getValue() < 0 || MCE->getValue() > 15) {
-      Error(ExprLoc, "barrier operand out of range");
-      return MatchOperand_ParseFail;
-    }
-    Operands.push_back(
-        ARM64Operand::CreateBarrier(MCE->getValue(), ExprLoc, getContext()));
-    return MatchOperand_Success;
-  }
-
-  if (Tok.isNot(AsmToken::Identifier)) {
-    TokError("invalid operand for instruction");
-    return MatchOperand_ParseFail;
-  }
-
-  unsigned Opt = StringSwitch<unsigned>(Tok.getString())
-                     .Case("oshld", ARM64SYS::OSHLD)
-                     .Case("oshst", ARM64SYS::OSHST)
-                     .Case("osh", ARM64SYS::OSH)
-                     .Case("nshld", ARM64SYS::NSHLD)
-                     .Case("nshst", ARM64SYS::NSHST)
-                     .Case("nsh", ARM64SYS::NSH)
-                     .Case("ishld", ARM64SYS::ISHLD)
-                     .Case("ishst", ARM64SYS::ISHST)
-                     .Case("ish", ARM64SYS::ISH)
-                     .Case("ld", ARM64SYS::LD)
-                     .Case("st", ARM64SYS::ST)
-                     .Case("sy", ARM64SYS::SY)
-                     .Default(ARM64SYS::InvalidBarrier);
-  if (Opt == ARM64SYS::InvalidBarrier) {
-    TokError("invalid barrier option name");
-    return MatchOperand_ParseFail;
-  }
-
-  // The only valid named option for ISB is 'sy'
-  if (Mnemonic == "isb" && Opt != ARM64SYS::SY) {
-    TokError("'sy' or #imm operand expected");
-    return MatchOperand_ParseFail;
-  }
-
-  Operands.push_back(ARM64Operand::CreateBarrier(Opt, getLoc(), getContext()));
-  Parser.Lex(); // Consume the option
-
-  return MatchOperand_Success;
-}
-
-ARM64AsmParser::OperandMatchResultTy
-ARM64AsmParser::tryParseSystemRegister(OperandVector &Operands) {
-  const AsmToken &Tok = Parser.getTok();
-
-  // It can be specified as a symbolic name.
-  if (Tok.isNot(AsmToken::Identifier))
-    return MatchOperand_NoMatch;
-
-  auto ID = Tok.getString().lower();
-  ARM64SYS::SystemRegister Reg =
-      StringSwitch<ARM64SYS::SystemRegister>(ID)
-          .Case("spsr_el1", ARM64SYS::SPSR_svc)
-          .Case("spsr_svc", ARM64SYS::SPSR_svc)
-          .Case("elr_el1", ARM64SYS::ELR_EL1)
-          .Case("sp_el0", ARM64SYS::SP_EL0)
-          .Case("spsel", ARM64SYS::SPSel)
-          .Case("daif", ARM64SYS::DAIF)
-          .Case("currentel", ARM64SYS::CurrentEL)
-          .Case("nzcv", ARM64SYS::NZCV)
-          .Case("fpcr", ARM64SYS::FPCR)
-          .Case("fpsr", ARM64SYS::FPSR)
-          .Case("dspsr", ARM64SYS::DSPSR)
-          .Case("dlr", ARM64SYS::DLR)
-          .Case("spsr_el2", ARM64SYS::SPSR_hyp)
-          .Case("spsr_hyp", ARM64SYS::SPSR_hyp)
-          .Case("elr_el2", ARM64SYS::ELR_EL2)
-          .Case("sp_el1", ARM64SYS::SP_EL1)
-          .Case("spsr_irq", ARM64SYS::SPSR_irq)
-          .Case("spsr_abt", ARM64SYS::SPSR_abt)
-          .Case("spsr_und", ARM64SYS::SPSR_und)
-          .Case("spsr_fiq", ARM64SYS::SPSR_fiq)
-          .Case("spsr_el3", ARM64SYS::SPSR_EL3)
-          .Case("elr_el3", ARM64SYS::ELR_EL3)
-          .Case("sp_el2", ARM64SYS::SP_EL2)
-          .Case("midr_el1", ARM64SYS::MIDR_EL1)
-          .Case("ctr_el0", ARM64SYS::CTR_EL0)
-          .Case("mpidr_el1", ARM64SYS::MPIDR_EL1)
-          .Case("ecoidr_el1", ARM64SYS::ECOIDR_EL1)
-          .Case("dczid_el0", ARM64SYS::DCZID_EL0)
-          .Case("mvfr0_el1", ARM64SYS::MVFR0_EL1)
-          .Case("mvfr1_el1", ARM64SYS::MVFR1_EL1)
-          .Case("id_aa64pfr0_el1", ARM64SYS::ID_AA64PFR0_EL1)
-          .Case("id_aa64pfr1_el1", ARM64SYS::ID_AA64PFR1_EL1)
-          .Case("id_aa64dfr0_el1", ARM64SYS::ID_AA64DFR0_EL1)
-          .Case("id_aa64dfr1_el1", ARM64SYS::ID_AA64DFR1_EL1)
-          .Case("id_aa64isar0_el1", ARM64SYS::ID_AA64ISAR0_EL1)
-          .Case("id_aa64isar1_el1", ARM64SYS::ID_AA64ISAR1_EL1)
-          .Case("id_aa64mmfr0_el1", ARM64SYS::ID_AA64MMFR0_EL1)
-          .Case("id_aa64mmfr1_el1", ARM64SYS::ID_AA64MMFR1_EL1)
-          .Case("ccsidr_el1", ARM64SYS::CCSIDR_EL1)
-          .Case("clidr_el1", ARM64SYS::CLIDR_EL1)
-          .Case("aidr_el1", ARM64SYS::AIDR_EL1)
-          .Case("csselr_el1", ARM64SYS::CSSELR_EL1)
-          .Case("vpidr_el2", ARM64SYS::VPIDR_EL2)
-          .Case("vmpidr_el2", ARM64SYS::VMPIDR_EL2)
-          .Case("sctlr_el1", ARM64SYS::SCTLR_EL1)
-          .Case("sctlr_el2", ARM64SYS::SCTLR_EL2)
-          .Case("sctlr_el3", ARM64SYS::SCTLR_EL3)
-          .Case("actlr_el1", ARM64SYS::ACTLR_EL1)
-          .Case("actlr_el2", ARM64SYS::ACTLR_EL2)
-          .Case("actlr_el3", ARM64SYS::ACTLR_EL3)
-          .Case("cpacr_el1", ARM64SYS::CPACR_EL1)
-          .Case("cptr_el2", ARM64SYS::CPTR_EL2)
-          .Case("cptr_el3", ARM64SYS::CPTR_EL3)
-          .Case("scr_el3", ARM64SYS::SCR_EL3)
-          .Case("hcr_el2", ARM64SYS::HCR_EL2)
-          .Case("mdcr_el2", ARM64SYS::MDCR_EL2)
-          .Case("mdcr_el3", ARM64SYS::MDCR_EL3)
-          .Case("hstr_el2", ARM64SYS::HSTR_EL2)
-          .Case("hacr_el2", ARM64SYS::HACR_EL2)
-          .Case("ttbr0_el1", ARM64SYS::TTBR0_EL1)
-          .Case("ttbr1_el1", ARM64SYS::TTBR1_EL1)
-          .Case("ttbr0_el2", ARM64SYS::TTBR0_EL2)
-          .Case("ttbr0_el3", ARM64SYS::TTBR0_EL3)
-          .Case("vttbr_el2", ARM64SYS::VTTBR_EL2)
-          .Case("tcr_el1", ARM64SYS::TCR_EL1)
-          .Case("tcr_el2", ARM64SYS::TCR_EL2)
-          .Case("tcr_el3", ARM64SYS::TCR_EL3)
-          .Case("vtcr_el2", ARM64SYS::VTCR_EL2)
-          .Case("adfsr_el1", ARM64SYS::ADFSR_EL1)
-          .Case("aifsr_el1", ARM64SYS::AIFSR_EL1)
-          .Case("adfsr_el2", ARM64SYS::ADFSR_EL2)
-          .Case("aifsr_el2", ARM64SYS::AIFSR_EL2)
-          .Case("adfsr_el3", ARM64SYS::ADFSR_EL3)
-          .Case("aifsr_el3", ARM64SYS::AIFSR_EL3)
-          .Case("esr_el1", ARM64SYS::ESR_EL1)
-          .Case("esr_el2", ARM64SYS::ESR_EL2)
-          .Case("esr_el3", ARM64SYS::ESR_EL3)
-          .Case("far_el1", ARM64SYS::FAR_EL1)
-          .Case("far_el2", ARM64SYS::FAR_EL2)
-          .Case("far_el3", ARM64SYS::FAR_EL3)
-          .Case("hpfar_el2", ARM64SYS::HPFAR_EL2)
-          .Case("par_el1", ARM64SYS::PAR_EL1)
-          .Case("mair_el1", ARM64SYS::MAIR_EL1)
-          .Case("mair_el2", ARM64SYS::MAIR_EL2)
-          .Case("mair_el3", ARM64SYS::MAIR_EL3)
-          .Case("amair_el1", ARM64SYS::AMAIR_EL1)
-          .Case("amair_el2", ARM64SYS::AMAIR_EL2)
-          .Case("amair_el3", ARM64SYS::AMAIR_EL3)
-          .Case("vbar_el1", ARM64SYS::VBAR_EL1)
-          .Case("vbar_el2", ARM64SYS::VBAR_EL2)
-          .Case("vbar_el3", ARM64SYS::VBAR_EL3)
-          .Case("rvbar_el1", ARM64SYS::RVBAR_EL1)
-          .Case("rvbar_el2", ARM64SYS::RVBAR_EL2)
-          .Case("rvbar_el3", ARM64SYS::RVBAR_EL3)
-          .Case("isr_el1", ARM64SYS::ISR_EL1)
-          .Case("contextidr_el1", ARM64SYS::CONTEXTIDR_EL1)
-          .Case("tpidr_el0", ARM64SYS::TPIDR_EL0)
-          .Case("tpidrro_el0", ARM64SYS::TPIDRRO_EL0)
-          .Case("tpidr_el1", ARM64SYS::TPIDR_EL1)
-          .Case("tpidr_el2", ARM64SYS::TPIDR_EL2)
-          .Case("tpidr_el3", ARM64SYS::TPIDR_EL3)
-          .Case("teecr32_el1", ARM64SYS::TEECR32_EL1)
-          .Case("cntfrq_el0", ARM64SYS::CNTFRQ_EL0)
-          .Case("cntpct_el0", ARM64SYS::CNTPCT_EL0)
-          .Case("cntvct_el0", ARM64SYS::CNTVCT_EL0)
-          .Case("cntvoff_el2", ARM64SYS::CNTVOFF_EL2)
-          .Case("cntkctl_el1", ARM64SYS::CNTKCTL_EL1)
-          .Case("cnthctl_el2", ARM64SYS::CNTHCTL_EL2)
-          .Case("cntp_tval_el0", ARM64SYS::CNTP_TVAL_EL0)
-          .Case("cntp_ctl_el0", ARM64SYS::CNTP_CTL_EL0)
-          .Case("cntp_cval_el0", ARM64SYS::CNTP_CVAL_EL0)
-          .Case("cntv_tval_el0", ARM64SYS::CNTV_TVAL_EL0)
-          .Case("cntv_ctl_el0", ARM64SYS::CNTV_CTL_EL0)
-          .Case("cntv_cval_el0", ARM64SYS::CNTV_CVAL_EL0)
-          .Case("cnthp_tval_el2", ARM64SYS::CNTHP_TVAL_EL2)
-          .Case("cnthp_ctl_el2", ARM64SYS::CNTHP_CTL_EL2)
-          .Case("cnthp_cval_el2", ARM64SYS::CNTHP_CVAL_EL2)
-          .Case("cntps_tval_el1", ARM64SYS::CNTPS_TVAL_EL1)
-          .Case("cntps_ctl_el1", ARM64SYS::CNTPS_CTL_EL1)
-          .Case("cntps_cval_el1", ARM64SYS::CNTPS_CVAL_EL1)
-          .Case("dacr32_el2", ARM64SYS::DACR32_EL2)
-          .Case("ifsr32_el2", ARM64SYS::IFSR32_EL2)
-          .Case("teehbr32_el1", ARM64SYS::TEEHBR32_EL1)
-          .Case("sder32_el3", ARM64SYS::SDER32_EL3)
-          .Case("fpexc32_el2", ARM64SYS::FPEXC32_EL2)
-          .Case("current_el", ARM64SYS::CurrentEL)
-          .Case("pmevcntr0_el0", ARM64SYS::PMEVCNTR0_EL0)
-          .Case("pmevcntr1_el0", ARM64SYS::PMEVCNTR1_EL0)
-          .Case("pmevcntr2_el0", ARM64SYS::PMEVCNTR2_EL0)
-          .Case("pmevcntr3_el0", ARM64SYS::PMEVCNTR3_EL0)
-          .Case("pmevcntr4_el0", ARM64SYS::PMEVCNTR4_EL0)
-          .Case("pmevcntr5_el0", ARM64SYS::PMEVCNTR5_EL0)
-          .Case("pmevcntr6_el0", ARM64SYS::PMEVCNTR6_EL0)
-          .Case("pmevcntr7_el0", ARM64SYS::PMEVCNTR7_EL0)
-          .Case("pmevcntr8_el0", ARM64SYS::PMEVCNTR8_EL0)
-          .Case("pmevcntr9_el0", ARM64SYS::PMEVCNTR9_EL0)
-          .Case("pmevcntr10_el0", ARM64SYS::PMEVCNTR10_EL0)
-          .Case("pmevcntr11_el0", ARM64SYS::PMEVCNTR11_EL0)
-          .Case("pmevcntr12_el0", ARM64SYS::PMEVCNTR12_EL0)
-          .Case("pmevcntr13_el0", ARM64SYS::PMEVCNTR13_EL0)
-          .Case("pmevcntr14_el0", ARM64SYS::PMEVCNTR14_EL0)
-          .Case("pmevcntr15_el0", ARM64SYS::PMEVCNTR15_EL0)
-          .Case("pmevcntr16_el0", ARM64SYS::PMEVCNTR16_EL0)
-          .Case("pmevcntr17_el0", ARM64SYS::PMEVCNTR17_EL0)
-          .Case("pmevcntr18_el0", ARM64SYS::PMEVCNTR18_EL0)
-          .Case("pmevcntr19_el0", ARM64SYS::PMEVCNTR19_EL0)
-          .Case("pmevcntr20_el0", ARM64SYS::PMEVCNTR20_EL0)
-          .Case("pmevcntr21_el0", ARM64SYS::PMEVCNTR21_EL0)
-          .Case("pmevcntr22_el0", ARM64SYS::PMEVCNTR22_EL0)
-          .Case("pmevcntr23_el0", ARM64SYS::PMEVCNTR23_EL0)
-          .Case("pmevcntr24_el0", ARM64SYS::PMEVCNTR24_EL0)
-          .Case("pmevcntr25_el0", ARM64SYS::PMEVCNTR25_EL0)
-          .Case("pmevcntr26_el0", ARM64SYS::PMEVCNTR26_EL0)
-          .Case("pmevcntr27_el0", ARM64SYS::PMEVCNTR27_EL0)
-          .Case("pmevcntr28_el0", ARM64SYS::PMEVCNTR28_EL0)
-          .Case("pmevcntr29_el0", ARM64SYS::PMEVCNTR29_EL0)
-          .Case("pmevcntr30_el0", ARM64SYS::PMEVCNTR30_EL0)
-          .Case("pmevtyper0_el0", ARM64SYS::PMEVTYPER0_EL0)
-          .Case("pmevtyper1_el0", ARM64SYS::PMEVTYPER1_EL0)
-          .Case("pmevtyper2_el0", ARM64SYS::PMEVTYPER2_EL0)
-          .Case("pmevtyper3_el0", ARM64SYS::PMEVTYPER3_EL0)
-          .Case("pmevtyper4_el0", ARM64SYS::PMEVTYPER4_EL0)
-          .Case("pmevtyper5_el0", ARM64SYS::PMEVTYPER5_EL0)
-          .Case("pmevtyper6_el0", ARM64SYS::PMEVTYPER6_EL0)
-          .Case("pmevtyper7_el0", ARM64SYS::PMEVTYPER7_EL0)
-          .Case("pmevtyper8_el0", ARM64SYS::PMEVTYPER8_EL0)
-          .Case("pmevtyper9_el0", ARM64SYS::PMEVTYPER9_EL0)
-          .Case("pmevtyper10_el0", ARM64SYS::PMEVTYPER10_EL0)
-          .Case("pmevtyper11_el0", ARM64SYS::PMEVTYPER11_EL0)
-          .Case("pmevtyper12_el0", ARM64SYS::PMEVTYPER12_EL0)
-          .Case("pmevtyper13_el0", ARM64SYS::PMEVTYPER13_EL0)
-          .Case("pmevtyper14_el0", ARM64SYS::PMEVTYPER14_EL0)
-          .Case("pmevtyper15_el0", ARM64SYS::PMEVTYPER15_EL0)
-          .Case("pmevtyper16_el0", ARM64SYS::PMEVTYPER16_EL0)
-          .Case("pmevtyper17_el0", ARM64SYS::PMEVTYPER17_EL0)
-          .Case("pmevtyper18_el0", ARM64SYS::PMEVTYPER18_EL0)
-          .Case("pmevtyper19_el0", ARM64SYS::PMEVTYPER19_EL0)
-          .Case("pmevtyper20_el0", ARM64SYS::PMEVTYPER20_EL0)
-          .Case("pmevtyper21_el0", ARM64SYS::PMEVTYPER21_EL0)
-          .Case("pmevtyper22_el0", ARM64SYS::PMEVTYPER22_EL0)
-          .Case("pmevtyper23_el0", ARM64SYS::PMEVTYPER23_EL0)
-          .Case("pmevtyper24_el0", ARM64SYS::PMEVTYPER24_EL0)
-          .Case("pmevtyper25_el0", ARM64SYS::PMEVTYPER25_EL0)
-          .Case("pmevtyper26_el0", ARM64SYS::PMEVTYPER26_EL0)
-          .Case("pmevtyper27_el0", ARM64SYS::PMEVTYPER27_EL0)
-          .Case("pmevtyper28_el0", ARM64SYS::PMEVTYPER28_EL0)
-          .Case("pmevtyper29_el0", ARM64SYS::PMEVTYPER29_EL0)
-          .Case("pmevtyper30_el0", ARM64SYS::PMEVTYPER30_EL0)
-          .Case("pmccfiltr_el0", ARM64SYS::PMCCFILTR_EL0)
-          .Case("rmr_el3", ARM64SYS::RMR_EL3)
-          .Case("rmr_el2", ARM64SYS::RMR_EL2)
-          .Case("rmr_el1", ARM64SYS::RMR_EL1)
-          .Case("cpm_ioacc_ctl_el3", ARM64SYS::CPM_IOACC_CTL_EL3)
-          .Case("mdccsr_el0", ARM64SYS::MDCCSR_EL0)
-          .Case("mdccint_el1", ARM64SYS::MDCCINT_EL1)
-          .Case("dbgdtr_el0", ARM64SYS::DBGDTR_EL0)
-          .Case("dbgdtrrx_el0", ARM64SYS::DBGDTRRX_EL0)
-          .Case("dbgdtrtx_el0", ARM64SYS::DBGDTRTX_EL0)
-          .Case("dbgvcr32_el2", ARM64SYS::DBGVCR32_EL2)
-          .Case("osdtrrx_el1", ARM64SYS::OSDTRRX_EL1)
-          .Case("mdscr_el1", ARM64SYS::MDSCR_EL1)
-          .Case("osdtrtx_el1", ARM64SYS::OSDTRTX_EL1)
-          .Case("oseccr_el11", ARM64SYS::OSECCR_EL11)
-          .Case("dbgbvr0_el1", ARM64SYS::DBGBVR0_EL1)
-          .Case("dbgbvr1_el1", ARM64SYS::DBGBVR1_EL1)
-          .Case("dbgbvr2_el1", ARM64SYS::DBGBVR2_EL1)
-          .Case("dbgbvr3_el1", ARM64SYS::DBGBVR3_EL1)
-          .Case("dbgbvr4_el1", ARM64SYS::DBGBVR4_EL1)
-          .Case("dbgbvr5_el1", ARM64SYS::DBGBVR5_EL1)
-          .Case("dbgbvr6_el1", ARM64SYS::DBGBVR6_EL1)
-          .Case("dbgbvr7_el1", ARM64SYS::DBGBVR7_EL1)
-          .Case("dbgbvr8_el1", ARM64SYS::DBGBVR8_EL1)
-          .Case("dbgbvr9_el1", ARM64SYS::DBGBVR9_EL1)
-          .Case("dbgbvr10_el1", ARM64SYS::DBGBVR10_EL1)
-          .Case("dbgbvr11_el1", ARM64SYS::DBGBVR11_EL1)
-          .Case("dbgbvr12_el1", ARM64SYS::DBGBVR12_EL1)
-          .Case("dbgbvr13_el1", ARM64SYS::DBGBVR13_EL1)
-          .Case("dbgbvr14_el1", ARM64SYS::DBGBVR14_EL1)
-          .Case("dbgbvr15_el1", ARM64SYS::DBGBVR15_EL1)
-          .Case("dbgbcr0_el1", ARM64SYS::DBGBCR0_EL1)
-          .Case("dbgbcr1_el1", ARM64SYS::DBGBCR1_EL1)
-          .Case("dbgbcr2_el1", ARM64SYS::DBGBCR2_EL1)
-          .Case("dbgbcr3_el1", ARM64SYS::DBGBCR3_EL1)
-          .Case("dbgbcr4_el1", ARM64SYS::DBGBCR4_EL1)
-          .Case("dbgbcr5_el1", ARM64SYS::DBGBCR5_EL1)
-          .Case("dbgbcr6_el1", ARM64SYS::DBGBCR6_EL1)
-          .Case("dbgbcr7_el1", ARM64SYS::DBGBCR7_EL1)
-          .Case("dbgbcr8_el1", ARM64SYS::DBGBCR8_EL1)
-          .Case("dbgbcr9_el1", ARM64SYS::DBGBCR9_EL1)
-          .Case("dbgbcr10_el1", ARM64SYS::DBGBCR10_EL1)
-          .Case("dbgbcr11_el1", ARM64SYS::DBGBCR11_EL1)
-          .Case("dbgbcr12_el1", ARM64SYS::DBGBCR12_EL1)
-          .Case("dbgbcr13_el1", ARM64SYS::DBGBCR13_EL1)
-          .Case("dbgbcr14_el1", ARM64SYS::DBGBCR14_EL1)
-          .Case("dbgbcr15_el1", ARM64SYS::DBGBCR15_EL1)
-          .Case("dbgwvr0_el1", ARM64SYS::DBGWVR0_EL1)
-          .Case("dbgwvr1_el1", ARM64SYS::DBGWVR1_EL1)
-          .Case("dbgwvr2_el1", ARM64SYS::DBGWVR2_EL1)
-          .Case("dbgwvr3_el1", ARM64SYS::DBGWVR3_EL1)
-          .Case("dbgwvr4_el1", ARM64SYS::DBGWVR4_EL1)
-          .Case("dbgwvr5_el1", ARM64SYS::DBGWVR5_EL1)
-          .Case("dbgwvr6_el1", ARM64SYS::DBGWVR6_EL1)
-          .Case("dbgwvr7_el1", ARM64SYS::DBGWVR7_EL1)
-          .Case("dbgwvr8_el1", ARM64SYS::DBGWVR8_EL1)
-          .Case("dbgwvr9_el1", ARM64SYS::DBGWVR9_EL1)
-          .Case("dbgwvr10_el1", ARM64SYS::DBGWVR10_EL1)
-          .Case("dbgwvr11_el1", ARM64SYS::DBGWVR11_EL1)
-          .Case("dbgwvr12_el1", ARM64SYS::DBGWVR12_EL1)
-          .Case("dbgwvr13_el1", ARM64SYS::DBGWVR13_EL1)
-          .Case("dbgwvr14_el1", ARM64SYS::DBGWVR14_EL1)
-          .Case("dbgwvr15_el1", ARM64SYS::DBGWVR15_EL1)
-          .Case("dbgwcr0_el1", ARM64SYS::DBGWCR0_EL1)
-          .Case("dbgwcr1_el1", ARM64SYS::DBGWCR1_EL1)
-          .Case("dbgwcr2_el1", ARM64SYS::DBGWCR2_EL1)
-          .Case("dbgwcr3_el1", ARM64SYS::DBGWCR3_EL1)
-          .Case("dbgwcr4_el1", ARM64SYS::DBGWCR4_EL1)
-          .Case("dbgwcr5_el1", ARM64SYS::DBGWCR5_EL1)
-          .Case("dbgwcr6_el1", ARM64SYS::DBGWCR6_EL1)
-          .Case("dbgwcr7_el1", ARM64SYS::DBGWCR7_EL1)
-          .Case("dbgwcr8_el1", ARM64SYS::DBGWCR8_EL1)
-          .Case("dbgwcr9_el1", ARM64SYS::DBGWCR9_EL1)
-          .Case("dbgwcr10_el1", ARM64SYS::DBGWCR10_EL1)
-          .Case("dbgwcr11_el1", ARM64SYS::DBGWCR11_EL1)
-          .Case("dbgwcr12_el1", ARM64SYS::DBGWCR12_EL1)
-          .Case("dbgwcr13_el1", ARM64SYS::DBGWCR13_EL1)
-          .Case("dbgwcr14_el1", ARM64SYS::DBGWCR14_EL1)
-          .Case("dbgwcr15_el1", ARM64SYS::DBGWCR15_EL1)
-          .Case("mdrar_el1", ARM64SYS::MDRAR_EL1)
-          .Case("oslar_el1", ARM64SYS::OSLAR_EL1)
-          .Case("oslsr_el1", ARM64SYS::OSLSR_EL1)
-          .Case("osdlr_el1", ARM64SYS::OSDLR_EL1)
-          .Case("dbgprcr_el1", ARM64SYS::DBGPRCR_EL1)
-          .Case("dbgclaimset_el1", ARM64SYS::DBGCLAIMSET_EL1)
-          .Case("dbgclaimclr_el1", ARM64SYS::DBGCLAIMCLR_EL1)
-          .Case("dbgauthstatus_el1", ARM64SYS::DBGAUTHSTATUS_EL1)
-          .Case("dbgdevid2", ARM64SYS::DBGDEVID2)
-          .Case("dbgdevid1", ARM64SYS::DBGDEVID1)
-          .Case("dbgdevid0", ARM64SYS::DBGDEVID0)
-          .Case("id_pfr0_el1", ARM64SYS::ID_PFR0_EL1)
-          .Case("id_pfr1_el1", ARM64SYS::ID_PFR1_EL1)
-          .Case("id_dfr0_el1", ARM64SYS::ID_DFR0_EL1)
-          .Case("id_afr0_el1", ARM64SYS::ID_AFR0_EL1)
-          .Case("id_isar0_el1", ARM64SYS::ID_ISAR0_EL1)
-          .Case("id_isar1_el1", ARM64SYS::ID_ISAR1_EL1)
-          .Case("id_isar2_el1", ARM64SYS::ID_ISAR2_EL1)
-          .Case("id_isar3_el1", ARM64SYS::ID_ISAR3_EL1)
-          .Case("id_isar4_el1", ARM64SYS::ID_ISAR4_EL1)
-          .Case("id_isar5_el1", ARM64SYS::ID_ISAR5_EL1)
-          .Case("afsr1_el1", ARM64SYS::AFSR1_EL1)
-          .Case("afsr0_el1", ARM64SYS::AFSR0_EL1)
-          .Case("revidr_el1", ARM64SYS::REVIDR_EL1)
-          .Default(ARM64SYS::InvalidSystemReg);
-  if (Reg != ARM64SYS::InvalidSystemReg) {
-    // We matched a reg name, so create the operand.
-    Operands.push_back(
-        ARM64Operand::CreateSystemRegister(Reg, getLoc(), getContext()));
-    Parser.Lex(); // Consume the register name.
-    return MatchOperand_Success;
-  }
-
-  // Or we may have an identifier that encodes the sub-operands.
-  // For example, s3_2_c15_c0_0.
-  unsigned op0, op1, CRn, CRm, op2;
-  std::string Desc = ID;
-  if (std::sscanf(Desc.c_str(), "s%u_%u_c%u_c%u_%u", &op0, &op1, &CRn, &CRm,
-                  &op2) != 5)
-    return MatchOperand_NoMatch;
-  if ((op0 != 2 && op0 != 3) || op1 > 7 || CRn > 15 || CRm > 15 || op2 > 7)
-    return MatchOperand_NoMatch;
-
-  unsigned Val = op0 << 14 | op1 << 11 | CRn << 7 | CRm << 3 | op2;
-  Operands.push_back(
-      ARM64Operand::CreateSystemRegister(Val, getLoc(), getContext()));
-  Parser.Lex(); // Consume the register name.
-
-  return MatchOperand_Success;
-}
-
-ARM64AsmParser::OperandMatchResultTy
-ARM64AsmParser::tryParseCPSRField(OperandVector &Operands) {
-  const AsmToken &Tok = Parser.getTok();
-
-  if (Tok.isNot(AsmToken::Identifier))
-    return MatchOperand_NoMatch;
-
-  ARM64SYS::CPSRField Field =
-      StringSwitch<ARM64SYS::CPSRField>(Tok.getString().lower())
-          .Case("spsel", ARM64SYS::cpsr_SPSel)
-          .Case("daifset", ARM64SYS::cpsr_DAIFSet)
-          .Case("daifclr", ARM64SYS::cpsr_DAIFClr)
-          .Default(ARM64SYS::InvalidCPSRField);
-  if (Field == ARM64SYS::InvalidCPSRField)
-    return MatchOperand_NoMatch;
-  Operands.push_back(
-      ARM64Operand::CreateCPSRField(Field, getLoc(), getContext()));
-  Parser.Lex(); // Consume the register name.
-
-  return MatchOperand_Success;
-}
-
-/// tryParseVectorRegister - Parse a vector register operand.
-bool ARM64AsmParser::tryParseVectorRegister(OperandVector &Operands) {
-  if (Parser.getTok().isNot(AsmToken::Identifier))
-    return true;
-
-  SMLoc S = getLoc();
-  // Check for a vector register specifier first.
-  StringRef Kind;
-  int64_t Reg = tryMatchVectorRegister(Kind);
-  if (Reg == -1)
-    return true;
-  Operands.push_back(
-      ARM64Operand::CreateReg(Reg, true, S, getLoc(), getContext()));
-  // If there was an explicit qualifier, that goes on as a literal text
-  // operand.
-  if (!Kind.empty())
-    Operands.push_back(ARM64Operand::CreateToken(Kind, false, S, getContext()));
-
-  // If there is an index specifier following the register, parse that too.
-  if (Parser.getTok().is(AsmToken::LBrac)) {
-    SMLoc SIdx = getLoc();
-    Parser.Lex(); // Eat left bracket token.
-
-    const MCExpr *ImmVal;
-    if (getParser().parseExpression(ImmVal))
-      return false;
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(ImmVal);
-    if (!MCE) {
-      TokError("immediate value expected for vector index");
-      return false;
-    }
-
-    SMLoc E = getLoc();
-    if (Parser.getTok().isNot(AsmToken::RBrac)) {
-      Error(E, "']' expected");
-      return false;
-    }
-
-    Parser.Lex(); // Eat right bracket token.
-
-    Operands.push_back(ARM64Operand::CreateVectorIndex(MCE->getValue(), SIdx, E,
-                                                       getContext()));
-  }
-
-  return false;
-}
-
-/// parseRegister - Parse a non-vector register operand.
-bool ARM64AsmParser::parseRegister(OperandVector &Operands) {
-  SMLoc S = getLoc();
-  // Try for a vector register.
-  if (!tryParseVectorRegister(Operands))
-    return false;
-
-  // Try for a scalar register.
-  int64_t Reg = tryParseRegister();
-  if (Reg == -1)
-    return true;
-  Operands.push_back(
-      ARM64Operand::CreateReg(Reg, false, S, getLoc(), getContext()));
-
-  // A small number of instructions (FMOVXDhighr, for example) have "[1]"
-  // as a string token in the instruction itself.
-  if (getLexer().getKind() == AsmToken::LBrac) {
-    SMLoc LBracS = getLoc();
-    Parser.Lex();
-    const AsmToken &Tok = Parser.getTok();
-    if (Tok.is(AsmToken::Integer)) {
-      SMLoc IntS = getLoc();
-      int64_t Val = Tok.getIntVal();
-      if (Val == 1) {
-        Parser.Lex();
-        if (getLexer().getKind() == AsmToken::RBrac) {
-          SMLoc RBracS = getLoc();
-          Parser.Lex();
-          Operands.push_back(
-              ARM64Operand::CreateToken("[", false, LBracS, getContext()));
-          Operands.push_back(
-              ARM64Operand::CreateToken("1", false, IntS, getContext()));
-          Operands.push_back(
-              ARM64Operand::CreateToken("]", false, RBracS, getContext()));
-          return false;
-        }
-      }
-    }
-  }
-
-  return false;
-}
-
-/// tryParseNoIndexMemory - Custom parser method for memory operands that
-///                         do not allow base regisrer writeback modes,
-///                         or those that handle writeback separately from
-///                         the memory operand (like the AdvSIMD ldX/stX
-///                         instructions.
-ARM64AsmParser::OperandMatchResultTy
-ARM64AsmParser::tryParseNoIndexMemory(OperandVector &Operands) {
-  if (Parser.getTok().isNot(AsmToken::LBrac))
-    return MatchOperand_NoMatch;
-  SMLoc S = getLoc();
-  Parser.Lex(); // Eat left bracket token.
-
-  const AsmToken &BaseRegTok = Parser.getTok();
-  if (BaseRegTok.isNot(AsmToken::Identifier)) {
-    Error(BaseRegTok.getLoc(), "register expected");
-    return MatchOperand_ParseFail;
-  }
-
-  int64_t Reg = tryParseRegister();
-  if (Reg == -1) {
-    Error(BaseRegTok.getLoc(), "register expected");
-    return MatchOperand_ParseFail;
-  }
-
-  SMLoc E = getLoc();
-  if (Parser.getTok().isNot(AsmToken::RBrac)) {
-    Error(E, "']' expected");
-    return MatchOperand_ParseFail;
-  }
-
-  Parser.Lex(); // Eat right bracket token.
-
-  Operands.push_back(ARM64Operand::CreateMem(Reg, 0, S, E, E, getContext()));
-  return MatchOperand_Success;
-}
-
-/// parseMemory - Parse a memory operand for a basic load/store instruction.
-bool ARM64AsmParser::parseMemory(OperandVector &Operands) {
-  assert(Parser.getTok().is(AsmToken::LBrac) && "Token is not a Left Bracket");
-  SMLoc S = getLoc();
-  Parser.Lex(); // Eat left bracket token.
-
-  const AsmToken &BaseRegTok = Parser.getTok();
-  if (BaseRegTok.isNot(AsmToken::Identifier))
-    return Error(BaseRegTok.getLoc(), "register expected");
-
-  int64_t Reg = tryParseRegister();
-  if (Reg == -1)
-    return Error(BaseRegTok.getLoc(), "register expected");
-
-  // If there is an offset expression, parse it.
-  const MCExpr *OffsetExpr = 0;
-  SMLoc OffsetLoc;
-  if (Parser.getTok().is(AsmToken::Comma)) {
-    Parser.Lex(); // Eat the comma.
-    OffsetLoc = getLoc();
-
-    // Register offset
-    const AsmToken &OffsetRegTok = Parser.getTok();
-    int Reg2 = OffsetRegTok.is(AsmToken::Identifier) ? tryParseRegister() : -1;
-    if (Reg2 != -1) {
-      // Default shift is LSL, with an omitted shift.  We use the third bit of
-      // the extend value to indicate presence/omission of the immediate offset.
-      ARM64_AM::ExtendType ExtOp = ARM64_AM::UXTX;
-      int64_t ShiftVal = 0;
-      bool ExplicitShift = false;
-
-      if (Parser.getTok().is(AsmToken::Comma)) {
-        // Embedded extend operand.
-        Parser.Lex(); // Eat the comma
-
-        SMLoc ExtLoc = getLoc();
-        const AsmToken &Tok = Parser.getTok();
-        ExtOp = StringSwitch<ARM64_AM::ExtendType>(Tok.getString())
-                    .Case("uxtw", ARM64_AM::UXTW)
-                    .Case("lsl", ARM64_AM::UXTX) // Alias for UXTX
-                    .Case("sxtw", ARM64_AM::SXTW)
-                    .Case("sxtx", ARM64_AM::SXTX)
-                    .Case("UXTW", ARM64_AM::UXTW)
-                    .Case("LSL", ARM64_AM::UXTX) // Alias for UXTX
-                    .Case("SXTW", ARM64_AM::SXTW)
-                    .Case("SXTX", ARM64_AM::SXTX)
-                    .Default(ARM64_AM::InvalidExtend);
-        if (ExtOp == ARM64_AM::InvalidExtend)
-          return Error(ExtLoc, "expected valid extend operation");
-
-        Parser.Lex(); // Eat the extend op.
-
-        if (getLexer().is(AsmToken::RBrac)) {
-          // No immediate operand.
-          if (ExtOp == ARM64_AM::UXTX)
-            return Error(ExtLoc, "LSL extend requires immediate operand");
-        } else if (getLexer().is(AsmToken::Hash)) {
-          // Immediate operand.
-          Parser.Lex(); // Eat the '#'
-          const MCExpr *ImmVal;
-          SMLoc ExprLoc = getLoc();
-          if (getParser().parseExpression(ImmVal))
-            return true;
-          const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(ImmVal);
-          if (!MCE)
-            return TokError("immediate value expected for extend operand");
-
-          ExplicitShift = true;
-          ShiftVal = MCE->getValue();
-          if (ShiftVal < 0 || ShiftVal > 4)
-            return Error(ExprLoc, "immediate operand out of range");
-        } else
-          return Error(getLoc(), "expected immediate operand");
-      }
-
-      if (Parser.getTok().isNot(AsmToken::RBrac))
-        return Error(getLoc(), "']' expected");
-
-      Parser.Lex(); // Eat right bracket token.
-
-      SMLoc E = getLoc();
-      Operands.push_back(ARM64Operand::CreateRegOffsetMem(
-          Reg, Reg2, ExtOp, ShiftVal, ExplicitShift, S, E, getContext()));
-      return false;
-
-      // Immediate expressions.
-    } else if (Parser.getTok().is(AsmToken::Hash)) {
-      Parser.Lex(); // Eat hash token.
-
-      if (parseSymbolicImmVal(OffsetExpr))
-        return true;
-    } else {
-      // FIXME: We really should make sure that we're dealing with a LDR/STR
-      // instruction that can legally have a symbolic expression here.
-      // Symbol reference.
-      if (Parser.getTok().isNot(AsmToken::Identifier) &&
-          Parser.getTok().isNot(AsmToken::String))
-        return Error(getLoc(), "identifier or immediate expression expected");
-      if (getParser().parseExpression(OffsetExpr))
-        return true;
-      // If this is a plain ref, Make sure a legal variant kind was specified.
-      // Otherwise, it's a more complicated expression and we have to just
-      // assume it's OK and let the relocation stuff puke if it's not.
-      ARM64MCExpr::VariantKind ELFRefKind;
-      MCSymbolRefExpr::VariantKind DarwinRefKind;
-      const MCConstantExpr *Addend;
-      if (classifySymbolRef(OffsetExpr, ELFRefKind, DarwinRefKind, Addend) &&
-          Addend == 0) {
-        assert(ELFRefKind == ARM64MCExpr::VK_INVALID &&
-               "ELF symbol modifiers not supported here yet");
-
-        switch (DarwinRefKind) {
-        default:
-          return Error(getLoc(), "expected @pageoff or @gotpageoff modifier");
-        case MCSymbolRefExpr::VK_GOTPAGEOFF:
-        case MCSymbolRefExpr::VK_PAGEOFF:
-        case MCSymbolRefExpr::VK_TLVPPAGEOFF:
-          // These are what we're expecting.
-          break;
-        }
-      }
-    }
-  }
-
-  SMLoc E = getLoc();
-  if (Parser.getTok().isNot(AsmToken::RBrac))
-    return Error(E, "']' expected");
-
-  Parser.Lex(); // Eat right bracket token.
-
-  // Create the memory operand.
-  Operands.push_back(
-      ARM64Operand::CreateMem(Reg, OffsetExpr, S, E, OffsetLoc, getContext()));
-
-  // Check for a '!', indicating pre-indexed addressing with writeback.
-  if (Parser.getTok().is(AsmToken::Exclaim)) {
-    // There needs to have been an immediate or wback doesn't make sense.
-    if (!OffsetExpr)
-      return Error(E, "missing offset for pre-indexed addressing");
-    // Pre-indexed with writeback must have a constant expression for the
-    // offset. FIXME: Theoretically, we'd like to allow fixups so long
-    // as they don't require a relocation.
-    if (!isa<MCConstantExpr>(OffsetExpr))
-      return Error(OffsetLoc, "constant immediate expression expected");
-
-    // Create the Token operand for the '!'.
-    Operands.push_back(ARM64Operand::CreateToken(
-        "!", false, Parser.getTok().getLoc(), getContext()));
-    Parser.Lex(); // Eat the '!' token.
-  }
-
-  return false;
-}
-
-bool ARM64AsmParser::parseSymbolicImmVal(const MCExpr *&ImmVal) {
-  bool HasELFModifier = false;
-  ARM64MCExpr::VariantKind RefKind;
-
-  if (Parser.getTok().is(AsmToken::Colon)) {
-    Parser.Lex(); // Eat ':"
-    HasELFModifier = true;
-
-    if (Parser.getTok().isNot(AsmToken::Identifier)) {
-      Error(Parser.getTok().getLoc(),
-            "expect relocation specifier in operand after ':'");
-      return true;
-    }
-
-    std::string LowerCase = Parser.getTok().getIdentifier().lower();
-    RefKind = StringSwitch<ARM64MCExpr::VariantKind>(LowerCase)
-                  .Case("lo12", ARM64MCExpr::VK_LO12)
-                  .Case("abs_g3", ARM64MCExpr::VK_ABS_G3)
-                  .Case("abs_g2", ARM64MCExpr::VK_ABS_G2)
-                  .Case("abs_g2_nc", ARM64MCExpr::VK_ABS_G2_NC)
-                  .Case("abs_g1", ARM64MCExpr::VK_ABS_G1)
-                  .Case("abs_g1_nc", ARM64MCExpr::VK_ABS_G1_NC)
-                  .Case("abs_g0", ARM64MCExpr::VK_ABS_G0)
-                  .Case("abs_g0_nc", ARM64MCExpr::VK_ABS_G0_NC)
-                  .Case("dtprel_g2", ARM64MCExpr::VK_DTPREL_G2)
-                  .Case("dtprel_g1", ARM64MCExpr::VK_DTPREL_G1)
-                  .Case("dtprel_g1_nc", ARM64MCExpr::VK_DTPREL_G1_NC)
-                  .Case("dtprel_g0", ARM64MCExpr::VK_DTPREL_G0)
-                  .Case("dtprel_g0_nc", ARM64MCExpr::VK_DTPREL_G0_NC)
-                  .Case("dtprel_lo12", ARM64MCExpr::VK_DTPREL_LO12)
-                  .Case("dtprel_lo12_nc", ARM64MCExpr::VK_DTPREL_LO12_NC)
-                  .Case("tprel_g2", ARM64MCExpr::VK_TPREL_G2)
-                  .Case("tprel_g1", ARM64MCExpr::VK_TPREL_G1)
-                  .Case("tprel_g1_nc", ARM64MCExpr::VK_TPREL_G1_NC)
-                  .Case("tprel_g0", ARM64MCExpr::VK_TPREL_G0)
-                  .Case("tprel_g0_nc", ARM64MCExpr::VK_TPREL_G0_NC)
-                  .Case("tprel_lo12", ARM64MCExpr::VK_TPREL_LO12)
-                  .Case("tprel_lo12_nc", ARM64MCExpr::VK_TPREL_LO12_NC)
-                  .Case("tlsdesc_lo12", ARM64MCExpr::VK_TLSDESC_LO12)
-                  .Case("got", ARM64MCExpr::VK_GOT_PAGE)
-                  .Case("got_lo12", ARM64MCExpr::VK_GOT_LO12)
-                  .Case("gottprel", ARM64MCExpr::VK_GOTTPREL_PAGE)
-                  .Case("gottprel_lo12", ARM64MCExpr::VK_GOTTPREL_LO12_NC)
-                  .Case("gottprel_g1", ARM64MCExpr::VK_GOTTPREL_G1)
-                  .Case("gottprel_g0_nc", ARM64MCExpr::VK_GOTTPREL_G0_NC)
-                  .Case("tlsdesc", ARM64MCExpr::VK_TLSDESC_PAGE)
-                  .Default(ARM64MCExpr::VK_INVALID);
-
-    if (RefKind == ARM64MCExpr::VK_INVALID) {
-      Error(Parser.getTok().getLoc(),
-            "expect relocation specifier in operand after ':'");
-      return true;
-    }
-
-    Parser.Lex(); // Eat identifier
-
-    if (Parser.getTok().isNot(AsmToken::Colon)) {
-      Error(Parser.getTok().getLoc(), "expect ':' after relocation specifier");
-      return true;
-    }
-    Parser.Lex(); // Eat ':'
-  }
-
-  if (getParser().parseExpression(ImmVal))
-    return true;
-
-  if (HasELFModifier)
-    ImmVal = ARM64MCExpr::Create(ImmVal, RefKind, getContext());
-
-  return false;
-}
-
-/// parseVectorList - Parse a vector list operand for AdvSIMD instructions.
-bool ARM64AsmParser::parseVectorList(OperandVector &Operands) {
-  assert(Parser.getTok().is(AsmToken::LCurly) && "Token is not a Left Bracket");
-  SMLoc S = getLoc();
-  Parser.Lex(); // Eat left bracket token.
-  StringRef Kind;
-  int64_t FirstReg = tryMatchVectorRegister(Kind);
-  if (FirstReg == -1)
-    return Error(getLoc(), "vector register expected");
-  int64_t PrevReg = FirstReg;
-  unsigned Count = 1;
-  while (Parser.getTok().isNot(AsmToken::RCurly)) {
-    if (Parser.getTok().is(AsmToken::EndOfStatement))
-      Error(getLoc(), "'}' expected");
-
-    if (Parser.getTok().isNot(AsmToken::Comma))
-      return Error(getLoc(), "',' expected");
-    Parser.Lex(); // Eat the comma token.
-
-    SMLoc Loc = getLoc();
-    StringRef NextKind;
-    int64_t Reg = tryMatchVectorRegister(NextKind);
-    if (Reg == -1)
-      return Error(Loc, "vector register expected");
-    // Any Kind suffices must match on all regs in the list.
-    if (Kind != NextKind)
-      return Error(Loc, "mismatched register size suffix");
-
-    // Registers must be incremental (with wraparound at 31)
-    if (getContext().getRegisterInfo()->getEncodingValue(Reg) !=
-        (getContext().getRegisterInfo()->getEncodingValue(PrevReg) + 1) % 32)
-      return Error(Loc, "registers must be sequential");
-
-    PrevReg = Reg;
-    ++Count;
-  }
-  Parser.Lex(); // Eat the '}' token.
-
-  unsigned NumElements = 0;
-  char ElementKind = 0;
-  if (!Kind.empty())
-    parseValidVectorKind(Kind, NumElements, ElementKind);
-
-  Operands.push_back(ARM64Operand::CreateVectorList(
-      FirstReg, Count, NumElements, ElementKind, S, getLoc(), getContext()));
-
-  // If there is an index specifier following the list, parse that too.
-  if (Parser.getTok().is(AsmToken::LBrac)) {
-    SMLoc SIdx = getLoc();
-    Parser.Lex(); // Eat left bracket token.
-
-    const MCExpr *ImmVal;
-    if (getParser().parseExpression(ImmVal))
-      return false;
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(ImmVal);
-    if (!MCE) {
-      TokError("immediate value expected for vector index");
-      return false;
-    }
-
-    SMLoc E = getLoc();
-    if (Parser.getTok().isNot(AsmToken::RBrac)) {
-      Error(E, "']' expected");
-      return false;
-    }
-
-    Parser.Lex(); // Eat right bracket token.
-
-    Operands.push_back(ARM64Operand::CreateVectorIndex(MCE->getValue(), SIdx, E,
-                                                       getContext()));
-  }
-  return false;
-}
-
-/// parseOperand - Parse a arm instruction operand.  For now this parses the
-/// operand regardless of the mnemonic.
-bool ARM64AsmParser::parseOperand(OperandVector &Operands, bool isCondCode,
-                                  bool invertCondCode) {
-  // Check if the current operand has a custom associated parser, if so, try to
-  // custom parse the operand, or fallback to the general approach.
-  OperandMatchResultTy ResTy = MatchOperandParserImpl(Operands, Mnemonic);
-  if (ResTy == MatchOperand_Success)
-    return false;
-  // If there wasn't a custom match, try the generic matcher below. Otherwise,
-  // there was a match, but an error occurred, in which case, just return that
-  // the operand parsing failed.
-  if (ResTy == MatchOperand_ParseFail)
-    return true;
-
-  // Nothing custom, so do general case parsing.
-  SMLoc S, E;
-  switch (getLexer().getKind()) {
-  default: {
-    SMLoc S = getLoc();
-    const MCExpr *Expr;
-    if (parseSymbolicImmVal(Expr))
-      return Error(S, "invalid operand");
-
-    SMLoc E = SMLoc::getFromPointer(getLoc().getPointer() - 1);
-    Operands.push_back(ARM64Operand::CreateImm(Expr, S, E, getContext()));
-    return false;
-  }
-  case AsmToken::LBrac:
-    return parseMemory(Operands);
-  case AsmToken::LCurly:
-    return parseVectorList(Operands);
-  case AsmToken::Identifier: {
-    // If we're expecting a Condition Code operand, then just parse that.
-    if (isCondCode)
-      return parseCondCode(Operands, invertCondCode);
-
-    // If it's a register name, parse it.
-    if (!parseRegister(Operands))
-      return false;
-
-    // This could be an optional "shift" operand.
-    if (!parseOptionalShift(Operands))
-      return false;
-
-    // Or maybe it could be an optional "extend" operand.
-    if (!parseOptionalExtend(Operands))
-      return false;
-
-    // This was not a register so parse other operands that start with an
-    // identifier (like labels) as expressions and create them as immediates.
-    const MCExpr *IdVal;
-    S = getLoc();
-    if (getParser().parseExpression(IdVal))
-      return true;
-
-    E = SMLoc::getFromPointer(getLoc().getPointer() - 1);
-    Operands.push_back(ARM64Operand::CreateImm(IdVal, S, E, getContext()));
-    return false;
-  }
-  case AsmToken::Hash: {
-    // #42 -> immediate.
-    S = getLoc();
-    Parser.Lex();
-
-    // The only Real that should come through here is a literal #0.0 for
-    // the fcmp[e] r, #0.0 instructions. They expect raw token operands,
-    // so convert the value.
-    const AsmToken &Tok = Parser.getTok();
-    if (Tok.is(AsmToken::Real)) {
-      APFloat RealVal(APFloat::IEEEdouble, Tok.getString());
-      uint64_t IntVal = RealVal.bitcastToAPInt().getZExtValue();
-      if (IntVal != 0 || (Mnemonic != "fcmp" && Mnemonic != "fcmpe"))
-        return TokError("unexpected floating point literal");
-      Parser.Lex(); // Eat the token.
-
-      Operands.push_back(
-          ARM64Operand::CreateToken("#0", false, S, getContext()));
-      Operands.push_back(
-          ARM64Operand::CreateToken(".0", false, S, getContext()));
-      return false;
-    }
-
-    const MCExpr *ImmVal;
-    if (parseSymbolicImmVal(ImmVal))
-      return true;
-
-    E = SMLoc::getFromPointer(getLoc().getPointer() - 1);
-    Operands.push_back(ARM64Operand::CreateImm(ImmVal, S, E, getContext()));
-    return false;
-  }
-  }
-}
-
-/// ParseInstruction - Parse an ARM64 instruction mnemonic followed by its
-/// operands.
-bool ARM64AsmParser::ParseInstruction(ParseInstructionInfo &Info,
-                                      StringRef Name, SMLoc NameLoc,
-                                      OperandVector &Operands) {
-  // Create the leading tokens for the mnemonic, split by '.' characters.
-  size_t Start = 0, Next = Name.find('.');
-  StringRef Head = Name.slice(Start, Next);
-
-  // IC, DC, AT, and TLBI instructions are aliases for the SYS instruction.
-  if (Head == "ic" || Head == "dc" || Head == "at" || Head == "tlbi")
-    return parseSysAlias(Head, NameLoc, Operands);
-
-  Operands.push_back(
-      ARM64Operand::CreateToken(Head, false, NameLoc, getContext()));
-  Mnemonic = Head;
-
-  // Handle condition codes for a branch mnemonic
-  if (Head == "b" && Next != StringRef::npos) {
-    Start = Next;
-    Next = Name.find('.', Start + 1);
-    Head = Name.slice(Start + 1, Next);
-
-    SMLoc SuffixLoc = SMLoc::getFromPointer(NameLoc.getPointer() +
-                                            (Head.data() - Name.data()));
-    unsigned CC = parseCondCodeString(Head);
-    if (CC == ~0U)
-      return Error(SuffixLoc, "invalid condition code");
-    const MCExpr *CCExpr = MCConstantExpr::Create(CC, getContext());
-    Operands.push_back(
-        ARM64Operand::CreateImm(CCExpr, NameLoc, NameLoc, getContext()));
-  }
-
-  // Add the remaining tokens in the mnemonic.
-  while (Next != StringRef::npos) {
-    Start = Next;
-    Next = Name.find('.', Start + 1);
-    Head = Name.slice(Start, Next);
-    SMLoc SuffixLoc = SMLoc::getFromPointer(NameLoc.getPointer() +
-                                            (Head.data() - Name.data()) + 1);
-    Operands.push_back(
-        ARM64Operand::CreateToken(Head, true, SuffixLoc, getContext()));
-  }
-
-  // Conditional compare instructions have a Condition Code operand, which needs
-  // to be parsed and an immediate operand created.
-  bool condCodeFourthOperand =
-      (Head == "ccmp" || Head == "ccmn" || Head == "fccmp" ||
-       Head == "fccmpe" || Head == "fcsel" || Head == "csel" ||
-       Head == "csinc" || Head == "csinv" || Head == "csneg");
-
-  // These instructions are aliases to some of the conditional select
-  // instructions. However, the condition code is inverted in the aliased
-  // instruction.
-  //
-  // FIXME: Is this the correct way to handle these? Or should the parser
-  //        generate the aliased instructions directly?
-  bool condCodeSecondOperand = (Head == "cset" || Head == "csetm");
-  bool condCodeThirdOperand =
-      (Head == "cinc" || Head == "cinv" || Head == "cneg");
-
-  // Read the remaining operands.
-  if (getLexer().isNot(AsmToken::EndOfStatement)) {
-    // Read the first operand.
-    if (parseOperand(Operands, false, false)) {
-      Parser.eatToEndOfStatement();
-      return true;
-    }
-
-    unsigned N = 2;
-    while (getLexer().is(AsmToken::Comma)) {
-      Parser.Lex(); // Eat the comma.
-
-      // Parse and remember the operand.
-      if (parseOperand(Operands, (N == 4 && condCodeFourthOperand) ||
-                                     (N == 3 && condCodeThirdOperand) ||
-                                     (N == 2 && condCodeSecondOperand),
-                       condCodeSecondOperand || condCodeThirdOperand)) {
-        Parser.eatToEndOfStatement();
-        return true;
-      }
-
-      ++N;
-    }
-  }
-
-  if (getLexer().isNot(AsmToken::EndOfStatement)) {
-    SMLoc Loc = Parser.getTok().getLoc();
-    Parser.eatToEndOfStatement();
-    return Error(Loc, "unexpected token in argument list");
-  }
-
-  Parser.Lex(); // Consume the EndOfStatement
-  return false;
-}
-
-/// isFPR32Register - Check if a register is in the FPR32 register class.
-/// (The parser does not have the target register info to check the register
-/// class directly.)
-static bool isFPR32Register(unsigned Reg) {
-  using namespace ARM64;
-  switch (Reg) {
-  default:
-    break;
-  case S0:  case S1:  case S2:  case S3:  case S4:  case S5:  case S6:
-  case S7:  case S8:  case S9:  case S10:  case S11:  case S12:  case S13:
-  case S14:  case S15:  case S16:  case S17:  case S18:  case S19:  case S20:
-  case S21:  case S22:  case S23:  case S24:  case S25:  case S26:  case S27:
-  case S28:  case S29:  case S30:  case S31:
-    return true;
-  }
-  return false;
-}
-
-/// isGPR32Register - Check if a register is in the GPR32sp register class.
-/// (The parser does not have the target register info to check the register
-/// class directly.)
-static bool isGPR32Register(unsigned Reg) {
-  using namespace ARM64;
-  switch (Reg) {
-  default:
-    break;
-  case W0:  case W1:  case W2:  case W3:  case W4:  case W5:  case W6:
-  case W7:  case W8:  case W9:  case W10:  case W11:  case W12:  case W13:
-  case W14:  case W15:  case W16:  case W17:  case W18:  case W19:  case W20:
-  case W21:  case W22:  case W23:  case W24:  case W25:  case W26:  case W27:
-  case W28:  case W29:  case W30:  case WSP:
-    return true;
-  }
-  return false;
-}
-
-static bool isGPR64Reg(unsigned Reg) {
-  using namespace ARM64;
-  switch (Reg) {
-  case X0:  case X1:  case X2:  case X3:  case X4:  case X5:  case X6:
-  case X7:  case X8:  case X9:  case X10:  case X11:  case X12:  case X13:
-  case X14:  case X15:  case X16:  case X17:  case X18:  case X19:  case X20:
-  case X21:  case X22:  case X23:  case X24:  case X25:  case X26:  case X27:
-  case X28:  case FP:  case LR:  case SP:  case XZR:
-    return true;
-  default:
-    return false;
-  }
-}
-
-
-// FIXME: This entire function is a giant hack to provide us with decent
-// operand range validation/diagnostics until TableGen/MC can be extended
-// to support autogeneration of this kind of validation.
-bool ARM64AsmParser::validateInstruction(MCInst &Inst,
-                                         SmallVectorImpl<SMLoc> &Loc) {
-  const MCRegisterInfo *RI = getContext().getRegisterInfo();
-  // Check for indexed addressing modes w/ the base register being the
-  // same as a destination/source register or pair load where
-  // the Rt == Rt2. All of those are undefined behaviour.
-  switch (Inst.getOpcode()) {
-  case ARM64::LDPSWpre:
-  case ARM64::LDPWpost:
-  case ARM64::LDPWpre:
-  case ARM64::LDPXpost:
-  case ARM64::LDPXpre: {
-    unsigned Rt = Inst.getOperand(0).getReg();
-    unsigned Rt2 = Inst.getOperand(1).getReg();
-    unsigned Rn = Inst.getOperand(2).getReg();
-    if (RI->isSubRegisterEq(Rn, Rt))
-      return Error(Loc[0], "unpredictable LDP instruction, writeback base "
-                           "is also a destination");
-    if (RI->isSubRegisterEq(Rn, Rt2))
-      return Error(Loc[1], "unpredictable LDP instruction, writeback base "
-                           "is also a destination");
-    // FALLTHROUGH
-  }
-  case ARM64::LDPDpost:
-  case ARM64::LDPDpre:
-  case ARM64::LDPQpost:
-  case ARM64::LDPQpre:
-  case ARM64::LDPSpost:
-  case ARM64::LDPSpre:
-  case ARM64::LDPSWpost:
-  case ARM64::LDPDi:
-  case ARM64::LDPQi:
-  case ARM64::LDPSi:
-  case ARM64::LDPSWi:
-  case ARM64::LDPWi:
-  case ARM64::LDPXi: {
-    unsigned Rt = Inst.getOperand(0).getReg();
-    unsigned Rt2 = Inst.getOperand(1).getReg();
-    if (Rt == Rt2)
-      return Error(Loc[1], "unpredictable LDP instruction, Rt2==Rt");
-    break;
-  }
-  case ARM64::STPDpost:
-  case ARM64::STPDpre:
-  case ARM64::STPQpost:
-  case ARM64::STPQpre:
-  case ARM64::STPSpost:
-  case ARM64::STPSpre:
-  case ARM64::STPWpost:
-  case ARM64::STPWpre:
-  case ARM64::STPXpost:
-  case ARM64::STPXpre: {
-    unsigned Rt = Inst.getOperand(0).getReg();
-    unsigned Rt2 = Inst.getOperand(1).getReg();
-    unsigned Rn = Inst.getOperand(2).getReg();
-    if (RI->isSubRegisterEq(Rn, Rt))
-      return Error(Loc[0], "unpredictable STP instruction, writeback base "
-                           "is also a source");
-    if (RI->isSubRegisterEq(Rn, Rt2))
-      return Error(Loc[1], "unpredictable STP instruction, writeback base "
-                           "is also a source");
-    break;
-  }
-  case ARM64::LDRBBpre:
-  case ARM64::LDRBpre:
-  case ARM64::LDRHHpre:
-  case ARM64::LDRHpre:
-  case ARM64::LDRSBWpre:
-  case ARM64::LDRSBXpre:
-  case ARM64::LDRSHWpre:
-  case ARM64::LDRSHXpre:
-  case ARM64::LDRSWpre:
-  case ARM64::LDRWpre:
-  case ARM64::LDRXpre:
-  case ARM64::LDRBBpost:
-  case ARM64::LDRBpost:
-  case ARM64::LDRHHpost:
-  case ARM64::LDRHpost:
-  case ARM64::LDRSBWpost:
-  case ARM64::LDRSBXpost:
-  case ARM64::LDRSHWpost:
-  case ARM64::LDRSHXpost:
-  case ARM64::LDRSWpost:
-  case ARM64::LDRWpost:
-  case ARM64::LDRXpost: {
-    unsigned Rt = Inst.getOperand(0).getReg();
-    unsigned Rn = Inst.getOperand(1).getReg();
-    if (RI->isSubRegisterEq(Rn, Rt))
-      return Error(Loc[0], "unpredictable LDR instruction, writeback base "
-                           "is also a source");
-    break;
-  }
-  case ARM64::STRBBpost:
-  case ARM64::STRBpost:
-  case ARM64::STRHHpost:
-  case ARM64::STRHpost:
-  case ARM64::STRWpost:
-  case ARM64::STRXpost:
-  case ARM64::STRBBpre:
-  case ARM64::STRBpre:
-  case ARM64::STRHHpre:
-  case ARM64::STRHpre:
-  case ARM64::STRWpre:
-  case ARM64::STRXpre: {
-    unsigned Rt = Inst.getOperand(0).getReg();
-    unsigned Rn = Inst.getOperand(1).getReg();
-    if (RI->isSubRegisterEq(Rn, Rt))
-      return Error(Loc[0], "unpredictable STR instruction, writeback base "
-                           "is also a source");
-    break;
-  }
-  }
-
-  // Now check immediate ranges. Separate from the above as there is overlap
-  // in the instructions being checked and this keeps the nested conditionals
-  // to a minimum.
-  switch (Inst.getOpcode()) {
-  case ARM64::ANDWrs:
-  case ARM64::ANDSWrs:
-  case ARM64::EORWrs:
-  case ARM64::ORRWrs: {
-    if (!Inst.getOperand(3).isImm())
-      return Error(Loc[3], "immediate value expected");
-    int64_t shifter = Inst.getOperand(3).getImm();
-    ARM64_AM::ShiftType ST = ARM64_AM::getShiftType(shifter);
-    if (ST == ARM64_AM::LSL && shifter > 31)
-      return Error(Loc[3], "shift value out of range");
-    return false;
-  }
-  case ARM64::ADDSWri:
-  case ARM64::ADDSXri:
-  case ARM64::ADDWri:
-  case ARM64::ADDXri:
-  case ARM64::SUBSWri:
-  case ARM64::SUBSXri:
-  case ARM64::SUBWri:
-  case ARM64::SUBXri: {
-    if (!Inst.getOperand(3).isImm())
-      return Error(Loc[3], "immediate value expected");
-    int64_t shifter = Inst.getOperand(3).getImm();
-    if (shifter != 0 && shifter != 12)
-      return Error(Loc[3], "shift value out of range");
-    // The imm12 operand can be an expression. Validate that it's legit.
-    // FIXME: We really, really want to allow arbitrary expressions here
-    // and resolve the value and validate the result at fixup time, but
-    // that's hard as we have long since lost any source information we
-    // need to generate good diagnostics by that point.
-    if (Inst.getOpcode() == ARM64::ADDXri && Inst.getOperand(2).isExpr()) {
-      const MCExpr *Expr = Inst.getOperand(2).getExpr();
-      ARM64MCExpr::VariantKind ELFRefKind;
-      MCSymbolRefExpr::VariantKind DarwinRefKind;
-      const MCConstantExpr *Addend;
-      if (!classifySymbolRef(Expr, ELFRefKind, DarwinRefKind, Addend)) {
-        return Error(Loc[2], "invalid immediate expression");
-      }
-
-      if (DarwinRefKind == MCSymbolRefExpr::VK_PAGEOFF ||
-          DarwinRefKind == MCSymbolRefExpr::VK_TLVPPAGEOFF ||
-          ELFRefKind == ARM64MCExpr::VK_LO12 ||
-          ELFRefKind == ARM64MCExpr::VK_DTPREL_LO12 ||
-          ELFRefKind == ARM64MCExpr::VK_DTPREL_LO12_NC ||
-          ELFRefKind == ARM64MCExpr::VK_TPREL_LO12 ||
-          ELFRefKind == ARM64MCExpr::VK_TPREL_LO12_NC ||
-          ELFRefKind == ARM64MCExpr::VK_TLSDESC_LO12) {
-        // Note that we don't range-check the addend. It's adjusted
-        // modulo page size when converted, so there is no "out of range"
-        // condition when using @pageoff. Any validity checking for the value
-        // was done in the is*() predicate function.
-        return false;
-      } else if (DarwinRefKind == MCSymbolRefExpr::VK_GOTPAGEOFF) {
-        // @gotpageoff can only be used directly, not with an addend.
-        return Addend != 0;
-      }
-
-      // Otherwise, we're not sure, so don't allow it for now.
-      return Error(Loc[2], "invalid immediate expression");
-    }
-
-    // If it's anything but an immediate, it's not legit.
-    if (!Inst.getOperand(2).isImm())
-      return Error(Loc[2], "invalid immediate expression");
-    int64_t imm = Inst.getOperand(2).getImm();
-    if (imm > 4095 || imm < 0)
-      return Error(Loc[2], "immediate value out of range");
-    return false;
-  }
-  case ARM64::LDRBpre:
-  case ARM64::LDRHpre:
-  case ARM64::LDRSBWpre:
-  case ARM64::LDRSBXpre:
-  case ARM64::LDRSHWpre:
-  case ARM64::LDRSHXpre:
-  case ARM64::LDRWpre:
-  case ARM64::LDRXpre:
-  case ARM64::LDRSpre:
-  case ARM64::LDRDpre:
-  case ARM64::LDRQpre:
-  case ARM64::STRBpre:
-  case ARM64::STRHpre:
-  case ARM64::STRWpre:
-  case ARM64::STRXpre:
-  case ARM64::STRSpre:
-  case ARM64::STRDpre:
-  case ARM64::STRQpre:
-  case ARM64::LDRBpost:
-  case ARM64::LDRHpost:
-  case ARM64::LDRSBWpost:
-  case ARM64::LDRSBXpost:
-  case ARM64::LDRSHWpost:
-  case ARM64::LDRSHXpost:
-  case ARM64::LDRWpost:
-  case ARM64::LDRXpost:
-  case ARM64::LDRSpost:
-  case ARM64::LDRDpost:
-  case ARM64::LDRQpost:
-  case ARM64::STRBpost:
-  case ARM64::STRHpost:
-  case ARM64::STRWpost:
-  case ARM64::STRXpost:
-  case ARM64::STRSpost:
-  case ARM64::STRDpost:
-  case ARM64::STRQpost:
-  case ARM64::LDTRXi:
-  case ARM64::LDTRWi:
-  case ARM64::LDTRHi:
-  case ARM64::LDTRBi:
-  case ARM64::LDTRSHWi:
-  case ARM64::LDTRSHXi:
-  case ARM64::LDTRSBWi:
-  case ARM64::LDTRSBXi:
-  case ARM64::LDTRSWi:
-  case ARM64::STTRWi:
-  case ARM64::STTRXi:
-  case ARM64::STTRHi:
-  case ARM64::STTRBi:
-  case ARM64::LDURWi:
-  case ARM64::LDURXi:
-  case ARM64::LDURSi:
-  case ARM64::LDURDi:
-  case ARM64::LDURQi:
-  case ARM64::LDURHi:
-  case ARM64::LDURBi:
-  case ARM64::LDURSHWi:
-  case ARM64::LDURSHXi:
-  case ARM64::LDURSBWi:
-  case ARM64::LDURSBXi:
-  case ARM64::LDURSWi:
-  case ARM64::PRFUMi:
-  case ARM64::STURWi:
-  case ARM64::STURXi:
-  case ARM64::STURSi:
-  case ARM64::STURDi:
-  case ARM64::STURQi:
-  case ARM64::STURHi:
-  case ARM64::STURBi: {
-    // FIXME: Should accept expressions and error in fixup evaluation
-    // if out of range.
-    if (!Inst.getOperand(2).isImm())
-      return Error(Loc[1], "immediate value expected");
-    int64_t offset = Inst.getOperand(2).getImm();
-    if (offset > 255 || offset < -256)
-      return Error(Loc[1], "offset value out of range");
-    return false;
-  }
-  case ARM64::LDRSro:
-  case ARM64::LDRWro:
-  case ARM64::LDRSWro:
-  case ARM64::STRWro:
-  case ARM64::STRSro: {
-    // FIXME: Should accept expressions and error in fixup evaluation
-    // if out of range.
-    if (!Inst.getOperand(3).isImm())
-      return Error(Loc[1], "immediate value expected");
-    int64_t shift = Inst.getOperand(3).getImm();
-    ARM64_AM::ExtendType type = ARM64_AM::getMemExtendType(shift);
-    if (type != ARM64_AM::UXTW && type != ARM64_AM::UXTX &&
-        type != ARM64_AM::SXTW && type != ARM64_AM::SXTX)
-      return Error(Loc[1], "shift type invalid");
-    return false;
-  }
-  case ARM64::LDRDro:
-  case ARM64::LDRQro:
-  case ARM64::LDRXro:
-  case ARM64::PRFMro:
-  case ARM64::STRXro:
-  case ARM64::STRDro:
-  case ARM64::STRQro: {
-    // FIXME: Should accept expressions and error in fixup evaluation
-    // if out of range.
-    if (!Inst.getOperand(3).isImm())
-      return Error(Loc[1], "immediate value expected");
-    int64_t shift = Inst.getOperand(3).getImm();
-    ARM64_AM::ExtendType type = ARM64_AM::getMemExtendType(shift);
-    if (type != ARM64_AM::UXTW && type != ARM64_AM::UXTX &&
-        type != ARM64_AM::SXTW && type != ARM64_AM::SXTX)
-      return Error(Loc[1], "shift type invalid");
-    return false;
-  }
-  case ARM64::LDRHro:
-  case ARM64::LDRHHro:
-  case ARM64::LDRSHWro:
-  case ARM64::LDRSHXro:
-  case ARM64::STRHro:
-  case ARM64::STRHHro: {
-    // FIXME: Should accept expressions and error in fixup evaluation
-    // if out of range.
-    if (!Inst.getOperand(3).isImm())
-      return Error(Loc[1], "immediate value expected");
-    int64_t shift = Inst.getOperand(3).getImm();
-    ARM64_AM::ExtendType type = ARM64_AM::getMemExtendType(shift);
-    if (type != ARM64_AM::UXTW && type != ARM64_AM::UXTX &&
-        type != ARM64_AM::SXTW && type != ARM64_AM::SXTX)
-      return Error(Loc[1], "shift type invalid");
-    return false;
-  }
-  case ARM64::LDRBro:
-  case ARM64::LDRBBro:
-  case ARM64::LDRSBWro:
-  case ARM64::LDRSBXro:
-  case ARM64::STRBro:
-  case ARM64::STRBBro: {
-    // FIXME: Should accept expressions and error in fixup evaluation
-    // if out of range.
-    if (!Inst.getOperand(3).isImm())
-      return Error(Loc[1], "immediate value expected");
-    int64_t shift = Inst.getOperand(3).getImm();
-    ARM64_AM::ExtendType type = ARM64_AM::getMemExtendType(shift);
-    if (type != ARM64_AM::UXTW && type != ARM64_AM::UXTX &&
-        type != ARM64_AM::SXTW && type != ARM64_AM::SXTX)
-      return Error(Loc[1], "shift type invalid");
-    return false;
-  }
-  case ARM64::LDPWi:
-  case ARM64::LDPXi:
-  case ARM64::LDPSi:
-  case ARM64::LDPDi:
-  case ARM64::LDPQi:
-  case ARM64::LDPSWi:
-  case ARM64::STPWi:
-  case ARM64::STPXi:
-  case ARM64::STPSi:
-  case ARM64::STPDi:
-  case ARM64::STPQi:
-  case ARM64::LDPWpre:
-  case ARM64::LDPXpre:
-  case ARM64::LDPSpre:
-  case ARM64::LDPDpre:
-  case ARM64::LDPQpre:
-  case ARM64::LDPSWpre:
-  case ARM64::STPWpre:
-  case ARM64::STPXpre:
-  case ARM64::STPSpre:
-  case ARM64::STPDpre:
-  case ARM64::STPQpre:
-  case ARM64::LDPWpost:
-  case ARM64::LDPXpost:
-  case ARM64::LDPSpost:
-  case ARM64::LDPDpost:
-  case ARM64::LDPQpost:
-  case ARM64::LDPSWpost:
-  case ARM64::STPWpost:
-  case ARM64::STPXpost:
-  case ARM64::STPSpost:
-  case ARM64::STPDpost:
-  case ARM64::STPQpost:
-  case ARM64::LDNPWi:
-  case ARM64::LDNPXi:
-  case ARM64::LDNPSi:
-  case ARM64::LDNPDi:
-  case ARM64::LDNPQi:
-  case ARM64::STNPWi:
-  case ARM64::STNPXi:
-  case ARM64::STNPSi:
-  case ARM64::STNPDi:
-  case ARM64::STNPQi: {
-    // FIXME: Should accept expressions and error in fixup evaluation
-    // if out of range.
-    if (!Inst.getOperand(3).isImm())
-      return Error(Loc[2], "immediate value expected");
-    int64_t offset = Inst.getOperand(3).getImm();
-    if (offset > 63 || offset < -64)
-      return Error(Loc[2], "offset value out of range");
-    return false;
-  }
-  default:
-    return false;
-  }
-}
-
-static void rewriteMOV(ARM64AsmParser::OperandVector &Operands,
-                       StringRef mnemonic, uint64_t imm, unsigned shift,
-                       MCContext &Context) {
-  ARM64Operand *Op = static_cast<ARM64Operand *>(Operands[0]);
-  ARM64Operand *Op2 = static_cast<ARM64Operand *>(Operands[2]);
-  Operands[0] =
-      ARM64Operand::CreateToken(mnemonic, false, Op->getStartLoc(), Context);
-
-  const MCExpr *NewImm = MCConstantExpr::Create(imm >> shift, Context);
-  Operands[2] = ARM64Operand::CreateImm(NewImm, Op2->getStartLoc(),
-                                        Op2->getEndLoc(), Context);
-
-  Operands.push_back(ARM64Operand::CreateShifter(
-      ARM64_AM::LSL, shift, Op2->getStartLoc(), Op2->getEndLoc(), Context));
-  delete Op2;
-  delete Op;
-}
-
-bool ARM64AsmParser::showMatchError(SMLoc Loc, unsigned ErrCode) {
-  switch (ErrCode) {
-  case Match_MissingFeature:
-    return Error(Loc,
-                 "instruction requires a CPU feature not currently enabled");
-  case Match_InvalidOperand:
-    return Error(Loc, "invalid operand for instruction");
-  case Match_InvalidSuffix:
-    return Error(Loc, "invalid type suffix for instruction");
-  case Match_InvalidMemoryIndexedSImm9:
-    return Error(Loc, "index must be an integer in range [-256,255].");
-  case Match_InvalidMemoryIndexed32SImm7:
-    return Error(Loc, "index must be a multiple of 4 in range [-256,252].");
-  case Match_InvalidMemoryIndexed64SImm7:
-    return Error(Loc, "index must be a multiple of 8 in range [-512,504].");
-  case Match_InvalidMemoryIndexed128SImm7:
-    return Error(Loc, "index must be a multiple of 16 in range [-1024,1008].");
-  case Match_InvalidMemoryIndexed8:
-    return Error(Loc, "index must be an integer in range [0,4095].");
-  case Match_InvalidMemoryIndexed16:
-    return Error(Loc, "index must be a multiple of 2 in range [0,8190].");
-  case Match_InvalidMemoryIndexed32:
-    return Error(Loc, "index must be a multiple of 4 in range [0,16380].");
-  case Match_InvalidMemoryIndexed64:
-    return Error(Loc, "index must be a multiple of 8 in range [0,32760].");
-  case Match_InvalidMemoryIndexed128:
-    return Error(Loc, "index must be a multiple of 16 in range [0,65520].");
-  case Match_InvalidImm1_8:
-    return Error(Loc, "immediate must be an integer in range [1,8].");
-  case Match_InvalidImm1_16:
-    return Error(Loc, "immediate must be an integer in range [1,16].");
-  case Match_InvalidImm1_32:
-    return Error(Loc, "immediate must be an integer in range [1,32].");
-  case Match_InvalidImm1_64:
-    return Error(Loc, "immediate must be an integer in range [1,64].");
-  case Match_MnemonicFail:
-    return Error(Loc, "unrecognized instruction mnemonic");
-  default:
-    assert(0 && "unexpected error code!");
-    return Error(Loc, "invalid instruction format");
-  }
-}
-
-bool ARM64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
-                                             OperandVector &Operands,
-                                             MCStreamer &Out,
-                                             unsigned &ErrorInfo,
-                                             bool MatchingInlineAsm) {
-  assert(!Operands.empty() && "Unexpect empty operand list!");
-  ARM64Operand *Op = static_cast<ARM64Operand *>(Operands[0]);
-  assert(Op->isToken() && "Leading operand should always be a mnemonic!");
-
-  StringRef Tok = Op->getToken();
-  // Translate CMN/CMP pseudos to ADDS/SUBS with zero register destination.
-  // This needs to be done before the special handling of ADD/SUB immediates.
-  if (Tok == "cmp" || Tok == "cmn") {
-    // Replace the opcode with either ADDS or SUBS.
-    const char *Repl = StringSwitch<const char *>(Tok)
-                           .Case("cmp", "subs")
-                           .Case("cmn", "adds")
-                           .Default(0);
-    assert(Repl && "Unknown compare instruction");
-    delete Operands[0];
-    Operands[0] = ARM64Operand::CreateToken(Repl, false, IDLoc, getContext());
-
-    // Insert WZR or XZR as destination operand.
-    ARM64Operand *RegOp = static_cast<ARM64Operand *>(Operands[1]);
-    unsigned ZeroReg;
-    if (RegOp->isReg() &&
-        (isGPR32Register(RegOp->getReg()) || RegOp->getReg() == ARM64::WZR))
-      ZeroReg = ARM64::WZR;
-    else
-      ZeroReg = ARM64::XZR;
-    Operands.insert(
-        Operands.begin() + 1,
-        ARM64Operand::CreateReg(ZeroReg, false, IDLoc, IDLoc, getContext()));
-    // Update since we modified it above.
-    ARM64Operand *Op = static_cast<ARM64Operand *>(Operands[0]);
-    Tok = Op->getToken();
-  }
-
-  unsigned NumOperands = Operands.size();
-
-  if (Tok == "mov" && NumOperands == 3) {
-    // The MOV mnemomic is aliased to movn/movz, depending on the value of
-    // the immediate being instantiated.
-    // FIXME: Catching this here is a total hack, and we should use tblgen
-    // support to implement this instead as soon as it is available.
-
-    ARM64Operand *Op2 = static_cast<ARM64Operand *>(Operands[2]);
-    if (Op2->isImm()) {
-      if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Op2->getImm())) {
-        uint64_t Val = CE->getValue();
-        uint64_t NVal = ~Val;
-
-        // If this is a 32-bit register and the value has none of the upper
-        // set, clear the complemented upper 32-bits so the logic below works
-        // for 32-bit registers too.
-        ARM64Operand *Op1 = static_cast<ARM64Operand *>(Operands[1]);
-        if (Op1->isReg() && isGPR32Register(Op1->getReg()) &&
-            (Val & 0xFFFFFFFFULL) == Val)
-          NVal &= 0x00000000FFFFFFFFULL;
-
-        // MOVK Rd, imm << 0
-        if ((Val & 0xFFFF) == Val)
-          rewriteMOV(Operands, "movz", Val, 0, getContext());
-
-        // MOVK Rd, imm << 16
-        else if ((Val & 0xFFFF0000ULL) == Val)
-          rewriteMOV(Operands, "movz", Val, 16, getContext());
-
-        // MOVK Rd, imm << 32
-        else if ((Val & 0xFFFF00000000ULL) == Val)
-          rewriteMOV(Operands, "movz", Val, 32, getContext());
-
-        // MOVK Rd, imm << 48
-        else if ((Val & 0xFFFF000000000000ULL) == Val)
-          rewriteMOV(Operands, "movz", Val, 48, getContext());
-
-        // MOVN Rd, (~imm << 0)
-        else if ((NVal & 0xFFFFULL) == NVal)
-          rewriteMOV(Operands, "movn", NVal, 0, getContext());
-
-        // MOVN Rd, ~(imm << 16)
-        else if ((NVal & 0xFFFF0000ULL) == NVal)
-          rewriteMOV(Operands, "movn", NVal, 16, getContext());
-
-        // MOVN Rd, ~(imm << 32)
-        else if ((NVal & 0xFFFF00000000ULL) == NVal)
-          rewriteMOV(Operands, "movn", NVal, 32, getContext());
-
-        // MOVN Rd, ~(imm << 48)
-        else if ((NVal & 0xFFFF000000000000ULL) == NVal)
-          rewriteMOV(Operands, "movn", NVal, 48, getContext());
-      }
-    }
-  } else if (NumOperands == 4) {
-    if (Tok == "add" || Tok == "adds" || Tok == "sub" || Tok == "subs") {
-      // Handle the uimm24 immediate form, where the shift is not specified.
-      ARM64Operand *Op3 = static_cast<ARM64Operand *>(Operands[3]);
-      if (Op3->isImm()) {
-        if (const MCConstantExpr *CE =
-                dyn_cast<MCConstantExpr>(Op3->getImm())) {
-          uint64_t Val = CE->getValue();
-          if (Val >= (1 << 24)) {
-            Error(IDLoc, "immediate value is too large");
-            return true;
-          }
-          if (Val < (1 << 12)) {
-            Operands.push_back(ARM64Operand::CreateShifter(
-                ARM64_AM::LSL, 0, IDLoc, IDLoc, getContext()));
-          } else if ((Val & 0xfff) == 0) {
-            delete Operands[3];
-            CE = MCConstantExpr::Create(Val >> 12, getContext());
-            Operands[3] =
-                ARM64Operand::CreateImm(CE, IDLoc, IDLoc, getContext());
-            Operands.push_back(ARM64Operand::CreateShifter(
-                ARM64_AM::LSL, 12, IDLoc, IDLoc, getContext()));
-          } else {
-            Error(IDLoc, "immediate value is too large");
-            return true;
-          }
-        } else {
-          Operands.push_back(ARM64Operand::CreateShifter(
-              ARM64_AM::LSL, 0, IDLoc, IDLoc, getContext()));
-        }
-      }
-
-      // FIXME: Horible hack to handle the LSL -> UBFM alias.
-    } else if (NumOperands == 4 && Tok == "lsl") {
-      ARM64Operand *Op2 = static_cast<ARM64Operand *>(Operands[2]);
-      ARM64Operand *Op3 = static_cast<ARM64Operand *>(Operands[3]);
-      if (Op2->isReg() && Op3->isImm()) {
-        const MCConstantExpr *Op3CE = dyn_cast<MCConstantExpr>(Op3->getImm());
-        if (Op3CE) {
-          uint64_t Op3Val = Op3CE->getValue();
-          uint64_t NewOp3Val = 0;
-          uint64_t NewOp4Val = 0;
-          if (isGPR32Register(Op2->getReg()) || Op2->getReg() == ARM64::WZR) {
-            NewOp3Val = (32 - Op3Val) & 0x1f;
-            NewOp4Val = 31 - Op3Val;
-          } else {
-            NewOp3Val = (64 - Op3Val) & 0x3f;
-            NewOp4Val = 63 - Op3Val;
-          }
-
-          const MCExpr *NewOp3 =
-              MCConstantExpr::Create(NewOp3Val, getContext());
-          const MCExpr *NewOp4 =
-              MCConstantExpr::Create(NewOp4Val, getContext());
-
-          Operands[0] = ARM64Operand::CreateToken(
-              "ubfm", false, Op->getStartLoc(), getContext());
-          Operands[3] = ARM64Operand::CreateImm(NewOp3, Op3->getStartLoc(),
-                                                Op3->getEndLoc(), getContext());
-          Operands.push_back(ARM64Operand::CreateImm(
-              NewOp4, Op3->getStartLoc(), Op3->getEndLoc(), getContext()));
-          delete Op3;
-          delete Op;
-        }
-      }
-
-      // FIXME: Horrible hack to handle the optional LSL shift for vector
-      //        instructions.
-    } else if (NumOperands == 4 && (Tok == "bic" || Tok == "orr")) {
-      ARM64Operand *Op1 = static_cast<ARM64Operand *>(Operands[1]);
-      ARM64Operand *Op2 = static_cast<ARM64Operand *>(Operands[2]);
-      ARM64Operand *Op3 = static_cast<ARM64Operand *>(Operands[3]);
-      if ((Op1->isToken() && Op2->isVectorReg() && Op3->isImm()) ||
-          (Op1->isVectorReg() && Op2->isToken() && Op3->isImm()))
-        Operands.push_back(ARM64Operand::CreateShifter(ARM64_AM::LSL, 0, IDLoc,
-                                                       IDLoc, getContext()));
-    } else if (NumOperands == 4 && (Tok == "movi" || Tok == "mvni")) {
-      ARM64Operand *Op1 = static_cast<ARM64Operand *>(Operands[1]);
-      ARM64Operand *Op2 = static_cast<ARM64Operand *>(Operands[2]);
-      ARM64Operand *Op3 = static_cast<ARM64Operand *>(Operands[3]);
-      if ((Op1->isToken() && Op2->isVectorReg() && Op3->isImm()) ||
-          (Op1->isVectorReg() && Op2->isToken() && Op3->isImm())) {
-        StringRef Suffix = Op1->isToken() ? Op1->getToken() : Op2->getToken();
-        // Canonicalize on lower-case for ease of comparison.
-        std::string CanonicalSuffix = Suffix.lower();
-        if (Tok != "movi" ||
-            (CanonicalSuffix != ".1d" && CanonicalSuffix != ".2d" &&
-             CanonicalSuffix != ".8b" && CanonicalSuffix != ".16b"))
-          Operands.push_back(ARM64Operand::CreateShifter(
-              ARM64_AM::LSL, 0, IDLoc, IDLoc, getContext()));
-      }
-    }
-  } else if (NumOperands == 5) {
-    // FIXME: Horrible hack to handle the BFI -> BFM, SBFIZ->SBFM, and
-    // UBFIZ -> UBFM aliases.
-    if (Tok == "bfi" || Tok == "sbfiz" || Tok == "ubfiz") {
-      ARM64Operand *Op1 = static_cast<ARM64Operand *>(Operands[1]);
-      ARM64Operand *Op3 = static_cast<ARM64Operand *>(Operands[3]);
-      ARM64Operand *Op4 = static_cast<ARM64Operand *>(Operands[4]);
-
-      if (Op1->isReg() && Op3->isImm() && Op4->isImm()) {
-        const MCConstantExpr *Op3CE = dyn_cast<MCConstantExpr>(Op3->getImm());
-        const MCConstantExpr *Op4CE = dyn_cast<MCConstantExpr>(Op4->getImm());
-
-        if (Op3CE && Op4CE) {
-          uint64_t Op3Val = Op3CE->getValue();
-          uint64_t Op4Val = Op4CE->getValue();
-
-          uint64_t NewOp3Val = 0;
-          if (isGPR32Register(Op1->getReg()))
-            NewOp3Val = (32 - Op3Val) & 0x1f;
-          else
-            NewOp3Val = (64 - Op3Val) & 0x3f;
-
-          uint64_t NewOp4Val = Op4Val - 1;
-
-          const MCExpr *NewOp3 =
-              MCConstantExpr::Create(NewOp3Val, getContext());
-          const MCExpr *NewOp4 =
-              MCConstantExpr::Create(NewOp4Val, getContext());
-          Operands[3] = ARM64Operand::CreateImm(NewOp3, Op3->getStartLoc(),
-                                                Op3->getEndLoc(), getContext());
-          Operands[4] = ARM64Operand::CreateImm(NewOp4, Op4->getStartLoc(),
-                                                Op4->getEndLoc(), getContext());
-          if (Tok == "bfi")
-            Operands[0] = ARM64Operand::CreateToken(
-                "bfm", false, Op->getStartLoc(), getContext());
-          else if (Tok == "sbfiz")
-            Operands[0] = ARM64Operand::CreateToken(
-                "sbfm", false, Op->getStartLoc(), getContext());
-          else if (Tok == "ubfiz")
-            Operands[0] = ARM64Operand::CreateToken(
-                "ubfm", false, Op->getStartLoc(), getContext());
-          else
-            llvm_unreachable("No valid mnemonic for alias?");
-
-          delete Op;
-          delete Op3;
-          delete Op4;
-        }
-      }
-
-      // FIXME: Horrible hack to handle the BFXIL->BFM, SBFX->SBFM, and
-      // UBFX -> UBFM aliases.
-    } else if (NumOperands == 5 &&
-               (Tok == "bfxil" || Tok == "sbfx" || Tok == "ubfx")) {
-      ARM64Operand *Op1 = static_cast<ARM64Operand *>(Operands[1]);
-      ARM64Operand *Op3 = static_cast<ARM64Operand *>(Operands[3]);
-      ARM64Operand *Op4 = static_cast<ARM64Operand *>(Operands[4]);
-
-      if (Op1->isReg() && Op3->isImm() && Op4->isImm()) {
-        const MCConstantExpr *Op3CE = dyn_cast<MCConstantExpr>(Op3->getImm());
-        const MCConstantExpr *Op4CE = dyn_cast<MCConstantExpr>(Op4->getImm());
-
-        if (Op3CE && Op4CE) {
-          uint64_t Op3Val = Op3CE->getValue();
-          uint64_t Op4Val = Op4CE->getValue();
-          uint64_t NewOp4Val = Op3Val + Op4Val - 1;
-
-          if (NewOp4Val >= Op3Val) {
-            const MCExpr *NewOp4 =
-                MCConstantExpr::Create(NewOp4Val, getContext());
-            Operands[4] = ARM64Operand::CreateImm(
-                NewOp4, Op4->getStartLoc(), Op4->getEndLoc(), getContext());
-            if (Tok == "bfxil")
-              Operands[0] = ARM64Operand::CreateToken(
-                  "bfm", false, Op->getStartLoc(), getContext());
-            else if (Tok == "sbfx")
-              Operands[0] = ARM64Operand::CreateToken(
-                  "sbfm", false, Op->getStartLoc(), getContext());
-            else if (Tok == "ubfx")
-              Operands[0] = ARM64Operand::CreateToken(
-                  "ubfm", false, Op->getStartLoc(), getContext());
-            else
-              llvm_unreachable("No valid mnemonic for alias?");
-
-            delete Op;
-            delete Op4;
-          }
-        }
-      }
-    }
-  }
-  // FIXME: Horrible hack for tbz and tbnz with Wn register operand.
-  //        InstAlias can't quite handle this since the reg classes aren't
-  //        subclasses.
-  if (NumOperands == 4 && (Tok == "tbz" || Tok == "tbnz")) {
-    ARM64Operand *Op = static_cast<ARM64Operand *>(Operands[2]);
-    if (Op->isImm()) {
-      if (const MCConstantExpr *OpCE = dyn_cast<MCConstantExpr>(Op->getImm())) {
-        if (OpCE->getValue() < 32) {
-          // The source register can be Wn here, but the matcher expects a
-          // GPR64. Twiddle it here if necessary.
-          ARM64Operand *Op = static_cast<ARM64Operand *>(Operands[1]);
-          if (Op->isReg()) {
-            unsigned Reg = getXRegFromWReg(Op->getReg());
-            Operands[1] = ARM64Operand::CreateReg(
-                Reg, false, Op->getStartLoc(), Op->getEndLoc(), getContext());
-            delete Op;
-          }
-        }
-      }
-    }
-  }
-  // FIXME: Horrible hack for sxtw and uxtw with Wn src and Xd dst operands.
-  //        InstAlias can't quite handle this since the reg classes aren't
-  //        subclasses.
-  if (NumOperands == 3 && (Tok == "sxtw" || Tok == "uxtw")) {
-    // The source register can be Wn here, but the matcher expects a
-    // GPR64. Twiddle it here if necessary.
-    ARM64Operand *Op = static_cast<ARM64Operand *>(Operands[2]);
-    if (Op->isReg()) {
-      unsigned Reg = getXRegFromWReg(Op->getReg());
-      Operands[2] = ARM64Operand::CreateReg(Reg, false, Op->getStartLoc(),
-                                            Op->getEndLoc(), getContext());
-      delete Op;
-    }
-  }
-  // FIXME: Likewise for [su]xt[bh] with a Xd dst operand
-  else if (NumOperands == 3 &&
-           (Tok == "sxtb" || Tok == "uxtb" || Tok == "sxth" || Tok == "uxth")) {
-    ARM64Operand *Op = static_cast<ARM64Operand *>(Operands[1]);
-    if (Op->isReg() && isGPR64Reg(Op->getReg())) {
-      // The source register can be Wn here, but the matcher expects a
-      // GPR64. Twiddle it here if necessary.
-      ARM64Operand *Op = static_cast<ARM64Operand *>(Operands[2]);
-      if (Op->isReg()) {
-        unsigned Reg = getXRegFromWReg(Op->getReg());
-        Operands[2] = ARM64Operand::CreateReg(Reg, false, Op->getStartLoc(),
-                                              Op->getEndLoc(), getContext());
-        delete Op;
-      }
-    }
-  }
-
-  // Yet another horrible hack to handle FMOV Rd, #0.0 using [WX]ZR.
-  if (NumOperands == 3 && Tok == "fmov") {
-    ARM64Operand *RegOp = static_cast<ARM64Operand *>(Operands[1]);
-    ARM64Operand *ImmOp = static_cast<ARM64Operand *>(Operands[2]);
-    if (RegOp->isReg() && ImmOp->isFPImm() &&
-        ImmOp->getFPImm() == (unsigned)-1) {
-      unsigned zreg =
-          isFPR32Register(RegOp->getReg()) ? ARM64::WZR : ARM64::XZR;
-      Operands[2] = ARM64Operand::CreateReg(zreg, false, Op->getStartLoc(),
-                                            Op->getEndLoc(), getContext());
-      delete ImmOp;
-    }
-  }
-
-  // FIXME: Horrible hack to handle the literal .d[1] vector index on
-  // FMOV instructions. The index isn't an actual instruction operand
-  // but rather syntactic sugar. It really should be part of the mnemonic,
-  // not the operand, but whatever.
-  if ((NumOperands == 5) && Tok == "fmov") {
-    // If the last operand is a vectorindex of '1', then replace it with
-    // a '[' '1' ']' token sequence, which is what the matcher
-    // (annoyingly) expects for a literal vector index operand.
-    ARM64Operand *Op = static_cast<ARM64Operand *>(Operands[NumOperands - 1]);
-    if (Op->isVectorIndexD() && Op->getVectorIndex() == 1) {
-      SMLoc Loc = Op->getStartLoc();
-      Operands.pop_back();
-      Operands.push_back(
-          ARM64Operand::CreateToken("[", false, Loc, getContext()));
-      Operands.push_back(
-          ARM64Operand::CreateToken("1", false, Loc, getContext()));
-      Operands.push_back(
-          ARM64Operand::CreateToken("]", false, Loc, getContext()));
-    } else if (Op->isReg()) {
-      // Similarly, check the destination operand for the GPR->High-lane
-      // variant.
-      unsigned OpNo = NumOperands - 2;
-      ARM64Operand *Op = static_cast<ARM64Operand *>(Operands[OpNo]);
-      if (Op->isVectorIndexD() && Op->getVectorIndex() == 1) {
-        SMLoc Loc = Op->getStartLoc();
-        Operands[OpNo] =
-            ARM64Operand::CreateToken("[", false, Loc, getContext());
-        Operands.insert(
-            Operands.begin() + OpNo + 1,
-            ARM64Operand::CreateToken("1", false, Loc, getContext()));
-        Operands.insert(
-            Operands.begin() + OpNo + 2,
-            ARM64Operand::CreateToken("]", false, Loc, getContext()));
-      }
-    }
-  }
-
-  MCInst Inst;
-  // First try to match against the secondary set of tables containing the
-  // short-form NEON instructions (e.g. "fadd.2s v0, v1, v2").
-  unsigned MatchResult =
-      MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm, 1);
-
-  // If that fails, try against the alternate table containing long-form NEON:
-  // "fadd v0.2s, v1.2s, v2.2s"
-  if (MatchResult != Match_Success)
-    MatchResult =
-        MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm, 0);
-
-  switch (MatchResult) {
-  case Match_Success: {
-    // Perform range checking and other semantic validations
-    SmallVector<SMLoc, 8> OperandLocs;
-    NumOperands = Operands.size();
-    for (unsigned i = 1; i < NumOperands; ++i)
-      OperandLocs.push_back(Operands[i]->getStartLoc());
-    if (validateInstruction(Inst, OperandLocs))
-      return true;
-
-    Inst.setLoc(IDLoc);
-    Out.EmitInstruction(Inst, STI);
-    return false;
-  }
-  case Match_MissingFeature:
-  case Match_MnemonicFail:
-    return showMatchError(IDLoc, MatchResult);
-  case Match_InvalidOperand: {
-    SMLoc ErrorLoc = IDLoc;
-    if (ErrorInfo != ~0U) {
-      if (ErrorInfo >= Operands.size())
-        return Error(IDLoc, "too few operands for instruction");
-
-      ErrorLoc = ((ARM64Operand *)Operands[ErrorInfo])->getStartLoc();
-      if (ErrorLoc == SMLoc())
-        ErrorLoc = IDLoc;
-    }
-    // If the match failed on a suffix token operand, tweak the diagnostic
-    // accordingly.
-    if (((ARM64Operand *)Operands[ErrorInfo])->isToken() &&
-        ((ARM64Operand *)Operands[ErrorInfo])->isTokenSuffix())
-      MatchResult = Match_InvalidSuffix;
-
-    return showMatchError(ErrorLoc, MatchResult);
-  }
-  case Match_InvalidMemoryIndexedSImm9: {
-    // If there is not a '!' after the memory operand that failed, we really
-    // want the diagnostic for the non-pre-indexed instruction variant instead.
-    // Be careful to check for the post-indexed variant as well, which also
-    // uses this match diagnostic. Also exclude the explicitly unscaled
-    // mnemonics, as they want the unscaled diagnostic as well.
-    if (Operands.size() == ErrorInfo + 1 &&
-        !((ARM64Operand *)Operands[ErrorInfo])->isImm() &&
-        !Tok.startswith("stur") && !Tok.startswith("ldur")) {
-      // whether we want an Indexed64 or Indexed32 diagnostic depends on
-      // the register class of the previous operand. Default to 64 in case
-      // we see something unexpected.
-      MatchResult = Match_InvalidMemoryIndexed64;
-      if (ErrorInfo) {
-        ARM64Operand *PrevOp = (ARM64Operand *)Operands[ErrorInfo - 1];
-        if (PrevOp->isReg() && ARM64MCRegisterClasses[ARM64::GPR32RegClassID]
-                                   .contains(PrevOp->getReg()))
-          MatchResult = Match_InvalidMemoryIndexed32;
-      }
-    }
-    SMLoc ErrorLoc = ((ARM64Operand *)Operands[ErrorInfo])->getStartLoc();
-    if (ErrorLoc == SMLoc())
-      ErrorLoc = IDLoc;
-    return showMatchError(ErrorLoc, MatchResult);
-  }
-  case Match_InvalidMemoryIndexed32:
-  case Match_InvalidMemoryIndexed64:
-  case Match_InvalidMemoryIndexed128:
-    // If there is a '!' after the memory operand that failed, we really
-    // want the diagnostic for the pre-indexed instruction variant instead.
-    if (Operands.size() > ErrorInfo + 1 &&
-        ((ARM64Operand *)Operands[ErrorInfo + 1])->isTokenEqual("!"))
-      MatchResult = Match_InvalidMemoryIndexedSImm9;
-  // FALL THROUGH
-  case Match_InvalidMemoryIndexed8:
-  case Match_InvalidMemoryIndexed16:
-  case Match_InvalidMemoryIndexed32SImm7:
-  case Match_InvalidMemoryIndexed64SImm7:
-  case Match_InvalidMemoryIndexed128SImm7:
-  case Match_InvalidImm1_8:
-  case Match_InvalidImm1_16:
-  case Match_InvalidImm1_32:
-  case Match_InvalidImm1_64: {
-    // Any time we get here, there's nothing fancy to do. Just get the
-    // operand SMLoc and display the diagnostic.
-    SMLoc ErrorLoc = ((ARM64Operand *)Operands[ErrorInfo])->getStartLoc();
-    // If it's a memory operand, the error is with the offset immediate,
-    // so get that location instead.
-    if (((ARM64Operand *)Operands[ErrorInfo])->isMem())
-      ErrorLoc = ((ARM64Operand *)Operands[ErrorInfo])->getOffsetLoc();
-    if (ErrorLoc == SMLoc())
-      ErrorLoc = IDLoc;
-    return showMatchError(ErrorLoc, MatchResult);
-  }
-  }
-
-  llvm_unreachable("Implement any new match types added!");
-  return true;
-}
-
-/// ParseDirective parses the arm specific directives
-bool ARM64AsmParser::ParseDirective(AsmToken DirectiveID) {
-  StringRef IDVal = DirectiveID.getIdentifier();
-  SMLoc Loc = DirectiveID.getLoc();
-  if (IDVal == ".hword")
-    return parseDirectiveWord(2, Loc);
-  if (IDVal == ".word")
-    return parseDirectiveWord(4, Loc);
-  if (IDVal == ".xword")
-    return parseDirectiveWord(8, Loc);
-  if (IDVal == ".tlsdesccall")
-    return parseDirectiveTLSDescCall(Loc);
-
-  return parseDirectiveLOH(IDVal, Loc);
-}
-
-/// parseDirectiveWord
-///  ::= .word [ expression (, expression)* ]
-bool ARM64AsmParser::parseDirectiveWord(unsigned Size, SMLoc L) {
-  if (getLexer().isNot(AsmToken::EndOfStatement)) {
-    for (;;) {
-      const MCExpr *Value;
-      if (getParser().parseExpression(Value))
-        return true;
-
-      getParser().getStreamer().EmitValue(Value, Size);
-
-      if (getLexer().is(AsmToken::EndOfStatement))
-        break;
-
-      // FIXME: Improve diagnostic.
-      if (getLexer().isNot(AsmToken::Comma))
-        return Error(L, "unexpected token in directive");
-      Parser.Lex();
-    }
-  }
-
-  Parser.Lex();
-  return false;
-}
-
-// parseDirectiveTLSDescCall:
-//   ::= .tlsdesccall symbol
-bool ARM64AsmParser::parseDirectiveTLSDescCall(SMLoc L) {
-  StringRef Name;
-  if (getParser().parseIdentifier(Name))
-    return Error(L, "expected symbol after directive");
-
-  MCSymbol *Sym = getContext().GetOrCreateSymbol(Name);
-  const MCExpr *Expr = MCSymbolRefExpr::Create(Sym, getContext());
-  Expr = ARM64MCExpr::Create(Expr, ARM64MCExpr::VK_TLSDESC, getContext());
-
-  MCInst Inst;
-  Inst.setOpcode(ARM64::TLSDESCCALL);
-  Inst.addOperand(MCOperand::CreateExpr(Expr));
-
-  getParser().getStreamer().EmitInstruction(Inst, STI);
-  return false;
-}
-
-/// ::= .loh <lohName | lohId> label1, ..., labelN
-/// The number of arguments depends on the loh identifier.
-bool ARM64AsmParser::parseDirectiveLOH(StringRef IDVal, SMLoc Loc) {
-  if (IDVal != MCLOHDirectiveName())
-    return true;
-  MCLOHType Kind;
-  if (getParser().getTok().isNot(AsmToken::Identifier)) {
-    if (getParser().getTok().isNot(AsmToken::Integer))
-      return TokError("expected an identifier or a number in directive");
-    // We successfully get a numeric value for the identifier.
-    // Check if it is valid.
-    int64_t Id = getParser().getTok().getIntVal();
-    Kind = (MCLOHType)Id;
-    // Check that Id does not overflow MCLOHType.
-    if (!isValidMCLOHType(Kind) || Id != Kind)
-      return TokError("invalid numeric identifier in directive");
-  } else {
-    StringRef Name = getTok().getIdentifier();
-    // We successfully parse an identifier.
-    // Check if it is a recognized one.
-    int Id = MCLOHNameToId(Name);
-
-    if (Id == -1)
-      return TokError("invalid identifier in directive");
-    Kind = (MCLOHType)Id;
-  }
-  // Consume the identifier.
-  Lex();
-  // Get the number of arguments of this LOH.
-  int NbArgs = MCLOHIdToNbArgs(Kind);
-
-  assert(NbArgs != -1 && "Invalid number of arguments");
-
-  SmallVector<MCSymbol *, 3> Args;
-  for (int Idx = 0; Idx < NbArgs; ++Idx) {
-    StringRef Name;
-    if (getParser().parseIdentifier(Name))
-      return TokError("expected identifier in directive");
-    Args.push_back(getContext().GetOrCreateSymbol(Name));
-
-    if (Idx + 1 == NbArgs)
-      break;
-    if (getLexer().isNot(AsmToken::Comma))
-      return TokError("unexpected token in '" + Twine(IDVal) + "' directive");
-    Lex();
-  }
-  if (getLexer().isNot(AsmToken::EndOfStatement))
-    return TokError("unexpected token in '" + Twine(IDVal) + "' directive");
-
-  getStreamer().EmitLOHDirective((MCLOHType)Kind, Args);
-  return false;
-}
-
-bool
-ARM64AsmParser::classifySymbolRef(const MCExpr *Expr,
-                                  ARM64MCExpr::VariantKind &ELFRefKind,
-                                  MCSymbolRefExpr::VariantKind &DarwinRefKind,
-                                  const MCConstantExpr *&Addend) {
-  ELFRefKind = ARM64MCExpr::VK_INVALID;
-  DarwinRefKind = MCSymbolRefExpr::VK_None;
-
-  if (const ARM64MCExpr *AE = dyn_cast<ARM64MCExpr>(Expr)) {
-    ELFRefKind = AE->getKind();
-    Expr = AE->getSubExpr();
-  }
-
-  const MCSymbolRefExpr *SE = dyn_cast<MCSymbolRefExpr>(Expr);
-  if (SE) {
-    // It's a simple symbol reference with no addend.
-    DarwinRefKind = SE->getKind();
-    Addend = 0;
-    return true;
-  }
-
-  const MCBinaryExpr *BE = dyn_cast<MCBinaryExpr>(Expr);
-  if (!BE)
-    return false;
-
-  SE = dyn_cast<MCSymbolRefExpr>(BE->getLHS());
-  if (!SE)
-    return false;
-  DarwinRefKind = SE->getKind();
-
-  if (BE->getOpcode() != MCBinaryExpr::Add)
-    return false;
-
-  // See if the addend is is a constant, otherwise there's more going
-  // on here than we can deal with.
-  Addend = dyn_cast<MCConstantExpr>(BE->getRHS());
-  if (!Addend)
-    return false;
-
-  // It's some symbol reference + a constant addend, but really
-  // shouldn't use both Darwin and ELF syntax.
-  return ELFRefKind == ARM64MCExpr::VK_INVALID ||
-         DarwinRefKind == MCSymbolRefExpr::VK_None;
-}
-
-/// Force static initialization.
-extern "C" void LLVMInitializeARM64AsmParser() {
-  RegisterMCAsmParser<ARM64AsmParser> X(TheARM64Target);
-}
-
-#define GET_REGISTER_MATCHER
-#define GET_MATCHER_IMPLEMENTATION
-#include "ARM64GenAsmMatcher.inc"
-
-// Define this matcher function after the auto-generated include so we
-// have the match class enum definitions.
-unsigned ARM64AsmParser::validateTargetOperandClass(MCParsedAsmOperand *AsmOp,
-                                                    unsigned Kind) {
-  ARM64Operand *Op = static_cast<ARM64Operand *>(AsmOp);
-  // If the kind is a token for a literal immediate, check if our asm
-  // operand matches. This is for InstAliases which have a fixed-value
-  // immediate in the syntax.
-  int64_t ExpectedVal;
-  switch (Kind) {
-  default:
-    return Match_InvalidOperand;
-  case MCK__35_0:
-    ExpectedVal = 0;
-    break;
-  case MCK__35_1:
-    ExpectedVal = 1;
-    break;
-  case MCK__35_12:
-    ExpectedVal = 12;
-    break;
-  case MCK__35_16:
-    ExpectedVal = 16;
-    break;
-  case MCK__35_2:
-    ExpectedVal = 2;
-    break;
-  case MCK__35_24:
-    ExpectedVal = 24;
-    break;
-  case MCK__35_3:
-    ExpectedVal = 3;
-    break;
-  case MCK__35_32:
-    ExpectedVal = 32;
-    break;
-  case MCK__35_4:
-    ExpectedVal = 4;
-    break;
-  case MCK__35_48:
-    ExpectedVal = 48;
-    break;
-  case MCK__35_6:
-    ExpectedVal = 6;
-    break;
-  case MCK__35_64:
-    ExpectedVal = 64;
-    break;
-  case MCK__35_8:
-    ExpectedVal = 8;
-    break;
-  }
-  if (!Op->isImm())
-    return Match_InvalidOperand;
-  const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Op->getImm());
-  if (!CE)
-    return Match_InvalidOperand;
-  if (CE->getValue() == ExpectedVal)
-    return Match_Success;
-  return Match_InvalidOperand;
-}
diff --git a/lib/Target/ARM64/AsmParser/CMakeLists.txt b/lib/Target/ARM64/AsmParser/CMakeLists.txt
deleted file mode 100644
index 826158b..0000000
--- a/lib/Target/ARM64/AsmParser/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
-
-add_llvm_library(LLVMARM64AsmParser
-  ARM64AsmParser.cpp
-  )
-
diff --git a/lib/Target/ARM64/AsmParser/LLVMBuild.txt b/lib/Target/ARM64/AsmParser/LLVMBuild.txt
deleted file mode 100644
index 2c8fafe..0000000
--- a/lib/Target/ARM64/AsmParser/LLVMBuild.txt
+++ /dev/null
@@ -1,24 +0,0 @@
-;===- ./lib/Target/ARM64/AsmParser/LLVMBuild.txt ---------------*- Conf -*--===;
-;
-;                     The LLVM Compiler Infrastructure
-;
-; This file is distributed under the University of Illinois Open Source
-; License. See LICENSE.TXT for details.
-;
-;===------------------------------------------------------------------------===;
-;
-; This is an LLVMBuild description file for the components in this subdirectory.
-;
-; For more information on the LLVMBuild system, please see:
-;
-;   http://llvm.org/docs/LLVMBuild.html
-;
-;===------------------------------------------------------------------------===;
-
-[component_0]
-type = Library
-name = ARM64AsmParser
-parent = ARM64
-required_libraries = ARM64Desc ARM64Info MC MCParser Support
-add_to_library_groups = ARM64
-
diff --git a/lib/Target/ARM64/AsmParser/Makefile b/lib/Target/ARM64/AsmParser/Makefile
deleted file mode 100644
index d25c47f..0000000
--- a/lib/Target/ARM64/AsmParser/Makefile
+++ /dev/null
@@ -1,15 +0,0 @@
-##===- lib/Target/ARM/AsmParser/Makefile -------------------*- Makefile -*-===##
-#
-#                     The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-LEVEL = ../../../..
-LIBRARYNAME = LLVMARM64AsmParser
-
-# Hack: we need to include 'main' ARM target directory to grab private headers
-CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/ARM64/CMakeLists.txt b/lib/Target/ARM64/CMakeLists.txt
deleted file mode 100644
index 6de861c..0000000
--- a/lib/Target/ARM64/CMakeLists.txt
+++ /dev/null
@@ -1,50 +0,0 @@
-set(LLVM_TARGET_DEFINITIONS ARM64.td)
-
-tablegen(LLVM ARM64GenRegisterInfo.inc -gen-register-info)
-tablegen(LLVM ARM64GenInstrInfo.inc -gen-instr-info)
-tablegen(LLVM ARM64GenMCCodeEmitter.inc -gen-emitter -mc-emitter)
-tablegen(LLVM ARM64GenMCPseudoLowering.inc -gen-pseudo-lowering)
-tablegen(LLVM ARM64GenAsmWriter.inc -gen-asm-writer)
-tablegen(LLVM ARM64GenAsmWriter1.inc -gen-asm-writer -asmwriternum=1)
-tablegen(LLVM ARM64GenAsmMatcher.inc -gen-asm-matcher)
-tablegen(LLVM ARM64GenDAGISel.inc -gen-dag-isel)
-tablegen(LLVM ARM64GenFastISel.inc -gen-fast-isel)
-tablegen(LLVM ARM64GenCallingConv.inc -gen-callingconv)
-tablegen(LLVM ARM64GenSubtargetInfo.inc -gen-subtarget)
-tablegen(LLVM ARM64GenDisassemblerTables.inc -gen-disassembler)
-add_public_tablegen_target(ARM64CommonTableGen)
-
-add_llvm_target(ARM64CodeGen
-  ARM64AddressTypePromotion.cpp
-  ARM64AdvSIMDScalarPass.cpp
-  ARM64AsmPrinter.cpp
-  ARM64BranchRelaxation.cpp
-  ARM64CleanupLocalDynamicTLSPass.cpp
-  ARM64CollectLOH.cpp
-  ARM64ConditionalCompares.cpp
-  ARM64DeadRegisterDefinitionsPass.cpp
-  ARM64ExpandPseudoInsts.cpp
-  ARM64FastISel.cpp
-  ARM64FrameLowering.cpp
-  ARM64ISelDAGToDAG.cpp
-  ARM64ISelLowering.cpp
-  ARM64InstrInfo.cpp
-  ARM64LoadStoreOptimizer.cpp
-  ARM64MCInstLower.cpp
-  ARM64PromoteConstant.cpp
-  ARM64RegisterInfo.cpp
-  ARM64SelectionDAGInfo.cpp
-  ARM64StorePairSuppress.cpp
-  ARM64Subtarget.cpp
-  ARM64TargetMachine.cpp
-  ARM64TargetObjectFile.cpp
-  ARM64TargetTransformInfo.cpp
-)
-
-add_dependencies(LLVMARM64CodeGen intrinsics_gen)
-
-add_subdirectory(TargetInfo)
-add_subdirectory(AsmParser)
-add_subdirectory(Disassembler)
-add_subdirectory(InstPrinter)
-add_subdirectory(MCTargetDesc)
diff --git a/lib/Target/ARM64/Disassembler/ARM64Disassembler.cpp b/lib/Target/ARM64/Disassembler/ARM64Disassembler.cpp
deleted file mode 100644
index 44c501f..0000000
--- a/lib/Target/ARM64/Disassembler/ARM64Disassembler.cpp
+++ /dev/null
@@ -1,2142 +0,0 @@
-//===- ARM64Disassembler.cpp - Disassembler for ARM64 -----------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-//
-//===----------------------------------------------------------------------===//
-
-#define DEBUG_TYPE "arm64-disassembler"
-
-#include "ARM64Disassembler.h"
-#include "ARM64Subtarget.h"
-#include "MCTargetDesc/ARM64BaseInfo.h"
-#include "MCTargetDesc/ARM64AddressingModes.h"
-#include "llvm/MC/MCInst.h"
-#include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCFixedLenDisassembler.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/MemoryObject.h"
-#include "llvm/Support/TargetRegistry.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/Format.h"
-#include "llvm/Support/raw_ostream.h"
-
-// Pull DecodeStatus and its enum values into the global namespace.
-typedef llvm::MCDisassembler::DecodeStatus DecodeStatus;
-
-// Forward declare these because the autogenerated code will reference them.
-// Definitions are further down.
-static DecodeStatus DecodeFPR128RegisterClass(llvm::MCInst &Inst,
-                                              unsigned RegNo, uint64_t Address,
-                                              const void *Decoder);
-static DecodeStatus DecodeFPR128_loRegisterClass(llvm::MCInst &Inst,
-                                                 unsigned RegNo,
-                                                 uint64_t Address,
-                                                 const void *Decoder);
-static DecodeStatus DecodeFPR64RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
-                                             uint64_t Address,
-                                             const void *Decoder);
-static DecodeStatus DecodeFPR32RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
-                                             uint64_t Address,
-                                             const void *Decoder);
-static DecodeStatus DecodeFPR16RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
-                                             uint64_t Address,
-                                             const void *Decoder);
-static DecodeStatus DecodeFPR8RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
-                                            uint64_t Address,
-                                            const void *Decoder);
-static DecodeStatus DecodeGPR64RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
-                                             uint64_t Address,
-                                             const void *Decoder);
-static DecodeStatus DecodeGPR64spRegisterClass(llvm::MCInst &Inst,
-                                               unsigned RegNo, uint64_t Address,
-                                               const void *Decoder);
-static DecodeStatus DecodeGPR32RegisterClass(llvm::MCInst &Inst, unsigned RegNo,
-                                             uint64_t Address,
-                                             const void *Decoder);
-static DecodeStatus DecodeGPR32spRegisterClass(llvm::MCInst &Inst,
-                                               unsigned RegNo, uint64_t Address,
-                                               const void *Decoder);
-static DecodeStatus DecodeQQRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
-                                          uint64_t Address,
-                                          const void *Decoder);
-static DecodeStatus DecodeQQQRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
-                                           uint64_t Address,
-                                           const void *Decoder);
-static DecodeStatus DecodeQQQQRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
-                                            uint64_t Address,
-                                            const void *Decoder);
-static DecodeStatus DecodeDDRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
-                                          uint64_t Address,
-                                          const void *Decoder);
-static DecodeStatus DecodeDDDRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
-                                           uint64_t Address,
-                                           const void *Decoder);
-static DecodeStatus DecodeDDDDRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
-                                            uint64_t Address,
-                                            const void *Decoder);
-
-static DecodeStatus DecodeFixedPointScaleImm(llvm::MCInst &Inst, unsigned Imm,
-                                             uint64_t Address,
-                                             const void *Decoder);
-static DecodeStatus DecodeCondBranchTarget(llvm::MCInst &Inst, unsigned Imm,
-                                           uint64_t Address,
-                                           const void *Decoder);
-static DecodeStatus DecodeSystemRegister(llvm::MCInst &Inst, unsigned Imm,
-                                         uint64_t Address, const void *Decoder);
-static DecodeStatus DecodeThreeAddrSRegInstruction(llvm::MCInst &Inst,
-                                                   uint32_t insn,
-                                                   uint64_t Address,
-                                                   const void *Decoder);
-static DecodeStatus DecodeMoveImmInstruction(llvm::MCInst &Inst, uint32_t insn,
-                                             uint64_t Address,
-                                             const void *Decoder);
-static DecodeStatus DecodeUnsignedLdStInstruction(llvm::MCInst &Inst,
-                                                  uint32_t insn,
-                                                  uint64_t Address,
-                                                  const void *Decoder);
-static DecodeStatus DecodeSignedLdStInstruction(llvm::MCInst &Inst,
-                                                uint32_t insn, uint64_t Address,
-                                                const void *Decoder);
-static DecodeStatus DecodeExclusiveLdStInstruction(llvm::MCInst &Inst,
-                                                   uint32_t insn,
-                                                   uint64_t Address,
-                                                   const void *Decoder);
-static DecodeStatus DecodePairLdStInstruction(llvm::MCInst &Inst, uint32_t insn,
-                                              uint64_t Address,
-                                              const void *Decoder);
-static DecodeStatus DecodeRegOffsetLdStInstruction(llvm::MCInst &Inst,
-                                                   uint32_t insn,
-                                                   uint64_t Address,
-                                                   const void *Decoder);
-static DecodeStatus DecodeAddSubERegInstruction(llvm::MCInst &Inst,
-                                                uint32_t insn, uint64_t Address,
-                                                const void *Decoder);
-static DecodeStatus DecodeLogicalImmInstruction(llvm::MCInst &Inst,
-                                                uint32_t insn, uint64_t Address,
-                                                const void *Decoder);
-static DecodeStatus DecodeModImmInstruction(llvm::MCInst &Inst, uint32_t insn,
-                                            uint64_t Address,
-                                            const void *Decoder);
-static DecodeStatus DecodeModImmTiedInstruction(llvm::MCInst &Inst,
-                                                uint32_t insn, uint64_t Address,
-                                                const void *Decoder);
-static DecodeStatus DecodeAdrInstruction(llvm::MCInst &Inst, uint32_t insn,
-                                         uint64_t Address, const void *Decoder);
-static DecodeStatus DecodeBaseAddSubImm(llvm::MCInst &Inst, uint32_t insn,
-                                        uint64_t Address, const void *Decoder);
-static DecodeStatus DecodeUnconditionalBranch(llvm::MCInst &Inst, uint32_t insn,
-                                              uint64_t Address,
-                                              const void *Decoder);
-static DecodeStatus DecodeSystemCPSRInstruction(llvm::MCInst &Inst,
-                                                uint32_t insn, uint64_t Address,
-                                                const void *Decoder);
-static DecodeStatus DecodeTestAndBranch(llvm::MCInst &Inst, uint32_t insn,
-                                        uint64_t Address, const void *Decoder);
-static DecodeStatus DecodeSIMDLdStPost(llvm::MCInst &Inst, uint32_t insn,
-                                       uint64_t Addr, const void *Decoder);
-static DecodeStatus DecodeSIMDLdStSingle(llvm::MCInst &Inst, uint32_t insn,
-                                         uint64_t Addr, const void *Decoder);
-static DecodeStatus DecodeSIMDLdStSingleTied(llvm::MCInst &Inst, uint32_t insn,
-                                             uint64_t Addr,
-                                             const void *Decoder);
-
-static DecodeStatus DecodeVecShiftR64Imm(llvm::MCInst &Inst, unsigned Imm,
-                                         uint64_t Addr, const void *Decoder);
-static DecodeStatus DecodeVecShiftR64ImmNarrow(llvm::MCInst &Inst, unsigned Imm,
-                                               uint64_t Addr,
-                                               const void *Decoder);
-static DecodeStatus DecodeVecShiftR32Imm(llvm::MCInst &Inst, unsigned Imm,
-                                         uint64_t Addr, const void *Decoder);
-static DecodeStatus DecodeVecShiftR32ImmNarrow(llvm::MCInst &Inst, unsigned Imm,
-                                               uint64_t Addr,
-                                               const void *Decoder);
-static DecodeStatus DecodeVecShiftR16Imm(llvm::MCInst &Inst, unsigned Imm,
-                                         uint64_t Addr, const void *Decoder);
-static DecodeStatus DecodeVecShiftR16ImmNarrow(llvm::MCInst &Inst, unsigned Imm,
-                                               uint64_t Addr,
-                                               const void *Decoder);
-static DecodeStatus DecodeVecShiftR8Imm(llvm::MCInst &Inst, unsigned Imm,
-                                        uint64_t Addr, const void *Decoder);
-static DecodeStatus DecodeVecShiftL64Imm(llvm::MCInst &Inst, unsigned Imm,
-                                         uint64_t Addr, const void *Decoder);
-static DecodeStatus DecodeVecShiftL32Imm(llvm::MCInst &Inst, unsigned Imm,
-                                         uint64_t Addr, const void *Decoder);
-static DecodeStatus DecodeVecShiftL16Imm(llvm::MCInst &Inst, unsigned Imm,
-                                         uint64_t Addr, const void *Decoder);
-static DecodeStatus DecodeVecShiftL8Imm(llvm::MCInst &Inst, unsigned Imm,
-                                        uint64_t Addr, const void *Decoder);
-
-#include "ARM64GenDisassemblerTables.inc"
-#include "ARM64GenInstrInfo.inc"
-
-using namespace llvm;
-
-#define Success llvm::MCDisassembler::Success
-#define Fail llvm::MCDisassembler::Fail
-
-static MCDisassembler *createARM64Disassembler(const Target &T,
-                                               const MCSubtargetInfo &STI) {
-  return new ARM64Disassembler(STI);
-}
-
-DecodeStatus ARM64Disassembler::getInstruction(MCInst &MI, uint64_t &Size,
-                                               const MemoryObject &Region,
-                                               uint64_t Address,
-                                               raw_ostream &os,
-                                               raw_ostream &cs) const {
-  CommentStream = &cs;
-
-  uint8_t bytes[4];
-
-  Size = 0;
-  // We want to read exactly 4 bytes of data.
-  if (Region.readBytes(Address, 4, (uint8_t *)bytes) == -1)
-    return Fail;
-  Size = 4;
-
-  // Encoded as a small-endian 32-bit word in the stream.
-  uint32_t insn =
-      (bytes[3] << 24) | (bytes[2] << 16) | (bytes[1] << 8) | (bytes[0] << 0);
-
-  // Calling the auto-generated decoder function.
-  DecodeStatus result =
-      decodeInstruction(DecoderTable32, MI, insn, Address, this, STI);
-  if (!result)
-    return Fail;
-
-  return Success;
-}
-
-static MCSymbolRefExpr::VariantKind
-getVariant(uint64_t LLVMDisassembler_VariantKind) {
-  switch (LLVMDisassembler_VariantKind) {
-  case LLVMDisassembler_VariantKind_None:
-    return MCSymbolRefExpr::VK_None;
-  case LLVMDisassembler_VariantKind_ARM64_PAGE:
-    return MCSymbolRefExpr::VK_PAGE;
-  case LLVMDisassembler_VariantKind_ARM64_PAGEOFF:
-    return MCSymbolRefExpr::VK_PAGEOFF;
-  case LLVMDisassembler_VariantKind_ARM64_GOTPAGE:
-    return MCSymbolRefExpr::VK_GOTPAGE;
-  case LLVMDisassembler_VariantKind_ARM64_GOTPAGEOFF:
-    return MCSymbolRefExpr::VK_GOTPAGEOFF;
-  case LLVMDisassembler_VariantKind_ARM64_TLVP:
-  case LLVMDisassembler_VariantKind_ARM64_TLVOFF:
-  default:
-    assert(0 && "bad LLVMDisassembler_VariantKind");
-    return MCSymbolRefExpr::VK_None;
-  }
-}
-
-/// tryAddingSymbolicOperand - tryAddingSymbolicOperand trys to add a symbolic
-/// operand in place of the immediate Value in the MCInst.  The immediate
-/// Value has not had any PC adjustment made by the caller. If the instruction
-/// is a branch that adds the PC to the immediate Value then isBranch is
-/// Success, else Fail.  If the getOpInfo() function was set as part of the
-/// setupForSymbolicDisassembly() call then that function is called to get any
-/// symbolic information at the Address for this instrution.  If that returns
-/// non-zero then the symbolic information it returns is used to create an
-/// MCExpr and that is added as an operand to the MCInst.  If getOpInfo()
-/// returns zero and isBranch is Success then a symbol look up for
-/// Address + Value is done and if a symbol is found an MCExpr is created with
-/// that, else an MCExpr with Address + Value is created.  If getOpInfo()
-/// returns zero and isBranch is Fail then the the Opcode of the MCInst is
-/// tested and for ADRP an other instructions that help to load of pointers
-/// a symbol look up is done to see it is returns a specific reference type
-/// to add to the comment stream.  This function returns Success if it adds
-/// an operand to the MCInst and Fail otherwise.
-bool ARM64Disassembler::tryAddingSymbolicOperand(uint64_t Address, int Value,
-                                                 bool isBranch,
-                                                 uint64_t InstSize, MCInst &MI,
-                                                 uint32_t insn) const {
-  LLVMOpInfoCallback getOpInfo = getLLVMOpInfoCallback();
-
-  struct LLVMOpInfo1 SymbolicOp;
-  memset(&SymbolicOp, '\0', sizeof(struct LLVMOpInfo1));
-  SymbolicOp.Value = Value;
-  void *DisInfo = getDisInfoBlock();
-  uint64_t ReferenceType;
-  const char *ReferenceName;
-  const char *Name;
-  LLVMSymbolLookupCallback SymbolLookUp = getLLVMSymbolLookupCallback();
-  if (!getOpInfo ||
-      !getOpInfo(DisInfo, Address, 0 /* Offset */, InstSize, 1, &SymbolicOp)) {
-    if (isBranch) {
-      if (SymbolLookUp) {
-        ReferenceType = LLVMDisassembler_ReferenceType_In_Branch;
-        Name = SymbolLookUp(DisInfo, Address + Value, &ReferenceType, Address,
-                            &ReferenceName);
-        if (Name) {
-          SymbolicOp.AddSymbol.Name = Name;
-          SymbolicOp.AddSymbol.Present = Success;
-          SymbolicOp.Value = 0;
-        } else {
-          SymbolicOp.Value = Address + Value;
-        }
-        if (ReferenceType == LLVMDisassembler_ReferenceType_Out_SymbolStub)
-          (*CommentStream) << "symbol stub for: " << ReferenceName;
-        else if (ReferenceType ==
-                 LLVMDisassembler_ReferenceType_Out_Objc_Message)
-          (*CommentStream) << "Objc message: " << ReferenceName;
-      } else {
-        return false;
-      }
-    } else if (MI.getOpcode() == ARM64::ADRP) {
-      if (SymbolLookUp) {
-        ReferenceType = LLVMDisassembler_ReferenceType_In_ARM64_ADRP;
-        Name = SymbolLookUp(DisInfo, insn, &ReferenceType, Address,
-                            &ReferenceName);
-        (*CommentStream) << format("0x%llx",
-                                   0xfffffffffffff000LL & (Address + Value));
-      } else {
-        return false;
-      }
-    } else if (MI.getOpcode() == ARM64::ADDXri ||
-               MI.getOpcode() == ARM64::LDRXui ||
-               MI.getOpcode() == ARM64::LDRXl || MI.getOpcode() == ARM64::ADR) {
-      if (SymbolLookUp) {
-        if (MI.getOpcode() == ARM64::ADDXri)
-          ReferenceType = LLVMDisassembler_ReferenceType_In_ARM64_ADDXri;
-        else if (MI.getOpcode() == ARM64::LDRXui)
-          ReferenceType = LLVMDisassembler_ReferenceType_In_ARM64_LDRXui;
-        if (MI.getOpcode() == ARM64::LDRXl) {
-          ReferenceType = LLVMDisassembler_ReferenceType_In_ARM64_LDRXl;
-          Name = SymbolLookUp(DisInfo, Address + Value, &ReferenceType, Address,
-                              &ReferenceName);
-        } else if (MI.getOpcode() == ARM64::ADR) {
-          ReferenceType = LLVMDisassembler_ReferenceType_In_ARM64_ADR;
-          Name = SymbolLookUp(DisInfo, Address + Value, &ReferenceType, Address,
-                              &ReferenceName);
-        } else {
-          Name = SymbolLookUp(DisInfo, insn, &ReferenceType, Address,
-                              &ReferenceName);
-        }
-        if (ReferenceType == LLVMDisassembler_ReferenceType_Out_LitPool_SymAddr)
-          (*CommentStream) << "literal pool symbol address: " << ReferenceName;
-        else if (ReferenceType ==
-                 LLVMDisassembler_ReferenceType_Out_LitPool_CstrAddr)
-          (*CommentStream) << "literal pool for: \"" << ReferenceName << "\"";
-        else if (ReferenceType ==
-                 LLVMDisassembler_ReferenceType_Out_Objc_CFString_Ref)
-          (*CommentStream) << "Objc cfstring ref: @\"" << ReferenceName << "\"";
-        else if (ReferenceType ==
-                 LLVMDisassembler_ReferenceType_Out_Objc_Message)
-          (*CommentStream) << "Objc message: " << ReferenceName;
-        else if (ReferenceType ==
-                 LLVMDisassembler_ReferenceType_Out_Objc_Message_Ref)
-          (*CommentStream) << "Objc message ref: " << ReferenceName;
-        else if (ReferenceType ==
-                 LLVMDisassembler_ReferenceType_Out_Objc_Selector_Ref)
-          (*CommentStream) << "Objc selector ref: " << ReferenceName;
-        else if (ReferenceType ==
-                 LLVMDisassembler_ReferenceType_Out_Objc_Class_Ref)
-          (*CommentStream) << "Objc class ref: " << ReferenceName;
-        // For these instructions, the SymbolLookUp() above is just to get the
-        // ReferenceType and ReferenceName.  We want to make sure not to
-        // fall through so we don't build an MCExpr to leave the disassembly
-        // of the immediate values of these instructions to the InstPrinter.
-        return false;
-      } else {
-        return false;
-      }
-    } else {
-      return false;
-    }
-  }
-
-  MCContext *Ctx = getMCContext();
-  const MCExpr *Add = NULL;
-  if (SymbolicOp.AddSymbol.Present) {
-    if (SymbolicOp.AddSymbol.Name) {
-      StringRef Name(SymbolicOp.AddSymbol.Name);
-      MCSymbol *Sym = Ctx->GetOrCreateSymbol(Name);
-      MCSymbolRefExpr::VariantKind Variant = getVariant(SymbolicOp.VariantKind);
-      if (Variant != MCSymbolRefExpr::VK_None)
-        Add = MCSymbolRefExpr::Create(Sym, Variant, *Ctx);
-      else
-        Add = MCSymbolRefExpr::Create(Sym, *Ctx);
-    } else {
-      Add = MCConstantExpr::Create(SymbolicOp.AddSymbol.Value, *Ctx);
-    }
-  }
-
-  const MCExpr *Sub = NULL;
-  if (SymbolicOp.SubtractSymbol.Present) {
-    if (SymbolicOp.SubtractSymbol.Name) {
-      StringRef Name(SymbolicOp.SubtractSymbol.Name);
-      MCSymbol *Sym = Ctx->GetOrCreateSymbol(Name);
-      Sub = MCSymbolRefExpr::Create(Sym, *Ctx);
-    } else {
-      Sub = MCConstantExpr::Create(SymbolicOp.SubtractSymbol.Value, *Ctx);
-    }
-  }
-
-  const MCExpr *Off = NULL;
-  if (SymbolicOp.Value != 0)
-    Off = MCConstantExpr::Create(SymbolicOp.Value, *Ctx);
-
-  const MCExpr *Expr;
-  if (Sub) {
-    const MCExpr *LHS;
-    if (Add)
-      LHS = MCBinaryExpr::CreateSub(Add, Sub, *Ctx);
-    else
-      LHS = MCUnaryExpr::CreateMinus(Sub, *Ctx);
-    if (Off != 0)
-      Expr = MCBinaryExpr::CreateAdd(LHS, Off, *Ctx);
-    else
-      Expr = LHS;
-  } else if (Add) {
-    if (Off != 0)
-      Expr = MCBinaryExpr::CreateAdd(Add, Off, *Ctx);
-    else
-      Expr = Add;
-  } else {
-    if (Off != 0)
-      Expr = Off;
-    else
-      Expr = MCConstantExpr::Create(0, *Ctx);
-  }
-
-  MI.addOperand(MCOperand::CreateExpr(Expr));
-
-  return true;
-}
-
-extern "C" void LLVMInitializeARM64Disassembler() {
-  TargetRegistry::RegisterMCDisassembler(TheARM64Target,
-                                         createARM64Disassembler);
-}
-
-static const unsigned FPR128DecoderTable[] = {
-  ARM64::Q0,  ARM64::Q1,  ARM64::Q2,  ARM64::Q3,  ARM64::Q4,  ARM64::Q5,
-  ARM64::Q6,  ARM64::Q7,  ARM64::Q8,  ARM64::Q9,  ARM64::Q10, ARM64::Q11,
-  ARM64::Q12, ARM64::Q13, ARM64::Q14, ARM64::Q15, ARM64::Q16, ARM64::Q17,
-  ARM64::Q18, ARM64::Q19, ARM64::Q20, ARM64::Q21, ARM64::Q22, ARM64::Q23,
-  ARM64::Q24, ARM64::Q25, ARM64::Q26, ARM64::Q27, ARM64::Q28, ARM64::Q29,
-  ARM64::Q30, ARM64::Q31
-};
-
-static DecodeStatus DecodeFPR128RegisterClass(MCInst &Inst, unsigned RegNo,
-                                              uint64_t Addr,
-                                              const void *Decoder) {
-  if (RegNo > 31)
-    return Fail;
-
-  unsigned Register = FPR128DecoderTable[RegNo];
-  Inst.addOperand(MCOperand::CreateReg(Register));
-  return Success;
-}
-
-static DecodeStatus DecodeFPR128_loRegisterClass(MCInst &Inst, unsigned RegNo,
-                                                 uint64_t Addr,
-                                                 const void *Decoder) {
-  if (RegNo > 15)
-    return Fail;
-  return DecodeFPR128RegisterClass(Inst, RegNo, Addr, Decoder);
-}
-
-static const unsigned FPR64DecoderTable[] = {
-  ARM64::D0,  ARM64::D1,  ARM64::D2,  ARM64::D3,  ARM64::D4,  ARM64::D5,
-  ARM64::D6,  ARM64::D7,  ARM64::D8,  ARM64::D9,  ARM64::D10, ARM64::D11,
-  ARM64::D12, ARM64::D13, ARM64::D14, ARM64::D15, ARM64::D16, ARM64::D17,
-  ARM64::D18, ARM64::D19, ARM64::D20, ARM64::D21, ARM64::D22, ARM64::D23,
-  ARM64::D24, ARM64::D25, ARM64::D26, ARM64::D27, ARM64::D28, ARM64::D29,
-  ARM64::D30, ARM64::D31
-};
-
-static DecodeStatus DecodeFPR64RegisterClass(MCInst &Inst, unsigned RegNo,
-                                             uint64_t Addr,
-                                             const void *Decoder) {
-  if (RegNo > 31)
-    return Fail;
-
-  unsigned Register = FPR64DecoderTable[RegNo];
-  Inst.addOperand(MCOperand::CreateReg(Register));
-  return Success;
-}
-
-static const unsigned FPR32DecoderTable[] = {
-  ARM64::S0,  ARM64::S1,  ARM64::S2,  ARM64::S3,  ARM64::S4,  ARM64::S5,
-  ARM64::S6,  ARM64::S7,  ARM64::S8,  ARM64::S9,  ARM64::S10, ARM64::S11,
-  ARM64::S12, ARM64::S13, ARM64::S14, ARM64::S15, ARM64::S16, ARM64::S17,
-  ARM64::S18, ARM64::S19, ARM64::S20, ARM64::S21, ARM64::S22, ARM64::S23,
-  ARM64::S24, ARM64::S25, ARM64::S26, ARM64::S27, ARM64::S28, ARM64::S29,
-  ARM64::S30, ARM64::S31
-};
-
-static DecodeStatus DecodeFPR32RegisterClass(MCInst &Inst, unsigned RegNo,
-                                             uint64_t Addr,
-                                             const void *Decoder) {
-  if (RegNo > 31)
-    return Fail;
-
-  unsigned Register = FPR32DecoderTable[RegNo];
-  Inst.addOperand(MCOperand::CreateReg(Register));
-  return Success;
-}
-
-static const unsigned FPR16DecoderTable[] = {
-  ARM64::H0,  ARM64::H1,  ARM64::H2,  ARM64::H3,  ARM64::H4,  ARM64::H5,
-  ARM64::H6,  ARM64::H7,  ARM64::H8,  ARM64::H9,  ARM64::H10, ARM64::H11,
-  ARM64::H12, ARM64::H13, ARM64::H14, ARM64::H15, ARM64::H16, ARM64::H17,
-  ARM64::H18, ARM64::H19, ARM64::H20, ARM64::H21, ARM64::H22, ARM64::H23,
-  ARM64::H24, ARM64::H25, ARM64::H26, ARM64::H27, ARM64::H28, ARM64::H29,
-  ARM64::H30, ARM64::H31
-};
-
-static DecodeStatus DecodeFPR16RegisterClass(MCInst &Inst, unsigned RegNo,
-                                             uint64_t Addr,
-                                             const void *Decoder) {
-  if (RegNo > 31)
-    return Fail;
-
-  unsigned Register = FPR16DecoderTable[RegNo];
-  Inst.addOperand(MCOperand::CreateReg(Register));
-  return Success;
-}
-
-static const unsigned FPR8DecoderTable[] = {
-  ARM64::B0,  ARM64::B1,  ARM64::B2,  ARM64::B3,  ARM64::B4,  ARM64::B5,
-  ARM64::B6,  ARM64::B7,  ARM64::B8,  ARM64::B9,  ARM64::B10, ARM64::B11,
-  ARM64::B12, ARM64::B13, ARM64::B14, ARM64::B15, ARM64::B16, ARM64::B17,
-  ARM64::B18, ARM64::B19, ARM64::B20, ARM64::B21, ARM64::B22, ARM64::B23,
-  ARM64::B24, ARM64::B25, ARM64::B26, ARM64::B27, ARM64::B28, ARM64::B29,
-  ARM64::B30, ARM64::B31
-};
-
-static DecodeStatus DecodeFPR8RegisterClass(MCInst &Inst, unsigned RegNo,
-                                            uint64_t Addr,
-                                            const void *Decoder) {
-  if (RegNo > 31)
-    return Fail;
-
-  unsigned Register = FPR8DecoderTable[RegNo];
-  Inst.addOperand(MCOperand::CreateReg(Register));
-  return Success;
-}
-
-static const unsigned GPR64DecoderTable[] = {
-  ARM64::X0,  ARM64::X1,  ARM64::X2,  ARM64::X3,  ARM64::X4,  ARM64::X5,
-  ARM64::X6,  ARM64::X7,  ARM64::X8,  ARM64::X9,  ARM64::X10, ARM64::X11,
-  ARM64::X12, ARM64::X13, ARM64::X14, ARM64::X15, ARM64::X16, ARM64::X17,
-  ARM64::X18, ARM64::X19, ARM64::X20, ARM64::X21, ARM64::X22, ARM64::X23,
-  ARM64::X24, ARM64::X25, ARM64::X26, ARM64::X27, ARM64::X28, ARM64::FP,
-  ARM64::LR,  ARM64::XZR
-};
-
-static DecodeStatus DecodeGPR64RegisterClass(MCInst &Inst, unsigned RegNo,
-                                             uint64_t Addr,
-                                             const void *Decoder) {
-  if (RegNo > 31)
-    return Fail;
-
-  unsigned Register = GPR64DecoderTable[RegNo];
-  Inst.addOperand(MCOperand::CreateReg(Register));
-  return Success;
-}
-
-static DecodeStatus DecodeGPR64spRegisterClass(MCInst &Inst, unsigned RegNo,
-                                               uint64_t Addr,
-                                               const void *Decoder) {
-  if (RegNo > 31)
-    return Fail;
-  unsigned Register = GPR64DecoderTable[RegNo];
-  if (Register == ARM64::XZR)
-    Register = ARM64::SP;
-  Inst.addOperand(MCOperand::CreateReg(Register));
-  return Success;
-}
-
-static const unsigned GPR32DecoderTable[] = {
-  ARM64::W0,  ARM64::W1,  ARM64::W2,  ARM64::W3,  ARM64::W4,  ARM64::W5,
-  ARM64::W6,  ARM64::W7,  ARM64::W8,  ARM64::W9,  ARM64::W10, ARM64::W11,
-  ARM64::W12, ARM64::W13, ARM64::W14, ARM64::W15, ARM64::W16, ARM64::W17,
-  ARM64::W18, ARM64::W19, ARM64::W20, ARM64::W21, ARM64::W22, ARM64::W23,
-  ARM64::W24, ARM64::W25, ARM64::W26, ARM64::W27, ARM64::W28, ARM64::W29,
-  ARM64::W30, ARM64::WZR
-};
-
-static DecodeStatus DecodeGPR32RegisterClass(MCInst &Inst, unsigned RegNo,
-                                             uint64_t Addr,
-                                             const void *Decoder) {
-  if (RegNo > 31)
-    return Fail;
-
-  unsigned Register = GPR32DecoderTable[RegNo];
-  Inst.addOperand(MCOperand::CreateReg(Register));
-  return Success;
-}
-
-static DecodeStatus DecodeGPR32spRegisterClass(MCInst &Inst, unsigned RegNo,
-                                               uint64_t Addr,
-                                               const void *Decoder) {
-  if (RegNo > 31)
-    return Fail;
-
-  unsigned Register = GPR32DecoderTable[RegNo];
-  if (Register == ARM64::WZR)
-    Register = ARM64::WSP;
-  Inst.addOperand(MCOperand::CreateReg(Register));
-  return Success;
-}
-
-static const unsigned VectorDecoderTable[] = {
-  ARM64::Q0,  ARM64::Q1,  ARM64::Q2,  ARM64::Q3,  ARM64::Q4,  ARM64::Q5,
-  ARM64::Q6,  ARM64::Q7,  ARM64::Q8,  ARM64::Q9,  ARM64::Q10, ARM64::Q11,
-  ARM64::Q12, ARM64::Q13, ARM64::Q14, ARM64::Q15, ARM64::Q16, ARM64::Q17,
-  ARM64::Q18, ARM64::Q19, ARM64::Q20, ARM64::Q21, ARM64::Q22, ARM64::Q23,
-  ARM64::Q24, ARM64::Q25, ARM64::Q26, ARM64::Q27, ARM64::Q28, ARM64::Q29,
-  ARM64::Q30, ARM64::Q31
-};
-
-static DecodeStatus DecodeVectorRegisterClass(MCInst &Inst, unsigned RegNo,
-                                              uint64_t Addr,
-                                              const void *Decoder) {
-  if (RegNo > 31)
-    return Fail;
-
-  unsigned Register = VectorDecoderTable[RegNo];
-  Inst.addOperand(MCOperand::CreateReg(Register));
-  return Success;
-}
-
-static const unsigned QQDecoderTable[] = {
-  ARM64::Q0_Q1,   ARM64::Q1_Q2,   ARM64::Q2_Q3,   ARM64::Q3_Q4,
-  ARM64::Q4_Q5,   ARM64::Q5_Q6,   ARM64::Q6_Q7,   ARM64::Q7_Q8,
-  ARM64::Q8_Q9,   ARM64::Q9_Q10,  ARM64::Q10_Q11, ARM64::Q11_Q12,
-  ARM64::Q12_Q13, ARM64::Q13_Q14, ARM64::Q14_Q15, ARM64::Q15_Q16,
-  ARM64::Q16_Q17, ARM64::Q17_Q18, ARM64::Q18_Q19, ARM64::Q19_Q20,
-  ARM64::Q20_Q21, ARM64::Q21_Q22, ARM64::Q22_Q23, ARM64::Q23_Q24,
-  ARM64::Q24_Q25, ARM64::Q25_Q26, ARM64::Q26_Q27, ARM64::Q27_Q28,
-  ARM64::Q28_Q29, ARM64::Q29_Q30, ARM64::Q30_Q31, ARM64::Q31_Q0
-};
-
-static DecodeStatus DecodeQQRegisterClass(MCInst &Inst, unsigned RegNo,
-                                          uint64_t Addr, const void *Decoder) {
-  if (RegNo > 31)
-    return Fail;
-  unsigned Register = QQDecoderTable[RegNo];
-  Inst.addOperand(MCOperand::CreateReg(Register));
-  return Success;
-}
-
-static const unsigned QQQDecoderTable[] = {
-  ARM64::Q0_Q1_Q2,    ARM64::Q1_Q2_Q3,    ARM64::Q2_Q3_Q4,
-  ARM64::Q3_Q4_Q5,    ARM64::Q4_Q5_Q6,    ARM64::Q5_Q6_Q7,
-  ARM64::Q6_Q7_Q8,    ARM64::Q7_Q8_Q9,    ARM64::Q8_Q9_Q10,
-  ARM64::Q9_Q10_Q11,  ARM64::Q10_Q11_Q12, ARM64::Q11_Q12_Q13,
-  ARM64::Q12_Q13_Q14, ARM64::Q13_Q14_Q15, ARM64::Q14_Q15_Q16,
-  ARM64::Q15_Q16_Q17, ARM64::Q16_Q17_Q18, ARM64::Q17_Q18_Q19,
-  ARM64::Q18_Q19_Q20, ARM64::Q19_Q20_Q21, ARM64::Q20_Q21_Q22,
-  ARM64::Q21_Q22_Q23, ARM64::Q22_Q23_Q24, ARM64::Q23_Q24_Q25,
-  ARM64::Q24_Q25_Q26, ARM64::Q25_Q26_Q27, ARM64::Q26_Q27_Q28,
-  ARM64::Q27_Q28_Q29, ARM64::Q28_Q29_Q30, ARM64::Q29_Q30_Q31,
-  ARM64::Q30_Q31_Q0,  ARM64::Q31_Q0_Q1
-};
-
-static DecodeStatus DecodeQQQRegisterClass(MCInst &Inst, unsigned RegNo,
-                                           uint64_t Addr, const void *Decoder) {
-  if (RegNo > 31)
-    return Fail;
-  unsigned Register = QQQDecoderTable[RegNo];
-  Inst.addOperand(MCOperand::CreateReg(Register));
-  return Success;
-}
-
-static const unsigned QQQQDecoderTable[] = {
-  ARM64::Q0_Q1_Q2_Q3,     ARM64::Q1_Q2_Q3_Q4,     ARM64::Q2_Q3_Q4_Q5,
-  ARM64::Q3_Q4_Q5_Q6,     ARM64::Q4_Q5_Q6_Q7,     ARM64::Q5_Q6_Q7_Q8,
-  ARM64::Q6_Q7_Q8_Q9,     ARM64::Q7_Q8_Q9_Q10,    ARM64::Q8_Q9_Q10_Q11,
-  ARM64::Q9_Q10_Q11_Q12,  ARM64::Q10_Q11_Q12_Q13, ARM64::Q11_Q12_Q13_Q14,
-  ARM64::Q12_Q13_Q14_Q15, ARM64::Q13_Q14_Q15_Q16, ARM64::Q14_Q15_Q16_Q17,
-  ARM64::Q15_Q16_Q17_Q18, ARM64::Q16_Q17_Q18_Q19, ARM64::Q17_Q18_Q19_Q20,
-  ARM64::Q18_Q19_Q20_Q21, ARM64::Q19_Q20_Q21_Q22, ARM64::Q20_Q21_Q22_Q23,
-  ARM64::Q21_Q22_Q23_Q24, ARM64::Q22_Q23_Q24_Q25, ARM64::Q23_Q24_Q25_Q26,
-  ARM64::Q24_Q25_Q26_Q27, ARM64::Q25_Q26_Q27_Q28, ARM64::Q26_Q27_Q28_Q29,
-  ARM64::Q27_Q28_Q29_Q30, ARM64::Q28_Q29_Q30_Q31, ARM64::Q29_Q30_Q31_Q0,
-  ARM64::Q30_Q31_Q0_Q1,   ARM64::Q31_Q0_Q1_Q2
-};
-
-static DecodeStatus DecodeQQQQRegisterClass(MCInst &Inst, unsigned RegNo,
-                                            uint64_t Addr,
-                                            const void *Decoder) {
-  if (RegNo > 31)
-    return Fail;
-  unsigned Register = QQQQDecoderTable[RegNo];
-  Inst.addOperand(MCOperand::CreateReg(Register));
-  return Success;
-}
-
-static const unsigned DDDecoderTable[] = {
-  ARM64::D0_D1,   ARM64::D1_D2,   ARM64::D2_D3,   ARM64::D3_D4,
-  ARM64::D4_D5,   ARM64::D5_D6,   ARM64::D6_D7,   ARM64::D7_D8,
-  ARM64::D8_D9,   ARM64::D9_D10,  ARM64::D10_D11, ARM64::D11_D12,
-  ARM64::D12_D13, ARM64::D13_D14, ARM64::D14_D15, ARM64::D15_D16,
-  ARM64::D16_D17, ARM64::D17_D18, ARM64::D18_D19, ARM64::D19_D20,
-  ARM64::D20_D21, ARM64::D21_D22, ARM64::D22_D23, ARM64::D23_D24,
-  ARM64::D24_D25, ARM64::D25_D26, ARM64::D26_D27, ARM64::D27_D28,
-  ARM64::D28_D29, ARM64::D29_D30, ARM64::D30_D31, ARM64::D31_D0
-};
-
-static DecodeStatus DecodeDDRegisterClass(MCInst &Inst, unsigned RegNo,
-                                          uint64_t Addr, const void *Decoder) {
-  if (RegNo > 31)
-    return Fail;
-  unsigned Register = DDDecoderTable[RegNo];
-  Inst.addOperand(MCOperand::CreateReg(Register));
-  return Success;
-}
-
-static const unsigned DDDDecoderTable[] = {
-  ARM64::D0_D1_D2,    ARM64::D1_D2_D3,    ARM64::D2_D3_D4,
-  ARM64::D3_D4_D5,    ARM64::D4_D5_D6,    ARM64::D5_D6_D7,
-  ARM64::D6_D7_D8,    ARM64::D7_D8_D9,    ARM64::D8_D9_D10,
-  ARM64::D9_D10_D11,  ARM64::D10_D11_D12, ARM64::D11_D12_D13,
-  ARM64::D12_D13_D14, ARM64::D13_D14_D15, ARM64::D14_D15_D16,
-  ARM64::D15_D16_D17, ARM64::D16_D17_D18, ARM64::D17_D18_D19,
-  ARM64::D18_D19_D20, ARM64::D19_D20_D21, ARM64::D20_D21_D22,
-  ARM64::D21_D22_D23, ARM64::D22_D23_D24, ARM64::D23_D24_D25,
-  ARM64::D24_D25_D26, ARM64::D25_D26_D27, ARM64::D26_D27_D28,
-  ARM64::D27_D28_D29, ARM64::D28_D29_D30, ARM64::D29_D30_D31,
-  ARM64::D30_D31_D0,  ARM64::D31_D0_D1
-};
-
-static DecodeStatus DecodeDDDRegisterClass(MCInst &Inst, unsigned RegNo,
-                                           uint64_t Addr, const void *Decoder) {
-  if (RegNo > 31)
-    return Fail;
-  unsigned Register = DDDDecoderTable[RegNo];
-  Inst.addOperand(MCOperand::CreateReg(Register));
-  return Success;
-}
-
-static const unsigned DDDDDecoderTable[] = {
-  ARM64::D0_D1_D2_D3,     ARM64::D1_D2_D3_D4,     ARM64::D2_D3_D4_D5,
-  ARM64::D3_D4_D5_D6,     ARM64::D4_D5_D6_D7,     ARM64::D5_D6_D7_D8,
-  ARM64::D6_D7_D8_D9,     ARM64::D7_D8_D9_D10,    ARM64::D8_D9_D10_D11,
-  ARM64::D9_D10_D11_D12,  ARM64::D10_D11_D12_D13, ARM64::D11_D12_D13_D14,
-  ARM64::D12_D13_D14_D15, ARM64::D13_D14_D15_D16, ARM64::D14_D15_D16_D17,
-  ARM64::D15_D16_D17_D18, ARM64::D16_D17_D18_D19, ARM64::D17_D18_D19_D20,
-  ARM64::D18_D19_D20_D21, ARM64::D19_D20_D21_D22, ARM64::D20_D21_D22_D23,
-  ARM64::D21_D22_D23_D24, ARM64::D22_D23_D24_D25, ARM64::D23_D24_D25_D26,
-  ARM64::D24_D25_D26_D27, ARM64::D25_D26_D27_D28, ARM64::D26_D27_D28_D29,
-  ARM64::D27_D28_D29_D30, ARM64::D28_D29_D30_D31, ARM64::D29_D30_D31_D0,
-  ARM64::D30_D31_D0_D1,   ARM64::D31_D0_D1_D2
-};
-
-static DecodeStatus DecodeDDDDRegisterClass(MCInst &Inst, unsigned RegNo,
-                                            uint64_t Addr,
-                                            const void *Decoder) {
-  if (RegNo > 31)
-    return Fail;
-  unsigned Register = DDDDDecoderTable[RegNo];
-  Inst.addOperand(MCOperand::CreateReg(Register));
-  return Success;
-}
-
-static DecodeStatus DecodeFixedPointScaleImm(llvm::MCInst &Inst, unsigned Imm,
-                                             uint64_t Addr,
-                                             const void *Decoder) {
-  Inst.addOperand(MCOperand::CreateImm(64 - Imm));
-  return Success;
-}
-
-static DecodeStatus DecodeCondBranchTarget(llvm::MCInst &Inst, unsigned Imm,
-                                           uint64_t Addr, const void *Decoder) {
-  int64_t ImmVal = Imm;
-  const ARM64Disassembler *Dis =
-      static_cast<const ARM64Disassembler *>(Decoder);
-
-  // Sign-extend 19-bit immediate.
-  if (ImmVal & (1 << (19 - 1)))
-    ImmVal |= ~((1LL << 19) - 1);
-
-  if (!Dis->tryAddingSymbolicOperand(Addr, ImmVal << 2,
-                                     Inst.getOpcode() != ARM64::LDRXl, 4, Inst))
-    Inst.addOperand(MCOperand::CreateImm(ImmVal));
-  return Success;
-}
-
-static DecodeStatus DecodeSystemRegister(llvm::MCInst &Inst, unsigned Imm,
-                                         uint64_t Address,
-                                         const void *Decoder) {
-  Inst.addOperand(MCOperand::CreateImm(Imm | 0x8000));
-  return Success;
-}
-
-static DecodeStatus DecodeVecShiftRImm(llvm::MCInst &Inst, unsigned Imm,
-                                       unsigned Add) {
-  Inst.addOperand(MCOperand::CreateImm(Add - Imm));
-  return Success;
-}
-
-static DecodeStatus DecodeVecShiftLImm(llvm::MCInst &Inst, unsigned Imm,
-                                       unsigned Add) {
-  Inst.addOperand(MCOperand::CreateImm((Imm + Add) & (Add - 1)));
-  return Success;
-}
-
-static DecodeStatus DecodeVecShiftR64Imm(llvm::MCInst &Inst, unsigned Imm,
-                                         uint64_t Addr, const void *Decoder) {
-  return DecodeVecShiftRImm(Inst, Imm, 64);
-}
-
-static DecodeStatus DecodeVecShiftR64ImmNarrow(llvm::MCInst &Inst, unsigned Imm,
-                                               uint64_t Addr,
-                                               const void *Decoder) {
-  return DecodeVecShiftRImm(Inst, Imm | 0x20, 64);
-}
-
-static DecodeStatus DecodeVecShiftR32Imm(llvm::MCInst &Inst, unsigned Imm,
-                                         uint64_t Addr, const void *Decoder) {
-  return DecodeVecShiftRImm(Inst, Imm, 32);
-}
-
-static DecodeStatus DecodeVecShiftR32ImmNarrow(llvm::MCInst &Inst, unsigned Imm,
-                                               uint64_t Addr,
-                                               const void *Decoder) {
-  return DecodeVecShiftRImm(Inst, Imm | 0x10, 32);
-}
-
-static DecodeStatus DecodeVecShiftR16Imm(llvm::MCInst &Inst, unsigned Imm,
-                                         uint64_t Addr, const void *Decoder) {
-  return DecodeVecShiftRImm(Inst, Imm, 16);
-}
-
-static DecodeStatus DecodeVecShiftR16ImmNarrow(llvm::MCInst &Inst, unsigned Imm,
-                                               uint64_t Addr,
-                                               const void *Decoder) {
-  return DecodeVecShiftRImm(Inst, Imm | 0x8, 16);
-}
-
-static DecodeStatus DecodeVecShiftR8Imm(llvm::MCInst &Inst, unsigned Imm,
-                                        uint64_t Addr, const void *Decoder) {
-  return DecodeVecShiftRImm(Inst, Imm, 8);
-}
-
-static DecodeStatus DecodeVecShiftL64Imm(llvm::MCInst &Inst, unsigned Imm,
-                                         uint64_t Addr, const void *Decoder) {
-  return DecodeVecShiftLImm(Inst, Imm, 64);
-}
-
-static DecodeStatus DecodeVecShiftL32Imm(llvm::MCInst &Inst, unsigned Imm,
-                                         uint64_t Addr, const void *Decoder) {
-  return DecodeVecShiftLImm(Inst, Imm, 32);
-}
-
-static DecodeStatus DecodeVecShiftL16Imm(llvm::MCInst &Inst, unsigned Imm,
-                                         uint64_t Addr, const void *Decoder) {
-  return DecodeVecShiftLImm(Inst, Imm, 16);
-}
-
-static DecodeStatus DecodeVecShiftL8Imm(llvm::MCInst &Inst, unsigned Imm,
-                                        uint64_t Addr, const void *Decoder) {
-  return DecodeVecShiftLImm(Inst, Imm, 8);
-}
-
-static DecodeStatus DecodeThreeAddrSRegInstruction(llvm::MCInst &Inst,
-                                                   uint32_t insn, uint64_t Addr,
-                                                   const void *Decoder) {
-  unsigned Rd = fieldFromInstruction(insn, 0, 5);
-  unsigned Rn = fieldFromInstruction(insn, 5, 5);
-  unsigned Rm = fieldFromInstruction(insn, 16, 5);
-  unsigned shiftHi = fieldFromInstruction(insn, 22, 2);
-  unsigned shiftLo = fieldFromInstruction(insn, 10, 6);
-  unsigned shift = (shiftHi << 6) | shiftLo;
-  switch (Inst.getOpcode()) {
-  default:
-    return Fail;
-  case ARM64::ANDWrs:
-  case ARM64::ANDSWrs:
-  case ARM64::BICWrs:
-  case ARM64::BICSWrs:
-  case ARM64::ORRWrs:
-  case ARM64::ORNWrs:
-  case ARM64::EORWrs:
-  case ARM64::EONWrs:
-  case ARM64::ADDWrs:
-  case ARM64::ADDSWrs:
-  case ARM64::SUBWrs:
-  case ARM64::SUBSWrs: {
-    DecodeGPR32RegisterClass(Inst, Rd, Addr, Decoder);
-    DecodeGPR32RegisterClass(Inst, Rn, Addr, Decoder);
-    DecodeGPR32RegisterClass(Inst, Rm, Addr, Decoder);
-    break;
-  }
-  case ARM64::ANDXrs:
-  case ARM64::ANDSXrs:
-  case ARM64::BICXrs:
-  case ARM64::BICSXrs:
-  case ARM64::ORRXrs:
-  case ARM64::ORNXrs:
-  case ARM64::EORXrs:
-  case ARM64::EONXrs:
-  case ARM64::ADDXrs:
-  case ARM64::ADDSXrs:
-  case ARM64::SUBXrs:
-  case ARM64::SUBSXrs:
-    DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder);
-    DecodeGPR64RegisterClass(Inst, Rn, Addr, Decoder);
-    DecodeGPR64RegisterClass(Inst, Rm, Addr, Decoder);
-    break;
-  }
-
-  Inst.addOperand(MCOperand::CreateImm(shift));
-  return Success;
-}
-
-static DecodeStatus DecodeMoveImmInstruction(llvm::MCInst &Inst, uint32_t insn,
-                                             uint64_t Addr,
-                                             const void *Decoder) {
-  unsigned Rd = fieldFromInstruction(insn, 0, 5);
-  unsigned imm = fieldFromInstruction(insn, 5, 16);
-  unsigned shift = fieldFromInstruction(insn, 21, 2);
-  shift <<= 4;
-  switch (Inst.getOpcode()) {
-  default:
-    return Fail;
-  case ARM64::MOVZWi:
-  case ARM64::MOVNWi:
-  case ARM64::MOVKWi:
-    DecodeGPR32RegisterClass(Inst, Rd, Addr, Decoder);
-    break;
-  case ARM64::MOVZXi:
-  case ARM64::MOVNXi:
-  case ARM64::MOVKXi:
-    DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder);
-    break;
-  }
-
-  if (Inst.getOpcode() == ARM64::MOVKWi || Inst.getOpcode() == ARM64::MOVKXi)
-    Inst.addOperand(Inst.getOperand(0));
-
-  Inst.addOperand(MCOperand::CreateImm(imm));
-  Inst.addOperand(MCOperand::CreateImm(shift));
-  return Success;
-}
-
-static DecodeStatus DecodeUnsignedLdStInstruction(llvm::MCInst &Inst,
-                                                  uint32_t insn, uint64_t Addr,
-                                                  const void *Decoder) {
-  unsigned Rt = fieldFromInstruction(insn, 0, 5);
-  unsigned Rn = fieldFromInstruction(insn, 5, 5);
-  unsigned offset = fieldFromInstruction(insn, 10, 12);
-  const ARM64Disassembler *Dis =
-      static_cast<const ARM64Disassembler *>(Decoder);
-
-  switch (Inst.getOpcode()) {
-  default:
-    return Fail;
-  case ARM64::PRFMui:
-    // Rt is an immediate in prefetch.
-    Inst.addOperand(MCOperand::CreateImm(Rt));
-    break;
-  case ARM64::STRBBui:
-  case ARM64::LDRBBui:
-  case ARM64::LDRSBWui:
-  case ARM64::STRHHui:
-  case ARM64::LDRHHui:
-  case ARM64::LDRSHWui:
-  case ARM64::STRWui:
-  case ARM64::LDRWui:
-    DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder);
-    break;
-  case ARM64::LDRSBXui:
-  case ARM64::LDRSHXui:
-  case ARM64::LDRSWui:
-  case ARM64::STRXui:
-  case ARM64::LDRXui:
-    DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder);
-    break;
-  case ARM64::LDRQui:
-  case ARM64::STRQui:
-    DecodeFPR128RegisterClass(Inst, Rt, Addr, Decoder);
-    break;
-  case ARM64::LDRDui:
-  case ARM64::STRDui:
-    DecodeFPR64RegisterClass(Inst, Rt, Addr, Decoder);
-    break;
-  case ARM64::LDRSui:
-  case ARM64::STRSui:
-    DecodeFPR32RegisterClass(Inst, Rt, Addr, Decoder);
-    break;
-  case ARM64::LDRHui:
-  case ARM64::STRHui:
-    DecodeFPR16RegisterClass(Inst, Rt, Addr, Decoder);
-    break;
-  case ARM64::LDRBui:
-  case ARM64::STRBui:
-    DecodeFPR8RegisterClass(Inst, Rt, Addr, Decoder);
-    break;
-  }
-
-  DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder);
-  if (!Dis->tryAddingSymbolicOperand(Addr, offset, Fail, 4, Inst, insn))
-    Inst.addOperand(MCOperand::CreateImm(offset));
-  return Success;
-}
-
-static DecodeStatus DecodeSignedLdStInstruction(llvm::MCInst &Inst,
-                                                uint32_t insn, uint64_t Addr,
-                                                const void *Decoder) {
-  unsigned Rt = fieldFromInstruction(insn, 0, 5);
-  unsigned Rn = fieldFromInstruction(insn, 5, 5);
-  int64_t offset = fieldFromInstruction(insn, 12, 9);
-
-  // offset is a 9-bit signed immediate, so sign extend it to
-  // fill the unsigned.
-  if (offset & (1 << (9 - 1)))
-    offset |= ~((1LL << 9) - 1);
-
-  switch (Inst.getOpcode()) {
-  default:
-    return Fail;
-  case ARM64::PRFUMi:
-    // Rt is an immediate in prefetch.
-    Inst.addOperand(MCOperand::CreateImm(Rt));
-    break;
-  case ARM64::STURBBi:
-  case ARM64::LDURBBi:
-  case ARM64::LDURSBWi:
-  case ARM64::STURHHi:
-  case ARM64::LDURHHi:
-  case ARM64::LDURSHWi:
-  case ARM64::STURWi:
-  case ARM64::LDURWi:
-  case ARM64::LDTRSBWi:
-  case ARM64::LDTRSHWi:
-  case ARM64::STTRWi:
-  case ARM64::LDTRWi:
-  case ARM64::STTRHi:
-  case ARM64::LDTRHi:
-  case ARM64::LDTRBi:
-  case ARM64::STTRBi:
-  case ARM64::LDRSBWpre:
-  case ARM64::LDRSHWpre:
-  case ARM64::STRBBpre:
-  case ARM64::LDRBBpre:
-  case ARM64::STRHHpre:
-  case ARM64::LDRHHpre:
-  case ARM64::STRWpre:
-  case ARM64::LDRWpre:
-  case ARM64::LDRSBWpost:
-  case ARM64::LDRSHWpost:
-  case ARM64::STRBBpost:
-  case ARM64::LDRBBpost:
-  case ARM64::STRHHpost:
-  case ARM64::LDRHHpost:
-  case ARM64::STRWpost:
-  case ARM64::LDRWpost:
-    DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder);
-    break;
-  case ARM64::LDURSBXi:
-  case ARM64::LDURSHXi:
-  case ARM64::LDURSWi:
-  case ARM64::STURXi:
-  case ARM64::LDURXi:
-  case ARM64::LDTRSBXi:
-  case ARM64::LDTRSHXi:
-  case ARM64::LDTRSWi:
-  case ARM64::STTRXi:
-  case ARM64::LDTRXi:
-  case ARM64::LDRSBXpre:
-  case ARM64::LDRSHXpre:
-  case ARM64::STRXpre:
-  case ARM64::LDRSWpre:
-  case ARM64::LDRXpre:
-  case ARM64::LDRSBXpost:
-  case ARM64::LDRSHXpost:
-  case ARM64::STRXpost:
-  case ARM64::LDRSWpost:
-  case ARM64::LDRXpost:
-    DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder);
-    break;
-  case ARM64::LDURQi:
-  case ARM64::STURQi:
-  case ARM64::LDRQpre:
-  case ARM64::STRQpre:
-  case ARM64::LDRQpost:
-  case ARM64::STRQpost:
-    DecodeFPR128RegisterClass(Inst, Rt, Addr, Decoder);
-    break;
-  case ARM64::LDURDi:
-  case ARM64::STURDi:
-  case ARM64::LDRDpre:
-  case ARM64::STRDpre:
-  case ARM64::LDRDpost:
-  case ARM64::STRDpost:
-    DecodeFPR64RegisterClass(Inst, Rt, Addr, Decoder);
-    break;
-  case ARM64::LDURSi:
-  case ARM64::STURSi:
-  case ARM64::LDRSpre:
-  case ARM64::STRSpre:
-  case ARM64::LDRSpost:
-  case ARM64::STRSpost:
-    DecodeFPR32RegisterClass(Inst, Rt, Addr, Decoder);
-    break;
-  case ARM64::LDURHi:
-  case ARM64::STURHi:
-  case ARM64::LDRHpre:
-  case ARM64::STRHpre:
-  case ARM64::LDRHpost:
-  case ARM64::STRHpost:
-    DecodeFPR16RegisterClass(Inst, Rt, Addr, Decoder);
-    break;
-  case ARM64::LDURBi:
-  case ARM64::STURBi:
-  case ARM64::LDRBpre:
-  case ARM64::STRBpre:
-  case ARM64::LDRBpost:
-  case ARM64::STRBpost:
-    DecodeFPR8RegisterClass(Inst, Rt, Addr, Decoder);
-    break;
-  }
-
-  DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder);
-  Inst.addOperand(MCOperand::CreateImm(offset));
-  return Success;
-}
-
-static DecodeStatus DecodeExclusiveLdStInstruction(llvm::MCInst &Inst,
-                                                   uint32_t insn, uint64_t Addr,
-                                                   const void *Decoder) {
-  unsigned Rt = fieldFromInstruction(insn, 0, 5);
-  unsigned Rn = fieldFromInstruction(insn, 5, 5);
-  unsigned Rt2 = fieldFromInstruction(insn, 10, 5);
-  unsigned Rs = fieldFromInstruction(insn, 16, 5);
-
-  switch (Inst.getOpcode()) {
-  default:
-    return Fail;
-  case ARM64::STLXRW:
-  case ARM64::STLXRB:
-  case ARM64::STLXRH:
-  case ARM64::STXRW:
-  case ARM64::STXRB:
-  case ARM64::STXRH:
-    DecodeGPR32RegisterClass(Inst, Rs, Addr, Decoder);
-  // FALLTHROUGH
-  case ARM64::LDARW:
-  case ARM64::LDARB:
-  case ARM64::LDARH:
-  case ARM64::LDAXRW:
-  case ARM64::LDAXRB:
-  case ARM64::LDAXRH:
-  case ARM64::LDXRW:
-  case ARM64::LDXRB:
-  case ARM64::LDXRH:
-  case ARM64::STLRW:
-  case ARM64::STLRB:
-  case ARM64::STLRH:
-    DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder);
-    break;
-  case ARM64::STLXRX:
-  case ARM64::STXRX:
-    DecodeGPR32RegisterClass(Inst, Rs, Addr, Decoder);
-  // FALLTHROUGH
-  case ARM64::LDARX:
-  case ARM64::LDAXRX:
-  case ARM64::LDXRX:
-  case ARM64::STLRX:
-    DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder);
-    break;
-  case ARM64::STLXPW:
-  case ARM64::STXPW:
-    DecodeGPR32RegisterClass(Inst, Rs, Addr, Decoder);
-  // FALLTHROUGH
-  case ARM64::LDAXPW:
-  case ARM64::LDXPW:
-    DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder);
-    DecodeGPR32RegisterClass(Inst, Rt2, Addr, Decoder);
-    break;
-  case ARM64::STLXPX:
-  case ARM64::STXPX:
-    DecodeGPR32RegisterClass(Inst, Rs, Addr, Decoder);
-  // FALLTHROUGH
-  case ARM64::LDAXPX:
-  case ARM64::LDXPX:
-    DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder);
-    DecodeGPR64RegisterClass(Inst, Rt2, Addr, Decoder);
-    break;
-  }
-
-  DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder);
-  return Success;
-}
-
-static DecodeStatus DecodePairLdStInstruction(llvm::MCInst &Inst, uint32_t insn,
-                                              uint64_t Addr,
-                                              const void *Decoder) {
-  unsigned Rt = fieldFromInstruction(insn, 0, 5);
-  unsigned Rn = fieldFromInstruction(insn, 5, 5);
-  unsigned Rt2 = fieldFromInstruction(insn, 10, 5);
-  int64_t offset = fieldFromInstruction(insn, 15, 7);
-
-  // offset is a 7-bit signed immediate, so sign extend it to
-  // fill the unsigned.
-  if (offset & (1 << (7 - 1)))
-    offset |= ~((1LL << 7) - 1);
-
-  switch (Inst.getOpcode()) {
-  default:
-    return Fail;
-  case ARM64::LDNPXi:
-  case ARM64::STNPXi:
-  case ARM64::LDPXpost:
-  case ARM64::STPXpost:
-  case ARM64::LDPSWpost:
-  case ARM64::LDPXi:
-  case ARM64::STPXi:
-  case ARM64::LDPSWi:
-  case ARM64::LDPXpre:
-  case ARM64::STPXpre:
-  case ARM64::LDPSWpre:
-    DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder);
-    DecodeGPR64RegisterClass(Inst, Rt2, Addr, Decoder);
-    break;
-  case ARM64::LDNPWi:
-  case ARM64::STNPWi:
-  case ARM64::LDPWpost:
-  case ARM64::STPWpost:
-  case ARM64::LDPWi:
-  case ARM64::STPWi:
-  case ARM64::LDPWpre:
-  case ARM64::STPWpre:
-    DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder);
-    DecodeGPR32RegisterClass(Inst, Rt2, Addr, Decoder);
-    break;
-  case ARM64::LDNPQi:
-  case ARM64::STNPQi:
-  case ARM64::LDPQpost:
-  case ARM64::STPQpost:
-  case ARM64::LDPQi:
-  case ARM64::STPQi:
-  case ARM64::LDPQpre:
-  case ARM64::STPQpre:
-    DecodeFPR128RegisterClass(Inst, Rt, Addr, Decoder);
-    DecodeFPR128RegisterClass(Inst, Rt2, Addr, Decoder);
-    break;
-  case ARM64::LDNPDi:
-  case ARM64::STNPDi:
-  case ARM64::LDPDpost:
-  case ARM64::STPDpost:
-  case ARM64::LDPDi:
-  case ARM64::STPDi:
-  case ARM64::LDPDpre:
-  case ARM64::STPDpre:
-    DecodeFPR64RegisterClass(Inst, Rt, Addr, Decoder);
-    DecodeFPR64RegisterClass(Inst, Rt2, Addr, Decoder);
-    break;
-  case ARM64::LDNPSi:
-  case ARM64::STNPSi:
-  case ARM64::LDPSpost:
-  case ARM64::STPSpost:
-  case ARM64::LDPSi:
-  case ARM64::STPSi:
-  case ARM64::LDPSpre:
-  case ARM64::STPSpre:
-    DecodeFPR32RegisterClass(Inst, Rt, Addr, Decoder);
-    DecodeFPR32RegisterClass(Inst, Rt2, Addr, Decoder);
-    break;
-  }
-
-  DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder);
-  Inst.addOperand(MCOperand::CreateImm(offset));
-  return Success;
-}
-
-static DecodeStatus DecodeRegOffsetLdStInstruction(llvm::MCInst &Inst,
-                                                   uint32_t insn, uint64_t Addr,
-                                                   const void *Decoder) {
-  unsigned Rt = fieldFromInstruction(insn, 0, 5);
-  unsigned Rn = fieldFromInstruction(insn, 5, 5);
-  unsigned Rm = fieldFromInstruction(insn, 16, 5);
-  unsigned extendHi = fieldFromInstruction(insn, 13, 3);
-  unsigned extendLo = fieldFromInstruction(insn, 12, 1);
-  unsigned extend = 0;
-
-  switch (Inst.getOpcode()) {
-  default:
-    return Fail;
-  case ARM64::LDRSWro:
-    extend = (extendHi << 1) | extendLo;
-    DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder);
-    break;
-  case ARM64::LDRXro:
-  case ARM64::STRXro:
-    extend = (extendHi << 1) | extendLo;
-    DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder);
-    break;
-  case ARM64::LDRWro:
-  case ARM64::STRWro:
-    extend = (extendHi << 1) | extendLo;
-    DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder);
-    break;
-  case ARM64::LDRQro:
-  case ARM64::STRQro:
-    extend = (extendHi << 1) | extendLo;
-    DecodeFPR128RegisterClass(Inst, Rt, Addr, Decoder);
-    break;
-  case ARM64::LDRDro:
-  case ARM64::STRDro:
-    extend = (extendHi << 1) | extendLo;
-    DecodeFPR64RegisterClass(Inst, Rt, Addr, Decoder);
-    break;
-  case ARM64::LDRSro:
-  case ARM64::STRSro:
-    extend = (extendHi << 1) | extendLo;
-    DecodeFPR32RegisterClass(Inst, Rt, Addr, Decoder);
-    break;
-  case ARM64::LDRHro:
-    extend = (extendHi << 1) | extendLo;
-    DecodeFPR16RegisterClass(Inst, Rt, Addr, Decoder);
-    break;
-  case ARM64::LDRBro:
-    extend = (extendHi << 1) | extendLo;
-    DecodeFPR8RegisterClass(Inst, Rt, Addr, Decoder);
-    break;
-  case ARM64::LDRBBro:
-  case ARM64::STRBBro:
-  case ARM64::LDRSBWro:
-    extend = (extendHi << 1) | extendLo;
-    DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder);
-    break;
-  case ARM64::LDRHHro:
-  case ARM64::STRHHro:
-  case ARM64::LDRSHWro:
-    extend = (extendHi << 1) | extendLo;
-    DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder);
-    break;
-  case ARM64::LDRSHXro:
-    extend = (extendHi << 1) | extendLo;
-    DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder);
-    break;
-  case ARM64::LDRSBXro:
-    extend = (extendHi << 1) | extendLo;
-    DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder);
-    break;
-  case ARM64::PRFMro:
-    extend = (extendHi << 1) | extendLo;
-    Inst.addOperand(MCOperand::CreateImm(Rt));
-  }
-
-  DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder);
-
-  if (extendHi == 0x3)
-    DecodeGPR64RegisterClass(Inst, Rm, Addr, Decoder);
-  else
-    DecodeGPR64RegisterClass(Inst, Rm, Addr, Decoder);
-
-  Inst.addOperand(MCOperand::CreateImm(extend));
-  return Success;
-}
-
-static DecodeStatus DecodeAddSubERegInstruction(llvm::MCInst &Inst,
-                                                uint32_t insn, uint64_t Addr,
-                                                const void *Decoder) {
-  unsigned Rd = fieldFromInstruction(insn, 0, 5);
-  unsigned Rn = fieldFromInstruction(insn, 5, 5);
-  unsigned Rm = fieldFromInstruction(insn, 16, 5);
-  unsigned extend = fieldFromInstruction(insn, 10, 6);
-
-  switch (Inst.getOpcode()) {
-  default:
-    return Fail;
-  case ARM64::ADDWrx:
-  case ARM64::SUBWrx:
-    DecodeGPR32spRegisterClass(Inst, Rd, Addr, Decoder);
-    DecodeGPR32spRegisterClass(Inst, Rn, Addr, Decoder);
-    DecodeGPR32RegisterClass(Inst, Rm, Addr, Decoder);
-    break;
-  case ARM64::ADDSWrx:
-  case ARM64::SUBSWrx:
-    DecodeGPR32RegisterClass(Inst, Rd, Addr, Decoder);
-    DecodeGPR32spRegisterClass(Inst, Rn, Addr, Decoder);
-    DecodeGPR32RegisterClass(Inst, Rm, Addr, Decoder);
-    break;
-  case ARM64::ADDXrx:
-  case ARM64::SUBXrx:
-    DecodeGPR64spRegisterClass(Inst, Rd, Addr, Decoder);
-    DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder);
-    DecodeGPR32RegisterClass(Inst, Rm, Addr, Decoder);
-    break;
-  case ARM64::ADDSXrx:
-  case ARM64::SUBSXrx:
-    DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder);
-    DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder);
-    DecodeGPR32RegisterClass(Inst, Rm, Addr, Decoder);
-    break;
-  case ARM64::ADDXrx64:
-  case ARM64::ADDSXrx64:
-  case ARM64::SUBXrx64:
-  case ARM64::SUBSXrx64:
-    DecodeGPR64spRegisterClass(Inst, Rd, Addr, Decoder);
-    DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder);
-    DecodeGPR64RegisterClass(Inst, Rm, Addr, Decoder);
-    break;
-  }
-
-  Inst.addOperand(MCOperand::CreateImm(extend));
-  return Success;
-}
-
-static DecodeStatus DecodeLogicalImmInstruction(llvm::MCInst &Inst,
-                                                uint32_t insn, uint64_t Addr,
-                                                const void *Decoder) {
-  unsigned Rd = fieldFromInstruction(insn, 0, 5);
-  unsigned Rn = fieldFromInstruction(insn, 5, 5);
-  unsigned Datasize = fieldFromInstruction(insn, 31, 1);
-  unsigned imm;
-
-  if (Datasize) {
-    DecodeGPR64spRegisterClass(Inst, Rd, Addr, Decoder);
-    DecodeGPR64RegisterClass(Inst, Rn, Addr, Decoder);
-    imm = fieldFromInstruction(insn, 10, 13);
-    if (!ARM64_AM::isValidDecodeLogicalImmediate(imm, 64))
-      return Fail;
-  } else {
-    DecodeGPR32RegisterClass(Inst, Rd, Addr, Decoder);
-    DecodeGPR32RegisterClass(Inst, Rn, Addr, Decoder);
-    imm = fieldFromInstruction(insn, 10, 12);
-    if (!ARM64_AM::isValidDecodeLogicalImmediate(imm, 32))
-      return Fail;
-  }
-  Inst.addOperand(MCOperand::CreateImm(imm));
-  return Success;
-}
-
-static DecodeStatus DecodeModImmInstruction(llvm::MCInst &Inst, uint32_t insn,
-                                            uint64_t Addr,
-                                            const void *Decoder) {
-  unsigned Rd = fieldFromInstruction(insn, 0, 5);
-  unsigned cmode = fieldFromInstruction(insn, 12, 4);
-  unsigned imm = fieldFromInstruction(insn, 16, 3) << 5;
-  imm |= fieldFromInstruction(insn, 5, 5);
-
-  if (Inst.getOpcode() == ARM64::MOVID)
-    DecodeFPR64RegisterClass(Inst, Rd, Addr, Decoder);
-  else
-    DecodeVectorRegisterClass(Inst, Rd, Addr, Decoder);
-
-  Inst.addOperand(MCOperand::CreateImm(imm));
-
-  switch (Inst.getOpcode()) {
-  default:
-    break;
-  case ARM64::MOVIv4i16:
-  case ARM64::MOVIv8i16:
-  case ARM64::MVNIv4i16:
-  case ARM64::MVNIv8i16:
-  case ARM64::MOVIv2i32:
-  case ARM64::MOVIv4i32:
-  case ARM64::MVNIv2i32:
-  case ARM64::MVNIv4i32:
-    Inst.addOperand(MCOperand::CreateImm((cmode & 6) << 2));
-    break;
-  case ARM64::MOVIv2s_msl:
-  case ARM64::MOVIv4s_msl:
-  case ARM64::MVNIv2s_msl:
-  case ARM64::MVNIv4s_msl:
-    Inst.addOperand(MCOperand::CreateImm(cmode & 1 ? 0x110 : 0x108));
-    break;
-  }
-
-  return Success;
-}
-
-static DecodeStatus DecodeModImmTiedInstruction(llvm::MCInst &Inst,
-                                                uint32_t insn, uint64_t Addr,
-                                                const void *Decoder) {
-  unsigned Rd = fieldFromInstruction(insn, 0, 5);
-  unsigned cmode = fieldFromInstruction(insn, 12, 4);
-  unsigned imm = fieldFromInstruction(insn, 16, 3) << 5;
-  imm |= fieldFromInstruction(insn, 5, 5);
-
-  // Tied operands added twice.
-  DecodeVectorRegisterClass(Inst, Rd, Addr, Decoder);
-  DecodeVectorRegisterClass(Inst, Rd, Addr, Decoder);
-
-  Inst.addOperand(MCOperand::CreateImm(imm));
-  Inst.addOperand(MCOperand::CreateImm((cmode & 6) << 2));
-
-  return Success;
-}
-
-static DecodeStatus DecodeAdrInstruction(llvm::MCInst &Inst, uint32_t insn,
-                                         uint64_t Addr, const void *Decoder) {
-  unsigned Rd = fieldFromInstruction(insn, 0, 5);
-  int64_t imm = fieldFromInstruction(insn, 5, 19) << 2;
-  imm |= fieldFromInstruction(insn, 29, 2);
-  const ARM64Disassembler *Dis =
-      static_cast<const ARM64Disassembler *>(Decoder);
-
-  // Sign-extend the 21-bit immediate.
-  if (imm & (1 << (21 - 1)))
-    imm |= ~((1LL << 21) - 1);
-
-  DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder);
-  if (!Dis->tryAddingSymbolicOperand(Addr, imm, Fail, 4, Inst, insn))
-    Inst.addOperand(MCOperand::CreateImm(imm));
-
-  return Success;
-}
-
-static DecodeStatus DecodeBaseAddSubImm(llvm::MCInst &Inst, uint32_t insn,
-                                        uint64_t Addr, const void *Decoder) {
-  unsigned Rd = fieldFromInstruction(insn, 0, 5);
-  unsigned Rn = fieldFromInstruction(insn, 5, 5);
-  unsigned Imm = fieldFromInstruction(insn, 10, 14);
-  unsigned S = fieldFromInstruction(insn, 29, 1);
-  unsigned Datasize = fieldFromInstruction(insn, 31, 1);
-
-  unsigned ShifterVal = (Imm >> 12) & 3;
-  unsigned ImmVal = Imm & 0xFFF;
-  const ARM64Disassembler *Dis =
-      static_cast<const ARM64Disassembler *>(Decoder);
-
-  if (ShifterVal != 0 && ShifterVal != 1)
-    return Fail;
-
-  if (Datasize) {
-    if (Rd == 31 && !S)
-      DecodeGPR64spRegisterClass(Inst, Rd, Addr, Decoder);
-    else
-      DecodeGPR64RegisterClass(Inst, Rd, Addr, Decoder);
-    DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder);
-  } else {
-    if (Rd == 31 && !S)
-      DecodeGPR32spRegisterClass(Inst, Rd, Addr, Decoder);
-    else
-      DecodeGPR32RegisterClass(Inst, Rd, Addr, Decoder);
-    DecodeGPR32spRegisterClass(Inst, Rn, Addr, Decoder);
-  }
-
-  if (!Dis->tryAddingSymbolicOperand(Addr, ImmVal, Fail, 4, Inst, insn))
-    Inst.addOperand(MCOperand::CreateImm(ImmVal));
-  Inst.addOperand(MCOperand::CreateImm(12 * ShifterVal));
-  return Success;
-}
-
-static DecodeStatus DecodeUnconditionalBranch(llvm::MCInst &Inst, uint32_t insn,
-                                              uint64_t Addr,
-                                              const void *Decoder) {
-  int64_t imm = fieldFromInstruction(insn, 0, 26);
-  const ARM64Disassembler *Dis =
-      static_cast<const ARM64Disassembler *>(Decoder);
-
-  // Sign-extend the 26-bit immediate.
-  if (imm & (1 << (26 - 1)))
-    imm |= ~((1LL << 26) - 1);
-
-  if (!Dis->tryAddingSymbolicOperand(Addr, imm << 2, true, 4, Inst))
-    Inst.addOperand(MCOperand::CreateImm(imm));
-
-  return Success;
-}
-
-static DecodeStatus DecodeSystemCPSRInstruction(llvm::MCInst &Inst,
-                                                uint32_t insn, uint64_t Addr,
-                                                const void *Decoder) {
-  uint64_t op1 = fieldFromInstruction(insn, 16, 3);
-  uint64_t op2 = fieldFromInstruction(insn, 5, 3);
-  uint64_t crm = fieldFromInstruction(insn, 8, 4);
-
-  Inst.addOperand(MCOperand::CreateImm((op1 << 3) | op2));
-  Inst.addOperand(MCOperand::CreateImm(crm));
-
-  return Success;
-}
-
-static DecodeStatus DecodeTestAndBranch(llvm::MCInst &Inst, uint32_t insn,
-                                        uint64_t Addr, const void *Decoder) {
-  uint64_t Rt = fieldFromInstruction(insn, 0, 5);
-  uint64_t bit = fieldFromInstruction(insn, 31, 1) << 5;
-  bit |= fieldFromInstruction(insn, 19, 5);
-  int64_t dst = fieldFromInstruction(insn, 5, 14);
-  const ARM64Disassembler *Dis =
-      static_cast<const ARM64Disassembler *>(Decoder);
-
-  // Sign-extend 14-bit immediate.
-  if (dst & (1 << (14 - 1)))
-    dst |= ~((1LL << 14) - 1);
-
-  DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder);
-  Inst.addOperand(MCOperand::CreateImm(bit));
-  if (!Dis->tryAddingSymbolicOperand(Addr, dst << 2, true, 4, Inst))
-    Inst.addOperand(MCOperand::CreateImm(dst));
-
-  return Success;
-}
-
-static DecodeStatus DecodeSIMDLdStPost(llvm::MCInst &Inst, uint32_t insn,
-                                       uint64_t Addr, const void *Decoder) {
-  uint64_t Rd = fieldFromInstruction(insn, 0, 5);
-  uint64_t Rn = fieldFromInstruction(insn, 5, 5);
-  uint64_t Rm = fieldFromInstruction(insn, 16, 5);
-
-  switch (Inst.getOpcode()) {
-  default:
-    return Fail;
-  case ARM64::ST1Onev8b_POST:
-  case ARM64::ST1Onev4h_POST:
-  case ARM64::ST1Onev2s_POST:
-  case ARM64::ST1Onev1d_POST:
-  case ARM64::LD1Onev8b_POST:
-  case ARM64::LD1Onev4h_POST:
-  case ARM64::LD1Onev2s_POST:
-  case ARM64::LD1Onev1d_POST:
-    DecodeFPR64RegisterClass(Inst, Rd, Addr, Decoder);
-    break;
-  case ARM64::ST1Onev16b_POST:
-  case ARM64::ST1Onev8h_POST:
-  case ARM64::ST1Onev4s_POST:
-  case ARM64::ST1Onev2d_POST:
-  case ARM64::LD1Onev16b_POST:
-  case ARM64::LD1Onev8h_POST:
-  case ARM64::LD1Onev4s_POST:
-  case ARM64::LD1Onev2d_POST:
-    DecodeFPR128RegisterClass(Inst, Rd, Addr, Decoder);
-    break;
-  case ARM64::ST1Twov8b_POST:
-  case ARM64::ST1Twov4h_POST:
-  case ARM64::ST1Twov2s_POST:
-  case ARM64::ST1Twov1d_POST:
-  case ARM64::ST2Twov8b_POST:
-  case ARM64::ST2Twov4h_POST:
-  case ARM64::ST2Twov2s_POST:
-  case ARM64::LD1Twov8b_POST:
-  case ARM64::LD1Twov4h_POST:
-  case ARM64::LD1Twov2s_POST:
-  case ARM64::LD1Twov1d_POST:
-  case ARM64::LD2Twov8b_POST:
-  case ARM64::LD2Twov4h_POST:
-  case ARM64::LD2Twov2s_POST:
-    DecodeDDRegisterClass(Inst, Rd, Addr, Decoder);
-    break;
-  case ARM64::ST1Threev8b_POST:
-  case ARM64::ST1Threev4h_POST:
-  case ARM64::ST1Threev2s_POST:
-  case ARM64::ST1Threev1d_POST:
-  case ARM64::ST3Threev8b_POST:
-  case ARM64::ST3Threev4h_POST:
-  case ARM64::ST3Threev2s_POST:
-  case ARM64::LD1Threev8b_POST:
-  case ARM64::LD1Threev4h_POST:
-  case ARM64::LD1Threev2s_POST:
-  case ARM64::LD1Threev1d_POST:
-  case ARM64::LD3Threev8b_POST:
-  case ARM64::LD3Threev4h_POST:
-  case ARM64::LD3Threev2s_POST:
-    DecodeDDDRegisterClass(Inst, Rd, Addr, Decoder);
-    break;
-  case ARM64::ST1Fourv8b_POST:
-  case ARM64::ST1Fourv4h_POST:
-  case ARM64::ST1Fourv2s_POST:
-  case ARM64::ST1Fourv1d_POST:
-  case ARM64::ST4Fourv8b_POST:
-  case ARM64::ST4Fourv4h_POST:
-  case ARM64::ST4Fourv2s_POST:
-  case ARM64::LD1Fourv8b_POST:
-  case ARM64::LD1Fourv4h_POST:
-  case ARM64::LD1Fourv2s_POST:
-  case ARM64::LD1Fourv1d_POST:
-  case ARM64::LD4Fourv8b_POST:
-  case ARM64::LD4Fourv4h_POST:
-  case ARM64::LD4Fourv2s_POST:
-    DecodeDDDDRegisterClass(Inst, Rd, Addr, Decoder);
-    break;
-  case ARM64::ST1Twov16b_POST:
-  case ARM64::ST1Twov8h_POST:
-  case ARM64::ST1Twov4s_POST:
-  case ARM64::ST1Twov2d_POST:
-  case ARM64::ST2Twov16b_POST:
-  case ARM64::ST2Twov8h_POST:
-  case ARM64::ST2Twov4s_POST:
-  case ARM64::ST2Twov2d_POST:
-  case ARM64::LD1Twov16b_POST:
-  case ARM64::LD1Twov8h_POST:
-  case ARM64::LD1Twov4s_POST:
-  case ARM64::LD1Twov2d_POST:
-  case ARM64::LD2Twov16b_POST:
-  case ARM64::LD2Twov8h_POST:
-  case ARM64::LD2Twov4s_POST:
-  case ARM64::LD2Twov2d_POST:
-    DecodeQQRegisterClass(Inst, Rd, Addr, Decoder);
-    break;
-  case ARM64::ST1Threev16b_POST:
-  case ARM64::ST1Threev8h_POST:
-  case ARM64::ST1Threev4s_POST:
-  case ARM64::ST1Threev2d_POST:
-  case ARM64::ST3Threev16b_POST:
-  case ARM64::ST3Threev8h_POST:
-  case ARM64::ST3Threev4s_POST:
-  case ARM64::ST3Threev2d_POST:
-  case ARM64::LD1Threev16b_POST:
-  case ARM64::LD1Threev8h_POST:
-  case ARM64::LD1Threev4s_POST:
-  case ARM64::LD1Threev2d_POST:
-  case ARM64::LD3Threev16b_POST:
-  case ARM64::LD3Threev8h_POST:
-  case ARM64::LD3Threev4s_POST:
-  case ARM64::LD3Threev2d_POST:
-    DecodeQQQRegisterClass(Inst, Rd, Addr, Decoder);
-    break;
-  case ARM64::ST1Fourv16b_POST:
-  case ARM64::ST1Fourv8h_POST:
-  case ARM64::ST1Fourv4s_POST:
-  case ARM64::ST1Fourv2d_POST:
-  case ARM64::ST4Fourv16b_POST:
-  case ARM64::ST4Fourv8h_POST:
-  case ARM64::ST4Fourv4s_POST:
-  case ARM64::ST4Fourv2d_POST:
-  case ARM64::LD1Fourv16b_POST:
-  case ARM64::LD1Fourv8h_POST:
-  case ARM64::LD1Fourv4s_POST:
-  case ARM64::LD1Fourv2d_POST:
-  case ARM64::LD4Fourv16b_POST:
-  case ARM64::LD4Fourv8h_POST:
-  case ARM64::LD4Fourv4s_POST:
-  case ARM64::LD4Fourv2d_POST:
-    DecodeQQQQRegisterClass(Inst, Rd, Addr, Decoder);
-    break;
-  }
-
-  DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder);
-  DecodeGPR64RegisterClass(Inst, Rm, Addr, Decoder);
-  return Success;
-}
-
-static DecodeStatus DecodeSIMDLdStSingle(llvm::MCInst &Inst, uint32_t insn,
-                                         uint64_t Addr, const void *Decoder) {
-  uint64_t Rt = fieldFromInstruction(insn, 0, 5);
-  uint64_t Rn = fieldFromInstruction(insn, 5, 5);
-  uint64_t Rm = fieldFromInstruction(insn, 16, 5);
-  uint64_t size = fieldFromInstruction(insn, 10, 2);
-  uint64_t S = fieldFromInstruction(insn, 12, 1);
-  uint64_t Q = fieldFromInstruction(insn, 30, 1);
-  uint64_t index = 0;
-
-  switch (Inst.getOpcode()) {
-  case ARM64::ST1i8:
-  case ARM64::ST1i8_POST:
-  case ARM64::ST2i8:
-  case ARM64::ST2i8_POST:
-  case ARM64::ST3i8_POST:
-  case ARM64::ST3i8:
-  case ARM64::ST4i8_POST:
-  case ARM64::ST4i8:
-    index = (Q << 3) | (S << 2) | size;
-    break;
-  case ARM64::ST1i16:
-  case ARM64::ST1i16_POST:
-  case ARM64::ST2i16:
-  case ARM64::ST2i16_POST:
-  case ARM64::ST3i16_POST:
-  case ARM64::ST3i16:
-  case ARM64::ST4i16_POST:
-  case ARM64::ST4i16:
-    index = (Q << 2) | (S << 1) | (size >> 1);
-    break;
-  case ARM64::ST1i32:
-  case ARM64::ST1i32_POST:
-  case ARM64::ST2i32:
-  case ARM64::ST2i32_POST:
-  case ARM64::ST3i32_POST:
-  case ARM64::ST3i32:
-  case ARM64::ST4i32_POST:
-  case ARM64::ST4i32:
-    index = (Q << 1) | S;
-    break;
-  case ARM64::ST1i64:
-  case ARM64::ST1i64_POST:
-  case ARM64::ST2i64:
-  case ARM64::ST2i64_POST:
-  case ARM64::ST3i64_POST:
-  case ARM64::ST3i64:
-  case ARM64::ST4i64_POST:
-  case ARM64::ST4i64:
-    index = Q;
-    break;
-  }
-
-  switch (Inst.getOpcode()) {
-  default:
-    return Fail;
-  case ARM64::LD1Rv8b:
-  case ARM64::LD1Rv8b_POST:
-  case ARM64::LD1Rv4h:
-  case ARM64::LD1Rv4h_POST:
-  case ARM64::LD1Rv2s:
-  case ARM64::LD1Rv2s_POST:
-  case ARM64::LD1Rv1d:
-  case ARM64::LD1Rv1d_POST:
-    DecodeFPR64RegisterClass(Inst, Rt, Addr, Decoder);
-    break;
-  case ARM64::LD1Rv16b:
-  case ARM64::LD1Rv16b_POST:
-  case ARM64::LD1Rv8h:
-  case ARM64::LD1Rv8h_POST:
-  case ARM64::LD1Rv4s:
-  case ARM64::LD1Rv4s_POST:
-  case ARM64::LD1Rv2d:
-  case ARM64::LD1Rv2d_POST:
-  case ARM64::ST1i8:
-  case ARM64::ST1i8_POST:
-  case ARM64::ST1i16:
-  case ARM64::ST1i16_POST:
-  case ARM64::ST1i32:
-  case ARM64::ST1i32_POST:
-  case ARM64::ST1i64:
-  case ARM64::ST1i64_POST:
-    DecodeFPR128RegisterClass(Inst, Rt, Addr, Decoder);
-    break;
-  case ARM64::LD2Rv16b:
-  case ARM64::LD2Rv16b_POST:
-  case ARM64::LD2Rv8h:
-  case ARM64::LD2Rv8h_POST:
-  case ARM64::LD2Rv4s:
-  case ARM64::LD2Rv4s_POST:
-  case ARM64::LD2Rv2d:
-  case ARM64::LD2Rv2d_POST:
-  case ARM64::ST2i8:
-  case ARM64::ST2i8_POST:
-  case ARM64::ST2i16:
-  case ARM64::ST2i16_POST:
-  case ARM64::ST2i32:
-  case ARM64::ST2i32_POST:
-  case ARM64::ST2i64:
-  case ARM64::ST2i64_POST:
-    DecodeQQRegisterClass(Inst, Rt, Addr, Decoder);
-    break;
-  case ARM64::LD2Rv8b:
-  case ARM64::LD2Rv8b_POST:
-  case ARM64::LD2Rv4h:
-  case ARM64::LD2Rv4h_POST:
-  case ARM64::LD2Rv2s:
-  case ARM64::LD2Rv2s_POST:
-  case ARM64::LD2Rv1d:
-  case ARM64::LD2Rv1d_POST:
-    DecodeDDRegisterClass(Inst, Rt, Addr, Decoder);
-    break;
-  case ARM64::LD3Rv8b:
-  case ARM64::LD3Rv8b_POST:
-  case ARM64::LD3Rv4h:
-  case ARM64::LD3Rv4h_POST:
-  case ARM64::LD3Rv2s:
-  case ARM64::LD3Rv2s_POST:
-  case ARM64::LD3Rv1d:
-  case ARM64::LD3Rv1d_POST:
-    DecodeDDDRegisterClass(Inst, Rt, Addr, Decoder);
-    break;
-  case ARM64::LD3Rv16b:
-  case ARM64::LD3Rv16b_POST:
-  case ARM64::LD3Rv8h:
-  case ARM64::LD3Rv8h_POST:
-  case ARM64::LD3Rv4s:
-  case ARM64::LD3Rv4s_POST:
-  case ARM64::LD3Rv2d:
-  case ARM64::LD3Rv2d_POST:
-  case ARM64::ST3i8:
-  case ARM64::ST3i8_POST:
-  case ARM64::ST3i16:
-  case ARM64::ST3i16_POST:
-  case ARM64::ST3i32:
-  case ARM64::ST3i32_POST:
-  case ARM64::ST3i64:
-  case ARM64::ST3i64_POST:
-    DecodeQQQRegisterClass(Inst, Rt, Addr, Decoder);
-    break;
-  case ARM64::LD4Rv8b:
-  case ARM64::LD4Rv8b_POST:
-  case ARM64::LD4Rv4h:
-  case ARM64::LD4Rv4h_POST:
-  case ARM64::LD4Rv2s:
-  case ARM64::LD4Rv2s_POST:
-  case ARM64::LD4Rv1d:
-  case ARM64::LD4Rv1d_POST:
-    DecodeDDDDRegisterClass(Inst, Rt, Addr, Decoder);
-    break;
-  case ARM64::LD4Rv16b:
-  case ARM64::LD4Rv16b_POST:
-  case ARM64::LD4Rv8h:
-  case ARM64::LD4Rv8h_POST:
-  case ARM64::LD4Rv4s:
-  case ARM64::LD4Rv4s_POST:
-  case ARM64::LD4Rv2d:
-  case ARM64::LD4Rv2d_POST:
-  case ARM64::ST4i8:
-  case ARM64::ST4i8_POST:
-  case ARM64::ST4i16:
-  case ARM64::ST4i16_POST:
-  case ARM64::ST4i32:
-  case ARM64::ST4i32_POST:
-  case ARM64::ST4i64:
-  case ARM64::ST4i64_POST:
-    DecodeQQQQRegisterClass(Inst, Rt, Addr, Decoder);
-    break;
-  }
-
-  switch (Inst.getOpcode()) {
-  case ARM64::LD1Rv8b:
-  case ARM64::LD1Rv8b_POST:
-  case ARM64::LD1Rv16b:
-  case ARM64::LD1Rv16b_POST:
-  case ARM64::LD1Rv4h:
-  case ARM64::LD1Rv4h_POST:
-  case ARM64::LD1Rv8h:
-  case ARM64::LD1Rv8h_POST:
-  case ARM64::LD1Rv4s:
-  case ARM64::LD1Rv4s_POST:
-  case ARM64::LD1Rv2s:
-  case ARM64::LD1Rv2s_POST:
-  case ARM64::LD1Rv1d:
-  case ARM64::LD1Rv1d_POST:
-  case ARM64::LD1Rv2d:
-  case ARM64::LD1Rv2d_POST:
-  case ARM64::LD2Rv8b:
-  case ARM64::LD2Rv8b_POST:
-  case ARM64::LD2Rv16b:
-  case ARM64::LD2Rv16b_POST:
-  case ARM64::LD2Rv4h:
-  case ARM64::LD2Rv4h_POST:
-  case ARM64::LD2Rv8h:
-  case ARM64::LD2Rv8h_POST:
-  case ARM64::LD2Rv2s:
-  case ARM64::LD2Rv2s_POST:
-  case ARM64::LD2Rv4s:
-  case ARM64::LD2Rv4s_POST:
-  case ARM64::LD2Rv2d:
-  case ARM64::LD2Rv2d_POST:
-  case ARM64::LD2Rv1d:
-  case ARM64::LD2Rv1d_POST:
-  case ARM64::LD3Rv8b:
-  case ARM64::LD3Rv8b_POST:
-  case ARM64::LD3Rv16b:
-  case ARM64::LD3Rv16b_POST:
-  case ARM64::LD3Rv4h:
-  case ARM64::LD3Rv4h_POST:
-  case ARM64::LD3Rv8h:
-  case ARM64::LD3Rv8h_POST:
-  case ARM64::LD3Rv2s:
-  case ARM64::LD3Rv2s_POST:
-  case ARM64::LD3Rv4s:
-  case ARM64::LD3Rv4s_POST:
-  case ARM64::LD3Rv2d:
-  case ARM64::LD3Rv2d_POST:
-  case ARM64::LD3Rv1d:
-  case ARM64::LD3Rv1d_POST:
-  case ARM64::LD4Rv8b:
-  case ARM64::LD4Rv8b_POST:
-  case ARM64::LD4Rv16b:
-  case ARM64::LD4Rv16b_POST:
-  case ARM64::LD4Rv4h:
-  case ARM64::LD4Rv4h_POST:
-  case ARM64::LD4Rv8h:
-  case ARM64::LD4Rv8h_POST:
-  case ARM64::LD4Rv2s:
-  case ARM64::LD4Rv2s_POST:
-  case ARM64::LD4Rv4s:
-  case ARM64::LD4Rv4s_POST:
-  case ARM64::LD4Rv2d:
-  case ARM64::LD4Rv2d_POST:
-  case ARM64::LD4Rv1d:
-  case ARM64::LD4Rv1d_POST:
-    break;
-  default:
-    Inst.addOperand(MCOperand::CreateImm(index));
-  }
-
-  DecodeGPR64RegisterClass(Inst, Rn, Addr, Decoder);
-
-  switch (Inst.getOpcode()) {
-  case ARM64::ST1i8_POST:
-  case ARM64::ST1i16_POST:
-  case ARM64::ST1i32_POST:
-  case ARM64::ST1i64_POST:
-  case ARM64::LD1Rv8b_POST:
-  case ARM64::LD1Rv16b_POST:
-  case ARM64::LD1Rv4h_POST:
-  case ARM64::LD1Rv8h_POST:
-  case ARM64::LD1Rv2s_POST:
-  case ARM64::LD1Rv4s_POST:
-  case ARM64::LD1Rv1d_POST:
-  case ARM64::LD1Rv2d_POST:
-  case ARM64::ST2i8_POST:
-  case ARM64::ST2i16_POST:
-  case ARM64::ST2i32_POST:
-  case ARM64::ST2i64_POST:
-  case ARM64::LD2Rv8b_POST:
-  case ARM64::LD2Rv16b_POST:
-  case ARM64::LD2Rv4h_POST:
-  case ARM64::LD2Rv8h_POST:
-  case ARM64::LD2Rv2s_POST:
-  case ARM64::LD2Rv4s_POST:
-  case ARM64::LD2Rv2d_POST:
-  case ARM64::LD2Rv1d_POST:
-  case ARM64::ST3i8_POST:
-  case ARM64::ST3i16_POST:
-  case ARM64::ST3i32_POST:
-  case ARM64::ST3i64_POST:
-  case ARM64::LD3Rv8b_POST:
-  case ARM64::LD3Rv16b_POST:
-  case ARM64::LD3Rv4h_POST:
-  case ARM64::LD3Rv8h_POST:
-  case ARM64::LD3Rv2s_POST:
-  case ARM64::LD3Rv4s_POST:
-  case ARM64::LD3Rv2d_POST:
-  case ARM64::LD3Rv1d_POST:
-  case ARM64::ST4i8_POST:
-  case ARM64::ST4i16_POST:
-  case ARM64::ST4i32_POST:
-  case ARM64::ST4i64_POST:
-  case ARM64::LD4Rv8b_POST:
-  case ARM64::LD4Rv16b_POST:
-  case ARM64::LD4Rv4h_POST:
-  case ARM64::LD4Rv8h_POST:
-  case ARM64::LD4Rv2s_POST:
-  case ARM64::LD4Rv4s_POST:
-  case ARM64::LD4Rv2d_POST:
-  case ARM64::LD4Rv1d_POST:
-    DecodeGPR64RegisterClass(Inst, Rm, Addr, Decoder);
-    break;
-  }
-  return Success;
-}
-
-static DecodeStatus DecodeSIMDLdStSingleTied(llvm::MCInst &Inst, uint32_t insn,
-                                             uint64_t Addr,
-                                             const void *Decoder) {
-  uint64_t Rt = fieldFromInstruction(insn, 0, 5);
-  uint64_t Rn = fieldFromInstruction(insn, 5, 5);
-  uint64_t Rm = fieldFromInstruction(insn, 16, 5);
-  uint64_t size = fieldFromInstruction(insn, 10, 2);
-  uint64_t S = fieldFromInstruction(insn, 12, 1);
-  uint64_t Q = fieldFromInstruction(insn, 30, 1);
-  uint64_t index = 0;
-
-  switch (Inst.getOpcode()) {
-  case ARM64::LD1i8:
-  case ARM64::LD1i8_POST:
-  case ARM64::LD2i8:
-  case ARM64::LD2i8_POST:
-  case ARM64::LD3i8_POST:
-  case ARM64::LD3i8:
-  case ARM64::LD4i8_POST:
-  case ARM64::LD4i8:
-    index = (Q << 3) | (S << 2) | size;
-    break;
-  case ARM64::LD1i16:
-  case ARM64::LD1i16_POST:
-  case ARM64::LD2i16:
-  case ARM64::LD2i16_POST:
-  case ARM64::LD3i16_POST:
-  case ARM64::LD3i16:
-  case ARM64::LD4i16_POST:
-  case ARM64::LD4i16:
-    index = (Q << 2) | (S << 1) | (size >> 1);
-    break;
-  case ARM64::LD1i32:
-  case ARM64::LD1i32_POST:
-  case ARM64::LD2i32:
-  case ARM64::LD2i32_POST:
-  case ARM64::LD3i32_POST:
-  case ARM64::LD3i32:
-  case ARM64::LD4i32_POST:
-  case ARM64::LD4i32:
-    index = (Q << 1) | S;
-    break;
-  case ARM64::LD1i64:
-  case ARM64::LD1i64_POST:
-  case ARM64::LD2i64:
-  case ARM64::LD2i64_POST:
-  case ARM64::LD3i64_POST:
-  case ARM64::LD3i64:
-  case ARM64::LD4i64_POST:
-  case ARM64::LD4i64:
-    index = Q;
-    break;
-  }
-
-  switch (Inst.getOpcode()) {
-  default:
-    return Fail;
-  case ARM64::LD1i8:
-  case ARM64::LD1i8_POST:
-  case ARM64::LD1i16:
-  case ARM64::LD1i16_POST:
-  case ARM64::LD1i32:
-  case ARM64::LD1i32_POST:
-  case ARM64::LD1i64:
-  case ARM64::LD1i64_POST:
-    DecodeFPR128RegisterClass(Inst, Rt, Addr, Decoder);
-    DecodeFPR128RegisterClass(Inst, Rt, Addr, Decoder);
-    break;
-  case ARM64::LD2i8:
-  case ARM64::LD2i8_POST:
-  case ARM64::LD2i16:
-  case ARM64::LD2i16_POST:
-  case ARM64::LD2i32:
-  case ARM64::LD2i32_POST:
-  case ARM64::LD2i64:
-  case ARM64::LD2i64_POST:
-    DecodeQQRegisterClass(Inst, Rt, Addr, Decoder);
-    DecodeQQRegisterClass(Inst, Rt, Addr, Decoder);
-    break;
-  case ARM64::LD3i8:
-  case ARM64::LD3i8_POST:
-  case ARM64::LD3i16:
-  case ARM64::LD3i16_POST:
-  case ARM64::LD3i32:
-  case ARM64::LD3i32_POST:
-  case ARM64::LD3i64:
-  case ARM64::LD3i64_POST:
-    DecodeQQQRegisterClass(Inst, Rt, Addr, Decoder);
-    DecodeQQQRegisterClass(Inst, Rt, Addr, Decoder);
-    break;
-  case ARM64::LD4i8:
-  case ARM64::LD4i8_POST:
-  case ARM64::LD4i16:
-  case ARM64::LD4i16_POST:
-  case ARM64::LD4i32:
-  case ARM64::LD4i32_POST:
-  case ARM64::LD4i64:
-  case ARM64::LD4i64_POST:
-    DecodeQQQQRegisterClass(Inst, Rt, Addr, Decoder);
-    DecodeQQQQRegisterClass(Inst, Rt, Addr, Decoder);
-    break;
-  }
-
-  Inst.addOperand(MCOperand::CreateImm(index));
-  DecodeGPR64RegisterClass(Inst, Rn, Addr, Decoder);
-
-  switch (Inst.getOpcode()) {
-  case ARM64::LD1i8_POST:
-  case ARM64::LD1i16_POST:
-  case ARM64::LD1i32_POST:
-  case ARM64::LD1i64_POST:
-  case ARM64::LD2i8_POST:
-  case ARM64::LD2i16_POST:
-  case ARM64::LD2i32_POST:
-  case ARM64::LD2i64_POST:
-  case ARM64::LD3i8_POST:
-  case ARM64::LD3i16_POST:
-  case ARM64::LD3i32_POST:
-  case ARM64::LD3i64_POST:
-  case ARM64::LD4i8_POST:
-  case ARM64::LD4i16_POST:
-  case ARM64::LD4i32_POST:
-  case ARM64::LD4i64_POST:
-    DecodeGPR64RegisterClass(Inst, Rm, Addr, Decoder);
-    break;
-  }
-  return Success;
-}
diff --git a/lib/Target/ARM64/Disassembler/ARM64Disassembler.h b/lib/Target/ARM64/Disassembler/ARM64Disassembler.h
deleted file mode 100644
index 35efc8d..0000000
--- a/lib/Target/ARM64/Disassembler/ARM64Disassembler.h
+++ /dev/null
@@ -1,54 +0,0 @@
-//===- ARM64Disassembler.h - Disassembler for ARM64 -------------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef ARM64DISASSEMBLER_H
-#define ARM64DISASSEMBLER_H
-
-#include "llvm/MC/MCDisassembler.h"
-
-namespace llvm {
-
-class MCInst;
-class MemoryObject;
-class raw_ostream;
-
-class ARM64Disassembler : public MCDisassembler {
-public:
-  ARM64Disassembler(const MCSubtargetInfo &STI) : MCDisassembler(STI) {}
-
-  ~ARM64Disassembler() {}
-
-  /// getInstruction - See MCDisassembler.
-  MCDisassembler::DecodeStatus getInstruction(MCInst &instr, uint64_t &size,
-                                              const MemoryObject &region,
-                                              uint64_t address,
-                                              raw_ostream &vStream,
-                                              raw_ostream &cStream) const;
-
-  /// tryAddingSymbolicOperand - tryAddingSymbolicOperand trys to add a symbolic
-  /// operand in place of the immediate Value in the MCInst.  The immediate
-  /// Value has not had any PC adjustment made by the caller. If the instruction
-  /// adds the PC to the immediate Value then InstsAddsAddressToValue is true,
-  /// else false.  If the getOpInfo() function was set as part of the
-  /// setupForSymbolicDisassembly() call then that function is called to get any
-  /// symbolic information at the Address for this instrution.  If that returns
-  /// non-zero then the symbolic information it returns is used to create an
-  /// MCExpr and that is added as an operand to the MCInst.  This function
-  /// returns true if it adds an operand to the MCInst and false otherwise.
-  bool tryAddingSymbolicOperand(uint64_t Address, int Value,
-                                bool InstsAddsAddressToValue, uint64_t InstSize,
-                                MCInst &MI, uint32_t insn = 0) const;
-};
-
-} // namespace llvm
-
-#endif
diff --git a/lib/Target/ARM64/Disassembler/CMakeLists.txt b/lib/Target/ARM64/Disassembler/CMakeLists.txt
deleted file mode 100644
index ad998c2..0000000
--- a/lib/Target/ARM64/Disassembler/CMakeLists.txt
+++ /dev/null
@@ -1,13 +0,0 @@
-include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
-
-add_llvm_library(LLVMARM64Disassembler
-  ARM64Disassembler.cpp
-  )
-# workaround for hanging compilation on MSVC8, 9 and 10
-#if( MSVC_VERSION EQUAL 1400 OR MSVC_VERSION EQUAL 1500 OR MSVC_VERSION EQUAL 1600 )
-#set_property(
-#  SOURCE ARMDisassembler.cpp
-#  PROPERTY COMPILE_FLAGS "/Od"
-#  )
-#endif()
-add_dependencies(LLVMARM64Disassembler ARM64CommonTableGen)
diff --git a/lib/Target/ARM64/Disassembler/LLVMBuild.txt b/lib/Target/ARM64/Disassembler/LLVMBuild.txt
deleted file mode 100644
index 5935ee6..0000000
--- a/lib/Target/ARM64/Disassembler/LLVMBuild.txt
+++ /dev/null
@@ -1,24 +0,0 @@
-;===- ./lib/Target/ARM64/Disassembler/LLVMBuild.txt ------------*- Conf -*--===;
-;
-;                     The LLVM Compiler Infrastructure
-;
-; This file is distributed under the University of Illinois Open Source
-; License. See LICENSE.TXT for details.
-;
-;===------------------------------------------------------------------------===;
-;
-; This is an LLVMBuild description file for the components in this subdirectory.
-;
-; For more information on the LLVMBuild system, please see:
-;
-;   http://llvm.org/docs/LLVMBuild.html
-;
-;===------------------------------------------------------------------------===;
-
-[component_0]
-type = Library
-name = ARM64Disassembler
-parent = ARM64
-required_libraries = ARM64Desc ARM64Info MC Support
-add_to_library_groups = ARM64
-
diff --git a/lib/Target/ARM64/Disassembler/Makefile b/lib/Target/ARM64/Disassembler/Makefile
deleted file mode 100644
index 479d00c..0000000
--- a/lib/Target/ARM64/Disassembler/Makefile
+++ /dev/null
@@ -1,16 +0,0 @@
-##===- lib/Target/ARM64/Disassembler/Makefile --------------*- Makefile -*-===##
-#
-#                     The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../../..
-LIBRARYNAME = LLVMARM64Disassembler
-
-# Hack: we need to include 'main' arm target directory to grab private headers
-CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/ARM64/InstPrinter/ARM64InstPrinter.cpp b/lib/Target/ARM64/InstPrinter/ARM64InstPrinter.cpp
deleted file mode 100644
index bb90707..0000000
--- a/lib/Target/ARM64/InstPrinter/ARM64InstPrinter.cpp
+++ /dev/null
@@ -1,1428 +0,0 @@
-//===-- ARM64InstPrinter.cpp - Convert ARM64 MCInst to assembly syntax ----===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This class prints an ARM64 MCInst to a .s file.
-//
-//===----------------------------------------------------------------------===//
-
-#define DEBUG_TYPE "asm-printer"
-#include "ARM64InstPrinter.h"
-#include "MCTargetDesc/ARM64AddressingModes.h"
-#include "MCTargetDesc/ARM64BaseInfo.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/MC/MCInst.h"
-#include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCRegisterInfo.h"
-#include "llvm/Support/Format.h"
-#include "llvm/Support/raw_ostream.h"
-using namespace llvm;
-
-#define GET_INSTRUCTION_NAME
-#define PRINT_ALIAS_INSTR
-#include "ARM64GenAsmWriter.inc"
-#define GET_INSTRUCTION_NAME
-#define PRINT_ALIAS_INSTR
-#include "ARM64GenAsmWriter1.inc"
-
-ARM64InstPrinter::ARM64InstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
-                                   const MCRegisterInfo &MRI,
-                                   const MCSubtargetInfo &STI)
-    : MCInstPrinter(MAI, MII, MRI) {
-  // Initialize the set of available features.
-  setAvailableFeatures(STI.getFeatureBits());
-}
-
-ARM64AppleInstPrinter::ARM64AppleInstPrinter(const MCAsmInfo &MAI,
-                                             const MCInstrInfo &MII,
-                                             const MCRegisterInfo &MRI,
-                                             const MCSubtargetInfo &STI)
-    : ARM64InstPrinter(MAI, MII, MRI, STI) {}
-
-void ARM64InstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
-  // This is for .cfi directives.
-  OS << getRegisterName(RegNo);
-}
-
-void ARM64InstPrinter::printInst(const MCInst *MI, raw_ostream &O,
-                                 StringRef Annot) {
-  // Check for special encodings and print the cannonical alias instead.
-
-  unsigned Opcode = MI->getOpcode();
-
-  if (Opcode == ARM64::SYS || Opcode == ARM64::SYSxt)
-    if (printSysAlias(MI, O)) {
-      printAnnotation(O, Annot);
-      return;
-    }
-
-  // TBZ/TBNZ should print the register operand as a Wreg if the bit
-  // number is < 32.
-  if ((Opcode == ARM64::TBNZ || Opcode == ARM64::TBZ) &&
-      MI->getOperand(1).getImm() < 32) {
-    MCInst newMI = *MI;
-    unsigned Reg = MI->getOperand(0).getReg();
-    newMI.getOperand(0).setReg(getWRegFromXReg(Reg));
-    printInstruction(&newMI, O);
-    printAnnotation(O, Annot);
-    return;
-  }
-
-  // SBFM/UBFM should print to a nicer aliased form if possible.
-  if (Opcode == ARM64::SBFMXri || Opcode == ARM64::SBFMWri ||
-      Opcode == ARM64::UBFMXri || Opcode == ARM64::UBFMWri) {
-    const MCOperand &Op0 = MI->getOperand(0);
-    const MCOperand &Op1 = MI->getOperand(1);
-    const MCOperand &Op2 = MI->getOperand(2);
-    const MCOperand &Op3 = MI->getOperand(3);
-
-    if (Op2.isImm() && Op2.getImm() == 0 && Op3.isImm()) {
-      bool IsSigned = (Opcode == ARM64::SBFMXri || Opcode == ARM64::SBFMWri);
-      const char *AsmMnemonic = 0;
-
-      switch (Op3.getImm()) {
-      default:
-        break;
-      case 7:
-        AsmMnemonic = IsSigned ? "sxtb" : "uxtb";
-        break;
-      case 15:
-        AsmMnemonic = IsSigned ? "sxth" : "uxth";
-        break;
-      case 31:
-        AsmMnemonic = IsSigned ? "sxtw" : "uxtw";
-        break;
-      }
-
-      if (AsmMnemonic) {
-        O << '\t' << AsmMnemonic << '\t' << getRegisterName(Op0.getReg())
-          << ", " << getRegisterName(Op1.getReg());
-        printAnnotation(O, Annot);
-        return;
-      }
-    }
-
-    // All immediate shifts are aliases, implemented using the Bitfield
-    // instruction. In all cases the immediate shift amount shift must be in
-    // the range 0 to (reg.size -1).
-    if (Op2.isImm() && Op3.isImm()) {
-      const char *AsmMnemonic = 0;
-      int shift = 0;
-      int64_t immr = Op2.getImm();
-      int64_t imms = Op3.getImm();
-      if (Opcode == ARM64::UBFMWri && imms != 0x1F && ((imms + 1) == immr)) {
-        AsmMnemonic = "lsl";
-        shift = 31 - imms;
-      } else if (Opcode == ARM64::UBFMXri && imms != 0x3f &&
-                 ((imms + 1 == immr))) {
-        AsmMnemonic = "lsl";
-        shift = 63 - imms;
-      } else if (Opcode == ARM64::UBFMWri && imms == 0x1f) {
-        AsmMnemonic = "lsr";
-        shift = immr;
-      } else if (Opcode == ARM64::UBFMXri && imms == 0x3f) {
-        AsmMnemonic = "lsr";
-        shift = immr;
-      } else if (Opcode == ARM64::SBFMWri && imms == 0x1f) {
-        AsmMnemonic = "asr";
-        shift = immr;
-      } else if (Opcode == ARM64::SBFMXri && imms == 0x3f) {
-        AsmMnemonic = "asr";
-        shift = immr;
-      }
-      if (AsmMnemonic) {
-        O << '\t' << AsmMnemonic << '\t' << getRegisterName(Op0.getReg())
-          << ", " << getRegisterName(Op1.getReg()) << ", #" << shift;
-        printAnnotation(O, Annot);
-        return;
-      }
-    }
-  }
-
-  // Symbolic operands for MOVZ, MOVN and MOVK already imply a shift
-  // (e.g. :gottprel_g1: is always going to be "lsl #16") so it should not be
-  // printed.
-  if ((Opcode == ARM64::MOVZXi || Opcode == ARM64::MOVZWi ||
-       Opcode == ARM64::MOVNXi || Opcode == ARM64::MOVNWi) &&
-      MI->getOperand(1).isExpr()) {
-    if (Opcode == ARM64::MOVZXi || Opcode == ARM64::MOVZWi)
-      O << "\tmovz\t";
-    else
-      O << "\tmovn\t";
-
-    O << getRegisterName(MI->getOperand(0).getReg()) << ", #"
-      << *MI->getOperand(1).getExpr();
-    return;
-  }
-
-  if ((Opcode == ARM64::MOVKXi || Opcode == ARM64::MOVKWi) &&
-      MI->getOperand(2).isExpr()) {
-    O << "\tmovk\t" << getRegisterName(MI->getOperand(0).getReg()) << ", #"
-      << *MI->getOperand(2).getExpr();
-    return;
-  }
-
-  // ANDS WZR, Wn, #imm ==> TST Wn, #imm
-  // ANDS XZR, Xn, #imm ==> TST Xn, #imm
-  if (Opcode == ARM64::ANDSWri && MI->getOperand(0).getReg() == ARM64::WZR) {
-    O << "\ttst\t" << getRegisterName(MI->getOperand(1).getReg()) << ", ";
-    printLogicalImm32(MI, 2, O);
-    return;
-  }
-  if (Opcode == ARM64::ANDSXri && MI->getOperand(0).getReg() == ARM64::XZR) {
-    O << "\ttst\t" << getRegisterName(MI->getOperand(1).getReg()) << ", ";
-    printLogicalImm64(MI, 2, O);
-    return;
-  }
-  // ANDS WZR, Wn, Wm{, lshift #imm} ==> TST Wn{, lshift #imm}
-  // ANDS XZR, Xn, Xm{, lshift #imm} ==> TST Xn{, lshift #imm}
-  if ((Opcode == ARM64::ANDSWrs && MI->getOperand(0).getReg() == ARM64::WZR) ||
-      (Opcode == ARM64::ANDSXrs && MI->getOperand(0).getReg() == ARM64::XZR)) {
-    O << "\ttst\t" << getRegisterName(MI->getOperand(1).getReg()) << ", ";
-    printShiftedRegister(MI, 2, O);
-    return;
-  }
-
-  // SUBS WZR, Wn, #imm ==> CMP Wn, #imm
-  // SUBS XZR, Xn, #imm ==> CMP Xn, #imm
-  if ((Opcode == ARM64::SUBSWri && MI->getOperand(0).getReg() == ARM64::WZR) ||
-      (Opcode == ARM64::SUBSXri && MI->getOperand(0).getReg() == ARM64::XZR)) {
-    O << "\tcmp\t" << getRegisterName(MI->getOperand(1).getReg()) << ", ";
-    printAddSubImm(MI, 2, O);
-    return;
-  }
-  // SUBS WZR, Wn, Wm{, lshift #imm} ==> CMP Wn, Wm{, lshift #imm}
-  // SUBS XZR, Xn, Xm{, lshift #imm} ==> CMP Xn, Xm{, lshift #imm}
-  if ((Opcode == ARM64::SUBSWrs && MI->getOperand(0).getReg() == ARM64::WZR) ||
-      (Opcode == ARM64::SUBSXrs && MI->getOperand(0).getReg() == ARM64::XZR)) {
-    O << "\tcmp\t" << getRegisterName(MI->getOperand(1).getReg()) << ", ";
-    printShiftedRegister(MI, 2, O);
-    return;
-  }
-  // SUBS XZR, Xn, Wm, uxtb #imm ==> CMP Xn, uxtb #imm
-  // SUBS WZR, Wn, Xm, uxtb #imm ==> CMP Wn, uxtb #imm
-  if ((Opcode == ARM64::SUBSXrx && MI->getOperand(0).getReg() == ARM64::XZR) ||
-      (Opcode == ARM64::SUBSWrx && MI->getOperand(0).getReg() == ARM64::WZR)) {
-    O << "\tcmp\t" << getRegisterName(MI->getOperand(1).getReg()) << ", ";
-    printExtendedRegister(MI, 2, O);
-    return;
-  }
-  // SUBS XZR, Xn, Xm, uxtx #imm ==> CMP Xn, uxtb #imm
-  if (Opcode == ARM64::SUBSXrx64 && MI->getOperand(0).getReg() == ARM64::XZR) {
-    O << "\tcmp\t" << getRegisterName(MI->getOperand(1).getReg()) << ", "
-      << getRegisterName(MI->getOperand(2).getReg());
-    printExtend(MI, 3, O);
-    return;
-  }
-
-  // ADDS WZR, Wn, #imm ==> CMN Wn, #imm
-  // ADDS XZR, Xn, #imm ==> CMN Xn, #imm
-  if ((Opcode == ARM64::ADDSWri && MI->getOperand(0).getReg() == ARM64::WZR) ||
-      (Opcode == ARM64::ADDSXri && MI->getOperand(0).getReg() == ARM64::XZR)) {
-    O << "\tcmn\t" << getRegisterName(MI->getOperand(1).getReg()) << ", ";
-    printAddSubImm(MI, 2, O);
-    return;
-  }
-  // ADDS WZR, Wn, Wm{, lshift #imm} ==> CMN Wn, Wm{, lshift #imm}
-  // ADDS XZR, Xn, Xm{, lshift #imm} ==> CMN Xn, Xm{, lshift #imm}
-  if ((Opcode == ARM64::ADDSWrs && MI->getOperand(0).getReg() == ARM64::WZR) ||
-      (Opcode == ARM64::ADDSXrs && MI->getOperand(0).getReg() == ARM64::XZR)) {
-    O << "\tcmn\t" << getRegisterName(MI->getOperand(1).getReg()) << ", ";
-    printShiftedRegister(MI, 2, O);
-    return;
-  }
-  // ADDS XZR, Xn, Wm, uxtb #imm ==> CMN Xn, uxtb #imm
-  if (Opcode == ARM64::ADDSXrx && MI->getOperand(0).getReg() == ARM64::XZR) {
-    O << "\tcmn\t" << getRegisterName(MI->getOperand(1).getReg()) << ", ";
-    printExtendedRegister(MI, 2, O);
-    return;
-  }
-  // ADDS XZR, Xn, Xm, uxtx #imm ==> CMN Xn, uxtb #imm
-  if (Opcode == ARM64::ADDSXrx64 && MI->getOperand(0).getReg() == ARM64::XZR) {
-    O << "\tcmn\t" << getRegisterName(MI->getOperand(1).getReg()) << ", "
-      << getRegisterName(MI->getOperand(2).getReg());
-    printExtend(MI, 3, O);
-    return;
-  }
-
-  if (!printAliasInstr(MI, O))
-    printInstruction(MI, O);
-
-  printAnnotation(O, Annot);
-}
-
-static bool isTblTbxInstruction(unsigned Opcode, StringRef &Layout,
-                                bool &IsTbx) {
-  switch (Opcode) {
-  case ARM64::TBXv8i8One:
-  case ARM64::TBXv8i8Two:
-  case ARM64::TBXv8i8Three:
-  case ARM64::TBXv8i8Four:
-    IsTbx = true;
-    Layout = ".8b";
-    return true;
-  case ARM64::TBLv8i8One:
-  case ARM64::TBLv8i8Two:
-  case ARM64::TBLv8i8Three:
-  case ARM64::TBLv8i8Four:
-    IsTbx = false;
-    Layout = ".8b";
-    return true;
-  case ARM64::TBXv16i8One:
-  case ARM64::TBXv16i8Two:
-  case ARM64::TBXv16i8Three:
-  case ARM64::TBXv16i8Four:
-    IsTbx = true;
-    Layout = ".16b";
-    return true;
-  case ARM64::TBLv16i8One:
-  case ARM64::TBLv16i8Two:
-  case ARM64::TBLv16i8Three:
-  case ARM64::TBLv16i8Four:
-    IsTbx = false;
-    Layout = ".16b";
-    return true;
-  default:
-    return false;
-  }
-}
-
-struct LdStNInstrDesc {
-  unsigned Opcode;
-  const char *Mnemonic;
-  const char *Layout;
-  int LaneOperand;
-  int NaturalOffset;
-};
-
-static LdStNInstrDesc LdStNInstInfo[] = {
-  { ARM64::LD1i8,             "ld1",  ".b",     2, 0  },
-  { ARM64::LD1i16,            "ld1",  ".h",     2, 0  },
-  { ARM64::LD1i32,            "ld1",  ".s",     2, 0  },
-  { ARM64::LD1i64,            "ld1",  ".d",     2, 0  },
-  { ARM64::LD1i8_POST,        "ld1",  ".b",     2, 1  },
-  { ARM64::LD1i16_POST,       "ld1",  ".h",     2, 2  },
-  { ARM64::LD1i32_POST,       "ld1",  ".s",     2, 4  },
-  { ARM64::LD1i64_POST,       "ld1",  ".d",     2, 8  },
-  { ARM64::LD1Rv16b,          "ld1r", ".16b",   0, 0  },
-  { ARM64::LD1Rv8h,           "ld1r", ".8h",    0, 0  },
-  { ARM64::LD1Rv4s,           "ld1r", ".4s",    0, 0  },
-  { ARM64::LD1Rv2d,           "ld1r", ".2d",    0, 0  },
-  { ARM64::LD1Rv8b,           "ld1r", ".8b",    0, 0  },
-  { ARM64::LD1Rv4h,           "ld1r", ".4h",    0, 0  },
-  { ARM64::LD1Rv2s,           "ld1r", ".2s",    0, 0  },
-  { ARM64::LD1Rv1d,           "ld1r", ".1d",    0, 0  },
-  { ARM64::LD1Rv16b_POST,     "ld1r", ".16b",   0, 1  },
-  { ARM64::LD1Rv8h_POST,      "ld1r", ".8h",    0, 2  },
-  { ARM64::LD1Rv4s_POST,      "ld1r", ".4s",    0, 4  },
-  { ARM64::LD1Rv2d_POST,      "ld1r", ".2d",    0, 8  },
-  { ARM64::LD1Rv8b_POST,      "ld1r", ".8b",    0, 1  },
-  { ARM64::LD1Rv4h_POST,      "ld1r", ".4h",    0, 2  },
-  { ARM64::LD1Rv2s_POST,      "ld1r", ".2s",    0, 4  },
-  { ARM64::LD1Rv1d_POST,      "ld1r", ".1d",    0, 8  },
-  { ARM64::LD1Onev16b,        "ld1",  ".16b",   0, 0  },
-  { ARM64::LD1Onev8h,         "ld1",  ".8h",    0, 0  },
-  { ARM64::LD1Onev4s,         "ld1",  ".4s",    0, 0  },
-  { ARM64::LD1Onev2d,         "ld1",  ".2d",    0, 0  },
-  { ARM64::LD1Onev8b,         "ld1",  ".8b",    0, 0  },
-  { ARM64::LD1Onev4h,         "ld1",  ".4h",    0, 0  },
-  { ARM64::LD1Onev2s,         "ld1",  ".2s",    0, 0  },
-  { ARM64::LD1Onev1d,         "ld1",  ".1d",    0, 0  },
-  { ARM64::LD1Onev16b_POST,   "ld1",  ".16b",   0, 16 },
-  { ARM64::LD1Onev8h_POST,    "ld1",  ".8h",    0, 16 },
-  { ARM64::LD1Onev4s_POST,    "ld1",  ".4s",    0, 16 },
-  { ARM64::LD1Onev2d_POST,    "ld1",  ".2d",    0, 16 },
-  { ARM64::LD1Onev8b_POST,    "ld1",  ".8b",    0, 8  },
-  { ARM64::LD1Onev4h_POST,    "ld1",  ".4h",    0, 8  },
-  { ARM64::LD1Onev2s_POST,    "ld1",  ".2s",    0, 8  },
-  { ARM64::LD1Onev1d_POST,    "ld1",  ".1d",    0, 8  },
-  { ARM64::LD1Twov16b,        "ld1",  ".16b",   0, 0  },
-  { ARM64::LD1Twov8h,         "ld1",  ".8h",    0, 0  },
-  { ARM64::LD1Twov4s,         "ld1",  ".4s",    0, 0  },
-  { ARM64::LD1Twov2d,         "ld1",  ".2d",    0, 0  },
-  { ARM64::LD1Twov8b,         "ld1",  ".8b",    0, 0  },
-  { ARM64::LD1Twov4h,         "ld1",  ".4h",    0, 0  },
-  { ARM64::LD1Twov2s,         "ld1",  ".2s",    0, 0  },
-  { ARM64::LD1Twov1d,         "ld1",  ".1d",    0, 0  },
-  { ARM64::LD1Twov16b_POST,   "ld1",  ".16b",   0, 32 },
-  { ARM64::LD1Twov8h_POST,    "ld1",  ".8h",    0, 32 },
-  { ARM64::LD1Twov4s_POST,    "ld1",  ".4s",    0, 32 },
-  { ARM64::LD1Twov2d_POST,    "ld1",  ".2d",    0, 32 },
-  { ARM64::LD1Twov8b_POST,    "ld1",  ".8b",    0, 16 },
-  { ARM64::LD1Twov4h_POST,    "ld1",  ".4h",    0, 16 },
-  { ARM64::LD1Twov2s_POST,    "ld1",  ".2s",    0, 16 },
-  { ARM64::LD1Twov1d_POST,    "ld1",  ".1d",    0, 16 },
-  { ARM64::LD1Threev16b,      "ld1",  ".16b",   0, 0  },
-  { ARM64::LD1Threev8h,       "ld1",  ".8h",    0, 0  },
-  { ARM64::LD1Threev4s,       "ld1",  ".4s",    0, 0  },
-  { ARM64::LD1Threev2d,       "ld1",  ".2d",    0, 0  },
-  { ARM64::LD1Threev8b,       "ld1",  ".8b",    0, 0  },
-  { ARM64::LD1Threev4h,       "ld1",  ".4h",    0, 0  },
-  { ARM64::LD1Threev2s,       "ld1",  ".2s",    0, 0  },
-  { ARM64::LD1Threev1d,       "ld1",  ".1d",    0, 0  },
-  { ARM64::LD1Threev16b_POST, "ld1",  ".16b",   0, 48 },
-  { ARM64::LD1Threev8h_POST,  "ld1",  ".8h",    0, 48 },
-  { ARM64::LD1Threev4s_POST,  "ld1",  ".4s",    0, 48 },
-  { ARM64::LD1Threev2d_POST,  "ld1",  ".2d",    0, 48 },
-  { ARM64::LD1Threev8b_POST,  "ld1",  ".8b",    0, 24 },
-  { ARM64::LD1Threev4h_POST,  "ld1",  ".4h",    0, 24 },
-  { ARM64::LD1Threev2s_POST,  "ld1",  ".2s",    0, 24 },
-  { ARM64::LD1Threev1d_POST,  "ld1",  ".1d",    0, 24 },
-  { ARM64::LD1Fourv16b,       "ld1",  ".16b",   0, 0  },
-  { ARM64::LD1Fourv8h,        "ld1",  ".8h",    0, 0  },
-  { ARM64::LD1Fourv4s,        "ld1",  ".4s",    0, 0  },
-  { ARM64::LD1Fourv2d,        "ld1",  ".2d",    0, 0  },
-  { ARM64::LD1Fourv8b,        "ld1",  ".8b",    0, 0  },
-  { ARM64::LD1Fourv4h,        "ld1",  ".4h",    0, 0  },
-  { ARM64::LD1Fourv2s,        "ld1",  ".2s",    0, 0  },
-  { ARM64::LD1Fourv1d,        "ld1",  ".1d",    0, 0  },
-  { ARM64::LD1Fourv16b_POST,  "ld1",  ".16b",   0, 64 },
-  { ARM64::LD1Fourv8h_POST,   "ld1",  ".8h",    0, 64 },
-  { ARM64::LD1Fourv4s_POST,   "ld1",  ".4s",    0, 64 },
-  { ARM64::LD1Fourv2d_POST,   "ld1",  ".2d",    0, 64 },
-  { ARM64::LD1Fourv8b_POST,   "ld1",  ".8b",    0, 32 },
-  { ARM64::LD1Fourv4h_POST,   "ld1",  ".4h",    0, 32 },
-  { ARM64::LD1Fourv2s_POST,   "ld1",  ".2s",    0, 32 },
-  { ARM64::LD1Fourv1d_POST,   "ld1",  ".1d",    0, 32 },
-  { ARM64::LD2i8,             "ld2",  ".b",     2, 0  },
-  { ARM64::LD2i16,            "ld2",  ".h",     2, 0  },
-  { ARM64::LD2i32,            "ld2",  ".s",     2, 0  },
-  { ARM64::LD2i64,            "ld2",  ".d",     2, 0  },
-  { ARM64::LD2i8_POST,        "ld2",  ".b",     2, 2  },
-  { ARM64::LD2i16_POST,       "ld2",  ".h",     2, 4  },
-  { ARM64::LD2i32_POST,       "ld2",  ".s",     2, 8  },
-  { ARM64::LD2i64_POST,       "ld2",  ".d",     2, 16  },
-  { ARM64::LD2Rv16b,          "ld2r", ".16b",   0, 0  },
-  { ARM64::LD2Rv8h,           "ld2r", ".8h",    0, 0  },
-  { ARM64::LD2Rv4s,           "ld2r", ".4s",    0, 0  },
-  { ARM64::LD2Rv2d,           "ld2r", ".2d",    0, 0  },
-  { ARM64::LD2Rv8b,           "ld2r", ".8b",    0, 0  },
-  { ARM64::LD2Rv4h,           "ld2r", ".4h",    0, 0  },
-  { ARM64::LD2Rv2s,           "ld2r", ".2s",    0, 0  },
-  { ARM64::LD2Rv1d,           "ld2r", ".1d",    0, 0  },
-  { ARM64::LD2Rv16b_POST,     "ld2r", ".16b",   0, 2  },
-  { ARM64::LD2Rv8h_POST,      "ld2r", ".8h",    0, 4  },
-  { ARM64::LD2Rv4s_POST,      "ld2r", ".4s",    0, 8  },
-  { ARM64::LD2Rv2d_POST,      "ld2r", ".2d",    0, 16 },
-  { ARM64::LD2Rv8b_POST,      "ld2r", ".8b",    0, 2  },
-  { ARM64::LD2Rv4h_POST,      "ld2r", ".4h",    0, 4  },
-  { ARM64::LD2Rv2s_POST,      "ld2r", ".2s",    0, 8  },
-  { ARM64::LD2Rv1d_POST,      "ld2r", ".1d",    0, 16 },
-  { ARM64::LD2Twov16b,        "ld2",  ".16b",   0, 0  },
-  { ARM64::LD2Twov8h,         "ld2",  ".8h",    0, 0  },
-  { ARM64::LD2Twov4s,         "ld2",  ".4s",    0, 0  },
-  { ARM64::LD2Twov2d,         "ld2",  ".2d",    0, 0  },
-  { ARM64::LD2Twov8b,         "ld2",  ".8b",    0, 0  },
-  { ARM64::LD2Twov4h,         "ld2",  ".4h",    0, 0  },
-  { ARM64::LD2Twov2s,         "ld2",  ".2s",    0, 0  },
-  { ARM64::LD2Twov16b_POST,   "ld2",  ".16b",   0, 32 },
-  { ARM64::LD2Twov8h_POST,    "ld2",  ".8h",    0, 32 },
-  { ARM64::LD2Twov4s_POST,    "ld2",  ".4s",    0, 32 },
-  { ARM64::LD2Twov2d_POST,    "ld2",  ".2d",    0, 32 },
-  { ARM64::LD2Twov8b_POST,    "ld2",  ".8b",    0, 16 },
-  { ARM64::LD2Twov4h_POST,    "ld2",  ".4h",    0, 16 },
-  { ARM64::LD2Twov2s_POST,    "ld2",  ".2s",    0, 16 },
-  { ARM64::LD3i8,             "ld3",  ".b",     2, 0  },
-  { ARM64::LD3i16,            "ld3",  ".h",     2, 0  },
-  { ARM64::LD3i32,            "ld3",  ".s",     2, 0  },
-  { ARM64::LD3i64,            "ld3",  ".d",     2, 0  },
-  { ARM64::LD3i8_POST,        "ld3",  ".b",     2, 3  },
-  { ARM64::LD3i16_POST,       "ld3",  ".h",     2, 6  },
-  { ARM64::LD3i32_POST,       "ld3",  ".s",     2, 12  },
-  { ARM64::LD3i64_POST,       "ld3",  ".d",     2, 24  },
-  { ARM64::LD3Rv16b,          "ld3r", ".16b",   0, 0  },
-  { ARM64::LD3Rv8h,           "ld3r", ".8h",    0, 0  },
-  { ARM64::LD3Rv4s,           "ld3r", ".4s",    0, 0  },
-  { ARM64::LD3Rv2d,           "ld3r", ".2d",    0, 0  },
-  { ARM64::LD3Rv8b,           "ld3r", ".8b",    0, 0  },
-  { ARM64::LD3Rv4h,           "ld3r", ".4h",    0, 0  },
-  { ARM64::LD3Rv2s,           "ld3r", ".2s",    0, 0  },
-  { ARM64::LD3Rv1d,           "ld3r", ".1d",    0, 0  },
-  { ARM64::LD3Rv16b_POST,     "ld3r", ".16b",   0, 3  },
-  { ARM64::LD3Rv8h_POST,      "ld3r", ".8h",    0, 6  },
-  { ARM64::LD3Rv4s_POST,      "ld3r", ".4s",    0, 12 },
-  { ARM64::LD3Rv2d_POST,      "ld3r", ".2d",    0, 24 },
-  { ARM64::LD3Rv8b_POST,      "ld3r", ".8b",    0, 3  },
-  { ARM64::LD3Rv4h_POST,      "ld3r", ".4h",    0, 6  },
-  { ARM64::LD3Rv2s_POST,      "ld3r", ".2s",    0, 12 },
-  { ARM64::LD3Rv1d_POST,      "ld3r", ".1d",    0, 24 },
-  { ARM64::LD3Threev16b,      "ld3",  ".16b",   0, 0  },
-  { ARM64::LD3Threev8h,       "ld3",  ".8h",    0, 0  },
-  { ARM64::LD3Threev4s,       "ld3",  ".4s",    0, 0  },
-  { ARM64::LD3Threev2d,       "ld3",  ".2d",    0, 0  },
-  { ARM64::LD3Threev8b,       "ld3",  ".8b",    0, 0  },
-  { ARM64::LD3Threev4h,       "ld3",  ".4h",    0, 0  },
-  { ARM64::LD3Threev2s,       "ld3",  ".2s",    0, 0  },
-  { ARM64::LD3Threev16b_POST, "ld3",  ".16b",   0, 48 },
-  { ARM64::LD3Threev8h_POST,  "ld3",  ".8h",    0, 48 },
-  { ARM64::LD3Threev4s_POST,  "ld3",  ".4s",    0, 48 },
-  { ARM64::LD3Threev2d_POST,  "ld3",  ".2d",    0, 48 },
-  { ARM64::LD3Threev8b_POST,  "ld3",  ".8b",    0, 24 },
-  { ARM64::LD3Threev4h_POST,  "ld3",  ".4h",    0, 24 },
-  { ARM64::LD3Threev2s_POST,  "ld3",  ".2s",    0, 24 },
-  { ARM64::LD4i8,             "ld4",  ".b",     2, 0  },
-  { ARM64::LD4i16,            "ld4",  ".h",     2, 0  },
-  { ARM64::LD4i32,            "ld4",  ".s",     2, 0  },
-  { ARM64::LD4i64,            "ld4",  ".d",     2, 0  },
-  { ARM64::LD4i8_POST,        "ld4",  ".b",     2, 4  },
-  { ARM64::LD4i16_POST,       "ld4",  ".h",     2, 8  },
-  { ARM64::LD4i32_POST,       "ld4",  ".s",     2, 16 },
-  { ARM64::LD4i64_POST,       "ld4",  ".d",     2, 32 },
-  { ARM64::LD4Rv16b,          "ld4r", ".16b",   0, 0  },
-  { ARM64::LD4Rv8h,           "ld4r", ".8h",    0, 0  },
-  { ARM64::LD4Rv4s,           "ld4r", ".4s",    0, 0  },
-  { ARM64::LD4Rv2d,           "ld4r", ".2d",    0, 0  },
-  { ARM64::LD4Rv8b,           "ld4r", ".8b",    0, 0  },
-  { ARM64::LD4Rv4h,           "ld4r", ".4h",    0, 0  },
-  { ARM64::LD4Rv2s,           "ld4r", ".2s",    0, 0  },
-  { ARM64::LD4Rv1d,           "ld4r", ".1d",    0, 0  },
-  { ARM64::LD4Rv16b_POST,     "ld4r", ".16b",   0, 4  },
-  { ARM64::LD4Rv8h_POST,      "ld4r", ".8h",    0, 8  },
-  { ARM64::LD4Rv4s_POST,      "ld4r", ".4s",    0, 16 },
-  { ARM64::LD4Rv2d_POST,      "ld4r", ".2d",    0, 32 },
-  { ARM64::LD4Rv8b_POST,      "ld4r", ".8b",    0, 4  },
-  { ARM64::LD4Rv4h_POST,      "ld4r", ".4h",    0, 8  },
-  { ARM64::LD4Rv2s_POST,      "ld4r", ".2s",    0, 16 },
-  { ARM64::LD4Rv1d_POST,      "ld4r", ".1d",    0, 32 },
-  { ARM64::LD4Fourv16b,       "ld4",  ".16b",   0, 0  },
-  { ARM64::LD4Fourv8h,        "ld4",  ".8h",    0, 0  },
-  { ARM64::LD4Fourv4s,        "ld4",  ".4s",    0, 0  },
-  { ARM64::LD4Fourv2d,        "ld4",  ".2d",    0, 0  },
-  { ARM64::LD4Fourv8b,        "ld4",  ".8b",    0, 0  },
-  { ARM64::LD4Fourv4h,        "ld4",  ".4h",    0, 0  },
-  { ARM64::LD4Fourv2s,        "ld4",  ".2s",    0, 0  },
-  { ARM64::LD4Fourv16b_POST,  "ld4",  ".16b",   0, 64 },
-  { ARM64::LD4Fourv8h_POST,   "ld4",  ".8h",    0, 64 },
-  { ARM64::LD4Fourv4s_POST,   "ld4",  ".4s",    0, 64 },
-  { ARM64::LD4Fourv2d_POST,   "ld4",  ".2d",    0, 64 },
-  { ARM64::LD4Fourv8b_POST,   "ld4",  ".8b",    0, 32 },
-  { ARM64::LD4Fourv4h_POST,   "ld4",  ".4h",    0, 32 },
-  { ARM64::LD4Fourv2s_POST,   "ld4",  ".2s",    0, 32 },
-  { ARM64::ST1i8,             "st1",  ".b",     1, 0  },
-  { ARM64::ST1i16,            "st1",  ".h",     1, 0  },
-  { ARM64::ST1i32,            "st1",  ".s",     1, 0  },
-  { ARM64::ST1i64,            "st1",  ".d",     1, 0  },
-  { ARM64::ST1i8_POST,        "st1",  ".b",     1, 1  },
-  { ARM64::ST1i16_POST,       "st1",  ".h",     1, 2  },
-  { ARM64::ST1i32_POST,       "st1",  ".s",     1, 4  },
-  { ARM64::ST1i64_POST,       "st1",  ".d",     1, 8  },
-  { ARM64::ST1Onev16b,        "st1",  ".16b",   0, 0  },
-  { ARM64::ST1Onev8h,         "st1",  ".8h",    0, 0  },
-  { ARM64::ST1Onev4s,         "st1",  ".4s",    0, 0  },
-  { ARM64::ST1Onev2d,         "st1",  ".2d",    0, 0  },
-  { ARM64::ST1Onev8b,         "st1",  ".8b",    0, 0  },
-  { ARM64::ST1Onev4h,         "st1",  ".4h",    0, 0  },
-  { ARM64::ST1Onev2s,         "st1",  ".2s",    0, 0  },
-  { ARM64::ST1Onev1d,         "st1",  ".1d",    0, 0  },
-  { ARM64::ST1Onev16b_POST,   "st1",  ".16b",   0, 16 },
-  { ARM64::ST1Onev8h_POST,    "st1",  ".8h",    0, 16 },
-  { ARM64::ST1Onev4s_POST,    "st1",  ".4s",    0, 16 },
-  { ARM64::ST1Onev2d_POST,    "st1",  ".2d",    0, 16 },
-  { ARM64::ST1Onev8b_POST,    "st1",  ".8b",    0, 8  },
-  { ARM64::ST1Onev4h_POST,    "st1",  ".4h",    0, 8  },
-  { ARM64::ST1Onev2s_POST,    "st1",  ".2s",    0, 8  },
-  { ARM64::ST1Onev1d_POST,    "st1",  ".1d",    0, 8  },
-  { ARM64::ST1Twov16b,        "st1",  ".16b",   0, 0  },
-  { ARM64::ST1Twov8h,         "st1",  ".8h",    0, 0  },
-  { ARM64::ST1Twov4s,         "st1",  ".4s",    0, 0  },
-  { ARM64::ST1Twov2d,         "st1",  ".2d",    0, 0  },
-  { ARM64::ST1Twov8b,         "st1",  ".8b",    0, 0  },
-  { ARM64::ST1Twov4h,         "st1",  ".4h",    0, 0  },
-  { ARM64::ST1Twov2s,         "st1",  ".2s",    0, 0  },
-  { ARM64::ST1Twov1d,         "st1",  ".1d",    0, 0  },
-  { ARM64::ST1Twov16b_POST,   "st1",  ".16b",   0, 32 },
-  { ARM64::ST1Twov8h_POST,    "st1",  ".8h",    0, 32 },
-  { ARM64::ST1Twov4s_POST,    "st1",  ".4s",    0, 32 },
-  { ARM64::ST1Twov2d_POST,    "st1",  ".2d",    0, 32 },
-  { ARM64::ST1Twov8b_POST,    "st1",  ".8b",    0, 16 },
-  { ARM64::ST1Twov4h_POST,    "st1",  ".4h",    0, 16 },
-  { ARM64::ST1Twov2s_POST,    "st1",  ".2s",    0, 16 },
-  { ARM64::ST1Twov1d_POST,    "st1",  ".1d",    0, 16 },
-  { ARM64::ST1Threev16b,      "st1",  ".16b",   0, 0  },
-  { ARM64::ST1Threev8h,       "st1",  ".8h",    0, 0  },
-  { ARM64::ST1Threev4s,       "st1",  ".4s",    0, 0  },
-  { ARM64::ST1Threev2d,       "st1",  ".2d",    0, 0  },
-  { ARM64::ST1Threev8b,       "st1",  ".8b",    0, 0  },
-  { ARM64::ST1Threev4h,       "st1",  ".4h",    0, 0  },
-  { ARM64::ST1Threev2s,       "st1",  ".2s",    0, 0  },
-  { ARM64::ST1Threev1d,       "st1",  ".1d",    0, 0  },
-  { ARM64::ST1Threev16b_POST, "st1",  ".16b",   0, 48 },
-  { ARM64::ST1Threev8h_POST,  "st1",  ".8h",    0, 48 },
-  { ARM64::ST1Threev4s_POST,  "st1",  ".4s",    0, 48 },
-  { ARM64::ST1Threev2d_POST,  "st1",  ".2d",    0, 48 },
-  { ARM64::ST1Threev8b_POST,  "st1",  ".8b",    0, 24 },
-  { ARM64::ST1Threev4h_POST,  "st1",  ".4h",    0, 24 },
-  { ARM64::ST1Threev2s_POST,  "st1",  ".2s",    0, 24 },
-  { ARM64::ST1Threev1d_POST,  "st1",  ".1d",    0, 24 },
-  { ARM64::ST1Fourv16b,       "st1",  ".16b",   0, 0  },
-  { ARM64::ST1Fourv8h,        "st1",  ".8h",    0, 0  },
-  { ARM64::ST1Fourv4s,        "st1",  ".4s",    0, 0  },
-  { ARM64::ST1Fourv2d,        "st1",  ".2d",    0, 0  },
-  { ARM64::ST1Fourv8b,        "st1",  ".8b",    0, 0  },
-  { ARM64::ST1Fourv4h,        "st1",  ".4h",    0, 0  },
-  { ARM64::ST1Fourv2s,        "st1",  ".2s",    0, 0  },
-  { ARM64::ST1Fourv1d,        "st1",  ".1d",    0, 0  },
-  { ARM64::ST1Fourv16b_POST,  "st1",  ".16b",   0, 64 },
-  { ARM64::ST1Fourv8h_POST,   "st1",  ".8h",    0, 64 },
-  { ARM64::ST1Fourv4s_POST,   "st1",  ".4s",    0, 64 },
-  { ARM64::ST1Fourv2d_POST,   "st1",  ".2d",    0, 64 },
-  { ARM64::ST1Fourv8b_POST,   "st1",  ".8b",    0, 32 },
-  { ARM64::ST1Fourv4h_POST,   "st1",  ".4h",    0, 32 },
-  { ARM64::ST1Fourv2s_POST,   "st1",  ".2s",    0, 32 },
-  { ARM64::ST1Fourv1d_POST,   "st1",  ".1d",    0, 32 },
-  { ARM64::ST2i8,             "st2",  ".b",     1, 0  },
-  { ARM64::ST2i16,            "st2",  ".h",     1, 0  },
-  { ARM64::ST2i32,            "st2",  ".s",     1, 0  },
-  { ARM64::ST2i64,            "st2",  ".d",     1, 0  },
-  { ARM64::ST2i8_POST,        "st2",  ".b",     1, 2  },
-  { ARM64::ST2i16_POST,       "st2",  ".h",     1, 4  },
-  { ARM64::ST2i32_POST,       "st2",  ".s",     1, 8  },
-  { ARM64::ST2i64_POST,       "st2",  ".d",     1, 16 },
-  { ARM64::ST2Twov16b,        "st2",  ".16b",   0, 0  },
-  { ARM64::ST2Twov8h,         "st2",  ".8h",    0, 0  },
-  { ARM64::ST2Twov4s,         "st2",  ".4s",    0, 0  },
-  { ARM64::ST2Twov2d,         "st2",  ".2d",    0, 0  },
-  { ARM64::ST2Twov8b,         "st2",  ".8b",    0, 0  },
-  { ARM64::ST2Twov4h,         "st2",  ".4h",    0, 0  },
-  { ARM64::ST2Twov2s,         "st2",  ".2s",    0, 0  },
-  { ARM64::ST2Twov16b_POST,   "st2",  ".16b",   0, 32 },
-  { ARM64::ST2Twov8h_POST,    "st2",  ".8h",    0, 32 },
-  { ARM64::ST2Twov4s_POST,    "st2",  ".4s",    0, 32 },
-  { ARM64::ST2Twov2d_POST,    "st2",  ".2d",    0, 32 },
-  { ARM64::ST2Twov8b_POST,    "st2",  ".8b",    0, 16 },
-  { ARM64::ST2Twov4h_POST,    "st2",  ".4h",    0, 16 },
-  { ARM64::ST2Twov2s_POST,    "st2",  ".2s",    0, 16 },
-  { ARM64::ST3i8,             "st3",  ".b",     1, 0  },
-  { ARM64::ST3i16,            "st3",  ".h",     1, 0  },
-  { ARM64::ST3i32,            "st3",  ".s",     1, 0  },
-  { ARM64::ST3i64,            "st3",  ".d",     1, 0  },
-  { ARM64::ST3i8_POST,        "st3",  ".b",     1, 3  },
-  { ARM64::ST3i16_POST,       "st3",  ".h",     1, 6  },
-  { ARM64::ST3i32_POST,       "st3",  ".s",     1, 12 },
-  { ARM64::ST3i64_POST,       "st3",  ".d",     1, 24 },
-  { ARM64::ST3Threev16b,      "st3",  ".16b",   0, 0  },
-  { ARM64::ST3Threev8h,       "st3",  ".8h",    0, 0  },
-  { ARM64::ST3Threev4s,       "st3",  ".4s",    0, 0  },
-  { ARM64::ST3Threev2d,       "st3",  ".2d",    0, 0  },
-  { ARM64::ST3Threev8b,       "st3",  ".8b",    0, 0  },
-  { ARM64::ST3Threev4h,       "st3",  ".4h",    0, 0  },
-  { ARM64::ST3Threev2s,       "st3",  ".2s",    0, 0  },
-  { ARM64::ST3Threev16b_POST, "st3",  ".16b",   0, 48 },
-  { ARM64::ST3Threev8h_POST,  "st3",  ".8h",    0, 48 },
-  { ARM64::ST3Threev4s_POST,  "st3",  ".4s",    0, 48 },
-  { ARM64::ST3Threev2d_POST,  "st3",  ".2d",    0, 48 },
-  { ARM64::ST3Threev8b_POST,  "st3",  ".8b",    0, 24 },
-  { ARM64::ST3Threev4h_POST,  "st3",  ".4h",    0, 24 },
-  { ARM64::ST3Threev2s_POST,  "st3",  ".2s",    0, 24 },
-  { ARM64::ST4i8,             "st4",  ".b",     1, 0  },
-  { ARM64::ST4i16,            "st4",  ".h",     1, 0  },
-  { ARM64::ST4i32,            "st4",  ".s",     1, 0  },
-  { ARM64::ST4i64,            "st4",  ".d",     1, 0  },
-  { ARM64::ST4i8_POST,        "st4",  ".b",     1, 4  },
-  { ARM64::ST4i16_POST,       "st4",  ".h",     1, 8  },
-  { ARM64::ST4i32_POST,       "st4",  ".s",     1, 16 },
-  { ARM64::ST4i64_POST,       "st4",  ".d",     1, 32 },
-  { ARM64::ST4Fourv16b,       "st4",  ".16b",   0, 0  },
-  { ARM64::ST4Fourv8h,        "st4",  ".8h",    0, 0  },
-  { ARM64::ST4Fourv4s,        "st4",  ".4s",    0, 0  },
-  { ARM64::ST4Fourv2d,        "st4",  ".2d",    0, 0  },
-  { ARM64::ST4Fourv8b,        "st4",  ".8b",    0, 0  },
-  { ARM64::ST4Fourv4h,        "st4",  ".4h",    0, 0  },
-  { ARM64::ST4Fourv2s,        "st4",  ".2s",    0, 0  },
-  { ARM64::ST4Fourv16b_POST,  "st4",  ".16b",   0, 64 },
-  { ARM64::ST4Fourv8h_POST,   "st4",  ".8h",    0, 64 },
-  { ARM64::ST4Fourv4s_POST,   "st4",  ".4s",    0, 64 },
-  { ARM64::ST4Fourv2d_POST,   "st4",  ".2d",    0, 64 },
-  { ARM64::ST4Fourv8b_POST,   "st4",  ".8b",    0, 32 },
-  { ARM64::ST4Fourv4h_POST,   "st4",  ".4h",    0, 32 },
-  { ARM64::ST4Fourv2s_POST,   "st4",  ".2s",    0, 32 },
-};
-
-static LdStNInstrDesc *getLdStNInstrDesc(unsigned Opcode) {
-  unsigned Idx;
-  for (Idx = 0; Idx != array_lengthof(LdStNInstInfo); ++Idx)
-    if (LdStNInstInfo[Idx].Opcode == Opcode)
-      return &LdStNInstInfo[Idx];
-
-  return 0;
-}
-
-void ARM64AppleInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
-                                      StringRef Annot) {
-  unsigned Opcode = MI->getOpcode();
-  StringRef Layout, Mnemonic;
-
-  bool IsTbx;
-  if (isTblTbxInstruction(MI->getOpcode(), Layout, IsTbx)) {
-    O << "\t" << (IsTbx ? "tbx" : "tbl") << Layout << '\t'
-      << getRegisterName(MI->getOperand(0).getReg(), ARM64::vreg) << ", ";
-
-    unsigned ListOpNum = IsTbx ? 2 : 1;
-    printVectorList(MI, ListOpNum, O, "");
-
-    O << ", "
-      << getRegisterName(MI->getOperand(ListOpNum + 1).getReg(), ARM64::vreg);
-    printAnnotation(O, Annot);
-    return;
-  }
-
-  if (LdStNInstrDesc *LdStDesc = getLdStNInstrDesc(Opcode)) {
-    O << "\t" << LdStDesc->Mnemonic << LdStDesc->Layout << '\t';
-
-    // Now onto the operands: first a vector list with possible lane
-    // specifier. E.g. { v0 }[2]
-    printVectorList(MI, 0, O, "");
-
-    if (LdStDesc->LaneOperand != 0)
-      O << '[' << MI->getOperand(LdStDesc->LaneOperand).getImm() << ']';
-
-    // Next the address: [xN]
-    unsigned AddrOpNum = LdStDesc->LaneOperand + 1;
-    unsigned AddrReg = MI->getOperand(AddrOpNum).getReg();
-    O << ", [" << getRegisterName(AddrReg) << ']';
-
-    // Finally, there might be a post-indexed offset.
-    if (LdStDesc->NaturalOffset != 0) {
-      unsigned Reg = MI->getOperand(AddrOpNum + 1).getReg();
-      if (Reg != ARM64::XZR)
-        O << ", " << getRegisterName(Reg);
-      else {
-        assert(LdStDesc->NaturalOffset && "no offset on post-inc instruction?");
-        O << ", #" << LdStDesc->NaturalOffset;
-      }
-    }
-
-    printAnnotation(O, Annot);
-    return;
-  }
-
-  ARM64InstPrinter::printInst(MI, O, Annot);
-}
-
-bool ARM64InstPrinter::printSysAlias(const MCInst *MI, raw_ostream &O) {
-#ifndef NDEBUG
-  unsigned Opcode = MI->getOpcode();
-  assert((Opcode == ARM64::SYS || Opcode == ARM64::SYSxt) &&
-         "Invalid opcode for SYS alias!");
-#endif
-
-  const char *Asm = 0;
-  const MCOperand &Op1 = MI->getOperand(0);
-  const MCOperand &Cn = MI->getOperand(1);
-  const MCOperand &Cm = MI->getOperand(2);
-  const MCOperand &Op2 = MI->getOperand(3);
-
-  unsigned Op1Val = Op1.getImm();
-  unsigned CnVal = Cn.getImm();
-  unsigned CmVal = Cm.getImm();
-  unsigned Op2Val = Op2.getImm();
-
-  if (CnVal == 7) {
-    switch (CmVal) {
-    default:
-      break;
-
-    // IC aliases
-    case 1:
-      if (Op1Val == 0 && Op2Val == 0)
-        Asm = "ic\tialluis";
-      break;
-    case 5:
-      if (Op1Val == 0 && Op2Val == 0)
-        Asm = "ic\tiallu";
-      else if (Op1Val == 3 && Op2Val == 1)
-        Asm = "ic\tivau";
-      break;
-
-    // DC aliases
-    case 4:
-      if (Op1Val == 3 && Op2Val == 1)
-        Asm = "dc\tzva";
-      break;
-    case 6:
-      if (Op1Val == 0 && Op2Val == 1)
-        Asm = "dc\tivac";
-      if (Op1Val == 0 && Op2Val == 2)
-        Asm = "dc\tisw";
-      break;
-    case 10:
-      if (Op1Val == 3 && Op2Val == 1)
-        Asm = "dc\tcvac";
-      else if (Op1Val == 0 && Op2Val == 2)
-        Asm = "dc\tcsw";
-      break;
-    case 11:
-      if (Op1Val == 3 && Op2Val == 1)
-        Asm = "dc\tcvau";
-      break;
-    case 14:
-      if (Op1Val == 3 && Op2Val == 1)
-        Asm = "dc\tcivac";
-      else if (Op1Val == 0 && Op2Val == 2)
-        Asm = "dc\tcisw";
-      break;
-
-    // AT aliases
-    case 8:
-      switch (Op1Val) {
-      default:
-        break;
-      case 0:
-        switch (Op2Val) {
-        default:
-          break;
-        case 0: Asm = "at\ts1e1r"; break;
-        case 1: Asm = "at\ts1e1w"; break;
-        case 2: Asm = "at\ts1e0r"; break;
-        case 3: Asm = "at\ts1e0w"; break;
-        }
-        break;
-      case 4:
-        switch (Op2Val) {
-        default:
-          break;
-        case 0: Asm = "at\ts1e2r"; break;
-        case 1: Asm = "at\ts1e2w"; break;
-        case 4: Asm = "at\ts12e1r"; break;
-        case 5: Asm = "at\ts12e1w"; break;
-        case 6: Asm = "at\ts12e0r"; break;
-        case 7: Asm = "at\ts12e0w"; break;
-        }
-        break;
-      case 6:
-        switch (Op2Val) {
-        default:
-          break;
-        case 0: Asm = "at\ts1e3r"; break;
-        case 1: Asm = "at\ts1e3w"; break;
-        }
-        break;
-      }
-      break;
-    }
-  } else if (CnVal == 8) {
-    // TLBI aliases
-    switch (CmVal) {
-    default:
-      break;
-    case 3:
-      switch (Op1Val) {
-      default:
-        break;
-      case 0:
-        switch (Op2Val) {
-        default:
-          break;
-        case 0: Asm = "tlbi\tvmalle1is"; break;
-        case 1: Asm = "tlbi\tvae1is"; break;
-        case 2: Asm = "tlbi\taside1is"; break;
-        case 3: Asm = "tlbi\tvaae1is"; break;
-        case 5: Asm = "tlbi\tvale1is"; break;
-        case 7: Asm = "tlbi\tvaale1is"; break;
-        }
-        break;
-      case 4:
-        switch (Op2Val) {
-        default:
-          break;
-        case 0: Asm = "tlbi\talle2is"; break;
-        case 1: Asm = "tlbi\tvae2is"; break;
-        case 4: Asm = "tlbi\talle1is"; break;
-        case 5: Asm = "tlbi\tvale2is"; break;
-        case 6: Asm = "tlbi\tvmalls12e1is"; break;
-        }
-        break;
-      case 6:
-        switch (Op2Val) {
-        default:
-          break;
-        case 0: Asm = "tlbi\talle3is"; break;
-        case 1: Asm = "tlbi\tvae3is"; break;
-        case 5: Asm = "tlbi\tvale3is"; break;
-        }
-        break;
-      }
-      break;
-    case 4:
-      switch (Op1Val) {
-      default:
-        break;
-      case 4:
-        switch (Op2Val) {
-        default:
-          break;
-        case 1: Asm = "tlbi\tipas2e1"; break;
-        case 5: Asm = "tlbi\tipas2le1"; break;
-        }
-        break;
-      }
-      break;
-    case 7:
-      switch (Op1Val) {
-      default:
-        break;
-      case 0:
-        switch (Op2Val) {
-        default:
-          break;
-        case 0: Asm = "tlbi\tvmalle1"; break;
-        case 1: Asm = "tlbi\tvae1"; break;
-        case 2: Asm = "tlbi\taside1"; break;
-        case 3: Asm = "tlbi\tvaae1"; break;
-        case 5: Asm = "tlbi\tvale1"; break;
-        case 7: Asm = "tlbi\tvaale1"; break;
-        }
-        break;
-      case 4:
-        switch (Op2Val) {
-        default:
-          break;
-        case 0: Asm = "tlbi\talle2"; break;
-        case 1: Asm = "tlbi\tvae2"; break;
-        case 4: Asm = "tlbi\talle1"; break;
-        case 5: Asm = "tlbi\tvale2"; break;
-        case 6: Asm = "tlbi\tvmalls12e1"; break;
-        }
-        break;
-      case 6:
-        switch (Op2Val) {
-        default:
-          break;
-        case 0: Asm = "tlbi\talle3"; break;
-        case 1: Asm = "tlbi\tvae3";  break;
-        case 5: Asm = "tlbi\tvale3"; break;
-        }
-        break;
-      }
-      break;
-    }
-  }
-
-  if (Asm) {
-    O << '\t' << Asm;
-    if (MI->getNumOperands() == 5)
-      O << ", " << getRegisterName(MI->getOperand(4).getReg());
-  }
-
-  return Asm != 0;
-}
-
-void ARM64InstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
-                                    raw_ostream &O) {
-  const MCOperand &Op = MI->getOperand(OpNo);
-  if (Op.isReg()) {
-    unsigned Reg = Op.getReg();
-    O << getRegisterName(Reg);
-  } else if (Op.isImm()) {
-    O << '#' << Op.getImm();
-  } else {
-    assert(Op.isExpr() && "unknown operand kind in printOperand");
-    O << *Op.getExpr();
-  }
-}
-
-void ARM64InstPrinter::printPostIncOperand(const MCInst *MI, unsigned OpNo,
-                                           unsigned Imm, raw_ostream &O) {
-  const MCOperand &Op = MI->getOperand(OpNo);
-  if (Op.isReg()) {
-    unsigned Reg = Op.getReg();
-    if (Reg == ARM64::XZR)
-      O << "#" << Imm;
-    else
-      O << getRegisterName(Reg);
-  } else
-    assert(0 && "unknown operand kind in printPostIncOperand64");
-}
-
-void ARM64InstPrinter::printPostIncOperand1(const MCInst *MI, unsigned OpNo,
-                                            raw_ostream &O) {
-  printPostIncOperand(MI, OpNo, 1, O);
-}
-
-void ARM64InstPrinter::printPostIncOperand2(const MCInst *MI, unsigned OpNo,
-                                            raw_ostream &O) {
-  printPostIncOperand(MI, OpNo, 2, O);
-}
-
-void ARM64InstPrinter::printPostIncOperand3(const MCInst *MI, unsigned OpNo,
-                                            raw_ostream &O) {
-  printPostIncOperand(MI, OpNo, 3, O);
-}
-
-void ARM64InstPrinter::printPostIncOperand4(const MCInst *MI, unsigned OpNo,
-                                            raw_ostream &O) {
-  printPostIncOperand(MI, OpNo, 4, O);
-}
-
-void ARM64InstPrinter::printPostIncOperand6(const MCInst *MI, unsigned OpNo,
-                                            raw_ostream &O) {
-  printPostIncOperand(MI, OpNo, 6, O);
-}
-
-void ARM64InstPrinter::printPostIncOperand8(const MCInst *MI, unsigned OpNo,
-                                            raw_ostream &O) {
-  printPostIncOperand(MI, OpNo, 8, O);
-}
-
-void ARM64InstPrinter::printPostIncOperand12(const MCInst *MI, unsigned OpNo,
-                                             raw_ostream &O) {
-  printPostIncOperand(MI, OpNo, 12, O);
-}
-
-void ARM64InstPrinter::printPostIncOperand16(const MCInst *MI, unsigned OpNo,
-                                             raw_ostream &O) {
-  printPostIncOperand(MI, OpNo, 16, O);
-}
-
-void ARM64InstPrinter::printPostIncOperand24(const MCInst *MI, unsigned OpNo,
-                                             raw_ostream &O) {
-  printPostIncOperand(MI, OpNo, 24, O);
-}
-
-void ARM64InstPrinter::printPostIncOperand32(const MCInst *MI, unsigned OpNo,
-                                             raw_ostream &O) {
-  printPostIncOperand(MI, OpNo, 32, O);
-}
-
-void ARM64InstPrinter::printPostIncOperand48(const MCInst *MI, unsigned OpNo,
-                                             raw_ostream &O) {
-  printPostIncOperand(MI, OpNo, 48, O);
-}
-
-void ARM64InstPrinter::printPostIncOperand64(const MCInst *MI, unsigned OpNo,
-                                             raw_ostream &O) {
-  printPostIncOperand(MI, OpNo, 64, O);
-}
-
-void ARM64InstPrinter::printVRegOperand(const MCInst *MI, unsigned OpNo,
-                                        raw_ostream &O) {
-  const MCOperand &Op = MI->getOperand(OpNo);
-  assert(Op.isReg() && "Non-register vreg operand!");
-  unsigned Reg = Op.getReg();
-  O << getRegisterName(Reg, ARM64::vreg);
-}
-
-void ARM64InstPrinter::printSysCROperand(const MCInst *MI, unsigned OpNo,
-                                         raw_ostream &O) {
-  const MCOperand &Op = MI->getOperand(OpNo);
-  assert(Op.isImm() && "System instruction C[nm] operands must be immediates!");
-  O << "c" << Op.getImm();
-}
-
-void ARM64InstPrinter::printAddSubImm(const MCInst *MI, unsigned OpNum,
-                                      raw_ostream &O) {
-  const MCOperand &MO = MI->getOperand(OpNum);
-  if (MO.isImm()) {
-    unsigned Val = (MO.getImm() & 0xfff);
-    assert(Val == MO.getImm() && "Add/sub immediate out of range!");
-    unsigned Shift =
-        ARM64_AM::getShiftValue(MI->getOperand(OpNum + 1).getImm());
-    O << '#' << (Val << Shift);
-    // Distinguish "0, lsl #12" from "0, lsl #0".
-    if (Val == 0 && Shift != 0)
-      printShifter(MI, OpNum + 1, O);
-  } else {
-    assert(MO.isExpr() && "Unexpected operand type!");
-    O << *MO.getExpr();
-    printShifter(MI, OpNum + 1, O);
-  }
-}
-
-void ARM64InstPrinter::printLogicalImm32(const MCInst *MI, unsigned OpNum,
-                                         raw_ostream &O) {
-  uint64_t Val = MI->getOperand(OpNum).getImm();
-  O << "#0x";
-  O.write_hex(ARM64_AM::decodeLogicalImmediate(Val, 32));
-}
-
-void ARM64InstPrinter::printLogicalImm64(const MCInst *MI, unsigned OpNum,
-                                         raw_ostream &O) {
-  uint64_t Val = MI->getOperand(OpNum).getImm();
-  O << "#0x";
-  O.write_hex(ARM64_AM::decodeLogicalImmediate(Val, 64));
-}
-
-void ARM64InstPrinter::printShifter(const MCInst *MI, unsigned OpNum,
-                                    raw_ostream &O) {
-  unsigned Val = MI->getOperand(OpNum).getImm();
-  // LSL #0 should not be printed.
-  if (ARM64_AM::getShiftType(Val) == ARM64_AM::LSL &&
-      ARM64_AM::getShiftValue(Val) == 0)
-    return;
-  O << ", " << ARM64_AM::getShiftName(ARM64_AM::getShiftType(Val)) << " #"
-    << ARM64_AM::getShiftValue(Val);
-}
-
-void ARM64InstPrinter::printShiftedRegister(const MCInst *MI, unsigned OpNum,
-                                            raw_ostream &O) {
-  O << getRegisterName(MI->getOperand(OpNum).getReg());
-  printShifter(MI, OpNum + 1, O);
-}
-
-void ARM64InstPrinter::printExtendedRegister(const MCInst *MI, unsigned OpNum,
-                                             raw_ostream &O) {
-  O << getRegisterName(MI->getOperand(OpNum).getReg());
-  printExtend(MI, OpNum + 1, O);
-}
-
-void ARM64InstPrinter::printExtend(const MCInst *MI, unsigned OpNum,
-                                   raw_ostream &O) {
-  unsigned Val = MI->getOperand(OpNum).getImm();
-  ARM64_AM::ExtendType ExtType = ARM64_AM::getArithExtendType(Val);
-  unsigned ShiftVal = ARM64_AM::getArithShiftValue(Val);
-
-  // If the destination or first source register operand is [W]SP, print
-  // UXTW/UXTX as LSL, and if the shift amount is also zero, print nothing at
-  // all.
-  if (ExtType == ARM64_AM::UXTW || ExtType == ARM64_AM::UXTX) {
-    unsigned Dest = MI->getOperand(0).getReg();
-    unsigned Src1 = MI->getOperand(1).getReg();
-    if (Dest == ARM64::SP || Dest == ARM64::WSP || Src1 == ARM64::SP ||
-        Src1 == ARM64::WSP) {
-      if (ShiftVal != 0)
-        O << ", lsl #" << ShiftVal;
-      return;
-    }
-  }
-  O << ", " << ARM64_AM::getExtendName(ExtType);
-  if (ShiftVal != 0)
-    O << " #" << ShiftVal;
-}
-
-void ARM64InstPrinter::printDotCondCode(const MCInst *MI, unsigned OpNum,
-                                        raw_ostream &O) {
-  ARM64CC::CondCode CC = (ARM64CC::CondCode)MI->getOperand(OpNum).getImm();
-  if (CC != ARM64CC::AL)
-    O << '.' << ARM64CC::getCondCodeName(CC);
-}
-
-void ARM64InstPrinter::printCondCode(const MCInst *MI, unsigned OpNum,
-                                     raw_ostream &O) {
-  ARM64CC::CondCode CC = (ARM64CC::CondCode)MI->getOperand(OpNum).getImm();
-  O << ARM64CC::getCondCodeName(CC);
-}
-
-void ARM64InstPrinter::printAMNoIndex(const MCInst *MI, unsigned OpNum,
-                                      raw_ostream &O) {
-  O << '[' << getRegisterName(MI->getOperand(OpNum).getReg()) << ']';
-}
-
-void ARM64InstPrinter::printImmScale4(const MCInst *MI, unsigned OpNum,
-                                      raw_ostream &O) {
-  O << '#' << 4 * MI->getOperand(OpNum).getImm();
-}
-
-void ARM64InstPrinter::printImmScale8(const MCInst *MI, unsigned OpNum,
-                                      raw_ostream &O) {
-  O << '#' << 8 * MI->getOperand(OpNum).getImm();
-}
-
-void ARM64InstPrinter::printImmScale16(const MCInst *MI, unsigned OpNum,
-                                       raw_ostream &O) {
-  O << '#' << 16 * MI->getOperand(OpNum).getImm();
-}
-
-void ARM64InstPrinter::printAMIndexed(const MCInst *MI, unsigned OpNum,
-                                      unsigned Scale, raw_ostream &O) {
-  const MCOperand MO1 = MI->getOperand(OpNum + 1);
-  O << '[' << getRegisterName(MI->getOperand(OpNum).getReg());
-  if (MO1.isImm()) {
-    if (MO1.getImm() != 0)
-      O << ", #" << (MO1.getImm() * Scale);
-  } else {
-    assert(MO1.isExpr() && "Unexpected operand type!");
-    O << ", " << *MO1.getExpr();
-  }
-  O << ']';
-}
-
-void ARM64InstPrinter::printPrefetchOp(const MCInst *MI, unsigned OpNum,
-                                       raw_ostream &O) {
-  unsigned prfop = MI->getOperand(OpNum).getImm();
-  if (ARM64_AM::isNamedPrefetchOp(prfop))
-    O << ARM64_AM::getPrefetchOpName((ARM64_AM::PrefetchOp)prfop);
-  else
-    O << '#' << prfop;
-}
-
-void ARM64InstPrinter::printMemoryPostIndexed32(const MCInst *MI,
-                                                unsigned OpNum,
-                                                raw_ostream &O) {
-  O << '[' << getRegisterName(MI->getOperand(OpNum).getReg()) << ']' << ", #"
-    << 4 * MI->getOperand(OpNum + 1).getImm();
-}
-
-void ARM64InstPrinter::printMemoryPostIndexed64(const MCInst *MI,
-                                                unsigned OpNum,
-                                                raw_ostream &O) {
-  O << '[' << getRegisterName(MI->getOperand(OpNum).getReg()) << ']' << ", #"
-    << 8 * MI->getOperand(OpNum + 1).getImm();
-}
-
-void ARM64InstPrinter::printMemoryPostIndexed128(const MCInst *MI,
-                                                 unsigned OpNum,
-                                                 raw_ostream &O) {
-  O << '[' << getRegisterName(MI->getOperand(OpNum).getReg()) << ']' << ", #"
-    << 16 * MI->getOperand(OpNum + 1).getImm();
-}
-
-void ARM64InstPrinter::printMemoryPostIndexed(const MCInst *MI, unsigned OpNum,
-                                              raw_ostream &O) {
-  O << '[' << getRegisterName(MI->getOperand(OpNum).getReg()) << ']' << ", #"
-    << MI->getOperand(OpNum + 1).getImm();
-}
-
-void ARM64InstPrinter::printMemoryRegOffset(const MCInst *MI, unsigned OpNum,
-                                            raw_ostream &O, int LegalShiftAmt) {
-  O << '[' << getRegisterName(MI->getOperand(OpNum).getReg()) << ", "
-    << getRegisterName(MI->getOperand(OpNum + 1).getReg());
-
-  unsigned Val = MI->getOperand(OpNum + 2).getImm();
-  ARM64_AM::ExtendType ExtType = ARM64_AM::getMemExtendType(Val);
-  bool DoShift = ARM64_AM::getMemDoShift(Val);
-
-  if (ExtType == ARM64_AM::UXTX) {
-    if (DoShift)
-      O << ", lsl";
-  } else
-    O << ", " << ARM64_AM::getExtendName(ExtType);
-
-  if (DoShift)
-    O << " #" << LegalShiftAmt;
-
-  O << "]";
-}
-
-void ARM64InstPrinter::printMemoryRegOffset8(const MCInst *MI, unsigned OpNum,
-                                             raw_ostream &O) {
-  printMemoryRegOffset(MI, OpNum, O, 0);
-}
-
-void ARM64InstPrinter::printMemoryRegOffset16(const MCInst *MI, unsigned OpNum,
-                                              raw_ostream &O) {
-  printMemoryRegOffset(MI, OpNum, O, 1);
-}
-
-void ARM64InstPrinter::printMemoryRegOffset32(const MCInst *MI, unsigned OpNum,
-                                              raw_ostream &O) {
-  printMemoryRegOffset(MI, OpNum, O, 2);
-}
-
-void ARM64InstPrinter::printMemoryRegOffset64(const MCInst *MI, unsigned OpNum,
-                                              raw_ostream &O) {
-  printMemoryRegOffset(MI, OpNum, O, 3);
-}
-
-void ARM64InstPrinter::printMemoryRegOffset128(const MCInst *MI, unsigned OpNum,
-                                               raw_ostream &O) {
-  printMemoryRegOffset(MI, OpNum, O, 4);
-}
-
-void ARM64InstPrinter::printFPImmOperand(const MCInst *MI, unsigned OpNum,
-                                         raw_ostream &O) {
-  const MCOperand &MO = MI->getOperand(OpNum);
-  O << '#';
-  if (MO.isFPImm())
-    // FIXME: Should this ever happen?
-    O << MO.getFPImm();
-  else
-    O << ARM64_AM::getFPImmFloat(MO.getImm());
-}
-
-static unsigned getNextVectorRegister(unsigned Reg, unsigned Stride = 1) {
-  while (Stride--) {
-    switch (Reg) {
-    default:
-      assert(0 && "Vector register expected!");
-    case ARM64::Q0:  Reg = ARM64::Q1;  break;
-    case ARM64::Q1:  Reg = ARM64::Q2;  break;
-    case ARM64::Q2:  Reg = ARM64::Q3;  break;
-    case ARM64::Q3:  Reg = ARM64::Q4;  break;
-    case ARM64::Q4:  Reg = ARM64::Q5;  break;
-    case ARM64::Q5:  Reg = ARM64::Q6;  break;
-    case ARM64::Q6:  Reg = ARM64::Q7;  break;
-    case ARM64::Q7:  Reg = ARM64::Q8;  break;
-    case ARM64::Q8:  Reg = ARM64::Q9;  break;
-    case ARM64::Q9:  Reg = ARM64::Q10; break;
-    case ARM64::Q10: Reg = ARM64::Q11; break;
-    case ARM64::Q11: Reg = ARM64::Q12; break;
-    case ARM64::Q12: Reg = ARM64::Q13; break;
-    case ARM64::Q13: Reg = ARM64::Q14; break;
-    case ARM64::Q14: Reg = ARM64::Q15; break;
-    case ARM64::Q15: Reg = ARM64::Q16; break;
-    case ARM64::Q16: Reg = ARM64::Q17; break;
-    case ARM64::Q17: Reg = ARM64::Q18; break;
-    case ARM64::Q18: Reg = ARM64::Q19; break;
-    case ARM64::Q19: Reg = ARM64::Q20; break;
-    case ARM64::Q20: Reg = ARM64::Q21; break;
-    case ARM64::Q21: Reg = ARM64::Q22; break;
-    case ARM64::Q22: Reg = ARM64::Q23; break;
-    case ARM64::Q23: Reg = ARM64::Q24; break;
-    case ARM64::Q24: Reg = ARM64::Q25; break;
-    case ARM64::Q25: Reg = ARM64::Q26; break;
-    case ARM64::Q26: Reg = ARM64::Q27; break;
-    case ARM64::Q27: Reg = ARM64::Q28; break;
-    case ARM64::Q28: Reg = ARM64::Q29; break;
-    case ARM64::Q29: Reg = ARM64::Q30; break;
-    case ARM64::Q30: Reg = ARM64::Q31; break;
-    // Vector lists can wrap around.
-    case ARM64::Q31:
-      Reg = ARM64::Q0;
-      break;
-    }
-  }
-  return Reg;
-}
-
-void ARM64InstPrinter::printVectorList(const MCInst *MI, unsigned OpNum,
-                                       raw_ostream &O, StringRef LayoutSuffix) {
-  unsigned Reg = MI->getOperand(OpNum).getReg();
-
-  O << "{ ";
-
-  // Work out how many registers there are in the list (if there is an actual
-  // list).
-  unsigned NumRegs = 1;
-  if (MRI.getRegClass(ARM64::DDRegClassID).contains(Reg) ||
-      MRI.getRegClass(ARM64::QQRegClassID).contains(Reg))
-    NumRegs = 2;
-  else if (MRI.getRegClass(ARM64::DDDRegClassID).contains(Reg) ||
-           MRI.getRegClass(ARM64::QQQRegClassID).contains(Reg))
-    NumRegs = 3;
-  else if (MRI.getRegClass(ARM64::DDDDRegClassID).contains(Reg) ||
-           MRI.getRegClass(ARM64::QQQQRegClassID).contains(Reg))
-    NumRegs = 4;
-
-  // Now forget about the list and find out what the first register is.
-  if (unsigned FirstReg = MRI.getSubReg(Reg, ARM64::dsub0))
-    Reg = FirstReg;
-  else if (unsigned FirstReg = MRI.getSubReg(Reg, ARM64::qsub0))
-    Reg = FirstReg;
-
-  // If it's a D-reg, we need to promote it to the equivalent Q-reg before
-  // printing (otherwise getRegisterName fails).
-  if (MRI.getRegClass(ARM64::FPR64RegClassID).contains(Reg)) {
-    const MCRegisterClass &FPR128RC = MRI.getRegClass(ARM64::FPR128RegClassID);
-    Reg = MRI.getMatchingSuperReg(Reg, ARM64::dsub, &FPR128RC);
-  }
-
-  for (unsigned i = 0; i < NumRegs; ++i, Reg = getNextVectorRegister(Reg)) {
-    O << getRegisterName(Reg, ARM64::vreg) << LayoutSuffix;
-    if (i + 1 != NumRegs)
-      O << ", ";
-  }
-
-  O << " }";
-}
-
-void ARM64InstPrinter::printImplicitlyTypedVectorList(const MCInst *MI,
-                                                      unsigned OpNum,
-                                                      raw_ostream &O) {
-  printVectorList(MI, OpNum, O, "");
-}
-
-template <unsigned NumLanes, char LaneKind>
-void ARM64InstPrinter::printTypedVectorList(const MCInst *MI, unsigned OpNum,
-                                            raw_ostream &O) {
-  std::string Suffix(".");
-  if (NumLanes)
-    Suffix += itostr(NumLanes) + LaneKind;
-  else
-    Suffix += LaneKind;
-
-  printVectorList(MI, OpNum, O, Suffix);
-}
-
-void ARM64InstPrinter::printVectorIndex(const MCInst *MI, unsigned OpNum,
-                                        raw_ostream &O) {
-  O << "[" << MI->getOperand(OpNum).getImm() << "]";
-}
-
-void ARM64InstPrinter::printAlignedBranchTarget(const MCInst *MI,
-                                                unsigned OpNum,
-                                                raw_ostream &O) {
-  const MCOperand &Op = MI->getOperand(OpNum);
-
-  // If the label has already been resolved to an immediate offset (say, when
-  // we're running the disassembler), just print the immediate.
-  if (Op.isImm()) {
-    O << "#" << (Op.getImm() << 2);
-    return;
-  }
-
-  // If the branch target is simply an address then print it in hex.
-  const MCConstantExpr *BranchTarget =
-      dyn_cast<MCConstantExpr>(MI->getOperand(OpNum).getExpr());
-  int64_t Address;
-  if (BranchTarget && BranchTarget->EvaluateAsAbsolute(Address)) {
-    O << "0x";
-    O.write_hex(Address);
-  } else {
-    // Otherwise, just print the expression.
-    O << *MI->getOperand(OpNum).getExpr();
-  }
-}
-
-void ARM64InstPrinter::printAdrpLabel(const MCInst *MI, unsigned OpNum,
-                                      raw_ostream &O) {
-  const MCOperand &Op = MI->getOperand(OpNum);
-
-  // If the label has already been resolved to an immediate offset (say, when
-  // we're running the disassembler), just print the immediate.
-  if (Op.isImm()) {
-    O << "#" << (Op.getImm() << 12);
-    return;
-  }
-
-  // Otherwise, just print the expression.
-  O << *MI->getOperand(OpNum).getExpr();
-}
-
-void ARM64InstPrinter::printBarrierOption(const MCInst *MI, unsigned OpNo,
-                                          raw_ostream &O) {
-  unsigned Val = MI->getOperand(OpNo).getImm();
-  const char *Name = ARM64SYS::getBarrierOptName((ARM64SYS::BarrierOption)Val);
-  if (Name)
-    O << Name;
-  else
-    O << "#" << Val;
-}
-
-void ARM64InstPrinter::printSystemRegister(const MCInst *MI, unsigned OpNo,
-                                           raw_ostream &O) {
-  unsigned Val = MI->getOperand(OpNo).getImm();
-  const char *Name =
-      ARM64SYS::getSystemRegisterName((ARM64SYS::SystemRegister)Val);
-  if (Name) {
-    O << Name;
-    return;
-  }
-
-  unsigned Op0 = 2 | ((Val >> 14) & 1);
-  unsigned Op1 = (Val >> 11) & 7;
-  unsigned CRn = (Val >> 7) & 0xf;
-  unsigned CRm = (Val >> 3) & 0xf;
-  unsigned Op2 = Val & 7;
-
-  O << 'S' << Op0 << '_' << Op1 << "_C" << CRn << "_C" << CRm << '_' << Op2;
-}
-
-void ARM64InstPrinter::printSystemCPSRField(const MCInst *MI, unsigned OpNo,
-                                            raw_ostream &O) {
-  unsigned Val = MI->getOperand(OpNo).getImm();
-  const char *Name = ARM64SYS::getCPSRFieldName((ARM64SYS::CPSRField)Val);
-  O << Name;
-}
-
-void ARM64InstPrinter::printSIMDType10Operand(const MCInst *MI, unsigned OpNo,
-                                              raw_ostream &O) {
-  unsigned RawVal = MI->getOperand(OpNo).getImm();
-  uint64_t Val = ARM64_AM::decodeAdvSIMDModImmType10(RawVal);
-  O << format("#%#016llx", Val);
-}
diff --git a/lib/Target/ARM64/InstPrinter/ARM64InstPrinter.h b/lib/Target/ARM64/InstPrinter/ARM64InstPrinter.h
deleted file mode 100644
index ff66ff0..0000000
--- a/lib/Target/ARM64/InstPrinter/ARM64InstPrinter.h
+++ /dev/null
@@ -1,157 +0,0 @@
-//===-- ARM64InstPrinter.h - Convert ARM64 MCInst to assembly syntax ------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This class prints an ARM64 MCInst to a .s file.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef ARM64INSTPRINTER_H
-#define ARM64INSTPRINTER_H
-
-#include "MCTargetDesc/ARM64MCTargetDesc.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/MC/MCInstPrinter.h"
-#include "llvm/MC/MCSubtargetInfo.h"
-
-namespace llvm {
-
-class MCOperand;
-
-class ARM64InstPrinter : public MCInstPrinter {
-public:
-  ARM64InstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
-                   const MCRegisterInfo &MRI, const MCSubtargetInfo &STI);
-
-  virtual void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot);
-  virtual void printRegName(raw_ostream &OS, unsigned RegNo) const;
-
-  // Autogenerated by tblgen.
-  virtual void printInstruction(const MCInst *MI, raw_ostream &O);
-  virtual bool printAliasInstr(const MCInst *MI, raw_ostream &O);
-  virtual StringRef getRegName(unsigned RegNo) const {
-    return getRegisterName(RegNo);
-  }
-  static const char *getRegisterName(unsigned RegNo,
-                                     unsigned AltIdx = ARM64::NoRegAltName);
-
-protected:
-  bool printSysAlias(const MCInst *MI, raw_ostream &O);
-  // Operand printers
-  void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
-  void printPostIncOperand(const MCInst *MI, unsigned OpNo, unsigned Imm,
-                           raw_ostream &O);
-  void printPostIncOperand1(const MCInst *MI, unsigned OpNo, raw_ostream &O);
-  void printPostIncOperand2(const MCInst *MI, unsigned OpNo, raw_ostream &O);
-  void printPostIncOperand3(const MCInst *MI, unsigned OpNo, raw_ostream &O);
-  void printPostIncOperand4(const MCInst *MI, unsigned OpNo, raw_ostream &O);
-  void printPostIncOperand6(const MCInst *MI, unsigned OpNo, raw_ostream &O);
-  void printPostIncOperand8(const MCInst *MI, unsigned OpNo, raw_ostream &O);
-  void printPostIncOperand12(const MCInst *MI, unsigned OpNo, raw_ostream &O);
-  void printPostIncOperand16(const MCInst *MI, unsigned OpNo, raw_ostream &O);
-  void printPostIncOperand24(const MCInst *MI, unsigned OpNo, raw_ostream &O);
-  void printPostIncOperand32(const MCInst *MI, unsigned OpNo, raw_ostream &O);
-  void printPostIncOperand48(const MCInst *MI, unsigned OpNo, raw_ostream &O);
-  void printPostIncOperand64(const MCInst *MI, unsigned OpNo, raw_ostream &O);
-  void printVRegOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
-  void printSysCROperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
-  void printAddSubImm(const MCInst *MI, unsigned OpNum, raw_ostream &O);
-  void printLogicalImm32(const MCInst *MI, unsigned OpNum, raw_ostream &O);
-  void printLogicalImm64(const MCInst *MI, unsigned OpNum, raw_ostream &O);
-  void printShifter(const MCInst *MI, unsigned OpNum, raw_ostream &O);
-  void printShiftedRegister(const MCInst *MI, unsigned OpNum, raw_ostream &O);
-  void printExtendedRegister(const MCInst *MI, unsigned OpNum, raw_ostream &O);
-  void printExtend(const MCInst *MI, unsigned OpNum, raw_ostream &O);
-  void printCondCode(const MCInst *MI, unsigned OpNum, raw_ostream &O);
-  void printDotCondCode(const MCInst *MI, unsigned OpNum, raw_ostream &O);
-  void printAlignedBranchTarget(const MCInst *MI, unsigned OpNum,
-                                raw_ostream &O);
-  void printAMIndexed(const MCInst *MI, unsigned OpNum, unsigned Scale,
-                      raw_ostream &O);
-  void printAMIndexed128(const MCInst *MI, unsigned OpNum, raw_ostream &O) {
-    printAMIndexed(MI, OpNum, 16, O);
-  }
-
-  void printAMIndexed64(const MCInst *MI, unsigned OpNum, raw_ostream &O) {
-    printAMIndexed(MI, OpNum, 8, O);
-  }
-
-  void printAMIndexed32(const MCInst *MI, unsigned OpNum, raw_ostream &O) {
-    printAMIndexed(MI, OpNum, 4, O);
-  }
-
-  void printAMIndexed16(const MCInst *MI, unsigned OpNum, raw_ostream &O) {
-    printAMIndexed(MI, OpNum, 2, O);
-  }
-
-  void printAMIndexed8(const MCInst *MI, unsigned OpNum, raw_ostream &O) {
-    printAMIndexed(MI, OpNum, 1, O);
-  }
-  void printAMUnscaled(const MCInst *MI, unsigned OpNum, raw_ostream &O) {
-    printAMIndexed(MI, OpNum, 1, O);
-  }
-  void printAMNoIndex(const MCInst *MI, unsigned OpNum, raw_ostream &O);
-  void printImmScale4(const MCInst *MI, unsigned OpNum, raw_ostream &O);
-  void printImmScale8(const MCInst *MI, unsigned OpNum, raw_ostream &O);
-  void printImmScale16(const MCInst *MI, unsigned OpNum, raw_ostream &O);
-  void printPrefetchOp(const MCInst *MI, unsigned OpNum, raw_ostream &O);
-  void printMemoryPostIndexed(const MCInst *MI, unsigned OpNum, raw_ostream &O);
-  void printMemoryPostIndexed32(const MCInst *MI, unsigned OpNum,
-                                raw_ostream &O);
-  void printMemoryPostIndexed64(const MCInst *MI, unsigned OpNum,
-                                raw_ostream &O);
-  void printMemoryPostIndexed128(const MCInst *MI, unsigned OpNum,
-                                 raw_ostream &O);
-  void printMemoryRegOffset(const MCInst *MI, unsigned OpNum, raw_ostream &O,
-                            int LegalShiftAmt);
-  void printMemoryRegOffset8(const MCInst *MI, unsigned OpNum, raw_ostream &O);
-  void printMemoryRegOffset16(const MCInst *MI, unsigned OpNum, raw_ostream &O);
-  void printMemoryRegOffset32(const MCInst *MI, unsigned OpNum, raw_ostream &O);
-  void printMemoryRegOffset64(const MCInst *MI, unsigned OpNum, raw_ostream &O);
-  void printMemoryRegOffset128(const MCInst *MI, unsigned OpNum,
-                               raw_ostream &O);
-
-  void printFPImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
-
-  void printVectorList(const MCInst *MI, unsigned OpNum, raw_ostream &O,
-                       StringRef LayoutSuffix);
-
-  /// Print a list of vector registers where the type suffix is implicit
-  /// (i.e. attached to the instruction rather than the registers).
-  void printImplicitlyTypedVectorList(const MCInst *MI, unsigned OpNum,
-                                      raw_ostream &O);
-
-  template <unsigned NumLanes, char LaneKind>
-  void printTypedVectorList(const MCInst *MI, unsigned OpNum, raw_ostream &O);
-
-  void printVectorIndex(const MCInst *MI, unsigned OpNum, raw_ostream &O);
-  void printAdrpLabel(const MCInst *MI, unsigned OpNum, raw_ostream &O);
-  void printBarrierOption(const MCInst *MI, unsigned OpNum, raw_ostream &O);
-  void printSystemRegister(const MCInst *MI, unsigned OpNum, raw_ostream &O);
-  void printSystemCPSRField(const MCInst *MI, unsigned OpNum, raw_ostream &O);
-  void printSIMDType10Operand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
-};
-
-class ARM64AppleInstPrinter : public ARM64InstPrinter {
-public:
-  ARM64AppleInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
-                        const MCRegisterInfo &MRI, const MCSubtargetInfo &STI);
-
-  virtual void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot);
-
-  virtual void printInstruction(const MCInst *MI, raw_ostream &O);
-  virtual bool printAliasInstr(const MCInst *MI, raw_ostream &O);
-  virtual StringRef getRegName(unsigned RegNo) const {
-    return getRegisterName(RegNo);
-  }
-  static const char *getRegisterName(unsigned RegNo,
-                                     unsigned AltIdx = ARM64::NoRegAltName);
-};
-}
-
-#endif
diff --git a/lib/Target/ARM64/InstPrinter/CMakeLists.txt b/lib/Target/ARM64/InstPrinter/CMakeLists.txt
deleted file mode 100644
index b8ee12c..0000000
--- a/lib/Target/ARM64/InstPrinter/CMakeLists.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
-
-add_llvm_library(LLVMARM64AsmPrinter
-  ARM64InstPrinter.cpp
-  )
-
-add_dependencies(LLVMARM64AsmPrinter ARM64CommonTableGen)
diff --git a/lib/Target/ARM64/InstPrinter/LLVMBuild.txt b/lib/Target/ARM64/InstPrinter/LLVMBuild.txt
deleted file mode 100644
index 2ec83d2..0000000
--- a/lib/Target/ARM64/InstPrinter/LLVMBuild.txt
+++ /dev/null
@@ -1,24 +0,0 @@
-;===- ./lib/Target/ARM64/InstPrinter/LLVMBuild.txt -------------*- Conf -*--===;
-;
-;                     The LLVM Compiler Infrastructure
-;
-; This file is distributed under the University of Illinois Open Source
-; License. See LICENSE.TXT for details.
-;
-;===------------------------------------------------------------------------===;
-;
-; This is an LLVMBuild description file for the components in this subdirectory.
-;
-; For more information on the LLVMBuild system, please see:
-;
-;   http://llvm.org/docs/LLVMBuild.html
-;
-;===------------------------------------------------------------------------===;
-
-[component_0]
-type = Library
-name = ARM64AsmPrinter
-parent = ARM64
-required_libraries = MC Support
-add_to_library_groups = ARM64
-
diff --git a/lib/Target/ARM64/InstPrinter/Makefile b/lib/Target/ARM64/InstPrinter/Makefile
deleted file mode 100644
index a59efb0..0000000
--- a/lib/Target/ARM64/InstPrinter/Makefile
+++ /dev/null
@@ -1,15 +0,0 @@
-##===- lib/Target/ARM64/AsmPrinter/Makefile ----------------*- Makefile -*-===##
-#
-#                     The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-LEVEL = ../../../..
-LIBRARYNAME = LLVMARM64AsmPrinter
-
-# Hack: we need to include 'main' arm target directory to grab private headers
-CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/ARM64/LLVMBuild.txt b/lib/Target/ARM64/LLVMBuild.txt
deleted file mode 100644
index 45b0628..0000000
--- a/lib/Target/ARM64/LLVMBuild.txt
+++ /dev/null
@@ -1,36 +0,0 @@
-;===- ./lib/Target/ARM64/LLVMBuild.txt -------------------------*- Conf -*--===;
-;
-;                     The LLVM Compiler Infrastructure
-;
-; This file is distributed under the University of Illinois Open Source
-; License. See LICENSE.TXT for details.
-;
-;===------------------------------------------------------------------------===;
-;
-; This is an LLVMBuild description file for the components in this subdirectory.
-;
-; For more information on the LLVMBuild system, please see:
-;
-;   http://llvm.org/docs/LLVMBuild.html
-;
-;===------------------------------------------------------------------------===;
-
-[common]
-subdirectories = AsmParser Disassembler InstPrinter MCTargetDesc TargetInfo
-
-[component_0]
-type = TargetGroup
-name = ARM64
-parent = Target
-has_asmparser = 1
-has_asmprinter = 1
-has_disassembler = 1
-has_jit = 1
-
-[component_1]
-type = Library
-name = ARM64CodeGen
-parent = ARM64
-required_libraries = ARM64AsmPrinter ARM64Desc ARM64Info Analysis AsmPrinter CodeGen Core MC SelectionDAG Support Target
-add_to_library_groups = ARM64
-
diff --git a/lib/Target/ARM64/MCTargetDesc/ARM64AsmBackend.cpp b/lib/Target/ARM64/MCTargetDesc/ARM64AsmBackend.cpp
deleted file mode 100644
index 26813e2..0000000
--- a/lib/Target/ARM64/MCTargetDesc/ARM64AsmBackend.cpp
+++ /dev/null
@@ -1,533 +0,0 @@
-//===-- ARM64AsmBackend.cpp - ARM64 Assembler Backend ---------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "ARM64.h"
-#include "ARM64RegisterInfo.h"
-#include "MCTargetDesc/ARM64FixupKinds.h"
-#include "llvm/ADT/Triple.h"
-#include "llvm/MC/MCAsmBackend.h"
-#include "llvm/MC/MCDirectives.h"
-#include "llvm/MC/MCFixupKindInfo.h"
-#include "llvm/MC/MCObjectWriter.h"
-#include "llvm/MC/MCSectionMachO.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/MachO.h"
-using namespace llvm;
-
-namespace {
-
-class ARM64AsmBackend : public MCAsmBackend {
-  static const unsigned PCRelFlagVal =
-      MCFixupKindInfo::FKF_IsAlignedDownTo32Bits | MCFixupKindInfo::FKF_IsPCRel;
-
-public:
-  ARM64AsmBackend(const Target &T) : MCAsmBackend() {}
-
-  unsigned getNumFixupKinds() const { return ARM64::NumTargetFixupKinds; }
-
-  const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const {
-    const static MCFixupKindInfo Infos[ARM64::NumTargetFixupKinds] = {
-      // This table *must* be in the order that the fixup_* kinds are defined in
-      // ARM64FixupKinds.h.
-      //
-      // Name                           Offset (bits) Size (bits)     Flags
-      { "fixup_arm64_pcrel_adr_imm21", 0, 32, PCRelFlagVal },
-      { "fixup_arm64_pcrel_adrp_imm21", 0, 32, PCRelFlagVal },
-      { "fixup_arm64_add_imm12", 10, 12, 0 },
-      { "fixup_arm64_ldst_imm12_scale1", 10, 12, 0 },
-      { "fixup_arm64_ldst_imm12_scale2", 10, 12, 0 },
-      { "fixup_arm64_ldst_imm12_scale4", 10, 12, 0 },
-      { "fixup_arm64_ldst_imm12_scale8", 10, 12, 0 },
-      { "fixup_arm64_ldst_imm12_scale16", 10, 12, 0 },
-      { "fixup_arm64_movw", 5, 16, 0 },
-      { "fixup_arm64_pcrel_branch14", 5, 14, PCRelFlagVal },
-      { "fixup_arm64_pcrel_imm19", 5, 19, PCRelFlagVal },
-      { "fixup_arm64_pcrel_branch26", 0, 26, PCRelFlagVal },
-      { "fixup_arm64_pcrel_call26", 0, 26, PCRelFlagVal },
-      { "fixup_arm64_tlsdesc_call", 0, 0, 0 }
-    };
-
-    if (Kind < FirstTargetFixupKind)
-      return MCAsmBackend::getFixupKindInfo(Kind);
-
-    assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() &&
-           "Invalid kind!");
-    return Infos[Kind - FirstTargetFixupKind];
-  }
-
-  void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
-                  uint64_t Value, bool IsPCRel) const;
-
-  bool mayNeedRelaxation(const MCInst &Inst) const;
-  bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
-                            const MCRelaxableFragment *DF,
-                            const MCAsmLayout &Layout) const;
-  void relaxInstruction(const MCInst &Inst, MCInst &Res) const;
-  bool writeNopData(uint64_t Count, MCObjectWriter *OW) const;
-
-  void HandleAssemblerFlag(MCAssemblerFlag Flag) {}
-
-  unsigned getPointerSize() const { return 8; }
-};
-
-} // end anonymous namespace
-
-/// \brief The number of bytes the fixup may change.
-static unsigned getFixupKindNumBytes(unsigned Kind) {
-  switch (Kind) {
-  default:
-    assert(0 && "Unknown fixup kind!");
-
-  case ARM64::fixup_arm64_tlsdesc_call:
-    return 0;
-
-  case FK_Data_1:
-    return 1;
-
-  case FK_Data_2:
-  case ARM64::fixup_arm64_movw:
-    return 2;
-
-  case ARM64::fixup_arm64_pcrel_branch14:
-  case ARM64::fixup_arm64_add_imm12:
-  case ARM64::fixup_arm64_ldst_imm12_scale1:
-  case ARM64::fixup_arm64_ldst_imm12_scale2:
-  case ARM64::fixup_arm64_ldst_imm12_scale4:
-  case ARM64::fixup_arm64_ldst_imm12_scale8:
-  case ARM64::fixup_arm64_ldst_imm12_scale16:
-  case ARM64::fixup_arm64_pcrel_imm19:
-    return 3;
-
-  case ARM64::fixup_arm64_pcrel_adr_imm21:
-  case ARM64::fixup_arm64_pcrel_adrp_imm21:
-  case ARM64::fixup_arm64_pcrel_branch26:
-  case ARM64::fixup_arm64_pcrel_call26:
-  case FK_Data_4:
-    return 4;
-
-  case FK_Data_8:
-    return 8;
-  }
-}
-
-static unsigned AdrImmBits(unsigned Value) {
-  unsigned lo2 = Value & 0x3;
-  unsigned hi19 = (Value & 0x1ffffc) >> 2;
-  return (hi19 << 5) | (lo2 << 29);
-}
-
-static uint64_t adjustFixupValue(unsigned Kind, uint64_t Value) {
-  int64_t SignedValue = static_cast<int64_t>(Value);
-  switch (Kind) {
-  default:
-    assert(false && "Unknown fixup kind!");
-  case ARM64::fixup_arm64_pcrel_adr_imm21:
-    if (SignedValue > 2097151 || SignedValue < -2097152)
-      report_fatal_error("fixup value out of range");
-    return AdrImmBits(Value & 0x1fffffULL);
-  case ARM64::fixup_arm64_pcrel_adrp_imm21:
-    return AdrImmBits((Value & 0x1fffff000ULL) >> 12);
-  case ARM64::fixup_arm64_pcrel_imm19:
-    // Signed 21-bit immediate
-    if (SignedValue > 2097151 || SignedValue < -2097152)
-      report_fatal_error("fixup value out of range");
-    // Low two bits are not encoded.
-    return (Value >> 2) & 0x7ffff;
-  case ARM64::fixup_arm64_add_imm12:
-  case ARM64::fixup_arm64_ldst_imm12_scale1:
-    // Unsigned 12-bit immediate
-    if (Value >= 0x1000)
-      report_fatal_error("invalid imm12 fixup value");
-    return Value;
-  case ARM64::fixup_arm64_ldst_imm12_scale2:
-    // Unsigned 12-bit immediate which gets multiplied by 2
-    if (Value & 1 || Value >= 0x2000)
-      report_fatal_error("invalid imm12 fixup value");
-    return Value >> 1;
-  case ARM64::fixup_arm64_ldst_imm12_scale4:
-    // Unsigned 12-bit immediate which gets multiplied by 4
-    if (Value & 3 || Value >= 0x4000)
-      report_fatal_error("invalid imm12 fixup value");
-    return Value >> 2;
-  case ARM64::fixup_arm64_ldst_imm12_scale8:
-    // Unsigned 12-bit immediate which gets multiplied by 8
-    if (Value & 7 || Value >= 0x8000)
-      report_fatal_error("invalid imm12 fixup value");
-    return Value >> 3;
-  case ARM64::fixup_arm64_ldst_imm12_scale16:
-    // Unsigned 12-bit immediate which gets multiplied by 16
-    if (Value & 15 || Value >= 0x10000)
-      report_fatal_error("invalid imm12 fixup value");
-    return Value >> 4;
-  case ARM64::fixup_arm64_movw:
-    report_fatal_error("no resolvable MOVZ/MOVK fixups supported yet");
-    return Value;
-  case ARM64::fixup_arm64_pcrel_branch14:
-    // Signed 16-bit immediate
-    if (SignedValue > 32767 || SignedValue < -32768)
-      report_fatal_error("fixup value out of range");
-    // Low two bits are not encoded (4-byte alignment assumed).
-    if (Value & 0x3)
-      report_fatal_error("fixup not sufficiently aligned");
-    return (Value >> 2) & 0x3fff;
-  case ARM64::fixup_arm64_pcrel_branch26:
-  case ARM64::fixup_arm64_pcrel_call26:
-    // Signed 28-bit immediate
-    if (SignedValue > 134217727 || SignedValue < -134217728)
-      report_fatal_error("fixup value out of range");
-    // Low two bits are not encoded (4-byte alignment assumed).
-    if (Value & 0x3)
-      report_fatal_error("fixup not sufficiently aligned");
-    return (Value >> 2) & 0x3ffffff;
-  case FK_Data_1:
-  case FK_Data_2:
-  case FK_Data_4:
-  case FK_Data_8:
-    return Value;
-  }
-}
-
-void ARM64AsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
-                                 unsigned DataSize, uint64_t Value,
-                                 bool IsPCRel) const {
-  unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind());
-  if (!Value)
-    return; // Doesn't change encoding.
-  MCFixupKindInfo Info = getFixupKindInfo(Fixup.getKind());
-  // Apply any target-specific value adjustments.
-  Value = adjustFixupValue(Fixup.getKind(), Value);
-
-  // Shift the value into position.
-  Value <<= Info.TargetOffset;
-
-  unsigned Offset = Fixup.getOffset();
-  assert(Offset + NumBytes <= DataSize && "Invalid fixup offset!");
-
-  // For each byte of the fragment that the fixup touches, mask in the
-  // bits from the fixup value.
-  for (unsigned i = 0; i != NumBytes; ++i)
-    Data[Offset + i] |= uint8_t((Value >> (i * 8)) & 0xff);
-}
-
-bool ARM64AsmBackend::mayNeedRelaxation(const MCInst &Inst) const {
-  return false;
-}
-
-bool ARM64AsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
-                                           const MCRelaxableFragment *DF,
-                                           const MCAsmLayout &Layout) const {
-  // FIXME:  This isn't correct for ARM64. Just moving the "generic" logic
-  // into the targets for now.
-  //
-  // Relax if the value is too big for a (signed) i8.
-  return int64_t(Value) != int64_t(int8_t(Value));
-}
-
-void ARM64AsmBackend::relaxInstruction(const MCInst &Inst, MCInst &Res) const {
-  assert(false && "ARM64AsmBackend::relaxInstruction() unimplemented");
-}
-
-bool ARM64AsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
-  // If the count is not 4-byte aligned, we must be writing data into the text
-  // section (otherwise we have unaligned instructions, and thus have far
-  // bigger problems), so just write zeros instead.
-  if ((Count & 3) != 0) {
-    for (uint64_t i = 0, e = (Count & 3); i != e; ++i)
-      OW->Write8(0);
-  }
-
-  // We are properly aligned, so write NOPs as requested.
-  Count /= 4;
-  for (uint64_t i = 0; i != Count; ++i)
-    OW->Write32(0xd503201f);
-  return true;
-}
-
-namespace {
-
-namespace CU {
-
-/// \brief Compact unwind encoding values.
-enum CompactUnwindEncodings {
-  /// \brief A "frameless" leaf function, where no non-volatile registers are
-  /// saved. The return remains in LR throughout the function.
-  UNWIND_ARM64_MODE_FRAMELESS = 0x02000000,
-
-  /// \brief No compact unwind encoding available. Instead the low 23-bits of
-  /// the compact unwind encoding is the offset of the DWARF FDE in the
-  /// __eh_frame section. This mode is never used in object files. It is only
-  /// generated by the linker in final linked images, which have only DWARF info
-  /// for a function.
-  UNWIND_ARM64_MODE_DWARF = 0x03000000,
-
-  /// \brief This is a standard arm64 prologue where FP/LR are immediately
-  /// pushed on the stack, then SP is copied to FP. If there are any
-  /// non-volatile register saved, they are copied into the stack fame in pairs
-  /// in a contiguous ranger right below the saved FP/LR pair. Any subset of the
-  /// five X pairs and four D pairs can be saved, but the memory layout must be
-  /// in register number order.
-  UNWIND_ARM64_MODE_FRAME = 0x04000000,
-
-  /// \brief Frame register pair encodings.
-  UNWIND_ARM64_FRAME_X19_X20_PAIR = 0x00000001,
-  UNWIND_ARM64_FRAME_X21_X22_PAIR = 0x00000002,
-  UNWIND_ARM64_FRAME_X23_X24_PAIR = 0x00000004,
-  UNWIND_ARM64_FRAME_X25_X26_PAIR = 0x00000008,
-  UNWIND_ARM64_FRAME_X27_X28_PAIR = 0x00000010,
-  UNWIND_ARM64_FRAME_D8_D9_PAIR = 0x00000100,
-  UNWIND_ARM64_FRAME_D10_D11_PAIR = 0x00000200,
-  UNWIND_ARM64_FRAME_D12_D13_PAIR = 0x00000400,
-  UNWIND_ARM64_FRAME_D14_D15_PAIR = 0x00000800
-};
-
-} // end CU namespace
-
-// FIXME: This should be in a separate file.
-class DarwinARM64AsmBackend : public ARM64AsmBackend {
-  const MCRegisterInfo &MRI;
-
-  /// \brief Encode compact unwind stack adjustment for frameless functions.
-  /// See UNWIND_ARM64_FRAMELESS_STACK_SIZE_MASK in compact_unwind_encoding.h.
-  /// The stack size always needs to be 16 byte aligned.
-  uint32_t encodeStackAdjustment(uint32_t StackSize) const {
-    return (StackSize / 16) << 12;
-  }
-
-public:
-  DarwinARM64AsmBackend(const Target &T, const MCRegisterInfo &MRI)
-      : ARM64AsmBackend(T), MRI(MRI) {}
-
-  MCObjectWriter *createObjectWriter(raw_ostream &OS) const {
-    return createARM64MachObjectWriter(OS, MachO::CPU_TYPE_ARM64,
-                                       MachO::CPU_SUBTYPE_ARM64_ALL);
-  }
-
-  virtual bool doesSectionRequireSymbols(const MCSection &Section) const {
-    // Any section for which the linker breaks things into atoms needs to
-    // preserve symbols, including assembler local symbols, to identify
-    // those atoms. These sections are:
-    // Sections of type:
-    //
-    //    S_CSTRING_LITERALS  (e.g. __cstring)
-    //    S_LITERAL_POINTERS  (e.g.  objc selector pointers)
-    //    S_16BYTE_LITERALS, S_8BYTE_LITERALS, S_4BYTE_LITERALS
-    //
-    // Sections named:
-    //
-    //    __TEXT,__eh_frame
-    //    __TEXT,__ustring
-    //    __DATA,__cfstring
-    //    __DATA,__objc_classrefs
-    //    __DATA,__objc_catlist
-    //
-    // FIXME: It would be better if the compiler used actual linker local
-    // symbols for each of these sections rather than preserving what
-    // are ostensibly assembler local symbols.
-    const MCSectionMachO &SMO = static_cast<const MCSectionMachO &>(Section);
-    return (SMO.getType() == MachO::S_CSTRING_LITERALS ||
-            SMO.getType() == MachO::S_4BYTE_LITERALS ||
-            SMO.getType() == MachO::S_8BYTE_LITERALS ||
-            SMO.getType() == MachO::S_16BYTE_LITERALS ||
-            SMO.getType() == MachO::S_LITERAL_POINTERS ||
-            (SMO.getSegmentName() == "__TEXT" &&
-             (SMO.getSectionName() == "__eh_frame" ||
-              SMO.getSectionName() == "__ustring")) ||
-            (SMO.getSegmentName() == "__DATA" &&
-             (SMO.getSectionName() == "__cfstring" ||
-              SMO.getSectionName() == "__objc_classrefs" ||
-              SMO.getSectionName() == "__objc_catlist")));
-  }
-
-  /// \brief Generate the compact unwind encoding from the CFI directives.
-  virtual uint32_t
-  generateCompactUnwindEncoding(ArrayRef<MCCFIInstruction> Instrs) const
-      override {
-    if (Instrs.empty())
-      return CU::UNWIND_ARM64_MODE_FRAMELESS;
-
-    bool HasFP = false;
-    unsigned StackSize = 0;
-
-    uint32_t CompactUnwindEncoding = 0;
-    for (size_t i = 0, e = Instrs.size(); i != e; ++i) {
-      const MCCFIInstruction &Inst = Instrs[i];
-
-      switch (Inst.getOperation()) {
-      default:
-        // Cannot handle this directive:  bail out.
-        return CU::UNWIND_ARM64_MODE_DWARF;
-      case MCCFIInstruction::OpDefCfa: {
-        // Defines a frame pointer.
-        assert(getXRegFromWReg(MRI.getLLVMRegNum(Inst.getRegister(), true)) ==
-                   ARM64::FP &&
-               "Invalid frame pointer!");
-        assert(i + 2 < e && "Insufficient CFI instructions to define a frame!");
-
-        const MCCFIInstruction &LRPush = Instrs[++i];
-        assert(LRPush.getOperation() == MCCFIInstruction::OpOffset &&
-               "Link register not pushed!");
-        const MCCFIInstruction &FPPush = Instrs[++i];
-        assert(FPPush.getOperation() == MCCFIInstruction::OpOffset &&
-               "Frame pointer not pushed!");
-
-        unsigned LRReg = MRI.getLLVMRegNum(LRPush.getRegister(), true);
-        unsigned FPReg = MRI.getLLVMRegNum(FPPush.getRegister(), true);
-
-        LRReg = getXRegFromWReg(LRReg);
-        FPReg = getXRegFromWReg(FPReg);
-
-        assert(LRReg == ARM64::LR && FPReg == ARM64::FP &&
-               "Pushing invalid registers for frame!");
-
-        // Indicate that the function has a frame.
-        CompactUnwindEncoding |= CU::UNWIND_ARM64_MODE_FRAME;
-        HasFP = true;
-        break;
-      }
-      case MCCFIInstruction::OpDefCfaOffset: {
-        assert(StackSize == 0 && "We already have the CFA offset!");
-        StackSize = std::abs(Inst.getOffset());
-        break;
-      }
-      case MCCFIInstruction::OpOffset: {
-        // Registers are saved in pairs. We expect there to be two consecutive
-        // `.cfi_offset' instructions with the appropriate registers specified.
-        unsigned Reg1 = MRI.getLLVMRegNum(Inst.getRegister(), true);
-        if (i + 1 == e)
-          return CU::UNWIND_ARM64_MODE_DWARF;
-
-        const MCCFIInstruction &Inst2 = Instrs[++i];
-        if (Inst2.getOperation() != MCCFIInstruction::OpOffset)
-          return CU::UNWIND_ARM64_MODE_DWARF;
-        unsigned Reg2 = MRI.getLLVMRegNum(Inst2.getRegister(), true);
-
-        // N.B. The encodings must be in register number order, and the X
-        // registers before the D registers.
-
-        // X19/X20 pair = 0x00000001,
-        // X21/X22 pair = 0x00000002,
-        // X23/X24 pair = 0x00000004,
-        // X25/X26 pair = 0x00000008,
-        // X27/X28 pair = 0x00000010
-        Reg1 = getXRegFromWReg(Reg1);
-        Reg2 = getXRegFromWReg(Reg2);
-
-        if (Reg1 == ARM64::X19 && Reg2 == ARM64::X20 &&
-            (CompactUnwindEncoding & 0xF1E) == 0)
-          CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_X19_X20_PAIR;
-        else if (Reg1 == ARM64::X21 && Reg2 == ARM64::X22 &&
-                 (CompactUnwindEncoding & 0xF1C) == 0)
-          CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_X21_X22_PAIR;
-        else if (Reg1 == ARM64::X23 && Reg2 == ARM64::X24 &&
-                 (CompactUnwindEncoding & 0xF18) == 0)
-          CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_X23_X24_PAIR;
-        else if (Reg1 == ARM64::X25 && Reg2 == ARM64::X26 &&
-                 (CompactUnwindEncoding & 0xF10) == 0)
-          CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_X25_X26_PAIR;
-        else if (Reg1 == ARM64::X27 && Reg2 == ARM64::X28 &&
-                 (CompactUnwindEncoding & 0xF00) == 0)
-          CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_X27_X28_PAIR;
-        else {
-          Reg1 = getDRegFromBReg(Reg1);
-          Reg2 = getDRegFromBReg(Reg2);
-
-          // D8/D9 pair   = 0x00000100,
-          // D10/D11 pair = 0x00000200,
-          // D12/D13 pair = 0x00000400,
-          // D14/D15 pair = 0x00000800
-          if (Reg1 == ARM64::D8 && Reg2 == ARM64::D9 &&
-              (CompactUnwindEncoding & 0xE00) == 0)
-            CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_D8_D9_PAIR;
-          else if (Reg1 == ARM64::D10 && Reg2 == ARM64::D11 &&
-                   (CompactUnwindEncoding & 0xC00) == 0)
-            CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_D10_D11_PAIR;
-          else if (Reg1 == ARM64::D12 && Reg2 == ARM64::D13 &&
-                   (CompactUnwindEncoding & 0x800) == 0)
-            CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_D12_D13_PAIR;
-          else if (Reg1 == ARM64::D14 && Reg2 == ARM64::D15)
-            CompactUnwindEncoding |= CU::UNWIND_ARM64_FRAME_D14_D15_PAIR;
-          else
-            // A pair was pushed which we cannot handle.
-            return CU::UNWIND_ARM64_MODE_DWARF;
-        }
-
-        break;
-      }
-      }
-    }
-
-    if (!HasFP) {
-      // With compact unwind info we can only represent stack adjustments of up
-      // to 65520 bytes.
-      if (StackSize > 65520)
-        return CU::UNWIND_ARM64_MODE_DWARF;
-
-      CompactUnwindEncoding |= CU::UNWIND_ARM64_MODE_FRAMELESS;
-      CompactUnwindEncoding |= encodeStackAdjustment(StackSize);
-    }
-
-    return CompactUnwindEncoding;
-  }
-};
-
-} // end anonymous namespace
-
-namespace {
-
-class ELFARM64AsmBackend : public ARM64AsmBackend {
-public:
-  uint8_t OSABI;
-
-  ELFARM64AsmBackend(const Target &T, uint8_t OSABI)
-      : ARM64AsmBackend(T), OSABI(OSABI) {}
-
-  MCObjectWriter *createObjectWriter(raw_ostream &OS) const {
-    return createARM64ELFObjectWriter(OS, OSABI);
-  }
-
-  void processFixupValue(const MCAssembler &Asm, const MCAsmLayout &Layout,
-                         const MCFixup &Fixup, const MCFragment *DF,
-                         const MCValue &Target, uint64_t &Value,
-                         bool &IsResolved) override;
-};
-
-void ELFARM64AsmBackend::processFixupValue(const MCAssembler &Asm,
-                                           const MCAsmLayout &Layout,
-                                           const MCFixup &Fixup,
-                                           const MCFragment *DF,
-                                           const MCValue &Target,
-                                           uint64_t &Value, bool &IsResolved) {
-  // The ADRP instruction adds some multiple of 0x1000 to the current PC &
-  // ~0xfff. This means that the required offset to reach a symbol can vary by
-  // up to one step depending on where the ADRP is in memory. For example:
-  //
-  //     ADRP x0, there
-  //  there:
-  //
-  // If the ADRP occurs at address 0xffc then "there" will be at 0x1000 and
-  // we'll need that as an offset. At any other address "there" will be in the
-  // same page as the ADRP and the instruction should encode 0x0. Assuming the
-  // section isn't 0x1000-aligned, we therefore need to delegate this decision
-  // to the linker -- a relocation!
-  if ((uint32_t)Fixup.getKind() == ARM64::fixup_arm64_pcrel_adrp_imm21)
-    IsResolved = false;
-}
-}
-
-MCAsmBackend *llvm::createARM64AsmBackend(const Target &T,
-                                          const MCRegisterInfo &MRI,
-                                          StringRef TT, StringRef CPU) {
-  Triple TheTriple(TT);
-
-  if (TheTriple.isOSDarwin())
-    return new DarwinARM64AsmBackend(T, MRI);
-
-  assert(TheTriple.isOSBinFormatELF() && "Expect either MachO or ELF target");
-  return new ELFARM64AsmBackend(T, TheTriple.getOS());
-}
diff --git a/lib/Target/ARM64/MCTargetDesc/ARM64BaseInfo.h b/lib/Target/ARM64/MCTargetDesc/ARM64BaseInfo.h
deleted file mode 100644
index d3c2cf7..0000000
--- a/lib/Target/ARM64/MCTargetDesc/ARM64BaseInfo.h
+++ /dev/null
@@ -1,998 +0,0 @@
-//===-- ARM64BaseInfo.h - Top level definitions for ARM64 -------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains small standalone helper functions and enum definitions for
-// the ARM64 target useful for the compiler back-end and the MC libraries.
-// As such, it deliberately does not include references to LLVM core
-// code gen types, passes, etc..
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef ARM64BASEINFO_H
-#define ARM64BASEINFO_H
-
-#include "ARM64MCTargetDesc.h"
-#include "llvm/Support/ErrorHandling.h"
-
-namespace llvm {
-
-inline static unsigned getWRegFromXReg(unsigned Reg) {
-  switch (Reg) {
-  case ARM64::X0: return ARM64::W0;
-  case ARM64::X1: return ARM64::W1;
-  case ARM64::X2: return ARM64::W2;
-  case ARM64::X3: return ARM64::W3;
-  case ARM64::X4: return ARM64::W4;
-  case ARM64::X5: return ARM64::W5;
-  case ARM64::X6: return ARM64::W6;
-  case ARM64::X7: return ARM64::W7;
-  case ARM64::X8: return ARM64::W8;
-  case ARM64::X9: return ARM64::W9;
-  case ARM64::X10: return ARM64::W10;
-  case ARM64::X11: return ARM64::W11;
-  case ARM64::X12: return ARM64::W12;
-  case ARM64::X13: return ARM64::W13;
-  case ARM64::X14: return ARM64::W14;
-  case ARM64::X15: return ARM64::W15;
-  case ARM64::X16: return ARM64::W16;
-  case ARM64::X17: return ARM64::W17;
-  case ARM64::X18: return ARM64::W18;
-  case ARM64::X19: return ARM64::W19;
-  case ARM64::X20: return ARM64::W20;
-  case ARM64::X21: return ARM64::W21;
-  case ARM64::X22: return ARM64::W22;
-  case ARM64::X23: return ARM64::W23;
-  case ARM64::X24: return ARM64::W24;
-  case ARM64::X25: return ARM64::W25;
-  case ARM64::X26: return ARM64::W26;
-  case ARM64::X27: return ARM64::W27;
-  case ARM64::X28: return ARM64::W28;
-  case ARM64::FP: return ARM64::W29;
-  case ARM64::LR: return ARM64::W30;
-  case ARM64::SP: return ARM64::WSP;
-  case ARM64::XZR: return ARM64::WZR;
-  }
-  // For anything else, return it unchanged.
-  return Reg;
-}
-
-inline static unsigned getXRegFromWReg(unsigned Reg) {
-  switch (Reg) {
-  case ARM64::W0: return ARM64::X0;
-  case ARM64::W1: return ARM64::X1;
-  case ARM64::W2: return ARM64::X2;
-  case ARM64::W3: return ARM64::X3;
-  case ARM64::W4: return ARM64::X4;
-  case ARM64::W5: return ARM64::X5;
-  case ARM64::W6: return ARM64::X6;
-  case ARM64::W7: return ARM64::X7;
-  case ARM64::W8: return ARM64::X8;
-  case ARM64::W9: return ARM64::X9;
-  case ARM64::W10: return ARM64::X10;
-  case ARM64::W11: return ARM64::X11;
-  case ARM64::W12: return ARM64::X12;
-  case ARM64::W13: return ARM64::X13;
-  case ARM64::W14: return ARM64::X14;
-  case ARM64::W15: return ARM64::X15;
-  case ARM64::W16: return ARM64::X16;
-  case ARM64::W17: return ARM64::X17;
-  case ARM64::W18: return ARM64::X18;
-  case ARM64::W19: return ARM64::X19;
-  case ARM64::W20: return ARM64::X20;
-  case ARM64::W21: return ARM64::X21;
-  case ARM64::W22: return ARM64::X22;
-  case ARM64::W23: return ARM64::X23;
-  case ARM64::W24: return ARM64::X24;
-  case ARM64::W25: return ARM64::X25;
-  case ARM64::W26: return ARM64::X26;
-  case ARM64::W27: return ARM64::X27;
-  case ARM64::W28: return ARM64::X28;
-  case ARM64::W29: return ARM64::FP;
-  case ARM64::W30: return ARM64::LR;
-  case ARM64::WSP: return ARM64::SP;
-  case ARM64::WZR: return ARM64::XZR;
-  }
-  // For anything else, return it unchanged.
-  return Reg;
-}
-
-static inline unsigned getBRegFromDReg(unsigned Reg) {
-  switch (Reg) {
-  case ARM64::D0:  return ARM64::B0;
-  case ARM64::D1:  return ARM64::B1;
-  case ARM64::D2:  return ARM64::B2;
-  case ARM64::D3:  return ARM64::B3;
-  case ARM64::D4:  return ARM64::B4;
-  case ARM64::D5:  return ARM64::B5;
-  case ARM64::D6:  return ARM64::B6;
-  case ARM64::D7:  return ARM64::B7;
-  case ARM64::D8:  return ARM64::B8;
-  case ARM64::D9:  return ARM64::B9;
-  case ARM64::D10: return ARM64::B10;
-  case ARM64::D11: return ARM64::B11;
-  case ARM64::D12: return ARM64::B12;
-  case ARM64::D13: return ARM64::B13;
-  case ARM64::D14: return ARM64::B14;
-  case ARM64::D15: return ARM64::B15;
-  case ARM64::D16: return ARM64::B16;
-  case ARM64::D17: return ARM64::B17;
-  case ARM64::D18: return ARM64::B18;
-  case ARM64::D19: return ARM64::B19;
-  case ARM64::D20: return ARM64::B20;
-  case ARM64::D21: return ARM64::B21;
-  case ARM64::D22: return ARM64::B22;
-  case ARM64::D23: return ARM64::B23;
-  case ARM64::D24: return ARM64::B24;
-  case ARM64::D25: return ARM64::B25;
-  case ARM64::D26: return ARM64::B26;
-  case ARM64::D27: return ARM64::B27;
-  case ARM64::D28: return ARM64::B28;
-  case ARM64::D29: return ARM64::B29;
-  case ARM64::D30: return ARM64::B30;
-  case ARM64::D31: return ARM64::B31;
-  }
-  // For anything else, return it unchanged.
-  return Reg;
-}
-
-
-static inline unsigned getDRegFromBReg(unsigned Reg) {
-  switch (Reg) {
-  case ARM64::B0:  return ARM64::D0;
-  case ARM64::B1:  return ARM64::D1;
-  case ARM64::B2:  return ARM64::D2;
-  case ARM64::B3:  return ARM64::D3;
-  case ARM64::B4:  return ARM64::D4;
-  case ARM64::B5:  return ARM64::D5;
-  case ARM64::B6:  return ARM64::D6;
-  case ARM64::B7:  return ARM64::D7;
-  case ARM64::B8:  return ARM64::D8;
-  case ARM64::B9:  return ARM64::D9;
-  case ARM64::B10: return ARM64::D10;
-  case ARM64::B11: return ARM64::D11;
-  case ARM64::B12: return ARM64::D12;
-  case ARM64::B13: return ARM64::D13;
-  case ARM64::B14: return ARM64::D14;
-  case ARM64::B15: return ARM64::D15;
-  case ARM64::B16: return ARM64::D16;
-  case ARM64::B17: return ARM64::D17;
-  case ARM64::B18: return ARM64::D18;
-  case ARM64::B19: return ARM64::D19;
-  case ARM64::B20: return ARM64::D20;
-  case ARM64::B21: return ARM64::D21;
-  case ARM64::B22: return ARM64::D22;
-  case ARM64::B23: return ARM64::D23;
-  case ARM64::B24: return ARM64::D24;
-  case ARM64::B25: return ARM64::D25;
-  case ARM64::B26: return ARM64::D26;
-  case ARM64::B27: return ARM64::D27;
-  case ARM64::B28: return ARM64::D28;
-  case ARM64::B29: return ARM64::D29;
-  case ARM64::B30: return ARM64::D30;
-  case ARM64::B31: return ARM64::D31;
-  }
-  // For anything else, return it unchanged.
-  return Reg;
-}
-
-namespace ARM64CC {
-
-// The CondCodes constants map directly to the 4-bit encoding of the condition
-// field for predicated instructions.
-enum CondCode {  // Meaning (integer)          Meaning (floating-point)
-  EQ = 0x0,      // Equal                      Equal
-  NE = 0x1,      // Not equal                  Not equal, or unordered
-  CS = 0x2,      // Carry set                  >, ==, or unordered
-  CC = 0x3,      // Carry clear                Less than
-  MI = 0x4,      // Minus, negative            Less than
-  PL = 0x5,      // Plus, positive or zero     >, ==, or unordered
-  VS = 0x6,      // Overflow                   Unordered
-  VC = 0x7,      // No overflow                Not unordered
-  HI = 0x8,      // Unsigned higher            Greater than, or unordered
-  LS = 0x9,      // Unsigned lower or same     Less than or equal
-  GE = 0xa,      // Greater than or equal      Greater than or equal
-  LT = 0xb,      // Less than                  Less than, or unordered
-  GT = 0xc,      // Greater than               Greater than
-  LE = 0xd,      // Less than or equal         <, ==, or unordered
-  AL = 0xe       // Always (unconditional)     Always (unconditional)
-};
-
-inline static const char *getCondCodeName(CondCode Code) {
-  // cond<0> is ignored when cond<3:1> = 111, where 1110 is 0xe (aka AL).
-  if ((Code & AL) == AL)
-    Code = AL;
-  switch (Code) {
-  case EQ:  return "eq";
-  case NE:  return "ne";
-  case CS:  return "cs";
-  case CC:  return "cc";
-  case MI:  return "mi";
-  case PL:  return "pl";
-  case VS:  return "vs";
-  case VC:  return "vc";
-  case HI:  return "hi";
-  case LS:  return "ls";
-  case GE:  return "ge";
-  case LT:  return "lt";
-  case GT:  return "gt";
-  case LE:  return "le";
-  case AL:  return "al";
-  }
-  llvm_unreachable("Unknown condition code");
-}
-
-inline static CondCode getInvertedCondCode(CondCode Code) {
-  switch (Code) {
-  default: llvm_unreachable("Unknown condition code");
-  case EQ:  return NE;
-  case NE:  return EQ;
-  case CS:  return CC;
-  case CC:  return CS;
-  case MI:  return PL;
-  case PL:  return MI;
-  case VS:  return VC;
-  case VC:  return VS;
-  case HI:  return LS;
-  case LS:  return HI;
-  case GE:  return LT;
-  case LT:  return GE;
-  case GT:  return LE;
-  case LE:  return GT;
-  }
-}
-
-/// Given a condition code, return NZCV flags that would satisfy that condition.
-/// The flag bits are in the format expected by the ccmp instructions.
-/// Note that many different flag settings can satisfy a given condition code,
-/// this function just returns one of them.
-inline static unsigned getNZCVToSatisfyCondCode(CondCode Code) {
-  // NZCV flags encoded as expected by ccmp instructions, ARMv8 ISA 5.5.7.
-  enum { N = 8, Z = 4, C = 2, V = 1 };
-  switch (Code) {
-  default: llvm_unreachable("Unknown condition code");
-  case EQ: return Z; // Z == 1
-  case NE: return 0; // Z == 0
-  case CS: return C; // C == 1
-  case CC: return 0; // C == 0
-  case MI: return N; // N == 1
-  case PL: return 0; // N == 0
-  case VS: return V; // V == 1
-  case VC: return 0; // V == 0
-  case HI: return C; // C == 1 && Z == 0
-  case LS: return 0; // C == 0 || Z == 1
-  case GE: return 0; // N == V
-  case LT: return N; // N != V
-  case GT: return 0; // Z == 0 && N == V
-  case LE: return Z; // Z == 1 || N != V
-  }
-}
-} // end namespace ARM64CC
-
-namespace ARM64SYS {
-enum BarrierOption {
-  InvalidBarrier = 0xff,
-  OSHLD = 0x1,
-  OSHST = 0x2,
-  OSH =   0x3,
-  NSHLD = 0x5,
-  NSHST = 0x6,
-  NSH =   0x7,
-  ISHLD = 0x9,
-  ISHST = 0xa,
-  ISH =   0xb,
-  LD =    0xd,
-  ST =    0xe,
-  SY =    0xf
-};
-
-inline static const char *getBarrierOptName(BarrierOption Opt) {
-  switch (Opt) {
-  default: return NULL;
-  case 0x1: return "oshld";
-  case 0x2: return "oshst";
-  case 0x3: return "osh";
-  case 0x5: return "nshld";
-  case 0x6: return "nshst";
-  case 0x7: return "nsh";
-  case 0x9: return "ishld";
-  case 0xa: return "ishst";
-  case 0xb: return "ish";
-  case 0xd: return "ld";
-  case 0xe: return "st";
-  case 0xf: return "sy";
-  }
-}
-
-#define A64_SYSREG_ENC(op0,CRn,op2,CRm,op1) ((op0) << 14 | (op1) << 11 | \
-                                             (CRn) << 7  | (CRm) << 3 | (op2))
-enum SystemRegister {
-  InvalidSystemReg = 0,
-  // Table in section 3.10.3
-  SPSR_EL1  = 0xc200,
-  SPSR_svc  = SPSR_EL1,
-  ELR_EL1   = 0xc201,
-  SP_EL0    = 0xc208,
-  SPSel     = 0xc210,
-  CurrentEL = 0xc212,
-  DAIF      = 0xda11,
-  NZCV      = 0xda10,
-  FPCR      = 0xda20,
-  FPSR      = 0xda21,
-  DSPSR     = 0xda28,
-  DLR       = 0xda29,
-  SPSR_EL2  = 0xe200,
-  SPSR_hyp  = SPSR_EL2,
-  ELR_EL2   = 0xe201,
-  SP_EL1    = 0xe208,
-  SPSR_irq  = 0xe218,
-  SPSR_abt  = 0xe219,
-  SPSR_und  = 0xe21a,
-  SPSR_fiq  = 0xe21b,
-  SPSR_EL3  = 0xf200,
-  ELR_EL3   = 0xf201,
-  SP_EL2    = 0xf208,
-
-
-  // Table in section 3.10.8
-  MIDR_EL1 = 0xc000,
-  CTR_EL0 = 0xd801,
-  MPIDR_EL1 = 0xc005,
-  ECOIDR_EL1 = 0xc006,
-  DCZID_EL0 = 0xd807,
-  MVFR0_EL1 = 0xc018,
-  MVFR1_EL1 = 0xc019,
-  ID_AA64PFR0_EL1 = 0xc020,
-  ID_AA64PFR1_EL1 = 0xc021,
-  ID_AA64DFR0_EL1 = 0xc028,
-  ID_AA64DFR1_EL1 = 0xc029,
-  ID_AA64ISAR0_EL1 = 0xc030,
-  ID_AA64ISAR1_EL1 = 0xc031,
-  ID_AA64MMFR0_EL1 = 0xc038,
-  ID_AA64MMFR1_EL1 = 0xc039,
-  CCSIDR_EL1 = 0xc800,
-  CLIDR_EL1 = 0xc801,
-  AIDR_EL1 = 0xc807,
-  CSSELR_EL1 = 0xd000,
-  VPIDR_EL2 = 0xe000,
-  VMPIDR_EL2 = 0xe005,
-  SCTLR_EL1 = 0xc080,
-  SCTLR_EL2 = 0xe080,
-  SCTLR_EL3 = 0xf080,
-  ACTLR_EL1 = 0xc081,
-  ACTLR_EL2 = 0xe081,
-  ACTLR_EL3 = 0xf081,
-  CPACR_EL1 = 0xc082,
-  CPTR_EL2 = 0xe08a,
-  CPTR_EL3 = 0xf08a,
-  SCR_EL3 = 0xf088,
-  HCR_EL2 = 0xe088,
-  MDCR_EL2 = 0xe089,
-  MDCR_EL3 = 0xf099,
-  HSTR_EL2 = 0xe08b,
-  HACR_EL2 = 0xe08f,
-  TTBR0_EL1 = 0xc100,
-  TTBR1_EL1 = 0xc101,
-  TTBR0_EL2 = 0xe100,
-  TTBR0_EL3 = 0xf100,
-  VTTBR_EL2 = 0xe108,
-  TCR_EL1 = 0xc102,
-  TCR_EL2 = 0xe102,
-  TCR_EL3 = 0xf102,
-  VTCR_EL2 = 0xe10a,
-  ADFSR_EL1 = 0xc288,
-  AIFSR_EL1 = 0xc289,
-  ADFSR_EL2 = 0xe288,
-  AIFSR_EL2 = 0xe289,
-  ADFSR_EL3 = 0xf288,
-  AIFSR_EL3 = 0xf289,
-  ESR_EL1 = 0xc290,
-  ESR_EL2 = 0xe290,
-  ESR_EL3 = 0xf290,
-  FAR_EL1 = 0xc300,
-  FAR_EL2 = 0xe300,
-  FAR_EL3 = 0xf300,
-  HPFAR_EL2 = 0xe304,
-  PAR_EL1 = 0xc3a0,
-  MAIR_EL1 = 0xc510,
-  MAIR_EL2 = 0xe510,
-  MAIR_EL3 = 0xf510,
-  AMAIR_EL1 = 0xc518,
-  AMAIR_EL2 = 0xe518,
-  AMAIR_EL3 = 0xf518,
-  VBAR_EL1 = 0xc600,
-  VBAR_EL2 = 0xe600,
-  VBAR_EL3 = 0xf600,
-  RVBAR_EL1 = 0xc601,
-  RVBAR_EL2 = 0xe601,
-  RVBAR_EL3 = 0xf601,
-  ISR_EL1 = 0xc608,
-  CONTEXTIDR_EL1 = 0xc681,
-  TPIDR_EL0 = 0xde82,
-  TPIDRRO_EL0 = 0xde83,
-  TPIDR_EL1 = 0xc684,
-  TPIDR_EL2 = 0xe682,
-  TPIDR_EL3 = 0xf682,
-  TEECR32_EL1 = 0x9000,
-  CNTFRQ_EL0 = 0xdf00,
-  CNTPCT_EL0 = 0xdf01,
-  CNTVCT_EL0 = 0xdf02,
-  CNTVOFF_EL2 = 0xe703,
-  CNTKCTL_EL1 = 0xc708,
-  CNTHCTL_EL2 = 0xe708,
-  CNTP_TVAL_EL0 = 0xdf10,
-  CNTP_CTL_EL0 = 0xdf11,
-  CNTP_CVAL_EL0 = 0xdf12,
-  CNTV_TVAL_EL0 = 0xdf18,
-  CNTV_CTL_EL0 = 0xdf19,
-  CNTV_CVAL_EL0 = 0xdf1a,
-  CNTHP_TVAL_EL2 = 0xe710,
-  CNTHP_CTL_EL2 = 0xe711,
-  CNTHP_CVAL_EL2 = 0xe712,
-  CNTPS_TVAL_EL1 = 0xff10,
-  CNTPS_CTL_EL1 = 0xff11,
-  CNTPS_CVAL_EL1= 0xff12,
-
-  PMEVCNTR0_EL0  = 0xdf40,
-  PMEVCNTR1_EL0  = 0xdf41,
-  PMEVCNTR2_EL0  = 0xdf42,
-  PMEVCNTR3_EL0  = 0xdf43,
-  PMEVCNTR4_EL0  = 0xdf44,
-  PMEVCNTR5_EL0  = 0xdf45,
-  PMEVCNTR6_EL0  = 0xdf46,
-  PMEVCNTR7_EL0  = 0xdf47,
-  PMEVCNTR8_EL0  = 0xdf48,
-  PMEVCNTR9_EL0  = 0xdf49,
-  PMEVCNTR10_EL0 = 0xdf4a,
-  PMEVCNTR11_EL0 = 0xdf4b,
-  PMEVCNTR12_EL0 = 0xdf4c,
-  PMEVCNTR13_EL0 = 0xdf4d,
-  PMEVCNTR14_EL0 = 0xdf4e,
-  PMEVCNTR15_EL0 = 0xdf4f,
-  PMEVCNTR16_EL0 = 0xdf50,
-  PMEVCNTR17_EL0 = 0xdf51,
-  PMEVCNTR18_EL0 = 0xdf52,
-  PMEVCNTR19_EL0 = 0xdf53,
-  PMEVCNTR20_EL0 = 0xdf54,
-  PMEVCNTR21_EL0 = 0xdf55,
-  PMEVCNTR22_EL0 = 0xdf56,
-  PMEVCNTR23_EL0 = 0xdf57,
-  PMEVCNTR24_EL0 = 0xdf58,
-  PMEVCNTR25_EL0 = 0xdf59,
-  PMEVCNTR26_EL0 = 0xdf5a,
-  PMEVCNTR27_EL0 = 0xdf5b,
-  PMEVCNTR28_EL0 = 0xdf5c,
-  PMEVCNTR29_EL0 = 0xdf5d,
-  PMEVCNTR30_EL0 = 0xdf5e,
-
-  PMEVTYPER0_EL0  = 0xdf60,
-  PMEVTYPER1_EL0  = 0xdf61,
-  PMEVTYPER2_EL0  = 0xdf62,
-  PMEVTYPER3_EL0  = 0xdf63,
-  PMEVTYPER4_EL0  = 0xdf64,
-  PMEVTYPER5_EL0  = 0xdf65,
-  PMEVTYPER6_EL0  = 0xdf66,
-  PMEVTYPER7_EL0  = 0xdf67,
-  PMEVTYPER8_EL0  = 0xdf68,
-  PMEVTYPER9_EL0  = 0xdf69,
-  PMEVTYPER10_EL0 = 0xdf6a,
-  PMEVTYPER11_EL0 = 0xdf6b,
-  PMEVTYPER12_EL0 = 0xdf6c,
-  PMEVTYPER13_EL0 = 0xdf6d,
-  PMEVTYPER14_EL0 = 0xdf6e,
-  PMEVTYPER15_EL0 = 0xdf6f,
-  PMEVTYPER16_EL0 = 0xdf70,
-  PMEVTYPER17_EL0 = 0xdf71,
-  PMEVTYPER18_EL0 = 0xdf72,
-  PMEVTYPER19_EL0 = 0xdf73,
-  PMEVTYPER20_EL0 = 0xdf74,
-  PMEVTYPER21_EL0 = 0xdf75,
-  PMEVTYPER22_EL0 = 0xdf76,
-  PMEVTYPER23_EL0 = 0xdf77,
-  PMEVTYPER24_EL0 = 0xdf78,
-  PMEVTYPER25_EL0 = 0xdf79,
-  PMEVTYPER26_EL0 = 0xdf7a,
-  PMEVTYPER27_EL0 = 0xdf7b,
-  PMEVTYPER28_EL0 = 0xdf7c,
-  PMEVTYPER29_EL0 = 0xdf7d,
-  PMEVTYPER30_EL0 = 0xdf7e,
-
-  PMCCFILTR_EL0  = 0xdf7f,
-
-  RMR_EL3 = 0xf602,
-  RMR_EL2 = 0xd602,
-  RMR_EL1 = 0xce02,
-
-  // Debug Architecture 5.3, Table 17.
-  MDCCSR_EL0   = A64_SYSREG_ENC(2, 0, 0, 1, 3),
-  MDCCINT_EL1  = A64_SYSREG_ENC(2, 0, 0, 2, 0),
-  DBGDTR_EL0   = A64_SYSREG_ENC(2, 0, 0, 4, 3),
-  DBGDTRRX_EL0 = A64_SYSREG_ENC(2, 0, 0, 5, 3),
-  DBGDTRTX_EL0 = DBGDTRRX_EL0,
-  DBGVCR32_EL2 = A64_SYSREG_ENC(2, 0, 0, 7, 4),
-  OSDTRRX_EL1  = A64_SYSREG_ENC(2, 0, 2, 0, 0),
-  MDSCR_EL1    = A64_SYSREG_ENC(2, 0, 2, 2, 0),
-  OSDTRTX_EL1  = A64_SYSREG_ENC(2, 0, 2, 3, 0),
-  OSECCR_EL11  = A64_SYSREG_ENC(2, 0, 2, 6, 0),
-
-  DBGBVR0_EL1  = A64_SYSREG_ENC(2, 0, 4, 0, 0),
-  DBGBVR1_EL1  = A64_SYSREG_ENC(2, 0, 4, 1, 0),
-  DBGBVR2_EL1  = A64_SYSREG_ENC(2, 0, 4, 2, 0),
-  DBGBVR3_EL1  = A64_SYSREG_ENC(2, 0, 4, 3, 0),
-  DBGBVR4_EL1  = A64_SYSREG_ENC(2, 0, 4, 4, 0),
-  DBGBVR5_EL1  = A64_SYSREG_ENC(2, 0, 4, 5, 0),
-  DBGBVR6_EL1  = A64_SYSREG_ENC(2, 0, 4, 6, 0),
-  DBGBVR7_EL1  = A64_SYSREG_ENC(2, 0, 4, 7, 0),
-  DBGBVR8_EL1  = A64_SYSREG_ENC(2, 0, 4, 8, 0),
-  DBGBVR9_EL1  = A64_SYSREG_ENC(2, 0, 4, 9, 0),
-  DBGBVR10_EL1 = A64_SYSREG_ENC(2, 0, 4, 10, 0),
-  DBGBVR11_EL1 = A64_SYSREG_ENC(2, 0, 4, 11, 0),
-  DBGBVR12_EL1 = A64_SYSREG_ENC(2, 0, 4, 12, 0),
-  DBGBVR13_EL1 = A64_SYSREG_ENC(2, 0, 4, 13, 0),
-  DBGBVR14_EL1 = A64_SYSREG_ENC(2, 0, 4, 14, 0),
-  DBGBVR15_EL1 = A64_SYSREG_ENC(2, 0, 4, 15, 0),
-
-  DBGBCR0_EL1  = A64_SYSREG_ENC(2, 0, 5, 0, 0),
-  DBGBCR1_EL1  = A64_SYSREG_ENC(2, 0, 5, 1, 0),
-  DBGBCR2_EL1  = A64_SYSREG_ENC(2, 0, 5, 2, 0),
-  DBGBCR3_EL1  = A64_SYSREG_ENC(2, 0, 5, 3, 0),
-  DBGBCR4_EL1  = A64_SYSREG_ENC(2, 0, 5, 4, 0),
-  DBGBCR5_EL1  = A64_SYSREG_ENC(2, 0, 5, 5, 0),
-  DBGBCR6_EL1  = A64_SYSREG_ENC(2, 0, 5, 6, 0),
-  DBGBCR7_EL1  = A64_SYSREG_ENC(2, 0, 5, 7, 0),
-  DBGBCR8_EL1  = A64_SYSREG_ENC(2, 0, 5, 8, 0),
-  DBGBCR9_EL1  = A64_SYSREG_ENC(2, 0, 5, 9, 0),
-  DBGBCR10_EL1 = A64_SYSREG_ENC(2, 0, 5, 10, 0),
-  DBGBCR11_EL1 = A64_SYSREG_ENC(2, 0, 5, 11, 0),
-  DBGBCR12_EL1 = A64_SYSREG_ENC(2, 0, 5, 12, 0),
-  DBGBCR13_EL1 = A64_SYSREG_ENC(2, 0, 5, 13, 0),
-  DBGBCR14_EL1 = A64_SYSREG_ENC(2, 0, 5, 14, 0),
-  DBGBCR15_EL1 = A64_SYSREG_ENC(2, 0, 5, 15, 0),
-
-  DBGWVR0_EL1  = A64_SYSREG_ENC(2, 0, 6, 0, 0),
-  DBGWVR1_EL1  = A64_SYSREG_ENC(2, 0, 6, 1, 0),
-  DBGWVR2_EL1  = A64_SYSREG_ENC(2, 0, 6, 2, 0),
-  DBGWVR3_EL1  = A64_SYSREG_ENC(2, 0, 6, 3, 0),
-  DBGWVR4_EL1  = A64_SYSREG_ENC(2, 0, 6, 4, 0),
-  DBGWVR5_EL1  = A64_SYSREG_ENC(2, 0, 6, 5, 0),
-  DBGWVR6_EL1  = A64_SYSREG_ENC(2, 0, 6, 6, 0),
-  DBGWVR7_EL1  = A64_SYSREG_ENC(2, 0, 6, 7, 0),
-  DBGWVR8_EL1  = A64_SYSREG_ENC(2, 0, 6, 8, 0),
-  DBGWVR9_EL1  = A64_SYSREG_ENC(2, 0, 6, 9, 0),
-  DBGWVR10_EL1 = A64_SYSREG_ENC(2, 0, 6, 10, 0),
-  DBGWVR11_EL1 = A64_SYSREG_ENC(2, 0, 6, 11, 0),
-  DBGWVR12_EL1 = A64_SYSREG_ENC(2, 0, 6, 12, 0),
-  DBGWVR13_EL1 = A64_SYSREG_ENC(2, 0, 6, 13, 0),
-  DBGWVR14_EL1 = A64_SYSREG_ENC(2, 0, 6, 14, 0),
-  DBGWVR15_EL1 = A64_SYSREG_ENC(2, 0, 6, 15, 0),
-
-  DBGWCR0_EL1  = A64_SYSREG_ENC(2, 0, 7, 0, 0),
-  DBGWCR1_EL1  = A64_SYSREG_ENC(2, 0, 7, 1, 0),
-  DBGWCR2_EL1  = A64_SYSREG_ENC(2, 0, 7, 2, 0),
-  DBGWCR3_EL1  = A64_SYSREG_ENC(2, 0, 7, 3, 0),
-  DBGWCR4_EL1  = A64_SYSREG_ENC(2, 0, 7, 4, 0),
-  DBGWCR5_EL1  = A64_SYSREG_ENC(2, 0, 7, 5, 0),
-  DBGWCR6_EL1  = A64_SYSREG_ENC(2, 0, 7, 6, 0),
-  DBGWCR7_EL1  = A64_SYSREG_ENC(2, 0, 7, 7, 0),
-  DBGWCR8_EL1  = A64_SYSREG_ENC(2, 0, 7, 8, 0),
-  DBGWCR9_EL1  = A64_SYSREG_ENC(2, 0, 7, 9, 0),
-  DBGWCR10_EL1 = A64_SYSREG_ENC(2, 0, 7, 10, 0),
-  DBGWCR11_EL1 = A64_SYSREG_ENC(2, 0, 7, 11, 0),
-  DBGWCR12_EL1 = A64_SYSREG_ENC(2, 0, 7, 12, 0),
-  DBGWCR13_EL1 = A64_SYSREG_ENC(2, 0, 7, 13, 0),
-  DBGWCR14_EL1 = A64_SYSREG_ENC(2, 0, 7, 14, 0),
-  DBGWCR15_EL1 = A64_SYSREG_ENC(2, 0, 7, 15, 0),
-
-  MDRAR_EL1    = A64_SYSREG_ENC(2, 1, 0, 0, 0),
-  OSLAR_EL1    = A64_SYSREG_ENC(2, 1, 4, 0, 0),
-  OSLSR_EL1    = A64_SYSREG_ENC(2, 1, 4, 1, 0),
-  OSDLR_EL1    = A64_SYSREG_ENC(2, 1, 4, 3, 0),
-  DBGPRCR_EL1  = A64_SYSREG_ENC(2, 1, 4, 4, 0),
-
-  DBGCLAIMSET_EL1   = A64_SYSREG_ENC(2, 7, 6, 8, 0),
-  DBGCLAIMCLR_EL1   = A64_SYSREG_ENC(2, 7, 6, 9, 0),
-  DBGAUTHSTATUS_EL1 = A64_SYSREG_ENC(2, 7, 6, 14, 0),
-
-  DBGDEVID2    = A64_SYSREG_ENC(2, 7, 7, 0, 0),
-  DBGDEVID1    = A64_SYSREG_ENC(2, 7, 7, 1, 0),
-  DBGDEVID0    = A64_SYSREG_ENC(2, 7, 7, 2, 0),
-
-  // The following registers are defined to allow access from AArch64 to
-  // registers which are only used in the AArch32 architecture.
-  DACR32_EL2 = 0xe180,
-  IFSR32_EL2 = 0xe281,
-  TEEHBR32_EL1 = 0x9080,
-  SDER32_EL3 = 0xf089,
-  FPEXC32_EL2 = 0xe298,
-
-  // Cyclone specific system registers
-  CPM_IOACC_CTL_EL3 = 0xff90,
-
-  // Architectural system registers
-  ID_PFR0_EL1 = 0xc008,
-  ID_PFR1_EL1 = 0xc009,
-  ID_DFR0_EL1 = 0xc00a,
-  ID_AFR0_EL1 = 0xc00b,
-  ID_ISAR0_EL1 = 0xc010,
-  ID_ISAR1_EL1 = 0xc011,
-  ID_ISAR2_EL1 = 0xc012,
-  ID_ISAR3_EL1 = 0xc013,
-  ID_ISAR4_EL1 = 0xc014,
-  ID_ISAR5_EL1 = 0xc015,
-  AFSR1_EL1 = 0xc289, // note same as old AIFSR_EL1
-  AFSR0_EL1 = 0xc288, // note same as old ADFSR_EL1
-  REVIDR_EL1 = 0xc006 // note same as old ECOIDR_EL1
-
-};
-#undef A64_SYSREG_ENC
-
-static inline const char *getSystemRegisterName(SystemRegister Reg) {
-  switch(Reg) {
-  default: return NULL; // Caller is responsible for handling invalid value.
-  case SPSR_EL1: return "SPSR_EL1";
-  case ELR_EL1: return "ELR_EL1";
-  case SP_EL0: return "SP_EL0";
-  case SPSel: return "SPSel";
-  case DAIF: return "DAIF";
-  case CurrentEL: return "CurrentEL";
-  case NZCV: return "NZCV";
-  case FPCR: return "FPCR";
-  case FPSR: return "FPSR";
-  case DSPSR: return "DSPSR";
-  case DLR: return "DLR";
-  case SPSR_EL2: return "SPSR_EL2";
-  case ELR_EL2: return "ELR_EL2";
-  case SP_EL1: return "SP_EL1";
-  case SPSR_irq: return "SPSR_irq";
-  case SPSR_abt: return "SPSR_abt";
-  case SPSR_und: return "SPSR_und";
-  case SPSR_fiq: return "SPSR_fiq";
-  case SPSR_EL3: return "SPSR_EL3";
-  case ELR_EL3: return "ELR_EL3";
-  case SP_EL2: return "SP_EL2";
-  case MIDR_EL1: return "MIDR_EL1";
-  case CTR_EL0: return "CTR_EL0";
-  case MPIDR_EL1: return "MPIDR_EL1";
-  case DCZID_EL0: return "DCZID_EL0";
-  case MVFR0_EL1: return "MVFR0_EL1";
-  case MVFR1_EL1: return "MVFR1_EL1";
-  case ID_AA64PFR0_EL1: return "ID_AA64PFR0_EL1";
-  case ID_AA64PFR1_EL1: return "ID_AA64PFR1_EL1";
-  case ID_AA64DFR0_EL1: return "ID_AA64DFR0_EL1";
-  case ID_AA64DFR1_EL1: return "ID_AA64DFR1_EL1";
-  case ID_AA64ISAR0_EL1: return "ID_AA64ISAR0_EL1";
-  case ID_AA64ISAR1_EL1: return "ID_AA64ISAR1_EL1";
-  case ID_AA64MMFR0_EL1: return "ID_AA64MMFR0_EL1";
-  case ID_AA64MMFR1_EL1: return "ID_AA64MMFR1_EL1";
-  case CCSIDR_EL1: return "CCSIDR_EL1";
-  case CLIDR_EL1: return "CLIDR_EL1";
-  case AIDR_EL1: return "AIDR_EL1";
-  case CSSELR_EL1: return "CSSELR_EL1";
-  case VPIDR_EL2: return "VPIDR_EL2";
-  case VMPIDR_EL2: return "VMPIDR_EL2";
-  case SCTLR_EL1: return "SCTLR_EL1";
-  case SCTLR_EL2: return "SCTLR_EL2";
-  case SCTLR_EL3: return "SCTLR_EL3";
-  case ACTLR_EL1: return "ACTLR_EL1";
-  case ACTLR_EL2: return "ACTLR_EL2";
-  case ACTLR_EL3: return "ACTLR_EL3";
-  case CPACR_EL1: return "CPACR_EL1";
-  case CPTR_EL2: return "CPTR_EL2";
-  case CPTR_EL3: return "CPTR_EL3";
-  case SCR_EL3: return "SCR_EL3";
-  case HCR_EL2: return "HCR_EL2";
-  case MDCR_EL2: return "MDCR_EL2";
-  case MDCR_EL3: return "MDCR_EL3";
-  case HSTR_EL2: return "HSTR_EL2";
-  case HACR_EL2: return "HACR_EL2";
-  case TTBR0_EL1: return "TTBR0_EL1";
-  case TTBR1_EL1: return "TTBR1_EL1";
-  case TTBR0_EL2: return "TTBR0_EL2";
-  case TTBR0_EL3: return "TTBR0_EL3";
-  case VTTBR_EL2: return "VTTBR_EL2";
-  case TCR_EL1: return "TCR_EL1";
-  case TCR_EL2: return "TCR_EL2";
-  case TCR_EL3: return "TCR_EL3";
-  case VTCR_EL2: return "VTCR_EL2";
-  case ADFSR_EL2: return "ADFSR_EL2";
-  case AIFSR_EL2: return "AIFSR_EL2";
-  case ADFSR_EL3: return "ADFSR_EL3";
-  case AIFSR_EL3: return "AIFSR_EL3";
-  case ESR_EL1: return "ESR_EL1";
-  case ESR_EL2: return "ESR_EL2";
-  case ESR_EL3: return "ESR_EL3";
-  case FAR_EL1: return "FAR_EL1";
-  case FAR_EL2: return "FAR_EL2";
-  case FAR_EL3: return "FAR_EL3";
-  case HPFAR_EL2: return "HPFAR_EL2";
-  case PAR_EL1: return "PAR_EL1";
-  case MAIR_EL1: return "MAIR_EL1";
-  case MAIR_EL2: return "MAIR_EL2";
-  case MAIR_EL3: return "MAIR_EL3";
-  case AMAIR_EL1: return "AMAIR_EL1";
-  case AMAIR_EL2: return "AMAIR_EL2";
-  case AMAIR_EL3: return "AMAIR_EL3";
-  case VBAR_EL1: return "VBAR_EL1";
-  case VBAR_EL2: return "VBAR_EL2";
-  case VBAR_EL3: return "VBAR_EL3";
-  case RVBAR_EL1: return "RVBAR_EL1";
-  case RVBAR_EL2: return "RVBAR_EL2";
-  case RVBAR_EL3: return "RVBAR_EL3";
-  case ISR_EL1: return "ISR_EL1";
-  case CONTEXTIDR_EL1: return "CONTEXTIDR_EL1";
-  case TPIDR_EL0: return "TPIDR_EL0";
-  case TPIDRRO_EL0: return "TPIDRRO_EL0";
-  case TPIDR_EL1: return "TPIDR_EL1";
-  case TPIDR_EL2: return "TPIDR_EL2";
-  case TPIDR_EL3: return "TPIDR_EL3";
-  case TEECR32_EL1: return "TEECR32_EL1";
-  case CNTFRQ_EL0: return "CNTFRQ_EL0";
-  case CNTPCT_EL0: return "CNTPCT_EL0";
-  case CNTVCT_EL0: return "CNTVCT_EL0";
-  case CNTVOFF_EL2: return "CNTVOFF_EL2";
-  case CNTKCTL_EL1: return "CNTKCTL_EL1";
-  case CNTHCTL_EL2: return "CNTHCTL_EL2";
-  case CNTP_TVAL_EL0: return "CNTP_TVAL_EL0";
-  case CNTP_CTL_EL0: return "CNTP_CTL_EL0";
-  case CNTP_CVAL_EL0: return "CNTP_CVAL_EL0";
-  case CNTV_TVAL_EL0: return "CNTV_TVAL_EL0";
-  case CNTV_CTL_EL0: return "CNTV_CTL_EL0";
-  case CNTV_CVAL_EL0: return "CNTV_CVAL_EL0";
-  case CNTHP_TVAL_EL2: return "CNTHP_TVAL_EL2";
-  case CNTHP_CTL_EL2: return "CNTHP_CTL_EL2";
-  case CNTHP_CVAL_EL2: return "CNTHP_CVAL_EL2";
-  case CNTPS_TVAL_EL1: return "CNTPS_TVAL_EL1";
-  case CNTPS_CTL_EL1: return "CNTPS_CTL_EL1";
-  case CNTPS_CVAL_EL1: return "CNTPS_CVAL_EL1";
-  case DACR32_EL2: return "DACR32_EL2";
-  case IFSR32_EL2: return "IFSR32_EL2";
-  case TEEHBR32_EL1: return "TEEHBR32_EL1";
-  case SDER32_EL3: return "SDER32_EL3";
-  case FPEXC32_EL2: return "FPEXC32_EL2";
-  case PMEVCNTR0_EL0: return "PMEVCNTR0_EL0";
-  case PMEVCNTR1_EL0: return "PMEVCNTR1_EL0";
-  case PMEVCNTR2_EL0: return "PMEVCNTR2_EL0";
-  case PMEVCNTR3_EL0: return "PMEVCNTR3_EL0";
-  case PMEVCNTR4_EL0: return "PMEVCNTR4_EL0";
-  case PMEVCNTR5_EL0: return "PMEVCNTR5_EL0";
-  case PMEVCNTR6_EL0: return "PMEVCNTR6_EL0";
-  case PMEVCNTR7_EL0: return "PMEVCNTR7_EL0";
-  case PMEVCNTR8_EL0: return "PMEVCNTR8_EL0";
-  case PMEVCNTR9_EL0: return "PMEVCNTR9_EL0";
-  case PMEVCNTR10_EL0: return "PMEVCNTR10_EL0";
-  case PMEVCNTR11_EL0: return "PMEVCNTR11_EL0";
-  case PMEVCNTR12_EL0: return "PMEVCNTR12_EL0";
-  case PMEVCNTR13_EL0: return "PMEVCNTR13_EL0";
-  case PMEVCNTR14_EL0: return "PMEVCNTR14_EL0";
-  case PMEVCNTR15_EL0: return "PMEVCNTR15_EL0";
-  case PMEVCNTR16_EL0: return "PMEVCNTR16_EL0";
-  case PMEVCNTR17_EL0: return "PMEVCNTR17_EL0";
-  case PMEVCNTR18_EL0: return "PMEVCNTR18_EL0";
-  case PMEVCNTR19_EL0: return "PMEVCNTR19_EL0";
-  case PMEVCNTR20_EL0: return "PMEVCNTR20_EL0";
-  case PMEVCNTR21_EL0: return "PMEVCNTR21_EL0";
-  case PMEVCNTR22_EL0: return "PMEVCNTR22_EL0";
-  case PMEVCNTR23_EL0: return "PMEVCNTR23_EL0";
-  case PMEVCNTR24_EL0: return "PMEVCNTR24_EL0";
-  case PMEVCNTR25_EL0: return "PMEVCNTR25_EL0";
-  case PMEVCNTR26_EL0: return "PMEVCNTR26_EL0";
-  case PMEVCNTR27_EL0: return "PMEVCNTR27_EL0";
-  case PMEVCNTR28_EL0: return "PMEVCNTR28_EL0";
-  case PMEVCNTR29_EL0: return "PMEVCNTR29_EL0";
-  case PMEVCNTR30_EL0: return "PMEVCNTR30_EL0";
-  case PMEVTYPER0_EL0: return "PMEVTYPER0_EL0";
-  case PMEVTYPER1_EL0: return "PMEVTYPER1_EL0";
-  case PMEVTYPER2_EL0: return "PMEVTYPER2_EL0";
-  case PMEVTYPER3_EL0: return "PMEVTYPER3_EL0";
-  case PMEVTYPER4_EL0: return "PMEVTYPER4_EL0";
-  case PMEVTYPER5_EL0: return "PMEVTYPER5_EL0";
-  case PMEVTYPER6_EL0: return "PMEVTYPER6_EL0";
-  case PMEVTYPER7_EL0: return "PMEVTYPER7_EL0";
-  case PMEVTYPER8_EL0: return "PMEVTYPER8_EL0";
-  case PMEVTYPER9_EL0: return "PMEVTYPER9_EL0";
-  case PMEVTYPER10_EL0: return "PMEVTYPER10_EL0";
-  case PMEVTYPER11_EL0: return "PMEVTYPER11_EL0";
-  case PMEVTYPER12_EL0: return "PMEVTYPER12_EL0";
-  case PMEVTYPER13_EL0: return "PMEVTYPER13_EL0";
-  case PMEVTYPER14_EL0: return "PMEVTYPER14_EL0";
-  case PMEVTYPER15_EL0: return "PMEVTYPER15_EL0";
-  case PMEVTYPER16_EL0: return "PMEVTYPER16_EL0";
-  case PMEVTYPER17_EL0: return "PMEVTYPER17_EL0";
-  case PMEVTYPER18_EL0: return "PMEVTYPER18_EL0";
-  case PMEVTYPER19_EL0: return "PMEVTYPER19_EL0";
-  case PMEVTYPER20_EL0: return "PMEVTYPER20_EL0";
-  case PMEVTYPER21_EL0: return "PMEVTYPER21_EL0";
-  case PMEVTYPER22_EL0: return "PMEVTYPER22_EL0";
-  case PMEVTYPER23_EL0: return "PMEVTYPER23_EL0";
-  case PMEVTYPER24_EL0: return "PMEVTYPER24_EL0";
-  case PMEVTYPER25_EL0: return "PMEVTYPER25_EL0";
-  case PMEVTYPER26_EL0: return "PMEVTYPER26_EL0";
-  case PMEVTYPER27_EL0: return "PMEVTYPER27_EL0";
-  case PMEVTYPER28_EL0: return "PMEVTYPER28_EL0";
-  case PMEVTYPER29_EL0: return "PMEVTYPER29_EL0";
-  case PMEVTYPER30_EL0: return "PMEVTYPER30_EL0";
-  case PMCCFILTR_EL0: return "PMCCFILTR_EL0";
-  case RMR_EL3: return "RMR_EL3";
-  case RMR_EL2: return "RMR_EL2";
-  case RMR_EL1: return "RMR_EL1";
-  case CPM_IOACC_CTL_EL3: return "CPM_IOACC_CTL_EL3";
-  case MDCCSR_EL0: return "MDCCSR_EL0";
-  case MDCCINT_EL1: return "MDCCINT_EL1";
-  case DBGDTR_EL0: return "DBGDTR_EL0";
-  case DBGDTRRX_EL0: return "DBGDTRRX_EL0";
-  case DBGVCR32_EL2: return "DBGVCR32_EL2";
-  case OSDTRRX_EL1: return "OSDTRRX_EL1";
-  case MDSCR_EL1: return "MDSCR_EL1";
-  case OSDTRTX_EL1: return "OSDTRTX_EL1";
-  case OSECCR_EL11: return "OSECCR_EL11";
-  case DBGBVR0_EL1: return "DBGBVR0_EL1";
-  case DBGBVR1_EL1: return "DBGBVR1_EL1";
-  case DBGBVR2_EL1: return "DBGBVR2_EL1";
-  case DBGBVR3_EL1: return "DBGBVR3_EL1";
-  case DBGBVR4_EL1: return "DBGBVR4_EL1";
-  case DBGBVR5_EL1: return "DBGBVR5_EL1";
-  case DBGBVR6_EL1: return "DBGBVR6_EL1";
-  case DBGBVR7_EL1: return "DBGBVR7_EL1";
-  case DBGBVR8_EL1: return "DBGBVR8_EL1";
-  case DBGBVR9_EL1: return "DBGBVR9_EL1";
-  case DBGBVR10_EL1: return "DBGBVR10_EL1";
-  case DBGBVR11_EL1: return "DBGBVR11_EL1";
-  case DBGBVR12_EL1: return "DBGBVR12_EL1";
-  case DBGBVR13_EL1: return "DBGBVR13_EL1";
-  case DBGBVR14_EL1: return "DBGBVR14_EL1";
-  case DBGBVR15_EL1: return "DBGBVR15_EL1";
-  case DBGBCR0_EL1: return "DBGBCR0_EL1";
-  case DBGBCR1_EL1: return "DBGBCR1_EL1";
-  case DBGBCR2_EL1: return "DBGBCR2_EL1";
-  case DBGBCR3_EL1: return "DBGBCR3_EL1";
-  case DBGBCR4_EL1: return "DBGBCR4_EL1";
-  case DBGBCR5_EL1: return "DBGBCR5_EL1";
-  case DBGBCR6_EL1: return "DBGBCR6_EL1";
-  case DBGBCR7_EL1: return "DBGBCR7_EL1";
-  case DBGBCR8_EL1: return "DBGBCR8_EL1";
-  case DBGBCR9_EL1: return "DBGBCR9_EL1";
-  case DBGBCR10_EL1: return "DBGBCR10_EL1";
-  case DBGBCR11_EL1: return "DBGBCR11_EL1";
-  case DBGBCR12_EL1: return "DBGBCR12_EL1";
-  case DBGBCR13_EL1: return "DBGBCR13_EL1";
-  case DBGBCR14_EL1: return "DBGBCR14_EL1";
-  case DBGBCR15_EL1: return "DBGBCR15_EL1";
-  case DBGWVR0_EL1: return "DBGWVR0_EL1";
-  case DBGWVR1_EL1: return "DBGWVR1_EL1";
-  case DBGWVR2_EL1: return "DBGWVR2_EL1";
-  case DBGWVR3_EL1: return "DBGWVR3_EL1";
-  case DBGWVR4_EL1: return "DBGWVR4_EL1";
-  case DBGWVR5_EL1: return "DBGWVR5_EL1";
-  case DBGWVR6_EL1: return "DBGWVR6_EL1";
-  case DBGWVR7_EL1: return "DBGWVR7_EL1";
-  case DBGWVR8_EL1: return "DBGWVR8_EL1";
-  case DBGWVR9_EL1: return "DBGWVR9_EL1";
-  case DBGWVR10_EL1: return "DBGWVR10_EL1";
-  case DBGWVR11_EL1: return "DBGWVR11_EL1";
-  case DBGWVR12_EL1: return "DBGWVR12_EL1";
-  case DBGWVR13_EL1: return "DBGWVR13_EL1";
-  case DBGWVR14_EL1: return "DBGWVR14_EL1";
-  case DBGWVR15_EL1: return "DBGWVR15_EL1";
-  case DBGWCR0_EL1: return "DBGWCR0_EL1";
-  case DBGWCR1_EL1: return "DBGWCR1_EL1";
-  case DBGWCR2_EL1: return "DBGWCR2_EL1";
-  case DBGWCR3_EL1: return "DBGWCR3_EL1";
-  case DBGWCR4_EL1: return "DBGWCR4_EL1";
-  case DBGWCR5_EL1: return "DBGWCR5_EL1";
-  case DBGWCR6_EL1: return "DBGWCR6_EL1";
-  case DBGWCR7_EL1: return "DBGWCR7_EL1";
-  case DBGWCR8_EL1: return "DBGWCR8_EL1";
-  case DBGWCR9_EL1: return "DBGWCR9_EL1";
-  case DBGWCR10_EL1: return "DBGWCR10_EL1";
-  case DBGWCR11_EL1: return "DBGWCR11_EL1";
-  case DBGWCR12_EL1: return "DBGWCR12_EL1";
-  case DBGWCR13_EL1: return "DBGWCR13_EL1";
-  case DBGWCR14_EL1: return "DBGWCR14_EL1";
-  case DBGWCR15_EL1: return "DBGWCR15_EL1";
-  case MDRAR_EL1: return "MDRAR_EL1";
-  case OSLAR_EL1: return "OSLAR_EL1";
-  case OSLSR_EL1: return "OSLSR_EL1";
-  case OSDLR_EL1: return "OSDLR_EL1";
-  case DBGPRCR_EL1: return "DBGPRCR_EL1";
-  case DBGCLAIMSET_EL1: return "DBGCLAIMSET_EL1";
-  case DBGCLAIMCLR_EL1: return "DBGCLAIMCLR_EL1";
-  case DBGAUTHSTATUS_EL1: return "DBGAUTHSTATUS_EL1";
-  case DBGDEVID2: return "DBGDEVID2";
-  case DBGDEVID1: return "DBGDEVID1";
-  case DBGDEVID0: return "DBGDEVID0";
-  case ID_PFR0_EL1: return "ID_PFR0_EL1";
-  case ID_PFR1_EL1: return "ID_PFR1_EL1";
-  case ID_DFR0_EL1: return "ID_DFR0_EL1";
-  case ID_AFR0_EL1: return "ID_AFR0_EL1";
-  case ID_ISAR0_EL1: return "ID_ISAR0_EL1";
-  case ID_ISAR1_EL1: return "ID_ISAR1_EL1";
-  case ID_ISAR2_EL1: return "ID_ISAR2_EL1";
-  case ID_ISAR3_EL1: return "ID_ISAR3_EL1";
-  case ID_ISAR4_EL1: return "ID_ISAR4_EL1";
-  case ID_ISAR5_EL1: return "ID_ISAR5_EL1";
-  case AFSR1_EL1: return "AFSR1_EL1";
-  case AFSR0_EL1: return "AFSR0_EL1";
-  case REVIDR_EL1: return "REVIDR_EL1";
-  }
-}
-
-enum CPSRField {
-  InvalidCPSRField = 0xff,
-  cpsr_SPSel = 0x5,
-  cpsr_DAIFSet = 0x1e,
-  cpsr_DAIFClr = 0x1f
-};
-
-static inline const char *getCPSRFieldName(CPSRField Val) {
-  switch(Val) {
-  default: assert(0 && "Invalid system register value!");
-  case cpsr_SPSel: return "SPSel";
-  case cpsr_DAIFSet: return "DAIFSet";
-  case cpsr_DAIFClr: return "DAIFClr";
-  }
-}
-
-} // end namespace ARM64SYS
-
-namespace ARM64II {
-  /// Target Operand Flag enum.
-  enum TOF {
-    //===------------------------------------------------------------------===//
-    // ARM64 Specific MachineOperand flags.
-
-    MO_NO_FLAG,
-
-    MO_FRAGMENT = 0x7,
-
-    /// MO_PAGE - A symbol operand with this flag represents the pc-relative
-    /// offset of the 4K page containing the symbol.  This is used with the
-    /// ADRP instruction.
-    MO_PAGE = 1,
-
-    /// MO_PAGEOFF - A symbol operand with this flag represents the offset of
-    /// that symbol within a 4K page.  This offset is added to the page address
-    /// to produce the complete address.
-    MO_PAGEOFF = 2,
-
-    /// MO_G3 - A symbol operand with this flag (granule 3) represents the high
-    /// 16-bits of a 64-bit address, used in a MOVZ or MOVK instruction
-    MO_G3 = 3,
-
-    /// MO_G2 - A symbol operand with this flag (granule 2) represents the bits
-    /// 32-47 of a 64-bit address, used in a MOVZ or MOVK instruction
-    MO_G2 = 4,
-
-    /// MO_G1 - A symbol operand with this flag (granule 1) represents the bits
-    /// 16-31 of a 64-bit address, used in a MOVZ or MOVK instruction
-    MO_G1 = 5,
-
-    /// MO_G0 - A symbol operand with this flag (granule 0) represents the bits
-    /// 0-15 of a 64-bit address, used in a MOVZ or MOVK instruction
-    MO_G0 = 6,
-
-    /// MO_GOT - This flag indicates that a symbol operand represents the
-    /// address of the GOT entry for the symbol, rather than the address of
-    /// the symbol itself.
-    MO_GOT = 8,
-
-    /// MO_NC - Indicates whether the linker is expected to check the symbol
-    /// reference for overflow. For example in an ADRP/ADD pair of relocations
-    /// the ADRP usually does check, but not the ADD.
-    MO_NC = 0x10,
-
-    /// MO_TLS - Indicates that the operand being accessed is some kind of
-    /// thread-local symbol. On Darwin, only one type of thread-local access
-    /// exists (pre linker-relaxation), but on ELF the TLSModel used for the
-    /// referee will affect interpretation.
-    MO_TLS = 0x20
-  };
-} // end namespace ARM64II
-
-} // end namespace llvm
-
-#endif
diff --git a/lib/Target/ARM64/MCTargetDesc/ARM64ELFObjectWriter.cpp b/lib/Target/ARM64/MCTargetDesc/ARM64ELFObjectWriter.cpp
deleted file mode 100644
index 1a132a1..0000000
--- a/lib/Target/ARM64/MCTargetDesc/ARM64ELFObjectWriter.cpp
+++ /dev/null
@@ -1,237 +0,0 @@
-//===-- ARM64ELFObjectWriter.cpp - ARM64 ELF Writer -----------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file handles ELF-specific object emission, converting LLVM's internal
-// fixups into the appropriate relocations.
-//
-//===----------------------------------------------------------------------===//
-
-#include "MCTargetDesc/ARM64FixupKinds.h"
-#include "MCTargetDesc/ARM64MCExpr.h"
-#include "MCTargetDesc/ARM64MCTargetDesc.h"
-#include "llvm/MC/MCELFObjectWriter.h"
-#include "llvm/MC/MCValue.h"
-#include "llvm/Support/ErrorHandling.h"
-
-using namespace llvm;
-
-namespace {
-class ARM64ELFObjectWriter : public MCELFObjectTargetWriter {
-public:
-  ARM64ELFObjectWriter(uint8_t OSABI);
-
-  virtual ~ARM64ELFObjectWriter();
-
-protected:
-  unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup,
-                        bool IsPCRel) const override;
-
-private:
-};
-}
-
-ARM64ELFObjectWriter::ARM64ELFObjectWriter(uint8_t OSABI)
-    : MCELFObjectTargetWriter(/*Is64Bit*/ true, OSABI, ELF::EM_AARCH64,
-                              /*HasRelocationAddend*/ true) {}
-
-ARM64ELFObjectWriter::~ARM64ELFObjectWriter() {}
-
-unsigned ARM64ELFObjectWriter::GetRelocType(const MCValue &Target,
-                                            const MCFixup &Fixup,
-                                            bool IsPCRel) const {
-  ARM64MCExpr::VariantKind RefKind =
-      static_cast<ARM64MCExpr::VariantKind>(Target.getRefKind());
-  ARM64MCExpr::VariantKind SymLoc = ARM64MCExpr::getSymbolLoc(RefKind);
-  bool IsNC = ARM64MCExpr::isNotChecked(RefKind);
-
-  assert((!Target.getSymA() ||
-          Target.getSymA()->getKind() == MCSymbolRefExpr::VK_None) &&
-         "Should only be expression-level modifiers here");
-
-  assert((!Target.getSymB() ||
-          Target.getSymB()->getKind() == MCSymbolRefExpr::VK_None) &&
-         "Should only be expression-level modifiers here");
-
-  if (IsPCRel) {
-    switch ((unsigned)Fixup.getKind()) {
-    case FK_Data_2:
-      return ELF::R_AARCH64_PREL16;
-    case FK_Data_4:
-      return ELF::R_AARCH64_PREL32;
-    case FK_Data_8:
-      return ELF::R_AARCH64_PREL64;
-    case ARM64::fixup_arm64_pcrel_adr_imm21:
-      llvm_unreachable("No ELF relocations supported for ADR at the moment");
-    case ARM64::fixup_arm64_pcrel_adrp_imm21:
-      if (SymLoc == ARM64MCExpr::VK_ABS && !IsNC)
-        return ELF::R_AARCH64_ADR_PREL_PG_HI21;
-      if (SymLoc == ARM64MCExpr::VK_GOT && !IsNC)
-        return ELF::R_AARCH64_ADR_GOT_PAGE;
-      if (SymLoc == ARM64MCExpr::VK_GOTTPREL && !IsNC)
-        return ELF::R_AARCH64_TLSIE_ADR_GOTTPREL_PAGE21;
-      if (SymLoc == ARM64MCExpr::VK_TLSDESC && !IsNC)
-        return ELF::R_AARCH64_TLSDESC_ADR_PAGE;
-      llvm_unreachable("invalid symbol kind for ADRP relocation");
-    case ARM64::fixup_arm64_pcrel_branch26:
-      return ELF::R_AARCH64_JUMP26;
-    case ARM64::fixup_arm64_pcrel_call26:
-      return ELF::R_AARCH64_CALL26;
-    case ARM64::fixup_arm64_pcrel_imm19:
-      return ELF::R_AARCH64_TLSIE_LD_GOTTPREL_PREL19;
-    default:
-      llvm_unreachable("Unsupported pc-relative fixup kind");
-    }
-  } else {
-    switch ((unsigned)Fixup.getKind()) {
-    case FK_Data_2:
-      return ELF::R_AARCH64_ABS16;
-    case FK_Data_4:
-      return ELF::R_AARCH64_ABS32;
-    case FK_Data_8:
-      return ELF::R_AARCH64_ABS64;
-    case ARM64::fixup_arm64_add_imm12:
-      if (SymLoc == ARM64MCExpr::VK_DTPREL && IsNC)
-        return ELF::R_AARCH64_TLSLD_ADD_DTPREL_LO12_NC;
-      if (SymLoc == ARM64MCExpr::VK_DTPREL && !IsNC)
-        return ELF::R_AARCH64_TLSLD_ADD_DTPREL_LO12;
-      if (SymLoc == ARM64MCExpr::VK_TPREL && IsNC)
-        return ELF::R_AARCH64_TLSLE_ADD_TPREL_LO12_NC;
-      if (SymLoc == ARM64MCExpr::VK_TPREL && !IsNC)
-        return ELF::R_AARCH64_TLSLE_ADD_TPREL_LO12;
-      if (SymLoc == ARM64MCExpr::VK_TLSDESC && IsNC)
-        return ELF::R_AARCH64_TLSDESC_ADD_LO12_NC;
-      if (SymLoc == ARM64MCExpr::VK_ABS && IsNC)
-        return ELF::R_AARCH64_ADD_ABS_LO12_NC;
-
-      report_fatal_error("invalid fixup for add (uimm12) instruction");
-      return 0;
-    case ARM64::fixup_arm64_ldst_imm12_scale1:
-      if (SymLoc == ARM64MCExpr::VK_ABS && IsNC)
-        return ELF::R_AARCH64_LDST8_ABS_LO12_NC;
-      if (SymLoc == ARM64MCExpr::VK_DTPREL && !IsNC)
-        return ELF::R_AARCH64_TLSLD_LDST8_DTPREL_LO12;
-      if (SymLoc == ARM64MCExpr::VK_DTPREL && IsNC)
-        return ELF::R_AARCH64_TLSLD_LDST8_DTPREL_LO12_NC;
-      if (SymLoc == ARM64MCExpr::VK_TPREL && !IsNC)
-        return ELF::R_AARCH64_TLSLE_LDST8_TPREL_LO12;
-      if (SymLoc == ARM64MCExpr::VK_TPREL && IsNC)
-        return ELF::R_AARCH64_TLSLE_LDST8_TPREL_LO12_NC;
-
-      report_fatal_error("invalid fixup for 8-bit load/store instruction");
-      return 0;
-    case ARM64::fixup_arm64_ldst_imm12_scale2:
-      if (SymLoc == ARM64MCExpr::VK_ABS && IsNC)
-        return ELF::R_AARCH64_LDST16_ABS_LO12_NC;
-      if (SymLoc == ARM64MCExpr::VK_DTPREL && !IsNC)
-        return ELF::R_AARCH64_TLSLD_LDST16_DTPREL_LO12;
-      if (SymLoc == ARM64MCExpr::VK_DTPREL && IsNC)
-        return ELF::R_AARCH64_TLSLD_LDST16_DTPREL_LO12_NC;
-      if (SymLoc == ARM64MCExpr::VK_TPREL && !IsNC)
-        return ELF::R_AARCH64_TLSLE_LDST16_TPREL_LO12;
-      if (SymLoc == ARM64MCExpr::VK_TPREL && IsNC)
-        return ELF::R_AARCH64_TLSLE_LDST16_TPREL_LO12_NC;
-
-      report_fatal_error("invalid fixup for 16-bit load/store instruction");
-      return 0;
-    case ARM64::fixup_arm64_ldst_imm12_scale4:
-      if (SymLoc == ARM64MCExpr::VK_ABS && IsNC)
-        return ELF::R_AARCH64_LDST32_ABS_LO12_NC;
-      if (SymLoc == ARM64MCExpr::VK_DTPREL && !IsNC)
-        return ELF::R_AARCH64_TLSLD_LDST32_DTPREL_LO12;
-      if (SymLoc == ARM64MCExpr::VK_DTPREL && IsNC)
-        return ELF::R_AARCH64_TLSLD_LDST32_DTPREL_LO12_NC;
-      if (SymLoc == ARM64MCExpr::VK_TPREL && !IsNC)
-        return ELF::R_AARCH64_TLSLE_LDST32_TPREL_LO12;
-      if (SymLoc == ARM64MCExpr::VK_TPREL && IsNC)
-        return ELF::R_AARCH64_TLSLE_LDST32_TPREL_LO12_NC;
-
-      report_fatal_error("invalid fixup for 32-bit load/store instruction");
-      return 0;
-    case ARM64::fixup_arm64_ldst_imm12_scale8:
-      if (SymLoc == ARM64MCExpr::VK_ABS && IsNC)
-        return ELF::R_AARCH64_LDST64_ABS_LO12_NC;
-      if (SymLoc == ARM64MCExpr::VK_GOT && IsNC)
-        return ELF::R_AARCH64_LD64_GOT_LO12_NC;
-      if (SymLoc == ARM64MCExpr::VK_DTPREL && !IsNC)
-        return ELF::R_AARCH64_TLSLD_LDST64_DTPREL_LO12;
-      if (SymLoc == ARM64MCExpr::VK_DTPREL && IsNC)
-        return ELF::R_AARCH64_TLSLD_LDST64_DTPREL_LO12_NC;
-      if (SymLoc == ARM64MCExpr::VK_TPREL && !IsNC)
-        return ELF::R_AARCH64_TLSLE_LDST64_TPREL_LO12;
-      if (SymLoc == ARM64MCExpr::VK_TPREL && IsNC)
-        return ELF::R_AARCH64_TLSLE_LDST64_TPREL_LO12_NC;
-      if (SymLoc == ARM64MCExpr::VK_GOTTPREL && IsNC)
-        return ELF::R_AARCH64_TLSIE_LD64_GOTTPREL_LO12_NC;
-      if (SymLoc == ARM64MCExpr::VK_TLSDESC && IsNC)
-        return ELF::R_AARCH64_TLSDESC_LD64_LO12_NC;
-
-      report_fatal_error("invalid fixup for 64-bit load/store instruction");
-      return 0;
-    case ARM64::fixup_arm64_ldst_imm12_scale16:
-      if (SymLoc == ARM64MCExpr::VK_ABS && IsNC)
-        return ELF::R_AARCH64_LDST128_ABS_LO12_NC;
-
-      report_fatal_error("invalid fixup for 128-bit load/store instruction");
-      return 0;
-    case ARM64::fixup_arm64_movw:
-      if (RefKind == ARM64MCExpr::VK_ABS_G3)
-        return ELF::R_AARCH64_MOVW_UABS_G3;
-      if (RefKind == ARM64MCExpr::VK_ABS_G2)
-        return ELF::R_AARCH64_MOVW_UABS_G2;
-      if (RefKind == ARM64MCExpr::VK_ABS_G2_NC)
-        return ELF::R_AARCH64_MOVW_UABS_G2_NC;
-      if (RefKind == ARM64MCExpr::VK_ABS_G1)
-        return ELF::R_AARCH64_MOVW_UABS_G1;
-      if (RefKind == ARM64MCExpr::VK_ABS_G1_NC)
-        return ELF::R_AARCH64_MOVW_UABS_G1_NC;
-      if (RefKind == ARM64MCExpr::VK_ABS_G0)
-        return ELF::R_AARCH64_MOVW_UABS_G0;
-      if (RefKind == ARM64MCExpr::VK_ABS_G0_NC)
-        return ELF::R_AARCH64_MOVW_UABS_G0_NC;
-      if (RefKind == ARM64MCExpr::VK_DTPREL_G2)
-        return ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G2;
-      if (RefKind == ARM64MCExpr::VK_DTPREL_G1)
-        return ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G1;
-      if (RefKind == ARM64MCExpr::VK_DTPREL_G1_NC)
-        return ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G1_NC;
-      if (RefKind == ARM64MCExpr::VK_DTPREL_G0)
-        return ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G0;
-      if (RefKind == ARM64MCExpr::VK_DTPREL_G0_NC)
-        return ELF::R_AARCH64_TLSLD_MOVW_DTPREL_G0_NC;
-      if (RefKind == ARM64MCExpr::VK_TPREL_G2)
-        return ELF::R_AARCH64_TLSLE_MOVW_TPREL_G2;
-      if (RefKind == ARM64MCExpr::VK_TPREL_G1)
-        return ELF::R_AARCH64_TLSLE_MOVW_TPREL_G1;
-      if (RefKind == ARM64MCExpr::VK_TPREL_G1_NC)
-        return ELF::R_AARCH64_TLSLE_MOVW_TPREL_G1_NC;
-      if (RefKind == ARM64MCExpr::VK_TPREL_G0)
-        return ELF::R_AARCH64_TLSLE_MOVW_TPREL_G0;
-      if (RefKind == ARM64MCExpr::VK_TPREL_G0_NC)
-        return ELF::R_AARCH64_TLSLE_MOVW_TPREL_G0_NC;
-      if (RefKind == ARM64MCExpr::VK_GOTTPREL_G1)
-        return ELF::R_AARCH64_TLSIE_MOVW_GOTTPREL_G1;
-      if (RefKind == ARM64MCExpr::VK_GOTTPREL_G0_NC)
-        return ELF::R_AARCH64_TLSIE_MOVW_GOTTPREL_G0_NC;
-      report_fatal_error("invalid fixup for movz/movk instruction");
-      return 0;
-    case ARM64::fixup_arm64_tlsdesc_call:
-      return ELF::R_AARCH64_TLSDESC_CALL;
-    default:
-      llvm_unreachable("Unknown ELF relocation type");
-    }
-  }
-
-  llvm_unreachable("Unimplemented fixup -> relocation");
-}
-
-MCObjectWriter *llvm::createARM64ELFObjectWriter(raw_ostream &OS,
-                                                 uint8_t OSABI) {
-  MCELFObjectTargetWriter *MOTW = new ARM64ELFObjectWriter(OSABI);
-  return createELFObjectWriter(MOTW, OS, /*IsLittleEndian=*/true);
-}
diff --git a/lib/Target/ARM64/MCTargetDesc/ARM64ELFStreamer.cpp b/lib/Target/ARM64/MCTargetDesc/ARM64ELFStreamer.cpp
deleted file mode 100644
index 97a3493..0000000
--- a/lib/Target/ARM64/MCTargetDesc/ARM64ELFStreamer.cpp
+++ /dev/null
@@ -1,158 +0,0 @@
-//===- lib/MC/ARM64ELFStreamer.cpp - ELF Object Output for ARM64 ----------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file assembles .s files and emits AArch64 ELF .o object files. Different
-// from generic ELF streamer in emitting mapping symbols ($x and $d) to delimit
-// regions of data and code.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/MC/MCELFStreamer.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/Twine.h"
-#include "llvm/MC/MCAsmBackend.h"
-#include "llvm/MC/MCAssembler.h"
-#include "llvm/MC/MCCodeEmitter.h"
-#include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCELF.h"
-#include "llvm/MC/MCELFStreamer.h"
-#include "llvm/MC/MCELFSymbolFlags.h"
-#include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCInst.h"
-#include "llvm/MC/MCObjectStreamer.h"
-#include "llvm/MC/MCSection.h"
-#include "llvm/MC/MCSectionELF.h"
-#include "llvm/MC/MCStreamer.h"
-#include "llvm/MC/MCSymbol.h"
-#include "llvm/MC/MCValue.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ELF.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/raw_ostream.h"
-
-using namespace llvm;
-
-namespace {
-
-/// Extend the generic ELFStreamer class so that it can emit mapping symbols at
-/// the appropriate points in the object files. These symbols are defined in the
-/// AArch64 ELF ABI:
-///    infocenter.arm.com/help/topic/com.arm.doc.ihi0056a/IHI0056A_aaelf64.pdf
-///
-/// In brief: $x or $d should be emitted at the start of each contiguous region
-/// of A64 code or data in a section. In practice, this emission does not rely
-/// on explicit assembler directives but on inherent properties of the
-/// directives doing the emission (e.g. ".byte" is data, "add x0, x0, x0" an
-/// instruction).
-///
-/// As a result this system is orthogonal to the DataRegion infrastructure used
-/// by MachO. Beware!
-class ARM64ELFStreamer : public MCELFStreamer {
-public:
-  ARM64ELFStreamer(MCContext &Context, MCAsmBackend &TAB, raw_ostream &OS,
-                   MCCodeEmitter *Emitter)
-      : MCELFStreamer(Context, TAB, OS, Emitter), MappingSymbolCounter(0),
-        LastEMS(EMS_None) {}
-
-  ~ARM64ELFStreamer() {}
-
-  virtual void ChangeSection(const MCSection *Section,
-                             const MCExpr *Subsection) {
-    // We have to keep track of the mapping symbol state of any sections we
-    // use. Each one should start off as EMS_None, which is provided as the
-    // default constructor by DenseMap::lookup.
-    LastMappingSymbols[getPreviousSection().first] = LastEMS;
-    LastEMS = LastMappingSymbols.lookup(Section);
-
-    MCELFStreamer::ChangeSection(Section, Subsection);
-  }
-
-  /// This function is the one used to emit instruction data into the ELF
-  /// streamer. We override it to add the appropriate mapping symbol if
-  /// necessary.
-  virtual void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI) {
-    EmitA64MappingSymbol();
-    MCELFStreamer::EmitInstruction(Inst, STI);
-  }
-
-  /// This is one of the functions used to emit data into an ELF section, so the
-  /// ARM64 streamer overrides it to add the appropriate mapping symbol ($d)
-  /// if necessary.
-  virtual void EmitBytes(StringRef Data) {
-    EmitDataMappingSymbol();
-    MCELFStreamer::EmitBytes(Data);
-  }
-
-  /// This is one of the functions used to emit data into an ELF section, so the
-  /// ARM64 streamer overrides it to add the appropriate mapping symbol ($d)
-  /// if necessary.
-  virtual void EmitValueImpl(const MCExpr *Value, unsigned Size) {
-    EmitDataMappingSymbol();
-    MCELFStreamer::EmitValueImpl(Value, Size);
-  }
-
-private:
-  enum ElfMappingSymbol {
-    EMS_None,
-    EMS_A64,
-    EMS_Data
-  };
-
-  void EmitDataMappingSymbol() {
-    if (LastEMS == EMS_Data)
-      return;
-    EmitMappingSymbol("$d");
-    LastEMS = EMS_Data;
-  }
-
-  void EmitA64MappingSymbol() {
-    if (LastEMS == EMS_A64)
-      return;
-    EmitMappingSymbol("$x");
-    LastEMS = EMS_A64;
-  }
-
-  void EmitMappingSymbol(StringRef Name) {
-    MCSymbol *Start = getContext().CreateTempSymbol();
-    EmitLabel(Start);
-
-    MCSymbol *Symbol = getContext().GetOrCreateSymbol(
-        Name + "." + Twine(MappingSymbolCounter++));
-
-    MCSymbolData &SD = getAssembler().getOrCreateSymbolData(*Symbol);
-    MCELF::SetType(SD, ELF::STT_NOTYPE);
-    MCELF::SetBinding(SD, ELF::STB_LOCAL);
-    SD.setExternal(false);
-    Symbol->setSection(*getCurrentSection().first);
-
-    const MCExpr *Value = MCSymbolRefExpr::Create(Start, getContext());
-    Symbol->setVariableValue(Value);
-  }
-
-  int64_t MappingSymbolCounter;
-
-  DenseMap<const MCSection *, ElfMappingSymbol> LastMappingSymbols;
-  ElfMappingSymbol LastEMS;
-
-  /// @}
-};
-}
-
-namespace llvm {
-MCELFStreamer *createARM64ELFStreamer(MCContext &Context, MCAsmBackend &TAB,
-                                      raw_ostream &OS, MCCodeEmitter *Emitter,
-                                      bool RelaxAll, bool NoExecStack) {
-  ARM64ELFStreamer *S = new ARM64ELFStreamer(Context, TAB, OS, Emitter);
-  if (RelaxAll)
-    S->getAssembler().setRelaxAll(true);
-  if (NoExecStack)
-    S->getAssembler().setNoExecStack(true);
-  return S;
-}
-}
diff --git a/lib/Target/ARM64/MCTargetDesc/ARM64ELFStreamer.h b/lib/Target/ARM64/MCTargetDesc/ARM64ELFStreamer.h
deleted file mode 100644
index 72dadbc..0000000
--- a/lib/Target/ARM64/MCTargetDesc/ARM64ELFStreamer.h
+++ /dev/null
@@ -1,26 +0,0 @@
-//===-- ARM64ELFStreamer.h - ELF Streamer for ARM64 -------------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements ELF streamer information for the ARM64 backend.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_AARCH64_ELF_STREAMER_H
-#define LLVM_AARCH64_ELF_STREAMER_H
-
-#include "llvm/MC/MCELFStreamer.h"
-
-namespace llvm {
-
-MCELFStreamer *createARM64ELFStreamer(MCContext &Context, MCAsmBackend &TAB,
-                                      raw_ostream &OS, MCCodeEmitter *Emitter,
-                                      bool RelaxAll, bool NoExecStack);
-}
-
-#endif // ARM64_ELF_STREAMER_H
diff --git a/lib/Target/ARM64/MCTargetDesc/ARM64FixupKinds.h b/lib/Target/ARM64/MCTargetDesc/ARM64FixupKinds.h
deleted file mode 100644
index 02eb91f..0000000
--- a/lib/Target/ARM64/MCTargetDesc/ARM64FixupKinds.h
+++ /dev/null
@@ -1,72 +0,0 @@
-//===-- ARM64FixupKinds.h - ARM64 Specific Fixup Entries --------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_ARM64FIXUPKINDS_H
-#define LLVM_ARM64FIXUPKINDS_H
-
-#include "llvm/MC/MCFixup.h"
-
-namespace llvm {
-namespace ARM64 {
-
-enum Fixups {
-  // fixup_arm64_pcrel_adr_imm21 - A 21-bit pc-relative immediate inserted into
-  // an ADR instruction.
-  fixup_arm64_pcrel_adr_imm21 = FirstTargetFixupKind,
-
-  // fixup_arm64_pcrel_adrp_imm21 - A 21-bit pc-relative immediate inserted into
-  // an ADRP instruction.
-  fixup_arm64_pcrel_adrp_imm21,
-
-  // fixup_arm64_imm12 - 12-bit fixup for add/sub instructions.
-  //     No alignment adjustment. All value bits are encoded.
-  fixup_arm64_add_imm12,
-
-  // fixup_arm64_ldst_imm12_* - unsigned 12-bit fixups for load and
-  // store instructions.
-  fixup_arm64_ldst_imm12_scale1,
-  fixup_arm64_ldst_imm12_scale2,
-  fixup_arm64_ldst_imm12_scale4,
-  fixup_arm64_ldst_imm12_scale8,
-  fixup_arm64_ldst_imm12_scale16,
-
-  // FIXME: comment
-  fixup_arm64_movw,
-
-  // fixup_arm64_pcrel_imm14 - The high 14 bits of a 21-bit pc-relative
-  // immediate.
-  fixup_arm64_pcrel_branch14,
-
-  // fixup_arm64_pcrel_imm19 - The high 19 bits of a 21-bit pc-relative
-  // immediate. Same encoding as fixup_arm64_pcrel_adrhi, except this
-  // is not used as part of a lo/hi pair and thus generates relocations
-  // directly when necessary.
-  fixup_arm64_pcrel_imm19,
-
-  // fixup_arm64_pcrel_branch26 - The high 26 bits of a 28-bit pc-relative
-  // immediate.
-  fixup_arm64_pcrel_branch26,
-
-  // fixup_arm64_pcrel_call26 - The high 26 bits of a 28-bit pc-relative
-  // immediate. Distinguished from branch26 only on ELF.
-  fixup_arm64_pcrel_call26,
-
-  // fixup_arm64_tlsdesc_call - zero-space placeholder for the ELF
-  // R_AARCH64_TLSDESC_CALL relocation.
-  fixup_arm64_tlsdesc_call,
-
-  // Marker
-  LastTargetFixupKind,
-  NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind
-};
-
-} // end namespace ARM64
-} // end namespace llvm
-
-#endif
diff --git a/lib/Target/ARM64/MCTargetDesc/ARM64MCAsmInfo.cpp b/lib/Target/ARM64/MCTargetDesc/ARM64MCAsmInfo.cpp
deleted file mode 100644
index 97e0d3c..0000000
--- a/lib/Target/ARM64/MCTargetDesc/ARM64MCAsmInfo.cpp
+++ /dev/null
@@ -1,92 +0,0 @@
-//===-- ARM64MCAsmInfo.cpp - ARM64 asm properties -----------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains the declarations of the ARM64MCAsmInfo properties.
-//
-//===----------------------------------------------------------------------===//
-
-#include "ARM64MCAsmInfo.h"
-#include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCStreamer.h"
-#include "llvm/Support/CommandLine.h"
-using namespace llvm;
-
-enum AsmWriterVariantTy {
-  Default = -1,
-  Generic = 0,
-  Apple = 1
-};
-
-static cl::opt<AsmWriterVariantTy> AsmWriterVariant(
-    "arm64-neon-syntax", cl::init(Default),
-    cl::desc("Choose style of NEON code to emit from ARM64 backend:"),
-    cl::values(clEnumValN(Generic, "generic", "Emit generic NEON assembly"),
-               clEnumValN(Apple, "apple", "Emit Apple-style NEON assembly"),
-               clEnumValEnd));
-
-ARM64MCAsmInfoDarwin::ARM64MCAsmInfoDarwin() {
-  // We prefer NEON instructions to be printed in the short form.
-  AssemblerDialect = AsmWriterVariant == Default ? 1 : AsmWriterVariant;
-
-  PrivateGlobalPrefix = "L";
-  SeparatorString = "%%";
-  CommentString = ";";
-  PointerSize = CalleeSaveStackSlotSize = 8;
-
-  AlignmentIsInBytes = false;
-  UsesELFSectionDirectiveForBSS = true;
-  SupportsDebugInformation = true;
-  UseDataRegionDirectives = true;
-
-  ExceptionsType = ExceptionHandling::DwarfCFI;
-}
-
-const MCExpr *ARM64MCAsmInfoDarwin::getExprForPersonalitySymbol(
-    const MCSymbol *Sym, unsigned Encoding, MCStreamer &Streamer) const {
-  // On Darwin, we can reference dwarf symbols with foo@GOT-., which
-  // is an indirect pc-relative reference. The default implementation
-  // won't reference using the GOT, so we need this target-specific
-  // version.
-  MCContext &Context = Streamer.getContext();
-  const MCExpr *Res =
-      MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_GOT, Context);
-  MCSymbol *PCSym = Context.CreateTempSymbol();
-  Streamer.EmitLabel(PCSym);
-  const MCExpr *PC = MCSymbolRefExpr::Create(PCSym, Context);
-  return MCBinaryExpr::CreateSub(Res, PC, Context);
-}
-
-ARM64MCAsmInfoELF::ARM64MCAsmInfoELF() {
-  // We prefer NEON instructions to be printed in the short form.
-  AssemblerDialect = AsmWriterVariant == Default ? 0 : AsmWriterVariant;
-
-  PointerSize = 8;
-
-  // ".comm align is in bytes but .align is pow-2."
-  AlignmentIsInBytes = false;
-
-  CommentString = "//";
-  PrivateGlobalPrefix = ".L";
-  Code32Directive = ".code\t32";
-
-  Data16bitsDirective = "\t.hword\t";
-  Data32bitsDirective = "\t.word\t";
-  Data64bitsDirective = "\t.xword\t";
-
-  UseDataRegionDirectives = false;
-
-  WeakRefDirective = "\t.weak\t";
-
-  HasLEB128 = true;
-  SupportsDebugInformation = true;
-
-  // Exceptions handling
-  ExceptionsType = ExceptionHandling::DwarfCFI;
-}
diff --git a/lib/Target/ARM64/MCTargetDesc/ARM64MCAsmInfo.h b/lib/Target/ARM64/MCTargetDesc/ARM64MCAsmInfo.h
deleted file mode 100644
index f2d33a7..0000000
--- a/lib/Target/ARM64/MCTargetDesc/ARM64MCAsmInfo.h
+++ /dev/null
@@ -1,36 +0,0 @@
-//=====-- ARM64MCAsmInfo.h - ARM64 asm properties -----------*- C++ -*--====//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains the declaration of the ARM64MCAsmInfo class.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef ARM64TARGETASMINFO_H
-#define ARM64TARGETASMINFO_H
-
-#include "llvm/MC/MCAsmInfoDarwin.h"
-
-namespace llvm {
-class Target;
-class StringRef;
-class MCStreamer;
-struct ARM64MCAsmInfoDarwin : public MCAsmInfoDarwin {
-  explicit ARM64MCAsmInfoDarwin();
-  virtual const MCExpr *getExprForPersonalitySymbol(const MCSymbol *Sym,
-                                                    unsigned Encoding,
-                                                    MCStreamer &Streamer) const;
-};
-
-struct ARM64MCAsmInfoELF : public MCAsmInfo {
-  explicit ARM64MCAsmInfoELF();
-};
-
-} // namespace llvm
-
-#endif
diff --git a/lib/Target/ARM64/MCTargetDesc/ARM64MCCodeEmitter.cpp b/lib/Target/ARM64/MCTargetDesc/ARM64MCCodeEmitter.cpp
deleted file mode 100644
index 19559f8..0000000
--- a/lib/Target/ARM64/MCTargetDesc/ARM64MCCodeEmitter.cpp
+++ /dev/null
@@ -1,563 +0,0 @@
-//===-- ARM64/ARM64MCCodeEmitter.cpp - Convert ARM64 code to machine code -===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the ARM64MCCodeEmitter class.
-//
-//===----------------------------------------------------------------------===//
-
-#define DEBUG_TYPE "mccodeemitter"
-#include "MCTargetDesc/ARM64AddressingModes.h"
-#include "MCTargetDesc/ARM64BaseInfo.h"
-#include "MCTargetDesc/ARM64FixupKinds.h"
-#include "MCTargetDesc/ARM64MCExpr.h"
-#include "llvm/MC/MCCodeEmitter.h"
-#include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCInst.h"
-#include "llvm/MC/MCInstrInfo.h"
-#include "llvm/MC/MCRegisterInfo.h"
-#include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Support/raw_ostream.h"
-using namespace llvm;
-
-STATISTIC(MCNumEmitted, "Number of MC instructions emitted.");
-STATISTIC(MCNumFixups, "Number of MC fixups created.");
-
-namespace {
-
-class ARM64MCCodeEmitter : public MCCodeEmitter {
-  MCContext &Ctx;
-
-  ARM64MCCodeEmitter(const ARM64MCCodeEmitter &); // DO NOT IMPLEMENT
-  void operator=(const ARM64MCCodeEmitter &);     // DO NOT IMPLEMENT
-public:
-  ARM64MCCodeEmitter(const MCInstrInfo &mcii, const MCSubtargetInfo &sti,
-                     MCContext &ctx)
-      : Ctx(ctx) {}
-
-  ~ARM64MCCodeEmitter() {}
-
-  // getBinaryCodeForInstr - TableGen'erated function for getting the
-  // binary encoding for an instruction.
-  uint64_t getBinaryCodeForInstr(const MCInst &MI,
-                                 SmallVectorImpl<MCFixup> &Fixups,
-                                 const MCSubtargetInfo &STI) const;
-
-  /// getMachineOpValue - Return binary encoding of operand. If the machine
-  /// operand requires relocation, record the relocation and return zero.
-  unsigned getMachineOpValue(const MCInst &MI, const MCOperand &MO,
-                             SmallVectorImpl<MCFixup> &Fixups,
-                             const MCSubtargetInfo &STI) const;
-
-  /// getAMIndexed8OpValue - Return encoding info for base register
-  /// and 12-bit unsigned immediate attached to a load, store or prfm
-  /// instruction. If operand requires a relocation, record it and
-  /// return zero in that part of the encoding.
-  template <uint32_t FixupKind>
-  uint32_t getAMIndexed8OpValue(const MCInst &MI, unsigned OpIdx,
-                                SmallVectorImpl<MCFixup> &Fixups,
-                                const MCSubtargetInfo &STI) const;
-
-  /// getAdrLabelOpValue - Return encoding info for 21-bit immediate ADR label
-  /// target.
-  uint32_t getAdrLabelOpValue(const MCInst &MI, unsigned OpIdx,
-                              SmallVectorImpl<MCFixup> &Fixups,
-                              const MCSubtargetInfo &STI) const;
-
-  /// getAddSubImmOpValue - Return encoding for the 12-bit immediate value and
-  /// the 2-bit shift field.
-  uint32_t getAddSubImmOpValue(const MCInst &MI, unsigned OpIdx,
-                               SmallVectorImpl<MCFixup> &Fixups,
-                               const MCSubtargetInfo &STI) const;
-
-  /// getCondBranchTargetOpValue - Return the encoded value for a conditional
-  /// branch target.
-  uint32_t getCondBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
-                                      SmallVectorImpl<MCFixup> &Fixups,
-                                      const MCSubtargetInfo &STI) const;
-
-  /// getTestBranchTargetOpValue - Return the encoded value for a test-bit-and-
-  /// branch target.
-  uint32_t getTestBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
-                                      SmallVectorImpl<MCFixup> &Fixups,
-                                      const MCSubtargetInfo &STI) const;
-
-  /// getBranchTargetOpValue - Return the encoded value for an unconditional
-  /// branch target.
-  uint32_t getBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
-                                  SmallVectorImpl<MCFixup> &Fixups,
-                                  const MCSubtargetInfo &STI) const;
-
-  /// getMoveWideImmOpValue - Return the encoded value for the immediate operand
-  /// of a MOVZ or MOVK instruction.
-  uint32_t getMoveWideImmOpValue(const MCInst &MI, unsigned OpIdx,
-                                 SmallVectorImpl<MCFixup> &Fixups,
-                                 const MCSubtargetInfo &STI) const;
-
-  /// getVecShifterOpValue - Return the encoded value for the vector shifter.
-  uint32_t getVecShifterOpValue(const MCInst &MI, unsigned OpIdx,
-                                SmallVectorImpl<MCFixup> &Fixups,
-                                const MCSubtargetInfo &STI) const;
-
-  /// getMoveVecShifterOpValue - Return the encoded value for the vector move
-  /// shifter (MSL).
-  uint32_t getMoveVecShifterOpValue(const MCInst &MI, unsigned OpIdx,
-                                    SmallVectorImpl<MCFixup> &Fixups,
-                                    const MCSubtargetInfo &STI) const;
-
-  /// getFixedPointScaleOpValue - Return the encoded value for the
-  // FP-to-fixed-point scale factor.
-  uint32_t getFixedPointScaleOpValue(const MCInst &MI, unsigned OpIdx,
-                                     SmallVectorImpl<MCFixup> &Fixups,
-                                     const MCSubtargetInfo &STI) const;
-
-  uint32_t getVecShiftR64OpValue(const MCInst &MI, unsigned OpIdx,
-                                 SmallVectorImpl<MCFixup> &Fixups,
-                                 const MCSubtargetInfo &STI) const;
-  uint32_t getVecShiftR32OpValue(const MCInst &MI, unsigned OpIdx,
-                                 SmallVectorImpl<MCFixup> &Fixups,
-                                 const MCSubtargetInfo &STI) const;
-  uint32_t getVecShiftR16OpValue(const MCInst &MI, unsigned OpIdx,
-                                 SmallVectorImpl<MCFixup> &Fixups,
-                                 const MCSubtargetInfo &STI) const;
-  uint32_t getVecShiftR8OpValue(const MCInst &MI, unsigned OpIdx,
-                                SmallVectorImpl<MCFixup> &Fixups,
-                                const MCSubtargetInfo &STI) const;
-  uint32_t getVecShiftL64OpValue(const MCInst &MI, unsigned OpIdx,
-                                 SmallVectorImpl<MCFixup> &Fixups,
-                                 const MCSubtargetInfo &STI) const;
-  uint32_t getVecShiftL32OpValue(const MCInst &MI, unsigned OpIdx,
-                                 SmallVectorImpl<MCFixup> &Fixups,
-                                 const MCSubtargetInfo &STI) const;
-  uint32_t getVecShiftL16OpValue(const MCInst &MI, unsigned OpIdx,
-                                 SmallVectorImpl<MCFixup> &Fixups,
-                                 const MCSubtargetInfo &STI) const;
-  uint32_t getVecShiftL8OpValue(const MCInst &MI, unsigned OpIdx,
-                                SmallVectorImpl<MCFixup> &Fixups,
-                                const MCSubtargetInfo &STI) const;
-
-  /// getSIMDShift64OpValue - Return the encoded value for the
-  // shift-by-immediate AdvSIMD instructions.
-  uint32_t getSIMDShift64OpValue(const MCInst &MI, unsigned OpIdx,
-                                 SmallVectorImpl<MCFixup> &Fixups,
-                                 const MCSubtargetInfo &STI) const;
-
-  uint32_t getSIMDShift64_32OpValue(const MCInst &MI, unsigned OpIdx,
-                                    SmallVectorImpl<MCFixup> &Fixups,
-                                    const MCSubtargetInfo &STI) const;
-
-  uint32_t getSIMDShift32OpValue(const MCInst &MI, unsigned OpIdx,
-                                 SmallVectorImpl<MCFixup> &Fixups,
-                                 const MCSubtargetInfo &STI) const;
-
-  uint32_t getSIMDShift16OpValue(const MCInst &MI, unsigned OpIdx,
-                                 SmallVectorImpl<MCFixup> &Fixups,
-                                 const MCSubtargetInfo &STI) const;
-
-  unsigned fixMOVZ(const MCInst &MI, unsigned EncodedValue,
-                   const MCSubtargetInfo &STI) const;
-
-  void EmitByte(unsigned char C, raw_ostream &OS) const { OS << (char)C; }
-
-  void EmitConstant(uint64_t Val, unsigned Size, raw_ostream &OS) const {
-    // Output the constant in little endian byte order.
-    for (unsigned i = 0; i != Size; ++i) {
-      EmitByte(Val & 255, OS);
-      Val >>= 8;
-    }
-  }
-
-  void EncodeInstruction(const MCInst &MI, raw_ostream &OS,
-                         SmallVectorImpl<MCFixup> &Fixups,
-                         const MCSubtargetInfo &STI) const;
-};
-
-} // end anonymous namespace
-
-MCCodeEmitter *llvm::createARM64MCCodeEmitter(const MCInstrInfo &MCII,
-                                              const MCRegisterInfo &MRI,
-                                              const MCSubtargetInfo &STI,
-                                              MCContext &Ctx) {
-  return new ARM64MCCodeEmitter(MCII, STI, Ctx);
-}
-
-/// getMachineOpValue - Return binary encoding of operand. If the machine
-/// operand requires relocation, record the relocation and return zero.
-unsigned
-ARM64MCCodeEmitter::getMachineOpValue(const MCInst &MI, const MCOperand &MO,
-                                      SmallVectorImpl<MCFixup> &Fixups,
-                                      const MCSubtargetInfo &STI) const {
-  if (MO.isReg())
-    return Ctx.getRegisterInfo()->getEncodingValue(MO.getReg());
-  else {
-    assert(MO.isImm() && "did not expect relocated expression");
-    return static_cast<unsigned>(MO.getImm());
-  }
-
-  assert(0 && "Unable to encode MCOperand!");
-  return 0;
-}
-
-template <uint32_t FixupKind>
-uint32_t
-ARM64MCCodeEmitter::getAMIndexed8OpValue(const MCInst &MI, unsigned OpIdx,
-                                         SmallVectorImpl<MCFixup> &Fixups,
-                                         const MCSubtargetInfo &STI) const {
-  unsigned BaseReg = MI.getOperand(OpIdx).getReg();
-  BaseReg = Ctx.getRegisterInfo()->getEncodingValue(BaseReg);
-
-  const MCOperand &MO = MI.getOperand(OpIdx + 1);
-  uint32_t ImmVal = 0;
-
-  if (MO.isImm())
-    ImmVal = static_cast<uint32_t>(MO.getImm());
-  else {
-    assert(MO.isExpr() && "unable to encode load/store imm operand");
-    MCFixupKind Kind = MCFixupKind(FixupKind);
-    Fixups.push_back(MCFixup::Create(0, MO.getExpr(), Kind, MI.getLoc()));
-    ++MCNumFixups;
-  }
-
-  return BaseReg | (ImmVal << 5);
-}
-
-/// getAdrLabelOpValue - Return encoding info for 21-bit immediate ADR label
-/// target.
-uint32_t
-ARM64MCCodeEmitter::getAdrLabelOpValue(const MCInst &MI, unsigned OpIdx,
-                                       SmallVectorImpl<MCFixup> &Fixups,
-                                       const MCSubtargetInfo &STI) const {
-  const MCOperand &MO = MI.getOperand(OpIdx);
-
-  // If the destination is an immediate, we have nothing to do.
-  if (MO.isImm())
-    return MO.getImm();
-  assert(MO.isExpr() && "Unexpected ADR target type!");
-  const MCExpr *Expr = MO.getExpr();
-
-  MCFixupKind Kind = MI.getOpcode() == ARM64::ADR
-                         ? MCFixupKind(ARM64::fixup_arm64_pcrel_adr_imm21)
-                         : MCFixupKind(ARM64::fixup_arm64_pcrel_adrp_imm21);
-  Fixups.push_back(MCFixup::Create(0, Expr, Kind, MI.getLoc()));
-
-  MCNumFixups += 1;
-
-  // All of the information is in the fixup.
-  return 0;
-}
-
-/// getAddSubImmOpValue - Return encoding for the 12-bit immediate value and
-/// the 2-bit shift field.  The shift field is stored in bits 13-14 of the
-/// return value.
-uint32_t
-ARM64MCCodeEmitter::getAddSubImmOpValue(const MCInst &MI, unsigned OpIdx,
-                                        SmallVectorImpl<MCFixup> &Fixups,
-                                        const MCSubtargetInfo &STI) const {
-  // Suboperands are [imm, shifter].
-  const MCOperand &MO = MI.getOperand(OpIdx);
-  const MCOperand &MO1 = MI.getOperand(OpIdx + 1);
-  assert(ARM64_AM::getShiftType(MO1.getImm()) == ARM64_AM::LSL &&
-         "unexpected shift type for add/sub immediate");
-  unsigned ShiftVal = ARM64_AM::getShiftValue(MO1.getImm());
-  assert((ShiftVal == 0 || ShiftVal == 12) &&
-         "unexpected shift value for add/sub immediate");
-  if (MO.isImm())
-    return MO.getImm() | (ShiftVal == 0 ? 0 : (1 << 12));
-  assert(MO.isExpr() && "Unable to encode MCOperand!");
-  const MCExpr *Expr = MO.getExpr();
-  assert(ShiftVal == 0 && "shift not allowed on add/sub immediate with fixup");
-
-  // Encode the 12 bits of the fixup.
-  MCFixupKind Kind = MCFixupKind(ARM64::fixup_arm64_add_imm12);
-  Fixups.push_back(MCFixup::Create(0, Expr, Kind, MI.getLoc()));
-
-  ++MCNumFixups;
-
-  return 0;
-}
-
-/// getCondBranchTargetOpValue - Return the encoded value for a conditional
-/// branch target.
-uint32_t ARM64MCCodeEmitter::getCondBranchTargetOpValue(
-    const MCInst &MI, unsigned OpIdx, SmallVectorImpl<MCFixup> &Fixups,
-    const MCSubtargetInfo &STI) const {
-  const MCOperand &MO = MI.getOperand(OpIdx);
-
-  // If the destination is an immediate, we have nothing to do.
-  if (MO.isImm())
-    return MO.getImm();
-  assert(MO.isExpr() && "Unexpected target type!");
-
-  MCFixupKind Kind = MCFixupKind(ARM64::fixup_arm64_pcrel_imm19);
-  Fixups.push_back(MCFixup::Create(0, MO.getExpr(), Kind, MI.getLoc()));
-
-  ++MCNumFixups;
-
-  // All of the information is in the fixup.
-  return 0;
-}
-
-uint32_t
-ARM64MCCodeEmitter::getMoveWideImmOpValue(const MCInst &MI, unsigned OpIdx,
-                                          SmallVectorImpl<MCFixup> &Fixups,
-                                          const MCSubtargetInfo &STI) const {
-  const MCOperand &MO = MI.getOperand(OpIdx);
-
-  if (MO.isImm())
-    return MO.getImm();
-  assert(MO.isExpr() && "Unexpected movz/movk immediate");
-
-  Fixups.push_back(MCFixup::Create(
-      0, MO.getExpr(), MCFixupKind(ARM64::fixup_arm64_movw), MI.getLoc()));
-
-  ++MCNumFixups;
-
-  return 0;
-}
-
-/// getTestBranchTargetOpValue - Return the encoded value for a test-bit-and-
-/// branch target.
-uint32_t ARM64MCCodeEmitter::getTestBranchTargetOpValue(
-    const MCInst &MI, unsigned OpIdx, SmallVectorImpl<MCFixup> &Fixups,
-    const MCSubtargetInfo &STI) const {
-  const MCOperand &MO = MI.getOperand(OpIdx);
-
-  // If the destination is an immediate, we have nothing to do.
-  if (MO.isImm())
-    return MO.getImm();
-  assert(MO.isExpr() && "Unexpected ADR target type!");
-
-  MCFixupKind Kind = MCFixupKind(ARM64::fixup_arm64_pcrel_branch14);
-  Fixups.push_back(MCFixup::Create(0, MO.getExpr(), Kind, MI.getLoc()));
-
-  ++MCNumFixups;
-
-  // All of the information is in the fixup.
-  return 0;
-}
-
-/// getBranchTargetOpValue - Return the encoded value for an unconditional
-/// branch target.
-uint32_t
-ARM64MCCodeEmitter::getBranchTargetOpValue(const MCInst &MI, unsigned OpIdx,
-                                           SmallVectorImpl<MCFixup> &Fixups,
-                                           const MCSubtargetInfo &STI) const {
-  const MCOperand &MO = MI.getOperand(OpIdx);
-
-  // If the destination is an immediate, we have nothing to do.
-  if (MO.isImm())
-    return MO.getImm();
-  assert(MO.isExpr() && "Unexpected ADR target type!");
-
-  MCFixupKind Kind = MI.getOpcode() == ARM64::BL
-                         ? MCFixupKind(ARM64::fixup_arm64_pcrel_call26)
-                         : MCFixupKind(ARM64::fixup_arm64_pcrel_branch26);
-  Fixups.push_back(MCFixup::Create(0, MO.getExpr(), Kind, MI.getLoc()));
-
-  ++MCNumFixups;
-
-  // All of the information is in the fixup.
-  return 0;
-}
-
-/// getVecShifterOpValue - Return the encoded value for the vector shifter:
-///
-///   00 -> 0
-///   01 -> 8
-///   10 -> 16
-///   11 -> 24
-uint32_t
-ARM64MCCodeEmitter::getVecShifterOpValue(const MCInst &MI, unsigned OpIdx,
-                                         SmallVectorImpl<MCFixup> &Fixups,
-                                         const MCSubtargetInfo &STI) const {
-  const MCOperand &MO = MI.getOperand(OpIdx);
-  assert(MO.isImm() && "Expected an immediate value for the shift amount!");
-
-  switch (MO.getImm()) {
-  default:
-    break;
-  case 0:
-    return 0;
-  case 8:
-    return 1;
-  case 16:
-    return 2;
-  case 24:
-    return 3;
-  }
-
-  assert(false && "Invalid value for vector shift amount!");
-  return 0;
-}
-
-uint32_t
-ARM64MCCodeEmitter::getSIMDShift64OpValue(const MCInst &MI, unsigned OpIdx,
-                                          SmallVectorImpl<MCFixup> &Fixups,
-                                          const MCSubtargetInfo &STI) const {
-  const MCOperand &MO = MI.getOperand(OpIdx);
-  assert(MO.isImm() && "Expected an immediate value for the shift amount!");
-  return 64 - (MO.getImm());
-}
-
-uint32_t
-ARM64MCCodeEmitter::getSIMDShift64_32OpValue(const MCInst &MI, unsigned OpIdx,
-                                             SmallVectorImpl<MCFixup> &Fixups,
-                                             const MCSubtargetInfo &STI) const {
-  const MCOperand &MO = MI.getOperand(OpIdx);
-  assert(MO.isImm() && "Expected an immediate value for the shift amount!");
-  return 64 - (MO.getImm() | 32);
-}
-
-uint32_t
-ARM64MCCodeEmitter::getSIMDShift32OpValue(const MCInst &MI, unsigned OpIdx,
-                                          SmallVectorImpl<MCFixup> &Fixups,
-                                          const MCSubtargetInfo &STI) const {
-  const MCOperand &MO = MI.getOperand(OpIdx);
-  assert(MO.isImm() && "Expected an immediate value for the shift amount!");
-  return 32 - (MO.getImm() | 16);
-}
-
-uint32_t
-ARM64MCCodeEmitter::getSIMDShift16OpValue(const MCInst &MI, unsigned OpIdx,
-                                          SmallVectorImpl<MCFixup> &Fixups,
-                                          const MCSubtargetInfo &STI) const {
-  const MCOperand &MO = MI.getOperand(OpIdx);
-  assert(MO.isImm() && "Expected an immediate value for the shift amount!");
-  return 16 - (MO.getImm() | 8);
-}
-
-/// getFixedPointScaleOpValue - Return the encoded value for the
-// FP-to-fixed-point scale factor.
-uint32_t ARM64MCCodeEmitter::getFixedPointScaleOpValue(
-    const MCInst &MI, unsigned OpIdx, SmallVectorImpl<MCFixup> &Fixups,
-    const MCSubtargetInfo &STI) const {
-  const MCOperand &MO = MI.getOperand(OpIdx);
-  assert(MO.isImm() && "Expected an immediate value for the scale amount!");
-  return 64 - MO.getImm();
-}
-
-uint32_t
-ARM64MCCodeEmitter::getVecShiftR64OpValue(const MCInst &MI, unsigned OpIdx,
-                                          SmallVectorImpl<MCFixup> &Fixups,
-                                          const MCSubtargetInfo &STI) const {
-  const MCOperand &MO = MI.getOperand(OpIdx);
-  assert(MO.isImm() && "Expected an immediate value for the scale amount!");
-  return 64 - MO.getImm();
-}
-
-uint32_t
-ARM64MCCodeEmitter::getVecShiftR32OpValue(const MCInst &MI, unsigned OpIdx,
-                                          SmallVectorImpl<MCFixup> &Fixups,
-                                          const MCSubtargetInfo &STI) const {
-  const MCOperand &MO = MI.getOperand(OpIdx);
-  assert(MO.isImm() && "Expected an immediate value for the scale amount!");
-  return 32 - MO.getImm();
-}
-
-uint32_t
-ARM64MCCodeEmitter::getVecShiftR16OpValue(const MCInst &MI, unsigned OpIdx,
-                                          SmallVectorImpl<MCFixup> &Fixups,
-                                          const MCSubtargetInfo &STI) const {
-  const MCOperand &MO = MI.getOperand(OpIdx);
-  assert(MO.isImm() && "Expected an immediate value for the scale amount!");
-  return 16 - MO.getImm();
-}
-
-uint32_t
-ARM64MCCodeEmitter::getVecShiftR8OpValue(const MCInst &MI, unsigned OpIdx,
-                                         SmallVectorImpl<MCFixup> &Fixups,
-                                         const MCSubtargetInfo &STI) const {
-  const MCOperand &MO = MI.getOperand(OpIdx);
-  assert(MO.isImm() && "Expected an immediate value for the scale amount!");
-  return 8 - MO.getImm();
-}
-
-uint32_t
-ARM64MCCodeEmitter::getVecShiftL64OpValue(const MCInst &MI, unsigned OpIdx,
-                                          SmallVectorImpl<MCFixup> &Fixups,
-                                          const MCSubtargetInfo &STI) const {
-  const MCOperand &MO = MI.getOperand(OpIdx);
-  assert(MO.isImm() && "Expected an immediate value for the scale amount!");
-  return MO.getImm() - 64;
-}
-
-uint32_t
-ARM64MCCodeEmitter::getVecShiftL32OpValue(const MCInst &MI, unsigned OpIdx,
-                                          SmallVectorImpl<MCFixup> &Fixups,
-                                          const MCSubtargetInfo &STI) const {
-  const MCOperand &MO = MI.getOperand(OpIdx);
-  assert(MO.isImm() && "Expected an immediate value for the scale amount!");
-  return MO.getImm() - 32;
-}
-
-uint32_t
-ARM64MCCodeEmitter::getVecShiftL16OpValue(const MCInst &MI, unsigned OpIdx,
-                                          SmallVectorImpl<MCFixup> &Fixups,
-                                          const MCSubtargetInfo &STI) const {
-  const MCOperand &MO = MI.getOperand(OpIdx);
-  assert(MO.isImm() && "Expected an immediate value for the scale amount!");
-  return MO.getImm() - 16;
-}
-
-uint32_t
-ARM64MCCodeEmitter::getVecShiftL8OpValue(const MCInst &MI, unsigned OpIdx,
-                                         SmallVectorImpl<MCFixup> &Fixups,
-                                         const MCSubtargetInfo &STI) const {
-  const MCOperand &MO = MI.getOperand(OpIdx);
-  assert(MO.isImm() && "Expected an immediate value for the scale amount!");
-  return MO.getImm() - 8;
-}
-
-/// getMoveVecShifterOpValue - Return the encoded value for the vector move
-/// shifter (MSL).
-uint32_t
-ARM64MCCodeEmitter::getMoveVecShifterOpValue(const MCInst &MI, unsigned OpIdx,
-                                             SmallVectorImpl<MCFixup> &Fixups,
-                                             const MCSubtargetInfo &STI) const {
-  const MCOperand &MO = MI.getOperand(OpIdx);
-  assert(MO.isImm() &&
-         "Expected an immediate value for the move shift amount!");
-  unsigned ShiftVal = ARM64_AM::getShiftValue(MO.getImm());
-  assert((ShiftVal == 8 || ShiftVal == 16) && "Invalid shift amount!");
-  return ShiftVal == 8 ? 0 : 1;
-}
-
-unsigned ARM64MCCodeEmitter::fixMOVZ(const MCInst &MI, unsigned EncodedValue,
-                                     const MCSubtargetInfo &STI) const {
-  // If one of the signed fixup kinds is applied to a MOVZ instruction, the
-  // eventual result could be either a MOVZ or a MOVN. It's the MCCodeEmitter's
-  // job to ensure that any bits possibly affected by this are 0. This means we
-  // must zero out bit 30 (essentially emitting a MOVN).
-  MCOperand UImm16MO = MI.getOperand(1);
-
-  // Nothing to do if there's no fixup.
-  if (UImm16MO.isImm())
-    return EncodedValue;
-
-  return EncodedValue & ~(1u << 30);
-}
-
-void ARM64MCCodeEmitter::EncodeInstruction(const MCInst &MI, raw_ostream &OS,
-                                           SmallVectorImpl<MCFixup> &Fixups,
-                                           const MCSubtargetInfo &STI) const {
-  if (MI.getOpcode() == ARM64::TLSDESCCALL) {
-    // This is a directive which applies an R_AARCH64_TLSDESC_CALL to the
-    // following (BLR) instruction. It doesn't emit any code itself so it
-    // doesn't go through the normal TableGenerated channels.
-    MCFixupKind Fixup = MCFixupKind(ARM64::fixup_arm64_tlsdesc_call);
-    Fixups.push_back(MCFixup::Create(0, MI.getOperand(0).getExpr(), Fixup));
-    return;
-  }
-
-  uint64_t Binary = getBinaryCodeForInstr(MI, Fixups, STI);
-  EmitConstant(Binary, 4, OS);
-  ++MCNumEmitted; // Keep track of the # of mi's emitted.
-}
-
-#include "ARM64GenMCCodeEmitter.inc"
diff --git a/lib/Target/ARM64/MCTargetDesc/ARM64MCExpr.cpp b/lib/Target/ARM64/MCTargetDesc/ARM64MCExpr.cpp
deleted file mode 100644
index d4ab140..0000000
--- a/lib/Target/ARM64/MCTargetDesc/ARM64MCExpr.cpp
+++ /dev/null
@@ -1,168 +0,0 @@
-//===-- ARM64MCExpr.cpp - ARM64 specific MC expression classes --------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains the implementation of the assembly expression modifiers
-// accepted by the AArch64 architecture (e.g. ":lo12:", ":gottprel_g1:", ...).
-//
-//===----------------------------------------------------------------------===//
-
-#define DEBUG_TYPE "aarch64symbolrefexpr"
-#include "ARM64MCExpr.h"
-#include "llvm/MC/MCAssembler.h"
-#include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCELF.h"
-#include "llvm/MC/MCSymbol.h"
-#include "llvm/MC/MCValue.h"
-#include "llvm/Object/ELF.h"
-#include "llvm/Support/ErrorHandling.h"
-
-using namespace llvm;
-
-const ARM64MCExpr *ARM64MCExpr::Create(const MCExpr *Expr, VariantKind Kind,
-                                       MCContext &Ctx) {
-  return new (Ctx) ARM64MCExpr(Expr, Kind);
-}
-
-StringRef ARM64MCExpr::getVariantKindName() const {
-  switch (static_cast<uint32_t>(getKind())) {
-  case VK_CALL:                return "";
-  case VK_LO12:                return ":lo12:";
-  case VK_ABS_G3:              return ":abs_g3:";
-  case VK_ABS_G2:              return ":abs_g2:";
-  case VK_ABS_G2_NC:           return ":abs_g2_nc:";
-  case VK_ABS_G1:              return ":abs_g1:";
-  case VK_ABS_G1_NC:           return ":abs_g1_nc:";
-  case VK_ABS_G0:              return ":abs_g0:";
-  case VK_ABS_G0_NC:           return ":abs_g0_nc:";
-  case VK_DTPREL_G2:           return ":dtprel_g2:";
-  case VK_DTPREL_G1:           return ":dtprel_g1:";
-  case VK_DTPREL_G1_NC:        return ":dtprel_g1_nc:";
-  case VK_DTPREL_G0:           return ":dtprel_g0:";
-  case VK_DTPREL_G0_NC:        return ":dtprel_g0_nc:";
-  case VK_DTPREL_LO12:         return ":dtprel_lo12:";
-  case VK_DTPREL_LO12_NC:      return ":dtprel_lo12_nc:";
-  case VK_TPREL_G2:            return ":tprel_g2:";
-  case VK_TPREL_G1:            return ":tprel_g1:";
-  case VK_TPREL_G1_NC:         return ":tprel_g1_nc:";
-  case VK_TPREL_G0:            return ":tprel_g0:";
-  case VK_TPREL_G0_NC:         return ":tprel_g0_nc:";
-  case VK_TPREL_LO12:          return ":tprel_lo12:";
-  case VK_TPREL_LO12_NC:       return ":tprel_lo12_nc:";
-  case VK_TLSDESC_LO12:        return ":tlsdesc_lo12:";
-  case VK_ABS_PAGE:            return "";
-  case VK_GOT_PAGE:            return ":got:";
-  case VK_GOT_LO12:            return ":got_lo12:";
-  case VK_GOTTPREL_PAGE:       return ":gottprel:";
-  case VK_GOTTPREL_LO12_NC:    return ":gottprel_lo12:";
-  case VK_GOTTPREL_G1:         return ":gottprel_g1:";
-  case VK_GOTTPREL_G0_NC:      return ":gottprel_g0_nc:";
-  case VK_TLSDESC:             return "";
-  case VK_TLSDESC_PAGE:        return ":tlsdesc:";
-  default:
-    llvm_unreachable("Invalid ELF symbol kind");
-  }
-}
-
-void ARM64MCExpr::PrintImpl(raw_ostream &OS) const {
-  if (getKind() != VK_NONE)
-    OS << getVariantKindName();
-  OS << *Expr;
-}
-
-// FIXME: This basically copies MCObjectStreamer::AddValueSymbols. Perhaps
-// that method should be made public?
-// FIXME: really do above: now that two backends are using it.
-static void AddValueSymbolsImpl(const MCExpr *Value, MCAssembler *Asm) {
-  switch (Value->getKind()) {
-  case MCExpr::Target:
-    llvm_unreachable("Can't handle nested target expr!");
-    break;
-
-  case MCExpr::Constant:
-    break;
-
-  case MCExpr::Binary: {
-    const MCBinaryExpr *BE = cast<MCBinaryExpr>(Value);
-    AddValueSymbolsImpl(BE->getLHS(), Asm);
-    AddValueSymbolsImpl(BE->getRHS(), Asm);
-    break;
-  }
-
-  case MCExpr::SymbolRef:
-    Asm->getOrCreateSymbolData(cast<MCSymbolRefExpr>(Value)->getSymbol());
-    break;
-
-  case MCExpr::Unary:
-    AddValueSymbolsImpl(cast<MCUnaryExpr>(Value)->getSubExpr(), Asm);
-    break;
-  }
-}
-
-void ARM64MCExpr::AddValueSymbols(MCAssembler *Asm) const {
-  AddValueSymbolsImpl(getSubExpr(), Asm);
-}
-
-const MCSection *ARM64MCExpr::FindAssociatedSection() const {
-  llvm_unreachable("FIXME: what goes here?");
-}
-
-bool ARM64MCExpr::EvaluateAsRelocatableImpl(MCValue &Res,
-                                            const MCAsmLayout *Layout) const {
-  if (!getSubExpr()->EvaluateAsRelocatable(Res, Layout))
-    return false;
-
-  Res =
-      MCValue::get(Res.getSymA(), Res.getSymB(), Res.getConstant(), getKind());
-
-  return true;
-}
-
-static void fixELFSymbolsInTLSFixupsImpl(const MCExpr *Expr, MCAssembler &Asm) {
-  switch (Expr->getKind()) {
-  case MCExpr::Target:
-    llvm_unreachable("Can't handle nested target expression");
-    break;
-  case MCExpr::Constant:
-    break;
-
-  case MCExpr::Binary: {
-    const MCBinaryExpr *BE = cast<MCBinaryExpr>(Expr);
-    fixELFSymbolsInTLSFixupsImpl(BE->getLHS(), Asm);
-    fixELFSymbolsInTLSFixupsImpl(BE->getRHS(), Asm);
-    break;
-  }
-
-  case MCExpr::SymbolRef: {
-    // We're known to be under a TLS fixup, so any symbol should be
-    // modified. There should be only one.
-    const MCSymbolRefExpr &SymRef = *cast<MCSymbolRefExpr>(Expr);
-    MCSymbolData &SD = Asm.getOrCreateSymbolData(SymRef.getSymbol());
-    MCELF::SetType(SD, ELF::STT_TLS);
-    break;
-  }
-
-  case MCExpr::Unary:
-    fixELFSymbolsInTLSFixupsImpl(cast<MCUnaryExpr>(Expr)->getSubExpr(), Asm);
-    break;
-  }
-}
-
-void ARM64MCExpr::fixELFSymbolsInTLSFixups(MCAssembler &Asm) const {
-  switch (getSymbolLoc(Kind)) {
-  default:
-    return;
-  case VK_DTPREL:
-  case VK_GOTTPREL:
-  case VK_TPREL:
-  case VK_TLSDESC:
-    break;
-  }
-
-  fixELFSymbolsInTLSFixupsImpl(getSubExpr(), Asm);
-}
diff --git a/lib/Target/ARM64/MCTargetDesc/ARM64MCExpr.h b/lib/Target/ARM64/MCTargetDesc/ARM64MCExpr.h
deleted file mode 100644
index a33fe43..0000000
--- a/lib/Target/ARM64/MCTargetDesc/ARM64MCExpr.h
+++ /dev/null
@@ -1,162 +0,0 @@
-//=---- ARM64MCExpr.h - ARM64 specific MC expression classes ------*- C++ -*-=//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file describes ARM64-specific MCExprs, used for modifiers like
-// ":lo12:" or ":gottprel_g1:".
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_ARM64MCEXPR_H
-#define LLVM_ARM64MCEXPR_H
-
-#include "llvm/MC/MCExpr.h"
-#include "llvm/Support/ErrorHandling.h"
-
-namespace llvm {
-
-class ARM64MCExpr : public MCTargetExpr {
-public:
-  enum VariantKind {
-    VK_NONE     = 0x000,
-
-    // Symbol locations specifying (roughly speaking) what calculation should be
-    // performed to construct the final address for the relocated
-    // symbol. E.g. direct, via the GOT, ...
-    VK_ABS      = 0x001,
-    VK_SABS     = 0x002,
-    VK_GOT      = 0x003,
-    VK_DTPREL   = 0x004,
-    VK_GOTTPREL = 0x005,
-    VK_TPREL    = 0x006,
-    VK_TLSDESC  = 0x007,
-    VK_SymLocBits = 0x00f,
-
-    // Variants specifying which part of the final address calculation is
-    // used. E.g. the low 12 bits for an ADD/LDR, the middle 16 bits for a
-    // MOVZ/MOVK.
-    VK_PAGE     = 0x010,
-    VK_PAGEOFF  = 0x020,
-    VK_G0       = 0x030,
-    VK_G1       = 0x040,
-    VK_G2       = 0x050,
-    VK_G3       = 0x060,
-    VK_AddressFragBits = 0x0f0,
-
-    // Whether the final relocation is a checked one (where a linker should
-    // perform a range-check on the final address) or not. Note that this field
-    // is unfortunately sometimes omitted from the assembly syntax. E.g. :lo12:
-    // on its own is a non-checked relocation. We side with ELF on being
-    // explicit about this!
-    VK_NC       = 0x100,
-
-    // Convenience definitions for referring to specific textual representations
-    // of relocation specifiers. Note that this means the "_NC" is sometimes
-    // omitted in line with assembly syntax here (VK_LO12 rather than VK_LO12_NC
-    // since a user would write ":lo12:").
-    VK_CALL              = VK_ABS,
-    VK_ABS_PAGE          = VK_ABS      | VK_PAGE,
-    VK_ABS_G3            = VK_ABS      | VK_G3,
-    VK_ABS_G2            = VK_ABS      | VK_G2,
-    VK_ABS_G2_NC         = VK_ABS      | VK_G2      | VK_NC,
-    VK_ABS_G1            = VK_ABS      | VK_G1,
-    VK_ABS_G1_NC         = VK_ABS      | VK_G1      | VK_NC,
-    VK_ABS_G0            = VK_ABS      | VK_G0,
-    VK_ABS_G0_NC         = VK_ABS      | VK_G0      | VK_NC,
-    VK_LO12              = VK_ABS      | VK_PAGEOFF | VK_NC,
-    VK_GOT_LO12          = VK_GOT      | VK_PAGEOFF | VK_NC,
-    VK_GOT_PAGE          = VK_GOT      | VK_PAGE,
-    VK_DTPREL_G2         = VK_DTPREL   | VK_G2,
-    VK_DTPREL_G1         = VK_DTPREL   | VK_G1,
-    VK_DTPREL_G1_NC      = VK_DTPREL   | VK_G1      | VK_NC,
-    VK_DTPREL_G0         = VK_DTPREL   | VK_G0,
-    VK_DTPREL_G0_NC      = VK_DTPREL   | VK_G0      | VK_NC,
-    VK_DTPREL_LO12       = VK_DTPREL   | VK_PAGEOFF,
-    VK_DTPREL_LO12_NC    = VK_DTPREL   | VK_PAGEOFF | VK_NC,
-    VK_GOTTPREL_PAGE     = VK_GOTTPREL | VK_PAGE,
-    VK_GOTTPREL_LO12_NC  = VK_GOTTPREL | VK_PAGEOFF | VK_NC,
-    VK_GOTTPREL_G1       = VK_GOTTPREL | VK_G1,
-    VK_GOTTPREL_G0_NC    = VK_GOTTPREL | VK_G0      | VK_NC,
-    VK_TPREL_G2          = VK_TPREL    | VK_G2,
-    VK_TPREL_G1          = VK_TPREL    | VK_G1,
-    VK_TPREL_G1_NC       = VK_TPREL    | VK_G1      | VK_NC,
-    VK_TPREL_G0          = VK_TPREL    | VK_G0,
-    VK_TPREL_G0_NC       = VK_TPREL    | VK_G0      | VK_NC,
-    VK_TPREL_LO12        = VK_TPREL    | VK_PAGEOFF,
-    VK_TPREL_LO12_NC     = VK_TPREL    | VK_PAGEOFF | VK_NC,
-    VK_TLSDESC_LO12      = VK_TLSDESC  | VK_PAGEOFF | VK_NC,
-    VK_TLSDESC_PAGE      = VK_TLSDESC  | VK_PAGE,
-
-    VK_INVALID  = 0xfff
-  };
-
-private:
-  const MCExpr *Expr;
-  const VariantKind Kind;
-
-  explicit ARM64MCExpr(const MCExpr *Expr, VariantKind Kind)
-    : Expr(Expr), Kind(Kind) {}
-
-public:
-  /// @name Construction
-  /// @{
-
-  static const ARM64MCExpr *Create(const MCExpr *Expr, VariantKind Kind,
-                                   MCContext &Ctx);
-
-  /// @}
-  /// @name Accessors
-  /// @{
-
-  /// Get the kind of this expression.
-  VariantKind getKind() const { return static_cast<VariantKind>(Kind); }
-
-  /// Get the expression this modifier applies to.
-  const MCExpr *getSubExpr() const { return Expr; }
-
-  /// @}
-  /// @name VariantKind information extractors.
-  /// @{
-
-  static VariantKind getSymbolLoc(VariantKind Kind) {
-    return static_cast<VariantKind>(Kind & VK_SymLocBits);
-  }
-
-  static VariantKind getAddressFrag(VariantKind Kind) {
-    return static_cast<VariantKind>(Kind & VK_AddressFragBits);
-  }
-
-  static bool isNotChecked(VariantKind Kind) { return Kind & VK_NC; }
-
-  /// @}
-
-  /// Convert the variant kind into an ELF-appropriate modifier
-  /// (e.g. ":got:", ":lo12:").
-  StringRef getVariantKindName() const;
-
-  void PrintImpl(raw_ostream &OS) const;
-
-  void AddValueSymbols(MCAssembler *) const;
-
-  const MCSection *FindAssociatedSection() const;
-
-  bool EvaluateAsRelocatableImpl(MCValue &Res,
-                                 const MCAsmLayout *Layout) const;
-
-  void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const;
-
-  static bool classof(const MCExpr *E) {
-    return E->getKind() == MCExpr::Target;
-  }
-
-  static bool classof(const ARM64MCExpr *) { return true; }
-
-};
-} // end namespace llvm
-
-#endif
diff --git a/lib/Target/ARM64/MCTargetDesc/ARM64MCTargetDesc.cpp b/lib/Target/ARM64/MCTargetDesc/ARM64MCTargetDesc.cpp
deleted file mode 100644
index 8d54412..0000000
--- a/lib/Target/ARM64/MCTargetDesc/ARM64MCTargetDesc.cpp
+++ /dev/null
@@ -1,167 +0,0 @@
-//===-- ARM64MCTargetDesc.cpp - ARM64 Target Descriptions -------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file provides ARM64 specific target descriptions.
-//
-//===----------------------------------------------------------------------===//
-
-#include "ARM64MCTargetDesc.h"
-#include "ARM64ELFStreamer.h"
-#include "ARM64MCAsmInfo.h"
-#include "InstPrinter/ARM64InstPrinter.h"
-#include "llvm/MC/MCCodeGenInfo.h"
-#include "llvm/MC/MCInstrInfo.h"
-#include "llvm/MC/MCRegisterInfo.h"
-#include "llvm/MC/MCStreamer.h"
-#include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/TargetRegistry.h"
-
-#define GET_INSTRINFO_MC_DESC
-#include "ARM64GenInstrInfo.inc"
-
-#define GET_SUBTARGETINFO_MC_DESC
-#include "ARM64GenSubtargetInfo.inc"
-
-#define GET_REGINFO_MC_DESC
-#include "ARM64GenRegisterInfo.inc"
-
-using namespace llvm;
-
-static MCInstrInfo *createARM64MCInstrInfo() {
-  MCInstrInfo *X = new MCInstrInfo();
-  InitARM64MCInstrInfo(X);
-  return X;
-}
-
-static MCSubtargetInfo *createARM64MCSubtargetInfo(StringRef TT, StringRef CPU,
-                                                   StringRef FS) {
-  MCSubtargetInfo *X = new MCSubtargetInfo();
-  InitARM64MCSubtargetInfo(X, TT, CPU, FS);
-  return X;
-}
-
-static MCRegisterInfo *createARM64MCRegisterInfo(StringRef Triple) {
-  MCRegisterInfo *X = new MCRegisterInfo();
-  InitARM64MCRegisterInfo(X, ARM64::LR);
-  return X;
-}
-
-static MCAsmInfo *createARM64MCAsmInfo(const MCRegisterInfo &MRI,
-                                       StringRef TT) {
-  Triple TheTriple(TT);
-
-  MCAsmInfo *MAI;
-  if (TheTriple.isOSDarwin())
-    MAI = new ARM64MCAsmInfoDarwin();
-  else {
-    assert(TheTriple.isOSBinFormatELF() && "Only expect Darwin or ELF");
-    MAI = new ARM64MCAsmInfoELF();
-  }
-
-  // Initial state of the frame pointer is SP.
-  unsigned Reg = MRI.getDwarfRegNum(ARM64::SP, true);
-  MCCFIInstruction Inst = MCCFIInstruction::createDefCfa(0, Reg, 0);
-  MAI->addInitialFrameState(Inst);
-
-  return MAI;
-}
-
-static MCCodeGenInfo *createARM64MCCodeGenInfo(StringRef TT, Reloc::Model RM,
-                                               CodeModel::Model CM,
-                                               CodeGenOpt::Level OL) {
-  Triple TheTriple(TT);
-  assert((TheTriple.isOSBinFormatELF() || TheTriple.isOSBinFormatMachO()) &&
-         "Only expect Darwin and ELF targets");
-
-  if (CM == CodeModel::Default)
-    CM = CodeModel::Small;
-  // The default MCJIT memory managers make no guarantees about where they can
-  // find an executable page; JITed code needs to be able to refer to globals
-  // no matter how far away they are.
-  else if (CM == CodeModel::JITDefault)
-    CM = CodeModel::Large;
-  else if (CM != CodeModel::Small && CM != CodeModel::Large)
-    report_fatal_error("Only small and large code models are allowed on ARM64");
-
-  // ARM64 Darwin is always PIC.
-  if (TheTriple.isOSDarwin())
-    RM = Reloc::PIC_;
-  // On ELF platforms the default static relocation model has a smart enough
-  // linker to cope with referencing external symbols defined in a shared
-  // library. Hence DynamicNoPIC doesn't need to be promoted to PIC.
-  else if (RM == Reloc::Default || RM == Reloc::DynamicNoPIC)
-    RM = Reloc::Static;
-
-  MCCodeGenInfo *X = new MCCodeGenInfo();
-  X->InitMCCodeGenInfo(RM, CM, OL);
-  return X;
-}
-
-static MCInstPrinter *createARM64MCInstPrinter(const Target &T,
-                                               unsigned SyntaxVariant,
-                                               const MCAsmInfo &MAI,
-                                               const MCInstrInfo &MII,
-                                               const MCRegisterInfo &MRI,
-                                               const MCSubtargetInfo &STI) {
-  if (SyntaxVariant == 0)
-    return new ARM64InstPrinter(MAI, MII, MRI, STI);
-  if (SyntaxVariant == 1)
-    return new ARM64AppleInstPrinter(MAI, MII, MRI, STI);
-
-  return 0;
-}
-
-static MCStreamer *createMCStreamer(const Target &T, StringRef TT,
-                                    MCContext &Ctx, MCAsmBackend &TAB,
-                                    raw_ostream &OS, MCCodeEmitter *Emitter,
-                                    const MCSubtargetInfo &STI, bool RelaxAll,
-                                    bool NoExecStack) {
-  Triple TheTriple(TT);
-
-  if (TheTriple.isOSDarwin())
-    return createMachOStreamer(Ctx, TAB, OS, Emitter, RelaxAll,
-                               /*LabelSections*/ true);
-
-  return createARM64ELFStreamer(Ctx, TAB, OS, Emitter, RelaxAll, NoExecStack);
-}
-
-// Force static initialization.
-extern "C" void LLVMInitializeARM64TargetMC() {
-  // Register the MC asm info.
-  RegisterMCAsmInfoFn X(TheARM64Target, createARM64MCAsmInfo);
-
-  // Register the MC codegen info.
-  TargetRegistry::RegisterMCCodeGenInfo(TheARM64Target,
-                                        createARM64MCCodeGenInfo);
-
-  // Register the MC instruction info.
-  TargetRegistry::RegisterMCInstrInfo(TheARM64Target, createARM64MCInstrInfo);
-
-  // Register the MC register info.
-  TargetRegistry::RegisterMCRegInfo(TheARM64Target, createARM64MCRegisterInfo);
-
-  // Register the MC subtarget info.
-  TargetRegistry::RegisterMCSubtargetInfo(TheARM64Target,
-                                          createARM64MCSubtargetInfo);
-
-  // Register the asm backend.
-  TargetRegistry::RegisterMCAsmBackend(TheARM64Target, createARM64AsmBackend);
-
-  // Register the MC Code Emitter
-  TargetRegistry::RegisterMCCodeEmitter(TheARM64Target,
-                                        createARM64MCCodeEmitter);
-
-  // Register the object streamer.
-  TargetRegistry::RegisterMCObjectStreamer(TheARM64Target, createMCStreamer);
-
-  // Register the MCInstPrinter.
-  TargetRegistry::RegisterMCInstPrinter(TheARM64Target,
-                                        createARM64MCInstPrinter);
-}
diff --git a/lib/Target/ARM64/MCTargetDesc/ARM64MCTargetDesc.h b/lib/Target/ARM64/MCTargetDesc/ARM64MCTargetDesc.h
deleted file mode 100644
index 0db2b22..0000000
--- a/lib/Target/ARM64/MCTargetDesc/ARM64MCTargetDesc.h
+++ /dev/null
@@ -1,62 +0,0 @@
-//===-- ARM64MCTargetDesc.h - ARM64 Target Descriptions ---------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file provides ARM64 specific target descriptions.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef ARM64MCTARGETDESC_H
-#define ARM64MCTARGETDESC_H
-
-#include "llvm/Support/DataTypes.h"
-#include <string>
-
-namespace llvm {
-class MCAsmBackend;
-class MCCodeEmitter;
-class MCContext;
-class MCInstrInfo;
-class MCRegisterInfo;
-class MCObjectWriter;
-class MCSubtargetInfo;
-class StringRef;
-class Target;
-class raw_ostream;
-
-extern Target TheARM64Target;
-
-MCCodeEmitter *createARM64MCCodeEmitter(const MCInstrInfo &MCII,
-                                        const MCRegisterInfo &MRI,
-                                        const MCSubtargetInfo &STI,
-                                        MCContext &Ctx);
-MCAsmBackend *createARM64AsmBackend(const Target &T, const MCRegisterInfo &MRI,
-                                    StringRef TT, StringRef CPU);
-
-MCObjectWriter *createARM64ELFObjectWriter(raw_ostream &OS, uint8_t OSABI);
-
-MCObjectWriter *createARM64MachObjectWriter(raw_ostream &OS, uint32_t CPUType,
-                                            uint32_t CPUSubtype);
-
-} // End llvm namespace
-
-// Defines symbolic names for ARM64 registers.  This defines a mapping from
-// register name to register number.
-//
-#define GET_REGINFO_ENUM
-#include "ARM64GenRegisterInfo.inc"
-
-// Defines symbolic names for the ARM64 instructions.
-//
-#define GET_INSTRINFO_ENUM
-#include "ARM64GenInstrInfo.inc"
-
-#define GET_SUBTARGETINFO_ENUM
-#include "ARM64GenSubtargetInfo.inc"
-
-#endif
diff --git a/lib/Target/ARM64/MCTargetDesc/CMakeLists.txt b/lib/Target/ARM64/MCTargetDesc/CMakeLists.txt
deleted file mode 100644
index f8665bc..0000000
--- a/lib/Target/ARM64/MCTargetDesc/CMakeLists.txt
+++ /dev/null
@@ -1,14 +0,0 @@
-add_llvm_library(LLVMARM64Desc
-  ARM64AsmBackend.cpp
-  ARM64ELFObjectWriter.cpp
-  ARM64ELFStreamer.cpp
-  ARM64MCAsmInfo.cpp
-  ARM64MCCodeEmitter.cpp
-  ARM64MCExpr.cpp
-  ARM64MCTargetDesc.cpp
-  ARM64MachObjectWriter.cpp
-)
-add_dependencies(LLVMARM64Desc ARM64CommonTableGen)
-
-# Hack: we need to include 'main' target directory to grab private headers
-include_directories(${CMAKE_CURRENT_SOURCE_DIR}/.. ${CMAKE_CURRENT_BINARY_DIR}/..)
diff --git a/lib/Target/ARM64/MCTargetDesc/LLVMBuild.txt b/lib/Target/ARM64/MCTargetDesc/LLVMBuild.txt
deleted file mode 100644
index e4c74d2..0000000
--- a/lib/Target/ARM64/MCTargetDesc/LLVMBuild.txt
+++ /dev/null
@@ -1,24 +0,0 @@
-;===- ./lib/Target/ARM64/MCTargetDesc/LLVMBuild.txt ------------*- Conf -*--===;
-;
-;                     The LLVM Compiler Infrastructure
-;
-; This file is distributed under the University of Illinois Open Source
-; License. See LICENSE.TXT for details.
-;
-;===------------------------------------------------------------------------===;
-;
-; This is an LLVMBuild description file for the components in this subdirectory.
-;
-; For more information on the LLVMBuild system, please see:
-;
-;   http://llvm.org/docs/LLVMBuild.html
-;
-;===------------------------------------------------------------------------===;
-
-[component_0]
-type = Library
-name = ARM64Desc
-parent = ARM64
-required_libraries = ARM64AsmPrinter ARM64Info MC Support
-add_to_library_groups = ARM64
-
diff --git a/lib/Target/ARM64/MCTargetDesc/Makefile b/lib/Target/ARM64/MCTargetDesc/Makefile
deleted file mode 100644
index 013cc63..0000000
--- a/lib/Target/ARM64/MCTargetDesc/Makefile
+++ /dev/null
@@ -1,16 +0,0 @@
-##===- lib/Target/ARM64/TargetDesc/Makefile ----------------*- Makefile -*-===##
-#
-#                     The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../../..
-LIBRARYNAME = LLVMARM64Desc
-
-# Hack: we need to include 'main' target directory to grab private headers
-CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/ARM64/Makefile b/lib/Target/ARM64/Makefile
deleted file mode 100644
index 5f0f307..0000000
--- a/lib/Target/ARM64/Makefile
+++ /dev/null
@@ -1,25 +0,0 @@
-##===- lib/Target/ARM64/Makefile ---------------------------*- Makefile -*-===##
-#
-#                     The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-
-LEVEL = ../../..
-LIBRARYNAME = LLVMARM64CodeGen
-TARGET = ARM64
-
-# Make sure that tblgen is run, first thing.
-BUILT_SOURCES = ARM64GenRegisterInfo.inc ARM64GenInstrInfo.inc \
-		ARM64GenAsmWriter.inc ARM64GenAsmWriter1.inc \
-		ARM64GenDAGISel.inc \
-		ARM64GenCallingConv.inc ARM64GenAsmMatcher.inc \
-		ARM64GenSubtargetInfo.inc ARM64GenMCCodeEmitter.inc \
-		ARM64GenFastISel.inc ARM64GenDisassemblerTables.inc \
-		ARM64GenMCPseudoLowering.inc
-
-DIRS = TargetInfo InstPrinter AsmParser Disassembler MCTargetDesc
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/ARM64/TargetInfo/ARM64TargetInfo.cpp b/lib/Target/ARM64/TargetInfo/ARM64TargetInfo.cpp
deleted file mode 100644
index dec09ed..0000000
--- a/lib/Target/ARM64/TargetInfo/ARM64TargetInfo.cpp
+++ /dev/null
@@ -1,21 +0,0 @@
-//===-- ARM64TargetInfo.cpp - ARM64 Target Implementation -----------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/ADT/Triple.h"
-#include "llvm/Support/TargetRegistry.h"
-using namespace llvm;
-
-namespace llvm {
-Target TheARM64Target;
-} // end namespace llvm
-
-extern "C" void LLVMInitializeARM64TargetInfo() {
-  RegisterTarget<Triple::arm64, /*HasJIT=*/true> X(TheARM64Target, "arm64",
-                                                   "ARM64");
-}
diff --git a/lib/Target/ARM64/TargetInfo/CMakeLists.txt b/lib/Target/ARM64/TargetInfo/CMakeLists.txt
deleted file mode 100644
index a0142c4..0000000
--- a/lib/Target/ARM64/TargetInfo/CMakeLists.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
-
-add_llvm_library(LLVMARM64Info
-  ARM64TargetInfo.cpp
-  )
-
-add_dependencies(LLVMARM64Info ARM64CommonTableGen)
diff --git a/lib/Target/ARM64/TargetInfo/LLVMBuild.txt b/lib/Target/ARM64/TargetInfo/LLVMBuild.txt
deleted file mode 100644
index 5bea694..0000000
--- a/lib/Target/ARM64/TargetInfo/LLVMBuild.txt
+++ /dev/null
@@ -1,24 +0,0 @@
-;===- ./lib/Target/ARM64/TargetInfo/LLVMBuild.txt --------------*- Conf -*--===;
-;
-;                     The LLVM Compiler Infrastructure
-;
-; This file is distributed under the University of Illinois Open Source
-; License. See LICENSE.TXT for details.
-;
-;===------------------------------------------------------------------------===;
-;
-; This is an LLVMBuild description file for the components in this subdirectory.
-;
-; For more information on the LLVMBuild system, please see:
-;
-;   http://llvm.org/docs/LLVMBuild.html
-;
-;===------------------------------------------------------------------------===;
-
-[component_0]
-type = Library
-name = ARM64Info
-parent = ARM64
-required_libraries = MC Support
-add_to_library_groups = ARM64
-
diff --git a/lib/Target/ARM64/TargetInfo/Makefile b/lib/Target/ARM64/TargetInfo/Makefile
deleted file mode 100644
index 2d5a1a0..0000000
--- a/lib/Target/ARM64/TargetInfo/Makefile
+++ /dev/null
@@ -1,15 +0,0 @@
-##===- lib/Target/ARM64/TargetInfo/Makefile ----------------*- Makefile -*-===##
-#
-#                     The LLVM Compiler Infrastructure
-#
-# This file is distributed under the University of Illinois Open Source
-# License. See LICENSE.TXT for details.
-#
-##===----------------------------------------------------------------------===##
-LEVEL = ../../../..
-LIBRARYNAME = LLVMARM64Info
-
-# Hack: we need to include 'main' target directory to grab private headers
-CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
-
-include $(LEVEL)/Makefile.common
diff --git a/lib/Target/CppBackend/CPPBackend.cpp b/lib/Target/CppBackend/CPPBackend.cpp
index afd1f51..15b574d 100644
--- a/lib/Target/CppBackend/CPPBackend.cpp
+++ b/lib/Target/CppBackend/CPPBackend.cpp
@@ -108,9 +108,9 @@ namespace {
     explicit CppWriter(formatted_raw_ostream &o) :
       ModulePass(ID), Out(o), uniqueNum(0), is_inline(false), indent_level(0){}
 
-    virtual const char *getPassName() const { return "C++ backend"; }
+    const char *getPassName() const override { return "C++ backend"; }
 
-    bool runOnModule(Module &M);
+    bool runOnModule(Module &M) override;
 
     void printProgram(const std::string& fname, const std::string& modName );
     void printModule(const std::string& fname, const std::string& modName );
@@ -396,7 +396,7 @@ std::string CppWriter::getCppName(Type* Ty) {
     return I->second;
 
   // Okay, let's build a new name for this type. Start with a prefix
-  const char* prefix = 0;
+  const char* prefix = nullptr;
   switch (Ty->getTypeID()) {
   case Type::FunctionTyID:    prefix = "FuncTy_"; break;
   case Type::StructTyID:      prefix = "StructTy_"; break;
@@ -1690,9 +1690,8 @@ void CppWriter::printFunctionUses(const Function* F) {
 
   // Print the function declarations for any functions encountered
   nl(Out) << "// Function Declarations"; nl(Out);
-  for (SmallPtrSet<GlobalValue*,64>::iterator I = gvs.begin(), E = gvs.end();
-       I != E; ++I) {
-    if (Function* Fun = dyn_cast<Function>(*I)) {
+  for (auto *GV : gvs) {
+    if (Function *Fun = dyn_cast<Function>(GV)) {
       if (!is_inline || Fun != F)
         printFunctionHead(Fun);
     }
@@ -1700,17 +1699,15 @@ void CppWriter::printFunctionUses(const Function* F) {
 
   // Print the global variable declarations for any variables encountered
   nl(Out) << "// Global Variable Declarations"; nl(Out);
-  for (SmallPtrSet<GlobalValue*,64>::iterator I = gvs.begin(), E = gvs.end();
-       I != E; ++I) {
-    if (GlobalVariable* F = dyn_cast<GlobalVariable>(*I))
+  for (auto *GV : gvs) {
+    if (GlobalVariable *F = dyn_cast<GlobalVariable>(GV))
       printVariableHead(F);
   }
 
   // Print the constants found
   nl(Out) << "// Constant Definitions"; nl(Out);
-  for (SmallPtrSet<Constant*,64>::iterator I = consts.begin(),
-         E = consts.end(); I != E; ++I) {
-    printConstant(*I);
+  for (const auto *C : consts) {
+    printConstant(C);
   }
 
   // Process the global variables definitions now that all the constants have
@@ -1718,10 +1715,9 @@ void CppWriter::printFunctionUses(const Function* F) {
   // initializers.
   if (GenerationType != GenFunction) {
     nl(Out) << "// Global Variable Definitions"; nl(Out);
-    for (SmallPtrSet<GlobalValue*,64>::iterator I = gvs.begin(), E = gvs.end();
-         I != E; ++I) {
-      if (GlobalVariable* GV = dyn_cast<GlobalVariable>(*I))
-        printVariableBody(GV);
+    for (const auto &GV : gvs) {
+      if (GlobalVariable *Var = dyn_cast<GlobalVariable>(GV))
+        printVariableBody(Var);
     }
   }
 }
diff --git a/lib/Target/CppBackend/CPPTargetMachine.h b/lib/Target/CppBackend/CPPTargetMachine.h
index 477e788..673ade7 100644
--- a/lib/Target/CppBackend/CPPTargetMachine.h
+++ b/lib/Target/CppBackend/CPPTargetMachine.h
@@ -28,14 +28,12 @@ struct CPPTargetMachine : public TargetMachine {
                    CodeGenOpt::Level OL)
     : TargetMachine(T, TT, CPU, FS, Options) {}
 
-  virtual bool addPassesToEmitFile(PassManagerBase &PM,
-                                   formatted_raw_ostream &Out,
-                                   CodeGenFileType FileType,
-                                   bool DisableVerify,
-                                   AnalysisID StartAfter,
-                                   AnalysisID StopAfter);
-
-  virtual const DataLayout *getDataLayout() const { return 0; }
+  bool addPassesToEmitFile(PassManagerBase &PM, formatted_raw_ostream &Out,
+                           CodeGenFileType FileType, bool DisableVerify,
+                           AnalysisID StartAfter,
+                           AnalysisID StopAfter) override;
+
+  const DataLayout *getDataLayout() const override { return nullptr; }
 };
 
 extern Target TheCppBackendTarget;
diff --git a/lib/Target/Hexagon/Hexagon.td b/lib/Target/Hexagon/Hexagon.td
index c1b6d45..5f4a6c6 100644
--- a/lib/Target/Hexagon/Hexagon.td
+++ b/lib/Target/Hexagon/Hexagon.td
@@ -200,8 +200,6 @@ class Proc<string Name, SchedMachineModel Model,
            list<SubtargetFeature> Features>
  : ProcessorModel<Name, Model, Features>;
 
-def : Proc<"hexagonv2", HexagonModel,   [ArchV2]>;
-def : Proc<"hexagonv3", HexagonModel,   [ArchV2, ArchV3]>;
 def : Proc<"hexagonv4", HexagonModelV4, [ArchV2, ArchV3, ArchV4]>;
 def : Proc<"hexagonv5", HexagonModelV4, [ArchV2, ArchV3, ArchV4, ArchV5]>;
 
diff --git a/lib/Target/Hexagon/HexagonAsmPrinter.cpp b/lib/Target/Hexagon/HexagonAsmPrinter.cpp
index a588274..2e011bd 100644
--- a/lib/Target/Hexagon/HexagonAsmPrinter.cpp
+++ b/lib/Target/Hexagon/HexagonAsmPrinter.cpp
@@ -13,7 +13,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "asm-printer"
 #include "Hexagon.h"
 #include "HexagonAsmPrinter.h"
 #include "HexagonMachineFunctionInfo.h"
@@ -56,6 +55,8 @@
 
 using namespace llvm;
 
+#define DEBUG_TYPE "asm-printer"
+
 static cl::opt<bool> AlignCalls(
          "hexagon-align-calls", cl::Hidden, cl::init(true),
           cl::desc("Insert falign after call instruction for Hexagon target"));
@@ -224,7 +225,7 @@ static MCInstPrinter *createHexagonMCInstPrinter(const Target &T,
   if (SyntaxVariant == 0)
     return(new HexagonInstPrinter(MAI, MII, MRI));
   else
-   return NULL;
+   return nullptr;
 }
 
 extern "C" void LLVMInitializeHexagonAsmPrinter() {
diff --git a/lib/Target/Hexagon/HexagonAsmPrinter.h b/lib/Target/Hexagon/HexagonAsmPrinter.h
index a186dc9..7fe8c57 100644
--- a/lib/Target/Hexagon/HexagonAsmPrinter.h
+++ b/lib/Target/Hexagon/HexagonAsmPrinter.h
@@ -30,21 +30,22 @@ namespace llvm {
       Subtarget = &TM.getSubtarget<HexagonSubtarget>();
     }
 
-    virtual const char *getPassName() const {
+    const char *getPassName() const override {
       return "Hexagon Assembly Printer";
     }
 
-    bool isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB) const;
+    bool isBlockOnlyReachableByFallthrough(
+                                   const MachineBasicBlock *MBB) const override;
 
-    virtual void EmitInstruction(const MachineInstr *MI);
+    void EmitInstruction(const MachineInstr *MI) override;
 
     void printOperand(const MachineInstr *MI, unsigned OpNo, raw_ostream &O);
     bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
                          unsigned AsmVariant, const char *ExtraCode,
-                         raw_ostream &OS);
+                         raw_ostream &OS) override;
     bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
                                unsigned AsmVariant, const char *ExtraCode,
-                               raw_ostream &OS);
+                               raw_ostream &OS) override;
 
     static const char *getRegisterName(unsigned RegNo);
   };
diff --git a/lib/Target/Hexagon/HexagonCFGOptimizer.cpp b/lib/Target/Hexagon/HexagonCFGOptimizer.cpp
index 8597f11..de340e0 100644
--- a/lib/Target/Hexagon/HexagonCFGOptimizer.cpp
+++ b/lib/Target/Hexagon/HexagonCFGOptimizer.cpp
@@ -6,7 +6,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "hexagon_cfg"
 #include "Hexagon.h"
 #include "HexagonMachineFunctionInfo.h"
 #include "HexagonSubtarget.h"
@@ -26,6 +25,8 @@
 
 using namespace llvm;
 
+#define DEBUG_TYPE "hexagon_cfg"
+
 namespace llvm {
   void initializeHexagonCFGOptimizerPass(PassRegistry&);
 }
@@ -48,10 +49,10 @@ private:
     initializeHexagonCFGOptimizerPass(*PassRegistry::getPassRegistry());
   }
 
-  const char *getPassName() const {
+  const char *getPassName() const override {
     return "Hexagon CFG Optimizer";
   }
-  bool runOnMachineFunction(MachineFunction &Fn);
+  bool runOnMachineFunction(MachineFunction &Fn) override;
 };
 
 
@@ -146,8 +147,8 @@ bool HexagonCFGOptimizer::runOnMachineFunction(MachineFunction &Fn) {
         MachineBasicBlock::succ_iterator SI = MBB->succ_begin();
         MachineBasicBlock* FirstSucc = *SI;
         MachineBasicBlock* SecondSucc = *(++SI);
-        MachineBasicBlock* LayoutSucc = NULL;
-        MachineBasicBlock* JumpAroundTarget = NULL;
+        MachineBasicBlock* LayoutSucc = nullptr;
+        MachineBasicBlock* JumpAroundTarget = nullptr;
 
         if (MBB->isLayoutSuccessor(FirstSucc)) {
           LayoutSucc = FirstSucc;
@@ -161,7 +162,7 @@ bool HexagonCFGOptimizer::runOnMachineFunction(MachineFunction &Fn) {
 
         // The target of the unconditional branch must be JumpAroundTarget.
         // TODO: If not, we should not invert the unconditional branch.
-        MachineBasicBlock* CondBranchTarget = NULL;
+        MachineBasicBlock* CondBranchTarget = nullptr;
         if ((MI->getOpcode() == Hexagon::JMP_t) ||
             (MI->getOpcode() == Hexagon::JMP_f)) {
           CondBranchTarget = MI->getOperand(1).getMBB();
@@ -239,7 +240,7 @@ bool HexagonCFGOptimizer::runOnMachineFunction(MachineFunction &Fn) {
 
 static void initializePassOnce(PassRegistry &Registry) {
   PassInfo *PI = new PassInfo("Hexagon CFG Optimizer", "hexagon-cfg",
-                              &HexagonCFGOptimizer::ID, 0, false, false);
+                              &HexagonCFGOptimizer::ID, nullptr, false, false);
   Registry.registerPass(*PI, true);
 }
 
diff --git a/lib/Target/Hexagon/HexagonCopyToCombine.cpp b/lib/Target/Hexagon/HexagonCopyToCombine.cpp
index 60c933b..aeff680 100644
--- a/lib/Target/Hexagon/HexagonCopyToCombine.cpp
+++ b/lib/Target/Hexagon/HexagonCopyToCombine.cpp
@@ -11,8 +11,6 @@
 // to move them together. If we can move them next to each other we do so and
 // replace them with a combine instruction.
 //===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "hexagon-copy-combine"
-
 #include "llvm/PassSupport.h"
 #include "Hexagon.h"
 #include "HexagonInstrInfo.h"
@@ -36,6 +34,8 @@
 
 using namespace llvm;
 
+#define DEBUG_TYPE "hexagon-copy-combine"
+
 static
 cl::opt<bool> IsCombinesDisabled("disable-merge-into-combines",
                                  cl::Hidden, cl::ZeroOrMore,
@@ -68,15 +68,15 @@ public:
     initializeHexagonCopyToCombinePass(*PassRegistry::getPassRegistry());
   }
 
-  virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
     MachineFunctionPass::getAnalysisUsage(AU);
   }
 
-  const char *getPassName() const {
+  const char *getPassName() const override {
     return "Hexagon Copy-To-Combine Pass";
   }
 
-  virtual bool runOnMachineFunction(MachineFunction &Fn);
+  bool runOnMachineFunction(MachineFunction &Fn) override;
 
 private:
   MachineInstr *findPairable(MachineInstr *I1, bool &DoInsertAtI1);
@@ -262,7 +262,7 @@ bool HexagonCopyToCombine::isSafeToMoveTogether(MachineInstr *I1,
     unsigned KilledOperand = 0;
     if (I2->killsRegister(I2UseReg))
       KilledOperand = I2UseReg;
-    MachineInstr *KillingInstr = 0;
+    MachineInstr *KillingInstr = nullptr;
 
     for (; I != End; ++I) {
       // If the intervening instruction I:
@@ -306,7 +306,7 @@ bool HexagonCopyToCombine::isSafeToMoveTogether(MachineInstr *I1,
     // Track killed operands. If we move across an instruction that kills our
     // operand, we need to update the kill information on the moved I1. It kills
     // the operand now.
-    MachineInstr *KillingInstr = 0;
+    MachineInstr *KillingInstr = nullptr;
     unsigned KilledOperand = 0;
 
     while(++I != End) {
@@ -333,7 +333,7 @@ bool HexagonCopyToCombine::isSafeToMoveTogether(MachineInstr *I1,
 
       // Check for an exact kill (registers match).
       if (I1UseReg && I->killsRegister(I1UseReg)) {
-        assert(KillingInstr == 0 && "Should only see one killing instruction");
+        assert(!KillingInstr && "Should only see one killing instruction");
         KilledOperand = I1UseReg;
         KillingInstr = &*I;
       }
@@ -506,7 +506,7 @@ MachineInstr *HexagonCopyToCombine::findPairable(MachineInstr *I1,
     // Not safe. Stop searching.
     break;
   }
-  return 0;
+  return nullptr;
 }
 
 void HexagonCopyToCombine::combine(MachineInstr *I1, MachineInstr *I2,
diff --git a/lib/Target/Hexagon/HexagonExpandPredSpillCode.cpp b/lib/Target/Hexagon/HexagonExpandPredSpillCode.cpp
index 8a5991f..3dafe80 100644
--- a/lib/Target/Hexagon/HexagonExpandPredSpillCode.cpp
+++ b/lib/Target/Hexagon/HexagonExpandPredSpillCode.cpp
@@ -60,10 +60,10 @@ class HexagonExpandPredSpillCode : public MachineFunctionPass {
       initializeHexagonExpandPredSpillCodePass(Registry);
     }
 
-    const char *getPassName() const {
+    const char *getPassName() const override {
       return "Hexagon Expand Predicate Spill Code";
     }
-    bool runOnMachineFunction(MachineFunction &Fn);
+    bool runOnMachineFunction(MachineFunction &Fn) override;
 };
 
 
@@ -187,7 +187,7 @@ static void initializePassOnce(PassRegistry &Registry) {
   const char *Name = "Hexagon Expand Predicate Spill Code";
   PassInfo *PI = new PassInfo(Name, "hexagon-spill-pred",
                               &HexagonExpandPredSpillCode::ID,
-                              0, false, false);
+                              nullptr, false, false);
   Registry.registerPass(*PI, true);
 }
 
diff --git a/lib/Target/Hexagon/HexagonFixupHwLoops.cpp b/lib/Target/Hexagon/HexagonFixupHwLoops.cpp
index a79264b..d41939a 100644
--- a/lib/Target/Hexagon/HexagonFixupHwLoops.cpp
+++ b/lib/Target/Hexagon/HexagonFixupHwLoops.cpp
@@ -40,11 +40,13 @@ namespace {
       initializeHexagonFixupHwLoopsPass(*PassRegistry::getPassRegistry());
     }
 
-    virtual bool runOnMachineFunction(MachineFunction &MF);
+    bool runOnMachineFunction(MachineFunction &MF) override;
 
-    const char *getPassName() const { return "Hexagon Hardware Loop Fixup"; }
+    const char *getPassName() const override {
+      return "Hexagon Hardware Loop Fixup";
+    }
 
-    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
       AU.setPreservesCFG();
       MachineFunctionPass::getAnalysisUsage(AU);
     }
diff --git a/lib/Target/Hexagon/HexagonFrameLowering.cpp b/lib/Target/Hexagon/HexagonFrameLowering.cpp
index 0ea13d4..d551ca9 100644
--- a/lib/Target/Hexagon/HexagonFrameLowering.cpp
+++ b/lib/Target/Hexagon/HexagonFrameLowering.cpp
@@ -246,7 +246,7 @@ HexagonFrameLowering::spillCalleeSavedRegisters(
     //
     unsigned SuperReg = uniqueSuperReg(Reg, TRI);
     bool CanUseDblStore = false;
-    const TargetRegisterClass* SuperRegClass = 0;
+    const TargetRegisterClass* SuperRegClass = nullptr;
 
     if (ContiguousRegs && (i < CSI.size()-1)) {
       unsigned SuperRegNext = uniqueSuperReg(CSI[i+1].getReg(), TRI);
@@ -300,7 +300,7 @@ bool HexagonFrameLowering::restoreCalleeSavedRegisters(
     // Check if we can use a double-word load.
     //
     unsigned SuperReg = uniqueSuperReg(Reg, TRI);
-    const TargetRegisterClass* SuperRegClass = 0;
+    const TargetRegisterClass* SuperRegClass = nullptr;
     bool CanUseDblLoad = false;
     if (ContiguousRegs && (i < CSI.size()-1)) {
       unsigned SuperRegNext = uniqueSuperReg(CSI[i+1].getReg(), TRI);
diff --git a/lib/Target/Hexagon/HexagonFrameLowering.h b/lib/Target/Hexagon/HexagonFrameLowering.h
index a62c76a..446af16 100644
--- a/lib/Target/Hexagon/HexagonFrameLowering.h
+++ b/lib/Target/Hexagon/HexagonFrameLowering.h
@@ -28,25 +28,25 @@ public:
 
   /// emitProlog/emitEpilog - These methods insert prolog and epilog code into
   /// the function.
-  void emitPrologue(MachineFunction &MF) const;
-  void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;
-  virtual bool
-  spillCalleeSavedRegisters(MachineBasicBlock &MBB,
-                            MachineBasicBlock::iterator MI,
-                            const std::vector<CalleeSavedInfo> &CSI,
-                            const TargetRegisterInfo *TRI) const;
-
-  void eliminateCallFramePseudoInstr(MachineFunction &MF,
-                                     MachineBasicBlock &MBB,
-                                     MachineBasicBlock::iterator I) const;
-
-  virtual bool
+  void emitPrologue(MachineFunction &MF) const override;
+  void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+  bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                 MachineBasicBlock::iterator MI,
+                                 const std::vector<CalleeSavedInfo> &CSI,
+                                 const TargetRegisterInfo *TRI) const override;
+
+  void
+  eliminateCallFramePseudoInstr(MachineFunction &MF,
+                                MachineBasicBlock &MBB,
+                                MachineBasicBlock::iterator I) const override;
+
+  bool
   restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator MI,
                               const std::vector<CalleeSavedInfo> &CSI,
-                              const TargetRegisterInfo *TRI) const;
-  int getFrameIndexOffset(const MachineFunction &MF, int FI) const;
-  bool hasFP(const MachineFunction &MF) const;
+                              const TargetRegisterInfo *TRI) const override;
+  int getFrameIndexOffset(const MachineFunction &MF, int FI) const override;
+  bool hasFP(const MachineFunction &MF) const override;
   bool hasTailCall(MachineBasicBlock &MBB) const;
 };
 
diff --git a/lib/Target/Hexagon/HexagonHardwareLoops.cpp b/lib/Target/Hexagon/HexagonHardwareLoops.cpp
index 936fb11..7f76421 100644
--- a/lib/Target/Hexagon/HexagonHardwareLoops.cpp
+++ b/lib/Target/Hexagon/HexagonHardwareLoops.cpp
@@ -26,7 +26,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "hwloops"
 #include "llvm/ADT/SmallSet.h"
 #include "Hexagon.h"
 #include "HexagonTargetMachine.h"
@@ -47,6 +46,8 @@
 
 using namespace llvm;
 
+#define DEBUG_TYPE "hwloops"
+
 #ifndef NDEBUG
 static cl::opt<int> HWLoopLimit("max-hwloop", cl::Hidden, cl::init(-1));
 #endif
@@ -77,11 +78,11 @@ namespace {
       initializeHexagonHardwareLoopsPass(*PassRegistry::getPassRegistry());
     }
 
-    virtual bool runOnMachineFunction(MachineFunction &MF);
+    bool runOnMachineFunction(MachineFunction &MF) override;
 
-    const char *getPassName() const { return "Hexagon Hardware Loops"; }
+    const char *getPassName() const override { return "Hexagon Hardware Loops"; }
 
-    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
       AU.addRequired<MachineDominatorTree>();
       AU.addRequired<MachineLoopInfo>();
       MachineFunctionPass::getAnalysisUsage(AU);
@@ -264,8 +265,8 @@ namespace {
       return Contents.ImmVal;
     }
 
-    void print(raw_ostream &OS, const TargetMachine *TM = 0) const {
-      const TargetRegisterInfo *TRI = TM ? TM->getRegisterInfo() : 0;
+    void print(raw_ostream &OS, const TargetMachine *TM = nullptr) const {
+      const TargetRegisterInfo *TRI = TM ? TM->getRegisterInfo() : nullptr;
       if (isReg()) { OS << PrintReg(Contents.R.Reg, TRI, Contents.R.Sub); }
       if (isImm()) { OS << Contents.ImmVal; }
     }
@@ -369,7 +370,7 @@ bool HexagonHardwareLoops::findInductionRegister(MachineLoop *L,
   }  // for (instr)
 
   SmallVector<MachineOperand,2> Cond;
-  MachineBasicBlock *TB = 0, *FB = 0;
+  MachineBasicBlock *TB = nullptr, *FB = nullptr;
   bool NotAnalyzed = TII->AnalyzeBranch(*Latch, TB, FB, Cond, false);
   if (NotAnalyzed)
     return false;
@@ -434,37 +435,37 @@ CountValue *HexagonHardwareLoops::getLoopTripCount(MachineLoop *L,
          "Loop must have more than one incoming edge!");
   MachineBasicBlock *Backedge = *PI++;
   if (PI == TopMBB->pred_end())  // dead loop?
-    return 0;
+    return nullptr;
   MachineBasicBlock *Incoming = *PI++;
   if (PI != TopMBB->pred_end())  // multiple backedges?
-    return 0;
+    return nullptr;
 
   // Make sure there is one incoming and one backedge and determine which
   // is which.
   if (L->contains(Incoming)) {
     if (L->contains(Backedge))
-      return 0;
+      return nullptr;
     std::swap(Incoming, Backedge);
   } else if (!L->contains(Backedge))
-    return 0;
+    return nullptr;
 
   // Look for the cmp instruction to determine if we can get a useful trip
   // count.  The trip count can be either a register or an immediate.  The
   // location of the value depends upon the type (reg or imm).
   MachineBasicBlock *Latch = L->getLoopLatch();
   if (!Latch)
-    return 0;
+    return nullptr;
 
   unsigned IVReg = 0;
   int64_t IVBump = 0;
   MachineInstr *IVOp;
   bool FoundIV = findInductionRegister(L, IVReg, IVBump, IVOp);
   if (!FoundIV)
-    return 0;
+    return nullptr;
 
   MachineBasicBlock *Preheader = L->getLoopPreheader();
 
-  MachineOperand *InitialValue = 0;
+  MachineOperand *InitialValue = nullptr;
   MachineInstr *IV_Phi = MRI->getVRegDef(IVReg);
   for (unsigned i = 1, n = IV_Phi->getNumOperands(); i < n; i += 2) {
     MachineBasicBlock *MBB = IV_Phi->getOperand(i+1).getMBB();
@@ -474,13 +475,13 @@ CountValue *HexagonHardwareLoops::getLoopTripCount(MachineLoop *L,
       IVReg = IV_Phi->getOperand(i).getReg();  // Want IV reg after bump.
   }
   if (!InitialValue)
-    return 0;
+    return nullptr;
 
   SmallVector<MachineOperand,2> Cond;
-  MachineBasicBlock *TB = 0, *FB = 0;
+  MachineBasicBlock *TB = nullptr, *FB = nullptr;
   bool NotAnalyzed = TII->AnalyzeBranch(*Latch, TB, FB, Cond, false);
   if (NotAnalyzed)
-    return 0;
+    return nullptr;
 
   MachineBasicBlock *Header = L->getHeader();
   // TB must be non-null.  If FB is also non-null, one of them must be
@@ -489,7 +490,7 @@ CountValue *HexagonHardwareLoops::getLoopTripCount(MachineLoop *L,
   assert (TB && "Latch block without a branch?");
   assert ((!FB || TB == Header || FB == Header) && "Branches not to header?");
   if (!TB || (FB && TB != Header && FB != Header))
-    return 0;
+    return nullptr;
 
   // Branches of form "if (!P) ..." cause HexagonInstrInfo::AnalyzeBranch
   // to put imm(0), followed by P in the vector Cond.
@@ -505,7 +506,7 @@ CountValue *HexagonHardwareLoops::getLoopTripCount(MachineLoop *L,
   bool AnalyzedCmp = TII->analyzeCompare(CondI, CmpReg1, CmpReg2,
                                          Mask, ImmValue);
   if (!AnalyzedCmp)
-    return 0;
+    return nullptr;
 
   // The comparison operator type determines how we compute the loop
   // trip count.
@@ -521,7 +522,7 @@ CountValue *HexagonHardwareLoops::getLoopTripCount(MachineLoop *L,
   bool isSwapped = false;
   const MachineOperand &Op1 = CondI->getOperand(1);
   const MachineOperand &Op2 = CondI->getOperand(2);
-  const MachineOperand *EndValue = 0;
+  const MachineOperand *EndValue = nullptr;
 
   if (Op1.isReg()) {
     if (Op2.isImm() || Op1.getReg() == IVReg)
@@ -533,7 +534,7 @@ CountValue *HexagonHardwareLoops::getLoopTripCount(MachineLoop *L,
   }
 
   if (!EndValue)
-    return 0;
+    return nullptr;
 
   switch (CondOpc) {
     case Hexagon::CMPEQri:
@@ -552,7 +553,7 @@ CountValue *HexagonHardwareLoops::getLoopTripCount(MachineLoop *L,
     case Hexagon::CMPbEQri_V4:
     case Hexagon::CMPhEQri_V4: {
       if (IVBump != 1)
-        return 0;
+        return nullptr;
 
       int64_t InitV, EndV;
       // Since the comparisons are "ri", the EndValue should be an
@@ -562,26 +563,26 @@ CountValue *HexagonHardwareLoops::getLoopTripCount(MachineLoop *L,
       // Allow InitialValue to be a register defined with an immediate.
       if (InitialValue->isReg()) {
         if (!defWithImmediate(InitialValue->getReg()))
-          return 0;
+          return nullptr;
         InitV = getImmediate(*InitialValue);
       } else {
         assert(InitialValue->isImm());
         InitV = InitialValue->getImm();
       }
       if (InitV >= EndV)
-        return 0;
+        return nullptr;
       if (CondOpc == Hexagon::CMPbEQri_V4) {
         if (!isInt<8>(InitV) || !isInt<8>(EndV))
-          return 0;
+          return nullptr;
       } else {  // Hexagon::CMPhEQri_V4
         if (!isInt<16>(InitV) || !isInt<16>(EndV))
-          return 0;
+          return nullptr;
       }
       Cmp = !Negated ? Comparison::EQ : Comparison::NE;
       break;
     }
     default:
-      return 0;
+      return nullptr;
   }
 
   if (isSwapped)
@@ -591,14 +592,14 @@ CountValue *HexagonHardwareLoops::getLoopTripCount(MachineLoop *L,
     unsigned R = InitialValue->getReg();
     MachineBasicBlock *DefBB = MRI->getVRegDef(R)->getParent();
     if (!MDT->properlyDominates(DefBB, Header))
-      return 0;
+      return nullptr;
     OldInsts.push_back(MRI->getVRegDef(R));
   }
   if (EndValue->isReg()) {
     unsigned R = EndValue->getReg();
     MachineBasicBlock *DefBB = MRI->getVRegDef(R)->getParent();
     if (!MDT->properlyDominates(DefBB, Header))
-      return 0;
+      return nullptr;
   }
 
   return computeCount(L, InitialValue, EndValue, IVReg, IVBump, Cmp);
@@ -616,7 +617,7 @@ CountValue *HexagonHardwareLoops::computeCount(MachineLoop *Loop,
                                                Comparison::Kind Cmp) const {
   // Cannot handle comparison EQ, i.e. while (A == B).
   if (Cmp == Comparison::EQ)
-    return 0;
+    return nullptr;
 
   // Check if either the start or end values are an assignment of an immediate.
   // If so, use the immediate value rather than the register.
@@ -642,11 +643,11 @@ CountValue *HexagonHardwareLoops::computeCount(MachineLoop *Loop,
   // If loop executes while iv is "less" with the iv value going down, then
   // the iv must wrap.
   if (CmpLess && IVBump < 0)
-    return 0;
+    return nullptr;
   // If loop executes while iv is "greater" with the iv value going up, then
   // the iv must wrap.
   if (CmpGreater && IVBump > 0)
-    return 0;
+    return nullptr;
 
   if (Start->isImm() && End->isImm()) {
     // Both, start and end are immediates.
@@ -654,15 +655,15 @@ CountValue *HexagonHardwareLoops::computeCount(MachineLoop *Loop,
     int64_t EndV = End->getImm();
     int64_t Dist = EndV - StartV;
     if (Dist == 0)
-      return 0;
+      return nullptr;
 
     bool Exact = (Dist % IVBump) == 0;
 
     if (Cmp == Comparison::NE) {
       if (!Exact)
-        return 0;
+        return nullptr;
       if ((Dist < 0) ^ (IVBump < 0))
-        return 0;
+        return nullptr;
     }
 
     // For comparisons that include the final value (i.e. include equality
@@ -683,7 +684,7 @@ CountValue *HexagonHardwareLoops::computeCount(MachineLoop *Loop,
     uint64_t Count = Dist1;
 
     if (Count > 0xFFFFFFFFULL)
-      return 0;
+      return nullptr;
 
     return new CountValue(CountValue::CV_Immediate, Count);
   }
@@ -695,7 +696,7 @@ CountValue *HexagonHardwareLoops::computeCount(MachineLoop *Loop,
   // If the induction variable bump is not a power of 2, quit.
   // Othwerise we'd need a general integer division.
   if (!isPowerOf2_64(abs64(IVBump)))
-    return 0;
+    return nullptr;
 
   MachineBasicBlock *PH = Loop->getLoopPreheader();
   assert (PH && "Should have a preheader by now");
@@ -766,7 +767,7 @@ CountValue *HexagonHardwareLoops::computeCount(MachineLoop *Loop,
   // Hardware loops cannot handle 64-bit registers.  If it's a double
   // register, it has to have a subregister.
   if (!SR && RC == &Hexagon::DoubleRegsRegClass)
-    return 0;
+    return nullptr;
   const TargetRegisterClass *IntRC = &Hexagon::IntRegsRegClass;
 
   // Compute DistR (register with the distance between Start and End).
@@ -1013,7 +1014,7 @@ bool HexagonHardwareLoops::convertToHardwareLoop(MachineLoop *L) {
 
   MachineBasicBlock *LastMBB = L->getExitingBlock();
   // Don't generate hw loop if the loop has more than one exit.
-  if (LastMBB == 0)
+  if (!LastMBB)
     return false;
 
   MachineBasicBlock::iterator LastI = LastMBB->getFirstTerminator();
@@ -1035,7 +1036,7 @@ bool HexagonHardwareLoops::convertToHardwareLoop(MachineLoop *L) {
   SmallVector<MachineInstr*, 2> OldInsts;
   // Are we able to determine the trip count for the loop?
   CountValue *TripCount = getLoopTripCount(L, OldInsts);
-  if (TripCount == 0)
+  if (!TripCount)
     return false;
 
   // Is the trip count available in the preheader?
@@ -1127,7 +1128,7 @@ bool HexagonHardwareLoops::convertToHardwareLoop(MachineLoop *L) {
       if (LastI != LastMBB->end())
         LastI = LastMBB->erase(LastI);
       SmallVector<MachineOperand, 0> Cond;
-      TII->InsertBranch(*LastMBB, BranchTarget, 0, Cond, LastIDL);
+      TII->InsertBranch(*LastMBB, BranchTarget, nullptr, Cond, LastIDL);
     }
   } else {
     // Conditional branch to loop start; just delete it.
@@ -1196,7 +1197,7 @@ MachineInstr *HexagonHardwareLoops::defWithImmediate(unsigned R) {
     case Hexagon::CONST64_Int_Real:
       return DI;
   }
-  return 0;
+  return nullptr;
 }
 
 
@@ -1291,7 +1292,7 @@ bool HexagonHardwareLoops::fixupInductionVariable(MachineLoop *L) {
   if (IndRegs.empty())
     return false;
 
-  MachineBasicBlock *TB = 0, *FB = 0;
+  MachineBasicBlock *TB = nullptr, *FB = nullptr;
   SmallVector<MachineOperand,2> Cond;
   // AnalyzeBranch returns true if it fails to analyze branch.
   bool NotAnalyzed = TII->AnalyzeBranch(*Latch, TB, FB, Cond, false);
@@ -1322,7 +1323,7 @@ bool HexagonHardwareLoops::fixupInductionVariable(MachineLoop *L) {
     return false;
 
   SmallSet<unsigned,2> CmpRegs;
-  MachineOperand *CmpImmOp = 0;
+  MachineOperand *CmpImmOp = nullptr;
 
   // Go over all operands to the compare and look for immediate and register
   // operands.  Assume that if the compare has a single register use and a
@@ -1420,7 +1421,7 @@ MachineBasicBlock *HexagonHardwareLoops::createPreheaderForLoop(
   DebugLoc DL;
 
   if (!Latch || Header->hasAddressTaken())
-    return 0;
+    return nullptr;
 
   typedef MachineBasicBlock::instr_iterator instr_iterator;
 
@@ -1429,17 +1430,17 @@ MachineBasicBlock *HexagonHardwareLoops::createPreheaderForLoop(
   typedef std::vector<MachineBasicBlock*> MBBVector;
   MBBVector Preds(Header->pred_begin(), Header->pred_end());
   SmallVector<MachineOperand,2> Tmp1;
-  MachineBasicBlock *TB = 0, *FB = 0;
+  MachineBasicBlock *TB = nullptr, *FB = nullptr;
 
   if (TII->AnalyzeBranch(*Latch, TB, FB, Tmp1, false))
-    return 0;
+    return nullptr;
 
   for (MBBVector::iterator I = Preds.begin(), E = Preds.end(); I != E; ++I) {
     MachineBasicBlock *PB = *I;
     if (PB != Latch) {
       bool NotAnalyzed = TII->AnalyzeBranch(*PB, TB, FB, Tmp1, false);
       if (NotAnalyzed)
-        return 0;
+        return nullptr;
     }
   }
 
@@ -1515,7 +1516,7 @@ MachineBasicBlock *HexagonHardwareLoops::createPreheaderForLoop(
   SmallVector<MachineOperand,1> Tmp2;
   SmallVector<MachineOperand,1> EmptyCond;
 
-  TB = FB = 0;
+  TB = FB = nullptr;
 
   for (MBBVector::iterator I = Preds.begin(), E = Preds.end(); I != E; ++I) {
     MachineBasicBlock *PB = *I;
@@ -1525,22 +1526,22 @@ MachineBasicBlock *HexagonHardwareLoops::createPreheaderForLoop(
       (void)NotAnalyzed; // suppress compiler warning
       assert (!NotAnalyzed && "Should be analyzable!");
       if (TB != Header && (Tmp2.empty() || FB != Header))
-        TII->InsertBranch(*PB, NewPH, 0, EmptyCond, DL);
+        TII->InsertBranch(*PB, NewPH, nullptr, EmptyCond, DL);
       PB->ReplaceUsesOfBlockWith(Header, NewPH);
     }
   }
 
   // It can happen that the latch block will fall through into the header.
   // Insert an unconditional branch to the header.
-  TB = FB = 0;
+  TB = FB = nullptr;
   bool LatchNotAnalyzed = TII->AnalyzeBranch(*Latch, TB, FB, Tmp2, false);
   (void)LatchNotAnalyzed; // suppress compiler warning
   assert (!LatchNotAnalyzed && "Should be analyzable!");
   if (!TB && !FB)
-    TII->InsertBranch(*Latch, Header, 0, EmptyCond, DL);
+    TII->InsertBranch(*Latch, Header, nullptr, EmptyCond, DL);
 
   // Finally, the branch from the preheader to the header.
-  TII->InsertBranch(*NewPH, Header, 0, EmptyCond, DL);
+  TII->InsertBranch(*NewPH, Header, nullptr, EmptyCond, DL);
   NewPH->addSuccessor(Header);
 
   return NewPH;
diff --git a/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp b/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
index ed8c786..dabe650 100644
--- a/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
+++ b/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
@@ -11,7 +11,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "hexagon-isel"
 #include "Hexagon.h"
 #include "HexagonISelLowering.h"
 #include "HexagonTargetMachine.h"
@@ -23,6 +22,8 @@
 #include "llvm/Support/Debug.h"
 using namespace llvm;
 
+#define DEBUG_TYPE "hexagon-isel"
+
 static
 cl::opt<unsigned>
 MaxNumOfUsesForConstExtenders("ga-max-num-uses-for-constant-extenders",
@@ -61,7 +62,7 @@ public:
   }
   bool hasNumUsesBelowThresGA(SDNode *N) const;
 
-  SDNode *Select(SDNode *N);
+  SDNode *Select(SDNode *N) override;
 
   // Complex Pattern Selectors.
   inline bool foldGlobalAddress(SDValue &N, SDValue &R);
@@ -78,15 +79,15 @@ public:
   bool SelectADDRriU6_1(SDValue& N, SDValue &R1, SDValue &R2);
   bool SelectADDRriU6_2(SDValue& N, SDValue &R1, SDValue &R2);
 
-  virtual const char *getPassName() const {
+  const char *getPassName() const override {
     return "Hexagon DAG->DAG Pattern Instruction Selection";
   }
 
   /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
   /// inline asm expressions.
-  virtual bool SelectInlineAsmMemoryOperand(const SDValue &Op,
-                                            char ConstraintCode,
-                                            std::vector<SDValue> &OutOps);
+  bool SelectInlineAsmMemoryOperand(const SDValue &Op,
+                                    char ConstraintCode,
+                                    std::vector<SDValue> &OutOps) override;
   bool SelectAddr(SDNode *Op, SDValue Addr, SDValue &Base, SDValue &Offset);
 
   SDNode *SelectLoad(SDNode *N);
@@ -186,7 +187,7 @@ FunctionPass *llvm::createHexagonISelDag(HexagonTargetMachine &TM,
 static void initializePassOnce(PassRegistry &Registry) {
   const char *Name = "Hexagon DAG->DAG Pattern Instruction Selection";
   PassInfo *PI = new PassInfo(Name, "hexagon-isel",
-                              &SelectionDAGISel::ID, 0, false, false);
+                              &SelectionDAGISel::ID, nullptr, false, false);
   Registry.registerPass(*PI, true);
 }
 
@@ -1238,7 +1239,7 @@ SDNode *HexagonDAGToDAGISel::SelectIntrinsicWOChain(SDNode *N) {
         SDNode *PdRs = CurDAG->getMachineNode(Hexagon::TFR_PdRs, dl, MVT::i1,
                                               SDValue(Arg, 0));
         Ops.push_back(SDValue(PdRs,0));
-      } else if (RC == NULL && (dyn_cast<ConstantSDNode>(Arg) != NULL)) {
+      } else if (!RC && (dyn_cast<ConstantSDNode>(Arg) != nullptr)) {
         // This is immediate operand. Lower it here making sure that we DO have
         // const SDNode for immediate value.
         int32_t Val = cast<ConstantSDNode>(Arg)->getSExtValue();
@@ -1346,7 +1347,7 @@ SDNode *HexagonDAGToDAGISel::SelectAdd(SDNode *N) {
 SDNode *HexagonDAGToDAGISel::Select(SDNode *N) {
   if (N->isMachineOpcode()) {
     N->setNodeId(-1);
-    return NULL;   // Already selected.
+    return nullptr;   // Already selected.
   }
 
 
diff --git a/lib/Target/Hexagon/HexagonISelLowering.cpp b/lib/Target/Hexagon/HexagonISelLowering.cpp
index 92b794d..b8e5d24 100644
--- a/lib/Target/Hexagon/HexagonISelLowering.cpp
+++ b/lib/Target/Hexagon/HexagonISelLowering.cpp
@@ -39,6 +39,8 @@
 
 using namespace llvm;
 
+#define DEBUG_TYPE "hexagon-lowering"
+
 static cl::opt<bool>
 EmitJumpTables("hexagon-emit-jump-tables", cl::init(true), cl::Hidden,
                cl::desc("Control jump table emission on Hexagon target"));
@@ -135,7 +137,7 @@ CC_Hexagon_VarArg (unsigned ValNo, MVT ValVT,
     State.addLoc(CCValAssign::getMem(ValNo, ValVT, ofst, LocVT, LocInfo));
     return false;
   }
-  llvm_unreachable(0);
+  llvm_unreachable(nullptr);
 }
 
 
@@ -182,7 +184,7 @@ static bool CC_Hexagon32(unsigned ValNo, MVT ValVT,
                          MVT LocVT, CCValAssign::LocInfo LocInfo,
                          ISD::ArgFlagsTy ArgFlags, CCState &State) {
 
-  static const uint16_t RegList[] = {
+  static const MCPhysReg RegList[] = {
     Hexagon::R0, Hexagon::R1, Hexagon::R2, Hexagon::R3, Hexagon::R4,
     Hexagon::R5
   };
@@ -205,10 +207,10 @@ static bool CC_Hexagon64(unsigned ValNo, MVT ValVT,
     return false;
   }
 
-  static const uint16_t RegList1[] = {
+  static const MCPhysReg RegList1[] = {
     Hexagon::D1, Hexagon::D2
   };
-  static const uint16_t RegList2[] = {
+  static const MCPhysReg RegList2[] = {
     Hexagon::R1, Hexagon::R3
   };
   if (unsigned Reg = State.AllocateReg(RegList1, RegList2, 2)) {
@@ -346,8 +348,7 @@ HexagonTargetLowering::LowerReturn(SDValue Chain,
   if (Flag.getNode())
     RetOps.push_back(Flag);
 
-  return DAG.getNode(HexagonISD::RET_FLAG, dl, MVT::Other,
-                     &RetOps[0], RetOps.size());
+  return DAG.getNode(HexagonISD::RET_FLAG, dl, MVT::Other, RetOps);
 }
 
 
@@ -410,7 +411,7 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   int NumNamedVarArgParams = -1;
   if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Callee))
   {
-    const Function* CalleeFn = NULL;
+    const Function* CalleeFn = nullptr;
     Callee = DAG.getTargetGlobalAddress(GA->getGlobal(), dl, MVT::i32);
     if ((CalleeFn = dyn_cast<Function>(GA->getGlobal())))
     {
@@ -520,8 +521,7 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   // Transform all store nodes into one single node because all store
   // nodes are independent of each other.
   if (!MemOpChains.empty()) {
-    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &MemOpChains[0],
-                        MemOpChains.size());
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
   }
 
   if (!isTailCall)
@@ -595,9 +595,9 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   }
 
   if (isTailCall)
-    return DAG.getNode(HexagonISD::TC_RETURN, dl, NodeTys, &Ops[0], Ops.size());
+    return DAG.getNode(HexagonISD::TC_RETURN, dl, NodeTys, Ops);
 
-  Chain = DAG.getNode(HexagonISD::CALL, dl, NodeTys, &Ops[0], Ops.size());
+  Chain = DAG.getNode(HexagonISD::CALL, dl, NodeTys, Ops);
   InFlag = Chain.getValue(1);
 
   // Create the CALLSEQ_END node.
@@ -817,7 +817,7 @@ HexagonTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
                                        Sub);
 
   SDValue Ops[2] = { ArgAdjust, CopyChain };
-  return DAG.getMergeValues(Ops, 2, dl);
+  return DAG.getMergeValues(Ops, dl);
 }
 
 SDValue
@@ -916,8 +916,7 @@ const {
   }
 
   if (!MemOps.empty())
-    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &MemOps[0],
-                        MemOps.size());
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
 
   if (isVarArg) {
     // This will point to the next argument passed via stack.
@@ -1480,7 +1479,7 @@ HexagonTargetLowering::HexagonTargetLowering(HexagonTargetMachine
 const char*
 HexagonTargetLowering::getTargetNodeName(unsigned Opcode) const {
   switch (Opcode) {
-    default: return 0;
+    default: return nullptr;
     case HexagonISD::CONST32:     return "HexagonISD::CONST32";
     case HexagonISD::CONST32_GP: return "HexagonISD::CONST32_GP";
     case HexagonISD::CONST32_Int_Real: return "HexagonISD::CONST32_Int_Real";
diff --git a/lib/Target/Hexagon/HexagonISelLowering.h b/lib/Target/Hexagon/HexagonISelLowering.h
index 73da226..4f27c27 100644
--- a/lib/Target/Hexagon/HexagonISelLowering.h
+++ b/lib/Target/Hexagon/HexagonISelLowering.h
@@ -92,14 +92,14 @@ namespace llvm {
                                       const SmallVectorImpl<ISD::InputArg> &Ins,
                                       SelectionDAG& DAG) const;
 
-    virtual bool isTruncateFree(Type *Ty1, Type *Ty2) const;
-    virtual bool isTruncateFree(EVT VT1, EVT VT2) const;
+    bool isTruncateFree(Type *Ty1, Type *Ty2) const override;
+    bool isTruncateFree(EVT VT1, EVT VT2) const override;
 
-    virtual bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const;
+    bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const override;
 
-    virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
 
-    virtual const char *getTargetNodeName(unsigned Opcode) const;
+    const char *getTargetNodeName(unsigned Opcode) const override;
     SDValue  LowerBR_JT(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerINLINEASM(SDValue Op, SelectionDAG &DAG) const;
@@ -109,12 +109,12 @@ namespace llvm {
                                  CallingConv::ID CallConv, bool isVarArg,
                                  const SmallVectorImpl<ISD::InputArg> &Ins,
                                  SDLoc dl, SelectionDAG &DAG,
-                                 SmallVectorImpl<SDValue> &InVals) const;
+                                 SmallVectorImpl<SDValue> &InVals) const override;
     SDValue LowerGLOBALADDRESS(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
 
     SDValue LowerCall(TargetLowering::CallLoweringInfo &CLI,
-                      SmallVectorImpl<SDValue> &InVals) const;
+                      SmallVectorImpl<SDValue> &InVals) const override;
 
     SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
                             CallingConv::ID CallConv, bool isVarArg,
@@ -133,46 +133,45 @@ namespace llvm {
                         CallingConv::ID CallConv, bool isVarArg,
                         const SmallVectorImpl<ISD::OutputArg> &Outs,
                         const SmallVectorImpl<SDValue> &OutVals,
-                        SDLoc dl, SelectionDAG &DAG) const;
+                        SDLoc dl, SelectionDAG &DAG) const override;
 
-    virtual MachineBasicBlock
-    *EmitInstrWithCustomInserter(MachineInstr *MI,
-                                 MachineBasicBlock *BB) const;
+    MachineBasicBlock *
+    EmitInstrWithCustomInserter(MachineInstr *MI,
+                                MachineBasicBlock *BB) const override;
 
     SDValue  LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
     SDValue  LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
-    virtual EVT getSetCCResultType(LLVMContext &C, EVT VT) const {
+    EVT getSetCCResultType(LLVMContext &C, EVT VT) const override {
       if (!VT.isVector())
         return MVT::i1;
       else
         return EVT::getVectorVT(C, MVT::i1, VT.getVectorNumElements());
     }
 
-    virtual bool getPostIndexedAddressParts(SDNode *N, SDNode *Op,
-                                            SDValue &Base, SDValue &Offset,
-                                            ISD::MemIndexedMode &AM,
-                                            SelectionDAG &DAG) const;
+    bool getPostIndexedAddressParts(SDNode *N, SDNode *Op,
+                                    SDValue &Base, SDValue &Offset,
+                                    ISD::MemIndexedMode &AM,
+                                    SelectionDAG &DAG) const override;
 
     std::pair<unsigned, const TargetRegisterClass*>
     getRegForInlineAsmConstraint(const std::string &Constraint,
-                                 MVT VT) const;
+                                 MVT VT) const override;
 
     // Intrinsics
-    virtual SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op,
-                                            SelectionDAG &DAG) const;
+    SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
     /// isLegalAddressingMode - Return true if the addressing mode represented
     /// by AM is legal for this target, for a load/store of the specified type.
     /// The type may be VoidTy, in which case only return true if the addressing
     /// mode is legal for a load/store of any legal type.
     /// TODO: Handle pre/postinc as well.
-    virtual bool isLegalAddressingMode(const AddrMode &AM, Type *Ty) const;
-    virtual bool isFPImmLegal(const APFloat &Imm, EVT VT) const;
+    bool isLegalAddressingMode(const AddrMode &AM, Type *Ty) const override;
+    bool isFPImmLegal(const APFloat &Imm, EVT VT) const override;
 
     /// isLegalICmpImmediate - Return true if the specified immediate is legal
     /// icmp immediate, that is the target has icmp instructions which can
     /// compare a register against the immediate without having to materialize
     /// the immediate into a register.
-    virtual bool isLegalICmpImmediate(int64_t Imm) const;
+    bool isLegalICmpImmediate(int64_t Imm) const override;
   };
 } // end namespace llvm
 
diff --git a/lib/Target/Hexagon/HexagonInstrFormats.td b/lib/Target/Hexagon/HexagonInstrFormats.td
index d25bfa8..1057343 100644
--- a/lib/Target/Hexagon/HexagonInstrFormats.td
+++ b/lib/Target/Hexagon/HexagonInstrFormats.td
@@ -8,7 +8,7 @@
 //===----------------------------------------------------------------------===//
 
 //===----------------------------------------------------------------------===//
-//                         Hexagon Intruction Flags +
+//                         Hexagon Instruction Flags +
 //
 //                    *** Must match HexagonBaseInfo.h ***
 //===----------------------------------------------------------------------===//
@@ -68,7 +68,7 @@ def DoubleWordAccess : MemAccessSize<4>;// Double word access instruction (memd)
 
 
 //===----------------------------------------------------------------------===//
-//                         Intruction Class Declaration +
+//                         Instruction Class Declaration +
 //===----------------------------------------------------------------------===//
 
 class OpcodeHexagon {
@@ -104,54 +104,72 @@ class InstHexagon<dag outs, dag ins, string asmstr, list<dag> pattern,
   // Solo instructions, i.e., those that cannot be in a packet with others.
   bits<1> isSolo = 0;
   let TSFlags{5} = isSolo;
+  // Packed only with A or X-type instructions.
+  bits<1> isSoloAX = 0;
+  let TSFlags{6} = isSoloAX;
+  // Only A-type instruction in first slot or nothing.
+  bits<1> isSoloAin1 = 0;
+  let TSFlags{7} = isSoloAin1;
 
   // Predicated instructions.
   bits<1> isPredicated = 0;
-  let TSFlags{6} = isPredicated;
+  let TSFlags{8} = isPredicated;
   bits<1> isPredicatedFalse = 0;
-  let TSFlags{7} = isPredicatedFalse;
+  let TSFlags{9} = isPredicatedFalse;
   bits<1> isPredicatedNew = 0;
-  let TSFlags{8} = isPredicatedNew;
+  let TSFlags{10} = isPredicatedNew;
+  bits<1> isPredicateLate = 0;
+  let TSFlags{11} = isPredicateLate; // Late predicate producer insn.
 
   // New-value insn helper fields.
   bits<1> isNewValue = 0;
-  let TSFlags{9} = isNewValue; // New-value consumer insn.
+  let TSFlags{12} = isNewValue; // New-value consumer insn.
   bits<1> hasNewValue = 0;
-  let TSFlags{10} = hasNewValue; // New-value producer insn.
+  let TSFlags{13} = hasNewValue; // New-value producer insn.
   bits<3> opNewValue = 0;
-  let TSFlags{13-11} = opNewValue; // New-value produced operand.
-  bits<2> opNewBits = 0;
-  let TSFlags{15-14} = opNewBits; // New-value opcode bits location: 0, 8, 16.
+  let TSFlags{16-14} = opNewValue; // New-value produced operand.
   bits<1> isNVStorable = 0;
-  let TSFlags{16} = isNVStorable; // Store that can become new-value store.
+  let TSFlags{17} = isNVStorable; // Store that can become new-value store.
   bits<1> isNVStore = 0;
-  let TSFlags{17} = isNVStore; // New-value store insn.
+  let TSFlags{18} = isNVStore; // New-value store insn.
+  bits<1> isCVLoadable = 0;
+  let TSFlags{19} = isCVLoadable; // Load that can become cur-value load.
+  bits<1> isCVLoad = 0;
+  let TSFlags{20} = isCVLoad; // Cur-value load insn.
 
   // Immediate extender helper fields.
   bits<1> isExtendable = 0;
-  let TSFlags{18} = isExtendable; // Insn may be extended.
+  let TSFlags{21} = isExtendable; // Insn may be extended.
   bits<1> isExtended = 0;
-  let TSFlags{19} = isExtended; // Insn must be extended.
+  let TSFlags{22} = isExtended; // Insn must be extended.
   bits<3> opExtendable = 0;
-  let TSFlags{22-20} = opExtendable; // Which operand may be extended.
+  let TSFlags{25-23} = opExtendable; // Which operand may be extended.
   bits<1> isExtentSigned = 0;
-  let TSFlags{23} = isExtentSigned; // Signed or unsigned range.
+  let TSFlags{26} = isExtentSigned; // Signed or unsigned range.
   bits<5> opExtentBits = 0;
-  let TSFlags{28-24} = opExtentBits; //Number of bits of range before extending.
+  let TSFlags{31-27} = opExtentBits; //Number of bits of range before extending.
+  bits<2> opExtentAlign = 0;
+  let TSFlags{33-32} = opExtentAlign; // Alignment exponent before extending.
 
   // If an instruction is valid on a subtarget (v2-v5), set the corresponding
   // bit from validSubTargets. v2 is the least significant bit.
   // By default, instruction is valid on all subtargets.
   SubTarget validSubTargets = HasV2SubT;
-  let TSFlags{32-29} = validSubTargets.Value;
+  let TSFlags{37-34} = validSubTargets.Value;
 
   // Addressing mode for load/store instructions.
   AddrModeType addrMode = NoAddrMode;
-  let TSFlags{35-33} = addrMode.Value;
+  let TSFlags{42-40} = addrMode.Value;
 
   // Memory access size for mem access instructions (load/store)
   MemAccessSize accessSize = NoMemAccess;
-  let TSFlags{38-36} = accessSize.Value;
+  let TSFlags{45-43} = accessSize.Value;
+
+  bits<1> isTaken = 0;
+  let TSFlags {47} = isTaken; // Branch prediction.
+
+  bits<1> isFP = 0;
+  let TSFlags {48} = isFP; // Floating-point.
 
   // Fields used for relation models.
   string BaseOpcode = "";
@@ -173,14 +191,14 @@ class InstHexagon<dag outs, dag ins, string asmstr, list<dag> pattern,
 }
 
 //===----------------------------------------------------------------------===//
-//                         Intruction Classes Definitions +
+//                         Instruction Classes Definitions +
 //===----------------------------------------------------------------------===//
 
 // LD Instruction Class in V2/V3/V4.
 // Definition of the instruction class NOT CHANGED.
 class LDInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
-             string cstr = "">
-  : InstHexagon<outs, ins, asmstr, pattern, cstr, LD, TypeLD>;
+             string cstr = "", InstrItinClass itin = LD_tc_ld_SLOT01>
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeLD>;
 
 let mayLoad = 1 in
 class LDInst2<dag outs, dag ins, string asmstr, list<dag> pattern = [],
@@ -199,16 +217,16 @@ class LDInstPost<dag outs, dag ins, string asmstr, list<dag> pattern = [],
 
 let mayLoad = 1 in
 class LD0Inst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
-              string cstr = "">
-  : LDInst<outs, ins, asmstr, pattern, cstr>;
+              string cstr = "", InstrItinClass itin=LD_tc_ld_SLOT0>
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeLD>;
 
 // ST Instruction Class in V2/V3 can take SLOT0 only.
 // ST Instruction Class in V4    can take SLOT0 & SLOT1.
 // Definition of the instruction class CHANGED from V2/V3 to V4.
 let mayStore = 1 in
 class STInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
-             string cstr = "">
-  : InstHexagon<outs, ins, asmstr, pattern, cstr, ST, TypeST>;
+             string cstr = "", InstrItinClass itin = ST_tc_st_SLOT01>
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeST>;
 
 class STInst2<dag outs, dag ins, string asmstr, list<dag> pattern = [],
               string cstr = "">
@@ -216,39 +234,39 @@ class STInst2<dag outs, dag ins, string asmstr, list<dag> pattern = [],
 
 let mayStore = 1 in
 class ST0Inst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
-              string cstr = "">
-  : InstHexagon<outs, ins, asmstr, pattern, cstr, ST0, TypeST>;
+              string cstr = "", InstrItinClass itin = ST_tc_ld_SLOT0>
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeST>;
 
 // ST Instruction Class in V2/V3 can take SLOT0 only.
 // ST Instruction Class in V4    can take SLOT0 & SLOT1.
 // Definition of the instruction class CHANGED from V2/V3 to V4.
 class STInstPost<dag outs, dag ins, string asmstr, list<dag> pattern = [],
-                 string cstr = "">
-  : STInst<outs, ins, asmstr, pattern, cstr>;
+                 string cstr = "", InstrItinClass itin = ST_tc_st_SLOT01>
+  : STInst<outs, ins, asmstr, pattern, cstr, itin>;
 
 // SYSTEM Instruction Class in V4 can take SLOT0 only
 // In V2/V3 we used ST for this but in v4 ST can take SLOT0 or SLOT1.
 class SYSInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
-              string cstr = "">
-  : InstHexagon<outs, ins, asmstr, pattern, cstr, SYS, TypeSYSTEM>;
+              string cstr = "",  InstrItinClass itin = ST_tc_3stall_SLOT0>
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeSYSTEM>;
 
 // ALU32 Instruction Class in V2/V3/V4.
 // Definition of the instruction class NOT CHANGED.
 class ALU32Inst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
-                string cstr = "">
-   : InstHexagon<outs, ins, asmstr, pattern, cstr, ALU32, TypeALU32>;
+                string cstr = "", InstrItinClass itin = ALU32_2op_tc_1_SLOT0123>
+ : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeALU32>;
 
 // ALU64 Instruction Class in V2/V3.
 // XTYPE Instruction Class in V4.
 // Definition of the instruction class NOT CHANGED.
 // Name of the Instruction Class changed from ALU64 to XTYPE from V2/V3 to V4.
 class ALU64Inst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
-                string cstr = "">
-   : InstHexagon<outs, ins, asmstr, pattern, cstr, ALU64, TypeXTYPE>;
+                string cstr = "", InstrItinClass itin = ALU64_tc_2_SLOT23>
+   : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeXTYPE>;
 
 class ALU64_acc<dag outs, dag ins, string asmstr, list<dag> pattern = [],
-                string cstr = "">
-  : ALU64Inst<outs, ins, asmstr, pattern, cstr>;
+                string cstr = "", InstrItinClass itin = ALU64_tc_2_SLOT23>
+  : ALU64Inst<outs, ins, asmstr, pattern, cstr, itin>;
 
 
 // M Instruction Class in V2/V3.
@@ -256,55 +274,55 @@ class ALU64_acc<dag outs, dag ins, string asmstr, list<dag> pattern = [],
 // Definition of the instruction class NOT CHANGED.
 // Name of the Instruction Class changed from M to XTYPE from V2/V3 to V4.
 class MInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
-            string cstr = "">
-  : InstHexagon<outs, ins, asmstr, pattern, cstr, M, TypeXTYPE>;
+            string cstr = "", InstrItinClass itin = M_tc_3x_SLOT23>
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeXTYPE>;
 
 // M Instruction Class in V2/V3.
 // XTYPE Instruction Class in V4.
 // Definition of the instruction class NOT CHANGED.
 // Name of the Instruction Class changed from M to XTYPE from V2/V3 to V4.
 class MInst_acc<dag outs, dag ins, string asmstr, list<dag> pattern = [],
-                string cstr = "">
-    : MInst<outs, ins, asmstr, pattern, cstr>;
+                string cstr = "", InstrItinClass itin = M_tc_2_SLOT23>
+    : MInst<outs, ins, asmstr, pattern, cstr, itin>;
 
 // S Instruction Class in V2/V3.
 // XTYPE Instruction Class in V4.
 // Definition of the instruction class NOT CHANGED.
 // Name of the Instruction Class changed from S to XTYPE from V2/V3 to V4.
 class SInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
-            string cstr = "">
-  : InstHexagon<outs, ins, asmstr, pattern, cstr, S, TypeXTYPE>;
+            string cstr = "", InstrItinClass itin = S_2op_tc_1_SLOT23>
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeXTYPE>;
 
 // S Instruction Class in V2/V3.
 // XTYPE Instruction Class in V4.
 // Definition of the instruction class NOT CHANGED.
 // Name of the Instruction Class changed from S to XTYPE from V2/V3 to V4.
 class SInst_acc<dag outs, dag ins, string asmstr, list<dag> pattern = [],
-                string cstr = "">
-  : SInst<outs, ins, asmstr, pattern, cstr>;
+                string cstr = "", InstrItinClass itin = S_3op_tc_1_SLOT23>
+  : SInst<outs, ins, asmstr, pattern, cstr, itin>;
 
 // J Instruction Class in V2/V3/V4.
 // Definition of the instruction class NOT CHANGED.
 class JInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
-            string cstr = "">
-  : InstHexagon<outs, ins, asmstr, pattern, cstr, J, TypeJ>;
+            string cstr = "", InstrItinClass itin = J_tc_2early_SLOT23>
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeJ>;
 
 // JR Instruction Class in V2/V3/V4.
 // Definition of the instruction class NOT CHANGED.
 class JRInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
-             string cstr = "">
-  : InstHexagon<outs, ins, asmstr, pattern, cstr, JR, TypeJR>;
+             string cstr = "", InstrItinClass itin = J_tc_2early_SLOT2>
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeJR>;
 
 // CR Instruction Class in V2/V3/V4.
 // Definition of the instruction class NOT CHANGED.
 class CRInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
-             string cstr = "">
-  : InstHexagon<outs, ins, asmstr, pattern, cstr, CR, TypeCR>;
+             string cstr = "", InstrItinClass itin = CR_tc_2early_SLOT3>
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeCR>;
 
 let isCodeGenOnly = 1, isPseudo = 1 in
 class Endloop<dag outs, dag ins, string asmstr, list<dag> pattern = [],
-              string cstr = "">
-  : InstHexagon<outs, ins, asmstr, pattern, cstr, ENDLOOP, TypeENDLOOP>;
+              string cstr = "", InstrItinClass itin = J_tc_2early_SLOT0123>
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeENDLOOP>;
 
 let isCodeGenOnly = 1, isPseudo = 1 in
 class Pseudo<dag outs, dag ins, string asmstr, list<dag> pattern = [],
@@ -317,39 +335,40 @@ class PseudoM<dag outs, dag ins, string asmstr, list<dag> pattern = [],
   : InstHexagon<outs, ins, asmstr, pattern, cstr, PSEUDOM, TypePSEUDO>;
 
 //===----------------------------------------------------------------------===//
-//                         Intruction Classes Definitions -
+//                         Instruction Classes Definitions -
 //===----------------------------------------------------------------------===//
 
 
 //
 // ALU32 patterns
 //.
-class ALU32_rr<dag outs, dag ins, string asmstr, list<dag> pattern,
-               string cstr = "">
-   : ALU32Inst<outs, ins, asmstr, pattern, cstr>;
+class ALU32_rr<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+               string cstr = "", InstrItinClass itin = ALU32_2op_tc_1_SLOT0123>
+   : ALU32Inst<outs, ins, asmstr, pattern, cstr, itin>;
 
-class ALU32_ir<dag outs, dag ins, string asmstr, list<dag> pattern,
-               string cstr = "">
-   : ALU32Inst<outs, ins, asmstr, pattern, cstr>;
+class ALU32_ir<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+               string cstr = "", InstrItinClass itin = ALU32_2op_tc_1_SLOT0123>
+   : ALU32Inst<outs, ins, asmstr, pattern, cstr, itin>;
 
-class ALU32_ri<dag outs, dag ins, string asmstr, list<dag> pattern,
-               string cstr = "">
-   : ALU32Inst<outs, ins, asmstr, pattern, cstr>;
+class ALU32_ri<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+               string cstr = "", InstrItinClass itin = ALU32_2op_tc_1_SLOT0123>
+   : ALU32Inst<outs, ins, asmstr, pattern, cstr, itin>;
+
+class ALU32_ii<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+               string cstr = "", InstrItinClass itin = ALU32_2op_tc_1_SLOT0123>
+   : ALU32Inst<outs, ins, asmstr, pattern, cstr, itin>;
 
-class ALU32_ii<dag outs, dag ins, string asmstr, list<dag> pattern,
-               string cstr = "">
-   : ALU32Inst<outs, ins, asmstr, pattern, cstr>;
 
 //
 // ALU64 patterns.
 //
-class ALU64_rr<dag outs, dag ins, string asmstr, list<dag> pattern,
-               string cstr = "">
-   : ALU64Inst<outs, ins, asmstr, pattern, cstr>;
+class ALU64_rr<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+               string cstr = "", InstrItinClass itin = ALU64_tc_1_SLOT23>
+   : ALU64Inst<outs, ins, asmstr, pattern, cstr, itin>;
 
-class ALU64_ri<dag outs, dag ins, string asmstr, list<dag> pattern,
-               string cstr = "">
-   : ALU64Inst<outs, ins, asmstr, pattern, cstr>;
+class ALU64_ri<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+               string cstr = "", InstrItinClass itin = ALU64_tc_1_SLOT23>
+   : ALU64Inst<outs, ins, asmstr, pattern, cstr, itin>;
 
 // Post increment ST Instruction.
 class STInstPI<dag outs, dag ins, string asmstr, list<dag> pattern = [],
diff --git a/lib/Target/Hexagon/HexagonInstrFormatsV4.td b/lib/Target/Hexagon/HexagonInstrFormatsV4.td
index 9fda0da..d92f97b 100644
--- a/lib/Target/Hexagon/HexagonInstrFormatsV4.td
+++ b/lib/Target/Hexagon/HexagonInstrFormatsV4.td
@@ -12,7 +12,7 @@
 //===----------------------------------------------------------------------===//
 
 //----------------------------------------------------------------------------//
-//                         Hexagon Intruction Flags +
+//                         Hexagon Instruction Flags
 //
 //                        *** Must match BaseInfo.h ***
 //----------------------------------------------------------------------------//
@@ -22,30 +22,30 @@ def TypeNV     : IType<10>;
 def TypePREFIX : IType<30>;
 
 //----------------------------------------------------------------------------//
-//                         Intruction Classes Definitions +
+//                         Instruction Classes Definitions
 //----------------------------------------------------------------------------//
 
 //
 // NV type instructions.
 //
 class NVInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
-             string cstr = "">
-  : InstHexagon<outs, ins, asmstr, pattern, cstr, NV_V4, TypeNV>;
+             string cstr = "", InstrItinClass itin = NCJ_tc_3or4stall_SLOT0>
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeNV>;
 
 class NVInst_V4<dag outs, dag ins, string asmstr, list<dag> pattern = [],
-                string cstr = "">
-  : NVInst<outs, ins, asmstr, pattern, cstr>;
+                string cstr = "", InstrItinClass itin = NCJ_tc_3or4stall_SLOT0>
+  : NVInst<outs, ins, asmstr, pattern, cstr, itin>;
 
 // Definition of Post increment new value store.
 class NVInstPost_V4<dag outs, dag ins, string asmstr, list<dag> pattern = [],
-               string cstr = "">
-  : NVInst<outs, ins, asmstr, pattern, cstr>;
+               string cstr = "", InstrItinClass itin = ST_tc_st_SLOT0>
+  : NVInst<outs, ins, asmstr, pattern, cstr, itin>;
 
 // Post increment ST Instruction.
 let mayStore = 1 in
 class NVInstPI_V4<dag outs, dag ins, string asmstr, list<dag> pattern = [],
-               string cstr = "">
-  : NVInst<outs, ins, asmstr, pattern, cstr>;
+               string cstr = "", InstrItinClass itin = ST_tc_st_SLOT0>
+  : NVInst<outs, ins, asmstr, pattern, cstr, itin>;
 
 // New-value conditional branch.
 class NCJInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
@@ -54,13 +54,14 @@ class NCJInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
 
 let mayLoad = 1, mayStore = 1 in
 class MEMInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
-              string cstr = "">
-  : InstHexagon<outs, ins, asmstr, pattern, cstr, MEM_V4, TypeMEMOP>;
+              string cstr = "", InstrItinClass itin = V4LDST_tc_st_SLOT0>
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeMEMOP>;
 
 class MEMInst_V4<dag outs, dag ins, string asmstr, list<dag> pattern = [],
-                 string cstr = "">
-  : MEMInst<outs, ins, asmstr, pattern, cstr>;
+                 string cstr = "", InstrItinClass itin = V4LDST_tc_st_SLOT0>
+  : MEMInst<outs, ins, asmstr, pattern, cstr, itin>;
 
 let isCodeGenOnly = 1 in
 class EXTENDERInst<dag outs, dag ins, string asmstr, list<dag> pattern = []>
-  : InstHexagon<outs, ins, asmstr, pattern, "", PREFIX, TypePREFIX>;
+  : InstHexagon<outs, ins, asmstr, pattern, "", EXTENDER_tc_1_SLOT0123,
+                TypePREFIX>;
diff --git a/lib/Target/Hexagon/HexagonInstrInfo.cpp b/lib/Target/Hexagon/HexagonInstrInfo.cpp
index 21a12de..ea6367a 100644
--- a/lib/Target/Hexagon/HexagonInstrInfo.cpp
+++ b/lib/Target/Hexagon/HexagonInstrInfo.cpp
@@ -26,13 +26,16 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "hexagon-instrinfo"
+
 #define GET_INSTRINFO_CTOR_DTOR
 #define GET_INSTRMAP_INFO
 #include "HexagonGenInstrInfo.inc"
 #include "HexagonGenDFAPacketizer.inc"
 
-using namespace llvm;
-
 ///
 /// Constants for Hexagon instructions.
 ///
@@ -135,7 +138,7 @@ HexagonInstrInfo::InsertBranch(MachineBasicBlock &MBB,MachineBasicBlock *TBB,
       regPos = 1;
     }
 
-    if (FBB == 0) {
+    if (!FBB) {
       if (Cond.empty()) {
         // Due to a bug in TailMerging/CFG Optimization, we need to add a
         // special case handling of a predicated jump followed by an
@@ -151,7 +154,7 @@ HexagonInstrInfo::InsertBranch(MachineBasicBlock &MBB,MachineBasicBlock *TBB,
           if (NewTBB == NextBB) {
             ReverseBranchCondition(Cond);
             RemoveBranch(MBB);
-            return InsertBranch(MBB, TBB, 0, Cond, DL);
+            return InsertBranch(MBB, TBB, nullptr, Cond, DL);
           }
         }
         BuildMI(&MBB, DL, get(BOpc)).addMBB(TBB);
@@ -174,8 +177,8 @@ bool HexagonInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
                                  MachineBasicBlock *&FBB,
                                  SmallVectorImpl<MachineOperand> &Cond,
                                  bool AllowModify) const {
-  TBB = NULL;
-  FBB = NULL;
+  TBB = nullptr;
+  FBB = nullptr;
 
   // If the block has no terminators, it just falls into the block after it.
   MachineBasicBlock::instr_iterator I = MBB.instr_end();
@@ -224,7 +227,7 @@ bool HexagonInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
 
   // Get the last instruction in the block.
   MachineInstr *LastInst = I;
-  MachineInstr *SecondLastInst = NULL;
+  MachineInstr *SecondLastInst = nullptr;
   // Find one more terminator if present.
   do {
     if (&*I != LastInst && !I->isBundle() && isUnpredicatedTerminator(I)) {
@@ -557,7 +560,7 @@ MachineInstr *HexagonInstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
                                           const SmallVectorImpl<unsigned> &Ops,
                                                     int FI) const {
   // Hexagon_TODO: Implement.
-  return(0);
+  return nullptr;
 }
 
 unsigned HexagonInstrInfo::createVR(MachineFunction* MF, MVT VT) const {
diff --git a/lib/Target/Hexagon/HexagonInstrInfo.h b/lib/Target/Hexagon/HexagonInstrInfo.h
index 5da23cb..6b032c9 100644
--- a/lib/Target/Hexagon/HexagonInstrInfo.h
+++ b/lib/Target/Hexagon/HexagonInstrInfo.h
@@ -40,124 +40,121 @@ public:
   /// such, whenever a client has an instance of instruction info, it should
   /// always be able to get register info as well (through this method).
   ///
-  virtual const HexagonRegisterInfo &getRegisterInfo() const { return RI; }
+  const HexagonRegisterInfo &getRegisterInfo() const { return RI; }
 
   /// isLoadFromStackSlot - If the specified machine instruction is a direct
   /// load from a stack slot, return the virtual or physical register number of
   /// the destination along with the FrameIndex of the loaded stack slot.  If
   /// not, return 0.  This predicate must return 0 if the instruction has
   /// any side effects other than loading from the stack slot.
-  virtual unsigned isLoadFromStackSlot(const MachineInstr *MI,
-                                       int &FrameIndex) const;
+  unsigned isLoadFromStackSlot(const MachineInstr *MI,
+                               int &FrameIndex) const override;
 
   /// isStoreToStackSlot - If the specified machine instruction is a direct
   /// store to a stack slot, return the virtual or physical register number of
   /// the source reg along with the FrameIndex of the loaded stack slot.  If
   /// not, return 0.  This predicate must return 0 if the instruction has
   /// any side effects other than storing to the stack slot.
-  virtual unsigned isStoreToStackSlot(const MachineInstr *MI,
-                                      int &FrameIndex) const;
-
-
-  virtual bool AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB,
-                                 MachineBasicBlock *&FBB,
-                                 SmallVectorImpl<MachineOperand> &Cond,
-                                 bool AllowModify) const;
-
-  virtual unsigned RemoveBranch(MachineBasicBlock &MBB) const;
-
-  virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
-                                MachineBasicBlock *FBB,
-                                const SmallVectorImpl<MachineOperand> &Cond,
-                                DebugLoc DL) const;
-
-  virtual bool analyzeCompare(const MachineInstr *MI,
-                              unsigned &SrcReg, unsigned &SrcReg2,
-                              int &Mask, int &Value) const;
-
-  virtual void copyPhysReg(MachineBasicBlock &MBB,
-                           MachineBasicBlock::iterator I, DebugLoc DL,
-                           unsigned DestReg, unsigned SrcReg,
-                           bool KillSrc) const;
-
-  virtual void storeRegToStackSlot(MachineBasicBlock &MBB,
-                                   MachineBasicBlock::iterator MBBI,
-                                   unsigned SrcReg, bool isKill, int FrameIndex,
-                                   const TargetRegisterClass *RC,
-                                   const TargetRegisterInfo *TRI) const;
-
-  virtual void storeRegToAddr(MachineFunction &MF, unsigned SrcReg, bool isKill,
-                              SmallVectorImpl<MachineOperand> &Addr,
-                              const TargetRegisterClass *RC,
-                              SmallVectorImpl<MachineInstr*> &NewMIs) const;
-
-  virtual void loadRegFromStackSlot(MachineBasicBlock &MBB,
-                                    MachineBasicBlock::iterator MBBI,
-                                    unsigned DestReg, int FrameIndex,
-                                    const TargetRegisterClass *RC,
-                                    const TargetRegisterInfo *TRI) const;
-
-  virtual void loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
-                               SmallVectorImpl<MachineOperand> &Addr,
-                               const TargetRegisterClass *RC,
-                               SmallVectorImpl<MachineInstr*> &NewMIs) const;
-
-  virtual MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
-                                              MachineInstr* MI,
-                                           const SmallVectorImpl<unsigned> &Ops,
-                                              int FrameIndex) const;
-
-  virtual MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
-                                              MachineInstr* MI,
-                                           const SmallVectorImpl<unsigned> &Ops,
-                                              MachineInstr* LoadMI) const {
-    return 0;
+  unsigned isStoreToStackSlot(const MachineInstr *MI,
+                              int &FrameIndex) const override;
+
+
+  bool AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB,
+                         MachineBasicBlock *&FBB,
+                         SmallVectorImpl<MachineOperand> &Cond,
+                         bool AllowModify) const override;
+
+  unsigned RemoveBranch(MachineBasicBlock &MBB) const override;
+
+  unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+                        MachineBasicBlock *FBB,
+                        const SmallVectorImpl<MachineOperand> &Cond,
+                        DebugLoc DL) const override;
+
+  bool analyzeCompare(const MachineInstr *MI,
+                      unsigned &SrcReg, unsigned &SrcReg2,
+                      int &Mask, int &Value) const override;
+
+  void copyPhysReg(MachineBasicBlock &MBB,
+                   MachineBasicBlock::iterator I, DebugLoc DL,
+                   unsigned DestReg, unsigned SrcReg,
+                   bool KillSrc) const override;
+
+  void storeRegToStackSlot(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator MBBI,
+                           unsigned SrcReg, bool isKill, int FrameIndex,
+                           const TargetRegisterClass *RC,
+                           const TargetRegisterInfo *TRI) const override;
+
+  void storeRegToAddr(MachineFunction &MF, unsigned SrcReg, bool isKill,
+                      SmallVectorImpl<MachineOperand> &Addr,
+                      const TargetRegisterClass *RC,
+                      SmallVectorImpl<MachineInstr*> &NewMIs) const;
+
+  void loadRegFromStackSlot(MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator MBBI,
+                            unsigned DestReg, int FrameIndex,
+                            const TargetRegisterClass *RC,
+                            const TargetRegisterInfo *TRI) const override;
+
+  void loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
+                       SmallVectorImpl<MachineOperand> &Addr,
+                       const TargetRegisterClass *RC,
+                       SmallVectorImpl<MachineInstr*> &NewMIs) const;
+
+  MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
+                                      MachineInstr* MI,
+                                      const SmallVectorImpl<unsigned> &Ops,
+                                      int FrameIndex) const override;
+
+  MachineInstr* foldMemoryOperandImpl(MachineFunction &MF,
+                                      MachineInstr* MI,
+                                      const SmallVectorImpl<unsigned> &Ops,
+                                      MachineInstr* LoadMI) const override {
+    return nullptr;
   }
 
   unsigned createVR(MachineFunction* MF, MVT VT) const;
 
-  virtual bool isBranch(const MachineInstr *MI) const;
-  virtual bool isPredicable(MachineInstr *MI) const;
-  virtual bool
-  PredicateInstruction(MachineInstr *MI,
-                       const SmallVectorImpl<MachineOperand> &Cond) const;
-
-  virtual bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCycles,
-                                   unsigned ExtraPredCycles,
-                                   const BranchProbability &Probability) const;
-
-  virtual bool isProfitableToIfCvt(MachineBasicBlock &TMBB,
-                                   unsigned NumTCycles, unsigned ExtraTCycles,
-                                   MachineBasicBlock &FMBB,
-                                   unsigned NumFCycles, unsigned ExtraFCycles,
-                                   const BranchProbability &Probability) const;
-
-  virtual bool isPredicated(const MachineInstr *MI) const;
-  virtual bool isPredicated(unsigned Opcode) const;
-  virtual bool isPredicatedTrue(const MachineInstr *MI) const;
-  virtual bool isPredicatedTrue(unsigned Opcode) const;
-  virtual bool isPredicatedNew(const MachineInstr *MI) const;
-  virtual bool isPredicatedNew(unsigned Opcode) const;
-  virtual bool DefinesPredicate(MachineInstr *MI,
-                                std::vector<MachineOperand> &Pred) const;
-  virtual bool
-  SubsumesPredicate(const SmallVectorImpl<MachineOperand> &Pred1,
-                    const SmallVectorImpl<MachineOperand> &Pred2) const;
-
-  virtual bool
-  ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const;
-
-  virtual bool
-  isProfitableToDupForIfCvt(MachineBasicBlock &MBB,unsigned NumCycles,
-                            const BranchProbability &Probability) const;
-
-  virtual DFAPacketizer*
+  bool isBranch(const MachineInstr *MI) const;
+  bool isPredicable(MachineInstr *MI) const override;
+  bool PredicateInstruction(MachineInstr *MI,
+                    const SmallVectorImpl<MachineOperand> &Cond) const override;
+
+  bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCycles,
+                           unsigned ExtraPredCycles,
+                           const BranchProbability &Probability) const override;
+
+  bool isProfitableToIfCvt(MachineBasicBlock &TMBB,
+                           unsigned NumTCycles, unsigned ExtraTCycles,
+                           MachineBasicBlock &FMBB,
+                           unsigned NumFCycles, unsigned ExtraFCycles,
+                           const BranchProbability &Probability) const override;
+
+  bool isPredicated(const MachineInstr *MI) const override;
+  bool isPredicated(unsigned Opcode) const;
+  bool isPredicatedTrue(const MachineInstr *MI) const;
+  bool isPredicatedTrue(unsigned Opcode) const;
+  bool isPredicatedNew(const MachineInstr *MI) const;
+  bool isPredicatedNew(unsigned Opcode) const;
+  bool DefinesPredicate(MachineInstr *MI,
+                        std::vector<MachineOperand> &Pred) const override;
+  bool SubsumesPredicate(const SmallVectorImpl<MachineOperand> &Pred1,
+                   const SmallVectorImpl<MachineOperand> &Pred2) const override;
+
+  bool
+  ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override;
+
+  bool isProfitableToDupForIfCvt(MachineBasicBlock &MBB, unsigned NumCycles,
+                           const BranchProbability &Probability) const override;
+
+  DFAPacketizer*
   CreateTargetScheduleState(const TargetMachine *TM,
-                            const ScheduleDAG *DAG) const;
+                            const ScheduleDAG *DAG) const override;
 
-  virtual bool isSchedulingBoundary(const MachineInstr *MI,
-                                    const MachineBasicBlock *MBB,
-                                    const MachineFunction &MF) const;
+  bool isSchedulingBoundary(const MachineInstr *MI,
+                            const MachineBasicBlock *MBB,
+                            const MachineFunction &MF) const override;
   bool isValidOffset(const int Opcode, const int Offset) const;
   bool isValidAutoIncImm(const EVT VT, const int Offset) const;
   bool isMemOp(const MachineInstr *MI) const;
diff --git a/lib/Target/Hexagon/HexagonInstrInfo.td b/lib/Target/Hexagon/HexagonInstrInfo.td
index c96aaca..4dcf101 100644
--- a/lib/Target/Hexagon/HexagonInstrInfo.td
+++ b/lib/Target/Hexagon/HexagonInstrInfo.td
@@ -768,12 +768,13 @@ class T_JMP <dag InsDag, list<dag> JumpList = []>
 
 let InputType = "imm", isExtendable = 1, opExtendable = 1, isExtentSigned = 1,
 Defs = [PC], isPredicated = 1, opExtentBits = 17 in
-class T_JMP_c <bit PredNot, bit isPredNew, bit isTaken>:
+class T_JMP_c <bit PredNot, bit isPredNew, bit isTak>:
             JInst<(outs ), (ins PredRegs:$src, brtarget:$dst),
             !if(PredNot, "if (!$src", "if ($src")#
             !if(isPredNew, ".new) ", ") ")#"jump"#
-            !if(isPredNew, !if(isTaken, ":t ", ":nt "), " ")#"$dst"> {
+            !if(isPredNew, !if(isTak, ":t ", ":nt "), " ")#"$dst"> {
 
+    let isTaken = isTak;
     let isBrTaken = !if(isPredNew, !if(isTaken, "true", "false"), "");
     let isPredicatedFalse = PredNot;
     let isPredicatedNew = isPredNew;
@@ -784,7 +785,7 @@ class T_JMP_c <bit PredNot, bit isPredNew, bit isTaken>:
 
     let Inst{27-24} = 0b1100;
     let Inst{21} = PredNot;
-    let Inst{12} = !if(isPredNew, isTaken, zero);
+    let Inst{12} = !if(isPredNew, isTak, zero);
     let Inst{11} = isPredNew;
     let Inst{9-8} = src;
     let Inst{23-22} = dst{16-15};
@@ -806,12 +807,13 @@ class T_JMPr<dag InsDag = (ins IntRegs:$dst)>
 }
 
 let Defs = [PC], isPredicated = 1, InputType = "reg" in
-class T_JMPr_c <bit PredNot, bit isPredNew, bit isTaken>:
+class T_JMPr_c <bit PredNot, bit isPredNew, bit isTak>:
             JRInst <(outs ), (ins PredRegs:$src, IntRegs:$dst),
             !if(PredNot, "if (!$src", "if ($src")#
             !if(isPredNew, ".new) ", ") ")#"jumpr"#
-            !if(isPredNew, !if(isTaken, ":t ", ":nt "), " ")#"$dst"> {
+            !if(isPredNew, !if(isTak, ":t ", ":nt "), " ")#"$dst"> {
 
+    let isTaken = isTak;
     let isBrTaken = !if(isPredNew, !if(isTaken, "true", "false"), "");
     let isPredicatedFalse = PredNot;
     let isPredicatedNew = isPredNew;
@@ -823,7 +825,7 @@ class T_JMPr_c <bit PredNot, bit isPredNew, bit isTaken>:
     let Inst{27-22} = 0b001101;
     let Inst{21} = PredNot;
     let Inst{20-16} = dst;
-    let Inst{12} = !if(isPredNew, isTaken, zero);
+    let Inst{12} = !if(isPredNew, isTak, zero);
     let Inst{11} = isPredNew;
     let Inst{9-8} = src;
     let Predicates = !if(isPredNew, [HasV3T], [HasV2T]);
diff --git a/lib/Target/Hexagon/HexagonInstrInfoV4.td b/lib/Target/Hexagon/HexagonInstrInfoV4.td
index a95fb80..db5b7ea 100644
--- a/lib/Target/Hexagon/HexagonInstrInfoV4.td
+++ b/lib/Target/Hexagon/HexagonInstrInfoV4.td
@@ -1004,13 +1004,13 @@ defm POST_STwri: ST_PostInc_nv <"memw", "STriw", IntRegs, s4_2Imm>, AddrModeRel;
 
 let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 11 in
 class NVJrr_template<string mnemonic, bits<3> majOp, bit NvOpNum,
-                      bit isNegCond, bit isTaken>
+                      bit isNegCond, bit isTak>
   : NVInst_V4<(outs),
     (ins IntRegs:$src1, IntRegs:$src2, brtarget:$offset),
     "if ("#!if(isNegCond, "!","")#mnemonic#
     "($src1"#!if(!eq(NvOpNum, 0),".new, ",", ")#
     "$src2"#!if(!eq(NvOpNum, 1),".new))","))")#" jump:"
-    #!if(isTaken, "t","nt")#" $offset",
+    #!if(isTak, "t","nt")#" $offset",
     []>, Requires<[HasV4T]> {
 
       bits<5> src1;
@@ -1019,6 +1019,7 @@ class NVJrr_template<string mnemonic, bits<3> majOp, bit NvOpNum,
       bits<5> RegOp; // Non-New-Value Operand
       bits<11> offset;
 
+      let isTaken = isTak;
       let isBrTaken = !if(isTaken, "true", "false");
       let isPredicatedFalse = isNegCond;
 
@@ -1030,7 +1031,7 @@ class NVJrr_template<string mnemonic, bits<3> majOp, bit NvOpNum,
       let Inst{25-23} = majOp;
       let Inst{22} = isNegCond;
       let Inst{18-16} = Ns;
-      let Inst{13} = isTaken;
+      let Inst{13} = isTak;
       let Inst{12-8} = RegOp;
       let Inst{21-20} = offset{10-9};
       let Inst{7-1} = offset{8-2};
@@ -1078,13 +1079,14 @@ let isPredicated = 1, isBranch = 1, isNewValue = 1, isTerminator = 1,
 
 let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 11 in
 class NVJri_template<string mnemonic, bits<3> majOp, bit isNegCond,
-                         bit isTaken>
+                         bit isTak>
   : NVInst_V4<(outs),
     (ins IntRegs:$src1, u5Imm:$src2, brtarget:$offset),
     "if ("#!if(isNegCond, "!","")#mnemonic#"($src1.new, #$src2)) jump:"
-    #!if(isTaken, "t","nt")#" $offset",
+    #!if(isTak, "t","nt")#" $offset",
     []>, Requires<[HasV4T]> {
 
+      let isTaken = isTak;
       let isPredicatedFalse = isNegCond;
       let isBrTaken = !if(isTaken, "true", "false");
 
@@ -1097,7 +1099,7 @@ class NVJri_template<string mnemonic, bits<3> majOp, bit isNegCond,
       let Inst{25-23} = majOp;
       let Inst{22} = isNegCond;
       let Inst{18-16} = src1;
-      let Inst{13} = isTaken;
+      let Inst{13} = isTak;
       let Inst{12-8} = src2;
       let Inst{21-20} = offset{10-9};
       let Inst{7-1} = offset{8-2};
@@ -1135,14 +1137,15 @@ let isPredicated = 1, isBranch = 1, isNewValue = 1, isTerminator = 1,
 
 let isExtendable = 1, opExtendable = 1, isExtentSigned = 1, opExtentBits = 11 in
 class NVJ_ConstImm_template<string mnemonic, bits<3> majOp, string ImmVal,
-                            bit isNegCond, bit isTaken>
+                            bit isNegCond, bit isTak>
   : NVInst_V4<(outs),
     (ins IntRegs:$src1, brtarget:$offset),
     "if ("#!if(isNegCond, "!","")#mnemonic
     #"($src1.new, #"#ImmVal#")) jump:"
-    #!if(isTaken, "t","nt")#" $offset",
+    #!if(isTak, "t","nt")#" $offset",
     []>, Requires<[HasV4T]> {
 
+      let isTaken = isTak;
       let isPredicatedFalse = isNegCond;
       let isBrTaken = !if(isTaken, "true", "false");
 
@@ -1153,7 +1156,7 @@ class NVJ_ConstImm_template<string mnemonic, bits<3> majOp, string ImmVal,
       let Inst{25-23} = majOp;
       let Inst{22} = isNegCond;
       let Inst{18-16} = src1;
-      let Inst{13} = isTaken;
+      let Inst{13} = isTak;
       let Inst{21-20} = offset{10-9};
       let Inst{7-1} = offset{8-2};
 }
@@ -2019,9 +2022,10 @@ multiclass MemOpi_bitPats <PatFrag ldOp, PatFrag stOp, PatLeaf immPred,
 
   // mem[bhw](Rs+#0) = [clrbit|setbit](#U5)
   let AddedComplexity = 225 in
-  def : Pat <(stOp (OpNode (ldOp addrPred:$addr), immPred:$bitend),
-                   addrPred:$addr),
-             (MI IntRegs:$addr, #0, (xformFunc immPred:$bitend))>;
+  def : Pat <(stOp (OpNode (ldOp (addrPred IntRegs:$addr, extPred:$offset)),
+                           immPred:$bitend),
+                   (addrPred (i32 IntRegs:$addr), extPred:$offset)),
+             (MI IntRegs:$addr, extPred:$offset, (xformFunc immPred:$bitend))>;
 }
 
 multiclass MemOpi_bitExtType<PatFrag ldOpByte, PatFrag ldOpHalf > {
@@ -2065,9 +2069,10 @@ multiclass MemOpr_Pats <PatFrag ldOp, PatFrag stOp, ComplexPattern addrPred,
                      PatLeaf extPred, InstHexagon MI, SDNode OpNode> {
   let AddedComplexity = 141 in
   // mem[bhw](Rs+#0) [+-&|]= Rt
-  def : Pat <(stOp (OpNode (ldOp addrPred:$addr), (i32 IntRegs:$addend)),
-                   addrPred:$addr),
-             (MI IntRegs:$addr, #0, (i32 IntRegs:$addend) )>;
+  def : Pat <(stOp (OpNode (ldOp (addrPred IntRegs:$addr, extPred:$offset)),
+                           (i32 IntRegs:$addend)),
+                   (addrPred (i32 IntRegs:$addr), extPred:$offset)),
+             (MI IntRegs:$addr, extPred:$offset, (i32 IntRegs:$addend) )>;
 
   // mem[bhw](Rs+#U6:[012]) [+-&|]= Rt
   let AddedComplexity = 150 in
diff --git a/lib/Target/Hexagon/HexagonMachineScheduler.cpp b/lib/Target/Hexagon/HexagonMachineScheduler.cpp
index 51318ff..7dd6e95 100644
--- a/lib/Target/Hexagon/HexagonMachineScheduler.cpp
+++ b/lib/Target/Hexagon/HexagonMachineScheduler.cpp
@@ -12,17 +12,17 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "misched"
-
 #include "HexagonMachineScheduler.h"
 #include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/IR/Function.h"
 
 using namespace llvm;
 
+#define DEBUG_TYPE "misched"
+
 /// Platform specific modifications to DAG.
 void VLIWMachineScheduler::postprocessDAG() {
-  SUnit* LastSequentialCall = NULL;
+  SUnit* LastSequentialCall = nullptr;
   // Currently we only catch the situation when compare gets scheduled
   // before preceding call.
   for (unsigned su = 0, e = SUnits.size(); su != e; ++su) {
@@ -398,13 +398,13 @@ SUnit *ConvergingVLIWScheduler::VLIWSchedBoundary::pickOnlyChoice() {
   for (unsigned i = 0; Available.empty(); ++i) {
     assert(i <= (HazardRec->getMaxLookAhead() + MaxMinLatency) &&
            "permanent hazard"); (void)i;
-    ResourceModel->reserveResources(0);
+    ResourceModel->reserveResources(nullptr);
     bumpCycle();
     releasePending();
   }
   if (Available.size() == 1)
     return *Available.begin();
-  return NULL;
+  return nullptr;
 }
 
 #ifndef NDEBUG
@@ -424,7 +424,7 @@ void ConvergingVLIWScheduler::traceCandidate(const char *Label,
 /// getSingleUnscheduledPred - If there is exactly one unscheduled predecessor
 /// of SU, return it, otherwise return null.
 static SUnit *getSingleUnscheduledPred(SUnit *SU) {
-  SUnit *OnlyAvailablePred = 0;
+  SUnit *OnlyAvailablePred = nullptr;
   for (SUnit::const_pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
        I != E; ++I) {
     SUnit &Pred = *I->getSUnit();
@@ -432,7 +432,7 @@ static SUnit *getSingleUnscheduledPred(SUnit *SU) {
       // We found an available, but not scheduled, predecessor.  If it's the
       // only one we have found, keep track of it... otherwise give up.
       if (OnlyAvailablePred && OnlyAvailablePred != &Pred)
-        return 0;
+        return nullptr;
       OnlyAvailablePred = &Pred;
     }
   }
@@ -442,7 +442,7 @@ static SUnit *getSingleUnscheduledPred(SUnit *SU) {
 /// getSingleUnscheduledSucc - If there is exactly one unscheduled successor
 /// of SU, return it, otherwise return null.
 static SUnit *getSingleUnscheduledSucc(SUnit *SU) {
-  SUnit *OnlyAvailableSucc = 0;
+  SUnit *OnlyAvailableSucc = nullptr;
   for (SUnit::const_succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
        I != E; ++I) {
     SUnit &Succ = *I->getSUnit();
@@ -450,7 +450,7 @@ static SUnit *getSingleUnscheduledSucc(SUnit *SU) {
       // We found an available, but not scheduled, successor.  If it's the
       // only one we have found, keep track of it... otherwise give up.
       if (OnlyAvailableSucc && OnlyAvailableSucc != &Succ)
-        return 0;
+        return nullptr;
       OnlyAvailableSucc = &Succ;
     }
   }
@@ -639,7 +639,7 @@ SUnit *ConvergingVLIWScheduler::pickNode(bool &IsTopNode) {
   if (DAG->top() == DAG->bottom()) {
     assert(Top.Available.empty() && Top.Pending.empty() &&
            Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage");
-    return NULL;
+    return nullptr;
   }
   SUnit *SU;
   if (llvm::ForceTopDown) {
diff --git a/lib/Target/Hexagon/HexagonMachineScheduler.h b/lib/Target/Hexagon/HexagonMachineScheduler.h
index 300f1c7..99100a1 100644
--- a/lib/Target/Hexagon/HexagonMachineScheduler.h
+++ b/lib/Target/Hexagon/HexagonMachineScheduler.h
@@ -14,7 +14,6 @@
 #ifndef HEXAGONASMPRINTER_H
 #define HEXAGONASMPRINTER_H
 
-#include "llvm/ADT/OwningPtr.h"
 #include "llvm/ADT/PriorityQueue.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/CodeGen/LiveIntervalAnalysis.h"
@@ -57,7 +56,7 @@ class VLIWResourceModel {
 public:
 VLIWResourceModel(const TargetMachine &TM, const TargetSchedModel *SM) :
     SchedModel(SM), TotalPackets(0) {
-    ResourcesModel = TM.getInstrInfo()->CreateTargetScheduleState(&TM,NULL);
+    ResourcesModel = TM.getInstrInfo()->CreateTargetScheduleState(&TM, nullptr);
 
     // This hard requirement could be relaxed,
     // but for now do not let it proceed.
@@ -94,8 +93,9 @@ VLIWResourceModel(const TargetMachine &TM, const TargetSchedModel *SM) :
 /// top-level schedule() driver.
 class VLIWMachineScheduler : public ScheduleDAGMILive {
 public:
-  VLIWMachineScheduler(MachineSchedContext *C, MachineSchedStrategy *S):
-    ScheduleDAGMILive(C, S) {}
+  VLIWMachineScheduler(MachineSchedContext *C,
+                       std::unique_ptr<MachineSchedStrategy> S)
+      : ScheduleDAGMILive(C, std::move(S)) {}
 
   /// Schedule - This is called back from ScheduleDAGInstrs::Run() when it's
   /// time to do some work.
@@ -120,7 +120,7 @@ class ConvergingVLIWScheduler : public MachineSchedStrategy {
     // Best scheduling cost.
     int SCost;
 
-    SchedCandidate(): SU(NULL), SCost(0) {}
+    SchedCandidate(): SU(nullptr), SCost(0) {}
   };
   /// Represent the type of SchedCandidate found within a single queue.
   enum CandResult {
@@ -153,9 +153,9 @@ class ConvergingVLIWScheduler : public MachineSchedStrategy {
     /// Pending queues extend the ready queues with the same ID and the
     /// PendingFlag set.
     VLIWSchedBoundary(unsigned ID, const Twine &Name):
-      DAG(0), SchedModel(0), Available(ID, Name+".A"),
+      DAG(nullptr), SchedModel(nullptr), Available(ID, Name+".A"),
       Pending(ID << ConvergingVLIWScheduler::LogMaxQID, Name+".P"),
-      CheckPending(false), HazardRec(0), ResourceModel(0),
+      CheckPending(false), HazardRec(nullptr), ResourceModel(nullptr),
       CurrCycle(0), IssueCount(0),
       MinReadyCycle(UINT_MAX), MaxMinLatency(0) {}
 
@@ -203,8 +203,9 @@ public:
     LogMaxQID = 2
   };
 
-  ConvergingVLIWScheduler():
-    DAG(0), SchedModel(0), Top(TopQID, "TopQ"), Bot(BotQID, "BotQ") {}
+  ConvergingVLIWScheduler()
+    : DAG(nullptr), SchedModel(nullptr), Top(TopQID, "TopQ"),
+      Bot(BotQID, "BotQ") {}
 
   virtual void initialize(ScheduleDAGMI *dag) override;
 
diff --git a/lib/Target/Hexagon/HexagonNewValueJump.cpp b/lib/Target/Hexagon/HexagonNewValueJump.cpp
index 3e238bf..b7c03a7 100644
--- a/lib/Target/Hexagon/HexagonNewValueJump.cpp
+++ b/lib/Target/Hexagon/HexagonNewValueJump.cpp
@@ -21,7 +21,6 @@
 //
 //
 //===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "hexagon-nvj"
 #include "llvm/PassSupport.h"
 #include "Hexagon.h"
 #include "HexagonInstrInfo.h"
@@ -47,6 +46,8 @@
 #include <map>
 using namespace llvm;
 
+#define DEBUG_TYPE "hexagon-nvj"
+
 STATISTIC(NumNVJGenerated, "Number of New Value Jump Instructions created");
 
 static cl::opt<int>
@@ -74,16 +75,16 @@ namespace {
       initializeHexagonNewValueJumpPass(*PassRegistry::getPassRegistry());
     }
 
-    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
       AU.addRequired<MachineBranchProbabilityInfo>();
       MachineFunctionPass::getAnalysisUsage(AU);
     }
 
-    const char *getPassName() const {
+    const char *getPassName() const override {
       return "Hexagon NewValueJump";
     }
 
-    virtual bool runOnMachineFunction(MachineFunction &Fn);
+    bool runOnMachineFunction(MachineFunction &Fn) override;
 
   private:
     /// \brief A handle to the branch probability pass.
@@ -393,8 +394,8 @@ bool HexagonNewValueJump::runOnMachineFunction(MachineFunction &MF) {
     bool MO2IsKill = false;
     MachineBasicBlock::iterator jmpPos;
     MachineBasicBlock::iterator cmpPos;
-    MachineInstr *cmpInstr = NULL, *jmpInstr = NULL;
-    MachineBasicBlock *jmpTarget = NULL;
+    MachineInstr *cmpInstr = nullptr, *jmpInstr = nullptr;
+    MachineBasicBlock *jmpTarget = nullptr;
     bool afterRA = false;
     bool isSecondOpReg = false;
     bool isSecondOpNewified = false;
diff --git a/lib/Target/Hexagon/HexagonPeephole.cpp b/lib/Target/Hexagon/HexagonPeephole.cpp
index 5490ecd..48b6159 100644
--- a/lib/Target/Hexagon/HexagonPeephole.cpp
+++ b/lib/Target/Hexagon/HexagonPeephole.cpp
@@ -35,7 +35,6 @@
 
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "hexagon-peephole"
 #include "Hexagon.h"
 #include "HexagonTargetMachine.h"
 #include "llvm/ADT/DenseMap.h"
@@ -57,6 +56,8 @@
 
 using namespace llvm;
 
+#define DEBUG_TYPE "hexagon-peephole"
+
 static cl::opt<bool> DisableHexagonPeephole("disable-hexagon-peephole",
     cl::Hidden, cl::ZeroOrMore, cl::init(false),
     cl::desc("Disable Peephole Optimization"));
@@ -89,13 +90,13 @@ namespace {
       initializeHexagonPeepholePass(*PassRegistry::getPassRegistry());
     }
 
-    bool runOnMachineFunction(MachineFunction &MF);
+    bool runOnMachineFunction(MachineFunction &MF) override;
 
-    const char *getPassName() const {
+    const char *getPassName() const override {
       return "Hexagon optimize redundant zero and size extends";
     }
 
-    void getAnalysisUsage(AnalysisUsage &AU) const {
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
       MachineFunctionPass::getAnalysisUsage(AU);
     }
 
diff --git a/lib/Target/Hexagon/HexagonRegisterInfo.cpp b/lib/Target/Hexagon/HexagonRegisterInfo.cpp
index 9a20dfd..fb466d3 100644
--- a/lib/Target/Hexagon/HexagonRegisterInfo.cpp
+++ b/lib/Target/Hexagon/HexagonRegisterInfo.cpp
@@ -43,13 +43,12 @@ HexagonRegisterInfo::HexagonRegisterInfo(HexagonSubtarget &st)
     Subtarget(st) {
 }
 
-const uint16_t* HexagonRegisterInfo::getCalleeSavedRegs(const MachineFunction
-                                                        *MF)
-  const {
-  static const uint16_t CalleeSavedRegsV2[] = {
+const MCPhysReg *
+HexagonRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
+  static const MCPhysReg CalleeSavedRegsV2[] = {
     Hexagon::R24,   Hexagon::R25,   Hexagon::R26,   Hexagon::R27, 0
   };
-  static const uint16_t CalleeSavedRegsV3[] = {
+  static const MCPhysReg CalleeSavedRegsV3[] = {
     Hexagon::R16,   Hexagon::R17,   Hexagon::R18,   Hexagon::R19,
     Hexagon::R20,   Hexagon::R21,   Hexagon::R22,   Hexagon::R23,
     Hexagon::R24,   Hexagon::R25,   Hexagon::R26,   Hexagon::R27, 0
diff --git a/lib/Target/Hexagon/HexagonRegisterInfo.h b/lib/Target/Hexagon/HexagonRegisterInfo.h
index 89af7c3..648b4af 100644
--- a/lib/Target/Hexagon/HexagonRegisterInfo.h
+++ b/lib/Target/Hexagon/HexagonRegisterInfo.h
@@ -48,16 +48,17 @@ struct HexagonRegisterInfo : public HexagonGenRegisterInfo {
   HexagonRegisterInfo(HexagonSubtarget &st);
 
   /// Code Generation virtual methods...
-  const uint16_t *getCalleeSavedRegs(const MachineFunction *MF = 0) const;
+  const MCPhysReg *
+  getCalleeSavedRegs(const MachineFunction *MF = nullptr) const override;
 
-  const TargetRegisterClass* const* getCalleeSavedRegClasses(
-                                     const MachineFunction *MF = 0) const;
+  const TargetRegisterClass* const*
+  getCalleeSavedRegClasses(const MachineFunction *MF = nullptr) const;
 
-  BitVector getReservedRegs(const MachineFunction &MF) const;
+  BitVector getReservedRegs(const MachineFunction &MF) const override;
 
   void eliminateFrameIndex(MachineBasicBlock::iterator II,
                            int SPAdj, unsigned FIOperandNum,
-                           RegScavenger *RS = NULL) const;
+                           RegScavenger *RS = nullptr) const override;
 
   /// determineFrameLayout - Determine the size of the frame and maximum call
   /// frame size.
@@ -65,17 +66,17 @@ struct HexagonRegisterInfo : public HexagonGenRegisterInfo {
 
   /// requiresRegisterScavenging - returns true since we may need scavenging for
   /// a temporary register when generating hardware loop instructions.
-  bool requiresRegisterScavenging(const MachineFunction &MF) const {
+  bool requiresRegisterScavenging(const MachineFunction &MF) const override {
     return true;
   }
 
-  bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const {
+  bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const override {
     return true;
   }
 
   // Debug information queries.
   unsigned getRARegister() const;
-  unsigned getFrameRegister(const MachineFunction &MF) const;
+  unsigned getFrameRegister(const MachineFunction &MF) const override;
   unsigned getFrameRegister() const;
   unsigned getStackRegister() const;
 };
diff --git a/lib/Target/Hexagon/HexagonRemoveSZExtArgs.cpp b/lib/Target/Hexagon/HexagonRemoveSZExtArgs.cpp
index cadcb32..2b459a4 100644
--- a/lib/Target/Hexagon/HexagonRemoveSZExtArgs.cpp
+++ b/lib/Target/Hexagon/HexagonRemoveSZExtArgs.cpp
@@ -33,13 +33,13 @@ namespace {
     HexagonRemoveExtendArgs() : FunctionPass(ID) {
       initializeHexagonRemoveExtendArgsPass(*PassRegistry::getPassRegistry());
     }
-    virtual bool runOnFunction(Function &F);
+    bool runOnFunction(Function &F) override;
 
-    const char *getPassName() const {
+    const char *getPassName() const override {
       return "Remove sign extends";
     }
 
-    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
       AU.addRequired<MachineFunctionAnalysis>();
       AU.addPreserved<MachineFunctionAnalysis>();
       AU.addPreserved("stack-protector");
diff --git a/lib/Target/Hexagon/HexagonSchedule.td b/lib/Target/Hexagon/HexagonSchedule.td
index c2cfbb9..528cafc 100644
--- a/lib/Target/Hexagon/HexagonSchedule.td
+++ b/lib/Target/Hexagon/HexagonSchedule.td
@@ -7,57 +7,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-// Functional Units
-def LSUNIT    : FuncUnit; // SLOT0
-def LUNIT     : FuncUnit; // SLOT1
-def MUNIT     : FuncUnit; // SLOT2
-def SUNIT     : FuncUnit; // SLOT3
-def LOOPUNIT  : FuncUnit;
-
-// Itinerary classes
-def ALU32     : InstrItinClass;
-def ALU64     : InstrItinClass;
-def CR        : InstrItinClass;
-def J         : InstrItinClass;
-def JR        : InstrItinClass;
-def LD        : InstrItinClass;
-def LD0       : InstrItinClass;
-def M         : InstrItinClass;
-def ST        : InstrItinClass;
-def ST0       : InstrItinClass;
-def S         : InstrItinClass;
-def SYS       : InstrItinClass;
-def ENDLOOP   : InstrItinClass;
-def PSEUDO    : InstrItinClass;
-def PSEUDOM   : InstrItinClass;
-
-def HexagonItineraries :
-      ProcessorItineraries<[LSUNIT, LUNIT, MUNIT, SUNIT, LOOPUNIT], [], [
-        InstrItinData<ALU32  , [InstrStage<1, [LUNIT, LSUNIT, MUNIT, SUNIT]>]>,
-        InstrItinData<ALU64  , [InstrStage<1, [MUNIT, SUNIT]>]>,
-        InstrItinData<CR     , [InstrStage<1, [SUNIT]>]>,
-        InstrItinData<J      , [InstrStage<1, [SUNIT, MUNIT]>]>,
-        InstrItinData<JR     , [InstrStage<1, [MUNIT]>]>,
-        InstrItinData<LD     , [InstrStage<1, [LUNIT, LSUNIT]>]>,
-        InstrItinData<LD0    , [InstrStage<1, [LSUNIT]>]>,
-        InstrItinData<M      , [InstrStage<1, [MUNIT, SUNIT]>]>,
-        InstrItinData<ST     , [InstrStage<1, [LSUNIT]>]>,
-        InstrItinData<ST0    , [InstrStage<1, [LSUNIT]>]>,
-        InstrItinData<S      , [InstrStage<1, [SUNIT, MUNIT]>]>,
-        InstrItinData<SYS    , [InstrStage<1, [LSUNIT]>]>,
-        InstrItinData<ENDLOOP, [InstrStage<1, [LOOPUNIT]>]>,
-        InstrItinData<PSEUDO , [InstrStage<1, [LUNIT, LSUNIT, MUNIT, SUNIT]>]>,
-        InstrItinData<PSEUDOM, [InstrStage<1, [MUNIT, SUNIT], 0>,
-                                InstrStage<1, [MUNIT, SUNIT]>]>
-      ]>;
-
-def HexagonModel : SchedMachineModel {
-  // Max issue per cycle == bundle width.
-  let IssueWidth = 4;
-  let Itineraries = HexagonItineraries;
-  let LoadLatency = 1;
-}
-
 //===----------------------------------------------------------------------===//
 // V4 Machine Info +
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/Hexagon/HexagonScheduleV4.td b/lib/Target/Hexagon/HexagonScheduleV4.td
index ef72cf4..a7d2d47 100644
--- a/lib/Target/Hexagon/HexagonScheduleV4.td
+++ b/lib/Target/Hexagon/HexagonScheduleV4.td
@@ -34,29 +34,158 @@ def SLOT3       : FuncUnit;
 def SLOT_ENDLOOP: FuncUnit;
 
 // Itinerary classes.
-def NV_V4       : InstrItinClass;
-def MEM_V4      : InstrItinClass;
+def PSEUDO      : InstrItinClass;
+def PSEUDOM   : InstrItinClass;
 // ALU64/M/S Instruction classes of V2 are collectively knownn as XTYPE in V4.
+def DUPLEX      : InstrItinClass;
 def PREFIX      : InstrItinClass;
+def COMPOUND    : InstrItinClass;
+
+def ALU32_2op_tc_1_SLOT0123  : InstrItinClass;
+def ALU32_2op_tc_2early_SLOT0123  : InstrItinClass;
+def ALU32_3op_tc_2early_SLOT0123  : InstrItinClass;
+def ALU32_3op_tc_1_SLOT0123  : InstrItinClass;
+def ALU32_3op_tc_2_SLOT0123  : InstrItinClass;
+def ALU32_ADDI_tc_1_SLOT0123 : InstrItinClass;
+def ALU64_tc_1_SLOT23        : InstrItinClass;
+def ALU64_tc_1or2_SLOT23     : InstrItinClass;
+def ALU64_tc_2_SLOT23        : InstrItinClass;
+def ALU64_tc_2early_SLOT23   : InstrItinClass;
+def ALU64_tc_3x_SLOT23       : InstrItinClass;
+def CR_tc_2_SLOT3            : InstrItinClass;
+def CR_tc_2early_SLOT23      : InstrItinClass;
+def CR_tc_2early_SLOT3       : InstrItinClass;
+def CR_tc_3x_SLOT23          : InstrItinClass;
+def CR_tc_3x_SLOT3           : InstrItinClass;
+def J_tc_2early_SLOT23       : InstrItinClass;
+def J_tc_2early_SLOT2        : InstrItinClass;
+def LD_tc_ld_SLOT01          : InstrItinClass;
+def LD_tc_ld_SLOT0           : InstrItinClass;
+def LD_tc_3or4stall_SLOT0    : InstrItinClass;
+def M_tc_1_SLOT23            : InstrItinClass;
+def M_tc_1or2_SLOT23         : InstrItinClass;
+def M_tc_2_SLOT23            : InstrItinClass;
+def M_tc_3_SLOT23            : InstrItinClass;
+def M_tc_3x_SLOT23           : InstrItinClass;
+def M_tc_3or4x_SLOT23        : InstrItinClass;
+def ST_tc_st_SLOT01          : InstrItinClass;
+def ST_tc_st_SLOT0           : InstrItinClass;
+def ST_tc_ld_SLOT0           : InstrItinClass;
+def ST_tc_3stall_SLOT0       : InstrItinClass;
+def S_2op_tc_1_SLOT23        : InstrItinClass;
+def S_2op_tc_2_SLOT23        : InstrItinClass;
+def S_2op_tc_2early_SLOT23   : InstrItinClass;
+def S_2op_tc_3or4x_SLOT23    : InstrItinClass;
+def S_3op_tc_1_SLOT23        : InstrItinClass;
+def S_3op_tc_1or2_SLOT23     : InstrItinClass;
+def S_3op_tc_2_SLOT23        : InstrItinClass;
+def S_3op_tc_2early_SLOT23   : InstrItinClass;
+def S_3op_tc_3_SLOT23        : InstrItinClass;
+def S_3op_tc_3x_SLOT23       : InstrItinClass;
+def NCJ_tc_3or4stall_SLOT0   : InstrItinClass;
+def V2LDST_tc_ld_SLOT01      : InstrItinClass;
+def V2LDST_tc_st_SLOT0       : InstrItinClass;
+def V2LDST_tc_st_SLOT01      : InstrItinClass;
+def V4LDST_tc_ld_SLOT01      : InstrItinClass;
+def V4LDST_tc_st_SLOT0       : InstrItinClass;
+def V4LDST_tc_st_SLOT01      : InstrItinClass;
+def J_tc_2early_SLOT0123     : InstrItinClass;
+def EXTENDER_tc_1_SLOT0123   : InstrItinClass;
+
 
 def HexagonItinerariesV4 :
       ProcessorItineraries<[SLOT0, SLOT1, SLOT2, SLOT3, SLOT_ENDLOOP], [], [
-        InstrItinData<ALU32  , [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
-        InstrItinData<ALU64  , [InstrStage<1, [SLOT2, SLOT3]>]>,
-        InstrItinData<CR     , [InstrStage<1, [SLOT3]>]>,
-        InstrItinData<J      , [InstrStage<1, [SLOT2, SLOT3]>]>,
-        InstrItinData<JR     , [InstrStage<1, [SLOT2]>]>,
-        InstrItinData<LD     , [InstrStage<1, [SLOT0, SLOT1]>]>,
-        InstrItinData<LD0    , [InstrStage<1, [SLOT0]>]>,
-        InstrItinData<M      , [InstrStage<1, [SLOT2, SLOT3]>]>,
-        InstrItinData<ST     , [InstrStage<1, [SLOT0, SLOT1]>]>,
-        InstrItinData<ST0    , [InstrStage<1, [SLOT0]>]>,
-        InstrItinData<S      , [InstrStage<1, [SLOT2, SLOT3]>]>,
-        InstrItinData<SYS    , [InstrStage<1, [SLOT0]>]>,
-        InstrItinData<NV_V4  , [InstrStage<1, [SLOT0]>]>,
-        InstrItinData<MEM_V4 , [InstrStage<1, [SLOT0]>]>,
-        InstrItinData<ENDLOOP, [InstrStage<1, [SLOT_ENDLOOP]>]>,
-        InstrItinData<PREFIX , [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+        // ALU32
+        InstrItinData<ALU32_2op_tc_1_SLOT0123  ,
+                     [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+        InstrItinData<ALU32_2op_tc_2early_SLOT0123,
+                     [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+        InstrItinData<ALU32_3op_tc_1_SLOT0123   ,
+                     [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+        InstrItinData<ALU32_3op_tc_2early_SLOT0123,
+                     [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+        InstrItinData<ALU32_3op_tc_2_SLOT0123   ,
+                     [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+        InstrItinData<ALU32_ADDI_tc_1_SLOT0123  ,
+                     [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+
+        // ALU64
+        InstrItinData<ALU64_tc_1_SLOT23      , [InstrStage<1, [SLOT2, SLOT3]>]>,
+        InstrItinData<ALU64_tc_1or2_SLOT23   , [InstrStage<1, [SLOT2, SLOT3]>]>,
+        InstrItinData<ALU64_tc_2_SLOT23      , [InstrStage<1, [SLOT2, SLOT3]>]>,
+        InstrItinData<ALU64_tc_2early_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
+        InstrItinData<ALU64_tc_3x_SLOT23     , [InstrStage<1, [SLOT2, SLOT3]>]>,
+
+        // CR -> System
+        InstrItinData<CR_tc_2_SLOT3          , [InstrStage<1, [SLOT3]>]>,
+        InstrItinData<CR_tc_2early_SLOT3     , [InstrStage<1, [SLOT3]>]>,
+        InstrItinData<CR_tc_3x_SLOT3         , [InstrStage<1, [SLOT3]>]>,
+
+        // Jump (conditional/unconditional/return etc)
+        // CR
+        InstrItinData<CR_tc_2early_SLOT23    , [InstrStage<1, [SLOT2, SLOT3]>]>,
+        InstrItinData<CR_tc_3x_SLOT23        , [InstrStage<1, [SLOT2, SLOT3]>]>,
+        // J
+        InstrItinData<J_tc_2early_SLOT23     , [InstrStage<1, [SLOT2, SLOT3]>]>,
+        // JR
+        InstrItinData<J_tc_2early_SLOT2      , [InstrStage<1, [SLOT2]>]>,
+
+        //Load
+        InstrItinData<LD_tc_ld_SLOT01        , [InstrStage<1, [SLOT0, SLOT1]>]>,
+        InstrItinData<LD_tc_ld_SLOT0         , [InstrStage<1, [SLOT0]>]>,
+        InstrItinData<LD_tc_3or4stall_SLOT0  , [InstrStage<1, [SLOT0]>]>,
+
+        // M
+        InstrItinData<M_tc_1_SLOT23          , [InstrStage<1, [SLOT2, SLOT3]>]>,
+        InstrItinData<M_tc_1or2_SLOT23       , [InstrStage<1, [SLOT2, SLOT3]>]>,
+        InstrItinData<M_tc_2_SLOT23          , [InstrStage<1, [SLOT2, SLOT3]>]>,
+        InstrItinData<M_tc_3_SLOT23          , [InstrStage<1, [SLOT2, SLOT3]>]>,
+        InstrItinData<M_tc_3x_SLOT23         , [InstrStage<1, [SLOT2, SLOT3]>]>,
+        InstrItinData<M_tc_3or4x_SLOT23      , [InstrStage<1, [SLOT2, SLOT3]>]>,
+
+        // Store
+        // ST
+        InstrItinData<ST_tc_st_SLOT01        , [InstrStage<1, [SLOT0, SLOT1]>]>,
+        // ST0
+        InstrItinData<ST_tc_st_SLOT0         , [InstrStage<1, [SLOT0]>]>,
+        InstrItinData<ST_tc_ld_SLOT0         , [InstrStage<1, [SLOT0]>]>,
+
+        // S
+        InstrItinData<S_2op_tc_1_SLOT23      , [InstrStage<1, [SLOT2, SLOT3]>]>,
+        InstrItinData<S_2op_tc_2_SLOT23      , [InstrStage<1, [SLOT2, SLOT3]>]>,
+        InstrItinData<S_2op_tc_2early_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
+        InstrItinData<S_2op_tc_3or4x_SLOT23  , [InstrStage<1, [SLOT2, SLOT3]>]>,
+        InstrItinData<S_3op_tc_1_SLOT23      , [InstrStage<1, [SLOT2, SLOT3]>]>,
+        InstrItinData<S_3op_tc_1or2_SLOT23   , [InstrStage<1, [SLOT2, SLOT3]>]>,
+        InstrItinData<S_3op_tc_2early_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>]>,
+        InstrItinData<S_3op_tc_2_SLOT23      , [InstrStage<1, [SLOT2, SLOT3]>]>,
+        InstrItinData<S_3op_tc_3_SLOT23      , [InstrStage<1, [SLOT2, SLOT3]>]>,
+        InstrItinData<S_3op_tc_3x_SLOT23     , [InstrStage<1, [SLOT2, SLOT3]>]>,
+
+        // SYS
+        InstrItinData<ST_tc_3stall_SLOT0     , [InstrStage<1, [SLOT0]>]>,
+
+        // New Value Compare Jump
+        InstrItinData<NCJ_tc_3or4stall_SLOT0 , [InstrStage<1, [SLOT0]>]>,
+
+        // Mem ops - MEM_V4
+        InstrItinData<V2LDST_tc_st_SLOT0     , [InstrStage<1, [SLOT0]>]>,
+        InstrItinData<V2LDST_tc_ld_SLOT01    , [InstrStage<1, [SLOT0, SLOT1]>]>,
+        InstrItinData<V2LDST_tc_st_SLOT01    , [InstrStage<1, [SLOT0, SLOT1]>]>,
+        InstrItinData<V4LDST_tc_st_SLOT0     , [InstrStage<1, [SLOT0]>]>,
+        InstrItinData<V4LDST_tc_ld_SLOT01    , [InstrStage<1, [SLOT0, SLOT1]>]>,
+        InstrItinData<V4LDST_tc_st_SLOT01    , [InstrStage<1, [SLOT0, SLOT1]>]>,
+
+        InstrItinData<DUPLEX , [InstrStage<1, [SLOT0]>]>,
+
+        // ENDLOOP
+        InstrItinData<J_tc_2early_SLOT0123   , [InstrStage<1, [SLOT_ENDLOOP]>]>,
+
+        // Extender/PREFIX
+        InstrItinData<EXTENDER_tc_1_SLOT0123,
+                     [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+
+        InstrItinData<COMPOUND , [InstrStage<1, [SLOT2, SLOT3]>]>,
         InstrItinData<PSEUDO , [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
         InstrItinData<PSEUDOM, [InstrStage<1, [SLOT2, SLOT3], 0>,
                                 InstrStage<1, [SLOT2, SLOT3]>]>
diff --git a/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp b/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp
index c37bf9f..9e1e0fd 100644
--- a/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp
+++ b/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp
@@ -11,10 +11,11 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "hexagon-selectiondag-info"
 #include "HexagonTargetMachine.h"
 using namespace llvm;
 
+#define DEBUG_TYPE "hexagon-selectiondag-info"
+
 bool llvm::flag_aligned_memcpy;
 
 HexagonSelectionDAGInfo::HexagonSelectionDAGInfo(const HexagonTargetMachine
diff --git a/lib/Target/Hexagon/HexagonSelectionDAGInfo.h b/lib/Target/Hexagon/HexagonSelectionDAGInfo.h
index 31f278a..8ba6108 100644
--- a/lib/Target/Hexagon/HexagonSelectionDAGInfo.h
+++ b/lib/Target/Hexagon/HexagonSelectionDAGInfo.h
@@ -25,14 +25,13 @@ public:
   explicit HexagonSelectionDAGInfo(const HexagonTargetMachine &TM);
   ~HexagonSelectionDAGInfo();
 
-  virtual
   SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl,
                                   SDValue Chain,
                                   SDValue Dst, SDValue Src,
                                   SDValue Size, unsigned Align,
                                   bool isVolatile, bool AlwaysInline,
                                   MachinePointerInfo DstPtrInfo,
-                                  MachinePointerInfo SrcPtrInfo) const;
+                                  MachinePointerInfo SrcPtrInfo) const override;
 };
 
 }
diff --git a/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp b/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp
index 5303f44..247207f 100644
--- a/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp
+++ b/lib/Target/Hexagon/HexagonSplitConst32AndConst64.cpp
@@ -17,11 +17,10 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "xfer"
-
-#include "HexagonTargetMachine.h"
 #include "HexagonMachineFunctionInfo.h"
 #include "HexagonSubtarget.h"
+#include "HexagonTargetMachine.h"
+#include "HexagonTargetObjectFile.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/CodeGen/LatencyPriorityQueue.h"
 #include "llvm/CodeGen/MachineDominators.h"
@@ -44,21 +43,22 @@
 
 using namespace llvm;
 
+#define DEBUG_TYPE "xfer"
+
 namespace {
 
 class HexagonSplitConst32AndConst64 : public MachineFunctionPass {
-    const HexagonTargetMachine& QTM;
-    const HexagonSubtarget &QST;
+  const HexagonTargetMachine &QTM;
 
  public:
     static char ID;
-    HexagonSplitConst32AndConst64(const HexagonTargetMachine& TM)
-      : MachineFunctionPass(ID), QTM(TM), QST(*TM.getSubtargetImpl()) {}
+    HexagonSplitConst32AndConst64(const HexagonTargetMachine &TM)
+        : MachineFunctionPass(ID), QTM(TM) {}
 
-    const char *getPassName() const {
+    const char *getPassName() const override {
       return "Hexagon Split Const32s and Const64s";
     }
-    bool runOnMachineFunction(MachineFunction &Fn);
+    bool runOnMachineFunction(MachineFunction &Fn) override;
 };
 
 
@@ -67,6 +67,12 @@ char HexagonSplitConst32AndConst64::ID = 0;
 
 bool HexagonSplitConst32AndConst64::runOnMachineFunction(MachineFunction &Fn) {
 
+  const HexagonTargetObjectFile &TLOF =
+      (const HexagonTargetObjectFile &)
+      QTM.getTargetLowering()->getObjFileLowering();
+  if (TLOF.IsSmallDataEnabled())
+    return true;
+
   const TargetInstrInfo *TII = QTM.getInstrInfo();
 
   // Loop over all of the basic blocks
diff --git a/lib/Target/Hexagon/HexagonSplitTFRCondSets.cpp b/lib/Target/Hexagon/HexagonSplitTFRCondSets.cpp
index 8608e08..9601090 100644
--- a/lib/Target/Hexagon/HexagonSplitTFRCondSets.cpp
+++ b/lib/Target/Hexagon/HexagonSplitTFRCondSets.cpp
@@ -26,7 +26,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "xfer"
 #include "Hexagon.h"
 #include "HexagonMachineFunctionInfo.h"
 #include "HexagonSubtarget.h"
@@ -49,6 +48,8 @@
 
 using namespace llvm;
 
+#define DEBUG_TYPE "xfer"
+
 namespace llvm {
   void initializeHexagonSplitTFRCondSetsPass(PassRegistry&);
 }
@@ -67,10 +68,10 @@ class HexagonSplitTFRCondSets : public MachineFunctionPass {
       initializeHexagonSplitTFRCondSetsPass(*PassRegistry::getPassRegistry());
     }
 
-    const char *getPassName() const {
+    const char *getPassName() const override {
       return "Hexagon Split TFRCondSets";
     }
-    bool runOnMachineFunction(MachineFunction &Fn);
+    bool runOnMachineFunction(MachineFunction &Fn) override;
 };
 
 
@@ -221,7 +222,8 @@ bool HexagonSplitTFRCondSets::runOnMachineFunction(MachineFunction &Fn) {
 static void initializePassOnce(PassRegistry &Registry) {
   const char *Name = "Hexagon Split TFRCondSets";
   PassInfo *PI = new PassInfo(Name, "hexagon-split-tfr",
-                              &HexagonSplitTFRCondSets::ID, 0, false, false);
+                              &HexagonSplitTFRCondSets::ID, nullptr, false,
+                              false);
   Registry.registerPass(*PI, true);
 }
 
diff --git a/lib/Target/Hexagon/HexagonSubtarget.cpp b/lib/Target/Hexagon/HexagonSubtarget.cpp
index fca6707..70c87fa 100644
--- a/lib/Target/Hexagon/HexagonSubtarget.cpp
+++ b/lib/Target/Hexagon/HexagonSubtarget.cpp
@@ -18,6 +18,8 @@
 #include "llvm/Support/ErrorHandling.h"
 using namespace llvm;
 
+#define DEBUG_TYPE "hexagon-subtarget"
+
 #define GET_SUBTARGETINFO_CTOR
 #define GET_SUBTARGETINFO_TARGET_DESC
 #include "HexagonGenSubtargetInfo.inc"
diff --git a/lib/Target/Hexagon/HexagonTargetMachine.cpp b/lib/Target/Hexagon/HexagonTargetMachine.cpp
index 9ce1fb8..b923764 100644
--- a/lib/Target/Hexagon/HexagonTargetMachine.cpp
+++ b/lib/Target/Hexagon/HexagonTargetMachine.cpp
@@ -52,7 +52,7 @@ extern "C" void LLVMInitializeHexagonTarget() {
 }
 
 static ScheduleDAGInstrs *createVLIWMachineSched(MachineSchedContext *C) {
-  return new VLIWMachineScheduler(C, new ConvergingVLIWScheduler());
+  return new VLIWMachineScheduler(C, make_unique<ConvergingVLIWScheduler>());
 }
 
 static MachineSchedRegistry
@@ -79,20 +79,6 @@ HexagonTargetMachine::HexagonTargetMachine(const Target &T, StringRef TT,
     initAsmInfo();
 }
 
-// addPassesForOptimizations - Allow the backend (target) to add Target
-// Independent Optimization passes to the Pass Manager.
-bool HexagonTargetMachine::addPassesForOptimizations(PassManagerBase &PM) {
-  if (getOptLevel() != CodeGenOpt::None) {
-    PM.add(createConstantPropagationPass());
-    PM.add(createLoopSimplifyPass());
-    PM.add(createDeadCodeEliminationPass());
-    PM.add(createConstantPropagationPass());
-    PM.add(createLoopUnrollPass());
-    PM.add(createLoopStrengthReducePass());
-  }
-  return true;
-}
-
 namespace {
 /// Hexagon Code Generator Pass Configuration Options.
 class HexagonPassConfig : public TargetPassConfig {
@@ -113,16 +99,16 @@ public:
     return getTM<HexagonTargetMachine>();
   }
 
-  virtual ScheduleDAGInstrs *
-  createMachineScheduler(MachineSchedContext *C) const {
+  ScheduleDAGInstrs *
+  createMachineScheduler(MachineSchedContext *C) const override {
     return createVLIWMachineSched(C);
   }
 
-  virtual bool addInstSelector();
-  virtual bool addPreRegAlloc();
-  virtual bool addPostRegAlloc();
-  virtual bool addPreSched2();
-  virtual bool addPreEmitPass();
+  bool addInstSelector() override;
+  bool addPreRegAlloc() override;
+  bool addPostRegAlloc() override;
+  bool addPreSched2() override;
+  bool addPreEmitPass() override;
 };
 } // namespace
 
@@ -164,16 +150,12 @@ bool HexagonPassConfig::addPostRegAlloc() {
 
 bool HexagonPassConfig::addPreSched2() {
   const HexagonTargetMachine &TM = getHexagonTargetMachine();
-  const HexagonTargetObjectFile &TLOF =
-    (const HexagonTargetObjectFile &)getTargetLowering()->getObjFileLowering();
 
   addPass(createHexagonCopyToCombine());
   if (getOptLevel() != CodeGenOpt::None)
     addPass(&IfConverterID);
-  if (!TLOF.IsSmallDataEnabled()) {
-    addPass(createHexagonSplitConst32AndConst64(TM));
-    printAndVerify("After hexagon split const32/64 pass");
-  }
+  addPass(createHexagonSplitConst32AndConst64(TM));
+  printAndVerify("After hexagon split const32/64 pass");
   return true;
 }
 
diff --git a/lib/Target/Hexagon/HexagonTargetMachine.h b/lib/Target/Hexagon/HexagonTargetMachine.h
index cf8f9aa..70b835e 100644
--- a/lib/Target/Hexagon/HexagonTargetMachine.h
+++ b/lib/Target/Hexagon/HexagonTargetMachine.h
@@ -41,39 +41,37 @@ public:
                        Reloc::Model RM, CodeModel::Model CM,
                        CodeGenOpt::Level OL);
 
-  virtual const HexagonInstrInfo *getInstrInfo() const {
+  const HexagonInstrInfo *getInstrInfo() const override {
     return &InstrInfo;
   }
-  virtual const HexagonSubtarget *getSubtargetImpl() const {
+  const HexagonSubtarget *getSubtargetImpl() const override {
     return &Subtarget;
   }
-  virtual const HexagonRegisterInfo *getRegisterInfo() const {
+  const HexagonRegisterInfo *getRegisterInfo() const override {
     return &InstrInfo.getRegisterInfo();
   }
 
-  virtual const InstrItineraryData* getInstrItineraryData() const {
+  const InstrItineraryData* getInstrItineraryData() const override {
     return InstrItins;
   }
 
 
-  virtual const HexagonTargetLowering* getTargetLowering() const {
+  const HexagonTargetLowering* getTargetLowering() const override {
     return &TLInfo;
   }
 
-  virtual const HexagonFrameLowering* getFrameLowering() const {
+  const HexagonFrameLowering* getFrameLowering() const override {
     return &FrameLowering;
   }
 
-  virtual const HexagonSelectionDAGInfo* getSelectionDAGInfo() const {
+  const HexagonSelectionDAGInfo* getSelectionDAGInfo() const override {
     return &TSInfo;
   }
 
-  virtual const DataLayout       *getDataLayout() const { return &DL; }
+  const DataLayout       *getDataLayout() const override { return &DL; }
   static unsigned getModuleMatchQuality(const Module &M);
 
-  // Pass Pipeline Configuration.
-  virtual bool addPassesForOptimizations(PassManagerBase &PM);
-  virtual TargetPassConfig *createPassConfig(PassManagerBase &PM);
+  TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
 };
 
 extern bool flag_aligned_memcpy;
diff --git a/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp b/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
index 976ff2b..87ce960 100644
--- a/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
+++ b/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
@@ -16,7 +16,6 @@
 // prune the dependence.
 //
 //===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "packets"
 #include "llvm/CodeGen/DFAPacketizer.h"
 #include "Hexagon.h"
 #include "HexagonMachineFunctionInfo.h"
@@ -51,6 +50,8 @@
 
 using namespace llvm;
 
+#define DEBUG_TYPE "packets"
+
 static cl::opt<bool> PacketizeVolatiles("hexagon-packetize-volatiles",
       cl::ZeroOrMore, cl::Hidden, cl::init(true),
       cl::desc("Allow non-solo packetization of volatile memory references"));
@@ -69,7 +70,7 @@ namespace {
       initializeHexagonPacketizerPass(*PassRegistry::getPassRegistry());
     }
 
-    void getAnalysisUsage(AnalysisUsage &AU) const {
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
       AU.setPreservesCFG();
       AU.addRequired<MachineDominatorTree>();
       AU.addRequired<MachineBranchProbabilityInfo>();
@@ -79,11 +80,11 @@ namespace {
       MachineFunctionPass::getAnalysisUsage(AU);
     }
 
-    const char *getPassName() const {
+    const char *getPassName() const override {
       return "Hexagon Packetizer";
     }
 
-    bool runOnMachineFunction(MachineFunction &Fn);
+    bool runOnMachineFunction(MachineFunction &Fn) override;
   };
   char HexagonPacketizer::ID = 0;
 
@@ -121,24 +122,25 @@ namespace {
                           const MachineBranchProbabilityInfo *MBPI);
 
     // initPacketizerState - initialize some internal flags.
-    void initPacketizerState();
+    void initPacketizerState() override;
 
     // ignorePseudoInstruction - Ignore bundling of pseudo instructions.
-    bool ignorePseudoInstruction(MachineInstr *MI, MachineBasicBlock *MBB);
+    bool ignorePseudoInstruction(MachineInstr *MI,
+                                 MachineBasicBlock *MBB) override;
 
     // isSoloInstruction - return true if instruction MI can not be packetized
     // with any other instruction, which means that MI itself is a packet.
-    bool isSoloInstruction(MachineInstr *MI);
+    bool isSoloInstruction(MachineInstr *MI) override;
 
     // isLegalToPacketizeTogether - Is it legal to packetize SUI and SUJ
     // together.
-    bool isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ);
+    bool isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) override;
 
     // isLegalToPruneDependencies - Is it legal to prune dependece between SUI
     // and SUJ.
-    bool isLegalToPruneDependencies(SUnit *SUI, SUnit *SUJ);
+    bool isLegalToPruneDependencies(SUnit *SUI, SUnit *SUJ) override;
 
-    MachineBasicBlock::iterator addToPacket(MachineInstr *MI);
+    MachineBasicBlock::iterator addToPacket(MachineInstr *MI) override;
   private:
     bool IsCallDependent(MachineInstr* MI, SDep::Kind DepType, unsigned DepReg);
     bool PromoteToDotNew(MachineInstr* MI, SDep::Kind DepType,
@@ -390,7 +392,7 @@ static bool IsLoopN(MachineInstr *MI) {
 /// callee-saved register.
 static bool DoesModifyCalleeSavedReg(MachineInstr *MI,
                                      const TargetRegisterInfo *TRI) {
-  for (const uint16_t *CSR = TRI->getCalleeSavedRegs(); *CSR; ++CSR) {
+  for (const MCPhysReg *CSR = TRI->getCalleeSavedRegs(); *CSR; ++CSR) {
     unsigned CalleeSavedReg = *CSR;
     if (MI->modifiesRegister(CalleeSavedReg, TRI))
       return true;
@@ -603,7 +605,7 @@ bool HexagonPacketizerList::CanPromoteToNewValueStore( MachineInstr *MI,
     // evaluate identically
     unsigned predRegNumSrc = 0;
     unsigned predRegNumDst = 0;
-    const TargetRegisterClass* predRegClass = NULL;
+    const TargetRegisterClass* predRegClass = nullptr;
 
     // Get predicate register used in the source instruction
     for(unsigned opNum = 0; opNum < PacketMI->getNumOperands(); opNum++) {
@@ -1172,7 +1174,7 @@ bool HexagonPacketizerList::isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) {
       // of that (IsCallDependent) function. Bug 6216 is opened for this.
       //
       unsigned DepReg = 0;
-      const TargetRegisterClass* RC = NULL;
+      const TargetRegisterClass* RC = nullptr;
       if (DepType == SDep::Data) {
         DepReg = SUJ->Succs[i].getReg();
         RC = QRI->getMinimalPhysRegClass(DepReg);
diff --git a/lib/Target/Hexagon/InstPrinter/HexagonInstPrinter.cpp b/lib/Target/Hexagon/InstPrinter/HexagonInstPrinter.cpp
index 33667f4..9942a60 100644
--- a/lib/Target/Hexagon/InstPrinter/HexagonInstPrinter.cpp
+++ b/lib/Target/Hexagon/InstPrinter/HexagonInstPrinter.cpp
@@ -11,7 +11,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "asm-printer"
 #include "HexagonAsmPrinter.h"
 #include "Hexagon.h"
 #include "HexagonInstPrinter.h"
@@ -24,6 +23,8 @@
 
 using namespace llvm;
 
+#define DEBUG_TYPE "asm-printer"
+
 #define GET_INSTRUCTION_NAME
 #include "HexagonGenAsmWriter.inc"
 
diff --git a/lib/Target/Hexagon/InstPrinter/HexagonInstPrinter.h b/lib/Target/Hexagon/InstPrinter/HexagonInstPrinter.h
index d0cef68..09e3f88 100644
--- a/lib/Target/Hexagon/InstPrinter/HexagonInstPrinter.h
+++ b/lib/Target/Hexagon/InstPrinter/HexagonInstPrinter.h
@@ -27,7 +27,7 @@ namespace llvm {
                                 const MCRegisterInfo &MRI)
       : MCInstPrinter(MAI, MII, MRI), MII(MII) {}
 
-    virtual void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot);
+    void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot) override;
     void printInst(const HexagonMCInst *MI, raw_ostream &O, StringRef Annot);
     virtual StringRef getOpcodeName(unsigned Opcode) const;
     void printInstruction(const MCInst *MI, raw_ostream &O);
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h b/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h
index 8519cf3..f8be77c 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h
@@ -87,70 +87,82 @@ namespace HexagonII {
     // Solo instructions.
     SoloPos  = 5,
     SoloMask = 0x1,
+    // Packed only with A or X-type instructions.
+    SoloAXPos  = 6,
+    SoloAXMask = 0x1,
+    // Only A-type instruction in first slot or nothing.
+    SoloAin1Pos  = 7,
+    SoloAin1Mask = 0x1,
 
     // Predicated instructions.
-    PredicatedPos  = 6,
+    PredicatedPos  = 8,
     PredicatedMask = 0x1,
-    PredicatedFalsePos  = 7,
+    PredicatedFalsePos  = 9,
     PredicatedFalseMask = 0x1,
-    PredicatedNewPos  = 8,
+    PredicatedNewPos  = 10,
     PredicatedNewMask = 0x1,
+    PredicateLatePos  = 11,
+    PredicateLateMask = 0x1,
 
     // New-Value consumer instructions.
-    NewValuePos  = 9,
+    NewValuePos  = 12,
     NewValueMask = 0x1,
-
     // New-Value producer instructions.
-    hasNewValuePos  = 10,
+    hasNewValuePos  = 13,
     hasNewValueMask = 0x1,
-
     // Which operand consumes or produces a new value.
-    NewValueOpPos  = 11,
+    NewValueOpPos  = 14,
     NewValueOpMask = 0x7,
-
-    // Which bits encode the new value.
-    NewValueBitsPos  = 14,
-    NewValueBitsMask = 0x3,
-
     // Stores that can become new-value stores.
-    mayNVStorePos  = 16,
+    mayNVStorePos  = 17,
     mayNVStoreMask = 0x1,
-
     // New-value store instructions.
-    NVStorePos  = 17,
+    NVStorePos  = 18,
     NVStoreMask = 0x1,
+    // Loads that can become current-value loads.
+    mayCVLoadPos  = 19,
+    mayCVLoadMask = 0x1,
+    // Current-value load instructions.
+    CVLoadPos  = 20,
+    CVLoadMask = 0x1,
 
     // Extendable insns.
-    ExtendablePos  = 18,
+    ExtendablePos  = 21,
     ExtendableMask = 0x1,
-
     // Insns must be extended.
-    ExtendedPos  = 19,
+    ExtendedPos  = 22,
     ExtendedMask = 0x1,
-
     // Which operand may be extended.
-    ExtendableOpPos  = 20,
+    ExtendableOpPos  = 23,
     ExtendableOpMask = 0x7,
-
     // Signed or unsigned range.
-    ExtentSignedPos = 23,
+    ExtentSignedPos  = 26,
     ExtentSignedMask = 0x1,
-
     // Number of bits of range before extending operand.
-    ExtentBitsPos  = 24,
+    ExtentBitsPos  = 27,
     ExtentBitsMask = 0x1f,
+    // Alignment power-of-two before extending operand.
+    ExtentAlignPos  = 32,
+    ExtentAlignMask = 0x3,
 
     // Valid subtargets
-    validSubTargetPos = 29,
+    validSubTargetPos  = 34,
     validSubTargetMask = 0xf,
 
     // Addressing mode for load/store instructions.
-    AddrModePos = 33,
+    AddrModePos  = 40,
     AddrModeMask = 0x7,
+    // Access size for load/store instructions.
+    MemAccessSizePos = 43,
+    MemAccesSizeMask = 0x7,
+
+    // Branch predicted taken.
+    TakenPos = 47,
+    TakenMask = 0x1,
 
-    // Access size of memory access instructions (load/store).
-    MemAccessSizePos = 36,
-    MemAccesSizeMask = 0x7
+    // Floating-point instructions.
+    FPPos  = 48,
+    FPMask = 0x1
   };
 
   // *** The code above must match HexagonInstrFormat*.td *** //
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp
index f1a65c3..141e514 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp
@@ -21,7 +21,7 @@ void HexagonMCAsmInfo::anchor() {}
 HexagonMCAsmInfo::HexagonMCAsmInfo(StringRef TT) {
   Data16bitsDirective = "\t.half\t";
   Data32bitsDirective = "\t.word\t";
-  Data64bitsDirective = 0;  // .xword is only supported by V9.
+  Data64bitsDirective = nullptr;  // .xword is only supported by V9.
   ZeroDirective = "\t.skip\t";
   CommentString = "//";
   HasLEB128 = true;
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.h b/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.h
index bd8cb76..953d804 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.h
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.h
@@ -19,7 +19,7 @@
 
 namespace llvm {
   class HexagonMCAsmInfo : public MCAsmInfoELF {
-    virtual void anchor();
+    void anchor() override;
   public:
     explicit HexagonMCAsmInfo(StringRef TT);
   };
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCInst.h b/lib/Target/Hexagon/MCTargetDesc/HexagonMCInst.h
index 3ca71f0..3c52d45 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCInst.h
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCInst.h
@@ -31,7 +31,7 @@ namespace llvm {
 
   public:
     explicit HexagonMCInst():
-      MCInst(), MCID(0), packetStart(0), packetEnd(0) {};
+      MCInst(), MCID(nullptr), packetStart(0), packetEnd(0) {};
     HexagonMCInst(const MCInstrDesc& mcid):
       MCInst(), MCID(&mcid), packetStart(0), packetEnd(0) {};
 
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
index 7f103d8..581674d 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
@@ -23,6 +23,8 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/TargetRegistry.h"
 
+using namespace llvm;
+
 #define GET_INSTRINFO_MC_DESC
 #include "HexagonGenInstrInfo.inc"
 
@@ -32,8 +34,6 @@
 #define GET_REGINFO_MC_DESC
 #include "HexagonGenRegisterInfo.inc"
 
-using namespace llvm;
-
 static MCInstrInfo *createHexagonMCInstrInfo() {
   MCInstrInfo *X = new MCInstrInfo();
   InitHexagonMCInstrInfo(X);
@@ -60,7 +60,7 @@ static MCAsmInfo *createHexagonMCAsmInfo(const MCRegisterInfo &MRI,
 
   // VirtualFP = (R30 + #0).
   MCCFIInstruction Inst = MCCFIInstruction::createDefCfa(
-      0, Hexagon::R30, 0);
+      nullptr, Hexagon::R30, 0);
   MAI->addInitialFrameState(Inst);
 
   return MAI;
diff --git a/lib/Target/LLVMBuild.txt b/lib/Target/LLVMBuild.txt
index 13abaf8..1b0837c 100644
--- a/lib/Target/LLVMBuild.txt
+++ b/lib/Target/LLVMBuild.txt
@@ -16,7 +16,7 @@
 ;===------------------------------------------------------------------------===;
 
 [common]
-subdirectories = AArch64 ARM ARM64 CppBackend Hexagon MSP430 NVPTX Mips PowerPC R600 Sparc SystemZ X86 XCore
+subdirectories = ARM AArch64 CppBackend Hexagon MSP430 NVPTX Mips PowerPC R600 Sparc SystemZ X86 XCore
 
 ; This is a special group whose required libraries are extended (by llvm-build)
 ; with the best execution engine (the native JIT, if available, or the
diff --git a/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.cpp b/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.cpp
index 4b12aea..acf1214 100644
--- a/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.cpp
+++ b/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.cpp
@@ -11,7 +11,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "asm-printer"
 #include "MSP430InstPrinter.h"
 #include "MSP430.h"
 #include "llvm/MC/MCAsmInfo.h"
@@ -21,6 +20,8 @@
 #include "llvm/Support/FormattedStream.h"
 using namespace llvm;
 
+#define DEBUG_TYPE "asm-printer"
+
 
 // Include the auto-generated portion of the assembly writer.
 #include "MSP430GenAsmWriter.inc"
@@ -44,7 +45,7 @@ void MSP430InstPrinter::printPCRelImmOperand(const MCInst *MI, unsigned OpNo,
 
 void MSP430InstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
                                      raw_ostream &O, const char *Modifier) {
-  assert((Modifier == 0 || Modifier[0] == 0) && "No modifiers supported");
+  assert((Modifier == nullptr || Modifier[0] == 0) && "No modifiers supported");
   const MCOperand &Op = MI->getOperand(OpNo);
   if (Op.isReg()) {
     O << getRegisterName(Op.getReg());
diff --git a/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.h b/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.h
index d32eb3a..5afbd20 100644
--- a/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.h
+++ b/lib/Target/MSP430/InstPrinter/MSP430InstPrinter.h
@@ -25,17 +25,17 @@ namespace llvm {
                       const MCRegisterInfo &MRI)
       : MCInstPrinter(MAI, MII, MRI) {}
 
-    virtual void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot);
+    void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot) override;
 
     // Autogenerated by tblgen.
     void printInstruction(const MCInst *MI, raw_ostream &O);
     static const char *getRegisterName(unsigned RegNo);
 
     void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O,
-                      const char *Modifier = 0);
+                      const char *Modifier = nullptr);
     void printPCRelImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
     void printSrcMemOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O,
-                            const char *Modifier = 0);
+                            const char *Modifier = nullptr);
     void printCCOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
 
   };
diff --git a/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.h b/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.h
index a7e0e58..ef805bb 100644
--- a/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.h
+++ b/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.h
@@ -20,7 +20,7 @@ namespace llvm {
   class StringRef;
 
   class MSP430MCAsmInfo : public MCAsmInfoELF {
-    virtual void anchor();
+    void anchor() override;
   public:
     explicit MSP430MCAsmInfo(StringRef TT);
   };
diff --git a/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp b/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp
index 530e6aa..72adb45 100644
--- a/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp
+++ b/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp
@@ -20,6 +20,8 @@
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/Support/TargetRegistry.h"
 
+using namespace llvm;
+
 #define GET_INSTRINFO_MC_DESC
 #include "MSP430GenInstrInfo.inc"
 
@@ -29,8 +31,6 @@
 #define GET_REGINFO_MC_DESC
 #include "MSP430GenRegisterInfo.inc"
 
-using namespace llvm;
-
 static MCInstrInfo *createMSP430MCInstrInfo() {
   MCInstrInfo *X = new MCInstrInfo();
   InitMSP430MCInstrInfo(X);
@@ -66,7 +66,7 @@ static MCInstPrinter *createMSP430MCInstPrinter(const Target &T,
                                                 const MCSubtargetInfo &STI) {
   if (SyntaxVariant == 0)
     return new MSP430InstPrinter(MAI, MII, MRI);
-  return 0;
+  return nullptr;
 }
 
 extern "C" void LLVMInitializeMSP430TargetMC() {
diff --git a/lib/Target/MSP430/MSP430AsmPrinter.cpp b/lib/Target/MSP430/MSP430AsmPrinter.cpp
index 91065d8..22a973e 100644
--- a/lib/Target/MSP430/MSP430AsmPrinter.cpp
+++ b/lib/Target/MSP430/MSP430AsmPrinter.cpp
@@ -12,7 +12,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "asm-printer"
 #include "MSP430.h"
 #include "InstPrinter/MSP430InstPrinter.h"
 #include "MSP430InstrInfo.h"
@@ -35,27 +34,29 @@
 #include "llvm/Support/raw_ostream.h"
 using namespace llvm;
 
+#define DEBUG_TYPE "asm-printer"
+
 namespace {
   class MSP430AsmPrinter : public AsmPrinter {
   public:
     MSP430AsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
       : AsmPrinter(TM, Streamer) {}
 
-    virtual const char *getPassName() const {
+    const char *getPassName() const override {
       return "MSP430 Assembly Printer";
     }
 
     void printOperand(const MachineInstr *MI, int OpNum,
-                      raw_ostream &O, const char* Modifier = 0);
+                      raw_ostream &O, const char* Modifier = nullptr);
     void printSrcMemOperand(const MachineInstr *MI, int OpNum,
                             raw_ostream &O);
     bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
                          unsigned AsmVariant, const char *ExtraCode,
-                         raw_ostream &O);
+                         raw_ostream &O) override;
     bool PrintAsmMemoryOperand(const MachineInstr *MI,
                                unsigned OpNo, unsigned AsmVariant,
-                               const char *ExtraCode, raw_ostream &O);
-    void EmitInstruction(const MachineInstr *MI);
+                               const char *ExtraCode, raw_ostream &O) override;
+    void EmitInstruction(const MachineInstr *MI) override;
   };
 } // end of anonymous namespace
 
diff --git a/lib/Target/MSP430/MSP430BranchSelector.cpp b/lib/Target/MSP430/MSP430BranchSelector.cpp
index f128427..a96930a 100644
--- a/lib/Target/MSP430/MSP430BranchSelector.cpp
+++ b/lib/Target/MSP430/MSP430BranchSelector.cpp
@@ -15,7 +15,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "msp430-branch-select"
 #include "MSP430.h"
 #include "MSP430InstrInfo.h"
 #include "llvm/ADT/Statistic.h"
@@ -25,6 +24,8 @@
 #include "llvm/Target/TargetMachine.h"
 using namespace llvm;
 
+#define DEBUG_TYPE "msp430-branch-select"
+
 STATISTIC(NumExpanded, "Number of branches expanded to long format");
 
 namespace {
@@ -35,9 +36,9 @@ namespace {
     /// BlockSizes - The sizes of the basic blocks in the function.
     std::vector<unsigned> BlockSizes;
 
-    virtual bool runOnMachineFunction(MachineFunction &Fn);
+    bool runOnMachineFunction(MachineFunction &Fn) override;
 
-    virtual const char *getPassName() const {
+    const char *getPassName() const override {
       return "MSP430 Branch Selector";
     }
   };
diff --git a/lib/Target/MSP430/MSP430FrameLowering.cpp b/lib/Target/MSP430/MSP430FrameLowering.cpp
index ce078a3..82c8b29 100644
--- a/lib/Target/MSP430/MSP430FrameLowering.cpp
+++ b/lib/Target/MSP430/MSP430FrameLowering.cpp
@@ -242,7 +242,7 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
       // alignment boundary.
       Amount = (Amount+StackAlign-1)/StackAlign*StackAlign;
 
-      MachineInstr *New = 0;
+      MachineInstr *New = nullptr;
       if (Old->getOpcode() == TII.getCallFrameSetupOpcode()) {
         New = BuildMI(MF, Old->getDebugLoc(),
                       TII.get(MSP430::SUB16ri), MSP430::SPW)
diff --git a/lib/Target/MSP430/MSP430FrameLowering.h b/lib/Target/MSP430/MSP430FrameLowering.h
index 8370714..d464dd9 100644
--- a/lib/Target/MSP430/MSP430FrameLowering.h
+++ b/lib/Target/MSP430/MSP430FrameLowering.h
@@ -32,26 +32,26 @@ public:
 
   /// emitProlog/emitEpilog - These methods insert prolog and epilog code into
   /// the function.
-  void emitPrologue(MachineFunction &MF) const;
-  void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;
+  void emitPrologue(MachineFunction &MF) const override;
+  void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
 
   void eliminateCallFramePseudoInstr(MachineFunction &MF,
-                                     MachineBasicBlock &MBB,
-                                     MachineBasicBlock::iterator I) const;
+                                  MachineBasicBlock &MBB,
+                                  MachineBasicBlock::iterator I) const override;
 
   bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
                                  MachineBasicBlock::iterator MI,
                                  const std::vector<CalleeSavedInfo> &CSI,
-                                 const TargetRegisterInfo *TRI) const;
+                                 const TargetRegisterInfo *TRI) const override;
   bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
-                                   MachineBasicBlock::iterator MI,
-                                   const std::vector<CalleeSavedInfo> &CSI,
-                                   const TargetRegisterInfo *TRI) const;
+                                  MachineBasicBlock::iterator MI,
+                                  const std::vector<CalleeSavedInfo> &CSI,
+                                  const TargetRegisterInfo *TRI) const override;
 
-  bool hasFP(const MachineFunction &MF) const;
-  bool hasReservedCallFrame(const MachineFunction &MF) const;
+  bool hasFP(const MachineFunction &MF) const override;
+  bool hasReservedCallFrame(const MachineFunction &MF) const override;
   void processFunctionBeforeFrameFinalized(MachineFunction &MF,
-                                       RegScavenger *RS = NULL) const;
+                                     RegScavenger *RS = nullptr) const override;
 };
 
 } // End llvm namespace
diff --git a/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp b/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp
index 4152829..a9b9035 100644
--- a/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp
+++ b/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp
@@ -31,6 +31,8 @@
 #include "llvm/Target/TargetLowering.h"
 using namespace llvm;
 
+#define DEBUG_TYPE "msp430-isel"
+
 namespace {
   struct MSP430ISelAddressMode {
     enum {
@@ -52,17 +54,17 @@ namespace {
     unsigned Align;    // CP alignment.
 
     MSP430ISelAddressMode()
-      : BaseType(RegBase), Disp(0), GV(0), CP(0), BlockAddr(0),
-        ES(0), JT(-1), Align(0) {
+      : BaseType(RegBase), Disp(0), GV(nullptr), CP(nullptr),
+        BlockAddr(nullptr), ES(nullptr), JT(-1), Align(0) {
     }
 
     bool hasSymbolicDisplacement() const {
-      return GV != 0 || CP != 0 || ES != 0 || JT != -1;
+      return GV != nullptr || CP != nullptr || ES != nullptr || JT != -1;
     }
 
     void dump() {
       errs() << "MSP430ISelAddressMode " << this << '\n';
-      if (BaseType == RegBase && Base.Reg.getNode() != 0) {
+      if (BaseType == RegBase && Base.Reg.getNode() != nullptr) {
         errs() << "Base.Reg ";
         Base.Reg.getNode()->dump();
       } else if (BaseType == FrameIndexBase) {
@@ -99,7 +101,7 @@ namespace {
         Lowering(*TM.getTargetLowering()),
         Subtarget(*TM.getSubtargetImpl()) { }
 
-    virtual const char *getPassName() const {
+    const char *getPassName() const override {
       return "MSP430 DAG->DAG Pattern Instruction Selection";
     }
 
@@ -107,15 +109,14 @@ namespace {
     bool MatchWrapper(SDValue N, MSP430ISelAddressMode &AM);
     bool MatchAddressBase(SDValue N, MSP430ISelAddressMode &AM);
 
-    virtual bool
-    SelectInlineAsmMemoryOperand(const SDValue &Op, char ConstraintCode,
-                                 std::vector<SDValue> &OutOps);
+    bool SelectInlineAsmMemoryOperand(const SDValue &Op, char ConstraintCode,
+                                      std::vector<SDValue> &OutOps) override;
 
     // Include the pieces autogenerated from the target description.
   #include "MSP430GenDAGISel.inc"
 
   private:
-    SDNode *Select(SDNode *N);
+    SDNode *Select(SDNode *N) override;
     SDNode *SelectIndexedLoad(SDNode *Op);
     SDNode *SelectIndexedBinOp(SDNode *Op, SDValue N1, SDValue N2,
                                unsigned Opc8, unsigned Opc16);
@@ -199,7 +200,7 @@ bool MSP430DAGToDAGISel::MatchAddress(SDValue N, MSP430ISelAddressMode &AM) {
 
   case ISD::FrameIndex:
     if (AM.BaseType == MSP430ISelAddressMode::RegBase
-        && AM.Base.Reg.getNode() == 0) {
+        && AM.Base.Reg.getNode() == nullptr) {
       AM.BaseType = MSP430ISelAddressMode::FrameIndexBase;
       AM.Base.FrameIndex = cast<FrameIndexSDNode>(N)->getIndex();
       return false;
@@ -228,7 +229,7 @@ bool MSP430DAGToDAGISel::MatchAddress(SDValue N, MSP430ISelAddressMode &AM) {
       // Start with the LHS as an addr mode.
       if (!MatchAddress(N.getOperand(0), AM) &&
           // Address could not have picked a GV address for the displacement.
-          AM.GV == NULL &&
+          AM.GV == nullptr &&
           // Check to see if the LHS & C is zero.
           CurDAG->MaskedValueIsZero(N.getOperand(0), CN->getAPIntValue())) {
         AM.Disp += Offset;
@@ -330,7 +331,7 @@ static bool isValidIndexedLoad(const LoadSDNode *LD) {
 SDNode *MSP430DAGToDAGISel::SelectIndexedLoad(SDNode *N) {
   LoadSDNode *LD = cast<LoadSDNode>(N);
   if (!isValidIndexedLoad(LD))
-    return NULL;
+    return nullptr;
 
   MVT VT = LD->getMemoryVT().getSimpleVT();
 
@@ -343,7 +344,7 @@ SDNode *MSP430DAGToDAGISel::SelectIndexedLoad(SDNode *N) {
     Opcode = MSP430::MOV16rm_POST;
     break;
   default:
-    return NULL;
+    return nullptr;
   }
 
    return CurDAG->getMachineNode(Opcode, SDLoc(N),
@@ -359,7 +360,7 @@ SDNode *MSP430DAGToDAGISel::SelectIndexedBinOp(SDNode *Op,
       IsLegalToFold(N1, Op, Op, OptLevel)) {
     LoadSDNode *LD = cast<LoadSDNode>(N1);
     if (!isValidIndexedLoad(LD))
-      return NULL;
+      return nullptr;
 
     MVT VT = LD->getMemoryVT().getSimpleVT();
     unsigned Opc = (VT == MVT::i16 ? Opc16 : Opc8);
@@ -367,9 +368,7 @@ SDNode *MSP430DAGToDAGISel::SelectIndexedBinOp(SDNode *Op,
     MemRefs0[0] = cast<MemSDNode>(N1)->getMemOperand();
     SDValue Ops0[] = { N2, LD->getBasePtr(), LD->getChain() };
     SDNode *ResNode =
-      CurDAG->SelectNodeTo(Op, Opc,
-                           VT, MVT::i16, MVT::Other,
-                           Ops0, 3);
+      CurDAG->SelectNodeTo(Op, Opc, VT, MVT::i16, MVT::Other, Ops0);
     cast<MachineSDNode>(ResNode)->setMemRefs(MemRefs0, MemRefs0 + 1);
     // Transfer chain.
     ReplaceUses(SDValue(N1.getNode(), 2), SDValue(ResNode, 2));
@@ -378,7 +377,7 @@ SDNode *MSP430DAGToDAGISel::SelectIndexedBinOp(SDNode *Op,
     return ResNode;
   }
 
-  return NULL;
+  return nullptr;
 }
 
 
@@ -396,7 +395,7 @@ SDNode *MSP430DAGToDAGISel::Select(SDNode *Node) {
           Node->dump(CurDAG);
           errs() << "\n");
     Node->setNodeId(-1);
-    return NULL;
+    return nullptr;
   }
 
   // Few custom selection stuff.
@@ -484,7 +483,7 @@ SDNode *MSP430DAGToDAGISel::Select(SDNode *Node) {
   SDNode *ResNode = SelectCode(Node);
 
   DEBUG(errs() << "=> ");
-  if (ResNode == NULL || ResNode == Node)
+  if (ResNode == nullptr || ResNode == Node)
     DEBUG(Node->dump(CurDAG));
   else
     DEBUG(ResNode->dump(CurDAG));
diff --git a/lib/Target/MSP430/MSP430ISelLowering.cpp b/lib/Target/MSP430/MSP430ISelLowering.cpp
index fe163d4..c5901bc 100644
--- a/lib/Target/MSP430/MSP430ISelLowering.cpp
+++ b/lib/Target/MSP430/MSP430ISelLowering.cpp
@@ -11,8 +11,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "msp430-lower"
-
 #include "MSP430ISelLowering.h"
 #include "MSP430.h"
 #include "MSP430MachineFunctionInfo.h"
@@ -38,6 +36,8 @@
 #include "llvm/Support/raw_ostream.h"
 using namespace llvm;
 
+#define DEBUG_TYPE "msp430-lower"
+
 typedef enum {
   NoHWMult,
   HWMultIntr,
@@ -284,7 +284,7 @@ template<typename ArgT>
 static void AnalyzeArguments(CCState &State,
                              SmallVectorImpl<CCValAssign> &ArgLocs,
                              const SmallVectorImpl<ArgT> &Args) {
-  static const uint16_t RegList[] = {
+  static const MCPhysReg RegList[] = {
     MSP430::R15W, MSP430::R14W, MSP430::R13W, MSP430::R12W
   };
   static const unsigned NbRegs = array_lengthof(RegList);
@@ -462,7 +462,7 @@ MSP430TargetLowering::LowerCCCArguments(SDValue Chain,
           errs() << "LowerFormalArguments Unhandled argument type: "
                << RegVT.getSimpleVT().SimpleTy << "\n";
 #endif
-          llvm_unreachable(0);
+          llvm_unreachable(nullptr);
         }
       case MVT::i16:
         unsigned VReg = RegInfo.createVirtualRegister(&MSP430::GR16RegClass);
@@ -568,7 +568,7 @@ MSP430TargetLowering::LowerReturn(SDValue Chain,
   if (Flag.getNode())
     RetOps.push_back(Flag);
 
-  return DAG.getNode(Opc, dl, MVT::Other, &RetOps[0], RetOps.size());
+  return DAG.getNode(Opc, dl, MVT::Other, RetOps);
 }
 
 /// LowerCCCCallTo - functions arguments are copied from virtual regs to
@@ -629,7 +629,7 @@ MSP430TargetLowering::LowerCCCCallTo(SDValue Chain, SDValue Callee,
     } else {
       assert(VA.isMemLoc());
 
-      if (StackPtr.getNode() == 0)
+      if (!StackPtr.getNode())
         StackPtr = DAG.getCopyFromReg(Chain, dl, MSP430::SPW, getPointerTy());
 
       SDValue PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(),
@@ -659,8 +659,7 @@ MSP430TargetLowering::LowerCCCCallTo(SDValue Chain, SDValue Callee,
   // Transform all store nodes into one single node because all store nodes are
   // independent of each other.
   if (!MemOpChains.empty())
-    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
-                        &MemOpChains[0], MemOpChains.size());
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
 
   // Build a sequence of copy-to-reg nodes chained together with token chain and
   // flag operands which copy the outgoing args into registers.  The InFlag in
@@ -695,7 +694,7 @@ MSP430TargetLowering::LowerCCCCallTo(SDValue Chain, SDValue Callee,
   if (InFlag.getNode())
     Ops.push_back(InFlag);
 
-  Chain = DAG.getNode(MSP430ISD::CALL, dl, NodeTys, &Ops[0], Ops.size());
+  Chain = DAG.getNode(MSP430ISD::CALL, dl, NodeTys, Ops);
   InFlag = Chain.getValue(1);
 
   // Create the CALLSEQ_END node.
@@ -986,7 +985,7 @@ SDValue MSP430TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
     Ops.push_back(Zero);
     Ops.push_back(TargetCC);
     Ops.push_back(Flag);
-    return DAG.getNode(MSP430ISD::SELECT_CC, dl, VTs, &Ops[0], Ops.size());
+    return DAG.getNode(MSP430ISD::SELECT_CC, dl, VTs, Ops);
   }
 }
 
@@ -1009,7 +1008,7 @@ SDValue MSP430TargetLowering::LowerSELECT_CC(SDValue Op,
   Ops.push_back(TargetCC);
   Ops.push_back(Flag);
 
-  return DAG.getNode(MSP430ISD::SELECT_CC, dl, VTs, &Ops[0], Ops.size());
+  return DAG.getNode(MSP430ISD::SELECT_CC, dl, VTs, Ops);
 }
 
 SDValue MSP430TargetLowering::LowerSIGN_EXTEND(SDValue Op,
@@ -1148,7 +1147,7 @@ bool MSP430TargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op,
 
 const char *MSP430TargetLowering::getTargetNodeName(unsigned Opcode) const {
   switch (Opcode) {
-  default: return NULL;
+  default: return nullptr;
   case MSP430ISD::RET_FLAG:           return "MSP430ISD::RET_FLAG";
   case MSP430ISD::RETI_FLAG:          return "MSP430ISD::RETI_FLAG";
   case MSP430ISD::RRA:                return "MSP430ISD::RRA";
diff --git a/lib/Target/MSP430/MSP430ISelLowering.h b/lib/Target/MSP430/MSP430ISelLowering.h
index 85a861e..3ced61d 100644
--- a/lib/Target/MSP430/MSP430ISelLowering.h
+++ b/lib/Target/MSP430/MSP430ISelLowering.h
@@ -73,14 +73,14 @@ namespace llvm {
   public:
     explicit MSP430TargetLowering(MSP430TargetMachine &TM);
 
-    virtual MVT getScalarShiftAmountTy(EVT LHSTy) const { return MVT::i8; }
+    MVT getScalarShiftAmountTy(EVT LHSTy) const override { return MVT::i8; }
 
     /// LowerOperation - Provide custom lowering hooks for some operations.
-    virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
 
     /// getTargetNodeName - This method returns the name of a target specific
     /// DAG node.
-    virtual const char *getTargetNodeName(unsigned Opcode) const;
+    const char *getTargetNodeName(unsigned Opcode) const override;
 
     SDValue LowerShifts(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
@@ -97,15 +97,16 @@ namespace llvm {
     SDValue getReturnAddressFrameIndex(SelectionDAG &DAG) const;
 
     TargetLowering::ConstraintType
-    getConstraintType(const std::string &Constraint) const;
+    getConstraintType(const std::string &Constraint) const override;
     std::pair<unsigned, const TargetRegisterClass*>
-    getRegForInlineAsmConstraint(const std::string &Constraint, MVT VT) const;
+    getRegForInlineAsmConstraint(const std::string &Constraint,
+                                 MVT VT) const override;
 
     /// isTruncateFree - Return true if it's free to truncate a value of type
     /// Ty1 to type Ty2. e.g. On msp430 it's free to truncate a i16 value in
     /// register R15W to i8 by referencing its sub-register R15B.
-    virtual bool isTruncateFree(Type *Ty1, Type *Ty2) const;
-    virtual bool isTruncateFree(EVT VT1, EVT VT2) const;
+    bool isTruncateFree(Type *Ty1, Type *Ty2) const override;
+    bool isTruncateFree(EVT VT1, EVT VT2) const override;
 
     /// isZExtFree - Return true if any actual instruction that defines a value
     /// of type Ty1 implicit zero-extends the value to Ty2 in the result
@@ -115,12 +116,12 @@ namespace llvm {
     /// necessarily apply to truncate instructions. e.g. on msp430, all
     /// instructions that define 8-bit values implicit zero-extend the result
     /// out to 16 bits.
-    virtual bool isZExtFree(Type *Ty1, Type *Ty2) const;
-    virtual bool isZExtFree(EVT VT1, EVT VT2) const;
-    virtual bool isZExtFree(SDValue Val, EVT VT2) const;
+    bool isZExtFree(Type *Ty1, Type *Ty2) const override;
+    bool isZExtFree(EVT VT1, EVT VT2) const override;
+    bool isZExtFree(SDValue Val, EVT VT2) const override;
 
     MachineBasicBlock* EmitInstrWithCustomInserter(MachineInstr *MI,
-                                                   MachineBasicBlock *BB) const;
+                                                   MachineBasicBlock *BB) const override;
     MachineBasicBlock* EmitShiftInstr(MachineInstr *MI,
                                       MachineBasicBlock *BB) const;
 
@@ -148,28 +149,27 @@ namespace llvm {
                             SDLoc dl, SelectionDAG &DAG,
                             SmallVectorImpl<SDValue> &InVals) const;
 
-    virtual SDValue
+    SDValue
       LowerFormalArguments(SDValue Chain,
                            CallingConv::ID CallConv, bool isVarArg,
                            const SmallVectorImpl<ISD::InputArg> &Ins,
                            SDLoc dl, SelectionDAG &DAG,
-                           SmallVectorImpl<SDValue> &InVals) const;
-    virtual SDValue
+                           SmallVectorImpl<SDValue> &InVals) const override;
+    SDValue
       LowerCall(TargetLowering::CallLoweringInfo &CLI,
-                SmallVectorImpl<SDValue> &InVals) const;
-
-    virtual SDValue
-      LowerReturn(SDValue Chain,
-                  CallingConv::ID CallConv, bool isVarArg,
-                  const SmallVectorImpl<ISD::OutputArg> &Outs,
-                  const SmallVectorImpl<SDValue> &OutVals,
-                  SDLoc dl, SelectionDAG &DAG) const;
-
-    virtual bool getPostIndexedAddressParts(SDNode *N, SDNode *Op,
-                                            SDValue &Base,
-                                            SDValue &Offset,
-                                            ISD::MemIndexedMode &AM,
-                                            SelectionDAG &DAG) const;
+                SmallVectorImpl<SDValue> &InVals) const override;
+
+    SDValue LowerReturn(SDValue Chain,
+                        CallingConv::ID CallConv, bool isVarArg,
+                        const SmallVectorImpl<ISD::OutputArg> &Outs,
+                        const SmallVectorImpl<SDValue> &OutVals,
+                        SDLoc dl, SelectionDAG &DAG) const override;
+
+    bool getPostIndexedAddressParts(SDNode *N, SDNode *Op,
+                                    SDValue &Base,
+                                    SDValue &Offset,
+                                    ISD::MemIndexedMode &AM,
+                                    SelectionDAG &DAG) const override;
 
     const MSP430Subtarget &Subtarget;
     const DataLayout *TD;
diff --git a/lib/Target/MSP430/MSP430InstrInfo.cpp b/lib/Target/MSP430/MSP430InstrInfo.cpp
index 38f73b9..0c04ddb 100644
--- a/lib/Target/MSP430/MSP430InstrInfo.cpp
+++ b/lib/Target/MSP430/MSP430InstrInfo.cpp
@@ -22,11 +22,11 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/TargetRegistry.h"
 
+using namespace llvm;
+
 #define GET_INSTRINFO_CTOR_DTOR
 #include "MSP430GenInstrInfo.inc"
 
-using namespace llvm;
-
 // Pin the vtable to this file.
 void MSP430InstrInfo::anchor() {}
 
@@ -208,11 +208,11 @@ bool MSP430InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
       while (std::next(I) != MBB.end())
         std::next(I)->eraseFromParent();
       Cond.clear();
-      FBB = 0;
+      FBB = nullptr;
 
       // Delete the JMP if it's equivalent to a fall-through.
       if (MBB.isLayoutSuccessor(I->getOperand(0).getMBB())) {
-        TBB = 0;
+        TBB = nullptr;
         I->eraseFromParent();
         I = MBB.end();
         continue;
diff --git a/lib/Target/MSP430/MSP430InstrInfo.h b/lib/Target/MSP430/MSP430InstrInfo.h
index ad2b8cc..1ffcebb 100644
--- a/lib/Target/MSP430/MSP430InstrInfo.h
+++ b/lib/Target/MSP430/MSP430InstrInfo.h
@@ -50,40 +50,41 @@ public:
   /// such, whenever a client has an instance of instruction info, it should
   /// always be able to get register info as well (through this method).
   ///
-  virtual const TargetRegisterInfo &getRegisterInfo() const { return RI; }
+  const TargetRegisterInfo &getRegisterInfo() const { return RI; }
 
   void copyPhysReg(MachineBasicBlock &MBB,
                    MachineBasicBlock::iterator I, DebugLoc DL,
                    unsigned DestReg, unsigned SrcReg,
-                   bool KillSrc) const;
-
-  virtual void storeRegToStackSlot(MachineBasicBlock &MBB,
-                                   MachineBasicBlock::iterator MI,
-                                   unsigned SrcReg, bool isKill,
-                                   int FrameIndex,
-                                   const TargetRegisterClass *RC,
-                                   const TargetRegisterInfo *TRI) const;
-  virtual void loadRegFromStackSlot(MachineBasicBlock &MBB,
-                                    MachineBasicBlock::iterator MI,
-                                    unsigned DestReg, int FrameIdx,
-                                    const TargetRegisterClass *RC,
-                                    const TargetRegisterInfo *TRI) const;
+                   bool KillSrc) const override;
+
+  void storeRegToStackSlot(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator MI,
+                           unsigned SrcReg, bool isKill,
+                           int FrameIndex,
+                           const TargetRegisterClass *RC,
+                           const TargetRegisterInfo *TRI) const override;
+  void loadRegFromStackSlot(MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator MI,
+                            unsigned DestReg, int FrameIdx,
+                            const TargetRegisterClass *RC,
+                            const TargetRegisterInfo *TRI) const override;
 
   unsigned GetInstSizeInBytes(const MachineInstr *MI) const;
 
   // Branch folding goodness
-  bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const;
-  bool isUnpredicatedTerminator(const MachineInstr *MI) const;
+  bool
+  ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override;
+  bool isUnpredicatedTerminator(const MachineInstr *MI) const override;
   bool AnalyzeBranch(MachineBasicBlock &MBB,
                      MachineBasicBlock *&TBB, MachineBasicBlock *&FBB,
                      SmallVectorImpl<MachineOperand> &Cond,
-                     bool AllowModify) const;
+                     bool AllowModify) const override;
 
-  unsigned RemoveBranch(MachineBasicBlock &MBB) const;
+  unsigned RemoveBranch(MachineBasicBlock &MBB) const override;
   unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
                         MachineBasicBlock *FBB,
                         const SmallVectorImpl<MachineOperand> &Cond,
-                        DebugLoc DL) const;
+                        DebugLoc DL) const override;
 
 };
 
diff --git a/lib/Target/MSP430/MSP430RegisterInfo.cpp b/lib/Target/MSP430/MSP430RegisterInfo.cpp
index f64017e..341fb64 100644
--- a/lib/Target/MSP430/MSP430RegisterInfo.cpp
+++ b/lib/Target/MSP430/MSP430RegisterInfo.cpp
@@ -11,8 +11,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "msp430-reg-info"
-
 #include "MSP430RegisterInfo.h"
 #include "MSP430.h"
 #include "MSP430MachineFunctionInfo.h"
@@ -26,38 +24,40 @@
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetOptions.h"
 
+using namespace llvm;
+
+#define DEBUG_TYPE "msp430-reg-info"
+
 #define GET_REGINFO_TARGET_DESC
 #include "MSP430GenRegisterInfo.inc"
 
-using namespace llvm;
-
 // FIXME: Provide proper call frame setup / destroy opcodes.
 MSP430RegisterInfo::MSP430RegisterInfo(MSP430TargetMachine &tm)
   : MSP430GenRegisterInfo(MSP430::PCW), TM(tm) {
   StackAlign = TM.getFrameLowering()->getStackAlignment();
 }
 
-const uint16_t*
+const MCPhysReg*
 MSP430RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
   const TargetFrameLowering *TFI = MF->getTarget().getFrameLowering();
   const Function* F = MF->getFunction();
-  static const uint16_t CalleeSavedRegs[] = {
+  static const MCPhysReg CalleeSavedRegs[] = {
     MSP430::FPW, MSP430::R5W, MSP430::R6W, MSP430::R7W,
     MSP430::R8W, MSP430::R9W, MSP430::R10W, MSP430::R11W,
     0
   };
-  static const uint16_t CalleeSavedRegsFP[] = {
+  static const MCPhysReg CalleeSavedRegsFP[] = {
     MSP430::R5W, MSP430::R6W, MSP430::R7W,
     MSP430::R8W, MSP430::R9W, MSP430::R10W, MSP430::R11W,
     0
   };
-  static const uint16_t CalleeSavedRegsIntr[] = {
+  static const MCPhysReg CalleeSavedRegsIntr[] = {
     MSP430::FPW,  MSP430::R5W,  MSP430::R6W,  MSP430::R7W,
     MSP430::R8W,  MSP430::R9W,  MSP430::R10W, MSP430::R11W,
     MSP430::R12W, MSP430::R13W, MSP430::R14W, MSP430::R15W,
     0
   };
-  static const uint16_t CalleeSavedRegsIntrFP[] = {
+  static const MCPhysReg CalleeSavedRegsIntrFP[] = {
     MSP430::R5W,  MSP430::R6W,  MSP430::R7W,
     MSP430::R8W,  MSP430::R9W,  MSP430::R10W, MSP430::R11W,
     MSP430::R12W, MSP430::R13W, MSP430::R14W, MSP430::R15W,
diff --git a/lib/Target/MSP430/MSP430RegisterInfo.h b/lib/Target/MSP430/MSP430RegisterInfo.h
index 78047cc..a607528 100644
--- a/lib/Target/MSP430/MSP430RegisterInfo.h
+++ b/lib/Target/MSP430/MSP430RegisterInfo.h
@@ -35,18 +35,20 @@ public:
   MSP430RegisterInfo(MSP430TargetMachine &tm);
 
   /// Code Generation virtual methods...
-  const uint16_t *getCalleeSavedRegs(const MachineFunction *MF = 0) const;
+  const MCPhysReg *
+  getCalleeSavedRegs(const MachineFunction *MF = nullptr) const override;
 
-  BitVector getReservedRegs(const MachineFunction &MF) const;
+  BitVector getReservedRegs(const MachineFunction &MF) const override;
   const TargetRegisterClass*
-  getPointerRegClass(const MachineFunction &MF, unsigned Kind = 0) const;
+  getPointerRegClass(const MachineFunction &MF,
+                     unsigned Kind = 0) const override;
 
   void eliminateFrameIndex(MachineBasicBlock::iterator II,
                            int SPAdj, unsigned FIOperandNum,
-                           RegScavenger *RS = NULL) const;
+                           RegScavenger *RS = nullptr) const override;
 
   // Debug information queries.
-  unsigned getFrameRegister(const MachineFunction &MF) const;
+  unsigned getFrameRegister(const MachineFunction &MF) const override;
 };
 
 } // end namespace llvm
diff --git a/lib/Target/MSP430/MSP430SelectionDAGInfo.cpp b/lib/Target/MSP430/MSP430SelectionDAGInfo.cpp
index 24f45fa..c700383 100644
--- a/lib/Target/MSP430/MSP430SelectionDAGInfo.cpp
+++ b/lib/Target/MSP430/MSP430SelectionDAGInfo.cpp
@@ -11,10 +11,11 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "msp430-selectiondag-info"
 #include "MSP430TargetMachine.h"
 using namespace llvm;
 
+#define DEBUG_TYPE "msp430-selectiondag-info"
+
 MSP430SelectionDAGInfo::MSP430SelectionDAGInfo(const MSP430TargetMachine &TM)
   : TargetSelectionDAGInfo(TM) {
 }
diff --git a/lib/Target/MSP430/MSP430Subtarget.cpp b/lib/Target/MSP430/MSP430Subtarget.cpp
index edeaf34..68ad091 100644
--- a/lib/Target/MSP430/MSP430Subtarget.cpp
+++ b/lib/Target/MSP430/MSP430Subtarget.cpp
@@ -15,12 +15,14 @@
 #include "MSP430.h"
 #include "llvm/Support/TargetRegistry.h"
 
+using namespace llvm;
+
+#define DEBUG_TYPE "msp430-subtarget"
+
 #define GET_SUBTARGETINFO_TARGET_DESC
 #define GET_SUBTARGETINFO_CTOR
 #include "MSP430GenSubtargetInfo.inc"
 
-using namespace llvm;
-
 void MSP430Subtarget::anchor() { }
 
 MSP430Subtarget::MSP430Subtarget(const std::string &TT,
diff --git a/lib/Target/MSP430/MSP430TargetMachine.cpp b/lib/Target/MSP430/MSP430TargetMachine.cpp
index 98a6003..50be2be 100644
--- a/lib/Target/MSP430/MSP430TargetMachine.cpp
+++ b/lib/Target/MSP430/MSP430TargetMachine.cpp
@@ -51,8 +51,8 @@ public:
     return getTM<MSP430TargetMachine>();
   }
 
-  virtual bool addInstSelector();
-  virtual bool addPreEmitPass();
+  bool addInstSelector() override;
+  bool addPreEmitPass() override;
 };
 } // namespace
 
diff --git a/lib/Target/MSP430/MSP430TargetMachine.h b/lib/Target/MSP430/MSP430TargetMachine.h
index be695a2..ea5d407 100644
--- a/lib/Target/MSP430/MSP430TargetMachine.h
+++ b/lib/Target/MSP430/MSP430TargetMachine.h
@@ -43,25 +43,25 @@ public:
                       Reloc::Model RM, CodeModel::Model CM,
                       CodeGenOpt::Level OL);
 
-  virtual const TargetFrameLowering *getFrameLowering() const {
+  const TargetFrameLowering *getFrameLowering() const override {
     return &FrameLowering;
   }
-  virtual const MSP430InstrInfo *getInstrInfo() const  { return &InstrInfo; }
-  virtual const DataLayout *getDataLayout() const     { return &DL;}
-  virtual const MSP430Subtarget *getSubtargetImpl() const { return &Subtarget; }
+  const MSP430InstrInfo *getInstrInfo() const override  { return &InstrInfo; }
+  const DataLayout *getDataLayout() const override     { return &DL;}
+  const MSP430Subtarget *getSubtargetImpl() const override { return &Subtarget; }
 
-  virtual const TargetRegisterInfo *getRegisterInfo() const {
+  const TargetRegisterInfo *getRegisterInfo() const override {
     return &InstrInfo.getRegisterInfo();
   }
 
-  virtual const MSP430TargetLowering *getTargetLowering() const {
+  const MSP430TargetLowering *getTargetLowering() const override {
     return &TLInfo;
   }
 
-  virtual const MSP430SelectionDAGInfo* getSelectionDAGInfo() const {
+  const MSP430SelectionDAGInfo* getSelectionDAGInfo() const override {
     return &TSInfo;
   }
-  virtual TargetPassConfig *createPassConfig(PassManagerBase &PM);
+  TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
 }; // MSP430TargetMachine.
 
 } // end namespace llvm
diff --git a/lib/Target/Mips/Android.mk b/lib/Target/Mips/Android.mk
index 74b8a3b..4e8831c 100644
--- a/lib/Target/Mips/Android.mk
+++ b/lib/Target/Mips/Android.mk
@@ -24,6 +24,7 @@ mips_codegen_SRC_FILES := \
   MipsCodeEmitter.cpp \
   MipsConstantIslandPass.cpp \
   MipsDelaySlotFiller.cpp \
+  MipsFastISel.cpp \
   MipsFrameLowering.cpp \
   MipsInstrInfo.cpp \
   MipsISelDAGToDAG.cpp \
diff --git a/lib/Target/Mips/AsmParser/LLVMBuild.txt b/lib/Target/Mips/AsmParser/LLVMBuild.txt
index e7ca243..dd8e3cf 100644
--- a/lib/Target/Mips/AsmParser/LLVMBuild.txt
+++ b/lib/Target/Mips/AsmParser/LLVMBuild.txt
@@ -19,5 +19,5 @@
 type = Library
 name = MipsAsmParser
 parent = Mips
-required_libraries = MC MCParser Support MipsDesc MipsInfo
+required_libraries = MC MCParser MipsDesc MipsInfo Support
 add_to_library_groups = Mips
diff --git a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
index 911a119..86fd386 100644
--- a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
+++ b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
@@ -29,6 +29,8 @@
 
 using namespace llvm;
 
+#define DEBUG_TYPE "mips-asm-parser"
+
 namespace llvm {
 class MCInstrInfo;
 }
@@ -73,10 +75,10 @@ class MipsAsmParser : public MCTargetAsmParser {
   bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
                                SmallVectorImpl<MCParsedAsmOperand *> &Operands,
                                MCStreamer &Out, unsigned &ErrorInfo,
-                               bool MatchingInlineAsm);
+                               bool MatchingInlineAsm) override;
 
   /// Parse a register as used in CFI directives
-  bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc);
+  bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override;
 
   bool ParseParenSuffix(StringRef Name,
                         SmallVectorImpl<MCParsedAsmOperand *> &Operands);
@@ -84,11 +86,11 @@ class MipsAsmParser : public MCTargetAsmParser {
   bool ParseBracketSuffix(StringRef Name,
                           SmallVectorImpl<MCParsedAsmOperand *> &Operands);
 
-  bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
-                        SMLoc NameLoc,
-                        SmallVectorImpl<MCParsedAsmOperand *> &Operands);
+  bool
+  ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc,
+                   SmallVectorImpl<MCParsedAsmOperand *> &Operands) override;
 
-  bool ParseDirective(AsmToken DirectiveID);
+  bool ParseDirective(AsmToken DirectiveID) override;
 
   MipsAsmParser::OperandMatchResultTy
   parseMemOperand(SmallVectorImpl<MCParsedAsmOperand *> &Operands);
@@ -135,6 +137,7 @@ class MipsAsmParser : public MCTargetAsmParser {
                      SmallVectorImpl<MCInst> &Instructions, bool isLoad,
                      bool isImmOpnd);
   bool reportParseError(StringRef ErrorMsg);
+  bool reportParseError(SMLoc Loc, StringRef ErrorMsg);
 
   bool parseMemOffset(const MCExpr *&Res, bool isParenExpr);
   bool parseRelocOperand(const MCExpr *&Res);
@@ -143,7 +146,9 @@ class MipsAsmParser : public MCTargetAsmParser {
 
   bool isEvaluated(const MCExpr *Expr);
   bool parseSetFeature(uint64_t Feature);
+  bool parseDirectiveCPLoad(SMLoc Loc);
   bool parseDirectiveCPSetup();
+  bool parseDirectiveNaN();
   bool parseDirectiveSet();
   bool parseDirectiveOption();
 
@@ -212,21 +217,22 @@ class MipsAsmParser : public MCTargetAsmParser {
 
   void setFeatureBits(unsigned Feature, StringRef FeatureString) {
     if (!(STI.getFeatureBits() & Feature)) {
-      setAvailableFeatures(ComputeAvailableFeatures(
-                           STI.ToggleFeature(FeatureString)));
+      setAvailableFeatures(
+          ComputeAvailableFeatures(STI.ToggleFeature(FeatureString)));
     }
   }
 
   void clearFeatureBits(unsigned Feature, StringRef FeatureString) {
     if (STI.getFeatureBits() & Feature) {
-     setAvailableFeatures(ComputeAvailableFeatures(
-                           STI.ToggleFeature(FeatureString)));
+      setAvailableFeatures(
+          ComputeAvailableFeatures(STI.ToggleFeature(FeatureString)));
     }
   }
 
 public:
   MipsAsmParser(MCSubtargetInfo &sti, MCAsmParser &parser,
-                const MCInstrInfo &MII)
+                const MCInstrInfo &MII,
+                const MCTargetOptions &Options)
       : MCTargetAsmParser(), STI(sti), Parser(parser) {
     // Initialize the set of available features.
     setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
@@ -266,11 +272,12 @@ public:
                           /// context).
     RegKind_CCR = 128,    /// CCR
     RegKind_HWRegs = 256, /// HWRegs
+    RegKind_COP3 = 512,   /// COP3
 
     /// Potentially any (e.g. $1)
     RegKind_Numeric = RegKind_GPR | RegKind_FGR | RegKind_FCC | RegKind_MSA128 |
                       RegKind_MSACtrl | RegKind_COP2 | RegKind_ACC |
-                      RegKind_CCR | RegKind_HWRegs
+                      RegKind_CCR | RegKind_HWRegs | RegKind_COP3
   };
 
 private:
@@ -422,6 +429,14 @@ private:
     return RegIdx.RegInfo->getRegClass(ClassID).getRegister(RegIdx.Index);
   }
 
+  /// Coerce the register to COP3 and return the real register for the
+  /// current target.
+  unsigned getCOP3Reg() const {
+    assert(isRegIdx() && (RegIdx.Kind & RegKind_COP3) && "Invalid access!");
+    unsigned ClassID = Mips::COP3RegClassID;
+    return RegIdx.RegInfo->getRegClass(ClassID).getRegister(RegIdx.Index);
+  }
+
   /// Coerce the register to ACC64DSP and return the real register for the
   /// current target.
   unsigned getACC64DSPReg() const {
@@ -465,7 +480,7 @@ private:
 public:
   void addExpr(MCInst &Inst, const MCExpr *Expr) const {
     // Add as immediate when possible.  Null MCExpr = 0.
-    if (Expr == 0)
+    if (!Expr)
       Inst.addOperand(MCOperand::CreateImm(0));
     else if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr))
       Inst.addOperand(MCOperand::CreateImm(CE->getValue()));
@@ -533,6 +548,11 @@ public:
     Inst.addOperand(MCOperand::CreateReg(getCOP2Reg()));
   }
 
+  void addCOP3AsmRegOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::CreateReg(getCOP3Reg()));
+  }
+
   void addACC64DSPAsmRegOperands(MCInst &Inst, unsigned N) const {
     assert(N == 1 && "Invalid number of operands!");
     Inst.addOperand(MCOperand::CreateReg(getACC64DSPReg()));
@@ -573,7 +593,7 @@ public:
     addExpr(Inst, Expr);
   }
 
-  bool isReg() const {
+  bool isReg() const override {
     // As a special case until we sort out the definition of div/divu, pretend
     // that $0/$zero are k_PhysRegister so that MCK_ZERO works correctly.
     if (isGPRAsmReg() && RegIdx.Index == 0)
@@ -582,16 +602,16 @@ public:
     return Kind == k_PhysRegister;
   }
   bool isRegIdx() const { return Kind == k_RegisterIndex; }
-  bool isImm() const { return Kind == k_Immediate; }
+  bool isImm() const override { return Kind == k_Immediate; }
   bool isConstantImm() const {
     return isImm() && dyn_cast<MCConstantExpr>(getImm());
   }
-  bool isToken() const {
+  bool isToken() const override {
     // Note: It's not possible to pretend that other operand kinds are tokens.
     // The matcher emitter checks tokens first.
     return Kind == k_Token;
   }
-  bool isMem() const { return Kind == k_Memory; }
+  bool isMem() const override { return Kind == k_Memory; }
   bool isInvNum() const { return Kind == k_Immediate; }
   bool isLSAImm() const {
     if (!isConstantImm())
@@ -605,7 +625,7 @@ public:
     return StringRef(Tok.Data, Tok.Length);
   }
 
-  unsigned getReg() const {
+  unsigned getReg() const override {
     // As a special case until we sort out the definition of div/divu, pretend
     // that $0/$zero are k_PhysRegister so that MCK_ZERO works correctly.
     if (Kind == k_RegisterIndex && RegIdx.Index == 0 &&
@@ -744,6 +764,9 @@ public:
   bool isCOP2AsmReg() const {
     return isRegIdx() && RegIdx.Kind & RegKind_COP2 && RegIdx.Index <= 31;
   }
+  bool isCOP3AsmReg() const {
+    return isRegIdx() && RegIdx.Kind & RegKind_COP3 && RegIdx.Index <= 31;
+  }
   bool isMSA128AsmReg() const {
     return isRegIdx() && RegIdx.Kind & RegKind_MSA128 && RegIdx.Index <= 31;
   }
@@ -752,11 +775,25 @@ public:
   }
 
   /// getStartLoc - Get the location of the first token of this operand.
-  SMLoc getStartLoc() const { return StartLoc; }
+  SMLoc getStartLoc() const override { return StartLoc; }
   /// getEndLoc - Get the location of the last token of this operand.
-  SMLoc getEndLoc() const { return EndLoc; }
+  SMLoc getEndLoc() const override { return EndLoc; }
 
-  virtual void print(raw_ostream &OS) const {
+  virtual ~MipsOperand() {
+    switch (Kind) {
+    case k_Immediate:
+      break;
+    case k_Memory:
+      delete Mem.Base;
+      break;
+    case k_PhysRegister:
+    case k_RegisterIndex:
+    case k_Token:
+      break;
+    }
+  }
+
+  void print(raw_ostream &OS) const override {
     switch (Kind) {
     case k_Immediate:
       OS << "Imm<";
@@ -906,10 +943,6 @@ bool MipsAsmParser::needsExpansion(MCInst &Inst) {
   case Mips::LoadImm32Reg:
   case Mips::LoadAddr32Imm:
   case Mips::LoadAddr32Reg:
-  case Mips::SUBi:
-  case Mips::SUBiu:
-  case Mips::DSUBi:
-  case Mips::DSUBiu:
     return true;
   default:
     return false;
@@ -925,30 +958,6 @@ void MipsAsmParser::expandInstruction(MCInst &Inst, SMLoc IDLoc,
     return expandLoadAddressImm(Inst, IDLoc, Instructions);
   case Mips::LoadAddr32Reg:
     return expandLoadAddressReg(Inst, IDLoc, Instructions);
-  case Mips::SUBi:
-    Instructions.push_back(MCInstBuilder(Mips::ADDi)
-                               .addReg(Inst.getOperand(0).getReg())
-                               .addReg(Inst.getOperand(1).getReg())
-                               .addImm(-Inst.getOperand(2).getImm()));
-    return;
-  case Mips::SUBiu:
-    Instructions.push_back(MCInstBuilder(Mips::ADDiu)
-                               .addReg(Inst.getOperand(0).getReg())
-                               .addReg(Inst.getOperand(1).getReg())
-                               .addImm(-Inst.getOperand(2).getImm()));
-    return;
-  case Mips::DSUBi:
-    Instructions.push_back(MCInstBuilder(Mips::DADDi)
-                               .addReg(Inst.getOperand(0).getReg())
-                               .addReg(Inst.getOperand(1).getReg())
-                               .addImm(-Inst.getOperand(2).getImm()));
-    return;
-  case Mips::DSUBiu:
-    Instructions.push_back(MCInstBuilder(Mips::DADDiu)
-                               .addReg(Inst.getOperand(0).getReg())
-                               .addReg(Inst.getOperand(1).getReg())
-                               .addImm(-Inst.getOperand(2).getImm()));
-    return;
   }
 }
 
@@ -1586,6 +1595,8 @@ bool MipsAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
       RegNo = isGP64() ? Operand.getGPR64Reg() : Operand.getGPR32Reg();
     }
 
+    delete &Operand;
+
     return (RegNo == (unsigned)-1);
   }
 
@@ -1624,7 +1635,7 @@ bool MipsAsmParser::parseMemOffset(const MCExpr *&Res, bool isParenExpr) {
 MipsAsmParser::OperandMatchResultTy MipsAsmParser::parseMemOperand(
     SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
   DEBUG(dbgs() << "parseMemOperand\n");
-  const MCExpr *IdVal = 0;
+  const MCExpr *IdVal = nullptr;
   SMLoc S;
   bool isParenExpr = false;
   MipsAsmParser::OperandMatchResultTy Res = MatchOperand_NoMatch;
@@ -1654,6 +1665,7 @@ MipsAsmParser::OperandMatchResultTy MipsAsmParser::parseMemOperand(
             SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
 
         // Zero register assumed, add a memory operand with ZERO as its base.
+        // "Base" will be managed by k_Memory.
         MipsOperand *Base = MipsOperand::CreateGPRReg(
             0, getContext().getRegisterInfo(), S, E, *this);
         Operands.push_back(MipsOperand::CreateMem(Base, IdVal, S, E, *this));
@@ -1679,12 +1691,13 @@ MipsAsmParser::OperandMatchResultTy MipsAsmParser::parseMemOperand(
 
   Parser.Lex(); // Eat the ')' token.
 
-  if (IdVal == 0)
+  if (!IdVal)
     IdVal = MCConstantExpr::Create(0, getContext());
 
   // Replace the register operand with the memory operand.
   MipsOperand *op = static_cast<MipsOperand *>(Operands.back());
   // Remove the register from the operands.
+  // "op" will be managed by k_Memory.
   Operands.pop_back();
   // Add the memory operand.
   if (const MCBinaryExpr *BE = dyn_cast<MCBinaryExpr>(IdVal)) {
@@ -1969,9 +1982,11 @@ MCSymbolRefExpr::VariantKind MipsAsmParser::getVariantKind(StringRef Symbol) {
           .Case("call_lo", MCSymbolRefExpr::VK_Mips_CALL_LO16)
           .Case("higher", MCSymbolRefExpr::VK_Mips_HIGHER)
           .Case("highest", MCSymbolRefExpr::VK_Mips_HIGHEST)
+          .Case("pcrel_hi", MCSymbolRefExpr::VK_Mips_PCREL_HI16)
+          .Case("pcrel_lo", MCSymbolRefExpr::VK_Mips_PCREL_LO16)
           .Default(MCSymbolRefExpr::VK_None);
 
-  assert (VK != MCSymbolRefExpr::VK_None);
+  assert(VK != MCSymbolRefExpr::VK_None);
 
   return VK;
 }
@@ -2089,6 +2104,10 @@ bool MipsAsmParser::reportParseError(StringRef ErrorMsg) {
   return Error(Loc, ErrorMsg);
 }
 
+bool MipsAsmParser::reportParseError(SMLoc Loc, StringRef ErrorMsg) {
+  return Error(Loc, ErrorMsg);
+}
+
 bool MipsAsmParser::parseSetNoAtDirective() {
   // Line should look like: ".set noat".
   // set at reg to 0.
@@ -2248,29 +2267,30 @@ bool MipsAsmParser::parseSetFeature(uint64_t Feature) {
   if (getLexer().isNot(AsmToken::EndOfStatement))
     return reportParseError("unexpected token in .set directive");
 
-  switch(Feature) {
-    default: llvm_unreachable("Unimplemented feature");
-    case Mips::FeatureDSP:
-      setFeatureBits(Mips::FeatureDSP, "dsp");
-      getTargetStreamer().emitDirectiveSetDsp();
+  switch (Feature) {
+  default:
+    llvm_unreachable("Unimplemented feature");
+  case Mips::FeatureDSP:
+    setFeatureBits(Mips::FeatureDSP, "dsp");
+    getTargetStreamer().emitDirectiveSetDsp();
     break;
-    case Mips::FeatureMicroMips:
-      getTargetStreamer().emitDirectiveSetMicroMips();
+  case Mips::FeatureMicroMips:
+    getTargetStreamer().emitDirectiveSetMicroMips();
     break;
-    case Mips::FeatureMips16:
-      getTargetStreamer().emitDirectiveSetMips16();
+  case Mips::FeatureMips16:
+    getTargetStreamer().emitDirectiveSetMips16();
     break;
-    case Mips::FeatureMips32r2:
-      setFeatureBits(Mips::FeatureMips32r2, "mips32r2");
-      getTargetStreamer().emitDirectiveSetMips32R2();
+  case Mips::FeatureMips32r2:
+    setFeatureBits(Mips::FeatureMips32r2, "mips32r2");
+    getTargetStreamer().emitDirectiveSetMips32R2();
     break;
-    case Mips::FeatureMips64:
-      setFeatureBits(Mips::FeatureMips64, "mips64");
-      getTargetStreamer().emitDirectiveSetMips64();
+  case Mips::FeatureMips64:
+    setFeatureBits(Mips::FeatureMips64, "mips64");
+    getTargetStreamer().emitDirectiveSetMips64();
     break;
-    case Mips::FeatureMips64r2:
-      setFeatureBits(Mips::FeatureMips64r2, "mips64r2");
-      getTargetStreamer().emitDirectiveSetMips64R2();
+  case Mips::FeatureMips64r2:
+    setFeatureBits(Mips::FeatureMips64r2, "mips64r2");
+    getTargetStreamer().emitDirectiveSetMips64R2();
     break;
   }
   return false;
@@ -2302,10 +2322,34 @@ bool MipsAsmParser::eatComma(StringRef ErrorStr) {
     return Error(Loc, ErrorStr);
   }
 
-  Parser.Lex();  // Eat the comma.
+  Parser.Lex(); // Eat the comma.
   return true;
 }
 
+bool MipsAsmParser::parseDirectiveCPLoad(SMLoc Loc) {
+  if (Options.isReorder())
+    Warning(Loc, ".cpload in reorder section");
+
+  // FIXME: Warn if cpload is used in Mips16 mode.
+
+  SmallVector<MCParsedAsmOperand *, 1> Reg;
+  OperandMatchResultTy ResTy = ParseAnyRegister(Reg);
+  if (ResTy == MatchOperand_NoMatch || ResTy == MatchOperand_ParseFail) {
+    reportParseError("expected register containing function address");
+    return false;
+  }
+
+  MipsOperand *RegOpnd = static_cast<MipsOperand *>(Reg[0]);
+  if (!RegOpnd->isGPRAsmReg()) {
+    reportParseError(RegOpnd->getStartLoc(), "invalid register");
+    return false;
+  }
+
+  getTargetStreamer().emitDirectiveCpload(RegOpnd->getGPR32Reg());
+  delete RegOpnd;
+  return false;
+}
+
 bool MipsAsmParser::parseDirectiveCPSetup() {
   unsigned FuncReg;
   unsigned Save;
@@ -2336,60 +2380,28 @@ bool MipsAsmParser::parseDirectiveCPSetup() {
   if (Parser.parseIdentifier(Name))
     reportParseError("expected identifier");
   MCSymbol *Sym = getContext().GetOrCreateSymbol(Name);
-  unsigned GPReg = getGPR(matchCPURegisterName("gp"));
 
-  // FIXME: The code below this point should be in the TargetStreamers.
-  // Only N32 and N64 emit anything for .cpsetup
-  // FIXME: We should only emit something for PIC mode too.
-  if (!isN32() && !isN64())
-    return false;
+  getTargetStreamer().emitDirectiveCpsetup(FuncReg, Save, *Sym, SaveIsReg);
+  return false;
+}
 
-  MCStreamer &TS = getStreamer();
-  MCInst Inst;
-  // Either store the old $gp in a register or on the stack
-  if (SaveIsReg) {
-    // move $save, $gpreg
-    Inst.setOpcode(Mips::DADDu);
-    Inst.addOperand(MCOperand::CreateReg(Save));
-    Inst.addOperand(MCOperand::CreateReg(GPReg));
-    Inst.addOperand(MCOperand::CreateReg(getGPR(0)));
-  } else {
-    // sd $gpreg, offset($sp)
-    Inst.setOpcode(Mips::SD);
-    Inst.addOperand(MCOperand::CreateReg(GPReg));
-    Inst.addOperand(MCOperand::CreateReg(getGPR(matchCPURegisterName("sp"))));
-    Inst.addOperand(MCOperand::CreateImm(Save));
-  }
-  TS.EmitInstruction(Inst, STI);
-  Inst.clear();
-
-  const MCSymbolRefExpr *HiExpr = MCSymbolRefExpr::Create(
-      Sym->getName(), MCSymbolRefExpr::VK_Mips_GPOFF_HI,
-      getContext());
-  const MCSymbolRefExpr *LoExpr = MCSymbolRefExpr::Create(
-      Sym->getName(), MCSymbolRefExpr::VK_Mips_GPOFF_LO,
-      getContext());
-  // lui $gp, %hi(%neg(%gp_rel(funcSym)))
-  Inst.setOpcode(Mips::LUi);
-  Inst.addOperand(MCOperand::CreateReg(GPReg));
-  Inst.addOperand(MCOperand::CreateExpr(HiExpr));
-  TS.EmitInstruction(Inst, STI);
-  Inst.clear();
-
-  // addiu  $gp, $gp, %lo(%neg(%gp_rel(funcSym)))
-  Inst.setOpcode(Mips::ADDiu);
-  Inst.addOperand(MCOperand::CreateReg(GPReg));
-  Inst.addOperand(MCOperand::CreateReg(GPReg));
-  Inst.addOperand(MCOperand::CreateExpr(LoExpr));
-  TS.EmitInstruction(Inst, STI);
-  Inst.clear();
-
-  // daddu  $gp, $gp, $funcreg
-  Inst.setOpcode(Mips::DADDu);
-  Inst.addOperand(MCOperand::CreateReg(GPReg));
-  Inst.addOperand(MCOperand::CreateReg(GPReg));
-  Inst.addOperand(MCOperand::CreateReg(FuncReg));
-  TS.EmitInstruction(Inst, STI);
+bool MipsAsmParser::parseDirectiveNaN() {
+  if (getLexer().isNot(AsmToken::EndOfStatement)) {
+    const AsmToken &Tok = Parser.getTok();
+
+    if (Tok.getString() == "2008") {
+      Parser.Lex();
+      getTargetStreamer().emitDirectiveNaN2008();
+      return false;
+    } else if (Tok.getString() == "legacy") {
+      Parser.Lex();
+      getTargetStreamer().emitDirectiveNaNLegacy();
+      return false;
+    }
+  }
+  // If we don't recognize the option passed to the .nan
+  // directive (e.g. no option or unknown option), emit an error.
+  reportParseError("invalid option in .nan directive");
   return false;
 }
 
@@ -2419,15 +2431,15 @@ bool MipsAsmParser::parseDirectiveSet() {
     Parser.eatToEndOfStatement();
     return false;
   } else if (Tok.getString() == "micromips") {
-      return parseSetFeature(Mips::FeatureMicroMips);
+    return parseSetFeature(Mips::FeatureMicroMips);
   } else if (Tok.getString() == "mips32r2") {
-      return parseSetFeature(Mips::FeatureMips32r2);
+    return parseSetFeature(Mips::FeatureMips32r2);
   } else if (Tok.getString() == "mips64") {
-      return parseSetFeature(Mips::FeatureMips64);
+    return parseSetFeature(Mips::FeatureMips64);
   } else if (Tok.getString() == "mips64r2") {
-      return parseSetFeature(Mips::FeatureMips64r2);
+    return parseSetFeature(Mips::FeatureMips64r2);
   } else if (Tok.getString() == "dsp") {
-      return parseSetFeature(Mips::FeatureDSP);
+    return parseSetFeature(Mips::FeatureDSP);
   } else {
     // It is just an identifier, look for an assignment.
     parseSetAssignment();
@@ -2537,6 +2549,8 @@ bool MipsAsmParser::parseDirectiveOption() {
 bool MipsAsmParser::ParseDirective(AsmToken DirectiveID) {
   StringRef IDVal = DirectiveID.getString();
 
+  if (IDVal == ".cpload")
+    return parseDirectiveCPLoad(DirectiveID.getLoc());
   if (IDVal == ".dword") {
     parseDataDirective(8, DirectiveID.getLoc());
     return false;
@@ -2576,6 +2590,9 @@ bool MipsAsmParser::ParseDirective(AsmToken DirectiveID) {
     return false;
   }
 
+  if (IDVal == ".nan")
+    return parseDirectiveNaN();
+
   if (IDVal == ".gpword") {
     parseDirectiveGpWord();
     return false;
diff --git a/lib/Target/Mips/CMakeLists.txt b/lib/Target/Mips/CMakeLists.txt
index c304ee3..bf67d71 100644
--- a/lib/Target/Mips/CMakeLists.txt
+++ b/lib/Target/Mips/CMakeLists.txt
@@ -7,6 +7,7 @@ tablegen(LLVM MipsGenCodeEmitter.inc -gen-emitter)
 tablegen(LLVM MipsGenMCCodeEmitter.inc -gen-emitter -mc-emitter)
 tablegen(LLVM MipsGenAsmWriter.inc -gen-asm-writer)
 tablegen(LLVM MipsGenDAGISel.inc -gen-dag-isel)
+tablegen(LLVM MipsGenFastISel.inc -gen-fast-isel)
 tablegen(LLVM MipsGenCallingConv.inc -gen-callingconv)
 tablegen(LLVM MipsGenSubtargetInfo.inc -gen-subtarget)
 tablegen(LLVM MipsGenAsmMatcher.inc -gen-asm-matcher)
@@ -26,6 +27,7 @@ add_llvm_target(MipsCodeGen
   MipsCodeEmitter.cpp
   MipsConstantIslandPass.cpp
   MipsDelaySlotFiller.cpp
+  MipsFastISel.cpp
   MipsJITInfo.cpp
   MipsInstrInfo.cpp
   MipsISelDAGToDAG.cpp
diff --git a/lib/Target/Mips/Disassembler/LLVMBuild.txt b/lib/Target/Mips/Disassembler/LLVMBuild.txt
index 7101c06..bb70fd3 100644
--- a/lib/Target/Mips/Disassembler/LLVMBuild.txt
+++ b/lib/Target/Mips/Disassembler/LLVMBuild.txt
@@ -19,5 +19,5 @@
 type = Library
 name = MipsDisassembler
 parent = Mips
-required_libraries = MC Support MipsInfo
+required_libraries = MC MipsInfo Support
 add_to_library_groups = Mips
diff --git a/lib/Target/Mips/Disassembler/MipsDisassembler.cpp b/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
index fc3b922..95670aa 100644
--- a/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
+++ b/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
@@ -14,6 +14,7 @@
 #include "Mips.h"
 #include "MipsRegisterInfo.h"
 #include "MipsSubtarget.h"
+#include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCDisassembler.h"
 #include "llvm/MC/MCFixedLenDisassembler.h"
 #include "llvm/MC/MCInst.h"
@@ -24,6 +25,8 @@
 
 using namespace llvm;
 
+#define DEBUG_TYPE "mips-disassembler"
+
 typedef MCDisassembler::DecodeStatus DecodeStatus;
 
 namespace {
@@ -33,19 +36,16 @@ class MipsDisassemblerBase : public MCDisassembler {
 public:
   /// Constructor     - Initializes the disassembler.
   ///
-  MipsDisassemblerBase(const MCSubtargetInfo &STI, const MCRegisterInfo *Info,
+  MipsDisassemblerBase(const MCSubtargetInfo &STI, MCContext &Ctx,
                        bool bigEndian) :
-    MCDisassembler(STI), RegInfo(Info),
+    MCDisassembler(STI, Ctx),
     IsN64(STI.getFeatureBits() & Mips::FeatureN64), isBigEndian(bigEndian) {}
 
   virtual ~MipsDisassemblerBase() {}
 
-  const MCRegisterInfo *getRegInfo() const { return RegInfo.get(); }
-
   bool isN64() const { return IsN64; }
 
 private:
-  OwningPtr<const MCRegisterInfo> RegInfo;
   bool IsN64;
 protected:
   bool isBigEndian;
@@ -57,19 +57,23 @@ class MipsDisassembler : public MipsDisassemblerBase {
 public:
   /// Constructor     - Initializes the disassembler.
   ///
-  MipsDisassembler(const MCSubtargetInfo &STI, const MCRegisterInfo *Info,
+  MipsDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx,
                    bool bigEndian) :
-    MipsDisassemblerBase(STI, Info, bigEndian) {
+    MipsDisassemblerBase(STI, Ctx, bigEndian) {
       IsMicroMips = STI.getFeatureBits() & Mips::FeatureMicroMips;
     }
 
+  bool isMips32r6() const {
+    return STI.getFeatureBits() & Mips::FeatureMips32r6;
+  }
+
   /// getInstruction - See MCDisassembler.
-  virtual DecodeStatus getInstruction(MCInst &instr,
-                                      uint64_t &size,
-                                      const MemoryObject &region,
-                                      uint64_t address,
-                                      raw_ostream &vStream,
-                                      raw_ostream &cStream) const;
+  DecodeStatus getInstruction(MCInst &instr,
+                              uint64_t &size,
+                              const MemoryObject &region,
+                              uint64_t address,
+                              raw_ostream &vStream,
+                              raw_ostream &cStream) const override;
 };
 
 
@@ -78,17 +82,17 @@ class Mips64Disassembler : public MipsDisassemblerBase {
 public:
   /// Constructor     - Initializes the disassembler.
   ///
-  Mips64Disassembler(const MCSubtargetInfo &STI, const MCRegisterInfo *Info,
+  Mips64Disassembler(const MCSubtargetInfo &STI, MCContext &Ctx,
                      bool bigEndian) :
-    MipsDisassemblerBase(STI, Info, bigEndian) {}
+    MipsDisassemblerBase(STI, Ctx, bigEndian) {}
 
   /// getInstruction - See MCDisassembler.
-  virtual DecodeStatus getInstruction(MCInst &instr,
-                                      uint64_t &size,
-                                      const MemoryObject &region,
-                                      uint64_t address,
-                                      raw_ostream &vStream,
-                                      raw_ostream &cStream) const;
+  DecodeStatus getInstruction(MCInst &instr,
+                              uint64_t &size,
+                              const MemoryObject &region,
+                              uint64_t address,
+                              raw_ostream &vStream,
+                              raw_ostream &cStream) const override;
 };
 
 } // end anonymous namespace
@@ -195,6 +199,11 @@ static DecodeStatus DecodeMSACtrlRegisterClass(MCInst &Inst,
                                                uint64_t Address,
                                                const void *Decoder);
 
+static DecodeStatus DecodeCOP2RegisterClass(MCInst &Inst,
+                                            unsigned RegNo,
+                                            uint64_t Address,
+                                            const void *Decoder);
+
 static DecodeStatus DecodeBranchTarget(MCInst &Inst,
                                        unsigned Offset,
                                        uint64_t Address,
@@ -205,6 +214,16 @@ static DecodeStatus DecodeJumpTarget(MCInst &Inst,
                                      uint64_t Address,
                                      const void *Decoder);
 
+static DecodeStatus DecodeBranchTarget21(MCInst &Inst,
+                                         unsigned Offset,
+                                         uint64_t Address,
+                                         const void *Decoder);
+
+static DecodeStatus DecodeBranchTarget26(MCInst &Inst,
+                                         unsigned Offset,
+                                         uint64_t Address,
+                                         const void *Decoder);
+
 // DecodeBranchTargetMM - Decode microMIPS branch offset, which is
 // shifted left by 1 bit.
 static DecodeStatus DecodeBranchTargetMM(MCInst &Inst,
@@ -263,11 +282,40 @@ static DecodeStatus DecodeExtSize(MCInst &Inst,
                                   uint64_t Address,
                                   const void *Decoder);
 
+static DecodeStatus DecodeSimm19Lsl2(MCInst &Inst, unsigned Insn,
+                                     uint64_t Address, const void *Decoder);
+
 /// INSVE_[BHWD] have an implicit operand that the generated decoder doesn't
 /// handle.
 template <typename InsnType>
 static DecodeStatus DecodeINSVE_DF(MCInst &MI, InsnType insn, uint64_t Address,
                                    const void *Decoder);
+
+template <typename InsnType>
+static DecodeStatus
+DecodeAddiGroupBranch(MCInst &MI, InsnType insn, uint64_t Address,
+                      const void *Decoder);
+
+template <typename InsnType>
+static DecodeStatus
+DecodeDaddiGroupBranch(MCInst &MI, InsnType insn, uint64_t Address,
+                       const void *Decoder);
+
+template <typename InsnType>
+static DecodeStatus
+DecodeBlezlGroupBranch(MCInst &MI, InsnType insn, uint64_t Address,
+                       const void *Decoder);
+
+template <typename InsnType>
+static DecodeStatus
+DecodeBgtzlGroupBranch(MCInst &MI, InsnType insn, uint64_t Address,
+                       const void *Decoder);
+
+template <typename InsnType>
+static DecodeStatus
+DecodeBgtzGroupBranch(MCInst &MI, InsnType insn, uint64_t Address,
+                      const void *Decoder);
+
 namespace llvm {
 extern Target TheMipselTarget, TheMipsTarget, TheMips64Target,
               TheMips64elTarget;
@@ -275,26 +323,30 @@ extern Target TheMipselTarget, TheMipsTarget, TheMips64Target,
 
 static MCDisassembler *createMipsDisassembler(
                        const Target &T,
-                       const MCSubtargetInfo &STI) {
-  return new MipsDisassembler(STI, T.createMCRegInfo(""), true);
+                       const MCSubtargetInfo &STI,
+                       MCContext &Ctx) {
+  return new MipsDisassembler(STI, Ctx, true);
 }
 
 static MCDisassembler *createMipselDisassembler(
                        const Target &T,
-                       const MCSubtargetInfo &STI) {
-  return new MipsDisassembler(STI, T.createMCRegInfo(""), false);
+                       const MCSubtargetInfo &STI,
+                       MCContext &Ctx) {
+  return new MipsDisassembler(STI, Ctx, false);
 }
 
 static MCDisassembler *createMips64Disassembler(
                        const Target &T,
-                       const MCSubtargetInfo &STI) {
-  return new Mips64Disassembler(STI, T.createMCRegInfo(""), true);
+                       const MCSubtargetInfo &STI,
+                       MCContext &Ctx) {
+  return new Mips64Disassembler(STI, Ctx, true);
 }
 
 static MCDisassembler *createMips64elDisassembler(
                        const Target &T,
-                       const MCSubtargetInfo &STI) {
-  return new Mips64Disassembler(STI, T.createMCRegInfo(""), false);
+                       const MCSubtargetInfo &STI,
+                       MCContext &Ctx) {
+  return new Mips64Disassembler(STI, Ctx, false);
 }
 
 extern "C" void LLVMInitializeMipsDisassembler() {
@@ -311,6 +363,12 @@ extern "C" void LLVMInitializeMipsDisassembler() {
 
 #include "MipsGenDisassemblerTables.inc"
 
+static unsigned getReg(const void *D, unsigned RC, unsigned RegNo) {
+  const MipsDisassemblerBase *Dis = static_cast<const MipsDisassemblerBase*>(D);
+  const MCRegisterInfo *RegInfo = Dis->getContext().getRegisterInfo();
+  return *(RegInfo->getRegClass(RC).begin() + RegNo);
+}
+
 template <typename InsnType>
 static DecodeStatus DecodeINSVE_DF(MCInst &MI, InsnType insn, uint64_t Address,
                                    const void *Decoder) {
@@ -357,6 +415,202 @@ static DecodeStatus DecodeINSVE_DF(MCInst &MI, InsnType insn, uint64_t Address,
   return MCDisassembler::Success;
 }
 
+template <typename InsnType>
+static DecodeStatus DecodeAddiGroupBranch(MCInst &MI, InsnType insn,
+                                          uint64_t Address,
+                                          const void *Decoder) {
+  // If we are called then we can assume that MIPS32r6/MIPS64r6 is enabled
+  // (otherwise we would have matched the ADDI instruction from the earlier
+  // ISA's instead).
+  //
+  // We have:
+  //    0b001000 sssss ttttt iiiiiiiiiiiiiiii
+  //      BOVC if rs >= rt
+  //      BEQZALC if rs == 0 && rt != 0
+  //      BEQC if rs < rt && rs != 0
+
+  InsnType Rs = fieldFromInstruction(insn, 21, 5);
+  InsnType Rt = fieldFromInstruction(insn, 16, 5);
+  InsnType Imm = SignExtend64(fieldFromInstruction(insn, 0, 16), 16) << 2;
+  bool HasRs = false;
+
+  if (Rs >= Rt) {
+    MI.setOpcode(Mips::BOVC);
+    HasRs = true;
+  } else if (Rs != 0 && Rs < Rt) {
+    MI.setOpcode(Mips::BEQC);
+    HasRs = true;
+  } else
+    MI.setOpcode(Mips::BEQZALC);
+
+  if (HasRs)
+    MI.addOperand(MCOperand::CreateReg(getReg(Decoder, Mips::GPR32RegClassID,
+                                       Rs)));
+
+  MI.addOperand(MCOperand::CreateReg(getReg(Decoder, Mips::GPR32RegClassID,
+                                     Rt)));
+  MI.addOperand(MCOperand::CreateImm(Imm));
+
+  return MCDisassembler::Success;
+}
+
+template <typename InsnType>
+static DecodeStatus DecodeDaddiGroupBranch(MCInst &MI, InsnType insn,
+                                           uint64_t Address,
+                                           const void *Decoder) {
+  // If we are called then we can assume that MIPS32r6/MIPS64r6 is enabled
+  // (otherwise we would have matched the ADDI instruction from the earlier
+  // ISA's instead).
+  //
+  // We have:
+  //    0b011000 sssss ttttt iiiiiiiiiiiiiiii
+  //      BNVC if rs >= rt
+  //      BNEZALC if rs == 0 && rt != 0
+  //      BNEC if rs < rt && rs != 0
+
+  InsnType Rs = fieldFromInstruction(insn, 21, 5);
+  InsnType Rt = fieldFromInstruction(insn, 16, 5);
+  InsnType Imm = SignExtend64(fieldFromInstruction(insn, 0, 16), 16) << 2;
+  bool HasRs = false;
+
+  if (Rs >= Rt) {
+    MI.setOpcode(Mips::BNVC);
+    HasRs = true;
+  } else if (Rs != 0 && Rs < Rt) {
+    MI.setOpcode(Mips::BNEC);
+    HasRs = true;
+  } else
+    MI.setOpcode(Mips::BNEZALC);
+
+  if (HasRs)
+    MI.addOperand(MCOperand::CreateReg(getReg(Decoder, Mips::GPR32RegClassID,
+                                       Rs)));
+
+  MI.addOperand(MCOperand::CreateReg(getReg(Decoder, Mips::GPR32RegClassID,
+                                     Rt)));
+  MI.addOperand(MCOperand::CreateImm(Imm));
+
+  return MCDisassembler::Success;
+}
+
+template <typename InsnType>
+static DecodeStatus DecodeBlezlGroupBranch(MCInst &MI, InsnType insn,
+                                           uint64_t Address,
+                                           const void *Decoder) {
+  // If we are called then we can assume that MIPS32r6/MIPS64r6 is enabled
+  // (otherwise we would have matched the BLEZL instruction from the earlier
+  // ISA's instead).
+  //
+  // We have:
+  //    0b010110 sssss ttttt iiiiiiiiiiiiiiii
+  //      Invalid if rs == 0
+  //      BLEZC   if rs == 0  && rt != 0
+  //      BGEZC   if rs == rt && rt != 0
+  //      BGEC    if rs != rt && rs != 0  && rt != 0
+
+  InsnType Rs = fieldFromInstruction(insn, 21, 5);
+  InsnType Rt = fieldFromInstruction(insn, 16, 5);
+  InsnType Imm = SignExtend64(fieldFromInstruction(insn, 0, 16), 16) << 2;
+
+  if (Rt == 0)
+    return MCDisassembler::Fail;
+  else if (Rs == 0)
+    MI.setOpcode(Mips::BLEZC);
+  else if (Rs == Rt)
+    MI.setOpcode(Mips::BGEZC);
+  else
+    return MCDisassembler::Fail; // FIXME: BGEC is not implemented yet.
+
+  MI.addOperand(MCOperand::CreateReg(getReg(Decoder, Mips::GPR32RegClassID,
+                                     Rt)));
+
+  MI.addOperand(MCOperand::CreateImm(Imm));
+
+  return MCDisassembler::Success;
+}
+
+template <typename InsnType>
+static DecodeStatus DecodeBgtzlGroupBranch(MCInst &MI, InsnType insn,
+                                           uint64_t Address,
+                                           const void *Decoder) {
+  // If we are called then we can assume that MIPS32r6/MIPS64r6 is enabled
+  // (otherwise we would have matched the BGTZL instruction from the earlier
+  // ISA's instead).
+  //
+  // We have:
+  //    0b010111 sssss ttttt iiiiiiiiiiiiiiii
+  //      Invalid if rs == 0
+  //      BGTZC   if rs == 0  && rt != 0
+  //      BLTZC   if rs == rt && rt != 0
+  //      BLTC    if rs != rt && rs != 0  && rt != 0
+
+  InsnType Rs = fieldFromInstruction(insn, 21, 5);
+  InsnType Rt = fieldFromInstruction(insn, 16, 5);
+  InsnType Imm = SignExtend64(fieldFromInstruction(insn, 0, 16), 16) << 2;
+
+  if (Rt == 0)
+    return MCDisassembler::Fail;
+  else if (Rs == 0)
+    MI.setOpcode(Mips::BGTZC);
+  else if (Rs == Rt)
+    MI.setOpcode(Mips::BLTZC);
+  else
+    return MCDisassembler::Fail; // FIXME: BLTC is not implemented yet.
+
+  MI.addOperand(MCOperand::CreateReg(getReg(Decoder, Mips::GPR32RegClassID,
+                                     Rt)));
+
+  MI.addOperand(MCOperand::CreateImm(Imm));
+
+  return MCDisassembler::Success;
+}
+
+template <typename InsnType>
+static DecodeStatus DecodeBgtzGroupBranch(MCInst &MI, InsnType insn,
+                                          uint64_t Address,
+                                          const void *Decoder) {
+  // If we are called then we can assume that MIPS32r6/MIPS64r6 is enabled
+  // (otherwise we would have matched the BGTZ instruction from the earlier
+  // ISA's instead).
+  //
+  // We have:
+  //    0b000111 sssss ttttt iiiiiiiiiiiiiiii
+  //      BGTZ    if rt == 0
+  //      BGTZALC if rs == 0 && rt != 0
+  //      BLTZALC if rs != 0 && rs == rt
+  //      BLTUC   if rs != 0 && rs != rt
+
+  InsnType Rs = fieldFromInstruction(insn, 21, 5);
+  InsnType Rt = fieldFromInstruction(insn, 16, 5);
+  InsnType Imm = SignExtend64(fieldFromInstruction(insn, 0, 16), 16) << 2;
+  bool HasRs = false;
+  bool HasRt = false;
+
+  if (Rt == 0) {
+    MI.setOpcode(Mips::BGTZ);
+    HasRs = true;
+  } else if (Rs == 0) {
+    MI.setOpcode(Mips::BGTZALC);
+    HasRt = true;
+  } else if (Rs == Rt) {
+    MI.setOpcode(Mips::BLTZALC);
+    HasRs = true;
+  } else
+    return MCDisassembler::Fail; // BLTUC not implemented yet
+
+  if (HasRs)
+    MI.addOperand(MCOperand::CreateReg(getReg(Decoder, Mips::GPR32RegClassID,
+                                       Rs)));
+
+  if (HasRt)
+    MI.addOperand(MCOperand::CreateReg(getReg(Decoder, Mips::GPR32RegClassID,
+                                       Rt)));
+
+  MI.addOperand(MCOperand::CreateImm(Imm));
+
+  return MCDisassembler::Success;
+}
+
   /// readInstruction - read four bytes from the MemoryObject
   /// and return 32 bit word sorted according to the given endianess
 static DecodeStatus readInstruction32(const MemoryObject &region,
@@ -426,6 +680,15 @@ MipsDisassembler::getInstruction(MCInst &instr,
     return MCDisassembler::Fail;
   }
 
+  if (isMips32r6()) {
+    Result = decodeInstruction(DecoderTableMips32r6_64r632, instr, Insn,
+                               Address, this, STI);
+    if (Result != MCDisassembler::Fail) {
+      Size = 4;
+      return Result;
+    }
+  }
+
   // Calling the auto-generated decoder function.
   Result = decodeInstruction(DecoderTableMips32, instr, Insn, Address,
                              this, STI);
@@ -469,11 +732,6 @@ Mips64Disassembler::getInstruction(MCInst &instr,
   return MCDisassembler::Fail;
 }
 
-static unsigned getReg(const void *D, unsigned RC, unsigned RegNo) {
-  const MipsDisassemblerBase *Dis = static_cast<const MipsDisassemblerBase*>(D);
-  return *(Dis->getRegInfo()->getRegClass(RC).begin() + RegNo);
-}
-
 static DecodeStatus DecodeCPU16RegsRegisterClass(MCInst &Inst,
                                                  unsigned RegNo,
                                                  uint64_t Address,
@@ -828,12 +1086,23 @@ static DecodeStatus DecodeMSACtrlRegisterClass(MCInst &Inst,
   return MCDisassembler::Success;
 }
 
+static DecodeStatus DecodeCOP2RegisterClass(MCInst &Inst,
+                                            unsigned RegNo,
+                                            uint64_t Address,
+                                            const void *Decoder) {
+  if (RegNo > 31)
+    return MCDisassembler::Fail;
+
+  unsigned Reg = getReg(Decoder, Mips::COP2RegClassID, RegNo);
+  Inst.addOperand(MCOperand::CreateReg(Reg));
+  return MCDisassembler::Success;
+}
+
 static DecodeStatus DecodeBranchTarget(MCInst &Inst,
                                        unsigned Offset,
                                        uint64_t Address,
                                        const void *Decoder) {
-  unsigned BranchOffset = Offset & 0xffff;
-  BranchOffset = SignExtend32<18>(BranchOffset << 2) + 4;
+  int32_t BranchOffset = (SignExtend32<16>(Offset) << 2) + 4;
   Inst.addOperand(MCOperand::CreateImm(BranchOffset));
   return MCDisassembler::Success;
 }
@@ -848,12 +1117,31 @@ static DecodeStatus DecodeJumpTarget(MCInst &Inst,
   return MCDisassembler::Success;
 }
 
+static DecodeStatus DecodeBranchTarget21(MCInst &Inst,
+                                         unsigned Offset,
+                                         uint64_t Address,
+                                         const void *Decoder) {
+  int32_t BranchOffset = SignExtend32<21>(Offset) << 2;
+
+  Inst.addOperand(MCOperand::CreateImm(BranchOffset));
+  return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeBranchTarget26(MCInst &Inst,
+                                         unsigned Offset,
+                                         uint64_t Address,
+                                         const void *Decoder) {
+  int32_t BranchOffset = SignExtend32<26>(Offset) << 2;
+
+  Inst.addOperand(MCOperand::CreateImm(BranchOffset));
+  return MCDisassembler::Success;
+}
+
 static DecodeStatus DecodeBranchTargetMM(MCInst &Inst,
                                          unsigned Offset,
                                          uint64_t Address,
                                          const void *Decoder) {
-  unsigned BranchOffset = Offset & 0xffff;
-  BranchOffset = SignExtend32<18>(BranchOffset << 1);
+  int32_t BranchOffset = SignExtend32<16>(Offset) << 1;
   Inst.addOperand(MCOperand::CreateImm(BranchOffset));
   return MCDisassembler::Success;
 }
@@ -903,3 +1191,9 @@ static DecodeStatus DecodeExtSize(MCInst &Inst,
   Inst.addOperand(MCOperand::CreateImm(SignExtend32<16>(Size)));
   return MCDisassembler::Success;
 }
+
+static DecodeStatus DecodeSimm19Lsl2(MCInst &Inst, unsigned Insn,
+                                     uint64_t Address, const void *Decoder) {
+  Inst.addOperand(MCOperand::CreateImm(SignExtend32<19>(Insn) << 2));
+  return MCDisassembler::Success;
+}
diff --git a/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp b/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp
index c8f08f1..8c79751 100644
--- a/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp
+++ b/lib/Target/Mips/InstPrinter/MipsInstPrinter.cpp
@@ -11,7 +11,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "asm-printer"
 #include "MipsInstPrinter.h"
 #include "MCTargetDesc/MipsMCExpr.h"
 #include "MipsInstrInfo.h"
@@ -24,6 +23,8 @@
 #include "llvm/Support/raw_ostream.h"
 using namespace llvm;
 
+#define DEBUG_TYPE "asm-printer"
+
 #define PRINT_ALIAS_INSTR
 #include "MipsGenAsmWriter.inc"
 
@@ -165,6 +166,8 @@ static void printExpr(const MCExpr *Expr, raw_ostream &OS) {
   case MCSymbolRefExpr::VK_Mips_GOT_LO16:  OS << "%got_lo("; break;
   case MCSymbolRefExpr::VK_Mips_CALL_HI16: OS << "%call_hi("; break;
   case MCSymbolRefExpr::VK_Mips_CALL_LO16: OS << "%call_lo("; break;
+  case MCSymbolRefExpr::VK_Mips_PCREL_HI16: OS << "%pcrel_hi("; break;
+  case MCSymbolRefExpr::VK_Mips_PCREL_LO16: OS << "%pcrel_lo("; break;
   }
 
   OS << SRE->getSymbol();
diff --git a/lib/Target/Mips/InstPrinter/MipsInstPrinter.h b/lib/Target/Mips/InstPrinter/MipsInstPrinter.h
index 2b745f0..550a0f1 100644
--- a/lib/Target/Mips/InstPrinter/MipsInstPrinter.h
+++ b/lib/Target/Mips/InstPrinter/MipsInstPrinter.h
@@ -85,10 +85,12 @@ public:
   void printInstruction(const MCInst *MI, raw_ostream &O);
   static const char *getRegisterName(unsigned RegNo);
 
-  virtual void printRegName(raw_ostream &OS, unsigned RegNo) const;
-  virtual void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot);
+  void printRegName(raw_ostream &OS, unsigned RegNo) const override;
+  void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot) override;
 
   bool printAliasInstr(const MCInst *MI, raw_ostream &OS);
+  void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx,
+                               unsigned PrintMethodIdx, raw_ostream &O);
 
 private:
   void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
diff --git a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
index 0f99ecc..5375a00 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
@@ -31,7 +31,7 @@ using namespace llvm;
 
 // Prepare value for the target space for it
 static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
-                                 MCContext *Ctx = NULL) {
+                                 MCContext *Ctx = nullptr) {
 
   unsigned Kind = Fixup.getKind();
 
@@ -56,6 +56,7 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
   case Mips::fixup_MICROMIPS_GOT_PAGE:
   case Mips::fixup_MICROMIPS_GOT_OFST:
   case Mips::fixup_MICROMIPS_GOT_DISP:
+  case Mips::fixup_MIPS_PCLO16:
     break;
   case Mips::fixup_Mips_PC16:
     // So far we are only using this type for branches.
@@ -80,6 +81,7 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
   case Mips::fixup_Mips_GOT_HI16:
   case Mips::fixup_Mips_CALL_HI16:
   case Mips::fixup_MICROMIPS_HI16:
+  case Mips::fixup_MIPS_PCHI16:
     // Get the 2nd 16-bits. Also add 1 if bit 15 is 1.
     Value = ((Value + 0x8000) >> 16) & 0xffff;
     break;
@@ -102,6 +104,22 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
     if (!isIntN(16, Value) && Ctx)
       Ctx->FatalError(Fixup.getLoc(), "out of range PC16 fixup");
     break;
+  case Mips::fixup_MIPS_PC21_S2:
+    Value -= 4;
+    // Forcing a signed division because Value can be negative.
+    Value = (int64_t) Value / 4;
+    // We now check if Value can be encoded as a 21-bit signed immediate.
+    if (!isIntN(21, Value) && Ctx)
+      Ctx->FatalError(Fixup.getLoc(), "out of range PC21 fixup");
+    break;
+  case Mips::fixup_MIPS_PC26_S2:
+    Value -= 4;
+    // Forcing a signed division because Value can be negative.
+    Value = (int64_t) Value / 4;
+    // We now check if Value can be encoded as a 26-bit signed immediate.
+    if (!isIntN(26, Value) && Ctx)
+      Ctx->FatalError(Fixup.getLoc(), "out of range PC26 fixup");
+    break;
   }
 
   return Value;
@@ -189,7 +207,7 @@ void MipsAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
 
 const MCFixupKindInfo &MipsAsmBackend::
 getFixupKindInfo(MCFixupKind Kind) const {
-  const static MCFixupKindInfo Infos[Mips::NumTargetFixupKinds] = {
+  const static MCFixupKindInfo LittleEndianInfos[Mips::NumTargetFixupKinds] = {
     // This table *must* be in same the order of fixup_* kinds in
     // MipsFixupKinds.h.
     //
@@ -229,6 +247,10 @@ getFixupKindInfo(MCFixupKind Kind) const {
     { "fixup_Mips_GOT_LO16",     0,     16,   0 },
     { "fixup_Mips_CALL_HI16",    0,     16,   0 },
     { "fixup_Mips_CALL_LO16",    0,     16,   0 },
+    { "fixup_MIPS_PC21_S2",      0,     21,  MCFixupKindInfo::FKF_IsPCRel },
+    { "fixup_MIPS_PC26_S2",      0,     26,  MCFixupKindInfo::FKF_IsPCRel },
+    { "fixup_MIPS_PCHI16",       0,     16,  MCFixupKindInfo::FKF_IsPCRel },
+    { "fixup_MIPS_PCLO16",       0,     16,  MCFixupKindInfo::FKF_IsPCRel },
     { "fixup_MICROMIPS_26_S1",   0,     26,   0 },
     { "fixup_MICROMIPS_HI16",    0,     16,   0 },
     { "fixup_MICROMIPS_LO16",    0,     16,   0 },
@@ -246,12 +268,76 @@ getFixupKindInfo(MCFixupKind Kind) const {
     { "fixup_MICROMIPS_TLS_TPREL_LO16",  0,     16,   0 }
   };
 
+  const static MCFixupKindInfo BigEndianInfos[Mips::NumTargetFixupKinds] = {
+    // This table *must* be in same the order of fixup_* kinds in
+    // MipsFixupKinds.h.
+    //
+    // name                    offset  bits  flags
+    { "fixup_Mips_16",          16,     16,   0 },
+    { "fixup_Mips_32",           0,     32,   0 },
+    { "fixup_Mips_REL32",        0,     32,   0 },
+    { "fixup_Mips_26",           6,     26,   0 },
+    { "fixup_Mips_HI16",        16,     16,   0 },
+    { "fixup_Mips_LO16",        16,     16,   0 },
+    { "fixup_Mips_GPREL16",     16,     16,   0 },
+    { "fixup_Mips_LITERAL",     16,     16,   0 },
+    { "fixup_Mips_GOT_Global",  16,     16,   0 },
+    { "fixup_Mips_GOT_Local",   16,     16,   0 },
+    { "fixup_Mips_PC16",        16,     16,  MCFixupKindInfo::FKF_IsPCRel },
+    { "fixup_Mips_CALL16",      16,     16,   0 },
+    { "fixup_Mips_GPREL32",      0,     32,   0 },
+    { "fixup_Mips_SHIFT5",      21,      5,   0 },
+    { "fixup_Mips_SHIFT6",      21,      5,   0 },
+    { "fixup_Mips_64",           0,     64,   0 },
+    { "fixup_Mips_TLSGD",       16,     16,   0 },
+    { "fixup_Mips_GOTTPREL",    16,     16,   0 },
+    { "fixup_Mips_TPREL_HI",    16,     16,   0 },
+    { "fixup_Mips_TPREL_LO",    16,     16,   0 },
+    { "fixup_Mips_TLSLDM",      16,     16,   0 },
+    { "fixup_Mips_DTPREL_HI",   16,     16,   0 },
+    { "fixup_Mips_DTPREL_LO",   16,     16,   0 },
+    { "fixup_Mips_Branch_PCRel",16,     16,  MCFixupKindInfo::FKF_IsPCRel },
+    { "fixup_Mips_GPOFF_HI",    16,     16,   0 },
+    { "fixup_Mips_GPOFF_LO",    16,     16,   0 },
+    { "fixup_Mips_GOT_PAGE",    16,     16,   0 },
+    { "fixup_Mips_GOT_OFST",    16,     16,   0 },
+    { "fixup_Mips_GOT_DISP",    16,     16,   0 },
+    { "fixup_Mips_HIGHER",      16,     16,   0 },
+    { "fixup_Mips_HIGHEST",     16,     16,   0 },
+    { "fixup_Mips_GOT_HI16",    16,     16,   0 },
+    { "fixup_Mips_GOT_LO16",    16,     16,   0 },
+    { "fixup_Mips_CALL_HI16",   16,     16,   0 },
+    { "fixup_Mips_CALL_LO16",   16,     16,   0 },
+    { "fixup_MIPS_PC21_S2",     11,     21,  MCFixupKindInfo::FKF_IsPCRel },
+    { "fixup_MIPS_PC26_S2",      6,     26,  MCFixupKindInfo::FKF_IsPCRel },
+    { "fixup_MIPS_PCHI16",      16,     16,  MCFixupKindInfo::FKF_IsPCRel },
+    { "fixup_MIPS_PCLO16",      16,     16,  MCFixupKindInfo::FKF_IsPCRel },
+    { "fixup_MICROMIPS_26_S1",   6,     26,   0 },
+    { "fixup_MICROMIPS_HI16",   16,     16,   0 },
+    { "fixup_MICROMIPS_LO16",   16,     16,   0 },
+    { "fixup_MICROMIPS_GOT16",  16,     16,   0 },
+    { "fixup_MICROMIPS_PC16_S1",16,     16,   MCFixupKindInfo::FKF_IsPCRel },
+    { "fixup_MICROMIPS_CALL16", 16,     16,   0 },
+    { "fixup_MICROMIPS_GOT_DISP",        16,     16,   0 },
+    { "fixup_MICROMIPS_GOT_PAGE",        16,     16,   0 },
+    { "fixup_MICROMIPS_GOT_OFST",        16,     16,   0 },
+    { "fixup_MICROMIPS_TLS_GD",          16,     16,   0 },
+    { "fixup_MICROMIPS_TLS_LDM",         16,     16,   0 },
+    { "fixup_MICROMIPS_TLS_DTPREL_HI16", 16,     16,   0 },
+    { "fixup_MICROMIPS_TLS_DTPREL_LO16", 16,     16,   0 },
+    { "fixup_MICROMIPS_TLS_TPREL_HI16",  16,     16,   0 },
+    { "fixup_MICROMIPS_TLS_TPREL_LO16",  16,     16,   0 }
+  };
+
   if (Kind < FirstTargetFixupKind)
     return MCAsmBackend::getFixupKindInfo(Kind);
 
   assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() &&
           "Invalid kind!");
-  return Infos[Kind - FirstTargetFixupKind];
+
+  if (IsLittle)
+    return LittleEndianInfos[Kind - FirstTargetFixupKind];
+  return BigEndianInfos[Kind - FirstTargetFixupKind];
 }
 
 /// WriteNopData - Write an (optimal) nop sequence of Count bytes
diff --git a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h
index cc5207a..bc695e6 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h
@@ -37,14 +37,14 @@ public:
       : MCAsmBackend(), OSType(_OSType), IsLittle(_isLittle),
         Is64Bit(_is64Bit) {}
 
-  MCObjectWriter *createObjectWriter(raw_ostream &OS) const;
+  MCObjectWriter *createObjectWriter(raw_ostream &OS) const override;
 
   void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
-                  uint64_t Value, bool IsPCRel) const;
+                  uint64_t Value, bool IsPCRel) const override;
 
-  const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const;
+  const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override;
 
-  unsigned getNumFixupKinds() const {
+  unsigned getNumFixupKinds() const override {
     return Mips::NumTargetFixupKinds;
   }
 
@@ -55,7 +55,7 @@ public:
   /// relaxation.
   ///
   /// \param Inst - The instruction to test.
-  bool mayNeedRelaxation(const MCInst &Inst) const {
+  bool mayNeedRelaxation(const MCInst &Inst) const override {
     return false;
   }
 
@@ -63,7 +63,7 @@ public:
   /// fixup requires the associated instruction to be relaxed.
    bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
                              const MCRelaxableFragment *DF,
-                             const MCAsmLayout &Layout) const {
+                             const MCAsmLayout &Layout) const override {
     // FIXME.
     assert(0 && "RelaxInstruction() unimplemented");
     return false;
@@ -75,16 +75,16 @@ public:
   /// \param Inst - The instruction to relax, which may be the same
   /// as the output.
   /// \param [out] Res On return, the relaxed instruction.
-  void relaxInstruction(const MCInst &Inst, MCInst &Res) const {}
+  void relaxInstruction(const MCInst &Inst, MCInst &Res) const override {}
 
   /// @}
 
-  bool writeNopData(uint64_t Count, MCObjectWriter *OW) const;
+  bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override;
 
   void processFixupValue(const MCAssembler &Asm, const MCAsmLayout &Layout,
                          const MCFixup &Fixup, const MCFragment *DF,
                          const MCValue &Target, uint64_t &Value,
-                         bool &IsResolved);
+                         bool &IsResolved) override;
 
 }; // class MipsAsmBackend
 
diff --git a/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp b/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
index 794978b..74c12ff 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
@@ -193,6 +193,18 @@ unsigned MipsELFObjectWriter::GetRelocType(const MCValue &Target,
   case Mips::fixup_MICROMIPS_TLS_TPREL_LO16:
     Type = ELF::R_MICROMIPS_TLS_TPREL_LO16;
     break;
+  case Mips::fixup_MIPS_PC21_S2:
+    Type = ELF::R_MIPS_PC21_S2;
+    break;
+  case Mips::fixup_MIPS_PC26_S2:
+    Type = ELF::R_MIPS_PC26_S2;
+    break;
+  case Mips::fixup_MIPS_PCHI16:
+    Type = ELF::R_MIPS_PCHI16;
+    break;
+  case Mips::fixup_MIPS_PCLO16:
+    Type = ELF::R_MIPS_PCLO16;
+    break;
   }
   return Type;
 }
diff --git a/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h b/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h
index dc6192c..3079004 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsFixupKinds.h
@@ -128,6 +128,18 @@ namespace Mips {
     // resulting in - R_MIPS_CALL_LO16
     fixup_Mips_CALL_LO16,
 
+    // resulting in - R_MIPS_PC21_S2
+    fixup_MIPS_PC21_S2,
+
+    // resulting in - R_MIPS_PC26_S2
+    fixup_MIPS_PC26_S2,
+
+    // resulting in - R_MIPS_PCHI16
+    fixup_MIPS_PCHI16,
+
+    // resulting in - R_MIPS_PCLO16
+    fixup_MIPS_PCLO16,
+
     // resulting in - R_MICROMIPS_26_S1
     fixup_MICROMIPS_26_S1,
 
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h b/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h
index 1000113..37ba0c4 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h
@@ -20,7 +20,7 @@ namespace llvm {
   class StringRef;
 
   class MipsMCAsmInfo : public MCAsmInfoELF {
-    virtual void anchor();
+    void anchor() override;
   public:
     explicit MipsMCAsmInfo(StringRef TT);
   };
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
index edd2146..85e0bf1 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
@@ -12,8 +12,6 @@
 //===----------------------------------------------------------------------===//
 //
 
-#define DEBUG_TYPE "mccodeemitter"
-
 #include "MipsMCCodeEmitter.h"
 #include "MCTargetDesc/MipsFixupKinds.h"
 #include "MCTargetDesc/MipsMCExpr.h"
@@ -28,6 +26,8 @@
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/Support/raw_ostream.h"
 
+#define DEBUG_TYPE "mccodeemitter"
+
 #define GET_INSTRMAP_INFO
 #include "MipsGenInstrInfo.inc"
 #undef GET_INSTRMAP_INFO
@@ -242,6 +242,69 @@ getBranchTargetOpValueMM(const MCInst &MI, unsigned OpNo,
   return 0;
 }
 
+/// getBranchTarget21OpValue - Return binary encoding of the branch
+/// target operand. If the machine operand requires relocation,
+/// record the relocation and return zero.
+unsigned MipsMCCodeEmitter::
+getBranchTarget21OpValue(const MCInst &MI, unsigned OpNo,
+                         SmallVectorImpl<MCFixup> &Fixups,
+                         const MCSubtargetInfo &STI) const {
+
+  const MCOperand &MO = MI.getOperand(OpNo);
+
+  // If the destination is an immediate, divide by 4.
+  if (MO.isImm()) return MO.getImm() >> 2;
+
+  assert(MO.isExpr() &&
+         "getBranchTarget21OpValue expects only expressions or immediates");
+
+  const MCExpr *Expr = MO.getExpr();
+  Fixups.push_back(MCFixup::Create(0, Expr,
+                                   MCFixupKind(Mips::fixup_MIPS_PC21_S2)));
+  return 0;
+}
+
+/// getBranchTarget26OpValue - Return binary encoding of the branch
+/// target operand. If the machine operand requires relocation,
+/// record the relocation and return zero.
+unsigned MipsMCCodeEmitter::
+getBranchTarget26OpValue(const MCInst &MI, unsigned OpNo,
+                         SmallVectorImpl<MCFixup> &Fixups,
+                         const MCSubtargetInfo &STI) const {
+
+  const MCOperand &MO = MI.getOperand(OpNo);
+
+  // If the destination is an immediate, divide by 4.
+  if (MO.isImm()) return MO.getImm() >> 2;
+
+  assert(MO.isExpr() &&
+         "getBranchTarget26OpValue expects only expressions or immediates");
+
+  const MCExpr *Expr = MO.getExpr();
+  Fixups.push_back(MCFixup::Create(0, Expr,
+                                   MCFixupKind(Mips::fixup_MIPS_PC26_S2)));
+  return 0;
+}
+
+/// getJumpOffset16OpValue - Return binary encoding of the jump
+/// target operand. If the machine operand requires relocation,
+/// record the relocation and return zero.
+unsigned MipsMCCodeEmitter::
+getJumpOffset16OpValue(const MCInst &MI, unsigned OpNo,
+                       SmallVectorImpl<MCFixup> &Fixups,
+                       const MCSubtargetInfo &STI) const {
+
+  const MCOperand &MO = MI.getOperand(OpNo);
+
+  if (MO.isImm()) return MO.getImm();
+
+  assert(MO.isExpr() &&
+         "getJumpOffset16OpValue expects only expressions or an immediate");
+
+   // TODO: Push fixup.
+   return 0;
+}
+
 /// getJumpTargetOpValue - Return binary encoding of the jump
 /// target operand. If the machine operand requires relocation,
 /// record the relocation and return zero.
@@ -417,6 +480,12 @@ getExprOpValue(const MCExpr *Expr,SmallVectorImpl<MCFixup> &Fixups,
     case MCSymbolRefExpr::VK_Mips_CALL_LO16:
       FixupKind = Mips::fixup_Mips_CALL_LO16;
       break;
+    case MCSymbolRefExpr::VK_Mips_PCREL_HI16:
+      FixupKind = Mips::fixup_MIPS_PCHI16;
+      break;
+    case MCSymbolRefExpr::VK_Mips_PCREL_LO16:
+      FixupKind = Mips::fixup_MIPS_PCLO16;
+      break;
     } // switch
 
     Fixups.push_back(MCFixup::Create(0, Expr, MCFixupKind(FixupKind)));
@@ -548,5 +617,15 @@ MipsMCCodeEmitter::getLSAImmEncoding(const MCInst &MI, unsigned OpNo,
   return getMachineOpValue(MI, MI.getOperand(OpNo), Fixups, STI) - 1;
 }
 
-#include "MipsGenMCCodeEmitter.inc"
+unsigned
+MipsMCCodeEmitter::getSimm19Lsl2Encoding(const MCInst &MI, unsigned OpNo,
+                                         SmallVectorImpl<MCFixup> &Fixups,
+                                         const MCSubtargetInfo &STI) const {
+  assert(MI.getOperand(OpNo).isImm());
+  // The immediate is encoded as 'immediate << 2'.
+  unsigned Res = getMachineOpValue(MI, MI.getOperand(OpNo), Fixups, STI);
+  assert((Res & 3) == 0);
+  return Res >> 2;
+}
 
+#include "MipsGenMCCodeEmitter.inc"
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h
index 49a2490..3f7daab 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h
@@ -52,7 +52,7 @@ public:
 
   void EncodeInstruction(const MCInst &MI, raw_ostream &OS,
                          SmallVectorImpl<MCFixup> &Fixups,
-                         const MCSubtargetInfo &STI) const;
+                         const MCSubtargetInfo &STI) const override;
 
   // getBinaryCodeForInstr - TableGen'erated function for getting the
   // binary encoding for an instruction.
@@ -88,6 +88,27 @@ public:
                                     SmallVectorImpl<MCFixup> &Fixups,
                                     const MCSubtargetInfo &STI) const;
 
+  // getBranchTarget21OpValue - Return binary encoding of the branch
+  // offset operand. If the machine operand requires relocation,
+  // record the relocation and return zero.
+  unsigned getBranchTarget21OpValue(const MCInst &MI, unsigned OpNo,
+                                   SmallVectorImpl<MCFixup> &Fixups,
+                                   const MCSubtargetInfo &STI) const;
+
+  // getBranchTarget26OpValue - Return binary encoding of the branch
+  // offset operand. If the machine operand requires relocation,
+  // record the relocation and return zero.
+  unsigned getBranchTarget26OpValue(const MCInst &MI, unsigned OpNo,
+                                    SmallVectorImpl<MCFixup> &Fixups,
+                                    const MCSubtargetInfo &STI) const;
+
+  // getJumpOffset16OpValue - Return binary encoding of the jump
+  // offset operand. If the machine operand requires relocation,
+  // record the relocation and return zero.
+  unsigned getJumpOffset16OpValue(const MCInst &MI, unsigned OpNo,
+                                  SmallVectorImpl<MCFixup> &Fixups,
+                                  const MCSubtargetInfo &STI) const;
+
   // getMachineOpValue - Return binary encoding of operand. If the machin
   // operand requires relocation, record the relocation and return zero.
   unsigned getMachineOpValue(const MCInst &MI, const MCOperand &MO,
@@ -116,6 +137,10 @@ public:
                              SmallVectorImpl<MCFixup> &Fixups,
                              const MCSubtargetInfo &STI) const;
 
+  unsigned getSimm19Lsl2Encoding(const MCInst &MI, unsigned OpNo,
+                                 SmallVectorImpl<MCFixup> &Fixups,
+                                 const MCSubtargetInfo &STI) const;
+
   unsigned getExprOpValue(const MCExpr *Expr, SmallVectorImpl<MCFixup> &Fixups,
                           const MCSubtargetInfo &STI) const;
 
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp
index c7ba12d..21ccc3c 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp
@@ -7,7 +7,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "mipsmcexpr"
 #include "MipsMCExpr.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCAssembler.h"
@@ -15,6 +14,8 @@
 
 using namespace llvm;
 
+#define DEBUG_TYPE "mipsmcexpr"
+
 bool MipsMCExpr::isSupportedBinaryExpr(MCSymbolRefExpr::VariantKind VK,
                                        const MCBinaryExpr *BE) {
   switch (VK) {
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h b/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h
index 722bba7..8d7aacd 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h
@@ -46,16 +46,16 @@ public:
   /// getSubExpr - Get the child of this expression.
   const MCExpr *getSubExpr() const { return Expr; }
 
-  void PrintImpl(raw_ostream &OS) const;
+  void PrintImpl(raw_ostream &OS) const override;
   bool EvaluateAsRelocatableImpl(MCValue &Res,
-                                 const MCAsmLayout *Layout) const;
-  void AddValueSymbols(MCAssembler *) const;
-  const MCSection *FindAssociatedSection() const {
+                                 const MCAsmLayout *Layout) const override;
+  void AddValueSymbols(MCAssembler *) const override;
+  const MCSection *FindAssociatedSection() const override {
     return getSubExpr()->FindAssociatedSection();
   }
 
   // There are no TLS MipsMCExprs at the moment.
-  void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const {}
+  void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const override {}
 
   static bool classof(const MCExpr *E) {
     return E->getKind() == MCExpr::Target;
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCNaCl.h b/lib/Target/Mips/MCTargetDesc/MipsMCNaCl.h
index 6992d06..01d5363 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCNaCl.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCNaCl.h
@@ -18,7 +18,7 @@ namespace llvm {
 static const unsigned MIPS_NACL_BUNDLE_ALIGN = 4u;
 
 bool isBasePlusOffsetMemoryAccess(unsigned Opcode, unsigned *AddrIdx,
-                                  bool *IsStore = NULL);
+                                  bool *IsStore = nullptr);
 bool baseRegNeedsLoadStoreMask(unsigned Reg);
 
 // This function creates an MCELFStreamer for Mips NaCl.
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp
index eecca68..660e5a7 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp
@@ -30,6 +30,8 @@
 #include "llvm/Support/FormattedStream.h"
 #include "llvm/Support/TargetRegistry.h"
 
+using namespace llvm;
+
 #define GET_INSTRINFO_MC_DESC
 #include "MipsGenInstrInfo.inc"
 
@@ -39,8 +41,6 @@
 #define GET_REGINFO_MC_DESC
 #include "MipsGenRegisterInfo.inc"
 
-using namespace llvm;
-
 /// Select the Mips CPU for the given triple and cpu name.
 /// FIXME: Merge with the copy in MipsSubtarget.cpp
 static inline StringRef selectMipsCPU(StringRef TT, StringRef CPU) {
@@ -79,7 +79,7 @@ static MCAsmInfo *createMipsMCAsmInfo(const MCRegisterInfo &MRI, StringRef TT) {
   MCAsmInfo *MAI = new MipsMCAsmInfo(TT);
 
   unsigned SP = MRI.getDwarfRegNum(Mips::SP, true);
-  MCCFIInstruction Inst = MCCFIInstruction::createDefCfa(0, SP, 0);
+  MCCFIInstruction Inst = MCCFIInstruction::createDefCfa(nullptr, SP, 0);
   MAI->addInitialFrameState(Inst);
 
   return MAI;
@@ -124,12 +124,11 @@ static MCStreamer *createMCStreamer(const Target &T, StringRef TT,
 
 static MCStreamer *
 createMCAsmStreamer(MCContext &Ctx, formatted_raw_ostream &OS,
-                    bool isVerboseAsm, bool useCFI, bool useDwarfDirectory,
+                    bool isVerboseAsm, bool useDwarfDirectory,
                     MCInstPrinter *InstPrint, MCCodeEmitter *CE,
                     MCAsmBackend *TAB, bool ShowInst) {
-  MCStreamer *S =
-      llvm::createAsmStreamer(Ctx, OS, isVerboseAsm, useCFI, useDwarfDirectory,
-                              InstPrint, CE, TAB, ShowInst);
+  MCStreamer *S = llvm::createAsmStreamer(
+      Ctx, OS, isVerboseAsm, useDwarfDirectory, InstPrint, CE, TAB, ShowInst);
   new MipsTargetAsmStreamer(*S, OS);
   return S;
 }
diff --git a/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp b/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp
index 639a058..cd6be73 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp
@@ -17,8 +17,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "mips-mc-nacl"
-
 #include "Mips.h"
 #include "MipsELFStreamer.h"
 #include "MipsMCNaCl.h"
@@ -26,6 +24,8 @@
 
 using namespace llvm;
 
+#define DEBUG_TYPE "mips-mc-nacl"
+
 namespace {
 
 const unsigned IndirectBranchMaskReg = Mips::T6;
@@ -120,7 +120,8 @@ private:
 public:
   /// This function is the one used to emit instruction data into the ELF
   /// streamer.  We override it to mask dangerous instructions.
-  virtual void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI) {
+  void EmitInstruction(const MCInst &Inst,
+                       const MCSubtargetInfo &STI) override {
     // Sandbox indirect jumps.
     if (isIndirectJump(Inst)) {
       if (PendingCall)
diff --git a/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp b/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
index fb6aff2..a8fa272 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
@@ -85,6 +85,13 @@ void MipsTargetAsmStreamer::emitDirectiveEnt(const MCSymbol &Symbol) {
 }
 
 void MipsTargetAsmStreamer::emitDirectiveAbiCalls() { OS << "\t.abicalls\n"; }
+
+void MipsTargetAsmStreamer::emitDirectiveNaN2008() { OS << "\t.nan\t2008\n"; }
+
+void MipsTargetAsmStreamer::emitDirectiveNaNLegacy() {
+  OS << "\t.nan\tlegacy\n";
+}
+
 void MipsTargetAsmStreamer::emitDirectiveOptionPic0() {
   OS << "\t.option\tpic0\n";
 }
@@ -137,6 +144,29 @@ void MipsTargetAsmStreamer::emitFMask(unsigned FPUBitmask,
   OS << "," << FPUTopSavedRegOff << '\n';
 }
 
+void MipsTargetAsmStreamer::emitDirectiveCpload(unsigned RegNo) {
+  OS << "\t.cpload\t$"
+     << StringRef(MipsInstPrinter::getRegisterName(RegNo)).lower() << "\n";
+}
+
+void MipsTargetAsmStreamer::emitDirectiveCpsetup(unsigned RegNo,
+                                                 int RegOrOffset,
+                                                 const MCSymbol &Sym,
+                                                 bool IsReg) {
+  OS << "\t.cpsetup\t$"
+     << StringRef(MipsInstPrinter::getRegisterName(RegNo)).lower() << ", ";
+
+  if (IsReg)
+    OS << "$"
+       << StringRef(MipsInstPrinter::getRegisterName(RegOrOffset)).lower();
+  else
+    OS << RegOrOffset;
+
+  OS << ", ";
+
+  OS << Sym.getName() << "\n";
+}
+
 // This part is for ELF object output.
 MipsTargetELFStreamer::MipsTargetELFStreamer(MCStreamer &S,
                                              const MCSubtargetInfo &STI)
@@ -180,6 +210,10 @@ MipsTargetELFStreamer::MipsTargetELFStreamer(MCStreamer &S,
     EFlags |= ELF::EF_MIPS_ABI_O32;
   }
 
+  // Other options.
+  if (Features & Mips::FeatureNaN2008)
+    EFlags |= ELF::EF_MIPS_NAN2008;
+
   MCA.setELFHeaderEFlags(EFlags);
 }
 
@@ -325,6 +359,21 @@ void MipsTargetELFStreamer::emitDirectiveAbiCalls() {
   Flags |= ELF::EF_MIPS_CPIC | ELF::EF_MIPS_PIC;
   MCA.setELFHeaderEFlags(Flags);
 }
+
+void MipsTargetELFStreamer::emitDirectiveNaN2008() {
+  MCAssembler &MCA = getStreamer().getAssembler();
+  unsigned Flags = MCA.getELFHeaderEFlags();
+  Flags |= ELF::EF_MIPS_NAN2008;
+  MCA.setELFHeaderEFlags(Flags);
+}
+
+void MipsTargetELFStreamer::emitDirectiveNaNLegacy() {
+  MCAssembler &MCA = getStreamer().getAssembler();
+  unsigned Flags = MCA.getELFHeaderEFlags();
+  Flags &= ~ELF::EF_MIPS_NAN2008;
+  MCA.setELFHeaderEFlags(Flags);
+}
+
 void MipsTargetELFStreamer::emitDirectiveOptionPic0() {
   MCAssembler &MCA = getStreamer().getAssembler();
   unsigned Flags = MCA.getELFHeaderEFlags();
@@ -376,3 +425,107 @@ void MipsTargetELFStreamer::emitDirectiveSetMips64R2() {
 void MipsTargetELFStreamer::emitDirectiveSetDsp() {
   // No action required for ELF output.
 }
+
+void MipsTargetELFStreamer::emitDirectiveCpload(unsigned RegNo) {
+  // .cpload $reg
+  // This directive expands to:
+  // lui   $gp, %hi(_gp_disp)
+  // addui $gp, $gp, %lo(_gp_disp)
+  // addu  $gp, $gp, $reg
+  // when support for position independent code is enabled.
+  if (!Pic || (isN32() || isN64()))
+    return;
+
+  // There's a GNU extension controlled by -mno-shared that allows
+  // locally-binding symbols to be accessed using absolute addresses.
+  // This is currently not supported. When supported -mno-shared makes
+  // .cpload expand to:
+  //   lui     $gp, %hi(__gnu_local_gp)
+  //   addiu   $gp, $gp, %lo(__gnu_local_gp)
+
+  StringRef SymName("_gp_disp");
+  MCAssembler &MCA = getStreamer().getAssembler();
+  MCSymbol *GP_Disp = MCA.getContext().GetOrCreateSymbol(SymName);
+  MCA.getOrCreateSymbolData(*GP_Disp);
+
+  MCInst TmpInst;
+  TmpInst.setOpcode(Mips::LUi);
+  TmpInst.addOperand(MCOperand::CreateReg(Mips::GP));
+  const MCSymbolRefExpr *HiSym = MCSymbolRefExpr::Create(
+      "_gp_disp", MCSymbolRefExpr::VK_Mips_ABS_HI, MCA.getContext());
+  TmpInst.addOperand(MCOperand::CreateExpr(HiSym));
+  getStreamer().EmitInstruction(TmpInst, STI);
+
+  TmpInst.clear();
+
+  TmpInst.setOpcode(Mips::ADDiu);
+  TmpInst.addOperand(MCOperand::CreateReg(Mips::GP));
+  TmpInst.addOperand(MCOperand::CreateReg(Mips::GP));
+  const MCSymbolRefExpr *LoSym = MCSymbolRefExpr::Create(
+      "_gp_disp", MCSymbolRefExpr::VK_Mips_ABS_LO, MCA.getContext());
+  TmpInst.addOperand(MCOperand::CreateExpr(LoSym));
+  getStreamer().EmitInstruction(TmpInst, STI);
+
+  TmpInst.clear();
+
+  TmpInst.setOpcode(Mips::ADDu);
+  TmpInst.addOperand(MCOperand::CreateReg(Mips::GP));
+  TmpInst.addOperand(MCOperand::CreateReg(Mips::GP));
+  TmpInst.addOperand(MCOperand::CreateReg(RegNo));
+  getStreamer().EmitInstruction(TmpInst, STI);
+}
+
+void MipsTargetELFStreamer::emitDirectiveCpsetup(unsigned RegNo,
+                                                 int RegOrOffset,
+                                                 const MCSymbol &Sym,
+                                                 bool IsReg) {
+  // Only N32 and N64 emit anything for .cpsetup iff PIC is set.
+  if (!Pic || !(isN32() || isN64()))
+    return;
+
+  MCAssembler &MCA = getStreamer().getAssembler();
+  MCInst Inst;
+
+  // Either store the old $gp in a register or on the stack
+  if (IsReg) {
+    // move $save, $gpreg
+    Inst.setOpcode(Mips::DADDu);
+    Inst.addOperand(MCOperand::CreateReg(RegOrOffset));
+    Inst.addOperand(MCOperand::CreateReg(Mips::GP));
+    Inst.addOperand(MCOperand::CreateReg(Mips::ZERO));
+  } else {
+    // sd $gpreg, offset($sp)
+    Inst.setOpcode(Mips::SD);
+    Inst.addOperand(MCOperand::CreateReg(Mips::GP));
+    Inst.addOperand(MCOperand::CreateReg(Mips::SP));
+    Inst.addOperand(MCOperand::CreateImm(RegOrOffset));
+  }
+  getStreamer().EmitInstruction(Inst, STI);
+  Inst.clear();
+
+  const MCSymbolRefExpr *HiExpr = MCSymbolRefExpr::Create(
+      Sym.getName(), MCSymbolRefExpr::VK_Mips_GPOFF_HI, MCA.getContext());
+  const MCSymbolRefExpr *LoExpr = MCSymbolRefExpr::Create(
+      Sym.getName(), MCSymbolRefExpr::VK_Mips_GPOFF_LO, MCA.getContext());
+  // lui $gp, %hi(%neg(%gp_rel(funcSym)))
+  Inst.setOpcode(Mips::LUi);
+  Inst.addOperand(MCOperand::CreateReg(Mips::GP));
+  Inst.addOperand(MCOperand::CreateExpr(HiExpr));
+  getStreamer().EmitInstruction(Inst, STI);
+  Inst.clear();
+
+  // addiu  $gp, $gp, %lo(%neg(%gp_rel(funcSym)))
+  Inst.setOpcode(Mips::ADDiu);
+  Inst.addOperand(MCOperand::CreateReg(Mips::GP));
+  Inst.addOperand(MCOperand::CreateReg(Mips::GP));
+  Inst.addOperand(MCOperand::CreateExpr(LoExpr));
+  getStreamer().EmitInstruction(Inst, STI);
+  Inst.clear();
+
+  // daddu  $gp, $gp, $funcreg
+  Inst.setOpcode(Mips::DADDu);
+  Inst.addOperand(MCOperand::CreateReg(Mips::GP));
+  Inst.addOperand(MCOperand::CreateReg(Mips::GP));
+  Inst.addOperand(MCOperand::CreateReg(RegNo));
+  getStreamer().EmitInstruction(Inst, STI);
+}
diff --git a/lib/Target/Mips/Makefile b/lib/Target/Mips/Makefile
index bcf951e..41efa47 100644
--- a/lib/Target/Mips/Makefile
+++ b/lib/Target/Mips/Makefile
@@ -13,7 +13,7 @@ TARGET = Mips
 
 # Make sure that tblgen is run, first thing.
 BUILT_SOURCES = MipsGenRegisterInfo.inc MipsGenInstrInfo.inc \
-                MipsGenAsmWriter.inc MipsGenCodeEmitter.inc \
+                MipsGenAsmWriter.inc MipsGenFastISel.inc MipsGenCodeEmitter.inc \
                 MipsGenDAGISel.inc MipsGenCallingConv.inc \
                 MipsGenSubtargetInfo.inc MipsGenMCCodeEmitter.inc \
                 MipsGenDisassemblerTables.inc \
diff --git a/lib/Target/Mips/MicroMipsInstrFPU.td b/lib/Target/Mips/MicroMipsInstrFPU.td
index 91d447a..d95f9b0 100644
--- a/lib/Target/Mips/MicroMipsInstrFPU.td
+++ b/lib/Target/Mips/MicroMipsInstrFPU.td
@@ -28,9 +28,9 @@ def LWXC1_MM : MMRel, LWXC1_FT<"lwxc1", FGR32Opnd, II_LWXC1, load>,
 def SWXC1_MM : MMRel, SWXC1_FT<"swxc1", FGR32Opnd, II_SWXC1, store>,
                SWXC1_FM_MM<0x88>;
 def LUXC1_MM : MMRel, LWXC1_FT<"luxc1", AFGR64Opnd, II_LUXC1>,
-               LWXC1_FM_MM<0x148>;
+               LWXC1_FM_MM<0x148>, INSN_MIPS5_32R2;
 def SUXC1_MM : MMRel, SWXC1_FT<"suxc1", AFGR64Opnd, II_SUXC1>,
-               SWXC1_FM_MM<0x188>;
+               SWXC1_FM_MM<0x188>, INSN_MIPS5_32R2;
 
 def FCMP_S32_MM : MMRel, CEQS_FT<"s", FGR32, II_C_CC_S, MipsFPCmp>,
                   CEQS_FM_MM<0>;
@@ -70,9 +70,9 @@ def FSQRT_MM : MMRel, ABSS_FT<"sqrt.d", AFGR64Opnd, AFGR64Opnd, II_SQRT_D,
                               fsqrt>, ROUND_W_FM_MM<1, 0x28>;
 
 def CVT_L_S_MM   : MMRel, ABSS_FT<"cvt.l.s", FGR64Opnd, FGR32Opnd, II_CVT>,
-                   ROUND_W_FM_MM<0, 0x4>;
+                   ROUND_W_FM_MM<0, 0x4>, INSN_MIPS3_32R2;
 def CVT_L_D64_MM : MMRel, ABSS_FT<"cvt.l.d", FGR64Opnd, FGR64Opnd, II_CVT>,
-                   ROUND_W_FM_MM<1, 0x4>;
+                   ROUND_W_FM_MM<1, 0x4>, INSN_MIPS3_32R2;
 
 def FABS_S_MM : MMRel, ABSS_FT<"abs.s", FGR32Opnd, FGR32Opnd, II_ABS, fabs>,
                 ABS_FM_MM<0, 0xd>;
@@ -95,7 +95,7 @@ def FNEG_MM : MMRel, ABSS_FT<"neg.d", AFGR64Opnd, AFGR64Opnd, II_NEG, fneg>,
               ABS_FM_MM<1, 0x2d>;
 
 def FMOV_D32_MM : MMRel, ABSS_FT<"mov.d", AFGR64Opnd, AFGR64Opnd, II_MOV_D>,
-                  ABS_FM_MM<1, 0x1>, Requires<[NotFP64bit, HasStdEnc]>;
+                  ABS_FM_MM<1, 0x1>, AdditionalRequires<[NotFP64bit]>;
 
 def MOVZ_I_S_MM : MMRel, CMov_I_F_FT<"movz.s", GPR32Opnd, FGR32Opnd,
                                      II_MOVZ_S>, CMov_I_F_FM_MM<0x78, 0>;
@@ -124,9 +124,9 @@ def MFC1_MM : MMRel, MFC1_FT<"mfc1", GPR32Opnd, FGR32Opnd,
 def MTC1_MM : MMRel, MTC1_FT<"mtc1", FGR32Opnd, GPR32Opnd,
                              II_MTC1, bitconvert>, MFC1_FM_MM<0xa0>;
 def MFHC1_MM : MMRel, MFC1_FT<"mfhc1", GPR32Opnd, FGRH32Opnd, II_MFHC1>,
-               MFC1_FM_MM<3>;
+               MFC1_FM_MM<3>, ISA_MIPS32R2;
 def MTHC1_MM : MMRel, MTC1_FT<"mthc1", FGRH32Opnd, GPR32Opnd, II_MTHC1>,
-               MFC1_FM_MM<7>;
+               MFC1_FM_MM<7>, ISA_MIPS32R2;
 
 def MADD_S_MM : MMRel, MADDS_FT<"madd.s", FGR32Opnd, II_MADD_S, fadd>,
                 MADDS_FM_MM<0x1>;
diff --git a/lib/Target/Mips/MicroMipsInstrInfo.td b/lib/Target/Mips/MicroMipsInstrInfo.td
index 3f13e83..9904bc6 100644
--- a/lib/Target/Mips/MicroMipsInstrInfo.td
+++ b/lib/Target/Mips/MicroMipsInstrInfo.td
@@ -218,15 +218,20 @@ let DecoderNamespace = "MicroMips", Predicates = [InMicroMips] in {
   def MSUBU_MM : MMRel, MArithR<"msubu", II_MSUBU>, MULT_FM_MM<0x3ec>;
 
   /// Count Leading
-  def CLZ_MM : MMRel, CountLeading0<"clz", GPR32Opnd>, CLO_FM_MM<0x16c>;
-  def CLO_MM : MMRel, CountLeading1<"clo", GPR32Opnd>, CLO_FM_MM<0x12c>;
+  def CLZ_MM : MMRel, CountLeading0<"clz", GPR32Opnd>, CLO_FM_MM<0x16c>,
+               ISA_MIPS32;
+  def CLO_MM : MMRel, CountLeading1<"clo", GPR32Opnd>, CLO_FM_MM<0x12c>,
+               ISA_MIPS32;
 
   /// Sign Ext In Register Instructions.
-  def SEB_MM : MMRel, SignExtInReg<"seb", i8, GPR32Opnd, II_SEB>, SEB_FM_MM<0x0ac>;
-  def SEH_MM : MMRel, SignExtInReg<"seh", i16, GPR32Opnd, II_SEH>, SEB_FM_MM<0x0ec>;
+  def SEB_MM : MMRel, SignExtInReg<"seb", i8, GPR32Opnd, II_SEB>,
+               SEB_FM_MM<0x0ac>, ISA_MIPS32R2;
+  def SEH_MM : MMRel, SignExtInReg<"seh", i16, GPR32Opnd, II_SEH>,
+               SEB_FM_MM<0x0ec>, ISA_MIPS32R2;
 
   /// Word Swap Bytes Within Halfwords
-  def WSBH_MM : MMRel, SubwordSwap<"wsbh", GPR32Opnd>, SEB_FM_MM<0x1ec>;
+  def WSBH_MM : MMRel, SubwordSwap<"wsbh", GPR32Opnd>, SEB_FM_MM<0x1ec>,
+                ISA_MIPS32R2;
 
   def EXT_MM : MMRel, ExtBase<"ext", GPR32Opnd, uimm5, MipsExt>,
                EXT_FM_MM<0x2c>;
@@ -268,8 +273,10 @@ let DecoderNamespace = "MicroMips", Predicates = [InMicroMips] in {
   def WAIT_MM    : WaitMM<"wait">, WAIT_FM_MM;
   def ERET_MM    : MMRel, ER_FT<"eret">, ER_FM_MM<0x3cd>;
   def DERET_MM   : MMRel, ER_FT<"deret">, ER_FM_MM<0x38d>;
-  def EI_MM      : MMRel, DEI_FT<"ei", GPR32Opnd>, EI_FM_MM<0x15d>;
-  def DI_MM      : MMRel, DEI_FT<"di", GPR32Opnd>, EI_FM_MM<0x11d>;
+  def EI_MM      : MMRel, DEI_FT<"ei", GPR32Opnd>, EI_FM_MM<0x15d>,
+                   ISA_MIPS32R2;
+  def DI_MM      : MMRel, DEI_FT<"di", GPR32Opnd>, EI_FM_MM<0x11d>,
+                   ISA_MIPS32R2;
 
   /// Trap Instructions
   def TEQ_MM  : MMRel, TEQ_FT<"teq", GPR32Opnd>, TEQ_FM_MM<0x0>;
@@ -296,5 +303,5 @@ let DecoderNamespace = "MicroMips", Predicates = [InMicroMips] in {
 //===----------------------------------------------------------------------===//
 
 let Predicates = [InMicroMips] in {
-  def : InstAlias<"wait", (WAIT_MM 0x0), 1>;
+  def : MipsInstAlias<"wait", (WAIT_MM 0x0), 1>;
 }
diff --git a/lib/Target/Mips/Mips.td b/lib/Target/Mips/Mips.td
index 10a4699..ea16331 100644
--- a/lib/Target/Mips/Mips.td
+++ b/lib/Target/Mips/Mips.td
@@ -15,6 +15,33 @@
 
 include "llvm/Target/Target.td"
 
+// The overall idea of the PredicateControl class is to chop the Predicates list
+// into subsets that are usually overridden independently. This allows
+// subclasses to partially override the predicates of their superclasses without
+// having to re-add all the existing predicates.
+class PredicateControl {
+  // Predicates for the encoding scheme in use such as HasStdEnc
+  list<Predicate> EncodingPredicates = [];
+  // Predicates for the GPR size such as IsGP64bit
+  list<Predicate> GPRPredicates = [];
+  // Predicates for the FGR size and layout such as IsFP64bit
+  list<Predicate> FGRPredicates = [];
+  // Predicates for the instruction group membership such as ISA's and ASE's
+  list<Predicate> InsnPredicates = [];
+  // Predicates for anything else
+  list<Predicate> AdditionalPredicates = [];
+  list<Predicate> Predicates = !listconcat(EncodingPredicates,
+                                           GPRPredicates,
+                                           FGRPredicates,
+                                           InsnPredicates,
+                                           AdditionalPredicates);
+}
+
+// Like Requires<> but for the AdditionalPredicates list
+class AdditionalRequires<list<Predicate> preds> {
+  list<Predicate> AdditionalPredicates = preds;
+}
+
 //===----------------------------------------------------------------------===//
 // Register File, Calling Conv, Instruction Descriptions
 //===----------------------------------------------------------------------===//
@@ -34,6 +61,8 @@ def FeatureGP64Bit     : SubtargetFeature<"gp64", "IsGP64bit", "true",
                                 "General Purpose Registers are 64-bit wide.">;
 def FeatureFP64Bit     : SubtargetFeature<"fp64", "IsFP64bit", "true",
                                 "Support 64-bit FP registers.">;
+def FeatureNaN2008     : SubtargetFeature<"nan2008", "IsNaN2008bit", "true",
+                                "IEEE 754-2008 NaN encoding.">;
 def FeatureSingleFloat : SubtargetFeature<"single-float", "IsSingleFloat",
                                 "true", "Only supports single precision float">;
 def FeatureO32         : SubtargetFeature<"o32", "MipsABI", "O32",
@@ -46,33 +75,62 @@ def FeatureEABI        : SubtargetFeature<"eabi", "MipsABI", "EABI",
                                 "Enable eabi ABI">;
 def FeatureVFPU        : SubtargetFeature<"vfpu", "HasVFPU",
                                 "true", "Enable vector FPU instructions.">;
-def FeatureSEInReg     : SubtargetFeature<"seinreg", "HasSEInReg", "true",
-                                "Enable 'signext in register' instructions.">;
-def FeatureCondMov     : SubtargetFeature<"condmov", "HasCondMov", "true",
-                                "Enable 'conditional move' instructions.">;
-def FeatureSwap        : SubtargetFeature<"swap", "HasSwap", "true",
-                                "Enable 'byte/half swap' instructions.">;
-def FeatureBitCount    : SubtargetFeature<"bitcount", "HasBitCount", "true",
-                                "Enable 'count leading bits' instructions.">;
-def FeatureFPIdx       : SubtargetFeature<"FPIdx", "HasFPIdx", "true",
-                                "Enable 'FP indexed load/store' instructions.">;
+def FeatureMips1       : SubtargetFeature<"mips1", "MipsArchVersion", "Mips1",
+                                "Mips I ISA Support [highly experimental]">;
+def FeatureMips2       : SubtargetFeature<"mips2", "MipsArchVersion", "Mips2",
+                                "Mips II ISA Support [highly experimental]",
+                                [FeatureMips1]>;
+def FeatureMips3_32    : SubtargetFeature<"mips3_32", "HasMips3_32", "true",
+                                "Subset of MIPS-III that is also in MIPS32 "
+                                "[highly experimental]">;
+def FeatureMips3_32r2  : SubtargetFeature<"mips3_32r2", "HasMips3_32r2", "true",
+                                "Subset of MIPS-III that is also in MIPS32r2 "
+                                "[highly experimental]">;
+def FeatureMips3       : SubtargetFeature<"mips3", "MipsArchVersion", "Mips3",
+                                "MIPS III ISA Support [highly experimental]",
+                                [FeatureMips2, FeatureMips3_32,
+                                 FeatureMips3_32r2, FeatureGP64Bit,
+                                 FeatureFP64Bit]>;
+def FeatureMips4_32    : SubtargetFeature<"mips4_32", "HasMips4_32", "true",
+                                "Subset of MIPS-IV that is also in MIPS32 "
+                                "[highly experimental]">;
+def FeatureMips4_32r2  : SubtargetFeature<"mips4_32r2", "HasMips4_32r2", "true",
+                                "Subset of MIPS-IV that is also in MIPS32r2 "
+                                "[highly experimental]">;
+def FeatureMips4       : SubtargetFeature<"mips4", "MipsArchVersion",
+                                "Mips4", "MIPS IV ISA Support",
+                                [FeatureMips3, FeatureMips4_32,
+                                 FeatureMips4_32r2]>;
+def FeatureMips5_32r2  : SubtargetFeature<"mips5_32r2", "HasMips5_32r2", "true",
+                                "Subset of MIPS-V that is also in MIPS32r2 "
+                                "[highly experimental]">;
+def FeatureMips5       : SubtargetFeature<"mips5", "MipsArchVersion", "Mips5",
+                                "MIPS V ISA Support [highly experimental]",
+                                [FeatureMips4, FeatureMips5_32r2]>;
 def FeatureMips32      : SubtargetFeature<"mips32", "MipsArchVersion", "Mips32",
                                 "Mips32 ISA Support",
-                                [FeatureCondMov, FeatureBitCount]>;
+                                [FeatureMips2, FeatureMips3_32,
+                                 FeatureMips4_32]>;
 def FeatureMips32r2    : SubtargetFeature<"mips32r2", "MipsArchVersion",
                                 "Mips32r2", "Mips32r2 ISA Support",
-                                [FeatureMips32, FeatureSEInReg, FeatureSwap,
-                                 FeatureFPIdx]>;
-def FeatureMips4       : SubtargetFeature<"mips4", "MipsArchVersion",
-                                "Mips4", "MIPS IV ISA Support",
-                                [FeatureGP64Bit, FeatureFP64Bit,
-                                 FeatureCondMov]>;
+                                [FeatureMips3_32r2, FeatureMips4_32r2,
+                                 FeatureMips5_32r2, FeatureMips32]>;
+def FeatureMips32r6    : SubtargetFeature<"mips32r6", "MipsArchVersion",
+                                "Mips32r6",
+                                "Mips32r6 ISA Support [experimental]",
+                                [FeatureMips32r2, FeatureFP64Bit,
+                                 FeatureNaN2008]>;
 def FeatureMips64      : SubtargetFeature<"mips64", "MipsArchVersion",
                                 "Mips64", "Mips64 ISA Support",
-                                [FeatureMips4, FeatureMips32, FeatureFPIdx]>;
+                                [FeatureMips5, FeatureMips32]>;
 def FeatureMips64r2    : SubtargetFeature<"mips64r2", "MipsArchVersion",
                                 "Mips64r2", "Mips64r2 ISA Support",
                                 [FeatureMips64, FeatureMips32r2]>;
+def FeatureMips64r6    : SubtargetFeature<"mips64r6", "MipsArchVersion",
+                                "Mips64r6",
+                                "Mips64r6 ISA Support [experimental]",
+                                [FeatureMips32r6, FeatureMips64r2,
+                                 FeatureNaN2008]>;
 
 def FeatureMips16  : SubtargetFeature<"mips16", "InMips16Mode", "true",
                                       "Mips16 mode">;
@@ -97,11 +155,18 @@ def FeatureCnMips     : SubtargetFeature<"cnmips", "HasCnMips",
 class Proc<string Name, list<SubtargetFeature> Features>
  : Processor<Name, MipsGenericItineraries, Features>;
 
+def : Proc<"mips1", [FeatureMips1, FeatureO32]>;
+def : Proc<"mips2", [FeatureMips2, FeatureO32]>;
 def : Proc<"mips32", [FeatureMips32, FeatureO32]>;
 def : Proc<"mips32r2", [FeatureMips32r2, FeatureO32]>;
+def : Proc<"mips32r6", [FeatureMips32r6, FeatureO32]>;
+
+def : Proc<"mips3", [FeatureMips3, FeatureN64]>;
 def : Proc<"mips4", [FeatureMips4, FeatureN64]>;
+def : Proc<"mips5", [FeatureMips5, FeatureN64]>;
 def : Proc<"mips64", [FeatureMips64, FeatureN64]>;
 def : Proc<"mips64r2", [FeatureMips64r2, FeatureN64]>;
+def : Proc<"mips64r6", [FeatureMips64r6, FeatureN64]>;
 def : Proc<"mips16", [FeatureMips16, FeatureO32]>;
 def : Proc<"octeon", [FeatureMips64r2, FeatureN64, FeatureCnMips]>;
 
diff --git a/lib/Target/Mips/Mips16FrameLowering.cpp b/lib/Target/Mips/Mips16FrameLowering.cpp
index 028b049..c01d03a 100644
--- a/lib/Target/Mips/Mips16FrameLowering.cpp
+++ b/lib/Target/Mips/Mips16FrameLowering.cpp
@@ -71,7 +71,7 @@ void Mips16FrameLowering::emitPrologue(MachineFunction &MF) const {
   }
   if (hasFP(MF))
     BuildMI(MBB, MBBI, dl, TII.get(Mips::MoveR3216), Mips::S0)
-      .addReg(Mips::SP);
+      .addReg(Mips::SP).setMIFlag(MachineInstr::FrameSetup);
 
 }
 
diff --git a/lib/Target/Mips/Mips16FrameLowering.h b/lib/Target/Mips/Mips16FrameLowering.h
index 8ce2ced..3f7829d 100644
--- a/lib/Target/Mips/Mips16FrameLowering.h
+++ b/lib/Target/Mips/Mips16FrameLowering.h
@@ -24,27 +24,27 @@ public:
 
   /// emitProlog/emitEpilog - These methods insert prolog and epilog code into
   /// the function.
-  void emitPrologue(MachineFunction &MF) const;
-  void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;
+  void emitPrologue(MachineFunction &MF) const override;
+  void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
 
   void eliminateCallFramePseudoInstr(MachineFunction &MF,
-                                     MachineBasicBlock &MBB,
-                                     MachineBasicBlock::iterator I) const;
+                                  MachineBasicBlock &MBB,
+                                  MachineBasicBlock::iterator I) const override;
 
   bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
                                  MachineBasicBlock::iterator MI,
                                  const std::vector<CalleeSavedInfo> &CSI,
-                                 const TargetRegisterInfo *TRI) const;
+                                 const TargetRegisterInfo *TRI) const override;
 
   bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
-                                   MachineBasicBlock::iterator MI,
-                                   const std::vector<CalleeSavedInfo> &CSI,
-                                   const TargetRegisterInfo *TRI) const;
+                                  MachineBasicBlock::iterator MI,
+                                  const std::vector<CalleeSavedInfo> &CSI,
+                                  const TargetRegisterInfo *TRI) const override;
 
-  bool hasReservedCallFrame(const MachineFunction &MF) const;
+  bool hasReservedCallFrame(const MachineFunction &MF) const override;
 
   void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
-                                            RegScavenger *RS) const;
+                                            RegScavenger *RS) const override;
 };
 
 } // End llvm namespace
diff --git a/lib/Target/Mips/Mips16HardFloat.cpp b/lib/Target/Mips/Mips16HardFloat.cpp
index d321e21..14055d6 100644
--- a/lib/Target/Mips/Mips16HardFloat.cpp
+++ b/lib/Target/Mips/Mips16HardFloat.cpp
@@ -11,7 +11,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "mips16-hard-float"
 #include "Mips16HardFloat.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Value.h"
@@ -20,6 +19,8 @@
 #include <algorithm>
 #include <string>
 
+#define DEBUG_TYPE "mips16-hard-float"
+
 static void inlineAsmOut
   (LLVMContext &C, StringRef AsmString, BasicBlock *BB ) {
   std::vector<llvm::Type *> AsmArgTypes;
@@ -354,9 +355,8 @@ static const char *IntrinsicInline[] =
   };
 
 static bool isIntrinsicInline(Function *F) {
-  return std::binary_search(
-    IntrinsicInline, array_endof(IntrinsicInline),
-    F->getName());
+  return std::binary_search(std::begin(IntrinsicInline),
+                            std::end(IntrinsicInline), F->getName());
 }
 //
 // Returns of float, double and complex need to be handled with a helper
@@ -407,11 +407,11 @@ static bool fixupFPReturnAndCall
         CallInst::Create(F, Params, "", &Inst );
       } else if (const CallInst *CI = dyn_cast<CallInst>(I)) {
           const Value* V = CI->getCalledValue();
-          const Type* T = 0;
+          const Type* T = nullptr;
           if (V) T = V->getType();
-          const PointerType *PFT=0;
+          const PointerType *PFT=nullptr;
           if (T) PFT = dyn_cast<PointerType>(T);
-          const FunctionType *FT=0;
+          const FunctionType *FT=nullptr;
           if (PFT) FT = dyn_cast<FunctionType>(PFT->getElementType());
           Function *F_ =  CI->getCalledFunction();
           if (FT && needsFPReturnHelper(*FT) &&
diff --git a/lib/Target/Mips/Mips16HardFloat.h b/lib/Target/Mips/Mips16HardFloat.h
index b7f712a..826887e 100644
--- a/lib/Target/Mips/Mips16HardFloat.h
+++ b/lib/Target/Mips/Mips16HardFloat.h
@@ -34,11 +34,11 @@ public:
     TM(TM_), Subtarget(TM.getSubtarget<MipsSubtarget>()) {
   }
 
-  virtual const char *getPassName() const {
+  const char *getPassName() const override {
     return "MIPS16 Hard Float Pass";
   }
 
-  virtual bool runOnModule(Module &M);
+  bool runOnModule(Module &M) override;
 
 protected:
   /// Keep a pointer to the MipsSubtarget around so that we can make the right
diff --git a/lib/Target/Mips/Mips16HardFloatInfo.cpp b/lib/Target/Mips/Mips16HardFloatInfo.cpp
index d8b685e..2eb6e5d 100644
--- a/lib/Target/Mips/Mips16HardFloatInfo.cpp
+++ b/lib/Target/Mips/Mips16HardFloatInfo.cpp
@@ -30,7 +30,7 @@ const FuncNameSignature PredefinedFuncs[] = {
   { "__fixunssfsi", { FSig, NoFPRet } },
   { "__fixunssfdi", { FSig, NoFPRet } },
   { "__floatundisf", { NoSig, FRet } },
-  { 0, { NoSig, NoFPRet } }
+  { nullptr, { NoSig, NoFPRet } }
 };
 
 // just do a search for now. there are very few of these special cases.
@@ -44,7 +44,7 @@ extern FuncSignature const *findFuncSignature(const char *name) {
       return &PredefinedFuncs[i].Signature;
     i++;
   }
-  return 0;
+  return nullptr;
 }
 }
 }
diff --git a/lib/Target/Mips/Mips16ISelDAGToDAG.cpp b/lib/Target/Mips/Mips16ISelDAGToDAG.cpp
index 9e36546..4e86a27 100644
--- a/lib/Target/Mips/Mips16ISelDAGToDAG.cpp
+++ b/lib/Target/Mips/Mips16ISelDAGToDAG.cpp
@@ -11,7 +11,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "mips-isel"
 #include "Mips16ISelDAGToDAG.h"
 #include "MCTargetDesc/MipsBaseInfo.h"
 #include "Mips.h"
@@ -35,6 +34,8 @@
 #include "llvm/Target/TargetMachine.h"
 using namespace llvm;
 
+#define DEBUG_TYPE "mips-isel"
+
 bool Mips16DAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
   if (!Subtarget.inMips16Mode())
     return false;
@@ -44,7 +45,7 @@ bool Mips16DAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
 std::pair<SDNode*, SDNode*>
 Mips16DAGToDAGISel::selectMULT(SDNode *N, unsigned Opc, SDLoc DL, EVT Ty,
                                bool HasLo, bool HasHi) {
-  SDNode *Lo = 0, *Hi = 0;
+  SDNode *Lo = nullptr, *Hi = nullptr;
   SDNode *Mul = CurDAG->getMachineNode(Opc, DL, MVT::Glue, N->getOperand(0),
                                        N->getOperand(1));
   SDValue InFlag = SDValue(Mul, 0);
@@ -224,10 +225,12 @@ bool Mips16DAGToDAGISel::selectAddr16(
     // If an indexed floating point load/store can be emitted, return false.
     const LSBaseSDNode *LS = dyn_cast<LSBaseSDNode>(Parent);
 
-    if (LS &&
-        (LS->getMemoryVT() == MVT::f32 || LS->getMemoryVT() == MVT::f64) &&
-        Subtarget.hasFPIdx())
-      return false;
+    if (LS) {
+      if (LS->getMemoryVT() == MVT::f32 && Subtarget.hasMips4_32r2())
+        return false;
+      if (LS->getMemoryVT() == MVT::f64 && Subtarget.hasMips4_32r2())
+        return false;
+    }
   }
   Base   = Addr;
   Offset = CurDAG->getTargetConstant(0, ValTy);
@@ -297,7 +300,7 @@ std::pair<bool, SDNode*> Mips16DAGToDAGISel::selectNode(SDNode *Node) {
     if (!SDValue(Node, 1).use_empty())
       ReplaceUses(SDValue(Node, 1), SDValue(LoHi.second, 0));
 
-    return std::make_pair(true, (SDNode*)NULL);
+    return std::make_pair(true, nullptr);
   }
 
   case ISD::MULHS:
@@ -308,7 +311,7 @@ std::pair<bool, SDNode*> Mips16DAGToDAGISel::selectNode(SDNode *Node) {
   }
   }
 
-  return std::make_pair(false, (SDNode*)NULL);
+  return std::make_pair(false, nullptr);
 }
 
 FunctionPass *llvm::createMips16ISelDag(MipsTargetMachine &TM) {
diff --git a/lib/Target/Mips/Mips16ISelDAGToDAG.h b/lib/Target/Mips/Mips16ISelDAGToDAG.h
index 49dc6e5..e653b39 100644
--- a/lib/Target/Mips/Mips16ISelDAGToDAG.h
+++ b/lib/Target/Mips/Mips16ISelDAGToDAG.h
@@ -28,16 +28,16 @@ private:
 
   SDValue getMips16SPAliasReg();
 
-  virtual bool runOnMachineFunction(MachineFunction &MF);
+  bool runOnMachineFunction(MachineFunction &MF) override;
 
   void getMips16SPRefReg(SDNode *Parent, SDValue &AliasReg);
 
-  virtual bool selectAddr16(SDNode *Parent, SDValue N, SDValue &Base,
-                            SDValue &Offset, SDValue &Alias);
+  bool selectAddr16(SDNode *Parent, SDValue N, SDValue &Base,
+                    SDValue &Offset, SDValue &Alias) override;
 
-  virtual std::pair<bool, SDNode*> selectNode(SDNode *Node);
+  std::pair<bool, SDNode*> selectNode(SDNode *Node) override;
 
-  virtual void processFunctionAfterISel(MachineFunction &MF);
+  void processFunctionAfterISel(MachineFunction &MF) override;
 
   // Insert instructions to initialize the global base register in the
   // first MBB of the function.
diff --git a/lib/Target/Mips/Mips16ISelLowering.cpp b/lib/Target/Mips/Mips16ISelLowering.cpp
index 5c6f302..9102450 100644
--- a/lib/Target/Mips/Mips16ISelLowering.cpp
+++ b/lib/Target/Mips/Mips16ISelLowering.cpp
@@ -10,7 +10,6 @@
 // Subclass of MipsTargetLowering specialized for mips16.
 //
 //===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "mips-lower"
 #include "Mips16ISelLowering.h"
 #include "MCTargetDesc/MipsBaseInfo.h"
 #include "MipsRegisterInfo.h"
@@ -23,6 +22,8 @@
 
 using namespace llvm;
 
+#define DEBUG_TYPE "mips-lower"
+
 static cl::opt<bool> DontExpandCondPseudos16(
   "mips16-dont-expand-cond-pseudo",
   cl::init(false),
@@ -353,7 +354,7 @@ unsigned int Mips16TargetLowering::getMips16HelperFunctionStubNumber
 #define T P "0" , T1
 #define P P_
 static char const * vMips16Helper[MAX_STUB_NUMBER+1] =
-  {0, T1 };
+  {nullptr, T1 };
 #undef P
 #define P P_ "sf_"
 static char const * sfMips16Helper[MAX_STUB_NUMBER+1] =
@@ -430,7 +431,7 @@ getOpndList(SmallVectorImpl<SDValue> &Ops,
   SelectionDAG &DAG = CLI.DAG;
   MachineFunction &MF = DAG.getMachineFunction();
   MipsFunctionInfo *FuncInfo = MF.getInfo<MipsFunctionInfo>();
-  const char* Mips16HelperFunction = 0;
+  const char* Mips16HelperFunction = nullptr;
   bool NeedMips16Helper = false;
 
   if (Subtarget->inMips16HardFloat()) {
@@ -443,8 +444,8 @@ getOpndList(SmallVectorImpl<SDValue> &Ops,
     if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(CLI.Callee)) {
       Mips16Libcall Find = { RTLIB::UNKNOWN_LIBCALL, S->getSymbol() };
 
-      if (std::binary_search(HardFloatLibCalls, array_endof(HardFloatLibCalls),
-                             Find))
+      if (std::binary_search(std::begin(HardFloatLibCalls),
+                             std::end(HardFloatLibCalls), Find))
         LookupHelper = false;
       else {
         const char *Symbol = S->getSymbol();
@@ -471,13 +472,12 @@ getOpndList(SmallVectorImpl<SDValue> &Ops,
             FuncInfo->setSaveS2();
         }
         // one more look at list of intrinsics
-        if (std::binary_search(Mips16IntrinsicHelper,
-            array_endof(Mips16IntrinsicHelper),
-                                     IntrinsicFind)) {
-          const Mips16IntrinsicHelperType *h =(std::find(Mips16IntrinsicHelper,
-              array_endof(Mips16IntrinsicHelper),
-                                       IntrinsicFind));
-          Mips16HelperFunction = h->Helper;
+        const Mips16IntrinsicHelperType *Helper =
+            std::lower_bound(std::begin(Mips16IntrinsicHelper),
+                             std::end(Mips16IntrinsicHelper), IntrinsicFind);
+        if (Helper != std::end(Mips16IntrinsicHelper) &&
+            *Helper == IntrinsicFind) {
+          Mips16HelperFunction = Helper->Helper;
           NeedMips16Helper = true;
           LookupHelper = false;
         }
@@ -488,13 +488,13 @@ getOpndList(SmallVectorImpl<SDValue> &Ops,
       Mips16Libcall Find = { RTLIB::UNKNOWN_LIBCALL,
                              G->getGlobal()->getName().data() };
 
-      if (std::binary_search(HardFloatLibCalls, array_endof(HardFloatLibCalls),
-                             Find))
+      if (std::binary_search(std::begin(HardFloatLibCalls),
+                             std::end(HardFloatLibCalls), Find))
         LookupHelper = false;
     }
-    if (LookupHelper) Mips16HelperFunction =
-      getMips16HelperFunction(CLI.RetTy, CLI.Args, NeedMips16Helper);
-
+    if (LookupHelper)
+      Mips16HelperFunction =
+        getMips16HelperFunction(CLI.RetTy, CLI.getArgs(), NeedMips16Helper);
   }
 
   SDValue JumpTarget = Callee;
diff --git a/lib/Target/Mips/Mips16ISelLowering.h b/lib/Target/Mips/Mips16ISelLowering.h
index 618ec90..df88333 100644
--- a/lib/Target/Mips/Mips16ISelLowering.h
+++ b/lib/Target/Mips/Mips16ISelLowering.h
@@ -21,17 +21,17 @@ namespace llvm {
   public:
     explicit Mips16TargetLowering(MipsTargetMachine &TM);
 
-    virtual bool allowsUnalignedMemoryAccesses(EVT VT, unsigned AddrSpace,
-                                               bool *Fast) const;
+    bool allowsUnalignedMemoryAccesses(EVT VT, unsigned AddrSpace,
+                                       bool *Fast) const override;
 
-    virtual MachineBasicBlock *
-    EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *MBB) const;
+    MachineBasicBlock *
+    EmitInstrWithCustomInserter(MachineInstr *MI,
+                                MachineBasicBlock *MBB) const override;
 
   private:
-    virtual bool
-    isEligibleForTailCallOptimization(const MipsCC &MipsCCInfo,
-                                      unsigned NextStackOffset,
-                                      const MipsFunctionInfo& FI) const;
+    bool isEligibleForTailCallOptimization(const MipsCC &MipsCCInfo,
+                                     unsigned NextStackOffset,
+                                     const MipsFunctionInfo& FI) const override;
 
     void setMips16HardFloatLibCalls();
 
@@ -41,11 +41,12 @@ namespace llvm {
     const char *getMips16HelperFunction
       (Type* RetTy, ArgListTy &Args, bool &needHelper) const;
 
-    virtual void
+    void
     getOpndList(SmallVectorImpl<SDValue> &Ops,
                 std::deque< std::pair<unsigned, SDValue> > &RegsToPass,
                 bool IsPICCall, bool GlobalOrExternal, bool InternalLinkage,
-                CallLoweringInfo &CLI, SDValue Callee, SDValue Chain) const;
+                CallLoweringInfo &CLI, SDValue Callee,
+                SDValue Chain) const override;
 
     MachineBasicBlock *emitSel16(unsigned Opc, MachineInstr *MI,
                                  MachineBasicBlock *BB) const;
diff --git a/lib/Target/Mips/Mips16InstrInfo.cpp b/lib/Target/Mips/Mips16InstrInfo.cpp
index 43c2fbd..79607de 100644
--- a/lib/Target/Mips/Mips16InstrInfo.cpp
+++ b/lib/Target/Mips/Mips16InstrInfo.cpp
@@ -29,6 +29,7 @@
 
 using namespace llvm;
 
+#define DEBUG_TYPE "mips16-instrinfo"
 
 Mips16InstrInfo::Mips16InstrInfo(MipsTargetMachine &tm)
   : MipsInstrInfo(tm, Mips::Bimm16),
diff --git a/lib/Target/Mips/Mips16InstrInfo.h b/lib/Target/Mips/Mips16InstrInfo.h
index e93925c..0dc0046 100644
--- a/lib/Target/Mips/Mips16InstrInfo.h
+++ b/lib/Target/Mips/Mips16InstrInfo.h
@@ -25,46 +25,46 @@ class Mips16InstrInfo : public MipsInstrInfo {
 public:
   explicit Mips16InstrInfo(MipsTargetMachine &TM);
 
-  virtual const MipsRegisterInfo &getRegisterInfo() const;
+  const MipsRegisterInfo &getRegisterInfo() const override;
 
   /// isLoadFromStackSlot - If the specified machine instruction is a direct
   /// load from a stack slot, return the virtual or physical register number of
   /// the destination along with the FrameIndex of the loaded stack slot.  If
   /// not, return 0.  This predicate must return 0 if the instruction has
   /// any side effects other than loading from the stack slot.
-  virtual unsigned isLoadFromStackSlot(const MachineInstr *MI,
-                                       int &FrameIndex) const;
+  unsigned isLoadFromStackSlot(const MachineInstr *MI,
+                               int &FrameIndex) const override;
 
   /// isStoreToStackSlot - If the specified machine instruction is a direct
   /// store to a stack slot, return the virtual or physical register number of
   /// the source reg along with the FrameIndex of the loaded stack slot.  If
   /// not, return 0.  This predicate must return 0 if the instruction has
   /// any side effects other than storing to the stack slot.
-  virtual unsigned isStoreToStackSlot(const MachineInstr *MI,
-                                      int &FrameIndex) const;
+  unsigned isStoreToStackSlot(const MachineInstr *MI,
+                              int &FrameIndex) const override;
 
-  virtual void copyPhysReg(MachineBasicBlock &MBB,
-                           MachineBasicBlock::iterator MI, DebugLoc DL,
-                           unsigned DestReg, unsigned SrcReg,
-                           bool KillSrc) const;
+  void copyPhysReg(MachineBasicBlock &MBB,
+                   MachineBasicBlock::iterator MI, DebugLoc DL,
+                   unsigned DestReg, unsigned SrcReg,
+                   bool KillSrc) const override;
 
-  virtual void storeRegToStack(MachineBasicBlock &MBB,
-                               MachineBasicBlock::iterator MBBI,
-                               unsigned SrcReg, bool isKill, int FrameIndex,
-                               const TargetRegisterClass *RC,
-                               const TargetRegisterInfo *TRI,
-                               int64_t Offset) const;
+  void storeRegToStack(MachineBasicBlock &MBB,
+                       MachineBasicBlock::iterator MBBI,
+                       unsigned SrcReg, bool isKill, int FrameIndex,
+                       const TargetRegisterClass *RC,
+                       const TargetRegisterInfo *TRI,
+                       int64_t Offset) const override;
 
-  virtual void loadRegFromStack(MachineBasicBlock &MBB,
-                                MachineBasicBlock::iterator MBBI,
-                                unsigned DestReg, int FrameIndex,
-                                const TargetRegisterClass *RC,
-                                const TargetRegisterInfo *TRI,
-                                int64_t Offset) const;
+  void loadRegFromStack(MachineBasicBlock &MBB,
+                        MachineBasicBlock::iterator MBBI,
+                        unsigned DestReg, int FrameIndex,
+                        const TargetRegisterClass *RC,
+                        const TargetRegisterInfo *TRI,
+                        int64_t Offset) const override;
 
-  virtual bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const;
+  bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override;
 
-  virtual unsigned getOppositeBranchOpc(unsigned Opc) const;
+  unsigned getOppositeBranchOpc(unsigned Opc) const override;
 
   // Adjust SP by FrameSize bytes. Save RA, S0, S1
   void makeFrame(unsigned SP, int64_t FrameSize, MachineBasicBlock &MBB,
@@ -104,9 +104,9 @@ public:
     (MachineBasicBlock &MBB, MachineBasicBlock::iterator I, int64_t Imm) const;
 
   unsigned getInlineAsmLength(const char *Str,
-                              const MCAsmInfo &MAI) const;
+                              const MCAsmInfo &MAI) const override;
 private:
-  virtual unsigned getAnalyzableBrOpc(unsigned Opc) const;
+  unsigned getAnalyzableBrOpc(unsigned Opc) const override;
 
   void ExpandRetRA16(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
                    unsigned Opc) const;
diff --git a/lib/Target/Mips/Mips16RegisterInfo.cpp b/lib/Target/Mips/Mips16RegisterInfo.cpp
index 3a50ed9..dbee774 100644
--- a/lib/Target/Mips/Mips16RegisterInfo.cpp
+++ b/lib/Target/Mips/Mips16RegisterInfo.cpp
@@ -39,6 +39,8 @@
 
 using namespace llvm;
 
+#define DEBUG_TYPE "mips16-registerinfo"
+
 Mips16RegisterInfo::Mips16RegisterInfo(const MipsSubtarget &ST)
   : MipsRegisterInfo(ST) {}
 
diff --git a/lib/Target/Mips/Mips16RegisterInfo.h b/lib/Target/Mips/Mips16RegisterInfo.h
index 13e82a3..f59f1a7 100644
--- a/lib/Target/Mips/Mips16RegisterInfo.h
+++ b/lib/Target/Mips/Mips16RegisterInfo.h
@@ -23,24 +23,24 @@ class Mips16RegisterInfo : public MipsRegisterInfo {
 public:
   Mips16RegisterInfo(const MipsSubtarget &Subtarget);
 
-  bool requiresRegisterScavenging(const MachineFunction &MF) const;
+  bool requiresRegisterScavenging(const MachineFunction &MF) const override;
 
-  bool requiresFrameIndexScavenging(const MachineFunction &MF) const;
+  bool requiresFrameIndexScavenging(const MachineFunction &MF) const override;
 
-  bool useFPForScavengingIndex(const MachineFunction &MF) const;
+  bool useFPForScavengingIndex(const MachineFunction &MF) const override;
 
   bool saveScavengerRegister(MachineBasicBlock &MBB,
                                      MachineBasicBlock::iterator I,
                                      MachineBasicBlock::iterator &UseMI,
                                      const TargetRegisterClass *RC,
-                                     unsigned Reg) const;
+                                     unsigned Reg) const override;
 
-  virtual const TargetRegisterClass *intRegClass(unsigned Size) const;
+  const TargetRegisterClass *intRegClass(unsigned Size) const override;
 
 private:
-  virtual void eliminateFI(MachineBasicBlock::iterator II, unsigned OpNo,
-                           int FrameIndex, uint64_t StackSize,
-                           int64_t SPOffset) const;
+  void eliminateFI(MachineBasicBlock::iterator II, unsigned OpNo,
+                   int FrameIndex, uint64_t StackSize,
+                   int64_t SPOffset) const override;
 };
 
 } // end namespace llvm
diff --git a/lib/Target/Mips/Mips32r6InstrFormats.td b/lib/Target/Mips/Mips32r6InstrFormats.td
new file mode 100644
index 0000000..a3f9df5
--- /dev/null
+++ b/lib/Target/Mips/Mips32r6InstrFormats.td
@@ -0,0 +1,386 @@
+//=- Mips32r6InstrFormats.td - Mips32r6 Instruction Formats -*- tablegen -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes Mips32r6 instruction formats.
+//
+//===----------------------------------------------------------------------===//
+
+class MipsR6Inst : MipsInst<(outs), (ins), "", [], NoItinerary, FrmOther>,
+                   PredicateControl {
+  let DecoderNamespace = "Mips32r6_64r6";
+  let EncodingPredicates = [HasStdEnc];
+}
+
+//===----------------------------------------------------------------------===//
+//
+// Field Values
+//
+//===----------------------------------------------------------------------===//
+
+class OPGROUP<bits<6> Val> {
+  bits<6> Value = Val;
+}
+def OPGROUP_COP1     : OPGROUP<0b010001>;
+def OPGROUP_COP2     : OPGROUP<0b010010>;
+def OPGROUP_ADDI     : OPGROUP<0b001000>;
+def OPGROUP_AUI      : OPGROUP<0b001111>;
+def OPGROUP_BLEZ     : OPGROUP<0b000110>;
+def OPGROUP_BGTZ     : OPGROUP<0b000111>;
+def OPGROUP_BLEZL    : OPGROUP<0b010110>;
+def OPGROUP_BGTZL    : OPGROUP<0b010111>;
+def OPGROUP_DADDI    : OPGROUP<0b011000>;
+def OPGROUP_DAUI     : OPGROUP<0b011101>;
+def OPGROUP_PCREL    : OPGROUP<0b111011>;
+def OPGROUP_REGIMM   : OPGROUP<0b000001>;
+def OPGROUP_SPECIAL  : OPGROUP<0b000000>;
+def OPGROUP_SPECIAL3 : OPGROUP<0b011111>;
+
+class OPCODE2<bits<2> Val> {
+  bits<2> Value = Val;
+}
+def OPCODE2_ADDIUPC : OPCODE2<0b00>;
+def OPCODE2_LWPC    : OPCODE2<0b01>;
+def OPCODE2_LWUPC   : OPCODE2<0b10>;
+
+class OPCODE5<bits<5> Val> {
+  bits<5> Value = Val;
+}
+def OPCODE5_ALUIPC : OPCODE5<0b11111>;
+def OPCODE5_AUIPC  : OPCODE5<0b11110>;
+def OPCODE5_DAHI : OPCODE5<0b00110>;
+def OPCODE5_DATI : OPCODE5<0b11110>;
+def OPCODE5_BC1EQZ : OPCODE5<0b01001>;
+def OPCODE5_BC1NEZ : OPCODE5<0b01101>;
+def OPCODE5_BC2EQZ : OPCODE5<0b01001>;
+def OPCODE5_BC2NEZ : OPCODE5<0b01101>;
+
+class OPCODE6<bits<6> Val> {
+  bits<6> Value = Val;
+}
+def OPCODE6_ALIGN    : OPCODE6<0b100000>;
+def OPCODE6_DALIGN   : OPCODE6<0b100100>;
+def OPCODE6_BITSWAP  : OPCODE6<0b100000>;
+def OPCODE6_DBITSWAP : OPCODE6<0b100100>;
+
+class FIELD_FMT<bits<5> Val> {
+  bits<5> Value = Val;
+}
+def FIELD_FMT_S : FIELD_FMT<0b10000>;
+def FIELD_FMT_D : FIELD_FMT<0b10001>;
+
+class FIELD_CMP_COND<bits<5> Val> {
+  bits<5> Value = Val;
+}
+def FIELD_CMP_COND_F    : FIELD_CMP_COND<0b00000>;
+def FIELD_CMP_COND_UN   : FIELD_CMP_COND<0b00001>;
+def FIELD_CMP_COND_EQ   : FIELD_CMP_COND<0b00010>;
+def FIELD_CMP_COND_UEQ  : FIELD_CMP_COND<0b00011>;
+def FIELD_CMP_COND_OLT  : FIELD_CMP_COND<0b00100>;
+def FIELD_CMP_COND_ULT  : FIELD_CMP_COND<0b00101>;
+def FIELD_CMP_COND_OLE  : FIELD_CMP_COND<0b00110>;
+def FIELD_CMP_COND_ULE  : FIELD_CMP_COND<0b00111>;
+def FIELD_CMP_COND_SF   : FIELD_CMP_COND<0b01000>;
+def FIELD_CMP_COND_NGLE : FIELD_CMP_COND<0b01001>;
+def FIELD_CMP_COND_SEQ  : FIELD_CMP_COND<0b01010>;
+def FIELD_CMP_COND_NGL  : FIELD_CMP_COND<0b01011>;
+def FIELD_CMP_COND_LT   : FIELD_CMP_COND<0b01100>;
+def FIELD_CMP_COND_NGE  : FIELD_CMP_COND<0b01101>;
+def FIELD_CMP_COND_LE   : FIELD_CMP_COND<0b01110>;
+def FIELD_CMP_COND_NGT  : FIELD_CMP_COND<0b01111>;
+
+class FIELD_CMP_FORMAT<bits<5> Val> {
+  bits<5> Value = Val;
+}
+def FIELD_CMP_FORMAT_S : FIELD_CMP_FORMAT<0b10100>;
+def FIELD_CMP_FORMAT_D : FIELD_CMP_FORMAT<0b10101>;
+
+//===----------------------------------------------------------------------===//
+//
+// Disambiguators
+//
+//===----------------------------------------------------------------------===//
+//
+// Some encodings are ambiguous except by comparing field values.
+
+class DecodeDisambiguates<string Name> {
+  string DecoderMethod = !strconcat("Decode", Name);
+}
+
+class DecodeDisambiguatedBy<string Name> : DecodeDisambiguates<Name> {
+  string DecoderNamespace = "Mips32r6_64r6_Ambiguous";
+}
+
+//===----------------------------------------------------------------------===//
+//
+// Encoding Formats
+//
+//===----------------------------------------------------------------------===//
+
+class AUI_FM : MipsR6Inst {
+  bits<5> rs;
+  bits<5> rt;
+  bits<16> imm;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = OPGROUP_AUI.Value;
+  let Inst{25-21} = rs;
+  let Inst{20-16} = rt;
+  let Inst{15-0} = imm;
+}
+
+class DAUI_FM : AUI_FM {
+  let Inst{31-26} = OPGROUP_DAUI.Value;
+}
+
+class COP1_2R_FM<bits<6> funct, FIELD_FMT Format> : MipsR6Inst {
+  bits<5> fs;
+  bits<5> fd;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = OPGROUP_COP1.Value;
+  let Inst{25-21} = Format.Value;
+  let Inst{20-16} = 0b00000;
+  let Inst{15-11} = fs;
+  let Inst{10-6}  = fd;
+  let Inst{5-0}   = funct;
+}
+
+class COP1_3R_FM<bits<6> funct, FIELD_FMT Format> : MipsR6Inst {
+  bits<5> ft;
+  bits<5> fs;
+  bits<5> fd;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = OPGROUP_COP1.Value;
+  let Inst{25-21} = Format.Value;
+  let Inst{20-16} = ft;
+  let Inst{15-11} = fs;
+  let Inst{10-6} = fd;
+  let Inst{5-0} = funct;
+}
+
+class COP1_BCCZ_FM<OPCODE5 Operation> : MipsR6Inst {
+  bits<5> ft;
+  bits<16> offset;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = OPGROUP_COP1.Value;
+  let Inst{25-21} = Operation.Value;
+  let Inst{20-16} = ft;
+  let Inst{15-0} = offset;
+}
+
+class COP2_BCCZ_FM<OPCODE5 Operation> : MipsR6Inst {
+  bits<5> ct;
+  bits<16> offset;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = OPGROUP_COP2.Value;
+  let Inst{25-21} = Operation.Value;
+  let Inst{20-16} = ct;
+  let Inst{15-0} = offset;
+}
+
+class PCREL16_FM<OPCODE5 Operation> : MipsR6Inst {
+  bits<5> rs;
+  bits<16> imm;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = OPGROUP_PCREL.Value;
+  let Inst{25-21} = rs;
+  let Inst{20-16} = Operation.Value;
+  let Inst{15-0} = imm;
+}
+
+class PCREL19_FM<OPCODE2 Operation> : MipsR6Inst {
+  bits<5> rs;
+  bits<19> imm;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = OPGROUP_PCREL.Value;
+  let Inst{25-21} = rs;
+  let Inst{20-19} = Operation.Value;
+  let Inst{18-0} = imm;
+}
+
+class SPECIAL3_2R_FM<OPCODE6 Operation> : MipsR6Inst {
+  bits<5> rd;
+  bits<5> rt;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = OPGROUP_SPECIAL3.Value;
+  let Inst{25-21} = 0b00000;
+  let Inst{20-16} = rt;
+  let Inst{15-11} = rd;
+  let Inst{10-6}  = 0b00000;
+  let Inst{5-0}   = Operation.Value;
+}
+
+class SPECIAL_3R_FM<bits<5> mulop, bits<6> funct> : MipsR6Inst {
+  bits<5> rd;
+  bits<5> rs;
+  bits<5> rt;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = OPGROUP_SPECIAL.Value;
+  let Inst{25-21} = rs;
+  let Inst{20-16} = rt;
+  let Inst{15-11} = rd;
+  let Inst{10-6}  = mulop;
+  let Inst{5-0}   = funct;
+}
+
+// This class is ambiguous with other branches:
+//   BEQC/BNEC require that rs > rt
+class CMP_BRANCH_2R_OFF16_FM<OPGROUP funct> : MipsR6Inst {
+  bits<5> rs;
+  bits<5> rt;
+  bits<16> offset;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = funct.Value;
+  let Inst{25-21} = rs;
+  let Inst{20-16} = rt;
+  let Inst{15-0} = offset;
+}
+
+// This class is ambiguous with other branches:
+//   BLEZC/BGEZC/BEQZALC/BNEZALC/BGTZALC require that rs == 0 && rt != 0
+// The '1R_RT' in the name means 1 register in the rt field.
+class CMP_BRANCH_1R_RT_OFF16_FM<OPGROUP funct> : MipsR6Inst {
+  bits<5> rt;
+  bits<16> offset;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = funct.Value;
+  let Inst{25-21} = 0b00000;
+  let Inst{20-16} = rt;
+  let Inst{15-0} = offset;
+}
+
+// This class is ambiguous with other branches:
+//   BLTZC/BGTZC/BLTZALC/BGEZALC require that rs == rt && rt != 0
+// The '1R_BOTH' in the name means 1 register in both the rs and rt fields.
+class CMP_BRANCH_1R_BOTH_OFF16_FM<OPGROUP funct> : MipsR6Inst {
+  bits<5> rt;
+  bits<16> offset;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = funct.Value;
+  let Inst{25-21} = rt;
+  let Inst{20-16} = rt;
+  let Inst{15-0} = offset;
+}
+
+class CMP_BRANCH_OFF21_FM<bits<6> funct> : MipsR6Inst {
+  bits<5> rs; // rs != 0
+  bits<21> offset;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = funct;
+  let Inst{25-21} = rs;
+  let Inst{20-0} = offset;
+}
+
+class JMP_IDX_COMPACT_FM<bits<6> funct> : MipsR6Inst {
+  bits<5> rt;
+  bits<16> offset;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = funct;
+  let Inst{25-21} = 0b000000;
+  let Inst{20-16} = rt;
+  let Inst{15-0} = offset;
+}
+
+class BRANCH_OFF26_FM<bits<6> funct> : MipsR6Inst {
+  bits<32> Inst;
+  bits<26> offset;
+
+  let Inst{31-26} = funct;
+  let Inst{25-0} = offset;
+}
+
+class SPECIAL3_ALIGN_FM<OPCODE6 Operation> : MipsR6Inst {
+  bits<5> rd;
+  bits<5> rs;
+  bits<5> rt;
+  bits<2> bp;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = OPGROUP_SPECIAL3.Value;
+  let Inst{25-21} = rs;
+  let Inst{20-16} = rt;
+  let Inst{15-11} = rd;
+  let Inst{10-8}  = 0b010;
+  let Inst{7-6}   = bp;
+  let Inst{5-0}   = Operation.Value;
+}
+
+class SPECIAL3_DALIGN_FM<OPCODE6 Operation> : MipsR6Inst {
+  bits<5> rd;
+  bits<5> rs;
+  bits<5> rt;
+  bits<3> bp;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = OPGROUP_SPECIAL3.Value;
+  let Inst{25-21} = rs;
+  let Inst{20-16} = rt;
+  let Inst{15-11} = rd;
+  let Inst{10-9}  = 0b01;
+  let Inst{8-6}   = bp;
+  let Inst{5-0}   = Operation.Value;
+}
+
+class REGIMM_FM<OPCODE5 Operation> : MipsR6Inst {
+  bits<5> rs;
+  bits<16> imm;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = OPGROUP_REGIMM.Value;
+  let Inst{25-21} = rs;
+  let Inst{20-16} = Operation.Value;
+  let Inst{15-0} = imm;
+}
+
+class COP1_CMP_CONDN_FM<FIELD_CMP_FORMAT Format,
+                        FIELD_CMP_COND Cond> : MipsR6Inst {
+  bits<5> fd;
+  bits<5> fs;
+  bits<5> ft;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = OPGROUP_COP1.Value;
+  let Inst{25-21} = Format.Value;
+  let Inst{20-16} = ft;
+  let Inst{15-11} = fs;
+  let Inst{10-6}  = fd;
+  let Inst{5}     = 0;
+  let Inst{4-0}   = Cond.Value;
+}
+
diff --git a/lib/Target/Mips/Mips32r6InstrInfo.td b/lib/Target/Mips/Mips32r6InstrInfo.td
new file mode 100644
index 0000000..ffaf965
--- /dev/null
+++ b/lib/Target/Mips/Mips32r6InstrInfo.td
@@ -0,0 +1,583 @@
+//=- Mips32r6InstrInfo.td - Mips32r6 Instruction Information -*- tablegen -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes Mips32r6 instructions.
+//
+//===----------------------------------------------------------------------===//
+
+include "Mips32r6InstrFormats.td"
+
+// Notes about removals/changes from MIPS32r6:
+// Unclear: ssnop
+// Reencoded: cache, pref
+// Reencoded: clo, clz
+// Reencoded: jr -> jalr
+// Reencoded: jr.hb -> jalr.hb
+// Reencoded: ldc2
+// Reencoded: ll, sc
+// Reencoded: lwc2
+// Reencoded: sdbbp
+// Reencoded: sdc2
+// Reencoded: swc2
+// Removed: bc1any2, bc1any4
+// Removed: bc2[ft]
+// Removed: bc2f, bc2t
+// Removed: bgezal
+// Removed: bltzal
+// Removed: c.cond.fmt, bc1[ft]
+// Removed: div, divu
+// Removed: jalx
+// Removed: ldxc1
+// Removed: luxc1
+// Removed: lwxc1
+// Removed: madd.[ds], nmadd.[ds], nmsub.[ds], sub.[ds]
+// Removed: mfhi, mflo, mthi, mtlo, madd, maddu, msub, msubu, mul
+// Removed: movf, movt
+// Removed: movf.fmt, movt.fmt, movn.fmt, movz.fmt
+// Removed: movn, movz
+// Removed: mult, multu
+// Removed: prefx
+// Removed: sdxc1
+// Removed: suxc1
+// Removed: swxc1
+// Rencoded: [ls][wd]c2
+
+def brtarget21 : Operand<OtherVT> {
+  let EncoderMethod = "getBranchTarget21OpValue";
+  let OperandType = "OPERAND_PCREL";
+  let DecoderMethod = "DecodeBranchTarget21";
+  let ParserMatchClass = MipsJumpTargetAsmOperand;
+}
+
+def brtarget26 : Operand<OtherVT> {
+  let EncoderMethod = "getBranchTarget26OpValue";
+  let OperandType = "OPERAND_PCREL";
+  let DecoderMethod = "DecodeBranchTarget26";
+  let ParserMatchClass = MipsJumpTargetAsmOperand;
+}
+
+def jmpoffset16 : Operand<OtherVT> {
+  let EncoderMethod = "getJumpOffset16OpValue";
+  let ParserMatchClass = MipsJumpTargetAsmOperand;
+}
+
+def calloffset16 : Operand<iPTR> {
+  let EncoderMethod = "getJumpOffset16OpValue";
+  let ParserMatchClass = MipsJumpTargetAsmOperand;
+}
+
+//===----------------------------------------------------------------------===//
+//
+// Instruction Encodings
+//
+//===----------------------------------------------------------------------===//
+
+class ADDIUPC_ENC : PCREL19_FM<OPCODE2_ADDIUPC>;
+class ALIGN_ENC  : SPECIAL3_ALIGN_FM<OPCODE6_ALIGN>;
+class ALUIPC_ENC : PCREL16_FM<OPCODE5_ALUIPC>;
+class AUI_ENC    : AUI_FM;
+class AUIPC_ENC  : PCREL16_FM<OPCODE5_AUIPC>;
+
+class BALC_ENC  : BRANCH_OFF26_FM<0b111010>;
+class BC_ENC    : BRANCH_OFF26_FM<0b110010>;
+class BEQC_ENC  : CMP_BRANCH_2R_OFF16_FM<OPGROUP_ADDI>,
+                  DecodeDisambiguates<"AddiGroupBranch">;
+class BEQZALC_ENC : CMP_BRANCH_1R_RT_OFF16_FM<OPGROUP_ADDI>,
+                    DecodeDisambiguatedBy<"DaddiGroupBranch">;
+class BNEC_ENC  : CMP_BRANCH_2R_OFF16_FM<OPGROUP_DADDI>,
+                  DecodeDisambiguates<"DaddiGroupBranch">;
+class BNEZALC_ENC : CMP_BRANCH_1R_RT_OFF16_FM<OPGROUP_DADDI>,
+                    DecodeDisambiguatedBy<"DaddiGroupBranch">;
+
+class BLTZC_ENC : CMP_BRANCH_1R_BOTH_OFF16_FM<OPGROUP_BGTZL>,
+                  DecodeDisambiguates<"BgtzlGroupBranch">;
+class BGEZC_ENC : CMP_BRANCH_1R_BOTH_OFF16_FM<OPGROUP_BLEZL>,
+                  DecodeDisambiguates<"BlezlGroupBranch">;
+class BGTZALC_ENC : CMP_BRANCH_1R_RT_OFF16_FM<OPGROUP_BGTZ>,
+                    DecodeDisambiguatedBy<"BgtzGroupBranch">;
+
+class BLEZC_ENC : CMP_BRANCH_1R_RT_OFF16_FM<OPGROUP_BLEZL>,
+                  DecodeDisambiguatedBy<"BlezlGroupBranch">;
+class BLTZALC_ENC : CMP_BRANCH_1R_BOTH_OFF16_FM<OPGROUP_BGTZ>,
+                    DecodeDisambiguates<"BgtzGroupBranch">;
+class BGTZC_ENC : CMP_BRANCH_1R_RT_OFF16_FM<OPGROUP_BGTZL>,
+                  DecodeDisambiguatedBy<"BgtzlGroupBranch">;
+
+class BEQZC_ENC : CMP_BRANCH_OFF21_FM<0b110110>;
+class BGEZALC_ENC : CMP_BRANCH_1R_BOTH_OFF16_FM<OPGROUP_BLEZ>;
+class BNEZC_ENC : CMP_BRANCH_OFF21_FM<0b111110>;
+
+class BC1EQZ_ENC : COP1_BCCZ_FM<OPCODE5_BC1EQZ>;
+class BC1NEZ_ENC : COP1_BCCZ_FM<OPCODE5_BC1NEZ>;
+class BC2EQZ_ENC : COP2_BCCZ_FM<OPCODE5_BC2EQZ>;
+class BC2NEZ_ENC : COP2_BCCZ_FM<OPCODE5_BC2NEZ>;
+
+class JIALC_ENC : JMP_IDX_COMPACT_FM<0b111110>;
+class JIC_ENC   : JMP_IDX_COMPACT_FM<0b110110>;
+
+class BITSWAP_ENC : SPECIAL3_2R_FM<OPCODE6_BITSWAP>;
+class BLEZALC_ENC : CMP_BRANCH_1R_RT_OFF16_FM<OPGROUP_BLEZ>;
+class BNVC_ENC   : CMP_BRANCH_2R_OFF16_FM<OPGROUP_DADDI>,
+                   DecodeDisambiguatedBy<"DaddiGroupBranch">;
+class BOVC_ENC   : CMP_BRANCH_2R_OFF16_FM<OPGROUP_ADDI>,
+                   DecodeDisambiguatedBy<"AddiGroupBranch">;
+class DIV_ENC    : SPECIAL_3R_FM<0b00010, 0b011010>;
+class DIVU_ENC   : SPECIAL_3R_FM<0b00010, 0b011011>;
+class MOD_ENC    : SPECIAL_3R_FM<0b00011, 0b011010>;
+class MODU_ENC   : SPECIAL_3R_FM<0b00011, 0b011011>;
+class MUH_ENC    : SPECIAL_3R_FM<0b00011, 0b011000>;
+class MUHU_ENC   : SPECIAL_3R_FM<0b00011, 0b011001>;
+class MUL_R6_ENC : SPECIAL_3R_FM<0b00010, 0b011000>;
+class MULU_ENC   : SPECIAL_3R_FM<0b00010, 0b011001>;
+
+class MADDF_S_ENC  : COP1_3R_FM<0b011000, FIELD_FMT_S>;
+class MADDF_D_ENC  : COP1_3R_FM<0b011000, FIELD_FMT_D>;
+class MSUBF_S_ENC  : COP1_3R_FM<0b011001, FIELD_FMT_S>;
+class MSUBF_D_ENC  : COP1_3R_FM<0b011001, FIELD_FMT_D>;
+
+class SEL_D_ENC  : COP1_3R_FM<0b010000, FIELD_FMT_D>;
+class SEL_S_ENC  : COP1_3R_FM<0b010000, FIELD_FMT_S>;
+
+class SELEQZ_ENC : SPECIAL_3R_FM<0b00000, 0b110101>;
+class SELNEZ_ENC : SPECIAL_3R_FM<0b00000, 0b110111>;
+
+class LWPC_ENC   : PCREL19_FM<OPCODE2_LWPC>;
+class LWUPC_ENC  : PCREL19_FM<OPCODE2_LWUPC>;
+
+class MAX_S_ENC : COP1_3R_FM<0b011101, FIELD_FMT_S>;
+class MAX_D_ENC : COP1_3R_FM<0b011101, FIELD_FMT_D>;
+class MIN_S_ENC : COP1_3R_FM<0b011100, FIELD_FMT_S>;
+class MIN_D_ENC : COP1_3R_FM<0b011100, FIELD_FMT_D>;
+
+class MAXA_S_ENC : COP1_3R_FM<0b011111, FIELD_FMT_S>;
+class MAXA_D_ENC : COP1_3R_FM<0b011111, FIELD_FMT_D>;
+class MINA_S_ENC : COP1_3R_FM<0b011110, FIELD_FMT_S>;
+class MINA_D_ENC : COP1_3R_FM<0b011110, FIELD_FMT_D>;
+
+class SELEQZ_S_ENC : COP1_3R_FM<0b010100, FIELD_FMT_S>;
+class SELEQZ_D_ENC : COP1_3R_FM<0b010100, FIELD_FMT_D>;
+class SELNEZ_S_ENC : COP1_3R_FM<0b010111, FIELD_FMT_S>;
+class SELNEZ_D_ENC : COP1_3R_FM<0b010111, FIELD_FMT_D>;
+
+class RINT_S_ENC : COP1_2R_FM<0b011010, FIELD_FMT_S>;
+class RINT_D_ENC : COP1_2R_FM<0b011010, FIELD_FMT_D>;
+class CLASS_S_ENC : COP1_2R_FM<0b011011, FIELD_FMT_S>;
+class CLASS_D_ENC : COP1_2R_FM<0b011011, FIELD_FMT_D>;
+
+class CMP_CONDN_DESC_BASE<string CondStr, string Typestr, RegisterOperand FGROpnd> {
+  dag OutOperandList = (outs FGROpnd:$fd);
+  dag InOperandList = (ins FGROpnd:$fs, FGROpnd:$ft);
+  string AsmString = !strconcat("cmp.", CondStr, ".", Typestr, "\t$fd, $fs, $ft");
+  list<dag> Pattern = [];
+}
+
+//===----------------------------------------------------------------------===//
+//
+// Instruction Multiclasses
+//
+//===----------------------------------------------------------------------===//
+
+multiclass CMP_CC_M <FIELD_CMP_FORMAT Format, string Typestr,
+                     RegisterOperand FGROpnd>{
+  def CMP_F_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_F>,
+                    CMP_CONDN_DESC_BASE<"f", Typestr, FGROpnd>,
+                    ISA_MIPS32R6;
+  def CMP_UN_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_UN>,
+                     CMP_CONDN_DESC_BASE<"un", Typestr, FGROpnd>,
+                     ISA_MIPS32R6;
+  def CMP_EQ_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_EQ>,
+                     CMP_CONDN_DESC_BASE<"eq", Typestr, FGROpnd>,
+                     ISA_MIPS32R6;
+  def CMP_UEQ_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_UEQ>,
+                      CMP_CONDN_DESC_BASE<"ueq", Typestr, FGROpnd>,
+                      ISA_MIPS32R6;
+  def CMP_OLT_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_OLT>,
+                      CMP_CONDN_DESC_BASE<"olt", Typestr, FGROpnd>,
+                      ISA_MIPS32R6;
+  def CMP_ULT_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_ULT>,
+                      CMP_CONDN_DESC_BASE<"ult", Typestr, FGROpnd>,
+                      ISA_MIPS32R6;
+  def CMP_OLE_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_OLE>,
+                      CMP_CONDN_DESC_BASE<"ole", Typestr, FGROpnd>,
+                      ISA_MIPS32R6;
+  def CMP_ULE_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_ULE>,
+                      CMP_CONDN_DESC_BASE<"ule", Typestr, FGROpnd>,
+                      ISA_MIPS32R6;
+  def CMP_SF_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_SF>,
+                     CMP_CONDN_DESC_BASE<"sf", Typestr, FGROpnd>,
+                     ISA_MIPS32R6;
+  def CMP_NGLE_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_NGLE>,
+                       CMP_CONDN_DESC_BASE<"ngle", Typestr, FGROpnd>,
+                       ISA_MIPS32R6;
+  def CMP_SEQ_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_SEQ>,
+                      CMP_CONDN_DESC_BASE<"seq", Typestr, FGROpnd>,
+                      ISA_MIPS32R6;
+  def CMP_NGL_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_NGL>,
+                      CMP_CONDN_DESC_BASE<"ngl", Typestr, FGROpnd>,
+                      ISA_MIPS32R6;
+  def CMP_LT_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_LT>,
+                     CMP_CONDN_DESC_BASE<"lt", Typestr, FGROpnd>,
+                     ISA_MIPS32R6;
+  def CMP_NGE_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_NGE>,
+                      CMP_CONDN_DESC_BASE<"nge", Typestr, FGROpnd>,
+                      ISA_MIPS32R6;
+  def CMP_LE_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_LE>,
+                     CMP_CONDN_DESC_BASE<"le", Typestr, FGROpnd>,
+                     ISA_MIPS32R6;
+  def CMP_NGT_#NAME : COP1_CMP_CONDN_FM<Format, FIELD_CMP_COND_NGT>,
+                      CMP_CONDN_DESC_BASE<"ngt", Typestr, FGROpnd>,
+                      ISA_MIPS32R6;
+}
+
+//===----------------------------------------------------------------------===//
+//
+// Instruction Descriptions
+//
+//===----------------------------------------------------------------------===//
+
+class PCREL19_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> {
+  dag OutOperandList = (outs GPROpnd:$rs);
+  dag InOperandList = (ins simm19_lsl2:$imm);
+  string AsmString = !strconcat(instr_asm, "\t$rs, $imm");
+  list<dag> Pattern = [];
+}
+
+class ADDIUPC_DESC : PCREL19_DESC_BASE<"addiupc", GPR32Opnd>;
+class LWPC_DESC: PCREL19_DESC_BASE<"lwpc", GPR32Opnd>;
+class LWUPC_DESC: PCREL19_DESC_BASE<"lwupc", GPR32Opnd>;
+
+class ALIGN_DESC_BASE<string instr_asm, RegisterOperand GPROpnd,
+                      Operand ImmOpnd> {
+  dag OutOperandList = (outs GPROpnd:$rd);
+  dag InOperandList = (ins GPROpnd:$rs, GPROpnd:$rt, ImmOpnd:$bp);
+  string AsmString = !strconcat(instr_asm, "\t$rd, $rs, $rt, $bp");
+  list<dag> Pattern = [];
+}
+
+class ALIGN_DESC : ALIGN_DESC_BASE<"align", GPR32Opnd, uimm2>;
+
+class ALUIPC_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> {
+  dag OutOperandList = (outs GPROpnd:$rs);
+  dag InOperandList = (ins simm16:$imm);
+  string AsmString = !strconcat(instr_asm, "\t$rs, $imm");
+  list<dag> Pattern = [];
+}
+
+class ALUIPC_DESC : ALUIPC_DESC_BASE<"aluipc", GPR32Opnd>;
+class AUIPC_DESC : ALUIPC_DESC_BASE<"auipc", GPR32Opnd>;
+
+class AUI_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> {
+  dag OutOperandList = (outs GPROpnd:$rs);
+  dag InOperandList = (ins GPROpnd:$rt, simm16:$imm);
+  string AsmString = !strconcat(instr_asm, "\t$rs, $rt, $imm");
+  list<dag> Pattern = [];
+}
+
+class AUI_DESC : AUI_DESC_BASE<"aui", GPR32Opnd>;
+
+class BRANCH_DESC_BASE {
+  bit isBranch = 1;
+  bit isTerminator = 1;
+  bit hasDelaySlot = 0;
+}
+
+class BC_DESC_BASE<string instr_asm, DAGOperand opnd> : BRANCH_DESC_BASE {
+  dag InOperandList = (ins opnd:$offset);
+  dag OutOperandList = (outs);
+  string AsmString = !strconcat(instr_asm, "\t$offset");
+  bit isBarrier = 1;
+}
+
+class CMP_BC_DESC_BASE<string instr_asm, DAGOperand opnd,
+                       RegisterOperand GPROpnd> : BRANCH_DESC_BASE {
+  dag InOperandList = (ins GPROpnd:$rs, GPROpnd:$rt, opnd:$offset);
+  dag OutOperandList = (outs);
+  string AsmString = !strconcat(instr_asm, "\t$rs, $rt, $offset");
+  list<Register> Defs = [AT];
+}
+
+class CMP_CBR_EQNE_Z_DESC_BASE<string instr_asm, DAGOperand opnd,
+                               RegisterOperand GPROpnd> : BRANCH_DESC_BASE {
+  dag InOperandList = (ins GPROpnd:$rs, opnd:$offset);
+  dag OutOperandList = (outs);
+  string AsmString = !strconcat(instr_asm, "\t$rs, $offset");
+  list<Register> Defs = [AT];
+}
+
+class CMP_CBR_RT_Z_DESC_BASE<string instr_asm, DAGOperand opnd,
+                             RegisterOperand GPROpnd> : BRANCH_DESC_BASE {
+  dag InOperandList = (ins GPROpnd:$rt, opnd:$offset);
+  dag OutOperandList = (outs);
+  string AsmString = !strconcat(instr_asm, "\t$rt, $offset");
+  list<Register> Defs = [AT];
+}
+
+class BALC_DESC : BC_DESC_BASE<"balc", brtarget26> {
+  bit isCall = 1;
+  list<Register> Defs = [RA];
+}
+
+class BC_DESC : BC_DESC_BASE<"bc", brtarget26>;
+class BEQC_DESC : CMP_BC_DESC_BASE<"beqc", brtarget, GPR32Opnd>;
+class BNEC_DESC : CMP_BC_DESC_BASE<"bnec", brtarget, GPR32Opnd>;
+
+class BLTZC_DESC : CMP_CBR_RT_Z_DESC_BASE<"bltzc", brtarget, GPR32Opnd>;
+class BGEZC_DESC : CMP_CBR_RT_Z_DESC_BASE<"bgezc", brtarget, GPR32Opnd>;
+
+class BLEZC_DESC : CMP_CBR_RT_Z_DESC_BASE<"blezc", brtarget, GPR32Opnd>;
+class BGTZC_DESC : CMP_CBR_RT_Z_DESC_BASE<"bgtzc", brtarget, GPR32Opnd>;
+
+class BEQZC_DESC : CMP_CBR_EQNE_Z_DESC_BASE<"beqzc", brtarget21, GPR32Opnd>;
+class BNEZC_DESC : CMP_CBR_EQNE_Z_DESC_BASE<"bnezc", brtarget21, GPR32Opnd>;
+
+class COP1_BCCZ_DESC_BASE<string instr_asm> : BRANCH_DESC_BASE {
+  dag InOperandList = (ins FGR64Opnd:$ft, brtarget:$offset);
+  dag OutOperandList = (outs);
+  string AsmString = instr_asm;
+  bit hasDelaySlot = 1;
+}
+
+class BC1EQZ_DESC : COP1_BCCZ_DESC_BASE<"bc1eqz $ft, $offset">;
+class BC1NEZ_DESC : COP1_BCCZ_DESC_BASE<"bc1nez $ft, $offset">;
+
+class COP2_BCCZ_DESC_BASE<string instr_asm> : BRANCH_DESC_BASE {
+  dag InOperandList = (ins COP2Opnd:$ct, brtarget:$offset);
+  dag OutOperandList = (outs);
+  string AsmString = instr_asm;
+  bit hasDelaySlot = 1;
+}
+
+class BC2EQZ_DESC : COP2_BCCZ_DESC_BASE<"bc2eqz $ct, $offset">;
+class BC2NEZ_DESC : COP2_BCCZ_DESC_BASE<"bc2nez $ct, $offset">;
+
+class BOVC_DESC   : CMP_BC_DESC_BASE<"bovc", brtarget, GPR32Opnd>;
+class BNVC_DESC   : CMP_BC_DESC_BASE<"bnvc", brtarget, GPR32Opnd>;
+
+class JMP_IDX_COMPACT_DESC_BASE<string opstr, DAGOperand opnd,
+                                RegisterOperand GPROpnd> {
+  dag InOperandList = (ins GPROpnd:$rt, opnd:$offset);
+  string AsmString = !strconcat(opstr, "\t$rt, $offset");
+  list<dag> Pattern = [];
+  bit isTerminator = 1;
+  bit hasDelaySlot = 0;
+  string DecoderMethod = "DecodeSimm16";
+}
+
+class JIALC_DESC : JMP_IDX_COMPACT_DESC_BASE<"jialc", calloffset16,
+                                             GPR32Opnd> {
+  bit isCall = 1;
+  list<Register> Defs = [RA];
+}
+
+class JIC_DESC : JMP_IDX_COMPACT_DESC_BASE<"jic", jmpoffset16, GPR32Opnd> {
+  bit isBarrier = 1;
+  list<Register> Defs = [AT];
+}
+
+class BITSWAP_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> {
+  dag OutOperandList = (outs GPROpnd:$rd);
+  dag InOperandList = (ins GPROpnd:$rt);
+  string AsmString = !strconcat(instr_asm, "\t$rd, $rt");
+  list<dag> Pattern = [];
+}
+
+class BITSWAP_DESC : BITSWAP_DESC_BASE<"bitswap", GPR32Opnd>;
+
+class DIVMOD_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> {
+  dag OutOperandList = (outs GPROpnd:$rd);
+  dag InOperandList = (ins GPROpnd:$rs, GPROpnd:$rt);
+  string AsmString = !strconcat(instr_asm, "\t$rd, $rs, $rt");
+  list<dag> Pattern = [];
+}
+
+class DIV_DESC  : DIVMOD_DESC_BASE<"div", GPR32Opnd>;
+class DIVU_DESC : DIVMOD_DESC_BASE<"divu", GPR32Opnd>;
+class MOD_DESC  : DIVMOD_DESC_BASE<"mod", GPR32Opnd>;
+class MODU_DESC : DIVMOD_DESC_BASE<"modu", GPR32Opnd>;
+
+class BEQZALC_DESC : CMP_CBR_RT_Z_DESC_BASE<"beqzalc", brtarget, GPR32Opnd> {
+  list<Register> Defs = [RA];
+}
+
+class BGEZALC_DESC : CMP_CBR_RT_Z_DESC_BASE<"bgezalc", brtarget, GPR32Opnd> {
+  list<Register> Defs = [RA];
+}
+
+class BGTZALC_DESC : CMP_CBR_RT_Z_DESC_BASE<"bgtzalc", brtarget, GPR32Opnd> {
+  list<Register> Defs = [RA];
+}
+
+class BLEZALC_DESC : CMP_CBR_RT_Z_DESC_BASE<"blezalc", brtarget, GPR32Opnd> {
+  list<Register> Defs = [RA];
+}
+
+class BLTZALC_DESC : CMP_CBR_RT_Z_DESC_BASE<"bltzalc", brtarget, GPR32Opnd> {
+  list<Register> Defs = [RA];
+}
+
+class BNEZALC_DESC : CMP_CBR_RT_Z_DESC_BASE<"bnezalc", brtarget, GPR32Opnd> {
+  list<Register> Defs = [RA];
+}
+class MUL_R6_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> {
+  dag OutOperandList = (outs GPROpnd:$rd);
+  dag InOperandList = (ins GPROpnd:$rs, GPROpnd:$rt);
+  string AsmString = !strconcat(instr_asm, "\t$rd, $rs, $rt");
+  list<dag> Pattern = [];
+}
+
+class MUH_DESC    : MUL_R6_DESC_BASE<"muh", GPR32Opnd>;
+class MUHU_DESC   : MUL_R6_DESC_BASE<"muhu", GPR32Opnd>;
+class MUL_R6_DESC : MUL_R6_DESC_BASE<"mul", GPR32Opnd>;
+class MULU_DESC   : MUL_R6_DESC_BASE<"mulu", GPR32Opnd>;
+
+class COP1_4R_DESC_BASE<string instr_asm, RegisterOperand FGROpnd> {
+  dag OutOperandList = (outs FGROpnd:$fd);
+  dag InOperandList = (ins FGROpnd:$fd_in, FGROpnd:$fs, FGROpnd:$ft);
+  string AsmString = !strconcat(instr_asm, "\t$fd, $fs, $ft");
+  list<dag> Pattern = [];
+  string Constraints = "$fd_in = $fd";
+}
+
+class SEL_D_DESC : COP1_4R_DESC_BASE<"sel.d", FGR64Opnd>;
+class SEL_S_DESC : COP1_4R_DESC_BASE<"sel.s", FGR32Opnd>;
+
+class SELEQNE_Z_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> {
+  dag OutOperandList = (outs GPROpnd:$rd);
+  dag InOperandList = (ins GPROpnd:$rs, GPROpnd:$rt);
+  string AsmString = !strconcat(instr_asm, "\t$rd, $rs, $rt");
+  list<dag> Pattern = [];
+}
+
+class SELEQZ_DESC : SELEQNE_Z_DESC_BASE<"seleqz", GPR32Opnd>;
+class SELNEZ_DESC : SELEQNE_Z_DESC_BASE<"selnez", GPR32Opnd>;
+
+class MADDF_S_DESC  : COP1_4R_DESC_BASE<"maddf.s", FGR32Opnd>;
+class MADDF_D_DESC  : COP1_4R_DESC_BASE<"maddf.d", FGR64Opnd>;
+class MSUBF_S_DESC  : COP1_4R_DESC_BASE<"msubf.s", FGR32Opnd>;
+class MSUBF_D_DESC  : COP1_4R_DESC_BASE<"msubf.d", FGR64Opnd>;
+
+class MAX_MIN_DESC_BASE<string instr_asm, RegisterOperand FGROpnd> {
+  dag OutOperandList = (outs FGROpnd:$fd);
+  dag InOperandList = (ins FGROpnd:$fs, FGROpnd:$ft);
+  string AsmString = !strconcat(instr_asm, "\t$fd, $fs, $ft");
+  list<dag> Pattern = [];
+}
+
+class MAX_S_DESC : MAX_MIN_DESC_BASE<"max.s", FGR32Opnd>;
+class MAX_D_DESC : MAX_MIN_DESC_BASE<"max.d", FGR64Opnd>;
+class MIN_S_DESC : MAX_MIN_DESC_BASE<"min.s", FGR32Opnd>;
+class MIN_D_DESC : MAX_MIN_DESC_BASE<"min.d", FGR64Opnd>;
+
+class MAXA_S_DESC : MAX_MIN_DESC_BASE<"maxa.s", FGR32Opnd>;
+class MAXA_D_DESC : MAX_MIN_DESC_BASE<"maxa.d", FGR64Opnd>;
+class MINA_S_DESC : MAX_MIN_DESC_BASE<"mina.s", FGR32Opnd>;
+class MINA_D_DESC : MAX_MIN_DESC_BASE<"mina.d", FGR64Opnd>;
+
+class SELEQNEZ_DESC_BASE<string instr_asm, RegisterOperand FGROpnd> {
+  dag OutOperandList = (outs FGROpnd:$fd);
+  dag InOperandList = (ins FGROpnd:$fs, FGROpnd:$ft);
+  string AsmString = !strconcat(instr_asm, "\t$fd, $fs, $ft");
+  list<dag> Pattern = [];
+}
+
+class SELEQZ_S_DESC : SELEQNEZ_DESC_BASE<"seleqz.s", FGR32Opnd>;
+class SELEQZ_D_DESC : SELEQNEZ_DESC_BASE<"seleqz.d", FGR64Opnd>;
+class SELNEZ_S_DESC : SELEQNEZ_DESC_BASE<"selnez.s", FGR32Opnd>;
+class SELNEZ_D_DESC : SELEQNEZ_DESC_BASE<"selnez.d", FGR64Opnd>;
+
+class CLASS_RINT_DESC_BASE<string instr_asm, RegisterOperand FGROpnd> {
+  dag OutOperandList = (outs FGROpnd:$fd);
+  dag InOperandList = (ins FGROpnd:$fs);
+  string AsmString = !strconcat(instr_asm, "\t$fd, $fs");
+  list<dag> Pattern = [];
+}
+
+class RINT_S_DESC : CLASS_RINT_DESC_BASE<"rint.s", FGR32Opnd>;
+class RINT_D_DESC : CLASS_RINT_DESC_BASE<"rint.d", FGR64Opnd>;
+class CLASS_S_DESC : CLASS_RINT_DESC_BASE<"class.s", FGR32Opnd>;
+class CLASS_D_DESC : CLASS_RINT_DESC_BASE<"class.d", FGR64Opnd>;
+
+//===----------------------------------------------------------------------===//
+//
+// Instruction Definitions
+//
+//===----------------------------------------------------------------------===//
+
+def ADDIUPC : ADDIUPC_ENC, ADDIUPC_DESC, ISA_MIPS32R6;
+def ALIGN : ALIGN_ENC, ALIGN_DESC, ISA_MIPS32R6;
+def ALUIPC : ALUIPC_ENC, ALUIPC_DESC, ISA_MIPS32R6;
+def AUI : AUI_ENC, AUI_DESC, ISA_MIPS32R6;
+def AUIPC : AUIPC_ENC, AUIPC_DESC, ISA_MIPS32R6;
+def BALC : BALC_ENC, BALC_DESC, ISA_MIPS32R6;
+def BC1EQZ : BC1EQZ_ENC, BC1EQZ_DESC, ISA_MIPS32R6;
+def BC1NEZ : BC1NEZ_ENC, BC1NEZ_DESC, ISA_MIPS32R6;
+def BC2EQZ : BC2EQZ_ENC, BC2EQZ_DESC, ISA_MIPS32R6;
+def BC2NEZ : BC2NEZ_ENC, BC2NEZ_DESC, ISA_MIPS32R6;
+def BC : BC_ENC, BC_DESC, ISA_MIPS32R6;
+def BEQC : BEQC_ENC, BEQC_DESC, ISA_MIPS32R6;
+def BEQZALC : BEQZALC_ENC, BEQZALC_DESC, ISA_MIPS32R6;
+def BEQZC : BEQZC_ENC, BEQZC_DESC, ISA_MIPS32R6;
+def BGEC;  // Also aliased to blec with operands swapped
+def BGEUC; // Also aliased to bleuc with operands swapped
+def BGEZALC : BGEZALC_ENC, BGEZALC_DESC, ISA_MIPS32R6;
+def BGEZC : BGEZC_ENC, BGEZC_DESC, ISA_MIPS32R6;
+def BGTZALC : BGTZALC_ENC, BGTZALC_DESC, ISA_MIPS32R6;
+def BGTZC : BGTZC_ENC, BGTZC_DESC, ISA_MIPS32R6;
+def BITSWAP : BITSWAP_ENC, BITSWAP_DESC, ISA_MIPS32R6;
+def BLEZALC : BLEZALC_ENC, BLEZALC_DESC, ISA_MIPS32R6;
+def BLEZC : BLEZC_ENC, BLEZC_DESC, ISA_MIPS32R6;
+def BLTC; // Also aliased to bgtc with operands swapped
+def BLTUC; // Also aliased to bgtuc with operands swapped
+def BLTZALC : BLTZALC_ENC, BLTZALC_DESC, ISA_MIPS32R6;
+def BLTZC : BLTZC_ENC, BLTZC_DESC, ISA_MIPS32R6;
+def BNEC : BNEC_ENC, BNEC_DESC, ISA_MIPS32R6;
+def BNEZALC : BNEZALC_ENC, BNEZALC_DESC, ISA_MIPS32R6;
+def BNEZC : BNEZC_ENC, BNEZC_DESC, ISA_MIPS32R6;
+def BNVC : BNVC_ENC, BNVC_DESC, ISA_MIPS32R6;
+def BOVC : BOVC_ENC, BOVC_DESC, ISA_MIPS32R6;
+def CLASS_D : CLASS_D_ENC, CLASS_D_DESC, ISA_MIPS32R6;
+def CLASS_S : CLASS_S_ENC, CLASS_S_DESC, ISA_MIPS32R6;
+defm S : CMP_CC_M<FIELD_CMP_FORMAT_S, "s", FGR32Opnd>;
+defm D : CMP_CC_M<FIELD_CMP_FORMAT_D, "d", FGR64Opnd>;
+def DIV : DIV_ENC, DIV_DESC, ISA_MIPS32R6;
+def DIVU : DIVU_ENC, DIVU_DESC, ISA_MIPS32R6;
+def JIALC : JIALC_ENC, JIALC_DESC, ISA_MIPS32R6;
+def JIC : JIC_ENC, JIC_DESC, ISA_MIPS32R6;
+// def LSA; // See MSA
+def LWPC : LWPC_ENC, LWPC_DESC, ISA_MIPS32R6;
+def LWUPC : LWUPC_ENC, LWUPC_DESC, ISA_MIPS32R6;
+def MADDF_S : MADDF_S_ENC, MADDF_S_DESC, ISA_MIPS32R6;
+def MADDF_D : MADDF_D_ENC, MADDF_D_DESC, ISA_MIPS32R6;
+def MAXA_D : MAXA_D_ENC, MAXA_D_DESC, ISA_MIPS32R6;
+def MAXA_S : MAXA_S_ENC, MAXA_S_DESC, ISA_MIPS32R6;
+def MAX_D : MAX_D_ENC, MAX_D_DESC, ISA_MIPS32R6;
+def MAX_S : MAX_S_ENC, MAX_S_DESC, ISA_MIPS32R6;
+def MINA_D : MINA_D_ENC, MINA_D_DESC, ISA_MIPS32R6;
+def MINA_S : MINA_S_ENC, MINA_S_DESC, ISA_MIPS32R6;
+def MIN_D : MIN_D_ENC, MIN_D_DESC, ISA_MIPS32R6;
+def MIN_S : MIN_S_ENC, MIN_S_DESC, ISA_MIPS32R6;
+def MOD : MOD_ENC, MOD_DESC, ISA_MIPS32R6;
+def MODU : MODU_ENC, MODU_DESC, ISA_MIPS32R6;
+def MSUBF_S : MSUBF_S_ENC, MSUBF_S_DESC, ISA_MIPS32R6;
+def MSUBF_D : MSUBF_D_ENC, MSUBF_D_DESC, ISA_MIPS32R6;
+def MUH    : MUH_ENC, MUH_DESC, ISA_MIPS32R6;
+def MUHU   : MUHU_ENC, MUHU_DESC, ISA_MIPS32R6;
+def MUL_R6 : MUL_R6_ENC, MUL_R6_DESC, ISA_MIPS32R6;
+def MULU   : MULU_ENC, MULU_DESC, ISA_MIPS32R6;
+def NAL; // BAL with rd=0
+def RINT_D : RINT_D_ENC, RINT_D_DESC, ISA_MIPS32R6;
+def RINT_S : RINT_S_ENC, RINT_S_DESC, ISA_MIPS32R6;
+def SELEQZ : SELEQZ_ENC, SELEQZ_DESC, ISA_MIPS32R6;
+def SELEQZ_D : SELEQZ_D_ENC, SELEQZ_D_DESC, ISA_MIPS32R6;
+def SELEQZ_S : SELEQZ_S_ENC, SELEQZ_S_DESC, ISA_MIPS32R6;
+def SELNEZ : SELNEZ_ENC, SELNEZ_DESC, ISA_MIPS32R6;
+def SELNEZ_D : SELNEZ_D_ENC, SELNEZ_D_DESC, ISA_MIPS32R6;
+def SELNEZ_S : SELNEZ_S_ENC, SELNEZ_S_DESC, ISA_MIPS32R6;
+def SEL_D : SEL_D_ENC, SEL_D_DESC, ISA_MIPS32R6;
+def SEL_S : SEL_S_ENC, SEL_S_DESC, ISA_MIPS32R6;
diff --git a/lib/Target/Mips/Mips64InstrInfo.td b/lib/Target/Mips/Mips64InstrInfo.td
index 7115d11..924b325 100644
--- a/lib/Target/Mips/Mips64InstrInfo.td
+++ b/lib/Target/Mips/Mips64InstrInfo.td
@@ -20,6 +20,9 @@ def uimm16_64      : Operand<i64> {
   let PrintMethod = "printUnsignedImm";
 }
 
+// Signed Operand
+def simm10_64 : Operand<i64>;
+
 // Transformation Function - get Imm - 32.
 def Subtract32 : SDNodeXForm<imm, [{
   return getImm(N, (unsigned)N->getZExtValue() - 32);
@@ -28,6 +31,11 @@ def Subtract32 : SDNodeXForm<imm, [{
 // shamt must fit in 6 bits.
 def immZExt6 : ImmLeaf<i32, [{return Imm == (Imm & 0x3f);}]>;
 
+// Node immediate fits as 10-bit sign extended on target immediate.
+// e.g. seqi, snei
+def immSExt10_64 : PatLeaf<(i64 imm),
+                           [{ return isInt<10>(N->getSExtValue()); }]>;
+
 //===----------------------------------------------------------------------===//
 // Instructions specific format
 //===----------------------------------------------------------------------===//
@@ -53,10 +61,11 @@ let isPseudo = 1, isCodeGenOnly = 1 in {
 //===----------------------------------------------------------------------===//
 let DecoderNamespace = "Mips64" in {
 /// Arithmetic Instructions (ALU Immediate)
-def DADDi   : ArithLogicI<"daddi", simm16_64, GPR64Opnd>, ADDI_FM<0x18>;
+def DADDi   : ArithLogicI<"daddi", simm16_64, GPR64Opnd>, ADDI_FM<0x18>,
+              ISA_MIPS3;
 def DADDiu  : ArithLogicI<"daddiu", simm16_64, GPR64Opnd, II_DADDIU,
                           immSExt16, add>,
-              ADDI_FM<0x19>, IsAsCheapAsAMove;
+              ADDI_FM<0x19>, IsAsCheapAsAMove, ISA_MIPS3;
 
 let isCodeGenOnly = 1 in {
 def SLTi64  : SetCC_I<"slti", setlt, simm16_64, immSExt16, GPR64Opnd>,
@@ -73,12 +82,14 @@ def LUi64   : LoadUpper<"lui", GPR64Opnd, uimm16_64>, LUI_FM;
 }
 
 /// Arithmetic Instructions (3-Operand, R-Type)
-def DADD   : ArithLogicR<"dadd", GPR64Opnd, 1, II_DADD>, ADD_FM<0, 0x2c>;
-def DADDu  : ArithLogicR<"daddu", GPR64Opnd, 1, II_DADDU, add>,
-                              ADD_FM<0, 0x2d>;
-def DSUBu  : ArithLogicR<"dsubu", GPR64Opnd, 0, II_DSUBU, sub>,
-                              ADD_FM<0, 0x2f>;
-def DSUB   : ArithLogicR<"dsub", GPR64Opnd, 0, II_DSUB, sub>, ADD_FM<0, 0x2e>;
+def DADD   : ArithLogicR<"dadd", GPR64Opnd, 1, II_DADD>, ADD_FM<0, 0x2c>,
+             ISA_MIPS3;
+def DADDu  : ArithLogicR<"daddu", GPR64Opnd, 1, II_DADDU, add>, ADD_FM<0, 0x2d>,
+             ISA_MIPS3;
+def DSUBu  : ArithLogicR<"dsubu", GPR64Opnd, 0, II_DSUBU, sub>, ADD_FM<0, 0x2f>,
+             ISA_MIPS3;
+def DSUB   : ArithLogicR<"dsub", GPR64Opnd, 0, II_DSUB>, ADD_FM<0, 0x2e>,
+             ISA_MIPS3;
 
 let isCodeGenOnly = 1 in {
 def SLT64  : SetCC_R<"slt", setlt, GPR64Opnd>, ADD_FM<0, 0x2a>;
@@ -91,33 +102,32 @@ def NOR64  : LogicNOR<"nor", GPR64Opnd>, ADD_FM<0, 0x27>;
 
 /// Shift Instructions
 def DSLL   : shift_rotate_imm<"dsll", uimm6, GPR64Opnd, II_DSLL, shl, immZExt6>,
-             SRA_FM<0x38, 0>;
+             SRA_FM<0x38, 0>, ISA_MIPS3;
 def DSRL   : shift_rotate_imm<"dsrl", uimm6, GPR64Opnd, II_DSRL, srl, immZExt6>,
-             SRA_FM<0x3a, 0>;
+             SRA_FM<0x3a, 0>, ISA_MIPS3;
 def DSRA   : shift_rotate_imm<"dsra", uimm6, GPR64Opnd, II_DSRA, sra, immZExt6>,
-             SRA_FM<0x3b, 0>;
+             SRA_FM<0x3b, 0>, ISA_MIPS3;
 def DSLLV  : shift_rotate_reg<"dsllv", GPR64Opnd, II_DSLLV, shl>,
-             SRLV_FM<0x14, 0>;
+             SRLV_FM<0x14, 0>, ISA_MIPS3;
 def DSRLV  : shift_rotate_reg<"dsrlv", GPR64Opnd, II_DSRLV, srl>,
-             SRLV_FM<0x16, 0>;
+             SRLV_FM<0x16, 0>, ISA_MIPS3;
 def DSRAV  : shift_rotate_reg<"dsrav", GPR64Opnd, II_DSRAV, sra>,
-             SRLV_FM<0x17, 0>;
+             SRLV_FM<0x17, 0>, ISA_MIPS3;
 def DSLL32 : shift_rotate_imm<"dsll32", uimm5, GPR64Opnd, II_DSLL32>,
-             SRA_FM<0x3c, 0>;
+             SRA_FM<0x3c, 0>, ISA_MIPS3;
 def DSRL32 : shift_rotate_imm<"dsrl32", uimm5, GPR64Opnd, II_DSRL32>,
-             SRA_FM<0x3e, 0>;
+             SRA_FM<0x3e, 0>, ISA_MIPS3;
 def DSRA32 : shift_rotate_imm<"dsra32", uimm5, GPR64Opnd, II_DSRA32>,
-             SRA_FM<0x3f, 0>;
+             SRA_FM<0x3f, 0>, ISA_MIPS3;
 
 // Rotate Instructions
-let Predicates = [HasMips64r2, HasStdEnc] in {
-  def DROTR  : shift_rotate_imm<"drotr", uimm6, GPR64Opnd, II_DROTR, rotr,
-                                immZExt6>, SRA_FM<0x3a, 1>;
-  def DROTRV : shift_rotate_reg<"drotrv", GPR64Opnd, II_DROTRV, rotr>,
-               SRLV_FM<0x16, 1>;
-  def DROTR32 : shift_rotate_imm<"drotr32", uimm5, GPR64Opnd, II_DROTR32>,
-                SRA_FM<0x3e, 1>;
-}
+def DROTR  : shift_rotate_imm<"drotr", uimm6, GPR64Opnd, II_DROTR, rotr,
+                              immZExt6>,
+             SRA_FM<0x3a, 1>, ISA_MIPS64R2;
+def DROTRV : shift_rotate_reg<"drotrv", GPR64Opnd, II_DROTRV, rotr>,
+             SRLV_FM<0x16, 1>, ISA_MIPS64R2;
+def DROTR32 : shift_rotate_imm<"drotr32", uimm5, GPR64Opnd, II_DROTR32>,
+              SRA_FM<0x3e, 1>, ISA_MIPS64R2;
 
 /// Load and Store Instructions
 ///  aligned
@@ -132,9 +142,9 @@ def SH64  : Store<"sh", GPR64Opnd, truncstorei16, II_SH>, LW_FM<0x29>;
 def SW64  : Store<"sw", GPR64Opnd, truncstorei32, II_SW>, LW_FM<0x2b>;
 }
 
-def LWu   : Load<"lwu", GPR64Opnd, zextloadi32, II_LWU>, LW_FM<0x27>;
-def LD    : Load<"ld", GPR64Opnd, load, II_LD>, LW_FM<0x37>;
-def SD    : Store<"sd", GPR64Opnd, store, II_SD>, LW_FM<0x3f>;
+def LWu   : Load<"lwu", GPR64Opnd, zextloadi32, II_LWU>, LW_FM<0x27>, ISA_MIPS3;
+def LD    : Load<"ld", GPR64Opnd, load, II_LD>, LW_FM<0x37>, ISA_MIPS3;
+def SD    : Store<"sd", GPR64Opnd, store, II_SD>, LW_FM<0x3f>, ISA_MIPS3;
 
 /// load/store left/right
 let isCodeGenOnly = 1 in {
@@ -144,14 +154,18 @@ def SWL64 : StoreLeftRight<"swl", MipsSWL, GPR64Opnd, II_SWL>, LW_FM<0x2a>;
 def SWR64 : StoreLeftRight<"swr", MipsSWR, GPR64Opnd, II_SWR>, LW_FM<0x2e>;
 }
 
-def LDL   : LoadLeftRight<"ldl", MipsLDL, GPR64Opnd, II_LDL>, LW_FM<0x1a>;
-def LDR   : LoadLeftRight<"ldr", MipsLDR, GPR64Opnd, II_LDR>, LW_FM<0x1b>;
-def SDL   : StoreLeftRight<"sdl", MipsSDL, GPR64Opnd, II_SDL>, LW_FM<0x2c>;
-def SDR   : StoreLeftRight<"sdr", MipsSDR, GPR64Opnd, II_SDR>, LW_FM<0x2d>;
+def LDL   : LoadLeftRight<"ldl", MipsLDL, GPR64Opnd, II_LDL>, LW_FM<0x1a>,
+            ISA_MIPS3_NOT_32R6_64R6;
+def LDR   : LoadLeftRight<"ldr", MipsLDR, GPR64Opnd, II_LDR>, LW_FM<0x1b>,
+            ISA_MIPS3_NOT_32R6_64R6;
+def SDL   : StoreLeftRight<"sdl", MipsSDL, GPR64Opnd, II_SDL>, LW_FM<0x2c>,
+            ISA_MIPS3_NOT_32R6_64R6;
+def SDR   : StoreLeftRight<"sdr", MipsSDR, GPR64Opnd, II_SDR>, LW_FM<0x2d>,
+            ISA_MIPS3_NOT_32R6_64R6;
 
 /// Load-linked, Store-conditional
-def LLD : LLBase<"lld", GPR64Opnd>, LW_FM<0x34>;
-def SCD : SCBase<"scd", GPR64Opnd>, LW_FM<0x3c>;
+def LLD : LLBase<"lld", GPR64Opnd>, LW_FM<0x34>, ISA_MIPS3;
+def SCD : SCBase<"scd", GPR64Opnd>, LW_FM<0x3c>, ISA_MIPS3;
 
 /// Jump and Branch Instructions
 let isCodeGenOnly = 1 in {
@@ -169,17 +183,17 @@ def TAILCALL64_R : TailCallReg<GPR64Opnd, JR, GPR32Opnd>;
 
 /// Multiply and Divide Instructions.
 def DMULT  : Mult<"dmult", II_DMULT, GPR64Opnd, [HI0_64, LO0_64]>,
-             MULT_FM<0, 0x1c>;
+             MULT_FM<0, 0x1c>, ISA_MIPS3;
 def DMULTu : Mult<"dmultu", II_DMULTU, GPR64Opnd, [HI0_64, LO0_64]>,
-             MULT_FM<0, 0x1d>;
+             MULT_FM<0, 0x1d>, ISA_MIPS3;
 def PseudoDMULT  : MultDivPseudo<DMULT, ACC128, GPR64Opnd, MipsMult,
                                  II_DMULT>;
 def PseudoDMULTu : MultDivPseudo<DMULTu, ACC128, GPR64Opnd, MipsMultu,
                                  II_DMULTU>;
 def DSDIV : Div<"ddiv", II_DDIV, GPR64Opnd, [HI0_64, LO0_64]>,
-            MULT_FM<0, 0x1e>;
+            MULT_FM<0, 0x1e>, ISA_MIPS3;
 def DUDIV : Div<"ddivu", II_DDIVU, GPR64Opnd, [HI0_64, LO0_64]>,
-            MULT_FM<0, 0x1f>;
+            MULT_FM<0, 0x1f>, ISA_MIPS3;
 def PseudoDSDIV : MultDivPseudo<DSDIV, ACC128, GPR64Opnd, MipsDivRem,
                                 II_DDIV, 0, 1, 1>;
 def PseudoDUDIV : MultDivPseudo<DUDIV, ACC128, GPR64Opnd, MipsDivRemU,
@@ -195,17 +209,19 @@ def PseudoMFLO64 : PseudoMFLOHI<GPR64, ACC128, MipsMFLO>;
 def PseudoMTLOHI64 : PseudoMTLOHI<ACC128, GPR64>;
 
 /// Sign Ext In Register Instructions.
-def SEB64 : SignExtInReg<"seb", i8, GPR64Opnd, II_SEB>, SEB_FM<0x10, 0x20>;
-def SEH64 : SignExtInReg<"seh", i16, GPR64Opnd, II_SEH>, SEB_FM<0x18, 0x20>;
+def SEB64 : SignExtInReg<"seb", i8, GPR64Opnd, II_SEB>, SEB_FM<0x10, 0x20>,
+            ISA_MIPS32R2;
+def SEH64 : SignExtInReg<"seh", i16, GPR64Opnd, II_SEH>, SEB_FM<0x18, 0x20>,
+            ISA_MIPS32R2;
 }
 
 /// Count Leading
-def DCLZ : CountLeading0<"dclz", GPR64Opnd>, CLO_FM<0x24>;
-def DCLO : CountLeading1<"dclo", GPR64Opnd>, CLO_FM<0x25>;
+def DCLZ : CountLeading0<"dclz", GPR64Opnd>, CLO_FM<0x24>, ISA_MIPS64;
+def DCLO : CountLeading1<"dclo", GPR64Opnd>, CLO_FM<0x25>, ISA_MIPS64;
 
 /// Double Word Swap Bytes/HalfWords
-def DSBH : SubwordSwap<"dsbh", GPR64Opnd>, SEB_FM<2, 0x24>;
-def DSHD : SubwordSwap<"dshd", GPR64Opnd>, SEB_FM<5, 0x24>;
+def DSBH : SubwordSwap<"dsbh", GPR64Opnd>, SEB_FM<2, 0x24>, ISA_MIPS64R2;
+def DSHD : SubwordSwap<"dshd", GPR64Opnd>, SEB_FM<5, 0x24>, ISA_MIPS64R2;
 
 def LEA_ADDiu64 : EffectiveAddress<"daddiu", GPR64Opnd>, LW_FM<0x19>;
 
@@ -229,8 +245,19 @@ let isCodeGenOnly = 1, rs = 0, shamt = 0 in {
                     "sll\t$rd, $rt, 0", [], II_SLL>;
 }
 
+// We need the following pseudo instruction to avoid offset calculation for
+// long branches.  See the comment in file MipsLongBranch.cpp for detailed
+// explanation.
+
+// Expands to: daddiu $dst, $src, %PART($tgt - $baltgt)
+// where %PART may be %hi or %lo, depending on the relocation kind
+// that $tgt is annotated with.
+def LONG_BRANCH_DADDiu : PseudoSE<(outs GPR64Opnd:$dst),
+  (ins GPR64Opnd:$src, brtarget:$tgt, brtarget:$baltgt), []>;
+
 // Cavium Octeon cmMIPS instructions
-let Predicates = [HasCnMips] in {
+let EncodingPredicates = []<Predicate>, // FIXME: The lack of HasStdEnc is probably a bug
+    AdditionalPredicates = [HasCnMips] in {
 
 class Count1s<string opstr, RegisterOperand RO>:
   InstSE<(outs RO:$rd), (ins RO:$rs), !strconcat(opstr, "\t$rd, $rs"),
@@ -254,6 +281,14 @@ class SetCC64_R<string opstr, PatFrag cond_op> :
   let TwoOperandAliasConstraint = "$rd = $rs";
 }
 
+class SetCC64_I<string opstr, PatFrag cond_op>:
+  InstSE<(outs GPR64Opnd:$rt), (ins GPR64Opnd:$rs, simm10_64:$imm10),
+         !strconcat(opstr, "\t$rt, $rs, $imm10"),
+         [(set GPR64Opnd:$rt, (cond_op GPR64Opnd:$rs, immSExt10_64:$imm10))],
+         II_SEQI_SNEI, FrmI, opstr> {
+  let TwoOperandAliasConstraint = "$rt = $rs";
+}
+
 // Unsigned Byte Add
 let Pattern = [(set GPR64Opnd:$rd,
                     (and (add GPR64Opnd:$rs, GPR64Opnd:$rt), 255))] in
@@ -287,7 +322,25 @@ def DPOP  : Count1s<"dpop", GPR64Opnd>, POP_FM<0x2d>;
 
 // Set on equal/not equal
 def SEQ   : SetCC64_R<"seq", seteq>, SEQ_FM<0x2a>;
+def SEQi  : SetCC64_I<"seqi", seteq>, SEQI_FM<0x2e>;
 def SNE   : SetCC64_R<"sne", setne>, SEQ_FM<0x2b>;
+def SNEi  : SetCC64_I<"snei", setne>, SEQI_FM<0x2f>;
+
+// 192-bit x 64-bit Unsigned Multiply and Add
+let Defs = [P0, P1, P2] in
+def V3MULU: ArithLogicR<"v3mulu", GPR64Opnd, 0, II_DMUL>,
+                                  ADD_FM<0x1c, 0x11>;
+
+// 64-bit Unsigned Multiply and Add Move
+let Defs = [MPL0, P0, P1, P2] in
+def VMM0  : ArithLogicR<"vmm0", GPR64Opnd, 0, II_DMUL>,
+                                ADD_FM<0x1c, 0x10>;
+
+// 64-bit Unsigned Multiply and Add
+let Defs = [MPL1, MPL2, P0, P1, P2] in
+def VMULU : ArithLogicR<"vmulu", GPR64Opnd, 0, II_DMUL>,
+                                 ADD_FM<0x1c, 0x0f>;
+
 }
 
 }
@@ -297,12 +350,10 @@ def SNE   : SetCC64_R<"sne", setne>, SEQ_FM<0x2b>;
 //===----------------------------------------------------------------------===//
 
 // extended loads
-let Predicates = [HasStdEnc] in {
-  def : MipsPat<(i64 (extloadi1  addr:$src)), (LB64 addr:$src)>;
-  def : MipsPat<(i64 (extloadi8  addr:$src)), (LB64 addr:$src)>;
-  def : MipsPat<(i64 (extloadi16 addr:$src)), (LH64 addr:$src)>;
-  def : MipsPat<(i64 (extloadi32 addr:$src)), (LW64 addr:$src)>;
-}
+def : MipsPat<(i64 (extloadi1  addr:$src)), (LB64 addr:$src)>;
+def : MipsPat<(i64 (extloadi8  addr:$src)), (LB64 addr:$src)>;
+def : MipsPat<(i64 (extloadi16 addr:$src)), (LH64 addr:$src)>;
+def : MipsPat<(i64 (extloadi32 addr:$src)), (LW64 addr:$src)>;
 
 // hi/lo relocs
 def : MipsPat<(MipsHi tglobaladdr:$in), (LUi64 tglobaladdr:$in)>;
@@ -355,8 +406,7 @@ defm : SetgeImmPats<GPR64, SLTi64, SLTiu64>;
 
 // truncate
 def : MipsPat<(i32 (trunc GPR64:$src)),
-              (SLL (EXTRACT_SUBREG GPR64:$src, sub_32), 0)>,
-      Requires<[HasStdEnc]>;
+              (SLL (EXTRACT_SUBREG GPR64:$src, sub_32), 0)>;
 
 // 32-to-64-bit extension
 def : MipsPat<(i64 (anyext GPR32:$src)), (SLL64_32 GPR32:$src)>;
@@ -373,64 +423,59 @@ def : MipsPat<(bswap GPR64:$rt), (DSHD (DSBH GPR64:$rt))>;
 //===----------------------------------------------------------------------===//
 // Instruction aliases
 //===----------------------------------------------------------------------===//
-def : InstAlias<"move $dst, $src",
-                (DADDu GPR64Opnd:$dst,  GPR64Opnd:$src, ZERO_64), 1>,
-      Requires<[HasMips64]>;
-def : InstAlias<"daddu $rs, $rt, $imm",
-                (DADDiu GPR64Opnd:$rs, GPR64Opnd:$rt, simm16_64:$imm),
-                0>;
-def : InstAlias<"dadd $rs, $rt, $imm",
-                (DADDi GPR64Opnd:$rs, GPR64Opnd:$rt, simm16_64:$imm),
-                0>;
-def : InstAlias<"daddu $rs, $imm",
-                (DADDiu GPR64Opnd:$rs, GPR64Opnd:$rs, simm16_64:$imm),
-                0>;
-def : InstAlias<"dadd $rs, $imm",
-                (DADDi GPR64Opnd:$rs, GPR64Opnd:$rs, simm16_64:$imm),
-                0>;
-def : InstAlias<"add $rs, $imm",
-                (ADDi GPR32Opnd:$rs, GPR32Opnd:$rs, simm16:$imm),
-                0>;
-def : InstAlias<"addu $rs, $imm",
-                (ADDiu GPR32Opnd:$rs, GPR32Opnd:$rs, simm16:$imm),
-                0>;
-let isPseudo=1, usesCustomInserter=1, isCodeGenOnly=1 in {
-def SUBi : MipsInst<(outs GPR32Opnd: $rt), (ins GPR32Opnd: $rs, simm16: $imm),
-                    "sub\t$rt, $rs, $imm", [], II_DSUB, Pseudo>;
-def SUBiu : MipsInst<(outs GPR32Opnd: $rt), (ins GPR32Opnd: $rs, simm16: $imm),
-                    "subu\t$rt, $rs, $imm", [], II_DSUB, Pseudo>;
-def DSUBi : MipsInst<(outs GPR64Opnd: $rt), (ins GPR64Opnd: $rs, simm16_64: $imm),
-                    "ssub\t$rt, $rs, $imm", [], II_DSUB, Pseudo>;
-def DSUBiu : MipsInst<(outs GPR64Opnd: $rt), (ins GPR64Opnd: $rs, simm16_64: $imm),
-                    "ssubu\t$rt, $rs, $imm", [], II_DSUB, Pseudo>;
-}
-def : InstAlias<"dsubu $rt, $rs, $imm",
-                (DSUBiu GPR64Opnd:$rt, GPR64Opnd:$rs, simm16_64: $imm),
-                0>;
-def : InstAlias<"sub $rs, $imm",
-                (SUBi GPR32Opnd:$rs, GPR32Opnd:$rs, simm16:$imm),
-                0>;
-def : InstAlias<"subu $rs, $imm",
-                (SUBiu GPR32Opnd:$rs, GPR32Opnd:$rs, simm16:$imm),
-                0>;
-def : InstAlias<"dsub $rs, $imm",
-                (DSUBi GPR64Opnd:$rs, GPR64Opnd:$rs, simm16_64:$imm),
-                0>;
-def : InstAlias<"dsubu $rs, $imm",
-                (DSUBiu GPR64Opnd:$rs, GPR64Opnd:$rs, simm16_64:$imm),
-                0>;
+def : MipsInstAlias<"move $dst, $src",
+                    (DADDu GPR64Opnd:$dst,  GPR64Opnd:$src, ZERO_64), 1>,
+      GPR_64;
+def : MipsInstAlias<"daddu $rs, $rt, $imm",
+                    (DADDiu GPR64Opnd:$rs, GPR64Opnd:$rt, simm16_64:$imm),
+                    0>;
+def : MipsInstAlias<"dadd $rs, $rt, $imm",
+                    (DADDi GPR64Opnd:$rs, GPR64Opnd:$rt, simm16_64:$imm),
+                    0>;
+def : MipsInstAlias<"daddu $rs, $imm",
+                    (DADDiu GPR64Opnd:$rs, GPR64Opnd:$rs, simm16_64:$imm),
+                    0>;
+def : MipsInstAlias<"dadd $rs, $imm",
+                    (DADDi GPR64Opnd:$rs, GPR64Opnd:$rs, simm16_64:$imm),
+                    0>;
+def : MipsInstAlias<"add $rs, $imm",
+                    (ADDi GPR32Opnd:$rs, GPR32Opnd:$rs, simm16:$imm),
+                    0>;
+def : MipsInstAlias<"addu $rs, $imm",
+                    (ADDiu GPR32Opnd:$rs, GPR32Opnd:$rs, simm16:$imm),
+                    0>;
+def : MipsInstAlias<"dsll $rd, $rt, $rs",
+                    (DSLLV GPR64Opnd:$rd, GPR64Opnd:$rt, GPR32Opnd:$rs), 0>,
+                    ISA_MIPS3;
+def : MipsInstAlias<"dsubu $rt, $rs, $imm",
+                    (DADDiu GPR64Opnd:$rt, GPR64Opnd:$rs,
+                            InvertedImOperand64:$imm), 0>;
+def : MipsInstAlias<"dsub $rs, $imm",
+                    (DADDi GPR64Opnd:$rs, GPR64Opnd:$rs,
+                           InvertedImOperand64:$imm),
+                    0>;
+def : MipsInstAlias<"dsubu $rs, $imm",
+                    (DADDiu GPR64Opnd:$rs, GPR64Opnd:$rs,
+                            InvertedImOperand64:$imm),
+                    0>;
+def : MipsInstAlias<"dsra $rd, $rt, $rs",
+                    (DSRAV GPR64Opnd:$rd, GPR64Opnd:$rt, GPR32Opnd:$rs), 0>,
+                    ISA_MIPS3;
+def : MipsInstAlias<"dsrl $rd, $rt, $rs",
+                    (DSRLV GPR64Opnd:$rd, GPR64Opnd:$rt, GPR32Opnd:$rs), 0>,
+                    ISA_MIPS3;
 
 /// Move between CPU and coprocessor registers
 let DecoderNamespace = "Mips64", Predicates = [HasMips64] in {
 def DMFC0 : MFC3OP<"dmfc0", GPR64Opnd>, MFC3OP_FM<0x10, 1>;
-def DMTC0 : MFC3OP<"dmtc0", GPR64Opnd>, MFC3OP_FM<0x10, 5>;
-def DMFC2 : MFC3OP<"dmfc2", GPR64Opnd>, MFC3OP_FM<0x12, 1>;
-def DMTC2 : MFC3OP<"dmtc2", GPR64Opnd>, MFC3OP_FM<0x12, 5>;
+def DMTC0 : MFC3OP<"dmtc0", GPR64Opnd>, MFC3OP_FM<0x10, 5>, ISA_MIPS3;
+def DMFC2 : MFC3OP<"dmfc2", GPR64Opnd>, MFC3OP_FM<0x12, 1>, ISA_MIPS3;
+def DMTC2 : MFC3OP<"dmtc2", GPR64Opnd>, MFC3OP_FM<0x12, 5>, ISA_MIPS3;
 }
 
 // Two operand (implicit 0 selector) versions:
-def : InstAlias<"dmfc0 $rt, $rd", (DMFC0 GPR64Opnd:$rt, GPR64Opnd:$rd, 0), 0>;
-def : InstAlias<"dmtc0 $rt, $rd", (DMTC0 GPR64Opnd:$rt, GPR64Opnd:$rd, 0), 0>;
-def : InstAlias<"dmfc2 $rt, $rd", (DMFC2 GPR64Opnd:$rt, GPR64Opnd:$rd, 0), 0>;
-def : InstAlias<"dmtc2 $rt, $rd", (DMTC2 GPR64Opnd:$rt, GPR64Opnd:$rd, 0), 0>;
+def : MipsInstAlias<"dmfc0 $rt, $rd", (DMFC0 GPR64Opnd:$rt, GPR64Opnd:$rd, 0), 0>;
+def : MipsInstAlias<"dmtc0 $rt, $rd", (DMTC0 GPR64Opnd:$rt, GPR64Opnd:$rd, 0), 0>;
+def : MipsInstAlias<"dmfc2 $rt, $rd", (DMFC2 GPR64Opnd:$rt, GPR64Opnd:$rd, 0), 0>;
+def : MipsInstAlias<"dmtc2 $rt, $rd", (DMTC2 GPR64Opnd:$rt, GPR64Opnd:$rd, 0), 0>;
 
diff --git a/lib/Target/Mips/Mips64r6InstrInfo.td b/lib/Target/Mips/Mips64r6InstrInfo.td
new file mode 100644
index 0000000..f971218
--- /dev/null
+++ b/lib/Target/Mips/Mips64r6InstrInfo.td
@@ -0,0 +1,88 @@
+//=- Mips64r6InstrInfo.td - Mips64r6 Instruction Information -*- tablegen -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes Mips64r6 instructions.
+//
+//===----------------------------------------------------------------------===//
+
+// Notes about removals/changes from MIPS32r6:
+// Reencoded: dclo, dclz
+// Reencoded: lld, scd
+// Removed: daddi
+// Removed: ddiv, ddivu, dmult, dmultu
+// Removed: div, divu
+
+//===----------------------------------------------------------------------===//
+//
+// Instruction Encodings
+//
+//===----------------------------------------------------------------------===//
+
+class DALIGN_ENC  : SPECIAL3_DALIGN_FM<OPCODE6_DALIGN>;
+class DAUI_ENC    : DAUI_FM;
+class DAHI_ENC    : REGIMM_FM<OPCODE5_DAHI>;
+class DATI_ENC    : REGIMM_FM<OPCODE5_DATI>;
+class DBITSWAP_ENC : SPECIAL3_2R_FM<OPCODE6_DBITSWAP>;
+class DDIV_ENC    : SPECIAL_3R_FM<0b00010, 0b011110>;
+class DDIVU_ENC   : SPECIAL_3R_FM<0b00010, 0b011111>;
+class DMOD_ENC    : SPECIAL_3R_FM<0b00011, 0b011110>;
+class DMODU_ENC   : SPECIAL_3R_FM<0b00011, 0b011111>;
+class DMUH_ENC    : SPECIAL_3R_FM<0b00011, 0b111000>;
+class DMUHU_ENC   : SPECIAL_3R_FM<0b00011, 0b111001>;
+class DMUL_R6_ENC : SPECIAL_3R_FM<0b00010, 0b111000>;
+class DMULU_ENC   : SPECIAL_3R_FM<0b00010, 0b111001>;
+
+//===----------------------------------------------------------------------===//
+//
+// Instruction Descriptions
+//
+//===----------------------------------------------------------------------===//
+
+class AHI_ATI_DESC_BASE<string instr_asm, RegisterOperand GPROpnd> {
+  dag OutOperandList = (outs GPROpnd:$rs);
+  dag InOperandList = (ins GPROpnd:$rt, simm16:$imm);
+  string AsmString = !strconcat(instr_asm, "\t$rt, $imm");
+  string Constraints = "$rs = $rt";
+}
+
+class DALIGN_DESC  : ALIGN_DESC_BASE<"dalign", GPR64Opnd, uimm3>;
+class DAHI_DESC    : AHI_ATI_DESC_BASE<"dahi", GPR64Opnd>;
+class DATI_DESC    : AHI_ATI_DESC_BASE<"dati", GPR64Opnd>;
+class DAUI_DESC    : AUI_DESC_BASE<"daui", GPR64Opnd>;
+class DBITSWAP_DESC : BITSWAP_DESC_BASE<"dbitswap", GPR64Opnd>;
+class DDIV_DESC    : DIVMOD_DESC_BASE<"ddiv", GPR64Opnd>;
+class DDIVU_DESC   : DIVMOD_DESC_BASE<"ddivu", GPR64Opnd>;
+class DMOD_DESC    : DIVMOD_DESC_BASE<"dmod", GPR64Opnd>;
+class DMODU_DESC   : DIVMOD_DESC_BASE<"dmodu", GPR64Opnd>;
+class DMUH_DESC    : MUL_R6_DESC_BASE<"dmuh", GPR64Opnd>;
+class DMUHU_DESC   : MUL_R6_DESC_BASE<"dmuhu", GPR64Opnd>;
+class DMUL_R6_DESC : MUL_R6_DESC_BASE<"dmul", GPR64Opnd>;
+class DMULU_DESC   : MUL_R6_DESC_BASE<"dmulu", GPR64Opnd>;
+
+//===----------------------------------------------------------------------===//
+//
+// Instruction Definitions
+//
+//===----------------------------------------------------------------------===//
+
+def DAHI : DAHI_ENC, DAHI_DESC, ISA_MIPS64R6;
+def DALIGN : DALIGN_ENC, DALIGN_DESC, ISA_MIPS64R6;
+def DATI : DATI_ENC, DATI_DESC, ISA_MIPS64R6;
+def DAUI : DAUI_ENC, DAUI_DESC, ISA_MIPS64R6;
+def DBITSWAP : DBITSWAP_ENC, DBITSWAP_DESC, ISA_MIPS64R6;
+def DDIV : DDIV_ENC, DDIV_DESC, ISA_MIPS64R6;
+def DDIVU : DDIVU_ENC, DDIVU_DESC, ISA_MIPS64R6;
+// def DLSA; // See MSA
+def DMOD : DMOD_ENC, DMOD_DESC, ISA_MIPS64R6;
+def DMODU : DMODU_ENC, DMODU_DESC, ISA_MIPS64R6;
+def DMUH: DMUH_ENC, DMUH_DESC, ISA_MIPS64R6;
+def DMUHU: DMUHU_ENC, DMUHU_DESC, ISA_MIPS64R6;
+def DMUL_R6: DMUL_R6_ENC, DMUL_R6_DESC, ISA_MIPS64R6;
+def DMULU: DMULU_ENC, DMULU_DESC, ISA_MIPS64R6;
+def LDPC;
diff --git a/lib/Target/Mips/MipsAsmPrinter.cpp b/lib/Target/Mips/MipsAsmPrinter.cpp
index d5df855..6df90aa 100644
--- a/lib/Target/Mips/MipsAsmPrinter.cpp
+++ b/lib/Target/Mips/MipsAsmPrinter.cpp
@@ -12,7 +12,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "mips-asm-printer"
 #include "InstPrinter/MipsInstPrinter.h"
 #include "MCTargetDesc/MipsBaseInfo.h"
 #include "MCTargetDesc/MipsMCNaCl.h"
@@ -52,6 +51,8 @@
 
 using namespace llvm;
 
+#define DEBUG_TYPE "mips-asm-printer"
+
 MipsTargetStreamer &MipsAsmPrinter::getTargetStreamer() {
   return static_cast<MipsTargetStreamer &>(*OutStreamer.getTargetStreamer());
 }
@@ -147,7 +148,8 @@ void MipsAsmPrinter::EmitInstruction(const MachineInstr *MI) {
     // removing another test for this situation downstream in the
     // callchain.
     //
-    if (I->isPseudo() && !Subtarget->inMips16Mode())
+    if (I->isPseudo() && !Subtarget->inMips16Mode()
+        && !isLongBranchPseudo(I->getOpcode()))
       llvm_unreachable("Pseudo opcode found in EmitInstruction()");
 
     MCInst TmpInst0;
@@ -285,9 +287,8 @@ void MipsAsmPrinter::EmitFunctionEntryLabel() {
 
   if (Subtarget->inMicroMipsMode())
     TS.emitDirectiveSetMicroMips();
-  // leave out until FSF available gas has micromips changes
-  //  else
-  //    TS.emitDirectiveSetNoMicroMips();
+  else
+    TS.emitDirectiveSetNoMicroMips();
 
   if (Subtarget->inMips16Mode())
     TS.emitDirectiveSetMips16();
@@ -621,16 +622,29 @@ printFCCOperand(const MachineInstr *MI, int opNum, raw_ostream &O,
 void MipsAsmPrinter::EmitStartOfAsmFile(Module &M) {
   // TODO: Need to add -mabicalls and -mno-abicalls flags.
   // Currently we assume that -mabicalls is the default.
-  getTargetStreamer().emitDirectiveAbiCalls();
-  Reloc::Model RM = Subtarget->getRelocationModel();
-  if (RM == Reloc::Static && !Subtarget->hasMips64())
-    getTargetStreamer().emitDirectiveOptionPic0();
+  bool IsABICalls = true;
+  if (IsABICalls) {
+    getTargetStreamer().emitDirectiveAbiCalls();
+    Reloc::Model RM = Subtarget->getRelocationModel();
+    // FIXME: This condition should be a lot more complicated that it is here.
+    //        Ideally it should test for properties of the ABI and not the ABI
+    //        itself.
+    //        For the moment, I'm only correcting enough to make MIPS-IV work.
+    if (RM == Reloc::Static && !Subtarget->isABI_N64())
+      getTargetStreamer().emitDirectiveOptionPic0();
+  }
 
   // Tell the assembler which ABI we are using
   std::string SectionName = std::string(".mdebug.") + getCurrentABIString();
   OutStreamer.SwitchSection(OutContext.getELFSection(
       SectionName, ELF::SHT_PROGBITS, 0, SectionKind::getDataRel()));
 
+  // NaN: At the moment we only support:
+  // 1. .nan legacy (default)
+  // 2. .nan 2008
+  Subtarget->isNaN2008() ? getTargetStreamer().emitDirectiveNaN2008()
+    : getTargetStreamer().emitDirectiveNaNLegacy();
+
   // TODO: handle O64 ABI
 
   if (Subtarget->isABI_EABI()) {
@@ -824,7 +838,7 @@ void MipsAsmPrinter::EmitFPCallStub(
   const MCSectionELF *M = OutContext.getELFSection(
       ".mips16.call.fp." + std::string(Symbol), ELF::SHT_PROGBITS,
       ELF::SHF_ALLOC | ELF::SHF_EXECINSTR, SectionKind::getText());
-  OutStreamer.SwitchSection(M, 0);
+  OutStreamer.SwitchSection(M, nullptr);
   //
   // .align 2
   //
@@ -941,6 +955,12 @@ void MipsAsmPrinter::NaClAlignIndirectJumpTargets(MachineFunction &MF) {
   }
 }
 
+bool MipsAsmPrinter::isLongBranchPseudo(int Opcode) const {
+  return (Opcode == Mips::LONG_BRANCH_LUi
+          || Opcode == Mips::LONG_BRANCH_ADDiu
+          || Opcode == Mips::LONG_BRANCH_DADDiu);
+}
+
 // Force static initialization.
 extern "C" void LLVMInitializeMipsAsmPrinter() {
   RegisterAsmPrinter<MipsAsmPrinter> X(TheMipsTarget);
diff --git a/lib/Target/Mips/MipsAsmPrinter.h b/lib/Target/Mips/MipsAsmPrinter.h
index 3e9093e..e82b145 100644
--- a/lib/Target/Mips/MipsAsmPrinter.h
+++ b/lib/Target/Mips/MipsAsmPrinter.h
@@ -75,6 +75,8 @@ private:
 
   void NaClAlignIndirectJumpTargets(MachineFunction &MF);
 
+  bool isLongBranchPseudo(int Opcode) const;
+
 public:
 
   const MipsSubtarget *Subtarget;
@@ -82,18 +84,18 @@ public:
   MipsMCInstLower MCInstLowering;
 
   explicit MipsAsmPrinter(TargetMachine &TM,  MCStreamer &Streamer)
-    : AsmPrinter(TM, Streamer), MCP(0), InConstantPool(false),
+    : AsmPrinter(TM, Streamer), MCP(nullptr), InConstantPool(false),
       MCInstLowering(*this) {
     Subtarget = &TM.getSubtarget<MipsSubtarget>();
   }
 
-  virtual const char *getPassName() const {
+  const char *getPassName() const override {
     return "Mips Assembly Printer";
   }
 
-  virtual bool runOnMachineFunction(MachineFunction &MF);
+  bool runOnMachineFunction(MachineFunction &MF) override;
 
-  virtual void EmitConstantPool() override {
+  void EmitConstantPool() override {
     bool UsingConstantPools =
       (Subtarget->inMips16Mode() && Subtarget->useConstantIslands());
     if (!UsingConstantPools)
@@ -101,30 +103,30 @@ public:
     // we emit constant pools customly!
   }
 
-  void EmitInstruction(const MachineInstr *MI);
+  void EmitInstruction(const MachineInstr *MI) override;
   void printSavedRegsBitmask();
   void emitFrameDirective();
   const char *getCurrentABIString() const;
-  virtual void EmitFunctionEntryLabel();
-  virtual void EmitFunctionBodyStart();
-  virtual void EmitFunctionBodyEnd();
-  virtual bool isBlockOnlyReachableByFallthrough(const MachineBasicBlock*
-                                                 MBB) const;
+  void EmitFunctionEntryLabel() override;
+  void EmitFunctionBodyStart() override;
+  void EmitFunctionBodyEnd() override;
+  bool isBlockOnlyReachableByFallthrough(
+                                   const MachineBasicBlock* MBB) const override;
   bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
                        unsigned AsmVariant, const char *ExtraCode,
-                       raw_ostream &O);
+                       raw_ostream &O) override;
   bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNum,
                              unsigned AsmVariant, const char *ExtraCode,
-                             raw_ostream &O);
+                             raw_ostream &O) override;
   void printOperand(const MachineInstr *MI, int opNum, raw_ostream &O);
   void printUnsignedImm(const MachineInstr *MI, int opNum, raw_ostream &O);
   void printUnsignedImm8(const MachineInstr *MI, int opNum, raw_ostream &O);
   void printMemOperand(const MachineInstr *MI, int opNum, raw_ostream &O);
   void printMemOperandEA(const MachineInstr *MI, int opNum, raw_ostream &O);
   void printFCCOperand(const MachineInstr *MI, int opNum, raw_ostream &O,
-                       const char *Modifier = 0);
-  void EmitStartOfAsmFile(Module &M);
-  void EmitEndOfAsmFile(Module &M);
+                       const char *Modifier = nullptr);
+  void EmitStartOfAsmFile(Module &M) override;
+  void EmitEndOfAsmFile(Module &M) override;
   void PrintDebugValueComment(const MachineInstr *MI, raw_ostream &OS);
 };
 }
diff --git a/lib/Target/Mips/MipsCallingConv.td b/lib/Target/Mips/MipsCallingConv.td
index 615310f..c83d880 100644
--- a/lib/Target/Mips/MipsCallingConv.td
+++ b/lib/Target/Mips/MipsCallingConv.td
@@ -245,8 +245,8 @@ def CSR_O32 : CalleeSavedRegs<(add (sequence "D%u", 15, 10), RA, FP,
 def CSR_O32_FP64 : CalleeSavedRegs<(add (sequence "D%u_64", 31, 20), RA, FP,
                                         (sequence "S%u", 7, 0))>;
 
-def CSR_N32 : CalleeSavedRegs<(add D31_64, D29_64, D27_64, D25_64, D24_64,
-                                   D23_64, D22_64, D21_64, RA_64, FP_64, GP_64,
+def CSR_N32 : CalleeSavedRegs<(add D20_64, D22_64, D24_64, D26_64, D28_64,
+                                   D30_64, RA_64, FP_64, GP_64,
                                    (sequence "S%u_64", 7, 0))>;
 
 def CSR_N64 : CalleeSavedRegs<(add (sequence "D%u_64", 31, 24), RA_64, FP_64,
diff --git a/lib/Target/Mips/MipsCodeEmitter.cpp b/lib/Target/Mips/MipsCodeEmitter.cpp
index ea49086..13fa546 100644
--- a/lib/Target/Mips/MipsCodeEmitter.cpp
+++ b/lib/Target/Mips/MipsCodeEmitter.cpp
@@ -12,7 +12,6 @@
 //
 //===---------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "jit"
 #include "Mips.h"
 #include "MCTargetDesc/MipsBaseInfo.h"
 #include "MipsInstrInfo.h"
@@ -41,6 +40,8 @@
 
 using namespace llvm;
 
+#define DEBUG_TYPE "jit"
+
 STATISTIC(NumEmitted, "Number of machine instructions emitted");
 
 namespace {
@@ -56,7 +57,7 @@ class MipsCodeEmitter : public MachineFunctionPass {
   const std::vector<MachineJumpTableEntry> *MJTEs;
   bool IsPIC;
 
-  void getAnalysisUsage(AnalysisUsage &AU) const {
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<MachineModuleInfo> ();
     MachineFunctionPass::getAnalysisUsage(AU);
   }
@@ -65,13 +66,13 @@ class MipsCodeEmitter : public MachineFunctionPass {
 
 public:
   MipsCodeEmitter(TargetMachine &tm, JITCodeEmitter &mce)
-    : MachineFunctionPass(ID), JTI(0), II(0), TD(0),
-      TM(tm), MCE(mce), MCPEs(0), MJTEs(0),
+    : MachineFunctionPass(ID), JTI(nullptr), II(nullptr), TD(nullptr),
+      TM(tm), MCE(mce), MCPEs(nullptr), MJTEs(nullptr),
       IsPIC(TM.getRelocationModel() == Reloc::PIC_) {}
 
-  bool runOnMachineFunction(MachineFunction &MF);
+  bool runOnMachineFunction(MachineFunction &MF) override;
 
-  virtual const char *getPassName() const {
+  const char *getPassName() const override {
     return "Mips Machine Code Emitter";
   }
 
@@ -109,6 +110,12 @@ private:
   unsigned getBranchTargetOpValueMM(const MachineInstr &MI,
                                     unsigned OpNo) const;
 
+  unsigned getBranchTarget21OpValue(const MachineInstr &MI,
+                                    unsigned OpNo) const;
+  unsigned getBranchTarget26OpValue(const MachineInstr &MI,
+                                    unsigned OpNo) const;
+  unsigned getJumpOffset16OpValue(const MachineInstr &MI, unsigned OpNo) const;
+
   unsigned getBranchTargetOpValue(const MachineInstr &MI, unsigned OpNo) const;
   unsigned getMemEncoding(const MachineInstr &MI, unsigned OpNo) const;
   unsigned getMemEncodingMMImm12(const MachineInstr &MI, unsigned OpNo) const;
@@ -116,6 +123,7 @@ private:
   unsigned getSizeExtEncoding(const MachineInstr &MI, unsigned OpNo) const;
   unsigned getSizeInsEncoding(const MachineInstr &MI, unsigned OpNo) const;
   unsigned getLSAImmEncoding(const MachineInstr &MI, unsigned OpNo) const;
+  unsigned getSimm19Lsl2Encoding(const MachineInstr &MI, unsigned OpNo) const;
 
   /// Expand pseudo instructions with accumulator register operands.
   void expandACCInstr(MachineBasicBlock::instr_iterator MI,
@@ -138,7 +146,7 @@ bool MipsCodeEmitter::runOnMachineFunction(MachineFunction &MF) {
   TD = Target.getDataLayout();
   Subtarget = &TM.getSubtarget<MipsSubtarget> ();
   MCPEs = &MF.getConstantPool()->getConstants();
-  MJTEs = 0;
+  MJTEs = nullptr;
   if (MF.getJumpTableInfo()) MJTEs = &MF.getJumpTableInfo()->getJumpTables();
   JTI->Initialize(MF, IsPIC, Subtarget->isLittle());
   MCE.setModuleInfo(&getAnalysis<MachineModuleInfo> ());
@@ -201,6 +209,24 @@ unsigned MipsCodeEmitter::getBranchTargetOpValueMM(const MachineInstr &MI,
   return 0;
 }
 
+unsigned MipsCodeEmitter::getBranchTarget21OpValue(const MachineInstr &MI,
+                                                   unsigned OpNo) const {
+  llvm_unreachable("Unimplemented function.");
+  return 0;
+}
+
+unsigned MipsCodeEmitter::getBranchTarget26OpValue(const MachineInstr &MI,
+                                                   unsigned OpNo) const {
+  llvm_unreachable("Unimplemented function.");
+  return 0;
+}
+
+unsigned MipsCodeEmitter::getJumpOffset16OpValue(const MachineInstr &MI,
+                                                 unsigned OpNo) const {
+  llvm_unreachable("Unimplemented function.");
+  return 0;
+}
+
 unsigned MipsCodeEmitter::getBranchTargetOpValue(const MachineInstr &MI,
                                                  unsigned OpNo) const {
   MachineOperand MO = MI.getOperand(OpNo);
@@ -247,6 +273,12 @@ unsigned MipsCodeEmitter::getLSAImmEncoding(const MachineInstr &MI,
   return 0;
 }
 
+unsigned MipsCodeEmitter::getSimm19Lsl2Encoding(const MachineInstr &MI,
+                                                unsigned OpNo) const {
+  llvm_unreachable("Unimplemented function.");
+  return 0;
+}
+
 /// getMachineOpValue - Return binary encoding of operand. If the machine
 /// operand requires relocation, record the relocation and return zero.
 unsigned MipsCodeEmitter::getMachineOpValue(const MachineInstr &MI,
diff --git a/lib/Target/Mips/MipsCondMov.td b/lib/Target/Mips/MipsCondMov.td
index 567eef9..7177f65 100644
--- a/lib/Target/Mips/MipsCondMov.td
+++ b/lib/Target/Mips/MipsCondMov.td
@@ -104,9 +104,9 @@ multiclass MovnPats<RegisterClass CRC, RegisterClass DRC, Instruction MOVNInst,
 
 // Instantiation of instructions.
 def MOVZ_I_I : MMRel, CMov_I_I_FT<"movz", GPR32Opnd, GPR32Opnd, II_MOVZ>,
-               ADD_FM<0, 0xa>;
+               ADD_FM<0, 0xa>, INSN_MIPS4_32;
 
-let Predicates = [HasStdEnc], isCodeGenOnly = 1 in {
+let isCodeGenOnly = 1 in {
   def MOVZ_I_I64   : CMov_I_I_FT<"movz", GPR32Opnd, GPR64Opnd, II_MOVZ>,
                      ADD_FM<0, 0xa>;
   def MOVZ_I64_I   : CMov_I_I_FT<"movz", GPR64Opnd, GPR32Opnd, II_MOVZ>,
@@ -116,9 +116,9 @@ let Predicates = [HasStdEnc], isCodeGenOnly = 1 in {
 }
 
 def MOVN_I_I       : MMRel, CMov_I_I_FT<"movn", GPR32Opnd, GPR32Opnd, II_MOVN>,
-                     ADD_FM<0, 0xb>;
+                     ADD_FM<0, 0xb>, INSN_MIPS4_32;
 
-let Predicates = [HasStdEnc], isCodeGenOnly = 1 in {
+let isCodeGenOnly = 1 in {
   def MOVN_I_I64   : CMov_I_I_FT<"movn", GPR32Opnd, GPR64Opnd, II_MOVN>,
                      ADD_FM<0, 0xb>;
   def MOVN_I64_I   : CMov_I_I_FT<"movn", GPR64Opnd, GPR32Opnd, II_MOVN>,
@@ -128,118 +128,112 @@ let Predicates = [HasStdEnc], isCodeGenOnly = 1 in {
 }
 
 def MOVZ_I_S : MMRel, CMov_I_F_FT<"movz.s", GPR32Opnd, FGR32Opnd, II_MOVZ_S>,
-               CMov_I_F_FM<18, 16>;
+               CMov_I_F_FM<18, 16>, INSN_MIPS4_32;
 
 let isCodeGenOnly = 1 in
 def MOVZ_I64_S : CMov_I_F_FT<"movz.s", GPR64Opnd, FGR32Opnd, II_MOVZ_S>,
-                 CMov_I_F_FM<18, 16>, Requires<[HasMips64, HasStdEnc]>;
+                 CMov_I_F_FM<18, 16>, AdditionalRequires<[HasMips64]>;
 
 def MOVN_I_S : MMRel, CMov_I_F_FT<"movn.s", GPR32Opnd, FGR32Opnd, II_MOVN_S>,
-               CMov_I_F_FM<19, 16>;
+               CMov_I_F_FM<19, 16>, INSN_MIPS4_32;
 
 let isCodeGenOnly = 1 in
 def MOVN_I64_S : CMov_I_F_FT<"movn.s", GPR64Opnd, FGR32Opnd, II_MOVN_S>,
-                 CMov_I_F_FM<19, 16>, Requires<[HasMips64, HasStdEnc]>;
+                 CMov_I_F_FM<19, 16>, AdditionalRequires<[IsGP64bit]>;
 
-let Predicates = [NotFP64bit, HasStdEnc] in {
-  def MOVZ_I_D32 : MMRel, CMov_I_F_FT<"movz.d", GPR32Opnd, AFGR64Opnd,
-                                      II_MOVZ_D>, CMov_I_F_FM<18, 17>;
-  def MOVN_I_D32 : MMRel, CMov_I_F_FT<"movn.d", GPR32Opnd, AFGR64Opnd,
-                                      II_MOVN_D>, CMov_I_F_FM<19, 17>;
-}
+def MOVZ_I_D32 : MMRel, CMov_I_F_FT<"movz.d", GPR32Opnd, AFGR64Opnd,
+                                    II_MOVZ_D>, CMov_I_F_FM<18, 17>,
+                 INSN_MIPS4_32, FGR_32;
+def MOVN_I_D32 : MMRel, CMov_I_F_FT<"movn.d", GPR32Opnd, AFGR64Opnd,
+                                    II_MOVN_D>, CMov_I_F_FM<19, 17>,
+                 INSN_MIPS4_32, FGR_32;
 
-let Predicates = [IsFP64bit, HasStdEnc], DecoderNamespace = "Mips64" in {
+let DecoderNamespace = "Mips64" in {
   def MOVZ_I_D64 : CMov_I_F_FT<"movz.d", GPR32Opnd, FGR64Opnd, II_MOVZ_D>,
-                   CMov_I_F_FM<18, 17>;
+                   CMov_I_F_FM<18, 17>, INSN_MIPS4_32, FGR_64;
   def MOVN_I_D64 : CMov_I_F_FT<"movn.d", GPR32Opnd, FGR64Opnd, II_MOVN_D>,
-                   CMov_I_F_FM<19, 17>;
+                   CMov_I_F_FM<19, 17>, INSN_MIPS4_32, FGR_64;
   let isCodeGenOnly = 1 in {
     def MOVZ_I64_D64 : CMov_I_F_FT<"movz.d", GPR64Opnd, FGR64Opnd,
-                                   II_MOVZ_D>, CMov_I_F_FM<18, 17>;
+                                   II_MOVZ_D>, CMov_I_F_FM<18, 17>, FGR_64;
     def MOVN_I64_D64 : CMov_I_F_FT<"movn.d", GPR64Opnd, FGR64Opnd,
-                                   II_MOVN_D>, CMov_I_F_FM<19, 17>;
+                                   II_MOVN_D>, CMov_I_F_FM<19, 17>, FGR_64;
   }
 }
 
 def MOVT_I : MMRel, CMov_F_I_FT<"movt", GPR32Opnd, II_MOVT, MipsCMovFP_T>,
-             CMov_F_I_FM<1>;
+             CMov_F_I_FM<1>, INSN_MIPS4_32;
 
 let isCodeGenOnly = 1 in
 def MOVT_I64 : CMov_F_I_FT<"movt", GPR64Opnd, II_MOVT, MipsCMovFP_T>,
-               CMov_F_I_FM<1>, Requires<[HasMips64, HasStdEnc]>;
+               CMov_F_I_FM<1>, AdditionalRequires<[IsGP64bit]>;
 
 def MOVF_I : MMRel, CMov_F_I_FT<"movf", GPR32Opnd, II_MOVF, MipsCMovFP_F>,
-             CMov_F_I_FM<0>;
+             CMov_F_I_FM<0>, INSN_MIPS4_32;
 
 let isCodeGenOnly = 1 in
 def MOVF_I64 : CMov_F_I_FT<"movf", GPR64Opnd, II_MOVF, MipsCMovFP_F>,
-               CMov_F_I_FM<0>, Requires<[HasMips64, HasStdEnc]>;
+               CMov_F_I_FM<0>, AdditionalRequires<[IsGP64bit]>;
 
 def MOVT_S : MMRel, CMov_F_F_FT<"movt.s", FGR32Opnd, II_MOVT_S, MipsCMovFP_T>,
-             CMov_F_F_FM<16, 1>;
+             CMov_F_F_FM<16, 1>, INSN_MIPS4_32;
 def MOVF_S : MMRel, CMov_F_F_FT<"movf.s", FGR32Opnd, II_MOVF_S, MipsCMovFP_F>,
-             CMov_F_F_FM<16, 0>;
+             CMov_F_F_FM<16, 0>, INSN_MIPS4_32;
 
-let Predicates = [NotFP64bit, HasStdEnc] in {
-  def MOVT_D32 : MMRel, CMov_F_F_FT<"movt.d", AFGR64Opnd, II_MOVT_D,
-                                    MipsCMovFP_T>, CMov_F_F_FM<17, 1>;
-  def MOVF_D32 : MMRel, CMov_F_F_FT<"movf.d", AFGR64Opnd, II_MOVF_D,
-                                    MipsCMovFP_F>, CMov_F_F_FM<17, 0>;
-}
+def MOVT_D32 : MMRel, CMov_F_F_FT<"movt.d", AFGR64Opnd, II_MOVT_D,
+                                  MipsCMovFP_T>, CMov_F_F_FM<17, 1>,
+               INSN_MIPS4_32, FGR_32;
+def MOVF_D32 : MMRel, CMov_F_F_FT<"movf.d", AFGR64Opnd, II_MOVF_D,
+                                  MipsCMovFP_F>, CMov_F_F_FM<17, 0>,
+               INSN_MIPS4_32, FGR_32;
 
-let Predicates = [IsFP64bit, HasStdEnc], DecoderNamespace = "Mips64" in {
+let DecoderNamespace = "Mips64" in {
   def MOVT_D64 : CMov_F_F_FT<"movt.d", FGR64Opnd, II_MOVT_D, MipsCMovFP_T>,
-                 CMov_F_F_FM<17, 1>;
+                 CMov_F_F_FM<17, 1>, INSN_MIPS4_32, FGR_64;
   def MOVF_D64 : CMov_F_F_FT<"movf.d", FGR64Opnd, II_MOVF_D, MipsCMovFP_F>,
-                 CMov_F_F_FM<17, 0>;
+                 CMov_F_F_FM<17, 0>, INSN_MIPS4_32, FGR_64;
 }
 
 // Instantiation of conditional move patterns.
 defm : MovzPats0<GPR32, GPR32, MOVZ_I_I, SLT, SLTu, SLTi, SLTiu>;
 defm : MovzPats1<GPR32, GPR32, MOVZ_I_I, XOR>;
 defm : MovzPats2<GPR32, GPR32, MOVZ_I_I, XORi>;
-let Predicates = [HasMips64, HasStdEnc] in {
-  defm : MovzPats0<GPR32, GPR64, MOVZ_I_I64, SLT, SLTu, SLTi, SLTiu>;
-  defm : MovzPats0<GPR64, GPR32, MOVZ_I_I, SLT64, SLTu64, SLTi64,
-                   SLTiu64>;
-  defm : MovzPats0<GPR64, GPR64, MOVZ_I_I64, SLT64, SLTu64, SLTi64,
-                   SLTiu64>;
-  defm : MovzPats1<GPR32, GPR64, MOVZ_I_I64, XOR>;
-  defm : MovzPats1<GPR64, GPR32, MOVZ_I64_I, XOR64>;
-  defm : MovzPats1<GPR64, GPR64, MOVZ_I64_I64, XOR64>;
-  defm : MovzPats2<GPR32, GPR64, MOVZ_I_I64, XORi>;
-  defm : MovzPats2<GPR64, GPR32, MOVZ_I64_I, XORi64>;
-  defm : MovzPats2<GPR64, GPR64, MOVZ_I64_I64, XORi64>;
-}
+
+defm : MovzPats0<GPR32, GPR64, MOVZ_I_I64, SLT, SLTu, SLTi, SLTiu>, GPR_64;
+defm : MovzPats0<GPR64, GPR32, MOVZ_I_I, SLT64, SLTu64, SLTi64, SLTiu64>,
+       GPR_64;
+defm : MovzPats0<GPR64, GPR64, MOVZ_I_I64, SLT64, SLTu64, SLTi64, SLTiu64>,
+       GPR_64;
+defm : MovzPats1<GPR32, GPR64, MOVZ_I_I64, XOR>, GPR_64;
+defm : MovzPats1<GPR64, GPR32, MOVZ_I64_I, XOR64>, GPR_64;
+defm : MovzPats1<GPR64, GPR64, MOVZ_I64_I64, XOR64>, GPR_64;
+defm : MovzPats2<GPR32, GPR64, MOVZ_I_I64, XORi>, GPR_64;
+defm : MovzPats2<GPR64, GPR32, MOVZ_I64_I, XORi64>, GPR_64;
+defm : MovzPats2<GPR64, GPR64, MOVZ_I64_I64, XORi64>, GPR_64;
 
 defm : MovnPats<GPR32, GPR32, MOVN_I_I, XOR>;
-let Predicates = [HasMips64, HasStdEnc] in {
-  defm : MovnPats<GPR32, GPR64, MOVN_I_I64, XOR>;
-  defm : MovnPats<GPR64, GPR32, MOVN_I64_I, XOR64>;
-  defm : MovnPats<GPR64, GPR64, MOVN_I64_I64, XOR64>;
-}
+
+defm : MovnPats<GPR32, GPR64, MOVN_I_I64, XOR>, GPR_64;
+defm : MovnPats<GPR64, GPR32, MOVN_I64_I, XOR64>, GPR_64;
+defm : MovnPats<GPR64, GPR64, MOVN_I64_I64, XOR64>, GPR_64;
 
 defm : MovzPats0<GPR32, FGR32, MOVZ_I_S, SLT, SLTu, SLTi, SLTiu>;
 defm : MovzPats1<GPR32, FGR32, MOVZ_I_S, XOR>;
 defm : MovnPats<GPR32, FGR32, MOVN_I_S, XOR>;
-let Predicates = [HasMips64, HasStdEnc] in {
-  defm : MovzPats0<GPR64, FGR32, MOVZ_I_S, SLT64, SLTu64, SLTi64,
-                   SLTiu64>;
-  defm : MovzPats1<GPR64, FGR32, MOVZ_I64_S, XOR64>;
-  defm : MovnPats<GPR64, FGR32, MOVN_I64_S, XOR64>;
-}
 
-let Predicates = [NotFP64bit, HasStdEnc] in {
-  defm : MovzPats0<GPR32, AFGR64, MOVZ_I_D32, SLT, SLTu, SLTi, SLTiu>;
-  defm : MovzPats1<GPR32, AFGR64, MOVZ_I_D32, XOR>;
-  defm : MovnPats<GPR32, AFGR64, MOVN_I_D32, XOR>;
-}
-let Predicates = [IsFP64bit, HasStdEnc] in {
-  defm : MovzPats0<GPR32, FGR64, MOVZ_I_D64, SLT, SLTu, SLTi, SLTiu>;
-  defm : MovzPats0<GPR64, FGR64, MOVZ_I_D64, SLT64, SLTu64, SLTi64,
-                   SLTiu64>;
-  defm : MovzPats1<GPR32, FGR64, MOVZ_I_D64, XOR>;
-  defm : MovzPats1<GPR64, FGR64, MOVZ_I64_D64, XOR64>;
-  defm : MovnPats<GPR32, FGR64, MOVN_I_D64, XOR>;
-  defm : MovnPats<GPR64, FGR64, MOVN_I64_D64, XOR64>;
-}
+defm : MovzPats0<GPR64, FGR32, MOVZ_I_S, SLT64, SLTu64, SLTi64, SLTiu64>,
+       GPR_64;
+defm : MovzPats1<GPR64, FGR32, MOVZ_I64_S, XOR64>, GPR_64;
+defm : MovnPats<GPR64, FGR32, MOVN_I64_S, XOR64>, GPR_64;
+
+defm : MovzPats0<GPR32, AFGR64, MOVZ_I_D32, SLT, SLTu, SLTi, SLTiu>, FGR_32;
+defm : MovzPats1<GPR32, AFGR64, MOVZ_I_D32, XOR>, FGR_32;
+defm : MovnPats<GPR32, AFGR64, MOVN_I_D32, XOR>, FGR_32;
+
+defm : MovzPats0<GPR32, FGR64, MOVZ_I_D64, SLT, SLTu, SLTi, SLTiu>, FGR_64;
+defm : MovzPats0<GPR64, FGR64, MOVZ_I_D64, SLT64, SLTu64, SLTi64, SLTiu64>,
+       FGR_64;
+defm : MovzPats1<GPR32, FGR64, MOVZ_I_D64, XOR>, FGR_64;
+defm : MovzPats1<GPR64, FGR64, MOVZ_I64_D64, XOR64>, FGR_64;
+defm : MovnPats<GPR32, FGR64, MOVN_I_D64, XOR>, FGR_64;
+defm : MovnPats<GPR64, FGR64, MOVN_I64_D64, XOR64>, FGR_64;
diff --git a/lib/Target/Mips/MipsConstantIslandPass.cpp b/lib/Target/Mips/MipsConstantIslandPass.cpp
index e5642ba..a37062f 100644
--- a/lib/Target/Mips/MipsConstantIslandPass.cpp
+++ b/lib/Target/Mips/MipsConstantIslandPass.cpp
@@ -21,8 +21,6 @@
 //
 //
 
-#define DEBUG_TYPE "mips-constant-islands"
-
 #include "Mips.h"
 #include "MCTargetDesc/MipsBaseInfo.h"
 #include "Mips16InstrInfo.h"
@@ -47,6 +45,8 @@
 
 using namespace llvm;
 
+#define DEBUG_TYPE "mips-constant-islands"
+
 STATISTIC(NumCPEs,       "Number of constpool entries");
 STATISTIC(NumSplit,      "Number of uncond branches inserted");
 STATISTIC(NumCBrFixed,   "Number of cond branches fixed");
@@ -368,14 +368,14 @@ namespace {
       : MachineFunctionPass(ID), TM(tm),
         IsPIC(TM.getRelocationModel() == Reloc::PIC_),
         ABI(TM.getSubtarget<MipsSubtarget>().getTargetABI()),
-        STI(&TM.getSubtarget<MipsSubtarget>()), MF(0), MCP(0),
+        STI(&TM.getSubtarget<MipsSubtarget>()), MF(nullptr), MCP(nullptr),
         PrescannedForConstants(false){}
 
-    virtual const char *getPassName() const {
+    const char *getPassName() const override {
       return "Mips Constant Islands";
     }
 
-    bool runOnMachineFunction(MachineFunction &F);
+    bool runOnMachineFunction(MachineFunction &F) override;
 
     void doInitialPlacement(std::vector<MachineInstr*> &CPEMIs);
     CPEntry *findConstPoolEntry(unsigned CPI, const MachineInstr *CPEMI);
@@ -628,7 +628,7 @@ MipsConstantIslands::CPEntry
     if (CPEs[i].CPEMI == CPEMI)
       return &CPEs[i];
   }
-  return NULL;
+  return nullptr;
 }
 
 /// getCPELogAlign - Returns the required alignment of the constant pool entry
@@ -1065,7 +1065,7 @@ bool MipsConstantIslands::decrementCPEReferenceCount(unsigned CPI,
   assert(CPE && "Unexpected!");
   if (--CPE->RefCount == 0) {
     removeDeadCPEMI(CPEMI);
-    CPE->CPEMI = NULL;
+    CPE->CPEMI = nullptr;
     --NumCPEs;
     return true;
   }
@@ -1098,7 +1098,7 @@ int MipsConstantIslands::findInRangeCPEntry(CPUser& U, unsigned UserOffset)
     if (CPEs[i].CPEMI == CPEMI)
       continue;
     // Removing CPEs can leave empty entries, skip
-    if (CPEs[i].CPEMI == NULL)
+    if (CPEs[i].CPEMI == nullptr)
       continue;
     if (isCPEntryInRange(UserMI, UserOffset, CPEs[i].CPEMI, U.getMaxDisp(),
                      U.NegOk)) {
@@ -1154,7 +1154,7 @@ int MipsConstantIslands::findLongFormInRangeCPEntry
     if (CPEs[i].CPEMI == CPEMI)
       continue;
     // Removing CPEs can leave empty entries, skip
-    if (CPEs[i].CPEMI == NULL)
+    if (CPEs[i].CPEMI == nullptr)
       continue;
     if (isCPEntryInRange(UserMI, UserOffset, CPEs[i].CPEMI,
                          U.getLongFormMaxDisp(), U.NegOk)) {
@@ -1486,7 +1486,7 @@ bool MipsConstantIslands::removeUnusedCPEntries() {
       for (unsigned j = 0, ee = CPEs.size(); j != ee; ++j) {
         if (CPEs[j].RefCount == 0 && CPEs[j].CPEMI) {
           removeDeadCPEMI(CPEs[j].CPEMI);
-          CPEs[j].CPEMI = NULL;
+          CPEs[j].CPEMI = nullptr;
           MadeChange = true;
         }
       }
diff --git a/lib/Target/Mips/MipsDelaySlotFiller.cpp b/lib/Target/Mips/MipsDelaySlotFiller.cpp
index eef9f38..d6c7cac 100644
--- a/lib/Target/Mips/MipsDelaySlotFiller.cpp
+++ b/lib/Target/Mips/MipsDelaySlotFiller.cpp
@@ -11,8 +11,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "delay-slot-filler"
-
 #include "MCTargetDesc/MipsMCNaCl.h"
 #include "Mips.h"
 #include "MipsInstrInfo.h"
@@ -33,6 +31,8 @@
 
 using namespace llvm;
 
+#define DEBUG_TYPE "delay-slot-filler"
+
 STATISTIC(FilledSlots, "Number of delay slots filled");
 STATISTIC(UsefulSlots, "Number of delay slots filled with instructions that"
                        " are not NOP.");
@@ -124,7 +124,7 @@ namespace {
   public:
     NoMemInstr() : InspectMemInstr(true) {}
   private:
-    virtual bool hasHazard_(const MachineInstr &MI) { return true; }
+    bool hasHazard_(const MachineInstr &MI) override { return true; }
   };
 
   /// This subclass accepts loads from stacks and constant loads.
@@ -132,7 +132,7 @@ namespace {
   public:
     LoadFromStackOrConst() : InspectMemInstr(false) {}
   private:
-    virtual bool hasHazard_(const MachineInstr &MI);
+    bool hasHazard_(const MachineInstr &MI) override;
   };
 
   /// This subclass uses memory dependence information to determine whether a
@@ -142,19 +142,21 @@ namespace {
     MemDefsUses(const MachineFrameInfo *MFI);
 
   private:
-    virtual bool hasHazard_(const MachineInstr &MI);
+    typedef PointerUnion<const Value *, const PseudoSourceValue *> ValueType;
+
+    bool hasHazard_(const MachineInstr &MI) override;
 
     /// Update Defs and Uses. Return true if there exist dependences that
     /// disqualify the delay slot candidate between V and values in Uses and
     /// Defs.
-    bool updateDefsUses(const Value *V, bool MayStore);
+    bool updateDefsUses(ValueType V, bool MayStore);
 
     /// Get the list of underlying objects of MI's memory operand.
     bool getUnderlyingObjects(const MachineInstr &MI,
-                              SmallVectorImpl<const Value *> &Objects) const;
+                              SmallVectorImpl<ValueType> &Objects) const;
 
     const MachineFrameInfo *MFI;
-    SmallPtrSet<const Value*, 4> Uses, Defs;
+    SmallPtrSet<ValueType, 4> Uses, Defs;
 
     /// Flags indicating whether loads or stores with no underlying objects have
     /// been seen.
@@ -166,11 +168,11 @@ namespace {
     Filler(TargetMachine &tm)
       : MachineFunctionPass(ID), TM(tm) { }
 
-    virtual const char *getPassName() const {
+    const char *getPassName() const override {
       return "Mips Delay Slot Filler";
     }
 
-    bool runOnMachineFunction(MachineFunction &F) {
+    bool runOnMachineFunction(MachineFunction &F) override {
       bool Changed = false;
       for (MachineFunction::iterator FI = F.begin(), FE = F.end();
            FI != FE; ++FI)
@@ -178,7 +180,7 @@ namespace {
       return Changed;
     }
 
-    void getAnalysisUsage(AnalysisUsage &AU) const {
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
       AU.addRequired<MachineBranchProbabilityInfo>();
       MachineFunctionPass::getAnalysisUsage(AU);
     }
@@ -399,16 +401,15 @@ bool LoadFromStackOrConst::hasHazard_(const MachineInstr &MI) {
   if (MI.mayStore())
     return true;
 
-  if (!MI.hasOneMemOperand() || !(*MI.memoperands_begin())->getValue())
+  if (!MI.hasOneMemOperand() || !(*MI.memoperands_begin())->getPseudoValue())
     return true;
 
-  const Value *V = (*MI.memoperands_begin())->getValue();
-
-  if (isa<FixedStackPseudoSourceValue>(V))
-    return false;
-
-  if (const PseudoSourceValue *PSV = dyn_cast<const PseudoSourceValue>(V))
-    return !PSV->isConstant(0) && V != PseudoSourceValue::getStack();
+  if (const PseudoSourceValue *PSV =
+      (*MI.memoperands_begin())->getPseudoValue()) {
+    if (isa<FixedStackPseudoSourceValue>(PSV))
+      return false;
+    return !PSV->isConstant(nullptr) && PSV != PseudoSourceValue::getStack();
+  }
 
   return true;
 }
@@ -419,11 +420,11 @@ MemDefsUses::MemDefsUses(const MachineFrameInfo *MFI_)
 
 bool MemDefsUses::hasHazard_(const MachineInstr &MI) {
   bool HasHazard = false;
-  SmallVector<const Value *, 4> Objs;
+  SmallVector<ValueType, 4> Objs;
 
   // Check underlying object list.
   if (getUnderlyingObjects(MI, Objs)) {
-    for (SmallVectorImpl<const Value *>::const_iterator I = Objs.begin();
+    for (SmallVectorImpl<ValueType>::const_iterator I = Objs.begin();
          I != Objs.end(); ++I)
       HasHazard |= updateDefsUses(*I, MI.mayStore());
 
@@ -440,7 +441,7 @@ bool MemDefsUses::hasHazard_(const MachineInstr &MI) {
   return HasHazard;
 }
 
-bool MemDefsUses::updateDefsUses(const Value *V, bool MayStore) {
+bool MemDefsUses::updateDefsUses(ValueType V, bool MayStore) {
   if (MayStore)
     return !Defs.insert(V) || Uses.count(V) || SeenNoObjStore || SeenNoObjLoad;
 
@@ -450,10 +451,20 @@ bool MemDefsUses::updateDefsUses(const Value *V, bool MayStore) {
 
 bool MemDefsUses::
 getUnderlyingObjects(const MachineInstr &MI,
-                     SmallVectorImpl<const Value *> &Objects) const {
-  if (!MI.hasOneMemOperand() || !(*MI.memoperands_begin())->getValue())
+                     SmallVectorImpl<ValueType> &Objects) const {
+  if (!MI.hasOneMemOperand() ||
+      (!(*MI.memoperands_begin())->getValue() &&
+       !(*MI.memoperands_begin())->getPseudoValue()))
     return false;
 
+  if (const PseudoSourceValue *PSV =
+      (*MI.memoperands_begin())->getPseudoValue()) {
+    if (!PSV->isAliased(MFI))
+      return false;
+    Objects.push_back(PSV);
+    return true;
+  }
+
   const Value *V = (*MI.memoperands_begin())->getValue();
 
   SmallVector<Value *, 4> Objs;
@@ -461,10 +472,7 @@ getUnderlyingObjects(const MachineInstr &MI,
 
   for (SmallVectorImpl<Value *>::iterator I = Objs.begin(), E = Objs.end();
        I != E; ++I) {
-    if (const PseudoSourceValue *PSV = dyn_cast<PseudoSourceValue>(*I)) {
-      if (PSV->isAliased(MFI))
-        return false;
-    } else if (!isIdentifiedObject(V))
+    if (!isIdentifiedObject(V))
       return false;
 
     Objects.push_back(*I);
@@ -602,7 +610,7 @@ bool Filler::searchSuccBBs(MachineBasicBlock &MBB, Iter Slot) const {
   RegDefsUses RegDU(TM);
   bool HasMultipleSuccs = false;
   BB2BrMap BrMap;
-  OwningPtr<InspectMemInstr> IM;
+  std::unique_ptr<InspectMemInstr> IM;
   Iter Filler;
 
   // Iterate over SuccBB's predecessor list.
@@ -636,7 +644,7 @@ bool Filler::searchSuccBBs(MachineBasicBlock &MBB, Iter Slot) const {
 
 MachineBasicBlock *Filler::selectSuccBB(MachineBasicBlock &B) const {
   if (B.succ_empty())
-    return NULL;
+    return nullptr;
 
   // Select the successor with the larget edge weight.
   auto &Prob = getAnalysis<MachineBranchProbabilityInfo>();
@@ -645,14 +653,14 @@ MachineBasicBlock *Filler::selectSuccBB(MachineBasicBlock &B) const {
                                                const MachineBasicBlock *Dst1) {
     return Prob.getEdgeWeight(&B, Dst0) < Prob.getEdgeWeight(&B, Dst1);
   });
-  return S->isLandingPad() ? NULL : S;
+  return S->isLandingPad() ? nullptr : S;
 }
 
 std::pair<MipsInstrInfo::BranchType, MachineInstr *>
 Filler::getBranch(MachineBasicBlock &MBB, const MachineBasicBlock &Dst) const {
   const MipsInstrInfo *TII =
     static_cast<const MipsInstrInfo*>(TM.getInstrInfo());
-  MachineBasicBlock *TrueBB = 0, *FalseBB = 0;
+  MachineBasicBlock *TrueBB = nullptr, *FalseBB = nullptr;
   SmallVector<MachineInstr*, 2> BranchInstrs;
   SmallVector<MachineOperand, 2> Cond;
 
@@ -660,11 +668,11 @@ Filler::getBranch(MachineBasicBlock &MBB, const MachineBasicBlock &Dst) const {
     TII->AnalyzeBranch(MBB, TrueBB, FalseBB, Cond, false, BranchInstrs);
 
   if ((R == MipsInstrInfo::BT_None) || (R == MipsInstrInfo::BT_NoBranch))
-    return std::make_pair(R, (MachineInstr*)NULL);
+    return std::make_pair(R, nullptr);
 
   if (R != MipsInstrInfo::BT_CondUncond) {
     if (!hasUnoccupiedSlot(BranchInstrs[0]))
-      return std::make_pair(MipsInstrInfo::BT_None, (MachineInstr*)NULL);
+      return std::make_pair(MipsInstrInfo::BT_None, nullptr);
 
     assert(((R != MipsInstrInfo::BT_Uncond) || (TrueBB == &Dst)));
 
@@ -681,7 +689,7 @@ Filler::getBranch(MachineBasicBlock &MBB, const MachineBasicBlock &Dst) const {
   if (hasUnoccupiedSlot(BranchInstrs[1]) && (FalseBB == &Dst))
     return std::make_pair(MipsInstrInfo::BT_Uncond, BranchInstrs[1]);
 
-  return std::make_pair(MipsInstrInfo::BT_None, (MachineInstr*)NULL);
+  return std::make_pair(MipsInstrInfo::BT_None, nullptr);
 }
 
 bool Filler::examinePred(MachineBasicBlock &Pred, const MachineBasicBlock &Succ,
diff --git a/lib/Target/Mips/MipsFastISel.cpp b/lib/Target/Mips/MipsFastISel.cpp
new file mode 100644
index 0000000..268a0ed
--- /dev/null
+++ b/lib/Target/Mips/MipsFastISel.cpp
@@ -0,0 +1,283 @@
+//===-- MipsastISel.cpp - Mips FastISel implementation
+//---------------------===//
+
+#include "llvm/CodeGen/FunctionLoweringInfo.h"
+#include "llvm/CodeGen/FastISel.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetLibraryInfo.h"
+#include "MipsRegisterInfo.h"
+#include "MipsISelLowering.h"
+#include "MipsMachineFunction.h"
+#include "MipsSubtarget.h"
+
+using namespace llvm;
+
+namespace {
+
+// All possible address modes.
+typedef struct Address {
+  enum { RegBase, FrameIndexBase } BaseType;
+
+  union {
+    unsigned Reg;
+    int FI;
+  } Base;
+
+  int64_t Offset;
+
+  // Innocuous defaults for our address.
+  Address() : BaseType(RegBase), Offset(0) { Base.Reg = 0; }
+} Address;
+
+class MipsFastISel final : public FastISel {
+
+  /// Subtarget - Keep a pointer to the MipsSubtarget around so that we can
+  /// make the right decision when generating code for different targets.
+  const MipsSubtarget *Subtarget;
+  Module &M;
+  const TargetMachine &TM;
+  const TargetInstrInfo &TII;
+  const TargetLowering &TLI;
+  MipsFunctionInfo *MFI;
+
+  // Convenience variables to avoid some queries.
+  LLVMContext *Context;
+
+  bool TargetSupported;
+
+public:
+  explicit MipsFastISel(FunctionLoweringInfo &funcInfo,
+                        const TargetLibraryInfo *libInfo)
+      : FastISel(funcInfo, libInfo),
+        M(const_cast<Module &>(*funcInfo.Fn->getParent())),
+        TM(funcInfo.MF->getTarget()), TII(*TM.getInstrInfo()),
+        TLI(*TM.getTargetLowering()) {
+    Subtarget = &TM.getSubtarget<MipsSubtarget>();
+    MFI = funcInfo.MF->getInfo<MipsFunctionInfo>();
+    Context = &funcInfo.Fn->getContext();
+    TargetSupported = ((Subtarget->getRelocationModel() == Reloc::PIC_) &&
+                       (Subtarget->hasMips32r2() && (Subtarget->isABI_O32())));
+  }
+
+  bool TargetSelectInstruction(const Instruction *I) override;
+  unsigned TargetMaterializeConstant(const Constant *C) override;
+
+  bool ComputeAddress(const Value *Obj, Address &Addr);
+
+private:
+  bool EmitStore(MVT VT, unsigned SrcReg, Address &Addr,
+                 unsigned Alignment = 0);
+  bool SelectRet(const Instruction *I);
+  bool SelectStore(const Instruction *I);
+
+  bool isTypeLegal(Type *Ty, MVT &VT);
+  bool isLoadTypeLegal(Type *Ty, MVT &VT);
+
+  unsigned MaterializeFP(const ConstantFP *CFP, MVT VT);
+  unsigned MaterializeGV(const GlobalValue *GV, MVT VT);
+  unsigned MaterializeInt(const Constant *C, MVT VT);
+  unsigned Materialize32BitInt(int64_t Imm, const TargetRegisterClass *RC);
+};
+
+bool MipsFastISel::isTypeLegal(Type *Ty, MVT &VT) {
+  EVT evt = TLI.getValueType(Ty, true);
+  // Only handle simple types.
+  if (evt == MVT::Other || !evt.isSimple())
+    return false;
+  VT = evt.getSimpleVT();
+
+  // Handle all legal types, i.e. a register that will directly hold this
+  // value.
+  return TLI.isTypeLegal(VT);
+}
+
+bool MipsFastISel::isLoadTypeLegal(Type *Ty, MVT &VT) {
+  if (isTypeLegal(Ty, VT))
+    return true;
+  // We will extend this in a later patch:
+  //   If this is a type than can be sign or zero-extended to a basic operation
+  //   go ahead and accept it now.
+  return false;
+}
+
+bool MipsFastISel::ComputeAddress(const Value *Obj, Address &Addr) {
+  // This construct looks a big awkward but it is how other ports handle this
+  // and as this function is more fully completed, these cases which
+  // return false will have additional code in them.
+  //
+  if (isa<Instruction>(Obj))
+    return false;
+  else if (isa<ConstantExpr>(Obj))
+    return false;
+  Addr.Base.Reg = getRegForValue(Obj);
+  return Addr.Base.Reg != 0;
+}
+
+// Materialize a constant into a register, and return the register
+// number (or zero if we failed to handle it).
+unsigned MipsFastISel::TargetMaterializeConstant(const Constant *C) {
+  EVT CEVT = TLI.getValueType(C->getType(), true);
+
+  // Only handle simple types.
+  if (!CEVT.isSimple())
+    return 0;
+  MVT VT = CEVT.getSimpleVT();
+
+  if (const ConstantFP *CFP = dyn_cast<ConstantFP>(C))
+    return MaterializeFP(CFP, VT);
+  else if (const GlobalValue *GV = dyn_cast<GlobalValue>(C))
+    return MaterializeGV(GV, VT);
+  else if (isa<ConstantInt>(C))
+    return MaterializeInt(C, VT);
+
+  return 0;
+}
+
+bool MipsFastISel::EmitStore(MVT VT, unsigned SrcReg, Address &Addr,
+                             unsigned Alignment) {
+  //
+  // more cases will be handled here in following patches.
+  //
+  if (VT != MVT::i32)
+    return false;
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Mips::SW))
+      .addReg(SrcReg)
+      .addReg(Addr.Base.Reg)
+      .addImm(Addr.Offset);
+  return true;
+}
+
+bool MipsFastISel::SelectStore(const Instruction *I) {
+  Value *Op0 = I->getOperand(0);
+  unsigned SrcReg = 0;
+
+  // Atomic stores need special handling.
+  if (cast<StoreInst>(I)->isAtomic())
+    return false;
+
+  // Verify we have a legal type before going any further.
+  MVT VT;
+  if (!isLoadTypeLegal(I->getOperand(0)->getType(), VT))
+    return false;
+
+  // Get the value to be stored into a register.
+  SrcReg = getRegForValue(Op0);
+  if (SrcReg == 0)
+    return false;
+
+  // See if we can handle this address.
+  Address Addr;
+  if (!ComputeAddress(I->getOperand(1), Addr))
+    return false;
+
+  if (!EmitStore(VT, SrcReg, Addr, cast<StoreInst>(I)->getAlignment()))
+    return false;
+  return true;
+}
+
+bool MipsFastISel::SelectRet(const Instruction *I) {
+  const ReturnInst *Ret = cast<ReturnInst>(I);
+
+  if (!FuncInfo.CanLowerReturn)
+    return false;
+  if (Ret->getNumOperands() > 0) {
+    return false;
+  }
+  unsigned RetOpc = Mips::RetRA;
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(RetOpc));
+  return true;
+}
+
+bool MipsFastISel::TargetSelectInstruction(const Instruction *I) {
+  if (!TargetSupported)
+    return false;
+  switch (I->getOpcode()) {
+  default:
+    break;
+  case Instruction::Store:
+    return SelectStore(I);
+  case Instruction::Ret:
+    return SelectRet(I);
+  }
+  return false;
+}
+}
+
+unsigned MipsFastISel::MaterializeFP(const ConstantFP *CFP, MVT VT) {
+  return 0;
+}
+
+unsigned MipsFastISel::MaterializeGV(const GlobalValue *GV, MVT VT) {
+  // For now 32-bit only.
+  if (VT != MVT::i32)
+    return 0;
+  const TargetRegisterClass *RC = &Mips::GPR32RegClass;
+  unsigned DestReg = createResultReg(RC);
+  const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV);
+  bool IsThreadLocal = GVar && GVar->isThreadLocal();
+  // TLS not supported at this time.
+  if (IsThreadLocal)
+    return 0;
+  BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Mips::LW), DestReg)
+      .addReg(MFI->getGlobalBaseReg())
+      .addGlobalAddress(GV, 0, MipsII::MO_GOT);
+  return DestReg;
+}
+unsigned MipsFastISel::MaterializeInt(const Constant *C, MVT VT) {
+  if (VT != MVT::i32 && VT != MVT::i16 && VT != MVT::i8 && VT != MVT::i1)
+    return 0;
+  const TargetRegisterClass *RC = &Mips::GPR32RegClass;
+  const ConstantInt *CI = cast<ConstantInt>(C);
+  int64_t Imm;
+  if (CI->isNegative())
+    Imm = CI->getSExtValue();
+  else
+    Imm = CI->getZExtValue();
+  return Materialize32BitInt(Imm, RC);
+}
+
+unsigned MipsFastISel::Materialize32BitInt(int64_t Imm,
+                                           const TargetRegisterClass *RC) {
+  unsigned ResultReg = createResultReg(RC);
+
+  if (isInt<16>(Imm)) {
+    unsigned Opc = Mips::ADDiu;
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
+        .addReg(Mips::ZERO)
+        .addImm(Imm);
+    return ResultReg;
+  } else if (isUInt<16>(Imm)) {
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Mips::ORi),
+            ResultReg)
+        .addReg(Mips::ZERO)
+        .addImm(Imm);
+    return ResultReg;
+  }
+  unsigned Lo = Imm & 0xFFFF;
+  unsigned Hi = (Imm >> 16) & 0xFFFF;
+  if (Lo) {
+    // Both Lo and Hi have nonzero bits.
+    unsigned TmpReg = createResultReg(RC);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Mips::LUi),
+            TmpReg).addImm(Hi);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Mips::ORi),
+            ResultReg)
+        .addReg(TmpReg)
+        .addImm(Lo);
+
+  } else {
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Mips::LUi),
+            ResultReg).addImm(Hi);
+  }
+  return ResultReg;
+}
+
+namespace llvm {
+FastISel *Mips::createFastISel(FunctionLoweringInfo &funcInfo,
+                               const TargetLibraryInfo *libInfo) {
+  return new MipsFastISel(funcInfo, libInfo);
+}
+}
diff --git a/lib/Target/Mips/MipsFrameLowering.cpp b/lib/Target/Mips/MipsFrameLowering.cpp
index eb9d49f..8ba35fa 100644
--- a/lib/Target/Mips/MipsFrameLowering.cpp
+++ b/lib/Target/Mips/MipsFrameLowering.cpp
@@ -110,7 +110,7 @@ uint64_t MipsFrameLowering::estimateStackSize(const MachineFunction &MF) const {
     Offset = std::max(Offset, -MFI->getObjectOffset(I));
 
   // Conservatively assume all callee-saved registers will be saved.
-  for (const uint16_t *R = TRI.getCalleeSavedRegs(&MF); *R; ++R) {
+  for (const MCPhysReg *R = TRI.getCalleeSavedRegs(&MF); *R; ++R) {
     unsigned Size = TRI.getMinimalPhysRegClass(*R)->getSize();
     Offset = RoundUpToAlignment(Offset + Size, Size);
   }
diff --git a/lib/Target/Mips/MipsFrameLowering.h b/lib/Target/Mips/MipsFrameLowering.h
index 6a5f79d..e10a3a5 100644
--- a/lib/Target/Mips/MipsFrameLowering.h
+++ b/lib/Target/Mips/MipsFrameLowering.h
@@ -32,7 +32,7 @@ public:
   static const MipsFrameLowering *create(MipsTargetMachine &TM,
                                          const MipsSubtarget &ST);
 
-  bool hasFP(const MachineFunction &MF) const;
+  bool hasFP(const MachineFunction &MF) const override;
 
 protected:
   uint64_t estimateStackSize(const MachineFunction &MF) const;
diff --git a/lib/Target/Mips/MipsISelDAGToDAG.cpp b/lib/Target/Mips/MipsISelDAGToDAG.cpp
index 941aeac..90cff63 100644
--- a/lib/Target/Mips/MipsISelDAGToDAG.cpp
+++ b/lib/Target/Mips/MipsISelDAGToDAG.cpp
@@ -11,7 +11,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "mips-isel"
 #include "MipsISelDAGToDAG.h"
 #include "MCTargetDesc/MipsBaseInfo.h"
 #include "Mips.h"
@@ -36,6 +35,8 @@
 #include "llvm/Target/TargetMachine.h"
 using namespace llvm;
 
+#define DEBUG_TYPE "mips-isel"
+
 //===----------------------------------------------------------------------===//
 // Instruction Selector Implementation
 //===----------------------------------------------------------------------===//
@@ -182,7 +183,7 @@ SDNode* MipsDAGToDAGISel::Select(SDNode *Node) {
   if (Node->isMachineOpcode()) {
     DEBUG(errs() << "== "; Node->dump(CurDAG); errs() << "\n");
     Node->setNodeId(-1);
-    return NULL;
+    return nullptr;
   }
 
   // See if subclasses can handle this node.
@@ -201,8 +202,9 @@ SDNode* MipsDAGToDAGISel::Select(SDNode *Node) {
 #ifndef NDEBUG
   case ISD::LOAD:
   case ISD::STORE:
-    assert(cast<MemSDNode>(Node)->getMemoryVT().getSizeInBits() / 8 <=
-           cast<MemSDNode>(Node)->getAlignment() &&
+    assert((Subtarget.systemSupportsUnalignedAccess() ||
+            cast<MemSDNode>(Node)->getMemoryVT().getSizeInBits() / 8 <=
+            cast<MemSDNode>(Node)->getAlignment()) &&
            "Unexpected unaligned loads/stores.");
     break;
 #endif
@@ -212,7 +214,7 @@ SDNode* MipsDAGToDAGISel::Select(SDNode *Node) {
   SDNode *ResNode = SelectCode(Node);
 
   DEBUG(errs() << "=> ");
-  if (ResNode == NULL || ResNode == Node)
+  if (ResNode == nullptr || ResNode == Node)
     DEBUG(Node->dump(CurDAG));
   else
     DEBUG(ResNode->dump(CurDAG));
diff --git a/lib/Target/Mips/MipsISelDAGToDAG.h b/lib/Target/Mips/MipsISelDAGToDAG.h
index 4546182..13becb6 100644
--- a/lib/Target/Mips/MipsISelDAGToDAG.h
+++ b/lib/Target/Mips/MipsISelDAGToDAG.h
@@ -35,11 +35,11 @@ public:
     : SelectionDAGISel(TM), Subtarget(TM.getSubtarget<MipsSubtarget>()) {}
 
   // Pass Name
-  virtual const char *getPassName() const {
+  const char *getPassName() const override {
     return "MIPS DAG->DAG Pattern Instruction Selection";
   }
 
-  virtual bool runOnMachineFunction(MachineFunction &MF);
+  bool runOnMachineFunction(MachineFunction &MF) override;
 
 protected:
   SDNode *getGlobalBaseReg();
@@ -110,7 +110,7 @@ private:
   /// starting at bit zero.
   virtual bool selectVSplatMaskR(SDValue N, SDValue &Imm) const;
 
-  virtual SDNode *Select(SDNode *N);
+  SDNode *Select(SDNode *N) override;
 
   virtual std::pair<bool, SDNode*> selectNode(SDNode *Node) = 0;
 
@@ -121,9 +121,9 @@ private:
 
   virtual void processFunctionAfterISel(MachineFunction &MF) = 0;
 
-  virtual bool SelectInlineAsmMemoryOperand(const SDValue &Op,
-                                            char ConstraintCode,
-                                            std::vector<SDValue> &OutOps);
+  bool SelectInlineAsmMemoryOperand(const SDValue &Op,
+                                    char ConstraintCode,
+                                    std::vector<SDValue> &OutOps) override;
 };
 
 /// createMipsISelDag - This pass converts a legalized DAG into a
diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp
index abf36da..bfe5ea1 100644
--- a/lib/Target/Mips/MipsISelLowering.cpp
+++ b/lib/Target/Mips/MipsISelLowering.cpp
@@ -11,7 +11,6 @@
 // selection DAG.
 //
 //===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "mips-lower"
 #include "MipsISelLowering.h"
 #include "InstPrinter/MipsInstPrinter.h"
 #include "MCTargetDesc/MipsBaseInfo.h"
@@ -39,6 +38,8 @@
 
 using namespace llvm;
 
+#define DEBUG_TYPE "mips-lower"
+
 STATISTIC(NumTailCalls, "Number of tail calls");
 
 static cl::opt<bool>
@@ -50,16 +51,21 @@ NoZeroDivCheck("mno-check-zero-division", cl::Hidden,
                cl::desc("MIPS: Don't trap on integer division by zero."),
                cl::init(false));
 
-static const uint16_t O32IntRegs[4] = {
+cl::opt<bool>
+EnableMipsFastISel("mips-fast-isel", cl::Hidden,
+  cl::desc("Allow mips-fast-isel to be used"),
+  cl::init(false));
+
+static const MCPhysReg O32IntRegs[4] = {
   Mips::A0, Mips::A1, Mips::A2, Mips::A3
 };
 
-static const uint16_t Mips64IntRegs[8] = {
+static const MCPhysReg Mips64IntRegs[8] = {
   Mips::A0_64, Mips::A1_64, Mips::A2_64, Mips::A3_64,
   Mips::T0_64, Mips::T1_64, Mips::T2_64, Mips::T3_64
 };
 
-static const uint16_t Mips64DPRegs[8] = {
+static const MCPhysReg Mips64DPRegs[8] = {
   Mips::D12_64, Mips::D13_64, Mips::D14_64, Mips::D15_64,
   Mips::D16_64, Mips::D17_64, Mips::D18_64, Mips::D19_64
 };
@@ -198,7 +204,7 @@ const char *MipsTargetLowering::getTargetNodeName(unsigned Opcode) const {
   case MipsISD::PCKEV:             return "MipsISD::PCKEV";
   case MipsISD::PCKOD:             return "MipsISD::PCKOD";
   case MipsISD::INSVE:             return "MipsISD::INSVE";
-  default:                         return NULL;
+  default:                         return nullptr;
   }
 }
 
@@ -245,12 +251,7 @@ MipsTargetLowering::MipsTargetLowering(MipsTargetMachine &TM)
   setOperationAction(ISD::FCOPYSIGN,          MVT::f64,   Custom);
   setOperationAction(ISD::FP_TO_SINT,         MVT::i32,   Custom);
 
-  if (!TM.Options.NoNaNsFPMath) {
-    setOperationAction(ISD::FABS,             MVT::f32,   Custom);
-    setOperationAction(ISD::FABS,             MVT::f64,   Custom);
-  }
-
-  if (hasMips64()) {
+  if (isGP64bit()) {
     setOperationAction(ISD::GlobalAddress,      MVT::i64,   Custom);
     setOperationAction(ISD::BlockAddress,       MVT::i64,   Custom);
     setOperationAction(ISD::GlobalTLSAddress,   MVT::i64,   Custom);
@@ -262,14 +263,14 @@ MipsTargetLowering::MipsTargetLowering(MipsTargetMachine &TM)
     setOperationAction(ISD::FP_TO_SINT,         MVT::i64,   Custom);
   }
 
-  if (!hasMips64()) {
+  if (!isGP64bit()) {
     setOperationAction(ISD::SHL_PARTS,          MVT::i32,   Custom);
     setOperationAction(ISD::SRA_PARTS,          MVT::i32,   Custom);
     setOperationAction(ISD::SRL_PARTS,          MVT::i32,   Custom);
   }
 
   setOperationAction(ISD::ADD,                MVT::i32,   Custom);
-  if (hasMips64())
+  if (isGP64bit())
     setOperationAction(ISD::ADD,                MVT::i64,   Custom);
 
   setOperationAction(ISD::SDIV, MVT::i32, Expand);
@@ -334,11 +335,6 @@ MipsTargetLowering::MipsTargetLowering(MipsTargetMachine &TM)
   setOperationAction(ISD::FREM,              MVT::f32,   Expand);
   setOperationAction(ISD::FREM,              MVT::f64,   Expand);
 
-  if (!TM.Options.NoNaNsFPMath) {
-    setOperationAction(ISD::FNEG,             MVT::f32,   Expand);
-    setOperationAction(ISD::FNEG,             MVT::f64,   Expand);
-  }
-
   setOperationAction(ISD::EH_RETURN, MVT::Other, Custom);
 
   setOperationAction(ISD::VAARG,             MVT::Other, Expand);
@@ -356,22 +352,23 @@ MipsTargetLowering::MipsTargetLowering(MipsTargetMachine &TM)
 
   setInsertFencesForAtomic(true);
 
-  if (!Subtarget->hasSEInReg()) {
+  if (!Subtarget->hasMips32r2()) {
     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8,  Expand);
     setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
   }
 
-  if (!Subtarget->hasBitCount()) {
+  // MIPS16 lacks MIPS32's clz and clo instructions.
+  if (!Subtarget->hasMips32() || Subtarget->inMips16Mode())
     setOperationAction(ISD::CTLZ, MVT::i32, Expand);
+  if (!Subtarget->hasMips64())
     setOperationAction(ISD::CTLZ, MVT::i64, Expand);
-  }
 
-  if (!Subtarget->hasSwap()) {
+  if (!Subtarget->hasMips32r2())
     setOperationAction(ISD::BSWAP, MVT::i32, Expand);
+  if (!Subtarget->hasMips64r2())
     setOperationAction(ISD::BSWAP, MVT::i64, Expand);
-  }
 
-  if (hasMips64()) {
+  if (isGP64bit()) {
     setLoadExtAction(ISD::SEXTLOAD, MVT::i32, Custom);
     setLoadExtAction(ISD::ZEXTLOAD, MVT::i32, Custom);
     setLoadExtAction(ISD::EXTLOAD, MVT::i32, Custom);
@@ -387,7 +384,7 @@ MipsTargetLowering::MipsTargetLowering(MipsTargetMachine &TM)
   setTargetDAGCombine(ISD::OR);
   setTargetDAGCombine(ISD::ADD);
 
-  setMinFunctionAlignment(hasMips64() ? 3 : 2);
+  setMinFunctionAlignment(isGP64bit() ? 3 : 2);
 
   setStackPointerRegisterToSaveRestore(isN64() ? Mips::SP_64 : Mips::SP);
 
@@ -406,6 +403,15 @@ const MipsTargetLowering *MipsTargetLowering::create(MipsTargetMachine &TM) {
   return llvm::createMipsSETargetLowering(TM);
 }
 
+// Create a fast isel object.
+FastISel *
+MipsTargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
+                                  const TargetLibraryInfo *libInfo) const {
+  if (!EnableMipsFastISel)
+    return TargetLowering::createFastISel(funcInfo, libInfo);
+  return Mips::createFastISel(funcInfo, libInfo);
+}
+
 EVT MipsTargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
   if (!VT.isVector())
     return MVT::i32;
@@ -779,7 +785,6 @@ LowerOperation(SDValue Op, SelectionDAG &DAG) const
   case ISD::SETCC:              return lowerSETCC(Op, DAG);
   case ISD::VASTART:            return lowerVASTART(Op, DAG);
   case ISD::FCOPYSIGN:          return lowerFCOPYSIGN(Op, DAG);
-  case ISD::FABS:               return lowerFABS(Op, DAG);
   case ISD::FRAMEADDR:          return lowerFRAMEADDR(Op, DAG);
   case ISD::RETURNADDR:         return lowerRETURNADDR(Op, DAG);
   case ISD::EH_RETURN:          return lowerEH_RETURN(Op, DAG);
@@ -1506,7 +1511,7 @@ SDValue MipsTargetLowering::lowerGlobalAddress(SDValue Op,
       SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, 0,
                                               MipsII::MO_GPREL);
       SDValue GPRelNode = DAG.getNode(MipsISD::GPRel, DL,
-                                      DAG.getVTList(MVT::i32), &GA, 1);
+                                      DAG.getVTList(MVT::i32), GA);
       SDValue GPReg = DAG.getRegister(Mips::GP, MVT::i32);
       return DAG.getNode(ISD::ADD, DL, MVT::i32, GPReg, GPRelNode);
     }
@@ -1572,11 +1577,9 @@ lowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const
     Entry.Ty = PtrTy;
     Args.push_back(Entry);
 
-    TargetLowering::CallLoweringInfo CLI(DAG.getEntryNode(), PtrTy,
-                  false, false, false, false, 0, CallingConv::C,
-                  /*IsTailCall=*/false, /*doesNotRet=*/false,
-                  /*isReturnValueUsed=*/true,
-                  TlsGetAddr, Args, DAG, DL);
+    TargetLowering::CallLoweringInfo CLI(DAG);
+    CLI.setDebugLoc(DL).setChain(DAG.getEntryNode())
+      .setCallee(CallingConv::C, PtrTy, TlsGetAddr, &Args, 0);
     std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
 
     SDValue Ret = CallResult.first;
@@ -1765,71 +1768,12 @@ static SDValue lowerFCOPYSIGN64(SDValue Op, SelectionDAG &DAG,
 
 SDValue
 MipsTargetLowering::lowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
-  if (Subtarget->hasMips64())
+  if (Subtarget->isGP64bit())
     return lowerFCOPYSIGN64(Op, DAG, Subtarget->hasExtractInsert());
 
   return lowerFCOPYSIGN32(Op, DAG, Subtarget->hasExtractInsert());
 }
 
-static SDValue lowerFABS32(SDValue Op, SelectionDAG &DAG,
-                           bool HasExtractInsert) {
-  SDValue Res, Const1 = DAG.getConstant(1, MVT::i32);
-  SDLoc DL(Op);
-
-  // If operand is of type f64, extract the upper 32-bit. Otherwise, bitcast it
-  // to i32.
-  SDValue X = (Op.getValueType() == MVT::f32) ?
-    DAG.getNode(ISD::BITCAST, DL, MVT::i32, Op.getOperand(0)) :
-    DAG.getNode(MipsISD::ExtractElementF64, DL, MVT::i32, Op.getOperand(0),
-                Const1);
-
-  // Clear MSB.
-  if (HasExtractInsert)
-    Res = DAG.getNode(MipsISD::Ins, DL, MVT::i32,
-                      DAG.getRegister(Mips::ZERO, MVT::i32),
-                      DAG.getConstant(31, MVT::i32), Const1, X);
-  else {
-    SDValue SllX = DAG.getNode(ISD::SHL, DL, MVT::i32, X, Const1);
-    Res = DAG.getNode(ISD::SRL, DL, MVT::i32, SllX, Const1);
-  }
-
-  if (Op.getValueType() == MVT::f32)
-    return DAG.getNode(ISD::BITCAST, DL, MVT::f32, Res);
-
-  SDValue LowX = DAG.getNode(MipsISD::ExtractElementF64, DL, MVT::i32,
-                             Op.getOperand(0), DAG.getConstant(0, MVT::i32));
-  return DAG.getNode(MipsISD::BuildPairF64, DL, MVT::f64, LowX, Res);
-}
-
-static SDValue lowerFABS64(SDValue Op, SelectionDAG &DAG,
-                           bool HasExtractInsert) {
-  SDValue Res, Const1 = DAG.getConstant(1, MVT::i32);
-  SDLoc DL(Op);
-
-  // Bitcast to integer node.
-  SDValue X = DAG.getNode(ISD::BITCAST, DL, MVT::i64, Op.getOperand(0));
-
-  // Clear MSB.
-  if (HasExtractInsert)
-    Res = DAG.getNode(MipsISD::Ins, DL, MVT::i64,
-                      DAG.getRegister(Mips::ZERO_64, MVT::i64),
-                      DAG.getConstant(63, MVT::i32), Const1, X);
-  else {
-    SDValue SllX = DAG.getNode(ISD::SHL, DL, MVT::i64, X, Const1);
-    Res = DAG.getNode(ISD::SRL, DL, MVT::i64, SllX, Const1);
-  }
-
-  return DAG.getNode(ISD::BITCAST, DL, MVT::f64, Res);
-}
-
-SDValue
-MipsTargetLowering::lowerFABS(SDValue Op, SelectionDAG &DAG) const {
-  if (Subtarget->hasMips64() && (Op.getValueType() == MVT::f64))
-    return lowerFABS64(Op, DAG, Subtarget->hasExtractInsert());
-
-  return lowerFABS32(Op, DAG, Subtarget->hasExtractInsert());
-}
-
 SDValue MipsTargetLowering::
 lowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
   // check the depth
@@ -1931,7 +1875,7 @@ SDValue MipsTargetLowering::lowerShiftLeftParts(SDValue Op,
   Hi = DAG.getNode(ISD::SELECT, DL, MVT::i32, Cond, ShiftLeftLo, Or);
 
   SDValue Ops[2] = {Lo, Hi};
-  return DAG.getMergeValues(Ops, 2, DL);
+  return DAG.getMergeValues(Ops, DL);
 }
 
 SDValue MipsTargetLowering::lowerShiftRightParts(SDValue Op, SelectionDAG &DAG,
@@ -1972,7 +1916,7 @@ SDValue MipsTargetLowering::lowerShiftRightParts(SDValue Op, SelectionDAG &DAG,
                    ShiftRightHi);
 
   SDValue Ops[2] = {Lo, Hi};
-  return DAG.getMergeValues(Ops, 2, DL);
+  return DAG.getMergeValues(Ops, DL);
 }
 
 static SDValue createLoadLR(unsigned Opc, SelectionDAG &DAG, LoadSDNode *LD,
@@ -1988,7 +1932,7 @@ static SDValue createLoadLR(unsigned Opc, SelectionDAG &DAG, LoadSDNode *LD,
                       DAG.getConstant(Offset, BasePtrVT));
 
   SDValue Ops[] = { Chain, Ptr, Src };
-  return DAG.getMemIntrinsicNode(Opc, DL, VTList, Ops, 3, MemVT,
+  return DAG.getMemIntrinsicNode(Opc, DL, VTList, Ops, MemVT,
                                  LD->getMemOperand());
 }
 
@@ -1997,6 +1941,9 @@ SDValue MipsTargetLowering::lowerLOAD(SDValue Op, SelectionDAG &DAG) const {
   LoadSDNode *LD = cast<LoadSDNode>(Op);
   EVT MemVT = LD->getMemoryVT();
 
+  if (Subtarget->systemSupportsUnalignedAccess())
+    return Op;
+
   // Return if load is aligned or if MemVT is neither i32 nor i64.
   if ((LD->getAlignment() >= MemVT.getSizeInBits() / 8) ||
       ((MemVT != MVT::i32) && (MemVT != MVT::i64)))
@@ -2051,7 +1998,7 @@ SDValue MipsTargetLowering::lowerLOAD(SDValue Op, SelectionDAG &DAG) const {
   SDValue SLL = DAG.getNode(ISD::SHL, DL, MVT::i64, LWR, Const32);
   SDValue SRL = DAG.getNode(ISD::SRL, DL, MVT::i64, SLL, Const32);
   SDValue Ops[] = { SRL, LWR.getValue(1) };
-  return DAG.getMergeValues(Ops, 2, DL);
+  return DAG.getMergeValues(Ops, DL);
 }
 
 static SDValue createStoreLR(unsigned Opc, SelectionDAG &DAG, StoreSDNode *SD,
@@ -2066,7 +2013,7 @@ static SDValue createStoreLR(unsigned Opc, SelectionDAG &DAG, StoreSDNode *SD,
                       DAG.getConstant(Offset, BasePtrVT));
 
   SDValue Ops[] = { Chain, Value, Ptr };
-  return DAG.getMemIntrinsicNode(Opc, DL, VTList, Ops, 3, MemVT,
+  return DAG.getMemIntrinsicNode(Opc, DL, VTList, Ops, MemVT,
                                  SD->getMemOperand());
 }
 
@@ -2120,7 +2067,8 @@ SDValue MipsTargetLowering::lowerSTORE(SDValue Op, SelectionDAG &DAG) const {
   EVT MemVT = SD->getMemoryVT();
 
   // Lower unaligned integer stores.
-  if ((SD->getAlignment() < MemVT.getSizeInBits() / 8) &&
+  if (!Subtarget->systemSupportsUnalignedAccess() &&
+      (SD->getAlignment() < MemVT.getSizeInBits() / 8) &&
       ((MemVT == MVT::i32) || (MemVT == MVT::i64)))
     return lowerUnalignedIntStore(SD, DAG, Subtarget->isLittle());
 
@@ -2177,12 +2125,12 @@ SDValue MipsTargetLowering::lowerFP_TO_SINT(SDValue Op,
 
 static bool CC_MipsO32(unsigned ValNo, MVT ValVT, MVT LocVT,
                        CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
-                       CCState &State, const uint16_t *F64Regs) {
+                       CCState &State, const MCPhysReg *F64Regs) {
 
   static const unsigned IntRegsSize = 4, FloatRegsSize = 2;
 
-  static const uint16_t IntRegs[] = { Mips::A0, Mips::A1, Mips::A2, Mips::A3 };
-  static const uint16_t F32Regs[] = { Mips::F12, Mips::F14 };
+  static const MCPhysReg IntRegs[] = { Mips::A0, Mips::A1, Mips::A2, Mips::A3 };
+  static const MCPhysReg F32Regs[] = { Mips::F12, Mips::F14 };
 
   // Do not process byval args here.
   if (ArgFlags.isByVal())
@@ -2254,7 +2202,7 @@ static bool CC_MipsO32(unsigned ValNo, MVT ValVT, MVT LocVT,
 static bool CC_MipsO32_FP32(unsigned ValNo, MVT ValVT,
                             MVT LocVT, CCValAssign::LocInfo LocInfo,
                             ISD::ArgFlagsTy ArgFlags, CCState &State) {
-  static const uint16_t F64Regs[] = { Mips::D6, Mips::D7 };
+  static const MCPhysReg F64Regs[] = { Mips::D6, Mips::D7 };
 
   return CC_MipsO32(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State, F64Regs);
 }
@@ -2262,7 +2210,7 @@ static bool CC_MipsO32_FP32(unsigned ValNo, MVT ValVT,
 static bool CC_MipsO32_FP64(unsigned ValNo, MVT ValVT,
                             MVT LocVT, CCValAssign::LocInfo LocInfo,
                             ISD::ArgFlagsTy ArgFlags, CCState &State) {
-  static const uint16_t F64Regs[] = { Mips::D12_64, Mips::D14_64 };
+  static const MCPhysReg F64Regs[] = { Mips::D12_64, Mips::D14_64 };
 
   return CC_MipsO32(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State, F64Regs);
 }
@@ -2383,7 +2331,7 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
 
   MipsCCInfo.analyzeCallOperands(Outs, IsVarArg,
                                  Subtarget->mipsSEUsesSoftFloat(),
-                                 Callee.getNode(), CLI.Args);
+                                 Callee.getNode(), CLI.getArgs());
 
   // Get a count of how many bytes are to be pushed on the stack.
   unsigned NextStackOffset = CCInfo.getNextStackOffset();
@@ -2394,6 +2342,10 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
       isEligibleForTailCallOptimization(MipsCCInfo, NextStackOffset,
                                         *MF.getInfo<MipsFunctionInfo>());
 
+  if (!IsTailCall && CLI.CS && CLI.CS->isMustTailCall())
+    report_fatal_error("failed to perform tail call elimination on a call "
+                       "site marked musttail");
+
   if (IsTailCall)
     ++NumTailCalls;
 
@@ -2489,8 +2441,7 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   // Transform all store nodes into one single node because all store
   // nodes are independent of each other.
   if (!MemOpChains.empty())
-    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
-                        &MemOpChains[0], MemOpChains.size());
+    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
 
   // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
   // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
@@ -2544,9 +2495,9 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
               CLI, Callee, Chain);
 
   if (IsTailCall)
-    return DAG.getNode(MipsISD::TailCall, DL, MVT::Other, &Ops[0], Ops.size());
+    return DAG.getNode(MipsISD::TailCall, DL, MVT::Other, Ops);
 
-  Chain = DAG.getNode(MipsISD::JmpLink, DL, NodeTys, &Ops[0], Ops.size());
+  Chain = DAG.getNode(MipsISD::JmpLink, DL, NodeTys, Ops);
   SDValue InFlag = Chain.getValue(1);
 
   // Create the CALLSEQ_END node.
@@ -2713,18 +2664,21 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain,
     }
   }
 
-  // The mips ABIs for returning structs by value requires that we copy
-  // the sret argument into $v0 for the return. Save the argument into
-  // a virtual register so that we can access it from the return points.
-  if (DAG.getMachineFunction().getFunction()->hasStructRetAttr()) {
-    unsigned Reg = MipsFI->getSRetReturnReg();
-    if (!Reg) {
-      Reg = MF.getRegInfo().createVirtualRegister(
-          getRegClassFor(isN64() ? MVT::i64 : MVT::i32));
-      MipsFI->setSRetReturnReg(Reg);
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    // The mips ABIs for returning structs by value requires that we copy
+    // the sret argument into $v0 for the return. Save the argument into
+    // a virtual register so that we can access it from the return points.
+    if (Ins[i].Flags.isSRet()) {
+      unsigned Reg = MipsFI->getSRetReturnReg();
+      if (!Reg) {
+        Reg = MF.getRegInfo().createVirtualRegister(
+            getRegClassFor(isN64() ? MVT::i64 : MVT::i32));
+        MipsFI->setSRetReturnReg(Reg);
+      }
+      SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, Reg, InVals[i]);
+      Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Copy, Chain);
+      break;
     }
-    SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), DL, Reg, InVals[0]);
-    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Copy, Chain);
   }
 
   if (IsVarArg)
@@ -2734,8 +2688,7 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain,
   // the size of Ins and InVals. This only happens when on varg functions
   if (!OutChains.empty()) {
     OutChains.push_back(Chain);
-    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
-                        &OutChains[0], OutChains.size());
+    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, OutChains);
   }
 
   return Chain;
@@ -2820,7 +2773,7 @@ MipsTargetLowering::LowerReturn(SDValue Chain,
     RetOps.push_back(Flag);
 
   // Return on Mips is always a "jr $ra"
-  return DAG.getNode(MipsISD::Ret, DL, MVT::Other, &RetOps[0], RetOps.size());
+  return DAG.getNode(MipsISD::Ret, DL, MVT::Other, RetOps);
 }
 
 //===----------------------------------------------------------------------===//
@@ -2870,7 +2823,7 @@ MipsTargetLowering::getSingleConstraintMatchWeight(
   Value *CallOperandVal = info.CallOperandVal;
     // If we don't have a value, we can't do a match,
     // but allow it at the lowest weight.
-  if (CallOperandVal == NULL)
+  if (!CallOperandVal)
     return CW_Default;
   Type *type = CallOperandVal->getType();
   // Look at the constraint type.
@@ -2948,12 +2901,12 @@ parseRegForInlineAsmConstraint(const StringRef &C, MVT VT) const {
   std::pair<bool, bool> R = parsePhysicalReg(C, Prefix, Reg);
 
   if (!R.first)
-    return std::make_pair((unsigned)0, (const TargetRegisterClass*)0);
+    return std::make_pair(0U, nullptr);
 
   if ((Prefix == "hi" || Prefix == "lo")) { // Parse hi/lo.
     // No numeric characters follow "hi" or "lo".
     if (R.second)
-      return std::make_pair((unsigned)0, (const TargetRegisterClass*)0);
+      return std::make_pair(0U, nullptr);
 
     RC = TRI->getRegClass(Prefix == "hi" ?
                           Mips::HI32RegClassID : Mips::LO32RegClassID);
@@ -2963,7 +2916,7 @@ parseRegForInlineAsmConstraint(const StringRef &C, MVT VT) const {
 
     // No numeric characters follow the name.
     if (R.second)
-      return std::make_pair((unsigned)0, (const TargetRegisterClass *)0);
+      return std::make_pair(0U, nullptr);
 
     Reg = StringSwitch<unsigned long long>(Prefix)
               .Case("$msair", Mips::MSAIR)
@@ -2977,14 +2930,14 @@ parseRegForInlineAsmConstraint(const StringRef &C, MVT VT) const {
               .Default(0);
 
     if (!Reg)
-      return std::make_pair((unsigned)0, (const TargetRegisterClass *)0);
+      return std::make_pair(0U, nullptr);
 
     RC = TRI->getRegClass(Mips::MSACtrlRegClassID);
     return std::make_pair(Reg, RC);
   }
 
   if (!R.second)
-    return std::make_pair((unsigned)0, (const TargetRegisterClass*)0);
+    return std::make_pair(0U, nullptr);
 
   if (Prefix == "$f") { // Parse $f0-$f31.
     // If the size of FP registers is 64-bit or Reg is an even number, select
@@ -3032,7 +2985,7 @@ getRegForInlineAsmConstraint(const std::string &Constraint, MVT VT) const
       if (VT == MVT::i64 && isGP64bit())
         return std::make_pair(0U, &Mips::GPR64RegClass);
       // This will generate an error message
-      return std::make_pair(0u, static_cast<const TargetRegisterClass*>(0));
+      return std::make_pair(0U, nullptr);
     case 'f': // FPU or MSA register
       if (VT == MVT::v16i8)
         return std::make_pair(0U, &Mips::MSA128BRegClass);
@@ -3062,7 +3015,7 @@ getRegForInlineAsmConstraint(const std::string &Constraint, MVT VT) const
     case 'x': // register suitable for indirect jump
       // Fixme: Not triggering the use of both hi and low
       // This will generate an error message
-      return std::make_pair(0u, static_cast<const TargetRegisterClass*>(0));
+      return std::make_pair(0U, nullptr);
     }
   }
 
@@ -3081,7 +3034,7 @@ void MipsTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
                                                      std::string &Constraint,
                                                      std::vector<SDValue>&Ops,
                                                      SelectionDAG &DAG) const {
-  SDValue Result(0, 0);
+  SDValue Result;
 
   // Only support length 1 constraints for now.
   if (Constraint.length() > 1) return;
@@ -3265,7 +3218,7 @@ static bool originalTypeIsF128(const Type *Ty, const SDNode *CallNode) {
 MipsTargetLowering::MipsCC::SpecialCallingConvType
   MipsTargetLowering::getSpecialCallingConv(SDValue Callee) const {
   MipsCC::SpecialCallingConvType SpecialCallingConv =
-    MipsCC::NoSpecialCallingConv;;
+    MipsCC::NoSpecialCallingConv;
   if (Subtarget->inMips16HardFloat()) {
     if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
       llvm::StringRef Sym = G->getGlobal()->getName();
@@ -3321,7 +3274,7 @@ analyzeCallOperands(const SmallVectorImpl<ISD::OutputArg> &Args,
       dbgs() << "Call operand #" << I << " has unhandled type "
              << EVT(ArgVT).getEVTString();
 #endif
-      llvm_unreachable(0);
+      llvm_unreachable(nullptr);
     }
   }
 }
@@ -3344,7 +3297,7 @@ analyzeFormalArguments(const SmallVectorImpl<ISD::InputArg> &Args,
       continue;
     }
 
-    MVT RegVT = getRegVT(ArgVT, FuncArg->getType(), 0, IsSoftFloat);
+    MVT RegVT = getRegVT(ArgVT, FuncArg->getType(), nullptr, IsSoftFloat);
 
     if (!FixedFn(I, ArgVT, RegVT, CCValAssign::Full, ArgFlags, CCInfo))
       continue;
@@ -3353,7 +3306,7 @@ analyzeFormalArguments(const SmallVectorImpl<ISD::InputArg> &Args,
     dbgs() << "Formal Arg #" << I << " has unhandled type "
            << EVT(ArgVT).getEVTString();
 #endif
-    llvm_unreachable(0);
+    llvm_unreachable(nullptr);
   }
 }
 
@@ -3378,7 +3331,7 @@ analyzeReturn(const SmallVectorImpl<Ty> &RetVals, bool IsSoftFloat,
       dbgs() << "Call result #" << I << " has unhandled type "
              << EVT(VT).getEVTString() << '\n';
 #endif
-      llvm_unreachable(0);
+      llvm_unreachable(nullptr);
     }
   }
 }
@@ -3392,7 +3345,7 @@ analyzeCallResult(const SmallVectorImpl<ISD::InputArg> &Ins, bool IsSoftFloat,
 void MipsTargetLowering::MipsCC::
 analyzeReturn(const SmallVectorImpl<ISD::OutputArg> &Outs, bool IsSoftFloat,
               const Type *RetTy) const {
-  analyzeReturn(Outs, IsSoftFloat, 0, RetTy);
+  analyzeReturn(Outs, IsSoftFloat, nullptr, RetTy);
 }
 
 void MipsTargetLowering::MipsCC::handleByValArg(unsigned ValNo, MVT ValVT,
@@ -3426,7 +3379,7 @@ unsigned MipsTargetLowering::MipsCC::reservedArgArea() const {
   return (IsO32 && (CallConv != CallingConv::Fast)) ? 16 : 0;
 }
 
-const uint16_t *MipsTargetLowering::MipsCC::intArgRegs() const {
+const MCPhysReg *MipsTargetLowering::MipsCC::intArgRegs() const {
   return IsO32 ? O32IntRegs : Mips64IntRegs;
 }
 
@@ -3443,7 +3396,7 @@ llvm::CCAssignFn *MipsTargetLowering::MipsCC::varArgFn() const {
   return IsO32 ? (IsFP64 ? CC_MipsO32_FP64 : CC_MipsO32_FP32) : CC_MipsN_VarArg;
 }
 
-const uint16_t *MipsTargetLowering::MipsCC::shadowRegs() const {
+const MCPhysReg *MipsTargetLowering::MipsCC::shadowRegs() const {
   return IsO32 ? O32IntRegs : Mips64DPRegs;
 }
 
@@ -3451,7 +3404,7 @@ void MipsTargetLowering::MipsCC::allocateRegs(ByValArgInfo &ByVal,
                                               unsigned ByValSize,
                                               unsigned Align) {
   unsigned RegSize = regSize(), NumIntArgRegs = numIntArgRegs();
-  const uint16_t *IntArgRegs = intArgRegs(), *ShadowRegs = shadowRegs();
+  const MCPhysReg *IntArgRegs = intArgRegs(), *ShadowRegs = shadowRegs();
   assert(!(ByValSize % RegSize) && !(Align % RegSize) &&
          "Byval argument's size and alignment should be a multiple of"
          "RegSize.");
@@ -3536,21 +3489,22 @@ passByValArg(SDValue Chain, SDLoc DL,
              MachineFrameInfo *MFI, SelectionDAG &DAG, SDValue Arg,
              const MipsCC &CC, const ByValArgInfo &ByVal,
              const ISD::ArgFlagsTy &Flags, bool isLittle) const {
-  unsigned ByValSize = Flags.getByValSize();
-  unsigned Offset = 0; // Offset in # of bytes from the beginning of struct.
-  unsigned RegSize = CC.regSize();
-  unsigned Alignment = std::min(Flags.getByValAlign(), RegSize);
-  EVT PtrTy = getPointerTy(), RegTy = MVT::getIntegerVT(RegSize * 8);
+  unsigned ByValSizeInBytes = Flags.getByValSize();
+  unsigned OffsetInBytes = 0; // From beginning of struct
+  unsigned RegSizeInBytes = CC.regSize();
+  unsigned Alignment = std::min(Flags.getByValAlign(), RegSizeInBytes);
+  EVT PtrTy = getPointerTy(), RegTy = MVT::getIntegerVT(RegSizeInBytes * 8);
 
   if (ByVal.NumRegs) {
-    const uint16_t *ArgRegs = CC.intArgRegs();
-    bool LeftoverBytes = (ByVal.NumRegs * RegSize > ByValSize);
+    const MCPhysReg *ArgRegs = CC.intArgRegs();
+    bool LeftoverBytes = (ByVal.NumRegs * RegSizeInBytes > ByValSizeInBytes);
     unsigned I = 0;
 
     // Copy words to registers.
-    for (; I < ByVal.NumRegs - LeftoverBytes; ++I, Offset += RegSize) {
+    for (; I < ByVal.NumRegs - LeftoverBytes;
+         ++I, OffsetInBytes += RegSizeInBytes) {
       SDValue LoadPtr = DAG.getNode(ISD::ADD, DL, PtrTy, Arg,
-                                    DAG.getConstant(Offset, PtrTy));
+                                    DAG.getConstant(OffsetInBytes, PtrTy));
       SDValue LoadVal = DAG.getLoad(RegTy, DL, Chain, LoadPtr,
                                     MachinePointerInfo(), false, false, false,
                                     Alignment);
@@ -3560,38 +3514,38 @@ passByValArg(SDValue Chain, SDLoc DL,
     }
 
     // Return if the struct has been fully copied.
-    if (ByValSize == Offset)
+    if (ByValSizeInBytes == OffsetInBytes)
       return;
 
     // Copy the remainder of the byval argument with sub-word loads and shifts.
     if (LeftoverBytes) {
-      assert((ByValSize > Offset) && (ByValSize < Offset + RegSize) &&
-             "Size of the remainder should be smaller than RegSize.");
+      assert((ByValSizeInBytes > OffsetInBytes) &&
+             (ByValSizeInBytes < OffsetInBytes + RegSizeInBytes) &&
+             "Size of the remainder should be smaller than RegSizeInBytes.");
       SDValue Val;
 
-      for (unsigned LoadSize = RegSize / 2, TotalSizeLoaded = 0;
-           Offset < ByValSize; LoadSize /= 2) {
-        unsigned RemSize = ByValSize - Offset;
+      for (unsigned LoadSizeInBytes = RegSizeInBytes / 2, TotalBytesLoaded = 0;
+           OffsetInBytes < ByValSizeInBytes; LoadSizeInBytes /= 2) {
+        unsigned RemainingSizeInBytes = ByValSizeInBytes - OffsetInBytes;
 
-        if (RemSize < LoadSize)
+        if (RemainingSizeInBytes < LoadSizeInBytes)
           continue;
 
         // Load subword.
         SDValue LoadPtr = DAG.getNode(ISD::ADD, DL, PtrTy, Arg,
-                                      DAG.getConstant(Offset, PtrTy));
-        SDValue LoadVal =
-          DAG.getExtLoad(ISD::ZEXTLOAD, DL, RegTy, Chain, LoadPtr,
-                         MachinePointerInfo(), MVT::getIntegerVT(LoadSize * 8),
-                         false, false, Alignment);
+                                      DAG.getConstant(OffsetInBytes, PtrTy));
+        SDValue LoadVal = DAG.getExtLoad(
+            ISD::ZEXTLOAD, DL, RegTy, Chain, LoadPtr, MachinePointerInfo(),
+            MVT::getIntegerVT(LoadSizeInBytes * 8), false, false, Alignment);
         MemOpChains.push_back(LoadVal.getValue(1));
 
         // Shift the loaded value.
         unsigned Shamt;
 
         if (isLittle)
-          Shamt = TotalSizeLoaded;
+          Shamt = TotalBytesLoaded * 8;
         else
-          Shamt = (RegSize - (TotalSizeLoaded + LoadSize)) * 8;
+          Shamt = (RegSizeInBytes - (TotalBytesLoaded + LoadSizeInBytes)) * 8;
 
         SDValue Shift = DAG.getNode(ISD::SHL, DL, RegTy, LoadVal,
                                     DAG.getConstant(Shamt, MVT::i32));
@@ -3601,9 +3555,9 @@ passByValArg(SDValue Chain, SDLoc DL,
         else
           Val = Shift;
 
-        Offset += LoadSize;
-        TotalSizeLoaded += LoadSize;
-        Alignment = std::min(Alignment, LoadSize);
+        OffsetInBytes += LoadSizeInBytes;
+        TotalBytesLoaded += LoadSizeInBytes;
+        Alignment = std::min(Alignment, LoadSizeInBytes);
       }
 
       unsigned ArgReg = ArgRegs[ByVal.FirstIdx + I];
@@ -3613,14 +3567,14 @@ passByValArg(SDValue Chain, SDLoc DL,
   }
 
   // Copy remainder of byval arg to it with memcpy.
-  unsigned MemCpySize = ByValSize - Offset;
+  unsigned MemCpySize = ByValSizeInBytes - OffsetInBytes;
   SDValue Src = DAG.getNode(ISD::ADD, DL, PtrTy, Arg,
-                            DAG.getConstant(Offset, PtrTy));
+                            DAG.getConstant(OffsetInBytes, PtrTy));
   SDValue Dst = DAG.getNode(ISD::ADD, DL, PtrTy, StackPtr,
                             DAG.getIntPtrConstant(ByVal.Address));
   Chain = DAG.getMemcpy(Chain, DL, Dst, Src, DAG.getConstant(MemCpySize, PtrTy),
                         Alignment, /*isVolatile=*/false, /*AlwaysInline=*/false,
-                        MachinePointerInfo(0), MachinePointerInfo(0));
+                        MachinePointerInfo(), MachinePointerInfo());
   MemOpChains.push_back(Chain);
 }
 
@@ -3628,7 +3582,7 @@ void MipsTargetLowering::writeVarArgRegs(std::vector<SDValue> &OutChains,
                                          const MipsCC &CC, SDValue Chain,
                                          SDLoc DL, SelectionDAG &DAG) const {
   unsigned NumRegs = CC.numIntArgRegs();
-  const uint16_t *ArgRegs = CC.intArgRegs();
+  const MCPhysReg *ArgRegs = CC.intArgRegs();
   const CCState &CCInfo = CC.getCCInfo();
   unsigned Idx = CCInfo.getFirstUnallocated(ArgRegs, NumRegs);
   unsigned RegSize = CC.regSize();
@@ -3662,7 +3616,7 @@ void MipsTargetLowering::writeVarArgRegs(std::vector<SDValue> &OutChains,
     SDValue PtrOff = DAG.getFrameIndex(FI, getPointerTy());
     SDValue Store = DAG.getStore(Chain, DL, ArgValue, PtrOff,
                                  MachinePointerInfo(), false, false, 0);
-    cast<StoreSDNode>(Store.getNode())->getMemOperand()->setValue(0);
+    cast<StoreSDNode>(Store.getNode())->getMemOperand()->setValue((Value*)nullptr);
     OutChains.push_back(Store);
   }
 }
diff --git a/lib/Target/Mips/MipsISelLowering.h b/lib/Target/Mips/MipsISelLowering.h
index 35dd396..4ac33bf 100644
--- a/lib/Target/Mips/MipsISelLowering.h
+++ b/lib/Target/Mips/MipsISelLowering.h
@@ -218,32 +218,38 @@ namespace llvm {
 
     static const MipsTargetLowering *create(MipsTargetMachine &TM);
 
-    virtual MVT getScalarShiftAmountTy(EVT LHSTy) const { return MVT::i32; }
+    /// createFastISel - This method returns a target specific FastISel object,
+    /// or null if the target does not support "fast" ISel.
+    FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
+                             const TargetLibraryInfo *libInfo) const override;
 
-    virtual void LowerOperationWrapper(SDNode *N,
-                                       SmallVectorImpl<SDValue> &Results,
-                                       SelectionDAG &DAG) const;
+    MVT getScalarShiftAmountTy(EVT LHSTy) const override { return MVT::i32; }
+
+    void LowerOperationWrapper(SDNode *N,
+                               SmallVectorImpl<SDValue> &Results,
+                               SelectionDAG &DAG) const override;
 
     /// LowerOperation - Provide custom lowering hooks for some operations.
-    virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
 
     /// ReplaceNodeResults - Replace the results of node with an illegal result
     /// type with new values built out of custom code.
     ///
-    virtual void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue>&Results,
-                                    SelectionDAG &DAG) const;
+    void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue>&Results,
+                            SelectionDAG &DAG) const override;
 
     /// getTargetNodeName - This method returns the name of a target specific
     //  DAG node.
-    virtual const char *getTargetNodeName(unsigned Opcode) const;
+    const char *getTargetNodeName(unsigned Opcode) const override;
 
     /// getSetCCResultType - get the ISD::SETCC result ValueType
-    EVT getSetCCResultType(LLVMContext &Context, EVT VT) const;
+    EVT getSetCCResultType(LLVMContext &Context, EVT VT) const override;
 
-    virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+    SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
 
-    virtual MachineBasicBlock *
-    EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *MBB) const;
+    MachineBasicBlock *
+    EmitInstrWithCustomInserter(MachineInstr *MI,
+                                MachineBasicBlock *MBB) const override;
 
     struct LTStr {
       bool operator()(const char *S1, const char *S2) const {
@@ -382,7 +388,7 @@ namespace llvm {
       unsigned reservedArgArea() const;
 
       /// Return pointer to array of integer argument registers.
-      const uint16_t *intArgRegs() const;
+      const MCPhysReg *intArgRegs() const;
 
       typedef SmallVectorImpl<ByValArgInfo>::const_iterator byval_iterator;
       byval_iterator byval_begin() const { return ByValArgs.begin(); }
@@ -403,7 +409,7 @@ namespace llvm {
       /// Return the function that analyzes variable argument list functions.
       llvm::CCAssignFn *varArgFn() const;
 
-      const uint16_t *shadowRegs() const;
+      const MCPhysReg *shadowRegs() const;
 
       void allocateRegs(ByValArgInfo &ByVal, unsigned ByValSize,
                         unsigned Align);
@@ -523,41 +529,39 @@ namespace llvm {
     void writeVarArgRegs(std::vector<SDValue> &OutChains, const MipsCC &CC,
                          SDValue Chain, SDLoc DL, SelectionDAG &DAG) const;
 
-    virtual SDValue
+    SDValue
       LowerFormalArguments(SDValue Chain,
                            CallingConv::ID CallConv, bool isVarArg,
                            const SmallVectorImpl<ISD::InputArg> &Ins,
                            SDLoc dl, SelectionDAG &DAG,
-                           SmallVectorImpl<SDValue> &InVals) const;
+                           SmallVectorImpl<SDValue> &InVals) const override;
 
     SDValue passArgOnStack(SDValue StackPtr, unsigned Offset, SDValue Chain,
                            SDValue Arg, SDLoc DL, bool IsTailCall,
                            SelectionDAG &DAG) const;
 
-    virtual SDValue
-      LowerCall(TargetLowering::CallLoweringInfo &CLI,
-                SmallVectorImpl<SDValue> &InVals) const;
+    SDValue LowerCall(TargetLowering::CallLoweringInfo &CLI,
+                      SmallVectorImpl<SDValue> &InVals) const override;
 
-    virtual bool
-      CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
-                     bool isVarArg,
-                     const SmallVectorImpl<ISD::OutputArg> &Outs,
-                     LLVMContext &Context) const;
-
-    virtual SDValue
-      LowerReturn(SDValue Chain,
-                  CallingConv::ID CallConv, bool isVarArg,
-                  const SmallVectorImpl<ISD::OutputArg> &Outs,
-                  const SmallVectorImpl<SDValue> &OutVals,
-                  SDLoc dl, SelectionDAG &DAG) const;
+    bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
+                        bool isVarArg,
+                        const SmallVectorImpl<ISD::OutputArg> &Outs,
+                        LLVMContext &Context) const override;
+
+    SDValue LowerReturn(SDValue Chain,
+                        CallingConv::ID CallConv, bool isVarArg,
+                        const SmallVectorImpl<ISD::OutputArg> &Outs,
+                        const SmallVectorImpl<SDValue> &OutVals,
+                        SDLoc dl, SelectionDAG &DAG) const override;
 
     // Inline asm support
-    ConstraintType getConstraintType(const std::string &Constraint) const;
+    ConstraintType
+      getConstraintType(const std::string &Constraint) const override;
 
     /// Examine constraint string and operand type and determine a weight value.
     /// The operand object must already have been set up with the operand type.
     ConstraintWeight getSingleConstraintMatchWeight(
-      AsmOperandInfo &info, const char *constraint) const;
+      AsmOperandInfo &info, const char *constraint) const override;
 
     /// This function parses registers that appear in inline-asm constraints.
     /// It returns pair (0, 0) on failure.
@@ -566,33 +570,33 @@ namespace llvm {
 
     std::pair<unsigned, const TargetRegisterClass*>
               getRegForInlineAsmConstraint(const std::string &Constraint,
-                                           MVT VT) const;
+                                           MVT VT) const override;
 
     /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
     /// vector.  If it is invalid, don't add anything to Ops. If hasMemory is
     /// true it means one of the asm constraint of the inline asm instruction
     /// being processed is 'm'.
-    virtual void LowerAsmOperandForConstraint(SDValue Op,
-                                              std::string &Constraint,
-                                              std::vector<SDValue> &Ops,
-                                              SelectionDAG &DAG) const;
+    void LowerAsmOperandForConstraint(SDValue Op,
+                                      std::string &Constraint,
+                                      std::vector<SDValue> &Ops,
+                                      SelectionDAG &DAG) const override;
 
-    virtual bool isLegalAddressingMode(const AddrMode &AM, Type *Ty) const;
+    bool isLegalAddressingMode(const AddrMode &AM, Type *Ty) const override;
 
-    virtual bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const;
+    bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override;
 
-    virtual EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
-                                    unsigned SrcAlign,
-                                    bool IsMemset, bool ZeroMemset,
-                                    bool MemcpyStrSrc,
-                                    MachineFunction &MF) const;
+    EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
+                            unsigned SrcAlign,
+                            bool IsMemset, bool ZeroMemset,
+                            bool MemcpyStrSrc,
+                            MachineFunction &MF) const override;
 
     /// isFPImmLegal - Returns true if the target can instruction select the
     /// specified FP immediate natively. If false, the legalizer will
     /// materialize the FP immediate as a load from a constant pool.
-    virtual bool isFPImmLegal(const APFloat &Imm, EVT VT) const;
+    bool isFPImmLegal(const APFloat &Imm, EVT VT) const override;
 
-    virtual unsigned getJumpTableEncoding() const;
+    unsigned getJumpTableEncoding() const override;
 
     MachineBasicBlock *emitAtomicBinary(MachineInstr *MI, MachineBasicBlock *BB,
                     unsigned Size, unsigned BinOpcode, bool Nand = false) const;
@@ -608,6 +612,11 @@ namespace llvm {
   /// Create MipsTargetLowering objects.
   const MipsTargetLowering *createMips16TargetLowering(MipsTargetMachine &TM);
   const MipsTargetLowering *createMipsSETargetLowering(MipsTargetMachine &TM);
+
+  namespace Mips {
+    FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
+                             const TargetLibraryInfo *libInfo);
+  }
 }
 
 #endif // MipsISELLOWERING_H
diff --git a/lib/Target/Mips/MipsInstrFPU.td b/lib/Target/Mips/MipsInstrFPU.td
index 4b5a73e..32cda3b 100644
--- a/lib/Target/Mips/MipsInstrFPU.td
+++ b/lib/Target/Mips/MipsInstrFPU.td
@@ -66,6 +66,16 @@ def IsSingleFloat    : Predicate<"Subtarget.isSingleFloat()">,
 def IsNotSingleFloat : Predicate<"!Subtarget.isSingleFloat()">,
                        AssemblerPredicate<"!FeatureSingleFloat">;
 
+//===----------------------------------------------------------------------===//
+// Mips FGR size adjectives.
+// They are mutually exclusive.
+//===----------------------------------------------------------------------===//
+
+class FGR_32 { list<Predicate> FGRPredicates = [NotFP64bit]; }
+class FGR_64 { list<Predicate> FGRPredicates = [IsFP64bit]; }
+
+//===----------------------------------------------------------------------===//
+
 // FP immediate patterns.
 def fpimm0 : PatLeaf<(fpimm), [{
   return N->isExactlyValue(+0.0);
@@ -100,10 +110,10 @@ class ADDS_FT<string opstr, RegisterOperand RC, InstrItinClass Itin, bit IsComm,
 multiclass ADDS_M<string opstr, InstrItinClass Itin, bit IsComm,
                   SDPatternOperator OpNode = null_frag> {
   def _D32 : MMRel, ADDS_FT<opstr, AFGR64Opnd, Itin, IsComm, OpNode>,
-             Requires<[NotFP64bit, HasStdEnc]>;
+             AdditionalRequires<[NotFP64bit]>;
   def _D64 : ADDS_FT<opstr, FGR64Opnd, Itin,
                      IsComm, OpNode>,
-             Requires<[IsFP64bit, HasStdEnc]> {
+             AdditionalRequires<[IsFP64bit]> {
     string DecoderNamespace = "Mips64";
   }
 }
@@ -117,18 +127,18 @@ class ABSS_FT<string opstr, RegisterOperand DstRC, RegisterOperand SrcRC,
 multiclass ABSS_M<string opstr, InstrItinClass Itin,
                   SDPatternOperator OpNode= null_frag> {
   def _D32 : MMRel, ABSS_FT<opstr, AFGR64Opnd, AFGR64Opnd, Itin, OpNode>,
-             Requires<[NotFP64bit, HasStdEnc]>;
+             AdditionalRequires<[NotFP64bit]>;
   def _D64 : ABSS_FT<opstr, FGR64Opnd, FGR64Opnd, Itin, OpNode>,
-             Requires<[IsFP64bit, HasStdEnc]> {
+             AdditionalRequires<[IsFP64bit]> {
     string DecoderNamespace = "Mips64";
   }
 }
 
 multiclass ROUND_M<string opstr, InstrItinClass Itin> {
   def _D32 : MMRel, ABSS_FT<opstr, FGR32Opnd, AFGR64Opnd, Itin>,
-             Requires<[NotFP64bit, HasStdEnc]>;
+             AdditionalRequires<[NotFP64bit]>;
   def _D64 : ABSS_FT<opstr, FGR32Opnd, FGR64Opnd, Itin>,
-             Requires<[IsFP64bit, HasStdEnc]> {
+             AdditionalRequires<[IsFP64bit]> {
     let DecoderNamespace = "Mips64";
   }
 }
@@ -241,77 +251,75 @@ multiclass C_COND_M<string TypeStr, RegisterOperand RC, bits<5> fmt,
 
 defm S : C_COND_M<"s", FGR32Opnd, 16, II_C_CC_S>;
 defm D32 : C_COND_M<"d", AFGR64Opnd, 17, II_C_CC_D>,
-           Requires<[NotFP64bit, HasStdEnc]>;
+           AdditionalRequires<[NotFP64bit]>;
 let DecoderNamespace = "Mips64" in
 defm D64 : C_COND_M<"d", FGR64Opnd, 17, II_C_CC_D>,
-           Requires<[IsFP64bit, HasStdEnc]>;
+           AdditionalRequires<[IsFP64bit]>;
 
 //===----------------------------------------------------------------------===//
 // Floating Point Instructions
 //===----------------------------------------------------------------------===//
 def ROUND_W_S  : MMRel, ABSS_FT<"round.w.s", FGR32Opnd, FGR32Opnd, II_ROUND>,
-                 ABSS_FM<0xc, 16>;
+                 ABSS_FM<0xc, 16>, ISA_MIPS2;
 def TRUNC_W_S  : MMRel, ABSS_FT<"trunc.w.s", FGR32Opnd, FGR32Opnd, II_TRUNC>,
-                 ABSS_FM<0xd, 16>;
+                 ABSS_FM<0xd, 16>, ISA_MIPS2;
 def CEIL_W_S   : MMRel, ABSS_FT<"ceil.w.s", FGR32Opnd, FGR32Opnd, II_CEIL>,
-                 ABSS_FM<0xe, 16>;
+                 ABSS_FM<0xe, 16>, ISA_MIPS2;
 def FLOOR_W_S  : MMRel, ABSS_FT<"floor.w.s", FGR32Opnd, FGR32Opnd, II_FLOOR>,
-                 ABSS_FM<0xf, 16>;
+                 ABSS_FM<0xf, 16>, ISA_MIPS2;
 def CVT_W_S    : MMRel, ABSS_FT<"cvt.w.s", FGR32Opnd, FGR32Opnd, II_CVT>,
                  ABSS_FM<0x24, 16>;
 
-defm ROUND_W : ROUND_M<"round.w.d", II_ROUND>, ABSS_FM<0xc, 17>;
-defm TRUNC_W : ROUND_M<"trunc.w.d", II_TRUNC>, ABSS_FM<0xd, 17>;
-defm CEIL_W  : ROUND_M<"ceil.w.d", II_CEIL>, ABSS_FM<0xe, 17>;
-defm FLOOR_W : ROUND_M<"floor.w.d", II_FLOOR>, ABSS_FM<0xf, 17>;
+defm ROUND_W : ROUND_M<"round.w.d", II_ROUND>, ABSS_FM<0xc, 17>, ISA_MIPS2;
+defm TRUNC_W : ROUND_M<"trunc.w.d", II_TRUNC>, ABSS_FM<0xd, 17>, ISA_MIPS2;
+defm CEIL_W  : ROUND_M<"ceil.w.d", II_CEIL>, ABSS_FM<0xe, 17>, ISA_MIPS2;
+defm FLOOR_W : ROUND_M<"floor.w.d", II_FLOOR>, ABSS_FM<0xf, 17>, ISA_MIPS2;
 defm CVT_W   : ROUND_M<"cvt.w.d", II_CVT>, ABSS_FM<0x24, 17>;
 
-let Predicates = [IsFP64bit, HasStdEnc], DecoderNamespace = "Mips64" in {
+let DecoderNamespace = "Mips64" in {
   def ROUND_L_S : ABSS_FT<"round.l.s", FGR64Opnd, FGR32Opnd, II_ROUND>,
-                  ABSS_FM<0x8, 16>;
+                  ABSS_FM<0x8, 16>, FGR_64;
   def ROUND_L_D64 : ABSS_FT<"round.l.d", FGR64Opnd, FGR64Opnd, II_ROUND>,
-                    ABSS_FM<0x8, 17>;
+                    ABSS_FM<0x8, 17>, FGR_64;
   def TRUNC_L_S : ABSS_FT<"trunc.l.s", FGR64Opnd, FGR32Opnd, II_TRUNC>,
-                  ABSS_FM<0x9, 16>;
+                  ABSS_FM<0x9, 16>, FGR_64;
   def TRUNC_L_D64 : ABSS_FT<"trunc.l.d", FGR64Opnd, FGR64Opnd, II_TRUNC>,
-                    ABSS_FM<0x9, 17>;
+                    ABSS_FM<0x9, 17>, FGR_64;
   def CEIL_L_S  : ABSS_FT<"ceil.l.s", FGR64Opnd, FGR32Opnd, II_CEIL>,
-                  ABSS_FM<0xa, 16>;
+                  ABSS_FM<0xa, 16>, FGR_64;
   def CEIL_L_D64 : ABSS_FT<"ceil.l.d", FGR64Opnd, FGR64Opnd, II_CEIL>,
-                   ABSS_FM<0xa, 17>;
+                   ABSS_FM<0xa, 17>, FGR_64;
   def FLOOR_L_S : ABSS_FT<"floor.l.s", FGR64Opnd, FGR32Opnd, II_FLOOR>,
-                  ABSS_FM<0xb, 16>;
+                  ABSS_FM<0xb, 16>, FGR_64;
   def FLOOR_L_D64 : ABSS_FT<"floor.l.d", FGR64Opnd, FGR64Opnd, II_FLOOR>,
-                    ABSS_FM<0xb, 17>;
+                    ABSS_FM<0xb, 17>, FGR_64;
 }
 
 def CVT_S_W : MMRel, ABSS_FT<"cvt.s.w", FGR32Opnd, FGR32Opnd, II_CVT>,
               ABSS_FM<0x20, 20>;
 def CVT_L_S : MMRel, ABSS_FT<"cvt.l.s", FGR64Opnd, FGR32Opnd, II_CVT>,
-              ABSS_FM<0x25, 16>;
+              ABSS_FM<0x25, 16>, INSN_MIPS3_32R2;
 def CVT_L_D64: MMRel, ABSS_FT<"cvt.l.d", FGR64Opnd, FGR64Opnd, II_CVT>,
-               ABSS_FM<0x25, 17>;
-
-let Predicates = [NotFP64bit, HasStdEnc] in {
-  def CVT_S_D32 : MMRel, ABSS_FT<"cvt.s.d", FGR32Opnd, AFGR64Opnd, II_CVT>,
-                  ABSS_FM<0x20, 17>;
-  def CVT_D32_W : MMRel, ABSS_FT<"cvt.d.w", AFGR64Opnd, FGR32Opnd, II_CVT>,
-                  ABSS_FM<0x21, 20>;
-  def CVT_D32_S : MMRel, ABSS_FT<"cvt.d.s", AFGR64Opnd, FGR32Opnd, II_CVT>,
-                  ABSS_FM<0x21, 16>;
-}
+               ABSS_FM<0x25, 17>, INSN_MIPS3_32R2;
+
+def CVT_S_D32 : MMRel, ABSS_FT<"cvt.s.d", FGR32Opnd, AFGR64Opnd, II_CVT>,
+                ABSS_FM<0x20, 17>, FGR_32;
+def CVT_D32_W : MMRel, ABSS_FT<"cvt.d.w", AFGR64Opnd, FGR32Opnd, II_CVT>,
+                ABSS_FM<0x21, 20>, FGR_32;
+def CVT_D32_S : MMRel, ABSS_FT<"cvt.d.s", AFGR64Opnd, FGR32Opnd, II_CVT>,
+                ABSS_FM<0x21, 16>, FGR_32;
 
-let Predicates = [IsFP64bit, HasStdEnc], DecoderNamespace = "Mips64" in {
+let DecoderNamespace = "Mips64" in {
   def CVT_S_D64 : ABSS_FT<"cvt.s.d", FGR32Opnd, FGR64Opnd, II_CVT>,
-                  ABSS_FM<0x20, 17>;
+                  ABSS_FM<0x20, 17>, FGR_64;
   def CVT_S_L   : ABSS_FT<"cvt.s.l", FGR32Opnd, FGR64Opnd, II_CVT>,
-                  ABSS_FM<0x20, 21>;
+                  ABSS_FM<0x20, 21>, FGR_64;
   def CVT_D64_W : ABSS_FT<"cvt.d.w", FGR64Opnd, FGR32Opnd, II_CVT>,
-                  ABSS_FM<0x21, 20>;
+                  ABSS_FM<0x21, 20>, FGR_64;
   def CVT_D64_S : ABSS_FT<"cvt.d.s", FGR64Opnd, FGR32Opnd, II_CVT>,
-                  ABSS_FM<0x21, 16>;
+                  ABSS_FM<0x21, 16>, FGR_64;
   def CVT_D64_L : ABSS_FT<"cvt.d.l", FGR64Opnd, FGR64Opnd, II_CVT>,
-                  ABSS_FM<0x21, 21>;
+                  ABSS_FM<0x21, 21>, FGR_64;
 }
 
 let isPseudo = 1, isCodeGenOnly = 1 in {
@@ -322,18 +330,16 @@ let isPseudo = 1, isCodeGenOnly = 1 in {
   def PseudoCVT_D64_L : ABSS_FT<"", FGR64Opnd, GPR64Opnd, II_CVT>;
 }
 
-let Predicates = [NoNaNsFPMath, HasStdEnc] in {
-  def FABS_S : MMRel, ABSS_FT<"abs.s", FGR32Opnd, FGR32Opnd, II_ABS, fabs>,
-               ABSS_FM<0x5, 16>;
-  def FNEG_S : MMRel, ABSS_FT<"neg.s", FGR32Opnd, FGR32Opnd, II_NEG, fneg>,
-               ABSS_FM<0x7, 16>;
-  defm FABS : ABSS_M<"abs.d", II_ABS, fabs>, ABSS_FM<0x5, 17>;
-  defm FNEG : ABSS_M<"neg.d", II_NEG, fneg>, ABSS_FM<0x7, 17>;
-}
+def FABS_S : MMRel, ABSS_FT<"abs.s", FGR32Opnd, FGR32Opnd, II_ABS, fabs>,
+             ABSS_FM<0x5, 16>;
+def FNEG_S : MMRel, ABSS_FT<"neg.s", FGR32Opnd, FGR32Opnd, II_NEG, fneg>,
+             ABSS_FM<0x7, 16>;
+defm FABS : ABSS_M<"abs.d", II_ABS, fabs>, ABSS_FM<0x5, 17>;
+defm FNEG : ABSS_M<"neg.d", II_NEG, fneg>, ABSS_FM<0x7, 17>;
 
 def FSQRT_S : MMRel, ABSS_FT<"sqrt.s", FGR32Opnd, FGR32Opnd, II_SQRT_S, fsqrt>,
-              ABSS_FM<0x4, 16>;
-defm FSQRT : ABSS_M<"sqrt.d", II_SQRT_D, fsqrt>, ABSS_FM<0x4, 17>;
+              ABSS_FM<0x4, 16>, ISA_MIPS2;
+defm FSQRT : ABSS_M<"sqrt.d", II_SQRT_D, fsqrt>, ABSS_FM<0x4, 17>, ISA_MIPS2;
 
 // The odd-numbered registers are only referenced when doing loads,
 // stores, and moves between floating-point and integer registers.
@@ -348,76 +354,92 @@ def MFC1 : MMRel, MFC1_FT<"mfc1", GPR32Opnd, FGR32Opnd, II_MFC1,
 def MTC1 : MMRel, MTC1_FT<"mtc1", FGR32Opnd, GPR32Opnd, II_MTC1,
                           bitconvert>, MFC1_FM<4>;
 def MFHC1 : MMRel, MFC1_FT<"mfhc1", GPR32Opnd, FGRH32Opnd, II_MFHC1>,
-            MFC1_FM<3>;
+            MFC1_FM<3>, ISA_MIPS32R2;
 def MTHC1 : MMRel, MTC1_FT<"mthc1", FGRH32Opnd, GPR32Opnd, II_MTHC1>,
-            MFC1_FM<7>;
+            MFC1_FM<7>, ISA_MIPS32R2;
 def DMFC1 : MFC1_FT<"dmfc1", GPR64Opnd, FGR64Opnd, II_DMFC1,
-            bitconvert>, MFC1_FM<1>;
+            bitconvert>, MFC1_FM<1>, ISA_MIPS3;
 def DMTC1 : MTC1_FT<"dmtc1", FGR64Opnd, GPR64Opnd, II_DMTC1,
-            bitconvert>, MFC1_FM<5>;
+            bitconvert>, MFC1_FM<5>, ISA_MIPS3;
 
 def FMOV_S   : MMRel, ABSS_FT<"mov.s", FGR32Opnd, FGR32Opnd, II_MOV_S>,
                ABSS_FM<0x6, 16>;
 def FMOV_D32 : MMRel, ABSS_FT<"mov.d", AFGR64Opnd, AFGR64Opnd, II_MOV_D>,
-               ABSS_FM<0x6, 17>, Requires<[NotFP64bit, HasStdEnc]>;
+               ABSS_FM<0x6, 17>, AdditionalRequires<[NotFP64bit]>;
 def FMOV_D64 : ABSS_FT<"mov.d", FGR64Opnd, FGR64Opnd, II_MOV_D>,
-               ABSS_FM<0x6, 17>, Requires<[IsFP64bit, HasStdEnc]> {
+               ABSS_FM<0x6, 17>, AdditionalRequires<[IsFP64bit]> {
                  let DecoderNamespace = "Mips64";
 }
 
 /// Floating Point Memory Instructions
-let Predicates = [HasStdEnc] in {
-  def LWC1 : MMRel, LW_FT<"lwc1", FGR32Opnd, II_LWC1, load>, LW_FM<0x31>;
-  def SWC1 : MMRel, SW_FT<"swc1", FGR32Opnd, II_SWC1, store>, LW_FM<0x39>;
-}
-
-let Predicates = [IsFP64bit, HasStdEnc], DecoderNamespace = "Mips64" in {
-  def LDC164 : LW_FT<"ldc1", FGR64Opnd, II_LDC1, load>, LW_FM<0x35>;
-  def SDC164 : SW_FT<"sdc1", FGR64Opnd, II_SDC1, store>, LW_FM<0x3d>;
-}
-
-let Predicates = [NotFP64bit, HasStdEnc] in {
-  def LDC1 : MMRel, LW_FT<"ldc1", AFGR64Opnd, II_LDC1, load>, LW_FM<0x35>;
-  def SDC1 : MMRel, SW_FT<"sdc1", AFGR64Opnd, II_SDC1, store>, LW_FM<0x3d>;
-}
-
-/// Cop2 Memory Instructions
-let Predicates = [HasStdEnc] in {
-  def LWC2 : LW_FT<"lwc2", COP2Opnd, NoItinerary, load>, LW_FM<0x32>;
-  def SWC2 : SW_FT<"swc2", COP2Opnd, NoItinerary, store>, LW_FM<0x3a>;
-  def LDC2 : LW_FT<"ldc2", COP2Opnd, NoItinerary, load>, LW_FM<0x36>;
-  def SDC2 : SW_FT<"sdc2", COP2Opnd, NoItinerary, store>, LW_FM<0x3e>;
-}
+def LWC1 : MMRel, LW_FT<"lwc1", FGR32Opnd, II_LWC1, load>, LW_FM<0x31>;
+def SWC1 : MMRel, SW_FT<"swc1", FGR32Opnd, II_SWC1, store>, LW_FM<0x39>;
+
+let DecoderNamespace = "Mips64" in {
+  def LDC164 : LW_FT<"ldc1", FGR64Opnd, II_LDC1, load>, LW_FM<0x35>, ISA_MIPS2,
+               FGR_64;
+  def SDC164 : SW_FT<"sdc1", FGR64Opnd, II_SDC1, store>, LW_FM<0x3d>, ISA_MIPS2,
+               FGR_64;
+}
+
+def LDC1 : MMRel, LW_FT<"ldc1", AFGR64Opnd, II_LDC1, load>, LW_FM<0x35>,
+           ISA_MIPS2, FGR_32;
+def SDC1 : MMRel, SW_FT<"sdc1", AFGR64Opnd, II_SDC1, store>, LW_FM<0x3d>,
+           ISA_MIPS2, FGR_32;
+
+// Cop2 Memory Instructions
+// FIXME: These aren't really FPU instructions and as such don't belong in this
+//        file
+def LWC2 : LW_FT<"lwc2", COP2Opnd, NoItinerary, load>, LW_FM<0x32>;
+def SWC2 : SW_FT<"swc2", COP2Opnd, NoItinerary, store>, LW_FM<0x3a>;
+def LDC2 : LW_FT<"ldc2", COP2Opnd, NoItinerary, load>, LW_FM<0x36>, ISA_MIPS2;
+def SDC2 : SW_FT<"sdc2", COP2Opnd, NoItinerary, store>, LW_FM<0x3e>, ISA_MIPS2;
+
+// Cop3 Memory Instructions
+// FIXME: These aren't really FPU instructions and as such don't belong in this
+//        file
+def LWC3 : LW_FT<"lwc3", COP3Opnd, NoItinerary, load>, LW_FM<0x33>;
+def SWC3 : SW_FT<"swc3", COP3Opnd, NoItinerary, store>, LW_FM<0x3b>;
+def LDC3 : LW_FT<"ldc3", COP3Opnd, NoItinerary, load>, LW_FM<0x37>, ISA_MIPS2;
+def SDC3 : SW_FT<"sdc3", COP3Opnd, NoItinerary, store>, LW_FM<0x3f>, ISA_MIPS2;
 
 // Indexed loads and stores.
 // Base register + offset register addressing mode (indicated by "x" in the
 // instruction mnemonic) is disallowed under NaCl.
-let Predicates = [HasFPIdx, HasStdEnc, IsNotNaCl] in {
-  def LWXC1 : MMRel, LWXC1_FT<"lwxc1", FGR32Opnd, II_LWXC1, load>, LWXC1_FM<0>;
-  def SWXC1 : MMRel, SWXC1_FT<"swxc1", FGR32Opnd, II_SWXC1, store>, SWXC1_FM<8>;
+let AdditionalPredicates = [IsNotNaCl] in {
+  def LWXC1 : MMRel, LWXC1_FT<"lwxc1", FGR32Opnd, II_LWXC1, load>, LWXC1_FM<0>,
+              INSN_MIPS4_32R2;
+  def SWXC1 : MMRel, SWXC1_FT<"swxc1", FGR32Opnd, II_SWXC1, store>, SWXC1_FM<8>,
+              INSN_MIPS4_32R2;
 }
 
-let Predicates = [HasFPIdx, NotFP64bit, HasStdEnc, NotInMicroMips,
-                  IsNotNaCl] in {
-  def LDXC1 : LWXC1_FT<"ldxc1", AFGR64Opnd, II_LDXC1, load>, LWXC1_FM<1>;
-  def SDXC1 : SWXC1_FT<"sdxc1", AFGR64Opnd, II_SDXC1, store>, SWXC1_FM<9>;
+let AdditionalPredicates = [NotInMicroMips, IsNotNaCl] in {
+  def LDXC1 : LWXC1_FT<"ldxc1", AFGR64Opnd, II_LDXC1, load>, LWXC1_FM<1>,
+              INSN_MIPS4_32R2, FGR_32;
+  def SDXC1 : SWXC1_FT<"sdxc1", AFGR64Opnd, II_SDXC1, store>, SWXC1_FM<9>,
+              INSN_MIPS4_32R2, FGR_32;
 }
 
-let Predicates = [HasFPIdx, IsFP64bit, HasStdEnc],
-    DecoderNamespace="Mips64" in {
-  def LDXC164 : LWXC1_FT<"ldxc1", FGR64Opnd, II_LDXC1, load>, LWXC1_FM<1>;
-  def SDXC164 : SWXC1_FT<"sdxc1", FGR64Opnd, II_SDXC1, store>, SWXC1_FM<9>;
+let DecoderNamespace="Mips64" in {
+  def LDXC164 : LWXC1_FT<"ldxc1", FGR64Opnd, II_LDXC1, load>, LWXC1_FM<1>,
+                INSN_MIPS4_32R2, FGR_64;
+  def SDXC164 : SWXC1_FT<"sdxc1", FGR64Opnd, II_SDXC1, store>, SWXC1_FM<9>,
+                INSN_MIPS4_32R2, FGR_64;
 }
 
 // Load/store doubleword indexed unaligned.
-let Predicates = [NotFP64bit, HasStdEnc, IsNotNaCl] in {
-  def LUXC1 : MMRel, LWXC1_FT<"luxc1", AFGR64Opnd, II_LUXC1>, LWXC1_FM<0x5>;
-  def SUXC1 : MMRel, SWXC1_FT<"suxc1", AFGR64Opnd, II_SUXC1>, SWXC1_FM<0xd>;
+let AdditionalPredicates = [IsNotNaCl] in {
+  def LUXC1 : MMRel, LWXC1_FT<"luxc1", AFGR64Opnd, II_LUXC1>, LWXC1_FM<0x5>,
+              INSN_MIPS5_32R2, FGR_32;
+  def SUXC1 : MMRel, SWXC1_FT<"suxc1", AFGR64Opnd, II_SUXC1>, SWXC1_FM<0xd>,
+              INSN_MIPS5_32R2, FGR_32;
 }
 
-let Predicates = [IsFP64bit, HasStdEnc], DecoderNamespace="Mips64" in {
-  def LUXC164 : LWXC1_FT<"luxc1", FGR64Opnd, II_LUXC1>, LWXC1_FM<0x5>;
-  def SUXC164 : SWXC1_FT<"suxc1", FGR64Opnd, II_SUXC1>, SWXC1_FM<0xd>;
+let DecoderNamespace="Mips64" in {
+  def LUXC164 : LWXC1_FT<"luxc1", FGR64Opnd, II_LUXC1>, LWXC1_FM<0x5>,
+                INSN_MIPS5_32R2, FGR_64;
+  def SUXC164 : SWXC1_FT<"suxc1", FGR64Opnd, II_SUXC1>, SWXC1_FM<0xd>,
+                INSN_MIPS5_32R2, FGR_64;
 }
 
 /// Floating-point Aritmetic
@@ -434,47 +456,43 @@ def FSUB_S : MMRel, ADDS_FT<"sub.s", FGR32Opnd, II_SUB_S, 0, fsub>,
              ADDS_FM<0x01, 16>;
 defm FSUB :  ADDS_M<"sub.d", II_SUB_D, 0, fsub>, ADDS_FM<0x01, 17>;
 
-let Predicates = [HasMips32r2, HasStdEnc] in {
-  def MADD_S : MMRel, MADDS_FT<"madd.s", FGR32Opnd, II_MADD_S, fadd>,
-               MADDS_FM<4, 0>;
-  def MSUB_S : MMRel, MADDS_FT<"msub.s", FGR32Opnd, II_MSUB_S, fsub>,
-               MADDS_FM<5, 0>;
-}
+def MADD_S : MMRel, MADDS_FT<"madd.s", FGR32Opnd, II_MADD_S, fadd>,
+             MADDS_FM<4, 0>, ISA_MIPS32R2;
+def MSUB_S : MMRel, MADDS_FT<"msub.s", FGR32Opnd, II_MSUB_S, fsub>,
+             MADDS_FM<5, 0>, ISA_MIPS32R2;
 
-let Predicates = [HasMips32r2, NoNaNsFPMath, HasStdEnc] in {
+let AdditionalPredicates = [NoNaNsFPMath] in {
   def NMADD_S : MMRel, NMADDS_FT<"nmadd.s", FGR32Opnd, II_NMADD_S, fadd>,
-                MADDS_FM<6, 0>;
+                MADDS_FM<6, 0>, ISA_MIPS32R2;
   def NMSUB_S : MMRel, NMADDS_FT<"nmsub.s", FGR32Opnd, II_NMSUB_S, fsub>,
-                MADDS_FM<7, 0>;
+                MADDS_FM<7, 0>, ISA_MIPS32R2;
 }
 
-let Predicates = [HasMips32r2, NotFP64bit, HasStdEnc] in {
-  def MADD_D32 : MMRel, MADDS_FT<"madd.d", AFGR64Opnd, II_MADD_D, fadd>,
-                 MADDS_FM<4, 1>;
-  def MSUB_D32 : MMRel, MADDS_FT<"msub.d", AFGR64Opnd, II_MSUB_D, fsub>,
-                 MADDS_FM<5, 1>;
-}
+def MADD_D32 : MMRel, MADDS_FT<"madd.d", AFGR64Opnd, II_MADD_D, fadd>,
+               MADDS_FM<4, 1>, ISA_MIPS32R2, FGR_32;
+def MSUB_D32 : MMRel, MADDS_FT<"msub.d", AFGR64Opnd, II_MSUB_D, fsub>,
+               MADDS_FM<5, 1>, ISA_MIPS32R2, FGR_32;
 
-let Predicates = [HasMips32r2, NotFP64bit, NoNaNsFPMath, HasStdEnc] in {
+let AdditionalPredicates = [NoNaNsFPMath] in {
   def NMADD_D32 : MMRel, NMADDS_FT<"nmadd.d", AFGR64Opnd, II_NMADD_D, fadd>,
-                  MADDS_FM<6, 1>;
+                  MADDS_FM<6, 1>, ISA_MIPS32R2, FGR_32;
   def NMSUB_D32 : MMRel, NMADDS_FT<"nmsub.d", AFGR64Opnd, II_NMSUB_D, fsub>,
-                  MADDS_FM<7, 1>;
+                  MADDS_FM<7, 1>, ISA_MIPS32R2, FGR_32;
 }
 
-let Predicates = [HasMips32r2, IsFP64bit, HasStdEnc], isCodeGenOnly=1 in {
+let isCodeGenOnly=1 in {
   def MADD_D64 : MADDS_FT<"madd.d", FGR64Opnd, II_MADD_D, fadd>,
-                 MADDS_FM<4, 1>;
+                 MADDS_FM<4, 1>, ISA_MIPS32R2, FGR_64;
   def MSUB_D64 : MADDS_FT<"msub.d", FGR64Opnd, II_MSUB_D, fsub>,
-                 MADDS_FM<5, 1>;
+                 MADDS_FM<5, 1>, ISA_MIPS32R2, FGR_64;
 }
 
-let Predicates = [HasMips32r2, IsFP64bit, NoNaNsFPMath, HasStdEnc],
+let AdditionalPredicates = [NoNaNsFPMath],
     isCodeGenOnly=1 in {
   def NMADD_D64 : NMADDS_FT<"nmadd.d", FGR64Opnd, II_NMADD_D, fadd>,
-                  MADDS_FM<6, 1>;
+                  MADDS_FM<6, 1>, ISA_MIPS32R2, FGR_64;
   def NMSUB_D64 : NMADDS_FT<"nmsub.d", FGR64Opnd, II_NMSUB_D, fsub>,
-                  MADDS_FM<7, 1>;
+                  MADDS_FM<7, 1>, ISA_MIPS32R2, FGR_64;
 }
 
 //===----------------------------------------------------------------------===//
@@ -515,10 +533,10 @@ def MIPS_FCOND_NGT  : PatLeaf<(i32 15)>;
 /// Floating Point Compare
 def FCMP_S32 : MMRel, CEQS_FT<"s", FGR32, II_C_CC_S, MipsFPCmp>, CEQS_FM<16>;
 def FCMP_D32 : MMRel, CEQS_FT<"d", AFGR64, II_C_CC_D, MipsFPCmp>, CEQS_FM<17>,
-               Requires<[NotFP64bit, HasStdEnc]>;
+               AdditionalRequires<[NotFP64bit]>;
 let DecoderNamespace = "Mips64" in
 def FCMP_D64 : CEQS_FT<"d", FGR64, II_C_CC_D, MipsFPCmp>, CEQS_FM<17>,
-               Requires<[IsFP64bit, HasStdEnc]>;
+               AdditionalRequires<[IsFP64bit]>;
 
 //===----------------------------------------------------------------------===//
 // Floating Point Pseudo-Instructions
@@ -531,9 +549,9 @@ class BuildPairF64Base<RegisterOperand RO> :
            [(set RO:$dst, (MipsBuildPairF64 GPR32Opnd:$lo, GPR32Opnd:$hi))]>;
 
 def BuildPairF64 : BuildPairF64Base<AFGR64Opnd>,
-                   Requires<[NotFP64bit, HasStdEnc]>;
+                   AdditionalRequires<[NotFP64bit]>;
 def BuildPairF64_64 : BuildPairF64Base<FGR64Opnd>,
-                      Requires<[IsFP64bit, HasStdEnc]>;
+                      AdditionalRequires<[IsFP64bit]>;
 
 // This pseudo instr gets expanded into 2 mfc1 instrs after register
 // allocation.
@@ -544,15 +562,15 @@ class ExtractElementF64Base<RegisterOperand RO> :
            [(set GPR32Opnd:$dst, (MipsExtractElementF64 RO:$src, imm:$n))]>;
 
 def ExtractElementF64 : ExtractElementF64Base<AFGR64Opnd>,
-                        Requires<[NotFP64bit, HasStdEnc]>;
+                        AdditionalRequires<[NotFP64bit]>;
 def ExtractElementF64_64 : ExtractElementF64Base<FGR64Opnd>,
-                           Requires<[IsFP64bit, HasStdEnc]>;
+                           AdditionalRequires<[IsFP64bit]>;
 
 //===----------------------------------------------------------------------===//
 // InstAliases.
 //===----------------------------------------------------------------------===//
-def : InstAlias<"bc1t $offset", (BC1T FCC0, brtarget:$offset)>;
-def : InstAlias<"bc1f $offset", (BC1F FCC0, brtarget:$offset)>;
+def : MipsInstAlias<"bc1t $offset", (BC1T FCC0, brtarget:$offset)>;
+def : MipsInstAlias<"bc1f $offset", (BC1F FCC0, brtarget:$offset)>;
 
 //===----------------------------------------------------------------------===//
 // Floating Point Patterns
@@ -565,55 +583,45 @@ def : MipsPat<(f32 (sint_to_fp GPR32Opnd:$src)),
 def : MipsPat<(MipsTruncIntFP FGR32Opnd:$src),
               (TRUNC_W_S FGR32Opnd:$src)>;
 
-let Predicates = [NotFP64bit, HasStdEnc] in {
-  def : MipsPat<(f64 (sint_to_fp GPR32Opnd:$src)),
-                (PseudoCVT_D32_W GPR32Opnd:$src)>;
-  def : MipsPat<(MipsTruncIntFP AFGR64Opnd:$src),
-                (TRUNC_W_D32 AFGR64Opnd:$src)>;
-  def : MipsPat<(f32 (fround AFGR64Opnd:$src)),
-                (CVT_S_D32 AFGR64Opnd:$src)>;
-  def : MipsPat<(f64 (fextend FGR32Opnd:$src)),
-                (CVT_D32_S FGR32Opnd:$src)>;
-}
+def : MipsPat<(f64 (sint_to_fp GPR32Opnd:$src)),
+              (PseudoCVT_D32_W GPR32Opnd:$src)>, FGR_32;
+def : MipsPat<(MipsTruncIntFP AFGR64Opnd:$src),
+              (TRUNC_W_D32 AFGR64Opnd:$src)>, FGR_32;
+def : MipsPat<(f32 (fround AFGR64Opnd:$src)),
+              (CVT_S_D32 AFGR64Opnd:$src)>, FGR_32;
+def : MipsPat<(f64 (fextend FGR32Opnd:$src)),
+              (CVT_D32_S FGR32Opnd:$src)>, FGR_32;
+
+def : MipsPat<(f64 fpimm0), (DMTC1 ZERO_64)>, FGR_64;
+def : MipsPat<(f64 fpimm0neg), (FNEG_D64 (DMTC1 ZERO_64))>, FGR_64;
+
+def : MipsPat<(f64 (sint_to_fp GPR32Opnd:$src)),
+              (PseudoCVT_D64_W GPR32Opnd:$src)>, FGR_64;
+def : MipsPat<(f32 (sint_to_fp GPR64Opnd:$src)),
+              (EXTRACT_SUBREG (PseudoCVT_S_L GPR64Opnd:$src), sub_lo)>, FGR_64;
+def : MipsPat<(f64 (sint_to_fp GPR64Opnd:$src)),
+              (PseudoCVT_D64_L GPR64Opnd:$src)>, FGR_64;
+
+def : MipsPat<(MipsTruncIntFP FGR64Opnd:$src),
+              (TRUNC_W_D64 FGR64Opnd:$src)>, FGR_64;
+def : MipsPat<(MipsTruncIntFP FGR32Opnd:$src),
+              (TRUNC_L_S FGR32Opnd:$src)>, FGR_64;
+def : MipsPat<(MipsTruncIntFP FGR64Opnd:$src),
+              (TRUNC_L_D64 FGR64Opnd:$src)>, FGR_64;
 
-let Predicates = [IsFP64bit, HasStdEnc] in {
-  def : MipsPat<(f64 fpimm0), (DMTC1 ZERO_64)>;
-  def : MipsPat<(f64 fpimm0neg), (FNEG_D64 (DMTC1 ZERO_64))>;
-
-  def : MipsPat<(f64 (sint_to_fp GPR32Opnd:$src)),
-                (PseudoCVT_D64_W GPR32Opnd:$src)>;
-  def : MipsPat<(f32 (sint_to_fp GPR64Opnd:$src)),
-                (EXTRACT_SUBREG (PseudoCVT_S_L GPR64Opnd:$src), sub_lo)>;
-  def : MipsPat<(f64 (sint_to_fp GPR64Opnd:$src)),
-                (PseudoCVT_D64_L GPR64Opnd:$src)>;
-
-  def : MipsPat<(MipsTruncIntFP FGR64Opnd:$src),
-                (TRUNC_W_D64 FGR64Opnd:$src)>;
-  def : MipsPat<(MipsTruncIntFP FGR32Opnd:$src),
-                (TRUNC_L_S FGR32Opnd:$src)>;
-  def : MipsPat<(MipsTruncIntFP FGR64Opnd:$src),
-                (TRUNC_L_D64 FGR64Opnd:$src)>;
-
-  def : MipsPat<(f32 (fround FGR64Opnd:$src)),
-                (CVT_S_D64 FGR64Opnd:$src)>;
-  def : MipsPat<(f64 (fextend FGR32Opnd:$src)),
-                (CVT_D64_S FGR32Opnd:$src)>;
-}
+def : MipsPat<(f32 (fround FGR64Opnd:$src)),
+              (CVT_S_D64 FGR64Opnd:$src)>, FGR_64;
+def : MipsPat<(f64 (fextend FGR32Opnd:$src)),
+              (CVT_D64_S FGR32Opnd:$src)>, FGR_64;
 
 // Patterns for loads/stores with a reg+imm operand.
 let AddedComplexity = 40 in {
-  let Predicates = [HasStdEnc] in {
-    def : LoadRegImmPat<LWC1, f32, load>;
-    def : StoreRegImmPat<SWC1, f32>;
-  }
+  def : LoadRegImmPat<LWC1, f32, load>;
+  def : StoreRegImmPat<SWC1, f32>;
 
-  let Predicates = [IsFP64bit, HasStdEnc] in {
-    def : LoadRegImmPat<LDC164, f64, load>;
-    def : StoreRegImmPat<SDC164, f64>;
-  }
+  def : LoadRegImmPat<LDC164, f64, load>, FGR_64;
+  def : StoreRegImmPat<SDC164, f64>, FGR_64;
 
-  let Predicates = [NotFP64bit, HasStdEnc] in {
-    def : LoadRegImmPat<LDC1, f64, load>;
-    def : StoreRegImmPat<SDC1, f64>;
-  }
+  def : LoadRegImmPat<LDC1, f64, load>, FGR_32;
+  def : StoreRegImmPat<SDC1, f64>, FGR_32;
 }
diff --git a/lib/Target/Mips/MipsInstrFormats.td b/lib/Target/Mips/MipsInstrFormats.td
index e4405ab..0377eab 100644
--- a/lib/Target/Mips/MipsInstrFormats.td
+++ b/lib/Target/Mips/MipsInstrFormats.td
@@ -93,8 +93,8 @@ class MipsInst<dag outs, dag ins, string asmstr, list<dag> pattern,
 // Mips32/64 Instruction Format
 class InstSE<dag outs, dag ins, string asmstr, list<dag> pattern,
              InstrItinClass itin, Format f, string opstr = ""> :
-  MipsInst<outs, ins, asmstr, pattern, itin, f> {
-  let Predicates = [HasStdEnc];
+  MipsInst<outs, ins, asmstr, pattern, itin, f>, PredicateControl {
+  let EncodingPredicates = [HasStdEnc];
   string BaseOpcode = opstr;
   string Arch;
 }
@@ -109,9 +109,9 @@ class MipsPseudo<dag outs, dag ins, list<dag> pattern,
 
 // Mips32/64 Pseudo Instruction Format
 class PseudoSE<dag outs, dag ins, list<dag> pattern,
-               InstrItinClass itin = IIPseudo>:
-  MipsPseudo<outs, ins, pattern, itin> {
-  let Predicates = [HasStdEnc];
+               InstrItinClass itin = IIPseudo> :
+  MipsPseudo<outs, ins, pattern, itin>, PredicateControl {
+  let EncodingPredicates = [HasStdEnc];
 }
 
 // Pseudo-instructions for alternate assembly syntax (never used by codegen).
@@ -545,6 +545,20 @@ class SEQ_FM<bits<6> funct> : StdArch {
   let Inst{5-0}   = funct;
 }
 
+class SEQI_FM<bits<6> funct> : StdArch {
+  bits<5> rs;
+  bits<5> rt;
+  bits<10> imm10;
+
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x1c;
+  let Inst{25-21} = rs;
+  let Inst{20-16} = rt;
+  let Inst{15-6}  = imm10;
+  let Inst{5-0}   = funct;
+}
+
 //===----------------------------------------------------------------------===//
 //  System calls format <op|code_|funct>
 //===----------------------------------------------------------------------===//
@@ -829,3 +843,12 @@ class BARRIER_FM<bits<5> op> : StdArch {
   let Inst{10-6} = op; // Operation
   let Inst{5-0} = 0;   // SLL
 }
+
+class COP0_TLB_FM<bits<6> op> : StdArch {
+  bits<32> Inst;
+
+  let Inst{31-26} = 0x10; // COP0
+  let Inst{25} = 1;       // CO
+  let Inst{24-6} = 0;
+  let Inst{5-0} = op;     // Operation
+}
diff --git a/lib/Target/Mips/MipsInstrInfo.cpp b/lib/Target/Mips/MipsInstrInfo.cpp
index 0ebad05..d6da6c6 100644
--- a/lib/Target/Mips/MipsInstrInfo.cpp
+++ b/lib/Target/Mips/MipsInstrInfo.cpp
@@ -22,11 +22,11 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/TargetRegistry.h"
 
+using namespace llvm;
+
 #define GET_INSTRINFO_CTOR_DTOR
 #include "MipsGenInstrInfo.inc"
 
-using namespace llvm;
-
 // Pin the vtable to this file.
 void MipsInstrInfo::anchor() {}
 
@@ -195,7 +195,7 @@ AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
   if (I == REnd || !isUnpredicatedTerminator(&*I)) {
     // This block ends with no branches (it just falls through to its succ).
     // Leave TBB/FBB null.
-    TBB = FBB = NULL;
+    TBB = FBB = nullptr;
     return BT_NoBranch;
   }
 
@@ -209,7 +209,7 @@ AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
 
   // Get the second to last instruction in the block.
   unsigned SecondLastOpc = 0;
-  MachineInstr *SecondLastInst = NULL;
+  MachineInstr *SecondLastInst = nullptr;
 
   if (++I != REnd) {
     SecondLastInst = &*I;
diff --git a/lib/Target/Mips/MipsInstrInfo.h b/lib/Target/Mips/MipsInstrInfo.h
index d9ac961..742193f 100644
--- a/lib/Target/Mips/MipsInstrInfo.h
+++ b/lib/Target/Mips/MipsInstrInfo.h
@@ -9,6 +9,10 @@
 //
 // This file contains the Mips implementation of the TargetInstrInfo class.
 //
+// FIXME: We need to override TargetInstrInfo::getInlineAsmLength method in
+// order for MipsLongBranch pass to work correctly when the code has inline
+// assembly.  The returned value doesn't have to be the asm instruction's exact
+// size in bytes; MipsLongBranch only expects it to be the correct upper bound.
 //===----------------------------------------------------------------------===//
 
 #ifndef MIPSINSTRUCTIONINFO_H
@@ -47,20 +51,20 @@ public:
   static const MipsInstrInfo *create(MipsTargetMachine &TM);
 
   /// Branch Analysis
-  virtual bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
-                             MachineBasicBlock *&FBB,
-                             SmallVectorImpl<MachineOperand> &Cond,
-                             bool AllowModify) const;
+  bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+                     MachineBasicBlock *&FBB,
+                     SmallVectorImpl<MachineOperand> &Cond,
+                     bool AllowModify) const override;
 
-  virtual unsigned RemoveBranch(MachineBasicBlock &MBB) const;
+  unsigned RemoveBranch(MachineBasicBlock &MBB) const override;
 
-  virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
-                                MachineBasicBlock *FBB,
-                                const SmallVectorImpl<MachineOperand> &Cond,
-                                DebugLoc DL) const;
+  unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+                        MachineBasicBlock *FBB,
+                        const SmallVectorImpl<MachineOperand> &Cond,
+                        DebugLoc DL) const override;
 
-  virtual
-  bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const;
+  bool
+  ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override;
 
   BranchType AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
                            MachineBasicBlock *&FBB,
@@ -69,8 +73,8 @@ public:
                            SmallVectorImpl<MachineInstr*> &BranchInstrs) const;
 
   /// Insert nop instruction when hazard condition is found
-  virtual void insertNoop(MachineBasicBlock &MBB,
-                          MachineBasicBlock::iterator MI) const;
+  void insertNoop(MachineBasicBlock &MBB,
+                  MachineBasicBlock::iterator MI) const override;
 
   /// getRegisterInfo - TargetInstrInfo is a superset of MRegister info.  As
   /// such, whenever a client has an instance of instruction info, it should
@@ -83,19 +87,19 @@ public:
   /// Return the number of bytes of code the specified instruction may be.
   unsigned GetInstSizeInBytes(const MachineInstr *MI) const;
 
-  virtual void storeRegToStackSlot(MachineBasicBlock &MBB,
-                                   MachineBasicBlock::iterator MBBI,
-                                   unsigned SrcReg, bool isKill, int FrameIndex,
-                                   const TargetRegisterClass *RC,
-                                   const TargetRegisterInfo *TRI) const {
+  void storeRegToStackSlot(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator MBBI,
+                           unsigned SrcReg, bool isKill, int FrameIndex,
+                           const TargetRegisterClass *RC,
+                           const TargetRegisterInfo *TRI) const override {
     storeRegToStack(MBB, MBBI, SrcReg, isKill, FrameIndex, RC, TRI, 0);
   }
 
-  virtual void loadRegFromStackSlot(MachineBasicBlock &MBB,
-                                    MachineBasicBlock::iterator MBBI,
-                                    unsigned DestReg, int FrameIndex,
-                                    const TargetRegisterClass *RC,
-                                    const TargetRegisterInfo *TRI) const {
+  void loadRegFromStackSlot(MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator MBBI,
+                            unsigned DestReg, int FrameIndex,
+                            const TargetRegisterClass *RC,
+                            const TargetRegisterInfo *TRI) const override {
     loadRegFromStack(MBB, MBBI, DestReg, FrameIndex, RC, TRI, 0);
   }
 
diff --git a/lib/Target/Mips/MipsInstrInfo.td b/lib/Target/Mips/MipsInstrInfo.td
index 07c37d8..0d3cb75 100644
--- a/lib/Target/Mips/MipsInstrInfo.td
+++ b/lib/Target/Mips/MipsInstrInfo.td
@@ -146,26 +146,40 @@ def MipsSDR : SDNode<"MipsISD::SDR", SDTStore,
 //===----------------------------------------------------------------------===//
 // Mips Instruction Predicate Definitions.
 //===----------------------------------------------------------------------===//
-def HasSEInReg  :     Predicate<"Subtarget.hasSEInReg()">,
-                      AssemblerPredicate<"FeatureSEInReg">;
-def HasBitCount :     Predicate<"Subtarget.hasBitCount()">,
-                      AssemblerPredicate<"FeatureBitCount">;
-def HasSwap     :     Predicate<"Subtarget.hasSwap()">,
-                      AssemblerPredicate<"FeatureSwap">;
-def HasCondMov  :     Predicate<"Subtarget.hasCondMov()">,
-                      AssemblerPredicate<"FeatureCondMov">;
-def HasFPIdx    :     Predicate<"Subtarget.hasFPIdx()">,
-                      AssemblerPredicate<"FeatureFPIdx">;
+def HasMips2     :    Predicate<"Subtarget.hasMips2()">,
+                      AssemblerPredicate<"FeatureMips2">;
+def HasMips3_32  :    Predicate<"Subtarget.hasMips3_32()">,
+                      AssemblerPredicate<"FeatureMips3_32">;
+def HasMips3_32r2 :   Predicate<"Subtarget.hasMips3_32r2()">,
+                      AssemblerPredicate<"FeatureMips3_32r2">;
+def HasMips3     :    Predicate<"Subtarget.hasMips3()">,
+                      AssemblerPredicate<"FeatureMips3">;
+def HasMips4_32  :    Predicate<"Subtarget.hasMips4_32()">,
+                      AssemblerPredicate<"FeatureMips4_32">;
+def HasMips4_32r2 :   Predicate<"Subtarget.hasMips4_32r2()">,
+                      AssemblerPredicate<"FeatureMips4_32r2">;
+def HasMips5_32r2 :   Predicate<"Subtarget.hasMips5_32r2()">,
+                      AssemblerPredicate<"FeatureMips5_32r2">;
 def HasMips32    :    Predicate<"Subtarget.hasMips32()">,
                       AssemblerPredicate<"FeatureMips32">;
 def HasMips32r2  :    Predicate<"Subtarget.hasMips32r2()">,
                       AssemblerPredicate<"FeatureMips32r2">;
+def HasMips32r6  :    Predicate<"Subtarget.hasMips32r6()">,
+                      AssemblerPredicate<"FeatureMips32r6">;
+def NotMips32r6  :    Predicate<"!Subtarget.hasMips32r6()">,
+                      AssemblerPredicate<"!FeatureMips32r6">;
+def IsGP64bit    :    Predicate<"Subtarget.isGP64bit()">,
+                      AssemblerPredicate<"FeatureGP64Bit">;
+def IsGP32bit    :    Predicate<"!Subtarget.isGP64bit()">,
+                      AssemblerPredicate<"!FeatureGP64Bit">;
 def HasMips64    :    Predicate<"Subtarget.hasMips64()">,
                       AssemblerPredicate<"FeatureMips64">;
-def NotMips64    :    Predicate<"!Subtarget.hasMips64()">,
-                      AssemblerPredicate<"!FeatureMips64">;
 def HasMips64r2  :    Predicate<"Subtarget.hasMips64r2()">,
                       AssemblerPredicate<"FeatureMips64r2">;
+def HasMips64r6  :    Predicate<"Subtarget.hasMips64r6()">,
+                      AssemblerPredicate<"FeatureMips64r6">;
+def NotMips64r6  :    Predicate<"!Subtarget.hasMips64r6()">,
+                      AssemblerPredicate<"!FeatureMips64r6">;
 def IsN64       :     Predicate<"Subtarget.isABI_N64()">,
                       AssemblerPredicate<"FeatureN64">;
 def InMips16Mode :    Predicate<"Subtarget.inMips16Mode()">,
@@ -176,8 +190,7 @@ def RelocStatic :     Predicate<"TM.getRelocationModel() == Reloc::Static">,
                       AssemblerPredicate<"FeatureMips32">;
 def RelocPIC    :     Predicate<"TM.getRelocationModel() == Reloc::PIC_">,
                       AssemblerPredicate<"FeatureMips32">;
-def NoNaNsFPMath :    Predicate<"TM.Options.NoNaNsFPMath">,
-                      AssemblerPredicate<"FeatureMips32">;
+def NoNaNsFPMath :    Predicate<"TM.Options.NoNaNsFPMath">;
 def HasStdEnc :       Predicate<"Subtarget.hasStandardEncoding()">,
                       AssemblerPredicate<"!FeatureMips16">;
 def NotDSP :          Predicate<"!Subtarget.hasDSP()">;
@@ -189,9 +202,65 @@ def IsLE           :  Predicate<"Subtarget.isLittle()">;
 def IsBE           :  Predicate<"!Subtarget.isLittle()">;
 def IsNotNaCl    :    Predicate<"!Subtarget.isTargetNaCl()">;
 
-class MipsPat<dag pattern, dag result> : Pat<pattern, result> {
-  let Predicates = [HasStdEnc];
+//===----------------------------------------------------------------------===//
+// Mips GPR size adjectives.
+// They are mutually exclusive.
+//===----------------------------------------------------------------------===//
+
+class GPR_32 { list<Predicate> GPRPredicates = [IsGP32bit]; }
+class GPR_64 { list<Predicate> GPRPredicates = [IsGP64bit]; }
+
+//===----------------------------------------------------------------------===//
+// Mips ISA/ASE membership and instruction group membership adjectives.
+// They are mutually exclusive.
+//===----------------------------------------------------------------------===//
+
+// FIXME: I'd prefer to use additive predicates to build the instruction sets
+//        but we are short on assembler feature bits at the moment. Using a
+//        subtractive predicate will hopefully keep us under the 32 predicate
+//        limit long enough to develop an alternative way to handle P1||P2
+//        predicates.
+class ISA_MIPS1_NOT_32R6_64R6 {
+  list<Predicate> InsnPredicates = [NotMips32r6, NotMips64r6];
+}
+class ISA_MIPS2    { list<Predicate> InsnPredicates = [HasMips2]; }
+class ISA_MIPS2_NOT_32R6_64R6 {
+  list<Predicate> InsnPredicates = [HasMips2, NotMips32r6, NotMips64r6];
+}
+class ISA_MIPS3    { list<Predicate> InsnPredicates = [HasMips3]; }
+class ISA_MIPS3_NOT_32R6_64R6 {
+  list<Predicate> InsnPredicates = [HasMips3, NotMips32r6, NotMips64r6];
 }
+class ISA_MIPS32   { list<Predicate> InsnPredicates = [HasMips32]; }
+class ISA_MIPS32R2 { list<Predicate> InsnPredicates = [HasMips32r2]; }
+class ISA_MIPS64   { list<Predicate> InsnPredicates = [HasMips64]; }
+class ISA_MIPS64R2 { list<Predicate> InsnPredicates = [HasMips64r2]; }
+class ISA_MIPS32R6 { list<Predicate> InsnPredicates = [HasMips32r6]; }
+class ISA_MIPS64R6 { list<Predicate> InsnPredicates = [HasMips64r6]; }
+
+// The portions of MIPS-III that were also added to MIPS32
+class INSN_MIPS3_32 { list<Predicate> InsnPredicates = [HasMips3_32]; }
+
+// The portions of MIPS-III that were also added to MIPS32
+class INSN_MIPS3_32R2 { list<Predicate> InsnPredicates = [HasMips3_32r2]; }
+
+// The portions of MIPS-IV that were also added to MIPS32
+class INSN_MIPS4_32 { list<Predicate> InsnPredicates = [HasMips4_32]; }
+
+// The portions of MIPS-IV that were also added to MIPS32R2
+class INSN_MIPS4_32R2 { list<Predicate> InsnPredicates = [HasMips4_32r2]; }
+
+// The portions of MIPS-V that were also added to MIPS32R2
+class INSN_MIPS5_32R2 { list<Predicate> InsnPredicates = [HasMips5_32r2]; }
+
+//===----------------------------------------------------------------------===//
+
+class MipsPat<dag pattern, dag result> : Pat<pattern, result>, PredicateControl {
+  let EncodingPredicates = [HasStdEnc];
+}
+
+class MipsInstAlias<string Asm, dag Result, bit Emit = 0b1> :
+  InstAlias<Asm, Result, Emit>, PredicateControl;
 
 class IsCommutable {
   bit isCommutable = 1;
@@ -265,6 +334,11 @@ def simm16      : Operand<i32> {
   let DecoderMethod= "DecodeSimm16";
 }
 
+def simm19_lsl2 : Operand<i32> {
+  let EncoderMethod = "getSimm19Lsl2Encoding";
+  let DecoderMethod = "DecodeSimm19Lsl2";
+}
+
 def simm20      : Operand<i32> {
 }
 
@@ -284,6 +358,14 @@ def uimmz       : Operand<i32> {
 }
 
 // Unsigned Operand
+def uimm2 : Operand<i32> {
+  let PrintMethod = "printUnsignedImm";
+}
+
+def uimm3 : Operand<i32> {
+  let PrintMethod = "printUnsignedImm";
+}
+
 def uimm5       : Operand<i32> {
   let PrintMethod = "printUnsignedImm";
 }
@@ -314,6 +396,10 @@ def InvertedImOperand : Operand<i32> {
   let ParserMatchClass = MipsInvertedImmoperand;
 }
 
+def InvertedImOperand64 : Operand<i64> {
+  let ParserMatchClass = MipsInvertedImmoperand;
+}
+
 class mem_generic : Operand<iPTR> {
   let PrintMethod = "printMemOperand";
   let MIOperandInfo = (ops ptr_rc, simm16);
@@ -478,7 +564,9 @@ class shift_rotate_imm<string opstr, Operand ImmOpnd,
                        SDPatternOperator PF = null_frag> :
   InstSE<(outs RO:$rd), (ins RO:$rt, ImmOpnd:$shamt),
          !strconcat(opstr, "\t$rd, $rt, $shamt"),
-         [(set RO:$rd, (OpNode RO:$rt, PF:$shamt))], itin, FrmR, opstr>;
+         [(set RO:$rd, (OpNode RO:$rt, PF:$shamt))], itin, FrmR, opstr> {
+  let TwoOperandAliasConstraint = "$rt = $rd";
+}
 
 class shift_rotate_reg<string opstr, RegisterOperand RO, InstrItinClass itin,
                        SDPatternOperator OpNode = null_frag>:
@@ -590,7 +678,7 @@ class UncondBranch<Instruction BEQInst> :
   let isTerminator = 1;
   let isBarrier = 1;
   let hasDelaySlot = 1;
-  let Predicates = [RelocPIC, HasStdEnc];
+  let AdditionalPredicates = [RelocPIC];
   let Defs = [AT];
 }
 
@@ -779,27 +867,22 @@ class EffectiveAddress<string opstr, RegisterOperand RO> :
 // Count Leading Ones/Zeros in Word
 class CountLeading0<string opstr, RegisterOperand RO>:
   InstSE<(outs RO:$rd), (ins RO:$rs), !strconcat(opstr, "\t$rd, $rs"),
-         [(set RO:$rd, (ctlz RO:$rs))], II_CLZ, FrmR, opstr>,
-  Requires<[HasBitCount, HasStdEnc]>;
+         [(set RO:$rd, (ctlz RO:$rs))], II_CLZ, FrmR, opstr>;
 
 class CountLeading1<string opstr, RegisterOperand RO>:
   InstSE<(outs RO:$rd), (ins RO:$rs), !strconcat(opstr, "\t$rd, $rs"),
-         [(set RO:$rd, (ctlz (not RO:$rs)))], II_CLO, FrmR, opstr>,
-  Requires<[HasBitCount, HasStdEnc]>;
+         [(set RO:$rd, (ctlz (not RO:$rs)))], II_CLO, FrmR, opstr>;
 
 // Sign Extend in Register.
 class SignExtInReg<string opstr, ValueType vt, RegisterOperand RO,
                    InstrItinClass itin> :
   InstSE<(outs RO:$rd), (ins RO:$rt), !strconcat(opstr, "\t$rd, $rt"),
-         [(set RO:$rd, (sext_inreg RO:$rt, vt))], itin, FrmR, opstr> {
-  let Predicates = [HasSEInReg, HasStdEnc];
-}
+         [(set RO:$rd, (sext_inreg RO:$rt, vt))], itin, FrmR, opstr>;
 
 // Subword Swap
 class SubwordSwap<string opstr, RegisterOperand RO>:
   InstSE<(outs RO:$rd), (ins RO:$rt), !strconcat(opstr, "\t$rd, $rt"), [],
          NoItinerary, FrmR, opstr> {
-  let Predicates = [HasSwap, HasStdEnc];
   let neverHasSideEffects = 1;
 }
 
@@ -814,17 +897,14 @@ class ExtBase<string opstr, RegisterOperand RO, Operand PosOpnd,
   InstSE<(outs RO:$rt), (ins RO:$rs, PosOpnd:$pos, size_ext:$size),
          !strconcat(opstr, " $rt, $rs, $pos, $size"),
          [(set RO:$rt, (Op RO:$rs, imm:$pos, imm:$size))], NoItinerary,
-         FrmR, opstr> {
-  let Predicates = [HasMips32r2, HasStdEnc];
-}
+         FrmR, opstr>, ISA_MIPS32R2;
 
 class InsBase<string opstr, RegisterOperand RO, Operand PosOpnd,
               SDPatternOperator Op = null_frag>:
   InstSE<(outs RO:$rt), (ins RO:$rs, PosOpnd:$pos, size_ins:$size, RO:$src),
          !strconcat(opstr, " $rt, $rs, $pos, $size"),
          [(set RO:$rt, (Op RO:$rs, imm:$pos, imm:$size, RO:$src))],
-         NoItinerary, FrmR, opstr> {
-  let Predicates = [HasMips32r2, HasStdEnc];
+         NoItinerary, FrmR, opstr>, ISA_MIPS32R2 {
   let Constraints = "$src = $rt";
 }
 
@@ -915,6 +995,18 @@ let isPseudo = 1, isCodeGenOnly = 1 in {
   def STORE_ACC64 : Store<"", ACC64>;
 }
 
+// We need these two pseudo instructions to avoid offset calculation for long
+// branches.  See the comment in file MipsLongBranch.cpp for detailed
+// explanation.
+
+// Expands to: lui $dst, %hi($tgt - $baltgt)
+def LONG_BRANCH_LUi : PseudoSE<(outs GPR32Opnd:$dst),
+  (ins brtarget:$tgt, brtarget:$baltgt), []>;
+
+// Expands to: addiu $dst, $src, %lo($tgt - $baltgt)
+def LONG_BRANCH_ADDiu : PseudoSE<(outs GPR32Opnd:$dst),
+  (ins GPR32Opnd:$src, brtarget:$tgt, brtarget:$baltgt), []>;
+
 //===----------------------------------------------------------------------===//
 // Instruction definition
 //===----------------------------------------------------------------------===//
@@ -926,7 +1018,8 @@ let isPseudo = 1, isCodeGenOnly = 1 in {
 def ADDiu : MMRel, ArithLogicI<"addiu", simm16, GPR32Opnd, II_ADDIU, immSExt16,
                                add>,
             ADDI_FM<0x9>, IsAsCheapAsAMove;
-def ADDi  : MMRel, ArithLogicI<"addi", simm16, GPR32Opnd>, ADDI_FM<0x8>;
+def ADDi  : MMRel, ArithLogicI<"addi", simm16, GPR32Opnd>, ADDI_FM<0x8>,
+            ISA_MIPS1_NOT_32R6_64R6;
 def SLTi  : MMRel, SetCC_I<"slti", setlt, simm16, immSExt16, GPR32Opnd>,
             SLTI_FM<0xa>;
 def SLTiu : MMRel, SetCC_I<"sltiu", setult, simm16, immSExt16, GPR32Opnd>,
@@ -949,7 +1042,7 @@ def SUBu  : MMRel, ArithLogicR<"subu", GPR32Opnd, 0, II_SUBU, sub>,
             ADD_FM<0, 0x23>;
 let Defs = [HI0, LO0] in
 def MUL   : MMRel, ArithLogicR<"mul", GPR32Opnd, 1, II_MUL, mul>,
-            ADD_FM<0x1c, 2>;
+            ADD_FM<0x1c, 2>, ISA_MIPS32;
 def ADD   : MMRel, ArithLogicR<"add", GPR32Opnd>, ADD_FM<0, 0x20>;
 def SUB   : MMRel, ArithLogicR<"sub", GPR32Opnd>, ADD_FM<0, 0x22>;
 def SLT   : MMRel, SetCC_R<"slt", setlt, GPR32Opnd>, ADD_FM<0, 0x2a>;
@@ -977,12 +1070,11 @@ def SRAV : MMRel, shift_rotate_reg<"srav", GPR32Opnd, II_SRAV, sra>,
            SRLV_FM<7, 0>;
 
 // Rotate Instructions
-let Predicates = [HasMips32r2, HasStdEnc] in {
-  def ROTR  : MMRel, shift_rotate_imm<"rotr", uimm5, GPR32Opnd, II_ROTR, rotr,
-                                      immZExt5>, SRA_FM<2, 1>;
-  def ROTRV : MMRel, shift_rotate_reg<"rotrv", GPR32Opnd, II_ROTRV, rotr>,
-              SRLV_FM<6, 1>;
-}
+def ROTR  : MMRel, shift_rotate_imm<"rotr", uimm5, GPR32Opnd, II_ROTR, rotr,
+                                    immZExt5>,
+            SRA_FM<2, 1>, ISA_MIPS32R2;
+def ROTRV : MMRel, shift_rotate_reg<"rotrv", GPR32Opnd, II_ROTRV, rotr>,
+            SRLV_FM<6, 1>, ISA_MIPS32R2;
 
 /// Load and Store Instructions
 ///  aligned
@@ -999,11 +1091,16 @@ def SH  : Store<"sh", GPR32Opnd, truncstorei16, II_SH>, MMRel, LW_FM<0x29>;
 def SW  : Store<"sw", GPR32Opnd, store, II_SW>, MMRel, LW_FM<0x2b>;
 
 /// load/store left/right
-let Predicates = [NotInMicroMips] in {
-def LWL : LoadLeftRight<"lwl", MipsLWL, GPR32Opnd, II_LWL>, LW_FM<0x22>;
-def LWR : LoadLeftRight<"lwr", MipsLWR, GPR32Opnd, II_LWR>, LW_FM<0x26>;
-def SWL : StoreLeftRight<"swl", MipsSWL, GPR32Opnd, II_SWL>, LW_FM<0x2a>;
-def SWR : StoreLeftRight<"swr", MipsSWR, GPR32Opnd, II_SWR>, LW_FM<0x2e>;
+let EncodingPredicates = []<Predicate>, // FIXME: Lack of HasStdEnc is probably a bug
+    AdditionalPredicates = [NotInMicroMips] in {
+def LWL : LoadLeftRight<"lwl", MipsLWL, GPR32Opnd, II_LWL>, LW_FM<0x22>,
+          ISA_MIPS1_NOT_32R6_64R6;
+def LWR : LoadLeftRight<"lwr", MipsLWR, GPR32Opnd, II_LWR>, LW_FM<0x26>,
+          ISA_MIPS1_NOT_32R6_64R6;
+def SWL : StoreLeftRight<"swl", MipsSWL, GPR32Opnd, II_SWL>, LW_FM<0x2a>,
+          ISA_MIPS1_NOT_32R6_64R6;
+def SWR : StoreLeftRight<"swr", MipsSWR, GPR32Opnd, II_SWR>, LW_FM<0x2e>,
+          ISA_MIPS1_NOT_32R6_64R6;
 }
 
 def SYNC : MMRel, SYNC_FT<"sync">, SYNC_FM;
@@ -1014,34 +1111,41 @@ def TLT : MMRel, TEQ_FT<"tlt", GPR32Opnd>, TEQ_FM<0x32>;
 def TLTU : MMRel, TEQ_FT<"tltu", GPR32Opnd>, TEQ_FM<0x33>;
 def TNE : MMRel, TEQ_FT<"tne", GPR32Opnd>, TEQ_FM<0x36>;
 
-def TEQI : MMRel, TEQI_FT<"teqi", GPR32Opnd>, TEQI_FM<0xc>;
-def TGEI : MMRel, TEQI_FT<"tgei", GPR32Opnd>, TEQI_FM<0x8>;
-def TGEIU : MMRel, TEQI_FT<"tgeiu", GPR32Opnd>, TEQI_FM<0x9>;
-def TLTI : MMRel, TEQI_FT<"tlti", GPR32Opnd>, TEQI_FM<0xa>;
-def TTLTIU : MMRel, TEQI_FT<"tltiu", GPR32Opnd>, TEQI_FM<0xb>;
-def TNEI : MMRel, TEQI_FT<"tnei", GPR32Opnd>, TEQI_FM<0xe>;
+def TEQI : MMRel, TEQI_FT<"teqi", GPR32Opnd>, TEQI_FM<0xc>,
+           ISA_MIPS2_NOT_32R6_64R6;
+def TGEI : MMRel, TEQI_FT<"tgei", GPR32Opnd>, TEQI_FM<0x8>,
+           ISA_MIPS2_NOT_32R6_64R6;
+def TGEIU : MMRel, TEQI_FT<"tgeiu", GPR32Opnd>, TEQI_FM<0x9>,
+           ISA_MIPS2_NOT_32R6_64R6;
+def TLTI : MMRel, TEQI_FT<"tlti", GPR32Opnd>, TEQI_FM<0xa>,
+           ISA_MIPS2_NOT_32R6_64R6;
+def TTLTIU : MMRel, TEQI_FT<"tltiu", GPR32Opnd>, TEQI_FM<0xb>,
+           ISA_MIPS2_NOT_32R6_64R6;
+def TNEI : MMRel, TEQI_FT<"tnei", GPR32Opnd>, TEQI_FM<0xe>,
+           ISA_MIPS2_NOT_32R6_64R6;
 
 def BREAK : MMRel, BRK_FT<"break">, BRK_FM<0xd>;
 def SYSCALL : MMRel, SYS_FT<"syscall">, SYS_FM<0xc>;
 def TRAP : TrapBase<BREAK>;
 
-def ERET : MMRel, ER_FT<"eret">, ER_FM<0x18>;
-def DERET : MMRel, ER_FT<"deret">, ER_FM<0x1f>;
+def ERET : MMRel, ER_FT<"eret">, ER_FM<0x18>, INSN_MIPS3_32;
+def DERET : MMRel, ER_FT<"deret">, ER_FM<0x1f>, ISA_MIPS32;
 
-def EI : MMRel, DEI_FT<"ei", GPR32Opnd>, EI_FM<1>;
-def DI : MMRel, DEI_FT<"di", GPR32Opnd>, EI_FM<0>;
+def EI : MMRel, DEI_FT<"ei", GPR32Opnd>, EI_FM<1>, ISA_MIPS32R2;
+def DI : MMRel, DEI_FT<"di", GPR32Opnd>, EI_FM<0>, ISA_MIPS32R2;
 
-let Predicates = [NotInMicroMips] in {
+let EncodingPredicates = []<Predicate>, // FIXME: Lack of HasStdEnc is probably a bug
+    AdditionalPredicates = [NotInMicroMips] in {
 def WAIT : WAIT_FT<"wait">, WAIT_FM;
 
 /// Load-linked, Store-conditional
-def LL : LLBase<"ll", GPR32Opnd>, LW_FM<0x30>;
-def SC : SCBase<"sc", GPR32Opnd>, LW_FM<0x38>;
+def LL : LLBase<"ll", GPR32Opnd>, LW_FM<0x30>, ISA_MIPS2;
+def SC : SCBase<"sc", GPR32Opnd>, LW_FM<0x38>, ISA_MIPS2;
 }
 
 /// Jump and Branch Instructions
 def J       : MMRel, JumpFJ<jmptarget, "j", br, bb, "j">, FJ<2>,
-              Requires<[RelocStatic, HasStdEnc]>, IsBranch;
+              AdditionalRequires<[RelocStatic]>, IsBranch;
 def JR      : MMRel, IndirectBranch<"jr", GPR32Opnd>, MTLO_FM<8>;
 def BEQ     : MMRel, CBranch<"beq", brtarget, seteq, GPR32Opnd>, BEQ_FM<4>;
 def BNE     : MMRel, CBranch<"bne", brtarget, setne, GPR32Opnd>, BEQ_FM<5>;
@@ -1056,7 +1160,7 @@ def BLTZ    : MMRel, CBranchZero<"bltz", brtarget, setlt, GPR32Opnd>,
 def B       : UncondBranch<BEQ>;
 
 def JAL  : MMRel, JumpLink<"jal", calltarget>, FJ<3>;
-let Predicates = [NotInMicroMips, HasStdEnc] in {
+let AdditionalPredicates = [NotInMicroMips] in {
 def JALR : JumpLinkReg<"jalr", GPR32Opnd>, JALR_FM;
 def JALRPseudo : JumpLinkRegPseudo<GPR32Opnd, JALR, RA>;
 }
@@ -1102,21 +1206,24 @@ def UDIV  : MMRel, Div<"divu", II_DIVU, GPR32Opnd, [HI0, LO0]>,
 
 def MTHI : MMRel, MoveToLOHI<"mthi", GPR32Opnd, [HI0]>, MTLO_FM<0x11>;
 def MTLO : MMRel, MoveToLOHI<"mtlo", GPR32Opnd, [LO0]>, MTLO_FM<0x13>;
-let Predicates = [NotInMicroMips] in {
+let EncodingPredicates = []<Predicate>, // FIXME: Lack of HasStdEnc is probably a bug
+    AdditionalPredicates = [NotInMicroMips] in {
 def MFHI : MMRel, MoveFromLOHI<"mfhi", GPR32Opnd, AC0>, MFLO_FM<0x10>;
 def MFLO : MMRel, MoveFromLOHI<"mflo", GPR32Opnd, AC0>, MFLO_FM<0x12>;
 }
 
 /// Sign Ext In Register Instructions.
-def SEB : MMRel, SignExtInReg<"seb", i8, GPR32Opnd, II_SEB>, SEB_FM<0x10, 0x20>;
-def SEH : MMRel, SignExtInReg<"seh", i16, GPR32Opnd, II_SEH>, SEB_FM<0x18, 0x20>;
+def SEB : MMRel, SignExtInReg<"seb", i8, GPR32Opnd, II_SEB>,
+          SEB_FM<0x10, 0x20>, ISA_MIPS32R2;
+def SEH : MMRel, SignExtInReg<"seh", i16, GPR32Opnd, II_SEH>,
+          SEB_FM<0x18, 0x20>, ISA_MIPS32R2;
 
 /// Count Leading
-def CLZ : MMRel, CountLeading0<"clz", GPR32Opnd>, CLO_FM<0x20>;
-def CLO : MMRel, CountLeading1<"clo", GPR32Opnd>, CLO_FM<0x21>;
+def CLZ : MMRel, CountLeading0<"clz", GPR32Opnd>, CLO_FM<0x20>, ISA_MIPS32;
+def CLO : MMRel, CountLeading1<"clo", GPR32Opnd>, CLO_FM<0x21>, ISA_MIPS32;
 
 /// Word Swap Bytes Within Halfwords
-def WSBH : MMRel, SubwordSwap<"wsbh", GPR32Opnd>, SEB_FM<2, 0x20>;
+def WSBH : MMRel, SubwordSwap<"wsbh", GPR32Opnd>, SEB_FM<2, 0x20>, ISA_MIPS32R2;
 
 /// No operation.
 def NOP : PseudoSE<(outs), (ins), []>, PseudoInstExpansion<(SLL ZERO, ZERO, 0)>;
@@ -1128,12 +1235,12 @@ def NOP : PseudoSE<(outs), (ins), []>, PseudoInstExpansion<(SLL ZERO, ZERO, 0)>;
 def LEA_ADDiu : MMRel, EffectiveAddress<"addiu", GPR32Opnd>, LW_FM<9>;
 
 // MADD*/MSUB*
-def MADD  : MMRel, MArithR<"madd", II_MADD, 1>, MULT_FM<0x1c, 0>;
-def MADDU : MMRel, MArithR<"maddu", II_MADDU, 1>, MULT_FM<0x1c, 1>;
-def MSUB  : MMRel, MArithR<"msub", II_MSUB>, MULT_FM<0x1c, 4>;
-def MSUBU : MMRel, MArithR<"msubu", II_MSUBU>, MULT_FM<0x1c, 5>;
+def MADD  : MMRel, MArithR<"madd", II_MADD, 1>, MULT_FM<0x1c, 0>, ISA_MIPS32;
+def MADDU : MMRel, MArithR<"maddu", II_MADDU, 1>, MULT_FM<0x1c, 1>, ISA_MIPS32;
+def MSUB  : MMRel, MArithR<"msub", II_MSUB>, MULT_FM<0x1c, 4>, ISA_MIPS32;
+def MSUBU : MMRel, MArithR<"msubu", II_MSUBU>, MULT_FM<0x1c, 5>, ISA_MIPS32;
 
-let Predicates = [HasStdEnc, NotDSP] in {
+let AdditionalPredicates = [NotDSP] in {
 def PseudoMULT  : MultDivPseudo<MULT, ACC64, GPR32Opnd, MipsMult, II_MULT>;
 def PseudoMULTu : MultDivPseudo<MULTu, ACC64, GPR32Opnd, MipsMultu, II_MULTU>;
 def PseudoMFHI : PseudoMFLOHI<GPR32, ACC64, MipsMFHI>;
@@ -1156,8 +1263,8 @@ def EXT : MMRel, ExtBase<"ext", GPR32Opnd, uimm5, MipsExt>, EXT_FM<0>;
 def INS : MMRel, InsBase<"ins", GPR32Opnd, uimm5, MipsIns>, EXT_FM<4>;
 
 /// Move Control Registers From/To CPU Registers
-def MFC0 : MFC3OP<"mfc0", GPR32Opnd>, MFC3OP_FM<0x10, 0>;
-def MTC0 : MFC3OP<"mtc0", GPR32Opnd>, MFC3OP_FM<0x10, 4>;
+def MFC0 : MFC3OP<"mfc0", GPR32Opnd>, MFC3OP_FM<0x10, 0>, ISA_MIPS32;
+def MTC0 : MFC3OP<"mtc0", GPR32Opnd>, MFC3OP_FM<0x10, 4>, ISA_MIPS32;
 def MFC2 : MFC3OP<"mfc2", GPR32Opnd>, MFC3OP_FM<0x12, 0>;
 def MTC2 : MFC3OP<"mtc2", GPR32Opnd>, MFC3OP_FM<0x12, 4>;
 
@@ -1165,67 +1272,94 @@ class Barrier<string asmstr> : InstSE<(outs), (ins), asmstr, [], NoItinerary,
                                       FrmOther>;
 def SSNOP : Barrier<"ssnop">, BARRIER_FM<1>;
 def EHB : Barrier<"ehb">, BARRIER_FM<3>;
-def PAUSE : Barrier<"pause">, BARRIER_FM<5>, Requires<[HasMips32r2]>;
+def PAUSE : Barrier<"pause">, BARRIER_FM<5>, ISA_MIPS32R2;
+
+class TLB<string asmstr> : InstSE<(outs), (ins), asmstr, [], NoItinerary,
+                                      FrmOther>;
+def TLBP : TLB<"tlbp">, COP0_TLB_FM<0x08>;
+def TLBR : TLB<"tlbr">, COP0_TLB_FM<0x01>;
+def TLBWI : TLB<"tlbwi">, COP0_TLB_FM<0x02>;
+def TLBWR : TLB<"tlbwr">, COP0_TLB_FM<0x06>;
 
 //===----------------------------------------------------------------------===//
 // Instruction aliases
 //===----------------------------------------------------------------------===//
-def : InstAlias<"move $dst, $src",
-                (ADDu GPR32Opnd:$dst, GPR32Opnd:$src,ZERO), 1>,
-      Requires<[NotMips64, NotInMicroMips]>;
-def : InstAlias<"bal $offset", (BGEZAL ZERO, brtarget:$offset), 0>;
-def : InstAlias<"addu $rs, $rt, $imm",
-                (ADDiu GPR32Opnd:$rs, GPR32Opnd:$rt, simm16:$imm), 0>;
-def : InstAlias<"add $rs, $rt, $imm",
-                (ADDi GPR32Opnd:$rs, GPR32Opnd:$rt, simm16:$imm), 0>;
-def : InstAlias<"and $rs, $rt, $imm",
-                (ANDi GPR32Opnd:$rs, GPR32Opnd:$rt, simm16:$imm), 0>;
-def : InstAlias<"j $rs", (JR GPR32Opnd:$rs), 0>;
+def : MipsInstAlias<"move $dst, $src",
+                    (ADDu GPR32Opnd:$dst, GPR32Opnd:$src,ZERO), 1>,
+      GPR_32 {
+  let AdditionalPredicates = [NotInMicroMips];
+}
+def : MipsInstAlias<"bal $offset", (BGEZAL ZERO, brtarget:$offset), 0>;
+def : MipsInstAlias<"addu $rs, $rt, $imm",
+                    (ADDiu GPR32Opnd:$rs, GPR32Opnd:$rt, simm16:$imm), 0>;
+def : MipsInstAlias<"add $rs, $rt, $imm",
+                    (ADDi GPR32Opnd:$rs, GPR32Opnd:$rt, simm16:$imm), 0>;
+def : MipsInstAlias<"and $rs, $rt, $imm",
+                    (ANDi GPR32Opnd:$rs, GPR32Opnd:$rt, simm16:$imm), 0>;
+def : MipsInstAlias<"j $rs", (JR GPR32Opnd:$rs), 0>;
 let Predicates = [NotInMicroMips] in {
-def : InstAlias<"jalr $rs", (JALR RA, GPR32Opnd:$rs), 0>;
-}
-def : InstAlias<"jal $rs", (JALR RA, GPR32Opnd:$rs), 0>;
-def : InstAlias<"jal $rd,$rs", (JALR GPR32Opnd:$rd, GPR32Opnd:$rs), 0>;
-def : InstAlias<"not $rt, $rs",
-                (NOR GPR32Opnd:$rt, GPR32Opnd:$rs, ZERO), 0>;
-def : InstAlias<"neg $rt, $rs",
-                (SUB GPR32Opnd:$rt, ZERO, GPR32Opnd:$rs), 1>;
-def : InstAlias<"negu $rt, $rs",
-                (SUBu GPR32Opnd:$rt, ZERO, GPR32Opnd:$rs), 1>;
-def : InstAlias<"slt $rs, $rt, $imm",
-                (SLTi GPR32Opnd:$rs, GPR32Opnd:$rt, simm16:$imm), 0>;
-def : InstAlias<"xor $rs, $rt, $imm",
-                (XORi GPR32Opnd:$rs, GPR32Opnd:$rt, uimm16:$imm), 0>;
-def : InstAlias<"or $rs, $rt, $imm",
-                (ORi GPR32Opnd:$rs, GPR32Opnd:$rt, uimm16:$imm), 0>;
-def : InstAlias<"nop", (SLL ZERO, ZERO, 0), 1>;
-def : InstAlias<"mfc0 $rt, $rd", (MFC0 GPR32Opnd:$rt, GPR32Opnd:$rd, 0), 0>;
-def : InstAlias<"mtc0 $rt, $rd", (MTC0 GPR32Opnd:$rt, GPR32Opnd:$rd, 0), 0>;
-def : InstAlias<"mfc2 $rt, $rd", (MFC2 GPR32Opnd:$rt, GPR32Opnd:$rd, 0), 0>;
-def : InstAlias<"mtc2 $rt, $rd", (MTC2 GPR32Opnd:$rt, GPR32Opnd:$rd, 0), 0>;
-def : InstAlias<"b $offset", (BEQ ZERO, ZERO, brtarget:$offset), 0>;
-def : InstAlias<"bnez $rs,$offset",
-                (BNE GPR32Opnd:$rs, ZERO, brtarget:$offset), 0>;
-def : InstAlias<"beqz $rs,$offset",
-                (BEQ GPR32Opnd:$rs, ZERO, brtarget:$offset), 0>;
-def : InstAlias<"syscall", (SYSCALL 0), 1>;
-
-def : InstAlias<"break $imm", (BREAK uimm10:$imm, 0), 1>;
-def : InstAlias<"break", (BREAK 0, 0), 1>;
-def : InstAlias<"ei", (EI ZERO), 1>;
-def : InstAlias<"di", (DI ZERO), 1>;
-
-def  : InstAlias<"teq $rs, $rt", (TEQ GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>;
-def  : InstAlias<"tge $rs, $rt", (TGE GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>;
-def  : InstAlias<"tgeu $rs, $rt", (TGEU GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>;
-def  : InstAlias<"tlt $rs, $rt", (TLT GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>;
-def  : InstAlias<"tltu $rs, $rt", (TLTU GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>;
-def  : InstAlias<"tne $rs, $rt", (TNE GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>;
-def : InstAlias<"sub, $rd, $rs, $imm",
-                (ADDi GPR32Opnd:$rd, GPR32Opnd:$rs, InvertedImOperand:$imm)>;
-def : InstAlias<"subu, $rd, $rs, $imm",
-                (ADDiu GPR32Opnd:$rd, GPR32Opnd:$rs, InvertedImOperand:$imm)>;
-
+def : MipsInstAlias<"jalr $rs", (JALR RA, GPR32Opnd:$rs), 0>;
+}
+def : MipsInstAlias<"jal $rs", (JALR RA, GPR32Opnd:$rs), 0>;
+def : MipsInstAlias<"jal $rd,$rs", (JALR GPR32Opnd:$rd, GPR32Opnd:$rs), 0>;
+def : MipsInstAlias<"not $rt, $rs",
+                    (NOR GPR32Opnd:$rt, GPR32Opnd:$rs, ZERO), 0>;
+def : MipsInstAlias<"neg $rt, $rs",
+                    (SUB GPR32Opnd:$rt, ZERO, GPR32Opnd:$rs), 1>;
+def : MipsInstAlias<"negu $rt",
+                    (SUBu GPR32Opnd:$rt, ZERO, GPR32Opnd:$rt), 0>;
+def : MipsInstAlias<"negu $rt, $rs",
+                    (SUBu GPR32Opnd:$rt, ZERO, GPR32Opnd:$rs), 1>;
+def : MipsInstAlias<"slt $rs, $rt, $imm",
+                    (SLTi GPR32Opnd:$rs, GPR32Opnd:$rt, simm16:$imm), 0>;
+def : MipsInstAlias<"sltu $rt, $rs, $imm",
+                    (SLTiu GPR32Opnd:$rt, GPR32Opnd:$rs, simm16:$imm), 0>;
+def : MipsInstAlias<"xor $rs, $rt, $imm",
+                    (XORi GPR32Opnd:$rs, GPR32Opnd:$rt, uimm16:$imm), 0>;
+def : MipsInstAlias<"or $rs, $rt, $imm",
+                    (ORi GPR32Opnd:$rs, GPR32Opnd:$rt, uimm16:$imm), 0>;
+def : MipsInstAlias<"nop", (SLL ZERO, ZERO, 0), 1>;
+def : MipsInstAlias<"mfc0 $rt, $rd", (MFC0 GPR32Opnd:$rt, GPR32Opnd:$rd, 0), 0>;
+def : MipsInstAlias<"mtc0 $rt, $rd", (MTC0 GPR32Opnd:$rt, GPR32Opnd:$rd, 0), 0>;
+def : MipsInstAlias<"mfc2 $rt, $rd", (MFC2 GPR32Opnd:$rt, GPR32Opnd:$rd, 0), 0>;
+def : MipsInstAlias<"mtc2 $rt, $rd", (MTC2 GPR32Opnd:$rt, GPR32Opnd:$rd, 0), 0>;
+def : MipsInstAlias<"b $offset", (BEQ ZERO, ZERO, brtarget:$offset), 0>;
+def : MipsInstAlias<"bnez $rs,$offset",
+                    (BNE GPR32Opnd:$rs, ZERO, brtarget:$offset), 0>;
+def : MipsInstAlias<"beqz $rs,$offset",
+                    (BEQ GPR32Opnd:$rs, ZERO, brtarget:$offset), 0>;
+def : MipsInstAlias<"syscall", (SYSCALL 0), 1>;
+    
+def : MipsInstAlias<"break", (BREAK 0, 0), 1>;
+def : MipsInstAlias<"break $imm", (BREAK uimm10:$imm, 0), 1>;
+def : MipsInstAlias<"ei", (EI ZERO), 1>;
+def : MipsInstAlias<"di", (DI ZERO), 1>;
+
+def  : MipsInstAlias<"teq $rs, $rt", (TEQ GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>;
+def  : MipsInstAlias<"tge $rs, $rt", (TGE GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>;
+def  : MipsInstAlias<"tgeu $rs, $rt", (TGEU GPR32Opnd:$rs, GPR32Opnd:$rt, 0),
+                     1>;
+def  : MipsInstAlias<"tlt $rs, $rt", (TLT GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>;
+def  : MipsInstAlias<"tltu $rs, $rt", (TLTU GPR32Opnd:$rs, GPR32Opnd:$rt, 0),
+                     1>;
+def  : MipsInstAlias<"tne $rs, $rt", (TNE GPR32Opnd:$rs, GPR32Opnd:$rt, 0), 1>;
+def  : MipsInstAlias<"sll $rd, $rt, $rs",
+                     (SLLV GPR32Opnd:$rd, GPR32Opnd:$rt, GPR32Opnd:$rs), 0>;
+def : MipsInstAlias<"sub, $rd, $rs, $imm",
+                    (ADDi GPR32Opnd:$rd, GPR32Opnd:$rs,
+                          InvertedImOperand:$imm), 0>;
+def : MipsInstAlias<"sub $rs, $imm",
+                    (ADDi GPR32Opnd:$rs, GPR32Opnd:$rs, InvertedImOperand:$imm),
+                    0>;
+def : MipsInstAlias<"subu, $rd, $rs, $imm",
+                    (ADDiu GPR32Opnd:$rd, GPR32Opnd:$rs,
+                           InvertedImOperand:$imm), 0>;
+def : MipsInstAlias<"subu $rs, $imm", (ADDiu GPR32Opnd:$rs, GPR32Opnd:$rs,
+                                             InvertedImOperand:$imm), 0>;
+def : MipsInstAlias<"sra $rd, $rt, $rs",
+                    (SRAV GPR32Opnd:$rd, GPR32Opnd:$rt, GPR32Opnd:$rs), 0>;
+def : MipsInstAlias<"srl $rd, $rt, $rs",
+                    (SRLV GPR32Opnd:$rd, GPR32Opnd:$rt, GPR32Opnd:$rs), 0>;
 //===----------------------------------------------------------------------===//
 // Assembler Pseudo Instructions
 //===----------------------------------------------------------------------===//
@@ -1271,7 +1405,7 @@ def : MipsPat<(i32 imm:$imm),
 // Carry MipsPatterns
 def : MipsPat<(subc GPR32:$lhs, GPR32:$rhs),
               (SUBu GPR32:$lhs, GPR32:$rhs)>;
-let Predicates = [HasStdEnc, NotDSP] in {
+let AdditionalPredicates = [NotDSP] in {
   def : MipsPat<(addc GPR32:$lhs, GPR32:$rhs),
                 (ADDu GPR32:$lhs, GPR32:$rhs)>;
   def : MipsPat<(addc  GPR32:$src, immSExt16:$imm),
@@ -1340,14 +1474,11 @@ def : MipsPat<(not GPR32:$in),
               (NOR GPR32Opnd:$in, ZERO)>;
 
 // extended loads
-let Predicates = [HasStdEnc] in {
-  def : MipsPat<(i32 (extloadi1  addr:$src)), (LBu addr:$src)>;
-  def : MipsPat<(i32 (extloadi8  addr:$src)), (LBu addr:$src)>;
-  def : MipsPat<(i32 (extloadi16 addr:$src)), (LHu addr:$src)>;
-}
+def : MipsPat<(i32 (extloadi1  addr:$src)), (LBu addr:$src)>;
+def : MipsPat<(i32 (extloadi8  addr:$src)), (LBu addr:$src)>;
+def : MipsPat<(i32 (extloadi16 addr:$src)), (LHu addr:$src)>;
 
 // peepholes
-let Predicates = [HasStdEnc] in
 def : MipsPat<(store (i32 0), addr:$dst), (SW ZERO, addr:$dst)>;
 
 // brcond patterns
@@ -1441,11 +1572,9 @@ def : MipsPat<(bswap GPR32:$rt), (ROTR (WSBH GPR32:$rt), 16)>;
 
 // Load halfword/word patterns.
 let AddedComplexity = 40 in {
-  let Predicates = [HasStdEnc] in {
-    def : LoadRegImmPat<LBu, i32, zextloadi8>;
-    def : LoadRegImmPat<LH, i32, sextloadi16>;
-    def : LoadRegImmPat<LW, i32, load>;
-  }
+  def : LoadRegImmPat<LBu, i32, zextloadi8>;
+  def : LoadRegImmPat<LH, i32, sextloadi16>;
+  def : LoadRegImmPat<LW, i32, load>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -1456,6 +1585,9 @@ include "MipsInstrFPU.td"
 include "Mips64InstrInfo.td"
 include "MipsCondMov.td"
 
+include "Mips32r6InstrInfo.td"
+include "Mips64r6InstrInfo.td"
+
 //
 // Mips16
 
diff --git a/lib/Target/Mips/MipsJITInfo.cpp b/lib/Target/Mips/MipsJITInfo.cpp
index d76cb1d..2072488 100644
--- a/lib/Target/Mips/MipsJITInfo.cpp
+++ b/lib/Target/Mips/MipsJITInfo.cpp
@@ -11,7 +11,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "jit"
 #include "MipsJITInfo.h"
 #include "MipsInstrInfo.h"
 #include "MipsRelocations.h"
@@ -25,6 +24,8 @@
 #include <cstdlib>
 using namespace llvm;
 
+#define DEBUG_TYPE "jit"
+
 
 void MipsJITInfo::replaceMachineCodeForFunction(void *Old, void *New) {
   unsigned NewAddr = (intptr_t)New;
diff --git a/lib/Target/Mips/MipsJITInfo.h b/lib/Target/Mips/MipsJITInfo.h
index ecda310..c9dfd83 100644
--- a/lib/Target/Mips/MipsJITInfo.h
+++ b/lib/Target/Mips/MipsJITInfo.h
@@ -37,26 +37,26 @@ class MipsJITInfo : public TargetJITInfo {
     /// overwriting OLD with a branch to NEW.  This is used for self-modifying
     /// code.
     ///
-    virtual void replaceMachineCodeForFunction(void *Old, void *New);
+    void replaceMachineCodeForFunction(void *Old, void *New) override;
 
     // getStubLayout - Returns the size and alignment of the largest call stub
     // on Mips.
-    virtual StubLayout getStubLayout();
+    StubLayout getStubLayout() override;
 
     /// emitFunctionStub - Use the specified JITCodeEmitter object to emit a
     /// small native function that simply calls the function at the specified
     /// address.
-    virtual void *emitFunctionStub(const Function *F, void *Fn,
-                                   JITCodeEmitter &JCE);
+    void *emitFunctionStub(const Function *F, void *Fn,
+                           JITCodeEmitter &JCE) override;
 
     /// getLazyResolverFunction - Expose the lazy resolver to the JIT.
-    virtual LazyResolverFn getLazyResolverFunction(JITCompilerFn);
+    LazyResolverFn getLazyResolverFunction(JITCompilerFn) override;
 
     /// relocate - Before the JIT can run a block of code that has been emitted,
     /// it must rewrite the code to contain the actual addresses of any
     /// referenced global symbols.
-    virtual void relocate(void *Function, MachineRelocation *MR,
-                          unsigned NumRelocs, unsigned char *GOTBase);
+    void relocate(void *Function, MachineRelocation *MR,
+                  unsigned NumRelocs, unsigned char *GOTBase) override;
 
     /// Initialize - Initialize internal stage for the function being JITted.
     void Initialize(const MachineFunction &MF, bool isPIC,
diff --git a/lib/Target/Mips/MipsLongBranch.cpp b/lib/Target/Mips/MipsLongBranch.cpp
index 2b6a874..acfe76e 100644
--- a/lib/Target/Mips/MipsLongBranch.cpp
+++ b/lib/Target/Mips/MipsLongBranch.cpp
@@ -10,14 +10,9 @@
 // This pass expands a branch or jump instruction into a long branch if its
 // offset is too large to fit into its immediate field.
 //
-// FIXME:
-// 1. Fix pc-region jump instructions which cross 256MB segment boundaries.
-// 2. If program has inline assembly statements whose size cannot be
-//    determined accurately, load branch target addresses from the GOT.
+// FIXME: Fix pc-region jump instructions which cross 256MB segment boundaries.
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "mips-long-branch"
-
 #include "Mips.h"
 #include "MCTargetDesc/MipsBaseInfo.h"
 #include "MipsTargetMachine.h"
@@ -33,6 +28,8 @@
 
 using namespace llvm;
 
+#define DEBUG_TYPE "mips-long-branch"
+
 STATISTIC(LongBranches, "Number of long branches.");
 
 static cl::opt<bool> SkipLongBranch(
@@ -56,7 +53,7 @@ namespace {
     bool HasLongBranch;
     MachineInstr *Br;
 
-    MBBInfo() : Size(0), HasLongBranch(false), Br(0) {}
+    MBBInfo() : Size(0), HasLongBranch(false), Br(nullptr) {}
   };
 
   class MipsLongBranch : public MachineFunctionPass {
@@ -67,13 +64,13 @@ namespace {
       : MachineFunctionPass(ID), TM(tm),
         IsPIC(TM.getRelocationModel() == Reloc::PIC_),
         ABI(TM.getSubtarget<MipsSubtarget>().getTargetABI()),
-        LongBranchSeqSize(!IsPIC ? 2 : (ABI == MipsSubtarget::N64 ? 13 : 9)) {}
+        LongBranchSeqSize(!IsPIC ? 2 : (ABI == MipsSubtarget::N64 ? 10 : 9)) {}
 
-    virtual const char *getPassName() const {
+    const char *getPassName() const override {
       return "Mips Long Branch";
     }
 
-    bool runOnMachineFunction(MachineFunction &F);
+    bool runOnMachineFunction(MachineFunction &F) override;
 
   private:
     void splitMBB(MachineBasicBlock *MBB);
@@ -111,7 +108,7 @@ static MachineBasicBlock *getTargetMBB(const MachineInstr &Br) {
   }
 
   assert(false && "This instruction does not have an MBB operand.");
-  return 0;
+  return nullptr;
 }
 
 // Traverse the list of instructions backwards until a non-debug instruction is
@@ -267,20 +264,14 @@ void MipsLongBranch::expandToLongBranch(MBBInfo &I) {
     LongBrMBB->addSuccessor(BalTgtMBB);
     BalTgtMBB->addSuccessor(TgtMBB);
 
-    int64_t TgtAddress = MBBInfos[TgtMBB->getNumber()].Address;
-    unsigned BalTgtMBBSize = 5;
-    int64_t Offset = TgtAddress - (I.Address + I.Size - BalTgtMBBSize * 4);
-    int64_t Lo = SignExtend64<16>(Offset & 0xffff);
-    int64_t Hi = SignExtend64<16>(((Offset + 0x8000) >> 16) & 0xffff);
-
     if (ABI != MipsSubtarget::N64) {
       // $longbr:
       //  addiu $sp, $sp, -8
       //  sw $ra, 0($sp)
-      //  bal $baltgt
       //  lui $at, %hi($tgt - $baltgt)
-      // $baltgt:
+      //  bal $baltgt
       //  addiu $at, $at, %lo($tgt - $baltgt)
+      // $baltgt:
       //  addu $at, $ra, $at
       //  lw $ra, 0($sp)
       //  jr $at
@@ -295,14 +286,31 @@ void MipsLongBranch::expandToLongBranch(MBBInfo &I) {
       BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::SW)).addReg(Mips::RA)
         .addReg(Mips::SP).addImm(0);
 
+      // LUi and ADDiu instructions create 32-bit offset of the target basic
+      // block from the target of BAL instruction.  We cannot use immediate
+      // value for this offset because it cannot be determined accurately when
+      // the program has inline assembly statements.  We therefore use the
+      // relocation expressions %hi($tgt-$baltgt) and %lo($tgt-$baltgt) which
+      // are resolved during the fixup, so the values will always be correct.
+      //
+      // Since we cannot create %hi($tgt-$baltgt) and %lo($tgt-$baltgt)
+      // expressions at this point (it is possible only at the MC layer),
+      // we replace LUi and ADDiu with pseudo instructions
+      // LONG_BRANCH_LUi and LONG_BRANCH_ADDiu, and add both basic
+      // blocks as operands to these instructions.  When lowering these pseudo
+      // instructions to LUi and ADDiu in the MC layer, we will create
+      // %hi($tgt-$baltgt) and %lo($tgt-$baltgt) expressions and add them as
+      // operands to lowered instructions.
+
+      BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::LONG_BRANCH_LUi), Mips::AT)
+        .addMBB(TgtMBB).addMBB(BalTgtMBB);
       MIBundleBuilder(*LongBrMBB, Pos)
         .append(BuildMI(*MF, DL, TII->get(Mips::BAL_BR)).addMBB(BalTgtMBB))
-        .append(BuildMI(*MF, DL, TII->get(Mips::LUi), Mips::AT).addImm(Hi));
+        .append(BuildMI(*MF, DL, TII->get(Mips::LONG_BRANCH_ADDiu), Mips::AT)
+                  .addReg(Mips::AT).addMBB(TgtMBB).addMBB(BalTgtMBB));
 
       Pos = BalTgtMBB->begin();
 
-      BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::ADDiu), Mips::AT)
-        .addReg(Mips::AT).addImm(Lo);
       BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::ADDu), Mips::AT)
         .addReg(Mips::RA).addReg(Mips::AT);
       BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::LW), Mips::RA)
@@ -316,14 +324,11 @@ void MipsLongBranch::expandToLongBranch(MBBInfo &I) {
       // $longbr:
       //  daddiu $sp, $sp, -16
       //  sd $ra, 0($sp)
-      //  lui64 $at, %highest($tgt - $baltgt)
-      //  daddiu $at, $at, %higher($tgt - $baltgt)
+      //  daddiu $at, $zero, %hi($tgt - $baltgt)
       //  dsll $at, $at, 16
-      //  daddiu $at, $at, %hi($tgt - $baltgt)
       //  bal $baltgt
-      //  dsll $at, $at, 16
-      // $baltgt:
       //  daddiu $at, $at, %lo($tgt - $baltgt)
+      // $baltgt:
       //  daddu $at, $ra, $at
       //  ld $ra, 0($sp)
       //  jr64 $at
@@ -331,9 +336,20 @@ void MipsLongBranch::expandToLongBranch(MBBInfo &I) {
       // $fallthrough:
       //
 
-      int64_t Higher = SignExtend64<16>(((Offset + 0x80008000) >> 32) & 0xffff);
-      int64_t Highest =
-        SignExtend64<16>(((Offset + 0x800080008000LL) >> 48) & 0xffff);
+      // We assume the branch is within-function, and that offset is within
+      // +/- 2GB.  High 32 bits will therefore always be zero.
+
+      // Note that this will work even if the offset is negative, because
+      // of the +1 modification that's added in that case.  For example, if the
+      // offset is -1MB (0xFFFFFFFFFFF00000), the computation for %higher is
+      //
+      // 0xFFFFFFFFFFF00000 + 0x80008000 = 0x000000007FF08000
+      //
+      // and the bits [47:32] are zero.  For %highest
+      //
+      // 0xFFFFFFFFFFF00000 + 0x800080008000 = 0x000080007FF08000
+      //
+      // and the bits [63:48] are zero.
 
       Pos = LongBrMBB->begin();
 
@@ -341,24 +357,21 @@ void MipsLongBranch::expandToLongBranch(MBBInfo &I) {
         .addReg(Mips::SP_64).addImm(-16);
       BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::SD)).addReg(Mips::RA_64)
         .addReg(Mips::SP_64).addImm(0);
-      BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::LUi64), Mips::AT_64)
-        .addImm(Highest);
-      BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::DADDiu), Mips::AT_64)
-        .addReg(Mips::AT_64).addImm(Higher);
+      BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::LONG_BRANCH_DADDiu),
+              Mips::AT_64).addReg(Mips::ZERO_64)
+                          .addMBB(TgtMBB, MipsII::MO_ABS_HI).addMBB(BalTgtMBB);
       BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::DSLL), Mips::AT_64)
         .addReg(Mips::AT_64).addImm(16);
-      BuildMI(*LongBrMBB, Pos, DL, TII->get(Mips::DADDiu), Mips::AT_64)
-        .addReg(Mips::AT_64).addImm(Hi);
 
       MIBundleBuilder(*LongBrMBB, Pos)
         .append(BuildMI(*MF, DL, TII->get(Mips::BAL_BR)).addMBB(BalTgtMBB))
-        .append(BuildMI(*MF, DL, TII->get(Mips::DSLL), Mips::AT_64)
-                .addReg(Mips::AT_64).addImm(16));
+        .append(BuildMI(*MF, DL, TII->get(Mips::LONG_BRANCH_DADDiu),
+                        Mips::AT_64).addReg(Mips::AT_64)
+                                    .addMBB(TgtMBB, MipsII::MO_ABS_LO)
+                                    .addMBB(BalTgtMBB));
 
       Pos = BalTgtMBB->begin();
 
-      BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::DADDiu), Mips::AT_64)
-        .addReg(Mips::AT_64).addImm(Lo);
       BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::DADDu), Mips::AT_64)
         .addReg(Mips::RA_64).addReg(Mips::AT_64);
       BuildMI(*BalTgtMBB, Pos, DL, TII->get(Mips::LD), Mips::RA_64)
@@ -370,8 +383,7 @@ void MipsLongBranch::expandToLongBranch(MBBInfo &I) {
                 .addReg(Mips::SP_64).addImm(16));
     }
 
-    assert(BalTgtMBBSize == BalTgtMBB->size());
-    assert(LongBrMBB->size() + BalTgtMBBSize == LongBranchSeqSize);
+    assert(LongBrMBB->size() + BalTgtMBB->size() == LongBranchSeqSize);
   } else {
     // $longbr:
     //  j $tgt
diff --git a/lib/Target/Mips/MipsMCInstLower.cpp b/lib/Target/Mips/MipsMCInstLower.cpp
index 7c9a9ed..821392e 100644
--- a/lib/Target/Mips/MipsMCInstLower.cpp
+++ b/lib/Target/Mips/MipsMCInstLower.cpp
@@ -151,7 +151,75 @@ MCOperand MipsMCInstLower::LowerOperand(const MachineOperand &MO,
   return MCOperand();
 }
 
+MCOperand MipsMCInstLower::createSub(MachineBasicBlock *BB1,
+                                     MachineBasicBlock *BB2,
+                                     MCSymbolRefExpr::VariantKind Kind) const {
+  const MCSymbolRefExpr *Sym1 = MCSymbolRefExpr::Create(BB1->getSymbol(), *Ctx);
+  const MCSymbolRefExpr *Sym2 = MCSymbolRefExpr::Create(BB2->getSymbol(), *Ctx);
+  const MCBinaryExpr *Sub = MCBinaryExpr::CreateSub(Sym1, Sym2, *Ctx);
+
+  return MCOperand::CreateExpr(MipsMCExpr::Create(Kind, Sub, *Ctx));
+}
+
+void MipsMCInstLower::
+lowerLongBranchLUi(const MachineInstr *MI, MCInst &OutMI) const {
+  OutMI.setOpcode(Mips::LUi);
+
+  // Lower register operand.
+  OutMI.addOperand(LowerOperand(MI->getOperand(0)));
+
+  // Create %hi($tgt-$baltgt).
+  OutMI.addOperand(createSub(MI->getOperand(1).getMBB(),
+                             MI->getOperand(2).getMBB(),
+                             MCSymbolRefExpr::VK_Mips_ABS_HI));
+}
+
+void MipsMCInstLower::
+lowerLongBranchADDiu(const MachineInstr *MI, MCInst &OutMI, int Opcode,
+                     MCSymbolRefExpr::VariantKind Kind) const {
+  OutMI.setOpcode(Opcode);
+
+  // Lower two register operands.
+  for (unsigned I = 0, E = 2; I != E; ++I) {
+    const MachineOperand &MO = MI->getOperand(I);
+    OutMI.addOperand(LowerOperand(MO));
+  }
+
+  // Create %lo($tgt-$baltgt) or %hi($tgt-$baltgt).
+  OutMI.addOperand(createSub(MI->getOperand(2).getMBB(),
+                             MI->getOperand(3).getMBB(), Kind));
+}
+
+bool MipsMCInstLower::lowerLongBranch(const MachineInstr *MI,
+                                      MCInst &OutMI) const {
+  switch (MI->getOpcode()) {
+  default:
+    return false;
+  case Mips::LONG_BRANCH_LUi:
+    lowerLongBranchLUi(MI, OutMI);
+    return true;
+  case Mips::LONG_BRANCH_ADDiu:
+    lowerLongBranchADDiu(MI, OutMI, Mips::ADDiu,
+                         MCSymbolRefExpr::VK_Mips_ABS_LO);
+    return true;
+  case Mips::LONG_BRANCH_DADDiu:
+    unsigned TargetFlags = MI->getOperand(2).getTargetFlags();
+    if (TargetFlags == MipsII::MO_ABS_HI)
+      lowerLongBranchADDiu(MI, OutMI, Mips::DADDiu,
+                           MCSymbolRefExpr::VK_Mips_ABS_HI);
+    else if (TargetFlags == MipsII::MO_ABS_LO)
+      lowerLongBranchADDiu(MI, OutMI, Mips::DADDiu,
+                           MCSymbolRefExpr::VK_Mips_ABS_LO);
+    else
+      report_fatal_error("Unexpected flags for LONG_BRANCH_DADDiu");
+    return true;
+  }
+}
+
 void MipsMCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
+  if (lowerLongBranch(MI, OutMI))
+    return;
+
   OutMI.setOpcode(MI->getOpcode());
 
   for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
diff --git a/lib/Target/Mips/MipsMCInstLower.h b/lib/Target/Mips/MipsMCInstLower.h
index 4570bd9..269190f 100644
--- a/lib/Target/Mips/MipsMCInstLower.h
+++ b/lib/Target/Mips/MipsMCInstLower.h
@@ -9,6 +9,7 @@
 
 #ifndef MIPSMCINSTLOWER_H
 #define MIPSMCINSTLOWER_H
+#include "MCTargetDesc/MipsMCExpr.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/Support/Compiler.h"
@@ -36,6 +37,13 @@ public:
 private:
   MCOperand LowerSymbolOperand(const MachineOperand &MO,
                                MachineOperandType MOTy, unsigned Offset) const;
+  MCOperand createSub(MachineBasicBlock *BB1, MachineBasicBlock *BB2,
+                      MCSymbolRefExpr::VariantKind Kind) const;
+  void lowerLongBranchLUi(const MachineInstr *MI, MCInst &OutMI) const;
+  void lowerLongBranchADDiu(const MachineInstr *MI, MCInst &OutMI,
+                            int Opcode,
+                            MCSymbolRefExpr::VariantKind Kind) const;
+  bool lowerLongBranch(const MachineInstr *MI, MCInst &OutMI) const;
 };
 }
 
diff --git a/lib/Target/Mips/MipsMSAInstrInfo.td b/lib/Target/Mips/MipsMSAInstrInfo.td
index 5722c6c..285bb14 100644
--- a/lib/Target/Mips/MipsMSAInstrInfo.td
+++ b/lib/Target/Mips/MipsMSAInstrInfo.td
@@ -65,10 +65,6 @@ def MipsVExtractZExt : SDNode<"MipsISD::VEXTRACT_ZEXT_ELT",
 
 // Operands
 
-def uimm2 : Operand<i32> {
-  let PrintMethod = "printUnsignedImm";
-}
-
 // The immediate of an LSA instruction needs special handling
 // as the encoded value should be subtracted by one.
 def uimm2LSAAsmOperand : AsmOperandClass {
@@ -84,10 +80,6 @@ def LSAImm : Operand<i32> {
   let ParserMatchClass = uimm2LSAAsmOperand;
 }
 
-def uimm3 : Operand<i32> {
-  let PrintMethod = "printUnsignedImm8";
-}
-
 def uimm4 : Operand<i32> {
   let PrintMethod = "printUnsignedImm8";
 }
@@ -1505,6 +1497,15 @@ class MSA_INSERT_PSEUDO_BASE<SDPatternOperator OpNode, ValueType Ty,
   string Constraints = "$wd = $wd_in";
 }
 
+class MSA_INSERT_VIDX_PSEUDO_BASE<SDPatternOperator OpNode, ValueType Ty,
+                                  RegisterOperand ROWD, RegisterOperand ROFS> :
+      MSAPseudo<(outs ROWD:$wd), (ins ROWD:$wd_in, GPR32Opnd:$n, ROFS:$fs),
+                [(set ROWD:$wd, (OpNode (Ty ROWD:$wd_in), ROFS:$fs,
+                                        GPR32Opnd:$n))]> {
+  bit usesCustomInserter = 1;
+  string Constraints = "$wd = $wd_in";
+}
+
 class MSA_INSVE_DESC_BASE<string instr_asm, SDPatternOperator OpNode,
                           RegisterOperand ROWD, RegisterOperand ROWS = ROWD,
                           InstrItinClass itin = NoItinerary> {
@@ -2300,11 +2301,25 @@ class INSERT_W_DESC : MSA_INSERT_DESC_BASE<"insert.w", vinsert_v4i32,
 class INSERT_D_DESC : MSA_INSERT_DESC_BASE<"insert.d", vinsert_v2i64,
                                            MSA128DOpnd, GPR64Opnd>;
 
+class INSERT_B_VIDX_PSEUDO_DESC :
+    MSA_INSERT_VIDX_PSEUDO_BASE<vector_insert, v16i8, MSA128BOpnd, GPR32Opnd>;
+class INSERT_H_VIDX_PSEUDO_DESC :
+    MSA_INSERT_VIDX_PSEUDO_BASE<vector_insert, v8i16, MSA128HOpnd, GPR32Opnd>;
+class INSERT_W_VIDX_PSEUDO_DESC :
+    MSA_INSERT_VIDX_PSEUDO_BASE<vector_insert, v4i32, MSA128WOpnd, GPR32Opnd>;
+class INSERT_D_VIDX_PSEUDO_DESC :
+    MSA_INSERT_VIDX_PSEUDO_BASE<vector_insert, v2i64, MSA128DOpnd, GPR64Opnd>;
+
 class INSERT_FW_PSEUDO_DESC : MSA_INSERT_PSEUDO_BASE<vector_insert, v4f32,
                                                      MSA128WOpnd, FGR32Opnd>;
 class INSERT_FD_PSEUDO_DESC : MSA_INSERT_PSEUDO_BASE<vector_insert, v2f64,
                                                      MSA128DOpnd, FGR64Opnd>;
 
+class INSERT_FW_VIDX_PSEUDO_DESC :
+    MSA_INSERT_VIDX_PSEUDO_BASE<vector_insert, v4f32, MSA128WOpnd, FGR32Opnd>;
+class INSERT_FD_VIDX_PSEUDO_DESC :
+    MSA_INSERT_VIDX_PSEUDO_BASE<vector_insert, v2f64, MSA128DOpnd, FGR64Opnd>;
+
 class INSVE_B_DESC : MSA_INSVE_DESC_BASE<"insve.b", insve_v16i8,
                                          MSA128BOpnd>;
 class INSVE_H_DESC : MSA_INSVE_DESC_BASE<"insve.h", insve_v8i16,
@@ -3214,6 +3229,13 @@ let DecoderMethod = "DecodeINSVE_DF" in {
 def INSERT_FW_PSEUDO : INSERT_FW_PSEUDO_DESC;
 def INSERT_FD_PSEUDO : INSERT_FD_PSEUDO_DESC;
 
+def INSERT_B_VIDX_PSEUDO : INSERT_B_VIDX_PSEUDO_DESC;
+def INSERT_H_VIDX_PSEUDO : INSERT_H_VIDX_PSEUDO_DESC;
+def INSERT_W_VIDX_PSEUDO : INSERT_W_VIDX_PSEUDO_DESC;
+def INSERT_D_VIDX_PSEUDO : INSERT_D_VIDX_PSEUDO_DESC;
+def INSERT_FW_VIDX_PSEUDO : INSERT_FW_VIDX_PSEUDO_DESC;
+def INSERT_FD_VIDX_PSEUDO : INSERT_FD_VIDX_PSEUDO_DESC;
+
 def LD_B: LD_B_ENC, LD_B_DESC;
 def LD_H: LD_H_ENC, LD_H_DESC;
 def LD_W: LD_W_ENC, LD_W_DESC;
@@ -3731,3 +3753,55 @@ def SZ_D_PSEUDO : MSA_CBRANCH_PSEUDO_DESC_BASE<MipsVAllZero, v2i64,
                                                MSA128D, NoItinerary>;
 def SZ_V_PSEUDO : MSA_CBRANCH_PSEUDO_DESC_BASE<MipsVAnyZero, v16i8,
                                                MSA128B, NoItinerary>;
+
+// Vector extraction with variable index
+def : MSAPat<(i32 (vextract_sext_i8 v16i8:$ws, i32:$idx)),
+             (SRA (COPY_TO_REGCLASS (i32 (EXTRACT_SUBREG (SPLAT_B v16i8:$ws,
+                                                                  i32:$idx),
+                                                         sub_lo)),
+                                    GPR32), (i32 24))>;
+def : MSAPat<(i32 (vextract_sext_i16 v8i16:$ws, i32:$idx)),
+             (SRA (COPY_TO_REGCLASS (i32 (EXTRACT_SUBREG (SPLAT_H v8i16:$ws,
+                                                                  i32:$idx),
+                                                         sub_lo)),
+                                    GPR32), (i32 16))>;
+def : MSAPat<(i32 (vextract_sext_i32 v4i32:$ws, i32:$idx)),
+             (COPY_TO_REGCLASS (i32 (EXTRACT_SUBREG (SPLAT_W v4i32:$ws,
+                                                             i32:$idx),
+                                                    sub_lo)),
+                               GPR32)>;
+def : MSAPat<(i64 (vextract_sext_i64 v2i64:$ws, i32:$idx)),
+             (COPY_TO_REGCLASS (i64 (EXTRACT_SUBREG (SPLAT_D v2i64:$ws,
+                                                             i32:$idx),
+                                                    sub_64)),
+                               GPR64), [HasMSA, IsGP64bit]>;
+
+def : MSAPat<(i32 (vextract_zext_i8 v16i8:$ws, i32:$idx)),
+             (SRL (COPY_TO_REGCLASS (i32 (EXTRACT_SUBREG (SPLAT_B v16i8:$ws,
+                                                                  i32:$idx),
+                                                         sub_lo)),
+                                    GPR32), (i32 24))>;
+def : MSAPat<(i32 (vextract_zext_i16 v8i16:$ws, i32:$idx)),
+             (SRL (COPY_TO_REGCLASS (i32 (EXTRACT_SUBREG (SPLAT_H v8i16:$ws,
+                                                                  i32:$idx),
+                                                         sub_lo)),
+                                    GPR32), (i32 16))>;
+def : MSAPat<(i32 (vextract_zext_i32 v4i32:$ws, i32:$idx)),
+             (COPY_TO_REGCLASS (i32 (EXTRACT_SUBREG (SPLAT_W v4i32:$ws,
+                                                             i32:$idx),
+                                                    sub_lo)),
+                               GPR32)>;
+def : MSAPat<(i64 (vextract_zext_i64 v2i64:$ws, i32:$idx)),
+             (COPY_TO_REGCLASS (i64 (EXTRACT_SUBREG (SPLAT_D v2i64:$ws,
+                                                             i32:$idx),
+                                                    sub_64)),
+                               GPR64), [HasMSA, IsGP64bit]>;
+
+def : MSAPat<(f32 (vector_extract v4f32:$ws, i32:$idx)),
+             (f32 (EXTRACT_SUBREG (SPLAT_W v4f32:$ws,
+                                           i32:$idx),
+                                  sub_lo))>;
+def : MSAPat<(f64 (vector_extract v2f64:$ws, i32:$idx)),
+             (f64 (EXTRACT_SUBREG (SPLAT_D v2f64:$ws,
+                                           i32:$idx),
+                                  sub_64))>;
diff --git a/lib/Target/Mips/MipsMachineFunction.cpp b/lib/Target/Mips/MipsMachineFunction.cpp
index dedf802..e30302e 100644
--- a/lib/Target/Mips/MipsMachineFunction.cpp
+++ b/lib/Target/Mips/MipsMachineFunction.cpp
@@ -27,7 +27,7 @@ FixGlobalBaseReg("mips-fix-global-base-reg", cl::Hidden, cl::init(true),
 MipsCallEntry::MipsCallEntry(const StringRef &N) {
 #ifndef NDEBUG
   Name = N;
-  Val = 0;
+  Val = nullptr;
 #endif
 }
 
@@ -65,9 +65,8 @@ MipsFunctionInfo::~MipsFunctionInfo() {
        ++I)
     delete I->getValue();
 
-  for (ValueMap<const GlobalValue *, const MipsCallEntry *>::iterator
-       I = GlobalCallEntries.begin(), E = GlobalCallEntries.end(); I != E; ++I)
-    delete I->second;
+  for (const auto &Entry : GlobalCallEntries)
+    delete Entry.second;
 }
 
 bool MipsFunctionInfo::globalBaseRegSet() const {
diff --git a/lib/Target/Mips/MipsMachineFunction.h b/lib/Target/Mips/MipsMachineFunction.h
index 3e14c8c..e9101cc 100644
--- a/lib/Target/Mips/MipsMachineFunction.h
+++ b/lib/Target/Mips/MipsMachineFunction.h
@@ -37,12 +37,12 @@ class MipsCallEntry : public PseudoSourceValue {
 public:
   explicit MipsCallEntry(const StringRef &N);
   explicit MipsCallEntry(const GlobalValue *V);
-  virtual bool isConstant(const MachineFrameInfo *) const;
-  virtual bool isAliased(const MachineFrameInfo *) const;
-  virtual bool mayAlias(const MachineFrameInfo *) const;
+  bool isConstant(const MachineFrameInfo *) const override;
+  bool isAliased(const MachineFrameInfo *) const override;
+  bool mayAlias(const MachineFrameInfo *) const override;
 
 private:
-  virtual void printCustom(raw_ostream &O) const;
+  void printCustom(raw_ostream &O) const override;
 #ifndef NDEBUG
   std::string Name;
   const GlobalValue *Val;
diff --git a/lib/Target/Mips/MipsModuleISelDAGToDAG.cpp b/lib/Target/Mips/MipsModuleISelDAGToDAG.cpp
index c6abf17..03c76ea 100644
--- a/lib/Target/Mips/MipsModuleISelDAGToDAG.cpp
+++ b/lib/Target/Mips/MipsModuleISelDAGToDAG.cpp
@@ -14,6 +14,8 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 
+#define DEBUG_TYPE "mips-isel"
+
 namespace llvm {
 
 bool MipsModuleDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
diff --git a/lib/Target/Mips/MipsModuleISelDAGToDAG.h b/lib/Target/Mips/MipsModuleISelDAGToDAG.h
index fda35ae..a96862a 100644
--- a/lib/Target/Mips/MipsModuleISelDAGToDAG.h
+++ b/lib/Target/Mips/MipsModuleISelDAGToDAG.h
@@ -41,15 +41,11 @@ public:
       TM(TM_), Subtarget(TM.getSubtarget<MipsSubtarget>()) {}
 
   // Pass Name
-  virtual const char *getPassName() const {
+  const char *getPassName() const override {
     return "MIPS DAG->DAG Pattern Instruction Selection";
   }
 
-  virtual bool runOnMachineFunction(MachineFunction &MF);
-
-  virtual SDNode *Select(SDNode *N) {
-    llvm_unreachable("unexpected");
-  }
+  bool runOnMachineFunction(MachineFunction &MF) override;
 
 protected:
   /// Keep a pointer to the MipsSubtarget around so that we can make the right
diff --git a/lib/Target/Mips/MipsOptimizePICCall.cpp b/lib/Target/Mips/MipsOptimizePICCall.cpp
index db270f3..c234049 100644
--- a/lib/Target/Mips/MipsOptimizePICCall.cpp
+++ b/lib/Target/Mips/MipsOptimizePICCall.cpp
@@ -12,8 +12,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "optimize-mips-pic-call"
-
 #include "Mips.h"
 #include "MCTargetDesc/MipsBaseInfo.h"
 #include "MipsMachineFunction.h"
@@ -25,6 +23,8 @@
 
 using namespace llvm;
 
+#define DEBUG_TYPE "optimize-mips-pic-call"
+
 static cl::opt<bool> LoadTargetFromGOT("mips-load-target-from-got",
                                        cl::init(true),
                                        cl::desc("Load target address from GOT"),
@@ -35,11 +35,13 @@ static cl::opt<bool> EraseGPOpnd("mips-erase-gp-opnd",
                                  cl::Hidden);
 
 namespace {
+typedef PointerUnion<const Value *, const PseudoSourceValue *> ValueType;
+
 typedef std::pair<unsigned, unsigned> CntRegP;
 typedef RecyclingAllocator<BumpPtrAllocator,
-                           ScopedHashTableVal<const Value *, CntRegP> >
+                           ScopedHashTableVal<ValueType, CntRegP> >
 AllocatorTy;
-typedef ScopedHashTable<const Value *, CntRegP, DenseMapInfo<const Value *>,
+typedef ScopedHashTable<ValueType, CntRegP, DenseMapInfo<ValueType>,
                         AllocatorTy> ScopedHTType;
 
 class MBBInfo {
@@ -59,11 +61,11 @@ class OptimizePICCall : public MachineFunctionPass {
 public:
   OptimizePICCall(TargetMachine &tm) : MachineFunctionPass(ID) {}
 
-  virtual const char *getPassName() const { return "Mips OptimizePICCall"; }
+  const char *getPassName() const override { return "Mips OptimizePICCall"; }
 
-  bool runOnMachineFunction(MachineFunction &F);
+  bool runOnMachineFunction(MachineFunction &F) override;
 
-  void getAnalysisUsage(AnalysisUsage &AU) const {
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<MachineDominatorTree>();
     MachineFunctionPass::getAnalysisUsage(AU);
   }
@@ -78,18 +80,18 @@ private:
   /// and the underlying object in Reg and Val respectively, if the function's
   /// address can be resolved lazily.
   bool isCallViaRegister(MachineInstr &MI, unsigned &Reg,
-                         const Value *&Val) const;
+                         ValueType &Val) const;
 
   /// \brief Return the number of instructions that dominate the current
   /// instruction and load the function address from object Entry.
-  unsigned getCount(const Value *Entry);
+  unsigned getCount(ValueType Entry);
 
   /// \brief Return the destination virtual register of the last instruction
   /// that loads from object Entry.
-  unsigned getReg(const Value *Entry);
+  unsigned getReg(ValueType Entry);
 
   /// \brief Update ScopedHT.
-  void incCntAndSetReg(const Value *Entry, unsigned Reg);
+  void incCntAndSetReg(ValueType Entry, unsigned Reg);
 
   ScopedHTType ScopedHT;
   static char ID;
@@ -101,13 +103,13 @@ char OptimizePICCall::ID = 0;
 /// Return the first MachineOperand of MI if it is a used virtual register.
 static MachineOperand *getCallTargetRegOpnd(MachineInstr &MI) {
   if (MI.getNumOperands() == 0)
-    return 0;
+    return nullptr;
 
   MachineOperand &MO = MI.getOperand(0);
 
   if (!MO.isReg() || !MO.isUse() ||
       !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
-    return 0;
+    return nullptr;
 
   return &MO;
 }
@@ -153,10 +155,10 @@ static void eraseGPOpnd(MachineInstr &MI) {
     }
   }
 
-  llvm_unreachable(0);
+  llvm_unreachable(nullptr);
 }
 
-MBBInfo::MBBInfo(MachineDomTreeNode *N) : Node(N), HTScope(0) {}
+MBBInfo::MBBInfo(MachineDomTreeNode *N) : Node(N), HTScope(nullptr) {}
 
 const MachineDomTreeNode *MBBInfo::getNode() const { return Node; }
 
@@ -210,7 +212,7 @@ bool OptimizePICCall::visitNode(MBBInfo &MBBI) {
   for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;
        ++I) {
     unsigned Reg;
-    const Value *Entry;
+    ValueType Entry;
 
     // Skip instructions that are not call instructions via registers.
     if (!isCallViaRegister(*I, Reg, Entry))
@@ -242,7 +244,7 @@ bool OptimizePICCall::visitNode(MBBInfo &MBBI) {
 }
 
 bool OptimizePICCall::isCallViaRegister(MachineInstr &MI, unsigned &Reg,
-                                        const Value *&Val) const {
+                                        ValueType &Val) const {
   if (!MI.isCall())
     return false;
 
@@ -254,7 +256,7 @@ bool OptimizePICCall::isCallViaRegister(MachineInstr &MI, unsigned &Reg,
 
   // Get the instruction that loads the function address from the GOT.
   Reg = MO->getReg();
-  Val = 0;
+  Val = (Value*)nullptr;
   MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
   MachineInstr *DefMI = MRI.getVRegDef(Reg);
 
@@ -273,20 +275,22 @@ bool OptimizePICCall::isCallViaRegister(MachineInstr &MI, unsigned &Reg,
   // Return the underlying object for the GOT entry in Val.
   assert(DefMI->hasOneMemOperand());
   Val = (*DefMI->memoperands_begin())->getValue();
+  if (!Val)
+    Val = (*DefMI->memoperands_begin())->getPseudoValue();
   return true;
 }
 
-unsigned OptimizePICCall::getCount(const Value *Entry) {
+unsigned OptimizePICCall::getCount(ValueType Entry) {
   return ScopedHT.lookup(Entry).first;
 }
 
-unsigned OptimizePICCall::getReg(const Value *Entry) {
+unsigned OptimizePICCall::getReg(ValueType Entry) {
   unsigned Reg = ScopedHT.lookup(Entry).second;
   assert(Reg);
   return Reg;
 }
 
-void OptimizePICCall::incCntAndSetReg(const Value *Entry, unsigned Reg) {
+void OptimizePICCall::incCntAndSetReg(ValueType Entry, unsigned Reg) {
   CntRegP P = ScopedHT.lookup(Entry);
   ScopedHT.insert(Entry, std::make_pair(P.first + 1, Reg));
 }
diff --git a/lib/Target/Mips/MipsOs16.cpp b/lib/Target/Mips/MipsOs16.cpp
index fe60841..7aae964 100644
--- a/lib/Target/Mips/MipsOs16.cpp
+++ b/lib/Target/Mips/MipsOs16.cpp
@@ -11,13 +11,14 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "mips-os16"
 #include "MipsOs16.h"
 #include "llvm/IR/Module.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 
+#define DEBUG_TYPE "mips-os16"
+
 
 static cl::opt<std::string> Mips32FunctionMask(
   "mips32-function-mask",
diff --git a/lib/Target/Mips/MipsOs16.h b/lib/Target/Mips/MipsOs16.h
index 21beef8..55e5a81 100644
--- a/lib/Target/Mips/MipsOs16.h
+++ b/lib/Target/Mips/MipsOs16.h
@@ -34,11 +34,11 @@ public:
 
   }
 
-  virtual const char *getPassName() const {
+  const char *getPassName() const override {
     return "MIPS Os16 Optimization";
   }
 
-  virtual bool runOnModule(Module &M);
+  bool runOnModule(Module &M) override;
 
 };
 
diff --git a/lib/Target/Mips/MipsRegisterInfo.cpp b/lib/Target/Mips/MipsRegisterInfo.cpp
index d7fc93b..83d25ab 100644
--- a/lib/Target/Mips/MipsRegisterInfo.cpp
+++ b/lib/Target/Mips/MipsRegisterInfo.cpp
@@ -11,8 +11,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "mips-reg-info"
-
 #include "MipsRegisterInfo.h"
 #include "Mips.h"
 #include "MipsAnalyzeImmediate.h"
@@ -37,11 +35,13 @@
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetOptions.h"
 
+using namespace llvm;
+
+#define DEBUG_TYPE "mips-reg-info"
+
 #define GET_REGINFO_TARGET_DESC
 #include "MipsGenRegisterInfo.inc"
 
-using namespace llvm;
-
 MipsRegisterInfo::MipsRegisterInfo(const MipsSubtarget &ST)
   : MipsGenRegisterInfo(Mips::RA), Subtarget(ST) {}
 
@@ -79,8 +79,8 @@ MipsRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
 //===----------------------------------------------------------------------===//
 
 /// Mips Callee Saved Registers
-const uint16_t* MipsRegisterInfo::
-getCalleeSavedRegs(const MachineFunction *MF) const {
+const MCPhysReg *
+MipsRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
   if (Subtarget.isSingleFloat())
     return CSR_SingleFloatOnly_SaveList;
 
@@ -119,11 +119,11 @@ const uint32_t *MipsRegisterInfo::getMips16RetHelperMask() {
 
 BitVector MipsRegisterInfo::
 getReservedRegs(const MachineFunction &MF) const {
-  static const uint16_t ReservedGPR32[] = {
+  static const MCPhysReg ReservedGPR32[] = {
     Mips::ZERO, Mips::K0, Mips::K1, Mips::SP
   };
 
-  static const uint16_t ReservedGPR64[] = {
+  static const MCPhysReg ReservedGPR64[] = {
     Mips::ZERO_64, Mips::K0_64, Mips::K1_64, Mips::SP_64
   };
 
diff --git a/lib/Target/Mips/MipsRegisterInfo.h b/lib/Target/Mips/MipsRegisterInfo.h
index 0450c6f..b34496f 100644
--- a/lib/Target/Mips/MipsRegisterInfo.h
+++ b/lib/Target/Mips/MipsRegisterInfo.h
@@ -43,30 +43,31 @@ public:
 
   /// Code Generation virtual methods...
   const TargetRegisterClass *getPointerRegClass(const MachineFunction &MF,
-                                                unsigned Kind) const;
+                                                unsigned Kind) const override;
 
   unsigned getRegPressureLimit(const TargetRegisterClass *RC,
-                               MachineFunction &MF) const;
-  const uint16_t *getCalleeSavedRegs(const MachineFunction *MF = 0) const;
-  const uint32_t *getCallPreservedMask(CallingConv::ID) const;
+                               MachineFunction &MF) const override;
+  const MCPhysReg *
+  getCalleeSavedRegs(const MachineFunction *MF = nullptr) const override;
+  const uint32_t *getCallPreservedMask(CallingConv::ID) const override;
   static const uint32_t *getMips16RetHelperMask();
 
-  BitVector getReservedRegs(const MachineFunction &MF) const;
+  BitVector getReservedRegs(const MachineFunction &MF) const override;
 
-  virtual bool requiresRegisterScavenging(const MachineFunction &MF) const;
+  bool requiresRegisterScavenging(const MachineFunction &MF) const override;
 
-  virtual bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const;
+  bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const override;
 
   /// Stack Frame Processing Methods
   void eliminateFrameIndex(MachineBasicBlock::iterator II,
                            int SPAdj, unsigned FIOperandNum,
-                           RegScavenger *RS = NULL) const;
+                           RegScavenger *RS = nullptr) const override;
 
   void processFunctionBeforeFrameFinalized(MachineFunction &MF,
-                                       RegScavenger *RS = NULL) const;
+                                       RegScavenger *RS = nullptr) const;
 
   /// Debug information queries.
-  unsigned getFrameRegister(const MachineFunction &MF) const;
+  unsigned getFrameRegister(const MachineFunction &MF) const override;
 
   /// \brief Return GPR register class.
   virtual const TargetRegisterClass *intRegClass(unsigned Size) const = 0;
diff --git a/lib/Target/Mips/MipsRegisterInfo.td b/lib/Target/Mips/MipsRegisterInfo.td
index 834e6c5..875a596 100644
--- a/lib/Target/Mips/MipsRegisterInfo.td
+++ b/lib/Target/Mips/MipsRegisterInfo.td
@@ -205,6 +205,10 @@ let Namespace = "Mips" in {
   foreach I = 0-31 in
   def COP2#I : MipsReg<#I, ""#I>;
 
+  // COP3 registers.
+  foreach I = 0-31 in
+  def COP3#I : MipsReg<#I, ""#I>;
+
   // PC register
   def PC : Register<"pc">;
 
@@ -387,6 +391,10 @@ def DSPCC : RegisterClass<"Mips", [v4i8, v2i16], 32, (add DSPCCond)>;
 def COP2 : RegisterClass<"Mips", [i32], 32, (sequence "COP2%u", 0, 31)>,
            Unallocatable;
 
+// Coprocessor 3 registers.
+def COP3 : RegisterClass<"Mips", [i32], 32, (sequence "COP3%u", 0, 31)>,
+           Unallocatable;
+
 // Octeon multiplier and product registers
 def OCTEON_MPL : RegisterClass<"Mips", [i64], 64, (add MPL0, MPL1, MPL2)>,
                  Unallocatable;
@@ -484,6 +492,10 @@ def COP2AsmOperand : MipsAsmRegOperand {
   let Name = "COP2AsmReg";
 }
 
+def COP3AsmOperand : MipsAsmRegOperand {
+  let Name = "COP3AsmReg";
+}
+
 def HWRegsOpnd : RegisterOperand<HWRegs> {
   let ParserMatchClass = HWRegsAsmOperand;
 }
@@ -524,6 +536,10 @@ def COP2Opnd : RegisterOperand<COP2> {
   let ParserMatchClass = COP2AsmOperand;
 }
 
+def COP3Opnd : RegisterOperand<COP3> {
+  let ParserMatchClass = COP3AsmOperand;
+}
+
 def MSA128BOpnd : RegisterOperand<MSA128B> {
   let ParserMatchClass = MSA128AsmOperand;
 }
diff --git a/lib/Target/Mips/MipsSEFrameLowering.cpp b/lib/Target/Mips/MipsSEFrameLowering.cpp
index 0343a47..6ad5821 100644
--- a/lib/Target/Mips/MipsSEFrameLowering.cpp
+++ b/lib/Target/Mips/MipsSEFrameLowering.cpp
@@ -375,7 +375,8 @@ void MipsSEFrameLowering::emitPrologue(MachineFunction &MF) const {
   // if framepointer enabled, set it to point to the stack pointer.
   if (hasFP(MF)) {
     // Insert instruction "move $fp, $sp" at this location.
-    BuildMI(MBB, MBBI, dl, TII.get(ADDu), FP).addReg(SP).addReg(ZERO);
+    BuildMI(MBB, MBBI, dl, TII.get(ADDu), FP).addReg(SP).addReg(ZERO)
+      .setMIFlag(MachineInstr::FrameSetup);
 
     // emit ".cfi_def_cfa_register $fp"
     unsigned CFIIndex = MMI.addFrameInst(MCCFIInstruction::createDefCfaRegister(
diff --git a/lib/Target/Mips/MipsSEFrameLowering.h b/lib/Target/Mips/MipsSEFrameLowering.h
index 8fa9e46..5d2801f 100644
--- a/lib/Target/Mips/MipsSEFrameLowering.h
+++ b/lib/Target/Mips/MipsSEFrameLowering.h
@@ -25,22 +25,22 @@ public:
 
   /// emitProlog/emitEpilog - These methods insert prolog and epilog code into
   /// the function.
-  void emitPrologue(MachineFunction &MF) const;
-  void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;
+  void emitPrologue(MachineFunction &MF) const override;
+  void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
 
   void eliminateCallFramePseudoInstr(MachineFunction &MF,
-                                     MachineBasicBlock &MBB,
-                                     MachineBasicBlock::iterator I) const;
+                                  MachineBasicBlock &MBB,
+                                  MachineBasicBlock::iterator I) const override;
 
   bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
                                  MachineBasicBlock::iterator MI,
                                  const std::vector<CalleeSavedInfo> &CSI,
-                                 const TargetRegisterInfo *TRI) const;
+                                 const TargetRegisterInfo *TRI) const override;
 
-  bool hasReservedCallFrame(const MachineFunction &MF) const;
+  bool hasReservedCallFrame(const MachineFunction &MF) const override;
 
   void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
-                                            RegScavenger *RS) const;
+                                            RegScavenger *RS) const override;
   unsigned ehDataReg(unsigned I) const;
 };
 
diff --git a/lib/Target/Mips/MipsSEISelDAGToDAG.cpp b/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
index 5b20a6c..d5385be 100644
--- a/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
+++ b/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
@@ -11,7 +11,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "mips-isel"
 #include "MipsSEISelDAGToDAG.h"
 #include "MCTargetDesc/MipsBaseInfo.h"
 #include "Mips.h"
@@ -35,6 +34,8 @@
 #include "llvm/Target/TargetMachine.h"
 using namespace llvm;
 
+#define DEBUG_TYPE "mips-isel"
+
 bool MipsSEDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
   if (Subtarget.inMips16Mode())
     return false;
@@ -412,7 +413,7 @@ bool MipsSEDAGToDAGISel::selectVSplat(SDNode *N, APInt &Imm) const {
 
   BuildVectorSDNode *Node = dyn_cast<BuildVectorSDNode>(N);
 
-  if (Node == NULL)
+  if (!Node)
     return false;
 
   APInt SplatValue, SplatUndef;
@@ -813,16 +814,16 @@ std::pair<bool, SDNode*> MipsSEDAGToDAGISel::selectNode(SDNode *Node) {
     EVT ViaVecTy;
 
     if (!Subtarget.hasMSA() || !BVN->getValueType(0).is128BitVector())
-      return std::make_pair(false, (SDNode*)NULL);
+      return std::make_pair(false, nullptr);
 
     if (!BVN->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
                               HasAnyUndefs, 8,
                               !Subtarget.isLittle()))
-      return std::make_pair(false, (SDNode*)NULL);
+      return std::make_pair(false, nullptr);
 
     switch (SplatBitSize) {
     default:
-      return std::make_pair(false, (SDNode*)NULL);
+      return std::make_pair(false, nullptr);
     case 8:
       LdiOp = Mips::LDI_B;
       ViaVecTy = MVT::v16i8;
@@ -842,7 +843,7 @@ std::pair<bool, SDNode*> MipsSEDAGToDAGISel::selectNode(SDNode *Node) {
     }
 
     if (!SplatValue.isSignedIntN(10))
-      return std::make_pair(false, (SDNode*)NULL);
+      return std::make_pair(false, nullptr);
 
     SDValue Imm = CurDAG->getTargetConstant(SplatValue,
                                             ViaVecTy.getVectorElementType());
@@ -868,7 +869,7 @@ std::pair<bool, SDNode*> MipsSEDAGToDAGISel::selectNode(SDNode *Node) {
 
   }
 
-  return std::make_pair(false, (SDNode*)NULL);
+  return std::make_pair(false, nullptr);
 }
 
 FunctionPass *llvm::createMipsSEISelDag(MipsTargetMachine &TM) {
diff --git a/lib/Target/Mips/MipsSEISelDAGToDAG.h b/lib/Target/Mips/MipsSEISelDAGToDAG.h
index ba84a6d..57328d2 100644
--- a/lib/Target/Mips/MipsSEISelDAGToDAG.h
+++ b/lib/Target/Mips/MipsSEISelDAGToDAG.h
@@ -25,7 +25,7 @@ public:
 
 private:
 
-  virtual bool runOnMachineFunction(MachineFunction &MF);
+  bool runOnMachineFunction(MachineFunction &MF) override;
 
   void addDSPCtrlRegOperands(bool IsDef, MachineInstr &MI,
                              MachineFunction &MF);
@@ -44,66 +44,66 @@ private:
   bool selectAddrFrameIndexOffset(SDValue Addr, SDValue &Base, SDValue &Offset,
                                   unsigned OffsetBits) const;
 
-  virtual bool selectAddrRegImm(SDValue Addr, SDValue &Base,
-                                SDValue &Offset) const;
+  bool selectAddrRegImm(SDValue Addr, SDValue &Base,
+                        SDValue &Offset) const override;
 
-  virtual bool selectAddrRegReg(SDValue Addr, SDValue &Base,
-                                SDValue &Offset) const;
+  bool selectAddrRegReg(SDValue Addr, SDValue &Base,
+                        SDValue &Offset) const override;
 
-  virtual bool selectAddrDefault(SDValue Addr, SDValue &Base,
-                                 SDValue &Offset) const;
+  bool selectAddrDefault(SDValue Addr, SDValue &Base,
+                         SDValue &Offset) const override;
 
-  virtual bool selectIntAddr(SDValue Addr, SDValue &Base,
-                             SDValue &Offset) const;
+  bool selectIntAddr(SDValue Addr, SDValue &Base,
+                     SDValue &Offset) const override;
 
-  virtual bool selectAddrRegImm10(SDValue Addr, SDValue &Base,
-                                  SDValue &Offset) const;
+  bool selectAddrRegImm10(SDValue Addr, SDValue &Base,
+                          SDValue &Offset) const;
 
-  virtual bool selectAddrRegImm12(SDValue Addr, SDValue &Base,
-                                  SDValue &Offset) const;
+  bool selectAddrRegImm12(SDValue Addr, SDValue &Base,
+                          SDValue &Offset) const;
 
-  virtual bool selectIntAddrMM(SDValue Addr, SDValue &Base,
-                               SDValue &Offset) const;
+  bool selectIntAddrMM(SDValue Addr, SDValue &Base,
+                       SDValue &Offset) const override;
 
-  virtual bool selectIntAddrMSA(SDValue Addr, SDValue &Base,
-                                SDValue &Offset) const;
+  bool selectIntAddrMSA(SDValue Addr, SDValue &Base,
+                        SDValue &Offset) const override;
 
   /// \brief Select constant vector splats.
-  virtual bool selectVSplat(SDNode *N, APInt &Imm) const;
+  bool selectVSplat(SDNode *N, APInt &Imm) const override;
   /// \brief Select constant vector splats whose value fits in a given integer.
-  virtual bool selectVSplatCommon(SDValue N, SDValue &Imm, bool Signed,
+  bool selectVSplatCommon(SDValue N, SDValue &Imm, bool Signed,
                                   unsigned ImmBitSize) const;
   /// \brief Select constant vector splats whose value fits in a uimm1.
-  virtual bool selectVSplatUimm1(SDValue N, SDValue &Imm) const;
+  bool selectVSplatUimm1(SDValue N, SDValue &Imm) const override;
   /// \brief Select constant vector splats whose value fits in a uimm2.
-  virtual bool selectVSplatUimm2(SDValue N, SDValue &Imm) const;
+  bool selectVSplatUimm2(SDValue N, SDValue &Imm) const override;
   /// \brief Select constant vector splats whose value fits in a uimm3.
-  virtual bool selectVSplatUimm3(SDValue N, SDValue &Imm) const;
+  bool selectVSplatUimm3(SDValue N, SDValue &Imm) const override;
   /// \brief Select constant vector splats whose value fits in a uimm4.
-  virtual bool selectVSplatUimm4(SDValue N, SDValue &Imm) const;
+  bool selectVSplatUimm4(SDValue N, SDValue &Imm) const override;
   /// \brief Select constant vector splats whose value fits in a uimm5.
-  virtual bool selectVSplatUimm5(SDValue N, SDValue &Imm) const;
+  bool selectVSplatUimm5(SDValue N, SDValue &Imm) const override;
   /// \brief Select constant vector splats whose value fits in a uimm6.
-  virtual bool selectVSplatUimm6(SDValue N, SDValue &Imm) const;
+  bool selectVSplatUimm6(SDValue N, SDValue &Imm) const override;
   /// \brief Select constant vector splats whose value fits in a uimm8.
-  virtual bool selectVSplatUimm8(SDValue N, SDValue &Imm) const;
+  bool selectVSplatUimm8(SDValue N, SDValue &Imm) const override;
   /// \brief Select constant vector splats whose value fits in a simm5.
-  virtual bool selectVSplatSimm5(SDValue N, SDValue &Imm) const;
+  bool selectVSplatSimm5(SDValue N, SDValue &Imm) const override;
   /// \brief Select constant vector splats whose value is a power of 2.
-  virtual bool selectVSplatUimmPow2(SDValue N, SDValue &Imm) const;
+  bool selectVSplatUimmPow2(SDValue N, SDValue &Imm) const override;
   /// \brief Select constant vector splats whose value is the inverse of a
   /// power of 2.
-  virtual bool selectVSplatUimmInvPow2(SDValue N, SDValue &Imm) const;
+  bool selectVSplatUimmInvPow2(SDValue N, SDValue &Imm) const override;
   /// \brief Select constant vector splats whose value is a run of set bits
   /// ending at the most significant bit
-  virtual bool selectVSplatMaskL(SDValue N, SDValue &Imm) const;
+  bool selectVSplatMaskL(SDValue N, SDValue &Imm) const override;
   /// \brief Select constant vector splats whose value is a run of set bits
   /// starting at bit zero.
-  virtual bool selectVSplatMaskR(SDValue N, SDValue &Imm) const;
+  bool selectVSplatMaskR(SDValue N, SDValue &Imm) const override;
 
-  virtual std::pair<bool, SDNode*> selectNode(SDNode *Node);
+  std::pair<bool, SDNode*> selectNode(SDNode *Node) override;
 
-  virtual void processFunctionAfterISel(MachineFunction &MF);
+  void processFunctionAfterISel(MachineFunction &MF) override;
 
   // Insert instructions to initialize the global base register in the
   // first MBB of the function.
diff --git a/lib/Target/Mips/MipsSEISelLowering.cpp b/lib/Target/Mips/MipsSEISelLowering.cpp
index 0dac0b7..969d730 100644
--- a/lib/Target/Mips/MipsSEISelLowering.cpp
+++ b/lib/Target/Mips/MipsSEISelLowering.cpp
@@ -10,7 +10,6 @@
 // Subclass of MipsTargetLowering specialized for mips32/64.
 //
 //===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "mips-isel"
 #include "MipsSEISelLowering.h"
 #include "MipsRegisterInfo.h"
 #include "MipsTargetMachine.h"
@@ -24,6 +23,8 @@
 
 using namespace llvm;
 
+#define DEBUG_TYPE "mips-isel"
+
 static cl::opt<bool>
 EnableMipsTailCalls("enable-mips-tail-calls", cl::Hidden,
                     cl::desc("MIPS: Enable tail calls."), cl::init(false));
@@ -119,10 +120,10 @@ MipsSETargetLowering::MipsSETargetLowering(MipsTargetMachine &TM)
 
   if (Subtarget->hasCnMips())
     setOperationAction(ISD::MUL,              MVT::i64, Legal);
-  else if (hasMips64())
+  else if (isGP64bit())
     setOperationAction(ISD::MUL,              MVT::i64, Custom);
 
-  if (hasMips64()) {
+  if (isGP64bit()) {
     setOperationAction(ISD::MULHS,            MVT::i64, Custom);
     setOperationAction(ISD::MULHU,            MVT::i64, Custom);
   }
@@ -253,6 +254,16 @@ MipsSETargetLowering::allowsUnalignedMemoryAccesses(EVT VT,
                                                     bool *Fast) const {
   MVT::SimpleValueType SVT = VT.getSimpleVT().SimpleTy;
 
+  if (Subtarget->systemSupportsUnalignedAccess()) {
+    // MIPS32r6/MIPS64r6 is required to support unaligned access. It's
+    // implementation defined whether this is handled by hardware, software, or
+    // a hybrid of the two but it's expected that most implementations will
+    // handle the majority of cases in hardware.
+    if (Fast)
+      *Fast = true;
+    return true;
+  }
+
   switch (SVT) {
   case MVT::i64:
   case MVT::i32:
@@ -487,7 +498,8 @@ static SDValue performANDCombine(SDNode *N, SelectionDAG &DAG,
         Log2 == ExtendTySize) {
       SDValue Ops[] = { Op0->getOperand(0), Op0->getOperand(1), Op0Op2 };
       DAG.MorphNodeTo(Op0.getNode(), MipsISD::VEXTRACT_ZEXT_ELT,
-                      Op0->getVTList(), Ops, Op0->getNumOperands());
+                      Op0->getVTList(),
+                      makeArrayRef(Ops, Op0->getNumOperands()));
       return Op0;
     }
   }
@@ -507,7 +519,7 @@ static SDValue performANDCombine(SDNode *N, SelectionDAG &DAG,
 static bool isVSplat(SDValue N, APInt &Imm, bool IsLittleEndian) {
   BuildVectorSDNode *Node = dyn_cast<BuildVectorSDNode>(N.getNode());
 
-  if (Node == NULL)
+  if (!Node)
     return false;
 
   APInt SplatValue, SplatUndef;
@@ -831,7 +843,8 @@ static SDValue performSRACombine(SDNode *N, SelectionDAG &DAG,
         SDValue Ops[] = { Op0Op0->getOperand(0), Op0Op0->getOperand(1),
                           Op0Op0->getOperand(2) };
         DAG.MorphNodeTo(Op0Op0.getNode(), MipsISD::VEXTRACT_SEXT_ELT,
-                        Op0Op0->getVTList(), Ops, Op0Op0->getNumOperands());
+                        Op0Op0->getVTList(),
+                        makeArrayRef(Ops, Op0Op0->getNumOperands()));
         return Op0Op0;
       }
     }
@@ -1051,6 +1064,18 @@ MipsSETargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
     return emitINSERT_FW(MI, BB);
   case Mips::INSERT_FD_PSEUDO:
     return emitINSERT_FD(MI, BB);
+  case Mips::INSERT_B_VIDX_PSEUDO:
+    return emitINSERT_DF_VIDX(MI, BB, 1, false);
+  case Mips::INSERT_H_VIDX_PSEUDO:
+    return emitINSERT_DF_VIDX(MI, BB, 2, false);
+  case Mips::INSERT_W_VIDX_PSEUDO:
+    return emitINSERT_DF_VIDX(MI, BB, 4, false);
+  case Mips::INSERT_D_VIDX_PSEUDO:
+    return emitINSERT_DF_VIDX(MI, BB, 8, false);
+  case Mips::INSERT_FW_VIDX_PSEUDO:
+    return emitINSERT_DF_VIDX(MI, BB, 4, true);
+  case Mips::INSERT_FD_VIDX_PSEUDO:
+    return emitINSERT_DF_VIDX(MI, BB, 8, true);
   case Mips::FILL_FW_PSEUDO:
     return emitFILL_FW(MI, BB);
   case Mips::FILL_FD_PSEUDO:
@@ -1117,7 +1142,7 @@ SDValue MipsSETargetLowering::lowerLOAD(SDValue Op, SelectionDAG &DAG) const {
 
   SDValue BP = DAG.getNode(MipsISD::BuildPairF64, DL, MVT::f64, Lo, Hi);
   SDValue Ops[2] = {BP, Hi.getValue(1)};
-  return DAG.getMergeValues(Ops, 2, DL);
+  return DAG.getMergeValues(Ops, DL);
 }
 
 SDValue MipsSETargetLowering::lowerSTORE(SDValue Op, SelectionDAG &DAG) const {
@@ -1168,7 +1193,7 @@ SDValue MipsSETargetLowering::lowerMulDiv(SDValue Op, unsigned NewOpc,
     return HasLo ? Lo : Hi;
 
   SDValue Vals[] = { Lo, Hi };
-  return DAG.getMergeValues(Vals, 2, DL);
+  return DAG.getMergeValues(Vals, DL);
 }
 
 
@@ -1235,7 +1260,7 @@ static SDValue lowerDSPIntr(SDValue Op, SelectionDAG &DAG, unsigned Opc) {
     ResTys.push_back((*I == MVT::i64) ? MVT::Untyped : *I);
 
   // Create node.
-  SDValue Val = DAG.getNode(Opc, DL, ResTys, &Ops[0], Ops.size());
+  SDValue Val = DAG.getNode(Opc, DL, ResTys, Ops);
   SDValue Out = (ResTys[0] == MVT::Untyped) ? extractLOHI(Val, DL, DAG) : Val;
 
   if (!HasChainIn)
@@ -1243,7 +1268,7 @@ static SDValue lowerDSPIntr(SDValue Op, SelectionDAG &DAG, unsigned Opc) {
 
   assert(Val->getValueType(1) == MVT::Other);
   SDValue Vals[] = { Out, SDValue(Val.getNode(), 1) };
-  return DAG.getMergeValues(Vals, 2, DL);
+  return DAG.getMergeValues(Vals, DL);
 }
 
 // Lower an MSA copy intrinsic into the specified SelectionDAG node
@@ -1280,8 +1305,8 @@ static SDValue lowerMSASplatZExt(SDValue Op, unsigned OpNr, SelectionDAG &DAG) {
   SDValue Ops[16] = { LaneA, LaneB, LaneA, LaneB, LaneA, LaneB, LaneA, LaneB,
                       LaneA, LaneB, LaneA, LaneB, LaneA, LaneB, LaneA, LaneB };
 
-  SDValue Result = DAG.getNode(ISD::BUILD_VECTOR, DL, ViaVecTy, Ops,
-                               ViaVecTy.getVectorNumElements());
+  SDValue Result = DAG.getNode(ISD::BUILD_VECTOR, DL, ViaVecTy,
+                       makeArrayRef(Ops, ViaVecTy.getVectorNumElements()));
 
   if (ViaVecTy != ResVecTy)
     Result = DAG.getNode(ISD::BITCAST, DL, ResVecTy, Result);
@@ -1320,8 +1345,8 @@ static SDValue getBuildVectorSplat(EVT VecTy, SDValue SplatValue,
                       SplatValueA, SplatValueB, SplatValueA, SplatValueB,
                       SplatValueA, SplatValueB, SplatValueA, SplatValueB };
 
-  SDValue Result = DAG.getNode(ISD::BUILD_VECTOR, DL, ViaVecTy, Ops,
-                               ViaVecTy.getVectorNumElements());
+  SDValue Result = DAG.getNode(ISD::BUILD_VECTOR, DL, ViaVecTy,
+                       makeArrayRef(Ops, ViaVecTy.getVectorNumElements()));
 
   if (VecTy != ViaVecTy)
     Result = DAG.getNode(ISD::BITCAST, DL, VecTy, Result);
@@ -1355,7 +1380,7 @@ static SDValue lowerMSABinaryBitImmIntr(SDValue Op, SelectionDAG &DAG,
     }
   }
 
-  if (Exp2Imm.getNode() == NULL) {
+  if (!Exp2Imm.getNode()) {
     // We couldnt constant fold, do a vector shift instead
 
     // Extend i32 to i64 if necessary. Sign or zero extend doesn't matter since
@@ -1735,7 +1760,7 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op,
 
     // If ResTy is v2i64 then the type legalizer will break this node down into
     // an equivalent v4i32.
-    return DAG.getNode(ISD::BUILD_VECTOR, DL, ResTy, &Ops[0], Ops.size());
+    return DAG.getNode(ISD::BUILD_VECTOR, DL, ResTy, Ops);
   }
   case Intrinsic::mips_fexp2_w:
   case Intrinsic::mips_fexp2_d: {
@@ -2560,8 +2585,7 @@ static SDValue lowerVECTOR_SHUFFLE_VSHF(SDValue Op, EVT ResTy,
        ++I)
     Ops.push_back(DAG.getTargetConstant(*I, MaskEltTy));
 
-  SDValue MaskVec = DAG.getNode(ISD::BUILD_VECTOR, DL, MaskVecTy, &Ops[0],
-                                Ops.size());
+  SDValue MaskVec = DAG.getNode(ISD::BUILD_VECTOR, DL, MaskVecTy, Ops);
 
   if (Using1stVec && Using2ndVec) {
     Op0 = Op->getOperand(0);
@@ -2885,6 +2909,131 @@ MipsSETargetLowering::emitINSERT_FD(MachineInstr *MI,
   return BB;
 }
 
+// Emit the INSERT_([BHWD]|F[WD])_VIDX pseudo instruction.
+//
+// For integer:
+// (INSERT_([BHWD]|F[WD])_PSEUDO $wd, $wd_in, $n, $rs)
+// =>
+// (SLL $lanetmp1, $lane, <log2size)
+// (SLD_B $wdtmp1, $wd_in, $wd_in, $lanetmp1)
+// (INSERT_[BHWD], $wdtmp2, $wdtmp1, 0, $rs)
+// (NEG $lanetmp2, $lanetmp1)
+// (SLD_B $wd, $wdtmp2, $wdtmp2,  $lanetmp2)
+//
+// For floating point:
+// (INSERT_([BHWD]|F[WD])_PSEUDO $wd, $wd_in, $n, $fs)
+// =>
+// (SUBREG_TO_REG $wt, $fs, <subreg>)
+// (SLL $lanetmp1, $lane, <log2size)
+// (SLD_B $wdtmp1, $wd_in, $wd_in, $lanetmp1)
+// (INSVE_[WD], $wdtmp2, 0, $wdtmp1, 0)
+// (NEG $lanetmp2, $lanetmp1)
+// (SLD_B $wd, $wdtmp2, $wdtmp2,  $lanetmp2)
+MachineBasicBlock *
+MipsSETargetLowering::emitINSERT_DF_VIDX(MachineInstr *MI,
+                                         MachineBasicBlock *BB,
+                                         unsigned EltSizeInBytes,
+                                         bool IsFP) const {
+  const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+  MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo();
+  DebugLoc DL = MI->getDebugLoc();
+  unsigned Wd = MI->getOperand(0).getReg();
+  unsigned SrcVecReg = MI->getOperand(1).getReg();
+  unsigned LaneReg = MI->getOperand(2).getReg();
+  unsigned SrcValReg = MI->getOperand(3).getReg();
+
+  const TargetRegisterClass *VecRC = nullptr;
+  const TargetRegisterClass *GPRRC = isGP64bit() ? &Mips::GPR64RegClass
+                                                 : &Mips::GPR32RegClass;
+  unsigned EltLog2Size;
+  unsigned InsertOp = 0;
+  unsigned InsveOp = 0;
+  switch (EltSizeInBytes) {
+  default:
+    llvm_unreachable("Unexpected size");
+  case 1:
+    EltLog2Size = 0;
+    InsertOp = Mips::INSERT_B;
+    InsveOp = Mips::INSVE_B;
+    VecRC = &Mips::MSA128BRegClass;
+    break;
+  case 2:
+    EltLog2Size = 1;
+    InsertOp = Mips::INSERT_H;
+    InsveOp = Mips::INSVE_H;
+    VecRC = &Mips::MSA128HRegClass;
+    break;
+  case 4:
+    EltLog2Size = 2;
+    InsertOp = Mips::INSERT_W;
+    InsveOp = Mips::INSVE_W;
+    VecRC = &Mips::MSA128WRegClass;
+    break;
+  case 8:
+    EltLog2Size = 3;
+    InsertOp = Mips::INSERT_D;
+    InsveOp = Mips::INSVE_D;
+    VecRC = &Mips::MSA128DRegClass;
+    break;
+  }
+
+  if (IsFP) {
+    unsigned Wt = RegInfo.createVirtualRegister(VecRC);
+    BuildMI(*BB, MI, DL, TII->get(Mips::SUBREG_TO_REG), Wt)
+        .addImm(0)
+        .addReg(SrcValReg)
+        .addImm(EltSizeInBytes == 8 ? Mips::sub_64 : Mips::sub_lo);
+    SrcValReg = Wt;
+  }
+
+  // Convert the lane index into a byte index
+  if (EltSizeInBytes != 1) {
+    unsigned LaneTmp1 = RegInfo.createVirtualRegister(GPRRC);
+    BuildMI(*BB, MI, DL, TII->get(Mips::SLL), LaneTmp1)
+        .addReg(LaneReg)
+        .addImm(EltLog2Size);
+    LaneReg = LaneTmp1;
+  }
+
+  // Rotate bytes around so that the desired lane is element zero
+  unsigned WdTmp1 = RegInfo.createVirtualRegister(VecRC);
+  BuildMI(*BB, MI, DL, TII->get(Mips::SLD_B), WdTmp1)
+      .addReg(SrcVecReg)
+      .addReg(SrcVecReg)
+      .addReg(LaneReg);
+
+  unsigned WdTmp2 = RegInfo.createVirtualRegister(VecRC);
+  if (IsFP) {
+    // Use insve.df to insert to element zero
+    BuildMI(*BB, MI, DL, TII->get(InsveOp), WdTmp2)
+        .addReg(WdTmp1)
+        .addImm(0)
+        .addReg(SrcValReg)
+        .addImm(0);
+  } else {
+    // Use insert.df to insert to element zero
+    BuildMI(*BB, MI, DL, TII->get(InsertOp), WdTmp2)
+        .addReg(WdTmp1)
+        .addReg(SrcValReg)
+        .addImm(0);
+  }
+
+  // Rotate elements the rest of the way for a full rotation.
+  // sld.df inteprets $rt modulo the number of columns so we only need to negate
+  // the lane index to do this.
+  unsigned LaneTmp2 = RegInfo.createVirtualRegister(GPRRC);
+  BuildMI(*BB, MI, DL, TII->get(Mips::SUB), LaneTmp2)
+      .addReg(Mips::ZERO)
+      .addReg(LaneReg);
+  BuildMI(*BB, MI, DL, TII->get(Mips::SLD_B), Wd)
+      .addReg(WdTmp2)
+      .addReg(WdTmp2)
+      .addReg(LaneTmp2);
+
+  MI->eraseFromParent(); // The pseudo instruction is gone now.
+  return BB;
+}
+
 // Emit the FILL_FW pseudo instruction.
 //
 // fill_fw_pseudo $wd, $fs
diff --git a/lib/Target/Mips/MipsSEISelLowering.h b/lib/Target/Mips/MipsSEISelLowering.h
index 079fbf6..03a20ef 100644
--- a/lib/Target/Mips/MipsSEISelLowering.h
+++ b/lib/Target/Mips/MipsSEISelLowering.h
@@ -30,22 +30,23 @@ namespace llvm {
     void addMSAFloatType(MVT::SimpleValueType Ty,
                          const TargetRegisterClass *RC);
 
-    virtual bool allowsUnalignedMemoryAccesses(EVT VT, unsigned AS = 0,
-                                               bool *Fast = 0) const override;
+    bool allowsUnalignedMemoryAccesses(EVT VT, unsigned AS = 0,
+                                       bool *Fast = nullptr) const override;
 
-    virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
 
-    virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+    SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
 
-    virtual MachineBasicBlock *
-    EmitInstrWithCustomInserter(MachineInstr *MI, MachineBasicBlock *MBB) const;
+    MachineBasicBlock *
+    EmitInstrWithCustomInserter(MachineInstr *MI,
+                                MachineBasicBlock *MBB) const override;
 
-    virtual bool isShuffleMaskLegal(const SmallVectorImpl<int> &Mask,
-                                    EVT VT) const {
+    bool isShuffleMaskLegal(const SmallVectorImpl<int> &Mask,
+                            EVT VT) const override {
       return false;
     }
 
-    virtual const TargetRegisterClass *getRepRegClassFor(MVT VT) const {
+    const TargetRegisterClass *getRepRegClassFor(MVT VT) const override {
       if (VT == MVT::Untyped)
         return Subtarget->hasDSP() ? &Mips::ACC64DSPRegClass :
                                      &Mips::ACC64RegClass;
@@ -54,16 +55,16 @@ namespace llvm {
     }
 
   private:
-    virtual bool
-    isEligibleForTailCallOptimization(const MipsCC &MipsCCInfo,
-                                      unsigned NextStackOffset,
-                                      const MipsFunctionInfo& FI) const;
+    bool isEligibleForTailCallOptimization(const MipsCC &MipsCCInfo,
+                                     unsigned NextStackOffset,
+                                     const MipsFunctionInfo& FI) const override;
 
-    virtual void
+    void
     getOpndList(SmallVectorImpl<SDValue> &Ops,
                 std::deque< std::pair<unsigned, SDValue> > &RegsToPass,
                 bool IsPICCall, bool GlobalOrExternal, bool InternalLinkage,
-                CallLoweringInfo &CLI, SDValue Callee, SDValue Chain) const;
+                CallLoweringInfo &CLI, SDValue Callee,
+                SDValue Chain) const override;
 
     SDValue lowerLOAD(SDValue Op, SelectionDAG &DAG) const;
     SDValue lowerSTORE(SDValue Op, SelectionDAG &DAG) const;
@@ -97,6 +98,11 @@ namespace llvm {
     /// \brief Emit the INSERT_FD pseudo instruction
     MachineBasicBlock *emitINSERT_FD(MachineInstr *MI,
                                      MachineBasicBlock *BB) const;
+    /// \brief Emit the INSERT_([BHWD]|F[WD])_VIDX pseudo instruction
+    MachineBasicBlock *emitINSERT_DF_VIDX(MachineInstr *MI,
+                                          MachineBasicBlock *BB,
+                                          unsigned EltSizeInBytes,
+                                          bool IsFP) const;
     /// \brief Emit the FILL_FW pseudo instruction
     MachineBasicBlock *emitFILL_FW(MachineInstr *MI,
                                    MachineBasicBlock *BB) const;
diff --git a/lib/Target/Mips/MipsSEInstrInfo.cpp b/lib/Target/Mips/MipsSEInstrInfo.cpp
index 094ee29..f6f364f 100644
--- a/lib/Target/Mips/MipsSEInstrInfo.cpp
+++ b/lib/Target/Mips/MipsSEInstrInfo.cpp
@@ -368,7 +368,7 @@ void MipsSEInstrInfo::adjustStackPtr(unsigned SP, int64_t Amount,
   if (isInt<16>(Amount))// addi sp, sp, amount
     BuildMI(MBB, I, DL, get(ADDiu), SP).addReg(SP).addImm(Amount);
   else { // Expand immediate that doesn't fit in 16-bit.
-    unsigned Reg = loadImmediate(Amount, MBB, I, DL, 0);
+    unsigned Reg = loadImmediate(Amount, MBB, I, DL, nullptr);
     BuildMI(MBB, I, DL, get(ADDu), SP).addReg(SP).addReg(Reg, RegState::Kill);
   }
 }
diff --git a/lib/Target/Mips/MipsSEInstrInfo.h b/lib/Target/Mips/MipsSEInstrInfo.h
index 6d2dd90..aa68552 100644
--- a/lib/Target/Mips/MipsSEInstrInfo.h
+++ b/lib/Target/Mips/MipsSEInstrInfo.h
@@ -26,46 +26,46 @@ class MipsSEInstrInfo : public MipsInstrInfo {
 public:
   explicit MipsSEInstrInfo(MipsTargetMachine &TM);
 
-  virtual const MipsRegisterInfo &getRegisterInfo() const;
+  const MipsRegisterInfo &getRegisterInfo() const override;
 
   /// isLoadFromStackSlot - If the specified machine instruction is a direct
   /// load from a stack slot, return the virtual or physical register number of
   /// the destination along with the FrameIndex of the loaded stack slot.  If
   /// not, return 0.  This predicate must return 0 if the instruction has
   /// any side effects other than loading from the stack slot.
-  virtual unsigned isLoadFromStackSlot(const MachineInstr *MI,
-                                       int &FrameIndex) const;
+  unsigned isLoadFromStackSlot(const MachineInstr *MI,
+                               int &FrameIndex) const override;
 
   /// isStoreToStackSlot - If the specified machine instruction is a direct
   /// store to a stack slot, return the virtual or physical register number of
   /// the source reg along with the FrameIndex of the loaded stack slot.  If
   /// not, return 0.  This predicate must return 0 if the instruction has
   /// any side effects other than storing to the stack slot.
-  virtual unsigned isStoreToStackSlot(const MachineInstr *MI,
-                                      int &FrameIndex) const;
+  unsigned isStoreToStackSlot(const MachineInstr *MI,
+                              int &FrameIndex) const override;
 
-  virtual void copyPhysReg(MachineBasicBlock &MBB,
-                           MachineBasicBlock::iterator MI, DebugLoc DL,
-                           unsigned DestReg, unsigned SrcReg,
-                           bool KillSrc) const;
+  void copyPhysReg(MachineBasicBlock &MBB,
+                   MachineBasicBlock::iterator MI, DebugLoc DL,
+                   unsigned DestReg, unsigned SrcReg,
+                   bool KillSrc) const override;
 
-  virtual void storeRegToStack(MachineBasicBlock &MBB,
-                               MachineBasicBlock::iterator MI,
-                               unsigned SrcReg, bool isKill, int FrameIndex,
-                               const TargetRegisterClass *RC,
-                               const TargetRegisterInfo *TRI,
-                               int64_t Offset) const;
+  void storeRegToStack(MachineBasicBlock &MBB,
+                       MachineBasicBlock::iterator MI,
+                       unsigned SrcReg, bool isKill, int FrameIndex,
+                       const TargetRegisterClass *RC,
+                       const TargetRegisterInfo *TRI,
+                       int64_t Offset) const override;
 
-  virtual void loadRegFromStack(MachineBasicBlock &MBB,
-                                MachineBasicBlock::iterator MI,
-                                unsigned DestReg, int FrameIndex,
-                                const TargetRegisterClass *RC,
-                                const TargetRegisterInfo *TRI,
-                                int64_t Offset) const;
+  void loadRegFromStack(MachineBasicBlock &MBB,
+                        MachineBasicBlock::iterator MI,
+                        unsigned DestReg, int FrameIndex,
+                        const TargetRegisterClass *RC,
+                        const TargetRegisterInfo *TRI,
+                        int64_t Offset) const override;
 
-  virtual bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const;
+  bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override;
 
-  virtual unsigned getOppositeBranchOpc(unsigned Opc) const;
+  unsigned getOppositeBranchOpc(unsigned Opc) const override;
 
   /// Adjust SP by Amount bytes.
   void adjustStackPtr(unsigned SP, int64_t Amount, MachineBasicBlock &MBB,
@@ -79,7 +79,7 @@ public:
                          unsigned *NewImm) const;
 
 private:
-  virtual unsigned getAnalyzableBrOpc(unsigned Opc) const;
+  unsigned getAnalyzableBrOpc(unsigned Opc) const override;
 
   void expandRetRA(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
                    unsigned Opc) const;
diff --git a/lib/Target/Mips/MipsSERegisterInfo.cpp b/lib/Target/Mips/MipsSERegisterInfo.cpp
index 2ac082f..0af1a6b 100644
--- a/lib/Target/Mips/MipsSERegisterInfo.cpp
+++ b/lib/Target/Mips/MipsSERegisterInfo.cpp
@@ -39,6 +39,8 @@
 
 using namespace llvm;
 
+#define DEBUG_TYPE "mips-reg-info"
+
 MipsSERegisterInfo::MipsSERegisterInfo(const MipsSubtarget &ST)
   : MipsRegisterInfo(ST) {}
 
@@ -187,7 +189,7 @@ void MipsSERegisterInfo::eliminateFI(MachineBasicBlock::iterator II,
           *static_cast<const MipsSEInstrInfo *>(
                MBB.getParent()->getTarget().getInstrInfo());
       unsigned Reg = TII.loadImmediate(Offset, MBB, II, DL,
-                                       OffsetBitSize == 16 ? &NewImm : NULL);
+                                       OffsetBitSize == 16 ? &NewImm : nullptr);
       BuildMI(MBB, II, DL, TII.get(ADDu), Reg).addReg(FrameReg)
         .addReg(Reg, RegState::Kill);
 
diff --git a/lib/Target/Mips/MipsSERegisterInfo.h b/lib/Target/Mips/MipsSERegisterInfo.h
index 76cdd9d..f2f3a7e 100644
--- a/lib/Target/Mips/MipsSERegisterInfo.h
+++ b/lib/Target/Mips/MipsSERegisterInfo.h
@@ -24,16 +24,16 @@ class MipsSERegisterInfo : public MipsRegisterInfo {
 public:
   MipsSERegisterInfo(const MipsSubtarget &Subtarget);
 
-  bool requiresRegisterScavenging(const MachineFunction &MF) const;
+  bool requiresRegisterScavenging(const MachineFunction &MF) const override;
 
-  bool requiresFrameIndexScavenging(const MachineFunction &MF) const;
+  bool requiresFrameIndexScavenging(const MachineFunction &MF) const override;
 
-  virtual const TargetRegisterClass *intRegClass(unsigned Size) const;
+  const TargetRegisterClass *intRegClass(unsigned Size) const override;
 
 private:
-  virtual void eliminateFI(MachineBasicBlock::iterator II, unsigned OpNo,
-                           int FrameIndex, uint64_t StackSize,
-                           int64_t SPOffset) const;
+  void eliminateFI(MachineBasicBlock::iterator II, unsigned OpNo,
+                   int FrameIndex, uint64_t StackSize,
+                   int64_t SPOffset) const override;
 };
 
 } // end namespace llvm
diff --git a/lib/Target/Mips/MipsSelectionDAGInfo.cpp b/lib/Target/Mips/MipsSelectionDAGInfo.cpp
index e4d70fc..0d4398e 100644
--- a/lib/Target/Mips/MipsSelectionDAGInfo.cpp
+++ b/lib/Target/Mips/MipsSelectionDAGInfo.cpp
@@ -11,10 +11,11 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "mips-selectiondag-info"
 #include "MipsTargetMachine.h"
 using namespace llvm;
 
+#define DEBUG_TYPE "mips-selectiondag-info"
+
 MipsSelectionDAGInfo::MipsSelectionDAGInfo(const MipsTargetMachine &TM)
   : TargetSelectionDAGInfo(TM) {
 }
diff --git a/lib/Target/Mips/MipsSubtarget.cpp b/lib/Target/Mips/MipsSubtarget.cpp
index 143b945..74ec064 100644
--- a/lib/Target/Mips/MipsSubtarget.cpp
+++ b/lib/Target/Mips/MipsSubtarget.cpp
@@ -11,8 +11,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "mips-subtarget"
-
 #include "MipsMachineFunction.h"
 #include "Mips.h"
 #include "MipsRegisterInfo.h"
@@ -25,13 +23,14 @@
 #include "llvm/Support/TargetRegistry.h"
 #include "llvm/Support/raw_ostream.h"
 
+using namespace llvm;
+
+#define DEBUG_TYPE "mips-subtarget"
+
 #define GET_SUBTARGETINFO_TARGET_DESC
 #define GET_SUBTARGETINFO_CTOR
 #include "MipsGenSubtargetInfo.inc"
 
-
-using namespace llvm;
-
 // FIXME: Maybe this should be on by default when Mips16 is specified
 //
 static cl::opt<bool> Mixed16_32(
@@ -77,17 +76,16 @@ void MipsSubtarget::anchor() { }
 
 MipsSubtarget::MipsSubtarget(const std::string &TT, const std::string &CPU,
                              const std::string &FS, bool little,
-                             Reloc::Model _RM, MipsTargetMachine *_TM) :
-  MipsGenSubtargetInfo(TT, CPU, FS),
-  MipsArchVersion(Mips32), MipsABI(UnknownABI), IsLittle(little),
-  IsSingleFloat(false), IsFP64bit(false), IsGP64bit(false), HasVFPU(false),
-  HasCnMips(false), IsLinux(true), HasSEInReg(false), HasCondMov(false),
-  HasSwap(false), HasBitCount(false), HasFPIdx(false),
-  InMips16Mode(false), InMips16HardFloat(Mips16HardFloat),
-  InMicroMipsMode(false), HasDSP(false), HasDSPR2(false),
-  AllowMixed16_32(Mixed16_32 | Mips_Os16), Os16(Mips_Os16), HasMSA(false),
-  RM(_RM), OverrideMode(NoOverride), TM(_TM), TargetTriple(TT)
-{
+                             Reloc::Model _RM, MipsTargetMachine *_TM)
+    : MipsGenSubtargetInfo(TT, CPU, FS), MipsArchVersion(Mips32),
+      MipsABI(UnknownABI), IsLittle(little), IsSingleFloat(false),
+      IsFP64bit(false), IsNaN2008bit(false), IsGP64bit(false), HasVFPU(false),
+      HasCnMips(false), IsLinux(true), HasMips3_32(false), HasMips3_32r2(false),
+      HasMips4_32(false), HasMips4_32r2(false), HasMips5_32r2(false),
+      InMips16Mode(false), InMips16HardFloat(Mips16HardFloat),
+      InMicroMipsMode(false), HasDSP(false), HasDSPR2(false),
+      AllowMixed16_32(Mixed16_32 | Mips_Os16), Os16(Mips_Os16), HasMSA(false),
+      RM(_RM), OverrideMode(NoOverride), TM(_TM), TargetTriple(TT) {
   std::string CPUName = CPU;
   CPUName = selectMipsCPU(TT, CPUName);
 
@@ -109,6 +107,19 @@ MipsSubtarget::MipsSubtarget(const std::string &TT, const std::string &CPU,
   // Initialize scheduling itinerary for the specified CPU.
   InstrItins = getInstrItineraryForCPU(CPUName);
 
+  // Don't even attempt to generate code for MIPS-I, MIPS-II, MIPS-III, and
+  // MIPS-V. They have not been tested and currently exist for the integrated
+  // assembler only.
+  if (MipsArchVersion == Mips1)
+    report_fatal_error("Code generation for MIPS-I is not implemented", false);
+  if (MipsArchVersion == Mips2)
+    report_fatal_error("Code generation for MIPS-II is not implemented", false);
+  if (MipsArchVersion == Mips3)
+    report_fatal_error("Code generation for MIPS-III is not implemented",
+                       false);
+  if (MipsArchVersion == Mips5)
+    report_fatal_error("Code generation for MIPS-V is not implemented", false);
+
   // Assert exactly one ABI was chosen.
   assert(MipsABI != UnknownABI);
   assert((((getFeatureBits() & Mips::FeatureO32) != 0) +
@@ -126,15 +137,23 @@ MipsSubtarget::MipsSubtarget(const std::string &TT, const std::string &CPU,
                        "See -mattr=+fp64.",
                        false);
 
+  if (hasMips32r6()) {
+    StringRef ISA = hasMips64r6() ? "MIPS64r6" : "MIPS32r6";
+
+    assert(isFP64bit());
+    assert(isNaN2008());
+    if (hasDSP())
+      report_fatal_error(ISA + " is not compatible with the DSP ASE", false);
+  }
+
   // Is the target system Linux ?
   if (TT.find("linux") == std::string::npos)
     IsLinux = false;
 
   // Set UseSmallSection.
+  // TODO: Investigate the IsLinux check. I suspect it's really checking for
+  //       bare-metal.
   UseSmallSection = !IsLinux && (RM == Reloc::Static);
-  // set some subtarget specific features
-  if (inMips16Mode())
-    HasBitCount=false;
 }
 
 bool
diff --git a/lib/Target/Mips/MipsSubtarget.h b/lib/Target/Mips/MipsSubtarget.h
index 2166b93..373f481 100644
--- a/lib/Target/Mips/MipsSubtarget.h
+++ b/lib/Target/Mips/MipsSubtarget.h
@@ -37,7 +37,10 @@ public:
   };
 
 protected:
-  enum MipsArchEnum { Mips32, Mips32r2, Mips4, Mips64, Mips64r2 };
+  enum MipsArchEnum {
+    Mips1, Mips2, Mips32, Mips32r2, Mips32r6, Mips3, Mips4, Mips5, Mips64,
+    Mips64r2, Mips64r6
+  };
 
   // Mips architecture version
   MipsArchEnum MipsArchVersion;
@@ -56,6 +59,9 @@ protected:
   // IsFP64bit - The target processor has 64-bit floating point registers.
   bool IsFP64bit;
 
+  // IsNan2008 - IEEE 754-2008 NaN encoding.
+  bool IsNaN2008bit;
+
   // IsFP64bit - General-purpose registers are 64 bits wide
   bool IsGP64bit;
 
@@ -73,20 +79,20 @@ protected:
 
   /// Features related to the presence of specific instructions.
 
-  // HasSEInReg - SEB and SEH (signext in register) instructions.
-  bool HasSEInReg;
+  // HasMips3_32 - The subset of MIPS-III instructions added to MIPS32
+  bool HasMips3_32;
 
-  // HasCondMov - Conditional mov (MOVZ, MOVN) instructions.
-  bool HasCondMov;
+  // HasMips3_32r2 - The subset of MIPS-III instructions added to MIPS32r2
+  bool HasMips3_32r2;
 
-  // HasSwap - Byte and half swap instructions.
-  bool HasSwap;
+  // HasMips4_32 - Has the subset of MIPS-IV present in MIPS32
+  bool HasMips4_32;
 
-  // HasBitCount - Count leading '1' and '0' bits.
-  bool HasBitCount;
+  // HasMips4_32r2 - Has the subset of MIPS-IV present in MIPS32r2
+  bool HasMips4_32r2;
 
-  // HasFPIdx -- Floating point indexed load/store instructions.
-  bool HasFPIdx;
+  // HasMips5_32r2 - Has the subset of MIPS-V present in MIPS32r2
+  bool HasMips5_32r2;
 
   // InMips16 -- can process Mips16 instructions
   bool InMips16Mode;
@@ -127,9 +133,9 @@ protected:
 
   Triple TargetTriple;
 public:
-  virtual bool enablePostRAScheduler(CodeGenOpt::Level OptLevel,
-                                     AntiDepBreakMode& Mode,
-                                     RegClassVector& CriticalPathRCs) const;
+  bool enablePostRAScheduler(CodeGenOpt::Level OptLevel,
+                             AntiDepBreakMode& Mode,
+                             RegClassVector& CriticalPathRCs) const override;
 
   /// Only O32 and EABI supported right now.
   bool isABI_EABI() const { return MipsABI == EABI; }
@@ -148,16 +154,24 @@ public:
   /// subtarget options.  Definition of function is auto generated by tblgen.
   void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
 
+  bool hasMips2() const { return MipsArchVersion >= Mips2; }
+  bool hasMips3() const { return MipsArchVersion >= Mips3; }
+  bool hasMips4_32() const { return HasMips4_32; }
+  bool hasMips4_32r2() const { return HasMips4_32r2; }
   bool hasMips32() const { return MipsArchVersion >= Mips32; }
   bool hasMips32r2() const { return MipsArchVersion == Mips32r2 ||
                                    MipsArchVersion == Mips64r2; }
+  bool hasMips32r6() const { return MipsArchVersion == Mips32r6 ||
+                                   MipsArchVersion == Mips64r6; }
   bool hasMips64() const { return MipsArchVersion >= Mips64; }
   bool hasMips64r2() const { return MipsArchVersion == Mips64r2; }
+  bool hasMips64r6() const { return MipsArchVersion == Mips64r6; }
 
   bool hasCnMips() const { return HasCnMips; }
 
   bool isLittle() const { return IsLittle; }
   bool isFP64bit() const { return IsFP64bit; }
+  bool isNaN2008() const { return IsNaN2008bit; }
   bool isNotFP64bit() const { return !IsFP64bit; }
   bool isGP64bit() const { return IsGP64bit; }
   bool isGP32bit() const { return !IsGP64bit; }
@@ -197,11 +211,6 @@ public:
   }
 
   /// Features related to the presence of specific instructions.
-  bool hasSEInReg()   const { return HasSEInReg; }
-  bool hasCondMov()   const { return HasCondMov; }
-  bool hasSwap()      const { return HasSwap; }
-  bool hasBitCount()  const { return HasBitCount; }
-  bool hasFPIdx()     const { return HasFPIdx; }
   bool hasExtractInsert() const { return !inMips16Mode() && hasMips32r2(); }
 
   const InstrItineraryData &getInstrItineraryData() const { return InstrItins; }
@@ -213,10 +222,9 @@ public:
   bool isTargetNaCl() const { return TargetTriple.isOSNaCl(); }
   bool isNotTargetNaCl() const { return !TargetTriple.isOSNaCl(); }
 
-// for now constant islands are on for the whole compilation unit but we only
-// really use them if in addition we are in mips16 mode
-//
-static bool useConstantIslands();
+  // for now constant islands are on for the whole compilation unit but we only
+  // really use them if in addition we are in mips16 mode
+  static bool useConstantIslands();
 
   unsigned stackAlignment() const { return hasMips64() ? 16 : 8; }
 
@@ -226,7 +234,12 @@ static bool useConstantIslands();
   /// \brief Reset the subtarget for the Mips target.
   void resetSubtarget(MachineFunction *MF);
 
-
+  /// Does the system support unaligned memory access.
+  ///
+  /// MIPS32r6/MIPS64r6 require full unaligned access support but does not
+  /// specify which component of the system provides it. Hardware, software, and
+  /// hybrid implementations are all valid.
+  bool systemSupportsUnalignedAccess() const { return hasMips32r6(); }
 };
 } // End llvm namespace
 
diff --git a/lib/Target/Mips/MipsTargetMachine.cpp b/lib/Target/Mips/MipsTargetMachine.cpp
index e9053c8..984c58e 100644
--- a/lib/Target/Mips/MipsTargetMachine.cpp
+++ b/lib/Target/Mips/MipsTargetMachine.cpp
@@ -35,7 +35,7 @@
 #include "llvm/Transforms/Scalar.h"
 using namespace llvm;
 
-
+#define DEBUG_TYPE "mips"
 
 extern "C" void LLVMInitializeMipsTarget() {
   // Register the target.
@@ -171,12 +171,12 @@ public:
     return *getMipsTargetMachine().getSubtargetImpl();
   }
 
-  virtual void addIRPasses();
-  virtual bool addInstSelector();
-  virtual void addMachineSSAOptimization();
-  virtual bool addPreEmitPass();
+  void addIRPasses() override;
+  bool addInstSelector() override;
+  void addMachineSSAOptimization() override;
+  bool addPreEmitPass() override;
 
-  virtual bool addPreRegAlloc();
+  bool addPreRegAlloc() override;
 
 };
 } // namespace
diff --git a/lib/Target/Mips/MipsTargetMachine.h b/lib/Target/Mips/MipsTargetMachine.h
index 5a9a11d..a5aa39b 100644
--- a/lib/Target/Mips/MipsTargetMachine.h
+++ b/lib/Target/Mips/MipsTargetMachine.h
@@ -20,7 +20,6 @@
 #include "MipsJITInfo.h"
 #include "MipsSelectionDAGInfo.h"
 #include "MipsSubtarget.h"
-#include "llvm/ADT/OwningPtr.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/CodeGen/SelectionDAGISel.h"
 #include "llvm/IR/DataLayout.h"
@@ -34,15 +33,15 @@ class MipsRegisterInfo;
 class MipsTargetMachine : public LLVMTargetMachine {
   MipsSubtarget       Subtarget;
   const DataLayout    DL; // Calculates type size & alignment
-  OwningPtr<const MipsInstrInfo> InstrInfo;
-  OwningPtr<const MipsFrameLowering> FrameLowering;
-  OwningPtr<const MipsTargetLowering> TLInfo;
-  OwningPtr<const MipsInstrInfo> InstrInfo16;
-  OwningPtr<const MipsFrameLowering> FrameLowering16;
-  OwningPtr<const MipsTargetLowering> TLInfo16;
-  OwningPtr<const MipsInstrInfo> InstrInfoSE;
-  OwningPtr<const MipsFrameLowering> FrameLoweringSE;
-  OwningPtr<const MipsTargetLowering> TLInfoSE;
+  std::unique_ptr<const MipsInstrInfo> InstrInfo;
+  std::unique_ptr<const MipsFrameLowering> FrameLowering;
+  std::unique_ptr<const MipsTargetLowering> TLInfo;
+  std::unique_ptr<const MipsInstrInfo> InstrInfo16;
+  std::unique_ptr<const MipsFrameLowering> FrameLowering16;
+  std::unique_ptr<const MipsTargetLowering> TLInfo16;
+  std::unique_ptr<const MipsInstrInfo> InstrInfoSE;
+  std::unique_ptr<const MipsFrameLowering> FrameLoweringSE;
+  std::unique_ptr<const MipsTargetLowering> TLInfoSE;
   MipsSelectionDAGInfo TSInfo;
   const InstrItineraryData &InstrItins;
   MipsJITInfo JITInfo;
@@ -56,39 +55,38 @@ public:
 
   virtual ~MipsTargetMachine() {}
 
-  virtual void addAnalysisPasses(PassManagerBase &PM);
+  void addAnalysisPasses(PassManagerBase &PM) override;
 
-  virtual const MipsInstrInfo *getInstrInfo() const
+  const MipsInstrInfo *getInstrInfo() const override
   { return InstrInfo.get(); }
-  virtual const TargetFrameLowering *getFrameLowering() const
+  const TargetFrameLowering *getFrameLowering() const override
   { return FrameLowering.get(); }
-  virtual const MipsSubtarget *getSubtargetImpl() const
+  const MipsSubtarget *getSubtargetImpl() const override
   { return &Subtarget; }
-  virtual const DataLayout *getDataLayout()    const
+  const DataLayout *getDataLayout()    const override
   { return &DL;}
 
-  virtual const InstrItineraryData *getInstrItineraryData() const {
-    return Subtarget.inMips16Mode() ? 0 : &InstrItins;
+  const InstrItineraryData *getInstrItineraryData() const override {
+    return Subtarget.inMips16Mode() ? nullptr : &InstrItins;
   }
 
-  virtual MipsJITInfo *getJITInfo()
-  { return &JITInfo; }
+  MipsJITInfo *getJITInfo() override { return &JITInfo; }
 
-  virtual const MipsRegisterInfo *getRegisterInfo()  const {
+  const MipsRegisterInfo *getRegisterInfo()  const override {
     return &InstrInfo->getRegisterInfo();
   }
 
-  virtual const MipsTargetLowering *getTargetLowering() const {
+  const MipsTargetLowering *getTargetLowering() const override {
     return TLInfo.get();
   }
 
-  virtual const MipsSelectionDAGInfo* getSelectionDAGInfo() const {
+  const MipsSelectionDAGInfo* getSelectionDAGInfo() const override {
     return &TSInfo;
   }
 
   // Pass Pipeline Configuration
-  virtual TargetPassConfig *createPassConfig(PassManagerBase &PM);
-  virtual bool addCodeEmitter(PassManagerBase &PM, JITCodeEmitter &JCE);
+  TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
+  bool addCodeEmitter(PassManagerBase &PM, JITCodeEmitter &JCE) override;
 
   // Set helper classes
   void setHelperClassesMips16();
diff --git a/lib/Target/Mips/MipsTargetStreamer.h b/lib/Target/Mips/MipsTargetStreamer.h
index 5f4b74b..4ad37ac 100644
--- a/lib/Target/Mips/MipsTargetStreamer.h
+++ b/lib/Target/Mips/MipsTargetStreamer.h
@@ -34,6 +34,8 @@ public:
 
   virtual void emitDirectiveEnt(const MCSymbol &Symbol) = 0;
   virtual void emitDirectiveAbiCalls() = 0;
+  virtual void emitDirectiveNaN2008() = 0;
+  virtual void emitDirectiveNaNLegacy() = 0;
   virtual void emitDirectiveOptionPic0() = 0;
   virtual void emitDirectiveOptionPic2() = 0;
   virtual void emitFrame(unsigned StackReg, unsigned StackSize,
@@ -45,6 +47,11 @@ public:
   virtual void emitDirectiveSetMips64() = 0;
   virtual void emitDirectiveSetMips64R2() = 0;
   virtual void emitDirectiveSetDsp() = 0;
+
+  // PIC support
+  virtual void emitDirectiveCpload(unsigned RegNo) = 0;
+  virtual void emitDirectiveCpsetup(unsigned RegNo, int RegOrOffset,
+                                    const MCSymbol &Sym, bool IsReg) = 0;
 };
 
 // This part is for ascii assembly output
@@ -53,32 +60,39 @@ class MipsTargetAsmStreamer : public MipsTargetStreamer {
 
 public:
   MipsTargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS);
-  virtual void emitDirectiveSetMicroMips();
-  virtual void emitDirectiveSetNoMicroMips();
-  virtual void emitDirectiveSetMips16();
-  virtual void emitDirectiveSetNoMips16();
-
-  virtual void emitDirectiveSetReorder();
-  virtual void emitDirectiveSetNoReorder();
-  virtual void emitDirectiveSetMacro();
-  virtual void emitDirectiveSetNoMacro();
-  virtual void emitDirectiveSetAt();
-  virtual void emitDirectiveSetNoAt();
-  virtual void emitDirectiveEnd(StringRef Name);
-
-  virtual void emitDirectiveEnt(const MCSymbol &Symbol);
-  virtual void emitDirectiveAbiCalls();
-  virtual void emitDirectiveOptionPic0();
-  virtual void emitDirectiveOptionPic2();
-  virtual void emitFrame(unsigned StackReg, unsigned StackSize,
-                         unsigned ReturnReg);
-  virtual void emitMask(unsigned CPUBitmask, int CPUTopSavedRegOff);
-  virtual void emitFMask(unsigned FPUBitmask, int FPUTopSavedRegOff);
-
-  virtual void emitDirectiveSetMips32R2();
-  virtual void emitDirectiveSetMips64();
-  virtual void emitDirectiveSetMips64R2();
-  virtual void emitDirectiveSetDsp();
+  void emitDirectiveSetMicroMips() override;
+  void emitDirectiveSetNoMicroMips() override;
+  void emitDirectiveSetMips16() override;
+  void emitDirectiveSetNoMips16() override;
+
+  void emitDirectiveSetReorder() override;
+  void emitDirectiveSetNoReorder() override;
+  void emitDirectiveSetMacro() override;
+  void emitDirectiveSetNoMacro() override;
+  void emitDirectiveSetAt() override;
+  void emitDirectiveSetNoAt() override;
+  void emitDirectiveEnd(StringRef Name) override;
+
+  void emitDirectiveEnt(const MCSymbol &Symbol) override;
+  void emitDirectiveAbiCalls() override;
+  void emitDirectiveNaN2008() override;
+  void emitDirectiveNaNLegacy() override;
+  void emitDirectiveOptionPic0() override;
+  void emitDirectiveOptionPic2() override;
+  void emitFrame(unsigned StackReg, unsigned StackSize,
+                 unsigned ReturnReg) override;
+  void emitMask(unsigned CPUBitmask, int CPUTopSavedRegOff) override;
+  void emitFMask(unsigned FPUBitmask, int FPUTopSavedRegOff) override;
+
+  void emitDirectiveSetMips32R2() override;
+  void emitDirectiveSetMips64() override;
+  void emitDirectiveSetMips64R2() override;
+  void emitDirectiveSetDsp() override;
+
+  // PIC support
+  virtual void emitDirectiveCpload(unsigned RegNo);
+  void emitDirectiveCpsetup(unsigned RegNo, int RegOrOffset,
+                            const MCSymbol &Sym, bool IsReg) override;
 };
 
 // This part is for ELF object output
@@ -92,36 +106,48 @@ public:
   MCELFStreamer &getStreamer();
   MipsTargetELFStreamer(MCStreamer &S, const MCSubtargetInfo &STI);
 
-  virtual void emitLabel(MCSymbol *Symbol) override;
-  virtual void emitAssignment(MCSymbol *Symbol, const MCExpr *Value) override;
+  void emitLabel(MCSymbol *Symbol) override;
+  void emitAssignment(MCSymbol *Symbol, const MCExpr *Value) override;
   void finish() override;
 
-  virtual void emitDirectiveSetMicroMips();
-  virtual void emitDirectiveSetNoMicroMips();
-  virtual void emitDirectiveSetMips16();
-  virtual void emitDirectiveSetNoMips16();
-
-  virtual void emitDirectiveSetReorder();
-  virtual void emitDirectiveSetNoReorder();
-  virtual void emitDirectiveSetMacro();
-  virtual void emitDirectiveSetNoMacro();
-  virtual void emitDirectiveSetAt();
-  virtual void emitDirectiveSetNoAt();
-  virtual void emitDirectiveEnd(StringRef Name);
-
-  virtual void emitDirectiveEnt(const MCSymbol &Symbol);
-  virtual void emitDirectiveAbiCalls();
-  virtual void emitDirectiveOptionPic0();
-  virtual void emitDirectiveOptionPic2();
-  virtual void emitFrame(unsigned StackReg, unsigned StackSize,
-                         unsigned ReturnReg);
-  virtual void emitMask(unsigned CPUBitmask, int CPUTopSavedRegOff);
-  virtual void emitFMask(unsigned FPUBitmask, int FPUTopSavedRegOff);
-
-  virtual void emitDirectiveSetMips32R2();
-  virtual void emitDirectiveSetMips64();
-  virtual void emitDirectiveSetMips64R2();
-  virtual void emitDirectiveSetDsp();
+  void emitDirectiveSetMicroMips() override;
+  void emitDirectiveSetNoMicroMips() override;
+  void emitDirectiveSetMips16() override;
+  void emitDirectiveSetNoMips16() override;
+
+  void emitDirectiveSetReorder() override;
+  void emitDirectiveSetNoReorder() override;
+  void emitDirectiveSetMacro() override;
+  void emitDirectiveSetNoMacro() override;
+  void emitDirectiveSetAt() override;
+  void emitDirectiveSetNoAt() override;
+  void emitDirectiveEnd(StringRef Name) override;
+
+  void emitDirectiveEnt(const MCSymbol &Symbol) override;
+  void emitDirectiveAbiCalls() override;
+  void emitDirectiveNaN2008() override;
+  void emitDirectiveNaNLegacy() override;
+  void emitDirectiveOptionPic0() override;
+  void emitDirectiveOptionPic2() override;
+  void emitFrame(unsigned StackReg, unsigned StackSize,
+                 unsigned ReturnReg) override;
+  void emitMask(unsigned CPUBitmask, int CPUTopSavedRegOff) override;
+  void emitFMask(unsigned FPUBitmask, int FPUTopSavedRegOff) override;
+
+  void emitDirectiveSetMips32R2() override;
+  void emitDirectiveSetMips64() override;
+  void emitDirectiveSetMips64R2() override;
+  void emitDirectiveSetDsp() override;
+
+  // PIC support
+  virtual void emitDirectiveCpload(unsigned RegNo);
+  void emitDirectiveCpsetup(unsigned RegNo, int RegOrOffset,
+                            const MCSymbol &Sym, bool IsReg) override;
+
+protected:
+  bool isO32() const { return STI.getFeatureBits() & Mips::FeatureO32; }
+  bool isN32() const { return STI.getFeatureBits() & Mips::FeatureN32; }
+  bool isN64() const { return STI.getFeatureBits() & Mips::FeatureN64; }
 };
 }
 #endif
diff --git a/lib/Target/NVPTX/CMakeLists.txt b/lib/Target/NVPTX/CMakeLists.txt
index 029118a..4e35b18 100644
--- a/lib/Target/NVPTX/CMakeLists.txt
+++ b/lib/Target/NVPTX/CMakeLists.txt
@@ -9,6 +9,7 @@ tablegen(LLVM NVPTXGenSubtargetInfo.inc -gen-subtarget)
 add_public_tablegen_target(NVPTXCommonTableGen)
 
 set(NVPTXCodeGen_sources
+  NVPTXFavorNonGenericAddrSpaces.cpp
   NVPTXFrameLowering.cpp
   NVPTXInstrInfo.cpp
   NVPTXISelDAGToDAG.cpp
@@ -26,6 +27,8 @@ set(NVPTXCodeGen_sources
   NVPTXAssignValidGlobalNames.cpp
   NVPTXPrologEpilogPass.cpp
   NVPTXMCExpr.cpp
+  NVPTXReplaceImageHandles.cpp
+  NVPTXImageOptimizer.cpp
   )
 
 add_llvm_target(NVPTXCodeGen ${NVPTXCodeGen_sources})
diff --git a/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp b/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp
index cf165be..9618896 100644
--- a/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp
+++ b/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp
@@ -11,7 +11,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "asm-printer"
 #include "InstPrinter/NVPTXInstPrinter.h"
 #include "MCTargetDesc/NVPTXBaseInfo.h"
 #include "NVPTX.h"
@@ -25,6 +24,8 @@
 #include <cctype>
 using namespace llvm;
 
+#define DEBUG_TYPE "asm-printer"
+
 #include "NVPTXGenAsmWriter.inc"
 
 
diff --git a/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.h b/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.h
index 93029ae..1fb3c57 100644
--- a/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.h
+++ b/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.h
@@ -27,8 +27,8 @@ public:
   NVPTXInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
                    const MCRegisterInfo &MRI, const MCSubtargetInfo &STI);
 
-  virtual void printRegName(raw_ostream &OS, unsigned RegNo) const;
-  virtual void printInst(const MCInst *MI, raw_ostream &OS, StringRef Annot);
+  void printRegName(raw_ostream &OS, unsigned RegNo) const override;
+  void printInst(const MCInst *MI, raw_ostream &OS, StringRef Annot) override;
 
   // Autogenerated by tblgen.
   void printInstruction(const MCInst *MI, raw_ostream &O);
@@ -37,15 +37,15 @@ public:
 
   void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
   void printCvtMode(const MCInst *MI, int OpNum, raw_ostream &O,
-                    const char *Modifier = 0);
+                    const char *Modifier = nullptr);
   void printCmpMode(const MCInst *MI, int OpNum, raw_ostream &O,
-                    const char *Modifier = 0);
+                    const char *Modifier = nullptr);
   void printLdStCode(const MCInst *MI, int OpNum,
-                     raw_ostream &O, const char *Modifier = 0);
+                     raw_ostream &O, const char *Modifier = nullptr);
   void printMemOperand(const MCInst *MI, int OpNum,
-                       raw_ostream &O, const char *Modifier = 0);
+                       raw_ostream &O, const char *Modifier = nullptr);
   void printProtoIdent(const MCInst *MI, int OpNum,
-                       raw_ostream &O, const char *Modifier = 0);
+                       raw_ostream &O, const char *Modifier = nullptr);
 };
 
 }
diff --git a/lib/Target/NVPTX/MCTargetDesc/NVPTXBaseInfo.h b/lib/Target/NVPTX/MCTargetDesc/NVPTXBaseInfo.h
index edf4a80..ddb122f 100644
--- a/lib/Target/NVPTX/MCTargetDesc/NVPTXBaseInfo.h
+++ b/lib/Target/NVPTX/MCTargetDesc/NVPTXBaseInfo.h
@@ -43,14 +43,16 @@ enum PropertyAnnotation {
   PROPERTY_ISSAMPLER,
   PROPERTY_ISREADONLY_IMAGE_PARAM,
   PROPERTY_ISWRITEONLY_IMAGE_PARAM,
+  PROPERTY_ISREADWRITE_IMAGE_PARAM,
   PROPERTY_ISKERNEL_FUNCTION,
   PROPERTY_ALIGN,
+  PROPERTY_MANAGED,
 
   // last property
   PROPERTY_LAST
 };
 
-const unsigned AnnotationNameLen = 8; // length of each annotation name
+const unsigned AnnotationNameLen = 9; // length of each annotation name
 const char PropertyAnnotationNames[PROPERTY_LAST + 1][AnnotationNameLen + 1] = {
   "maxntidx",                         // PROPERTY_MAXNTID_X
   "maxntidy",                         // PROPERTY_MAXNTID_Y
@@ -64,8 +66,10 @@ const char PropertyAnnotationNames[PROPERTY_LAST + 1][AnnotationNameLen + 1] = {
   "sampler",                          // PROPERTY_ISSAMPLER
   "rdoimage",                         // PROPERTY_ISREADONLY_IMAGE_PARAM
   "wroimage",                         // PROPERTY_ISWRITEONLY_IMAGE_PARAM
+  "rdwrimage",                        // PROPERTY_ISREADWRITE_IMAGE_PARAM
   "kernel",                           // PROPERTY_ISKERNEL_FUNCTION
   "align",                            // PROPERTY_ALIGN
+  "managed",                          // PROPERTY_MANAGED
 
               // last property
   "proplast", // PROPERTY_LAST
diff --git a/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp
index 3cf6e4b..158ca90 100644
--- a/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp
+++ b/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp
@@ -20,6 +20,8 @@
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/Support/TargetRegistry.h"
 
+using namespace llvm;
+
 #define GET_INSTRINFO_MC_DESC
 #include "NVPTXGenInstrInfo.inc"
 
@@ -29,8 +31,6 @@
 #define GET_REGINFO_MC_DESC
 #include "NVPTXGenRegisterInfo.inc"
 
-using namespace llvm;
-
 static MCInstrInfo *createNVPTXMCInstrInfo() {
   MCInstrInfo *X = new MCInstrInfo();
   InitNVPTXMCInstrInfo(X);
@@ -66,7 +66,7 @@ static MCInstPrinter *createNVPTXMCInstPrinter(const Target &T,
                                                const MCSubtargetInfo &STI) {
   if (SyntaxVariant == 0)
     return new NVPTXInstPrinter(MAI, MII, MRI, STI);
-  return 0;
+  return nullptr;
 }
 
 // Force static initialization.
diff --git a/lib/Target/NVPTX/NVPTX.h b/lib/Target/NVPTX/NVPTX.h
index 8cbdd47..e74c808 100644
--- a/lib/Target/NVPTX/NVPTX.h
+++ b/lib/Target/NVPTX/NVPTX.h
@@ -63,9 +63,12 @@ FunctionPass *
 createNVPTXISelDag(NVPTXTargetMachine &TM, llvm::CodeGenOpt::Level OptLevel);
 ModulePass *createNVPTXAssignValidGlobalNamesPass();
 ModulePass *createGenericToNVVMPass();
+FunctionPass *createNVPTXFavorNonGenericAddrSpacesPass();
 ModulePass *createNVVMReflectPass();
 ModulePass *createNVVMReflectPass(const StringMap<int>& Mapping);
 MachineFunctionPass *createNVPTXPrologEpilogPass();
+MachineFunctionPass *createNVPTXReplaceImageHandlesPass();
+FunctionPass *createNVPTXImageOptimizerPass();
 
 bool isImageOrSamplerVal(const Value *, const Module *);
 
diff --git a/lib/Target/NVPTX/NVPTXAllocaHoisting.h b/lib/Target/NVPTX/NVPTXAllocaHoisting.h
index 22404b7..5b61068 100644
--- a/lib/Target/NVPTX/NVPTXAllocaHoisting.h
+++ b/lib/Target/NVPTX/NVPTXAllocaHoisting.h
@@ -30,17 +30,17 @@ public:
   static char ID; // Pass ID
   NVPTXAllocaHoisting() : FunctionPass(ID) {}
 
-  void getAnalysisUsage(AnalysisUsage &AU) const {
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<DataLayoutPass>();
     AU.addPreserved("stack-protector");
     AU.addPreserved<MachineFunctionAnalysis>();
   }
 
-  virtual const char *getPassName() const {
+  const char *getPassName() const override {
     return "NVPTX specific alloca hoisting";
   }
 
-  virtual bool runOnFunction(Function &function);
+  bool runOnFunction(Function &function) override;
 };
 
 extern FunctionPass *createAllocaHoisting();
diff --git a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index 97e2cc6..4ec575f 100644
--- a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -17,6 +17,7 @@
 #include "MCTargetDesc/NVPTXMCAsmInfo.h"
 #include "NVPTX.h"
 #include "NVPTXInstrInfo.h"
+#include "NVPTXMachineFunctionInfo.h"
 #include "NVPTXMCExpr.h"
 #include "NVPTXRegisterInfo.h"
 #include "NVPTXTargetMachine.h"
@@ -131,7 +132,7 @@ const MCExpr *nvptx::LowerConstant(const Constant *CV, AsmPrinter &AP) {
     return MCSymbolRefExpr::Create(AP.GetBlockAddressSymbol(BA), Ctx);
 
   const ConstantExpr *CE = dyn_cast<ConstantExpr>(CV);
-  if (CE == 0)
+  if (!CE)
     llvm_unreachable("Unknown constant value to lower!");
 
   switch (CE->getOpcode()) {
@@ -149,9 +150,24 @@ const MCExpr *nvptx::LowerConstant(const Constant *CV, AsmPrinter &AP) {
       raw_string_ostream OS(S);
       OS << "Unsupported expression in static initializer: ";
       CE->printAsOperand(OS, /*PrintType=*/ false,
-                     !AP.MF ? 0 : AP.MF->getFunction()->getParent());
+                         !AP.MF ? nullptr : AP.MF->getFunction()->getParent());
       report_fatal_error(OS.str());
     }
+  case Instruction::AddrSpaceCast: {
+    // Strip any addrspace(1)->addrspace(0) addrspace casts. These will be
+    // handled by the generic() logic in the MCExpr printer
+    PointerType *DstTy            = cast<PointerType>(CE->getType());
+    PointerType *SrcTy            = cast<PointerType>(CE->getOperand(0)->getType());
+    if (SrcTy->getAddressSpace() == 1 && DstTy->getAddressSpace() == 0) {
+      return LowerConstant(cast<const Constant>(CE->getOperand(0)), AP);
+    }
+    std::string S;
+    raw_string_ostream OS(S);
+    OS << "Unsupported expression in static initializer: ";
+    CE->printAsOperand(OS, /*PrintType=*/ false,
+                       !AP.MF ? nullptr : AP.MF->getFunction()->getParent());
+    report_fatal_error(OS.str());
+  }
   case Instruction::GetElementPtr: {
     const DataLayout &TD = *AP.TM.getDataLayout();
     // Generate a symbolic expression for the byte address
@@ -310,13 +326,279 @@ void NVPTXAsmPrinter::EmitInstruction(const MachineInstr *MI) {
   EmitToStreamer(OutStreamer, Inst);
 }
 
+// Handle symbol backtracking for targets that do not support image handles
+bool NVPTXAsmPrinter::lowerImageHandleOperand(const MachineInstr *MI,
+                                           unsigned OpNo, MCOperand &MCOp) {
+  const MachineOperand &MO = MI->getOperand(OpNo);
+
+  switch (MI->getOpcode()) {
+  default: return false;
+  case NVPTX::TEX_1D_F32_I32:
+  case NVPTX::TEX_1D_F32_F32:
+  case NVPTX::TEX_1D_F32_F32_LEVEL:
+  case NVPTX::TEX_1D_F32_F32_GRAD:
+  case NVPTX::TEX_1D_I32_I32:
+  case NVPTX::TEX_1D_I32_F32:
+  case NVPTX::TEX_1D_I32_F32_LEVEL:
+  case NVPTX::TEX_1D_I32_F32_GRAD:
+  case NVPTX::TEX_1D_ARRAY_F32_I32:
+  case NVPTX::TEX_1D_ARRAY_F32_F32:
+  case NVPTX::TEX_1D_ARRAY_F32_F32_LEVEL:
+  case NVPTX::TEX_1D_ARRAY_F32_F32_GRAD:
+  case NVPTX::TEX_1D_ARRAY_I32_I32:
+  case NVPTX::TEX_1D_ARRAY_I32_F32:
+  case NVPTX::TEX_1D_ARRAY_I32_F32_LEVEL:
+  case NVPTX::TEX_1D_ARRAY_I32_F32_GRAD:
+  case NVPTX::TEX_2D_F32_I32:
+  case NVPTX::TEX_2D_F32_F32:
+  case NVPTX::TEX_2D_F32_F32_LEVEL:
+  case NVPTX::TEX_2D_F32_F32_GRAD:
+  case NVPTX::TEX_2D_I32_I32:
+  case NVPTX::TEX_2D_I32_F32:
+  case NVPTX::TEX_2D_I32_F32_LEVEL:
+  case NVPTX::TEX_2D_I32_F32_GRAD:
+  case NVPTX::TEX_2D_ARRAY_F32_I32:
+  case NVPTX::TEX_2D_ARRAY_F32_F32:
+  case NVPTX::TEX_2D_ARRAY_F32_F32_LEVEL:
+  case NVPTX::TEX_2D_ARRAY_F32_F32_GRAD:
+  case NVPTX::TEX_2D_ARRAY_I32_I32:
+  case NVPTX::TEX_2D_ARRAY_I32_F32:
+  case NVPTX::TEX_2D_ARRAY_I32_F32_LEVEL:
+  case NVPTX::TEX_2D_ARRAY_I32_F32_GRAD:
+  case NVPTX::TEX_3D_F32_I32:
+  case NVPTX::TEX_3D_F32_F32:
+  case NVPTX::TEX_3D_F32_F32_LEVEL:
+  case NVPTX::TEX_3D_F32_F32_GRAD:
+  case NVPTX::TEX_3D_I32_I32:
+  case NVPTX::TEX_3D_I32_F32:
+  case NVPTX::TEX_3D_I32_F32_LEVEL:
+  case NVPTX::TEX_3D_I32_F32_GRAD:
+   {
+    // This is a texture fetch, so operand 4 is a texref and operand 5 is
+    // a samplerref
+    if (OpNo == 4) {
+      lowerImageHandleSymbol(MO.getImm(), MCOp);
+      return true;
+    }
+    if (OpNo == 5) {
+      lowerImageHandleSymbol(MO.getImm(), MCOp);
+      return true;
+    }
+
+    return false;
+  }
+  case NVPTX::SULD_1D_I8_TRAP:
+  case NVPTX::SULD_1D_I16_TRAP:
+  case NVPTX::SULD_1D_I32_TRAP:
+  case NVPTX::SULD_1D_ARRAY_I8_TRAP:
+  case NVPTX::SULD_1D_ARRAY_I16_TRAP:
+  case NVPTX::SULD_1D_ARRAY_I32_TRAP:
+  case NVPTX::SULD_2D_I8_TRAP:
+  case NVPTX::SULD_2D_I16_TRAP:
+  case NVPTX::SULD_2D_I32_TRAP:
+  case NVPTX::SULD_2D_ARRAY_I8_TRAP:
+  case NVPTX::SULD_2D_ARRAY_I16_TRAP:
+  case NVPTX::SULD_2D_ARRAY_I32_TRAP:
+  case NVPTX::SULD_3D_I8_TRAP:
+  case NVPTX::SULD_3D_I16_TRAP:
+  case NVPTX::SULD_3D_I32_TRAP: {
+    // This is a V1 surface load, so operand 1 is a surfref
+    if (OpNo == 1) {
+      lowerImageHandleSymbol(MO.getImm(), MCOp);
+      return true;
+    }
+
+    return false;
+  }
+  case NVPTX::SULD_1D_V2I8_TRAP:
+  case NVPTX::SULD_1D_V2I16_TRAP:
+  case NVPTX::SULD_1D_V2I32_TRAP:
+  case NVPTX::SULD_1D_ARRAY_V2I8_TRAP:
+  case NVPTX::SULD_1D_ARRAY_V2I16_TRAP:
+  case NVPTX::SULD_1D_ARRAY_V2I32_TRAP:
+  case NVPTX::SULD_2D_V2I8_TRAP:
+  case NVPTX::SULD_2D_V2I16_TRAP:
+  case NVPTX::SULD_2D_V2I32_TRAP:
+  case NVPTX::SULD_2D_ARRAY_V2I8_TRAP:
+  case NVPTX::SULD_2D_ARRAY_V2I16_TRAP:
+  case NVPTX::SULD_2D_ARRAY_V2I32_TRAP:
+  case NVPTX::SULD_3D_V2I8_TRAP:
+  case NVPTX::SULD_3D_V2I16_TRAP:
+  case NVPTX::SULD_3D_V2I32_TRAP: {
+    // This is a V2 surface load, so operand 2 is a surfref
+    if (OpNo == 2) {
+      lowerImageHandleSymbol(MO.getImm(), MCOp);
+      return true;
+    }
+
+    return false;
+  }
+  case NVPTX::SULD_1D_V4I8_TRAP:
+  case NVPTX::SULD_1D_V4I16_TRAP:
+  case NVPTX::SULD_1D_V4I32_TRAP:
+  case NVPTX::SULD_1D_ARRAY_V4I8_TRAP:
+  case NVPTX::SULD_1D_ARRAY_V4I16_TRAP:
+  case NVPTX::SULD_1D_ARRAY_V4I32_TRAP:
+  case NVPTX::SULD_2D_V4I8_TRAP:
+  case NVPTX::SULD_2D_V4I16_TRAP:
+  case NVPTX::SULD_2D_V4I32_TRAP:
+  case NVPTX::SULD_2D_ARRAY_V4I8_TRAP:
+  case NVPTX::SULD_2D_ARRAY_V4I16_TRAP:
+  case NVPTX::SULD_2D_ARRAY_V4I32_TRAP:
+  case NVPTX::SULD_3D_V4I8_TRAP:
+  case NVPTX::SULD_3D_V4I16_TRAP:
+  case NVPTX::SULD_3D_V4I32_TRAP: {
+    // This is a V4 surface load, so operand 4 is a surfref
+    if (OpNo == 4) {
+      lowerImageHandleSymbol(MO.getImm(), MCOp);
+      return true;
+    }
+
+    return false;
+  }
+  case NVPTX::SUST_B_1D_B8_TRAP:
+  case NVPTX::SUST_B_1D_B16_TRAP:
+  case NVPTX::SUST_B_1D_B32_TRAP:
+  case NVPTX::SUST_B_1D_V2B8_TRAP:
+  case NVPTX::SUST_B_1D_V2B16_TRAP:
+  case NVPTX::SUST_B_1D_V2B32_TRAP:
+  case NVPTX::SUST_B_1D_V4B8_TRAP:
+  case NVPTX::SUST_B_1D_V4B16_TRAP:
+  case NVPTX::SUST_B_1D_V4B32_TRAP:
+  case NVPTX::SUST_B_1D_ARRAY_B8_TRAP:
+  case NVPTX::SUST_B_1D_ARRAY_B16_TRAP:
+  case NVPTX::SUST_B_1D_ARRAY_B32_TRAP:
+  case NVPTX::SUST_B_1D_ARRAY_V2B8_TRAP:
+  case NVPTX::SUST_B_1D_ARRAY_V2B16_TRAP:
+  case NVPTX::SUST_B_1D_ARRAY_V2B32_TRAP:
+  case NVPTX::SUST_B_1D_ARRAY_V4B8_TRAP:
+  case NVPTX::SUST_B_1D_ARRAY_V4B16_TRAP:
+  case NVPTX::SUST_B_1D_ARRAY_V4B32_TRAP:
+  case NVPTX::SUST_B_2D_B8_TRAP:
+  case NVPTX::SUST_B_2D_B16_TRAP:
+  case NVPTX::SUST_B_2D_B32_TRAP:
+  case NVPTX::SUST_B_2D_V2B8_TRAP:
+  case NVPTX::SUST_B_2D_V2B16_TRAP:
+  case NVPTX::SUST_B_2D_V2B32_TRAP:
+  case NVPTX::SUST_B_2D_V4B8_TRAP:
+  case NVPTX::SUST_B_2D_V4B16_TRAP:
+  case NVPTX::SUST_B_2D_V4B32_TRAP:
+  case NVPTX::SUST_B_2D_ARRAY_B8_TRAP:
+  case NVPTX::SUST_B_2D_ARRAY_B16_TRAP:
+  case NVPTX::SUST_B_2D_ARRAY_B32_TRAP:
+  case NVPTX::SUST_B_2D_ARRAY_V2B8_TRAP:
+  case NVPTX::SUST_B_2D_ARRAY_V2B16_TRAP:
+  case NVPTX::SUST_B_2D_ARRAY_V2B32_TRAP:
+  case NVPTX::SUST_B_2D_ARRAY_V4B8_TRAP:
+  case NVPTX::SUST_B_2D_ARRAY_V4B16_TRAP:
+  case NVPTX::SUST_B_2D_ARRAY_V4B32_TRAP:
+  case NVPTX::SUST_B_3D_B8_TRAP:
+  case NVPTX::SUST_B_3D_B16_TRAP:
+  case NVPTX::SUST_B_3D_B32_TRAP:
+  case NVPTX::SUST_B_3D_V2B8_TRAP:
+  case NVPTX::SUST_B_3D_V2B16_TRAP:
+  case NVPTX::SUST_B_3D_V2B32_TRAP:
+  case NVPTX::SUST_B_3D_V4B8_TRAP:
+  case NVPTX::SUST_B_3D_V4B16_TRAP:
+  case NVPTX::SUST_B_3D_V4B32_TRAP:
+  case NVPTX::SUST_P_1D_B8_TRAP:
+  case NVPTX::SUST_P_1D_B16_TRAP:
+  case NVPTX::SUST_P_1D_B32_TRAP:
+  case NVPTX::SUST_P_1D_V2B8_TRAP:
+  case NVPTX::SUST_P_1D_V2B16_TRAP:
+  case NVPTX::SUST_P_1D_V2B32_TRAP:
+  case NVPTX::SUST_P_1D_V4B8_TRAP:
+  case NVPTX::SUST_P_1D_V4B16_TRAP:
+  case NVPTX::SUST_P_1D_V4B32_TRAP:
+  case NVPTX::SUST_P_1D_ARRAY_B8_TRAP:
+  case NVPTX::SUST_P_1D_ARRAY_B16_TRAP:
+  case NVPTX::SUST_P_1D_ARRAY_B32_TRAP:
+  case NVPTX::SUST_P_1D_ARRAY_V2B8_TRAP:
+  case NVPTX::SUST_P_1D_ARRAY_V2B16_TRAP:
+  case NVPTX::SUST_P_1D_ARRAY_V2B32_TRAP:
+  case NVPTX::SUST_P_1D_ARRAY_V4B8_TRAP:
+  case NVPTX::SUST_P_1D_ARRAY_V4B16_TRAP:
+  case NVPTX::SUST_P_1D_ARRAY_V4B32_TRAP:
+  case NVPTX::SUST_P_2D_B8_TRAP:
+  case NVPTX::SUST_P_2D_B16_TRAP:
+  case NVPTX::SUST_P_2D_B32_TRAP:
+  case NVPTX::SUST_P_2D_V2B8_TRAP:
+  case NVPTX::SUST_P_2D_V2B16_TRAP:
+  case NVPTX::SUST_P_2D_V2B32_TRAP:
+  case NVPTX::SUST_P_2D_V4B8_TRAP:
+  case NVPTX::SUST_P_2D_V4B16_TRAP:
+  case NVPTX::SUST_P_2D_V4B32_TRAP:
+  case NVPTX::SUST_P_2D_ARRAY_B8_TRAP:
+  case NVPTX::SUST_P_2D_ARRAY_B16_TRAP:
+  case NVPTX::SUST_P_2D_ARRAY_B32_TRAP:
+  case NVPTX::SUST_P_2D_ARRAY_V2B8_TRAP:
+  case NVPTX::SUST_P_2D_ARRAY_V2B16_TRAP:
+  case NVPTX::SUST_P_2D_ARRAY_V2B32_TRAP:
+  case NVPTX::SUST_P_2D_ARRAY_V4B8_TRAP:
+  case NVPTX::SUST_P_2D_ARRAY_V4B16_TRAP:
+  case NVPTX::SUST_P_2D_ARRAY_V4B32_TRAP:
+  case NVPTX::SUST_P_3D_B8_TRAP:
+  case NVPTX::SUST_P_3D_B16_TRAP:
+  case NVPTX::SUST_P_3D_B32_TRAP:
+  case NVPTX::SUST_P_3D_V2B8_TRAP:
+  case NVPTX::SUST_P_3D_V2B16_TRAP:
+  case NVPTX::SUST_P_3D_V2B32_TRAP:
+  case NVPTX::SUST_P_3D_V4B8_TRAP:
+  case NVPTX::SUST_P_3D_V4B16_TRAP:
+  case NVPTX::SUST_P_3D_V4B32_TRAP: {
+    // This is a surface store, so operand 0 is a surfref
+    if (OpNo == 0) {
+      lowerImageHandleSymbol(MO.getImm(), MCOp);
+      return true;
+    }
+
+    return false;
+  }
+  case NVPTX::TXQ_CHANNEL_ORDER:
+  case NVPTX::TXQ_CHANNEL_DATA_TYPE:
+  case NVPTX::TXQ_WIDTH:
+  case NVPTX::TXQ_HEIGHT:
+  case NVPTX::TXQ_DEPTH:
+  case NVPTX::TXQ_ARRAY_SIZE:
+  case NVPTX::TXQ_NUM_SAMPLES:
+  case NVPTX::TXQ_NUM_MIPMAP_LEVELS:
+  case NVPTX::SUQ_CHANNEL_ORDER:
+  case NVPTX::SUQ_CHANNEL_DATA_TYPE:
+  case NVPTX::SUQ_WIDTH:
+  case NVPTX::SUQ_HEIGHT:
+  case NVPTX::SUQ_DEPTH:
+  case NVPTX::SUQ_ARRAY_SIZE: {
+    // This is a query, so operand 1 is a surfref/texref
+    if (OpNo == 1) {
+      lowerImageHandleSymbol(MO.getImm(), MCOp);
+      return true;
+    }
+
+    return false;
+  }
+  }
+}
+
+void NVPTXAsmPrinter::lowerImageHandleSymbol(unsigned Index, MCOperand &MCOp) {
+  // Ewwww
+  TargetMachine &TM = const_cast<TargetMachine&>(MF->getTarget());
+  NVPTXTargetMachine &nvTM = static_cast<NVPTXTargetMachine&>(TM);
+  const NVPTXMachineFunctionInfo *MFI = MF->getInfo<NVPTXMachineFunctionInfo>();
+  const char *Sym = MFI->getImageHandleSymbol(Index);
+  std::string *SymNamePtr =
+    nvTM.getManagedStrPool()->getManagedString(Sym);
+  MCOp = GetSymbolRef(OutContext.GetOrCreateSymbol(
+    StringRef(SymNamePtr->c_str())));
+}
+
 void NVPTXAsmPrinter::lowerToMCInst(const MachineInstr *MI, MCInst &OutMI) {
   OutMI.setOpcode(MI->getOpcode());
+  const NVPTXSubtarget &ST = TM.getSubtarget<NVPTXSubtarget>();
 
   // Special: Do not mangle symbol operand of CALL_PROTOTYPE
   if (MI->getOpcode() == NVPTX::CALL_PROTOTYPE) {
     const MachineOperand &MO = MI->getOperand(0);
-    OutMI.addOperand(GetSymbolRef(MO,
+    OutMI.addOperand(GetSymbolRef(
       OutContext.GetOrCreateSymbol(Twine(MO.getSymbolName()))));
     return;
   }
@@ -325,6 +607,13 @@ void NVPTXAsmPrinter::lowerToMCInst(const MachineInstr *MI, MCInst &OutMI) {
     const MachineOperand &MO = MI->getOperand(i);
 
     MCOperand MCOp;
+    if (!ST.hasImageHandles()) {
+      if (lowerImageHandleOperand(MI, i, MCOp)) {
+        OutMI.addOperand(MCOp);
+        continue;
+      }
+    }
+
     if (lowerOperand(MO, MCOp))
       OutMI.addOperand(MCOp);
   }
@@ -345,10 +634,10 @@ bool NVPTXAsmPrinter::lowerOperand(const MachineOperand &MO,
         MO.getMBB()->getSymbol(), OutContext));
     break;
   case MachineOperand::MO_ExternalSymbol:
-    MCOp = GetSymbolRef(MO, GetExternalSymbolSymbol(MO.getSymbolName()));
+    MCOp = GetSymbolRef(GetExternalSymbolSymbol(MO.getSymbolName()));
     break;
   case MachineOperand::MO_GlobalAddress:
-    MCOp = GetSymbolRef(MO, getSymbol(MO.getGlobal()));
+    MCOp = GetSymbolRef(getSymbol(MO.getGlobal()));
     break;
   case MachineOperand::MO_FPImmediate: {
     const ConstantFP *Cnt = MO.getFPImm();
@@ -407,8 +696,7 @@ unsigned NVPTXAsmPrinter::encodeVirtualRegister(unsigned Reg) {
   }
 }
 
-MCOperand NVPTXAsmPrinter::GetSymbolRef(const MachineOperand &MO,
-                                        const MCSymbol *Symbol) {
+MCOperand NVPTXAsmPrinter::GetSymbolRef(const MCSymbol *Symbol) {
   const MCExpr *Expr;
   Expr = MCSymbolRefExpr::Create(Symbol, MCSymbolRefExpr::VK_None,
                                  OutContext);
@@ -750,7 +1038,7 @@ static bool canDemoteGlobalVar(const GlobalVariable *gv, Function const *&f) {
   if (Pty->getAddressSpace() != llvm::ADDRESS_SPACE_SHARED)
     return false;
 
-  const Function *oneFunc = 0;
+  const Function *oneFunc = nullptr;
 
   bool flag = usedInOneFunc(gv, oneFunc);
   if (flag == false)
@@ -1010,6 +1298,8 @@ bool NVPTXAsmPrinter::doFinalization(Module &M) {
   for (i = 0; i < n; i++)
     global_list.insert(global_list.end(), gv_array[i]);
 
+  clearAnnotationCache(&M);
+
   delete[] gv_array;
   return ret;
 
@@ -1105,10 +1395,10 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar,
   if (llvm::isSampler(*GVar)) {
     O << ".global .samplerref " << llvm::getSamplerName(*GVar);
 
-    const Constant *Initializer = NULL;
+    const Constant *Initializer = nullptr;
     if (GVar->hasInitializer())
       Initializer = GVar->getInitializer();
-    const ConstantInt *CI = NULL;
+    const ConstantInt *CI = nullptr;
     if (Initializer)
       CI = dyn_cast<ConstantInt>(Initializer);
     if (CI) {
@@ -1175,7 +1465,7 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar,
       return;
   }
 
-  const Function *demotedFunc = 0;
+  const Function *demotedFunc = nullptr;
   if (!processDemoted && canDemoteGlobalVar(GVar, demotedFunc)) {
     O << "// " << GVar->getName().str() << " has been demoted\n";
     if (localDecls.find(demotedFunc) != localDecls.end())
@@ -1347,7 +1637,7 @@ NVPTXAsmPrinter::getPTXFundamentalTypeStr(const Type *Ty, bool useB4PTR) const {
       return "u32";
   }
   llvm_unreachable("unexpected type");
-  return NULL;
+  return nullptr;
 }
 
 void NVPTXAsmPrinter::emitPTXGlobalVariable(const GlobalVariable *GVar,
@@ -1495,19 +1785,33 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) {
     first = false;
 
     // Handle image/sampler parameters
-    if (llvm::isSampler(*I) || llvm::isImage(*I)) {
-      if (llvm::isImage(*I)) {
-        std::string sname = I->getName();
-        if (llvm::isImageWriteOnly(*I))
-          O << "\t.param .surfref " << *getSymbol(F) << "_param_"
-            << paramIndex;
-        else // Default image is read_only
-          O << "\t.param .texref " << *getSymbol(F) << "_param_"
-            << paramIndex;
-      } else // Should be llvm::isSampler(*I)
-        O << "\t.param .samplerref " << *getSymbol(F) << "_param_"
-          << paramIndex;
-      continue;
+    if (isKernelFunction(*F)) {
+      if (isSampler(*I) || isImage(*I)) {
+        if (isImage(*I)) {
+          std::string sname = I->getName();
+          if (isImageWriteOnly(*I) || isImageReadWrite(*I)) {
+            if (nvptxSubtarget.hasImageHandles())
+              O << "\t.param .u64 .ptr .surfref ";
+            else
+              O << "\t.param .surfref ";
+            O << *CurrentFnSym << "_param_" << paramIndex;
+          }
+          else { // Default image is read_only
+            if (nvptxSubtarget.hasImageHandles())
+              O << "\t.param .u64 .ptr .texref ";
+            else
+              O << "\t.param .texref ";
+            O << *CurrentFnSym << "_param_" << paramIndex;
+          }
+        } else {
+          if (nvptxSubtarget.hasImageHandles())
+            O << "\t.param .u64 .ptr .samplerref ";
+          else
+            O << "\t.param .samplerref ";
+          O << *CurrentFnSym << "_param_" << paramIndex;
+        }
+        continue;
+      }
     }
 
     if (PAL.hasAttribute(paramIndex + 1, Attribute::ByVal) == false) {
@@ -1752,13 +2056,35 @@ void NVPTXAsmPrinter::printScalarConstant(const Constant *CPV, raw_ostream &O) {
     return;
   }
   if (const GlobalValue *GVar = dyn_cast<GlobalValue>(CPV)) {
-    O << *getSymbol(GVar);
+    PointerType *PTy = dyn_cast<PointerType>(GVar->getType());
+    bool IsNonGenericPointer = false;
+    if (PTy && PTy->getAddressSpace() != 0) {
+      IsNonGenericPointer = true;
+    }
+    if (EmitGeneric && !isa<Function>(CPV) && !IsNonGenericPointer) {
+      O << "generic(";
+      O << *getSymbol(GVar);
+      O << ")";
+    } else {
+      O << *getSymbol(GVar);
+    }
     return;
   }
   if (const ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(CPV)) {
     const Value *v = Cexpr->stripPointerCasts();
+    PointerType *PTy = dyn_cast<PointerType>(Cexpr->getType());
+    bool IsNonGenericPointer = false;
+    if (PTy && PTy->getAddressSpace() != 0) {
+      IsNonGenericPointer = true;
+    }
     if (const GlobalValue *GVar = dyn_cast<GlobalValue>(v)) {
-      O << *getSymbol(GVar);
+      if (EmitGeneric && !isa<Function>(v) && !IsNonGenericPointer) {
+        O << "generic(";
+        O << *getSymbol(GVar);
+        O << ")";
+      } else {
+        O << *getSymbol(GVar);
+      }
       return;
     } else {
       O << *LowerConstant(CPV, *this);
@@ -2121,7 +2447,7 @@ void NVPTXAsmPrinter::emitSrcInText(StringRef filename, unsigned line) {
 }
 
 LineReader *NVPTXAsmPrinter::getReader(std::string filename) {
-  if (reader == NULL) {
+  if (!reader) {
     reader = new LineReader(filename);
   }
 
diff --git a/lib/Target/NVPTX/NVPTXAsmPrinter.h b/lib/Target/NVPTX/NVPTXAsmPrinter.h
index 7162420..a9f9bdd 100644
--- a/lib/Target/NVPTX/NVPTXAsmPrinter.h
+++ b/lib/Target/NVPTX/NVPTXAsmPrinter.h
@@ -96,6 +96,7 @@ class LLVM_LIBRARY_VISIBILITY NVPTXAsmPrinter : public AsmPrinter {
     unsigned curpos;
     raw_ostream &O;
     NVPTXAsmPrinter &AP;
+    bool EmitGeneric;
 
   public:
     AggBuffer(unsigned _size, raw_ostream &_O, NVPTXAsmPrinter &_AP)
@@ -104,6 +105,7 @@ class LLVM_LIBRARY_VISIBILITY NVPTXAsmPrinter : public AsmPrinter {
       size = _size;
       curpos = 0;
       numSymbols = 0;
+      EmitGeneric = AP.EmitGeneric;
     }
     ~AggBuffer() { delete[] buffer; }
     unsigned addBytes(unsigned char *Ptr, int Num, int Bytes) {
@@ -155,7 +157,18 @@ class LLVM_LIBRARY_VISIBILITY NVPTXAsmPrinter : public AsmPrinter {
             const Value *v = Symbols[nSym];
             if (const GlobalValue *GVar = dyn_cast<GlobalValue>(v)) {
               MCSymbol *Name = AP.getSymbol(GVar);
-              O << *Name;
+              PointerType *PTy = dyn_cast<PointerType>(GVar->getType());
+              bool IsNonGenericPointer = false;
+              if (PTy && PTy->getAddressSpace() != 0) {
+                IsNonGenericPointer = true;
+              }
+              if (EmitGeneric && !isa<Function>(v) && !IsNonGenericPointer) {
+                O << "generic(";
+                O << *Name;
+                O << ")";
+              } else {
+                O << *Name;
+              }
             } else if (const ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(v)) {
               O << *nvptx::LowerConstant(Cexpr, AP);
             } else
@@ -176,31 +189,31 @@ class LLVM_LIBRARY_VISIBILITY NVPTXAsmPrinter : public AsmPrinter {
 
   friend class AggBuffer;
 
-  virtual void emitSrcInText(StringRef filename, unsigned line);
+  void emitSrcInText(StringRef filename, unsigned line);
 
 private:
-  virtual const char *getPassName() const { return "NVPTX Assembly Printer"; }
+  const char *getPassName() const override { return "NVPTX Assembly Printer"; }
 
   const Function *F;
   std::string CurrentFnName;
 
-  void EmitFunctionEntryLabel();
-  void EmitFunctionBodyStart();
-  void EmitFunctionBodyEnd();
-  void emitImplicitDef(const MachineInstr *MI) const;
+  void EmitFunctionEntryLabel() override;
+  void EmitFunctionBodyStart() override;
+  void EmitFunctionBodyEnd() override;
+  void emitImplicitDef(const MachineInstr *MI) const override;
 
-  void EmitInstruction(const MachineInstr *);
+  void EmitInstruction(const MachineInstr *) override;
   void lowerToMCInst(const MachineInstr *MI, MCInst &OutMI);
   bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp);
-  MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol);
+  MCOperand GetSymbolRef(const MCSymbol *Symbol);
   unsigned encodeVirtualRegister(unsigned Reg);
 
-  void EmitAlignment(unsigned NumBits, const GlobalValue *GV = 0) const {}
+  void EmitAlignment(unsigned NumBits, const GlobalValue *GV = nullptr) const {}
 
   void printVecModifiedImmediate(const MachineOperand &MO, const char *Modifier,
                                  raw_ostream &O);
   void printMemOperand(const MachineInstr *MI, int opNum, raw_ostream &O,
-                       const char *Modifier = 0);
+                       const char *Modifier = nullptr);
   void printImplicitDef(const MachineInstr *MI, raw_ostream &O) const;
   void printModuleLevelGV(const GlobalVariable *GVar, raw_ostream &O,
                           bool = false);
@@ -221,15 +234,15 @@ private:
   void printReturnValStr(const MachineFunction &MF, raw_ostream &O);
   bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
                        unsigned AsmVariant, const char *ExtraCode,
-                       raw_ostream &);
+                       raw_ostream &) override;
   void printOperand(const MachineInstr *MI, int opNum, raw_ostream &O,
-                    const char *Modifier = 0);
+                    const char *Modifier = nullptr);
   bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
                              unsigned AsmVariant, const char *ExtraCode,
-                             raw_ostream &);
+                             raw_ostream &) override;
 protected:
-  bool doInitialization(Module &M);
-  bool doFinalization(Module &M);
+  bool doInitialization(Module &M) override;
+  bool doFinalization(Module &M) override;
 
 private:
   std::string CurrentBankselLabelInBasicBlock;
@@ -274,14 +287,33 @@ private:
   static const char *getRegisterName(unsigned RegNo);
   void emitDemotedVars(const Function *, raw_ostream &);
 
+  bool lowerImageHandleOperand(const MachineInstr *MI, unsigned OpNo,
+                               MCOperand &MCOp);
+  void lowerImageHandleSymbol(unsigned Index, MCOperand &MCOp);
+
   LineReader *reader;
   LineReader *getReader(std::string);
+
+  // Used to control the need to emit .generic() in the initializer of
+  // module scope variables.
+  // Although ptx supports the hybrid mode like the following,
+  //    .global .u32 a;
+  //    .global .u32 b;
+  //    .global .u32 addr[] = {a, generic(b)}
+  // we have difficulty representing the difference in the NVVM IR.
+  //
+  // Since the address value should always be generic in CUDA C and always
+  // be specific in OpenCL, we use this simple control here.
+  //
+  bool EmitGeneric;
+
 public:
   NVPTXAsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
       : AsmPrinter(TM, Streamer),
         nvptxSubtarget(TM.getSubtarget<NVPTXSubtarget>()) {
     CurrentBankselLabelInBasicBlock = "";
-    reader = NULL;
+    reader = nullptr;
+    EmitGeneric = (nvptxSubtarget.getDrvInterface() == NVPTX::CUDA);
   }
 
   ~NVPTXAsmPrinter() {
diff --git a/lib/Target/NVPTX/NVPTXAssignValidGlobalNames.cpp b/lib/Target/NVPTX/NVPTXAssignValidGlobalNames.cpp
index 158c482..962b123 100644
--- a/lib/Target/NVPTX/NVPTXAssignValidGlobalNames.cpp
+++ b/lib/Target/NVPTX/NVPTXAssignValidGlobalNames.cpp
@@ -33,7 +33,7 @@ public:
   static char ID;
   NVPTXAssignValidGlobalNames() : ModulePass(ID) {}
 
-  virtual bool runOnModule(Module &M);
+  bool runOnModule(Module &M) override;
 
   /// \brief Clean up the name to remove symbols invalid in PTX.
   std::string cleanUpName(StringRef Name);
diff --git a/lib/Target/NVPTX/NVPTXFavorNonGenericAddrSpaces.cpp b/lib/Target/NVPTX/NVPTXFavorNonGenericAddrSpaces.cpp
new file mode 100644
index 0000000..f3a095d
--- /dev/null
+++ b/lib/Target/NVPTX/NVPTXFavorNonGenericAddrSpaces.cpp
@@ -0,0 +1,195 @@
+//===-- NVPTXFavorNonGenericAddrSpace.cpp - ---------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// When a load/store accesses the generic address space, checks whether the
+// address is casted from a non-generic address space. If so, remove this
+// addrspacecast because accessing non-generic address spaces is typically
+// faster. Besides seeking addrspacecasts, this optimization also traces into
+// the base pointer of a GEP.
+//
+// For instance, the code below loads a float from an array allocated in
+// addrspace(3).
+//
+// %0 = addrspacecast [10 x float] addrspace(3)* @a to [10 x float]*
+// %1 = gep [10 x float]* %0, i64 0, i64 %i
+// %2 = load float* %1 ; emits ld.f32
+//
+// First, function hoistAddrSpaceCastFromGEP reorders the addrspacecast
+// and the GEP to expose more optimization opportunities to function
+// optimizeMemoryInst. The intermediate code looks like:
+//
+// %0 = gep [10 x float] addrspace(3)* @a, i64 0, i64 %i
+// %1 = addrspacecast float addrspace(3)* %0 to float*
+// %2 = load float* %1 ; still emits ld.f32, but will be optimized shortly
+//
+// Then, function optimizeMemoryInstruction detects a load from addrspacecast'ed
+// generic pointers, and folds the load and the addrspacecast into a load from
+// the original address space. The final code looks like:
+//
+// %0 = gep [10 x float] addrspace(3)* @a, i64 0, i64 %i
+// %2 = load float addrspace(3)* %0 ; emits ld.shared.f32
+//
+// This pass may remove an addrspacecast in a different BB. Therefore, we
+// implement it as a FunctionPass.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTX.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Support/CommandLine.h"
+
+using namespace llvm;
+
+// An option to disable this optimization. Enable it by default.
+static cl::opt<bool> DisableFavorNonGeneric(
+  "disable-nvptx-favor-non-generic",
+  cl::init(false),
+  cl::desc("Do not convert generic address space usage "
+           "to non-generic address space usage"),
+  cl::Hidden);
+
+namespace {
+/// \brief NVPTXFavorNonGenericAddrSpaces
+class NVPTXFavorNonGenericAddrSpaces : public FunctionPass {
+public:
+  static char ID;
+  NVPTXFavorNonGenericAddrSpaces() : FunctionPass(ID) {}
+
+  bool runOnFunction(Function &F) override;
+
+  /// Optimizes load/store instructions. Idx is the index of the pointer operand
+  /// (0 for load, and 1 for store). Returns true if it changes anything.
+  bool optimizeMemoryInstruction(Instruction *I, unsigned Idx);
+  /// Transforms "gep (addrspacecast X), indices" into "addrspacecast (gep X,
+  /// indices)".  This reordering exposes to optimizeMemoryInstruction more
+  /// optimization opportunities on loads and stores. Returns true if it changes
+  /// the program.
+  bool hoistAddrSpaceCastFromGEP(GEPOperator *GEP);
+};
+}
+
+char NVPTXFavorNonGenericAddrSpaces::ID = 0;
+
+namespace llvm {
+void initializeNVPTXFavorNonGenericAddrSpacesPass(PassRegistry &);
+}
+INITIALIZE_PASS(NVPTXFavorNonGenericAddrSpaces, "nvptx-favor-non-generic",
+                "Remove unnecessary non-generic-to-generic addrspacecasts",
+                false, false)
+
+// Decides whether removing Cast is valid and beneficial. Cast can be an
+// instruction or a constant expression.
+static bool IsEliminableAddrSpaceCast(Operator *Cast) {
+  // Returns false if not even an addrspacecast.
+  if (Cast->getOpcode() != Instruction::AddrSpaceCast)
+    return false;
+
+  Value *Src = Cast->getOperand(0);
+  PointerType *SrcTy = cast<PointerType>(Src->getType());
+  PointerType *DestTy = cast<PointerType>(Cast->getType());
+  // TODO: For now, we only handle the case where the addrspacecast only changes
+  // the address space but not the type. If the type also changes, we could
+  // still get rid of the addrspacecast by adding an extra bitcast, but we
+  // rarely see such scenarios.
+  if (SrcTy->getElementType() != DestTy->getElementType())
+    return false;
+
+  // Checks whether the addrspacecast is from a non-generic address space to the
+  // generic address space.
+  return (SrcTy->getAddressSpace() != AddressSpace::ADDRESS_SPACE_GENERIC &&
+          DestTy->getAddressSpace() == AddressSpace::ADDRESS_SPACE_GENERIC);
+}
+
+bool NVPTXFavorNonGenericAddrSpaces::hoistAddrSpaceCastFromGEP(
+    GEPOperator *GEP) {
+  Operator *Cast = dyn_cast<Operator>(GEP->getPointerOperand());
+  if (!Cast)
+    return false;
+
+  if (!IsEliminableAddrSpaceCast(Cast))
+    return false;
+
+  SmallVector<Value *, 8> Indices(GEP->idx_begin(), GEP->idx_end());
+  if (Instruction *GEPI = dyn_cast<Instruction>(GEP)) {
+    // %1 = gep (addrspacecast X), indices
+    // =>
+    // %0 = gep X, indices
+    // %1 = addrspacecast %0
+    GetElementPtrInst *NewGEPI = GetElementPtrInst::Create(Cast->getOperand(0),
+                                                           Indices,
+                                                           GEP->getName(),
+                                                           GEPI);
+    NewGEPI->setIsInBounds(GEP->isInBounds());
+    GEP->replaceAllUsesWith(
+        new AddrSpaceCastInst(NewGEPI, GEP->getType(), "", GEPI));
+  } else {
+    // GEP is a constant expression.
+    Constant *NewGEPCE = ConstantExpr::getGetElementPtr(
+        cast<Constant>(Cast->getOperand(0)),
+        Indices,
+        GEP->isInBounds());
+    GEP->replaceAllUsesWith(
+        ConstantExpr::getAddrSpaceCast(NewGEPCE, GEP->getType()));
+  }
+
+  return true;
+}
+
+bool NVPTXFavorNonGenericAddrSpaces::optimizeMemoryInstruction(Instruction *MI,
+                                                               unsigned Idx) {
+  // If the pointer operand is a GEP, hoist the addrspacecast if any from the
+  // GEP to expose more optimization opportunites.
+  if (GEPOperator *GEP = dyn_cast<GEPOperator>(MI->getOperand(Idx))) {
+    hoistAddrSpaceCastFromGEP(GEP);
+  }
+
+  // load/store (addrspacecast X) => load/store X if shortcutting the
+  // addrspacecast is valid and can improve performance.
+  //
+  // e.g.,
+  // %1 = addrspacecast float addrspace(3)* %0 to float*
+  // %2 = load float* %1
+  // ->
+  // %2 = load float addrspace(3)* %0
+  //
+  // Note: the addrspacecast can also be a constant expression.
+  if (Operator *Cast = dyn_cast<Operator>(MI->getOperand(Idx))) {
+    if (IsEliminableAddrSpaceCast(Cast)) {
+      MI->setOperand(Idx, Cast->getOperand(0));
+      return true;
+    }
+  }
+
+  return false;
+}
+
+bool NVPTXFavorNonGenericAddrSpaces::runOnFunction(Function &F) {
+  if (DisableFavorNonGeneric)
+    return false;
+
+  bool Changed = false;
+  for (Function::iterator B = F.begin(), BE = F.end(); B != BE; ++B) {
+    for (BasicBlock::iterator I = B->begin(), IE = B->end(); I != IE; ++I) {
+      if (isa<LoadInst>(I)) {
+        // V = load P
+        Changed |= optimizeMemoryInstruction(I, 0);
+      } else if (isa<StoreInst>(I)) {
+        // store V, P
+        Changed |= optimizeMemoryInstruction(I, 1);
+      }
+    }
+  }
+  return Changed;
+}
+
+FunctionPass *llvm::createNVPTXFavorNonGenericAddrSpacesPass() {
+  return new NVPTXFavorNonGenericAddrSpaces();
+}
diff --git a/lib/Target/NVPTX/NVPTXFrameLowering.h b/lib/Target/NVPTX/NVPTXFrameLowering.h
index 819f1dd..2ae6d72 100644
--- a/lib/Target/NVPTX/NVPTXFrameLowering.h
+++ b/lib/Target/NVPTX/NVPTXFrameLowering.h
@@ -28,13 +28,13 @@ public:
       : TargetFrameLowering(TargetFrameLowering::StackGrowsUp, 8, 0), tm(_tm),
         is64bit(_is64bit) {}
 
-  virtual bool hasFP(const MachineFunction &MF) const;
-  virtual void emitPrologue(MachineFunction &MF) const;
-  virtual void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;
+  bool hasFP(const MachineFunction &MF) const override;
+  void emitPrologue(MachineFunction &MF) const override;
+  void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
 
   void eliminateCallFramePseudoInstr(MachineFunction &MF,
-                                     MachineBasicBlock &MBB,
-                                     MachineBasicBlock::iterator I) const;
+                                  MachineBasicBlock &MBB,
+                                  MachineBasicBlock::iterator I) const override;
 };
 
 } // End llvm namespace
diff --git a/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp b/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp
index 45f0734..023dd5e 100644
--- a/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp
+++ b/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp
@@ -40,10 +40,9 @@ public:
 
   GenericToNVVM() : ModulePass(ID) {}
 
-  virtual bool runOnModule(Module &M);
+  bool runOnModule(Module &M) override;
 
-  virtual void getAnalysisUsage(AnalysisUsage &AU) const {
-  }
+  void getAnalysisUsage(AnalysisUsage &AU) const override {}
 
 private:
   Value *getOrInsertCVTA(Module *M, Function *F, GlobalVariable *GV,
@@ -88,7 +87,8 @@ bool GenericToNVVM::runOnModule(Module &M) {
         !GV->getName().startswith("llvm.")) {
       GlobalVariable *NewGV = new GlobalVariable(
           M, GV->getType()->getElementType(), GV->isConstant(),
-          GV->getLinkage(), GV->hasInitializer() ? GV->getInitializer() : NULL,
+          GV->getLinkage(),
+          GV->hasInitializer() ? GV->getInitializer() : nullptr,
           "", GV, GV->getThreadLocalMode(), llvm::ADDRESS_SPACE_GLOBAL);
       NewGV->copyAttributesFrom(GV);
       GVMap[GV] = NewGV;
@@ -162,7 +162,7 @@ Value *GenericToNVVM::getOrInsertCVTA(Module *M, Function *F,
                                       GlobalVariable *GV,
                                       IRBuilder<> &Builder) {
   PointerType *GVType = GV->getType();
-  Value *CVTA = NULL;
+  Value *CVTA = nullptr;
 
   // See if the address space conversion requires the operand to be bitcast
   // to i8 addrspace(n)* first.
diff --git a/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index bd08d2d..cd30880 100644
--- a/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -20,11 +20,10 @@
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetIntrinsicInfo.h"
 
-#undef DEBUG_TYPE
-#define DEBUG_TYPE "nvptx-isel"
-
 using namespace llvm;
 
+#define DEBUG_TYPE "nvptx-isel"
+
 static cl::opt<int>
 FMAContractLevel("nvptx-fma-level", cl::ZeroOrMore, cl::Hidden,
                  cl::desc("NVPTX Specific: FMA contraction (0: don't do it"
@@ -120,10 +119,10 @@ SDNode *NVPTXDAGToDAGISel::Select(SDNode *N) {
 
   if (N->isMachineOpcode()) {
     N->setNodeId(-1);
-    return NULL; // Already selected.
+    return nullptr; // Already selected.
   }
 
-  SDNode *ResNode = NULL;
+  SDNode *ResNode = nullptr;
   switch (N->getOpcode()) {
   case ISD::LOAD:
     ResNode = SelectLoad(N);
@@ -162,6 +161,98 @@ SDNode *NVPTXDAGToDAGISel::Select(SDNode *N) {
   case NVPTXISD::StoreParamU32:
     ResNode = SelectStoreParam(N);
     break;
+  case ISD::INTRINSIC_WO_CHAIN:
+    ResNode = SelectIntrinsicNoChain(N);
+    break;
+  case NVPTXISD::Tex1DFloatI32:
+  case NVPTXISD::Tex1DFloatFloat:
+  case NVPTXISD::Tex1DFloatFloatLevel:
+  case NVPTXISD::Tex1DFloatFloatGrad:
+  case NVPTXISD::Tex1DI32I32:
+  case NVPTXISD::Tex1DI32Float:
+  case NVPTXISD::Tex1DI32FloatLevel:
+  case NVPTXISD::Tex1DI32FloatGrad:
+  case NVPTXISD::Tex1DArrayFloatI32:
+  case NVPTXISD::Tex1DArrayFloatFloat:
+  case NVPTXISD::Tex1DArrayFloatFloatLevel:
+  case NVPTXISD::Tex1DArrayFloatFloatGrad:
+  case NVPTXISD::Tex1DArrayI32I32:
+  case NVPTXISD::Tex1DArrayI32Float:
+  case NVPTXISD::Tex1DArrayI32FloatLevel:
+  case NVPTXISD::Tex1DArrayI32FloatGrad:
+  case NVPTXISD::Tex2DFloatI32:
+  case NVPTXISD::Tex2DFloatFloat:
+  case NVPTXISD::Tex2DFloatFloatLevel:
+  case NVPTXISD::Tex2DFloatFloatGrad:
+  case NVPTXISD::Tex2DI32I32:
+  case NVPTXISD::Tex2DI32Float:
+  case NVPTXISD::Tex2DI32FloatLevel:
+  case NVPTXISD::Tex2DI32FloatGrad:
+  case NVPTXISD::Tex2DArrayFloatI32:
+  case NVPTXISD::Tex2DArrayFloatFloat:
+  case NVPTXISD::Tex2DArrayFloatFloatLevel:
+  case NVPTXISD::Tex2DArrayFloatFloatGrad:
+  case NVPTXISD::Tex2DArrayI32I32:
+  case NVPTXISD::Tex2DArrayI32Float:
+  case NVPTXISD::Tex2DArrayI32FloatLevel:
+  case NVPTXISD::Tex2DArrayI32FloatGrad:
+  case NVPTXISD::Tex3DFloatI32:
+  case NVPTXISD::Tex3DFloatFloat:
+  case NVPTXISD::Tex3DFloatFloatLevel:
+  case NVPTXISD::Tex3DFloatFloatGrad:
+  case NVPTXISD::Tex3DI32I32:
+  case NVPTXISD::Tex3DI32Float:
+  case NVPTXISD::Tex3DI32FloatLevel:
+  case NVPTXISD::Tex3DI32FloatGrad:
+    ResNode = SelectTextureIntrinsic(N);
+    break;
+  case NVPTXISD::Suld1DI8Trap:
+  case NVPTXISD::Suld1DI16Trap:
+  case NVPTXISD::Suld1DI32Trap:
+  case NVPTXISD::Suld1DV2I8Trap:
+  case NVPTXISD::Suld1DV2I16Trap:
+  case NVPTXISD::Suld1DV2I32Trap:
+  case NVPTXISD::Suld1DV4I8Trap:
+  case NVPTXISD::Suld1DV4I16Trap:
+  case NVPTXISD::Suld1DV4I32Trap:
+  case NVPTXISD::Suld1DArrayI8Trap:
+  case NVPTXISD::Suld1DArrayI16Trap:
+  case NVPTXISD::Suld1DArrayI32Trap:
+  case NVPTXISD::Suld1DArrayV2I8Trap:
+  case NVPTXISD::Suld1DArrayV2I16Trap:
+  case NVPTXISD::Suld1DArrayV2I32Trap:
+  case NVPTXISD::Suld1DArrayV4I8Trap:
+  case NVPTXISD::Suld1DArrayV4I16Trap:
+  case NVPTXISD::Suld1DArrayV4I32Trap:
+  case NVPTXISD::Suld2DI8Trap:
+  case NVPTXISD::Suld2DI16Trap:
+  case NVPTXISD::Suld2DI32Trap:
+  case NVPTXISD::Suld2DV2I8Trap:
+  case NVPTXISD::Suld2DV2I16Trap:
+  case NVPTXISD::Suld2DV2I32Trap:
+  case NVPTXISD::Suld2DV4I8Trap:
+  case NVPTXISD::Suld2DV4I16Trap:
+  case NVPTXISD::Suld2DV4I32Trap:
+  case NVPTXISD::Suld2DArrayI8Trap:
+  case NVPTXISD::Suld2DArrayI16Trap:
+  case NVPTXISD::Suld2DArrayI32Trap:
+  case NVPTXISD::Suld2DArrayV2I8Trap:
+  case NVPTXISD::Suld2DArrayV2I16Trap:
+  case NVPTXISD::Suld2DArrayV2I32Trap:
+  case NVPTXISD::Suld2DArrayV4I8Trap:
+  case NVPTXISD::Suld2DArrayV4I16Trap:
+  case NVPTXISD::Suld2DArrayV4I32Trap:
+  case NVPTXISD::Suld3DI8Trap:
+  case NVPTXISD::Suld3DI16Trap:
+  case NVPTXISD::Suld3DI32Trap:
+  case NVPTXISD::Suld3DV2I8Trap:
+  case NVPTXISD::Suld3DV2I16Trap:
+  case NVPTXISD::Suld3DV2I32Trap:
+  case NVPTXISD::Suld3DV4I8Trap:
+  case NVPTXISD::Suld3DV4I16Trap:
+  case NVPTXISD::Suld3DV4I32Trap:
+    ResNode = SelectSurfaceIntrinsic(N);
+    break;
   case ISD::ADDRSPACECAST:
     ResNode = SelectAddrSpaceCast(N);
     break;
@@ -175,7 +266,7 @@ SDNode *NVPTXDAGToDAGISel::Select(SDNode *N) {
 
 static unsigned int getCodeAddrSpace(MemSDNode *N,
                                      const NVPTXSubtarget &Subtarget) {
-  const Value *Src = N->getSrcValue();
+  const Value *Src = N->getMemOperand()->getValue();
 
   if (!Src)
     return NVPTX::PTXLdStInstCode::GENERIC;
@@ -194,6 +285,24 @@ static unsigned int getCodeAddrSpace(MemSDNode *N,
   return NVPTX::PTXLdStInstCode::GENERIC;
 }
 
+SDNode *NVPTXDAGToDAGISel::SelectIntrinsicNoChain(SDNode *N) {
+  unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
+  switch (IID) {
+  default:
+    return nullptr;
+  case Intrinsic::nvvm_texsurf_handle_internal:
+    return SelectTexSurfHandle(N);
+  }
+}
+
+SDNode *NVPTXDAGToDAGISel::SelectTexSurfHandle(SDNode *N) {
+  // Op 0 is the intrinsic ID
+  SDValue Wrapper = N->getOperand(1);
+  SDValue GlobalVal = Wrapper.getOperand(0);
+  return CurDAG->getMachineNode(NVPTX::texsurf_handles, SDLoc(N), MVT::i64,
+                                GlobalVal);
+}
+
 SDNode *NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
   SDValue Src = N->getOperand(0);
   AddrSpaceCastSDNode *CastN = cast<AddrSpaceCastSDNode>(N);
@@ -258,14 +367,14 @@ SDNode *NVPTXDAGToDAGISel::SelectLoad(SDNode *N) {
   SDLoc dl(N);
   LoadSDNode *LD = cast<LoadSDNode>(N);
   EVT LoadedVT = LD->getMemoryVT();
-  SDNode *NVPTXLD = NULL;
+  SDNode *NVPTXLD = nullptr;
 
   // do not support pre/post inc/dec
   if (LD->isIndexed())
-    return NULL;
+    return nullptr;
 
   if (!LoadedVT.isSimple())
-    return NULL;
+    return nullptr;
 
   // Address Space Setting
   unsigned int codeAddrSpace = getCodeAddrSpace(LD, Subtarget);
@@ -288,7 +397,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoad(SDNode *N) {
     else if (num == 4)
       vecType = NVPTX::PTXLdStInstCode::V4;
     else
-      return NULL;
+      return nullptr;
   }
 
   // Type Setting: fromType + fromTypeWidth
@@ -337,7 +446,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoad(SDNode *N) {
       Opcode = NVPTX::LD_f64_avar;
       break;
     default:
-      return NULL;
+      return nullptr;
     }
     SDValue Ops[] = { getI32Imm(isVolatile), getI32Imm(codeAddrSpace),
                       getI32Imm(vecType), getI32Imm(fromType),
@@ -366,7 +475,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoad(SDNode *N) {
       Opcode = NVPTX::LD_f64_asi;
       break;
     default:
-      return NULL;
+      return nullptr;
     }
     SDValue Ops[] = { getI32Imm(isVolatile), getI32Imm(codeAddrSpace),
                       getI32Imm(vecType), getI32Imm(fromType),
@@ -396,7 +505,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoad(SDNode *N) {
         Opcode = NVPTX::LD_f64_ari_64;
         break;
       default:
-        return NULL;
+        return nullptr;
       }
     } else {
       switch (TargetVT) {
@@ -419,7 +528,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoad(SDNode *N) {
         Opcode = NVPTX::LD_f64_ari;
         break;
       default:
-        return NULL;
+        return nullptr;
       }
     }
     SDValue Ops[] = { getI32Imm(isVolatile), getI32Imm(codeAddrSpace),
@@ -448,7 +557,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoad(SDNode *N) {
         Opcode = NVPTX::LD_f64_areg_64;
         break;
       default:
-        return NULL;
+        return nullptr;
       }
     } else {
       switch (TargetVT) {
@@ -471,7 +580,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoad(SDNode *N) {
         Opcode = NVPTX::LD_f64_areg;
         break;
       default:
-        return NULL;
+        return nullptr;
       }
     }
     SDValue Ops[] = { getI32Imm(isVolatile), getI32Imm(codeAddrSpace),
@@ -480,7 +589,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoad(SDNode *N) {
     NVPTXLD = CurDAG->getMachineNode(Opcode, dl, TargetVT, MVT::Other, Ops);
   }
 
-  if (NVPTXLD != NULL) {
+  if (NVPTXLD) {
     MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1);
     MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand();
     cast<MachineSDNode>(NVPTXLD)->setMemRefs(MemRefs0, MemRefs0 + 1);
@@ -501,7 +610,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) {
   EVT LoadedVT = MemSD->getMemoryVT();
 
   if (!LoadedVT.isSimple())
-    return NULL;
+    return nullptr;
 
   // Address Space Setting
   unsigned int CodeAddrSpace = getCodeAddrSpace(MemSD, Subtarget);
@@ -547,7 +656,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) {
     VecType = NVPTX::PTXLdStInstCode::V4;
     break;
   default:
-    return NULL;
+    return nullptr;
   }
 
   EVT EltVT = N->getValueType(0);
@@ -555,11 +664,11 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) {
   if (SelectDirectAddr(Op1, Addr)) {
     switch (N->getOpcode()) {
     default:
-      return NULL;
+      return nullptr;
     case NVPTXISD::LoadV2:
       switch (EltVT.getSimpleVT().SimpleTy) {
       default:
-        return NULL;
+        return nullptr;
       case MVT::i8:
         Opcode = NVPTX::LDV_i8_v2_avar;
         break;
@@ -583,7 +692,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) {
     case NVPTXISD::LoadV4:
       switch (EltVT.getSimpleVT().SimpleTy) {
       default:
-        return NULL;
+        return nullptr;
       case MVT::i8:
         Opcode = NVPTX::LDV_i8_v4_avar;
         break;
@@ -609,11 +718,11 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) {
                  : SelectADDRsi(Op1.getNode(), Op1, Base, Offset)) {
     switch (N->getOpcode()) {
     default:
-      return NULL;
+      return nullptr;
     case NVPTXISD::LoadV2:
       switch (EltVT.getSimpleVT().SimpleTy) {
       default:
-        return NULL;
+        return nullptr;
       case MVT::i8:
         Opcode = NVPTX::LDV_i8_v2_asi;
         break;
@@ -637,7 +746,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) {
     case NVPTXISD::LoadV4:
       switch (EltVT.getSimpleVT().SimpleTy) {
       default:
-        return NULL;
+        return nullptr;
       case MVT::i8:
         Opcode = NVPTX::LDV_i8_v4_asi;
         break;
@@ -664,11 +773,11 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) {
     if (Subtarget.is64Bit()) {
       switch (N->getOpcode()) {
       default:
-        return NULL;
+        return nullptr;
       case NVPTXISD::LoadV2:
         switch (EltVT.getSimpleVT().SimpleTy) {
         default:
-          return NULL;
+          return nullptr;
         case MVT::i8:
           Opcode = NVPTX::LDV_i8_v2_ari_64;
           break;
@@ -692,7 +801,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) {
       case NVPTXISD::LoadV4:
         switch (EltVT.getSimpleVT().SimpleTy) {
         default:
-          return NULL;
+          return nullptr;
         case MVT::i8:
           Opcode = NVPTX::LDV_i8_v4_ari_64;
           break;
@@ -711,11 +820,11 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) {
     } else {
       switch (N->getOpcode()) {
       default:
-        return NULL;
+        return nullptr;
       case NVPTXISD::LoadV2:
         switch (EltVT.getSimpleVT().SimpleTy) {
         default:
-          return NULL;
+          return nullptr;
         case MVT::i8:
           Opcode = NVPTX::LDV_i8_v2_ari;
           break;
@@ -739,7 +848,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) {
       case NVPTXISD::LoadV4:
         switch (EltVT.getSimpleVT().SimpleTy) {
         default:
-          return NULL;
+          return nullptr;
         case MVT::i8:
           Opcode = NVPTX::LDV_i8_v4_ari;
           break;
@@ -766,11 +875,11 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) {
     if (Subtarget.is64Bit()) {
       switch (N->getOpcode()) {
       default:
-        return NULL;
+        return nullptr;
       case NVPTXISD::LoadV2:
         switch (EltVT.getSimpleVT().SimpleTy) {
         default:
-          return NULL;
+          return nullptr;
         case MVT::i8:
           Opcode = NVPTX::LDV_i8_v2_areg_64;
           break;
@@ -794,7 +903,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) {
       case NVPTXISD::LoadV4:
         switch (EltVT.getSimpleVT().SimpleTy) {
         default:
-          return NULL;
+          return nullptr;
         case MVT::i8:
           Opcode = NVPTX::LDV_i8_v4_areg_64;
           break;
@@ -813,11 +922,11 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) {
     } else {
       switch (N->getOpcode()) {
       default:
-        return NULL;
+        return nullptr;
       case NVPTXISD::LoadV2:
         switch (EltVT.getSimpleVT().SimpleTy) {
         default:
-          return NULL;
+          return nullptr;
         case MVT::i8:
           Opcode = NVPTX::LDV_i8_v2_areg;
           break;
@@ -841,7 +950,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadVector(SDNode *N) {
       case NVPTXISD::LoadV4:
         switch (EltVT.getSimpleVT().SimpleTy) {
         default:
-          return NULL;
+          return nullptr;
         case MVT::i8:
           Opcode = NVPTX::LDV_i8_v4_areg;
           break;
@@ -887,11 +996,11 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDUVector(SDNode *N) {
   if (SelectDirectAddr(Op1, Addr)) {
     switch (N->getOpcode()) {
     default:
-      return NULL;
+      return nullptr;
     case NVPTXISD::LDGV2:
       switch (EltVT.getSimpleVT().SimpleTy) {
       default:
-        return NULL;
+        return nullptr;
       case MVT::i8:
         Opcode = NVPTX::INT_PTX_LDG_G_v2i8_ELE_avar;
         break;
@@ -915,7 +1024,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDUVector(SDNode *N) {
     case NVPTXISD::LDUV2:
       switch (EltVT.getSimpleVT().SimpleTy) {
       default:
-        return NULL;
+        return nullptr;
       case MVT::i8:
         Opcode = NVPTX::INT_PTX_LDU_G_v2i8_ELE_avar;
         break;
@@ -939,7 +1048,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDUVector(SDNode *N) {
     case NVPTXISD::LDGV4:
       switch (EltVT.getSimpleVT().SimpleTy) {
       default:
-        return NULL;
+        return nullptr;
       case MVT::i8:
         Opcode = NVPTX::INT_PTX_LDG_G_v4i8_ELE_avar;
         break;
@@ -957,7 +1066,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDUVector(SDNode *N) {
     case NVPTXISD::LDUV4:
       switch (EltVT.getSimpleVT().SimpleTy) {
       default:
-        return NULL;
+        return nullptr;
       case MVT::i8:
         Opcode = NVPTX::INT_PTX_LDU_G_v4i8_ELE_avar;
         break;
@@ -975,19 +1084,18 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDUVector(SDNode *N) {
     }
 
     SDValue Ops[] = { Addr, Chain };
-    LD = CurDAG->getMachineNode(Opcode, DL, N->getVTList(),
-                                ArrayRef<SDValue>(Ops, 2));
+    LD = CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops);
   } else if (Subtarget.is64Bit()
                  ? SelectADDRri64(Op1.getNode(), Op1, Base, Offset)
                  : SelectADDRri(Op1.getNode(), Op1, Base, Offset)) {
     if (Subtarget.is64Bit()) {
       switch (N->getOpcode()) {
       default:
-        return NULL;
+        return nullptr;
       case NVPTXISD::LDGV2:
         switch (EltVT.getSimpleVT().SimpleTy) {
         default:
-          return NULL;
+          return nullptr;
         case MVT::i8:
           Opcode = NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari64;
           break;
@@ -1011,7 +1119,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDUVector(SDNode *N) {
       case NVPTXISD::LDUV2:
         switch (EltVT.getSimpleVT().SimpleTy) {
         default:
-          return NULL;
+          return nullptr;
         case MVT::i8:
           Opcode = NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari64;
           break;
@@ -1035,7 +1143,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDUVector(SDNode *N) {
       case NVPTXISD::LDGV4:
         switch (EltVT.getSimpleVT().SimpleTy) {
         default:
-          return NULL;
+          return nullptr;
         case MVT::i8:
           Opcode = NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari64;
           break;
@@ -1053,7 +1161,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDUVector(SDNode *N) {
       case NVPTXISD::LDUV4:
         switch (EltVT.getSimpleVT().SimpleTy) {
         default:
-          return NULL;
+          return nullptr;
         case MVT::i8:
           Opcode = NVPTX::INT_PTX_LDU_G_v4i8_ELE_ari64;
           break;
@@ -1072,11 +1180,11 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDUVector(SDNode *N) {
     } else {
       switch (N->getOpcode()) {
       default:
-        return NULL;
+        return nullptr;
       case NVPTXISD::LDGV2:
         switch (EltVT.getSimpleVT().SimpleTy) {
         default:
-          return NULL;
+          return nullptr;
         case MVT::i8:
           Opcode = NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari32;
           break;
@@ -1100,7 +1208,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDUVector(SDNode *N) {
       case NVPTXISD::LDUV2:
         switch (EltVT.getSimpleVT().SimpleTy) {
         default:
-          return NULL;
+          return nullptr;
         case MVT::i8:
           Opcode = NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari32;
           break;
@@ -1124,7 +1232,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDUVector(SDNode *N) {
       case NVPTXISD::LDGV4:
         switch (EltVT.getSimpleVT().SimpleTy) {
         default:
-          return NULL;
+          return nullptr;
         case MVT::i8:
           Opcode = NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari32;
           break;
@@ -1142,7 +1250,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDUVector(SDNode *N) {
       case NVPTXISD::LDUV4:
         switch (EltVT.getSimpleVT().SimpleTy) {
         default:
-          return NULL;
+          return nullptr;
         case MVT::i8:
           Opcode = NVPTX::INT_PTX_LDU_G_v4i8_ELE_ari32;
           break;
@@ -1162,17 +1270,16 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDUVector(SDNode *N) {
 
     SDValue Ops[] = { Base, Offset, Chain };
 
-    LD = CurDAG->getMachineNode(Opcode, DL, N->getVTList(),
-                                ArrayRef<SDValue>(Ops, 3));
+    LD = CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops);
   } else {
     if (Subtarget.is64Bit()) {
       switch (N->getOpcode()) {
       default:
-        return NULL;
+        return nullptr;
       case NVPTXISD::LDGV2:
         switch (EltVT.getSimpleVT().SimpleTy) {
         default:
-          return NULL;
+          return nullptr;
         case MVT::i8:
           Opcode = NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg64;
           break;
@@ -1196,7 +1303,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDUVector(SDNode *N) {
       case NVPTXISD::LDUV2:
         switch (EltVT.getSimpleVT().SimpleTy) {
         default:
-          return NULL;
+          return nullptr;
         case MVT::i8:
           Opcode = NVPTX::INT_PTX_LDU_G_v2i8_ELE_areg64;
           break;
@@ -1220,7 +1327,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDUVector(SDNode *N) {
       case NVPTXISD::LDGV4:
         switch (EltVT.getSimpleVT().SimpleTy) {
         default:
-          return NULL;
+          return nullptr;
         case MVT::i8:
           Opcode = NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg64;
           break;
@@ -1238,7 +1345,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDUVector(SDNode *N) {
       case NVPTXISD::LDUV4:
         switch (EltVT.getSimpleVT().SimpleTy) {
         default:
-          return NULL;
+          return nullptr;
         case MVT::i8:
           Opcode = NVPTX::INT_PTX_LDU_G_v4i8_ELE_areg64;
           break;
@@ -1257,11 +1364,11 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDUVector(SDNode *N) {
     } else {
       switch (N->getOpcode()) {
       default:
-        return NULL;
+        return nullptr;
       case NVPTXISD::LDGV2:
         switch (EltVT.getSimpleVT().SimpleTy) {
         default:
-          return NULL;
+          return nullptr;
         case MVT::i8:
           Opcode = NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg32;
           break;
@@ -1285,7 +1392,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDUVector(SDNode *N) {
       case NVPTXISD::LDUV2:
         switch (EltVT.getSimpleVT().SimpleTy) {
         default:
-          return NULL;
+          return nullptr;
         case MVT::i8:
           Opcode = NVPTX::INT_PTX_LDU_G_v2i8_ELE_areg32;
           break;
@@ -1309,7 +1416,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDUVector(SDNode *N) {
       case NVPTXISD::LDGV4:
         switch (EltVT.getSimpleVT().SimpleTy) {
         default:
-          return NULL;
+          return nullptr;
         case MVT::i8:
           Opcode = NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg32;
           break;
@@ -1327,7 +1434,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDUVector(SDNode *N) {
       case NVPTXISD::LDUV4:
         switch (EltVT.getSimpleVT().SimpleTy) {
         default:
-          return NULL;
+          return nullptr;
         case MVT::i8:
           Opcode = NVPTX::INT_PTX_LDU_G_v4i8_ELE_areg32;
           break;
@@ -1346,8 +1453,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLDGLDUVector(SDNode *N) {
     }
 
     SDValue Ops[] = { Op1, Chain };
-    LD = CurDAG->getMachineNode(Opcode, DL, N->getVTList(),
-                                ArrayRef<SDValue>(Ops, 2));
+    LD = CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops);
   }
 
   MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1);
@@ -1361,14 +1467,14 @@ SDNode *NVPTXDAGToDAGISel::SelectStore(SDNode *N) {
   SDLoc dl(N);
   StoreSDNode *ST = cast<StoreSDNode>(N);
   EVT StoreVT = ST->getMemoryVT();
-  SDNode *NVPTXST = NULL;
+  SDNode *NVPTXST = nullptr;
 
   // do not support pre/post inc/dec
   if (ST->isIndexed())
-    return NULL;
+    return nullptr;
 
   if (!StoreVT.isSimple())
-    return NULL;
+    return nullptr;
 
   // Address Space Setting
   unsigned int codeAddrSpace = getCodeAddrSpace(ST, Subtarget);
@@ -1391,7 +1497,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStore(SDNode *N) {
     else if (num == 4)
       vecType = NVPTX::PTXLdStInstCode::V4;
     else
-      return NULL;
+      return nullptr;
   }
 
   // Type Setting: toType + toTypeWidth
@@ -1435,7 +1541,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStore(SDNode *N) {
       Opcode = NVPTX::ST_f64_avar;
       break;
     default:
-      return NULL;
+      return nullptr;
     }
     SDValue Ops[] = { N1, getI32Imm(isVolatile), getI32Imm(codeAddrSpace),
                       getI32Imm(vecType), getI32Imm(toType),
@@ -1464,7 +1570,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStore(SDNode *N) {
       Opcode = NVPTX::ST_f64_asi;
       break;
     default:
-      return NULL;
+      return nullptr;
     }
     SDValue Ops[] = { N1, getI32Imm(isVolatile), getI32Imm(codeAddrSpace),
                       getI32Imm(vecType), getI32Imm(toType),
@@ -1494,7 +1600,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStore(SDNode *N) {
         Opcode = NVPTX::ST_f64_ari_64;
         break;
       default:
-        return NULL;
+        return nullptr;
       }
     } else {
       switch (SourceVT) {
@@ -1517,7 +1623,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStore(SDNode *N) {
         Opcode = NVPTX::ST_f64_ari;
         break;
       default:
-        return NULL;
+        return nullptr;
       }
     }
     SDValue Ops[] = { N1, getI32Imm(isVolatile), getI32Imm(codeAddrSpace),
@@ -1546,7 +1652,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStore(SDNode *N) {
         Opcode = NVPTX::ST_f64_areg_64;
         break;
       default:
-        return NULL;
+        return nullptr;
       }
     } else {
       switch (SourceVT) {
@@ -1569,7 +1675,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStore(SDNode *N) {
         Opcode = NVPTX::ST_f64_areg;
         break;
       default:
-        return NULL;
+        return nullptr;
       }
     }
     SDValue Ops[] = { N1, getI32Imm(isVolatile), getI32Imm(codeAddrSpace),
@@ -1578,7 +1684,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStore(SDNode *N) {
     NVPTXST = CurDAG->getMachineNode(Opcode, dl, MVT::Other, Ops);
   }
 
-  if (NVPTXST != NULL) {
+  if (NVPTXST) {
     MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1);
     MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand();
     cast<MachineSDNode>(NVPTXST)->setMemRefs(MemRefs0, MemRefs0 + 1);
@@ -1645,7 +1751,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreVector(SDNode *N) {
     N2 = N->getOperand(5);
     break;
   default:
-    return NULL;
+    return nullptr;
   }
 
   StOps.push_back(getI32Imm(IsVolatile));
@@ -1657,11 +1763,11 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreVector(SDNode *N) {
   if (SelectDirectAddr(N2, Addr)) {
     switch (N->getOpcode()) {
     default:
-      return NULL;
+      return nullptr;
     case NVPTXISD::StoreV2:
       switch (EltVT.getSimpleVT().SimpleTy) {
       default:
-        return NULL;
+        return nullptr;
       case MVT::i8:
         Opcode = NVPTX::STV_i8_v2_avar;
         break;
@@ -1685,7 +1791,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreVector(SDNode *N) {
     case NVPTXISD::StoreV4:
       switch (EltVT.getSimpleVT().SimpleTy) {
       default:
-        return NULL;
+        return nullptr;
       case MVT::i8:
         Opcode = NVPTX::STV_i8_v4_avar;
         break;
@@ -1707,11 +1813,11 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreVector(SDNode *N) {
                  : SelectADDRsi(N2.getNode(), N2, Base, Offset)) {
     switch (N->getOpcode()) {
     default:
-      return NULL;
+      return nullptr;
     case NVPTXISD::StoreV2:
       switch (EltVT.getSimpleVT().SimpleTy) {
       default:
-        return NULL;
+        return nullptr;
       case MVT::i8:
         Opcode = NVPTX::STV_i8_v2_asi;
         break;
@@ -1735,7 +1841,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreVector(SDNode *N) {
     case NVPTXISD::StoreV4:
       switch (EltVT.getSimpleVT().SimpleTy) {
       default:
-        return NULL;
+        return nullptr;
       case MVT::i8:
         Opcode = NVPTX::STV_i8_v4_asi;
         break;
@@ -1759,11 +1865,11 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreVector(SDNode *N) {
     if (Subtarget.is64Bit()) {
       switch (N->getOpcode()) {
       default:
-        return NULL;
+        return nullptr;
       case NVPTXISD::StoreV2:
         switch (EltVT.getSimpleVT().SimpleTy) {
         default:
-          return NULL;
+          return nullptr;
         case MVT::i8:
           Opcode = NVPTX::STV_i8_v2_ari_64;
           break;
@@ -1787,7 +1893,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreVector(SDNode *N) {
       case NVPTXISD::StoreV4:
         switch (EltVT.getSimpleVT().SimpleTy) {
         default:
-          return NULL;
+          return nullptr;
         case MVT::i8:
           Opcode = NVPTX::STV_i8_v4_ari_64;
           break;
@@ -1806,11 +1912,11 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreVector(SDNode *N) {
     } else {
       switch (N->getOpcode()) {
       default:
-        return NULL;
+        return nullptr;
       case NVPTXISD::StoreV2:
         switch (EltVT.getSimpleVT().SimpleTy) {
         default:
-          return NULL;
+          return nullptr;
         case MVT::i8:
           Opcode = NVPTX::STV_i8_v2_ari;
           break;
@@ -1834,7 +1940,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreVector(SDNode *N) {
       case NVPTXISD::StoreV4:
         switch (EltVT.getSimpleVT().SimpleTy) {
         default:
-          return NULL;
+          return nullptr;
         case MVT::i8:
           Opcode = NVPTX::STV_i8_v4_ari;
           break;
@@ -1857,11 +1963,11 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreVector(SDNode *N) {
     if (Subtarget.is64Bit()) {
       switch (N->getOpcode()) {
       default:
-        return NULL;
+        return nullptr;
       case NVPTXISD::StoreV2:
         switch (EltVT.getSimpleVT().SimpleTy) {
         default:
-          return NULL;
+          return nullptr;
         case MVT::i8:
           Opcode = NVPTX::STV_i8_v2_areg_64;
           break;
@@ -1885,7 +1991,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreVector(SDNode *N) {
       case NVPTXISD::StoreV4:
         switch (EltVT.getSimpleVT().SimpleTy) {
         default:
-          return NULL;
+          return nullptr;
         case MVT::i8:
           Opcode = NVPTX::STV_i8_v4_areg_64;
           break;
@@ -1904,11 +2010,11 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreVector(SDNode *N) {
     } else {
       switch (N->getOpcode()) {
       default:
-        return NULL;
+        return nullptr;
       case NVPTXISD::StoreV2:
         switch (EltVT.getSimpleVT().SimpleTy) {
         default:
-          return NULL;
+          return nullptr;
         case MVT::i8:
           Opcode = NVPTX::STV_i8_v2_areg;
           break;
@@ -1932,7 +2038,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreVector(SDNode *N) {
       case NVPTXISD::StoreV4:
         switch (EltVT.getSimpleVT().SimpleTy) {
         default:
-          return NULL;
+          return nullptr;
         case MVT::i8:
           Opcode = NVPTX::STV_i8_v4_areg;
           break;
@@ -1973,7 +2079,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadParam(SDNode *Node) {
   unsigned VecSize;
   switch (Node->getOpcode()) {
   default:
-    return NULL;
+    return nullptr;
   case NVPTXISD::LoadParam:
     VecSize = 1;
     break;
@@ -1992,11 +2098,11 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadParam(SDNode *Node) {
 
   switch (VecSize) {
   default:
-    return NULL;
+    return nullptr;
   case 1:
     switch (MemVT.getSimpleVT().SimpleTy) {
     default:
-      return NULL;
+      return nullptr;
     case MVT::i1:
       Opc = NVPTX::LoadParamMemI8;
       break;
@@ -2023,7 +2129,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadParam(SDNode *Node) {
   case 2:
     switch (MemVT.getSimpleVT().SimpleTy) {
     default:
-      return NULL;
+      return nullptr;
     case MVT::i1:
       Opc = NVPTX::LoadParamMemV2I8;
       break;
@@ -2050,7 +2156,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadParam(SDNode *Node) {
   case 4:
     switch (MemVT.getSimpleVT().SimpleTy) {
     default:
-      return NULL;
+      return nullptr;
     case MVT::i1:
       Opc = NVPTX::LoadParamMemV4I8;
       break;
@@ -2077,7 +2183,7 @@ SDNode *NVPTXDAGToDAGISel::SelectLoadParam(SDNode *Node) {
     VTs = CurDAG->getVTList(EltVT, EltVT, MVT::Other, MVT::Glue);
   } else {
     EVT EVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other, MVT::Glue };
-    VTs = CurDAG->getVTList(&EVTs[0], array_lengthof(EVTs));
+    VTs = CurDAG->getVTList(EVTs);
   }
 
   unsigned OffsetVal = cast<ConstantSDNode>(Offset)->getZExtValue();
@@ -2103,7 +2209,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreRetval(SDNode *N) {
   unsigned NumElts = 1;
   switch (N->getOpcode()) {
   default:
-    return NULL;
+    return nullptr;
   case NVPTXISD::StoreRetval:
     NumElts = 1;
     break;
@@ -2128,11 +2234,11 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreRetval(SDNode *N) {
   unsigned Opcode = 0;
   switch (NumElts) {
   default:
-    return NULL;
+    return nullptr;
   case 1:
     switch (Mem->getMemoryVT().getSimpleVT().SimpleTy) {
     default:
-      return NULL;
+      return nullptr;
     case MVT::i1:
       Opcode = NVPTX::StoreRetvalI8;
       break;
@@ -2159,7 +2265,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreRetval(SDNode *N) {
   case 2:
     switch (Mem->getMemoryVT().getSimpleVT().SimpleTy) {
     default:
-      return NULL;
+      return nullptr;
     case MVT::i1:
       Opcode = NVPTX::StoreRetvalV2I8;
       break;
@@ -2186,7 +2292,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreRetval(SDNode *N) {
   case 4:
     switch (Mem->getMemoryVT().getSimpleVT().SimpleTy) {
     default:
-      return NULL;
+      return nullptr;
     case MVT::i1:
       Opcode = NVPTX::StoreRetvalV4I8;
       break;
@@ -2229,7 +2335,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreParam(SDNode *N) {
   unsigned NumElts = 1;
   switch (N->getOpcode()) {
   default:
-    return NULL;
+    return nullptr;
   case NVPTXISD::StoreParamU32:
   case NVPTXISD::StoreParamS32:
   case NVPTXISD::StoreParam:
@@ -2260,11 +2366,11 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreParam(SDNode *N) {
   default:
     switch (NumElts) {
     default:
-      return NULL;
+      return nullptr;
     case 1:
       switch (Mem->getMemoryVT().getSimpleVT().SimpleTy) {
       default:
-        return NULL;
+        return nullptr;
       case MVT::i1:
         Opcode = NVPTX::StoreParamI8;
         break;
@@ -2291,7 +2397,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreParam(SDNode *N) {
     case 2:
       switch (Mem->getMemoryVT().getSimpleVT().SimpleTy) {
       default:
-        return NULL;
+        return nullptr;
       case MVT::i1:
         Opcode = NVPTX::StoreParamV2I8;
         break;
@@ -2318,7 +2424,7 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreParam(SDNode *N) {
     case 4:
       switch (Mem->getMemoryVT().getSimpleVT().SimpleTy) {
       default:
-        return NULL;
+        return nullptr;
       case MVT::i1:
         Opcode = NVPTX::StoreParamV4I8;
         break;
@@ -2371,6 +2477,488 @@ SDNode *NVPTXDAGToDAGISel::SelectStoreParam(SDNode *N) {
   return Ret;
 }
 
+SDNode *NVPTXDAGToDAGISel::SelectTextureIntrinsic(SDNode *N) {
+  SDValue Chain = N->getOperand(0);
+  SDValue TexRef = N->getOperand(1);
+  SDValue SampRef = N->getOperand(2);
+  SDNode *Ret = nullptr;
+  unsigned Opc = 0;
+  SmallVector<SDValue, 8> Ops;
+
+  switch (N->getOpcode()) {
+  default: return nullptr;
+  case NVPTXISD::Tex1DFloatI32:
+    Opc = NVPTX::TEX_1D_F32_I32;
+    break;
+  case NVPTXISD::Tex1DFloatFloat:
+    Opc = NVPTX::TEX_1D_F32_F32;
+    break;
+  case NVPTXISD::Tex1DFloatFloatLevel:
+    Opc = NVPTX::TEX_1D_F32_F32_LEVEL;
+    break;
+  case NVPTXISD::Tex1DFloatFloatGrad:
+    Opc = NVPTX::TEX_1D_F32_F32_GRAD;
+    break;
+  case NVPTXISD::Tex1DI32I32:
+    Opc = NVPTX::TEX_1D_I32_I32;
+    break;
+  case NVPTXISD::Tex1DI32Float:
+    Opc = NVPTX::TEX_1D_I32_F32;
+    break;
+  case NVPTXISD::Tex1DI32FloatLevel:
+    Opc = NVPTX::TEX_1D_I32_F32_LEVEL;
+    break;
+  case NVPTXISD::Tex1DI32FloatGrad:
+    Opc = NVPTX::TEX_1D_I32_F32_GRAD;
+    break;
+  case NVPTXISD::Tex1DArrayFloatI32:
+    Opc = NVPTX::TEX_1D_ARRAY_F32_I32;
+    break;
+  case NVPTXISD::Tex1DArrayFloatFloat:
+    Opc = NVPTX::TEX_1D_ARRAY_F32_F32;
+    break;
+  case NVPTXISD::Tex1DArrayFloatFloatLevel:
+    Opc = NVPTX::TEX_1D_ARRAY_F32_F32_LEVEL;
+    break;
+  case NVPTXISD::Tex1DArrayFloatFloatGrad:
+    Opc = NVPTX::TEX_1D_ARRAY_F32_F32_GRAD;
+    break;
+  case NVPTXISD::Tex1DArrayI32I32:
+    Opc = NVPTX::TEX_1D_ARRAY_I32_I32;
+    break;
+  case NVPTXISD::Tex1DArrayI32Float:
+    Opc = NVPTX::TEX_1D_ARRAY_I32_F32;
+    break;
+  case NVPTXISD::Tex1DArrayI32FloatLevel:
+    Opc = NVPTX::TEX_1D_ARRAY_I32_F32_LEVEL;
+    break;
+  case NVPTXISD::Tex1DArrayI32FloatGrad:
+    Opc = NVPTX::TEX_1D_ARRAY_I32_F32_GRAD;
+    break;
+  case NVPTXISD::Tex2DFloatI32:
+    Opc = NVPTX::TEX_2D_F32_I32;
+    break;
+  case NVPTXISD::Tex2DFloatFloat:
+    Opc = NVPTX::TEX_2D_F32_F32;
+    break;
+  case NVPTXISD::Tex2DFloatFloatLevel:
+    Opc = NVPTX::TEX_2D_F32_F32_LEVEL;
+    break;
+  case NVPTXISD::Tex2DFloatFloatGrad:
+    Opc = NVPTX::TEX_2D_F32_F32_GRAD;
+    break;
+  case NVPTXISD::Tex2DI32I32:
+    Opc = NVPTX::TEX_2D_I32_I32;
+    break;
+  case NVPTXISD::Tex2DI32Float:
+    Opc = NVPTX::TEX_2D_I32_F32;
+    break;
+  case NVPTXISD::Tex2DI32FloatLevel:
+    Opc = NVPTX::TEX_2D_I32_F32_LEVEL;
+    break;
+  case NVPTXISD::Tex2DI32FloatGrad:
+    Opc = NVPTX::TEX_2D_I32_F32_GRAD;
+    break;
+  case NVPTXISD::Tex2DArrayFloatI32:
+    Opc = NVPTX::TEX_2D_ARRAY_F32_I32;
+    break;
+  case NVPTXISD::Tex2DArrayFloatFloat:
+    Opc = NVPTX::TEX_2D_ARRAY_F32_F32;
+    break;
+  case NVPTXISD::Tex2DArrayFloatFloatLevel:
+    Opc = NVPTX::TEX_2D_ARRAY_F32_F32_LEVEL;
+    break;
+  case NVPTXISD::Tex2DArrayFloatFloatGrad:
+    Opc = NVPTX::TEX_2D_ARRAY_F32_F32_GRAD;
+    break;
+  case NVPTXISD::Tex2DArrayI32I32:
+    Opc = NVPTX::TEX_2D_ARRAY_I32_I32;
+    break;
+  case NVPTXISD::Tex2DArrayI32Float:
+    Opc = NVPTX::TEX_2D_ARRAY_I32_F32;
+    break;
+  case NVPTXISD::Tex2DArrayI32FloatLevel:
+    Opc = NVPTX::TEX_2D_ARRAY_I32_F32_LEVEL;
+    break;
+  case NVPTXISD::Tex2DArrayI32FloatGrad:
+    Opc = NVPTX::TEX_2D_ARRAY_I32_F32_GRAD;
+    break;
+  case NVPTXISD::Tex3DFloatI32:
+    Opc = NVPTX::TEX_3D_F32_I32;
+    break;
+  case NVPTXISD::Tex3DFloatFloat:
+    Opc = NVPTX::TEX_3D_F32_F32;
+    break;
+  case NVPTXISD::Tex3DFloatFloatLevel:
+    Opc = NVPTX::TEX_3D_F32_F32_LEVEL;
+    break;
+  case NVPTXISD::Tex3DFloatFloatGrad:
+    Opc = NVPTX::TEX_3D_F32_F32_GRAD;
+    break;
+  case NVPTXISD::Tex3DI32I32:
+    Opc = NVPTX::TEX_3D_I32_I32;
+    break;
+  case NVPTXISD::Tex3DI32Float:
+    Opc = NVPTX::TEX_3D_I32_F32;
+    break;
+  case NVPTXISD::Tex3DI32FloatLevel:
+    Opc = NVPTX::TEX_3D_I32_F32_LEVEL;
+    break;
+  case NVPTXISD::Tex3DI32FloatGrad:
+    Opc = NVPTX::TEX_3D_I32_F32_GRAD;
+    break;
+  }
+
+  Ops.push_back(TexRef);
+  Ops.push_back(SampRef);
+
+  // Copy over indices
+  for (unsigned i = 3; i < N->getNumOperands(); ++i) {
+    Ops.push_back(N->getOperand(i));
+  }
+
+  Ops.push_back(Chain);
+  Ret = CurDAG->getMachineNode(Opc, SDLoc(N), N->getVTList(), Ops);
+  return Ret;
+}
+
+SDNode *NVPTXDAGToDAGISel::SelectSurfaceIntrinsic(SDNode *N) {
+  SDValue Chain = N->getOperand(0);
+  SDValue TexHandle = N->getOperand(1);
+  SDNode *Ret = nullptr;
+  unsigned Opc = 0;
+  SmallVector<SDValue, 8> Ops;
+  switch (N->getOpcode()) {
+  default: return nullptr;
+  case NVPTXISD::Suld1DI8Trap:
+    Opc = NVPTX::SULD_1D_I8_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DI16Trap:
+    Opc = NVPTX::SULD_1D_I16_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DI32Trap:
+    Opc = NVPTX::SULD_1D_I32_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DV2I8Trap:
+    Opc = NVPTX::SULD_1D_V2I8_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DV2I16Trap:
+    Opc = NVPTX::SULD_1D_V2I16_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DV2I32Trap:
+    Opc = NVPTX::SULD_1D_V2I32_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DV4I8Trap:
+    Opc = NVPTX::SULD_1D_V4I8_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DV4I16Trap:
+    Opc = NVPTX::SULD_1D_V4I16_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DV4I32Trap:
+    Opc = NVPTX::SULD_1D_V4I32_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DArrayI8Trap:
+    Opc = NVPTX::SULD_1D_ARRAY_I8_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DArrayI16Trap:
+    Opc = NVPTX::SULD_1D_ARRAY_I16_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DArrayI32Trap:
+    Opc = NVPTX::SULD_1D_ARRAY_I32_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DArrayV2I8Trap:
+    Opc = NVPTX::SULD_1D_ARRAY_V2I8_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DArrayV2I16Trap:
+    Opc = NVPTX::SULD_1D_ARRAY_V2I16_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DArrayV2I32Trap:
+    Opc = NVPTX::SULD_1D_ARRAY_V2I32_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DArrayV4I8Trap:
+    Opc = NVPTX::SULD_1D_ARRAY_V4I8_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DArrayV4I16Trap:
+    Opc = NVPTX::SULD_1D_ARRAY_V4I16_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld1DArrayV4I32Trap:
+    Opc = NVPTX::SULD_1D_ARRAY_V4I32_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DI8Trap:
+    Opc = NVPTX::SULD_2D_I8_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DI16Trap:
+    Opc = NVPTX::SULD_2D_I16_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DI32Trap:
+    Opc = NVPTX::SULD_2D_I32_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DV2I8Trap:
+    Opc = NVPTX::SULD_2D_V2I8_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DV2I16Trap:
+    Opc = NVPTX::SULD_2D_V2I16_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DV2I32Trap:
+    Opc = NVPTX::SULD_2D_V2I32_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DV4I8Trap:
+    Opc = NVPTX::SULD_2D_V4I8_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DV4I16Trap:
+    Opc = NVPTX::SULD_2D_V4I16_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DV4I32Trap:
+    Opc = NVPTX::SULD_2D_V4I32_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DArrayI8Trap:
+    Opc = NVPTX::SULD_2D_ARRAY_I8_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DArrayI16Trap:
+    Opc = NVPTX::SULD_2D_ARRAY_I16_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DArrayI32Trap:
+    Opc = NVPTX::SULD_2D_ARRAY_I32_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DArrayV2I8Trap:
+    Opc = NVPTX::SULD_2D_ARRAY_V2I8_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DArrayV2I16Trap:
+    Opc = NVPTX::SULD_2D_ARRAY_V2I16_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DArrayV2I32Trap:
+    Opc = NVPTX::SULD_2D_ARRAY_V2I32_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DArrayV4I8Trap:
+    Opc = NVPTX::SULD_2D_ARRAY_V4I8_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DArrayV4I16Trap:
+    Opc = NVPTX::SULD_2D_ARRAY_V4I16_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld2DArrayV4I32Trap:
+    Opc = NVPTX::SULD_2D_ARRAY_V4I32_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld3DI8Trap:
+    Opc = NVPTX::SULD_3D_I8_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld3DI16Trap:
+    Opc = NVPTX::SULD_3D_I16_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld3DI32Trap:
+    Opc = NVPTX::SULD_3D_I32_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld3DV2I8Trap:
+    Opc = NVPTX::SULD_3D_V2I8_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld3DV2I16Trap:
+    Opc = NVPTX::SULD_3D_V2I16_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld3DV2I32Trap:
+    Opc = NVPTX::SULD_3D_V2I32_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld3DV4I8Trap:
+    Opc = NVPTX::SULD_3D_V4I8_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld3DV4I16Trap:
+    Opc = NVPTX::SULD_3D_V4I16_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  case NVPTXISD::Suld3DV4I32Trap:
+    Opc = NVPTX::SULD_3D_V4I32_TRAP;
+    Ops.push_back(TexHandle);
+    Ops.push_back(N->getOperand(2));
+    Ops.push_back(N->getOperand(3));
+    Ops.push_back(N->getOperand(4));
+    Ops.push_back(Chain);
+    break;
+  }
+  Ret = CurDAG->getMachineNode(Opc, SDLoc(N), N->getVTList(), Ops);
+  return Ret;
+}
+
 // SelectDirectAddr - Match a direct address for DAG.
 // A direct address could be a globaladdress or externalsymbol.
 bool NVPTXDAGToDAGISel::SelectDirectAddr(SDValue N, SDValue &Address) {
@@ -2464,14 +3052,18 @@ bool NVPTXDAGToDAGISel::SelectADDRri64(SDNode *OpNode, SDValue Addr,
 
 bool NVPTXDAGToDAGISel::ChkMemSDNodeAddressSpace(SDNode *N,
                                                  unsigned int spN) const {
-  const Value *Src = NULL;
+  const Value *Src = nullptr;
   // Even though MemIntrinsicSDNode is a subclas of MemSDNode,
   // the classof() for MemSDNode does not include MemIntrinsicSDNode
   // (See SelectionDAGNodes.h). So we need to check for both.
   if (MemSDNode *mN = dyn_cast<MemSDNode>(N)) {
-    Src = mN->getSrcValue();
+    if (spN == 0 && mN->getMemOperand()->getPseudoValue())
+      return true;
+    Src = mN->getMemOperand()->getValue();
   } else if (MemSDNode *mN = dyn_cast<MemIntrinsicSDNode>(N)) {
-    Src = mN->getSrcValue();
+    if (spN == 0 && mN->getMemOperand()->getPseudoValue())
+      return true;
+    Src = mN->getMemOperand()->getValue();
   }
   if (!Src)
     return false;
diff --git a/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
index 93ad169..11f92e7 100644
--- a/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
+++ b/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
@@ -11,8 +11,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "nvptx-isel"
-
 #include "NVPTX.h"
 #include "NVPTXISelLowering.h"
 #include "NVPTXRegisterInfo.h"
@@ -46,19 +44,22 @@ public:
                              CodeGenOpt::Level   OptLevel);
 
   // Pass Name
-  virtual const char *getPassName() const {
+  const char *getPassName() const override {
     return "NVPTX DAG->DAG Pattern Instruction Selection";
   }
 
   const NVPTXSubtarget &Subtarget;
 
-  virtual bool SelectInlineAsmMemoryOperand(
-      const SDValue &Op, char ConstraintCode, std::vector<SDValue> &OutOps);
+  bool SelectInlineAsmMemoryOperand(const SDValue &Op,
+                                    char ConstraintCode,
+                                    std::vector<SDValue> &OutOps) override;
 private:
 // Include the pieces autogenerated from the target description.
 #include "NVPTXGenDAGISel.inc"
 
-  SDNode *Select(SDNode *N);
+  SDNode *Select(SDNode *N) override;
+  SDNode *SelectIntrinsicNoChain(SDNode *N);
+  SDNode *SelectTexSurfHandle(SDNode *N);
   SDNode *SelectLoad(SDNode *N);
   SDNode *SelectLoadVector(SDNode *N);
   SDNode *SelectLDGLDUVector(SDNode *N);
@@ -68,6 +69,8 @@ private:
   SDNode *SelectStoreRetval(SDNode *N);
   SDNode *SelectStoreParam(SDNode *N);
   SDNode *SelectAddrSpaceCast(SDNode *N);
+  SDNode *SelectTextureIntrinsic(SDNode *N);
+  SDNode *SelectSurfaceIntrinsic(SDNode *N);
         
   inline SDValue getI32Imm(unsigned Imm) {
     return CurDAG->getTargetConstant(Imm, MVT::i32);
diff --git a/lib/Target/NVPTX/NVPTXISelLowering.cpp b/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 8e25a65..b0943be 100644
--- a/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -75,7 +75,7 @@ static bool IsPTXVectorType(MVT VT) {
 /// LowerCall, and LowerReturn.
 static void ComputePTXValueVTs(const TargetLowering &TLI, Type *Ty,
                                SmallVectorImpl<EVT> &ValueVTs,
-                               SmallVectorImpl<uint64_t> *Offsets = 0,
+                               SmallVectorImpl<uint64_t> *Offsets = nullptr,
                                uint64_t StartingOffset = 0) {
   SmallVector<EVT, 16> TempVTs;
   SmallVector<uint64_t, 16> TempOffsets;
@@ -245,7 +245,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(NVPTXTargetMachine &TM)
 const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
   switch (Opcode) {
   default:
-    return 0;
+    return nullptr;
   case NVPTXISD::CALL:
     return "NVPTXISD::CALL";
   case NVPTXISD::RET_FLAG:
@@ -328,6 +328,116 @@ const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
     return "NVPTXISD::StoreV2";
   case NVPTXISD::StoreV4:
     return "NVPTXISD::StoreV4";
+  case NVPTXISD::Tex1DFloatI32:        return "NVPTXISD::Tex1DFloatI32";
+  case NVPTXISD::Tex1DFloatFloat:      return "NVPTXISD::Tex1DFloatFloat";
+  case NVPTXISD::Tex1DFloatFloatLevel:
+    return "NVPTXISD::Tex1DFloatFloatLevel";
+  case NVPTXISD::Tex1DFloatFloatGrad:
+    return "NVPTXISD::Tex1DFloatFloatGrad";
+  case NVPTXISD::Tex1DI32I32:          return "NVPTXISD::Tex1DI32I32";
+  case NVPTXISD::Tex1DI32Float:        return "NVPTXISD::Tex1DI32Float";
+  case NVPTXISD::Tex1DI32FloatLevel:
+    return "NVPTXISD::Tex1DI32FloatLevel";
+  case NVPTXISD::Tex1DI32FloatGrad:
+    return "NVPTXISD::Tex1DI32FloatGrad";
+  case NVPTXISD::Tex1DArrayFloatI32:   return "NVPTXISD::Tex2DArrayFloatI32";
+  case NVPTXISD::Tex1DArrayFloatFloat: return "NVPTXISD::Tex2DArrayFloatFloat";
+  case NVPTXISD::Tex1DArrayFloatFloatLevel:
+    return "NVPTXISD::Tex2DArrayFloatFloatLevel";
+  case NVPTXISD::Tex1DArrayFloatFloatGrad:
+    return "NVPTXISD::Tex2DArrayFloatFloatGrad";
+  case NVPTXISD::Tex1DArrayI32I32:     return "NVPTXISD::Tex2DArrayI32I32";
+  case NVPTXISD::Tex1DArrayI32Float:   return "NVPTXISD::Tex2DArrayI32Float";
+  case NVPTXISD::Tex1DArrayI32FloatLevel:
+    return "NVPTXISD::Tex2DArrayI32FloatLevel";
+  case NVPTXISD::Tex1DArrayI32FloatGrad:
+    return "NVPTXISD::Tex2DArrayI32FloatGrad";
+  case NVPTXISD::Tex2DFloatI32:        return "NVPTXISD::Tex2DFloatI32";
+  case NVPTXISD::Tex2DFloatFloat:      return "NVPTXISD::Tex2DFloatFloat";
+  case NVPTXISD::Tex2DFloatFloatLevel:
+    return "NVPTXISD::Tex2DFloatFloatLevel";
+  case NVPTXISD::Tex2DFloatFloatGrad:
+    return "NVPTXISD::Tex2DFloatFloatGrad";
+  case NVPTXISD::Tex2DI32I32:          return "NVPTXISD::Tex2DI32I32";
+  case NVPTXISD::Tex2DI32Float:        return "NVPTXISD::Tex2DI32Float";
+  case NVPTXISD::Tex2DI32FloatLevel:
+    return "NVPTXISD::Tex2DI32FloatLevel";
+  case NVPTXISD::Tex2DI32FloatGrad:
+    return "NVPTXISD::Tex2DI32FloatGrad";
+  case NVPTXISD::Tex2DArrayFloatI32:   return "NVPTXISD::Tex2DArrayFloatI32";
+  case NVPTXISD::Tex2DArrayFloatFloat: return "NVPTXISD::Tex2DArrayFloatFloat";
+  case NVPTXISD::Tex2DArrayFloatFloatLevel:
+    return "NVPTXISD::Tex2DArrayFloatFloatLevel";
+  case NVPTXISD::Tex2DArrayFloatFloatGrad:
+    return "NVPTXISD::Tex2DArrayFloatFloatGrad";
+  case NVPTXISD::Tex2DArrayI32I32:     return "NVPTXISD::Tex2DArrayI32I32";
+  case NVPTXISD::Tex2DArrayI32Float:   return "NVPTXISD::Tex2DArrayI32Float";
+  case NVPTXISD::Tex2DArrayI32FloatLevel:
+    return "NVPTXISD::Tex2DArrayI32FloatLevel";
+  case NVPTXISD::Tex2DArrayI32FloatGrad:
+    return "NVPTXISD::Tex2DArrayI32FloatGrad";
+  case NVPTXISD::Tex3DFloatI32:        return "NVPTXISD::Tex3DFloatI32";
+  case NVPTXISD::Tex3DFloatFloat:      return "NVPTXISD::Tex3DFloatFloat";
+  case NVPTXISD::Tex3DFloatFloatLevel:
+    return "NVPTXISD::Tex3DFloatFloatLevel";
+  case NVPTXISD::Tex3DFloatFloatGrad:
+    return "NVPTXISD::Tex3DFloatFloatGrad";
+  case NVPTXISD::Tex3DI32I32:          return "NVPTXISD::Tex3DI32I32";
+  case NVPTXISD::Tex3DI32Float:        return "NVPTXISD::Tex3DI32Float";
+  case NVPTXISD::Tex3DI32FloatLevel:
+    return "NVPTXISD::Tex3DI32FloatLevel";
+  case NVPTXISD::Tex3DI32FloatGrad:
+    return "NVPTXISD::Tex3DI32FloatGrad";
+
+  case NVPTXISD::Suld1DI8Trap:          return "NVPTXISD::Suld1DI8Trap";
+  case NVPTXISD::Suld1DI16Trap:         return "NVPTXISD::Suld1DI16Trap";
+  case NVPTXISD::Suld1DI32Trap:         return "NVPTXISD::Suld1DI32Trap";
+  case NVPTXISD::Suld1DV2I8Trap:        return "NVPTXISD::Suld1DV2I8Trap";
+  case NVPTXISD::Suld1DV2I16Trap:       return "NVPTXISD::Suld1DV2I16Trap";
+  case NVPTXISD::Suld1DV2I32Trap:       return "NVPTXISD::Suld1DV2I32Trap";
+  case NVPTXISD::Suld1DV4I8Trap:        return "NVPTXISD::Suld1DV4I8Trap";
+  case NVPTXISD::Suld1DV4I16Trap:       return "NVPTXISD::Suld1DV4I16Trap";
+  case NVPTXISD::Suld1DV4I32Trap:       return "NVPTXISD::Suld1DV4I32Trap";
+
+  case NVPTXISD::Suld1DArrayI8Trap:     return "NVPTXISD::Suld1DArrayI8Trap";
+  case NVPTXISD::Suld1DArrayI16Trap:    return "NVPTXISD::Suld1DArrayI16Trap";
+  case NVPTXISD::Suld1DArrayI32Trap:    return "NVPTXISD::Suld1DArrayI32Trap";
+  case NVPTXISD::Suld1DArrayV2I8Trap:   return "NVPTXISD::Suld1DArrayV2I8Trap";
+  case NVPTXISD::Suld1DArrayV2I16Trap:  return "NVPTXISD::Suld1DArrayV2I16Trap";
+  case NVPTXISD::Suld1DArrayV2I32Trap:  return "NVPTXISD::Suld1DArrayV2I32Trap";
+  case NVPTXISD::Suld1DArrayV4I8Trap:   return "NVPTXISD::Suld1DArrayV4I8Trap";
+  case NVPTXISD::Suld1DArrayV4I16Trap:  return "NVPTXISD::Suld1DArrayV4I16Trap";
+  case NVPTXISD::Suld1DArrayV4I32Trap:  return "NVPTXISD::Suld1DArrayV4I32Trap";
+
+  case NVPTXISD::Suld2DI8Trap:          return "NVPTXISD::Suld2DI8Trap";
+  case NVPTXISD::Suld2DI16Trap:         return "NVPTXISD::Suld2DI16Trap";
+  case NVPTXISD::Suld2DI32Trap:         return "NVPTXISD::Suld2DI32Trap";
+  case NVPTXISD::Suld2DV2I8Trap:        return "NVPTXISD::Suld2DV2I8Trap";
+  case NVPTXISD::Suld2DV2I16Trap:       return "NVPTXISD::Suld2DV2I16Trap";
+  case NVPTXISD::Suld2DV2I32Trap:       return "NVPTXISD::Suld2DV2I32Trap";
+  case NVPTXISD::Suld2DV4I8Trap:        return "NVPTXISD::Suld2DV4I8Trap";
+  case NVPTXISD::Suld2DV4I16Trap:       return "NVPTXISD::Suld2DV4I16Trap";
+  case NVPTXISD::Suld2DV4I32Trap:       return "NVPTXISD::Suld2DV4I32Trap";
+
+  case NVPTXISD::Suld2DArrayI8Trap:     return "NVPTXISD::Suld2DArrayI8Trap";
+  case NVPTXISD::Suld2DArrayI16Trap:    return "NVPTXISD::Suld2DArrayI16Trap";
+  case NVPTXISD::Suld2DArrayI32Trap:    return "NVPTXISD::Suld2DArrayI32Trap";
+  case NVPTXISD::Suld2DArrayV2I8Trap:   return "NVPTXISD::Suld2DArrayV2I8Trap";
+  case NVPTXISD::Suld2DArrayV2I16Trap:  return "NVPTXISD::Suld2DArrayV2I16Trap";
+  case NVPTXISD::Suld2DArrayV2I32Trap:  return "NVPTXISD::Suld2DArrayV2I32Trap";
+  case NVPTXISD::Suld2DArrayV4I8Trap:   return "NVPTXISD::Suld2DArrayV4I8Trap";
+  case NVPTXISD::Suld2DArrayV4I16Trap:  return "NVPTXISD::Suld2DArrayV4I16Trap";
+  case NVPTXISD::Suld2DArrayV4I32Trap:  return "NVPTXISD::Suld2DArrayV4I32Trap";
+
+  case NVPTXISD::Suld3DI8Trap:          return "NVPTXISD::Suld3DI8Trap";
+  case NVPTXISD::Suld3DI16Trap:         return "NVPTXISD::Suld3DI16Trap";
+  case NVPTXISD::Suld3DI32Trap:         return "NVPTXISD::Suld3DI32Trap";
+  case NVPTXISD::Suld3DV2I8Trap:        return "NVPTXISD::Suld3DV2I8Trap";
+  case NVPTXISD::Suld3DV2I16Trap:       return "NVPTXISD::Suld3DV2I16Trap";
+  case NVPTXISD::Suld3DV2I32Trap:       return "NVPTXISD::Suld3DV2I32Trap";
+  case NVPTXISD::Suld3DV4I8Trap:        return "NVPTXISD::Suld3DV4I8Trap";
+  case NVPTXISD::Suld3DV4I16Trap:       return "NVPTXISD::Suld3DV4I16Trap";
+  case NVPTXISD::Suld3DV4I32Trap:       return "NVPTXISD::Suld3DV4I32Trap";
   }
 }
 
@@ -526,7 +636,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   SDValue Chain = CLI.Chain;
   SDValue Callee = CLI.Callee;
   bool &isTailCall = CLI.IsTailCall;
-  ArgListTy &Args = CLI.Args;
+  ArgListTy &Args = CLI.getArgs();
   Type *retTy = CLI.RetTy;
   ImmutableCallSite *CS = CLI.CS;
 
@@ -575,7 +685,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                                       DAG.getConstant(paramCount, MVT::i32),
                                       DAG.getConstant(sz, MVT::i32), InFlag };
         Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
-                            DeclareParamOps, 5);
+                            DeclareParamOps);
         InFlag = Chain.getValue(1);
         unsigned curOffset = 0;
         for (unsigned j = 0, je = vtparts.size(); j != je; ++j) {
@@ -599,7 +709,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                                        DAG.getConstant(curOffset, MVT::i32),
                                        StVal, InFlag };
             Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParam, dl,
-                                            CopyParamVTs, &CopyParamOps[0], 5,
+                                            CopyParamVTs, CopyParamOps,
                                             elemtype, MachinePointerInfo());
             InFlag = Chain.getValue(1);
             curOffset += sz / 8;
@@ -621,7 +731,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                                       DAG.getConstant(paramCount, MVT::i32),
                                       DAG.getConstant(sz, MVT::i32), InFlag };
         Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
-                            DeclareParamOps, 5);
+                            DeclareParamOps);
         InFlag = Chain.getValue(1);
         unsigned NumElts = ObjectVT.getVectorNumElements();
         EVT EltVT = ObjectVT.getVectorElementType();
@@ -644,7 +754,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                                      DAG.getConstant(0, MVT::i32), Elt,
                                      InFlag };
           Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParam, dl,
-                                          CopyParamVTs, &CopyParamOps[0], 5,
+                                          CopyParamVTs, CopyParamOps,
                                           MemVT, MachinePointerInfo());
           InFlag = Chain.getValue(1);
         } else if (NumElts == 2) {
@@ -661,7 +771,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                                      DAG.getConstant(0, MVT::i32), Elt0, Elt1,
                                      InFlag };
           Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParamV2, dl,
-                                          CopyParamVTs, &CopyParamOps[0], 6,
+                                          CopyParamVTs, CopyParamOps,
                                           MemVT, MachinePointerInfo());
           InFlag = Chain.getValue(1);
         } else {
@@ -735,9 +845,8 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
             Ops.push_back(InFlag);
 
             SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
-            Chain = DAG.getMemIntrinsicNode(Opc, dl, CopyParamVTs, &Ops[0],
-                                            Ops.size(), MemVT,
-                                            MachinePointerInfo());
+            Chain = DAG.getMemIntrinsicNode(Opc, dl, CopyParamVTs, Ops,
+                                            MemVT, MachinePointerInfo());
             InFlag = Chain.getValue(1);
             curOffset += PerStoreOffset;
           }
@@ -762,7 +871,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                                     DAG.getConstant(sz, MVT::i32),
                                     DAG.getConstant(0, MVT::i32), InFlag };
       Chain = DAG.getNode(NVPTXISD::DeclareScalarParam, dl, DeclareParamVTs,
-                          DeclareParamOps, 5);
+                          DeclareParamOps);
       InFlag = Chain.getValue(1);
       SDValue OutV = OutVals[OIdx];
       if (needExtend) {
@@ -781,7 +890,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
         opcode = NVPTXISD::StoreParamU32;
       else if (Outs[OIdx].Flags.isSExt())
         opcode = NVPTXISD::StoreParamS32;
-      Chain = DAG.getMemIntrinsicNode(opcode, dl, CopyParamVTs, CopyParamOps, 5,
+      Chain = DAG.getMemIntrinsicNode(opcode, dl, CopyParamVTs, CopyParamOps,
                                       VT, MachinePointerInfo());
 
       InFlag = Chain.getValue(1);
@@ -806,7 +915,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
       InFlag
     };
     Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
-                        DeclareParamOps, 5);
+                        DeclareParamOps);
     InFlag = Chain.getValue(1);
     unsigned curOffset = 0;
     for (unsigned j = 0, je = vtparts.size(); j != je; ++j) {
@@ -834,7 +943,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                                    DAG.getConstant(curOffset, MVT::i32), theVal,
                                    InFlag };
         Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParam, dl, CopyParamVTs,
-                                        CopyParamOps, 5, elemtype,
+                                        CopyParamOps, elemtype,
                                         MachinePointerInfo());
 
         InFlag = Chain.getValue(1);
@@ -865,7 +974,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                                   DAG.getConstant(resultsz, MVT::i32),
                                   DAG.getConstant(0, MVT::i32), InFlag };
       Chain = DAG.getNode(NVPTXISD::DeclareRet, dl, DeclareRetVTs,
-                          DeclareRetOps, 5);
+                          DeclareRetOps);
       InFlag = Chain.getValue(1);
     } else {
       retAlignment = getArgumentAlignment(Callee, CS, retTy, 0);
@@ -875,7 +984,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                                   DAG.getConstant(resultsz / 8, MVT::i32),
                                   DAG.getConstant(0, MVT::i32), InFlag };
       Chain = DAG.getNode(NVPTXISD::DeclareRetParam, dl, DeclareRetVTs,
-                          DeclareRetOps, 5);
+                          DeclareRetOps);
       InFlag = Chain.getValue(1);
     }
   }
@@ -895,7 +1004,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
     SDValue ProtoOps[] = {
       Chain, DAG.getTargetExternalSymbol(ProtoStr, MVT::i32), InFlag,
     };
-    Chain = DAG.getNode(NVPTXISD::CallPrototype, dl, ProtoVTs, &ProtoOps[0], 3);
+    Chain = DAG.getNode(NVPTXISD::CallPrototype, dl, ProtoVTs, ProtoOps);
     InFlag = Chain.getValue(1);
   }
   // Op to just print "call"
@@ -904,20 +1013,20 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
     Chain, DAG.getConstant((Ins.size() == 0) ? 0 : 1, MVT::i32), InFlag
   };
   Chain = DAG.getNode(Func ? (NVPTXISD::PrintCallUni) : (NVPTXISD::PrintCall),
-                      dl, PrintCallVTs, PrintCallOps, 3);
+                      dl, PrintCallVTs, PrintCallOps);
   InFlag = Chain.getValue(1);
 
   // Ops to print out the function name
   SDVTList CallVoidVTs = DAG.getVTList(MVT::Other, MVT::Glue);
   SDValue CallVoidOps[] = { Chain, Callee, InFlag };
-  Chain = DAG.getNode(NVPTXISD::CallVoid, dl, CallVoidVTs, CallVoidOps, 3);
+  Chain = DAG.getNode(NVPTXISD::CallVoid, dl, CallVoidVTs, CallVoidOps);
   InFlag = Chain.getValue(1);
 
   // Ops to print out the param list
   SDVTList CallArgBeginVTs = DAG.getVTList(MVT::Other, MVT::Glue);
   SDValue CallArgBeginOps[] = { Chain, InFlag };
   Chain = DAG.getNode(NVPTXISD::CallArgBegin, dl, CallArgBeginVTs,
-                      CallArgBeginOps, 2);
+                      CallArgBeginOps);
   InFlag = Chain.getValue(1);
 
   for (unsigned i = 0, e = paramCount; i != e; ++i) {
@@ -929,21 +1038,20 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
     SDVTList CallArgVTs = DAG.getVTList(MVT::Other, MVT::Glue);
     SDValue CallArgOps[] = { Chain, DAG.getConstant(1, MVT::i32),
                              DAG.getConstant(i, MVT::i32), InFlag };
-    Chain = DAG.getNode(opcode, dl, CallArgVTs, CallArgOps, 4);
+    Chain = DAG.getNode(opcode, dl, CallArgVTs, CallArgOps);
     InFlag = Chain.getValue(1);
   }
   SDVTList CallArgEndVTs = DAG.getVTList(MVT::Other, MVT::Glue);
   SDValue CallArgEndOps[] = { Chain, DAG.getConstant(Func ? 1 : 0, MVT::i32),
                               InFlag };
-  Chain =
-      DAG.getNode(NVPTXISD::CallArgEnd, dl, CallArgEndVTs, CallArgEndOps, 3);
+  Chain = DAG.getNode(NVPTXISD::CallArgEnd, dl, CallArgEndVTs, CallArgEndOps);
   InFlag = Chain.getValue(1);
 
   if (!Func) {
     SDVTList PrototypeVTs = DAG.getVTList(MVT::Other, MVT::Glue);
     SDValue PrototypeOps[] = { Chain, DAG.getConstant(uniqueCallSite, MVT::i32),
                                InFlag };
-    Chain = DAG.getNode(NVPTXISD::Prototype, dl, PrototypeVTs, PrototypeOps, 3);
+    Chain = DAG.getNode(NVPTXISD::Prototype, dl, PrototypeVTs, PrototypeOps);
     InFlag = Chain.getValue(1);
   }
 
@@ -962,7 +1070,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
 
       if (NumElts == 1) {
         // Just a simple load
-        std::vector<EVT> LoadRetVTs;
+        SmallVector<EVT, 4> LoadRetVTs;
         if (needTruncate) {
           // If loading i1 result, generate
           //   load i16
@@ -972,15 +1080,14 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
           LoadRetVTs.push_back(EltVT);
         LoadRetVTs.push_back(MVT::Other);
         LoadRetVTs.push_back(MVT::Glue);
-        std::vector<SDValue> LoadRetOps;
+        SmallVector<SDValue, 4> LoadRetOps;
         LoadRetOps.push_back(Chain);
         LoadRetOps.push_back(DAG.getConstant(1, MVT::i32));
         LoadRetOps.push_back(DAG.getConstant(0, MVT::i32));
         LoadRetOps.push_back(InFlag);
         SDValue retval = DAG.getMemIntrinsicNode(
             NVPTXISD::LoadParam, dl,
-            DAG.getVTList(&LoadRetVTs[0], LoadRetVTs.size()), &LoadRetOps[0],
-            LoadRetOps.size(), EltVT, MachinePointerInfo());
+            DAG.getVTList(LoadRetVTs), LoadRetOps, EltVT, MachinePointerInfo());
         Chain = retval.getValue(1);
         InFlag = retval.getValue(2);
         SDValue Ret0 = retval;
@@ -989,7 +1096,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
         InVals.push_back(Ret0);
       } else if (NumElts == 2) {
         // LoadV2
-        std::vector<EVT> LoadRetVTs;
+        SmallVector<EVT, 4> LoadRetVTs;
         if (needTruncate) {
           // If loading i1 result, generate
           //   load i16
@@ -1002,15 +1109,14 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
         }
         LoadRetVTs.push_back(MVT::Other);
         LoadRetVTs.push_back(MVT::Glue);
-        std::vector<SDValue> LoadRetOps;
+        SmallVector<SDValue, 4> LoadRetOps;
         LoadRetOps.push_back(Chain);
         LoadRetOps.push_back(DAG.getConstant(1, MVT::i32));
         LoadRetOps.push_back(DAG.getConstant(0, MVT::i32));
         LoadRetOps.push_back(InFlag);
         SDValue retval = DAG.getMemIntrinsicNode(
             NVPTXISD::LoadParamV2, dl,
-            DAG.getVTList(&LoadRetVTs[0], LoadRetVTs.size()), &LoadRetOps[0],
-            LoadRetOps.size(), EltVT, MachinePointerInfo());
+            DAG.getVTList(LoadRetVTs), LoadRetOps, EltVT, MachinePointerInfo());
         Chain = retval.getValue(2);
         InFlag = retval.getValue(3);
         SDValue Ret0 = retval.getValue(0);
@@ -1054,8 +1160,8 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
           LoadRetOps.push_back(DAG.getConstant(Ofst, MVT::i32));
           LoadRetOps.push_back(InFlag);
           SDValue retval = DAG.getMemIntrinsicNode(
-              Opc, dl, DAG.getVTList(&LoadRetVTs[0], LoadRetVTs.size()),
-              &LoadRetOps[0], LoadRetOps.size(), EltVT, MachinePointerInfo());
+              Opc, dl, DAG.getVTList(LoadRetVTs),
+              LoadRetOps, EltVT, MachinePointerInfo());
           if (VecSize == 2) {
             Chain = retval.getValue(2);
             InFlag = retval.getValue(3);
@@ -1110,8 +1216,8 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
         LoadRetOps.push_back(InFlag);
         SDValue retval = DAG.getMemIntrinsicNode(
             NVPTXISD::LoadParam, dl,
-            DAG.getVTList(&LoadRetVTs[0], LoadRetVTs.size()), &LoadRetOps[0],
-            LoadRetOps.size(), TheLoadType, MachinePointerInfo());
+            DAG.getVTList(LoadRetVTs), LoadRetOps,
+            TheLoadType, MachinePointerInfo());
         Chain = retval.getValue(1);
         InFlag = retval.getValue(2);
         SDValue Ret0 = retval.getValue(0);
@@ -1153,8 +1259,7 @@ NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
                                 DAG.getIntPtrConstant(j)));
     }
   }
-  return DAG.getNode(ISD::BUILD_VECTOR, dl, Node->getValueType(0), &Ops[0],
-                     Ops.size());
+  return DAG.getNode(ISD::BUILD_VECTOR, dl, Node->getValueType(0), Ops);
 }
 
 SDValue
@@ -1209,7 +1314,7 @@ SDValue NVPTXTargetLowering::LowerLOADi1(SDValue Op, SelectionDAG &DAG) const {
   // load, so we build a MergeValues node for it. See ExpandUnalignedLoad()
   // in LegalizeDAG.cpp which also uses MergeValues.
   SDValue Ops[] = { result, LD->getChain() };
-  return DAG.getMergeValues(Ops, 2, dl);
+  return DAG.getMergeValues(Ops, dl);
 }
 
 SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
@@ -1297,7 +1402,7 @@ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
     MemSDNode *MemSD = cast<MemSDNode>(N);
 
     SDValue NewSt = DAG.getMemIntrinsicNode(
-        Opcode, DL, DAG.getVTList(MVT::Other), &Ops[0], Ops.size(),
+        Opcode, DL, DAG.getVTList(MVT::Other), Ops,
         MemSD->getMemoryVT(), MemSD->getMemOperand());
 
     //return DCI.CombineTo(N, NewSt, true);
@@ -1429,7 +1534,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
     if (isImageOrSamplerVal(
             theArgs[i],
             (theArgs[i]->getParent() ? theArgs[i]->getParent()->getParent()
-                                     : 0))) {
+                                     : nullptr))) {
       assert(isKernel && "Only kernels can have image/sampler params");
       InVals.push_back(DAG.getConstant(i + 1, MVT::i32));
       continue;
@@ -1683,8 +1788,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
   //}
 
   if (!OutChains.empty())
-    DAG.setRoot(DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &OutChains[0],
-                            OutChains.size()));
+    DAG.setRoot(DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains));
 
   return Chain;
 }
@@ -1726,7 +1830,7 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
         StoreVal = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal);
       SDValue Ops[] = { Chain, DAG.getConstant(0, MVT::i32), StoreVal };
       Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreRetval, dl,
-                                      DAG.getVTList(MVT::Other), &Ops[0], 3,
+                                      DAG.getVTList(MVT::Other), Ops,
                                       EltVT, MachinePointerInfo());
 
     } else if (NumElts == 2) {
@@ -1742,7 +1846,7 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
       SDValue Ops[] = { Chain, DAG.getConstant(0, MVT::i32), StoreVal0,
                         StoreVal1 };
       Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreRetvalV2, dl,
-                                      DAG.getVTList(MVT::Other), &Ops[0], 4,
+                                      DAG.getVTList(MVT::Other), Ops,
                                       EltVT, MachinePointerInfo());
     } else {
       // V4 stores
@@ -1814,8 +1918,8 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
 
         // Chain = DAG.getNode(Opc, dl, MVT::Other, &Ops[0], Ops.size());
         Chain =
-            DAG.getMemIntrinsicNode(Opc, dl, DAG.getVTList(MVT::Other), &Ops[0],
-                                    Ops.size(), EltVT, MachinePointerInfo());
+            DAG.getMemIntrinsicNode(Opc, dl, DAG.getVTList(MVT::Other), Ops,
+                                    EltVT, MachinePointerInfo());
         Offset += PerStoreOffset;
       }
     }
@@ -1852,8 +1956,8 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
 
         SDValue Ops[] = { Chain, DAG.getConstant(SizeSoFar, MVT::i32), TmpVal };
         Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreRetval, dl,
-                                        DAG.getVTList(MVT::Other), &Ops[0],
-                                        3, TheStoreType,
+                                        DAG.getVTList(MVT::Other), Ops,
+                                        TheStoreType,
                                         MachinePointerInfo());
         if(TheValType.isVector())
           SizeSoFar += 
@@ -1891,6 +1995,195 @@ bool NVPTXTargetLowering::isTypeSupportedInIntrinsic(MVT VT) const {
   return false;
 }
 
+static unsigned getOpcForTextureInstr(unsigned Intrinsic) {
+  switch (Intrinsic) {
+  default:
+    return 0;
+
+  case Intrinsic::nvvm_tex_1d_v4f32_i32:
+    return NVPTXISD::Tex1DFloatI32;
+  case Intrinsic::nvvm_tex_1d_v4f32_f32:
+    return NVPTXISD::Tex1DFloatFloat;
+  case Intrinsic::nvvm_tex_1d_level_v4f32_f32:
+    return NVPTXISD::Tex1DFloatFloatLevel;
+  case Intrinsic::nvvm_tex_1d_grad_v4f32_f32:
+    return NVPTXISD::Tex1DFloatFloatGrad;
+  case Intrinsic::nvvm_tex_1d_v4i32_i32:
+    return NVPTXISD::Tex1DI32I32;
+  case Intrinsic::nvvm_tex_1d_v4i32_f32:
+    return NVPTXISD::Tex1DI32Float;
+  case Intrinsic::nvvm_tex_1d_level_v4i32_f32:
+    return NVPTXISD::Tex1DI32FloatLevel;
+  case Intrinsic::nvvm_tex_1d_grad_v4i32_f32:
+    return NVPTXISD::Tex1DI32FloatGrad;
+
+  case Intrinsic::nvvm_tex_1d_array_v4f32_i32:
+    return NVPTXISD::Tex1DArrayFloatI32;
+  case Intrinsic::nvvm_tex_1d_array_v4f32_f32:
+    return NVPTXISD::Tex1DArrayFloatFloat;
+  case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32:
+    return NVPTXISD::Tex1DArrayFloatFloatLevel;
+  case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32:
+    return NVPTXISD::Tex1DArrayFloatFloatGrad;
+  case Intrinsic::nvvm_tex_1d_array_v4i32_i32:
+    return NVPTXISD::Tex1DArrayI32I32;
+  case Intrinsic::nvvm_tex_1d_array_v4i32_f32:
+    return NVPTXISD::Tex1DArrayI32Float;
+  case Intrinsic::nvvm_tex_1d_array_level_v4i32_f32:
+    return NVPTXISD::Tex1DArrayI32FloatLevel;
+  case Intrinsic::nvvm_tex_1d_array_grad_v4i32_f32:
+    return NVPTXISD::Tex1DArrayI32FloatGrad;
+
+  case Intrinsic::nvvm_tex_2d_v4f32_i32:
+    return NVPTXISD::Tex2DFloatI32;
+  case Intrinsic::nvvm_tex_2d_v4f32_f32:
+    return NVPTXISD::Tex2DFloatFloat;
+  case Intrinsic::nvvm_tex_2d_level_v4f32_f32:
+    return NVPTXISD::Tex2DFloatFloatLevel;
+  case Intrinsic::nvvm_tex_2d_grad_v4f32_f32:
+    return NVPTXISD::Tex2DFloatFloatGrad;
+  case Intrinsic::nvvm_tex_2d_v4i32_i32:
+    return NVPTXISD::Tex2DI32I32;
+  case Intrinsic::nvvm_tex_2d_v4i32_f32:
+    return NVPTXISD::Tex2DI32Float;
+  case Intrinsic::nvvm_tex_2d_level_v4i32_f32:
+    return NVPTXISD::Tex2DI32FloatLevel;
+  case Intrinsic::nvvm_tex_2d_grad_v4i32_f32:
+    return NVPTXISD::Tex2DI32FloatGrad;
+
+  case Intrinsic::nvvm_tex_2d_array_v4f32_i32:
+    return NVPTXISD::Tex2DArrayFloatI32;
+  case Intrinsic::nvvm_tex_2d_array_v4f32_f32:
+    return NVPTXISD::Tex2DArrayFloatFloat;
+  case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32:
+    return NVPTXISD::Tex2DArrayFloatFloatLevel;
+  case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32:
+    return NVPTXISD::Tex2DArrayFloatFloatGrad;
+  case Intrinsic::nvvm_tex_2d_array_v4i32_i32:
+    return NVPTXISD::Tex2DArrayI32I32;
+  case Intrinsic::nvvm_tex_2d_array_v4i32_f32:
+    return NVPTXISD::Tex2DArrayI32Float;
+  case Intrinsic::nvvm_tex_2d_array_level_v4i32_f32:
+    return NVPTXISD::Tex2DArrayI32FloatLevel;
+  case Intrinsic::nvvm_tex_2d_array_grad_v4i32_f32:
+    return NVPTXISD::Tex2DArrayI32FloatGrad;
+
+  case Intrinsic::nvvm_tex_3d_v4f32_i32:
+    return NVPTXISD::Tex3DFloatI32;
+  case Intrinsic::nvvm_tex_3d_v4f32_f32:
+    return NVPTXISD::Tex3DFloatFloat;
+  case Intrinsic::nvvm_tex_3d_level_v4f32_f32:
+    return NVPTXISD::Tex3DFloatFloatLevel;
+  case Intrinsic::nvvm_tex_3d_grad_v4f32_f32:
+    return NVPTXISD::Tex3DFloatFloatGrad;
+  case Intrinsic::nvvm_tex_3d_v4i32_i32:
+    return NVPTXISD::Tex3DI32I32;
+  case Intrinsic::nvvm_tex_3d_v4i32_f32:
+    return NVPTXISD::Tex3DI32Float;
+  case Intrinsic::nvvm_tex_3d_level_v4i32_f32:
+    return NVPTXISD::Tex3DI32FloatLevel;
+  case Intrinsic::nvvm_tex_3d_grad_v4i32_f32:
+    return NVPTXISD::Tex3DI32FloatGrad;
+  }
+}
+
+static unsigned getOpcForSurfaceInstr(unsigned Intrinsic) {
+  switch (Intrinsic) {
+  default:
+    return 0;
+  case Intrinsic::nvvm_suld_1d_i8_trap:
+    return NVPTXISD::Suld1DI8Trap;
+  case Intrinsic::nvvm_suld_1d_i16_trap:
+    return NVPTXISD::Suld1DI16Trap;
+  case Intrinsic::nvvm_suld_1d_i32_trap:
+    return NVPTXISD::Suld1DI32Trap;
+  case Intrinsic::nvvm_suld_1d_v2i8_trap:
+    return NVPTXISD::Suld1DV2I8Trap;
+  case Intrinsic::nvvm_suld_1d_v2i16_trap:
+    return NVPTXISD::Suld1DV2I16Trap;
+  case Intrinsic::nvvm_suld_1d_v2i32_trap:
+    return NVPTXISD::Suld1DV2I32Trap;
+  case Intrinsic::nvvm_suld_1d_v4i8_trap:
+    return NVPTXISD::Suld1DV4I8Trap;
+  case Intrinsic::nvvm_suld_1d_v4i16_trap:
+    return NVPTXISD::Suld1DV4I16Trap;
+  case Intrinsic::nvvm_suld_1d_v4i32_trap:
+    return NVPTXISD::Suld1DV4I32Trap;
+  case Intrinsic::nvvm_suld_1d_array_i8_trap:
+    return NVPTXISD::Suld1DArrayI8Trap;
+  case Intrinsic::nvvm_suld_1d_array_i16_trap:
+    return NVPTXISD::Suld1DArrayI16Trap;
+  case Intrinsic::nvvm_suld_1d_array_i32_trap:
+    return NVPTXISD::Suld1DArrayI32Trap;
+  case Intrinsic::nvvm_suld_1d_array_v2i8_trap:
+    return NVPTXISD::Suld1DArrayV2I8Trap;
+  case Intrinsic::nvvm_suld_1d_array_v2i16_trap:
+    return NVPTXISD::Suld1DArrayV2I16Trap;
+  case Intrinsic::nvvm_suld_1d_array_v2i32_trap:
+    return NVPTXISD::Suld1DArrayV2I32Trap;
+  case Intrinsic::nvvm_suld_1d_array_v4i8_trap:
+    return NVPTXISD::Suld1DArrayV4I8Trap;
+  case Intrinsic::nvvm_suld_1d_array_v4i16_trap:
+    return NVPTXISD::Suld1DArrayV4I16Trap;
+  case Intrinsic::nvvm_suld_1d_array_v4i32_trap:
+    return NVPTXISD::Suld1DArrayV4I32Trap;
+  case Intrinsic::nvvm_suld_2d_i8_trap:
+    return NVPTXISD::Suld2DI8Trap;
+  case Intrinsic::nvvm_suld_2d_i16_trap:
+    return NVPTXISD::Suld2DI16Trap;
+  case Intrinsic::nvvm_suld_2d_i32_trap:
+    return NVPTXISD::Suld2DI32Trap;
+  case Intrinsic::nvvm_suld_2d_v2i8_trap:
+    return NVPTXISD::Suld2DV2I8Trap;
+  case Intrinsic::nvvm_suld_2d_v2i16_trap:
+    return NVPTXISD::Suld2DV2I16Trap;
+  case Intrinsic::nvvm_suld_2d_v2i32_trap:
+    return NVPTXISD::Suld2DV2I32Trap;
+  case Intrinsic::nvvm_suld_2d_v4i8_trap:
+    return NVPTXISD::Suld2DV4I8Trap;
+  case Intrinsic::nvvm_suld_2d_v4i16_trap:
+    return NVPTXISD::Suld2DV4I16Trap;
+  case Intrinsic::nvvm_suld_2d_v4i32_trap:
+    return NVPTXISD::Suld2DV4I32Trap;
+  case Intrinsic::nvvm_suld_2d_array_i8_trap:
+    return NVPTXISD::Suld2DArrayI8Trap;
+  case Intrinsic::nvvm_suld_2d_array_i16_trap:
+    return NVPTXISD::Suld2DArrayI16Trap;
+  case Intrinsic::nvvm_suld_2d_array_i32_trap:
+    return NVPTXISD::Suld2DArrayI32Trap;
+  case Intrinsic::nvvm_suld_2d_array_v2i8_trap:
+    return NVPTXISD::Suld2DArrayV2I8Trap;
+  case Intrinsic::nvvm_suld_2d_array_v2i16_trap:
+    return NVPTXISD::Suld2DArrayV2I16Trap;
+  case Intrinsic::nvvm_suld_2d_array_v2i32_trap:
+    return NVPTXISD::Suld2DArrayV2I32Trap;
+  case Intrinsic::nvvm_suld_2d_array_v4i8_trap:
+    return NVPTXISD::Suld2DArrayV4I8Trap;
+  case Intrinsic::nvvm_suld_2d_array_v4i16_trap:
+    return NVPTXISD::Suld2DArrayV4I16Trap;
+  case Intrinsic::nvvm_suld_2d_array_v4i32_trap:
+    return NVPTXISD::Suld2DArrayV4I32Trap;
+  case Intrinsic::nvvm_suld_3d_i8_trap:
+    return NVPTXISD::Suld3DI8Trap;
+  case Intrinsic::nvvm_suld_3d_i16_trap:
+    return NVPTXISD::Suld3DI16Trap;
+  case Intrinsic::nvvm_suld_3d_i32_trap:
+    return NVPTXISD::Suld3DI32Trap;
+  case Intrinsic::nvvm_suld_3d_v2i8_trap:
+    return NVPTXISD::Suld3DV2I8Trap;
+  case Intrinsic::nvvm_suld_3d_v2i16_trap:
+    return NVPTXISD::Suld3DV2I16Trap;
+  case Intrinsic::nvvm_suld_3d_v2i32_trap:
+    return NVPTXISD::Suld3DV2I32Trap;
+  case Intrinsic::nvvm_suld_3d_v4i8_trap:
+    return NVPTXISD::Suld3DV4I8Trap;
+  case Intrinsic::nvvm_suld_3d_v4i16_trap:
+    return NVPTXISD::Suld3DV4I16Trap;
+  case Intrinsic::nvvm_suld_3d_v4i32_trap:
+    return NVPTXISD::Suld3DV4I32Trap;
+  }
+}
+
 // llvm.ptx.memcpy.const and llvm.ptx.memmove.const need to be modeled as
 // TgtMemIntrinsic
 // because we need the information that is only available in the "Value" type
@@ -1944,6 +2237,142 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
     Info.align = 0;
     return true;
 
+  case Intrinsic::nvvm_tex_1d_v4f32_i32:
+  case Intrinsic::nvvm_tex_1d_v4f32_f32:
+  case Intrinsic::nvvm_tex_1d_level_v4f32_f32:
+  case Intrinsic::nvvm_tex_1d_grad_v4f32_f32:
+  case Intrinsic::nvvm_tex_1d_array_v4f32_i32:
+  case Intrinsic::nvvm_tex_1d_array_v4f32_f32:
+  case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32:
+  case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32:
+  case Intrinsic::nvvm_tex_2d_v4f32_i32:
+  case Intrinsic::nvvm_tex_2d_v4f32_f32:
+  case Intrinsic::nvvm_tex_2d_level_v4f32_f32:
+  case Intrinsic::nvvm_tex_2d_grad_v4f32_f32:
+  case Intrinsic::nvvm_tex_2d_array_v4f32_i32:
+  case Intrinsic::nvvm_tex_2d_array_v4f32_f32:
+  case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32:
+  case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32:
+  case Intrinsic::nvvm_tex_3d_v4f32_i32:
+  case Intrinsic::nvvm_tex_3d_v4f32_f32:
+  case Intrinsic::nvvm_tex_3d_level_v4f32_f32:
+  case Intrinsic::nvvm_tex_3d_grad_v4f32_f32: {
+    Info.opc = getOpcForTextureInstr(Intrinsic);
+    Info.memVT = MVT::f32;
+    Info.ptrVal = nullptr;
+    Info.offset = 0;
+    Info.vol = 0;
+    Info.readMem = true;
+    Info.writeMem = false;
+    Info.align = 16;
+    return true;
+  }
+  case Intrinsic::nvvm_tex_1d_v4i32_i32:
+  case Intrinsic::nvvm_tex_1d_v4i32_f32:
+  case Intrinsic::nvvm_tex_1d_level_v4i32_f32:
+  case Intrinsic::nvvm_tex_1d_grad_v4i32_f32:
+  case Intrinsic::nvvm_tex_1d_array_v4i32_i32:
+  case Intrinsic::nvvm_tex_1d_array_v4i32_f32:
+  case Intrinsic::nvvm_tex_1d_array_level_v4i32_f32:
+  case Intrinsic::nvvm_tex_1d_array_grad_v4i32_f32:
+  case Intrinsic::nvvm_tex_2d_v4i32_i32:
+  case Intrinsic::nvvm_tex_2d_v4i32_f32:
+  case Intrinsic::nvvm_tex_2d_level_v4i32_f32:
+  case Intrinsic::nvvm_tex_2d_grad_v4i32_f32:
+  case Intrinsic::nvvm_tex_2d_array_v4i32_i32:
+  case Intrinsic::nvvm_tex_2d_array_v4i32_f32:
+  case Intrinsic::nvvm_tex_2d_array_level_v4i32_f32:
+  case Intrinsic::nvvm_tex_2d_array_grad_v4i32_f32:
+  case Intrinsic::nvvm_tex_3d_v4i32_i32:
+  case Intrinsic::nvvm_tex_3d_v4i32_f32:
+  case Intrinsic::nvvm_tex_3d_level_v4i32_f32:
+  case Intrinsic::nvvm_tex_3d_grad_v4i32_f32: {
+    Info.opc = getOpcForTextureInstr(Intrinsic);
+    Info.memVT = MVT::i32;
+    Info.ptrVal = nullptr;
+    Info.offset = 0;
+    Info.vol = 0;
+    Info.readMem = true;
+    Info.writeMem = false;
+    Info.align = 16;
+    return true;
+  }
+  case Intrinsic::nvvm_suld_1d_i8_trap:
+  case Intrinsic::nvvm_suld_1d_v2i8_trap:
+  case Intrinsic::nvvm_suld_1d_v4i8_trap:
+  case Intrinsic::nvvm_suld_1d_array_i8_trap:
+  case Intrinsic::nvvm_suld_1d_array_v2i8_trap:
+  case Intrinsic::nvvm_suld_1d_array_v4i8_trap:
+  case Intrinsic::nvvm_suld_2d_i8_trap:
+  case Intrinsic::nvvm_suld_2d_v2i8_trap:
+  case Intrinsic::nvvm_suld_2d_v4i8_trap:
+  case Intrinsic::nvvm_suld_2d_array_i8_trap:
+  case Intrinsic::nvvm_suld_2d_array_v2i8_trap:
+  case Intrinsic::nvvm_suld_2d_array_v4i8_trap:
+  case Intrinsic::nvvm_suld_3d_i8_trap:
+  case Intrinsic::nvvm_suld_3d_v2i8_trap:
+  case Intrinsic::nvvm_suld_3d_v4i8_trap: {
+    Info.opc = getOpcForSurfaceInstr(Intrinsic);
+    Info.memVT = MVT::i8;
+    Info.ptrVal = nullptr;
+    Info.offset = 0;
+    Info.vol = 0;
+    Info.readMem = true;
+    Info.writeMem = false;
+    Info.align = 16;
+    return true;
+  }
+  case Intrinsic::nvvm_suld_1d_i16_trap:
+  case Intrinsic::nvvm_suld_1d_v2i16_trap:
+  case Intrinsic::nvvm_suld_1d_v4i16_trap:
+  case Intrinsic::nvvm_suld_1d_array_i16_trap:
+  case Intrinsic::nvvm_suld_1d_array_v2i16_trap:
+  case Intrinsic::nvvm_suld_1d_array_v4i16_trap:
+  case Intrinsic::nvvm_suld_2d_i16_trap:
+  case Intrinsic::nvvm_suld_2d_v2i16_trap:
+  case Intrinsic::nvvm_suld_2d_v4i16_trap:
+  case Intrinsic::nvvm_suld_2d_array_i16_trap:
+  case Intrinsic::nvvm_suld_2d_array_v2i16_trap:
+  case Intrinsic::nvvm_suld_2d_array_v4i16_trap:
+  case Intrinsic::nvvm_suld_3d_i16_trap:
+  case Intrinsic::nvvm_suld_3d_v2i16_trap:
+  case Intrinsic::nvvm_suld_3d_v4i16_trap: {
+    Info.opc = getOpcForSurfaceInstr(Intrinsic);
+    Info.memVT = MVT::i16;
+    Info.ptrVal = nullptr;
+    Info.offset = 0;
+    Info.vol = 0;
+    Info.readMem = true;
+    Info.writeMem = false;
+    Info.align = 16;
+    return true;
+  }
+  case Intrinsic::nvvm_suld_1d_i32_trap:
+  case Intrinsic::nvvm_suld_1d_v2i32_trap:
+  case Intrinsic::nvvm_suld_1d_v4i32_trap:
+  case Intrinsic::nvvm_suld_1d_array_i32_trap:
+  case Intrinsic::nvvm_suld_1d_array_v2i32_trap:
+  case Intrinsic::nvvm_suld_1d_array_v4i32_trap:
+  case Intrinsic::nvvm_suld_2d_i32_trap:
+  case Intrinsic::nvvm_suld_2d_v2i32_trap:
+  case Intrinsic::nvvm_suld_2d_v4i32_trap:
+  case Intrinsic::nvvm_suld_2d_array_i32_trap:
+  case Intrinsic::nvvm_suld_2d_array_v2i32_trap:
+  case Intrinsic::nvvm_suld_2d_array_v4i32_trap:
+  case Intrinsic::nvvm_suld_3d_i32_trap:
+  case Intrinsic::nvvm_suld_3d_v2i32_trap:
+  case Intrinsic::nvvm_suld_3d_v4i32_trap: {
+    Info.opc = getOpcForSurfaceInstr(Intrinsic);
+    Info.memVT = MVT::i32;
+    Info.ptrVal = nullptr;
+    Info.offset = 0;
+    Info.vol = 0;
+    Info.readMem = true;
+    Info.writeMem = false;
+    Info.align = 16;
+    return true;
+  }
+
   }
   return false;
 }
@@ -2094,7 +2523,7 @@ static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
   case 4: {
     Opcode = NVPTXISD::LoadV4;
     EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };
-    LdResVTs = DAG.getVTList(ListVTs, 5);
+    LdResVTs = DAG.getVTList(ListVTs);
     break;
   }
   }
@@ -2111,8 +2540,8 @@ static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
   // pass along the extension information
   OtherOps.push_back(DAG.getIntPtrConstant(LD->getExtensionType()));
 
-  SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, &OtherOps[0],
-                                          OtherOps.size(), LD->getMemoryVT(),
+  SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
+                                          LD->getMemoryVT(),
                                           LD->getMemOperand());
 
   SmallVector<SDValue, 4> ScalarRes;
@@ -2126,8 +2555,7 @@ static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
 
   SDValue LoadChain = NewLD.getValue(NumElts);
 
-  SDValue BuildVec =
-      DAG.getNode(ISD::BUILD_VECTOR, DL, ResVT, &ScalarRes[0], NumElts);
+  SDValue BuildVec = DAG.getNode(ISD::BUILD_VECTOR, DL, ResVT, ScalarRes);
 
   Results.push_back(BuildVec);
   Results.push_back(LoadChain);
@@ -2207,7 +2635,7 @@ static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG,
           break;
         }
         EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };
-        LdResVTs = DAG.getVTList(ListVTs, 5);
+        LdResVTs = DAG.getVTList(ListVTs);
         break;
       }
       }
@@ -2224,9 +2652,9 @@ static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG,
 
       MemIntrinsicSDNode *MemSD = cast<MemIntrinsicSDNode>(N);
 
-      SDValue NewLD = DAG.getMemIntrinsicNode(
-          Opcode, DL, LdResVTs, &OtherOps[0], OtherOps.size(),
-          MemSD->getMemoryVT(), MemSD->getMemOperand());
+      SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
+                                              MemSD->getMemoryVT(),
+                                              MemSD->getMemOperand());
 
       SmallVector<SDValue, 4> ScalarRes;
 
@@ -2241,7 +2669,7 @@ static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG,
       SDValue LoadChain = NewLD.getValue(NumElts);
 
       SDValue BuildVec =
-          DAG.getNode(ISD::BUILD_VECTOR, DL, ResVT, &ScalarRes[0], NumElts);
+          DAG.getNode(ISD::BUILD_VECTOR, DL, ResVT, ScalarRes);
 
       Results.push_back(BuildVec);
       Results.push_back(LoadChain);
@@ -2263,8 +2691,8 @@ static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG,
       // We make sure the memory type is i8, which will be used during isel
       // to select the proper instruction.
       SDValue NewLD =
-          DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, LdResVTs, &Ops[0],
-                                  Ops.size(), MVT::i8, MemSD->getMemOperand());
+          DAG.getMemIntrinsicNode(ISD::INTRINSIC_W_CHAIN, DL, LdResVTs, Ops,
+                                  MVT::i8, MemSD->getMemOperand());
 
       Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
                                     NewLD.getValue(0)));
diff --git a/lib/Target/NVPTX/NVPTXISelLowering.h b/lib/Target/NVPTX/NVPTXISelLowering.h
index c1e8c21..7bad8a2 100644
--- a/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -70,7 +70,100 @@ enum NodeType {
   StoreParamU32, // to zext and store a <32bit value, not used currently 
   StoreRetval,
   StoreRetvalV2,
-  StoreRetvalV4
+  StoreRetvalV4,
+
+  // Texture intrinsics
+  Tex1DFloatI32,
+  Tex1DFloatFloat,
+  Tex1DFloatFloatLevel,
+  Tex1DFloatFloatGrad,
+  Tex1DI32I32,
+  Tex1DI32Float,
+  Tex1DI32FloatLevel,
+  Tex1DI32FloatGrad,
+  Tex1DArrayFloatI32,
+  Tex1DArrayFloatFloat,
+  Tex1DArrayFloatFloatLevel,
+  Tex1DArrayFloatFloatGrad,
+  Tex1DArrayI32I32,
+  Tex1DArrayI32Float,
+  Tex1DArrayI32FloatLevel,
+  Tex1DArrayI32FloatGrad,
+  Tex2DFloatI32,
+  Tex2DFloatFloat,
+  Tex2DFloatFloatLevel,
+  Tex2DFloatFloatGrad,
+  Tex2DI32I32,
+  Tex2DI32Float,
+  Tex2DI32FloatLevel,
+  Tex2DI32FloatGrad,
+  Tex2DArrayFloatI32,
+  Tex2DArrayFloatFloat,
+  Tex2DArrayFloatFloatLevel,
+  Tex2DArrayFloatFloatGrad,
+  Tex2DArrayI32I32,
+  Tex2DArrayI32Float,
+  Tex2DArrayI32FloatLevel,
+  Tex2DArrayI32FloatGrad,
+  Tex3DFloatI32,
+  Tex3DFloatFloat,
+  Tex3DFloatFloatLevel,
+  Tex3DFloatFloatGrad,
+  Tex3DI32I32,
+  Tex3DI32Float,
+  Tex3DI32FloatLevel,
+  Tex3DI32FloatGrad,
+
+  // Surface intrinsics
+  Suld1DI8Trap,
+  Suld1DI16Trap,
+  Suld1DI32Trap,
+  Suld1DV2I8Trap,
+  Suld1DV2I16Trap,
+  Suld1DV2I32Trap,
+  Suld1DV4I8Trap,
+  Suld1DV4I16Trap,
+  Suld1DV4I32Trap,
+
+  Suld1DArrayI8Trap,
+  Suld1DArrayI16Trap,
+  Suld1DArrayI32Trap,
+  Suld1DArrayV2I8Trap,
+  Suld1DArrayV2I16Trap,
+  Suld1DArrayV2I32Trap,
+  Suld1DArrayV4I8Trap,
+  Suld1DArrayV4I16Trap,
+  Suld1DArrayV4I32Trap,
+
+  Suld2DI8Trap,
+  Suld2DI16Trap,
+  Suld2DI32Trap,
+  Suld2DV2I8Trap,
+  Suld2DV2I16Trap,
+  Suld2DV2I32Trap,
+  Suld2DV4I8Trap,
+  Suld2DV4I16Trap,
+  Suld2DV4I32Trap,
+
+  Suld2DArrayI8Trap,
+  Suld2DArrayI16Trap,
+  Suld2DArrayI32Trap,
+  Suld2DArrayV2I8Trap,
+  Suld2DArrayV2I16Trap,
+  Suld2DArrayV2I32Trap,
+  Suld2DArrayV4I8Trap,
+  Suld2DArrayV4I16Trap,
+  Suld2DArrayV4I32Trap,
+
+  Suld3DI8Trap,
+  Suld3DI16Trap,
+  Suld3DI32Trap,
+  Suld3DV2I8Trap,
+  Suld3DV2I16Trap,
+  Suld3DV2I32Trap,
+  Suld3DV4I8Trap,
+  Suld3DV4I16Trap,
+  Suld3DV4I32Trap
 };
 }
 
@@ -80,68 +173,70 @@ enum NodeType {
 class NVPTXTargetLowering : public TargetLowering {
 public:
   explicit NVPTXTargetLowering(NVPTXTargetMachine &TM);
-  virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
 
   SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerGlobalAddress(const GlobalValue *GV, int64_t Offset,
                              SelectionDAG &DAG) const;
 
-  virtual const char *getTargetNodeName(unsigned Opcode) const;
+  const char *getTargetNodeName(unsigned Opcode) const override;
 
   bool isTypeSupportedInIntrinsic(MVT VT) const;
 
   bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I,
-                          unsigned Intrinsic) const;
+                          unsigned Intrinsic) const override;
 
   /// isLegalAddressingMode - Return true if the addressing mode represented
   /// by AM is legal for this target, for a load/store of the specified type
   /// Used to guide target specific optimizations, like loop strength
   /// reduction (LoopStrengthReduce.cpp) and memory optimization for
   /// address mode (CodeGenPrepare.cpp)
-  virtual bool isLegalAddressingMode(const AddrMode &AM, Type *Ty) const;
+  bool isLegalAddressingMode(const AddrMode &AM, Type *Ty) const override;
 
   /// getFunctionAlignment - Return the Log2 alignment of this function.
-  virtual unsigned getFunctionAlignment(const Function *F) const;
+  unsigned getFunctionAlignment(const Function *F) const;
 
-  virtual EVT getSetCCResultType(LLVMContext &, EVT VT) const {
+  EVT getSetCCResultType(LLVMContext &, EVT VT) const override {
     if (VT.isVector())
       return MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
     return MVT::i1;
   }
 
-  ConstraintType getConstraintType(const std::string &Constraint) const;
+  ConstraintType
+  getConstraintType(const std::string &Constraint) const override;
   std::pair<unsigned, const TargetRegisterClass *>
-  getRegForInlineAsmConstraint(const std::string &Constraint, MVT VT) const;
+  getRegForInlineAsmConstraint(const std::string &Constraint,
+                               MVT VT) const override;
 
-  virtual SDValue LowerFormalArguments(
+  SDValue LowerFormalArguments(
       SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
       const SmallVectorImpl<ISD::InputArg> &Ins, SDLoc dl, SelectionDAG &DAG,
-      SmallVectorImpl<SDValue> &InVals) const;
+      SmallVectorImpl<SDValue> &InVals) const override;
 
-  virtual SDValue
-  LowerCall(CallLoweringInfo &CLI, SmallVectorImpl<SDValue> &InVals) const;
+  SDValue LowerCall(CallLoweringInfo &CLI,
+                    SmallVectorImpl<SDValue> &InVals) const override;
 
   std::string getPrototype(Type *, const ArgListTy &,
                            const SmallVectorImpl<ISD::OutputArg> &,
                            unsigned retAlignment,
                            const ImmutableCallSite *CS) const;
 
-  virtual SDValue
+  SDValue
   LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
               const SmallVectorImpl<ISD::OutputArg> &Outs,
               const SmallVectorImpl<SDValue> &OutVals, SDLoc dl,
-              SelectionDAG &DAG) const;
+              SelectionDAG &DAG) const override;
 
-  virtual void LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint,
-                                            std::vector<SDValue> &Ops,
-                                            SelectionDAG &DAG) const;
+  void LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint,
+                                    std::vector<SDValue> &Ops,
+                                    SelectionDAG &DAG) const override;
 
   NVPTXTargetMachine *nvTM;
 
   // PTX always uses 32-bit shift amounts
-  virtual MVT getScalarShiftAmountTy(EVT LHSTy) const { return MVT::i32; }
+  MVT getScalarShiftAmountTy(EVT LHSTy) const override { return MVT::i32; }
 
-  virtual bool shouldSplitVectorType(EVT VT) const override;
+  bool shouldSplitVectorType(EVT VT) const override;
 
 private:
   const NVPTXSubtarget &nvptxSubtarget; // cache the subtarget here
@@ -160,8 +255,8 @@ private:
   SDValue LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const;
 
-  virtual void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
-                                  SelectionDAG &DAG) const;
+  void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
+                          SelectionDAG &DAG) const override;
 
   unsigned getArgumentAlignment(SDValue Callee, const ImmutableCallSite *CS,
                                 Type *Ty, unsigned Idx) const;
diff --git a/lib/Target/NVPTX/NVPTXImageOptimizer.cpp b/lib/Target/NVPTX/NVPTXImageOptimizer.cpp
new file mode 100644
index 0000000..397f4bc
--- /dev/null
+++ b/lib/Target/NVPTX/NVPTXImageOptimizer.cpp
@@ -0,0 +1,178 @@
+//===-- NVPTXImageOptimizer.cpp - Image optimization pass -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source 
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass implements IR-level optimizations of image access code,
+// including:
+//
+// 1. Eliminate istypep intrinsics when image access qualifier is known
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTX.h"
+#include "NVPTXUtilities.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Analysis/ConstantFolding.h"
+
+using namespace llvm;
+
+namespace {
+class NVPTXImageOptimizer : public FunctionPass {
+private:
+  static char ID;
+  SmallVector<Instruction*, 4> InstrToDelete;
+
+public:
+  NVPTXImageOptimizer();
+
+  bool runOnFunction(Function &F) override;
+
+private:
+  bool replaceIsTypePSampler(Instruction &I);
+  bool replaceIsTypePSurface(Instruction &I);
+  bool replaceIsTypePTexture(Instruction &I);
+  Value *cleanupValue(Value *V);
+  void replaceWith(Instruction *From, ConstantInt *To);
+};
+}
+
+char NVPTXImageOptimizer::ID = 0;
+
+NVPTXImageOptimizer::NVPTXImageOptimizer()
+  : FunctionPass(ID) {}
+
+bool NVPTXImageOptimizer::runOnFunction(Function &F) {
+  bool Changed = false;
+  InstrToDelete.clear();
+
+  // Look for call instructions in the function
+  for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE;
+       ++BI) {
+    for (BasicBlock::iterator I = (*BI).begin(), E = (*BI).end();
+         I != E; ++I) {
+      Instruction &Instr = *I;
+      if (CallInst *CI = dyn_cast<CallInst>(I)) {
+        Function *CalledF = CI->getCalledFunction();
+        if (CalledF && CalledF->isIntrinsic()) {
+          // This is an intrinsic function call, check if its an istypep
+          switch (CalledF->getIntrinsicID()) {
+          default: break;
+          case Intrinsic::nvvm_istypep_sampler:
+            Changed |= replaceIsTypePSampler(Instr);
+            break;
+          case Intrinsic::nvvm_istypep_surface:
+            Changed |= replaceIsTypePSurface(Instr);
+            break;
+          case Intrinsic::nvvm_istypep_texture:
+            Changed |= replaceIsTypePTexture(Instr);
+            break;
+          }
+        }
+      }
+    }
+  }
+
+  // Delete any istypep instances we replaced in the IR
+  for (unsigned i = 0, e = InstrToDelete.size(); i != e; ++i)
+    InstrToDelete[i]->eraseFromParent();
+
+  return Changed;
+}
+
+bool NVPTXImageOptimizer::replaceIsTypePSampler(Instruction &I) {
+  Value *TexHandle = cleanupValue(I.getOperand(0));
+  if (isSampler(*TexHandle)) {
+    // This is an OpenCL sampler, so it must be a samplerref
+    replaceWith(&I, ConstantInt::getTrue(I.getContext()));
+    return true;
+  } else if (isImageWriteOnly(*TexHandle) ||
+             isImageReadWrite(*TexHandle) ||
+             isImageReadOnly(*TexHandle)) {
+    // This is an OpenCL image, so it cannot be a samplerref
+    replaceWith(&I, ConstantInt::getFalse(I.getContext()));
+    return true;
+  } else {
+    // The image type is unknown, so we cannot eliminate the intrinsic
+    return false;
+  }
+}
+
+bool NVPTXImageOptimizer::replaceIsTypePSurface(Instruction &I) {
+  Value *TexHandle = cleanupValue(I.getOperand(0));
+  if (isImageReadWrite(*TexHandle) ||
+      isImageWriteOnly(*TexHandle)) {
+    // This is an OpenCL read-only/read-write image, so it must be a surfref
+    replaceWith(&I, ConstantInt::getTrue(I.getContext()));
+    return true;
+  } else if (isImageReadOnly(*TexHandle) ||
+             isSampler(*TexHandle)) {
+    // This is an OpenCL read-only/ imageor sampler, so it cannot be
+    // a surfref
+    replaceWith(&I, ConstantInt::getFalse(I.getContext()));
+    return true;
+  } else {
+    // The image type is unknown, so we cannot eliminate the intrinsic
+    return false;
+  }
+}
+
+bool NVPTXImageOptimizer::replaceIsTypePTexture(Instruction &I) {
+  Value *TexHandle = cleanupValue(I.getOperand(0));
+  if (isImageReadOnly(*TexHandle)) {
+    // This is an OpenCL read-only image, so it must be a texref
+    replaceWith(&I, ConstantInt::getTrue(I.getContext()));
+    return true;
+  } else if (isImageWriteOnly(*TexHandle) ||
+             isImageReadWrite(*TexHandle) ||
+             isSampler(*TexHandle)) {
+    // This is an OpenCL read-write/write-only image or a sampler, so it
+    // cannot be a texref
+    replaceWith(&I, ConstantInt::getFalse(I.getContext()));
+    return true;
+  } else {
+    // The image type is unknown, so we cannot eliminate the intrinsic
+    return false;
+  }
+}
+
+void NVPTXImageOptimizer::replaceWith(Instruction *From, ConstantInt *To) {
+  // We implement "poor man's DCE" here to make sure any code that is no longer
+  // live is actually unreachable and can be trivially eliminated by the
+  // unreachable block elimiation pass.
+  for (CallInst::use_iterator UI = From->use_begin(), UE = From->use_end();
+       UI != UE; ++UI) {
+    if (BranchInst *BI = dyn_cast<BranchInst>(*UI)) {
+      if (BI->isUnconditional()) continue;
+      BasicBlock *Dest;
+      if (To->isZero())
+        // Get false block
+        Dest = BI->getSuccessor(1);
+      else
+        // Get true block
+        Dest = BI->getSuccessor(0);
+      BranchInst::Create(Dest, BI);
+      InstrToDelete.push_back(BI);
+    }
+  }
+  From->replaceAllUsesWith(To);
+  InstrToDelete.push_back(From);
+}
+
+Value *NVPTXImageOptimizer::cleanupValue(Value *V) {
+  if (ExtractValueInst *EVI = dyn_cast<ExtractValueInst>(V)) {
+    return cleanupValue(EVI->getAggregateOperand());
+  }
+  return V;
+}
+
+FunctionPass *llvm::createNVPTXImageOptimizerPass() {
+  return new NVPTXImageOptimizer();
+}
diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.cpp b/lib/Target/NVPTX/NVPTXInstrInfo.cpp
index 86ddd38..cdc8088 100644
--- a/lib/Target/NVPTX/NVPTXInstrInfo.cpp
+++ b/lib/Target/NVPTX/NVPTXInstrInfo.cpp
@@ -14,8 +14,6 @@
 #include "NVPTX.h"
 #include "NVPTXInstrInfo.h"
 #include "NVPTXTargetMachine.h"
-#define GET_INSTRINFO_CTOR_DTOR
-#include "NVPTXGenInstrInfo.inc"
 #include "llvm/IR/Function.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/CodeGen/MachineFunction.h"
@@ -24,6 +22,9 @@
 
 using namespace llvm;
 
+#define GET_INSTRINFO_CTOR_DTOR
+#include "NVPTXGenInstrInfo.inc"
+
 // Pin the vtable to this file.
 void NVPTXInstrInfo::anchor() {}
 
@@ -256,7 +257,7 @@ unsigned NVPTXInstrInfo::InsertBranch(
          "NVPTX branch conditions have two components!");
 
   // One-way branch.
-  if (FBB == 0) {
+  if (!FBB) {
     if (Cond.empty()) // Unconditional branch
       BuildMI(&MBB, DL, get(NVPTX::GOTO)).addMBB(TBB);
     else // Conditional branch
diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.h b/lib/Target/NVPTX/NVPTXInstrInfo.h
index 600fc5c..88a9e45 100644
--- a/lib/Target/NVPTX/NVPTXInstrInfo.h
+++ b/lib/Target/NVPTX/NVPTXInstrInfo.h
@@ -30,7 +30,7 @@ class NVPTXInstrInfo : public NVPTXGenInstrInfo {
 public:
   explicit NVPTXInstrInfo(NVPTXTargetMachine &TM);
 
-  virtual const NVPTXRegisterInfo &getRegisterInfo() const { return RegInfo; }
+  const NVPTXRegisterInfo &getRegisterInfo() const { return RegInfo; }
 
   /* The following virtual functions are used in register allocation.
    * They are not implemented because the existing interface and the logic
@@ -50,9 +50,9 @@ public:
    *                               const TargetRegisterClass *RC) const;
    */
 
-  virtual void copyPhysReg(
+  void copyPhysReg(
       MachineBasicBlock &MBB, MachineBasicBlock::iterator I, DebugLoc DL,
-      unsigned DestReg, unsigned SrcReg, bool KillSrc) const;
+      unsigned DestReg, unsigned SrcReg, bool KillSrc) const override;
   virtual bool isMoveInstr(const MachineInstr &MI, unsigned &SrcReg,
                            unsigned &DestReg) const;
   bool isLoadInstr(const MachineInstr &MI, unsigned &AddrSpace) const;
@@ -61,13 +61,13 @@ public:
 
   virtual bool CanTailMerge(const MachineInstr *MI) const;
   // Branch analysis.
-  virtual bool AnalyzeBranch(
+  bool AnalyzeBranch(
       MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB,
-      SmallVectorImpl<MachineOperand> &Cond, bool AllowModify) const;
-  virtual unsigned RemoveBranch(MachineBasicBlock &MBB) const;
-  virtual unsigned InsertBranch(
+      SmallVectorImpl<MachineOperand> &Cond, bool AllowModify) const override;
+  unsigned RemoveBranch(MachineBasicBlock &MBB) const override;
+  unsigned InsertBranch(
       MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB,
-      const SmallVectorImpl<MachineOperand> &Cond, DebugLoc DL) const;
+      const SmallVectorImpl<MachineOperand> &Cond, DebugLoc DL) const override;
   unsigned getLdStCodeAddrSpace(const MachineInstr &MI) const {
     return MI.getOperand(2).getImm();
   }
diff --git a/lib/Target/NVPTX/NVPTXIntrinsics.td b/lib/Target/NVPTX/NVPTXIntrinsics.td
index 14049b1..5e228fc 100644
--- a/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -1666,6 +1666,9 @@ def : Pat<(i32 (int_nvvm_ptr_gen_to_local (int_nvvm_ptr_local_to_gen
                 (MoveParam texternalsym:$src)))),
                (nvvm_move_ptr32  texternalsym:$src)>;
 
+def texsurf_handles
+  : NVPTXInst<(outs Int64Regs:$result), (ins imem:$src),
+              "mov.u64 \t$result, $src;", []>;
 
 //-----------------------------------
 // Compiler Error Warn
@@ -1686,6 +1689,1826 @@ def INT_NVVM_COMPILER_ERROR_64 : NVPTXInst<(outs), (ins Int64Regs:$a),
                 [(int_nvvm_compiler_error Int64Regs:$a)]>;
 
 
+//-----------------------------------
+// Texture Intrinsics
+//-----------------------------------
+
+// NOTE: For Fermi support, any new texture/surface/sampler intrinsics must be
+// also defined in NVPTXReplaceImageHandles.cpp
+
+
+// Texture fetch instructions using handles
+def TEX_1D_F32_I32
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$x),
+              "tex.1d.v4.f32.s32\t\\{$r, $g, $b, $a\\}, [$t, $s, \\{$x\\}];",
+              []>;
+def TEX_1D_F32_F32
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x),
+              "tex.1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, [$t, $s, \\{$x\\}];",
+              []>;
+def TEX_1D_F32_F32_LEVEL
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$lod),
+              "tex.level.1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x\\}], $lod;",
+              []>;
+def TEX_1D_F32_F32_GRAD
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x,
+                   Float32Regs:$gradx, Float32Regs:$grady),
+              "tex.grad.1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x\\}], \\{$gradx\\}, \\{$grady\\};",
+              []>;
+def TEX_1D_I32_I32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$x),
+              "tex.1d.v4.s32.s32\t\\{$r, $g, $b, $a\\}, [$t, $s, \\{$x\\}];",
+              []>;
+def TEX_1D_I32_F32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x),
+              "tex.1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, [$t, $s, \\{$x\\}];",
+              []>;
+def TEX_1D_I32_F32_LEVEL
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x,
+                   Float32Regs:$lod),
+              "tex.level.1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x\\}], $lod;",
+              []>;
+def TEX_1D_I32_F32_GRAD
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x,
+                   Float32Regs:$gradx, Float32Regs:$grady),
+              "tex.grad.1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x\\}], \\{$gradx\\}, \\{$grady\\};",
+              []>;
+
+def TEX_1D_ARRAY_F32_I32
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "tex.a1d.v4.f32.s32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$l, $x\\}];",
+              []>;
+def TEX_1D_ARRAY_F32_F32
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x),
+              "tex.a1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$l, $x\\}];",
+              []>;
+def TEX_1D_ARRAY_F32_F32_LEVEL
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
+                   Float32Regs:$lod),
+              "tex.level.a1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$l, $x\\}], $lod;",
+              []>;
+def TEX_1D_ARRAY_F32_F32_GRAD
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
+                   Float32Regs:$gradx, Float32Regs:$grady),
+              "tex.grad.a1d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$l, $x\\}], \\{$gradx\\}, \\{$grady\\};",
+              []>;
+def TEX_1D_ARRAY_I32_I32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "tex.a1d.v4.s32.s32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$l, $x\\}];",
+              []>;
+def TEX_1D_ARRAY_I32_F32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x),
+              "tex.a1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$l, $x\\}];",
+              []>;
+def TEX_1D_ARRAY_I32_F32_LEVEL
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
+                   Float32Regs:$lod),
+              "tex.level.a1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$l, $x\\}], $lod;",
+              []>;
+def TEX_1D_ARRAY_I32_F32_GRAD
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
+                   Float32Regs:$gradx, Float32Regs:$grady),
+              "tex.grad.a1d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$l, $x\\}], \\{$gradx\\}, \\{$grady\\};",
+              []>;
+
+def TEX_2D_F32_I32
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "tex.2d.v4.f32.s32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x, $y\\}];",
+              []>;
+def TEX_2D_F32_F32
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
+              "tex.2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x, $y\\}];",
+              []>;
+def TEX_2D_F32_F32_LEVEL
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
+                   Float32Regs:$lod),
+              "tex.level.2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x, $y\\}], $lod;",
+              []>;
+def TEX_2D_F32_F32_GRAD
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
+                   Float32Regs:$gradx0, Float32Regs:$gradx1,
+                   Float32Regs:$grady0, Float32Regs:$grady1),
+              "tex.grad.2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x, $y\\}], \\{$gradx0, $gradx1\\}, "
+              "\\{$grady0, $grady1\\};",
+              []>;
+def TEX_2D_I32_I32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "tex.2d.v4.s32.s32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x, $y\\}];",
+              []>;
+def TEX_2D_I32_F32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y),
+              "tex.2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x, $y\\}];",
+              []>;
+def TEX_2D_I32_F32_LEVEL
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
+                   Float32Regs:$lod),
+              "tex.level.2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x, $y\\}], $lod;",
+              []>;
+def TEX_2D_I32_F32_GRAD
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
+                   Float32Regs:$gradx0, Float32Regs:$gradx1,
+                   Float32Regs:$grady0, Float32Regs:$grady1),
+              "tex.grad.2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x, $y\\}], \\{$gradx0, $gradx1\\}, "
+              "\\{$grady0, $grady1\\};",
+              []>;
+
+def TEX_2D_ARRAY_F32_I32
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+                   Int32Regs:$y),
+              "tex.a2d.v4.f32.s32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$l, $x, $y, $y\\}];",
+              []>;
+def TEX_2D_ARRAY_F32_F32
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
+                   Float32Regs:$y),
+              "tex.a2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$l, $x, $y, $y\\}];",
+              []>;
+def TEX_2D_ARRAY_F32_F32_LEVEL
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
+                   Float32Regs:$y, Float32Regs:$lod),
+              "tex.level.a2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$l, $x, $y, $y\\}], $lod;",
+              []>;
+def TEX_2D_ARRAY_F32_F32_GRAD
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
+                   Float32Regs:$y, Float32Regs:$gradx0, Float32Regs:$gradx1,
+                   Float32Regs:$grady0, Float32Regs:$grady1),
+              "tex.grad.a2d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$l, $x, $y, $y\\}], \\{$gradx0, $gradx1\\}, "
+              "\\{$grady0, $grady1\\};",
+              []>;
+def TEX_2D_ARRAY_I32_I32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+                   Int32Regs:$y),
+              "tex.a2d.v4.s32.s32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$l, $x, $y, $y\\}];",
+              []>;
+def TEX_2D_ARRAY_I32_F32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
+                   Float32Regs:$y),
+              "tex.a2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$l, $x, $y, $y\\}];",
+              []>;
+def TEX_2D_ARRAY_I32_F32_LEVEL
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
+                   Float32Regs:$y, Float32Regs:$lod),
+              "tex.level.a2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$l, $x, $y, $y\\}], $lod;",
+              []>;
+def TEX_2D_ARRAY_I32_F32_GRAD
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$l, Float32Regs:$x,
+                   Float32Regs:$y,
+                   Float32Regs:$gradx0, Float32Regs:$gradx1,
+                   Float32Regs:$grady0, Float32Regs:$grady1),
+              "tex.grad.a2d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$l, $x, $y, $y\\}], \\{$gradx0, $gradx1\\}, "
+              "\\{$grady0, $grady1\\};",
+              []>;
+
+def TEX_3D_F32_I32
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+                   Int32Regs:$z),
+              "tex.3d.v4.f32.s32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x, $y, $z, $z\\}];",
+              []>;
+def TEX_3D_F32_F32
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
+                   Float32Regs:$z),
+              "tex.3d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x, $y, $z, $z\\}];",
+              []>;
+def TEX_3D_F32_F32_LEVEL
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
+                   Float32Regs:$z, Float32Regs:$lod),
+              "tex.level.3d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x, $y, $z, $z\\}], $lod;",
+              []>;
+def TEX_3D_F32_F32_GRAD
+  : NVPTXInst<(outs Float32Regs:$r, Float32Regs:$g,
+                    Float32Regs:$b, Float32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
+                   Float32Regs:$z,
+                   Float32Regs:$gradx0, Float32Regs:$gradx1,
+                   Float32Regs:$gradx2, Float32Regs:$grady0,
+                   Float32Regs:$grady1, Float32Regs:$grady2),
+              "tex.grad.3d.v4.f32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x, $y, $z, $z\\}], "
+              "\\{$gradx0, $gradx1, $gradx2, $gradx2\\}, "
+              "\\{$grady0, $grady1, $grady2, $grady2\\};",
+              []>;
+def TEX_3D_I32_I32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+                   Int32Regs:$z),
+              "tex.3d.v4.s32.s32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x, $y, $z, $z\\}];",
+              []>;
+def TEX_3D_I32_F32
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
+                   Float32Regs:$z),
+              "tex.3d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x, $y, $z, $z\\}];",
+              []>;
+def TEX_3D_I32_F32_LEVEL
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
+                   Float32Regs:$z, Float32Regs:$lod),
+              "tex.level.3d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x, $y, $z, $z\\}], $lod;",
+              []>;
+def TEX_3D_I32_F32_GRAD
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g,
+                    Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$t, Int64Regs:$s, Float32Regs:$x, Float32Regs:$y,
+                   Float32Regs:$z,
+                   Float32Regs:$gradx0, Float32Regs:$gradx1,
+                   Float32Regs:$gradx2, Float32Regs:$grady0,
+                   Float32Regs:$grady1, Float32Regs:$grady2),
+              "tex.grad.3d.v4.s32.f32\t\\{$r, $g, $b, $a\\}, "
+              "[$t, $s, \\{$x, $y, $z, $z\\}], "
+              "\\{$gradx0, $gradx1, $gradx2, $gradx2\\}, "
+              "\\{$grady0, $grady1, $grady2, $grady2\\};",
+              []>;
+
+
+// Surface load instructions
+def SULD_1D_I8_TRAP
+  : NVPTXInst<(outs Int16Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x),
+              "suld.b.1d.b8.trap \\{$r\\}, [$s, \\{$x\\}];",
+              []>;
+def SULD_1D_I16_TRAP
+  : NVPTXInst<(outs Int16Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x),
+              "suld.b.1d.b16.trap \\{$r\\}, [$s, \\{$x\\}];",
+              []>;
+def SULD_1D_I32_TRAP
+  : NVPTXInst<(outs Int32Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x),
+              "suld.b.1d.b32.trap \\{$r\\}, [$s, \\{$x\\}];",
+              []>;
+def SULD_1D_V2I8_TRAP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x),
+              "suld.b.1d.v2.b8.trap \\{$r, $g\\}, [$s, \\{$x\\}];",
+              []>;
+def SULD_1D_V2I16_TRAP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x),
+              "suld.b.1d.v2.b16.trap \\{$r, $g\\}, [$s, \\{$x\\}];",
+              []>;
+def SULD_1D_V2I32_TRAP
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x),
+              "suld.b.1d.v2.b32.trap \\{$r, $g\\}, [$s, \\{$x\\}];",
+              []>;
+def SULD_1D_V4I8_TRAP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$x),
+              "suld.b.1d.v4.b8.trap \\{$r, $g, $b, $a\\}, [$s, \\{$x\\}];",
+              []>;
+def SULD_1D_V4I16_TRAP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$x),
+              "suld.b.1d.v4.b16.trap \\{$r, $g, $b, $a\\}, [$s, \\{$x\\}];",
+              []>;
+def SULD_1D_V4I32_TRAP
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$x),
+              "suld.b.1d.v4.b32.trap \\{$r, $g, $b, $a\\}, [$s, \\{$x\\}];",
+              []>;
+
+def SULD_1D_ARRAY_I8_TRAP
+  : NVPTXInst<(outs Int16Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "suld.b.a1d.b8.trap \\{$r\\}, [$s, \\{$l, $x\\}];",
+              []>;
+def SULD_1D_ARRAY_I16_TRAP
+  : NVPTXInst<(outs Int16Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "suld.b.a1d.b16.trap \\{$r\\}, [$s, \\{$l, $x\\}];",
+              []>;
+def SULD_1D_ARRAY_I32_TRAP
+  : NVPTXInst<(outs Int32Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "suld.b.a1d.b32.trap \\{$r\\}, [$s, \\{$l, $x\\}];",
+              []>;
+def SULD_1D_ARRAY_V2I8_TRAP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "suld.b.a1d.v2.b8.trap \\{$r, $g\\}, [$s, \\{$l, $x\\}];",
+              []>;
+def SULD_1D_ARRAY_V2I16_TRAP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "suld.b.a1d.v2.b16.trap \\{$r, $g\\}, [$s, \\{$l, $x\\}];",
+              []>;
+def SULD_1D_ARRAY_V2I32_TRAP
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "suld.b.a1d.v2.b32.trap \\{$r, $g\\}, [$s, \\{$l, $x\\}];",
+              []>;
+def SULD_1D_ARRAY_V4I8_TRAP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "suld.b.a1d.v4.b8.trap \\{$r, $g, $b, $a\\}, "
+              "[$s, \\{$l, $x\\}];",
+              []>;
+def SULD_1D_ARRAY_V4I16_TRAP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "suld.b.a1d.v4.b16.trap \\{$r, $g, $b, $a\\}, "
+              "[$s, \\{$l, $x\\}];",
+              []>;
+def SULD_1D_ARRAY_V4I32_TRAP
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x),
+              "suld.b.a1d.v4.b32.trap \\{$r, $g, $b, $a\\}, "
+              "[$s, \\{$l, $x\\}];",
+              []>;
+
+def SULD_2D_I8_TRAP
+  : NVPTXInst<(outs Int16Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.b8.trap \\{$r\\}, [$s, \\{$x, $y\\}];",
+              []>;
+def SULD_2D_I16_TRAP
+  : NVPTXInst<(outs Int16Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.b16.trap \\{$r\\}, [$s, \\{$x, $y\\}];",
+              []>;
+def SULD_2D_I32_TRAP
+  : NVPTXInst<(outs Int32Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.b32.trap \\{$r\\}, [$s, \\{$x, $y\\}];",
+              []>;
+def SULD_2D_V2I8_TRAP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.v2.b8.trap \\{$r, $g\\}, [$s, \\{$x, $y\\}];",
+              []>;
+def SULD_2D_V2I16_TRAP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.v2.b16.trap \\{$r, $g\\}, [$s, \\{$x, $y\\}];",
+              []>;
+def SULD_2D_V2I32_TRAP
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.v2.b32.trap \\{$r, $g\\}, [$s, \\{$x, $y\\}];",
+              []>;
+def SULD_2D_V4I8_TRAP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.v4.b8.trap \\{$r, $g, $b, $a\\}, [$s, \\{$x, $y\\}];",
+              []>;
+def SULD_2D_V4I16_TRAP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.v4.b16.trap \\{$r, $g, $b, $a\\}, [$s, \\{$x, $y\\}];",
+              []>;
+def SULD_2D_V4I32_TRAP
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.2d.v4.b32.trap \\{$r, $g, $b, $a\\}, [$s, \\{$x, $y\\}];",
+              []>;
+
+def SULD_2D_ARRAY_I8_TRAP
+  : NVPTXInst<(outs Int16Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.b8.trap \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];",
+              []>;
+def SULD_2D_ARRAY_I16_TRAP
+  : NVPTXInst<(outs Int16Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.b16.trap \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];",
+              []>;
+def SULD_2D_ARRAY_I32_TRAP
+  : NVPTXInst<(outs Int32Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.b32.trap \\{$r\\}, [$s, \\{$l, $x, $y, $y\\}];",
+              []>;
+def SULD_2D_ARRAY_V2I8_TRAP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.v2.b8.trap \\{$r, $g\\}, "
+              "[$s, \\{$l, $x, $y, $y\\}];",
+              []>;
+def SULD_2D_ARRAY_V2I16_TRAP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.v2.b16.trap \\{$r, $g\\}, "
+              "[$s, \\{$l, $x, $y, $y\\}];",
+              []>;
+def SULD_2D_ARRAY_V2I32_TRAP
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.v2.b32.trap \\{$r, $g\\}, "
+              "[$s, \\{$l, $x, $y, $y\\}];",
+              []>;
+def SULD_2D_ARRAY_V4I8_TRAP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.v4.b8.trap \\{$r, $g, $b, $a\\}, "
+              "[$s, \\{$l, $x, $y, $y\\}];",
+              []>;
+def SULD_2D_ARRAY_V4I16_TRAP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.v4.b16.trap \\{$r, $g, $b, $a\\}, "
+              "[$s, \\{$l, $x, $y, $y\\}];",
+              []>;
+def SULD_2D_ARRAY_V4I32_TRAP
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y),
+              "suld.b.a2d.v4.b32.trap \\{$r, $g, $b, $a\\}, "
+              "[$s, \\{$l, $x, $y, $y\\}];",
+              []>;
+
+def SULD_3D_I8_TRAP
+  : NVPTXInst<(outs Int16Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.b8.trap \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];",
+              []>;
+def SULD_3D_I16_TRAP
+  : NVPTXInst<(outs Int16Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.b16.trap \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];",
+              []>;
+def SULD_3D_I32_TRAP
+  : NVPTXInst<(outs Int32Regs:$r),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.b32.trap \\{$r\\}, [$s, \\{$x, $y, $z, $z\\}];",
+              []>;
+def SULD_3D_V2I8_TRAP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.v2.b8.trap \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];",
+              []>;
+def SULD_3D_V2I16_TRAP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.v2.b16.trap \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];",
+              []>;
+def SULD_3D_V2I32_TRAP
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.v2.b32.trap \\{$r, $g\\}, [$s, \\{$x, $y, $z, $z\\}];",
+              []>;
+def SULD_3D_V4I8_TRAP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.v4.b8.trap \\{$r, $g, $b, $a\\}, "
+              "[$s, \\{$x, $y, $z, $z\\}];",
+              []>;
+def SULD_3D_V4I16_TRAP
+  : NVPTXInst<(outs Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.v4.b16.trap \\{$r, $g, $b, $a\\}, "
+              "[$s, \\{$x, $y, $z, $z\\}];",
+              []>;
+def SULD_3D_V4I32_TRAP
+  : NVPTXInst<(outs Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z),
+              "suld.b.3d.v4.b32.trap \\{$r, $g, $b, $a\\}, "
+              "[$s, \\{$x, $y, $z, $z\\}];",
+              []>;
+
+
+//-----------------------------------
+// Texture Query Intrinsics
+//-----------------------------------
+def TXQ_CHANNEL_ORDER
+  : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
+              "txq.channel_order.b32 \t$d, [$a];",
+              []>;
+def TXQ_CHANNEL_DATA_TYPE
+  : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
+              "txq.channel_data_type.b32 \t$d, [$a];",
+              []>;
+def TXQ_WIDTH
+  : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
+              "txq.width.b32 \t$d, [$a];",
+              []>;
+def TXQ_HEIGHT
+  : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
+              "txq.height.b32 \t$d, [$a];",
+              []>;
+def TXQ_DEPTH
+  : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
+              "txq.depth.b32 \t$d, [$a];",
+              []>;
+def TXQ_ARRAY_SIZE
+  : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
+              "txq.array_size.b32 \t$d, [$a];",
+              []>;
+def TXQ_NUM_SAMPLES
+  : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
+              "txq.num_samples.b32 \t$d, [$a];",
+              []>;
+def TXQ_NUM_MIPMAP_LEVELS
+  : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
+              "txq.num_mipmap_levels.b32 \t$d, [$a];",
+              []>;
+
+def : Pat<(int_nvvm_txq_channel_order Int64Regs:$a),
+          (TXQ_CHANNEL_ORDER Int64Regs:$a)>;
+def : Pat<(int_nvvm_txq_channel_data_type Int64Regs:$a),
+          (TXQ_CHANNEL_DATA_TYPE Int64Regs:$a)>;
+def : Pat<(int_nvvm_txq_width Int64Regs:$a),
+          (TXQ_WIDTH Int64Regs:$a)>;
+def : Pat<(int_nvvm_txq_height Int64Regs:$a),
+          (TXQ_HEIGHT Int64Regs:$a)>;
+def : Pat<(int_nvvm_txq_depth Int64Regs:$a),
+          (TXQ_DEPTH Int64Regs:$a)>;
+def : Pat<(int_nvvm_txq_array_size Int64Regs:$a),
+          (TXQ_ARRAY_SIZE Int64Regs:$a)>;
+def : Pat<(int_nvvm_txq_num_samples Int64Regs:$a),
+          (TXQ_NUM_SAMPLES Int64Regs:$a)>;
+def : Pat<(int_nvvm_txq_num_mipmap_levels Int64Regs:$a),
+          (TXQ_NUM_MIPMAP_LEVELS Int64Regs:$a)>;
+
+
+//-----------------------------------
+// Surface Query Intrinsics
+//-----------------------------------
+def SUQ_CHANNEL_ORDER
+  : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
+              "suq.channel_order.b32 \t$d, [$a];",
+              []>;
+def SUQ_CHANNEL_DATA_TYPE
+  : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
+              "suq.channel_data_type.b32 \t$d, [$a];",
+              []>;
+def SUQ_WIDTH
+  : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
+              "suq.width.b32 \t$d, [$a];",
+              []>;
+def SUQ_HEIGHT
+  : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
+              "suq.height.b32 \t$d, [$a];",
+              []>;
+def SUQ_DEPTH
+  : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
+              "suq.depth.b32 \t$d, [$a];",
+              []>;
+def SUQ_ARRAY_SIZE
+  : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a),
+              "suq.array_size.b32 \t$d, [$a];",
+              []>;
+
+def : Pat<(int_nvvm_suq_channel_order Int64Regs:$a),
+          (SUQ_CHANNEL_ORDER Int64Regs:$a)>;
+def : Pat<(int_nvvm_suq_channel_data_type Int64Regs:$a),
+          (SUQ_CHANNEL_DATA_TYPE Int64Regs:$a)>;
+def : Pat<(int_nvvm_suq_width Int64Regs:$a),
+          (SUQ_WIDTH Int64Regs:$a)>;
+def : Pat<(int_nvvm_suq_height Int64Regs:$a),
+          (SUQ_HEIGHT Int64Regs:$a)>;
+def : Pat<(int_nvvm_suq_depth Int64Regs:$a),
+          (SUQ_DEPTH Int64Regs:$a)>;
+def : Pat<(int_nvvm_suq_array_size Int64Regs:$a),
+          (SUQ_ARRAY_SIZE Int64Regs:$a)>;
+
+
+//===- Handle Query -------------------------------------------------------===//
+
+// TODO: These intrinsics are not yet finalized, pending PTX ISA design work
+def ISTYPEP_SAMPLER
+  : NVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a),
+              "istypep.samplerref \t$d, $a;",
+              [(set Int1Regs:$d, (int_nvvm_istypep_sampler Int64Regs:$a))]>;
+def ISTYPEP_SURFACE
+  : NVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a),
+              "istypep.surfref \t$d, $a;",
+              [(set Int1Regs:$d, (int_nvvm_istypep_surface Int64Regs:$a))]>;
+def ISTYPEP_TEXTURE
+  : NVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a),
+              "istypep.texref \t$d, $a;",
+              [(set Int1Regs:$d, (int_nvvm_istypep_texture Int64Regs:$a))]>;
+
+//===- Surface Stores -----------------------------------------------------===//
+
+// Unformatted
+
+def SUST_B_1D_B8_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
+              "sust.b.1d.b8.trap \t[$s, \\{$x\\}], \\{$r\\};",
+              []>;
+def SUST_B_1D_B16_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
+              "sust.b.1d.b16.trap \t[$s, \\{$x\\}], \\{$r\\};",
+              []>;
+def SUST_B_1D_B32_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$r),
+              "sust.b.1d.b32.trap \t[$s, \\{$x\\}], \\{$r\\};",
+              []>;
+def SUST_B_1D_V2B8_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+              "sust.b.1d.v2.b8.trap \t[$s, \\{$x\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_1D_V2B16_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+              "sust.b.1d.v2.b16.trap \t[$s, \\{$x\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_1D_V2B32_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g),
+              "sust.b.1d.v2.b32.trap \t[$s, \\{$x\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_1D_V4B8_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g,
+                   Int16Regs:$b, Int16Regs:$a),
+              "sust.b.1d.v4.b8.trap \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_B_1D_V4B16_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g,
+                   Int16Regs:$b, Int16Regs:$a),
+              "sust.b.1d.v4.b16.trap \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_B_1D_V4B32_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g,
+                   Int32Regs:$b, Int32Regs:$a),
+              "sust.b.1d.v4.b32.trap \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};",
+              []>;
+
+
+def SUST_B_1D_ARRAY_B8_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r),
+              "sust.b.a1d.b8.trap \t[$s, \\{$idx, $x\\}], \\{$r\\};",
+              []>;
+def SUST_B_1D_ARRAY_B16_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r),
+              "sust.b.a1d.b16.trap \t[$s, \\{$idx, $x\\}], \\{$r\\};",
+              []>;
+def SUST_B_1D_ARRAY_B32_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$r),
+              "sust.b.a1d.b32.trap \t[$s, \\{$idx, $x\\}], \\{$r\\};",
+              []>;
+def SUST_B_1D_ARRAY_V2B8_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r,
+                   Int16Regs:$g),
+              "sust.b.a1d.v2.b8.trap \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_1D_ARRAY_V2B16_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r,
+                   Int16Regs:$g),
+              "sust.b.a1d.v2.b16.trap \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_1D_ARRAY_V2B32_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$r,
+                   Int32Regs:$g),
+              "sust.b.a1d.v2.b32.trap \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_1D_ARRAY_V4B8_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r,
+                   Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              "sust.b.a1d.v4.b8.trap \t[$s, \\{$idx, $x\\}], "
+              "\\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_B_1D_ARRAY_V4B16_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r,
+                   Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+             "sust.b.a1d.v4.b16.trap \t[$s, \\{$idx, $x\\}], "
+             "\\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_B_1D_ARRAY_V4B32_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$r,
+                   Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+             "sust.b.a1d.v4.b32.trap \t[$s, \\{$idx, $x\\}], "
+             "\\{$r, $g, $b, $a\\};",
+              []>;
+
+
+def SUST_B_2D_B8_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+              "sust.b.2d.b8.trap \t[$s, \\{$x, $y\\}], \\{$r\\};",
+              []>;
+def SUST_B_2D_B16_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+              "sust.b.2d.b16.trap \t[$s, \\{$x, $y\\}], \\{$r\\};",
+              []>;
+def SUST_B_2D_B32_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r),
+              "sust.b.2d.b32.trap \t[$s, \\{$x, $y\\}], \\{$r\\};",
+              []>;
+def SUST_B_2D_V2B8_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r,
+                   Int16Regs:$g),
+              "sust.b.2d.v2.b8.trap \t[$s, \\{$x, $y\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_2D_V2B16_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r,
+                   Int16Regs:$g),
+              "sust.b.2d.v2.b16.trap \t[$s, \\{$x, $y\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_2D_V2B32_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r,
+                   Int32Regs:$g),
+              "sust.b.2d.v2.b32.trap \t[$s, \\{$x, $y\\}], \\{$r, $g\\};",
+              []>;
+def SUST_B_2D_V4B8_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r,
+                   Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              "sust.b.2d.v4.b8.trap \t[$s, \\{$x, $y\\}], "
+              "\\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_B_2D_V4B16_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r,
+                   Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+             "sust.b.2d.v4.b16.trap \t[$s, \\{$x, $y\\}], "
+             "\\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_B_2D_V4B32_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r,
+                   Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+             "sust.b.2d.v4.b32.trap \t[$s, \\{$x, $y\\}], "
+             "\\{$r, $g, $b, $a\\};",
+              []>;
+
+
+def SUST_B_2D_ARRAY_B8_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int16Regs:$r),
+              "sust.b.a2d.b8.trap \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};",
+              []>;
+def SUST_B_2D_ARRAY_B16_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int16Regs:$r),
+              "sust.b.a2d.b16.trap \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};",
+              []>;
+def SUST_B_2D_ARRAY_B32_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int32Regs:$r),
+              "sust.b.a2d.b32.trap \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};",
+              []>;
+def SUST_B_2D_ARRAY_V2B8_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int16Regs:$r, Int16Regs:$g),
+              "sust.b.a2d.v2.b8.trap \t[$s, \\{$idx, $x, $y, $y\\}], "
+              "\\{$r, $g\\};",
+              []>;
+def SUST_B_2D_ARRAY_V2B16_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int16Regs:$r, Int16Regs:$g),
+             "sust.b.a2d.v2.b16.trap \t[$s, \\{$idx, $x, $y, $y\\}], "
+             "\\{$r, $g\\};",
+              []>;
+def SUST_B_2D_ARRAY_V2B32_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int32Regs:$r, Int32Regs:$g),
+             "sust.b.a2d.v2.b32.trap \t[$s, \\{$idx, $x, $y, $y\\}], "
+             "\\{$r, $g\\};",
+              []>;
+def SUST_B_2D_ARRAY_V4B8_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+      "sust.b.a2d.v4.b8.trap \t[$s, \\{$idx, $x, $y, $y\\}], "
+      "\\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_B_2D_ARRAY_V4B16_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+     "sust.b.a2d.v4.b16.trap \t[$s, \\{$idx, $x, $y, $y\\}], "
+     "\\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_B_2D_ARRAY_V4B32_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+     "sust.b.a2d.v4.b32.trap \t[$s, \\{$idx, $x, $y, $y\\}], "
+     "\\{$r, $g, $b, $a\\};",
+              []>;
+
+
+def SUST_B_3D_B8_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int16Regs:$r),
+              "sust.b.3d.b8.trap \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};",
+              []>;
+def SUST_B_3D_B16_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int16Regs:$r),
+              "sust.b.3d.b16.trap \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};",
+              []>;
+def SUST_B_3D_B32_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int32Regs:$r),
+              "sust.b.3d.b32.trap \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};",
+              []>;
+def SUST_B_3D_V2B8_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int16Regs:$r, Int16Regs:$g),
+              "sust.b.3d.v2.b8.trap \t[$s, \\{$x, $y, $z, $z\\}], "
+              "\\{$r, $g\\};",
+              []>;
+def SUST_B_3D_V2B16_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int16Regs:$r, Int16Regs:$g),
+              "sust.b.3d.v2.b16.trap \t[$s, \\{$x, $y, $z, $z\\}], "
+              "\\{$r, $g\\};",
+              []>;
+def SUST_B_3D_V2B32_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int32Regs:$r, Int32Regs:$g),
+              "sust.b.3d.v2.b32.trap \t[$s, \\{$x, $y, $z, $z\\}], "
+              "\\{$r, $g\\};",
+              []>;
+def SUST_B_3D_V4B8_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+         "sust.b.3d.v4.b8.trap \t[$s, \\{$x, $y, $z, $z\\}], "
+         "\\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_B_3D_V4B16_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+        "sust.b.3d.v4.b16.trap \t[$s, \\{$x, $y, $z, $z\\}], "
+        "\\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_B_3D_V4B32_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+        "sust.b.3d.v4.b32.trap \t[$s, \\{$x, $y, $z, $z\\}], "
+        "\\{$r, $g, $b, $a\\};",
+              []>;
+
+// Formatted
+
+def SUST_P_1D_B8_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
+              "sust.p.1d.b8.trap \t[$s, \\{$x\\}], \\{$r\\};",
+              []>;
+def SUST_P_1D_B16_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
+              "sust.p.1d.b16.trap \t[$s, \\{$x\\}], \\{$r\\};",
+              []>;
+def SUST_P_1D_B32_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$r),
+              "sust.p.1d.b32.trap \t[$s, \\{$x\\}], \\{$r\\};",
+              []>;
+def SUST_P_1D_V2B8_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+              "sust.p.1d.v2.b8.trap \t[$s, \\{$x\\}], \\{$r, $g\\};",
+              []>;
+def SUST_P_1D_V2B16_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+              "sust.p.1d.v2.b16.trap \t[$s, \\{$x\\}], \\{$r, $g\\};",
+              []>;
+def SUST_P_1D_V2B32_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g),
+              "sust.p.1d.v2.b32.trap \t[$s, \\{$x\\}], \\{$r, $g\\};",
+              []>;
+def SUST_P_1D_V4B8_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g,
+                   Int16Regs:$b, Int16Regs:$a),
+              "sust.p.1d.v4.b8.trap \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_P_1D_V4B16_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g,
+                   Int16Regs:$b, Int16Regs:$a),
+              "sust.p.1d.v4.b16.trap \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_P_1D_V4B32_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g,
+                   Int32Regs:$b, Int32Regs:$a),
+              "sust.p.1d.v4.b32.trap \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};",
+              []>;
+
+
+def SUST_P_1D_ARRAY_B8_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r),
+              "sust.p.a1d.b8.trap \t[$s, \\{$idx, $x\\}], \\{$r\\};",
+              []>;
+def SUST_P_1D_ARRAY_B16_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r),
+              "sust.p.a1d.b16.trap \t[$s, \\{$idx, $x\\}], \\{$r\\};",
+              []>;
+def SUST_P_1D_ARRAY_B32_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$r),
+              "sust.p.a1d.b32.trap \t[$s, \\{$idx, $x\\}], \\{$r\\};",
+              []>;
+def SUST_P_1D_ARRAY_V2B8_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r,
+                   Int16Regs:$g),
+              "sust.p.a1d.v2.b8.trap \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};",
+              []>;
+def SUST_P_1D_ARRAY_V2B16_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r,
+                   Int16Regs:$g),
+              "sust.p.a1d.v2.b16.trap \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};",
+              []>;
+def SUST_P_1D_ARRAY_V2B32_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$r,
+                   Int32Regs:$g),
+              "sust.p.a1d.v2.b32.trap \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};",
+              []>;
+def SUST_P_1D_ARRAY_V4B8_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r,
+                   Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              "sust.p.a1d.v4.b8.trap \t[$s, \\{$idx, $x\\}], "
+              "\\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_P_1D_ARRAY_V4B16_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int16Regs:$r,
+                   Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+             "sust.p.a1d.v4.b16.trap \t[$s, \\{$idx, $x\\}], "
+             "\\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_P_1D_ARRAY_V4B32_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$r,
+                   Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+             "sust.p.a1d.v4.b32.trap \t[$s, \\{$idx, $x\\}], "
+             "\\{$r, $g, $b, $a\\};",
+              []>;
+
+
+def SUST_P_2D_B8_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+              "sust.p.2d.b8.trap \t[$s, \\{$x, $y\\}], \\{$r\\};",
+              []>;
+def SUST_P_2D_B16_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+              "sust.p.2d.b16.trap \t[$s, \\{$x, $y\\}], \\{$r\\};",
+              []>;
+def SUST_P_2D_B32_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r),
+              "sust.p.2d.b32.trap \t[$s, \\{$x, $y\\}], \\{$r\\};",
+              []>;
+def SUST_P_2D_V2B8_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r,
+                   Int16Regs:$g),
+              "sust.p.2d.v2.b8.trap \t[$s, \\{$x, $y\\}], \\{$r, $g\\};",
+              []>;
+def SUST_P_2D_V2B16_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r,
+                   Int16Regs:$g),
+              "sust.p.2d.v2.b16.trap \t[$s, \\{$x, $y\\}], \\{$r, $g\\};",
+              []>;
+def SUST_P_2D_V2B32_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r,
+                   Int32Regs:$g),
+              "sust.p.2d.v2.b32.trap \t[$s, \\{$x, $y\\}], \\{$r, $g\\};",
+              []>;
+def SUST_P_2D_V4B8_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r,
+                   Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+              "sust.p.2d.v4.b8.trap \t[$s, \\{$x, $y\\}], "
+              "\\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_P_2D_V4B16_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r,
+                   Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+             "sust.p.2d.v4.b16.trap \t[$s, \\{$x, $y\\}], "
+             "\\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_P_2D_V4B32_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r,
+                   Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+             "sust.p.2d.v4.b32.trap \t[$s, \\{$x, $y\\}], "
+             "\\{$r, $g, $b, $a\\};",
+              []>;
+
+
+def SUST_P_2D_ARRAY_B8_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int16Regs:$r),
+              "sust.p.a2d.b8.trap \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};",
+              []>;
+def SUST_P_2D_ARRAY_B16_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int16Regs:$r),
+              "sust.p.a2d.b16.trap \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};",
+              []>;
+def SUST_P_2D_ARRAY_B32_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int32Regs:$r),
+              "sust.p.a2d.b32.trap \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};",
+              []>;
+def SUST_P_2D_ARRAY_V2B8_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int16Regs:$r, Int16Regs:$g),
+              "sust.p.a2d.v2.b8.trap \t[$s, \\{$idx, $x, $y, $y\\}], "
+              "\\{$r, $g\\};",
+              []>;
+def SUST_P_2D_ARRAY_V2B16_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int16Regs:$r, Int16Regs:$g),
+             "sust.p.a2d.v2.b16.trap \t[$s, \\{$idx, $x, $y, $y\\}], "
+             "\\{$r, $g\\};",
+              []>;
+def SUST_P_2D_ARRAY_V2B32_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int32Regs:$r, Int32Regs:$g),
+             "sust.p.a2d.v2.b32.trap \t[$s, \\{$idx, $x, $y, $y\\}], "
+             "\\{$r, $g\\};",
+              []>;
+def SUST_P_2D_ARRAY_V4B8_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+      "sust.p.a2d.v4.b8.trap \t[$s, \\{$idx, $x, $y, $y\\}], "
+      "\\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_P_2D_ARRAY_V4B16_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+     "sust.p.a2d.v4.b16.trap \t[$s, \\{$idx, $x, $y, $y\\}], "
+     "\\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_P_2D_ARRAY_V4B32_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y,
+                   Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+     "sust.p.a2d.v4.b32.trap \t[$s, \\{$idx, $x, $y, $y\\}], "
+     "\\{$r, $g, $b, $a\\};",
+              []>;
+
+
+def SUST_P_3D_B8_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int16Regs:$r),
+              "sust.p.3d.b8.trap \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};",
+              []>;
+def SUST_P_3D_B16_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int16Regs:$r),
+              "sust.p.3d.b16.trap \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};",
+              []>;
+def SUST_P_3D_B32_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int32Regs:$r),
+              "sust.p.3d.b32.trap \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};",
+              []>;
+def SUST_P_3D_V2B8_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int16Regs:$r, Int16Regs:$g),
+              "sust.p.3d.v2.b8.trap \t[$s, \\{$x, $y, $z, $z\\}], "
+              "\\{$r, $g\\};",
+              []>;
+def SUST_P_3D_V2B16_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int16Regs:$r, Int16Regs:$g),
+              "sust.p.3d.v2.b16.trap \t[$s, \\{$x, $y, $z, $z\\}], "
+              "\\{$r, $g\\};",
+              []>;
+def SUST_P_3D_V2B32_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int32Regs:$r, Int32Regs:$g),
+              "sust.p.3d.v2.b32.trap \t[$s, \\{$x, $y, $z, $z\\}], "
+              "\\{$r, $g\\};",
+              []>;
+def SUST_P_3D_V4B8_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+         "sust.p.3d.v4.b8.trap \t[$s, \\{$x, $y, $z, $z\\}], "
+         "\\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_P_3D_V4B16_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+        "sust.p.3d.v4.b16.trap \t[$s, \\{$x, $y, $z, $z\\}], "
+        "\\{$r, $g, $b, $a\\};",
+              []>;
+def SUST_P_3D_V4B32_TRAP
+  : NVPTXInst<(outs),
+              (ins Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+                   Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+        "sust.p.3d.v4.b32.trap \t[$s, \\{$x, $y, $z, $z\\}], "
+        "\\{$r, $g, $b, $a\\};",
+              []>;
+
+
+// Surface store instruction patterns
+// I'm not sure why we can't just include these in the instruction definitions,
+// but TableGen complains of type errors :(
+
+def : Pat<(int_nvvm_sust_b_1d_i8_trap
+           Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
+          (SUST_B_1D_B8_TRAP Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_i16_trap
+           Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
+          (SUST_B_1D_B16_TRAP Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_i32_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$r),
+          (SUST_B_1D_B32_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_v2i8_trap
+           Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+          (SUST_B_1D_V2B8_TRAP Int64Regs:$s, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_v2i16_trap
+           Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+          (SUST_B_1D_V2B16_TRAP Int64Regs:$s, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_v2i32_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g),
+          (SUST_B_1D_V2B32_TRAP Int64Regs:$s, Int32Regs:$x,
+           Int32Regs:$r, Int32Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_v4i8_trap
+           Int64Regs:$s, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_B_1D_V4B8_TRAP Int64Regs:$s, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_1d_v4i16_trap
+           Int64Regs:$s, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_B_1D_V4B16_TRAP Int64Regs:$s, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_1d_v4i32_trap
+           Int64Regs:$s, Int32Regs:$x,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+          (SUST_B_1D_V4B32_TRAP Int64Regs:$s, Int32Regs:$x,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
+
+
+
+def : Pat<(int_nvvm_sust_b_1d_array_i8_trap
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r),
+          (SUST_B_1D_ARRAY_B8_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_i16_trap
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r),
+          (SUST_B_1D_ARRAY_B16_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_i32_trap
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r),
+          (SUST_B_1D_ARRAY_B32_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int32Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_v2i8_trap
+          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+          (SUST_B_1D_ARRAY_V2B8_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_v2i16_trap
+          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+          (SUST_B_1D_ARRAY_V2B16_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_v2i32_trap
+          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g),
+          (SUST_B_1D_ARRAY_V2B32_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int32Regs:$r, Int32Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_v4i8_trap
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_B_1D_ARRAY_V4B8_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_v4i16_trap
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_B_1D_ARRAY_V4B16_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_1d_array_v4i32_trap
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+          (SUST_B_1D_ARRAY_V4B32_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
+
+
+
+def : Pat<(int_nvvm_sust_b_2d_i8_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+          (SUST_B_2D_B8_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_i16_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+          (SUST_B_2D_B16_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_i32_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r),
+          (SUST_B_2D_B32_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int32Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_v2i8_trap
+          Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g),
+          (SUST_B_2D_V2B8_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_v2i16_trap
+          Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g),
+          (SUST_B_2D_V2B16_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_v2i32_trap
+          Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g),
+          (SUST_B_2D_V2B32_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int32Regs:$r, Int32Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_v4i8_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_B_2D_V4B8_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_2d_v4i16_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_B_2D_V4B16_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_2d_v4i32_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+          (SUST_B_2D_V4B32_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
+
+
+
+def : Pat<(int_nvvm_sust_b_2d_array_i8_trap
+          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+          (SUST_B_2D_ARRAY_B8_TRAP Int64Regs:$s,
+           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_i16_trap
+          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+          (SUST_B_2D_ARRAY_B16_TRAP Int64Regs:$s,
+           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_i32_trap
+          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r),
+          (SUST_B_2D_ARRAY_B32_TRAP Int64Regs:$s,
+           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int32Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_v2i8_trap
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g),
+          (SUST_B_2D_ARRAY_V2B8_TRAP Int64Regs:$s, Int32Regs:$l,
+           Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_v2i16_trap
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g),
+          (SUST_B_2D_ARRAY_V2B16_TRAP Int64Regs:$s, Int32Regs:$l,
+           Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_v2i32_trap
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r,
+           Int32Regs:$g),
+          (SUST_B_2D_ARRAY_V2B32_TRAP Int64Regs:$s, Int32Regs:$l,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_v4i8_trap
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_B_2D_ARRAY_V4B8_TRAP Int64Regs:$s,
+           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_v4i16_trap
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_B_2D_ARRAY_V4B16_TRAP Int64Regs:$s,
+           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_2d_array_v4i32_trap
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+          (SUST_B_2D_ARRAY_V4B32_TRAP Int64Regs:$s, Int32Regs:$l,
+           Int32Regs:$x, Int32Regs:$y,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
+
+
+
+def : Pat<(int_nvvm_sust_b_3d_i8_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r),
+          (SUST_B_3D_B8_TRAP Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_3d_i16_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r),
+          (SUST_B_3D_B16_TRAP Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_3d_i32_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int32Regs:$r),
+          (SUST_B_3D_B32_TRAP Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int32Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_b_3d_v2i8_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r, Int16Regs:$g),
+          (SUST_B_3D_V2B8_TRAP Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_3d_v2i16_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r, Int16Regs:$g),
+          (SUST_B_3D_V2B16_TRAP Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_3d_v2i32_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int32Regs:$r, Int32Regs:$g),
+          (SUST_B_3D_V2B32_TRAP Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int32Regs:$r, Int32Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_b_3d_v4i8_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_B_3D_V4B8_TRAP Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_3d_v4i16_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_B_3D_V4B16_TRAP Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_b_3d_v4i32_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+          (SUST_B_3D_V4B32_TRAP Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
+
+
+
+
+def : Pat<(int_nvvm_sust_p_1d_i8_trap
+           Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
+          (SUST_P_1D_B8_TRAP Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_p_1d_i16_trap
+           Int64Regs:$s, Int32Regs:$x, Int16Regs:$r),
+          (SUST_P_1D_B16_TRAP Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_p_1d_i32_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$r),
+          (SUST_P_1D_B32_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_p_1d_v2i8_trap
+           Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+          (SUST_P_1D_V2B8_TRAP Int64Regs:$s, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_p_1d_v2i16_trap
+           Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+          (SUST_P_1D_V2B16_TRAP Int64Regs:$s, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_p_1d_v2i32_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g),
+          (SUST_P_1D_V2B32_TRAP Int64Regs:$s, Int32Regs:$x,
+           Int32Regs:$r, Int32Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_p_1d_v4i8_trap
+           Int64Regs:$s, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_P_1D_V4B8_TRAP Int64Regs:$s, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_p_1d_v4i16_trap
+           Int64Regs:$s, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_P_1D_V4B16_TRAP Int64Regs:$s, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_p_1d_v4i32_trap
+           Int64Regs:$s, Int32Regs:$x,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+          (SUST_P_1D_V4B32_TRAP Int64Regs:$s, Int32Regs:$x,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
+
+
+
+def : Pat<(int_nvvm_sust_p_1d_array_i8_trap
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r),
+          (SUST_P_1D_ARRAY_B8_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_p_1d_array_i16_trap
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r),
+          (SUST_P_1D_ARRAY_B16_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_p_1d_array_i32_trap
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r),
+          (SUST_P_1D_ARRAY_B32_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int32Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_p_1d_array_v2i8_trap
+          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+          (SUST_P_1D_ARRAY_V2B8_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_p_1d_array_v2i16_trap
+          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g),
+          (SUST_P_1D_ARRAY_V2B16_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_p_1d_array_v2i32_trap
+          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g),
+          (SUST_P_1D_ARRAY_V2B32_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int32Regs:$r, Int32Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_p_1d_array_v4i8_trap
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_P_1D_ARRAY_V4B8_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_p_1d_array_v4i16_trap
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_P_1D_ARRAY_V4B16_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_p_1d_array_v4i32_trap
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+          (SUST_P_1D_ARRAY_V4B32_TRAP Int64Regs:$s, Int32Regs:$l, Int32Regs:$x,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
+
+
+
+def : Pat<(int_nvvm_sust_p_2d_i8_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+          (SUST_P_2D_B8_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_p_2d_i16_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+          (SUST_P_2D_B16_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_p_2d_i32_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r),
+          (SUST_P_2D_B32_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int32Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_p_2d_v2i8_trap
+          Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g),
+          (SUST_P_2D_V2B8_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_p_2d_v2i16_trap
+          Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g),
+          (SUST_P_2D_V2B16_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_p_2d_v2i32_trap
+          Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g),
+          (SUST_P_2D_V2B32_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int32Regs:$r, Int32Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_p_2d_v4i8_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_P_2D_V4B8_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_p_2d_v4i16_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_P_2D_V4B16_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_p_2d_v4i32_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+          (SUST_P_2D_V4B32_TRAP Int64Regs:$s, Int32Regs:$x, Int32Regs:$y,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
+
+
+
+def : Pat<(int_nvvm_sust_p_2d_array_i8_trap
+          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+          (SUST_P_2D_ARRAY_B8_TRAP Int64Regs:$s,
+           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_p_2d_array_i16_trap
+          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r),
+          (SUST_P_2D_ARRAY_B16_TRAP Int64Regs:$s,
+           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_p_2d_array_i32_trap
+          Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r),
+          (SUST_P_2D_ARRAY_B32_TRAP Int64Regs:$s,
+           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int32Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_p_2d_array_v2i8_trap
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g),
+          (SUST_P_2D_ARRAY_V2B8_TRAP Int64Regs:$s, Int32Regs:$l,
+           Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_p_2d_array_v2i16_trap
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g),
+          (SUST_P_2D_ARRAY_V2B16_TRAP Int64Regs:$s, Int32Regs:$l,
+           Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_p_2d_array_v2i32_trap
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r,
+           Int32Regs:$g),
+          (SUST_P_2D_ARRAY_V2B32_TRAP Int64Regs:$s, Int32Regs:$l,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_p_2d_array_v4i8_trap
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_P_2D_ARRAY_V4B8_TRAP Int64Regs:$s,
+           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_p_2d_array_v4i16_trap
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_P_2D_ARRAY_V4B16_TRAP Int64Regs:$s,
+           Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_p_2d_array_v4i32_trap
+           Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+          (SUST_P_2D_ARRAY_V4B32_TRAP Int64Regs:$s, Int32Regs:$l,
+           Int32Regs:$x, Int32Regs:$y,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
+
+
+
+def : Pat<(int_nvvm_sust_p_3d_i8_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r),
+          (SUST_P_3D_B8_TRAP Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_p_3d_i16_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r),
+          (SUST_P_3D_B16_TRAP Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_p_3d_i32_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int32Regs:$r),
+          (SUST_P_3D_B32_TRAP Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int32Regs:$r)>;
+
+def : Pat<(int_nvvm_sust_p_3d_v2i8_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r, Int16Regs:$g),
+          (SUST_P_3D_V2B8_TRAP Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_p_3d_v2i16_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r, Int16Regs:$g),
+          (SUST_P_3D_V2B16_TRAP Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r, Int16Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_p_3d_v2i32_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int32Regs:$r, Int32Regs:$g),
+          (SUST_P_3D_V2B32_TRAP Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int32Regs:$r, Int32Regs:$g)>;
+
+def : Pat<(int_nvvm_sust_p_3d_v4i8_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_P_3D_V4B8_TRAP Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_p_3d_v4i16_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a),
+          (SUST_P_3D_V4B16_TRAP Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>;
+
+def : Pat<(int_nvvm_sust_p_3d_v4i32_trap
+           Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a),
+          (SUST_P_3D_V4B32_TRAP Int64Regs:$s,
+           Int32Regs:$x, Int32Regs:$y, Int32Regs:$z,
+           Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>;
+
+
 
 //===-- Old PTX Back-end Intrinsics ---------------------------------------===//
 
diff --git a/lib/Target/NVPTX/NVPTXLowerAggrCopies.h b/lib/Target/NVPTX/NVPTXLowerAggrCopies.h
index c9aa87d..5ec1fc9 100644
--- a/lib/Target/NVPTX/NVPTXLowerAggrCopies.h
+++ b/lib/Target/NVPTX/NVPTXLowerAggrCopies.h
@@ -27,17 +27,17 @@ struct NVPTXLowerAggrCopies : public FunctionPass {
 
   NVPTXLowerAggrCopies() : FunctionPass(ID) {}
 
-  void getAnalysisUsage(AnalysisUsage &AU) const {
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<DataLayoutPass>();
     AU.addPreserved("stack-protector");
     AU.addPreserved<MachineFunctionAnalysis>();
   }
 
-  virtual bool runOnFunction(Function &F);
+  bool runOnFunction(Function &F) override;
 
   static const unsigned MaxAggrCopySize = 128;
 
-  virtual const char *getPassName() const {
+  const char *getPassName() const override {
     return "Lower aggregate copies/intrinsics into loops";
   }
 };
diff --git a/lib/Target/NVPTX/NVPTXMCExpr.cpp b/lib/Target/NVPTX/NVPTXMCExpr.cpp
index ca24764..137248b 100644
--- a/lib/Target/NVPTX/NVPTXMCExpr.cpp
+++ b/lib/Target/NVPTX/NVPTXMCExpr.cpp
@@ -7,13 +7,14 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "nvptx-mcexpr"
 #include "NVPTXMCExpr.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/MC/MCAssembler.h"
 #include "llvm/MC/MCContext.h"
 using namespace llvm;
 
+#define DEBUG_TYPE "nvptx-mcexpr"
+
 const NVPTXFloatMCExpr*
 NVPTXFloatMCExpr::Create(VariantKind Kind, APFloat Flt, MCContext &Ctx) {
   return new (Ctx) NVPTXFloatMCExpr(Kind, Flt);
diff --git a/lib/Target/NVPTX/NVPTXMCExpr.h b/lib/Target/NVPTX/NVPTXMCExpr.h
index 0efb231..0ee018c 100644
--- a/lib/Target/NVPTX/NVPTXMCExpr.h
+++ b/lib/Target/NVPTX/NVPTXMCExpr.h
@@ -61,18 +61,18 @@ public:
 
 /// @}
 
-  void PrintImpl(raw_ostream &OS) const;
+  void PrintImpl(raw_ostream &OS) const override;
   bool EvaluateAsRelocatableImpl(MCValue &Res,
-                                 const MCAsmLayout *Layout) const {
+                                 const MCAsmLayout *Layout) const override {
     return false;
   }
-  void AddValueSymbols(MCAssembler *) const {};
-  const MCSection *FindAssociatedSection() const {
-    return NULL;
+  void AddValueSymbols(MCAssembler *) const override {};
+  const MCSection *FindAssociatedSection() const override {
+    return nullptr;
   }
 
   // There are no TLS NVPTXMCExprs at the moment.
-  void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const {}
+  void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const override {}
 
   static bool classof(const MCExpr *E) {
     return E->getKind() == MCExpr::Target;
diff --git a/lib/Target/NVPTX/NVPTXMachineFunctionInfo.h b/lib/Target/NVPTX/NVPTXMachineFunctionInfo.h
new file mode 100644
index 0000000..67fb390
--- /dev/null
+++ b/lib/Target/NVPTX/NVPTXMachineFunctionInfo.h
@@ -0,0 +1,46 @@
+//===-- NVPTXMachineFunctionInfo.h - NVPTX-specific Function Info  --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source 
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This class is attached to a MachineFunction instance and tracks target-
+// dependent information
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineFunction.h"
+
+namespace llvm {
+class NVPTXMachineFunctionInfo : public MachineFunctionInfo {
+private:
+  /// Stores a mapping from index to symbol name for removing image handles
+  /// on Fermi.
+  SmallVector<std::string, 8> ImageHandleList;
+
+public:
+  NVPTXMachineFunctionInfo(MachineFunction &MF) {}
+
+  /// Returns the index for the symbol \p Symbol. If the symbol was previously,
+  /// added, the same index is returned. Otherwise, the symbol is added and the
+  /// new index is returned.
+  unsigned getImageHandleSymbolIndex(const char *Symbol) {
+    // Is the symbol already present?
+    for (unsigned i = 0, e = ImageHandleList.size(); i != e; ++i)
+      if (ImageHandleList[i] == std::string(Symbol))
+        return i;
+    // Nope, insert it
+    ImageHandleList.push_back(Symbol);
+    return ImageHandleList.size()-1;
+  }
+
+  /// Returns the symbol name at the given index.
+  const char *getImageHandleSymbol(unsigned Idx) const {
+    assert(ImageHandleList.size() > Idx && "Bad index");
+    return ImageHandleList[Idx].c_str();
+  }
+};
+}
diff --git a/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp b/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp
index d5b042a..348ab0c 100644
--- a/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp
+++ b/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp
@@ -25,13 +25,15 @@
 
 using namespace llvm;
 
+#define DEBUG_TYPE "nvptx-prolog-epilog"
+
 namespace {
 class NVPTXPrologEpilogPass : public MachineFunctionPass {
 public:
   static char ID;
   NVPTXPrologEpilogPass() : MachineFunctionPass(ID) {}
 
-  virtual bool runOnMachineFunction(MachineFunction &MF);
+  bool runOnMachineFunction(MachineFunction &MF) override;
 
 private:
   void calculateFrameObjectOffsets(MachineFunction &Fn);
@@ -58,7 +60,7 @@ bool NVPTXPrologEpilogPass::runOnMachineFunction(MachineFunction &MF) {
       for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
         if (!MI->getOperand(i).isFI())
           continue;
-        TRI.eliminateFrameIndex(MI, 0, i, NULL);
+        TRI.eliminateFrameIndex(MI, 0, i, nullptr);
         Modified = true;
       }
     }
diff --git a/lib/Target/NVPTX/NVPTXRegisterInfo.cpp b/lib/Target/NVPTX/NVPTXRegisterInfo.cpp
index 4d3a1d9..62f288b 100644
--- a/lib/Target/NVPTX/NVPTXRegisterInfo.cpp
+++ b/lib/Target/NVPTX/NVPTXRegisterInfo.cpp
@@ -11,8 +11,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "nvptx-reg-info"
-
 #include "NVPTXRegisterInfo.h"
 #include "NVPTX.h"
 #include "NVPTXSubtarget.h"
@@ -25,6 +23,8 @@
 
 using namespace llvm;
 
+#define DEBUG_TYPE "nvptx-reg-info"
+
 namespace llvm {
 std::string getNVPTXRegClassName(TargetRegisterClass const *RC) {
   if (RC == &NVPTX::Float32RegsRegClass) {
@@ -78,19 +78,12 @@ NVPTXRegisterInfo::NVPTXRegisterInfo(const NVPTXSubtarget &st)
 #include "NVPTXGenRegisterInfo.inc"
 
 /// NVPTX Callee Saved Registers
-const uint16_t *
+const MCPhysReg *
 NVPTXRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
-  static const uint16_t CalleeSavedRegs[] = { 0 };
+  static const MCPhysReg CalleeSavedRegs[] = { 0 };
   return CalleeSavedRegs;
 }
 
-// NVPTX Callee Saved Reg Classes
-const TargetRegisterClass *const *
-NVPTXRegisterInfo::getCalleeSavedRegClasses(const MachineFunction *MF) const {
-  static const TargetRegisterClass *const CalleeSavedRegClasses[] = { 0 };
-  return CalleeSavedRegClasses;
-}
-
 BitVector NVPTXRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
   BitVector Reserved(getNumRegs());
   return Reserved;
@@ -113,12 +106,6 @@ void NVPTXRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
   MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset);
 }
 
-int NVPTXRegisterInfo::getDwarfRegNum(unsigned RegNum, bool isEH) const {
-  return 0;
-}
-
 unsigned NVPTXRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
   return NVPTX::VRFrame;
 }
-
-unsigned NVPTXRegisterInfo::getRARegister() const { return 0; }
diff --git a/lib/Target/NVPTX/NVPTXRegisterInfo.h b/lib/Target/NVPTX/NVPTXRegisterInfo.h
index 0a20f29..a7594be 100644
--- a/lib/Target/NVPTX/NVPTXRegisterInfo.h
+++ b/lib/Target/NVPTX/NVPTXRegisterInfo.h
@@ -16,11 +16,10 @@
 
 #include "ManagedStringPool.h"
 #include "llvm/Target/TargetRegisterInfo.h"
+#include <sstream>
 
 #define GET_REGINFO_HEADER
 #include "NVPTXGenRegisterInfo.inc"
-#include "llvm/Target/TargetRegisterInfo.h"
-#include <sstream>
 
 namespace llvm {
 
@@ -42,22 +41,16 @@ public:
   //------------------------------------------------------
 
   // NVPTX callee saved registers
-  virtual const uint16_t *
-  getCalleeSavedRegs(const MachineFunction *MF = 0) const;
-
-  // NVPTX callee saved register classes
-  virtual const TargetRegisterClass *const *
-  getCalleeSavedRegClasses(const MachineFunction *MF) const;
+  const MCPhysReg *
+  getCalleeSavedRegs(const MachineFunction *MF = nullptr) const override;
 
-  virtual BitVector getReservedRegs(const MachineFunction &MF) const;
+  BitVector getReservedRegs(const MachineFunction &MF) const override;
 
-  virtual void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj,
-                                   unsigned FIOperandNum,
-                                   RegScavenger *RS = NULL) const;
+  void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj,
+                           unsigned FIOperandNum,
+                           RegScavenger *RS = nullptr) const override;
 
-  virtual int getDwarfRegNum(unsigned RegNum, bool isEH) const;
-  virtual unsigned getFrameRegister(const MachineFunction &MF) const;
-  virtual unsigned getRARegister() const;
+  unsigned getFrameRegister(const MachineFunction &MF) const override;
 
   ManagedStringPool *getStrPool() const {
     return const_cast<ManagedStringPool *>(&ManagedStrPool);
diff --git a/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp b/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp
new file mode 100644
index 0000000..afd53a6
--- /dev/null
+++ b/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp
@@ -0,0 +1,357 @@
+//===-- NVPTXReplaceImageHandles.cpp - Replace image handles for Fermi ----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source 
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// On Fermi, image handles are not supported. To work around this, we traverse
+// the machine code and replace image handles with concrete symbols. For this
+// to work reliably, inlining of all function call must be performed.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NVPTX.h"
+#include "NVPTXMachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/ADT/DenseSet.h"
+
+using namespace llvm;
+
+namespace {
+class NVPTXReplaceImageHandles : public MachineFunctionPass {
+private:
+  static char ID;
+  DenseSet<MachineInstr *> InstrsToRemove;
+
+public:
+  NVPTXReplaceImageHandles();
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+private:
+  bool processInstr(MachineInstr &MI);
+  void replaceImageHandle(MachineOperand &Op, MachineFunction &MF);
+};
+}
+
+char NVPTXReplaceImageHandles::ID = 0;
+
+NVPTXReplaceImageHandles::NVPTXReplaceImageHandles()
+  : MachineFunctionPass(ID) {}
+
+bool NVPTXReplaceImageHandles::runOnMachineFunction(MachineFunction &MF) {
+  bool Changed = false;
+  InstrsToRemove.clear();
+
+  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE;
+       ++BI) {
+    for (MachineBasicBlock::iterator I = (*BI).begin(), E = (*BI).end();
+         I != E; ++I) {
+      MachineInstr &MI = *I;
+      Changed |= processInstr(MI);
+    }
+  }
+
+  // Now clean up any handle-access instructions
+  // This is needed in debug mode when code cleanup passes are not executed,
+  // but we need the handle access to be eliminated because they are not
+  // valid instructions when image handles are disabled.
+  for (DenseSet<MachineInstr *>::iterator I = InstrsToRemove.begin(),
+       E = InstrsToRemove.end(); I != E; ++I) {
+    (*I)->eraseFromParent();
+  }
+
+  return Changed;
+}
+
+bool NVPTXReplaceImageHandles::processInstr(MachineInstr &MI) {
+  MachineFunction &MF = *MI.getParent()->getParent();
+  // Check if we have a surface/texture instruction
+  switch (MI.getOpcode()) {
+  default: return false;
+  case NVPTX::TEX_1D_F32_I32:
+  case NVPTX::TEX_1D_F32_F32:
+  case NVPTX::TEX_1D_F32_F32_LEVEL:
+  case NVPTX::TEX_1D_F32_F32_GRAD:
+  case NVPTX::TEX_1D_I32_I32:
+  case NVPTX::TEX_1D_I32_F32:
+  case NVPTX::TEX_1D_I32_F32_LEVEL:
+  case NVPTX::TEX_1D_I32_F32_GRAD:
+  case NVPTX::TEX_1D_ARRAY_F32_I32:
+  case NVPTX::TEX_1D_ARRAY_F32_F32:
+  case NVPTX::TEX_1D_ARRAY_F32_F32_LEVEL:
+  case NVPTX::TEX_1D_ARRAY_F32_F32_GRAD:
+  case NVPTX::TEX_1D_ARRAY_I32_I32:
+  case NVPTX::TEX_1D_ARRAY_I32_F32:
+  case NVPTX::TEX_1D_ARRAY_I32_F32_LEVEL:
+  case NVPTX::TEX_1D_ARRAY_I32_F32_GRAD:
+  case NVPTX::TEX_2D_F32_I32:
+  case NVPTX::TEX_2D_F32_F32:
+  case NVPTX::TEX_2D_F32_F32_LEVEL:
+  case NVPTX::TEX_2D_F32_F32_GRAD:
+  case NVPTX::TEX_2D_I32_I32:
+  case NVPTX::TEX_2D_I32_F32:
+  case NVPTX::TEX_2D_I32_F32_LEVEL:
+  case NVPTX::TEX_2D_I32_F32_GRAD:
+  case NVPTX::TEX_2D_ARRAY_F32_I32:
+  case NVPTX::TEX_2D_ARRAY_F32_F32:
+  case NVPTX::TEX_2D_ARRAY_F32_F32_LEVEL:
+  case NVPTX::TEX_2D_ARRAY_F32_F32_GRAD:
+  case NVPTX::TEX_2D_ARRAY_I32_I32:
+  case NVPTX::TEX_2D_ARRAY_I32_F32:
+  case NVPTX::TEX_2D_ARRAY_I32_F32_LEVEL:
+  case NVPTX::TEX_2D_ARRAY_I32_F32_GRAD:
+  case NVPTX::TEX_3D_F32_I32:
+  case NVPTX::TEX_3D_F32_F32:
+  case NVPTX::TEX_3D_F32_F32_LEVEL:
+  case NVPTX::TEX_3D_F32_F32_GRAD:
+  case NVPTX::TEX_3D_I32_I32:
+  case NVPTX::TEX_3D_I32_F32:
+  case NVPTX::TEX_3D_I32_F32_LEVEL:
+  case NVPTX::TEX_3D_I32_F32_GRAD: {
+    // This is a texture fetch, so operand 4 is a texref and operand 5 is
+    // a samplerref
+    MachineOperand &TexHandle = MI.getOperand(4);
+    MachineOperand &SampHandle = MI.getOperand(5);
+
+    replaceImageHandle(TexHandle, MF);
+    replaceImageHandle(SampHandle, MF);
+
+    return true;
+  }
+  case NVPTX::SULD_1D_I8_TRAP:
+  case NVPTX::SULD_1D_I16_TRAP:
+  case NVPTX::SULD_1D_I32_TRAP:
+  case NVPTX::SULD_1D_ARRAY_I8_TRAP:
+  case NVPTX::SULD_1D_ARRAY_I16_TRAP:
+  case NVPTX::SULD_1D_ARRAY_I32_TRAP:
+  case NVPTX::SULD_2D_I8_TRAP:
+  case NVPTX::SULD_2D_I16_TRAP:
+  case NVPTX::SULD_2D_I32_TRAP:
+  case NVPTX::SULD_2D_ARRAY_I8_TRAP:
+  case NVPTX::SULD_2D_ARRAY_I16_TRAP:
+  case NVPTX::SULD_2D_ARRAY_I32_TRAP:
+  case NVPTX::SULD_3D_I8_TRAP:
+  case NVPTX::SULD_3D_I16_TRAP:
+  case NVPTX::SULD_3D_I32_TRAP: {
+    // This is a V1 surface load, so operand 1 is a surfref
+    MachineOperand &SurfHandle = MI.getOperand(1);
+
+    replaceImageHandle(SurfHandle, MF);
+
+    return true;
+  }
+  case NVPTX::SULD_1D_V2I8_TRAP:
+  case NVPTX::SULD_1D_V2I16_TRAP:
+  case NVPTX::SULD_1D_V2I32_TRAP:
+  case NVPTX::SULD_1D_ARRAY_V2I8_TRAP:
+  case NVPTX::SULD_1D_ARRAY_V2I16_TRAP:
+  case NVPTX::SULD_1D_ARRAY_V2I32_TRAP:
+  case NVPTX::SULD_2D_V2I8_TRAP:
+  case NVPTX::SULD_2D_V2I16_TRAP:
+  case NVPTX::SULD_2D_V2I32_TRAP:
+  case NVPTX::SULD_2D_ARRAY_V2I8_TRAP:
+  case NVPTX::SULD_2D_ARRAY_V2I16_TRAP:
+  case NVPTX::SULD_2D_ARRAY_V2I32_TRAP:
+  case NVPTX::SULD_3D_V2I8_TRAP:
+  case NVPTX::SULD_3D_V2I16_TRAP:
+  case NVPTX::SULD_3D_V2I32_TRAP: {
+    // This is a V2 surface load, so operand 2 is a surfref
+    MachineOperand &SurfHandle = MI.getOperand(2);
+
+    replaceImageHandle(SurfHandle, MF);
+
+    return true;
+  }
+  case NVPTX::SULD_1D_V4I8_TRAP:
+  case NVPTX::SULD_1D_V4I16_TRAP:
+  case NVPTX::SULD_1D_V4I32_TRAP:
+  case NVPTX::SULD_1D_ARRAY_V4I8_TRAP:
+  case NVPTX::SULD_1D_ARRAY_V4I16_TRAP:
+  case NVPTX::SULD_1D_ARRAY_V4I32_TRAP:
+  case NVPTX::SULD_2D_V4I8_TRAP:
+  case NVPTX::SULD_2D_V4I16_TRAP:
+  case NVPTX::SULD_2D_V4I32_TRAP:
+  case NVPTX::SULD_2D_ARRAY_V4I8_TRAP:
+  case NVPTX::SULD_2D_ARRAY_V4I16_TRAP:
+  case NVPTX::SULD_2D_ARRAY_V4I32_TRAP:
+  case NVPTX::SULD_3D_V4I8_TRAP:
+  case NVPTX::SULD_3D_V4I16_TRAP:
+  case NVPTX::SULD_3D_V4I32_TRAP: {
+    // This is a V4 surface load, so operand 4 is a surfref
+    MachineOperand &SurfHandle = MI.getOperand(4);
+
+    replaceImageHandle(SurfHandle, MF);
+
+    return true;
+  }
+  case NVPTX::SUST_B_1D_B8_TRAP:
+  case NVPTX::SUST_B_1D_B16_TRAP:
+  case NVPTX::SUST_B_1D_B32_TRAP:
+  case NVPTX::SUST_B_1D_V2B8_TRAP:
+  case NVPTX::SUST_B_1D_V2B16_TRAP:
+  case NVPTX::SUST_B_1D_V2B32_TRAP:
+  case NVPTX::SUST_B_1D_V4B8_TRAP:
+  case NVPTX::SUST_B_1D_V4B16_TRAP:
+  case NVPTX::SUST_B_1D_V4B32_TRAP:
+  case NVPTX::SUST_B_1D_ARRAY_B8_TRAP:
+  case NVPTX::SUST_B_1D_ARRAY_B16_TRAP:
+  case NVPTX::SUST_B_1D_ARRAY_B32_TRAP:
+  case NVPTX::SUST_B_1D_ARRAY_V2B8_TRAP:
+  case NVPTX::SUST_B_1D_ARRAY_V2B16_TRAP:
+  case NVPTX::SUST_B_1D_ARRAY_V2B32_TRAP:
+  case NVPTX::SUST_B_1D_ARRAY_V4B8_TRAP:
+  case NVPTX::SUST_B_1D_ARRAY_V4B16_TRAP:
+  case NVPTX::SUST_B_1D_ARRAY_V4B32_TRAP:
+  case NVPTX::SUST_B_2D_B8_TRAP:
+  case NVPTX::SUST_B_2D_B16_TRAP:
+  case NVPTX::SUST_B_2D_B32_TRAP:
+  case NVPTX::SUST_B_2D_V2B8_TRAP:
+  case NVPTX::SUST_B_2D_V2B16_TRAP:
+  case NVPTX::SUST_B_2D_V2B32_TRAP:
+  case NVPTX::SUST_B_2D_V4B8_TRAP:
+  case NVPTX::SUST_B_2D_V4B16_TRAP:
+  case NVPTX::SUST_B_2D_V4B32_TRAP:
+  case NVPTX::SUST_B_2D_ARRAY_B8_TRAP:
+  case NVPTX::SUST_B_2D_ARRAY_B16_TRAP:
+  case NVPTX::SUST_B_2D_ARRAY_B32_TRAP:
+  case NVPTX::SUST_B_2D_ARRAY_V2B8_TRAP:
+  case NVPTX::SUST_B_2D_ARRAY_V2B16_TRAP:
+  case NVPTX::SUST_B_2D_ARRAY_V2B32_TRAP:
+  case NVPTX::SUST_B_2D_ARRAY_V4B8_TRAP:
+  case NVPTX::SUST_B_2D_ARRAY_V4B16_TRAP:
+  case NVPTX::SUST_B_2D_ARRAY_V4B32_TRAP:
+  case NVPTX::SUST_B_3D_B8_TRAP:
+  case NVPTX::SUST_B_3D_B16_TRAP:
+  case NVPTX::SUST_B_3D_B32_TRAP:
+  case NVPTX::SUST_B_3D_V2B8_TRAP:
+  case NVPTX::SUST_B_3D_V2B16_TRAP:
+  case NVPTX::SUST_B_3D_V2B32_TRAP:
+  case NVPTX::SUST_B_3D_V4B8_TRAP:
+  case NVPTX::SUST_B_3D_V4B16_TRAP:
+  case NVPTX::SUST_B_3D_V4B32_TRAP:
+  case NVPTX::SUST_P_1D_B8_TRAP:
+  case NVPTX::SUST_P_1D_B16_TRAP:
+  case NVPTX::SUST_P_1D_B32_TRAP:
+  case NVPTX::SUST_P_1D_V2B8_TRAP:
+  case NVPTX::SUST_P_1D_V2B16_TRAP:
+  case NVPTX::SUST_P_1D_V2B32_TRAP:
+  case NVPTX::SUST_P_1D_V4B8_TRAP:
+  case NVPTX::SUST_P_1D_V4B16_TRAP:
+  case NVPTX::SUST_P_1D_V4B32_TRAP:
+  case NVPTX::SUST_P_1D_ARRAY_B8_TRAP:
+  case NVPTX::SUST_P_1D_ARRAY_B16_TRAP:
+  case NVPTX::SUST_P_1D_ARRAY_B32_TRAP:
+  case NVPTX::SUST_P_1D_ARRAY_V2B8_TRAP:
+  case NVPTX::SUST_P_1D_ARRAY_V2B16_TRAP:
+  case NVPTX::SUST_P_1D_ARRAY_V2B32_TRAP:
+  case NVPTX::SUST_P_1D_ARRAY_V4B8_TRAP:
+  case NVPTX::SUST_P_1D_ARRAY_V4B16_TRAP:
+  case NVPTX::SUST_P_1D_ARRAY_V4B32_TRAP:
+  case NVPTX::SUST_P_2D_B8_TRAP:
+  case NVPTX::SUST_P_2D_B16_TRAP:
+  case NVPTX::SUST_P_2D_B32_TRAP:
+  case NVPTX::SUST_P_2D_V2B8_TRAP:
+  case NVPTX::SUST_P_2D_V2B16_TRAP:
+  case NVPTX::SUST_P_2D_V2B32_TRAP:
+  case NVPTX::SUST_P_2D_V4B8_TRAP:
+  case NVPTX::SUST_P_2D_V4B16_TRAP:
+  case NVPTX::SUST_P_2D_V4B32_TRAP:
+  case NVPTX::SUST_P_2D_ARRAY_B8_TRAP:
+  case NVPTX::SUST_P_2D_ARRAY_B16_TRAP:
+  case NVPTX::SUST_P_2D_ARRAY_B32_TRAP:
+  case NVPTX::SUST_P_2D_ARRAY_V2B8_TRAP:
+  case NVPTX::SUST_P_2D_ARRAY_V2B16_TRAP:
+  case NVPTX::SUST_P_2D_ARRAY_V2B32_TRAP:
+  case NVPTX::SUST_P_2D_ARRAY_V4B8_TRAP:
+  case NVPTX::SUST_P_2D_ARRAY_V4B16_TRAP:
+  case NVPTX::SUST_P_2D_ARRAY_V4B32_TRAP:
+  case NVPTX::SUST_P_3D_B8_TRAP:
+  case NVPTX::SUST_P_3D_B16_TRAP:
+  case NVPTX::SUST_P_3D_B32_TRAP:
+  case NVPTX::SUST_P_3D_V2B8_TRAP:
+  case NVPTX::SUST_P_3D_V2B16_TRAP:
+  case NVPTX::SUST_P_3D_V2B32_TRAP:
+  case NVPTX::SUST_P_3D_V4B8_TRAP:
+  case NVPTX::SUST_P_3D_V4B16_TRAP:
+  case NVPTX::SUST_P_3D_V4B32_TRAP: {
+    // This is a surface store, so operand 0 is a surfref
+    MachineOperand &SurfHandle = MI.getOperand(0);
+
+    replaceImageHandle(SurfHandle, MF);
+
+    return true;
+  }
+  case NVPTX::TXQ_CHANNEL_ORDER:
+  case NVPTX::TXQ_CHANNEL_DATA_TYPE:
+  case NVPTX::TXQ_WIDTH:
+  case NVPTX::TXQ_HEIGHT:
+  case NVPTX::TXQ_DEPTH:
+  case NVPTX::TXQ_ARRAY_SIZE:
+  case NVPTX::TXQ_NUM_SAMPLES:
+  case NVPTX::TXQ_NUM_MIPMAP_LEVELS:
+  case NVPTX::SUQ_CHANNEL_ORDER:
+  case NVPTX::SUQ_CHANNEL_DATA_TYPE:
+  case NVPTX::SUQ_WIDTH:
+  case NVPTX::SUQ_HEIGHT:
+  case NVPTX::SUQ_DEPTH:
+  case NVPTX::SUQ_ARRAY_SIZE: {
+    // This is a query, so operand 1 is a surfref/texref
+    MachineOperand &Handle = MI.getOperand(1);
+
+    replaceImageHandle(Handle, MF);
+
+    return true; 
+  }
+  }
+}
+
+void NVPTXReplaceImageHandles::
+replaceImageHandle(MachineOperand &Op, MachineFunction &MF) {
+  const MachineRegisterInfo &MRI = MF.getRegInfo();
+  NVPTXMachineFunctionInfo *MFI = MF.getInfo<NVPTXMachineFunctionInfo>();
+  // Which instruction defines the handle?
+  MachineInstr *MI = MRI.getVRegDef(Op.getReg());
+  assert(MI && "No def for image handle vreg?");
+  MachineInstr &TexHandleDef = *MI;
+
+  switch (TexHandleDef.getOpcode()) {
+  case NVPTX::LD_i64_avar: {
+    // The handle is a parameter value being loaded, replace with the
+    // parameter symbol
+    assert(TexHandleDef.getOperand(6).isSymbol() && "Load is not a symbol!");
+    StringRef Sym = TexHandleDef.getOperand(6).getSymbolName();
+    std::string ParamBaseName = MF.getName();
+    ParamBaseName += "_param_";
+    assert(Sym.startswith(ParamBaseName) && "Invalid symbol reference");
+    unsigned Param = atoi(Sym.data()+ParamBaseName.size());
+    std::string NewSym;
+    raw_string_ostream NewSymStr(NewSym);
+    NewSymStr << MF.getFunction()->getName() << "_param_" << Param;
+    Op.ChangeToImmediate(
+      MFI->getImageHandleSymbolIndex(NewSymStr.str().c_str()));
+    InstrsToRemove.insert(&TexHandleDef);
+    break;
+  }
+  case NVPTX::texsurf_handles: {
+    // The handle is a global variable, replace with the global variable name
+    assert(TexHandleDef.getOperand(1).isGlobal() && "Load is not a global!");
+    const GlobalValue *GV = TexHandleDef.getOperand(1).getGlobal();
+    assert(GV->hasName() && "Global sampler must be named!");
+    Op.ChangeToImmediate(MFI->getImageHandleSymbolIndex(GV->getName().data()));
+    InstrsToRemove.insert(&TexHandleDef);
+    break;
+  }
+  default:
+    llvm_unreachable("Unknown instruction operating on handle");
+  }
+}
+
+MachineFunctionPass *llvm::createNVPTXReplaceImageHandlesPass() {
+  return new NVPTXReplaceImageHandles();
+}
diff --git a/lib/Target/NVPTX/NVPTXSection.h b/lib/Target/NVPTX/NVPTXSection.h
index f8a692e..aa0436b 100644
--- a/lib/Target/NVPTX/NVPTXSection.h
+++ b/lib/Target/NVPTX/NVPTXSection.h
@@ -31,16 +31,16 @@ public:
 
   /// Override this as NVPTX has its own way of printing switching
   /// to a section.
-  virtual void PrintSwitchToSection(const MCAsmInfo &MAI,
-                                    raw_ostream &OS,
-                                    const MCExpr *Subsection) const {}
+  void PrintSwitchToSection(const MCAsmInfo &MAI,
+                            raw_ostream &OS,
+                            const MCExpr *Subsection) const override {}
 
   /// Base address of PTX sections is zero.
-  virtual bool isBaseAddressKnownZero() const { return true; }
-  virtual bool UseCodeAlign() const { return false; }
-  virtual bool isVirtualSection() const { return false; }
-  virtual std::string getLabelBeginName() const { return ""; }
-  virtual std::string getLabelEndName() const { return ""; }
+  bool isBaseAddressKnownZero() const override { return true; }
+  bool UseCodeAlign() const override { return false; }
+  bool isVirtualSection() const override { return false; }
+  std::string getLabelBeginName() const override { return ""; }
+  std::string getLabelEndName() const override { return ""; }
 };
 
 } // end namespace llvm
diff --git a/lib/Target/NVPTX/NVPTXSubtarget.cpp b/lib/Target/NVPTX/NVPTXSubtarget.cpp
index 9771a17..8c7df52 100644
--- a/lib/Target/NVPTX/NVPTXSubtarget.cpp
+++ b/lib/Target/NVPTX/NVPTXSubtarget.cpp
@@ -12,14 +12,16 @@
 //===----------------------------------------------------------------------===//
 
 #include "NVPTXSubtarget.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "nvptx-subtarget"
+
 #define GET_SUBTARGETINFO_ENUM
 #define GET_SUBTARGETINFO_TARGET_DESC
 #define GET_SUBTARGETINFO_CTOR
 #include "NVPTXGenSubtargetInfo.inc"
 
-using namespace llvm;
-
-
 // Pin the vtable to this file.
 void NVPTXSubtarget::anchor() {}
 
diff --git a/lib/Target/NVPTX/NVPTXSubtarget.h b/lib/Target/NVPTX/NVPTXSubtarget.h
index f99bebd..581e5ed 100644
--- a/lib/Target/NVPTX/NVPTXSubtarget.h
+++ b/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -16,12 +16,11 @@
 
 #include "NVPTX.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
+#include <string>
 
 #define GET_SUBTARGETINFO_HEADER
 #include "NVPTXGenSubtargetInfo.inc"
 
-#include <string>
-
 namespace llvm {
 
 class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
@@ -65,6 +64,10 @@ public:
   inline bool hasROT32() const { return hasHWROT32() || hasSWROT32(); }
   inline bool hasROT64() const { return SmVersion >= 20; }
 
+  bool hasImageHandles() const {
+    // Currently disabled
+    return false;
+  }
   bool is64Bit() const { return Is64Bit; }
 
   unsigned int getSmVersion() const { return SmVersion; }
diff --git a/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/lib/Target/NVPTX/NVPTXTargetMachine.cpp
index 7d7d793..26a4f84 100644
--- a/lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ b/lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -16,7 +16,6 @@
 #include "NVPTX.h"
 #include "NVPTXAllocaHoisting.h"
 #include "NVPTXLowerAggrCopies.h"
-#include "llvm/ADT/OwningPtr.h"
 #include "llvm/Analysis/Passes.h"
 #include "llvm/CodeGen/AsmPrinter.h"
 #include "llvm/CodeGen/MachineFunctionAnalysis.h"
@@ -50,6 +49,7 @@ namespace llvm {
 void initializeNVVMReflectPass(PassRegistry&);
 void initializeGenericToNVVMPass(PassRegistry&);
 void initializeNVPTXAssignValidGlobalNamesPass(PassRegistry&);
+void initializeNVPTXFavorNonGenericAddrSpacesPass(PassRegistry &);
 }
 
 extern "C" void LLVMInitializeNVPTXTarget() {
@@ -62,6 +62,8 @@ extern "C" void LLVMInitializeNVPTXTarget() {
   initializeNVVMReflectPass(*PassRegistry::getPassRegistry());
   initializeGenericToNVVMPass(*PassRegistry::getPassRegistry());
   initializeNVPTXAssignValidGlobalNamesPass(*PassRegistry::getPassRegistry());
+  initializeNVPTXFavorNonGenericAddrSpacesPass(
+    *PassRegistry::getPassRegistry());
 }
 
 static std::string computeDataLayout(const NVPTXSubtarget &ST) {
@@ -113,14 +115,14 @@ public:
     return getTM<NVPTXTargetMachine>();
   }
 
-  virtual void addIRPasses();
-  virtual bool addInstSelector();
-  virtual bool addPreRegAlloc();
-  virtual bool addPostRegAlloc();
+  void addIRPasses() override;
+  bool addInstSelector() override;
+  bool addPreRegAlloc() override;
+  bool addPostRegAlloc() override;
 
-  virtual FunctionPass *createTargetRegisterAllocator(bool) override;
-  virtual void addFastRegAlloc(FunctionPass *RegAllocPass);
-  virtual void addOptimizedRegAlloc(FunctionPass *RegAllocPass);
+  FunctionPass *createTargetRegisterAllocator(bool) override;
+  void addFastRegAlloc(FunctionPass *RegAllocPass) override;
+  void addOptimizedRegAlloc(FunctionPass *RegAllocPass) override;
 };
 } // end anonymous namespace
 
@@ -140,15 +142,42 @@ void NVPTXPassConfig::addIRPasses() {
   disablePass(&BranchFolderPassID);
   disablePass(&TailDuplicateID);
 
+  addPass(createNVPTXImageOptimizerPass());
   TargetPassConfig::addIRPasses();
   addPass(createNVPTXAssignValidGlobalNamesPass());
   addPass(createGenericToNVVMPass());
+  addPass(createNVPTXFavorNonGenericAddrSpacesPass());
+  addPass(createSeparateConstOffsetFromGEPPass());
+  // The SeparateConstOffsetFromGEP pass creates variadic bases that can be used
+  // by multiple GEPs. Run GVN or EarlyCSE to really reuse them. GVN generates
+  // significantly better code than EarlyCSE for some of our benchmarks.
+  if (getOptLevel() == CodeGenOpt::Aggressive)
+    addPass(createGVNPass());
+  else
+    addPass(createEarlyCSEPass());
+  // Both FavorNonGenericAddrSpaces and SeparateConstOffsetFromGEP may leave
+  // some dead code.  We could remove dead code in an ad-hoc manner, but that
+  // requires manual work and might be error-prone.
+  //
+  // The FavorNonGenericAddrSpaces pass shortcuts unnecessary addrspacecasts,
+  // and leave them unused.
+  //
+  // SeparateConstOffsetFromGEP rebuilds a new index from the old index, and the
+  // old index and some of its intermediate results may become unused.
+  addPass(createDeadCodeEliminationPass());
 }
 
 bool NVPTXPassConfig::addInstSelector() {
+  const NVPTXSubtarget &ST =
+    getTM<NVPTXTargetMachine>().getSubtarget<NVPTXSubtarget>();
+
   addPass(createLowerAggrCopies());
   addPass(createAllocaHoisting());
   addPass(createNVPTXISelDag(getNVPTXTargetMachine(), getOptLevel()));
+
+  if (!ST.hasImageHandles())
+    addPass(createNVPTXReplaceImageHandlesPass());
+
   return false;
 }
 
@@ -159,7 +188,7 @@ bool NVPTXPassConfig::addPostRegAlloc() {
 }
 
 FunctionPass *NVPTXPassConfig::createTargetRegisterAllocator(bool) {
-  return 0; // No reg alloc
+  return nullptr; // No reg alloc
 }
 
 void NVPTXPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) {
diff --git a/lib/Target/NVPTX/NVPTXTargetMachine.h b/lib/Target/NVPTX/NVPTXTargetMachine.h
index 5fbcf73..2db7c18 100644
--- a/lib/Target/NVPTX/NVPTXTargetMachine.h
+++ b/lib/Target/NVPTX/NVPTXTargetMachine.h
@@ -51,22 +51,22 @@ public:
                      const TargetOptions &Options, Reloc::Model RM,
                      CodeModel::Model CM, CodeGenOpt::Level OP, bool is64bit);
 
-  virtual const TargetFrameLowering *getFrameLowering() const {
+  const TargetFrameLowering *getFrameLowering() const override {
     return &FrameLowering;
   }
-  virtual const NVPTXInstrInfo *getInstrInfo() const { return &InstrInfo; }
-  virtual const DataLayout *getDataLayout() const { return &DL; }
-  virtual const NVPTXSubtarget *getSubtargetImpl() const { return &Subtarget; }
+  const NVPTXInstrInfo *getInstrInfo() const override { return &InstrInfo; }
+  const DataLayout *getDataLayout() const override { return &DL; }
+  const NVPTXSubtarget *getSubtargetImpl() const override { return &Subtarget; }
 
-  virtual const NVPTXRegisterInfo *getRegisterInfo() const {
+  const NVPTXRegisterInfo *getRegisterInfo() const override {
     return &(InstrInfo.getRegisterInfo());
   }
 
-  virtual NVPTXTargetLowering *getTargetLowering() const {
+  NVPTXTargetLowering *getTargetLowering() const override {
     return const_cast<NVPTXTargetLowering *>(&TLInfo);
   }
 
-  virtual const TargetSelectionDAGInfo *getSelectionDAGInfo() const {
+  const TargetSelectionDAGInfo *getSelectionDAGInfo() const override {
     return &TSInfo;
   }
 
@@ -79,17 +79,17 @@ public:
     return const_cast<ManagedStringPool *>(&ManagedStrPool);
   }
 
-  virtual TargetPassConfig *createPassConfig(PassManagerBase &PM);
+  TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
 
   // Emission of machine code through JITCodeEmitter is not supported.
-  virtual bool addPassesToEmitMachineCode(PassManagerBase &, JITCodeEmitter &,
-                                          bool = true) {
+  bool addPassesToEmitMachineCode(PassManagerBase &, JITCodeEmitter &,
+                                  bool = true) override {
     return true;
   }
 
   // Emission of machine code through MCJIT is not supported.
-  virtual bool addPassesToEmitMC(PassManagerBase &, MCContext *&, raw_ostream &,
-                                 bool = true) {
+  bool addPassesToEmitMC(PassManagerBase &, MCContext *&, raw_ostream &,
+                         bool = true) override {
     return true;
   }
 
diff --git a/lib/Target/NVPTX/NVPTXTargetObjectFile.h b/lib/Target/NVPTX/NVPTXTargetObjectFile.h
index 2a7281e..0b438c5 100644
--- a/lib/Target/NVPTX/NVPTXTargetObjectFile.h
+++ b/lib/Target/NVPTX/NVPTXTargetObjectFile.h
@@ -22,26 +22,26 @@ class NVPTXTargetObjectFile : public TargetLoweringObjectFile {
 
 public:
   NVPTXTargetObjectFile() {
-    TextSection = 0;
-    DataSection = 0;
-    BSSSection = 0;
-    ReadOnlySection = 0;
+    TextSection = nullptr;
+    DataSection = nullptr;
+    BSSSection = nullptr;
+    ReadOnlySection = nullptr;
 
-    StaticCtorSection = 0;
-    StaticDtorSection = 0;
-    LSDASection = 0;
-    EHFrameSection = 0;
-    DwarfAbbrevSection = 0;
-    DwarfInfoSection = 0;
-    DwarfLineSection = 0;
-    DwarfFrameSection = 0;
-    DwarfPubTypesSection = 0;
-    DwarfDebugInlineSection = 0;
-    DwarfStrSection = 0;
-    DwarfLocSection = 0;
-    DwarfARangesSection = 0;
-    DwarfRangesSection = 0;
-    DwarfMacroInfoSection = 0;
+    StaticCtorSection = nullptr;
+    StaticDtorSection = nullptr;
+    LSDASection = nullptr;
+    EHFrameSection = nullptr;
+    DwarfAbbrevSection = nullptr;
+    DwarfInfoSection = nullptr;
+    DwarfLineSection = nullptr;
+    DwarfFrameSection = nullptr;
+    DwarfPubTypesSection = nullptr;
+    DwarfDebugInlineSection = nullptr;
+    DwarfStrSection = nullptr;
+    DwarfLocSection = nullptr;
+    DwarfARangesSection = nullptr;
+    DwarfRangesSection = nullptr;
+    DwarfMacroInfoSection = nullptr;
   }
 
   virtual ~NVPTXTargetObjectFile();
diff --git a/lib/Target/NVPTX/NVPTXUtilities.cpp b/lib/Target/NVPTX/NVPTXUtilities.cpp
index 60a5173..a9fd190b 100644
--- a/lib/Target/NVPTX/NVPTXUtilities.cpp
+++ b/lib/Target/NVPTX/NVPTXUtilities.cpp
@@ -22,9 +22,9 @@
 #include <map>
 #include <string>
 #include <vector>
-//#include <iostream>
 #include "llvm/Support/ManagedStatic.h"
 #include "llvm/IR/InstIterator.h"
+#include "llvm/Support/MutexGuard.h"
 
 using namespace llvm;
 
@@ -33,8 +33,15 @@ typedef std::map<const GlobalValue *, key_val_pair_t> global_val_annot_t;
 typedef std::map<const Module *, global_val_annot_t> per_module_annot_t;
 
 ManagedStatic<per_module_annot_t> annotationCache;
+static sys::Mutex Lock;
+
+void llvm::clearAnnotationCache(const llvm::Module *Mod) {
+  MutexGuard Guard(Lock);
+  annotationCache->erase(Mod);
+}
 
 static void cacheAnnotationFromMD(const MDNode *md, key_val_pair_t &retval) {
+  MutexGuard Guard(Lock);
   assert(md && "Invalid mdnode for annotation");
   assert((md->getNumOperands() % 2) == 1 && "Invalid number of operands");
   // start index = 1, to skip the global variable key
@@ -60,6 +67,7 @@ static void cacheAnnotationFromMD(const MDNode *md, key_val_pair_t &retval) {
 }
 
 static void cacheAnnotationFromMD(const Module *m, const GlobalValue *gv) {
+  MutexGuard Guard(Lock);
   NamedMDNode *NMD = m->getNamedMetadata(llvm::NamedMDForAnnotations);
   if (!NMD)
     return;
@@ -92,6 +100,7 @@ static void cacheAnnotationFromMD(const Module *m, const GlobalValue *gv) {
 
 bool llvm::findOneNVVMAnnotation(const GlobalValue *gv, std::string prop,
                                  unsigned &retval) {
+  MutexGuard Guard(Lock);
   const Module *m = gv->getParent();
   if ((*annotationCache).find(m) == (*annotationCache).end())
     cacheAnnotationFromMD(m, gv);
@@ -105,6 +114,7 @@ bool llvm::findOneNVVMAnnotation(const GlobalValue *gv, std::string prop,
 
 bool llvm::findAllNVVMAnnotation(const GlobalValue *gv, std::string prop,
                                  std::vector<unsigned> &retval) {
+  MutexGuard Guard(Lock);
   const Module *m = gv->getParent();
   if ((*annotationCache).find(m) == (*annotationCache).end())
     cacheAnnotationFromMD(m, gv);
@@ -195,8 +205,37 @@ bool llvm::isImageWriteOnly(const llvm::Value &val) {
   return false;
 }
 
+bool llvm::isImageReadWrite(const llvm::Value &val) {
+  if (const Argument *arg = dyn_cast<Argument>(&val)) {
+    const Function *func = arg->getParent();
+    std::vector<unsigned> annot;
+    if (llvm::findAllNVVMAnnotation(func,
+                                    llvm::PropertyAnnotationNames[
+                                        llvm::PROPERTY_ISREADWRITE_IMAGE_PARAM],
+                                    annot)) {
+      if (std::find(annot.begin(), annot.end(), arg->getArgNo()) != annot.end())
+        return true;
+    }
+  }
+  return false;
+}
+
 bool llvm::isImage(const llvm::Value &val) {
-  return llvm::isImageReadOnly(val) || llvm::isImageWriteOnly(val);
+  return llvm::isImageReadOnly(val) || llvm::isImageWriteOnly(val) ||
+         llvm::isImageReadWrite(val);
+}
+
+bool llvm::isManaged(const llvm::Value &val) {
+  if(const GlobalValue *gv = dyn_cast<GlobalValue>(&val)) {
+    unsigned annot;
+    if(llvm::findOneNVVMAnnotation(gv,
+                          llvm::PropertyAnnotationNames[llvm::PROPERTY_MANAGED],
+                                   annot)) {
+      assert((annot == 1) && "Unexpected annotation on a managed symbol");
+      return true;
+    }
+  }
+  return false;
 }
 
 std::string llvm::getTextureName(const llvm::Value &val) {
@@ -354,12 +393,12 @@ llvm::skipPointerTransfer(const Value *V, bool ignore_GEP_indices) {
 const Value *
 llvm::skipPointerTransfer(const Value *V, std::set<const Value *> &processed) {
   if (processed.find(V) != processed.end())
-    return NULL;
+    return nullptr;
   processed.insert(V);
 
   const Value *V2 = V->stripPointerCasts();
   if (V2 != V && processed.find(V2) != processed.end())
-    return NULL;
+    return nullptr;
   processed.insert(V2);
 
   V = V2;
@@ -375,20 +414,20 @@ llvm::skipPointerTransfer(const Value *V, std::set<const Value *> &processed) {
       continue;
     } else if (const PHINode *PN = dyn_cast<PHINode>(V)) {
       if (V != V2 && processed.find(V) != processed.end())
-        return NULL;
+        return nullptr;
       processed.insert(PN);
-      const Value *common = 0;
+      const Value *common = nullptr;
       for (unsigned i = 0; i != PN->getNumIncomingValues(); ++i) {
         const Value *pv = PN->getIncomingValue(i);
         const Value *base = skipPointerTransfer(pv, processed);
         if (base) {
-          if (common == 0)
+          if (!common)
             common = base;
           else if (common != base)
             return PN;
         }
       }
-      if (common == 0)
+      if (!common)
         return PN;
       V = common;
     }
@@ -406,7 +445,7 @@ BasicBlock *llvm::getParentBlock(Value *v) {
   if (Instruction *I = dyn_cast<Instruction>(v))
     return I->getParent();
 
-  return 0;
+  return nullptr;
 }
 
 Function *llvm::getParentFunction(Value *v) {
@@ -419,13 +458,13 @@ Function *llvm::getParentFunction(Value *v) {
   if (BasicBlock *B = dyn_cast<BasicBlock>(v))
     return B->getParent();
 
-  return 0;
+  return nullptr;
 }
 
 // Dump a block by name
 void llvm::dumpBlock(Value *v, char *blockName) {
   Function *F = getParentFunction(v);
-  if (F == 0)
+  if (!F)
     return;
 
   for (Function::iterator it = F->begin(), ie = F->end(); it != ie; ++it) {
@@ -440,8 +479,8 @@ void llvm::dumpBlock(Value *v, char *blockName) {
 // Find an instruction by name
 Instruction *llvm::getInst(Value *base, char *instName) {
   Function *F = getParentFunction(base);
-  if (F == 0)
-    return 0;
+  if (!F)
+    return nullptr;
 
   for (inst_iterator it = inst_begin(F), ie = inst_end(F); it != ie; ++it) {
     Instruction *I = &*it;
@@ -450,7 +489,7 @@ Instruction *llvm::getInst(Value *base, char *instName) {
     }
   }
 
-  return 0;
+  return nullptr;
 }
 
 // Dump an instruction by nane
diff --git a/lib/Target/NVPTX/NVPTXUtilities.h b/lib/Target/NVPTX/NVPTXUtilities.h
index a208004..446bfa1 100644
--- a/lib/Target/NVPTX/NVPTXUtilities.h
+++ b/lib/Target/NVPTX/NVPTXUtilities.h
@@ -28,6 +28,8 @@ namespace llvm {
 #define NVCL_IMAGE2D_READONLY_FUNCNAME "__is_image2D_readonly"
 #define NVCL_IMAGE3D_READONLY_FUNCNAME "__is_image3D_readonly"
 
+void clearAnnotationCache(const llvm::Module *);
+
 bool findOneNVVMAnnotation(const llvm::GlobalValue *, std::string, unsigned &);
 bool findAllNVVMAnnotation(const llvm::GlobalValue *, std::string,
                            std::vector<unsigned> &);
@@ -38,6 +40,8 @@ bool isSampler(const llvm::Value &);
 bool isImage(const llvm::Value &);
 bool isImageReadOnly(const llvm::Value &);
 bool isImageWriteOnly(const llvm::Value &);
+bool isImageReadWrite(const llvm::Value &);
+bool isManaged(const llvm::Value &);
 
 std::string getTextureName(const llvm::Value &);
 std::string getSurfaceName(const llvm::Value &);
diff --git a/lib/Target/NVPTX/NVVMReflect.cpp b/lib/Target/NVPTX/NVVMReflect.cpp
index 8b5444a..cb8bd72 100644
--- a/lib/Target/NVPTX/NVVMReflect.cpp
+++ b/lib/Target/NVPTX/NVVMReflect.cpp
@@ -38,6 +38,8 @@
 
 using namespace llvm;
 
+#define DEBUG_TYPE "nvptx-reflect"
+
 namespace llvm { void initializeNVVMReflectPass(PassRegistry &); }
 
 namespace {
@@ -49,13 +51,13 @@ private:
 
 public:
   static char ID;
-  NVVMReflect() : ModulePass(ID), ReflectFunction(0) {
+  NVVMReflect() : ModulePass(ID), ReflectFunction(nullptr) {
     initializeNVVMReflectPass(*PassRegistry::getPassRegistry());
     VarMap.clear();
   }
 
   NVVMReflect(const StringMap<int> &Mapping)
-  : ModulePass(ID), ReflectFunction(0) {
+  : ModulePass(ID), ReflectFunction(nullptr) {
     initializeNVVMReflectPass(*PassRegistry::getPassRegistry());
     for (StringMap<int>::const_iterator I = Mapping.begin(), E = Mapping.end();
          I != E; ++I) {
@@ -63,8 +65,10 @@ public:
     }
   }
 
-  void getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesAll(); }
-  virtual bool runOnModule(Module &);
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesAll();
+  }
+  bool runOnModule(Module &) override;
 
   void setVarMap();
 };
@@ -126,7 +130,7 @@ bool NVVMReflect::runOnModule(Module &M) {
 
   // If reflect function is not used, then there will be
   // no entry in the module.
-  if (ReflectFunction == 0)
+  if (!ReflectFunction)
     return false;
 
   // Validate _reflect function
diff --git a/lib/Target/PowerPC/AsmParser/LLVMBuild.txt b/lib/Target/PowerPC/AsmParser/LLVMBuild.txt
index 02ebf1d..801f27b 100644
--- a/lib/Target/PowerPC/AsmParser/LLVMBuild.txt
+++ b/lib/Target/PowerPC/AsmParser/LLVMBuild.txt
@@ -1,4 +1,4 @@
-;===- ./lib/Target/PowerPC/AsmParser/LLVMBuild.txt --------------*- Conf -*--===;
+;===- ./lib/Target/PowerPC/AsmParser/LLVMBuild.txt -------------*- Conf -*--===;
 ;
 ;                     The LLVM Compiler Infrastructure
 ;
@@ -19,5 +19,5 @@
 type = Library
 name = PowerPCAsmParser
 parent = PowerPC
-required_libraries = PowerPCDesc PowerPCInfo MC MCParser Support
+required_libraries = MC MCParser PowerPCDesc PowerPCInfo Support
 add_to_library_groups = PowerPC
diff --git a/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp b/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
index 8bb91cf..3ac037d 100644
--- a/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
+++ b/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
@@ -230,7 +230,7 @@ class PPCAsmParser : public MCTargetAsmParser {
   bool MatchRegisterName(const AsmToken &Tok,
                          unsigned &RegNo, int64_t &IntVal);
 
-  virtual bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc);
+  bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override;
 
   const MCExpr *ExtractModifierFromExpr(const MCExpr *E,
                                         PPCMCExpr::VariantKind &Variant);
@@ -248,7 +248,7 @@ class PPCAsmParser : public MCTargetAsmParser {
   bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
                                SmallVectorImpl<MCParsedAsmOperand*> &Operands,
                                MCStreamer &Out, unsigned &ErrorInfo,
-                               bool MatchingInlineAsm);
+                               bool MatchingInlineAsm) override;
 
   void ProcessInstruction(MCInst &Inst,
                           const SmallVectorImpl<MCParsedAsmOperand*> &Ops);
@@ -264,7 +264,8 @@ class PPCAsmParser : public MCTargetAsmParser {
 
 public:
   PPCAsmParser(MCSubtargetInfo &_STI, MCAsmParser &_Parser,
-               const MCInstrInfo &_MII)
+               const MCInstrInfo &_MII,
+               const MCTargetOptions &Options)
       : MCTargetAsmParser(), STI(_STI), Parser(_Parser), MII(_MII) {
     // Check for 64-bit vs. 32-bit pointer mode.
     Triple TheTriple(STI.getTargetTriple());
@@ -275,17 +276,18 @@ public:
     setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
   }
 
-  virtual bool ParseInstruction(ParseInstructionInfo &Info,
-                                StringRef Name, SMLoc NameLoc,
-                                SmallVectorImpl<MCParsedAsmOperand*> &Operands);
+  bool ParseInstruction(ParseInstructionInfo &Info,
+                        StringRef Name, SMLoc NameLoc,
+                        SmallVectorImpl<MCParsedAsmOperand*> &Operands) override;
 
-  virtual bool ParseDirective(AsmToken DirectiveID);
+  bool ParseDirective(AsmToken DirectiveID) override;
 
-  unsigned validateTargetOperandClass(MCParsedAsmOperand *Op, unsigned Kind);
+  unsigned validateTargetOperandClass(MCParsedAsmOperand *Op,
+                                      unsigned Kind) override;
 
-  virtual const MCExpr *applyModifierToExpr(const MCExpr *E,
-                                            MCSymbolRefExpr::VariantKind,
-                                            MCContext &Ctx);
+  const MCExpr *applyModifierToExpr(const MCExpr *E,
+                                    MCSymbolRefExpr::VariantKind,
+                                    MCContext &Ctx) override;
 };
 
 /// PPCOperand - Instances of this class represent a parsed PowerPC machine
@@ -350,10 +352,10 @@ public:
   }
 
   /// getStartLoc - Get the location of the first token of this operand.
-  SMLoc getStartLoc() const { return StartLoc; }
+  SMLoc getStartLoc() const override { return StartLoc; }
 
   /// getEndLoc - Get the location of the last token of this operand.
-  SMLoc getEndLoc() const { return EndLoc; }
+  SMLoc getEndLoc() const override { return EndLoc; }
 
   /// isPPC64 - True if this operand is for an instruction in 64-bit mode.
   bool isPPC64() const { return IsPPC64; }
@@ -378,7 +380,7 @@ public:
     return TLSReg.Sym;
   }
 
-  unsigned getReg() const {
+  unsigned getReg() const override {
     assert(isRegNumber() && "Invalid access!");
     return (unsigned) Imm.Val;
   }
@@ -403,8 +405,8 @@ public:
     return 7 - countTrailingZeros<uint64_t>(Imm.Val);
   }
 
-  bool isToken() const { return Kind == Token; }
-  bool isImm() const { return Kind == Immediate || Kind == Expression; }
+  bool isToken() const override { return Kind == Token; }
+  bool isImm() const override { return Kind == Immediate || Kind == Expression; }
   bool isU2Imm() const { return Kind == Immediate && isUInt<2>(getImm()); }
   bool isU5Imm() const { return Kind == Immediate && isUInt<5>(getImm()); }
   bool isS5Imm() const { return Kind == Immediate && isInt<5>(getImm()); }
@@ -437,8 +439,8 @@ public:
                                        && isUInt<5>(getImm())); }
   bool isCRBitMask() const { return Kind == Immediate && isUInt<8>(getImm()) &&
                                     isPowerOf2_32(getImm()); }
-  bool isMem() const { return false; }
-  bool isReg() const { return false; }
+  bool isMem() const override { return false; }
+  bool isReg() const override { return false; }
 
   void addRegOperands(MCInst &Inst, unsigned N) const {
     llvm_unreachable("addRegOperands");
@@ -544,7 +546,7 @@ public:
     return StringRef(Tok.Data, Tok.Length);
   }
 
-  virtual void print(raw_ostream &OS) const;
+  void print(raw_ostream &OS) const override;
 
   static PPCOperand *CreateToken(StringRef Str, SMLoc S, bool IsPPC64) {
     PPCOperand *Op = new PPCOperand(Token);
@@ -1021,7 +1023,7 @@ ExtractModifierFromExpr(const MCExpr *E,
   switch (E->getKind()) {
   case MCExpr::Target:
   case MCExpr::Constant:
-    return 0;
+    return nullptr;
 
   case MCExpr::SymbolRef: {
     const MCSymbolRefExpr *SRE = cast<MCSymbolRefExpr>(E);
@@ -1049,7 +1051,7 @@ ExtractModifierFromExpr(const MCExpr *E,
       Variant = PPCMCExpr::VK_PPC_HIGHESTA;
       break;
     default:
-      return 0;
+      return nullptr;
     }
 
     return MCSymbolRefExpr::Create(&SRE->getSymbol(), Context);
@@ -1059,7 +1061,7 @@ ExtractModifierFromExpr(const MCExpr *E,
     const MCUnaryExpr *UE = cast<MCUnaryExpr>(E);
     const MCExpr *Sub = ExtractModifierFromExpr(UE->getSubExpr(), Variant);
     if (!Sub)
-      return 0;
+      return nullptr;
     return MCUnaryExpr::Create(UE->getOpcode(), Sub, Context);
   }
 
@@ -1070,7 +1072,7 @@ ExtractModifierFromExpr(const MCExpr *E,
     const MCExpr *RHS = ExtractModifierFromExpr(BE->getRHS(), RHSVariant);
 
     if (!LHS && !RHS)
-      return 0;
+      return nullptr;
 
     if (!LHS) LHS = BE->getLHS();
     if (!RHS) RHS = BE->getRHS();
@@ -1082,7 +1084,7 @@ ExtractModifierFromExpr(const MCExpr *E,
     else if (LHSVariant == RHSVariant)
       Variant = LHSVariant;
     else
-      return 0;
+      return nullptr;
 
     return MCBinaryExpr::Create(BE->getOpcode(), LHS, RHS, Context);
   }
@@ -1593,6 +1595,6 @@ PPCAsmParser::applyModifierToExpr(const MCExpr *E,
   case MCSymbolRefExpr::VK_PPC_HIGHESTA:
     return PPCMCExpr::Create(PPCMCExpr::VK_PPC_HIGHESTA, E, false, Ctx);
   default:
-    return 0;
+    return nullptr;
   }
 }
diff --git a/lib/Target/PowerPC/Disassembler/LLVMBuild.txt b/lib/Target/PowerPC/Disassembler/LLVMBuild.txt
index 7f29040..c1011ff 100644
--- a/lib/Target/PowerPC/Disassembler/LLVMBuild.txt
+++ b/lib/Target/PowerPC/Disassembler/LLVMBuild.txt
@@ -19,5 +19,5 @@
 type = Library
 name = PowerPCDisassembler
 parent = PowerPC
-required_libraries = MC Support PowerPCDesc PowerPCInfo
+required_libraries = MC PowerPCDesc PowerPCInfo Support
 add_to_library_groups = PowerPC
diff --git a/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp b/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp
index c4a7544..a2305a9 100644
--- a/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp
+++ b/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp
@@ -17,13 +17,15 @@
 
 using namespace llvm;
 
+#define DEBUG_TYPE "ppc-disassembler"
+
 typedef MCDisassembler::DecodeStatus DecodeStatus;
 
 namespace {
 class PPCDisassembler : public MCDisassembler {
 public:
-  PPCDisassembler(const MCSubtargetInfo &STI)
-    : MCDisassembler(STI) {}
+  PPCDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx)
+    : MCDisassembler(STI, Ctx) {}
   virtual ~PPCDisassembler() {}
 
   // Override MCDisassembler.
@@ -37,8 +39,9 @@ public:
 } // end anonymous namespace
 
 static MCDisassembler *createPPCDisassembler(const Target &T,
-                                             const MCSubtargetInfo &STI) {
-  return new PPCDisassembler(STI);
+                                             const MCSubtargetInfo &STI,
+                                             MCContext &Ctx) {
+  return new PPCDisassembler(STI, Ctx);
 }
 
 extern "C" void LLVMInitializePowerPCDisassembler() {
diff --git a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp
index dc54b52..7279b09 100644
--- a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp
+++ b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp
@@ -11,7 +11,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "asm-printer"
 #include "PPCInstPrinter.h"
 #include "MCTargetDesc/PPCMCTargetDesc.h"
 #include "MCTargetDesc/PPCPredicates.h"
@@ -23,6 +22,8 @@
 #include "llvm/Target/TargetOpcodes.h"
 using namespace llvm;
 
+#define DEBUG_TYPE "asm-printer"
+
 // FIXME: Once the integrated assembler supports full register names, tie this
 // to the verbose-asm setting.
 static cl::opt<bool>
diff --git a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h
index 4d1df78..211a628 100644
--- a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h
+++ b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.h
@@ -31,8 +31,8 @@ public:
     return IsDarwin;
   }
   
-  virtual void printRegName(raw_ostream &OS, unsigned RegNo) const;
-  virtual void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot);
+  void printRegName(raw_ostream &OS, unsigned RegNo) const override;
+  void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot) override;
   
   // Autogenerated by tblgen.
   void printInstruction(const MCInst *MI, raw_ostream &O);
@@ -41,7 +41,7 @@ public:
 
   void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
   void printPredicateOperand(const MCInst *MI, unsigned OpNo,
-                             raw_ostream &O, const char *Modifier = 0);
+                             raw_ostream &O, const char *Modifier = nullptr);
 
   void printU2ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
   void printS5ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
index f7309bb..12584be 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
@@ -77,9 +77,11 @@ public:
   PPCAsmBackend(const Target &T, bool isLittle) : MCAsmBackend(), TheTarget(T),
     IsLittleEndian(isLittle) {}
 
-  unsigned getNumFixupKinds() const { return PPC::NumTargetFixupKinds; }
+  unsigned getNumFixupKinds() const override {
+    return PPC::NumTargetFixupKinds;
+  }
 
-  const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const {
+  const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override {
     const static MCFixupKindInfo InfosBE[PPC::NumTargetFixupKinds] = {
       // name                    offset  bits  flags
       { "fixup_ppc_br24",        6,      24,   MCFixupKindInfo::FKF_IsPCRel },
@@ -110,7 +112,7 @@ public:
   }
 
   void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
-                  uint64_t Value, bool IsPCRel) const {
+                  uint64_t Value, bool IsPCRel) const override {
     Value = adjustFixupValue(Fixup.getKind(), Value);
     if (!Value) return;           // Doesn't change encoding.
 
@@ -126,7 +128,7 @@ public:
     }
   }
 
-  bool mayNeedRelaxation(const MCInst &Inst) const {
+  bool mayNeedRelaxation(const MCInst &Inst) const override {
     // FIXME.
     return false;
   }
@@ -134,18 +136,18 @@ public:
   bool fixupNeedsRelaxation(const MCFixup &Fixup,
                             uint64_t Value,
                             const MCRelaxableFragment *DF,
-                            const MCAsmLayout &Layout) const {
+                            const MCAsmLayout &Layout) const override {
     // FIXME.
     llvm_unreachable("relaxInstruction() unimplemented");
   }
 
 
-  void relaxInstruction(const MCInst &Inst, MCInst &Res) const {
+  void relaxInstruction(const MCInst &Inst, MCInst &Res) const override {
     // FIXME.
     llvm_unreachable("relaxInstruction() unimplemented");
   }
 
-  bool writeNopData(uint64_t Count, MCObjectWriter *OW) const {
+  bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override {
     uint64_t NumNops = Count / 4;
     for (uint64_t i = 0; i != NumNops; ++i)
       OW->Write32(0x60000000);
@@ -180,7 +182,7 @@ namespace {
   public:
     DarwinPPCAsmBackend(const Target &T) : PPCAsmBackend(T, false) { }
 
-    MCObjectWriter *createObjectWriter(raw_ostream &OS) const {
+    MCObjectWriter *createObjectWriter(raw_ostream &OS) const override {
       bool is64 = getPointerSize() == 8;
       return createPPCMachObjectWriter(
           OS,
@@ -197,7 +199,7 @@ namespace {
       PPCAsmBackend(T, IsLittleEndian), OSABI(OSABI) { }
 
 
-    MCObjectWriter *createObjectWriter(raw_ostream &OS) const {
+    MCObjectWriter *createObjectWriter(raw_ostream &OS) const override {
       bool is64 = getPointerSize() == 8;
       return createPPCELFObjectWriter(OS, is64, isLittleEndian(), OSABI);
     }
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp
index d19f6a0..cd3b4f4 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp
@@ -41,11 +41,12 @@ PPCELFObjectWriter::PPCELFObjectWriter(bool Is64Bit, uint8_t OSABI)
 PPCELFObjectWriter::~PPCELFObjectWriter() {
 }
 
-static MCSymbolRefExpr::VariantKind getAccessVariant(const MCFixup &Fixup) {
+static MCSymbolRefExpr::VariantKind getAccessVariant(const MCValue &Target,
+                                                     const MCFixup &Fixup) {
   const MCExpr *Expr = Fixup.getValue();
 
   if (Expr->getKind() != MCExpr::Target)
-    return Fixup.getAccessVariant();
+    return Target.getAccessVariant();
 
   switch (cast<PPCMCExpr>(Expr)->getKind()) {
   case PPCMCExpr::VK_PPC_None:
@@ -72,7 +73,7 @@ unsigned PPCELFObjectWriter::getRelocTypeInner(const MCValue &Target,
                                                const MCFixup &Fixup,
                                                bool IsPCRel) const
 {
-  MCSymbolRefExpr::VariantKind Modifier = getAccessVariant(Fixup);
+  MCSymbolRefExpr::VariantKind Modifier = getAccessVariant(Target, Fixup);
 
   // determine the type of the relocation
   unsigned Type;
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
index 18609e1..b95a2ac 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
@@ -28,7 +28,7 @@ PPCMCAsmInfoDarwin::PPCMCAsmInfoDarwin(bool is64Bit, const Triple& T) {
   ExceptionsType = ExceptionHandling::DwarfCFI;
 
   if (!is64Bit)
-    Data64bitsDirective = 0;      // We can't emit a 64-bit unit in PPC32 mode.
+    Data64bitsDirective = nullptr; // We can't emit a 64-bit unit in PPC32 mode.
 
   AssemblerDialect = 1;           // New-Style mnemonics.
   SupportsDebugInformation= true; // Debug information.
@@ -71,7 +71,7 @@ PPCLinuxMCAsmInfo::PPCLinuxMCAsmInfo(bool is64Bit, const Triple& T) {
   ExceptionsType = ExceptionHandling::DwarfCFI;
     
   ZeroDirective = "\t.space\t";
-  Data64bitsDirective = is64Bit ? "\t.quad\t" : 0;
+  Data64bitsDirective = is64Bit ? "\t.quad\t" : nullptr;
   AssemblerDialect = 1;           // New-Style mnemonics.
 
   if (T.getOS() == llvm::Triple::FreeBSD ||
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h b/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h
index cee2cb7..754330b 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h
@@ -21,13 +21,13 @@ namespace llvm {
 class Triple;
 
   class PPCMCAsmInfoDarwin : public MCAsmInfoDarwin {
-    virtual void anchor();
+    void anchor() override;
   public:
     explicit PPCMCAsmInfoDarwin(bool is64Bit, const Triple&);
   };
 
   class PPCLinuxMCAsmInfo : public MCAsmInfoELF {
-    virtual void anchor();
+    void anchor() override;
   public:
     explicit PPCLinuxMCAsmInfo(bool is64Bit, const Triple&);
   };
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
index b259c5d..a4983ad 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
@@ -11,7 +11,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "mccodeemitter"
 #include "MCTargetDesc/PPCMCTargetDesc.h"
 #include "MCTargetDesc/PPCFixupKinds.h"
 #include "llvm/ADT/Statistic.h"
@@ -26,6 +25,8 @@
 #include "llvm/Target/TargetOpcodes.h"
 using namespace llvm;
 
+#define DEBUG_TYPE "mccodeemitter"
+
 STATISTIC(MCNumEmitted, "Number of MC instructions emitted");
 
 namespace {
@@ -88,7 +89,7 @@ public:
                                  const MCSubtargetInfo &STI) const;
   void EncodeInstruction(const MCInst &MI, raw_ostream &OS,
                          SmallVectorImpl<MCFixup> &Fixups,
-                         const MCSubtargetInfo &STI) const {
+                         const MCSubtargetInfo &STI) const override {
     // For fast-isel, a float COPY_TO_REGCLASS can survive this long.
     // It's just a nop to keep the register classes happy, so don't
     // generate anything.
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp
index c181e03..10d068d 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp
@@ -7,7 +7,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "ppcmcexpr"
 #include "PPCMCExpr.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCAssembler.h"
@@ -15,6 +14,8 @@
 
 using namespace llvm;
 
+#define DEBUG_TYPE "ppcmcexpr"
+
 const PPCMCExpr*
 PPCMCExpr::Create(VariantKind Kind, const MCExpr *Expr,
                   bool isDarwin, MCContext &Ctx) {
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h b/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h
index 5fc7918..3421b91 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h
@@ -76,16 +76,16 @@ public:
 
   /// @}
 
-  void PrintImpl(raw_ostream &OS) const;
+  void PrintImpl(raw_ostream &OS) const override;
   bool EvaluateAsRelocatableImpl(MCValue &Res,
-                                 const MCAsmLayout *Layout) const;
-  void AddValueSymbols(MCAssembler *) const;
-  const MCSection *FindAssociatedSection() const {
+                                 const MCAsmLayout *Layout) const override;
+  void AddValueSymbols(MCAssembler *) const override;
+  const MCSection *FindAssociatedSection() const override {
     return getSubExpr()->FindAssociatedSection();
   }
 
   // There are no TLS PPCMCExprs at the moment.
-  void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const {}
+  void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const override {}
 
   static bool classof(const MCExpr *E) {
     return E->getKind() == MCExpr::Target;
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
index 105c511..7057797 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
@@ -26,6 +26,8 @@
 #include "llvm/Support/FormattedStream.h"
 #include "llvm/Support/TargetRegistry.h"
 
+using namespace llvm;
+
 #define GET_INSTRINFO_MC_DESC
 #include "PPCGenInstrInfo.inc"
 
@@ -35,8 +37,6 @@
 #define GET_REGINFO_MC_DESC
 #include "PPCGenRegisterInfo.inc"
 
-using namespace llvm;
-
 // Pin the vtable to this file.
 PPCTargetStreamer::~PPCTargetStreamer() {}
 PPCTargetStreamer::PPCTargetStreamer(MCStreamer &S) : MCTargetStreamer(S) {}
@@ -80,7 +80,7 @@ static MCAsmInfo *createPPCMCAsmInfo(const MCRegisterInfo &MRI, StringRef TT) {
   // Initial state of the frame pointer is R1.
   unsigned Reg = isPPC64 ? PPC::X1 : PPC::R1;
   MCCFIInstruction Inst =
-      MCCFIInstruction::createDefCfa(0, MRI.getDwarfRegNum(Reg, true), 0);
+      MCCFIInstruction::createDefCfa(nullptr, MRI.getDwarfRegNum(Reg, true), 0);
   MAI->addInitialFrameState(Inst);
 
   return MAI;
@@ -115,14 +115,14 @@ class PPCTargetAsmStreamer : public PPCTargetStreamer {
 public:
   PPCTargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS)
       : PPCTargetStreamer(S), OS(OS) {}
-  virtual void emitTCEntry(const MCSymbol &S) {
+  void emitTCEntry(const MCSymbol &S) override {
     OS << "\t.tc ";
     OS << S.getName();
     OS << "[TC],";
     OS << S.getName();
     OS << '\n';
   }
-  virtual void emitMachine(StringRef CPU) {
+  void emitMachine(StringRef CPU) override {
     OS << "\t.machine " << CPU << '\n';
   }
 };
@@ -130,11 +130,11 @@ public:
 class PPCTargetELFStreamer : public PPCTargetStreamer {
 public:
   PPCTargetELFStreamer(MCStreamer &S) : PPCTargetStreamer(S) {}
-  virtual void emitTCEntry(const MCSymbol &S) {
+  void emitTCEntry(const MCSymbol &S) override {
     // Creates a R_PPC64_TOC relocation
     Streamer.EmitSymbolValue(&S, 8);
   }
-  virtual void emitMachine(StringRef CPU) {
+  void emitMachine(StringRef CPU) override {
     // FIXME: Is there anything to do in here or does this directive only
     // limit the parser?
   }
@@ -143,10 +143,10 @@ public:
 class PPCTargetMachOStreamer : public PPCTargetStreamer {
 public:
   PPCTargetMachOStreamer(MCStreamer &S) : PPCTargetStreamer(S) {}
-  virtual void emitTCEntry(const MCSymbol &S) {
+  void emitTCEntry(const MCSymbol &S) override {
     llvm_unreachable("Unknown pseudo-op: .tc");
   }
-  virtual void emitMachine(StringRef CPU) {
+  void emitMachine(StringRef CPU) override {
     // FIXME: We should update the CPUType, CPUSubType in the Object file if
     // the new values are different from the defaults.
   }
@@ -175,13 +175,12 @@ static MCStreamer *createMCStreamer(const Target &T, StringRef TT,
 
 static MCStreamer *
 createMCAsmStreamer(MCContext &Ctx, formatted_raw_ostream &OS,
-                    bool isVerboseAsm, bool useCFI, bool useDwarfDirectory,
+                    bool isVerboseAsm, bool useDwarfDirectory,
                     MCInstPrinter *InstPrint, MCCodeEmitter *CE,
                     MCAsmBackend *TAB, bool ShowInst) {
 
-  MCStreamer *S =
-      llvm::createAsmStreamer(Ctx, OS, isVerboseAsm, useCFI, useDwarfDirectory,
-                              InstPrint, CE, TAB, ShowInst);
+  MCStreamer *S = llvm::createAsmStreamer(
+      Ctx, OS, isVerboseAsm, useDwarfDirectory, InstPrint, CE, TAB, ShowInst);
   new PPCTargetAsmStreamer(*S, OS);
   return S;
 }
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp
index bbafe2e..cff27ba 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp
@@ -44,7 +44,7 @@ public:
   void RecordRelocation(MachObjectWriter *Writer, const MCAssembler &Asm,
                         const MCAsmLayout &Layout, const MCFragment *Fragment,
                         const MCFixup &Fixup, MCValue Target,
-                        uint64_t &FixedValue) {
+                        uint64_t &FixedValue) override {
     if (Writer->is64Bit()) {
       report_fatal_error("Relocation emission for MachO/PPC64 unimplemented.");
     } else
@@ -206,7 +206,7 @@ bool PPCMachObjectWriter::RecordScatteredRelocation(
 
   // See <reloc.h>.
   const MCSymbol *A = &Target.getSymA()->getSymbol();
-  MCSymbolData *A_SD = &Asm.getSymbolData(*A);
+  const MCSymbolData *A_SD = &Asm.getSymbolData(*A);
 
   if (!A_SD->getFragment())
     report_fatal_error("symbol '" + A->getName() +
@@ -219,7 +219,7 @@ bool PPCMachObjectWriter::RecordScatteredRelocation(
   uint32_t Value2 = 0;
 
   if (const MCSymbolRefExpr *B = Target.getSymB()) {
-    MCSymbolData *B_SD = &Asm.getSymbolData(B->getSymbol());
+    const MCSymbolData *B_SD = &Asm.getSymbolData(B->getSymbol());
 
     if (!B_SD->getFragment())
       report_fatal_error("symbol '" + B->getSymbol().getName() +
@@ -324,7 +324,7 @@ void PPCMachObjectWriter::RecordPPCRelocation(
 
   // this doesn't seem right for RIT_PPC_BR24
   // Get the symbol data, if any.
-  MCSymbolData *SD = 0;
+  const MCSymbolData *SD = nullptr;
   if (Target.getSymA())
     SD = &Asm.getSymbolData(Target.getSymA()->getSymbol());
 
diff --git a/lib/Target/PowerPC/PPCAsmPrinter.cpp b/lib/Target/PowerPC/PPCAsmPrinter.cpp
index 9ce8ea9..e89fb2d 100644
--- a/lib/Target/PowerPC/PPCAsmPrinter.cpp
+++ b/lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -16,7 +16,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "asmprinter"
 #include "PPC.h"
 #include "InstPrinter/PPCInstPrinter.h"
 #include "MCTargetDesc/PPCMCExpr.h"
@@ -59,6 +58,8 @@
 #include "llvm/Target/TargetRegisterInfo.h"
 using namespace llvm;
 
+#define DEBUG_TYPE "asmprinter"
+
 namespace {
   class PPCAsmPrinter : public AsmPrinter {
   protected:
@@ -70,22 +71,22 @@ namespace {
       : AsmPrinter(TM, Streamer),
         Subtarget(TM.getSubtarget<PPCSubtarget>()), TOCLabelID(0) {}
 
-    virtual const char *getPassName() const {
+    const char *getPassName() const override {
       return "PowerPC Assembly Printer";
     }
 
     MCSymbol *lookUpOrCreateTOCEntry(MCSymbol *Sym);
 
-    virtual void EmitInstruction(const MachineInstr *MI);
+    void EmitInstruction(const MachineInstr *MI) override;
 
     void printOperand(const MachineInstr *MI, unsigned OpNo, raw_ostream &O);
 
     bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
                          unsigned AsmVariant, const char *ExtraCode,
-                         raw_ostream &O);
+                         raw_ostream &O) override;
     bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
                                unsigned AsmVariant, const char *ExtraCode,
-                               raw_ostream &O);
+                               raw_ostream &O) override;
   };
 
   /// PPCLinuxAsmPrinter - PowerPC assembly printer, customized for Linux
@@ -94,15 +95,15 @@ namespace {
     explicit PPCLinuxAsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
       : PPCAsmPrinter(TM, Streamer) {}
 
-    virtual const char *getPassName() const {
+    const char *getPassName() const override {
       return "Linux PPC Assembly Printer";
     }
 
-    bool doFinalization(Module &M);
+    bool doFinalization(Module &M) override;
 
-    virtual void EmitFunctionEntryLabel();
+    void EmitFunctionEntryLabel() override;
 
-    void EmitFunctionBodyEnd();
+    void EmitFunctionBodyEnd() override;
   };
 
   /// PPCDarwinAsmPrinter - PowerPC assembly printer, customized for Darwin/Mac
@@ -112,12 +113,12 @@ namespace {
     explicit PPCDarwinAsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
       : PPCAsmPrinter(TM, Streamer) {}
 
-    virtual const char *getPassName() const {
+    const char *getPassName() const override {
       return "Darwin PPC Assembly Printer";
     }
 
-    bool doFinalization(Module &M);
-    void EmitStartOfAsmFile(Module &M);
+    bool doFinalization(Module &M) override;
+    void EmitStartOfAsmFile(Module &M) override;
 
     void EmitFunctionStubs(const MachineModuleInfoMachO::SymbolListTy &Stubs);
   };
@@ -180,7 +181,7 @@ void PPCAsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo,
         MachineModuleInfoImpl::StubValueTy &StubSym = 
           MMI->getObjFileInfo<MachineModuleInfoMachO>()
             .getGVStubEntry(SymToPrint);
-        if (StubSym.getPointer() == 0)
+        if (!StubSym.getPointer())
           StubSym = MachineModuleInfoImpl::
             StubValueTy(getSymbol(GV), !GV->hasInternalLinkage());
       } else if (GV->isDeclaration() || GV->hasCommonLinkage() ||
@@ -190,7 +191,7 @@ void PPCAsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo,
         MachineModuleInfoImpl::StubValueTy &StubSym = 
           MMI->getObjFileInfo<MachineModuleInfoMachO>().
                     getHiddenGVStubEntry(SymToPrint);
-        if (StubSym.getPointer() == 0)
+        if (!StubSym.getPointer())
           StubSym = MachineModuleInfoImpl::
             StubValueTy(getSymbol(GV), !GV->hasInternalLinkage());
       } else {
@@ -207,7 +208,7 @@ void PPCAsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo,
   }
 
   default:
-    O << "<unknown operand type: " << MO.getType() << ">";
+    O << "<unknown operand type: " << (unsigned)MO.getType() << ">";
     return;
   }
 }
@@ -288,9 +289,9 @@ MCSymbol *PPCAsmPrinter::lookUpOrCreateTOCEntry(MCSymbol *Sym) {
   MCSymbol *&TOCEntry = TOC[Sym];
 
   // To avoid name clash check if the name already exists.
-  while (TOCEntry == 0) {
+  while (!TOCEntry) {
     if (OutContext.LookupSymbol(Twine(DL->getPrivateGlobalPrefix()) +
-                                "C" + Twine(TOCLabelID++)) == 0) {
+                                "C" + Twine(TOCLabelID++)) == nullptr) {
       TOCEntry = GetTempSymbol("C", TOCLabelID);
     }
   }
@@ -342,7 +343,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
 
     // Map symbol -> label of TOC entry
     assert(MO.isGlobal() || MO.isCPI() || MO.isJTI());
-    MCSymbol *MOSymbol = 0;
+    MCSymbol *MOSymbol = nullptr;
     if (MO.isGlobal())
       MOSymbol = getSymbol(MO.getGlobal());
     else if (MO.isCPI())
@@ -372,23 +373,19 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
     const MachineOperand &MO = MI->getOperand(2);
     assert((MO.isGlobal() || MO.isCPI() || MO.isJTI()) &&
            "Invalid operand for ADDIStocHA!");
-    MCSymbol *MOSymbol = 0;
+    MCSymbol *MOSymbol = nullptr;
     bool IsExternal = false;
     bool IsFunction = false;
     bool IsCommon = false;
     bool IsAvailExt = false;
 
     if (MO.isGlobal()) {
-      const GlobalValue *GValue = MO.getGlobal();
-      const GlobalAlias *GAlias = dyn_cast<GlobalAlias>(GValue);
-      const GlobalValue *RealGValue =
-          GAlias ? GAlias->getAliasedGlobal() : GValue;
-      MOSymbol = getSymbol(RealGValue);
-      const GlobalVariable *GVar = dyn_cast<GlobalVariable>(RealGValue);
-      IsExternal = GVar && !GVar->hasInitializer();
-      IsCommon = GVar && RealGValue->hasCommonLinkage();
-      IsFunction = !GVar;
-      IsAvailExt = GVar && RealGValue->hasAvailableExternallyLinkage();
+      const GlobalValue *GV = MO.getGlobal();
+      MOSymbol = getSymbol(GV);
+      IsExternal = GV->isDeclaration();
+      IsCommon = GV->hasCommonLinkage();
+      IsFunction = GV->getType()->getElementType()->isFunctionTy();
+      IsAvailExt = GV->hasAvailableExternallyLinkage();
     } else if (MO.isCPI())
       MOSymbol = GetCPISymbol(MO.getIndex());
     else if (MO.isJTI())
@@ -416,7 +413,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
     const MachineOperand &MO = MI->getOperand(1);
     assert((MO.isGlobal() || MO.isJTI() || MO.isCPI()) &&
            "Invalid operand for LDtocL!");
-    MCSymbol *MOSymbol = 0;
+    MCSymbol *MOSymbol = nullptr;
 
     if (MO.isJTI())
       MOSymbol = lookUpOrCreateTOCEntry(GetJTISymbol(MO.getIndex()));
@@ -427,14 +424,9 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
     }
     else if (MO.isGlobal()) {
       const GlobalValue *GValue = MO.getGlobal();
-      const GlobalAlias *GAlias = dyn_cast<GlobalAlias>(GValue);
-      const GlobalValue *RealGValue =
-          GAlias ? GAlias->getAliasedGlobal() : GValue;
-      MOSymbol = getSymbol(RealGValue);
-      const GlobalVariable *GVar = dyn_cast<GlobalVariable>(RealGValue);
-    
-      if (!GVar || !GVar->hasInitializer() || RealGValue->hasCommonLinkage() ||
-          RealGValue->hasAvailableExternallyLinkage() ||
+      MOSymbol = getSymbol(GValue);
+      if (GValue->isDeclaration() || GValue->hasCommonLinkage() ||
+          GValue->hasAvailableExternallyLinkage() ||
           TM.getCodeModel() == CodeModel::Large)
         MOSymbol = lookUpOrCreateTOCEntry(MOSymbol);
     }
@@ -456,19 +448,15 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
     TmpInst.setOpcode(PPC::ADDI8);
     const MachineOperand &MO = MI->getOperand(2);
     assert((MO.isGlobal() || MO.isCPI()) && "Invalid operand for ADDItocL");
-    MCSymbol *MOSymbol = 0;
+    MCSymbol *MOSymbol = nullptr;
     bool IsExternal = false;
     bool IsFunction = false;
 
     if (MO.isGlobal()) {
-      const GlobalValue *GValue = MO.getGlobal();
-      const GlobalAlias *GAlias = dyn_cast<GlobalAlias>(GValue);
-      const GlobalValue *RealGValue =
-          GAlias ? GAlias->getAliasedGlobal() : GValue;
-      MOSymbol = getSymbol(RealGValue);
-      const GlobalVariable *GVar = dyn_cast<GlobalVariable>(RealGValue);
-      IsExternal = GVar && !GVar->hasInitializer();
-      IsFunction = !GVar;
+      const GlobalValue *GV = MO.getGlobal();
+      MOSymbol = getSymbol(GV);
+      IsExternal = GV->isDeclaration();
+      IsFunction = GV->getType()->getElementType()->isFunctionTy();
     } else if (MO.isCPI())
       MOSymbol = GetCPISymbol(MO.getIndex());
 
diff --git a/lib/Target/PowerPC/PPCBranchSelector.cpp b/lib/Target/PowerPC/PPCBranchSelector.cpp
index 9276211..ee90671 100644
--- a/lib/Target/PowerPC/PPCBranchSelector.cpp
+++ b/lib/Target/PowerPC/PPCBranchSelector.cpp
@@ -15,7 +15,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "ppc-branch-select"
 #include "PPC.h"
 #include "MCTargetDesc/PPCPredicates.h"
 #include "PPCInstrBuilder.h"
@@ -26,6 +25,8 @@
 #include "llvm/Target/TargetMachine.h"
 using namespace llvm;
 
+#define DEBUG_TYPE "ppc-branch-select"
+
 STATISTIC(NumExpanded, "Number of branches expanded to long format");
 
 namespace llvm {
@@ -42,9 +43,9 @@ namespace {
     /// BlockSizes - The sizes of the basic blocks in the function.
     std::vector<unsigned> BlockSizes;
 
-    virtual bool runOnMachineFunction(MachineFunction &Fn);
+    bool runOnMachineFunction(MachineFunction &Fn) override;
 
-    virtual const char *getPassName() const {
+    const char *getPassName() const override {
       return "PowerPC Branch Selector";
     }
   };
@@ -112,7 +113,7 @@ bool PPCBSel::runOnMachineFunction(MachineFunction &Fn) {
       unsigned MBBStartOffset = 0;
       for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
            I != E; ++I) {
-        MachineBasicBlock *Dest = 0;
+        MachineBasicBlock *Dest = nullptr;
         if (I->getOpcode() == PPC::BCC && !I->getOperand(2).isImm())
           Dest = I->getOperand(2).getMBB();
         else if ((I->getOpcode() == PPC::BC || I->getOpcode() == PPC::BCn) &&
diff --git a/lib/Target/PowerPC/PPCCTRLoops.cpp b/lib/Target/PowerPC/PPCCTRLoops.cpp
index 9c5db50..ec1e34d 100644
--- a/lib/Target/PowerPC/PPCCTRLoops.cpp
+++ b/lib/Target/PowerPC/PPCCTRLoops.cpp
@@ -23,8 +23,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "ctrloops"
-
 #include "llvm/Transforms/Scalar.h"
 #include "PPC.h"
 #include "PPCTargetMachine.h"
@@ -61,6 +59,8 @@
 
 using namespace llvm;
 
+#define DEBUG_TYPE "ctrloops"
+
 #ifndef NDEBUG
 static cl::opt<int> CTRLoopLimit("ppc-max-ctrloop", cl::Hidden, cl::init(-1));
 #endif
@@ -84,16 +84,16 @@ namespace {
   public:
     static char ID;
 
-    PPCCTRLoops() : FunctionPass(ID), TM(0) {
+    PPCCTRLoops() : FunctionPass(ID), TM(nullptr) {
       initializePPCCTRLoopsPass(*PassRegistry::getPassRegistry());
     }
     PPCCTRLoops(PPCTargetMachine &TM) : FunctionPass(ID), TM(&TM) {
       initializePPCCTRLoopsPass(*PassRegistry::getPassRegistry());
     }
 
-    virtual bool runOnFunction(Function &F);
+    bool runOnFunction(Function &F) override;
 
-    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
       AU.addRequired<LoopInfo>();
       AU.addPreserved<LoopInfo>();
       AU.addRequired<DominatorTreeWrapperPass>();
@@ -128,12 +128,12 @@ namespace {
       initializePPCCTRLoopsVerifyPass(*PassRegistry::getPassRegistry());
     }
 
-    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
       AU.addRequired<MachineDominatorTree>();
       MachineFunctionPass::getAnalysisUsage(AU);
     }
 
-    virtual bool runOnMachineFunction(MachineFunction &MF);
+    bool runOnMachineFunction(MachineFunction &MF) override;
 
   private:
     MachineDominatorTree *MDT;
@@ -172,7 +172,7 @@ bool PPCCTRLoops::runOnFunction(Function &F) {
   SE = &getAnalysis<ScalarEvolution>();
   DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
   DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
-  DL = DLP ? &DLP->getDataLayout() : 0;
+  DL = DLP ? &DLP->getDataLayout() : nullptr;
   LibInfo = getAnalysisIfAvailable<TargetLibraryInfo>();
 
   bool MadeChange = false;
@@ -370,6 +370,14 @@ bool PPCCTRLoops::mightUseCTR(const Triple &TT, BasicBlock *BB) {
                 J->getOpcode() == Instruction::URem ||
                 J->getOpcode() == Instruction::SRem)) {
       return true;
+    } else if (TT.isArch32Bit() &&
+               isLargeIntegerTy(false, J->getType()->getScalarType()) &&
+               (J->getOpcode() == Instruction::Shl ||
+                J->getOpcode() == Instruction::AShr ||
+                J->getOpcode() == Instruction::LShr)) {
+      // Only on PPC32, for 128-bit integers (specifically not 64-bit
+      // integers), these might be runtime calls.
+      return true;
     } else if (isa<IndirectBrInst>(J) || isa<InvokeInst>(J)) {
       // On PowerPC, indirect jumps use the counter register.
       return true;
@@ -424,9 +432,9 @@ bool PPCCTRLoops::convertToCTRLoop(Loop *L) {
   SmallVector<BasicBlock*, 4> ExitingBlocks;
   L->getExitingBlocks(ExitingBlocks);
 
-  BasicBlock *CountedExitBlock = 0;
-  const SCEV *ExitCount = 0;
-  BranchInst *CountedExitBranch = 0;
+  BasicBlock *CountedExitBlock = nullptr;
+  const SCEV *ExitCount = nullptr;
+  BranchInst *CountedExitBranch = nullptr;
   for (SmallVectorImpl<BasicBlock *>::iterator I = ExitingBlocks.begin(),
        IE = ExitingBlocks.end(); I != IE; ++I) {
     const SCEV *EC = SE->getExitCount(L, *I);
diff --git a/lib/Target/PowerPC/PPCCodeEmitter.cpp b/lib/Target/PowerPC/PPCCodeEmitter.cpp
index 84fc888..0875523 100644
--- a/lib/Target/PowerPC/PPCCodeEmitter.cpp
+++ b/lib/Target/PowerPC/PPCCodeEmitter.cpp
@@ -32,7 +32,7 @@ namespace {
     JITCodeEmitter &MCE;
     MachineModuleInfo *MMI;
     
-    void getAnalysisUsage(AnalysisUsage &AU) const {
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
       AU.addRequired<MachineModuleInfo>();
       MachineFunctionPass::getAnalysisUsage(AU);
     }
@@ -73,11 +73,13 @@ namespace {
     unsigned getTLSRegEncoding(const MachineInstr &MI, unsigned OpNo) const;
     unsigned getTLSCallEncoding(const MachineInstr &MI, unsigned OpNo) const;
 
-    const char *getPassName() const { return "PowerPC Machine Code Emitter"; }
+    const char *getPassName() const override {
+      return "PowerPC Machine Code Emitter";
+    }
 
     /// runOnMachineFunction - emits the given MachineFunction to memory
     ///
-    bool runOnMachineFunction(MachineFunction &MF);
+    bool runOnMachineFunction(MachineFunction &MF) override;
 
     /// emitBasicBlock - emits the given MachineBasicBlock to memory
     ///
@@ -102,7 +104,7 @@ bool PPCCodeEmitter::runOnMachineFunction(MachineFunction &MF) {
   MMI = &getAnalysis<MachineModuleInfo>();
   MCE.setModuleInfo(MMI);
   do {
-    MovePCtoLROffset = 0;
+    MovePCtoLROffset = nullptr;
     MCE.startFunction(MF);
     for (MachineFunction::iterator BB = MF.begin(), E = MF.end(); BB != E; ++BB)
       emitBasicBlock(*BB);
diff --git a/lib/Target/PowerPC/PPCFastISel.cpp b/lib/Target/PowerPC/PPCFastISel.cpp
index dd45683..ed3cb4d 100644
--- a/lib/Target/PowerPC/PPCFastISel.cpp
+++ b/lib/Target/PowerPC/PPCFastISel.cpp
@@ -13,7 +13,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "ppcfastisel"
 #include "PPC.h"
 #include "MCTargetDesc/PPCPredicates.h"
 #include "PPCISelLowering.h"
@@ -58,6 +57,8 @@
 //===----------------------------------------------------------------------===//
 using namespace llvm;
 
+#define DEBUG_TYPE "ppcfastisel"
+
 namespace {
 
 typedef struct Address {
@@ -85,7 +86,7 @@ class PPCFastISel final : public FastISel {
   const TargetMachine &TM;
   const TargetInstrInfo &TII;
   const TargetLowering &TLI;
-  const PPCSubtarget &PPCSubTarget;
+  const PPCSubtarget *PPCSubTarget;
   LLVMContext *Context;
 
   public:
@@ -95,31 +96,29 @@ class PPCFastISel final : public FastISel {
       TM(FuncInfo.MF->getTarget()),
       TII(*TM.getInstrInfo()),
       TLI(*TM.getTargetLowering()),
-      PPCSubTarget(
-       *((static_cast<const PPCTargetMachine *>(&TM))->getSubtargetImpl())
-      ),
+      PPCSubTarget(&TM.getSubtarget<PPCSubtarget>()),
       Context(&FuncInfo.Fn->getContext()) { }
 
   // Backend specific FastISel code.
   private:
-    virtual bool TargetSelectInstruction(const Instruction *I);
-    virtual unsigned TargetMaterializeConstant(const Constant *C);
-    virtual unsigned TargetMaterializeAlloca(const AllocaInst *AI);
-    virtual bool tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
-                                     const LoadInst *LI);
-    virtual bool FastLowerArguments();
-    virtual unsigned FastEmit_i(MVT Ty, MVT RetTy, unsigned Opc, uint64_t Imm);
-    virtual unsigned FastEmitInst_ri(unsigned MachineInstOpcode,
-                                     const TargetRegisterClass *RC,
-                                     unsigned Op0, bool Op0IsKill,
-                                     uint64_t Imm);
-    virtual unsigned FastEmitInst_r(unsigned MachineInstOpcode,
-                                    const TargetRegisterClass *RC,
-                                    unsigned Op0, bool Op0IsKill);
-    virtual unsigned FastEmitInst_rr(unsigned MachineInstOpcode,
-                                     const TargetRegisterClass *RC,
-                                     unsigned Op0, bool Op0IsKill,
-                                     unsigned Op1, bool Op1IsKill);
+    bool TargetSelectInstruction(const Instruction *I) override;
+    unsigned TargetMaterializeConstant(const Constant *C) override;
+    unsigned TargetMaterializeAlloca(const AllocaInst *AI) override;
+    bool tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
+                             const LoadInst *LI) override;
+    bool FastLowerArguments() override;
+    unsigned FastEmit_i(MVT Ty, MVT RetTy, unsigned Opc, uint64_t Imm) override;
+    unsigned FastEmitInst_ri(unsigned MachineInstOpcode,
+                             const TargetRegisterClass *RC,
+                             unsigned Op0, bool Op0IsKill,
+                             uint64_t Imm);
+    unsigned FastEmitInst_r(unsigned MachineInstOpcode,
+                            const TargetRegisterClass *RC,
+                            unsigned Op0, bool Op0IsKill);
+    unsigned FastEmitInst_rr(unsigned MachineInstOpcode,
+                             const TargetRegisterClass *RC,
+                             unsigned Op0, bool Op0IsKill,
+                             unsigned Op1, bool Op1IsKill);
 
   // Instruction selection routines.
   private:
@@ -282,7 +281,7 @@ bool PPCFastISel::isLoadTypeLegal(Type *Ty, MVT &VT) {
 // Given a value Obj, create an Address object Addr that represents its
 // address.  Return false if we can't handle it.
 bool PPCFastISel::PPCComputeAddress(const Value *Obj, Address &Addr) {
-  const User *U = NULL;
+  const User *U = nullptr;
   unsigned Opcode = Instruction::UserOp1;
   if (const Instruction *I = dyn_cast<Instruction>(Obj)) {
     // Don't walk into other basic blocks unless the object is an alloca from
@@ -556,7 +555,7 @@ bool PPCFastISel::SelectLoad(const Instruction *I) {
   // to constrain RA from using R0/X0 when this is not legal.
   unsigned AssignedReg = FuncInfo.ValueMap[I];
   const TargetRegisterClass *RC =
-    AssignedReg ? MRI.getRegClass(AssignedReg) : 0;
+    AssignedReg ? MRI.getRegClass(AssignedReg) : nullptr;
 
   unsigned ResultReg = 0;
   if (!PPCEmitLoad(VT, ResultReg, Addr, RC))
@@ -739,7 +738,7 @@ bool PPCFastISel::PPCEmitCmp(const Value *SrcValue1, const Value *SrcValue2,
     return false;
   MVT SrcVT = SrcEVT.getSimpleVT();
 
-  if (SrcVT == MVT::i1 && PPCSubTarget.useCRBits())
+  if (SrcVT == MVT::i1 && PPCSubTarget->useCRBits())
     return false;
 
   // See if operand 2 is an immediate encodeable in the compare.
@@ -900,7 +899,7 @@ unsigned PPCFastISel::PPCMoveToFPReg(MVT SrcVT, unsigned SrcReg,
     if (!IsSigned) {
       LoadOpc = PPC::LFIWZX;
       Addr.Offset = 4;
-    } else if (PPCSubTarget.hasLFIWAX()) {
+    } else if (PPCSubTarget->hasLFIWAX()) {
       LoadOpc = PPC::LFIWAX;
       Addr.Offset = 4;
     }
@@ -941,7 +940,7 @@ bool PPCFastISel::SelectIToFP(const Instruction *I, bool IsSigned) {
 
   // We can only lower an unsigned convert if we have the newer
   // floating-point conversion operations.
-  if (!IsSigned && !PPCSubTarget.hasFPCVT())
+  if (!IsSigned && !PPCSubTarget->hasFPCVT())
     return false;
 
   // FIXME: For now we require the newer floating-point conversion operations
@@ -949,7 +948,7 @@ bool PPCFastISel::SelectIToFP(const Instruction *I, bool IsSigned) {
   // to single-precision float.  Otherwise we have to generate a lot of
   // fiddly code to avoid double rounding.  If necessary, the fiddly code
   // can be found in PPCTargetLowering::LowerINT_TO_FP().
-  if (DstVT == MVT::f32 && !PPCSubTarget.hasFPCVT())
+  if (DstVT == MVT::f32 && !PPCSubTarget->hasFPCVT())
     return false;
 
   // Extend the input if necessary.
@@ -1012,7 +1011,7 @@ unsigned PPCFastISel::PPCMoveToIntReg(const Instruction *I, MVT VT,
   // to determine the required register class.
   unsigned AssignedReg = FuncInfo.ValueMap[I];
   const TargetRegisterClass *RC =
-    AssignedReg ? MRI.getRegClass(AssignedReg) : 0;
+    AssignedReg ? MRI.getRegClass(AssignedReg) : nullptr;
 
   unsigned ResultReg = 0;
   if (!PPCEmitLoad(VT, ResultReg, Addr, RC, !IsSigned))
@@ -1064,7 +1063,7 @@ bool PPCFastISel::SelectFPToI(const Instruction *I, bool IsSigned) {
     if (IsSigned)
       Opc = PPC::FCTIWZ;
     else
-      Opc = PPCSubTarget.hasFPCVT() ? PPC::FCTIWUZ : PPC::FCTIDZ;
+      Opc = PPCSubTarget->hasFPCVT() ? PPC::FCTIWUZ : PPC::FCTIDZ;
   else
     Opc = IsSigned ? PPC::FCTIDZ : PPC::FCTIDUZ;
 
@@ -1863,7 +1862,7 @@ unsigned PPCFastISel::PPCMaterializeGV(const GlobalValue *GV, MVT VT) {
   if (!GVar) {
     // If GV is an alias, use the aliasee for determining thread-locality.
     if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
-      GVar = dyn_cast_or_null<GlobalVariable>(GA->getAliasedGlobal());
+      GVar = dyn_cast_or_null<GlobalVariable>(GA->getAliasee());
   }
 
   // FIXME: We don't yet handle the complexity of TLS.
@@ -2001,7 +2000,7 @@ unsigned PPCFastISel::PPCMaterialize64BitInt(int64_t Imm,
 unsigned PPCFastISel::PPCMaterializeInt(const Constant *C, MVT VT) {
   // If we're using CR bit registers for i1 values, handle that as a special
   // case first.
-  if (VT == MVT::i1 && PPCSubTarget.useCRBits()) {
+  if (VT == MVT::i1 && PPCSubTarget->useCRBits()) {
     const ConstantInt *CI = cast<ConstantInt>(C);
     unsigned ImmReg = createResultReg(&PPC::CRBITRCRegClass);
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
@@ -2149,7 +2148,7 @@ bool PPCFastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
 
   unsigned ResultReg = MI->getOperand(0).getReg();
 
-  if (!PPCEmitLoad(VT, ResultReg, Addr, 0, IsZExt))
+  if (!PPCEmitLoad(VT, ResultReg, Addr, nullptr, IsZExt))
     return false;
 
   MI->eraseFromParent();
@@ -2175,7 +2174,7 @@ unsigned PPCFastISel::FastEmit_i(MVT Ty, MVT VT, unsigned Opc, uint64_t Imm) {
 
   // If we're using CR bit registers for i1 values, handle that as a special
   // case first.
-  if (VT == MVT::i1 && PPCSubTarget.useCRBits()) {
+  if (VT == MVT::i1 && PPCSubTarget->useCRBits()) {
     unsigned ImmReg = createResultReg(&PPC::CRBITRCRegClass);
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
             TII.get(Imm == 0 ? PPC::CRUNSET : PPC::CRSET), ImmReg);
@@ -2261,6 +2260,6 @@ namespace llvm {
     if (Subtarget->isPPC64() && Subtarget->isSVR4ABI())
       return new PPCFastISel(FuncInfo, LibInfo);
 
-    return 0;
+    return nullptr;
   }
 }
diff --git a/lib/Target/PowerPC/PPCFrameLowering.cpp b/lib/Target/PowerPC/PPCFrameLowering.cpp
index d8f491f..e294156 100644
--- a/lib/Target/PowerPC/PPCFrameLowering.cpp
+++ b/lib/Target/PowerPC/PPCFrameLowering.cpp
@@ -222,7 +222,7 @@ unsigned PPCFrameLowering::determineFrameLayout(MachineFunction &MF,
   if (!DisableRedZone &&
       (Subtarget.isPPC64() ||                      // 32-bit SVR4, no stack-
        !Subtarget.isSVR4ABI() ||                   //   allocated locals.
-	FrameSize == 0) &&
+        FrameSize == 0) &&
       FrameSize <= 224 &&                          // Fits in red zone.
       !MFI->hasVarSizedObjects() &&                // No dynamic alloca.
       !MFI->adjustsStack() &&                      // No calls.
@@ -281,8 +281,8 @@ bool PPCFrameLowering::needsFP(const MachineFunction &MF) const {
 
   // Naked functions have no stack frame pushed, so we don't have a frame
   // pointer.
-  if (MF.getFunction()->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
-                                                     Attribute::Naked))
+  if (MF.getFunction()->getAttributes().hasAttribute(
+          AttributeSet::FunctionIndex, Attribute::Naked))
     return false;
 
   return MF.getTarget().Options.DisableFramePointerElim(MF) ||
@@ -426,7 +426,8 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF) const {
       assert(FPIndex && "No Frame Pointer Save Slot!");
       FPOffset = FFI->getObjectOffset(FPIndex);
     } else {
-      FPOffset = PPCFrameLowering::getFramePointerSaveOffset(isPPC64, isDarwinABI);
+      FPOffset =
+          PPCFrameLowering::getFramePointerSaveOffset(isPPC64, isDarwinABI);
     }
   }
 
@@ -562,13 +563,14 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF) const {
     assert(NegFrameSize);
     unsigned CFIIndex = MMI.addFrameInst(
         MCCFIInstruction::createDefCfaOffset(nullptr, NegFrameSize));
-    BuildMI(MBB, MBBI, dl, TII.get(PPC::CFI_INSTRUCTION)).addCFIIndex(CFIIndex);
+    BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+        .addCFIIndex(CFIIndex);
 
     if (HasFP) {
       unsigned Reg = MRI->getDwarfRegNum(FPReg, true);
       CFIIndex = MMI.addFrameInst(
           MCCFIInstruction::createOffset(nullptr, Reg, FPOffset));
-      BuildMI(MBB, MBBI, dl, TII.get(PPC::CFI_INSTRUCTION))
+      BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
           .addCFIIndex(CFIIndex);
     }
 
@@ -576,7 +578,7 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF) const {
       unsigned Reg = MRI->getDwarfRegNum(BPReg, true);
       CFIIndex = MMI.addFrameInst(
           MCCFIInstruction::createOffset(nullptr, Reg, BPOffset));
-      BuildMI(MBB, MBBI, dl, TII.get(PPC::CFI_INSTRUCTION))
+      BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
           .addCFIIndex(CFIIndex);
     }
 
@@ -584,7 +586,7 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF) const {
       unsigned Reg = MRI->getDwarfRegNum(LRReg, true);
       CFIIndex = MMI.addFrameInst(
           MCCFIInstruction::createOffset(nullptr, Reg, LROffset));
-      BuildMI(MBB, MBBI, dl, TII.get(PPC::CFI_INSTRUCTION))
+      BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
           .addCFIIndex(CFIIndex);
     }
   }
@@ -601,7 +603,7 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF) const {
       unsigned CFIIndex = MMI.addFrameInst(
           MCCFIInstruction::createDefCfaRegister(nullptr, Reg));
 
-      BuildMI(MBB, MBBI, dl, TII.get(PPC::CFI_INSTRUCTION))
+      BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
           .addCFIIndex(CFIIndex);
     }
   }
@@ -629,7 +631,7 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF) const {
       if (isSVR4ABI && isPPC64 && (PPC::CR2 <= Reg && Reg <= PPC::CR4)) {
         unsigned CFIIndex = MMI.addFrameInst(MCCFIInstruction::createOffset(
             nullptr, MRI->getDwarfRegNum(PPC::CR2, true), 8));
-        BuildMI(MBB, MBBI, dl, TII.get(PPC::CFI_INSTRUCTION))
+        BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
             .addCFIIndex(CFIIndex);
         continue;
       }
@@ -637,7 +639,7 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF) const {
       int Offset = MFI->getObjectOffset(CSI[I].getFrameIdx());
       unsigned CFIIndex = MMI.addFrameInst(MCCFIInstruction::createOffset(
           nullptr, MRI->getDwarfRegNum(Reg, true), Offset));
-      BuildMI(MBB, MBBI, dl, TII.get(PPC::CFI_INSTRUCTION))
+      BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
           .addCFIIndex(CFIIndex);
     }
   }
@@ -712,7 +714,8 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF,
       assert(FPIndex && "No Frame Pointer Save Slot!");
       FPOffset = FFI->getObjectOffset(FPIndex);
     } else {
-      FPOffset = PPCFrameLowering::getFramePointerSaveOffset(isPPC64, isDarwinABI);
+      FPOffset =
+          PPCFrameLowering::getFramePointerSaveOffset(isPPC64, isDarwinABI);
     }
   }
 
@@ -930,9 +933,9 @@ PPCFrameLowering::processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
     MFI->CreateFixedObject(-1 * TCSPDelta, TCSPDelta, true);
   }
 
-  // For 32-bit SVR4, allocate the nonvolatile CR spill slot iff the 
+  // For 32-bit SVR4, allocate the nonvolatile CR spill slot iff the
   // function uses CR 2, 3, or 4.
-  if (!isPPC64 && !isDarwinABI && 
+  if (!isPPC64 && !isDarwinABI &&
       (MRI.isPhysRegUsed(PPC::CR2) ||
        MRI.isPhysRegUsed(PPC::CR3) ||
        MRI.isPhysRegUsed(PPC::CR4))) {
@@ -1106,10 +1109,10 @@ void PPCFrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF,
       unsigned Reg = CSI[i].getReg();
 
       if ((Subtarget.isSVR4ABI() && Reg == PPC::CR2)
-	  // Leave Darwin logic as-is.
-	  || (!Subtarget.isSVR4ABI() &&
-	      (PPC::CRBITRCRegClass.contains(Reg) ||
-	       PPC::CRRCRegClass.contains(Reg)))) {
+          // Leave Darwin logic as-is.
+          || (!Subtarget.isSVR4ABI() &&
+              (PPC::CRBITRCRegClass.contains(Reg) ||
+               PPC::CRRCRegClass.contains(Reg)))) {
         int FI = CSI[i].getFrameIdx();
 
         FFI->setObjectOffset(FI, LowerBound + FFI->getObjectOffset(FI));
@@ -1190,11 +1193,11 @@ PPCFrameLowering::addScavengingSpillSlot(MachineFunction &MF,
   }
 }
 
-bool 
+bool
 PPCFrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB,
-				     MachineBasicBlock::iterator MI,
-				     const std::vector<CalleeSavedInfo> &CSI,
-				     const TargetRegisterInfo *TRI) const {
+                                     MachineBasicBlock::iterator MI,
+                                     const std::vector<CalleeSavedInfo> &CSI,
+                                     const TargetRegisterInfo *TRI) const {
 
   // Currently, this function only handles SVR4 32- and 64-bit ABIs.
   // Return false otherwise to maintain pre-existing behavior.
@@ -1207,7 +1210,7 @@ PPCFrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB,
   DebugLoc DL;
   bool CRSpilled = false;
   MachineInstrBuilder CRMIB;
-  
+
   for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
     unsigned Reg = CSI[i].getReg();
     // Only Darwin actually uses the VRSAVE register, but it can still appear
@@ -1237,21 +1240,21 @@ PPCFrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB,
         CRSpilled = true;
         FuncInfo->setSpillsCR();
 
-	// 32-bit:  FP-relative.  Note that we made sure CR2-CR4 all have
-	// the same frame index in PPCRegisterInfo::hasReservedSpillSlot.
-	CRMIB = BuildMI(*MF, DL, TII.get(PPC::MFCR), PPC::R12)
+        // 32-bit:  FP-relative.  Note that we made sure CR2-CR4 all have
+        // the same frame index in PPCRegisterInfo::hasReservedSpillSlot.
+        CRMIB = BuildMI(*MF, DL, TII.get(PPC::MFCR), PPC::R12)
                   .addReg(Reg, RegState::ImplicitKill);
 
-	MBB.insert(MI, CRMIB);
-	MBB.insert(MI, addFrameReference(BuildMI(*MF, DL, TII.get(PPC::STW))
-					 .addReg(PPC::R12,
-						 getKillRegState(true)),
-					 CSI[i].getFrameIdx()));
+        MBB.insert(MI, CRMIB);
+        MBB.insert(MI, addFrameReference(BuildMI(*MF, DL, TII.get(PPC::STW))
+                                         .addReg(PPC::R12,
+                                                 getKillRegState(true)),
+                                         CSI[i].getFrameIdx()));
       }
     } else {
       const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
       TII.storeRegToStackSlot(MBB, MI, Reg, true,
-			      CSI[i].getFrameIdx(), RC, TRI);
+                              CSI[i].getFrameIdx(), RC, TRI);
     }
   }
   return true;
@@ -1260,8 +1263,8 @@ PPCFrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB,
 static void
 restoreCRs(bool isPPC64, bool is31,
            bool CR2Spilled, bool CR3Spilled, bool CR4Spilled,
-	   MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
-	   const std::vector<CalleeSavedInfo> &CSI, unsigned CSIIndex) {
+           MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+           const std::vector<CalleeSavedInfo> &CSI, unsigned CSIIndex) {
 
   MachineFunction *MF = MBB.getParent();
   const PPCInstrInfo &TII =
@@ -1275,12 +1278,12 @@ restoreCRs(bool isPPC64, bool is31,
   else {
     // 32-bit:  FP-relative
     MBB.insert(MI, addFrameReference(BuildMI(*MF, DL, TII.get(PPC::LWZ),
-					     PPC::R12),
-				     CSI[CSIIndex].getFrameIdx()));
+                                             PPC::R12),
+                                     CSI[CSIIndex].getFrameIdx()));
     RestoreOp = PPC::MTOCRF;
     MoveReg = PPC::R12;
   }
-  
+
   if (CR2Spilled)
     MBB.insert(MI, BuildMI(*MF, DL, TII.get(RestoreOp), PPC::CR2)
                .addReg(MoveReg, getKillRegState(!CR3Spilled && !CR4Spilled)));
@@ -1335,11 +1338,11 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
   MBB.erase(I);
 }
 
-bool 
+bool
 PPCFrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
-					MachineBasicBlock::iterator MI,
-				        const std::vector<CalleeSavedInfo> &CSI,
-					const TargetRegisterInfo *TRI) const {
+                                        MachineBasicBlock::iterator MI,
+                                        const std::vector<CalleeSavedInfo> &CSI,
+                                        const TargetRegisterInfo *TRI) const {
 
   // Currently, this function only handles SVR4 32- and 64-bit ABIs.
   // Return false otherwise to maintain pre-existing behavior.
@@ -1387,20 +1390,20 @@ PPCFrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
       // When we first encounter a non-CR register after seeing at
       // least one CR register, restore all spilled CRs together.
       if ((CR2Spilled || CR3Spilled || CR4Spilled)
-	  && !(PPC::CR2 <= Reg && Reg <= PPC::CR4)) {
+          && !(PPC::CR2 <= Reg && Reg <= PPC::CR4)) {
         bool is31 = needsFP(*MF);
         restoreCRs(Subtarget.isPPC64(), is31,
                    CR2Spilled, CR3Spilled, CR4Spilled,
-		   MBB, I, CSI, CSIIndex);
-	CR2Spilled = CR3Spilled = CR4Spilled = false;
+                   MBB, I, CSI, CSIIndex);
+        CR2Spilled = CR3Spilled = CR4Spilled = false;
       }
 
       // Default behavior for non-CR saves.
       const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
       TII.loadRegFromStackSlot(MBB, I, Reg, CSI[i].getFrameIdx(),
-			       RC, TRI);
+                               RC, TRI);
       assert(I != MBB.begin() &&
-	     "loadRegFromStackSlot didn't insert any code!");
+             "loadRegFromStackSlot didn't insert any code!");
       }
 
     // Insert in reverse order.
@@ -1409,16 +1412,15 @@ PPCFrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
     else {
       I = BeforeI;
       ++I;
-    }	    
+    }
   }
 
   // If we haven't yet spilled the CRs, do so now.
   if (CR2Spilled || CR3Spilled || CR4Spilled) {
-    bool is31 = needsFP(*MF); 
+    bool is31 = needsFP(*MF);
     restoreCRs(Subtarget.isPPC64(), is31, CR2Spilled, CR3Spilled, CR4Spilled,
-	       MBB, I, CSI, CSIIndex);
+               MBB, I, CSI, CSIIndex);
   }
 
   return true;
 }
-
diff --git a/lib/Target/PowerPC/PPCFrameLowering.h b/lib/Target/PowerPC/PPCFrameLowering.h
index 7aab37e..94e9b67 100644
--- a/lib/Target/PowerPC/PPCFrameLowering.h
+++ b/lib/Target/PowerPC/PPCFrameLowering.h
@@ -38,37 +38,37 @@ public:
 
   /// emitProlog/emitEpilog - These methods insert prolog and epilog code into
   /// the function.
-  void emitPrologue(MachineFunction &MF) const;
-  void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;
+  void emitPrologue(MachineFunction &MF) const override;
+  void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
 
-  bool hasFP(const MachineFunction &MF) const;
+  bool hasFP(const MachineFunction &MF) const override;
   bool needsFP(const MachineFunction &MF) const;
   void replaceFPWithRealFP(MachineFunction &MF) const;
 
   void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
-                                            RegScavenger *RS = NULL) const;
+                                     RegScavenger *RS = nullptr) const override;
   void processFunctionBeforeFrameFinalized(MachineFunction &MF,
-                                       RegScavenger *RS = NULL) const;
+                                     RegScavenger *RS = nullptr) const override;
   void addScavengingSpillSlot(MachineFunction &MF, RegScavenger *RS) const;
 
   bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
                                  MachineBasicBlock::iterator MI,
                                  const std::vector<CalleeSavedInfo> &CSI,
-                                 const TargetRegisterInfo *TRI) const;
+                                 const TargetRegisterInfo *TRI) const override;
 
   void eliminateCallFramePseudoInstr(MachineFunction &MF,
-                                     MachineBasicBlock &MBB,
-                                     MachineBasicBlock::iterator I) const;
+                                  MachineBasicBlock &MBB,
+                                  MachineBasicBlock::iterator I) const override;
 
   bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
-                                   MachineBasicBlock::iterator MI,
-                                   const std::vector<CalleeSavedInfo> &CSI,
-                                   const TargetRegisterInfo *TRI) const;
+                                  MachineBasicBlock::iterator MI,
+                                  const std::vector<CalleeSavedInfo> &CSI,
+                                  const TargetRegisterInfo *TRI) const override;
 
   /// targetHandlesStackFrameRounding - Returns true if the target is
   /// responsible for rounding up the stack frame (probably at emitPrologue
   /// time).
-  bool targetHandlesStackFrameRounding() const { return true; }
+  bool targetHandlesStackFrameRounding() const override { return true; }
 
   /// getReturnSaveOffset - Return the previous frame offset to save the
   /// return address.
@@ -141,7 +141,7 @@ public:
 
   // With the SVR4 ABI, callee-saved registers have fixed offsets on the stack.
   const SpillSlot *
-  getCalleeSavedSpillSlots(unsigned &NumEntries) const {
+  getCalleeSavedSpillSlots(unsigned &NumEntries) const override {
     if (Subtarget.isDarwinABI()) {
       NumEntries = 1;
       if (Subtarget.isPPC64()) {
@@ -156,7 +156,7 @@ public:
     // Early exit if not using the SVR4 ABI.
     if (!Subtarget.isSVR4ABI()) {
       NumEntries = 0;
-      return 0;
+      return nullptr;
     }
 
     // Note that the offsets here overlap, but this is fixed up in
diff --git a/lib/Target/PowerPC/PPCHazardRecognizers.cpp b/lib/Target/PowerPC/PPCHazardRecognizers.cpp
index 37c85b3..7ca706b 100644
--- a/lib/Target/PowerPC/PPCHazardRecognizers.cpp
+++ b/lib/Target/PowerPC/PPCHazardRecognizers.cpp
@@ -11,7 +11,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "pre-RA-sched"
 #include "PPCHazardRecognizers.h"
 #include "PPC.h"
 #include "PPCInstrInfo.h"
@@ -22,6 +21,8 @@
 #include "llvm/Support/raw_ostream.h"
 using namespace llvm;
 
+#define DEBUG_TYPE "pre-RA-sched"
+
 bool PPCDispatchGroupSBHazardRecognizer::isLoadAfterStore(SUnit *SU) {
   // FIXME: Move this.
   if (isBCTRAfterSet(SU))
@@ -226,7 +227,7 @@ void PPCDispatchGroupSBHazardRecognizer::EmitNoop() {
     CurGroup.clear();
     CurSlots = CurBranches = 0;
   } else {
-    CurGroup.push_back(0);
+    CurGroup.push_back(nullptr);
     ++CurSlots;
   }
 }
diff --git a/lib/Target/PowerPC/PPCHazardRecognizers.h b/lib/Target/PowerPC/PPCHazardRecognizers.h
index 6b7fe41..cf4332c 100644
--- a/lib/Target/PowerPC/PPCHazardRecognizers.h
+++ b/lib/Target/PowerPC/PPCHazardRecognizers.h
@@ -37,14 +37,14 @@ public:
     ScoreboardHazardRecognizer(ItinData, DAG_), DAG(DAG_),
     CurSlots(0), CurBranches(0) {}
 
-  virtual HazardType getHazardType(SUnit *SU, int Stalls);
-  virtual bool ShouldPreferAnother(SUnit* SU);
-  virtual unsigned PreEmitNoops(SUnit *SU);
-  virtual void EmitInstruction(SUnit *SU);
-  virtual void AdvanceCycle();
-  virtual void RecedeCycle();
-  virtual void Reset();
-  virtual void EmitNoop();
+  HazardType getHazardType(SUnit *SU, int Stalls) override;
+  bool ShouldPreferAnother(SUnit* SU) override;
+  unsigned PreEmitNoops(SUnit *SU) override;
+  void EmitInstruction(SUnit *SU) override;
+  void AdvanceCycle() override;
+  void RecedeCycle() override;
+  void Reset() override;
+  void EmitNoop() override;
 };
 
 /// PPCHazardRecognizer970 - This class defines a finite state automata that
@@ -76,10 +76,10 @@ class PPCHazardRecognizer970 : public ScheduleHazardRecognizer {
 
 public:
   PPCHazardRecognizer970(const TargetMachine &TM);
-  virtual HazardType getHazardType(SUnit *SU, int Stalls);
-  virtual void EmitInstruction(SUnit *SU);
-  virtual void AdvanceCycle();
-  virtual void Reset();
+  virtual HazardType getHazardType(SUnit *SU, int Stalls) override;
+  virtual void EmitInstruction(SUnit *SU) override;
+  virtual void AdvanceCycle() override;
+  virtual void Reset() override;
 
 private:
   /// EndDispatchGroup - Called when we are finishing a new dispatch group.
diff --git a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
index 3bbc839..251e8b6 100644
--- a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -12,7 +12,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "ppc-codegen"
 #include "PPC.h"
 #include "MCTargetDesc/PPCPredicates.h"
 #include "PPCTargetMachine.h"
@@ -35,6 +34,8 @@
 #include "llvm/Target/TargetOptions.h"
 using namespace llvm;
 
+#define DEBUG_TYPE "ppc-codegen"
+
 // FIXME: Remove this once the bug has been fixed!
 cl::opt<bool> ANDIGlueBug("expose-ppc-andi-glue-bug",
 cl::desc("expose the ANDI glue bug on PPC"), cl::Hidden);
@@ -50,29 +51,31 @@ namespace {
   ///
   class PPCDAGToDAGISel : public SelectionDAGISel {
     const PPCTargetMachine &TM;
-    const PPCTargetLowering &PPCLowering;
-    const PPCSubtarget &PPCSubTarget;
+    const PPCTargetLowering *PPCLowering;
+    const PPCSubtarget *PPCSubTarget;
     unsigned GlobalBaseReg;
   public:
     explicit PPCDAGToDAGISel(PPCTargetMachine &tm)
       : SelectionDAGISel(tm), TM(tm),
-        PPCLowering(*TM.getTargetLowering()),
-        PPCSubTarget(*TM.getSubtargetImpl()) {
+        PPCLowering(TM.getTargetLowering()),
+        PPCSubTarget(TM.getSubtargetImpl()) {
       initializePPCDAGToDAGISelPass(*PassRegistry::getPassRegistry());
     }
 
-    virtual bool runOnMachineFunction(MachineFunction &MF) {
+    bool runOnMachineFunction(MachineFunction &MF) override {
       // Make sure we re-emit a set of the global base reg if necessary
       GlobalBaseReg = 0;
+      PPCLowering = TM.getTargetLowering();
+      PPCSubTarget = TM.getSubtargetImpl();
       SelectionDAGISel::runOnMachineFunction(MF);
 
-      if (!PPCSubTarget.isSVR4ABI())
+      if (!PPCSubTarget->isSVR4ABI())
         InsertVRSaveCode(MF);
 
       return true;
     }
 
-    virtual void PostprocessISelDAG();
+    void PostprocessISelDAG() override;
 
     /// getI32Imm - Return a target constant with the specified value, of type
     /// i32.
@@ -88,7 +91,7 @@ namespace {
 
     /// getSmallIPtrImm - Return a target constant of pointer type.
     inline SDValue getSmallIPtrImm(unsigned Imm) {
-      return CurDAG->getTargetConstant(Imm, PPCLowering.getPointerTy());
+      return CurDAG->getTargetConstant(Imm, PPCLowering->getPointerTy());
     }
 
     /// isRunOfOnes - Returns true iff Val consists of one contiguous run of 1s
@@ -109,7 +112,7 @@ namespace {
 
     // Select - Convert the specified operand from a target-independent to a
     // target-specific node if it hasn't already been changed.
-    SDNode *Select(SDNode *N);
+    SDNode *Select(SDNode *N) override;
 
     SDNode *SelectBitfieldInsert(SDNode *N);
 
@@ -121,7 +124,7 @@ namespace {
     /// a base register plus a signed 16-bit displacement [r+imm].
     bool SelectAddrImm(SDValue N, SDValue &Disp,
                        SDValue &Base) {
-      return PPCLowering.SelectAddressRegImm(N, Disp, Base, *CurDAG, false);
+      return PPCLowering->SelectAddressRegImm(N, Disp, Base, *CurDAG, false);
     }
 
     /// SelectAddrImmOffs - Return true if the operand is valid for a preinc
@@ -141,20 +144,20 @@ namespace {
     /// represented as an indexed [r+r] operation.  Returns false if it can
     /// be represented by [r+imm], which are preferred.
     bool SelectAddrIdx(SDValue N, SDValue &Base, SDValue &Index) {
-      return PPCLowering.SelectAddressRegReg(N, Base, Index, *CurDAG);
+      return PPCLowering->SelectAddressRegReg(N, Base, Index, *CurDAG);
     }
 
     /// SelectAddrIdxOnly - Given the specified addressed, force it to be
     /// represented as an indexed [r+r] operation.
     bool SelectAddrIdxOnly(SDValue N, SDValue &Base, SDValue &Index) {
-      return PPCLowering.SelectAddressRegRegOnly(N, Base, Index, *CurDAG);
+      return PPCLowering->SelectAddressRegRegOnly(N, Base, Index, *CurDAG);
     }
 
     /// SelectAddrImmX4 - Returns true if the address N can be represented by
     /// a base register plus a signed 16-bit displacement that is a multiple of 4.
     /// Suitable for use by STD and friends.
     bool SelectAddrImmX4(SDValue N, SDValue &Disp, SDValue &Base) {
-      return PPCLowering.SelectAddressRegImm(N, Disp, Base, *CurDAG, true);
+      return PPCLowering->SelectAddressRegImm(N, Disp, Base, *CurDAG, true);
     }
 
     // Select an address into a single register.
@@ -168,16 +171,16 @@ namespace {
     /// a register.  The case of adding a (possibly relocatable) constant to a
     /// register can be improved, but it is wrong to substitute Reg+Reg for
     /// Reg in an asm, because the load or store opcode would have to change.
-   virtual bool SelectInlineAsmMemoryOperand(const SDValue &Op,
-                                              char ConstraintCode,
-                                              std::vector<SDValue> &OutOps) {
+   bool SelectInlineAsmMemoryOperand(const SDValue &Op,
+                                      char ConstraintCode,
+                                      std::vector<SDValue> &OutOps) override {
       OutOps.push_back(Op);
       return false;
     }
 
     void InsertVRSaveCode(MachineFunction &MF);
 
-    virtual const char *getPassName() const {
+    const char *getPassName() const override {
       return "PowerPC DAG->DAG Pattern Instruction Selection";
     }
 
@@ -188,7 +191,7 @@ private:
     SDNode *SelectSETCC(SDNode *N);
 
     void PeepholePPC64();
-    void PeepholdCROps();
+    void PeepholeCROps();
 
     bool AllUsersSelectZero(SDNode *N);
     void SwapAllSelectUsers(SDNode *N);
@@ -271,7 +274,7 @@ SDNode *PPCDAGToDAGISel::getGlobalBaseReg() {
     MachineBasicBlock::iterator MBBI = FirstMBB.begin();
     DebugLoc dl;
 
-    if (PPCLowering.getPointerTy() == MVT::i32) {
+    if (PPCLowering->getPointerTy() == MVT::i32) {
       GlobalBaseReg = RegInfo->createVirtualRegister(&PPC::GPRC_NOR0RegClass);
       BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MovePCtoLR));
       BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MFLR), GlobalBaseReg);
@@ -282,7 +285,7 @@ SDNode *PPCDAGToDAGISel::getGlobalBaseReg() {
     }
   }
   return CurDAG->getRegister(GlobalBaseReg,
-                             PPCLowering.getPointerTy()).getNode();
+                             PPCLowering->getPointerTy()).getNode();
 }
 
 /// isIntS16Immediate - This method tests to see if the node is either a 32-bit
@@ -414,8 +417,8 @@ SDNode *PPCDAGToDAGISel::SelectBitfieldInsert(SDNode *N) {
   SDLoc dl(N);
 
   APInt LKZ, LKO, RKZ, RKO;
-  CurDAG->ComputeMaskedBits(Op0, LKZ, LKO);
-  CurDAG->ComputeMaskedBits(Op1, RKZ, RKO);
+  CurDAG->computeKnownBits(Op0, LKZ, LKO);
+  CurDAG->computeKnownBits(Op1, RKZ, RKO);
 
   unsigned TargetMask = LKZ.getZExtValue();
   unsigned InsertMask = RKZ.getZExtValue();
@@ -458,11 +461,18 @@ SDNode *PPCDAGToDAGISel::SelectBitfieldInsert(SDNode *N) {
         SH  = (Op1Opc == ISD::SHL) ? Value : 32 - Value;
       }
       if (Op1Opc == ISD::AND) {
+       // The AND mask might not be a constant, and we need to make sure that
+       // if we're going to fold the masking with the insert, all bits not
+       // know to be zero in the mask are known to be one.
+        APInt MKZ, MKO;
+        CurDAG->computeKnownBits(Op1.getOperand(1), MKZ, MKO);
+        bool CanFoldMask = InsertMask == MKO.getZExtValue();
+
         unsigned SHOpc = Op1.getOperand(0).getOpcode();
-        if ((SHOpc == ISD::SHL || SHOpc == ISD::SRL) &&
+        if ((SHOpc == ISD::SHL || SHOpc == ISD::SRL) && CanFoldMask &&
             isInt32Immediate(Op1.getOperand(0).getOperand(1), Value)) {
-	  // Note that Value must be in range here (less than 32) because
-	  // otherwise there would not be any bits set in InsertMask.
+          // Note that Value must be in range here (less than 32) because
+          // otherwise there would not be any bits set in InsertMask.
           Op1 = Op1.getOperand(0).getOperand(0);
           SH  = (SHOpc == ISD::SHL) ? Value : 32 - Value;
         }
@@ -474,7 +484,7 @@ SDNode *PPCDAGToDAGISel::SelectBitfieldInsert(SDNode *N) {
       return CurDAG->getMachineNode(PPC::RLWIMI, dl, MVT::i32, Ops);
     }
   }
-  return 0;
+  return nullptr;
 }
 
 /// SelectCC - Select a comparison of the specified values with the specified
@@ -572,7 +582,7 @@ SDValue PPCDAGToDAGISel::SelectCC(SDValue LHS, SDValue RHS,
     Opc = PPC::FCMPUS;
   } else {
     assert(LHS.getValueType() == MVT::f64 && "Unknown vt!");
-    Opc = PPCSubTarget.hasVSX() ? PPC::XSCMPUDP : PPC::FCMPUD;
+    Opc = PPCSubTarget->hasVSX() ? PPC::XSCMPUDP : PPC::FCMPUD;
   }
   return SDValue(CurDAG->getMachineNode(Opc, dl, MVT::i32, LHS, RHS), 0);
 }
@@ -738,7 +748,7 @@ SDNode *PPCDAGToDAGISel::SelectSETCC(SDNode *N) {
   EVT PtrVT = CurDAG->getTargetLoweringInfo().getPointerTy();
   bool isPPC64 = (PtrVT == MVT::i64);
 
-  if (!PPCSubTarget.useCRBits() &&
+  if (!PPCSubTarget->useCRBits() &&
       isInt32Immediate(N->getOperand(1), Imm)) {
     // We can codegen setcc op, imm very efficiently compared to a brcond.
     // Check for those cases here.
@@ -750,7 +760,7 @@ SDNode *PPCDAGToDAGISel::SelectSETCC(SDNode *N) {
       case ISD::SETEQ: {
         Op = SDValue(CurDAG->getMachineNode(PPC::CNTLZW, dl, MVT::i32, Op), 0);
         SDValue Ops[] = { Op, getI32Imm(27), getI32Imm(5), getI32Imm(31) };
-        return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops, 4);
+        return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops);
       }
       case ISD::SETNE: {
         if (isPPC64) break;
@@ -762,14 +772,14 @@ SDNode *PPCDAGToDAGISel::SelectSETCC(SDNode *N) {
       }
       case ISD::SETLT: {
         SDValue Ops[] = { Op, getI32Imm(1), getI32Imm(31), getI32Imm(31) };
-        return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops, 4);
+        return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops);
       }
       case ISD::SETGT: {
         SDValue T =
           SDValue(CurDAG->getMachineNode(PPC::NEG, dl, MVT::i32, Op), 0);
         T = SDValue(CurDAG->getMachineNode(PPC::ANDC, dl, MVT::i32, T, Op), 0);
         SDValue Ops[] = { T, getI32Imm(1), getI32Imm(31), getI32Imm(31) };
-        return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops, 4);
+        return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops);
       }
       }
     } else if (Imm == ~0U) {        // setcc op, -1
@@ -799,7 +809,7 @@ SDNode *PPCDAGToDAGISel::SelectSETCC(SDNode *N) {
         SDValue AN = SDValue(CurDAG->getMachineNode(PPC::AND, dl, MVT::i32, AD,
                                                     Op), 0);
         SDValue Ops[] = { AN, getI32Imm(1), getI32Imm(31), getI32Imm(31) };
-        return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops, 4);
+        return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops);
       }
       case ISD::SETGT: {
         SDValue Ops[] = { Op, getI32Imm(1), getI32Imm(31), getI32Imm(31) };
@@ -820,7 +830,7 @@ SDNode *PPCDAGToDAGISel::SelectSETCC(SDNode *N) {
   if (LHS.getValueType().isVector()) {
     EVT VecVT = LHS.getValueType();
     MVT::SimpleValueType VT = VecVT.getSimpleVT().SimpleTy;
-    unsigned int VCmpInst = getVCmpInst(VT, CC, PPCSubTarget.hasVSX());
+    unsigned int VCmpInst = getVCmpInst(VT, CC, PPCSubTarget->hasVSX());
 
     switch (CC) {
       case ISD::SETEQ:
@@ -831,7 +841,7 @@ SDNode *PPCDAGToDAGISel::SelectSETCC(SDNode *N) {
       case ISD::SETONE:
       case ISD::SETUNE: {
         SDValue VCmp(CurDAG->getMachineNode(VCmpInst, dl, VecVT, LHS, RHS), 0);
-        return CurDAG->SelectNodeTo(N, PPCSubTarget.hasVSX() ? PPC::XXLNOR :
+        return CurDAG->SelectNodeTo(N, PPCSubTarget->hasVSX() ? PPC::XXLNOR :
                                                                PPC::VNOR,
                                     VecVT, VCmp, VCmp);
       } 
@@ -853,9 +863,9 @@ SDNode *PPCDAGToDAGISel::SelectSETCC(SDNode *N) {
           return CurDAG->SelectNodeTo(N, VCmpInst, VecVT, LHS, RHS);
         } else {
           SDValue VCmpGT(CurDAG->getMachineNode(VCmpInst, dl, VecVT, LHS, RHS), 0);
-          unsigned int VCmpEQInst = getVCmpEQInst(VT, PPCSubTarget.hasVSX());
+          unsigned int VCmpEQInst = getVCmpEQInst(VT, PPCSubTarget->hasVSX());
           SDValue VCmpEQ(CurDAG->getMachineNode(VCmpEQInst, dl, VecVT, LHS, RHS), 0);
-          return CurDAG->SelectNodeTo(N, PPCSubTarget.hasVSX() ? PPC::XXLOR :
+          return CurDAG->SelectNodeTo(N, PPCSubTarget->hasVSX() ? PPC::XXLOR :
                                                                  PPC::VOR,
                                       VecVT, VCmpGT, VCmpEQ);
         }
@@ -864,9 +874,9 @@ SDNode *PPCDAGToDAGISel::SelectSETCC(SDNode *N) {
       case ISD::SETOLE:
       case ISD::SETULE: {
         SDValue VCmpLE(CurDAG->getMachineNode(VCmpInst, dl, VecVT, RHS, LHS), 0);
-        unsigned int VCmpEQInst = getVCmpEQInst(VT, PPCSubTarget.hasVSX());
+        unsigned int VCmpEQInst = getVCmpEQInst(VT, PPCSubTarget->hasVSX());
         SDValue VCmpEQ(CurDAG->getMachineNode(VCmpEQInst, dl, VecVT, LHS, RHS), 0);
-        return CurDAG->SelectNodeTo(N, PPCSubTarget.hasVSX() ? PPC::XXLOR :
+        return CurDAG->SelectNodeTo(N, PPCSubTarget->hasVSX() ? PPC::XXLOR :
                                                                PPC::VOR,
                                     VecVT, VCmpLE, VCmpEQ);
       }
@@ -875,8 +885,8 @@ SDNode *PPCDAGToDAGISel::SelectSETCC(SDNode *N) {
     }
   }
 
-  if (PPCSubTarget.useCRBits())
-    return 0;
+  if (PPCSubTarget->useCRBits())
+    return nullptr;
 
   bool Inv;
   unsigned Idx = getCRIdxForSetCC(CC, Inv);
@@ -886,7 +896,7 @@ SDNode *PPCDAGToDAGISel::SelectSETCC(SDNode *N) {
   // Force the ccreg into CR7.
   SDValue CR7Reg = CurDAG->getRegister(PPC::CR7, MVT::i32);
 
-  SDValue InFlag(0, 0);  // Null incoming flag value.
+  SDValue InFlag(nullptr, 0);  // Null incoming flag value.
   CCReg = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, CR7Reg, CCReg,
                                InFlag).getValue(1);
 
@@ -896,7 +906,7 @@ SDNode *PPCDAGToDAGISel::SelectSETCC(SDNode *N) {
   SDValue Ops[] = { IntCR, getI32Imm((32-(3-Idx)) & 31),
                       getI32Imm(31), getI32Imm(31) };
   if (!Inv)
-    return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops, 4);
+    return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops);
 
   // Get the specified bit.
   SDValue Tmp =
@@ -911,7 +921,7 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
   SDLoc dl(N);
   if (N->isMachineOpcode()) {
     N->setNodeId(-1);
-    return NULL;   // Already selected.
+    return nullptr;   // Already selected.
   }
 
   switch (N->getOpcode()) {
@@ -1093,7 +1103,7 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
       SDValue Base = LD->getBasePtr();
       SDValue Ops[] = { Offset, Base, Chain };
       return CurDAG->getMachineNode(Opcode, dl, LD->getValueType(0),
-                                    PPCLowering.getPointerTy(),
+                                    PPCLowering->getPointerTy(),
                                     MVT::Other, Ops);
     } else {
       unsigned Opcode;
@@ -1128,7 +1138,7 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
       SDValue Base = LD->getBasePtr();
       SDValue Ops[] = { Base, Offset, Chain };
       return CurDAG->getMachineNode(Opcode, dl, LD->getValueType(0),
-                                    PPCLowering.getPointerTy(),
+                                    PPCLowering->getPointerTy(),
                                     MVT::Other, Ops);
     }
   }
@@ -1143,7 +1153,7 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
         isRotateAndMask(N->getOperand(0).getNode(), Imm, false, SH, MB, ME)) {
       SDValue Val = N->getOperand(0).getOperand(0);
       SDValue Ops[] = { Val, getI32Imm(SH), getI32Imm(MB), getI32Imm(ME) };
-      return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops, 4);
+      return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops);
     }
     // If this is just a masked value where the input is not handled above, and
     // is not a rotate-left (handled by a pattern in the .td file), emit rlwinm
@@ -1152,7 +1162,7 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
         N->getOperand(0).getOpcode() != ISD::ROTL) {
       SDValue Val = N->getOperand(0);
       SDValue Ops[] = { Val, getI32Imm(0), getI32Imm(MB), getI32Imm(ME) };
-      return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops, 4);
+      return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops);
     }
     // If this is a 64-bit zero-extension mask, emit rldicl.
     if (isInt64Immediate(N->getOperand(1).getNode(), Imm64) &&
@@ -1174,12 +1184,12 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
       }
 
       SDValue Ops[] = { Val, getI32Imm(SH), getI32Imm(MB) };
-      return CurDAG->SelectNodeTo(N, PPC::RLDICL, MVT::i64, Ops, 3);
+      return CurDAG->SelectNodeTo(N, PPC::RLDICL, MVT::i64, Ops);
     }
     // AND X, 0 -> 0, not "rlwinm 32".
     if (isInt32Immediate(N->getOperand(1), Imm) && (Imm == 0)) {
       ReplaceUses(SDValue(N, 0), N->getOperand(1));
-      return NULL;
+      return nullptr;
     }
     // ISD::OR doesn't get all the bitfield insertion fun.
     // (and (or x, c1), c2) where isRunOfOnes(~(c1^c2)) is a bitfield insert
@@ -1212,7 +1222,7 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
         isRotateAndMask(N, Imm, true, SH, MB, ME)) {
       SDValue Ops[] = { N->getOperand(0).getOperand(0),
                           getI32Imm(SH), getI32Imm(MB), getI32Imm(ME) };
-      return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops, 4);
+      return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops);
     }
 
     // Other cases are autogenerated.
@@ -1224,7 +1234,7 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
         isRotateAndMask(N, Imm, true, SH, MB, ME)) {
       SDValue Ops[] = { N->getOperand(0).getOperand(0),
                           getI32Imm(SH), getI32Imm(MB), getI32Imm(ME) };
-      return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops, 4);
+      return CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops);
     }
 
     // Other cases are autogenerated.
@@ -1259,7 +1269,7 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
     bool isPPC64 = (PtrVT == MVT::i64);
 
     // If this is a select of i1 operands, we'll pattern match it.
-    if (PPCSubTarget.useCRBits() &&
+    if (PPCSubTarget->useCRBits() &&
         N->getOperand(0).getValueType() == MVT::i1)
       break;
 
@@ -1327,17 +1337,17 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
 
     SDValue Ops[] = { CCReg, N->getOperand(2), N->getOperand(3),
                         getI32Imm(BROpc) };
-    return CurDAG->SelectNodeTo(N, SelectCCOp, N->getValueType(0), Ops, 4);
+    return CurDAG->SelectNodeTo(N, SelectCCOp, N->getValueType(0), Ops);
   }
   case ISD::VSELECT:
-    if (PPCSubTarget.hasVSX()) {
+    if (PPCSubTarget->hasVSX()) {
       SDValue Ops[] = { N->getOperand(2), N->getOperand(1), N->getOperand(0) };
-      return CurDAG->SelectNodeTo(N, PPC::XXSEL, N->getValueType(0), Ops, 3);
+      return CurDAG->SelectNodeTo(N, PPC::XXSEL, N->getValueType(0), Ops);
     }
 
     break;
   case ISD::VECTOR_SHUFFLE:
-    if (PPCSubTarget.hasVSX() && (N->getValueType(0) == MVT::v2f64 ||
+    if (PPCSubTarget->hasVSX() && (N->getValueType(0) == MVT::v2f64 ||
                                   N->getValueType(0) == MVT::v2i64)) {
       ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);
       
@@ -1364,23 +1374,23 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
           SDValue Chain = LD->getChain();
           SDValue Ops[] = { Base, Offset, Chain };
           return CurDAG->SelectNodeTo(N, PPC::LXVDSX,
-                                      N->getValueType(0), Ops, 3);
+                                      N->getValueType(0), Ops);
         }
       }
 
       SDValue Ops[] = { Op1, Op2, DMV };
-      return CurDAG->SelectNodeTo(N, PPC::XXPERMDI, N->getValueType(0), Ops, 3);
+      return CurDAG->SelectNodeTo(N, PPC::XXPERMDI, N->getValueType(0), Ops);
     }
 
     break;
   case PPCISD::BDNZ:
   case PPCISD::BDZ: {
-    bool IsPPC64 = PPCSubTarget.isPPC64();
+    bool IsPPC64 = PPCSubTarget->isPPC64();
     SDValue Ops[] = { N->getOperand(1), N->getOperand(0) };
     return CurDAG->SelectNodeTo(N, N->getOpcode() == PPCISD::BDNZ ?
                                    (IsPPC64 ? PPC::BDNZ8 : PPC::BDNZ) :
                                    (IsPPC64 ? PPC::BDZ8 : PPC::BDZ),
-                                MVT::Other, Ops, 2);
+                                MVT::Other, Ops);
   }
   case PPCISD::COND_BRANCH: {
     // Op #0 is the Chain.
@@ -1393,7 +1403,7 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
       getI32Imm(cast<ConstantSDNode>(N->getOperand(1))->getZExtValue());
     SDValue Ops[] = { Pred, N->getOperand(2), N->getOperand(3),
       N->getOperand(0), N->getOperand(4) };
-    return CurDAG->SelectNodeTo(N, PPC::BCC, MVT::Other, Ops, 5);
+    return CurDAG->SelectNodeTo(N, PPC::BCC, MVT::Other, Ops);
   }
   case ISD::BR_CC: {
     ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(1))->get();
@@ -1422,7 +1432,7 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
     SDValue CondCode = SelectCC(N->getOperand(2), N->getOperand(3), CC, dl);
     SDValue Ops[] = { getI32Imm(PCC), CondCode,
                         N->getOperand(4), N->getOperand(0) };
-    return CurDAG->SelectNodeTo(N, PPC::BCC, MVT::Other, Ops, 4);
+    return CurDAG->SelectNodeTo(N, PPC::BCC, MVT::Other, Ops);
   }
   case ISD::BRIND: {
     // FIXME: Should custom lower this.
@@ -1435,7 +1445,7 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
     return CurDAG->SelectNodeTo(N, Reg, MVT::Other, Chain);
   }
   case PPCISD::TOC_ENTRY: {
-    assert (PPCSubTarget.isPPC64() && "Only supported for 64-bit ABI");
+    assert (PPCSubTarget->isPPC64() && "Only supported for 64-bit ABI");
 
     // For medium and large code model, we generate two instructions as
     // described below.  Otherwise we allow SelectCodeCommon to handle this,
@@ -1462,18 +1472,8 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) {
 
     if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(GA)) {
       const GlobalValue *GValue = G->getGlobal();
-      const GlobalAlias *GAlias = dyn_cast<GlobalAlias>(GValue);
-      const GlobalValue *RealGValue =
-          GAlias ? GAlias->getAliasedGlobal() : GValue;
-      const GlobalVariable *GVar = dyn_cast<GlobalVariable>(RealGValue);
-      assert((GVar || isa<Function>(RealGValue)) &&
-             "Unexpected global value subclass!");
-
-      // An external variable is one without an initializer.  For these,
-      // for variables with common linkage, and for Functions, generate
-      // the LDtocL form.
-      if (!GVar || !GVar->hasInitializer() || RealGValue->hasCommonLinkage() ||
-          RealGValue->hasAvailableExternallyLinkage())
+      if (GValue->isDeclaration() || GValue->hasCommonLinkage() ||
+        GValue->hasAvailableExternallyLinkage())
         return CurDAG->getMachineNode(PPC::LDtocL, dl, MVT::i64, GA,
                                       SDValue(Tmp, 0));
     }
@@ -1566,7 +1566,7 @@ void PPCDAGToDAGISel::PostprocessISelDAG() {
     return;
 
   PeepholePPC64();
-  PeepholdCROps();
+  PeepholeCROps();
 }
 
 // Check if all users of this node will become isel where the second operand
@@ -1576,7 +1576,7 @@ void PPCDAGToDAGISel::PostprocessISelDAG() {
 // containing zero.
 bool PPCDAGToDAGISel::AllUsersSelectZero(SDNode *N) {
   // If we're not using isel, then this does not matter.
-  if (!PPCSubTarget.hasISEL())
+  if (!PPCSubTarget->hasISEL())
     return false;
 
   for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
@@ -1637,7 +1637,7 @@ void PPCDAGToDAGISel::SwapAllSelectUsers(SDNode *N) {
   }
 }
 
-void PPCDAGToDAGISel::PeepholdCROps() {
+void PPCDAGToDAGISel::PeepholeCROps() {
   bool IsModified;
   do {
     IsModified = false;
@@ -2038,7 +2038,7 @@ void PPCDAGToDAGISel::PeepholdCROps() {
 
 void PPCDAGToDAGISel::PeepholePPC64() {
   // These optimizations are currently supported only for 64-bit SVR4.
-  if (PPCSubTarget.isDarwin() || !PPCSubTarget.isPPC64())
+  if (PPCSubTarget->isDarwin() || !PPCSubTarget->isPPC64())
     return;
 
   SelectionDAG::allnodes_iterator Position(CurDAG->getRoot().getNode());
@@ -2196,8 +2196,8 @@ FunctionPass *llvm::createPPCISelDag(PPCTargetMachine &TM) {
 
 static void initializePassOnce(PassRegistry &Registry) {
   const char *Name = "PowerPC DAG->DAG Pattern Instruction Selection";
-  PassInfo *PI = new PassInfo(Name, "ppc-codegen", &SelectionDAGISel::ID, 0,
-                              false, false);
+  PassInfo *PI = new PassInfo(Name, "ppc-codegen", &SelectionDAGISel::ID,
+                              nullptr, false, false);
   Registry.registerPass(*PI, true);
 }
 
diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp
index 32ac1dc..cf4c9e6 100644
--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -18,6 +18,7 @@
 #include "PPCTargetMachine.h"
 #include "PPCTargetObjectFile.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringSwitch.h"
 #include "llvm/CodeGen/CallingConvLower.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
@@ -459,6 +460,7 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM)
       setOperationAction(ISD::SDIVREM, VT, Expand);
       setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Expand);
       setOperationAction(ISD::FPOW, VT, Expand);
+      setOperationAction(ISD::BSWAP, VT, Expand);
       setOperationAction(ISD::CTPOP, VT, Expand);
       setOperationAction(ISD::CTLZ, VT, Expand);
       setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand);
@@ -758,7 +760,7 @@ unsigned PPCTargetLowering::getByValTypeAlignment(Type *Ty) const {
 
 const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
   switch (Opcode) {
-  default: return 0;
+  default: return nullptr;
   case PPCISD::FSEL:            return "PPCISD::FSEL";
   case PPCISD::FCFID:           return "PPCISD::FCFID";
   case PPCISD::FCTIDZ:          return "PPCISD::FCTIDZ";
@@ -929,7 +931,7 @@ bool PPC::isVMRGHShuffleMask(ShuffleVectorSDNode *N, unsigned UnitSize,
 /// amount, otherwise return -1.
 int PPC::isVSLDOIShuffleMask(SDNode *N, bool isUnary) {
   if (N->getValueType(0) != MVT::v16i8)
-    return false;
+    return -1;
 
   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
 
@@ -1019,7 +1021,7 @@ unsigned PPC::getVSPLTImmediate(SDNode *N, unsigned EltSize) {
 /// the constant being splatted.  The ByteSize field indicates the number of
 /// bytes of each element [124] -> [bhw].
 SDValue PPC::get_VSPLTI_elt(SDNode *N, unsigned ByteSize, SelectionDAG &DAG) {
-  SDValue OpVal(0, 0);
+  SDValue OpVal(nullptr, 0);
 
   // If ByteSize of the splat is bigger than the element size of the
   // build_vector, then we have a case where we are checking for a splat where
@@ -1038,7 +1040,7 @@ SDValue PPC::get_VSPLTI_elt(SDNode *N, unsigned ByteSize, SelectionDAG &DAG) {
       if (!isa<ConstantSDNode>(N->getOperand(i))) return SDValue();
 
 
-      if (UniquedVals[i&(Multiple-1)].getNode() == 0)
+      if (!UniquedVals[i&(Multiple-1)].getNode())
         UniquedVals[i&(Multiple-1)] = N->getOperand(i);
       else if (UniquedVals[i&(Multiple-1)] != N->getOperand(i))
         return SDValue();  // no match.
@@ -1053,21 +1055,21 @@ SDValue PPC::get_VSPLTI_elt(SDNode *N, unsigned ByteSize, SelectionDAG &DAG) {
     bool LeadingZero = true;
     bool LeadingOnes = true;
     for (unsigned i = 0; i != Multiple-1; ++i) {
-      if (UniquedVals[i].getNode() == 0) continue;  // Must have been undefs.
+      if (!UniquedVals[i].getNode()) continue;  // Must have been undefs.
 
       LeadingZero &= cast<ConstantSDNode>(UniquedVals[i])->isNullValue();
       LeadingOnes &= cast<ConstantSDNode>(UniquedVals[i])->isAllOnesValue();
     }
     // Finally, check the least significant entry.
     if (LeadingZero) {
-      if (UniquedVals[Multiple-1].getNode() == 0)
+      if (!UniquedVals[Multiple-1].getNode())
         return DAG.getTargetConstant(0, MVT::i32);  // 0,0,0,undef
       int Val = cast<ConstantSDNode>(UniquedVals[Multiple-1])->getZExtValue();
       if (Val < 16)
         return DAG.getTargetConstant(Val, MVT::i32);  // 0,0,0,4 -> vspltisw(4)
     }
     if (LeadingOnes) {
-      if (UniquedVals[Multiple-1].getNode() == 0)
+      if (!UniquedVals[Multiple-1].getNode())
         return DAG.getTargetConstant(~0U, MVT::i32);  // -1,-1,-1,undef
       int Val =cast<ConstantSDNode>(UniquedVals[Multiple-1])->getSExtValue();
       if (Val >= -16)                            // -1,-1,-1,-2 -> vspltisw(-2)
@@ -1080,13 +1082,13 @@ SDValue PPC::get_VSPLTI_elt(SDNode *N, unsigned ByteSize, SelectionDAG &DAG) {
   // Check to see if this buildvec has a single non-undef value in its elements.
   for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
     if (N->getOperand(i).getOpcode() == ISD::UNDEF) continue;
-    if (OpVal.getNode() == 0)
+    if (!OpVal.getNode())
       OpVal = N->getOperand(i);
     else if (OpVal != N->getOperand(i))
       return SDValue();
   }
 
-  if (OpVal.getNode() == 0) return SDValue();  // All UNDEF: use implicit def.
+  if (!OpVal.getNode()) return SDValue();  // All UNDEF: use implicit def.
 
   unsigned ValSizeInBytes = EltSize;
   uint64_t Value = 0;
@@ -1135,7 +1137,7 @@ SDValue PPC::get_VSPLTI_elt(SDNode *N, unsigned ByteSize, SelectionDAG &DAG) {
 /// sign extension from a 16-bit value.  If so, this returns true and the
 /// immediate.
 static bool isIntS16Immediate(SDNode *N, short &Imm) {
-  if (N->getOpcode() != ISD::Constant)
+  if (!isa<ConstantSDNode>(N))
     return false;
 
   Imm = (short)cast<ConstantSDNode>(N)->getZExtValue();
@@ -1174,12 +1176,12 @@ bool PPCTargetLowering::SelectAddressRegReg(SDValue N, SDValue &Base,
     // disjoint.
     APInt LHSKnownZero, LHSKnownOne;
     APInt RHSKnownZero, RHSKnownOne;
-    DAG.ComputeMaskedBits(N.getOperand(0),
-                          LHSKnownZero, LHSKnownOne);
+    DAG.computeKnownBits(N.getOperand(0),
+                         LHSKnownZero, LHSKnownOne);
 
     if (LHSKnownZero.getBoolValue()) {
-      DAG.ComputeMaskedBits(N.getOperand(1),
-                            RHSKnownZero, RHSKnownOne);
+      DAG.computeKnownBits(N.getOperand(1),
+                           RHSKnownZero, RHSKnownOne);
       // If all of the bits are known zero on the LHS or RHS, the add won't
       // carry.
       if (~(LHSKnownZero | RHSKnownZero) == 0) {
@@ -1279,7 +1281,7 @@ bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp,
       // (for better address arithmetic) if the LHS and RHS of the OR are
       // provably disjoint.
       APInt LHSKnownZero, LHSKnownOne;
-      DAG.ComputeMaskedBits(N.getOperand(0), LHSKnownZero, LHSKnownOne);
+      DAG.computeKnownBits(N.getOperand(0), LHSKnownZero, LHSKnownOne);
 
       if ((LHSKnownZero.getZExtValue()|~(uint64_t)imm) == ~0ULL) {
         // If all of the bits are known zero on the LHS or RHS, the add won't
@@ -1439,7 +1441,8 @@ bool PPCTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
 /// GetLabelAccessInfo - Return true if we should reference labels using a
 /// PICBase, set the HiOpFlags and LoOpFlags to the target MO flags.
 static bool GetLabelAccessInfo(const TargetMachine &TM, unsigned &HiOpFlags,
-                               unsigned &LoOpFlags, const GlobalValue *GV = 0) {
+                               unsigned &LoOpFlags,
+                               const GlobalValue *GV = nullptr) {
   HiOpFlags = PPCII::MO_HA;
   LoOpFlags = PPCII::MO_LO;
 
@@ -1885,17 +1888,12 @@ SDValue PPCTargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
   Entry.Node = Nest; Args.push_back(Entry);
 
   // Lower to a call to __trampoline_setup(Trmp, TrampSize, FPtr, ctx_reg)
-  TargetLowering::CallLoweringInfo CLI(Chain,
-                                       Type::getVoidTy(*DAG.getContext()),
-                                       false, false, false, false, 0,
-                                       CallingConv::C,
-                /*isTailCall=*/false,
-                                       /*doesNotRet=*/false,
-                                       /*isReturnValueUsed=*/true,
-                DAG.getExternalSymbol("__trampoline_setup", PtrVT),
-                Args, DAG, dl);
-  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
+  TargetLowering::CallLoweringInfo CLI(DAG);
+  CLI.setDebugLoc(dl).setChain(Chain)
+    .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()),
+               DAG.getExternalSymbol("__trampoline_setup", PtrVT), &Args, 0);
 
+  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
   return CallResult.second;
 }
 
@@ -2016,7 +2014,7 @@ bool llvm::CC_PPC32_SVR4_Custom_AlignArgRegs(unsigned &ValNo, MVT &ValVT,
                                              CCValAssign::LocInfo &LocInfo,
                                              ISD::ArgFlagsTy &ArgFlags,
                                              CCState &State) {
-  static const uint16_t ArgRegs[] = {
+  static const MCPhysReg ArgRegs[] = {
     PPC::R3, PPC::R4, PPC::R5, PPC::R6,
     PPC::R7, PPC::R8, PPC::R9, PPC::R10,
   };
@@ -2043,7 +2041,7 @@ bool llvm::CC_PPC32_SVR4_Custom_AlignFPArgRegs(unsigned &ValNo, MVT &ValVT,
                                                CCValAssign::LocInfo &LocInfo,
                                                ISD::ArgFlagsTy &ArgFlags,
                                                CCState &State) {
-  static const uint16_t ArgRegs[] = {
+  static const MCPhysReg ArgRegs[] = {
     PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5, PPC::F6, PPC::F7,
     PPC::F8
   };
@@ -2067,8 +2065,8 @@ bool llvm::CC_PPC32_SVR4_Custom_AlignFPArgRegs(unsigned &ValNo, MVT &ValVT,
 
 /// GetFPR - Get the set of FP registers that should be allocated for arguments,
 /// on Darwin.
-static const uint16_t *GetFPR() {
-  static const uint16_t FPR[] = {
+static const MCPhysReg *GetFPR() {
+  static const MCPhysReg FPR[] = {
     PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5, PPC::F6, PPC::F7,
     PPC::F8, PPC::F9, PPC::F10, PPC::F11, PPC::F12, PPC::F13
   };
@@ -2265,13 +2263,13 @@ PPCTargetLowering::LowerFormalArguments_32SVR4(
   // If the function takes variable number of arguments, make a frame index for
   // the start of the first vararg value... for expansion of llvm.va_start.
   if (isVarArg) {
-    static const uint16_t GPArgRegs[] = {
+    static const MCPhysReg GPArgRegs[] = {
       PPC::R3, PPC::R4, PPC::R5, PPC::R6,
       PPC::R7, PPC::R8, PPC::R9, PPC::R10,
     };
     const unsigned NumGPArgRegs = array_lengthof(GPArgRegs);
 
-    static const uint16_t FPArgRegs[] = {
+    static const MCPhysReg FPArgRegs[] = {
       PPC::F1, PPC::F2, PPC::F3, PPC::F4, PPC::F5, PPC::F6, PPC::F7,
       PPC::F8
     };
@@ -2333,8 +2331,7 @@ PPCTargetLowering::LowerFormalArguments_32SVR4(
   }
 
   if (!MemOps.empty())
-    Chain = DAG.getNode(ISD::TokenFactor, dl,
-                        MVT::Other, &MemOps[0], MemOps.size());
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
 
   return Chain;
 }
@@ -2405,18 +2402,18 @@ PPCTargetLowering::LowerFormalArguments_64SVR4(
   // Area that is at least reserved in caller of this function.
   unsigned MinReservedArea = ArgOffset;
 
-  static const uint16_t GPR[] = {
+  static const MCPhysReg GPR[] = {
     PPC::X3, PPC::X4, PPC::X5, PPC::X6,
     PPC::X7, PPC::X8, PPC::X9, PPC::X10,
   };
 
-  static const uint16_t *FPR = GetFPR();
+  static const MCPhysReg *FPR = GetFPR();
 
-  static const uint16_t VR[] = {
+  static const MCPhysReg VR[] = {
     PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
     PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
   };
-  static const uint16_t VSRH[] = {
+  static const MCPhysReg VSRH[] = {
     PPC::VSH2, PPC::VSH3, PPC::VSH4, PPC::VSH5, PPC::VSH6, PPC::VSH7, PPC::VSH8,
     PPC::VSH9, PPC::VSH10, PPC::VSH11, PPC::VSH12, PPC::VSH13
   };
@@ -2683,8 +2680,7 @@ PPCTargetLowering::LowerFormalArguments_64SVR4(
   }
 
   if (!MemOps.empty())
-    Chain = DAG.getNode(ISD::TokenFactor, dl,
-                        MVT::Other, &MemOps[0], MemOps.size());
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
 
   return Chain;
 }
@@ -2714,18 +2710,18 @@ PPCTargetLowering::LowerFormalArguments_Darwin(
   // Area that is at least reserved in caller of this function.
   unsigned MinReservedArea = ArgOffset;
 
-  static const uint16_t GPR_32[] = {           // 32-bit registers.
+  static const MCPhysReg GPR_32[] = {           // 32-bit registers.
     PPC::R3, PPC::R4, PPC::R5, PPC::R6,
     PPC::R7, PPC::R8, PPC::R9, PPC::R10,
   };
-  static const uint16_t GPR_64[] = {           // 64-bit registers.
+  static const MCPhysReg GPR_64[] = {           // 64-bit registers.
     PPC::X3, PPC::X4, PPC::X5, PPC::X6,
     PPC::X7, PPC::X8, PPC::X9, PPC::X10,
   };
 
-  static const uint16_t *FPR = GetFPR();
+  static const MCPhysReg *FPR = GetFPR();
 
-  static const uint16_t VR[] = {
+  static const MCPhysReg VR[] = {
     PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
     PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
   };
@@ -2736,7 +2732,7 @@ PPCTargetLowering::LowerFormalArguments_Darwin(
 
   unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0;
 
-  const uint16_t *GPR = isPPC64 ? GPR_64 : GPR_32;
+  const MCPhysReg *GPR = isPPC64 ? GPR_64 : GPR_32;
 
   // In 32-bit non-varargs functions, the stack space for vectors is after the
   // stack space for non-vectors.  We do not use this space unless we have
@@ -3039,8 +3035,7 @@ PPCTargetLowering::LowerFormalArguments_Darwin(
   }
 
   if (!MemOps.empty())
-    Chain = DAG.getNode(ISD::TokenFactor, dl,
-                        MVT::Other, &MemOps[0], MemOps.size());
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
 
   return Chain;
 }
@@ -3174,12 +3169,12 @@ PPCTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
 /// 32-bit value is representable in the immediate field of a BxA instruction.
 static SDNode *isBLACompatibleAddress(SDValue Op, SelectionDAG &DAG) {
   ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
-  if (!C) return 0;
+  if (!C) return nullptr;
 
   int Addr = C->getZExtValue();
   if ((Addr & 3) != 0 ||  // Low 2 bits are implicitly zero.
       SignExtend32<26>(Addr) != Addr)
-    return 0;  // Top 6 bits have to be sext of immediate.
+    return nullptr;  // Top 6 bits have to be sext of immediate.
 
   return DAG.getConstant((int)C->getZExtValue() >> 2,
                          DAG.getTargetLoweringInfo().getPointerTy()).getNode();
@@ -3315,8 +3310,8 @@ CreateCopyOfByValArgument(SDValue Src, SDValue Dst, SDValue Chain,
                           SDLoc dl) {
   SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), MVT::i32);
   return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
-                       false, false, MachinePointerInfo(0),
-                       MachinePointerInfo(0));
+                       false, false, MachinePointerInfo(),
+                       MachinePointerInfo());
 }
 
 /// LowerMemOpCallTo - Store the argument to the stack or remember it in case of
@@ -3361,8 +3356,7 @@ void PrepareTailCall(SelectionDAG &DAG, SDValue &InFlag, SDValue &Chain,
   StoreTailCallArgumentsToStackSlot(DAG, Chain, TailCallArguments,
                                     MemOpChains2, dl);
   if (!MemOpChains2.empty())
-    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
-                        &MemOpChains2[0], MemOpChains2.size());
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
 
   // Store the return address to the appropriate stack slot.
   Chain = EmitTailCallStoreFPAndRetAddr(DAG, MF, Chain, LROp, FPOp, SPDiff,
@@ -3476,8 +3470,8 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag,
       // Load the address of the function entry point from the function
       // descriptor.
       SDVTList VTs = DAG.getVTList(MVT::i64, MVT::Other, MVT::Glue);
-      SDValue LoadFuncPtr = DAG.getNode(PPCISD::LOAD, dl, VTs, MTCTROps,
-                                        InFlag.getNode() ? 3 : 2);
+      SDValue LoadFuncPtr = DAG.getNode(PPCISD::LOAD, dl, VTs,
+                              makeArrayRef(MTCTROps, InFlag.getNode() ? 3 : 2));
       Chain = LoadFuncPtr.getValue(1);
       InFlag = LoadFuncPtr.getValue(2);
 
@@ -3513,8 +3507,8 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag,
       MTCTROps[2] = InFlag;
     }
 
-    Chain = DAG.getNode(PPCISD::MTCTR, dl, NodeTys, MTCTROps,
-                        2 + (InFlag.getNode() != 0));
+    Chain = DAG.getNode(PPCISD::MTCTR, dl, NodeTys,
+                        makeArrayRef(MTCTROps, InFlag.getNode() ? 3 : 2));
     InFlag = Chain.getValue(1);
 
     NodeTys.clear();
@@ -3522,7 +3516,7 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag,
     NodeTys.push_back(MVT::Glue);
     Ops.push_back(Chain);
     CallOpc = PPCISD::BCTRL;
-    Callee.setNode(0);
+    Callee.setNode(nullptr);
     // Add use of X11 (holding environment pointer)
     if (isSVR4ABI && isPPC64)
       Ops.push_back(DAG.getRegister(PPC::X11, PtrVT));
@@ -3650,7 +3644,7 @@ PPCTargetLowering::FinishCall(CallingConv::ID CallConv, SDLoc dl,
             isa<ConstantSDNode>(Callee)) &&
     "Expecting an global address, external symbol, absolute value or register");
 
-    return DAG.getNode(PPCISD::TC_RETURN, dl, MVT::Other, &Ops[0], Ops.size());
+    return DAG.getNode(PPCISD::TC_RETURN, dl, MVT::Other, Ops);
   }
 
   // Add a NOP immediately after the branch instruction when using the 64-bit
@@ -3683,7 +3677,7 @@ PPCTargetLowering::FinishCall(CallingConv::ID CallConv, SDLoc dl,
     }
   }
 
-  Chain = DAG.getNode(CallOpc, dl, NodeTys, &Ops[0], Ops.size());
+  Chain = DAG.getNode(CallOpc, dl, NodeTys, Ops);
   InFlag = Chain.getValue(1);
 
   if (needsTOCRestore) {
@@ -3720,6 +3714,10 @@ PPCTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
     isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, isVarArg,
                                                    Ins, DAG);
 
+  if (!isTailCall && CLI.CS && CLI.CS->isMustTailCall())
+    report_fatal_error("failed to perform tail call elimination on a call "
+                       "site marked musttail");
+
   if (PPCSubTarget.isSVR4ABI()) {
     if (PPCSubTarget.isPPC64())
       return LowerCall_64SVR4(Chain, Callee, CallConv, isVarArg,
@@ -3800,7 +3798,7 @@ PPCTargetLowering::LowerCall_32SVR4(SDValue Chain, SDValue Callee,
         errs() << "Call operand #" << i << " has unhandled type "
              << EVT(ArgVT).getEVTString() << "\n";
 #endif
-        llvm_unreachable(0);
+        llvm_unreachable(nullptr);
       }
     }
   } else {
@@ -3921,8 +3919,7 @@ PPCTargetLowering::LowerCall_32SVR4(SDValue Chain, SDValue Callee,
   }
 
   if (!MemOpChains.empty())
-    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
-                        &MemOpChains[0], MemOpChains.size());
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
 
   // Build a sequence of copy-to-reg nodes chained together with token chain
   // and flag operands which copy the outgoing args into the appropriate regs.
@@ -3940,7 +3937,7 @@ PPCTargetLowering::LowerCall_32SVR4(SDValue Chain, SDValue Callee,
     SDValue Ops[] = { Chain, InFlag };
 
     Chain = DAG.getNode(seenFloatArg ? PPCISD::CR6SET : PPCISD::CR6UNSET,
-                        dl, VTs, Ops, InFlag.getNode() ? 2 : 1);
+                        dl, VTs, makeArrayRef(Ops, InFlag.getNode() ? 2 : 1));
 
     InFlag = Chain.getValue(1);
   }
@@ -4044,17 +4041,17 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
   unsigned ArgOffset = PPCFrameLowering::getLinkageSize(true, true);
   unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0;
 
-  static const uint16_t GPR[] = {
+  static const MCPhysReg GPR[] = {
     PPC::X3, PPC::X4, PPC::X5, PPC::X6,
     PPC::X7, PPC::X8, PPC::X9, PPC::X10,
   };
-  static const uint16_t *FPR = GetFPR();
+  static const MCPhysReg *FPR = GetFPR();
 
-  static const uint16_t VR[] = {
+  static const MCPhysReg VR[] = {
     PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
     PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
   };
-  static const uint16_t VSRH[] = {
+  static const MCPhysReg VSRH[] = {
     PPC::VSH2, PPC::VSH3, PPC::VSH4, PPC::VSH5, PPC::VSH6, PPC::VSH7, PPC::VSH8,
     PPC::VSH9, PPC::VSH10, PPC::VSH11, PPC::VSH12, PPC::VSH13
   };
@@ -4333,8 +4330,7 @@ PPCTargetLowering::LowerCall_64SVR4(SDValue Chain, SDValue Callee,
   }
 
   if (!MemOpChains.empty())
-    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
-                        &MemOpChains[0], MemOpChains.size());
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
 
   // Check if this is an indirect call (MTCTR/BCTRL).
   // See PrepareCall() for more information about calls through function
@@ -4448,17 +4444,17 @@ PPCTargetLowering::LowerCall_Darwin(SDValue Chain, SDValue Callee,
   unsigned ArgOffset = PPCFrameLowering::getLinkageSize(isPPC64, true);
   unsigned GPR_idx = 0, FPR_idx = 0, VR_idx = 0;
 
-  static const uint16_t GPR_32[] = {           // 32-bit registers.
+  static const MCPhysReg GPR_32[] = {           // 32-bit registers.
     PPC::R3, PPC::R4, PPC::R5, PPC::R6,
     PPC::R7, PPC::R8, PPC::R9, PPC::R10,
   };
-  static const uint16_t GPR_64[] = {           // 64-bit registers.
+  static const MCPhysReg GPR_64[] = {           // 64-bit registers.
     PPC::X3, PPC::X4, PPC::X5, PPC::X6,
     PPC::X7, PPC::X8, PPC::X9, PPC::X10,
   };
-  static const uint16_t *FPR = GetFPR();
+  static const MCPhysReg *FPR = GetFPR();
 
-  static const uint16_t VR[] = {
+  static const MCPhysReg VR[] = {
     PPC::V2, PPC::V3, PPC::V4, PPC::V5, PPC::V6, PPC::V7, PPC::V8,
     PPC::V9, PPC::V10, PPC::V11, PPC::V12, PPC::V13
   };
@@ -4466,7 +4462,7 @@ PPCTargetLowering::LowerCall_Darwin(SDValue Chain, SDValue Callee,
   const unsigned NumFPRs = 13;
   const unsigned NumVRs  = array_lengthof(VR);
 
-  const uint16_t *GPR = isPPC64 ? GPR_64 : GPR_32;
+  const MCPhysReg *GPR = isPPC64 ? GPR_64 : GPR_32;
 
   SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
   SmallVector<TailCallArgumentInfo, 8> TailCallArguments;
@@ -4696,8 +4692,7 @@ PPCTargetLowering::LowerCall_Darwin(SDValue Chain, SDValue Callee,
   }
 
   if (!MemOpChains.empty())
-    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
-                        &MemOpChains[0], MemOpChains.size());
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
 
   // On Darwin, R12 must contain the address of an indirect callee.  This does
   // not mean the MTCTR instruction must use R12; it's easier to model this as
@@ -4785,8 +4780,7 @@ PPCTargetLowering::LowerReturn(SDValue Chain,
   if (Flag.getNode())
     RetOps.push_back(Flag);
 
-  return DAG.getNode(PPCISD::RET_FLAG, dl, MVT::Other,
-                     &RetOps[0], RetOps.size());
+  return DAG.getNode(PPCISD::RET_FLAG, dl, MVT::Other, RetOps);
 }
 
 SDValue PPCTargetLowering::LowerSTACKRESTORE(SDValue Op, SelectionDAG &DAG,
@@ -4889,7 +4883,7 @@ SDValue PPCTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
   // Build a DYNALLOC node.
   SDValue Ops[3] = { Chain, NegSize, FPSIdx };
   SDVTList VTs = DAG.getVTList(PtrVT, MVT::Other);
-  return DAG.getNode(PPCISD::DYNALLOC, dl, VTs, Ops, 3);
+  return DAG.getNode(PPCISD::DYNALLOC, dl, VTs, Ops);
 }
 
 SDValue PPCTargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
@@ -4925,7 +4919,7 @@ SDValue PPCTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
   SDValue Result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewLD);
 
   SDValue Ops[] = { Result, SDValue(NewLD.getNode(), 1) };
-  return DAG.getMergeValues(Ops, 2, dl);
+  return DAG.getMergeValues(Ops, dl);
 }
 
 SDValue PPCTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
@@ -5097,8 +5091,7 @@ SDValue PPCTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG,
       MF.getMachineMemOperand(MPI, MachineMemOperand::MOStore, 4, 4);
     SDValue Ops[] = { DAG.getEntryNode(), Tmp, FIPtr };
     Chain = DAG.getMemIntrinsicNode(PPCISD::STFIWX, dl,
-              DAG.getVTList(MVT::Other), Ops, array_lengthof(Ops),
-              MVT::i32, MMO);
+              DAG.getVTList(MVT::Other), Ops, MVT::i32, MMO);
   } else
     Chain = DAG.getStore(DAG.getEntryNode(), dl, Tmp, FIPtr,
                          MPI, false, false, 0);
@@ -5225,7 +5218,7 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op,
     Ld = DAG.getMemIntrinsicNode(Op.getOpcode() == ISD::UINT_TO_FP ?
                                    PPCISD::LFIWZX : PPCISD::LFIWAX,
                                  dl, DAG.getVTList(MVT::f64, MVT::Other),
-                                 Ops, 2, MVT::i32, MMO);
+                                 Ops, MVT::i32, MMO);
   } else {
     assert(PPCSubTarget.isPPC64() &&
            "i32->FP without LFIWAX supported only on PPC64");
@@ -5279,14 +5272,13 @@ SDValue PPCTargetLowering::LowerFLT_ROUNDS_(SDValue Op,
   MachineFunction &MF = DAG.getMachineFunction();
   EVT VT = Op.getValueType();
   EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
-  SDValue MFFSreg, InFlag;
 
   // Save FP Control Word to register
   EVT NodeTys[] = {
     MVT::f64,    // return register
     MVT::Glue    // unused in this context
   };
-  SDValue Chain = DAG.getNode(PPCISD::MFFS, dl, NodeTys, &InFlag, 0);
+  SDValue Chain = DAG.getNode(PPCISD::MFFS, dl, NodeTys, None);
 
   // Save FP register to stack slot
   int SSFI = MF.getFrameInfo()->CreateStackObject(8, 8, false);
@@ -5345,7 +5337,7 @@ SDValue PPCTargetLowering::LowerSHL_PARTS(SDValue Op, SelectionDAG &DAG) const {
   SDValue OutHi = DAG.getNode(ISD::OR, dl, VT, Tmp4, Tmp6);
   SDValue OutLo = DAG.getNode(PPCISD::SHL, dl, VT, Lo, Amt);
   SDValue OutOps[] = { OutLo, OutHi };
-  return DAG.getMergeValues(OutOps, 2, dl);
+  return DAG.getMergeValues(OutOps, dl);
 }
 
 SDValue PPCTargetLowering::LowerSRL_PARTS(SDValue Op, SelectionDAG &DAG) const {
@@ -5374,7 +5366,7 @@ SDValue PPCTargetLowering::LowerSRL_PARTS(SDValue Op, SelectionDAG &DAG) const {
   SDValue OutLo = DAG.getNode(ISD::OR, dl, VT, Tmp4, Tmp6);
   SDValue OutHi = DAG.getNode(PPCISD::SRL, dl, VT, Hi, Amt);
   SDValue OutOps[] = { OutLo, OutHi };
-  return DAG.getMergeValues(OutOps, 2, dl);
+  return DAG.getMergeValues(OutOps, dl);
 }
 
 SDValue PPCTargetLowering::LowerSRA_PARTS(SDValue Op, SelectionDAG &DAG) const {
@@ -5403,7 +5395,7 @@ SDValue PPCTargetLowering::LowerSRA_PARTS(SDValue Op, SelectionDAG &DAG) const {
   SDValue OutLo = DAG.getSelectCC(dl, Tmp5, DAG.getConstant(0, AmtVT),
                                   Tmp4, Tmp6, ISD::SETLE);
   SDValue OutOps[] = { OutLo, OutHi };
-  return DAG.getMergeValues(OutOps, 2, dl);
+  return DAG.getMergeValues(OutOps, dl);
 }
 
 //===----------------------------------------------------------------------===//
@@ -5432,8 +5424,7 @@ static SDValue BuildSplatI(int Val, unsigned SplatSize, EVT VT,
   SDValue Elt = DAG.getConstant(Val, MVT::i32);
   SmallVector<SDValue, 8> Ops;
   Ops.assign(CanonicalVT.getVectorNumElements(), Elt);
-  SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, dl, CanonicalVT,
-                              &Ops[0], Ops.size());
+  SDValue Res = DAG.getNode(ISD::BUILD_VECTOR, dl, CanonicalVT, Ops);
   return DAG.getNode(ISD::BITCAST, dl, ReqVT, Res);
 }
 
@@ -5492,7 +5483,7 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
                                              SelectionDAG &DAG) const {
   SDLoc dl(Op);
   BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(Op.getNode());
-  assert(BVN != 0 && "Expected a BuildVectorSDNode in LowerBUILD_VECTOR");
+  assert(BVN && "Expected a BuildVectorSDNode in LowerBUILD_VECTOR");
 
   // Check if this is a splat of a constant value.
   APInt APSplatBits, APSplatUndef;
@@ -5540,10 +5531,14 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
     // we convert to a pseudo that will be expanded later into one of
     // the above forms.
     SDValue Elt = DAG.getConstant(SextVal, MVT::i32);
-    EVT VT = Op.getValueType();
-    int Size = VT == MVT::v16i8 ? 1 : (VT == MVT::v8i16 ? 2 : 4);
-    SDValue EltSize = DAG.getConstant(Size, MVT::i32);
-    return DAG.getNode(PPCISD::VADD_SPLAT, dl, VT, Elt, EltSize);
+    EVT VT = (SplatSize == 1 ? MVT::v16i8 :
+              (SplatSize == 2 ? MVT::v8i16 : MVT::v4i32));
+    SDValue EltSize = DAG.getConstant(SplatSize, MVT::i32);
+    SDValue RetVal = DAG.getNode(PPCISD::VADD_SPLAT, dl, VT, Elt, EltSize);
+    if (VT == Op.getValueType())
+      return RetVal;
+    else
+      return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), RetVal);
   }
 
   // If this is 0x8000_0000 x 4, turn into vspltisw + vslw.  If it is
@@ -5838,7 +5833,7 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
   }
 
   SDValue VPermMask = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i8,
-                                    &ResultMask[0], ResultMask.size());
+                                  ResultMask);
   return DAG.getNode(PPCISD::VPERM, dl, V1.getValueType(), V1, V2, VPermMask);
 }
 
@@ -5913,7 +5908,7 @@ SDValue PPCTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
     DAG.getConstant(CompareOpc, MVT::i32)
   };
   EVT VTs[] = { Op.getOperand(2).getValueType(), MVT::Glue };
-  SDValue CompNode = DAG.getNode(PPCISD::VCMPo, dl, VTs, Ops, 3);
+  SDValue CompNode = DAG.getNode(PPCISD::VCMPo, dl, VTs, Ops);
 
   // Now that we have the comparison, emit a copy from the CR to a GPR.
   // This is flagged to the above dot comparison.
@@ -7232,8 +7227,8 @@ static bool isConsecutiveLS(LSBaseSDNode *LS, LSBaseSDNode *Base,
     return true;
 
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-  const GlobalValue *GV1 = NULL;
-  const GlobalValue *GV2 = NULL;
+  const GlobalValue *GV1 = nullptr;
+  const GlobalValue *GV2 = nullptr;
   int64_t Offset1 = 0;
   int64_t Offset2 = 0;
   bool isGA1 = TLI.isGAPlusOffset(Loc.getNode(), GV1, Offset1);
@@ -7360,8 +7355,8 @@ SDValue PPCTargetLowering::DAGCombineTruncBoolExt(SDNode *N,
       // that the high bits are equal.
       APInt Op1Zero, Op1One;
       APInt Op2Zero, Op2One;
-      DAG.ComputeMaskedBits(N->getOperand(0), Op1Zero, Op1One);
-      DAG.ComputeMaskedBits(N->getOperand(1), Op2Zero, Op2One);
+      DAG.computeKnownBits(N->getOperand(0), Op1Zero, Op1One);
+      DAG.computeKnownBits(N->getOperand(1), Op2Zero, Op2One);
 
       // We don't really care about what is known about the first bit (if
       // anything), so clear it in all masks prior to comparing them.
@@ -7579,8 +7574,7 @@ SDValue PPCTargetLowering::DAGCombineTruncBoolExt(SDNode *N,
         Ops[C+i] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Ops[C+i]);
 
     DAG.ReplaceAllUsesOfValueWith(PromOp,
-      DAG.getNode(PromOp.getOpcode(), dl, MVT::i1,
-                  Ops.data(), Ops.size()));
+      DAG.getNode(PromOp.getOpcode(), dl, MVT::i1, Ops));
   }
 
   // Now we're left with the initial truncation itself.
@@ -7816,8 +7810,7 @@ SDValue PPCTargetLowering::DAGCombineExtBoolTrunc(SDNode *N,
     }
 
     DAG.ReplaceAllUsesOfValueWith(PromOp,
-      DAG.getNode(PromOp.getOpcode(), dl, N->getValueType(0),
-                  Ops.data(), Ops.size()));
+      DAG.getNode(PromOp.getOpcode(), dl, N->getValueType(0), Ops));
   }
 
   // Now we're left with the initial extension itself.
@@ -7883,7 +7876,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
     if (N->getOperand(1).getOpcode() == ISD::FSQRT) {
       SDValue RV =
         DAGCombineFastRecipFSQRT(N->getOperand(1).getOperand(0), DCI);
-      if (RV.getNode() != 0) {
+      if (RV.getNode()) {
         DCI.AddToWorklist(RV.getNode());
         return DAG.getNode(ISD::FMUL, dl, N->getValueType(0),
                            N->getOperand(0), RV);
@@ -7893,7 +7886,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
       SDValue RV =
         DAGCombineFastRecipFSQRT(N->getOperand(1).getOperand(0).getOperand(0),
                                  DCI);
-      if (RV.getNode() != 0) {
+      if (RV.getNode()) {
         DCI.AddToWorklist(RV.getNode());
         RV = DAG.getNode(ISD::FP_EXTEND, SDLoc(N->getOperand(1)),
                          N->getValueType(0), RV);
@@ -7906,7 +7899,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
       SDValue RV =
         DAGCombineFastRecipFSQRT(N->getOperand(1).getOperand(0).getOperand(0),
                                  DCI);
-      if (RV.getNode() != 0) {
+      if (RV.getNode()) {
         DCI.AddToWorklist(RV.getNode());
         RV = DAG.getNode(ISD::FP_ROUND, SDLoc(N->getOperand(1)),
                          N->getValueType(0), RV,
@@ -7918,7 +7911,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
     }
 
     SDValue RV = DAGCombineFastRecip(N->getOperand(1), DCI);
-    if (RV.getNode() != 0) {
+    if (RV.getNode()) {
       DCI.AddToWorklist(RV.getNode());
       return DAG.getNode(ISD::FMUL, dl, N->getValueType(0),
                          N->getOperand(0), RV);
@@ -7933,10 +7926,10 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
     // Compute this as 1/(1/sqrt(X)), which is the reciprocal of the
     // reciprocal sqrt.
     SDValue RV = DAGCombineFastRecipFSQRT(N->getOperand(0), DCI);
-    if (RV.getNode() != 0) {
+    if (RV.getNode()) {
       DCI.AddToWorklist(RV.getNode());
       RV = DAGCombineFastRecip(RV, DCI);
-      if (RV.getNode() != 0) {
+      if (RV.getNode()) {
 	// Unfortunately, RV is now NaN if the input was exactly 0. Select out
 	// this case and force the answer to 0.
 
@@ -8014,7 +8007,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
       };
 
       Val = DAG.getMemIntrinsicNode(PPCISD::STFIWX, dl,
-              DAG.getVTList(MVT::Other), Ops, array_lengthof(Ops),
+              DAG.getVTList(MVT::Other), Ops,
               cast<StoreSDNode>(N)->getMemoryVT(),
               cast<StoreSDNode>(N)->getMemOperand());
       DCI.AddToWorklist(Val.getNode());
@@ -8041,8 +8034,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
       };
       return
         DAG.getMemIntrinsicNode(PPCISD::STBRX, dl, DAG.getVTList(MVT::Other),
-                                Ops, array_lengthof(Ops),
-                                cast<StoreSDNode>(N)->getMemoryVT(),
+                                Ops, cast<StoreSDNode>(N)->getMemoryVT(),
                                 cast<StoreSDNode>(N)->getMemOperand());
     }
     break;
@@ -8167,7 +8159,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
             Ops.push_back(*O);
         }
 
-        DAG.UpdateNodeOperands(User, Ops.data(), Ops.size());
+        DAG.UpdateNodeOperands(User, Ops);
       }
 
       return SDValue(N, 0);
@@ -8220,7 +8212,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
         DAG.getMemIntrinsicNode(PPCISD::LBRX, dl,
                                 DAG.getVTList(N->getValueType(0) == MVT::i64 ?
                                               MVT::i64 : MVT::i32, MVT::Other),
-                                Ops, 3, LD->getMemoryVT(), LD->getMemOperand());
+                                Ops, LD->getMemoryVT(), LD->getMemOperand());
 
       // If this is an i16 load, insert the truncate.
       SDValue ResVal = BSLoad;
@@ -8250,7 +8242,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
         !N->getOperand(2).hasOneUse()) {
 
       // Scan all of the users of the LHS, looking for VCMPo's that match.
-      SDNode *VCMPoNode = 0;
+      SDNode *VCMPoNode = nullptr;
 
       SDNode *LHSN = N->getOperand(0).getNode();
       for (SDNode::use_iterator UI = LHSN->use_begin(), E = LHSN->use_end();
@@ -8271,9 +8263,9 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
       // Look at the (necessarily single) use of the flag value.  If it has a
       // chain, this transformation is more complex.  Note that multiple things
       // could use the value result, which we should ignore.
-      SDNode *FlagUser = 0;
+      SDNode *FlagUser = nullptr;
       for (SDNode::use_iterator UI = VCMPoNode->use_begin();
-           FlagUser == 0; ++UI) {
+           FlagUser == nullptr; ++UI) {
         assert(UI != VCMPoNode->use_end() && "Didn't find user!");
         SDNode *User = *UI;
         for (unsigned i = 0, e = User->getNumOperands(); i != e; ++i) {
@@ -8378,7 +8370,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
         DAG.getConstant(CompareOpc, MVT::i32)
       };
       EVT VTs[] = { LHS.getOperand(2).getValueType(), MVT::Glue };
-      SDValue CompNode = DAG.getNode(PPCISD::VCMPo, dl, VTs, Ops, 3);
+      SDValue CompNode = DAG.getNode(PPCISD::VCMPo, dl, VTs, Ops);
 
       // Unpack the result based on how the target uses it.
       PPC::Predicate CompOpc;
@@ -8414,11 +8406,11 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
 // Inline Assembly Support
 //===----------------------------------------------------------------------===//
 
-void PPCTargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
-                                                       APInt &KnownZero,
-                                                       APInt &KnownOne,
-                                                       const SelectionDAG &DAG,
-                                                       unsigned Depth) const {
+void PPCTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
+                                                      APInt &KnownZero,
+                                                      APInt &KnownOne,
+                                                      const SelectionDAG &DAG,
+                                                      unsigned Depth) const {
   KnownZero = KnownOne = APInt(KnownZero.getBitWidth(), 0);
   switch (Op.getOpcode()) {
   default: break;
@@ -8493,7 +8485,7 @@ PPCTargetLowering::getSingleConstraintMatchWeight(
   Value *CallOperandVal = info.CallOperandVal;
     // If we don't have a value, we can't do a match,
     // but allow it at the lowest weight.
-  if (CallOperandVal == NULL)
+  if (!CallOperandVal)
     return CW_Default;
   Type *type = CallOperandVal->getType();
 
@@ -8599,7 +8591,7 @@ void PPCTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
                                                      std::string &Constraint,
                                                      std::vector<SDValue>&Ops,
                                                      SelectionDAG &DAG) const {
-  SDValue Result(0,0);
+  SDValue Result;
 
   // Only support length 1 constraints.
   if (Constraint.length() > 1) return;
@@ -8766,6 +8758,30 @@ SDValue PPCTargetLowering::LowerFRAMEADDR(SDValue Op,
   return FrameAddr;
 }
 
+// FIXME? Maybe this could be a TableGen attribute on some registers and
+// this table could be generated automatically from RegInfo.
+unsigned PPCTargetLowering::getRegisterByName(const char* RegName,
+                                              EVT VT) const {
+  bool isPPC64 = PPCSubTarget.isPPC64();
+  bool isDarwinABI = PPCSubTarget.isDarwinABI();
+
+  if ((isPPC64 && VT != MVT::i64 && VT != MVT::i32) ||
+      (!isPPC64 && VT != MVT::i32))
+    report_fatal_error("Invalid register global variable type");
+
+  bool is64Bit = isPPC64 && VT == MVT::i64;
+  unsigned Reg = StringSwitch<unsigned>(RegName)
+                   .Case("r1", is64Bit ? PPC::X1 : PPC::R1)
+                   .Case("r2", isDarwinABI ? 0 : (is64Bit ? PPC::X2 : PPC::R2))
+                   .Case("r13", (!isPPC64 && isDarwinABI) ? 0 :
+                                  (is64Bit ? PPC::X13 : PPC::R13))
+                   .Default(0);
+
+  if (Reg)
+    return Reg;
+  report_fatal_error("Invalid register name global variable");
+}
+
 bool
 PPCTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
   // The PowerPC target isn't yet aware of offsets.
@@ -8795,6 +8811,42 @@ EVT PPCTargetLowering::getOptimalMemOpType(uint64_t Size,
   }
 }
 
+/// \brief Returns true if it is beneficial to convert a load of a constant
+/// to just the constant itself.
+bool PPCTargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
+                                                          Type *Ty) const {
+  assert(Ty->isIntegerTy());
+
+  unsigned BitSize = Ty->getPrimitiveSizeInBits();
+  if (BitSize == 0 || BitSize > 64)
+    return false;
+  return true;
+}
+
+bool PPCTargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
+  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
+    return false;
+  unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
+  unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
+  return NumBits1 == 64 && NumBits2 == 32;
+}
+
+bool PPCTargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
+  if (!VT1.isInteger() || !VT2.isInteger())
+    return false;
+  unsigned NumBits1 = VT1.getSizeInBits();
+  unsigned NumBits2 = VT2.getSizeInBits();
+  return NumBits1 == 64 && NumBits2 == 32;
+}
+
+bool PPCTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
+  return isInt<16>(Imm) || isUInt<16>(Imm);
+}
+
+bool PPCTargetLowering::isLegalAddImmediate(int64_t Imm) const {
+  return isInt<16>(Imm) || isUInt<16>(Imm);
+}
+
 bool PPCTargetLowering::allowsUnalignedMemoryAccesses(EVT VT,
                                                       unsigned,
                                                       bool *Fast) const {
diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h
index da6d4dc..080ef5d 100644
--- a/lib/Target/PowerPC/PPCISelLowering.h
+++ b/lib/Target/PowerPC/PPCISelLowering.h
@@ -351,20 +351,20 @@ namespace llvm {
 
     /// getTargetNodeName() - This method returns the name of a target specific
     /// DAG node.
-    virtual const char *getTargetNodeName(unsigned Opcode) const;
+    const char *getTargetNodeName(unsigned Opcode) const override;
 
-    virtual MVT getScalarShiftAmountTy(EVT LHSTy) const { return MVT::i32; }
+    MVT getScalarShiftAmountTy(EVT LHSTy) const override { return MVT::i32; }
 
     /// getSetCCResultType - Return the ISD::SETCC ValueType
-    virtual EVT getSetCCResultType(LLVMContext &Context, EVT VT) const;
+    EVT getSetCCResultType(LLVMContext &Context, EVT VT) const override;
 
     /// getPreIndexedAddressParts - returns true by value, base pointer and
     /// offset pointer and addressing mode by reference if the node's address
     /// can be legally represented as pre-indexed load / store address.
-    virtual bool getPreIndexedAddressParts(SDNode *N, SDValue &Base,
-                                           SDValue &Offset,
-                                           ISD::MemIndexedMode &AM,
-                                           SelectionDAG &DAG) const;
+    bool getPreIndexedAddressParts(SDNode *N, SDValue &Base,
+                                   SDValue &Offset,
+                                   ISD::MemIndexedMode &AM,
+                                   SelectionDAG &DAG) const override;
 
     /// SelectAddressRegReg - Given the specified addressed, check to see if it
     /// can be represented as an indexed [r+r] operation.  Returns false if it
@@ -384,29 +384,31 @@ namespace llvm {
     bool SelectAddressRegRegOnly(SDValue N, SDValue &Base, SDValue &Index,
                                  SelectionDAG &DAG) const;
 
-    Sched::Preference getSchedulingPreference(SDNode *N) const;
+    Sched::Preference getSchedulingPreference(SDNode *N) const override;
 
     /// LowerOperation - Provide custom lowering hooks for some operations.
     ///
-    virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
 
     /// ReplaceNodeResults - Replace the results of node with an illegal result
     /// type with new values built out of custom code.
     ///
-    virtual void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue>&Results,
-                                    SelectionDAG &DAG) const;
+    void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue>&Results,
+                            SelectionDAG &DAG) const override;
 
-    virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+    SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
 
-    virtual void computeMaskedBitsForTargetNode(const SDValue Op,
-                                                APInt &KnownZero,
-                                                APInt &KnownOne,
-                                                const SelectionDAG &DAG,
-                                                unsigned Depth = 0) const;
+    unsigned getRegisterByName(const char* RegName, EVT VT) const override;
 
-    virtual MachineBasicBlock *
+    void computeKnownBitsForTargetNode(const SDValue Op,
+                                       APInt &KnownZero,
+                                       APInt &KnownOne,
+                                       const SelectionDAG &DAG,
+                                       unsigned Depth = 0) const override;
+
+    MachineBasicBlock *
       EmitInstrWithCustomInserter(MachineInstr *MI,
-                                  MachineBasicBlock *MBB) const;
+                                  MachineBasicBlock *MBB) const override;
     MachineBasicBlock *EmitAtomicBinary(MachineInstr *MI,
                                         MachineBasicBlock *MBB, bool is64Bit,
                                         unsigned BinOpcode) const;
@@ -420,34 +422,58 @@ namespace llvm {
     MachineBasicBlock *emitEHSjLjLongJmp(MachineInstr *MI,
                                          MachineBasicBlock *MBB) const;
 
-    ConstraintType getConstraintType(const std::string &Constraint) const;
+    ConstraintType
+    getConstraintType(const std::string &Constraint) const override;
 
     /// Examine constraint string and operand type and determine a weight value.
     /// The operand object must already have been set up with the operand type.
     ConstraintWeight getSingleConstraintMatchWeight(
-      AsmOperandInfo &info, const char *constraint) const;
+      AsmOperandInfo &info, const char *constraint) const override;
 
     std::pair<unsigned, const TargetRegisterClass*>
       getRegForInlineAsmConstraint(const std::string &Constraint,
-                                   MVT VT) const;
+                                   MVT VT) const override;
 
     /// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
     /// function arguments in the caller parameter area.  This is the actual
     /// alignment, not its logarithm.
-    unsigned getByValTypeAlignment(Type *Ty) const;
+    unsigned getByValTypeAlignment(Type *Ty) const override;
 
     /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
     /// vector.  If it is invalid, don't add anything to Ops.
-    virtual void LowerAsmOperandForConstraint(SDValue Op,
-                                              std::string &Constraint,
-                                              std::vector<SDValue> &Ops,
-                                              SelectionDAG &DAG) const;
+    void LowerAsmOperandForConstraint(SDValue Op,
+                                      std::string &Constraint,
+                                      std::vector<SDValue> &Ops,
+                                      SelectionDAG &DAG) const override;
 
     /// isLegalAddressingMode - Return true if the addressing mode represented
     /// by AM is legal for this target, for a load/store of the specified type.
-    virtual bool isLegalAddressingMode(const AddrMode &AM, Type *Ty)const;
+    bool isLegalAddressingMode(const AddrMode &AM, Type *Ty) const override;
+
+    /// isLegalICmpImmediate - Return true if the specified immediate is legal
+    /// icmp immediate, that is the target has icmp instructions which can
+    /// compare a register against the immediate without having to materialize
+    /// the immediate into a register.
+    bool isLegalICmpImmediate(int64_t Imm) const override;
+
+    /// isLegalAddImmediate - Return true if the specified immediate is legal
+    /// add immediate, that is the target has add instructions which can
+    /// add a register and the immediate without having to materialize
+    /// the immediate into a register.
+    bool isLegalAddImmediate(int64_t Imm) const override;
 
-    virtual bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const;
+    /// isTruncateFree - Return true if it's free to truncate a value of
+    /// type Ty1 to type Ty2. e.g. On PPC it's free to truncate a i64 value in
+    /// register X1 to i32 by referencing its sub-register R1.
+    bool isTruncateFree(Type *Ty1, Type *Ty2) const override;
+    bool isTruncateFree(EVT VT1, EVT VT2) const override;
+
+    /// \brief Returns true if it is beneficial to convert a load of a constant
+    /// to just the constant itself.
+    bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
+                                           Type *Ty) const override;
+
+    bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override;
 
     /// getOptimalMemOpType - Returns the target specific optimal type for load
     /// and store operations as a result of memset, memcpy, and memmove
@@ -460,32 +486,32 @@ namespace llvm {
     /// source is constant so it does not need to be loaded.
     /// It returns EVT::Other if the type should be determined using generic
     /// target-independent logic.
-    virtual EVT
+    EVT
     getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign,
                         bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc,
-                        MachineFunction &MF) const;
+                        MachineFunction &MF) const override;
 
     /// Is unaligned memory access allowed for the given type, and is it fast
     /// relative to software emulation.
-    virtual bool allowsUnalignedMemoryAccesses(EVT VT,
-                                               unsigned AddrSpace,
-                                               bool *Fast = 0) const;
+    bool allowsUnalignedMemoryAccesses(EVT VT,
+                                       unsigned AddrSpace,
+                                       bool *Fast = nullptr) const override;
 
     /// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster
     /// than a pair of fmul and fadd instructions. fmuladd intrinsics will be
     /// expanded to FMAs when this method returns true, otherwise fmuladd is
     /// expanded to fmul + fadd.
-    virtual bool isFMAFasterThanFMulAndFAdd(EVT VT) const;
+    bool isFMAFasterThanFMulAndFAdd(EVT VT) const override;
 
     // Should we expand the build vector with shuffles?
-    virtual bool
+    bool
     shouldExpandBuildVectorWithShuffles(EVT VT,
-                                        unsigned DefinedValues) const;
+                                        unsigned DefinedValues) const override;
 
     /// createFastISel - This method returns a target-specific FastISel object,
     /// or null if the target does not support "fast" instruction selection.
-    virtual FastISel *createFastISel(FunctionLoweringInfo &FuncInfo,
-                                     const TargetLibraryInfo *LibInfo) const;
+    FastISel *createFastISel(FunctionLoweringInfo &FuncInfo,
+                             const TargetLibraryInfo *LibInfo) const override;
 
   private:
     SDValue getFramePointerFrameIndex(SelectionDAG & DAG) const;
@@ -559,29 +585,29 @@ namespace llvm {
                        const SmallVectorImpl<ISD::InputArg> &Ins,
                        SmallVectorImpl<SDValue> &InVals) const;
 
-    virtual SDValue
+    SDValue
       LowerFormalArguments(SDValue Chain,
                            CallingConv::ID CallConv, bool isVarArg,
                            const SmallVectorImpl<ISD::InputArg> &Ins,
                            SDLoc dl, SelectionDAG &DAG,
-                           SmallVectorImpl<SDValue> &InVals) const;
+                           SmallVectorImpl<SDValue> &InVals) const override;
 
-    virtual SDValue
+    SDValue
       LowerCall(TargetLowering::CallLoweringInfo &CLI,
-                SmallVectorImpl<SDValue> &InVals) const;
+                SmallVectorImpl<SDValue> &InVals) const override;
 
-    virtual bool
+    bool
       CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
                    bool isVarArg,
                    const SmallVectorImpl<ISD::OutputArg> &Outs,
-                   LLVMContext &Context) const;
+                   LLVMContext &Context) const override;
 
-    virtual SDValue
+    SDValue
       LowerReturn(SDValue Chain,
                   CallingConv::ID CallConv, bool isVarArg,
                   const SmallVectorImpl<ISD::OutputArg> &Outs,
                   const SmallVectorImpl<SDValue> &OutVals,
-                  SDLoc dl, SelectionDAG &DAG) const;
+                  SDLoc dl, SelectionDAG &DAG) const override;
 
     SDValue
       extendArgForPPC64(ISD::ArgFlagsTy Flags, EVT ObjectVT, SelectionDAG &DAG,
diff --git a/lib/Target/PowerPC/PPCInstrAltivec.td b/lib/Target/PowerPC/PPCInstrAltivec.td
index 2fd4a3e..f3c2eab 100644
--- a/lib/Target/PowerPC/PPCInstrAltivec.td
+++ b/lib/Target/PowerPC/PPCInstrAltivec.td
@@ -223,7 +223,7 @@ class VX2_Int_Ty2<bits<11> xo, string opc, Intrinsic IntID, ValueType OutTy,
 //===----------------------------------------------------------------------===//
 // Instruction Definitions.
 
-def HasAltivec : Predicate<"PPCSubTarget.hasAltivec()">;
+def HasAltivec : Predicate<"PPCSubTarget->hasAltivec()">;
 let Predicates = [HasAltivec] in {
 
 let isCodeGenOnly = 1 in {
diff --git a/lib/Target/PowerPC/PPCInstrInfo.cpp b/lib/Target/PowerPC/PPCInstrInfo.cpp
index 939bbdc..fd72384 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.cpp
+++ b/lib/Target/PowerPC/PPCInstrInfo.cpp
@@ -35,12 +35,14 @@
 #include "llvm/Support/TargetRegistry.h"
 #include "llvm/Support/raw_ostream.h"
 
+using namespace llvm;
+
+#define DEBUG_TYPE "ppc-instr-info"
+
 #define GET_INSTRMAP_INFO
 #define GET_INSTRINFO_CTOR_DTOR
 #include "PPCGenInstrInfo.inc"
 
-using namespace llvm;
-
 static cl::
 opt<bool> DisableCTRLoopAnal("disable-ppc-ctrloop-analysis", cl::Hidden,
             cl::desc("Disable analysis for CTR loops"));
@@ -230,7 +232,7 @@ PPCInstrInfo::commuteInstruction(MachineInstr *MI, bool NewMI) const {
 
   // Cannot commute if it has a non-zero rotate count.
   if (MI->getOperand(3).getImm() != 0)
-    return 0;
+    return nullptr;
 
   // If we have a zero rotate count, we have:
   //   M = mask(MB,ME)
@@ -539,7 +541,7 @@ PPCInstrInfo::InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
   bool isPPC64 = TM.getSubtargetImpl()->isPPC64();
 
   // One-way branch.
-  if (FBB == 0) {
+  if (!FBB) {
     if (Cond.empty())   // Unconditional branch
       BuildMI(&MBB, DL, get(PPC::B)).addMBB(TBB);
     else if (Cond[1].getReg() == PPC::CTR || Cond[1].getReg() == PPC::CTR8)
@@ -1399,10 +1401,10 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr *CmpInstr,
   // There are two possible candidates which can be changed to set CR[01].
   // One is MI, the other is a SUB instruction.
   // For CMPrr(r1,r2), we are looking for SUB(r1,r2) or SUB(r2,r1).
-  MachineInstr *Sub = NULL;
+  MachineInstr *Sub = nullptr;
   if (SrcReg2 != 0)
     // MI is not a candidate for CMPrr.
-    MI = NULL;
+    MI = nullptr;
   // FIXME: Conservatively refuse to convert an instruction which isn't in the
   // same BB as the comparison. This is to allow the check below to avoid calls
   // (and other explicit clobbers); instead we should really check for these
@@ -1810,10 +1812,15 @@ protected:
     }
 
 public:
-    virtual bool runOnMachineFunction(MachineFunction &MF) {
+    bool runOnMachineFunction(MachineFunction &MF) override {
+      TM = static_cast<const PPCTargetMachine *>(&MF.getTarget());
+      // If we don't have VSX then go ahead and return without doing
+      // anything.
+      if (!TM->getSubtargetImpl()->hasVSX())
+        return false;
+
       LIS = &getAnalysis<LiveIntervals>();
 
-      TM = static_cast<const PPCTargetMachine *>(&MF.getTarget());
       TII = TM->getInstrInfo();
 
       bool Changed = false;
@@ -1830,7 +1837,7 @@ public:
       return Changed;
     }
 
-    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
       AU.addRequired<LiveIntervals>();
       AU.addPreserved<LiveIntervals>();
       AU.addRequired<SlotIndexes>();
@@ -1962,8 +1969,11 @@ protected:
     }
 
 public:
-    virtual bool runOnMachineFunction(MachineFunction &MF) {
+    bool runOnMachineFunction(MachineFunction &MF) override {
       TM = static_cast<const PPCTargetMachine *>(&MF.getTarget());
+      // If we don't have VSX on the subtarget, don't do anything.
+      if (!TM->getSubtargetImpl()->hasVSX())
+        return false;
       TII = TM->getInstrInfo();
 
       bool Changed = false;
@@ -1977,7 +1987,7 @@ public:
       return Changed;
     }
 
-    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
       MachineFunctionPass::getAnalysisUsage(AU);
     }
   };
@@ -2036,8 +2046,11 @@ protected:
     }
 
 public:
-    virtual bool runOnMachineFunction(MachineFunction &MF) {
+    bool runOnMachineFunction(MachineFunction &MF) override {
       TM = static_cast<const PPCTargetMachine *>(&MF.getTarget());
+      // If we don't have VSX don't bother doing anything here.
+      if (!TM->getSubtargetImpl()->hasVSX())
+        return false;
       TII = TM->getInstrInfo();
 
       bool Changed = false;
@@ -2051,7 +2064,7 @@ public:
       return Changed;
     }
 
-    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
       MachineFunctionPass::getAnalysisUsage(AU);
     }
   };
@@ -2193,7 +2206,7 @@ protected:
     }
 
 public:
-    virtual bool runOnMachineFunction(MachineFunction &MF) {
+    bool runOnMachineFunction(MachineFunction &MF) override {
       TM = static_cast<const PPCTargetMachine *>(&MF.getTarget());
       TII = TM->getInstrInfo();
 
@@ -2213,7 +2226,7 @@ public:
       return Changed;
     }
 
-    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
       MachineFunctionPass::getAnalysisUsage(AU);
     }
   };
diff --git a/lib/Target/PowerPC/PPCInstrInfo.h b/lib/Target/PowerPC/PPCInstrInfo.h
index 3c8117c..d9db3e1 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.h
+++ b/lib/Target/PowerPC/PPCInstrInfo.h
@@ -86,151 +86,148 @@ public:
   /// such, whenever a client has an instance of instruction info, it should
   /// always be able to get register info as well (through this method).
   ///
-  virtual const PPCRegisterInfo &getRegisterInfo() const { return RI; }
+  const PPCRegisterInfo &getRegisterInfo() const { return RI; }
 
   ScheduleHazardRecognizer *
   CreateTargetHazardRecognizer(const TargetMachine *TM,
-                               const ScheduleDAG *DAG) const;
+                               const ScheduleDAG *DAG) const override;
   ScheduleHazardRecognizer *
   CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
-                                     const ScheduleDAG *DAG) const;
+                                     const ScheduleDAG *DAG) const override;
 
-  virtual
   int getOperandLatency(const InstrItineraryData *ItinData,
                         const MachineInstr *DefMI, unsigned DefIdx,
-                        const MachineInstr *UseMI, unsigned UseIdx) const;
-  virtual
+                        const MachineInstr *UseMI,
+                        unsigned UseIdx) const override;
   int getOperandLatency(const InstrItineraryData *ItinData,
                         SDNode *DefNode, unsigned DefIdx,
-                        SDNode *UseNode, unsigned UseIdx) const {
+                        SDNode *UseNode, unsigned UseIdx) const override {
     return PPCGenInstrInfo::getOperandLatency(ItinData, DefNode, DefIdx,
                                               UseNode, UseIdx);
   }
 
   bool isCoalescableExtInstr(const MachineInstr &MI,
                              unsigned &SrcReg, unsigned &DstReg,
-                             unsigned &SubIdx) const;
+                             unsigned &SubIdx) const override;
   unsigned isLoadFromStackSlot(const MachineInstr *MI,
-                               int &FrameIndex) const;
+                               int &FrameIndex) const override;
   unsigned isStoreToStackSlot(const MachineInstr *MI,
-                              int &FrameIndex) const;
+                              int &FrameIndex) const override;
 
   // commuteInstruction - We can commute rlwimi instructions, but only if the
   // rotate amt is zero.  We also have to munge the immediates a bit.
-  virtual MachineInstr *commuteInstruction(MachineInstr *MI, bool NewMI) const;
+  MachineInstr *commuteInstruction(MachineInstr *MI, bool NewMI) const override;
 
-  virtual bool findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1,
-                                     unsigned &SrcOpIdx2) const;
+  bool findCommutedOpIndices(MachineInstr *MI, unsigned &SrcOpIdx1,
+                             unsigned &SrcOpIdx2) const override;
 
-  virtual void insertNoop(MachineBasicBlock &MBB,
-                          MachineBasicBlock::iterator MI) const;
+  void insertNoop(MachineBasicBlock &MBB,
+                  MachineBasicBlock::iterator MI) const override;
 
 
   // Branch analysis.
-  virtual bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
-                             MachineBasicBlock *&FBB,
-                             SmallVectorImpl<MachineOperand> &Cond,
-                             bool AllowModify) const;
-  virtual unsigned RemoveBranch(MachineBasicBlock &MBB) const;
-  virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
-                                MachineBasicBlock *FBB,
-                                const SmallVectorImpl<MachineOperand> &Cond,
-                                DebugLoc DL) const;
+  bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+                     MachineBasicBlock *&FBB,
+                     SmallVectorImpl<MachineOperand> &Cond,
+                     bool AllowModify) const override;
+  unsigned RemoveBranch(MachineBasicBlock &MBB) const override;
+  unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+                        MachineBasicBlock *FBB,
+                        const SmallVectorImpl<MachineOperand> &Cond,
+                        DebugLoc DL) const override;
 
   // Select analysis.
-  virtual bool canInsertSelect(const MachineBasicBlock&,
-                               const SmallVectorImpl<MachineOperand> &Cond,
-                               unsigned, unsigned, int&, int&, int&) const;
-  virtual void insertSelect(MachineBasicBlock &MBB,
-                            MachineBasicBlock::iterator MI, DebugLoc DL,
-                            unsigned DstReg,
-                            const SmallVectorImpl<MachineOperand> &Cond,
-                            unsigned TrueReg, unsigned FalseReg) const;
-
-  virtual void copyPhysReg(MachineBasicBlock &MBB,
-                           MachineBasicBlock::iterator I, DebugLoc DL,
-                           unsigned DestReg, unsigned SrcReg,
-                           bool KillSrc) const;
-
-  virtual void storeRegToStackSlot(MachineBasicBlock &MBB,
-                                   MachineBasicBlock::iterator MBBI,
-                                   unsigned SrcReg, bool isKill, int FrameIndex,
-                                   const TargetRegisterClass *RC,
-                                   const TargetRegisterInfo *TRI) const;
-
-  virtual void loadRegFromStackSlot(MachineBasicBlock &MBB,
-                                    MachineBasicBlock::iterator MBBI,
-                                    unsigned DestReg, int FrameIndex,
-                                    const TargetRegisterClass *RC,
-                                    const TargetRegisterInfo *TRI) const;
-
-  virtual
-  bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const;
-
-  virtual bool FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI,
-                             unsigned Reg, MachineRegisterInfo *MRI) const;
+  bool canInsertSelect(const MachineBasicBlock&,
+                       const SmallVectorImpl<MachineOperand> &Cond,
+                       unsigned, unsigned, int&, int&, int&) const override;
+  void insertSelect(MachineBasicBlock &MBB,
+                    MachineBasicBlock::iterator MI, DebugLoc DL,
+                    unsigned DstReg,
+                    const SmallVectorImpl<MachineOperand> &Cond,
+                    unsigned TrueReg, unsigned FalseReg) const override;
+
+  void copyPhysReg(MachineBasicBlock &MBB,
+                   MachineBasicBlock::iterator I, DebugLoc DL,
+                   unsigned DestReg, unsigned SrcReg,
+                   bool KillSrc) const override;
+
+  void storeRegToStackSlot(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator MBBI,
+                           unsigned SrcReg, bool isKill, int FrameIndex,
+                           const TargetRegisterClass *RC,
+                           const TargetRegisterInfo *TRI) const override;
+
+  void loadRegFromStackSlot(MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator MBBI,
+                            unsigned DestReg, int FrameIndex,
+                            const TargetRegisterClass *RC,
+                            const TargetRegisterInfo *TRI) const override;
+
+  bool
+  ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override;
+
+  bool FoldImmediate(MachineInstr *UseMI, MachineInstr *DefMI,
+                     unsigned Reg, MachineRegisterInfo *MRI) const override;
 
   // If conversion by predication (only supported by some branch instructions).
   // All of the profitability checks always return true; it is always
   // profitable to use the predicated branches.
-  virtual bool isProfitableToIfCvt(MachineBasicBlock &MBB,
-                                   unsigned NumCycles, unsigned ExtraPredCycles,
-                                   const BranchProbability &Probability) const {
+  bool isProfitableToIfCvt(MachineBasicBlock &MBB,
+                          unsigned NumCycles, unsigned ExtraPredCycles,
+                          const BranchProbability &Probability) const override {
     return true;
   }
 
-  virtual bool isProfitableToIfCvt(MachineBasicBlock &TMBB,
-                                   unsigned NumT, unsigned ExtraT,
-                                   MachineBasicBlock &FMBB,
-                                   unsigned NumF, unsigned ExtraF,
-                                   const BranchProbability &Probability) const;
+  bool isProfitableToIfCvt(MachineBasicBlock &TMBB,
+                           unsigned NumT, unsigned ExtraT,
+                           MachineBasicBlock &FMBB,
+                           unsigned NumF, unsigned ExtraF,
+                           const BranchProbability &Probability) const override;
 
-  virtual bool isProfitableToDupForIfCvt(MachineBasicBlock &MBB,
-                                         unsigned NumCycles,
-                                         const BranchProbability
-                                         &Probability) const {
+  bool isProfitableToDupForIfCvt(MachineBasicBlock &MBB,
+                                 unsigned NumCycles,
+                                 const BranchProbability
+                                 &Probability) const override {
     return true;
   }
 
-  virtual bool isProfitableToUnpredicate(MachineBasicBlock &TMBB,
-                                         MachineBasicBlock &FMBB) const {
+  bool isProfitableToUnpredicate(MachineBasicBlock &TMBB,
+                                 MachineBasicBlock &FMBB) const override {
     return false;
   }
 
   // Predication support.
-  bool isPredicated(const MachineInstr *MI) const;
+  bool isPredicated(const MachineInstr *MI) const override;
 
-  virtual bool isUnpredicatedTerminator(const MachineInstr *MI) const;
+  bool isUnpredicatedTerminator(const MachineInstr *MI) const override;
 
-  virtual
   bool PredicateInstruction(MachineInstr *MI,
-                            const SmallVectorImpl<MachineOperand> &Pred) const;
+                    const SmallVectorImpl<MachineOperand> &Pred) const override;
 
-  virtual
   bool SubsumesPredicate(const SmallVectorImpl<MachineOperand> &Pred1,
-                         const SmallVectorImpl<MachineOperand> &Pred2) const;
+                   const SmallVectorImpl<MachineOperand> &Pred2) const override;
 
-  virtual bool DefinesPredicate(MachineInstr *MI,
-                                std::vector<MachineOperand> &Pred) const;
+  bool DefinesPredicate(MachineInstr *MI,
+                        std::vector<MachineOperand> &Pred) const override;
 
-  virtual bool isPredicable(MachineInstr *MI) const;
+  bool isPredicable(MachineInstr *MI) const override;
 
   // Comparison optimization.
 
 
-  virtual bool analyzeCompare(const MachineInstr *MI,
-                              unsigned &SrcReg, unsigned &SrcReg2,
-                              int &Mask, int &Value) const;
+  bool analyzeCompare(const MachineInstr *MI,
+                      unsigned &SrcReg, unsigned &SrcReg2,
+                      int &Mask, int &Value) const override;
 
-  virtual bool optimizeCompareInstr(MachineInstr *CmpInstr,
-                                    unsigned SrcReg, unsigned SrcReg2,
-                                    int Mask, int Value,
-                                    const MachineRegisterInfo *MRI) const;
+  bool optimizeCompareInstr(MachineInstr *CmpInstr,
+                            unsigned SrcReg, unsigned SrcReg2,
+                            int Mask, int Value,
+                            const MachineRegisterInfo *MRI) const override;
 
   /// GetInstSize - Return the number of bytes of code the specified
   /// instruction may be.  This returns the maximum number of bytes.
   ///
-  virtual unsigned GetInstSizeInBytes(const MachineInstr *MI) const;
+  unsigned GetInstSizeInBytes(const MachineInstr *MI) const;
 };
 
 }
diff --git a/lib/Target/PowerPC/PPCInstrInfo.td b/lib/Target/PowerPC/PPCInstrInfo.td
index 1d984ab..e421f8e 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.td
+++ b/lib/Target/PowerPC/PPCInstrInfo.td
@@ -610,10 +610,10 @@ def iaddroff : ComplexPattern<iPTR, 1, "SelectAddrImmOffs", [], []>;
 
 //===----------------------------------------------------------------------===//
 // PowerPC Instruction Predicate Definitions.
-def In32BitMode  : Predicate<"!PPCSubTarget.isPPC64()">;
-def In64BitMode  : Predicate<"PPCSubTarget.isPPC64()">;
-def IsBookE  : Predicate<"PPCSubTarget.isBookE()">;
-def IsNotBookE  : Predicate<"!PPCSubTarget.isBookE()">;
+def In32BitMode  : Predicate<"!PPCSubTarget->isPPC64()">;
+def In64BitMode  : Predicate<"PPCSubTarget->isPPC64()">;
+def IsBookE  : Predicate<"PPCSubTarget->isBookE()">;
+def IsNotBookE  : Predicate<"!PPCSubTarget->isBookE()">;
 
 //===----------------------------------------------------------------------===//
 // PowerPC Multiclass Definitions.
diff --git a/lib/Target/PowerPC/PPCInstrVSX.td b/lib/Target/PowerPC/PPCInstrVSX.td
index 9cc919e..49bcc48 100644
--- a/lib/Target/PowerPC/PPCInstrVSX.td
+++ b/lib/Target/PowerPC/PPCInstrVSX.td
@@ -39,7 +39,7 @@ multiclass XX3Form_Rcr<bits<6> opcode, bits<7> xo, dag OOL, dag IOL,
   }
 }
 
-def HasVSX : Predicate<"PPCSubTarget.hasVSX()">;
+def HasVSX : Predicate<"PPCSubTarget->hasVSX()">;
 let Predicates = [HasVSX] in {
 let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns.
 let neverHasSideEffects = 1 in { // VSX instructions don't have side effects.
diff --git a/lib/Target/PowerPC/PPCJITInfo.cpp b/lib/Target/PowerPC/PPCJITInfo.cpp
index 227919c..7bbc71b 100644
--- a/lib/Target/PowerPC/PPCJITInfo.cpp
+++ b/lib/Target/PowerPC/PPCJITInfo.cpp
@@ -11,7 +11,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "jit"
 #include "PPCJITInfo.h"
 #include "PPCRelocations.h"
 #include "PPCTargetMachine.h"
@@ -22,6 +21,8 @@
 #include "llvm/Support/raw_ostream.h"
 using namespace llvm;
 
+#define DEBUG_TYPE "jit"
+
 static TargetJITInfo::JITCompilerFn JITCompilerFunction;
 
 #define BUILD_ADDIS(RD,RS,IMM16) \
diff --git a/lib/Target/PowerPC/PPCJITInfo.h b/lib/Target/PowerPC/PPCJITInfo.h
index 46d4a08..0693e3e 100644
--- a/lib/Target/PowerPC/PPCJITInfo.h
+++ b/lib/Target/PowerPC/PPCJITInfo.h
@@ -30,19 +30,19 @@ namespace llvm {
       is64Bit = tmIs64Bit;
     }
 
-    virtual StubLayout getStubLayout();
-    virtual void *emitFunctionStub(const Function* F, void *Fn,
-                                   JITCodeEmitter &JCE);
-    virtual LazyResolverFn getLazyResolverFunction(JITCompilerFn);
-    virtual void relocate(void *Function, MachineRelocation *MR,
-                          unsigned NumRelocs, unsigned char* GOTBase);
-    
+    StubLayout getStubLayout() override;
+    void *emitFunctionStub(const Function* F, void *Fn,
+                           JITCodeEmitter &JCE) override;
+    LazyResolverFn getLazyResolverFunction(JITCompilerFn) override;
+    void relocate(void *Function, MachineRelocation *MR,
+                  unsigned NumRelocs, unsigned char* GOTBase) override;
+
     /// replaceMachineCodeForFunction - Make it so that calling the function
     /// whose machine code is at OLD turns into a call to NEW, perhaps by
     /// overwriting OLD with a branch to NEW.  This is used for self-modifying
     /// code.
     ///
-    virtual void replaceMachineCodeForFunction(void *Old, void *New);
+    void replaceMachineCodeForFunction(void *Old, void *New) override;
   };
 }
 
diff --git a/lib/Target/PowerPC/PPCMCInstLower.cpp b/lib/Target/PowerPC/PPCMCInstLower.cpp
index 029bb8a..f8e84a5 100644
--- a/lib/Target/PowerPC/PPCMCInstLower.cpp
+++ b/lib/Target/PowerPC/PPCMCInstLower.cpp
@@ -96,7 +96,7 @@ static MCSymbol *GetSymbolFromOperand(const MachineOperand &MO, AsmPrinter &AP){
       (MO.getTargetFlags() & PPCII::MO_NLP_HIDDEN_FLAG) ? 
          MachO.getHiddenGVStubEntry(Sym) : MachO.getGVStubEntry(Sym);
     
-    if (StubSym.getPointer() == 0) {
+    if (!StubSym.getPointer()) {
       assert(MO.isGlobal() && "Extern symbol not handled yet");
       StubSym = MachineModuleInfoImpl::
                    StubValueTy(AP.getSymbol(MO.getGlobal()),
diff --git a/lib/Target/PowerPC/PPCRegisterInfo.cpp b/lib/Target/PowerPC/PPCRegisterInfo.cpp
index 4ff282e..e333b51 100644
--- a/lib/Target/PowerPC/PPCRegisterInfo.cpp
+++ b/lib/Target/PowerPC/PPCRegisterInfo.cpp
@@ -12,7 +12,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "reginfo"
 #include "PPCRegisterInfo.h"
 #include "PPC.h"
 #include "PPCFrameLowering.h"
@@ -42,11 +41,13 @@
 #include "llvm/Target/TargetOptions.h"
 #include <cstdlib>
 
+using namespace llvm;
+
+#define DEBUG_TYPE "reginfo"
+
 #define GET_REGINFO_TARGET_DESC
 #include "PPCGenRegisterInfo.inc"
 
-using namespace llvm;
-
 static cl::opt<bool>
 EnableBasePointer("ppc-use-base-pointer", cl::Hidden, cl::init(true),
          cl::desc("Enable use of a base pointer for complex stack frames"));
@@ -96,7 +97,7 @@ PPCRegisterInfo::getPointerRegClass(const MachineFunction &MF, unsigned Kind)
   return &PPC::GPRCRegClass;
 }
 
-const uint16_t*
+const MCPhysReg*
 PPCRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
   if (Subtarget.isDarwinABI())
     return Subtarget.isPPC64() ? (Subtarget.hasAltivec() ?
diff --git a/lib/Target/PowerPC/PPCRegisterInfo.h b/lib/Target/PowerPC/PPCRegisterInfo.h
index c3e54b4..13a35f6 100644
--- a/lib/Target/PowerPC/PPCRegisterInfo.h
+++ b/lib/Target/PowerPC/PPCRegisterInfo.h
@@ -34,36 +34,37 @@ public:
   
   /// getPointerRegClass - Return the register class to use to hold pointers.
   /// This is used for addressing modes.
-  virtual const TargetRegisterClass *
-  getPointerRegClass(const MachineFunction &MF, unsigned Kind=0) const;
+  const TargetRegisterClass *
+  getPointerRegClass(const MachineFunction &MF, unsigned Kind=0) const override;
 
   unsigned getRegPressureLimit(const TargetRegisterClass *RC,
-                               MachineFunction &MF) const;
+                               MachineFunction &MF) const override;
 
   const TargetRegisterClass*
-  getLargestLegalSuperClass(const TargetRegisterClass *RC) const;
+  getLargestLegalSuperClass(const TargetRegisterClass *RC) const override;
 
   /// Code Generation virtual methods...
-  const uint16_t *getCalleeSavedRegs(const MachineFunction* MF = 0) const;
-  const uint32_t *getCallPreservedMask(CallingConv::ID CC) const;
+  const MCPhysReg *
+  getCalleeSavedRegs(const MachineFunction* MF =nullptr) const override;
+  const uint32_t *getCallPreservedMask(CallingConv::ID CC) const override;
   const uint32_t *getNoPreservedMask() const;
 
-  BitVector getReservedRegs(const MachineFunction &MF) const;
+  BitVector getReservedRegs(const MachineFunction &MF) const override;
 
   /// We require the register scavenger.
-  bool requiresRegisterScavenging(const MachineFunction &MF) const {
+  bool requiresRegisterScavenging(const MachineFunction &MF) const override {
     return true;
   }
 
-  bool requiresFrameIndexScavenging(const MachineFunction &MF) const {
+  bool requiresFrameIndexScavenging(const MachineFunction &MF) const override {
     return true;
   }
 
-  bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const {
+  bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const override {
     return true;
   }
 
-  virtual bool requiresVirtualBaseRegisters(const MachineFunction &MF) const {
+  bool requiresVirtualBaseRegisters(const MachineFunction &MF) const override {
     return true;
   }
 
@@ -82,28 +83,29 @@ public:
                           unsigned FrameIndex) const;
 
   bool hasReservedSpillSlot(const MachineFunction &MF, unsigned Reg,
-			    int &FrameIdx) const;
+			    int &FrameIdx) const override;
   void eliminateFrameIndex(MachineBasicBlock::iterator II,
                            int SPAdj, unsigned FIOperandNum,
-                           RegScavenger *RS = NULL) const;
+                           RegScavenger *RS = nullptr) const override;
 
   // Support for virtual base registers.
-  bool needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const;
+  bool needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const override;
   void materializeFrameBaseRegister(MachineBasicBlock *MBB,
                                     unsigned BaseReg, int FrameIdx,
-                                    int64_t Offset) const;
+                                    int64_t Offset) const override;
   void resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
-                         int64_t Offset) const;
-  bool isFrameOffsetLegal(const MachineInstr *MI, int64_t Offset) const;
+                         int64_t Offset) const override;
+  bool isFrameOffsetLegal(const MachineInstr *MI,
+                          int64_t Offset) const override;
 
   // Debug information queries.
-  unsigned getFrameRegister(const MachineFunction &MF) const;
+  unsigned getFrameRegister(const MachineFunction &MF) const override;
 
   // Base pointer (stack realignment) support.
   unsigned getBaseRegister(const MachineFunction &MF) const;
   bool hasBasePointer(const MachineFunction &MF) const;
   bool canRealignStack(const MachineFunction &MF) const;
-  bool needsStackRealignment(const MachineFunction &MF) const;
+  bool needsStackRealignment(const MachineFunction &MF) const override;
 };
 
 } // end namespace llvm
diff --git a/lib/Target/PowerPC/PPCRegisterInfo.td b/lib/Target/PowerPC/PPCRegisterInfo.td
index e11f7d4..b3d145b 100644
--- a/lib/Target/PowerPC/PPCRegisterInfo.td
+++ b/lib/Target/PowerPC/PPCRegisterInfo.td
@@ -188,6 +188,13 @@ def CR6 : CR<6, "cr6", [CR6LT, CR6GT, CR6EQ, CR6UN]>, DwarfRegNum<[74, 74]>;
 def CR7 : CR<7, "cr7", [CR7LT, CR7GT, CR7EQ, CR7UN]>, DwarfRegNum<[75, 75]>;
 }
 
+// The full condition-code register. This is not modeled fully, but defined
+// here primarily, for compatibility with gcc, to allow the inline asm "cc"
+// clobber specification to work.
+def CC : PPCReg<"cc">, DwarfRegAlias<CR0> {
+  let Aliases = [CR0, CR1, CR2, CR3, CR4, CR5, CR6, CR7];
+}
+
 // Link register
 def LR  : SPR<8, "lr">, DwarfRegNum<[-2, 65]>;
 //let Aliases = [LR] in
@@ -300,3 +307,8 @@ def VRSAVERC : RegisterClass<"PPC", [i32], 32, (add VRSAVE)>;
 def CARRYRC : RegisterClass<"PPC", [i32], 32, (add CARRY)> {
   let CopyCost = -1;
 }
+
+def CCRC : RegisterClass<"PPC", [i32], 32, (add CC)> {
+  let isAllocatable = 0;
+}
+
diff --git a/lib/Target/PowerPC/PPCSelectionDAGInfo.cpp b/lib/Target/PowerPC/PPCSelectionDAGInfo.cpp
index d4258b4..f742f72 100644
--- a/lib/Target/PowerPC/PPCSelectionDAGInfo.cpp
+++ b/lib/Target/PowerPC/PPCSelectionDAGInfo.cpp
@@ -11,10 +11,11 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "powerpc-selectiondag-info"
 #include "PPCTargetMachine.h"
 using namespace llvm;
 
+#define DEBUG_TYPE "powerpc-selectiondag-info"
+
 PPCSelectionDAGInfo::PPCSelectionDAGInfo(const PPCTargetMachine &TM)
   : TargetSelectionDAGInfo(TM) {
 }
diff --git a/lib/Target/PowerPC/PPCSubtarget.cpp b/lib/Target/PowerPC/PPCSubtarget.cpp
index b07abe4..ea9daee 100644
--- a/lib/Target/PowerPC/PPCSubtarget.cpp
+++ b/lib/Target/PowerPC/PPCSubtarget.cpp
@@ -24,31 +24,21 @@
 #include "llvm/Target/TargetMachine.h"
 #include <cstdlib>
 
+using namespace llvm;
+
+#define DEBUG_TYPE "ppc-subtarget"
+
 #define GET_SUBTARGETINFO_TARGET_DESC
 #define GET_SUBTARGETINFO_CTOR
 #include "PPCGenSubtargetInfo.inc"
 
-using namespace llvm;
-
 PPCSubtarget::PPCSubtarget(const std::string &TT, const std::string &CPU,
                            const std::string &FS, bool is64Bit,
                            CodeGenOpt::Level OptLevel)
-  : PPCGenSubtargetInfo(TT, CPU, FS)
-  , IsPPC64(is64Bit)
-  , TargetTriple(TT) {
+    : PPCGenSubtargetInfo(TT, CPU, FS), IsPPC64(is64Bit), TargetTriple(TT),
+      OptLevel(OptLevel) {
   initializeEnvironment();
-
-  std::string FullFS = FS;
-
-  // At -O2 and above, track CR bits as individual registers.
-  if (OptLevel >= CodeGenOpt::Default) {
-    if (!FullFS.empty())
-      FullFS = "+crbits," + FullFS;
-    else
-      FullFS = "+crbits";
-  }
-
-  resetSubtargetFeatures(CPU, FullFS);
+  resetSubtargetFeatures(CPU, FS);
 }
 
 /// SetJITMode - This is called to inform the subtarget info that we are
@@ -138,6 +128,14 @@ void PPCSubtarget::resetSubtargetFeatures(StringRef CPU, StringRef FS) {
       FullFS = "+64bit";
   }
 
+  // At -O2 and above, track CR bits as individual registers.
+  if (OptLevel >= CodeGenOpt::Default) {
+    if (!FullFS.empty())
+      FullFS = "+crbits," + FullFS;
+    else
+      FullFS = "+crbits";
+  }
+
   // Parse features string.
   ParseSubtargetFeatures(CPUName, FullFS);
 
diff --git a/lib/Target/PowerPC/PPCSubtarget.h b/lib/Target/PowerPC/PPCSubtarget.h
index 87e012e..ee43fd5 100644
--- a/lib/Target/PowerPC/PPCSubtarget.h
+++ b/lib/Target/PowerPC/PPCSubtarget.h
@@ -99,6 +99,9 @@ protected:
   /// TargetTriple - What processor and OS we're targeting.
   Triple TargetTriple;
 
+  /// OptLevel - What default optimization level we're emitting code for.
+  CodeGenOpt::Level OptLevel;
+
 public:
   /// This constructor initializes the data members to match that
   /// of the specified triple.
@@ -129,7 +132,7 @@ public:
   const InstrItineraryData &getInstrItineraryData() const { return InstrItins; }
 
   /// \brief Reset the features for the PowerPC target.
-  virtual void resetSubtargetFeatures(const MachineFunction *MF);
+  void resetSubtargetFeatures(const MachineFunction *MF) override;
 private:
   void initializeEnvironment();
   void resetSubtargetFeatures(StringRef CPU, StringRef FS);
@@ -200,15 +203,17 @@ public:
   /// enablePostRAScheduler - True at 'More' optimization.
   bool enablePostRAScheduler(CodeGenOpt::Level OptLevel,
                              TargetSubtargetInfo::AntiDepBreakMode& Mode,
-                             RegClassVector& CriticalPathRCs) const;
+                             RegClassVector& CriticalPathRCs) const override;
+
+  bool enableEarlyIfConversion() const override { return hasISEL(); }
 
   // Scheduling customization.
-  bool enableMachineScheduler() const;
+  bool enableMachineScheduler() const override;
   void overrideSchedPolicy(MachineSchedPolicy &Policy,
                            MachineInstr *begin,
                            MachineInstr *end,
-                           unsigned NumRegionInstrs) const;
-  bool useAA() const;
+                           unsigned NumRegionInstrs) const override;
+  bool useAA() const override;
 };
 } // End llvm namespace
 
diff --git a/lib/Target/PowerPC/PPCTargetMachine.cpp b/lib/Target/PowerPC/PPCTargetMachine.cpp
index e7438f3..2323add 100644
--- a/lib/Target/PowerPC/PPCTargetMachine.cpp
+++ b/lib/Target/PowerPC/PPCTargetMachine.cpp
@@ -127,12 +127,12 @@ public:
     return *getPPCTargetMachine().getSubtargetImpl();
   }
 
-  virtual bool addPreISel();
-  virtual bool addILPOpts();
-  virtual bool addInstSelector();
-  virtual bool addPreRegAlloc();
-  virtual bool addPreSched2();
-  virtual bool addPreEmitPass();
+  bool addPreISel() override;
+  bool addILPOpts() override;
+  bool addInstSelector() override;
+  bool addPreRegAlloc() override;
+  bool addPreSched2() override;
+  bool addPreEmitPass() override;
 };
 } // namespace
 
@@ -148,12 +148,8 @@ bool PPCPassConfig::addPreISel() {
 }
 
 bool PPCPassConfig::addILPOpts() {
-  if (getPPCSubtarget().hasISEL()) {
-    addPass(&EarlyIfConverterID);
-    return true;
-  }
-
-  return false;
+  addPass(&EarlyIfConverterID);
+  return true;
 }
 
 bool PPCPassConfig::addInstSelector() {
@@ -165,25 +161,19 @@ bool PPCPassConfig::addInstSelector() {
     addPass(createPPCCTRLoopsVerify());
 #endif
 
-  if (getPPCSubtarget().hasVSX())
-    addPass(createPPCVSXCopyPass());
-
+  addPass(createPPCVSXCopyPass());
   return false;
 }
 
 bool PPCPassConfig::addPreRegAlloc() {
-  if (getPPCSubtarget().hasVSX()) {
-    initializePPCVSXFMAMutatePass(*PassRegistry::getPassRegistry());
-    insertPass(VSXFMAMutateEarly ? &RegisterCoalescerID : &MachineSchedulerID,
-               &PPCVSXFMAMutateID);
-  }
-
+  initializePPCVSXFMAMutatePass(*PassRegistry::getPassRegistry());
+  insertPass(VSXFMAMutateEarly ? &RegisterCoalescerID : &MachineSchedulerID,
+             &PPCVSXFMAMutateID);
   return false;
 }
 
 bool PPCPassConfig::addPreSched2() {
-  if (getPPCSubtarget().hasVSX())
-    addPass(createPPCVSXCopyCleanupPass());
+  addPass(createPPCVSXCopyCleanupPass());
 
   if (getOptLevel() != CodeGenOpt::None)
     addPass(&IfConverterID);
diff --git a/lib/Target/PowerPC/PPCTargetMachine.h b/lib/Target/PowerPC/PPCTargetMachine.h
index 606ccb3..9e92494 100644
--- a/lib/Target/PowerPC/PPCTargetMachine.h
+++ b/lib/Target/PowerPC/PPCTargetMachine.h
@@ -43,34 +43,34 @@ public:
                    Reloc::Model RM, CodeModel::Model CM,
                    CodeGenOpt::Level OL, bool is64Bit);
 
-  virtual const PPCInstrInfo      *getInstrInfo() const { return &InstrInfo; }
-  virtual const PPCFrameLowering  *getFrameLowering() const {
+  const PPCInstrInfo      *getInstrInfo() const override { return &InstrInfo; }
+  const PPCFrameLowering  *getFrameLowering() const override {
     return &FrameLowering;
   }
-  virtual       PPCJITInfo        *getJITInfo()         { return &JITInfo; }
-  virtual const PPCTargetLowering *getTargetLowering() const {
+        PPCJITInfo        *getJITInfo() override         { return &JITInfo; }
+  const PPCTargetLowering *getTargetLowering() const override {
    return &TLInfo;
   }
-  virtual const PPCSelectionDAGInfo* getSelectionDAGInfo() const {
+  const PPCSelectionDAGInfo* getSelectionDAGInfo() const override {
     return &TSInfo;
   }
-  virtual const PPCRegisterInfo   *getRegisterInfo() const {
+  const PPCRegisterInfo   *getRegisterInfo() const override {
     return &InstrInfo.getRegisterInfo();
   }
 
-  virtual const DataLayout    *getDataLayout() const    { return &DL; }
-  virtual const PPCSubtarget  *getSubtargetImpl() const { return &Subtarget; }
-  virtual const InstrItineraryData *getInstrItineraryData() const {
+  const DataLayout    *getDataLayout() const override    { return &DL; }
+  const PPCSubtarget  *getSubtargetImpl() const override { return &Subtarget; }
+  const InstrItineraryData *getInstrItineraryData() const override {
     return &InstrItins;
   }
 
   // Pass Pipeline Configuration
-  virtual TargetPassConfig *createPassConfig(PassManagerBase &PM);
-  virtual bool addCodeEmitter(PassManagerBase &PM,
-                              JITCodeEmitter &JCE);
+  TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
+  bool addCodeEmitter(PassManagerBase &PM,
+                      JITCodeEmitter &JCE) override;
 
   /// \brief Register PPC analysis passes with a pass manager.
-  virtual void addAnalysisPasses(PassManagerBase &PM);
+  void addAnalysisPasses(PassManagerBase &PM) override;
 };
 
 /// PPC32TargetMachine - PowerPC 32-bit target machine.
diff --git a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
index 2f4d5c1..007901b 100644
--- a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -14,17 +14,22 @@
 ///
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "ppctti"
 #include "PPC.h"
 #include "PPCTargetMachine.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Target/CostTable.h"
 #include "llvm/Target/TargetLowering.h"
 using namespace llvm;
 
+#define DEBUG_TYPE "ppctti"
+
+static cl::opt<bool> DisablePPCConstHoist("disable-ppc-constant-hoisting",
+cl::desc("disable constant hoisting on PPC"), cl::init(false), cl::Hidden);
+
 // Declare the pass initialization routine locally as target-specific passes
-// don't havve a target-wide initialization entry point, and so we rely on the
+// don't have a target-wide initialization entry point, and so we rely on the
 // pass constructor initialization.
 namespace llvm {
 void initializePPCTTIPass(PassRegistry &);
@@ -33,21 +38,16 @@ void initializePPCTTIPass(PassRegistry &);
 namespace {
 
 class PPCTTI final : public ImmutablePass, public TargetTransformInfo {
-  const PPCTargetMachine *TM;
   const PPCSubtarget *ST;
   const PPCTargetLowering *TLI;
 
-  /// Estimate the overhead of scalarizing an instruction. Insert and Extract
-  /// are set if the result needs to be inserted and/or extracted from vectors.
-  unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const;
-
 public:
-  PPCTTI() : ImmutablePass(ID), TM(0), ST(0), TLI(0) {
+  PPCTTI() : ImmutablePass(ID), ST(nullptr), TLI(nullptr) {
     llvm_unreachable("This pass cannot be directly constructed");
   }
 
   PPCTTI(const PPCTargetMachine *TM)
-      : ImmutablePass(ID), TM(TM), ST(TM->getSubtargetImpl()),
+      : ImmutablePass(ID), ST(TM->getSubtargetImpl()),
         TLI(TM->getTargetLowering()) {
     initializePPCTTIPass(*PassRegistry::getPassRegistry());
   }
@@ -72,6 +72,13 @@ public:
 
   /// \name Scalar TTI Implementations
   /// @{
+  unsigned getIntImmCost(const APInt &Imm, Type *Ty) const override;
+
+  unsigned getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
+                         Type *Ty) const override;
+  unsigned getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
+                         Type *Ty) const override;
+
   virtual PopcntSupportKind
   getPopcntSupport(unsigned TyWidth) const override;
   virtual void getUnrollingPreferences(
@@ -128,6 +135,142 @@ PPCTTI::PopcntSupportKind PPCTTI::getPopcntSupport(unsigned TyWidth) const {
   return PSK_Software;
 }
 
+unsigned PPCTTI::getIntImmCost(const APInt &Imm, Type *Ty) const {
+  if (DisablePPCConstHoist)
+    return TargetTransformInfo::getIntImmCost(Imm, Ty);
+
+  assert(Ty->isIntegerTy());
+
+  unsigned BitSize = Ty->getPrimitiveSizeInBits();
+  if (BitSize == 0)
+    return ~0U;
+
+  if (Imm == 0)
+    return TCC_Free;
+
+  if (Imm.getBitWidth() <= 64) {
+    if (isInt<16>(Imm.getSExtValue()))
+      return TCC_Basic;
+
+    if (isInt<32>(Imm.getSExtValue())) {
+      // A constant that can be materialized using lis.
+      if ((Imm.getZExtValue() & 0xFFFF) == 0)
+        return TCC_Basic;
+
+      return 2 * TCC_Basic;
+    }
+  }
+
+  return 4 * TCC_Basic;
+}
+
+unsigned PPCTTI::getIntImmCost(Intrinsic::ID IID, unsigned Idx,
+                               const APInt &Imm, Type *Ty) const {
+  if (DisablePPCConstHoist)
+    return TargetTransformInfo::getIntImmCost(IID, Idx, Imm, Ty);
+
+  assert(Ty->isIntegerTy());
+
+  unsigned BitSize = Ty->getPrimitiveSizeInBits();
+  if (BitSize == 0)
+    return ~0U;
+
+  switch (IID) {
+  default: return TCC_Free;
+  case Intrinsic::sadd_with_overflow:
+  case Intrinsic::uadd_with_overflow:
+  case Intrinsic::ssub_with_overflow:
+  case Intrinsic::usub_with_overflow:
+    if ((Idx == 1) && Imm.getBitWidth() <= 64 && isInt<16>(Imm.getSExtValue()))
+      return TCC_Free;
+    break;
+  }
+  return PPCTTI::getIntImmCost(Imm, Ty);
+}
+
+unsigned PPCTTI::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
+                               Type *Ty) const {
+  if (DisablePPCConstHoist)
+    return TargetTransformInfo::getIntImmCost(Opcode, Idx, Imm, Ty);
+
+  assert(Ty->isIntegerTy());
+
+  unsigned BitSize = Ty->getPrimitiveSizeInBits();
+  if (BitSize == 0)
+    return ~0U;
+
+  unsigned ImmIdx = ~0U;
+  bool ShiftedFree = false, RunFree = false, UnsignedFree = false,
+       ZeroFree = false;
+  switch (Opcode) {
+  default: return TCC_Free;
+  case Instruction::GetElementPtr:
+    // Always hoist the base address of a GetElementPtr. This prevents the
+    // creation of new constants for every base constant that gets constant
+    // folded with the offset.
+    if (Idx == 0)
+      return 2 * TCC_Basic;
+    return TCC_Free;
+  case Instruction::And:
+    RunFree = true; // (for the rotate-and-mask instructions)
+    // Fallthrough...
+  case Instruction::Add:
+  case Instruction::Or:
+  case Instruction::Xor:
+    ShiftedFree = true;
+    // Fallthrough...
+  case Instruction::Sub:
+  case Instruction::Mul:
+  case Instruction::Shl:
+  case Instruction::LShr:
+  case Instruction::AShr:
+    ImmIdx = 1;
+    break;
+  case Instruction::ICmp:
+    UnsignedFree = true;
+    ImmIdx = 1;
+    // Fallthrough... (zero comparisons can use record-form instructions)
+  case Instruction::Select:
+    ZeroFree = true;
+    break;
+  case Instruction::PHI:
+  case Instruction::Call:
+  case Instruction::Ret:
+  case Instruction::Load:
+  case Instruction::Store:
+    break;
+  }
+
+  if (ZeroFree && Imm == 0)
+    return TCC_Free;
+
+  if (Idx == ImmIdx && Imm.getBitWidth() <= 64) {
+    if (isInt<16>(Imm.getSExtValue()))
+      return TCC_Free;
+
+    if (RunFree) {
+      if (Imm.getBitWidth() <= 32 &&
+          (isShiftedMask_32(Imm.getZExtValue()) ||
+           isShiftedMask_32(~Imm.getZExtValue())))
+        return TCC_Free;
+
+
+      if (ST->isPPC64() &&
+          (isShiftedMask_64(Imm.getZExtValue()) ||
+           isShiftedMask_64(~Imm.getZExtValue())))
+        return TCC_Free;
+    }
+
+    if (UnsignedFree && isUInt<16>(Imm.getZExtValue()))
+      return TCC_Free;
+
+    if (ShiftedFree && (Imm.getZExtValue() & 0xFFFF) == 0)
+      return TCC_Free;
+  }
+
+  return PPCTTI::getIntImmCost(Imm, Ty);
+}
+
 void PPCTTI::getUnrollingPreferences(Loop *L, UnrollingPreferences &UP) const {
   if (ST->getDarwinDirective() == PPC::DIR_A2) {
     // The A2 is in-order with a deep pipeline, and concatenation unrolling
@@ -220,7 +363,9 @@ unsigned PPCTTI::getVectorInstrCost(unsigned Opcode, Type *Val,
   // experimentally as a minimum needed to prevent unprofitable
   // vectorization for the paq8p benchmark.  It may need to be
   // raised further if other unprofitable cases remain.
-  unsigned LHSPenalty = 12;
+  unsigned LHSPenalty = 2;
+  if (ISD == ISD::INSERT_VECTOR_ELT)
+    LHSPenalty += 7;
 
   // Vector element insert/extract with Altivec is very expensive,
   // because they require store and reload with the attendant
@@ -244,14 +389,32 @@ unsigned PPCTTI::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
   unsigned Cost =
     TargetTransformInfo::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace);
 
-  // FIXME: Update this for VSX loads/stores that support unaligned access.
+  // VSX loads/stores support unaligned access.
+  if (ST->hasVSX()) {
+    if (LT.second == MVT::v2f64 || LT.second == MVT::v2i64)
+      return Cost;
+  }
+
+  bool UnalignedAltivec =
+    Src->isVectorTy() &&
+    Src->getPrimitiveSizeInBits() >= LT.second.getSizeInBits() &&
+    LT.second.getSizeInBits() == 128 &&
+    Opcode == Instruction::Load;
 
   // PPC in general does not support unaligned loads and stores. They'll need
   // to be decomposed based on the alignment factor.
   unsigned SrcBytes = LT.second.getStoreSize();
-  if (SrcBytes && Alignment && Alignment < SrcBytes)
+  if (SrcBytes && Alignment && Alignment < SrcBytes && !UnalignedAltivec) {
     Cost += LT.first*(SrcBytes/Alignment-1);
 
+    // For a vector type, there is also scalarization overhead (only for
+    // stores, loads are expanded using the vector-load + permutation sequence,
+    // which is much less expensive).
+    if (Src->isVectorTy() && Opcode == Instruction::Store)
+      for (int i = 0, e = Src->getVectorNumElements(); i < e; ++i)
+        Cost += getVectorInstrCost(Instruction::ExtractElement, Src, i);
+  }
+
   return Cost;
 }
 
diff --git a/lib/Target/R600/AMDGPU.h b/lib/Target/R600/AMDGPU.h
index 3e1848b..949fdfb 100644
--- a/lib/Target/R600/AMDGPU.h
+++ b/lib/Target/R600/AMDGPU.h
@@ -37,11 +37,15 @@ FunctionPass *createAMDGPUCFGStructurizerPass();
 // SI Passes
 FunctionPass *createSITypeRewriter();
 FunctionPass *createSIAnnotateControlFlowPass();
+FunctionPass *createSILowerI1CopiesPass();
 FunctionPass *createSILowerControlFlowPass(TargetMachine &tm);
 FunctionPass *createSIFixSGPRCopiesPass(TargetMachine &tm);
 FunctionPass *createSICodeEmitterPass(formatted_raw_ostream &OS);
 FunctionPass *createSIInsertWaits(TargetMachine &tm);
 
+void initializeSILowerI1CopiesPass(PassRegistry &);
+extern char &SILowerI1CopiesID;
+
 // Passes common to R600 and SI
 Pass *createAMDGPUStructurizeCFGPass();
 FunctionPass *createAMDGPUConvertToISAPass(TargetMachine &tm);
@@ -76,8 +80,8 @@ enum AddressSpaces {
   GLOBAL_ADDRESS   = 1, ///< Address space for global memory (RAT0, VTX0).
   CONSTANT_ADDRESS = 2, ///< Address space for constant memory
   LOCAL_ADDRESS    = 3, ///< Address space for local memory.
-  REGION_ADDRESS   = 4, ///< Address space for region memory.
-  ADDRESS_NONE     = 5, ///< Address space for unknown memory.
+  FLAT_ADDRESS     = 4, ///< Address space for flat memory.
+  REGION_ADDRESS   = 5, ///< Address space for region memory.
   PARAM_D_ADDRESS  = 6, ///< Address space for direct addressible parameter memory (CONST0)
   PARAM_I_ADDRESS  = 7, ///< Address space for indirect addressible parameter memory (VTX1)
 
@@ -102,7 +106,8 @@ enum AddressSpaces {
   CONSTANT_BUFFER_13 = 21,
   CONSTANT_BUFFER_14 = 22,
   CONSTANT_BUFFER_15 = 23,
-  LAST_ADDRESS     = 24
+  ADDRESS_NONE = 24, ///< Address space for unknown memory.
+  LAST_ADDRESS = ADDRESS_NONE
 };
 
 } // namespace AMDGPUAS
diff --git a/lib/Target/R600/AMDGPU.td b/lib/Target/R600/AMDGPU.td
index d1e2cf5..2edc115 100644
--- a/lib/Target/R600/AMDGPU.td
+++ b/lib/Target/R600/AMDGPU.td
@@ -120,6 +120,17 @@ def AMDGPU : Target {
   let InstructionSet = AMDGPUInstrInfo;
 }
 
+//===----------------------------------------------------------------------===//
+// Predicate helper class
+//===----------------------------------------------------------------------===//
+
+class PredicateControl {
+  Predicate SubtargetPredicate;
+  list<Predicate> OtherPredicates = [];
+  list<Predicate> Predicates = !listconcat([SubtargetPredicate],
+                                            OtherPredicates);
+}
+
 // Include AMDGPU TD files
 include "R600Schedule.td"
 include "SISchedule.td"
diff --git a/lib/Target/R600/AMDGPUAsmPrinter.cpp b/lib/Target/R600/AMDGPUAsmPrinter.cpp
index b166c45..170f479 100644
--- a/lib/Target/R600/AMDGPUAsmPrinter.cpp
+++ b/lib/Target/R600/AMDGPUAsmPrinter.cpp
@@ -64,7 +64,7 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
   const AMDGPUSubtarget &STM = TM.getSubtarget<AMDGPUSubtarget>();
   SIProgramInfo KernelInfo;
   if (STM.getGeneration() > AMDGPUSubtarget::NORTHERN_ISLANDS) {
-    findNumUsedRegistersSI(MF, KernelInfo.NumSGPR, KernelInfo.NumVGPR);
+    getSIProgramInfo(KernelInfo, MF);
     EmitProgramInfoSI(MF, KernelInfo);
   } else {
     EmitProgramInfoR600(MF);
@@ -84,8 +84,10 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
                               SectionKind::getReadOnly());
     OutStreamer.SwitchSection(CommentSection);
 
-    if (STM.getGeneration() > AMDGPUSubtarget::NORTHERN_ISLANDS) {
+    if (STM.getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
       OutStreamer.emitRawComment(" Kernel info:", false);
+      OutStreamer.emitRawComment(" codeLenInByte = " + Twine(KernelInfo.CodeLen),
+                                 false);
       OutStreamer.emitRawComment(" NumSgprs: " + Twine(KernelInfo.NumSGPR),
                                  false);
       OutStreamer.emitRawComment(" NumVgprs: " + Twine(KernelInfo.NumVGPR),
@@ -184,9 +186,9 @@ void AMDGPUAsmPrinter::EmitProgramInfoR600(MachineFunction &MF) {
   }
 }
 
-void AMDGPUAsmPrinter::findNumUsedRegistersSI(MachineFunction &MF,
-                                              unsigned &NumSGPR,
-                                              unsigned &NumVGPR) const {
+void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
+                                        MachineFunction &MF) const {
+  uint64_t CodeSize = 0;
   unsigned MaxSGPR = 0;
   unsigned MaxVGPR = 0;
   bool VCCUsed = false;
@@ -200,6 +202,9 @@ void AMDGPUAsmPrinter::findNumUsedRegistersSI(MachineFunction &MF,
                                                     I != E; ++I) {
       MachineInstr &MI = *I;
 
+      // TODO: CodeSize should account for multiple functions.
+      CodeSize += MI.getDesc().Size;
+
       unsigned numOperands = MI.getNumOperands();
       for (unsigned op_idx = 0; op_idx < numOperands; op_idx++) {
         MachineOperand &MO = MI.getOperand(op_idx);
@@ -274,13 +279,9 @@ void AMDGPUAsmPrinter::findNumUsedRegistersSI(MachineFunction &MF,
   if (VCCUsed)
     MaxSGPR += 2;
 
-  NumSGPR = MaxSGPR;
-  NumVGPR = MaxVGPR;
-}
-
-void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &Out,
-                                        MachineFunction &MF) const {
-  findNumUsedRegistersSI(MF, Out.NumSGPR, Out.NumVGPR);
+  ProgInfo.CodeLen = CodeSize;
+  ProgInfo.NumSGPR = MaxSGPR;
+  ProgInfo.NumVGPR = MaxVGPR;
 }
 
 void AMDGPUAsmPrinter::EmitProgramInfoSI(MachineFunction &MF,
diff --git a/lib/Target/R600/AMDGPUAsmPrinter.h b/lib/Target/R600/AMDGPUAsmPrinter.h
index a2b8337..71adc9a 100644
--- a/lib/Target/R600/AMDGPUAsmPrinter.h
+++ b/lib/Target/R600/AMDGPUAsmPrinter.h
@@ -24,7 +24,12 @@ namespace llvm {
 class AMDGPUAsmPrinter : public AsmPrinter {
 private:
   struct SIProgramInfo {
-    SIProgramInfo() : NumSGPR(0), NumVGPR(0) {}
+    SIProgramInfo() :
+      CodeLen(0),
+      NumSGPR(0),
+      NumVGPR(0) {}
+
+    uint64_t CodeLen;
     unsigned NumSGPR;
     unsigned NumVGPR;
   };
@@ -42,14 +47,14 @@ private:
 public:
   explicit AMDGPUAsmPrinter(TargetMachine &TM, MCStreamer &Streamer);
 
-  virtual bool runOnMachineFunction(MachineFunction &MF);
+  bool runOnMachineFunction(MachineFunction &MF) override;
 
-  virtual const char *getPassName() const {
+  const char *getPassName() const override {
     return "AMDGPU Assembly Printer";
   }
 
   /// Implemented in AMDGPUMCInstLower.cpp
-  virtual void EmitInstruction(const MachineInstr *MI);
+  void EmitInstruction(const MachineInstr *MI) override;
 
 protected:
   bool DisasmEnabled;
diff --git a/lib/Target/R600/AMDGPUCallingConv.td b/lib/Target/R600/AMDGPUCallingConv.td
index 65cdb24..5f8ad8c 100644
--- a/lib/Target/R600/AMDGPUCallingConv.td
+++ b/lib/Target/R600/AMDGPUCallingConv.td
@@ -20,7 +20,7 @@ def CC_SI : CallingConv<[
   CCIfInReg<CCIfType<[f32, i32] , CCAssignToReg<[
     SGPR0, SGPR1, SGPR2, SGPR3, SGPR4, SGPR5, SGPR6, SGPR7,
     SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15,
-    SGPR16
+    SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21
   ]>>>,
 
   CCIfInReg<CCIfType<[i64] , CCAssignToRegWithShadow<
diff --git a/lib/Target/R600/AMDGPUConvertToISA.cpp b/lib/Target/R600/AMDGPUConvertToISA.cpp
index 50297d1..91aeee2 100644
--- a/lib/Target/R600/AMDGPUConvertToISA.cpp
+++ b/lib/Target/R600/AMDGPUConvertToISA.cpp
@@ -31,9 +31,9 @@ public:
   AMDGPUConvertToISAPass(TargetMachine &tm) :
     MachineFunctionPass(ID), TM(tm) { }
 
-  virtual bool runOnMachineFunction(MachineFunction &MF);
+  bool runOnMachineFunction(MachineFunction &MF) override;
 
-  virtual const char *getPassName() const {return "AMDGPU Convert to ISA";}
+  const char *getPassName() const override {return "AMDGPU Convert to ISA";}
 
 };
 
diff --git a/lib/Target/R600/AMDGPUFrameLowering.cpp b/lib/Target/R600/AMDGPUFrameLowering.cpp
index 0325a00..e7e90d3 100644
--- a/lib/Target/R600/AMDGPUFrameLowering.cpp
+++ b/lib/Target/R600/AMDGPUFrameLowering.cpp
@@ -97,7 +97,7 @@ int AMDGPUFrameLowering::getFrameIndexOffset(const MachineFunction &MF,
 const TargetFrameLowering::SpillSlot *
 AMDGPUFrameLowering::getCalleeSavedSpillSlots(unsigned &NumEntries) const {
   NumEntries = 0;
-  return 0;
+  return nullptr;
 }
 void
 AMDGPUFrameLowering::emitPrologue(MachineFunction &MF) const {
diff --git a/lib/Target/R600/AMDGPUFrameLowering.h b/lib/Target/R600/AMDGPUFrameLowering.h
index cf5742e..d18ede5 100644
--- a/lib/Target/R600/AMDGPUFrameLowering.h
+++ b/lib/Target/R600/AMDGPUFrameLowering.h
@@ -33,12 +33,13 @@ public:
 
   /// \returns The number of 32-bit sub-registers that are used when storing
   /// values to the stack.
-  virtual unsigned getStackWidth(const MachineFunction &MF) const;
-  virtual int getFrameIndexOffset(const MachineFunction &MF, int FI) const;
-  virtual const SpillSlot *getCalleeSavedSpillSlots(unsigned &NumEntries) const;
-  virtual void emitPrologue(MachineFunction &MF) const;
-  virtual void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;
-  virtual bool hasFP(const MachineFunction &MF) const;
+  unsigned getStackWidth(const MachineFunction &MF) const;
+  int getFrameIndexOffset(const MachineFunction &MF, int FI) const override;
+  const SpillSlot *
+    getCalleeSavedSpillSlots(unsigned &NumEntries) const override;
+  void emitPrologue(MachineFunction &MF) const override;
+  void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+  bool hasFP(const MachineFunction &MF) const override;
 };
 } // namespace llvm
 #endif // AMDILFRAME_LOWERING_H
diff --git a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp b/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
index e8c5f5b..f1f0bfa 100644
--- a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
+++ b/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
@@ -16,15 +16,11 @@
 #include "AMDGPURegisterInfo.h"
 #include "R600InstrInfo.h"
 #include "SIISelLowering.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/FunctionLoweringInfo.h"
 #include "llvm/CodeGen/PseudoSourceValue.h"
 #include "llvm/CodeGen/SelectionDAG.h"
 #include "llvm/CodeGen/SelectionDAGISel.h"
-#include "llvm/IR/ValueMap.h"
-#include "llvm/Support/Compiler.h"
-#include <list>
-#include <queue>
+#include "llvm/IR/Function.h"
 
 using namespace llvm;
 
@@ -43,11 +39,12 @@ public:
   AMDGPUDAGToDAGISel(TargetMachine &TM);
   virtual ~AMDGPUDAGToDAGISel();
 
-  SDNode *Select(SDNode *N);
-  virtual const char *getPassName() const;
-  virtual void PostprocessISelDAG();
+  SDNode *Select(SDNode *N) override;
+  const char *getPassName() const override;
+  void PostprocessISelDAG() override;
 
 private:
+  bool isInlineImmediate(SDNode *N) const;
   inline SDValue getSmallIPtrImm(unsigned Imm);
   bool FoldOperand(SDValue &Src, SDValue &Sel, SDValue &Neg, SDValue &Abs,
                    const R600InstrInfo *TII);
@@ -58,11 +55,9 @@ private:
   bool SelectADDRParam(SDValue Addr, SDValue& R1, SDValue& R2);
   bool SelectADDR(SDValue N, SDValue &R1, SDValue &R2);
   bool SelectADDR64(SDValue N, SDValue &R1, SDValue &R2);
-  SDValue SimplifyI24(SDValue &Op);
-  bool SelectI24(SDValue Addr, SDValue &Op);
-  bool SelectU24(SDValue Addr, SDValue &Op);
 
   static bool checkType(const Value *ptr, unsigned int addrspace);
+  static bool checkPrivateAddress(const MachineMemOperand *Op);
 
   static bool isGlobalStore(const StoreSDNode *N);
   static bool isPrivateStore(const StoreSDNode *N);
@@ -77,10 +72,15 @@ private:
   bool isLocalLoad(const LoadSDNode *N) const;
   bool isRegionLoad(const LoadSDNode *N) const;
 
+  /// \returns True if the current basic block being selected is at control
+  ///          flow depth 0.  Meaning that the current block dominates the
+  //           exit block.
+  bool isCFDepth0() const;
+
   const TargetRegisterClass *getOperandRegClass(SDNode *N, unsigned OpNo) const;
   bool SelectGlobalValueConstantOffset(SDValue Addr, SDValue& IntPtr);
-  bool SelectGlobalValueVariableOffset(SDValue Addr,
-      SDValue &BaseReg, SDValue& Offset);
+  bool SelectGlobalValueVariableOffset(SDValue Addr, SDValue &BaseReg,
+                                       SDValue& Offset);
   bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset);
   bool SelectADDRIndirect(SDValue Addr, SDValue &Base, SDValue &Offset);
 
@@ -91,8 +91,7 @@ private:
 
 /// \brief This pass converts a legalized DAG into a AMDGPU-specific
 // DAG, ready for instruction scheduling.
-FunctionPass *llvm::createAMDGPUISelDag(TargetMachine &TM
-                                       ) {
+FunctionPass *llvm::createAMDGPUISelDag(TargetMachine &TM) {
   return new AMDGPUDAGToDAGISel(TM);
 }
 
@@ -103,32 +102,39 @@ AMDGPUDAGToDAGISel::AMDGPUDAGToDAGISel(TargetMachine &TM)
 AMDGPUDAGToDAGISel::~AMDGPUDAGToDAGISel() {
 }
 
+bool AMDGPUDAGToDAGISel::isInlineImmediate(SDNode *N) const {
+  const SITargetLowering *TL
+      = static_cast<const SITargetLowering *>(getTargetLowering());
+  return TL->analyzeImmediate(N) == 0;
+}
+
 /// \brief Determine the register class for \p OpNo
 /// \returns The register class of the virtual register that will be used for
 /// the given operand number \OpNo or NULL if the register class cannot be
 /// determined.
 const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N,
                                                           unsigned OpNo) const {
-  if (!N->isMachineOpcode()) {
-    return NULL;
-  }
+  if (!N->isMachineOpcode())
+    return nullptr;
+
   switch (N->getMachineOpcode()) {
   default: {
     const MCInstrDesc &Desc = TM.getInstrInfo()->get(N->getMachineOpcode());
     unsigned OpIdx = Desc.getNumDefs() + OpNo;
     if (OpIdx >= Desc.getNumOperands())
-      return NULL;
+      return nullptr;
     int RegClass = Desc.OpInfo[OpIdx].RegClass;
-    if (RegClass == -1) {
-      return NULL;
-    }
+    if (RegClass == -1)
+      return nullptr;
+
     return TM.getRegisterInfo()->getRegClass(RegClass);
   }
   case AMDGPU::REG_SEQUENCE: {
-    const TargetRegisterClass *SuperRC = TM.getRegisterInfo()->getRegClass(
-                      cast<ConstantSDNode>(N->getOperand(0))->getZExtValue());
-    unsigned SubRegIdx =
-            dyn_cast<ConstantSDNode>(N->getOperand(OpNo + 1))->getZExtValue();
+    unsigned RCID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
+    const TargetRegisterClass *SuperRC = TM.getRegisterInfo()->getRegClass(RCID);
+
+    SDValue SubRegOp = N->getOperand(OpNo + 1);
+    unsigned SubRegIdx = cast<ConstantSDNode>(SubRegOp)->getZExtValue();
     return TM.getRegisterInfo()->getSubClassWithSubReg(SuperRC, SubRegIdx);
   }
   }
@@ -139,7 +145,7 @@ SDValue AMDGPUDAGToDAGISel::getSmallIPtrImm(unsigned int Imm) {
 }
 
 bool AMDGPUDAGToDAGISel::SelectADDRParam(
-    SDValue Addr, SDValue& R1, SDValue& R2) {
+  SDValue Addr, SDValue& R1, SDValue& R2) {
 
   if (Addr.getOpcode() == ISD::FrameIndex) {
     if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
@@ -196,15 +202,16 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
   unsigned int Opc = N->getOpcode();
   if (N->isMachineOpcode()) {
     N->setNodeId(-1);
-    return NULL;   // Already selected.
+    return nullptr;   // Already selected.
   }
+
+  const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
   switch (Opc) {
   default: break;
   // We are selecting i64 ADD here instead of custom lower it during
   // DAG legalization, so we can fold some i64 ADDs used for address
   // calculation into the LOAD and STORE instructions.
   case ISD::ADD: {
-    const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
     if (N->getValueType(0) != MVT::i64 ||
         ST.getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS)
       break;
@@ -232,12 +239,13 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
     AddLoArgs.push_back(SDValue(Lo0, 0));
     AddLoArgs.push_back(SDValue(Lo1, 0));
 
-    SDNode *AddLo = CurDAG->getMachineNode(AMDGPU::S_ADD_I32, DL,
-                                           VTList, AddLoArgs);
+    SDNode *AddLo = CurDAG->getMachineNode(
+        isCFDepth0() ? AMDGPU::S_ADD_I32 : AMDGPU::V_ADD_I32_e32,
+        DL, VTList, AddLoArgs);
     SDValue Carry = SDValue(AddLo, 1);
-    SDNode *AddHi = CurDAG->getMachineNode(AMDGPU::S_ADDC_U32, DL,
-                                           MVT::i32, SDValue(Hi0, 0),
-                                           SDValue(Hi1, 0), Carry);
+    SDNode *AddHi = CurDAG->getMachineNode(
+        isCFDepth0() ? AMDGPU::S_ADDC_U32 : AMDGPU::V_ADDC_U32_e32,
+        DL, MVT::i32, SDValue(Hi0, 0), SDValue(Hi1, 0), Carry);
 
     SDValue Args[5] = {
       CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, MVT::i32),
@@ -246,11 +254,10 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
       SDValue(AddHi,0),
       Sub1,
     };
-    return CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, MVT::i64, Args, 5);
+    return CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, MVT::i64, Args);
   }
   case ISD::BUILD_VECTOR: {
     unsigned RegClassID;
-    const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
     const AMDGPURegisterInfo *TRI =
                    static_cast<const AMDGPURegisterInfo*>(TM.getRegisterInfo());
     const SIRegisterInfo *SIRI =
@@ -316,7 +323,7 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
     // 16 = Max Num Vector Elements
     // 2 = 2 REG_SEQUENCE operands per element (value, subreg index)
     // 1 = Vector Register Class
-    SDValue RegSeqArgs[16 * 2 + 1];
+    SmallVector<SDValue, 16 * 2 + 1> RegSeqArgs(N->getNumOperands() * 2 + 1);
 
     RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, MVT::i32);
     bool IsRegSeq = true;
@@ -333,11 +340,10 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
     if (!IsRegSeq)
       break;
     return CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, N->getVTList(),
-        RegSeqArgs, 2 * N->getNumOperands() + 1);
+                                RegSeqArgs);
   }
   case ISD::BUILD_PAIR: {
     SDValue RC, SubReg0, SubReg1;
-    const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
     if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
       break;
     }
@@ -346,7 +352,7 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
       SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0_sub1, MVT::i32);
       SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub2_sub3, MVT::i32);
     } else if (N->getValueType(0) == MVT::i64) {
-      RC = CurDAG->getTargetConstant(AMDGPU::VSrc_64RegClassID, MVT::i32);
+      RC = CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, MVT::i32);
       SubReg0 = CurDAG->getTargetConstant(AMDGPU::sub0, MVT::i32);
       SubReg1 = CurDAG->getTargetConstant(AMDGPU::sub1, MVT::i32);
     } else {
@@ -357,8 +363,37 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
     return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE,
                                   SDLoc(N), N->getValueType(0), Ops);
   }
-  case AMDGPUISD::REGISTER_LOAD: {
+
+  case ISD::Constant:
+  case ISD::ConstantFP: {
     const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
+    if (ST.getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS ||
+        N->getValueType(0).getSizeInBits() != 64 || isInlineImmediate(N))
+      break;
+
+    uint64_t Imm;
+    if (ConstantFPSDNode *FP = dyn_cast<ConstantFPSDNode>(N))
+      Imm = FP->getValueAPF().bitcastToAPInt().getZExtValue();
+    else {
+      ConstantSDNode *C = cast<ConstantSDNode>(N);
+      Imm = C->getZExtValue();
+    }
+
+    SDNode *Lo = CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SDLoc(N), MVT::i32,
+                                CurDAG->getConstant(Imm & 0xFFFFFFFF, MVT::i32));
+    SDNode *Hi = CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SDLoc(N), MVT::i32,
+                                CurDAG->getConstant(Imm >> 32, MVT::i32));
+    const SDValue Ops[] = {
+      CurDAG->getTargetConstant(AMDGPU::SReg_64RegClassID, MVT::i32),
+      SDValue(Lo, 0), CurDAG->getTargetConstant(AMDGPU::sub0, MVT::i32),
+      SDValue(Hi, 0), CurDAG->getTargetConstant(AMDGPU::sub1, MVT::i32)
+    };
+
+    return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, SDLoc(N),
+                                  N->getValueType(0), Ops);
+  }
+
+  case AMDGPUISD::REGISTER_LOAD: {
     if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS)
       break;
     SDValue Addr, Offset;
@@ -375,7 +410,6 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
                                   Ops);
   }
   case AMDGPUISD::REGISTER_STORE: {
-    const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
     if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS)
       break;
     SDValue Addr, Offset;
@@ -391,42 +425,95 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
                                         CurDAG->getVTList(MVT::Other),
                                         Ops);
   }
+
+  case AMDGPUISD::BFE_I32:
+  case AMDGPUISD::BFE_U32: {
+    if (ST.getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS)
+      break;
+
+    // There is a scalar version available, but unlike the vector version which
+    // has a separate operand for the offset and width, the scalar version packs
+    // the width and offset into a single operand. Try to move to the scalar
+    // version if the offsets are constant, so that we can try to keep extended
+    // loads of kernel arguments in SGPRs.
+
+    // TODO: Technically we could try to pattern match scalar bitshifts of
+    // dynamic values, but it's probably not useful.
+    ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
+    if (!Offset)
+      break;
+
+    ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
+    if (!Width)
+      break;
+
+    bool Signed = Opc == AMDGPUISD::BFE_I32;
+
+    // Transformation function, pack the offset and width of a BFE into
+    // the format expected by the S_BFE_I32 / S_BFE_U32. In the second
+    // source, bits [5:0] contain the offset and bits [22:16] the width.
+
+    uint32_t OffsetVal = Offset->getZExtValue();
+    uint32_t WidthVal = Width->getZExtValue();
+
+    uint32_t PackedVal = OffsetVal | WidthVal << 16;
+
+    SDValue PackedOffsetWidth = CurDAG->getTargetConstant(PackedVal, MVT::i32);
+    return CurDAG->getMachineNode(Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32,
+                                  SDLoc(N),
+                                  MVT::i32,
+                                  N->getOperand(0),
+                                  PackedOffsetWidth);
+
+  }
   }
   return SelectCode(N);
 }
 
 
-bool AMDGPUDAGToDAGISel::checkType(const Value *ptr, unsigned int addrspace) {
-  if (!ptr) {
+bool AMDGPUDAGToDAGISel::checkType(const Value *Ptr, unsigned AS) {
+  assert(AS != 0 && "Use checkPrivateAddress instead.");
+  if (!Ptr)
     return false;
-  }
-  Type *ptrType = ptr->getType();
-  return dyn_cast<PointerType>(ptrType)->getAddressSpace() == addrspace;
+
+  return Ptr->getType()->getPointerAddressSpace() == AS;
+}
+
+bool AMDGPUDAGToDAGISel::checkPrivateAddress(const MachineMemOperand *Op) {
+  if (Op->getPseudoValue())
+    return true;
+
+  if (PointerType *PT = dyn_cast<PointerType>(Op->getValue()->getType()))
+    return PT->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS;
+
+  return false;
 }
 
 bool AMDGPUDAGToDAGISel::isGlobalStore(const StoreSDNode *N) {
-  return checkType(N->getSrcValue(), AMDGPUAS::GLOBAL_ADDRESS);
+  return checkType(N->getMemOperand()->getValue(), AMDGPUAS::GLOBAL_ADDRESS);
 }
 
 bool AMDGPUDAGToDAGISel::isPrivateStore(const StoreSDNode *N) {
-  return (!checkType(N->getSrcValue(), AMDGPUAS::LOCAL_ADDRESS)
-          && !checkType(N->getSrcValue(), AMDGPUAS::GLOBAL_ADDRESS)
-          && !checkType(N->getSrcValue(), AMDGPUAS::REGION_ADDRESS));
+  const Value *MemVal = N->getMemOperand()->getValue();
+  return (!checkType(MemVal, AMDGPUAS::LOCAL_ADDRESS) &&
+          !checkType(MemVal, AMDGPUAS::GLOBAL_ADDRESS) &&
+          !checkType(MemVal, AMDGPUAS::REGION_ADDRESS));
 }
 
 bool AMDGPUDAGToDAGISel::isLocalStore(const StoreSDNode *N) {
-  return checkType(N->getSrcValue(), AMDGPUAS::LOCAL_ADDRESS);
+  return checkType(N->getMemOperand()->getValue(), AMDGPUAS::LOCAL_ADDRESS);
 }
 
 bool AMDGPUDAGToDAGISel::isRegionStore(const StoreSDNode *N) {
-  return checkType(N->getSrcValue(), AMDGPUAS::REGION_ADDRESS);
+  return checkType(N->getMemOperand()->getValue(), AMDGPUAS::REGION_ADDRESS);
 }
 
 bool AMDGPUDAGToDAGISel::isConstantLoad(const LoadSDNode *N, int CbId) const {
-  if (CbId == -1) {
-    return checkType(N->getSrcValue(), AMDGPUAS::CONSTANT_ADDRESS);
-  }
-  return checkType(N->getSrcValue(), AMDGPUAS::CONSTANT_BUFFER_0 + CbId);
+  const Value *MemVal = N->getMemOperand()->getValue();
+  if (CbId == -1)
+    return checkType(MemVal, AMDGPUAS::CONSTANT_ADDRESS);
+
+  return checkType(MemVal, AMDGPUAS::CONSTANT_BUFFER_0 + CbId);
 }
 
 bool AMDGPUDAGToDAGISel::isGlobalLoad(const LoadSDNode *N) const {
@@ -437,27 +524,26 @@ bool AMDGPUDAGToDAGISel::isGlobalLoad(const LoadSDNode *N) const {
       return true;
     }
   }
-  return checkType(N->getSrcValue(), AMDGPUAS::GLOBAL_ADDRESS);
+  return checkType(N->getMemOperand()->getValue(), AMDGPUAS::GLOBAL_ADDRESS);
 }
 
 bool AMDGPUDAGToDAGISel::isParamLoad(const LoadSDNode *N) const {
-  return checkType(N->getSrcValue(), AMDGPUAS::PARAM_I_ADDRESS);
+  return checkType(N->getMemOperand()->getValue(), AMDGPUAS::PARAM_I_ADDRESS);
 }
 
 bool AMDGPUDAGToDAGISel::isLocalLoad(const  LoadSDNode *N) const {
-  return checkType(N->getSrcValue(), AMDGPUAS::LOCAL_ADDRESS);
+  return checkType(N->getMemOperand()->getValue(), AMDGPUAS::LOCAL_ADDRESS);
 }
 
 bool AMDGPUDAGToDAGISel::isRegionLoad(const  LoadSDNode *N) const {
-  return checkType(N->getSrcValue(), AMDGPUAS::REGION_ADDRESS);
+  return checkType(N->getMemOperand()->getValue(), AMDGPUAS::REGION_ADDRESS);
 }
 
 bool AMDGPUDAGToDAGISel::isCPLoad(const LoadSDNode *N) const {
   MachineMemOperand *MMO = N->getMemOperand();
-  if (checkType(N->getSrcValue(), AMDGPUAS::PRIVATE_ADDRESS)) {
+  if (checkPrivateAddress(N->getMemOperand())) {
     if (MMO) {
-      const Value *V = MMO->getValue();
-      const PseudoSourceValue *PSV = dyn_cast<PseudoSourceValue>(V);
+      const PseudoSourceValue *PSV = MMO->getPseudoValue();
       if (PSV && PSV == PseudoSourceValue::getConstantPool()) {
         return true;
       }
@@ -467,24 +553,34 @@ bool AMDGPUDAGToDAGISel::isCPLoad(const LoadSDNode *N) const {
 }
 
 bool AMDGPUDAGToDAGISel::isPrivateLoad(const LoadSDNode *N) const {
-  if (checkType(N->getSrcValue(), AMDGPUAS::PRIVATE_ADDRESS)) {
+  if (checkPrivateAddress(N->getMemOperand())) {
     // Check to make sure we are not a constant pool load or a constant load
     // that is marked as a private load
     if (isCPLoad(N) || isConstantLoad(N, -1)) {
       return false;
     }
   }
-  if (!checkType(N->getSrcValue(), AMDGPUAS::LOCAL_ADDRESS)
-      && !checkType(N->getSrcValue(), AMDGPUAS::GLOBAL_ADDRESS)
-      && !checkType(N->getSrcValue(), AMDGPUAS::REGION_ADDRESS)
-      && !checkType(N->getSrcValue(), AMDGPUAS::CONSTANT_ADDRESS)
-      && !checkType(N->getSrcValue(), AMDGPUAS::PARAM_D_ADDRESS)
-      && !checkType(N->getSrcValue(), AMDGPUAS::PARAM_I_ADDRESS)) {
+
+  const Value *MemVal = N->getMemOperand()->getValue();
+  if (!checkType(MemVal, AMDGPUAS::LOCAL_ADDRESS) &&
+      !checkType(MemVal, AMDGPUAS::GLOBAL_ADDRESS) &&
+      !checkType(MemVal, AMDGPUAS::REGION_ADDRESS) &&
+      !checkType(MemVal, AMDGPUAS::CONSTANT_ADDRESS) &&
+      !checkType(MemVal, AMDGPUAS::PARAM_D_ADDRESS) &&
+      !checkType(MemVal, AMDGPUAS::PARAM_I_ADDRESS)){
     return true;
   }
   return false;
 }
 
+bool AMDGPUDAGToDAGISel::isCFDepth0() const {
+  // FIXME: Figure out a way to use DominatorTree analysis here.
+  const BasicBlock *CurBlock = FuncInfo->MBB->getBasicBlock();
+  const Function *Fn = FuncInfo->Fn;
+  return &Fn->front() == CurBlock || &Fn->back() == CurBlock;
+}
+
+
 const char *AMDGPUDAGToDAGISel::getPassName() const {
   return "AMDGPU DAG->DAG Pattern Instruction Selection";
 }
@@ -499,7 +595,7 @@ const char *AMDGPUDAGToDAGISel::getPassName() const {
 //===----------------------------------------------------------------------===//
 
 bool AMDGPUDAGToDAGISel::SelectGlobalValueConstantOffset(SDValue Addr,
-    SDValue& IntPtr) {
+                                                         SDValue& IntPtr) {
   if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Addr)) {
     IntPtr = CurDAG->getIntPtrConstant(Cst->getZExtValue() / 4, true);
     return true;
@@ -509,7 +605,7 @@ bool AMDGPUDAGToDAGISel::SelectGlobalValueConstantOffset(SDValue Addr,
 
 bool AMDGPUDAGToDAGISel::SelectGlobalValueVariableOffset(SDValue Addr,
     SDValue& BaseReg, SDValue &Offset) {
-  if (!dyn_cast<ConstantSDNode>(Addr)) {
+  if (!isa<ConstantSDNode>(Addr)) {
     BaseReg = Addr;
     Offset = CurDAG->getIntPtrConstant(0, true);
     return true;
@@ -519,7 +615,7 @@ bool AMDGPUDAGToDAGISel::SelectGlobalValueVariableOffset(SDValue Addr,
 
 bool AMDGPUDAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
                                            SDValue &Offset) {
-  ConstantSDNode * IMMOffset;
+  ConstantSDNode *IMMOffset;
 
   if (Addr.getOpcode() == ISD::ADD
       && (IMMOffset = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))
@@ -563,52 +659,9 @@ bool AMDGPUDAGToDAGISel::SelectADDRIndirect(SDValue Addr, SDValue &Base,
   return true;
 }
 
-SDValue AMDGPUDAGToDAGISel::SimplifyI24(SDValue &Op) {
-  APInt Demanded = APInt(32, 0x00FFFFFF);
-  APInt KnownZero, KnownOne;
-  TargetLowering::TargetLoweringOpt TLO(*CurDAG, true, true);
-  const TargetLowering *TLI = getTargetLowering();
-  if (TLI->SimplifyDemandedBits(Op, Demanded, KnownZero, KnownOne, TLO)) {
-    CurDAG->ReplaceAllUsesWith(Op, TLO.New);
-    CurDAG->RepositionNode(Op.getNode(), TLO.New.getNode());
-    return SimplifyI24(TLO.New);
-  } else {
-    return  Op;
-  }
-}
-
-bool AMDGPUDAGToDAGISel::SelectI24(SDValue Op, SDValue &I24) {
-
-  assert(Op.getValueType() == MVT::i32);
-
-  if (CurDAG->ComputeNumSignBits(Op) == 9) {
-    I24 = SimplifyI24(Op);
-    return true;
-  }
-  return false;
-}
-
-bool AMDGPUDAGToDAGISel::SelectU24(SDValue Op, SDValue &U24) {
-  APInt KnownZero;
-  APInt KnownOne;
-  CurDAG->ComputeMaskedBits(Op, KnownZero, KnownOne);
-
-  assert (Op.getValueType() == MVT::i32);
-
-  // ANY_EXTEND and EXTLOAD operations can only be done on types smaller than
-  // i32.  These smaller types are legal to use with the i24 instructions.
-  if ((KnownZero & APInt(KnownZero.getBitWidth(), 0xFF000000)) == 0xFF000000 ||
-       Op.getOpcode() == ISD::ANY_EXTEND ||
-       ISD::isEXTLoad(Op.getNode())) {
-    U24 = SimplifyI24(Op);
-    return true;
-  }
-  return false;
-}
-
 void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
   const AMDGPUTargetLowering& Lowering =
-    (*(const AMDGPUTargetLowering*)getTargetLowering());
+    *static_cast<const AMDGPUTargetLowering*>(getTargetLowering());
   bool IsModified = false;
   do {
     IsModified = false;
diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp
index 183725c..6c443ea 100644
--- a/lib/Target/R600/AMDGPUISelLowering.cpp
+++ b/lib/Target/R600/AMDGPUISelLowering.cpp
@@ -28,8 +28,50 @@
 #include "llvm/CodeGen/SelectionDAG.h"
 #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
 #include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/DiagnosticPrinter.h"
 
 using namespace llvm;
+
+namespace {
+
+/// Diagnostic information for unimplemented or unsupported feature reporting.
+class DiagnosticInfoUnsupported : public DiagnosticInfo {
+private:
+  const Twine &Description;
+  const Function &Fn;
+
+  static int KindID;
+
+  static int getKindID() {
+    if (KindID == 0)
+      KindID = llvm::getNextAvailablePluginDiagnosticKind();
+    return KindID;
+  }
+
+public:
+  DiagnosticInfoUnsupported(const Function &Fn, const Twine &Desc,
+                          DiagnosticSeverity Severity = DS_Error)
+    : DiagnosticInfo(getKindID(), Severity),
+      Description(Desc),
+      Fn(Fn) { }
+
+  const Function &getFunction() const { return Fn; }
+  const Twine &getDescription() const { return Description; }
+
+  void print(DiagnosticPrinter &DP) const override {
+    DP << "unsupported " << getDescription() << " in " << Fn.getName();
+  }
+
+  static bool classof(const DiagnosticInfo *DI) {
+    return DI->getKind() == getKindID();
+  }
+};
+
+int DiagnosticInfoUnsupported::KindID = 0;
+}
+
+
 static bool allocateStack(unsigned ValNo, MVT ValVT, MVT LocVT,
                       CCValAssign::LocInfo LocInfo,
                       ISD::ArgFlagsTy ArgFlags, CCState &State) {
@@ -88,6 +130,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
   setOperationAction(ISD::STORE, MVT::f64, Promote);
   AddPromotedToType(ISD::STORE, MVT::f64, MVT::i64);
 
+  setOperationAction(ISD::STORE, MVT::v2f64, Promote);
+  AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v2i64);
+
   // Custom lowering of vector stores is required for local address space
   // stores.
   setOperationAction(ISD::STORE, MVT::v4i32, Custom);
@@ -103,6 +148,8 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
   // handle 64-bit stores.
   setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
 
+  setTruncStoreAction(MVT::i64, MVT::i16, Expand);
+  setTruncStoreAction(MVT::i64, MVT::i8, Expand);
   setTruncStoreAction(MVT::i64, MVT::i1, Expand);
   setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand);
   setTruncStoreAction(MVT::v4i64, MVT::v4i1, Expand);
@@ -126,6 +173,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
   setOperationAction(ISD::LOAD, MVT::f64, Promote);
   AddPromotedToType(ISD::LOAD, MVT::f64, MVT::i64);
 
+  setOperationAction(ISD::LOAD, MVT::v2f64, Promote);
+  AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v2i64);
+
   setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i32, Custom);
   setOperationAction(ISD::CONCAT_VECTORS, MVT::v4f32, Custom);
   setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i32, Custom);
@@ -152,15 +202,19 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
 
   setOperationAction(ISD::BR_CC, MVT::i1, Expand);
 
+  setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
+
   setOperationAction(ISD::FNEG, MVT::v2f32, Expand);
   setOperationAction(ISD::FNEG, MVT::v4f32, Expand);
 
   setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
 
   setOperationAction(ISD::MUL, MVT::i64, Expand);
+  setOperationAction(ISD::SUB, MVT::i64, Expand);
 
   setOperationAction(ISD::UDIV, MVT::i32, Expand);
   setOperationAction(ISD::UDIVREM, MVT::i32, Custom);
+  setOperationAction(ISD::UDIVREM, MVT::i64, Custom);
   setOperationAction(ISD::UREM, MVT::i32, Expand);
   setOperationAction(ISD::VSELECT, MVT::v2f32, Expand);
   setOperationAction(ISD::VSELECT, MVT::v4f32, Expand);
@@ -168,10 +222,8 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
   static const MVT::SimpleValueType IntTypes[] = {
     MVT::v2i32, MVT::v4i32
   };
-  const size_t NumIntTypes = array_lengthof(IntTypes);
 
-  for (unsigned int x  = 0; x < NumIntTypes; ++x) {
-    MVT::SimpleValueType VT = IntTypes[x];
+  for (MVT VT : IntTypes) {
     //Expand the following operations for the current type by default
     setOperationAction(ISD::ADD,  VT, Expand);
     setOperationAction(ISD::AND,  VT, Expand);
@@ -195,12 +247,11 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
   static const MVT::SimpleValueType FloatTypes[] = {
     MVT::v2f32, MVT::v4f32
   };
-  const size_t NumFloatTypes = array_lengthof(FloatTypes);
 
-  for (unsigned int x = 0; x < NumFloatTypes; ++x) {
-    MVT::SimpleValueType VT = FloatTypes[x];
+  for (MVT VT : FloatTypes) {
     setOperationAction(ISD::FABS, VT, Expand);
     setOperationAction(ISD::FADD, VT, Expand);
+    setOperationAction(ISD::FCOS, VT, Expand);
     setOperationAction(ISD::FDIV, VT, Expand);
     setOperationAction(ISD::FPOW, VT, Expand);
     setOperationAction(ISD::FFLOOR, VT, Expand);
@@ -208,25 +259,13 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
     setOperationAction(ISD::FMUL, VT, Expand);
     setOperationAction(ISD::FRINT, VT, Expand);
     setOperationAction(ISD::FSQRT, VT, Expand);
+    setOperationAction(ISD::FSIN, VT, Expand);
     setOperationAction(ISD::FSUB, VT, Expand);
     setOperationAction(ISD::SELECT, VT, Expand);
   }
 
-  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Custom);
-  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom);
-  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Custom);
-
-  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Custom);
-  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom);
-  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Custom);
-
-  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Custom);
-  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom);
-  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Custom);
-
-  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Custom);
-
-  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Custom);
+  setTargetDAGCombine(ISD::MUL);
+  setTargetDAGCombine(ISD::SELECT_CC);
 }
 
 //===----------------------------------------------------------------------===//
@@ -325,6 +364,25 @@ SDValue AMDGPUTargetLowering::LowerReturn(
 // Target specific lowering
 //===---------------------------------------------------------------------===//
 
+SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
+                                        SmallVectorImpl<SDValue> &InVals) const {
+  SDValue Callee = CLI.Callee;
+  SelectionDAG &DAG = CLI.DAG;
+
+  const Function &Fn = *DAG.getMachineFunction().getFunction();
+
+  StringRef FuncName("<unknown>");
+
+  if (const ExternalSymbolSDNode *G = dyn_cast<ExternalSymbolSDNode>(Callee))
+    FuncName = G->getSymbol();
+  else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
+    FuncName = G->getGlobal()->getName();
+
+  DiagnosticInfoUnsupported NoCalls(Fn, "call to function " + FuncName);
+  DAG.getContext()->diagnose(NoCalls);
+  return SDValue();
+}
+
 SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG)
     const {
   switch (Op.getOpcode()) {
@@ -361,12 +419,111 @@ void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N,
     // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
     // nothing here and let the illegal result integer be handled normally.
     return;
+  case ISD::UDIV: {
+    SDValue Op = SDValue(N, 0);
+    SDLoc DL(Op);
+    EVT VT = Op.getValueType();
+    SDValue UDIVREM = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT),
+      N->getOperand(0), N->getOperand(1));
+    Results.push_back(UDIVREM);
+    break;
+  }
+  case ISD::UREM: {
+    SDValue Op = SDValue(N, 0);
+    SDLoc DL(Op);
+    EVT VT = Op.getValueType();
+    SDValue UDIVREM = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT),
+      N->getOperand(0), N->getOperand(1));
+    Results.push_back(UDIVREM.getValue(1));
+    break;
+  }
+  case ISD::UDIVREM: {
+    SDValue Op = SDValue(N, 0);
+    SDLoc DL(Op);
+    EVT VT = Op.getValueType();
+    EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
+
+    SDValue one = DAG.getConstant(1, HalfVT);
+    SDValue zero = DAG.getConstant(0, HalfVT);
+
+    //HiLo split
+    SDValue LHS = N->getOperand(0);
+    SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, zero);
+    SDValue LHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, one);
+
+    SDValue RHS = N->getOperand(1);
+    SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, zero);
+    SDValue RHS_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, one);
+
+    // Get Speculative values
+    SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
+    SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);
+
+    SDValue REM_Hi = zero;
+    SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, zero, REM_Part, LHS_Hi, ISD::SETEQ);
+
+    SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, zero, DIV_Part, zero, ISD::SETEQ);
+    SDValue DIV_Lo = zero;
+
+    const unsigned halfBitWidth = HalfVT.getSizeInBits();
+
+    for (unsigned i = 0; i < halfBitWidth; ++i) {
+      SDValue POS = DAG.getConstant(halfBitWidth - i - 1, HalfVT);
+      // Get Value of high bit
+      SDValue HBit;
+      if (halfBitWidth == 32 && Subtarget->hasBFE()) {
+        HBit = DAG.getNode(AMDGPUISD::BFE_U32, DL, HalfVT, LHS_Lo, POS, one);
+      } else {
+        HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
+        HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, one);
+      }
+
+      SDValue Carry = DAG.getNode(ISD::SRL, DL, HalfVT, REM_Lo,
+        DAG.getConstant(halfBitWidth - 1, HalfVT));
+      REM_Hi = DAG.getNode(ISD::SHL, DL, HalfVT, REM_Hi, one);
+      REM_Hi = DAG.getNode(ISD::OR, DL, HalfVT, REM_Hi, Carry);
+
+      REM_Lo = DAG.getNode(ISD::SHL, DL, HalfVT, REM_Lo, one);
+      REM_Lo = DAG.getNode(ISD::OR, DL, HalfVT, REM_Lo, HBit);
+
+
+      SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, REM_Hi);
+
+      SDValue BIT = DAG.getConstant(1 << (halfBitWidth - i - 1), HalfVT);
+      SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, zero, ISD::SETGE);
+
+      DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);
+
+      // Update REM
+
+      SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
+
+      REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETGE);
+      REM_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, REM, zero);
+      REM_Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, REM, one);
+    }
 
+    SDValue REM = DAG.getNode(ISD::BUILD_PAIR, DL, VT, REM_Lo, REM_Hi);
+    SDValue DIV = DAG.getNode(ISD::BUILD_PAIR, DL, VT, DIV_Lo, DIV_Hi);
+    Results.push_back(DIV);
+    Results.push_back(REM);
+    break;
+  }
   default:
     return;
   }
 }
 
+// FIXME: This implements accesses to initialized globals in the constant
+// address space by copying them to private and accessing that. It does not
+// properly handle illegal types or vectors. The private vector loads are not
+// scalarized, and the illegal scalars hit an assertion. This technique will not
+// work well with large initializers, and this should eventually be
+// removed. Initialized globals should be placed into a data section that the
+// runtime will load into a buffer before the kernel is executed. Uses of the
+// global need to be replaced with a pointer loaded from an implicit kernel
+// argument into this buffer holding the copy of the data, which will remove the
+// need for any of this.
 SDValue AMDGPUTargetLowering::LowerConstantInitializer(const Constant* Init,
                                                        const GlobalValue *GV,
                                                        const SDValue &InitPtr,
@@ -380,29 +537,60 @@ SDValue AMDGPUTargetLowering::LowerConstantInitializer(const Constant* Init,
     return DAG.getStore(Chain, DL,  DAG.getConstant(*CI, VT), InitPtr,
                  MachinePointerInfo(UndefValue::get(PtrTy)), false, false,
                  TD->getPrefTypeAlignment(CI->getType()));
-  } else if (const ConstantFP *CFP = dyn_cast<ConstantFP>(Init)) {
+  }
+
+  if (const ConstantFP *CFP = dyn_cast<ConstantFP>(Init)) {
     EVT VT = EVT::getEVT(CFP->getType());
     PointerType *PtrTy = PointerType::get(CFP->getType(), 0);
     return DAG.getStore(Chain, DL, DAG.getConstantFP(*CFP, VT), InitPtr,
                  MachinePointerInfo(UndefValue::get(PtrTy)), false, false,
                  TD->getPrefTypeAlignment(CFP->getType()));
-  } else if (Init->getType()->isAggregateType()) {
+  }
+
+  Type *InitTy = Init->getType();
+  if (StructType *ST = dyn_cast<StructType>(InitTy)) {
+    const StructLayout *SL = TD->getStructLayout(ST);
+
     EVT PtrVT = InitPtr.getValueType();
-    unsigned NumElements = Init->getType()->getArrayNumElements();
+    SmallVector<SDValue, 8> Chains;
+
+    for (unsigned I = 0, N = ST->getNumElements(); I != N; ++I) {
+      SDValue Offset = DAG.getConstant(SL->getElementOffset(I), PtrVT);
+      SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, InitPtr, Offset);
+
+      Constant *Elt = Init->getAggregateElement(I);
+      Chains.push_back(LowerConstantInitializer(Elt, GV, Ptr, Chain, DAG));
+    }
+
+    return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
+  }
+
+  if (SequentialType *SeqTy = dyn_cast<SequentialType>(InitTy)) {
+    EVT PtrVT = InitPtr.getValueType();
+
+    unsigned NumElements;
+    if (ArrayType *AT = dyn_cast<ArrayType>(SeqTy))
+      NumElements = AT->getNumElements();
+    else if (VectorType *VT = dyn_cast<VectorType>(SeqTy))
+      NumElements = VT->getNumElements();
+    else
+      llvm_unreachable("Unexpected type");
+
+    unsigned EltSize = TD->getTypeAllocSize(SeqTy->getElementType());
     SmallVector<SDValue, 8> Chains;
     for (unsigned i = 0; i < NumElements; ++i) {
-      SDValue Offset = DAG.getConstant(i * TD->getTypeAllocSize(
-          Init->getType()->getArrayElementType()), PtrVT);
+      SDValue Offset = DAG.getConstant(i * EltSize, PtrVT);
       SDValue Ptr = DAG.getNode(ISD::ADD, DL, PtrVT, InitPtr, Offset);
-      Chains.push_back(LowerConstantInitializer(Init->getAggregateElement(i),
-                       GV, Ptr, Chain, DAG));
+
+      Constant *Elt = Init->getAggregateElement(i);
+      Chains.push_back(LowerConstantInitializer(Elt, GV, Ptr, Chain, DAG));
     }
-    return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, &Chains[0],
-                       Chains.size());
-  } else {
-    Init->dump();
-    llvm_unreachable("Unhandled constant initializer");
+
+    return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
   }
+
+  Init->dump();
+  llvm_unreachable("Unhandled constant initializer");
 }
 
 SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
@@ -440,7 +628,7 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
     unsigned Size = TD->getTypeAllocSize(EltType);
     unsigned Alignment = TD->getPrefTypeAlignment(EltType);
 
-    const GlobalVariable *Var = dyn_cast<GlobalVariable>(GV);
+    const GlobalVariable *Var = cast<GlobalVariable>(GV);
     const Constant *Init = Var->getInitializer();
     int FI = FrameInfo->CreateStackObject(Size, Alignment, false);
     SDValue InitPtr = DAG.getFrameIndex(FI,
@@ -461,7 +649,7 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
       for (unsigned i = 1; i < (*I)->getNumOperands(); ++i) {
         Ops.push_back((*I)->getOperand(i));
       }
-      DAG.UpdateNodeOperands(*I, &Ops[0], Ops.size());
+      DAG.UpdateNodeOperands(*I, Ops);
     }
     return DAG.getZExtOrTrunc(InitPtr, SDLoc(Op),
         getPointerTy(AMDGPUAS::CONSTANT_ADDRESS));
@@ -469,44 +657,28 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
   }
 }
 
-void AMDGPUTargetLowering::ExtractVectorElements(SDValue Op, SelectionDAG &DAG,
-                                         SmallVectorImpl<SDValue> &Args,
-                                         unsigned Start,
-                                         unsigned Count) const {
-  EVT VT = Op.getValueType();
-  for (unsigned i = Start, e = Start + Count; i != e; ++i) {
-    Args.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(Op),
-                               VT.getVectorElementType(),
-                               Op, DAG.getConstant(i, MVT::i32)));
-  }
-}
-
 SDValue AMDGPUTargetLowering::LowerCONCAT_VECTORS(SDValue Op,
                                                   SelectionDAG &DAG) const {
   SmallVector<SDValue, 8> Args;
   SDValue A = Op.getOperand(0);
   SDValue B = Op.getOperand(1);
 
-  ExtractVectorElements(A, DAG, Args, 0,
-                        A.getValueType().getVectorNumElements());
-  ExtractVectorElements(B, DAG, Args, 0,
-                        B.getValueType().getVectorNumElements());
+  DAG.ExtractVectorElements(A, Args);
+  DAG.ExtractVectorElements(B, Args);
 
-  return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(Op), Op.getValueType(),
-                     &Args[0], Args.size());
+  return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(Op), Op.getValueType(), Args);
 }
 
 SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
                                                      SelectionDAG &DAG) const {
 
   SmallVector<SDValue, 8> Args;
-  EVT VT = Op.getValueType();
   unsigned Start = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
-  ExtractVectorElements(Op.getOperand(0), DAG, Args, Start,
-                        VT.getVectorNumElements());
+  EVT VT = Op.getValueType();
+  DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
+                            VT.getVectorNumElements());
 
-  return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(Op), Op.getValueType(),
-                     &Args[0], Args.size());
+  return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(Op), Op.getValueType(), Args);
 }
 
 SDValue AMDGPUTargetLowering::LowerFrameIndex(SDValue Op,
@@ -560,6 +732,22 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
       return DAG.getNode(AMDGPUISD::UMIN, DL, VT, Op.getOperand(1),
                                                   Op.getOperand(2));
 
+    case AMDGPUIntrinsic::AMDGPU_umul24:
+      return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT,
+                         Op.getOperand(1), Op.getOperand(2));
+
+    case AMDGPUIntrinsic::AMDGPU_imul24:
+      return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT,
+                         Op.getOperand(1), Op.getOperand(2));
+
+    case AMDGPUIntrinsic::AMDGPU_umad24:
+      return DAG.getNode(AMDGPUISD::MAD_U24, DL, VT,
+                         Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
+
+    case AMDGPUIntrinsic::AMDGPU_imad24:
+      return DAG.getNode(AMDGPUISD::MAD_I24, DL, VT,
+                         Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
+
     case AMDGPUIntrinsic::AMDGPU_bfe_i32:
       return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT,
                          Op.getOperand(1),
@@ -590,8 +778,7 @@ SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
 
 ///IABS(a) = SMAX(sub(0, a), a)
 SDValue AMDGPUTargetLowering::LowerIntrinsicIABS(SDValue Op,
-    SelectionDAG &DAG) const {
-
+                                                 SelectionDAG &DAG) const {
   SDLoc DL(Op);
   EVT VT = Op.getValueType();
   SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, VT),
@@ -603,7 +790,7 @@ SDValue AMDGPUTargetLowering::LowerIntrinsicIABS(SDValue Op,
 /// Linear Interpolation
 /// LRP(a, b, c) = muladd(a,  b, (1 - a) * c)
 SDValue AMDGPUTargetLowering::LowerIntrinsicLRP(SDValue Op,
-    SelectionDAG &DAG) const {
+                                                SelectionDAG &DAG) const {
   SDLoc DL(Op);
   EVT VT = Op.getValueType();
   SDValue OneSubA = DAG.getNode(ISD::FSUB, DL, VT,
@@ -617,16 +804,16 @@ SDValue AMDGPUTargetLowering::LowerIntrinsicLRP(SDValue Op,
 }
 
 /// \brief Generate Min/Max node
-SDValue AMDGPUTargetLowering::LowerMinMax(SDValue Op,
-    SelectionDAG &DAG) const {
-  SDLoc DL(Op);
-  EVT VT = Op.getValueType();
+SDValue AMDGPUTargetLowering::CombineMinMax(SDNode *N,
+                                            SelectionDAG &DAG) const {
+  SDLoc DL(N);
+  EVT VT = N->getValueType(0);
 
-  SDValue LHS = Op.getOperand(0);
-  SDValue RHS = Op.getOperand(1);
-  SDValue True = Op.getOperand(2);
-  SDValue False = Op.getOperand(3);
-  SDValue CC = Op.getOperand(4);
+  SDValue LHS = N->getOperand(0);
+  SDValue RHS = N->getOperand(1);
+  SDValue True = N->getOperand(2);
+  SDValue False = N->getOperand(3);
+  SDValue CC = N->getOperand(4);
 
   if (VT != MVT::f32 ||
       !((LHS == True && RHS == False) || (LHS == False && RHS == True))) {
@@ -654,10 +841,8 @@ SDValue AMDGPUTargetLowering::LowerMinMax(SDValue Op,
   case ISD::SETOLT:
   case ISD::SETLE:
   case ISD::SETLT: {
-    if (LHS == True)
-      return DAG.getNode(AMDGPUISD::FMIN, DL, VT, LHS, RHS);
-    else
-      return DAG.getNode(AMDGPUISD::FMAX, DL, VT, LHS, RHS);
+    unsigned Opc = (LHS == True) ? AMDGPUISD::FMIN : AMDGPUISD::FMAX;
+    return DAG.getNode(Opc, DL, VT, LHS, RHS);
   }
   case ISD::SETGT:
   case ISD::SETGE:
@@ -665,15 +850,13 @@ SDValue AMDGPUTargetLowering::LowerMinMax(SDValue Op,
   case ISD::SETOGE:
   case ISD::SETUGT:
   case ISD::SETOGT: {
-    if (LHS == True)
-      return DAG.getNode(AMDGPUISD::FMAX, DL, VT, LHS, RHS);
-    else
-      return DAG.getNode(AMDGPUISD::FMIN, DL, VT, LHS, RHS);
+    unsigned Opc = (LHS == True) ? AMDGPUISD::FMAX : AMDGPUISD::FMIN;
+    return DAG.getNode(Opc, DL, VT, LHS, RHS);
   }
   case ISD::SETCC_INVALID:
     llvm_unreachable("Invalid setcc condcode!");
   }
-  return Op;
+  return SDValue();
 }
 
 SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue &Op,
@@ -695,8 +878,7 @@ SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue &Op,
                         MemEltVT, Load->isVolatile(), Load->isNonTemporal(),
                         Load->getAlignment()));
   }
-  return DAG.getNode(ISD::BUILD_VECTOR, SL, Op.getValueType(),
-                     Loads.data(), Loads.size());
+  return DAG.getNode(ISD::BUILD_VECTOR, SL, Op.getValueType(), Loads);
 }
 
 SDValue AMDGPUTargetLowering::MergeVectorStore(const SDValue &Op,
@@ -713,32 +895,46 @@ SDValue AMDGPUTargetLowering::MergeVectorStore(const SDValue &Op,
   }
 
   SDLoc DL(Op);
-  const SDValue &Value = Store->getValue();
+  SDValue Value = Store->getValue();
   EVT VT = Value.getValueType();
-  const SDValue &Ptr = Store->getBasePtr();
+  EVT ElemVT = VT.getVectorElementType();
+  SDValue Ptr = Store->getBasePtr();
   EVT MemEltVT = MemVT.getVectorElementType();
   unsigned MemEltBits = MemEltVT.getSizeInBits();
   unsigned MemNumElements = MemVT.getVectorNumElements();
-  EVT PackedVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
-  SDValue Mask = DAG.getConstant((1 << MemEltBits) - 1, PackedVT);
+  unsigned PackedSize = MemVT.getStoreSizeInBits();
+  SDValue Mask = DAG.getConstant((1 << MemEltBits) - 1, MVT::i32);
+
+  assert(Value.getValueType().getScalarSizeInBits() >= 32);
 
   SDValue PackedValue;
   for (unsigned i = 0; i < MemNumElements; ++i) {
-    EVT ElemVT = VT.getVectorElementType();
     SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ElemVT, Value,
                               DAG.getConstant(i, MVT::i32));
-    Elt = DAG.getZExtOrTrunc(Elt, DL, PackedVT);
-    Elt = DAG.getNode(ISD::AND, DL, PackedVT, Elt, Mask);
-    SDValue Shift = DAG.getConstant(MemEltBits * i, PackedVT);
-    Elt = DAG.getNode(ISD::SHL, DL, PackedVT, Elt, Shift);
+    Elt = DAG.getZExtOrTrunc(Elt, DL, MVT::i32);
+    Elt = DAG.getNode(ISD::AND, DL, MVT::i32, Elt, Mask); // getZeroExtendInReg
+
+    SDValue Shift = DAG.getConstant(MemEltBits * i, MVT::i32);
+    Elt = DAG.getNode(ISD::SHL, DL, MVT::i32, Elt, Shift);
+
     if (i == 0) {
       PackedValue = Elt;
     } else {
-      PackedValue = DAG.getNode(ISD::OR, DL, PackedVT, PackedValue, Elt);
+      PackedValue = DAG.getNode(ISD::OR, DL, MVT::i32, PackedValue, Elt);
     }
   }
+
+  if (PackedSize < 32) {
+    EVT PackedVT = EVT::getIntegerVT(*DAG.getContext(), PackedSize);
+    return DAG.getTruncStore(Store->getChain(), DL, PackedValue, Ptr,
+                             Store->getMemOperand()->getPointerInfo(),
+                             PackedVT,
+                             Store->isNonTemporal(), Store->isVolatile(),
+                             Store->getAlignment());
+  }
+
   return DAG.getStore(Store->getChain(), DL, PackedValue, Ptr,
-                      MachinePointerInfo(Store->getMemOperand()->getValue()),
+                      Store->getMemOperand()->getPointerInfo(),
                       Store->isVolatile(),  Store->isNonTemporal(),
                       Store->getAlignment());
 }
@@ -766,7 +962,7 @@ SDValue AMDGPUTargetLowering::SplitVectorStore(SDValue Op,
                          MemEltVT, Store->isVolatile(), Store->isNonTemporal(),
                          Store->getAlignment()));
   }
-  return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, &Chains[0], NumElts);
+  return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, Chains);
 }
 
 SDValue AMDGPUTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
@@ -788,9 +984,24 @@ SDValue AMDGPUTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
     return DAG.getNode(ISD::getExtForLoadExtType(ExtType), DL, VT, ExtLoad32);
   }
 
+  if (ExtType == ISD::NON_EXTLOAD && VT.getSizeInBits() < 32) {
+    assert(VT == MVT::i1 && "Only i1 non-extloads expected");
+    // FIXME: Copied from PPC
+    // First, load into 32 bits, then truncate to 1 bit.
+
+    SDValue Chain = Load->getChain();
+    SDValue BasePtr = Load->getBasePtr();
+    MachineMemOperand *MMO = Load->getMemOperand();
+
+    SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain,
+                                   BasePtr, MVT::i8, MMO);
+    return DAG.getNode(ISD::TRUNCATE, DL, VT, NewLD);
+  }
+
   // Lower loads constant address space global variable loads
   if (Load->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
-      isa<GlobalVariable>(GetUnderlyingObject(Load->getPointerInfo().V))) {
+      isa<GlobalVariable>(
+          GetUnderlyingObject(Load->getMemOperand()->getValue()))) {
 
     SDValue Ptr = DAG.getZExtOrTrunc(Load->getBasePtr(), DL,
         getPointerTy(AMDGPUAS::PRIVATE_ADDRESS));
@@ -887,15 +1098,13 @@ SDValue AMDGPUTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
 }
 
 SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
-    SelectionDAG &DAG) const {
+                                           SelectionDAG &DAG) const {
   SDLoc DL(Op);
   EVT VT = Op.getValueType();
 
   SDValue Num = Op.getOperand(0);
   SDValue Den = Op.getOperand(1);
 
-  SmallVector<SDValue, 8> Results;
-
   // RCP =  URECIP(Den) = 2^32 / Den + e
   // e is rounding error.
   SDValue RCP = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Den);
@@ -985,10 +1194,11 @@ SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
   // Rem = (Remainder_GE_Zero == 0 ? Remainder_A_Den : Rem)
   Rem = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, VT),
                             Remainder_A_Den, Rem, ISD::SETEQ);
-  SDValue Ops[2];
-  Ops[0] = Div;
-  Ops[1] = Rem;
-  return DAG.getMergeValues(Ops, 2, DL);
+  SDValue Ops[2] = {
+    Div,
+    Rem
+  };
+  return DAG.getMergeValues(Ops, DL);
 }
 
 SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
@@ -1029,81 +1239,197 @@ SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
   MVT VT = Op.getSimpleValueType();
   MVT ScalarVT = VT.getScalarType();
 
-  unsigned SrcBits = ExtraVT.getScalarType().getSizeInBits();
-  unsigned DestBits = ScalarVT.getSizeInBits();
-  unsigned BitsDiff = DestBits - SrcBits;
-
-  if (!Subtarget->hasBFE())
-    return ExpandSIGN_EXTEND_INREG(Op, BitsDiff, DAG);
+  if (!VT.isVector())
+    return SDValue();
 
   SDValue Src = Op.getOperand(0);
-  if (VT.isVector()) {
-    SDLoc DL(Op);
-    // Need to scalarize this, and revisit each of the scalars later.
-    // TODO: Don't scalarize on Evergreen?
-    unsigned NElts = VT.getVectorNumElements();
-    SmallVector<SDValue, 8> Args;
-    ExtractVectorElements(Src, DAG, Args, 0, NElts);
+  SDLoc DL(Op);
 
-    SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
-    for (unsigned I = 0; I < NElts; ++I)
-      Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp);
+  // TODO: Don't scalarize on Evergreen?
+  unsigned NElts = VT.getVectorNumElements();
+  SmallVector<SDValue, 8> Args;
+  DAG.ExtractVectorElements(Src, Args, 0, NElts);
 
-    return DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Args.data(), Args.size());
-  }
+  SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
+  for (unsigned I = 0; I < NElts; ++I)
+    Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp);
 
-  if (SrcBits == 32) {
-    SDLoc DL(Op);
+  return DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Args);
+}
 
-    // If the source is 32-bits, this is really half of a 2-register pair, and
-    // we need to discard the unused half of the pair.
-    SDValue TruncSrc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Src);
-    return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, TruncSrc);
-  }
+//===----------------------------------------------------------------------===//
+// Custom DAG optimizations
+//===----------------------------------------------------------------------===//
 
-  unsigned NElts = VT.isVector() ? VT.getVectorNumElements() : 1;
+static bool isU24(SDValue Op, SelectionDAG &DAG) {
+  APInt KnownZero, KnownOne;
+  EVT VT = Op.getValueType();
+  DAG.computeKnownBits(Op, KnownZero, KnownOne);
 
-  // TODO: Match 64-bit BFE. SI has a 64-bit BFE, but it's scalar only so it
-  // might not be worth the effort, and will need to expand to shifts when
-  // fixing SGPR copies.
-  if (SrcBits < 32 && DestBits <= 32) {
-    SDLoc DL(Op);
-    MVT ExtVT = (NElts == 1) ? MVT::i32 : MVT::getVectorVT(MVT::i32, NElts);
-
-    if (DestBits != 32)
-      Src = DAG.getNode(ISD::ZERO_EXTEND, DL, ExtVT, Src);
-
-    // FIXME: This should use TargetConstant, but that hits assertions for
-    // Evergreen.
-    SDValue Ext = DAG.getNode(AMDGPUISD::BFE_I32, DL, ExtVT,
-                              Op.getOperand(0), // Operand
-                              DAG.getConstant(0, ExtVT), // Offset
-                              DAG.getConstant(SrcBits, ExtVT)); // Width
-
-    // Truncate to the original type if necessary.
-    if (ScalarVT == MVT::i32)
-      return Ext;
-    return DAG.getNode(ISD::TRUNCATE, DL, VT, Ext);
-  }
+  return (VT.getSizeInBits() - KnownZero.countLeadingOnes()) <= 24;
+}
 
-  // For small types, extend to 32-bits first.
-  if (SrcBits < 32) {
-    SDLoc DL(Op);
-    MVT ExtVT = (NElts == 1) ? MVT::i32 : MVT::getVectorVT(MVT::i32, NElts);
+static bool isI24(SDValue Op, SelectionDAG &DAG) {
+  EVT VT = Op.getValueType();
 
-    SDValue TruncSrc = DAG.getNode(ISD::TRUNCATE, DL, ExtVT, Src);
-    SDValue Ext32 = DAG.getNode(AMDGPUISD::BFE_I32,
-                                DL,
-                                ExtVT,
-                                TruncSrc, // Operand
-                                DAG.getConstant(0, ExtVT), // Offset
-                                DAG.getConstant(SrcBits, ExtVT)); // Width
+  // In order for this to be a signed 24-bit value, bit 23, must
+  // be a sign bit.
+  return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated
+                                     // as unsigned 24-bit values.
+         (VT.getSizeInBits() - DAG.ComputeNumSignBits(Op)) < 24;
+}
+
+static void simplifyI24(SDValue Op, TargetLowering::DAGCombinerInfo &DCI) {
+
+  SelectionDAG &DAG = DCI.DAG;
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  EVT VT = Op.getValueType();
+
+  APInt Demanded = APInt::getLowBitsSet(VT.getSizeInBits(), 24);
+  APInt KnownZero, KnownOne;
+  TargetLowering::TargetLoweringOpt TLO(DAG, true, true);
+  if (TLI.SimplifyDemandedBits(Op, Demanded, KnownZero, KnownOne, TLO))
+    DCI.CommitTargetLoweringOpt(TLO);
+}
 
-    return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Ext32);
+template <typename IntTy>
+static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0,
+                               uint32_t Offset, uint32_t Width) {
+  if (Width + Offset < 32) {
+    IntTy Result = (Src0 << (32 - Offset - Width)) >> (32 - Width);
+    return DAG.getConstant(Result, MVT::i32);
   }
 
-  // For everything else, use the standard bitshift expansion.
-  return ExpandSIGN_EXTEND_INREG(Op, BitsDiff, DAG);
+  return DAG.getConstant(Src0 >> Offset, MVT::i32);
+}
+
+SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
+                                            DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc DL(N);
+
+  switch(N->getOpcode()) {
+    default: break;
+    case ISD::MUL: {
+      EVT VT = N->getValueType(0);
+      SDValue N0 = N->getOperand(0);
+      SDValue N1 = N->getOperand(1);
+      SDValue Mul;
+
+      // FIXME: Add support for 24-bit multiply with 64-bit output on SI.
+      if (VT.isVector() || VT.getSizeInBits() > 32)
+        break;
+
+      if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
+        N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
+        N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
+        Mul = DAG.getNode(AMDGPUISD::MUL_U24, DL, MVT::i32, N0, N1);
+      } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
+        N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
+        N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
+        Mul = DAG.getNode(AMDGPUISD::MUL_I24, DL, MVT::i32, N0, N1);
+      } else {
+        break;
+      }
+
+      // We need to use sext even for MUL_U24, because MUL_U24 is used
+      // for signed multiply of 8 and 16-bit types.
+      SDValue Reg = DAG.getSExtOrTrunc(Mul, DL, VT);
+
+      return Reg;
+    }
+    case AMDGPUISD::MUL_I24:
+    case AMDGPUISD::MUL_U24: {
+      SDValue N0 = N->getOperand(0);
+      SDValue N1 = N->getOperand(1);
+      simplifyI24(N0, DCI);
+      simplifyI24(N1, DCI);
+      return SDValue();
+    }
+    case ISD::SELECT_CC: {
+      return CombineMinMax(N, DAG);
+    }
+  case AMDGPUISD::BFE_I32:
+  case AMDGPUISD::BFE_U32: {
+    assert(!N->getValueType(0).isVector() &&
+           "Vector handling of BFE not implemented");
+    ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
+    if (!Width)
+      break;
+
+    uint32_t WidthVal = Width->getZExtValue() & 0x1f;
+    if (WidthVal == 0)
+      return DAG.getConstant(0, MVT::i32);
+
+    ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
+    if (!Offset)
+      break;
+
+    SDValue BitsFrom = N->getOperand(0);
+    uint32_t OffsetVal = Offset->getZExtValue() & 0x1f;
+
+    bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32;
+
+    if (OffsetVal == 0) {
+      // This is already sign / zero extended, so try to fold away extra BFEs.
+      unsigned SignBits =  Signed ? (32 - WidthVal + 1) : (32 - WidthVal);
+
+      unsigned OpSignBits = DAG.ComputeNumSignBits(BitsFrom);
+      if (OpSignBits >= SignBits)
+        return BitsFrom;
+
+      EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), WidthVal);
+      if (Signed) {
+        // This is a sign_extend_inreg. Replace it to take advantage of existing
+        // DAG Combines. If not eliminated, we will match back to BFE during
+        // selection.
+
+        // TODO: The sext_inreg of extended types ends, although we can could
+        // handle them in a single BFE.
+        return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, BitsFrom,
+                           DAG.getValueType(SmallVT));
+      }
+
+      return DAG.getZeroExtendInReg(BitsFrom, DL, SmallVT);
+    }
+
+    if (ConstantSDNode *Val = dyn_cast<ConstantSDNode>(N->getOperand(0))) {
+      if (Signed) {
+        return constantFoldBFE<int32_t>(DAG,
+                                        Val->getSExtValue(),
+                                        OffsetVal,
+                                        WidthVal);
+      }
+
+      return constantFoldBFE<uint32_t>(DAG,
+                                       Val->getZExtValue(),
+                                       OffsetVal,
+                                       WidthVal);
+    }
+
+    APInt Demanded = APInt::getBitsSet(32,
+                                       OffsetVal,
+                                       OffsetVal + WidthVal);
+
+    if ((OffsetVal + WidthVal) >= 32) {
+      SDValue ShiftVal = DAG.getConstant(OffsetVal, MVT::i32);
+      return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32,
+                         BitsFrom, ShiftVal);
+    }
+
+    APInt KnownZero, KnownOne;
+    TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
+                                          !DCI.isBeforeLegalizeOps());
+    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+    if (TLO.ShrinkDemandedConstant(BitsFrom, Demanded) ||
+        TLI.SimplifyDemandedBits(BitsFrom, Demanded, KnownZero, KnownOne, TLO)) {
+      DCI.CommitTargetLoweringOpt(TLO);
+    }
+
+    break;
+  }
+  }
+  return SDValue();
 }
 
 //===----------------------------------------------------------------------===//
@@ -1181,7 +1507,7 @@ SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
 
 const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
   switch (Opcode) {
-  default: return 0;
+  default: return nullptr;
   // AMDIL DAG nodes
   NODE_NAME_CASE(CALL);
   NODE_NAME_CASE(UMUL);
@@ -1202,6 +1528,10 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
   NODE_NAME_CASE(BFE_I32)
   NODE_NAME_CASE(BFI)
   NODE_NAME_CASE(BFM)
+  NODE_NAME_CASE(MUL_U24)
+  NODE_NAME_CASE(MUL_I24)
+  NODE_NAME_CASE(MAD_U24)
+  NODE_NAME_CASE(MAD_I24)
   NODE_NAME_CASE(URECIP)
   NODE_NAME_CASE(DOT4)
   NODE_NAME_CASE(EXPORT)
@@ -1219,22 +1549,22 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
   }
 }
 
-static void computeMaskedBitsForMinMax(const SDValue Op0,
-                                       const SDValue Op1,
-                                       APInt &KnownZero,
-                                       APInt &KnownOne,
-                                       const SelectionDAG &DAG,
-                                       unsigned Depth) {
+static void computeKnownBitsForMinMax(const SDValue Op0,
+                                      const SDValue Op1,
+                                      APInt &KnownZero,
+                                      APInt &KnownOne,
+                                      const SelectionDAG &DAG,
+                                      unsigned Depth) {
   APInt Op0Zero, Op0One;
   APInt Op1Zero, Op1One;
-  DAG.ComputeMaskedBits(Op0, Op0Zero, Op0One, Depth);
-  DAG.ComputeMaskedBits(Op1, Op1Zero, Op1One, Depth);
+  DAG.computeKnownBits(Op0, Op0Zero, Op0One, Depth);
+  DAG.computeKnownBits(Op1, Op1Zero, Op1One, Depth);
 
   KnownZero = Op0Zero & Op1Zero;
   KnownOne = Op0One & Op1One;
 }
 
-void AMDGPUTargetLowering::computeMaskedBitsForTargetNode(
+void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
   const SDValue Op,
   APInt &KnownZero,
   APInt &KnownOne,
@@ -1242,8 +1572,14 @@ void AMDGPUTargetLowering::computeMaskedBitsForTargetNode(
   unsigned Depth) const {
 
   KnownZero = KnownOne = APInt(KnownOne.getBitWidth(), 0); // Don't know anything.
+
+  APInt KnownZero2;
+  APInt KnownOne2;
   unsigned Opc = Op.getOpcode();
+
   switch (Opc) {
+  default:
+    break;
   case ISD::INTRINSIC_WO_CHAIN: {
     // FIXME: The intrinsic should just use the node.
     switch (cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue()) {
@@ -1251,8 +1587,8 @@ void AMDGPUTargetLowering::computeMaskedBitsForTargetNode(
     case AMDGPUIntrinsic::AMDGPU_umax:
     case AMDGPUIntrinsic::AMDGPU_imin:
     case AMDGPUIntrinsic::AMDGPU_umin:
-      computeMaskedBitsForMinMax(Op.getOperand(1), Op.getOperand(2),
-                                 KnownZero, KnownOne, DAG, Depth);
+      computeKnownBitsForMinMax(Op.getOperand(1), Op.getOperand(2),
+                                KnownZero, KnownOne, DAG, Depth);
       break;
     default:
       break;
@@ -1264,10 +1600,62 @@ void AMDGPUTargetLowering::computeMaskedBitsForTargetNode(
   case AMDGPUISD::UMAX:
   case AMDGPUISD::SMIN:
   case AMDGPUISD::UMIN:
-    computeMaskedBitsForMinMax(Op.getOperand(0), Op.getOperand(1),
-                               KnownZero, KnownOne, DAG, Depth);
+    computeKnownBitsForMinMax(Op.getOperand(0), Op.getOperand(1),
+                              KnownZero, KnownOne, DAG, Depth);
     break;
-  default:
+
+  case AMDGPUISD::BFE_I32:
+  case AMDGPUISD::BFE_U32: {
+    ConstantSDNode *CWidth = dyn_cast<ConstantSDNode>(Op.getOperand(2));
+    if (!CWidth)
+      return;
+
+    unsigned BitWidth = 32;
+    uint32_t Width = CWidth->getZExtValue() & 0x1f;
+    if (Width == 0) {
+      KnownZero = APInt::getAllOnesValue(BitWidth);
+      KnownOne = APInt::getNullValue(BitWidth);
+      return;
+    }
+
+    // FIXME: This could do a lot more. If offset is 0, should be the same as
+    // sign_extend_inreg implementation, but that involves duplicating it.
+    if (Opc == AMDGPUISD::BFE_I32)
+      KnownOne = APInt::getHighBitsSet(BitWidth, BitWidth - Width);
+    else
+      KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - Width);
+
     break;
   }
+  }
+}
+
+unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
+  SDValue Op,
+  const SelectionDAG &DAG,
+  unsigned Depth) const {
+  switch (Op.getOpcode()) {
+  case AMDGPUISD::BFE_I32: {
+    ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
+    if (!Width)
+      return 1;
+
+    unsigned SignBits = 32 - Width->getZExtValue() + 1;
+    ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+    if (!Offset || !Offset->isNullValue())
+      return SignBits;
+
+    // TODO: Could probably figure something out with non-0 offsets.
+    unsigned Op0SignBits = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
+    return std::max(SignBits, Op0SignBits);
+  }
+
+  case AMDGPUISD::BFE_U32: {
+    ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
+    return Width ? 32 - (Width->getZExtValue() & 0x1f) : 1;
+  }
+
+  default:
+    return 1;
+  }
 }
diff --git a/lib/Target/R600/AMDGPUISelLowering.h b/lib/Target/R600/AMDGPUISelLowering.h
index a019616..d5d821d 100644
--- a/lib/Target/R600/AMDGPUISelLowering.h
+++ b/lib/Target/R600/AMDGPUISelLowering.h
@@ -29,9 +29,6 @@ protected:
   const AMDGPUSubtarget *Subtarget;
 
 private:
-  void ExtractVectorElements(SDValue Op, SelectionDAG &DAG,
-                             SmallVectorImpl<SDValue> &Args,
-                             unsigned Start, unsigned Count) const;
   SDValue LowerConstantInitializer(const Constant* Init, const GlobalValue *GV,
                                    const SDValue &InitPtr,
                                    SDValue Chain,
@@ -44,7 +41,7 @@ private:
   /// of the same bitwidth.
   SDValue MergeVectorStore(const SDValue &Op, SelectionDAG &DAG) const;
   /// \brief Split a vector store into multiple scalar stores.
-  /// \returns The resulting chain. 
+  /// \returns The resulting chain.
   SDValue LowerUDIVREM(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
 
@@ -83,62 +80,67 @@ protected:
 public:
   AMDGPUTargetLowering(TargetMachine &TM);
 
-  virtual bool isFAbsFree(EVT VT) const override;
-  virtual bool isFNegFree(EVT VT) const override;
-  virtual bool isTruncateFree(EVT Src, EVT Dest) const override;
-  virtual bool isTruncateFree(Type *Src, Type *Dest) const override;
-
-  virtual bool isZExtFree(Type *Src, Type *Dest) const override;
-  virtual bool isZExtFree(EVT Src, EVT Dest) const override;
-
-  virtual bool isNarrowingProfitable(EVT VT1, EVT VT2) const override;
-
-  virtual MVT getVectorIdxTy() const override;
-  virtual bool isLoadBitCastBeneficial(EVT, EVT) const override;
-  virtual SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv,
-                              bool isVarArg,
-                              const SmallVectorImpl<ISD::OutputArg> &Outs,
-                              const SmallVectorImpl<SDValue> &OutVals,
-                              SDLoc DL, SelectionDAG &DAG) const;
-  virtual SDValue LowerCall(CallLoweringInfo &CLI,
-                            SmallVectorImpl<SDValue> &InVals) const {
-    CLI.Callee.dump();
-    llvm_unreachable("Undefined function");
-  }
+  bool isFAbsFree(EVT VT) const override;
+  bool isFNegFree(EVT VT) const override;
+  bool isTruncateFree(EVT Src, EVT Dest) const override;
+  bool isTruncateFree(Type *Src, Type *Dest) const override;
+
+  bool isZExtFree(Type *Src, Type *Dest) const override;
+  bool isZExtFree(EVT Src, EVT Dest) const override;
+
+  bool isNarrowingProfitable(EVT VT1, EVT VT2) const override;
 
-  virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const;
-  virtual void ReplaceNodeResults(SDNode * N,
-                                  SmallVectorImpl<SDValue> &Results,
-                                  SelectionDAG &DAG) const override;
+  MVT getVectorIdxTy() const override;
+  bool isLoadBitCastBeneficial(EVT, EVT) const override;
+  SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv,
+                      bool isVarArg,
+                      const SmallVectorImpl<ISD::OutputArg> &Outs,
+                      const SmallVectorImpl<SDValue> &OutVals,
+                      SDLoc DL, SelectionDAG &DAG) const override;
+  SDValue LowerCall(CallLoweringInfo &CLI,
+                    SmallVectorImpl<SDValue> &InVals) const override;
+
+  SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
+  void ReplaceNodeResults(SDNode * N,
+                          SmallVectorImpl<SDValue> &Results,
+                          SelectionDAG &DAG) const override;
 
   SDValue LowerIntrinsicIABS(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerIntrinsicLRP(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerMinMax(SDValue Op, SelectionDAG &DAG) const;
-  virtual const char* getTargetNodeName(unsigned Opcode) const;
+  SDValue CombineMinMax(SDNode *N, SelectionDAG &DAG) const;
+  const char* getTargetNodeName(unsigned Opcode) const override;
 
-  virtual SDNode *PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const {
+  virtual SDNode *PostISelFolding(MachineSDNode *N,
+                                  SelectionDAG &DAG) const {
     return N;
   }
 
   /// \brief Determine which of the bits specified in \p Mask are known to be
   /// either zero or one and return them in the \p KnownZero and \p KnownOne
   /// bitsets.
-  virtual void computeMaskedBitsForTargetNode(const SDValue Op,
-                                              APInt &KnownZero,
-                                              APInt &KnownOne,
-                                              const SelectionDAG &DAG,
-                                              unsigned Depth = 0) const override;
+  void computeKnownBitsForTargetNode(const SDValue Op,
+                                     APInt &KnownZero,
+                                     APInt &KnownOne,
+                                     const SelectionDAG &DAG,
+                                     unsigned Depth = 0) const override;
+
+  virtual unsigned ComputeNumSignBitsForTargetNode(
+    SDValue Op,
+    const SelectionDAG &DAG,
+    unsigned Depth = 0) const override;
 
 // Functions defined in AMDILISelLowering.cpp
 public:
-  virtual bool getTgtMemIntrinsic(IntrinsicInfo &Info,
-                                  const CallInst &I, unsigned Intrinsic) const;
+  bool getTgtMemIntrinsic(IntrinsicInfo &Info,
+                          const CallInst &I, unsigned Intrinsic) const override;
 
   /// We want to mark f32/f64 floating point values as legal.
-  bool isFPImmLegal(const APFloat &Imm, EVT VT) const;
+  bool isFPImmLegal(const APFloat &Imm, EVT VT) const override;
 
   /// We don't want to shrink f64/f32 constants.
-  bool ShouldShrinkFPConstant(EVT VT) const;
+  bool ShouldShrinkFPConstant(EVT VT) const override;
+
+  SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
 
 private:
   void InitAMDILLowering();
@@ -158,7 +160,6 @@ private:
   SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;
   EVT genIntType(uint32_t size = 32, uint32_t numEle = 1) const;
   SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
 };
 
 namespace AMDGPUISD {
@@ -188,6 +189,10 @@ enum {
   BFE_I32, // Extract range of bits with sign extension to 32-bits.
   BFI, // (src0 & src1) | (~src0 & src2)
   BFM, // Insert a range of bits into a 32-bit word.
+  MUL_U24,
+  MUL_I24,
+  MAD_U24,
+  MAD_I24,
   TEXTURE_FETCH,
   EXPORT,
   CONST_ADDRESS,
diff --git a/lib/Target/R600/AMDGPUInstrInfo.cpp b/lib/Target/R600/AMDGPUInstrInfo.cpp
index e32dd9f..1c3361a 100644
--- a/lib/Target/R600/AMDGPUInstrInfo.cpp
+++ b/lib/Target/R600/AMDGPUInstrInfo.cpp
@@ -20,14 +20,13 @@
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 
+using namespace llvm;
+
 #define GET_INSTRINFO_CTOR_DTOR
 #define GET_INSTRINFO_NAMED_OPS
 #define GET_INSTRMAP_INFO
 #include "AMDGPUGenInstrInfo.inc"
 
-using namespace llvm;
-
-
 // Pin the vtable to this file.
 void AMDGPUInstrInfo::anchor() {}
 
@@ -85,7 +84,7 @@ AMDGPUInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
                                       MachineBasicBlock::iterator &MBBI,
                                       LiveVariables *LV) const {
 // TODO: Implement this function
-  return NULL;
+  return nullptr;
 }
 bool AMDGPUInstrInfo::getNextBranchInstr(MachineBasicBlock::iterator &iter,
                                         MachineBasicBlock &MBB) const {
@@ -176,7 +175,7 @@ AMDGPUInstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
                                       const SmallVectorImpl<unsigned> &Ops,
                                       int FrameIndex) const {
 // TODO: Implement this function
-  return 0;
+  return nullptr;
 }
 MachineInstr*
 AMDGPUInstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
@@ -184,7 +183,7 @@ AMDGPUInstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
                                       const SmallVectorImpl<unsigned> &Ops,
                                       MachineInstr *LoadMI) const {
   // TODO: Implement this function
-  return 0;
+  return nullptr;
 }
 bool
 AMDGPUInstrInfo::canFoldMemoryOperand(const MachineInstr *MI,
@@ -356,3 +355,14 @@ int AMDGPUInstrInfo::getMaskedMIMGOp(uint16_t Opcode, unsigned Channels) const {
   case 3: return AMDGPU::getMaskedMIMGOp(Opcode, AMDGPU::Channels_3);
   }
 }
+
+// Wrapper for Tablegen'd function.  enum Subtarget is not defined in any
+// header files, so we need to wrap it in a function that takes unsigned 
+// instead.
+namespace llvm {
+namespace AMDGPU {
+int getMCOpcode(uint16_t Opcode, unsigned Gen) {
+  return getMCOpcode(Opcode);
+}
+}
+}
diff --git a/lib/Target/R600/AMDGPUInstrInfo.h b/lib/Target/R600/AMDGPUInstrInfo.h
index 426910c..74baf6b 100644
--- a/lib/Target/R600/AMDGPUInstrInfo.h
+++ b/lib/Target/R600/AMDGPUInstrInfo.h
@@ -52,14 +52,15 @@ public:
   virtual const AMDGPURegisterInfo &getRegisterInfo() const = 0;
 
   bool isCoalescableExtInstr(const MachineInstr &MI, unsigned &SrcReg,
-                             unsigned &DstReg, unsigned &SubIdx) const;
+                             unsigned &DstReg, unsigned &SubIdx) const override;
 
-  unsigned isLoadFromStackSlot(const MachineInstr *MI, int &FrameIndex) const;
+  unsigned isLoadFromStackSlot(const MachineInstr *MI,
+                               int &FrameIndex) const override;
   unsigned isLoadFromStackSlotPostFE(const MachineInstr *MI,
-                                     int &FrameIndex) const;
+                                     int &FrameIndex) const override;
   bool hasLoadFromStackSlot(const MachineInstr *MI,
                             const MachineMemOperand *&MMO,
-                            int &FrameIndex) const;
+                            int &FrameIndex) const override;
   unsigned isStoreFromStackSlot(const MachineInstr *MI, int &FrameIndex) const;
   unsigned isStoreFromStackSlotPostFE(const MachineInstr *MI,
                                       int &FrameIndex) const;
@@ -70,7 +71,7 @@ public:
   MachineInstr *
   convertToThreeAddress(MachineFunction::iterator &MFI,
                         MachineBasicBlock::iterator &MBBI,
-                        LiveVariables *LV) const;
+                        LiveVariables *LV) const override;
 
 
   virtual void copyPhysReg(MachineBasicBlock &MBB,
@@ -78,61 +79,62 @@ public:
                            unsigned DestReg, unsigned SrcReg,
                            bool KillSrc) const = 0;
 
-  virtual bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const;
+  bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const override;
 
-  virtual void storeRegToStackSlot(MachineBasicBlock &MBB,
-                                   MachineBasicBlock::iterator MI,
-                                   unsigned SrcReg, bool isKill, int FrameIndex,
-                                   const TargetRegisterClass *RC,
-                                   const TargetRegisterInfo *TRI) const;
-  virtual void loadRegFromStackSlot(MachineBasicBlock &MBB,
-                                    MachineBasicBlock::iterator MI,
-                                    unsigned DestReg, int FrameIndex,
-                                    const TargetRegisterClass *RC,
-                                    const TargetRegisterInfo *TRI) const;
+  void storeRegToStackSlot(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator MI,
+                           unsigned SrcReg, bool isKill, int FrameIndex,
+                           const TargetRegisterClass *RC,
+                           const TargetRegisterInfo *TRI) const override;
+  void loadRegFromStackSlot(MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator MI,
+                            unsigned DestReg, int FrameIndex,
+                            const TargetRegisterClass *RC,
+                            const TargetRegisterInfo *TRI) const override;
 
 protected:
   MachineInstr *foldMemoryOperandImpl(MachineFunction &MF,
                                       MachineInstr *MI,
                                       const SmallVectorImpl<unsigned> &Ops,
-                                      int FrameIndex) const;
+                                      int FrameIndex) const override;
   MachineInstr *foldMemoryOperandImpl(MachineFunction &MF,
                                       MachineInstr *MI,
                                       const SmallVectorImpl<unsigned> &Ops,
-                                      MachineInstr *LoadMI) const;
+                                      MachineInstr *LoadMI) const override;
   /// \returns the smallest register index that will be accessed by an indirect
   /// read or write or -1 if indirect addressing is not used by this program.
-  virtual int getIndirectIndexBegin(const MachineFunction &MF) const;
+  int getIndirectIndexBegin(const MachineFunction &MF) const;
 
   /// \returns the largest register index that will be accessed by an indirect
   /// read or write or -1 if indirect addressing is not used by this program.
-  virtual int getIndirectIndexEnd(const MachineFunction &MF) const;
+  int getIndirectIndexEnd(const MachineFunction &MF) const;
 
 public:
   bool canFoldMemoryOperand(const MachineInstr *MI,
-                            const SmallVectorImpl<unsigned> &Ops) const;
+                           const SmallVectorImpl<unsigned> &Ops) const override;
   bool unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI,
-                           unsigned Reg, bool UnfoldLoad, bool UnfoldStore,
-                           SmallVectorImpl<MachineInstr *> &NewMIs) const;
+                        unsigned Reg, bool UnfoldLoad, bool UnfoldStore,
+                        SmallVectorImpl<MachineInstr *> &NewMIs) const override;
   bool unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
-                           SmallVectorImpl<SDNode *> &NewNodes) const;
+                           SmallVectorImpl<SDNode *> &NewNodes) const override;
   unsigned getOpcodeAfterMemoryUnfold(unsigned Opc,
-                                      bool UnfoldLoad, bool UnfoldStore,
-                                      unsigned *LoadRegIndex = 0) const;
+                               bool UnfoldLoad, bool UnfoldStore,
+                               unsigned *LoadRegIndex = nullptr) const override;
   bool shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
                                int64_t Offset1, int64_t Offset2,
-                               unsigned NumLoads) const;
+                               unsigned NumLoads) const override;
 
-  bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const;
+  bool
+  ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override;
   void insertNoop(MachineBasicBlock &MBB,
-                  MachineBasicBlock::iterator MI) const;
-  bool isPredicated(const MachineInstr *MI) const;
+                  MachineBasicBlock::iterator MI) const override;
+  bool isPredicated(const MachineInstr *MI) const override;
   bool SubsumesPredicate(const SmallVectorImpl<MachineOperand> &Pred1,
-                         const SmallVectorImpl<MachineOperand> &Pred2) const;
+                   const SmallVectorImpl<MachineOperand> &Pred2) const override;
   bool DefinesPredicate(MachineInstr *MI,
-                        std::vector<MachineOperand> &Pred) const;
-  bool isPredicable(MachineInstr *MI) const;
-  bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const;
+                        std::vector<MachineOperand> &Pred) const override;
+  bool isPredicable(MachineInstr *MI) const override;
+  bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const override;
 
   // Helper functions that check the opcode for status information
   bool isLoadInst(llvm::MachineInstr *MI) const;
@@ -186,8 +188,7 @@ public:
 
   /// \brief Convert the AMDIL MachineInstr to a supported ISA
   /// MachineInstr
-  virtual void convertToISA(MachineInstr & MI, MachineFunction &MF,
-    DebugLoc DL) const;
+  void convertToISA(MachineInstr & MI, MachineFunction &MF, DebugLoc DL) const;
 
   /// \brief Build a MOV instruction.
   virtual MachineInstr *buildMovInstr(MachineBasicBlock *MBB,
diff --git a/lib/Target/R600/AMDGPUInstrInfo.td b/lib/Target/R600/AMDGPUInstrInfo.td
index 69d8059..f96dbb4 100644
--- a/lib/Target/R600/AMDGPUInstrInfo.td
+++ b/lib/Target/R600/AMDGPUInstrInfo.td
@@ -92,3 +92,18 @@ def AMDGPUbfe_i32 : SDNode<"AMDGPUISD::BFE_I32", AMDGPUDTIntTernaryOp>;
 def AMDGPUbfi : SDNode<"AMDGPUISD::BFI", AMDGPUDTIntTernaryOp>;
 def AMDGPUbfm : SDNode<"AMDGPUISD::BFM", SDTIntBinOp>;
 
+// Signed and unsigned 24-bit mulitply.  The highest 8-bits are ignore when
+// performing the mulitply.  The result is a 32-bit value.
+def AMDGPUmul_u24 : SDNode<"AMDGPUISD::MUL_U24", SDTIntBinOp,
+  [SDNPCommutative]
+>;
+def AMDGPUmul_i24 : SDNode<"AMDGPUISD::MUL_I24", SDTIntBinOp,
+  [SDNPCommutative]
+>;
+
+def AMDGPUmad_u24 : SDNode<"AMDGPUISD::MAD_U24", AMDGPUDTIntTernaryOp,
+  []
+>;
+def AMDGPUmad_i24 : SDNode<"AMDGPUISD::MAD_I24", AMDGPUDTIntTernaryOp,
+  []
+>;
diff --git a/lib/Target/R600/AMDGPUInstructions.td b/lib/Target/R600/AMDGPUInstructions.td
index 505fc81..80bdf5b 100644
--- a/lib/Target/R600/AMDGPUInstructions.td
+++ b/lib/Target/R600/AMDGPUInstructions.td
@@ -37,6 +37,18 @@ class AMDGPUShaderInst <dag outs, dag ins, string asm, list<dag> pattern>
 def InstFlag : OperandWithDefaultOps <i32, (ops (i32 0))>;
 def ADDRIndirect : ComplexPattern<iPTR, 2, "SelectADDRIndirect", [], []>;
 
+def u32imm : Operand<i32> {
+  let PrintMethod = "printU32ImmOperand";
+}
+
+def u16imm : Operand<i16> {
+  let PrintMethod = "printU16ImmOperand";
+}
+
+def u8imm : Operand<i8> {
+  let PrintMethod = "printU8ImmOperand";
+}
+
 //===----------------------------------------------------------------------===//
 // PatLeafs for floating-point comparisons
 //===----------------------------------------------------------------------===//
@@ -253,9 +265,6 @@ def FP_ONE : PatLeaf <
   [{return N->isExactlyValue(1.0);}]
 >;
 
-def U24 : ComplexPattern<i32, 1, "SelectU24", [], []>;
-def I24 : ComplexPattern<i32, 1, "SelectI24", [], []>;
-
 let isCodeGenOnly = 1, isPseudo = 1 in {
 
 let usesCustomInserter = 1  in {
@@ -414,6 +423,40 @@ class UMUL24Pattern <Instruction UMUL24> : Pat <
 >;
 */
 
+class IMad24Pat<Instruction Inst> : Pat <
+  (add (AMDGPUmul_i24 i32:$src0, i32:$src1), i32:$src2),
+  (Inst $src0, $src1, $src2)
+>;
+
+class UMad24Pat<Instruction Inst> : Pat <
+  (add (AMDGPUmul_u24 i32:$src0, i32:$src1), i32:$src2),
+  (Inst $src0, $src1, $src2)
+>;
+
+multiclass Expand24IBitOps<Instruction MulInst, Instruction AddInst> {
+  def _expand_imad24 : Pat <
+    (AMDGPUmad_i24 i32:$src0, i32:$src1, i32:$src2),
+    (AddInst (MulInst $src0, $src1), $src2)
+  >;
+
+  def _expand_imul24 : Pat <
+    (AMDGPUmul_i24 i32:$src0, i32:$src1),
+    (MulInst $src0, $src1)
+  >;
+}
+
+multiclass Expand24UBitOps<Instruction MulInst, Instruction AddInst> {
+  def _expand_umad24 : Pat <
+    (AMDGPUmad_u24 i32:$src0, i32:$src1, i32:$src2),
+    (AddInst (MulInst $src0, $src1), $src2)
+  >;
+
+  def _expand_umul24 : Pat <
+    (AMDGPUmul_u24 i32:$src0, i32:$src1),
+    (MulInst $src0, $src1)
+  >;
+}
+
 include "R600Instructions.td"
 include "R700Instructions.td"
 include "EvergreenInstructions.td"
diff --git a/lib/Target/R600/AMDGPUIntrinsics.td b/lib/Target/R600/AMDGPUIntrinsics.td
index c6521d0..9ad5e72 100644
--- a/lib/Target/R600/AMDGPUIntrinsics.td
+++ b/lib/Target/R600/AMDGPUIntrinsics.td
@@ -49,6 +49,10 @@ let TargetPrefix = "AMDGPU", isTarget = 1 in {
   def int_AMDGPU_imin : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
   def int_AMDGPU_umax : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
   def int_AMDGPU_umin : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_AMDGPU_umul24 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_AMDGPU_imul24 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_AMDGPU_imad24 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_AMDGPU_umad24 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
   def int_AMDGPU_cube : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem]>;
   def int_AMDGPU_bfi : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
   def int_AMDGPU_bfe_i32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
diff --git a/lib/Target/R600/AMDGPUMCInstLower.cpp b/lib/Target/R600/AMDGPUMCInstLower.cpp
index 2c9909f..b759495 100644
--- a/lib/Target/R600/AMDGPUMCInstLower.cpp
+++ b/lib/Target/R600/AMDGPUMCInstLower.cpp
@@ -17,6 +17,7 @@
 #include "AMDGPUAsmPrinter.h"
 #include "InstPrinter/AMDGPUInstPrinter.h"
 #include "R600InstrInfo.h"
+#include "SIInstrInfo.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/IR/Constants.h"
@@ -31,16 +32,30 @@
 
 using namespace llvm;
 
-AMDGPUMCInstLower::AMDGPUMCInstLower(MCContext &ctx):
-  Ctx(ctx)
+AMDGPUMCInstLower::AMDGPUMCInstLower(MCContext &ctx, const AMDGPUSubtarget &st):
+  Ctx(ctx), ST(st)
 { }
 
+enum AMDGPUMCInstLower::SISubtarget
+AMDGPUMCInstLower::AMDGPUSubtargetToSISubtarget(unsigned) const {
+  return AMDGPUMCInstLower::SI;
+}
+
+unsigned AMDGPUMCInstLower::getMCOpcode(unsigned MIOpcode) const {
+
+  int MCOpcode = AMDGPU::getMCOpcode(MIOpcode,
+                              AMDGPUSubtargetToSISubtarget(ST.getGeneration()));
+  if (MCOpcode == -1)
+    MCOpcode = MIOpcode;
+
+  return MCOpcode;
+}
+
 void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
-  OutMI.setOpcode(MI->getOpcode());
 
-  for (unsigned i = 0, e = MI->getNumExplicitOperands(); i != e; ++i) {
-    const MachineOperand &MO = MI->getOperand(i);
+  OutMI.setOpcode(getMCOpcode(MI->getOpcode()));
 
+  for (const MachineOperand &MO : MI->explicit_operands()) {
     MCOperand MCOp;
     switch (MO.getType()) {
     default:
@@ -67,7 +82,8 @@ void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
 }
 
 void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) {
-  AMDGPUMCInstLower MCInstLowering(OutContext);
+  AMDGPUMCInstLower MCInstLowering(OutContext,
+                               MF->getTarget().getSubtarget<AMDGPUSubtarget>());
 
 #ifdef _DEBUG
   StringRef Err;
diff --git a/lib/Target/R600/AMDGPUMCInstLower.h b/lib/Target/R600/AMDGPUMCInstLower.h
index d7d538e..2b7f1e3 100644
--- a/lib/Target/R600/AMDGPUMCInstLower.h
+++ b/lib/Target/R600/AMDGPUMCInstLower.h
@@ -13,16 +13,30 @@
 
 namespace llvm {
 
+class AMDGPUSubtarget;
 class MCInst;
 class MCContext;
 class MachineInstr;
 
 class AMDGPUMCInstLower {
 
+  // This must be kept in sync with the SISubtarget class in SIInstrInfo.td
+  enum SISubtarget {
+    SI = 0
+  };
+
   MCContext &Ctx;
+  const AMDGPUSubtarget &ST;
+
+  /// Convert a member of the AMDGPUSubtarget::Generation enum to the
+  /// SISubtarget enum.
+  enum SISubtarget AMDGPUSubtargetToSISubtarget(unsigned Gen) const;
+
+  /// Get the MC opcode for this MachineInstr.
+  unsigned getMCOpcode(unsigned MIOpcode) const;
 
 public:
-  AMDGPUMCInstLower(MCContext &ctx);
+  AMDGPUMCInstLower(MCContext &ctx, const AMDGPUSubtarget &ST);
 
   /// \brief Lower a MachineInstr to an MCInst
   void lower(const MachineInstr *MI, MCInst &OutMI) const;
diff --git a/lib/Target/R600/AMDGPURegisterInfo.cpp b/lib/Target/R600/AMDGPURegisterInfo.cpp
index 8fbec4e..19927fa 100644
--- a/lib/Target/R600/AMDGPURegisterInfo.cpp
+++ b/lib/Target/R600/AMDGPURegisterInfo.cpp
@@ -27,10 +27,10 @@ AMDGPURegisterInfo::AMDGPURegisterInfo(TargetMachine &tm)
 // they are not supported at this time.
 //===----------------------------------------------------------------------===//
 
-const uint16_t AMDGPURegisterInfo::CalleeSavedReg = AMDGPU::NoRegister;
+const MCPhysReg AMDGPURegisterInfo::CalleeSavedReg = AMDGPU::NoRegister;
 
-const uint16_t* AMDGPURegisterInfo::getCalleeSavedRegs(const MachineFunction *MF)
-                                                                         const {
+const MCPhysReg*
+AMDGPURegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
   return &CalleeSavedReg;
 }
 
@@ -54,7 +54,7 @@ unsigned AMDGPURegisterInfo::getSubRegFromChannel(unsigned Channel) const {
     AMDGPU::sub15
   };
 
-  assert (Channel < array_lengthof(SubRegs));
+  assert(Channel < array_lengthof(SubRegs));
   return SubRegs[Channel];
 }
 
diff --git a/lib/Target/R600/AMDGPURegisterInfo.h b/lib/Target/R600/AMDGPURegisterInfo.h
index 688e1a0..a7cba0d 100644
--- a/lib/Target/R600/AMDGPURegisterInfo.h
+++ b/lib/Target/R600/AMDGPURegisterInfo.h
@@ -30,11 +30,11 @@ class TargetInstrInfo;
 
 struct AMDGPURegisterInfo : public AMDGPUGenRegisterInfo {
   TargetMachine &TM;
-  static const uint16_t CalleeSavedReg;
+  static const MCPhysReg CalleeSavedReg;
 
   AMDGPURegisterInfo(TargetMachine &tm);
 
-  virtual BitVector getReservedRegs(const MachineFunction &MF) const {
+  BitVector getReservedRegs(const MachineFunction &MF) const override {
     assert(!"Unimplemented");  return BitVector();
   }
 
@@ -43,11 +43,11 @@ struct AMDGPURegisterInfo : public AMDGPUGenRegisterInfo {
   /// \returns The ISA reg class that is equivalent to \p RC.
   virtual const TargetRegisterClass * getISARegClass(
                                          const TargetRegisterClass * RC) const {
-    assert(!"Unimplemented"); return NULL;
+    assert(!"Unimplemented"); return nullptr;
   }
 
   virtual const TargetRegisterClass* getCFGStructurizerRegClass(MVT VT) const {
-    assert(!"Unimplemented"); return NULL;
+    assert(!"Unimplemented"); return nullptr;
   }
 
   virtual unsigned getHWRegIndex(unsigned Reg) const {
@@ -58,11 +58,11 @@ struct AMDGPURegisterInfo : public AMDGPUGenRegisterInfo {
   /// (e.g. getSubRegFromChannel(0) -> AMDGPU::sub0)
   unsigned getSubRegFromChannel(unsigned Channel) const;
 
-  const uint16_t* getCalleeSavedRegs(const MachineFunction *MF) const;
+  const MCPhysReg* getCalleeSavedRegs(const MachineFunction *MF) const override;
   void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj,
                            unsigned FIOperandNum,
-                           RegScavenger *RS) const;
-  unsigned getFrameRegister(const MachineFunction &MF) const;
+                           RegScavenger *RS) const override;
+  unsigned getFrameRegister(const MachineFunction &MF) const override;
 
   unsigned getIndirectSubReg(unsigned IndirectIndex) const;
 
diff --git a/lib/Target/R600/AMDGPUSubtarget.cpp b/lib/Target/R600/AMDGPUSubtarget.cpp
index e77ab5e..f3b9932 100644
--- a/lib/Target/R600/AMDGPUSubtarget.cpp
+++ b/lib/Target/R600/AMDGPUSubtarget.cpp
@@ -16,6 +16,8 @@
 
 using namespace llvm;
 
+#define DEBUG_TYPE "amdgpu-subtarget"
+
 #define GET_SUBTARGETINFO_ENUM
 #define GET_SUBTARGETINFO_TARGET_DESC
 #define GET_SUBTARGETINFO_CTOR
@@ -28,9 +30,6 @@ AMDGPUSubtarget::AMDGPUSubtarget(StringRef TT, StringRef CPU, StringRef FS) :
   // Default card
   StringRef GPU = CPU;
   Is64bit = false;
-  DefaultSize[0] = 64;
-  DefaultSize[1] = 1;
-  DefaultSize[2] = 1;
   HasVertexCache = false;
   TexVTXClauseSize = 0;
   Gen = AMDGPUSubtarget::R600;
@@ -106,14 +105,6 @@ bool
 AMDGPUSubtarget::isTargetELF() const {
   return false;
 }
-size_t
-AMDGPUSubtarget::getDefaultSize(uint32_t dim) const {
-  if (dim > 2) {
-    return 1;
-  } else {
-    return DefaultSize[dim];
-  }
-}
 
 std::string
 AMDGPUSubtarget::getDeviceName() const {
diff --git a/lib/Target/R600/AMDGPUSubtarget.h b/lib/Target/R600/AMDGPUSubtarget.h
index 8874d14..1b041d6 100644
--- a/lib/Target/R600/AMDGPUSubtarget.h
+++ b/lib/Target/R600/AMDGPUSubtarget.h
@@ -38,7 +38,6 @@ public:
   };
 
 private:
-  size_t DefaultSize[3];
   std::string DevName;
   bool Is64bit;
   bool Is32on64bit;
@@ -60,7 +59,7 @@ public:
   AMDGPUSubtarget(StringRef TT, StringRef CPU, StringRef FS);
 
   const InstrItineraryData &getInstrItineraryData() const { return InstrItins; }
-  virtual void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
+  void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
 
   bool is64bit() const;
   bool hasVertexCache() const;
@@ -77,20 +76,28 @@ public:
     return hasBFE();
   }
 
+  bool hasMulU24() const {
+    return (getGeneration() >= EVERGREEN);
+  }
+
+  bool hasMulI24() const {
+    return (getGeneration() >= SOUTHERN_ISLANDS ||
+            hasCaymanISA());
+  }
+
   bool IsIRStructurizerEnabled() const;
   bool isIfCvtEnabled() const;
   unsigned getWavefrontSize() const;
   unsigned getStackEntrySize() const;
   bool hasCFAluBug() const;
 
-  virtual bool enableMachineScheduler() const {
+  bool enableMachineScheduler() const override {
     return getGeneration() <= NORTHERN_ISLANDS;
   }
 
   // Helper functions to simplify if statements
   bool isTargetELF() const;
   std::string getDeviceName() const;
-  virtual size_t getDefaultSize(uint32_t dim) const;
   bool dumpCode() const { return DumpCode; }
   bool r600ALUEncoding() const { return R600ALUInst; }
 
diff --git a/lib/Target/R600/AMDGPUTargetMachine.cpp b/lib/Target/R600/AMDGPUTargetMachine.cpp
index b11fce3..174fdca 100644
--- a/lib/Target/R600/AMDGPUTargetMachine.cpp
+++ b/lib/Target/R600/AMDGPUTargetMachine.cpp
@@ -42,7 +42,7 @@ extern "C" void LLVMInitializeR600Target() {
 }
 
 static ScheduleDAGInstrs *createR600MachineScheduler(MachineSchedContext *C) {
-  return new ScheduleDAGMILive(C, new R600SchedStrategy());
+  return new ScheduleDAGMILive(C, make_unique<R600SchedStrategy>());
 }
 
 static MachineSchedRegistry
@@ -54,7 +54,7 @@ static std::string computeDataLayout(const AMDGPUSubtarget &ST) {
 
   if (ST.is64bit()) {
     // 32-bit private, local, and region pointers. 64-bit global and constant.
-    Ret += "-p1:64:64-p2:64:64-p3:32:32-p4:32:32-p5:64:64";
+    Ret += "-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64";
   }
 
   Ret += "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256"
@@ -103,20 +103,20 @@ public:
     return getTM<AMDGPUTargetMachine>();
   }
 
-  virtual ScheduleDAGInstrs *
-  createMachineScheduler(MachineSchedContext *C) const {
+  ScheduleDAGInstrs *
+  createMachineScheduler(MachineSchedContext *C) const override {
     const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
     if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS)
       return createR600MachineScheduler(C);
-    return 0;
+    return nullptr;
   }
 
-  virtual bool addPreISel();
-  virtual bool addInstSelector();
-  virtual bool addPreRegAlloc();
-  virtual bool addPostRegAlloc();
-  virtual bool addPreSched2();
-  virtual bool addPreEmitPass();
+  bool addPreISel() override;
+  bool addInstSelector() override;
+  bool addPreRegAlloc() override;
+  bool addPostRegAlloc() override;
+  bool addPreSched2() override;
+  bool addPreEmitPass() override;
 };
 } // End of anonymous namespace
 
@@ -154,6 +154,7 @@ AMDGPUPassConfig::addPreISel() {
 
 bool AMDGPUPassConfig::addInstSelector() {
   addPass(createAMDGPUISelDag(getAMDGPUTargetMachine()));
+  addPass(createSILowerI1CopiesPass());
   return false;
 }
 
diff --git a/lib/Target/R600/AMDGPUTargetMachine.h b/lib/Target/R600/AMDGPUTargetMachine.h
index f942614..1287e13 100644
--- a/lib/Target/R600/AMDGPUTargetMachine.h
+++ b/lib/Target/R600/AMDGPUTargetMachine.h
@@ -20,7 +20,6 @@
 #include "AMDGPUSubtarget.h"
 #include "AMDILIntrinsicInfo.h"
 #include "R600ISelLowering.h"
-#include "llvm/ADT/OwningPtr.h"
 #include "llvm/IR/DataLayout.h"
 
 namespace llvm {
@@ -31,8 +30,8 @@ class AMDGPUTargetMachine : public LLVMTargetMachine {
   const DataLayout Layout;
   AMDGPUFrameLowering FrameLowering;
   AMDGPUIntrinsicInfo IntrinsicInfo;
-  OwningPtr<AMDGPUInstrInfo> InstrInfo;
-  OwningPtr<AMDGPUTargetLowering> TLInfo;
+  std::unique_ptr<AMDGPUInstrInfo> InstrInfo;
+  std::unique_ptr<AMDGPUTargetLowering> TLInfo;
   const InstrItineraryData *InstrItins;
 
 public:
@@ -40,30 +39,32 @@ public:
                       StringRef CPU, TargetOptions Options, Reloc::Model RM,
                       CodeModel::Model CM, CodeGenOpt::Level OL);
   ~AMDGPUTargetMachine();
-  virtual const AMDGPUFrameLowering *getFrameLowering() const {
+  const AMDGPUFrameLowering *getFrameLowering() const override {
     return &FrameLowering;
   }
-  virtual const AMDGPUIntrinsicInfo *getIntrinsicInfo() const {
+  const AMDGPUIntrinsicInfo *getIntrinsicInfo() const override {
     return &IntrinsicInfo;
   }
-  virtual const AMDGPUInstrInfo *getInstrInfo() const {
+  const AMDGPUInstrInfo *getInstrInfo() const override {
     return InstrInfo.get();
   }
-  virtual const AMDGPUSubtarget *getSubtargetImpl() const { return &Subtarget; }
-  virtual const AMDGPURegisterInfo *getRegisterInfo() const {
+  const AMDGPUSubtarget *getSubtargetImpl() const override {
+    return &Subtarget;
+  }
+  const AMDGPURegisterInfo *getRegisterInfo() const override {
     return &InstrInfo->getRegisterInfo();
   }
-  virtual AMDGPUTargetLowering *getTargetLowering() const {
+  AMDGPUTargetLowering *getTargetLowering() const override {
     return TLInfo.get();
   }
-  virtual const InstrItineraryData *getInstrItineraryData() const {
+  const InstrItineraryData *getInstrItineraryData() const override {
     return InstrItins;
   }
-  virtual const DataLayout *getDataLayout() const { return &Layout; }
-  virtual TargetPassConfig *createPassConfig(PassManagerBase &PM);
+  const DataLayout *getDataLayout() const override { return &Layout; }
+  TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
 
   /// \brief Register R600 analysis passes with a pass manager.
-  virtual void addAnalysisPasses(PassManagerBase &PM);
+  void addAnalysisPasses(PassManagerBase &PM) override;
 };
 
 } // End namespace llvm
diff --git a/lib/Target/R600/AMDGPUTargetTransformInfo.cpp b/lib/Target/R600/AMDGPUTargetTransformInfo.cpp
index 51225eb..ea78f43 100644
--- a/lib/Target/R600/AMDGPUTargetTransformInfo.cpp
+++ b/lib/Target/R600/AMDGPUTargetTransformInfo.cpp
@@ -15,7 +15,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "AMDGPUtti"
 #include "AMDGPU.h"
 #include "AMDGPUTargetMachine.h"
 #include "llvm/Analysis/LoopInfo.h"
@@ -26,6 +25,8 @@
 #include "llvm/Target/TargetLowering.h"
 using namespace llvm;
 
+#define DEBUG_TYPE "AMDGPUtti"
+
 // Declare the pass initialization routine locally as target-specific passes
 // don't have a target-wide initialization entry point, and so we rely on the
 // pass constructor initialization.
@@ -45,7 +46,7 @@ class AMDGPUTTI final : public ImmutablePass, public TargetTransformInfo {
   unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const;
 
 public:
-  AMDGPUTTI() : ImmutablePass(ID), TM(0), ST(0), TLI(0) {
+  AMDGPUTTI() : ImmutablePass(ID), TM(nullptr), ST(nullptr), TLI(nullptr) {
     llvm_unreachable("This pass cannot be directly constructed");
   }
 
@@ -55,9 +56,9 @@ public:
     initializeAMDGPUTTIPass(*PassRegistry::getPassRegistry());
   }
 
-  virtual void initializePass() override { pushTTIStack(this); }
+  void initializePass() override { pushTTIStack(this); }
 
-  virtual void getAnalysisUsage(AnalysisUsage &AU) const override {
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
     TargetTransformInfo::getAnalysisUsage(AU);
   }
 
@@ -65,15 +66,16 @@ public:
   static char ID;
 
   /// Provide necessary pointer adjustments for the two base classes.
-  virtual void *getAdjustedAnalysisPointer(const void *ID) override {
+  void *getAdjustedAnalysisPointer(const void *ID) override {
     if (ID == &TargetTransformInfo::ID)
       return (TargetTransformInfo *)this;
     return this;
   }
 
-  virtual bool hasBranchDivergence() const override;
+  bool hasBranchDivergence() const override;
 
-  virtual void getUnrollingPreferences(Loop *L, UnrollingPreferences &UP) const;
+  void getUnrollingPreferences(Loop *L,
+                               UnrollingPreferences &UP) const override;
 
   /// @}
 };
@@ -109,11 +111,11 @@ void AMDGPUTTI::getUnrollingPreferences(Loop *L,
         // require us to use indirect addressing, which is slow and prone to
         // compiler bugs.  If this loop does an address calculation on an
         // alloca ptr, then we want to use a higher than normal loop unroll
-	// threshold.  This will give SROA a better chance to eliminate these
-	// allocas.
-	//
-	// Don't use the maximum allowed value here as it will make some
-	// programs way too big.
+        // threshold. This will give SROA a better chance to eliminate these
+        // allocas.
+        //
+        // Don't use the maximum allowed value here as it will make some
+        // programs way too big.
         UP.Threshold = 500;
       }
     }
diff --git a/lib/Target/R600/AMDILCFGStructurizer.cpp b/lib/Target/R600/AMDILCFGStructurizer.cpp
index 21ca560..f3a0391 100644
--- a/lib/Target/R600/AMDILCFGStructurizer.cpp
+++ b/lib/Target/R600/AMDILCFGStructurizer.cpp
@@ -8,8 +8,6 @@
 /// \file
 //==-----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "structcfg"
-
 #include "AMDGPU.h"
 #include "AMDGPUInstrInfo.h"
 #include "R600InstrInfo.h"
@@ -34,6 +32,8 @@
 
 using namespace llvm;
 
+#define DEBUG_TYPE "structcfg"
+
 #define DEFAULT_VEC_SLOTS 8
 
 // TODO: move-begin.
@@ -135,15 +135,15 @@ public:
   static char ID;
 
   AMDGPUCFGStructurizer() :
-      MachineFunctionPass(ID), TII(NULL), TRI(NULL) {
+      MachineFunctionPass(ID), TII(nullptr), TRI(nullptr) {
     initializeAMDGPUCFGStructurizerPass(*PassRegistry::getPassRegistry());
   }
 
-   const char *getPassName() const {
+   const char *getPassName() const override {
     return "AMDGPU Control Flow Graph structurizer Pass";
   }
 
-  void getAnalysisUsage(AnalysisUsage &AU) const {
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addPreserved<MachineFunctionAnalysis>();
     AU.addRequired<MachineFunctionAnalysis>();
     AU.addRequired<MachineDominatorTree>();
@@ -159,7 +159,7 @@ public:
   /// sure all loops have an exit block
   bool prepare();
 
-  bool runOnMachineFunction(MachineFunction &MF) {
+  bool runOnMachineFunction(MachineFunction &MF) override {
     TII = static_cast<const R600InstrInfo *>(MF.getTarget().getInstrInfo());
     TRI = &TII->getRegisterInfo();
     DEBUG(MF.dump(););
@@ -168,7 +168,7 @@ public:
     MLI = &getAnalysis<MachineLoopInfo>();
     DEBUG(dbgs() << "LoopInfo:\n"; PrintLoopinfo(*MLI););
     MDT = &getAnalysis<MachineDominatorTree>();
-    DEBUG(MDT->print(dbgs(), (const llvm::Module*)0););
+    DEBUG(MDT->print(dbgs(), (const llvm::Module*)nullptr););
     PDT = &getAnalysis<MachinePostDominatorTree>();
     DEBUG(PDT->print(dbgs()););
     prepare();
@@ -334,7 +334,7 @@ protected:
       MachineBasicBlock *DstMBB, MachineBasicBlock::iterator I);
   void recordSccnum(MachineBasicBlock *MBB, int SCCNum);
   void retireBlock(MachineBasicBlock *MBB);
-  void setLoopLandBlock(MachineLoop *LoopRep, MachineBasicBlock *MBB = NULL);
+  void setLoopLandBlock(MachineLoop *LoopRep, MachineBasicBlock *MBB = nullptr);
 
   MachineBasicBlock *findNearestCommonPostDom(std::set<MachineBasicBlock *>&);
   /// This is work around solution for findNearestCommonDominator not avaiable
@@ -361,7 +361,7 @@ MachineBasicBlock *AMDGPUCFGStructurizer::getLoopLandInfo(MachineLoop *LoopRep)
     const {
   LoopLandInfoMap::const_iterator It = LLInfoMap.find(LoopRep);
   if (It == LLInfoMap.end())
-    return NULL;
+    return nullptr;
   return (*It).second;
 }
 
@@ -632,7 +632,7 @@ MachineInstr *AMDGPUCFGStructurizer::getNormalBlockBranchInstr(
   MachineInstr *MI = &*It;
   if (MI && (isCondBranch(MI) || isUncondBranch(MI)))
     return MI;
-  return NULL;
+  return nullptr;
 }
 
 MachineInstr *AMDGPUCFGStructurizer::getLoopendBlockBranchInstr(
@@ -648,7 +648,7 @@ MachineInstr *AMDGPUCFGStructurizer::getLoopendBlockBranchInstr(
         break;
     }
   }
-  return NULL;
+  return nullptr;
 }
 
 MachineInstr *AMDGPUCFGStructurizer::getReturnInstr(MachineBasicBlock *MBB) {
@@ -658,7 +658,7 @@ MachineInstr *AMDGPUCFGStructurizer::getReturnInstr(MachineBasicBlock *MBB) {
     if (instr->getOpcode() == AMDGPU::RETURN)
       return instr;
   }
-  return NULL;
+  return nullptr;
 }
 
 MachineInstr *AMDGPUCFGStructurizer::getContinueInstr(MachineBasicBlock *MBB) {
@@ -668,7 +668,7 @@ MachineInstr *AMDGPUCFGStructurizer::getContinueInstr(MachineBasicBlock *MBB) {
     if (MI->getOpcode() == AMDGPU::CONTINUE)
       return MI;
   }
-  return NULL;
+  return nullptr;
 }
 
 bool AMDGPUCFGStructurizer::isReturnBlock(MachineBasicBlock *MBB) {
@@ -819,7 +819,7 @@ bool AMDGPUCFGStructurizer::run() {
 
     SmallVectorImpl<MachineBasicBlock *>::const_iterator SccBeginIter =
         It;
-    MachineBasicBlock *SccBeginMBB = NULL;
+    MachineBasicBlock *SccBeginMBB = nullptr;
     int SccNumBlk = 0;  // The number of active blocks, init to a
                         // maximum possible number.
     int SccNumIter;     // Number of iteration in this SCC.
@@ -874,7 +874,7 @@ bool AMDGPUCFGStructurizer::run() {
       }
 
       if (ContNextScc)
-        SccBeginMBB = NULL;
+        SccBeginMBB = nullptr;
     } //while, "one iteration" over the function.
 
     MachineBasicBlock *EntryMBB =
@@ -933,7 +933,7 @@ void AMDGPUCFGStructurizer::orderBlocks(MachineFunction *MF) {
   MachineBasicBlock *MBB;
   for (scc_iterator<MachineFunction *> It = scc_begin(MF); !It.isAtEnd();
        ++It, ++SccNum) {
-    std::vector<MachineBasicBlock *> &SccNext = *It;
+    const std::vector<MachineBasicBlock *> &SccNext = *It;
     for (std::vector<MachineBasicBlock *>::const_iterator
          blockIter = SccNext.begin(), blockEnd = SccNext.end();
          blockIter != blockEnd; ++blockIter) {
@@ -1026,7 +1026,7 @@ int AMDGPUCFGStructurizer::ifPatternMatch(MachineBasicBlock *MBB) {
   } else if (TrueMBB->succ_size() == 1 && *TrueMBB->succ_begin() == FalseMBB) {
     // Triangle pattern, false is empty
     LandBlk = FalseMBB;
-    FalseMBB = NULL;
+    FalseMBB = nullptr;
   } else if (FalseMBB->succ_size() == 1
              && *FalseMBB->succ_begin() == TrueMBB) {
     // Triangle pattern, true is empty
@@ -1034,7 +1034,7 @@ int AMDGPUCFGStructurizer::ifPatternMatch(MachineBasicBlock *MBB) {
     std::swap(TrueMBB, FalseMBB);
     reversePredicateSetter(MBB->end());
     LandBlk = FalseMBB;
-    FalseMBB = NULL;
+    FalseMBB = nullptr;
   } else if (FalseMBB->succ_size() == 1
              && isSameloopDetachedContbreak(TrueMBB, FalseMBB)) {
     LandBlk = *FalseMBB->succ_begin();
@@ -1075,13 +1075,11 @@ int AMDGPUCFGStructurizer::ifPatternMatch(MachineBasicBlock *MBB) {
 
 int AMDGPUCFGStructurizer::loopendPatternMatch() {
   std::vector<MachineLoop *> NestedLoops;
-  for (MachineLoopInfo::iterator It = MLI->begin(), E = MLI->end();
-      It != E; ++It) {
-    df_iterator<MachineLoop *> LpIt = df_begin(*It),
-        LpE = df_end(*It);
-    for (; LpIt != LpE; ++LpIt)
-      NestedLoops.push_back(*LpIt);
-  }
+  for (MachineLoopInfo::iterator It = MLI->begin(), E = MLI->end(); It != E;
+       ++It)
+    for (MachineLoop *ML : depth_first(*It))
+      NestedLoops.push_back(ML);
+
   if (NestedLoops.size() == 0)
     return 0;
 
@@ -1244,7 +1242,7 @@ int AMDGPUCFGStructurizer::handleJumpintoIfImp(MachineBasicBlock *HeadMBB,
     DEBUG(
       dbgs() << " not working\n";
     );
-    DownBlk = (DownBlk->succ_size() == 1) ? (*DownBlk->succ_begin()) : NULL;
+    DownBlk = (DownBlk->succ_size() == 1) ? (*DownBlk->succ_begin()) : nullptr;
   } // walk down the postDomTree
 
   return Num;
@@ -1723,11 +1721,11 @@ AMDGPUCFGStructurizer::normalizeInfiniteLoopExit(MachineLoop* LoopRep) {
   const TargetRegisterClass * I32RC = TRI->getCFGStructurizerRegClass(MVT::i32);
 
   if (!LoopHeader || !LoopLatch)
-    return NULL;
+    return nullptr;
   MachineInstr *BranchMI = getLoopendBlockBranchInstr(LoopLatch);
   // Is LoopRep an infinite loop ?
   if (!BranchMI || !isUncondBranch(BranchMI))
-    return NULL;
+    return nullptr;
 
   MachineBasicBlock *DummyExitBlk = FuncRep->CreateMachineBasicBlock();
   FuncRep->push_back(DummyExitBlk);  //insert to function
@@ -1860,7 +1858,7 @@ AMDGPUCFGStructurizer::findNearestCommonPostDom(MachineBasicBlock *MBB1,
     return findNearestCommonPostDom(MBB1, *MBB2->succ_begin());
 
   if (!Node1 || !Node2)
-    return NULL;
+    return nullptr;
 
   Node1 = Node1->getIDom();
   while (Node1) {
@@ -1869,7 +1867,7 @@ AMDGPUCFGStructurizer::findNearestCommonPostDom(MachineBasicBlock *MBB1,
     Node1 = Node1->getIDom();
   }
 
-  return NULL;
+  return nullptr;
 }
 
 MachineBasicBlock *
diff --git a/lib/Target/R600/AMDILISelLowering.cpp b/lib/Target/R600/AMDILISelLowering.cpp
index 0761ff4..7cea803 100644
--- a/lib/Target/R600/AMDILISelLowering.cpp
+++ b/lib/Target/R600/AMDILISelLowering.cpp
@@ -39,61 +39,55 @@ using namespace llvm;
 // TargetLowering Class Implementation Begins
 //===----------------------------------------------------------------------===//
 void AMDGPUTargetLowering::InitAMDILLowering() {
-  static const int types[] = {
-    (int)MVT::i8,
-    (int)MVT::i16,
-    (int)MVT::i32,
-    (int)MVT::f32,
-    (int)MVT::f64,
-    (int)MVT::i64,
-    (int)MVT::v2i8,
-    (int)MVT::v4i8,
-    (int)MVT::v2i16,
-    (int)MVT::v4i16,
-    (int)MVT::v4f32,
-    (int)MVT::v4i32,
-    (int)MVT::v2f32,
-    (int)MVT::v2i32,
-    (int)MVT::v2f64,
-    (int)MVT::v2i64
+  static const MVT::SimpleValueType types[] = {
+    MVT::i8,
+    MVT::i16,
+    MVT::i32,
+    MVT::f32,
+    MVT::f64,
+    MVT::i64,
+    MVT::v2i8,
+    MVT::v4i8,
+    MVT::v2i16,
+    MVT::v4i16,
+    MVT::v4f32,
+    MVT::v4i32,
+    MVT::v2f32,
+    MVT::v2i32,
+    MVT::v2f64,
+    MVT::v2i64
   };
 
-  static const int IntTypes[] = {
-    (int)MVT::i8,
-    (int)MVT::i16,
-    (int)MVT::i32,
-    (int)MVT::i64
+  static const MVT::SimpleValueType IntTypes[] = {
+    MVT::i8,
+    MVT::i16,
+    MVT::i32,
+    MVT::i64
   };
 
-  static const int FloatTypes[] = {
-    (int)MVT::f32,
-    (int)MVT::f64
+  static const MVT::SimpleValueType FloatTypes[] = {
+    MVT::f32,
+    MVT::f64
   };
 
-  static const int VectorTypes[] = {
-    (int)MVT::v2i8,
-    (int)MVT::v4i8,
-    (int)MVT::v2i16,
-    (int)MVT::v4i16,
-    (int)MVT::v4f32,
-    (int)MVT::v4i32,
-    (int)MVT::v2f32,
-    (int)MVT::v2i32,
-    (int)MVT::v2f64,
-    (int)MVT::v2i64
+  static const MVT::SimpleValueType VectorTypes[] = {
+    MVT::v2i8,
+    MVT::v4i8,
+    MVT::v2i16,
+    MVT::v4i16,
+    MVT::v4f32,
+    MVT::v4i32,
+    MVT::v2f32,
+    MVT::v2i32,
+    MVT::v2f64,
+    MVT::v2i64
   };
-  const size_t NumTypes = array_lengthof(types);
-  const size_t NumFloatTypes = array_lengthof(FloatTypes);
-  const size_t NumIntTypes = array_lengthof(IntTypes);
-  const size_t NumVectorTypes = array_lengthof(VectorTypes);
 
   const AMDGPUSubtarget &STM = getTargetMachine().getSubtarget<AMDGPUSubtarget>();
   // These are the current register classes that are
   // supported
 
-  for (unsigned int x  = 0; x < NumTypes; ++x) {
-    MVT::SimpleValueType VT = (MVT::SimpleValueType)types[x];
-
+  for (MVT VT : types) {
     setOperationAction(ISD::SUBE, VT, Expand);
     setOperationAction(ISD::SUBC, VT, Expand);
     setOperationAction(ISD::ADDE, VT, Expand);
@@ -109,9 +103,7 @@ void AMDGPUTargetLowering::InitAMDILLowering() {
       setOperationAction(ISD::SDIV, VT, Custom);
     }
   }
-  for (unsigned int x = 0; x < NumFloatTypes; ++x) {
-    MVT::SimpleValueType VT = (MVT::SimpleValueType)FloatTypes[x];
-
+  for (MVT VT : FloatTypes) {
     // IL does not have these operations for floating point types
     setOperationAction(ISD::FP_ROUND_INREG, VT, Expand);
     setOperationAction(ISD::SETOLT, VT, Expand);
@@ -124,9 +116,7 @@ void AMDGPUTargetLowering::InitAMDILLowering() {
     setOperationAction(ISD::SETULE, VT, Expand);
   }
 
-  for (unsigned int x = 0; x < NumIntTypes; ++x) {
-    MVT::SimpleValueType VT = (MVT::SimpleValueType)IntTypes[x];
-
+  for (MVT VT : IntTypes) {
     // GPU also does not have divrem function for signed or unsigned
     setOperationAction(ISD::SDIVREM, VT, Expand);
 
@@ -142,9 +132,7 @@ void AMDGPUTargetLowering::InitAMDILLowering() {
     setOperationAction(ISD::CTLZ, VT, Expand);
   }
 
-  for (unsigned int ii = 0; ii < NumVectorTypes; ++ii) {
-    MVT::SimpleValueType VT = (MVT::SimpleValueType)VectorTypes[ii];
-
+  for (MVT VT : VectorTypes) {
     setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
     setOperationAction(ISD::SDIVREM, VT, Expand);
     setOperationAction(ISD::SMUL_LOHI, VT, Expand);
diff --git a/lib/Target/R600/AMDILIntrinsicInfo.cpp b/lib/Target/R600/AMDILIntrinsicInfo.cpp
index 762ee39..fab4a3b 100644
--- a/lib/Target/R600/AMDILIntrinsicInfo.cpp
+++ b/lib/Target/R600/AMDILIntrinsicInfo.cpp
@@ -38,7 +38,7 @@ AMDGPUIntrinsicInfo::getName(unsigned int IntrID, Type **Tys,
   };
 
   if (IntrID < Intrinsic::num_intrinsics) {
-    return 0;
+    return nullptr;
   }
   assert(IntrID < AMDGPUIntrinsic::num_AMDGPU_intrinsics
       && "Invalid intrinsic ID");
diff --git a/lib/Target/R600/AMDILIntrinsicInfo.h b/lib/Target/R600/AMDILIntrinsicInfo.h
index 35559e2..924275a 100644
--- a/lib/Target/R600/AMDILIntrinsicInfo.h
+++ b/lib/Target/R600/AMDILIntrinsicInfo.h
@@ -34,13 +34,13 @@ enum ID {
 class AMDGPUIntrinsicInfo : public TargetIntrinsicInfo {
 public:
   AMDGPUIntrinsicInfo(TargetMachine *tm);
-  std::string getName(unsigned int IntrId, Type **Tys = 0,
-                      unsigned int numTys = 0) const;
-  unsigned int lookupName(const char *Name, unsigned int Len) const;
-  bool isOverloaded(unsigned int IID) const;
+  std::string getName(unsigned int IntrId, Type **Tys = nullptr,
+                      unsigned int numTys = 0) const override;
+  unsigned int lookupName(const char *Name, unsigned int Len) const override;
+  bool isOverloaded(unsigned int IID) const override;
   Function *getDeclaration(Module *M, unsigned int ID,
-                           Type **Tys = 0,
-                           unsigned int numTys = 0) const;
+                           Type **Tys = nullptr,
+                           unsigned int numTys = 0) const override;
 };
 
 } // end namespace llvm
diff --git a/lib/Target/R600/AMDILIntrinsics.td b/lib/Target/R600/AMDILIntrinsics.td
index 658deb5..4a3e02e 100644
--- a/lib/Target/R600/AMDILIntrinsics.td
+++ b/lib/Target/R600/AMDILIntrinsics.td
@@ -92,10 +92,6 @@ let TargetPrefix = "AMDIL", isTarget = 1 in {
           BinaryIntInt;
   def int_AMDIL_mulhi_u32 : GCCBuiltin<"__amdil_umul_high">,
           BinaryIntInt;
-  def int_AMDIL_mul24_i32 : GCCBuiltin<"__amdil_imul24">,
-          BinaryIntInt;
-  def int_AMDIL_mul24_u32 : GCCBuiltin<"__amdil_umul24">,
-          BinaryIntInt;
   def int_AMDIL_mulhi24_i32 : GCCBuiltin<"__amdil_imul24_high">,
           BinaryIntInt;
   def int_AMDIL_mulhi24_u32 : GCCBuiltin<"__amdil_umul24_high">,
diff --git a/lib/Target/R600/CMakeLists.txt b/lib/Target/R600/CMakeLists.txt
index 93a5117..3c6fa5a 100644
--- a/lib/Target/R600/CMakeLists.txt
+++ b/lib/Target/R600/CMakeLists.txt
@@ -45,6 +45,7 @@ add_llvm_target(R600CodeGen
   SIInstrInfo.cpp
   SIISelLowering.cpp
   SILowerControlFlow.cpp
+  SILowerI1Copies.cpp
   SIMachineFunctionInfo.cpp
   SIRegisterInfo.cpp
   SITypeRewriter.cpp
diff --git a/lib/Target/R600/CaymanInstructions.td b/lib/Target/R600/CaymanInstructions.td
index acd7bde..2630345 100644
--- a/lib/Target/R600/CaymanInstructions.td
+++ b/lib/Target/R600/CaymanInstructions.td
@@ -21,12 +21,14 @@ def isCayman : Predicate<"Subtarget.hasCaymanISA()">;
 let Predicates = [isCayman] in {
 
 def MULADD_INT24_cm : R600_3OP <0x08, "MULADD_INT24",
-  [(set i32:$dst, (add (mul I24:$src0, I24:$src1), i32:$src2))], VecALU
+  [(set i32:$dst, (AMDGPUmad_i24 i32:$src0, i32:$src1, i32:$src2))], VecALU
 >;
 def MUL_INT24_cm : R600_2OP <0x5B, "MUL_INT24",
-  [(set i32:$dst, (mul I24:$src0, I24:$src1))], VecALU
+  [(set i32:$dst, (AMDGPUmul_i24 i32:$src0, i32:$src1))], VecALU
 >;
 
+def : IMad24Pat<MULADD_INT24_cm>;
+
 let isVector = 1 in {
 
 def RECIP_IEEE_cm : RECIP_IEEE_Common<0x86>;
@@ -47,6 +49,7 @@ def COS_cm : COS_Common<0x8E>;
 def : POW_Common <LOG_IEEE_cm, EXP_IEEE_cm, MUL>;
 
 defm DIV_cm : DIV_Common<RECIP_IEEE_cm>;
+defm : Expand24UBitOps<MULLO_UINT_cm, ADD_INT>;
 
 // RECIP_UINT emulation for Cayman
 // The multiplication scales from [0,1] to the unsigned integer range
diff --git a/lib/Target/R600/EvergreenInstructions.td b/lib/Target/R600/EvergreenInstructions.td
index 6430ca6..2065441 100644
--- a/lib/Target/R600/EvergreenInstructions.td
+++ b/lib/Target/R600/EvergreenInstructions.td
@@ -75,6 +75,8 @@ def COS_eg : COS_Common<0x8E>;
 def : POW_Common <LOG_IEEE_eg, EXP_IEEE_eg, MUL>;
 def : Pat<(fsqrt f32:$src), (MUL $src, (RECIPSQRT_CLAMPED_eg $src))>;
 
+defm : Expand24IBitOps<MULLO_INT_eg, ADD_INT>;
+
 //===----------------------------------------------------------------------===//
 // Memory read/write instructions
 //===----------------------------------------------------------------------===//
@@ -273,7 +275,7 @@ def BFE_UINT_eg : R600_3OP <0x4, "BFE_UINT",
   VecALU
 >;
 
-def BFE_INT_eg : R600_3OP <0x4, "BFE_INT",
+def BFE_INT_eg : R600_3OP <0x5, "BFE_INT",
   [(set i32:$dst, (AMDGPUbfe_i32 i32:$src0, i32:$src1, i32:$src2))],
   VecALU
 >;
@@ -286,6 +288,13 @@ def BFI_INT_eg : R600_3OP <0x06, "BFI_INT",
   VecALU
 >;
 
+def : Pat<(i32 (sext_inreg i32:$src, i1)),
+  (BFE_INT_eg i32:$src, (i32 ZERO), (i32 ONE_INT))>;
+def : Pat<(i32 (sext_inreg i32:$src, i8)),
+  (BFE_INT_eg i32:$src, (i32 ZERO), (MOV_IMM_I32 8))>;
+def : Pat<(i32 (sext_inreg i32:$src, i16)),
+  (BFE_INT_eg i32:$src, (i32 ZERO), (MOV_IMM_I32 16))>;
+
 defm : BFIPatterns <BFI_INT_eg>;
 
 def BFM_INT_eg : R600_2OP <0xA0, "BFM_INT",
@@ -294,8 +303,11 @@ def BFM_INT_eg : R600_2OP <0xA0, "BFM_INT",
 >;
 
 def MULADD_UINT24_eg : R600_3OP <0x10, "MULADD_UINT24",
-  [(set i32:$dst, (add (mul U24:$src0, U24:$src1), i32:$src2))], VecALU
+  [(set i32:$dst, (AMDGPUmad_u24 i32:$src0, i32:$src1, i32:$src2))], VecALU
 >;
+
+def : UMad24Pat<MULADD_UINT24_eg>;
+
 def BIT_ALIGN_INT_eg : R600_3OP <0xC, "BIT_ALIGN_INT", [], VecALU>;
 def : ROTRPattern <BIT_ALIGN_INT_eg>;
 def MULADD_eg : MULADD_Common<0x14>;
@@ -309,7 +321,7 @@ def CNDGE_eg : CNDGE_Common<0x1B>;
 def MUL_LIT_eg : MUL_LIT_Common<0x1F>;
 def LOG_CLAMPED_eg : LOG_CLAMPED_Common<0x82>;
 def MUL_UINT24_eg : R600_2OP <0xB5, "MUL_UINT24",
-  [(set i32:$dst, (mul U24:$src0, U24:$src1))], VecALU
+  [(set i32:$dst, (AMDGPUmul_u24 i32:$src0, i32:$src1))], VecALU
 >;
 def DOT4_eg : DOT4_Common<0xBE>;
 defm CUBE_eg : CUBE_Common<0xC0>;
diff --git a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp
index 7105879..11ae091 100644
--- a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp
+++ b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp
@@ -12,6 +12,8 @@
 #include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Support/MathExtras.h"
 
 using namespace llvm;
 
@@ -23,6 +25,21 @@ void AMDGPUInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
   printAnnotation(OS, Annot);
 }
 
+void AMDGPUInstPrinter::printU8ImmOperand(const MCInst *MI, unsigned OpNo,
+                                           raw_ostream &O) {
+  O << formatHex(MI->getOperand(OpNo).getImm() & 0xff);
+}
+
+void AMDGPUInstPrinter::printU16ImmOperand(const MCInst *MI, unsigned OpNo,
+                                           raw_ostream &O) {
+  O << formatHex(MI->getOperand(OpNo).getImm() & 0xffff);
+}
+
+void AMDGPUInstPrinter::printU32ImmOperand(const MCInst *MI, unsigned OpNo,
+                                           raw_ostream &O) {
+  O << formatHex(MI->getOperand(OpNo).getImm() & 0xffffffff);
+}
+
 void AMDGPUInstPrinter::printRegOperand(unsigned reg, raw_ostream &O) {
   switch (reg) {
   case AMDGPU::VCC:
@@ -41,43 +58,78 @@ void AMDGPUInstPrinter::printRegOperand(unsigned reg, raw_ostream &O) {
     break;
   }
 
-  // It's seems there's no way to use SIRegisterInfo here, and dealing with the
-  // giant enum of all the different shifted sets of registers is pretty
-  // unmanagable, so parse the name and reformat it to be prettier.
-  StringRef Name(getRegisterName(reg));
-
-  std::pair<StringRef, StringRef> Split = Name.split('_');
-  StringRef SubRegName = Split.first;
-  StringRef Rest = Split.second;
+  char Type;
+  unsigned NumRegs;
+
+  if (MRI.getRegClass(AMDGPU::VGPR_32RegClassID).contains(reg)) {
+    Type = 'v';
+    NumRegs = 1;
+  } else  if (MRI.getRegClass(AMDGPU::SGPR_32RegClassID).contains(reg)) {
+    Type = 's';
+    NumRegs = 1;
+  } else if (MRI.getRegClass(AMDGPU::VReg_64RegClassID).contains(reg)) {
+    Type = 'v';
+    NumRegs = 2;
+  } else  if (MRI.getRegClass(AMDGPU::SReg_64RegClassID).contains(reg)) {
+    Type = 's';
+    NumRegs = 2;
+  } else if (MRI.getRegClass(AMDGPU::VReg_128RegClassID).contains(reg)) {
+    Type = 'v';
+    NumRegs = 4;
+  } else  if (MRI.getRegClass(AMDGPU::SReg_128RegClassID).contains(reg)) {
+    Type = 's';
+    NumRegs = 4;
+  } else if (MRI.getRegClass(AMDGPU::VReg_96RegClassID).contains(reg)) {
+    Type = 'v';
+    NumRegs = 3;
+  } else if (MRI.getRegClass(AMDGPU::VReg_256RegClassID).contains(reg)) {
+    Type = 'v';
+    NumRegs = 8;
+  } else if (MRI.getRegClass(AMDGPU::SReg_256RegClassID).contains(reg)) {
+    Type = 's';
+    NumRegs = 8;
+  } else if (MRI.getRegClass(AMDGPU::VReg_512RegClassID).contains(reg)) {
+    Type = 'v';
+    NumRegs = 16;
+  } else if (MRI.getRegClass(AMDGPU::SReg_512RegClassID).contains(reg)) {
+    Type = 's';
+    NumRegs = 16;
+  } else {
+    O << getRegisterName(reg);
+    return;
+  }
 
-  if (SubRegName.size() <= 4) { // Must at least be as long as "SGPR"/"VGPR".
-    O << Name;
+  // The low 8 bits encoding value is the register index, for both VGPRs and
+  // SGPRs.
+  unsigned RegIdx = MRI.getEncodingValue(reg) & ((1 << 8)  - 1);
+  if (NumRegs == 1) {
+    O << Type << RegIdx;
     return;
   }
 
-  unsigned RegIndex;
-  StringRef RegIndexStr = SubRegName.drop_front(4);
+  O << Type << '[' << RegIdx << ':' << (RegIdx + NumRegs - 1) << ']';
+}
 
-  if (RegIndexStr.getAsInteger(10, RegIndex)) {
-    O << Name;
+void AMDGPUInstPrinter::printImmediate(uint32_t Imm, raw_ostream &O) {
+  int32_t SImm = static_cast<int32_t>(Imm);
+  if (SImm >= -16 && SImm <= 64) {
+    O << SImm;
     return;
   }
 
-  if (SubRegName.front() == 'V')
-    O << 'v';
-  else if (SubRegName.front() == 'S')
-    O << 's';
-  else {
-    O << Name;
+  if (Imm == FloatToBits(1.0f) ||
+      Imm == FloatToBits(-1.0f) ||
+      Imm == FloatToBits(0.5f) ||
+      Imm == FloatToBits(-0.5f) ||
+      Imm == FloatToBits(2.0f) ||
+      Imm == FloatToBits(-2.0f) ||
+      Imm == FloatToBits(4.0f) ||
+      Imm == FloatToBits(-4.0f)) {
+    O << BitsToFloat(Imm);
     return;
   }
 
-  if (Rest.empty()) // Only 1 32-bit register
-    O << RegIndex;
-  else {
-    unsigned NumReg = Rest.count('_') + 2;
-    O << '[' << RegIndex << ':' << (RegIndex + NumReg - 1) << ']';
-  }
+  O << formatHex(static_cast<uint64_t>(Imm));
 }
 
 void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
@@ -95,7 +147,7 @@ void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
       break;
     }
   } else if (Op.isImm()) {
-    O << Op.getImm();
+    printImmediate(Op.getImm(), O);
   } else if (Op.isFPImm()) {
     O << Op.getFPImm();
   } else if (Op.isExpr()) {
@@ -106,6 +158,18 @@ void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
   }
 }
 
+void AMDGPUInstPrinter::printOperandAndMods(const MCInst *MI, unsigned OpNo,
+                                            raw_ostream &O) {
+  unsigned InputModifiers = MI->getOperand(OpNo).getImm();
+  if (InputModifiers & 0x1)
+    O << "-";
+  if (InputModifiers & 0x2)
+    O << "|";
+  printOperand(MI, OpNo + 1, O);
+  if (InputModifiers & 0x2)
+    O << "|";
+}
+
 void AMDGPUInstPrinter::printInterpSlot(const MCInst *MI, unsigned OpNum,
                                         raw_ostream &O) {
   unsigned Imm = MI->getOperand(OpNum).getImm();
diff --git a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h
index 1d24680..6ca7170 100644
--- a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h
+++ b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h
@@ -29,13 +29,18 @@ public:
   void printInstruction(const MCInst *MI, raw_ostream &O);
   static const char *getRegisterName(unsigned RegNo);
 
-  virtual void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot);
+  void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot) override;
 
 private:
-  static void printRegOperand(unsigned RegNo, raw_ostream &O);
-  static void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printU8ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printU16ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printU32ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printRegOperand(unsigned RegNo, raw_ostream &O);
+  void printImmediate(uint32_t Imm, raw_ostream &O);
+  void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printOperandAndMods(const MCInst *MI, unsigned OpNo, raw_ostream &O);
   static void printInterpSlot(const MCInst *MI, unsigned OpNum, raw_ostream &O);
-  static void printMemOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printMemOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
   static void printIfSet(const MCInst *MI, unsigned OpNo, raw_ostream &O,
                          StringRef Asm, StringRef Default = "");
   static void printAbs(const MCInst *MI, unsigned OpNo, raw_ostream &O);
diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp b/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp
index a6bb59f..489cec7 100644
--- a/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp
+++ b/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp
@@ -23,8 +23,8 @@ namespace {
 class AMDGPUMCObjectWriter : public MCObjectWriter {
 public:
   AMDGPUMCObjectWriter(raw_ostream &OS) : MCObjectWriter(OS, true) { }
-  virtual void ExecutePostLayoutBinding(MCAssembler &Asm,
-                                        const MCAsmLayout &Layout) {
+  void ExecutePostLayoutBinding(MCAssembler &Asm,
+                                const MCAsmLayout &Layout) override {
     //XXX: Implement if necessary.
   }
   void RecordRelocation(const MCAssembler &Asm, const MCAsmLayout &Layout,
@@ -34,7 +34,7 @@ public:
     assert(!"Not implemented");
   }
 
-  virtual void WriteObject(MCAssembler &Asm, const MCAsmLayout &Layout);
+  void WriteObject(MCAssembler &Asm, const MCAsmLayout &Layout) override;
 
 };
 
@@ -43,19 +43,19 @@ public:
   AMDGPUAsmBackend(const Target &T)
     : MCAsmBackend() {}
 
-  virtual unsigned getNumFixupKinds() const { return 0; };
-  virtual void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
-                          uint64_t Value, bool IsPCRel) const;
-  virtual bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
-                                    const MCRelaxableFragment *DF,
-                                    const MCAsmLayout &Layout) const {
+  unsigned getNumFixupKinds() const override { return 0; };
+  void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
+                  uint64_t Value, bool IsPCRel) const override;
+  bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
+                            const MCRelaxableFragment *DF,
+                            const MCAsmLayout &Layout) const override {
     return false;
   }
-  virtual void relaxInstruction(const MCInst &Inst, MCInst &Res) const {
+  void relaxInstruction(const MCInst &Inst, MCInst &Res) const override {
     assert(!"Not implemented");
   }
-  virtual bool mayNeedRelaxation(const MCInst &Inst) const { return false; }
-  virtual bool writeNopData(uint64_t Count, MCObjectWriter *OW) const {
+  bool mayNeedRelaxation(const MCInst &Inst) const override { return false; }
+  bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override {
     return true;
   }
 };
@@ -88,7 +88,7 @@ class ELFAMDGPUAsmBackend : public AMDGPUAsmBackend {
 public:
   ELFAMDGPUAsmBackend(const Target &T) : AMDGPUAsmBackend(T) { }
 
-  MCObjectWriter *createObjectWriter(raw_ostream &OS) const {
+  MCObjectWriter *createObjectWriter(raw_ostream &OS) const override {
     return createAMDGPUELFObjectWriter(OS);
   }
 };
diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp b/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp
index aee9bd1..78bbe0a 100644
--- a/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp
+++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp
@@ -35,7 +35,7 @@ AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(StringRef &TT) : MCAsmInfo() {
   Data16bitsDirective = ".short\t";
   Data32bitsDirective = ".long\t";
   Data64bitsDirective = ".quad\t";
-  GPRel32Directive = 0;
+  GPRel32Directive = nullptr;
   SunStyleELFSectionSwitchSyntax = true;
   UsesELFSectionDirectiveForBSS = true;
 
@@ -58,5 +58,5 @@ AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(StringRef &TT) : MCAsmInfo() {
 
 const MCSection*
 AMDGPUMCAsmInfo::getNonexecutableStackSection(MCContext &CTX) const {
-  return 0;
+  return nullptr;
 }
diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.h b/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.h
index 22afd63..59aebec 100644
--- a/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.h
+++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.h
@@ -22,7 +22,7 @@ class StringRef;
 class AMDGPUMCAsmInfo : public MCAsmInfo {
 public:
   explicit AMDGPUMCAsmInfo(StringRef &TT);
-  const MCSection* getNonexecutableStackSection(MCContext &CTX) const;
+  const MCSection* getNonexecutableStackSection(MCContext &CTX) const override;
 };
 } // namespace llvm
 #endif // AMDGPUMCASMINFO_H
diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp
index 6592b0e..38a2956 100644
--- a/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp
+++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp
@@ -24,6 +24,8 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/TargetRegistry.h"
 
+using namespace llvm;
+
 #define GET_INSTRINFO_MC_DESC
 #include "AMDGPUGenInstrInfo.inc"
 
@@ -33,8 +35,6 @@
 #define GET_REGINFO_MC_DESC
 #include "AMDGPUGenRegisterInfo.inc"
 
-using namespace llvm;
-
 static MCInstrInfo *createAMDGPUMCInstrInfo() {
   MCInstrInfo *X = new MCInstrInfo();
   InitAMDGPUMCInstrInfo(X);
diff --git a/lib/Target/R600/MCTargetDesc/LLVMBuild.txt b/lib/Target/R600/MCTargetDesc/LLVMBuild.txt
index b1beab0..74b8ca0 100644
--- a/lib/Target/R600/MCTargetDesc/LLVMBuild.txt
+++ b/lib/Target/R600/MCTargetDesc/LLVMBuild.txt
@@ -1,4 +1,4 @@
-;===- ./lib/Target/R600/MCTargetDesc/LLVMBuild.txt ------------*- Conf -*--===;
+;===- ./lib/Target/R600/MCTargetDesc/LLVMBuild.txt -------------*- Conf -*--===;
 ;
 ;                     The LLVM Compiler Infrastructure
 ;
@@ -19,5 +19,5 @@
 type = Library
 name = R600Desc
 parent = R600
-required_libraries = R600AsmPrinter R600Info MC
+required_libraries = MC R600AsmPrinter R600Info Support
 add_to_library_groups = R600
diff --git a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp
index 286c7d1..5e7cefe 100644
--- a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp
+++ b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp
@@ -41,14 +41,14 @@ public:
     : MCII(mcii), MRI(mri) { }
 
   /// \brief Encode the instruction and write it to the OS.
-  virtual void EncodeInstruction(const MCInst &MI, raw_ostream &OS,
+  void EncodeInstruction(const MCInst &MI, raw_ostream &OS,
                          SmallVectorImpl<MCFixup> &Fixups,
-                         const MCSubtargetInfo &STI) const;
+                         const MCSubtargetInfo &STI) const override;
 
   /// \returns the encoding for an MCOperand.
-  virtual uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO,
-                                     SmallVectorImpl<MCFixup> &Fixups,
-                                     const MCSubtargetInfo &STI) const;
+  uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO,
+                             SmallVectorImpl<MCFixup> &Fixups,
+                             const MCSubtargetInfo &STI) const override;
 private:
 
   void EmitByte(unsigned int byte, raw_ostream &OS) const;
diff --git a/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp b/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp
index f42e978..ee02111 100644
--- a/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp
+++ b/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp
@@ -54,14 +54,14 @@ public:
   ~SIMCCodeEmitter() { }
 
   /// \brief Encode the instruction and write it to the OS.
-  virtual void EncodeInstruction(const MCInst &MI, raw_ostream &OS,
+  void EncodeInstruction(const MCInst &MI, raw_ostream &OS,
                          SmallVectorImpl<MCFixup> &Fixups,
-                         const MCSubtargetInfo &STI) const;
+                         const MCSubtargetInfo &STI) const override;
 
   /// \returns the encoding for an MCOperand.
-  virtual uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO,
-                                     SmallVectorImpl<MCFixup> &Fixups,
-                                     const MCSubtargetInfo &STI) const;
+  uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO,
+                             SmallVectorImpl<MCFixup> &Fixups,
+                             const MCSubtargetInfo &STI) const override;
 };
 
 } // End anonymous namespace
diff --git a/lib/Target/R600/Processors.td b/lib/Target/R600/Processors.td
index fde4481..ce17d7c 100644
--- a/lib/Target/R600/Processors.td
+++ b/lib/Target/R600/Processors.td
@@ -106,3 +106,5 @@ def : Proc<"kabini",     SI_Itin, [FeatureSeaIslands]>;
 def : Proc<"kaveri",     SI_Itin, [FeatureSeaIslands]>;
 
 def : Proc<"hawaii",     SI_Itin, [FeatureSeaIslands]>;
+
+def : Proc<"mullins",    SI_Itin, [FeatureSeaIslands]>;
diff --git a/lib/Target/R600/R600ClauseMergePass.cpp b/lib/Target/R600/R600ClauseMergePass.cpp
index 3d9015c..92bf0df 100644
--- a/lib/Target/R600/R600ClauseMergePass.cpp
+++ b/lib/Target/R600/R600ClauseMergePass.cpp
@@ -13,7 +13,6 @@
 /// It needs to be called after IfCvt for best results.
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "r600mergeclause"
 #include "AMDGPU.h"
 #include "R600Defines.h"
 #include "R600InstrInfo.h"
@@ -27,6 +26,8 @@
 
 using namespace llvm;
 
+#define DEBUG_TYPE "r600mergeclause"
+
 namespace {
 
 static bool isCFAlu(const MachineInstr *MI) {
@@ -62,9 +63,9 @@ private:
 public:
   R600ClauseMergePass(TargetMachine &tm) : MachineFunctionPass(ID) { }
 
-  virtual bool runOnMachineFunction(MachineFunction &MF);
+  bool runOnMachineFunction(MachineFunction &MF) override;
 
-  const char *getPassName() const;
+  const char *getPassName() const override;
 };
 
 char R600ClauseMergePass::ID = 0;
diff --git a/lib/Target/R600/R600ControlFlowFinalizer.cpp b/lib/Target/R600/R600ControlFlowFinalizer.cpp
index f74bef3..d255e96 100644
--- a/lib/Target/R600/R600ControlFlowFinalizer.cpp
+++ b/lib/Target/R600/R600ControlFlowFinalizer.cpp
@@ -12,7 +12,6 @@
 /// computing their address on the fly ; it also sets STACK_SIZE info.
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "r600cf"
 #include "llvm/Support/Debug.h"
 #include "AMDGPU.h"
 #include "R600Defines.h"
@@ -26,6 +25,8 @@
 
 using namespace llvm;
 
+#define DEBUG_TYPE "r600cf"
+
 namespace {
 
 struct CFStack {
@@ -468,13 +469,13 @@ private:
 
 public:
   R600ControlFlowFinalizer(TargetMachine &tm) : MachineFunctionPass(ID),
-    TII (0), TRI(0),
+    TII (nullptr), TRI(nullptr),
     ST(tm.getSubtarget<AMDGPUSubtarget>()) {
       const AMDGPUSubtarget &ST = tm.getSubtarget<AMDGPUSubtarget>();
       MaxFetchInst = ST.getTexVTXClauseSize();
   }
 
-  virtual bool runOnMachineFunction(MachineFunction &MF) {
+  bool runOnMachineFunction(MachineFunction &MF) override {
     TII=static_cast<const R600InstrInfo *>(MF.getTarget().getInstrInfo());
     TRI=static_cast<const R600RegisterInfo *>(MF.getTarget().getRegisterInfo());
     R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
@@ -501,13 +502,13 @@ public:
           DEBUG(dbgs() << CfCount << ":"; I->dump(););
           FetchClauses.push_back(MakeFetchClause(MBB, I));
           CfCount++;
-          LastAlu.back() = 0;
+          LastAlu.back() = nullptr;
           continue;
         }
 
         MachineBasicBlock::iterator MI = I;
         if (MI->getOpcode() != AMDGPU::ENDIF)
-          LastAlu.back() = 0;
+          LastAlu.back() = nullptr;
         if (MI->getOpcode() == AMDGPU::CF_ALU)
           LastAlu.back() = MI;
         I++;
@@ -558,7 +559,7 @@ public:
           break;
         }
         case AMDGPU::IF_PREDICATE_SET: {
-          LastAlu.push_back(0);
+          LastAlu.push_back(nullptr);
           MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
               getHWInstrDesc(CF_JUMP))
               .addImm(0)
@@ -665,7 +666,7 @@ public:
     return false;
   }
 
-  const char *getPassName() const {
+  const char *getPassName() const override {
     return "R600 Control Flow Finalizer Pass";
   }
 };
diff --git a/lib/Target/R600/R600EmitClauseMarkers.cpp b/lib/Target/R600/R600EmitClauseMarkers.cpp
index 5bd793a..38afebe 100644
--- a/lib/Target/R600/R600EmitClauseMarkers.cpp
+++ b/lib/Target/R600/R600EmitClauseMarkers.cpp
@@ -291,12 +291,12 @@ private:
 
 public:
   static char ID;
-  R600EmitClauseMarkers() : MachineFunctionPass(ID), TII(0), Address(0) {
+  R600EmitClauseMarkers() : MachineFunctionPass(ID), TII(nullptr), Address(0) {
 
     initializeR600EmitClauseMarkersPass(*PassRegistry::getPassRegistry());
   }
 
-  virtual bool runOnMachineFunction(MachineFunction &MF) {
+  bool runOnMachineFunction(MachineFunction &MF) override {
     TII = static_cast<const R600InstrInfo *>(MF.getTarget().getInstrInfo());
 
     for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
@@ -315,7 +315,7 @@ public:
     return false;
   }
 
-  const char *getPassName() const {
+  const char *getPassName() const override {
     return "R600 Emit Clause Markers Pass";
   }
 };
diff --git a/lib/Target/R600/R600ExpandSpecialInstrs.cpp b/lib/Target/R600/R600ExpandSpecialInstrs.cpp
index ca1189d..732b06d 100644
--- a/lib/Target/R600/R600ExpandSpecialInstrs.cpp
+++ b/lib/Target/R600/R600ExpandSpecialInstrs.cpp
@@ -38,11 +38,11 @@ private:
 
 public:
   R600ExpandSpecialInstrsPass(TargetMachine &tm) : MachineFunctionPass(ID),
-    TII(0) { }
+    TII(nullptr) { }
 
-  virtual bool runOnMachineFunction(MachineFunction &MF);
+  bool runOnMachineFunction(MachineFunction &MF) override;
 
-  const char *getPassName() const {
+  const char *getPassName() const override {
     return "R600 Expand special instructions pass";
   }
 };
diff --git a/lib/Target/R600/R600ISelLowering.cpp b/lib/Target/R600/R600ISelLowering.cpp
index 6405a82..d6c6830 100644
--- a/lib/Target/R600/R600ISelLowering.cpp
+++ b/lib/Target/R600/R600ISelLowering.cpp
@@ -82,9 +82,31 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
   setOperationAction(ISD::SELECT, MVT::i32, Expand);
   setOperationAction(ISD::SELECT, MVT::f32, Expand);
   setOperationAction(ISD::SELECT, MVT::v2i32, Expand);
-  setOperationAction(ISD::SELECT, MVT::v2f32, Expand);
   setOperationAction(ISD::SELECT, MVT::v4i32, Expand);
-  setOperationAction(ISD::SELECT, MVT::v4f32, Expand);
+
+  // Expand sign extension of vectors
+  if (!Subtarget->hasBFE())
+    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
+
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Expand);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Expand);
+
+  if (!Subtarget->hasBFE())
+    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Expand);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Expand);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Expand);
+
+  if (!Subtarget->hasBFE())
+    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Expand);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Expand);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Expand);
+
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i32, Expand);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i32, Expand);
+
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Expand);
+
 
   // Legalize loads and stores to the private address space.
   setOperationAction(ISD::LOAD, MVT::i32, Custom);
@@ -117,6 +139,11 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
   setTargetDAGCombine(ISD::SELECT_CC);
   setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
 
+  // These should be replaced by UDVIREM, but it does not happen automatically
+  // during Type Legalization
+  setOperationAction(ISD::UDIV, MVT::i64, Custom);
+  setOperationAction(ISD::UREM, MVT::i64, Custom);
+
   setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
 
   setBooleanContents(ZeroOrNegativeOneBooleanContent);
@@ -538,8 +565,7 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
         DAG.getConstant(2, MVT::i32), // SWZ_Z
         DAG.getConstant(3, MVT::i32) // SWZ_W
       };
-      return DAG.getNode(AMDGPUISD::EXPORT, SDLoc(Op), Op.getValueType(),
-          Args, 8);
+      return DAG.getNode(AMDGPUISD::EXPORT, SDLoc(Op), Op.getValueType(), Args);
     }
 
     // default for switch(IntrinsicID)
@@ -689,7 +715,7 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
         Op.getOperand(9),
         Op.getOperand(10)
       };
-      return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs, 19);
+      return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs);
     }
     case AMDGPUIntrinsic::AMDGPU_dp4: {
       SDValue Args[8] = {
@@ -710,7 +736,7 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
       DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
           DAG.getConstant(3, MVT::i32))
       };
-      return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args, 8);
+      return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args);
     }
 
     case Intrinsic::r600_read_ngroups_x:
@@ -960,13 +986,6 @@ SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const
     return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode);
   }
 
-
-  // Possible Min/Max pattern
-  SDValue MinMax = LowerMinMax(Op, DAG);
-  if (MinMax.getNode()) {
-    return MinMax;
-  }
-
   // If we make it this for it means we have no native instructions to handle
   // this SELECT_CC, so we must lower it.
   SDValue HWTrue, HWFalse;
@@ -1088,10 +1107,10 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
         DAG.getConstant(0, MVT::i32),
         Mask
       };
-      SDValue Input = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Src, 4);
+      SDValue Input = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Src);
       SDValue Args[3] = { Chain, Input, DWordAddr };
       return DAG.getMemIntrinsicNode(AMDGPUISD::STORE_MSKOR, DL,
-                                     Op->getVTList(), Args, 3, MemVT,
+                                     Op->getVTList(), Args, MemVT,
                                      StoreNode->getMemOperand());
     } else if (Ptr->getOpcode() != AMDGPUISD::DWORDADDR &&
                Value.getValueType().bitsGE(MVT::i32)) {
@@ -1131,7 +1150,7 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
   if (ValueVT.isVector()) {
     unsigned NumElemVT = ValueVT.getVectorNumElements();
     EVT ElemVT = ValueVT.getVectorElementType();
-    SDValue Stores[4];
+    SmallVector<SDValue, 4> Stores(NumElemVT);
 
     assert(NumElemVT >= StackWidth && "Stack width cannot be greater than "
                                       "vector width in load");
@@ -1148,7 +1167,7 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
                               Chain, Elem, Ptr,
                               DAG.getTargetConstant(Channel, MVT::i32));
     }
-     Chain =  DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores, NumElemVT);
+     Chain =  DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
    } else {
     if (ValueVT == MVT::i8) {
       Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Value);
@@ -1212,10 +1231,11 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
 
   SDValue Ret = AMDGPUTargetLowering::LowerLOAD(Op, DAG);
   if (Ret.getNode()) {
-    SDValue Ops[2];
-    Ops[0] = Ret;
-    Ops[1] = Chain;
-    return DAG.getMergeValues(Ops, 2, DL);
+    SDValue Ops[2] = {
+      Ret,
+      Chain
+    };
+    return DAG.getMergeValues(Ops, DL);
   }
 
 
@@ -1224,7 +1244,7 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
       SplitVectorLoad(Op, DAG),
       Chain
     };
-    return DAG.getMergeValues(MergedValues, 2, DL);
+    return DAG.getMergeValues(MergedValues, DL);
   }
 
   int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
@@ -1232,8 +1252,8 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
       ((LoadNode->getExtensionType() == ISD::NON_EXTLOAD) ||
        (LoadNode->getExtensionType() == ISD::ZEXTLOAD))) {
     SDValue Result;
-    if (isa<ConstantExpr>(LoadNode->getSrcValue()) ||
-        isa<Constant>(LoadNode->getSrcValue()) ||
+    if (isa<ConstantExpr>(LoadNode->getMemOperand()->getValue()) ||
+        isa<Constant>(LoadNode->getMemOperand()->getValue()) ||
         isa<ConstantSDNode>(Ptr)) {
       SDValue Slots[4];
       for (unsigned i = 0; i < 4; i++) {
@@ -1252,7 +1272,8 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
         NewVT = VT;
         NumElements = VT.getVectorNumElements();
       }
-      Result = DAG.getNode(ISD::BUILD_VECTOR, DL, NewVT, Slots, NumElements);
+      Result = DAG.getNode(ISD::BUILD_VECTOR, DL, NewVT,
+                           makeArrayRef(Slots, NumElements));
     } else {
       // non-constant ptr can't be folded, keeps it as a v4f32 load
       Result = DAG.getNode(AMDGPUISD::CONST_ADDRESS, DL, MVT::v4i32,
@@ -1268,10 +1289,10 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
     }
 
     SDValue MergedValues[2] = {
-        Result,
-        Chain
+      Result,
+      Chain
     };
-    return DAG.getMergeValues(MergedValues, 2, DL);
+    return DAG.getMergeValues(MergedValues, DL);
   }
 
   // For most operations returning SDValue() will result in the node being
@@ -1295,7 +1316,7 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
     SDValue Sra = DAG.getNode(ISD::SRA, DL, VT, Shl, ShiftAmount);
 
     SDValue MergedValues[2] = { Sra, Chain };
-    return DAG.getMergeValues(MergedValues, 2, DL);
+    return DAG.getMergeValues(MergedValues, DL);
   }
 
   if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
@@ -1332,7 +1353,7 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
       Loads[i] = DAG.getUNDEF(ElemVT);
     }
     EVT TargetVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, 4);
-    LoweredLoad = DAG.getNode(ISD::BUILD_VECTOR, DL, TargetVT, Loads, 4);
+    LoweredLoad = DAG.getNode(ISD::BUILD_VECTOR, DL, TargetVT, Loads);
   } else {
     LoweredLoad = DAG.getNode(AMDGPUISD::REGISTER_LOAD, DL, VT,
                               Chain, Ptr,
@@ -1340,11 +1361,12 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const
                               Op.getOperand(2));
   }
 
-  SDValue Ops[2];
-  Ops[0] = LoweredLoad;
-  Ops[1] = Chain;
+  SDValue Ops[2] = {
+    LoweredLoad,
+    Chain
+  };
 
-  return DAG.getMergeValues(Ops, 2, DL);
+  return DAG.getMergeValues(Ops, DL);
 }
 
 /// XXX Only kernel functions are supported, so we can assume for now that
@@ -1365,8 +1387,7 @@ SDValue R600TargetLowering::LowerFormalArguments(
 
   SmallVector<ISD::InputArg, 8> LocalIns;
 
-  getOriginalFunctionArgs(DAG, DAG.getMachineFunction().getFunction(), Ins,
-                          LocalIns);
+  getOriginalFunctionArgs(DAG, MF.getFunction(), Ins, LocalIns);
 
   AnalyzeFormalArguments(CCInfo, LocalIns);
 
@@ -1392,32 +1413,38 @@ SDValue R600TargetLowering::LowerFormalArguments(
 
     // The first 36 bytes of the input buffer contains information about
     // thread group and global sizes.
-    SDValue Arg = DAG.getExtLoad(ISD::SEXTLOAD, DL, VT, Chain,
+
+    // FIXME: This should really check the extload type, but the handling of
+    // extload vecto parameters seems to be broken.
+    //ISD::LoadExtType Ext = Ins[i].Flags.isSExt() ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
+    ISD::LoadExtType Ext = ISD::SEXTLOAD;
+    SDValue Arg = DAG.getExtLoad(Ext, DL, VT, Chain,
                                  DAG.getConstant(36 + VA.getLocMemOffset(), MVT::i32),
                                  MachinePointerInfo(UndefValue::get(PtrTy)),
                                  MemVT, false, false, 4);
-    // 4 is the preferred alignment for
-    // the CONSTANT memory space.
+
+    // 4 is the preferred alignment for the CONSTANT memory space.
     InVals.push_back(Arg);
   }
   return Chain;
 }
 
 EVT R600TargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
-   if (!VT.isVector()) return MVT::i32;
+   if (!VT.isVector())
+     return MVT::i32;
    return VT.changeVectorElementTypeToInteger();
 }
 
-static SDValue
-CompactSwizzlableVector(SelectionDAG &DAG, SDValue VectorEntry,
-                        DenseMap<unsigned, unsigned> &RemapSwizzle) {
+static SDValue CompactSwizzlableVector(
+  SelectionDAG &DAG, SDValue VectorEntry,
+  DenseMap<unsigned, unsigned> &RemapSwizzle) {
   assert(VectorEntry.getOpcode() == ISD::BUILD_VECTOR);
   assert(RemapSwizzle.empty());
   SDValue NewBldVec[4] = {
-      VectorEntry.getOperand(0),
-      VectorEntry.getOperand(1),
-      VectorEntry.getOperand(2),
-      VectorEntry.getOperand(3)
+    VectorEntry.getOperand(0),
+    VectorEntry.getOperand(1),
+    VectorEntry.getOperand(2),
+    VectorEntry.getOperand(3)
   };
 
   for (unsigned i = 0; i < 4; i++) {
@@ -1448,7 +1475,7 @@ CompactSwizzlableVector(SelectionDAG &DAG, SDValue VectorEntry,
   }
 
   return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
-      VectorEntry.getValueType(), NewBldVec, 4);
+                     VectorEntry.getValueType(), NewBldVec);
 }
 
 static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry,
@@ -1486,7 +1513,7 @@ static SDValue ReorganizeVector(SelectionDAG &DAG, SDValue VectorEntry,
   }
 
   return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(VectorEntry),
-      VectorEntry.getValueType(), NewBldVec, 4);
+                     VectorEntry.getValueType(), NewBldVec);
 }
 
 
@@ -1524,6 +1551,7 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
   SelectionDAG &DAG = DCI.DAG;
 
   switch (N->getOpcode()) {
+  default: return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
   // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a)
   case ISD::FP_ROUND: {
       SDValue Arg = N->getOperand(0);
@@ -1613,8 +1641,7 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
     }
 
     // Return the new vector
-    return DAG.getNode(ISD::BUILD_VECTOR, dl,
-                       VT, &Ops[0], Ops.size());
+    return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
   }
 
   // Extract_vec (Build_vector) generated by custom lowering
@@ -1638,6 +1665,11 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
   }
 
   case ISD::SELECT_CC: {
+    // Try common optimizations
+    SDValue Ret = AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
+    if (Ret.getNode())
+      return Ret;
+
     // fold selectcc (selectcc x, y, a, b, cc), b, a, b, seteq ->
     //      selectcc x, y, a, b, inv(cc)
     //
@@ -1697,7 +1729,7 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
     };
     SDLoc DL(N);
     NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[4], DAG);
-    return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs, 8);
+    return DAG.getNode(AMDGPUISD::EXPORT, DL, N->getVTList(), NewArgs);
   }
   case AMDGPUISD::TEXTURE_FETCH: {
     SDValue Arg = N->getOperand(1);
@@ -1727,10 +1759,11 @@ SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
     };
     NewArgs[1] = OptimizeSwizzle(N->getOperand(1), &NewArgs[2], DAG);
     return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, SDLoc(N), N->getVTList(),
-        NewArgs, 19);
+        NewArgs);
   }
   }
-  return SDValue();
+
+  return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
 }
 
 static bool
@@ -1779,8 +1812,7 @@ FoldOperand(SDNode *ParentNode, unsigned SrcIdx, SDValue &Src, SDValue &Neg,
       TII->getOperandIdx(Opcode, AMDGPU::OpName::src1_W)
     };
     std::vector<unsigned> Consts;
-    for (unsigned i = 0; i < sizeof(SrcIndices) / sizeof(int); i++) {
-      int OtherSrcIdx = SrcIndices[i];
+    for (int OtherSrcIdx : SrcIndices) {
       int OtherSelIdx = TII->getSelIdx(Opcode, OtherSrcIdx);
       if (OtherSrcIdx < 0 || OtherSelIdx < 0)
         continue;
@@ -1791,14 +1823,14 @@ FoldOperand(SDNode *ParentNode, unsigned SrcIdx, SDValue &Src, SDValue &Neg,
       if (RegisterSDNode *Reg =
           dyn_cast<RegisterSDNode>(ParentNode->getOperand(OtherSrcIdx))) {
         if (Reg->getReg() == AMDGPU::ALU_CONST) {
-          ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(
-              ParentNode->getOperand(OtherSelIdx));
+          ConstantSDNode *Cst
+            = cast<ConstantSDNode>(ParentNode->getOperand(OtherSelIdx));
           Consts.push_back(Cst->getZExtValue());
         }
       }
     }
 
-    ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(CstOffset);
+    ConstantSDNode *Cst = cast<ConstantSDNode>(CstOffset);
     Consts.push_back(Cst->getZExtValue());
     if (!TII->fitsConstReadLimitations(Consts)) {
       return false;
diff --git a/lib/Target/R600/R600ISelLowering.h b/lib/Target/R600/R600ISelLowering.h
index 22ef728..a8a464f 100644
--- a/lib/Target/R600/R600ISelLowering.h
+++ b/lib/Target/R600/R600ISelLowering.h
@@ -24,21 +24,21 @@ class R600InstrInfo;
 class R600TargetLowering : public AMDGPUTargetLowering {
 public:
   R600TargetLowering(TargetMachine &TM);
-  virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr *MI,
-      MachineBasicBlock * BB) const;
-  virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const;
-  virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const;
-  virtual void ReplaceNodeResults(SDNode * N,
-                                  SmallVectorImpl<SDValue> &Results,
-                                  SelectionDAG &DAG) const override;
-  virtual SDValue LowerFormalArguments(
-                                      SDValue Chain,
-                                      CallingConv::ID CallConv,
-                                      bool isVarArg,
-                                      const SmallVectorImpl<ISD::InputArg> &Ins,
-                                      SDLoc DL, SelectionDAG &DAG,
-                                      SmallVectorImpl<SDValue> &InVals) const;
-  virtual EVT getSetCCResultType(LLVMContext &, EVT VT) const;
+  MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr *MI,
+      MachineBasicBlock * BB) const override;
+  SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
+  SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
+  void ReplaceNodeResults(SDNode * N,
+                          SmallVectorImpl<SDValue> &Results,
+                          SelectionDAG &DAG) const override;
+  SDValue LowerFormalArguments(
+                              SDValue Chain,
+                              CallingConv::ID CallConv,
+                              bool isVarArg,
+                              const SmallVectorImpl<ISD::InputArg> &Ins,
+                              SDLoc DL, SelectionDAG &DAG,
+                              SmallVectorImpl<SDValue> &InVals) const override;
+  EVT getSetCCResultType(LLVMContext &, EVT VT) const override;
 private:
   unsigned Gen;
   /// Each OpenCL kernel has nine implicit parameters that are stored in the
@@ -66,7 +66,7 @@ private:
   void getStackAddress(unsigned StackWidth, unsigned ElemIdx,
                        unsigned &Channel, unsigned &PtrIncr) const;
   bool isZero(SDValue Op) const;
-  virtual SDNode *PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const;
+  SDNode *PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override;
 };
 
 } // End namespace llvm;
diff --git a/lib/Target/R600/R600InstrInfo.cpp b/lib/Target/R600/R600InstrInfo.cpp
index 0281dd0..b0d9ae3 100644
--- a/lib/Target/R600/R600InstrInfo.cpp
+++ b/lib/Target/R600/R600InstrInfo.cpp
@@ -23,11 +23,11 @@
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 
+using namespace llvm;
+
 #define GET_INSTRINFO_CTOR_DTOR
 #include "AMDGPUGenDFAPacketizer.inc"
 
-using namespace llvm;
-
 R600InstrInfo::R600InstrInfo(AMDGPUTargetMachine &tm)
   : AMDGPUInstrInfo(tm),
     RI(tm),
@@ -677,7 +677,7 @@ findFirstPredicateSetterFrom(MachineBasicBlock &MBB,
       return MI;
   }
 
-  return NULL;
+  return nullptr;
 }
 
 static
@@ -797,7 +797,7 @@ R600InstrInfo::InsertBranch(MachineBasicBlock &MBB,
                             DebugLoc DL) const {
   assert(TBB && "InsertBranch must not be told to insert a fallthrough");
 
-  if (FBB == 0) {
+  if (!FBB) {
     if (Cond.empty()) {
       BuildMI(&MBB, DL, get(AMDGPU::JUMP)).addMBB(TBB);
       return 1;
diff --git a/lib/Target/R600/R600InstrInfo.h b/lib/Target/R600/R600InstrInfo.h
index d5ff4de..b5304a0 100644
--- a/lib/Target/R600/R600InstrInfo.h
+++ b/lib/Target/R600/R600InstrInfo.h
@@ -50,13 +50,13 @@ namespace llvm {
 
   explicit R600InstrInfo(AMDGPUTargetMachine &tm);
 
-  const R600RegisterInfo &getRegisterInfo() const;
-  virtual void copyPhysReg(MachineBasicBlock &MBB,
-                           MachineBasicBlock::iterator MI, DebugLoc DL,
-                           unsigned DestReg, unsigned SrcReg,
-                           bool KillSrc) const;
+  const R600RegisterInfo &getRegisterInfo() const override;
+  void copyPhysReg(MachineBasicBlock &MBB,
+                   MachineBasicBlock::iterator MI, DebugLoc DL,
+                   unsigned DestReg, unsigned SrcReg,
+                   bool KillSrc) const override;
   bool isLegalToSplitMBBAt(MachineBasicBlock &MBB,
-                           MachineBasicBlock::iterator MBBI) const;
+                           MachineBasicBlock::iterator MBBI) const override;
 
   bool isTrig(const MachineInstr &MI) const;
   bool isPlaceHolderOpcode(unsigned opcode) const;
@@ -142,79 +142,79 @@ namespace llvm {
   /// instruction slots within an instruction group.
   bool isVector(const MachineInstr &MI) const;
 
-  virtual unsigned getIEQOpcode() const;
-  virtual bool isMov(unsigned Opcode) const;
+  unsigned getIEQOpcode() const override;
+  bool isMov(unsigned Opcode) const override;
 
   DFAPacketizer *CreateTargetScheduleState(const TargetMachine *TM,
-                                           const ScheduleDAG *DAG) const;
+                                           const ScheduleDAG *DAG) const override;
 
-  bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const;
+  bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override;
 
   bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB,
-                     SmallVectorImpl<MachineOperand> &Cond, bool AllowModify) const;
+                     SmallVectorImpl<MachineOperand> &Cond, bool AllowModify) const override;
 
-  unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, const SmallVectorImpl<MachineOperand> &Cond, DebugLoc DL) const;
+  unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB, MachineBasicBlock *FBB, const SmallVectorImpl<MachineOperand> &Cond, DebugLoc DL) const override;
 
-  unsigned RemoveBranch(MachineBasicBlock &MBB) const;
+  unsigned RemoveBranch(MachineBasicBlock &MBB) const override;
 
-  bool isPredicated(const MachineInstr *MI) const;
+  bool isPredicated(const MachineInstr *MI) const override;
 
-  bool isPredicable(MachineInstr *MI) const;
+  bool isPredicable(MachineInstr *MI) const override;
 
   bool
    isProfitableToDupForIfCvt(MachineBasicBlock &MBB, unsigned NumCyles,
-                             const BranchProbability &Probability) const;
+                             const BranchProbability &Probability) const override;
 
   bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCyles,
                            unsigned ExtraPredCycles,
-                           const BranchProbability &Probability) const ;
+                           const BranchProbability &Probability) const override ;
 
   bool
    isProfitableToIfCvt(MachineBasicBlock &TMBB,
                        unsigned NumTCycles, unsigned ExtraTCycles,
                        MachineBasicBlock &FMBB,
                        unsigned NumFCycles, unsigned ExtraFCycles,
-                       const BranchProbability &Probability) const;
+                       const BranchProbability &Probability) const override;
 
   bool DefinesPredicate(MachineInstr *MI,
-                                  std::vector<MachineOperand> &Pred) const;
+                                  std::vector<MachineOperand> &Pred) const override;
 
   bool SubsumesPredicate(const SmallVectorImpl<MachineOperand> &Pred1,
-                         const SmallVectorImpl<MachineOperand> &Pred2) const;
+                         const SmallVectorImpl<MachineOperand> &Pred2) const override;
 
   bool isProfitableToUnpredicate(MachineBasicBlock &TMBB,
-                                          MachineBasicBlock &FMBB) const;
+                                          MachineBasicBlock &FMBB) const override;
 
   bool PredicateInstruction(MachineInstr *MI,
-                        const SmallVectorImpl<MachineOperand> &Pred) const;
+                        const SmallVectorImpl<MachineOperand> &Pred) const override;
 
-  unsigned int getPredicationCost(const MachineInstr *) const;
+  unsigned int getPredicationCost(const MachineInstr *) const override;
 
   unsigned int getInstrLatency(const InstrItineraryData *ItinData,
                                const MachineInstr *MI,
-                               unsigned *PredCost = 0) const;
+                               unsigned *PredCost = nullptr) const override;
 
-  virtual int getInstrLatency(const InstrItineraryData *ItinData,
-                              SDNode *Node) const { return 1;}
+  int getInstrLatency(const InstrItineraryData *ItinData,
+                      SDNode *Node) const override { return 1;}
 
   /// \brief Reserve the registers that may be accesed using indirect addressing.
   void reserveIndirectRegisters(BitVector &Reserved,
                                 const MachineFunction &MF) const;
 
-  virtual unsigned calculateIndirectAddress(unsigned RegIndex,
-                                            unsigned Channel) const;
+  unsigned calculateIndirectAddress(unsigned RegIndex,
+                                    unsigned Channel) const override;
 
-  virtual const TargetRegisterClass *getIndirectAddrRegClass() const;
+  const TargetRegisterClass *getIndirectAddrRegClass() const override;
 
-  virtual MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB,
-                                  MachineBasicBlock::iterator I,
-                                  unsigned ValueReg, unsigned Address,
-                                  unsigned OffsetReg) const;
+  MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB,
+                          MachineBasicBlock::iterator I,
+                          unsigned ValueReg, unsigned Address,
+                          unsigned OffsetReg) const override;
 
-  virtual MachineInstrBuilder buildIndirectRead(MachineBasicBlock *MBB,
-                                  MachineBasicBlock::iterator I,
-                                  unsigned ValueReg, unsigned Address,
-                                  unsigned OffsetReg) const;
+  MachineInstrBuilder buildIndirectRead(MachineBasicBlock *MBB,
+                                        MachineBasicBlock::iterator I,
+                                        unsigned ValueReg, unsigned Address,
+                                        unsigned OffsetReg) const override;
 
   unsigned getMaxAlusPerClause() const;
 
@@ -244,7 +244,7 @@ namespace llvm {
 
   MachineInstr *buildMovInstr(MachineBasicBlock *MBB,
                               MachineBasicBlock::iterator I,
-                              unsigned DstReg, unsigned SrcReg) const;
+                              unsigned DstReg, unsigned SrcReg) const override;
 
   /// \brief Get the index of Op in the MachineInstr.
   ///
diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td
index d2075c0..590fde2 100644
--- a/lib/Target/R600/R600Instructions.td
+++ b/lib/Target/R600/R600Instructions.td
@@ -1625,6 +1625,12 @@ def : DwordAddrPat  <i32, R600_Reg32>;
 
 } // End isR600toCayman Predicate
 
+let Predicates = [isR600] in {
+// Intrinsic patterns
+defm : Expand24IBitOps<MULLO_INT_r600, ADD_INT>;
+defm : Expand24UBitOps<MULLO_UINT_r600, ADD_INT>;
+} // End isR600
+
 def getLDSNoRetOp : InstrMapping {
   let FilterClass = "R600_LDS_1A1D";
   let RowFields = ["BaseOp"];
diff --git a/lib/Target/R600/R600MachineFunctionInfo.h b/lib/Target/R600/R600MachineFunctionInfo.h
index c1bec0a..b0ae22e 100644
--- a/lib/Target/R600/R600MachineFunctionInfo.h
+++ b/lib/Target/R600/R600MachineFunctionInfo.h
@@ -21,7 +21,7 @@
 namespace llvm {
 
 class R600MachineFunctionInfo : public AMDGPUMachineFunction {
-  virtual void anchor();
+  void anchor() override;
 public:
   R600MachineFunctionInfo(const MachineFunction &MF);
   SmallVector<unsigned, 4> LiveOuts;
diff --git a/lib/Target/R600/R600MachineScheduler.cpp b/lib/Target/R600/R600MachineScheduler.cpp
index d3ffb50..d1655d1 100644
--- a/lib/Target/R600/R600MachineScheduler.cpp
+++ b/lib/Target/R600/R600MachineScheduler.cpp
@@ -12,8 +12,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "misched"
-
 #include "R600MachineScheduler.h"
 #include "llvm/CodeGen/LiveIntervalAnalysis.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -23,6 +21,8 @@
 
 using namespace llvm;
 
+#define DEBUG_TYPE "misched"
+
 void R600SchedStrategy::initialize(ScheduleDAGMI *dag) {
   assert(dag->hasVRegLiveness() && "R600SchedStrategy needs vreg liveness");
   DAG = static_cast<ScheduleDAGMILive*>(dag);
@@ -56,7 +56,7 @@ unsigned getWFCountLimitedByGPR(unsigned GPRCount) {
 }
 
 SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) {
-  SUnit *SU = 0;
+  SUnit *SU = nullptr;
   NextInstKind = IDOther;
 
   IsTopNode = false;
@@ -316,7 +316,7 @@ int R600SchedStrategy::getInstKind(SUnit* SU) {
 
 SUnit *R600SchedStrategy::PopInst(std::vector<SUnit *> &Q, bool AnyALU) {
   if (Q.empty())
-    return NULL;
+    return nullptr;
   for (std::vector<SUnit *>::reverse_iterator It = Q.rbegin(), E = Q.rend();
       It != E; ++It) {
     SUnit *SU = *It;
@@ -331,7 +331,7 @@ SUnit *R600SchedStrategy::PopInst(std::vector<SUnit *> &Q, bool AnyALU) {
       InstructionsGroupCandidate.pop_back();
     }
   }
-  return NULL;
+  return nullptr;
 }
 
 void R600SchedStrategy::LoadAlu() {
@@ -448,11 +448,11 @@ SUnit* R600SchedStrategy::pickAlu() {
     }
     PrepareNextSlot();
   }
-  return NULL;
+  return nullptr;
 }
 
 SUnit* R600SchedStrategy::pickOther(int QID) {
-  SUnit *SU = 0;
+  SUnit *SU = nullptr;
   std::vector<SUnit *> &AQ = Available[QID];
 
   if (AQ.empty()) {
diff --git a/lib/Target/R600/R600MachineScheduler.h b/lib/Target/R600/R600MachineScheduler.h
index b909ff7..fd475af 100644
--- a/lib/Target/R600/R600MachineScheduler.h
+++ b/lib/Target/R600/R600MachineScheduler.h
@@ -68,17 +68,16 @@ class R600SchedStrategy : public MachineSchedStrategy {
 
 public:
   R600SchedStrategy() :
-    DAG(0), TII(0), TRI(0), MRI(0) {
+    DAG(nullptr), TII(nullptr), TRI(nullptr), MRI(nullptr) {
   }
 
-  virtual ~R600SchedStrategy() {
-  }
+  virtual ~R600SchedStrategy() {}
 
-  virtual void initialize(ScheduleDAGMI *dag);
-  virtual SUnit *pickNode(bool &IsTopNode);
-  virtual void schedNode(SUnit *SU, bool IsTopNode);
-  virtual void releaseTopNode(SUnit *SU);
-  virtual void releaseBottomNode(SUnit *SU);
+  void initialize(ScheduleDAGMI *dag) override;
+  SUnit *pickNode(bool &IsTopNode) override;
+  void schedNode(SUnit *SU, bool IsTopNode) override;
+  void releaseTopNode(SUnit *SU) override;
+  void releaseBottomNode(SUnit *SU) override;
 
 private:
   std::vector<MachineInstr *> InstructionsGroupCandidate;
diff --git a/lib/Target/R600/R600OptimizeVectorRegisters.cpp b/lib/Target/R600/R600OptimizeVectorRegisters.cpp
index 767e5e3..2314136 100644
--- a/lib/Target/R600/R600OptimizeVectorRegisters.cpp
+++ b/lib/Target/R600/R600OptimizeVectorRegisters.cpp
@@ -27,7 +27,6 @@
 /// to reduce MOV count.
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "vec-merger"
 #include "llvm/Support/Debug.h"
 #include "AMDGPU.h"
 #include "R600InstrInfo.h"
@@ -42,6 +41,8 @@
 
 using namespace llvm;
 
+#define DEBUG_TYPE "vec-merger"
+
 namespace {
 
 static bool
@@ -107,9 +108,9 @@ private:
 public:
   static char ID;
   R600VectorRegMerger(TargetMachine &tm) : MachineFunctionPass(ID),
-  TII(0) { }
+  TII(nullptr) { }
 
-  void getAnalysisUsage(AnalysisUsage &AU) const {
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesCFG();
     AU.addRequired<MachineDominatorTree>();
     AU.addPreserved<MachineDominatorTree>();
@@ -118,11 +119,11 @@ public:
     MachineFunctionPass::getAnalysisUsage(AU);
   }
 
-  const char *getPassName() const {
+  const char *getPassName() const override {
     return "R600 Vector Registers Merge Pass";
   }
 
-  bool runOnMachineFunction(MachineFunction &Fn);
+  bool runOnMachineFunction(MachineFunction &Fn) override;
 };
 
 char R600VectorRegMerger::ID = 0;
diff --git a/lib/Target/R600/R600Packetizer.cpp b/lib/Target/R600/R600Packetizer.cpp
index b7b7610..c2f6c03 100644
--- a/lib/Target/R600/R600Packetizer.cpp
+++ b/lib/Target/R600/R600Packetizer.cpp
@@ -14,7 +14,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "packets"
 #include "llvm/Support/Debug.h"
 #include "AMDGPU.h"
 #include "R600InstrInfo.h"
@@ -28,6 +27,8 @@
 
 using namespace llvm;
 
+#define DEBUG_TYPE "packets"
+
 namespace {
 
 class R600Packetizer : public MachineFunctionPass {
@@ -36,7 +37,7 @@ public:
   static char ID;
   R600Packetizer(const TargetMachine &TM) : MachineFunctionPass(ID) {}
 
-  void getAnalysisUsage(AnalysisUsage &AU) const {
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesCFG();
     AU.addRequired<MachineDominatorTree>();
     AU.addPreserved<MachineDominatorTree>();
@@ -45,11 +46,11 @@ public:
     MachineFunctionPass::getAnalysisUsage(AU);
   }
 
-  const char *getPassName() const {
+  const char *getPassName() const override {
     return "R600 Packetizer";
   }
 
-  bool runOnMachineFunction(MachineFunction &Fn);
+  bool runOnMachineFunction(MachineFunction &Fn) override;
 };
 char R600Packetizer::ID = 0;
 
@@ -155,18 +156,19 @@ public:
   }
 
   // initPacketizerState - initialize some internal flags.
-  void initPacketizerState() {
+  void initPacketizerState() override {
     ConsideredInstUsesAlreadyWrittenVectorElement = false;
   }
 
   // ignorePseudoInstruction - Ignore bundling of pseudo instructions.
-  bool ignorePseudoInstruction(MachineInstr *MI, MachineBasicBlock *MBB) {
+  bool ignorePseudoInstruction(MachineInstr *MI,
+                               MachineBasicBlock *MBB) override {
     return false;
   }
 
   // isSoloInstruction - return true if instruction MI can not be packetized
   // with any other instruction, which means that MI itself is a packet.
-  bool isSoloInstruction(MachineInstr *MI) {
+  bool isSoloInstruction(MachineInstr *MI) override {
     if (TII->isVector(*MI))
       return true;
     if (!TII->isALUInstr(MI->getOpcode()))
@@ -182,7 +184,7 @@ public:
 
   // isLegalToPacketizeTogether - Is it legal to packetize SUI and SUJ
   // together.
-  bool isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) {
+  bool isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) override {
     MachineInstr *MII = SUI->getInstr(), *MIJ = SUJ->getInstr();
     if (getSlot(MII) == getSlot(MIJ))
       ConsideredInstUsesAlreadyWrittenVectorElement = true;
@@ -219,7 +221,9 @@ public:
 
   // isLegalToPruneDependencies - Is it legal to prune dependece between SUI
   // and SUJ.
-  bool isLegalToPruneDependencies(SUnit *SUI, SUnit *SUJ) {return false;}
+  bool isLegalToPruneDependencies(SUnit *SUI, SUnit *SUJ) override {
+    return false;
+  }
 
   void setIsLastBit(MachineInstr *MI, unsigned Bit) const {
     unsigned LastOp = TII->getOperandIdx(MI->getOpcode(), AMDGPU::OpName::last);
@@ -288,7 +292,7 @@ public:
     return true;
   }
 
-  MachineBasicBlock::iterator addToPacket(MachineInstr *MI) {
+  MachineBasicBlock::iterator addToPacket(MachineInstr *MI) override {
     MachineBasicBlock::iterator FirstInBundle =
         CurrentPacketMIs.empty() ? MI : CurrentPacketMIs.front();
     const DenseMap<unsigned, unsigned> &PV =
diff --git a/lib/Target/R600/R600RegisterInfo.h b/lib/Target/R600/R600RegisterInfo.h
index c74c49e..52e1a4b 100644
--- a/lib/Target/R600/R600RegisterInfo.h
+++ b/lib/Target/R600/R600RegisterInfo.h
@@ -28,27 +28,28 @@ struct R600RegisterInfo : public AMDGPURegisterInfo {
 
   R600RegisterInfo(AMDGPUTargetMachine &tm);
 
-  virtual BitVector getReservedRegs(const MachineFunction &MF) const;
+  BitVector getReservedRegs(const MachineFunction &MF) const override;
 
   /// \param RC is an AMDIL reg class.
   ///
   /// \returns the R600 reg class that is equivalent to \p RC.
-  virtual const TargetRegisterClass *getISARegClass(
-    const TargetRegisterClass *RC) const;
+  const TargetRegisterClass *getISARegClass(
+    const TargetRegisterClass *RC) const override;
 
   /// \brief get the HW encoding for a register's channel.
   unsigned getHWRegChan(unsigned reg) const;
 
-  virtual unsigned getHWRegIndex(unsigned Reg) const;
+  unsigned getHWRegIndex(unsigned Reg) const override;
 
   /// \brief get the register class of the specified type to use in the
   /// CFGStructurizer
-  virtual const TargetRegisterClass * getCFGStructurizerRegClass(MVT VT) const;
+  const TargetRegisterClass * getCFGStructurizerRegClass(MVT VT) const override;
 
-  virtual const RegClassWeight &getRegClassWeight(const TargetRegisterClass *RC) const;
+  const RegClassWeight &
+    getRegClassWeight(const TargetRegisterClass *RC) const override;
 
   // \returns true if \p Reg can be defined in one ALU caluse and used in another.
-  virtual bool isPhysRegLiveAcrossClauses(unsigned Reg) const;
+  bool isPhysRegLiveAcrossClauses(unsigned Reg) const;
 };
 
 } // End namespace llvm
diff --git a/lib/Target/R600/R600TextureIntrinsicsReplacer.cpp b/lib/Target/R600/R600TextureIntrinsicsReplacer.cpp
index 9d24404..419ec8b 100644
--- a/lib/Target/R600/R600TextureIntrinsicsReplacer.cpp
+++ b/lib/Target/R600/R600TextureIntrinsicsReplacer.cpp
@@ -209,7 +209,7 @@ public:
     FunctionPass(ID) {
   }
 
-  virtual bool doInitialization(Module &M) {
+  bool doInitialization(Module &M) override {
     LLVMContext &Ctx = M.getContext();
     Mod = &M;
     FloatType = Type::getFloatTy(Ctx);
@@ -245,16 +245,16 @@ public:
     return false;
   }
 
-  virtual bool runOnFunction(Function &F) {
+  bool runOnFunction(Function &F) override {
     visit(F);
     return false;
   }
 
-  virtual const char *getPassName() const {
+  const char *getPassName() const override {
     return "R600 Texture Intrinsics Replacer";
   }
 
-  void getAnalysisUsage(AnalysisUsage &AU) const {
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
   }
 
   void visitCallInst(CallInst &I) {
diff --git a/lib/Target/R600/SIAnnotateControlFlow.cpp b/lib/Target/R600/SIAnnotateControlFlow.cpp
index f9214a8..d6e4451 100644
--- a/lib/Target/R600/SIAnnotateControlFlow.cpp
+++ b/lib/Target/R600/SIAnnotateControlFlow.cpp
@@ -12,8 +12,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "si-annotate-control-flow"
-
 #include "AMDGPU.h"
 #include "llvm/ADT/DepthFirstIterator.h"
 #include "llvm/IR/Constants.h"
@@ -26,6 +24,8 @@
 
 using namespace llvm;
 
+#define DEBUG_TYPE "si-annotate-control-flow"
+
 namespace {
 
 // Complex types used in this pass
@@ -91,15 +91,15 @@ public:
   SIAnnotateControlFlow():
     FunctionPass(ID) { }
 
-  virtual bool doInitialization(Module &M);
+  bool doInitialization(Module &M) override;
 
-  virtual bool runOnFunction(Function &F);
+  bool runOnFunction(Function &F) override;
 
-  virtual const char *getPassName() const {
+  const char *getPassName() const override {
     return "SI annotate control flow";
   }
 
-  virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<DominatorTreeWrapperPass>();
     AU.addPreserved<DominatorTreeWrapperPass>();
     FunctionPass::getAnalysisUsage(AU);
@@ -118,7 +118,7 @@ bool SIAnnotateControlFlow::doInitialization(Module &M) {
   Void = Type::getVoidTy(Context);
   Boolean = Type::getInt1Ty(Context);
   Int64 = Type::getInt64Ty(Context);
-  ReturnStruct = StructType::get(Boolean, Int64, (Type *)0);
+  ReturnStruct = StructType::get(Boolean, Int64, (Type *)nullptr);
 
   BoolTrue = ConstantInt::getTrue(Context);
   BoolFalse = ConstantInt::getFalse(Context);
@@ -126,25 +126,25 @@ bool SIAnnotateControlFlow::doInitialization(Module &M) {
   Int64Zero = ConstantInt::get(Int64, 0);
 
   If = M.getOrInsertFunction(
-    IfIntrinsic, ReturnStruct, Boolean, (Type *)0);
+    IfIntrinsic, ReturnStruct, Boolean, (Type *)nullptr);
 
   Else = M.getOrInsertFunction(
-    ElseIntrinsic, ReturnStruct, Int64, (Type *)0);
+    ElseIntrinsic, ReturnStruct, Int64, (Type *)nullptr);
 
   Break = M.getOrInsertFunction(
-    BreakIntrinsic, Int64, Int64, (Type *)0);
+    BreakIntrinsic, Int64, Int64, (Type *)nullptr);
 
   IfBreak = M.getOrInsertFunction(
-    IfBreakIntrinsic, Int64, Boolean, Int64, (Type *)0);
+    IfBreakIntrinsic, Int64, Boolean, Int64, (Type *)nullptr);
 
   ElseBreak = M.getOrInsertFunction(
-    ElseBreakIntrinsic, Int64, Int64, Int64, (Type *)0);
+    ElseBreakIntrinsic, Int64, Int64, Int64, (Type *)nullptr);
 
   Loop = M.getOrInsertFunction(
-    LoopIntrinsic, Boolean, Int64, (Type *)0);
+    LoopIntrinsic, Boolean, Int64, (Type *)nullptr);
 
   EndCf = M.getOrInsertFunction(
-    EndCfIntrinsic, Void, Int64, (Type *)0);
+    EndCfIntrinsic, Void, Int64, (Type *)nullptr);
 
   return false;
 }
diff --git a/lib/Target/R600/SIFixSGPRCopies.cpp b/lib/Target/R600/SIFixSGPRCopies.cpp
index 402f1f4..5f71453 100644
--- a/lib/Target/R600/SIFixSGPRCopies.cpp
+++ b/lib/Target/R600/SIFixSGPRCopies.cpp
@@ -65,7 +65,6 @@
 /// ultimately led to the creation of an illegal COPY.
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "sgpr-copies"
 #include "AMDGPU.h"
 #include "SIInstrInfo.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
@@ -77,6 +76,8 @@
 
 using namespace llvm;
 
+#define DEBUG_TYPE "sgpr-copies"
+
 namespace {
 
 class SIFixSGPRCopies : public MachineFunctionPass {
@@ -97,9 +98,9 @@ private:
 public:
   SIFixSGPRCopies(TargetMachine &tm) : MachineFunctionPass(ID) { }
 
-  virtual bool runOnMachineFunction(MachineFunction &MF);
+  bool runOnMachineFunction(MachineFunction &MF) override;
 
-  const char *getPassName() const {
+  const char *getPassName() const override {
     return "SI Fix SGPR copies";
   }
 
@@ -184,7 +185,8 @@ bool SIFixSGPRCopies::isVGPRToSGPRCopy(const MachineInstr &Copy,
   const TargetRegisterClass *SrcRC;
 
   if (!TargetRegisterInfo::isVirtualRegister(SrcReg) ||
-      DstRC == &AMDGPU::M0RegRegClass)
+      DstRC == &AMDGPU::M0RegRegClass ||
+      MRI.getRegClass(SrcReg) == &AMDGPU::VReg_1RegClass)
     return false;
 
   SrcRC = TRI->getSubRegClass(MRI.getRegClass(SrcReg), SrcSubReg);
@@ -256,6 +258,19 @@ bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
         TII->moveToVALU(MI);
         break;
       }
+      case AMDGPU::INSERT_SUBREG: {
+        const TargetRegisterClass *DstRC, *Src0RC, *Src1RC;
+        DstRC = MRI.getRegClass(MI.getOperand(0).getReg());
+        Src0RC = MRI.getRegClass(MI.getOperand(1).getReg());
+        Src1RC = MRI.getRegClass(MI.getOperand(2).getReg());
+        if (TRI->isSGPRClass(DstRC) &&
+            (TRI->hasVGPRs(Src0RC) || TRI->hasVGPRs(Src1RC))) {
+          DEBUG(dbgs() << " Fixing INSERT_SUBREG:\n");
+          DEBUG(MI.print(dbgs()));
+          TII->moveToVALU(MI);
+        }
+        break;
+      }
       }
     }
   }
diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp
index 0b55411..c9e247c 100644
--- a/lib/Target/R600/SIISelLowering.cpp
+++ b/lib/Target/R600/SIISelLowering.cpp
@@ -29,22 +29,21 @@ using namespace llvm;
 
 SITargetLowering::SITargetLowering(TargetMachine &TM) :
     AMDGPUTargetLowering(TM) {
-  addRegisterClass(MVT::i1, &AMDGPU::SReg_64RegClass);
-  addRegisterClass(MVT::i64, &AMDGPU::VSrc_64RegClass);
+  addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
+  addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
 
   addRegisterClass(MVT::v32i8, &AMDGPU::SReg_256RegClass);
   addRegisterClass(MVT::v64i8, &AMDGPU::SReg_512RegClass);
 
-  addRegisterClass(MVT::i32, &AMDGPU::VSrc_32RegClass);
-  addRegisterClass(MVT::f32, &AMDGPU::VSrc_32RegClass);
+  addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
+  addRegisterClass(MVT::f32, &AMDGPU::VReg_32RegClass);
 
-  addRegisterClass(MVT::f64, &AMDGPU::VSrc_64RegClass);
-  addRegisterClass(MVT::v2i32, &AMDGPU::VSrc_64RegClass);
-  addRegisterClass(MVT::v2f32, &AMDGPU::VSrc_64RegClass);
+  addRegisterClass(MVT::f64, &AMDGPU::VReg_64RegClass);
+  addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
+  addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass);
 
-  addRegisterClass(MVT::v4i32, &AMDGPU::VReg_128RegClass);
+  addRegisterClass(MVT::v4i32, &AMDGPU::SReg_128RegClass);
   addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass);
-  addRegisterClass(MVT::i128, &AMDGPU::SReg_128RegClass);
 
   addRegisterClass(MVT::v8i32, &AMDGPU::VReg_256RegClass);
   addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass);
@@ -78,8 +77,6 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
   setOperationAction(ISD::ADDC, MVT::i32, Legal);
   setOperationAction(ISD::ADDE, MVT::i32, Legal);
 
-  setOperationAction(ISD::BITCAST, MVT::i128, Legal);
-
   // We need to custom lower vector stores from local memory
   setOperationAction(ISD::LOAD, MVT::v2i32, Custom);
   setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
@@ -99,10 +96,11 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
   setOperationAction(ISD::STORE, MVT::i1, Custom);
   setOperationAction(ISD::STORE, MVT::i32, Custom);
   setOperationAction(ISD::STORE, MVT::i64, Custom);
-  setOperationAction(ISD::STORE, MVT::i128, Custom);
   setOperationAction(ISD::STORE, MVT::v2i32, Custom);
   setOperationAction(ISD::STORE, MVT::v4i32, Custom);
 
+  setOperationAction(ISD::SELECT, MVT::f32, Promote);
+  AddPromotedToType(ISD::SELECT, MVT::f32, MVT::i32);
   setOperationAction(ISD::SELECT, MVT::i64, Custom);
   setOperationAction(ISD::SELECT, MVT::f64, Promote);
   AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
@@ -119,6 +117,22 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
   setOperationAction(ISD::SIGN_EXTEND, MVT::i64, Custom);
   setOperationAction(ISD::ZERO_EXTEND, MVT::i64, Custom);
 
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Legal);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Custom);
+
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Legal);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i8, Custom);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i8, Custom);
+
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i16, Custom);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Custom);
+
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Custom);
+
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Custom);
+
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom);
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v16i8, Custom);
@@ -126,39 +140,48 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
 
   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
 
-  setLoadExtAction(ISD::SEXTLOAD, MVT::i32, Expand);
+  setLoadExtAction(ISD::SEXTLOAD, MVT::i1, Promote);
   setLoadExtAction(ISD::SEXTLOAD, MVT::i8, Custom);
   setLoadExtAction(ISD::SEXTLOAD, MVT::i16, Custom);
-  setLoadExtAction(ISD::ZEXTLOAD, MVT::i32, Expand);
-  setLoadExtAction(ISD::ZEXTLOAD, MVT::i8, Custom);
-  setLoadExtAction(ISD::ZEXTLOAD, MVT::i16, Custom);
+  setLoadExtAction(ISD::SEXTLOAD, MVT::i32, Expand);
   setLoadExtAction(ISD::SEXTLOAD, MVT::v8i16, Expand);
   setLoadExtAction(ISD::SEXTLOAD, MVT::v16i16, Expand);
 
+  setLoadExtAction(ISD::ZEXTLOAD, MVT::i1, Promote);
+  setLoadExtAction(ISD::ZEXTLOAD, MVT::i8, Custom);
+  setLoadExtAction(ISD::ZEXTLOAD, MVT::i16, Custom);
+  setLoadExtAction(ISD::ZEXTLOAD, MVT::i32, Expand);
+
+  setLoadExtAction(ISD::EXTLOAD, MVT::i1, Promote);
   setLoadExtAction(ISD::EXTLOAD, MVT::i8, Custom);
   setLoadExtAction(ISD::EXTLOAD, MVT::i16, Custom);
   setLoadExtAction(ISD::EXTLOAD, MVT::i32, Expand);
   setLoadExtAction(ISD::EXTLOAD, MVT::f32, Expand);
+
   setTruncStoreAction(MVT::i32, MVT::i8, Custom);
   setTruncStoreAction(MVT::i32, MVT::i16, Custom);
   setTruncStoreAction(MVT::f64, MVT::f32, Expand);
   setTruncStoreAction(MVT::i64, MVT::i32, Expand);
-  setTruncStoreAction(MVT::i128, MVT::i64, Expand);
   setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
   setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
 
+  setOperationAction(ISD::LOAD, MVT::i1, Custom);
+
   setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
   setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
   setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
 
+  // These should use UDIVREM, so set them to expand
+  setOperationAction(ISD::UDIV, MVT::i64, Expand);
+  setOperationAction(ISD::UREM, MVT::i64, Expand);
+
   // We only support LOAD/STORE and vector manipulation ops for vectors
   // with > 4 elements.
   MVT VecTypes[] = {
     MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32
   };
 
-  const size_t NumVecTypes = array_lengthof(VecTypes);
-  for (unsigned Type = 0; Type < NumVecTypes; ++Type) {
+  for (MVT VT : VecTypes) {
     for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
       switch(Op) {
       case ISD::LOAD:
@@ -172,7 +195,7 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
       case ISD::EXTRACT_SUBVECTOR:
         break;
       default:
-        setOperationAction(Op, VecTypes[Type], Expand);
+        setOperationAction(Op, VT, Expand);
         break;
       }
     }
@@ -189,6 +212,7 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
     setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
     setOperationAction(ISD::FCEIL, MVT::f64, Legal);
     setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
+    setOperationAction(ISD::FRINT, MVT::f64, Legal);
   }
 
   setTargetDAGCombine(ISD::SELECT_CC);
@@ -204,10 +228,40 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
 bool SITargetLowering::allowsUnalignedMemoryAccesses(EVT  VT,
                                                      unsigned AddrSpace,
                                                      bool *IsFast) const {
+  if (IsFast)
+    *IsFast = false;
+
   // XXX: This depends on the address space and also we may want to revist
   // the alignment values we specify in the DataLayout.
+
+  // TODO: I think v3i32 should allow unaligned accesses on CI with DS_READ_B96,
+  // which isn't a simple VT.
   if (!VT.isSimple() || VT == MVT::Other)
     return false;
+
+  // XXX - CI changes say "Support for unaligned memory accesses" but I don't
+  // see what for specifically. The wording everywhere else seems to be the
+  // same.
+
+  // 3.6.4 - Operations using pairs of VGPRs (for example: double-floats) have
+  // no alignment restrictions.
+  if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
+    // Using any pair of GPRs should be the same as any other pair.
+    if (IsFast)
+      *IsFast = true;
+    return VT.bitsGE(MVT::i64);
+  }
+
+  // XXX - The only mention I see of this in the ISA manual is for LDS direct
+  // reads the "byte address and must be dword aligned". Is it also true for the
+  // normal loads and stores?
+  if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS)
+    return false;
+
+  // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
+  // byte-address are ignored, thus forcing Dword alignment.
+  if (IsFast)
+    *IsFast = true;
   return VT.bitsGT(MVT::i32);
 }
 
@@ -224,7 +278,7 @@ bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
 
 SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
                                          SDLoc DL, SDValue Chain,
-                                         unsigned Offset) const {
+                                         unsigned Offset, bool Signed) const {
   MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
   PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
                                             AMDGPUAS::CONSTANT_ADDRESS);
@@ -232,7 +286,7 @@ SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
                            MRI.getLiveInVirtReg(AMDGPU::SGPR0_SGPR1), MVT::i64);
   SDValue Ptr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
                                              DAG.getConstant(Offset, MVT::i64));
-  return DAG.getExtLoad(ISD::SEXTLOAD, DL, VT, Chain, Ptr,
+  return DAG.getExtLoad(Signed ? ISD::SEXTLOAD : ISD::ZEXTLOAD, DL, VT, Chain, Ptr,
                             MachinePointerInfo(UndefValue::get(PtrTy)), MemVT,
                             false, false, MemVT.getSizeInBits() >> 3);
 
@@ -340,7 +394,8 @@ SDValue SITargetLowering::LowerFormalArguments(
       // The first 36 bytes of the input buffer contains information about
       // thread group and global sizes.
       SDValue Arg = LowerParameter(DAG, VT, MemVT,  DL, DAG.getRoot(),
-                                   36 + VA.getLocMemOffset());
+                                   36 + VA.getLocMemOffset(),
+                                   Ins[i].Flags.isSExt());
       InVals.push_back(Arg);
       continue;
     }
@@ -381,8 +436,7 @@ SDValue SITargetLowering::LowerFormalArguments(
       for (unsigned j = 0; j != NumElements; ++j)
         Regs.push_back(DAG.getUNDEF(VT));
 
-      InVals.push_back(DAG.getNode(ISD::BUILD_VECTOR, DL, Arg.VT,
-                                   Regs.data(), Regs.size()));
+      InVals.push_back(DAG.getNode(ISD::BUILD_VECTOR, DL, Arg.VT, Regs));
       continue;
     }
 
@@ -395,15 +449,15 @@ MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(
     MachineInstr * MI, MachineBasicBlock * BB) const {
 
   MachineBasicBlock::iterator I = *MI;
+  const SIInstrInfo *TII =
+    static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
+  MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
 
   switch (MI->getOpcode()) {
   default:
     return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
   case AMDGPU::BRANCH: return BB;
   case AMDGPU::SI_ADDR64_RSRC: {
-    const SIInstrInfo *TII =
-      static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
-    MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
     unsigned SuperReg = MI->getOperand(0).getReg();
     unsigned SubRegLo = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass);
     unsigned SubRegHi = MRI.createVirtualRegister(&AMDGPU::SGPR_64RegClass);
@@ -428,9 +482,7 @@ MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(
     MI->eraseFromParent();
     break;
   }
-  case AMDGPU::V_SUB_F64: {
-    const SIInstrInfo *TII =
-      static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
+  case AMDGPU::V_SUB_F64:
     BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_ADD_F64),
             MI->getOperand(0).getReg())
             .addReg(MI->getOperand(1).getReg())
@@ -442,11 +494,9 @@ MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(
             .addImm(2); /* NEG */
     MI->eraseFromParent();
     break;
-  }
+
   case AMDGPU::SI_RegisterStorePseudo: {
     MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
-    const SIInstrInfo *TII =
-      static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
     unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
     MachineInstrBuilder MIB =
         BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::SI_RegisterStore),
@@ -455,6 +505,50 @@ MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter(
       MIB.addOperand(MI->getOperand(i));
 
     MI->eraseFromParent();
+    break;
+  }
+  case AMDGPU::FABS_SI: {
+    MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+    const SIInstrInfo *TII =
+      static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
+    unsigned Reg = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);
+    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32),
+            Reg)
+            .addImm(0x7fffffff);
+    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_AND_B32_e32),
+            MI->getOperand(0).getReg())
+            .addReg(MI->getOperand(1).getReg())
+            .addReg(Reg);
+    MI->eraseFromParent();
+    break;
+  }
+  case AMDGPU::FNEG_SI: {
+    MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+    const SIInstrInfo *TII =
+      static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
+    unsigned Reg = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);
+    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_MOV_B32_e32),
+            Reg)
+            .addImm(0x80000000);
+    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_XOR_B32_e32),
+            MI->getOperand(0).getReg())
+            .addReg(MI->getOperand(1).getReg())
+            .addReg(Reg);
+    MI->eraseFromParent();
+    break;
+  }
+  case AMDGPU::FCLAMP_SI: {
+    const SIInstrInfo *TII =
+      static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
+    BuildMI(*BB, I, MI->getDebugLoc(), TII->get(AMDGPU::V_ADD_F32_e64),
+            MI->getOperand(0).getReg())
+            .addImm(0) // SRC0 modifiers
+            .addOperand(MI->getOperand(1))
+            .addImm(0) // SRC1 modifiers
+            .addImm(0) // SRC1
+            .addImm(1) // CLAMP
+            .addImm(0); // OMOD
+    MI->eraseFromParent();
   }
   }
   return BB;
@@ -510,7 +604,7 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
         SplitVectorLoad(Op, DAG),
         Load->getChain()
       };
-      return DAG.getMergeValues(MergedValues, 2, SDLoc(Op));
+      return DAG.getMergeValues(MergedValues, SDLoc(Op));
     } else {
       return LowerLOAD(Op, DAG);
     }
@@ -533,23 +627,23 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
     switch (IntrinsicID) {
     default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
     case Intrinsic::r600_read_ngroups_x:
-      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 0);
+      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 0, false);
     case Intrinsic::r600_read_ngroups_y:
-      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 4);
+      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 4, false);
     case Intrinsic::r600_read_ngroups_z:
-      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 8);
+      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 8, false);
     case Intrinsic::r600_read_global_size_x:
-      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 12);
+      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 12, false);
     case Intrinsic::r600_read_global_size_y:
-      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 16);
+      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 16, false);
     case Intrinsic::r600_read_global_size_z:
-      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 20);
+      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 20, false);
     case Intrinsic::r600_read_local_size_x:
-      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 24);
+      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 24, false);
     case Intrinsic::r600_read_local_size_y:
-      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 28);
+      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 28, false);
     case Intrinsic::r600_read_local_size_z:
-      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 32);
+      return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(), 32, false);
     case Intrinsic::r600_read_tgid_x:
       return CreateLiveInRegister(DAG, &AMDGPU::SReg_32RegClass,
                      AMDGPU::SReg_32RegClass.getRegister(NumUserSGPRs + 0), VT);
@@ -570,7 +664,7 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
                                   AMDGPU::VGPR2, VT);
     case AMDGPUIntrinsic::SI_load_const: {
       SDValue Ops [] = {
-        ResourceDescriptorToi128(Op.getOperand(1), DAG),
+        Op.getOperand(1),
         Op.getOperand(2)
       };
 
@@ -579,7 +673,7 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
           MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant,
           VT.getSizeInBits() / 8, 4);
       return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL,
-                                     Op->getVTList(), Ops, 2, VT, MMO);
+                                     Op->getVTList(), Ops, VT, MMO);
     }
     case AMDGPUIntrinsic::SI_sample:
       return LowerSampleIntrinsic(AMDGPUISD::SAMPLE, Op, DAG);
@@ -591,7 +685,7 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
       return LowerSampleIntrinsic(AMDGPUISD::SAMPLEL, Op, DAG);
     case AMDGPUIntrinsic::SI_vs_load_input:
       return DAG.getNode(AMDGPUISD::LOAD_INPUT, DL, VT,
-                         ResourceDescriptorToi128(Op.getOperand(1), DAG),
+                         Op.getOperand(1),
                          Op.getOperand(2),
                          Op.getOperand(3));
     }
@@ -606,7 +700,7 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
         SDLoc DL(Op);
         SDValue Ops [] = {
           Chain,
-          ResourceDescriptorToi128(Op.getOperand(2), DAG),
+          Op.getOperand(2),
           Op.getOperand(3),
           Op.getOperand(4),
           Op.getOperand(5),
@@ -627,8 +721,7 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
             MachineMemOperand::MOStore,
             VT.getSizeInBits() / 8, 4);
         return DAG.getMemIntrinsicNode(AMDGPUISD::TBUFFER_STORE_FORMAT, DL,
-                                       Op->getVTList(), Ops,
-                                       sizeof(Ops)/sizeof(Ops[0]), VT, MMO);
+                                       Op->getVTList(), Ops, VT, MMO);
       }
       default:
         break;
@@ -650,7 +743,7 @@ static SDNode *findUser(SDValue Value, unsigned Opcode) {
     if (I->getOpcode() == Opcode)
       return *I;
   }
-  return 0;
+  return nullptr;
 }
 
 /// This transforms the control flow intrinsics to get the branch destination as
@@ -662,7 +755,7 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
 
   SDNode *Intr = BRCOND.getOperand(1).getNode();
   SDValue Target = BRCOND.getOperand(2);
-  SDNode *BR = 0;
+  SDNode *BR = nullptr;
 
   if (Intr->getOpcode() == ISD::SETCC) {
     // As long as we negate the condition everything is fine
@@ -695,7 +788,7 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
   // build the new intrinsic call
   SDNode *Result = DAG.getNode(
     Res.size() > 1 ? ISD::INTRINSIC_W_CHAIN : ISD::INTRINSIC_VOID, DL,
-    DAG.getVTList(Res.data(), Res.size()), Ops.data(), Ops.size()).getNode();
+    DAG.getVTList(Res), Ops).getNode();
 
   if (BR) {
     // Give the branch instruction our target
@@ -703,7 +796,7 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
       BR->getOperand(0),
       BRCOND.getOperand(2)
     };
-    DAG.MorphNodeTo(BR, ISD::BR, BR->getVTList(), Ops, 2);
+    DAG.MorphNodeTo(BR, ISD::BR, BR->getVTList(), Ops);
   }
 
   SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
@@ -739,7 +832,7 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
   MergedValues[1] = Load->getChain();
   if (Ret.getNode()) {
     MergedValues[0] = Ret;
-    return DAG.getMergeValues(MergedValues, 2, DL);
+    return DAG.getMergeValues(MergedValues, DL);
   }
 
   if (Load->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
@@ -770,30 +863,16 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
   }
 
   MergedValues[0] = Ret;
-  return DAG.getMergeValues(MergedValues, 2, DL);
+  return DAG.getMergeValues(MergedValues, DL);
 
 }
 
-SDValue SITargetLowering::ResourceDescriptorToi128(SDValue Op,
-                                             SelectionDAG &DAG) const {
-
-  if (Op.getValueType() == MVT::i128) {
-    return Op;
-  }
-
-  assert(Op.getOpcode() == ISD::UNDEF);
-
-  return DAG.getNode(ISD::BUILD_PAIR, SDLoc(Op), MVT::i128,
-                     DAG.getConstant(0, MVT::i64),
-                     DAG.getConstant(0, MVT::i64));
-}
-
 SDValue SITargetLowering::LowerSampleIntrinsic(unsigned Opcode,
                                                const SDValue &Op,
                                                SelectionDAG &DAG) const {
   return DAG.getNode(Opcode, SDLoc(Op), Op.getValueType(), Op.getOperand(1),
                      Op.getOperand(2),
-                     ResourceDescriptorToi128(Op.getOperand(3), DAG),
+                     Op.getOperand(3),
                      Op.getOperand(4));
 }
 
@@ -833,12 +912,6 @@ SDValue SITargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
   EVT VT = Op.getValueType();
   SDLoc DL(Op);
 
-  // Possible Min/Max pattern
-  SDValue MinMax = LowerMinMax(Op, DAG);
-  if (MinMax.getNode()) {
-    return MinMax;
-  }
-
   SDValue Cond = DAG.getNode(ISD::SETCC, DL, MVT::i1, LHS, RHS, CC);
   return DAG.getNode(ISD::SELECT, DL, VT, Cond, True, False);
 }
@@ -948,8 +1021,12 @@ SDValue SITargetLowering::LowerZERO_EXTEND(SDValue Op,
     return SDValue();
   }
 
-  return DAG.getNode(ISD::BUILD_PAIR, DL, VT, Op.getOperand(0),
-                                              DAG.getConstant(0, MVT::i32));
+  SDValue Src = Op.getOperand(0);
+  if (Src.getValueType() != MVT::i32)
+    Src = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Src);
+
+  SDValue Zero = DAG.getConstant(0, MVT::i32);
+  return DAG.getNode(ISD::BUILD_PAIR, DL, VT, Src, Zero);
 }
 
 //===----------------------------------------------------------------------===//
@@ -963,7 +1040,7 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
   EVT VT = N->getValueType(0);
 
   switch (N->getOpcode()) {
-    default: break;
+    default: return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
     case ISD::SELECT_CC: {
       ConstantSDNode *True, *False;
       // i1 selectcc(l, r, -1, 0, cc) -> i1 setcc(l, r, cc)
@@ -982,7 +1059,7 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
       SDValue Arg0 = N->getOperand(0);
       SDValue Arg1 = N->getOperand(1);
       SDValue CC = N->getOperand(2);
-      ConstantSDNode * C = NULL;
+      ConstantSDNode * C = nullptr;
       ISD::CondCode CCOp = dyn_cast<CondCodeSDNode>(CC)->get();
 
       // i1 setcc (sext(i1), 0, setne) -> i1 setcc(i1, 0, setne)
@@ -998,7 +1075,8 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
       break;
     }
   }
-  return SDValue();
+
+  return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
 }
 
 /// \brief Test if RegClass is one of the VSrc classes
@@ -1029,9 +1107,11 @@ int32_t SITargetLowering::analyzeImmediate(const SDNode *N) const {
         return -1;
     }
     Imm.I = Node->getSExtValue();
-  } else if (const ConstantFPSDNode *Node = dyn_cast<ConstantFPSDNode>(N))
+  } else if (const ConstantFPSDNode *Node = dyn_cast<ConstantFPSDNode>(N)) {
+    if (N->getValueType(0) != MVT::f32)
+      return -1;
     Imm.F = Node->getValueAPF().convertToFloat();
-  else
+  } else
     return -1; // It isn't an immediate
 
   if ((Imm.I >= -16 && Imm.I <= 64) ||
@@ -1051,7 +1131,7 @@ bool SITargetLowering::foldImm(SDValue &Operand, int32_t &Immediate,
   MachineSDNode *Mov = dyn_cast<MachineSDNode>(Operand);
   const SIInstrInfo *TII =
     static_cast<const SIInstrInfo*>(getTargetMachine().getInstrInfo());
-  if (Mov == 0 || !TII->isMov(Mov->getMachineOpcode()))
+  if (!Mov || !TII->isMov(Mov->getMachineOpcode()))
     return false;
 
   const SDValue &Op = Mov->getOperand(0);
@@ -1098,7 +1178,7 @@ const TargetRegisterClass *SITargetLowering::getRegClassForNode(
       }
       return TRI.getPhysRegClass(Reg);
     }
-    default:  return NULL;
+    default:  return nullptr;
     }
   }
   const MCInstrDesc &Desc = TII->get(Op->getMachineOpcode());
@@ -1202,17 +1282,17 @@ SDNode *SITargetLowering::foldOperands(MachineSDNode *Node,
 
   // Commuted opcode if available
   int OpcodeRev = Desc->isCommutable() ? TII->commuteOpcode(Opcode) : -1;
-  const MCInstrDesc *DescRev = OpcodeRev == -1 ? 0 : &TII->get(OpcodeRev);
+  const MCInstrDesc *DescRev = OpcodeRev == -1 ? nullptr : &TII->get(OpcodeRev);
 
   assert(!DescRev || DescRev->getNumDefs() == NumDefs);
   assert(!DescRev || DescRev->getNumOperands() == NumOps);
 
   // e64 version if available, -1 otherwise
   int OpcodeE64 = AMDGPU::getVOPe64(Opcode);
-  const MCInstrDesc *DescE64 = OpcodeE64 == -1 ? 0 : &TII->get(OpcodeE64);
+  const MCInstrDesc *DescE64 = OpcodeE64 == -1 ? nullptr : &TII->get(OpcodeE64);
+  int InputModifiers[3] = {0};
 
   assert(!DescE64 || DescE64->getNumDefs() == NumDefs);
-  assert(!DescE64 || DescE64->getNumOperands() == (NumOps + 4));
 
   int32_t Immediate = Desc->getSize() == 4 ? 0 : -1;
   bool HaveVSrc = false, HaveSSrc = false;
@@ -1279,17 +1359,18 @@ SDNode *SITargetLowering::foldOperands(MachineSDNode *Node,
            fitsRegClass(DAG, Ops[1], OtherRegClass))) {
 
         // Swap commutable operands
-        SDValue Tmp = Ops[1];
-        Ops[1] = Ops[0];
-        Ops[0] = Tmp;
+        std::swap(Ops[0], Ops[1]);
 
         Desc = DescRev;
-        DescRev = 0;
+        DescRev = nullptr;
         continue;
       }
     }
 
-    if (DescE64 && !Immediate) {
+    if (Immediate)
+      continue;
+
+    if (DescE64) {
 
       // Test if it makes sense to switch to e64 encoding
       unsigned OtherRegClass = DescE64->OpInfo[Op].RegClass;
@@ -1305,14 +1386,46 @@ SDNode *SITargetLowering::foldOperands(MachineSDNode *Node,
         Immediate = -1;
         Promote2e64 = true;
         Desc = DescE64;
-        DescE64 = 0;
+        DescE64 = nullptr;
       }
     }
+
+    if (!DescE64 && !Promote2e64)
+      continue;
+    if (!Operand.isMachineOpcode())
+      continue;
+    if (Operand.getMachineOpcode() == AMDGPU::FNEG_SI) {
+      Ops.pop_back();
+      Ops.push_back(Operand.getOperand(0));
+      InputModifiers[i] = 1;
+      Promote2e64 = true;
+      if (!DescE64)
+        continue;
+      Desc = DescE64;
+      DescE64 = 0;
+    }
+    else if (Operand.getMachineOpcode() == AMDGPU::FABS_SI) {
+      Ops.pop_back();
+      Ops.push_back(Operand.getOperand(0));
+      InputModifiers[i] = 2;
+      Promote2e64 = true;
+      if (!DescE64)
+        continue;
+      Desc = DescE64;
+      DescE64 = 0;
+    }
   }
 
   if (Promote2e64) {
+    std::vector<SDValue> OldOps(Ops);
+    Ops.clear();
+    for (unsigned i = 0; i < OldOps.size(); ++i) {
+      // src_modifier
+      Ops.push_back(DAG.getTargetConstant(InputModifiers[i], MVT::i32));
+      Ops.push_back(OldOps[i]);
+    }
     // Add the modifier flags while promoting
-    for (unsigned i = 0; i < 4; ++i)
+    for (unsigned i = 0; i < 2; ++i)
       Ops.push_back(DAG.getTargetConstant(0, MVT::i32));
   }
 
@@ -1390,7 +1503,7 @@ void SITargetLowering::adjustWritemask(MachineSDNode *&Node,
   Ops.push_back(DAG.getTargetConstant(NewDmask, MVT::i32));
   for (unsigned i = 1, e = Node->getNumOperands(); i != e; ++i)
     Ops.push_back(Node->getOperand(i));
-  Node = (MachineSDNode*)DAG.UpdateNodeOperands(Node, Ops.data(), Ops.size());
+  Node = (MachineSDNode*)DAG.UpdateNodeOperands(Node, Ops);
 
   // If we only got one lane, replace it with a copy
   // (if NewDmask has only one bit set...)
diff --git a/lib/Target/R600/SIISelLowering.h b/lib/Target/R600/SIISelLowering.h
index ca73f53..c6eaa81 100644
--- a/lib/Target/R600/SIISelLowering.h
+++ b/lib/Target/R600/SIISelLowering.h
@@ -22,7 +22,7 @@ namespace llvm {
 
 class SITargetLowering : public AMDGPUTargetLowering {
   SDValue LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, SDLoc DL,
-                         SDValue Chain, unsigned Offset) const;
+                         SDValue Chain, unsigned Offset, bool Signed) const;
   SDValue LowerSampleIntrinsic(unsigned Opcode, const SDValue &Op,
                                SelectionDAG &DAG) const;
   SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
@@ -33,7 +33,6 @@ class SITargetLowering : public AMDGPUTargetLowering {
   SDValue LowerZERO_EXTEND(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
 
-  SDValue ResourceDescriptorToi128(SDValue Op, SelectionDAG &DAG) const;
   bool foldImm(SDValue &Operand, int32_t &Immediate,
                bool &ScalarSlotUsed) const;
   const TargetRegisterClass *getRegClassForNode(SelectionDAG &DAG,
@@ -49,32 +48,33 @@ class SITargetLowering : public AMDGPUTargetLowering {
 
 public:
   SITargetLowering(TargetMachine &tm);
-  bool allowsUnalignedMemoryAccesses(EVT VT, unsigned AS, bool *IsFast) const;
-  virtual bool shouldSplitVectorType(EVT VT) const override;
+  bool allowsUnalignedMemoryAccesses(EVT VT, unsigned AS,
+                                     bool *IsFast) const override;
+  bool shouldSplitVectorType(EVT VT) const override;
 
-  virtual bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
-                                                 Type *Ty) const override;
+  bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
+                                        Type *Ty) const override;
 
   SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv,
                                bool isVarArg,
                                const SmallVectorImpl<ISD::InputArg> &Ins,
                                SDLoc DL, SelectionDAG &DAG,
-                               SmallVectorImpl<SDValue> &InVals) const;
+                               SmallVectorImpl<SDValue> &InVals) const override;
 
-  virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr * MI,
-                                              MachineBasicBlock * BB) const;
-  virtual EVT getSetCCResultType(LLVMContext &Context, EVT VT) const;
-  virtual MVT getScalarShiftAmountTy(EVT VT) const;
-  virtual bool isFMAFasterThanFMulAndFAdd(EVT VT) const;
-  virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const;
-  virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const;
-  virtual SDNode *PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const;
-  virtual void AdjustInstrPostInstrSelection(MachineInstr *MI,
-                                             SDNode *Node) const;
+  MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr * MI,
+                                      MachineBasicBlock * BB) const override;
+  EVT getSetCCResultType(LLVMContext &Context, EVT VT) const override;
+  MVT getScalarShiftAmountTy(EVT VT) const override;
+  bool isFMAFasterThanFMulAndFAdd(EVT VT) const override;
+  SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
+  SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
+  SDNode *PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override;
+  void AdjustInstrPostInstrSelection(MachineInstr *MI,
+                                     SDNode *Node) const override;
 
   int32_t analyzeImmediate(const SDNode *N) const;
   SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC,
-                               unsigned Reg, EVT VT) const;
+                               unsigned Reg, EVT VT) const override;
 };
 
 } // End namespace llvm
diff --git a/lib/Target/R600/SIInsertWaits.cpp b/lib/Target/R600/SIInsertWaits.cpp
index 695ec40..a17fed7 100644
--- a/lib/Target/R600/SIInsertWaits.cpp
+++ b/lib/Target/R600/SIInsertWaits.cpp
@@ -97,13 +97,13 @@ private:
 public:
   SIInsertWaits(TargetMachine &tm) :
     MachineFunctionPass(ID),
-    TII(0),
-    TRI(0),
+    TII(nullptr),
+    TRI(nullptr),
     ExpInstrTypesSeen(0) { }
 
-  virtual bool runOnMachineFunction(MachineFunction &MF);
+  bool runOnMachineFunction(MachineFunction &MF) override;
 
-  const char *getPassName() const {
+  const char *getPassName() const override {
     return "SI insert wait  instructions";
   }
 
diff --git a/lib/Target/R600/SIInstrFormats.td b/lib/Target/R600/SIInstrFormats.td
index aa2c22c..168eff2 100644
--- a/lib/Target/R600/SIInstrFormats.td
+++ b/lib/Target/R600/SIInstrFormats.td
@@ -12,7 +12,7 @@
 //===----------------------------------------------------------------------===//
 
 class InstSI <dag outs, dag ins, string asm, list<dag> pattern> :
-    AMDGPUInst<outs, ins, asm, pattern> {
+    AMDGPUInst<outs, ins, asm, pattern>, PredicateControl {
 
   field bits<1> VM_CNT = 0;
   field bits<1> EXP_CNT = 0;
@@ -210,16 +210,19 @@ class VOP3 <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern> :
     Enc64 <outs, ins, asm, pattern> {
 
   bits<8> dst;
+  bits<2> src0_modifiers;
   bits<9> src0;
+  bits<2> src1_modifiers;
   bits<9> src1;
+  bits<2> src2_modifiers;
   bits<9> src2;
-  bits<3> abs;
   bits<1> clamp;
   bits<2> omod;
-  bits<3> neg;
 
   let Inst{7-0} = dst;
-  let Inst{10-8} = abs;
+  let Inst{8} = src0_modifiers{1};
+  let Inst{9} = src1_modifiers{1};
+  let Inst{10} = src2_modifiers{1};
   let Inst{11} = clamp;
   let Inst{25-17} = op;
   let Inst{31-26} = 0x34; //encoding
@@ -227,7 +230,9 @@ class VOP3 <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern> :
   let Inst{49-41} = src1;
   let Inst{58-50} = src2;
   let Inst{60-59} = omod;
-  let Inst{63-61} = neg;
+  let Inst{61} = src0_modifiers{0};
+  let Inst{62} = src1_modifiers{0};
+  let Inst{63} = src2_modifiers{0};
   
   let mayLoad = 0;
   let mayStore = 0;
@@ -240,12 +245,14 @@ class VOP3b <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern> :
     Enc64 <outs, ins, asm, pattern> {
 
   bits<8> dst;
+  bits<2> src0_modifiers;
   bits<9> src0;
+  bits<2> src1_modifiers;
   bits<9> src1;
+  bits<2> src2_modifiers;
   bits<9> src2;
   bits<7> sdst;
   bits<2> omod;
-  bits<3> neg;
 
   let Inst{7-0} = dst;
   let Inst{14-8} = sdst;
@@ -255,7 +262,9 @@ class VOP3b <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern> :
   let Inst{49-41} = src1;
   let Inst{58-50} = src2;
   let Inst{60-59} = omod;
-  let Inst{63-61} = neg;
+  let Inst{61} = src0_modifiers{0};
+  let Inst{62} = src1_modifiers{0};
+  let Inst{63} = src2_modifiers{0};
 
   let mayLoad = 0;
   let mayStore = 0;
diff --git a/lib/Target/R600/SIInstrInfo.cpp b/lib/Target/R600/SIInstrInfo.cpp
index ab2fe09..4a9e346 100644
--- a/lib/Target/R600/SIInstrInfo.cpp
+++ b/lib/Target/R600/SIInstrInfo.cpp
@@ -187,27 +187,45 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
                                       int FrameIndex,
                                       const TargetRegisterClass *RC,
                                       const TargetRegisterInfo *TRI) const {
-  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
   SIMachineFunctionInfo *MFI = MBB.getParent()->getInfo<SIMachineFunctionInfo>();
   DebugLoc DL = MBB.findDebugLoc(MI);
   unsigned KillFlag = isKill ? RegState::Kill : 0;
+  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
 
   if (TRI->getCommonSubClass(RC, &AMDGPU::SGPR_32RegClass)) {
-    unsigned Lane = MFI->SpillTracker.getNextLane(MRI);
-    BuildMI(MBB, MI, DL, get(AMDGPU::V_WRITELANE_B32),
-            MFI->SpillTracker.LaneVGPR)
+    unsigned Lane = MFI->SpillTracker.reserveLanes(MRI, MBB.getParent());
+
+    BuildMI(MBB, MI, DL, get(AMDGPU::V_WRITELANE_B32), MFI->SpillTracker.LaneVGPR)
             .addReg(SrcReg, KillFlag)
             .addImm(Lane);
+    MFI->SpillTracker.addSpilledReg(FrameIndex, MFI->SpillTracker.LaneVGPR, Lane);
+  } else if (RI.isSGPRClass(RC)) {
+    // We are only allowed to create one new instruction when spilling
+    // registers, so we need to use pseudo instruction for vector
+    // registers.
+    //
+    // Reserve a spot in the spill tracker for each sub-register of
+    // the vector register.
+    unsigned NumSubRegs = RC->getSize() / 4;
+    unsigned FirstLane = MFI->SpillTracker.reserveLanes(MRI, MBB.getParent(),
+                                                        NumSubRegs);
     MFI->SpillTracker.addSpilledReg(FrameIndex, MFI->SpillTracker.LaneVGPR,
-                                    Lane);
-  } else {
-    for (unsigned i = 0, e = RC->getSize() / 4; i != e; ++i) {
-      unsigned SubReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
-      BuildMI(MBB, MI, MBB.findDebugLoc(MI), get(AMDGPU::COPY), SubReg)
-              .addReg(SrcReg, 0, RI.getSubRegFromChannel(i));
-      storeRegToStackSlot(MBB, MI, SubReg, isKill, FrameIndex + i,
-                          &AMDGPU::SReg_32RegClass, TRI);
+                                    FirstLane);
+
+    unsigned Opcode;
+    switch (RC->getSize() * 8) {
+    case 64:  Opcode = AMDGPU::SI_SPILL_S64_SAVE;  break;
+    case 128: Opcode = AMDGPU::SI_SPILL_S128_SAVE; break;
+    case 256: Opcode = AMDGPU::SI_SPILL_S256_SAVE; break;
+    case 512: Opcode = AMDGPU::SI_SPILL_S512_SAVE; break;
+    default: llvm_unreachable("Cannot spill register class");
     }
+
+    BuildMI(MBB, MI, DL, get(Opcode), MFI->SpillTracker.LaneVGPR)
+            .addReg(SrcReg)
+            .addImm(FrameIndex);
+  } else {
+    llvm_unreachable("VGPR spilling not supported");
   }
 }
 
@@ -216,30 +234,125 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
                                        unsigned DestReg, int FrameIndex,
                                        const TargetRegisterClass *RC,
                                        const TargetRegisterInfo *TRI) const {
-  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
   SIMachineFunctionInfo *MFI = MBB.getParent()->getInfo<SIMachineFunctionInfo>();
   DebugLoc DL = MBB.findDebugLoc(MI);
   if (TRI->getCommonSubClass(RC, &AMDGPU::SReg_32RegClass)) {
-     SIMachineFunctionInfo::SpilledReg Spill =
+    SIMachineFunctionInfo::SpilledReg Spill =
         MFI->SpillTracker.getSpilledReg(FrameIndex);
     assert(Spill.VGPR);
     BuildMI(MBB, MI, DL, get(AMDGPU::V_READLANE_B32), DestReg)
             .addReg(Spill.VGPR)
             .addImm(Spill.Lane);
+    insertNOPs(MI, 3);
+  } else if (RI.isSGPRClass(RC)){
+    unsigned Opcode;
+    switch(RC->getSize() * 8) {
+    case 64:  Opcode = AMDGPU::SI_SPILL_S64_RESTORE;  break;
+    case 128: Opcode = AMDGPU::SI_SPILL_S128_RESTORE; break;
+    case 256: Opcode = AMDGPU::SI_SPILL_S256_RESTORE; break;
+    case 512: Opcode = AMDGPU::SI_SPILL_S512_RESTORE; break;
+    default: llvm_unreachable("Cannot spill register class");
+    }
+
+    SIMachineFunctionInfo::SpilledReg Spill =
+        MFI->SpillTracker.getSpilledReg(FrameIndex);
+
+    BuildMI(MBB, MI, DL, get(Opcode), DestReg)
+            .addReg(Spill.VGPR)
+            .addImm(FrameIndex);
+    insertNOPs(MI, 3);
   } else {
-    for (unsigned i = 0, e = RC->getSize() / 4; i != e; ++i) {
-      unsigned Flags = RegState::Define;
-      if (i == 0) {
-        Flags |= RegState::Undef;
-      }
-      unsigned SubReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
-      loadRegFromStackSlot(MBB, MI, SubReg, FrameIndex + i,
-                           &AMDGPU::SReg_32RegClass, TRI);
-      BuildMI(MBB, MI, DL, get(AMDGPU::COPY))
-              .addReg(DestReg, Flags, RI.getSubRegFromChannel(i))
-              .addReg(SubReg);
+    llvm_unreachable("VGPR spilling not supported");
+  }
+}
+
+static unsigned getNumSubRegsForSpillOp(unsigned Op) {
+
+  switch (Op) {
+  case AMDGPU::SI_SPILL_S512_SAVE:
+  case AMDGPU::SI_SPILL_S512_RESTORE:
+    return 16;
+  case AMDGPU::SI_SPILL_S256_SAVE:
+  case AMDGPU::SI_SPILL_S256_RESTORE:
+    return 8;
+  case AMDGPU::SI_SPILL_S128_SAVE:
+  case AMDGPU::SI_SPILL_S128_RESTORE:
+    return 4;
+  case AMDGPU::SI_SPILL_S64_SAVE:
+  case AMDGPU::SI_SPILL_S64_RESTORE:
+    return 2;
+  default: llvm_unreachable("Invalid spill opcode");
+  }
+}
+
+void SIInstrInfo::insertNOPs(MachineBasicBlock::iterator MI,
+                             int Count) const {
+  while (Count > 0) {
+    int Arg;
+    if (Count >= 8)
+      Arg = 7;
+    else
+      Arg = Count - 1;
+    Count -= 8;
+    BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), get(AMDGPU::S_NOP))
+            .addImm(Arg);
+  }
+}
+
+bool SIInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
+  SIMachineFunctionInfo *MFI =
+      MI->getParent()->getParent()->getInfo<SIMachineFunctionInfo>();
+  MachineBasicBlock &MBB = *MI->getParent();
+  DebugLoc DL = MBB.findDebugLoc(MI);
+  switch (MI->getOpcode()) {
+  default: return AMDGPUInstrInfo::expandPostRAPseudo(MI);
+
+  // SGPR register spill
+  case AMDGPU::SI_SPILL_S512_SAVE:
+  case AMDGPU::SI_SPILL_S256_SAVE:
+  case AMDGPU::SI_SPILL_S128_SAVE:
+  case AMDGPU::SI_SPILL_S64_SAVE: {
+    unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());
+    unsigned FrameIndex = MI->getOperand(2).getImm();
+
+    for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
+      SIMachineFunctionInfo::SpilledReg Spill;
+      unsigned SubReg = RI.getPhysRegSubReg(MI->getOperand(1).getReg(),
+                                            &AMDGPU::SGPR_32RegClass, i);
+      Spill = MFI->SpillTracker.getSpilledReg(FrameIndex);
+
+      BuildMI(MBB, MI, DL, get(AMDGPU::V_WRITELANE_B32),
+              MI->getOperand(0).getReg())
+              .addReg(SubReg)
+              .addImm(Spill.Lane + i);
+    }
+    MI->eraseFromParent();
+    break;
+  }
+
+  // SGPR register restore
+  case AMDGPU::SI_SPILL_S512_RESTORE:
+  case AMDGPU::SI_SPILL_S256_RESTORE:
+  case AMDGPU::SI_SPILL_S128_RESTORE:
+  case AMDGPU::SI_SPILL_S64_RESTORE: {
+    unsigned NumSubRegs = getNumSubRegsForSpillOp(MI->getOpcode());
+
+    for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
+      SIMachineFunctionInfo::SpilledReg Spill;
+      unsigned FrameIndex = MI->getOperand(2).getImm();
+      unsigned SubReg = RI.getPhysRegSubReg(MI->getOperand(0).getReg(),
+                                   &AMDGPU::SGPR_32RegClass, i);
+      Spill = MFI->SpillTracker.getSpilledReg(FrameIndex);
+
+      BuildMI(MBB, MI, DL, get(AMDGPU::V_READLANE_B32), SubReg)
+              .addReg(MI->getOperand(1).getReg())
+              .addImm(Spill.Lane + i);
     }
+    MI->eraseFromParent();
+    break;
   }
+  }
+  return true;
 }
 
 MachineInstr *SIInstrInfo::commuteInstruction(MachineInstr *MI,
@@ -247,18 +360,18 @@ MachineInstr *SIInstrInfo::commuteInstruction(MachineInstr *MI,
 
   MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
   if (MI->getNumOperands() < 3 || !MI->getOperand(1).isReg())
-    return 0;
+    return nullptr;
 
   // Cannot commute VOP2 if src0 is SGPR.
   if (isVOP2(MI->getOpcode()) && MI->getOperand(1).isReg() &&
       RI.isSGPRClass(MRI.getRegClass(MI->getOperand(1).getReg())))
-   return 0;
+   return nullptr;
 
   if (!MI->getOperand(2).isReg()) {
     // XXX: Commute instructions with FPImm operands
     if (NewMI || MI->getOperand(2).isFPImm() ||
        (!isVOP2(MI->getOpcode()) && !isVOP3(MI->getOpcode()))) {
-      return 0;
+      return nullptr;
     }
 
     // XXX: Commute VOP3 instructions with abs and neg set.
@@ -267,7 +380,7 @@ MachineInstr *SIInstrInfo::commuteInstruction(MachineInstr *MI,
                         AMDGPU::OpName::abs)).getImm() ||
          MI->getOperand(AMDGPU::getNamedOperandIdx(MI->getOpcode(),
                         AMDGPU::OpName::neg)).getImm()))
-      return 0;
+      return nullptr;
 
     unsigned Reg = MI->getOperand(1).getReg();
     unsigned SubReg = MI->getOperand(1).getSubReg();
@@ -516,6 +629,7 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) {
   case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
   case AMDGPU::COPY: return AMDGPU::COPY;
   case AMDGPU::PHI: return AMDGPU::PHI;
+  case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
   case AMDGPU::S_MOV_B32:
     return MI.getOperand(1).isReg() ?
            AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
@@ -536,6 +650,23 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) {
   case AMDGPU::S_LSHL_B64: return AMDGPU::V_LSHL_B64;
   case AMDGPU::S_LSHR_B32: return AMDGPU::V_LSHR_B32_e32;
   case AMDGPU::S_LSHR_B64: return AMDGPU::V_LSHR_B64;
+  case AMDGPU::S_SEXT_I32_I8: return AMDGPU::V_BFE_I32;
+  case AMDGPU::S_SEXT_I32_I16: return AMDGPU::V_BFE_I32;
+  case AMDGPU::S_BFE_U32: return AMDGPU::V_BFE_U32;
+  case AMDGPU::S_BFE_I32: return AMDGPU::V_BFE_I32;
+  case AMDGPU::S_NOT_B32: return AMDGPU::V_NOT_B32_e32;
+  case AMDGPU::S_CMP_EQ_I32: return AMDGPU::V_CMP_EQ_I32_e32;
+  case AMDGPU::S_CMP_LG_I32: return AMDGPU::V_CMP_NE_I32_e32;
+  case AMDGPU::S_CMP_GT_I32: return AMDGPU::V_CMP_GT_I32_e32;
+  case AMDGPU::S_CMP_GE_I32: return AMDGPU::V_CMP_GE_I32_e32;
+  case AMDGPU::S_CMP_LT_I32: return AMDGPU::V_CMP_LT_I32_e32;
+  case AMDGPU::S_CMP_LE_I32: return AMDGPU::V_CMP_LE_I32_e32;
+  case AMDGPU::S_LOAD_DWORD_IMM:
+  case AMDGPU::S_LOAD_DWORD_SGPR: return AMDGPU::BUFFER_LOAD_DWORD_ADDR64;
+  case AMDGPU::S_LOAD_DWORDX2_IMM:
+  case AMDGPU::S_LOAD_DWORDX2_SGPR: return AMDGPU::BUFFER_LOAD_DWORDX2_ADDR64;
+  case AMDGPU::S_LOAD_DWORDX4_IMM:
+  case AMDGPU::S_LOAD_DWORDX4_SGPR: return AMDGPU::BUFFER_LOAD_DWORDX4_ADDR64;
   }
 }
 
@@ -559,6 +690,8 @@ bool SIInstrInfo::canReadVGPR(const MachineInstr &MI, unsigned OpNo) const {
   switch (MI.getOpcode()) {
   case AMDGPU::COPY:
   case AMDGPU::REG_SEQUENCE:
+  case AMDGPU::PHI:
+  case AMDGPU::INSERT_SUBREG:
     return RI.hasVGPRs(getOpRegClass(MI, 0));
   default:
     return RI.hasVGPRs(getOpRegClass(MI, OpNo));
@@ -737,11 +870,12 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
     }
   }
 
-  // Legalize REG_SEQUENCE
+  // Legalize REG_SEQUENCE and PHI
   // The register class of the operands much be the same type as the register
   // class of the output.
-  if (MI->getOpcode() == AMDGPU::REG_SEQUENCE) {
-    const TargetRegisterClass *RC = NULL, *SRC = NULL, *VRC = NULL;
+  if (MI->getOpcode() == AMDGPU::REG_SEQUENCE ||
+      MI->getOpcode() == AMDGPU::PHI) {
+    const TargetRegisterClass *RC = nullptr, *SRC = nullptr, *VRC = nullptr;
     for (unsigned i = 1, e = MI->getNumOperands(); i != e; i+=2) {
       if (!MI->getOperand(i).isReg() ||
           !TargetRegisterInfo::isVirtualRegister(MI->getOperand(i).getReg()))
@@ -774,13 +908,40 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
           !TargetRegisterInfo::isVirtualRegister(MI->getOperand(i).getReg()))
         continue;
       unsigned DstReg = MRI.createVirtualRegister(RC);
-      BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+      MachineBasicBlock *InsertBB;
+      MachineBasicBlock::iterator Insert;
+      if (MI->getOpcode() == AMDGPU::REG_SEQUENCE) {
+        InsertBB = MI->getParent();
+        Insert = MI;
+      } else {
+        // MI is a PHI instruction.
+        InsertBB = MI->getOperand(i + 1).getMBB();
+        Insert = InsertBB->getFirstTerminator();
+      }
+      BuildMI(*InsertBB, Insert, MI->getDebugLoc(),
               get(AMDGPU::COPY), DstReg)
               .addOperand(MI->getOperand(i));
       MI->getOperand(i).setReg(DstReg);
     }
   }
 
+  // Legalize INSERT_SUBREG
+  // src0 must have the same register class as dst
+  if (MI->getOpcode() == AMDGPU::INSERT_SUBREG) {
+    unsigned Dst = MI->getOperand(0).getReg();
+    unsigned Src0 = MI->getOperand(1).getReg();
+    const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
+    const TargetRegisterClass *Src0RC = MRI.getRegClass(Src0);
+    if (DstRC != Src0RC) {
+      MachineBasicBlock &MBB = *MI->getParent();
+      unsigned NewSrc0 = MRI.createVirtualRegister(DstRC);
+      BuildMI(MBB, MI, MI->getDebugLoc(), get(AMDGPU::COPY), NewSrc0)
+              .addReg(Src0);
+      MI->getOperand(1).setReg(NewSrc0);
+    }
+    return;
+  }
+
   // Legalize MUBUF* instructions
   // FIXME: If we start using the non-addr64 instructions for compute, we
   // may need to legalize them here.
@@ -886,6 +1047,72 @@ void SIInstrInfo::legalizeOperands(MachineInstr *MI) const {
   }
 }
 
+void SIInstrInfo::moveSMRDToVALU(MachineInstr *MI, MachineRegisterInfo &MRI) const {
+  MachineBasicBlock *MBB = MI->getParent();
+  switch (MI->getOpcode()) {
+    case AMDGPU::S_LOAD_DWORD_IMM:
+    case AMDGPU::S_LOAD_DWORD_SGPR:
+    case AMDGPU::S_LOAD_DWORDX2_IMM:
+    case AMDGPU::S_LOAD_DWORDX2_SGPR:
+    case AMDGPU::S_LOAD_DWORDX4_IMM:
+    case AMDGPU::S_LOAD_DWORDX4_SGPR:
+      unsigned NewOpcode = getVALUOp(*MI);
+      unsigned RegOffset;
+      unsigned ImmOffset;
+
+      if (MI->getOperand(2).isReg()) {
+        RegOffset = MI->getOperand(2).getReg();
+        ImmOffset = 0;
+      } else {
+        assert(MI->getOperand(2).isImm());
+        // SMRD instructions take a dword offsets and MUBUF instructions
+        // take a byte offset.
+        ImmOffset = MI->getOperand(2).getImm() << 2;
+        RegOffset = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+        if (isUInt<12>(ImmOffset)) {
+          BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32),
+                  RegOffset)
+                  .addImm(0);
+        } else {
+          BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32),
+                  RegOffset)
+                  .addImm(ImmOffset);
+          ImmOffset = 0;
+        }
+      }
+
+      unsigned SRsrc = MRI.createVirtualRegister(&AMDGPU::SReg_128RegClass);
+      unsigned DWord0 = RegOffset;
+      unsigned DWord1 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+      unsigned DWord2 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+      unsigned DWord3 = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
+
+      BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord1)
+              .addImm(0);
+      BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord2)
+              .addImm(AMDGPU::RSRC_DATA_FORMAT & 0xFFFFFFFF);
+      BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::S_MOV_B32), DWord3)
+              .addImm(AMDGPU::RSRC_DATA_FORMAT >> 32);
+      BuildMI(*MBB, MI, MI->getDebugLoc(), get(AMDGPU::REG_SEQUENCE), SRsrc)
+              .addReg(DWord0)
+              .addImm(AMDGPU::sub0)
+              .addReg(DWord1)
+              .addImm(AMDGPU::sub1)
+              .addReg(DWord2)
+              .addImm(AMDGPU::sub2)
+              .addReg(DWord3)
+              .addImm(AMDGPU::sub3);
+     MI->setDesc(get(NewOpcode));
+     if (MI->getOperand(2).isReg()) {
+       MI->getOperand(2).setReg(MI->getOperand(1).getReg());
+     } else {
+       MI->getOperand(2).ChangeToRegister(MI->getOperand(1).getReg(), false);
+     }
+     MI->getOperand(1).setReg(SRsrc);
+     MI->addOperand(*MBB->getParent(), MachineOperand::CreateImm(ImmOffset));
+  }
+}
+
 void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
   SmallVector<MachineInstr *, 128> Worklist;
   Worklist.push_back(&TopInst);
@@ -895,8 +1122,16 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
     MachineBasicBlock *MBB = Inst->getParent();
     MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
 
+    unsigned Opcode = Inst->getOpcode();
+    unsigned NewOpcode = getVALUOp(*Inst);
+
     // Handle some special cases
-    switch(Inst->getOpcode()) {
+    switch (Opcode) {
+    default:
+      if (isSMRD(Inst->getOpcode())) {
+        moveSMRDToVALU(Inst, MRI);
+      }
+      break;
     case AMDGPU::S_MOV_B64: {
       DebugLoc DL = Inst->getDebugLoc();
 
@@ -947,7 +1182,6 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
       llvm_unreachable("Moving this op to VALU not implemented");
     }
 
-    unsigned NewOpcode = getVALUOp(*Inst);
     if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
       // We cannot move this instruction to the VALU, so we should try to
       // legalize its operands instead.
@@ -968,27 +1202,52 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
         Inst->RemoveOperand(i);
     }
 
-    // Add the implict and explicit register definitions.
-    if (NewDesc.ImplicitUses) {
-      for (unsigned i = 0; NewDesc.ImplicitUses[i]; ++i) {
-        unsigned Reg = NewDesc.ImplicitUses[i];
-        Inst->addOperand(MachineOperand::CreateReg(Reg, false, true));
-      }
+    if (Opcode == AMDGPU::S_SEXT_I32_I8 || Opcode == AMDGPU::S_SEXT_I32_I16) {
+      // We are converting these to a BFE, so we need to add the missing
+      // operands for the size and offset.
+      unsigned Size = (Opcode == AMDGPU::S_SEXT_I32_I8) ? 8 : 16;
+      Inst->addOperand(Inst->getOperand(1));
+      Inst->getOperand(1).ChangeToImmediate(0);
+      Inst->addOperand(MachineOperand::CreateImm(0));
+      Inst->addOperand(MachineOperand::CreateImm(0));
+      Inst->addOperand(MachineOperand::CreateImm(0));
+      Inst->addOperand(MachineOperand::CreateImm(Size));
+
+      // XXX - Other pointless operands. There are 4, but it seems you only need
+      // 3 to not hit an assertion later in MCInstLower.
+      Inst->addOperand(MachineOperand::CreateImm(0));
+      Inst->addOperand(MachineOperand::CreateImm(0));
     }
 
-    if (NewDesc.ImplicitDefs) {
-      for (unsigned i = 0; NewDesc.ImplicitDefs[i]; ++i) {
-        unsigned Reg = NewDesc.ImplicitDefs[i];
-        Inst->addOperand(MachineOperand::CreateReg(Reg, true, true));
-      }
+    addDescImplicitUseDef(NewDesc, Inst);
+
+    if (Opcode == AMDGPU::S_BFE_I32 || Opcode == AMDGPU::S_BFE_U32) {
+      const MachineOperand &OffsetWidthOp = Inst->getOperand(2);
+      // If we need to move this to VGPRs, we need to unpack the second operand
+      // back into the 2 separate ones for bit offset and width.
+      assert(OffsetWidthOp.isImm() &&
+             "Scalar BFE is only implemented for constant width and offset");
+      uint32_t Imm = OffsetWidthOp.getImm();
+
+      uint32_t Offset = Imm & 0x3f; // Extract bits [5:0].
+      uint32_t BitWidth = (Imm & 0x7f0000) >> 16; // Extract bits [22:16].
+
+      Inst->RemoveOperand(2); // Remove old immediate.
+      Inst->addOperand(Inst->getOperand(1));
+      Inst->getOperand(1).ChangeToImmediate(0);
+      Inst->addOperand(MachineOperand::CreateImm(0));
+      Inst->addOperand(MachineOperand::CreateImm(Offset));
+      Inst->addOperand(MachineOperand::CreateImm(0));
+      Inst->addOperand(MachineOperand::CreateImm(BitWidth));
+      Inst->addOperand(MachineOperand::CreateImm(0));
+      Inst->addOperand(MachineOperand::CreateImm(0));
     }
 
-    legalizeOperands(Inst);
-
     // Update the destination register class.
+
     const TargetRegisterClass *NewDstRC = getOpRegClass(*Inst, 0);
 
-    switch (Inst->getOpcode()) {
+    switch (Opcode) {
       // For target instructions, getOpRegClass just returns the virtual
       // register class associated with the operand, so we need to find an
       // equivalent VGPR register class in order to move the instruction to the
@@ -996,6 +1255,7 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
     case AMDGPU::COPY:
     case AMDGPU::PHI:
     case AMDGPU::REG_SEQUENCE:
+    case AMDGPU::INSERT_SUBREG:
       if (RI.hasVGPRs(NewDstRC))
         continue;
       NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
@@ -1010,6 +1270,9 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
     unsigned NewDstReg = MRI.createVirtualRegister(NewDstRC);
     MRI.replaceRegWith(DstReg, NewDstReg);
 
+    // Legalize the operands
+    legalizeOperands(Inst);
+
     for (MachineRegisterInfo::use_iterator I = MRI.use_begin(NewDstReg),
            E = MRI.use_end(); I != E; ++I) {
       MachineInstr &UseMI = *I->getParent();
@@ -1097,6 +1360,24 @@ void SIInstrInfo::splitScalar64BitOp(SmallVectorImpl<MachineInstr *> &Worklist,
   Worklist.push_back(HiHalf);
 }
 
+void SIInstrInfo::addDescImplicitUseDef(const MCInstrDesc &NewDesc,
+                                        MachineInstr *Inst) const {
+  // Add the implict and explicit register definitions.
+  if (NewDesc.ImplicitUses) {
+    for (unsigned i = 0; NewDesc.ImplicitUses[i]; ++i) {
+      unsigned Reg = NewDesc.ImplicitUses[i];
+      Inst->addOperand(MachineOperand::CreateReg(Reg, false, true));
+    }
+  }
+
+  if (NewDesc.ImplicitDefs) {
+    for (unsigned i = 0; NewDesc.ImplicitDefs[i]; ++i) {
+      unsigned Reg = NewDesc.ImplicitDefs[i];
+      Inst->addOperand(MachineOperand::CreateReg(Reg, true, true));
+    }
+  }
+}
+
 MachineInstrBuilder SIInstrInfo::buildIndirectWrite(
                                    MachineBasicBlock *MBB,
                                    MachineBasicBlock::iterator I,
diff --git a/lib/Target/R600/SIInstrInfo.h b/lib/Target/R600/SIInstrInfo.h
index c537038..7b31a81 100644
--- a/lib/Target/R600/SIInstrInfo.h
+++ b/lib/Target/R600/SIInstrInfo.h
@@ -47,49 +47,52 @@ private:
   void splitScalar64BitOp(SmallVectorImpl<MachineInstr *> & Worklist,
                           MachineInstr *Inst, unsigned Opcode) const;
 
+  void addDescImplicitUseDef(const MCInstrDesc &Desc, MachineInstr *MI) const;
 
 public:
   explicit SIInstrInfo(AMDGPUTargetMachine &tm);
 
-  const SIRegisterInfo &getRegisterInfo() const {
+  const SIRegisterInfo &getRegisterInfo() const override {
     return RI;
   }
 
-  virtual void copyPhysReg(MachineBasicBlock &MBB,
-                           MachineBasicBlock::iterator MI, DebugLoc DL,
-                           unsigned DestReg, unsigned SrcReg,
-                           bool KillSrc) const;
+  void copyPhysReg(MachineBasicBlock &MBB,
+                   MachineBasicBlock::iterator MI, DebugLoc DL,
+                   unsigned DestReg, unsigned SrcReg,
+                   bool KillSrc) const override;
 
   void storeRegToStackSlot(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator MI,
                            unsigned SrcReg, bool isKill, int FrameIndex,
                            const TargetRegisterClass *RC,
-                           const TargetRegisterInfo *TRI) const;
+                           const TargetRegisterInfo *TRI) const override;
 
   void loadRegFromStackSlot(MachineBasicBlock &MBB,
                             MachineBasicBlock::iterator MI,
                             unsigned DestReg, int FrameIndex,
                             const TargetRegisterClass *RC,
-                            const TargetRegisterInfo *TRI) const;
+                            const TargetRegisterInfo *TRI) const override;
+
+  virtual bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const;
 
   unsigned commuteOpcode(unsigned Opcode) const;
 
-  virtual MachineInstr *commuteInstruction(MachineInstr *MI,
-                                           bool NewMI=false) const;
+  MachineInstr *commuteInstruction(MachineInstr *MI,
+                                   bool NewMI=false) const override;
 
   bool isTriviallyReMaterializable(const MachineInstr *MI,
-                                   AliasAnalysis *AA = 0) const;
+                                   AliasAnalysis *AA = nullptr) const;
 
-  virtual unsigned getIEQOpcode() const {
+  unsigned getIEQOpcode() const override {
     llvm_unreachable("Unimplemented");
   }
 
   MachineInstr *buildMovInstr(MachineBasicBlock *MBB,
                               MachineBasicBlock::iterator I,
-                              unsigned DstReg, unsigned SrcReg) const;
-  virtual bool isMov(unsigned Opcode) const;
+                              unsigned DstReg, unsigned SrcReg) const override;
+  bool isMov(unsigned Opcode) const override;
 
-  virtual bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const;
+  bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const override;
   bool isDS(uint16_t Opcode) const;
   int isMIMG(uint16_t Opcode) const;
   int isSMRD(uint16_t Opcode) const;
@@ -101,8 +104,8 @@ public:
   bool isInlineConstant(const MachineOperand &MO) const;
   bool isLiteralConstant(const MachineOperand &MO) const;
 
-  virtual bool verifyInstruction(const MachineInstr *MI,
-                                 StringRef &ErrInfo) const;
+  bool verifyInstruction(const MachineInstr *MI,
+                         StringRef &ErrInfo) const override;
 
   bool isSALUInstr(const MachineInstr &MI) const;
   static unsigned getVALUOp(const MachineInstr &MI);
@@ -136,32 +139,36 @@ public:
   /// create new instruction and insert them before \p MI.
   void legalizeOperands(MachineInstr *MI) const;
 
+  void moveSMRDToVALU(MachineInstr *MI, MachineRegisterInfo &MRI) const;
+
   /// \brief Replace this instruction's opcode with the equivalent VALU
   /// opcode.  This function will also move the users of \p MI to the
   /// VALU if necessary.
   void moveToVALU(MachineInstr &MI) const;
 
-  virtual unsigned calculateIndirectAddress(unsigned RegIndex,
-                                            unsigned Channel) const;
+  unsigned calculateIndirectAddress(unsigned RegIndex,
+                                    unsigned Channel) const override;
 
-  virtual const TargetRegisterClass *getIndirectAddrRegClass() const;
+  const TargetRegisterClass *getIndirectAddrRegClass() const override;
 
-  virtual MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB,
-                                                 MachineBasicBlock::iterator I,
-                                                 unsigned ValueReg,
-                                                 unsigned Address,
-                                                 unsigned OffsetReg) const;
+  MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB,
+                                         MachineBasicBlock::iterator I,
+                                         unsigned ValueReg,
+                                         unsigned Address,
+                                         unsigned OffsetReg) const override;
 
-  virtual MachineInstrBuilder buildIndirectRead(MachineBasicBlock *MBB,
-                                                MachineBasicBlock::iterator I,
-                                                unsigned ValueReg,
-                                                unsigned Address,
-                                                unsigned OffsetReg) const;
+  MachineInstrBuilder buildIndirectRead(MachineBasicBlock *MBB,
+                                        MachineBasicBlock::iterator I,
+                                        unsigned ValueReg,
+                                        unsigned Address,
+                                        unsigned OffsetReg) const override;
   void reserveIndirectRegisters(BitVector &Reserved,
                                 const MachineFunction &MF) const;
 
   void LoadM0(MachineInstr *MoveRel, MachineBasicBlock::iterator I,
               unsigned SavReg, unsigned IndexReg) const;
+
+  void insertNOPs(MachineBasicBlock::iterator MI, int Count) const;
 };
 
 namespace AMDGPU {
@@ -169,6 +176,7 @@ namespace AMDGPU {
   int getVOPe64(uint16_t Opcode);
   int getCommuteRev(uint16_t Opcode);
   int getCommuteOrig(uint16_t Opcode);
+  int getMCOpcode(uint16_t Opcode, unsigned Gen);
 
   const uint64_t RSRC_DATA_FORMAT = 0xf00000000000LL;
 
diff --git a/lib/Target/R600/SIInstrInfo.td b/lib/Target/R600/SIInstrInfo.td
index e05ab65..2242e6d 100644
--- a/lib/Target/R600/SIInstrInfo.td
+++ b/lib/Target/R600/SIInstrInfo.td
@@ -7,23 +7,25 @@
 //
 //===----------------------------------------------------------------------===//
 
+// Execpt for the NONE field, this must be kept in sync with the SISubtarget enum
+// in AMDGPUMCInstLower.h
+def SISubtarget {
+  int NONE = -1;
+  int SI = 0;
+}
+
 //===----------------------------------------------------------------------===//
 // SI DAG Nodes
 //===----------------------------------------------------------------------===//
 
-// SMRD takes a 64bit memory address and can only add an 32bit offset
-def SIadd64bit32bit : SDNode<"ISD::ADD",
-  SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>, SDTCisVT<0, i64>, SDTCisVT<2, i32>]>
->;
-
 def SIload_constant : SDNode<"AMDGPUISD::LOAD_CONSTANT",
-  SDTypeProfile<1, 2, [SDTCisVT<0, f32>, SDTCisVT<1, i128>, SDTCisVT<2, i32>]>,
+  SDTypeProfile<1, 2, [SDTCisVT<0, f32>, SDTCisVT<1, v4i32>, SDTCisVT<2, i32>]>,
                       [SDNPMayLoad, SDNPMemOperand]
 >;
 
 def SItbuffer_store : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT",
   SDTypeProfile<0, 13,
-    [SDTCisVT<0, i128>,   // rsrc(SGPR)
+    [SDTCisVT<0, v4i32>,   // rsrc(SGPR)
      SDTCisVT<1, iAny>,   // vdata(VGPR)
      SDTCisVT<2, i32>,    // num_channels(imm)
      SDTCisVT<3, i32>,    // vaddr(VGPR)
@@ -41,13 +43,13 @@ def SItbuffer_store : SDNode<"AMDGPUISD::TBUFFER_STORE_FORMAT",
 >;
 
 def SIload_input : SDNode<"AMDGPUISD::LOAD_INPUT",
-  SDTypeProfile<1, 3, [SDTCisVT<0, v4f32>, SDTCisVT<1, i128>, SDTCisVT<2, i16>,
+  SDTypeProfile<1, 3, [SDTCisVT<0, v4f32>, SDTCisVT<1, v4i32>, SDTCisVT<2, i16>,
                        SDTCisVT<3, i32>]>
 >;
 
 class SDSample<string opcode> : SDNode <opcode,
   SDTypeProfile<1, 4, [SDTCisVT<0, v4f32>, SDTCisVT<2, v32i8>,
-                       SDTCisVT<3, i128>, SDTCisVT<4, i32>]>
+                       SDTCisVT<3, v4i32>, SDTCisVT<4, i32>]>
 >;
 
 def SIsample : SDSample<"AMDGPUISD::SAMPLE">;
@@ -111,14 +113,17 @@ def IMM16bit : PatLeaf <(imm),
   [{return isUInt<16>(N->getZExtValue());}]
 >;
 
+def IMM32bit : PatLeaf <(imm),
+  [{return isUInt<32>(N->getZExtValue());}]
+>;
+
 def mubuf_vaddr_offset : PatFrag<
   (ops node:$ptr, node:$offset, node:$imm_offset),
   (add (add node:$ptr, node:$offset), node:$imm_offset)
 >;
 
 class InlineImm <ValueType vt> : PatLeaf <(vt imm), [{
-  return
-    (*(const SITargetLowering *)getTargetLowering()).analyzeImmediate(N) == 0;
+  return isInlineImmediate(N);
 }]>;
 
 class SGPRImm <dag frag> : PatLeaf<frag, [{
@@ -138,7 +143,7 @@ class SGPRImm <dag frag> : PatLeaf<frag, [{
 }]>;
 
 def FRAMEri32 : Operand<iPTR> {
-  let MIOperandInfo = (ops SReg_32:$ptr, i32imm:$index);
+  let MIOperandInfo = (ops i32:$ptr, i32imm:$index);
 }
 
 //===----------------------------------------------------------------------===//
@@ -197,15 +202,17 @@ class SOP2_SHIFT_64 <bits<7> op, string opName, list<dag> pattern> : SOP2 <
   opName#" $dst, $src0, $src1", pattern
 >;
 
-class SOPC_32 <bits<7> op, string opName, list<dag> pattern> : SOPC <
-  op, (outs SCCReg:$dst), (ins SSrc_32:$src0, SSrc_32:$src1),
-  opName#" $dst, $src0, $src1", pattern
->;
 
-class SOPC_64 <bits<7> op, string opName, list<dag> pattern> : SOPC <
-  op, (outs SCCReg:$dst), (ins SSrc_64:$src0, SSrc_64:$src1),
-  opName#" $dst, $src0, $src1", pattern
->;
+class SOPC_Helper <bits<7> op, RegisterClass rc, ValueType vt,
+                    string opName, PatLeaf cond> : SOPC <
+  op, (outs SCCReg:$dst), (ins rc:$src0, rc:$src1),
+  opName#" $dst, $src0, $src1", []>;
+
+class SOPC_32<bits<7> op, string opName, PatLeaf cond = COND_NULL>
+  : SOPC_Helper<op, SSrc_32, i32, opName, cond>;
+
+class SOPC_64<bits<7> op, string opName, PatLeaf cond = COND_NULL>
+  : SOPC_Helper<op, SSrc_64, i64, opName, cond>;
 
 class SOPK_32 <bits<5> op, string opName, list<dag> pattern> : SOPK <
   op, (outs SReg_32:$dst), (ins i16imm:$src0),
@@ -221,7 +228,7 @@ multiclass SMRD_Helper <bits<5> op, string asm, RegisterClass baseClass,
                         RegisterClass dstClass> {
   def _IMM : SMRD <
     op, 1, (outs dstClass:$dst),
-    (ins baseClass:$sbase, i32imm:$offset),
+    (ins baseClass:$sbase, u32imm:$offset),
     asm#" $dst, $sbase, $offset", []
   >;
 
@@ -245,6 +252,28 @@ class VOP2_REV <string revOp, bit isOrig> {
   bit IsOrig = isOrig;
 }
 
+class SIMCInstr <string pseudo, int subtarget> {
+  string PseudoInstr = pseudo;
+  int Subtarget = subtarget;
+}
+
+multiclass VOP3_m <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern,
+                   string opName> {
+
+  def "" : InstSI <outs, ins, "", pattern>, VOP <opName>,
+           SIMCInstr<OpName, SISubtarget.NONE> {
+    let isPseudo = 1;
+  }
+
+  def _si : VOP3 <op, outs, ins, asm, []>, SIMCInstr<opName, SISubtarget.SI>;
+
+}
+
+// This must always be right before the operand being input modified.
+def InputMods : OperandWithDefaultOps <i32, (ops (i32 0))> {
+  let PrintMethod = "printOperandAndMods";
+}
+
 multiclass VOP1_Helper <bits<8> op, RegisterClass drc, RegisterClass src,
                         string opName, list<dag> pattern> {
 
@@ -256,10 +285,8 @@ multiclass VOP1_Helper <bits<8> op, RegisterClass drc, RegisterClass src,
   def _e64 : VOP3 <
     {1, 1, op{6}, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}},
     (outs drc:$dst),
-    (ins src:$src0,
-         i32imm:$abs, i32imm:$clamp,
-         i32imm:$omod, i32imm:$neg),
-    opName#"_e64 $dst, $src0, $abs, $clamp, $omod, $neg", []
+    (ins InputMods:$src0_modifiers, src:$src0, i32imm:$clamp, i32imm:$omod),
+    opName#"_e64 $dst, $src0_modifiers, $clamp, $omod", []
   >, VOP <opName> {
     let src1 = SIOperand.ZERO;
     let src2 = SIOperand.ZERO;
@@ -288,10 +315,10 @@ multiclass VOP2_Helper <bits<6> op, RegisterClass vrc, RegisterClass arc,
   def _e64 : VOP3 <
     {1, 0, 0, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}},
     (outs vrc:$dst),
-    (ins arc:$src0, arc:$src1,
-         i32imm:$abs, i32imm:$clamp,
-         i32imm:$omod, i32imm:$neg),
-    opName#"_e64 $dst, $src0, $src1, $abs, $clamp, $omod, $neg", []
+    (ins InputMods:$src0_modifiers, arc:$src0,
+         InputMods:$src1_modifiers, arc:$src1,
+         i32imm:$clamp, i32imm:$omod),
+    opName#"_e64 $dst, $src0_modifiers, $src1_modifiers, $clamp, $omod", []
   >, VOP <opName>, VOP2_REV<revOp#"_e64", !eq(revOp, opName)> {
     let src2 = SIOperand.ZERO;
   }
@@ -316,10 +343,10 @@ multiclass VOP2b_32 <bits<6> op, string opName, list<dag> pattern,
   def _e64 : VOP3b <
     {1, 0, 0, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}},
     (outs VReg_32:$dst),
-    (ins VSrc_32:$src0, VSrc_32:$src1,
-         i32imm:$abs, i32imm:$clamp,
-         i32imm:$omod, i32imm:$neg),
-    opName#"_e64 $dst, $src0, $src1, $abs, $clamp, $omod, $neg", []
+    (ins InputMods: $src0_modifiers, VSrc_32:$src0,
+         InputMods:$src1_modifiers, VSrc_32:$src1,
+         i32imm:$clamp, i32imm:$omod),
+    opName#"_e64 $dst, $src0_modifiers, $src1_modifiers, $clamp, $omod", []
   >, VOP <opName>, VOP2_REV<revOp#"_e64", !eq(revOp, opName)> {
     let src2 = SIOperand.ZERO;
     /* the VOP2 variant puts the carry out into VCC, the VOP3 variant
@@ -340,15 +367,16 @@ multiclass VOPC_Helper <bits<8> op, RegisterClass vrc, RegisterClass arc,
   def _e64 : VOP3 <
     {0, op{7}, op{6}, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}},
     (outs SReg_64:$dst),
-    (ins arc:$src0, arc:$src1,
-         InstFlag:$abs, InstFlag:$clamp,
-         InstFlag:$omod, InstFlag:$neg),
-    opName#"_e64 $dst, $src0, $src1, $abs, $clamp, $omod, $neg",
+    (ins InputMods:$src0_modifiers, arc:$src0,
+         InputMods:$src1_modifiers, arc:$src1,
+         InstFlag:$clamp, InstFlag:$omod),
+    opName#"_e64 $dst, $src0_modifiers, $src1_modifiers, $clamp, $omod",
     !if(!eq(!cast<string>(cond), "COND_NULL"), []<dag>,
       [(set SReg_64:$dst, (i1 (setcc (vt arc:$src0), arc:$src1, cond)))]
     )
   >, VOP <opName> {
     let src2 = SIOperand.ZERO;
+    let src2_modifiers = 0;
   }
 }
 
@@ -360,12 +388,13 @@ multiclass VOPC_64 <bits<8> op, string opName,
   ValueType vt = untyped, PatLeaf cond = COND_NULL>
   : VOPC_Helper <op, VReg_64, VSrc_64, opName, vt, cond>;
 
-class VOP3_32 <bits<9> op, string opName, list<dag> pattern> : VOP3 <
+multiclass VOP3_32 <bits<9> op, string opName, list<dag> pattern> : VOP3_m <
   op, (outs VReg_32:$dst),
-  (ins VSrc_32:$src0, VSrc_32:$src1, VSrc_32:$src2,
-   InstFlag:$abs, InstFlag:$clamp, InstFlag:$omod, InstFlag:$neg),
-  opName#" $dst, $src0, $src1, $src2, $abs, $clamp, $omod, $neg", pattern
->, VOP <opName>;
+  (ins InputMods: $src0_modifiers, VSrc_32:$src0, InputMods:$src1_modifiers,
+   VSrc_32:$src1, InputMods:$src2_modifiers, VSrc_32:$src2,
+   InstFlag:$clamp, InstFlag:$omod),
+  opName#" $dst, $src0_modifiers, $src1, $src2, $clamp, $omod", pattern, opName
+>;
 
 class VOP3_64_Shift <bits <9> op, string opName, list<dag> pattern> : VOP3 <
   op, (outs VReg_64:$dst),
@@ -374,10 +403,9 @@ class VOP3_64_Shift <bits <9> op, string opName, list<dag> pattern> : VOP3 <
 >, VOP <opName> {
 
   let src2 = SIOperand.ZERO;
-  let abs = 0;
+  let src0_modifiers = 0;
   let clamp = 0;
   let omod = 0;
-  let neg = 0;
 }
 
 class VOP3_64 <bits<9> op, string opName, list<dag> pattern> : VOP3 <
@@ -403,7 +431,7 @@ class DS_1A <bits<8> op, dag outs, dag ins, string asm, list<dag> pat> :
 class DS_Load_Helper <bits<8> op, string asm, RegisterClass regClass> : DS_1A <
   op,
   (outs regClass:$vdst),
-  (ins i1imm:$gds, VReg_32:$addr, i16imm:$offset),
+  (ins i1imm:$gds, VReg_32:$addr, u16imm:$offset),
   asm#" $vdst, $addr, $offset, [M0]",
   []> {
   let data0 = 0;
@@ -415,7 +443,7 @@ class DS_Load_Helper <bits<8> op, string asm, RegisterClass regClass> : DS_1A <
 class DS_Load2_Helper <bits<8> op, string asm, RegisterClass regClass> : DS <
   op,
   (outs regClass:$vdst),
-  (ins i1imm:$gds, VReg_32:$addr, i8imm:$offset0, i8imm:$offset1),
+  (ins i1imm:$gds, VReg_32:$addr, u8imm:$offset0, u8imm:$offset1),
   asm#" $gds, $vdst, $addr, $offset0, $offset1, [M0]",
   []> {
   let data0 = 0;
@@ -427,7 +455,7 @@ class DS_Load2_Helper <bits<8> op, string asm, RegisterClass regClass> : DS <
 class DS_Store_Helper <bits<8> op, string asm, RegisterClass regClass> : DS_1A <
   op,
   (outs),
-  (ins i1imm:$gds, VReg_32:$addr, regClass:$data0, i16imm:$offset),
+  (ins i1imm:$gds, VReg_32:$addr, regClass:$data0, u16imm:$offset),
   asm#" $addr, $data0, $offset [M0]",
   []> {
   let data1 = 0;
@@ -439,7 +467,7 @@ class DS_Store_Helper <bits<8> op, string asm, RegisterClass regClass> : DS_1A <
 class DS_Store2_Helper <bits<8> op, string asm, RegisterClass regClass> : DS_1A <
   op,
   (outs),
-  (ins i1imm:$gds, VReg_32:$addr, regClass:$data0, i8imm:$offset0, i8imm:$offset1),
+  (ins i1imm:$gds, VReg_32:$addr, regClass:$data0, u8imm:$offset0, u8imm:$offset1),
   asm#" $addr, $data0, $data1, $offset0, $offset1 [M0]",
   []> {
   let mayStore = 1;
@@ -450,7 +478,7 @@ class DS_Store2_Helper <bits<8> op, string asm, RegisterClass regClass> : DS_1A
 class DS_1A1D_RET <bits<8> op, string asm, RegisterClass rc> : DS_1A <
   op,
   (outs rc:$vdst),
-  (ins i1imm:$gds, VReg_32:$addr, VReg_32:$data0, i16imm:$offset),
+  (ins i1imm:$gds, VReg_32:$addr, VReg_32:$data0, u16imm:$offset),
   asm#" $vdst, $addr, $data0, $offset, [M0]",
   []> {
 
@@ -462,7 +490,7 @@ class DS_1A1D_RET <bits<8> op, string asm, RegisterClass rc> : DS_1A <
 class MTBUF_Store_Helper <bits<3> op, string asm, RegisterClass regClass> : MTBUF <
   op,
   (outs),
-  (ins regClass:$vdata, i16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc,
+  (ins regClass:$vdata, u16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc,
    i1imm:$addr64, i8imm:$dfmt, i8imm:$nfmt, VReg_32:$vaddr,
    SReg_128:$srsrc, i1imm:$slc, i1imm:$tfe, SSrc_32:$soffset),
   asm#" $vdata, $offset, $offen, $idxen, $glc, $addr64, $dfmt,"
@@ -481,7 +509,7 @@ multiclass MUBUF_Load_Helper <bits<7> op, string asm, RegisterClass regClass> {
       let offen = 0, idxen = 0 in {
         def _OFFSET : MUBUF <op, (outs regClass:$vdata),
                              (ins SReg_128:$srsrc, VReg_32:$vaddr,
-                             i16imm:$offset, SSrc_32:$soffset, i1imm:$glc,
+                             u16imm:$offset, SSrc_32:$soffset, i1imm:$glc,
                              i1imm:$slc, i1imm:$tfe),
                              asm#" $vdata, $srsrc + $offset + $soffset, glc=$glc, slc=$slc, tfe=$tfe", []>;
       }
@@ -497,7 +525,7 @@ multiclass MUBUF_Load_Helper <bits<7> op, string asm, RegisterClass regClass> {
       let offen = 0, idxen = 1 in {
         def _IDXEN  : MUBUF <op, (outs regClass:$vdata),
                              (ins SReg_128:$srsrc, VReg_32:$vaddr,
-                             i16imm:$offset, SSrc_32:$soffset, i1imm:$glc,
+                             u16imm:$offset, SSrc_32:$soffset, i1imm:$glc,
                              i1imm:$slc, i1imm:$tfe),
                              asm#" $vdata, $srsrc[$vaddr] + $offset + $soffset, glc=$glc, slc=$slc, tfe=$tfe", []>;
       }
@@ -513,7 +541,7 @@ multiclass MUBUF_Load_Helper <bits<7> op, string asm, RegisterClass regClass> {
 
     let offen = 0, idxen = 0, addr64 = 1, glc = 0, slc = 0, tfe = 0, soffset = 128 /* ZERO */ in {
       def _ADDR64 : MUBUF <op, (outs regClass:$vdata),
-                           (ins SReg_128:$srsrc, VReg_64:$vaddr, i16imm:$offset),
+                           (ins SReg_128:$srsrc, VReg_64:$vaddr, u16imm:$offset),
                            asm#" $vdata, $srsrc + $vaddr + $offset", []>;
     }
   }
@@ -521,7 +549,7 @@ multiclass MUBUF_Load_Helper <bits<7> op, string asm, RegisterClass regClass> {
 
 class MUBUF_Store_Helper <bits<7> op, string name, RegisterClass vdataClass> :
     MUBUF <op, (outs), (ins vdataClass:$vdata, SReg_128:$srsrc, VReg_64:$vaddr,
-                            i16imm:$offset),
+                            u16imm:$offset),
           name#" $vdata, $srsrc + $vaddr + $offset",
          []> {
 
@@ -542,7 +570,7 @@ class MUBUF_Store_Helper <bits<7> op, string name, RegisterClass vdataClass> :
 class MTBUF_Load_Helper <bits<3> op, string asm, RegisterClass regClass> : MTBUF <
   op,
   (outs regClass:$dst),
-  (ins i16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, i1imm:$addr64,
+  (ins u16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, i1imm:$addr64,
        i8imm:$dfmt, i8imm:$nfmt, VReg_32:$vaddr, SReg_128:$srsrc,
        i1imm:$slc, i1imm:$tfe, SSrc_32:$soffset),
   asm#" $dst, $offset, $offen, $idxen, $glc, $addr64, $dfmt,"
@@ -677,4 +705,12 @@ def isDS : InstrMapping {
   let ValueCols = [["8"]];
 }
 
+def getMCOpcode : InstrMapping {
+  let FilterClass = "SIMCInstr";
+  let RowFields = ["PseudoInstr"];
+  let ColFields = ["Subtarget"];
+  let KeyCol = [!cast<string>(SISubtarget.NONE)];
+  let ValueCols = [[!cast<string>(SISubtarget.SI)]];
+}
+
 include "SIInstructions.td"
diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
index 5232139..500fa78 100644
--- a/lib/Target/R600/SIInstructions.td
+++ b/lib/Target/R600/SIInstructions.td
@@ -32,9 +32,56 @@ def isSI : Predicate<"Subtarget.getGeneration() "
 def isCI : Predicate<"Subtarget.getGeneration() "
                       ">= AMDGPUSubtarget::SEA_ISLANDS">;
 
+def isCFDepth0 : Predicate<"isCFDepth0()">;
+
 def WAIT_FLAG : InstFlag<"printWaitFlag">;
 
-let Predicates = [isSI] in {
+let SubtargetPredicate = isSI in {
+let OtherPredicates  = [isCFDepth0] in {
+
+//===----------------------------------------------------------------------===//
+// SMRD Instructions
+//===----------------------------------------------------------------------===//
+
+let mayLoad = 1 in {
+
+// We are using the SGPR_32 and not the SReg_32 register class for 32-bit
+// SMRD instructions, because the SGPR_32 register class does not include M0
+// and writing to M0 from an SMRD instruction will hang the GPU.
+defm S_LOAD_DWORD : SMRD_Helper <0x00, "S_LOAD_DWORD", SReg_64, SGPR_32>;
+defm S_LOAD_DWORDX2 : SMRD_Helper <0x01, "S_LOAD_DWORDX2", SReg_64, SReg_64>;
+defm S_LOAD_DWORDX4 : SMRD_Helper <0x02, "S_LOAD_DWORDX4", SReg_64, SReg_128>;
+defm S_LOAD_DWORDX8 : SMRD_Helper <0x03, "S_LOAD_DWORDX8", SReg_64, SReg_256>;
+defm S_LOAD_DWORDX16 : SMRD_Helper <0x04, "S_LOAD_DWORDX16", SReg_64, SReg_512>;
+
+defm S_BUFFER_LOAD_DWORD : SMRD_Helper <
+  0x08, "S_BUFFER_LOAD_DWORD", SReg_128, SGPR_32
+>;
+
+defm S_BUFFER_LOAD_DWORDX2 : SMRD_Helper <
+  0x09, "S_BUFFER_LOAD_DWORDX2", SReg_128, SReg_64
+>;
+
+defm S_BUFFER_LOAD_DWORDX4 : SMRD_Helper <
+  0x0a, "S_BUFFER_LOAD_DWORDX4", SReg_128, SReg_128
+>;
+
+defm S_BUFFER_LOAD_DWORDX8 : SMRD_Helper <
+  0x0b, "S_BUFFER_LOAD_DWORDX8", SReg_128, SReg_256
+>;
+
+defm S_BUFFER_LOAD_DWORDX16 : SMRD_Helper <
+  0x0c, "S_BUFFER_LOAD_DWORDX16", SReg_128, SReg_512
+>;
+
+} // mayLoad = 1
+
+//def S_MEMTIME : SMRD_ <0x0000001e, "S_MEMTIME", []>;
+//def S_DCACHE_INV : SMRD_ <0x0000001f, "S_DCACHE_INV", []>;
+
+//===----------------------------------------------------------------------===//
+// SOP1 Instructions
+//===----------------------------------------------------------------------===//
 
 let neverHasSideEffects = 1 in {
 
@@ -45,7 +92,10 @@ def S_CMOV_B32 : SOP1_32 <0x00000005, "S_CMOV_B32", []>;
 def S_CMOV_B64 : SOP1_64 <0x00000006, "S_CMOV_B64", []>;
 } // End isMoveImm = 1
 
-def S_NOT_B32 : SOP1_32 <0x00000007, "S_NOT_B32", []>;
+def S_NOT_B32 : SOP1_32 <0x00000007, "S_NOT_B32",
+  [(set i32:$dst, (not i32:$src0))]
+>;
+
 def S_NOT_B64 : SOP1_64 <0x00000008, "S_NOT_B64", []>;
 def S_WQM_B32 : SOP1_32 <0x00000009, "S_WQM_B32", []>;
 def S_WQM_B64 : SOP1_64 <0x0000000a, "S_WQM_B64", []>;
@@ -65,8 +115,13 @@ def S_BREV_B64 : SOP1_64 <0x0000000c, "S_BREV_B64", []>;
 //def S_FLBIT_I32_B64 : SOP1_32 <0x00000016, "S_FLBIT_I32_B64", []>;
 def S_FLBIT_I32 : SOP1_32 <0x00000017, "S_FLBIT_I32", []>;
 //def S_FLBIT_I32_I64 : SOP1_32 <0x00000018, "S_FLBIT_I32_I64", []>;
-//def S_SEXT_I32_I8 : SOP1_32 <0x00000019, "S_SEXT_I32_I8", []>;
-//def S_SEXT_I32_I16 : SOP1_32 <0x0000001a, "S_SEXT_I32_I16", []>;
+def S_SEXT_I32_I8 : SOP1_32 <0x00000019, "S_SEXT_I32_I8",
+  [(set i32:$dst, (sext_inreg i32:$src0, i8))]
+>;
+def S_SEXT_I32_I16 : SOP1_32 <0x0000001a, "S_SEXT_I32_I16",
+  [(set i32:$dst, (sext_inreg i32:$src0, i16))]
+>;
+
 ////def S_BITSET0_B32 : SOP1_BITSET0 <0x0000001b, "S_BITSET0_B32", []>;
 ////def S_BITSET0_B64 : SOP1_BITSET0 <0x0000001c, "S_BITSET0_B64", []>;
 ////def S_BITSET1_B32 : SOP1_BITSET1 <0x0000001d, "S_BITSET1_B32", []>;
@@ -99,6 +154,150 @@ def S_MOVRELD_B64 : SOP1_64 <0x00000031, "S_MOVRELD_B64", []>;
 def S_MOV_REGRD_B32 : SOP1_32 <0x00000033, "S_MOV_REGRD_B32", []>;
 def S_ABS_I32 : SOP1_32 <0x00000034, "S_ABS_I32", []>;
 def S_MOV_FED_B32 : SOP1_32 <0x00000035, "S_MOV_FED_B32", []>;
+
+//===----------------------------------------------------------------------===//
+// SOP2 Instructions
+//===----------------------------------------------------------------------===//
+
+let Defs = [SCC] in { // Carry out goes to SCC
+let isCommutable = 1 in {
+def S_ADD_U32 : SOP2_32 <0x00000000, "S_ADD_U32", []>;
+def S_ADD_I32 : SOP2_32 <0x00000002, "S_ADD_I32",
+  [(set i32:$dst, (add SSrc_32:$src0, SSrc_32:$src1))]
+>;
+} // End isCommutable = 1
+
+def S_SUB_U32 : SOP2_32 <0x00000001, "S_SUB_U32", []>;
+def S_SUB_I32 : SOP2_32 <0x00000003, "S_SUB_I32",
+  [(set i32:$dst, (sub SSrc_32:$src0, SSrc_32:$src1))]
+>;
+
+let Uses = [SCC] in { // Carry in comes from SCC
+let isCommutable = 1 in {
+def S_ADDC_U32 : SOP2_32 <0x00000004, "S_ADDC_U32",
+  [(set i32:$dst, (adde (i32 SSrc_32:$src0), (i32 SSrc_32:$src1)))]>;
+} // End isCommutable = 1
+
+def S_SUBB_U32 : SOP2_32 <0x00000005, "S_SUBB_U32",
+  [(set i32:$dst, (sube (i32 SSrc_32:$src0), (i32 SSrc_32:$src1)))]>;
+} // End Uses = [SCC]
+} // End Defs = [SCC]
+
+def S_MIN_I32 : SOP2_32 <0x00000006, "S_MIN_I32",
+  [(set i32:$dst, (AMDGPUsmin i32:$src0, i32:$src1))]
+>;
+def S_MIN_U32 : SOP2_32 <0x00000007, "S_MIN_U32",
+  [(set i32:$dst, (AMDGPUumin i32:$src0, i32:$src1))]
+>;
+def S_MAX_I32 : SOP2_32 <0x00000008, "S_MAX_I32",
+  [(set i32:$dst, (AMDGPUsmax i32:$src0, i32:$src1))]
+>;
+def S_MAX_U32 : SOP2_32 <0x00000009, "S_MAX_U32",
+  [(set i32:$dst, (AMDGPUumax i32:$src0, i32:$src1))]
+>;
+
+def S_CSELECT_B32 : SOP2 <
+  0x0000000a, (outs SReg_32:$dst),
+  (ins SReg_32:$src0, SReg_32:$src1, SCCReg:$scc), "S_CSELECT_B32",
+  []
+>;
+
+def S_CSELECT_B64 : SOP2_64 <0x0000000b, "S_CSELECT_B64", []>;
+
+def S_AND_B32 : SOP2_32 <0x0000000e, "S_AND_B32",
+  [(set i32:$dst, (and i32:$src0, i32:$src1))]
+>;
+
+def S_AND_B64 : SOP2_64 <0x0000000f, "S_AND_B64",
+  [(set i64:$dst, (and i64:$src0, i64:$src1))]
+>;
+
+def S_OR_B32 : SOP2_32 <0x00000010, "S_OR_B32",
+  [(set i32:$dst, (or i32:$src0, i32:$src1))]
+>;
+
+def S_OR_B64 : SOP2_64 <0x00000011, "S_OR_B64",
+  [(set i64:$dst, (or i64:$src0, i64:$src1))]
+>;
+
+def S_XOR_B32 : SOP2_32 <0x00000012, "S_XOR_B32",
+  [(set i32:$dst, (xor i32:$src0, i32:$src1))]
+>;
+
+def S_XOR_B64 : SOP2_64 <0x00000013, "S_XOR_B64",
+  [(set i64:$dst, (xor i64:$src0, i64:$src1))]
+>;
+def S_ANDN2_B32 : SOP2_32 <0x00000014, "S_ANDN2_B32", []>;
+def S_ANDN2_B64 : SOP2_64 <0x00000015, "S_ANDN2_B64", []>;
+def S_ORN2_B32 : SOP2_32 <0x00000016, "S_ORN2_B32", []>;
+def S_ORN2_B64 : SOP2_64 <0x00000017, "S_ORN2_B64", []>;
+def S_NAND_B32 : SOP2_32 <0x00000018, "S_NAND_B32", []>;
+def S_NAND_B64 : SOP2_64 <0x00000019, "S_NAND_B64", []>;
+def S_NOR_B32 : SOP2_32 <0x0000001a, "S_NOR_B32", []>;
+def S_NOR_B64 : SOP2_64 <0x0000001b, "S_NOR_B64", []>;
+def S_XNOR_B32 : SOP2_32 <0x0000001c, "S_XNOR_B32", []>;
+def S_XNOR_B64 : SOP2_64 <0x0000001d, "S_XNOR_B64", []>;
+
+// Use added complexity so these patterns are preferred to the VALU patterns.
+let AddedComplexity = 1 in {
+
+def S_LSHL_B32 : SOP2_32 <0x0000001e, "S_LSHL_B32",
+  [(set i32:$dst, (shl i32:$src0, i32:$src1))]
+>;
+def S_LSHL_B64 : SOP2_SHIFT_64 <0x0000001f, "S_LSHL_B64",
+  [(set i64:$dst, (shl i64:$src0, i32:$src1))]
+>;
+def S_LSHR_B32 : SOP2_32 <0x00000020, "S_LSHR_B32",
+  [(set i32:$dst, (srl i32:$src0, i32:$src1))]
+>;
+def S_LSHR_B64 : SOP2_SHIFT_64 <0x00000021, "S_LSHR_B64",
+  [(set i64:$dst, (srl i64:$src0, i32:$src1))]
+>;
+def S_ASHR_I32 : SOP2_32 <0x00000022, "S_ASHR_I32",
+  [(set i32:$dst, (sra i32:$src0, i32:$src1))]
+>;
+def S_ASHR_I64 : SOP2_SHIFT_64 <0x00000023, "S_ASHR_I64",
+  [(set i64:$dst, (sra i64:$src0, i32:$src1))]
+>;
+
+} // End AddedComplexity = 1
+
+def S_BFM_B32 : SOP2_32 <0x00000024, "S_BFM_B32", []>;
+def S_BFM_B64 : SOP2_64 <0x00000025, "S_BFM_B64", []>;
+def S_MUL_I32 : SOP2_32 <0x00000026, "S_MUL_I32", []>;
+def S_BFE_U32 : SOP2_32 <0x00000027, "S_BFE_U32", []>;
+def S_BFE_I32 : SOP2_32 <0x00000028, "S_BFE_I32", []>;
+def S_BFE_U64 : SOP2_64 <0x00000029, "S_BFE_U64", []>;
+def S_BFE_I64 : SOP2_64 <0x0000002a, "S_BFE_I64", []>;
+//def S_CBRANCH_G_FORK : SOP2_ <0x0000002b, "S_CBRANCH_G_FORK", []>;
+def S_ABSDIFF_I32 : SOP2_32 <0x0000002c, "S_ABSDIFF_I32", []>;
+
+//===----------------------------------------------------------------------===//
+// SOPC Instructions
+//===----------------------------------------------------------------------===//
+
+def S_CMP_EQ_I32 : SOPC_32 <0x00000000, "S_CMP_EQ_I32">;
+def S_CMP_LG_I32 : SOPC_32 <0x00000001, "S_CMP_LG_I32">;
+def S_CMP_GT_I32 : SOPC_32 <0x00000002, "S_CMP_GT_I32">;
+def S_CMP_GE_I32 : SOPC_32 <0x00000003, "S_CMP_GE_I32">;
+def S_CMP_LT_I32 : SOPC_32 <0x00000004, "S_CMP_LT_I32">;
+def S_CMP_LE_I32 : SOPC_32 <0x00000005, "S_CMP_LE_I32">;
+def S_CMP_EQ_U32 : SOPC_32 <0x00000006, "S_CMP_EQ_U32">;
+def S_CMP_LG_U32 : SOPC_32 <0x00000007, "S_CMP_LG_U32">;
+def S_CMP_GT_U32 : SOPC_32 <0x00000008, "S_CMP_GT_U32">;
+def S_CMP_GE_U32 : SOPC_32 <0x00000009, "S_CMP_GE_U32">;
+def S_CMP_LT_U32 : SOPC_32 <0x0000000a, "S_CMP_LT_U32">;
+def S_CMP_LE_U32 : SOPC_32 <0x0000000b, "S_CMP_LE_U32">;
+////def S_BITCMP0_B32 : SOPC_BITCMP0 <0x0000000c, "S_BITCMP0_B32", []>;
+////def S_BITCMP1_B32 : SOPC_BITCMP1 <0x0000000d, "S_BITCMP1_B32", []>;
+////def S_BITCMP0_B64 : SOPC_BITCMP0 <0x0000000e, "S_BITCMP0_B64", []>;
+////def S_BITCMP1_B64 : SOPC_BITCMP1 <0x0000000f, "S_BITCMP1_B64", []>;
+//def S_SETVSKIP : SOPC_ <0x00000010, "S_SETVSKIP", []>;
+
+//===----------------------------------------------------------------------===//
+// SOPK Instructions
+//===----------------------------------------------------------------------===//
+
 def S_MOVK_I32 : SOPK_32 <0x00000000, "S_MOVK_I32", []>;
 def S_CMOVK_I32 : SOPK_32 <0x00000002, "S_CMOVK_I32", []>;
 
@@ -147,6 +346,108 @@ def S_GETREG_REGRD_B32 : SOPK_32 <0x00000014, "S_GETREG_REGRD_B32", []>;
 //def S_SETREG_IMM32_B32 : SOPK_32 <0x00000015, "S_SETREG_IMM32_B32", []>;
 //def EXP : EXP_ <0x00000000, "EXP", []>;
 
+} // End let OtherPredicates = [isCFDepth0]
+
+//===----------------------------------------------------------------------===//
+// SOPP Instructions
+//===----------------------------------------------------------------------===//
+
+def S_NOP : SOPP <0x00000000, (ins i16imm:$SIMM16), "S_NOP $SIMM16", []>;
+
+let isTerminator = 1 in {
+
+def S_ENDPGM : SOPP <0x00000001, (ins), "S_ENDPGM",
+  [(IL_retflag)]> {
+  let SIMM16 = 0;
+  let isBarrier = 1;
+  let hasCtrlDep = 1;
+}
+
+let isBranch = 1 in {
+def S_BRANCH : SOPP <
+  0x00000002, (ins brtarget:$target), "S_BRANCH $target",
+  [(br bb:$target)]> {
+  let isBarrier = 1;
+}
+
+let DisableEncoding = "$scc" in {
+def S_CBRANCH_SCC0 : SOPP <
+  0x00000004, (ins brtarget:$target, SCCReg:$scc),
+  "S_CBRANCH_SCC0 $target", []
+>;
+def S_CBRANCH_SCC1 : SOPP <
+  0x00000005, (ins brtarget:$target, SCCReg:$scc),
+  "S_CBRANCH_SCC1 $target",
+  []
+>;
+} // End DisableEncoding = "$scc"
+
+def S_CBRANCH_VCCZ : SOPP <
+  0x00000006, (ins brtarget:$target, VCCReg:$vcc),
+  "S_CBRANCH_VCCZ $target",
+  []
+>;
+def S_CBRANCH_VCCNZ : SOPP <
+  0x00000007, (ins brtarget:$target, VCCReg:$vcc),
+  "S_CBRANCH_VCCNZ $target",
+  []
+>;
+
+let DisableEncoding = "$exec" in {
+def S_CBRANCH_EXECZ : SOPP <
+  0x00000008, (ins brtarget:$target, EXECReg:$exec),
+  "S_CBRANCH_EXECZ $target",
+  []
+>;
+def S_CBRANCH_EXECNZ : SOPP <
+  0x00000009, (ins brtarget:$target, EXECReg:$exec),
+  "S_CBRANCH_EXECNZ $target",
+  []
+>;
+} // End DisableEncoding = "$exec"
+
+
+} // End isBranch = 1
+} // End isTerminator = 1
+
+let hasSideEffects = 1 in {
+def S_BARRIER : SOPP <0x0000000a, (ins), "S_BARRIER",
+  [(int_AMDGPU_barrier_local)]
+> {
+  let SIMM16 = 0;
+  let isBarrier = 1;
+  let hasCtrlDep = 1;
+  let mayLoad = 1;
+  let mayStore = 1;
+}
+
+def S_WAITCNT : SOPP <0x0000000c, (ins WAIT_FLAG:$simm16), "S_WAITCNT $simm16",
+  []
+>;
+//def S_SETHALT : SOPP_ <0x0000000d, "S_SETHALT", []>;
+//def S_SLEEP : SOPP_ <0x0000000e, "S_SLEEP", []>;
+//def S_SETPRIO : SOPP_ <0x0000000f, "S_SETPRIO", []>;
+
+let Uses = [EXEC] in {
+  def S_SENDMSG : SOPP <0x00000010, (ins SendMsgImm:$simm16, M0Reg:$m0), "S_SENDMSG $simm16",
+      [(int_SI_sendmsg imm:$simm16, M0Reg:$m0)]
+  > {
+    let DisableEncoding = "$m0";
+  }
+} // End Uses = [EXEC]
+
+//def S_SENDMSGHALT : SOPP_ <0x00000011, "S_SENDMSGHALT", []>;
+//def S_TRAP : SOPP_ <0x00000012, "S_TRAP", []>;
+//def S_ICACHE_INV : SOPP_ <0x00000013, "S_ICACHE_INV", []>;
+//def S_INCPERFLEVEL : SOPP_ <0x00000014, "S_INCPERFLEVEL", []>;
+//def S_DECPERFLEVEL : SOPP_ <0x00000015, "S_DECPERFLEVEL", []>;
+//def S_TTRACEDATA : SOPP_ <0x00000016, "S_TTRACEDATA", []>;
+} // End hasSideEffects
+
+//===----------------------------------------------------------------------===//
+// VOPC Instructions
+//===----------------------------------------------------------------------===//
+
 let isCompare = 1 in {
 
 defm V_CMP_F_F32 : VOPC_32 <0x00000000, "V_CMP_F_F32">;
@@ -403,6 +704,10 @@ defm V_CMPX_CLASS_F64 : VOPC_64 <0x000000b8, "V_CMPX_CLASS_F64">;
 
 } // End isCompare = 1
 
+//===----------------------------------------------------------------------===//
+// DS Instructions
+//===----------------------------------------------------------------------===//
+
 def DS_ADD_U32_RTN : DS_1A1D_RET <0x20, "DS_ADD_U32_RTN", VReg_32>;
 def DS_SUB_U32_RTN : DS_1A1D_RET <0x21, "DS_SUB_U32_RTN", VReg_32>;
 def DS_WRITE_B32 : DS_Store_Helper <0x0000000d, "DS_WRITE_B32", VReg_32>;
@@ -427,6 +732,9 @@ def DS_READ2_B64 : DS_Load2_Helper <0x00000075, "DS_READ2_B64", VReg_128>;
 // TODO: DS_READ2ST64_B32, DS_READ2ST64_B64,
 // DS_WRITE2ST64_B32, DS_WRITE2ST64_B64
 
+//===----------------------------------------------------------------------===//
+// MUBUF Instructions
+//===----------------------------------------------------------------------===//
 
 //def BUFFER_LOAD_FORMAT_X : MUBUF_ <0x00000000, "BUFFER_LOAD_FORMAT_X", []>;
 //def BUFFER_LOAD_FORMAT_XY : MUBUF_ <0x00000001, "BUFFER_LOAD_FORMAT_XY", []>;
@@ -499,6 +807,11 @@ def BUFFER_STORE_DWORDX4 : MUBUF_Store_Helper <
 //def BUFFER_ATOMIC_FMAX_X2 : MUBUF_X2 <0x00000060, "BUFFER_ATOMIC_FMAX_X2", []>;
 //def BUFFER_WBINVL1_SC : MUBUF_WBINVL1 <0x00000070, "BUFFER_WBINVL1_SC", []>;
 //def BUFFER_WBINVL1 : MUBUF_WBINVL1 <0x00000071, "BUFFER_WBINVL1", []>;
+
+//===----------------------------------------------------------------------===//
+// MTBUF Instructions
+//===----------------------------------------------------------------------===//
+
 //def TBUFFER_LOAD_FORMAT_X : MTBUF_ <0x00000000, "TBUFFER_LOAD_FORMAT_X", []>;
 //def TBUFFER_LOAD_FORMAT_XY : MTBUF_ <0x00000001, "TBUFFER_LOAD_FORMAT_XY", []>;
 //def TBUFFER_LOAD_FORMAT_XYZ : MTBUF_ <0x00000002, "TBUFFER_LOAD_FORMAT_XYZ", []>;
@@ -508,41 +821,10 @@ def TBUFFER_STORE_FORMAT_XY : MTBUF_Store_Helper <0x00000005, "TBUFFER_STORE_FOR
 def TBUFFER_STORE_FORMAT_XYZ : MTBUF_Store_Helper <0x00000006, "TBUFFER_STORE_FORMAT_XYZ", VReg_128>;
 def TBUFFER_STORE_FORMAT_XYZW : MTBUF_Store_Helper <0x00000007, "TBUFFER_STORE_FORMAT_XYZW", VReg_128>;
 
-let mayLoad = 1 in {
-
-// We are using the SGPR_32 and not the SReg_32 register class for 32-bit
-// SMRD instructions, because the SGPR_32 register class does not include M0
-// and writing to M0 from an SMRD instruction will hang the GPU.
-defm S_LOAD_DWORD : SMRD_Helper <0x00, "S_LOAD_DWORD", SReg_64, SGPR_32>;
-defm S_LOAD_DWORDX2 : SMRD_Helper <0x01, "S_LOAD_DWORDX2", SReg_64, SReg_64>;
-defm S_LOAD_DWORDX4 : SMRD_Helper <0x02, "S_LOAD_DWORDX4", SReg_64, SReg_128>;
-defm S_LOAD_DWORDX8 : SMRD_Helper <0x03, "S_LOAD_DWORDX8", SReg_64, SReg_256>;
-defm S_LOAD_DWORDX16 : SMRD_Helper <0x04, "S_LOAD_DWORDX16", SReg_64, SReg_512>;
-
-defm S_BUFFER_LOAD_DWORD : SMRD_Helper <
-  0x08, "S_BUFFER_LOAD_DWORD", SReg_128, SGPR_32
->;
-
-defm S_BUFFER_LOAD_DWORDX2 : SMRD_Helper <
-  0x09, "S_BUFFER_LOAD_DWORDX2", SReg_128, SReg_64
->;
-
-defm S_BUFFER_LOAD_DWORDX4 : SMRD_Helper <
-  0x0a, "S_BUFFER_LOAD_DWORDX4", SReg_128, SReg_128
->;
-
-defm S_BUFFER_LOAD_DWORDX8 : SMRD_Helper <
-  0x0b, "S_BUFFER_LOAD_DWORDX8", SReg_128, SReg_256
->;
-
-defm S_BUFFER_LOAD_DWORDX16 : SMRD_Helper <
-  0x0c, "S_BUFFER_LOAD_DWORDX16", SReg_128, SReg_512
->;
-
-} // mayLoad = 1
+//===----------------------------------------------------------------------===//
+// MIMG Instructions
+//===----------------------------------------------------------------------===//
 
-//def S_MEMTIME : SMRD_ <0x0000001e, "S_MEMTIME", []>;
-//def S_DCACHE_INV : SMRD_ <0x0000001f, "S_DCACHE_INV", []>;
 defm IMAGE_LOAD : MIMG_NoSampler <0x00000000, "IMAGE_LOAD">;
 defm IMAGE_LOAD_MIP : MIMG_NoSampler <0x00000001, "IMAGE_LOAD_MIP">;
 //def IMAGE_LOAD_PCK : MIMG_NoPattern_ <"IMAGE_LOAD_PCK", 0x00000002>;
@@ -638,8 +920,12 @@ defm IMAGE_SAMPLE_C_B : MIMG_Sampler <0x0000002d, "IMAGE_SAMPLE_C_B">;
 //def IMAGE_SAMPLE_C_CD_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_CD_CL_O", 0x0000006f>;
 //def IMAGE_RSRC256 : MIMG_NoPattern_RSRC256 <"IMAGE_RSRC256", 0x0000007e>;
 //def IMAGE_SAMPLER : MIMG_NoPattern_ <"IMAGE_SAMPLER", 0x0000007f>;
-//def V_NOP : VOP1_ <0x00000000, "V_NOP", []>;
 
+//===----------------------------------------------------------------------===//
+// VOP1 Instructions
+//===----------------------------------------------------------------------===//
+
+//def V_NOP : VOP1_ <0x00000000, "V_NOP", []>;
 
 let neverHasSideEffects = 1, isMoveImm = 1 in {
 defm V_MOV_B32 : VOP1_32 <0x00000001, "V_MOV_B32", []>;
@@ -691,8 +977,13 @@ defm V_CVT_F64_F32 : VOP1_64_32 <0x00000010, "V_CVT_F64_F32",
 //defm V_CVT_F32_UBYTE1 : VOP1_32 <0x00000012, "V_CVT_F32_UBYTE1", []>;
 //defm V_CVT_F32_UBYTE2 : VOP1_32 <0x00000013, "V_CVT_F32_UBYTE2", []>;
 //defm V_CVT_F32_UBYTE3 : VOP1_32 <0x00000014, "V_CVT_F32_UBYTE3", []>;
-//defm V_CVT_U32_F64 : VOP1_32 <0x00000015, "V_CVT_U32_F64", []>;
-//defm V_CVT_F64_U32 : VOP1_64 <0x00000016, "V_CVT_F64_U32", []>;
+defm V_CVT_U32_F64 : VOP1_32_64 <0x00000015, "V_CVT_U32_F64",
+  [(set i32:$dst, (fp_to_uint f64:$src0))]
+>;
+defm V_CVT_F64_U32 : VOP1_64_32 <0x00000016, "V_CVT_F64_U32",
+  [(set f64:$dst, (uint_to_fp i32:$src0))]
+>;
+
 defm V_FRACT_F32 : VOP1_32 <0x00000020, "V_FRACT_F32",
   [(set f32:$dst, (AMDGPUfract f32:$src0))]
 >;
@@ -756,6 +1047,11 @@ defm V_MOVRELD_B32 : VOP1_32 <0x00000042, "V_MOVRELD_B32", []>;
 defm V_MOVRELS_B32 : VOP1_32 <0x00000043, "V_MOVRELS_B32", []>;
 defm V_MOVRELSD_B32 : VOP1_32 <0x00000044, "V_MOVRELSD_B32", []>;
 
+
+//===----------------------------------------------------------------------===//
+// VINTRP Instructions
+//===----------------------------------------------------------------------===//
+
 def V_INTERP_P1_F32 : VINTRP <
   0x00000000,
   (outs VReg_32:$dst),
@@ -786,97 +1082,9 @@ def V_INTERP_MOV_F32 : VINTRP <
   let DisableEncoding = "$m0";
 }
 
-//def S_NOP : SOPP_ <0x00000000, "S_NOP", []>;
-
-let isTerminator = 1 in {
-
-def S_ENDPGM : SOPP <0x00000001, (ins), "S_ENDPGM",
-  [(IL_retflag)]> {
-  let SIMM16 = 0;
-  let isBarrier = 1;
-  let hasCtrlDep = 1;
-}
-
-let isBranch = 1 in {
-def S_BRANCH : SOPP <
-  0x00000002, (ins brtarget:$target), "S_BRANCH $target",
-  [(br bb:$target)]> {
-  let isBarrier = 1;
-}
-
-let DisableEncoding = "$scc" in {
-def S_CBRANCH_SCC0 : SOPP <
-  0x00000004, (ins brtarget:$target, SCCReg:$scc),
-  "S_CBRANCH_SCC0 $target", []
->;
-def S_CBRANCH_SCC1 : SOPP <
-  0x00000005, (ins brtarget:$target, SCCReg:$scc),
-  "S_CBRANCH_SCC1 $target",
-  []
->;
-} // End DisableEncoding = "$scc"
-
-def S_CBRANCH_VCCZ : SOPP <
-  0x00000006, (ins brtarget:$target, VCCReg:$vcc),
-  "S_CBRANCH_VCCZ $target",
-  []
->;
-def S_CBRANCH_VCCNZ : SOPP <
-  0x00000007, (ins brtarget:$target, VCCReg:$vcc),
-  "S_CBRANCH_VCCNZ $target",
-  []
->;
-
-let DisableEncoding = "$exec" in {
-def S_CBRANCH_EXECZ : SOPP <
-  0x00000008, (ins brtarget:$target, EXECReg:$exec),
-  "S_CBRANCH_EXECZ $target",
-  []
->;
-def S_CBRANCH_EXECNZ : SOPP <
-  0x00000009, (ins brtarget:$target, EXECReg:$exec),
-  "S_CBRANCH_EXECNZ $target",
-  []
->;
-} // End DisableEncoding = "$exec"
-
-
-} // End isBranch = 1
-} // End isTerminator = 1
-
-let hasSideEffects = 1 in {
-def S_BARRIER : SOPP <0x0000000a, (ins), "S_BARRIER",
-  [(int_AMDGPU_barrier_local)]
-> {
-  let SIMM16 = 0;
-  let isBarrier = 1;
-  let hasCtrlDep = 1;
-  let mayLoad = 1;
-  let mayStore = 1;
-}
-
-def S_WAITCNT : SOPP <0x0000000c, (ins WAIT_FLAG:$simm16), "S_WAITCNT $simm16",
-  []
->;
-//def S_SETHALT : SOPP_ <0x0000000d, "S_SETHALT", []>;
-//def S_SLEEP : SOPP_ <0x0000000e, "S_SLEEP", []>;
-//def S_SETPRIO : SOPP_ <0x0000000f, "S_SETPRIO", []>;
-
-let Uses = [EXEC] in {
-  def S_SENDMSG : SOPP <0x00000010, (ins SendMsgImm:$simm16, M0Reg:$m0), "S_SENDMSG $simm16",
-      [(int_SI_sendmsg imm:$simm16, M0Reg:$m0)]
-  > {
-    let DisableEncoding = "$m0";
-  }
-} // End Uses = [EXEC]
-
-//def S_SENDMSGHALT : SOPP_ <0x00000011, "S_SENDMSGHALT", []>;
-//def S_TRAP : SOPP_ <0x00000012, "S_TRAP", []>;
-//def S_ICACHE_INV : SOPP_ <0x00000013, "S_ICACHE_INV", []>;
-//def S_INCPERFLEVEL : SOPP_ <0x00000014, "S_INCPERFLEVEL", []>;
-//def S_DECPERFLEVEL : SOPP_ <0x00000015, "S_DECPERFLEVEL", []>;
-//def S_TTRACEDATA : SOPP_ <0x00000016, "S_TTRACEDATA", []>;
-} // End hasSideEffects
+//===----------------------------------------------------------------------===//
+// VOP2 Instructions
+//===----------------------------------------------------------------------===//
 
 def V_CNDMASK_B32_e32 : VOP2 <0x00000000, (outs VReg_32:$dst),
   (ins VSrc_32:$src0, VReg_32:$src1, VCCReg:$vcc),
@@ -891,18 +1099,11 @@ def V_CNDMASK_B32_e64 : VOP3 <0x00000100, (outs VReg_32:$dst),
    InstFlag:$abs, InstFlag:$clamp, InstFlag:$omod, InstFlag:$neg),
   "V_CNDMASK_B32_e64 $dst, $src0, $src1, $src2, $abs, $clamp, $omod, $neg",
   [(set i32:$dst, (select i1:$src2, i32:$src1, i32:$src0))]
->;
-
-//f32 pattern for V_CNDMASK_B32_e64
-def : Pat <
-  (f32 (select i1:$src2, f32:$src1, f32:$src0)),
-  (V_CNDMASK_B32_e64 $src0, $src1, $src2)
->;
-
-def : Pat <
-  (i32 (trunc i64:$val)),
-  (EXTRACT_SUBREG $val, sub0)
->;
+> {
+  let src0_modifiers = 0;
+  let src1_modifiers = 0;
+  let src2_modifiers = 0;
+}
 
 def V_READLANE_B32 : VOP2 <
   0x00000001,
@@ -946,11 +1147,11 @@ defm V_MUL_F32 : VOP2_32 <0x00000008, "V_MUL_F32",
 
 
 defm V_MUL_I32_I24 : VOP2_32 <0x00000009, "V_MUL_I32_I24",
-  [(set i32:$dst, (mul I24:$src0, I24:$src1))]
+  [(set i32:$dst, (AMDGPUmul_i24 i32:$src0, i32:$src1))]
 >;
 //defm V_MUL_HI_I32_I24 : VOP2_32 <0x0000000a, "V_MUL_HI_I32_I24", []>;
 defm V_MUL_U32_U24 : VOP2_32 <0x0000000b, "V_MUL_U32_U24",
-  [(set i32:$dst, (mul U24:$src0, U24:$src1))]
+  [(set i32:$dst, (AMDGPUmul_u24 i32:$src0, i32:$src1))]
 >;
 //defm V_MUL_HI_U32_U24 : VOP2_32 <0x0000000c, "V_MUL_HI_U32_U24", []>;
 
@@ -965,27 +1166,43 @@ defm V_MAX_LEGACY_F32 : VOP2_32 <0x0000000e, "V_MAX_LEGACY_F32",
 
 defm V_MIN_F32 : VOP2_32 <0x0000000f, "V_MIN_F32", []>;
 defm V_MAX_F32 : VOP2_32 <0x00000010, "V_MAX_F32", []>;
-defm V_MIN_I32 : VOP2_32 <0x00000011, "V_MIN_I32", []>;
-defm V_MAX_I32 : VOP2_32 <0x00000012, "V_MAX_I32", []>;
-defm V_MIN_U32 : VOP2_32 <0x00000013, "V_MIN_U32", []>;
-defm V_MAX_U32 : VOP2_32 <0x00000014, "V_MAX_U32", []>;
+defm V_MIN_I32 : VOP2_32 <0x00000011, "V_MIN_I32",
+  [(set i32:$dst, (AMDGPUsmin i32:$src0, i32:$src1))]>;
+defm V_MAX_I32 : VOP2_32 <0x00000012, "V_MAX_I32",
+  [(set i32:$dst, (AMDGPUsmax i32:$src0, i32:$src1))]>;
+defm V_MIN_U32 : VOP2_32 <0x00000013, "V_MIN_U32",
+  [(set i32:$dst, (AMDGPUumin i32:$src0, i32:$src1))]>;
+defm V_MAX_U32 : VOP2_32 <0x00000014, "V_MAX_U32",
+  [(set i32:$dst, (AMDGPUumax i32:$src0, i32:$src1))]>;
+
+defm V_LSHR_B32 : VOP2_32 <0x00000015, "V_LSHR_B32",
+  [(set i32:$dst, (srl i32:$src0, i32:$src1))]
+>;
 
-defm V_LSHR_B32 : VOP2_32 <0x00000015, "V_LSHR_B32", []>;
 defm V_LSHRREV_B32 : VOP2_32 <0x00000016, "V_LSHRREV_B32", [], "V_LSHR_B32">;
 
-defm V_ASHR_I32 : VOP2_32 <0x00000017, "V_ASHR_I32", []>;
+defm V_ASHR_I32 : VOP2_32 <0x00000017, "V_ASHR_I32",
+  [(set i32:$dst, (sra i32:$src0, i32:$src1))]
+>;
 defm V_ASHRREV_I32 : VOP2_32 <0x00000018, "V_ASHRREV_I32", [], "V_ASHR_I32">;
 
 let hasPostISelHook = 1 in {
 
-defm V_LSHL_B32 : VOP2_32 <0x00000019, "V_LSHL_B32", []>;
+defm V_LSHL_B32 : VOP2_32 <0x00000019, "V_LSHL_B32",
+  [(set i32:$dst, (shl i32:$src0, i32:$src1))]
+>;
 
 }
 defm V_LSHLREV_B32 : VOP2_32 <0x0000001a, "V_LSHLREV_B32", [], "V_LSHL_B32">;
 
-defm V_AND_B32 : VOP2_32 <0x0000001b, "V_AND_B32", []>;
-defm V_OR_B32 : VOP2_32 <0x0000001c, "V_OR_B32", []>;
-defm V_XOR_B32 : VOP2_32 <0x0000001d, "V_XOR_B32", []>;
+defm V_AND_B32 : VOP2_32 <0x0000001b, "V_AND_B32",
+  [(set i32:$dst, (and i32:$src0, i32:$src1))]>;
+defm V_OR_B32 : VOP2_32 <0x0000001c, "V_OR_B32",
+  [(set i32:$dst, (or i32:$src0, i32:$src1))]
+>;
+defm V_XOR_B32 : VOP2_32 <0x0000001d, "V_XOR_B32",
+  [(set i32:$dst, (xor i32:$src0, i32:$src1))]
+>;
 
 } // End isCommutable = 1
 
@@ -1001,14 +1218,18 @@ defm V_MBCNT_HI_U32_B32 : VOP2_32 <0x00000024, "V_MBCNT_HI_U32_B32", []>;
 let isCommutable = 1, Defs = [VCC] in { // Carry-out goes to VCC
 // No patterns so that the scalar instructions are always selected.
 // The scalar versions will be replaced with vector when needed later.
-defm V_ADD_I32 : VOP2b_32 <0x00000025, "V_ADD_I32", [], VSrc_32>;
-defm V_SUB_I32 : VOP2b_32 <0x00000026, "V_SUB_I32", [], VSrc_32>;
+defm V_ADD_I32 : VOP2b_32 <0x00000025, "V_ADD_I32",
+  [(set i32:$dst, (add i32:$src0, i32:$src1))], VSrc_32>;
+defm V_SUB_I32 : VOP2b_32 <0x00000026, "V_SUB_I32",
+  [(set i32:$dst, (sub i32:$src0, i32:$src1))], VSrc_32>;
 defm V_SUBREV_I32 : VOP2b_32 <0x00000027, "V_SUBREV_I32", [], VSrc_32,
                               "V_SUB_I32">;
 
 let Uses = [VCC] in { // Carry-in comes from VCC
-defm V_ADDC_U32 : VOP2b_32 <0x00000028, "V_ADDC_U32", [], VReg_32>;
-defm V_SUBB_U32 : VOP2b_32 <0x00000029, "V_SUBB_U32", [], VReg_32>;
+defm V_ADDC_U32 : VOP2b_32 <0x00000028, "V_ADDC_U32",
+  [(set i32:$dst, (adde i32:$src0, i32:$src1))], VReg_32>;
+defm V_SUBB_U32 : VOP2b_32 <0x00000029, "V_SUBB_U32",
+  [(set i32:$dst, (sube i32:$src0, i32:$src1))], VReg_32>;
 defm V_SUBBREV_U32 : VOP2b_32 <0x0000002a, "V_SUBBREV_U32", [], VReg_32,
                                "V_SUBB_U32">;
 } // End Uses = [VCC]
@@ -1023,63 +1244,51 @@ defm V_CVT_PKRTZ_F16_F32 : VOP2_32 <0x0000002f, "V_CVT_PKRTZ_F16_F32",
 >;
 ////def V_CVT_PK_U16_U32 : VOP2_U16 <0x00000030, "V_CVT_PK_U16_U32", []>;
 ////def V_CVT_PK_I16_I32 : VOP2_I16 <0x00000031, "V_CVT_PK_I16_I32", []>;
-def S_CMP_EQ_I32 : SOPC_32 <0x00000000, "S_CMP_EQ_I32", []>;
-def S_CMP_LG_I32 : SOPC_32 <0x00000001, "S_CMP_LG_I32", []>;
-def S_CMP_GT_I32 : SOPC_32 <0x00000002, "S_CMP_GT_I32", []>;
-def S_CMP_GE_I32 : SOPC_32 <0x00000003, "S_CMP_GE_I32", []>;
-def S_CMP_LT_I32 : SOPC_32 <0x00000004, "S_CMP_LT_I32", []>;
-def S_CMP_LE_I32 : SOPC_32 <0x00000005, "S_CMP_LE_I32", []>;
-def S_CMP_EQ_U32 : SOPC_32 <0x00000006, "S_CMP_EQ_U32", []>;
-def S_CMP_LG_U32 : SOPC_32 <0x00000007, "S_CMP_LG_U32", []>;
-def S_CMP_GT_U32 : SOPC_32 <0x00000008, "S_CMP_GT_U32", []>;
-def S_CMP_GE_U32 : SOPC_32 <0x00000009, "S_CMP_GE_U32", []>;
-def S_CMP_LT_U32 : SOPC_32 <0x0000000a, "S_CMP_LT_U32", []>;
-def S_CMP_LE_U32 : SOPC_32 <0x0000000b, "S_CMP_LE_U32", []>;
-////def S_BITCMP0_B32 : SOPC_BITCMP0 <0x0000000c, "S_BITCMP0_B32", []>;
-////def S_BITCMP1_B32 : SOPC_BITCMP1 <0x0000000d, "S_BITCMP1_B32", []>;
-////def S_BITCMP0_B64 : SOPC_BITCMP0 <0x0000000e, "S_BITCMP0_B64", []>;
-////def S_BITCMP1_B64 : SOPC_BITCMP1 <0x0000000f, "S_BITCMP1_B64", []>;
-//def S_SETVSKIP : SOPC_ <0x00000010, "S_SETVSKIP", []>;
+
+//===----------------------------------------------------------------------===//
+// VOP3 Instructions
+//===----------------------------------------------------------------------===//
 
 let neverHasSideEffects = 1 in {
 
-def V_MAD_LEGACY_F32 : VOP3_32 <0x00000140, "V_MAD_LEGACY_F32", []>;
-def V_MAD_F32 : VOP3_32 <0x00000141, "V_MAD_F32", []>;
-def V_MAD_I32_I24 : VOP3_32 <0x00000142, "V_MAD_I32_I24",
-  [(set i32:$dst, (add (mul I24:$src0, I24:$src1), i32:$src2))]
+defm V_MAD_LEGACY_F32 : VOP3_32 <0x00000140, "V_MAD_LEGACY_F32", []>;
+defm V_MAD_F32 : VOP3_32 <0x00000141, "V_MAD_F32",
+  [(set f32:$dst, (fadd (fmul f32:$src0, f32:$src1), f32:$src2))]
+>;
+defm V_MAD_I32_I24 : VOP3_32 <0x00000142, "V_MAD_I32_I24",
+  [(set i32:$dst, (AMDGPUmad_i24 i32:$src0, i32:$src1, i32:$src2))]
 >;
-def V_MAD_U32_U24 : VOP3_32 <0x00000143, "V_MAD_U32_U24",
-  [(set i32:$dst, (add (mul U24:$src0, U24:$src1), i32:$src2))]
+defm V_MAD_U32_U24 : VOP3_32 <0x00000143, "V_MAD_U32_U24",
+  [(set i32:$dst, (AMDGPUmad_u24 i32:$src0, i32:$src1, i32:$src2))]
 >;
 
 } // End neverHasSideEffects
-def V_CUBEID_F32 : VOP3_32 <0x00000144, "V_CUBEID_F32", []>;
-def V_CUBESC_F32 : VOP3_32 <0x00000145, "V_CUBESC_F32", []>;
-def V_CUBETC_F32 : VOP3_32 <0x00000146, "V_CUBETC_F32", []>;
-def V_CUBEMA_F32 : VOP3_32 <0x00000147, "V_CUBEMA_F32", []>;
+
+defm V_CUBEID_F32 : VOP3_32 <0x00000144, "V_CUBEID_F32", []>;
+defm V_CUBESC_F32 : VOP3_32 <0x00000145, "V_CUBESC_F32", []>;
+defm V_CUBETC_F32 : VOP3_32 <0x00000146, "V_CUBETC_F32", []>;
+defm V_CUBEMA_F32 : VOP3_32 <0x00000147, "V_CUBEMA_F32", []>;
 
 let neverHasSideEffects = 1, mayLoad = 0, mayStore = 0 in {
-def V_BFE_U32 : VOP3_32 <0x00000148, "V_BFE_U32",
+defm V_BFE_U32 : VOP3_32 <0x00000148, "V_BFE_U32",
   [(set i32:$dst, (AMDGPUbfe_u32 i32:$src0, i32:$src1, i32:$src2))]>;
-def V_BFE_I32 : VOP3_32 <0x00000149, "V_BFE_I32",
+defm V_BFE_I32 : VOP3_32 <0x00000149, "V_BFE_I32",
   [(set i32:$dst, (AMDGPUbfe_i32 i32:$src0, i32:$src1, i32:$src2))]>;
 }
 
-def V_BFI_B32 : VOP3_32 <0x0000014a, "V_BFI_B32",
+defm V_BFI_B32 : VOP3_32 <0x0000014a, "V_BFI_B32",
   [(set i32:$dst, (AMDGPUbfi i32:$src0, i32:$src1, i32:$src2))]>;
-defm : BFIPatterns <V_BFI_B32>;
-def V_FMA_F32 : VOP3_32 <0x0000014b, "V_FMA_F32",
+defm V_FMA_F32 : VOP3_32 <0x0000014b, "V_FMA_F32",
   [(set f32:$dst, (fma f32:$src0, f32:$src1, f32:$src2))]
 >;
 def V_FMA_F64 : VOP3_64 <0x0000014c, "V_FMA_F64",
   [(set f64:$dst, (fma f64:$src0, f64:$src1, f64:$src2))]
 >;
 //def V_LERP_U8 : VOP3_U8 <0x0000014d, "V_LERP_U8", []>;
-def V_ALIGNBIT_B32 : VOP3_32 <0x0000014e, "V_ALIGNBIT_B32", []>;
-def : ROTRPattern <V_ALIGNBIT_B32>;
+defm V_ALIGNBIT_B32 : VOP3_32 <0x0000014e, "V_ALIGNBIT_B32", []>;
 
-def V_ALIGNBYTE_B32 : VOP3_32 <0x0000014f, "V_ALIGNBYTE_B32", []>;
-def V_MULLIT_F32 : VOP3_32 <0x00000150, "V_MULLIT_F32", []>;
+defm V_ALIGNBYTE_B32 : VOP3_32 <0x0000014f, "V_ALIGNBYTE_B32", []>;
+defm V_MULLIT_F32 : VOP3_32 <0x00000150, "V_MULLIT_F32", []>;
 ////def V_MIN3_F32 : VOP3_MIN3 <0x00000151, "V_MIN3_F32", []>;
 ////def V_MIN3_I32 : VOP3_MIN3 <0x00000152, "V_MIN3_I32", []>;
 ////def V_MIN3_U32 : VOP3_MIN3 <0x00000153, "V_MIN3_U32", []>;
@@ -1092,9 +1301,9 @@ def V_MULLIT_F32 : VOP3_32 <0x00000150, "V_MULLIT_F32", []>;
 //def V_SAD_U8 : VOP3_U8 <0x0000015a, "V_SAD_U8", []>;
 //def V_SAD_HI_U8 : VOP3_U8 <0x0000015b, "V_SAD_HI_U8", []>;
 //def V_SAD_U16 : VOP3_U16 <0x0000015c, "V_SAD_U16", []>;
-def V_SAD_U32 : VOP3_32 <0x0000015d, "V_SAD_U32", []>;
+defm V_SAD_U32 : VOP3_32 <0x0000015d, "V_SAD_U32", []>;
 ////def V_CVT_PK_U8_F32 : VOP3_U8 <0x0000015e, "V_CVT_PK_U8_F32", []>;
-def V_DIV_FIXUP_F32 : VOP3_32 <0x0000015f, "V_DIV_FIXUP_F32", []>;
+defm V_DIV_FIXUP_F32 : VOP3_32 <0x0000015f, "V_DIV_FIXUP_F32", []>;
 def V_DIV_FIXUP_F64 : VOP3_64 <0x00000160, "V_DIV_FIXUP_F64", []>;
 
 def V_LSHL_B64 : VOP3_64_Shift <0x00000161, "V_LSHL_B64",
@@ -1116,181 +1325,46 @@ def V_MAX_F64 : VOP3_64 <0x00000167, "V_MAX_F64", []>;
 
 } // isCommutable = 1
 
-def : Pat <
-  (fadd f64:$src0, f64:$src1),
-  (V_ADD_F64 $src0, $src1, (i64 0))
->;
-
-def : Pat <
-  (fmul f64:$src0, f64:$src1),
-  (V_MUL_F64 $src0, $src1, (i64 0))
->;
-
 def V_LDEXP_F64 : VOP3_64 <0x00000168, "V_LDEXP_F64", []>;
 
 let isCommutable = 1 in {
 
-def V_MUL_LO_U32 : VOP3_32 <0x00000169, "V_MUL_LO_U32", []>;
-def V_MUL_HI_U32 : VOP3_32 <0x0000016a, "V_MUL_HI_U32", []>;
-def V_MUL_LO_I32 : VOP3_32 <0x0000016b, "V_MUL_LO_I32", []>;
-def V_MUL_HI_I32 : VOP3_32 <0x0000016c, "V_MUL_HI_I32", []>;
+defm V_MUL_LO_U32 : VOP3_32 <0x00000169, "V_MUL_LO_U32", []>;
+defm V_MUL_HI_U32 : VOP3_32 <0x0000016a, "V_MUL_HI_U32", []>;
+defm V_MUL_LO_I32 : VOP3_32 <0x0000016b, "V_MUL_LO_I32", []>;
+defm V_MUL_HI_I32 : VOP3_32 <0x0000016c, "V_MUL_HI_I32", []>;
 
 } // isCommutable = 1
 
-def : Pat <
-  (mul i32:$src0, i32:$src1),
-  (V_MUL_LO_I32 $src0, $src1, (i32 0))
->;
-
-def : Pat <
-  (mulhu i32:$src0, i32:$src1),
-  (V_MUL_HI_U32 $src0, $src1, (i32 0))
->;
-
-def : Pat <
-  (mulhs i32:$src0, i32:$src1),
-  (V_MUL_HI_I32 $src0, $src1, (i32 0))
->;
-
-def V_DIV_SCALE_F32 : VOP3_32 <0x0000016d, "V_DIV_SCALE_F32", []>;
+defm V_DIV_SCALE_F32 : VOP3_32 <0x0000016d, "V_DIV_SCALE_F32", []>;
 def V_DIV_SCALE_F64 : VOP3_64 <0x0000016e, "V_DIV_SCALE_F64", []>;
-def V_DIV_FMAS_F32 : VOP3_32 <0x0000016f, "V_DIV_FMAS_F32", []>;
+defm V_DIV_FMAS_F32 : VOP3_32 <0x0000016f, "V_DIV_FMAS_F32", []>;
 def V_DIV_FMAS_F64 : VOP3_64 <0x00000170, "V_DIV_FMAS_F64", []>;
 //def V_MSAD_U8 : VOP3_U8 <0x00000171, "V_MSAD_U8", []>;
 //def V_QSAD_U8 : VOP3_U8 <0x00000172, "V_QSAD_U8", []>;
 //def V_MQSAD_U8 : VOP3_U8 <0x00000173, "V_MQSAD_U8", []>;
 def V_TRIG_PREOP_F64 : VOP3_64 <0x00000174, "V_TRIG_PREOP_F64", []>;
 
-let Defs = [SCC] in { // Carry out goes to SCC
-let isCommutable = 1 in {
-def S_ADD_U32 : SOP2_32 <0x00000000, "S_ADD_U32", []>;
-def S_ADD_I32 : SOP2_32 <0x00000002, "S_ADD_I32",
-  [(set i32:$dst, (add SSrc_32:$src0, SSrc_32:$src1))]
->;
-} // End isCommutable = 1
-
-def S_SUB_U32 : SOP2_32 <0x00000001, "S_SUB_U32", []>;
-def S_SUB_I32 : SOP2_32 <0x00000003, "S_SUB_I32",
-  [(set i32:$dst, (sub SSrc_32:$src0, SSrc_32:$src1))]
->;
-
-let Uses = [SCC] in { // Carry in comes from SCC
-let isCommutable = 1 in {
-def S_ADDC_U32 : SOP2_32 <0x00000004, "S_ADDC_U32",
-  [(set i32:$dst, (adde (i32 SSrc_32:$src0), (i32 SSrc_32:$src1)))]>;
-} // End isCommutable = 1
-
-def S_SUBB_U32 : SOP2_32 <0x00000005, "S_SUBB_U32",
-  [(set i32:$dst, (sube (i32 SSrc_32:$src0), (i32 SSrc_32:$src1)))]>;
-} // End Uses = [SCC]
-} // End Defs = [SCC]
-
-def S_MIN_I32 : SOP2_32 <0x00000006, "S_MIN_I32",
-  [(set i32:$dst, (AMDGPUsmin i32:$src0, i32:$src1))]
->;
-def S_MIN_U32 : SOP2_32 <0x00000007, "S_MIN_U32",
-  [(set i32:$dst, (AMDGPUumin i32:$src0, i32:$src1))]
->;
-def S_MAX_I32 : SOP2_32 <0x00000008, "S_MAX_I32",
-  [(set i32:$dst, (AMDGPUsmax i32:$src0, i32:$src1))]
->;
-def S_MAX_U32 : SOP2_32 <0x00000009, "S_MAX_U32",
-  [(set i32:$dst, (AMDGPUumax i32:$src0, i32:$src1))]
->;
-
-def S_CSELECT_B32 : SOP2 <
-  0x0000000a, (outs SReg_32:$dst),
-  (ins SReg_32:$src0, SReg_32:$src1, SCCReg:$scc), "S_CSELECT_B32",
-  []
->;
-
-def S_CSELECT_B64 : SOP2_64 <0x0000000b, "S_CSELECT_B64", []>;
-
-def S_AND_B32 : SOP2_32 <0x0000000e, "S_AND_B32",
-  [(set i32:$dst, (and i32:$src0, i32:$src1))]
->;
-
-def S_AND_B64 : SOP2_64 <0x0000000f, "S_AND_B64",
-  [(set i64:$dst, (and i64:$src0, i64:$src1))]
->;
-
-def : Pat <
-  (i1 (and i1:$src0, i1:$src1)),
-  (S_AND_B64 $src0, $src1)
->;
-
-def S_OR_B32 : SOP2_32 <0x00000010, "S_OR_B32",
-  [(set i32:$dst, (or i32:$src0, i32:$src1))]
->;
-
-def S_OR_B64 : SOP2_64 <0x00000011, "S_OR_B64",
-  [(set i64:$dst, (or i64:$src0, i64:$src1))]
->;
-
-def : Pat <
-  (i1 (or i1:$src0, i1:$src1)),
-  (S_OR_B64 $src0, $src1)
->;
+//===----------------------------------------------------------------------===//
+// Pseudo Instructions
+//===----------------------------------------------------------------------===//
 
-def S_XOR_B32 : SOP2_32 <0x00000012, "S_XOR_B32",
-  [(set i32:$dst, (xor i32:$src0, i32:$src1))]
->;
+let isCodeGenOnly = 1, isPseudo = 1 in {
 
-def S_XOR_B64 : SOP2_64 <0x00000013, "S_XOR_B64",
-  [(set i1:$dst, (xor i1:$src0, i1:$src1))]
+def V_MOV_I1 : InstSI <
+  (outs VReg_1:$dst),
+  (ins i1imm:$src),
+  "", [(set i1:$dst, (imm:$src))]
 >;
-def S_ANDN2_B32 : SOP2_32 <0x00000014, "S_ANDN2_B32", []>;
-def S_ANDN2_B64 : SOP2_64 <0x00000015, "S_ANDN2_B64", []>;
-def S_ORN2_B32 : SOP2_32 <0x00000016, "S_ORN2_B32", []>;
-def S_ORN2_B64 : SOP2_64 <0x00000017, "S_ORN2_B64", []>;
-def S_NAND_B32 : SOP2_32 <0x00000018, "S_NAND_B32", []>;
-def S_NAND_B64 : SOP2_64 <0x00000019, "S_NAND_B64", []>;
-def S_NOR_B32 : SOP2_32 <0x0000001a, "S_NOR_B32", []>;
-def S_NOR_B64 : SOP2_64 <0x0000001b, "S_NOR_B64", []>;
-def S_XNOR_B32 : SOP2_32 <0x0000001c, "S_XNOR_B32", []>;
-def S_XNOR_B64 : SOP2_64 <0x0000001d, "S_XNOR_B64", []>;
-
-// Use added complexity so these patterns are preferred to the VALU patterns.
-let AddedComplexity = 1 in {
 
-def S_LSHL_B32 : SOP2_32 <0x0000001e, "S_LSHL_B32",
-  [(set i32:$dst, (shl i32:$src0, i32:$src1))]
->;
-def S_LSHL_B64 : SOP2_SHIFT_64 <0x0000001f, "S_LSHL_B64",
-  [(set i64:$dst, (shl i64:$src0, i32:$src1))]
->;
-def S_LSHR_B32 : SOP2_32 <0x00000020, "S_LSHR_B32",
-  [(set i32:$dst, (srl i32:$src0, i32:$src1))]
->;
-def S_LSHR_B64 : SOP2_SHIFT_64 <0x00000021, "S_LSHR_B64",
-  [(set i64:$dst, (srl i64:$src0, i32:$src1))]
->;
-def S_ASHR_I32 : SOP2_32 <0x00000022, "S_ASHR_I32",
-  [(set i32:$dst, (sra i32:$src0, i32:$src1))]
->;
-def S_ASHR_I64 : SOP2_SHIFT_64 <0x00000023, "S_ASHR_I64",
-  [(set i64:$dst, (sra i64:$src0, i32:$src1))]
+def V_AND_I1 : InstSI <
+   (outs VReg_1:$dst), (ins VReg_1:$src0, VReg_1:$src1), "",
+   [(set i1:$dst, (and i1:$src0, i1:$src1))]
 >;
 
-} // End AddedComplexity = 1
-
-def S_BFM_B32 : SOP2_32 <0x00000024, "S_BFM_B32", []>;
-def S_BFM_B64 : SOP2_64 <0x00000025, "S_BFM_B64", []>;
-def S_MUL_I32 : SOP2_32 <0x00000026, "S_MUL_I32", []>;
-def S_BFE_U32 : SOP2_32 <0x00000027, "S_BFE_U32", []>;
-def S_BFE_I32 : SOP2_32 <0x00000028, "S_BFE_I32", []>;
-def S_BFE_U64 : SOP2_64 <0x00000029, "S_BFE_U64", []>;
-def S_BFE_I64 : SOP2_64 <0x0000002a, "S_BFE_I64", []>;
-//def S_CBRANCH_G_FORK : SOP2_ <0x0000002b, "S_CBRANCH_G_FORK", []>;
-def S_ABSDIFF_I32 : SOP2_32 <0x0000002c, "S_ABSDIFF_I32", []>;
-
-let isCodeGenOnly = 1, isPseudo = 1 in {
-
-def LOAD_CONST : AMDGPUShaderInst <
-  (outs GPRF32:$dst),
-  (ins i32imm:$src),
-  "LOAD_CONST $dst, $src",
-  [(set GPRF32:$dst, (int_AMDGPU_load_const imm:$src))]
+def V_OR_I1 : InstSI <
+   (outs VReg_1:$dst), (ins VReg_1:$src0, VReg_1:$src1), "",
+   [(set i1:$dst, (or i1:$src0, i1:$src1))]
 >;
 
 // SI pseudo instructions. These are used by the CFG structurizer pass
@@ -1301,19 +1375,19 @@ let mayLoad = 1, mayStore = 1, hasSideEffects = 1,
 
 let isBranch = 1, isTerminator = 1 in {
 
-def SI_IF : InstSI <
+def SI_IF: InstSI <
   (outs SReg_64:$dst),
   (ins SReg_64:$vcc, brtarget:$target),
-  "SI_IF $dst, $vcc, $target",
+  "",
   [(set i64:$dst, (int_SI_if i1:$vcc, bb:$target))]
 >;
 
 def SI_ELSE : InstSI <
   (outs SReg_64:$dst),
   (ins SReg_64:$src, brtarget:$target),
-  "SI_ELSE $dst, $src, $target",
-  [(set i64:$dst, (int_SI_else i64:$src, bb:$target))]> {
-
+  "",
+  [(set i64:$dst, (int_SI_else i64:$src, bb:$target))]
+> {
   let Constraints = "$src = $dst";
 }
 
@@ -1370,7 +1444,7 @@ let Uses = [EXEC], Defs = [EXEC,VCC,M0] in {
 
 let UseNamedOperandTable = 1 in {
 
-def SI_RegisterLoad : AMDGPUShaderInst <
+def SI_RegisterLoad : InstSI <
   (outs VReg_32:$dst, SReg_64:$temp),
   (ins FRAMEri32:$addr, i32imm:$chan),
   "", []
@@ -1379,7 +1453,7 @@ def SI_RegisterLoad : AMDGPUShaderInst <
   let mayLoad = 1;
 }
 
-class SIRegStore<dag outs> : AMDGPUShaderInst <
+class SIRegStore<dag outs> : InstSI <
   outs,
   (ins VReg_32:$val, FRAMEri32:$addr, i32imm:$chan),
   "", []
@@ -1439,8 +1513,33 @@ def V_SUB_F64 : InstSI <
 
 } // end usesCustomInserter
 
+multiclass SI_SPILL_SGPR <RegisterClass sgpr_class> {
+
+  def _SAVE : InstSI <
+    (outs VReg_32:$dst),
+    (ins sgpr_class:$src, i32imm:$frame_idx),
+    "", []
+  >;
+
+  def _RESTORE : InstSI <
+    (outs sgpr_class:$dst),
+    (ins VReg_32:$src, i32imm:$frame_idx),
+    "", []
+  >;
+
+}
+
+defm SI_SPILL_S64  : SI_SPILL_SGPR <SReg_64>;
+defm SI_SPILL_S128 : SI_SPILL_SGPR <SReg_128>;
+defm SI_SPILL_S256 : SI_SPILL_SGPR <SReg_256>;
+defm SI_SPILL_S512 : SI_SPILL_SGPR <SReg_512>;
+
 } // end IsCodeGenOnly, isPseudo
 
+} // end SubtargetPredicate = SI
+
+let Predicates = [isSI] in {
+
 def : Pat<
   (int_AMDGPU_cndlt f32:$src0, f32:$src1, f32:$src2),
   (V_CNDMASK_B32_e64 $src2, $src1, (V_CMP_GT_F32_e64 0, $src0))
@@ -1453,7 +1552,7 @@ def : Pat <
 
 /* int_SI_vs_load_input */
 def : Pat<
-  (SIload_input i128:$tlst, IMM12bit:$attr_offset, i32:$buf_idx_vgpr),
+  (SIload_input v4i32:$tlst, IMM12bit:$attr_offset, i32:$buf_idx_vgpr),
   (BUFFER_LOAD_FORMAT_XYZW_IDXEN $tlst, $buf_idx_vgpr, imm:$attr_offset, 0, 0, 0, 0)
 >;
 
@@ -1470,40 +1569,116 @@ def : Pat <
   (V_SUB_F64 $src0, $src1)
 >;
 
+//===----------------------------------------------------------------------===//
+// SMRD Patterns
+//===----------------------------------------------------------------------===//
+
+multiclass SMRD_Pattern <SMRD Instr_IMM, SMRD Instr_SGPR, ValueType vt> {
+
+  // 1. Offset as 8bit DWORD immediate
+  def : Pat <
+    (constant_load (add i64:$sbase, (i64 IMM8bitDWORD:$offset))),
+    (vt (Instr_IMM $sbase, (as_dword_i32imm $offset)))
+  >;
+
+  // 2. Offset loaded in an 32bit SGPR
+  def : Pat <
+    (constant_load (add i64:$sbase, (i64 IMM32bit:$offset))),
+    (vt (Instr_SGPR $sbase, (S_MOV_B32 (i32 (as_i32imm $offset)))))
+  >;
+
+  // 3. No offset at all
+  def : Pat <
+    (constant_load i64:$sbase),
+    (vt (Instr_IMM $sbase, 0))
+  >;
+}
+
+defm : SMRD_Pattern <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, f32>;
+defm : SMRD_Pattern <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, i32>;
+defm : SMRD_Pattern <S_LOAD_DWORDX2_IMM, S_LOAD_DWORDX2_SGPR, i64>;
+defm : SMRD_Pattern <S_LOAD_DWORDX2_IMM, S_LOAD_DWORDX2_SGPR, v2i32>;
+defm : SMRD_Pattern <S_LOAD_DWORDX4_IMM, S_LOAD_DWORDX4_SGPR, v4i32>;
+defm : SMRD_Pattern <S_LOAD_DWORDX8_IMM, S_LOAD_DWORDX8_SGPR, v32i8>;
+defm : SMRD_Pattern <S_LOAD_DWORDX8_IMM, S_LOAD_DWORDX8_SGPR, v8i32>;
+defm : SMRD_Pattern <S_LOAD_DWORDX16_IMM, S_LOAD_DWORDX16_SGPR, v16i32>;
+
+// 1. Offset as 8bit DWORD immediate
+def : Pat <
+  (SIload_constant v4i32:$sbase, IMM8bitDWORD:$offset),
+  (S_BUFFER_LOAD_DWORD_IMM $sbase, (as_dword_i32imm $offset))
+>;
+
+// 2. Offset loaded in an 32bit SGPR
+def : Pat <
+  (SIload_constant v4i32:$sbase, imm:$offset),
+  (S_BUFFER_LOAD_DWORD_SGPR $sbase, (S_MOV_B32 imm:$offset))
+>;
+
+//===----------------------------------------------------------------------===//
+// SOP2 Patterns
+//===----------------------------------------------------------------------===//
+
+def : Pat <
+  (i1 (xor i1:$src0, i1:$src1)),
+  (S_XOR_B64 $src0, $src1)
+>;
+
+//===----------------------------------------------------------------------===//
+// VOP2 Patterns
+//===----------------------------------------------------------------------===//
+
+def : Pat <
+  (or i64:$src0, i64:$src1),
+  (INSERT_SUBREG (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+    (V_OR_B32_e32 (EXTRACT_SUBREG i64:$src0, sub0),
+                  (EXTRACT_SUBREG i64:$src1, sub0)), sub0),
+    (V_OR_B32_e32 (EXTRACT_SUBREG i64:$src0, sub1),
+                  (EXTRACT_SUBREG i64:$src1, sub1)), sub1)
+>;
+
+class SextInReg <ValueType vt, int ShiftAmt> : Pat <
+  (sext_inreg i32:$src0, vt),
+  (V_ASHRREV_I32_e32 ShiftAmt, (V_LSHLREV_B32_e32 ShiftAmt, $src0))
+>;
+
+def : SextInReg <i8, 24>;
+def : SextInReg <i16, 16>;
+
 /********** ======================= **********/
 /********** Image sampling patterns **********/
 /********** ======================= **********/
 
 /* SIsample for simple 1D texture lookup */
 def : Pat <
-  (SIsample i32:$addr, v32i8:$rsrc, i128:$sampler, imm),
+  (SIsample i32:$addr, v32i8:$rsrc, v4i32:$sampler, imm),
   (IMAGE_SAMPLE_V4_V1 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc, $sampler)
 >;
 
 class SamplePattern<SDNode name, MIMG opcode, ValueType vt> : Pat <
-    (name vt:$addr, v32i8:$rsrc, i128:$sampler, imm),
+    (name vt:$addr, v32i8:$rsrc, v4i32:$sampler, imm),
     (opcode 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc, $sampler)
 >;
 
 class SampleRectPattern<SDNode name, MIMG opcode, ValueType vt> : Pat <
-    (name vt:$addr, v32i8:$rsrc, i128:$sampler, TEX_RECT),
+    (name vt:$addr, v32i8:$rsrc, v4i32:$sampler, TEX_RECT),
     (opcode 0xf, 1, 0, 0, 0, 0, 0, 0, $addr, $rsrc, $sampler)
 >;
 
 class SampleArrayPattern<SDNode name, MIMG opcode, ValueType vt> : Pat <
-    (name vt:$addr, v32i8:$rsrc, i128:$sampler, TEX_ARRAY),
+    (name vt:$addr, v32i8:$rsrc, v4i32:$sampler, TEX_ARRAY),
     (opcode 0xf, 0, 0, 1, 0, 0, 0, 0, $addr, $rsrc, $sampler)
 >;
 
 class SampleShadowPattern<SDNode name, MIMG opcode,
                           ValueType vt> : Pat <
-    (name vt:$addr, v32i8:$rsrc, i128:$sampler, TEX_SHADOW),
+    (name vt:$addr, v32i8:$rsrc, v4i32:$sampler, TEX_SHADOW),
     (opcode 0xf, 0, 0, 0, 0, 0, 0, 0, $addr, $rsrc, $sampler)
 >;
 
 class SampleShadowArrayPattern<SDNode name, MIMG opcode,
                                ValueType vt> : Pat <
-    (name vt:$addr, v32i8:$rsrc, i128:$sampler, TEX_SHADOW_ARRAY),
+    (name vt:$addr, v32i8:$rsrc, v4i32:$sampler, TEX_SHADOW_ARRAY),
     (opcode 0xf, 0, 0, 1, 0, 0, 0, 0, $addr, $rsrc, $sampler)
 >;
 
@@ -1692,8 +1867,6 @@ def : BitConvert <i64, v2i32, VReg_64>;
 
 def : BitConvert <v4f32, v4i32, VReg_128>;
 def : BitConvert <v4i32, v4f32, VReg_128>;
-def : BitConvert <v4i32, i128,  VReg_128>;
-def : BitConvert <i128, v4i32,  VReg_128>;
 
 def : BitConvert <v8f32, v8i32, SReg_256>;
 def : BitConvert <v8i32, v8f32, SReg_256>;
@@ -1711,10 +1884,18 @@ def : BitConvert <v16f32, v16i32, VReg_512>;
 /********** Src & Dst modifiers **********/
 /********** =================== **********/
 
+def FCLAMP_SI : AMDGPUShaderInst <
+  (outs VReg_32:$dst),
+  (ins VSrc_32:$src0),
+  "FCLAMP_SI $dst, $src0",
+  []
+> {
+  let usesCustomInserter = 1;
+}
+
 def : Pat <
   (int_AMDIL_clamp f32:$src, (f32 FP_ZERO), (f32 FP_ONE)),
-  (V_ADD_F32_e64 $src, (i32 0 /* SRC1 */),
-   0 /* ABS */, 1 /* CLAMP */, 0 /* OMOD */, 0 /* NEG */)
+  (FCLAMP_SI f32:$src)
 >;
 
 /********** ================================ **********/
@@ -1733,14 +1914,32 @@ def : Pat <
   (V_OR_B32_e32 $src, (V_MOV_B32_e32 0x80000000)) /* Set sign bit */
 >;
 
+def FABS_SI : AMDGPUShaderInst <
+  (outs VReg_32:$dst),
+  (ins VSrc_32:$src0),
+  "FABS_SI $dst, $src0",
+  []
+> {
+  let usesCustomInserter = 1;
+}
+
 def : Pat <
   (fabs f32:$src),
-  (V_AND_B32_e32 $src, (V_MOV_B32_e32 0x7fffffff)) /* Clear sign bit */
+  (FABS_SI f32:$src)
 >;
 
+def FNEG_SI : AMDGPUShaderInst <
+  (outs VReg_32:$dst),
+  (ins VSrc_32:$src0),
+  "FNEG_SI $dst, $src0",
+  []
+> {
+  let usesCustomInserter = 1;
+}
+
 def : Pat <
   (fneg f32:$src),
-  (V_XOR_B32_e32 $src, (V_MOV_B32_e32 0x80000000)) /* Toggle sign bit */
+  (FNEG_SI f32:$src)
 >;
 
 /********** ================== **********/
@@ -1768,30 +1967,10 @@ def : Pat <
 >;
 
 def : Pat <
-  (i1 imm:$imm),
-  (S_MOV_B64 imm:$imm)
->;
-
-def : Pat <
   (i64 InlineImm<i64>:$imm),
   (S_MOV_B64 InlineImm<i64>:$imm)
 >;
 
-// i64 immediates aren't supported in hardware, split it into two 32bit values
-def : Pat <
-  (i64 imm:$imm),
-  (INSERT_SUBREG (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
-    (S_MOV_B32 (i32 (LO32 imm:$imm))), sub0),
-    (S_MOV_B32 (i32 (HI32 imm:$imm))), sub1)
->;
-
-def : Pat <
-  (f64 fpimm:$imm),
-  (INSERT_SUBREG (INSERT_SUBREG (f64 (IMPLICIT_DEF)),
-    (V_MOV_B32_e32 (f32 (LO32f fpimm:$imm))), sub0),
-    (V_MOV_B32_e32 (f32 (HI32f fpimm:$imm))), sub1)
->;
-
 /********** ===================== **********/
 /********** Interpolation Paterns **********/
 /********** ===================== **********/
@@ -1875,21 +2054,9 @@ class Ext32Pat <SDNode ext> : Pat <
 def : Ext32Pat <zext>;
 def : Ext32Pat <anyext>;
 
-// 1. Offset as 8bit DWORD immediate
+// Offset in an 32Bit VGPR
 def : Pat <
-  (SIload_constant i128:$sbase, IMM8bitDWORD:$offset),
-  (S_BUFFER_LOAD_DWORD_IMM $sbase, (as_dword_i32imm $offset))
->;
-
-// 2. Offset loaded in an 32bit SGPR
-def : Pat <
-  (SIload_constant i128:$sbase, imm:$offset),
-  (S_BUFFER_LOAD_DWORD_SGPR $sbase, (S_MOV_B32 imm:$offset))
->;
-
-// 3. Offset in an 32Bit VGPR
-def : Pat <
-  (SIload_constant i128:$sbase, i32:$voff),
+  (SIload_constant v4i32:$sbase, i32:$voff),
   (BUFFER_LOAD_DWORD_OFFEN $sbase, $voff, 0, 0, 0, 0)
 >;
 
@@ -1904,18 +2071,44 @@ def : Pat <
 def : Pat <
   (int_SI_tid),
   (V_MBCNT_HI_U32_B32_e32 0xffffffff,
-                          (V_MBCNT_LO_U32_B32_e64 0xffffffff, 0, 0, 0, 0, 0))
+                          (V_MBCNT_LO_U32_B32_e64 0xffffffff, 0, 0, 0))
 >;
 
-/********** ================== **********/
-/**********   VOP3 Patterns    **********/
-/********** ================== **********/
+//===----------------------------------------------------------------------===//
+// VOP3 Patterns
+//===----------------------------------------------------------------------===//
+
+def : IMad24Pat<V_MAD_I32_I24>;
+def : UMad24Pat<V_MAD_U32_U24>;
 
 def : Pat <
-  (f32 (fadd (fmul f32:$src0, f32:$src1), f32:$src2)),
-  (V_MAD_F32 $src0, $src1, $src2)
+  (fadd f64:$src0, f64:$src1),
+  (V_ADD_F64 $src0, $src1, (i64 0))
+>;
+
+def : Pat <
+  (fmul f64:$src0, f64:$src1),
+  (V_MUL_F64 $src0, $src1, (i64 0))
+>;
+
+def : Pat <
+  (mul i32:$src0, i32:$src1),
+  (V_MUL_LO_I32 $src0, $src1, (i32 0))
+>;
+
+def : Pat <
+  (mulhu i32:$src0, i32:$src1),
+  (V_MUL_HI_U32 $src0, $src1, (i32 0))
+>;
+
+def : Pat <
+  (mulhs i32:$src0, i32:$src1),
+  (V_MUL_HI_I32 $src0, $src1, (i32 0))
 >;
 
+defm : BFIPatterns <V_BFI_B32>;
+def : ROTRPattern <V_ALIGNBIT_B32>;
+
 /********** ======================= **********/
 /**********   Load/Store Patterns   **********/
 /********** ======================= **********/
@@ -1962,41 +2155,6 @@ def : Pat <(atomic_load_add_local i32:$ptr, i32:$val),
 def : Pat <(atomic_load_sub_local i32:$ptr, i32:$val),
            (DS_SUB_U32_RTN 0, $ptr, $val, 0)>;
 
-/********** ================== **********/
-/**********   SMRD Patterns    **********/
-/********** ================== **********/
-
-multiclass SMRD_Pattern <SMRD Instr_IMM, SMRD Instr_SGPR, ValueType vt> {
-
-  // 1. Offset as 8bit DWORD immediate
-  def : Pat <
-    (constant_load (add i64:$sbase, (i64 IMM8bitDWORD:$offset))),
-    (vt (Instr_IMM $sbase, (as_dword_i32imm $offset)))
-  >;
-
-  // 2. Offset loaded in an 32bit SGPR
-  def : Pat <
-    (constant_load (SIadd64bit32bit i64:$sbase, imm:$offset)),
-    (vt (Instr_SGPR $sbase, (S_MOV_B32 imm:$offset)))
-  >;
-
-  // 3. No offset at all
-  def : Pat <
-    (constant_load i64:$sbase),
-    (vt (Instr_IMM $sbase, 0))
-  >;
-}
-
-defm : SMRD_Pattern <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, f32>;
-defm : SMRD_Pattern <S_LOAD_DWORD_IMM, S_LOAD_DWORD_SGPR, i32>;
-defm : SMRD_Pattern <S_LOAD_DWORDX2_IMM, S_LOAD_DWORDX2_SGPR, i64>;
-defm : SMRD_Pattern <S_LOAD_DWORDX2_IMM, S_LOAD_DWORDX2_SGPR, v2i32>;
-defm : SMRD_Pattern <S_LOAD_DWORDX4_IMM, S_LOAD_DWORDX4_SGPR, i128>;
-defm : SMRD_Pattern <S_LOAD_DWORDX4_IMM, S_LOAD_DWORDX4_SGPR, v4i32>;
-defm : SMRD_Pattern <S_LOAD_DWORDX8_IMM, S_LOAD_DWORDX8_SGPR, v32i8>;
-defm : SMRD_Pattern <S_LOAD_DWORDX8_IMM, S_LOAD_DWORDX8_SGPR, v8i32>;
-defm : SMRD_Pattern <S_LOAD_DWORDX16_IMM, S_LOAD_DWORDX16_SGPR, v16i32>;
-
 //===----------------------------------------------------------------------===//
 // MUBUF Patterns
 //===----------------------------------------------------------------------===//
@@ -2083,7 +2241,7 @@ multiclass MUBUF_Load_Dword <ValueType vt, MUBUF offset, MUBUF offen, MUBUF idxe
                              MUBUF bothen> {
 
   def : Pat <
-    (vt (int_SI_buffer_load_dword i128:$rsrc, i32:$vaddr, i32:$soffset,
+    (vt (int_SI_buffer_load_dword v4i32:$rsrc, i32:$vaddr, i32:$soffset,
                                   imm:$offset, 0, 0, imm:$glc, imm:$slc,
                                   imm:$tfe)),
     (offset $rsrc, $vaddr, (as_i16imm $offset), $soffset, (as_i1imm $glc),
@@ -2091,7 +2249,7 @@ multiclass MUBUF_Load_Dword <ValueType vt, MUBUF offset, MUBUF offen, MUBUF idxe
   >;
 
   def : Pat <
-    (vt (int_SI_buffer_load_dword i128:$rsrc, i32:$vaddr, i32:$soffset,
+    (vt (int_SI_buffer_load_dword v4i32:$rsrc, i32:$vaddr, i32:$soffset,
                                   imm, 1, 0, imm:$glc, imm:$slc,
                                   imm:$tfe)),
     (offen $rsrc, $vaddr, $soffset, (as_i1imm $glc), (as_i1imm $slc),
@@ -2099,7 +2257,7 @@ multiclass MUBUF_Load_Dword <ValueType vt, MUBUF offset, MUBUF offen, MUBUF idxe
   >;
 
   def : Pat <
-    (vt (int_SI_buffer_load_dword i128:$rsrc, i32:$vaddr, i32:$soffset,
+    (vt (int_SI_buffer_load_dword v4i32:$rsrc, i32:$vaddr, i32:$soffset,
                                   imm:$offset, 0, 1, imm:$glc, imm:$slc,
                                   imm:$tfe)),
     (idxen $rsrc, $vaddr, (as_i16imm $offset), $soffset, (as_i1imm $glc),
@@ -2107,7 +2265,7 @@ multiclass MUBUF_Load_Dword <ValueType vt, MUBUF offset, MUBUF offen, MUBUF idxe
   >;
 
   def : Pat <
-    (vt (int_SI_buffer_load_dword i128:$rsrc, v2i32:$vaddr, i32:$soffset,
+    (vt (int_SI_buffer_load_dword v4i32:$rsrc, v2i32:$vaddr, i32:$soffset,
                                   imm, 1, 1, imm:$glc, imm:$slc,
                                   imm:$tfe)),
     (bothen $rsrc, $vaddr, $soffset, (as_i1imm $glc), (as_i1imm $slc),
@@ -2128,7 +2286,7 @@ defm : MUBUF_Load_Dword <v4i32, BUFFER_LOAD_DWORDX4_OFFSET, BUFFER_LOAD_DWORDX4_
 
 // TBUFFER_STORE_FORMAT_*, addr64=0
 class MTBUF_StoreResource <ValueType vt, int num_channels, MTBUF opcode> : Pat<
-  (SItbuffer_store i128:$rsrc, vt:$vdata, num_channels, i32:$vaddr,
+  (SItbuffer_store v4i32:$rsrc, vt:$vdata, num_channels, i32:$vaddr,
                    i32:$soffset, imm:$inst_offset, imm:$dfmt,
                    imm:$nfmt, imm:$offen, imm:$idxen,
                    imm:$glc, imm:$slc, imm:$tfe),
@@ -2156,12 +2314,13 @@ defm V_CEIL_F64 : VOP1_64 <0x00000018, "V_CEIL_F64",
 defm V_FLOOR_F64 : VOP1_64 <0x0000001A, "V_FLOOR_F64",
   [(set f64:$dst, (ffloor f64:$src0))]
 >;
+defm V_RNDNE_F64 : VOP1_64 <0x00000019, "V_RNDNE_F64",
+  [(set f64:$dst, (frint f64:$src0))]
+>;
 
-defm V_RNDNE_F64 : VOP1_64 <0x00000019, "V_RNDNE_F64", []>;
-
-def V_QSAD_PK_U16_U8 : VOP3_32 <0x00000173, "V_QSAD_PK_U16_U8", []>;
-def V_MQSAD_U16_U8 : VOP3_32 <0x000000172, "V_MQSAD_U16_U8", []>;
-def V_MQSAD_U32_U8 : VOP3_32 <0x00000175, "V_MQSAD_U32_U8", []>;
+defm V_QSAD_PK_U16_U8 : VOP3_32 <0x00000173, "V_QSAD_PK_U16_U8", []>;
+defm V_MQSAD_U16_U8 : VOP3_32 <0x000000172, "V_MQSAD_U16_U8", []>;
+defm V_MQSAD_U32_U8 : VOP3_32 <0x00000175, "V_MQSAD_U32_U8", []>;
 def V_MAD_U64_U32 : VOP3_64 <0x00000176, "V_MAD_U64_U32", []>;
 
 // XXX - Does this set VCC?
@@ -2248,17 +2407,43 @@ def : Pat<
 >;
 
 //===----------------------------------------------------------------------===//
-// Miscellaneous Patterns
+// Conversion Patterns
 //===----------------------------------------------------------------------===//
 
+def : Pat<(i32 (sext_inreg i32:$src, i1)),
+  (S_BFE_I32 i32:$src, 65536)>; // 0 | 1 << 16
+
+// TODO: Match 64-bit BFE. SI has a 64-bit BFE, but it's scalar only so it
+// might not be worth the effort, and will need to expand to shifts when
+// fixing SGPR copies.
+
+// Handle sext_inreg in i64
 def : Pat <
-  (i64 (trunc i128:$x)),
+  (i64 (sext_inreg i64:$src, i1)),
   (INSERT_SUBREG (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
-    (i32 (EXTRACT_SUBREG $x, sub0)), sub0),
-    (i32 (EXTRACT_SUBREG $x, sub1)), sub1)
+    (S_BFE_I32 (EXTRACT_SUBREG i64:$src, sub0), 65536), sub0), // 0 | 1 << 16
+    (S_MOV_B32 -1), sub1)
 >;
 
 def : Pat <
+  (i64 (sext_inreg i64:$src, i8)),
+  (INSERT_SUBREG (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+    (S_SEXT_I32_I8 (EXTRACT_SUBREG i64:$src, sub0)), sub0),
+    (S_MOV_B32 -1), sub1)
+>;
+
+def : Pat <
+  (i64 (sext_inreg i64:$src, i16)),
+  (INSERT_SUBREG (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+    (S_SEXT_I32_I16 (EXTRACT_SUBREG i64:$src, sub0)), sub0),
+    (S_MOV_B32 -1), sub1)
+>;
+
+//===----------------------------------------------------------------------===//
+// Miscellaneous Patterns
+//===----------------------------------------------------------------------===//
+
+def : Pat <
   (i32 (trunc i64:$a)),
   (EXTRACT_SUBREG $a, sub0)
 >;
diff --git a/lib/Target/R600/SILowerControlFlow.cpp b/lib/Target/R600/SILowerControlFlow.cpp
index c2f8696..6601f2a 100644
--- a/lib/Target/R600/SILowerControlFlow.cpp
+++ b/lib/Target/R600/SILowerControlFlow.cpp
@@ -67,7 +67,7 @@ private:
   static const unsigned SkipThreshold = 12;
 
   static char ID;
-  const TargetRegisterInfo *TRI;
+  const SIRegisterInfo *TRI;
   const SIInstrInfo *TII;
 
   bool shouldSkip(MachineBasicBlock *From, MachineBasicBlock *To);
@@ -92,11 +92,11 @@ private:
 
 public:
   SILowerControlFlowPass(TargetMachine &tm) :
-    MachineFunctionPass(ID), TRI(0), TII(0) { }
+    MachineFunctionPass(ID), TRI(nullptr), TII(nullptr) { }
 
-  virtual bool runOnMachineFunction(MachineFunction &MF);
+  bool runOnMachineFunction(MachineFunction &MF) override;
 
-  const char *getPassName() const {
+  const char *getPassName() const override {
     return "SI Lower control flow instructions";
   }
 
@@ -427,7 +427,7 @@ void SILowerControlFlowPass::IndirectDst(MachineInstr &MI) {
 
 bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
   TII = static_cast<const SIInstrInfo*>(MF.getTarget().getInstrInfo());
-  TRI = MF.getTarget().getRegisterInfo();
+  TRI = static_cast<const SIRegisterInfo*>(MF.getTarget().getRegisterInfo());
   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
 
   bool HaveKill = false;
diff --git a/lib/Target/R600/SILowerI1Copies.cpp b/lib/Target/R600/SILowerI1Copies.cpp
new file mode 100644
index 0000000..738c90b
--- /dev/null
+++ b/lib/Target/R600/SILowerI1Copies.cpp
@@ -0,0 +1,148 @@
+//===-- SILowerI1Copies.cpp - Lower I1 Copies -----------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+/// i1 values are usually inserted by the CFG Structurize pass and they are
+/// unique in that they can be copied from VALU to SALU registers.
+/// This is not possible for any other value type.  Since there are no
+/// MOV instructions for i1, we to use V_CMP_* and V_CNDMASK to move the i1.
+///
+//===----------------------------------------------------------------------===//
+//
+
+#define DEBUG_TYPE "si-i1-copies"
+#include "AMDGPU.h"
+#include "SIInstrInfo.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+
+namespace {
+
+class SILowerI1Copies : public MachineFunctionPass {
+public:
+  static char ID;
+
+public:
+  SILowerI1Copies() : MachineFunctionPass(ID) {
+    initializeSILowerI1CopiesPass(*PassRegistry::getPassRegistry());
+  }
+
+  virtual bool runOnMachineFunction(MachineFunction &MF) override;
+
+  virtual const char *getPassName() const override {
+    return "SI Lower il Copies";
+  }
+
+  virtual void getAnalysisUsage(AnalysisUsage &AU) const override {
+  AU.addRequired<MachineDominatorTree>();
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+};
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS_BEGIN(SILowerI1Copies, DEBUG_TYPE,
+                      "SI Lower il Copies", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_END(SILowerI1Copies, DEBUG_TYPE,
+                    "SI Lower il Copies", false, false)
+
+char SILowerI1Copies::ID = 0;
+
+char &llvm::SILowerI1CopiesID = SILowerI1Copies::ID;
+
+FunctionPass *llvm::createSILowerI1CopiesPass() {
+  return new SILowerI1Copies();
+}
+
+bool SILowerI1Copies::runOnMachineFunction(MachineFunction &MF) {
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(
+      MF.getTarget().getInstrInfo());
+  const TargetRegisterInfo *TRI = MF.getTarget().getRegisterInfo();
+  std::vector<unsigned> I1Defs;
+
+  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
+                                                  BI != BE; ++BI) {
+
+    MachineBasicBlock &MBB = *BI;
+    MachineBasicBlock::iterator I, Next;
+    for (I = MBB.begin(); I != MBB.end(); I = Next) {
+      Next = std::next(I);
+      MachineInstr &MI = *I;
+
+      if (MI.getOpcode() == AMDGPU::V_MOV_I1) {
+        I1Defs.push_back(MI.getOperand(0).getReg());
+        MI.setDesc(TII->get(AMDGPU::V_MOV_B32_e32));
+        continue;
+      }
+
+      if (MI.getOpcode() == AMDGPU::V_AND_I1) {
+        I1Defs.push_back(MI.getOperand(0).getReg());
+        MI.setDesc(TII->get(AMDGPU::V_AND_B32_e32));
+        continue;
+      }
+
+      if (MI.getOpcode() == AMDGPU::V_OR_I1) {
+        I1Defs.push_back(MI.getOperand(0).getReg());
+        MI.setDesc(TII->get(AMDGPU::V_OR_B32_e32));
+        continue;
+      }
+
+      if (MI.getOpcode() != AMDGPU::COPY ||
+          !TargetRegisterInfo::isVirtualRegister(MI.getOperand(0).getReg()) ||
+          !TargetRegisterInfo::isVirtualRegister(MI.getOperand(1).getReg()))
+        continue;
+
+
+      const TargetRegisterClass *DstRC =
+          MRI.getRegClass(MI.getOperand(0).getReg());
+      const TargetRegisterClass *SrcRC =
+          MRI.getRegClass(MI.getOperand(1).getReg());
+
+      if (DstRC == &AMDGPU::VReg_1RegClass &&
+          TRI->getCommonSubClass(SrcRC, &AMDGPU::SGPR_64RegClass)) {
+        I1Defs.push_back(MI.getOperand(0).getReg());
+        BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(AMDGPU::V_CNDMASK_B32_e64))
+                .addOperand(MI.getOperand(0))
+                .addImm(0)
+                .addImm(-1)
+                .addOperand(MI.getOperand(1))
+                .addImm(0)
+                .addImm(0)
+                .addImm(0)
+                .addImm(0);
+        MI.eraseFromParent();
+      } else if (TRI->getCommonSubClass(DstRC, &AMDGPU::SGPR_64RegClass) &&
+                 SrcRC == &AMDGPU::VReg_1RegClass) {
+        BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(AMDGPU::V_CMP_NE_I32_e64))
+                .addOperand(MI.getOperand(0))
+                .addImm(0)
+                .addOperand(MI.getOperand(1))
+                .addImm(0)
+                .addImm(0)
+                .addImm(0)
+                .addImm(0);
+        MI.eraseFromParent();
+      }
+    }
+  }
+
+  for (unsigned Reg : I1Defs)
+    MRI.setRegClass(Reg, &AMDGPU::VReg_32RegClass);
+
+  return false;
+}
diff --git a/lib/Target/R600/SIMachineFunctionInfo.cpp b/lib/Target/R600/SIMachineFunctionInfo.cpp
index ea04346..af60995 100644
--- a/lib/Target/R600/SIMachineFunctionInfo.cpp
+++ b/lib/Target/R600/SIMachineFunctionInfo.cpp
@@ -10,8 +10,11 @@
 
 
 #include "SIMachineFunctionInfo.h"
+#include "SIInstrInfo.h"
 #include "SIRegisterInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/LLVMContext.h"
 
 #define MAX_LANES 64
 
@@ -26,21 +29,57 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
     PSInputAddr(0),
     SpillTracker() { }
 
-static unsigned createLaneVGPR(MachineRegisterInfo &MRI) {
-  return MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);
+static unsigned createLaneVGPR(MachineRegisterInfo &MRI, MachineFunction *MF) {
+  unsigned VGPR = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);
+
+  // We need to add this register as live out for the function, in order to
+  // have the live range calculated directly.
+  //
+  // When register spilling begins, we have already calculated the live
+  // live intervals for all the registers.  Since we are spilling SGPRs to
+  // VGPRs, we need to update the Lane VGPR's live interval every time we
+  // spill or restore a register.
+  //
+  // Unfortunately, there is no good way to update the live interval as
+  // the TargetInstrInfo callbacks for spilling and restoring don't give
+  // us access to the live interval information.
+  //
+  // We are lucky, though, because the InlineSpiller calls
+  // LiveRangeEdit::calculateRegClassAndHint() which iterates through
+  // all the new register that have been created when restoring a register
+  // and calls LiveIntervals::getInterval(), which creates and computes
+  // the live interval for the newly created register.  However, once this
+  // live intervals is created, it doesn't change and since we usually reuse
+  // the Lane VGPR multiple times, this means any uses after the first aren't
+  // added to the live interval.
+  //
+  // To work around this, we add Lane VGPRs to the functions live out list,
+  // so that we can guarantee its live range will cover all of its uses.
+
+  for (MachineBasicBlock &MBB : *MF) {
+    if (MBB.back().getOpcode() == AMDGPU::S_ENDPGM) {
+      MBB.back().addOperand(*MF, MachineOperand::CreateReg(VGPR, false, true));
+      return VGPR;
+    }
+  }
+  MF->getFunction()->getContext().emitError(
+      "Could not found S_ENGPGM instrtuction.");
+  return VGPR;
 }
 
-unsigned SIMachineFunctionInfo::RegSpillTracker::getNextLane(MachineRegisterInfo &MRI) {
+unsigned SIMachineFunctionInfo::RegSpillTracker::reserveLanes(
+    MachineRegisterInfo &MRI, MachineFunction *MF, unsigned NumRegs) {
+  unsigned StartLane = CurrentLane;
+  CurrentLane += NumRegs;
   if (!LaneVGPR) {
-    LaneVGPR = createLaneVGPR(MRI);
+    LaneVGPR = createLaneVGPR(MRI, MF);
   } else {
-    CurrentLane++;
-    if (CurrentLane == MAX_LANES) {
-      CurrentLane = 0;
-      LaneVGPR = createLaneVGPR(MRI);
+    if (CurrentLane >= MAX_LANES) {
+      StartLane = CurrentLane = 0;
+      LaneVGPR = createLaneVGPR(MRI, MF);
     }
   }
-  return CurrentLane;
+  return StartLane;
 }
 
 void SIMachineFunctionInfo::RegSpillTracker::addSpilledReg(unsigned FrameIndex,
diff --git a/lib/Target/R600/SIMachineFunctionInfo.h b/lib/Target/R600/SIMachineFunctionInfo.h
index 8dc82a0..96e619b 100644
--- a/lib/Target/R600/SIMachineFunctionInfo.h
+++ b/lib/Target/R600/SIMachineFunctionInfo.h
@@ -25,7 +25,7 @@ class MachineRegisterInfo;
 /// This class keeps track of the SPI_SP_INPUT_ADDR config register, which
 /// tells the hardware which interpolation parameters to load.
 class SIMachineFunctionInfo : public AMDGPUMachineFunction {
-  virtual void anchor();
+  void anchor() override;
 public:
 
   struct SpilledReg {
@@ -43,7 +43,12 @@ public:
   public:
     unsigned LaneVGPR;
     RegSpillTracker() : CurrentLane(0), SpilledRegisters(), LaneVGPR(0) { }
-    unsigned getNextLane(MachineRegisterInfo &MRI);
+    /// \p NumRegs The number of consecutive registers what need to be spilled.
+    ///            This function will ensure that all registers are stored in
+    ///            the same VGPR.
+    /// \returns The lane to be used for storing the first register.
+    unsigned reserveLanes(MachineRegisterInfo &MRI, MachineFunction *MF,
+                          unsigned NumRegs = 1);
     void addSpilledReg(unsigned FrameIndex, unsigned Reg, int Lane = -1);
     const SpilledReg& getSpilledReg(unsigned FrameIndex);
     bool programSpillsRegisters() { return !SpilledRegisters.empty(); }
diff --git a/lib/Target/R600/SIRegisterInfo.cpp b/lib/Target/R600/SIRegisterInfo.cpp
index 6cef195..c72d549 100644
--- a/lib/Target/R600/SIRegisterInfo.cpp
+++ b/lib/Target/R600/SIRegisterInfo.cpp
@@ -71,13 +71,12 @@ const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const {
     &AMDGPU::SReg_256RegClass
   };
 
-  for (unsigned i = 0, e = sizeof(BaseClasses) /
-                           sizeof(const TargetRegisterClass*); i != e; ++i) {
-    if (BaseClasses[i]->contains(Reg)) {
-      return BaseClasses[i];
+  for (const TargetRegisterClass *BaseClass : BaseClasses) {
+    if (BaseClass->contains(Reg)) {
+      return BaseClass;
     }
   }
-  return NULL;
+  return nullptr;
 }
 
 bool SIRegisterInfo::isSGPRClass(const TargetRegisterClass *RC) const {
@@ -113,7 +112,7 @@ const TargetRegisterClass *SIRegisterInfo::getEquivalentVGPRClass(
     } else if (getCommonSubClass(SRC, &AMDGPU::SReg_512RegClass)) {
       return &AMDGPU::VReg_512RegClass;
     }
-    return NULL;
+    return nullptr;
 }
 
 const TargetRegisterClass *SIRegisterInfo::getSubRegClass(
@@ -129,3 +128,10 @@ const TargetRegisterClass *SIRegisterInfo::getSubRegClass(
     return &AMDGPU::VGPR_32RegClass;
   }
 }
+
+unsigned SIRegisterInfo::getPhysRegSubReg(unsigned Reg,
+                                          const TargetRegisterClass *SubRC,
+                                          unsigned Channel) const {
+  unsigned Index = getHWRegIndex(Reg);
+  return SubRC->getRegister(Index + Channel);
+}
diff --git a/lib/Target/R600/SIRegisterInfo.h b/lib/Target/R600/SIRegisterInfo.h
index 8148f7f..36b4fcd 100644
--- a/lib/Target/R600/SIRegisterInfo.h
+++ b/lib/Target/R600/SIRegisterInfo.h
@@ -27,22 +27,22 @@ struct SIRegisterInfo : public AMDGPURegisterInfo {
 
   SIRegisterInfo(AMDGPUTargetMachine &tm);
 
-  virtual BitVector getReservedRegs(const MachineFunction &MF) const;
+  BitVector getReservedRegs(const MachineFunction &MF) const override;
 
-  virtual unsigned getRegPressureLimit(const TargetRegisterClass *RC,
-                                       MachineFunction &MF) const;
+  unsigned getRegPressureLimit(const TargetRegisterClass *RC,
+                               MachineFunction &MF) const override;
 
   /// \param RC is an AMDIL reg class.
   ///
   /// \returns the SI register class that is equivalent to \p RC.
-  virtual const TargetRegisterClass *
-    getISARegClass(const TargetRegisterClass *RC) const;
+  const TargetRegisterClass *
+    getISARegClass(const TargetRegisterClass *RC) const override;
 
   /// \brief get the register class of the specified type to use in the
   /// CFGStructurizer
-  virtual const TargetRegisterClass * getCFGStructurizerRegClass(MVT VT) const;
+  const TargetRegisterClass * getCFGStructurizerRegClass(MVT VT) const override;
 
-  virtual unsigned getHWRegIndex(unsigned Reg) const;
+  unsigned getHWRegIndex(unsigned Reg) const override;
 
   /// \brief Return the 'base' register class for this register.
   /// e.g. SGPR0 => SReg_32, VGPR => VReg_32 SGPR0_SGPR1 -> SReg_32, etc.
@@ -63,6 +63,12 @@ struct SIRegisterInfo : public AMDGPURegisterInfo {
   /// be returned.
   const TargetRegisterClass *getSubRegClass(const TargetRegisterClass *RC,
                                             unsigned SubIdx) const;
+
+  /// \p Channel This is the register channel (e.g. a value from 0-16), not the
+  ///            SubReg index.
+  /// \returns The sub-register of Reg that is in Channel.
+  unsigned getPhysRegSubReg(unsigned Reg, const TargetRegisterClass *SubRC,
+                            unsigned Channel) const;
 };
 
 } // End namespace llvm
diff --git a/lib/Target/R600/SIRegisterInfo.td b/lib/Target/R600/SIRegisterInfo.td
index 65cf311..f1f01de 100644
--- a/lib/Target/R600/SIRegisterInfo.td
+++ b/lib/Target/R600/SIRegisterInfo.td
@@ -168,7 +168,7 @@ def SReg_64 : RegisterClass<"AMDGPU", [v2i32, i64, i1], 64,
   (add SGPR_64Regs, VCCReg, EXECReg)
 >;
 
-def SReg_128 : RegisterClass<"AMDGPU", [i128, v4i32], 128, (add SGPR_128)>;
+def SReg_128 : RegisterClass<"AMDGPU", [v4i32], 128, (add SGPR_128)>;
 
 def SReg_256 : RegisterClass<"AMDGPU", [v32i8, v8i32, v8f32], 256, (add SGPR_256)>;
 
@@ -183,14 +183,16 @@ def VReg_96 : RegisterClass<"AMDGPU", [untyped], 96, (add VGPR_96)> {
   let Size = 96;
 }
 
-def VReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, i128], 128, (add VGPR_128)>;
+def VReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32], 128, (add VGPR_128)>;
 
 def VReg_256 : RegisterClass<"AMDGPU", [v32i8, v8i32, v8f32], 256, (add VGPR_256)>;
 
 def VReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 512, (add VGPR_512)>;
 
+def VReg_1 : RegisterClass<"AMDGPU", [i1], 32, (add VGPR_32)>;
+
 //===----------------------------------------------------------------------===//
-//  [SV]Src_* register classes, can have either an immediate or an register
+//  [SV]Src_(32|64) register classes, can have either an immediate or an register
 //===----------------------------------------------------------------------===//
 
 def SSrc_32 : RegisterClass<"AMDGPU", [i32, f32], 32, (add SReg_32)>;
@@ -201,3 +203,9 @@ def VSrc_32 : RegisterClass<"AMDGPU", [i32, f32], 32, (add VReg_32, SReg_32)>;
 
 def VSrc_64 : RegisterClass<"AMDGPU", [i64, f64], 64, (add VReg_64, SReg_64)>;
 
+//===----------------------------------------------------------------------===//
+// SGPR and VGPR register classes
+//===----------------------------------------------------------------------===//
+
+def VSrc_128 : RegisterClass<"AMDGPU", [v4i32, v4f32], 128,
+                             (add VReg_128, SReg_128)>;
diff --git a/lib/Target/R600/SITypeRewriter.cpp b/lib/Target/R600/SITypeRewriter.cpp
index 9bf2caf..a0b6907 100644
--- a/lib/Target/R600/SITypeRewriter.cpp
+++ b/lib/Target/R600/SITypeRewriter.cpp
@@ -35,13 +35,13 @@ class SITypeRewriter : public FunctionPass,
   static char ID;
   Module *Mod;
   Type *v16i8;
-  Type *i128;
+  Type *v4i32;
 
 public:
   SITypeRewriter() : FunctionPass(ID) { }
-  virtual bool doInitialization(Module &M);
-  virtual bool runOnFunction(Function &F);
-  virtual const char *getPassName() const {
+  bool doInitialization(Module &M) override;
+  bool runOnFunction(Function &F) override;
+  const char *getPassName() const override {
     return "SI Type Rewriter";
   }
   void visitLoadInst(LoadInst &I);
@@ -56,7 +56,7 @@ char SITypeRewriter::ID = 0;
 bool SITypeRewriter::doInitialization(Module &M) {
   Mod = &M;
   v16i8 = VectorType::get(Type::getInt8Ty(M.getContext()), 16);
-  i128 = Type::getIntNTy(M.getContext(), 128);
+  v4i32 = VectorType::get(Type::getInt32Ty(M.getContext()), 4);
   return false;
 }
 
@@ -84,7 +84,8 @@ void SITypeRewriter::visitLoadInst(LoadInst &I) {
   Type *ElemTy = PtrTy->getPointerElementType();
   IRBuilder<> Builder(&I);
   if (ElemTy == v16i8)  {
-    Value *BitCast = Builder.CreateBitCast(Ptr, Type::getIntNPtrTy(I.getContext(), 128, 2));
+    Value *BitCast = Builder.CreateBitCast(Ptr,
+        PointerType::get(v4i32,PtrTy->getPointerAddressSpace()));
     LoadInst *Load = Builder.CreateLoad(BitCast);
     SmallVector <std::pair<unsigned, MDNode*>, 8> MD;
     I.getAllMetadataOtherThanDebugLoc(MD);
@@ -99,6 +100,7 @@ void SITypeRewriter::visitLoadInst(LoadInst &I) {
 
 void SITypeRewriter::visitCallInst(CallInst &I) {
   IRBuilder<> Builder(&I);
+
   SmallVector <Value*, 8> Args;
   SmallVector <Type*, 8> Types;
   bool NeedToReplace = false;
@@ -107,10 +109,10 @@ void SITypeRewriter::visitCallInst(CallInst &I) {
   for (unsigned i = 0, e = I.getNumArgOperands(); i != e; ++i) {
     Value *Arg = I.getArgOperand(i);
     if (Arg->getType() == v16i8) {
-      Args.push_back(Builder.CreateBitCast(Arg, i128));
-      Types.push_back(i128);
+      Args.push_back(Builder.CreateBitCast(Arg, v4i32));
+      Types.push_back(v4i32);
       NeedToReplace = true;
-      Name = Name + ".i128";
+      Name = Name + ".v4i32";
     } else if (Arg->getType()->isVectorTy() &&
                Arg->getType()->getVectorNumElements() == 1 &&
                Arg->getType()->getVectorElementType() ==
@@ -144,12 +146,12 @@ void SITypeRewriter::visitCallInst(CallInst &I) {
 
 void SITypeRewriter::visitBitCast(BitCastInst &I) {
   IRBuilder<> Builder(&I);
-  if (I.getDestTy() != i128) {
+  if (I.getDestTy() != v4i32) {
     return;
   }
 
   if (BitCastInst *Op = dyn_cast<BitCastInst>(I.getOperand(0))) {
-    if (Op->getSrcTy() == i128) {
+    if (Op->getSrcTy() == v4i32) {
       I.replaceAllUsesWith(Op->getOperand(0));
       I.eraseFromParent();
     }
diff --git a/lib/Target/Sparc/AsmParser/LLVMBuild.txt b/lib/Target/Sparc/AsmParser/LLVMBuild.txt
index c3ddf5a..08fdc9d 100644
--- a/lib/Target/Sparc/AsmParser/LLVMBuild.txt
+++ b/lib/Target/Sparc/AsmParser/LLVMBuild.txt
@@ -19,5 +19,5 @@
 type = Library
 name = SparcAsmParser
 parent = Sparc
-required_libraries = MC MCParser Support SparcDesc SparcInfo
+required_libraries = MC MCParser SparcDesc SparcInfo Support
 add_to_library_groups = Sparc
diff --git a/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp b/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
index 2ff6cdd..da88820 100644
--- a/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
+++ b/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
@@ -49,15 +49,15 @@ class SparcAsmParser : public MCTargetAsmParser {
   bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
                                SmallVectorImpl<MCParsedAsmOperand*> &Operands,
                                MCStreamer &Out, unsigned &ErrorInfo,
-                               bool MatchingInlineAsm);
-  bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc);
+                               bool MatchingInlineAsm) override;
+  bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override;
   bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
                         SMLoc NameLoc,
-                        SmallVectorImpl<MCParsedAsmOperand*> &Operands);
-  bool ParseDirective(AsmToken DirectiveID);
+                        SmallVectorImpl<MCParsedAsmOperand*> &Operands) override;
+  bool ParseDirective(AsmToken DirectiveID) override;
 
-  virtual unsigned validateTargetOperandClass(MCParsedAsmOperand *Op,
-                                              unsigned Kind);
+  unsigned validateTargetOperandClass(MCParsedAsmOperand *Op,
+                                      unsigned Kind) override;
 
   // Custom parse functions for Sparc specific operands.
   OperandMatchResultTy
@@ -83,7 +83,8 @@ class SparcAsmParser : public MCTargetAsmParser {
   bool is64Bit() const { return STI.getTargetTriple().startswith("sparcv9"); }
 public:
   SparcAsmParser(MCSubtargetInfo &sti, MCAsmParser &parser,
-                const MCInstrInfo &MII)
+                const MCInstrInfo &MII,
+                const MCTargetOptions &Options)
       : MCTargetAsmParser(), STI(sti), Parser(parser) {
     // Initialize the set of available features.
     setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
@@ -181,10 +182,10 @@ private:
     struct MemOp Mem;
   };
 public:
-  bool isToken() const { return Kind == k_Token; }
-  bool isReg() const { return Kind == k_Register; }
-  bool isImm() const { return Kind == k_Immediate; }
-  bool isMem() const { return isMEMrr() || isMEMri(); }
+  bool isToken() const override { return Kind == k_Token; }
+  bool isReg() const override { return Kind == k_Register; }
+  bool isImm() const override { return Kind == k_Immediate; }
+  bool isMem() const override { return isMEMrr() || isMEMri(); }
   bool isMEMrr() const { return Kind == k_MemoryReg; }
   bool isMEMri() const { return Kind == k_MemoryImm; }
 
@@ -203,7 +204,7 @@ public:
     return StringRef(Tok.Data, Tok.Length);
   }
 
-  unsigned getReg() const {
+  unsigned getReg() const override {
     assert((Kind == k_Register) && "Invalid access!");
     return Reg.RegNum;
   }
@@ -229,22 +230,22 @@ public:
   }
 
   /// getStartLoc - Get the location of the first token of this operand.
-  SMLoc getStartLoc() const {
+  SMLoc getStartLoc() const override {
     return StartLoc;
   }
   /// getEndLoc - Get the location of the last token of this operand.
-  SMLoc getEndLoc() const {
+  SMLoc getEndLoc() const override {
     return EndLoc;
   }
 
-  virtual void print(raw_ostream &OS) const {
+  void print(raw_ostream &OS) const override {
     switch (Kind) {
     case k_Token:     OS << "Token: " << getToken() << "\n"; break;
     case k_Register:  OS << "Reg: #" << getReg() << "\n"; break;
     case k_Immediate: OS << "Imm: " << getImm() << "\n"; break;
     case k_MemoryReg: OS << "Mem: " << getMemBase() << "+"
                          << getMemOffsetReg() << "\n"; break;
-    case k_MemoryImm: assert(getMemOff() != 0);
+    case k_MemoryImm: assert(getMemOff() != nullptr);
       OS << "Mem: " << getMemBase()
          << "+" << *getMemOff()
          << "\n"; break;
@@ -264,7 +265,7 @@ public:
 
   void addExpr(MCInst &Inst, const MCExpr *Expr) const{
     // Add as immediate when possible.  Null MCExpr = 0.
-    if (Expr == 0)
+    if (!Expr)
       Inst.addOperand(MCOperand::CreateImm(0));
     else if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Expr))
       Inst.addOperand(MCOperand::CreateImm(CE->getValue()));
@@ -323,7 +324,7 @@ public:
     assert(Op->Reg.Kind == rk_FloatReg);
     unsigned regIdx = Reg - Sparc::F0;
     if (regIdx % 2 || regIdx > 31)
-      return 0;
+      return nullptr;
     Op->Reg.RegNum = DoubleRegs[regIdx / 2];
     Op->Reg.Kind = rk_DoubleReg;
     return Op;
@@ -337,13 +338,13 @@ public:
     case rk_FloatReg:
       regIdx = Reg - Sparc::F0;
       if (regIdx % 4 || regIdx > 31)
-        return 0;
+        return nullptr;
       Reg = QuadFPRegs[regIdx / 4];
       break;
     case rk_DoubleReg:
       regIdx =  Reg - Sparc::D0;
       if (regIdx % 2 || regIdx > 31)
-        return 0;
+        return nullptr;
       Reg = QuadFPRegs[regIdx / 2];
       break;
     }
@@ -357,7 +358,7 @@ public:
     Op->Kind = k_MemoryReg;
     Op->Mem.Base = Base;
     Op->Mem.OffsetReg = offsetReg;
-    Op->Mem.Off = 0;
+    Op->Mem.Off = nullptr;
     return Op;
   }
 
@@ -564,7 +565,7 @@ parseMEMOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands)
   case AsmToken::Comma:
   case AsmToken::RBrac:
   case AsmToken::EndOfStatement:
-    Operands.push_back(SparcOperand::CreateMEMri(BaseReg, 0, S, E));
+    Operands.push_back(SparcOperand::CreateMEMri(BaseReg, nullptr, S, E));
     return MatchOperand_Success;
 
   case AsmToken:: Plus:
@@ -574,7 +575,7 @@ parseMEMOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands)
     break;
   }
 
-  SparcOperand *Offset = 0;
+  SparcOperand *Offset = nullptr;
   OperandMatchResultTy ResTy = parseSparcAsmOperand(Offset);
   if (ResTy != MatchOperand_Success || !Offset)
     return MatchOperand_NoMatch;
@@ -636,7 +637,7 @@ parseOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
     return MatchOperand_Success;
   }
 
-  SparcOperand *Op = 0;
+  SparcOperand *Op = nullptr;
 
   ResTy = parseSparcAsmOperand(Op, (Mnemonic == "call"));
   if (ResTy != MatchOperand_Success || !Op)
@@ -656,7 +657,7 @@ SparcAsmParser::parseSparcAsmOperand(SparcOperand *&Op, bool isCall)
   SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
   const MCExpr *EVal;
 
-  Op = 0;
+  Op = nullptr;
   switch (getLexer().getKind()) {
   default:  break;
 
diff --git a/lib/Target/Sparc/DelaySlotFiller.cpp b/lib/Target/Sparc/DelaySlotFiller.cpp
index 88fba39..f3441ff 100644
--- a/lib/Target/Sparc/DelaySlotFiller.cpp
+++ b/lib/Target/Sparc/DelaySlotFiller.cpp
@@ -12,7 +12,6 @@
 // NOP is placed.
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "delay-slot-filler"
 #include "Sparc.h"
 #include "SparcSubtarget.h"
 #include "llvm/ADT/SmallSet.h"
@@ -27,6 +26,8 @@
 
 using namespace llvm;
 
+#define DEBUG_TYPE "delay-slot-filler"
+
 STATISTIC(FilledSlots, "Number of delay slots filled");
 
 static cl::opt<bool> DisableDelaySlotFiller(
@@ -49,12 +50,12 @@ namespace {
         Subtarget(&TM.getSubtarget<SparcSubtarget>()) {
     }
 
-    virtual const char *getPassName() const {
+    const char *getPassName() const override {
       return "SPARC Delay Slot Filler";
     }
 
     bool runOnMachineBasicBlock(MachineBasicBlock &MBB);
-    bool runOnMachineFunction(MachineFunction &F) {
+    bool runOnMachineFunction(MachineFunction &F) override {
       bool Changed = false;
 
       // This pass invalidates liveness information when it reorders
diff --git a/lib/Target/Sparc/Disassembler/LLVMBuild.txt b/lib/Target/Sparc/Disassembler/LLVMBuild.txt
index e7387cd..c27398f 100644
--- a/lib/Target/Sparc/Disassembler/LLVMBuild.txt
+++ b/lib/Target/Sparc/Disassembler/LLVMBuild.txt
@@ -19,5 +19,5 @@
 type = Library
 name = SparcDisassembler
 parent = Sparc
-required_libraries = MC Support SparcInfo
+required_libraries = MC SparcInfo Support
 add_to_library_groups = Sparc
diff --git a/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp b/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp
index 5cd99d6..4df0990 100644
--- a/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp
+++ b/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp
@@ -11,8 +11,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "sparc-disassembler"
-
 #include "Sparc.h"
 #include "SparcRegisterInfo.h"
 #include "SparcSubtarget.h"
@@ -23,6 +21,8 @@
 
 using namespace llvm;
 
+#define DEBUG_TYPE "sparc-disassembler"
+
 typedef MCDisassembler::DecodeStatus DecodeStatus;
 
 namespace {
@@ -32,22 +32,18 @@ class SparcDisassembler : public MCDisassembler {
 public:
   /// Constructor     - Initializes the disassembler.
   ///
-  SparcDisassembler(const MCSubtargetInfo &STI, const MCRegisterInfo *Info) :
-    MCDisassembler(STI), RegInfo(Info)
+  SparcDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx) :
+    MCDisassembler(STI, Ctx)
   {}
   virtual ~SparcDisassembler() {}
 
-  const MCRegisterInfo *getRegInfo() const { return RegInfo.get(); }
-
   /// getInstruction - See MCDisassembler.
-  virtual DecodeStatus getInstruction(MCInst &instr,
-                                      uint64_t &size,
-                                      const MemoryObject &region,
-                                      uint64_t address,
-                                      raw_ostream &vStream,
-                                      raw_ostream &cStream) const;
-private:
-  OwningPtr<const MCRegisterInfo> RegInfo;
+  DecodeStatus getInstruction(MCInst &instr,
+                              uint64_t &size,
+                              const MemoryObject &region,
+                              uint64_t address,
+                              raw_ostream &vStream,
+                              raw_ostream &cStream) const override;
 };
 
 }
@@ -58,8 +54,9 @@ namespace llvm {
 
 static MCDisassembler *createSparcDisassembler(
                        const Target &T,
-                       const MCSubtargetInfo &STI) {
-  return new SparcDisassembler(STI, T.createMCRegInfo(""));
+                       const MCSubtargetInfo &STI,
+                       MCContext &Ctx) {
+  return new SparcDisassembler(STI, Ctx);
 }
 
 
diff --git a/lib/Target/Sparc/InstPrinter/SparcInstPrinter.cpp b/lib/Target/Sparc/InstPrinter/SparcInstPrinter.cpp
index fabc125..261fb38 100644
--- a/lib/Target/Sparc/InstPrinter/SparcInstPrinter.cpp
+++ b/lib/Target/Sparc/InstPrinter/SparcInstPrinter.cpp
@@ -11,15 +11,17 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "asm-printer"
 #include "SparcInstPrinter.h"
 #include "Sparc.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/Support/raw_ostream.h"
 using namespace llvm;
 
+#define DEBUG_TYPE "asm-printer"
+
 // The generated AsmMatcher SparcGenAsmWriter uses "Sparc" as the target
 // namespace. But SPARC backend uses "SP" as its namespace.
 namespace llvm {
diff --git a/lib/Target/Sparc/InstPrinter/SparcInstPrinter.h b/lib/Target/Sparc/InstPrinter/SparcInstPrinter.h
index 45ee6c0..8fe4075 100644
--- a/lib/Target/Sparc/InstPrinter/SparcInstPrinter.h
+++ b/lib/Target/Sparc/InstPrinter/SparcInstPrinter.h
@@ -30,19 +30,21 @@ public:
                   const MCSubtargetInfo &sti)
    : MCInstPrinter(MAI, MII, MRI), STI(sti) {}
 
-  virtual void printRegName(raw_ostream &OS, unsigned RegNo) const;
-  virtual void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot);
+  void printRegName(raw_ostream &OS, unsigned RegNo) const override;
+  void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot) override;
   bool printSparcAliasInstr(const MCInst *MI, raw_ostream &OS);
   bool isV9() const;
 
   // Autogenerated by tblgen.
   void printInstruction(const MCInst *MI, raw_ostream &O);
   bool printAliasInstr(const MCInst *MI, raw_ostream &O);
+  void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx,
+                               unsigned PrintMethodIdx, raw_ostream &O);
   static const char *getRegisterName(unsigned RegNo);
 
   void printOperand(const MCInst *MI, int opNum, raw_ostream &OS);
   void printMemOperand(const MCInst *MI, int opNum, raw_ostream &OS,
-                       const char *Modifier = 0);
+                       const char *Modifier = nullptr);
   void printCCOperand(const MCInst *MI, int opNum, raw_ostream &OS);
   bool printGetPCX(const MCInst *MI, unsigned OpNo, raw_ostream &OS);
 
diff --git a/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp b/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp
index 39c9996..7d517b6 100644
--- a/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp
+++ b/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp
@@ -102,11 +102,11 @@ namespace {
   public:
     SparcAsmBackend(const Target &T) : MCAsmBackend(), TheTarget(T) {}
 
-    unsigned getNumFixupKinds() const {
+    unsigned getNumFixupKinds() const override {
       return Sparc::NumTargetFixupKinds;
     }
 
-    const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const {
+    const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override {
       const static MCFixupKindInfo Infos[Sparc::NumTargetFixupKinds] = {
         // name                    offset bits  flags
         { "fixup_sparc_call30",     2,     30,  MCFixupKindInfo::FKF_IsPCRel },
@@ -184,7 +184,7 @@ namespace {
       }
     }
 
-    bool mayNeedRelaxation(const MCInst &Inst) const {
+    bool mayNeedRelaxation(const MCInst &Inst) const override {
       // FIXME.
       return false;
     }
@@ -194,17 +194,17 @@ namespace {
     bool fixupNeedsRelaxation(const MCFixup &Fixup,
                               uint64_t Value,
                               const MCRelaxableFragment *DF,
-                              const MCAsmLayout &Layout) const {
+                              const MCAsmLayout &Layout) const override {
       // FIXME.
       assert(0 && "fixupNeedsRelaxation() unimplemented");
       return false;
     }
-    void relaxInstruction(const MCInst &Inst, MCInst &Res) const {
+    void relaxInstruction(const MCInst &Inst, MCInst &Res) const override {
       // FIXME.
       assert(0 && "relaxInstruction() unimplemented");
     }
 
-    bool writeNopData(uint64_t Count, MCObjectWriter *OW) const {
+    bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override {
       // Cannot emit NOP with size not multiple of 32 bits.
       if (Count % 4 != 0)
         return false;
@@ -229,7 +229,7 @@ namespace {
       SparcAsmBackend(T), OSType(OSType) { }
 
     void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
-                    uint64_t Value, bool IsPCRel) const {
+                    uint64_t Value, bool IsPCRel) const override {
 
       Value = adjustFixupValue(Fixup.getKind(), Value);
       if (!Value) return;           // Doesn't change encoding.
@@ -244,7 +244,7 @@ namespace {
 
     }
 
-    MCObjectWriter *createObjectWriter(raw_ostream &OS) const {
+    MCObjectWriter *createObjectWriter(raw_ostream &OS) const override {
       uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(OSType);
       return createSparcELFObjectWriter(OS, is64Bit(), OSABI);
     }
diff --git a/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp b/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp
index ef5f8ce..6875fc6 100644
--- a/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp
+++ b/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp
@@ -32,7 +32,7 @@ SparcELFMCAsmInfo::SparcELFMCAsmInfo(StringRef TT) {
   Data16bitsDirective = "\t.half\t";
   Data32bitsDirective = "\t.word\t";
   // .xword is only supported by V9.
-  Data64bitsDirective = (isV9) ? "\t.xword\t" : 0;
+  Data64bitsDirective = (isV9) ? "\t.xword\t" : nullptr;
   ZeroDirective = "\t.skip\t";
   CommentString = "!";
   HasLEB128 = true;
diff --git a/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h b/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h
index d53d09d..e126b68 100644
--- a/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h
+++ b/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h
@@ -20,15 +20,15 @@ namespace llvm {
 class StringRef;
 
 class SparcELFMCAsmInfo : public MCAsmInfoELF {
-  virtual void anchor();
+  void anchor() override;
 public:
   explicit SparcELFMCAsmInfo(StringRef TT);
-  virtual const MCExpr* getExprForPersonalitySymbol(const MCSymbol *Sym,
-                                                    unsigned Encoding,
-                                                    MCStreamer &Streamer) const;
-  virtual const MCExpr* getExprForFDESymbol(const MCSymbol *Sym,
-                                            unsigned Encoding,
-                                            MCStreamer &Streamer) const;
+  const MCExpr*
+  getExprForPersonalitySymbol(const MCSymbol *Sym, unsigned Encoding,
+                              MCStreamer &Streamer) const override;
+  const MCExpr* getExprForFDESymbol(const MCSymbol *Sym,
+                                    unsigned Encoding,
+                                    MCStreamer &Streamer) const override;
 
 };
 
diff --git a/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp b/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp
index 310fbd9..b19ad7b 100644
--- a/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp
+++ b/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp
@@ -11,7 +11,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "mccodeemitter"
 #include "SparcMCExpr.h"
 #include "MCTargetDesc/SparcFixupKinds.h"
 #include "SparcMCTargetDesc.h"
@@ -26,6 +25,8 @@
 
 using namespace llvm;
 
+#define DEBUG_TYPE "mccodeemitter"
+
 STATISTIC(MCNumEmitted, "Number of MC instructions emitted");
 
 namespace {
@@ -41,7 +42,7 @@ public:
 
   void EncodeInstruction(const MCInst &MI, raw_ostream &OS,
                          SmallVectorImpl<MCFixup> &Fixups,
-                         const MCSubtargetInfo &STI) const;
+                         const MCSubtargetInfo &STI) const override;
 
   // getBinaryCodeForInstr - TableGen'erated function for getting the
   // binary encoding for an instruction.
diff --git a/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp b/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp
index e6b2aca..ae57fdc 100644
--- a/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp
+++ b/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.cpp
@@ -12,7 +12,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "sparcmcexpr"
 #include "SparcMCExpr.h"
 #include "llvm/MC/MCAssembler.h"
 #include "llvm/MC/MCContext.h"
@@ -23,6 +22,8 @@
 
 using namespace llvm;
 
+#define DEBUG_TYPE "sparcmcexpr"
+
 const SparcMCExpr*
 SparcMCExpr::Create(VariantKind Kind, const MCExpr *Expr,
                       MCContext &Ctx) {
diff --git a/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h b/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h
index be6526e..78dd945 100644
--- a/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h
+++ b/lib/Target/Sparc/MCTargetDesc/SparcMCExpr.h
@@ -85,15 +85,15 @@ public:
   Sparc::Fixups getFixupKind() const { return getFixupKind(Kind); }
 
   /// @}
-  void PrintImpl(raw_ostream &OS) const;
+  void PrintImpl(raw_ostream &OS) const override;
   bool EvaluateAsRelocatableImpl(MCValue &Res,
-                                 const MCAsmLayout *Layout) const;
-  void AddValueSymbols(MCAssembler *) const;
-  const MCSection *FindAssociatedSection() const {
+                                 const MCAsmLayout *Layout) const override;
+  void AddValueSymbols(MCAssembler *) const override;
+  const MCSection *FindAssociatedSection() const override {
     return getSubExpr()->FindAssociatedSection();
   }
 
-  void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const;
+  void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const override;
 
   static bool classof(const MCExpr *E) {
     return E->getKind() == MCExpr::Target;
diff --git a/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp b/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp
index c69af56..571017d 100644
--- a/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp
+++ b/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp
@@ -22,6 +22,8 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/TargetRegistry.h"
 
+using namespace llvm;
+
 #define GET_INSTRINFO_MC_DESC
 #include "SparcGenInstrInfo.inc"
 
@@ -31,14 +33,11 @@
 #define GET_REGINFO_MC_DESC
 #include "SparcGenRegisterInfo.inc"
 
-using namespace llvm;
-
-
 static MCAsmInfo *createSparcMCAsmInfo(const MCRegisterInfo &MRI,
                                        StringRef TT) {
   MCAsmInfo *MAI = new SparcELFMCAsmInfo(TT);
   unsigned Reg = MRI.getDwarfRegNum(SP::O6, true);
-  MCCFIInstruction Inst = MCCFIInstruction::createDefCfa(0, Reg, 0);
+  MCCFIInstruction Inst = MCCFIInstruction::createDefCfa(nullptr, Reg, 0);
   MAI->addInitialFrameState(Inst);
   return MAI;
 }
@@ -47,7 +46,7 @@ static MCAsmInfo *createSparcV9MCAsmInfo(const MCRegisterInfo &MRI,
                                        StringRef TT) {
   MCAsmInfo *MAI = new SparcELFMCAsmInfo(TT);
   unsigned Reg = MRI.getDwarfRegNum(SP::O6, true);
-  MCCFIInstruction Inst = MCCFIInstruction::createDefCfa(0, Reg, 2047);
+  MCCFIInstruction Inst = MCCFIInstruction::createDefCfa(nullptr, Reg, 2047);
   MAI->addInitialFrameState(Inst);
   return MAI;
 }
@@ -136,13 +135,12 @@ static MCStreamer *createMCStreamer(const Target &T, StringRef TT,
 
 static MCStreamer *
 createMCAsmStreamer(MCContext &Ctx, formatted_raw_ostream &OS,
-                    bool isVerboseAsm, bool useCFI, bool useDwarfDirectory,
+                    bool isVerboseAsm, bool useDwarfDirectory,
                     MCInstPrinter *InstPrint, MCCodeEmitter *CE,
                     MCAsmBackend *TAB, bool ShowInst) {
 
-  MCStreamer *S =
-      llvm::createAsmStreamer(Ctx, OS, isVerboseAsm, useCFI, useDwarfDirectory,
-                              InstPrint, CE, TAB, ShowInst);
+  MCStreamer *S = llvm::createAsmStreamer(
+      Ctx, OS, isVerboseAsm, useDwarfDirectory, InstPrint, CE, TAB, ShowInst);
   new SparcTargetAsmStreamer(*S, OS);
   return S;
 }
diff --git a/lib/Target/Sparc/SparcAsmPrinter.cpp b/lib/Target/Sparc/SparcAsmPrinter.cpp
index 50506a6..1b7330e 100644
--- a/lib/Target/Sparc/SparcAsmPrinter.cpp
+++ b/lib/Target/Sparc/SparcAsmPrinter.cpp
@@ -12,7 +12,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "asm-printer"
 #include "Sparc.h"
 #include "InstPrinter/SparcInstPrinter.h"
 #include "MCTargetDesc/SparcMCExpr.h"
@@ -35,6 +34,8 @@
 #include "llvm/Support/raw_ostream.h"
 using namespace llvm;
 
+#define DEBUG_TYPE "asm-printer"
+
 namespace {
   class SparcAsmPrinter : public AsmPrinter {
     SparcTargetStreamer &getTargetStreamer() {
@@ -45,18 +46,18 @@ namespace {
     explicit SparcAsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
       : AsmPrinter(TM, Streamer) {}
 
-    virtual const char *getPassName() const {
+    const char *getPassName() const override {
       return "Sparc Assembly Printer";
     }
 
     void printOperand(const MachineInstr *MI, int opNum, raw_ostream &OS);
     void printMemOperand(const MachineInstr *MI, int opNum, raw_ostream &OS,
-                         const char *Modifier = 0);
+                         const char *Modifier = nullptr);
     void printCCOperand(const MachineInstr *MI, int opNum, raw_ostream &OS);
 
-    virtual void EmitFunctionBodyStart();
-    virtual void EmitInstruction(const MachineInstr *MI);
-    virtual void EmitEndOfAsmFile(Module &M);
+    void EmitFunctionBodyStart() override;
+    void EmitInstruction(const MachineInstr *MI) override;
+    void EmitEndOfAsmFile(Module &M) override;
 
     static const char *getRegisterName(unsigned RegNo) {
       return SparcInstPrinter::getRegisterName(RegNo);
@@ -64,10 +65,10 @@ namespace {
 
     bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
                          unsigned AsmVariant, const char *ExtraCode,
-                         raw_ostream &O);
+                         raw_ostream &O) override;
     bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
                                unsigned AsmVariant, const char *ExtraCode,
-                               raw_ostream &O);
+                               raw_ostream &O) override;
 
     void LowerGETPCXAndEmitMCInsts(const MachineInstr *MI,
                                    const MCSubtargetInfo &STI);
diff --git a/lib/Target/Sparc/SparcCodeEmitter.cpp b/lib/Target/Sparc/SparcCodeEmitter.cpp
index 4f8d477..247da2a 100644
--- a/lib/Target/Sparc/SparcCodeEmitter.cpp
+++ b/lib/Target/Sparc/SparcCodeEmitter.cpp
@@ -12,7 +12,6 @@
 //
 //===---------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "jit"
 #include "Sparc.h"
 #include "MCTargetDesc/SparcMCExpr.h"
 #include "SparcRelocations.h"
@@ -25,6 +24,8 @@
 
 using namespace llvm;
 
+#define DEBUG_TYPE "jit"
+
 STATISTIC(NumEmitted, "Number of machine instructions emitted");
 
 namespace {
@@ -39,7 +40,7 @@ class SparcCodeEmitter : public MachineFunctionPass {
   const std::vector<MachineConstantPoolEntry> *MCPEs;
   bool IsPIC;
 
-  void getAnalysisUsage(AnalysisUsage &AU) const {
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<MachineModuleInfo> ();
     MachineFunctionPass::getAnalysisUsage(AU);
   }
@@ -48,13 +49,13 @@ class SparcCodeEmitter : public MachineFunctionPass {
 
 public:
   SparcCodeEmitter(TargetMachine &tm, JITCodeEmitter &mce)
-    : MachineFunctionPass(ID), JTI(0), II(0), TD(0),
-      TM(tm), MCE(mce), MCPEs(0),
+    : MachineFunctionPass(ID), JTI(nullptr), II(nullptr), TD(nullptr),
+      TM(tm), MCE(mce), MCPEs(nullptr),
       IsPIC(TM.getRelocationModel() == Reloc::PIC_) {}
 
-  bool runOnMachineFunction(MachineFunction &MF);
+  bool runOnMachineFunction(MachineFunction &MF) override;
 
-  virtual const char *getPassName() const {
+  const char *getPassName() const override {
     return "Sparc Machine Code Emitter";
   }
 
diff --git a/lib/Target/Sparc/SparcFrameLowering.cpp b/lib/Target/Sparc/SparcFrameLowering.cpp
index d96a4c0..a37da94 100644
--- a/lib/Target/Sparc/SparcFrameLowering.cpp
+++ b/lib/Target/Sparc/SparcFrameLowering.cpp
@@ -109,18 +109,21 @@ void SparcFrameLowering::emitPrologue(MachineFunction &MF) const {
   // Emit ".cfi_def_cfa_register 30".
   unsigned CFIIndex =
       MMI.addFrameInst(MCCFIInstruction::createDefCfaRegister(nullptr, regFP));
-  BuildMI(MBB, MBBI, dl, TII.get(SP::CFI_INSTRUCTION)).addCFIIndex(CFIIndex);
+  BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+      .addCFIIndex(CFIIndex);
 
   // Emit ".cfi_window_save".
   CFIIndex = MMI.addFrameInst(MCCFIInstruction::createWindowSave(nullptr));
-  BuildMI(MBB, MBBI, dl, TII.get(SP::CFI_INSTRUCTION)).addCFIIndex(CFIIndex);
+  BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+      .addCFIIndex(CFIIndex);
 
   unsigned regInRA = MRI->getDwarfRegNum(SP::I7, true);
   unsigned regOutRA = MRI->getDwarfRegNum(SP::O7, true);
   // Emit ".cfi_register 15, 31".
   CFIIndex = MMI.addFrameInst(
       MCCFIInstruction::createRegister(nullptr, regOutRA, regInRA));
-  BuildMI(MBB, MBBI, dl, TII.get(SP::CFI_INSTRUCTION)).addCFIIndex(CFIIndex);
+  BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+      .addCFIIndex(CFIIndex);
 }
 
 void SparcFrameLowering::
diff --git a/lib/Target/Sparc/SparcFrameLowering.h b/lib/Target/Sparc/SparcFrameLowering.h
index 072fde3..bda7b7c 100644
--- a/lib/Target/Sparc/SparcFrameLowering.h
+++ b/lib/Target/Sparc/SparcFrameLowering.h
@@ -31,17 +31,18 @@ public:
 
   /// emitProlog/emitEpilog - These methods insert prolog and epilog code into
   /// the function.
-  void emitPrologue(MachineFunction &MF) const;
-  void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;
+  void emitPrologue(MachineFunction &MF) const override;
+  void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
 
-  void eliminateCallFramePseudoInstr(MachineFunction &MF,
-                                     MachineBasicBlock &MBB,
-                                     MachineBasicBlock::iterator I) const;
+  void
+  eliminateCallFramePseudoInstr(MachineFunction &MF,
+                                MachineBasicBlock &MBB,
+                                MachineBasicBlock::iterator I) const override;
 
-  bool hasReservedCallFrame(const MachineFunction &MF) const;
-  bool hasFP(const MachineFunction &MF) const;
+  bool hasReservedCallFrame(const MachineFunction &MF) const override;
+  bool hasFP(const MachineFunction &MF) const override;
   void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
-                                            RegScavenger *RS = NULL) const;
+                                     RegScavenger *RS = nullptr) const override;
 
 private:
   // Remap input registers to output registers for leaf procedure.
diff --git a/lib/Target/Sparc/SparcISelDAGToDAG.cpp b/lib/Target/Sparc/SparcISelDAGToDAG.cpp
index b012bfd..2fade27 100644
--- a/lib/Target/Sparc/SparcISelDAGToDAG.cpp
+++ b/lib/Target/Sparc/SparcISelDAGToDAG.cpp
@@ -41,7 +41,7 @@ public:
       TM(tm) {
   }
 
-  SDNode *Select(SDNode *N);
+  SDNode *Select(SDNode *N) override;
 
   // Complex Pattern Selectors.
   bool SelectADDRrr(SDValue N, SDValue &R1, SDValue &R2);
@@ -49,11 +49,11 @@ public:
 
   /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
   /// inline asm expressions.
-  virtual bool SelectInlineAsmMemoryOperand(const SDValue &Op,
-                                            char ConstraintCode,
-                                            std::vector<SDValue> &OutOps);
+  bool SelectInlineAsmMemoryOperand(const SDValue &Op,
+                                    char ConstraintCode,
+                                    std::vector<SDValue> &OutOps) override;
 
-  virtual const char *getPassName() const {
+  const char *getPassName() const override {
     return "SPARC DAG->DAG Pattern Instruction Selection";
   }
 
@@ -143,7 +143,7 @@ SDNode *SparcDAGToDAGISel::Select(SDNode *N) {
   SDLoc dl(N);
   if (N->isMachineOpcode()) {
     N->setNodeId(-1);
-    return NULL;   // Already selected.
+    return nullptr;   // Already selected.
   }
 
   switch (N->getOpcode()) {
diff --git a/lib/Target/Sparc/SparcISelLowering.cpp b/lib/Target/Sparc/SparcISelLowering.cpp
index 8e720ee..ef61466 100644
--- a/lib/Target/Sparc/SparcISelLowering.cpp
+++ b/lib/Target/Sparc/SparcISelLowering.cpp
@@ -53,7 +53,7 @@ static bool CC_Sparc_Assign_f64(unsigned &ValNo, MVT &ValVT,
                                 MVT &LocVT, CCValAssign::LocInfo &LocInfo,
                                 ISD::ArgFlagsTy &ArgFlags, CCState &State)
 {
-  static const uint16_t RegList[] = {
+  static const MCPhysReg RegList[] = {
     SP::I0, SP::I1, SP::I2, SP::I3, SP::I4, SP::I5
   };
   // Try to get first reg.
@@ -235,8 +235,7 @@ SparcTargetLowering::LowerReturn_32(SDValue Chain,
   if (Flag.getNode())
     RetOps.push_back(Flag);
 
-  return DAG.getNode(SPISD::RET_FLAG, DL, MVT::Other,
-                     &RetOps[0], RetOps.size());
+  return DAG.getNode(SPISD::RET_FLAG, DL, MVT::Other, RetOps);
 }
 
 // Lower return values for the 64-bit ABI.
@@ -315,8 +314,7 @@ SparcTargetLowering::LowerReturn_64(SDValue Chain,
   if (Flag.getNode())
     RetOps.push_back(Flag);
 
-  return DAG.getNode(SPISD::RET_FLAG, DL, MVT::Other,
-                     &RetOps[0], RetOps.size());
+  return DAG.getNode(SPISD::RET_FLAG, DL, MVT::Other, RetOps);
 }
 
 SDValue SparcTargetLowering::
@@ -357,10 +355,13 @@ LowerFormalArguments_32(SDValue Chain,
 
   const unsigned StackOffset = 92;
 
-  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+  unsigned InIdx = 0;
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i, ++InIdx) {
     CCValAssign &VA = ArgLocs[i];
 
-    if (i == 0  && Ins[i].Flags.isSRet()) {
+    if (Ins[InIdx].Flags.isSRet()) {
+      if (InIdx != 0)
+        report_fatal_error("sparc only supports sret on the first parameter");
       // Get SRet from [%fp+64].
       int FrameIdx = MF.getFrameInfo()->CreateFixedObject(4, 64, true);
       SDValue FIPtr = DAG.getFrameIndex(FrameIdx, MVT::i32);
@@ -493,11 +494,11 @@ LowerFormalArguments_32(SDValue Chain,
 
   // Store remaining ArgRegs to the stack if this is a varargs function.
   if (isVarArg) {
-    static const uint16_t ArgRegs[] = {
+    static const MCPhysReg ArgRegs[] = {
       SP::I0, SP::I1, SP::I2, SP::I3, SP::I4, SP::I5
     };
     unsigned NumAllocated = CCInfo.getFirstUnallocated(ArgRegs, 6);
-    const uint16_t *CurArgReg = ArgRegs+NumAllocated, *ArgRegEnd = ArgRegs+6;
+    const MCPhysReg *CurArgReg = ArgRegs+NumAllocated, *ArgRegEnd = ArgRegs+6;
     unsigned ArgOffset = CCInfo.getNextStackOffset();
     if (NumAllocated == 6)
       ArgOffset += StackOffset;
@@ -528,8 +529,7 @@ LowerFormalArguments_32(SDValue Chain,
 
     if (!OutChains.empty()) {
       OutChains.push_back(Chain);
-      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
-                          &OutChains[0], OutChains.size());
+      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
     }
   }
 
@@ -644,8 +644,7 @@ LowerFormalArguments_64(SDValue Chain,
   }
 
   if (!OutChains.empty())
-    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
-                        &OutChains[0], OutChains.size());
+    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, OutChains);
 
   return Chain;
 }
@@ -663,7 +662,7 @@ static bool hasReturnsTwiceAttr(SelectionDAG &DAG, SDValue Callee,
   if (CS)
     return CS->hasFnAttr(Attribute::ReturnsTwice);
 
-  const Function *CalleeFn = 0;
+  const Function *CalleeFn = nullptr;
   if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
     CalleeFn = dyn_cast<Function>(G->getGlobal());
   } else if (ExternalSymbolSDNode *E =
@@ -877,8 +876,7 @@ SparcTargetLowering::LowerCall_32(TargetLowering::CallLoweringInfo &CLI,
 
   // Emit all stores, make sure the occur before any copies into physregs.
   if (!MemOpChains.empty())
-    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
-                        &MemOpChains[0], MemOpChains.size());
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
 
   // Build a sequence of copy-to-reg nodes chained together with token
   // chain and flag operands which copy the outgoing args into registers.
@@ -927,7 +925,7 @@ SparcTargetLowering::LowerCall_32(TargetLowering::CallLoweringInfo &CLI,
   if (InFlag.getNode())
     Ops.push_back(InFlag);
 
-  Chain = DAG.getNode(SPISD::CALL, dl, NodeTys, &Ops[0], Ops.size());
+  Chain = DAG.getNode(SPISD::CALL, dl, NodeTys, Ops);
   InFlag = Chain.getValue(1);
 
   Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(ArgsSize, true),
@@ -961,9 +959,9 @@ static bool isFP128ABICall(const char *CalleeName)
        "_Q_sqrt", "_Q_neg",
        "_Q_itoq", "_Q_stoq", "_Q_dtoq", "_Q_utoq",
        "_Q_lltoq", "_Q_ulltoq",
-       0
+       nullptr
     };
-  for (const char * const *I = ABICalls; *I != 0; ++I)
+  for (const char * const *I = ABICalls; *I != nullptr; ++I)
     if (strcmp(CalleeName, *I) == 0)
       return true;
   return false;
@@ -972,7 +970,7 @@ static bool isFP128ABICall(const char *CalleeName)
 unsigned
 SparcTargetLowering::getSRetArgSize(SelectionDAG &DAG, SDValue Callee) const
 {
-  const Function *CalleeFn = 0;
+  const Function *CalleeFn = nullptr;
   if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
     CalleeFn = dyn_cast<Function>(G->getGlobal());
   } else if (ExternalSymbolSDNode *E =
@@ -1194,8 +1192,7 @@ SparcTargetLowering::LowerCall_64(TargetLowering::CallLoweringInfo &CLI,
 
   // Emit all stores, make sure they occur before the call.
   if (!MemOpChains.empty())
-    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
-                        &MemOpChains[0], MemOpChains.size());
+    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
 
   // Build a sequence of CopyToReg nodes glued together with token chain and
   // glue operands which copy the outgoing args into registers. The InGlue is
@@ -1245,7 +1242,7 @@ SparcTargetLowering::LowerCall_64(TargetLowering::CallLoweringInfo &CLI,
 
   // Now the call itself.
   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
-  Chain = DAG.getNode(SPISD::CALL, DL, NodeTys, &Ops[0], Ops.size());
+  Chain = DAG.getNode(SPISD::CALL, DL, NodeTys, Ops);
   InGlue = Chain.getValue(1);
 
   // Revert the stack pointer immediately after the call.
@@ -1263,7 +1260,7 @@ SparcTargetLowering::LowerCall_64(TargetLowering::CallLoweringInfo &CLI,
 
   // Set inreg flag manually for codegen generated library calls that
   // return float.
-  if (CLI.Ins.size() == 1 && CLI.Ins[0].VT == MVT::f32 && CLI.CS == 0)
+  if (CLI.Ins.size() == 1 && CLI.Ins[0].VT == MVT::f32 && CLI.CS == nullptr)
     CLI.Ins[0].Flags.setInReg();
 
   RVInfo.AnalyzeCallResult(CLI.Ins, RetCC_Sparc64);
@@ -1677,7 +1674,7 @@ SparcTargetLowering::SparcTargetLowering(TargetMachine &TM)
 
 const char *SparcTargetLowering::getTargetNodeName(unsigned Opcode) const {
   switch (Opcode) {
-  default: return 0;
+  default: return nullptr;
   case SPISD::CMPICC:     return "SPISD::CMPICC";
   case SPISD::CMPFCC:     return "SPISD::CMPFCC";
   case SPISD::BRICC:      return "SPISD::BRICC";
@@ -1711,7 +1708,7 @@ EVT SparcTargetLowering::getSetCCResultType(LLVMContext &, EVT VT) const {
 /// isMaskedValueZeroForTargetNode - Return true if 'Op & Mask' is known to
 /// be zero. Op is expected to be a target specific node. Used by DAG
 /// combiner.
-void SparcTargetLowering::computeMaskedBitsForTargetNode
+void SparcTargetLowering::computeKnownBitsForTargetNode
                                 (const SDValue Op,
                                  APInt &KnownZero,
                                  APInt &KnownOne,
@@ -1725,10 +1722,8 @@ void SparcTargetLowering::computeMaskedBitsForTargetNode
   case SPISD::SELECT_ICC:
   case SPISD::SELECT_XCC:
   case SPISD::SELECT_FCC:
-    DAG.ComputeMaskedBits(Op.getOperand(1), KnownZero, KnownOne, Depth+1);
-    DAG.ComputeMaskedBits(Op.getOperand(0), KnownZero2, KnownOne2, Depth+1);
-    assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
-    assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?");
+    DAG.computeKnownBits(Op.getOperand(1), KnownZero, KnownOne, Depth+1);
+    DAG.computeKnownBits(Op.getOperand(0), KnownZero2, KnownOne2, Depth+1);
 
     // Only known if known in both the LHS and RHS.
     KnownOne &= KnownOne2;
@@ -1914,7 +1909,7 @@ SDValue SparcTargetLowering::LowerGlobalTLSAddress(SDValue Op,
     assert(Mask && "Missing call preserved mask for calling convention");
     Ops.push_back(DAG.getRegisterMask(Mask));
     Ops.push_back(InFlag);
-    Chain = DAG.getNode(SPISD::TLS_CALL, DL, NodeTys, &Ops[0], Ops.size());
+    Chain = DAG.getNode(SPISD::TLS_CALL, DL, NodeTys, Ops);
     InFlag = Chain.getValue(1);
     Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(1, true),
                                DAG.getIntPtrConstant(0, true), InFlag, DL);
@@ -2033,13 +2028,10 @@ SparcTargetLowering::LowerF128Op(SDValue Op, SelectionDAG &DAG,
   for (unsigned i = 0, e = numArgs; i != e; ++i) {
     Chain = LowerF128_LibCallArg(Chain, Args, Op.getOperand(i), SDLoc(Op), DAG);
   }
-  TargetLowering::
-    CallLoweringInfo CLI(Chain,
-                         RetTyABI,
-                         false, false, false, false,
-                         0, CallingConv::C,
-                         false, false, true,
-                         Callee, Args, DAG, SDLoc(Op));
+  TargetLowering::CallLoweringInfo CLI(DAG);
+  CLI.setDebugLoc(SDLoc(Op)).setChain(Chain)
+    .setCallee(CallingConv::C, RetTyABI, Callee, &Args, 0);
+
   std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
 
   // chain is in second result.
@@ -2065,7 +2057,7 @@ SparcTargetLowering::LowerF128Compare(SDValue LHS, SDValue RHS,
                                       SDLoc DL,
                                       SelectionDAG &DAG) const {
 
-  const char *LibCall = 0;
+  const char *LibCall = nullptr;
   bool is64Bit = Subtarget->is64Bit();
   switch(SPCC) {
   default: llvm_unreachable("Unhandled conditional code!");
@@ -2092,13 +2084,9 @@ SparcTargetLowering::LowerF128Compare(SDValue LHS, SDValue RHS,
   Chain = LowerF128_LibCallArg(Chain, Args, LHS, DL, DAG);
   Chain = LowerF128_LibCallArg(Chain, Args, RHS, DL, DAG);
 
-  TargetLowering::
-    CallLoweringInfo CLI(Chain,
-                         RetTy,
-                         false, false, false, false,
-                         0, CallingConv::C,
-                         false, false, true,
-                         Callee, Args, DAG, DL);
+  TargetLowering::CallLoweringInfo CLI(DAG);
+  CLI.setDebugLoc(DL).setChain(Chain)
+    .setCallee(CallingConv::C, RetTy, Callee, &Args, 0);
 
   std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
 
@@ -2174,7 +2162,7 @@ LowerF128_FPEXTEND(SDValue Op, SelectionDAG &DAG,
                            TLI.getLibcallName(RTLIB::FPEXT_F32_F128), 1);
 
   llvm_unreachable("fpextend with non-float operand!");
-  return SDValue(0, 0);
+  return SDValue();
 }
 
 static SDValue
@@ -2192,7 +2180,7 @@ LowerF128_FPROUND(SDValue Op, SelectionDAG &DAG,
                            TLI.getLibcallName(RTLIB::FPROUND_F128_F32), 1);
 
   llvm_unreachable("fpround to non-float!");
-  return SDValue(0, 0);
+  return SDValue();
 }
 
 static SDValue LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG,
@@ -2213,7 +2201,7 @@ static SDValue LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG,
 
   // Expand if the resulting type is illegal.
   if (!TLI.isTypeLegal(VT))
-    return SDValue(0, 0);
+    return SDValue();
 
   // Otherwise, Convert the fp value to integer in an FP register.
   if (VT == MVT::i32)
@@ -2244,7 +2232,7 @@ static SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG,
 
   // Expand if the operand type is illegal.
   if (!TLI.isTypeLegal(OpVT))
-    return SDValue(0, 0);
+    return SDValue();
 
   // Otherwise, Convert the int value to FP in an FP register.
   SDValue Tmp = DAG.getNode(ISD::BITCAST, dl, floatVT, Op.getOperand(0));
@@ -2262,7 +2250,7 @@ static SDValue LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG,
   // quad floating point instructions and the resulting type is legal.
   if (Op.getOperand(0).getValueType() != MVT::f128 ||
       (hasHardQuad && TLI.isTypeLegal(VT)))
-    return SDValue(0, 0);
+    return SDValue();
 
   assert(VT == MVT::i32 || VT == MVT::i64);
 
@@ -2283,7 +2271,7 @@ static SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG,
   // Expand if it does not involve f128 or the target has support for
   // quad floating point instructions and the operand type is legal.
   if (Op.getValueType() != MVT::f128 || (hasHardQuad && TLI.isTypeLegal(OpVT)))
-    return SDValue(0, 0);
+    return SDValue();
 
   return TLI.LowerF128Op(Op, DAG,
                          TLI.getLibcallName(OpVT == MVT::i32
@@ -2428,7 +2416,7 @@ static SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG,
   SDValue NewVal = DAG.getNode(ISD::ADD, dl, VT, NewSP,
                                DAG.getConstant(regSpillArea, VT));
   SDValue Ops[2] = { NewVal, Chain };
-  return DAG.getMergeValues(Ops, 2, dl);
+  return DAG.getMergeValues(Ops, dl);
 }
 
 
@@ -2597,10 +2585,9 @@ static SDValue LowerF128Load(SDValue Op, SelectionDAG &DAG)
                                SubRegOdd);
   SDValue OutChains[2] = { SDValue(Hi64.getNode(), 1),
                            SDValue(Lo64.getNode(), 1) };
-  SDValue OutChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
-                                 &OutChains[0], 2);
+  SDValue OutChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
   SDValue Ops[2] = {SDValue(InFP128,0), OutChain};
-  return DAG.getMergeValues(Ops, 2, dl);
+  return DAG.getMergeValues(Ops, dl);
 }
 
 // Lower a f128 store into two f64 stores.
@@ -2644,8 +2631,7 @@ static SDValue LowerF128Store(SDValue Op, SelectionDAG &DAG) {
                              LoPtr,
                              MachinePointerInfo(),
                              false, false, alignment);
-  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
-                     &OutChains[0], 2);
+  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
 }
 
 static SDValue LowerFNEGorFABS(SDValue Op, SelectionDAG &DAG, bool isV9) {
@@ -2726,7 +2712,7 @@ static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
 
   SDValue Dst = DAG.getNode(ISD::OR, dl, MVT::i64, Hi, Lo);
   SDValue Ops[2] = { Dst, Carry };
-  return DAG.getMergeValues(Ops, 2, dl);
+  return DAG.getMergeValues(Ops, dl);
 }
 
 // Custom lower UMULO/SMULO for SPARC. This code is similar to ExpandNode()
@@ -2773,7 +2759,7 @@ static SDValue LowerUMULO_SMULO(SDValue Op, SelectionDAG &DAG,
   DAG.DeleteNode(MulResult.getNode());
 
   SDValue Ops[2] = { BottomHalf, TopHalf } ;
-  return DAG.getMergeValues(Ops, 2, dl);
+  return DAG.getMergeValues(Ops, dl);
 }
 
 static SDValue LowerATOMIC_LOAD_STORE(SDValue Op, SelectionDAG &DAG) {
@@ -3092,7 +3078,7 @@ getSingleConstraintMatchWeight(AsmOperandInfo &info,
   Value *CallOperandVal = info.CallOperandVal;
   // If we don't have a value, we can't do a match,
   // but allow it at the lowest weight.
-  if (CallOperandVal == NULL)
+  if (!CallOperandVal)
     return CW_Default;
 
   // Look at the constraint type.
@@ -3117,7 +3103,7 @@ LowerAsmOperandForConstraint(SDValue Op,
                              std::string &Constraint,
                              std::vector<SDValue> &Ops,
                              SelectionDAG &DAG) const {
-  SDValue Result(0, 0);
+  SDValue Result(nullptr, 0);
 
   // Only support length 1 constraints for now.
   if (Constraint.length() > 1)
diff --git a/lib/Target/Sparc/SparcISelLowering.h b/lib/Target/Sparc/SparcISelLowering.h
index f7b45d0..a24cc82 100644
--- a/lib/Target/Sparc/SparcISelLowering.h
+++ b/lib/Target/Sparc/SparcISelLowering.h
@@ -55,47 +55,47 @@ namespace llvm {
     const SparcSubtarget *Subtarget;
   public:
     SparcTargetLowering(TargetMachine &TM);
-    virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
 
-    /// computeMaskedBitsForTargetNode - Determine which of the bits specified
+    /// computeKnownBitsForTargetNode - Determine which of the bits specified
     /// in Mask are known to be either zero or one and return them in the
     /// KnownZero/KnownOne bitsets.
-    virtual void computeMaskedBitsForTargetNode(const SDValue Op,
-                                                APInt &KnownZero,
-                                                APInt &KnownOne,
-                                                const SelectionDAG &DAG,
-                                                unsigned Depth = 0) const;
+    void computeKnownBitsForTargetNode(const SDValue Op,
+                                       APInt &KnownZero,
+                                       APInt &KnownOne,
+                                       const SelectionDAG &DAG,
+                                       unsigned Depth = 0) const override;
 
-    virtual MachineBasicBlock *
+    MachineBasicBlock *
       EmitInstrWithCustomInserter(MachineInstr *MI,
-                                  MachineBasicBlock *MBB) const;
+                                  MachineBasicBlock *MBB) const override;
 
-    virtual const char *getTargetNodeName(unsigned Opcode) const;
+    const char *getTargetNodeName(unsigned Opcode) const override;
 
-    ConstraintType getConstraintType(const std::string &Constraint) const;
+    ConstraintType getConstraintType(const std::string &Constraint) const override;
     ConstraintWeight
     getSingleConstraintMatchWeight(AsmOperandInfo &info,
-                                   const char *constraint) const;
+                                   const char *constraint) const override;
     void LowerAsmOperandForConstraint(SDValue Op,
                                       std::string &Constraint,
                                       std::vector<SDValue> &Ops,
-                                      SelectionDAG &DAG) const;
+                                      SelectionDAG &DAG) const override;
     std::pair<unsigned, const TargetRegisterClass*>
-    getRegForInlineAsmConstraint(const std::string &Constraint, MVT VT) const;
+    getRegForInlineAsmConstraint(const std::string &Constraint, MVT VT) const override;
 
-    virtual bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const;
-    virtual MVT getScalarShiftAmountTy(EVT LHSTy) const { return MVT::i32; }
+    bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override;
+    MVT getScalarShiftAmountTy(EVT LHSTy) const override { return MVT::i32; }
 
     /// getSetCCResultType - Return the ISD::SETCC ValueType
-    virtual EVT getSetCCResultType(LLVMContext &Context, EVT VT) const;
+    EVT getSetCCResultType(LLVMContext &Context, EVT VT) const override;
 
-    virtual SDValue
+    SDValue
       LowerFormalArguments(SDValue Chain,
                            CallingConv::ID CallConv,
                            bool isVarArg,
                            const SmallVectorImpl<ISD::InputArg> &Ins,
                            SDLoc dl, SelectionDAG &DAG,
-                           SmallVectorImpl<SDValue> &InVals) const;
+                           SmallVectorImpl<SDValue> &InVals) const override;
     SDValue LowerFormalArguments_32(SDValue Chain,
                                     CallingConv::ID CallConv,
                                     bool isVarArg,
@@ -109,20 +109,20 @@ namespace llvm {
                                     SDLoc dl, SelectionDAG &DAG,
                                     SmallVectorImpl<SDValue> &InVals) const;
 
-    virtual SDValue
+    SDValue
       LowerCall(TargetLowering::CallLoweringInfo &CLI,
-                SmallVectorImpl<SDValue> &InVals) const;
+                SmallVectorImpl<SDValue> &InVals) const override;
     SDValue LowerCall_32(TargetLowering::CallLoweringInfo &CLI,
                          SmallVectorImpl<SDValue> &InVals) const;
     SDValue LowerCall_64(TargetLowering::CallLoweringInfo &CLI,
                          SmallVectorImpl<SDValue> &InVals) const;
 
-    virtual SDValue
+    SDValue
       LowerReturn(SDValue Chain,
                   CallingConv::ID CallConv, bool isVarArg,
                   const SmallVectorImpl<ISD::OutputArg> &Outs,
                   const SmallVectorImpl<SDValue> &OutVals,
-                  SDLoc dl, SelectionDAG &DAG) const;
+                  SDLoc dl, SelectionDAG &DAG) const override;
     SDValue LowerReturn_32(SDValue Chain,
                            CallingConv::ID CallConv, bool IsVarArg,
                            const SmallVectorImpl<ISD::OutputArg> &Outs,
@@ -156,15 +156,15 @@ namespace llvm {
                              SDLoc DL,
                              SelectionDAG &DAG) const;
 
-    bool ShouldShrinkFPConstant(EVT VT) const {
+    bool ShouldShrinkFPConstant(EVT VT) const override {
       // Do not shrink FP constpool if VT == MVT::f128.
       // (ldd, call _Q_fdtoq) is more expensive than two ldds.
       return VT != MVT::f128;
     }
 
-    virtual void ReplaceNodeResults(SDNode *N,
+    void ReplaceNodeResults(SDNode *N,
                                     SmallVectorImpl<SDValue>& Results,
-                                    SelectionDAG &DAG) const;
+                                    SelectionDAG &DAG) const override;
 
     MachineBasicBlock *expandSelectCC(MachineInstr *MI, MachineBasicBlock *BB,
                                       unsigned BROpcode) const;
diff --git a/lib/Target/Sparc/SparcInstr64Bit.td b/lib/Target/Sparc/SparcInstr64Bit.td
index a34ce26..54d8240 100644
--- a/lib/Target/Sparc/SparcInstr64Bit.td
+++ b/lib/Target/Sparc/SparcInstr64Bit.td
@@ -359,9 +359,9 @@ multiclass BranchOnReg<bits<3> cond, string OpcStr> {
 
 multiclass bpr_alias<string OpcStr, Instruction NAPT, Instruction APT> {
   def : InstAlias<!strconcat(OpcStr, ",pt $rs1, $imm16"),
-                  (NAPT I64Regs:$rs1, bprtarget16:$imm16)>;
+                  (NAPT I64Regs:$rs1, bprtarget16:$imm16), 0>;
   def : InstAlias<!strconcat(OpcStr, ",a,pt $rs1, $imm16"),
-                  (APT I64Regs:$rs1, bprtarget16:$imm16)>;
+                  (APT I64Regs:$rs1, bprtarget16:$imm16), 0>;
 }
 
 defm BPZ   : BranchOnReg<0b001, "brz">;
diff --git a/lib/Target/Sparc/SparcInstrAliases.td b/lib/Target/Sparc/SparcInstrAliases.td
index 33c2aa1..d36f67b 100644
--- a/lib/Target/Sparc/SparcInstrAliases.td
+++ b/lib/Target/Sparc/SparcInstrAliases.td
@@ -281,12 +281,12 @@ defm : fp_cond_alias<"o",     0b1111>;
 // Instruction aliases for JMPL.
 
 // jmp addr -> jmpl addr, %g0
-def : InstAlias<"jmp $addr", (JMPLrr G0, MEMrr:$addr)>;
-def : InstAlias<"jmp $addr", (JMPLri G0, MEMri:$addr)>;
+def : InstAlias<"jmp $addr", (JMPLrr G0, MEMrr:$addr), 0>;
+def : InstAlias<"jmp $addr", (JMPLri G0, MEMri:$addr), 0>;
 
 // call addr -> jmpl addr, %o7
-def : InstAlias<"call $addr", (JMPLrr O7, MEMrr:$addr)>;
-def : InstAlias<"call $addr", (JMPLri O7, MEMri:$addr)>;
+def : InstAlias<"call $addr", (JMPLrr O7, MEMrr:$addr), 0>;
+def : InstAlias<"call $addr", (JMPLri O7, MEMri:$addr), 0>;
 
 // retl -> RETL 8
 def : InstAlias<"retl", (RETL 8)>;
diff --git a/lib/Target/Sparc/SparcInstrInfo.cpp b/lib/Target/Sparc/SparcInstrInfo.cpp
index abf6c17..8b2e6bc 100644
--- a/lib/Target/Sparc/SparcInstrInfo.cpp
+++ b/lib/Target/Sparc/SparcInstrInfo.cpp
@@ -24,11 +24,10 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/TargetRegistry.h"
 
-#define GET_INSTRINFO_CTOR_DTOR
-#include "SparcGenInstrInfo.inc"
-
 using namespace llvm;
 
+#define GET_INSTRINFO_CTOR_DTOR
+#include "SparcGenInstrInfo.inc"
 
 // Pin the vtable to this file.
 void SparcInstrInfo::anchor() {}
@@ -162,10 +161,10 @@ bool SparcInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
         std::next(I)->eraseFromParent();
 
       Cond.clear();
-      FBB = 0;
+      FBB = nullptr;
 
       if (MBB.isLayoutSuccessor(I->getOperand(0).getMBB())) {
-        TBB = 0;
+        TBB = nullptr;
         I->eraseFromParent();
         I = MBB.end();
         UnCondBrIter = MBB.end();
@@ -285,7 +284,7 @@ void SparcInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
                                  bool KillSrc) const {
   unsigned numSubRegs = 0;
   unsigned movOpc     = 0;
-  const unsigned *subRegIdx = 0;
+  const unsigned *subRegIdx = nullptr;
 
   const unsigned DFP_FP_SubRegsIdx[]  = { SP::sub_even, SP::sub_odd };
   const unsigned QFP_DFP_SubRegsIdx[] = { SP::sub_even64, SP::sub_odd64 };
@@ -329,11 +328,11 @@ void SparcInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
   } else
     llvm_unreachable("Impossible reg-to-reg copy");
 
-  if (numSubRegs == 0 || subRegIdx == 0 || movOpc == 0)
+  if (numSubRegs == 0 || subRegIdx == nullptr || movOpc == 0)
     return;
 
   const TargetRegisterInfo *TRI = &getRegisterInfo();
-  MachineInstr *MovMI = 0;
+  MachineInstr *MovMI = nullptr;
 
   for (unsigned i = 0; i != numSubRegs; ++i) {
     unsigned Dst = TRI->getSubReg(DestReg, subRegIdx[i]);
diff --git a/lib/Target/Sparc/SparcInstrInfo.h b/lib/Target/Sparc/SparcInstrInfo.h
index a86cbcb..3a1472e 100644
--- a/lib/Target/Sparc/SparcInstrInfo.h
+++ b/lib/Target/Sparc/SparcInstrInfo.h
@@ -45,52 +45,52 @@ public:
   /// such, whenever a client has an instance of instruction info, it should
   /// always be able to get register info as well (through this method).
   ///
-  virtual const SparcRegisterInfo &getRegisterInfo() const { return RI; }
+  const SparcRegisterInfo &getRegisterInfo() const { return RI; }
 
   /// isLoadFromStackSlot - If the specified machine instruction is a direct
   /// load from a stack slot, return the virtual or physical register number of
   /// the destination along with the FrameIndex of the loaded stack slot.  If
   /// not, return 0.  This predicate must return 0 if the instruction has
   /// any side effects other than loading from the stack slot.
-  virtual unsigned isLoadFromStackSlot(const MachineInstr *MI,
-                                       int &FrameIndex) const;
+  unsigned isLoadFromStackSlot(const MachineInstr *MI,
+                               int &FrameIndex) const override;
 
   /// isStoreToStackSlot - If the specified machine instruction is a direct
   /// store to a stack slot, return the virtual or physical register number of
   /// the source reg along with the FrameIndex of the loaded stack slot.  If
   /// not, return 0.  This predicate must return 0 if the instruction has
   /// any side effects other than storing to the stack slot.
-  virtual unsigned isStoreToStackSlot(const MachineInstr *MI,
-                                      int &FrameIndex) const;
-
-  virtual bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
-                             MachineBasicBlock *&FBB,
-                             SmallVectorImpl<MachineOperand> &Cond,
-                             bool AllowModify = false) const ;
-
-  virtual unsigned RemoveBranch(MachineBasicBlock &MBB) const;
-
-  virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
-                                MachineBasicBlock *FBB,
-                                const SmallVectorImpl<MachineOperand> &Cond,
-                                DebugLoc DL) const;
-
-  virtual void copyPhysReg(MachineBasicBlock &MBB,
-                           MachineBasicBlock::iterator I, DebugLoc DL,
-                           unsigned DestReg, unsigned SrcReg,
-                           bool KillSrc) const;
-
-  virtual void storeRegToStackSlot(MachineBasicBlock &MBB,
-                                   MachineBasicBlock::iterator MBBI,
-                                   unsigned SrcReg, bool isKill, int FrameIndex,
-                                   const TargetRegisterClass *RC,
-                                   const TargetRegisterInfo *TRI) const;
-
-  virtual void loadRegFromStackSlot(MachineBasicBlock &MBB,
-                                    MachineBasicBlock::iterator MBBI,
-                                    unsigned DestReg, int FrameIndex,
-                                    const TargetRegisterClass *RC,
-                                    const TargetRegisterInfo *TRI) const;
+  unsigned isStoreToStackSlot(const MachineInstr *MI,
+                              int &FrameIndex) const override;
+
+  bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+                     MachineBasicBlock *&FBB,
+                     SmallVectorImpl<MachineOperand> &Cond,
+                     bool AllowModify = false) const override ;
+
+  unsigned RemoveBranch(MachineBasicBlock &MBB) const override;
+
+  unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+                        MachineBasicBlock *FBB,
+                        const SmallVectorImpl<MachineOperand> &Cond,
+                        DebugLoc DL) const override;
+
+  void copyPhysReg(MachineBasicBlock &MBB,
+                   MachineBasicBlock::iterator I, DebugLoc DL,
+                   unsigned DestReg, unsigned SrcReg,
+                   bool KillSrc) const override;
+
+  void storeRegToStackSlot(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator MBBI,
+                           unsigned SrcReg, bool isKill, int FrameIndex,
+                           const TargetRegisterClass *RC,
+                           const TargetRegisterInfo *TRI) const override;
+
+  void loadRegFromStackSlot(MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator MBBI,
+                            unsigned DestReg, int FrameIndex,
+                            const TargetRegisterClass *RC,
+                            const TargetRegisterInfo *TRI) const override;
 
   unsigned getGlobalBaseReg(MachineFunction *MF) const;
 };
diff --git a/lib/Target/Sparc/SparcJITInfo.cpp b/lib/Target/Sparc/SparcJITInfo.cpp
index 959d12f..c775e9e 100644
--- a/lib/Target/Sparc/SparcJITInfo.cpp
+++ b/lib/Target/Sparc/SparcJITInfo.cpp
@@ -10,7 +10,6 @@
 // This file implements the JIT interfaces for the Sparc target.
 //
 //===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "jit"
 #include "SparcJITInfo.h"
 #include "Sparc.h"
 #include "SparcRelocations.h"
@@ -20,6 +19,8 @@
 
 using namespace llvm;
 
+#define DEBUG_TYPE "jit"
+
 /// JITCompilerFunction - This contains the address of the JIT function used to
 /// compile a function lazily.
 static TargetJITInfo::JITCompilerFn JITCompilerFunction;
diff --git a/lib/Target/Sparc/SparcJITInfo.h b/lib/Target/Sparc/SparcJITInfo.h
index 9c6e488..ff1b43a 100644
--- a/lib/Target/Sparc/SparcJITInfo.h
+++ b/lib/Target/Sparc/SparcJITInfo.h
@@ -34,27 +34,27 @@ class SparcJITInfo : public TargetJITInfo {
   /// overwriting OLD with a branch to NEW.  This is used for self-modifying
   /// code.
   ///
-  virtual void replaceMachineCodeForFunction(void *Old, void *New);
+  void replaceMachineCodeForFunction(void *Old, void *New) override;
 
   // getStubLayout - Returns the size and alignment of the largest call stub
   // on Sparc.
-  virtual StubLayout getStubLayout();
+  StubLayout getStubLayout() override;
 
 
   /// emitFunctionStub - Use the specified JITCodeEmitter object to emit a
   /// small native function that simply calls the function at the specified
   /// address.
-  virtual void *emitFunctionStub(const Function *F, void *Fn,
-                                 JITCodeEmitter &JCE);
+  void *emitFunctionStub(const Function *F, void *Fn,
+                         JITCodeEmitter &JCE) override;
 
   /// getLazyResolverFunction - Expose the lazy resolver to the JIT.
-  virtual LazyResolverFn getLazyResolverFunction(JITCompilerFn);
+  LazyResolverFn getLazyResolverFunction(JITCompilerFn) override;
 
   /// relocate - Before the JIT can run a block of code that has been emitted,
   /// it must rewrite the code to contain the actual addresses of any
   /// referenced global symbols.
-  virtual void relocate(void *Function, MachineRelocation *MR,
-                        unsigned NumRelocs, unsigned char *GOTBase);
+  void relocate(void *Function, MachineRelocation *MR,
+                unsigned NumRelocs, unsigned char *GOTBase) override;
 
   /// Initialize - Initialize internal stage for the function being JITted.
   void Initialize(const MachineFunction &MF, bool isPIC) {
diff --git a/lib/Target/Sparc/SparcMCInstLower.cpp b/lib/Target/Sparc/SparcMCInstLower.cpp
index 737e378..9e94d2c 100644
--- a/lib/Target/Sparc/SparcMCInstLower.cpp
+++ b/lib/Target/Sparc/SparcMCInstLower.cpp
@@ -34,7 +34,7 @@ static MCOperand LowerSymbolOperand(const MachineInstr *MI,
 
   SparcMCExpr::VariantKind Kind =
     (SparcMCExpr::VariantKind)MO.getTargetFlags();
-  const MCSymbol *Symbol = 0;
+  const MCSymbol *Symbol = nullptr;
 
   switch(MO.getType()) {
   default: llvm_unreachable("Unknown type in LowerSymbolOperand");
diff --git a/lib/Target/Sparc/SparcRegisterInfo.cpp b/lib/Target/Sparc/SparcRegisterInfo.cpp
index f222382..dc1ec7c 100644
--- a/lib/Target/Sparc/SparcRegisterInfo.cpp
+++ b/lib/Target/Sparc/SparcRegisterInfo.cpp
@@ -25,11 +25,11 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Target/TargetInstrInfo.h"
 
+using namespace llvm;
+
 #define GET_REGINFO_TARGET_DESC
 #include "SparcGenRegisterInfo.inc"
 
-using namespace llvm;
-
 static cl::opt<bool>
 ReserveAppRegisters("sparc-reserve-app-registers", cl::Hidden, cl::init(false),
                     cl::desc("Reserve application registers (%g2-%g4)"));
@@ -38,8 +38,8 @@ SparcRegisterInfo::SparcRegisterInfo(SparcSubtarget &st)
   : SparcGenRegisterInfo(SP::O7), Subtarget(st) {
 }
 
-const uint16_t* SparcRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF)
-                                                                         const {
+const MCPhysReg*
+SparcRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
   return CSR_SaveList;
 }
 
diff --git a/lib/Target/Sparc/SparcRegisterInfo.h b/lib/Target/Sparc/SparcRegisterInfo.h
index 00b5a98..77f879a 100644
--- a/lib/Target/Sparc/SparcRegisterInfo.h
+++ b/lib/Target/Sparc/SparcRegisterInfo.h
@@ -31,25 +31,26 @@ struct SparcRegisterInfo : public SparcGenRegisterInfo {
   SparcRegisterInfo(SparcSubtarget &st);
 
   /// Code Generation virtual methods...
-  const uint16_t *getCalleeSavedRegs(const MachineFunction *MF = 0) const;
-  const uint32_t* getCallPreservedMask(CallingConv::ID CC) const;
+  const MCPhysReg *
+  getCalleeSavedRegs(const MachineFunction *MF =nullptr) const override;
+  const uint32_t* getCallPreservedMask(CallingConv::ID CC) const override;
 
   const uint32_t* getRTCallPreservedMask(CallingConv::ID CC) const;
 
-  BitVector getReservedRegs(const MachineFunction &MF) const;
+  BitVector getReservedRegs(const MachineFunction &MF) const override;
 
   const TargetRegisterClass *getPointerRegClass(const MachineFunction &MF,
-                                                unsigned Kind) const;
+                                                unsigned Kind) const override;
 
   void eliminateFrameIndex(MachineBasicBlock::iterator II,
                            int SPAdj, unsigned FIOperandNum,
-                           RegScavenger *RS = NULL) const;
+                           RegScavenger *RS = nullptr) const override;
 
   void processFunctionBeforeFrameFinalized(MachineFunction &MF,
-                                       RegScavenger *RS = NULL) const;
+                                       RegScavenger *RS = nullptr) const;
 
   // Debug information queries.
-  unsigned getFrameRegister(const MachineFunction &MF) const;
+  unsigned getFrameRegister(const MachineFunction &MF) const override;
 };
 
 } // end namespace llvm
diff --git a/lib/Target/Sparc/SparcSelectionDAGInfo.cpp b/lib/Target/Sparc/SparcSelectionDAGInfo.cpp
index 190c575..eb36d29 100644
--- a/lib/Target/Sparc/SparcSelectionDAGInfo.cpp
+++ b/lib/Target/Sparc/SparcSelectionDAGInfo.cpp
@@ -11,10 +11,11 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "sparc-selectiondag-info"
 #include "SparcTargetMachine.h"
 using namespace llvm;
 
+#define DEBUG_TYPE "sparc-selectiondag-info"
+
 SparcSelectionDAGInfo::SparcSelectionDAGInfo(const SparcTargetMachine &TM)
   : TargetSelectionDAGInfo(TM) {
 }
diff --git a/lib/Target/Sparc/SparcSubtarget.cpp b/lib/Target/Sparc/SparcSubtarget.cpp
index 6fc9d56..e38fb02 100644
--- a/lib/Target/Sparc/SparcSubtarget.cpp
+++ b/lib/Target/Sparc/SparcSubtarget.cpp
@@ -16,12 +16,14 @@
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/TargetRegistry.h"
 
+using namespace llvm;
+
+#define DEBUG_TYPE "sparc-subtarget"
+
 #define GET_SUBTARGETINFO_TARGET_DESC
 #define GET_SUBTARGETINFO_CTOR
 #include "SparcGenSubtargetInfo.inc"
 
-using namespace llvm;
-
 void SparcSubtarget::anchor() { }
 
 SparcSubtarget::SparcSubtarget(const std::string &TT, const std::string &CPU,
diff --git a/lib/Target/Sparc/SparcTargetMachine.cpp b/lib/Target/Sparc/SparcTargetMachine.cpp
index 83f3474..2469d93 100644
--- a/lib/Target/Sparc/SparcTargetMachine.cpp
+++ b/lib/Target/Sparc/SparcTargetMachine.cpp
@@ -77,8 +77,8 @@ public:
     return getTM<SparcTargetMachine>();
   }
 
-  virtual bool addInstSelector();
-  virtual bool addPreEmitPass();
+  bool addInstSelector() override;
+  bool addPreEmitPass() override;
 };
 } // namespace
 
diff --git a/lib/Target/Sparc/SparcTargetMachine.h b/lib/Target/Sparc/SparcTargetMachine.h
index 8c9bcd3..7d04338 100644
--- a/lib/Target/Sparc/SparcTargetMachine.h
+++ b/lib/Target/Sparc/SparcTargetMachine.h
@@ -40,28 +40,28 @@ public:
                      Reloc::Model RM, CodeModel::Model CM,
                      CodeGenOpt::Level OL, bool is64bit);
 
-  virtual const SparcInstrInfo *getInstrInfo() const { return &InstrInfo; }
-  virtual const TargetFrameLowering  *getFrameLowering() const {
+  const SparcInstrInfo *getInstrInfo() const override { return &InstrInfo; }
+  const TargetFrameLowering  *getFrameLowering() const override {
     return &FrameLowering;
   }
-  virtual const SparcSubtarget   *getSubtargetImpl() const{ return &Subtarget; }
-  virtual const SparcRegisterInfo *getRegisterInfo() const {
+  const SparcSubtarget *getSubtargetImpl() const override { return &Subtarget; }
+  const SparcRegisterInfo *getRegisterInfo() const override {
     return &InstrInfo.getRegisterInfo();
   }
-  virtual const SparcTargetLowering* getTargetLowering() const {
+  const SparcTargetLowering* getTargetLowering() const override {
     return &TLInfo;
   }
-  virtual const SparcSelectionDAGInfo* getSelectionDAGInfo() const {
+  const SparcSelectionDAGInfo* getSelectionDAGInfo() const override {
     return &TSInfo;
   }
-  virtual SparcJITInfo *getJITInfo() {
+  SparcJITInfo *getJITInfo() override {
     return &JITInfo;
   }
-  virtual const DataLayout       *getDataLayout() const { return &DL; }
+  const DataLayout       *getDataLayout() const override { return &DL; }
 
   // Pass Pipeline Configuration
-  virtual TargetPassConfig *createPassConfig(PassManagerBase &PM);
-  virtual bool addCodeEmitter(PassManagerBase &PM, JITCodeEmitter &JCE);
+  TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
+  bool addCodeEmitter(PassManagerBase &PM, JITCodeEmitter &JCE) override;
 };
 
 /// SparcV8TargetMachine - Sparc 32-bit target machine
diff --git a/lib/Target/Sparc/SparcTargetObjectFile.cpp b/lib/Target/Sparc/SparcTargetObjectFile.cpp
index f1630e0..32b2240 100644
--- a/lib/Target/Sparc/SparcTargetObjectFile.cpp
+++ b/lib/Target/Sparc/SparcTargetObjectFile.cpp
@@ -28,7 +28,7 @@ const MCExpr *SparcELFTargetObjectFile::getTTypeGlobalReference(
     // Add information about the stub reference to ELFMMI so that the stub
     // gets emitted by the asmprinter.
     MachineModuleInfoImpl::StubValueTy &StubSym = ELFMMI.getGVStubEntry(SSym);
-    if (StubSym.getPointer() == 0) {
+    if (!StubSym.getPointer()) {
       MCSymbol *Sym = TM.getSymbol(GV, Mang);
       StubSym = MachineModuleInfoImpl::StubValueTy(Sym, !GV->hasLocalLinkage());
     }
diff --git a/lib/Target/Sparc/SparcTargetStreamer.h b/lib/Target/Sparc/SparcTargetStreamer.h
index 503ebd9..3767d8e 100644
--- a/lib/Target/Sparc/SparcTargetStreamer.h
+++ b/lib/Target/Sparc/SparcTargetStreamer.h
@@ -31,8 +31,8 @@ class SparcTargetAsmStreamer : public SparcTargetStreamer {
 
 public:
   SparcTargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS);
-  virtual void emitSparcRegisterIgnore(unsigned reg);
-  virtual void emitSparcRegisterScratch(unsigned reg);
+  void emitSparcRegisterIgnore(unsigned reg) override;
+  void emitSparcRegisterScratch(unsigned reg) override;
 
 };
 
@@ -41,8 +41,8 @@ class SparcTargetELFStreamer : public SparcTargetStreamer {
 public:
   SparcTargetELFStreamer(MCStreamer &S);
   MCELFStreamer &getStreamer();
-  virtual void emitSparcRegisterIgnore(unsigned reg) {}
-  virtual void emitSparcRegisterScratch(unsigned reg) {}
+  void emitSparcRegisterIgnore(unsigned reg) override {}
+  void emitSparcRegisterScratch(unsigned reg) override {}
 };
 } // end namespace llvm
 
diff --git a/lib/Target/SystemZ/AsmParser/LLVMBuild.txt b/lib/Target/SystemZ/AsmParser/LLVMBuild.txt
index 0b97e71..602898e 100644
--- a/lib/Target/SystemZ/AsmParser/LLVMBuild.txt
+++ b/lib/Target/SystemZ/AsmParser/LLVMBuild.txt
@@ -19,5 +19,5 @@
 type = Library
 name = SystemZAsmParser
 parent = SystemZ
-required_libraries = SystemZDesc SystemZInfo MC MCParser Support
+required_libraries = MC MCParser Support SystemZDesc SystemZInfo
 add_to_library_groups = SystemZ
diff --git a/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp b/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
index a3dd4b6..71de64f 100644
--- a/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
+++ b/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
@@ -110,7 +110,7 @@ private:
 
   void addExpr(MCInst &Inst, const MCExpr *Expr) const {
     // Add as immediates when possible.  Null MCExpr = 0.
-    if (Expr == 0)
+    if (!Expr)
       Inst.addOperand(MCOperand::CreateImm(0));
     else if (auto *CE = dyn_cast<MCConstantExpr>(Expr))
       Inst.addOperand(MCOperand::CreateImm(CE->getValue()));
@@ -208,7 +208,7 @@ public:
     return (Kind == KindMem &&
             Mem.RegKind == RegKind &&
             (MemKind == BDXMem || !Mem.Index) &&
-            (MemKind == BDLMem) == (Mem.Length != 0));
+            (MemKind == BDLMem) == (Mem.Length != nullptr));
   }
   bool isMemDisp12(RegisterKind RegKind, MemoryKind MemKind) const {
     return isMem(RegKind, MemKind) && inRange(Mem.Disp, 0, 0xfff);
@@ -331,7 +331,8 @@ private:
 
 public:
   SystemZAsmParser(MCSubtargetInfo &sti, MCAsmParser &parser,
-                   const MCInstrInfo &MII)
+                   const MCInstrInfo &MII,
+                   const MCTargetOptions &Options)
       : MCTargetAsmParser(), STI(sti), Parser(parser) {
     MCAsmParserExtension::Initialize(Parser);
 
@@ -526,7 +527,7 @@ bool SystemZAsmParser::parseAddress(unsigned &Base, const MCExpr *&Disp,
   // Parse the optional base and index.
   Index = 0;
   Base = 0;
-  Length = 0;
+  Length = nullptr;
   if (getLexer().is(AsmToken::LParen)) {
     Parser.Lex();
 
@@ -758,7 +759,7 @@ parseAccessReg(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
     return MatchOperand_NoMatch;
 
   Register Reg;
-  if (parseRegister(Reg, RegAccess, 0))
+  if (parseRegister(Reg, RegAccess, nullptr))
     return MatchOperand_ParseFail;
 
   Operands.push_back(SystemZOperand::createAccessReg(Reg.Num,
diff --git a/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp b/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp
index 59a1fe9..2350776 100644
--- a/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp
+++ b/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp
@@ -17,13 +17,15 @@
 
 using namespace llvm;
 
+#define DEBUG_TYPE "systemz-disassembler"
+
 typedef MCDisassembler::DecodeStatus DecodeStatus;
 
 namespace {
 class SystemZDisassembler : public MCDisassembler {
 public:
-  SystemZDisassembler(const MCSubtargetInfo &STI)
-    : MCDisassembler(STI) {}
+  SystemZDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx)
+    : MCDisassembler(STI, Ctx) {}
   virtual ~SystemZDisassembler() {}
 
   // Override MCDisassembler.
@@ -35,8 +37,9 @@ public:
 } // end anonymous namespace
 
 static MCDisassembler *createSystemZDisassembler(const Target &T,
-                                                 const MCSubtargetInfo &STI) {
-  return new SystemZDisassembler(STI);
+                                                 const MCSubtargetInfo &STI,
+                                                 MCContext &Ctx) {
+  return new SystemZDisassembler(STI, Ctx);
 }
 
 extern "C" void LLVMInitializeSystemZDisassembler() {
diff --git a/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.cpp b/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.cpp
index e1e64d3..d2ba9b6 100644
--- a/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.cpp
+++ b/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.cpp
@@ -7,8 +7,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "asm-printer"
-
 #include "SystemZInstPrinter.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInstrInfo.h"
@@ -16,6 +14,8 @@
 
 using namespace llvm;
 
+#define DEBUG_TYPE "asm-printer"
+
 #include "SystemZGenAsmWriter.inc"
 
 void SystemZInstPrinter::printAddress(unsigned Base, int64_t Disp,
diff --git a/lib/Target/SystemZ/MCTargetDesc/LLVMBuild.txt b/lib/Target/SystemZ/MCTargetDesc/LLVMBuild.txt
index cbdb59c..dabd214 100644
--- a/lib/Target/SystemZ/MCTargetDesc/LLVMBuild.txt
+++ b/lib/Target/SystemZ/MCTargetDesc/LLVMBuild.txt
@@ -19,5 +19,5 @@
 type = Library
 name = SystemZDesc
 parent = SystemZ
-required_libraries = MC SystemZAsmPrinter SystemZInfo Support
+required_libraries = MC Support SystemZAsmPrinter SystemZInfo
 add_to_library_groups = SystemZ
diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp b/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp
index df50863..27b4bd8 100644
--- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp
+++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp
@@ -11,7 +11,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "mccodeemitter"
 #include "MCTargetDesc/SystemZMCTargetDesc.h"
 #include "MCTargetDesc/SystemZMCFixups.h"
 #include "llvm/MC/MCCodeEmitter.h"
@@ -21,6 +20,8 @@
 
 using namespace llvm;
 
+#define DEBUG_TYPE "mccodeemitter"
+
 namespace {
 class SystemZMCCodeEmitter : public MCCodeEmitter {
   const MCInstrInfo &MCII;
diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCObjectWriter.cpp b/lib/Target/SystemZ/MCTargetDesc/SystemZMCObjectWriter.cpp
index 54c6987..c6a1816 100644
--- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCObjectWriter.cpp
+++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCObjectWriter.cpp
@@ -82,7 +82,7 @@ static unsigned getPLTReloc(unsigned Kind) {
 unsigned SystemZObjectWriter::GetRelocType(const MCValue &Target,
                                            const MCFixup &Fixup,
                                            bool IsPCRel) const {
-  MCSymbolRefExpr::VariantKind Modifier = Fixup.getAccessVariant();
+  MCSymbolRefExpr::VariantKind Modifier = Target.getAccessVariant();
   unsigned Kind = Fixup.getKind();
   switch (Modifier) {
   case MCSymbolRefExpr::VK_None:
diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp b/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp
index 8d1bac9..cc94869 100644
--- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp
+++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp
@@ -16,6 +16,8 @@
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/Support/TargetRegistry.h"
 
+using namespace llvm;
+
 #define GET_INSTRINFO_MC_DESC
 #include "SystemZGenInstrInfo.inc"
 
@@ -25,8 +27,6 @@
 #define GET_REGINFO_MC_DESC
 #include "SystemZGenRegisterInfo.inc"
 
-using namespace llvm;
-
 const unsigned SystemZMC::GR32Regs[16] = {
   SystemZ::R0L, SystemZ::R1L, SystemZ::R2L, SystemZ::R3L,
   SystemZ::R4L, SystemZ::R5L, SystemZ::R6L, SystemZ::R7L,
@@ -98,7 +98,8 @@ static MCAsmInfo *createSystemZMCAsmInfo(const MCRegisterInfo &MRI,
                                          StringRef TT) {
   MCAsmInfo *MAI = new SystemZMCAsmInfo(TT);
   MCCFIInstruction Inst =
-      MCCFIInstruction::createDefCfa(0, MRI.getDwarfRegNum(SystemZ::R15D, true),
+      MCCFIInstruction::createDefCfa(nullptr,
+                                     MRI.getDwarfRegNum(SystemZ::R15D, true),
                                      SystemZMC::CFAOffsetFromInitialSP);
   MAI->addInitialFrameState(Inst);
   return MAI;
diff --git a/lib/Target/SystemZ/SystemZElimCompare.cpp b/lib/Target/SystemZ/SystemZElimCompare.cpp
index fdf80a9..dc210d6 100644
--- a/lib/Target/SystemZ/SystemZElimCompare.cpp
+++ b/lib/Target/SystemZ/SystemZElimCompare.cpp
@@ -13,8 +13,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "systemz-elim-compare"
-
 #include "SystemZTargetMachine.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
@@ -28,6 +26,8 @@
 
 using namespace llvm;
 
+#define DEBUG_TYPE "systemz-elim-compare"
+
 STATISTIC(BranchOnCounts, "Number of branch-on-count instructions");
 STATISTIC(EliminatedComparisons, "Number of eliminated comparisons");
 STATISTIC(FusedComparisons, "Number of fused compare-and-branch instructions");
@@ -64,14 +64,14 @@ class SystemZElimCompare : public MachineFunctionPass {
 public:
   static char ID;
   SystemZElimCompare(const SystemZTargetMachine &tm)
-    : MachineFunctionPass(ID), TII(0), TRI(0) {}
+    : MachineFunctionPass(ID), TII(nullptr), TRI(nullptr) {}
 
   const char *getPassName() const override {
     return "SystemZ Comparison Elimination";
   }
 
   bool processBlock(MachineBasicBlock &MBB);
-  bool runOnMachineFunction(MachineFunction &F);
+  bool runOnMachineFunction(MachineFunction &F) override;
 
 private:
   Reference getRegReferences(MachineInstr *MI, unsigned Reg);
diff --git a/lib/Target/SystemZ/SystemZFrameLowering.cpp b/lib/Target/SystemZ/SystemZFrameLowering.cpp
index c856955..65f3caf 100644
--- a/lib/Target/SystemZ/SystemZFrameLowering.cpp
+++ b/lib/Target/SystemZ/SystemZFrameLowering.cpp
@@ -93,7 +93,7 @@ processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
   // save and restore the stack pointer at the same time, via STMG and LMG.
   // This allows the deallocation to be done by the LMG, rather than needing
   // a separate %r15 addition.
-  const uint16_t *CSRegs = TRI->getCalleeSavedRegs(&MF);
+  const MCPhysReg *CSRegs = TRI->getCalleeSavedRegs(&MF);
   for (unsigned I = 0; CSRegs[I]; ++I) {
     unsigned Reg = CSRegs[I];
     if (SystemZ::GR64BitRegClass.contains(Reg) && MRI.isPhysRegUsed(Reg)) {
diff --git a/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp b/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
index f46eb16..24f7584 100644
--- a/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
+++ b/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
@@ -19,6 +19,8 @@
 
 using namespace llvm;
 
+#define DEBUG_TYPE "systemz-isel"
+
 namespace {
 // Used to build addressing modes.
 struct SystemZAddressingMode {
@@ -72,14 +74,14 @@ struct SystemZAddressingMode {
     errs() << "SystemZAddressingMode " << this << '\n';
 
     errs() << " Base ";
-    if (Base.getNode() != 0)
+    if (Base.getNode())
       Base.getNode()->dump();
     else
       errs() << "null\n";
 
     if (hasIndexField()) {
       errs() << " Index ";
-      if (Index.getNode() != 0)
+      if (Index.getNode())
         Index.getNode()->dump();
       else
         errs() << "null\n";
@@ -663,7 +665,7 @@ bool SystemZDAGToDAGISel::detectOrAndInsertion(SDValue &Op,
   uint64_t Used = allOnes(Op.getValueType().getSizeInBits());
   if (Used != (AndMask | InsertMask)) {
     APInt KnownZero, KnownOne;
-    CurDAG->ComputeMaskedBits(Op.getOperand(0), KnownZero, KnownOne);
+    CurDAG->computeKnownBits(Op.getOperand(0), KnownZero, KnownOne);
     if (Used != (AndMask | InsertMask | KnownZero.getZExtValue()))
       return false;
   }
@@ -712,7 +714,7 @@ bool SystemZDAGToDAGISel::expandRxSBG(RxSBGOperands &RxSBG) const {
       // been removed from the mask.  See if adding them back in makes the
       // mask suitable.
       APInt KnownZero, KnownOne;
-      CurDAG->ComputeMaskedBits(Input, KnownZero, KnownOne);
+      CurDAG->computeKnownBits(Input, KnownZero, KnownOne);
       Mask |= KnownZero.getZExtValue();
       if (!refineRxSBGMask(RxSBG, Mask))
         return false;
@@ -736,7 +738,7 @@ bool SystemZDAGToDAGISel::expandRxSBG(RxSBGOperands &RxSBG) const {
       // been removed from the mask.  See if adding them back in makes the
       // mask suitable.
       APInt KnownZero, KnownOne;
-      CurDAG->ComputeMaskedBits(Input, KnownZero, KnownOne);
+      CurDAG->computeKnownBits(Input, KnownZero, KnownOne);
       Mask &= ~KnownOne.getZExtValue();
       if (!refineRxSBGMask(RxSBG, Mask))
         return false;
@@ -867,12 +869,12 @@ SDNode *SystemZDAGToDAGISel::tryRISBGZero(SDNode *N) {
     if (RISBG.Input.getOpcode() != ISD::ANY_EXTEND)
       Count += 1;
   if (Count == 0)
-    return 0;
+    return nullptr;
   if (Count == 1) {
     // Prefer to use normal shift instructions over RISBG, since they can handle
     // all cases and are sometimes shorter.
     if (N->getOpcode() != ISD::AND)
-      return 0;
+      return nullptr;
 
     // Prefer register extensions like LLC over RISBG.  Also prefer to start
     // out with normal ANDs if one instruction would be enough.  We can convert
@@ -889,7 +891,7 @@ SDNode *SystemZDAGToDAGISel::tryRISBGZero(SDNode *N) {
         N = CurDAG->UpdateNodeOperands(N, N->getOperand(0), NewMask);
         return SelectCode(N);
       }
-      return 0;
+      return nullptr;
     }
   }  
 
@@ -927,7 +929,7 @@ SDNode *SystemZDAGToDAGISel::tryRxSBG(SDNode *N, unsigned Opcode) {
 
   // Do nothing if neither operand is suitable.
   if (Count[0] == 0 && Count[1] == 0)
-    return 0;
+    return nullptr;
 
   // Pick the deepest second operand.
   unsigned I = Count[0] > Count[1] ? 0 : 1;
@@ -937,7 +939,7 @@ SDNode *SystemZDAGToDAGISel::tryRxSBG(SDNode *N, unsigned Opcode) {
   if (Opcode == SystemZ::ROSBG && (RxSBG[I].Mask & 0xff) == 0)
     if (auto *Load = dyn_cast<LoadSDNode>(Op0.getNode()))
       if (Load->getMemoryVT() == MVT::i8)
-        return 0;
+        return nullptr;
 
   // See whether we can avoid an AND in the first operand by converting
   // ROSBG to RISBG.
@@ -986,8 +988,8 @@ bool SystemZDAGToDAGISel::canUseBlockOperation(StoreSDNode *Store,
     return true;
 
   // Otherwise we need to check whether there's an alias.
-  const Value *V1 = Load->getSrcValue();
-  const Value *V2 = Store->getSrcValue();
+  const Value *V1 = Load->getMemOperand()->getValue();
+  const Value *V2 = Store->getMemOperand()->getValue();
   if (!V1 || !V2)
     return false;
 
@@ -1037,11 +1039,11 @@ SDNode *SystemZDAGToDAGISel::Select(SDNode *Node) {
   if (Node->isMachineOpcode()) {
     DEBUG(errs() << "== "; Node->dump(CurDAG); errs() << "\n");
     Node->setNodeId(-1);
-    return 0;
+    return nullptr;
   }
 
   unsigned Opcode = Node->getOpcode();
-  SDNode *ResNode = 0;
+  SDNode *ResNode = nullptr;
   switch (Opcode) {
   case ISD::OR:
     if (Node->getOperand(1).getOpcode() != ISD::Constant)
@@ -1114,7 +1116,7 @@ SDNode *SystemZDAGToDAGISel::Select(SDNode *Node) {
     ResNode = SelectCode(Node);
 
   DEBUG(errs() << "=> ";
-        if (ResNode == NULL || ResNode == Node)
+        if (ResNode == nullptr || ResNode == Node)
           Node->dump(CurDAG);
         else
           ResNode->dump(CurDAG);
diff --git a/lib/Target/SystemZ/SystemZISelLowering.cpp b/lib/Target/SystemZ/SystemZISelLowering.cpp
index 714b6c9..6fe1fb9 100644
--- a/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -11,8 +11,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "systemz-lower"
-
 #include "SystemZISelLowering.h"
 #include "SystemZCallingConv.h"
 #include "SystemZConstantPoolValue.h"
@@ -26,6 +24,8 @@
 
 using namespace llvm;
 
+#define DEBUG_TYPE "systemz-lower"
+
 namespace {
 // Represents a sequence for extracting a 0/1 value from an IPM result:
 // (((X ^ XORValue) + AddValue) >> Bit)
@@ -424,7 +424,7 @@ getSingleConstraintMatchWeight(AsmOperandInfo &info,
   Value *CallOperandVal = info.CallOperandVal;
   // If we don't have a value, we can't do a match,
   // but allow it at the lowest weight.
-  if (CallOperandVal == NULL)
+  if (!CallOperandVal)
     return CW_Default;
   Type *type = CallOperandVal->getType();
   // Look at the constraint type.
@@ -492,7 +492,7 @@ parseRegisterNumber(const std::string &Constraint,
     if (Index < 16 && Map[Index])
       return std::make_pair(Map[Index], RC);
   }
-  return std::make_pair(0u, static_cast<TargetRegisterClass*>(0));
+  return std::make_pair(0U, nullptr);
 }
 
 std::pair<unsigned, const TargetRegisterClass *> SystemZTargetLowering::
@@ -772,8 +772,8 @@ LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
       }
       // Join the stores, which are independent of one another.
       Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
-                          &MemOps[NumFixedFPRs],
-                          SystemZ::NumArgFPRs - NumFixedFPRs);
+                          makeArrayRef(&MemOps[NumFixedFPRs],
+                                       SystemZ::NumArgFPRs-NumFixedFPRs));
     }
   }
 
@@ -875,8 +875,7 @@ SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI,
 
   // Join the stores, which are independent of one another.
   if (!MemOpChains.empty())
-    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
-                        &MemOpChains[0], MemOpChains.size());
+    Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
 
   // Accept direct calls by converting symbolic call addresses to the
   // associated Target* opcodes.  Force %r1 to be used for indirect
@@ -919,8 +918,8 @@ SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI,
   // Emit the call.
   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
   if (IsTailCall)
-    return DAG.getNode(SystemZISD::SIBCALL, DL, NodeTys, &Ops[0], Ops.size());
-  Chain = DAG.getNode(SystemZISD::CALL, DL, NodeTys, &Ops[0], Ops.size());
+    return DAG.getNode(SystemZISD::SIBCALL, DL, NodeTys, Ops);
+  Chain = DAG.getNode(SystemZISD::CALL, DL, NodeTys, Ops);
   Glue = Chain.getValue(1);
 
   // Mark the end of the call, which is glued to the call itself.
@@ -996,8 +995,7 @@ SystemZTargetLowering::LowerReturn(SDValue Chain,
   if (Glue.getNode())
     RetOps.push_back(Glue);
 
-  return DAG.getNode(SystemZISD::RET_FLAG, DL, MVT::Other,
-                     RetOps.data(), RetOps.size());
+  return DAG.getNode(SystemZISD::RET_FLAG, DL, MVT::Other, RetOps);
 }
 
 SDValue SystemZTargetLowering::
@@ -1489,7 +1487,7 @@ static void adjustForTestUnderMask(SelectionDAG &DAG, Comparison &C) {
   // Check whether the nonconstant input is an AND with a constant mask.
   Comparison NewC(C);
   uint64_t MaskVal;
-  ConstantSDNode *Mask = 0;
+  ConstantSDNode *Mask = nullptr;
   if (C.Op0.getOpcode() == ISD::AND) {
     NewC.Op0 = C.Op0.getOperand(0);
     NewC.Op1 = C.Op0.getOperand(1);
@@ -1779,7 +1777,7 @@ SDValue SystemZTargetLowering::lowerSELECT_CC(SDValue Op,
   Ops.push_back(Glue);
 
   SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
-  return DAG.getNode(SystemZISD::SELECT_CCMASK, DL, VTs, &Ops[0], Ops.size());
+  return DAG.getNode(SystemZISD::SELECT_CCMASK, DL, VTs, Ops);
 }
 
 SDValue SystemZTargetLowering::lowerGlobalAddress(GlobalAddressSDNode *Node,
@@ -1971,7 +1969,7 @@ SDValue SystemZTargetLowering::lowerVASTART(SDValue Op,
                              false, false, 0);
     Offset += 8;
   }
-  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps, NumFields);
+  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
 }
 
 SDValue SystemZTargetLowering::lowerVACOPY(SDValue Op,
@@ -2012,7 +2010,7 @@ lowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const {
   SDValue Result = DAG.getNode(ISD::ADD, DL, MVT::i64, NewSP, ArgAdjust);
 
   SDValue Ops[2] = { Result, Chain };
-  return DAG.getMergeValues(Ops, 2, DL);
+  return DAG.getMergeValues(Ops, DL);
 }
 
 SDValue SystemZTargetLowering::lowerSMUL_LOHI(SDValue Op,
@@ -2054,7 +2052,7 @@ SDValue SystemZTargetLowering::lowerSMUL_LOHI(SDValue Op,
     SDValue NegSum = DAG.getNode(ISD::ADD, DL, VT, NegLLTimesRH, NegLHTimesRL);
     Ops[1] = DAG.getNode(ISD::SUB, DL, VT, Ops[1], NegSum);
   }
-  return DAG.getMergeValues(Ops, 2, DL);
+  return DAG.getMergeValues(Ops, DL);
 }
 
 SDValue SystemZTargetLowering::lowerUMUL_LOHI(SDValue Op,
@@ -2073,7 +2071,7 @@ SDValue SystemZTargetLowering::lowerUMUL_LOHI(SDValue Op,
     // low half first, so the results are in reverse order.
     lowerGR128Binary(DAG, DL, VT, SystemZ::AEXT128_64, SystemZISD::UMUL_LOHI64,
                      Op.getOperand(0), Op.getOperand(1), Ops[1], Ops[0]);
-  return DAG.getMergeValues(Ops, 2, DL);
+  return DAG.getMergeValues(Ops, DL);
 }
 
 SDValue SystemZTargetLowering::lowerSDIVREM(SDValue Op,
@@ -2100,7 +2098,7 @@ SDValue SystemZTargetLowering::lowerSDIVREM(SDValue Op,
   SDValue Ops[2];
   lowerGR128Binary(DAG, DL, VT, SystemZ::AEXT128_64, Opcode,
                    Op0, Op1, Ops[1], Ops[0]);
-  return DAG.getMergeValues(Ops, 2, DL);
+  return DAG.getMergeValues(Ops, DL);
 }
 
 SDValue SystemZTargetLowering::lowerUDIVREM(SDValue Op,
@@ -2118,7 +2116,7 @@ SDValue SystemZTargetLowering::lowerUDIVREM(SDValue Op,
   else
     lowerGR128Binary(DAG, DL, VT, SystemZ::ZEXT128_64, SystemZISD::UDIVREM64,
                      Op.getOperand(0), Op.getOperand(1), Ops[1], Ops[0]);
-  return DAG.getMergeValues(Ops, 2, DL);
+  return DAG.getMergeValues(Ops, DL);
 }
 
 SDValue SystemZTargetLowering::lowerOR(SDValue Op, SelectionDAG &DAG) const {
@@ -2127,8 +2125,8 @@ SDValue SystemZTargetLowering::lowerOR(SDValue Op, SelectionDAG &DAG) const {
   // Get the known-zero masks for each operand.
   SDValue Ops[] = { Op.getOperand(0), Op.getOperand(1) };
   APInt KnownZero[2], KnownOne[2];
-  DAG.ComputeMaskedBits(Ops[0], KnownZero[0], KnownOne[0]);
-  DAG.ComputeMaskedBits(Ops[1], KnownZero[1], KnownOne[1]);
+  DAG.computeKnownBits(Ops[0], KnownZero[0], KnownOne[0]);
+  DAG.computeKnownBits(Ops[1], KnownZero[1], KnownOne[1]);
 
   // See if the upper 32 bits of one operand and the lower 32 bits of the
   // other are known zero.  They are the low and high operands respectively.
@@ -2259,7 +2257,6 @@ SDValue SystemZTargetLowering::lowerATOMIC_LOAD_OP(SDValue Op,
   SDValue Ops[] = { ChainIn, AlignedAddr, Src2, BitShift, NegBitShift,
                     DAG.getConstant(BitSize, WideVT) };
   SDValue AtomicOp = DAG.getMemIntrinsicNode(Opcode, DL, VTList, Ops,
-                                             array_lengthof(Ops),
                                              NarrowVT, MMO);
 
   // Rotate the result of the final CS so that the field is in the lower
@@ -2269,7 +2266,7 @@ SDValue SystemZTargetLowering::lowerATOMIC_LOAD_OP(SDValue Op,
   SDValue Result = DAG.getNode(ISD::ROTL, DL, WideVT, AtomicOp, ResultShift);
 
   SDValue RetOps[2] = { Result, AtomicOp.getValue(1) };
-  return DAG.getMergeValues(RetOps, 2, DL);
+  return DAG.getMergeValues(RetOps, DL);
 }
 
 // Op is an ATOMIC_LOAD_SUB operation.  Lower 8- and 16-bit operations
@@ -2351,8 +2348,7 @@ SDValue SystemZTargetLowering::lowerATOMIC_CMP_SWAP(SDValue Op,
   SDValue Ops[] = { ChainIn, AlignedAddr, CmpVal, SwapVal, BitShift,
                     NegBitShift, DAG.getConstant(BitSize, WideVT) };
   SDValue AtomicOp = DAG.getMemIntrinsicNode(SystemZISD::ATOMIC_CMP_SWAPW, DL,
-                                             VTList, Ops, array_lengthof(Ops),
-                                             NarrowVT, MMO);
+                                             VTList, Ops, NarrowVT, MMO);
   return AtomicOp;
 }
 
@@ -2388,7 +2384,7 @@ SDValue SystemZTargetLowering::lowerPREFETCH(SDValue Op,
     Op.getOperand(1)
   };
   return DAG.getMemIntrinsicNode(SystemZISD::PREFETCH, SDLoc(Op),
-                                 Node->getVTList(), Ops, array_lengthof(Ops),
+                                 Node->getVTList(), Ops,
                                  Node->getMemoryVT(), Node->getMemOperand());
 }
 
@@ -2517,7 +2513,7 @@ const char *SystemZTargetLowering::getTargetNodeName(unsigned Opcode) const {
     OPCODE(ATOMIC_CMP_SWAPW);
     OPCODE(PREFETCH);
   }
-  return NULL;
+  return nullptr;
 #undef OPCODE
 }
 
@@ -3116,7 +3112,7 @@ SystemZTargetLowering::emitMemMemWrapper(MachineInstr *MI,
   // When generating more than one CLC, all but the last will need to
   // branch to the end when a difference is found.
   MachineBasicBlock *EndMBB = (Length > 256 && Opcode == SystemZ::CLC ?
-                               splitBlockAfter(MI, MBB) : 0);
+                               splitBlockAfter(MI, MBB) : nullptr);
 
   // Check for the loop form, in which operand 5 is the trip count.
   if (MI->getNumExplicitOperands() > 5) {
diff --git a/lib/Target/SystemZ/SystemZInstrFormats.td b/lib/Target/SystemZ/SystemZInstrFormats.td
index 50badf8..add675a 100644
--- a/lib/Target/SystemZ/SystemZInstrFormats.td
+++ b/lib/Target/SystemZ/SystemZInstrFormats.td
@@ -516,7 +516,7 @@ class InstSS<bits<8> op, dag outs, dag ins, string asmstr, list<dag> pattern>
 //
 //   Binary:
 //     One register output operand and two input operands.  The first
-//     input operand is always a register and he second may be a register,
+//     input operand is always a register and the second may be a register,
 //     immediate or memory.
 //
 //   Shift:
diff --git a/lib/Target/SystemZ/SystemZInstrInfo.cpp b/lib/Target/SystemZ/SystemZInstrInfo.cpp
index e20834c..6a18b2d 100644
--- a/lib/Target/SystemZ/SystemZInstrInfo.cpp
+++ b/lib/Target/SystemZ/SystemZInstrInfo.cpp
@@ -17,12 +17,12 @@
 #include "llvm/CodeGen/LiveVariables.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 
+using namespace llvm;
+
 #define GET_INSTRINFO_CTOR_DTOR
 #define GET_INSTRMAP_INFO
 #include "SystemZGenInstrInfo.inc"
 
-using namespace llvm;
-
 // Return a mask with Count low bits set.
 static uint64_t allOnes(unsigned int Count) {
   return Count == 0 ? 0 : (uint64_t(1) << (Count - 1) << 1) - 1;
@@ -284,11 +284,11 @@ bool SystemZInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
         std::next(I)->eraseFromParent();
 
       Cond.clear();
-      FBB = 0;
+      FBB = nullptr;
 
       // Delete the JMP if it's equivalent to a fall-through.
       if (MBB.isLayoutSuccessor(Branch.Target->getMBB())) {
-        TBB = 0;
+        TBB = nullptr;
         I->eraseFromParent();
         I = MBB.end();
         continue;
@@ -418,7 +418,7 @@ bool SystemZInstrInfo::analyzeCompare(const MachineInstr *MI,
 static MachineInstr *getDef(unsigned Reg,
                             const MachineRegisterInfo *MRI) {
   if (TargetRegisterInfo::isPhysicalRegister(Reg))
-    return 0;
+    return nullptr;
   return MRI->getUniqueVRegDef(Reg);
 }
 
@@ -442,7 +442,7 @@ static void eraseIfDead(MachineInstr *MI, const MachineRegisterInfo *MRI) {
 static bool removeIPMBasedCompare(MachineInstr *Compare, unsigned SrcReg,
                                   const MachineRegisterInfo *MRI,
                                   const TargetRegisterInfo *TRI) {
-  MachineInstr *LGFR = 0;
+  MachineInstr *LGFR = nullptr;
   MachineInstr *RLL = getDef(SrcReg, MRI);
   if (RLL && RLL->getOpcode() == SystemZ::LGFR) {
     LGFR = RLL;
@@ -542,7 +542,7 @@ PredicateInstruction(MachineInstr *MI,
       MI->setDesc(get(CondOpcode));
       MachineInstrBuilder(*MI->getParent()->getParent(), MI)
         .addImm(CCValid).addImm(CCMask)
-        .addReg(SystemZ::CC, RegState::Implicit);;
+        .addReg(SystemZ::CC, RegState::Implicit);
       return true;
     }
   }
@@ -740,7 +740,7 @@ SystemZInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
       return finishConvertToThreeAddress(MI, MIB, LV);
     }
   }
-  return 0;
+  return nullptr;
 }
 
 MachineInstr *
@@ -761,12 +761,12 @@ SystemZInstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
         .addFrameIndex(FrameIndex).addImm(0)
         .addImm(MI->getOperand(2).getImm());
     }
-    return 0;
+    return nullptr;
   }
 
   // All other cases require a single operand.
   if (Ops.size() != 1)
-    return 0;
+    return nullptr;
 
   unsigned OpNum = Ops[0];
   assert(Size == MF.getRegInfo()
@@ -858,14 +858,14 @@ SystemZInstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
     }
   }
 
-  return 0;
+  return nullptr;
 }
 
 MachineInstr *
 SystemZInstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr* MI,
                                         const SmallVectorImpl<unsigned> &Ops,
                                         MachineInstr* LoadMI) const {
-  return 0;
+  return nullptr;
 }
 
 bool
diff --git a/lib/Target/SystemZ/SystemZInstrInfo.h b/lib/Target/SystemZ/SystemZInstrInfo.h
index 55f80af..09aee5d 100644
--- a/lib/Target/SystemZ/SystemZInstrInfo.h
+++ b/lib/Target/SystemZ/SystemZInstrInfo.h
@@ -229,7 +229,7 @@ public:
   // BRANCH exists, return the opcode for the latter, otherwise return 0.
   // MI, if nonnull, is the compare instruction.
   unsigned getCompareAndBranch(unsigned Opcode,
-                               const MachineInstr *MI = 0) const;
+                               const MachineInstr *MI = nullptr) const;
 
   // Emit code before MBBI in MI to move immediate value Value into
   // physical register Reg.
diff --git a/lib/Target/SystemZ/SystemZLongBranch.cpp b/lib/Target/SystemZ/SystemZLongBranch.cpp
index 1b88d06..8081334 100644
--- a/lib/Target/SystemZ/SystemZLongBranch.cpp
+++ b/lib/Target/SystemZ/SystemZLongBranch.cpp
@@ -53,8 +53,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "systemz-long-branch"
-
 #include "SystemZTargetMachine.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
@@ -68,6 +66,8 @@
 
 using namespace llvm;
 
+#define DEBUG_TYPE "systemz-long-branch"
+
 STATISTIC(LongBranches, "Number of long branches.");
 
 namespace {
@@ -111,7 +111,8 @@ struct TerminatorInfo {
   // otherwise it is zero.
   unsigned ExtraRelaxSize;
 
-  TerminatorInfo() : Branch(0), Size(0), TargetBlock(0), ExtraRelaxSize(0) {}
+  TerminatorInfo() : Branch(nullptr), Size(0), TargetBlock(0),
+                     ExtraRelaxSize(0) {}
 };
 
 // Used to keep track of the current position while iterating over the blocks.
@@ -131,13 +132,13 @@ class SystemZLongBranch : public MachineFunctionPass {
 public:
   static char ID;
   SystemZLongBranch(const SystemZTargetMachine &tm)
-    : MachineFunctionPass(ID), TII(0) {}
+    : MachineFunctionPass(ID), TII(nullptr) {}
 
   const char *getPassName() const override {
     return "SystemZ Long Branch";
   }
 
-  bool runOnMachineFunction(MachineFunction &F);
+  bool runOnMachineFunction(MachineFunction &F) override;
 
 private:
   void skipNonTerminators(BlockPosition &Position, MBBInfo &Block);
@@ -424,7 +425,7 @@ void SystemZLongBranch::relaxBranch(TerminatorInfo &Terminator) {
 
   Terminator.Size += Terminator.ExtraRelaxSize;
   Terminator.ExtraRelaxSize = 0;
-  Terminator.Branch = 0;
+  Terminator.Branch = nullptr;
 
   ++LongBranches;
 }
diff --git a/lib/Target/SystemZ/SystemZRegisterInfo.cpp b/lib/Target/SystemZ/SystemZRegisterInfo.cpp
index 1ac4e32..a04d703 100644
--- a/lib/Target/SystemZ/SystemZRegisterInfo.cpp
+++ b/lib/Target/SystemZ/SystemZRegisterInfo.cpp
@@ -12,17 +12,17 @@
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 
+using namespace llvm;
+
 #define GET_REGINFO_TARGET_DESC
 #include "SystemZGenRegisterInfo.inc"
 
-using namespace llvm;
-
 SystemZRegisterInfo::SystemZRegisterInfo(SystemZTargetMachine &tm)
   : SystemZGenRegisterInfo(SystemZ::R14D), TM(tm) {}
 
-const uint16_t*
+const MCPhysReg*
 SystemZRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
-  static const uint16_t CalleeSavedRegs[] = {
+  static const MCPhysReg CalleeSavedRegs[] = {
     SystemZ::R6D,  SystemZ::R7D,  SystemZ::R8D,  SystemZ::R9D,
     SystemZ::R10D, SystemZ::R11D, SystemZ::R12D, SystemZ::R13D,
     SystemZ::R14D, SystemZ::R15D,
diff --git a/lib/Target/SystemZ/SystemZRegisterInfo.h b/lib/Target/SystemZ/SystemZRegisterInfo.h
index 4ad8048..e236f71 100644
--- a/lib/Target/SystemZ/SystemZRegisterInfo.h
+++ b/lib/Target/SystemZ/SystemZRegisterInfo.h
@@ -49,7 +49,7 @@ public:
   bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const override {
     return true;
   }
-  const uint16_t *getCalleeSavedRegs(const MachineFunction *MF = 0) const
+  const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF = nullptr) const
     override;
   BitVector getReservedRegs(const MachineFunction &MF) const override;
   void eliminateFrameIndex(MachineBasicBlock::iterator MI,
diff --git a/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp b/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp
index 7635bdc..97abee3 100644
--- a/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp
+++ b/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp
@@ -11,12 +11,13 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "systemz-selectiondag-info"
 #include "SystemZTargetMachine.h"
 #include "llvm/CodeGen/SelectionDAG.h"
 
 using namespace llvm;
 
+#define DEBUG_TYPE "systemz-selectiondag-info"
+
 SystemZSelectionDAGInfo::
 SystemZSelectionDAGInfo(const SystemZTargetMachine &TM)
   : TargetSelectionDAGInfo(TM) {
@@ -230,7 +231,7 @@ EmitTargetCodeForMemchr(SelectionDAG &DAG, SDLoc DL, SDValue Chain,
   Ops.push_back(DAG.getConstant(SystemZ::CCMASK_SRST_FOUND, MVT::i32));
   Ops.push_back(Glue);
   VTs = DAG.getVTList(PtrVT, MVT::Glue);
-  End = DAG.getNode(SystemZISD::SELECT_CCMASK, DL, VTs, &Ops[0], Ops.size());
+  End = DAG.getNode(SystemZISD::SELECT_CCMASK, DL, VTs, Ops);
   return std::make_pair(End, Chain);
 }
 
diff --git a/lib/Target/SystemZ/SystemZShortenInst.cpp b/lib/Target/SystemZ/SystemZShortenInst.cpp
index 9350779..aad899c 100644
--- a/lib/Target/SystemZ/SystemZShortenInst.cpp
+++ b/lib/Target/SystemZ/SystemZShortenInst.cpp
@@ -13,13 +13,13 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "systemz-shorten-inst"
-
 #include "SystemZTargetMachine.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 
 using namespace llvm;
 
+#define DEBUG_TYPE "systemz-shorten-inst"
+
 namespace {
 class SystemZShortenInst : public MachineFunctionPass {
 public:
@@ -31,7 +31,7 @@ public:
   }
 
   bool processBlock(MachineBasicBlock &MBB);
-  bool runOnMachineFunction(MachineFunction &F);
+  bool runOnMachineFunction(MachineFunction &F) override;
 
 private:
   bool shortenIIF(MachineInstr &MI, unsigned *GPRMap, unsigned LiveOther,
@@ -53,7 +53,7 @@ FunctionPass *llvm::createSystemZShortenInstPass(SystemZTargetMachine &TM) {
 }
 
 SystemZShortenInst::SystemZShortenInst(const SystemZTargetMachine &tm)
-  : MachineFunctionPass(ID), TII(0), LowGPRs(), HighGPRs() {
+  : MachineFunctionPass(ID), TII(nullptr), LowGPRs(), HighGPRs() {
   // Set up LowGPRs and HighGPRs.
   for (unsigned I = 0; I < 16; ++I) {
     LowGPRs[SystemZMC::GR32Regs[I]] |= 1 << I;
diff --git a/lib/Target/SystemZ/SystemZSubtarget.cpp b/lib/Target/SystemZ/SystemZSubtarget.cpp
index 33d7e06..a011157 100644
--- a/lib/Target/SystemZ/SystemZSubtarget.cpp
+++ b/lib/Target/SystemZ/SystemZSubtarget.cpp
@@ -12,12 +12,14 @@
 #include "llvm/IR/GlobalValue.h"
 #include "llvm/Support/Host.h"
 
+using namespace llvm;
+
+#define DEBUG_TYPE "systemz-subtarget"
+
 #define GET_SUBTARGETINFO_TARGET_DESC
 #define GET_SUBTARGETINFO_CTOR
 #include "SystemZGenSubtargetInfo.inc"
 
-using namespace llvm;
-
 // Pin the vtabel to this file.
 void SystemZSubtarget::anchor() {}
 
diff --git a/lib/Target/Target.cpp b/lib/Target/Target.cpp
index 627786d..d277f82 100644
--- a/lib/Target/Target.cpp
+++ b/lib/Target/Target.cpp
@@ -24,14 +24,6 @@
 
 using namespace llvm;
 
-inline DataLayout *unwrap(LLVMTargetDataRef P) {
-  return reinterpret_cast<DataLayout*>(P);
-}
-
-inline LLVMTargetDataRef wrap(const DataLayout *P) {
-  return reinterpret_cast<LLVMTargetDataRef>(const_cast<DataLayout*>(P));
-}
-
 inline TargetLibraryInfo *unwrap(LLVMTargetLibraryInfoRef P) {
   return reinterpret_cast<TargetLibraryInfo*>(P);
 }
diff --git a/lib/Target/TargetLoweringObjectFile.cpp b/lib/Target/TargetLoweringObjectFile.cpp
index 50b1e31..39e0459 100644
--- a/lib/Target/TargetLoweringObjectFile.cpp
+++ b/lib/Target/TargetLoweringObjectFile.cpp
@@ -46,7 +46,7 @@ void TargetLoweringObjectFile::Initialize(MCContext &ctx,
   InitMCObjectFileInfo(TM.getTargetTriple(),
                        TM.getRelocationModel(), TM.getCodeModel(), *Ctx);
 }
-  
+
 TargetLoweringObjectFile::~TargetLoweringObjectFile() {
 }
 
@@ -62,7 +62,7 @@ static bool isSuitableForBSS(const GlobalVariable *GV, bool NoZerosInBSS) {
     return false;
 
   // If the global has an explicit section specified, don't put it in BSS.
-  if (!GV->getSection().empty())
+  if (GV->hasSection())
     return false;
 
   // If -nozero-initialized-in-bss is specified, don't ever use BSS.
@@ -138,7 +138,7 @@ SectionKind TargetLoweringObjectFile::getKindForGlobal(const GlobalValue *GV,
 
   // Early exit - functions should be always in text sections.
   const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV);
-  if (GVar == 0)
+  if (!GVar)
     return SectionKind::getText();
 
   // Handle thread-local data first.
@@ -284,10 +284,10 @@ TargetLoweringObjectFile::SelectSectionForGlobal(const GlobalValue *GV,
   if (Kind.isText())
     return getTextSection();
 
-  if (Kind.isBSS() && BSSSection != 0)
+  if (Kind.isBSS() && BSSSection != nullptr)
     return BSSSection;
 
-  if (Kind.isReadOnly() && ReadOnlySection != 0)
+  if (Kind.isReadOnly() && ReadOnlySection != nullptr)
     return ReadOnlySection;
 
   return getDataSection();
@@ -298,7 +298,7 @@ TargetLoweringObjectFile::SelectSectionForGlobal(const GlobalValue *GV,
 /// should be placed in.
 const MCSection *
 TargetLoweringObjectFile::getSectionForConstant(SectionKind Kind) const {
-  if (Kind.isReadOnly() && ReadOnlySection != 0)
+  if (Kind.isReadOnly() && ReadOnlySection != nullptr)
     return ReadOnlySection;
 
   return DataSection;
diff --git a/lib/Target/TargetMachine.cpp b/lib/Target/TargetMachine.cpp
index fe3c870..8365f64 100644
--- a/lib/Target/TargetMachine.cpp
+++ b/lib/Target/TargetMachine.cpp
@@ -21,6 +21,7 @@
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCCodeGenInfo.h"
 #include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCTargetOptions.h"
 #include "llvm/MC/SectionKind.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Target/TargetLowering.h"
@@ -28,24 +29,6 @@
 using namespace llvm;
 
 //---------------------------------------------------------------------------
-// Command-line options that tend to be useful on more than one back-end.
-//
-
-namespace llvm {
-  bool HasDivModLibcall;
-  bool AsmVerbosityDefault(false);
-}
-
-static cl::opt<bool>
-DataSections("fdata-sections",
-  cl::desc("Emit data into separate sections"),
-  cl::init(false));
-static cl::opt<bool>
-FunctionSections("ffunction-sections",
-  cl::desc("Emit functions into separate sections"),
-  cl::init(false));
-
-//---------------------------------------------------------------------------
 // TargetMachine Class
 //
 
@@ -53,12 +36,7 @@ TargetMachine::TargetMachine(const Target &T,
                              StringRef TT, StringRef CPU, StringRef FS,
                              const TargetOptions &Options)
   : TheTarget(T), TargetTriple(TT), TargetCPU(CPU), TargetFS(FS),
-    CodeGenInfo(0), AsmInfo(0),
-    MCRelaxAll(false),
-    MCNoExecStack(false),
-    MCSaveTempLabels(false),
-    MCUseCFI(true),
-    MCUseDwarfDirectory(false),
+    CodeGenInfo(nullptr), AsmInfo(nullptr),
     RequireStructuredCFG(false),
     Options(Options) {
 }
@@ -89,6 +67,8 @@ void TargetMachine::resetTargetOptions(const MachineFunction *MF) const {
   RESET_OPTION(NoNaNsFPMath, "no-nans-fp-math");
   RESET_OPTION(UseSoftFloat, "use-soft-float");
   RESET_OPTION(DisableTailCalls, "disable-tail-calls");
+
+  TO.MCOptions.SanitizeAddress = F->hasFnAttribute(Attribute::SanitizeAddress);
 }
 
 /// getRelocationModel - Returns the code generation relocation model. The
@@ -126,19 +106,13 @@ static TLSModel::Model getSelectedTLSModel(const GlobalVariable *Var) {
 }
 
 TLSModel::Model TargetMachine::getTLSModel(const GlobalValue *GV) const {
-  // If GV is an alias then use the aliasee for determining
-  // thread-localness.
-  if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
-    GV = GA->getAliasedGlobal();
-  const GlobalVariable *Var = cast<GlobalVariable>(GV);
-
-  bool isLocal = Var->hasLocalLinkage();
-  bool isDeclaration = Var->isDeclaration();
+  bool isLocal = GV->hasLocalLinkage();
+  bool isDeclaration = GV->isDeclaration();
   bool isPIC = getRelocationModel() == Reloc::PIC_;
   bool isPIE = Options.PositionIndependentExecutable;
   // FIXME: what should we do for protected and internal visibility?
   // For variables, is internal different from hidden?
-  bool isHidden = Var->hasHiddenVisibility();
+  bool isHidden = GV->hasHiddenVisibility();
 
   TLSModel::Model Model;
   if (isPIC && !isPIE) {
@@ -153,10 +127,13 @@ TLSModel::Model TargetMachine::getTLSModel(const GlobalValue *GV) const {
       Model = TLSModel::InitialExec;
   }
 
-  // If the user specified a more specific model, use that.
-  TLSModel::Model SelectedModel = getSelectedTLSModel(Var);
-  if (SelectedModel > Model)
-    return SelectedModel;
+  const GlobalVariable *Var = dyn_cast<GlobalVariable>(GV);
+  if (Var) {
+    // If the user specified a more specific model, use that.
+    TLSModel::Model SelectedModel = getSelectedTLSModel(Var);
+    if (SelectedModel > Model)
+      return SelectedModel;
+  }
 
   return Model;
 }
@@ -174,28 +151,28 @@ void TargetMachine::setOptLevel(CodeGenOpt::Level Level) const {
     CodeGenInfo->setOptLevel(Level);
 }
 
-bool TargetMachine::getAsmVerbosityDefault() {
-  return AsmVerbosityDefault;
+bool TargetMachine::getAsmVerbosityDefault() const {
+  return Options.MCOptions.AsmVerbose;
 }
 
 void TargetMachine::setAsmVerbosityDefault(bool V) {
-  AsmVerbosityDefault = V;
+  Options.MCOptions.AsmVerbose = V;
 }
 
-bool TargetMachine::getFunctionSections() {
-  return FunctionSections;
+bool TargetMachine::getFunctionSections() const {
+  return Options.FunctionSections;
 }
 
-bool TargetMachine::getDataSections() {
-  return DataSections;
+bool TargetMachine::getDataSections() const {
+  return Options.DataSections;
 }
 
 void TargetMachine::setFunctionSections(bool V) {
-  FunctionSections = V;
+  Options.FunctionSections = V;
 }
 
 void TargetMachine::setDataSections(bool V) {
-  DataSections = V;
+  Options.DataSections = V;
 }
 
 void TargetMachine::getNameWithPrefix(SmallVectorImpl<char> &Name,
diff --git a/lib/Target/TargetMachineC.cpp b/lib/Target/TargetMachineC.cpp
index a2829d4..20923c9 100644
--- a/lib/Target/TargetMachineC.cpp
+++ b/lib/Target/TargetMachineC.cpp
@@ -18,6 +18,7 @@
 #include "llvm/IR/Module.h"
 #include "llvm/PassManager.h"
 #include "llvm/Support/CodeGen.h"
+#include "llvm/Support/FileSystem.h"
 #include "llvm/Support/FormattedStream.h"
 #include "llvm/Support/Host.h"
 #include "llvm/Support/TargetRegistry.h"
@@ -29,23 +30,6 @@
 
 using namespace llvm;
 
-inline DataLayout *unwrap(LLVMTargetDataRef P) {
-  return reinterpret_cast<DataLayout*>(P);
-}
-
-inline LLVMTargetDataRef wrap(const DataLayout *P) {
-  return reinterpret_cast<LLVMTargetDataRef>(const_cast<DataLayout*>(P));
-}
-
-inline TargetLibraryInfo *unwrap(LLVMTargetLibraryInfoRef P) {
-  return reinterpret_cast<TargetLibraryInfo*>(P);
-}
-
-inline LLVMTargetLibraryInfoRef wrap(const TargetLibraryInfo *P) {
-  TargetLibraryInfo *X = const_cast<TargetLibraryInfo*>(P);
-  return reinterpret_cast<LLVMTargetLibraryInfoRef>(X);
-}
-
 inline TargetMachine *unwrap(LLVMTargetMachineRef P) {
   return reinterpret_cast<TargetMachine*>(P);
 }
@@ -62,7 +46,7 @@ inline LLVMTargetRef wrap(const Target * P) {
 
 LLVMTargetRef LLVMGetFirstTarget() {
   if(TargetRegistry::begin() == TargetRegistry::end()) {
-    return NULL;
+    return nullptr;
   }
 
   const Target* target = &*TargetRegistry::begin();
@@ -80,7 +64,7 @@ LLVMTargetRef LLVMGetTargetFromName(const char *Name) {
       return wrap(&*IT);
   }
   
-  return NULL;
+  return nullptr;
 }
 
 LLVMBool LLVMGetTargetFromTriple(const char* TripleStr, LLVMTargetRef *T,
diff --git a/lib/Target/TargetSubtargetInfo.cpp b/lib/Target/TargetSubtargetInfo.cpp
index df8948f..3ca13da 100644
--- a/lib/Target/TargetSubtargetInfo.cpp
+++ b/lib/Target/TargetSubtargetInfo.cpp
@@ -24,11 +24,12 @@ TargetSubtargetInfo::TargetSubtargetInfo() {}
 TargetSubtargetInfo::~TargetSubtargetInfo() {}
 
 // Temporary option to compare overall performance change when moving from the
-// SD scheduler to the MachineScheduler pass pipeline. It should be removed
-// before 3.4. The normal way to enable/disable the MachineScheduling pass
-// itself is by using -enable-misched. For targets that already use MI sched
-// (via MySubTarget::enableMachineScheduler()) -misched-bench=false negates the
-// subtarget hook.
+// SD scheduler to the MachineScheduler pass pipeline. This is convenient for
+// benchmarking during the transition from SD to MI scheduling. Once armv7 makes
+// the switch, it should go away. The normal way to enable/disable the
+// MachineScheduling pass itself is by using -enable-misched. For targets that
+// already use MI sched (via MySubTarget::enableMachineScheduler())
+// -misched-bench=false negates the subtarget hook.
 static cl::opt<bool> BenchMachineSched("misched-bench", cl::Hidden,
     cl::desc("Migrate from the target's default SD scheduler to MI scheduler"));
 
diff --git a/lib/Target/X86/Android.mk b/lib/Target/X86/Android.mk
index 73031de..0d0a9ca 100644
--- a/lib/Target/X86/Android.mk
+++ b/lib/Target/X86/Android.mk
@@ -12,7 +12,6 @@ x86_codegen_TBLGEN_TABLES := \
 
 x86_codegen_SRC_FILES := \
   X86AsmPrinter.cpp \
-  X86COFFMachineModuleInfo.cpp \
   X86CodeEmitter.cpp \
   X86FastISel.cpp \
   X86FixupLEAs.cpp \
diff --git a/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp b/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp
index db29228..f3e6b3f 100644
--- a/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp
+++ b/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp
@@ -11,21 +11,25 @@
 #include "X86AsmInstrumentation.h"
 #include "X86Operand.h"
 #include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/IR/Function.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCInstBuilder.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCTargetOptions.h"
 #include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Compiler.h"
-#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
 
 namespace llvm {
 namespace {
 
-static cl::opt<bool> ClAsanInstrumentInlineAssembly(
-    "asan-instrument-inline-assembly", cl::desc("instrument inline assembly"),
-    cl::Hidden, cl::init(false));
+static cl::opt<bool> ClAsanInstrumentAssembly(
+    "asan-instrument-assembly",
+    cl::desc("instrument assembly with AddressSanitizer checks"), cl::Hidden,
+    cl::init(false));
 
 bool IsStackReg(unsigned Reg) {
   return Reg == X86::RSP || Reg == X86::ESP || Reg == X86::SP;
@@ -38,14 +42,14 @@ std::string FuncName(unsigned AccessSize, bool IsWrite) {
 
 class X86AddressSanitizer : public X86AsmInstrumentation {
 public:
-  X86AddressSanitizer(MCSubtargetInfo &sti) : STI(sti) {}
+  X86AddressSanitizer(const MCSubtargetInfo &STI) : STI(STI) {}
   virtual ~X86AddressSanitizer() {}
 
   // X86AsmInstrumentation implementation:
   virtual void InstrumentInstruction(
       const MCInst &Inst, SmallVectorImpl<MCParsedAsmOperand *> &Operands,
-      MCContext &Ctx, MCStreamer &Out) override {
-    InstrumentMOV(Inst, Operands, Ctx, Out);
+      MCContext &Ctx, const MCInstrInfo &MII, MCStreamer &Out) override {
+    InstrumentMOV(Inst, Operands, Ctx, MII, Out);
   }
 
   // Should be implemented differently in x86_32 and x86_64 subclasses.
@@ -57,13 +61,13 @@ public:
                             bool IsWrite, MCContext &Ctx, MCStreamer &Out);
   void InstrumentMOV(const MCInst &Inst,
                      SmallVectorImpl<MCParsedAsmOperand *> &Operands,
-                     MCContext &Ctx, MCStreamer &Out);
+                     MCContext &Ctx, const MCInstrInfo &MII, MCStreamer &Out);
   void EmitInstruction(MCStreamer &Out, const MCInst &Inst) {
     Out.EmitInstruction(Inst, STI);
   }
 
 protected:
-  MCSubtargetInfo &STI;
+  const MCSubtargetInfo &STI;
 };
 
 void X86AddressSanitizer::InstrumentMemOperand(
@@ -83,68 +87,53 @@ void X86AddressSanitizer::InstrumentMemOperand(
 
 void X86AddressSanitizer::InstrumentMOV(
     const MCInst &Inst, SmallVectorImpl<MCParsedAsmOperand *> &Operands,
-    MCContext &Ctx, MCStreamer &Out) {
+    MCContext &Ctx, const MCInstrInfo &MII, MCStreamer &Out) {
   // Access size in bytes.
   unsigned AccessSize = 0;
-  unsigned long OpIx = Operands.size();
+
   switch (Inst.getOpcode()) {
   case X86::MOV8mi:
   case X86::MOV8mr:
-    AccessSize = 1;
-    OpIx = 2;
-    break;
   case X86::MOV8rm:
     AccessSize = 1;
-    OpIx = 1;
     break;
   case X86::MOV16mi:
   case X86::MOV16mr:
-    AccessSize = 2;
-    OpIx = 2;
-    break;
   case X86::MOV16rm:
     AccessSize = 2;
-    OpIx = 1;
     break;
   case X86::MOV32mi:
   case X86::MOV32mr:
-    AccessSize = 4;
-    OpIx = 2;
-    break;
   case X86::MOV32rm:
     AccessSize = 4;
-    OpIx = 1;
     break;
   case X86::MOV64mi32:
   case X86::MOV64mr:
-    AccessSize = 8;
-    OpIx = 2;
-    break;
   case X86::MOV64rm:
     AccessSize = 8;
-    OpIx = 1;
     break;
   case X86::MOVAPDmr:
   case X86::MOVAPSmr:
-    AccessSize = 16;
-    OpIx = 2;
-    break;
   case X86::MOVAPDrm:
   case X86::MOVAPSrm:
     AccessSize = 16;
-    OpIx = 1;
     break;
-  }
-  if (OpIx >= Operands.size())
+  default:
     return;
+  }
 
-  const bool IsWrite = (OpIx != 1);
-  InstrumentMemOperand(Operands[OpIx], AccessSize, IsWrite, Ctx, Out);
+  const bool IsWrite = MII.get(Inst.getOpcode()).mayStore();
+  for (unsigned Ix = 0; Ix < Operands.size(); ++Ix) {
+    MCParsedAsmOperand *Op = Operands[Ix];
+    if (Op && Op->isMem())
+      InstrumentMemOperand(Op, AccessSize, IsWrite, Ctx, Out);
+  }
 }
 
 class X86AddressSanitizer32 : public X86AddressSanitizer {
 public:
-  X86AddressSanitizer32(MCSubtargetInfo &sti) : X86AddressSanitizer(sti) {}
+  X86AddressSanitizer32(const MCSubtargetInfo &STI)
+      : X86AddressSanitizer(STI) {}
   virtual ~X86AddressSanitizer32() {}
 
   virtual void InstrumentMemOperandImpl(X86Operand *Op, unsigned AccessSize,
@@ -172,14 +161,14 @@ void X86AddressSanitizer32::InstrumentMemOperandImpl(
         MCSymbolRefExpr::Create(FuncSym, MCSymbolRefExpr::VK_PLT, Ctx);
     EmitInstruction(Out, MCInstBuilder(X86::CALLpcrel32).addExpr(FuncExpr));
   }
-  EmitInstruction(Out, MCInstBuilder(X86::ADD32ri).addReg(X86::ESP)
-                           .addReg(X86::ESP).addImm(4));
+  EmitInstruction(Out, MCInstBuilder(X86::POP32r).addReg(X86::EAX));
   EmitInstruction(Out, MCInstBuilder(X86::POP32r).addReg(X86::EAX));
 }
 
 class X86AddressSanitizer64 : public X86AddressSanitizer {
 public:
-  X86AddressSanitizer64(MCSubtargetInfo &sti) : X86AddressSanitizer(sti) {}
+  X86AddressSanitizer64(const MCSubtargetInfo &STI)
+      : X86AddressSanitizer(STI) {}
   virtual ~X86AddressSanitizer64() {}
 
   virtual void InstrumentMemOperandImpl(X86Operand *Op, unsigned AccessSize,
@@ -187,13 +176,26 @@ public:
                                         MCStreamer &Out) override;
 };
 
-void X86AddressSanitizer64::InstrumentMemOperandImpl(
-    X86Operand *Op, unsigned AccessSize, bool IsWrite, MCContext &Ctx,
-    MCStreamer &Out) {
+void X86AddressSanitizer64::InstrumentMemOperandImpl(X86Operand *Op,
+                                                     unsigned AccessSize,
+                                                     bool IsWrite,
+                                                     MCContext &Ctx,
+                                                     MCStreamer &Out) {
   // FIXME: emit .cfi directives for correct stack unwinding.
-  // Set %rsp below current red zone (128 bytes wide)
-  EmitInstruction(Out, MCInstBuilder(X86::SUB64ri32).addReg(X86::RSP)
-                           .addReg(X86::RSP).addImm(128));
+
+  // Set %rsp below current red zone (128 bytes wide) using LEA instruction to
+  // preserve flags.
+  {
+    MCInst Inst;
+    Inst.setOpcode(X86::LEA64r);
+    Inst.addOperand(MCOperand::CreateReg(X86::RSP));
+
+    const MCExpr *Disp = MCConstantExpr::Create(-128, Ctx);
+    std::unique_ptr<X86Operand> Op(
+        X86Operand::CreateMem(0, Disp, X86::RSP, 0, 1, SMLoc(), SMLoc()));
+    Op->addMemOperands(Inst, 5);
+    EmitInstruction(Out, Inst);
+  }
   EmitInstruction(Out, MCInstBuilder(X86::PUSH64r).addReg(X86::RDI));
   {
     MCInst Inst;
@@ -210,8 +212,19 @@ void X86AddressSanitizer64::InstrumentMemOperandImpl(
     EmitInstruction(Out, MCInstBuilder(X86::CALL64pcrel32).addExpr(FuncExpr));
   }
   EmitInstruction(Out, MCInstBuilder(X86::POP64r).addReg(X86::RDI));
-  EmitInstruction(Out, MCInstBuilder(X86::ADD64ri32).addReg(X86::RSP)
-                           .addReg(X86::RSP).addImm(128));
+
+  // Restore old %rsp value.
+  {
+    MCInst Inst;
+    Inst.setOpcode(X86::LEA64r);
+    Inst.addOperand(MCOperand::CreateReg(X86::RSP));
+
+    const MCExpr *Disp = MCConstantExpr::Create(128, Ctx);
+    std::unique_ptr<X86Operand> Op(
+        X86Operand::CreateMem(0, Disp, X86::RSP, 0, 1, SMLoc(), SMLoc()));
+    Op->addMemOperands(Inst, 5);
+    EmitInstruction(Out, Inst);
+  }
 }
 
 } // End anonymous namespace
@@ -221,10 +234,15 @@ X86AsmInstrumentation::~X86AsmInstrumentation() {}
 
 void X86AsmInstrumentation::InstrumentInstruction(
     const MCInst &Inst, SmallVectorImpl<MCParsedAsmOperand *> &Operands,
-    MCContext &Ctx, MCStreamer &Out) {}
-
-X86AsmInstrumentation *CreateX86AsmInstrumentation(MCSubtargetInfo &STI) {
-  if (ClAsanInstrumentInlineAssembly) {
+    MCContext &Ctx, const MCInstrInfo &MII, MCStreamer &Out) {}
+
+X86AsmInstrumentation *
+CreateX86AsmInstrumentation(const MCTargetOptions &MCOptions,
+                            const MCContext &Ctx, const MCSubtargetInfo &STI) {
+  Triple T(STI.getTargetTriple());
+  const bool hasCompilerRTSupport = T.isOSLinux();
+  if (ClAsanInstrumentAssembly && hasCompilerRTSupport &&
+      MCOptions.SanitizeAddress) {
     if ((STI.getFeatureBits() & X86::Mode32Bit) != 0)
       return new X86AddressSanitizer32(STI);
     if ((STI.getFeatureBits() & X86::Mode64Bit) != 0)
diff --git a/lib/Target/X86/AsmParser/X86AsmInstrumentation.h b/lib/Target/X86/AsmParser/X86AsmInstrumentation.h
index c783a78..0369b14 100644
--- a/lib/Target/X86/AsmParser/X86AsmInstrumentation.h
+++ b/lib/Target/X86/AsmParser/X86AsmInstrumentation.h
@@ -16,13 +16,17 @@ namespace llvm {
 
 class MCContext;
 class MCInst;
+class MCInstrInfo;
 class MCParsedAsmOperand;
 class MCStreamer;
 class MCSubtargetInfo;
+class MCTargetOptions;
 
 class X86AsmInstrumentation;
 
-X86AsmInstrumentation *CreateX86AsmInstrumentation(MCSubtargetInfo &STI);
+X86AsmInstrumentation *
+CreateX86AsmInstrumentation(const MCTargetOptions &MCOptions,
+                            const MCContext &Ctx, const MCSubtargetInfo &STI);
 
 class X86AsmInstrumentation {
 public:
@@ -32,15 +36,18 @@ public:
   // instruction is sent to Out.
   virtual void InstrumentInstruction(
       const MCInst &Inst, SmallVectorImpl<MCParsedAsmOperand *> &Operands,
-      MCContext &Ctx, MCStreamer &Out);
+      MCContext &Ctx,
+      const MCInstrInfo &MII,
+      MCStreamer &Out);
 
 protected:
   friend X86AsmInstrumentation *
-  CreateX86AsmInstrumentation(MCSubtargetInfo &STI);
+  CreateX86AsmInstrumentation(const MCTargetOptions &MCOptions,
+                              const MCContext &Ctx, const MCSubtargetInfo &STI);
 
   X86AsmInstrumentation();
 };
 
-}  // End llvm namespace
+} // End llvm namespace
 
-#endif  // X86_ASM_INSTRUMENTATION_H
+#endif // X86_ASM_INSTRUMENTATION_H
diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp
index 9eddc74..d3e695e 100644
--- a/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -20,6 +20,7 @@
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
 #include "llvm/MC/MCParser/MCAsmLexer.h"
 #include "llvm/MC/MCParser/MCAsmParser.h"
 #include "llvm/MC/MCParser/MCParsedAsmOperand.h"
@@ -55,6 +56,7 @@ static const char OpPrecedence[] = {
 class X86AsmParser : public MCTargetAsmParser {
   MCSubtargetInfo &STI;
   MCAsmParser &Parser;
+  const MCInstrInfo &MII;
   ParseInstructionInfo *InstInfo;
   std::unique_ptr<X86AsmInstrumentation> Instrumentation;
 private:
@@ -257,7 +259,7 @@ private:
   public:
     IntelExprStateMachine(int64_t imm, bool stoponlbrac, bool addimmprefix) :
       State(IES_PLUS), PrevState(IES_ERROR), BaseReg(0), IndexReg(0), TmpReg(0),
-      Scale(1), Imm(imm), Sym(0), StopOnLBrac(stoponlbrac),
+      Scale(1), Imm(imm), Sym(nullptr), StopOnLBrac(stoponlbrac),
       AddImmPrefix(addimmprefix) { Info.clear(); }
     
     unsigned getBaseReg() { return BaseReg; }
@@ -618,7 +620,7 @@ private:
 
   X86Operand *ErrorOperand(SMLoc Loc, StringRef Msg) {
     Error(Loc, Msg);
-    return 0;
+    return nullptr;
   }
 
   X86Operand *DefaultMemSIOperand(SMLoc Loc);
@@ -710,13 +712,17 @@ private:
 
 public:
   X86AsmParser(MCSubtargetInfo &sti, MCAsmParser &parser,
-               const MCInstrInfo &MII)
-      : MCTargetAsmParser(), STI(sti), Parser(parser), InstInfo(0) {
+               const MCInstrInfo &mii,
+               const MCTargetOptions &Options)
+      : MCTargetAsmParser(), STI(sti), Parser(parser), MII(mii),
+        InstInfo(nullptr) {
 
     // Initialize the set of available features.
     setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
-    Instrumentation.reset(CreateX86AsmInstrumentation(STI));
+    Instrumentation.reset(
+        CreateX86AsmInstrumentation(Options, Parser.getContext(), STI));
   }
+
   bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override;
 
   bool
@@ -1173,9 +1179,9 @@ X86Operand *X86AsmParser::ParseIntelBracExpression(unsigned SegReg, SMLoc Start,
   // expression.
   IntelExprStateMachine SM(ImmDisp, /*StopOnLBrac=*/false, /*AddImmPrefix=*/true);
   if (ParseIntelExpression(SM, End))
-    return 0;
+    return nullptr;
 
-  const MCExpr *Disp = 0;
+  const MCExpr *Disp = nullptr;
   if (const MCExpr *Sym = SM.getSym()) {
     // A symbolic displacement.
     Disp = Sym;
@@ -1199,7 +1205,7 @@ X86Operand *X86AsmParser::ParseIntelBracExpression(unsigned SegReg, SMLoc Start,
   if (Tok.getString().find('.') != StringRef::npos) {
     const MCExpr *NewDisp;
     if (ParseIntelDotOperator(Disp, NewDisp))
-      return 0;
+      return nullptr;
     
     End = Tok.getEndLoc();
     Parser.Lex();  // Eat the field.
@@ -1220,7 +1226,7 @@ X86Operand *X86AsmParser::ParseIntelBracExpression(unsigned SegReg, SMLoc Start,
     StringRef ErrMsg;
     if (CheckBaseRegAndIndexReg(BaseReg, IndexReg, ErrMsg)) {
       Error(StartInBrac, ErrMsg);
-      return 0;
+      return nullptr;
     }
     return X86Operand::CreateMem(SegReg, Disp, BaseReg, IndexReg, Scale, Start,
                                  End, Size);
@@ -1237,7 +1243,7 @@ bool X86AsmParser::ParseIntelIdentifier(const MCExpr *&Val,
                                         InlineAsmIdentifierInfo &Info,
                                         bool IsUnevaluatedOperand, SMLoc &End) {
   assert (isParsingInlineAsm() && "Expected to be parsing inline assembly.");
-  Val = 0;
+  Val = nullptr;
 
   StringRef LineBuf(Identifier.data());
   SemaCallback->LookupInlineAsmIdentifier(LineBuf, Info, IsUnevaluatedOperand);
@@ -1309,7 +1315,7 @@ X86Operand *X86AsmParser::ParseIntelSegmentOverride(unsigned SegReg,
   StringRef Identifier = Tok.getString();
   if (ParseIntelIdentifier(Val, Identifier, Info,
                            /*Unevaluated=*/false, End))
-    return 0;
+    return nullptr;
   return CreateMemForInlineAsm(/*SegReg=*/0, Val, /*BaseReg=*/0,/*IndexReg=*/0,
                                /*Scale=*/1, Start, End, Size, Identifier, Info);
 }
@@ -1337,7 +1343,7 @@ X86Operand *X86AsmParser::ParseIntelMemOperand(int64_t ImmDisp, SMLoc Start,
   StringRef Identifier = Tok.getString();
   if (ParseIntelIdentifier(Val, Identifier, Info,
                            /*Unevaluated=*/false, End))
-    return 0;
+    return nullptr;
 
   if (!getLexer().is(AsmToken::LBrac))
     return CreateMemForInlineAsm(/*SegReg=*/0, Val, /*BaseReg=*/0, /*IndexReg=*/0,
@@ -1349,19 +1355,19 @@ X86Operand *X86AsmParser::ParseIntelMemOperand(int64_t ImmDisp, SMLoc Start,
   IntelExprStateMachine SM(/*ImmDisp=*/0, /*StopOnLBrac=*/true,
                            /*AddImmPrefix=*/false);
   if (ParseIntelExpression(SM, End))
-    return 0;
+    return nullptr;
 
   if (SM.getSym()) {
     Error(Start, "cannot use more than one symbol in memory operand");
-    return 0;
+    return nullptr;
   }
   if (SM.getBaseReg()) {
     Error(Start, "cannot use base register with variable reference");
-    return 0;
+    return nullptr;
   }
   if (SM.getIndexReg()) {
     Error(Start, "cannot use index register with variable reference");
-    return 0;
+    return nullptr;
   }
 
   const MCExpr *Disp = MCConstantExpr::Create(SM.getImm(), getContext());
@@ -1430,7 +1436,7 @@ X86Operand *X86AsmParser::ParseIntelOffsetOfOperator() {
   StringRef Identifier = Tok.getString();
   if (ParseIntelIdentifier(Val, Identifier, Info,
                            /*Unevaluated=*/false, End))
-    return 0;
+    return nullptr;
 
   // Don't emit the offset operator.
   InstInfo->AsmRewrites->push_back(AsmRewrite(AOK_Skip, OffsetOfLoc, 7));
@@ -1461,13 +1467,13 @@ X86Operand *X86AsmParser::ParseIntelOperator(unsigned OpKind) {
   SMLoc TypeLoc = Tok.getLoc();
   Parser.Lex(); // Eat operator.
 
-  const MCExpr *Val = 0;
+  const MCExpr *Val = nullptr;
   InlineAsmIdentifierInfo Info;
   SMLoc Start = Tok.getLoc(), End;
   StringRef Identifier = Tok.getString();
   if (ParseIntelIdentifier(Val, Identifier, Info,
                            /*Unevaluated=*/true, End))
-    return 0;
+    return nullptr;
 
   if (!Info.OpDecl)
     return ErrorOperand(Start, "unable to lookup expression");
@@ -1522,7 +1528,7 @@ X86Operand *X86AsmParser::ParseIntelOperand() {
     IntelExprStateMachine SM(/*Imm=*/0, /*StopOnLBrac=*/true,
                              /*AddImmPrefix=*/false);
     if (ParseIntelExpression(SM, End))
-      return 0;
+      return nullptr;
 
     int64_t Imm = SM.getImm();
     if (isParsingInlineAsm()) {
@@ -1580,11 +1586,11 @@ X86Operand *X86AsmParser::ParseATTOperand() {
     // Read the register.
     unsigned RegNo;
     SMLoc Start, End;
-    if (ParseRegister(RegNo, Start, End)) return 0;
+    if (ParseRegister(RegNo, Start, End)) return nullptr;
     if (RegNo == X86::EIZ || RegNo == X86::RIZ) {
       Error(Start, "%eiz and %riz can only be used as index registers",
             SMRange(Start, End));
-      return 0;
+      return nullptr;
     }
 
     // If this is a segment register followed by a ':', then this is the start
@@ -1601,7 +1607,7 @@ X86Operand *X86AsmParser::ParseATTOperand() {
     Parser.Lex();
     const MCExpr *Val;
     if (getParser().parseExpression(Val, End))
-      return 0;
+      return nullptr;
     return X86Operand::CreateImm(Val, Start, End);
   }
   }
@@ -1630,7 +1636,7 @@ X86AsmParser::HandleAVX512Operand(SmallVectorImpl<MCParsedAsmOperand*> &Operands
           StringSwitch<const char*>(getLexer().getTok().getIdentifier())
             .Case("to8",  "{1to8}")
             .Case("to16", "{1to16}")
-            .Default(0);
+            .Default(nullptr);
         if (!BroadcastPrimitive)
           return !ErrorAndEatStatement(getLexer().getLoc(),
                                        "Invalid memory broadcast primitive.");
@@ -1685,7 +1691,7 @@ X86Operand *X86AsmParser::ParseMemOperand(unsigned SegReg, SMLoc MemStart) {
   const MCExpr *Disp = MCConstantExpr::Create(0, getParser().getContext());
   if (getLexer().isNot(AsmToken::LParen)) {
     SMLoc ExprEnd;
-    if (getParser().parseExpression(Disp, ExprEnd)) return 0;
+    if (getParser().parseExpression(Disp, ExprEnd)) return nullptr;
 
     // After parsing the base expression we could either have a parenthesized
     // memory address or not.  If not, return now.  If so, eat the (.
@@ -1712,7 +1718,7 @@ X86Operand *X86AsmParser::ParseMemOperand(unsigned SegReg, SMLoc MemStart) {
 
       // It must be an parenthesized expression, parse it now.
       if (getParser().parseParenExpression(Disp, ExprEnd))
-        return 0;
+        return nullptr;
 
       // After parsing the base expression we could either have a parenthesized
       // memory address or not.  If not, return now.  If so, eat the (.
@@ -1736,11 +1742,11 @@ X86Operand *X86AsmParser::ParseMemOperand(unsigned SegReg, SMLoc MemStart) {
   if (getLexer().is(AsmToken::Percent)) {
     SMLoc StartLoc, EndLoc;
     BaseLoc = Parser.getTok().getLoc();
-    if (ParseRegister(BaseReg, StartLoc, EndLoc)) return 0;
+    if (ParseRegister(BaseReg, StartLoc, EndLoc)) return nullptr;
     if (BaseReg == X86::EIZ || BaseReg == X86::RIZ) {
       Error(StartLoc, "eiz and riz can only be used as index registers",
             SMRange(StartLoc, EndLoc));
-      return 0;
+      return nullptr;
     }
   }
 
@@ -1756,7 +1762,7 @@ X86Operand *X86AsmParser::ParseMemOperand(unsigned SegReg, SMLoc MemStart) {
     // like "1(%eax,,1)", the assembler doesn't. Use "eiz" or "riz" for this.
     if (getLexer().is(AsmToken::Percent)) {
       SMLoc L;
-      if (ParseRegister(IndexReg, L, L)) return 0;
+      if (ParseRegister(IndexReg, L, L)) return nullptr;
 
       if (getLexer().isNot(AsmToken::RParen)) {
         // Parse the scale amount:
@@ -1764,7 +1770,7 @@ X86Operand *X86AsmParser::ParseMemOperand(unsigned SegReg, SMLoc MemStart) {
         if (getLexer().isNot(AsmToken::Comma)) {
           Error(Parser.getTok().getLoc(),
                 "expected comma in scale expression");
-          return 0;
+          return nullptr;
         }
         Parser.Lex(); // Eat the comma.
 
@@ -1774,18 +1780,18 @@ X86Operand *X86AsmParser::ParseMemOperand(unsigned SegReg, SMLoc MemStart) {
           int64_t ScaleVal;
           if (getParser().parseAbsoluteExpression(ScaleVal)){
             Error(Loc, "expected scale expression");
-            return 0;
+            return nullptr;
           }
 
           // Validate the scale amount.
 	  if (X86MCRegisterClasses[X86::GR16RegClassID].contains(BaseReg) &&
               ScaleVal != 1) {
             Error(Loc, "scale factor in 16-bit address must be 1");
-            return 0;
+            return nullptr;
 	  }
           if (ScaleVal != 1 && ScaleVal != 2 && ScaleVal != 4 && ScaleVal != 8){
             Error(Loc, "scale factor in address must be 1, 2, 4 or 8");
-            return 0;
+            return nullptr;
           }
           Scale = (unsigned)ScaleVal;
         }
@@ -1797,7 +1803,7 @@ X86Operand *X86AsmParser::ParseMemOperand(unsigned SegReg, SMLoc MemStart) {
 
       int64_t Value;
       if (getParser().parseAbsoluteExpression(Value))
-        return 0;
+        return nullptr;
 
       if (Value != 1)
         Warning(Loc, "scale factor without index register is ignored");
@@ -1808,7 +1814,7 @@ X86Operand *X86AsmParser::ParseMemOperand(unsigned SegReg, SMLoc MemStart) {
   // Ok, we've eaten the memory operand, verify we have a ')' and eat it too.
   if (getLexer().isNot(AsmToken::RParen)) {
     Error(Parser.getTok().getLoc(), "unexpected token in memory operand");
-    return 0;
+    return nullptr;
   }
   SMLoc MemEnd = Parser.getTok().getEndLoc();
   Parser.Lex(); // Eat the ')'.
@@ -1821,18 +1827,18 @@ X86Operand *X86AsmParser::ParseMemOperand(unsigned SegReg, SMLoc MemStart) {
                          BaseReg != X86::SI && BaseReg != X86::DI)) &&
       BaseReg != X86::DX) {
     Error(BaseLoc, "invalid 16-bit base register");
-    return 0;
+    return nullptr;
   }
   if (BaseReg == 0 &&
       X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg)) {
     Error(IndexLoc, "16-bit memory operand may not include only index register");
-    return 0;
+    return nullptr;
   }
 
   StringRef ErrMsg;
   if (CheckBaseRegAndIndexReg(BaseReg, IndexReg, ErrMsg)) {
     Error(BaseLoc, ErrMsg);
-    return 0;
+    return nullptr;
   }
 
   return X86Operand::CreateMem(SegReg, Disp, BaseReg, IndexReg, Scale,
@@ -1851,7 +1857,7 @@ ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc,
     PatchedName = PatchedName.substr(0, Name.size()-1);
 
   // FIXME: Hack to recognize cmp<comparison code>{ss,sd,ps,pd}.
-  const MCExpr *ExtraImmOp = 0;
+  const MCExpr *ExtraImmOp = nullptr;
   if ((PatchedName.startswith("cmp") || PatchedName.startswith("vcmp")) &&
       (PatchedName.endswith("ss") || PatchedName.endswith("sd") ||
        PatchedName.endswith("ps") || PatchedName.endswith("pd"))) {
@@ -2070,8 +2076,10 @@ ParseInstruction(ParseInstructionInfo &Info, StringRef Name, SMLoc NameLoc,
       (Name == "smov" || Name == "smovb" || Name == "smovw" ||
        Name == "smovl" || Name == "smovd" || Name == "smovq"))) {
     if (Operands.size() == 1) {
-      if (Name == "movsd")
+      if (Name == "movsd") {
+        delete Operands.back();
         Operands.back() = X86Operand::CreateToken("movsl", NameLoc);
+      }
       if (isParsingIntelSyntax()) {
         Operands.push_back(DefaultMemDIOperand(NameLoc));
         Operands.push_back(DefaultMemSIOperand(NameLoc));
@@ -2253,7 +2261,8 @@ static const char *getSubtargetFeatureName(unsigned Val);
 void X86AsmParser::EmitInstruction(
     MCInst &Inst, SmallVectorImpl<MCParsedAsmOperand *> &Operands,
     MCStreamer &Out) {
-  Instrumentation->InstrumentInstruction(Inst, Operands, getContext(), Out);
+  Instrumentation->InstrumentInstruction(Inst, Operands, getContext(), MII,
+                                         Out);
   Out.EmitInstruction(Inst, STI);
 }
 
@@ -2291,7 +2300,7 @@ MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
         .Case("fstsw",  "fnstsw")
         .Case("fstsww", "fnstsw")
         .Case("fclex",  "fnclex")
-        .Default(0);
+        .Default(nullptr);
     assert(Repl && "Unknown wait-prefixed instruction");
     delete Operands[0];
     Operands[0] = X86Operand::CreateToken(Repl, IDLoc);
diff --git a/lib/Target/X86/AsmParser/X86Operand.h b/lib/Target/X86/AsmParser/X86Operand.h
index 45fe2a9..de3be38 100644
--- a/lib/Target/X86/AsmParser/X86Operand.h
+++ b/lib/Target/X86/AsmParser/X86Operand.h
@@ -422,7 +422,7 @@ struct X86Operand : public MCParsedAsmOperand {
                                bool AddressOf = false,
                                SMLoc OffsetOfLoc = SMLoc(),
                                StringRef SymName = StringRef(),
-                               void *OpDecl = 0) {
+                               void *OpDecl = nullptr) {
     X86Operand *Res = new X86Operand(Register, StartLoc, EndLoc);
     Res->Reg.RegNo = RegNo;
     Res->AddressOf = AddressOf;
@@ -441,7 +441,7 @@ struct X86Operand : public MCParsedAsmOperand {
   /// Create an absolute memory operand.
   static X86Operand *CreateMem(const MCExpr *Disp, SMLoc StartLoc, SMLoc EndLoc,
                                unsigned Size = 0, StringRef SymName = StringRef(),
-                               void *OpDecl = 0) {
+                               void *OpDecl = nullptr) {
     X86Operand *Res = new X86Operand(Memory, StartLoc, EndLoc);
     Res->Mem.SegReg   = 0;
     Res->Mem.Disp     = Disp;
@@ -461,7 +461,7 @@ struct X86Operand : public MCParsedAsmOperand {
                                unsigned Scale, SMLoc StartLoc, SMLoc EndLoc,
                                unsigned Size = 0,
                                StringRef SymName = StringRef(),
-                               void *OpDecl = 0) {
+                               void *OpDecl = nullptr) {
     // We should never just have a displacement, that should be parsed as an
     // absolute memory operand.
     assert((SegReg || BaseReg || IndexReg) && "Invalid memory operand!");
diff --git a/lib/Target/X86/CMakeLists.txt b/lib/Target/X86/CMakeLists.txt
index 206b651..c54fbc1 100644
--- a/lib/Target/X86/CMakeLists.txt
+++ b/lib/Target/X86/CMakeLists.txt
@@ -14,7 +14,6 @@ add_public_tablegen_target(X86CommonTableGen)
 
 set(sources
   X86AsmPrinter.cpp
-  X86COFFMachineModuleInfo.cpp
   X86CodeEmitter.cpp
   X86FastISel.cpp
   X86FloatingPoint.cpp
diff --git a/lib/Target/X86/Disassembler/Android.mk b/lib/Target/X86/Disassembler/Android.mk
index 3984266..0b3b8a5 100644
--- a/lib/Target/X86/Disassembler/Android.mk
+++ b/lib/Target/X86/Disassembler/Android.mk
@@ -8,7 +8,8 @@ x86_disassembler_TBLGEN_TABLES := \
 
 x86_disassembler_SRC_FILES := \
   X86Disassembler.cpp \
-  X86DisassemblerDecoder.c
+  X86DisassemblerDecoder.cpp
+
 
 # For the device
 # =====================================================
diff --git a/lib/Target/X86/Disassembler/CMakeLists.txt b/lib/Target/X86/Disassembler/CMakeLists.txt
index deed115..4370282 100644
--- a/lib/Target/X86/Disassembler/CMakeLists.txt
+++ b/lib/Target/X86/Disassembler/CMakeLists.txt
@@ -1,4 +1,4 @@
 add_llvm_library(LLVMX86Disassembler
   X86Disassembler.cpp
-  X86DisassemblerDecoder.c
+  X86DisassemblerDecoder.cpp
   )
diff --git a/lib/Target/X86/Disassembler/Makefile b/lib/Target/X86/Disassembler/Makefile
index 8669fd8..51e7b82 100644
--- a/lib/Target/X86/Disassembler/Makefile
+++ b/lib/Target/X86/Disassembler/Makefile
@@ -10,7 +10,9 @@
 LEVEL = ../../../..
 LIBRARYNAME = LLVMX86Disassembler
 
-# Hack: we need to include 'main' x86 target directory to grab private headers
+# Hack: we need to include 'main' x86 target directory to grab private headers.
 CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
 
 include $(LEVEL)/Makefile.common
+
+.PHONY: $(PROJ_SRC_DIR)/X86DisassemblerDecoder.c
diff --git a/lib/Target/X86/Disassembler/X86Disassembler.cpp b/lib/Target/X86/Disassembler/X86Disassembler.cpp
index d5759cd..c366725 100644
--- a/lib/Target/X86/Disassembler/X86Disassembler.cpp
+++ b/lib/Target/X86/Disassembler/X86Disassembler.cpp
@@ -27,6 +27,11 @@
 #include "llvm/Support/TargetRegistry.h"
 #include "llvm/Support/raw_ostream.h"
 
+using namespace llvm;
+using namespace llvm::X86Disassembler;
+
+#define DEBUG_TYPE "x86-disassembler"
+
 #define GET_REGINFO_ENUM
 #include "X86GenRegisterInfo.inc"
 #define GET_INSTRINFO_ENUM
@@ -34,21 +39,18 @@
 #define GET_SUBTARGETINFO_ENUM
 #include "X86GenSubtargetInfo.inc"
 
-using namespace llvm;
-using namespace llvm::X86Disassembler;
-
-void x86DisassemblerDebug(const char *file,
-                          unsigned line,
-                          const char *s) {
+void llvm::X86Disassembler::Debug(const char *file, unsigned line,
+                                  const char *s) {
   dbgs() << file << ":" << line << ": " << s;
 }
 
-const char *x86DisassemblerGetInstrName(unsigned Opcode, const void *mii) {
+const char *llvm::X86Disassembler::GetInstrName(unsigned Opcode,
+                                                const void *mii) {
   const MCInstrInfo *MII = static_cast<const MCInstrInfo *>(mii);
   return MII->getName(Opcode);
 }
 
-#define debug(s) DEBUG(x86DisassemblerDebug(__FILE__, __LINE__, s));
+#define debug(s) DEBUG(Debug(__FILE__, __LINE__, s));
 
 namespace llvm {  
   
@@ -74,9 +76,11 @@ static bool translateInstruction(MCInst &target,
                                 InternalInstruction &source,
                                 const MCDisassembler *Dis);
 
-X86GenericDisassembler::X86GenericDisassembler(const MCSubtargetInfo &STI,
-                                               const MCInstrInfo *MII)
-  : MCDisassembler(STI), MII(MII) {
+X86GenericDisassembler::X86GenericDisassembler(
+                                         const MCSubtargetInfo &STI,
+                                         MCContext &Ctx,
+                                         std::unique_ptr<const MCInstrInfo> MII)
+  : MCDisassembler(STI, Ctx), MII(std::move(MII)) {
   switch (STI.getFeatureBits() &
           (X86::Mode16Bit | X86::Mode32Bit | X86::Mode64Bit)) {
   case X86::Mode16Bit:
@@ -93,10 +97,6 @@ X86GenericDisassembler::X86GenericDisassembler(const MCSubtargetInfo &STI,
   }
 }
 
-X86GenericDisassembler::~X86GenericDisassembler() {
-  delete MII;
-}
-
 /// regionReader - a callback function that wraps the readByte method from
 ///   MemoryObject.
 ///
@@ -140,14 +140,14 @@ X86GenericDisassembler::getInstruction(MCInst &instr,
 
   dlog_t loggerFn = logger;
   if (&vStream == &nulls())
-    loggerFn = 0; // Disable logging completely if it's going to nulls().
+    loggerFn = nullptr; // Disable logging completely if it's going to nulls().
   
   int ret = decodeInstruction(&internalInstr,
                               regionReader,
                               (const void*)&region,
                               loggerFn,
                               (void*)&vStream,
-                              (const void*)MII,
+                              (const void*)MII.get(),
                               address,
                               fMode);
 
@@ -319,7 +319,7 @@ static void translateImmediate(MCInst &mcInst, uint64_t immediate,
   }
   // By default sign-extend all X86 immediates based on their encoding.
   else if (type == TYPE_IMM8 || type == TYPE_IMM16 || type == TYPE_IMM32 ||
-           type == TYPE_IMM64) {
+           type == TYPE_IMM64 || type == TYPE_IMMv) {
     uint32_t Opcode = mcInst.getOpcode();
     switch (operand.encoding) {
     default:
@@ -787,13 +787,11 @@ static bool translateInstruction(MCInst &mcInst,
       mcInst.setOpcode(X86::XACQUIRE_PREFIX);
   }
   
-  int index;
-  
   insn.numImmediatesTranslated = 0;
   
-  for (index = 0; index < X86_MAX_OPERANDS; ++index) {
-    if (insn.operands[index].encoding != ENCODING_NONE) {
-      if (translateOperand(mcInst, insn.operands[index], insn, Dis)) {
+  for (const auto &Op : insn.operands) {
+    if (Op.encoding != ENCODING_NONE) {
+      if (translateOperand(mcInst, Op, insn, Dis)) {
         return true;
       }
     }
@@ -803,9 +801,10 @@ static bool translateInstruction(MCInst &mcInst,
 }
 
 static MCDisassembler *createX86Disassembler(const Target &T,
-                                             const MCSubtargetInfo &STI) {
-  return new X86Disassembler::X86GenericDisassembler(STI,
-                                                     T.createMCInstrInfo());
+                                             const MCSubtargetInfo &STI,
+                                             MCContext &Ctx) {
+  std::unique_ptr<const MCInstrInfo> MII(T.createMCInstrInfo());
+  return new X86Disassembler::X86GenericDisassembler(STI, Ctx, std::move(MII));
 }
 
 extern "C" void LLVMInitializeX86Disassembler() { 
diff --git a/lib/Target/X86/Disassembler/X86Disassembler.h b/lib/Target/X86/Disassembler/X86Disassembler.h
index 4e6e297..4dc7c29 100644
--- a/lib/Target/X86/Disassembler/X86Disassembler.h
+++ b/lib/Target/X86/Disassembler/X86Disassembler.h
@@ -74,17 +74,7 @@
 #ifndef X86DISASSEMBLER_H
 #define X86DISASSEMBLER_H
 
-#define INSTRUCTION_SPECIFIER_FIELDS \
-  uint16_t operands;
-
-#define INSTRUCTION_IDS               \
-  uint16_t instructionIDs;
-
 #include "X86DisassemblerDecoderCommon.h"
-
-#undef INSTRUCTION_SPECIFIER_FIELDS
-#undef INSTRUCTION_IDS
-
 #include "llvm/MC/MCDisassembler.h"
 
 namespace llvm {
@@ -101,13 +91,12 @@ namespace X86Disassembler {
 ///   All each platform class should have to do is subclass the constructor, and
 ///   provide a different disassemblerMode value.
 class X86GenericDisassembler : public MCDisassembler {
-  const MCInstrInfo *MII;
+  std::unique_ptr<const MCInstrInfo> MII;
 public:
   /// Constructor     - Initializes the disassembler.
   ///
-  X86GenericDisassembler(const MCSubtargetInfo &STI, const MCInstrInfo *MII);
-private:
-  ~X86GenericDisassembler();
+  X86GenericDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx,
+                         std::unique_ptr<const MCInstrInfo> MII);
 public:
 
   /// getInstruction - See MCDisassembler.
diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp
index 0801c96..804606d 100644
--- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c
+++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp
@@ -1,17 +1,17 @@
-/*===-- X86DisassemblerDecoder.c - Disassembler decoder ------------*- C -*-===*
- *
- *                     The LLVM Compiler Infrastructure
- *
- * This file is distributed under the University of Illinois Open Source
- * License. See LICENSE.TXT for details.
- *
- *===----------------------------------------------------------------------===*
- *
- * This file is part of the X86 Disassembler.
- * It contains the implementation of the instruction decoder.
- * Documentation for the disassembler can be found in X86Disassembler.h.
- *
- *===----------------------------------------------------------------------===*/
+//===-- X86DisassemblerDecoder.c - Disassembler decoder -------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is part of the X86 Disassembler.
+// It contains the implementation of the instruction decoder.
+// Documentation for the disassembler can be found in X86Disassembler.h.
+//
+//===----------------------------------------------------------------------===//
 
 #include <stdarg.h>   /* for va_*()       */
 #include <stdio.h>    /* for vsnprintf()  */
@@ -20,13 +20,35 @@
 
 #include "X86DisassemblerDecoder.h"
 
-#include "X86GenDisassemblerTables.inc"
+using namespace llvm::X86Disassembler;
+
+/// Specifies whether a ModR/M byte is needed and (if so) which
+/// instruction each possible value of the ModR/M byte corresponds to.  Once
+/// this information is known, we have narrowed down to a single instruction.
+struct ModRMDecision {
+  uint8_t modrm_type;
+  uint16_t instructionIDs;
+};
+
+/// Specifies which set of ModR/M->instruction tables to look at
+/// given a particular opcode.
+struct OpcodeDecision {
+  ModRMDecision modRMDecisions[256];
+};
+
+/// Specifies which opcode->instruction tables to look at given
+/// a particular context (set of attributes).  Since there are many possible
+/// contexts, the decoder first uses CONTEXTS_SYM to determine which context
+/// applies given a specific set of attributes.  Hence there are only IC_max
+/// entries in this table, rather than 2^(ATTR_max).
+struct ContextDecision {
+  OpcodeDecision opcodeDecisions[IC_max];
+};
 
-#define TRUE  1
-#define FALSE 0
+#include "X86GenDisassemblerTables.inc"
 
 #ifndef NDEBUG
-#define debug(s) do { x86DisassemblerDebug(__FILE__, __LINE__, s); } while (0)
+#define debug(s) do { Debug(__FILE__, __LINE__, s); } while (0)
 #else
 #define debug(s) do { } while (0)
 #endif
@@ -41,7 +63,7 @@
  *                    an instruction with these attributes.
  */
 static InstructionContext contextForAttrs(uint16_t attrMask) {
-  return CONTEXTS_SYM[attrMask];
+  return static_cast<InstructionContext>(CONTEXTS_SYM[attrMask]);
 }
 
 /*
@@ -53,12 +75,12 @@ static InstructionContext contextForAttrs(uint16_t attrMask) {
  *                      contextForAttrs.
  * @param opcode      - The last byte of the instruction's opcode, not counting
  *                      ModR/M extensions and escapes.
- * @return            - TRUE if the ModR/M byte is required, FALSE otherwise.
+ * @return            - true if the ModR/M byte is required, false otherwise.
  */
 static int modRMRequired(OpcodeType type,
                          InstructionContext insnContext,
                          uint16_t opcode) {
-  const struct ContextDecision* decision = 0;
+  const struct ContextDecision* decision = nullptr;
 
   switch (type) {
   case ONEBYTE:
@@ -102,7 +124,7 @@ static InstrUID decode(OpcodeType type,
                        InstructionContext insnContext,
                        uint8_t opcode,
                        uint8_t modRM) {
-  const struct ModRMDecision* dec = 0;
+  const struct ModRMDecision* dec = nullptr;
 
   switch (type) {
   case ONEBYTE:
@@ -284,15 +306,15 @@ static void setPrefixPresent(struct InternalInstruction* insn,
  * @param location  - The location to query.
  * @return          - Whether the prefix is at that location.
  */
-static BOOL isPrefixAtLocation(struct InternalInstruction* insn,
+static bool isPrefixAtLocation(struct InternalInstruction* insn,
                                uint8_t prefix,
                                uint64_t location)
 {
   if (insn->prefixPresent[prefix] == 1 &&
      insn->prefixLocations[prefix] == location)
-    return TRUE;
+    return true;
   else
-    return FALSE;
+    return false;
 }
 
 /*
@@ -305,14 +327,14 @@ static BOOL isPrefixAtLocation(struct InternalInstruction* insn,
  *                bytes, and no prefixes conflicted; nonzero otherwise.
  */
 static int readPrefixes(struct InternalInstruction* insn) {
-  BOOL isPrefix = TRUE;
-  BOOL prefixGroups[4] = { FALSE };
+  bool isPrefix = true;
+  bool prefixGroups[4] = { false };
   uint64_t prefixLocation;
   uint8_t byte = 0;
   uint8_t nextByte;
 
-  BOOL hasAdSize = FALSE;
-  BOOL hasOpSize = FALSE;
+  bool hasAdSize = false;
+  bool hasOpSize = false;
 
   dbgprintf(insn, "readPrefixes()");
 
@@ -344,7 +366,7 @@ static int readPrefixes(struct InternalInstruction* insn) {
       if ((byte == 0xf2 || byte == 0xf3) &&
           ((nextByte == 0xf0) |
           ((nextByte & 0xfe) == 0x86 || (nextByte & 0xf8) == 0x90)))
-        insn->xAcquireRelease = TRUE;
+        insn->xAcquireRelease = true;
       /*
        * Also if the byte is 0xf3, and the following condition is met:
        * - it is followed by a "mov mem, reg" (opcode 0x88/0x89) or
@@ -354,7 +376,7 @@ static int readPrefixes(struct InternalInstruction* insn) {
       if (byte == 0xf3 &&
           (nextByte == 0x88 || nextByte == 0x89 ||
            nextByte == 0xc6 || nextByte == 0xc7))
-        insn->xAcquireRelease = TRUE;
+        insn->xAcquireRelease = true;
       if (insn->mode == MODE_64BIT && (nextByte & 0xf0) == 0x40) {
         if (consumeByte(insn, &nextByte))
           return -1;
@@ -372,7 +394,7 @@ static int readPrefixes(struct InternalInstruction* insn) {
     case 0xf3:  /* REP or REPE/REPZ */
       if (prefixGroups[0])
         dbgprintf(insn, "Redundant Group 1 prefix");
-      prefixGroups[0] = TRUE;
+      prefixGroups[0] = true;
       setPrefixPresent(insn, byte, prefixLocation);
       break;
     case 0x2e:  /* CS segment override -OR- Branch not taken */
@@ -406,25 +428,25 @@ static int readPrefixes(struct InternalInstruction* insn) {
       }
       if (prefixGroups[1])
         dbgprintf(insn, "Redundant Group 2 prefix");
-      prefixGroups[1] = TRUE;
+      prefixGroups[1] = true;
       setPrefixPresent(insn, byte, prefixLocation);
       break;
     case 0x66:  /* Operand-size override */
       if (prefixGroups[2])
         dbgprintf(insn, "Redundant Group 3 prefix");
-      prefixGroups[2] = TRUE;
-      hasOpSize = TRUE;
+      prefixGroups[2] = true;
+      hasOpSize = true;
       setPrefixPresent(insn, byte, prefixLocation);
       break;
     case 0x67:  /* Address-size override */
       if (prefixGroups[3])
         dbgprintf(insn, "Redundant Group 4 prefix");
-      prefixGroups[3] = TRUE;
-      hasAdSize = TRUE;
+      prefixGroups[3] = true;
+      hasAdSize = true;
       setPrefixPresent(insn, byte, prefixLocation);
       break;
     default:    /* Not a prefix byte */
-      isPrefix = FALSE;
+      isPrefix = false;
       break;
     }
 
@@ -549,7 +571,7 @@ static int readPrefixes(struct InternalInstruction* insn) {
       default:
         break;
       case VEX_PREFIX_66:
-        hasOpSize = TRUE;
+        hasOpSize = true;
         break;
       }
 
@@ -595,7 +617,7 @@ static int readPrefixes(struct InternalInstruction* insn) {
       default:
         break;
       case VEX_PREFIX_66:
-        hasOpSize = TRUE;
+        hasOpSize = true;
         break;
       }
 
@@ -790,11 +812,9 @@ static int readModRM(struct InternalInstruction* insn);
 static int getIDWithAttrMask(uint16_t* instructionID,
                              struct InternalInstruction* insn,
                              uint16_t attrMask) {
-  BOOL hasModRMExtension;
-
-  uint16_t instructionClass;
+  bool hasModRMExtension;
 
-  instructionClass = contextForAttrs(attrMask);
+  InstructionContext instructionClass = contextForAttrs(attrMask);
 
   hasModRMExtension = modRMRequired(insn->opcodeType,
                                     instructionClass,
@@ -825,14 +845,14 @@ static int getIDWithAttrMask(uint16_t* instructionID,
  * @param orig  - The instruction that is not 16-bit
  * @param equiv - The instruction that is 16-bit
  */
-static BOOL is16BitEquivalent(const char* orig, const char* equiv) {
+static bool is16BitEquivalent(const char* orig, const char* equiv) {
   off_t i;
 
   for (i = 0;; i++) {
     if (orig[i] == '\0' && equiv[i] == '\0')
-      return TRUE;
+      return true;
     if (orig[i] == '\0' || equiv[i] == '\0')
-      return FALSE;
+      return false;
     if (orig[i] != equiv[i]) {
       if ((orig[i] == 'Q' || orig[i] == 'L') && equiv[i] == 'W')
         continue;
@@ -840,7 +860,7 @@ static BOOL is16BitEquivalent(const char* orig, const char* equiv) {
         continue;
       if ((orig[i] == '4' || orig[i] == '2') && equiv[i] == '6')
         continue;
-      return FALSE;
+      return false;
     }
   }
 }
@@ -1011,9 +1031,8 @@ static int getID(struct InternalInstruction* insn, const void *miiArg) {
       return 0;
     }
 
-    specName = x86DisassemblerGetInstrName(instructionID, miiArg);
-    specWithOpSizeName =
-      x86DisassemblerGetInstrName(instructionIDWithOpsize, miiArg);
+    specName = GetInstrName(instructionID, miiArg);
+    specWithOpSizeName = GetInstrName(instructionIDWithOpsize, miiArg);
 
     if (is16BitEquivalent(specName, specWithOpSizeName) &&
         (insn->mode == MODE_16BIT) ^ insn->prefixPresent[0x66]) {
@@ -1077,8 +1096,8 @@ static int getID(struct InternalInstruction* insn, const void *miiArg) {
  * @return      - 0 if the SIB byte was successfully read; nonzero otherwise.
  */
 static int readSIB(struct InternalInstruction* insn) {
-  SIBIndex sibIndexBase = 0;
-  SIBBase sibBaseBase = 0;
+  SIBIndex sibIndexBase = SIB_INDEX_NONE;
+  SIBBase sibBaseBase = SIB_BASE_NONE;
   uint8_t index, base;
 
   dbgprintf(insn, "readSIB()");
@@ -1086,7 +1105,7 @@ static int readSIB(struct InternalInstruction* insn) {
   if (insn->consumedSIB)
     return 0;
 
-  insn->consumedSIB = TRUE;
+  insn->consumedSIB = true;
 
   switch (insn->addressSize) {
   case 2:
@@ -1184,12 +1203,12 @@ static int readDisplacement(struct InternalInstruction* insn) {
   if (insn->consumedDisplacement)
     return 0;
 
-  insn->consumedDisplacement = TRUE;
+  insn->consumedDisplacement = true;
   insn->displacementOffset = insn->readerCursor - insn->startLocation;
 
   switch (insn->eaDisplacement) {
   case EA_DISP_NONE:
-    insn->consumedDisplacement = FALSE;
+    insn->consumedDisplacement = false;
     break;
   case EA_DISP_8:
     if (consumeInt8(insn, &d8))
@@ -1208,7 +1227,7 @@ static int readDisplacement(struct InternalInstruction* insn) {
     break;
   }
 
-  insn->consumedDisplacement = TRUE;
+  insn->consumedDisplacement = true;
   return 0;
 }
 
@@ -1229,7 +1248,7 @@ static int readModRM(struct InternalInstruction* insn) {
 
   if (consumeByte(insn, &insn->modRM))
     return -1;
-  insn->consumedModRM = TRUE;
+  insn->consumedModRM = true;
 
   mod     = modFromModRM(insn->modRM);
   rm      = rmFromModRM(insn->modRM);
@@ -1599,20 +1618,22 @@ static int readImmediate(struct InternalInstruction* insn, uint8_t size) {
 static int readVVVV(struct InternalInstruction* insn) {
   dbgprintf(insn, "readVVVV()");
 
+  int vvvv;
   if (insn->vectorExtensionType == TYPE_EVEX)
-    insn->vvvv = vvvvFromEVEX3of4(insn->vectorExtensionPrefix[2]);
+    vvvv = vvvvFromEVEX3of4(insn->vectorExtensionPrefix[2]);
   else if (insn->vectorExtensionType == TYPE_VEX_3B)
-    insn->vvvv = vvvvFromVEX3of3(insn->vectorExtensionPrefix[2]);
+    vvvv = vvvvFromVEX3of3(insn->vectorExtensionPrefix[2]);
   else if (insn->vectorExtensionType == TYPE_VEX_2B)
-    insn->vvvv = vvvvFromVEX2of2(insn->vectorExtensionPrefix[1]);
+    vvvv = vvvvFromVEX2of2(insn->vectorExtensionPrefix[1]);
   else if (insn->vectorExtensionType == TYPE_XOP)
-    insn->vvvv = vvvvFromXOP3of3(insn->vectorExtensionPrefix[2]);
+    vvvv = vvvvFromXOP3of3(insn->vectorExtensionPrefix[2]);
   else
     return -1;
 
   if (insn->mode != MODE_64BIT)
-    insn->vvvv &= 0x7;
+    vvvv &= 0x7;
 
+  insn->vvvv = static_cast<Reg>(vvvv);
   return 0;
 }
 
@@ -1629,7 +1650,8 @@ static int readMaskRegister(struct InternalInstruction* insn) {
   if (insn->vectorExtensionType != TYPE_EVEX)
     return -1;
 
-  insn->writemask = aaaFromEVEX4of4(insn->vectorExtensionPrefix[3]);
+  insn->writemask =
+      static_cast<Reg>(aaaFromEVEX4of4(insn->vectorExtensionPrefix[3]));
   return 0;
 }
 
@@ -1641,7 +1663,6 @@ static int readMaskRegister(struct InternalInstruction* insn) {
  * @return      - 0 if all operands could be read; nonzero otherwise.
  */
 static int readOperands(struct InternalInstruction* insn) {
-  int index;
   int hasVVVV, needVVVV;
   int sawRegImm = 0;
 
@@ -1652,8 +1673,8 @@ static int readOperands(struct InternalInstruction* insn) {
   hasVVVV = !readVVVV(insn);
   needVVVV = hasVVVV && (insn->vvvv != 0);
 
-  for (index = 0; index < X86_MAX_OPERANDS; ++index) {
-    switch (x86OperandSets[insn->spec->operands][index].encoding) {
+  for (const auto &Op : x86OperandSets[insn->spec->operands]) {
+    switch (Op.encoding) {
     case ENCODING_NONE:
     case ENCODING_SI:
     case ENCODING_DI:
@@ -1662,7 +1683,7 @@ static int readOperands(struct InternalInstruction* insn) {
     case ENCODING_RM:
       if (readModRM(insn))
         return -1;
-      if (fixupReg(insn, &x86OperandSets[insn->spec->operands][index]))
+      if (fixupReg(insn, &Op))
         return -1;
       break;
     case ENCODING_CB:
@@ -1684,14 +1705,14 @@ static int readOperands(struct InternalInstruction* insn) {
       }
       if (readImmediate(insn, 1))
         return -1;
-      if (x86OperandSets[insn->spec->operands][index].type == TYPE_IMM3 &&
+      if (Op.type == TYPE_IMM3 &&
           insn->immediates[insn->numImmediatesConsumed - 1] > 7)
         return -1;
-      if (x86OperandSets[insn->spec->operands][index].type == TYPE_IMM5 &&
+      if (Op.type == TYPE_IMM5 &&
           insn->immediates[insn->numImmediatesConsumed - 1] > 31)
         return -1;
-      if (x86OperandSets[insn->spec->operands][index].type == TYPE_XMM128 ||
-          x86OperandSets[insn->spec->operands][index].type == TYPE_XMM256)
+      if (Op.type == TYPE_XMM128 ||
+          Op.type == TYPE_XMM256)
         sawRegImm = 1;
       break;
     case ENCODING_IW:
@@ -1740,7 +1761,7 @@ static int readOperands(struct InternalInstruction* insn) {
       needVVVV = 0; /* Mark that we have found a VVVV operand. */
       if (!hasVVVV)
         return -1;
-      if (fixupReg(insn, &x86OperandSets[insn->spec->operands][index]))
+      if (fixupReg(insn, &Op))
         return -1;
       break;
     case ENCODING_WRITEMASK:
@@ -1781,14 +1802,10 @@ static int readOperands(struct InternalInstruction* insn) {
  * @return          - 0 if the instruction's memory could be read; nonzero if
  *                    not.
  */
-int decodeInstruction(struct InternalInstruction* insn,
-                      byteReader_t reader,
-                      const void* readerArg,
-                      dlog_t logger,
-                      void* loggerArg,
-                      const void* miiArg,
-                      uint64_t startLoc,
-                      DisassemblerMode mode) {
+int llvm::X86Disassembler::decodeInstruction(
+    struct InternalInstruction *insn, byteReader_t reader,
+    const void *readerArg, dlog_t logger, void *loggerArg, const void *miiArg,
+    uint64_t startLoc, DisassemblerMode mode) {
   memset(insn, 0, sizeof(struct InternalInstruction));
 
   insn->reader = reader;
@@ -1807,7 +1824,7 @@ int decodeInstruction(struct InternalInstruction* insn,
       readOperands(insn))
     return -1;
 
-  insn->operands = &x86OperandSets[insn->spec->operands][0];
+  insn->operands = x86OperandSets[insn->spec->operands];
 
   insn->length = insn->readerCursor - insn->startLocation;
 
diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
index ac3b39d..8c45402 100644
--- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
+++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
@@ -1,39 +1,28 @@
-/*===-- X86DisassemblerDecoderInternal.h - Disassembler decoder ---*- C -*-===*
- *
- *                     The LLVM Compiler Infrastructure
- *
- * This file is distributed under the University of Illinois Open Source
- * License. See LICENSE.TXT for details.
- *
- *===----------------------------------------------------------------------===*
- *
- * This file is part of the X86 Disassembler.
- * It contains the public interface of the instruction decoder.
- * Documentation for the disassembler can be found in X86Disassembler.h.
- *
- *===----------------------------------------------------------------------===*/
+//===-- X86DisassemblerDecoderInternal.h - Disassembler decoder -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is part of the X86 Disassembler.
+// It contains the public interface of the instruction decoder.
+// Documentation for the disassembler can be found in X86Disassembler.h.
+//
+//===----------------------------------------------------------------------===//
 
 #ifndef X86DISASSEMBLERDECODER_H
 #define X86DISASSEMBLERDECODER_H
 
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#define INSTRUCTION_SPECIFIER_FIELDS \
-  uint16_t operands;
-
-#define INSTRUCTION_IDS     \
-  uint16_t instructionIDs;
-
 #include "X86DisassemblerDecoderCommon.h"
+#include "llvm/ADT/ArrayRef.h"
 
-#undef INSTRUCTION_SPECIFIER_FIELDS
-#undef INSTRUCTION_IDS
+namespace llvm {
+namespace X86Disassembler {
 
-/*
- * Accessor functions for various fields of an Intel instruction
- */
+// Accessor functions for various fields of an Intel instruction
 #define modFromModRM(modRM)  (((modRM) & 0xc0) >> 6)
 #define regFromModRM(modRM)  (((modRM) & 0x38) >> 3)
 #define rmFromModRM(modRM)   ((modRM) & 0x7)
@@ -83,10 +72,7 @@ extern "C" {
 #define lFromXOP3of3(xop)       (((xop) & 0x4) >> 2)
 #define ppFromXOP3of3(xop)      ((xop) & 0x3)
 
-/*
- * These enums represent Intel registers for use by the decoder.
- */
-
+// These enums represent Intel registers for use by the decoder.
 #define REGS_8BIT     \
   ENTRY(AL)           \
   ENTRY(CL)           \
@@ -392,13 +378,11 @@ extern "C" {
   REGS_CONTROL        \
   ENTRY(RIP)
 
-/*
- * EABase - All possible values of the base field for effective-address
- *   computations, a.k.a. the Mod and R/M fields of the ModR/M byte.  We
- *   distinguish between bases (EA_BASE_*) and registers that just happen to be
- *   referred to when Mod == 0b11 (EA_REG_*).
- */
-typedef enum {
+/// \brief All possible values of the base field for effective-address
+/// computations, a.k.a. the Mod and R/M fields of the ModR/M byte.
+/// We distinguish between bases (EA_BASE_*) and registers that just happen
+/// to be referred to when Mod == 0b11 (EA_REG_*).
+enum EABase {
   EA_BASE_NONE,
 #define ENTRY(x) EA_BASE_##x,
   ALL_EA_BASES
@@ -407,15 +391,13 @@ typedef enum {
   ALL_REGS
 #undef ENTRY
   EA_max
-} EABase;
-
-/*
- * SIBIndex - All possible values of the SIB index field.
- *   Borrows entries from ALL_EA_BASES with the special case that
- *   sib is synonymous with NONE.
- * Vector SIB: index can be XMM or YMM.
- */
-typedef enum {
+};
+
+/// \brief All possible values of the SIB index field.
+/// borrows entries from ALL_EA_BASES with the special case that
+/// sib is synonymous with NONE.
+/// Vector SIB: index can be XMM or YMM.
+enum SIBIndex {
   SIB_INDEX_NONE,
 #define ENTRY(x) SIB_INDEX_##x,
   ALL_EA_BASES
@@ -424,23 +406,18 @@ typedef enum {
   REGS_ZMM
 #undef ENTRY
   SIB_INDEX_max
-} SIBIndex;
+};
 
-/*
- * SIBBase - All possible values of the SIB base field.
- */
-typedef enum {
+/// \brief All possible values of the SIB base field.
+enum SIBBase {
   SIB_BASE_NONE,
 #define ENTRY(x) SIB_BASE_##x,
   ALL_SIB_BASES
 #undef ENTRY
   SIB_BASE_max
-} SIBBase;
+};
 
-/*
- * EADisplacement - Possible displacement types for effective-address
- *   computations.
- */
+/// \brief Possible displacement types for effective-address computations.
 typedef enum {
   EA_DISP_NONE,
   EA_DISP_8,
@@ -448,20 +425,16 @@ typedef enum {
   EA_DISP_32
 } EADisplacement;
 
-/*
- * Reg - All possible values of the reg field in the ModR/M byte.
- */
-typedef enum {
+/// \brief All possible values of the reg field in the ModR/M byte.
+enum Reg {
 #define ENTRY(x) MODRM_REG_##x,
   ALL_REGS
 #undef ENTRY
   MODRM_REG_max
-} Reg;
+};
 
-/*
- * SegmentOverride - All possible segment overrides.
- */
-typedef enum {
+/// \brief All possible segment overrides.
+enum SegmentOverride {
   SEG_OVERRIDE_NONE,
   SEG_OVERRIDE_CS,
   SEG_OVERRIDE_SS,
@@ -470,235 +443,220 @@ typedef enum {
   SEG_OVERRIDE_FS,
   SEG_OVERRIDE_GS,
   SEG_OVERRIDE_max
-} SegmentOverride;
-
-/*
- * VEXLeadingOpcodeByte - Possible values for the VEX.m-mmmm field
- */
+};
 
-typedef enum {
+/// \brief Possible values for the VEX.m-mmmm field
+enum VEXLeadingOpcodeByte {
   VEX_LOB_0F = 0x1,
   VEX_LOB_0F38 = 0x2,
   VEX_LOB_0F3A = 0x3
-} VEXLeadingOpcodeByte;
+};
 
-typedef enum {
+enum XOPMapSelect {
   XOP_MAP_SELECT_8 = 0x8,
   XOP_MAP_SELECT_9 = 0x9,
   XOP_MAP_SELECT_A = 0xA
-} XOPMapSelect;
-
-/*
- * VEXPrefixCode - Possible values for the VEX.pp/EVEX.pp field
- */
+};
 
-typedef enum {
+/// \brief Possible values for the VEX.pp/EVEX.pp field
+enum VEXPrefixCode {
   VEX_PREFIX_NONE = 0x0,
   VEX_PREFIX_66 = 0x1,
   VEX_PREFIX_F3 = 0x2,
   VEX_PREFIX_F2 = 0x3
-} VEXPrefixCode;
+};
 
-typedef enum {
+enum VectorExtensionType {
   TYPE_NO_VEX_XOP   = 0x0,
   TYPE_VEX_2B       = 0x1,
   TYPE_VEX_3B       = 0x2,
   TYPE_EVEX         = 0x3,
   TYPE_XOP          = 0x4
-} VectorExtensionType;
-
-typedef uint8_t BOOL;
-
-/*
- * byteReader_t - Type for the byte reader that the consumer must provide to
- *   the decoder.  Reads a single byte from the instruction's address space.
- * @param arg     - A baton that the consumer can associate with any internal
- *                  state that it needs.
- * @param byte    - A pointer to a single byte in memory that should be set to
- *                  contain the value at address.
- * @param address - The address in the instruction's address space that should
- *                  be read from.
- * @return        - -1 if the byte cannot be read for any reason; 0 otherwise.
- */
-typedef int (*byteReader_t)(const void* arg, uint8_t* byte, uint64_t address);
-
-/*
- * dlog_t - Type for the logging function that the consumer can provide to
- *   get debugging output from the decoder.
- * @param arg     - A baton that the consumer can associate with any internal
- *                  state that it needs.
- * @param log     - A string that contains the message.  Will be reused after
- *                  the logger returns.
- */
-typedef void (*dlog_t)(void* arg, const char *log);
-
-/*
- * The x86 internal instruction, which is produced by the decoder.
- */
+};
+
+/// \brief Type for the byte reader that the consumer must provide to
+/// the decoder. Reads a single byte from the instruction's address space.
+/// \param arg     A baton that the consumer can associate with any internal
+///                state that it needs.
+/// \param byte    A pointer to a single byte in memory that should be set to
+///                contain the value at address.
+/// \param address The address in the instruction's address space that should
+///                be read from.
+/// \return        -1 if the byte cannot be read for any reason; 0 otherwise.
+typedef int (*byteReader_t)(const void *arg, uint8_t *byte, uint64_t address);
+
+/// \brief Type for the logging function that the consumer can provide to
+/// get debugging output from the decoder.
+/// \param arg A baton that the consumer can associate with any internal
+///            state that it needs.
+/// \param log A string that contains the message.  Will be reused after
+///            the logger returns.
+typedef void (*dlog_t)(void *arg, const char *log);
+
+/// The specification for how to extract and interpret a full instruction and
+/// its operands.
+struct InstructionSpecifier {
+  uint16_t operands;
+};
+
+/// The x86 internal instruction, which is produced by the decoder.
 struct InternalInstruction {
-  /* Reader interface (C) */
+  // Reader interface (C)
   byteReader_t reader;
-  /* Opaque value passed to the reader */
+  // Opaque value passed to the reader
   const void* readerArg;
-  /* The address of the next byte to read via the reader */
+  // The address of the next byte to read via the reader
   uint64_t readerCursor;
 
-  /* Logger interface (C) */
+  // Logger interface (C)
   dlog_t dlog;
-  /* Opaque value passed to the logger */
+  // Opaque value passed to the logger
   void* dlogArg;
 
-  /* General instruction information */
+  // General instruction information
 
-  /* The mode to disassemble for (64-bit, protected, real) */
+  // The mode to disassemble for (64-bit, protected, real)
   DisassemblerMode mode;
-  /* The start of the instruction, usable with the reader */
+  // The start of the instruction, usable with the reader
   uint64_t startLocation;
-  /* The length of the instruction, in bytes */
+  // The length of the instruction, in bytes
   size_t length;
 
-  /* Prefix state */
+  // Prefix state
 
-  /* 1 if the prefix byte corresponding to the entry is present; 0 if not */
+  // 1 if the prefix byte corresponding to the entry is present; 0 if not
   uint8_t prefixPresent[0x100];
-  /* contains the location (for use with the reader) of the prefix byte */
+  // contains the location (for use with the reader) of the prefix byte
   uint64_t prefixLocations[0x100];
-  /* The value of the vector extension prefix(EVEX/VEX/XOP), if present */
+  // The value of the vector extension prefix(EVEX/VEX/XOP), if present
   uint8_t vectorExtensionPrefix[4];
-  /* The type of the vector extension prefix */
+  // The type of the vector extension prefix
   VectorExtensionType vectorExtensionType;
-  /* The value of the REX prefix, if present */
+  // The value of the REX prefix, if present
   uint8_t rexPrefix;
-  /* The location where a mandatory prefix would have to be (i.e., right before
-     the opcode, or right before the REX prefix if one is present) */
+  // The location where a mandatory prefix would have to be (i.e., right before
+  // the opcode, or right before the REX prefix if one is present).
   uint64_t necessaryPrefixLocation;
-  /* The segment override type */
+  // The segment override type
   SegmentOverride segmentOverride;
-  /* 1 if the prefix byte, 0xf2 or 0xf3 is xacquire or xrelease */
-  BOOL xAcquireRelease;
+  // 1 if the prefix byte, 0xf2 or 0xf3 is xacquire or xrelease
+  bool xAcquireRelease;
 
-  /* Sizes of various critical pieces of data, in bytes */
+  // Sizes of various critical pieces of data, in bytes
   uint8_t registerSize;
   uint8_t addressSize;
   uint8_t displacementSize;
   uint8_t immediateSize;
 
-  /* Offsets from the start of the instruction to the pieces of data, which is
-     needed to find relocation entries for adding symbolic operands */
+  // Offsets from the start of the instruction to the pieces of data, which is
+  // needed to find relocation entries for adding symbolic operands.
   uint8_t displacementOffset;
   uint8_t immediateOffset;
 
-  /* opcode state */
+  // opcode state
 
-  /* The last byte of the opcode, not counting any ModR/M extension */
+  // The last byte of the opcode, not counting any ModR/M extension
   uint8_t opcode;
-  /* The ModR/M byte of the instruction, if it is an opcode extension */
+  // The ModR/M byte of the instruction, if it is an opcode extension
   uint8_t modRMExtension;
 
-  /* decode state */
+  // decode state
 
-  /* The type of opcode, used for indexing into the array of decode tables */
+  // The type of opcode, used for indexing into the array of decode tables
   OpcodeType opcodeType;
-  /* The instruction ID, extracted from the decode table */
+  // The instruction ID, extracted from the decode table
   uint16_t instructionID;
-  /* The specifier for the instruction, from the instruction info table */
-  const struct InstructionSpecifier *spec;
+  // The specifier for the instruction, from the instruction info table
+  const InstructionSpecifier *spec;
 
-  /* state for additional bytes, consumed during operand decode.  Pattern:
-     consumed___ indicates that the byte was already consumed and does not
-     need to be consumed again */
+  // state for additional bytes, consumed during operand decode.  Pattern:
+  // consumed___ indicates that the byte was already consumed and does not
+  // need to be consumed again.
 
-  /* The VEX.vvvv field, which contains a third register operand for some AVX
-     instructions */
+  // The VEX.vvvv field, which contains a third register operand for some AVX
+  // instructions.
   Reg                           vvvv;
 
-  /* The writemask for AVX-512 instructions which is contained in EVEX.aaa */
+  // The writemask for AVX-512 instructions which is contained in EVEX.aaa
   Reg                           writemask;
 
-  /* The ModR/M byte, which contains most register operands and some portion of
-     all memory operands */
-  BOOL                          consumedModRM;
+  // The ModR/M byte, which contains most register operands and some portion of
+  // all memory operands.
+  bool                          consumedModRM;
   uint8_t                       modRM;
 
-  /* The SIB byte, used for more complex 32- or 64-bit memory operands */
-  BOOL                          consumedSIB;
+  // The SIB byte, used for more complex 32- or 64-bit memory operands
+  bool                          consumedSIB;
   uint8_t                       sib;
 
-  /* The displacement, used for memory operands */
-  BOOL                          consumedDisplacement;
+  // The displacement, used for memory operands
+  bool                          consumedDisplacement;
   int32_t                       displacement;
 
-  /* Immediates.  There can be two in some cases */
+  // Immediates.  There can be two in some cases
   uint8_t                       numImmediatesConsumed;
   uint8_t                       numImmediatesTranslated;
   uint64_t                      immediates[2];
 
-  /* A register or immediate operand encoded into the opcode */
+  // A register or immediate operand encoded into the opcode
   Reg                           opcodeRegister;
 
-  /* Portions of the ModR/M byte */
+  // Portions of the ModR/M byte
 
-  /* These fields determine the allowable values for the ModR/M fields, which
-     depend on operand and address widths */
+  // These fields determine the allowable values for the ModR/M fields, which
+  // depend on operand and address widths.
   EABase                        eaBaseBase;
   EABase                        eaRegBase;
   Reg                           regBase;
 
-  /* The Mod and R/M fields can encode a base for an effective address, or a
-     register.  These are separated into two fields here */
+  // The Mod and R/M fields can encode a base for an effective address, or a
+  // register.  These are separated into two fields here.
   EABase                        eaBase;
   EADisplacement                eaDisplacement;
-  /* The reg field always encodes a register */
+  // The reg field always encodes a register
   Reg                           reg;
 
-  /* SIB state */
+  // SIB state
   SIBIndex                      sibIndex;
   uint8_t                       sibScale;
   SIBBase                       sibBase;
 
-  const struct OperandSpecifier *operands;
+  ArrayRef<OperandSpecifier> operands;
 };
 
-/* decodeInstruction - Decode one instruction and store the decoding results in
- *   a buffer provided by the consumer.
- * @param insn      - The buffer to store the instruction in.  Allocated by the
- *                    consumer.
- * @param reader    - The byteReader_t for the bytes to be read.
- * @param readerArg - An argument to pass to the reader for storing context
- *                    specific to the consumer.  May be NULL.
- * @param logger    - The dlog_t to be used in printing status messages from the
- *                    disassembler.  May be NULL.
- * @param loggerArg - An argument to pass to the logger for storing context
- *                    specific to the logger.  May be NULL.
- * @param startLoc  - The address (in the reader's address space) of the first
- *                    byte in the instruction.
- * @param mode      - The mode (16-bit, 32-bit, 64-bit) to decode in.
- * @return          - Nonzero if there was an error during decode, 0 otherwise.
- */
-int decodeInstruction(struct InternalInstruction* insn,
+/// \brief Decode one instruction and store the decoding results in
+/// a buffer provided by the consumer.
+/// \param insn      The buffer to store the instruction in.  Allocated by the
+///                  consumer.
+/// \param reader    The byteReader_t for the bytes to be read.
+/// \param readerArg An argument to pass to the reader for storing context
+///                  specific to the consumer.  May be NULL.
+/// \param logger    The dlog_t to be used in printing status messages from the
+///                  disassembler.  May be NULL.
+/// \param loggerArg An argument to pass to the logger for storing context
+///                  specific to the logger.  May be NULL.
+/// \param startLoc  The address (in the reader's address space) of the first
+///                  byte in the instruction.
+/// \param mode      The mode (16-bit, 32-bit, 64-bit) to decode in.
+/// \return          Nonzero if there was an error during decode, 0 otherwise.
+int decodeInstruction(InternalInstruction *insn,
                       byteReader_t reader,
-                      const void* readerArg,
+                      const void *readerArg,
                       dlog_t logger,
-                      void* loggerArg,
-                      const void* miiArg,
+                      void *loggerArg,
+                      const void *miiArg,
                       uint64_t startLoc,
                       DisassemblerMode mode);
 
-/* x86DisassemblerDebug - C-accessible function for printing a message to
- *   debugs()
- * @param file  - The name of the file printing the debug message.
- * @param line  - The line number that printed the debug message.
- * @param s     - The message to print.
- */
+/// \brief Print a message to debugs()
+/// \param file The name of the file printing the debug message.
+/// \param line The line number that printed the debug message.
+/// \param s    The message to print.
+void Debug(const char *file, unsigned line, const char *s);
 
-void x86DisassemblerDebug(const char *file,
-                          unsigned line,
-                          const char *s);
+const char *GetInstrName(unsigned Opcode, const void *mii);
 
-const char *x86DisassemblerGetInstrName(unsigned Opcode, const void *mii);
-
-#ifdef __cplusplus
-}
-#endif
+} // namespace X86Disassembler
+} // namespace llvm
 
 #endif
diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h b/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h
index 523ae99..f59e0b6 100644
--- a/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h
+++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h
@@ -1,29 +1,27 @@
-/*===-- X86DisassemblerDecoderCommon.h - Disassembler decoder -----*- C -*-===*
- *
- *                     The LLVM Compiler Infrastructure
- *
- * This file is distributed under the University of Illinois Open Source
- * License. See LICENSE.TXT for details.
- *
- *===----------------------------------------------------------------------===*
- *
- * This file is part of the X86 Disassembler.
- * It contains common definitions used by both the disassembler and the table
- *  generator.
- * Documentation for the disassembler can be found in X86Disassembler.h.
- *
- *===----------------------------------------------------------------------===*/
-
-/*
- * This header file provides those definitions that need to be shared between
- * the decoder and the table generator in a C-friendly manner.
- */
+//===-- X86DisassemblerDecoderCommon.h - Disassembler decoder ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is part of the X86 Disassembler.
+// It contains common definitions used by both the disassembler and the table
+//  generator.
+// Documentation for the disassembler can be found in X86Disassembler.h.
+//
+//===----------------------------------------------------------------------===//
 
 #ifndef X86DISASSEMBLERDECODERCOMMON_H
 #define X86DISASSEMBLERDECODERCOMMON_H
 
 #include "llvm/Support/DataTypes.h"
 
+namespace llvm {
+namespace X86Disassembler {
+
 #define INSTRUCTIONS_SYM  x86DisassemblerInstrSpecifiers
 #define CONTEXTS_SYM      x86DisassemblerContexts
 #define ONEBYTE_SYM       x86DisassemblerOneByteOpcodes
@@ -44,11 +42,9 @@
 #define XOP9_MAP_STR      "x86DisassemblerXOP9Opcodes"
 #define XOPA_MAP_STR      "x86DisassemblerXOPAOpcodes"
 
-/*
- * Attributes of an instruction that must be known before the opcode can be
- * processed correctly.  Most of these indicate the presence of particular
- * prefixes, but ATTR_64BIT is simply an attribute of the decoding context.
- */
+// Attributes of an instruction that must be known before the opcode can be
+// processed correctly.  Most of these indicate the presence of particular
+// prefixes, but ATTR_64BIT is simply an attribute of the decoding context.
 #define ATTRIBUTE_BITS                  \
   ENUM_ENTRY(ATTR_NONE,   0x00)         \
   ENUM_ENTRY(ATTR_64BIT,  (0x1 << 0))   \
@@ -73,13 +69,11 @@ enum attributeBits {
 };
 #undef ENUM_ENTRY
 
-/*
- * Combinations of the above attributes that are relevant to instruction
- * decode. Although other combinations are possible, they can be reduced to
- * these without affecting the ultimately decoded instruction.
- */
+// Combinations of the above attributes that are relevant to instruction
+// decode. Although other combinations are possible, they can be reduced to
+// these without affecting the ultimately decoded instruction.
 
-/*           Class name           Rank  Rationale for rank assignment         */
+//           Class name           Rank  Rationale for rank assignment
 #define INSTRUCTION_CONTEXTS                                                   \
   ENUM_ENTRY(IC,                    0,  "says nothing about the instruction")  \
   ENUM_ENTRY(IC_64BIT,              1,  "says the instruction applies in "     \
@@ -274,17 +268,15 @@ enum attributeBits {
   ENUM_ENTRY(IC_EVEX_L2_W_OPSIZE_KZ, 4,  "requires EVEX_KZ, L2, W and OpSize")     
 
 #define ENUM_ENTRY(n, r, d) n,
-typedef enum {
+enum InstructionContext {
   INSTRUCTION_CONTEXTS
   IC_max
-} InstructionContext;
+};
 #undef ENUM_ENTRY
 
-/*
- * Opcode types, which determine which decode table to use, both in the Intel
- * manual and also for the decoder.
- */
-typedef enum {
+// Opcode types, which determine which decode table to use, both in the Intel
+// manual and also for the decoder.
+enum OpcodeType {
   ONEBYTE       = 0,
   TWOBYTE       = 1,
   THREEBYTE_38  = 2,
@@ -292,39 +284,33 @@ typedef enum {
   XOP8_MAP      = 4,
   XOP9_MAP      = 5,
   XOPA_MAP      = 6
-} OpcodeType;
-
-/*
- * The following structs are used for the hierarchical decode table.  After
- * determining the instruction's class (i.e., which IC_* constant applies to
- * it), the decoder reads the opcode.  Some instructions require specific
- * values of the ModR/M byte, so the ModR/M byte indexes into the final table.
- *
- * If a ModR/M byte is not required, "required" is left unset, and the values
- * for each instructionID are identical.
- */
+};
 
+// The following structs are used for the hierarchical decode table.  After
+// determining the instruction's class (i.e., which IC_* constant applies to
+// it), the decoder reads the opcode.  Some instructions require specific
+// values of the ModR/M byte, so the ModR/M byte indexes into the final table.
+//
+// If a ModR/M byte is not required, "required" is left unset, and the values
+// for each instructionID are identical.
 typedef uint16_t InstrUID;
 
-/*
- * ModRMDecisionType - describes the type of ModR/M decision, allowing the
- * consumer to determine the number of entries in it.
- *
- * MODRM_ONEENTRY - No matter what the value of the ModR/M byte is, the decoded
- *                  instruction is the same.
- * MODRM_SPLITRM  - If the ModR/M byte is between 0x00 and 0xbf, the opcode
- *                  corresponds to one instruction; otherwise, it corresponds to
- *                  a different instruction.
- * MODRM_SPLITMISC- If the ModR/M byte is between 0x00 and 0xbf, ModR/M byte
- *                  divided by 8 is used to select instruction; otherwise, each
- *                  value of the ModR/M byte could correspond to a different
- *                  instruction.
- * MODRM_SPLITREG - ModR/M byte divided by 8 is used to select instruction. This
-                    corresponds to instructions that use reg field as opcode
- * MODRM_FULL     - Potentially, each value of the ModR/M byte could correspond
- *                  to a different instruction.
- */
-
+// ModRMDecisionType - describes the type of ModR/M decision, allowing the
+// consumer to determine the number of entries in it.
+//
+// MODRM_ONEENTRY - No matter what the value of the ModR/M byte is, the decoded
+//                  instruction is the same.
+// MODRM_SPLITRM  - If the ModR/M byte is between 0x00 and 0xbf, the opcode
+//                  corresponds to one instruction; otherwise, it corresponds to
+//                  a different instruction.
+// MODRM_SPLITMISC- If the ModR/M byte is between 0x00 and 0xbf, ModR/M byte
+//                  divided by 8 is used to select instruction; otherwise, each
+//                  value of the ModR/M byte could correspond to a different
+//                  instruction.
+// MODRM_SPLITREG - ModR/M byte divided by 8 is used to select instruction. This
+//                  corresponds to instructions that use reg field as opcode
+// MODRM_FULL     - Potentially, each value of the ModR/M byte could correspond
+//                  to a different instruction.
 #define MODRMTYPES            \
   ENUM_ENTRY(MODRM_ONEENTRY)  \
   ENUM_ENTRY(MODRM_SPLITRM)   \
@@ -333,47 +319,13 @@ typedef uint16_t InstrUID;
   ENUM_ENTRY(MODRM_FULL)
 
 #define ENUM_ENTRY(n) n,
-typedef enum {
+enum ModRMDecisionType {
   MODRMTYPES
   MODRM_max
-} ModRMDecisionType;
-#undef ENUM_ENTRY
-
-/*
- * ModRMDecision - Specifies whether a ModR/M byte is needed and (if so) which
- *  instruction each possible value of the ModR/M byte corresponds to.  Once
- *  this information is known, we have narrowed down to a single instruction.
- */
-struct ModRMDecision {
-  uint8_t     modrm_type;
-
-  /* The macro below must be defined wherever this file is included. */
-  INSTRUCTION_IDS
-};
-
-/*
- * OpcodeDecision - Specifies which set of ModR/M->instruction tables to look at
- *   given a particular opcode.
- */
-struct OpcodeDecision {
-  struct ModRMDecision modRMDecisions[256];
-};
-
-/*
- * ContextDecision - Specifies which opcode->instruction tables to look at given
- *   a particular context (set of attributes).  Since there are many possible
- *   contexts, the decoder first uses CONTEXTS_SYM to determine which context
- *   applies given a specific set of attributes.  Hence there are only IC_max
- *   entries in this table, rather than 2^(ATTR_max).
- */
-struct ContextDecision {
-  struct OpcodeDecision opcodeDecisions[IC_max];
 };
+#undef ENUM_ENTRY
 
-/*
- * Physical encodings of instruction operands.
- */
-
+// Physical encodings of instruction operands.
 #define ENCODINGS                                                              \
   ENUM_ENTRY(ENCODING_NONE,   "")                                              \
   ENUM_ENTRY(ENCODING_REG,    "Register operand in ModR/M byte.")              \
@@ -408,16 +360,13 @@ struct ContextDecision {
   ENUM_ENTRY(ENCODING_DI,     "Destination index; encoded in prefixes")
 
 #define ENUM_ENTRY(n, d) n,
-  typedef enum {
-    ENCODINGS
-    ENCODING_max
-  } OperandEncoding;
+enum OperandEncoding {
+  ENCODINGS
+  ENCODING_max
+};
 #undef ENUM_ENTRY
 
-/*
- * Semantic interpretations of instruction operands.
- */
-
+// Semantic interpretations of instruction operands.
 #define TYPES                                                                  \
   ENUM_ENTRY(TYPE_NONE,       "")                                              \
   ENUM_ENTRY(TYPE_REL8,       "1-byte immediate address")                      \
@@ -508,56 +457,42 @@ struct ContextDecision {
   ENUM_ENTRY(TYPE_M512,       "512-bit FPU/MMX/XMM/MXCSR state")
 
 #define ENUM_ENTRY(n, d) n,
-typedef enum {
+enum OperandType {
   TYPES
   TYPE_max
-} OperandType;
+};
 #undef ENUM_ENTRY
 
-/*
- * OperandSpecifier - The specification for how to extract and interpret one
- *   operand.
- */
+/// \brief The specification for how to extract and interpret one operand.
 struct OperandSpecifier {
   uint8_t encoding;
   uint8_t type;
 };
 
-/*
- * Indicates where the opcode modifier (if any) is to be found.  Extended
- * opcodes with AddRegFrm have the opcode modifier in the ModR/M byte.
- */
-
+// Indicates where the opcode modifier (if any) is to be found.  Extended
+// opcodes with AddRegFrm have the opcode modifier in the ModR/M byte.
 #define MODIFIER_TYPES        \
   ENUM_ENTRY(MODIFIER_NONE)
 
 #define ENUM_ENTRY(n) n,
-typedef enum {
+enum ModifierType {
   MODIFIER_TYPES
   MODIFIER_max
-} ModifierType;
+};
 #undef ENUM_ENTRY
 
-#define X86_MAX_OPERANDS 5
-
-/*
- * The specification for how to extract and interpret a full instruction and
- * its operands.
- */
-struct InstructionSpecifier {
-  /* The macro below must be defined wherever this file is included. */
-  INSTRUCTION_SPECIFIER_FIELDS
-};
+static const unsigned X86_MAX_OPERANDS = 5;
 
-/*
- * Decoding mode for the Intel disassembler.  16-bit, 32-bit, and 64-bit mode
- * are supported, and represent real mode, IA-32e, and IA-32e in 64-bit mode,
- * respectively.
- */
-typedef enum {
+/// Decoding mode for the Intel disassembler.  16-bit, 32-bit, and 64-bit mode
+/// are supported, and represent real mode, IA-32e, and IA-32e in 64-bit mode,
+/// respectively.
+enum DisassemblerMode {
   MODE_16BIT,
   MODE_32BIT,
   MODE_64BIT
-} DisassemblerMode;
+};
+
+} // namespace X86Disassembler
+} // namespace llvm
 
 #endif
diff --git a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
index eea0a76..b45b118 100644
--- a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
+++ b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
@@ -12,7 +12,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "asm-printer"
 #include "X86ATTInstPrinter.h"
 #include "MCTargetDesc/X86BaseInfo.h"
 #include "MCTargetDesc/X86MCTargetDesc.h"
@@ -28,6 +27,8 @@
 #include <map>
 using namespace llvm;
 
+#define DEBUG_TYPE "asm-printer"
+
 // Include the auto-generated portion of the assembly writer.
 #define PRINT_ALIAS_INSTR
 #include "X86GenAsmWriter.inc"
diff --git a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h
index f34e633..531183b 100644
--- a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h
+++ b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h
@@ -32,6 +32,8 @@ public:
   // Autogenerated by tblgen, returns true if we successfully printed an
   // alias.
   bool printAliasInstr(const MCInst *MI, raw_ostream &OS);
+  void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx,
+                               unsigned PrintMethodIdx, raw_ostream &O);
 
   // Autogenerated by tblgen.
   void printInstruction(const MCInst *MI, raw_ostream &OS);
diff --git a/lib/Target/X86/InstPrinter/X86InstComments.cpp b/lib/Target/X86/InstPrinter/X86InstComments.cpp
index db61fb0..baf6507 100644
--- a/lib/Target/X86/InstPrinter/X86InstComments.cpp
+++ b/lib/Target/X86/InstPrinter/X86InstComments.cpp
@@ -32,7 +32,7 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
                                   const char *(*getRegName)(unsigned)) {
   // If this is a shuffle operation, the switch should fill in this state.
   SmallVector<int, 8> ShuffleMask;
-  const char *DestName = 0, *Src1Name = 0, *Src2Name = 0;
+  const char *DestName = nullptr, *Src1Name = nullptr, *Src2Name = nullptr;
 
   switch (MI->getOpcode()) {
   case X86::INSERTPSrr:
@@ -492,7 +492,7 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
 
   // If this was a shuffle operation, print the shuffle mask.
   if (!ShuffleMask.empty()) {
-    if (DestName == 0) DestName = Src1Name;
+    if (!DestName) DestName = Src1Name;
     OS << (DestName ? DestName : "mem") << " = ";
 
     // If the two sources are the same, canonicalize the input elements to be
diff --git a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp
index 1c95d37..1c8466b 100644
--- a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp
+++ b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp
@@ -12,7 +12,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "asm-printer"
 #include "X86IntelInstPrinter.h"
 #include "MCTargetDesc/X86BaseInfo.h"
 #include "MCTargetDesc/X86MCTargetDesc.h"
@@ -25,6 +24,8 @@
 #include <cctype>
 using namespace llvm;
 
+#define DEBUG_TYPE "asm-printer"
+
 #include "X86GenAsmWriter1.inc"
 
 void X86IntelInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
diff --git a/lib/Target/X86/MCTargetDesc/Android.mk b/lib/Target/X86/MCTargetDesc/Android.mk
index ee37c27..a3c9bc8 100644
--- a/lib/Target/X86/MCTargetDesc/Android.mk
+++ b/lib/Target/X86/MCTargetDesc/Android.mk
@@ -14,7 +14,8 @@ x86_mc_desc_SRC_FILES := \
   X86MCCodeEmitter.cpp \
   X86MachORelocationInfo.cpp \
   X86MachObjectWriter.cpp \
-  X86WinCOFFObjectWriter.cpp
+  X86WinCOFFObjectWriter.cpp \
+  X86WinCOFFStreamer.cpp
 
 # For the host
 # =====================================================
diff --git a/lib/Target/X86/MCTargetDesc/CMakeLists.txt b/lib/Target/X86/MCTargetDesc/CMakeLists.txt
index 3f5a0e2..129c28d 100644
--- a/lib/Target/X86/MCTargetDesc/CMakeLists.txt
+++ b/lib/Target/X86/MCTargetDesc/CMakeLists.txt
@@ -5,6 +5,7 @@ add_llvm_library(LLVMX86Desc
   X86MCCodeEmitter.cpp
   X86MachObjectWriter.cpp
   X86ELFObjectWriter.cpp
+  X86WinCOFFStreamer.cpp
   X86WinCOFFObjectWriter.cpp
   X86MachORelocationInfo.cpp
   X86ELFRelocationInfo.cpp
diff --git a/lib/Target/X86/MCTargetDesc/LLVMBuild.txt b/lib/Target/X86/MCTargetDesc/LLVMBuild.txt
index 9e1d29c..146d111 100644
--- a/lib/Target/X86/MCTargetDesc/LLVMBuild.txt
+++ b/lib/Target/X86/MCTargetDesc/LLVMBuild.txt
@@ -19,5 +19,5 @@
 type = Library
 name = X86Desc
 parent = X86
-required_libraries = MC Support X86AsmPrinter X86Info
+required_libraries = MC Object Support X86AsmPrinter X86Info
 add_to_library_groups = X86
diff --git a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
index 23763f7..bf30a8e 100644
--- a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
@@ -37,23 +37,29 @@ MCDisableArithRelaxation("mc-x86-disable-arith-relaxation",
 
 static unsigned getFixupKindLog2Size(unsigned Kind) {
   switch (Kind) {
-  default: llvm_unreachable("invalid fixup kind!");
+  default:
+    llvm_unreachable("invalid fixup kind!");
   case FK_PCRel_1:
   case FK_SecRel_1:
-  case FK_Data_1: return 0;
+  case FK_Data_1:
+    return 0;
   case FK_PCRel_2:
   case FK_SecRel_2:
-  case FK_Data_2: return 1;
+  case FK_Data_2:
+    return 1;
   case FK_PCRel_4:
   case X86::reloc_riprel_4byte:
   case X86::reloc_riprel_4byte_movq_load:
   case X86::reloc_signed_4byte:
   case X86::reloc_global_offset_table:
   case FK_SecRel_4:
-  case FK_Data_4: return 2;
+  case FK_Data_4:
+    return 2;
   case FK_PCRel_8:
   case FK_SecRel_8:
-  case FK_Data_8: return 3;
+  case FK_Data_8:
+  case X86::reloc_global_offset_table8:
+    return 3;
   }
 }
 
diff --git a/lib/Target/X86/MCTargetDesc/X86BaseInfo.h b/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
index 38fab15..6aeb1f2 100644
--- a/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
+++ b/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
@@ -643,6 +643,10 @@ namespace X86II {
   /// counted as one operand.
   ///
   inline int getMemoryOperandNo(uint64_t TSFlags, unsigned Opcode) {
+    bool HasVEX_4V = (TSFlags >> X86II::VEXShift) & X86II::VEX_4V;
+    bool HasMemOp4 = (TSFlags >> X86II::VEXShift) & X86II::MemOp4;
+    bool HasEVEX_K = ((TSFlags >> X86II::VEXShift) & X86II::EVEX_K);
+    
     switch (TSFlags & X86II::FormMask) {
     default: llvm_unreachable("Unknown FormMask value in getMemoryOperandNo!");
     case X86II::Pseudo:
@@ -660,9 +664,6 @@ namespace X86II {
     case X86II::MRMDestMem:
       return 0;
     case X86II::MRMSrcMem: {
-      bool HasVEX_4V = (TSFlags >> X86II::VEXShift) & X86II::VEX_4V;
-      bool HasMemOp4 = (TSFlags >> X86II::VEXShift) & X86II::MemOp4;
-      bool HasEVEX_K = ((TSFlags >> X86II::VEXShift) & X86II::EVEX_K);
       unsigned FirstMemOp = 1;
       if (HasVEX_4V)
         ++FirstMemOp;// Skip the register source (which is encoded in VEX_VVVV).
@@ -690,6 +691,8 @@ namespace X86II {
       unsigned FirstMemOp = 0;
       if (HasVEX_4V)
         ++FirstMemOp;// Skip the register dest (which is encoded in VEX_VVVV).
+      if (HasEVEX_K)
+        ++FirstMemOp;// Skip the mask register
       return FirstMemOp;
     }
     case X86II::MRM_C0: case X86II::MRM_C1: case X86II::MRM_C2:
diff --git a/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
index c44d88d..3fdec87 100644
--- a/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
@@ -43,7 +43,7 @@ unsigned X86ELFObjectWriter::GetRelocType(const MCValue &Target,
                                           bool IsPCRel) const {
   // determine the type of the relocation
 
-  MCSymbolRefExpr::VariantKind Modifier = Fixup.getAccessVariant();
+  MCSymbolRefExpr::VariantKind Modifier = Target.getAccessVariant();
   unsigned Type;
   if (getEMachine() == ELF::EM_X86_64) {
     if (IsPCRel) {
@@ -98,6 +98,12 @@ unsigned X86ELFObjectWriter::GetRelocType(const MCValue &Target,
     } else {
       switch ((unsigned)Fixup.getKind()) {
       default: llvm_unreachable("invalid fixup kind!");
+      case X86::reloc_global_offset_table8:
+        Type = ELF::R_X86_64_GOTPC64;
+        break;
+      case X86::reloc_global_offset_table:
+        Type = ELF::R_X86_64_GOTPC32;
+        break;
       case FK_Data_8:
         switch (Modifier) {
         default:
diff --git a/lib/Target/X86/MCTargetDesc/X86ELFRelocationInfo.cpp b/lib/Target/X86/MCTargetDesc/X86ELFRelocationInfo.cpp
index 4fa519c..b679316 100644
--- a/lib/Target/X86/MCTargetDesc/X86ELFRelocationInfo.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86ELFRelocationInfo.cpp
@@ -39,7 +39,7 @@ public:
     if (Sym->isVariable() == false)
       Sym->setVariableValue(MCConstantExpr::Create(SymAddr, Ctx));
 
-    const MCExpr *Expr = 0;
+    const MCExpr *Expr = nullptr;
     // If hasAddend is true, then we need to add Addend (r_addend) to Expr.
     bool hasAddend = false;
 
diff --git a/lib/Target/X86/MCTargetDesc/X86FixupKinds.h b/lib/Target/X86/MCTargetDesc/X86FixupKinds.h
index f2e34cb..09396b7 100644
--- a/lib/Target/X86/MCTargetDesc/X86FixupKinds.h
+++ b/lib/Target/X86/MCTargetDesc/X86FixupKinds.h
@@ -23,6 +23,7 @@ enum Fixups {
   reloc_global_offset_table,                 // 32-bit, relative to the start
                                              // of the instruction. Used only
                                              // for _GLOBAL_OFFSET_TABLE_.
+  reloc_global_offset_table8,                // 64-bit variant.
   // Marker
   LastTargetFixupKind,
   NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind
diff --git a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
index 6561804..39480ea 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
@@ -51,7 +51,7 @@ X86MCAsmInfoDarwin::X86MCAsmInfoDarwin(const Triple &T) {
   TextAlignFillValue = 0x90;
 
   if (!is64Bit)
-    Data64bitsDirective = 0;       // we can't emit a 64-bit unit
+    Data64bitsDirective = nullptr;       // we can't emit a 64-bit unit
 
   // Use ## as a comment string so that .s files generated by llvm can go
   // through the GCC preprocessor without causing an error.  This is needed
@@ -115,7 +115,7 @@ X86ELFMCAsmInfo::X86ELFMCAsmInfo(const Triple &T) {
   // into two .words.
   if ((T.getOS() == Triple::OpenBSD || T.getOS() == Triple::Bitrig) &&
        T.getArch() == Triple::x86)
-    Data64bitsDirective = 0;
+    Data64bitsDirective = nullptr;
 
   // Always enable the integrated assembler by default.
   // Clang also enabled it when the OS is Solaris but that is redundant here.
@@ -157,8 +157,10 @@ X86MCAsmInfoMicrosoft::X86MCAsmInfoMicrosoft(const Triple &Triple) {
 void X86MCAsmInfoGNUCOFF::anchor() { }
 
 X86MCAsmInfoGNUCOFF::X86MCAsmInfoGNUCOFF(const Triple &Triple) {
-  if (Triple.getArch() == Triple::x86_64)
+  if (Triple.getArch() == Triple::x86_64) {
     PrivateGlobalPrefix = ".L";
+    PointerSize = 8;
+  }
 
   AssemblerDialect = AsmWriterFlavor;
 
diff --git a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
index e6fb037..2152b21 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
@@ -11,7 +11,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "mccodeemitter"
 #include "MCTargetDesc/X86MCTargetDesc.h"
 #include "MCTargetDesc/X86BaseInfo.h"
 #include "MCTargetDesc/X86FixupKinds.h"
@@ -27,6 +26,8 @@
 
 using namespace llvm;
 
+#define DEBUG_TYPE "mccodeemitter"
+
 namespace {
 class X86MCCodeEmitter : public MCCodeEmitter {
   X86MCCodeEmitter(const X86MCCodeEmitter &) LLVM_DELETED_FUNCTION;
@@ -285,7 +286,7 @@ enum GlobalOffsetTableExprKind {
 };
 static GlobalOffsetTableExprKind
 StartsWithGlobalOffsetTable(const MCExpr *Expr) {
-  const MCExpr *RHS = 0;
+  const MCExpr *RHS = nullptr;
   if (Expr->getKind() == MCExpr::Binary) {
     const MCBinaryExpr *BE = static_cast<const MCBinaryExpr *>(Expr);
     Expr = BE->getLHS();
@@ -316,7 +317,7 @@ void X86MCCodeEmitter::
 EmitImmediate(const MCOperand &DispOp, SMLoc Loc, unsigned Size,
               MCFixupKind FixupKind, unsigned &CurByte, raw_ostream &OS,
               SmallVectorImpl<MCFixup> &Fixups, int ImmOffset) const {
-  const MCExpr *Expr = NULL;
+  const MCExpr *Expr = nullptr;
   if (DispOp.isImm()) {
     // If this is a simple integer displacement that doesn't require a
     // relocation, emit it now.
@@ -339,7 +340,13 @@ EmitImmediate(const MCOperand &DispOp, SMLoc Loc, unsigned Size,
     if (Kind != GOT_None) {
       assert(ImmOffset == 0);
 
-      FixupKind = MCFixupKind(X86::reloc_global_offset_table);
+      if (Size == 8) {
+        FixupKind = MCFixupKind(X86::reloc_global_offset_table8);
+      } else {
+        assert(Size == 4);
+        FixupKind = MCFixupKind(X86::reloc_global_offset_table);
+      }
+
       if (Kind == GOT_Normal)
         ImmOffset = CurByte;
     } else if (Expr->getKind() == MCExpr::SymbolRef) {
@@ -1421,6 +1428,8 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS,
   case X86II::MRM6r: case X86II::MRM7r: {
     if (HasVEX_4V) // Skip the register dst (which is encoded in VEX_VVVV).
       ++CurOp;
+    if (HasEVEX_K) // Skip writemask
+      ++CurOp;
     EmitByte(BaseOpcode, CurByte, OS);
     uint64_t Form = TSFlags & X86II::FormMask;
     EmitRegModRMByte(MI.getOperand(CurOp++),
@@ -1436,6 +1445,8 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS,
   case X86II::MRM6m: case X86II::MRM7m: {
     if (HasVEX_4V) // Skip the register dst (which is encoded in VEX_VVVV).
       ++CurOp;
+    if (HasEVEX_K) // Skip writemask
+      ++CurOp;
     EmitByte(BaseOpcode, CurByte, OS);
     uint64_t Form = TSFlags & X86II::FormMask;
     EmitMemModRMByte(MI, CurOp, (Form == X86II::MRMXm) ? 0 : Form-X86II::MRM0m,
diff --git a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
index 09fdb9c..e63036c 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
@@ -27,6 +27,12 @@
 #include "llvm/Support/Host.h"
 #include "llvm/Support/TargetRegistry.h"
 
+#if _MSC_VER
+#include <intrin.h>
+#endif
+
+using namespace llvm;
+
 #define GET_REGINFO_MC_DESC
 #include "X86GenRegisterInfo.inc"
 
@@ -36,13 +42,6 @@
 #define GET_SUBTARGETINFO_MC_DESC
 #include "X86GenSubtargetInfo.inc"
 
-#if _MSC_VER
-#include <intrin.h>
-#endif
-
-using namespace llvm;
-
-
 std::string X86_MC::ParseX86Triple(StringRef TT) {
   Triple TheTriple(TT);
   std::string FS;
@@ -230,14 +229,8 @@ MCSubtargetInfo *X86_MC::createX86MCSubtargetInfo(StringRef TT, StringRef CPU,
   }
 
   std::string CPUName = CPU;
-  if (CPUName.empty()) {
-#if defined(i386) || defined(__i386__) || defined(__x86__) || defined(_M_IX86)\
-    || defined(__x86_64__) || defined(_M_AMD64) || defined (_M_X64)
-    CPUName = sys::getHostCPUName();
-#else
+  if (CPUName.empty())
     CPUName = "generic";
-#endif
-  }
 
   MCSubtargetInfo *X = new MCSubtargetInfo();
   InitX86MCSubtargetInfo(X, TT, CPUName, ArchFS);
@@ -294,13 +287,13 @@ static MCAsmInfo *createX86MCAsmInfo(const MCRegisterInfo &MRI, StringRef TT) {
   // Initial state of the frame pointer is esp+stackGrowth.
   unsigned StackPtr = is64Bit ? X86::RSP : X86::ESP;
   MCCFIInstruction Inst = MCCFIInstruction::createDefCfa(
-      0, MRI.getDwarfRegNum(StackPtr, true), -stackGrowth);
+      nullptr, MRI.getDwarfRegNum(StackPtr, true), -stackGrowth);
   MAI->addInitialFrameState(Inst);
 
   // Add return address to move list
   unsigned InstPtr = is64Bit ? X86::RIP : X86::EIP;
   MCCFIInstruction Inst2 = MCCFIInstruction::createOffset(
-      0, MRI.getDwarfRegNum(InstPtr, true), stackGrowth);
+      nullptr, MRI.getDwarfRegNum(InstPtr, true), stackGrowth);
   MAI->addInitialFrameState(Inst2);
 
   return MAI;
@@ -365,13 +358,16 @@ static MCStreamer *createMCStreamer(const Target &T, StringRef TT,
                                     bool NoExecStack) {
   Triple TheTriple(TT);
 
-  if (TheTriple.isOSBinFormatMachO())
+  switch (TheTriple.getObjectFormat()) {
+  default: llvm_unreachable("unsupported object format");
+  case Triple::MachO:
     return createMachOStreamer(Ctx, MAB, _OS, _Emitter, RelaxAll);
-
-  if (TheTriple.isOSWindows() && !TheTriple.isOSBinFormatELF())
-    return createWinCOFFStreamer(Ctx, MAB, *_Emitter, _OS, RelaxAll);
-
-  return createELFStreamer(Ctx, MAB, _OS, _Emitter, RelaxAll, NoExecStack);
+  case Triple::COFF:
+    assert(TheTriple.isOSWindows() && "only Windows COFF is supported");
+    return createX86WinCOFFStreamer(Ctx, MAB, _Emitter, _OS, RelaxAll);
+  case Triple::ELF:
+    return createELFStreamer(Ctx, MAB, _OS, _Emitter, RelaxAll, NoExecStack);
+  }
 }
 
 static MCInstPrinter *createX86MCInstPrinter(const Target &T,
@@ -384,7 +380,7 @@ static MCInstPrinter *createX86MCInstPrinter(const Target &T,
     return new X86ATTInstPrinter(MAI, MII, MRI);
   if (SyntaxVariant == 1)
     return new X86IntelInstPrinter(MAI, MII, MRI);
-  return 0;
+  return nullptr;
 }
 
 static MCRelocationInfo *createX86MCRelocationInfo(StringRef TT,
diff --git a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
index 41ae435..8fe40fd 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
+++ b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
@@ -26,6 +26,7 @@ class MCObjectWriter;
 class MCRegisterInfo;
 class MCSubtargetInfo;
 class MCRelocationInfo;
+class MCStreamer;
 class Target;
 class StringRef;
 class raw_ostream;
@@ -84,6 +85,14 @@ MCAsmBackend *createX86_32AsmBackend(const Target &T, const MCRegisterInfo &MRI,
 MCAsmBackend *createX86_64AsmBackend(const Target &T, const MCRegisterInfo &MRI,
                                      StringRef TT, StringRef CPU);
 
+/// createX86WinCOFFStreamer - Construct an X86 Windows COFF machine code
+/// streamer which will generate PE/COFF format object files.
+///
+/// Takes ownership of \p AB and \p CE.
+MCStreamer *createX86WinCOFFStreamer(MCContext &C, MCAsmBackend &AB,
+                                     MCCodeEmitter *CE, raw_ostream &OS,
+                                     bool RelaxAll);
+
 /// createX86MachObjectWriter - Construct an X86 Mach-O object writer.
 MCObjectWriter *createX86MachObjectWriter(raw_ostream &OS,
                                           bool Is64Bit,
diff --git a/lib/Target/X86/MCTargetDesc/X86MachORelocationInfo.cpp b/lib/Target/X86/MCTargetDesc/X86MachORelocationInfo.cpp
index f2023e3..3b81d53 100644
--- a/lib/Target/X86/MCTargetDesc/X86MachORelocationInfo.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MachORelocationInfo.cpp
@@ -40,7 +40,7 @@ public:
     // FIXME: check that the value is actually the same.
     if (Sym->isVariable() == false)
       Sym->setVariableValue(MCConstantExpr::Create(SymAddr, Ctx));
-    const MCExpr *Expr = 0;
+    const MCExpr *Expr = nullptr;
 
     switch(RelType) {
     case X86_64_RELOC_TLV:
diff --git a/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
index 1a35ced..ead3338 100644
--- a/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
@@ -146,13 +146,13 @@ void X86MachObjectWriter::RecordX86_64Relocation(MachObjectWriter *Writer,
     const MCSymbol *A = &Target.getSymA()->getSymbol();
     if (A->isTemporary())
       A = &A->AliasedSymbol();
-    MCSymbolData &A_SD = Asm.getSymbolData(*A);
+    const MCSymbolData &A_SD = Asm.getSymbolData(*A);
     const MCSymbolData *A_Base = Asm.getAtom(&A_SD);
 
     const MCSymbol *B = &Target.getSymB()->getSymbol();
     if (B->isTemporary())
       B = &B->AliasedSymbol();
-    MCSymbolData &B_SD = Asm.getSymbolData(*B);
+    const MCSymbolData &B_SD = Asm.getSymbolData(*B);
     const MCSymbolData *B_Base = Asm.getAtom(&B_SD);
 
     // Neither symbol can be modified.
@@ -186,9 +186,9 @@ void X86MachObjectWriter::RecordX86_64Relocation(MachObjectWriter *Writer,
                          false);
 
     Value += Writer->getSymbolAddress(&A_SD, Layout) -
-      (A_Base == NULL ? 0 : Writer->getSymbolAddress(A_Base, Layout));
+      (!A_Base ? 0 : Writer->getSymbolAddress(A_Base, Layout));
     Value -= Writer->getSymbolAddress(&B_SD, Layout) -
-      (B_Base == NULL ? 0 : Writer->getSymbolAddress(B_Base, Layout));
+      (!B_Base ? 0 : Writer->getSymbolAddress(B_Base, Layout));
 
     if (A_Base) {
       Index = A_Base->getIndex();
@@ -220,7 +220,7 @@ void X86MachObjectWriter::RecordX86_64Relocation(MachObjectWriter *Writer,
     Type = MachO::X86_64_RELOC_SUBTRACTOR;
   } else {
     const MCSymbol *Symbol = &Target.getSymA()->getSymbol();
-    MCSymbolData &SD = Asm.getSymbolData(*Symbol);
+    const MCSymbolData &SD = Asm.getSymbolData(*Symbol);
     const MCSymbolData *Base = Asm.getAtom(&SD);
 
     // Relocations inside debug sections always use local relocations when
@@ -231,7 +231,7 @@ void X86MachObjectWriter::RecordX86_64Relocation(MachObjectWriter *Writer,
       const MCSectionMachO &Section = static_cast<const MCSectionMachO&>(
         Fragment->getParent()->getSection());
       if (Section.hasAttribute(MachO::S_ATTR_DEBUG))
-        Base = 0;
+        Base = nullptr;
     }
 
     // x86_64 almost always uses external relocations, except when there is no
@@ -369,7 +369,7 @@ bool X86MachObjectWriter::RecordScatteredRelocation(MachObjectWriter *Writer,
 
   // See <reloc.h>.
   const MCSymbol *A = &Target.getSymA()->getSymbol();
-  MCSymbolData *A_SD = &Asm.getSymbolData(*A);
+  const MCSymbolData *A_SD = &Asm.getSymbolData(*A);
 
   if (!A_SD->getFragment())
     report_fatal_error("symbol '" + A->getName() +
@@ -382,7 +382,7 @@ bool X86MachObjectWriter::RecordScatteredRelocation(MachObjectWriter *Writer,
   uint32_t Value2 = 0;
 
   if (const MCSymbolRefExpr *B = Target.getSymB()) {
-    MCSymbolData *B_SD = &Asm.getSymbolData(B->getSymbol());
+    const MCSymbolData *B_SD = &Asm.getSymbolData(B->getSymbol());
 
     if (!B_SD->getFragment())
       report_fatal_error("symbol '" + B->getSymbol().getName() +
@@ -465,7 +465,7 @@ void X86MachObjectWriter::RecordTLVPRelocation(MachObjectWriter *Writer,
   unsigned IsPCRel = 0;
 
   // Get the symbol data.
-  MCSymbolData *SD_A = &Asm.getSymbolData(Target.getSymA()->getSymbol());
+  const MCSymbolData *SD_A = &Asm.getSymbolData(Target.getSymA()->getSymbol());
   unsigned Index = SD_A->getIndex();
 
   // We're only going to have a second symbol in pic mode and it'll be a
@@ -476,7 +476,8 @@ void X86MachObjectWriter::RecordTLVPRelocation(MachObjectWriter *Writer,
     // If this is a subtraction then we're pcrel.
     uint32_t FixupAddress =
       Writer->getFragmentAddress(Fragment, Layout) + Fixup.getOffset();
-    MCSymbolData *SD_B = &Asm.getSymbolData(Target.getSymB()->getSymbol());
+    const MCSymbolData *SD_B =
+        &Asm.getSymbolData(Target.getSymB()->getSymbol());
     IsPCRel = 1;
     FixedValue = (FixupAddress - Writer->getSymbolAddress(SD_B, Layout) +
                   Target.getConstant());
@@ -524,7 +525,7 @@ void X86MachObjectWriter::RecordX86Relocation(MachObjectWriter *Writer,
   }
 
   // Get the symbol data, if any.
-  MCSymbolData *SD = 0;
+  const MCSymbolData *SD = nullptr;
   if (Target.getSymA())
     SD = &Asm.getSymbolData(Target.getSymA()->getSymbol());
 
diff --git a/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp
index ffc9e8d..40af822 100644
--- a/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp
@@ -23,10 +23,8 @@ namespace llvm {
 
 namespace {
   class X86WinCOFFObjectWriter : public MCWinCOFFObjectTargetWriter {
-    const bool Is64Bit;
-
   public:
-    X86WinCOFFObjectWriter(bool Is64Bit_);
+    X86WinCOFFObjectWriter(bool Is64Bit);
     virtual ~X86WinCOFFObjectWriter();
 
     unsigned getRelocType(const MCValue &Target, const MCFixup &Fixup,
@@ -34,10 +32,9 @@ namespace {
   };
 }
 
-X86WinCOFFObjectWriter::X86WinCOFFObjectWriter(bool Is64Bit_)
-  : MCWinCOFFObjectTargetWriter(Is64Bit_ ? COFF::IMAGE_FILE_MACHINE_AMD64 :
-                                COFF::IMAGE_FILE_MACHINE_I386),
-    Is64Bit(Is64Bit_) {}
+X86WinCOFFObjectWriter::X86WinCOFFObjectWriter(bool Is64Bit)
+    : MCWinCOFFObjectTargetWriter(Is64Bit ? COFF::IMAGE_FILE_MACHINE_AMD64
+                                          : COFF::IMAGE_FILE_MACHINE_I386) {}
 
 X86WinCOFFObjectWriter::~X86WinCOFFObjectWriter() {}
 
@@ -49,29 +46,46 @@ unsigned X86WinCOFFObjectWriter::getRelocType(const MCValue &Target,
   MCSymbolRefExpr::VariantKind Modifier = Target.isAbsolute() ?
     MCSymbolRefExpr::VK_None : Target.getSymA()->getKind();
 
-  switch (FixupKind) {
-  case FK_PCRel_4:
-  case X86::reloc_riprel_4byte:
-  case X86::reloc_riprel_4byte_movq_load:
-    return Is64Bit ? COFF::IMAGE_REL_AMD64_REL32 : COFF::IMAGE_REL_I386_REL32;
-  case FK_Data_4:
-  case X86::reloc_signed_4byte:
-    if (Modifier == MCSymbolRefExpr::VK_COFF_IMGREL32)
-      return Is64Bit ? COFF::IMAGE_REL_AMD64_ADDR32NB :
-                       COFF::IMAGE_REL_I386_DIR32NB;
-    return Is64Bit ? COFF::IMAGE_REL_AMD64_ADDR32 : COFF::IMAGE_REL_I386_DIR32;
-  case FK_Data_8:
-    if (Is64Bit)
+  if (getMachine() == COFF::IMAGE_FILE_MACHINE_AMD64) {
+    switch (FixupKind) {
+    case FK_PCRel_4:
+    case X86::reloc_riprel_4byte:
+    case X86::reloc_riprel_4byte_movq_load:
+      return COFF::IMAGE_REL_AMD64_REL32;
+    case FK_Data_4:
+    case X86::reloc_signed_4byte:
+      if (Modifier == MCSymbolRefExpr::VK_COFF_IMGREL32)
+        return COFF::IMAGE_REL_AMD64_ADDR32NB;
+      return COFF::IMAGE_REL_AMD64_ADDR32;
+    case FK_Data_8:
       return COFF::IMAGE_REL_AMD64_ADDR64;
-    llvm_unreachable("unsupported relocation type");
-  case FK_SecRel_2:
-    return Is64Bit ? COFF::IMAGE_REL_AMD64_SECTION
-                   : COFF::IMAGE_REL_I386_SECTION;
-  case FK_SecRel_4:
-    return Is64Bit ? COFF::IMAGE_REL_AMD64_SECREL : COFF::IMAGE_REL_I386_SECREL;
-  default:
-    llvm_unreachable("unsupported relocation type");
-  }
+    case FK_SecRel_2:
+      return COFF::IMAGE_REL_AMD64_SECTION;
+    case FK_SecRel_4:
+      return COFF::IMAGE_REL_AMD64_SECREL;
+    default:
+      llvm_unreachable("unsupported relocation type");
+    }
+  } else if (getMachine() == COFF::IMAGE_FILE_MACHINE_I386) {
+    switch (FixupKind) {
+    case FK_PCRel_4:
+    case X86::reloc_riprel_4byte:
+    case X86::reloc_riprel_4byte_movq_load:
+      return COFF::IMAGE_REL_I386_REL32;
+    case FK_Data_4:
+    case X86::reloc_signed_4byte:
+      if (Modifier == MCSymbolRefExpr::VK_COFF_IMGREL32)
+        return COFF::IMAGE_REL_I386_DIR32NB;
+      return COFF::IMAGE_REL_I386_DIR32;
+    case FK_SecRel_2:
+      return COFF::IMAGE_REL_I386_SECTION;
+    case FK_SecRel_4:
+      return COFF::IMAGE_REL_I386_SECREL;
+    default:
+      llvm_unreachable("unsupported relocation type");
+    }
+  } else
+    llvm_unreachable("Unsupported COFF machine type.");
 }
 
 MCObjectWriter *llvm::createX86WinCOFFObjectWriter(raw_ostream &OS,
diff --git a/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp b/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp
new file mode 100644
index 0000000..c62fd0a
--- /dev/null
+++ b/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp
@@ -0,0 +1,51 @@
+//===-- X86WinCOFFStreamer.cpp - X86 Target WinCOFF Streamer ----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86MCTargetDesc.h"
+#include "llvm/MC/MCWinCOFFStreamer.h"
+
+using namespace llvm;
+
+namespace {
+class X86WinCOFFStreamer : public MCWinCOFFStreamer {
+public:
+  X86WinCOFFStreamer(MCContext &C, MCAsmBackend &AB, MCCodeEmitter *CE,
+                     raw_ostream &OS)
+    : MCWinCOFFStreamer(C, AB, *CE, OS) { }
+
+  void EmitWin64EHHandlerData() override;
+  void FinishImpl() override;
+};
+
+void X86WinCOFFStreamer::EmitWin64EHHandlerData() {
+  MCStreamer::EmitWin64EHHandlerData();
+
+  // We have to emit the unwind info now, because this directive
+  // actually switches to the .xdata section!
+  MCWin64EHUnwindEmitter::EmitUnwindInfo(*this, getCurrentW64UnwindInfo());
+}
+
+void X86WinCOFFStreamer::FinishImpl() {
+  EmitFrames(nullptr);
+  EmitW64Tables();
+
+  MCWinCOFFStreamer::FinishImpl();
+}
+}
+
+namespace llvm {
+MCStreamer *createX86WinCOFFStreamer(MCContext &C, MCAsmBackend &AB,
+                                     MCCodeEmitter *CE, raw_ostream &OS,
+                                     bool RelaxAll) {
+  X86WinCOFFStreamer *S = new X86WinCOFFStreamer(C, AB, CE, OS);
+  S->getAssembler().setRelaxAll(RelaxAll);
+  return S;
+}
+}
+
diff --git a/lib/Target/X86/X86.h b/lib/Target/X86/X86.h
index 18e6845..64e8ea8 100644
--- a/lib/Target/X86/X86.h
+++ b/lib/Target/X86/X86.h
@@ -30,9 +30,9 @@ class X86TargetMachine;
 FunctionPass *createX86ISelDag(X86TargetMachine &TM,
                                CodeGenOpt::Level OptLevel);
 
-/// createGlobalBaseRegPass - This pass initializes a global base
+/// createX86GlobalBaseRegPass - This pass initializes a global base
 /// register for PIC on x86-32.
-FunctionPass* createGlobalBaseRegPass();
+FunctionPass* createX86GlobalBaseRegPass();
 
 /// createCleanupLocalDynamicTLSPass() - This pass combines multiple accesses
 /// to local-dynamic TLS variables so that the TLS base address for the module
diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td
index 78edcf0..6912b57 100644
--- a/lib/Target/X86/X86.td
+++ b/lib/Target/X86/X86.td
@@ -166,6 +166,8 @@ def FeatureCallRegIndirect : SubtargetFeature<"call-reg-indirect",
                                      "Call register indirect">;
 def FeatureLEAUsesAG : SubtargetFeature<"lea-uses-ag", "LEAUsesAG", "true",
                                    "LEA instruction needs inputs at AG stage">;
+def FeatureSlowLEA : SubtargetFeature<"slow-lea", "SlowLEA", "true",
+                                   "LEA instruction with certain arguments is slow">;
 
 //===----------------------------------------------------------------------===//
 // X86 processors supported.
@@ -195,8 +197,7 @@ def : Proc<"pentium3m",       [FeatureSSE1, FeatureSlowBTMem]>;
 def : Proc<"pentium-m",       [FeatureSSE2, FeatureSlowBTMem]>;
 def : Proc<"pentium4",        [FeatureSSE2]>;
 def : Proc<"pentium4m",       [FeatureSSE2, FeatureSlowBTMem]>;
-def : Proc<"x86-64",          [FeatureSSE2, Feature64Bit, FeatureSlowBTMem,
-                               FeatureFastUAMem]>;
+
 // Intel Core Duo.
 def : ProcessorModel<"yonah", SandyBridgeModel,
                      [FeatureSSE3, FeatureSlowBTMem]>;
@@ -227,6 +228,7 @@ def : ProcessorModel<"slm",  SLMModel, [ProcIntelSLM,
                                FeaturePCLMUL, FeatureAES,
                                FeatureCallRegIndirect,
                                FeaturePRFCHW,
+                               FeatureSlowLEA,
                                FeatureSlowBTMem, FeatureFastUAMem]>;
 // "Arrandale" along with corei3 and corei5
 def : ProcessorModel<"corei7", SandyBridgeModel,
@@ -329,6 +331,13 @@ def : Proc<"bdver3",          [FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B,
                                FeaturePOPCNT, FeatureBMI,  FeatureTBM,
                                FeatureFMA, FeatureFSGSBase]>;
 
+// Excavator
+def : Proc<"bdver4",          [FeatureAVX2, FeatureXOP, FeatureFMA4,
+                               FeatureCMPXCHG16B, FeatureAES, FeaturePRFCHW,
+                               FeaturePCLMUL, FeatureF16C, FeatureLZCNT,
+                               FeaturePOPCNT, FeatureBMI, FeatureBMI2,
+                               FeatureTBM, FeatureFMA, FeatureFSGSBase]>;
+
 def : Proc<"geode",           [Feature3DNowA]>;
 
 def : Proc<"winchip-c6",      [FeatureMMX]>;
@@ -336,6 +345,20 @@ def : Proc<"winchip2",        [Feature3DNow]>;
 def : Proc<"c3",              [Feature3DNow]>;
 def : Proc<"c3-2",            [FeatureSSE1]>;
 
+// We also provide a generic 64-bit specific x86 processor model which tries to
+// be good for modern chips without enabling instruction set encodings past the
+// basic SSE2 and 64-bit ones. It disables slow things from any mainstream and
+// modern 64-bit x86 chip, and enables features that are generally beneficial.
+// 
+// We currently use the Sandy Bridge model as the default scheduling model as
+// we use it across Nehalem, Westmere, Sandy Bridge, and Ivy Bridge which
+// covers a huge swath of x86 processors. If there are specific scheduling
+// knobs which need to be tuned differently for AMD chips, we might consider
+// forming a common base for them.
+def : ProcessorModel<"x86-64", SandyBridgeModel,
+                     [FeatureSSE2, Feature64Bit, FeatureSlowBTMem,
+                      FeatureFastUAMem]>;
+
 //===----------------------------------------------------------------------===//
 // Register File Description
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/X86/X86AsmPrinter.cpp b/lib/Target/X86/X86AsmPrinter.cpp
index fb66acc..1dca568 100644
--- a/lib/Target/X86/X86AsmPrinter.cpp
+++ b/lib/Target/X86/X86AsmPrinter.cpp
@@ -15,7 +15,6 @@
 #include "X86AsmPrinter.h"
 #include "InstPrinter/X86ATTInstPrinter.h"
 #include "MCTargetDesc/X86BaseInfo.h"
-#include "X86COFFMachineModuleInfo.h"
 #include "X86InstrInfo.h"
 #include "X86MachineFunctionInfo.h"
 #include "llvm/ADT/SmallString.h"
@@ -102,7 +101,7 @@ static void printSymbolOperand(X86AsmPrinter &P, const MachineOperand &MO,
       MCSymbol *Sym = P.getSymbolWithGlobalValueBase(GV, "$non_lazy_ptr");
       MachineModuleInfoImpl::StubValueTy &StubSym =
           P.MMI->getObjFileInfo<MachineModuleInfoMachO>().getGVStubEntry(Sym);
-      if (StubSym.getPointer() == 0)
+      if (!StubSym.getPointer())
         StubSym = MachineModuleInfoImpl::
           StubValueTy(P.getSymbol(GV), !GV->hasInternalLinkage());
     } else if (MO.getTargetFlags() == X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE){
@@ -110,14 +109,14 @@ static void printSymbolOperand(X86AsmPrinter &P, const MachineOperand &MO,
       MachineModuleInfoImpl::StubValueTy &StubSym =
           P.MMI->getObjFileInfo<MachineModuleInfoMachO>().getHiddenGVStubEntry(
               Sym);
-      if (StubSym.getPointer() == 0)
+      if (!StubSym.getPointer())
         StubSym = MachineModuleInfoImpl::
           StubValueTy(P.getSymbol(GV), !GV->hasInternalLinkage());
     } else if (MO.getTargetFlags() == X86II::MO_DARWIN_STUB) {
       MCSymbol *Sym = P.getSymbolWithGlobalValueBase(GV, "$stub");
       MachineModuleInfoImpl::StubValueTy &StubSym =
           P.MMI->getObjFileInfo<MachineModuleInfoMachO>().getFnStubEntry(Sym);
-      if (StubSym.getPointer() == 0)
+      if (!StubSym.getPointer())
         StubSym = MachineModuleInfoImpl::
           StubValueTy(P.getSymbol(GV), !GV->hasInternalLinkage());
     }
@@ -174,7 +173,7 @@ static void printSymbolOperand(X86AsmPrinter &P, const MachineOperand &MO,
 
 static void printOperand(X86AsmPrinter &P, const MachineInstr *MI,
                          unsigned OpNo, raw_ostream &O,
-                         const char *Modifier = 0, unsigned AsmVariant = 0);
+                         const char *Modifier = nullptr, unsigned AsmVariant = 0);
 
 /// printPCRelImm - This is used to print an immediate value that ends up
 /// being encoded as a pc-relative value.  These print slightly differently, for
@@ -232,7 +231,7 @@ static void printOperand(X86AsmPrinter &P, const MachineInstr *MI,
 
 static void printLeaMemReference(X86AsmPrinter &P, const MachineInstr *MI,
                                  unsigned Op, raw_ostream &O,
-                                 const char *Modifier = NULL) {
+                                 const char *Modifier = nullptr) {
   const MachineOperand &BaseReg  = MI->getOperand(Op+X86::AddrBaseReg);
   const MachineOperand &IndexReg = MI->getOperand(Op+X86::AddrIndexReg);
   const MachineOperand &DispSpec = MI->getOperand(Op+X86::AddrDisp);
@@ -284,7 +283,7 @@ static void printLeaMemReference(X86AsmPrinter &P, const MachineInstr *MI,
 
 static void printMemReference(X86AsmPrinter &P, const MachineInstr *MI,
                               unsigned Op, raw_ostream &O,
-                              const char *Modifier = NULL) {
+                              const char *Modifier = nullptr) {
   assert(isMem(MI, Op) && "Invalid memory reference!");
   const MachineOperand &Segment = MI->getOperand(Op+X86::AddrSegmentReg);
   if (Segment.getReg()) {
@@ -296,7 +295,7 @@ static void printMemReference(X86AsmPrinter &P, const MachineInstr *MI,
 
 static void printIntelMemReference(X86AsmPrinter &P, const MachineInstr *MI,
                                    unsigned Op, raw_ostream &O,
-                                   const char *Modifier = NULL,
+                                   const char *Modifier = nullptr,
                                    unsigned AsmVariant = 1) {
   const MachineOperand &BaseReg  = MI->getOperand(Op+X86::AddrBaseReg);
   unsigned ScaleVal = MI->getOperand(Op+X86::AddrScaleAmt).getImm();
@@ -464,7 +463,7 @@ bool X86AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
     }
   }
 
-  printOperand(*this, MI, OpNo, O, /*Modifier*/ 0, AsmVariant);
+  printOperand(*this, MI, OpNo, O, /*Modifier*/ nullptr, AsmVariant);
   return false;
 }
 
@@ -527,6 +526,55 @@ void X86AsmPrinter::EmitStartOfAsmFile(Module &M) {
   }
 }
 
+static void
+emitNonLazySymbolPointer(MCStreamer &OutStreamer, MCSymbol *StubLabel,
+                         MachineModuleInfoImpl::StubValueTy &MCSym) {
+  // L_foo$stub:
+  OutStreamer.EmitLabel(StubLabel);
+  //   .indirect_symbol _foo
+  OutStreamer.EmitSymbolAttribute(MCSym.getPointer(), MCSA_IndirectSymbol);
+
+  if (MCSym.getInt())
+    // External to current translation unit.
+    OutStreamer.EmitIntValue(0, 4/*size*/);
+  else
+    // Internal to current translation unit.
+    //
+    // When we place the LSDA into the TEXT section, the type info
+    // pointers need to be indirect and pc-rel. We accomplish this by
+    // using NLPs; however, sometimes the types are local to the file.
+    // We need to fill in the value for the NLP in those cases.
+    OutStreamer.EmitValue(
+        MCSymbolRefExpr::Create(MCSym.getPointer(), OutStreamer.getContext()),
+        4 /*size*/);
+}
+
+void X86AsmPrinter::GenerateExportDirective(const MCSymbol *Sym, bool IsData) {
+  SmallString<128> Directive;
+  raw_svector_ostream OS(Directive);
+  StringRef Name = Sym->getName();
+
+  if (Subtarget->isTargetKnownWindowsMSVC())
+    OS << " /EXPORT:";
+  else
+    OS << " -export:";
+
+  if ((Subtarget->isTargetWindowsGNU() || Subtarget->isTargetWindowsCygwin()) &&
+      (Name[0] == getDataLayout().getGlobalPrefix()))
+    Name = Name.drop_front();
+
+  OS << Name;
+
+  if (IsData) {
+    if (Subtarget->isTargetKnownWindowsMSVC())
+      OS << ",DATA";
+    else
+      OS << ",data";
+  }
+
+  OS.flush();
+  OutStreamer.EmitBytes(Directive);
+}
 
 void X86AsmPrinter::EmitEndOfAsmFile(Module &M) {
   if (Subtarget->isTargetMacho()) {
@@ -547,11 +595,11 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) {
                                    5, SectionKind::getMetadata());
       OutStreamer.SwitchSection(TheSection);
 
-      for (unsigned i = 0, e = Stubs.size(); i != e; ++i) {
+      for (const auto &Stub : Stubs) {
         // L_foo$stub:
-        OutStreamer.EmitLabel(Stubs[i].first);
+        OutStreamer.EmitLabel(Stub.first);
         //   .indirect_symbol _foo
-        OutStreamer.EmitSymbolAttribute(Stubs[i].second.getPointer(),
+        OutStreamer.EmitSymbolAttribute(Stub.second.getPointer(),
                                         MCSA_IndirectSymbol);
         // hlt; hlt; hlt; hlt; hlt     hlt = 0xf4.
         const char HltInsts[] = "\xf4\xf4\xf4\xf4\xf4";
@@ -571,44 +619,24 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) {
                                    SectionKind::getMetadata());
       OutStreamer.SwitchSection(TheSection);
 
-      for (unsigned i = 0, e = Stubs.size(); i != e; ++i) {
-        // L_foo$non_lazy_ptr:
-        OutStreamer.EmitLabel(Stubs[i].first);
-        // .indirect_symbol _foo
-        MachineModuleInfoImpl::StubValueTy &MCSym = Stubs[i].second;
-        OutStreamer.EmitSymbolAttribute(MCSym.getPointer(),
-                                        MCSA_IndirectSymbol);
-        // .long 0
-        if (MCSym.getInt())
-          // External to current translation unit.
-          OutStreamer.EmitIntValue(0, 4/*size*/);
-        else
-          // Internal to current translation unit.
-          //
-          // When we place the LSDA into the TEXT section, the type info
-          // pointers need to be indirect and pc-rel. We accomplish this by
-          // using NLPs.  However, sometimes the types are local to the file. So
-          // we need to fill in the value for the NLP in those cases.
-          OutStreamer.EmitValue(MCSymbolRefExpr::Create(MCSym.getPointer(),
-                                                        OutContext), 4/*size*/);
-      }
+      for (auto &Stub : Stubs)
+        emitNonLazySymbolPointer(OutStreamer, Stub.first, Stub.second);
+
       Stubs.clear();
       OutStreamer.AddBlankLine();
     }
 
     Stubs = MMIMacho.GetHiddenGVStubList();
     if (!Stubs.empty()) {
-      OutStreamer.SwitchSection(getObjFileLowering().getDataSection());
-      EmitAlignment(2);
-
-      for (unsigned i = 0, e = Stubs.size(); i != e; ++i) {
-        // L_foo$non_lazy_ptr:
-        OutStreamer.EmitLabel(Stubs[i].first);
-        // .long _foo
-        OutStreamer.EmitValue(MCSymbolRefExpr::
-                              Create(Stubs[i].second.getPointer(),
-                                     OutContext), 4/*size*/);
-      }
+      const MCSection *TheSection =
+        OutContext.getMachOSection("__IMPORT", "__pointers",
+                                   MachO::S_NON_LAZY_SYMBOL_POINTERS,
+                                   SectionKind::getMetadata());
+      OutStreamer.SwitchSection(TheSection);
+
+      for (auto &Stub : Stubs)
+        emitNonLazySymbolPointer(OutStreamer, Stub.first, Stub.second);
+
       Stubs.clear();
       OutStreamer.AddBlankLine();
     }
@@ -630,46 +658,25 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) {
   }
 
   if (Subtarget->isTargetCOFF()) {
-    X86COFFMachineModuleInfo &COFFMMI =
-      MMI->getObjFileInfo<X86COFFMachineModuleInfo>();
-
-    // Emit type information for external functions
-    typedef X86COFFMachineModuleInfo::externals_iterator externals_iterator;
-    for (externals_iterator I = COFFMMI.externals_begin(),
-                            E = COFFMMI.externals_end();
-                            I != E; ++I) {
-      OutStreamer.BeginCOFFSymbolDef(CurrentFnSym);
-      OutStreamer.EmitCOFFSymbolStorageClass(COFF::IMAGE_SYM_CLASS_EXTERNAL);
-      OutStreamer.EmitCOFFSymbolType(COFF::IMAGE_SYM_DTYPE_FUNCTION
-                                               << COFF::SCT_COMPLEX_TYPE_SHIFT);
-      OutStreamer.EndCOFFSymbolDef();
-    }
-
     // Necessary for dllexport support
     std::vector<const MCSymbol*> DLLExportedFns, DLLExportedGlobals;
 
-    for (Module::const_iterator I = M.begin(), E = M.end(); I != E; ++I)
-      if (I->hasDLLExportStorageClass())
-        DLLExportedFns.push_back(getSymbol(I));
+    for (const auto &Function : M)
+      if (Function.hasDLLExportStorageClass())
+        DLLExportedFns.push_back(getSymbol(&Function));
 
-    for (Module::const_global_iterator I = M.global_begin(),
-           E = M.global_end(); I != E; ++I)
-      if (I->hasDLLExportStorageClass())
-        DLLExportedGlobals.push_back(getSymbol(I));
+    for (const auto &Global : M.globals())
+      if (Global.hasDLLExportStorageClass())
+        DLLExportedGlobals.push_back(getSymbol(&Global));
 
-    for (Module::const_alias_iterator I = M.alias_begin(), E = M.alias_end();
-                                      I != E; ++I) {
-      const GlobalValue *GV = I;
-      if (!GV->hasDLLExportStorageClass())
+    for (const auto &Alias : M.aliases()) {
+      if (!Alias.hasDLLExportStorageClass())
         continue;
 
-      while (const GlobalAlias *A = dyn_cast<GlobalAlias>(GV))
-        GV = A->getAliasedGlobal();
-
-      if (isa<Function>(GV))
-        DLLExportedFns.push_back(getSymbol(I));
-      else if (isa<GlobalVariable>(GV))
-        DLLExportedGlobals.push_back(getSymbol(I));
+      if (Alias.getType()->getElementType()->isFunctionTy())
+        DLLExportedFns.push_back(getSymbol(&Alias));
+      else
+        DLLExportedGlobals.push_back(getSymbol(&Alias));
     }
 
     // Output linker support code for dllexported globals on windows.
@@ -678,28 +685,11 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) {
         static_cast<const TargetLoweringObjectFileCOFF&>(getObjFileLowering());
 
       OutStreamer.SwitchSection(TLOFCOFF.getDrectveSection());
-      SmallString<128> name;
-      for (unsigned i = 0, e = DLLExportedGlobals.size(); i != e; ++i) {
-        if (Subtarget->isTargetKnownWindowsMSVC())
-          name = " /EXPORT:";
-        else
-          name = " -export:";
-        name += DLLExportedGlobals[i]->getName();
-        if (Subtarget->isTargetKnownWindowsMSVC())
-          name += ",DATA";
-        else
-        name += ",data";
-        OutStreamer.EmitBytes(name);
-      }
 
-      for (unsigned i = 0, e = DLLExportedFns.size(); i != e; ++i) {
-        if (Subtarget->isTargetKnownWindowsMSVC())
-          name = " /EXPORT:";
-        else
-          name = " -export:";
-        name += DLLExportedFns[i]->getName();
-        OutStreamer.EmitBytes(name);
-      }
+      for (auto & Symbol : DLLExportedGlobals)
+        GenerateExportDirective(Symbol, /*IsData=*/true);
+      for (auto & Symbol : DLLExportedFns)
+        GenerateExportDirective(Symbol, /*IsData=*/false);
     }
   }
 
@@ -715,9 +705,9 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) {
       OutStreamer.SwitchSection(TLOFELF.getDataRelSection());
       const DataLayout *TD = TM.getDataLayout();
 
-      for (unsigned i = 0, e = Stubs.size(); i != e; ++i) {
-        OutStreamer.EmitLabel(Stubs[i].first);
-        OutStreamer.EmitSymbolValue(Stubs[i].second.getPointer(),
+      for (const auto &Stub : Stubs) {
+        OutStreamer.EmitLabel(Stub.first);
+        OutStreamer.EmitSymbolValue(Stub.second.getPointer(),
                                     TD->getPointerSize());
       }
       Stubs.clear();
diff --git a/lib/Target/X86/X86AsmPrinter.h b/lib/Target/X86/X86AsmPrinter.h
index 3308cc2..e4eef5d 100644
--- a/lib/Target/X86/X86AsmPrinter.h
+++ b/lib/Target/X86/X86AsmPrinter.h
@@ -16,13 +16,15 @@
 #include "llvm/Target/TargetMachine.h"
 
 namespace llvm {
-
 class MCStreamer;
+class MCSymbol;
 
 class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter {
   const X86Subtarget *Subtarget;
   StackMaps SM;
 
+  void GenerateExportDirective(const MCSymbol *Sym, bool IsData);
+
  public:
   explicit X86AsmPrinter(TargetMachine &TM, MCStreamer &Streamer)
     : AsmPrinter(TM, Streamer), SM(*this) {
diff --git a/lib/Target/X86/X86COFFMachineModuleInfo.cpp b/lib/Target/X86/X86COFFMachineModuleInfo.cpp
deleted file mode 100644
index 6a6125b..0000000
--- a/lib/Target/X86/X86COFFMachineModuleInfo.cpp
+++ /dev/null
@@ -1,19 +0,0 @@
-//===-- X86COFFMachineModuleInfo.cpp - X86 COFF MMI Impl ------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This is an MMI implementation for X86 COFF (windows) targets.
-//
-//===----------------------------------------------------------------------===//
-
-#include "X86COFFMachineModuleInfo.h"
-using namespace llvm;
-
-
-X86COFFMachineModuleInfo::~X86COFFMachineModuleInfo() {
-}
diff --git a/lib/Target/X86/X86COFFMachineModuleInfo.h b/lib/Target/X86/X86COFFMachineModuleInfo.h
deleted file mode 100644
index 0dfeb42..0000000
--- a/lib/Target/X86/X86COFFMachineModuleInfo.h
+++ /dev/null
@@ -1,46 +0,0 @@
-//===-- X86coffmachinemoduleinfo.h - X86 COFF MMI Impl ----------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This is an MMI implementation for X86 COFF (windows) targets.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef X86COFF_MACHINEMODULEINFO_H
-#define X86COFF_MACHINEMODULEINFO_H
-
-#include "X86MachineFunctionInfo.h"
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/CodeGen/MachineModuleInfo.h"
-
-namespace llvm {
-  class X86MachineFunctionInfo;
-  class DataLayout;
-
-/// X86COFFMachineModuleInfo - This is a MachineModuleInfoImpl implementation
-/// for X86 COFF targets.
-class X86COFFMachineModuleInfo : public MachineModuleInfoImpl {
-  DenseSet<MCSymbol const *> Externals;
-public:
-  X86COFFMachineModuleInfo(const MachineModuleInfo &) {}
-  virtual ~X86COFFMachineModuleInfo();
-
-  void addExternalFunction(MCSymbol* Symbol) {
-    Externals.insert(Symbol);
-  }
-
-  typedef DenseSet<MCSymbol const *>::const_iterator externals_iterator;
-  externals_iterator externals_begin() const { return Externals.begin(); }
-  externals_iterator externals_end() const { return Externals.end(); }
-};
-
-
-
-} // end namespace llvm
-
-#endif
diff --git a/lib/Target/X86/X86CallingConv.h b/lib/Target/X86/X86CallingConv.h
index 040da35..e76f9fd 100644
--- a/lib/Target/X86/X86CallingConv.h
+++ b/lib/Target/X86/X86CallingConv.h
@@ -29,33 +29,6 @@ inline bool CC_X86_AnyReg_Error(unsigned &, MVT &, MVT &,
   return false;
 }
 
-inline bool CC_X86_CDeclMethod_SRet(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
-                                    CCValAssign::LocInfo &LocInfo,
-                                    ISD::ArgFlagsTy &ArgFlags, CCState &State) {
-  // Swap the order of the first two parameters if the first parameter is sret.
-  if (ArgFlags.isSRet()) {
-    assert(ValNo == 0);
-    assert(ValVT == MVT::i32);
-    State.AllocateStack(8, 4);
-    State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT, 4, LocVT, LocInfo));
-
-    // Indicate that we need to swap the order of the first and second
-    // parameters by "allocating" register zero.  There are no register
-    // parameters with cdecl methods, so we can use this to communicate to the
-    // next call.
-    State.AllocateReg(1);
-    return true;
-  } else if (ValNo == 1 && State.isAllocated(1)) {
-    assert(ValVT == MVT::i32 && "non-i32-sized this param unsupported");
-    // Stack was already allocated while processing sret.
-    State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT, 0, LocVT, LocInfo));
-    return true;
-  }
-
-  // All other args use the C calling convention.
-  return false;
-}
-
 } // End llvm namespace
 
 #endif
diff --git a/lib/Target/X86/X86CallingConv.td b/lib/Target/X86/X86CallingConv.td
index 1cfd827..0824d4e 100644
--- a/lib/Target/X86/X86CallingConv.td
+++ b/lib/Target/X86/X86CallingConv.td
@@ -485,15 +485,6 @@ def CC_X86_32_ThisCall_Win : CallingConv<[
   CCDelegateTo<CC_X86_32_ThisCall_Common>
 ]>;
 
-def CC_X86_CDeclMethod : CallingConv<[
-  // Promote i8/i16 arguments to i32.
-  CCIfType<[i8, i16], CCPromoteToType<i32>>,
-
-  CCCustom<"CC_X86_CDeclMethod_SRet">,
-
-  CCDelegateTo<CC_X86_32_Common>
-]>;
-
 def CC_X86_32_ThisCall : CallingConv<[
   CCIfSubtarget<"isTargetCygMing()", CCDelegateTo<CC_X86_32_ThisCall_Mingw>>,
   CCDelegateTo<CC_X86_32_ThisCall_Win>
@@ -583,7 +574,6 @@ def CC_Intel_OCL_BI : CallingConv<[
 def CC_X86_32 : CallingConv<[
   CCIfCC<"CallingConv::X86_FastCall", CCDelegateTo<CC_X86_32_FastCall>>,
   CCIfCC<"CallingConv::X86_ThisCall", CCDelegateTo<CC_X86_32_ThisCall>>,
-  CCIfCC<"CallingConv::X86_CDeclMethod", CCDelegateTo<CC_X86_CDeclMethod>>,
   CCIfCC<"CallingConv::Fast", CCDelegateTo<CC_X86_32_FastCC>>,
   CCIfCC<"CallingConv::GHC", CCDelegateTo<CC_X86_32_GHC>>,
   CCIfCC<"CallingConv::HiPE", CCDelegateTo<CC_X86_32_HiPE>>,
diff --git a/lib/Target/X86/X86CodeEmitter.cpp b/lib/Target/X86/X86CodeEmitter.cpp
index f6c4c2e..76718d0 100644
--- a/lib/Target/X86/X86CodeEmitter.cpp
+++ b/lib/Target/X86/X86CodeEmitter.cpp
@@ -12,7 +12,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "x86-emitter"
 #include "X86.h"
 #include "X86InstrInfo.h"
 #include "X86JITInfo.h"
@@ -36,6 +35,8 @@
 #include "llvm/Target/TargetOptions.h"
 using namespace llvm;
 
+#define DEBUG_TYPE "x86-emitter"
+
 STATISTIC(NumEmitted, "Number of machine instructions emitted");
 
 namespace {
@@ -52,7 +53,7 @@ namespace {
   public:
     static char ID;
     explicit Emitter(X86TargetMachine &tm, CodeEmitter &mce)
-      : MachineFunctionPass(ID), II(0), TD(0), TM(tm),
+      : MachineFunctionPass(ID), II(nullptr), TD(nullptr), TM(tm),
         MCE(mce), PICBaseOffset(0), Is64BitMode(false),
         IsPIC(TM.getRelocationModel() == Reloc::PIC_) {}
 
@@ -450,7 +451,7 @@ void Emitter<CodeEmitter>::emitMemModRMByte(const MachineInstr &MI,
                                             intptr_t PCAdj) {
   const MachineOperand &Op3 = MI.getOperand(Op+3);
   int DispVal = 0;
-  const MachineOperand *DispForReloc = 0;
+  const MachineOperand *DispForReloc = nullptr;
 
   // Figure out what sort of displacement we have to handle here.
   if (Op3.isGlobal()) {
@@ -1475,7 +1476,7 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI,
 #ifndef NDEBUG
     dbgs() << "Cannot encode all operands of: " << MI << "\n";
 #endif
-    llvm_unreachable(0);
+    llvm_unreachable(nullptr);
   }
 
   MCE.processDebugLoc(MI.getDebugLoc(), false);
diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp
index 1aab1ea..56bcfa3 100644
--- a/lib/Target/X86/X86FastISel.cpp
+++ b/lib/Target/X86/X86FastISel.cpp
@@ -183,7 +183,7 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, const X86AddressMode &AM,
                                   unsigned &ResultReg) {
   // Get opcode and regclass of the output for the given load instruction.
   unsigned Opc = 0;
-  const TargetRegisterClass *RC = NULL;
+  const TargetRegisterClass *RC = nullptr;
   switch (VT.getSimpleVT().SimpleTy) {
   default: return false;
   case MVT::i1:
@@ -363,7 +363,7 @@ bool X86FastISel::handleConstantAddresses(const Value *V, X86AddressMode &AM) {
     // it works...).
     if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
       if (const GlobalVariable *GVar =
-              dyn_cast_or_null<GlobalVariable>(GA->getAliasedGlobal()))
+              dyn_cast_or_null<GlobalVariable>(GA->getAliasee()))
         if (GVar->isThreadLocal())
           return false;
 
@@ -406,7 +406,7 @@ bool X86FastISel::handleConstantAddresses(const Value *V, X86AddressMode &AM) {
       } else {
         // Issue load from stub.
         unsigned Opc = 0;
-        const TargetRegisterClass *RC = NULL;
+        const TargetRegisterClass *RC = nullptr;
         X86AddressMode StubAM;
         StubAM.Base.Reg = AM.Base.Reg;
         StubAM.GV = GV;
@@ -441,7 +441,7 @@ bool X86FastISel::handleConstantAddresses(const Value *V, X86AddressMode &AM) {
       // Now construct the final address. Note that the Disp, Scale,
       // and Index values may already be set here.
       AM.Base.Reg = LoadReg;
-      AM.GV = 0;
+      AM.GV = nullptr;
       return true;
     }
   }
@@ -467,7 +467,7 @@ bool X86FastISel::handleConstantAddresses(const Value *V, X86AddressMode &AM) {
 bool X86FastISel::X86SelectAddress(const Value *V, X86AddressMode &AM) {
   SmallVector<const Value *, 32> GEPs;
 redo_gep:
-  const User *U = NULL;
+  const User *U = nullptr;
   unsigned Opcode = Instruction::UserOp1;
   if (const Instruction *I = dyn_cast<Instruction>(V)) {
     // Don't walk into other basic blocks; it's possible we haven't
@@ -626,7 +626,7 @@ redo_gep:
 /// X86SelectCallAddress - Attempt to fill in an address from the given value.
 ///
 bool X86FastISel::X86SelectCallAddress(const Value *V, X86AddressMode &AM) {
-  const User *U = NULL;
+  const User *U = nullptr;
   unsigned Opcode = Instruction::UserOp1;
   const Instruction *I = dyn_cast<Instruction>(V);
   // Record if the value is defined in the same basic block.
@@ -1247,7 +1247,7 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) {
 
 bool X86FastISel::X86SelectShift(const Instruction *I) {
   unsigned CReg = 0, OpReg = 0;
-  const TargetRegisterClass *RC = NULL;
+  const TargetRegisterClass *RC = nullptr;
   if (I->getType()->isIntegerTy(8)) {
     CReg = X86::CL;
     RC = &X86::GR8RegClass;
@@ -1487,7 +1487,7 @@ bool X86FastISel::X86SelectSelect(const Instruction *I) {
   if (!Subtarget->hasCMov()) return false;
 
   unsigned Opc = 0;
-  const TargetRegisterClass *RC = NULL;
+  const TargetRegisterClass *RC = nullptr;
   if (VT == MVT::i16) {
     Opc = X86::CMOVE16rr;
     RC = &X86::GR16RegClass;
@@ -1821,10 +1821,10 @@ bool X86FastISel::FastLowerArguments() {
     }
   }
 
-  static const uint16_t GPR32ArgRegs[] = {
+  static const MCPhysReg GPR32ArgRegs[] = {
     X86::EDI, X86::ESI, X86::EDX, X86::ECX, X86::R8D, X86::R9D
   };
-  static const uint16_t GPR64ArgRegs[] = {
+  static const MCPhysReg GPR64ArgRegs[] = {
     X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8 , X86::R9
   };
 
@@ -1865,7 +1865,7 @@ bool X86FastISel::X86SelectCall(const Instruction *I) {
   if (cast<CallInst>(I)->isTailCall())
     return false;
 
-  return DoSelectCall(I, 0);
+  return DoSelectCall(I, nullptr);
 }
 
 static unsigned computeBytesPoppedByCallee(const X86Subtarget &Subtarget,
@@ -1936,8 +1936,8 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) {
   if (!X86SelectCallAddress(Callee, CalleeAM))
     return false;
   unsigned CalleeOp = 0;
-  const GlobalValue *GV = 0;
-  if (CalleeAM.GV != 0) {
+  const GlobalValue *GV = nullptr;
+  if (CalleeAM.GV != nullptr) {
     GV = CalleeAM.GV;
   } else if (CalleeAM.Base.Reg != 0) {
     CalleeOp = CalleeAM.Base.Reg;
@@ -2163,7 +2163,7 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) {
 
   if (Subtarget->is64Bit() && isVarArg && !isWin64) {
     // Count the number of XMM registers allocated.
-    static const uint16_t XMMArgRegs[] = {
+    static const MCPhysReg XMMArgRegs[] = {
       X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
       X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
     };
@@ -2387,7 +2387,7 @@ unsigned X86FastISel::TargetMaterializeConstant(const Constant *C) {
 
   // Get opcode and regclass of the output for the given load instruction.
   unsigned Opc = 0;
-  const TargetRegisterClass *RC = NULL;
+  const TargetRegisterClass *RC = nullptr;
   switch (VT.SimpleTy) {
   default: return 0;
   case MVT::i8:
@@ -2437,7 +2437,7 @@ unsigned X86FastISel::TargetMaterializeConstant(const Constant *C) {
       // If the expression is just a basereg, then we're done, otherwise we need
       // to emit an LEA.
       if (AM.BaseType == X86AddressMode::RegBase &&
-          AM.IndexReg == 0 && AM.Disp == 0 && AM.GV == 0)
+          AM.IndexReg == 0 && AM.Disp == 0 && AM.GV == nullptr)
         return AM.Base.Reg;
 
       Opc = TLI.getPointerTy() == MVT::i32 ? X86::LEA32r : X86::LEA64r;
@@ -2510,7 +2510,7 @@ unsigned X86FastISel::TargetMaterializeFloatZero(const ConstantFP *CF) {
 
   // Get opcode and regclass for the given zero.
   unsigned Opc = 0;
-  const TargetRegisterClass *RC = NULL;
+  const TargetRegisterClass *RC = nullptr;
   switch (VT.SimpleTy) {
   default: return 0;
   case MVT::f32:
@@ -2558,7 +2558,7 @@ bool X86FastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
 
   MachineInstr *Result =
     XII.foldMemoryOperandImpl(*FuncInfo.MF, MI, OpNo, AddrOps, Size, Alignment);
-  if (Result == 0) return false;
+  if (!Result) return false;
 
   FuncInfo.MBB->insert(FuncInfo.InsertPt, Result);
   MI->eraseFromParent();
diff --git a/lib/Target/X86/X86FixupLEAs.cpp b/lib/Target/X86/X86FixupLEAs.cpp
index c2c234b..6c5b86f 100644
--- a/lib/Target/X86/X86FixupLEAs.cpp
+++ b/lib/Target/X86/X86FixupLEAs.cpp
@@ -13,7 +13,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "x86-fixup-LEAs"
 #include "X86.h"
 #include "X86InstrInfo.h"
 #include "X86Subtarget.h"
@@ -28,6 +27,8 @@
 #include "llvm/Target/TargetInstrInfo.h"
 using namespace llvm;
 
+#define DEBUG_TYPE "x86-fixup-LEAs"
+
 STATISTIC(NumLEAs, "Number of LEA instructions created");
 
 namespace {
@@ -56,6 +57,11 @@ namespace {
     void processInstruction(MachineBasicBlock::iterator& I,
                             MachineFunction::iterator MFI);
 
+    /// \brief Given a LEA instruction which is unprofitable
+    /// on Silvermont try to replace it with an equivalent ADD instruction
+    void processInstructionForSLM(MachineBasicBlock::iterator& I,
+                                  MachineFunction::iterator MFI);
+
     /// \brief Determine if an instruction references a machine register
     /// and, if so, whether it reads or writes the register.
     RegUsageState usesRegister(MachineOperand& p,
@@ -85,7 +91,7 @@ namespace {
   private:
     MachineFunction *MF;
     const TargetMachine *TM;
-    const TargetInstrInfo *TII; // Machine instruction info.
+    const X86InstrInfo *TII; // Machine instruction info.
 
   };
   char FixupLEAPass::ID = 0;
@@ -97,7 +103,7 @@ FixupLEAPass::postRAConvertToLEA(MachineFunction::iterator &MFI,
   MachineInstr* MI = MBBI;
   MachineInstr* NewMI;
   switch (MI->getOpcode()) {
-  case X86::MOV32rr: 
+  case X86::MOV32rr:
   case X86::MOV64rr: {
     const MachineOperand& Src = MI->getOperand(1);
     const MachineOperand& Dest = MI->getOperand(0);
@@ -123,7 +129,7 @@ FixupLEAPass::postRAConvertToLEA(MachineFunction::iterator &MFI,
     if (!MI->getOperand(2).isImm()) {
       // convertToThreeAddress will call getImm()
       // which requires isImm() to be true
-      return 0;
+      return nullptr;
     }
     break;
   case X86::ADD16rr:
@@ -132,10 +138,10 @@ FixupLEAPass::postRAConvertToLEA(MachineFunction::iterator &MFI,
       // if src1 != src2, then convertToThreeAddress will
       // need to create a Virtual register, which we cannot do
       // after register allocation.
-      return 0;
+      return nullptr;
     }
   }
-  return TII->convertToThreeAddress(MFI, MBBI, 0);
+  return TII->convertToThreeAddress(MFI, MBBI, nullptr);
 }
 
 FunctionPass *llvm::createX86FixupLEAs() {
@@ -143,9 +149,12 @@ FunctionPass *llvm::createX86FixupLEAs() {
 }
 
 bool FixupLEAPass::runOnMachineFunction(MachineFunction &Func) {
-  MF = &Func;
-  TM = &MF->getTarget();
-  TII = TM->getInstrInfo();
+  TM = &Func.getTarget();
+  const X86Subtarget &ST = TM->getSubtarget<X86Subtarget>();
+  if (!ST.LEAusesAG() && !ST.slowLEA())
+    return false;
+
+  TII = static_cast<const X86InstrInfo*>(TM->getInstrInfo());
 
   DEBUG(dbgs() << "Start X86FixupLEAs\n";);
   // Process all basic blocks.
@@ -211,7 +220,7 @@ MachineBasicBlock::iterator FixupLEAPass::searchBackwards(MachineOperand& p,
     InstrDistance += TII->getInstrLatency(TM->getInstrItineraryData(), CurInst);
     Found = getPreviousInstr(CurInst, MFI);
   }
-  return 0;
+  return nullptr;
 }
 
 void FixupLEAPass::processInstruction(MachineBasicBlock::iterator& I,
@@ -242,9 +251,9 @@ void FixupLEAPass::seekLEAFixup(MachineOperand& p,
     MachineInstr* NewMI = postRAConvertToLEA(MFI, MBI);
     if (NewMI) {
       ++NumLEAs;
-      DEBUG(dbgs() << "Candidate to replace:"; MBI->dump(););
+      DEBUG(dbgs() << "FixLEA: Candidate to replace:"; MBI->dump(););
       // now to replace with an equivalent LEA...
-      DEBUG(dbgs() << "Replaced by: "; NewMI->dump(););
+      DEBUG(dbgs() << "FixLEA: Replaced by: "; NewMI->dump(););
       MFI->erase(MBI);
       MachineBasicBlock::iterator J =
                              static_cast<MachineBasicBlock::iterator> (NewMI);
@@ -253,10 +262,80 @@ void FixupLEAPass::seekLEAFixup(MachineOperand& p,
   }
 }
 
+void FixupLEAPass::processInstructionForSLM(MachineBasicBlock::iterator &I,
+                                            MachineFunction::iterator MFI) {
+  MachineInstr *MI = I;
+  const int opcode = MI->getOpcode();
+  if (opcode != X86::LEA16r && opcode != X86::LEA32r && opcode != X86::LEA64r &&
+      opcode != X86::LEA64_32r)
+    return;
+  if (MI->getOperand(5).getReg() != 0 || !MI->getOperand(4).isImm() ||
+      !TII->isSafeToClobberEFLAGS(*MFI, I))
+    return;
+  const unsigned DstR = MI->getOperand(0).getReg();
+  const unsigned SrcR1 = MI->getOperand(1).getReg();
+  const unsigned SrcR2 = MI->getOperand(3).getReg();
+  if ((SrcR1 == 0 || SrcR1 != DstR) && (SrcR2 == 0 || SrcR2 != DstR))
+    return;
+  if (MI->getOperand(2).getImm() > 1)
+    return;
+  int addrr_opcode, addri_opcode;
+  switch (opcode) {
+  case X86::LEA16r:
+    addrr_opcode = X86::ADD16rr;
+    addri_opcode = X86::ADD16ri;
+    break;
+  case X86::LEA32r:
+    addrr_opcode = X86::ADD32rr;
+    addri_opcode = X86::ADD32ri;
+    break;
+  case X86::LEA64_32r:
+  case X86::LEA64r:
+    addrr_opcode = X86::ADD64rr;
+    addri_opcode = X86::ADD64ri32;
+    break;
+  default:
+    assert(false && "Unexpected LEA instruction");
+  }
+  DEBUG(dbgs() << "FixLEA: Candidate to replace:"; I->dump(););
+  DEBUG(dbgs() << "FixLEA: Replaced by: ";);
+  MachineInstr *NewMI = 0;
+  const MachineOperand &Dst = MI->getOperand(0);
+  // Make ADD instruction for two registers writing to LEA's destination
+  if (SrcR1 != 0 && SrcR2 != 0) {
+    const MachineOperand &Src1 = MI->getOperand(SrcR1 == DstR ? 1 : 3);
+    const MachineOperand &Src2 = MI->getOperand(SrcR1 == DstR ? 3 : 1);
+    NewMI = BuildMI(*MF, MI->getDebugLoc(), TII->get(addrr_opcode))
+                .addOperand(Dst)
+                .addOperand(Src1)
+                .addOperand(Src2);
+    MFI->insert(I, NewMI);
+    DEBUG(NewMI->dump(););
+  }
+  // Make ADD instruction for immediate
+  if (MI->getOperand(4).getImm() != 0) {
+    const MachineOperand &SrcR = MI->getOperand(SrcR1 == DstR ? 1 : 3);
+    NewMI = BuildMI(*MF, MI->getDebugLoc(), TII->get(addri_opcode))
+                .addOperand(Dst)
+                .addOperand(SrcR)
+                .addImm(MI->getOperand(4).getImm());
+    MFI->insert(I, NewMI);
+    DEBUG(NewMI->dump(););
+  }
+  if (NewMI) {
+    MFI->erase(I);
+    I = static_cast<MachineBasicBlock::iterator>(NewMI);
+  }
+}
+
 bool FixupLEAPass::processBasicBlock(MachineFunction &MF,
                                      MachineFunction::iterator MFI) {
 
-  for (MachineBasicBlock::iterator I = MFI->begin(); I != MFI->end(); ++I)
-    processInstruction(I, MFI);
+  for (MachineBasicBlock::iterator I = MFI->begin(); I != MFI->end(); ++I) {
+    if (TM->getSubtarget<X86Subtarget>().isSLM())
+      processInstructionForSLM(I, MFI);
+    else
+      processInstruction(I, MFI);
+  }
   return false;
 }
diff --git a/lib/Target/X86/X86FloatingPoint.cpp b/lib/Target/X86/X86FloatingPoint.cpp
index 7955ade..c8a3ab3 100644
--- a/lib/Target/X86/X86FloatingPoint.cpp
+++ b/lib/Target/X86/X86FloatingPoint.cpp
@@ -23,7 +23,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "x86-codegen"
 #include "X86.h"
 #include "X86InstrInfo.h"
 #include "llvm/ADT/DepthFirstIterator.h"
@@ -45,6 +44,8 @@
 #include <algorithm>
 using namespace llvm;
 
+#define DEBUG_TYPE "x86-codegen"
+
 STATISTIC(NumFXCH, "Number of fxch instructions inserted");
 STATISTIC(NumFP  , "Number of floating point instructions");
 
@@ -430,7 +431,7 @@ bool FPS::processBasicBlock(MachineFunction &MF, MachineBasicBlock &BB) {
     if (FPInstClass == X86II::NotFP)
       continue;  // Efficiently ignore non-fp insts!
 
-    MachineInstr *PrevMI = 0;
+    MachineInstr *PrevMI = nullptr;
     if (I != BB.begin())
       PrevMI = std::prev(I);
 
diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp
index f0ad4d1..4c1374f 100644
--- a/lib/Target/X86/X86FrameLowering.cpp
+++ b/lib/Target/X86/X86FrameLowering.cpp
@@ -182,7 +182,7 @@ void emitSPUpdate(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
       }
     }
 
-    MachineInstr *MI = NULL;
+    MachineInstr *MI = nullptr;
 
     if (UseLEA) {
       MI =  addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr),
@@ -204,7 +204,7 @@ void emitSPUpdate(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
 /// mergeSPUpdatesUp - Merge two stack-manipulating instructions upper iterator.
 static
 void mergeSPUpdatesUp(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
-                      unsigned StackPtr, uint64_t *NumBytes = NULL) {
+                      unsigned StackPtr, uint64_t *NumBytes = nullptr) {
   if (MBBI == MBB.begin()) return;
 
   MachineBasicBlock::iterator PI = std::prev(MBBI);
@@ -225,11 +225,12 @@ void mergeSPUpdatesUp(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
   }
 }
 
-/// mergeSPUpdatesDown - Merge two stack-manipulating instructions lower iterator.
+/// mergeSPUpdatesDown - Merge two stack-manipulating instructions lower
+/// iterator.
 static
 void mergeSPUpdatesDown(MachineBasicBlock &MBB,
                         MachineBasicBlock::iterator &MBBI,
-                        unsigned StackPtr, uint64_t *NumBytes = NULL) {
+                        unsigned StackPtr, uint64_t *NumBytes = nullptr) {
   // FIXME:  THIS ISN'T RUN!!!
   return;
 
@@ -257,19 +258,19 @@ void mergeSPUpdatesDown(MachineBasicBlock &MBB,
 }
 
 /// mergeSPUpdates - Checks the instruction before/after the passed
-/// instruction. If it is an ADD/SUB/LEA instruction it is deleted argument and the
-/// stack adjustment is returned as a positive value for ADD/LEA and a negative for
-/// SUB.
+/// instruction. If it is an ADD/SUB/LEA instruction it is deleted argument and
+/// the stack adjustment is returned as a positive value for ADD/LEA and a
+/// negative for SUB.
 static int mergeSPUpdates(MachineBasicBlock &MBB,
-                           MachineBasicBlock::iterator &MBBI,
-                           unsigned StackPtr,
-                           bool doMergeWithPrevious) {
+                          MachineBasicBlock::iterator &MBBI, unsigned StackPtr,
+                          bool doMergeWithPrevious) {
   if ((doMergeWithPrevious && MBBI == MBB.begin()) ||
       (!doMergeWithPrevious && MBBI == MBB.end()))
     return 0;
 
   MachineBasicBlock::iterator PI = doMergeWithPrevious ? std::prev(MBBI) : MBBI;
-  MachineBasicBlock::iterator NI = doMergeWithPrevious ? 0 : std::next(MBBI);
+  MachineBasicBlock::iterator NI = doMergeWithPrevious ? nullptr
+                                                       : std::next(MBBI);
   unsigned Opc = PI->getOpcode();
   int Offset = 0;
 
@@ -366,8 +367,10 @@ void X86FrameLowering::emitCalleeSavedFrameMoves(
 
     unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true);
     unsigned CFIIndex =
-        MMI.addFrameInst(MCCFIInstruction::createOffset(0, DwarfReg, Offset));
-    BuildMI(MBB, MBBI, DL, TII.get(X86::CFI_INSTRUCTION)).addCFIIndex(CFIIndex);
+        MMI.addFrameInst(MCCFIInstruction::createOffset(nullptr, DwarfReg,
+                                                        Offset));
+    BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
+        .addCFIIndex(CFIIndex);
   }
 }
 
@@ -446,7 +449,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
       !MFI->adjustsStack() &&                           // No calls.
       !IsWin64 &&                                       // Win64 has no Red Zone
       !usesTheStack(MF) &&                              // Don't push and pop.
-      !MF.getTarget().Options.EnableSegmentedStacks) {  // Regular stack
+      !MF.shouldSplitStack()) {                         // Regular stack
     uint64_t MinSize = X86FI->getCalleeSavedFrameSize();
     if (HasFP) MinSize += SlotSize;
     StackSize = std::max(MinSize, StackSize > 128 ? StackSize - 128 : 0);
@@ -511,15 +514,16 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
       // Define the current CFA rule to use the provided offset.
       assert(StackSize);
       unsigned CFIIndex = MMI.addFrameInst(
-          MCCFIInstruction::createDefCfaOffset(0, 2 * stackGrowth));
-      BuildMI(MBB, MBBI, DL, TII.get(X86::CFI_INSTRUCTION))
+          MCCFIInstruction::createDefCfaOffset(nullptr, 2 * stackGrowth));
+      BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
           .addCFIIndex(CFIIndex);
 
       // Change the rule for the FramePtr to be an "offset" rule.
       unsigned DwarfFramePtr = RegInfo->getDwarfRegNum(FramePtr, true);
       CFIIndex = MMI.addFrameInst(
-          MCCFIInstruction::createOffset(0, DwarfFramePtr, 2 * stackGrowth));
-      BuildMI(MBB, MBBI, DL, TII.get(X86::CFI_INSTRUCTION))
+          MCCFIInstruction::createOffset(nullptr,
+                                         DwarfFramePtr, 2 * stackGrowth));
+      BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
           .addCFIIndex(CFIIndex);
     }
 
@@ -534,8 +538,8 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
       // Define the current CFA to use the EBP/RBP register.
       unsigned DwarfFramePtr = RegInfo->getDwarfRegNum(FramePtr, true);
       unsigned CFIIndex = MMI.addFrameInst(
-          MCCFIInstruction::createDefCfaRegister(0, DwarfFramePtr));
-      BuildMI(MBB, MBBI, DL, TII.get(X86::CFI_INSTRUCTION))
+          MCCFIInstruction::createDefCfaRegister(nullptr, DwarfFramePtr));
+      BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
           .addCFIIndex(CFIIndex);
     }
 
@@ -564,7 +568,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
       assert(StackSize);
       unsigned CFIIndex = MMI.addFrameInst(
           MCCFIInstruction::createDefCfaOffset(nullptr, StackOffset));
-      BuildMI(MBB, MBBI, DL, TII.get(X86::CFI_INSTRUCTION))
+      BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
           .addCFIIndex(CFIIndex);
       StackOffset += stackGrowth;
     }
@@ -698,9 +702,10 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const {
       // Define the current CFA rule to use the provided offset.
       assert(StackSize);
       unsigned CFIIndex = MMI.addFrameInst(
-          MCCFIInstruction::createDefCfaOffset(0, -StackSize + stackGrowth));
+          MCCFIInstruction::createDefCfaOffset(nullptr,
+                                               -StackSize + stackGrowth));
 
-      BuildMI(MBB, MBBI, DL, TII.get(X86::CFI_INSTRUCTION))
+      BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
           .addCFIIndex(CFIIndex);
     }
 
@@ -905,7 +910,8 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
   }
 }
 
-int X86FrameLowering::getFrameIndexOffset(const MachineFunction &MF, int FI) const {
+int X86FrameLowering::getFrameIndexOffset(const MachineFunction &MF,
+                                          int FI) const {
   const X86RegisterInfo *RegInfo =
     static_cast<const X86RegisterInfo*>(MF.getTarget().getRegisterInfo());
   const MachineFrameInfo *MFI = MF.getFrameInfo();
@@ -1170,6 +1176,15 @@ X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const {
       !STI.isTargetWin32() && !STI.isTargetWin64() && !STI.isTargetFreeBSD())
     report_fatal_error("Segmented stacks not supported on this platform.");
 
+  // Eventually StackSize will be calculated by a link-time pass; which will
+  // also decide whether checking code needs to be injected into this particular
+  // prologue.
+  StackSize = MFI->getStackSize();
+
+  // Do not generate a prologue for functions with a stack of size zero
+  if (StackSize == 0)
+    return;
+
   MachineBasicBlock *allocMBB = MF.CreateMachineBasicBlock();
   MachineBasicBlock *checkMBB = MF.CreateMachineBasicBlock();
   X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
@@ -1194,11 +1209,6 @@ X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const {
   MF.push_front(allocMBB);
   MF.push_front(checkMBB);
 
-  // Eventually StackSize will be calculated by a link-time pass; which will
-  // also decide whether checking code needs to be injected into this particular
-  // prologue.
-  StackSize = MFI->getStackSize();
-
   // When the frame size is less than 256 we just compare the stack
   // boundary directly to the value of the stack pointer, per gcc.
   bool CompareStackPointer = StackSize < kSplitStackAvailable;
@@ -1256,22 +1266,23 @@ X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const {
         .addReg(0).addImm(0).addReg(0).addImm(TlsOffset).addReg(TlsReg);
     } else if (STI.isTargetDarwin()) {
 
-      // TlsOffset doesn't fit into a mod r/m byte so we need an extra register
+      // TlsOffset doesn't fit into a mod r/m byte so we need an extra register.
       unsigned ScratchReg2;
       bool SaveScratch2;
       if (CompareStackPointer) {
-        // The primary scratch register is available for holding the TLS offset
+        // The primary scratch register is available for holding the TLS offset.
         ScratchReg2 = GetScratchRegister(Is64Bit, MF, true);
         SaveScratch2 = false;
       } else {
         // Need to use a second register to hold the TLS offset
         ScratchReg2 = GetScratchRegister(Is64Bit, MF, false);
 
-        // Unfortunately, with fastcc the second scratch register may hold an arg
+        // Unfortunately, with fastcc the second scratch register may hold an
+        // argument.
         SaveScratch2 = MF.getRegInfo().isLiveIn(ScratchReg2);
       }
 
-      // If Scratch2 is live-in then it needs to be saved
+      // If Scratch2 is live-in then it needs to be saved.
       assert((!MF.getRegInfo().isLiveIn(ScratchReg2) || SaveScratch2) &&
              "Scratch register is live-in and not saved");
 
@@ -1348,14 +1359,14 @@ X86FrameLowering::adjustForSegmentedStacks(MachineFunction &MF) const {
 /// http://publications.uu.se/uu/fulltext/nbn_se_uu_diva-2688.pdf)
 ///
 /// CheckStack:
-///	  temp0 = sp - MaxStack
-///	  if( temp0 < SP_LIMIT(P) ) goto IncStack else goto OldStart
+///       temp0 = sp - MaxStack
+///       if( temp0 < SP_LIMIT(P) ) goto IncStack else goto OldStart
 /// OldStart:
-///	  ...
+///       ...
 /// IncStack:
-///	  call inc_stack   # doubles the stack space
-///	  temp0 = sp - MaxStack
-///	  if( temp0 < SP_LIMIT(P) ) goto IncStack else goto OldStart
+///       call inc_stack   # doubles the stack space
+///       temp0 = sp - MaxStack
+///       if( temp0 < SP_LIMIT(P) ) goto IncStack else goto OldStart
 void X86FrameLowering::adjustForHiPEPrologue(MachineFunction &MF) const {
   const X86InstrInfo &TII = *TM.getInstrInfo();
   MachineFrameInfo *MFI = MF.getFrameInfo();
@@ -1514,7 +1525,7 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
     unsigned StackAlign = TM.getFrameLowering()->getStackAlignment();
     Amount = (Amount + StackAlign - 1) / StackAlign * StackAlign;
 
-    MachineInstr *New = 0;
+    MachineInstr *New = nullptr;
     if (Opcode == TII.getCallFrameSetupOpcode()) {
       New = BuildMI(MF, DL, TII.get(getSUBriOpcode(IsLP64, Amount)),
                     StackPtr)
diff --git a/lib/Target/X86/X86FrameLowering.h b/lib/Target/X86/X86FrameLowering.h
index f0db8cb..208bb8b 100644
--- a/lib/Target/X86/X86FrameLowering.h
+++ b/lib/Target/X86/X86FrameLowering.h
@@ -47,7 +47,7 @@ public:
   void adjustForHiPEPrologue(MachineFunction &MF) const override;
 
   void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
-                                        RegScavenger *RS = NULL) const override;
+                                     RegScavenger *RS = nullptr) const override;
 
   bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
                                  MachineBasicBlock::iterator MI,
diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
index 3e45adb..74386d3 100644
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -12,7 +12,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "x86-isel"
 #include "X86.h"
 #include "X86InstrBuilder.h"
 #include "X86MachineFunctionInfo.h"
@@ -36,6 +35,8 @@
 #include "llvm/Target/TargetOptions.h"
 using namespace llvm;
 
+#define DEBUG_TYPE "x86-isel"
+
 STATISTIC(NumLoadMoved, "Number of loads moved below TokenFactor");
 
 //===----------------------------------------------------------------------===//
@@ -70,17 +71,18 @@ namespace {
 
     X86ISelAddressMode()
       : BaseType(RegBase), Base_FrameIndex(0), Scale(1), IndexReg(), Disp(0),
-        Segment(), GV(0), CP(0), BlockAddr(0), ES(0), JT(-1), Align(0),
-        SymbolFlags(X86II::MO_NO_FLAG) {
+        Segment(), GV(nullptr), CP(nullptr), BlockAddr(nullptr), ES(nullptr),
+        JT(-1), Align(0), SymbolFlags(X86II::MO_NO_FLAG) {
     }
 
     bool hasSymbolicDisplacement() const {
-      return GV != 0 || CP != 0 || ES != 0 || JT != -1 || BlockAddr != 0;
+      return GV != nullptr || CP != nullptr || ES != nullptr ||
+             JT != -1 || BlockAddr != nullptr;
     }
 
     bool hasBaseOrIndexReg() const {
       return BaseType == FrameIndexBase ||
-             IndexReg.getNode() != 0 || Base_Reg.getNode() != 0;
+             IndexReg.getNode() != nullptr || Base_Reg.getNode() != nullptr;
     }
 
     /// isRIPRelative - Return true if this addressing mode is already RIP
@@ -102,14 +104,14 @@ namespace {
     void dump() {
       dbgs() << "X86ISelAddressMode " << this << '\n';
       dbgs() << "Base_Reg ";
-      if (Base_Reg.getNode() != 0)
+      if (Base_Reg.getNode())
         Base_Reg.getNode()->dump();
       else
         dbgs() << "nul";
       dbgs() << " Base.FrameIndex " << Base_FrameIndex << '\n'
              << " Scale" << Scale << '\n'
              << "IndexReg ";
-      if (IndexReg.getNode() != 0)
+      if (IndexReg.getNode())
         IndexReg.getNode()->dump();
       else
         dbgs() << "nul";
@@ -160,6 +162,13 @@ namespace {
       return "X86 DAG->DAG Instruction Selection";
     }
 
+    bool runOnMachineFunction(MachineFunction &MF) override {
+      // Reset the subtarget each time through.
+      Subtarget = &TM.getSubtarget<X86Subtarget>();
+      SelectionDAGISel::runOnMachineFunction(MF);
+      return true;
+    }
+
     void EmitFunctionEntryCode() override;
 
     bool IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const override;
@@ -374,14 +383,13 @@ static void MoveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load,
       else
         Ops.push_back(Chain.getOperand(i));
     SDValue NewChain =
-      CurDAG->getNode(ISD::TokenFactor, SDLoc(Load),
-                      MVT::Other, &Ops[0], Ops.size());
+      CurDAG->getNode(ISD::TokenFactor, SDLoc(Load), MVT::Other, Ops);
     Ops.clear();
     Ops.push_back(NewChain);
   }
   for (unsigned i = 1, e = OrigChain.getNumOperands(); i != e; ++i)
     Ops.push_back(OrigChain.getOperand(i));
-  CurDAG->UpdateNodeOperands(OrigChain.getNode(), &Ops[0], Ops.size());
+  CurDAG->UpdateNodeOperands(OrigChain.getNode(), Ops);
   CurDAG->UpdateNodeOperands(Load.getNode(), Call.getOperand(0),
                              Load.getOperand(1), Load.getOperand(2));
 
@@ -390,7 +398,7 @@ static void MoveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load,
   Ops.push_back(SDValue(Load.getNode(), 1));
   for (unsigned i = 1, e = NumOps; i != e; ++i)
     Ops.push_back(Call.getOperand(i));
-  CurDAG->UpdateNodeOperands(Call.getNode(), &Ops[0], NumOps);
+  CurDAG->UpdateNodeOperands(Call.getNode(), Ops);
 }
 
 /// isCalleeLoad - Return true if call address is a load and it can be
@@ -612,7 +620,7 @@ bool X86DAGToDAGISel::MatchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM){
   // gs:0 (or fs:0 on X86-64) contains its own address.
   // For more information see http://people.redhat.com/drepper/tls.pdf
   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Address))
-    if (C->getSExtValue() == 0 && AM.Segment.getNode() == 0 &&
+    if (C->getSExtValue() == 0 && AM.Segment.getNode() == nullptr &&
         Subtarget->isTargetLinux())
       switch (N->getPointerInfo().getAddrSpace()) {
       case 256:
@@ -733,7 +741,7 @@ bool X86DAGToDAGISel::MatchAddress(SDValue N, X86ISelAddressMode &AM) {
   // a smaller encoding and avoids a scaled-index.
   if (AM.Scale == 2 &&
       AM.BaseType == X86ISelAddressMode::RegBase &&
-      AM.Base_Reg.getNode() == 0) {
+      AM.Base_Reg.getNode() == nullptr) {
     AM.Base_Reg = AM.IndexReg;
     AM.Scale = 1;
   }
@@ -745,8 +753,8 @@ bool X86DAGToDAGISel::MatchAddress(SDValue N, X86ISelAddressMode &AM) {
       Subtarget->is64Bit() &&
       AM.Scale == 1 &&
       AM.BaseType == X86ISelAddressMode::RegBase &&
-      AM.Base_Reg.getNode() == 0 &&
-      AM.IndexReg.getNode() == 0 &&
+      AM.Base_Reg.getNode() == nullptr &&
+      AM.IndexReg.getNode() == nullptr &&
       AM.SymbolFlags == X86II::MO_NO_FLAG &&
       AM.hasSymbolicDisplacement())
     AM.Base_Reg = CurDAG->getRegister(X86::RIP, MVT::i64);
@@ -926,7 +934,7 @@ static bool FoldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N,
   APInt MaskedHighBits =
     APInt::getHighBitsSet(X.getSimpleValueType().getSizeInBits(), MaskLZ);
   APInt KnownZero, KnownOne;
-  DAG.ComputeMaskedBits(X, KnownZero, KnownOne);
+  DAG.computeKnownBits(X, KnownZero, KnownOne);
   if (MaskedHighBits != KnownZero) return true;
 
   // We've identified a pattern that can be transformed into a single shift
@@ -1009,7 +1017,7 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
 
   case ISD::FrameIndex:
     if (AM.BaseType == X86ISelAddressMode::RegBase &&
-        AM.Base_Reg.getNode() == 0 &&
+        AM.Base_Reg.getNode() == nullptr &&
         (!Subtarget->is64Bit() || isDispSafeForFrameIndex(AM.Disp))) {
       AM.BaseType = X86ISelAddressMode::FrameIndexBase;
       AM.Base_FrameIndex = cast<FrameIndexSDNode>(N)->getIndex();
@@ -1018,7 +1026,7 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
     break;
 
   case ISD::SHL:
-    if (AM.IndexReg.getNode() != 0 || AM.Scale != 1)
+    if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1)
       break;
 
     if (ConstantSDNode
@@ -1052,7 +1060,7 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
 
   case ISD::SRL: {
     // Scale must not be used already.
-    if (AM.IndexReg.getNode() != 0 || AM.Scale != 1) break;
+    if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;
 
     SDValue And = N.getOperand(0);
     if (And.getOpcode() != ISD::AND) break;
@@ -1086,8 +1094,8 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
   case X86ISD::MUL_IMM:
     // X*[3,5,9] -> X+X*[2,4,8]
     if (AM.BaseType == X86ISelAddressMode::RegBase &&
-        AM.Base_Reg.getNode() == 0 &&
-        AM.IndexReg.getNode() == 0) {
+        AM.Base_Reg.getNode() == nullptr &&
+        AM.IndexReg.getNode() == nullptr) {
       if (ConstantSDNode
             *CN = dyn_cast<ConstantSDNode>(N.getNode()->getOperand(1)))
         if (CN->getZExtValue() == 3 || CN->getZExtValue() == 5 ||
@@ -1237,7 +1245,7 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
     // with a constant to enable use of the scaled offset field.
 
     // Scale must not be used already.
-    if (AM.IndexReg.getNode() != 0 || AM.Scale != 1) break;
+    if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1) break;
 
     SDValue Shift = N.getOperand(0);
     if (Shift.getOpcode() != ISD::SRL && Shift.getOpcode() != ISD::SHL) break;
@@ -1276,7 +1284,7 @@ bool X86DAGToDAGISel::MatchAddressBase(SDValue N, X86ISelAddressMode &AM) {
   // Is the base register already occupied?
   if (AM.BaseType != X86ISelAddressMode::RegBase || AM.Base_Reg.getNode()) {
     // If so, check to see if the scale index register is set.
-    if (AM.IndexReg.getNode() == 0) {
+    if (!AM.IndexReg.getNode()) {
       AM.IndexReg = N;
       AM.Scale = 1;
       return false;
@@ -1567,7 +1575,7 @@ SDNode *X86DAGToDAGISel::SelectAtomic64(SDNode *Node, unsigned Opc) {
 
   SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
   if (!SelectAddr(Node, In1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4))
-    return NULL;
+    return nullptr;
   MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
   MemOp[0] = cast<MemSDNode>(Node)->getMemOperand();
   const SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, In2L, In2H, Chain};
@@ -1756,7 +1764,7 @@ static SDValue getAtomicLoadArithTargetConstant(SelectionDAG *CurDAG,
 
 SDNode *X86DAGToDAGISel::SelectAtomicLoadArith(SDNode *Node, MVT NVT) {
   if (Node->hasAnyUseOfValue(0))
-    return 0;
+    return nullptr;
 
   SDLoc dl(Node);
 
@@ -1768,13 +1776,13 @@ SDNode *X86DAGToDAGISel::SelectAtomicLoadArith(SDNode *Node, MVT NVT) {
   SDValue Val = Node->getOperand(2);
   SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
   if (!SelectAddr(Node, Ptr, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4))
-    return 0;
+    return nullptr;
 
   // Which index into the table.
   enum AtomicOpc Op;
   switch (Node->getOpcode()) {
     default:
-      return 0;
+      return nullptr;
     case ISD::ATOMIC_LOAD_OR:
       Op = OR;
       break;
@@ -1795,7 +1803,7 @@ SDNode *X86DAGToDAGISel::SelectAtomicLoadArith(SDNode *Node, MVT NVT) {
 
   unsigned Opc = 0;
   switch (NVT.SimpleTy) {
-    default: return 0;
+    default: return nullptr;
     case MVT::i8:
       if (isCN)
         Opc = AtomicOpcTbl[Op][ConstantI8];
@@ -1847,7 +1855,7 @@ SDNode *X86DAGToDAGISel::SelectAtomicLoadArith(SDNode *Node, MVT NVT) {
   }
   cast<MachineSDNode>(Ret)->setMemRefs(MemOp, MemOp + 1);
   SDValue RetVals[] = { Undef, Ret };
-  return CurDAG->getMergeValues(RetVals, 2, dl).getNode();
+  return CurDAG->getMergeValues(RetVals, dl).getNode();
 }
 
 /// HasNoSignedComparisonUses - Test whether the given X86ISD::CMP node has
@@ -1990,7 +1998,7 @@ static bool isLoadIncOrDecStore(StoreSDNode *StoreNode, unsigned Opc,
       // Make a new TokenFactor with all the other input chains except
       // for the load.
       InputChain = CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain),
-                                   MVT::Other, &ChainOps[0], ChainOps.size());
+                                   MVT::Other, ChainOps);
   }
   if (!ChainCheck)
     return false;
@@ -2027,7 +2035,7 @@ SDNode *X86DAGToDAGISel::SelectGather(SDNode *Node, unsigned Opc) {
   SDValue VMask = Node->getOperand(5);
   ConstantSDNode *Scale = dyn_cast<ConstantSDNode>(Node->getOperand(6));
   if (!Scale)
-    return 0;
+    return nullptr;
 
   SDVTList VTs = CurDAG->getVTList(VSrc.getValueType(), VSrc.getValueType(),
                                    MVT::Other);
@@ -2058,7 +2066,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
   if (Node->isMachineOpcode()) {
     DEBUG(dbgs() << "== ";  Node->dump(CurDAG); dbgs() << '\n');
     Node->setNodeId(-1);
-    return NULL;   // Already selected.
+    return nullptr;   // Already selected.
   }
 
   switch (Opcode) {
@@ -2108,7 +2116,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
       SDNode *RetVal = SelectGather(Node, Opc);
       if (RetVal)
         // We already called ReplaceUses inside SelectGather.
-        return NULL;
+        return nullptr;
       break;
     }
     }
@@ -2259,7 +2267,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
     ReplaceUses(SDValue(Node, 0), SDValue(CNode, 0));
     ReplaceUses(SDValue(Node, 1), SDValue(CNode, 1));
     ReplaceUses(SDValue(Node, 2), SDValue(CNode, 2));
-    return NULL;
+    return nullptr;
   }
 
   case ISD::SMUL_LOHI:
@@ -2386,7 +2394,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
     }
     // Copy the low half of the result, if it is needed.
     if (!SDValue(Node, 0).use_empty()) {
-      if (ResLo.getNode() == 0) {
+      if (!ResLo.getNode()) {
         assert(LoReg && "Register for low half is not defined!");
         ResLo = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, LoReg, NVT,
                                        InFlag);
@@ -2397,7 +2405,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
     }
     // Copy the high half of the result, if it is needed.
     if (!SDValue(Node, 1).use_empty()) {
-      if (ResHi.getNode() == 0) {
+      if (!ResHi.getNode()) {
         assert(HiReg && "Register for high half is not defined!");
         ResHi = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, HiReg, NVT,
                                        InFlag);
@@ -2407,7 +2415,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
       DEBUG(dbgs() << "=> "; ResHi.getNode()->dump(CurDAG); dbgs() << '\n');
     }
 
-    return NULL;
+    return nullptr;
   }
 
   case ISD::SDIVREM:
@@ -2575,7 +2583,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
       ReplaceUses(SDValue(Node, 1), Result);
       DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); dbgs() << '\n');
     }
-    return NULL;
+    return nullptr;
   }
 
   case X86ISD::CMP:
@@ -2632,7 +2640,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
         // one, do not call ReplaceAllUsesWith.
         ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)),
                     SDValue(NewNode, 0));
-        return NULL;
+        return nullptr;
       }
 
       // For example, "testl %eax, $2048" to "testb %ah, $8".
@@ -2669,7 +2677,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
         // one, do not call ReplaceAllUsesWith.
         ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)),
                     SDValue(NewNode, 0));
-        return NULL;
+        return nullptr;
       }
 
       // For example, "testl %eax, $32776" to "testw %ax, $32776".
@@ -2691,7 +2699,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
         // one, do not call ReplaceAllUsesWith.
         ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)),
                     SDValue(NewNode, 0));
-        return NULL;
+        return nullptr;
       }
 
       // For example, "testq %rax, $268468232" to "testl %eax, $268468232".
@@ -2713,7 +2721,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
         // one, do not call ReplaceAllUsesWith.
         ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)),
                     SDValue(NewNode, 0));
-        return NULL;
+        return nullptr;
       }
     }
     break;
@@ -2740,7 +2748,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
     SDValue StoredVal = StoreNode->getOperand(1);
     unsigned Opc = StoredVal->getOpcode();
 
-    LoadSDNode *LoadNode = 0;
+    LoadSDNode *LoadNode = nullptr;
     SDValue InputChain;
     if (!isLoadIncOrDecStore(StoreNode, Opc, StoredVal, CurDAG,
                              LoadNode, InputChain))
@@ -2772,7 +2780,7 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
   SDNode *ResNode = SelectCode(Node);
 
   DEBUG(dbgs() << "=> ";
-        if (ResNode == NULL || ResNode == Node)
+        if (ResNode == nullptr || ResNode == Node)
           Node->dump(CurDAG);
         else
           ResNode->dump(CurDAG);
@@ -2790,7 +2798,7 @@ SelectInlineAsmMemoryOperand(const SDValue &Op, char ConstraintCode,
   case 'v':   // not offsetable    ??
   default: return true;
   case 'm':   // memory
-    if (!SelectAddr(0, Op, Op0, Op1, Op2, Op3, Op4))
+    if (!SelectAddr(nullptr, Op, Op0, Op1, Op2, Op3, Op4))
       return true;
     break;
   }
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 2a35061..cbaf44e 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -12,7 +12,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "x86-isel"
 #include "X86ISelLowering.h"
 #include "Utils/X86ShuffleDecode.h"
 #include "X86CallingConv.h"
@@ -23,6 +22,7 @@
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringSwitch.h"
 #include "llvm/ADT/VariadicFunction.h"
 #include "llvm/CodeGen/IntrinsicLowering.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
@@ -52,6 +52,8 @@
 #include <cctype>
 using namespace llvm;
 
+#define DEBUG_TYPE "x86-isel"
+
 STATISTIC(NumTailCalls, "Number of tail calls");
 
 // Forward declarations.
@@ -84,7 +86,8 @@ static SDValue ExtractSubVector(SDValue Vec, unsigned IdxVal,
   // If the input is a buildvector just emit a smaller one.
   if (Vec.getOpcode() == ISD::BUILD_VECTOR)
     return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT,
-                       Vec->op_begin()+NormalizedIdxVal, ElemsPerChunk);
+                       makeArrayRef(Vec->op_begin()+NormalizedIdxVal,
+                                    ElemsPerChunk));
 
   SDValue VecIdx = DAG.getIntPtrConstant(NormalizedIdxVal);
   SDValue Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec,
@@ -265,10 +268,10 @@ void X86TargetLowering::resetOperationActions() {
 
     // The _ftol2 runtime function has an unusual calling conv, which
     // is modeled by a special pseudo-instruction.
-    setLibcallName(RTLIB::FPTOUINT_F64_I64, 0);
-    setLibcallName(RTLIB::FPTOUINT_F32_I64, 0);
-    setLibcallName(RTLIB::FPTOUINT_F64_I32, 0);
-    setLibcallName(RTLIB::FPTOUINT_F32_I32, 0);
+    setLibcallName(RTLIB::FPTOUINT_F64_I64, nullptr);
+    setLibcallName(RTLIB::FPTOUINT_F32_I64, nullptr);
+    setLibcallName(RTLIB::FPTOUINT_F64_I32, nullptr);
+    setLibcallName(RTLIB::FPTOUINT_F32_I32, nullptr);
   }
 
   if (Subtarget->isTargetDarwin()) {
@@ -635,15 +638,8 @@ void X86TargetLowering::resetOperationActions() {
   setOperationAction(ISD::STACKSAVE,          MVT::Other, Expand);
   setOperationAction(ISD::STACKRESTORE,       MVT::Other, Expand);
 
-  if (Subtarget->isOSWindows() && !Subtarget->isTargetMacho())
-    setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
-                       MVT::i64 : MVT::i32, Custom);
-  else if (TM.Options.EnableSegmentedStacks)
-    setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
-                       MVT::i64 : MVT::i32, Custom);
-  else
-    setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
-                       MVT::i64 : MVT::i32, Expand);
+  setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ?
+                     MVT::i64 : MVT::i32, Custom);
 
   if (!TM.Options.UseSoftFloat && X86ScalarSSEf64) {
     // f32 and f64 use SSE.
@@ -832,7 +828,9 @@ void X86TargetLowering::resetOperationActions() {
     setOperationAction(ISD::FRINT, VT, Expand);
     setOperationAction(ISD::FNEARBYINT, VT, Expand);
     setOperationAction(ISD::SMUL_LOHI, VT, Expand);
+    setOperationAction(ISD::MULHS, VT, Expand);
     setOperationAction(ISD::UMUL_LOHI, VT, Expand);
+    setOperationAction(ISD::MULHU, VT, Expand);
     setOperationAction(ISD::SDIVREM, VT, Expand);
     setOperationAction(ISD::UDIVREM, VT, Expand);
     setOperationAction(ISD::FPOW, VT, Expand);
@@ -944,6 +942,10 @@ void X86TargetLowering::resetOperationActions() {
     setOperationAction(ISD::ADD,                MVT::v2i64, Legal);
     setOperationAction(ISD::MUL,                MVT::v4i32, Custom);
     setOperationAction(ISD::MUL,                MVT::v2i64, Custom);
+    setOperationAction(ISD::UMUL_LOHI,          MVT::v4i32, Custom);
+    setOperationAction(ISD::SMUL_LOHI,          MVT::v4i32, Custom);
+    setOperationAction(ISD::MULHU,              MVT::v8i16, Legal);
+    setOperationAction(ISD::MULHS,              MVT::v8i16, Legal);
     setOperationAction(ISD::SUB,                MVT::v16i8, Legal);
     setOperationAction(ISD::SUB,                MVT::v8i16, Legal);
     setOperationAction(ISD::SUB,                MVT::v4i32, Legal);
@@ -1036,6 +1038,10 @@ void X86TargetLowering::resetOperationActions() {
     setOperationAction(ISD::FP_ROUND,           MVT::v2f32, Custom);
 
     setLoadExtAction(ISD::EXTLOAD,              MVT::v2f32, Legal);
+
+    setOperationAction(ISD::BITCAST,            MVT::v2i32, Custom);
+    setOperationAction(ISD::BITCAST,            MVT::v4i16, Custom);
+    setOperationAction(ISD::BITCAST,            MVT::v8i8,  Custom);
   }
 
   if (!TM.Options.UseSoftFloat && Subtarget->hasSSE41()) {
@@ -1064,11 +1070,14 @@ void X86TargetLowering::resetOperationActions() {
     // FIXME: Do we need to handle scalar-to-vector here?
     setOperationAction(ISD::MUL,                MVT::v4i32, Legal);
 
-    setOperationAction(ISD::VSELECT,            MVT::v2f64, Legal);
-    setOperationAction(ISD::VSELECT,            MVT::v2i64, Legal);
+    setOperationAction(ISD::VSELECT,            MVT::v2f64, Custom);
+    setOperationAction(ISD::VSELECT,            MVT::v2i64, Custom);
+    setOperationAction(ISD::VSELECT,            MVT::v4i32, Custom);
+    setOperationAction(ISD::VSELECT,            MVT::v4f32, Custom);
+    setOperationAction(ISD::VSELECT,            MVT::v8i16, Custom);
+    // There is no BLENDI for byte vectors. We don't need to custom lower
+    // some vselects for now.
     setOperationAction(ISD::VSELECT,            MVT::v16i8, Legal);
-    setOperationAction(ISD::VSELECT,            MVT::v4i32, Legal);
-    setOperationAction(ISD::VSELECT,            MVT::v4f32, Legal);
 
     // i8 and i16 vectors are custom , because the source register and source
     // source memory operand types are not the same width.  f32 vectors are
@@ -1111,9 +1120,6 @@ void X86TargetLowering::resetOperationActions() {
     setOperationAction(ISD::SHL,               MVT::v4i32, Custom);
 
     setOperationAction(ISD::SRA,               MVT::v4i32, Custom);
-
-    setOperationAction(ISD::SDIV,              MVT::v8i16, Custom);
-    setOperationAction(ISD::SDIV,              MVT::v4i32, Custom);
   }
 
   if (!TM.Options.UseSoftFloat && Subtarget->hasFp256()) {
@@ -1178,8 +1184,6 @@ void X86TargetLowering::resetOperationActions() {
     setOperationAction(ISD::SRA,               MVT::v16i16, Custom);
     setOperationAction(ISD::SRA,               MVT::v32i8, Custom);
 
-    setOperationAction(ISD::SDIV,              MVT::v16i16, Custom);
-
     setOperationAction(ISD::SETCC,             MVT::v32i8, Custom);
     setOperationAction(ISD::SETCC,             MVT::v16i16, Custom);
     setOperationAction(ISD::SETCC,             MVT::v8i32, Custom);
@@ -1189,10 +1193,10 @@ void X86TargetLowering::resetOperationActions() {
     setOperationAction(ISD::SELECT,            MVT::v4i64, Custom);
     setOperationAction(ISD::SELECT,            MVT::v8f32, Custom);
 
-    setOperationAction(ISD::VSELECT,           MVT::v4f64, Legal);
-    setOperationAction(ISD::VSELECT,           MVT::v4i64, Legal);
-    setOperationAction(ISD::VSELECT,           MVT::v8i32, Legal);
-    setOperationAction(ISD::VSELECT,           MVT::v8f32, Legal);
+    setOperationAction(ISD::VSELECT,           MVT::v4f64, Custom);
+    setOperationAction(ISD::VSELECT,           MVT::v4i64, Custom);
+    setOperationAction(ISD::VSELECT,           MVT::v8i32, Custom);
+    setOperationAction(ISD::VSELECT,           MVT::v8f32, Custom);
 
     setOperationAction(ISD::SIGN_EXTEND,       MVT::v4i64, Custom);
     setOperationAction(ISD::SIGN_EXTEND,       MVT::v8i32, Custom);
@@ -1232,9 +1236,13 @@ void X86TargetLowering::resetOperationActions() {
       setOperationAction(ISD::MUL,             MVT::v16i16, Legal);
       // Don't lower v32i8 because there is no 128-bit byte mul
 
-      setOperationAction(ISD::VSELECT,         MVT::v32i8, Legal);
+      setOperationAction(ISD::UMUL_LOHI,       MVT::v8i32, Custom);
+      setOperationAction(ISD::SMUL_LOHI,       MVT::v8i32, Custom);
+      setOperationAction(ISD::MULHU,           MVT::v16i16, Legal);
+      setOperationAction(ISD::MULHS,           MVT::v16i16, Legal);
 
-      setOperationAction(ISD::SDIV,            MVT::v8i32, Custom);
+      setOperationAction(ISD::VSELECT,         MVT::v16i16, Custom);
+      setOperationAction(ISD::VSELECT,         MVT::v32i8, Legal);
     } else {
       setOperationAction(ISD::ADD,             MVT::v4i64, Custom);
       setOperationAction(ISD::ADD,             MVT::v8i32, Custom);
@@ -1343,7 +1351,6 @@ void X86TargetLowering::resetOperationActions() {
     setOperationAction(ISD::FNEG,               MVT::v8f64, Custom);
     setOperationAction(ISD::FMA,                MVT::v8f64, Legal);
     setOperationAction(ISD::FMA,                MVT::v16f32, Legal);
-    setOperationAction(ISD::SDIV,               MVT::v16i32, Custom);
 
     setOperationAction(ISD::FP_TO_SINT,         MVT::i32, Legal);
     setOperationAction(ISD::FP_TO_UINT,         MVT::i32, Legal);
@@ -1358,9 +1365,11 @@ void X86TargetLowering::resetOperationActions() {
     setOperationAction(ISD::FP_TO_SINT,         MVT::v16i32, Legal);
     setOperationAction(ISD::FP_TO_UINT,         MVT::v16i32, Legal);
     setOperationAction(ISD::FP_TO_UINT,         MVT::v8i32, Legal);
+    setOperationAction(ISD::FP_TO_UINT,         MVT::v4i32, Legal);
     setOperationAction(ISD::SINT_TO_FP,         MVT::v16i32, Legal);
     setOperationAction(ISD::UINT_TO_FP,         MVT::v16i32, Legal);
     setOperationAction(ISD::UINT_TO_FP,         MVT::v8i32, Legal);
+    setOperationAction(ISD::UINT_TO_FP,         MVT::v4i32, Legal);
     setOperationAction(ISD::FP_ROUND,           MVT::v8f32, Legal);
     setOperationAction(ISD::FP_EXTEND,          MVT::v8f32, Legal);
 
@@ -1392,6 +1401,8 @@ void X86TargetLowering::resetOperationActions() {
 
     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i1,  Custom);
     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i1, Custom);
+    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v16i1, Custom);
+    setOperationAction(ISD::INSERT_VECTOR_ELT,  MVT::v8i1, Custom);
     setOperationAction(ISD::BUILD_VECTOR,       MVT::v8i1, Custom);
     setOperationAction(ISD::BUILD_VECTOR,       MVT::v16i1, Custom);
     setOperationAction(ISD::SELECT,             MVT::v8f64, Custom);
@@ -1474,6 +1485,8 @@ void X86TargetLowering::resetOperationActions() {
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
+  if (!Subtarget->is64Bit())
+    setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
 
   // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
   // handle type legalization for these operations here.
@@ -1498,9 +1511,9 @@ void X86TargetLowering::resetOperationActions() {
 
   if (!Subtarget->is64Bit()) {
     // These libcalls are not available in 32-bit.
-    setLibcallName(RTLIB::SHL_I128, 0);
-    setLibcallName(RTLIB::SRL_I128, 0);
-    setLibcallName(RTLIB::SRA_I128, 0);
+    setLibcallName(RTLIB::SHL_I128, nullptr);
+    setLibcallName(RTLIB::SRL_I128, nullptr);
+    setLibcallName(RTLIB::SRA_I128, nullptr);
   }
 
   // Combine sin / cos into one node or libcall if possible.
@@ -1516,6 +1529,15 @@ void X86TargetLowering::resetOperationActions() {
     }
   }
 
+  if (Subtarget->isTargetWin64()) {
+    setOperationAction(ISD::SDIV, MVT::i128, Custom);
+    setOperationAction(ISD::UDIV, MVT::i128, Custom);
+    setOperationAction(ISD::SREM, MVT::i128, Custom);
+    setOperationAction(ISD::UREM, MVT::i128, Custom);
+    setOperationAction(ISD::SDIVREM, MVT::i128, Custom);
+    setOperationAction(ISD::UDIVREM, MVT::i128, Custom);
+  }
+
   // We have target-specific dag combine patterns for the following nodes:
   setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
@@ -1540,6 +1562,7 @@ void X86TargetLowering::resetOperationActions() {
   setTargetDAGCombine(ISD::TRUNCATE);
   setTargetDAGCombine(ISD::SINT_TO_FP);
   setTargetDAGCombine(ISD::SETCC);
+  setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
   if (Subtarget->is64Bit())
     setTargetDAGCombine(ISD::MUL);
   setTargetDAGCombine(ISD::XOR);
@@ -1738,7 +1761,7 @@ getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
 // FIXME: Why this routine is here? Move to RegInfo!
 std::pair<const TargetRegisterClass*, uint8_t>
 X86TargetLowering::findRepresentativeClass(MVT VT) const{
-  const TargetRegisterClass *RRC = 0;
+  const TargetRegisterClass *RRC = nullptr;
   uint8_t Cost = 1;
   switch (VT.SimpleTy) {
   default:
@@ -1806,8 +1829,8 @@ X86TargetLowering::CanLowerReturn(CallingConv::ID CallConv,
   return CCInfo.CheckReturn(Outs, RetCC_X86);
 }
 
-const uint16_t *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
-  static const uint16_t ScratchRegs[] = { X86::R11, 0 };
+const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
+  static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
   return ScratchRegs;
 }
 
@@ -1930,8 +1953,7 @@ X86TargetLowering::LowerReturn(SDValue Chain,
   if (Flag.getNode())
     RetOps.push_back(Flag);
 
-  return DAG.getNode(X86ISD::RET_FLAG, dl,
-                     MVT::Other, &RetOps[0], RetOps.size());
+  return DAG.getNode(X86ISD::RET_FLAG, dl, MVT::Other, RetOps);
 }
 
 bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
@@ -2285,22 +2307,25 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
     InVals.push_back(ArgValue);
   }
 
-  // The x86-64 ABIs require that for returning structs by value we copy
-  // the sret argument into %rax/%eax (depending on ABI) for the return.
-  // Win32 requires us to put the sret argument to %eax as well.
-  // Save the argument into a virtual register so that we can access it
-  // from the return points.
-  if (MF.getFunction()->hasStructRetAttr() &&
-      (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC())) {
-    X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
-    unsigned Reg = FuncInfo->getSRetReturnReg();
-    if (!Reg) {
-      MVT PtrTy = getPointerTy();
-      Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
-      FuncInfo->setSRetReturnReg(Reg);
+  if (Subtarget->is64Bit() || Subtarget->isTargetKnownWindowsMSVC()) {
+    for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+      // The x86-64 ABIs require that for returning structs by value we copy
+      // the sret argument into %rax/%eax (depending on ABI) for the return.
+      // Win32 requires us to put the sret argument to %eax as well.
+      // Save the argument into a virtual register so that we can access it
+      // from the return points.
+      if (Ins[i].Flags.isSRet()) {
+        unsigned Reg = FuncInfo->getSRetReturnReg();
+        if (!Reg) {
+          MVT PtrTy = getPointerTy();
+          Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
+          FuncInfo->setSRetReturnReg(Reg);
+        }
+        SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[i]);
+        Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
+        break;
+      }
     }
-    SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[0]);
-    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
   }
 
   unsigned StackSize = CCInfo.getNextStackOffset();
@@ -2320,17 +2345,17 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
       unsigned TotalNumIntRegs = 0, TotalNumXMMRegs = 0;
 
       // FIXME: We should really autogenerate these arrays
-      static const uint16_t GPR64ArgRegsWin64[] = {
+      static const MCPhysReg GPR64ArgRegsWin64[] = {
         X86::RCX, X86::RDX, X86::R8,  X86::R9
       };
-      static const uint16_t GPR64ArgRegs64Bit[] = {
+      static const MCPhysReg GPR64ArgRegs64Bit[] = {
         X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
       };
-      static const uint16_t XMMArgRegs64Bit[] = {
+      static const MCPhysReg XMMArgRegs64Bit[] = {
         X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
         X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
       };
-      const uint16_t *GPR64ArgRegs;
+      const MCPhysReg *GPR64ArgRegs;
       unsigned NumXMMRegs = 0;
 
       if (IsWin64) {
@@ -2424,13 +2449,11 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain,
           SaveXMMOps.push_back(Val);
         }
         MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
-                                     MVT::Other,
-                                     &SaveXMMOps[0], SaveXMMOps.size()));
+                                     MVT::Other, SaveXMMOps));
       }
 
       if (!MemOps.empty())
-        Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
-                            &MemOps[0], MemOps.size());
+        Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
     }
   }
 
@@ -2497,10 +2520,10 @@ X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG,
 
 /// EmitTailCallStoreRetAddr - Emit a store of the return address if tail call
 /// optimization is performed and it is required (FPDiff!=0).
-static SDValue
-EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF,
-                         SDValue Chain, SDValue RetAddrFrIdx, EVT PtrVT,
-                         unsigned SlotSize, int FPDiff, SDLoc dl) {
+static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
+                                        SDValue Chain, SDValue RetAddrFrIdx,
+                                        EVT PtrVT, unsigned SlotSize,
+                                        int FPDiff, SDLoc dl) {
   // Store the return address to the appropriate stack slot.
   if (!FPDiff) return Chain;
   // Calculate the new stack slot for the return address.
@@ -2537,7 +2560,13 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   if (MF.getTarget().Options.DisableTailCalls)
     isTailCall = false;
 
-  if (isTailCall) {
+  bool IsMustTail = CLI.CS && CLI.CS->isMustTailCall();
+  if (IsMustTail) {
+    // Force this to be a tail call.  The verifier rules are enough to ensure
+    // that we can lower this successfully without moving the return address
+    // around.
+    isTailCall = true;
+  } else if (isTailCall) {
     // Check if it's really possible to do a tail call.
     isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
                     isVarArg, SR != NotStructReturn,
@@ -2578,7 +2607,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
     NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
 
   int FPDiff = 0;
-  if (isTailCall && !IsSibcall) {
+  if (isTailCall && !IsSibcall && !IsMustTail) {
     // Lower arguments at fp - stackoffset + fpdiff.
     X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
     unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
@@ -2683,7 +2712,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
       }
     } else if (!IsSibcall && (!isTailCall || isByVal)) {
       assert(VA.isMemLoc());
-      if (StackPtr.getNode() == 0)
+      if (!StackPtr.getNode())
         StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
                                       getPointerTy());
       MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
@@ -2692,8 +2721,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   }
 
   if (!MemOpChains.empty())
-    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
-                        &MemOpChains[0], MemOpChains.size());
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
 
   if (Subtarget->isPICStyleGOT()) {
     // ELF / PIC requires GOT in the EBX register before function calls via PLT
@@ -2730,7 +2758,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
     // registers used and is in the range 0 - 8 inclusive.
 
     // Count the number of XMM registers allocated.
-    static const uint16_t XMMArgRegs[] = {
+    static const MCPhysReg XMMArgRegs[] = {
       X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
       X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
     };
@@ -2742,8 +2770,10 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                                         DAG.getConstant(NumXMMRegs, MVT::i8)));
   }
 
-  // For tail calls lower the arguments to the 'real' stack slot.
-  if (isTailCall) {
+  // For tail calls lower the arguments to the 'real' stack slots.  Sibcalls
+  // don't need this because the eligibility check rejects calls that require
+  // shuffling arguments passed in memory.
+  if (!IsSibcall && isTailCall) {
     // Force all the incoming stack arguments to be loaded from the stack
     // before any new outgoing arguments are stored to the stack, because the
     // outgoing stack slots may alias the incoming argument stack slots, and
@@ -2755,45 +2785,45 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
     SmallVector<SDValue, 8> MemOpChains2;
     SDValue FIN;
     int FI = 0;
-    if (getTargetMachine().Options.GuaranteedTailCallOpt) {
-      for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
-        CCValAssign &VA = ArgLocs[i];
-        if (VA.isRegLoc())
-          continue;
-        assert(VA.isMemLoc());
-        SDValue Arg = OutVals[i];
-        ISD::ArgFlagsTy Flags = Outs[i].Flags;
-        // Create frame index.
-        int32_t Offset = VA.getLocMemOffset()+FPDiff;
-        uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
-        FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
-        FIN = DAG.getFrameIndex(FI, getPointerTy());
-
-        if (Flags.isByVal()) {
-          // Copy relative to framepointer.
-          SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
-          if (StackPtr.getNode() == 0)
-            StackPtr = DAG.getCopyFromReg(Chain, dl,
-                                          RegInfo->getStackRegister(),
-                                          getPointerTy());
-          Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);
-
-          MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
-                                                           ArgChain,
-                                                           Flags, DAG, dl));
-        } else {
-          // Store relative to framepointer.
-          MemOpChains2.push_back(
-            DAG.getStore(ArgChain, dl, Arg, FIN,
-                         MachinePointerInfo::getFixedStack(FI),
-                         false, false, 0));
-        }
+    for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+      CCValAssign &VA = ArgLocs[i];
+      if (VA.isRegLoc())
+        continue;
+      assert(VA.isMemLoc());
+      SDValue Arg = OutVals[i];
+      ISD::ArgFlagsTy Flags = Outs[i].Flags;
+      // Skip inalloca arguments.  They don't require any work.
+      if (Flags.isInAlloca())
+        continue;
+      // Create frame index.
+      int32_t Offset = VA.getLocMemOffset()+FPDiff;
+      uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
+      FI = MF.getFrameInfo()->CreateFixedObject(OpSize, Offset, true);
+      FIN = DAG.getFrameIndex(FI, getPointerTy());
+
+      if (Flags.isByVal()) {
+        // Copy relative to framepointer.
+        SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset());
+        if (!StackPtr.getNode())
+          StackPtr = DAG.getCopyFromReg(Chain, dl,
+                                        RegInfo->getStackRegister(),
+                                        getPointerTy());
+        Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source);
+
+        MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
+                                                         ArgChain,
+                                                         Flags, DAG, dl));
+      } else {
+        // Store relative to framepointer.
+        MemOpChains2.push_back(
+          DAG.getStore(ArgChain, dl, Arg, FIN,
+                       MachinePointerInfo::getFixedStack(FI),
+                       false, false, 0));
       }
     }
 
     if (!MemOpChains2.empty())
-      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
-                          &MemOpChains2[0], MemOpChains2.size());
+      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
 
     // Store the return address to the appropriate stack slot.
     Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
@@ -2930,10 +2960,10 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
     // This isn't right, although it's probably harmless on x86; liveouts
     // should be computed from returns not tail calls.  Consider a void
     // function making a tail call to a function returning int.
-    return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, &Ops[0], Ops.size());
+    return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
   }
 
-  Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, &Ops[0], Ops.size());
+  Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
   InFlag = Chain.getValue(1);
 
   // Create the CALLSEQ_END node.
@@ -3927,6 +3957,29 @@ static bool isMOVLHPSMask(ArrayRef<int> Mask, MVT VT) {
   return true;
 }
 
+/// isINSERTPSMask - Return true if the specified VECTOR_SHUFFLE operand
+/// specifies a shuffle of elements that is suitable for input to INSERTPS.
+/// i. e: If all but one element come from the same vector.
+static bool isINSERTPSMask(ArrayRef<int> Mask, MVT VT) {
+  // TODO: Deal with AVX's VINSERTPS
+  if (!VT.is128BitVector() || (VT != MVT::v4f32 && VT != MVT::v4i32))
+    return false;
+
+  unsigned CorrectPosV1 = 0;
+  unsigned CorrectPosV2 = 0;
+  for (int i = 0, e = (int)VT.getVectorNumElements(); i != e; ++i)
+    if (Mask[i] == i)
+      ++CorrectPosV1;
+    else if (Mask[i] == i + 4)
+      ++CorrectPosV2;
+
+  if (CorrectPosV1 == 3 || CorrectPosV2 == 3)
+    // We have 3 elements from one vector, and one from another.
+    return true;
+
+  return false;
+}
+
 //
 // Some special combinations that can be optimized.
 //
@@ -4146,6 +4199,29 @@ static bool isUNPCKH_v_undef_Mask(ArrayRef<int> Mask, MVT VT, bool HasInt256) {
   return true;
 }
 
+// Match for INSERTI64x4 INSERTF64x4 instructions (src0[0], src1[0]) or
+// (src1[0], src0[1]), manipulation with 256-bit sub-vectors
+static bool isINSERT64x4Mask(ArrayRef<int> Mask, MVT VT, unsigned int *Imm) {
+  if (!VT.is512BitVector())
+    return false;
+
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned HalfSize = NumElts/2;
+  if (isSequentialOrUndefInRange(Mask, 0, HalfSize, 0)) {
+    if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, NumElts)) {
+      *Imm = 1;
+      return true;
+    }
+  }
+  if (isSequentialOrUndefInRange(Mask, 0, HalfSize, NumElts)) {
+    if (isSequentialOrUndefInRange(Mask, HalfSize, HalfSize, HalfSize)) {
+      *Imm = 0;
+      return true;
+    }
+  }
+  return false;
+}
+
 /// isMOVLMask - Return true if the specified VECTOR_SHUFFLE operand
 /// specifies a shuffle of elements that is suitable for input to MOVSS,
 /// MOVSD, and MOVD, i.e. setting the lowest element.
@@ -4624,11 +4700,17 @@ unsigned X86::getInsertVINSERT256Immediate(SDNode *N) {
   return getInsertVINSERTImmediate(N, 256);
 }
 
+/// isZero - Returns true if Elt is a constant integer zero
+static bool isZero(SDValue V) {
+  ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
+  return C && C->isNullValue();
+}
+
 /// isZeroNode - Returns true if Elt is a constant zero or a floating point
 /// constant +0.0.
 bool X86::isZeroNode(SDValue Elt) {
-  if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Elt))
-    return CN->isNullValue();
+  if (isZero(Elt))
+    return true;
   if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Elt))
     return CFP->getValueAPF().isPosZero();
   return false;
@@ -4677,7 +4759,7 @@ static bool ShouldXformToMOVHLPS(ArrayRef<int> Mask, MVT VT) {
 /// isScalarLoadToVector - Returns true if the node is a scalar load that
 /// is promoted to a vector. It also returns the LoadSDNode by reference if
 /// required.
-static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = NULL) {
+static bool isScalarLoadToVector(SDNode *N, LoadSDNode **LD = nullptr) {
   if (N->getOpcode() != ISD::SCALAR_TO_VECTOR)
     return false;
   N = N->getOperand(0).getNode();
@@ -4803,28 +4885,24 @@ static SDValue getZeroVector(EVT VT, const X86Subtarget *Subtarget,
     if (Subtarget->hasInt256()) { // AVX2
       SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
-      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops,
-                        array_lengthof(Ops));
+      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops);
     } else {
       // 256-bit logic and arithmetic instructions in AVX are all
       // floating-point, no support for integer ops. Emit fp zeroed vectors.
       SDValue Cst = DAG.getTargetConstantFP(+0.0, MVT::f32);
       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
-      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops,
-                        array_lengthof(Ops));
+      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8f32, Ops);
     }
   } else if (VT.is512BitVector()) { // AVX-512
       SDValue Cst = DAG.getTargetConstant(0, MVT::i32);
       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst,
                         Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
-      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops, 16);
+      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i32, Ops);
   } else if (VT.getScalarType() == MVT::i1) {
     assert(VT.getVectorNumElements() <= 16 && "Unexpected vector type");
     SDValue Cst = DAG.getTargetConstant(0, MVT::i1);
-    SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst,
-                      Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
-    return DAG.getNode(ISD::BUILD_VECTOR, dl, VT,
-                       Ops, VT.getVectorNumElements());
+    SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
+    return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
   } else
     llvm_unreachable("Unexpected vector type");
 
@@ -4844,8 +4922,7 @@ static SDValue getOnesVector(MVT VT, bool HasInt256, SelectionDAG &DAG,
   if (VT.is256BitVector()) {
     if (HasInt256) { // AVX2
       SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
-      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops,
-                        array_lengthof(Ops));
+      Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops);
     } else { // AVX
       Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst);
       Vec = Concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl);
@@ -5307,7 +5384,7 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
     return SDValue();
 
   SDLoc dl(Op);
-  SDValue V(0, 0);
+  SDValue V;
   bool First = true;
   for (unsigned i = 0; i < 16; ++i) {
     bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
@@ -5320,7 +5397,7 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
     }
 
     if ((i & 1) != 0) {
-      SDValue ThisElt(0, 0), LastElt(0, 0);
+      SDValue ThisElt, LastElt;
       bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0;
       if (LastIsNonZero) {
         LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl,
@@ -5355,7 +5432,7 @@ static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
     return SDValue();
 
   SDLoc dl(Op);
-  SDValue V(0, 0);
+  SDValue V;
   bool First = true;
   for (unsigned i = 0; i < 8; ++i) {
     bool isNonZero = (NonZeros & (1 << i)) != 0;
@@ -5376,6 +5453,79 @@ static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
   return V;
 }
 
+/// LowerBuildVectorv4x32 - Custom lower build_vector of v4i32 or v4f32.
+static SDValue LowerBuildVectorv4x32(SDValue Op, unsigned NumElems,
+                                     unsigned NonZeros, unsigned NumNonZero,
+                                     unsigned NumZero, SelectionDAG &DAG,
+                                     const X86Subtarget *Subtarget,
+                                     const TargetLowering &TLI) {
+  // We know there's at least one non-zero element
+  unsigned FirstNonZeroIdx = 0;
+  SDValue FirstNonZero = Op->getOperand(FirstNonZeroIdx);
+  while (FirstNonZero.getOpcode() == ISD::UNDEF ||
+         X86::isZeroNode(FirstNonZero)) {
+    ++FirstNonZeroIdx;
+    FirstNonZero = Op->getOperand(FirstNonZeroIdx);
+  }
+
+  if (FirstNonZero.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+      !isa<ConstantSDNode>(FirstNonZero.getOperand(1)))
+    return SDValue();
+
+  SDValue V = FirstNonZero.getOperand(0);
+  MVT VVT = V.getSimpleValueType();
+  if (!Subtarget->hasSSE41() || (VVT != MVT::v4f32 && VVT != MVT::v4i32))
+    return SDValue();
+
+  unsigned FirstNonZeroDst =
+      cast<ConstantSDNode>(FirstNonZero.getOperand(1))->getZExtValue();
+  unsigned CorrectIdx = FirstNonZeroDst == FirstNonZeroIdx;
+  unsigned IncorrectIdx = CorrectIdx ? -1U : FirstNonZeroIdx;
+  unsigned IncorrectDst = CorrectIdx ? -1U : FirstNonZeroDst;
+
+  for (unsigned Idx = FirstNonZeroIdx + 1; Idx < NumElems; ++Idx) {
+    SDValue Elem = Op.getOperand(Idx);
+    if (Elem.getOpcode() == ISD::UNDEF || X86::isZeroNode(Elem))
+      continue;
+
+    // TODO: What else can be here? Deal with it.
+    if (Elem.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+      return SDValue();
+
+    // TODO: Some optimizations are still possible here
+    // ex: Getting one element from a vector, and the rest from another.
+    if (Elem.getOperand(0) != V)
+      return SDValue();
+
+    unsigned Dst = cast<ConstantSDNode>(Elem.getOperand(1))->getZExtValue();
+    if (Dst == Idx)
+      ++CorrectIdx;
+    else if (IncorrectIdx == -1U) {
+      IncorrectIdx = Idx;
+      IncorrectDst = Dst;
+    } else
+      // There was already one element with an incorrect index.
+      // We can't optimize this case to an insertps.
+      return SDValue();
+  }
+
+  if (NumNonZero == CorrectIdx || NumNonZero == CorrectIdx + 1) {
+    SDLoc dl(Op);
+    EVT VT = Op.getSimpleValueType();
+    unsigned ElementMoveMask = 0;
+    if (IncorrectIdx == -1U)
+      ElementMoveMask = FirstNonZeroIdx << 6 | FirstNonZeroIdx << 4;
+    else
+      ElementMoveMask = IncorrectDst << 6 | IncorrectIdx << 4;
+
+    SDValue InsertpsMask =
+        DAG.getIntPtrConstant(ElementMoveMask | (~NonZeros & 0xf));
+    return DAG.getNode(X86ISD::INSERTPS, dl, VT, V, V, InsertpsMask);
+  }
+
+  return SDValue();
+}
+
 /// getVShift - Return a vector logical shift node.
 ///
 static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp,
@@ -5480,7 +5630,7 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
   EVT EltVT = VT.getVectorElementType();
   unsigned NumElems = Elts.size();
 
-  LoadSDNode *LDBase = NULL;
+  LoadSDNode *LDBase = nullptr;
   unsigned LastLoadedElt = -1U;
 
   // For each element in the initializer, see if we've found a load or an undef.
@@ -5545,8 +5695,7 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts,
     SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
     SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
     SDValue ResNode =
-        DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops,
-                                array_lengthof(Ops), MVT::i64,
+        DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, MVT::i64,
                                 LDBase->getPointerInfo(),
                                 LDBase->getAlignment(),
                                 false/*isVolatile*/, true/*ReadMem*/,
@@ -5661,7 +5810,7 @@ static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget,
     unsigned ScalarSize = CVT.getSizeInBits();
 
     if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)) {
-      const Constant *C = 0;
+      const Constant *C = nullptr;
       if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
         C = CI->getConstantIntValue();
       else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
@@ -5706,6 +5855,41 @@ static SDValue LowerVectorBroadcast(SDValue Op, const X86Subtarget* Subtarget,
   return SDValue();
 }
 
+/// \brief For an EXTRACT_VECTOR_ELT with a constant index return the real
+/// underlying vector and index.
+///
+/// Modifies \p ExtractedFromVec to the real vector and returns the real
+/// index.
+static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
+                                         SDValue ExtIdx) {
+  int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
+  if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
+    return Idx;
+
+  // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
+  // lowered this:
+  //   (extract_vector_elt (v8f32 %vreg1), Constant<6>)
+  // to:
+  //   (extract_vector_elt (vector_shuffle<2,u,u,u>
+  //                           (extract_subvector (v8f32 %vreg0), Constant<4>),
+  //                           undef)
+  //                       Constant<0>)
+  // In this case the vector is the extract_subvector expression and the index
+  // is 2, as specified by the shuffle.
+  ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
+  SDValue ShuffleVec = SVOp->getOperand(0);
+  MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
+  assert(ShuffleVecVT.getVectorElementType() ==
+         ExtractedFromVec.getSimpleValueType().getVectorElementType());
+
+  int ShuffleIdx = SVOp->getMaskElt(Idx);
+  if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
+    ExtractedFromVec = ShuffleVec;
+    return ShuffleIdx;
+  }
+  return Idx;
+}
+
 static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
   MVT VT = Op.getSimpleValueType();
 
@@ -5739,34 +5923,32 @@ static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
 
     SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
     SDValue ExtIdx = Op.getOperand(i).getOperand(1);
+    // Quit if non-constant index.
+    if (!isa<ConstantSDNode>(ExtIdx))
+      return SDValue();
+    int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
 
     // Quit if extracted from vector of different type.
     if (ExtractedFromVec.getValueType() != VT)
       return SDValue();
 
-    // Quit if non-constant index.
-    if (!isa<ConstantSDNode>(ExtIdx))
-      return SDValue();
-
-    if (VecIn1.getNode() == 0)
+    if (!VecIn1.getNode())
       VecIn1 = ExtractedFromVec;
     else if (VecIn1 != ExtractedFromVec) {
-      if (VecIn2.getNode() == 0)
+      if (!VecIn2.getNode())
         VecIn2 = ExtractedFromVec;
       else if (VecIn2 != ExtractedFromVec)
         // Quit if more than 2 vectors to shuffle
         return SDValue();
     }
 
-    unsigned Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
-
     if (ExtractedFromVec == VecIn1)
       Mask[i] = Idx;
     else if (ExtractedFromVec == VecIn2)
       Mask[i] = Idx + NumElems;
   }
 
-  if (VecIn1.getNode() == 0)
+  if (!VecIn1.getNode())
     return SDValue();
 
   VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
@@ -5791,24 +5973,22 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
   SDLoc dl(Op);
   if (ISD::isBuildVectorAllZeros(Op.getNode())) {
     SDValue Cst = DAG.getTargetConstant(0, MVT::i1);
-    SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst,
-                      Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
-    return DAG.getNode(ISD::BUILD_VECTOR, dl, VT,
-                       Ops, VT.getVectorNumElements());
+    SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
+    return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
   }
 
   if (ISD::isBuildVectorAllOnes(Op.getNode())) {
     SDValue Cst = DAG.getTargetConstant(1, MVT::i1);
-    SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst,
-                      Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst };
-    return DAG.getNode(ISD::BUILD_VECTOR, dl, VT,
-                       Ops, VT.getVectorNumElements());
+    SmallVector<SDValue, 16> Ops(VT.getVectorNumElements(), Cst);
+    return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
   }
 
   bool AllContants = true;
   uint64_t Immediate = 0;
   int NonConstIdx = -1;
   bool IsSplat = true;
+  unsigned NumNonConsts = 0;
+  unsigned NumConsts = 0;
   for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
     SDValue In = Op.getOperand(idx);
     if (In.getOpcode() == ISD::UNDEF)
@@ -5816,9 +5996,13 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
     if (!isa<ConstantSDNode>(In)) {
       AllContants = false;
       NonConstIdx = idx;
+      NumNonConsts++;
     }
-    else if (cast<ConstantSDNode>(In)->getZExtValue())
+    else {
+      NumConsts++;
+      if (cast<ConstantSDNode>(In)->getZExtValue())
       Immediate |= (1ULL << idx);
+    }
     if (In != Op.getOperand(0))
       IsSplat = false;
   }
@@ -5830,6 +6014,19 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
                        DAG.getIntPtrConstant(0));
   }
 
+  if (NumNonConsts == 1 && NonConstIdx != 0) {
+    SDValue DstVec;
+    if (NumConsts) {
+      SDValue VecAsImm = DAG.getConstant(Immediate,
+                                         MVT::getIntegerVT(VT.getSizeInBits()));
+      DstVec = DAG.getNode(ISD::BITCAST, dl, VT, VecAsImm);
+    }
+    else 
+      DstVec = DAG.getUNDEF(VT);
+    return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
+                       Op.getOperand(NonConstIdx),
+                       DAG.getIntPtrConstant(NonConstIdx));
+  }
   if (!IsSplat && (NonConstIdx != 0))
     llvm_unreachable("Unsupported BUILD_VECTOR operation");
   MVT SelectVT = (VT == MVT::v16i1)? MVT::i16 : MVT::i8;
@@ -6043,9 +6240,10 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
     EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);
 
     // Build both the lower and upper subvector.
-    SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, &V[0], NumElems/2);
-    SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT, &V[NumElems / 2],
-                                NumElems/2);
+    SDValue Lower = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT,
+                                makeArrayRef(&V[0], NumElems/2));
+    SDValue Upper = DAG.getNode(ISD::BUILD_VECTOR, dl, HVT,
+                                makeArrayRef(&V[NumElems / 2], NumElems/2));
 
     // Recreate the wider vector with the lower and upper part.
     if (VT.is256BitVector())
@@ -6078,6 +6276,14 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
     if (V.getNode()) return V;
   }
 
+  // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
+  if (EVTBits == 32 && NumElems == 4) {
+    SDValue V = LowerBuildVectorv4x32(Op, NumElems, NonZeros, NumNonZero,
+                                      NumZero, DAG, Subtarget, *this);
+    if (V.getNode())
+      return V;
+  }
+
   // If element VT is == 32 bits, turn it into a number of shuffles.
   SmallVector<SDValue, 8> V(NumElems);
   if (NumElems == 4 && NumZero > 0) {
@@ -6332,8 +6538,7 @@ static SDValue getPSHUFB(ArrayRef<int> MaskVals, SDValue V1, SDLoc &dl,
   if (ShufVT != VT)
     V1 = DAG.getNode(ISD::BITCAST, dl, ShufVT, V1);
   return DAG.getNode(X86ISD::PSHUFB, dl, ShufVT, V1,
-                     DAG.getNode(ISD::BUILD_VECTOR, dl, ShufVT,
-                                 PshufbMask.data(), PshufbMask.size()));
+                     DAG.getNode(ISD::BUILD_VECTOR, dl, ShufVT, PshufbMask));
 }
 
 // v8i16 shuffles - Prefer shuffles in the following order:
@@ -6516,7 +6721,7 @@ LowerVECTOR_SHUFFLEv8i16(SDValue Op, const X86Subtarget *Subtarget,
     NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
                                 &MaskV[0]);
 
-    if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) {
+    if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSE2()) {
       ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
       NewV = getTargetShuffleNode(X86ISD::PSHUFLW, dl, MVT::v8i16,
                                   NewV.getOperand(0),
@@ -6540,7 +6745,7 @@ LowerVECTOR_SHUFFLEv8i16(SDValue Op, const X86Subtarget *Subtarget,
     NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16),
                                 &MaskV[0]);
 
-    if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3()) {
+    if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSE2()) {
       ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(NewV.getNode());
       NewV = getTargetShuffleNode(X86ISD::PSHUFHW, dl, MVT::v8i16,
                                   NewV.getOperand(0),
@@ -6635,7 +6840,7 @@ static SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
     }
     V1 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V1,
                      DAG.getNode(ISD::BUILD_VECTOR, dl,
-                                 MVT::v16i8, &pshufbMask[0], 16));
+                                 MVT::v16i8, pshufbMask));
 
     // As PSHUFB will zero elements with negative indices, it's safe to ignore
     // the 2nd operand if it's undefined or zero.
@@ -6653,7 +6858,7 @@ static SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp,
     }
     V2 = DAG.getNode(X86ISD::PSHUFB, dl, MVT::v16i8, V2,
                      DAG.getNode(ISD::BUILD_VECTOR, dl,
-                                 MVT::v16i8, &pshufbMask[0], 16));
+                                 MVT::v16i8, pshufbMask));
     return DAG.getNode(ISD::OR, dl, MVT::v16i8, V1, V2);
   }
 
@@ -6771,6 +6976,9 @@ SDValue RewriteAsNarrowerShuffle(ShuffleVectorSDNode *SVOp,
   unsigned Scale;
   switch (VT.SimpleTy) {
   default: llvm_unreachable("Unexpected!");
+  case MVT::v2i64:
+  case MVT::v2f64:
+           return SDValue(SVOp, 0);
   case MVT::v4f32:  NewVT = MVT::v2f64; Scale = 2; break;
   case MVT::v4i32:  NewVT = MVT::v2i64; Scale = 2; break;
   case MVT::v8i16:  NewVT = MVT::v4i32; Scale = 2; break;
@@ -6805,7 +7013,7 @@ static SDValue getVZextMovL(MVT VT, MVT OpVT,
                             SDValue SrcOp, SelectionDAG &DAG,
                             const X86Subtarget *Subtarget, SDLoc dl) {
   if (VT == MVT::v2f64 || VT == MVT::v4f32) {
-    LoadSDNode *LD = NULL;
+    LoadSDNode *LD = nullptr;
     if (!isScalarLoadToVector(SrcOp.getNode(), &LD))
       LD = dyn_cast<LoadSDNode>(SrcOp);
     if (!LD) {
@@ -6924,8 +7132,7 @@ LowerVECTOR_SHUFFLE_256(ShuffleVectorSDNode *SVOp, SelectionDAG &DAG) {
       }
 
       // Construct the output using a BUILD_VECTOR.
-      Output[l] = DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, &SVOps[0],
-                              SVOps.size());
+      Output[l] = DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, SVOps);
     } else if (InputUsed[0] < 0) {
       // No input vectors were used! The result is undefined.
       Output[l] = DAG.getUNDEF(NVT);
@@ -7207,6 +7414,93 @@ SDValue getMOVLP(SDValue &Op, SDLoc &dl, SelectionDAG &DAG, bool HasSSE2) {
                               getShuffleSHUFImmediate(SVOp), DAG);
 }
 
+static SDValue NarrowVectorLoadToElement(LoadSDNode *Load, unsigned Index,
+                                         SelectionDAG &DAG) {
+  SDLoc dl(Load);
+  MVT VT = Load->getSimpleValueType(0);
+  MVT EVT = VT.getVectorElementType();
+  SDValue Addr = Load->getOperand(1);
+  SDValue NewAddr = DAG.getNode(
+      ISD::ADD, dl, Addr.getSimpleValueType(), Addr,
+      DAG.getConstant(Index * EVT.getStoreSize(), Addr.getSimpleValueType()));
+
+  SDValue NewLoad =
+      DAG.getLoad(EVT, dl, Load->getChain(), NewAddr,
+                  DAG.getMachineFunction().getMachineMemOperand(
+                      Load->getMemOperand(), 0, EVT.getStoreSize()));
+  return NewLoad;
+}
+
+// It is only safe to call this function if isINSERTPSMask is true for
+// this shufflevector mask.
+static SDValue getINSERTPS(ShuffleVectorSDNode *SVOp, SDLoc &dl,
+                           SelectionDAG &DAG) {
+  // Generate an insertps instruction when inserting an f32 from memory onto a
+  // v4f32 or when copying a member from one v4f32 to another.
+  // We also use it for transferring i32 from one register to another,
+  // since it simply copies the same bits.
+  // If we're transferring an i32 from memory to a specific element in a
+  // register, we output a generic DAG that will match the PINSRD
+  // instruction.
+  MVT VT = SVOp->getSimpleValueType(0);
+  MVT EVT = VT.getVectorElementType();
+  SDValue V1 = SVOp->getOperand(0);
+  SDValue V2 = SVOp->getOperand(1);
+  auto Mask = SVOp->getMask();
+  assert((VT == MVT::v4f32 || VT == MVT::v4i32) &&
+         "unsupported vector type for insertps/pinsrd");
+
+  int FromV1 = std::count_if(Mask.begin(), Mask.end(),
+                             [](const int &i) { return i < 4; });
+
+  SDValue From;
+  SDValue To;
+  unsigned DestIndex;
+  if (FromV1 == 1) {
+    From = V1;
+    To = V2;
+    DestIndex = std::find_if(Mask.begin(), Mask.end(),
+                             [](const int &i) { return i < 4; }) -
+                Mask.begin();
+  } else {
+    From = V2;
+    To = V1;
+    DestIndex = std::find_if(Mask.begin(), Mask.end(),
+                             [](const int &i) { return i >= 4; }) -
+                Mask.begin();
+  }
+
+  if (MayFoldLoad(From)) {
+    // Trivial case, when From comes from a load and is only used by the
+    // shuffle. Make it use insertps from the vector that we need from that
+    // load.
+    SDValue NewLoad =
+        NarrowVectorLoadToElement(cast<LoadSDNode>(From), DestIndex, DAG);
+    if (!NewLoad.getNode())
+      return SDValue();
+
+    if (EVT == MVT::f32) {
+      // Create this as a scalar to vector to match the instruction pattern.
+      SDValue LoadScalarToVector =
+          DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, NewLoad);
+      SDValue InsertpsMask = DAG.getIntPtrConstant(DestIndex << 4);
+      return DAG.getNode(X86ISD::INSERTPS, dl, VT, To, LoadScalarToVector,
+                         InsertpsMask);
+    } else { // EVT == MVT::i32
+      // If we're getting an i32 from memory, use an INSERT_VECTOR_ELT
+      // instruction, to match the PINSRD instruction, which loads an i32 to a
+      // certain vector element.
+      return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, To, NewLoad,
+                         DAG.getConstant(DestIndex, MVT::i32));
+    }
+  }
+
+  // Vector-element-to-vector
+  unsigned SrcIndex = Mask[DestIndex] % 4;
+  SDValue InsertpsMask = DAG.getIntPtrConstant(DestIndex << 4 | SrcIndex << 6);
+  return DAG.getNode(X86ISD::INSERTPS, dl, VT, To, From, InsertpsMask);
+}
+
 // Reduce a vector shuffle to zext.
 static SDValue LowerVectorIntExtend(SDValue Op, const X86Subtarget *Subtarget,
                                     SelectionDAG &DAG) {
@@ -7295,9 +7589,8 @@ static SDValue LowerVectorIntExtend(SDValue Op, const X86Subtarget *Subtarget,
                      DAG.getNode(X86ISD::VZEXT, DL, NVT, V1));
 }
 
-static SDValue
-NormalizeVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
-                       SelectionDAG &DAG) {
+static SDValue NormalizeVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
+                                      SelectionDAG &DAG) {
   ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
   MVT VT = Op.getSimpleValueType();
   SDLoc dl(Op);
@@ -7322,31 +7615,29 @@ NormalizeVectorShuffle(SDValue Op, const X86Subtarget *Subtarget,
 
   // If the shuffle can be profitably rewritten as a narrower shuffle, then
   // do it!
-  if (VT == MVT::v8i16  || VT == MVT::v16i8 ||
-      VT == MVT::v16i16 || VT == MVT::v32i8) {
+  if (VT == MVT::v8i16 || VT == MVT::v16i8 || VT == MVT::v16i16 ||
+      VT == MVT::v32i8) {
     SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
     if (NewOp.getNode())
       return DAG.getNode(ISD::BITCAST, dl, VT, NewOp);
-  } else if ((VT == MVT::v4i32 ||
-             (VT == MVT::v4f32 && Subtarget->hasSSE2()))) {
+  } else if (VT.is128BitVector() && Subtarget->hasSSE2()) {
     // FIXME: Figure out a cleaner way to do this.
-    // Try to make use of movq to zero out the top part.
     if (ISD::isBuildVectorAllZeros(V2.getNode())) {
       SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
       if (NewOp.getNode()) {
         MVT NewVT = NewOp.getSimpleValueType();
         if (isCommutedMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(),
                                NewVT, true, false))
-          return getVZextMovL(VT, NewVT, NewOp.getOperand(0),
-                              DAG, Subtarget, dl);
+          return getVZextMovL(VT, NewVT, NewOp.getOperand(0), DAG, Subtarget,
+                              dl);
       }
     } else if (ISD::isBuildVectorAllZeros(V1.getNode())) {
       SDValue NewOp = RewriteAsNarrowerShuffle(SVOp, DAG);
       if (NewOp.getNode()) {
         MVT NewVT = NewOp.getSimpleValueType();
         if (isMOVLMask(cast<ShuffleVectorSDNode>(NewOp)->getMask(), NewVT))
-          return getVZextMovL(VT, NewVT, NewOp.getOperand(1),
-                              DAG, Subtarget, dl);
+          return getVZextMovL(VT, NewVT, NewOp.getOperand(1), DAG, Subtarget,
+                              dl);
       }
     }
   }
@@ -7609,6 +7900,11 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
                                 getShuffleSHUFImmediate(SVOp), DAG);
   }
 
+  unsigned Idx;
+  if (VT.is512BitVector() && isINSERT64x4Mask(M, VT, &Idx))
+    return Insert256BitVector(V1, Extract256BitVector(V2, 0, DAG, dl),
+                              Idx*(NumElems/2), DAG, dl);
+
   // Handle VPERM2F128/VPERM2I128 permutations
   if (isVPERM2X128Mask(M, VT, HasFp256))
     return getTargetShuffleNode(X86ISD::VPERM2X128, dl, VT, V1,
@@ -7618,6 +7914,9 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
   if (BlendOp.getNode())
     return BlendOp;
 
+  if (Subtarget->hasSSE41() && isINSERTPSMask(M, VT))
+    return getINSERTPS(SVOp, dl, DAG);
+
   unsigned Imm8;
   if (V2IsUndef && HasInt256 && isPermImmMask(M, VT, Imm8))
     return getTargetShuffleNode(X86ISD::VPERMI, dl, VT, V1, Imm8, DAG);
@@ -7631,8 +7930,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
       permclMask.push_back(DAG.getConstant((M[i]>=0) ? M[i] : 0, MaskEltVT));
     }
 
-    SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVectorVT,
-                                &permclMask[0], NumElems);
+    SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, MaskVectorVT, permclMask);
     if (V2IsUndef)
       // Bitcast is for VPERMPS since mask is v8i32 but node takes v8f32
       return DAG.getNode(X86ISD::VPERMV, dl, VT,
@@ -7684,6 +7982,109 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const {
   return SDValue();
 }
 
+// This function assumes its argument is a BUILD_VECTOR of constants or
+// undef SDNodes. i.e: ISD::isBuildVectorOfConstantSDNodes(BuildVector) is
+// true.
+static bool BUILD_VECTORtoBlendMask(BuildVectorSDNode *BuildVector,
+                                    unsigned &MaskValue) {
+  MaskValue = 0;
+  unsigned NumElems = BuildVector->getNumOperands();
+  // There are 2 lanes if (NumElems > 8), and 1 lane otherwise.
+  unsigned NumLanes = (NumElems - 1) / 8 + 1;
+  unsigned NumElemsInLane = NumElems / NumLanes;
+
+  // Blend for v16i16 should be symetric for the both lanes.
+  for (unsigned i = 0; i < NumElemsInLane; ++i) {
+    SDValue EltCond = BuildVector->getOperand(i);
+    SDValue SndLaneEltCond =
+        (NumLanes == 2) ? BuildVector->getOperand(i + NumElemsInLane) : EltCond;
+
+    int Lane1Cond = -1, Lane2Cond = -1;
+    if (isa<ConstantSDNode>(EltCond))
+      Lane1Cond = !isZero(EltCond);
+    if (isa<ConstantSDNode>(SndLaneEltCond))
+      Lane2Cond = !isZero(SndLaneEltCond);
+
+    if (Lane1Cond == Lane2Cond || Lane2Cond < 0)
+      // Lane1Cond != 0, means we want the first argument.
+      // Lane1Cond == 0, means we want the second argument.
+      // The encoding of this argument is 0 for the first argument, 1
+      // for the second. Therefore, invert the condition.
+      MaskValue |= !Lane1Cond << i;
+    else if (Lane1Cond < 0)
+      MaskValue |= !Lane2Cond << i;
+    else
+      return false;
+  }
+  return true;
+}
+
+// Try to lower a vselect node into a simple blend instruction.
+static SDValue LowerVSELECTtoBlend(SDValue Op, const X86Subtarget *Subtarget,
+                                   SelectionDAG &DAG) {
+  SDValue Cond = Op.getOperand(0);
+  SDValue LHS = Op.getOperand(1);
+  SDValue RHS = Op.getOperand(2);
+  SDLoc dl(Op);
+  MVT VT = Op.getSimpleValueType();
+  MVT EltVT = VT.getVectorElementType();
+  unsigned NumElems = VT.getVectorNumElements();
+
+  // There is no blend with immediate in AVX-512.
+  if (VT.is512BitVector())
+    return SDValue();
+
+  if (!Subtarget->hasSSE41() || EltVT == MVT::i8)
+    return SDValue();
+  if (!Subtarget->hasInt256() && VT == MVT::v16i16)
+    return SDValue();
+
+  if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
+    return SDValue();
+
+  // Check the mask for BLEND and build the value.
+  unsigned MaskValue = 0;
+  if (!BUILD_VECTORtoBlendMask(cast<BuildVectorSDNode>(Cond), MaskValue))
+    return SDValue();
+
+  // Convert i32 vectors to floating point if it is not AVX2.
+  // AVX2 introduced VPBLENDD instruction for 128 and 256-bit vectors.
+  MVT BlendVT = VT;
+  if (EltVT == MVT::i64 || (EltVT == MVT::i32 && !Subtarget->hasInt256())) {
+    BlendVT = MVT::getVectorVT(MVT::getFloatingPointVT(EltVT.getSizeInBits()),
+                               NumElems);
+    LHS = DAG.getNode(ISD::BITCAST, dl, VT, LHS);
+    RHS = DAG.getNode(ISD::BITCAST, dl, VT, RHS);
+  }
+
+  SDValue Ret = DAG.getNode(X86ISD::BLENDI, dl, BlendVT, LHS, RHS,
+                            DAG.getConstant(MaskValue, MVT::i32));
+  return DAG.getNode(ISD::BITCAST, dl, VT, Ret);
+}
+
+SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
+  SDValue BlendOp = LowerVSELECTtoBlend(Op, Subtarget, DAG);
+  if (BlendOp.getNode())
+    return BlendOp;
+
+  // Some types for vselect were previously set to Expand, not Legal or
+  // Custom. Return an empty SDValue so we fall-through to Expand, after
+  // the Custom lowering phase.
+  MVT VT = Op.getSimpleValueType();
+  switch (VT.SimpleTy) {
+  default:
+    break;
+  case MVT::v8i16:
+  case MVT::v16i16:
+    return SDValue();
+  }
+
+  // We couldn't create a "Blend with immediate" node.
+  // This node should still be legal, but we'll have to emit a blendv*
+  // instruction.
+  return Op;
+}
+
 static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
   MVT VT = Op.getSimpleValueType();
   SDLoc dl(Op);
@@ -7946,10 +8347,47 @@ static SDValue LowerINSERT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
   return SDValue();
 }
 
+/// Insert one bit to mask vector, like v16i1 or v8i1.
+/// AVX-512 feature.
+SDValue 
+X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc dl(Op);
+  SDValue Vec = Op.getOperand(0);
+  SDValue Elt = Op.getOperand(1);
+  SDValue Idx = Op.getOperand(2);
+  MVT VecVT = Vec.getSimpleValueType();
+
+  if (!isa<ConstantSDNode>(Idx)) {
+    // Non constant index. Extend source and destination,
+    // insert element and then truncate the result.
+    MVT ExtVecVT = (VecVT == MVT::v8i1 ?  MVT::v8i64 : MVT::v16i32);
+    MVT ExtEltVT = (VecVT == MVT::v8i1 ?  MVT::i64 : MVT::i32);
+    SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT, 
+      DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVecVT, Vec),
+      DAG.getNode(ISD::ZERO_EXTEND, dl, ExtEltVT, Elt), Idx);
+    return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
+  }
+
+  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+  SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Elt);
+  if (Vec.getOpcode() == ISD::UNDEF)
+    return DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec,
+                       DAG.getConstant(IdxVal, MVT::i8));
+  const TargetRegisterClass* rc = getRegClassFor(VecVT);
+  unsigned MaxSift = rc->getSize()*8 - 1;
+  EltInVec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec,
+                    DAG.getConstant(MaxSift, MVT::i8));
+  EltInVec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, EltInVec,
+                    DAG.getConstant(MaxSift - IdxVal, MVT::i8));
+  return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
+}
 SDValue
 X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const {
   MVT VT = Op.getSimpleValueType();
   MVT EltVT = VT.getVectorElementType();
+  
+  if (EltVT == MVT::i1)
+    return InsertBitToMaskVector(Op, DAG);
 
   SDLoc dl(Op);
   SDValue N0 = Op.getOperand(0);
@@ -8294,10 +8732,10 @@ GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
 
   if (InFlag) {
     SDValue Ops[] = { Chain,  TGA, *InFlag };
-    Chain = DAG.getNode(CallType, dl, NodeTys, Ops, array_lengthof(Ops));
+    Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
   } else {
     SDValue Ops[]  = { Chain, TGA };
-    Chain = DAG.getNode(CallType, dl, NodeTys, Ops, array_lengthof(Ops));
+    Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
   }
 
   // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
@@ -8325,7 +8763,7 @@ LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
 static SDValue
 LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
                                 const EVT PtrVT) {
-  return GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT,
+  return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
                     X86::RAX, X86II::MO_TLSGD);
 }
 
@@ -8342,7 +8780,7 @@ static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
 
   SDValue Base;
   if (is64Bit) {
-    Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT, X86::RAX,
+    Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX,
                       X86II::MO_TLSLD, /*LocalDynamic=*/true);
   } else {
     SDValue InFlag;
@@ -8481,7 +8919,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
     SDValue Chain = DAG.getEntryNode();
     SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
     SDValue Args[] = { Chain, Offset };
-    Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args, 2);
+    Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
 
     // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
     MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
@@ -8507,10 +8945,6 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
     // Windows 64bit: gs:0x58
     // Windows 32bit: fs:__tls_array
 
-    // If GV is an alias then use the aliasee for determining
-    // thread-localness.
-    if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
-      GV = GA->getAliasedGlobal();
     SDLoc dl(GA);
     SDValue Chain = DAG.getEntryNode();
 
@@ -8609,15 +9043,15 @@ static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
   SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond };
 
   if (Op.getOpcode() == ISD::SHL_PARTS) {
-    Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4);
-    Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4);
+    Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
+    Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
   } else {
-    Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0, 4);
-    Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1, 4);
+    Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
+    Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
   }
 
   SDValue Ops[2] = { Lo, Hi };
-  return DAG.getMergeValues(Ops, array_lengthof(Ops), dl);
+  return DAG.getMergeValues(Ops, dl);
 }
 
 SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
@@ -8680,8 +9114,7 @@ SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
   SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
   SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG :
                                            X86ISD::FILD, DL,
-                                           Tys, Ops, array_lengthof(Ops),
-                                           SrcVT, MMO);
+                                           Tys, Ops, SrcVT, MMO);
 
   if (useSSE) {
     Chain = Result.getValue(1);
@@ -8704,8 +9137,7 @@ SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
                             MachineMemOperand::MOStore, SSFISize, SSFISize);
 
     Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys,
-                                    Ops, array_lengthof(Ops),
-                                    Op.getValueType(), MMO);
+                                    Ops, Op.getValueType(), MMO);
     Result = DAG.getLoad(Op.getValueType(), DL, Chain, StackSlot,
                          MachinePointerInfo::getFixedStack(SSFI),
                          false, false, false, 0);
@@ -8900,7 +9332,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
   SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
   SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
   SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops,
-                                         array_lengthof(Ops), MVT::i64, MMO);
+                                         MVT::i64, MMO);
 
   APInt FF(32, 0x5F800000ULL);
 
@@ -8993,8 +9425,7 @@ X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
     MachineMemOperand *MMO =
       MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(SSFI),
                               MachineMemOperand::MOLoad, MemSize, MemSize);
-    Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops,
-                                    array_lengthof(Ops), DstTy, MMO);
+    Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO);
     Chain = Value.getValue(1);
     SSFI = MF.getFrameInfo()->CreateStackObject(MemSize, MemSize, false);
     StackSlot = DAG.getFrameIndex(SSFI, getPointerTy());
@@ -9008,8 +9439,7 @@ X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
     // Build the FP_TO_INT*_IN_MEM
     SDValue Ops[] = { Chain, Value, StackSlot };
     SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
-                                           Ops, array_lengthof(Ops), DstTy,
-                                           MMO);
+                                           Ops, DstTy, MMO);
     return std::make_pair(FIST, StackSlot);
   } else {
     SDValue ftol = DAG.getNode(X86ISD::WIN_FTOL, DL,
@@ -9021,8 +9451,8 @@ X86TargetLowering:: FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
       MVT::i32, eax.getValue(2));
     SDValue Ops[] = { eax, edx };
     SDValue pair = IsReplace
-      ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops, array_lengthof(Ops))
-      : DAG.getMergeValues(Ops, array_lengthof(Ops), DL);
+      ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops)
+      : DAG.getMergeValues(Ops, DL);
     return std::make_pair(pair, SDValue());
   }
 }
@@ -9217,8 +9647,7 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
         for (unsigned j = 0; j < 8; ++j)
           pshufbMask.push_back(DAG.getConstant(0x80, MVT::i8));
       }
-      SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8,
-                               &pshufbMask[0], 32);
+      SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v32i8, pshufbMask);
       In = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v32i8, In, BV);
       In = DAG.getNode(ISD::BITCAST, DL, MVT::v4i64, In);
 
@@ -9284,7 +9713,7 @@ SDValue X86TargetLowering::LowerFP_TO_SINT(SDValue Op,
     /*IsSigned=*/ true, /*IsReplace=*/ false);
   SDValue FIST = Vals.first, StackSlot = Vals.second;
   // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
-  if (FIST.getNode() == 0) return Op;
+  if (!FIST.getNode()) return Op;
 
   if (StackSlot.getNode())
     // Load the result.
@@ -9581,12 +10010,29 @@ static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget *Subtarget,
                      VecIns.back(), VecIns.back());
 }
 
+/// \brief return true if \c Op has a use that doesn't just read flags.
+static bool hasNonFlagsUse(SDValue Op) {
+  for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
+       ++UI) {
+    SDNode *User = *UI;
+    unsigned UOpNo = UI.getOperandNo();
+    if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
+      // Look pass truncate.
+      UOpNo = User->use_begin().getOperandNo();
+      User = *User->use_begin();
+    }
+
+    if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
+        !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
+      return true;
+  }
+  return false;
+}
+
 /// Emit nodes that will be selected as "test Op0,Op0", or something
 /// equivalent.
-SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
+SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, SDLoc dl,
                                     SelectionDAG &DAG) const {
-  SDLoc dl(Op);
-
   if (Op.getValueType() == MVT::i1)
     // KORTEST instruction should be selected
     return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
@@ -9687,31 +10133,35 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
     Opcode = X86ISD::ADD;
     NumOperands = 2;
     break;
-  case ISD::AND: {
-    // If the primary and result isn't used, don't bother using X86ISD::AND,
-    // because a TEST instruction will be better.
-    bool NonFlagUse = false;
-    for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
-           UE = Op.getNode()->use_end(); UI != UE; ++UI) {
-      SDNode *User = *UI;
-      unsigned UOpNo = UI.getOperandNo();
-      if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
-        // Look pass truncate.
-        UOpNo = User->use_begin().getOperandNo();
-        User = *User->use_begin();
-      }
-
-      if (User->getOpcode() != ISD::BRCOND &&
-          User->getOpcode() != ISD::SETCC &&
-          !(User->getOpcode() == ISD::SELECT && UOpNo == 0)) {
-        NonFlagUse = true;
+  case ISD::SHL:
+  case ISD::SRL:
+    // If we have a constant logical shift that's only used in a comparison
+    // against zero turn it into an equivalent AND. This allows turning it into
+    // a TEST instruction later.
+    if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) &&
+        isa<ConstantSDNode>(Op->getOperand(1)) && !hasNonFlagsUse(Op)) {
+      EVT VT = Op.getValueType();
+      unsigned BitWidth = VT.getSizeInBits();
+      unsigned ShAmt = Op->getConstantOperandVal(1);
+      if (ShAmt >= BitWidth) // Avoid undefined shifts.
         break;
-      }
+      APInt Mask = ArithOp.getOpcode() == ISD::SRL
+                       ? APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt)
+                       : APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt);
+      if (!Mask.isSignedIntN(32)) // Avoid large immediates.
+        break;
+      SDValue New = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0),
+                                DAG.getConstant(Mask, VT));
+      DAG.ReplaceAllUsesWith(Op, New);
+      Op = New;
     }
+    break;
 
-    if (!NonFlagUse)
+  case ISD::AND:
+    // If the primary and result isn't used, don't bother using X86ISD::AND,
+    // because a TEST instruction will be better.
+    if (!hasNonFlagsUse(Op))
       break;
-  }
     // FALL THROUGH
   case ISD::SUB:
   case ISD::OR:
@@ -9794,7 +10244,7 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
   for (unsigned i = 0; i != NumOperands; ++i)
     Ops.push_back(Op.getOperand(i));
 
-  SDValue New = DAG.getNode(Opcode, dl, VTs, &Ops[0], NumOperands);
+  SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
   DAG.ReplaceAllUsesWith(Op, New);
   return SDValue(New.getNode(), 1);
 }
@@ -9802,11 +10252,10 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
 /// Emit nodes that will be selected as "cmp Op0,Op1", or something
 /// equivalent.
 SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
-                                   SelectionDAG &DAG) const {
-  SDLoc dl(Op0);
+                                   SDLoc dl, SelectionDAG &DAG) const {
   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op1)) {
     if (C->getAPIntValue() == 0)
-      return EmitTest(Op0, X86CC, DAG);
+      return EmitTest(Op0, X86CC, dl, DAG);
 
      if (Op0.getValueType() == MVT::i1)
        llvm_unreachable("Unexpected comparison operation for MVT::i1 operands");
@@ -9888,7 +10337,7 @@ SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC,
         unsigned AndBitWidth = And.getValueSizeInBits();
         if (BitWidth > AndBitWidth) {
           APInt Zeros, Ones;
-          DAG.ComputeMaskedBits(Op0, Zeros, Ones);
+          DAG.computeKnownBits(Op0, Zeros, Ones);
           if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth)
             return SDValue();
         }
@@ -10054,7 +10503,7 @@ static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG,
 /// \brief Try to turn a VSETULT into a VSETULE by modifying its second
 /// operand \p Op1.  If non-trivial (for example because it's not constant)
 /// return an empty value.
-static SDValue ChangeVSETULTtoVSETULE(SDValue Op1, SelectionDAG &DAG)
+static SDValue ChangeVSETULTtoVSETULE(SDLoc dl, SDValue Op1, SelectionDAG &DAG)
 {
   BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1.getNode());
   if (!BV)
@@ -10078,8 +10527,7 @@ static SDValue ChangeVSETULTtoVSETULE(SDValue Op1, SelectionDAG &DAG)
     ULTOp1.push_back(DAG.getConstant(Val - 1, EVT));
   }
 
-  return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(Op1), VT, ULTOp1.data(),
-                     ULTOp1.size());
+  return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, ULTOp1);
 }
 
 static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
@@ -10204,7 +10652,7 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget *Subtarget,
       // Only do this pre-AVX since vpcmp* is no longer destructive.
       if (Subtarget->hasAVX())
         break;
-      SDValue ULEOp1 = ChangeVSETULTtoVSETULE(Op1, DAG);
+      SDValue ULEOp1 = ChangeVSETULTtoVSETULE(dl, Op1, DAG);
       if (ULEOp1.getNode()) {
         Op1 = ULEOp1;
         Subus = true; Invert = false; Swap = false;
@@ -10383,7 +10831,7 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
   if (X86CC == X86::COND_INVALID)
     return SDValue();
 
-  SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, DAG);
+  SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, dl, DAG);
   EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
   SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
                               DAG.getConstant(X86CC, MVT::i8), EFLAGS);
@@ -10418,11 +10866,6 @@ static bool isX86LogicalCmp(SDValue Op) {
   return false;
 }
 
-static bool isZero(SDValue V) {
-  ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
-  return C && C->isNullValue();
-}
-
 static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
   if (V.getOpcode() != ISD::TRUNCATE)
     return false;
@@ -10517,7 +10960,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
         Res = DAG.getNOT(DL, Res, Res.getValueType());
 
       ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(Op2);
-      if (N2C == 0 || !N2C->isNullValue())
+      if (!N2C || !N2C->isNullValue())
         Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
       return Res;
     }
@@ -10606,7 +11049,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
 
   if (addTest) {
     CC = DAG.getConstant(X86::COND_NE, MVT::i8);
-    Cond = EmitTest(Cond, X86::COND_NE, DAG);
+    Cond = EmitTest(Cond, X86::COND_NE, DL, DAG);
   }
 
   // a <  b ? -1 :  0 -> RES = ~setcc_carry
@@ -10646,7 +11089,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
   // condition is true.
   SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
   SDValue Ops[] = { Op2, Op1, CC, Cond };
-  return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops, array_lengthof(Ops));
+  return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops);
 }
 
 static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, SelectionDAG &DAG) {
@@ -11027,7 +11470,7 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
 
   if (addTest) {
     CC = DAG.getConstant(X86::COND_NE, MVT::i8);
-    Cond = EmitTest(Cond, X86::COND_NE, DAG);
+    Cond = EmitTest(Cond, X86::COND_NE, dl, DAG);
   }
   Cond = ConvertCmpIfNecessary(Cond, DAG);
   return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
@@ -11042,13 +11485,50 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
 SDValue
 X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
                                            SelectionDAG &DAG) const {
-  assert((Subtarget->isOSWindows() ||
-          getTargetMachine().Options.EnableSegmentedStacks) &&
-         "This should be used only on Windows targets or when segmented stacks "
-         "are being used");
-  assert(!Subtarget->isTargetMacho() && "Not implemented");
+  MachineFunction &MF = DAG.getMachineFunction();
+  bool SplitStack = MF.shouldSplitStack();
+  bool Lower = (Subtarget->isOSWindows() && !Subtarget->isTargetMacho()) ||
+               SplitStack;
   SDLoc dl(Op);
 
+  if (!Lower) {
+    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+    SDNode* Node = Op.getNode();
+
+    unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore();
+    assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
+        " not tell us which reg is the stack pointer!");
+    EVT VT = Node->getValueType(0);
+    SDValue Tmp1 = SDValue(Node, 0);
+    SDValue Tmp2 = SDValue(Node, 1);
+    SDValue Tmp3 = Node->getOperand(2);
+    SDValue Chain = Tmp1.getOperand(0);
+
+    // Chain the dynamic stack allocation so that it doesn't modify the stack
+    // pointer when other instructions are using the stack.
+    Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, true),
+        SDLoc(Node));
+
+    SDValue Size = Tmp2.getOperand(1);
+    SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
+    Chain = SP.getValue(1);
+    unsigned Align = cast<ConstantSDNode>(Tmp3)->getZExtValue();
+    const TargetFrameLowering &TFI = *getTargetMachine().getFrameLowering();
+    unsigned StackAlign = TFI.getStackAlignment();
+    Tmp1 = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
+    if (Align > StackAlign)
+      Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1,
+          DAG.getConstant(-(uint64_t)Align, VT));
+    Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1); // Output chain
+
+    Tmp2 = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, true),
+        DAG.getIntPtrConstant(0, true), SDValue(),
+        SDLoc(Node));
+
+    SDValue Ops[2] = { Tmp1, Tmp2 };
+    return DAG.getMergeValues(Ops, dl);
+  }
+
   // Get the inputs.
   SDValue Chain = Op.getOperand(0);
   SDValue Size  = Op.getOperand(1);
@@ -11058,8 +11538,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
   bool Is64Bit = Subtarget->is64Bit();
   EVT SPTy = Is64Bit ? MVT::i64 : MVT::i32;
 
-  if (getTargetMachine().Options.EnableSegmentedStacks) {
-    MachineFunction &MF = DAG.getMachineFunction();
+  if (SplitStack) {
     MachineRegisterInfo &MRI = MF.getRegInfo();
 
     if (Is64Bit) {
@@ -11081,7 +11560,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
     SDValue Value = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
                                 DAG.getRegister(Vreg, SPTy));
     SDValue Ops1[2] = { Value, Chain };
-    return DAG.getMergeValues(Ops1, 2, dl);
+    return DAG.getMergeValues(Ops1, dl);
   } else {
     SDValue Flag;
     unsigned Reg = (Subtarget->is64Bit() ? X86::RAX : X86::EAX);
@@ -11105,7 +11584,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
     }
 
     SDValue Ops1[2] = { SP, Chain };
-    return DAG.getMergeValues(Ops1, 2, dl);
+    return DAG.getMergeValues(Ops1, dl);
   }
 }
 
@@ -11166,8 +11645,7 @@ SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
   Store = DAG.getStore(Op.getOperand(0), DL, RSFIN, FIN,
                        MachinePointerInfo(SV, 16), false, false, 0);
   MemOps.push_back(Store);
-  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
-                     &MemOps[0], MemOps.size());
+  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
 }
 
 SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
@@ -11221,8 +11699,7 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
   InstOps.push_back(DAG.getConstant(Align, MVT::i32));
   SDVTList VTs = DAG.getVTList(getPointerTy(), MVT::Other);
   SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl,
-                                          VTs, &InstOps[0], InstOps.size(),
-                                          MVT::i64,
+                                          VTs, InstOps, MVT::i64,
                                           MachinePointerInfo(SV),
                                           /*Align=*/0,
                                           /*Volatile=*/false,
@@ -11262,6 +11739,10 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, SDLoc dl, MVT VT,
                                           SelectionDAG &DAG) {
   MVT ElementType = VT.getVectorElementType();
 
+  // Fold this packed shift into its first operand if ShiftAmt is 0.
+  if (ShiftAmt == 0)
+    return SrcOp;
+
   // Check for ShiftAmt >= element width
   if (ShiftAmt >= ElementType.getSizeInBits()) {
     if (Opc == X86ISD::VSRAI)
@@ -11282,7 +11763,7 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, SDLoc dl, MVT VT,
     ConstantSDNode *ND;
 
     switch(Opc) {
-    default: llvm_unreachable(0);
+    default: llvm_unreachable(nullptr);
     case X86ISD::VSHLI:
       for (unsigned i=0; i!=NumElts; ++i) {
         SDValue CurrentOp = SrcOp->getOperand(i);
@@ -11321,7 +11802,7 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, SDLoc dl, MVT VT,
       break;
     }
 
-    return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Elts[0], NumElts);
+    return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Elts);
   }
 
   return DAG.getNode(Opc, dl, VT, SrcOp, DAG.getConstant(ShiftAmt, MVT::i8));
@@ -11353,7 +11834,7 @@ static SDValue getTargetVShiftNode(unsigned Opc, SDLoc dl, MVT VT,
   ShOps[0] = ShAmt;
   ShOps[1] = DAG.getConstant(0, MVT::i32);
   ShOps[2] = ShOps[3] = DAG.getUNDEF(MVT::i32);
-  ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, &ShOps[0], 4);
+  ShAmt = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, ShOps);
 
   // The return type has to be a 128-bit type with the same element
   // type as the input type.
@@ -11476,6 +11957,21 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
     return DAG.getNode(X86ISD::PMULUDQ, dl, Op.getValueType(),
                        Op.getOperand(1), Op.getOperand(2));
 
+  case Intrinsic::x86_sse41_pmuldq:
+  case Intrinsic::x86_avx2_pmul_dq:
+    return DAG.getNode(X86ISD::PMULDQ, dl, Op.getValueType(),
+                       Op.getOperand(1), Op.getOperand(2));
+
+  case Intrinsic::x86_sse2_pmulhu_w:
+  case Intrinsic::x86_avx2_pmulhu_w:
+    return DAG.getNode(ISD::MULHU, dl, Op.getValueType(),
+                       Op.getOperand(1), Op.getOperand(2));
+
+  case Intrinsic::x86_sse2_pmulh_w:
+  case Intrinsic::x86_avx2_pmulh_w:
+    return DAG.getNode(ISD::MULHS, dl, Op.getValueType(),
+                       Op.getOperand(1), Op.getOperand(2));
+
   // SSE2/AVX2 sub with unsigned saturation intrinsics
   case Intrinsic::x86_sse2_psubus_b:
   case Intrinsic::x86_sse2_psubus_w:
@@ -11927,7 +12423,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
     }
     SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
     SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
-    SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps.data(), NewOps.size());
+    SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps);
     SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
                                 DAG.getConstant(X86CC, MVT::i8),
                                 SDValue(PCMP.getNode(), 1));
@@ -11944,7 +12440,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
 
     SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
     SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
-    return DAG.getNode(Opcode, dl, VTs, NewOps.data(), NewOps.size());
+    return DAG.getNode(Opcode, dl, VTs, NewOps);
   }
   case Intrinsic::x86_fma_vfmadd_ps:
   case Intrinsic::x86_fma_vfmadd_pd:
@@ -12042,27 +12538,6 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
 }
 
 static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
-                             SDValue Base, SDValue Index,
-                             SDValue ScaleOp, SDValue Chain,
-                             const X86Subtarget * Subtarget) {
-  SDLoc dl(Op);
-  ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
-  assert(C && "Invalid scale type");
-  SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8);
-  SDValue Src = getZeroVector(Op.getValueType(), Subtarget, DAG, dl);
-  EVT MaskVT = MVT::getVectorVT(MVT::i1,
-                             Index.getSimpleValueType().getVectorNumElements());
-  SDValue MaskInReg = DAG.getConstant(~0, MaskVT);
-  SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
-  SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
-  SDValue Segment = DAG.getRegister(0, MVT::i32);
-  SDValue Ops[] = {Src, MaskInReg, Base, Scale, Index, Disp, Segment, Chain};
-  SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
-  SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
-  return DAG.getMergeValues(RetOps, array_lengthof(RetOps), dl);
-}
-
-static SDValue getMGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
                               SDValue Src, SDValue Mask, SDValue Base,
                               SDValue Index, SDValue ScaleOp, SDValue Chain,
                               const X86Subtarget * Subtarget) {
@@ -12072,7 +12547,12 @@ static SDValue getMGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
   SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8);
   EVT MaskVT = MVT::getVectorVT(MVT::i1,
                              Index.getSimpleValueType().getVectorNumElements());
-  SDValue MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
+  SDValue MaskInReg;
+  ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask);
+  if (MaskC)
+    MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), MaskVT);
+  else
+    MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
   SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
   SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
   SDValue Segment = DAG.getRegister(0, MVT::i32);
@@ -12081,12 +12561,12 @@ static SDValue getMGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
   SDValue Ops[] = {Src, MaskInReg, Base, Scale, Index, Disp, Segment, Chain};
   SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
   SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
-  return DAG.getMergeValues(RetOps, array_lengthof(RetOps), dl);
+  return DAG.getMergeValues(RetOps, dl);
 }
 
 static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
-                              SDValue Src, SDValue Base, SDValue Index,
-                              SDValue ScaleOp, SDValue Chain) {
+                               SDValue Src, SDValue Mask, SDValue Base,
+                               SDValue Index, SDValue ScaleOp, SDValue Chain) {
   SDLoc dl(Op);
   ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
   assert(C && "Invalid scale type");
@@ -12095,52 +12575,218 @@ static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
   SDValue Segment = DAG.getRegister(0, MVT::i32);
   EVT MaskVT = MVT::getVectorVT(MVT::i1,
                              Index.getSimpleValueType().getVectorNumElements());
-  SDValue MaskInReg = DAG.getConstant(~0, MaskVT);
+  SDValue MaskInReg;
+  ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask);
+  if (MaskC)
+    MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), MaskVT);
+  else
+    MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
   SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
   SDValue Ops[] = {Base, Scale, Index, Disp, Segment, MaskInReg, Src, Chain};
   SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
   return SDValue(Res, 1);
 }
 
-static SDValue getMScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
-                               SDValue Src, SDValue Mask, SDValue Base,
-                               SDValue Index, SDValue ScaleOp, SDValue Chain) {
+static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
+                               SDValue Mask, SDValue Base, SDValue Index,
+                               SDValue ScaleOp, SDValue Chain) {
   SDLoc dl(Op);
   ConstantSDNode *C = dyn_cast<ConstantSDNode>(ScaleOp);
   assert(C && "Invalid scale type");
   SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), MVT::i8);
   SDValue Disp = DAG.getTargetConstant(0, MVT::i32);
   SDValue Segment = DAG.getRegister(0, MVT::i32);
-  EVT MaskVT = MVT::getVectorVT(MVT::i1,
-                             Index.getSimpleValueType().getVectorNumElements());
-  SDValue MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
-  SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
-  SDValue Ops[] = {Base, Scale, Index, Disp, Segment, MaskInReg, Src, Chain};
-  SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
-  return SDValue(Res, 1);
+  EVT MaskVT =
+    MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
+  SDValue MaskInReg;
+  ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask);
+  if (MaskC)
+    MaskInReg = DAG.getTargetConstant(MaskC->getSExtValue(), MaskVT);
+  else
+    MaskInReg = DAG.getNode(ISD::BITCAST, dl, MaskVT, Mask);
+  //SDVTList VTs = DAG.getVTList(MVT::Other);
+  SDValue Ops[] = {MaskInReg, Base, Scale, Index, Disp, Segment, Chain};
+  SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
+  return SDValue(Res, 0);
+}
+
+// getReadTimeStampCounter - Handles the lowering of builtin intrinsics that
+// read the time stamp counter (x86_rdtsc and x86_rdtscp). This function is
+// also used to custom lower READCYCLECOUNTER nodes.
+static void getReadTimeStampCounter(SDNode *N, SDLoc DL, unsigned Opcode,
+                              SelectionDAG &DAG, const X86Subtarget *Subtarget,
+                              SmallVectorImpl<SDValue> &Results) {
+  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
+  SDValue rd = DAG.getNode(Opcode, DL, Tys, N->getOperand(0));
+  SDValue LO, HI;
+
+  // The processor's time-stamp counter (a 64-bit MSR) is stored into the
+  // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
+  // and the EAX register is loaded with the low-order 32 bits.
+  if (Subtarget->is64Bit()) {
+    LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
+    HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
+                            LO.getValue(2));
+  } else {
+    LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
+    HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
+                            LO.getValue(2));
+  }
+  SDValue Chain = HI.getValue(1);
+
+  if (Opcode == X86ISD::RDTSCP_DAG) {
+    assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
+
+    // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into
+    // the ECX register. Add 'ecx' explicitly to the chain.
+    SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32,
+                                     HI.getValue(2));
+    // Explicitly store the content of ECX at the location passed in input
+    // to the 'rdtscp' intrinsic.
+    Chain = DAG.getStore(ecx.getValue(1), DL, ecx, N->getOperand(2),
+                         MachinePointerInfo(), false, false, 0);
+  }
+
+  if (Subtarget->is64Bit()) {
+    // The EDX register is loaded with the high-order 32 bits of the MSR, and
+    // the EAX register is loaded with the low-order 32 bits.
+    SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
+                              DAG.getConstant(32, MVT::i8));
+    Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
+    Results.push_back(Chain);
+    return;
+  }
+
+  // Use a buildpair to merge the two 32-bit values into a 64-bit one.
+  SDValue Ops[] = { LO, HI };
+  SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
+  Results.push_back(Pair);
+  Results.push_back(Chain);
+}
+
+static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget,
+                                     SelectionDAG &DAG) {
+  SmallVector<SDValue, 2> Results;
+  SDLoc DL(Op);
+  getReadTimeStampCounter(Op.getNode(), DL, X86ISD::RDTSC_DAG, DAG, Subtarget,
+                          Results);
+  return DAG.getMergeValues(Results, DL);
+}
+
+enum IntrinsicType {
+  GATHER, SCATTER, PREFETCH, RDSEED, RDRAND, RDTSC, XTEST
+};
+
+struct IntrinsicData {
+  IntrinsicData(IntrinsicType IType, unsigned IOpc0, unsigned IOpc1)
+    :Type(IType), Opc0(IOpc0), Opc1(IOpc1) {}
+  IntrinsicType Type;
+  unsigned      Opc0;
+  unsigned      Opc1;
+};
+
+std::map < unsigned, IntrinsicData> IntrMap;
+static void InitIntinsicsMap() {
+  static bool Initialized = false;
+  if (Initialized) 
+    return;
+  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_qps_512,
+                                IntrinsicData(GATHER, X86::VGATHERQPSZrm, 0)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_qps_512,
+                                IntrinsicData(GATHER, X86::VGATHERQPSZrm, 0)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_qpd_512,
+                                IntrinsicData(GATHER, X86::VGATHERQPDZrm, 0)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_dpd_512,
+                                IntrinsicData(GATHER, X86::VGATHERDPDZrm, 0)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_dps_512,
+                                IntrinsicData(GATHER, X86::VGATHERDPSZrm, 0)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_qpi_512, 
+                                IntrinsicData(GATHER, X86::VPGATHERQDZrm, 0)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_qpq_512, 
+                                IntrinsicData(GATHER, X86::VPGATHERQQZrm, 0)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_dpi_512, 
+                                IntrinsicData(GATHER, X86::VPGATHERDDZrm, 0)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gather_dpq_512, 
+                                IntrinsicData(GATHER, X86::VPGATHERDQZrm, 0)));
+
+  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_qps_512,
+                                IntrinsicData(SCATTER, X86::VSCATTERQPSZmr, 0)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_qpd_512, 
+                                IntrinsicData(SCATTER, X86::VSCATTERQPDZmr, 0)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_dpd_512, 
+                                IntrinsicData(SCATTER, X86::VSCATTERDPDZmr, 0)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_dps_512, 
+                                IntrinsicData(SCATTER, X86::VSCATTERDPSZmr, 0)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_qpi_512, 
+                                IntrinsicData(SCATTER, X86::VPSCATTERQDZmr, 0)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_qpq_512, 
+                                IntrinsicData(SCATTER, X86::VPSCATTERQQZmr, 0)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_dpi_512, 
+                                IntrinsicData(SCATTER, X86::VPSCATTERDDZmr, 0)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatter_dpq_512, 
+                                IntrinsicData(SCATTER, X86::VPSCATTERDQZmr, 0)));
+   
+  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gatherpf_qps_512, 
+                                IntrinsicData(PREFETCH, X86::VGATHERPF0QPSm,
+                                                        X86::VGATHERPF1QPSm)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gatherpf_qpd_512, 
+                                IntrinsicData(PREFETCH, X86::VGATHERPF0QPDm,
+                                                        X86::VGATHERPF1QPDm)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gatherpf_dpd_512, 
+                                IntrinsicData(PREFETCH, X86::VGATHERPF0DPDm,
+                                                        X86::VGATHERPF1DPDm)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_gatherpf_dps_512, 
+                                IntrinsicData(PREFETCH, X86::VGATHERPF0DPSm,
+                                                        X86::VGATHERPF1DPSm)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatterpf_qps_512, 
+                                IntrinsicData(PREFETCH, X86::VSCATTERPF0QPSm,
+                                                        X86::VSCATTERPF1QPSm)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatterpf_qpd_512, 
+                                IntrinsicData(PREFETCH, X86::VSCATTERPF0QPDm,
+                                                        X86::VSCATTERPF1QPDm)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatterpf_dpd_512, 
+                                IntrinsicData(PREFETCH, X86::VSCATTERPF0DPDm,
+                                                        X86::VSCATTERPF1DPDm)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_avx512_scatterpf_dps_512, 
+                                IntrinsicData(PREFETCH, X86::VSCATTERPF0DPSm,
+                                                        X86::VSCATTERPF1DPSm)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_rdrand_16,
+                                IntrinsicData(RDRAND, X86ISD::RDRAND, 0)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_rdrand_32,
+                                IntrinsicData(RDRAND, X86ISD::RDRAND, 0)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_rdrand_64,
+                                IntrinsicData(RDRAND, X86ISD::RDRAND, 0)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_rdseed_16,
+                                IntrinsicData(RDSEED, X86ISD::RDSEED, 0)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_rdseed_32,
+                                IntrinsicData(RDSEED, X86ISD::RDSEED, 0)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_rdseed_64,
+                                IntrinsicData(RDSEED, X86ISD::RDSEED, 0)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_xtest,
+                                IntrinsicData(XTEST,  X86ISD::XTEST,  0)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_rdtsc,
+                                IntrinsicData(RDTSC,  X86ISD::RDTSC_DAG, 0)));
+  IntrMap.insert(std::make_pair(Intrinsic::x86_rdtscp,
+                                IntrinsicData(RDTSC,  X86ISD::RDTSCP_DAG, 0)));
+  Initialized = true;
 }
 
 static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
                                       SelectionDAG &DAG) {
-  SDLoc dl(Op);
+  InitIntinsicsMap();
   unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
-  switch (IntNo) {
-  default: return SDValue();    // Don't custom lower most intrinsics.
+  std::map < unsigned, IntrinsicData>::const_iterator itr = IntrMap.find(IntNo);
+  if (itr == IntrMap.end())
+    return SDValue();
 
-  // RDRAND/RDSEED intrinsics.
-  case Intrinsic::x86_rdrand_16:
-  case Intrinsic::x86_rdrand_32:
-  case Intrinsic::x86_rdrand_64:
-  case Intrinsic::x86_rdseed_16:
-  case Intrinsic::x86_rdseed_32:
-  case Intrinsic::x86_rdseed_64: {
-    unsigned Opcode = (IntNo == Intrinsic::x86_rdseed_16 ||
-                       IntNo == Intrinsic::x86_rdseed_32 ||
-                       IntNo == Intrinsic::x86_rdseed_64) ? X86ISD::RDSEED :
-                                                            X86ISD::RDRAND;
+  SDLoc dl(Op);
+  IntrinsicData Intr = itr->second;
+  switch(Intr.Type) {
+  case RDSEED:
+  case RDRAND: {
     // Emit the node with the right value type.
     SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Glue, MVT::Other);
-    SDValue Result = DAG.getNode(Opcode, dl, VTs, Op.getOperand(0));
+    SDValue Result = DAG.getNode(Intr.Opc0, dl, VTs, Op.getOperand(0));
 
     // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
     // Otherwise return the value from Rand, which is always 0, casted to i32.
@@ -12150,152 +12796,55 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
                       SDValue(Result.getNode(), 1) };
     SDValue isValid = DAG.getNode(X86ISD::CMOV, dl,
                                   DAG.getVTList(Op->getValueType(1), MVT::Glue),
-                                  Ops, array_lengthof(Ops));
+                                  Ops);
 
     // Return { result, isValid, chain }.
     return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
                        SDValue(Result.getNode(), 2));
   }
-  //int_gather(index, base, scale);
-  case Intrinsic::x86_avx512_gather_qpd_512:
-  case Intrinsic::x86_avx512_gather_qps_512:
-  case Intrinsic::x86_avx512_gather_dpd_512:
-  case Intrinsic::x86_avx512_gather_qpi_512:
-  case Intrinsic::x86_avx512_gather_qpq_512:
-  case Intrinsic::x86_avx512_gather_dpq_512:
-  case Intrinsic::x86_avx512_gather_dps_512:
-  case Intrinsic::x86_avx512_gather_dpi_512: {
-    unsigned Opc;
-    switch (IntNo) {
-    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
-    case Intrinsic::x86_avx512_gather_qps_512: Opc = X86::VGATHERQPSZrm; break;
-    case Intrinsic::x86_avx512_gather_qpd_512: Opc = X86::VGATHERQPDZrm; break;
-    case Intrinsic::x86_avx512_gather_dpd_512: Opc = X86::VGATHERDPDZrm; break;
-    case Intrinsic::x86_avx512_gather_dps_512: Opc = X86::VGATHERDPSZrm; break;
-    case Intrinsic::x86_avx512_gather_qpi_512: Opc = X86::VPGATHERQDZrm; break;
-    case Intrinsic::x86_avx512_gather_qpq_512: Opc = X86::VPGATHERQQZrm; break;
-    case Intrinsic::x86_avx512_gather_dpi_512: Opc = X86::VPGATHERDDZrm; break;
-    case Intrinsic::x86_avx512_gather_dpq_512: Opc = X86::VPGATHERDQZrm; break;
-    }
-    SDValue Chain = Op.getOperand(0);
-    SDValue Index = Op.getOperand(2);
-    SDValue Base  = Op.getOperand(3);
-    SDValue Scale = Op.getOperand(4);
-    return getGatherNode(Opc, Op, DAG, Base, Index, Scale, Chain, Subtarget);
-  }
-  //int_gather_mask(v1, mask, index, base, scale);
-  case Intrinsic::x86_avx512_gather_qps_mask_512:
-  case Intrinsic::x86_avx512_gather_qpd_mask_512:
-  case Intrinsic::x86_avx512_gather_dpd_mask_512:
-  case Intrinsic::x86_avx512_gather_dps_mask_512:
-  case Intrinsic::x86_avx512_gather_qpi_mask_512:
-  case Intrinsic::x86_avx512_gather_qpq_mask_512:
-  case Intrinsic::x86_avx512_gather_dpi_mask_512:
-  case Intrinsic::x86_avx512_gather_dpq_mask_512: {
-    unsigned Opc;
-    switch (IntNo) {
-    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
-    case Intrinsic::x86_avx512_gather_qps_mask_512:
-      Opc = X86::VGATHERQPSZrm; break;
-    case Intrinsic::x86_avx512_gather_qpd_mask_512:
-      Opc = X86::VGATHERQPDZrm; break;
-    case Intrinsic::x86_avx512_gather_dpd_mask_512:
-      Opc = X86::VGATHERDPDZrm; break;
-    case Intrinsic::x86_avx512_gather_dps_mask_512:
-      Opc = X86::VGATHERDPSZrm; break;
-    case Intrinsic::x86_avx512_gather_qpi_mask_512:
-      Opc = X86::VPGATHERQDZrm; break;
-    case Intrinsic::x86_avx512_gather_qpq_mask_512:
-      Opc = X86::VPGATHERQQZrm; break;
-    case Intrinsic::x86_avx512_gather_dpi_mask_512:
-      Opc = X86::VPGATHERDDZrm; break;
-    case Intrinsic::x86_avx512_gather_dpq_mask_512:
-      Opc = X86::VPGATHERDQZrm; break;
-    }
+  case GATHER: {
+  //gather(v1, mask, index, base, scale);
     SDValue Chain = Op.getOperand(0);
     SDValue Src   = Op.getOperand(2);
-    SDValue Mask  = Op.getOperand(3);
+    SDValue Base  = Op.getOperand(3);
     SDValue Index = Op.getOperand(4);
-    SDValue Base  = Op.getOperand(5);
+    SDValue Mask  = Op.getOperand(5);
     SDValue Scale = Op.getOperand(6);
-    return getMGatherNode(Opc, Op, DAG, Src, Mask, Base, Index, Scale, Chain,
+    return getGatherNode(Intr.Opc0, Op, DAG, Src, Mask, Base, Index, Scale, Chain,
                           Subtarget);
   }
-  //int_scatter(base, index, v1, scale);
-  case Intrinsic::x86_avx512_scatter_qpd_512:
-  case Intrinsic::x86_avx512_scatter_qps_512:
-  case Intrinsic::x86_avx512_scatter_dpd_512:
-  case Intrinsic::x86_avx512_scatter_qpi_512:
-  case Intrinsic::x86_avx512_scatter_qpq_512:
-  case Intrinsic::x86_avx512_scatter_dpq_512:
-  case Intrinsic::x86_avx512_scatter_dps_512:
-  case Intrinsic::x86_avx512_scatter_dpi_512: {
-    unsigned Opc;
-    switch (IntNo) {
-    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
-    case Intrinsic::x86_avx512_scatter_qpd_512:
-      Opc = X86::VSCATTERQPDZmr; break;
-    case Intrinsic::x86_avx512_scatter_qps_512:
-      Opc = X86::VSCATTERQPSZmr; break;
-    case Intrinsic::x86_avx512_scatter_dpd_512:
-      Opc = X86::VSCATTERDPDZmr; break;
-    case Intrinsic::x86_avx512_scatter_dps_512:
-      Opc = X86::VSCATTERDPSZmr; break;
-    case Intrinsic::x86_avx512_scatter_qpi_512:
-      Opc = X86::VPSCATTERQDZmr; break;
-    case Intrinsic::x86_avx512_scatter_qpq_512:
-      Opc = X86::VPSCATTERQQZmr; break;
-    case Intrinsic::x86_avx512_scatter_dpq_512:
-      Opc = X86::VPSCATTERDQZmr; break;
-    case Intrinsic::x86_avx512_scatter_dpi_512:
-      Opc = X86::VPSCATTERDDZmr; break;
-    }
-    SDValue Chain = Op.getOperand(0);
-    SDValue Base  = Op.getOperand(2);
-    SDValue Index = Op.getOperand(3);
-    SDValue Src   = Op.getOperand(4);
-    SDValue Scale = Op.getOperand(5);
-    return getScatterNode(Opc, Op, DAG, Src, Base, Index, Scale, Chain);
-  }
-  //int_scatter_mask(base, mask, index, v1, scale);
-  case Intrinsic::x86_avx512_scatter_qps_mask_512:
-  case Intrinsic::x86_avx512_scatter_qpd_mask_512:
-  case Intrinsic::x86_avx512_scatter_dpd_mask_512:
-  case Intrinsic::x86_avx512_scatter_dps_mask_512:
-  case Intrinsic::x86_avx512_scatter_qpi_mask_512:
-  case Intrinsic::x86_avx512_scatter_qpq_mask_512:
-  case Intrinsic::x86_avx512_scatter_dpi_mask_512:
-  case Intrinsic::x86_avx512_scatter_dpq_mask_512: {
-    unsigned Opc;
-    switch (IntNo) {
-    default: llvm_unreachable("Impossible intrinsic");  // Can't reach here.
-    case Intrinsic::x86_avx512_scatter_qpd_mask_512:
-      Opc = X86::VSCATTERQPDZmr; break;
-    case Intrinsic::x86_avx512_scatter_qps_mask_512:
-      Opc = X86::VSCATTERQPSZmr; break;
-    case Intrinsic::x86_avx512_scatter_dpd_mask_512:
-      Opc = X86::VSCATTERDPDZmr; break;
-    case Intrinsic::x86_avx512_scatter_dps_mask_512:
-      Opc = X86::VSCATTERDPSZmr; break;
-    case Intrinsic::x86_avx512_scatter_qpi_mask_512:
-      Opc = X86::VPSCATTERQDZmr; break;
-    case Intrinsic::x86_avx512_scatter_qpq_mask_512:
-      Opc = X86::VPSCATTERQQZmr; break;
-    case Intrinsic::x86_avx512_scatter_dpq_mask_512:
-      Opc = X86::VPSCATTERDQZmr; break;
-    case Intrinsic::x86_avx512_scatter_dpi_mask_512:
-      Opc = X86::VPSCATTERDDZmr; break;
-    }
+  case SCATTER: {
+  //scatter(base, mask, index, v1, scale);
     SDValue Chain = Op.getOperand(0);
     SDValue Base  = Op.getOperand(2);
     SDValue Mask  = Op.getOperand(3);
     SDValue Index = Op.getOperand(4);
     SDValue Src   = Op.getOperand(5);
     SDValue Scale = Op.getOperand(6);
-    return getMScatterNode(Opc, Op, DAG, Src, Mask, Base, Index, Scale, Chain);
+    return getScatterNode(Intr.Opc0, Op, DAG, Src, Mask, Base, Index, Scale, Chain);
+  }
+  case PREFETCH: {
+    SDValue Hint = Op.getOperand(6);
+    unsigned HintVal;
+    if (dyn_cast<ConstantSDNode> (Hint) == 0 ||
+        (HintVal = dyn_cast<ConstantSDNode> (Hint)->getZExtValue()) > 1)
+      llvm_unreachable("Wrong prefetch hint in intrinsic: should be 0 or 1");
+    unsigned Opcode = (HintVal ? Intr.Opc1 : Intr.Opc0);
+    SDValue Chain = Op.getOperand(0);
+    SDValue Mask  = Op.getOperand(2);
+    SDValue Index = Op.getOperand(3);
+    SDValue Base  = Op.getOperand(4);
+    SDValue Scale = Op.getOperand(5);
+    return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain);
+  }
+  // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
+  case RDTSC: {
+    SmallVector<SDValue, 2> Results;
+    getReadTimeStampCounter(Op.getNode(), dl, Intr.Opc0, DAG, Subtarget, Results);
+    return DAG.getMergeValues(Results, dl);
   }
   // XTEST intrinsics.
-  case Intrinsic::x86_xtest: {
+  case XTEST: {
     SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
     SDValue InTrans = DAG.getNode(X86ISD::XTEST, dl, VTs, Op.getOperand(0));
     SDValue SetCC = DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
@@ -12306,6 +12855,7 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget *Subtarget,
                        Ret, SDValue(InTrans.getNode(), 1));
   }
   }
+  llvm_unreachable("Unknown Intrinsic Type");
 }
 
 SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
@@ -12358,6 +12908,19 @@ SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
   return FrameAddr;
 }
 
+// FIXME? Maybe this could be a TableGen attribute on some registers and
+// this table could be generated automatically from RegInfo.
+unsigned X86TargetLowering::getRegisterByName(const char* RegName,
+                                              EVT VT) const {
+  unsigned Reg = StringSwitch<unsigned>(RegName)
+                       .Case("esp", X86::ESP)
+                       .Case("rsp", X86::RSP)
+                       .Default(0);
+  if (Reg)
+    return Reg;
+  report_fatal_error("Invalid register name global variable");
+}
+
 SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
                                                      SelectionDAG &DAG) const {
   const X86RegisterInfo *RegInfo =
@@ -12477,7 +13040,7 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
                                 MachinePointerInfo(TrmpAddr, 22),
                                 false, false, 0);
 
-    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 6);
+    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
   } else {
     const Function *Func =
       cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
@@ -12557,7 +13120,7 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
                                 MachinePointerInfo(TrmpAddr, 6),
                                 false, false, 1);
 
-    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 4);
+    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
   }
 }
 
@@ -12600,8 +13163,7 @@ SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
   SDValue Ops[] = { DAG.getEntryNode(), StackSlot };
   SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
                                           DAG.getVTList(MVT::Other),
-                                          Ops, array_lengthof(Ops), MVT::i16,
-                                          MMO);
+                                          Ops, MVT::i16, MMO);
 
   // Load FP Control Word from stack slot
   SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot,
@@ -12654,7 +13216,7 @@ static SDValue LowerCTLZ(SDValue Op, SelectionDAG &DAG) {
     DAG.getConstant(X86::COND_E, MVT::i8),
     Op.getValue(1)
   };
-  Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops, array_lengthof(Ops));
+  Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
 
   // Finally xor with NumBits-1.
   Op = DAG.getNode(ISD::XOR, dl, OpVT, Op, DAG.getConstant(NumBits-1, OpVT));
@@ -12706,7 +13268,7 @@ static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
     DAG.getConstant(X86::COND_E, MVT::i8),
     Op.getValue(1)
   };
-  return DAG.getNode(X86ISD::CMOV, dl, VT, Ops, array_lengthof(Ops));
+  return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
 }
 
 // Lower256IntArith - Break a 256-bit integer operation into two new 128-bit
@@ -12824,59 +13386,104 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget *Subtarget,
   return DAG.getNode(ISD::ADD, dl, VT, Res, AhiBlo);
 }
 
-static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) {
-  MVT VT = Op.getSimpleValueType();
-  MVT EltTy = VT.getVectorElementType();
-  unsigned NumElts = VT.getVectorNumElements();
-  SDValue N0 = Op.getOperand(0);
+SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
+  assert(Subtarget->isTargetWin64() && "Unexpected target");
+  EVT VT = Op.getValueType();
+  assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
+         "Unexpected return type for lowering");
+
+  RTLIB::Libcall LC;
+  bool isSigned;
+  switch (Op->getOpcode()) {
+  default: llvm_unreachable("Unexpected request for libcall!");
+  case ISD::SDIV:      isSigned = true;  LC = RTLIB::SDIV_I128;    break;
+  case ISD::UDIV:      isSigned = false; LC = RTLIB::UDIV_I128;    break;
+  case ISD::SREM:      isSigned = true;  LC = RTLIB::SREM_I128;    break;
+  case ISD::UREM:      isSigned = false; LC = RTLIB::UREM_I128;    break;
+  case ISD::SDIVREM:   isSigned = true;  LC = RTLIB::SDIVREM_I128; break;
+  case ISD::UDIVREM:   isSigned = false; LC = RTLIB::UDIVREM_I128; break;
+  }
+
   SDLoc dl(Op);
+  SDValue InChain = DAG.getEntryNode();
 
-  // Lower sdiv X, pow2-const.
-  BuildVectorSDNode *C = dyn_cast<BuildVectorSDNode>(Op.getOperand(1));
-  if (!C)
-    return SDValue();
+  TargetLowering::ArgListTy Args;
+  TargetLowering::ArgListEntry Entry;
+  for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
+    EVT ArgVT = Op->getOperand(i).getValueType();
+    assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
+           "Unexpected argument type for lowering");
+    SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
+    Entry.Node = StackPtr;
+    InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr, MachinePointerInfo(),
+                           false, false, 16);
+    Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
+    Entry.Ty = PointerType::get(ArgTy,0);
+    Entry.isSExt = false;
+    Entry.isZExt = false;
+    Args.push_back(Entry);
+  }
+
+  SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
+                                         getPointerTy());
 
-  APInt SplatValue, SplatUndef;
-  unsigned SplatBitSize;
-  bool HasAnyUndefs;
-  if (!C->isConstantSplat(SplatValue, SplatUndef, SplatBitSize,
-                          HasAnyUndefs) ||
-      EltTy.getSizeInBits() < SplatBitSize)
-    return SDValue();
+  TargetLowering::CallLoweringInfo CLI(DAG);
+  CLI.setDebugLoc(dl).setChain(InChain)
+    .setCallee(getLibcallCallingConv(LC),
+               static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()),
+               Callee, &Args, 0)
+    .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned);
+
+  std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
+  return DAG.getNode(ISD::BITCAST, dl, VT, CallInfo.first);
+}
+
+static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget *Subtarget,
+                             SelectionDAG &DAG) {
+  SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1);
+  EVT VT = Op0.getValueType();
+  SDLoc dl(Op);
 
-  if ((SplatValue != 0) &&
-      (SplatValue.isPowerOf2() || (-SplatValue).isPowerOf2())) {
-    unsigned Lg2 = SplatValue.countTrailingZeros();
-    // Splat the sign bit.
-    SmallVector<SDValue, 16> Sz(NumElts,
-                                DAG.getConstant(EltTy.getSizeInBits() - 1,
-                                                EltTy));
-    SDValue SGN = DAG.getNode(ISD::SRA, dl, VT, N0,
-                              DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Sz[0],
-                                          NumElts));
-    // Add (N0 < 0) ? abs2 - 1 : 0;
-    SmallVector<SDValue, 16> Amt(NumElts,
-                                 DAG.getConstant(EltTy.getSizeInBits() - Lg2,
-                                                 EltTy));
-    SDValue SRL = DAG.getNode(ISD::SRL, dl, VT, SGN,
-                              DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Amt[0],
-                                          NumElts));
-    SDValue ADD = DAG.getNode(ISD::ADD, dl, VT, N0, SRL);
-    SmallVector<SDValue, 16> Lg2Amt(NumElts, DAG.getConstant(Lg2, EltTy));
-    SDValue SRA = DAG.getNode(ISD::SRA, dl, VT, ADD,
-                              DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Lg2Amt[0],
-                                          NumElts));
-
-    // If we're dividing by a positive value, we're done.  Otherwise, we must
-    // negate the result.
-    if (SplatValue.isNonNegative())
-      return SRA;
-
-    SmallVector<SDValue, 16> V(NumElts, DAG.getConstant(0, EltTy));
-    SDValue Zero = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], NumElts);
-    return DAG.getNode(ISD::SUB, dl, VT, Zero, SRA);
+  assert((VT == MVT::v4i32 && Subtarget->hasSSE2()) ||
+         (VT == MVT::v8i32 && Subtarget->hasInt256()));
+
+  // Get the high parts.
+  const int Mask[] = {1, 2, 3, 4, 5, 6, 7, 8};
+  SDValue Hi0 = DAG.getVectorShuffle(VT, dl, Op0, Op0, Mask);
+  SDValue Hi1 = DAG.getVectorShuffle(VT, dl, Op1, Op1, Mask);
+
+  // Emit two multiplies, one for the lower 2 ints and one for the higher 2
+  // ints.
+  MVT MulVT = VT == MVT::v4i32 ? MVT::v2i64 : MVT::v4i64;
+  bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI;
+  unsigned Opcode =
+      (!IsSigned || !Subtarget->hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
+  SDValue Mul1 = DAG.getNode(ISD::BITCAST, dl, VT,
+                             DAG.getNode(Opcode, dl, MulVT, Op0, Op1));
+  SDValue Mul2 = DAG.getNode(ISD::BITCAST, dl, VT,
+                             DAG.getNode(Opcode, dl, MulVT, Hi0, Hi1));
+
+  // Shuffle it back into the right order.
+  const int HighMask[] = {1, 5, 3, 7, 9, 13, 11, 15};
+  SDValue Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
+  const int LowMask[] = {0, 4, 2, 6, 8, 12, 10, 14};
+  SDValue Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
+
+  // If we have a signed multiply but no PMULDQ fix up the high parts of a
+  // unsigned multiply.
+  if (IsSigned && !Subtarget->hasSSE41()) {
+    SDValue ShAmt =
+        DAG.getConstant(31, DAG.getTargetLoweringInfo().getShiftAmountTy(VT));
+    SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
+                             DAG.getNode(ISD::SRA, dl, VT, Op0, ShAmt), Op1);
+    SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
+                             DAG.getNode(ISD::SRA, dl, VT, Op1, ShAmt), Op0);
+
+    SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
+    Highs = DAG.getNode(ISD::SUB, dl, VT, Highs, Fixup);
   }
-  return SDValue();
+
+  return DAG.getNode(ISD::MERGE_VALUES, dl, Op.getValueType(), Highs, Lows);
 }
 
 static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
@@ -12920,7 +13527,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
                                      DAG.getConstant(uint8_t(-1U << ShiftAmt),
                                                      MVT::i8));
           return DAG.getNode(ISD::AND, dl, VT, SHL,
-                             DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16));
+                             DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
         }
         if (Op.getOpcode() == ISD::SRL) {
           // Make a large shift.
@@ -12933,7 +13540,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
                                      DAG.getConstant(uint8_t(-1U) >> ShiftAmt,
                                                      MVT::i8));
           return DAG.getNode(ISD::AND, dl, VT, SRL,
-                             DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16));
+                             DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
         }
         if (Op.getOpcode() == ISD::SRA) {
           if (ShiftAmt == 7) {
@@ -12946,7 +13553,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
           SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
           SmallVector<SDValue, 16> V(16, DAG.getConstant(128 >> ShiftAmt,
                                                          MVT::i8));
-          SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 16);
+          SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V);
           Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
           Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
           return Res;
@@ -12966,7 +13573,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
                                      DAG.getConstant(uint8_t(-1U << ShiftAmt),
                                                      MVT::i8));
           return DAG.getNode(ISD::AND, dl, VT, SHL,
-                             DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32));
+                             DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
         }
         if (Op.getOpcode() == ISD::SRL) {
           // Make a large shift.
@@ -12979,7 +13586,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
                                      DAG.getConstant(uint8_t(-1U) >> ShiftAmt,
                                                      MVT::i8));
           return DAG.getNode(ISD::AND, dl, VT, SRL,
-                             DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32));
+                             DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V));
         }
         if (Op.getOpcode() == ISD::SRA) {
           if (ShiftAmt == 7) {
@@ -12992,7 +13599,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
           SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
           SmallVector<SDValue, 32> V(32, DAG.getConstant(128 >> ShiftAmt,
                                                          MVT::i8));
-          SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32);
+          SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, V);
           Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
           Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
           return Res;
@@ -13014,7 +13621,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
     uint64_t ShiftAmt = 0;
     for (unsigned i = 0; i != Ratio; ++i) {
       ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i));
-      if (C == 0)
+      if (!C)
         return SDValue();
       // 6 == Log2(64)
       ShiftAmt |= C->getZExtValue() << (i * (1 << (6 - RatioInLog2)));
@@ -13025,7 +13632,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
       for (unsigned j = 0; j != Ratio; ++j) {
         ConstantSDNode *C =
           dyn_cast<ConstantSDNode>(Amt.getOperand(i + j));
-        if (C == 0)
+        if (!C)
           return SDValue();
         // 6 == Log2(64)
         ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2)));
@@ -13107,7 +13714,7 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
                BaseShAmt = InVec.getOperand(1);
            }
         }
-        if (BaseShAmt.getNode() == 0)
+        if (!BaseShAmt.getNode())
           BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Amt,
                                   DAG.getIntPtrConstant(0));
       }
@@ -13260,7 +13867,7 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
       }
       Elts.push_back(DAG.getConstant(One.shl(ShAmt), SVT));
     }
-    SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &Elts[0], NumElems);
+    SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Elts);
     return DAG.getNode(ISD::MUL, dl, VT, R, BV);
   }
 
@@ -13274,6 +13881,79 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
     return DAG.getNode(ISD::MUL, dl, VT, Op, R);
   }
 
+  // If possible, lower this shift as a sequence of two shifts by
+  // constant plus a MOVSS/MOVSD instead of scalarizing it.
+  // Example:
+  //   (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
+  //
+  // Could be rewritten as:
+  //   (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
+  //
+  // The advantage is that the two shifts from the example would be
+  // lowered as X86ISD::VSRLI nodes. This would be cheaper than scalarizing
+  // the vector shift into four scalar shifts plus four pairs of vector
+  // insert/extract.
+  if ((VT == MVT::v8i16 || VT == MVT::v4i32) &&
+      ISD::isBuildVectorOfConstantSDNodes(Amt.getNode())) {
+    unsigned TargetOpcode = X86ISD::MOVSS;
+    bool CanBeSimplified;
+    // The splat value for the first packed shift (the 'X' from the example).
+    SDValue Amt1 = Amt->getOperand(0);
+    // The splat value for the second packed shift (the 'Y' from the example).
+    SDValue Amt2 = (VT == MVT::v4i32) ? Amt->getOperand(1) :
+                                        Amt->getOperand(2);
+
+    // See if it is possible to replace this node with a sequence of
+    // two shifts followed by a MOVSS/MOVSD
+    if (VT == MVT::v4i32) {
+      // Check if it is legal to use a MOVSS.
+      CanBeSimplified = Amt2 == Amt->getOperand(2) &&
+                        Amt2 == Amt->getOperand(3);
+      if (!CanBeSimplified) {
+        // Otherwise, check if we can still simplify this node using a MOVSD.
+        CanBeSimplified = Amt1 == Amt->getOperand(1) &&
+                          Amt->getOperand(2) == Amt->getOperand(3);
+        TargetOpcode = X86ISD::MOVSD;
+        Amt2 = Amt->getOperand(2);
+      }
+    } else {
+      // Do similar checks for the case where the machine value type
+      // is MVT::v8i16.
+      CanBeSimplified = Amt1 == Amt->getOperand(1);
+      for (unsigned i=3; i != 8 && CanBeSimplified; ++i)
+        CanBeSimplified = Amt2 == Amt->getOperand(i);
+
+      if (!CanBeSimplified) {
+        TargetOpcode = X86ISD::MOVSD;
+        CanBeSimplified = true;
+        Amt2 = Amt->getOperand(4);
+        for (unsigned i=0; i != 4 && CanBeSimplified; ++i)
+          CanBeSimplified = Amt1 == Amt->getOperand(i);
+        for (unsigned j=4; j != 8 && CanBeSimplified; ++j)
+          CanBeSimplified = Amt2 == Amt->getOperand(j);
+      }
+    }
+    
+    if (CanBeSimplified && isa<ConstantSDNode>(Amt1) &&
+        isa<ConstantSDNode>(Amt2)) {
+      // Replace this node with two shifts followed by a MOVSS/MOVSD.
+      EVT CastVT = MVT::v4i32;
+      SDValue Splat1 = 
+        DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), VT);
+      SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1);
+      SDValue Splat2 = 
+        DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), VT);
+      SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2);
+      if (TargetOpcode == X86ISD::MOVSD)
+        CastVT = MVT::v2i64;
+      SDValue BitCast1 = DAG.getNode(ISD::BITCAST, dl, CastVT, Shift1);
+      SDValue BitCast2 = DAG.getNode(ISD::BITCAST, dl, CastVT, Shift2);
+      SDValue Result = getTargetShuffleNode(TargetOpcode, dl, CastVT, BitCast2,
+                                            BitCast1, DAG);
+      return DAG.getNode(ISD::BITCAST, dl, VT, Result);
+    }
+  }
+
   if (VT == MVT::v16i8 && Op->getOpcode() == ISD::SHL) {
     assert(Subtarget->hasSSE2() && "Need SSE2 for pslli/pcmpeq.");
 
@@ -13351,10 +14031,8 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget* Subtarget,
       for (unsigned i = NumElems/2; i != NumElems; ++i)
         Amt2Csts.push_back(Amt->getOperand(i));
 
-      Amt1 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT,
-                                 &Amt1Csts[0], NumElems/2);
-      Amt2 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT,
-                                 &Amt2Csts[0], NumElems/2);
+      Amt1 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Amt1Csts);
+      Amt2 = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Amt2Csts);
     } else {
       // Variable shift amount
       Amt1 = Extract128BitVector(Amt, 0, DAG, dl);
@@ -13585,35 +14263,47 @@ static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget *Subtarget,
   SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
   MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
   SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
-                                           Ops, array_lengthof(Ops), T, MMO);
+                                           Ops, T, MMO);
   SDValue cpOut =
     DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
   return cpOut;
 }
 
-static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget *Subtarget,
-                                     SelectionDAG &DAG) {
-  assert(Subtarget->is64Bit() && "Result not type legalized?");
-  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
-  SDValue TheChain = Op.getOperand(0);
-  SDLoc dl(Op);
-  SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1);
-  SDValue rax = DAG.getCopyFromReg(rd, dl, X86::RAX, MVT::i64, rd.getValue(1));
-  SDValue rdx = DAG.getCopyFromReg(rax.getValue(1), dl, X86::RDX, MVT::i64,
-                                   rax.getValue(2));
-  SDValue Tmp = DAG.getNode(ISD::SHL, dl, MVT::i64, rdx,
-                            DAG.getConstant(32, MVT::i8));
-  SDValue Ops[] = {
-    DAG.getNode(ISD::OR, dl, MVT::i64, rax, Tmp),
-    rdx.getValue(1)
-  };
-  return DAG.getMergeValues(Ops, array_lengthof(Ops), dl);
-}
-
 static SDValue LowerBITCAST(SDValue Op, const X86Subtarget *Subtarget,
                             SelectionDAG &DAG) {
   MVT SrcVT = Op.getOperand(0).getSimpleValueType();
   MVT DstVT = Op.getSimpleValueType();
+
+  if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8) {
+    assert(Subtarget->hasSSE2() && "Requires at least SSE2!");
+    if (DstVT != MVT::f64)
+      // This conversion needs to be expanded.
+      return SDValue();
+
+    SDValue InVec = Op->getOperand(0);
+    SDLoc dl(Op);
+    unsigned NumElts = SrcVT.getVectorNumElements();
+    EVT SVT = SrcVT.getVectorElementType();
+
+    // Widen the vector in input in the case of MVT::v2i32.
+    // Example: from MVT::v2i32 to MVT::v4i32.
+    SmallVector<SDValue, 16> Elts;
+    for (unsigned i = 0, e = NumElts; i != e; ++i)
+      Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, InVec,
+                                 DAG.getIntPtrConstant(i)));
+
+    // Explicitly mark the extra elements as Undef.
+    SDValue Undef = DAG.getUNDEF(SVT);
+    for (unsigned i = NumElts, e = NumElts * 2; i != e; ++i)
+      Elts.push_back(Undef);
+
+    EVT NewVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
+    SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, Elts);
+    SDValue ToV2F64 = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, BV);
+    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, ToV2F64,
+                       DAG.getIntPtrConstant(0));
+  }
+
   assert(Subtarget->is64Bit() && !Subtarget->hasSSE2() &&
          Subtarget->hasMMX() && "Unexpected custom BITCAST");
   assert((DstVT == MVT::i64 ||
@@ -13641,8 +14331,7 @@ static SDValue LowerLOAD_SUB(SDValue Op, SelectionDAG &DAG) {
                        cast<AtomicSDNode>(Node)->getMemoryVT(),
                        Node->getOperand(0),
                        Node->getOperand(1), negOp,
-                       cast<AtomicSDNode>(Node)->getSrcValue(),
-                       cast<AtomicSDNode>(Node)->getAlignment(),
+                       cast<AtomicSDNode>(Node)->getMemOperand(),
                        cast<AtomicSDNode>(Node)->getOrdering(),
                        cast<AtomicSDNode>(Node)->getSynchScope());
 }
@@ -13730,12 +14419,11 @@ static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget *Subtarget,
   Type *RetTy = isF64
     ? (Type*)StructType::get(ArgTy, ArgTy, NULL)
     : (Type*)VectorType::get(ArgTy, 4);
-  TargetLowering::
-    CallLoweringInfo CLI(DAG.getEntryNode(), RetTy,
-                         false, false, false, false, 0,
-                         CallingConv::C, /*isTaillCall=*/false,
-                         /*doesNotRet=*/false, /*isReturnValueUsed*/true,
-                         Callee, Args, DAG, dl);
+
+  TargetLowering::CallLoweringInfo CLI(DAG);
+  CLI.setDebugLoc(dl).setChain(DAG.getEntryNode())
+    .setCallee(CallingConv::C, RetTy, Callee, &Args, 0);
+
   std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
 
   if (isF64)
@@ -13764,6 +14452,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::BUILD_VECTOR:       return LowerBUILD_VECTOR(Op, DAG);
   case ISD::CONCAT_VECTORS:     return LowerCONCAT_VECTORS(Op, DAG);
   case ISD::VECTOR_SHUFFLE:     return LowerVECTOR_SHUFFLE(Op, DAG);
+  case ISD::VSELECT:            return LowerVSELECT(Op, DAG);
   case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
   case ISD::INSERT_VECTOR_ELT:  return LowerINSERT_VECTOR_ELT(Op, DAG);
   case ISD::EXTRACT_SUBVECTOR:  return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
@@ -13815,6 +14504,8 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::CTLZ_ZERO_UNDEF:    return LowerCTLZ_ZERO_UNDEF(Op, DAG);
   case ISD::CTTZ:               return LowerCTTZ(Op, DAG);
   case ISD::MUL:                return LowerMUL(Op, Subtarget, DAG);
+  case ISD::UMUL_LOHI:
+  case ISD::SMUL_LOHI:          return LowerMUL_LOHI(Op, Subtarget, DAG);
   case ISD::SRA:
   case ISD::SRL:
   case ISD::SHL:                return LowerShift(Op, Subtarget, DAG);
@@ -13832,7 +14523,6 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::SUBE:               return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
   case ISD::ADD:                return LowerADD(Op, DAG);
   case ISD::SUB:                return LowerSUB(Op, DAG);
-  case ISD::SDIV:               return LowerSDIV(Op, DAG);
   case ISD::FSINCOS:            return LowerFSINCOS(Op, Subtarget, DAG);
   }
 }
@@ -13875,10 +14565,10 @@ ReplaceATOMIC_BINARY_64(SDNode *Node, SmallVectorImpl<SDValue>&Results,
   SDValue Ops[] = { Chain, In1, In2L, In2H };
   SDVTList Tys = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other);
   SDValue Result =
-    DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops, array_lengthof(Ops), MVT::i64,
+    DAG.getMemIntrinsicNode(NewOp, dl, Tys, Ops, MVT::i64,
                             cast<MemSDNode>(Node)->getMemOperand());
   SDValue OpsF[] = { Result.getValue(0), Result.getValue(1)};
-  Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF, 2));
+  Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, OpsF));
   Results.push_back(Result.getValue(2));
 }
 
@@ -13899,6 +14589,16 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
   case ISD::SUBE:
     // We don't want to expand or promote these.
     return;
+  case ISD::SDIV:
+  case ISD::UDIV:
+  case ISD::SREM:
+  case ISD::UREM:
+  case ISD::SDIVREM:
+  case ISD::UDIVREM: {
+    SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
+    Results.push_back(V);
+    return;
+  }
   case ISD::FP_TO_SINT:
   case ISD::FP_TO_UINT: {
     bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
@@ -13909,10 +14609,10 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
     std::pair<SDValue,SDValue> Vals =
         FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true);
     SDValue FIST = Vals.first, StackSlot = Vals.second;
-    if (FIST.getNode() != 0) {
+    if (FIST.getNode()) {
       EVT VT = N->getValueType(0);
       // Return a load from the stack slot.
-      if (StackSlot.getNode() != 0)
+      if (StackSlot.getNode())
         Results.push_back(DAG.getLoad(VT, dl, FIST, StackSlot,
                                       MachinePointerInfo(),
                                       false, false, false, 0));
@@ -13945,20 +14645,22 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
     Results.push_back(V);
     return;
   }
+  case ISD::INTRINSIC_W_CHAIN: {
+    unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+    switch (IntNo) {
+    default : llvm_unreachable("Do not know how to custom type "
+                               "legalize this intrinsic operation!");
+    case Intrinsic::x86_rdtsc:
+      return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
+                                     Results);
+    case Intrinsic::x86_rdtscp:
+      return getReadTimeStampCounter(N, dl, X86ISD::RDTSCP_DAG, DAG, Subtarget,
+                                     Results);
+    }
+  }
   case ISD::READCYCLECOUNTER: {
-    SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
-    SDValue TheChain = N->getOperand(0);
-    SDValue rd = DAG.getNode(X86ISD::RDTSC_DAG, dl, Tys, &TheChain, 1);
-    SDValue eax = DAG.getCopyFromReg(rd, dl, X86::EAX, MVT::i32,
-                                     rd.getValue(1));
-    SDValue edx = DAG.getCopyFromReg(eax.getValue(1), dl, X86::EDX, MVT::i32,
-                                     eax.getValue(2));
-    // Use a buildpair to merge the two 32-bit values into a 64-bit one.
-    SDValue Ops[] = { eax, edx };
-    Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Ops,
-                                  array_lengthof(Ops)));
-    Results.push_back(edx.getValue(1));
-    return;
+    return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
+                                   Results);
   }
   case ISD::ATOMIC_CMP_SWAP: {
     EVT T = N->getValueType(0);
@@ -13994,8 +14696,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
     MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
     unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_DAG :
                                   X86ISD::LCMPXCHG8_DAG;
-    SDValue Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys,
-                                             Ops, array_lengthof(Ops), T, MMO);
+    SDValue Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
     SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
                                         Regs64bit ? X86::RAX : X86::EAX,
                                         HalfT, Result.getValue(1));
@@ -14003,7 +14704,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
                                         Regs64bit ? X86::RDX : X86::EDX,
                                         HalfT, cpOutL.getValue(2));
     SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
-    Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF, 2));
+    Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
     Results.push_back(cpOutH.getValue(1));
     return;
   }
@@ -14058,14 +14759,39 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
     ReplaceATOMIC_BINARY_64(N, Results, DAG, Opc);
     return;
   }
-  case ISD::ATOMIC_LOAD:
+  case ISD::ATOMIC_LOAD: {
     ReplaceATOMIC_LOAD(N, Results, DAG);
+    return;
+  }
+  case ISD::BITCAST: {
+    assert(Subtarget->hasSSE2() && "Requires at least SSE2!");
+    EVT DstVT = N->getValueType(0);
+    EVT SrcVT = N->getOperand(0)->getValueType(0);
+
+    if (SrcVT != MVT::f64 ||
+        (DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8))
+      return;
+
+    unsigned NumElts = DstVT.getVectorNumElements();
+    EVT SVT = DstVT.getVectorElementType();
+    EVT WiderVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
+    SDValue Expanded = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
+                                   MVT::v2f64, N->getOperand(0));
+    SDValue ToVecInt = DAG.getNode(ISD::BITCAST, dl, WiderVT, Expanded);
+
+    SmallVector<SDValue, 8> Elts;
+    for (unsigned i = 0, e = NumElts; i != e; ++i)
+      Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT,
+                                   ToVecInt, DAG.getIntPtrConstant(i)));
+
+    Results.push_back(DAG.getNode(ISD::BUILD_VECTOR, dl, DstVT, Elts));
+  }
   }
 }
 
 const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   switch (Opcode) {
-  default: return NULL;
+  default: return nullptr;
   case X86ISD::BSF:                return "X86ISD::BSF";
   case X86ISD::BSR:                return "X86ISD::BSR";
   case X86ISD::SHLD:               return "X86ISD::SHLD";
@@ -14176,7 +14902,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case X86ISD::OR:                 return "X86ISD::OR";
   case X86ISD::XOR:                return "X86ISD::XOR";
   case X86ISD::AND:                return "X86ISD::AND";
-  case X86ISD::BZHI:               return "X86ISD::BZHI";
   case X86ISD::BEXTR:              return "X86ISD::BEXTR";
   case X86ISD::MUL_IMM:            return "X86ISD::MUL_IMM";
   case X86ISD::PTEST:              return "X86ISD::PTEST";
@@ -14203,6 +14928,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case X86ISD::UNPCKH:             return "X86ISD::UNPCKH";
   case X86ISD::VBROADCAST:         return "X86ISD::VBROADCAST";
   case X86ISD::VBROADCASTM:        return "X86ISD::VBROADCASTM";
+  case X86ISD::VEXTRACT:           return "X86ISD::VEXTRACT";
   case X86ISD::VPERMILP:           return "X86ISD::VPERMILP";
   case X86ISD::VPERM2X128:         return "X86ISD::VPERM2X128";
   case X86ISD::VPERMV:             return "X86ISD::VPERMV";
@@ -14210,6 +14936,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case X86ISD::VPERMIV3:           return "X86ISD::VPERMIV3";
   case X86ISD::VPERMI:             return "X86ISD::VPERMI";
   case X86ISD::PMULUDQ:            return "X86ISD::PMULUDQ";
+  case X86ISD::PMULDQ:             return "X86ISD::PMULDQ";
   case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
   case X86ISD::VAARG_64:           return "X86ISD::VAARG_64";
   case X86ISD::WIN_ALLOCA:         return "X86ISD::WIN_ALLOCA";
@@ -14240,7 +14967,7 @@ bool X86TargetLowering::isLegalAddressingMode(const AddrMode &AM,
   Reloc::Model R = getTargetMachine().getRelocationModel();
 
   // X86 allows a sign-extended 32-bit immediate field as a displacement.
-  if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != NULL))
+  if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
     return false;
 
   if (AM.BaseGV) {
@@ -14418,7 +15145,23 @@ X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
   if (VT.getSizeInBits() == 64)
     return false;
 
-  // FIXME: pshufb, blends, shifts.
+  // If this is a single-input shuffle with no 128 bit lane crossings we can
+  // lower it into pshufb.
+  if ((SVT.is128BitVector() && Subtarget->hasSSSE3()) ||
+      (SVT.is256BitVector() && Subtarget->hasInt256())) {
+    bool isLegal = true;
+    for (unsigned I = 0, E = M.size(); I != E; ++I) {
+      if (M[I] >= (int)SVT.getVectorNumElements() ||
+          ShuffleCrosses128bitLane(SVT, I, M[I])) {
+        isLegal = false;
+        break;
+      }
+    }
+    if (isLegal)
+      return true;
+  }
+
+  // FIXME: blends, shifts.
   return (SVT.getVectorNumElements() == 2 ||
           ShuffleVectorSDNode::isSplatMask(&M[0], VT) ||
           isMOVLMask(M, SVT) ||
@@ -15366,7 +16109,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(
     OffsetDestReg = 0; // unused
     OverflowDestReg = DestReg;
 
-    offsetMBB = NULL;
+    offsetMBB = nullptr;
     overflowMBB = thisMBB;
     endMBB = thisMBB;
   } else {
@@ -15736,7 +16479,7 @@ X86TargetLowering::EmitLoweredSegAlloca(MachineInstr *MI, MachineBasicBlock *BB,
   MachineFunction *MF = BB->getParent();
   const BasicBlock *LLVM_BB = BB->getBasicBlock();
 
-  assert(getTargetMachine().Options.EnableSegmentedStacks);
+  assert(MF->shouldSplitStack());
 
   unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
   unsigned TlsOffset = Is64Bit ? 0x70 : 0x30;
@@ -16509,11 +17252,11 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
 //                           X86 Optimization Hooks
 //===----------------------------------------------------------------------===//
 
-void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
-                                                       APInt &KnownZero,
-                                                       APInt &KnownOne,
-                                                       const SelectionDAG &DAG,
-                                                       unsigned Depth) const {
+void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
+                                                      APInt &KnownZero,
+                                                      APInt &KnownOne,
+                                                      const SelectionDAG &DAG,
+                                                      unsigned Depth) const {
   unsigned BitWidth = KnownZero.getBitWidth();
   unsigned Opc = Op.getOpcode();
   assert((Opc >= ISD::BUILTIN_OP_END ||
@@ -16576,8 +17319,10 @@ void X86TargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
   }
 }
 
-unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op,
-                                                         unsigned Depth) const {
+unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
+  SDValue Op,
+  const SelectionDAG &,
+  unsigned Depth) const {
   // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
   if (Op.getOpcode() == X86ISD::SETCC_CARRY)
     return Op.getValueType().getScalarType().getSizeInBits();
@@ -16679,7 +17424,6 @@ static SDValue PerformShuffleCombine256(SDNode *N, SelectionDAG &DAG,
         SDValue Ops[] = { Ld->getChain(), Ld->getBasePtr() };
         SDValue ResNode =
           DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
-                                  array_lengthof(Ops),
                                   Ld->getMemoryVT(),
                                   Ld->getPointerInfo(),
                                   Ld->getAlignment(),
@@ -17036,6 +17780,51 @@ matchIntegerMINMAX(SDValue Cond, EVT VT, SDValue LHS, SDValue RHS,
   return std::make_pair(Opc, NeedSplit);
 }
 
+static SDValue
+TransformVSELECTtoBlendVECTOR_SHUFFLE(SDNode *N, SelectionDAG &DAG,
+                                      const X86Subtarget *Subtarget) {
+  SDLoc dl(N);
+  SDValue Cond = N->getOperand(0);
+  SDValue LHS = N->getOperand(1);
+  SDValue RHS = N->getOperand(2);
+
+  if (Cond.getOpcode() == ISD::SIGN_EXTEND) {
+    SDValue CondSrc = Cond->getOperand(0);
+    if (CondSrc->getOpcode() == ISD::SIGN_EXTEND_INREG)
+      Cond = CondSrc->getOperand(0);
+  }
+
+  MVT VT = N->getSimpleValueType(0);
+  MVT EltVT = VT.getVectorElementType();
+  unsigned NumElems = VT.getVectorNumElements();
+  // There is no blend with immediate in AVX-512.
+  if (VT.is512BitVector())
+    return SDValue();
+
+  if (!Subtarget->hasSSE41() || EltVT == MVT::i8)
+    return SDValue();
+  if (!Subtarget->hasInt256() && VT == MVT::v16i16)
+    return SDValue();
+
+  if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
+    return SDValue();
+
+  unsigned MaskValue = 0;
+  if (!BUILD_VECTORtoBlendMask(cast<BuildVectorSDNode>(Cond), MaskValue))
+    return SDValue();
+
+  SmallVector<int, 8> ShuffleMask(NumElems, -1);
+  for (unsigned i = 0; i < NumElems; ++i) {
+    // Be sure we emit undef where we can.
+    if (Cond.getOperand(i)->getOpcode() == ISD::UNDEF)
+      ShuffleMask[i] = -1;
+    else
+      ShuffleMask[i] = i + NumElems * ((MaskValue >> i) & 1);
+  }
+
+  return DAG.getVectorShuffle(VT, dl, LHS, RHS, &ShuffleMask[0]);
+}
+
 /// PerformSELECTCombine - Do target-specific dag combines on SELECT and VSELECT
 /// nodes.
 static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
@@ -17378,7 +18167,7 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
 
       // Another special case: If C was a sign bit, the sub has been
       // canonicalized into a xor.
-      // FIXME: Would it be better to use ComputeMaskedBits to determine whether
+      // FIXME: Would it be better to use computeKnownBits to determine whether
       //        it's safe to decanonicalize the xor?
       // x s< 0 ? x^C : 0 --> subus x, C
       if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR &&
@@ -17544,7 +18333,13 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
   // depend on the highest bit in each word. Try to use SimplifyDemandedBits
   // to simplify previous instructions.
   if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
-      !DCI.isBeforeLegalize() && TLI.isOperationLegal(ISD::VSELECT, VT)) {
+      !DCI.isBeforeLegalize() &&
+      // We explicitly check against v8i16 and v16i16 because, although
+      // they're marked as Custom, they might only be legal when Cond is a
+      // build_vector of constants. This will be taken care in a later
+      // condition.
+      (TLI.isOperationLegalOrCustom(ISD::VSELECT, VT) && VT != MVT::v16i16 &&
+       VT != MVT::v8i16)) {
     unsigned BitWidth = Cond.getValueType().getScalarType().getSizeInBits();
 
     // Don't optimize vector selects that map to mask-registers.
@@ -17571,6 +18366,23 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG,
       DCI.CommitTargetLoweringOpt(TLO);
   }
 
+  // We should generate an X86ISD::BLENDI from a vselect if its argument
+  // is a sign_extend_inreg of an any_extend of a BUILD_VECTOR of
+  // constants. This specific pattern gets generated when we split a
+  // selector for a 512 bit vector in a machine without AVX512 (but with
+  // 256-bit vectors), during legalization:
+  //
+  // (vselect (sign_extend (any_extend (BUILD_VECTOR)) i1) LHS RHS)
+  //
+  // Iff we find this pattern and the build_vectors are built from
+  // constants, we translate the vselect into a shuffle_vector that we
+  // know will be matched by LowerVECTOR_SHUFFLEtoBlend.
+  if (N->getOpcode() == ISD::VSELECT && !DCI.isBeforeLegalize()) {
+    SDValue Shuffle = TransformVSELECTtoBlendVECTOR_SHUFFLE(N, DAG, Subtarget);
+    if (Shuffle.getNode())
+      return Shuffle;
+  }
+
   return SDValue();
 }
 
@@ -17605,7 +18417,7 @@ static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
   SDValue Op2 = Cmp.getOperand(1);
 
   SDValue SetCC;
-  const ConstantSDNode* C = 0;
+  const ConstantSDNode* C = nullptr;
   bool needOppositeCond = (CC == X86::COND_E);
   bool checkAgainstTrue = false; // Is it a comparison against 1?
 
@@ -17740,8 +18552,7 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
       (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC))) {
     SDValue Ops[] = { FalseOp, TrueOp,
                       DAG.getConstant(CC, MVT::i8), Flags };
-    return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(),
-                       Ops, array_lengthof(Ops));
+    return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
   }
 
   // If this is a select between two integer constants, try to do some
@@ -17856,7 +18667,7 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
     // the DCI.xxxx conditions are provided to postpone the optimization as
     // late as possible.
 
-    ConstantSDNode *CmpAgainst = 0;
+    ConstantSDNode *CmpAgainst = nullptr;
     if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
         (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
         !isa<ConstantSDNode>(Cond.getOperand(0))) {
@@ -17871,8 +18682,7 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
           CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
         SDValue Ops[] = { FalseOp, Cond.getOperand(0),
                           DAG.getConstant(CC, MVT::i8), Cond };
-        return DAG.getNode(X86ISD::CMOV, DL, N->getVTList (), Ops,
-                           array_lengthof(Ops));
+        return DAG.getNode(X86ISD::CMOV, DL, N->getVTList (), Ops);
       }
     }
   }
@@ -17880,6 +18690,106 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
+static SDValue PerformINTRINSIC_WO_CHAINCombine(SDNode *N, SelectionDAG &DAG,
+                                                const X86Subtarget *Subtarget) {
+  unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
+  switch (IntNo) {
+  default: return SDValue();
+  // SSE/AVX/AVX2 blend intrinsics.
+  case Intrinsic::x86_avx2_pblendvb:
+  case Intrinsic::x86_avx2_pblendw:
+  case Intrinsic::x86_avx2_pblendd_128:
+  case Intrinsic::x86_avx2_pblendd_256:
+    // Don't try to simplify this intrinsic if we don't have AVX2.
+    if (!Subtarget->hasAVX2())
+      return SDValue();
+    // FALL-THROUGH
+  case Intrinsic::x86_avx_blend_pd_256:
+  case Intrinsic::x86_avx_blend_ps_256:
+  case Intrinsic::x86_avx_blendv_pd_256:
+  case Intrinsic::x86_avx_blendv_ps_256:
+    // Don't try to simplify this intrinsic if we don't have AVX.
+    if (!Subtarget->hasAVX())
+      return SDValue();
+    // FALL-THROUGH
+  case Intrinsic::x86_sse41_pblendw:
+  case Intrinsic::x86_sse41_blendpd:
+  case Intrinsic::x86_sse41_blendps:
+  case Intrinsic::x86_sse41_blendvps:
+  case Intrinsic::x86_sse41_blendvpd:
+  case Intrinsic::x86_sse41_pblendvb: {
+    SDValue Op0 = N->getOperand(1);
+    SDValue Op1 = N->getOperand(2);
+    SDValue Mask = N->getOperand(3);
+
+    // Don't try to simplify this intrinsic if we don't have SSE4.1.
+    if (!Subtarget->hasSSE41())
+      return SDValue();
+
+    // fold (blend A, A, Mask) -> A
+    if (Op0 == Op1)
+      return Op0;
+    // fold (blend A, B, allZeros) -> A
+    if (ISD::isBuildVectorAllZeros(Mask.getNode()))
+      return Op0;
+    // fold (blend A, B, allOnes) -> B
+    if (ISD::isBuildVectorAllOnes(Mask.getNode()))
+      return Op1;
+    
+    // Simplify the case where the mask is a constant i32 value.
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Mask)) {
+      if (C->isNullValue())
+        return Op0;
+      if (C->isAllOnesValue())
+        return Op1;
+    }
+  }
+
+  // Packed SSE2/AVX2 arithmetic shift immediate intrinsics.
+  case Intrinsic::x86_sse2_psrai_w:
+  case Intrinsic::x86_sse2_psrai_d:
+  case Intrinsic::x86_avx2_psrai_w:
+  case Intrinsic::x86_avx2_psrai_d:
+  case Intrinsic::x86_sse2_psra_w:
+  case Intrinsic::x86_sse2_psra_d:
+  case Intrinsic::x86_avx2_psra_w:
+  case Intrinsic::x86_avx2_psra_d: {
+    SDValue Op0 = N->getOperand(1);
+    SDValue Op1 = N->getOperand(2);
+    EVT VT = Op0.getValueType();
+    assert(VT.isVector() && "Expected a vector type!");
+
+    if (isa<BuildVectorSDNode>(Op1))
+      Op1 = Op1.getOperand(0);
+
+    if (!isa<ConstantSDNode>(Op1))
+      return SDValue();
+
+    EVT SVT = VT.getVectorElementType();
+    unsigned SVTBits = SVT.getSizeInBits();
+
+    ConstantSDNode *CND = cast<ConstantSDNode>(Op1);
+    const APInt &C = APInt(SVTBits, CND->getAPIntValue().getZExtValue());
+    uint64_t ShAmt = C.getZExtValue();
+
+    // Don't try to convert this shift into a ISD::SRA if the shift
+    // count is bigger than or equal to the element size.
+    if (ShAmt >= SVTBits)
+      return SDValue();
+
+    // Trivial case: if the shift count is zero, then fold this
+    // into the first operand.
+    if (ShAmt == 0)
+      return Op0;
+
+    // Replace this packed shift intrinsic with a target independent
+    // shift dag node.
+    SDValue Splat = DAG.getConstant(C, VT);
+    return DAG.getNode(ISD::SRA, SDLoc(N), VT, Op0, Splat);
+  }
+  }
+}
+
 /// PerformMulCombine - Optimize a single multiply with constant into two
 /// in order to implement it with two cheaper instructions, e.g.
 /// LEA + SHL, LEA + LEA.
@@ -18223,7 +19133,7 @@ static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
     N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getScalarType(),
                      N1->getOperand(0));
     SmallVector<SDValue, 8> C(WideVT.getVectorNumElements(), N1);
-    N1 = DAG.getNode(ISD::BUILD_VECTOR, DL, WideVT, &C[0], C.size());
+    N1 = DAG.getNode(ISD::BUILD_VECTOR, DL, WideVT, C);
   } else if (RHSTrunc) {
     N1 = N1->getOperand(0);
   }
@@ -18260,40 +19170,13 @@ static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
   if (R.getNode())
     return R;
 
-  // Create BEXTR and BZHI instructions
-  // BZHI is X & ((1 << Y) - 1)
+  // Create BEXTR instructions
   // BEXTR is ((X >> imm) & (2**size-1))
   if (VT == MVT::i32 || VT == MVT::i64) {
     SDValue N0 = N->getOperand(0);
     SDValue N1 = N->getOperand(1);
     SDLoc DL(N);
 
-    if (Subtarget->hasBMI2()) {
-      // Check for (and (add (shl 1, Y), -1), X)
-      if (N0.getOpcode() == ISD::ADD && isAllOnes(N0.getOperand(1))) {
-        SDValue N00 = N0.getOperand(0);
-        if (N00.getOpcode() == ISD::SHL) {
-          SDValue N001 = N00.getOperand(1);
-          assert(N001.getValueType() == MVT::i8 && "unexpected type");
-          ConstantSDNode *C = dyn_cast<ConstantSDNode>(N00.getOperand(0));
-          if (C && C->getZExtValue() == 1)
-            return DAG.getNode(X86ISD::BZHI, DL, VT, N1, N001);
-        }
-      }
-
-      // Check for (and X, (add (shl 1, Y), -1))
-      if (N1.getOpcode() == ISD::ADD && isAllOnes(N1.getOperand(1))) {
-        SDValue N10 = N1.getOperand(0);
-        if (N10.getOpcode() == ISD::SHL) {
-          SDValue N101 = N10.getOperand(1);
-          assert(N101.getValueType() == MVT::i8 && "unexpected type");
-          ConstantSDNode *C = dyn_cast<ConstantSDNode>(N10.getOperand(0));
-          if (C && C->getZExtValue() == 1)
-            return DAG.getNode(X86ISD::BZHI, DL, VT, N0, N101);
-        }
-      }
-    }
-
     // Check for BEXTR.
     if ((Subtarget->hasBMI() || Subtarget->hasTBM()) &&
         (N0.getOpcode() == ISD::SRA || N0.getOpcode() == ISD::SRL)) {
@@ -18533,8 +19416,7 @@ static SDValue performIntegerAbsCombine(SDNode *N, SelectionDAG &DAG) {
         SDValue Ops[] = { N0.getOperand(0), Neg,
                           DAG.getConstant(X86::COND_GE, MVT::i8),
                           SDValue(Neg.getNode(), 1) };
-        return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue),
-                           Ops, array_lengthof(Ops));
+        return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue), Ops);
       }
   return SDValue();
 }
@@ -18691,8 +19573,7 @@ static SDValue PerformLOADCombine(SDNode *N, SelectionDAG &DAG,
       Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
     }
 
-    SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &Chains[0],
-                               Chains.size());
+    SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
 
     // Bitcast the loaded value to a vector of the original element type, in
     // the size of the target vector type.
@@ -18867,8 +19748,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
       Chains.push_back(Ch);
     }
 
-    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &Chains[0],
-                               Chains.size());
+    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
   }
 
   // Turn load->store of MMX types into GPR load/stores.  This avoids clobbering
@@ -18891,7 +19771,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
       !cast<LoadSDNode>(St->getValue())->isVolatile() &&
       St->getChain().hasOneUse() && !St->isVolatile()) {
     SDNode* LdVal = St->getValue().getNode();
-    LoadSDNode *Ld = 0;
+    LoadSDNode *Ld = nullptr;
     int TokenFactorIndex = -1;
     SmallVector<SDValue, 8> Ops;
     SDNode* ChainVal = St->getChain().getNode();
@@ -18934,8 +19814,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
       SDValue NewChain = NewLd.getValue(1);
       if (TokenFactorIndex != -1) {
         Ops.push_back(NewChain);
-        NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0],
-                               Ops.size());
+        NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
       }
       return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(),
                           St->getPointerInfo(),
@@ -18962,8 +19841,7 @@ static SDValue PerformSTORECombine(SDNode *N, SelectionDAG &DAG,
     if (TokenFactorIndex != -1) {
       Ops.push_back(LoLd);
       Ops.push_back(HiLd);
-      NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, &Ops[0],
-                             Ops.size());
+      NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
     }
 
     LoAddr = St->getBasePtr();
@@ -19432,6 +20310,33 @@ static SDValue PerformISDSETCCCombine(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
+static SDValue PerformINSERTPSCombine(SDNode *N, SelectionDAG &DAG,
+                                      const X86Subtarget *Subtarget) {
+  SDLoc dl(N);
+  MVT VT = N->getOperand(1)->getSimpleValueType(0);
+  assert((VT == MVT::v4f32 || VT == MVT::v4i32) &&
+         "X86insertps is only defined for v4x32");
+
+  SDValue Ld = N->getOperand(1);
+  if (MayFoldLoad(Ld)) {
+    // Extract the countS bits from the immediate so we can get the proper
+    // address when narrowing the vector load to a specific element.
+    // When the second source op is a memory address, interps doesn't use
+    // countS and just gets an f32 from that address.
+    unsigned DestIndex =
+        cast<ConstantSDNode>(N->getOperand(2))->getZExtValue() >> 6;
+    Ld = NarrowVectorLoadToElement(cast<LoadSDNode>(Ld), DestIndex, DAG);
+  } else
+    return SDValue();
+
+  // Create this as a scalar to vector to match the instruction pattern.
+  SDValue LoadScalarToVector = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Ld);
+  // countS bits are ignored when loading from memory on insertps, which
+  // means we don't need to explicitly set them to 0.
+  return DAG.getNode(X86ISD::INSERTPS, dl, VT, N->getOperand(0),
+                     LoadScalarToVector, N->getOperand(2));
+}
+
 // Helper function of PerformSETCCCombine. It is to materialize "setb reg"
 // as "sbb reg,reg", since it can be extended without zext and produces
 // an all-ones bit which is more useful than 0/1 in some cases.
@@ -19711,7 +20616,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::ANY_EXTEND:
   case ISD::ZERO_EXTEND:    return PerformZExtCombine(N, DAG, DCI, Subtarget);
   case ISD::SIGN_EXTEND:    return PerformSExtCombine(N, DAG, DCI, Subtarget);
-  case ISD::SIGN_EXTEND_INREG: return PerformSIGN_EXTEND_INREGCombine(N, DAG, Subtarget);
+  case ISD::SIGN_EXTEND_INREG:
+    return PerformSIGN_EXTEND_INREGCombine(N, DAG, Subtarget);
   case ISD::TRUNCATE:       return PerformTruncateCombine(N, DAG,DCI,Subtarget);
   case ISD::SETCC:          return PerformISDSETCCCombine(N, DAG, Subtarget);
   case X86ISD::SETCC:       return PerformSETCCCombine(N, DAG, DCI, Subtarget);
@@ -19732,6 +20638,10 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case X86ISD::VPERM2X128:
   case ISD::VECTOR_SHUFFLE: return PerformShuffleCombine(N, DAG, DCI,Subtarget);
   case ISD::FMA:            return PerformFMACombine(N, DAG, Subtarget);
+  case ISD::INTRINSIC_WO_CHAIN:
+    return PerformINTRINSIC_WO_CHAINCombine(N, DAG, Subtarget);
+  case X86ISD::INSERTPS:
+    return PerformINSERTPSCombine(N, DAG, Subtarget);
   }
 
   return SDValue();
@@ -20006,7 +20916,7 @@ TargetLowering::ConstraintWeight
   Value *CallOperandVal = info.CallOperandVal;
     // If we don't have a value, we can't do a match,
     // but allow it at the lowest weight.
-  if (CallOperandVal == NULL)
+  if (!CallOperandVal)
     return CW_Default;
   Type *type = CallOperandVal->getType();
   // Look at the constraint type.
@@ -20124,7 +21034,7 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
                                                      std::string &Constraint,
                                                      std::vector<SDValue>&Ops,
                                                      SelectionDAG &DAG) const {
-  SDValue Result(0, 0);
+  SDValue Result;
 
   // Only support length 1 constraints for now.
   if (Constraint.length() > 1) return;
@@ -20207,7 +21117,7 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
 
     // If we are in non-pic codegen mode, we allow the address of a global (with
     // an optional displacement) to be used with 'i'.
-    GlobalAddressSDNode *GA = 0;
+    GlobalAddressSDNode *GA = nullptr;
     int64_t Offset = 0;
 
     // Match either (GA), (GA+C), (GA+C1+C2), etc.
@@ -20363,7 +21273,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
   Res = TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
 
   // Not found as a standard register?
-  if (Res.second == 0) {
+  if (!Res.second) {
     // Map st(0) -> st(7) -> ST0
     if (Constraint.size() == 7 && Constraint[0] == '{' &&
         tolower(Constraint[1]) == 's' &&
@@ -20488,3 +21398,30 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
 
   return Res;
 }
+
+int X86TargetLowering::getScalingFactorCost(const AddrMode &AM,
+                                            Type *Ty) const {
+  // Scaling factors are not free at all.
+  // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
+  // will take 2 allocations in the out of order engine instead of 1
+  // for plain addressing mode, i.e. inst (reg1).
+  // E.g.,
+  // vaddps (%rsi,%drx), %ymm0, %ymm1
+  // Requires two allocations (one for the load, one for the computation)
+  // whereas:
+  // vaddps (%rsi), %ymm0, %ymm1
+  // Requires just 1 allocation, i.e., freeing allocations for other operations
+  // and having less micro operations to execute.
+  //
+  // For some X86 architectures, this is even worse because for instance for
+  // stores, the complex addressing mode forces the instruction to use the
+  // "load" ports instead of the dedicated "store" port.
+  // E.g., on Haswell:
+  // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
+  // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.   
+  if (isLegalAddressingMode(AM, Ty))
+    // Scale represents reg2 * scale, thus account for 1
+    // as soon as we use a second register.
+    return AM.Scale != 0;
+  return -1;
+}
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index 0f0d17b..9f51b53 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -83,6 +83,9 @@ namespace llvm {
       /// readcyclecounter
       RDTSC_DAG,
 
+      /// X86 Read Time-Stamp Counter and Processor ID.
+      RDTSCP_DAG,
+
       /// X86 compare and logical compare instructions.
       CMP, COMI, UCOMI,
 
@@ -291,7 +294,6 @@ namespace llvm {
       ADD, SUB, ADC, SBB, SMUL,
       INC, DEC, OR, XOR, AND,
 
-      BZHI,   // BZHI - Zero high bits
       BEXTR,  // BEXTR - Bit field extract
 
       UMUL, // LOW, HI, FLAGS = umul LHS, RHS
@@ -345,6 +347,8 @@ namespace llvm {
 
       // PMULUDQ - Vector multiply packed unsigned doubleword integers
       PMULUDQ,
+      // PMULUDQ - Vector multiply packed signed doubleword integers
+      PMULDQ,
 
       // FMA nodes
       FMADD,
@@ -614,18 +618,19 @@ namespace llvm {
     /// getSetCCResultType - Return the value type to use for ISD::SETCC.
     EVT getSetCCResultType(LLVMContext &Context, EVT VT) const override;
 
-    /// computeMaskedBitsForTargetNode - Determine which of the bits specified
+    /// computeKnownBitsForTargetNode - Determine which of the bits specified
     /// in Mask are known to be either zero or one and return them in the
     /// KnownZero/KnownOne bitsets.
-    void computeMaskedBitsForTargetNode(const SDValue Op,
-                                        APInt &KnownZero,
-                                        APInt &KnownOne,
-                                        const SelectionDAG &DAG,
-                                        unsigned Depth = 0) const override;
+    void computeKnownBitsForTargetNode(const SDValue Op,
+                                       APInt &KnownZero,
+                                       APInt &KnownOne,
+                                       const SelectionDAG &DAG,
+                                       unsigned Depth = 0) const override;
 
     // ComputeNumSignBitsForTargetNode - Determine the number of bits in the
     // operation that are sign bits.
     unsigned ComputeNumSignBitsForTargetNode(SDValue Op,
+                                             const SelectionDAG &DAG,
                                              unsigned Depth) const override;
 
     bool isGAPlusOffset(SDNode *N, const GlobalValue* &GA,
@@ -679,6 +684,12 @@ namespace llvm {
     /// the immediate into a register.
     bool isLegalAddImmediate(int64_t Imm) const override;
 
+    /// \brief Return the cost of the scaling factor used in the addressing
+    /// mode represented by AM for this target, for a load/store
+    /// of the specified type.
+    /// If the AM is supported, the return value must be >= 0.
+    /// If the AM is not supported, it returns a negative value.
+    int getScalingFactorCost(const AddrMode &AM, Type *Ty) const override;
 
     bool isVectorShiftByScalarCheap(Type *Ty) const override;
 
@@ -771,10 +782,12 @@ namespace llvm {
                                            Type *Ty) const override;
 
     /// Intel processors have a unified instruction and data cache
-    const char * getClearCacheBuiltinName() const {
-      return 0; // nothing to do, move along.
+    const char * getClearCacheBuiltinName() const override {
+      return nullptr; // nothing to do, move along.
     }
 
+    unsigned getRegisterByName(const char* RegName, EVT VT) const override;
+
     /// createFastISel - This method returns a target specific FastISel object,
     /// or null if the target does not support "fast" ISel.
     FastISel *createFastISel(FunctionLoweringInfo &funcInfo,
@@ -871,8 +884,11 @@ namespace llvm {
     SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerVSELECT(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
     SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const;
+    SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const;
+
     SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
@@ -908,6 +924,7 @@ namespace llvm {
     SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const;
 
     SDValue
       LowerFormalArguments(SDValue Chain,
@@ -936,7 +953,7 @@ namespace llvm {
                         const SmallVectorImpl<ISD::OutputArg> &Outs,
                         LLVMContext &Context) const override;
 
-    const uint16_t *getScratchRegisters(CallingConv::ID CC) const override;
+    const MCPhysReg *getScratchRegisters(CallingConv::ID CC) const override;
 
     /// Utility function to emit atomic-load-arith operations (and, or, xor,
     /// nand, max, min, umax, umin). It takes the corresponding instruction to
@@ -987,11 +1004,12 @@ namespace llvm {
 
     /// Emit nodes that will be selected as "test Op0,Op0", or something
     /// equivalent, for use with the given x86 condition code.
-    SDValue EmitTest(SDValue Op0, unsigned X86CC, SelectionDAG &DAG) const;
+    SDValue EmitTest(SDValue Op0, unsigned X86CC, SDLoc dl,
+                     SelectionDAG &DAG) const;
 
     /// Emit nodes that will be selected as "cmp Op0,Op1", or something
     /// equivalent, for use with the given x86 condition code.
-    SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
+    SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, SDLoc dl,
                     SelectionDAG &DAG) const;
 
     /// Convert a comparison if required by the subtarget.
diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
index 2c5edf6..37bcc52 100644
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -209,12 +209,12 @@ def : Pat<(vinsert256_insert:$ins (v16i32 VR512:$src1),
 def VINSERTPSzrr : AVX512AIi8<0x21, MRMSrcReg, (outs VR128X:$dst),
       (ins VR128X:$src1, VR128X:$src2, u32u8imm:$src3),
       "vinsertps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
-      [(set VR128X:$dst, (X86insrtps VR128X:$src1, VR128X:$src2, imm:$src3))]>,
+      [(set VR128X:$dst, (X86insertps VR128X:$src1, VR128X:$src2, imm:$src3))]>,
       EVEX_4V;
 def VINSERTPSzrm: AVX512AIi8<0x21, MRMSrcMem, (outs VR128X:$dst),
       (ins VR128X:$src1, f32mem:$src2, u32u8imm:$src3),
       "vinsertps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
-      [(set VR128X:$dst, (X86insrtps VR128X:$src1,
+      [(set VR128X:$dst, (X86insertps VR128X:$src1,
                           (v4f32 (scalar_to_vector (loadf32 addr:$src2))),
                           imm:$src3))]>, EVEX_4V, EVEX_CD8<32, CD8VT1>;
 
@@ -621,6 +621,22 @@ defm VPERMT2PS : avx512_perm_3src<0x7F, "vpermt2ps",  VR512, memopv16f32, i512me
                                X86VPermv3, v16f32>, EVEX_V512, EVEX_CD8<32, CD8VF>;
 defm VPERMT2PD : avx512_perm_3src<0x7F, "vpermt2pd",  VR512, memopv8f64, i512mem, 
                                X86VPermv3, v8f64>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+
+def : Pat<(v16f32 (int_x86_avx512_mask_vpermt_ps_512 (v16i32 VR512:$idx),
+                   (v16f32 VR512:$src1), (v16f32 VR512:$src2), (i16 -1))),
+          (VPERMT2PSrr VR512:$src1, VR512:$idx, VR512:$src2)>;
+
+def : Pat<(v16i32 (int_x86_avx512_mask_vpermt_d_512 (v16i32 VR512:$idx),
+                   (v16i32 VR512:$src1), (v16i32 VR512:$src2), (i16 -1))),
+          (VPERMT2Drr VR512:$src1, VR512:$idx, VR512:$src2)>;
+
+def : Pat<(v8f64 (int_x86_avx512_mask_vpermt_pd_512 (v8i64 VR512:$idx),
+                   (v8f64 VR512:$src1), (v8f64 VR512:$src2), (i8 -1))),
+          (VPERMT2PDrr VR512:$src1, VR512:$idx, VR512:$src2)>;
+
+def : Pat<(v8i64 (int_x86_avx512_mask_vpermt_q_512 (v8i64 VR512:$idx),
+                   (v8i64 VR512:$src1), (v8i64 VR512:$src2), (i8 -1))),
+          (VPERMT2Qrr VR512:$src1, VR512:$idx, VR512:$src2)>;
 //===----------------------------------------------------------------------===//
 // AVX-512 - BLEND using mask
 //
@@ -984,6 +1000,10 @@ let Predicates = [HasAVX512] in {
             (EXTRACT_SUBREG
              (AND32ri (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), (i32 1)),
               sub_16bit)>;
+  def : Pat<(v16i1 (scalar_to_vector VK1:$src)),
+            (COPY_TO_REGCLASS VK1:$src, VK16)>;
+  def : Pat<(v8i1 (scalar_to_vector VK1:$src)),
+            (COPY_TO_REGCLASS VK1:$src, VK8)>;
 }
 // With AVX-512 only, 8-bit mask is promoted to 16-bit mask.
 let Predicates = [HasAVX512] in {
@@ -1356,6 +1376,23 @@ defm VMOVDQU64: avx512_load<0x6F, VR512, VK8WM, i512mem, load,
                               "vmovdqu64", SSEPackedInt, v8i64>,
                                XS, VEX_W, EVEX_V512, EVEX_CD8<64, CD8VF>;
 
+def: Pat<(v16i32 (int_x86_avx512_mask_loadu_d_512 addr:$ptr,
+                 (v16i32 immAllZerosV), GR16:$mask)),
+       (VMOVDQU32rmkz (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)), addr:$ptr)>;
+
+def: Pat<(v8i64 (int_x86_avx512_mask_loadu_q_512 addr:$ptr,
+                 (bc_v8i64 (v16i32 immAllZerosV)), GR8:$mask)),
+       (VMOVDQU64rmkz (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), addr:$ptr)>;
+
+def: Pat<(int_x86_avx512_mask_storeu_d_512 addr:$ptr, (v16i32 VR512:$src),
+          GR16:$mask),
+         (VMOVDQU32mrk addr:$ptr, (v16i1 (COPY_TO_REGCLASS GR16:$mask, VK16WM)),
+            VR512:$src)>;
+def: Pat<(int_x86_avx512_mask_storeu_q_512 addr:$ptr, (v8i64 VR512:$src),
+          GR8:$mask),
+         (VMOVDQU64mrk addr:$ptr, (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)),
+            VR512:$src)>;
+
 let AddedComplexity = 20 in {
 def : Pat<(v8i64 (vselect VK8WM:$mask, (v8i64 VR512:$src),
                            (bc_v8i64 (v16i32 immAllZerosV)))),
@@ -3112,6 +3149,17 @@ def : Pat<(v8i32 (fp_to_uint (v8f32 VR256X:$src1))),
           (EXTRACT_SUBREG (v16i32 (VCVTTPS2UDQZrr 
            (v16f32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)))), sub_ymm)>;
                                  
+def : Pat<(v4i32 (fp_to_uint (v4f32 VR128X:$src1))),
+          (EXTRACT_SUBREG (v16i32 (VCVTTPS2UDQZrr
+           (v16f32 (SUBREG_TO_REG (i32 0), VR128X:$src1, sub_xmm)))), sub_xmm)>;
+
+def : Pat<(v8f32 (uint_to_fp (v8i32 VR256X:$src1))),
+          (EXTRACT_SUBREG (v16f32 (VCVTUDQ2PSZrr
+           (v16i32 (SUBREG_TO_REG (i32 0), VR256X:$src1, sub_ymm)))), sub_ymm)>;
+           
+def : Pat<(v4f32 (uint_to_fp (v4i32 VR128X:$src1))),
+          (EXTRACT_SUBREG (v16f32 (VCVTUDQ2PSZrr
+           (v16i32 (SUBREG_TO_REG (i32 0), VR128X:$src1, sub_xmm)))), sub_xmm)>;
 
 def : Pat<(v16f32 (int_x86_avx512_mask_cvtdq2ps_512 (v16i32 VR512:$src),
                    (bc_v16f32 (v16i32 immAllZerosV)), (i16 -1), imm:$rc)),
@@ -3715,7 +3763,7 @@ defm VRNDSCALEPSZ : avx512_rndscale<0x08, "vrndscaleps", f512mem, VR512,
                                 EVEX_CD8<32, CD8VF>;
 
 def : Pat<(v16f32 (int_x86_avx512_mask_rndscale_ps_512 (v16f32 VR512:$src1),
-                   imm:$src2, (bc_v16f32 (v16i32 immAllZerosV)), (i16 -1),
+                   imm:$src2, (v16f32 VR512:$src1), (i16 -1),
                    FROUND_CURRENT)),
                    (VRNDSCALEPSZr VR512:$src1, imm:$src2)>;
 
@@ -3725,7 +3773,7 @@ defm VRNDSCALEPDZ : avx512_rndscale<0x09, "vrndscalepd", f512mem, VR512,
                                 VEX_W, EVEX_CD8<64, CD8VF>;
 
 def : Pat<(v8f64 (int_x86_avx512_mask_rndscale_pd_512 (v8f64 VR512:$src1),
-                  imm:$src2, (bc_v8f64 (v16i32 immAllZerosV)), (i8 -1),
+                  imm:$src2, (v8f64 VR512:$src1), (i8 -1),
                   FROUND_CURRENT)),
                    (VRNDSCALEPDZr VR512:$src1, imm:$src2)>;
 
@@ -3807,7 +3855,13 @@ multiclass avx512_trunc_sat<bits<8> opc, string OpcodeStr,
                !strconcat(OpcodeStr," \t{$src, $dst|$dst, $src}"),
                []>, EVEX;
 
-  def krr : AVX512XS8I<opc, MRMDestReg, (outs dstRC:$dst),
+  def rrk : AVX512XS8I<opc, MRMDestReg, (outs dstRC:$dst),
+               (ins KRC:$mask, srcRC:$src),
+               !strconcat(OpcodeStr,
+                 " \t{$src, ${dst} {${mask}}|${dst} {${mask}}, $src}"),
+               []>, EVEX, EVEX_K;
+
+  def rrkz : AVX512XS8I<opc, MRMDestReg, (outs dstRC:$dst),
                (ins KRC:$mask, srcRC:$src),
                !strconcat(OpcodeStr,
                  " \t{$src, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src}"),
@@ -3816,6 +3870,12 @@ multiclass avx512_trunc_sat<bits<8> opc, string OpcodeStr,
   def mr : AVX512XS8I<opc, MRMDestMem, (outs), (ins x86memop:$dst, srcRC:$src),
                !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"),
                []>, EVEX;
+
+  def mrk : AVX512XS8I<opc, MRMDestMem, (outs),
+               (ins x86memop:$dst, KRC:$mask, srcRC:$src),
+               !strconcat(OpcodeStr, " \t{$src, $dst {${mask}}|${dst} {${mask}}, $src}"),
+               []>, EVEX, EVEX_K;
+
 }
 defm VPMOVQB    : avx512_trunc_sat<0x32, "vpmovqb",   VR128X, VR512, VK8WM, 
                                  i128mem>, EVEX_V512, EVEX_CD8<8, CD8VO>;
@@ -3855,60 +3915,86 @@ def : Pat<(v16i8  (X86vtrunc (v16i32 VR512:$src))), (VPMOVDBrr  VR512:$src)>;
 def : Pat<(v8i32  (X86vtrunc (v8i64  VR512:$src))), (VPMOVQDrr  VR512:$src)>;
 
 def : Pat<(v16i8  (X86vtruncm VK16WM:$mask, (v16i32 VR512:$src))),
-                  (VPMOVDBkrr VK16WM:$mask, VR512:$src)>;
+                  (VPMOVDBrrkz VK16WM:$mask, VR512:$src)>;
 def : Pat<(v16i16 (X86vtruncm VK16WM:$mask, (v16i32 VR512:$src))),
-                  (VPMOVDWkrr VK16WM:$mask, VR512:$src)>;
+                  (VPMOVDWrrkz VK16WM:$mask, VR512:$src)>;
 def : Pat<(v8i16  (X86vtruncm VK8WM:$mask,  (v8i64 VR512:$src))),
-                  (VPMOVQWkrr  VK8WM:$mask, VR512:$src)>;
+                  (VPMOVQWrrkz  VK8WM:$mask, VR512:$src)>;
 def : Pat<(v8i32  (X86vtruncm VK8WM:$mask,  (v8i64 VR512:$src))),
-                  (VPMOVQDkrr  VK8WM:$mask, VR512:$src)>;
+                  (VPMOVQDrrkz  VK8WM:$mask, VR512:$src)>;
 
 
-multiclass avx512_extend<bits<8> opc, string OpcodeStr, RegisterClass DstRC,
-                      RegisterClass SrcRC, SDNode OpNode, PatFrag mem_frag, 
-                      X86MemOperand x86memop, ValueType OpVT, ValueType InVT> {
+multiclass avx512_extend<bits<8> opc, string OpcodeStr, RegisterClass KRC,
+                      RegisterClass DstRC, RegisterClass SrcRC, SDNode OpNode,
+                      PatFrag mem_frag, X86MemOperand x86memop,
+                      ValueType OpVT, ValueType InVT> {
 
   def rr : AVX5128I<opc, MRMSrcReg, (outs DstRC:$dst),
               (ins SrcRC:$src),
               !strconcat(OpcodeStr, " \t{$src, $dst|$dst, $src}"),
               [(set DstRC:$dst, (OpVT (OpNode (InVT SrcRC:$src))))]>, EVEX;
-  def rm : AVX5128I<opc, MRMSrcMem, (outs DstRC:$dst),
+
+  def rrk : AVX5128I<opc, MRMSrcReg, (outs DstRC:$dst),
+              (ins KRC:$mask, SrcRC:$src),
+              !strconcat(OpcodeStr, " \t{$src, $dst {${mask}} |$dst {${mask}}, $src}"),
+              []>, EVEX, EVEX_K;
+
+  def rrkz : AVX5128I<opc, MRMSrcReg, (outs DstRC:$dst),
+              (ins KRC:$mask, SrcRC:$src),
+              !strconcat(OpcodeStr, " \t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}"),
+              []>, EVEX, EVEX_KZ;
+
+  let mayLoad = 1 in {
+    def rm : AVX5128I<opc, MRMSrcMem, (outs DstRC:$dst),
               (ins x86memop:$src),
               !strconcat(OpcodeStr," \t{$src, $dst|$dst, $src}"),
               [(set DstRC:$dst,
                 (OpVT (OpNode (InVT (bitconvert (mem_frag addr:$src))))))]>,
               EVEX;
+
+    def rmk : AVX5128I<opc, MRMSrcMem, (outs DstRC:$dst),
+              (ins KRC:$mask, x86memop:$src),
+              !strconcat(OpcodeStr," \t{$src, $dst {${mask}} |$dst {${mask}}, $src}"),
+              []>,
+              EVEX, EVEX_K;
+
+    def rmkz : AVX5128I<opc, MRMSrcMem, (outs DstRC:$dst),
+              (ins KRC:$mask, x86memop:$src),
+              !strconcat(OpcodeStr," \t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}"),
+              []>,
+              EVEX, EVEX_KZ;
+  }
 }
 
-defm VPMOVZXBDZ: avx512_extend<0x31, "vpmovzxbd", VR512, VR128X, X86vzext, 
+defm VPMOVZXBDZ: avx512_extend<0x31, "vpmovzxbd", VK16WM, VR512, VR128X, X86vzext,
                              memopv2i64, i128mem, v16i32, v16i8>, EVEX_V512,
                              EVEX_CD8<8, CD8VQ>;
-defm VPMOVZXBQZ: avx512_extend<0x32, "vpmovzxbq", VR512, VR128X, X86vzext, 
+defm VPMOVZXBQZ: avx512_extend<0x32, "vpmovzxbq", VK8WM, VR512, VR128X, X86vzext,
                              memopv2i64, i128mem, v8i64, v16i8>, EVEX_V512,
                              EVEX_CD8<8, CD8VO>;
-defm VPMOVZXWDZ: avx512_extend<0x33, "vpmovzxwd", VR512, VR256X, X86vzext, 
+defm VPMOVZXWDZ: avx512_extend<0x33, "vpmovzxwd", VK16WM, VR512, VR256X, X86vzext,
                              memopv4i64, i256mem, v16i32, v16i16>, EVEX_V512,
                              EVEX_CD8<16, CD8VH>;
-defm VPMOVZXWQZ: avx512_extend<0x34, "vpmovzxwq", VR512, VR128X, X86vzext, 
+defm VPMOVZXWQZ: avx512_extend<0x34, "vpmovzxwq", VK8WM, VR512, VR128X, X86vzext,
                              memopv2i64, i128mem, v8i64, v8i16>, EVEX_V512,
                              EVEX_CD8<16, CD8VQ>;
-defm VPMOVZXDQZ: avx512_extend<0x35, "vpmovzxdq", VR512, VR256X, X86vzext, 
+defm VPMOVZXDQZ: avx512_extend<0x35, "vpmovzxdq", VK8WM, VR512, VR256X, X86vzext,
                              memopv4i64, i256mem, v8i64, v8i32>, EVEX_V512,
                              EVEX_CD8<32, CD8VH>;
-                             
-defm VPMOVSXBDZ: avx512_extend<0x21, "vpmovsxbd", VR512, VR128X, X86vsext, 
+
+defm VPMOVSXBDZ: avx512_extend<0x21, "vpmovsxbd", VK16WM, VR512, VR128X, X86vsext,
                              memopv2i64, i128mem, v16i32, v16i8>, EVEX_V512,
                              EVEX_CD8<8, CD8VQ>;
-defm VPMOVSXBQZ: avx512_extend<0x22, "vpmovsxbq", VR512, VR128X, X86vsext, 
+defm VPMOVSXBQZ: avx512_extend<0x22, "vpmovsxbq", VK8WM, VR512, VR128X, X86vsext,
                              memopv2i64, i128mem, v8i64, v16i8>, EVEX_V512,
                              EVEX_CD8<8, CD8VO>;
-defm VPMOVSXWDZ: avx512_extend<0x23, "vpmovsxwd", VR512, VR256X, X86vsext, 
+defm VPMOVSXWDZ: avx512_extend<0x23, "vpmovsxwd", VK16WM, VR512, VR256X, X86vsext,
                              memopv4i64, i256mem, v16i32, v16i16>, EVEX_V512,
                              EVEX_CD8<16, CD8VH>;
-defm VPMOVSXWQZ: avx512_extend<0x24, "vpmovsxwq", VR512, VR128X, X86vsext, 
+defm VPMOVSXWQZ: avx512_extend<0x24, "vpmovsxwq", VK8WM, VR512, VR128X, X86vsext,
                              memopv2i64, i128mem, v8i64, v8i16>, EVEX_V512,
                              EVEX_CD8<16, CD8VQ>;
-defm VPMOVSXDQZ: avx512_extend<0x25, "vpmovsxdq", VR512, VR256X, X86vsext, 
+defm VPMOVSXDQZ: avx512_extend<0x25, "vpmovsxdq", VK8WM, VR512, VR256X, X86vsext,
                              memopv4i64, i256mem, v8i64, v8i32>, EVEX_V512,
                              EVEX_CD8<32, CD8VH>;
 
@@ -3984,6 +4070,62 @@ defm VPSCATTERQQZ : avx512_scatter<0xA1, "vpscatterqq", VK8WM, VR512, vz64mem>,
 defm VPSCATTERQDZ : avx512_scatter<0xA1, "vpscatterqd", VK8WM, VR256X, vz64mem>,
                                   EVEX_V512, EVEX_CD8<32, CD8VT1>;
 
+// prefetch
+multiclass avx512_gather_scatter_prefetch<bits<8> opc, Format F, string OpcodeStr,
+                       RegisterClass KRC, X86MemOperand memop> {
+  let Predicates = [HasPFI], hasSideEffects = 1 in
+  def m  : AVX5128I<opc, F, (outs), (ins KRC:$mask, memop:$src),
+            !strconcat(OpcodeStr, " \t{$src {${mask}}|{${mask}}, $src}"),
+            []>, EVEX, EVEX_K;
+}
+
+defm VGATHERPF0DPS: avx512_gather_scatter_prefetch<0xC6, MRM1m, "vgatherpf0dps",
+                     VK16WM, vz32mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>;
+
+defm VGATHERPF0QPS: avx512_gather_scatter_prefetch<0xC7, MRM1m, "vgatherpf0qps",
+                     VK8WM, vz64mem>, EVEX_V512, EVEX_CD8<64, CD8VT1>;
+
+defm VGATHERPF0DPD: avx512_gather_scatter_prefetch<0xC6, MRM1m, "vgatherpf0dpd",
+                     VK8WM, vy32mem>, EVEX_V512, VEX_W, EVEX_CD8<32, CD8VT1>;
+
+defm VGATHERPF0QPD: avx512_gather_scatter_prefetch<0xC7, MRM1m, "vgatherpf0qpd",
+                     VK8WM, vz64mem>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
+                     
+defm VGATHERPF1DPS: avx512_gather_scatter_prefetch<0xC6, MRM2m, "vgatherpf1dps",
+                     VK16WM, vz32mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>;
+
+defm VGATHERPF1QPS: avx512_gather_scatter_prefetch<0xC7, MRM2m, "vgatherpf1qps",
+                     VK8WM, vz64mem>, EVEX_V512, EVEX_CD8<64, CD8VT1>;
+
+defm VGATHERPF1DPD: avx512_gather_scatter_prefetch<0xC6, MRM2m, "vgatherpf1dpd",
+                     VK8WM, vy32mem>, EVEX_V512, VEX_W, EVEX_CD8<32, CD8VT1>;
+
+defm VGATHERPF1QPD: avx512_gather_scatter_prefetch<0xC7, MRM2m, "vgatherpf1qpd",
+                     VK8WM, vz64mem>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
+
+defm VSCATTERPF0DPS: avx512_gather_scatter_prefetch<0xC6, MRM5m, "vscatterpf0dps",
+                     VK16WM, vz32mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>;
+
+defm VSCATTERPF0QPS: avx512_gather_scatter_prefetch<0xC7, MRM5m, "vscatterpf0qps",
+                     VK8WM, vz64mem>, EVEX_V512, EVEX_CD8<64, CD8VT1>;
+
+defm VSCATTERPF0DPD: avx512_gather_scatter_prefetch<0xC6, MRM5m, "vscatterpf0dpd",
+                     VK8WM, vy32mem>, EVEX_V512, VEX_W, EVEX_CD8<32, CD8VT1>;
+
+defm VSCATTERPF0QPD: avx512_gather_scatter_prefetch<0xC7, MRM5m, "vscatterpf0qpd",
+                     VK8WM, vz64mem>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
+
+defm VSCATTERPF1DPS: avx512_gather_scatter_prefetch<0xC6, MRM6m, "vscatterpf1dps",
+                     VK16WM, vz32mem>, EVEX_V512, EVEX_CD8<32, CD8VT1>;
+
+defm VSCATTERPF1QPS: avx512_gather_scatter_prefetch<0xC7, MRM6m, "vscatterpf1qps",
+                     VK8WM, vz64mem>, EVEX_V512, EVEX_CD8<64, CD8VT1>;
+
+defm VSCATTERPF1DPD: avx512_gather_scatter_prefetch<0xC6, MRM6m, "vscatterpf1dpd",
+                     VK8WM, vy32mem>, EVEX_V512, VEX_W, EVEX_CD8<32, CD8VT1>;
+
+defm VSCATTERPF1QPD: avx512_gather_scatter_prefetch<0xC7, MRM6m, "vscatterpf1qpd",
+                     VK8WM, vz64mem>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
 //===----------------------------------------------------------------------===//
 // VSHUFPS - VSHUFPD Operations
 
@@ -4200,3 +4342,19 @@ def : Pat<(int_x86_avx512_mask_conflict_q_512 VR512:$src2, VR512:$src1,
                                               GR8:$mask),
           (VPCONFLICTQrrk VR512:$src1,
            (v8i1 (COPY_TO_REGCLASS GR8:$mask, VK8WM)), VR512:$src2)>;
+
+def : Pat<(store (i1 -1), addr:$dst), (MOV8mi addr:$dst, (i8 1))>;
+def : Pat<(store (i1  1), addr:$dst), (MOV8mi addr:$dst, (i8 1))>;
+def : Pat<(store (i1  0), addr:$dst), (MOV8mi addr:$dst, (i8 0))>;
+
+def : Pat<(store VK1:$src, addr:$dst),
+          (KMOVWmk addr:$dst, (COPY_TO_REGCLASS VK1:$src, VK16))>;
+
+def truncstorei1 : PatFrag<(ops node:$val, node:$ptr),
+                           (truncstore node:$val, node:$ptr), [{
+  return cast<StoreSDNode>(N)->getMemoryVT() == MVT::i1;
+}]>;
+
+def : Pat<(truncstorei1 GR8:$src, addr:$dst),
+          (MOV8mr addr:$dst, GR8:$src)>;
+
diff --git a/lib/Target/X86/X86InstrBuilder.h b/lib/Target/X86/X86InstrBuilder.h
index aaef4a4..e421f8c 100644
--- a/lib/Target/X86/X86InstrBuilder.h
+++ b/lib/Target/X86/X86InstrBuilder.h
@@ -52,7 +52,8 @@ struct X86AddressMode {
   unsigned GVOpFlags;
 
   X86AddressMode()
-    : BaseType(RegBase), Scale(1), IndexReg(0), Disp(0), GV(0), GVOpFlags(0) {
+    : BaseType(RegBase), Scale(1), IndexReg(0), Disp(0), GV(nullptr),
+      GVOpFlags(0) {
     Base.Reg = 0;
   }
 
diff --git a/lib/Target/X86/X86InstrCompiler.td b/lib/Target/X86/X86InstrCompiler.td
index 401849f..34d8fb9 100644
--- a/lib/Target/X86/X86InstrCompiler.td
+++ b/lib/Target/X86/X86InstrCompiler.td
@@ -1187,9 +1187,9 @@ def or_is_add : PatFrag<(ops node:$lhs, node:$rhs), (or node:$lhs, node:$rhs),[{
     return CurDAG->MaskedValueIsZero(N->getOperand(0), CN->getAPIntValue());
 
   APInt KnownZero0, KnownOne0;
-  CurDAG->ComputeMaskedBits(N->getOperand(0), KnownZero0, KnownOne0, 0);
+  CurDAG->computeKnownBits(N->getOperand(0), KnownZero0, KnownOne0, 0);
   APInt KnownZero1, KnownOne1;
-  CurDAG->ComputeMaskedBits(N->getOperand(1), KnownZero1, KnownOne1, 0);
+  CurDAG->computeKnownBits(N->getOperand(1), KnownZero1, KnownOne1, 0);
   return (~KnownZero0 & ~KnownZero1) == 0;
 }]>;
 
diff --git a/lib/Target/X86/X86InstrFMA.td b/lib/Target/X86/X86InstrFMA.td
index df6c9da..c0a6864 100644
--- a/lib/Target/X86/X86InstrFMA.td
+++ b/lib/Target/X86/X86InstrFMA.td
@@ -19,8 +19,9 @@ let Constraints = "$src1 = $dst" in {
 multiclass fma3p_rm<bits<8> opc, string OpcodeStr,
                     PatFrag MemFrag128, PatFrag MemFrag256,
                     ValueType OpVT128, ValueType OpVT256,
+                    bit IsRVariantCommutable = 0, bit IsMVariantCommutable = 0,
                     SDPatternOperator Op = null_frag> {
-  let usesCustomInserter = 1 in
+  let usesCustomInserter = 1, isCommutable = IsRVariantCommutable in
   def r     : FMA3<opc, MRMSrcReg, (outs VR128:$dst),
                    (ins VR128:$src1, VR128:$src2, VR128:$src3),
                    !strconcat(OpcodeStr,
@@ -28,7 +29,7 @@ multiclass fma3p_rm<bits<8> opc, string OpcodeStr,
                    [(set VR128:$dst, (OpVT128 (Op VR128:$src2,
                                                VR128:$src1, VR128:$src3)))]>;
 
-  let mayLoad = 1 in
+  let mayLoad = 1, isCommutable = IsMVariantCommutable in
   def m     : FMA3<opc, MRMSrcMem, (outs VR128:$dst),
                    (ins VR128:$src1, VR128:$src2, f128mem:$src3),
                    !strconcat(OpcodeStr,
@@ -36,7 +37,7 @@ multiclass fma3p_rm<bits<8> opc, string OpcodeStr,
                    [(set VR128:$dst, (OpVT128 (Op VR128:$src2, VR128:$src1,
                                                (MemFrag128 addr:$src3))))]>;
 
-  let usesCustomInserter = 1 in
+  let usesCustomInserter = 1, isCommutable = IsRVariantCommutable in
   def rY    : FMA3<opc, MRMSrcReg, (outs VR256:$dst),
                    (ins VR256:$src1, VR256:$src2, VR256:$src3),
                    !strconcat(OpcodeStr,
@@ -44,7 +45,7 @@ multiclass fma3p_rm<bits<8> opc, string OpcodeStr,
                    [(set VR256:$dst, (OpVT256 (Op VR256:$src2, VR256:$src1,
                                                VR256:$src3)))]>, VEX_L;
 
-  let mayLoad = 1 in
+  let mayLoad = 1, isCommutable = IsMVariantCommutable in
   def mY    : FMA3<opc, MRMSrcMem, (outs VR256:$dst),
                    (ins VR256:$src1, VR256:$src2, f256mem:$src3),
                    !strconcat(OpcodeStr,
@@ -59,18 +60,27 @@ multiclass fma3p_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
                        string OpcodeStr, string PackTy,
                        PatFrag MemFrag128, PatFrag MemFrag256,
                        SDNode Op, ValueType OpTy128, ValueType OpTy256> {
-  let isCommutable = 1 in
+  // For 213, both the register and memory variant are commutable.
+  // Indeed, the commutable operands are 1 and 2 and both live in registers
+  // for both variants.
   defm r213 : fma3p_rm<opc213,
                        !strconcat(OpcodeStr, "213", PackTy),
-                       MemFrag128, MemFrag256, OpTy128, OpTy256, Op>;
+                       MemFrag128, MemFrag256, OpTy128, OpTy256,
+                       /* IsRVariantCommutable */ 1,
+                       /* IsMVariantCommutable */ 1,
+                       Op>;
 let neverHasSideEffects = 1 in {
   defm r132 : fma3p_rm<opc132,
                        !strconcat(OpcodeStr, "132", PackTy),
                        MemFrag128, MemFrag256, OpTy128, OpTy256>;
-  let isCommutable = 1 in
+  // For 231, only the register variant is commutable.
+  // For the memory variant the folded operand must be in 3. Thus,
+  // in that case, it cannot be swapped with 2.
   defm r231 : fma3p_rm<opc231,
                        !strconcat(OpcodeStr, "231", PackTy),
-                       MemFrag128, MemFrag256, OpTy128, OpTy256>;
+                       MemFrag128, MemFrag256, OpTy128, OpTy256,
+                       /* IsRVariantCommutable */ 1,
+                       /* IsMVariantCommutable */ 0>;
 } // neverHasSideEffects = 1
 }
 
@@ -119,8 +129,9 @@ let ExeDomain = SSEPackedDouble in {
 let Constraints = "$src1 = $dst" in {
 multiclass fma3s_rm<bits<8> opc, string OpcodeStr, X86MemOperand x86memop,
                     RegisterClass RC, ValueType OpVT, PatFrag mem_frag,
+                    bit IsRVariantCommutable = 0, bit IsMVariantCommutable = 0,
                     SDPatternOperator OpNode = null_frag> {
-  let usesCustomInserter = 1 in
+  let usesCustomInserter = 1, isCommutable = IsRVariantCommutable in
   def r     : FMA3<opc, MRMSrcReg, (outs RC:$dst),
                    (ins RC:$src1, RC:$src2, RC:$src3),
                    !strconcat(OpcodeStr,
@@ -128,7 +139,7 @@ multiclass fma3s_rm<bits<8> opc, string OpcodeStr, X86MemOperand x86memop,
                    [(set RC:$dst,
                      (OpVT (OpNode RC:$src2, RC:$src1, RC:$src3)))]>;
 
-  let mayLoad = 1 in
+  let mayLoad = 1, isCommutable = IsMVariantCommutable in
   def m     : FMA3<opc, MRMSrcMem, (outs RC:$dst),
                    (ins RC:$src1, RC:$src2, x86memop:$src3),
                    !strconcat(OpcodeStr,
@@ -147,14 +158,21 @@ multiclass fma3s_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
 let neverHasSideEffects = 1 in {
   defm r132 : fma3s_rm<opc132, !strconcat(OpStr, "132", PackTy),
                        x86memop, RC, OpVT, mem_frag>;
-  let isCommutable = 1 in
+  // See the other defm of r231 for the explanation regarding the
+  // commutable flags.
   defm r231 : fma3s_rm<opc231, !strconcat(OpStr, "231", PackTy),
-                       x86memop, RC, OpVT, mem_frag>;
+                       x86memop, RC, OpVT, mem_frag,
+                       /* IsRVariantCommutable */ 1,
+                       /* IsMVariantCommutable */ 0>;
 }
 
-let isCommutable = 1 in
+// See the other defm of r213 for the explanation regarding the
+// commutable flags.
 defm r213 : fma3s_rm<opc213, !strconcat(OpStr, "213", PackTy),
-                     x86memop, RC, OpVT, mem_frag, OpNode>;
+                     x86memop, RC, OpVT, mem_frag,
+                     /* IsRVariantCommutable */ 1,
+                     /* IsMVariantCommutable */ 1,
+                     OpNode>;
 }
 
 multiclass fma3s<bits<8> opc132, bits<8> opc213, bits<8> opc231,
diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td
index 486e5a9..1582f43 100644
--- a/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -81,7 +81,7 @@ def X86pinsrb  : SDNode<"X86ISD::PINSRB",
 def X86pinsrw  : SDNode<"X86ISD::PINSRW",
                  SDTypeProfile<1, 3, [SDTCisVT<0, v8i16>, SDTCisSameAs<0,1>,
                                       SDTCisVT<2, i32>, SDTCisPtrTy<3>]>>;
-def X86insrtps : SDNode<"X86ISD::INSERTPS",
+def X86insertps : SDNode<"X86ISD::INSERTPS",
                  SDTypeProfile<1, 3, [SDTCisVT<0, v4f32>, SDTCisSameAs<0,1>,
                                       SDTCisVT<2, v4f32>, SDTCisPtrTy<3>]>>;
 def X86vzmovl  : SDNode<"X86ISD::VZEXT_MOVL",
@@ -175,6 +175,9 @@ def X86select  : SDNode<"X86ISD::SELECT"     , SDTSelect>;
 def X86pmuludq : SDNode<"X86ISD::PMULUDQ",
                         SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
                                       SDTCisSameAs<1,2>]>>;
+def X86pmuldq  : SDNode<"X86ISD::PMULDQ",
+                         SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
+                                       SDTCisSameAs<1,2>]>>;
 
 // Specific shuffle nodes - At some point ISD::VECTOR_SHUFFLE will always get
 // translated into one of the target nodes below during lowering.
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index 6450f2a..6993577 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -36,11 +36,13 @@
 #include "llvm/Target/TargetOptions.h"
 #include <limits>
 
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-instr-info"
+
 #define GET_INSTRINFO_CTOR_DTOR
 #include "X86GenInstrInfo.inc"
 
-using namespace llvm;
-
 static cl::opt<bool>
 NoFusing("disable-spill-fusing",
          cl::desc("Disable fusing of spill code into instructions"));
@@ -1511,12 +1513,14 @@ X86InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
 /// operand and follow operands form a reference to the stack frame.
 bool X86InstrInfo::isFrameOperand(const MachineInstr *MI, unsigned int Op,
                                   int &FrameIndex) const {
-  if (MI->getOperand(Op).isFI() && MI->getOperand(Op+1).isImm() &&
-      MI->getOperand(Op+2).isReg() && MI->getOperand(Op+3).isImm() &&
-      MI->getOperand(Op+1).getImm() == 1 &&
-      MI->getOperand(Op+2).getReg() == 0 &&
-      MI->getOperand(Op+3).getImm() == 0) {
-    FrameIndex = MI->getOperand(Op).getIndex();
+  if (MI->getOperand(Op+X86::AddrBaseReg).isFI() &&
+      MI->getOperand(Op+X86::AddrScaleAmt).isImm() &&
+      MI->getOperand(Op+X86::AddrIndexReg).isReg() &&
+      MI->getOperand(Op+X86::AddrDisp).isImm() &&
+      MI->getOperand(Op+X86::AddrScaleAmt).getImm() == 1 &&
+      MI->getOperand(Op+X86::AddrIndexReg).getReg() == 0 &&
+      MI->getOperand(Op+X86::AddrDisp).getImm() == 0) {
+    FrameIndex = MI->getOperand(Op+X86::AddrBaseReg).getIndex();
     return true;
   }
   return false;
@@ -1680,15 +1684,16 @@ X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr *MI,
   case X86::FsMOVAPSrm:
   case X86::FsMOVAPDrm: {
     // Loads from constant pools are trivially rematerializable.
-    if (MI->getOperand(1).isReg() &&
-        MI->getOperand(2).isImm() &&
-        MI->getOperand(3).isReg() && MI->getOperand(3).getReg() == 0 &&
+    if (MI->getOperand(1+X86::AddrBaseReg).isReg() &&
+        MI->getOperand(1+X86::AddrScaleAmt).isImm() &&
+        MI->getOperand(1+X86::AddrIndexReg).isReg() &&
+        MI->getOperand(1+X86::AddrIndexReg).getReg() == 0 &&
         MI->isInvariantLoad(AA)) {
-      unsigned BaseReg = MI->getOperand(1).getReg();
+      unsigned BaseReg = MI->getOperand(1+X86::AddrBaseReg).getReg();
       if (BaseReg == 0 || BaseReg == X86::RIP)
         return true;
       // Allow re-materialization of PIC load.
-      if (!ReMatPICStubLoad && MI->getOperand(4).isGlobal())
+      if (!ReMatPICStubLoad && MI->getOperand(1+X86::AddrDisp).isGlobal())
         return false;
       const MachineFunction &MF = *MI->getParent()->getParent();
       const MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -1699,13 +1704,14 @@ X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr *MI,
 
   case X86::LEA32r:
   case X86::LEA64r: {
-    if (MI->getOperand(2).isImm() &&
-        MI->getOperand(3).isReg() && MI->getOperand(3).getReg() == 0 &&
-        !MI->getOperand(4).isReg()) {
+    if (MI->getOperand(1+X86::AddrScaleAmt).isImm() &&
+        MI->getOperand(1+X86::AddrIndexReg).isReg() &&
+        MI->getOperand(1+X86::AddrIndexReg).getReg() == 0 &&
+        !MI->getOperand(1+X86::AddrDisp).isReg()) {
       // lea fi#, lea GV, etc. are all rematerializable.
-      if (!MI->getOperand(1).isReg())
+      if (!MI->getOperand(1+X86::AddrBaseReg).isReg())
         return true;
-      unsigned BaseReg = MI->getOperand(1).getReg();
+      unsigned BaseReg = MI->getOperand(1+X86::AddrBaseReg).getReg();
       if (BaseReg == 0)
         return true;
       // Allow re-materialization of lea PICBase + x.
@@ -1722,12 +1728,8 @@ X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr *MI,
   return true;
 }
 
-/// isSafeToClobberEFLAGS - Return true if it's safe insert an instruction that
-/// would clobber the EFLAGS condition register. Note the result may be
-/// conservative. If it cannot definitely determine the safety after visiting
-/// a few instructions in each direction it assumes it's not safe.
-static bool isSafeToClobberEFLAGS(MachineBasicBlock &MBB,
-                                  MachineBasicBlock::iterator I) {
+bool X86InstrInfo::isSafeToClobberEFLAGS(MachineBasicBlock &MBB,
+                                         MachineBasicBlock::iterator I) const {
   MachineBasicBlock::iterator E = MBB.end();
 
   // For compile time consideration, if we are not able to determine the
@@ -1998,7 +2000,7 @@ X86InstrInfo::convertToThreeAddressWithLEA(unsigned MIOpc,
     unsigned Src2 = MI->getOperand(2).getReg();
     bool isKill2 = MI->getOperand(2).isKill();
     unsigned leaInReg2 = 0;
-    MachineInstr *InsMI2 = 0;
+    MachineInstr *InsMI2 = nullptr;
     if (Src == Src2) {
       // ADD16rr %reg1028<kill>, %reg1028
       // just a single insert_subreg.
@@ -2062,14 +2064,14 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
   // convert them to equivalent lea if the condition code register def's
   // are dead!
   if (hasLiveCondCodeDef(MI))
-    return 0;
+    return nullptr;
 
   MachineFunction &MF = *MI->getParent()->getParent();
   // All instructions input are two-addr instructions.  Get the known operands.
   const MachineOperand &Dest = MI->getOperand(0);
   const MachineOperand &Src = MI->getOperand(1);
 
-  MachineInstr *NewMI = NULL;
+  MachineInstr *NewMI = nullptr;
   // FIXME: 16-bit LEA's are really slow on Athlons, but not bad on P4's.  When
   // we have better subtarget support, enable the 16-bit LEA generation here.
   // 16-bit LEA is also slow on Core2.
@@ -2080,11 +2082,11 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
   switch (MIOpc) {
   case X86::SHUFPSrri: {
     assert(MI->getNumOperands() == 4 && "Unknown shufps instruction!");
-    if (!TM.getSubtarget<X86Subtarget>().hasSSE2()) return 0;
+    if (!TM.getSubtarget<X86Subtarget>().hasSSE2()) return nullptr;
 
     unsigned B = MI->getOperand(1).getReg();
     unsigned C = MI->getOperand(2).getReg();
-    if (B != C) return 0;
+    if (B != C) return nullptr;
     unsigned M = MI->getOperand(3).getImm();
     NewMI = BuildMI(MF, MI->getDebugLoc(), get(X86::PSHUFDri))
       .addOperand(Dest).addOperand(Src).addImm(M);
@@ -2092,11 +2094,11 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
   }
   case X86::SHUFPDrri: {
     assert(MI->getNumOperands() == 4 && "Unknown shufpd instruction!");
-    if (!TM.getSubtarget<X86Subtarget>().hasSSE2()) return 0;
+    if (!TM.getSubtarget<X86Subtarget>().hasSSE2()) return nullptr;
 
     unsigned B = MI->getOperand(1).getReg();
     unsigned C = MI->getOperand(2).getReg();
-    if (B != C) return 0;
+    if (B != C) return nullptr;
     unsigned M = MI->getOperand(3).getImm();
 
     // Convert to PSHUFD mask.
@@ -2109,13 +2111,13 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
   case X86::SHL64ri: {
     assert(MI->getNumOperands() >= 3 && "Unknown shift instruction!");
     unsigned ShAmt = getTruncatedShiftCount(MI, 2);
-    if (!isTruncatedShiftCountForLEA(ShAmt)) return 0;
+    if (!isTruncatedShiftCountForLEA(ShAmt)) return nullptr;
 
     // LEA can't handle RSP.
     if (TargetRegisterInfo::isVirtualRegister(Src.getReg()) &&
         !MF.getRegInfo().constrainRegClass(Src.getReg(),
                                            &X86::GR64_NOSPRegClass))
-      return 0;
+      return nullptr;
 
     NewMI = BuildMI(MF, MI->getDebugLoc(), get(X86::LEA64r))
       .addOperand(Dest)
@@ -2125,7 +2127,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
   case X86::SHL32ri: {
     assert(MI->getNumOperands() >= 3 && "Unknown shift instruction!");
     unsigned ShAmt = getTruncatedShiftCount(MI, 2);
-    if (!isTruncatedShiftCountForLEA(ShAmt)) return 0;
+    if (!isTruncatedShiftCountForLEA(ShAmt)) return nullptr;
 
     unsigned Opc = is64Bit ? X86::LEA64_32r : X86::LEA32r;
 
@@ -2135,7 +2137,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
     MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
     if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false,
                         SrcReg, isKill, isUndef, ImplicitOp))
-      return 0;
+      return nullptr;
 
     MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc))
       .addOperand(Dest)
@@ -2151,10 +2153,10 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
   case X86::SHL16ri: {
     assert(MI->getNumOperands() >= 3 && "Unknown shift instruction!");
     unsigned ShAmt = getTruncatedShiftCount(MI, 2);
-    if (!isTruncatedShiftCountForLEA(ShAmt)) return 0;
+    if (!isTruncatedShiftCountForLEA(ShAmt)) return nullptr;
 
     if (DisableLEA16)
-      return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) : 0;
+      return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) : nullptr;
     NewMI = BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
       .addOperand(Dest)
       .addReg(0).addImm(1 << ShAmt).addOperand(Src).addImm(0).addReg(0);
@@ -2163,7 +2165,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
   default: {
 
     switch (MIOpc) {
-    default: return 0;
+    default: return nullptr;
     case X86::INC64r:
     case X86::INC32r:
     case X86::INC64_32r: {
@@ -2175,7 +2177,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
       MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
       if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false,
                           SrcReg, isKill, isUndef, ImplicitOp))
-        return 0;
+        return nullptr;
 
       MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc))
           .addOperand(Dest)
@@ -2189,7 +2191,8 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
     case X86::INC16r:
     case X86::INC64_16r:
       if (DisableLEA16)
-        return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) : 0;
+        return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV)
+                       : nullptr;
       assert(MI->getNumOperands() >= 2 && "Unknown inc instruction!");
       NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
                         .addOperand(Dest).addOperand(Src), 1);
@@ -2206,7 +2209,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
       MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
       if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ false,
                           SrcReg, isKill, isUndef, ImplicitOp))
-        return 0;
+        return nullptr;
 
       MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc))
           .addOperand(Dest)
@@ -2221,7 +2224,8 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
     case X86::DEC16r:
     case X86::DEC64_16r:
       if (DisableLEA16)
-        return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) : 0;
+        return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV)
+                       : nullptr;
       assert(MI->getNumOperands() >= 2 && "Unknown dec instruction!");
       NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
                         .addOperand(Dest).addOperand(Src), -1);
@@ -2242,7 +2246,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
       MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
       if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true,
                           SrcReg, isKill, isUndef, ImplicitOp))
-        return 0;
+        return nullptr;
 
       const MachineOperand &Src2 = MI->getOperand(2);
       bool isKill2, isUndef2;
@@ -2250,7 +2254,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
       MachineOperand ImplicitOp2 = MachineOperand::CreateReg(0, false);
       if (!classifyLEAReg(MI, Src2, Opc, /*AllowSP=*/ false,
                           SrcReg2, isKill2, isUndef2, ImplicitOp2))
-        return 0;
+        return nullptr;
 
       MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc))
         .addOperand(Dest);
@@ -2272,7 +2276,8 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
     case X86::ADD16rr:
     case X86::ADD16rr_DB: {
       if (DisableLEA16)
-        return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) : 0;
+        return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV)
+                       : nullptr;
       assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
       unsigned Src2 = MI->getOperand(2).getReg();
       bool isKill2 = MI->getOperand(2).isKill();
@@ -2311,7 +2316,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
       MachineOperand ImplicitOp = MachineOperand::CreateReg(0, false);
       if (!classifyLEAReg(MI, Src, Opc, /*AllowSP=*/ true,
                           SrcReg, isKill, isUndef, ImplicitOp))
-        return 0;
+        return nullptr;
 
       MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), get(Opc))
           .addOperand(Dest)
@@ -2327,7 +2332,8 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
     case X86::ADD16ri_DB:
     case X86::ADD16ri8_DB:
       if (DisableLEA16)
-        return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV) : 0;
+        return is64Bit ? convertToThreeAddressWithLEA(MIOpc, MFI, MBBI, LV)
+                       : nullptr;
       assert(MI->getNumOperands() >= 3 && "Unknown add instruction!");
       NewMI = addOffset(BuildMI(MF, MI->getDebugLoc(), get(X86::LEA16r))
                         .addOperand(Dest).addOperand(Src),
@@ -2337,7 +2343,7 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
   }
   }
 
-  if (!NewMI) return 0;
+  if (!NewMI) return nullptr;
 
   if (LV) {  // Update live variables
     if (Src.isKill())
@@ -2789,11 +2795,11 @@ bool X86InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
         std::next(I)->eraseFromParent();
 
       Cond.clear();
-      FBB = 0;
+      FBB = nullptr;
 
       // Delete the JMP if it's equivalent to a fall-through.
       if (MBB.isLayoutSuccessor(I->getOperand(0).getMBB())) {
-        TBB = 0;
+        TBB = nullptr;
         I->eraseFromParent();
         I = MBB.end();
         UnCondBrIter = MBB.end();
@@ -3549,6 +3555,26 @@ inline static bool isDefConvertible(MachineInstr *MI) {
   }
 }
 
+/// isUseDefConvertible - check whether the use can be converted
+/// to remove a comparison against zero.
+static X86::CondCode isUseDefConvertible(MachineInstr *MI) {
+  switch (MI->getOpcode()) {
+  default: return X86::COND_INVALID;
+  case X86::LZCNT16rr: case X86::LZCNT16rm:
+  case X86::LZCNT32rr: case X86::LZCNT32rm:
+  case X86::LZCNT64rr: case X86::LZCNT64rm:
+    return X86::COND_B;
+  case X86::POPCNT16rr:case X86::POPCNT16rm:
+  case X86::POPCNT32rr:case X86::POPCNT32rm:
+  case X86::POPCNT64rr:case X86::POPCNT64rm:
+    return X86::COND_E;
+  case X86::TZCNT16rr: case X86::TZCNT16rm:
+  case X86::TZCNT32rr: case X86::TZCNT32rm:
+  case X86::TZCNT64rr: case X86::TZCNT64rm:
+    return X86::COND_B;
+  }
+}
+
 /// optimizeCompareInstr - Check if there exists an earlier instruction that
 /// operates on the same source operands and sets flags in the same way as
 /// Compare; remove Compare if possible.
@@ -3615,13 +3641,38 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2,
   // If we are comparing against zero, check whether we can use MI to update
   // EFLAGS. If MI is not in the same BB as CmpInstr, do not optimize.
   bool IsCmpZero = (SrcReg2 == 0 && CmpValue == 0);
-  if (IsCmpZero && (MI->getParent() != CmpInstr->getParent() ||
-      !isDefConvertible(MI)))
+  if (IsCmpZero && MI->getParent() != CmpInstr->getParent())
     return false;
 
+  // If we have a use of the source register between the def and our compare
+  // instruction we can eliminate the compare iff the use sets EFLAGS in the
+  // right way.
+  bool ShouldUpdateCC = false;
+  X86::CondCode NewCC = X86::COND_INVALID;
+  if (IsCmpZero && !isDefConvertible(MI)) {
+    // Scan forward from the use until we hit the use we're looking for or the
+    // compare instruction.
+    for (MachineBasicBlock::iterator J = MI;; ++J) {
+      // Do we have a convertible instruction?
+      NewCC = isUseDefConvertible(J);
+      if (NewCC != X86::COND_INVALID && J->getOperand(1).isReg() &&
+          J->getOperand(1).getReg() == SrcReg) {
+        assert(J->definesRegister(X86::EFLAGS) && "Must be an EFLAGS def!");
+        ShouldUpdateCC = true; // Update CC later on.
+        // This is not a def of SrcReg, but still a def of EFLAGS. Keep going
+        // with the new def.
+        MI = Def = J;
+        break;
+      }
+
+      if (J == I)
+        return false;
+    }
+  }
+
   // We are searching for an earlier instruction that can make CmpInstr
   // redundant and that instruction will be saved in Sub.
-  MachineInstr *Sub = NULL;
+  MachineInstr *Sub = nullptr;
   const TargetRegisterInfo *TRI = &getRegisterInfo();
 
   // We iterate backward, starting from the instruction before CmpInstr and
@@ -3634,7 +3685,7 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2,
       RE = CmpInstr->getParent() == MI->getParent() ?
            MachineBasicBlock::reverse_iterator(++Def) /* points to MI */ :
            CmpInstr->getParent()->rend();
-  MachineInstr *Movr0Inst = 0;
+  MachineInstr *Movr0Inst = nullptr;
   for (; RI != RE; ++RI) {
     MachineInstr *Instr = &*RI;
     // Check whether CmpInstr can be made redundant by the current instruction.
@@ -3716,13 +3767,28 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2,
         // CF and OF are used, we can't perform this optimization.
         return false;
       }
+
+      // If we're updating the condition code check if we have to reverse the
+      // condition.
+      if (ShouldUpdateCC)
+        switch (OldCC) {
+        default:
+          return false;
+        case X86::COND_E:
+          break;
+        case X86::COND_NE:
+          NewCC = GetOppositeBranchCondition(NewCC);
+          break;
+        }
     } else if (IsSwapped) {
       // If we have SUB(r1, r2) and CMP(r2, r1), the condition code needs
       // to be changed from r2 > r1 to r1 < r2, from r2 < r1 to r1 > r2, etc.
       // We swap the condition code and synthesize the new opcode.
-      X86::CondCode NewCC = getSwappedCondition(OldCC);
+      NewCC = getSwappedCondition(OldCC);
       if (NewCC == X86::COND_INVALID) return false;
+    }
 
+    if ((ShouldUpdateCC || IsSwapped) && NewCC != OldCC) {
       // Synthesize the new opcode.
       bool HasMemoryOperand = Instr.hasOneMemOperand();
       unsigned NewOpc;
@@ -3809,19 +3875,19 @@ optimizeLoadInstr(MachineInstr *MI, const MachineRegisterInfo *MRI,
                   unsigned &FoldAsLoadDefReg,
                   MachineInstr *&DefMI) const {
   if (FoldAsLoadDefReg == 0)
-    return 0;
+    return nullptr;
   // To be conservative, if there exists another load, clear the load candidate.
   if (MI->mayLoad()) {
     FoldAsLoadDefReg = 0;
-    return 0;
+    return nullptr;
   }
 
   // Check whether we can move DefMI here.
   DefMI = MRI->getVRegDef(FoldAsLoadDefReg);
   assert(DefMI);
   bool SawStore = false;
-  if (!DefMI->isSafeToMove(this, 0, SawStore))
-    return 0;
+  if (!DefMI->isSafeToMove(this, nullptr, SawStore))
+    return nullptr;
 
   // We try to commute MI if possible.
   unsigned IdxEnd = (MI->isCommutable()) ? 2 : 1;
@@ -3838,12 +3904,12 @@ optimizeLoadInstr(MachineInstr *MI, const MachineRegisterInfo *MRI,
         continue;
       // Do not fold if we have a subreg use or a def or multiple uses.
       if (MO.getSubReg() || MO.isDef() || FoundSrcOperand)
-        return 0;
+        return nullptr;
 
       SrcOperandId = i;
       FoundSrcOperand = true;
     }
-    if (!FoundSrcOperand) return 0;
+    if (!FoundSrcOperand) return nullptr;
 
     // Check whether we can fold the def into SrcOperandId.
     SmallVector<unsigned, 8> Ops;
@@ -3857,22 +3923,22 @@ optimizeLoadInstr(MachineInstr *MI, const MachineRegisterInfo *MRI,
     if (Idx == 1) {
       // MI was changed but it didn't help, commute it back!
       commuteInstruction(MI, false);
-      return 0;
+      return nullptr;
     }
 
     // Check whether we can commute MI and enable folding.
     if (MI->isCommutable()) {
       MachineInstr *NewMI = commuteInstruction(MI, false);
       // Unable to commute.
-      if (!NewMI) return 0;
+      if (!NewMI) return nullptr;
       if (NewMI != MI) {
         // New instruction. It doesn't need to be kept.
         NewMI->eraseFromParent();
-        return 0;
+        return nullptr;
       }
     }
   }
-  return 0;
+  return nullptr;
 }
 
 /// Expand2AddrUndef - Expand a single-def pseudo instruction to a two-addr
@@ -4007,7 +4073,8 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
                                     MachineInstr *MI, unsigned i,
                                     const SmallVectorImpl<MachineOperand> &MOs,
                                     unsigned Size, unsigned Align) const {
-  const DenseMap<unsigned, std::pair<unsigned,unsigned> > *OpcodeTablePtr = 0;
+  const DenseMap<unsigned,
+                 std::pair<unsigned,unsigned> > *OpcodeTablePtr = nullptr;
   bool isCallRegIndirect = TM.getSubtarget<X86Subtarget>().callRegIndirect();
   bool isTwoAddrFold = false;
 
@@ -4015,7 +4082,7 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
   // when X86Subtarget is Atom.
   if (isCallRegIndirect &&
     (MI->getOpcode() == X86::CALL32r || MI->getOpcode() == X86::CALL64r)) {
-    return NULL;
+    return nullptr;
   }
 
   unsigned NumOps = MI->getDesc().getNumOperands();
@@ -4026,9 +4093,9 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
   // X86II::MO_GOT_ABSOLUTE_ADDRESS after folding.
   if (MI->getOpcode() == X86::ADD32ri &&
       MI->getOperand(2).getTargetFlags() == X86II::MO_GOT_ABSOLUTE_ADDRESS)
-    return NULL;
+    return nullptr;
 
-  MachineInstr *NewMI = NULL;
+  MachineInstr *NewMI = nullptr;
   // Folding a memory location into the two-address part of a two-address
   // instruction is different than folding it other places.  It requires
   // replacing the *two* registers with the memory location.
@@ -4063,7 +4130,7 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
       unsigned Opcode = I->second.first;
       unsigned MinAlign = (I->second.second & TB_ALIGN_MASK) >> TB_ALIGN_SHIFT;
       if (Align < MinAlign)
-        return NULL;
+        return nullptr;
       bool NarrowToMOV32rm = false;
       if (Size) {
         unsigned RCSize = getRegClass(MI->getDesc(), i, &RI, MF)->getSize();
@@ -4071,12 +4138,12 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
           // Check if it's safe to fold the load. If the size of the object is
           // narrower than the load width, then it's not.
           if (Opcode != X86::MOV64rm || RCSize != 8 || Size != 4)
-            return NULL;
+            return nullptr;
           // If this is a 64-bit load, but the spill slot is 32, then we can do
           // a 32-bit load which is implicitly zero-extended. This likely is due
           // to liveintervalanalysis remat'ing a load from stack slot.
           if (MI->getOperand(0).getSubReg() || MI->getOperand(1).getSubReg())
-            return NULL;
+            return nullptr;
           Opcode = X86::MOV32rm;
           NarrowToMOV32rm = true;
         }
@@ -4105,7 +4172,7 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
   // No fusion
   if (PrintFailedFusing && !MI->isCopy())
     dbgs() << "We failed to fuse operand " << i << " in " << *MI;
-  return NULL;
+  return nullptr;
 }
 
 /// hasPartialRegUpdate - Return true for all instructions that only update
@@ -4270,14 +4337,14 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI,
                                     const SmallVectorImpl<unsigned> &Ops,
                                     int FrameIndex) const {
   // Check switch flag
-  if (NoFusing) return NULL;
+  if (NoFusing) return nullptr;
 
   // Unless optimizing for size, don't fold to avoid partial
   // register update stalls
   if (!MF.getFunction()->getAttributes().
         hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize) &&
       hasPartialRegUpdate(MI->getOpcode()))
-    return 0;
+    return nullptr;
 
   const MachineFrameInfo *MFI = MF.getFrameInfo();
   unsigned Size = MFI->getObjectSize(FrameIndex);
@@ -4290,7 +4357,7 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI,
     unsigned NewOpc = 0;
     unsigned RCSize = 0;
     switch (MI->getOpcode()) {
-    default: return NULL;
+    default: return nullptr;
     case X86::TEST8rr:  NewOpc = X86::CMP8ri; RCSize = 1; break;
     case X86::TEST16rr: NewOpc = X86::CMP16ri8; RCSize = 2; break;
     case X86::TEST32rr: NewOpc = X86::CMP32ri8; RCSize = 4; break;
@@ -4299,12 +4366,12 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr *MI,
     // Check if it's safe to fold the load. If the size of the object is
     // narrower than the load width, then it's not.
     if (Size < RCSize)
-      return NULL;
+      return nullptr;
     // Change to CMPXXri r, 0 first.
     MI->setDesc(get(NewOpc));
     MI->getOperand(1).ChangeToImmediate(0);
   } else if (Ops.size() != 1)
-    return NULL;
+    return nullptr;
 
   SmallVector<MachineOperand,4> MOs;
   MOs.push_back(MachineOperand::CreateFI(FrameIndex));
@@ -4322,14 +4389,14 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
     return foldMemoryOperandImpl(MF, MI, Ops, FrameIndex);
 
   // Check switch flag
-  if (NoFusing) return NULL;
+  if (NoFusing) return nullptr;
 
   // Unless optimizing for size, don't fold to avoid partial
   // register update stalls
   if (!MF.getFunction()->getAttributes().
         hasAttribute(AttributeSet::FunctionIndex, Attribute::OptimizeForSize) &&
       hasPartialRegUpdate(MI->getOpcode()))
-    return 0;
+    return nullptr;
 
   // Determine the alignment of the load.
   unsigned Alignment = 0;
@@ -4352,12 +4419,12 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
       Alignment = 4;
       break;
     default:
-      return 0;
+      return nullptr;
     }
   if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) {
     unsigned NewOpc = 0;
     switch (MI->getOpcode()) {
-    default: return NULL;
+    default: return nullptr;
     case X86::TEST8rr:  NewOpc = X86::CMP8ri; break;
     case X86::TEST16rr: NewOpc = X86::CMP16ri8; break;
     case X86::TEST32rr: NewOpc = X86::CMP32ri8; break;
@@ -4367,12 +4434,12 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
     MI->setDesc(get(NewOpc));
     MI->getOperand(1).ChangeToImmediate(0);
   } else if (Ops.size() != 1)
-    return NULL;
+    return nullptr;
 
   // Make sure the subregisters match.
   // Otherwise we risk changing the size of the load.
   if (LoadMI->getOperand(0).getSubReg() != MI->getOperand(Ops[0]).getSubReg())
-    return NULL;
+    return nullptr;
 
   SmallVector<MachineOperand,X86::AddrNumOperands> MOs;
   switch (LoadMI->getOpcode()) {
@@ -4388,7 +4455,7 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
     // Medium and large mode can't fold loads this way.
     if (TM.getCodeModel() != CodeModel::Small &&
         TM.getCodeModel() != CodeModel::Kernel)
-      return NULL;
+      return nullptr;
 
     // x86-32 PIC requires a PIC base register for constant pools.
     unsigned PICBase = 0;
@@ -4400,7 +4467,7 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
         // This doesn't work for several reasons.
         // 1. GlobalBaseReg may have been spilled.
         // 2. It may not be live at MI.
-        return NULL;
+        return nullptr;
     }
 
     // Create a constant-pool entry.
@@ -4436,14 +4503,14 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF,
           > 4)
       // These instructions only load 32 bits, we can't fold them if the
       // destination register is wider than 32 bits (4 bytes).
-      return NULL;
+      return nullptr;
     if ((LoadMI->getOpcode() == X86::MOVSDrm ||
          LoadMI->getOpcode() == X86::VMOVSDrm) &&
         MF.getRegInfo().getRegClass(LoadMI->getOperand(0).getReg())->getSize()
           > 8)
       // These instructions only load 64 bits, we can't fold them if the
       // destination register is wider than 64 bits (8 bytes).
-      return NULL;
+      return nullptr;
 
     // Folding a normal load. Just copy the load's address operands.
     for (unsigned i = NumOps - X86::AddrNumOperands; i != NumOps; ++i)
@@ -4489,7 +4556,8 @@ bool X86InstrInfo::canFoldMemoryOperand(const MachineInstr *MI,
   // Folding a memory location into the two-address part of a two-address
   // instruction is different than folding it other places.  It requires
   // replacing the *two* registers with the memory location.
-  const DenseMap<unsigned, std::pair<unsigned,unsigned> > *OpcodeTablePtr = 0;
+  const DenseMap<unsigned,
+                 std::pair<unsigned,unsigned> > *OpcodeTablePtr = nullptr;
   if (isTwoAddr && NumOps >= 2 && OpNum < 2) {
     OpcodeTablePtr = &RegOp2MemOpTable2Addr;
   } else if (OpNum == 0) { // If operand 0
@@ -4671,7 +4739,7 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
   AddrOps.push_back(Chain);
 
   // Emit the load instruction.
-  SDNode *Load = 0;
+  SDNode *Load = nullptr;
   if (FoldedLoad) {
     EVT VT = *RC->vt_begin();
     std::pair<MachineInstr::mmo_iterator,
@@ -4696,7 +4764,7 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
 
   // Emit the data processing instruction.
   std::vector<EVT> VTs;
-  const TargetRegisterClass *DstRC = 0;
+  const TargetRegisterClass *DstRC = nullptr;
   if (MCID.getNumDefs() > 0) {
     DstRC = getRegClass(MCID, 0, &RI, MF);
     VTs.push_back(*DstRC->vt_begin());
@@ -5190,14 +5258,14 @@ static const uint16_t *lookup(unsigned opcode, unsigned domain) {
   for (unsigned i = 0, e = array_lengthof(ReplaceableInstrs); i != e; ++i)
     if (ReplaceableInstrs[i][domain-1] == opcode)
       return ReplaceableInstrs[i];
-  return 0;
+  return nullptr;
 }
 
 static const uint16_t *lookupAVX2(unsigned opcode, unsigned domain) {
   for (unsigned i = 0, e = array_lengthof(ReplaceableInstrsAVX2); i != e; ++i)
     if (ReplaceableInstrsAVX2[i][domain-1] == opcode)
       return ReplaceableInstrsAVX2[i];
-  return 0;
+  return nullptr;
 }
 
 std::pair<uint16_t, uint16_t>
@@ -5327,8 +5395,10 @@ namespace {
       const X86TargetMachine *TM =
         static_cast<const X86TargetMachine *>(&MF.getTarget());
 
-      assert(!TM->getSubtarget<X86Subtarget>().is64Bit() &&
-             "X86-64 PIC uses RIP relative addressing");
+      // Don't do anything if this is 64-bit as 64-bit PIC
+      // uses RIP relative addressing.
+      if (TM->getSubtarget<X86Subtarget>().is64Bit())
+        return false;
 
       // Only emit a global base reg in PIC mode.
       if (TM->getRelocationModel() != Reloc::PIC_)
@@ -5383,7 +5453,7 @@ namespace {
 
 char CGBR::ID = 0;
 FunctionPass*
-llvm::createGlobalBaseRegPass() { return new CGBR(); }
+llvm::createX86GlobalBaseRegPass() { return new CGBR(); }
 
 namespace {
   struct LDTLSCleanup : public MachineFunctionPass {
diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h
index 156291e..5f34915 100644
--- a/lib/Target/X86/X86InstrInfo.h
+++ b/lib/Target/X86/X86InstrInfo.h
@@ -325,7 +325,7 @@ public:
   /// value.
   unsigned getOpcodeAfterMemoryUnfold(unsigned Opc,
                               bool UnfoldLoad, bool UnfoldStore,
-                              unsigned *LoadRegIndex = 0) const override;
+                              unsigned *LoadRegIndex = nullptr) const override;
 
   /// areLoadsFromSameBasePtr - This is used by the pre-regalloc scheduler
   /// to determine if two loads are loading from the same base address. It
@@ -359,6 +359,13 @@ public:
   /// instruction that defines the specified register class.
   bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const override;
 
+  /// isSafeToClobberEFLAGS - Return true if it's safe insert an instruction tha
+  /// would clobber the EFLAGS condition register. Note the result may be
+  /// conservative. If it cannot definitely determine the safety after visiting
+  /// a few instructions in each direction it assumes it's not safe.
+  bool isSafeToClobberEFLAGS(MachineBasicBlock &MBB,
+                             MachineBasicBlock::iterator I) const;
+
   static bool isX86_64ExtendedReg(const MachineOperand &MO) {
     if (!MO.isReg()) return false;
     return X86II::isX86_64ExtendedReg(MO.getReg());
diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td
index 8edf873..0d97669 100644
--- a/lib/Target/X86/X86InstrInfo.td
+++ b/lib/Target/X86/X86InstrInfo.td
@@ -206,6 +206,8 @@ def X86rep_movs: SDNode<"X86ISD::REP_MOVS", SDTX86RepStr,
 
 def X86rdtsc   : SDNode<"X86ISD::RDTSC_DAG", SDTX86Void,
                         [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>;
+def X86rdtscp  : SDNode<"X86ISD::RDTSCP_DAG", SDTX86Void,
+                        [SDNPHasChain, SDNPOutGlue, SDNPSideEffect]>;
 
 def X86Wrapper    : SDNode<"X86ISD::Wrapper",     SDTX86Wrapper>;
 def X86WrapperRIP : SDNode<"X86ISD::WrapperRIP",  SDTX86Wrapper>;
@@ -249,7 +251,6 @@ def X86xor_flag  : SDNode<"X86ISD::XOR",  SDTBinaryArithWithFlags,
 def X86and_flag  : SDNode<"X86ISD::AND",  SDTBinaryArithWithFlags,
                           [SDNPCommutative]>;
 
-def X86bzhi   : SDNode<"X86ISD::BZHI",   SDTIntShiftOp>;
 def X86bextr  : SDNode<"X86ISD::BEXTR",  SDTIntBinOp>;
 
 def X86mul_imm : SDNode<"X86ISD::MUL_IMM", SDTIntBinOp>;
@@ -2001,6 +2002,46 @@ let Predicates = [HasLZCNT], Defs = [EFLAGS] in {
                       (implicit EFLAGS)]>, XS;
 }
 
+let Predicates = [HasLZCNT] in {
+  def : Pat<(X86cmov (ctlz GR16:$src), (i16 16), (X86_COND_E),
+              (X86cmp GR16:$src, (i16 0))), 
+            (LZCNT16rr GR16:$src)>;
+  def : Pat<(X86cmov (ctlz GR32:$src), (i32 32), (X86_COND_E),
+              (X86cmp GR32:$src, (i32 0))),
+            (LZCNT32rr GR32:$src)>;
+  def : Pat<(X86cmov (ctlz GR64:$src), (i64 64), (X86_COND_E),
+              (X86cmp GR64:$src, (i64 0))),
+            (LZCNT64rr GR64:$src)>;
+  def : Pat<(X86cmov (i16 16), (ctlz GR16:$src), (X86_COND_E),
+              (X86cmp GR16:$src, (i16 0))),
+            (LZCNT16rr GR16:$src)>;
+  def : Pat<(X86cmov (i32 32), (ctlz GR32:$src), (X86_COND_E),
+              (X86cmp GR32:$src, (i32 0))),
+            (LZCNT32rr GR32:$src)>;
+  def : Pat<(X86cmov (i64 64), (ctlz GR64:$src), (X86_COND_E),
+              (X86cmp GR64:$src, (i64 0))),
+            (LZCNT64rr GR64:$src)>;
+
+  def : Pat<(X86cmov (ctlz (loadi16 addr:$src)), (i16 16), (X86_COND_E),
+              (X86cmp (loadi16 addr:$src), (i16 0))), 
+            (LZCNT16rm addr:$src)>;
+  def : Pat<(X86cmov (ctlz (loadi32 addr:$src)), (i32 32), (X86_COND_E),
+              (X86cmp (loadi32 addr:$src), (i32 0))), 
+            (LZCNT32rm addr:$src)>;
+  def : Pat<(X86cmov (ctlz (loadi64 addr:$src)), (i64 64), (X86_COND_E),
+              (X86cmp (loadi64 addr:$src), (i64 0))), 
+            (LZCNT64rm addr:$src)>;
+  def : Pat<(X86cmov (i16 16), (ctlz (loadi16 addr:$src)), (X86_COND_E),
+              (X86cmp (loadi16 addr:$src), (i16 0))), 
+            (LZCNT16rm addr:$src)>;
+  def : Pat<(X86cmov (i32 32), (ctlz (loadi32 addr:$src)), (X86_COND_E),
+              (X86cmp (loadi32 addr:$src), (i32 0))), 
+            (LZCNT32rm addr:$src)>;
+  def : Pat<(X86cmov (i64 64), (ctlz (loadi64 addr:$src)), (X86_COND_E),
+              (X86cmp (loadi64 addr:$src), (i64 0))), 
+            (LZCNT64rm addr:$src)>;
+}
+
 //===----------------------------------------------------------------------===//
 // BMI Instructions
 //
@@ -2077,6 +2118,47 @@ let Predicates = [HasBMI] in {
             (BLSI64rr GR64:$src)>;
 }
 
+let Predicates = [HasBMI] in {
+  def : Pat<(X86cmov (cttz GR16:$src), (i16 16), (X86_COND_E),
+              (X86cmp GR16:$src, (i16 0))),
+            (TZCNT16rr GR16:$src)>;
+  def : Pat<(X86cmov (cttz GR32:$src), (i32 32), (X86_COND_E),
+              (X86cmp GR32:$src, (i32 0))),
+            (TZCNT32rr GR32:$src)>;
+  def : Pat<(X86cmov (cttz GR64:$src), (i64 64), (X86_COND_E),
+              (X86cmp GR64:$src, (i64 0))),
+            (TZCNT64rr GR64:$src)>;
+  def : Pat<(X86cmov (i16 16), (cttz GR16:$src), (X86_COND_E),
+              (X86cmp GR16:$src, (i16 0))),
+            (TZCNT16rr GR16:$src)>;
+  def : Pat<(X86cmov (i32 32), (cttz GR32:$src), (X86_COND_E),
+              (X86cmp GR32:$src, (i32 0))),
+            (TZCNT32rr GR32:$src)>;
+  def : Pat<(X86cmov (i64 64), (cttz GR64:$src), (X86_COND_E),
+              (X86cmp GR64:$src, (i64 0))),
+            (TZCNT64rr GR64:$src)>;
+
+  def : Pat<(X86cmov (cttz (loadi16 addr:$src)), (i16 16), (X86_COND_E),
+              (X86cmp (loadi16 addr:$src), (i16 0))), 
+            (TZCNT16rm addr:$src)>;
+  def : Pat<(X86cmov (cttz (loadi32 addr:$src)), (i32 32), (X86_COND_E),
+              (X86cmp (loadi32 addr:$src), (i32 0))), 
+            (TZCNT32rm addr:$src)>;
+  def : Pat<(X86cmov (cttz (loadi64 addr:$src)), (i64 64), (X86_COND_E),
+              (X86cmp (loadi64 addr:$src), (i64 0))), 
+            (TZCNT64rm addr:$src)>;
+  def : Pat<(X86cmov (i16 16), (cttz (loadi16 addr:$src)), (X86_COND_E),
+              (X86cmp (loadi16 addr:$src), (i16 0))), 
+            (TZCNT16rm addr:$src)>;
+  def : Pat<(X86cmov (i32 32), (cttz (loadi32 addr:$src)), (X86_COND_E),
+              (X86cmp (loadi32 addr:$src), (i32 0))), 
+            (TZCNT32rm addr:$src)>;
+  def : Pat<(X86cmov (i64 64), (cttz (loadi64 addr:$src)), (X86_COND_E),
+              (X86cmp (loadi64 addr:$src), (i64 0))), 
+            (TZCNT64rm addr:$src)>;
+}
+
+
 multiclass bmi_bextr_bzhi<bits<8> opc, string mnemonic, RegisterClass RC,
                           X86MemOperand x86memop, Intrinsic Int,
                           PatFrag ld_frag> {
@@ -2104,18 +2186,38 @@ let Predicates = [HasBMI2], Defs = [EFLAGS] in {
                                int_x86_bmi_bzhi_64, loadi64>, VEX_W;
 }
 
-def : Pat<(X86bzhi GR32:$src1, GR8:$src2),
-          (BZHI32rr GR32:$src1,
-                    (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
-def : Pat<(X86bzhi (loadi32 addr:$src1), GR8:$src2),
-          (BZHI32rm addr:$src1,
-                    (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
-def : Pat<(X86bzhi GR64:$src1, GR8:$src2),
-          (BZHI64rr GR64:$src1,
-                    (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
-def : Pat<(X86bzhi (loadi64 addr:$src1), GR8:$src2),
-          (BZHI64rm addr:$src1,
-                    (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+
+def CountTrailingOnes : SDNodeXForm<imm, [{
+  // Count the trailing ones in the immediate.
+  return getI8Imm(CountTrailingOnes_64(N->getZExtValue()));
+}]>;
+
+def BZHIMask : ImmLeaf<i64, [{
+  return isMask_64(Imm) && (CountTrailingOnes_64(Imm) > 32);
+}]>;
+
+let Predicates = [HasBMI2] in {
+  def : Pat<(and GR64:$src, BZHIMask:$mask),
+            (BZHI64rr GR64:$src,
+              (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+                             (MOV8ri (CountTrailingOnes imm:$mask)), sub_8bit))>;
+
+  def : Pat<(and GR32:$src, (add (shl 1, GR8:$lz), -1)),
+            (BZHI32rr GR32:$src,
+              (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR8:$lz, sub_8bit))>;
+
+  def : Pat<(and (loadi32 addr:$src), (add (shl 1, GR8:$lz), -1)),
+            (BZHI32rm addr:$src,
+              (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR8:$lz, sub_8bit))>;
+
+  def : Pat<(and GR64:$src, (add (shl 1, GR8:$lz), -1)),
+            (BZHI64rr GR64:$src,
+              (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR8:$lz, sub_8bit))>;
+
+  def : Pat<(and (loadi64 addr:$src), (add (shl 1, GR8:$lz), -1)),
+            (BZHI64rm addr:$src,
+              (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR8:$lz, sub_8bit))>;
+} // HasBMI2
 
 let Predicates = [HasBMI] in {
   def : Pat<(X86bextr GR32:$src1, GR32:$src2),
@@ -2617,21 +2719,21 @@ def : InstAlias<"fnstsw"     , (FNSTSW16r)>;
 
 // lcall and ljmp aliases.  This seems to be an odd mapping in 64-bit mode, but
 // this is compatible with what GAS does.
-def : InstAlias<"lcall $seg, $off", (FARCALL32i i32imm:$off, i16imm:$seg)>, Requires<[Not16BitMode]>;
-def : InstAlias<"ljmp $seg, $off",  (FARJMP32i  i32imm:$off, i16imm:$seg)>, Requires<[Not16BitMode]>;
-def : InstAlias<"lcall *$dst",      (FARCALL32m opaque48mem:$dst)>, Requires<[Not16BitMode]>;
-def : InstAlias<"ljmp *$dst",       (FARJMP32m  opaque48mem:$dst)>, Requires<[Not16BitMode]>;
-def : InstAlias<"lcall $seg, $off", (FARCALL16i i16imm:$off, i16imm:$seg)>, Requires<[In16BitMode]>;
-def : InstAlias<"ljmp $seg, $off",  (FARJMP16i  i16imm:$off, i16imm:$seg)>, Requires<[In16BitMode]>;
-def : InstAlias<"lcall *$dst",      (FARCALL16m opaque32mem:$dst)>, Requires<[In16BitMode]>;
-def : InstAlias<"ljmp *$dst",       (FARJMP16m  opaque32mem:$dst)>, Requires<[In16BitMode]>;
-
-def : InstAlias<"call *$dst",       (CALL64m i16mem:$dst)>, Requires<[In64BitMode]>;
-def : InstAlias<"jmp *$dst",        (JMP64m  i16mem:$dst)>, Requires<[In64BitMode]>;
-def : InstAlias<"call *$dst",       (CALL32m i16mem:$dst)>, Requires<[In32BitMode]>;
-def : InstAlias<"jmp *$dst",        (JMP32m  i16mem:$dst)>, Requires<[In32BitMode]>;
-def : InstAlias<"call *$dst",       (CALL16m i16mem:$dst)>, Requires<[In16BitMode]>;
-def : InstAlias<"jmp *$dst",        (JMP16m  i16mem:$dst)>, Requires<[In16BitMode]>;
+def : InstAlias<"lcall $seg, $off", (FARCALL32i i32imm:$off, i16imm:$seg), 0>, Requires<[Not16BitMode]>;
+def : InstAlias<"ljmp $seg, $off",  (FARJMP32i  i32imm:$off, i16imm:$seg), 0>, Requires<[Not16BitMode]>;
+def : InstAlias<"lcall *$dst",      (FARCALL32m opaque48mem:$dst), 0>, Requires<[Not16BitMode]>;
+def : InstAlias<"ljmp *$dst",       (FARJMP32m  opaque48mem:$dst), 0>, Requires<[Not16BitMode]>;
+def : InstAlias<"lcall $seg, $off", (FARCALL16i i16imm:$off, i16imm:$seg), 0>, Requires<[In16BitMode]>;
+def : InstAlias<"ljmp $seg, $off",  (FARJMP16i  i16imm:$off, i16imm:$seg), 0>, Requires<[In16BitMode]>;
+def : InstAlias<"lcall *$dst",      (FARCALL16m opaque32mem:$dst), 0>, Requires<[In16BitMode]>;
+def : InstAlias<"ljmp *$dst",       (FARJMP16m  opaque32mem:$dst), 0>, Requires<[In16BitMode]>;
+
+def : InstAlias<"call *$dst",       (CALL64m i16mem:$dst), 0>, Requires<[In64BitMode]>;
+def : InstAlias<"jmp *$dst",        (JMP64m  i16mem:$dst), 0>, Requires<[In64BitMode]>;
+def : InstAlias<"call *$dst",       (CALL32m i16mem:$dst), 0>, Requires<[In32BitMode]>;
+def : InstAlias<"jmp *$dst",        (JMP32m  i16mem:$dst), 0>, Requires<[In32BitMode]>;
+def : InstAlias<"call *$dst",       (CALL16m i16mem:$dst), 0>, Requires<[In16BitMode]>;
+def : InstAlias<"jmp *$dst",        (JMP16m  i16mem:$dst), 0>, Requires<[In16BitMode]>;
 
 
 // "imul <imm>, B" is an alias for "imul <imm>, B, B".
@@ -2664,11 +2766,11 @@ def : InstAlias<"jmpl $seg, $off",  (FARJMP32i  i32imm:$off, i16imm:$seg)>;
 // Force mov without a suffix with a segment and mem to prefer the 'l' form of
 // the move.  All segment/mem forms are equivalent, this has the shortest
 // encoding.
-def : InstAlias<"mov $mem, $seg", (MOV32sm SEGMENT_REG:$seg, i32mem:$mem)>;
-def : InstAlias<"mov $seg, $mem", (MOV32ms i32mem:$mem, SEGMENT_REG:$seg)>;
+def : InstAlias<"mov $mem, $seg", (MOV32sm SEGMENT_REG:$seg, i32mem:$mem), 0>;
+def : InstAlias<"mov $seg, $mem", (MOV32ms i32mem:$mem, SEGMENT_REG:$seg), 0>;
 
 // Match 'movq <largeimm>, <reg>' as an alias for movabsq.
-def : InstAlias<"movq $imm, $reg", (MOV64ri GR64:$reg, i64imm:$imm)>;
+def : InstAlias<"movq $imm, $reg", (MOV64ri GR64:$reg, i64imm:$imm), 0>;
 
 // Match 'movq GR64, MMX' as an alias for movd.
 def : InstAlias<"movq $src, $dst",
@@ -2705,7 +2807,7 @@ def : InstAlias<"outl\t$port", (OUT32ir i8imm:$port), 0>;
 // 'sldt <mem>' can be encoded with either sldtw or sldtq with the same
 // effect (both store to a 16-bit mem).  Force to sldtw to avoid ambiguity
 // errors, since its encoding is the most compact.
-def : InstAlias<"sldt $mem", (SLDT16m i16mem:$mem)>;
+def : InstAlias<"sldt $mem", (SLDT16m i16mem:$mem), 0>;
 
 // shld/shrd op,op -> shld op, op, CL
 def : InstAlias<"shld{w}\t{$r2, $r1|$r1, $r2}", (SHLD16rrCL GR16:$r1, GR16:$r2), 0>;
@@ -2751,19 +2853,29 @@ defm : ShiftRotateByOneAlias<"ror", "ROR">;
 FIXME */
 
 // test: We accept "testX <reg>, <mem>" and "testX <mem>, <reg>" as synonyms.
-def : InstAlias<"test{b}\t{$val, $mem|$mem, $val}", (TEST8rm  GR8 :$val, i8mem :$mem)>;
-def : InstAlias<"test{w}\t{$val, $mem|$mem, $val}", (TEST16rm GR16:$val, i16mem:$mem)>;
-def : InstAlias<"test{l}\t{$val, $mem|$mem, $val}", (TEST32rm GR32:$val, i32mem:$mem)>;
-def : InstAlias<"test{q}\t{$val, $mem|$mem, $val}", (TEST64rm GR64:$val, i64mem:$mem)>;
+def : InstAlias<"test{b}\t{$val, $mem|$mem, $val}",
+                (TEST8rm  GR8 :$val, i8mem :$mem), 0>;
+def : InstAlias<"test{w}\t{$val, $mem|$mem, $val}",
+                (TEST16rm GR16:$val, i16mem:$mem), 0>;
+def : InstAlias<"test{l}\t{$val, $mem|$mem, $val}",
+                (TEST32rm GR32:$val, i32mem:$mem), 0>;
+def : InstAlias<"test{q}\t{$val, $mem|$mem, $val}",
+                (TEST64rm GR64:$val, i64mem:$mem), 0>;
 
 // xchg: We accept "xchgX <reg>, <mem>" and "xchgX <mem>, <reg>" as synonyms.
-def : InstAlias<"xchg{b}\t{$mem, $val|$val, $mem}", (XCHG8rm  GR8 :$val, i8mem :$mem)>;
-def : InstAlias<"xchg{w}\t{$mem, $val|$val, $mem}", (XCHG16rm GR16:$val, i16mem:$mem)>;
-def : InstAlias<"xchg{l}\t{$mem, $val|$val, $mem}", (XCHG32rm GR32:$val, i32mem:$mem)>;
-def : InstAlias<"xchg{q}\t{$mem, $val|$val, $mem}", (XCHG64rm GR64:$val, i64mem:$mem)>;
+def : InstAlias<"xchg{b}\t{$mem, $val|$val, $mem}",
+                (XCHG8rm  GR8 :$val, i8mem :$mem), 0>;
+def : InstAlias<"xchg{w}\t{$mem, $val|$val, $mem}",
+                (XCHG16rm GR16:$val, i16mem:$mem), 0>;
+def : InstAlias<"xchg{l}\t{$mem, $val|$val, $mem}",
+                (XCHG32rm GR32:$val, i32mem:$mem), 0>;
+def : InstAlias<"xchg{q}\t{$mem, $val|$val, $mem}",
+                (XCHG64rm GR64:$val, i64mem:$mem), 0>;
 
 // xchg: We accept "xchgX <reg>, %eax" and "xchgX %eax, <reg>" as synonyms.
-def : InstAlias<"xchg{w}\t{%ax, $src|$src, ax}", (XCHG16ar GR16:$src)>;
-def : InstAlias<"xchg{l}\t{%eax, $src|$src, eax}", (XCHG32ar GR32:$src)>, Requires<[Not64BitMode]>;
-def : InstAlias<"xchg{l}\t{%eax, $src|$src, eax}", (XCHG32ar64 GR32_NOAX:$src)>, Requires<[In64BitMode]>;
-def : InstAlias<"xchg{q}\t{%rax, $src|$src, rax}", (XCHG64ar GR64:$src)>;
+def : InstAlias<"xchg{w}\t{%ax, $src|$src, ax}", (XCHG16ar GR16:$src), 0>;
+def : InstAlias<"xchg{l}\t{%eax, $src|$src, eax}",
+                (XCHG32ar GR32:$src), 0>, Requires<[Not64BitMode]>;
+def : InstAlias<"xchg{l}\t{%eax, $src|$src, eax}",
+                (XCHG32ar64 GR32_NOAX:$src), 0>, Requires<[In64BitMode]>;
+def : InstAlias<"xchg{q}\t{%rax, $src|$src, rax}", (XCHG64ar GR64:$src), 0>;
diff --git a/lib/Target/X86/X86InstrMMX.td b/lib/Target/X86/X86InstrMMX.td
index 050ee39..ecf80a1 100644
--- a/lib/Target/X86/X86InstrMMX.td
+++ b/lib/Target/X86/X86InstrMMX.td
@@ -254,6 +254,11 @@ let neverHasSideEffects = 1 in
 def MMX_MOVQ64rr : MMXI<0x6F, MRMSrcReg, (outs VR64:$dst), (ins VR64:$src),
                         "movq\t{$src, $dst|$dst, $src}", [],
                         IIC_MMX_MOVQ_RR>;
+let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
+def MMX_MOVQ64rr_REV : MMXI<0x7F, MRMDestReg, (outs VR64:$dst), (ins VR64:$src),
+                        "movq\t{$src, $dst|$dst, $src}", [],
+                        IIC_MMX_MOVQ_RR>;
+}
 } // SchedRW
 
 let SchedRW = [WriteLoad] in {
@@ -262,11 +267,12 @@ def MMX_MOVQ64rm : MMXI<0x6F, MRMSrcMem, (outs VR64:$dst), (ins i64mem:$src),
                         "movq\t{$src, $dst|$dst, $src}",
                         [(set VR64:$dst, (load_mmx addr:$src))],
                         IIC_MMX_MOVQ_RM>;
+} // SchedRW
+let SchedRW = [WriteStore] in
 def MMX_MOVQ64mr : MMXI<0x7F, MRMDestMem, (outs), (ins i64mem:$dst, VR64:$src),
                         "movq\t{$src, $dst|$dst, $src}",
                         [(store (x86mmx VR64:$src), addr:$dst)],
                         IIC_MMX_MOVQ_RM>;
-} // SchedRW
 
 let SchedRW = [WriteMove] in {
 def MMX_MOVDQ2Qrr : MMXSDIi8<0xD6, MRMSrcReg, (outs VR64:$dst),
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index f2f3967..1eb0485 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -1561,9 +1561,9 @@ defm VCVTSI2SD64 : sse12_vcvt_avx<0x2A, GR64, FR64, i64mem, "cvtsi2sd{q}">,
 
 let Predicates = [UseAVX] in {
   def : InstAlias<"vcvtsi2ss\t{$src, $src1, $dst|$dst, $src1, $src}",
-                (VCVTSI2SSrm FR64:$dst, FR64:$src1, i32mem:$src)>;
+                (VCVTSI2SSrm FR64:$dst, FR64:$src1, i32mem:$src), 0>;
   def : InstAlias<"vcvtsi2sd\t{$src, $src1, $dst|$dst, $src1, $src}",
-                (VCVTSI2SDrm FR64:$dst, FR64:$src1, i32mem:$src)>;
+                (VCVTSI2SDrm FR64:$dst, FR64:$src1, i32mem:$src), 0>;
 
   def : Pat<(f32 (sint_to_fp (loadi32 addr:$src))),
             (VCVTSI2SSrm (f32 (IMPLICIT_DEF)), addr:$src)>;
@@ -1627,9 +1627,9 @@ def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
                 (CVTTSD2SI64rm GR64:$dst, f64mem:$src), 0>;
 
 def : InstAlias<"cvtsi2ss\t{$src, $dst|$dst, $src}",
-                (CVTSI2SSrm FR64:$dst, i32mem:$src)>;
+                (CVTSI2SSrm FR64:$dst, i32mem:$src), 0>;
 def : InstAlias<"cvtsi2sd\t{$src, $dst|$dst, $src}",
-                (CVTSI2SDrm FR64:$dst, i32mem:$src)>;
+                (CVTSI2SDrm FR64:$dst, i32mem:$src), 0>;
 
 // Conversion Instructions Intrinsics - Match intrinsics which expect MM
 // and/or XMM operand(s).
@@ -2005,7 +2005,7 @@ def VCVTPD2DQrr  : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
 
 // XMM only
 def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}",
-                (VCVTPD2DQrr VR128:$dst, VR128:$src)>;
+                (VCVTPD2DQrr VR128:$dst, VR128:$src), 0>;
 def VCVTPD2DQXrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                        "vcvtpd2dqx\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
@@ -2024,7 +2024,7 @@ def VCVTPD2DQYrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
                          (int_x86_avx_cvt_pd2dq_256 (loadv4f64 addr:$src)))]>,
                        VEX, VEX_L, Sched<[WriteCvtF2ILd]>;
 def : InstAlias<"vcvtpd2dq\t{$src, $dst|$dst, $src}",
-                (VCVTPD2DQYrr VR128:$dst, VR256:$src)>;
+                (VCVTPD2DQYrr VR128:$dst, VR256:$src), 0>;
 }
 
 def CVTPD2DQrm  : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
@@ -2127,7 +2127,7 @@ def VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
 
 // XMM only
 def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}",
-                (VCVTTPD2DQrr VR128:$dst, VR128:$src)>;
+                (VCVTTPD2DQrr VR128:$dst, VR128:$src), 0>;
 def VCVTTPD2DQXrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                          "cvttpd2dqx\t{$src, $dst|$dst, $src}",
                          [(set VR128:$dst, (int_x86_sse2_cvttpd2dq
@@ -2146,7 +2146,7 @@ def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
                           (int_x86_avx_cvtt_pd2dq_256 (loadv4f64 addr:$src)))],
                          IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>;
 def : InstAlias<"vcvttpd2dq\t{$src, $dst|$dst, $src}",
-                (VCVTTPD2DQYrr VR128:$dst, VR256:$src)>;
+                (VCVTTPD2DQYrr VR128:$dst, VR256:$src), 0>;
 
 let Predicates = [HasAVX] in {
   def : Pat<(v4i32 (fp_to_sint (v4f64 VR256:$src))),
@@ -2252,7 +2252,7 @@ def VCVTPD2PSrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
 
 // XMM only
 def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}",
-                (VCVTPD2PSrr VR128:$dst, VR128:$src)>;
+                (VCVTPD2PSrr VR128:$dst, VR128:$src), 0>;
 def VCVTPD2PSXrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                         "cvtpd2psx\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst,
@@ -2271,7 +2271,7 @@ def VCVTPD2PSYrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
                           (int_x86_avx_cvt_pd2_ps_256 (loadv4f64 addr:$src)))],
                         IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2FLd]>;
 def : InstAlias<"vcvtpd2ps\t{$src, $dst|$dst, $src}",
-                (VCVTPD2PSYrr VR128:$dst, VR256:$src)>;
+                (VCVTPD2PSYrr VR128:$dst, VR256:$src), 0>;
 
 def CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                      "cvtpd2ps\t{$src, $dst|$dst, $src}",
@@ -2973,6 +2973,19 @@ defm XOR  : sse12_fp_packed_logical<0x57, "xor", xor>;
 let isCommutable = 0 in
   defm ANDN : sse12_fp_packed_logical<0x55, "andn", X86andnp>;
 
+// AVX1 requires type coercions in order to fold loads directly into logical
+// operations.
+let Predicates = [HasAVX1Only] in {
+  def : Pat<(bc_v8f32 (and VR256:$src1, (loadv4i64 addr:$src2))),
+            (VANDPSYrm VR256:$src1, addr:$src2)>;
+  def : Pat<(bc_v8f32 (or VR256:$src1, (loadv4i64 addr:$src2))),
+            (VORPSYrm VR256:$src1, addr:$src2)>;
+  def : Pat<(bc_v8f32 (xor VR256:$src1, (loadv4i64 addr:$src2))),
+            (VXORPSYrm VR256:$src1, addr:$src2)>;
+  def : Pat<(bc_v8f32 (X86andnp VR256:$src1, (loadv4i64 addr:$src2))),
+            (VANDNPSYrm VR256:$src1, addr:$src2)>;
+}
+
 //===----------------------------------------------------------------------===//
 // SSE 1 & 2 - Arithmetic Instructions
 //===----------------------------------------------------------------------===//
@@ -3144,23 +3157,23 @@ let Predicates = [UseSSE2] in {
 
 let Predicates = [UseSSE41] in {
   // If the subtarget has SSE4.1 but not AVX, the vector insert
-  // instruction is lowered into a X86insrtps rather than a X86Movss.
+  // instruction is lowered into a X86insertps rather than a X86Movss.
   // When selecting SSE scalar single-precision fp arithmetic instructions,
-  // make sure that we correctly match the X86insrtps.
+  // make sure that we correctly match the X86insertps.
 
-  def : Pat<(v4f32 (X86insrtps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
+  def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
                   (fadd (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
                     FR32:$src))), (iPTR 0))),
             (ADDSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
-  def : Pat<(v4f32 (X86insrtps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
+  def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
                   (fsub (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
                     FR32:$src))), (iPTR 0))),
             (SUBSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
-  def : Pat<(v4f32 (X86insrtps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
+  def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
                   (fmul (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
                     FR32:$src))), (iPTR 0))),
             (MULSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
-  def : Pat<(v4f32 (X86insrtps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
+  def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
                   (fdiv (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
                     FR32:$src))), (iPTR 0))),
             (DIVSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
@@ -3186,19 +3199,19 @@ let Predicates = [HasAVX] in {
                       (f64 (vector_extract (v2f64 VR128:$dst), (iPTR 0))),
                       FR64:$src))))),
             (VDIVSDrr_Int v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>;
-  def : Pat<(v4f32 (X86insrtps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
+  def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
                  (fadd (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
                        FR32:$src))), (iPTR 0))),
             (VADDSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
-  def : Pat<(v4f32 (X86insrtps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
+  def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
                  (fsub (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
                        FR32:$src))), (iPTR 0))),
             (VSUBSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
-  def : Pat<(v4f32 (X86insrtps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
+  def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
                  (fmul (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
                        FR32:$src))), (iPTR 0))),
             (VMULSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
-  def : Pat<(v4f32 (X86insrtps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
+  def : Pat<(v4f32 (X86insertps (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
                  (fdiv (f32 (vector_extract (v4f32 VR128:$dst), (iPTR 0))),
                        FR32:$src))), (iPTR 0))),
             (VDIVSSrr_Int v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>;
@@ -4068,6 +4081,10 @@ defm PADDQ   : PDI_binop_all<0xD4, "paddq", add, v2i64, v4i64,
                              SSE_INTALUQ_ITINS_P, 1>;
 defm PMULLW  : PDI_binop_all<0xD5, "pmullw", mul, v8i16, v16i16,
                              SSE_INTMUL_ITINS_P, 1>;
+defm PMULHUW : PDI_binop_all<0xE4, "pmulhuw", mulhu, v8i16, v16i16,
+                             SSE_INTMUL_ITINS_P, 1>;
+defm PMULHW  : PDI_binop_all<0xE5, "pmulhw", mulhs, v8i16, v16i16,
+                             SSE_INTMUL_ITINS_P, 1>;
 defm PSUBB   : PDI_binop_all<0xF8, "psubb", sub, v16i8, v32i8,
                              SSE_INTALU_ITINS_P, 0>;
 defm PSUBW   : PDI_binop_all<0xF9, "psubw", sub, v8i16, v16i16,
@@ -4102,10 +4119,6 @@ defm PADDUSB : PDI_binop_all_int<0xDC, "paddusb", int_x86_sse2_paddus_b,
                                  int_x86_avx2_paddus_b, SSE_INTALU_ITINS_P, 1>;
 defm PADDUSW : PDI_binop_all_int<0xDD, "paddusw", int_x86_sse2_paddus_w,
                                  int_x86_avx2_paddus_w, SSE_INTALU_ITINS_P, 1>;
-defm PMULHUW : PDI_binop_all_int<0xE4, "pmulhuw", int_x86_sse2_pmulhu_w,
-                                 int_x86_avx2_pmulhu_w, SSE_INTMUL_ITINS_P, 1>;
-defm PMULHW  : PDI_binop_all_int<0xE5, "pmulhw" , int_x86_sse2_pmulh_w,
-                                 int_x86_avx2_pmulh_w, SSE_INTMUL_ITINS_P, 1>;
 defm PMADDWD : PDI_binop_all_int<0xF5, "pmaddwd", int_x86_sse2_pmadd_wd,
                                  int_x86_avx2_pmadd_wd, SSE_PMADD, 1>;
 defm PAVGB   : PDI_binop_all_int<0xE0, "pavgb", int_x86_sse2_pavg_b,
@@ -6515,7 +6528,7 @@ multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1,
         !strconcat(asm,
                    "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
       [(set VR128:$dst,
-        (X86insrtps VR128:$src1, VR128:$src2, imm:$src3))], itins.rr>,
+        (X86insertps VR128:$src1, VR128:$src2, imm:$src3))], itins.rr>,
       Sched<[WriteFShuffle]>;
   def rm : SS4AIi8<opc, MRMSrcMem, (outs VR128:$dst),
       (ins VR128:$src1, f32mem:$src2, u32u8imm:$src3),
@@ -6524,7 +6537,7 @@ multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1,
         !strconcat(asm,
                    "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
       [(set VR128:$dst,
-        (X86insrtps VR128:$src1,
+        (X86insertps VR128:$src1,
                    (v4f32 (scalar_to_vector (loadf32 addr:$src2))),
                     imm:$src3))], itins.rm>,
       Sched<[WriteFShuffleLd, ReadAfterLd]>;
@@ -6537,6 +6550,29 @@ let ExeDomain = SSEPackedSingle in {
     defm INSERTPS : SS41I_insertf32<0x21, "insertps", 1, SSE_INSERT_ITINS>;
 }
 
+let Predicates = [UseSSE41] in {
+  // If we're inserting an element from a load or a null pshuf of a load,
+  // fold the load into the insertps instruction.
+  def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1), (X86PShufd (v4f32
+                       (scalar_to_vector (loadf32 addr:$src2))), (i8 0)),
+                   imm:$src3)),
+            (INSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
+  def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1), (X86PShufd
+                      (loadv4f32 addr:$src2), (i8 0)), imm:$src3)),
+            (INSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
+}
+
+let Predicates = [UseAVX] in {
+  // If we're inserting an element from a vbroadcast of a load, fold the
+  // load into the X86insertps instruction.
+  def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1),
+                (X86VBroadcast (loadf32 addr:$src2)), imm:$src3)),
+            (VINSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
+  def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1),
+                (X86VBroadcast (loadv4f32 addr:$src2)), imm:$src3)),
+            (VINSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
+}
+
 //===----------------------------------------------------------------------===//
 // SSE4.1 - Round Instructions
 //===----------------------------------------------------------------------===//
@@ -6990,6 +7026,31 @@ multiclass SS48I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
        Sched<[itins.Sched.Folded, ReadAfterLd]>;
 }
 
+/// SS48I_binop_rm2 - Simple SSE41 binary operator with different src and dst
+/// types.
+multiclass SS48I_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode,
+                         ValueType DstVT, ValueType SrcVT, RegisterClass RC,
+                         PatFrag memop_frag, X86MemOperand x86memop,
+                         OpndItins itins,
+                         bit IsCommutable = 0, bit Is2Addr = 1> {
+  let isCommutable = IsCommutable in
+  def rr : SS48I<opc, MRMSrcReg, (outs RC:$dst),
+       (ins RC:$src1, RC:$src2),
+       !if(Is2Addr,
+           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+       [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1), RC:$src2)))]>,
+       Sched<[itins.Sched]>;
+  def rm : SS48I<opc, MRMSrcMem, (outs RC:$dst),
+       (ins RC:$src1, x86memop:$src2),
+       !if(Is2Addr,
+           !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+           !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+       [(set RC:$dst, (DstVT (OpNode (SrcVT RC:$src1),
+                                     (bitconvert (memop_frag addr:$src2)))))]>,
+       Sched<[itins.Sched.Folded, ReadAfterLd]>;
+}
+
 let Predicates = [HasAVX] in {
   let isCommutable = 0 in
   defm VPACKUSDW : SS41I_binop_rm_int<0x2B, "vpackusdw", int_x86_sse41_packusdw,
@@ -7018,8 +7079,9 @@ let Predicates = [HasAVX] in {
   defm VPMAXUW   : SS48I_binop_rm<0x3E, "vpmaxuw", X86umax, v8i16, VR128,
                                   loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
                                   VEX_4V;
-  defm VPMULDQ   : SS41I_binop_rm_int<0x28, "vpmuldq",   int_x86_sse41_pmuldq,
-                                      0, DEFAULT_ITINS_VECIMULSCHED>, VEX_4V;
+  defm VPMULDQ   : SS48I_binop_rm2<0x28, "vpmuldq", X86pmuldq, v2i64, v4i32,
+                                   VR128, loadv2i64, i128mem,
+                                   SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V;
 }
 
 let Predicates = [HasAVX2] in {
@@ -7051,9 +7113,9 @@ let Predicates = [HasAVX2] in {
   defm VPMAXUWY  : SS48I_binop_rm<0x3E, "vpmaxuw", X86umax, v16i16, VR256,
                                   loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
                                   VEX_4V, VEX_L;
-  defm VPMULDQ   : SS41I_binop_rm_int_y<0x28, "vpmuldq",
-                                        int_x86_avx2_pmul_dq, WriteVecIMul>,
-                                        VEX_4V, VEX_L;
+  defm VPMULDQY : SS48I_binop_rm2<0x28, "vpmuldq", X86pmuldq, v4i64, v8i32,
+                                  VR256, loadv4i64, i256mem,
+                                  SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V, VEX_L;
 }
 
 let Constraints = "$src1 = $dst" in {
@@ -7076,8 +7138,9 @@ let Constraints = "$src1 = $dst" in {
                                  memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
   defm PMAXUW   : SS48I_binop_rm<0x3E, "pmaxuw", X86umax, v8i16, VR128,
                                  memopv2i64, i128mem, 1, SSE_INTALU_ITINS_P>;
-  defm PMULDQ   : SS41I_binop_rm_int<0x28, "pmuldq",   int_x86_sse41_pmuldq,
-                                     1, SSE_INTMUL_ITINS_P>;
+  defm PMULDQ   : SS48I_binop_rm2<0x28, "pmuldq", X86pmuldq, v2i64, v4i32,
+                                  VR128, memopv2i64, i128mem,
+                                  SSE_INTMUL_ITINS_P, 1>;
 }
 
 let Predicates = [HasAVX] in {
@@ -7394,6 +7457,7 @@ let Predicates = [UseSSE41] in {
 
 }
 
+let SchedRW = [WriteLoad] in {
 let Predicates = [HasAVX] in
 def VMOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                        "vmovntdqa\t{$src, $dst|$dst, $src}",
@@ -7407,6 +7471,7 @@ def VMOVNTDQAYrm : SS48I<0x2A, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
 def MOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                        "movntdqa\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst, (int_x86_sse41_movntdqa addr:$src))]>;
+} // SchedRW
 
 //===----------------------------------------------------------------------===//
 // SSE4.2 - Compare Instructions
@@ -7831,18 +7896,20 @@ def PCLMULQDQrm : PCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst),
 
 multiclass pclmul_alias<string asm, int immop> {
   def : InstAlias<!strconcat("pclmul", asm, "dq {$src, $dst|$dst, $src}"),
-                  (PCLMULQDQrr VR128:$dst, VR128:$src, immop)>;
+                  (PCLMULQDQrr VR128:$dst, VR128:$src, immop), 0>;
 
   def : InstAlias<!strconcat("pclmul", asm, "dq {$src, $dst|$dst, $src}"),
-                  (PCLMULQDQrm VR128:$dst, i128mem:$src, immop)>;
+                  (PCLMULQDQrm VR128:$dst, i128mem:$src, immop), 0>;
 
   def : InstAlias<!strconcat("vpclmul", asm,
                              "dq {$src2, $src1, $dst|$dst, $src1, $src2}"),
-                  (VPCLMULQDQrr VR128:$dst, VR128:$src1, VR128:$src2, immop)>;
+                  (VPCLMULQDQrr VR128:$dst, VR128:$src1, VR128:$src2, immop),
+                  0>;
 
   def : InstAlias<!strconcat("vpclmul", asm,
                              "dq {$src2, $src1, $dst|$dst, $src1, $src2}"),
-                  (VPCLMULQDQrm VR128:$dst, VR128:$src1, i128mem:$src2, immop)>;
+                  (VPCLMULQDQrm VR128:$dst, VR128:$src1, i128mem:$src2, immop),
+                  0>;
 }
 defm : pclmul_alias<"hqhq", 0x11>;
 defm : pclmul_alias<"hqlq", 0x01>;
@@ -8291,6 +8358,12 @@ let Predicates = [HasF16C] in {
   defm VCVTPH2PSY : f16c_ph2ps<VR256, f128mem, int_x86_vcvtph2ps_256>, VEX_L;
   defm VCVTPS2PH  : f16c_ps2ph<VR128, f64mem, int_x86_vcvtps2ph_128>;
   defm VCVTPS2PHY : f16c_ps2ph<VR256, f128mem, int_x86_vcvtps2ph_256>, VEX_L;
+
+  // Pattern match vcvtph2ps of a scalar i64 load.
+  def : Pat<(int_x86_vcvtph2ps_128 (vzmovl_v2i64 addr:$src)),
+            (VCVTPH2PSrm addr:$src)>;
+  def : Pat<(int_x86_vcvtph2ps_128 (vzload_v2i64 addr:$src)),
+            (VCVTPH2PSrm addr:$src)>;
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/X86/X86InstrSystem.td b/lib/Target/X86/X86InstrSystem.td
index 9d3aa1c..b5595cb 100644
--- a/lib/Target/X86/X86InstrSystem.td
+++ b/lib/Target/X86/X86InstrSystem.td
@@ -19,7 +19,7 @@ let Defs = [RAX, RDX] in
               TB;
 
 let Defs = [RAX, RCX, RDX] in
-  def RDTSCP : I<0x01, MRM_F9, (outs), (ins), "rdtscp", []>, TB;
+  def RDTSCP : I<0x01, MRM_F9, (outs), (ins), "rdtscp", [(X86rdtscp)]>, TB;
 
 // CPU flow control instructions
 
diff --git a/lib/Target/X86/X86JITInfo.cpp b/lib/Target/X86/X86JITInfo.cpp
index e99f2d9..e969ef2 100644
--- a/lib/Target/X86/X86JITInfo.cpp
+++ b/lib/Target/X86/X86JITInfo.cpp
@@ -11,7 +11,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "jit"
 #include "X86JITInfo.h"
 #include "X86Relocations.h"
 #include "X86Subtarget.h"
@@ -24,6 +23,8 @@
 #include <cstring>
 using namespace llvm;
 
+#define DEBUG_TYPE "jit"
+
 // Determine the platform we're running on
 #if defined (__x86_64__) || defined (_M_AMD64) || defined (_M_X64)
 # define X86_64_JIT
@@ -427,9 +428,14 @@ X86JITInfo::getLazyResolverFunction(JITCompilerFn F) {
   TsanIgnoreWritesEnd();
 
 #if defined (X86_32_JIT) && !defined (_MSC_VER)
+#if defined(__SSE__)
+  // SSE Callback should be called for SSE-enabled LLVM.
+  return X86CompilationCallback_SSE;
+#else
   if (Subtarget->hasSSE1())
     return X86CompilationCallback_SSE;
 #endif
+#endif
 
   return X86CompilationCallback;
 }
@@ -437,7 +443,7 @@ X86JITInfo::getLazyResolverFunction(JITCompilerFn F) {
 X86JITInfo::X86JITInfo(X86TargetMachine &tm) : TM(tm) {
   Subtarget = &TM.getSubtarget<X86Subtarget>();
   useGOT = 0;
-  TLSOffset = 0;
+  TLSOffset = nullptr;
 }
 
 void *X86JITInfo::emitGlobalValueIndirectSym(const GlobalValue* GV, void *ptr,
diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp
index 6d7f3cb..0190080 100644
--- a/lib/Target/X86/X86MCInstLower.cpp
+++ b/lib/Target/X86/X86MCInstLower.cpp
@@ -120,7 +120,7 @@ GetSymbolFromOperand(const MachineOperand &MO) const {
   case X86II::MO_DARWIN_NONLAZY_PIC_BASE: {
     MachineModuleInfoImpl::StubValueTy &StubSym =
       getMachOMMI().getGVStubEntry(Sym);
-    if (StubSym.getPointer() == 0) {
+    if (!StubSym.getPointer()) {
       assert(MO.isGlobal() && "Extern symbol not handled yet");
       StubSym =
         MachineModuleInfoImpl::
@@ -132,7 +132,7 @@ GetSymbolFromOperand(const MachineOperand &MO) const {
   case X86II::MO_DARWIN_HIDDEN_NONLAZY_PIC_BASE: {
     MachineModuleInfoImpl::StubValueTy &StubSym =
       getMachOMMI().getHiddenGVStubEntry(Sym);
-    if (StubSym.getPointer() == 0) {
+    if (!StubSym.getPointer()) {
       assert(MO.isGlobal() && "Extern symbol not handled yet");
       StubSym =
         MachineModuleInfoImpl::
@@ -168,7 +168,7 @@ MCOperand X86MCInstLower::LowerSymbolOperand(const MachineOperand &MO,
                                              MCSymbol *Sym) const {
   // FIXME: We would like an efficient form for this, so we don't have to do a
   // lot of extra uniquing.
-  const MCExpr *Expr = 0;
+  const MCExpr *Expr = nullptr;
   MCSymbolRefExpr::VariantKind RefKind = MCSymbolRefExpr::VK_None;
 
   switch (MO.getTargetFlags()) {
@@ -223,7 +223,7 @@ MCOperand X86MCInstLower::LowerSymbolOperand(const MachineOperand &MO,
     break;
   }
 
-  if (Expr == 0)
+  if (!Expr)
     Expr = MCSymbolRefExpr::Create(Sym, RefKind, Ctx);
 
   if (!MO.isJTI() && !MO.isMBB() && MO.getOffset())
diff --git a/lib/Target/X86/X86PadShortFunction.cpp b/lib/Target/X86/X86PadShortFunction.cpp
index 746d0d6..6639875 100644
--- a/lib/Target/X86/X86PadShortFunction.cpp
+++ b/lib/Target/X86/X86PadShortFunction.cpp
@@ -15,9 +15,9 @@
 
 #include <algorithm>
 
-#define DEBUG_TYPE "x86-pad-short-functions"
 #include "X86.h"
 #include "X86InstrInfo.h"
+#include "X86Subtarget.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -30,6 +30,8 @@
 
 using namespace llvm;
 
+#define DEBUG_TYPE "x86-pad-short-functions"
+
 STATISTIC(NumBBsPadded, "Number of basic blocks padded");
 
 namespace {
@@ -49,7 +51,7 @@ namespace {
   struct PadShortFunc : public MachineFunctionPass {
     static char ID;
     PadShortFunc() : MachineFunctionPass(ID)
-                   , Threshold(4), TM(0), TII(0) {}
+                   , Threshold(4), TM(nullptr), TII(nullptr) {}
 
     bool runOnMachineFunction(MachineFunction &MF) override;
 
@@ -100,6 +102,9 @@ bool PadShortFunc::runOnMachineFunction(MachineFunction &MF) {
   }
 
   TM = &MF.getTarget();
+  if (!TM->getSubtarget<X86Subtarget>().padShortFunctions())
+    return false;
+
   TII = TM->getInstrInfo();
 
   // Search through basic blocks and mark the ones that have early returns
diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp
index 85aa9b5..a83e1e4 100644
--- a/lib/Target/X86/X86RegisterInfo.cpp
+++ b/lib/Target/X86/X86RegisterInfo.cpp
@@ -38,11 +38,11 @@
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetOptions.h"
 
+using namespace llvm;
+
 #define GET_REGINFO_TARGET_DESC
 #include "X86GenRegisterInfo.inc"
 
-using namespace llvm;
-
 cl::opt<bool>
 ForceStackAlign("force-align-stack",
                  cl::desc("Force align the stack to the minimum alignment"
@@ -129,7 +129,7 @@ X86RegisterInfo::getMatchingSuperRegClass(const TargetRegisterClass *A,
   if (!Is64Bit && SubIdx == X86::sub_8bit) {
     A = X86GenRegisterInfo::getSubClassWithSubReg(A, X86::sub_8bit_hi);
     if (!A)
-      return 0;
+      return nullptr;
   }
   return X86GenRegisterInfo::getMatchingSuperRegClass(A, B, SubIdx);
 }
@@ -231,7 +231,7 @@ X86RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
   }
 }
 
-const uint16_t *
+const MCPhysReg *
 X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
   bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX();
   bool HasAVX512 = TM.getSubtarget<X86Subtarget>().hasAVX512();
diff --git a/lib/Target/X86/X86RegisterInfo.h b/lib/Target/X86/X86RegisterInfo.h
index 6a71113..2289d91 100644
--- a/lib/Target/X86/X86RegisterInfo.h
+++ b/lib/Target/X86/X86RegisterInfo.h
@@ -100,7 +100,7 @@ public:
 
   /// getCalleeSavedRegs - Return a null-terminated list of all of the
   /// callee-save registers on this target.
-  const uint16_t *
+  const MCPhysReg *
   getCalleeSavedRegs(const MachineFunction* MF) const override;
   const uint32_t *getCallPreservedMask(CallingConv::ID) const override;
   const uint32_t *getNoPreservedMask() const;
@@ -122,7 +122,7 @@ public:
 
   void eliminateFrameIndex(MachineBasicBlock::iterator MI,
                            int SPAdj, unsigned FIOperandNum,
-                           RegScavenger *RS = NULL) const override;
+                           RegScavenger *RS = nullptr) const override;
 
   // Debug information queries.
   unsigned getFrameRegister(const MachineFunction &MF) const override;
diff --git a/lib/Target/X86/X86SchedHaswell.td b/lib/Target/X86/X86SchedHaswell.td
index f5b51ee..6966d61 100644
--- a/lib/Target/X86/X86SchedHaswell.td
+++ b/lib/Target/X86/X86SchedHaswell.td
@@ -20,6 +20,9 @@ def HaswellModel : SchedMachineModel {
   let LoadLatency = 4;
   let MispredictPenalty = 16;
 
+  // Based on the LSD (loop-stream detector) queue size and benchmarking data.
+  let LoopMicroOpBufferSize = 50;
+
   // FIXME: SSE4 and AVX are unimplemented. This flag is set to allow
   // the scheduler to assign a default model to unrecognized opcodes.
   let CompleteModel = 0;
diff --git a/lib/Target/X86/X86SchedSandyBridge.td b/lib/Target/X86/X86SchedSandyBridge.td
index a58859a..83f0534 100644
--- a/lib/Target/X86/X86SchedSandyBridge.td
+++ b/lib/Target/X86/X86SchedSandyBridge.td
@@ -21,6 +21,9 @@ def SandyBridgeModel : SchedMachineModel {
   let LoadLatency = 4;
   let MispredictPenalty = 16;
 
+  // Based on the LSD (loop-stream detector) queue size.
+  let LoopMicroOpBufferSize = 28;
+
   // FIXME: SSE4 and AVX are unimplemented. This flag is set to allow
   // the scheduler to assign a default model to unrecognized opcodes.
   let CompleteModel = 0;
diff --git a/lib/Target/X86/X86ScheduleAtom.td b/lib/Target/X86/X86ScheduleAtom.td
index ba72f29..3256ee7 100644
--- a/lib/Target/X86/X86ScheduleAtom.td
+++ b/lib/Target/X86/X86ScheduleAtom.td
@@ -535,5 +535,9 @@ def AtomModel : SchedMachineModel {
   let LoadLatency = 3; // Expected cycles, may be overriden by OperandCycles.
   let HighLatency = 30;// Expected, may be overriden by OperandCycles.
 
+  // On the Atom, the throughput for taken branches is 2 cycles. For small
+  // simple loops, expand by a small factor to hide the backedge cost.
+  let LoopMicroOpBufferSize = 10;
+
   let Itineraries = AtomItineraries;
 }
diff --git a/lib/Target/X86/X86ScheduleSLM.td b/lib/Target/X86/X86ScheduleSLM.td
index 6c2a304..823d101 100644
--- a/lib/Target/X86/X86ScheduleSLM.td
+++ b/lib/Target/X86/X86ScheduleSLM.td
@@ -1,4 +1,4 @@
-//===- X86ScheduleSLM.td - X86 Atom Scheduling Definitions -*- tablegen -*-==//
+//=- X86ScheduleSLM.td - X86 Silvermont Scheduling -----------*- tablegen -*-=//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,662 +7,225 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This file defines the itinerary class data for the Intel Atom
-// (Silvermont) processor.
+// This file defines the machine model for Intel Silvermont to support
+// instruction scheduling and other instruction cost heuristics.
 //
 //===----------------------------------------------------------------------===//
 
-def IEC_RSV0 : FuncUnit;
-def IEC_RSV1 : FuncUnit;
-def FPC_RSV0 : FuncUnit;
-def FPC_RSV1 : FuncUnit;
-def MEC_RSV : FuncUnit;
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-def SLMItineraries : ProcessorItineraries<
-  [ IEC_RSV0, IEC_RSV1, FPC_RSV0, FPC_RSV1, MEC_RSV ],
-  [], [
-  // [InstrStage<N, [FPC_RSV0, FPC_RSV1]>] 
-  // [InstrStage<N, [FPC_RSV0, FPC_RSV1], 0>, InstrStage<N, [MEC_RSV]>]
-  // [InstrStage<N, [IEC_RSV0, IEC_RSV1]>] 
-  // [InstrStage<N, [IEC_RSV0, IEC_RSV1], 0>,InstrStage<N,[MEC_RSV]>]
-  //
-  // Default is 1 cycle, IEC_RSV0 or IEC_RSV1
-  //InstrItinData<IIC_DEFAULT, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_ALU_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>,
-                  InstrStage<1, [MEC_RSV]>] >,
-  InstrItinData<IIC_ALU_NONMEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_LEA, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_LEA_16, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  // mul
-  InstrItinData<IIC_MUL8, [InstrStage<4, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_MUL16_MEM, [InstrStage<4, [IEC_RSV0, IEC_RSV1], 0>,
-                  InstrStage<4, [MEC_RSV]>] >,
-  InstrItinData<IIC_MUL16_REG, [InstrStage<4, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_MUL32_MEM, [InstrStage<3, [IEC_RSV0, IEC_RSV1], 0>,
-                  InstrStage<3, [MEC_RSV]>] >,
-  InstrItinData<IIC_MUL32_REG, [InstrStage<3, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_MUL64, [InstrStage<4, [IEC_RSV0, IEC_RSV1]>] >,
-  // imul by al, ax, eax, rax
-  InstrItinData<IIC_IMUL8, [InstrStage<6, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_IMUL16_MEM, [InstrStage<6, [IEC_RSV0, IEC_RSV1], 0>,
-                  InstrStage<6, [MEC_RSV]>] >,
-  InstrItinData<IIC_IMUL16_REG, [InstrStage<6, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_IMUL32_MEM, [InstrStage<6, [IEC_RSV0, IEC_RSV1], 0>,
-                  InstrStage<6, [MEC_RSV]>] >,
-  InstrItinData<IIC_IMUL32_REG, [InstrStage<6, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_IMUL64, [InstrStage<6, [IEC_RSV0, IEC_RSV1]>] >,
-  // imul reg by reg|mem
-  InstrItinData<IIC_IMUL16_RM, [InstrStage<4, [IEC_RSV0, IEC_RSV1], 0>,
-                  InstrStage<4, [MEC_RSV]>] >,
-  InstrItinData<IIC_IMUL16_RR, [InstrStage<4, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_IMUL32_RM, [InstrStage<3, [IEC_RSV0, IEC_RSV1], 0>,
-                  InstrStage<3, [MEC_RSV]>] >,
-  InstrItinData<IIC_IMUL32_RR, [InstrStage<3, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_IMUL64_RM, [InstrStage<4, [IEC_RSV0, IEC_RSV1], 0>,
-                  InstrStage<4, [MEC_RSV]>] >,
-  InstrItinData<IIC_IMUL64_RR, [InstrStage<4, [IEC_RSV0, IEC_RSV1]>]  >,
-  // imul reg = reg/mem * imm
-  InstrItinData<IIC_IMUL16_RRI, [InstrStage<4, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_IMUL32_RRI, [InstrStage<3, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_IMUL64_RRI, [InstrStage<4, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_IMUL16_RMI, [InstrStage<4, [IEC_RSV0, IEC_RSV1], 0>,
-                  InstrStage<4, [MEC_RSV]>] >,
-  InstrItinData<IIC_IMUL32_RMI, [InstrStage<3, [IEC_RSV0, IEC_RSV1], 0>,
-                  InstrStage<3, [MEC_RSV]>] >,
-  InstrItinData<IIC_IMUL64_RMI, [InstrStage<4, [IEC_RSV0, IEC_RSV1], 0>,
-                  InstrStage<4, [MEC_RSV]>] >,
-  // idiv - min latency
-  InstrItinData<IIC_IDIV8, [InstrStage<34, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_IDIV16, [InstrStage<35, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_IDIV32, [InstrStage<35, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_IDIV64, [InstrStage<49, [IEC_RSV0, IEC_RSV1]>] >,
-  // div - min latency
-  InstrItinData<IIC_DIV8_REG, [InstrStage<25, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_DIV8_MEM, [InstrStage<25, [IEC_RSV0, IEC_RSV1], 0>,
-                  InstrStage<25, [MEC_RSV]>] >,
-  InstrItinData<IIC_DIV16, [InstrStage<26, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_DIV32, [InstrStage<26, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_DIV64, [InstrStage<38, [IEC_RSV0, IEC_RSV1]>] >,
-  // neg/not/inc/dec
-  InstrItinData<IIC_UNARY_REG, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_UNARY_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>,
-                  InstrStage<1, [MEC_RSV]>] >,
-  // add/sub/and/or/xor/adc/sbc/cmp/test
-  InstrItinData<IIC_BIN_NONMEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_BIN_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>,
-                  InstrStage<1, [MEC_RSV]>] >,
-  // adc/sbb
-  InstrItinData<IIC_BIN_CARRY_NONMEM, [InstrStage<2, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_BIN_CARRY_MEM, [InstrStage<2, [IEC_RSV0, IEC_RSV1], 0>,
-                  InstrStage<2, [MEC_RSV]>] >,
-  // shift/rotate
-  InstrItinData<IIC_SR, [InstrStage<1, [IEC_RSV0], 0>,
-                   InstrStage<1, [MEC_RSV]>] >,
-  // shift double
-  InstrItinData<IIC_SHD16_REG_IM, [InstrStage<2, [IEC_RSV0]>] >,
-  InstrItinData<IIC_SHD16_REG_CL, [InstrStage<4, [IEC_RSV0]>] >,
-  InstrItinData<IIC_SHD16_MEM_IM, [InstrStage<2, [IEC_RSV0], 0>,
-                   InstrStage<2, [MEC_RSV]>] >,
-  InstrItinData<IIC_SHD16_MEM_CL, [InstrStage<4, [IEC_RSV0], 0>,
-                   InstrStage<4, [MEC_RSV]>] >,
-  InstrItinData<IIC_SHD32_REG_IM, [InstrStage<2, [IEC_RSV0]>] >,
-  InstrItinData<IIC_SHD32_REG_CL, [InstrStage<4, [IEC_RSV0]>] >,
-  InstrItinData<IIC_SHD32_MEM_IM, [InstrStage<2, [IEC_RSV0], 0>,
-                   InstrStage<2, [MEC_RSV]>] >,
-  InstrItinData<IIC_SHD32_MEM_CL, [InstrStage<4, [IEC_RSV0], 0>,
-                   InstrStage<4, [MEC_RSV]>] >,
-  InstrItinData<IIC_SHD64_REG_IM, [InstrStage<2, [IEC_RSV0]>] >,
-  InstrItinData<IIC_SHD64_REG_CL, [InstrStage<4, [IEC_RSV0]>] >,
-  InstrItinData<IIC_SHD64_MEM_IM, [InstrStage<2, [IEC_RSV0], 0>,
-                   InstrStage<2, [MEC_RSV]>] >,
-  InstrItinData<IIC_SHD64_MEM_CL, [InstrStage<4, [IEC_RSV0], 0>,
-                   InstrStage<4, [MEC_RSV]>] >,
-  // cmov
-  InstrItinData<IIC_CMOV16_RM, [InstrStage<2, [IEC_RSV0, IEC_RSV1], 0>,
-                  InstrStage<2, [MEC_RSV]>] >,
-  InstrItinData<IIC_CMOV16_RR, [InstrStage<2, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_CMOV32_RM, [InstrStage<2, [IEC_RSV0, IEC_RSV1], 0>,
-                  InstrStage<2, [MEC_RSV]>] >,
-  InstrItinData<IIC_CMOV32_RR, [InstrStage<2, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_CMOV64_RM, [InstrStage<2, [IEC_RSV0, IEC_RSV1], 0>,
-                  InstrStage<2, [MEC_RSV]>] >,
-  InstrItinData<IIC_CMOV64_RR, [InstrStage<2, [IEC_RSV0, IEC_RSV1]>] >,
-  // set
-  InstrItinData<IIC_SET_M, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>,
-                  InstrStage<1, [MEC_RSV]>] >,
-  InstrItinData<IIC_SET_R, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  // jcc
-  InstrItinData<IIC_Jcc, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  // jcxz/jecxz/jrcxz
-  InstrItinData<IIC_JCXZ, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  // jmp rel
-  InstrItinData<IIC_JMP_REL, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  // jmp indirect
-  InstrItinData<IIC_JMP_REG, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_JMP_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>,
-                  InstrStage<1, [MEC_RSV]>] >,
-  // jmp far
-  InstrItinData<IIC_JMP_FAR_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>,
-                  InstrStage<1, [MEC_RSV]>] >,
-  InstrItinData<IIC_JMP_FAR_PTR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  // loop/loope/loopne
-  InstrItinData<IIC_LOOP, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_LOOPE, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_LOOPNE, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  // call - all but reg/imm
-  InstrItinData<IIC_CALL_RI, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_CALL_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>,
-                  InstrStage<1, [MEC_RSV]>] >,
-  InstrItinData<IIC_CALL_FAR_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>,
-                  InstrStage<1, [MEC_RSV]>] >,
-  InstrItinData<IIC_CALL_FAR_PTR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  //ret
-  InstrItinData<IIC_RET, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_RET_IMM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  //sign extension movs
-  InstrItinData<IIC_MOVSX, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_MOVSX_R16_R8, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_MOVSX_R16_M8, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>,
-                  InstrStage<1, [MEC_RSV]>] >,
-  InstrItinData<IIC_MOVSX_R16_R16, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_MOVSX_R32_R32, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  //zero extension movs
-  InstrItinData<IIC_MOVZX, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_MOVZX_R16_R8, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_MOVZX_R16_M8, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>,
-                  InstrStage<1, [MEC_RSV]>] >,
-
-  InstrItinData<IIC_REP_MOVS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_REP_STOS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-
-  // SSE binary operations
-  // arithmetic fp scalar
-  InstrItinData<IIC_SSE_ALU_F32S_RR, [InstrStage<3, [FPC_RSV0, FPC_RSV1]>] >,
-  InstrItinData<IIC_SSE_ALU_F32S_RM, [InstrStage<3, [FPC_RSV0, FPC_RSV1], 0>,
-                  InstrStage<3, [MEC_RSV]>] >,
-  InstrItinData<IIC_SSE_ALU_F64S_RR, [InstrStage<3, [FPC_RSV0, FPC_RSV1]>] >,
-  InstrItinData<IIC_SSE_ALU_F64S_RM, [InstrStage<3, [FPC_RSV0, FPC_RSV1], 0>,
-                  InstrStage<3, [MEC_RSV]>] >,
-  InstrItinData<IIC_SSE_MUL_F32S_RR, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
-  InstrItinData<IIC_SSE_MUL_F32S_RM, [InstrStage<1, [FPC_RSV0, FPC_RSV1], 0>,
-                  InstrStage<1, [MEC_RSV]>] >,
-  InstrItinData<IIC_SSE_MUL_F64S_RR, [InstrStage<2, [FPC_RSV0, FPC_RSV1]>] >,
-  InstrItinData<IIC_SSE_MUL_F64S_RM, [InstrStage<2, [FPC_RSV0, FPC_RSV1], 0>,
-                  InstrStage<2, [MEC_RSV]>] >,
-  InstrItinData<IIC_SSE_DIV_F32S_RR, [InstrStage<13, [FPC_RSV0, FPC_RSV1]>] >,
-  InstrItinData<IIC_SSE_DIV_F32S_RM, [InstrStage<13, [FPC_RSV0, FPC_RSV1], 0>,
-                  InstrStage<13, [MEC_RSV]>] >,
-  InstrItinData<IIC_SSE_DIV_F64S_RR, [InstrStage<13, [FPC_RSV0, FPC_RSV1]>] >,
-  InstrItinData<IIC_SSE_DIV_F64S_RM, [InstrStage<13, [FPC_RSV0, FPC_RSV1], 0>,
-                  InstrStage<13, [MEC_RSV]>] >,
-
-  InstrItinData<IIC_SSE_COMIS_RR, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
-  InstrItinData<IIC_SSE_COMIS_RM, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
-
-  InstrItinData<IIC_SSE_HADDSUB_RR, [InstrStage<6, [FPC_RSV0, FPC_RSV1]>] >,
-  InstrItinData<IIC_SSE_HADDSUB_RM, [InstrStage<6, [FPC_RSV0, FPC_RSV1], 0>,
-                  InstrStage<6, [MEC_RSV]>] >,
-
-  // arithmetic fp parallel
-  InstrItinData<IIC_SSE_ALU_F32P_RR, [InstrStage<3, [FPC_RSV0, FPC_RSV1]>] >,
-  InstrItinData<IIC_SSE_ALU_F32P_RM, [InstrStage<3, [FPC_RSV0, FPC_RSV1], 0>,
-                  InstrStage<3, [MEC_RSV]>] >,
-  InstrItinData<IIC_SSE_ALU_F64P_RR, [InstrStage<4, [FPC_RSV0, FPC_RSV1]>] >,
-  InstrItinData<IIC_SSE_ALU_F64P_RM, [InstrStage<4, [FPC_RSV0, FPC_RSV1], 0>,
-                  InstrStage<4, [MEC_RSV]>] >,
-  InstrItinData<IIC_SSE_MUL_F32P_RR, [InstrStage<2, [FPC_RSV0, FPC_RSV1]>] >,
-  InstrItinData<IIC_SSE_MUL_F32P_RM, [InstrStage<2, [FPC_RSV0, FPC_RSV1], 0>,
-                  InstrStage<2, [MEC_RSV]>] >,
-  InstrItinData<IIC_SSE_MUL_F64P_RR, [InstrStage<4, [FPC_RSV0, FPC_RSV1]>] >,
-  InstrItinData<IIC_SSE_MUL_F64P_RM, [InstrStage<4, [FPC_RSV0, FPC_RSV1], 0>,
-                  InstrStage<4, [MEC_RSV]>] >,
-  InstrItinData<IIC_SSE_DIV_F32P_RR, [InstrStage<27, [FPC_RSV0, FPC_RSV1]>] >,
-  InstrItinData<IIC_SSE_DIV_F32P_RM, [InstrStage<27, [FPC_RSV0, FPC_RSV1], 0>,
-                  InstrStage<27, [MEC_RSV]>] >,
-  InstrItinData<IIC_SSE_DIV_F64P_RR, [InstrStage<27, [FPC_RSV0, FPC_RSV1]>] >,
-  InstrItinData<IIC_SSE_DIV_F64P_RM, [InstrStage<27, [FPC_RSV0, FPC_RSV1], 0>,
-                  InstrStage<27, [MEC_RSV]>] >,
-
-  // bitwise parallel
-  InstrItinData<IIC_SSE_BIT_P_RR, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
-  InstrItinData<IIC_SSE_BIT_P_RM, [InstrStage<1, [FPC_RSV0, FPC_RSV1], 0>,
-                  InstrStage<1, [MEC_RSV]>] >,
-
-  // arithmetic int parallel
-  InstrItinData<IIC_SSE_INTALU_P_RR, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
-  InstrItinData<IIC_SSE_INTALU_P_RM, [InstrStage<1, [FPC_RSV0, FPC_RSV1], 0>,
-                  InstrStage<1, [MEC_RSV]>] >,
-  InstrItinData<IIC_SSE_INTALUQ_P_RR, [InstrStage<4, [FPC_RSV0, FPC_RSV1]>] >,
-  InstrItinData<IIC_SSE_INTALUQ_P_RM, [InstrStage<4, [FPC_RSV0, FPC_RSV1], 0>,
-                  InstrStage<4, [MEC_RSV]>] >,
-
-  // multiply int parallel
-  InstrItinData<IIC_SSE_INTMUL_P_RR, [InstrStage<5, [FPC_RSV0]>] >,
-  InstrItinData<IIC_SSE_INTMUL_P_RM, [InstrStage<5, [FPC_RSV0], 0>,
-                   InstrStage<5, [MEC_RSV]>] >,
-
-  // shift parallel
-  InstrItinData<IIC_SSE_INTSH_P_RR, [InstrStage<2, [FPC_RSV0]>] >,
-  InstrItinData<IIC_SSE_INTSH_P_RM, [InstrStage<2, [FPC_RSV0], 0>,
-                   InstrStage<2, [MEC_RSV]>] >,
-  InstrItinData<IIC_SSE_INTSH_P_RI, [InstrStage<1, [FPC_RSV0]>] >,
-
-  InstrItinData<IIC_SSE_INTSHDQ_P_RI, [InstrStage<1, [FPC_RSV0]>] >,
-
-  InstrItinData<IIC_SSE_SHUFP, [InstrStage<1, [FPC_RSV0]>] >,
-  InstrItinData<IIC_SSE_PSHUF_RI, [InstrStage<1, [FPC_RSV0]>] >,
-  InstrItinData<IIC_SSE_PSHUF_MI, [InstrStage<1, [FPC_RSV0], 0>,
-                   InstrStage<1, [MEC_RSV]>] >,
-
-  InstrItinData<IIC_SSE_UNPCK, [InstrStage<1, [FPC_RSV0]>] >,
-
-  InstrItinData<IIC_SSE_SQRTPS_RR, [InstrStage<26, [FPC_RSV0]>] >,
-  InstrItinData<IIC_SSE_SQRTPS_RM, [InstrStage<26, [FPC_RSV0], 0>,
-                   InstrStage<26, [MEC_RSV]>] >,
-  InstrItinData<IIC_SSE_SQRTSS_RR, [InstrStage<13, [FPC_RSV0]>] >,
-  InstrItinData<IIC_SSE_SQRTSS_RM, [InstrStage<13, [FPC_RSV0], 0>,
-                   InstrStage<13, [MEC_RSV]>] >,
-  InstrItinData<IIC_SSE_SQRTPD_RR, [InstrStage<26, [FPC_RSV0]>] >,
-  InstrItinData<IIC_SSE_SQRTPD_RM, [InstrStage<26, [FPC_RSV0], 0>,
-                   InstrStage<26, [MEC_RSV]>] >,
-  InstrItinData<IIC_SSE_SQRTSD_RR, [InstrStage<13, [FPC_RSV0]>] >,
-  InstrItinData<IIC_SSE_SQRTSD_RM, [InstrStage<13, [FPC_RSV0], 0>,
-                   InstrStage<13, [MEC_RSV]>] >,
-
-  InstrItinData<IIC_SSE_RCPP_RR, [InstrStage<9, [FPC_RSV0]>] >,
-  InstrItinData<IIC_SSE_RCPP_RM, [InstrStage<9, [FPC_RSV0], 0>,
-                   InstrStage<9, [MEC_RSV]>] >,
-  InstrItinData<IIC_SSE_RCPS_RR, [InstrStage<4, [FPC_RSV0]>] >,
-  InstrItinData<IIC_SSE_RCPS_RM, [InstrStage<4, [FPC_RSV0], 0>,
-                   InstrStage<4, [MEC_RSV]>] >,
-
-  InstrItinData<IIC_SSE_MOVMSK, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
-  InstrItinData<IIC_SSE_MASKMOV, [InstrStage<5, [FPC_RSV0, FPC_RSV1]>] >,
-
-  InstrItinData<IIC_SSE_PEXTRW, [InstrStage<4, [FPC_RSV0, FPC_RSV1]>] >,
-  InstrItinData<IIC_SSE_PINSRW, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
-
-  InstrItinData<IIC_SSE_PABS_RR, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
-  InstrItinData<IIC_SSE_PABS_RM, [InstrStage<1, [FPC_RSV0, FPC_RSV1], 0>,
-                  InstrStage<1, [MEC_RSV]>] >,
-
-  InstrItinData<IIC_SSE_MOV_S_RR, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
-  InstrItinData<IIC_SSE_MOV_S_RM, [InstrStage<1, [FPC_RSV0, FPC_RSV1], 0>,
-                  InstrStage<1, [MEC_RSV]>] >,
-  InstrItinData<IIC_SSE_MOV_S_MR, [InstrStage<1, [FPC_RSV0, FPC_RSV1], 0>,
-                  InstrStage<1, [MEC_RSV]>] >,
-
-  InstrItinData<IIC_SSE_MOVA_P_RR, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
-  InstrItinData<IIC_SSE_MOVA_P_RM, [InstrStage<1, [FPC_RSV0, FPC_RSV1], 0>,
-                  InstrStage<1, [MEC_RSV]>] >,
-  InstrItinData<IIC_SSE_MOVA_P_MR, [InstrStage<1, [FPC_RSV0, FPC_RSV1], 0>,
-                  InstrStage<1, [MEC_RSV]>] >,
-
-  InstrItinData<IIC_SSE_MOVU_P_RR, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
-  InstrItinData<IIC_SSE_MOVU_P_RM, [InstrStage<1, [FPC_RSV0, FPC_RSV1], 0>,
-                  InstrStage<1, [MEC_RSV]>] >,
-  InstrItinData<IIC_SSE_MOVU_P_MR, [InstrStage<1, [FPC_RSV0, FPC_RSV1], 0>,
-                  InstrStage<1, [MEC_RSV]>] >,
-
-  InstrItinData<IIC_SSE_MOV_LH, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
-
-  InstrItinData<IIC_SSE_LDDQU, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
-
-  InstrItinData<IIC_SSE_MOVDQ, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
-  InstrItinData<IIC_SSE_MOVD_ToGP, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>,
-                  InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
-  InstrItinData<IIC_SSE_MOVQ_RR, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
-
-  InstrItinData<IIC_SSE_MOVNT, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-
-  InstrItinData<IIC_SSE_PREFETCH, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_SSE_PAUSE, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_SSE_LFENCE, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_SSE_MFENCE, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_SSE_SFENCE, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_SSE_LDMXCSR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_SSE_STMXCSR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-
-  InstrItinData<IIC_SSE_PHADDSUBD_RR, [InstrStage<6, [FPC_RSV0, FPC_RSV1]>] >,
-  InstrItinData<IIC_SSE_PHADDSUBD_RM, [InstrStage<6, [FPC_RSV0, FPC_RSV1], 0>,
-                  InstrStage<6, [MEC_RSV]>] >,
-  InstrItinData<IIC_SSE_PHADDSUBSW_RR, [InstrStage<9, [FPC_RSV0, FPC_RSV1]>] >,
-  InstrItinData<IIC_SSE_PHADDSUBSW_RM, [InstrStage<9, [FPC_RSV0, FPC_RSV1], 0>,
-                  InstrStage<9, [MEC_RSV]>] >,
-  InstrItinData<IIC_SSE_PHADDSUBW_RR, [InstrStage<9, [FPC_RSV0, FPC_RSV1], 0>,
-                  InstrStage<9, [MEC_RSV]>] >,
-  InstrItinData<IIC_SSE_PHADDSUBW_RM, [InstrStage<9, [FPC_RSV0, FPC_RSV1], 0>,
-                  InstrStage<9, [MEC_RSV]>] >,
-  InstrItinData<IIC_SSE_PSHUFB_RR, [InstrStage<5, [FPC_RSV0, FPC_RSV1]>] >,
-  InstrItinData<IIC_SSE_PSHUFB_RM, [InstrStage<5, [FPC_RSV0, FPC_RSV1], 0>,
-                  InstrStage<5, [MEC_RSV]>] >,
-  InstrItinData<IIC_SSE_PSIGN_RR, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
-  InstrItinData<IIC_SSE_PSIGN_RM, [InstrStage<1, [FPC_RSV0, FPC_RSV1], 0>,
-                  InstrStage<1, [MEC_RSV]>] >,
-
-  InstrItinData<IIC_SSE_PMADD, [InstrStage<5, [FPC_RSV0, FPC_RSV1]>] >,
-  InstrItinData<IIC_SSE_PMULHRSW, [InstrStage<5, [FPC_RSV0, FPC_RSV1]>] >,
-  InstrItinData<IIC_SSE_PALIGNRR, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
-  InstrItinData<IIC_SSE_PALIGNRM, [InstrStage<1, [FPC_RSV0, FPC_RSV1], 0>,
-                  InstrStage<1, [MEC_RSV]>] >,
-  InstrItinData<IIC_SSE_MWAIT, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_SSE_MONITOR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-
-  // conversions
-  // to/from PD ...
-  InstrItinData<IIC_SSE_CVT_PD_RR, [InstrStage<5, [FPC_RSV0, FPC_RSV1]>] >,
-  InstrItinData<IIC_SSE_CVT_PD_RM, [InstrStage<5, [FPC_RSV0, FPC_RSV1], 0>,
-                  InstrStage<5, [MEC_RSV]>] >,
-  // to/from PS except to/from PD and PS2PI
-  InstrItinData<IIC_SSE_CVT_PS_RR, [InstrStage<4, [FPC_RSV0, FPC_RSV1]>] >,
-  InstrItinData<IIC_SSE_CVT_PS_RM, [InstrStage<4, [FPC_RSV0, FPC_RSV1], 0>,
-                  InstrStage<4, [MEC_RSV]>] >,
-  InstrItinData<IIC_SSE_CVT_Scalar_RR, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
-  InstrItinData<IIC_SSE_CVT_Scalar_RM, [InstrStage<1, [FPC_RSV0, FPC_RSV1], 0>,
-                  InstrStage<1, [MEC_RSV]>] >,
-  InstrItinData<IIC_SSE_CVT_SS2SI32_RR, [InstrStage<4, [FPC_RSV0, FPC_RSV1]>] >,
-  InstrItinData<IIC_SSE_CVT_SS2SI32_RM, [InstrStage<4, [FPC_RSV0, FPC_RSV1], 0>,
-                  InstrStage<4, [MEC_RSV]>] >,
-  InstrItinData<IIC_SSE_CVT_SS2SI64_RR, [InstrStage<4, [FPC_RSV0, FPC_RSV1]>] >,
-  InstrItinData<IIC_SSE_CVT_SS2SI64_RM, [InstrStage<4, [FPC_RSV0, FPC_RSV1], 0>,
-                  InstrStage<4, [MEC_RSV]>] >,
-  InstrItinData<IIC_SSE_CVT_SD2SI_RR, [InstrStage<4, [FPC_RSV0, FPC_RSV1]>] >,
-  InstrItinData<IIC_SSE_CVT_SD2SI_RM, [InstrStage<4, [FPC_RSV0, FPC_RSV1], 0>,
-                  InstrStage<4, [MEC_RSV]>] >,
-
-  // MMX MOVs
-  InstrItinData<IIC_MMX_MOV_MM_RM,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_MMX_MOV_REG_MM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_MMX_MOVQ_RM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_MMX_MOVQ_RR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  // other MMX
-  InstrItinData<IIC_MMX_ALU_RM,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_MMX_ALU_RR,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_MMX_ALUQ_RM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_MMX_ALUQ_RR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_MMX_PHADDSUBW_RM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_MMX_PHADDSUBW_RR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_MMX_PHADDSUBD_RM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_MMX_PHADDSUBD_RR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_MMX_PMUL, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_MMX_MISC_FUNC_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_MMX_MISC_FUNC_REG, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_MMX_PSADBW,   [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_MMX_SHIFT_RI, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_MMX_SHIFT_RM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_MMX_SHIFT_RR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_MMX_UNPCK_H_RM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_MMX_UNPCK_H_RR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_MMX_UNPCK_L, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_MMX_PCK_RM,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_MMX_PCK_RR,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_MMX_PSHUF,   [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_MMX_PEXTR,   [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_MMX_PINSRW,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_MMX_MASKMOV, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  // conversions
-  // from/to PD
-  InstrItinData<IIC_MMX_CVT_PD_RR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_MMX_CVT_PD_RM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  // from/to PI
-  InstrItinData<IIC_MMX_CVT_PS_RR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_MMX_CVT_PS_RM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-
-  InstrItinData<IIC_CMPX_LOCK, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_CMPX_LOCK_8, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_CMPX_LOCK_8B, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_CMPX_LOCK_16B, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-
-  InstrItinData<IIC_XADD_LOCK_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_XADD_LOCK_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-
-  InstrItinData<IIC_FILD, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_FLD,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_FLD80, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+def SLMModel : SchedMachineModel {
+  // All x86 instructions are modeled as a single micro-op, and SLM can decode 2
+  // instructions per cycle.
+  let IssueWidth = 2;
+  let MicroOpBufferSize = 32; // Based on the reorder buffer.
+  let LoadLatency = 3;
+  let MispredictPenalty = 10;
+
+  // For small loops, expand by a small factor to hide the backedge cost.
+  let LoopMicroOpBufferSize = 10;
+
+  // FIXME: SSE4 is unimplemented. This flag is set to allow
+  // the scheduler to assign a default model to unrecognized opcodes.
+  let CompleteModel = 0;
+}
 
-  InstrItinData<IIC_FST,   [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_FST80, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_FIST,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+let SchedModel = SLMModel in {
+
+// Silvermont has 5 reservation stations for micro-ops
+
+def IEC_RSV0 : ProcResource<1>;
+def IEC_RSV1 : ProcResource<1>;
+def FPC_RSV0 : ProcResource<1> { let BufferSize = 1; }
+def FPC_RSV1 : ProcResource<1> { let BufferSize = 1; }
+def MEC_RSV  : ProcResource<1>;
+
+// Many micro-ops are capable of issuing on multiple ports.
+def IEC_RSV01  : ProcResGroup<[IEC_RSV0, IEC_RSV1]>;
+def FPC_RSV01  : ProcResGroup<[FPC_RSV0, FPC_RSV1]>;
+
+def SMDivider      : ProcResource<1>;
+def SMFPMultiplier : ProcResource<1>;
+def SMFPDivider    : ProcResource<1>;
+
+// Loads are 3 cycles, so ReadAfterLd registers needn't be available until 3
+// cycles after the memory operand.
+def : ReadAdvance<ReadAfterLd, 3>;
+
+// Many SchedWrites are defined in pairs with and without a folded load.
+// Instructions with folded loads are usually micro-fused, so they only appear
+// as two micro-ops when queued in the reservation station.
+// This multiclass defines the resource usage for variants with and without
+// folded loads.
+multiclass SMWriteResPair<X86FoldableSchedWrite SchedRW,
+                          ProcResourceKind ExePort,
+                          int Lat> {
+  // Register variant is using a single cycle on ExePort.
+  def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; }
+
+  // Memory variant also uses a cycle on MEC_RSV and adds 3 cycles to the
+  // latency.
+  def : WriteRes<SchedRW.Folded, [MEC_RSV, ExePort]> {
+     let Latency = !add(Lat, 3);
+  }
+}
 
-  InstrItinData<IIC_FLDZ,   [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_FUCOM,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_FUCOMI, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_FCOMI,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_FNSTSW, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_FNSTCW, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_FLDCW,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_FNINIT, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_FFREE,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_FNCLEX, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_WAIT,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_FXAM,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_FNOP,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_FLDL,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_F2XM1,  [InstrStage<88, [FPC_RSV0, FPC_RSV1]>] >,
-  InstrItinData<IIC_FYL2X,  [InstrStage<296, [FPC_RSV0, FPC_RSV1]>] >,
-  InstrItinData<IIC_FPTAN,  [InstrStage<281, [FPC_RSV0, FPC_RSV1]>] >,
-  InstrItinData<IIC_FPATAN,  [InstrStage<296, [FPC_RSV0, FPC_RSV1]>] >,
-  InstrItinData<IIC_FXTRACT,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_FPREM1,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_FPSTP,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_FPREM,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_FYL2XP1,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_FSINCOS,  [InstrStage<281, [FPC_RSV0, FPC_RSV1]>] >,
-  InstrItinData<IIC_FRNDINT,  [InstrStage<25, [FPC_RSV0, FPC_RSV1]>] >,
-  InstrItinData<IIC_FSCALE,  [InstrStage<74, [FPC_RSV0, FPC_RSV1]>] >,
-  InstrItinData<IIC_FCOMPP,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_FXSAVE,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_FXRSTOR,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_FXCH, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
+// A folded store needs a cycle on MEC_RSV for the store data, but it does not
+// need an extra port cycle to recompute the address.
+def : WriteRes<WriteRMW, [MEC_RSV]>;
+
+def : WriteRes<WriteStore, [IEC_RSV01, MEC_RSV]>;
+def : WriteRes<WriteLoad,  [MEC_RSV]> { let Latency = 3; }
+def : WriteRes<WriteMove,  [IEC_RSV01]>;
+def : WriteRes<WriteZero,  []>;
+
+defm : SMWriteResPair<WriteALU,   IEC_RSV01, 1>;
+defm : SMWriteResPair<WriteIMul,  IEC_RSV1,  3>;
+defm : SMWriteResPair<WriteShift, IEC_RSV0,  1>;
+defm : SMWriteResPair<WriteJump,  IEC_RSV1,   1>;
+
+// This is for simple LEAs with one or two input operands.
+// The complex ones can only execute on port 1, and they require two cycles on
+// the port to read all inputs. We don't model that.
+def : WriteRes<WriteLEA, [IEC_RSV1]>;
+
+// This is quite rough, latency depends on the dividend.
+def : WriteRes<WriteIDiv, [IEC_RSV01, SMDivider]> {
+  let Latency = 25;
+  let ResourceCycles = [1, 25];
+}
+def : WriteRes<WriteIDivLd, [MEC_RSV, IEC_RSV01, SMDivider]> {
+  let Latency = 29;
+  let ResourceCycles = [1, 1, 25];
+}
 
-  // System instructions
-  InstrItinData<IIC_CPUID, [InstrStage<60, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_INT,   [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_INT3,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_INVD,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_INVLPG, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_IRET,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_HLT,   [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_LXS,   [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_LTR,   [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_RDTSC, [InstrStage<30, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_RSM,   [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_SIDT,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_SGDT,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_SLDT,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_STR,    [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_SWAPGS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_SYSCALL, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_SYS_ENTER_EXIT, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+// Scalar and vector floating point.
+defm : SMWriteResPair<WriteFAdd,   FPC_RSV1, 3>;
+defm : SMWriteResPair<WriteFRcp,   FPC_RSV0, 5>;
+defm : SMWriteResPair<WriteFSqrt,  FPC_RSV0, 15>;
+defm : SMWriteResPair<WriteCvtF2I, FPC_RSV01, 4>;
+defm : SMWriteResPair<WriteCvtI2F, FPC_RSV01, 4>;
+defm : SMWriteResPair<WriteCvtF2F, FPC_RSV01, 4>;
+defm : SMWriteResPair<WriteFShuffle,  FPC_RSV0,  1>;
+defm : SMWriteResPair<WriteFBlend,  FPC_RSV0,  1>;
+
+// This is quite rough, latency depends on precision
+def : WriteRes<WriteFMul, [FPC_RSV0, SMFPMultiplier]> {
+  let Latency = 5;
+  let ResourceCycles = [1, 2];
+}
+def : WriteRes<WriteFMulLd, [MEC_RSV, FPC_RSV0, SMFPMultiplier]> {
+  let Latency = 8;
+  let ResourceCycles = [1, 1, 2];
+}
 
-  InstrItinData<IIC_IN_RR,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_IN_RI,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_OUT_RR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_OUT_IR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_INS,    [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+def : WriteRes<WriteFDiv, [FPC_RSV0, SMFPDivider]> {
+  let Latency = 34;
+  let ResourceCycles = [1, 34];
+}
+def : WriteRes<WriteFDivLd, [MEC_RSV, FPC_RSV0, SMFPDivider]> {
+  let Latency = 37;
+  let ResourceCycles = [1, 1, 34];
+}
 
-  InstrItinData<IIC_MOV_REG_DR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_MOV_DR_REG, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  // worst case for mov REG_CRx
-  InstrItinData<IIC_MOV_REG_CR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_MOV_CR_REG, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+// Vector integer operations.
+defm : SMWriteResPair<WriteVecShift, FPC_RSV0,  1>;
+defm : SMWriteResPair<WriteVecLogic, FPC_RSV01, 1>;
+defm : SMWriteResPair<WriteVecALU,   FPC_RSV01,  1>;
+defm : SMWriteResPair<WriteVecIMul,  FPC_RSV0,   4>;
+defm : SMWriteResPair<WriteShuffle,  FPC_RSV0,  1>;
+defm : SMWriteResPair<WriteBlend,  FPC_RSV0,  1>;
+defm : SMWriteResPair<WriteMPSAD,  FPC_RSV0,  7>;
+
+// String instructions.
+// Packed Compare Implicit Length Strings, Return Mask
+def : WriteRes<WritePCmpIStrM, [FPC_RSV0]> {
+  let Latency = 13;
+  let ResourceCycles = [13];
+}
+def : WriteRes<WritePCmpIStrMLd, [FPC_RSV0, MEC_RSV]> {
+  let Latency = 13;
+  let ResourceCycles = [13, 1];
+}
 
-  InstrItinData<IIC_MOV_REG_SR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_MOV_MEM_SR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_MOV_SR_REG, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_MOV_SR_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  // LAR
-  InstrItinData<IIC_LAR_RM,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_LAR_RR,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  // LSL
-  InstrItinData<IIC_LSL_RM,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_LSL_RR,  [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+// Packed Compare Explicit Length Strings, Return Mask
+def : WriteRes<WritePCmpEStrM, [FPC_RSV0]> {
+  let Latency = 17;
+  let ResourceCycles = [17];
+}
+def : WriteRes<WritePCmpEStrMLd, [FPC_RSV0, MEC_RSV]> {
+  let Latency = 17;
+  let ResourceCycles = [17, 1];
+}
 
-  InstrItinData<IIC_LGDT, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_LIDT, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_LLDT_REG, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_LLDT_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  // push control register, segment registers
-  InstrItinData<IIC_PUSH_CS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_PUSH_SR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  // pop control register, segment registers
-  InstrItinData<IIC_POP_SR,    [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_POP_SR_SS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  // VERR, VERW
-  InstrItinData<IIC_VERR,     [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_VERW_REG, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_VERW_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  // WRMSR, RDMSR
-  InstrItinData<IIC_WRMSR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_RDMSR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_RDPMC, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  // SMSW, LMSW
-  InstrItinData<IIC_SMSW, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_LMSW_REG, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_LMSW_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+// Packed Compare Implicit Length Strings, Return Index
+def : WriteRes<WritePCmpIStrI, [FPC_RSV0]> {
+  let Latency = 17;
+  let ResourceCycles = [17];
+}
+def : WriteRes<WritePCmpIStrILd, [FPC_RSV0, MEC_RSV]> {
+  let Latency = 17;
+  let ResourceCycles = [17, 1];
+}
 
-  InstrItinData<IIC_ENTER, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_LEAVE, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+// Packed Compare Explicit Length Strings, Return Index
+def : WriteRes<WritePCmpEStrI, [FPC_RSV0]> {
+  let Latency = 21;
+  let ResourceCycles = [21];
+}
+def : WriteRes<WritePCmpEStrILd, [FPC_RSV0, MEC_RSV]> {
+  let Latency = 21;
+  let ResourceCycles = [21, 1];
+}
 
-  InstrItinData<IIC_POP_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_POP_REG16, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_POP_REG, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_POP_F, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_POP_FD, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_POP_A, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+// AES Instructions.
+def : WriteRes<WriteAESDecEnc, [FPC_RSV0]> {
+  let Latency = 8;
+  let ResourceCycles = [5];
+}
+def : WriteRes<WriteAESDecEncLd, [FPC_RSV0, MEC_RSV]> {
+  let Latency = 8;
+  let ResourceCycles = [5, 1];
+}
 
-  InstrItinData<IIC_PUSH_IMM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_PUSH_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_PUSH_REG, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_PUSH_F, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_PUSH_A, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
+def : WriteRes<WriteAESIMC, [FPC_RSV0]> {
+  let Latency = 8;
+  let ResourceCycles = [5];
+}
+def : WriteRes<WriteAESIMCLd, [FPC_RSV0, MEC_RSV]> {
+  let Latency = 8;
+  let ResourceCycles = [5, 1];
+}
 
-  InstrItinData<IIC_BSWAP, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_BIT_SCAN_MEM, [InstrStage<10, [IEC_RSV0, IEC_RSV1], 0>,
-                  InstrStage<10, [MEC_RSV]>] >,
-  InstrItinData<IIC_BIT_SCAN_REG, [InstrStage<10, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_MOVS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_STOS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_SCAS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_CMPS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_MOV, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_MOV_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>,
-                  InstrStage<1, [MEC_RSV]>] >,
-  InstrItinData<IIC_AHF, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_BT_MI, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>,
-                  InstrStage<1, [MEC_RSV]>] >,
-  InstrItinData<IIC_BT_MR, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>,
-                  InstrStage<1, [MEC_RSV]>] >,
-  InstrItinData<IIC_BT_RI, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_BT_RR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_BTX_MI, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>,
-                  InstrStage<1, [MEC_RSV]>] >,
-  InstrItinData<IIC_BTX_MR, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>,
-                  InstrStage<1, [MEC_RSV]>] >,
-  InstrItinData<IIC_BTX_RI, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_BTX_RR, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_XCHG_REG, [InstrStage<5, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_XCHG_MEM, [InstrStage<5, [IEC_RSV0, IEC_RSV1], 0>,
-                  InstrStage<5, [MEC_RSV]>] >,
-  InstrItinData<IIC_XADD_REG, [InstrStage<5, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_XADD_MEM, [InstrStage<5, [IEC_RSV0, IEC_RSV1], 0>,
-                  InstrStage<5, [MEC_RSV]>] >,
-  InstrItinData<IIC_CMPXCHG_MEM, [InstrStage<6, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_CMPXCHG_REG, [InstrStage<6, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_CMPXCHG_MEM8, [InstrStage<6, [IEC_RSV0, IEC_RSV1], 0>,
-                  InstrStage<6, [MEC_RSV]>] >,
-  InstrItinData<IIC_CMPXCHG_REG8, [InstrStage<6, [IEC_RSV0, IEC_RSV1], 0>,
-                  InstrStage<6, [MEC_RSV]>] >,
-  InstrItinData<IIC_CMPXCHG_8B, [InstrStage<6, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_CMPXCHG_16B, [InstrStage<6, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_LODS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_OUTS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_CLC, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_CLD, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_CLI, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_CMC, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_CLTS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_STC, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_STI, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_STD, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_XLAT, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_AAA, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_AAD, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_AAM, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_AAS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_DAA, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_DAS, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_BOUND, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_ARPL_REG, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_ARPL_MEM, [InstrStage<1, [IEC_RSV0, IEC_RSV1], 0>,
-                  InstrStage<1, [MEC_RSV]>] >,
-  InstrItinData<IIC_MOVBE, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_AES, [InstrStage<8, [FPC_RSV0]>] >,
-  InstrItinData<IIC_BLEND_NOMEM, [InstrStage<4, [FPC_RSV0, FPC_RSV1]>] >,
-  InstrItinData<IIC_BLEND_MEM, [InstrStage<4, [FPC_RSV0, FPC_RSV1], 0>,
-                  InstrStage<4, [MEC_RSV]>] >,
-  InstrItinData<IIC_BIT_SCAN_MEM, [InstrStage<10, [IEC_RSV0, IEC_RSV1], 0>,
-                  InstrStage<10, [MEC_RSV]>] >,
-  InstrItinData<IIC_BIT_SCAN_REG, [InstrStage<10, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_CBW, [InstrStage<4, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_CRC32_REG, [InstrStage<3, [IEC_RSV0, IEC_RSV1]>] >,
-  InstrItinData<IIC_CRC32_MEM, [InstrStage<3, [IEC_RSV0, IEC_RSV1], 0>,
-                  InstrStage<3, [MEC_RSV]>] >,
-  InstrItinData<IIC_SSE_DPPD_RR, [InstrStage<12, [FPC_RSV0, FPC_RSV1]>] >,
-  InstrItinData<IIC_SSE_DPPD_RM, [InstrStage<12, [FPC_RSV0, FPC_RSV1], 0>,
-                  InstrStage<12, [MEC_RSV]>] >,
-  InstrItinData<IIC_SSE_DPPS_RR, [InstrStage<15, [FPC_RSV0, FPC_RSV1]>] >,
-  InstrItinData<IIC_SSE_DPPS_RM, [InstrStage<15, [FPC_RSV0, FPC_RSV1], 0>,
-                  InstrStage<15, [MEC_RSV]>] >,
-  InstrItinData<IIC_MMX_EMMS, [InstrStage<10, [FPC_RSV0, FPC_RSV1]>] >,
-  InstrItinData<IIC_SSE_EXTRACTPS_RR, [InstrStage<4, [FPC_RSV0, FPC_RSV1]>] >,
-  InstrItinData<IIC_SSE_EXTRACTPS_RM, [InstrStage<4, [FPC_RSV0, FPC_RSV1], 0>,
-                  InstrStage<4, [MEC_RSV]>] >,
-  InstrItinData<IIC_SSE_INSERTPS_RR, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
-  InstrItinData<IIC_SSE_INSERTPS_RM, [InstrStage<1, [FPC_RSV0, FPC_RSV1], 0>,
-                  InstrStage<1, [MEC_RSV]>] >,
-  InstrItinData<IIC_SSE_MPSADBW_RR, [InstrStage<1, [FPC_RSV0, FPC_RSV1]>] >,
-  InstrItinData<IIC_SSE_MPSADBW_RM, [InstrStage<1, [FPC_RSV0, FPC_RSV1], 0>,
-                  InstrStage<1, [MEC_RSV]>] >,
-  InstrItinData<IIC_SSE_PMULLD_RR, [InstrStage<11, [FPC_RSV0, FPC_RSV1]>] >,
-  InstrItinData<IIC_SSE_PMULLD_RM, [InstrStage<11, [FPC_RSV0, FPC_RSV1], 0>,
-                  InstrStage<11, [MEC_RSV]>] >,
-  InstrItinData<IIC_SSE_ROUNDPS_REG, [InstrStage<5, [FPC_RSV0, FPC_RSV1]>] >,
-  InstrItinData<IIC_SSE_ROUNDPS_MEM, [InstrStage<5, [FPC_RSV0, FPC_RSV1], 0>,
-                  InstrStage<5, [MEC_RSV]>] >,
-  InstrItinData<IIC_SSE_ROUNDPD_REG, [InstrStage<4, [FPC_RSV0, FPC_RSV1]>] >,
-  InstrItinData<IIC_SSE_ROUNDPD_MEM, [InstrStage<4, [FPC_RSV0, FPC_RSV1], 0>,
-                  InstrStage<4, [MEC_RSV]>] >,
-  InstrItinData<IIC_SSE_POPCNT_RR, [InstrStage<4, [IEC_RSV1]>] >,
-  InstrItinData<IIC_SSE_POPCNT_RM, [InstrStage<4, [IEC_RSV1], 0>,
-                  InstrStage<4, [MEC_RSV]>] >,
-  InstrItinData<IIC_SSE_PCLMULQDQ_RR, [InstrStage<10, [IEC_RSV1]>] >,
-  InstrItinData<IIC_SSE_PCLMULQDQ_RM, [InstrStage<10, [IEC_RSV1], 0>,
-                  InstrStage<10, [MEC_RSV]>] >,
+def : WriteRes<WriteAESKeyGen, [FPC_RSV0]> {
+  let Latency = 8;
+  let ResourceCycles = [5];
+}
+def : WriteRes<WriteAESKeyGenLd, [FPC_RSV0, MEC_RSV]> {
+  let Latency = 8;
+  let ResourceCycles = [5, 1];
+}
 
-  InstrItinData<IIC_NOP, [InstrStage<1, [IEC_RSV0, IEC_RSV1]>] >
-  ]>;
+// Carry-less multiplication instructions.
+def : WriteRes<WriteCLMul, [FPC_RSV0]> {
+  let Latency = 10;
+  let ResourceCycles = [10];
+}
+def : WriteRes<WriteCLMulLd, [FPC_RSV0, MEC_RSV]> {
+  let Latency = 10;
+  let ResourceCycles = [10, 1];
+}
 
-// Silvermont machine model.
-def SLMModel : SchedMachineModel {
-  let IssueWidth = 2;  // Allows 2 instructions per scheduling group.
-  let MinLatency = 1;  // InstrStage cycles overrides MinLatency.
-                       // OperandCycles may be used for expected latency.
-  let LoadLatency = 3; // Expected cycles, may be overriden by OperandCycles.
-  let HighLatency = 30;// Expected, may be overriden by OperandCycles.
 
-  let Itineraries = SLMItineraries;
-}
+def : WriteRes<WriteSystem,     [FPC_RSV0]> { let Latency = 100; }
+def : WriteRes<WriteMicrocoded, [FPC_RSV0]> { let Latency = 100; }
+def : WriteRes<WriteFence, [MEC_RSV]>;
+def : WriteRes<WriteNop, []>;
+
+// AVX is not supported on that architecture, but we should define the basic
+// scheduling resources anyway.
+def  : WriteRes<WriteIMulH, [FPC_RSV0]>;
+defm : SMWriteResPair<WriteVarBlend, FPC_RSV0, 1>;
+defm : SMWriteResPair<WriteFVarBlend, FPC_RSV0, 1>;
+defm : SMWriteResPair<WriteFShuffle256, FPC_RSV0,  1>;
+defm : SMWriteResPair<WriteShuffle256, FPC_RSV0,  1>;
+defm : SMWriteResPair<WriteVarVecShift, FPC_RSV0,  1>;
+} // SchedModel
diff --git a/lib/Target/X86/X86SelectionDAGInfo.cpp b/lib/Target/X86/X86SelectionDAGInfo.cpp
index b9c620f..744890d 100644
--- a/lib/Target/X86/X86SelectionDAGInfo.cpp
+++ b/lib/Target/X86/X86SelectionDAGInfo.cpp
@@ -11,12 +11,13 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "x86-selectiondag-info"
 #include "X86TargetMachine.h"
 #include "llvm/CodeGen/SelectionDAG.h"
 #include "llvm/IR/DerivedTypes.h"
 using namespace llvm;
 
+#define DEBUG_TYPE "x86-selectiondag-info"
+
 X86SelectionDAGInfo::X86SelectionDAGInfo(const X86TargetMachine &TM) :
   TargetSelectionDAGInfo(TM),
   Subtarget(&TM.getSubtarget<X86Subtarget>()),
@@ -50,7 +51,7 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl,
     ConstantSDNode *V = dyn_cast<ConstantSDNode>(Src);
 
     if (const char *bzeroEntry =  V &&
-        V->isNullValue() ? Subtarget->getBZeroEntry() : 0) {
+        V->isNullValue() ? Subtarget->getBZeroEntry() : nullptr) {
       EVT IntPtr = TLI.getPointerTy();
       Type *IntPtrTy = getDataLayout()->getIntPtrType(*DAG.getContext());
       TargetLowering::ArgListTy Args;
@@ -60,15 +61,14 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl,
       Args.push_back(Entry);
       Entry.Node = Size;
       Args.push_back(Entry);
-      TargetLowering::
-      CallLoweringInfo CLI(Chain, Type::getVoidTy(*DAG.getContext()),
-                        false, false, false, false,
-                        0, CallingConv::C, /*isTailCall=*/false,
-                        /*doesNotRet=*/false, /*isReturnValueUsed=*/false,
-                        DAG.getExternalSymbol(bzeroEntry, IntPtr), Args,
-                        DAG, dl);
-      std::pair<SDValue,SDValue> CallResult =
-        TLI.LowerCallTo(CLI);
+
+      TargetLowering::CallLoweringInfo CLI(DAG);
+      CLI.setDebugLoc(dl).setChain(Chain)
+        .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()),
+                   DAG.getExternalSymbol(bzeroEntry, IntPtr), &Args, 0)
+        .setDiscardResult();
+
+      std::pair<SDValue,SDValue> CallResult = TLI.LowerCallTo(CLI);
       return CallResult.second;
     }
 
@@ -77,7 +77,7 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl,
   }
 
   uint64_t SizeVal = ConstantSize->getZExtValue();
-  SDValue InFlag(0, 0);
+  SDValue InFlag;
   EVT AVT;
   SDValue Count;
   ConstantSDNode *ValC = dyn_cast<ConstantSDNode>(Src);
@@ -139,7 +139,7 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl,
 
   SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
   SDValue Ops[] = { Chain, DAG.getValueType(AVT), InFlag };
-  Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops, array_lengthof(Ops));
+  Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops);
 
   if (TwoRepStos) {
     InFlag = Chain.getValue(1);
@@ -153,7 +153,7 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, SDLoc dl,
     InFlag = Chain.getValue(1);
     Tys = DAG.getVTList(MVT::Other, MVT::Glue);
     SDValue Ops[] = { Chain, DAG.getValueType(MVT::i8), InFlag };
-    Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops, array_lengthof(Ops));
+    Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops);
   } else if (BytesLeft) {
     // Handle the last 1 - 7 bytes.
     unsigned Offset = SizeVal - BytesLeft;
@@ -225,7 +225,7 @@ X86SelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl,
   SDValue Count = DAG.getIntPtrConstant(CountVal);
   unsigned BytesLeft = SizeVal % UBytes;
 
-  SDValue InFlag(0, 0);
+  SDValue InFlag;
   Chain  = DAG.getCopyToReg(Chain, dl, Subtarget->is64Bit() ? X86::RCX :
                                                               X86::ECX,
                             Count, InFlag);
@@ -241,8 +241,7 @@ X86SelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl,
 
   SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
   SDValue Ops[] = { Chain, DAG.getValueType(AVT), InFlag };
-  SDValue RepMovs = DAG.getNode(X86ISD::REP_MOVS, dl, Tys, Ops,
-                                array_lengthof(Ops));
+  SDValue RepMovs = DAG.getNode(X86ISD::REP_MOVS, dl, Tys, Ops);
 
   SmallVector<SDValue, 4> Results;
   Results.push_back(RepMovs);
@@ -263,6 +262,5 @@ X86SelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl,
                                     SrcPtrInfo.getWithOffset(Offset)));
   }
 
-  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
-                     &Results[0], Results.size());
+  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Results);
 }
diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp
index 207d0ba..989e0d6 100644
--- a/lib/Target/X86/X86Subtarget.cpp
+++ b/lib/Target/X86/X86Subtarget.cpp
@@ -11,12 +11,12 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "subtarget"
 #include "X86Subtarget.h"
 #include "X86InstrInfo.h"
 #include "llvm/IR/Attributes.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GlobalValue.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/Host.h"
@@ -24,15 +24,24 @@
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetOptions.h"
 
+#if defined(_MSC_VER)
+#include <intrin.h>
+#endif
+
+using namespace llvm;
+
+#define DEBUG_TYPE "subtarget"
+
 #define GET_SUBTARGETINFO_TARGET_DESC
 #define GET_SUBTARGETINFO_CTOR
 #include "X86GenSubtargetInfo.inc"
 
-using namespace llvm;
+// Temporary option to control early if-conversion for x86 while adding machine
+// models.
+static cl::opt<bool>
+X86EarlyIfConv("x86-early-ifcvt", cl::Hidden,
+               cl::desc("Enable early if-conversion on X86"));
 
-#if defined(_MSC_VER)
-#include <intrin.h>
-#endif
 
 /// ClassifyBlockAddressReference - Classify a blockaddress reference for the
 /// current subtarget according to how we should reference it in a non-pcrel
@@ -153,7 +162,7 @@ const char *X86Subtarget::getBZeroEntry() const {
       !getTargetTriple().isMacOSXVersionLT(10, 6))
     return "__bzero";
 
-  return 0;
+  return nullptr;
 }
 
 bool X86Subtarget::hasSinCos() const {
@@ -173,251 +182,16 @@ bool X86Subtarget::IsLegalToCallImmediateAddr(const TargetMachine &TM) const {
   return isTargetELF() || TM.getRelocationModel() == Reloc::Static;
 }
 
-static bool OSHasAVXSupport() {
-#if defined(i386) || defined(__i386__) || defined(__x86__) || defined(_M_IX86)\
-    || defined(__x86_64__) || defined(_M_AMD64) || defined (_M_X64)
-#if defined(__GNUC__)
-  // Check xgetbv; this uses a .byte sequence instead of the instruction
-  // directly because older assemblers do not include support for xgetbv and
-  // there is no easy way to conditionally compile based on the assembler used.
-  int rEAX, rEDX;
-  __asm__ (".byte 0x0f, 0x01, 0xd0" : "=a" (rEAX), "=d" (rEDX) : "c" (0));
-#elif defined(_MSC_FULL_VER) && defined(_XCR_XFEATURE_ENABLED_MASK)
-  unsigned long long rEAX = _xgetbv(_XCR_XFEATURE_ENABLED_MASK);
-#else
-  int rEAX = 0; // Ensures we return false
-#endif
-  return (rEAX & 6) == 6;
-#else
-  return false;
-#endif
-}
-
-void X86Subtarget::AutoDetectSubtargetFeatures() {
-  unsigned EAX = 0, EBX = 0, ECX = 0, EDX = 0;
-  unsigned MaxLevel;
-  union {
-    unsigned u[3];
-    char     c[12];
-  } text;
-
-  if (X86_MC::GetCpuIDAndInfo(0, &MaxLevel, text.u+0, text.u+2, text.u+1) ||
-      MaxLevel < 1)
-    return;
-
-  X86_MC::GetCpuIDAndInfo(0x1, &EAX, &EBX, &ECX, &EDX);
-
-  if ((EDX >> 15) & 1) { HasCMov = true;      ToggleFeature(X86::FeatureCMOV); }
-  if ((EDX >> 23) & 1) { X86SSELevel = MMX;   ToggleFeature(X86::FeatureMMX);  }
-  if ((EDX >> 25) & 1) { X86SSELevel = SSE1;  ToggleFeature(X86::FeatureSSE1); }
-  if ((EDX >> 26) & 1) { X86SSELevel = SSE2;  ToggleFeature(X86::FeatureSSE2); }
-  if (ECX & 0x1)       { X86SSELevel = SSE3;  ToggleFeature(X86::FeatureSSE3); }
-  if ((ECX >> 9)  & 1) { X86SSELevel = SSSE3; ToggleFeature(X86::FeatureSSSE3);}
-  if ((ECX >> 19) & 1) { X86SSELevel = SSE41; ToggleFeature(X86::FeatureSSE41);}
-  if ((ECX >> 20) & 1) { X86SSELevel = SSE42; ToggleFeature(X86::FeatureSSE42);}
-  if (((ECX >> 27) & 1) && ((ECX >> 28) & 1) && OSHasAVXSupport()) {
-    X86SSELevel = AVX;   ToggleFeature(X86::FeatureAVX);
-  }
-
-  bool IsIntel = memcmp(text.c, "GenuineIntel", 12) == 0;
-  bool IsAMD   = !IsIntel && memcmp(text.c, "AuthenticAMD", 12) == 0;
-
-  if ((ECX >> 1) & 0x1) {
-    HasPCLMUL = true;
-    ToggleFeature(X86::FeaturePCLMUL);
-  }
-  if ((ECX >> 12) & 0x1) {
-    HasFMA = true;
-    ToggleFeature(X86::FeatureFMA);
-  }
-  if (IsIntel && ((ECX >> 22) & 0x1)) {
-    HasMOVBE = true;
-    ToggleFeature(X86::FeatureMOVBE);
-  }
-  if ((ECX >> 23) & 0x1) {
-    HasPOPCNT = true;
-    ToggleFeature(X86::FeaturePOPCNT);
-  }
-  if ((ECX >> 25) & 0x1) {
-    HasAES = true;
-    ToggleFeature(X86::FeatureAES);
-  }
-  if ((ECX >> 29) & 0x1) {
-    HasF16C = true;
-    ToggleFeature(X86::FeatureF16C);
-  }
-  if (IsIntel && ((ECX >> 30) & 0x1)) {
-    HasRDRAND = true;
-    ToggleFeature(X86::FeatureRDRAND);
-  }
-
-  if ((ECX >> 13) & 0x1) {
-    HasCmpxchg16b = true;
-    ToggleFeature(X86::FeatureCMPXCHG16B);
-  }
-
-  if (IsIntel || IsAMD) {
-    // Determine if bit test memory instructions are slow.
-    unsigned Family = 0;
-    unsigned Model  = 0;
-    X86_MC::DetectFamilyModel(EAX, Family, Model);
-    if (IsAMD || (Family == 6 && Model >= 13)) {
-      IsBTMemSlow = true;
-      ToggleFeature(X86::FeatureSlowBTMem);
-    }
-
-    // Determine if SHLD/SHRD instructions have higher latency then the
-    // equivalent series of shifts/or instructions. 
-    // FIXME: Add Intel's processors that have SHLD instructions with very
-    // poor latency. 
-    if (IsAMD) {
-      IsSHLDSlow = true;
-      ToggleFeature(X86::FeatureSlowSHLD);
-    }
-
-    // If it's an Intel chip since Nehalem and not an Atom chip, unaligned
-    // memory access is fast. We hard code model numbers here because they
-    // aren't strictly increasing for Intel chips it seems.
-    if (IsIntel &&
-        ((Family == 6 && Model == 0x1E) || // Nehalem: Clarksfield, Lynnfield,
-                                           //          Jasper Froest
-         (Family == 6 && Model == 0x1A) || // Nehalem: Bloomfield, Nehalem-EP
-         (Family == 6 && Model == 0x2E) || // Nehalem: Nehalem-EX
-         (Family == 6 && Model == 0x25) || // Westmere: Arrandale, Clarksdale
-         (Family == 6 && Model == 0x2C) || // Westmere: Gulftown, Westmere-EP
-         (Family == 6 && Model == 0x2F) || // Westmere: Westmere-EX
-         (Family == 6 && Model == 0x2A) || // SandyBridge
-         (Family == 6 && Model == 0x2D) || // SandyBridge: SandyBridge-E*
-         (Family == 6 && Model == 0x3A) || // IvyBridge
-         (Family == 6 && Model == 0x3E) || // IvyBridge EP
-         (Family == 6 && Model == 0x3C) || // Haswell
-         (Family == 6 && Model == 0x3F) || // ...
-         (Family == 6 && Model == 0x45) || // ...
-         (Family == 6 && Model == 0x46))) { // ...
-      IsUAMemFast = true;
-      ToggleFeature(X86::FeatureFastUAMem);
-    }
-
-    // Set processor type. Currently only Atom or Silvermont (SLM) is detected.
-    if (Family == 6 &&
-        (Model == 28 || Model == 38 || Model == 39 ||
-         Model == 53 || Model == 54)) {
-      X86ProcFamily = IntelAtom;
-
-      UseLeaForSP = true;
-      ToggleFeature(X86::FeatureLeaForSP);
-    }
-    else if (Family == 6 &&
-        (Model == 55 || Model == 74 || Model == 77)) {
-      X86ProcFamily = IntelSLM;
-    }
-
-    unsigned MaxExtLevel;
-    X86_MC::GetCpuIDAndInfo(0x80000000, &MaxExtLevel, &EBX, &ECX, &EDX);
-
-    if (MaxExtLevel >= 0x80000001) {
-      X86_MC::GetCpuIDAndInfo(0x80000001, &EAX, &EBX, &ECX, &EDX);
-      if ((EDX >> 29) & 0x1) {
-        HasX86_64 = true;
-        ToggleFeature(X86::Feature64Bit);
-      }
-      if ((ECX >> 5) & 0x1) {
-        HasLZCNT = true;
-        ToggleFeature(X86::FeatureLZCNT);
-      }
-      if (IsIntel && ((ECX >> 8) & 0x1)) {
-        HasPRFCHW = true;
-        ToggleFeature(X86::FeaturePRFCHW);
-      }
-      if (IsAMD) {
-        if ((ECX >> 6) & 0x1) {
-          HasSSE4A = true;
-          ToggleFeature(X86::FeatureSSE4A);
-        }
-        if ((ECX >> 11) & 0x1) {
-          HasXOP = true;
-          ToggleFeature(X86::FeatureXOP);
-        }
-        if ((ECX >> 16) & 0x1) {
-          HasFMA4 = true;
-          ToggleFeature(X86::FeatureFMA4);
-        }
-      }
-    }
-  }
-
-  if (MaxLevel >= 7) {
-    if (!X86_MC::GetCpuIDAndInfoEx(0x7, 0x0, &EAX, &EBX, &ECX, &EDX)) {
-      if (IsIntel && (EBX & 0x1)) {
-        HasFSGSBase = true;
-        ToggleFeature(X86::FeatureFSGSBase);
-      }
-      if ((EBX >> 3) & 0x1) {
-        HasBMI = true;
-        ToggleFeature(X86::FeatureBMI);
-      }
-      if ((EBX >> 4) & 0x1) {
-        HasHLE = true;
-        ToggleFeature(X86::FeatureHLE);
-      }
-      if (IsIntel && ((EBX >> 5) & 0x1)) {
-        X86SSELevel = AVX2;
-        ToggleFeature(X86::FeatureAVX2);
-      }
-      if (IsIntel && ((EBX >> 8) & 0x1)) {
-        HasBMI2 = true;
-        ToggleFeature(X86::FeatureBMI2);
-      }
-      if (IsIntel && ((EBX >> 11) & 0x1)) {
-        HasRTM = true;
-        ToggleFeature(X86::FeatureRTM);
-      }
-      if (IsIntel && ((EBX >> 16) & 0x1)) {
-        X86SSELevel = AVX512F;
-        ToggleFeature(X86::FeatureAVX512);
-      }
-      if (IsIntel && ((EBX >> 18) & 0x1)) {
-        HasRDSEED = true;
-        ToggleFeature(X86::FeatureRDSEED);
-      }
-      if (IsIntel && ((EBX >> 19) & 0x1)) {
-        HasADX = true;
-        ToggleFeature(X86::FeatureADX);
-      }
-      if (IsIntel && ((EBX >> 26) & 0x1)) {
-        HasPFI = true;
-        ToggleFeature(X86::FeaturePFI);
-      }
-      if (IsIntel && ((EBX >> 27) & 0x1)) {
-        HasERI = true;
-        ToggleFeature(X86::FeatureERI);
-      }
-      if (IsIntel && ((EBX >> 28) & 0x1)) {
-        HasCDI = true;
-        ToggleFeature(X86::FeatureCDI);
-      }
-      if (IsIntel && ((EBX >> 29) & 0x1)) {
-        HasSHA = true;
-        ToggleFeature(X86::FeatureSHA);
-      }
-    }
-    if (IsAMD && ((ECX >> 21) & 0x1)) {
-      HasTBM = true;
-      ToggleFeature(X86::FeatureTBM);
-    }
-  }
-}
-
 void X86Subtarget::resetSubtargetFeatures(const MachineFunction *MF) {
   AttributeSet FnAttrs = MF->getFunction()->getAttributes();
-  Attribute CPUAttr = FnAttrs.getAttribute(AttributeSet::FunctionIndex,
-                                           "target-cpu");
-  Attribute FSAttr = FnAttrs.getAttribute(AttributeSet::FunctionIndex,
-                                          "target-features");
+  Attribute CPUAttr =
+      FnAttrs.getAttribute(AttributeSet::FunctionIndex, "target-cpu");
+  Attribute FSAttr =
+      FnAttrs.getAttribute(AttributeSet::FunctionIndex, "target-features");
   std::string CPU =
-    !CPUAttr.hasAttribute(Attribute::None) ?CPUAttr.getValueAsString() : "";
+      !CPUAttr.hasAttribute(Attribute::None) ? CPUAttr.getValueAsString() : "";
   std::string FS =
-    !FSAttr.hasAttribute(Attribute::None) ? FSAttr.getValueAsString() : "";
+      !FSAttr.hasAttribute(Attribute::None) ? FSAttr.getValueAsString() : "";
   if (!FS.empty()) {
     initializeEnvironment();
     resetSubtargetFeatures(CPU, FS);
@@ -426,54 +200,23 @@ void X86Subtarget::resetSubtargetFeatures(const MachineFunction *MF) {
 
 void X86Subtarget::resetSubtargetFeatures(StringRef CPU, StringRef FS) {
   std::string CPUName = CPU;
-  if (!FS.empty() || !CPU.empty()) {
-    if (CPUName.empty()) {
-#if defined(i386) || defined(__i386__) || defined(__x86__) || defined(_M_IX86)\
-    || defined(__x86_64__) || defined(_M_AMD64) || defined (_M_X64)
-      CPUName = sys::getHostCPUName();
-#else
-      CPUName = "generic";
-#endif
-    }
-
-    // Make sure 64-bit features are available in 64-bit mode. (But make sure
-    // SSE2 can be turned off explicitly.)
-    std::string FullFS = FS;
-    if (In64BitMode) {
-      if (!FullFS.empty())
-        FullFS = "+64bit,+sse2," + FullFS;
-      else
-        FullFS = "+64bit,+sse2";
-    }
-
-    // If feature string is not empty, parse features string.
-    ParseSubtargetFeatures(CPUName, FullFS);
-  } else {
-    if (CPUName.empty()) {
-#if defined (__x86_64__) || defined(__i386__)
-      CPUName = sys::getHostCPUName();
-#else
-      CPUName = "generic";
-#endif
-    }
-    // Otherwise, use CPUID to auto-detect feature set.
-    AutoDetectSubtargetFeatures();
-
-    // Make sure 64-bit features are available in 64-bit mode.
-    if (In64BitMode) {
-      if (!HasX86_64) { HasX86_64 = true; ToggleFeature(X86::Feature64Bit); }
-      if (!HasCMov)   { HasCMov   = true; ToggleFeature(X86::FeatureCMOV); }
-
-      if (X86SSELevel < SSE2) {
-        X86SSELevel = SSE2;
-        ToggleFeature(X86::FeatureSSE1);
-        ToggleFeature(X86::FeatureSSE2);
-      }
-    }
+  if (CPUName.empty())
+    CPUName = "generic";
+
+  // Make sure 64-bit features are available in 64-bit mode. (But make sure
+  // SSE2 can be turned off explicitly.)
+  std::string FullFS = FS;
+  if (In64BitMode) {
+    if (!FullFS.empty())
+      FullFS = "+64bit,+sse2," + FullFS;
+    else
+      FullFS = "+64bit,+sse2";
   }
 
-  // CPUName may have been set by the CPU detection code. Make sure the
-  // new MCSchedModel is used.
+  // If feature string is not empty, parse features string.
+  ParseSubtargetFeatures(CPUName, FullFS);
+
+  // Make sure the right MCSchedModel is used.
   InitCPUSchedModel(CPUName);
 
   if (X86ProcFamily == IntelAtom || X86ProcFamily == IntelSLM)
@@ -547,33 +290,36 @@ void X86Subtarget::initializeEnvironment() {
   PadShortFunctions = false;
   CallRegIndirect = false;
   LEAUsesAG = false;
+  SlowLEA = false;
   stackAlignment = 4;
   // FIXME: this is a known good value for Yonah. How about others?
   MaxInlineSizeThreshold = 128;
 }
 
 X86Subtarget::X86Subtarget(const std::string &TT, const std::string &CPU,
-                           const std::string &FS,
-                           unsigned StackAlignOverride)
-  : X86GenSubtargetInfo(TT, CPU, FS)
-  , X86ProcFamily(Others)
-  , PICStyle(PICStyles::None)
-  , TargetTriple(TT)
-  , StackAlignOverride(StackAlignOverride)
-  , In64BitMode(TargetTriple.getArch() == Triple::x86_64)
-  , In32BitMode(TargetTriple.getArch() == Triple::x86 &&
-                TargetTriple.getEnvironment() != Triple::CODE16)
-  , In16BitMode(TargetTriple.getArch() == Triple::x86 &&
-                TargetTriple.getEnvironment() == Triple::CODE16) {
+                           const std::string &FS, unsigned StackAlignOverride)
+    : X86GenSubtargetInfo(TT, CPU, FS), X86ProcFamily(Others),
+      PICStyle(PICStyles::None), TargetTriple(TT),
+      StackAlignOverride(StackAlignOverride),
+      In64BitMode(TargetTriple.getArch() == Triple::x86_64),
+      In32BitMode(TargetTriple.getArch() == Triple::x86 &&
+                  TargetTriple.getEnvironment() != Triple::CODE16),
+      In16BitMode(TargetTriple.getArch() == Triple::x86 &&
+                  TargetTriple.getEnvironment() == Triple::CODE16) {
   initializeEnvironment();
   resetSubtargetFeatures(CPU, FS);
 }
 
-bool X86Subtarget::enablePostRAScheduler(
-           CodeGenOpt::Level OptLevel,
-           TargetSubtargetInfo::AntiDepBreakMode& Mode,
-           RegClassVector& CriticalPathRCs) const {
+bool
+X86Subtarget::enablePostRAScheduler(CodeGenOpt::Level OptLevel,
+                                    TargetSubtargetInfo::AntiDepBreakMode &Mode,
+                                    RegClassVector &CriticalPathRCs) const {
   Mode = TargetSubtargetInfo::ANTIDEP_CRITICAL;
   CriticalPathRCs.clear();
   return PostRAScheduler && OptLevel >= CodeGenOpt::Default;
 }
+
+bool
+X86Subtarget::enableEarlyIfConversion() const {
+  return hasCMov() && X86EarlyIfConv;
+}
diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h
index 52986b9..703559a 100644
--- a/lib/Target/X86/X86Subtarget.h
+++ b/lib/Target/X86/X86Subtarget.h
@@ -178,6 +178,9 @@ protected:
   ///             address generation (AG) time.
   bool LEAUsesAG;
 
+  /// SlowLEA - True if the LEA instruction with certain arguments is slow
+  bool SlowLEA;
+
   /// Processor has AVX-512 PreFetch Instructions
   bool HasPFI;
   
@@ -235,10 +238,6 @@ public:
   /// subtarget options.  Definition of function is auto generated by tblgen.
   void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
 
-  /// AutoDetectSubtargetFeatures - Auto-detect CPU features using CPUID
-  /// instruction.
-  void AutoDetectSubtargetFeatures();
-
   /// \brief Reset the features for the X86 target.
   void resetSubtargetFeatures(const MachineFunction *MF) override;
 private:
@@ -319,11 +318,13 @@ public:
   bool padShortFunctions() const { return PadShortFunctions; }
   bool callRegIndirect() const { return CallRegIndirect; }
   bool LEAusesAG() const { return LEAUsesAG; }
+  bool slowLEA() const { return SlowLEA; }
   bool hasCDI() const { return HasCDI; }
   bool hasPFI() const { return HasPFI; }
   bool hasERI() const { return HasERI; }
 
   bool isAtom() const { return X86ProcFamily == IntelAtom; }
+  bool isSLM() const { return X86ProcFamily == IntelSLM; }
 
   const Triple &getTargetTriple() const { return TargetTriple; }
 
@@ -429,6 +430,8 @@ public:
 
   bool postRAScheduler() const { return PostRAScheduler; }
 
+  bool enableEarlyIfConversion() const override;
+
   /// getInstrItins = Return the instruction itineraries based on the
   /// subtarget selection.
   const InstrItineraryData &getInstrItineraryData() const { return InstrItins; }
diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp
index 6f09ccf..93760ef 100644
--- a/lib/Target/X86/X86TargetMachine.cpp
+++ b/lib/Target/X86/X86TargetMachine.cpp
@@ -108,6 +108,13 @@ X86TargetMachine::X86TargetMachine(const Target &T, StringRef TT,
   if (Options.FloatABIType == FloatABI::Default)
     this->Options.FloatABIType = FloatABI::Hard;
 
+  // Windows stack unwinder gets confused when execution flow "falls through"
+  // after a call to 'noreturn' function.
+  // To prevent that, we emit a trap for 'unreachable' IR instructions.
+  // (which on X86, happens to be the 'ud2' instruction)
+  if (Subtarget.isTargetWin64())
+    this->Options.TrapUnreachable = true;
+
   initAsmInfo();
 }
 
@@ -119,12 +126,6 @@ UseVZeroUpper("x86-use-vzeroupper", cl::Hidden,
   cl::desc("Minimize AVX to SSE transition penalty"),
   cl::init(true));
 
-// Temporary option to control early if-conversion for x86 while adding machine
-// models.
-static cl::opt<bool>
-X86EarlyIfConv("x86-early-ifcvt", cl::Hidden,
-	       cl::desc("Enable early if-conversion on X86"));
-
 //===----------------------------------------------------------------------===//
 // X86 Analysis Pass Setup
 //===----------------------------------------------------------------------===//
@@ -177,19 +178,14 @@ bool X86PassConfig::addInstSelector() {
   if (getX86Subtarget().isTargetELF() && getOptLevel() != CodeGenOpt::None)
     addPass(createCleanupLocalDynamicTLSPass());
 
-  // For 32-bit, prepend instructions to set the "global base reg" for PIC.
-  if (!getX86Subtarget().is64Bit())
-    addPass(createGlobalBaseRegPass());
+  addPass(createX86GlobalBaseRegPass());
 
   return false;
 }
 
 bool X86PassConfig::addILPOpts() {
-  if (X86EarlyIfConv && getX86Subtarget().hasCMov()) {
-    addPass(&EarlyIfConverterID);
-    return true;
-  }
-  return false;
+  addPass(&EarlyIfConverterID);
+  return true;
 }
 
 bool X86PassConfig::addPreRegAlloc() {
@@ -208,18 +204,13 @@ bool X86PassConfig::addPreEmitPass() {
     ShouldPrint = true;
   }
 
-  if (getX86Subtarget().hasAVX() && UseVZeroUpper) {
+  if (UseVZeroUpper) {
     addPass(createX86IssueVZeroUpperPass());
     ShouldPrint = true;
   }
 
-  if (getOptLevel() != CodeGenOpt::None &&
-      getX86Subtarget().padShortFunctions()) {
+  if (getOptLevel() != CodeGenOpt::None) {
     addPass(createX86PadShortFunctions());
-    ShouldPrint = true;
-  }
-  if (getOptLevel() != CodeGenOpt::None &&
-      getX86Subtarget().LEAusesAG()){
     addPass(createX86FixupLEAs());
     ShouldPrint = true;
   }
diff --git a/lib/Target/X86/X86TargetObjectFile.cpp b/lib/Target/X86/X86TargetObjectFile.cpp
index 0a88e98..8157085 100644
--- a/lib/Target/X86/X86TargetObjectFile.cpp
+++ b/lib/Target/X86/X86TargetObjectFile.cpp
@@ -26,7 +26,7 @@ const MCExpr *X86_64MachoTargetObjectFile::getTTypeGlobalReference(
 
   // On Darwin/X86-64, we can reference dwarf symbols with foo@GOTPCREL+4, which
   // is an indirect pc-relative reference.
-  if (Encoding & (DW_EH_PE_indirect | DW_EH_PE_pcrel)) {
+  if ((Encoding & DW_EH_PE_indirect) && (Encoding & DW_EH_PE_pcrel)) {
     const MCSymbol *Sym = TM.getSymbol(GV, Mang);
     const MCExpr *Res =
       MCSymbolRefExpr::Create(Sym, MCSymbolRefExpr::VK_GOTPCREL, getContext());
@@ -62,7 +62,7 @@ const MCExpr *X86WindowsTargetObjectFile::getExecutableRelativeSymbol(
   // operation.
   const SubOperator *Sub = dyn_cast<SubOperator>(CE);
   if (!Sub)
-    return 0;
+    return nullptr;
 
   // Symbols must first be numbers before we can subtract them, we need to see a
   // ptrtoint on both subtraction operands.
@@ -71,13 +71,13 @@ const MCExpr *X86WindowsTargetObjectFile::getExecutableRelativeSymbol(
   const PtrToIntOperator *SubRHS =
       dyn_cast<PtrToIntOperator>(Sub->getOperand(1));
   if (!SubLHS || !SubRHS)
-    return 0;
+    return nullptr;
 
   // Our symbols should exist in address space zero, cowardly no-op if
   // otherwise.
   if (SubLHS->getPointerAddressSpace() != 0 ||
       SubRHS->getPointerAddressSpace() != 0)
-    return 0;
+    return nullptr;
 
   // Both ptrtoint instructions must wrap global variables:
   // - Only global variables are eligible for image relative relocations.
@@ -87,7 +87,7 @@ const MCExpr *X86WindowsTargetObjectFile::getExecutableRelativeSymbol(
   const GlobalVariable *GVRHS =
       dyn_cast<GlobalVariable>(SubRHS->getPointerOperand());
   if (!GVLHS || !GVRHS)
-    return 0;
+    return nullptr;
 
   // We expect __ImageBase to be a global variable without a section, externally
   // defined.
@@ -96,11 +96,11 @@ const MCExpr *X86WindowsTargetObjectFile::getExecutableRelativeSymbol(
   if (GVRHS->isThreadLocal() || GVRHS->getName() != "__ImageBase" ||
       !GVRHS->hasExternalLinkage() || GVRHS->hasInitializer() ||
       GVRHS->hasSection())
-    return 0;
+    return nullptr;
 
   // An image-relative, thread-local, symbol makes no sense.
   if (GVLHS->isThreadLocal())
-    return 0;
+    return nullptr;
 
   return MCSymbolRefExpr::Create(TM.getSymbol(GVLHS, Mang),
                                  MCSymbolRefExpr::VK_COFF_IMGREL32,
diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp
index c04964d..91b9d40 100644
--- a/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -14,37 +14,24 @@
 ///
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "x86tti"
 #include "X86.h"
 #include "X86TargetMachine.h"
-#include "llvm/ADT/DepthFirstIterator.h"
-#include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/IR/IntrinsicInst.h"
-#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Target/CostTable.h"
 #include "llvm/Target/TargetLowering.h"
 using namespace llvm;
 
+#define DEBUG_TYPE "x86tti"
+
 // Declare the pass initialization routine locally as target-specific passes
-// don't havve a target-wide initialization entry point, and so we rely on the
+// don't have a target-wide initialization entry point, and so we rely on the
 // pass constructor initialization.
 namespace llvm {
 void initializeX86TTIPass(PassRegistry &);
 }
 
-static cl::opt<bool>
-UsePartialUnrolling("x86-use-partial-unrolling", cl::init(true),
-  cl::desc("Use partial unrolling for some X86 targets"), cl::Hidden);
-static cl::opt<unsigned>
-PartialUnrollingThreshold("x86-partial-unrolling-threshold", cl::init(0),
-  cl::desc("Threshold for X86 partial unrolling"), cl::Hidden);
-static cl::opt<unsigned>
-PartialUnrollingMaxBranches("x86-partial-max-branches", cl::init(2),
-  cl::desc("Threshold for taken branches in X86 partial unrolling"),
-  cl::Hidden);
-
 namespace {
 
 class X86TTI final : public ImmutablePass, public TargetTransformInfo {
@@ -56,7 +43,7 @@ class X86TTI final : public ImmutablePass, public TargetTransformInfo {
   unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const;
 
 public:
-  X86TTI() : ImmutablePass(ID), ST(0), TLI(0) {
+  X86TTI() : ImmutablePass(ID), ST(nullptr), TLI(nullptr) {
     llvm_unreachable("This pass cannot be directly constructed");
   }
 
@@ -87,8 +74,6 @@ public:
   /// \name Scalar TTI Implementations
   /// @{
   PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override;
-  void getUnrollingPreferences(Loop *L,
-                               UnrollingPreferences &UP) const override;
 
   /// @}
 
@@ -153,93 +138,6 @@ X86TTI::PopcntSupportKind X86TTI::getPopcntSupport(unsigned TyWidth) const {
   return ST->hasPOPCNT() ? PSK_FastHardware : PSK_Software;
 }
 
-void X86TTI::getUnrollingPreferences(Loop *L, UnrollingPreferences &UP) const {
-  if (!UsePartialUnrolling)
-    return;
-  // According to the Intel 64 and IA-32 Architectures Optimization Reference
-  // Manual, Intel Core models and later have a loop stream detector
-  // (and associated uop queue) that can benefit from partial unrolling.
-  // The relevant requirements are:
-  //  - The loop must have no more than 4 (8 for Nehalem and later) branches
-  //    taken, and none of them may be calls.
-  //  - The loop can have no more than 18 (28 for Nehalem and later) uops.
-
-  // According to the Software Optimization Guide for AMD Family 15h Processors,
-  // models 30h-4fh (Steamroller and later) have a loop predictor and loop
-  // buffer which can benefit from partial unrolling.
-  // The relevant requirements are:
-  //  - The loop must have fewer than 16 branches
-  //  - The loop must have less than 40 uops in all executed loop branches
-
-  unsigned MaxBranches, MaxOps;
-  if (PartialUnrollingThreshold.getNumOccurrences() > 0) {
-    MaxBranches = PartialUnrollingMaxBranches;
-    MaxOps = PartialUnrollingThreshold;
-  } else if (ST->isAtom()) {
-    // On the Atom, the throughput for taken branches is 2 cycles. For small
-    // simple loops, expand by a small factor to hide the backedge cost.
-    MaxBranches = 2;
-    MaxOps = 10;
-  } else if (ST->hasFSGSBase() && ST->hasXOP() /* Steamroller and later */) {
-    MaxBranches = 16;
-    MaxOps = 40;
-  } else if (ST->hasFMA4() /* Any other recent AMD */) {
-    return;
-  } else if (ST->hasAVX() || ST->hasSSE42() /* Nehalem and later */) {
-    MaxBranches = 8;
-    MaxOps = 28;
-  } else if (ST->hasSSSE3() /* Intel Core */) {
-    MaxBranches = 4;
-    MaxOps = 18;
-  } else {
-    return;
-  }
-
-  // Scan the loop: don't unroll loops with calls, and count the potential
-  // number of taken branches (this is somewhat conservative because we're
-  // counting all block transitions as potential branches while in reality some
-  // of these will become implicit via block placement).
-  unsigned MaxDepth = 0;
-  for (df_iterator<BasicBlock*> DI = df_begin(L->getHeader()),
-       DE = df_end(L->getHeader()); DI != DE;) {
-    if (!L->contains(*DI)) {
-      DI.skipChildren();
-      continue;
-    }
-
-    MaxDepth = std::max(MaxDepth, DI.getPathLength());
-    if (MaxDepth > MaxBranches)
-      return;
-
-    for (BasicBlock::iterator I = DI->begin(), IE = DI->end(); I != IE; ++I)
-      if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
-        ImmutableCallSite CS(I);
-        if (const Function *F = CS.getCalledFunction()) {
-          if (!isLoweredToCall(F))
-            continue;
-        }
-
-        return;
-      }
-
-    ++DI;
-  }
-
-  // Enable runtime and partial unrolling up to the specified size.
-  UP.Partial = UP.Runtime = true;
-  UP.PartialThreshold = UP.PartialOptSizeThreshold = MaxOps;
-
-  // Set the maximum count based on the loop depth. The maximum number of
-  // branches taken in a loop (including the backedge) is equal to the maximum
-  // loop depth (the DFS path length from the loop header to any block in the
-  // loop). When the loop is unrolled, this depth (except for the backedge
-  // itself) is multiplied by the unrolling factor. This new unrolled depth
-  // must be less than the target-specific maximum branch count (which limits
-  // the number of taken branches in the uop buffer).
-  if (MaxDepth > 1)
-    UP.MaxCount = (MaxBranches-1)/(MaxDepth-1);
-}
-
 unsigned X86TTI::getNumberOfRegisters(bool Vector) const {
   if (Vector && !ST->hasSSE1())
     return 0;
@@ -283,6 +181,21 @@ unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
   int ISD = TLI->InstructionOpcodeToISD(Opcode);
   assert(ISD && "Invalid opcode");
 
+  static const CostTblEntry<MVT::SimpleValueType>
+  AVX2UniformConstCostTable[] = {
+    { ISD::SDIV, MVT::v16i16,  6 }, // vpmulhw sequence
+    { ISD::UDIV, MVT::v16i16,  6 }, // vpmulhuw sequence
+    { ISD::SDIV, MVT::v8i32,  15 }, // vpmuldq sequence
+    { ISD::UDIV, MVT::v8i32,  15 }, // vpmuludq sequence
+  };
+
+  if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
+      ST->hasAVX2()) {
+    int Idx = CostTableLookup(AVX2UniformConstCostTable, ISD, LT.second);
+    if (Idx != -1)
+      return LT.first * AVX2UniformConstCostTable[Idx].Cost;
+  }
+
   static const CostTblEntry<MVT::SimpleValueType> AVX2CostTable[] = {
     // Shifts on v4i64/v8i32 on AVX2 is legal even though we declare to
     // customize them to detect the cases where shift amount is a scalar one.
@@ -350,10 +263,19 @@ unsigned X86TTI::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
     { ISD::SRA,  MVT::v16i8,  4 }, // psrlw, pand, pxor, psubb.
     { ISD::SRA,  MVT::v8i16,  1 }, // psraw.
     { ISD::SRA,  MVT::v4i32,  1 }, // psrad.
+
+    { ISD::SDIV, MVT::v8i16,  6 }, // pmulhw sequence
+    { ISD::UDIV, MVT::v8i16,  6 }, // pmulhuw sequence
+    { ISD::SDIV, MVT::v4i32, 19 }, // pmuludq sequence
+    { ISD::UDIV, MVT::v4i32, 15 }, // pmuludq sequence
   };
 
   if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
       ST->hasSSE2()) {
+    // pmuldq sequence.
+    if (ISD == ISD::SDIV && LT.second == MVT::v4i32 && ST->hasSSE41())
+      return LT.first * 15;
+
     int Idx = CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second);
     if (Idx != -1)
       return LT.first * SSE2UniformConstCostTable[Idx].Cost;
@@ -893,6 +815,13 @@ unsigned X86TTI::getIntImmCost(const APInt &Imm, Type *Ty) const {
   if (BitSize == 0)
     return ~0U;
 
+  // Never hoist constants larger than 128bit, because this might lead to
+  // incorrect code generation or assertions in codegen.
+  // Fixme: Create a cost model for types larger than i128 once the codegen
+  // issues have been fixed.
+  if (BitSize > 128)
+    return TCC_Free;
+
   if (Imm == 0)
     return TCC_Free;
 
@@ -908,8 +837,10 @@ unsigned X86TTI::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
   assert(Ty->isIntegerTy());
 
   unsigned BitSize = Ty->getPrimitiveSizeInBits();
+  // There is no cost model for constants with a bit size of 0. Return TCC_Free
+  // here, so that constant hoisting will ignore this constant.
   if (BitSize == 0)
-    return ~0U;
+    return TCC_Free;
 
   unsigned ImmIdx = ~0U;
   switch (Opcode) {
@@ -931,15 +862,19 @@ unsigned X86TTI::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
   case Instruction::SDiv:
   case Instruction::URem:
   case Instruction::SRem:
-  case Instruction::Shl:
-  case Instruction::LShr:
-  case Instruction::AShr:
   case Instruction::And:
   case Instruction::Or:
   case Instruction::Xor:
   case Instruction::ICmp:
     ImmIdx = 1;
     break;
+  // Always return TCC_Free for the shift value of a shift instruction.
+  case Instruction::Shl:
+  case Instruction::LShr:
+  case Instruction::AShr:
+    if (Idx == 1)
+      return TCC_Free;
+    break;
   case Instruction::Trunc:
   case Instruction::ZExt:
   case Instruction::SExt:
@@ -966,8 +901,10 @@ unsigned X86TTI::getIntImmCost(Intrinsic::ID IID, unsigned Idx,
   assert(Ty->isIntegerTy());
 
   unsigned BitSize = Ty->getPrimitiveSizeInBits();
+  // There is no cost model for constants with a bit size of 0. Return TCC_Free
+  // here, so that constant hoisting will ignore this constant.
   if (BitSize == 0)
-    return ~0U;
+    return TCC_Free;
 
   switch (IID) {
   default: return TCC_Free;
diff --git a/lib/Target/X86/X86VZeroUpper.cpp b/lib/Target/X86/X86VZeroUpper.cpp
index d4341b9..0bb5f99 100644
--- a/lib/Target/X86/X86VZeroUpper.cpp
+++ b/lib/Target/X86/X86VZeroUpper.cpp
@@ -14,7 +14,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "x86-vzeroupper"
 #include "X86.h"
 #include "X86InstrInfo.h"
 #include "X86Subtarget.h"
@@ -28,6 +27,8 @@
 #include "llvm/Target/TargetInstrInfo.h"
 using namespace llvm;
 
+#define DEBUG_TYPE "x86-vzeroupper"
+
 STATISTIC(NumVZU, "Number of vzeroupper instructions inserted");
 
 namespace {
@@ -246,7 +247,8 @@ void VZeroUpperInserter::processBasicBlock(MachineBasicBlock &MBB) {
 /// runOnMachineFunction - Loop over all of the basic blocks, inserting
 /// vzero upper instructions before function calls.
 bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) {
-  if (MF.getTarget().getSubtarget<X86Subtarget>().hasAVX512())
+  const X86Subtarget &ST = MF.getTarget().getSubtarget<X86Subtarget>();
+  if (!ST.hasAVX() || ST.hasAVX512())
     return false;
   TII = MF.getTarget().getInstrInfo();
   MachineRegisterInfo &MRI = MF.getRegInfo();
diff --git a/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp b/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp
index 9c20abd..7fef796 100644
--- a/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp
+++ b/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp
@@ -14,6 +14,7 @@
 
 #include "XCore.h"
 #include "XCoreRegisterInfo.h"
+#include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCDisassembler.h"
 #include "llvm/MC/MCFixedLenDisassembler.h"
 #include "llvm/MC/MCInst.h"
@@ -23,16 +24,17 @@
 
 using namespace llvm;
 
+#define DEBUG_TYPE "xcore-disassembler"
+
 typedef MCDisassembler::DecodeStatus DecodeStatus;
 
 namespace {
 
 /// \brief A disassembler class for XCore.
 class XCoreDisassembler : public MCDisassembler {
-  OwningPtr<const MCRegisterInfo> RegInfo;
 public:
-  XCoreDisassembler(const MCSubtargetInfo &STI, const MCRegisterInfo *Info) :
-    MCDisassembler(STI), RegInfo(Info) {}
+  XCoreDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx) :
+    MCDisassembler(STI, Ctx) {}
 
   /// \brief See MCDisassembler.
   virtual DecodeStatus getInstruction(MCInst &instr,
@@ -40,9 +42,8 @@ public:
                                       const MemoryObject &region,
                                       uint64_t address,
                                       raw_ostream &vStream,
-                                      raw_ostream &cStream) const;
+                                      raw_ostream &cStream) const override;
 
-  const MCRegisterInfo *getRegInfo() const { return RegInfo.get(); }
 };
 }
 
@@ -81,7 +82,8 @@ static bool readInstruction32(const MemoryObject &region,
 
 static unsigned getReg(const void *D, unsigned RC, unsigned RegNo) {
   const XCoreDisassembler *Dis = static_cast<const XCoreDisassembler*>(D);
-  return *(Dis->getRegInfo()->getRegClass(RC).begin() + RegNo);
+  const MCRegisterInfo *RegInfo = Dis->getContext().getRegisterInfo();
+  return *(RegInfo->getRegClass(RC).begin() + RegNo);
 }
 
 static DecodeStatus DecodeGRRegsRegisterClass(MCInst &Inst,
@@ -788,8 +790,9 @@ namespace llvm {
 }
 
 static MCDisassembler *createXCoreDisassembler(const Target &T,
-                                               const MCSubtargetInfo &STI) {
-  return new XCoreDisassembler(STI, T.createMCRegInfo(""));
+                                               const MCSubtargetInfo &STI,
+                                               MCContext &Ctx) {
+  return new XCoreDisassembler(STI, Ctx);
 }
 
 extern "C" void LLVMInitializeXCoreDisassembler() {
diff --git a/lib/Target/XCore/InstPrinter/XCoreInstPrinter.cpp b/lib/Target/XCore/InstPrinter/XCoreInstPrinter.cpp
index 9ae8c0d..215fe89 100644
--- a/lib/Target/XCore/InstPrinter/XCoreInstPrinter.cpp
+++ b/lib/Target/XCore/InstPrinter/XCoreInstPrinter.cpp
@@ -11,7 +11,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "asm-printer"
 #include "XCoreInstPrinter.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/MC/MCExpr.h"
@@ -22,6 +21,8 @@
 #include "llvm/Support/raw_ostream.h"
 using namespace llvm;
 
+#define DEBUG_TYPE "asm-printer"
+
 #include "XCoreGenAsmWriter.inc"
 
 void XCoreInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
diff --git a/lib/Target/XCore/InstPrinter/XCoreInstPrinter.h b/lib/Target/XCore/InstPrinter/XCoreInstPrinter.h
index 772c515..98e7c98 100644
--- a/lib/Target/XCore/InstPrinter/XCoreInstPrinter.h
+++ b/lib/Target/XCore/InstPrinter/XCoreInstPrinter.h
@@ -31,8 +31,8 @@ public:
   void printInstruction(const MCInst *MI, raw_ostream &O);
   static const char *getRegisterName(unsigned RegNo);
 
-  virtual void printRegName(raw_ostream &OS, unsigned RegNo) const;
-  virtual void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot);
+  void printRegName(raw_ostream &OS, unsigned RegNo) const override;
+  void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot) override;
 private:
   void printInlineJT(const MCInst *MI, int opNum, raw_ostream &O);
   void printInlineJT32(const MCInst *MI, int opNum, raw_ostream &O);
diff --git a/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.cpp b/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.cpp
index f788c59..5665911 100644
--- a/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.cpp
+++ b/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.cpp
@@ -17,7 +17,7 @@ XCoreMCAsmInfo::XCoreMCAsmInfo(StringRef TT) {
   SupportsDebugInformation = true;
   Data16bitsDirective = "\t.short\t";
   Data32bitsDirective = "\t.long\t";
-  Data64bitsDirective = 0;
+  Data64bitsDirective = nullptr;
   ZeroDirective = "\t.space\t";
   CommentString = "#";
 
diff --git a/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.h b/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.h
index e53c96b..da2689a 100644
--- a/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.h
+++ b/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.h
@@ -21,7 +21,7 @@ namespace llvm {
   class Target;
 
   class XCoreMCAsmInfo : public MCAsmInfoELF {
-    virtual void anchor();
+    void anchor() override;
   public:
     explicit XCoreMCAsmInfo(StringRef TT);
   };
diff --git a/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp b/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp
index 439d0ab..d54e94f 100644
--- a/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp
+++ b/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp
@@ -23,6 +23,8 @@
 #include "llvm/Support/FormattedStream.h"
 #include "llvm/Support/TargetRegistry.h"
 
+using namespace llvm;
+
 #define GET_INSTRINFO_MC_DESC
 #include "XCoreGenInstrInfo.inc"
 
@@ -32,8 +34,6 @@
 #define GET_REGINFO_MC_DESC
 #include "XCoreGenRegisterInfo.inc"
 
-using namespace llvm;
-
 static MCInstrInfo *createXCoreMCInstrInfo() {
   MCInstrInfo *X = new MCInstrInfo();
   InitXCoreMCInstrInfo(X);
@@ -58,7 +58,7 @@ static MCAsmInfo *createXCoreMCAsmInfo(const MCRegisterInfo &MRI,
   MCAsmInfo *MAI = new XCoreMCAsmInfo(TT);
 
   // Initial state of the frame pointer is SP.
-  MCCFIInstruction Inst = MCCFIInstruction::createDefCfa(0, XCore::SP, 0);
+  MCCFIInstruction Inst = MCCFIInstruction::createDefCfa(nullptr, XCore::SP, 0);
   MAI->addInitialFrameState(Inst);
 
   return MAI;
@@ -128,12 +128,11 @@ void XCoreTargetAsmStreamer::emitCCBottomFunction(StringRef Name) {
 
 static MCStreamer *
 createXCoreMCAsmStreamer(MCContext &Ctx, formatted_raw_ostream &OS,
-                         bool isVerboseAsm, bool useCFI, bool useDwarfDirectory,
+                         bool isVerboseAsm, bool useDwarfDirectory,
                          MCInstPrinter *InstPrint, MCCodeEmitter *CE,
                          MCAsmBackend *TAB, bool ShowInst) {
-  MCStreamer *S =
-      llvm::createAsmStreamer(Ctx, OS, isVerboseAsm, useCFI, useDwarfDirectory,
-                              InstPrint, CE, TAB, ShowInst);
+  MCStreamer *S = llvm::createAsmStreamer(
+      Ctx, OS, isVerboseAsm, useDwarfDirectory, InstPrint, CE, TAB, ShowInst);
   new XCoreTargetAsmStreamer(*S, OS);
   return S;
 }
diff --git a/lib/Target/XCore/XCoreAsmPrinter.cpp b/lib/Target/XCore/XCoreAsmPrinter.cpp
index 21acedf..e98d4f9 100644
--- a/lib/Target/XCore/XCoreAsmPrinter.cpp
+++ b/lib/Target/XCore/XCoreAsmPrinter.cpp
@@ -12,7 +12,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "asm-printer"
 #include "XCore.h"
 #include "InstPrinter/XCoreInstPrinter.h"
 #include "XCoreInstrInfo.h"
@@ -47,6 +46,8 @@
 #include <cctype>
 using namespace llvm;
 
+#define DEBUG_TYPE "asm-printer"
+
 namespace {
   class XCoreAsmPrinter : public AsmPrinter {
     const XCoreSubtarget &Subtarget;
@@ -58,7 +59,7 @@ namespace {
       : AsmPrinter(TM, Streamer), Subtarget(TM.getSubtarget<XCoreSubtarget>()),
         MCInstLowering(*this) {}
 
-    virtual const char *getPassName() const {
+    const char *getPassName() const override {
       return "XCore Assembly Printer";
     }
 
@@ -70,18 +71,18 @@ namespace {
     void printOperand(const MachineInstr *MI, int opNum, raw_ostream &O);
     bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
                          unsigned AsmVariant, const char *ExtraCode,
-                         raw_ostream &O);
+                         raw_ostream &O) override;
     bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNum,
                                unsigned AsmVariant, const char *ExtraCode,
                                raw_ostream &O) override;
 
     void emitArrayBound(MCSymbol *Sym, const GlobalVariable *GV);
-    virtual void EmitGlobalVariable(const GlobalVariable *GV);
+    void EmitGlobalVariable(const GlobalVariable *GV) override;
 
-    void EmitFunctionEntryLabel();
-    void EmitInstruction(const MachineInstr *MI);
-    void EmitFunctionBodyStart();
-    void EmitFunctionBodyEnd();
+    void EmitFunctionEntryLabel() override;
+    void EmitInstruction(const MachineInstr *MI) override;
+    void EmitFunctionBodyStart() override;
+    void EmitFunctionBodyEnd() override;
   };
 } // end of anonymous namespace
 
diff --git a/lib/Target/XCore/XCoreFrameLowering.cpp b/lib/Target/XCore/XCoreFrameLowering.cpp
index 954fddf..5499aba 100644
--- a/lib/Target/XCore/XCoreFrameLowering.cpp
+++ b/lib/Target/XCore/XCoreFrameLowering.cpp
@@ -64,7 +64,8 @@ static void EmitDefCfaRegister(MachineBasicBlock &MBB,
                                MachineModuleInfo *MMI, unsigned DRegNum) {
   unsigned CFIIndex = MMI->addFrameInst(
       MCCFIInstruction::createDefCfaRegister(nullptr, DRegNum));
-  BuildMI(MBB, MBBI, dl, TII.get(XCore::CFI_INSTRUCTION)).addCFIIndex(CFIIndex);
+  BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+      .addCFIIndex(CFIIndex);
 }
 
 static void EmitDefCfaOffset(MachineBasicBlock &MBB,
@@ -73,7 +74,8 @@ static void EmitDefCfaOffset(MachineBasicBlock &MBB,
                              MachineModuleInfo *MMI, int Offset) {
   unsigned CFIIndex =
       MMI->addFrameInst(MCCFIInstruction::createDefCfaOffset(nullptr, -Offset));
-  BuildMI(MBB, MBBI, dl, TII.get(XCore::CFI_INSTRUCTION)).addCFIIndex(CFIIndex);
+  BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+      .addCFIIndex(CFIIndex);
 }
 
 static void EmitCfiOffset(MachineBasicBlock &MBB,
@@ -82,7 +84,8 @@ static void EmitCfiOffset(MachineBasicBlock &MBB,
                           unsigned DRegNum, int Offset) {
   unsigned CFIIndex = MMI->addFrameInst(
       MCCFIInstruction::createOffset(nullptr, DRegNum, Offset));
-  BuildMI(MBB, MBBI, dl, TII.get(XCore::CFI_INSTRUCTION)).addCFIIndex(CFIIndex);
+  BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
+      .addCFIIndex(CFIIndex);
 }
 
 /// The SP register is moved in steps of 'MaxImmU16' towards the bottom of the
@@ -113,7 +116,8 @@ static void IfNeededExtSP(MachineBasicBlock &MBB,
 /// IfNeededLDAWSP emits the necessary LDAWSP instructions to move the SP only
 /// as far as to make 'OffsetFromTop' reachable using an LDAWSP_lru6.
 /// \param OffsetFromTop the spill offset from the top of the frame.
-/// \param [in,out] RemainingAdj the current SP offset from the top of the frame.
+/// \param [in,out] RemainingAdj the current SP offset from the top of the
+/// frame.
 static void IfNeededLDAWSP(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator MBBI, DebugLoc dl,
                            const TargetInstrInfo &TII, int OffsetFromTop,
@@ -346,7 +350,8 @@ void XCoreFrameLowering::emitEpilogue(MachineFunction &MF,
   RemainingAdj /= 4;
 
   if (RetOpcode == XCore::EH_RETURN) {
-    // 'Restore' the exception info the unwinder has placed into the stack slots.
+    // 'Restore' the exception info the unwinder has placed into the stack
+    // slots.
     SmallVector<StackSlotInfo,2> SpillList;
     GetEHSpillList(SpillList, MFI, XFI, MF.getTarget().getTargetLowering());
     RestoreSpillList(MBB, MBBI, dl, TII, RemainingAdj, SpillList);
@@ -495,7 +500,7 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
         errs() << "eliminateCallFramePseudoInstr size too big: "
                << Amount << "\n";
 #endif
-        llvm_unreachable(0);
+        llvm_unreachable(nullptr);
       }
 
       MachineInstr *New;
@@ -514,7 +519,7 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
       MBB.insert(I, New);
     }
   }
-  
+
   MBB.erase(I);
 }
 
diff --git a/lib/Target/XCore/XCoreFrameLowering.h b/lib/Target/XCore/XCoreFrameLowering.h
index 6cd90c9..e4f806a 100644
--- a/lib/Target/XCore/XCoreFrameLowering.h
+++ b/lib/Target/XCore/XCoreFrameLowering.h
@@ -27,29 +27,30 @@ namespace llvm {
 
     /// emitProlog/emitEpilog - These methods insert prolog and epilog code into
     /// the function.
-    void emitPrologue(MachineFunction &MF) const;
-    void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const;
+    void emitPrologue(MachineFunction &MF) const override;
+    void emitEpilogue(MachineFunction &MF,
+                      MachineBasicBlock &MBB) const override;
 
     bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
-                                   MachineBasicBlock::iterator MI,
-                                   const std::vector<CalleeSavedInfo> &CSI,
-                                   const TargetRegisterInfo *TRI) const;
+                                  MachineBasicBlock::iterator MI,
+                                  const std::vector<CalleeSavedInfo> &CSI,
+                                  const TargetRegisterInfo *TRI) const override;
     bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
-                                     MachineBasicBlock::iterator MI,
-                                     const std::vector<CalleeSavedInfo> &CSI,
-                                     const TargetRegisterInfo *TRI) const;
+                                  MachineBasicBlock::iterator MI,
+                                  const std::vector<CalleeSavedInfo> &CSI,
+                                  const TargetRegisterInfo *TRI) const override;
 
     void eliminateCallFramePseudoInstr(MachineFunction &MF,
-                                       MachineBasicBlock &MBB,
-                                       MachineBasicBlock::iterator I) const;
+                                  MachineBasicBlock &MBB,
+                                  MachineBasicBlock::iterator I) const override;
 
-    bool hasFP(const MachineFunction &MF) const;
+    bool hasFP(const MachineFunction &MF) const override;
 
     void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
-                                              RegScavenger *RS = NULL) const;
+                                     RegScavenger *RS = nullptr) const override;
 
     void processFunctionBeforeFrameFinalized(MachineFunction &MF,
-                                             RegScavenger *RS = NULL) const;
+                                     RegScavenger *RS = nullptr) const override;
 
     //! Stack slot size (4 bytes)
     static int stackSlotSize() {
diff --git a/lib/Target/XCore/XCoreFrameToArgsOffsetElim.cpp b/lib/Target/XCore/XCoreFrameToArgsOffsetElim.cpp
index c18eff9..30c7b59 100644
--- a/lib/Target/XCore/XCoreFrameToArgsOffsetElim.cpp
+++ b/lib/Target/XCore/XCoreFrameToArgsOffsetElim.cpp
@@ -26,9 +26,9 @@ namespace {
     static char ID;
     XCoreFTAOElim() : MachineFunctionPass(ID) {}
 
-    virtual bool runOnMachineFunction(MachineFunction &Fn);
+    bool runOnMachineFunction(MachineFunction &Fn) override;
 
-    virtual const char *getPassName() const {
+    const char *getPassName() const override {
       return "XCore FRAME_TO_ARGS_OFFSET Elimination";
     }
   };
diff --git a/lib/Target/XCore/XCoreISelDAGToDAG.cpp b/lib/Target/XCore/XCoreISelDAGToDAG.cpp
index 5b0fcfa..86bc6f2 100644
--- a/lib/Target/XCore/XCoreISelDAGToDAG.cpp
+++ b/lib/Target/XCore/XCoreISelDAGToDAG.cpp
@@ -44,7 +44,7 @@ namespace {
       : SelectionDAGISel(TM, OptLevel),
         Subtarget(*TM.getSubtargetImpl()) { }
 
-    SDNode *Select(SDNode *N);
+    SDNode *Select(SDNode *N) override;
     SDNode *SelectBRIND(SDNode *N);
 
     /// getI32Imm - Return a target constant with the specified value, of type
@@ -70,7 +70,7 @@ namespace {
     bool SelectInlineAsmMemoryOperand(const SDValue &Op, char ConstraintCode,
                                       std::vector<SDValue> &OutOps) override;
 
-    virtual const char *getPassName() const {
+    const char *getPassName() const override {
       return "XCore DAG->DAG Pattern Instruction Selection";
     } 
     
@@ -89,14 +89,14 @@ FunctionPass *llvm::createXCoreISelDag(XCoreTargetMachine &TM,
 
 bool XCoreDAGToDAGISel::SelectADDRspii(SDValue Addr, SDValue &Base,
                                        SDValue &Offset) {
-  FrameIndexSDNode *FIN = 0;
+  FrameIndexSDNode *FIN = nullptr;
   if ((FIN = dyn_cast<FrameIndexSDNode>(Addr))) {
     Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32);
     Offset = CurDAG->getTargetConstant(0, MVT::i32);
     return true;
   }
   if (Addr.getOpcode() == ISD::ADD) {
-    ConstantSDNode *CN = 0;
+    ConstantSDNode *CN = nullptr;
     if ((FIN = dyn_cast<FrameIndexSDNode>(Addr.getOperand(0)))
       && (CN = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))
       && (CN->getSExtValue() % 4 == 0 && CN->getSExtValue() >= 0)) {
@@ -227,8 +227,7 @@ replaceInChain(SelectionDAG *CurDAG, SDValue Chain, SDValue Old, SDValue New)
   }
   if (!found)
     return SDValue();
-  return CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other,
-                         &Ops[0], Ops.size());
+  return CurDAG->getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, Ops);
 }
 
 SDNode *XCoreDAGToDAGISel::SelectBRIND(SDNode *N) {
@@ -237,10 +236,10 @@ SDNode *XCoreDAGToDAGISel::SelectBRIND(SDNode *N) {
   SDValue Chain = N->getOperand(0);
   SDValue Addr = N->getOperand(1);
   if (Addr->getOpcode() != ISD::INTRINSIC_W_CHAIN)
-    return 0;
+    return nullptr;
   unsigned IntNo = cast<ConstantSDNode>(Addr->getOperand(1))->getZExtValue();
   if (IntNo != Intrinsic::xcore_checkevent)
-    return 0;
+    return nullptr;
   SDValue nextAddr = Addr->getOperand(2);
   SDValue CheckEventChainOut(Addr.getNode(), 1);
   if (!CheckEventChainOut.use_empty()) {
@@ -252,7 +251,7 @@ SDNode *XCoreDAGToDAGISel::SelectBRIND(SDNode *N) {
     SDValue NewChain = replaceInChain(CurDAG, Chain, CheckEventChainOut,
                                       CheckEventChainIn);
     if (!NewChain.getNode())
-      return 0;
+      return nullptr;
     Chain = NewChain;
   }
   // Enable events on the thread using setsr 1 and then disable them immediately
diff --git a/lib/Target/XCore/XCoreISelLowering.cpp b/lib/Target/XCore/XCoreISelLowering.cpp
index 1b74013..9d78586 100644
--- a/lib/Target/XCore/XCoreISelLowering.cpp
+++ b/lib/Target/XCore/XCoreISelLowering.cpp
@@ -11,8 +11,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "xcore-lower"
-
 #include "XCoreISelLowering.h"
 #include "XCore.h"
 #include "XCoreMachineFunctionInfo.h"
@@ -41,6 +39,8 @@
 
 using namespace llvm;
 
+#define DEBUG_TYPE "xcore-lower"
+
 const char *XCoreTargetLowering::
 getTargetNodeName(unsigned Opcode) const
 {
@@ -64,7 +64,7 @@ getTargetNodeName(unsigned Opcode) const
     case XCoreISD::FRAME_TO_ARGS_OFFSET : return "XCoreISD::FRAME_TO_ARGS_OFFSET";
     case XCoreISD::EH_RETURN         : return "XCoreISD::EH_RETURN";
     case XCoreISD::MEMBARRIER        : return "XCoreISD::MEMBARRIER";
-    default                          : return NULL;
+    default                          : return nullptr;
   }
 }
 
@@ -268,21 +268,19 @@ LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const
                      Op.getOperand(1));
 }
 
-SDValue XCoreTargetLowering::
-getGlobalAddressWrapper(SDValue GA, const GlobalValue *GV,
-                        SelectionDAG &DAG) const
-{
+SDValue XCoreTargetLowering::getGlobalAddressWrapper(SDValue GA,
+                                                     const GlobalValue *GV,
+                                                     SelectionDAG &DAG) const {
   // FIXME there is no actual debug info here
   SDLoc dl(GA);
   const GlobalValue *UnderlyingGV = GV;
   // If GV is an alias then use the aliasee to determine the wrapper type
   if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
-    UnderlyingGV = GA->getAliasedGlobal();
+    UnderlyingGV = GA->getAliasee();
   if (const GlobalVariable *GVar = dyn_cast<GlobalVariable>(UnderlyingGV)) {
-    if (  ( GVar->isConstant() &&
-            UnderlyingGV->isLocalLinkage(GV->getLinkage()) )
-       || ( GVar->hasSection() &&
-            StringRef(GVar->getSection()).startswith(".cp.") ) )
+    if ((GVar->isConstant() && GV->hasLocalLinkage()) ||
+        (GVar->hasSection() &&
+         StringRef(GVar->getSection()).startswith(".cp.")))
       return DAG.getNode(XCoreISD::CPRelativeWrapper, dl, MVT::i32, GA);
     return DAG.getNode(XCoreISD::DPRelativeWrapper, dl, MVT::i32, GA);
   }
@@ -428,13 +426,13 @@ lowerLoadWordFromAlignedBasePlusOffset(SDLoc DL, SDValue Chain, SDValue Base,
   Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Low.getValue(1),
                       High.getValue(1));
   SDValue Ops[] = { Result, Chain };
-  return DAG.getMergeValues(Ops, 2, DL);
+  return DAG.getMergeValues(Ops, DL);
 }
 
 static bool isWordAligned(SDValue Value, SelectionDAG &DAG)
 {
   APInt KnownZero, KnownOne;
-  DAG.ComputeMaskedBits(Value, KnownZero, KnownOne);
+  DAG.computeKnownBits(Value, KnownZero, KnownOne);
   return KnownZero.countTrailingOnes() >= 2;
 }
 
@@ -494,7 +492,7 @@ LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
     Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Low.getValue(1),
                              High.getValue(1));
     SDValue Ops[] = { Result, Chain };
-    return DAG.getMergeValues(Ops, 2, DL);
+    return DAG.getMergeValues(Ops, DL);
   }
 
   // Lower to a call to __misaligned_load(BasePtr).
@@ -506,17 +504,15 @@ LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
   Entry.Node = BasePtr;
   Args.push_back(Entry);
 
-  TargetLowering::CallLoweringInfo CLI(Chain, IntPtrTy, false, false,
-                    false, false, 0, CallingConv::C, /*isTailCall=*/false,
-                    /*doesNotRet=*/false, /*isReturnValueUsed=*/true,
-                    DAG.getExternalSymbol("__misaligned_load", getPointerTy()),
-                    Args, DAG, DL);
-  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
-
-  SDValue Ops[] =
-    { CallResult.first, CallResult.second };
+  TargetLowering::CallLoweringInfo CLI(DAG);
+  CLI.setDebugLoc(DL).setChain(Chain)
+    .setCallee(CallingConv::C, IntPtrTy,
+               DAG.getExternalSymbol("__misaligned_load", getPointerTy()),
+               &Args, 0);
 
-  return DAG.getMergeValues(Ops, 2, DL);
+  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
+  SDValue Ops[] = { CallResult.first, CallResult.second };
+  return DAG.getMergeValues(Ops, DL);
 }
 
 SDValue XCoreTargetLowering::
@@ -568,14 +564,13 @@ LowerSTORE(SDValue Op, SelectionDAG &DAG) const
   Entry.Node = Value;
   Args.push_back(Entry);
 
-  TargetLowering::CallLoweringInfo CLI(Chain,
-                    Type::getVoidTy(*DAG.getContext()), false, false,
-                    false, false, 0, CallingConv::C, /*isTailCall=*/false,
-                    /*doesNotRet=*/false, /*isReturnValueUsed=*/true,
-                    DAG.getExternalSymbol("__misaligned_store", getPointerTy()),
-                    Args, DAG, dl);
-  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
+  TargetLowering::CallLoweringInfo CLI(DAG);
+  CLI.setDebugLoc(dl).setChain(Chain)
+    .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()),
+               DAG.getExternalSymbol("__misaligned_store", getPointerTy()),
+               &Args, 0);
 
+  std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
   return CallResult.second;
 }
 
@@ -593,7 +588,7 @@ LowerSMUL_LOHI(SDValue Op, SelectionDAG &DAG) const
                            LHS, RHS);
   SDValue Lo(Hi.getNode(), 1);
   SDValue Ops[] = { Lo, Hi };
-  return DAG.getMergeValues(Ops, 2, dl);
+  return DAG.getMergeValues(Ops, dl);
 }
 
 SDValue XCoreTargetLowering::
@@ -610,7 +605,7 @@ LowerUMUL_LOHI(SDValue Op, SelectionDAG &DAG) const
                            Zero, Zero);
   SDValue Lo(Hi.getNode(), 1);
   SDValue Ops[] = { Lo, Hi };
-  return DAG.getMergeValues(Ops, 2, dl);
+  return DAG.getMergeValues(Ops, dl);
 }
 
 /// isADDADDMUL - Return whether Op is in a form that is equivalent to
@@ -741,7 +736,7 @@ ExpandADDSUB(SDNode *N, SelectionDAG &DAG) const
 
   if (N->getOpcode() == ISD::ADD) {
     SDValue Result = TryExpandADDWithMul(N, DAG);
-    if (Result.getNode() != 0)
+    if (Result.getNode())
       return Result;
   }
 
@@ -886,7 +881,7 @@ LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
     DAG.getCopyToReg(Chain, dl, HandlerReg, Handler)
   };
 
-  Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 2);
+  Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
 
   return DAG.getNode(XCoreISD::EH_RETURN, dl, MVT::Other, Chain,
                      DAG.getRegister(StackReg, MVT::i32),
@@ -952,7 +947,7 @@ LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const {
                               MachinePointerInfo(TrmpAddr, 16), false, false,
                               0);
 
-  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains, 5);
+  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
 }
 
 SDValue XCoreTargetLowering::
@@ -967,7 +962,7 @@ LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const {
                     Op.getOperand(1), Op.getOperand(2) , Op.getOperand(3));
       SDValue Crc(Data.getNode(), 1);
       SDValue Results[] = { Crc, Data };
-      return DAG.getMergeValues(Results, 2, DL);
+      return DAG.getMergeValues(Results, DL);
   }
   return SDValue();
 }
@@ -1111,7 +1106,7 @@ LowerCallResult(SDValue Chain, SDValue InFlag,
     unsigned index = ResultMemLocs[i].second;
     SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
     SDValue Ops[] = { Chain, DAG.getConstant(offset / 4, MVT::i32) };
-    SDValue load = DAG.getNode(XCoreISD::LDWSP, dl, VTs, Ops, 2);
+    SDValue load = DAG.getNode(XCoreISD::LDWSP, dl, VTs, Ops);
     InVals[index] = load;
     MemOpChains.push_back(load.getValue(1));
   }
@@ -1119,8 +1114,7 @@ LowerCallResult(SDValue Chain, SDValue InFlag,
   // Transform all loads nodes into one single node because
   // all load nodes are independent of each other.
   if (!MemOpChains.empty())
-    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
-                        &MemOpChains[0], MemOpChains.size());
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
 
   return Chain;
 }
@@ -1204,8 +1198,7 @@ XCoreTargetLowering::LowerCCCCallTo(SDValue Chain, SDValue Callee,
   // Transform all store nodes into one single node because
   // all store nodes are independent of each other.
   if (!MemOpChains.empty())
-    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
-                        &MemOpChains[0], MemOpChains.size());
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
 
   // Build a sequence of copy-to-reg nodes chained together with token
   // chain and flag operands which copy the outgoing args into registers.
@@ -1244,7 +1237,7 @@ XCoreTargetLowering::LowerCCCCallTo(SDValue Chain, SDValue Callee,
   if (InFlag.getNode())
     Ops.push_back(InFlag);
 
-  Chain  = DAG.getNode(XCoreISD::BL, dl, NodeTys, &Ops[0], Ops.size());
+  Chain  = DAG.getNode(XCoreISD::BL, dl, NodeTys, Ops);
   InFlag = Chain.getValue(1);
 
   // Create the CALLSEQ_END node.
@@ -1347,7 +1340,7 @@ XCoreTargetLowering::LowerCCCArguments(SDValue Chain,
           errs() << "LowerFormalArguments Unhandled argument type: "
                  << RegVT.getSimpleVT().SimpleTy << "\n";
 #endif
-          llvm_unreachable(0);
+          llvm_unreachable(nullptr);
         }
       case MVT::i32:
         unsigned VReg = RegInfo.createVirtualRegister(&XCore::GRRegsRegClass);
@@ -1384,7 +1377,7 @@ XCoreTargetLowering::LowerCCCArguments(SDValue Chain,
   // 1b. CopyFromReg vararg registers.
   if (isVarArg) {
     // Argument registers
-    static const uint16_t ArgRegs[] = {
+    static const MCPhysReg ArgRegs[] = {
       XCore::R0, XCore::R1, XCore::R2, XCore::R3
     };
     XCoreFunctionInfo *XFI = MF.getInfo<XCoreFunctionInfo>();
@@ -1422,8 +1415,7 @@ XCoreTargetLowering::LowerCCCArguments(SDValue Chain,
 
   // 2. chain CopyFromReg nodes into a TokenFactor.
   if (!CFRegNode.empty())
-    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &CFRegNode[0],
-                        CFRegNode.size());
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, CFRegNode);
 
   // 3. Memcpy 'byVal' args & push final InVals.
   // Aggregates passed "byVal" need to be copied by the callee.
@@ -1452,8 +1444,7 @@ XCoreTargetLowering::LowerCCCArguments(SDValue Chain,
   // 4, chain mem ops nodes into a TokenFactor.
   if (!MemOps.empty()) {
     MemOps.push_back(Chain);
-    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, &MemOps[0],
-                        MemOps.size());
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
   }
 
   return Chain;
@@ -1535,8 +1526,7 @@ XCoreTargetLowering::LowerReturn(SDValue Chain,
   // Transform all store nodes into one single node because
   // all stores are independent of each other.
   if (!MemOpChains.empty())
-    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
-                        &MemOpChains[0], MemOpChains.size());
+    Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
 
   // Now handle return values copied to registers.
   for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
@@ -1558,8 +1548,7 @@ XCoreTargetLowering::LowerReturn(SDValue Chain,
   if (Flag.getNode())
     RetOps.push_back(Flag);
 
-  return DAG.getNode(XCoreISD::RETSP, dl, MVT::Other,
-                     &RetOps[0], RetOps.size());
+  return DAG.getNode(XCoreISD::RETSP, dl, MVT::Other, RetOps);
 }
 
 //===----------------------------------------------------------------------===//
@@ -1696,7 +1685,7 @@ SDValue XCoreTargetLowering::PerformDAGCombine(SDNode *N,
       SDValue Result = DAG.getNode(ISD::AND, dl, VT, N2,
                                    DAG.getConstant(1, VT));
       SDValue Ops[] = { Result, Carry };
-      return DAG.getMergeValues(Ops, 2, dl);
+      return DAG.getMergeValues(Ops, dl);
     }
 
     // fold (ladd x, 0, y) -> 0, add x, y iff carry is unused and y has only the
@@ -1705,12 +1694,12 @@ SDValue XCoreTargetLowering::PerformDAGCombine(SDNode *N,
       APInt KnownZero, KnownOne;
       APInt Mask = APInt::getHighBitsSet(VT.getSizeInBits(),
                                          VT.getSizeInBits() - 1);
-      DAG.ComputeMaskedBits(N2, KnownZero, KnownOne);
+      DAG.computeKnownBits(N2, KnownZero, KnownOne);
       if ((KnownZero & Mask) == Mask) {
         SDValue Carry = DAG.getConstant(0, VT);
         SDValue Result = DAG.getNode(ISD::ADD, dl, VT, N0, N2);
         SDValue Ops[] = { Result, Carry };
-        return DAG.getMergeValues(Ops, 2, dl);
+        return DAG.getMergeValues(Ops, dl);
       }
     }
   }
@@ -1728,13 +1717,13 @@ SDValue XCoreTargetLowering::PerformDAGCombine(SDNode *N,
       APInt KnownZero, KnownOne;
       APInt Mask = APInt::getHighBitsSet(VT.getSizeInBits(),
                                          VT.getSizeInBits() - 1);
-      DAG.ComputeMaskedBits(N2, KnownZero, KnownOne);
+      DAG.computeKnownBits(N2, KnownZero, KnownOne);
       if ((KnownZero & Mask) == Mask) {
         SDValue Borrow = N2;
         SDValue Result = DAG.getNode(ISD::SUB, dl, VT,
                                      DAG.getConstant(0, VT), N2);
         SDValue Ops[] = { Result, Borrow };
-        return DAG.getMergeValues(Ops, 2, dl);
+        return DAG.getMergeValues(Ops, dl);
       }
     }
 
@@ -1744,12 +1733,12 @@ SDValue XCoreTargetLowering::PerformDAGCombine(SDNode *N,
       APInt KnownZero, KnownOne;
       APInt Mask = APInt::getHighBitsSet(VT.getSizeInBits(),
                                          VT.getSizeInBits() - 1);
-      DAG.ComputeMaskedBits(N2, KnownZero, KnownOne);
+      DAG.computeKnownBits(N2, KnownZero, KnownOne);
       if ((KnownZero & Mask) == Mask) {
         SDValue Borrow = DAG.getConstant(0, VT);
         SDValue Result = DAG.getNode(ISD::SUB, dl, VT, N0, N2);
         SDValue Ops[] = { Result, Borrow };
-        return DAG.getMergeValues(Ops, 2, dl);
+        return DAG.getMergeValues(Ops, dl);
       }
     }
   }
@@ -1775,14 +1764,14 @@ SDValue XCoreTargetLowering::PerformDAGCombine(SDNode *N,
       if (N->hasNUsesOfValue(0, 0)) {
         SDValue Lo = DAG.getNode(ISD::ADD, dl, VT, N2, N3);
         SDValue Ops[] = { Lo, Lo };
-        return DAG.getMergeValues(Ops, 2, dl);
+        return DAG.getMergeValues(Ops, dl);
       }
       // Otherwise fold to ladd(a, b, 0)
       SDValue Result =
         DAG.getNode(XCoreISD::LADD, dl, DAG.getVTList(VT, VT), N2, N3, N1);
       SDValue Carry(Result.getNode(), 1);
       SDValue Ops[] = { Carry, Result };
-      return DAG.getMergeValues(Ops, 2, dl);
+      return DAG.getMergeValues(Ops, dl);
     }
   }
   break;
@@ -1866,11 +1855,11 @@ SDValue XCoreTargetLowering::PerformDAGCombine(SDNode *N,
   return SDValue();
 }
 
-void XCoreTargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
-                                                         APInt &KnownZero,
-                                                         APInt &KnownOne,
-                                                         const SelectionDAG &DAG,
-                                                         unsigned Depth) const {
+void XCoreTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
+                                                        APInt &KnownZero,
+                                                        APInt &KnownOne,
+                                                        const SelectionDAG &DAG,
+                                                        unsigned Depth) const {
   KnownZero = KnownOne = APInt(KnownZero.getBitWidth(), 0);
   switch (Op.getOpcode()) {
   default: break;
diff --git a/lib/Target/XCore/XCoreISelLowering.h b/lib/Target/XCore/XCoreISelLowering.h
index 65e2bad..d28715b 100644
--- a/lib/Target/XCore/XCoreISelLowering.h
+++ b/lib/Target/XCore/XCoreISelLowering.h
@@ -97,31 +97,30 @@ namespace llvm {
     explicit XCoreTargetLowering(XCoreTargetMachine &TM);
 
     using TargetLowering::isZExtFree;
-    virtual bool isZExtFree(SDValue Val, EVT VT2) const;
+    bool isZExtFree(SDValue Val, EVT VT2) const override;
 
 
-    virtual unsigned getJumpTableEncoding() const;
-    virtual MVT getScalarShiftAmountTy(EVT LHSTy) const { return MVT::i32; }
+    unsigned getJumpTableEncoding() const override;
+    MVT getScalarShiftAmountTy(EVT LHSTy) const override { return MVT::i32; }
 
     /// LowerOperation - Provide custom lowering hooks for some operations.
-    virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
 
     /// ReplaceNodeResults - Replace the results of node with an illegal result
     /// type with new values built out of custom code.
     ///
-    virtual void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue>&Results,
-                                    SelectionDAG &DAG) const;
+    void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue>&Results,
+                            SelectionDAG &DAG) const override;
 
     /// getTargetNodeName - This method returns the name of a target specific
     //  DAG node.
-    virtual const char *getTargetNodeName(unsigned Opcode) const;
+    const char *getTargetNodeName(unsigned Opcode) const override;
 
-    virtual MachineBasicBlock *
+    MachineBasicBlock *
       EmitInstrWithCustomInserter(MachineInstr *MI,
-                                  MachineBasicBlock *MBB) const;
+                                  MachineBasicBlock *MBB) const override;
 
-    virtual bool isLegalAddressingMode(const AddrMode &AM,
-                                       Type *Ty) const;
+    bool isLegalAddressingMode(const AddrMode &AM, Type *Ty) const override;
 
   private:
     const XCoreTargetMachine &TM;
@@ -176,44 +175,44 @@ namespace llvm {
     // Inline asm support
     std::pair<unsigned, const TargetRegisterClass*>
     getRegForInlineAsmConstraint(const std::string &Constraint,
-                                 MVT VT) const;
+                                 MVT VT) const override;
 
     // Expand specifics
     SDValue TryExpandADDWithMul(SDNode *Op, SelectionDAG &DAG) const;
     SDValue ExpandADDSUB(SDNode *Op, SelectionDAG &DAG) const;
 
-    virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+    SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
 
-    virtual void computeMaskedBitsForTargetNode(const SDValue Op,
-                                                APInt &KnownZero,
-                                                APInt &KnownOne,
-                                                const SelectionDAG &DAG,
-                                                unsigned Depth = 0) const;
+    void computeKnownBitsForTargetNode(const SDValue Op,
+                                       APInt &KnownZero,
+                                       APInt &KnownOne,
+                                       const SelectionDAG &DAG,
+                                       unsigned Depth = 0) const override;
 
-    virtual SDValue
+    SDValue
       LowerFormalArguments(SDValue Chain,
                            CallingConv::ID CallConv,
                            bool isVarArg,
                            const SmallVectorImpl<ISD::InputArg> &Ins,
                            SDLoc dl, SelectionDAG &DAG,
-                           SmallVectorImpl<SDValue> &InVals) const;
+                           SmallVectorImpl<SDValue> &InVals) const override;
 
-    virtual SDValue
+    SDValue
       LowerCall(TargetLowering::CallLoweringInfo &CLI,
-                SmallVectorImpl<SDValue> &InVals) const;
+                SmallVectorImpl<SDValue> &InVals) const override;
 
-    virtual SDValue
+    SDValue
       LowerReturn(SDValue Chain,
                   CallingConv::ID CallConv, bool isVarArg,
                   const SmallVectorImpl<ISD::OutputArg> &Outs,
                   const SmallVectorImpl<SDValue> &OutVals,
-                  SDLoc dl, SelectionDAG &DAG) const;
+                  SDLoc dl, SelectionDAG &DAG) const override;
 
-    virtual bool
+    bool
       CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
                      bool isVarArg,
                      const SmallVectorImpl<ISD::OutputArg> &ArgsFlags,
-                     LLVMContext &Context) const;
+                     LLVMContext &Context) const override;
   };
 }
 
diff --git a/lib/Target/XCore/XCoreInstrInfo.cpp b/lib/Target/XCore/XCoreInstrInfo.cpp
index cea3bbf..984f0cd 100644
--- a/lib/Target/XCore/XCoreInstrInfo.cpp
+++ b/lib/Target/XCore/XCoreInstrInfo.cpp
@@ -26,6 +26,8 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/TargetRegistry.h"
 
+using namespace llvm;
+
 #define GET_INSTRINFO_CTOR_DTOR
 #include "XCoreGenInstrInfo.inc"
 
@@ -41,9 +43,6 @@ namespace XCore {
 }
 }
 
-using namespace llvm;
-
-
 // Pin the vtable to this file.
 void XCoreInstrInfo::anchor() {}
 
@@ -289,7 +288,7 @@ XCoreInstrInfo::InsertBranch(MachineBasicBlock &MBB,MachineBasicBlock *TBB,
   assert((Cond.size() == 2 || Cond.size() == 0) &&
          "Unexpected number of components!");
   
-  if (FBB == 0) { // One way branch.
+  if (!FBB) { // One way branch.
     if (Cond.empty()) {
       // Unconditional branch
       BuildMI(&MBB, DL, get(XCore::BRFU_lu6)).addMBB(TBB);
@@ -428,13 +427,21 @@ static inline bool isImmU16(unsigned val) {
   return val < (1 << 16);
 }
 
+static bool isImmMskBitp(unsigned val) {
+  if (!isMask_32(val)) {
+    return false;
+  }
+  int N = Log2_32(val) + 1;
+  return (N >= 1 && N <= 8) || N == 16 || N == 24 || N == 32;
+}
+
 MachineBasicBlock::iterator XCoreInstrInfo::loadImmediate(
                                               MachineBasicBlock &MBB,
                                               MachineBasicBlock::iterator MI,
                                               unsigned Reg, uint64_t Value) const {
   DebugLoc dl;
   if (MI != MBB.end()) dl = MI->getDebugLoc();
-  if (isMask_32(Value)) {
+  if (isImmMskBitp(Value)) {
     int N = Log2_32(Value) + 1;
     return BuildMI(MBB, MI, dl, get(XCore::MKMSK_rus), Reg).addImm(N);
   }
diff --git a/lib/Target/XCore/XCoreInstrInfo.h b/lib/Target/XCore/XCoreInstrInfo.h
index 48c9cb5..e0be96b 100644
--- a/lib/Target/XCore/XCoreInstrInfo.h
+++ b/lib/Target/XCore/XCoreInstrInfo.h
@@ -32,55 +32,55 @@ public:
   /// such, whenever a client has an instance of instruction info, it should
   /// always be able to get register info as well (through this method).
   ///
-  virtual const TargetRegisterInfo &getRegisterInfo() const { return RI; }
+  const TargetRegisterInfo &getRegisterInfo() const { return RI; }
 
   /// isLoadFromStackSlot - If the specified machine instruction is a direct
   /// load from a stack slot, return the virtual or physical register number of
   /// the destination along with the FrameIndex of the loaded stack slot.  If
   /// not, return 0.  This predicate must return 0 if the instruction has
   /// any side effects other than loading from the stack slot.
-  virtual unsigned isLoadFromStackSlot(const MachineInstr *MI,
-                                       int &FrameIndex) const;
-  
+  unsigned isLoadFromStackSlot(const MachineInstr *MI,
+                               int &FrameIndex) const override;
+
   /// isStoreToStackSlot - If the specified machine instruction is a direct
   /// store to a stack slot, return the virtual or physical register number of
   /// the source reg along with the FrameIndex of the loaded stack slot.  If
   /// not, return 0.  This predicate must return 0 if the instruction has
   /// any side effects other than storing to the stack slot.
-  virtual unsigned isStoreToStackSlot(const MachineInstr *MI,
-                                      int &FrameIndex) const;
-  
-  virtual bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
-                             MachineBasicBlock *&FBB,
-                             SmallVectorImpl<MachineOperand> &Cond,
-                             bool AllowModify) const;
-  
-  virtual unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
-                                MachineBasicBlock *FBB,
-                                const SmallVectorImpl<MachineOperand> &Cond,
-                                DebugLoc DL) const;
-  
-  virtual unsigned RemoveBranch(MachineBasicBlock &MBB) const;
-
-  virtual void copyPhysReg(MachineBasicBlock &MBB,
-                           MachineBasicBlock::iterator I, DebugLoc DL,
-                           unsigned DestReg, unsigned SrcReg,
-                           bool KillSrc) const;
-
-  virtual void storeRegToStackSlot(MachineBasicBlock &MBB,
-                                   MachineBasicBlock::iterator MI,
-                                   unsigned SrcReg, bool isKill, int FrameIndex,
-                                   const TargetRegisterClass *RC,
-                                   const TargetRegisterInfo *TRI) const;
-
-  virtual void loadRegFromStackSlot(MachineBasicBlock &MBB,
-                                    MachineBasicBlock::iterator MI,
-                                    unsigned DestReg, int FrameIndex,
-                                    const TargetRegisterClass *RC,
-                                    const TargetRegisterInfo *TRI) const;
-
-  virtual bool ReverseBranchCondition(
-                            SmallVectorImpl<MachineOperand> &Cond) const;
+  unsigned isStoreToStackSlot(const MachineInstr *MI,
+                              int &FrameIndex) const override;
+
+  bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+                     MachineBasicBlock *&FBB,
+                     SmallVectorImpl<MachineOperand> &Cond,
+                     bool AllowModify) const override;
+
+  unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+                        MachineBasicBlock *FBB,
+                        const SmallVectorImpl<MachineOperand> &Cond,
+                        DebugLoc DL) const override;
+
+  unsigned RemoveBranch(MachineBasicBlock &MBB) const override;
+
+  void copyPhysReg(MachineBasicBlock &MBB,
+                   MachineBasicBlock::iterator I, DebugLoc DL,
+                   unsigned DestReg, unsigned SrcReg,
+                   bool KillSrc) const override;
+
+  void storeRegToStackSlot(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator MI,
+                           unsigned SrcReg, bool isKill, int FrameIndex,
+                           const TargetRegisterClass *RC,
+                           const TargetRegisterInfo *TRI) const override;
+
+  void loadRegFromStackSlot(MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator MI,
+                            unsigned DestReg, int FrameIndex,
+                            const TargetRegisterClass *RC,
+                            const TargetRegisterInfo *TRI) const override;
+
+  bool ReverseBranchCondition(
+                          SmallVectorImpl<MachineOperand> &Cond) const override;
 
   // Emit code before MBBI to load immediate value into physical register Reg.
   // Returns an iterator to the new instruction.
diff --git a/lib/Target/XCore/XCoreLowerThreadLocal.cpp b/lib/Target/XCore/XCoreLowerThreadLocal.cpp
index b398c2d..ac3bae5 100644
--- a/lib/Target/XCore/XCoreLowerThreadLocal.cpp
+++ b/lib/Target/XCore/XCoreLowerThreadLocal.cpp
@@ -48,7 +48,7 @@ namespace {
 
     bool lowerGlobal(GlobalVariable *GV);
 
-    bool runOnModule(Module &M);
+    bool runOnModule(Module &M) override;
   };
 }
 
@@ -189,13 +189,14 @@ bool XCoreLowerThreadLocal::lowerGlobal(GlobalVariable *GV) {
 
   // Create replacement global.
   ArrayType *NewType = createLoweredType(GV->getType()->getElementType());
-  Constant *NewInitializer = 0;
+  Constant *NewInitializer = nullptr;
   if (GV->hasInitializer())
     NewInitializer = createLoweredInitializer(NewType,
                                               GV->getInitializer());
   GlobalVariable *NewGV =
     new GlobalVariable(*M, NewType, GV->isConstant(), GV->getLinkage(),
-                       NewInitializer, "", 0, GlobalVariable::NotThreadLocal,
+                       NewInitializer, "", nullptr,
+                       GlobalVariable::NotThreadLocal,
                        GV->getType()->getAddressSpace(),
                        GV->isExternallyInitialized());
 
diff --git a/lib/Target/XCore/XCoreRegisterInfo.cpp b/lib/Target/XCore/XCoreRegisterInfo.cpp
index d85d717..316c82c 100644
--- a/lib/Target/XCore/XCoreRegisterInfo.cpp
+++ b/lib/Target/XCore/XCoreRegisterInfo.cpp
@@ -33,11 +33,13 @@
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetOptions.h"
 
+using namespace llvm;
+
+#define DEBUG_TYPE "xcore-reg-info"
+
 #define GET_REGINFO_TARGET_DESC
 #include "XCoreGenRegisterInfo.inc"
 
-using namespace llvm;
-
 XCoreRegisterInfo::XCoreRegisterInfo()
   : XCoreGenRegisterInfo(XCore::LR) {
 }
@@ -205,16 +207,16 @@ bool XCoreRegisterInfo::needsFrameMoves(const MachineFunction &MF) {
     MF.getFunction()->needsUnwindTableEntry();
 }
 
-const uint16_t* XCoreRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF)
+const MCPhysReg* XCoreRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF)
                                                                          const {
   // The callee saved registers LR & FP are explicitly handled during
   // emitPrologue & emitEpilogue and related functions.
-  static const uint16_t CalleeSavedRegs[] = {
+  static const MCPhysReg CalleeSavedRegs[] = {
     XCore::R4, XCore::R5, XCore::R6, XCore::R7,
     XCore::R8, XCore::R9, XCore::R10,
     0
   };
-  static const uint16_t CalleeSavedRegsFP[] = {
+  static const MCPhysReg CalleeSavedRegsFP[] = {
     XCore::R4, XCore::R5, XCore::R6, XCore::R7,
     XCore::R8, XCore::R9,
     0
diff --git a/lib/Target/XCore/XCoreRegisterInfo.h b/lib/Target/XCore/XCoreRegisterInfo.h
index 36ba7b4..aa617a0 100644
--- a/lib/Target/XCore/XCoreRegisterInfo.h
+++ b/lib/Target/XCore/XCoreRegisterInfo.h
@@ -29,22 +29,23 @@ public:
 
   /// Code Generation virtual methods...
 
-  const uint16_t *getCalleeSavedRegs(const MachineFunction *MF = 0) const;
+  const MCPhysReg *
+  getCalleeSavedRegs(const MachineFunction *MF =nullptr) const override;
 
-  BitVector getReservedRegs(const MachineFunction &MF) const;
+  BitVector getReservedRegs(const MachineFunction &MF) const override;
   
-  bool requiresRegisterScavenging(const MachineFunction &MF) const;
+  bool requiresRegisterScavenging(const MachineFunction &MF) const override;
 
-  bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const;
+  bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const override;
 
-  bool useFPForScavengingIndex(const MachineFunction &MF) const;
+  bool useFPForScavengingIndex(const MachineFunction &MF) const override;
 
   void eliminateFrameIndex(MachineBasicBlock::iterator II,
                            int SPAdj, unsigned FIOperandNum,
-                           RegScavenger *RS = NULL) const;
+                           RegScavenger *RS = nullptr) const override;
 
   // Debug information queries.
-  unsigned getFrameRegister(const MachineFunction &MF) const;
+  unsigned getFrameRegister(const MachineFunction &MF) const override;
 
   //! Return whether to emit frame moves
   static bool needsFrameMoves(const MachineFunction &MF);
diff --git a/lib/Target/XCore/XCoreSelectionDAGInfo.cpp b/lib/Target/XCore/XCoreSelectionDAGInfo.cpp
index 68ede6a..5a6bbe7 100644
--- a/lib/Target/XCore/XCoreSelectionDAGInfo.cpp
+++ b/lib/Target/XCore/XCoreSelectionDAGInfo.cpp
@@ -11,10 +11,11 @@
 //
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "xcore-selectiondag-info"
 #include "XCoreTargetMachine.h"
 using namespace llvm;
 
+#define DEBUG_TYPE "xcore-selectiondag-info"
+
 XCoreSelectionDAGInfo::XCoreSelectionDAGInfo(const XCoreTargetMachine &TM)
   : TargetSelectionDAGInfo(TM) {
 }
@@ -41,13 +42,15 @@ EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl, SDValue Chain,
     Entry.Node = Src; Args.push_back(Entry);
     Entry.Node = Size; Args.push_back(Entry);
 
-    TargetLowering::CallLoweringInfo
-    CLI(Chain, Type::getVoidTy(*DAG.getContext()), false, false, false, false,
-        0, TLI.getLibcallCallingConv(RTLIB::MEMCPY), /*isTailCall=*/false,
-        /*doesNotRet=*/false, /*isReturnValueUsed=*/false,
-        DAG.getExternalSymbol("__memcpy_4", TLI.getPointerTy()), Args, DAG, dl);
-    std::pair<SDValue,SDValue> CallResult =
-      TLI.LowerCallTo(CLI);
+    TargetLowering::CallLoweringInfo CLI(DAG);
+    CLI.setDebugLoc(dl).setChain(Chain)
+      .setCallee(TLI.getLibcallCallingConv(RTLIB::MEMCPY),
+                 Type::getVoidTy(*DAG.getContext()),
+                 DAG.getExternalSymbol("__memcpy_4", TLI.getPointerTy()),
+                 &Args, 0)
+      .setDiscardResult();
+
+    std::pair<SDValue,SDValue> CallResult = TLI.LowerCallTo(CLI);
     return CallResult.second;
   }
 
diff --git a/lib/Target/XCore/XCoreSelectionDAGInfo.h b/lib/Target/XCore/XCoreSelectionDAGInfo.h
index 31704f3..ea6af98 100644
--- a/lib/Target/XCore/XCoreSelectionDAGInfo.h
+++ b/lib/Target/XCore/XCoreSelectionDAGInfo.h
@@ -25,14 +25,14 @@ public:
   explicit XCoreSelectionDAGInfo(const XCoreTargetMachine &TM);
   ~XCoreSelectionDAGInfo();
 
-  virtual SDValue
+  SDValue
   EmitTargetCodeForMemcpy(SelectionDAG &DAG, SDLoc dl,
                           SDValue Chain,
                           SDValue Op1, SDValue Op2,
                           SDValue Op3, unsigned Align, bool isVolatile,
                           bool AlwaysInline,
                           MachinePointerInfo DstPtrInfo,
-                          MachinePointerInfo SrcPtrInfo) const;
+                          MachinePointerInfo SrcPtrInfo) const override;
 };
 
 }
diff --git a/lib/Target/XCore/XCoreSubtarget.cpp b/lib/Target/XCore/XCoreSubtarget.cpp
index 8cfb770..89ea03a 100644
--- a/lib/Target/XCore/XCoreSubtarget.cpp
+++ b/lib/Target/XCore/XCoreSubtarget.cpp
@@ -15,12 +15,14 @@
 #include "XCore.h"
 #include "llvm/Support/TargetRegistry.h"
 
+using namespace llvm;
+
+#define DEBUG_TYPE "xcore-subtarget"
+
 #define GET_SUBTARGETINFO_TARGET_DESC
 #define GET_SUBTARGETINFO_CTOR
 #include "XCoreGenSubtargetInfo.inc"
 
-using namespace llvm;
-
 void XCoreSubtarget::anchor() { }
 
 XCoreSubtarget::XCoreSubtarget(const std::string &TT,
diff --git a/lib/Target/XCore/XCoreTargetMachine.cpp b/lib/Target/XCore/XCoreTargetMachine.cpp
index 781a87b..0fb21c5 100644
--- a/lib/Target/XCore/XCoreTargetMachine.cpp
+++ b/lib/Target/XCore/XCoreTargetMachine.cpp
@@ -46,9 +46,9 @@ public:
     return getTM<XCoreTargetMachine>();
   }
 
-  virtual bool addPreISel();
-  virtual bool addInstSelector();
-  virtual bool addPreEmitPass();
+  bool addPreISel() override;
+  bool addInstSelector() override;
+  bool addPreEmitPass() override;
 };
 } // namespace
 
diff --git a/lib/Target/XCore/XCoreTargetMachine.h b/lib/Target/XCore/XCoreTargetMachine.h
index a19a677..a57ca55 100644
--- a/lib/Target/XCore/XCoreTargetMachine.h
+++ b/lib/Target/XCore/XCoreTargetMachine.h
@@ -37,28 +37,28 @@ public:
                      Reloc::Model RM, CodeModel::Model CM,
                      CodeGenOpt::Level OL);
 
-  virtual const XCoreInstrInfo *getInstrInfo() const { return &InstrInfo; }
-  virtual const XCoreFrameLowering *getFrameLowering() const {
+  const XCoreInstrInfo *getInstrInfo() const override { return &InstrInfo; }
+  const XCoreFrameLowering *getFrameLowering() const override {
     return &FrameLowering;
   }
-  virtual const XCoreSubtarget *getSubtargetImpl() const { return &Subtarget; }
-  virtual const XCoreTargetLowering *getTargetLowering() const {
+  const XCoreSubtarget *getSubtargetImpl() const override { return &Subtarget; }
+  const XCoreTargetLowering *getTargetLowering() const override {
     return &TLInfo;
   }
 
-  virtual const XCoreSelectionDAGInfo* getSelectionDAGInfo() const {
+  const XCoreSelectionDAGInfo* getSelectionDAGInfo() const override {
     return &TSInfo;
   }
 
-  virtual const TargetRegisterInfo *getRegisterInfo() const {
+  const TargetRegisterInfo *getRegisterInfo() const override {
     return &InstrInfo.getRegisterInfo();
   }
-  virtual const DataLayout       *getDataLayout() const { return &DL; }
+  const DataLayout       *getDataLayout() const override { return &DL; }
 
   // Pass Pipeline Configuration
-  virtual TargetPassConfig *createPassConfig(PassManagerBase &PM);
+  TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
 
-  virtual void addAnalysisPasses(PassManagerBase &PM);
+  void addAnalysisPasses(PassManagerBase &PM) override;
 };
 
 } // end namespace llvm
diff --git a/lib/Target/XCore/XCoreTargetObjectFile.h b/lib/Target/XCore/XCoreTargetObjectFile.h
index 733e6d3..34d756e 100644
--- a/lib/Target/XCore/XCoreTargetObjectFile.h
+++ b/lib/Target/XCore/XCoreTargetObjectFile.h
@@ -22,7 +22,7 @@ static const unsigned CodeModelLargeSize = 256;
    const MCSection *ReadOnlySectionLarge;
    const MCSection *DataRelROSectionLarge;
   public:
-    void Initialize(MCContext &Ctx, const TargetMachine &TM);
+    void Initialize(MCContext &Ctx, const TargetMachine &TM) override;
 
     const MCSection *
       getExplicitSectionGlobal(const GlobalValue *GV,
diff --git a/lib/Target/XCore/XCoreTargetTransformInfo.cpp b/lib/Target/XCore/XCoreTargetTransformInfo.cpp
index 313d18d..80d193d 100644
--- a/lib/Target/XCore/XCoreTargetTransformInfo.cpp
+++ b/lib/Target/XCore/XCoreTargetTransformInfo.cpp
@@ -14,7 +14,6 @@
 ///
 //===----------------------------------------------------------------------===//
 
-#define DEBUG_TYPE "xcoretti"
 #include "XCore.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Support/Debug.h"
@@ -22,8 +21,10 @@
 #include "llvm/Target/TargetLowering.h"
 using namespace llvm;
 
+#define DEBUG_TYPE "xcoretti"
+
 // Declare the pass initialization routine locally as target-specific passes
-// don't havve a target-wide initialization entry point, and so we rely on the
+// don't have a target-wide initialization entry point, and so we rely on the
 // pass constructor initialization.
 namespace llvm {
 void initializeXCoreTTIPass(PassRegistry &);