39 files changed, 2346 insertions, 1157 deletions
diff --git a/lib/Target/ARM/A15SDOptimizer.cpp b/lib/Target/ARM/A15SDOptimizer.cpp
index f0d4dbe..e8c2f7c 100644
--- a/lib/Target/ARM/A15SDOptimizer.cpp
+++ b/lib/Target/ARM/A15SDOptimizer.cpp
@@ -615,7 +615,7 @@ bool A15SDOptimizer::runOnInstruction(MachineInstr *MI) {
   SmallVector<unsigned, 8> Defs = getReadDPRs(MI);
   bool Modified = false;
 
-  for (SmallVector<unsigned, 8>::iterator I = Defs.begin(), E = Defs.end();
+  for (SmallVectorImpl<unsigned>::iterator I = Defs.begin(), E = Defs.end();
      I != E; ++I) {
     // Follow the def-use chain for this DPR through COPYs, and also through
     // PHIs (which are essentially multi-way COPYs). It is because of PHIs that
@@ -630,7 +630,7 @@ bool A15SDOptimizer::runOnInstruction(MachineInstr *MI) {
 
     elideCopiesAndPHIs(Def, DefSrcs);
 
-    for (SmallVector<MachineInstr*, 8>::iterator II = DefSrcs.begin(),
+    for (SmallVectorImpl<MachineInstr *>::iterator II = DefSrcs.begin(),
       EE = DefSrcs.end(); II != EE; ++II) {
       MachineInstr *MI = *II;
 
@@ -655,7 +655,7 @@ bool A15SDOptimizer::runOnInstruction(MachineInstr *MI) {
 
       if (NewReg != 0) {
         Modified = true;
-        for (SmallVector<MachineOperand*, 8>::const_iterator I = Uses.begin(),
+        for (SmallVectorImpl<MachineOperand *>::const_iterator I = Uses.begin(),
                E = Uses.end(); I != E; ++I) {
           DEBUG(dbgs() << "Replacing operand "
                        << **I << " with "
diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td
index 1bc9d6b..e5da3a5 100644
--- a/lib/Target/ARM/ARM.td
+++ b/lib/Target/ARM/ARM.td
@@ -45,6 +45,9 @@ def FeatureFP16   : SubtargetFeature<"fp16", "HasFP16", "true",
 def FeatureVFP4   : SubtargetFeature<"vfp4", "HasVFPv4", "true",
                                      "Enable VFP4 instructions",
                                      [FeatureVFP3, FeatureFP16]>;
+def FeatureV8FP : SubtargetFeature<"v8fp", "HasV8FP",
+                                   "true", "Enable ARMv8 FP",
+                                   [FeatureVFP4]>;
 def FeatureD16    : SubtargetFeature<"d16", "HasD16", "true",
                                      "Restrict VFP3 to 16 double registers">;
 def FeatureHWDiv  : SubtargetFeature<"hwdiv", "HasHardwareDivide", "true",
@@ -138,6 +141,9 @@ def HasV6T2Ops  : SubtargetFeature<"v6t2", "HasV6T2Ops", "true",
 def HasV7Ops    : SubtargetFeature<"v7", "HasV7Ops", "true",
                                    "Support ARM v7 instructions",
                                    [HasV6T2Ops, FeaturePerfMon]>;
+def HasV8Ops    : SubtargetFeature<"v8", "HasV8Ops", "true",
+                                   "Support ARM v8 instructions",
+                                   [HasV7Ops]>;
 
 //===----------------------------------------------------------------------===//
 // ARM Processors supported.
@@ -173,7 +179,7 @@ def ProcSwift   : SubtargetFeature<"swift", "ARMProcFamily", "Swift",
 // FIXME: It has not been determined if A15 has these features.
 def ProcA15      : SubtargetFeature<"a15", "ARMProcFamily", "CortexA15",
                                    "Cortex-A15 ARM processors",
-                                   [FeatureT2XtPk, FeatureFP16,
+                                   [FeatureT2XtPk, FeatureFP16, FeatureVFP4,
                                     FeatureAvoidPartialCPSR,
                                     FeatureTrustZone]>;
 def ProcR5      : SubtargetFeature<"r5", "ARMProcFamily", "CortexR5",
@@ -291,6 +297,9 @@ def : ProcessorModel<"swift",       SwiftModel,
                                      FeatureDB, FeatureDSPThumb2,
                                      FeatureHasRAS]>;
 
+// V8 Processors
+def : ProcNoItin<"cortex-a53",      [HasV8Ops]>;
+
 //===----------------------------------------------------------------------===//
 // Register File Description
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/ARM/ARMAsmPrinter.cpp b/lib/Target/ARM/ARMAsmPrinter.cpp
index 13ec208..13a22b1 100644
--- a/lib/Target/ARM/ARMAsmPrinter.cpp
+++ b/lib/Target/ARM/ARMAsmPrinter.cpp
@@ -213,77 +213,67 @@ namespace {
 
 } // end of anonymous namespace
 
-MachineLocation ARMAsmPrinter::
-getDebugValueLocation(const MachineInstr *MI) const {
-  MachineLocation Location;
-  assert(MI->getNumOperands() == 4 && "Invalid no. of machine operands!");
-  // Frame address.  Currently handles register +- offset only.
-  if (MI->getOperand(0).isReg() && MI->getOperand(1).isImm())
-    Location.set(MI->getOperand(0).getReg(), MI->getOperand(1).getImm());
-  else {
-    DEBUG(dbgs() << "DBG_VALUE instruction ignored! " << *MI << "\n");
-  }
-  return Location;
-}
-
 /// EmitDwarfRegOp - Emit dwarf register operation.
-void ARMAsmPrinter::EmitDwarfRegOp(const MachineLocation &MLoc) const {
+void ARMAsmPrinter::EmitDwarfRegOp(const MachineLocation &MLoc,
+                                   bool Indirect) const {
   const TargetRegisterInfo *RI = TM.getRegisterInfo();
-  if (RI->getDwarfRegNum(MLoc.getReg(), false) != -1)
-    AsmPrinter::EmitDwarfRegOp(MLoc);
-  else {
-    unsigned Reg = MLoc.getReg();
-    if (Reg >= ARM::S0 && Reg <= ARM::S31) {
-      assert(ARM::S0 + 31 == ARM::S31 && "Unexpected ARM S register numbering");
-      // S registers are described as bit-pieces of a register
-      // S[2x] = DW_OP_regx(256 + (x>>1)) DW_OP_bit_piece(32, 0)
-      // S[2x+1] = DW_OP_regx(256 + (x>>1)) DW_OP_bit_piece(32, 32)
-
-      unsigned SReg = Reg - ARM::S0;
-      bool odd = SReg & 0x1;
-      unsigned Rx = 256 + (SReg >> 1);
-
-      OutStreamer.AddComment("DW_OP_regx for S register");
-      EmitInt8(dwarf::DW_OP_regx);
-
-      OutStreamer.AddComment(Twine(SReg));
-      EmitULEB128(Rx);
-
-      if (odd) {
-        OutStreamer.AddComment("DW_OP_bit_piece 32 32");
-        EmitInt8(dwarf::DW_OP_bit_piece);
-        EmitULEB128(32);
-        EmitULEB128(32);
-      } else {
-        OutStreamer.AddComment("DW_OP_bit_piece 32 0");
-        EmitInt8(dwarf::DW_OP_bit_piece);
-        EmitULEB128(32);
-        EmitULEB128(0);
-      }
-    } else if (Reg >= ARM::Q0 && Reg <= ARM::Q15) {
-      assert(ARM::Q0 + 15 == ARM::Q15 && "Unexpected ARM Q register numbering");
-      // Q registers Q0-Q15 are described by composing two D registers together.
-      // Qx = DW_OP_regx(256+2x) DW_OP_piece(8) DW_OP_regx(256+2x+1)
-      // DW_OP_piece(8)
-
-      unsigned QReg = Reg - ARM::Q0;
-      unsigned D1 = 256 + 2 * QReg;
-      unsigned D2 = D1 + 1;
-
-      OutStreamer.AddComment("DW_OP_regx for Q register: D1");
-      EmitInt8(dwarf::DW_OP_regx);
-      EmitULEB128(D1);
-      OutStreamer.AddComment("DW_OP_piece 8");
-      EmitInt8(dwarf::DW_OP_piece);
-      EmitULEB128(8);
-
-      OutStreamer.AddComment("DW_OP_regx for Q register: D2");
-      EmitInt8(dwarf::DW_OP_regx);
-      EmitULEB128(D2);
-      OutStreamer.AddComment("DW_OP_piece 8");
-      EmitInt8(dwarf::DW_OP_piece);
-      EmitULEB128(8);
+  if (RI->getDwarfRegNum(MLoc.getReg(), false) != -1) {
+    AsmPrinter::EmitDwarfRegOp(MLoc, Indirect);
+    return;
+  }
+  assert(MLoc.isReg() && !Indirect &&
+         "This doesn't support offset/indirection - implement it if needed");
+  unsigned Reg = MLoc.getReg();
+  if (Reg >= ARM::S0 && Reg <= ARM::S31) {
+    assert(ARM::S0 + 31 == ARM::S31 && "Unexpected ARM S register numbering");
+    // S registers are described as bit-pieces of a register
+    // S[2x] = DW_OP_regx(256 + (x>>1)) DW_OP_bit_piece(32, 0)
+    // S[2x+1] = DW_OP_regx(256 + (x>>1)) DW_OP_bit_piece(32, 32)
+
+    unsigned SReg = Reg - ARM::S0;
+    bool odd = SReg & 0x1;
+    unsigned Rx = 256 + (SReg >> 1);
+
+    OutStreamer.AddComment("DW_OP_regx for S register");
+    EmitInt8(dwarf::DW_OP_regx);
+
+    OutStreamer.AddComment(Twine(SReg));
+    EmitULEB128(Rx);
+
+    if (odd) {
+      OutStreamer.AddComment("DW_OP_bit_piece 32 32");
+      EmitInt8(dwarf::DW_OP_bit_piece);
+      EmitULEB128(32);
+      EmitULEB128(32);
+    } else {
+      OutStreamer.AddComment("DW_OP_bit_piece 32 0");
+      EmitInt8(dwarf::DW_OP_bit_piece);
+      EmitULEB128(32);
+      EmitULEB128(0);
     }
+  } else if (Reg >= ARM::Q0 && Reg <= ARM::Q15) {
+    assert(ARM::Q0 + 15 == ARM::Q15 && "Unexpected ARM Q register numbering");
+    // Q registers Q0-Q15 are described by composing two D registers together.
+    // Qx = DW_OP_regx(256+2x) DW_OP_piece(8) DW_OP_regx(256+2x+1)
+    // DW_OP_piece(8)
+
+    unsigned QReg = Reg - ARM::Q0;
+    unsigned D1 = 256 + 2 * QReg;
+    unsigned D2 = D1 + 1;
+
+    OutStreamer.AddComment("DW_OP_regx for Q register: D1");
+    EmitInt8(dwarf::DW_OP_regx);
+    EmitULEB128(D1);
+    OutStreamer.AddComment("DW_OP_piece 8");
+    EmitInt8(dwarf::DW_OP_piece);
+    EmitULEB128(8);
+
+    OutStreamer.AddComment("DW_OP_regx for Q register: D2");
+    EmitInt8(dwarf::DW_OP_regx);
+    EmitULEB128(D2);
+    OutStreamer.AddComment("DW_OP_piece 8");
+    EmitInt8(dwarf::DW_OP_piece);
+    EmitULEB128(8);
   }
 }
 
@@ -474,8 +464,14 @@ bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
       // This takes advantage of the 2 operand-ness of ldm/stm and that we've
       // already got the operands in registers that are operands to the
       // inline asm statement.
-
-      O << "{" << ARMInstPrinter::getRegisterName(RegBegin);
+      O << "{";
+      if (ARM::GPRPairRegClass.contains(RegBegin)) {
+        const TargetRegisterInfo *TRI = MF->getTarget().getRegisterInfo();
+        unsigned Reg0 = TRI->getSubReg(RegBegin, ARM::gsub_0);
+        O << ARMInstPrinter::getRegisterName(Reg0) << ", ";;
+        RegBegin = TRI->getSubReg(RegBegin, ARM::gsub_1);
+      }
+      O << ARMInstPrinter::getRegisterName(RegBegin);
 
       // FIXME: The register allocator not only may not have given us the
       // registers in sequence, but may not be in ascending registers. This
@@ -501,6 +497,20 @@ bool ARMAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
         return true;
       unsigned Flags = FlagsOP.getImm();
       unsigned NumVals = InlineAsm::getNumOperandRegisters(Flags);
+      unsigned RC;
+      InlineAsm::hasRegClassConstraint(Flags, RC);
+      if (RC == ARM::GPRPairRegClassID) {
+        if (NumVals != 1)
+          return true;
+        const MachineOperand &MO = MI->getOperand(OpNum);
+        if (!MO.isReg())
+          return true;
+        const TargetRegisterInfo *TRI = MF->getTarget().getRegisterInfo();
+        unsigned Reg = TRI->getSubReg(MO.getReg(), ExtraCode[0] == 'Q' ?
+            ARM::gsub_0 : ARM::gsub_1);
+        O << ARMInstPrinter::getRegisterName(Reg);
+        return false;
+      }
       if (NumVals != 2)
         return true;
       unsigned RegOp = ExtraCode[0] == 'Q' ? OpNum : OpNum + 1;
@@ -759,16 +769,9 @@ void ARMAsmPrinter::emitAttributes() {
                                ARMBuildAttrs::Allowed);
     AttrEmitter->EmitAttribute(ARMBuildAttrs::THUMB_ISA_use,
                                ARMBuildAttrs::Allowed);
-  } else if (CPUString == "generic") {
-    // For a generic CPU, we assume a standard v7a architecture in Subtarget.
-    AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch, ARMBuildAttrs::v7);
-    AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch_profile,
-                               ARMBuildAttrs::ApplicationProfile);
-    AttrEmitter->EmitAttribute(ARMBuildAttrs::ARM_ISA_use,
-                               ARMBuildAttrs::Allowed);
-    AttrEmitter->EmitAttribute(ARMBuildAttrs::THUMB_ISA_use,
-                               ARMBuildAttrs::AllowThumb32);
-  } else if (Subtarget->hasV7Ops()) {
+  } else if (Subtarget->hasV8Ops())
+    AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch, ARMBuildAttrs::v8);
+  else if (Subtarget->hasV7Ops()) {
     AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch, ARMBuildAttrs::v7);
     AttrEmitter->EmitAttribute(ARMBuildAttrs::THUMB_ISA_use,
                                ARMBuildAttrs::AllowThumb32);
@@ -782,6 +785,8 @@ void ARMAsmPrinter::emitAttributes() {
     AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch, ARMBuildAttrs::v5T);
   else if (Subtarget->hasV4TOps())
     AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch, ARMBuildAttrs::v4T);
+  else
+    AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch, ARMBuildAttrs::v4);
 
   if (Subtarget->hasNEON() && emitFPU) {
     /* NEON is not exactly a VFP architecture, but GAS emit one of
@@ -796,8 +801,14 @@ void ARMAsmPrinter::emitAttributes() {
     emitFPU = false;
   }
 
-  /* VFPv4 + .fpu */
-  if (Subtarget->hasVFP4()) {
+  /* V8FP + .fpu */
+  if (Subtarget->hasV8FP()) {
+    AttrEmitter->EmitAttribute(ARMBuildAttrs::VFP_arch,
+                               ARMBuildAttrs::AllowV8FPA);
+    if (emitFPU)
+      AttrEmitter->EmitTextAttribute(ARMBuildAttrs::VFP_arch, "v8fp");
+    /* VFPv4 + .fpu */
+  } else if (Subtarget->hasVFP4()) {
     AttrEmitter->EmitAttribute(ARMBuildAttrs::VFP_arch,
                                ARMBuildAttrs::AllowFPv4A);
     if (emitFPU)
@@ -821,8 +832,12 @@ void ARMAsmPrinter::emitAttributes() {
   /* TODO: ARMBuildAttrs::Allowed is not completely accurate,
    * since NEON can have 1 (allowed) or 2 (MAC operations) */
   if (Subtarget->hasNEON()) {
-    AttrEmitter->EmitAttribute(ARMBuildAttrs::Advanced_SIMD_arch,
-                               ARMBuildAttrs::Allowed);
+    if (Subtarget->hasV8Ops())
+      AttrEmitter->EmitAttribute(ARMBuildAttrs::Advanced_SIMD_arch,
+                                 ARMBuildAttrs::AllowedNeonV8);
+    else
+      AttrEmitter->EmitAttribute(ARMBuildAttrs::Advanced_SIMD_arch,
+                                 ARMBuildAttrs::Allowed);
   }
 
   // Signal various FP modes.
@@ -1092,23 +1107,6 @@ void ARMAsmPrinter::EmitJump2Table(const MachineInstr *MI) {
     OutStreamer.EmitDataRegion(MCDR_DataRegionEnd);
 }
 
-void ARMAsmPrinter::PrintDebugValueComment(const MachineInstr *MI,
-                                           raw_ostream &OS) {
-  unsigned NOps = MI->getNumOperands();
-  assert(NOps==4);
-  OS << '\t' << MAI->getCommentString() << "DEBUG_VALUE: ";
-  // cast away const; DIetc do not take const operands for some reason.
-  DIVariable V(const_cast<MDNode *>(MI->getOperand(NOps-1).getMetadata()));
-  OS << V.getName();
-  OS << " <- ";
-  // Frame address.  Currently handles register +- offset only.
-  assert(MI->getOperand(0).isReg() && MI->getOperand(1).isImm());
-  OS << '['; printOperand(MI, 0, OS); OS << '+'; printOperand(MI, 1, OS);
-  OS << ']';
-  OS << "+";
-  printOperand(MI, NOps-2, OS);
-}
-
 void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) {
   assert(MI->getFlag(MachineInstr::FrameSetup) &&
       "Only instruction which are involved into frame setup code are allowed");
@@ -1272,15 +1270,7 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
   unsigned Opc = MI->getOpcode();
   switch (Opc) {
   case ARM::t2MOVi32imm: llvm_unreachable("Should be lowered by thumb2it pass");
-  case ARM::DBG_VALUE: {
-    if (isVerbose() && OutStreamer.hasRawTextSupport()) {
-      SmallString<128> TmpStr;
-      raw_svector_ostream OS(TmpStr);
-      PrintDebugValueComment(MI, OS);
-      OutStreamer.EmitRawText(StringRef(OS.str()));
-    }
-    return;
-  }
+  case ARM::DBG_VALUE: llvm_unreachable("Should be handled by generic printing");
   case ARM::LEApcrel:
   case ARM::tLEApcrel:
   case ARM::t2LEApcrel: {
diff --git a/lib/Target/ARM/ARMAsmPrinter.h b/lib/Target/ARM/ARMAsmPrinter.h
index c945e4f..de72e06 100644
--- a/lib/Target/ARM/ARMAsmPrinter.h
+++ b/lib/Target/ARM/ARMAsmPrinter.h
@@ -97,13 +97,9 @@ private:
                                    const MachineInstr *MI);
 
 public:
-  void PrintDebugValueComment(const MachineInstr *MI, raw_ostream &OS);
-
-  virtual MachineLocation
-    getDebugValueLocation(const MachineInstr *MI) const LLVM_OVERRIDE;
-
   /// EmitDwarfRegOp - Emit dwarf register operation.
-  virtual void EmitDwarfRegOp(const MachineLocation &MLoc) const LLVM_OVERRIDE;
+  virtual void EmitDwarfRegOp(const MachineLocation &MLoc, bool Indirect) const
+      LLVM_OVERRIDE;
 
   virtual unsigned getISAEncoding() LLVM_OVERRIDE {
     // ARM/Darwin adds ISA to the DWARF info for each function.
diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp
index ad14475..977d936 100644
--- a/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -272,104 +272,90 @@ ARMBaseInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,MachineBasicBlock *&TBB,
                                 MachineBasicBlock *&FBB,
                                 SmallVectorImpl<MachineOperand> &Cond,
                                 bool AllowModify) const {
-  // If the block has no terminators, it just falls into the block after it.
+  TBB = 0;
+  FBB = 0;
+
   MachineBasicBlock::iterator I = MBB.end();
   if (I == MBB.begin())
-    return false;
+    return false; // Empty blocks are easy.
   --I;
-  while (I->isDebugValue()) {
-    if (I == MBB.begin())
-      return false;
-    --I;
-  }
 
-  // Get the last instruction in the block.
-  MachineInstr *LastInst = I;
-  unsigned LastOpc = LastInst->getOpcode();
+  // Walk backwards from the end of the basic block until the branch is
+  // analyzed or we give up.
+  while (isPredicated(I) || I->isTerminator()) {
 
-  // Check if it's an indirect branch first, this should return 'unanalyzable'
-  // even if it's predicated.
-  if (isIndirectBranchOpcode(LastOpc))
-    return true;
-
-  if (!isUnpredicatedTerminator(I))
-    return false;
+    // Flag to be raised on unanalyzeable instructions. This is useful in cases
+    // where we want to clean up on the end of the basic block before we bail
+    // out.
+    bool CantAnalyze = false;
 
-  // If there is only one terminator instruction, process it.
-  if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) {
-    if (isUncondBranchOpcode(LastOpc)) {
-      TBB = LastInst->getOperand(0).getMBB();
-      return false;
+    // Skip over DEBUG values and predicated nonterminators.
+    while (I->isDebugValue() || !I->isTerminator()) {
+      if (I == MBB.begin())
+        return false;
+      --I;
     }
-    if (isCondBranchOpcode(LastOpc)) {
-      // Block ends with fall-through condbranch.
-      TBB = LastInst->getOperand(0).getMBB();
-      Cond.push_back(LastInst->getOperand(1));
-      Cond.push_back(LastInst->getOperand(2));
-      return false;
+
+    if (isIndirectBranchOpcode(I->getOpcode()) ||
+        isJumpTableBranchOpcode(I->getOpcode())) {
+      // Indirect branches and jump tables can't be analyzed, but we still want
+      // to clean up any instructions at the tail of the basic block.
+      CantAnalyze = true;
+    } else if (isUncondBranchOpcode(I->getOpcode())) {
+      TBB = I->getOperand(0).getMBB();
+    } else if (isCondBranchOpcode(I->getOpcode())) {
+      // Bail out if we encounter multiple conditional branches.
+      if (!Cond.empty())
+        return true;
+
+      assert(!FBB && "FBB should have been null.");
+      FBB = TBB;
+      TBB = I->getOperand(0).getMBB();
+      Cond.push_back(I->getOperand(1));
+      Cond.push_back(I->getOperand(2));
+    } else if (I->isReturn()) {
+      // Returns can't be analyzed, but we should run cleanup.
+      CantAnalyze = !isPredicated(I);
+    } else {
+      // We encountered other unrecognized terminator. Bail out immediately.
+      return true;
     }
-    return true;  // Can't handle indirect branch.
-  }
 
-  // Get the instruction before it if it is a terminator.
-  MachineInstr *SecondLastInst = I;
-  unsigned SecondLastOpc = SecondLastInst->getOpcode();
-
-  // If AllowModify is true and the block ends with two or more unconditional
-  // branches, delete all but the first unconditional branch.
-  if (AllowModify && isUncondBranchOpcode(LastOpc)) {
-    while (isUncondBranchOpcode(SecondLastOpc)) {
-      LastInst->eraseFromParent();
-      LastInst = SecondLastInst;
-      LastOpc = LastInst->getOpcode();
-      if (I == MBB.begin() || !isUnpredicatedTerminator(--I)) {
-        // Return now the only terminator is an unconditional branch.
-        TBB = LastInst->getOperand(0).getMBB();
-        return false;
-      } else {
-        SecondLastInst = I;
-        SecondLastOpc = SecondLastInst->getOpcode();
+    // Cleanup code - to be run for unpredicated unconditional branches and
+    //                returns.
+    if (!isPredicated(I) &&
+          (isUncondBranchOpcode(I->getOpcode()) ||
+           isIndirectBranchOpcode(I->getOpcode()) ||
+           isJumpTableBranchOpcode(I->getOpcode()) ||
+           I->isReturn())) {
+      // Forget any previous condition branch information - it no longer applies.
+      Cond.clear();
+      FBB = 0;
+
+      // If we can modify the function, delete everything below this
+      // unconditional branch.
+      if (AllowModify) {
+        MachineBasicBlock::iterator DI = llvm::next(I);
+        while (DI != MBB.end()) {
+          MachineInstr *InstToDelete = DI;
+          ++DI;
+          InstToDelete->eraseFromParent();
+        }
       }
     }
-  }
-
-  // If there are three terminators, we don't know what sort of block this is.
-  if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(--I))
-    return true;
 
-  // If the block ends with a B and a Bcc, handle it.
-  if (isCondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
-    TBB =  SecondLastInst->getOperand(0).getMBB();
-    Cond.push_back(SecondLastInst->getOperand(1));
-    Cond.push_back(SecondLastInst->getOperand(2));
-    FBB = LastInst->getOperand(0).getMBB();
-    return false;
-  }
+    if (CantAnalyze)
+      return true;
 
-  // If the block ends with two unconditional branches, handle it.  The second
-  // one is not executed, so remove it.
-  if (isUncondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
-    TBB = SecondLastInst->getOperand(0).getMBB();
-    I = LastInst;
-    if (AllowModify)
-      I->eraseFromParent();
-    return false;
-  }
+    if (I == MBB.begin())
+      return false;
 
-  // ...likewise if it ends with a branch table followed by an unconditional
-  // branch. The branch folder can create these, and we must get rid of them for
-  // correctness of Thumb constant islands.
-  if ((isJumpTableBranchOpcode(SecondLastOpc) ||
-       isIndirectBranchOpcode(SecondLastOpc)) &&
-      isUncondBranchOpcode(LastOpc)) {
-    I = LastInst;
-    if (AllowModify)
-      I->eraseFromParent();
-    return true;
+    --I;
   }
 
-  // Otherwise, can't handle this.
-  return true;
+  // We made it past the terminators without bailing out - we must have
+  // analyzed this branch successfully.
+  return false;
 }
 
 
@@ -745,6 +731,9 @@ void ARMBaseInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
     if (Opc == ARM::VORRq)
       Mov.addReg(Src);
     Mov = AddDefaultPred(Mov);
+    // MOVr can set CC.
+    if (Opc == ARM::MOVr)
+      Mov = AddDefaultCC(Mov);
   }
   // Add implicit super-register defs and kills to the last instruction.
   Mov->addRegisterDefined(DestReg, TRI);
@@ -1213,16 +1202,6 @@ bool ARMBaseInstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const{
   return true;
 }
 
-MachineInstr*
-ARMBaseInstrInfo::emitFrameIndexDebugValue(MachineFunction &MF,
-                                           int FrameIx, uint64_t Offset,
-                                           const MDNode *MDPtr,
-                                           DebugLoc DL) const {
-  MachineInstrBuilder MIB = BuildMI(MF, DL, get(ARM::DBG_VALUE))
-    .addFrameIndex(FrameIx).addImm(0).addImm(Offset).addMetadata(MDPtr);
-  return &*MIB;
-}
-
 /// Create a copy of a const pool value. Update CPI to the new index and return
 /// the label UID.
 static unsigned duplicateCPV(MachineFunction &MF, unsigned &CPI) {
@@ -3684,8 +3663,7 @@ hasHighOperandLatency(const InstrItineraryData *ItinData,
     return true;
 
   // Hoist VFP / NEON instructions with 4 or higher latency.
-  int Latency = computeOperandLatency(ItinData, DefMI, DefIdx, UseMI, UseIdx,
-                                      /*FindMin=*/false);
+  int Latency = computeOperandLatency(ItinData, DefMI, DefIdx, UseMI, UseIdx);
   if (Latency < 0)
     Latency = getInstrLatency(ItinData, DefMI);
   if (Latency <= 3)
diff --git a/lib/Target/ARM/ARMBaseInstrInfo.h b/lib/Target/ARM/ARMBaseInstrInfo.h
index 4ca3d7b..96f8637 100644
--- a/lib/Target/ARM/ARMBaseInstrInfo.h
+++ b/lib/Target/ARM/ARMBaseInstrInfo.h
@@ -125,12 +125,6 @@ public:
 
   virtual bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const;
 
-  virtual MachineInstr *emitFrameIndexDebugValue(MachineFunction &MF,
-                                                 int FrameIx,
-                                                 uint64_t Offset,
-                                                 const MDNode *MDPtr,
-                                                 DebugLoc DL) const;
-
   virtual void reMaterialize(MachineBasicBlock &MBB,
                              MachineBasicBlock::iterator MI,
                              unsigned DestReg, unsigned SubIdx,
diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/lib/Target/ARM/ARMBaseRegisterInfo.cpp
index 7c03055..58c06e3 100644
--- a/lib/Target/ARM/ARMBaseRegisterInfo.cpp
+++ b/lib/Target/ARM/ARMBaseRegisterInfo.cpp
@@ -58,30 +58,44 @@ ARMBaseRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
     ghcCall = (F ? F->getCallingConv() == CallingConv::GHC : false);
   }
  
-  if (ghcCall) {
-      return CSR_GHC_SaveList;
-  }
-  else {
-  return (STI.isTargetIOS() && !STI.isAAPCS_ABI())
-    ? CSR_iOS_SaveList : CSR_AAPCS_SaveList;
-  }
+  if (ghcCall)
+    // GHC set of callee saved regs is empty as all those regs are
+    // used for passing STG regs around
+    return CSR_NoRegs_SaveList;
+  else
+    return (STI.isTargetIOS() && !STI.isAAPCS_ABI())
+      ? CSR_iOS_SaveList : CSR_AAPCS_SaveList;
 }
 
 const uint32_t*
-ARMBaseRegisterInfo::getCallPreservedMask(CallingConv::ID) const {
+ARMBaseRegisterInfo::getCallPreservedMask(CallingConv::ID CC) const {
+  if (CC == CallingConv::GHC)
+    // This is academic becase all GHC calls are (supposed to be) tail calls
+    return CSR_NoRegs_RegMask;
   return (STI.isTargetIOS() && !STI.isAAPCS_ABI())
     ? CSR_iOS_RegMask : CSR_AAPCS_RegMask;
 }
 
 const uint32_t*
-ARMBaseRegisterInfo::getThisReturnPreservedMask(CallingConv::ID) const {
-  return (STI.isTargetIOS() && !STI.isAAPCS_ABI())
-    ? CSR_iOS_ThisReturn_RegMask : CSR_AAPCS_ThisReturn_RegMask;
+ARMBaseRegisterInfo::getNoPreservedMask() const {
+  return CSR_NoRegs_RegMask;
 }
 
 const uint32_t*
-ARMBaseRegisterInfo::getNoPreservedMask() const {
-  return CSR_NoRegs_RegMask;
+ARMBaseRegisterInfo::getThisReturnPreservedMask(CallingConv::ID CC) const {
+  // This should return a register mask that is the same as that returned by
+  // getCallPreservedMask but that additionally preserves the register used for
+  // the first i32 argument (which must also be the register used to return a
+  // single i32 return value)
+  //
+  // In case that the calling convention does not use the same register for
+  // both or otherwise does not want to enable this optimization, the function
+  // should return NULL
+  if (CC == CallingConv::GHC)
+    // This is academic becase all GHC calls are (supposed to be) tail calls
+    return NULL;
+  return (STI.isTargetIOS() && !STI.isAAPCS_ABI())
+    ? CSR_iOS_ThisReturn_RegMask : CSR_AAPCS_ThisReturn_RegMask;
 }
 
 BitVector ARMBaseRegisterInfo::
@@ -309,7 +323,7 @@ bool ARMBaseRegisterInfo::canRealignStack(const MachineFunction &MF) const {
   // 1. Dynamic stack realignment is explicitly disabled,
   // 2. This is a Thumb1 function (it's not useful, so we don't bother), or
   // 3. There are VLAs in the function and the base pointer is disabled.
-  if (!MF.getTarget().Options.RealignStack)
+  if (MF.getFunction()->hasFnAttribute("no-realign-stack"))
     return false;
   if (AFI->isThumb1OnlyFunction())
     return false;
@@ -702,12 +716,7 @@ ARMBaseRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
   }
 #endif // NDEBUG
 
-  // Special handling of dbg_value instructions.
-  if (MI.isDebugValue()) {
-    MI.getOperand(FIOperandNum).  ChangeToRegister(FrameReg, false /*isDef*/);
-    MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset);
-    return;
-  }
+  assert(!MI.isDebugValue() && "DBG_VALUEs should be handled in target-independent code");
 
   // Modify MI as necessary to handle as much of 'Offset' as possible
   bool Done = false;
diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.h b/lib/Target/ARM/ARMBaseRegisterInfo.h
index 03b3682..cdaad05 100644
--- a/lib/Target/ARM/ARMBaseRegisterInfo.h
+++ b/lib/Target/ARM/ARMBaseRegisterInfo.h
@@ -94,9 +94,18 @@ public:
   /// Code Generation virtual methods...
   const uint16_t *getCalleeSavedRegs(const MachineFunction *MF = 0) const;
   const uint32_t *getCallPreservedMask(CallingConv::ID) const;
-  const uint32_t *getThisReturnPreservedMask(CallingConv::ID) const;
   const uint32_t *getNoPreservedMask() const;
 
+  /// getThisReturnPreservedMask - Returns a call preserved mask specific to the
+  /// case that 'returned' is on an i32 first argument if the calling convention
+  /// is one that can (partially) model this attribute with a preserved mask
+  /// (i.e. it is a calling convention that uses the same register for the first
+  /// i32 argument and an i32 return value)
+  ///
+  /// Should return NULL in the case that the calling convention does not have
+  /// this property
+  const uint32_t *getThisReturnPreservedMask(CallingConv::ID) const;
+  
   BitVector getReservedRegs(const MachineFunction &MF) const;
 
   const TargetRegisterClass*
diff --git a/lib/Target/ARM/ARMBuildAttrs.h b/lib/Target/ARM/ARMBuildAttrs.h
index 11bd6a4..f614dca 100644
--- a/lib/Target/ARM/ARMBuildAttrs.h
+++ b/lib/Target/ARM/ARMBuildAttrs.h
@@ -89,7 +89,8 @@ namespace ARMBuildAttrs {
     v7       = 10,  // e.g. Cortex A8, Cortex M3
     v6_M     = 11,  // e.g. Cortex M1
     v6S_M    = 12,  // v6_M with the System extensions
-    v7E_M    = 13   // v7_M with DSP extensions
+    v7E_M    = 13,  // v7_M with DSP extensions
+    v8       = 14   // v8, AArch32
   };
 
   enum CPUArchProfile { // (=7), uleb128 
@@ -105,6 +106,7 @@ namespace ARMBuildAttrs {
     //ARMISAUse (=8), uleb128  and THUMBISAUse (=9), uleb128
     Not_Allowed = 0,
     Allowed = 1,
+    AllowedNeonV8 = 3,
 
     // FP_arch (=10), uleb128 (formerly Tag_VFP_arch = 10)
     AllowFPv2  = 2, // v2 FP ISA permitted (implies use of the v1 FP ISA)
@@ -112,6 +114,8 @@ namespace ARMBuildAttrs {
     AllowFPv3B = 4, // v3 FP ISA permitted, but only D0-D15, S0-S31 
     AllowFPv4A = 5, // v4 FP ISA permitted (implies use of v3 FP ISA) 
     AllowFPv4B = 6, // v4 FP ISA was permitted, but only D0-D15, S0-S31
+    AllowV8FPA = 7, // Use of the ARM v8-A FP ISA was permitted
+    AllowV8FPB = 8, // Use of the ARM v8-A FP ISA was permitted, but only D0-D15, S0-S31
 
     // Tag_WMMX_arch, (=11), uleb128
     AllowThumb32 = 2, // 32-bit Thumb (implies 16-bit instructions)
diff --git a/lib/Target/ARM/ARMCallingConv.td b/lib/Target/ARM/ARMCallingConv.td
index 8ff666e..89c5223 100644
--- a/lib/Target/ARM/ARMCallingConv.td
+++ b/lib/Target/ARM/ARMCallingConv.td
@@ -208,9 +208,3 @@ def CSR_iOS : CalleeSavedRegs<(add LR, R7, R6, R5, R4, (sub CSR_AAPCS, R9))>;
 
 def CSR_iOS_ThisReturn : CalleeSavedRegs<(add LR, R7, R6, R5, R4,
                                           (sub CSR_AAPCS_ThisReturn, R9))>;
-
-// GHC set of callee saved regs is empty as all those regs are
-// used for passing STG regs around
-// add is a workaround for not being able to compile empty list:
-// def CSR_GHC : CalleeSavedRegs<()>;
-def CSR_GHC : CalleeSavedRegs<(add)>;
diff --git a/lib/Target/ARM/ARMCodeEmitter.cpp b/lib/Target/ARM/ARMCodeEmitter.cpp
index 4a157d7..96eb764 100644
--- a/lib/Target/ARM/ARMCodeEmitter.cpp
+++ b/lib/Target/ARM/ARMCodeEmitter.cpp
@@ -171,6 +171,8 @@ namespace {
       const { return 0; }
     unsigned NEONThumb2DupPostEncoder(const MachineInstr &MI,unsigned Val)
       const { return 0; }
+    unsigned NEONThumb2V8PostEncoder(const MachineInstr &MI,unsigned Val)
+      const { return 0; }
     unsigned VFPThumb2PostEncoder(const MachineInstr&MI, unsigned Val)
       const {
       if (IsThumb) {
diff --git a/lib/Target/ARM/ARMFastISel.cpp b/lib/Target/ARM/ARMFastISel.cpp
index a4de941..ed054aa 100644
--- a/lib/Target/ARM/ARMFastISel.cpp
+++ b/lib/Target/ARM/ARMFastISel.cpp
@@ -42,7 +42,6 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/GetElementPtrTypeIterator.h"
-#include "llvm/Support/MathExtras.h"
 #include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetLowering.h"
 #include "llvm/Target/TargetMachine.h"
@@ -630,6 +629,11 @@ unsigned ARMFastISel::ARMMaterializeGV(const GlobalValue *GV, MVT VT) {
     (const TargetRegisterClass*)&ARM::GPRRegClass;
   unsigned DestReg = createResultReg(RC);
 
+  // FastISel TLS support on non-Darwin is broken, punt to SelectionDAG.
+  const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV);
+  bool IsThreadLocal = GVar && GVar->isThreadLocal();
+  if (!Subtarget->isTargetDarwin() && IsThreadLocal) return 0;
+
   // Use movw+movt when possible, it avoids constant pool entries.
   // Darwin targets don't support movt with Reloc::Static, see
   // ARMTargetLowering::LowerGlobalAddressDarwin.  Other targets only support
@@ -816,22 +820,19 @@ bool ARMFastISel::ARMComputeAddress(const Value *Obj, Address &Addr) {
   switch (Opcode) {
     default:
     break;
-    case Instruction::BitCast: {
+    case Instruction::BitCast:
       // Look through bitcasts.
       return ARMComputeAddress(U->getOperand(0), Addr);
-    }
-    case Instruction::IntToPtr: {
+    case Instruction::IntToPtr:
       // Look past no-op inttoptrs.
       if (TLI.getValueType(U->getOperand(0)->getType()) == TLI.getPointerTy())
         return ARMComputeAddress(U->getOperand(0), Addr);
       break;
-    }
-    case Instruction::PtrToInt: {
+    case Instruction::PtrToInt:
       // Look past no-op ptrtoints.
       if (TLI.getValueType(U->getType()) == TLI.getPointerTy())
         return ARMComputeAddress(U->getOperand(0), Addr);
       break;
-    }
     case Instruction::GetElementPtr: {
       Address SavedAddr = Addr;
       int TmpOffset = Addr.Offset;
@@ -2184,10 +2185,14 @@ unsigned ARMFastISel::ARMSelectCallOp(bool UseReg) {
 }
 
 unsigned ARMFastISel::getLibcallReg(const Twine &Name) {
+  // Manually compute the global's type to avoid building it when unnecessary.
+  Type *GVTy = Type::getInt32PtrTy(*Context, /*AS=*/0);
+  EVT LCREVT = TLI.getValueType(GVTy);
+  if (!LCREVT.isSimple()) return 0;
+
   GlobalValue *GV = new GlobalVariable(Type::getInt32Ty(*Context), false,
                                        GlobalValue::ExternalLinkage, 0, Name);
-  EVT LCREVT = TLI.getValueType(GV->getType());
-  if (!LCREVT.isSimple()) return 0;
+  assert(GV->getType() == GVTy && "We miscomputed the type for the global!");
   return ARMMaterializeGV(GV, LCREVT.getSimpleVT());
 }
 
@@ -2629,34 +2634,46 @@ unsigned ARMFastISel::ARMEmitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT,
   };
 
   // Table governing the instruction(s) to be emitted.
-  static const struct {
-    // First entry for each of the following is sext, second zext.
-    uint16_t Opc[2];
-    uint8_t Imm[2];   // All instructions have either a shift or a mask.
-    uint8_t hasS[2];  // Some instructions have an S bit, always set it to 0.
-  } OpcTbl[2][2][3] = {
+  static const struct InstructionTable {
+    uint32_t Opc   : 16;
+    uint32_t hasS  :  1; // Some instructions have an S bit, always set it to 0.
+    uint32_t Shift :  7; // For shift operand addressing mode, used by MOVsi.
+    uint32_t Imm   :  8; // All instructions have either a shift or a mask.
+  } IT[2][2][3][2] = {
     { // Two instructions (first is left shift, second is in this table).
-      { // ARM
-        /*  1 */ { { ARM::ASRi,   ARM::LSRi    }, {  31,  31 }, { 1, 1 } },
-        /*  8 */ { { ARM::ASRi,   ARM::LSRi    }, {  24,  24 }, { 1, 1 } },
-        /* 16 */ { { ARM::ASRi,   ARM::LSRi    }, {  16,  16 }, { 1, 1 } }
+      { // ARM                Opc           S  Shift             Imm
+        /*  1 bit sext */ { { ARM::MOVsi  , 1, ARM_AM::asr     ,  31 },
+        /*  1 bit zext */   { ARM::MOVsi  , 1, ARM_AM::lsr     ,  31 } },
+        /*  8 bit sext */ { { ARM::MOVsi  , 1, ARM_AM::asr     ,  24 },
+        /*  8 bit zext */   { ARM::MOVsi  , 1, ARM_AM::lsr     ,  24 } },
+        /* 16 bit sext */ { { ARM::MOVsi  , 1, ARM_AM::asr     ,  16 },
+        /* 16 bit zext */   { ARM::MOVsi  , 1, ARM_AM::lsr     ,  16 } }
       },
-      { // Thumb
-        /*  1 */ { { ARM::tASRri, ARM::tLSRri  }, {  31,  31 }, { 0, 0 } },
-        /*  8 */ { { ARM::tASRri, ARM::tLSRri  }, {  24,  24 }, { 0, 0 } },
-        /* 16 */ { { ARM::tASRri, ARM::tLSRri  }, {  16,  16 }, { 0, 0 } }
+      { // Thumb              Opc           S  Shift             Imm
+        /*  1 bit sext */ { { ARM::tASRri , 0, ARM_AM::no_shift,  31 },
+        /*  1 bit zext */   { ARM::tLSRri , 0, ARM_AM::no_shift,  31 } },
+        /*  8 bit sext */ { { ARM::tASRri , 0, ARM_AM::no_shift,  24 },
+        /*  8 bit zext */   { ARM::tLSRri , 0, ARM_AM::no_shift,  24 } },
+        /* 16 bit sext */ { { ARM::tASRri , 0, ARM_AM::no_shift,  16 },
+        /* 16 bit zext */   { ARM::tLSRri , 0, ARM_AM::no_shift,  16 } }
       }
     },
     { // Single instruction.
-      { // ARM
-        /*  1 */ { { ARM::KILL,   ARM::ANDri   }, {   0,   1 }, { 0, 1 } },
-        /*  8 */ { { ARM::SXTB,   ARM::ANDri   }, {   0, 255 }, { 0, 1 } },
-        /* 16 */ { { ARM::SXTH,   ARM::UXTH    }, {   0,   0 }, { 0, 0 } }
+      { // ARM                Opc           S  Shift             Imm
+        /*  1 bit sext */ { { ARM::KILL   , 0, ARM_AM::no_shift,   0 },
+        /*  1 bit zext */   { ARM::ANDri  , 1, ARM_AM::no_shift,   1 } },
+        /*  8 bit sext */ { { ARM::SXTB   , 0, ARM_AM::no_shift,   0 },
+        /*  8 bit zext */   { ARM::ANDri  , 1, ARM_AM::no_shift, 255 } },
+        /* 16 bit sext */ { { ARM::SXTH   , 0, ARM_AM::no_shift,   0 },
+        /* 16 bit zext */   { ARM::UXTH   , 0, ARM_AM::no_shift,   0 } }
       },
-      { // Thumb
-        /*  1 */ { { ARM::KILL,   ARM::t2ANDri }, {   0,   1 }, { 0, 1 } },
-        /*  8 */ { { ARM::t2SXTB, ARM::t2ANDri }, {   0, 255 }, { 0, 1 } },
-        /* 16 */ { { ARM::t2SXTH, ARM::t2UXTH  }, {   0,   0 }, { 0, 0 } }
+      { // Thumb              Opc           S  Shift             Imm
+        /*  1 bit sext */ { { ARM::KILL   , 0, ARM_AM::no_shift,   0 },
+        /*  1 bit zext */   { ARM::t2ANDri, 1, ARM_AM::no_shift,   1 } },
+        /*  8 bit sext */ { { ARM::t2SXTB , 0, ARM_AM::no_shift,   0 },
+        /*  8 bit zext */   { ARM::t2ANDri, 1, ARM_AM::no_shift, 255 } },
+        /* 16 bit sext */ { { ARM::t2SXTH , 0, ARM_AM::no_shift,   0 },
+        /* 16 bit zext */   { ARM::t2UXTH , 0, ARM_AM::no_shift,   0 } }
       }
     }
   };
@@ -2671,20 +2688,28 @@ unsigned ARMFastISel::ARMEmitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT,
          "other sizes unimplemented");
 
   bool hasV6Ops = Subtarget->hasV6Ops();
-  unsigned Bitness = countTrailingZeros(SrcBits) >> 1;  // {1,8,16}=>{0,1,2}
+  unsigned Bitness = SrcBits / 8;  // {1,8,16}=>{0,1,2}
   assert((Bitness < 3) && "sanity-check table bounds");
 
   bool isSingleInstr = isSingleInstrTbl[Bitness][isThumb2][hasV6Ops][isZExt];
   const TargetRegisterClass *RC = RCTbl[isThumb2][isSingleInstr];
-  unsigned Opc = OpcTbl[isSingleInstr][isThumb2][Bitness].Opc[isZExt];
+  const InstructionTable *ITP = &IT[isSingleInstr][isThumb2][Bitness][isZExt];
+  unsigned Opc = ITP->Opc;
   assert(ARM::KILL != Opc && "Invalid table entry");
-  unsigned Imm = OpcTbl[isSingleInstr][isThumb2][Bitness].Imm[isZExt];
-  unsigned hasS = OpcTbl[isSingleInstr][isThumb2][Bitness].hasS[isZExt];
+  unsigned hasS = ITP->hasS;
+  ARM_AM::ShiftOpc Shift = (ARM_AM::ShiftOpc) ITP->Shift;
+  assert(((Shift == ARM_AM::no_shift) == (Opc != ARM::MOVsi)) &&
+         "only MOVsi has shift operand addressing mode");
+  unsigned Imm = ITP->Imm;
 
   // 16-bit Thumb instructions always set CPSR (unless they're in an IT block).
   bool setsCPSR = &ARM::tGPRRegClass == RC;
-  unsigned LSLOpc = isThumb2 ? ARM::tLSLri : ARM::LSLi;
+  unsigned LSLOpc = isThumb2 ? ARM::tLSLri : ARM::MOVsi;
   unsigned ResultReg;
+  // MOVsi encodes shift and immediate in shift operand addressing mode.
+  // The following condition has the same value when emitting two
+  // instruction sequences: both are shifts.
+  bool ImmIsSO = (Shift != ARM_AM::no_shift);
 
   // Either one or two instructions are emitted.
   // They're always of the form:
@@ -2697,13 +2722,16 @@ unsigned ARMFastISel::ARMEmitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT,
   unsigned NumInstrsEmitted = isSingleInstr ? 1 : 2;
   for (unsigned Instr = 0; Instr != NumInstrsEmitted; ++Instr) {
     ResultReg = createResultReg(RC);
-    unsigned Opcode = ((0 == Instr) && !isSingleInstr) ? LSLOpc : Opc;
+    bool isLsl = (0 == Instr) && !isSingleInstr;
+    unsigned Opcode = isLsl ? LSLOpc : Opc;
+    ARM_AM::ShiftOpc ShiftAM = isLsl ? ARM_AM::lsl : Shift;
+    unsigned ImmEnc = ImmIsSO ? ARM_AM::getSORegOpc(ShiftAM, Imm) : Imm;
     bool isKill = 1 == Instr;
     MachineInstrBuilder MIB = BuildMI(
         *FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opcode), ResultReg);
     if (setsCPSR)
       MIB.addReg(ARM::CPSR, RegState::Define);
-    AddDefaultPred(MIB.addReg(SrcReg, isKill * RegState::Kill).addImm(Imm));
+    AddDefaultPred(MIB.addReg(SrcReg, isKill * RegState::Kill).addImm(ImmEnc));
     if (hasS)
       AddDefaultCC(MIB);
     // Second instruction consumes the first's result.
@@ -3025,8 +3053,6 @@ bool ARMFastISel::FastLowerArguments() {
   Idx = 0;
   for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
        I != E; ++I, ++Idx) {
-    if (I->use_empty())
-      continue;
     unsigned SrcReg = GPRArgRegs[Idx];
     unsigned DstReg = FuncInfo.MF->addLiveIn(SrcReg, RC);
     // FIXME: Unfortunately it's necessary to emit a copy from the livein copy.
@@ -3044,13 +3070,23 @@ bool ARMFastISel::FastLowerArguments() {
 namespace llvm {
   FastISel *ARM::createFastISel(FunctionLoweringInfo &funcInfo,
                                 const TargetLibraryInfo *libInfo) {
-    // Completely untested on non-iOS.
     const TargetMachine &TM = funcInfo.MF->getTarget();
 
-    // Darwin and thumb1 only for now.
     const ARMSubtarget *Subtarget = &TM.getSubtarget<ARMSubtarget>();
-    if (Subtarget->isTargetIOS() && !Subtarget->isThumb1Only())
+    // Thumb2 support on iOS; ARM support on iOS, Linux and NaCl.
+    bool UseFastISel = false;
+    UseFastISel |= Subtarget->isTargetIOS() && !Subtarget->isThumb1Only();
+    UseFastISel |= Subtarget->isTargetLinux() && !Subtarget->isThumb();
+    UseFastISel |= Subtarget->isTargetNaCl() && !Subtarget->isThumb();
+
+    if (UseFastISel) {
+      // iOS always has a FP for backtracking, force other targets
+      // to keep their FP when doing FastISel. The emitted code is
+      // currently superior, and in cases like test-suite's lencod
+      // FastISel isn't quite correct when FP is eliminated.
+      TM.Options.NoFramePointerElim = true;
       return new ARMFastISel(funcInfo, libInfo);
+    }
     return 0;
   }
 }
diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp
index 962368d..4ca3af6 100644
--- a/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -61,7 +61,6 @@ enum AddrMode2Type {
 
 class ARMDAGToDAGISel : public SelectionDAGISel {
   ARMBaseTargetMachine &TM;
-  const ARMBaseInstrInfo *TII;
 
   /// Subtarget - Keep a pointer to the ARMSubtarget around so that we can
   /// make the right decision when generating code for different targets.
@@ -71,7 +70,6 @@ public:
   explicit ARMDAGToDAGISel(ARMBaseTargetMachine &tm,
                            CodeGenOpt::Level OptLevel)
     : SelectionDAGISel(tm, OptLevel), TM(tm),
-      TII(static_cast<const ARMBaseInstrInfo*>(TM.getInstrInfo())),
       Subtarget(&TM.getSubtarget<ARMSubtarget>()) {
   }
 
@@ -177,6 +175,7 @@ public:
                                  SDValue &OffImm);
   bool SelectT2AddrModeSoReg(SDValue N, SDValue &Base,
                              SDValue &OffReg, SDValue &ShImm);
+  bool SelectT2AddrModeExclusive(SDValue N, SDValue &Base, SDValue &OffImm);
 
   inline bool is_so_imm(unsigned Imm) const {
     return ARM_AM::getSOImmVal(Imm) != -1;
@@ -423,7 +422,7 @@ bool ARMDAGToDAGISel::hasNoVMLxHazardUse(SDNode *N) const {
   if (!CheckVMLxHazard)
     return true;
 
-  if (!Subtarget->isCortexA8() && !Subtarget->isLikeA9() &&
+  if (!Subtarget->isCortexA8() && !Subtarget->isCortexA9() &&
       !Subtarget->isSwift())
     return true;
 
@@ -434,6 +433,9 @@ bool ARMDAGToDAGISel::hasNoVMLxHazardUse(SDNode *N) const {
   if (Use->getOpcode() == ISD::CopyToReg)
     return true;
   if (Use->isMachineOpcode()) {
+    const ARMBaseInstrInfo *TII =
+      static_cast<const ARMBaseInstrInfo*>(TM.getInstrInfo());
+
     const MCInstrDesc &MCID = TII->get(Use->getMachineOpcode());
     if (MCID.mayStore())
       return true;
@@ -533,7 +535,8 @@ bool ARMDAGToDAGISel::SelectAddrModeImm12(SDValue N,
     if (N.getOpcode() == ISD::FrameIndex) {
       // Match frame index.
       int FI = cast<FrameIndexSDNode>(N)->getIndex();
-      Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy());
+      Base = CurDAG->getTargetFrameIndex(FI,
+                                         getTargetLowering()->getPointerTy());
       OffImm  = CurDAG->getTargetConstant(0, MVT::i32);
       return true;
     }
@@ -557,7 +560,8 @@ bool ARMDAGToDAGISel::SelectAddrModeImm12(SDValue N,
       Base   = N.getOperand(0);
       if (Base.getOpcode() == ISD::FrameIndex) {
         int FI = cast<FrameIndexSDNode>(Base)->getIndex();
-        Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy());
+        Base = CurDAG->getTargetFrameIndex(FI,
+                                           getTargetLowering()->getPointerTy());
       }
       OffImm = CurDAG->getTargetConstant(RHSC, MVT::i32);
       return true;
@@ -703,7 +707,8 @@ AddrMode2Type ARMDAGToDAGISel::SelectAddrMode2Worker(SDValue N,
     Base = N;
     if (N.getOpcode() == ISD::FrameIndex) {
       int FI = cast<FrameIndexSDNode>(N)->getIndex();
-      Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy());
+      Base = CurDAG->getTargetFrameIndex(FI,
+                                         getTargetLowering()->getPointerTy());
     } else if (N.getOpcode() == ARMISD::Wrapper &&
                !(Subtarget->useMovt() &&
                  N.getOperand(0).getOpcode() == ISD::TargetGlobalAddress)) {
@@ -724,7 +729,8 @@ AddrMode2Type ARMDAGToDAGISel::SelectAddrMode2Worker(SDValue N,
       Base = N.getOperand(0);
       if (Base.getOpcode() == ISD::FrameIndex) {
         int FI = cast<FrameIndexSDNode>(Base)->getIndex();
-        Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy());
+        Base = CurDAG->getTargetFrameIndex(FI,
+                                           getTargetLowering()->getPointerTy());
       }
       Offset = CurDAG->getRegister(0, MVT::i32);
 
@@ -901,7 +907,8 @@ bool ARMDAGToDAGISel::SelectAddrMode3(SDValue N,
     Base = N;
     if (N.getOpcode() == ISD::FrameIndex) {
       int FI = cast<FrameIndexSDNode>(N)->getIndex();
-      Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy());
+      Base = CurDAG->getTargetFrameIndex(FI,
+                                         getTargetLowering()->getPointerTy());
     }
     Offset = CurDAG->getRegister(0, MVT::i32);
     Opc = CurDAG->getTargetConstant(ARM_AM::getAM3Opc(ARM_AM::add, 0),MVT::i32);
@@ -915,7 +922,8 @@ bool ARMDAGToDAGISel::SelectAddrMode3(SDValue N,
     Base = N.getOperand(0);
     if (Base.getOpcode() == ISD::FrameIndex) {
       int FI = cast<FrameIndexSDNode>(Base)->getIndex();
-      Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy());
+      Base = CurDAG->getTargetFrameIndex(FI,
+                                         getTargetLowering()->getPointerTy());
     }
     Offset = CurDAG->getRegister(0, MVT::i32);
 
@@ -960,7 +968,8 @@ bool ARMDAGToDAGISel::SelectAddrMode5(SDValue N,
     Base = N;
     if (N.getOpcode() == ISD::FrameIndex) {
       int FI = cast<FrameIndexSDNode>(N)->getIndex();
-      Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy());
+      Base = CurDAG->getTargetFrameIndex(FI,
+                                         getTargetLowering()->getPointerTy());
     } else if (N.getOpcode() == ARMISD::Wrapper &&
                !(Subtarget->useMovt() &&
                  N.getOperand(0).getOpcode() == ISD::TargetGlobalAddress)) {
@@ -978,7 +987,8 @@ bool ARMDAGToDAGISel::SelectAddrMode5(SDValue N,
     Base = N.getOperand(0);
     if (Base.getOpcode() == ISD::FrameIndex) {
       int FI = cast<FrameIndexSDNode>(Base)->getIndex();
-      Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy());
+      Base = CurDAG->getTargetFrameIndex(FI,
+                                         getTargetLowering()->getPointerTy());
     }
 
     ARM_AM::AddrOpc AddSub = ARM_AM::add;
@@ -1202,7 +1212,8 @@ bool ARMDAGToDAGISel::SelectThumbAddrModeSP(SDValue N,
                                             SDValue &Base, SDValue &OffImm) {
   if (N.getOpcode() == ISD::FrameIndex) {
     int FI = cast<FrameIndexSDNode>(N)->getIndex();
-    Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy());
+    Base = CurDAG->getTargetFrameIndex(FI,
+                                       getTargetLowering()->getPointerTy());
     OffImm = CurDAG->getTargetConstant(0, MVT::i32);
     return true;
   }
@@ -1219,7 +1230,8 @@ bool ARMDAGToDAGISel::SelectThumbAddrModeSP(SDValue N,
       Base = N.getOperand(0);
       if (Base.getOpcode() == ISD::FrameIndex) {
         int FI = cast<FrameIndexSDNode>(Base)->getIndex();
-        Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy());
+        Base = CurDAG->getTargetFrameIndex(FI,
+                                           getTargetLowering()->getPointerTy());
       }
       OffImm = CurDAG->getTargetConstant(RHSC, MVT::i32);
       return true;
@@ -1267,7 +1279,8 @@ bool ARMDAGToDAGISel::SelectT2AddrModeImm12(SDValue N,
     if (N.getOpcode() == ISD::FrameIndex) {
       // Match frame index.
       int FI = cast<FrameIndexSDNode>(N)->getIndex();
-      Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy());
+      Base = CurDAG->getTargetFrameIndex(FI,
+                                         getTargetLowering()->getPointerTy());
       OffImm  = CurDAG->getTargetConstant(0, MVT::i32);
       return true;
     }
@@ -1297,7 +1310,8 @@ bool ARMDAGToDAGISel::SelectT2AddrModeImm12(SDValue N,
       Base   = N.getOperand(0);
       if (Base.getOpcode() == ISD::FrameIndex) {
         int FI = cast<FrameIndexSDNode>(Base)->getIndex();
-        Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy());
+        Base = CurDAG->getTargetFrameIndex(FI,
+                                           getTargetLowering()->getPointerTy());
       }
       OffImm = CurDAG->getTargetConstant(RHSC, MVT::i32);
       return true;
@@ -1326,7 +1340,8 @@ bool ARMDAGToDAGISel::SelectT2AddrModeImm8(SDValue N,
       Base = N.getOperand(0);
       if (Base.getOpcode() == ISD::FrameIndex) {
         int FI = cast<FrameIndexSDNode>(Base)->getIndex();
-        Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy());
+        Base = CurDAG->getTargetFrameIndex(FI,
+                                           getTargetLowering()->getPointerTy());
       }
       OffImm = CurDAG->getTargetConstant(RHSC, MVT::i32);
       return true;
@@ -1403,6 +1418,34 @@ bool ARMDAGToDAGISel::SelectT2AddrModeSoReg(SDValue N,
   return true;
 }
 
+bool ARMDAGToDAGISel::SelectT2AddrModeExclusive(SDValue N, SDValue &Base,
+                                                SDValue &OffImm) {
+  // This *must* succeed since it's used for the irreplacable ldrex and strex
+  // instructions.
+  Base = N;
+  OffImm = CurDAG->getTargetConstant(0, MVT::i32);
+
+  if (N.getOpcode() != ISD::ADD || !CurDAG->isBaseWithConstantOffset(N))
+    return true;
+
+  ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1));
+  if (!RHS)
+    return true;
+
+  uint32_t RHSC = (int)RHS->getZExtValue();
+  if (RHSC > 1020 || RHSC % 4 != 0)
+    return true;
+
+  Base = N.getOperand(0);
+  if (Base.getOpcode() == ISD::FrameIndex) {
+    int FI = cast<FrameIndexSDNode>(Base)->getIndex();
+    Base = CurDAG->getTargetFrameIndex(FI, getTargetLowering()->getPointerTy());
+  }
+
+  OffImm = CurDAG->getTargetConstant(RHSC / 4, MVT::i32);
+  return true;
+}
+
 //===--------------------------------------------------------------------===//
 
 /// getAL - Returns a ARMCC::AL immediate node.
@@ -2587,7 +2630,7 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
       SDValue CPIdx =
         CurDAG->getTargetConstantPool(ConstantInt::get(
                                   Type::getInt32Ty(*CurDAG->getContext()), Val),
-                                      TLI->getPointerTy());
+                                      getTargetLowering()->getPointerTy());
 
       SDNode *ResNode;
       if (Subtarget->isThumb1Only()) {
@@ -2617,7 +2660,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) {
   case ISD::FrameIndex: {
     // Selects to ADDri FI, 0 which in turn will become ADDri SP, imm.
     int FI = cast<FrameIndexSDNode>(N)->getIndex();
-    SDValue TFI = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy());
+    SDValue TFI = CurDAG->getTargetFrameIndex(FI,
+                                           getTargetLowering()->getPointerTy());
     if (Subtarget->isThumb1Only()) {
       SDValue Ops[] = { TFI, CurDAG->getTargetConstant(0, MVT::i32),
                         getAL(CurDAG), CurDAG->getRegister(0, MVT::i32) };
@@ -3449,24 +3493,20 @@ SDNode *ARMDAGToDAGISel::SelectInlineAsm(SDNode *N){
   bool Changed = false;
   unsigned NumOps = N->getNumOperands();
 
-  ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(
-      N->getOperand(InlineAsm::Op_AsmString));
-  StringRef AsmString = StringRef(S->getSymbol());
-
   // Normally, i64 data is bounded to two arbitrary GRPs for "%r" constraint.
   // However, some instrstions (e.g. ldrexd/strexd in ARM mode) require
   // (even/even+1) GPRs and use %n and %Hn to refer to the individual regs
   // respectively. Since there is no constraint to explicitly specify a
-  // reg pair, we search %H operand inside the asm string. If it is found, the
-  // transformation below enforces a GPRPair reg class for "%r" for 64-bit data.
-  if (AsmString.find(":H}") == StringRef::npos)
-    return NULL;
+  // reg pair, we use GPRPair reg class for "%r" for 64-bit data. For Thumb,
+  // the 64-bit data may be referred by H, Q, R modifiers, so we still pack
+  // them into a GPRPair.
 
   SDLoc dl(N);
-  SDValue Glue = N->getOperand(NumOps-1);
+  SDValue Glue = N->getGluedNode() ? N->getOperand(NumOps-1) : SDValue(0,0);
 
+  SmallVector<bool, 8> OpChanged;
   // Glue node will be appended late.
-  for(unsigned i = 0; i < NumOps -1; ++i) {
+  for(unsigned i = 0, e = N->getGluedNode() ? NumOps - 1 : NumOps; i < e; ++i) {
     SDValue op = N->getOperand(i);
     AsmNodeOperands.push_back(op);
 
@@ -3480,17 +3520,38 @@ SDNode *ARMDAGToDAGISel::SelectInlineAsm(SDNode *N){
     else
       continue;
 
+    // Immediate operands to inline asm in the SelectionDAG are modeled with
+    // two operands. The first is a constant of value InlineAsm::Kind_Imm, and
+    // the second is a constant with the value of the immediate. If we get here
+    // and we have a Kind_Imm, skip the next operand, and continue.
+    if (Kind == InlineAsm::Kind_Imm) {
+      SDValue op = N->getOperand(++i);
+      AsmNodeOperands.push_back(op);
+      continue;
+    }
+
+    unsigned NumRegs = InlineAsm::getNumOperandRegisters(Flag);
+    if (NumRegs)
+      OpChanged.push_back(false);
+
+    unsigned DefIdx = 0;
+    bool IsTiedToChangedOp = false;
+    // If it's a use that is tied with a previous def, it has no
+    // reg class constraint.
+    if (Changed && InlineAsm::isUseOperandTiedToDef(Flag, DefIdx))
+      IsTiedToChangedOp = OpChanged[DefIdx];
+
     if (Kind != InlineAsm::Kind_RegUse && Kind != InlineAsm::Kind_RegDef
         && Kind != InlineAsm::Kind_RegDefEarlyClobber)
       continue;
 
-    unsigned RegNum = InlineAsm::getNumOperandRegisters(Flag);
     unsigned RC;
     bool HasRC = InlineAsm::hasRegClassConstraint(Flag, RC);
-    if (!HasRC || RC != ARM::GPRRegClassID || RegNum != 2)
+    if ((!IsTiedToChangedOp && (!HasRC || RC != ARM::GPRRegClassID))
+        || NumRegs != 2)
       continue;
 
-    assert((i+2 < NumOps-1) && "Invalid number of operands in inline asm");
+    assert((i+2 < NumOps) && "Invalid number of operands in inline asm");
     SDValue V0 = N->getOperand(i+1);
     SDValue V1 = N->getOperand(i+2);
     unsigned Reg0 = cast<RegisterSDNode>(V0)->getReg();
@@ -3551,6 +3612,7 @@ SDNode *ARMDAGToDAGISel::SelectInlineAsm(SDNode *N){
     Changed = true;
 
     if(PairedReg.getNode()) {
+      OpChanged[OpChanged.size() -1 ] = true;
       Flag = InlineAsm::getFlagWord(Kind, 1 /* RegNum*/);
       Flag = InlineAsm::getFlagWordForRegClass(Flag, ARM::GPRPairRegClassID);
       // Replace the current flag.
@@ -3563,7 +3625,8 @@ SDNode *ARMDAGToDAGISel::SelectInlineAsm(SDNode *N){
     }
   }
 
-  AsmNodeOperands.push_back(Glue);
+  if (Glue.getNode())
+    AsmNodeOperands.push_back(Glue);
   if (!Changed)
     return NULL;
 
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index ec0e9c2..caec11e 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -74,7 +74,7 @@ namespace {
   class ARMCCState : public CCState {
   public:
     ARMCCState(CallingConv::ID CC, bool isVarArg, MachineFunction &MF,
-               const TargetMachine &TM, SmallVector<CCValAssign, 16> &locs,
+               const TargetMachine &TM, SmallVectorImpl<CCValAssign> &locs,
                LLVMContext &C, ParmContext PC)
         : CCState(CC, isVarArg, MF, TM, locs, C) {
       assert(((PC == Call) || (PC == Prologue)) &&
@@ -693,10 +693,36 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
     setOperationAction(ISD::SDIV,  MVT::i32, Expand);
     setOperationAction(ISD::UDIV,  MVT::i32, Expand);
   }
+
+  // FIXME: Also set divmod for SREM on EABI
   setOperationAction(ISD::SREM,  MVT::i32, Expand);
   setOperationAction(ISD::UREM,  MVT::i32, Expand);
-  setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
-  setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
+  // Register based DivRem for AEABI (RTABI 4.2)
+  if (Subtarget->isTargetAEABI()) {
+    setLibcallName(RTLIB::SDIVREM_I8,  "__aeabi_idivmod");
+    setLibcallName(RTLIB::SDIVREM_I16, "__aeabi_idivmod");
+    setLibcallName(RTLIB::SDIVREM_I32, "__aeabi_idivmod");
+    setLibcallName(RTLIB::SDIVREM_I64, "__aeabi_ldivmod");
+    setLibcallName(RTLIB::UDIVREM_I8,  "__aeabi_uidivmod");
+    setLibcallName(RTLIB::UDIVREM_I16, "__aeabi_uidivmod");
+    setLibcallName(RTLIB::UDIVREM_I32, "__aeabi_uidivmod");
+    setLibcallName(RTLIB::UDIVREM_I64, "__aeabi_uldivmod");
+
+    setLibcallCallingConv(RTLIB::SDIVREM_I8, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::SDIVREM_I16, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::SDIVREM_I32, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::SDIVREM_I64, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::UDIVREM_I8, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::UDIVREM_I16, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::UDIVREM_I32, CallingConv::ARM_AAPCS);
+    setLibcallCallingConv(RTLIB::UDIVREM_I64, CallingConv::ARM_AAPCS);
+
+    setOperationAction(ISD::SDIVREM, MVT::i32, Custom);
+    setOperationAction(ISD::UDIVREM, MVT::i32, Custom);
+  } else {
+    setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
+    setOperationAction(ISD::UDIVREM, MVT::i32, Expand);
+  }
 
   setOperationAction(ISD::GlobalAddress, MVT::i32,   Custom);
   setOperationAction(ISD::ConstantPool,  MVT::i32,   Custom);
@@ -717,8 +743,6 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
   if (!Subtarget->isTargetDarwin()) {
     // Non-Darwin platforms may return values in these registers via the
     // personality function.
-    setOperationAction(ISD::EHSELECTION,      MVT::i32,   Expand);
-    setOperationAction(ISD::EXCEPTIONADDR,    MVT::i32,   Expand);
     setExceptionPointerRegister(ARM::R0);
     setExceptionSelectorRegister(ARM::R1);
   }
@@ -1068,6 +1092,19 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
   case ARMISD::VST2LN_UPD:    return "ARMISD::VST2LN_UPD";
   case ARMISD::VST3LN_UPD:    return "ARMISD::VST3LN_UPD";
   case ARMISD::VST4LN_UPD:    return "ARMISD::VST4LN_UPD";
+
+  case ARMISD::ATOMADD64_DAG:     return "ATOMADD64_DAG";
+  case ARMISD::ATOMSUB64_DAG:     return "ATOMSUB64_DAG";
+  case ARMISD::ATOMOR64_DAG:      return "ATOMOR64_DAG";
+  case ARMISD::ATOMXOR64_DAG:     return "ATOMXOR64_DAG";
+  case ARMISD::ATOMAND64_DAG:     return "ATOMAND64_DAG";
+  case ARMISD::ATOMNAND64_DAG:    return "ATOMNAND64_DAG";
+  case ARMISD::ATOMSWAP64_DAG:    return "ATOMSWAP64_DAG";
+  case ARMISD::ATOMCMPXCHG64_DAG: return "ATOMCMPXCHG64_DAG";
+  case ARMISD::ATOMMIN64_DAG:     return "ATOMMIN64_DAG";
+  case ARMISD::ATOMUMIN64_DAG:    return "ATOMUMIN64_DAG";
+  case ARMISD::ATOMMAX64_DAG:     return "ATOMMAX64_DAG";
+  case ARMISD::ATOMUMAX64_DAG:    return "ATOMUMAX64_DAG";
   }
 }
 
@@ -1332,7 +1369,7 @@ void ARMTargetLowering::PassF64ArgInRegs(SDLoc dl, SelectionDAG &DAG,
                                          RegsToPassVector &RegsToPass,
                                          CCValAssign &VA, CCValAssign &NextVA,
                                          SDValue &StackPtr,
-                                         SmallVector<SDValue, 8> &MemOpChains,
+                                         SmallVectorImpl<SDValue> &MemOpChains,
                                          ISD::ArgFlagsTy Flags) const {
 
   SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
@@ -1360,9 +1397,9 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                              SmallVectorImpl<SDValue> &InVals) const {
   SelectionDAG &DAG                     = CLI.DAG;
   SDLoc &dl                          = CLI.DL;
-  SmallVector<ISD::OutputArg, 32> &Outs = CLI.Outs;
-  SmallVector<SDValue, 32> &OutVals     = CLI.OutVals;
-  SmallVector<ISD::InputArg, 32> &Ins   = CLI.Ins;
+  SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
+  SmallVectorImpl<SDValue> &OutVals     = CLI.OutVals;
+  SmallVectorImpl<ISD::InputArg> &Ins   = CLI.Ins;
   SDValue Chain                         = CLI.Chain;
   SDValue Callee                        = CLI.Callee;
   bool &isTailCall                      = CLI.IsTailCall;
@@ -1711,10 +1748,17 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   const uint32_t *Mask;
   const TargetRegisterInfo *TRI = getTargetMachine().getRegisterInfo();
   const ARMBaseRegisterInfo *ARI = static_cast<const ARMBaseRegisterInfo*>(TRI);
-  if (isThisReturn)
-    // For 'this' returns, use the R0-preserving mask
+  if (isThisReturn) {
+    // For 'this' returns, use the R0-preserving mask if applicable
     Mask = ARI->getThisReturnPreservedMask(CallConv);
-  else
+    if (!Mask) {
+      // Set isThisReturn to false if the calling convention is not one that
+      // allows 'returned' to be modeled in this way, so LowerCallResult does
+      // not try to pass 'this' straight through 
+      isThisReturn = false;
+      Mask = ARI->getCallPreservedMask(CallConv);
+    }
+  } else
     Mask = ARI->getCallPreservedMask(CallConv);
 
   assert(Mask && "Missing call preserved mask for calling convention");
@@ -2550,8 +2594,18 @@ static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG,
                        DAG.getConstant(0, MVT::i32));
   }
 
+  ConstantSDNode *OrdN = cast<ConstantSDNode>(Op.getOperand(1));
+  AtomicOrdering Ord = static_cast<AtomicOrdering>(OrdN->getZExtValue());
+  unsigned Domain = ARM_MB::ISH;
+  if (Subtarget->isSwift() && Ord == Release) {
+    // Swift happens to implement ISHST barriers in a way that's compatible with
+    // Release semantics but weaker than ISH so we'd be fools not to use
+    // it. Beware: other processors probably don't!
+    Domain = ARM_MB::ISHST;
+  }
+
   return DAG.getNode(ARMISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0),
-                     DAG.getConstant(ARM_MB::ISH, MVT::i32));
+                     DAG.getConstant(Domain, MVT::i32));
 }
 
 static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG,
@@ -2717,7 +2771,7 @@ ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG,
     lastRegToSaveIndex = REnd - ARM::R0;
   } else {
     firstRegToSaveIndex = CCInfo.getFirstUnallocated
-      (GPRArgRegs, sizeof(GPRArgRegs) / sizeof(GPRArgRegs[0]));
+      (GPRArgRegs, array_lengthof(GPRArgRegs));
     lastRegToSaveIndex = 4;
   }
 
@@ -4620,7 +4674,9 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
   if (ValueCounts.size() == 0)
     return DAG.getUNDEF(VT);
 
-  if (isOnlyLowElement)
+  // Loads are better lowered with insert_vector_elt/ARMISD::BUILD_VECTOR.
+  // Keep going if we are hitting this case.
+  if (isOnlyLowElement && !ISD::isNormalLoad(Value.getNode()))
     return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);
 
   unsigned EltSize = VT.getVectorElementType().getSizeInBits();
@@ -4719,6 +4775,24 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
     return DAG.getNode(ISD::BITCAST, dl, VT, Val);
   }
 
+  // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
+  // know the default expansion would otherwise fall back on something even
+  // worse. For a vector with one or two non-undef values, that's
+  // scalar_to_vector for the elements followed by a shuffle (provided the
+  // shuffle is valid for the target) and materialization element by element
+  // on the stack followed by a load for everything else.
+  if (!isConstant && !usesOnlyOneValue) {
+    SDValue Vec = DAG.getUNDEF(VT);
+    for (unsigned i = 0 ; i < NumElts; ++i) {
+      SDValue V = Op.getOperand(i);
+      if (V.getOpcode() == ISD::UNDEF)
+        continue;
+      SDValue LaneIdx = DAG.getConstant(i, MVT::i32);
+      Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
+    }
+    return Vec;
+  }
+
   return SDValue();
 }
 
@@ -5830,6 +5904,8 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::SUBE:          return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
   case ISD::ATOMIC_LOAD:
   case ISD::ATOMIC_STORE:  return LowerAtomicLoadStore(Op, DAG);
+  case ISD::SDIVREM:
+  case ISD::UDIVREM:       return LowerDivRem(Op, DAG);
   }
 }
 
@@ -7948,8 +8024,11 @@ static SDValue AddCombineTo64bitMLAL(SDNode *AddcNode,
 
   assert(AddcNode->getNumValues() == 2 &&
          AddcNode->getValueType(0) == MVT::i32 &&
-         AddcNode->getValueType(1) == MVT::Glue &&
-         "Expect ADDC with two result values: i32, glue");
+         "Expect ADDC with two result values. First: i32");
+
+  // Check that we have a glued ADDC node.
+  if (AddcNode->getValueType(1) != MVT::Glue)
+    return SDValue();
 
   // Check that the ADDC adds the low result of the S/UMUL_LOHI.
   if (AddcOp0->getOpcode() != ISD::UMUL_LOHI &&
@@ -8328,22 +8407,29 @@ static SDValue PerformORCombine(SDNode *N,
     unsigned SplatBitSize;
     bool HasAnyUndefs;
 
+    APInt SplatBits0, SplatBits1;
     BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(1));
-    APInt SplatBits0;
+    BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(1));
+    // Ensure that the second operand of both ands are constants
     if (BVN0 && BVN0->isConstantSplat(SplatBits0, SplatUndef, SplatBitSize,
-                                  HasAnyUndefs) && !HasAnyUndefs) {
-      BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(1));
-      APInt SplatBits1;
-      if (BVN1 && BVN1->isConstantSplat(SplatBits1, SplatUndef, SplatBitSize,
-                                    HasAnyUndefs) && !HasAnyUndefs &&
-          SplatBits0 == ~SplatBits1) {
-        // Canonicalize the vector type to make instruction selection simpler.
-        EVT CanonicalVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
-        SDValue Result = DAG.getNode(ARMISD::VBSL, dl, CanonicalVT,
-                                     N0->getOperand(1), N0->getOperand(0),
-                                     N1->getOperand(0));
-        return DAG.getNode(ISD::BITCAST, dl, VT, Result);
-      }
+                                      HasAnyUndefs) && !HasAnyUndefs) {
+        if (BVN1 && BVN1->isConstantSplat(SplatBits1, SplatUndef, SplatBitSize,
+                                          HasAnyUndefs) && !HasAnyUndefs) {
+            // Ensure that the bit width of the constants are the same and that
+            // the splat arguments are logical inverses as per the pattern we
+            // are trying to simplify.
+            if (SplatBits0.getBitWidth() == SplatBits1.getBitWidth() &&
+                SplatBits0 == ~SplatBits1) {
+                // Canonicalize the vector type to make instruction selection
+                // simpler.
+                EVT CanonicalVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
+                SDValue Result = DAG.getNode(ARMISD::VBSL, dl, CanonicalVT,
+                                             N0->getOperand(1),
+                                             N0->getOperand(0),
+                                             N1->getOperand(0));
+                return DAG.getNode(ISD::BITCAST, dl, VT, Result);
+            }
+        }
     }
   }
 
@@ -8753,6 +8839,98 @@ static SDValue PerformBUILD_VECTORCombine(SDNode *N,
   return DAG.getNode(ISD::BITCAST, dl, VT, BV);
 }
 
+/// \brief Target-specific dag combine xforms for ARMISD::BUILD_VECTOR.
+static SDValue
+PerformARMBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
+  // ARMISD::BUILD_VECTOR is introduced when legalizing ISD::BUILD_VECTOR.
+  // At that time, we may have inserted bitcasts from integer to float.
+  // If these bitcasts have survived DAGCombine, change the lowering of this
+  // BUILD_VECTOR in something more vector friendly, i.e., that does not
+  // force to use floating point types.
+
+  // Make sure we can change the type of the vector.
+  // This is possible iff:
+  // 1. The vector is only used in a bitcast to a integer type. I.e.,
+  //    1.1. Vector is used only once.
+  //    1.2. Use is a bit convert to an integer type.
+  // 2. The size of its operands are 32-bits (64-bits are not legal).
+  EVT VT = N->getValueType(0);
+  EVT EltVT = VT.getVectorElementType();
+
+  // Check 1.1. and 2.
+  if (EltVT.getSizeInBits() != 32 || !N->hasOneUse())
+    return SDValue();
+
+  // By construction, the input type must be float.
+  assert(EltVT == MVT::f32 && "Unexpected type!");
+
+  // Check 1.2.
+  SDNode *Use = *N->use_begin();
+  if (Use->getOpcode() != ISD::BITCAST ||
+      Use->getValueType(0).isFloatingPoint())
+    return SDValue();
+
+  // Check profitability.
+  // Model is, if more than half of the relevant operands are bitcast from
+  // i32, turn the build_vector into a sequence of insert_vector_elt.
+  // Relevant operands are everything that is not statically
+  // (i.e., at compile time) bitcasted.
+  unsigned NumOfBitCastedElts = 0;
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned NumOfRelevantElts = NumElts;
+  for (unsigned Idx = 0; Idx < NumElts; ++Idx) {
+    SDValue Elt = N->getOperand(Idx);
+    if (Elt->getOpcode() == ISD::BITCAST) {
+      // Assume only bit cast to i32 will go away.
+      if (Elt->getOperand(0).getValueType() == MVT::i32)
+        ++NumOfBitCastedElts;
+    } else if (Elt.getOpcode() == ISD::UNDEF || isa<ConstantSDNode>(Elt))
+      // Constants are statically casted, thus do not count them as
+      // relevant operands.
+      --NumOfRelevantElts;
+  }
+
+  // Check if more than half of the elements require a non-free bitcast.
+  if (NumOfBitCastedElts <= NumOfRelevantElts / 2)
+    return SDValue();
+
+  SelectionDAG &DAG = DCI.DAG;
+  // Create the new vector type.
+  EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
+  // Check if the type is legal.
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  if (!TLI.isTypeLegal(VecVT))
+    return SDValue();
+
+  // Combine:
+  // ARMISD::BUILD_VECTOR E1, E2, ..., EN.
+  // => BITCAST INSERT_VECTOR_ELT
+  //                      (INSERT_VECTOR_ELT (...), (BITCAST EN-1), N-1),
+  //                      (BITCAST EN), N.
+  SDValue Vec = DAG.getUNDEF(VecVT);
+  SDLoc dl(N);
+  for (unsigned Idx = 0 ; Idx < NumElts; ++Idx) {
+    SDValue V = N->getOperand(Idx);
+    if (V.getOpcode() == ISD::UNDEF)
+      continue;
+    if (V.getOpcode() == ISD::BITCAST &&
+        V->getOperand(0).getValueType() == MVT::i32)
+      // Fold obvious case.
+      V = V.getOperand(0);
+    else {
+      V = DAG.getNode(ISD::BITCAST, SDLoc(V), MVT::i32, V); 
+      // Make the DAGCombiner fold the bitcasts.
+      DCI.AddToWorklist(V.getNode());
+    }
+    SDValue LaneIdx = DAG.getConstant(Idx, MVT::i32);
+    Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecVT, Vec, V, LaneIdx);
+  }
+  Vec = DAG.getNode(ISD::BITCAST, dl, VT, Vec);
+  // Make the DAGCombiner fold the bitcasts.
+  DCI.AddToWorklist(Vec.getNode());
+  return Vec;
+}
+
 /// PerformInsertEltCombine - Target-specific dag combine xforms for
 /// ISD::INSERT_VECTOR_ELT.
 static SDValue PerformInsertEltCombine(SDNode *N,
@@ -9131,12 +9309,27 @@ static SDValue PerformVCVTCombine(SDNode *N,
       !isConstVecPow2(ConstVec, isSigned, C))
     return SDValue();
 
+  MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
+  MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
+  if (FloatTy.getSizeInBits() != 32 || IntTy.getSizeInBits() > 32) {
+    // These instructions only exist converting from f32 to i32. We can handle
+    // smaller integers by generating an extra truncate, but larger ones would
+    // be lossy.
+    return SDValue();
+  }
+
   unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfp2fxs :
     Intrinsic::arm_neon_vcvtfp2fxu;
-  return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N),
-                     N->getValueType(0),
-                     DAG.getConstant(IntrinsicOpcode, MVT::i32), N0,
-                     DAG.getConstant(Log2_64(C), MVT::i32));
+  unsigned NumLanes = Op.getValueType().getVectorNumElements();
+  SDValue FixConv =  DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N),
+                                 NumLanes == 2 ? MVT::v2i32 : MVT::v4i32,
+                                 DAG.getConstant(IntrinsicOpcode, MVT::i32), N0,
+                                 DAG.getConstant(Log2_64(C), MVT::i32));
+
+  if (IntTy.getSizeInBits() < FloatTy.getSizeInBits())
+    FixConv = DAG.getNode(ISD::TRUNCATE, SDLoc(N), N->getValueType(0), FixConv);
+
+  return FixConv;
 }
 
 /// PerformVDIVCombine - VCVT (fixed-point to floating-point, Advanced SIMD)
@@ -9167,12 +9360,28 @@ static SDValue PerformVDIVCombine(SDNode *N,
       !isConstVecPow2(ConstVec, isSigned, C))
     return SDValue();
 
+  MVT FloatTy = N->getSimpleValueType(0).getVectorElementType();
+  MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType();
+  if (FloatTy.getSizeInBits() != 32 || IntTy.getSizeInBits() > 32) {
+    // These instructions only exist converting from i32 to f32. We can handle
+    // smaller integers by generating an extra extend, but larger ones would
+    // be lossy.
+    return SDValue();
+  }
+
+  SDValue ConvInput = Op.getOperand(0);
+  unsigned NumLanes = Op.getValueType().getVectorNumElements();
+  if (IntTy.getSizeInBits() < FloatTy.getSizeInBits())
+    ConvInput = DAG.getNode(isSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND,
+                            SDLoc(N), NumLanes == 2 ? MVT::v2i32 : MVT::v4i32,
+                            ConvInput);
+
   unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfxs2fp :
     Intrinsic::arm_neon_vcvtfxu2fp;
   return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SDLoc(N),
                      Op.getValueType(),
                      DAG.getConstant(IntrinsicOpcode, MVT::i32),
-                     Op.getOperand(0), DAG.getConstant(Log2_64(C), MVT::i32));
+                     ConvInput, DAG.getConstant(Log2_64(C), MVT::i32));
 }
 
 /// Getvshiftimm - Check if this is a valid build_vector for the immediate
@@ -9658,6 +9867,8 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
   case ARMISD::VLD3DUP:
   case ARMISD::VLD4DUP:
     return CombineBaseUpdate(N, DCI);
+  case ARMISD::BUILD_VECTOR:
+    return PerformARMBUILD_VECTORCombine(N, DCI);
   case ISD::INTRINSIC_VOID:
   case ISD::INTRINSIC_W_CHAIN:
     switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
@@ -9782,6 +9993,21 @@ bool ARMTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
   return false;
 }
 
+bool ARMTargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
+  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
+    return false;
+
+  if (!isTypeLegal(EVT::getEVT(Ty1)))
+    return false;
+
+  assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
+
+  // Assuming the caller doesn't have a zeroext or signext return parameter,
+  // truncation all the way down to i1 is valid.
+  return true;
+}
+
+
 static bool isLegalT1AddressImmediate(int64_t V, EVT VT) {
   if (V < 0)
     return false;
@@ -10181,9 +10407,19 @@ void ARMTargetLowering::computeMaskedBitsForTargetNode(const SDValue Op,
                                                        APInt &KnownOne,
                                                        const SelectionDAG &DAG,
                                                        unsigned Depth) const {
-  KnownZero = KnownOne = APInt(KnownOne.getBitWidth(), 0);
+  unsigned BitWidth = KnownOne.getBitWidth();
+  KnownZero = KnownOne = APInt(BitWidth, 0);
   switch (Op.getOpcode()) {
   default: break;
+  case ARMISD::ADDC:
+  case ARMISD::ADDE:
+  case ARMISD::SUBC:
+  case ARMISD::SUBE:
+    // These nodes' second result is a boolean
+    if (Op.getResNo() == 0)
+      break;
+    KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
+    break;
   case ARMISD::CMOV: {
     // Bits are known zero/one if known on the LHS and RHS.
     DAG.ComputeMaskedBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1);
@@ -10297,7 +10533,7 @@ ARMTargetLowering::getSingleConstraintMatchWeight(
 typedef std::pair<unsigned, const TargetRegisterClass*> RCPair;
 RCPair
 ARMTargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
-                                                EVT VT) const {
+                                                MVT VT) const {
   if (Constraint.size() == 1) {
     // GCC ARM Constraint Letters
     switch (Constraint[0]) {
@@ -10506,6 +10742,54 @@ void ARMTargetLowering::LowerAsmOperandForConstraint(SDValue Op,
   return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
 }
 
+SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const {
+  assert(Subtarget->isTargetAEABI() && "Register-based DivRem lowering only");
+  unsigned Opcode = Op->getOpcode();
+  assert((Opcode == ISD::SDIVREM || Opcode == ISD::UDIVREM) &&
+      "Invalid opcode for Div/Rem lowering");
+  bool isSigned = (Opcode == ISD::SDIVREM);
+  EVT VT = Op->getValueType(0);
+  Type *Ty = VT.getTypeForEVT(*DAG.getContext());
+
+  RTLIB::Libcall LC;
+  switch (VT.getSimpleVT().SimpleTy) {
+  default: llvm_unreachable("Unexpected request for libcall!");
+  case MVT::i8:   LC= isSigned ? RTLIB::SDIVREM_I8  : RTLIB::UDIVREM_I8;  break;
+  case MVT::i16:  LC= isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16; break;
+  case MVT::i32:  LC= isSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32; break;
+  case MVT::i64:  LC= isSigned ? RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64; break;
+  }
+
+  SDValue InChain = DAG.getEntryNode();
+
+  TargetLowering::ArgListTy Args;
+  TargetLowering::ArgListEntry Entry;
+  for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
+    EVT ArgVT = Op->getOperand(i).getValueType();
+    Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
+    Entry.Node = Op->getOperand(i);
+    Entry.Ty = ArgTy;
+    Entry.isSExt = isSigned;
+    Entry.isZExt = !isSigned;
+    Args.push_back(Entry);
+  }
+
+  SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
+                                         getPointerTy());
+
+  Type *RetTy = (Type*)StructType::get(Ty, Ty, NULL);
+
+  SDLoc dl(Op);
+  TargetLowering::
+  CallLoweringInfo CLI(InChain, RetTy, isSigned, !isSigned, false, true,
+                    0, getLibcallCallingConv(LC), /*isTailCall=*/false,
+                    /*doesNotReturn=*/false, /*isReturnValueUsed=*/true,
+                    Callee, Args, DAG, dl);
+  std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
+
+  return CallInfo.first;
+}
+
 bool
 ARMTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
   // The ARM target isn't yet aware of offsets.
@@ -10591,6 +10875,30 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
     Info.writeMem = true;
     return true;
   }
+  case Intrinsic::arm_ldrex: {
+    PointerType *PtrTy = cast<PointerType>(I.getArgOperand(0)->getType());
+    Info.opc = ISD::INTRINSIC_W_CHAIN;
+    Info.memVT = MVT::getVT(PtrTy->getElementType());
+    Info.ptrVal = I.getArgOperand(0);
+    Info.offset = 0;
+    Info.align = getDataLayout()->getABITypeAlignment(PtrTy->getElementType());
+    Info.vol = true;
+    Info.readMem = true;
+    Info.writeMem = false;
+    return true;
+  }
+  case Intrinsic::arm_strex: {
+    PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType());
+    Info.opc = ISD::INTRINSIC_W_CHAIN;
+    Info.memVT = MVT::getVT(PtrTy->getElementType());
+    Info.ptrVal = I.getArgOperand(1);
+    Info.offset = 0;
+    Info.align = getDataLayout()->getABITypeAlignment(PtrTy->getElementType());
+    Info.vol = true;
+    Info.readMem = false;
+    Info.writeMem = true;
+    return true;
+  }
   case Intrinsic::arm_strexd: {
     Info.opc = ISD::INTRINSIC_W_CHAIN;
     Info.memVT = MVT::i64;
diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h
index 2b65019..44c769f 100644
--- a/lib/Target/ARM/ARMISelLowering.h
+++ b/lib/Target/ARM/ARMISelLowering.h
@@ -298,6 +298,9 @@ namespace llvm {
     using TargetLowering::isZExtFree;
     virtual bool isZExtFree(SDValue Val, EVT VT2) const;
 
+    virtual bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const;
+
+
     /// isLegalAddressingMode - Return true if the addressing mode represented
     /// by AM is legal for this target, for a load/store of the specified type.
     virtual bool isLegalAddressingMode(const AddrMode &AM, Type *Ty)const;
@@ -349,7 +352,7 @@ namespace llvm {
 
     std::pair<unsigned, const TargetRegisterClass*>
       getRegForInlineAsmConstraint(const std::string &Constraint,
-                                   EVT VT) const;
+                                   MVT VT) const;
 
     /// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
     /// vector.  If it is invalid, don't add anything to Ops. If hasMemory is
@@ -417,7 +420,7 @@ namespace llvm {
                           RegsToPassVector &RegsToPass,
                           CCValAssign &VA, CCValAssign &NextVA,
                           SDValue &StackPtr,
-                          SmallVector<SDValue, 8> &MemOpChains,
+                          SmallVectorImpl<SDValue> &MemOpChains,
                           ISD::ArgFlagsTy Flags) const;
     SDValue GetF64FormalArgument(CCValAssign &VA, CCValAssign &NextVA,
                                  SDValue &Root, SelectionDAG &DAG,
@@ -457,6 +460,18 @@ namespace llvm {
                             const ARMSubtarget *ST) const;
     SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
                               const ARMSubtarget *ST) const;
+    SDValue LowerDivRem(SDValue Op, SelectionDAG &DAG) const;
+
+    /// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster
+    /// than a pair of fmul and fadd instructions. fmuladd intrinsics will be
+    /// expanded to FMAs when this method returns true, otherwise fmuladd is
+    /// expanded to fmul + fadd.
+    ///
+    /// ARM supports both fused and unfused multiply-add operations; we already
+    /// lower a pair of fmul and fadd to the latter so it's not clear that there
+    /// would be a gain or that the gain would be worthwhile enough to risk
+    /// correctness bugs.
+    virtual bool isFMAFasterThanFMulAndFAdd(EVT VT) const { return false; }
 
     SDValue ReconstructShuffle(SDValue Op, SelectionDAG &DAG) const;
 
diff --git a/lib/Target/ARM/ARMInstrFormats.td b/lib/Target/ARM/ARMInstrFormats.td
index bd9a212..1349476 100644
--- a/lib/Target/ARM/ARMInstrFormats.td
+++ b/lib/Target/ARM/ARMInstrFormats.td
@@ -1230,8 +1230,9 @@ class T2JTI<dag oops, dag iops, InstrItinClass itin,
   : Thumb2XI<oops, iops, AddrModeNone, 0, itin, asm, "", pattern>;
 
 // Move to/from coprocessor instructions
-class T2Cop<bits<4> opc, dag oops, dag iops, string asm, list<dag> pattern>
-  : T2XI <oops, iops, NoItinerary, asm, pattern>, Requires<[IsThumb2]> {
+class T2Cop<bits<4> opc, dag oops, dag iops, string opcstr, string asm,
+            list<dag> pattern>
+  : T2I <oops, iops, NoItinerary, opcstr, asm, pattern>, Requires<[IsThumb2]> {
   let Inst{31-28} = opc;
 }
 
@@ -1521,6 +1522,32 @@ class ADuI<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, bits<2> opcod4,
   let Inst{4}     = opcod5;
 }
 
+// Double precision, unary, not-predicated
+class ADuInp<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, bits<2> opcod4,
+           bit opcod5, dag oops, dag iops, InstrItinClass itin,
+           string asm, list<dag> pattern>
+  : VFPXI<oops, iops, AddrModeNone, 4, IndexModeNone, VFPUnaryFrm, itin, asm, "", pattern> {
+  // Instruction operands.
+  bits<5> Dd;
+  bits<5> Dm;
+
+  let Inst{31-28} = 0b1111;
+
+  // Encode instruction operands.
+  let Inst{3-0}   = Dm{3-0};
+  let Inst{5}     = Dm{4};
+  let Inst{15-12} = Dd{3-0};
+  let Inst{22}    = Dd{4};
+
+  let Inst{27-23} = opcod1;
+  let Inst{21-20} = opcod2;
+  let Inst{19-16} = opcod3;
+  let Inst{11-9}  = 0b101;
+  let Inst{8}     = 1;          // Double precision
+  let Inst{7-6}   = opcod4;
+  let Inst{4}     = opcod5;
+}
+
 // Double precision, binary
 class ADbI<bits<5> opcod1, bits<2> opcod2, bit op6, bit op4, dag oops,
            dag iops, InstrItinClass itin, string opc, string asm,
@@ -1547,7 +1574,36 @@ class ADbI<bits<5> opcod1, bits<2> opcod2, bit op6, bit op4, dag oops,
   let Inst{4}     = op4;
 }
 
-// Single precision, unary
+// FP, binary, not predicated
+class ADbInp<bits<5> opcod1, bits<2> opcod2, bit opcod3, dag oops, dag iops,
+           InstrItinClass itin, string asm, list<dag> pattern>
+  : VFPXI<oops, iops, AddrModeNone, 4, IndexModeNone, VFPBinaryFrm, itin,
+          asm, "", pattern>
+{
+  // Instruction operands.
+  bits<5> Dd;
+  bits<5> Dn;
+  bits<5> Dm;
+
+  let Inst{31-28} = 0b1111;
+
+  // Encode instruction operands.
+  let Inst{3-0}   = Dm{3-0};
+  let Inst{5}     = Dm{4};
+  let Inst{19-16} = Dn{3-0};
+  let Inst{7}     = Dn{4};
+  let Inst{15-12} = Dd{3-0};
+  let Inst{22}    = Dd{4};
+
+  let Inst{27-23} = opcod1;
+  let Inst{21-20} = opcod2;
+  let Inst{11-9}  = 0b101;
+  let Inst{8}     = 1; // double precision
+  let Inst{6}     = opcod3;
+  let Inst{4}     = 0;
+}
+
+// Single precision, unary, predicated
 class ASuI<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, bits<2> opcod4,
            bit opcod5, dag oops, dag iops, InstrItinClass itin, string opc,
            string asm, list<dag> pattern>
@@ -1571,6 +1627,33 @@ class ASuI<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, bits<2> opcod4,
   let Inst{4}     = opcod5;
 }
 
+// Single precision, unary, non-predicated
+class ASuInp<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, bits<2> opcod4,
+             bit opcod5, dag oops, dag iops, InstrItinClass itin,
+             string asm, list<dag> pattern>
+  : VFPXI<oops, iops, AddrModeNone, 4, IndexModeNone,
+          VFPUnaryFrm, itin, asm, "", pattern> {
+  // Instruction operands.
+  bits<5> Sd;
+  bits<5> Sm;
+
+  let Inst{31-28} = 0b1111;
+
+  // Encode instruction operands.
+  let Inst{3-0}   = Sm{4-1};
+  let Inst{5}     = Sm{0};
+  let Inst{15-12} = Sd{4-1};
+  let Inst{22}    = Sd{0};
+
+  let Inst{27-23} = opcod1;
+  let Inst{21-20} = opcod2;
+  let Inst{19-16} = opcod3;
+  let Inst{11-9}  = 0b101;
+  let Inst{8}     = 0;          // Single precision
+  let Inst{7-6}   = opcod4;
+  let Inst{4}     = opcod5;
+}
+
 // Single precision unary, if no NEON. Same as ASuI except not available if
 // NEON is enabled.
 class ASuIn<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3, bits<2> opcod4,
@@ -1606,6 +1689,35 @@ class ASbI<bits<5> opcod1, bits<2> opcod2, bit op6, bit op4, dag oops, dag iops,
   let Inst{4}     = op4;
 }
 
+// Single precision, binary, not predicated
+class ASbInp<bits<5> opcod1, bits<2> opcod2, bit opcod3, dag oops, dag iops,
+           InstrItinClass itin, string asm, list<dag> pattern>
+  : VFPXI<oops, iops, AddrModeNone, 4, IndexModeNone,
+          VFPBinaryFrm, itin, asm, "", pattern>
+{
+  // Instruction operands.
+  bits<5> Sd;
+  bits<5> Sn;
+  bits<5> Sm;
+
+  let Inst{31-28} = 0b1111;
+
+  // Encode instruction operands.
+  let Inst{3-0}   = Sm{4-1};
+  let Inst{5}     = Sm{0};
+  let Inst{19-16} = Sn{4-1};
+  let Inst{7}     = Sn{0};
+  let Inst{15-12} = Sd{4-1};
+  let Inst{22}    = Sd{0};
+
+  let Inst{27-23} = opcod1;
+  let Inst{21-20} = opcod2;
+  let Inst{11-9}  = 0b101;
+  let Inst{8}     = 0; // Single precision
+  let Inst{6}     = opcod3;
+  let Inst{4}     = 0;
+}
+
 // Single precision binary, if no NEON. Same as ASbI except not available if
 // NEON is enabled.
 class ASbIn<bits<5> opcod1, bits<2> opcod2, bit op6, bit op4, dag oops,
@@ -1718,6 +1830,21 @@ class NeonXI<dag oops, dag iops, AddrMode am, IndexMode im, Format f,
   let DecoderNamespace = "NEON";
 }
 
+// Same as NeonI except it is not predicated
+class NeonInp<dag oops, dag iops, AddrMode am, IndexMode im, Format f,
+            InstrItinClass itin, string opc, string dt, string asm, string cstr,
+            list<dag> pattern>
+  : InstARM<am, 4, im, f, NeonDomain, cstr, itin> {
+  let OutOperandList = oops;
+  let InOperandList = iops;
+  let AsmString = !strconcat(opc, ".", dt, "\t", asm);
+  let Pattern = pattern;
+  list<Predicate> Predicates = [HasNEON];
+  let DecoderNamespace = "NEON";
+
+  let Inst{31-28} = 0b1111;
+}
+
 class NLdSt<bit op23, bits<2> op21_20, bits<4> op11_8, bits<4> op7_4,
             dag oops, dag iops, InstrItinClass itin,
             string opc, string dt, string asm, string cstr, list<dag> pattern>
@@ -1837,6 +1964,35 @@ class N2V<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16,
   let Inst{5}     = Vm{4};
 }
 
+// Same as N2V but not predicated.
+class N2Vnp<bits<2> op17_16, bits<3> op10_8, bit op7, bit op6,
+            dag oops, dag iops, InstrItinClass itin, string OpcodeStr,
+            string Dt, ValueType ResTy, ValueType OpTy, list<dag> pattern>
+   : NeonInp<oops, iops, AddrModeNone, IndexModeNone, N2RegFrm, itin,
+             OpcodeStr, Dt, "$Vd, $Vm", "", pattern> {
+  bits<5> Vd;
+  bits<5> Vm;
+
+  // Encode instruction operands
+  let Inst{22}    = Vd{4};
+  let Inst{15-12} = Vd{3-0};
+  let Inst{5}     = Vm{4};
+  let Inst{3-0}   = Vm{3-0};
+
+  // Encode constant bits
+  let Inst{27-23} = 0b00111;
+  let Inst{21-20} = 0b11;
+  let Inst{19-18} = 0b10;
+  let Inst{17-16} = op17_16;
+  let Inst{11} = 0;
+  let Inst{10-8} = op10_8;
+  let Inst{7} = op7;
+  let Inst{6} = op6;
+  let Inst{4} = 0;
+
+  let DecoderNamespace = "NEON";
+}
+
 // Same as N2V except it doesn't have a datatype suffix.
 class N2VX<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18, bits<2> op17_16,
            bits<5> op11_7, bit op6, bit op4,
@@ -1918,6 +2074,32 @@ class N3V<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6, bit op4,
   let Inst{5}     = Vm{4};
 }
 
+class N3Vnp<bits<5> op27_23, bits<2> op21_20, bits<4> op11_8, bit op6,
+                bit op4, dag oops, dag iops,Format f, InstrItinClass itin,
+                string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy,
+                SDPatternOperator IntOp, bit Commutable, list<dag> pattern>
+  : NeonInp<oops, iops, AddrModeNone, IndexModeNone, f, itin, OpcodeStr,
+            Dt, "$Vd, $Vn, $Vm", "", pattern> {
+  bits<5> Vd;
+  bits<5> Vn;
+  bits<5> Vm;
+
+  // Encode instruction operands
+  let Inst{22} = Vd{4};
+  let Inst{15-12} = Vd{3-0};
+  let Inst{19-16} = Vn{3-0};
+  let Inst{7} = Vn{4};
+  let Inst{5} = Vm{4};
+  let Inst{3-0} = Vm{3-0};
+
+  // Encode constant bits
+  let Inst{27-23} = op27_23;
+  let Inst{21-20} = op21_20;
+  let Inst{11-8}  = op11_8;
+  let Inst{6}     = op6;
+  let Inst{4}     = op4;
+}
+
 class N3VLane32<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op6,
                 bit op4, dag oops, dag iops, Format f, InstrItinClass itin,
                 string opc, string dt, string asm, string cstr,
diff --git a/lib/Target/ARM/ARMInstrInfo.cpp b/lib/Target/ARM/ARMInstrInfo.cpp
index 8062111..8cdb853 100644
--- a/lib/Target/ARM/ARMInstrInfo.cpp
+++ b/lib/Target/ARM/ARMInstrInfo.cpp
@@ -106,13 +106,14 @@ namespace {
       if (TM->getRelocationModel() != Reloc::PIC_)
         return false;
 
-      LLVMContext* Context = &MF.getFunction()->getContext();
-      GlobalValue *GV = new GlobalVariable(Type::getInt32Ty(*Context), false,
-                                           GlobalValue::ExternalLinkage, 0,
-                                           "_GLOBAL_OFFSET_TABLE_");
-      unsigned Id = AFI->createPICLabelUId();
-      ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create(GV, Id);
-      unsigned Align = TM->getDataLayout()->getPrefTypeAlignment(GV->getType());
+      LLVMContext *Context = &MF.getFunction()->getContext();
+      unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
+      unsigned PCAdj = TM->getSubtarget<ARMSubtarget>().isThumb() ? 4 : 8;
+      ARMConstantPoolValue *CPV = ARMConstantPoolSymbol::Create(
+          *Context, "_GLOBAL_OFFSET_TABLE_", ARMPCLabelIndex, PCAdj);
+
+      unsigned Align = TM->getDataLayout()
+          ->getPrefTypeAlignment(Type::getInt32PtrTy(*Context));
       unsigned Idx = MF.getConstantPool()->getConstantPoolIndex(CPV, Align);
 
       MachineBasicBlock &FirstMBB = MF.front();
diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td
index da815d5..c243402 100644
--- a/lib/Target/ARM/ARMInstrInfo.td
+++ b/lib/Target/ARM/ARMInstrInfo.td
@@ -194,6 +194,8 @@ def HasV6T2          : Predicate<"Subtarget->hasV6T2Ops()">,
 def NoV6T2           : Predicate<"!Subtarget->hasV6T2Ops()">;
 def HasV7            : Predicate<"Subtarget->hasV7Ops()">,
                                  AssemblerPredicate<"HasV7Ops", "armv7">;
+def HasV8            : Predicate<"Subtarget->hasV8Ops()">,
+                                 AssemblerPredicate<"HasV8Ops", "armv8">;
 def NoVFP            : Predicate<"!Subtarget->hasVFP2()">;
 def HasVFP2          : Predicate<"Subtarget->hasVFP2()">,
                                  AssemblerPredicate<"FeatureVFP2", "VFP2">;
@@ -201,6 +203,8 @@ def HasVFP3          : Predicate<"Subtarget->hasVFP3()">,
                                  AssemblerPredicate<"FeatureVFP3", "VFP3">;
 def HasVFP4          : Predicate<"Subtarget->hasVFP4()">,
                                  AssemblerPredicate<"FeatureVFP4", "VFP4">;
+def HasV8FP          : Predicate<"Subtarget->hasV8FP()">,
+                                 AssemblerPredicate<"FeatureV8FP", "V8FP">;
 def HasNEON          : Predicate<"Subtarget->hasNEON()">,
                                  AssemblerPredicate<"FeatureNEON", "NEON">;
 def HasFP16          : Predicate<"Subtarget->hasFP16()">,
@@ -258,7 +262,9 @@ def UseMulOps        : Predicate<"Subtarget->useMulOps()">;
 def UseFusedMAC      : Predicate<"(TM.Options.AllowFPOpFusion =="
                                  " FPOpFusion::Fast) && "
                                  "!Subtarget->isTargetDarwin()">;
-def DontUseFusedMAC  : Predicate<"!Subtarget->hasVFP4() || "
+def DontUseFusedMAC  : Predicate<"!(TM.Options.AllowFPOpFusion =="
+                                 " FPOpFusion::Fast &&"
+                                 " Subtarget->hasVFP4()) || "
                                  "Subtarget->isTargetDarwin()">;
 
 // VGETLNi32 is microcoded on Swift - prefer VMOV.
@@ -275,8 +281,8 @@ def HasSlowVDUP32 : Predicate<"Subtarget->isSwift()">;
 def UseVMOVSR : Predicate<"Subtarget->isCortexA9() || !Subtarget->useNEONForSinglePrecisionFP()">;
 def DontUseVMOVSR : Predicate<"!Subtarget->isCortexA9() && Subtarget->useNEONForSinglePrecisionFP()">;
 
-def IsLE             : Predicate<"TLI->isLittleEndian()">;
-def IsBE             : Predicate<"TLI->isBigEndian()">;
+def IsLE             : Predicate<"getTargetLowering()->isLittleEndian()">;
+def IsBE             : Predicate<"getTargetLowering()->isBigEndian()">;
 
 //===----------------------------------------------------------------------===//
 // ARM Flag Definitions.
@@ -456,7 +462,7 @@ def AdrLabelAsmOperand : AsmOperandClass { let Name = "AdrLabel"; }
 def adrlabel : Operand<i32> {
   let EncoderMethod = "getAdrLabelOpValue";
   let ParserMatchClass = AdrLabelAsmOperand;
-  let PrintMethod = "printAdrLabelOperand";
+  let PrintMethod = "printAdrLabelOperand<0>";
 }
 
 def neon_vcvt_imm32 : Operand<i32> {
@@ -1007,11 +1013,6 @@ def p_imm : Operand<i32> {
   let DecoderMethod = "DecodeCoprocessor";
 }
 
-def pf_imm : Operand<i32> {
-  let PrintMethod = "printPImmediate";
-  let ParserMatchClass = CoprocNumAsmOperand;
-}
-
 def CoprocRegAsmOperand : AsmOperandClass {
   let Name = "CoprocReg";
   let ParserMethod = "parseCoprocRegOperand";
@@ -1735,12 +1736,13 @@ def SEL : AI<(outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), DPFrm, NoItinerary, "sel",
 
 // The 16-bit operand $val can be used by a debugger to store more information
 // about the breakpoint.
-def BKPT : AI<(outs), (ins imm0_65535:$val), MiscFrm, NoItinerary,
-              "bkpt", "\t$val", []>, Requires<[IsARM]> {
+def BKPT : AInoP<(outs), (ins imm0_65535:$val), MiscFrm, NoItinerary,
+                 "bkpt", "\t$val", []>, Requires<[IsARM]> {
   bits<16> val;
   let Inst{3-0} = val{3-0};
   let Inst{19-8} = val{15-4};
   let Inst{27-20} = 0b00010010;
+  let Inst{31-28} = 0xe; // AL
   let Inst{7-4} = 0b0111;
 }
 
@@ -2293,7 +2295,6 @@ multiclass AI2_ldridx<bit isByte, string opc,
     let Inst{19-16} = addr{16-13};
     let Inst{11-0} = addr{11-0};
     let DecoderMethod = "DecodeLDRPreImm";
-    let AsmMatchConverter = "cvtLdWriteBackRegAddrModeImm12";
   }
 
   def _PRE_REG  : AI2ldstidx<1, isByte, 1, (outs GPR:$Rt, GPR:$Rn_wb),
@@ -2306,7 +2307,6 @@ multiclass AI2_ldridx<bit isByte, string opc,
     let Inst{11-0} = addr{11-0};
     let Inst{4} = 0;
     let DecoderMethod = "DecodeLDRPreReg";
-    let AsmMatchConverter = "cvtLdWriteBackRegAddrMode2";
   }
 
   def _POST_REG : AI2ldstidx<1, isByte, 0, (outs GPR:$Rt, GPR:$Rn_wb),
@@ -2364,7 +2364,6 @@ multiclass AI3_ldridx<bits<4> op, string opc, InstrItinClass itin> {
     let Inst{19-16} = addr{12-9};   // Rn
     let Inst{11-8}  = addr{7-4};    // imm7_4/zero
     let Inst{3-0}   = addr{3-0};    // imm3_0/Rm
-    let AsmMatchConverter = "cvtLdWriteBackRegAddrMode3";
     let DecoderMethod = "DecodeAddrMode3Instruction";
   }
   def _POST : AI3ldstidx<op, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb),
@@ -2400,7 +2399,6 @@ def LDRD_PRE : AI3ldstidx<0b1101, 0, 1, (outs GPR:$Rt, GPR:$Rt2, GPR:$Rn_wb),
   let Inst{11-8}  = addr{7-4};    // imm7_4/zero
   let Inst{3-0}   = addr{3-0};    // imm3_0/Rm
   let DecoderMethod = "DecodeAddrMode3Instruction";
-  let AsmMatchConverter = "cvtLdrdPre";
 }
 def LDRD_POST: AI3ldstidx<0b1101, 0, 0, (outs GPR:$Rt, GPR:$Rt2, GPR:$Rn_wb),
                           (ins addr_offset_none:$addr, am3offset:$offset),
@@ -2503,7 +2501,6 @@ multiclass AI3ldrT<bits<4> op, string opc> {
     let Inst{22} = 1;
     let Inst{11-8} = offset{7-4};
     let Inst{3-0} = offset{3-0};
-    let AsmMatchConverter = "cvtLdExtTWriteBackImm";
   }
   def r : AI3ldstidxT<op, 1, (outs GPRnopc:$Rt, GPRnopc:$base_wb),
                       (ins addr_offset_none:$addr, postidx_reg:$Rm),
@@ -2515,7 +2512,6 @@ multiclass AI3ldrT<bits<4> op, string opc> {
     let Inst{11-8} = 0;
     let Unpredictable{11-8} = 0b1111;
     let Inst{3-0} = Rm{3-0};
-    let AsmMatchConverter = "cvtLdExtTWriteBackReg";
     let DecoderMethod = "DecodeLDR";
   }
 }
@@ -2553,7 +2549,6 @@ multiclass AI2_stridx<bit isByte, string opc,
     let Inst{23}    = addr{12};     // U (add = ('U' == 1))
     let Inst{19-16} = addr{16-13};  // Rn
     let Inst{11-0}  = addr{11-0};   // imm12
-    let AsmMatchConverter = "cvtStWriteBackRegAddrModeImm12";
     let DecoderMethod = "DecodeSTRPreImm";
   }
 
@@ -2567,7 +2562,6 @@ multiclass AI2_stridx<bit isByte, string opc,
     let Inst{19-16} = addr{16-13}; // Rn
     let Inst{11-0}  = addr{11-0};
     let Inst{4}     = 0;           // Inst{4} = 0
-    let AsmMatchConverter = "cvtStWriteBackRegAddrMode2";
     let DecoderMethod = "DecodeSTRPreReg";
   }
   def _POST_REG : AI2ldstidx<0, isByte, 0, (outs GPR:$Rn_wb),
@@ -2676,7 +2670,6 @@ def STRH_PRE  : AI3ldstidx<0b1011, 0, 1, (outs GPR:$Rn_wb),
   let Inst{19-16} = addr{12-9};   // Rn
   let Inst{11-8}  = addr{7-4};    // imm7_4/zero
   let Inst{3-0}   = addr{3-0};    // imm3_0/Rm
-  let AsmMatchConverter = "cvtStWriteBackRegAddrMode3";
   let DecoderMethod = "DecodeAddrMode3Instruction";
 }
 
@@ -2710,7 +2703,6 @@ def STRD_PRE : AI3ldstidx<0b1111, 0, 1, (outs GPR:$Rn_wb),
   let Inst{11-8}  = addr{7-4};    // imm7_4/zero
   let Inst{3-0}   = addr{3-0};    // imm3_0/Rm
   let DecoderMethod = "DecodeAddrMode3Instruction";
-  let AsmMatchConverter = "cvtStrdPre";
 }
 
 def STRD_POST: AI3ldstidx<0b1111, 0, 0, (outs GPR:$Rn_wb),
@@ -2817,7 +2809,6 @@ multiclass AI3strT<bits<4> op, string opc> {
     let Inst{22} = 1;
     let Inst{11-8} = offset{7-4};
     let Inst{3-0} = offset{3-0};
-    let AsmMatchConverter = "cvtStExtTWriteBackImm";
   }
   def r : AI3ldstidxT<op, 0, (outs GPR:$base_wb),
                       (ins GPR:$Rt, addr_offset_none:$addr, postidx_reg:$Rm),
@@ -2828,7 +2819,6 @@ multiclass AI3strT<bits<4> op, string opc> {
     let Inst{22} = 0;
     let Inst{11-8} = 0;
     let Inst{3-0} = Rm{3-0};
-    let AsmMatchConverter = "cvtStExtTWriteBackReg";
   }
 }
 
@@ -4011,8 +4001,13 @@ def PKHTB : APKHI<0b01101000, 1, (outs GPRnopc:$Rd),
 
 // Alternate cases for PKHTB where identities eliminate some nodes.  Note that
 // a shift amount of 0 is *not legal* here, it is PKHBT instead.
+// We also can not replace a srl (17..31) by an arithmetic shift we would use in
+// pkhtb src1, src2, asr (17..31).
+def : ARMV6Pat<(or (and GPRnopc:$src1, 0xFFFF0000),
+                   (srl GPRnopc:$src2, imm16:$sh)),
+               (PKHTB GPRnopc:$src1, GPRnopc:$src2, imm16:$sh)>;
 def : ARMV6Pat<(or (and GPRnopc:$src1, 0xFFFF0000),
-                   (srl GPRnopc:$src2, imm16_31:$sh)),
+                   (sra GPRnopc:$src2, imm16_31:$sh)),
                (PKHTB GPRnopc:$src1, GPRnopc:$src2, imm16_31:$sh)>;
 def : ARMV6Pat<(or (and GPRnopc:$src1, 0xFFFF0000),
                    (and (srl GPRnopc:$src2, imm1_15:$sh), 0xFFFF)),
@@ -4378,14 +4373,44 @@ let usesCustomInserter = 1 in {
       [(ARMcopystructbyval GPR:$dst, GPR:$src, imm:$size, imm:$alignment)]>;
 }
 
+def ldrex_1 : PatFrag<(ops node:$ptr), (int_arm_ldrex node:$ptr), [{
+  return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i8;
+}]>;
+
+def ldrex_2 : PatFrag<(ops node:$ptr), (int_arm_ldrex node:$ptr), [{
+  return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i16;
+}]>;
+
+def ldrex_4 : PatFrag<(ops node:$ptr), (int_arm_ldrex node:$ptr), [{
+  return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i32;
+}]>;
+
+def strex_1 : PatFrag<(ops node:$val, node:$ptr),
+                      (int_arm_strex node:$val, node:$ptr), [{
+  return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i8;
+}]>;
+
+def strex_2 : PatFrag<(ops node:$val, node:$ptr),
+                      (int_arm_strex node:$val, node:$ptr), [{
+  return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i16;
+}]>;
+
+def strex_4 : PatFrag<(ops node:$val, node:$ptr),
+                      (int_arm_strex node:$val, node:$ptr), [{
+  return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i32;
+}]>;
+
 let mayLoad = 1 in {
 def LDREXB : AIldrex<0b10, (outs GPR:$Rt), (ins addr_offset_none:$addr),
                      NoItinerary,
-                    "ldrexb", "\t$Rt, $addr", []>;
+                     "ldrexb", "\t$Rt, $addr",
+                     [(set GPR:$Rt, (ldrex_1 addr_offset_none:$addr))]>;
 def LDREXH : AIldrex<0b11, (outs GPR:$Rt), (ins addr_offset_none:$addr),
-                     NoItinerary, "ldrexh", "\t$Rt, $addr", []>;
+                     NoItinerary, "ldrexh", "\t$Rt, $addr",
+                     [(set GPR:$Rt, (ldrex_2 addr_offset_none:$addr))]>;
 def LDREX  : AIldrex<0b00, (outs GPR:$Rt), (ins addr_offset_none:$addr),
-                     NoItinerary, "ldrex", "\t$Rt, $addr", []>;
+                     NoItinerary, "ldrex", "\t$Rt, $addr",
+                     [(set GPR:$Rt, (ldrex_4 addr_offset_none:$addr))]>;
 let hasExtraDefRegAllocReq = 1 in
 def LDREXD: AIldrex<0b01, (outs GPRPairOp:$Rt),(ins addr_offset_none:$addr),
                       NoItinerary, "ldrexd", "\t$Rt, $addr", []> {
@@ -4395,11 +4420,14 @@ def LDREXD: AIldrex<0b01, (outs GPRPairOp:$Rt),(ins addr_offset_none:$addr),
 
 let mayStore = 1, Constraints = "@earlyclobber $Rd" in {
 def STREXB: AIstrex<0b10, (outs GPR:$Rd), (ins GPR:$Rt, addr_offset_none:$addr),
-                    NoItinerary, "strexb", "\t$Rd, $Rt, $addr", []>;
+                    NoItinerary, "strexb", "\t$Rd, $Rt, $addr",
+                    [(set GPR:$Rd, (strex_1 GPR:$Rt, addr_offset_none:$addr))]>;
 def STREXH: AIstrex<0b11, (outs GPR:$Rd), (ins GPR:$Rt, addr_offset_none:$addr),
-                    NoItinerary, "strexh", "\t$Rd, $Rt, $addr", []>;
+                    NoItinerary, "strexh", "\t$Rd, $Rt, $addr",
+                    [(set GPR:$Rd, (strex_2 GPR:$Rt, addr_offset_none:$addr))]>;
 def STREX : AIstrex<0b00, (outs GPR:$Rd), (ins GPR:$Rt, addr_offset_none:$addr),
-                    NoItinerary, "strex", "\t$Rd, $Rt, $addr", []>;
+                    NoItinerary, "strex", "\t$Rd, $Rt, $addr",
+                    [(set GPR:$Rd, (strex_4 GPR:$Rt, addr_offset_none:$addr))]>;
 let hasExtraSrcRegAllocReq = 1 in
 def STREXD : AIstrex<0b01, (outs GPR:$Rd),
                     (ins GPRPairOp:$Rt, addr_offset_none:$addr),
@@ -4409,11 +4437,21 @@ def STREXD : AIstrex<0b01, (outs GPR:$Rd),
 }
 
 
-def CLREX : AXI<(outs), (ins), MiscFrm, NoItinerary, "clrex", []>,
+def CLREX : AXI<(outs), (ins), MiscFrm, NoItinerary, "clrex",
+                [(int_arm_clrex)]>,
             Requires<[IsARM, HasV7]>  {
   let Inst{31-0} = 0b11110101011111111111000000011111;
 }
 
+def : ARMPat<(and (ldrex_1 addr_offset_none:$addr), 0xff),
+             (LDREXB addr_offset_none:$addr)>;
+def : ARMPat<(and (ldrex_2 addr_offset_none:$addr), 0xffff),
+             (LDREXH addr_offset_none:$addr)>;
+def : ARMPat<(strex_1 (and GPR:$Rt, 0xff), addr_offset_none:$addr),
+             (STREXB GPR:$Rt, addr_offset_none:$addr)>;
+def : ARMPat<(strex_2 (and GPR:$Rt, 0xffff), addr_offset_none:$addr),
+             (STREXH GPR:$Rt, addr_offset_none:$addr)>;
+
 // SWP/SWPB are deprecated in V6/V7.
 let mayLoad = 1, mayStore = 1 in {
 def SWP : AIswp<0, (outs GPRnopc:$Rt),
@@ -4447,7 +4485,7 @@ def CDP : ABI<0b1110, (outs), (ins p_imm:$cop, imm0_15:$opc1,
   let Inst{23-20} = opc1;
 }
 
-def CDP2 : ABXI<0b1110, (outs), (ins pf_imm:$cop, imm0_15:$opc1,
+def CDP2 : ABXI<0b1110, (outs), (ins p_imm:$cop, imm0_15:$opc1,
                c_imm:$CRd, c_imm:$CRn, c_imm:$CRm, imm0_7:$opc2),
                NoItinerary, "cdp2\t$cop, $opc1, $CRd, $CRn, $CRm, $opc2",
                [(int_arm_cdp2 imm:$cop, imm:$opc1, imm:$CRd, imm:$CRn,
@@ -5187,10 +5225,10 @@ def : MnemonicAlias<"rfeed", "rfeib">;
 def : MnemonicAlias<"rfe", "rfeia">;
 
 // SRS aliases
-def : MnemonicAlias<"srsfa", "srsda">;
-def : MnemonicAlias<"srsea", "srsdb">;
-def : MnemonicAlias<"srsfd", "srsia">;
-def : MnemonicAlias<"srsed", "srsib">;
+def : MnemonicAlias<"srsfa", "srsib">;
+def : MnemonicAlias<"srsea", "srsia">;
+def : MnemonicAlias<"srsfd", "srsdb">;
+def : MnemonicAlias<"srsed", "srsda">;
 def : MnemonicAlias<"srs", "srsia">;
 
 // QSAX == QSUBADDX
diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td
index 9d1a8ea..af4f4d1 100644
--- a/lib/Target/ARM/ARMInstrNEON.td
+++ b/lib/Target/ARM/ARMInstrNEON.td
@@ -656,7 +656,6 @@ multiclass VLD1DWB<bits<4> op7_4, string Dt> {
     let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
     let Inst{4} = Rn{4};
     let DecoderMethod = "DecodeVLDST1Instruction";
-    let AsmMatchConverter = "cvtVLDwbFixed";
   }
   def _register : NLdSt<0,0b10,0b0111,op7_4, (outs VecListOneD:$Vd, GPR:$wb),
                         (ins addrmode6:$Rn, rGPR:$Rm), IIC_VLD1u,
@@ -664,7 +663,6 @@ multiclass VLD1DWB<bits<4> op7_4, string Dt> {
                         "$Rn.addr = $wb", []> {
     let Inst{4} = Rn{4};
     let DecoderMethod = "DecodeVLDST1Instruction";
-    let AsmMatchConverter = "cvtVLDwbRegister";
   }
 }
 multiclass VLD1QWB<bits<4> op7_4, string Dt> {
@@ -675,7 +673,6 @@ multiclass VLD1QWB<bits<4> op7_4, string Dt> {
     let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
     let Inst{5-4} = Rn{5-4};
     let DecoderMethod = "DecodeVLDST1Instruction";
-    let AsmMatchConverter = "cvtVLDwbFixed";
   }
   def _register : NLdSt<0,0b10,0b1010,op7_4, (outs VecListDPair:$Vd, GPR:$wb),
                         (ins addrmode6:$Rn, rGPR:$Rm), IIC_VLD1x2u,
@@ -683,7 +680,6 @@ multiclass VLD1QWB<bits<4> op7_4, string Dt> {
                         "$Rn.addr = $wb", []> {
     let Inst{5-4} = Rn{5-4};
     let DecoderMethod = "DecodeVLDST1Instruction";
-    let AsmMatchConverter = "cvtVLDwbRegister";
   }
 }
 
@@ -713,7 +709,6 @@ multiclass VLD1D3WB<bits<4> op7_4, string Dt> {
     let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
     let Inst{4} = Rn{4};
     let DecoderMethod = "DecodeVLDST1Instruction";
-    let AsmMatchConverter = "cvtVLDwbFixed";
   }
   def _register : NLdSt<0,0b10,0b0110,op7_4, (outs VecListThreeD:$Vd, GPR:$wb),
                         (ins addrmode6:$Rn, rGPR:$Rm), IIC_VLD1x2u,
@@ -721,7 +716,6 @@ multiclass VLD1D3WB<bits<4> op7_4, string Dt> {
                         "$Rn.addr = $wb", []> {
     let Inst{4} = Rn{4};
     let DecoderMethod = "DecodeVLDST1Instruction";
-    let AsmMatchConverter = "cvtVLDwbRegister";
   }
 }
 
@@ -754,7 +748,6 @@ multiclass VLD1D4WB<bits<4> op7_4, string Dt> {
     let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
     let Inst{5-4} = Rn{5-4};
     let DecoderMethod = "DecodeVLDST1Instruction";
-    let AsmMatchConverter = "cvtVLDwbFixed";
   }
   def _register : NLdSt<0,0b10,0b0010,op7_4, (outs VecListFourD:$Vd, GPR:$wb),
                         (ins addrmode6:$Rn, rGPR:$Rm), IIC_VLD1x2u,
@@ -762,7 +755,6 @@ multiclass VLD1D4WB<bits<4> op7_4, string Dt> {
                         "$Rn.addr = $wb", []> {
     let Inst{5-4} = Rn{5-4};
     let DecoderMethod = "DecodeVLDST1Instruction";
-    let AsmMatchConverter = "cvtVLDwbRegister";
   }
 }
 
@@ -811,7 +803,6 @@ multiclass VLD2WB<bits<4> op11_8, bits<4> op7_4, string Dt,
     let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
     let Inst{5-4} = Rn{5-4};
     let DecoderMethod = "DecodeVLDST2Instruction";
-    let AsmMatchConverter = "cvtVLDwbFixed";
   }
   def _register : NLdSt<0, 0b10, op11_8, op7_4, (outs VdTy:$Vd, GPR:$wb),
                         (ins addrmode6:$Rn, rGPR:$Rm), itin,
@@ -819,7 +810,6 @@ multiclass VLD2WB<bits<4> op11_8, bits<4> op7_4, string Dt,
                         "$Rn.addr = $wb", []> {
     let Inst{5-4} = Rn{5-4};
     let DecoderMethod = "DecodeVLDST2Instruction";
-    let AsmMatchConverter = "cvtVLDwbRegister";
   }
 }
 
@@ -1348,7 +1338,6 @@ multiclass VLD1DUPWB<bits<4> op7_4, string Dt> {
     let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
     let Inst{4} = Rn{4};
     let DecoderMethod = "DecodeVLD1DupInstruction";
-    let AsmMatchConverter = "cvtVLDwbFixed";
   }
   def _register : NLdSt<1, 0b10, 0b1100, op7_4,
                         (outs VecListOneDAllLanes:$Vd, GPR:$wb),
@@ -1357,7 +1346,6 @@ multiclass VLD1DUPWB<bits<4> op7_4, string Dt> {
                         "$Rn.addr = $wb", []> {
     let Inst{4} = Rn{4};
     let DecoderMethod = "DecodeVLD1DupInstruction";
-    let AsmMatchConverter = "cvtVLDwbRegister";
   }
 }
 multiclass VLD1QDUPWB<bits<4> op7_4, string Dt> {
@@ -1369,7 +1357,6 @@ multiclass VLD1QDUPWB<bits<4> op7_4, string Dt> {
     let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
     let Inst{4} = Rn{4};
     let DecoderMethod = "DecodeVLD1DupInstruction";
-    let AsmMatchConverter = "cvtVLDwbFixed";
   }
   def _register : NLdSt<1, 0b10, 0b1100, op7_4,
                         (outs VecListDPairAllLanes:$Vd, GPR:$wb),
@@ -1378,7 +1365,6 @@ multiclass VLD1QDUPWB<bits<4> op7_4, string Dt> {
                         "$Rn.addr = $wb", []> {
     let Inst{4} = Rn{4};
     let DecoderMethod = "DecodeVLD1DupInstruction";
-    let AsmMatchConverter = "cvtVLDwbRegister";
   }
 }
 
@@ -1419,7 +1405,6 @@ multiclass VLD2DUPWB<bits<4> op7_4, string Dt, RegisterOperand VdTy> {
     let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
     let Inst{4} = Rn{4};
     let DecoderMethod = "DecodeVLD2DupInstruction";
-    let AsmMatchConverter = "cvtVLDwbFixed";
   }
   def _register : NLdSt<1, 0b10, 0b1101, op7_4,
                         (outs VdTy:$Vd, GPR:$wb),
@@ -1428,7 +1413,6 @@ multiclass VLD2DUPWB<bits<4> op7_4, string Dt, RegisterOperand VdTy> {
                         "$Rn.addr = $wb", []> {
     let Inst{4} = Rn{4};
     let DecoderMethod = "DecodeVLD2DupInstruction";
-    let AsmMatchConverter = "cvtVLDwbRegister";
   }
 }
 
@@ -1609,7 +1593,6 @@ multiclass VST1DWB<bits<4> op7_4, string Dt> {
     let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
     let Inst{4} = Rn{4};
     let DecoderMethod = "DecodeVLDST1Instruction";
-    let AsmMatchConverter = "cvtVSTwbFixed";
   }
   def _register : NLdSt<0,0b00,0b0111,op7_4, (outs GPR:$wb),
                         (ins addrmode6:$Rn, rGPR:$Rm, VecListOneD:$Vd),
@@ -1618,7 +1601,6 @@ multiclass VST1DWB<bits<4> op7_4, string Dt> {
                         "$Rn.addr = $wb", []> {
     let Inst{4} = Rn{4};
     let DecoderMethod = "DecodeVLDST1Instruction";
-    let AsmMatchConverter = "cvtVSTwbRegister";
   }
 }
 multiclass VST1QWB<bits<4> op7_4, string Dt> {
@@ -1629,7 +1611,6 @@ multiclass VST1QWB<bits<4> op7_4, string Dt> {
     let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
     let Inst{5-4} = Rn{5-4};
     let DecoderMethod = "DecodeVLDST1Instruction";
-    let AsmMatchConverter = "cvtVSTwbFixed";
   }
   def _register : NLdSt<0,0b00,0b1010,op7_4, (outs GPR:$wb),
                         (ins addrmode6:$Rn, rGPR:$Rm, VecListDPair:$Vd),
@@ -1638,7 +1619,6 @@ multiclass VST1QWB<bits<4> op7_4, string Dt> {
                         "$Rn.addr = $wb", []> {
     let Inst{5-4} = Rn{5-4};
     let DecoderMethod = "DecodeVLDST1Instruction";
-    let AsmMatchConverter = "cvtVSTwbRegister";
   }
 }
 
@@ -1669,7 +1649,6 @@ multiclass VST1D3WB<bits<4> op7_4, string Dt> {
     let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
     let Inst{5-4} = Rn{5-4};
     let DecoderMethod = "DecodeVLDST1Instruction";
-    let AsmMatchConverter = "cvtVSTwbFixed";
   }
   def _register : NLdSt<0,0b00,0b0110,op7_4, (outs GPR:$wb),
                         (ins addrmode6:$Rn, rGPR:$Rm, VecListThreeD:$Vd),
@@ -1678,7 +1657,6 @@ multiclass VST1D3WB<bits<4> op7_4, string Dt> {
                         "$Rn.addr = $wb", []> {
     let Inst{5-4} = Rn{5-4};
     let DecoderMethod = "DecodeVLDST1Instruction";
-    let AsmMatchConverter = "cvtVSTwbRegister";
   }
 }
 
@@ -1714,7 +1692,6 @@ multiclass VST1D4WB<bits<4> op7_4, string Dt> {
     let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
     let Inst{5-4} = Rn{5-4};
     let DecoderMethod = "DecodeVLDST1Instruction";
-    let AsmMatchConverter = "cvtVSTwbFixed";
   }
   def _register : NLdSt<0,0b00,0b0010,op7_4, (outs GPR:$wb),
                         (ins addrmode6:$Rn, rGPR:$Rm, VecListFourD:$Vd),
@@ -1723,7 +1700,6 @@ multiclass VST1D4WB<bits<4> op7_4, string Dt> {
                         "$Rn.addr = $wb", []> {
     let Inst{5-4} = Rn{5-4};
     let DecoderMethod = "DecodeVLDST1Instruction";
-    let AsmMatchConverter = "cvtVSTwbRegister";
   }
 }
 
@@ -1773,7 +1749,6 @@ multiclass VST2DWB<bits<4> op11_8, bits<4> op7_4, string Dt,
     let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
     let Inst{5-4} = Rn{5-4};
     let DecoderMethod = "DecodeVLDST2Instruction";
-    let AsmMatchConverter = "cvtVSTwbFixed";
   }
   def _register : NLdSt<0, 0b00, op11_8, op7_4, (outs GPR:$wb),
                         (ins addrmode6:$Rn, rGPR:$Rm, VdTy:$Vd), IIC_VLD1u,
@@ -1781,7 +1756,6 @@ multiclass VST2DWB<bits<4> op11_8, bits<4> op7_4, string Dt,
                         "$Rn.addr = $wb", []> {
     let Inst{5-4} = Rn{5-4};
     let DecoderMethod = "DecodeVLDST2Instruction";
-    let AsmMatchConverter = "cvtVSTwbRegister";
   }
 }
 multiclass VST2QWB<bits<4> op7_4, string Dt> {
@@ -1792,7 +1766,6 @@ multiclass VST2QWB<bits<4> op7_4, string Dt> {
     let Rm = 0b1101; // NLdSt will assign to the right encoding bits.
     let Inst{5-4} = Rn{5-4};
     let DecoderMethod = "DecodeVLDST2Instruction";
-    let AsmMatchConverter = "cvtVSTwbFixed";
   }
   def _register : NLdSt<0, 0b00, 0b0011, op7_4, (outs GPR:$wb),
                         (ins addrmode6:$Rn, rGPR:$Rm, VecListFourD:$Vd),
@@ -1801,7 +1774,6 @@ multiclass VST2QWB<bits<4> op7_4, string Dt> {
                         "$Rn.addr = $wb", []> {
     let Inst{5-4} = Rn{5-4};
     let DecoderMethod = "DecodeVLDST2Instruction";
-    let AsmMatchConverter = "cvtVSTwbRegister";
   }
 }
 
@@ -2379,6 +2351,21 @@ class N2VQInt<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
         (ins QPR:$Vm), itin, OpcodeStr, Dt, "$Vd, $Vm", "",
         [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$Vm))))]>;
 
+// Same as above, but not predicated.
+class N2VDIntnp<bits<2> op17_16, bits<3> op10_8, bit op7,
+              InstrItinClass itin, string OpcodeStr, string Dt,
+              ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
+  : N2Vnp<op17_16, op10_8, op7, 0,  (outs DPR:$Vd), (ins DPR:$Vm),
+          itin, OpcodeStr, Dt, ResTy, OpTy,
+          [(set DPR:$Vd, (ResTy (IntOp (OpTy DPR:$Vm))))]>;
+
+class N2VQIntnp<bits<2> op17_16, bits<3> op10_8, bit op7,
+              InstrItinClass itin, string OpcodeStr, string Dt,
+              ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
+  : N2Vnp<op17_16, op10_8, op7, 1,  (outs QPR:$Vd), (ins QPR:$Vm),
+          itin, OpcodeStr, Dt, ResTy, OpTy,
+          [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$Vm))))]>;
+
 // Narrow 2-register operations.
 class N2VN<bits<2> op24_23, bits<2> op21_20, bits<2> op19_18,
            bits<2> op17_16, bits<5> op11_7, bit op6, bit op4,
@@ -2541,6 +2528,16 @@ class N3VDInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
   let TwoOperandAliasConstraint = "$Vn = $Vd";
   let isCommutable = Commutable;
 }
+
+class N3VDIntnp<bits<5> op27_23, bits<2> op21_20, bits<4> op11_8, bit op6,
+                bit op4, Format f, InstrItinClass itin, string OpcodeStr,
+                string Dt, ValueType ResTy, ValueType OpTy,
+                SDPatternOperator IntOp, bit Commutable>
+  : N3Vnp<op27_23, op21_20, op11_8, op6, op4,
+          (outs DPR:$Vd), (ins DPR:$Vn, DPR:$Vm), f, itin, OpcodeStr, Dt,
+          ResTy, OpTy, IntOp, Commutable,
+          [(set DPR:$Vd, (ResTy (IntOp (OpTy DPR:$Vn), (OpTy DPR:$Vm))))]>;
+
 class N3VDIntSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
                 string OpcodeStr, string Dt, ValueType Ty, SDPatternOperator IntOp>
   : N3VLane32<0, 1, op21_20, op11_8, 1, 0,
@@ -2552,6 +2549,7 @@ class N3VDIntSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
                                            imm:$lane)))))]> {
   let isCommutable = 0;
 }
+
 class N3VDIntSL16<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
                   string OpcodeStr, string Dt, ValueType Ty, SDPatternOperator IntOp>
   : N3VLane16<0, 1, op21_20, op11_8, 1, 0,
@@ -2584,6 +2582,16 @@ class N3VQInt<bit op24, bit op23, bits<2> op21_20, bits<4> op11_8, bit op4,
   let TwoOperandAliasConstraint = "$Vn = $Vd";
   let isCommutable = Commutable;
 }
+
+class N3VQIntnp<bits<5> op27_23, bits<2> op21_20, bits<4> op11_8, bit op6,
+                bit op4, Format f, InstrItinClass itin, string OpcodeStr,
+                string Dt, ValueType ResTy, ValueType OpTy,
+                SDPatternOperator IntOp, bit Commutable>
+  : N3Vnp<op27_23, op21_20, op11_8, op6, op4,
+          (outs QPR:$Vd), (ins QPR:$Vn, QPR:$Vm), f, itin, OpcodeStr, Dt,
+          ResTy, OpTy, IntOp, Commutable,
+          [(set QPR:$Vd, (ResTy (IntOp (OpTy QPR:$Vn), (OpTy QPR:$Vm))))]>;
+
 class N3VQIntSL<bits<2> op21_20, bits<4> op11_8, InstrItinClass itin,
                 string OpcodeStr, string Dt,
                 ValueType ResTy, ValueType OpTy, SDPatternOperator IntOp>
@@ -4659,6 +4667,18 @@ def  VMAXfq   : N3VQInt<0, 0, 0b00, 0b1111, 0, N3RegFrm, IIC_VBINQ,
                         "vmax", "f32",
                         v4f32, v4f32, int_arm_neon_vmaxs, 1>;
 
+// VMAXNM
+let PostEncoderMethod = "NEONThumb2V8PostEncoder", DecoderNamespace = "v8NEON" in {
+  def VMAXNMND  : N3VDIntnp<0b00110, 0b00, 0b1111, 0, 1,
+                            N3RegFrm, NoItinerary, "vmaxnm", "f32",
+                            v2f32, v2f32, int_arm_neon_vmaxnm, 1>,
+                            Requires<[HasV8, HasNEON]>;
+  def VMAXNMNQ  : N3VQIntnp<0b00110, 0b00, 0b1111, 1, 1,
+                            N3RegFrm, NoItinerary, "vmaxnm", "f32",
+                            v4f32, v4f32, int_arm_neon_vmaxnm, 1>,
+                            Requires<[HasV8, HasNEON]>;
+}
+
 //   VMIN     : Vector Minimum
 defm VMINs    : N3VInt_QHS<0, 0, 0b0110, 1, N3RegFrm,
                            IIC_VSUBi4D, IIC_VSUBi4D, IIC_VSUBi4Q, IIC_VSUBi4Q,
@@ -4673,6 +4693,18 @@ def  VMINfq   : N3VQInt<0, 0, 0b10, 0b1111, 0, N3RegFrm, IIC_VBINQ,
                         "vmin", "f32",
                         v4f32, v4f32, int_arm_neon_vmins, 1>;
 
+// VMINNM
+let PostEncoderMethod = "NEONThumb2V8PostEncoder", DecoderNamespace = "v8NEON" in {
+  def VMINNMND  : N3VDIntnp<0b00110, 0b10, 0b1111, 0, 1,
+                            N3RegFrm, NoItinerary, "vminnm", "f32",
+                            v2f32, v2f32, int_arm_neon_vminnm, 1>,
+                            Requires<[HasV8, HasNEON]>;
+  def VMINNMNQ  : N3VQIntnp<0b00110, 0b10, 0b1111, 1, 1,
+                            N3RegFrm, NoItinerary, "vminnm", "f32",
+                            v4f32, v4f32, int_arm_neon_vminnm, 1>,
+                            Requires<[HasV8, HasNEON]>;
+}
+
 // Vector Pairwise Operations.
 
 //   VPADD    : Vector Pairwise Add
@@ -5386,6 +5418,26 @@ def  VCVTs2fq : N2VQ<0b11, 0b11, 0b10, 0b11, 0b01100, 0, "vcvt", "f32.s32",
 def  VCVTu2fq : N2VQ<0b11, 0b11, 0b10, 0b11, 0b01101, 0, "vcvt", "f32.u32",
                      v4f32, v4i32, uint_to_fp>;
 
+// VCVT{A, N, P, M}
+multiclass VCVT_FPI<string op, bits<3> op10_8, SDPatternOperator IntS,
+                    SDPatternOperator IntU> {
+  let PostEncoderMethod = "NEONThumb2V8PostEncoder", DecoderNamespace = "v8NEON" in {
+    def SD : N2VDIntnp<0b11, op10_8, 0, NoItinerary, !strconcat("vcvt", op),
+                       "s32.f32", v2i32, v2f32, IntS>, Requires<[HasV8, HasNEON]>;
+    def SQ : N2VQIntnp<0b11, op10_8, 0, NoItinerary, !strconcat("vcvt", op),
+                       "s32.f32", v4i32, v4f32, IntS>, Requires<[HasV8, HasNEON]>;
+    def UD : N2VDIntnp<0b11, op10_8, 1, NoItinerary, !strconcat("vcvt", op),
+                       "u32.f32", v2i32, v2f32, IntU>, Requires<[HasV8, HasNEON]>;
+    def UQ : N2VQIntnp<0b11, op10_8, 1, NoItinerary, !strconcat("vcvt", op),
+                       "u32.f32", v4i32, v4f32, IntU>, Requires<[HasV8, HasNEON]>;
+  }
+}
+
+defm VCVTAN : VCVT_FPI<"a", 0b000, int_arm_neon_vcvtas, int_arm_neon_vcvtau>;
+defm VCVTNN : VCVT_FPI<"n", 0b001, int_arm_neon_vcvtns, int_arm_neon_vcvtnu>;
+defm VCVTPN : VCVT_FPI<"p", 0b010, int_arm_neon_vcvtps, int_arm_neon_vcvtpu>;
+defm VCVTMN : VCVT_FPI<"m", 0b011, int_arm_neon_vcvtms, int_arm_neon_vcvtmu>;
+
 //   VCVT     : Vector Convert Between Floating-Point and Fixed-Point.
 let DecoderMethod = "DecodeVCVTD" in {
 def VCVTf2xsd : N2VCvtD<0, 1, 0b1111, 0, 1, "vcvt", "s32.f32",
@@ -5658,6 +5710,34 @@ def  VTBX4Pseudo
                 IIC_VTBX4, "$orig = $dst", []>;
 } // DecoderMethod = "DecodeTBLInstruction"
 
+// VRINT      : Vector Rounding
+multiclass VRINT_FPI<string op, bits<3> op9_7, SDPatternOperator Int> {
+  let PostEncoderMethod = "NEONThumb2V8PostEncoder", DecoderNamespace = "v8NEON" in {
+    def D : N2VDIntnp<0b10, 0b100, 0, NoItinerary,
+                      !strconcat("vrint", op), "f32",
+                      v2f32, v2f32, Int>, Requires<[HasV8, HasNEON]> {
+      let Inst{9-7} = op9_7;
+    }
+    def Q : N2VQIntnp<0b10, 0b100, 0, NoItinerary,
+                      !strconcat("vrint", op), "f32",
+                      v4f32, v4f32, Int>, Requires<[HasV8, HasNEON]> {
+      let Inst{9-7} = op9_7;
+    }
+  }
+
+  def : InstAlias<!strconcat("vrint", op, ".f32.f32\t$Dd, $Dm"),
+                  (!cast<Instruction>(NAME#"D") DPR:$Dd, DPR:$Dm)>;
+  def : InstAlias<!strconcat("vrint", op, ".f32.f32\t$Qd, $Qm"),
+                  (!cast<Instruction>(NAME#"Q") QPR:$Qd, QPR:$Qm)>;
+}
+
+defm VRINTNN : VRINT_FPI<"n", 0b000, int_arm_neon_vrintn>;
+defm VRINTXN : VRINT_FPI<"x", 0b001, int_arm_neon_vrintx>;
+defm VRINTAN : VRINT_FPI<"a", 0b010, int_arm_neon_vrinta>;
+defm VRINTZN : VRINT_FPI<"z", 0b011, int_arm_neon_vrintz>;
+defm VRINTMN : VRINT_FPI<"m", 0b101, int_arm_neon_vrintm>;
+defm VRINTPN : VRINT_FPI<"p", 0b111, int_arm_neon_vrintp>;
+
 //===----------------------------------------------------------------------===//
 // NEON instructions for single-precision FP math
 //===----------------------------------------------------------------------===//
@@ -6698,12 +6778,17 @@ def VST4qWB_register_Asm_32 :
                   (ins VecListFourQ:$list, addrmode6:$addr,
                        rGPR:$Rm, pred:$p)>;
 
-// VMOV takes an optional datatype suffix
+// VMOV/VMVN takes an optional datatype suffix
 defm : NEONDTAnyInstAlias<"vmov${p}", "$Vd, $Vm",
                          (VORRd DPR:$Vd, DPR:$Vm, DPR:$Vm, pred:$p)>;
 defm : NEONDTAnyInstAlias<"vmov${p}", "$Vd, $Vm",
                          (VORRq QPR:$Vd, QPR:$Vm, QPR:$Vm, pred:$p)>;
 
+defm : NEONDTAnyInstAlias<"vmvn${p}", "$Vd, $Vm",
+                         (VMVNd DPR:$Vd, DPR:$Vm, pred:$p)>;
+defm : NEONDTAnyInstAlias<"vmvn${p}", "$Vd, $Vm",
+                         (VMVNq QPR:$Vd, QPR:$Vm, pred:$p)>;
+
 // VCLT (register) is an assembler alias for VCGT w/ the operands reversed.
 // D-register versions.
 def : NEONInstAlias<"vcle${p}.s8 $Dd, $Dn, $Dm",
diff --git a/lib/Target/ARM/ARMInstrThumb.td b/lib/Target/ARM/ARMInstrThumb.td
index 1fff41d..e7218c6 100644
--- a/lib/Target/ARM/ARMInstrThumb.td
+++ b/lib/Target/ARM/ARMInstrThumb.td
@@ -69,11 +69,6 @@ def thumb_immshifted_shamt : SDNodeXForm<imm, [{
   return CurDAG->getTargetConstant(V, MVT::i32);
 }]>;
 
-// ADR instruction labels.
-def t_adrlabel : Operand<i32> {
-  let EncoderMethod = "getThumbAdrLabelOpValue";
-}
-
 // Scaled 4 immediate.
 def t_imm0_1020s4_asmoperand: AsmOperandClass { let Name = "Imm0_1020s4"; }
 def t_imm0_1020s4 : Operand<i32> {
@@ -97,12 +92,34 @@ def t_imm0_508s4_neg : Operand<i32> {
 
 // Define Thumb specific addressing modes.
 
+// unsigned 8-bit, 2-scaled memory offset
+class OperandUnsignedOffset_b8s2 : AsmOperandClass {
+  let Name = "UnsignedOffset_b8s2";
+  let PredicateMethod = "isUnsignedOffset<8, 2>";
+}
+
+def UnsignedOffset_b8s2 : OperandUnsignedOffset_b8s2;
+
+// thumb style PC relative operand. signed, 8 bits magnitude,
+// two bits shift. can be represented as either [pc, #imm], #imm,
+// or relocatable expression...
+def ThumbMemPC : AsmOperandClass {
+  let Name = "ThumbMemPC";
+}
+
 let OperandType = "OPERAND_PCREL" in {
 def t_brtarget : Operand<OtherVT> {
   let EncoderMethod = "getThumbBRTargetOpValue";
   let DecoderMethod = "DecodeThumbBROperand";
 }
 
+// ADR instruction labels.
+def t_adrlabel : Operand<i32> {
+  let EncoderMethod = "getThumbAdrLabelOpValue";
+  let PrintMethod = "printAdrLabelOperand<2>";
+  let ParserMatchClass = UnsignedOffset_b8s2;
+}
+
 def t_bcctarget : Operand<i32> {
   let EncoderMethod = "getThumbBCCTargetOpValue";
   let DecoderMethod = "DecodeThumbBCCTargetOperand";
@@ -122,6 +139,15 @@ def t_blxtarget : Operand<i32> {
   let EncoderMethod = "getThumbBLXTargetOpValue";
   let DecoderMethod = "DecodeThumbBLXOffset";
 }
+
+// t_addrmode_pc := <label> => pc + imm8 * 4
+//
+def t_addrmode_pc : Operand<i32> {
+  let EncoderMethod = "getAddrModePCOpValue";
+  let DecoderMethod = "DecodeThumbAddrModePC";
+  let PrintMethod = "printThumbLdrLabelOperand";
+  let ParserMatchClass = ThumbMemPC;
+}
 }
 
 // t_addrmode_rr := reg + reg
@@ -218,14 +244,6 @@ def t_addrmode_sp : Operand<i32>,
   let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm);
 }
 
-// t_addrmode_pc := <label> => pc + imm8 * 4
-//
-def t_addrmode_pc : Operand<i32> {
-  let EncoderMethod = "getAddrModePCOpValue";
-  let DecoderMethod = "DecodeThumbAddrModePC";
-  let PrintMethod = "printThumbLdrLabelOperand";
-}
-
 //===----------------------------------------------------------------------===//
 //  Miscellaneous Instructions.
 //
@@ -505,6 +523,7 @@ let isBranch = 1, isTerminator = 1 in
   let Inst{7-0} = target;
 }
 
+
 // Tail calls
 let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in {
   // IOS versions.
@@ -629,11 +648,9 @@ def tLDRspi : T1pIs<(outs tGPR:$Rt), (ins t_addrmode_sp:$addr), IIC_iLoad_i,
   let Inst{7-0} = addr;
 }
 
-// Load tconstpool
-// FIXME: Use ldr.n to work around a darwin assembler bug.
-let canFoldAsLoad = 1, isReMaterializable = 1, isCodeGenOnly = 1 in
+let canFoldAsLoad = 1, isReMaterializable = 1 in
 def tLDRpci : T1pIs<(outs tGPR:$Rt), (ins t_addrmode_pc:$addr), IIC_iLoad_i,
-                  "ldr", ".n\t$Rt, $addr",
+                  "ldr", "\t$Rt, $addr",
                   [(set tGPR:$Rt, (load (ARMWrapper tconstpool:$addr)))]>,
               T1Encoding<{0,1,0,0,1,?}> {
   // A6.2 & A8.6.59
@@ -643,17 +660,8 @@ def tLDRpci : T1pIs<(outs tGPR:$Rt), (ins t_addrmode_pc:$addr), IIC_iLoad_i,
   let Inst{7-0}  = addr;
 }
 
-// FIXME: Remove this entry when the above ldr.n workaround is fixed.
-// For assembly/disassembly use only.
-def tLDRpciASM : T1pIs<(outs tGPR:$Rt), (ins t_addrmode_pc:$addr), IIC_iLoad_i,
-                       "ldr", "\t$Rt, $addr", []>,
-                 T1Encoding<{0,1,0,0,1,?}> {
-  // A6.2 & A8.6.59
-  bits<3> Rt;
-  bits<8> addr;
-  let Inst{10-8} = Rt;
-  let Inst{7-0}  = addr;
-}
+def : tInstAlias<"ldr${p}.n $Rt, $addr", 
+                 (tLDRpci tGPR:$Rt, t_addrmode_pc:$addr, pred:$p), 0>;
 
 // A8.6.194 & A8.6.192
 defm tSTR  : thumb_st_rr_ri_enc<0b000, 0b0110, t_addrmode_rrs4,
diff --git a/lib/Target/ARM/ARMInstrThumb2.td b/lib/Target/ARM/ARMInstrThumb2.td
index ff21bf7..84086a5 100644
--- a/lib/Target/ARM/ARMInstrThumb2.td
+++ b/lib/Target/ARM/ARMInstrThumb2.td
@@ -173,14 +173,13 @@ def t2ldr_pcrel_imm12 : Operand<i32> {
 // ADR instruction labels.
 def t2adrlabel : Operand<i32> {
   let EncoderMethod = "getT2AdrLabelOpValue";
-  let PrintMethod = "printAdrLabelOperand";
+  let PrintMethod = "printAdrLabelOperand<0>";
 }
 
-
 // t2addrmode_posimm8  := reg + imm8
 def MemPosImm8OffsetAsmOperand : AsmOperandClass {let Name="MemPosImm8Offset";}
 def t2addrmode_posimm8 : Operand<i32> {
-  let PrintMethod = "printT2AddrModeImm8Operand";
+  let PrintMethod = "printT2AddrModeImm8Operand<false>";
   let EncoderMethod = "getT2AddrModeImm8OpValue";
   let DecoderMethod = "DecodeT2AddrModeImm8";
   let ParserMatchClass = MemPosImm8OffsetAsmOperand;
@@ -191,7 +190,7 @@ def t2addrmode_posimm8 : Operand<i32> {
 def MemNegImm8OffsetAsmOperand : AsmOperandClass {let Name="MemNegImm8Offset";}
 def t2addrmode_negimm8 : Operand<i32>,
                       ComplexPattern<i32, 2, "SelectT2AddrModeImm8", []> {
-  let PrintMethod = "printT2AddrModeImm8Operand";
+  let PrintMethod = "printT2AddrModeImm8Operand<false>";
   let EncoderMethod = "getT2AddrModeImm8OpValue";
   let DecoderMethod = "DecodeT2AddrModeImm8";
   let ParserMatchClass = MemNegImm8OffsetAsmOperand;
@@ -200,15 +199,22 @@ def t2addrmode_negimm8 : Operand<i32>,
 
 // t2addrmode_imm8  := reg +/- imm8
 def MemImm8OffsetAsmOperand : AsmOperandClass { let Name = "MemImm8Offset"; }
-def t2addrmode_imm8 : Operand<i32>,
-                      ComplexPattern<i32, 2, "SelectT2AddrModeImm8", []> {
-  let PrintMethod = "printT2AddrModeImm8Operand";
+class T2AddrMode_Imm8 : Operand<i32>,
+                        ComplexPattern<i32, 2, "SelectT2AddrModeImm8", []> {
   let EncoderMethod = "getT2AddrModeImm8OpValue";
   let DecoderMethod = "DecodeT2AddrModeImm8";
   let ParserMatchClass = MemImm8OffsetAsmOperand;
   let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm);
 }
 
+def t2addrmode_imm8 : T2AddrMode_Imm8 {
+  let PrintMethod = "printT2AddrModeImm8Operand<false>";
+}
+
+def t2addrmode_imm8_pre : T2AddrMode_Imm8 {
+  let PrintMethod = "printT2AddrModeImm8Operand<true>";
+}
+
 def t2am_imm8_offset : Operand<i32>,
                        ComplexPattern<i32, 1, "SelectT2AddrModeImm8Offset",
                                       [], [SDNPWantRoot]> {
@@ -219,14 +225,21 @@ def t2am_imm8_offset : Operand<i32>,
 
 // t2addrmode_imm8s4  := reg +/- (imm8 << 2)
 def MemImm8s4OffsetAsmOperand : AsmOperandClass {let Name = "MemImm8s4Offset";}
-def t2addrmode_imm8s4 : Operand<i32> {
-  let PrintMethod = "printT2AddrModeImm8s4Operand";
+class T2AddrMode_Imm8s4 : Operand<i32> {
   let EncoderMethod = "getT2AddrModeImm8s4OpValue";
   let DecoderMethod = "DecodeT2AddrModeImm8s4";
   let ParserMatchClass = MemImm8s4OffsetAsmOperand;
   let MIOperandInfo = (ops GPR:$base, i32imm:$offsimm);
 }
 
+def t2addrmode_imm8s4 : T2AddrMode_Imm8s4 {
+  let PrintMethod = "printT2AddrModeImm8s4Operand<false>";
+}
+
+def t2addrmode_imm8s4_pre : T2AddrMode_Imm8s4 {
+  let PrintMethod = "printT2AddrModeImm8s4Operand<true>";
+}
+
 def t2am_imm8s4_offset_asmoperand : AsmOperandClass { let Name = "Imm8s4"; }
 def t2am_imm8s4_offset : Operand<i32> {
   let PrintMethod = "printT2AddrModeImm8s4OffsetOperand";
@@ -238,7 +251,8 @@ def t2am_imm8s4_offset : Operand<i32> {
 def MemImm0_1020s4OffsetAsmOperand : AsmOperandClass {
   let Name = "MemImm0_1020s4Offset";
 }
-def t2addrmode_imm0_1020s4 : Operand<i32> {
+def t2addrmode_imm0_1020s4 : Operand<i32>,
+                         ComplexPattern<i32, 2, "SelectT2AddrModeExclusive"> {
   let PrintMethod = "printT2AddrModeImm0_1020s4Operand";
   let EncoderMethod = "getT2AddrModeImm0_1020s4OpValue";
   let DecoderMethod = "DecodeT2AddrModeImm0_1020s4";
@@ -959,6 +973,8 @@ multiclass T2I_ld<bit signed, bits<2> opcod, string opc,
     let Inst{19-16} = addr{16-13}; // Rn
     let Inst{15-12} = Rt;
     let Inst{11-0}  = addr{11-0};  // imm
+
+    let DecoderMethod = "DecodeT2LoadImm12";
   }
   def i8  : T2Ii8 <(outs target:$Rt), (ins t2addrmode_negimm8:$addr), iii,
                    opc, "\t$Rt, $addr",
@@ -979,6 +995,8 @@ multiclass T2I_ld<bit signed, bits<2> opcod, string opc,
     let Inst{9}     = addr{8};    // U
     let Inst{8} = 0; // The W bit.
     let Inst{7-0}   = addr{7-0};  // imm
+
+    let DecoderMethod = "DecodeT2LoadImm8";
   }
   def s   : T2Iso <(outs target:$Rt), (ins t2addrmode_so_reg:$addr), iis,
                    opc, ".w\t$Rt, $addr",
@@ -1011,14 +1029,18 @@ multiclass T2I_ld<bit signed, bits<2> opcod, string opc,
     let Inst{31-27} = 0b11111;
     let Inst{26-25} = 0b00;
     let Inst{24} = signed;
-    let Inst{23} = ?; // add = (U == '1')
     let Inst{22-21} = opcod;
     let Inst{20} = 1; // load
     let Inst{19-16} = 0b1111; // Rn
+
     bits<4> Rt;
-    bits<12> addr;
     let Inst{15-12} = Rt{3-0};
+
+    bits<13> addr;
+    let Inst{23} = addr{12}; // add = (U == '1')
     let Inst{11-0}  = addr{11-0};
+
+    let DecoderMethod = "DecodeT2LoadLabel";
   }
 }
 
@@ -1228,15 +1250,15 @@ defm t2LDR   : T2I_ld<0, 0b10, "ldr", IIC_iLoad_i, IIC_iLoad_si, GPR,
 
 // Loads with zero extension
 defm t2LDRH  : T2I_ld<0, 0b01, "ldrh", IIC_iLoad_bh_i, IIC_iLoad_bh_si,
-                      rGPR, UnOpFrag<(zextloadi16 node:$Src)>>;
+                      GPR, UnOpFrag<(zextloadi16 node:$Src)>>;
 defm t2LDRB  : T2I_ld<0, 0b00, "ldrb", IIC_iLoad_bh_i, IIC_iLoad_bh_si,
-                      rGPR, UnOpFrag<(zextloadi8  node:$Src)>>;
+                      GPR, UnOpFrag<(zextloadi8  node:$Src)>>;
 
 // Loads with sign extension
 defm t2LDRSH : T2I_ld<1, 0b01, "ldrsh", IIC_iLoad_bh_i, IIC_iLoad_bh_si,
-                      rGPR, UnOpFrag<(sextloadi16 node:$Src)>>;
+                      GPR, UnOpFrag<(sextloadi16 node:$Src)>>;
 defm t2LDRSB : T2I_ld<1, 0b00, "ldrsb", IIC_iLoad_bh_i, IIC_iLoad_bh_si,
-                      rGPR, UnOpFrag<(sextloadi8  node:$Src)>>;
+                      GPR, UnOpFrag<(sextloadi8  node:$Src)>>;
 
 let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1 in {
 // Load doubleword
@@ -1294,12 +1316,9 @@ def : T2Pat<(extloadi16 (ARMWrapper tconstpool:$addr)),
 
 let mayLoad = 1, neverHasSideEffects = 1 in {
 def t2LDR_PRE  : T2Ipreldst<0, 0b10, 1, 1, (outs GPR:$Rt, GPR:$Rn_wb),
-                            (ins t2addrmode_imm8:$addr),
+                            (ins t2addrmode_imm8_pre:$addr),
                             AddrModeT2_i8, IndexModePre, IIC_iLoad_iu,
-                            "ldr", "\t$Rt, $addr!", "$addr.base = $Rn_wb",
-                            []> {
-  let AsmMatchConverter = "cvtLdWriteBackRegT2AddrModeImm8";
-}
+                            "ldr", "\t$Rt, $addr!", "$addr.base = $Rn_wb", []>;
 
 def t2LDR_POST : T2Ipostldst<0, 0b10, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb),
                           (ins addr_offset_none:$Rn, t2am_imm8_offset:$offset),
@@ -1307,48 +1326,42 @@ def t2LDR_POST : T2Ipostldst<0, 0b10, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb),
                           "ldr", "\t$Rt, $Rn$offset", "$Rn = $Rn_wb", []>;
 
 def t2LDRB_PRE : T2Ipreldst<0, 0b00, 1, 1, (outs GPR:$Rt, GPR:$Rn_wb),
-                            (ins t2addrmode_imm8:$addr),
+                            (ins t2addrmode_imm8_pre:$addr),
                             AddrModeT2_i8, IndexModePre, IIC_iLoad_bh_iu,
-                            "ldrb", "\t$Rt, $addr!", "$addr.base = $Rn_wb",
-                            []> {
-  let AsmMatchConverter = "cvtLdWriteBackRegT2AddrModeImm8";
-}
+                            "ldrb", "\t$Rt, $addr!", "$addr.base = $Rn_wb", []>;
+
 def t2LDRB_POST : T2Ipostldst<0, 0b00, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb),
                           (ins addr_offset_none:$Rn, t2am_imm8_offset:$offset),
                           AddrModeT2_i8, IndexModePost, IIC_iLoad_bh_iu,
                           "ldrb", "\t$Rt, $Rn$offset", "$Rn = $Rn_wb", []>;
 
 def t2LDRH_PRE : T2Ipreldst<0, 0b01, 1, 1, (outs GPR:$Rt, GPR:$Rn_wb),
-                            (ins t2addrmode_imm8:$addr),
+                            (ins t2addrmode_imm8_pre:$addr),
                             AddrModeT2_i8, IndexModePre, IIC_iLoad_bh_iu,
-                            "ldrh", "\t$Rt, $addr!", "$addr.base = $Rn_wb",
-                            []> {
-  let AsmMatchConverter = "cvtLdWriteBackRegT2AddrModeImm8";
-}
+                            "ldrh", "\t$Rt, $addr!", "$addr.base = $Rn_wb", []>;
+
 def t2LDRH_POST : T2Ipostldst<0, 0b01, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb),
                           (ins addr_offset_none:$Rn, t2am_imm8_offset:$offset),
                           AddrModeT2_i8, IndexModePost, IIC_iLoad_bh_iu,
                           "ldrh", "\t$Rt, $Rn$offset", "$Rn = $Rn_wb", []>;
 
 def t2LDRSB_PRE : T2Ipreldst<1, 0b00, 1, 1, (outs GPR:$Rt, GPR:$Rn_wb),
-                            (ins t2addrmode_imm8:$addr),
+                            (ins t2addrmode_imm8_pre:$addr),
                             AddrModeT2_i8, IndexModePre, IIC_iLoad_bh_iu,
                             "ldrsb", "\t$Rt, $addr!", "$addr.base = $Rn_wb",
-                            []> {
-  let AsmMatchConverter = "cvtLdWriteBackRegT2AddrModeImm8";
-}
+                            []>;
+
 def t2LDRSB_POST : T2Ipostldst<1, 0b00, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb),
                           (ins addr_offset_none:$Rn, t2am_imm8_offset:$offset),
                           AddrModeT2_i8, IndexModePost, IIC_iLoad_bh_iu,
                           "ldrsb", "\t$Rt, $Rn$offset", "$Rn = $Rn_wb", []>;
 
 def t2LDRSH_PRE : T2Ipreldst<1, 0b01, 1, 1, (outs GPR:$Rt, GPR:$Rn_wb),
-                            (ins t2addrmode_imm8:$addr),
+                            (ins t2addrmode_imm8_pre:$addr),
                             AddrModeT2_i8, IndexModePre, IIC_iLoad_bh_iu,
                             "ldrsh", "\t$Rt, $addr!", "$addr.base = $Rn_wb",
-                            []> {
-  let AsmMatchConverter = "cvtLdWriteBackRegT2AddrModeImm8";
-}
+                            []>;
+
 def t2LDRSH_POST : T2Ipostldst<1, 0b01, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb),
                           (ins addr_offset_none:$Rn, t2am_imm8_offset:$offset),
                           AddrModeT2_i8, IndexModePost, IIC_iLoad_bh_iu,
@@ -1373,6 +1386,8 @@ class T2IldT<bit signed, bits<2> type, string opc, InstrItinClass ii>
   let Inst{11} = 1;
   let Inst{10-8} = 0b110; // PUW.
   let Inst{7-0} = addr{7-0};
+
+  let DecoderMethod = "DecodeT2LoadT";
 }
 
 def t2LDRT   : T2IldT<0, 0b10, "ldrt", IIC_iLoad_i>;
@@ -1399,27 +1414,22 @@ def t2STRDi8 : T2Ii8s4<1, 0, 0, (outs),
 
 let mayStore = 1, neverHasSideEffects = 1 in {
 def t2STR_PRE  : T2Ipreldst<0, 0b10, 0, 1, (outs GPRnopc:$Rn_wb),
-                            (ins GPRnopc:$Rt, t2addrmode_imm8:$addr),
+                            (ins GPRnopc:$Rt, t2addrmode_imm8_pre:$addr),
                             AddrModeT2_i8, IndexModePre, IIC_iStore_iu,
                             "str", "\t$Rt, $addr!",
-                            "$addr.base = $Rn_wb,@earlyclobber $Rn_wb", []> {
-  let AsmMatchConverter = "cvtStWriteBackRegT2AddrModeImm8";
-}
+                            "$addr.base = $Rn_wb,@earlyclobber $Rn_wb", []>;
+
 def t2STRH_PRE  : T2Ipreldst<0, 0b01, 0, 1, (outs GPRnopc:$Rn_wb),
-                            (ins rGPR:$Rt, t2addrmode_imm8:$addr),
+                            (ins rGPR:$Rt, t2addrmode_imm8_pre:$addr),
                             AddrModeT2_i8, IndexModePre, IIC_iStore_iu,
                         "strh", "\t$Rt, $addr!",
-                        "$addr.base = $Rn_wb,@earlyclobber $Rn_wb", []> {
-  let AsmMatchConverter = "cvtStWriteBackRegT2AddrModeImm8";
-}
+                        "$addr.base = $Rn_wb,@earlyclobber $Rn_wb", []>;
 
 def t2STRB_PRE  : T2Ipreldst<0, 0b00, 0, 1, (outs GPRnopc:$Rn_wb),
-                            (ins rGPR:$Rt, t2addrmode_imm8:$addr),
+                            (ins rGPR:$Rt, t2addrmode_imm8_pre:$addr),
                             AddrModeT2_i8, IndexModePre, IIC_iStore_bh_iu,
                         "strb", "\t$Rt, $addr!",
-                        "$addr.base = $Rn_wb,@earlyclobber $Rn_wb", []> {
-  let AsmMatchConverter = "cvtStWriteBackRegT2AddrModeImm8";
-}
+                        "$addr.base = $Rn_wb,@earlyclobber $Rn_wb", []>;
 } // mayStore = 1, neverHasSideEffects = 1
 
 def t2STR_POST : T2Ipostldst<0, 0b10, 0, 0, (outs GPRnopc:$Rn_wb),
@@ -1506,9 +1516,8 @@ def t2STRHT  : T2IstT<0b01, "strht", IIC_iStore_bh_i>;
 // For disassembly only.
 
 def t2LDRD_PRE  : T2Ii8s4<1, 1, 1, (outs rGPR:$Rt, rGPR:$Rt2, GPR:$wb),
-                 (ins t2addrmode_imm8s4:$addr), IIC_iLoad_d_ru,
+                 (ins t2addrmode_imm8s4_pre:$addr), IIC_iLoad_d_ru,
                  "ldrd", "\t$Rt, $Rt2, $addr!", "$addr.base = $wb", []> {
-  let AsmMatchConverter = "cvtT2LdrdPre";
   let DecoderMethod = "DecodeT2LDRDPreInstruction";
 }
 
@@ -1518,10 +1527,9 @@ def t2LDRD_POST : T2Ii8s4post<0, 1, 1, (outs rGPR:$Rt, rGPR:$Rt2, GPR:$wb),
                  "$addr.base = $wb", []>;
 
 def t2STRD_PRE  : T2Ii8s4<1, 1, 0, (outs GPR:$wb),
-                 (ins rGPR:$Rt, rGPR:$Rt2, t2addrmode_imm8s4:$addr),
+                 (ins rGPR:$Rt, rGPR:$Rt2, t2addrmode_imm8s4_pre:$addr),
                  IIC_iStore_d_ru, "strd", "\t$Rt, $Rt2, $addr!",
                  "$addr.base = $wb", []> {
-  let AsmMatchConverter = "cvtT2StrdPre";
   let DecoderMethod = "DecodeT2STRDPreInstruction";
 }
 
@@ -1543,16 +1551,17 @@ multiclass T2Ipl<bits<1> write, bits<1> instr, string opc> {
               Sched<[WritePreLd]> {
     let Inst{31-25} = 0b1111100;
     let Inst{24} = instr;
+    let Inst{23} = 1;
     let Inst{22} = 0;
     let Inst{21} = write;
     let Inst{20} = 1;
     let Inst{15-12} = 0b1111;
 
     bits<17> addr;
-    let addr{12}    = 1;           // add = TRUE
     let Inst{19-16} = addr{16-13}; // Rn
-    let Inst{23}    = addr{12};    // U
     let Inst{11-0}  = addr{11-0};  // imm12
+
+    let DecoderMethod = "DecodeT2LoadImm12";
   }
 
   def i8 : T2Ii8<(outs), (ins t2addrmode_negimm8:$addr), IIC_Preload, opc,
@@ -1571,6 +1580,8 @@ multiclass T2Ipl<bits<1> write, bits<1> instr, string opc> {
     bits<13> addr;
     let Inst{19-16} = addr{12-9}; // Rn
     let Inst{7-0}   = addr{7-0};  // imm8
+
+    let DecoderMethod = "DecodeT2LoadImm8";
   }
 
   def s : T2Iso<(outs), (ins t2addrmode_so_reg:$addr), IIC_Preload, opc,
@@ -1584,7 +1595,7 @@ multiclass T2Ipl<bits<1> write, bits<1> instr, string opc> {
     let Inst{21} = write;
     let Inst{20} = 1;
     let Inst{15-12} = 0b1111;
-    let Inst{11-6} = 0000000;
+    let Inst{11-6} = 0b000000;
 
     bits<10> addr;
     let Inst{19-16} = addr{9-6}; // Rn
@@ -1593,15 +1604,33 @@ multiclass T2Ipl<bits<1> write, bits<1> instr, string opc> {
 
     let DecoderMethod = "DecodeT2LoadShift";
   }
-  // FIXME: We should have a separate 'pci' variant here. As-is we represent
-  // it via the i12 variant, which it's related to, but that means we can
-  // represent negative immediates, which aren't legal for anything except
-  // the 'pci' case (Rn == 15).
 }
 
-defm t2PLD  : T2Ipl<0, 0, "pld">,  Requires<[IsThumb2]>;
-defm t2PLDW : T2Ipl<1, 0, "pldw">, Requires<[IsThumb2,HasV7,HasMP]>;
-defm t2PLI  : T2Ipl<0, 1, "pli">,  Requires<[IsThumb2,HasV7]>;
+defm t2PLD    : T2Ipl<0, 0, "pld">,  Requires<[IsThumb2]>;
+defm t2PLDW   : T2Ipl<1, 0, "pldw">, Requires<[IsThumb2,HasV7,HasMP]>;
+defm t2PLI    : T2Ipl<0, 1, "pli">,  Requires<[IsThumb2,HasV7]>;
+
+// pci variant is very similar to i12, but supports negative offsets
+// from the PC. Only PLD and PLI have pci variants (not PLDW)
+class T2Iplpci<bits<1> inst, string opc> : T2Iso<(outs), (ins t2ldrlabel:$addr),
+               IIC_Preload, opc, "\t$addr", 
+               [(ARMPreload (ARMWrapper tconstpool:$addr),
+                (i32 0), (i32 inst))]>, Sched<[WritePreLd]> {
+  let Inst{31-25} = 0b1111100;
+  let Inst{24} = inst;
+  let Inst{22-20} = 0b001;
+  let Inst{19-16} = 0b1111;
+  let Inst{15-12} = 0b1111;
+
+  bits<13> addr;
+  let Inst{23}   = addr{12};   // add = (U == '1')
+  let Inst{11-0} = addr{11-0}; // imm12
+
+  let DecoderMethod = "DecodeT2LoadLabel";
+}
+
+def t2PLDpci : T2Iplpci<0, "pld">,  Requires<[IsThumb2]>;
+def t2PLIpci : T2Iplpci<1, "pli">,  Requires<[IsThumb2,HasV7]>;
 
 //===----------------------------------------------------------------------===//
 //  Load / store multiple Instructions.
@@ -2907,7 +2936,12 @@ def t2PKHTB : T2ThreeReg<
 
 // Alternate cases for PKHTB where identities eliminate some nodes.  Note that
 // a shift amount of 0 is *not legal* here, it is PKHBT instead.
-def : T2Pat<(or (and rGPR:$src1, 0xFFFF0000), (srl rGPR:$src2, imm16_31:$sh)),
+// We also can not replace a srl (17..31) by an arithmetic shift we would use in
+// pkhtb src1, src2, asr (17..31).
+def : T2Pat<(or (and rGPR:$src1, 0xFFFF0000), (srl rGPR:$src2, imm16:$sh)),
+            (t2PKHTB rGPR:$src1, rGPR:$src2, imm16:$sh)>,
+            Requires<[HasT2ExtractPack, IsThumb2]>;
+def : T2Pat<(or (and rGPR:$src1, 0xFFFF0000), (sra rGPR:$src2, imm16_31:$sh)),
             (t2PKHTB rGPR:$src1, rGPR:$src2, imm16_31:$sh)>,
             Requires<[HasT2ExtractPack, IsThumb2]>;
 def : T2Pat<(or (and rGPR:$src1, 0xFFFF0000),
@@ -3092,26 +3126,24 @@ def t2MOVCCror : T2I_movcc_sh<0b11, (outs rGPR:$Rd),
 
 // memory barriers protect the atomic sequences
 let hasSideEffects = 1 in {
-def t2DMB : AInoP<(outs), (ins memb_opt:$opt), ThumbFrm, NoItinerary,
-                  "dmb", "\t$opt", [(ARMMemBarrier (i32 imm:$opt))]>,
-                  Requires<[IsThumb, HasDB]> {
+def t2DMB : T2I<(outs), (ins memb_opt:$opt), NoItinerary,
+                "dmb", "\t$opt", [(ARMMemBarrier (i32 imm:$opt))]>,
+                Requires<[HasDB]> {
   bits<4> opt;
   let Inst{31-4} = 0xf3bf8f5;
   let Inst{3-0} = opt;
 }
 }
 
-def t2DSB : AInoP<(outs), (ins memb_opt:$opt), ThumbFrm, NoItinerary,
-                  "dsb", "\t$opt", []>,
-                  Requires<[IsThumb, HasDB]> {
+def t2DSB : T2I<(outs), (ins memb_opt:$opt), NoItinerary,
+                "dsb", "\t$opt", []>, Requires<[HasDB]> {
   bits<4> opt;
   let Inst{31-4} = 0xf3bf8f4;
   let Inst{3-0} = opt;
 }
 
-def t2ISB : AInoP<(outs), (ins instsyncb_opt:$opt), ThumbFrm, NoItinerary,
-                  "isb", "\t$opt",
-                  []>, Requires<[IsThumb, HasDB]> {
+def t2ISB : T2I<(outs), (ins instsyncb_opt:$opt), NoItinerary,
+                "isb", "\t$opt", []>, Requires<[HasDB]> {
   bits<4> opt;
   let Inst{31-4} = 0xf3bf8f6;
   let Inst{3-0} = opt;
@@ -3154,13 +3186,16 @@ class T2I_strex<bits<2> opcod, dag oops, dag iops, AddrMode am, int sz,
 let mayLoad = 1 in {
 def t2LDREXB : T2I_ldrex<0b00, (outs rGPR:$Rt), (ins addr_offset_none:$addr),
                          AddrModeNone, 4, NoItinerary,
-                         "ldrexb", "\t$Rt, $addr", "", []>;
+                         "ldrexb", "\t$Rt, $addr", "",
+                         [(set rGPR:$Rt, (ldrex_1 addr_offset_none:$addr))]>;
 def t2LDREXH : T2I_ldrex<0b01, (outs rGPR:$Rt), (ins addr_offset_none:$addr),
                          AddrModeNone, 4, NoItinerary,
-                         "ldrexh", "\t$Rt, $addr", "", []>;
+                         "ldrexh", "\t$Rt, $addr", "",
+                         [(set rGPR:$Rt, (ldrex_2 addr_offset_none:$addr))]>;
 def t2LDREX  : Thumb2I<(outs rGPR:$Rt), (ins t2addrmode_imm0_1020s4:$addr),
                        AddrModeNone, 4, NoItinerary,
-                       "ldrex", "\t$Rt, $addr", "", []> {
+                       "ldrex", "\t$Rt, $addr", "",
+                     [(set rGPR:$Rt, (ldrex_4 t2addrmode_imm0_1020s4:$addr))]> {
   bits<4> Rt;
   bits<12> addr;
   let Inst{31-27} = 0b11101;
@@ -3185,16 +3220,22 @@ let mayStore = 1, Constraints = "@earlyclobber $Rd" in {
 def t2STREXB : T2I_strex<0b00, (outs rGPR:$Rd),
                          (ins rGPR:$Rt, addr_offset_none:$addr),
                          AddrModeNone, 4, NoItinerary,
-                         "strexb", "\t$Rd, $Rt, $addr", "", []>;
+                         "strexb", "\t$Rd, $Rt, $addr", "",
+                         [(set rGPR:$Rd, (strex_1 rGPR:$Rt,
+                                                  addr_offset_none:$addr))]>;
 def t2STREXH : T2I_strex<0b01, (outs rGPR:$Rd),
                          (ins rGPR:$Rt, addr_offset_none:$addr),
                          AddrModeNone, 4, NoItinerary,
-                         "strexh", "\t$Rd, $Rt, $addr", "", []>;
+                         "strexh", "\t$Rd, $Rt, $addr", "",
+                         [(set rGPR:$Rd, (strex_2 rGPR:$Rt,
+                                                  addr_offset_none:$addr))]>;
+
 def t2STREX  : Thumb2I<(outs rGPR:$Rd), (ins rGPR:$Rt,
                              t2addrmode_imm0_1020s4:$addr),
                   AddrModeNone, 4, NoItinerary,
                   "strex", "\t$Rd, $Rt, $addr", "",
-                  []> {
+                  [(set rGPR:$Rd, (strex_4 rGPR:$Rt,
+                                           t2addrmode_imm0_1020s4:$addr))]> {
   bits<4> Rd;
   bits<4> Rt;
   bits<12> addr;
@@ -3216,7 +3257,7 @@ def t2STREXD : T2I_strex<0b11, (outs rGPR:$Rd),
 }
 }
 
-def t2CLREX : T2I<(outs), (ins), NoItinerary, "clrex", "", []>,
+def t2CLREX : T2I<(outs), (ins), NoItinerary, "clrex", "", [(int_arm_clrex)]>,
             Requires<[IsThumb2, HasV7]>  {
   let Inst{31-16} = 0xf3bf;
   let Inst{15-14} = 0b10;
@@ -3227,6 +3268,15 @@ def t2CLREX : T2I<(outs), (ins), NoItinerary, "clrex", "", []>,
   let Inst{3-0} = 0b1111;
 }
 
+def : T2Pat<(and (ldrex_1 addr_offset_none:$addr), 0xff),
+            (t2LDREXB addr_offset_none:$addr)>;
+def : T2Pat<(and (ldrex_2 addr_offset_none:$addr), 0xffff),
+            (t2LDREXH addr_offset_none:$addr)>;
+def : T2Pat<(strex_1 (and GPR:$Rt, 0xff), addr_offset_none:$addr),
+            (t2STREXB GPR:$Rt, addr_offset_none:$addr)>;
+def : T2Pat<(strex_2 (and GPR:$Rt, 0xffff), addr_offset_none:$addr),
+            (t2STREXH GPR:$Rt, addr_offset_none:$addr)>;
+
 //===----------------------------------------------------------------------===//
 // SJLJ Exception handling intrinsics
 //   eh_sjlj_setjmp() is an instruction sequence to store the return
@@ -3549,6 +3599,16 @@ def t2RFEIA  : T2RFE<0b111010011001,
                    (outs), (ins GPR:$Rn), NoItinerary, "rfeia", "\t$Rn",
                    [/* For disassembly only; pattern left blank */]>;
 
+// B9.3.19 SUBS PC, LR, #imm (Thumb2) system instruction.
+let Defs = [PC], Uses = [LR] in
+def t2SUBS_PC_LR : T2I <(outs), (ins imm0_255:$imm), NoItinerary,
+                   "subs", "\tpc, lr, $imm", []>, Requires<[IsThumb2]> {
+  let Inst{31-8} = 0b111100111101111010001111;
+
+  bits<8> imm;
+  let Inst{7-0} = imm;
+}
+
 //===----------------------------------------------------------------------===//
 // Non-Instruction Patterns
 //
@@ -3632,7 +3692,7 @@ multiclass t2LdStCop<bits<4> op31_28, bit load, bit Dbit, string asm> {
     let DecoderMethod = "DecodeCopMemInstruction";
   }
   def _PRE : T2CI<op31_28,
-                  (outs), (ins p_imm:$cop, c_imm:$CRd, addrmode5:$addr),
+                  (outs), (ins p_imm:$cop, c_imm:$CRd, addrmode5_pre:$addr),
                   asm, "\t$cop, $CRd, $addr!"> {
     bits<13> addr;
     bits<4> cop;
@@ -3783,8 +3843,7 @@ def t2MSR_M : T2I<(outs), (ins msr_mask:$SYSm, rGPR:$Rn),
 
 class t2MovRCopro<bits<4> Op, string opc, bit direction, dag oops, dag iops,
                   list<dag> pattern>
-  : T2Cop<Op, oops, iops,
-          !strconcat(opc, "\t$cop, $opc1, $Rt, $CRn, $CRm, $opc2"),
+  : T2Cop<Op, oops, iops, opc, "\t$cop, $opc1, $Rt, $CRn, $CRm, $opc2",
           pattern> {
   let Inst{27-24} = 0b1110;
   let Inst{20} = direction;
@@ -3809,7 +3868,7 @@ class t2MovRRCopro<bits<4> Op, string opc, bit direction,
                    list<dag> pattern = []>
   : T2Cop<Op, (outs),
           (ins p_imm:$cop, imm0_15:$opc1, GPR:$Rt, GPR:$Rt2, c_imm:$CRm),
-          !strconcat(opc, "\t$cop, $opc1, $Rt, $Rt2, $CRm"), pattern> {
+          opc, "\t$cop, $opc1, $Rt, $Rt2, $CRm", pattern> {
   let Inst{27-24} = 0b1100;
   let Inst{23-21} = 0b010;
   let Inst{20} = direction;
@@ -3834,32 +3893,32 @@ def t2MCR : t2MovRCopro<0b1110, "mcr", 0,
                 c_imm:$CRm, imm0_7:$opc2),
            [(int_arm_mcr imm:$cop, imm:$opc1, GPR:$Rt, imm:$CRn,
                          imm:$CRm, imm:$opc2)]>;
-def : t2InstAlias<"mcr $cop, $opc1, $Rt, $CRn, $CRm",
+def : t2InstAlias<"mcr${p} $cop, $opc1, $Rt, $CRn, $CRm",
                   (t2MCR p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn,
-                         c_imm:$CRm, 0)>;
+                         c_imm:$CRm, 0, pred:$p)>;
 def t2MCR2 : t2MovRCopro<0b1111, "mcr2", 0,
              (outs), (ins p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn,
                           c_imm:$CRm, imm0_7:$opc2),
              [(int_arm_mcr2 imm:$cop, imm:$opc1, GPR:$Rt, imm:$CRn,
                             imm:$CRm, imm:$opc2)]>;
-def : t2InstAlias<"mcr2 $cop, $opc1, $Rt, $CRn, $CRm",
+def : t2InstAlias<"mcr2${p} $cop, $opc1, $Rt, $CRn, $CRm",
                   (t2MCR2 p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn,
-                          c_imm:$CRm, 0)>;
+                          c_imm:$CRm, 0, pred:$p)>;
 
 /* from coprocessor to ARM core register */
 def t2MRC : t2MovRCopro<0b1110, "mrc", 1,
-             (outs GPR:$Rt), (ins p_imm:$cop, imm0_7:$opc1, c_imm:$CRn,
+             (outs GPRwithAPSR:$Rt), (ins p_imm:$cop, imm0_7:$opc1, c_imm:$CRn,
                                   c_imm:$CRm, imm0_7:$opc2), []>;
-def : t2InstAlias<"mrc $cop, $opc1, $Rt, $CRn, $CRm",
-                  (t2MRC GPR:$Rt, p_imm:$cop, imm0_7:$opc1, c_imm:$CRn,
-                         c_imm:$CRm, 0)>;
+def : t2InstAlias<"mrc${p} $cop, $opc1, $Rt, $CRn, $CRm",
+                  (t2MRC GPRwithAPSR:$Rt, p_imm:$cop, imm0_7:$opc1, c_imm:$CRn,
+                         c_imm:$CRm, 0, pred:$p)>;
 
 def t2MRC2 : t2MovRCopro<0b1111, "mrc2", 1,
-             (outs GPR:$Rt), (ins p_imm:$cop, imm0_7:$opc1, c_imm:$CRn,
+             (outs GPRwithAPSR:$Rt), (ins p_imm:$cop, imm0_7:$opc1, c_imm:$CRn,
                                   c_imm:$CRm, imm0_7:$opc2), []>;
-def : t2InstAlias<"mrc2 $cop, $opc1, $Rt, $CRn, $CRm",
-                  (t2MRC2 GPR:$Rt, p_imm:$cop, imm0_7:$opc1, c_imm:$CRn,
-                          c_imm:$CRm, 0)>;
+def : t2InstAlias<"mrc2${p} $cop, $opc1, $Rt, $CRn, $CRm",
+                  (t2MRC2 GPRwithAPSR:$Rt, p_imm:$cop, imm0_7:$opc1, c_imm:$CRn,
+                          c_imm:$CRm, 0, pred:$p)>;
 
 def : T2v6Pat<(int_arm_mrc  imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2),
               (t2MRC imm:$cop, imm:$opc1, imm:$CRn, imm:$CRm, imm:$opc2)>;
@@ -3886,7 +3945,7 @@ def t2MRRC2 : t2MovRRCopro<0b1111, "mrrc2", 1>;
 
 def tCDP : T2Cop<0b1110, (outs), (ins p_imm:$cop, imm0_15:$opc1,
                  c_imm:$CRd, c_imm:$CRn, c_imm:$CRm, imm0_7:$opc2),
-                 "cdp\t$cop, $opc1, $CRd, $CRn, $CRm, $opc2",
+                 "cdp", "\t$cop, $opc1, $CRd, $CRn, $CRm, $opc2",
                  [(int_arm_cdp imm:$cop, imm:$opc1, imm:$CRd, imm:$CRn,
                                imm:$CRm, imm:$opc2)]> {
   let Inst{27-24} = 0b1110;
@@ -3909,7 +3968,7 @@ def tCDP : T2Cop<0b1110, (outs), (ins p_imm:$cop, imm0_15:$opc1,
 
 def t2CDP2 : T2Cop<0b1111, (outs), (ins p_imm:$cop, imm0_15:$opc1,
                    c_imm:$CRd, c_imm:$CRn, c_imm:$CRm, imm0_7:$opc2),
-                   "cdp2\t$cop, $opc1, $CRd, $CRn, $CRm, $opc2",
+                   "cdp2", "\t$cop, $opc1, $CRd, $CRn, $CRm, $opc2",
                    [(int_arm_cdp2 imm:$cop, imm:$opc1, imm:$CRd, imm:$CRn,
                                   imm:$CRm, imm:$opc2)]> {
   let Inst{27-24} = 0b1110;
@@ -4097,9 +4156,9 @@ def : t2InstAlias<"tst${p} $Rn, $Rm",
                   (t2TSTrr GPRnopc:$Rn, rGPR:$Rm, pred:$p)>;
 
 // Memory barriers
-def : InstAlias<"dmb", (t2DMB 0xf)>, Requires<[IsThumb, HasDB]>;
-def : InstAlias<"dsb", (t2DSB 0xf)>, Requires<[IsThumb, HasDB]>;
-def : InstAlias<"isb", (t2ISB 0xf)>, Requires<[IsThumb, HasDB]>;
+def : InstAlias<"dmb${p}", (t2DMB 0xf, pred:$p)>, Requires<[IsThumb2, HasDB]>;
+def : InstAlias<"dsb${p}", (t2DSB 0xf, pred:$p)>, Requires<[IsThumb2, HasDB]>;
+def : InstAlias<"isb${p}", (t2ISB 0xf, pred:$p)>, Requires<[IsThumb2, HasDB]>;
 
 // Alias for LDR, LDRB, LDRH, LDRSB, and LDRSH without the ".w" optional
 // width specifier.
@@ -4350,7 +4409,7 @@ def t2LDRSHpcrel  : t2AsmPseudo<"ldrsh${p} $Rt, $addr",
                          (ins GPRnopc:$Rt, t2ldr_pcrel_imm12:$addr, pred:$p)>;
     // Version w/ the .w suffix.
 def : t2InstAlias<"ldr${p}.w $Rt, $addr",
-                  (t2LDRpcrel GPRnopc:$Rt, t2ldr_pcrel_imm12:$addr, pred:$p)>;
+                  (t2LDRpcrel GPRnopc:$Rt, t2ldr_pcrel_imm12:$addr, pred:$p), 0>;
 def : t2InstAlias<"ldrb${p}.w $Rt, $addr",
                   (t2LDRBpcrel GPRnopc:$Rt, t2ldr_pcrel_imm12:$addr, pred:$p)>;
 def : t2InstAlias<"ldrh${p}.w $Rt, $addr",
@@ -4362,3 +4421,10 @@ def : t2InstAlias<"ldrsh${p}.w $Rt, $addr",
 
 def : t2InstAlias<"add${p} $Rd, pc, $imm",
                   (t2ADR rGPR:$Rd, imm0_4095:$imm, pred:$p)>;
+
+// PLD/PLDW/PLI with alternate literal form.
+def : t2InstAlias<"pld${p} $addr",
+                  (t2PLDpci t2ldr_pcrel_imm12:$addr, pred:$p)>;
+def : InstAlias<"pli${p} $addr",
+                 (t2PLIpci  t2ldr_pcrel_imm12:$addr, pred:$p)>,
+      Requires<[IsThumb2,HasV7]>;
diff --git a/lib/Target/ARM/ARMInstrVFP.td b/lib/Target/ARM/ARMInstrVFP.td
index 597b74a..f9cfa15 100644
--- a/lib/Target/ARM/ARMInstrVFP.td
+++ b/lib/Target/ARM/ARMInstrVFP.td
@@ -333,6 +333,42 @@ def VNMULS : ASbI<0b11100, 0b10, 1, 0,
   let D = VFPNeonA8Domain;
 }
 
+multiclass vsel_inst<string op, bits<2> opc> {
+  let DecoderNamespace = "VFPV8", PostEncoderMethod = "" in {
+    def S : ASbInp<0b11100, opc, 0,
+                   (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
+                   NoItinerary, !strconcat("vsel", op, ".f32\t$Sd, $Sn, $Sm"),
+                   []>, Requires<[HasV8FP]>;
+
+    def D : ADbInp<0b11100, opc, 0,
+                   (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm),
+                   NoItinerary, !strconcat("vsel", op, ".f64\t$Dd, $Dn, $Dm"),
+                   []>, Requires<[HasV8FP]>;
+  }
+}
+
+defm VSELGT : vsel_inst<"gt", 0b11>;
+defm VSELGE : vsel_inst<"ge", 0b10>;
+defm VSELEQ : vsel_inst<"eq", 0b00>;
+defm VSELVS : vsel_inst<"vs", 0b01>;
+
+multiclass vmaxmin_inst<string op, bit opc> {
+  let DecoderNamespace = "VFPV8", PostEncoderMethod = "" in {
+    def S : ASbInp<0b11101, 0b00, opc,
+                   (outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm),
+                   NoItinerary, !strconcat(op, ".f32\t$Sd, $Sn, $Sm"),
+                   []>, Requires<[HasV8FP]>;
+
+    def D : ADbInp<0b11101, 0b00, opc,
+                   (outs DPR:$Dd), (ins DPR:$Dn, DPR:$Dm),
+                   NoItinerary, !strconcat(op, ".f64\t$Dd, $Dn, $Dm"),
+                   []>, Requires<[HasV8FP]>;
+  }
+}
+
+defm VMAXNM : vmaxmin_inst<"vmaxnm", 0>;
+defm VMINNM : vmaxmin_inst<"vminnm", 1>;
+
 // Match reassociated forms only if not sign dependent rounding.
 def : Pat<(fmul (fneg DPR:$a), (f64 DPR:$b)),
           (VNMULD DPR:$a, DPR:$b)>, Requires<[NoHonorSignDependentRounding]>;
@@ -468,7 +504,7 @@ def VCVTSD  : VFPAI<(outs SPR:$Sd), (ins DPR:$Dm), VFPUnaryFrm,
   let Inst{4}     = 0;
 }
 
-// Between half-precision and single-precision.  For disassembly only.
+// Between half, single and double-precision.  For disassembly only.
 
 // FIXME: Verify encoding after integrated assembler is working.
 def VCVTBHS: ASuI<0b11101, 0b11, 0b0010, 0b01, 0, (outs SPR:$Sd), (ins SPR:$Sm),
@@ -493,6 +529,111 @@ def VCVTTSH: ASuI<0b11101, 0b11, 0b0011, 0b11, 0, (outs SPR:$Sd), (ins SPR:$Sm),
                  /* FIXME */ IIC_fpCVTHS, "vcvtt", ".f16.f32\t$Sd, $Sm",
                  [/* For disassembly only; pattern left blank */]>;
 
+def VCVTBHD : ADuI<0b11101, 0b11, 0b0010, 0b01, 0,
+                   (outs DPR:$Dd), (ins SPR:$Sm),
+                   NoItinerary, "vcvtb", ".f64.f16\t$Dd, $Sm",
+                   []>, Requires<[HasV8FP]> {
+  // Instruction operands.
+  bits<5> Sm;
+
+  // Encode instruction operands.
+  let Inst{3-0} = Sm{4-1};
+  let Inst{5}   = Sm{0};
+}
+
+def VCVTBDH : ADuI<0b11101, 0b11, 0b0011, 0b01, 0,
+                   (outs SPR:$Sd), (ins DPR:$Dm),
+                   NoItinerary, "vcvtb", ".f16.f64\t$Sd, $Dm",
+                   []>, Requires<[HasV8FP]> {
+  // Instruction operands.
+  bits<5> Sd;
+  bits<5> Dm;
+
+  // Encode instruction operands.
+  let Inst{3-0}     = Dm{3-0};
+  let Inst{5}       = Dm{4};
+  let Inst{15-12}   = Sd{4-1};
+  let Inst{22}      = Sd{0};
+}
+
+def VCVTTHD : ADuI<0b11101, 0b11, 0b0010, 0b11, 0,
+                   (outs DPR:$Dd), (ins SPR:$Sm),
+                   NoItinerary, "vcvtt", ".f64.f16\t$Dd, $Sm",
+                   []>, Requires<[HasV8FP]> {
+  // Instruction operands.
+  bits<5> Sm;
+
+  // Encode instruction operands.
+  let Inst{3-0} = Sm{4-1};
+  let Inst{5}   = Sm{0};
+}
+
+def VCVTTDH : ADuI<0b11101, 0b11, 0b0011, 0b11, 0,
+                   (outs SPR:$Sd), (ins DPR:$Dm),
+                   NoItinerary, "vcvtt", ".f16.f64\t$Sd, $Dm",
+                   []>, Requires<[HasV8FP]> {
+  // Instruction operands.
+  bits<5> Sd;
+  bits<5> Dm;
+
+  // Encode instruction operands.
+  let Inst{15-12} = Sd{4-1};
+  let Inst{22}    = Sd{0};
+  let Inst{3-0}   = Dm{3-0};
+  let Inst{5}     = Dm{4};
+}
+
+multiclass vcvt_inst<string opc, bits<2> rm> {
+  let PostEncoderMethod = "", DecoderNamespace = "VFPV8" in {
+    def SS : ASuInp<0b11101, 0b11, 0b1100, 0b11, 0,
+                    (outs SPR:$Sd), (ins SPR:$Sm),
+                    NoItinerary, !strconcat("vcvt", opc, ".s32.f32\t$Sd, $Sm"),
+                    []>, Requires<[HasV8FP]> {
+      let Inst{17-16} = rm;
+    }
+
+    def US : ASuInp<0b11101, 0b11, 0b1100, 0b01, 0,
+                    (outs SPR:$Sd), (ins SPR:$Sm),
+                    NoItinerary, !strconcat("vcvt", opc, ".u32.f32\t$Sd, $Sm"),
+                    []>, Requires<[HasV8FP]> {
+      let Inst{17-16} = rm;
+    }
+
+    def SD : ASuInp<0b11101, 0b11, 0b1100, 0b11, 0,
+                    (outs SPR:$Sd), (ins DPR:$Dm),
+                    NoItinerary, !strconcat("vcvt", opc, ".s32.f64\t$Sd, $Dm"),
+                    []>, Requires<[HasV8FP]> {
+      bits<5> Dm;
+
+      let Inst{17-16} = rm;
+
+      // Encode instruction operands
+      let Inst{3-0} = Dm{3-0};
+      let Inst{5}   = Dm{4};
+      let Inst{8} = 1;
+    }
+
+    def UD : ASuInp<0b11101, 0b11, 0b1100, 0b01, 0,
+                    (outs SPR:$Sd), (ins DPR:$Dm),
+                    NoItinerary, !strconcat("vcvt", opc, ".u32.f64\t$Sd, $Dm"),
+                    []>, Requires<[HasV8FP]> {
+      bits<5> Dm;
+
+      let Inst{17-16} = rm;
+
+      // Encode instruction operands
+      let Inst{3-0}  = Dm{3-0};
+      let Inst{5}    = Dm{4};
+      let Inst{8} = 1;
+    }
+  }
+}
+
+defm VCVTA : vcvt_inst<"a", 0b00>;
+defm VCVTN : vcvt_inst<"n", 0b01>;
+defm VCVTP : vcvt_inst<"p", 0b10>;
+defm VCVTM : vcvt_inst<"m", 0b11>;
+
 def VNEGD  : ADuI<0b11101, 0b11, 0b0001, 0b01, 0,
                   (outs DPR:$Dd), (ins DPR:$Dm),
                   IIC_fpUNA64, "vneg", ".f64\t$Dd, $Dm",
@@ -507,6 +648,54 @@ def VNEGS  : ASuIn<0b11101, 0b11, 0b0001, 0b01, 0,
   let D = VFPNeonA8Domain;
 }
 
+multiclass vrint_inst_zrx<string opc, bit op, bit op2> {
+  def S : ASuI<0b11101, 0b11, 0b0110, 0b11, 0,
+               (outs SPR:$Sd), (ins SPR:$Sm),
+               NoItinerary, !strconcat("vrint", opc), ".f32\t$Sd, $Sm",
+               []>, Requires<[HasV8FP]> {
+    let Inst{7} = op2;
+    let Inst{16} = op;
+  }
+  def D : ADuI<0b11101, 0b11, 0b0110, 0b11, 0,
+                (outs DPR:$Dd), (ins DPR:$Dm),
+                NoItinerary, !strconcat("vrint", opc), ".f64\t$Dd, $Dm",
+                []>, Requires<[HasV8FP]> {
+    let Inst{7} = op2;
+    let Inst{16} = op;
+  }
+}
+
+defm VRINTZ : vrint_inst_zrx<"z", 0, 1>;
+defm VRINTR : vrint_inst_zrx<"r", 0, 0>;
+defm VRINTX : vrint_inst_zrx<"x", 1, 0>;
+
+multiclass vrint_inst_anpm<string opc, bits<2> rm> {
+  let PostEncoderMethod = "", DecoderNamespace = "VFPV8" in {
+    def S : ASuInp<0b11101, 0b11, 0b1000, 0b01, 0,
+                   (outs SPR:$Sd), (ins SPR:$Sm),
+                   NoItinerary, !strconcat("vrint", opc, ".f32\t$Sd, $Sm"),
+                   []>, Requires<[HasV8FP]> {
+      let Inst{17-16} = rm;
+    }
+    def D : ADuInp<0b11101, 0b11, 0b1000, 0b01, 0,
+                   (outs DPR:$Dd), (ins DPR:$Dm),
+                   NoItinerary, !strconcat("vrint", opc, ".f64\t$Dd, $Dm"),
+                   []>, Requires<[HasV8FP]> {
+      let Inst{17-16} = rm;
+    }
+  }
+
+  def : InstAlias<!strconcat("vrint", opc, ".f32.f32\t$Sd, $Sm"),
+                  (!cast<Instruction>(NAME#"S") SPR:$Sd, SPR:$Sm)>;
+  def : InstAlias<!strconcat("vrint", opc, ".f64.f64\t$Dd, $Dm"),
+                  (!cast<Instruction>(NAME#"D") DPR:$Dd, DPR:$Dm)>;
+}
+
+defm VRINTA : vrint_inst_anpm<"a", 0b00>;
+defm VRINTN : vrint_inst_anpm<"n", 0b01>;
+defm VRINTP : vrint_inst_anpm<"p", 0b10>;
+defm VRINTM : vrint_inst_anpm<"m", 0b11>;
+
 def VSQRTD : ADuI<0b11101, 0b11, 0b0001, 0b11, 0,
                   (outs DPR:$Dd), (ins DPR:$Dm),
                   IIC_fpSQRT64, "vsqrt", ".f64\t$Dd, $Dm",
diff --git a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
index c8ed576..1803a8a 100644
--- a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
+++ b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
@@ -109,12 +109,12 @@ namespace {
                         unsigned PredReg,
                         unsigned Scratch,
                         DebugLoc dl,
-                        SmallVector<MachineBasicBlock::iterator, 4> &Merges);
+                        SmallVectorImpl<MachineBasicBlock::iterator> &Merges);
     void MergeLDR_STR(MachineBasicBlock &MBB, unsigned SIndex, unsigned Base,
                       int Opcode, unsigned Size,
                       ARMCC::CondCodes Pred, unsigned PredReg,
                       unsigned Scratch, MemOpQueue &MemOps,
-                      SmallVector<MachineBasicBlock::iterator, 4> &Merges);
+                      SmallVectorImpl<MachineBasicBlock::iterator> &Merges);
 
     void AdvanceRS(MachineBasicBlock &MBB, MemOpQueue &MemOps);
     bool FixInvalidRegPairOp(MachineBasicBlock &MBB,
@@ -371,7 +371,7 @@ void ARMLoadStoreOpt::MergeOpsUpdate(MachineBasicBlock &MBB,
                                      ARMCC::CondCodes Pred, unsigned PredReg,
                                      unsigned Scratch,
                                      DebugLoc dl,
-                          SmallVector<MachineBasicBlock::iterator, 4> &Merges) {
+                         SmallVectorImpl<MachineBasicBlock::iterator> &Merges) {
   // First calculate which of the registers should be killed by the merged
   // instruction.
   const unsigned insertPos = memOps[insertAfter].Position;
@@ -444,10 +444,10 @@ void ARMLoadStoreOpt::MergeOpsUpdate(MachineBasicBlock &MBB,
 /// load / store multiple instructions.
 void
 ARMLoadStoreOpt::MergeLDR_STR(MachineBasicBlock &MBB, unsigned SIndex,
-                          unsigned Base, int Opcode, unsigned Size,
-                          ARMCC::CondCodes Pred, unsigned PredReg,
-                          unsigned Scratch, MemOpQueue &MemOps,
-                          SmallVector<MachineBasicBlock::iterator, 4> &Merges) {
+                         unsigned Base, int Opcode, unsigned Size,
+                         ARMCC::CondCodes Pred, unsigned PredReg,
+                         unsigned Scratch, MemOpQueue &MemOps,
+                         SmallVectorImpl<MachineBasicBlock::iterator> &Merges) {
   bool isNotVFP = isi32Load(Opcode) || isi32Store(Opcode);
   int Offset = MemOps[SIndex].Offset;
   int SOffset = Offset;
@@ -1484,7 +1484,7 @@ namespace {
                           unsigned &PredReg, ARMCC::CondCodes &Pred,
                           bool &isT2);
     bool RescheduleOps(MachineBasicBlock *MBB,
-                       SmallVector<MachineInstr*, 4> &Ops,
+                       SmallVectorImpl<MachineInstr *> &Ops,
                        unsigned Base, bool isLd,
                        DenseMap<MachineInstr*, unsigned> &MI2LocMap);
     bool RescheduleLoadStoreInstrs(MachineBasicBlock *MBB);
@@ -1602,8 +1602,9 @@ ARMPreAllocLoadStoreOpt::CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1,
     return false;
 
   // Make sure the base address satisfies i64 ld / st alignment requirement.
+  // At the moment, we ignore the memoryoperand's value.
+  // If we want to use AliasAnalysis, we should check it accordingly.
   if (!Op0->hasOneMemOperand() ||
-      !(*Op0->memoperands_begin())->getValue() ||
       (*Op0->memoperands_begin())->isVolatile())
     return false;
 
@@ -1655,7 +1656,7 @@ namespace {
 }
 
 bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB,
-                                 SmallVector<MachineInstr*, 4> &Ops,
+                                 SmallVectorImpl<MachineInstr *> &Ops,
                                  unsigned Base, bool isLd,
                                  DenseMap<MachineInstr*, unsigned> &MI2LocMap) {
   bool RetVal = false;
@@ -1857,9 +1858,7 @@ ARMPreAllocLoadStoreOpt::RescheduleLoadStoreInstrs(MachineBasicBlock *MBB) {
           if (!StopHere)
             BI->second.push_back(MI);
         } else {
-          SmallVector<MachineInstr*, 4> MIs;
-          MIs.push_back(MI);
-          Base2LdsMap[Base] = MIs;
+          Base2LdsMap[Base].push_back(MI);
           LdBases.push_back(Base);
         }
       } else {
@@ -1875,9 +1874,7 @@ ARMPreAllocLoadStoreOpt::RescheduleLoadStoreInstrs(MachineBasicBlock *MBB) {
           if (!StopHere)
             BI->second.push_back(MI);
         } else {
-          SmallVector<MachineInstr*, 4> MIs;
-          MIs.push_back(MI);
-          Base2StsMap[Base] = MIs;
+          Base2StsMap[Base].push_back(MI);
           StBases.push_back(Base);
         }
       }
@@ -1893,7 +1890,7 @@ ARMPreAllocLoadStoreOpt::RescheduleLoadStoreInstrs(MachineBasicBlock *MBB) {
     // Re-schedule loads.
     for (unsigned i = 0, e = LdBases.size(); i != e; ++i) {
       unsigned Base = LdBases[i];
-      SmallVector<MachineInstr*, 4> &Lds = Base2LdsMap[Base];
+      SmallVectorImpl<MachineInstr *> &Lds = Base2LdsMap[Base];
       if (Lds.size() > 1)
         RetVal |= RescheduleOps(MBB, Lds, Base, true, MI2LocMap);
     }
@@ -1901,7 +1898,7 @@ ARMPreAllocLoadStoreOpt::RescheduleLoadStoreInstrs(MachineBasicBlock *MBB) {
     // Re-schedule stores.
     for (unsigned i = 0, e = StBases.size(); i != e; ++i) {
       unsigned Base = StBases[i];
-      SmallVector<MachineInstr*, 4> &Sts = Base2StsMap[Base];
+      SmallVectorImpl<MachineInstr *> &Sts = Base2StsMap[Base];
       if (Sts.size() > 1)
         RetVal |= RescheduleOps(MBB, Sts, Base, false, MI2LocMap);
     }
diff --git a/lib/Target/ARM/ARMRegisterInfo.td b/lib/Target/ARM/ARMRegisterInfo.td
index 0459d64..bb7d358 100644
--- a/lib/Target/ARM/ARMRegisterInfo.td
+++ b/lib/Target/ARM/ARMRegisterInfo.td
@@ -215,7 +215,7 @@ def GPRnopc : RegisterClass<"ARM", [i32], 32, (sub GPR, PC)> {
 // GPRs without the PC but with APSR. Some instructions allow accessing the
 // APSR, while actually encoding PC in the register field. This is usefull
 // for assembly and disassembly only.
-def GPRwithAPSR : RegisterClass<"ARM", [i32], 32, (add GPR, APSR_NZCV)> {
+def GPRwithAPSR : RegisterClass<"ARM", [i32], 32, (add (sub GPR, PC), APSR_NZCV)> {
   let AltOrders = [(add LR, GPRnopc), (trunc GPRnopc, 8)];
   let AltOrderSelect = [{
       return 1 + MF.getTarget().getSubtarget<ARMSubtarget>().isThumb1Only();
diff --git a/lib/Target/ARM/ARMScheduleA9.td b/lib/Target/ARM/ARMScheduleA9.td
index d06ad7d..74ee50b 100644
--- a/lib/Target/ARM/ARMScheduleA9.td
+++ b/lib/Target/ARM/ARMScheduleA9.td
@@ -1883,13 +1883,10 @@ def CortexA9Itineraries : ProcessorItineraries<
 // Cortex-A9 machine model for scheduling and other instruction cost heuristics.
 def CortexA9Model : SchedMachineModel {
   let IssueWidth = 2; // 2 micro-ops are dispatched per cycle.
-  let MinLatency = 0; // Data dependencies are allowed within dispatch groups.
+  let MicroOpBufferSize = 56; // Based on available renamed registers.
   let LoadLatency = 2; // Optimistic load latency assuming bypass.
                        // This is overriden by OperandCycles if the
                        // Itineraries are queried instead.
-  let ILPWindow = 10; // Don't reschedule small blocks to hide
-                      // latency. Minimum latency requirements are already
-                      // modeled strictly by reserving resources.
   let MispredictPenalty = 8; // Based on estimate of pipeline depth.
 
   let Itineraries = CortexA9Itineraries;
@@ -1904,7 +1901,7 @@ def A9UnitALU : ProcResource<2>;
 def A9UnitMul : ProcResource<1> { let Super = A9UnitALU; }
 def A9UnitAGU : ProcResource<1>;
 def A9UnitLS  : ProcResource<1>;
-def A9UnitFP  : ProcResource<1> { let Buffered = 0; }
+def A9UnitFP  : ProcResource<1> { let BufferSize = 0; }
 def A9UnitB   : ProcResource<1>;
 
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/ARM/ARMScheduleSwift.td b/lib/Target/ARM/ARMScheduleSwift.td
index b5cf251..2a41616 100644
--- a/lib/Target/ARM/ARMScheduleSwift.td
+++ b/lib/Target/ARM/ARMScheduleSwift.td
@@ -1076,7 +1076,7 @@ def SwiftItineraries : ProcessorItineraries<
 // Swift machine model for scheduling and other instruction cost heuristics.
 def SwiftModel : SchedMachineModel {
   let IssueWidth = 3; // 3 micro-ops are dispatched per cycle.
-  let MinLatency = 0; // Data dependencies are allowed within dispatch groups.
+  let MicroOpBufferSize = 45; // Based on NEON renamed registers.
   let LoadLatency = 3;
   let MispredictPenalty = 14; // A branch direction mispredict.
 
diff --git a/lib/Target/ARM/ARMSubtarget.cpp b/lib/Target/ARM/ARMSubtarget.cpp
index 4d204ce..0a0f30c 100644
--- a/lib/Target/ARM/ARMSubtarget.cpp
+++ b/lib/Target/ARM/ARMSubtarget.cpp
@@ -77,9 +77,11 @@ void ARMSubtarget::initializeEnvironment() {
   HasV6Ops = false;
   HasV6T2Ops = false;
   HasV7Ops = false;
+  HasV8Ops = false;
   HasVFPv2 = false;
   HasVFPv3 = false;
   HasVFPv4 = false;
+  HasV8FP = false;
   HasNEON = false;
   UseNEONForSinglePrecisionFP = false;
   UseMulOps = UseFusedMulOps;
diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h
index bc5af96..ad7f1b3 100644
--- a/lib/Target/ARM/ARMSubtarget.h
+++ b/lib/Target/ARM/ARMSubtarget.h
@@ -37,7 +37,8 @@ protected:
   /// ARMProcFamily - ARM processor family: Cortex-A8, Cortex-A9, and others.
   ARMProcFamilyEnum ARMProcFamily;
 
-  /// HasV4TOps, HasV5TOps, HasV5TEOps, HasV6Ops, HasV6T2Ops, HasV7Ops -
+  /// HasV4TOps, HasV5TOps, HasV5TEOps,
+  /// HasV6Ops, HasV6T2Ops, HasV7Ops, HasV8Ops -
   /// Specify whether target support specific ARM ISA variants.
   bool HasV4TOps;
   bool HasV5TOps;
@@ -45,12 +46,14 @@ protected:
   bool HasV6Ops;
   bool HasV6T2Ops;
   bool HasV7Ops;
+  bool HasV8Ops;
 
-  /// HasVFPv2, HasVFPv3, HasVFPv4, HasNEON - Specify what
+  /// HasVFPv2, HasVFPv3, HasVFPv4, HasV8FP, HasNEON - Specify what
   /// floating point ISAs are supported.
   bool HasVFPv2;
   bool HasVFPv3;
   bool HasVFPv4;
+  bool HasV8FP;
   bool HasNEON;
 
   /// UseNEONForSinglePrecisionFP - if the NEONFP attribute has been
@@ -192,10 +195,6 @@ protected:
 
  public:
   enum {
-    isELF, isDarwin
-  } TargetType;
-
-  enum {
     ARM_ABI_APCS,
     ARM_ABI_AAPCS // ARM EABI
   } TargetABI;
@@ -231,6 +230,7 @@ public:
   bool hasV6Ops()   const { return HasV6Ops;   }
   bool hasV6T2Ops() const { return HasV6T2Ops; }
   bool hasV7Ops()   const { return HasV7Ops;  }
+  bool hasV8Ops()   const { return HasV8Ops;  }
 
   bool isCortexA5() const { return ARMProcFamily == CortexA5; }
   bool isCortexA8() const { return ARMProcFamily == CortexA8; }
@@ -246,6 +246,7 @@ public:
   bool hasVFP2() const { return HasVFPv2; }
   bool hasVFP3() const { return HasVFPv3; }
   bool hasVFP4() const { return HasVFPv4; }
+  bool hasV8FP() const { return HasV8FP; }
   bool hasNEON() const { return HasNEON;  }
   bool useNEONForSinglePrecisionFP() const {
     return hasNEON() && UseNEONForSinglePrecisionFP; }
@@ -279,6 +280,14 @@ public:
   bool isTargetNaCl() const { return TargetTriple.getOS() == Triple::NaCl; }
   bool isTargetLinux() const { return TargetTriple.getOS() == Triple::Linux; }
   bool isTargetELF() const { return !isTargetDarwin(); }
+  // ARM EABI is the bare-metal EABI described in ARM ABI documents and
+  // can be accessed via -target arm-none-eabi. This is NOT GNUEABI.
+  // FIXME: Add a flag for bare-metal for that target and set Triple::EABI
+  // even for GNUEABI, so we can make a distinction here and still conform to
+  // the EABI on GNU (and Android) mode. This requires change in Clang, too.
+  bool isTargetAEABI() const {
+    return TargetTriple.getEnvironment() == Triple::EABI;
+  }
 
   bool isAPCS_ABI() const { return TargetABI == ARM_ABI_APCS; }
   bool isAAPCS_ABI() const { return TargetABI == ARM_ABI_AAPCS; }
diff --git a/lib/Target/ARM/ARMTargetMachine.cpp b/lib/Target/ARM/ARMTargetMachine.cpp
index b16ab4b..e6dbcb6 100644
--- a/lib/Target/ARM/ARMTargetMachine.cpp
+++ b/lib/Target/ARM/ARMTargetMachine.cpp
@@ -60,7 +60,7 @@ void ARMBaseTargetMachine::addAnalysisPasses(PassManagerBase &PM) {
   // Add first the target-independent BasicTTI pass, then our ARM pass. This
   // allows the ARM pass to delegate to the target independent layer when
   // appropriate.
-  PM.add(createBasicTargetTransformInfoPass(getTargetLowering()));
+  PM.add(createBasicTargetTransformInfoPass(this));
   PM.add(createARMTargetTransformInfoPass(this));
 }
 
@@ -150,7 +150,7 @@ TargetPassConfig *ARMBaseTargetMachine::createPassConfig(PassManagerBase &PM) {
 
 bool ARMPassConfig::addPreISel() {
   if (TM->getOptLevel() != CodeGenOpt::None && EnableGlobalMerge)
-    addPass(createGlobalMergePass(TM->getTargetLowering()));
+    addPass(createGlobalMergePass(TM));
 
   return false;
 }
@@ -169,7 +169,7 @@ bool ARMPassConfig::addPreRegAlloc() {
   // FIXME: temporarily disabling load / store optimization pass for Thumb1.
   if (getOptLevel() != CodeGenOpt::None && !getARMSubtarget().isThumb1Only())
     addPass(createARMLoadStoreOptimizationPass(true));
-  if (getOptLevel() != CodeGenOpt::None && getARMSubtarget().isLikeA9())
+  if (getOptLevel() != CodeGenOpt::None && getARMSubtarget().isCortexA9())
     addPass(createMLxExpansionPass());
   // Since the A15SDOptimizer pass can insert VDUP instructions, it can only be
   // enabled when NEON is available.
diff --git a/lib/Target/ARM/ARMTargetTransformInfo.cpp b/lib/Target/ARM/ARMTargetTransformInfo.cpp
index 53ece66..34576ba 100644
--- a/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -124,7 +124,7 @@ public:
 
   unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) const;
 
-  unsigned getAddressComputationCost(Type *Val) const;
+  unsigned getAddressComputationCost(Type *Val, bool IsComplex) const;
 
   unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty,
                                   OperandValueKind Op1Info = OK_AnyValue,
@@ -411,12 +411,14 @@ unsigned ARMTTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
 
     EVT SelCondTy = TLI->getValueType(CondTy);
     EVT SelValTy = TLI->getValueType(ValTy);
-    int Idx = ConvertCostTableLookup<MVT>(NEONVectorSelectTbl,
-                                          array_lengthof(NEONVectorSelectTbl),
-                                          ISD, SelCondTy.getSimpleVT(),
-                                          SelValTy.getSimpleVT());
-    if (Idx != -1)
-      return NEONVectorSelectTbl[Idx].Cost;
+    if (SelCondTy.isSimple() && SelValTy.isSimple()) {
+      int Idx = ConvertCostTableLookup<MVT>(NEONVectorSelectTbl,
+                                            array_lengthof(NEONVectorSelectTbl),
+                                            ISD, SelCondTy.getSimpleVT(),
+                                            SelValTy.getSimpleVT());
+      if (Idx != -1)
+        return NEONVectorSelectTbl[Idx].Cost;
+    }
 
     std::pair<unsigned, MVT> LT = TLI->getTypeLegalizationCost(ValTy);
     return LT.first;
@@ -425,7 +427,16 @@ unsigned ARMTTI::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
   return TargetTransformInfo::getCmpSelInstrCost(Opcode, ValTy, CondTy);
 }
 
-unsigned ARMTTI::getAddressComputationCost(Type *Ty) const {
+unsigned ARMTTI::getAddressComputationCost(Type *Ty, bool IsComplex) const {
+  // Address computations in vectorized code with non-consecutive addresses will
+  // likely result in more instructions compared to scalar code where the
+  // computation can more often be merged into the index mode. The resulting
+  // extra micro-ops can significantly decrease throughput.
+  unsigned NumVectorInstToHideOverhead = 10;
+
+  if (Ty->isVectorTy() && IsComplex)
+    return NumVectorInstToHideOverhead;
+
   // In many cases the address computation is not merged into the instruction
   // addressing mode.
   return 1;
diff --git a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
index c59ca64..80e5c6e 100644
--- a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
+++ b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@@ -161,6 +161,9 @@ class ARMAsmParser : public MCTargetAsmParser {
   bool hasV7Ops() const {
     return STI.getFeatureBits() & ARM::HasV7Ops;
   }
+  bool hasV8Ops() const {
+    return STI.getFeatureBits() & ARM::HasV8Ops;
+  }
   bool hasARM() const {
     return !(STI.getFeatureBits() & ARM::FeatureNoARM);
   }
@@ -216,50 +219,17 @@ class ARMAsmParser : public MCTargetAsmParser {
                                        SMLoc &EndLoc);
 
   // Asm Match Converter Methods
-  void cvtT2LdrdPre(MCInst &Inst, const SmallVectorImpl<MCParsedAsmOperand*> &);
-  void cvtT2StrdPre(MCInst &Inst, const SmallVectorImpl<MCParsedAsmOperand*> &);
-  void cvtLdWriteBackRegT2AddrModeImm8(MCInst &Inst,
-                                  const SmallVectorImpl<MCParsedAsmOperand*> &);
-  void cvtStWriteBackRegT2AddrModeImm8(MCInst &Inst,
-                                  const SmallVectorImpl<MCParsedAsmOperand*> &);
-  void cvtLdWriteBackRegAddrMode2(MCInst &Inst,
-                                  const SmallVectorImpl<MCParsedAsmOperand*> &);
-  void cvtLdWriteBackRegAddrModeImm12(MCInst &Inst,
-                                  const SmallVectorImpl<MCParsedAsmOperand*> &);
-  void cvtStWriteBackRegAddrModeImm12(MCInst &Inst,
-                                  const SmallVectorImpl<MCParsedAsmOperand*> &);
-  void cvtStWriteBackRegAddrMode2(MCInst &Inst,
-                                  const SmallVectorImpl<MCParsedAsmOperand*> &);
-  void cvtStWriteBackRegAddrMode3(MCInst &Inst,
-                                  const SmallVectorImpl<MCParsedAsmOperand*> &);
-  void cvtLdExtTWriteBackImm(MCInst &Inst,
-                             const SmallVectorImpl<MCParsedAsmOperand*> &);
-  void cvtLdExtTWriteBackReg(MCInst &Inst,
-                             const SmallVectorImpl<MCParsedAsmOperand*> &);
-  void cvtStExtTWriteBackImm(MCInst &Inst,
-                             const SmallVectorImpl<MCParsedAsmOperand*> &);
-  void cvtStExtTWriteBackReg(MCInst &Inst,
-                             const SmallVectorImpl<MCParsedAsmOperand*> &);
-  void cvtLdrdPre(MCInst &Inst, const SmallVectorImpl<MCParsedAsmOperand*> &);
-  void cvtStrdPre(MCInst &Inst, const SmallVectorImpl<MCParsedAsmOperand*> &);
-  void cvtLdWriteBackRegAddrMode3(MCInst &Inst,
-                                  const SmallVectorImpl<MCParsedAsmOperand*> &);
   void cvtThumbMultiply(MCInst &Inst,
                         const SmallVectorImpl<MCParsedAsmOperand*> &);
-  void cvtVLDwbFixed(MCInst &Inst,
-                     const SmallVectorImpl<MCParsedAsmOperand*> &);
-  void cvtVLDwbRegister(MCInst &Inst,
-                        const SmallVectorImpl<MCParsedAsmOperand*> &);
-  void cvtVSTwbFixed(MCInst &Inst,
-                     const SmallVectorImpl<MCParsedAsmOperand*> &);
-  void cvtVSTwbRegister(MCInst &Inst,
-                        const SmallVectorImpl<MCParsedAsmOperand*> &);
   bool validateInstruction(MCInst &Inst,
                            const SmallVectorImpl<MCParsedAsmOperand*> &Ops);
   bool processInstruction(MCInst &Inst,
                           const SmallVectorImpl<MCParsedAsmOperand*> &Ops);
   bool shouldOmitCCOutOperand(StringRef Mnemonic,
                               SmallVectorImpl<MCParsedAsmOperand*> &Operands);
+  bool shouldOmitPredicateOperand(StringRef Mnemonic,
+                              SmallVectorImpl<MCParsedAsmOperand*> &Operands);
+  bool isDeprecated(MCInst &Inst, StringRef &Info);
 
 public:
   enum ARMMatchResultTy {
@@ -277,7 +247,7 @@ public:
     MCAsmParserExtension::Initialize(_Parser);
 
     // Cache the MCRegisterInfo.
-    MRI = &getContext().getRegisterInfo();
+    MRI = getContext().getRegisterInfo();
 
     // Initialize the set of available features.
     setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
@@ -626,6 +596,40 @@ public:
   bool isITMask() const { return Kind == k_ITCondMask; }
   bool isITCondCode() const { return Kind == k_CondCode; }
   bool isImm() const { return Kind == k_Immediate; }
+  // checks whether this operand is an unsigned offset which fits is a field
+  // of specified width and scaled by a specific number of bits
+  template<unsigned width, unsigned scale>
+  bool isUnsignedOffset() const {
+    if (!isImm()) return false;
+    if (dyn_cast<MCSymbolRefExpr>(Imm.Val)) return true;
+    if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Imm.Val)) {
+      int64_t Val = CE->getValue();
+      int64_t Align = 1LL << scale;
+      int64_t Max = Align * ((1LL << width) - 1);
+      return ((Val % Align) == 0) && (Val >= 0) && (Val <= Max);
+    }
+    return false;
+  }
+  // checks whether this operand is a memory operand computed as an offset
+  // applied to PC. the offset may have 8 bits of magnitude and is represented
+  // with two bits of shift. textually it may be either [pc, #imm], #imm or 
+  // relocable expression...
+  bool isThumbMemPC() const {
+    int64_t Val = 0;
+    if (isImm()) {
+      if (isa<MCSymbolRefExpr>(Imm.Val)) return true;
+      const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(Imm.Val);
+      if (!CE) return false;
+      Val = CE->getValue();
+    }
+    else if (isMem()) {
+      if(!Memory.OffsetImm || Memory.OffsetRegNum) return false;
+      if(Memory.BaseRegNum != ARM::PC) return false;
+      Val = Memory.OffsetImm->getValue();
+    }
+    else return false;
+    return ((Val % 4) == 0) && (Val >= -1020) && (Val <= 1020);
+  }
   bool isFPImm() const {
     if (!isImm()) return false;
     const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
@@ -1704,6 +1708,37 @@ public:
     Inst.addOperand(MCOperand::CreateImm(-CE->getValue()));
   }
 
+  void addUnsignedOffset_b8s2Operands(MCInst &Inst, unsigned N) const {
+    if(const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm())) {
+      Inst.addOperand(MCOperand::CreateImm(CE->getValue() >> 2));
+      return;
+    }
+
+    const MCSymbolRefExpr *SR = dyn_cast<MCSymbolRefExpr>(Imm.Val);
+    assert(SR && "Unknown value type!");
+    Inst.addOperand(MCOperand::CreateExpr(SR));
+  }
+
+  void addThumbMemPCOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    if (isImm()) {
+      const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+      if (CE) {
+        Inst.addOperand(MCOperand::CreateImm(CE->getValue()));
+        return;
+      }
+
+      const MCSymbolRefExpr *SR = dyn_cast<MCSymbolRefExpr>(Imm.Val);
+      assert(SR && "Unknown value type!");
+      Inst.addOperand(MCOperand::CreateExpr(SR));
+      return;
+    }
+
+    assert(isMem()  && "Unknown value type!");
+    assert(isa<MCConstantExpr>(Memory.OffsetImm) && "Unknown value type!");
+    Inst.addOperand(MCOperand::CreateImm(Memory.OffsetImm->getValue()));
+  }
+
   void addARMSOImmNotOperands(MCInst &Inst, unsigned N) const {
     assert(N == 1 && "Invalid number of operands!");
     // The operand is actually a so_imm, but we have its bitwise
@@ -2278,21 +2313,24 @@ public:
   }
 
   static ARMOperand *
-  CreateRegList(const SmallVectorImpl<std::pair<unsigned, SMLoc> > &Regs,
+  CreateRegList(SmallVectorImpl<std::pair<unsigned, unsigned> > &Regs,
                 SMLoc StartLoc, SMLoc EndLoc) {
+    assert (Regs.size() > 0 && "RegList contains no registers?");
     KindTy Kind = k_RegisterList;
 
-    if (ARMMCRegisterClasses[ARM::DPRRegClassID].contains(Regs.front().first))
+    if (ARMMCRegisterClasses[ARM::DPRRegClassID].contains(Regs.front().second))
       Kind = k_DPRRegisterList;
     else if (ARMMCRegisterClasses[ARM::SPRRegClassID].
-             contains(Regs.front().first))
+             contains(Regs.front().second))
       Kind = k_SPRRegisterList;
 
+    // Sort based on the register encoding values.
+    array_pod_sort(Regs.begin(), Regs.end());
+
     ARMOperand *Op = new ARMOperand(Kind);
-    for (SmallVectorImpl<std::pair<unsigned, SMLoc> >::const_iterator
+    for (SmallVectorImpl<std::pair<unsigned, unsigned> >::const_iterator
            I = Regs.begin(), E = Regs.end(); I != E; ++I)
-      Op->Registers.push_back(I->first);
-    array_pod_sort(Op->Registers.begin(), Op->Registers.end());
+      Op->Registers.push_back(I->second);
     Op->StartLoc = StartLoc;
     Op->EndLoc = EndLoc;
     return Op;
@@ -2972,12 +3010,14 @@ parseRegisterList(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
 
   // The reglist instructions have at most 16 registers, so reserve
   // space for that many.
-  SmallVector<std::pair<unsigned, SMLoc>, 16> Registers;
+  int EReg = 0;
+  SmallVector<std::pair<unsigned, unsigned>, 16> Registers;
 
   // Allow Q regs and just interpret them as the two D sub-registers.
   if (ARMMCRegisterClasses[ARM::QPRRegClassID].contains(Reg)) {
     Reg = getDRegFromQReg(Reg);
-    Registers.push_back(std::pair<unsigned, SMLoc>(Reg, RegLoc));
+    EReg = MRI->getEncodingValue(Reg);
+    Registers.push_back(std::pair<unsigned, unsigned>(EReg, Reg));
     ++Reg;
   }
   const MCRegisterClass *RC;
@@ -2991,7 +3031,8 @@ parseRegisterList(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
     return Error(RegLoc, "invalid register in register list");
 
   // Store the register.
-  Registers.push_back(std::pair<unsigned, SMLoc>(Reg, RegLoc));
+  EReg = MRI->getEncodingValue(Reg);
+  Registers.push_back(std::pair<unsigned, unsigned>(EReg, Reg));
 
   // This starts immediately after the first register token in the list,
   // so we can see either a comma or a minus (range separator) as a legal
@@ -3021,7 +3062,8 @@ parseRegisterList(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
       // Add all the registers in the range to the register list.
       while (Reg != EndReg) {
         Reg = getNextRegister(Reg);
-        Registers.push_back(std::pair<unsigned, SMLoc>(Reg, RegLoc));
+        EReg = MRI->getEncodingValue(Reg);
+        Registers.push_back(std::pair<unsigned, unsigned>(EReg, Reg));
       }
       continue;
     }
@@ -3054,14 +3096,15 @@ parseRegisterList(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
       continue;
     }
     // VFP register lists must also be contiguous.
-    // It's OK to use the enumeration values directly here rather, as the
-    // VFP register classes have the enum sorted properly.
     if (RC != &ARMMCRegisterClasses[ARM::GPRRegClassID] &&
         Reg != OldReg + 1)
       return Error(RegLoc, "non-contiguous register range");
-    Registers.push_back(std::pair<unsigned, SMLoc>(Reg, RegLoc));
-    if (isQReg)
-      Registers.push_back(std::pair<unsigned, SMLoc>(++Reg, RegLoc));
+    EReg = MRI->getEncodingValue(Reg);
+    Registers.push_back(std::pair<unsigned, unsigned>(EReg, Reg));
+    if (isQReg) {
+      EReg = MRI->getEncodingValue(++Reg);
+      Registers.push_back(std::pair<unsigned, unsigned>(EReg, Reg));
+    }
   }
 
   if (Parser.getTok().isNot(AsmToken::RCurly))
@@ -4039,260 +4082,9 @@ parseAM3Offset(SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
   return MatchOperand_Success;
 }
 
-/// cvtT2LdrdPre - Convert parsed operands to MCInst.
-/// Needed here because the Asm Gen Matcher can't handle properly tied operands
-/// when they refer multiple MIOperands inside a single one.
-void ARMAsmParser::
-cvtT2LdrdPre(MCInst &Inst,
-             const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
-  // Rt, Rt2
-  ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
-  ((ARMOperand*)Operands[3])->addRegOperands(Inst, 1);
-  // Create a writeback register dummy placeholder.
-  Inst.addOperand(MCOperand::CreateReg(0));
-  // addr
-  ((ARMOperand*)Operands[4])->addMemImm8s4OffsetOperands(Inst, 2);
-  // pred
-  ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
-}
-
-/// cvtT2StrdPre - Convert parsed operands to MCInst.
-/// Needed here because the Asm Gen Matcher can't handle properly tied operands
-/// when they refer multiple MIOperands inside a single one.
-void ARMAsmParser::
-cvtT2StrdPre(MCInst &Inst,
-             const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
-  // Create a writeback register dummy placeholder.
-  Inst.addOperand(MCOperand::CreateReg(0));
-  // Rt, Rt2
-  ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
-  ((ARMOperand*)Operands[3])->addRegOperands(Inst, 1);
-  // addr
-  ((ARMOperand*)Operands[4])->addMemImm8s4OffsetOperands(Inst, 2);
-  // pred
-  ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
-}
-
-/// cvtLdWriteBackRegT2AddrModeImm8 - Convert parsed operands to MCInst.
-/// Needed here because the Asm Gen Matcher can't handle properly tied operands
-/// when they refer multiple MIOperands inside a single one.
-void ARMAsmParser::
-cvtLdWriteBackRegT2AddrModeImm8(MCInst &Inst,
-                         const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
-  ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
-
-  // Create a writeback register dummy placeholder.
-  Inst.addOperand(MCOperand::CreateImm(0));
-
-  ((ARMOperand*)Operands[3])->addMemImm8OffsetOperands(Inst, 2);
-  ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
-}
-
-/// cvtStWriteBackRegT2AddrModeImm8 - Convert parsed operands to MCInst.
-/// Needed here because the Asm Gen Matcher can't handle properly tied operands
-/// when they refer multiple MIOperands inside a single one.
-void ARMAsmParser::
-cvtStWriteBackRegT2AddrModeImm8(MCInst &Inst,
-                         const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
-  // Create a writeback register dummy placeholder.
-  Inst.addOperand(MCOperand::CreateImm(0));
-  ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
-  ((ARMOperand*)Operands[3])->addMemImm8OffsetOperands(Inst, 2);
-  ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
-}
-
-/// cvtLdWriteBackRegAddrMode2 - Convert parsed operands to MCInst.
-/// Needed here because the Asm Gen Matcher can't handle properly tied operands
-/// when they refer multiple MIOperands inside a single one.
-void ARMAsmParser::
-cvtLdWriteBackRegAddrMode2(MCInst &Inst,
-                         const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
-  ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
-
-  // Create a writeback register dummy placeholder.
-  Inst.addOperand(MCOperand::CreateImm(0));
-
-  ((ARMOperand*)Operands[3])->addAddrMode2Operands(Inst, 3);
-  ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
-}
-
-/// cvtLdWriteBackRegAddrModeImm12 - Convert parsed operands to MCInst.
-/// Needed here because the Asm Gen Matcher can't handle properly tied operands
-/// when they refer multiple MIOperands inside a single one.
-void ARMAsmParser::
-cvtLdWriteBackRegAddrModeImm12(MCInst &Inst,
-                         const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
-  ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
-
-  // Create a writeback register dummy placeholder.
-  Inst.addOperand(MCOperand::CreateImm(0));
-
-  ((ARMOperand*)Operands[3])->addMemImm12OffsetOperands(Inst, 2);
-  ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
-}
-
-
-/// cvtStWriteBackRegAddrModeImm12 - Convert parsed operands to MCInst.
-/// Needed here because the Asm Gen Matcher can't handle properly tied operands
-/// when they refer multiple MIOperands inside a single one.
-void ARMAsmParser::
-cvtStWriteBackRegAddrModeImm12(MCInst &Inst,
-                         const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
-  // Create a writeback register dummy placeholder.
-  Inst.addOperand(MCOperand::CreateImm(0));
-  ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
-  ((ARMOperand*)Operands[3])->addMemImm12OffsetOperands(Inst, 2);
-  ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
-}
-
-/// cvtStWriteBackRegAddrMode2 - Convert parsed operands to MCInst.
-/// Needed here because the Asm Gen Matcher can't handle properly tied operands
-/// when they refer multiple MIOperands inside a single one.
-void ARMAsmParser::
-cvtStWriteBackRegAddrMode2(MCInst &Inst,
-                         const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
-  // Create a writeback register dummy placeholder.
-  Inst.addOperand(MCOperand::CreateImm(0));
-  ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
-  ((ARMOperand*)Operands[3])->addAddrMode2Operands(Inst, 3);
-  ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
-}
-
-/// cvtStWriteBackRegAddrMode3 - Convert parsed operands to MCInst.
-/// Needed here because the Asm Gen Matcher can't handle properly tied operands
-/// when they refer multiple MIOperands inside a single one.
-void ARMAsmParser::
-cvtStWriteBackRegAddrMode3(MCInst &Inst,
-                         const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
-  // Create a writeback register dummy placeholder.
-  Inst.addOperand(MCOperand::CreateImm(0));
-  ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
-  ((ARMOperand*)Operands[3])->addAddrMode3Operands(Inst, 3);
-  ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
-}
-
-/// cvtLdExtTWriteBackImm - Convert parsed operands to MCInst.
-/// Needed here because the Asm Gen Matcher can't handle properly tied operands
-/// when they refer multiple MIOperands inside a single one.
-void ARMAsmParser::
-cvtLdExtTWriteBackImm(MCInst &Inst,
-                      const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
-  // Rt
-  ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
-  // Create a writeback register dummy placeholder.
-  Inst.addOperand(MCOperand::CreateImm(0));
-  // addr
-  ((ARMOperand*)Operands[3])->addMemNoOffsetOperands(Inst, 1);
-  // offset
-  ((ARMOperand*)Operands[4])->addPostIdxImm8Operands(Inst, 1);
-  // pred
-  ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
-}
-
-/// cvtLdExtTWriteBackReg - Convert parsed operands to MCInst.
-/// Needed here because the Asm Gen Matcher can't handle properly tied operands
-/// when they refer multiple MIOperands inside a single one.
-void ARMAsmParser::
-cvtLdExtTWriteBackReg(MCInst &Inst,
-                      const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
-  // Rt
-  ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
-  // Create a writeback register dummy placeholder.
-  Inst.addOperand(MCOperand::CreateImm(0));
-  // addr
-  ((ARMOperand*)Operands[3])->addMemNoOffsetOperands(Inst, 1);
-  // offset
-  ((ARMOperand*)Operands[4])->addPostIdxRegOperands(Inst, 2);
-  // pred
-  ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
-}
-
-/// cvtStExtTWriteBackImm - Convert parsed operands to MCInst.
-/// Needed here because the Asm Gen Matcher can't handle properly tied operands
-/// when they refer multiple MIOperands inside a single one.
-void ARMAsmParser::
-cvtStExtTWriteBackImm(MCInst &Inst,
-                      const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
-  // Create a writeback register dummy placeholder.
-  Inst.addOperand(MCOperand::CreateImm(0));
-  // Rt
-  ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
-  // addr
-  ((ARMOperand*)Operands[3])->addMemNoOffsetOperands(Inst, 1);
-  // offset
-  ((ARMOperand*)Operands[4])->addPostIdxImm8Operands(Inst, 1);
-  // pred
-  ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
-}
-
-/// cvtStExtTWriteBackReg - Convert parsed operands to MCInst.
-/// Needed here because the Asm Gen Matcher can't handle properly tied operands
-/// when they refer multiple MIOperands inside a single one.
-void ARMAsmParser::
-cvtStExtTWriteBackReg(MCInst &Inst,
-                      const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
-  // Create a writeback register dummy placeholder.
-  Inst.addOperand(MCOperand::CreateImm(0));
-  // Rt
-  ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
-  // addr
-  ((ARMOperand*)Operands[3])->addMemNoOffsetOperands(Inst, 1);
-  // offset
-  ((ARMOperand*)Operands[4])->addPostIdxRegOperands(Inst, 2);
-  // pred
-  ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
-}
-
-/// cvtLdrdPre - Convert parsed operands to MCInst.
-/// Needed here because the Asm Gen Matcher can't handle properly tied operands
-/// when they refer multiple MIOperands inside a single one.
-void ARMAsmParser::
-cvtLdrdPre(MCInst &Inst,
-           const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
-  // Rt, Rt2
-  ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
-  ((ARMOperand*)Operands[3])->addRegOperands(Inst, 1);
-  // Create a writeback register dummy placeholder.
-  Inst.addOperand(MCOperand::CreateImm(0));
-  // addr
-  ((ARMOperand*)Operands[4])->addAddrMode3Operands(Inst, 3);
-  // pred
-  ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
-}
-
-/// cvtStrdPre - Convert parsed operands to MCInst.
-/// Needed here because the Asm Gen Matcher can't handle properly tied operands
-/// when they refer multiple MIOperands inside a single one.
-void ARMAsmParser::
-cvtStrdPre(MCInst &Inst,
-           const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
-  // Create a writeback register dummy placeholder.
-  Inst.addOperand(MCOperand::CreateImm(0));
-  // Rt, Rt2
-  ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
-  ((ARMOperand*)Operands[3])->addRegOperands(Inst, 1);
-  // addr
-  ((ARMOperand*)Operands[4])->addAddrMode3Operands(Inst, 3);
-  // pred
-  ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
-}
-
-/// cvtLdWriteBackRegAddrMode3 - Convert parsed operands to MCInst.
-/// Needed here because the Asm Gen Matcher can't handle properly tied operands
-/// when they refer multiple MIOperands inside a single one.
-void ARMAsmParser::
-cvtLdWriteBackRegAddrMode3(MCInst &Inst,
-                         const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
-  ((ARMOperand*)Operands[2])->addRegOperands(Inst, 1);
-  // Create a writeback register dummy placeholder.
-  Inst.addOperand(MCOperand::CreateImm(0));
-  ((ARMOperand*)Operands[3])->addAddrMode3Operands(Inst, 3);
-  ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
-}
-
-/// cvtThumbMultiply - Convert parsed operands to MCInst.
-/// Needed here because the Asm Gen Matcher can't handle properly tied operands
-/// when they refer multiple MIOperands inside a single one.
+/// Convert parsed operands to MCInst.  Needed here because this instruction
+/// only has two register operands, but multiplication is commutative so
+/// assemblers should accept both "mul rD, rN, rD" and "mul rD, rD, rN".
 void ARMAsmParser::
 cvtThumbMultiply(MCInst &Inst,
            const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
@@ -4310,62 +4102,6 @@ cvtThumbMultiply(MCInst &Inst,
   ((ARMOperand*)Operands[2])->addCondCodeOperands(Inst, 2);
 }
 
-void ARMAsmParser::
-cvtVLDwbFixed(MCInst &Inst,
-              const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
-  // Vd
-  ((ARMOperand*)Operands[3])->addVecListOperands(Inst, 1);
-  // Create a writeback register dummy placeholder.
-  Inst.addOperand(MCOperand::CreateImm(0));
-  // Vn
-  ((ARMOperand*)Operands[4])->addAlignedMemoryOperands(Inst, 2);
-  // pred
-  ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
-}
-
-void ARMAsmParser::
-cvtVLDwbRegister(MCInst &Inst,
-                 const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
-  // Vd
-  ((ARMOperand*)Operands[3])->addVecListOperands(Inst, 1);
-  // Create a writeback register dummy placeholder.
-  Inst.addOperand(MCOperand::CreateImm(0));
-  // Vn
-  ((ARMOperand*)Operands[4])->addAlignedMemoryOperands(Inst, 2);
-  // Vm
-  ((ARMOperand*)Operands[5])->addRegOperands(Inst, 1);
-  // pred
-  ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
-}
-
-void ARMAsmParser::
-cvtVSTwbFixed(MCInst &Inst,
-              const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
-  // Create a writeback register dummy placeholder.
-  Inst.addOperand(MCOperand::CreateImm(0));
-  // Vn
-  ((ARMOperand*)Operands[4])->addAlignedMemoryOperands(Inst, 2);
-  // Vt
-  ((ARMOperand*)Operands[3])->addVecListOperands(Inst, 1);
-  // pred
-  ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
-}
-
-void ARMAsmParser::
-cvtVSTwbRegister(MCInst &Inst,
-                 const SmallVectorImpl<MCParsedAsmOperand*> &Operands) {
-  // Create a writeback register dummy placeholder.
-  Inst.addOperand(MCOperand::CreateImm(0));
-  // Vn
-  ((ARMOperand*)Operands[4])->addAlignedMemoryOperands(Inst, 2);
-  // Vm
-  ((ARMOperand*)Operands[5])->addRegOperands(Inst, 1);
-  // Vt
-  ((ARMOperand*)Operands[3])->addVecListOperands(Inst, 1);
-  // pred
-  ((ARMOperand*)Operands[1])->addCondCodeOperands(Inst, 2);
-}
-
 /// Parse an ARM memory expression, return false if successful else return true
 /// or an error.  The first token must be a '[' when called.
 bool ARMAsmParser::
@@ -4869,7 +4605,10 @@ StringRef ARMAsmParser::splitMnemonic(StringRef Mnemonic,
       Mnemonic == "vcgt"  || Mnemonic == "vcle"   || Mnemonic == "smlal" ||
       Mnemonic == "umaal" || Mnemonic == "umlal"  || Mnemonic == "vabal" ||
       Mnemonic == "vmlal" || Mnemonic == "vpadal" || Mnemonic == "vqdmlal" ||
-      Mnemonic == "fmuls")
+      Mnemonic == "fmuls" || Mnemonic == "vmaxnm" || Mnemonic == "vminnm" ||
+      Mnemonic == "vcvta" || Mnemonic == "vcvtn"  || Mnemonic == "vcvtp" ||
+      Mnemonic == "vcvtm" || Mnemonic == "vrinta" || Mnemonic == "vrintn" ||
+      Mnemonic == "vrintp" || Mnemonic == "vrintm" || Mnemonic.startswith("vsel"))
     return Mnemonic;
 
   // First, split out any predication code. Ignore mnemonics we know aren't
@@ -4966,28 +4705,30 @@ getMnemonicAcceptInfo(StringRef Mnemonic, bool &CanAcceptCarrySet,
   } else
     CanAcceptCarrySet = false;
 
-  if (Mnemonic == "cbnz" || Mnemonic == "setend" || Mnemonic == "dmb" ||
-      Mnemonic == "cps" || Mnemonic == "mcr2" || Mnemonic == "it" ||
-      Mnemonic == "mcrr2" || Mnemonic == "cbz" || Mnemonic == "cdp2" ||
-      Mnemonic == "trap" || Mnemonic == "mrc2" || Mnemonic == "mrrc2" ||
-      Mnemonic == "dsb" || Mnemonic == "isb" || Mnemonic == "setend" ||
-      (Mnemonic == "clrex" && !isThumb()) ||
-      (Mnemonic == "nop" && isThumbOne()) ||
-      ((Mnemonic == "pld" || Mnemonic == "pli" || Mnemonic == "pldw" ||
-        Mnemonic == "ldc2" || Mnemonic == "ldc2l" ||
-        Mnemonic == "stc2" || Mnemonic == "stc2l") && !isThumb()) ||
-      ((Mnemonic.startswith("rfe") || Mnemonic.startswith("srs")) &&
-       !isThumb()) ||
-      Mnemonic.startswith("cps") || (Mnemonic == "movs" && isThumbOne())) {
+  if (Mnemonic == "bkpt" || Mnemonic == "cbnz" || Mnemonic == "setend" ||
+      Mnemonic == "cps" ||  Mnemonic == "it" ||  Mnemonic == "cbz" ||
+      Mnemonic == "trap" || Mnemonic == "setend" ||
+      Mnemonic.startswith("cps") || Mnemonic.startswith("vsel") ||
+      Mnemonic == "vmaxnm" || Mnemonic == "vminnm" || Mnemonic == "vcvta" ||
+      Mnemonic == "vcvtn" || Mnemonic == "vcvtp" || Mnemonic == "vcvtm" ||
+      Mnemonic == "vrinta" || Mnemonic == "vrintn" || Mnemonic == "vrintp" ||
+      Mnemonic == "vrintm") {
+    // These mnemonics are never predicable
     CanAcceptPredicationCode = false;
+  } else if (!isThumb()) {
+    // Some instructions are only predicable in Thumb mode
+    CanAcceptPredicationCode
+      = Mnemonic != "cdp2" && Mnemonic != "clrex" && Mnemonic != "mcr2" &&
+        Mnemonic != "mcrr2" && Mnemonic != "mrc2" && Mnemonic != "mrrc2" &&
+        Mnemonic != "dmb" && Mnemonic != "dsb" && Mnemonic != "isb" &&
+        Mnemonic != "pld" && Mnemonic != "pli" && Mnemonic != "pldw" &&
+        Mnemonic != "ldc2" && Mnemonic != "ldc2l" &&
+        Mnemonic != "stc2" && Mnemonic != "stc2l" &&
+        !Mnemonic.startswith("rfe") && !Mnemonic.startswith("srs");
+  } else if (isThumbOne()) {
+    CanAcceptPredicationCode = Mnemonic != "nop" && Mnemonic != "movs";
   } else
     CanAcceptPredicationCode = true;
-
-  if (isThumb()) {
-    if (Mnemonic == "bkpt" || Mnemonic == "mcr" || Mnemonic == "mcrr" ||
-        Mnemonic == "mrc" || Mnemonic == "mrrc" || Mnemonic == "cdp")
-      CanAcceptPredicationCode = false;
-  }
 }
 
 bool ARMAsmParser::shouldOmitCCOutOperand(StringRef Mnemonic,
@@ -5042,15 +4783,6 @@ bool ARMAsmParser::shouldOmitCCOutOperand(StringRef Mnemonic,
       static_cast<ARMOperand*>(Operands[5])->isImm()) {
     // Nest conditions rather than one big 'if' statement for readability.
     //
-    // If either register is a high reg, it's either one of the SP
-    // variants (handled above) or a 32-bit encoding, so we just
-    // check against T3. If the second register is the PC, this is an
-    // alternate form of ADR, which uses encoding T4, so check for that too.
-    if ((!isARMLowRegister(static_cast<ARMOperand*>(Operands[3])->getReg()) ||
-         !isARMLowRegister(static_cast<ARMOperand*>(Operands[4])->getReg())) &&
-        static_cast<ARMOperand*>(Operands[4])->getReg() != ARM::PC &&
-        static_cast<ARMOperand*>(Operands[5])->isT2SOImm())
-      return false;
     // If both registers are low, we're in an IT block, and the immediate is
     // in range, we should use encoding T1 instead, which has a cc_out.
     if (inITBlock() &&
@@ -5058,6 +4790,11 @@ bool ARMAsmParser::shouldOmitCCOutOperand(StringRef Mnemonic,
         isARMLowRegister(static_cast<ARMOperand*>(Operands[4])->getReg()) &&
         static_cast<ARMOperand*>(Operands[5])->isImm0_7())
       return false;
+    // Check against T3. If the second register is the PC, this is an
+    // alternate form of ADR, which uses encoding T4, so check for that too.
+    if (static_cast<ARMOperand*>(Operands[4])->getReg() != ARM::PC &&
+        static_cast<ARMOperand*>(Operands[5])->isT2SOImm())
+      return false;
 
     // Otherwise, we use encoding T4, which does not have a cc_out
     // operand.
@@ -5120,6 +4857,34 @@ bool ARMAsmParser::shouldOmitCCOutOperand(StringRef Mnemonic,
   return false;
 }
 
+bool ARMAsmParser::shouldOmitPredicateOperand(
+    StringRef Mnemonic, SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
+  // VRINT{Z, R, X} have a predicate operand in VFP, but not in NEON
+  unsigned RegIdx = 3;
+  if ((Mnemonic == "vrintz" || Mnemonic == "vrintx" || Mnemonic == "vrintr") &&
+      static_cast<ARMOperand *>(Operands[2])->getToken() == ".f32") {
+    if (static_cast<ARMOperand *>(Operands[3])->isToken() &&
+        static_cast<ARMOperand *>(Operands[3])->getToken() == ".f32")
+      RegIdx = 4;
+
+    if (static_cast<ARMOperand *>(Operands[RegIdx])->isReg() &&
+        (ARMMCRegisterClasses[ARM::DPRRegClassID]
+             .contains(static_cast<ARMOperand *>(Operands[RegIdx])->getReg()) ||
+         ARMMCRegisterClasses[ARM::QPRRegClassID]
+             .contains(static_cast<ARMOperand *>(Operands[RegIdx])->getReg())))
+      return true;
+  }
+  return false;
+}
+
+bool ARMAsmParser::isDeprecated(MCInst &Inst, StringRef &Info) {
+  if (hasV8Ops() && Inst.getOpcode() == ARM::SETEND) {
+    Info = "armv8";
+    return true;
+  }
+  return false;
+}
+
 static bool isDataTypeToken(StringRef Tok) {
   return Tok == ".8" || Tok == ".16" || Tok == ".32" || Tok == ".64" ||
     Tok == ".i8" || Tok == ".i16" || Tok == ".i32" || Tok == ".i64" ||
@@ -5266,7 +5031,17 @@ bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
         doesIgnoreDataTypeSuffix(Mnemonic, ExtraToken))
       continue;
 
-    if (ExtraToken != ".n") {
+    // For for ARM mode generate an error if the .n qualifier is used.
+    if (ExtraToken == ".n" && !isThumb()) {
+      SMLoc Loc = SMLoc::getFromPointer(NameLoc.getPointer() + Start);
+      return Error(Loc, "instruction with .n (narrow) qualifier not allowed in "
+                   "arm mode");
+    }
+
+    // The .n qualifier is always discarded as that is what the tables
+    // and matcher expect.  In ARM mode the .w qualifier has no effect,
+    // so discard it to avoid errors that can be caused by the matcher.
+    if (ExtraToken != ".n" && (isThumb() || ExtraToken != ".w")) {
       SMLoc Loc = SMLoc::getFromPointer(NameLoc.getPointer() + Start);
       Operands.push_back(ARMOperand::CreateToken(ExtraToken, Loc));
     }
@@ -5312,6 +5087,15 @@ bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
     delete Op;
   }
 
+  // Some instructions have the same mnemonic, but don't always
+  // have a predicate. Distinguish them here and delete the
+  // predicate if needed.
+  if (shouldOmitPredicateOperand(Mnemonic, Operands)) {
+    ARMOperand *Op = static_cast<ARMOperand*>(Operands[1]);
+    Operands.erase(Operands.begin() + 1);
+    delete Op;
+  }
+
   // ARM mode 'blx' need special handling, as the register operand version
   // is predicable, but the label operand version is not. So, we can't rely
   // on the Mnemonic based checking to correctly figure out when to put
@@ -5363,6 +5147,26 @@ bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
     }
   }
 
+  // FIXME: As said above, this is all a pretty gross hack.  This instruction
+  // does not fit with other "subs" and tblgen.
+  // Adjust operands of B9.3.19 SUBS PC, LR, #imm (Thumb2) system instruction
+  // so the Mnemonic is the original name "subs" and delete the predicate
+  // operand so it will match the table entry.
+  if (isThumbTwo() && Mnemonic == "sub" && Operands.size() == 6 &&
+      static_cast<ARMOperand*>(Operands[3])->isReg() &&
+      static_cast<ARMOperand*>(Operands[3])->getReg() == ARM::PC &&
+      static_cast<ARMOperand*>(Operands[4])->isReg() &&
+      static_cast<ARMOperand*>(Operands[4])->getReg() == ARM::LR &&
+      static_cast<ARMOperand*>(Operands[5])->isImm()) {
+    ARMOperand *Op0 = static_cast<ARMOperand*>(Operands[0]);
+    Operands.erase(Operands.begin());
+    delete Op0;
+    Operands.insert(Operands.begin(), ARMOperand::CreateToken(Name, NameLoc));
+
+    ARMOperand *Op1 = static_cast<ARMOperand*>(Operands[1]);
+    Operands.erase(Operands.begin() + 1);
+    delete Op1;
+  }
   return false;
 }
 
@@ -5581,6 +5385,10 @@ validateInstruction(MCInst &Inst,
   }
   }
 
+  StringRef DepInfo;
+  if (isDeprecated(Inst, DepInfo))
+    Warning(Loc, "deprecated on " + DepInfo);
+
   return false;
 }
 
@@ -5862,7 +5670,9 @@ processInstruction(MCInst &Inst,
   case ARM::t2LDRpcrel:
     // Select the narrow version if the immediate will fit.
     if (Inst.getOperand(1).getImm() > 0 &&
-        Inst.getOperand(1).getImm() <= 0xff)
+        Inst.getOperand(1).getImm() <= 0xff &&
+        !(static_cast<ARMOperand*>(Operands[2])->isToken() &&
+         static_cast<ARMOperand*>(Operands[2])->getToken() == ".w"))
       Inst.setOpcode(ARM::tLDRpci);
     else
       Inst.setOpcode(ARM::t2LDRpci);
@@ -7851,8 +7661,8 @@ bool ARMAsmParser::parseDirectiveARM(SMLoc L) {
 /// parseDirectiveThumbFunc
 ///  ::= .thumbfunc symbol_name
 bool ARMAsmParser::parseDirectiveThumbFunc(SMLoc L) {
-  const MCAsmInfo &MAI = getParser().getStreamer().getContext().getAsmInfo();
-  bool isMachO = MAI.hasSubsectionsViaSymbols();
+  const MCAsmInfo *MAI = getParser().getStreamer().getContext().getAsmInfo();
+  bool isMachO = MAI->hasSubsectionsViaSymbols();
   StringRef Name;
   bool needFuncName = true;
 
@@ -8199,11 +8009,19 @@ bool ARMAsmParser::parseDirectiveRegSave(SMLoc L, bool IsVector) {
   if (HandlerDataLoc.isValid())
     return Error(L, ".save or .vsave must precede .handlerdata directive");
 
+  // RAII object to make sure parsed operands are deleted.
+  struct CleanupObject {
+    SmallVector<MCParsedAsmOperand *, 1> Operands;
+    ~CleanupObject() {
+      for (unsigned I = 0, E = Operands.size(); I != E; ++I)
+        delete Operands[I];
+    }
+  } CO;
+
   // Parse the register list
-  SmallVector<MCParsedAsmOperand*, 1> Operands;
-  if (parseRegisterList(Operands))
+  if (parseRegisterList(CO.Operands))
     return true;
-  ARMOperand *Op = (ARMOperand*)Operands[0];
+  ARMOperand *Op = (ARMOperand*)CO.Operands[0];
   if (!IsVector && !Op->isRegList())
     return Error(L, ".save expects GPR registers");
   if (IsVector && !Op->isDPRRegList())
diff --git a/lib/Target/ARM/CMakeLists.txt b/lib/Target/ARM/CMakeLists.txt
index b832508..f271a93 100644
--- a/lib/Target/ARM/CMakeLists.txt
+++ b/lib/Target/ARM/CMakeLists.txt
@@ -49,7 +49,7 @@ add_llvm_target(ARMCodeGen
   Thumb2SizeReduction.cpp
   )
 
-add_dependencies(LLVMARMCodeGen intrinsics_gen)
+add_dependencies(LLVMARMCodeGen ARMCommonTableGen intrinsics_gen)
 
 # workaround for hanging compilation on MSVC9, 10
 if( MSVC_VERSION EQUAL 1600 OR MSVC_VERSION EQUAL 1500 )
diff --git a/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
index a6eab33..8a06664 100644
--- a/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
+++ b/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
@@ -347,6 +347,14 @@ static DecodeStatus DecodeT2AddrModeSOReg(MCInst &Inst, unsigned Val,
                                uint64_t Address, const void *Decoder);
 static DecodeStatus DecodeT2LoadShift(MCInst &Inst, unsigned Val,
                                uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeT2LoadImm8(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void* Decoder);
+static DecodeStatus DecodeT2LoadImm12(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void* Decoder);
+static DecodeStatus DecodeT2LoadT(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void* Decoder);
+static DecodeStatus DecodeT2LoadLabel(MCInst &Inst, unsigned Insn,
+                               uint64_t Address, const void* Decoder);
 static DecodeStatus DecodeT2Imm8S4(MCInst &Inst, unsigned Val,
                                uint64_t Address, const void *Decoder);
 static DecodeStatus DecodeT2AddrModeImm8s4(MCInst &Inst, unsigned Val,
@@ -448,6 +456,13 @@ DecodeStatus ARMDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
   }
 
   MI.clear();
+  result = decodeInstruction(DecoderTableVFPV832, MI, insn, Address, this, STI);
+  if (result != MCDisassembler::Fail) {
+    Size = 4;
+    return result;
+  }
+
+  MI.clear();
   result = decodeInstruction(DecoderTableNEONData32, MI, insn, Address,
                              this, STI);
   if (result != MCDisassembler::Fail) {
@@ -484,7 +499,14 @@ DecodeStatus ARMDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
   }
 
   MI.clear();
+  result = decodeInstruction(DecoderTablev8NEON32, MI, insn, Address,
+                             this, STI);
+  if (result != MCDisassembler::Fail) {
+    Size = 4;
+    return result;
+  }
 
+  MI.clear();
   Size = 0;
   return MCDisassembler::Fail;
 }
@@ -746,23 +768,34 @@ DecodeStatus ThumbDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
     return result;
   }
 
-  MI.clear();
-  result = decodeInstruction(DecoderTableVFP32, MI, insn32, Address, this, STI);
-  if (result != MCDisassembler::Fail) {
-    Size = 4;
-    UpdateThumbVFPPredicate(MI);
-    return result;
+  if (fieldFromInstruction(insn32, 28, 4) == 0xE) {
+    MI.clear();
+    result = decodeInstruction(DecoderTableVFP32, MI, insn32, Address, this, STI);
+    if (result != MCDisassembler::Fail) {
+      Size = 4;
+      UpdateThumbVFPPredicate(MI);
+      return result;
+    }
   }
 
   MI.clear();
-  result = decodeInstruction(DecoderTableNEONDup32, MI, insn32, Address,
-                             this, STI);
+  result = decodeInstruction(DecoderTableVFPV832, MI, insn32, Address, this, STI);
   if (result != MCDisassembler::Fail) {
     Size = 4;
-    Check(result, AddThumbPredicate(MI));
     return result;
   }
 
+  if (fieldFromInstruction(insn32, 28, 4) == 0xE) {
+    MI.clear();
+    result = decodeInstruction(DecoderTableNEONDup32, MI, insn32, Address,
+                               this, STI);
+    if (result != MCDisassembler::Fail) {
+      Size = 4;
+      Check(result, AddThumbPredicate(MI));
+      return result;
+    }
+  }
+
   if (fieldFromInstruction(insn32, 24, 8) == 0xF9) {
     MI.clear();
     uint32_t NEONLdStInsn = insn32;
@@ -792,6 +825,17 @@ DecodeStatus ThumbDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
     }
   }
 
+  MI.clear();
+  uint32_t NEONv8Insn = insn32;
+  NEONv8Insn &= 0xF3FFFFFF; // Clear bits 27-26
+  result = decodeInstruction(DecoderTablev8NEON32, MI, NEONv8Insn, Address,
+                             this, STI);
+  if (result != MCDisassembler::Fail) {
+    Size = 4;
+    return result;
+  }
+
+  MI.clear();
   Size = 0;
   return MCDisassembler::Fail;
 }
@@ -908,8 +952,11 @@ static DecodeStatus DecodetcGPRRegisterClass(MCInst &Inst, unsigned RegNo,
 
 static DecodeStatus DecoderGPRRegisterClass(MCInst &Inst, unsigned RegNo,
                                    uint64_t Address, const void *Decoder) {
-  if (RegNo == 13 || RegNo == 15) return MCDisassembler::Fail;
-  return DecodeGPRRegisterClass(Inst, RegNo, Address, Decoder);
+  DecodeStatus S = MCDisassembler::Success;
+  if (RegNo == 13 || RegNo == 15)
+    S = MCDisassembler::SoftFail;
+  Check(S, DecodeGPRRegisterClass(Inst, RegNo, Address, Decoder));
+  return S;
 }
 
 static const uint16_t SPRDecoderTable[] = {
@@ -2104,7 +2151,7 @@ DecodeT2BInstruction(MCInst &Inst, unsigned Insn,
   unsigned imm10 = fieldFromInstruction(Insn, 16, 10);
   unsigned imm11 = fieldFromInstruction(Insn, 0, 11);
   unsigned tmp = (S << 23) | (I1 << 22) | (I2 << 21) | (imm10 << 11) | imm11;
-  int imm32 = SignExtend32<24>(tmp << 1);
+  int imm32 = SignExtend32<25>(tmp << 1);
   if (!tryAddingSymbolicOperand(Address, Address + imm32 + 4,
                                 true, 4, Inst, Decoder))
     Inst.addOperand(MCOperand::CreateImm(imm32));
@@ -3164,6 +3211,17 @@ static DecodeStatus DecodeT2AddrModeSOReg(MCInst &Inst, unsigned Val,
   unsigned Rm = fieldFromInstruction(Val, 2, 4);
   unsigned imm = fieldFromInstruction(Val, 0, 2);
 
+  // Thumb stores cannot use PC as dest register.
+  switch (Inst.getOpcode()) {
+  case ARM::t2STRHs:
+  case ARM::t2STRBs:
+  case ARM::t2STRs:
+    if (Rn == 15)
+      return MCDisassembler::Fail;
+  default:
+    break;
+  }
+
   if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
     return MCDisassembler::Fail;
   if (!Check(S, DecoderGPRRegisterClass(Inst, Rm, Address, Decoder)))
@@ -3177,53 +3235,282 @@ static DecodeStatus DecodeT2LoadShift(MCInst &Inst, unsigned Insn,
                               uint64_t Address, const void *Decoder) {
   DecodeStatus S = MCDisassembler::Success;
 
+  unsigned Rt = fieldFromInstruction(Insn, 12, 4);
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+
+  if (Rn == 15) {
+    switch (Inst.getOpcode()) {
+    case ARM::t2LDRBs:
+      Inst.setOpcode(ARM::t2LDRBpci);
+      break;
+    case ARM::t2LDRHs:
+      Inst.setOpcode(ARM::t2LDRHpci);
+      break;
+    case ARM::t2LDRSHs:
+      Inst.setOpcode(ARM::t2LDRSHpci);
+      break;
+    case ARM::t2LDRSBs:
+      Inst.setOpcode(ARM::t2LDRSBpci);
+      break;
+    case ARM::t2LDRs:
+      Inst.setOpcode(ARM::t2LDRpci);
+      break;
+    case ARM::t2PLDs:
+      Inst.setOpcode(ARM::t2PLDpci);
+      break;
+    case ARM::t2PLIs:
+      Inst.setOpcode(ARM::t2PLIpci);
+      break;
+    default:
+      return MCDisassembler::Fail;
+    }
+
+    return DecodeT2LoadLabel(Inst, Insn, Address, Decoder);
+  }
+
+  if (Rt == 15) {
+    switch (Inst.getOpcode()) {
+    case ARM::t2LDRSHs:
+      return MCDisassembler::Fail;
+    case ARM::t2LDRHs:
+      // FIXME: this instruction is only available with MP extensions,
+      // this should be checked first but we don't have access to the
+      // feature bits here.
+      Inst.setOpcode(ARM::t2PLDWs);
+      break;
+    default:
+      break;
+    }
+  }
+
   switch (Inst.getOpcode()) {
     case ARM::t2PLDs:
     case ARM::t2PLDWs:
     case ARM::t2PLIs:
       break;
-    default: {
-      unsigned Rt = fieldFromInstruction(Insn, 12, 4);
-      if (!Check(S, DecoderGPRRegisterClass(Inst, Rt, Address, Decoder)))
+    default:
+      if (!Check(S, DecodeGPRRegisterClass(Inst, Rt, Address, Decoder)))
+        return MCDisassembler::Fail;
+  }
+
+  unsigned addrmode = fieldFromInstruction(Insn, 4, 2);
+  addrmode |= fieldFromInstruction(Insn, 0, 4) << 2;
+  addrmode |= fieldFromInstruction(Insn, 16, 4) << 6;
+  if (!Check(S, DecodeT2AddrModeSOReg(Inst, addrmode, Address, Decoder)))
     return MCDisassembler::Fail;
+
+  return S;
+}
+
+static DecodeStatus DecodeT2LoadImm8(MCInst &Inst, unsigned Insn,
+                                uint64_t Address, const void* Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned Rt = fieldFromInstruction(Insn, 12, 4);
+  unsigned U = fieldFromInstruction(Insn, 9, 1);
+  unsigned imm = fieldFromInstruction(Insn, 0, 8);
+  imm |= (U << 8);
+  imm |= (Rn << 9);
+
+  if (Rn == 15) {
+    switch (Inst.getOpcode()) {
+    case ARM::t2LDRi8:
+      Inst.setOpcode(ARM::t2LDRpci);
+      break;
+    case ARM::t2LDRBi8:
+      Inst.setOpcode(ARM::t2LDRBpci);
+      break;
+    case ARM::t2LDRSBi8:
+      Inst.setOpcode(ARM::t2LDRSBpci);
+      break;
+    case ARM::t2LDRHi8:
+      Inst.setOpcode(ARM::t2LDRHpci);
+      break;
+    case ARM::t2LDRSHi8:
+      Inst.setOpcode(ARM::t2LDRSHpci);
+      break;
+    case ARM::t2PLDi8:
+      Inst.setOpcode(ARM::t2PLDpci);
+      break;
+    case ARM::t2PLIi8:
+      Inst.setOpcode(ARM::t2PLIpci);
+      break;
+    default:
+      return MCDisassembler::Fail;
+    }
+    return DecodeT2LoadLabel(Inst, Insn, Address, Decoder);
+  }
+
+  if (Rt == 15) {
+    switch (Inst.getOpcode()) {
+    case ARM::t2LDRSHi8:
+      return MCDisassembler::Fail;
+    default:
+      break;
     }
   }
 
+  switch (Inst.getOpcode()) {
+  case ARM::t2PLDi8:
+  case ARM::t2PLIi8:
+  case ARM::t2PLDWi8:
+    break;
+  default:
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rt, Address, Decoder)))
+      return MCDisassembler::Fail;
+  }
+
+  if (!Check(S, DecodeT2AddrModeImm8(Inst, imm, Address, Decoder)))
+    return MCDisassembler::Fail;
+  return S;
+}
+
+static DecodeStatus DecodeT2LoadImm12(MCInst &Inst, unsigned Insn,
+                                uint64_t Address, const void* Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
   unsigned Rn = fieldFromInstruction(Insn, 16, 4);
-  if (Rn == 0xF) {
+  unsigned Rt = fieldFromInstruction(Insn, 12, 4);
+  unsigned imm = fieldFromInstruction(Insn, 0, 12);
+  imm |= (Rn << 13);
+
+  if (Rn == 15) {
     switch (Inst.getOpcode()) {
-      case ARM::t2LDRBs:
-        Inst.setOpcode(ARM::t2LDRBpci);
-        break;
-      case ARM::t2LDRHs:
-        Inst.setOpcode(ARM::t2LDRHpci);
-        break;
-      case ARM::t2LDRSHs:
-        Inst.setOpcode(ARM::t2LDRSHpci);
-        break;
-      case ARM::t2LDRSBs:
-        Inst.setOpcode(ARM::t2LDRSBpci);
+    case ARM::t2LDRi12:
+      Inst.setOpcode(ARM::t2LDRpci);
+      break;
+    case ARM::t2LDRHi12:
+      Inst.setOpcode(ARM::t2LDRHpci);
+      break;
+    case ARM::t2LDRSHi12:
+      Inst.setOpcode(ARM::t2LDRSHpci);
+      break;
+    case ARM::t2LDRBi12:
+      Inst.setOpcode(ARM::t2LDRBpci);
+      break;
+    case ARM::t2LDRSBi12:
+      Inst.setOpcode(ARM::t2LDRSBpci);
+      break;
+    case ARM::t2PLDi12:
+      Inst.setOpcode(ARM::t2PLDpci);
+      break;
+    case ARM::t2PLIi12:
+      Inst.setOpcode(ARM::t2PLIpci);
+      break;
+    default:
+      return MCDisassembler::Fail;
+    }
+    return DecodeT2LoadLabel(Inst, Insn, Address, Decoder);
+  }
+
+  if (Rt == 15) {
+    switch (Inst.getOpcode()) {
+    case ARM::t2LDRSHi12:
+      return MCDisassembler::Fail;
+    case ARM::t2LDRHi12:
+      Inst.setOpcode(ARM::t2PLDi12);
+      break;
+    default:
+      break;
+    }
+  }
+
+  switch (Inst.getOpcode()) {
+  case ARM::t2PLDi12:
+  case ARM::t2PLDWi12:
+  case ARM::t2PLIi12:
+    break;
+  default:
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rt, Address, Decoder)))
+      return MCDisassembler::Fail;
+  }
+
+  if (!Check(S, DecodeT2AddrModeImm12(Inst, imm, Address, Decoder)))
+    return MCDisassembler::Fail;
+  return S;
+}
+
+static DecodeStatus DecodeT2LoadT(MCInst &Inst, unsigned Insn,
+                                uint64_t Address, const void* Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rn = fieldFromInstruction(Insn, 16, 4);
+  unsigned Rt = fieldFromInstruction(Insn, 12, 4);
+  unsigned imm = fieldFromInstruction(Insn, 0, 8);
+  imm |= (Rn << 9);
+
+  if (Rn == 15) {
+    switch (Inst.getOpcode()) {
+    case ARM::t2LDRT:
+      Inst.setOpcode(ARM::t2LDRpci);
+      break;
+    case ARM::t2LDRBT:
+      Inst.setOpcode(ARM::t2LDRBpci);
+      break;
+    case ARM::t2LDRHT:
+      Inst.setOpcode(ARM::t2LDRHpci);
+      break;
+    case ARM::t2LDRSBT:
+      Inst.setOpcode(ARM::t2LDRSBpci);
+      break;
+    case ARM::t2LDRSHT:
+      Inst.setOpcode(ARM::t2LDRSHpci);
+      break;
+    default:
+      return MCDisassembler::Fail;
+    }
+    return DecodeT2LoadLabel(Inst, Insn, Address, Decoder);
+  }
+
+  if (!Check(S, DecoderGPRRegisterClass(Inst, Rt, Address, Decoder)))
+    return MCDisassembler::Fail;
+  if (!Check(S, DecodeT2AddrModeImm8(Inst, imm, Address, Decoder)))
+    return MCDisassembler::Fail;
+  return S;
+}
+
+static DecodeStatus DecodeT2LoadLabel(MCInst &Inst, unsigned Insn,
+                                uint64_t Address, const void* Decoder) {
+  DecodeStatus S = MCDisassembler::Success;
+
+  unsigned Rt = fieldFromInstruction(Insn, 12, 4);
+  unsigned U = fieldFromInstruction(Insn, 23, 1);
+  int imm = fieldFromInstruction(Insn, 0, 12);
+
+  if (Rt == 15) {
+    switch (Inst.getOpcode()) {
+      case ARM::t2LDRBpci:
+      case ARM::t2LDRHpci:
+        Inst.setOpcode(ARM::t2PLDpci);
         break;
-      case ARM::t2PLDs:
-        Inst.setOpcode(ARM::t2PLDi12);
-        Inst.addOperand(MCOperand::CreateReg(ARM::PC));
+      case ARM::t2LDRSBpci:
+        Inst.setOpcode(ARM::t2PLIpci);
         break;
-      default:
+      case ARM::t2LDRSHpci:
         return MCDisassembler::Fail;
+      default:
+        break;
     }
+  }
 
-    int imm = fieldFromInstruction(Insn, 0, 12);
-    if (!fieldFromInstruction(Insn, 23, 1)) imm *= -1;
-    Inst.addOperand(MCOperand::CreateImm(imm));
-
-    return S;
+  switch(Inst.getOpcode()) {
+  case ARM::t2PLDpci:
+  case ARM::t2PLIpci:
+    break;
+  default:
+    if (!Check(S, DecodeGPRRegisterClass(Inst, Rt, Address, Decoder)))
+      return MCDisassembler::Fail;
   }
 
-  unsigned addrmode = fieldFromInstruction(Insn, 4, 2);
-  addrmode |= fieldFromInstruction(Insn, 0, 4) << 2;
-  addrmode |= fieldFromInstruction(Insn, 16, 4) << 6;
-  if (!Check(S, DecodeT2AddrModeSOReg(Inst, addrmode, Address, Decoder)))
-    return MCDisassembler::Fail;
+  if (!U) {
+    // Special case for #-0.
+    if (imm == 0)
+      imm = INT32_MIN;
+    else
+      imm = -imm;
+  }
+  Inst.addOperand(MCOperand::CreateImm(imm));
 
   return S;
 }
@@ -3292,6 +3579,21 @@ static DecodeStatus DecodeT2AddrModeImm8(MCInst &Inst, unsigned Val,
   unsigned Rn = fieldFromInstruction(Val, 9, 4);
   unsigned imm = fieldFromInstruction(Val, 0, 9);
 
+  // Thumb stores cannot use PC as dest register.
+  switch (Inst.getOpcode()) {
+  case ARM::t2STRT:
+  case ARM::t2STRBT:
+  case ARM::t2STRHT:
+  case ARM::t2STRi8:
+  case ARM::t2STRHi8:
+  case ARM::t2STRBi8:
+    if (Rn == 15)
+      return MCDisassembler::Fail;
+    break;
+  default:
+    break;
+  }
+
   // Some instructions always use an additive offset.
   switch (Inst.getOpcode()) {
     case ARM::t2LDRT:
@@ -3327,6 +3629,37 @@ static DecodeStatus DecodeT2LdStPre(MCInst &Inst, unsigned Insn,
   addr |= Rn << 9;
   unsigned load = fieldFromInstruction(Insn, 20, 1);
 
+  if (Rn == 15) {
+    switch (Inst.getOpcode()) {
+    case ARM::t2LDR_PRE:
+    case ARM::t2LDR_POST:
+      Inst.setOpcode(ARM::t2LDRpci);
+      break;
+    case ARM::t2LDRB_PRE:
+    case ARM::t2LDRB_POST:
+      Inst.setOpcode(ARM::t2LDRBpci);
+      break;
+    case ARM::t2LDRH_PRE:
+    case ARM::t2LDRH_POST:
+      Inst.setOpcode(ARM::t2LDRHpci);
+      break;
+    case ARM::t2LDRSB_PRE:
+    case ARM::t2LDRSB_POST:
+      if (Rt == 15)
+        Inst.setOpcode(ARM::t2PLIpci);
+      else
+        Inst.setOpcode(ARM::t2LDRSBpci);
+      break;
+    case ARM::t2LDRSH_PRE:
+    case ARM::t2LDRSH_POST:
+      Inst.setOpcode(ARM::t2LDRSHpci);
+      break;
+    default:
+      return MCDisassembler::Fail;
+    }
+    return DecodeT2LoadLabel(Inst, Insn, Address, Decoder);
+  }
+
   if (!load) {
     if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
       return MCDisassembler::Fail;
@@ -3353,6 +3686,17 @@ static DecodeStatus DecodeT2AddrModeImm12(MCInst &Inst, unsigned Val,
   unsigned Rn = fieldFromInstruction(Val, 13, 4);
   unsigned imm = fieldFromInstruction(Val, 0, 12);
 
+  // Thumb stores cannot use PC as dest register.
+  switch (Inst.getOpcode()) {
+  case ARM::t2STRi12:
+  case ARM::t2STRBi12:
+  case ARM::t2STRHi12:
+    if (Rn == 15)
+      return MCDisassembler::Fail;
+  default:
+    break;
+  }
+
   if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
     return MCDisassembler::Fail;
   Inst.addOperand(MCOperand::CreateImm(imm));
@@ -4364,10 +4708,8 @@ static DecodeStatus DecodeIT(MCInst &Inst, unsigned Insn,
     S = MCDisassembler::SoftFail;
   }
 
-  if (mask == 0x0) {
-    mask |= 0x8;
-    S = MCDisassembler::SoftFail;
-  }
+  if (mask == 0x0)
+    return MCDisassembler::Fail;
 
   Inst.addOperand(MCOperand::CreateImm(pred));
   Inst.addOperand(MCOperand::CreateImm(mask));
diff --git a/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp b/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp
index 7fef795..97da232 100644
--- a/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp
+++ b/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp
@@ -243,15 +243,6 @@ void ARMInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
     return;
   }
 
-  // Thumb1 NOP
-  if (Opcode == ARM::tMOVr && MI->getOperand(0).getReg() == ARM::R8 &&
-      MI->getOperand(1).getReg() == ARM::R8) {
-    O << "\tnop";
-    printPredicateOperand(MI, 2, O);
-    printAnnotation(O, Annot);
-    return;
-  }
-
   // Combine 2 GPRs from disassember into a GPRPair to match with instr def.
   // ldrexd/strexd require even/odd GPR pair. To enforce this constraint,
   // a single GPRPair reg operand is used in the .td file to replace the two
@@ -315,15 +306,29 @@ void ARMInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
 void ARMInstPrinter::printThumbLdrLabelOperand(const MCInst *MI, unsigned OpNum,
                                                raw_ostream &O) {
   const MCOperand &MO1 = MI->getOperand(OpNum);
-  if (MO1.isExpr())
+  if (MO1.isExpr()) {
     O << *MO1.getExpr();
-  else if (MO1.isImm()) {
-    O << markup("<mem:") << "[pc, "
-      << markup("<imm:") << "#" << formatImm(MO1.getImm())
-      << markup(">]>", "]");
+    return;
   }
-  else
-    llvm_unreachable("Unknown LDR label operand?");
+
+  O << markup("<mem:") << "[pc, ";
+
+  int32_t OffImm = (int32_t)MO1.getImm();
+  bool isSub = OffImm < 0;
+
+  // Special value for #-0. All others are normal.
+  if (OffImm == INT32_MIN)
+    OffImm = 0;
+  if (isSub) {
+    O << markup("<imm:")
+      << "#-" << formatImm(-OffImm)
+      << markup(">");
+  } else {
+    O << markup("<imm:")
+      << "#" << formatImm(OffImm)
+      << markup(">");
+  }
+  O << "]" << markup(">");
 }
 
 // so_reg is a 4-operand unit corresponding to register forms of the A5.1
@@ -895,6 +900,7 @@ void ARMInstPrinter::printPCLabel(const MCInst *MI, unsigned OpNum,
   llvm_unreachable("Unhandled PC-relative pseudo-instruction!");
 }
 
+template<unsigned scale>
 void ARMInstPrinter::printAdrLabelOperand(const MCInst *MI, unsigned OpNum,
                                   raw_ostream &O) {
   const MCOperand &MO = MI->getOperand(OpNum);
@@ -904,7 +910,7 @@ void ARMInstPrinter::printAdrLabelOperand(const MCInst *MI, unsigned OpNum,
     return;
   }
 
-  int32_t OffImm = (int32_t)MO.getImm();
+  int32_t OffImm = (int32_t)MO.getImm() << scale;
 
   O << markup("<imm:");
   if (OffImm == INT32_MIN)
@@ -1065,6 +1071,7 @@ void ARMInstPrinter::printAddrModeImm12Operand(const MCInst *MI, unsigned OpNum,
   O << "]" << markup(">");
 }
 
+template<bool AlwaysPrintImm0>
 void ARMInstPrinter::printT2AddrModeImm8Operand(const MCInst *MI,
                                                 unsigned OpNum,
                                                 raw_ostream &O) {
@@ -1075,22 +1082,25 @@ void ARMInstPrinter::printT2AddrModeImm8Operand(const MCInst *MI,
   printRegName(O, MO1.getReg());
 
   int32_t OffImm = (int32_t)MO2.getImm();
+  bool isSub = OffImm < 0;
   // Don't print +0.
-  if (OffImm != 0)
-    O << ", ";
-  if (OffImm != 0 && UseMarkup)
-    O << "<imm:";
   if (OffImm == INT32_MIN)
-    O << "#-0";
-  else if (OffImm < 0)
-    O << "#-" << -OffImm;
-  else if (OffImm > 0)
-    O << "#" << OffImm;
-  if (OffImm != 0 && UseMarkup)
-    O << ">";
+    OffImm = 0;
+  if (isSub) {
+    O << ", "
+      << markup("<imm:")
+      << "#-" << -OffImm
+      << markup(">");
+  } else if (AlwaysPrintImm0 || OffImm > 0) {
+    O << ", "
+      << markup("<imm:")
+      << "#" << OffImm
+      << markup(">");
+  }
   O << "]" << markup(">");
 }
 
+template<bool AlwaysPrintImm0>
 void ARMInstPrinter::printT2AddrModeImm8s4Operand(const MCInst *MI,
                                                   unsigned OpNum,
                                                   raw_ostream &O) {
@@ -1106,22 +1116,24 @@ void ARMInstPrinter::printT2AddrModeImm8s4Operand(const MCInst *MI,
   printRegName(O, MO1.getReg());
 
   int32_t OffImm = (int32_t)MO2.getImm();
+  bool isSub = OffImm < 0;
 
   assert(((OffImm & 0x3) == 0) && "Not a valid immediate!");
 
   // Don't print +0.
-  if (OffImm != 0)
-    O << ", ";
-  if (OffImm != 0 && UseMarkup)
-    O << "<imm:";
   if (OffImm == INT32_MIN)
-    O << "#-0";
-  else if (OffImm < 0)
-    O << "#-" << -OffImm;
-  else if (OffImm > 0)
-    O << "#" << OffImm;
-  if (OffImm != 0 && UseMarkup)
-    O << ">";
+    OffImm = 0;
+  if (isSub) {
+    O << ", "
+      << markup("<imm:")
+      << "#-" << -OffImm
+      << markup(">");
+  } else if (AlwaysPrintImm0 || OffImm > 0) {
+    O << ", "
+      << markup("<imm:")
+      << "#" << OffImm
+      << markup(">");
+  }
   O << "]" << markup(">");
 }
 
@@ -1148,7 +1160,9 @@ void ARMInstPrinter::printT2AddrModeImm8OffsetOperand(const MCInst *MI,
   const MCOperand &MO1 = MI->getOperand(OpNum);
   int32_t OffImm = (int32_t)MO1.getImm();
   O << ", " << markup("<imm:");
-  if (OffImm < 0)
+  if (OffImm == INT32_MIN)
+    O << "#-0";
+  else if (OffImm < 0)
     O << "#-" << -OffImm;
   else
     O << "#" << OffImm;
@@ -1163,19 +1177,14 @@ void ARMInstPrinter::printT2AddrModeImm8s4OffsetOperand(const MCInst *MI,
 
   assert(((OffImm & 0x3) == 0) && "Not a valid immediate!");
 
-  // Don't print +0.
-  if (OffImm != 0)
-    O << ", ";
-  if (OffImm != 0 && UseMarkup)
-    O << "<imm:";
+  O << ", " << markup("<imm:");
   if (OffImm == INT32_MIN)
     O << "#-0";
   else if (OffImm < 0)
     O << "#-" << -OffImm;
-  else if (OffImm > 0)
+  else
     O << "#" << OffImm;
-  if (OffImm != 0 && UseMarkup)
-    O << ">";
+  O << markup(">");
 }
 
 void ARMInstPrinter::printT2AddrModeSoRegOperand(const MCInst *MI,
diff --git a/lib/Target/ARM/InstPrinter/ARMInstPrinter.h b/lib/Target/ARM/InstPrinter/ARMInstPrinter.h
index 5a64348..15ae8d1 100644
--- a/lib/Target/ARM/InstPrinter/ARMInstPrinter.h
+++ b/lib/Target/ARM/InstPrinter/ARMInstPrinter.h
@@ -76,6 +76,7 @@ public:
   void printPKHLSLShiftImm(const MCInst *MI, unsigned OpNum, raw_ostream &O);
   void printPKHASRShiftImm(const MCInst *MI, unsigned OpNum, raw_ostream &O);
 
+  template <unsigned scale>
   void printAdrLabelOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
   void printThumbS4ImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
   void printThumbSRImm(const MCInst *MI, unsigned OpNum, raw_ostream &O);
@@ -97,8 +98,10 @@ public:
   template<bool AlwaysPrintImm0>
   void printAddrModeImm12Operand(const MCInst *MI, unsigned OpNum,
                                  raw_ostream &O);
+  template<bool AlwaysPrintImm0>
   void printT2AddrModeImm8Operand(const MCInst *MI, unsigned OpNum,
                                   raw_ostream &O);
+  template<bool AlwaysPrintImm0>
   void printT2AddrModeImm8s4Operand(const MCInst *MI, unsigned OpNum,
                                     raw_ostream &O);
   void printT2AddrModeImm0_1020s4Operand(const MCInst *MI, unsigned OpNum,
diff --git a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
index 8baa3a6..b1e25d8 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
@@ -152,7 +152,7 @@ static unsigned getRelaxedOpcode(unsigned Op) {
   switch (Op) {
   default: return Op;
   case ARM::tBcc:       return ARM::t2Bcc;
-  case ARM::tLDRpciASM: return ARM::t2LDRpci;
+  case ARM::tLDRpci:    return ARM::t2LDRpci;
   case ARM::tADR:       return ARM::t2ADR;
   case ARM::tB:         return ARM::t2B;
   }
diff --git a/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp b/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
index 679d3c4..6b98205 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
@@ -109,18 +109,17 @@ public:
   /// This is one of the functions used to emit data into an ELF section, so the
   /// ARM streamer overrides it to add the appropriate mapping symbol ($d) if
   /// necessary.
-  virtual void EmitBytes(StringRef Data, unsigned AddrSpace) {
+  virtual void EmitBytes(StringRef Data) {
     EmitDataMappingSymbol();
-    MCELFStreamer::EmitBytes(Data, AddrSpace);
+    MCELFStreamer::EmitBytes(Data);
   }
 
   /// This is one of the functions used to emit data into an ELF section, so the
   /// ARM streamer overrides it to add the appropriate mapping symbol ($d) if
   /// necessary.
-  virtual void EmitValueImpl(const MCExpr *Value, unsigned Size,
-                             unsigned AddrSpace) {
+  virtual void EmitValueImpl(const MCExpr *Value, unsigned Size) {
     EmitDataMappingSymbol();
-    MCELFStreamer::EmitValueImpl(Value, Size, AddrSpace);
+    MCELFStreamer::EmitValueImpl(Value, Size);
   }
 
   virtual void EmitAssemblerFlag(MCAssemblerFlag Flag) {
@@ -204,7 +203,7 @@ private:
 
   void EmitPersonalityFixup(StringRef Name);
   void FlushPendingOffset();
-  void FlushUnwindOpcodes(bool AllowCompactModel0);
+  void FlushUnwindOpcodes(bool NoHandlerData);
 
   void SwitchToEHSection(const char *Prefix, unsigned Type, unsigned Flags,
                          SectionKind Kind, const MCSymbol &Fn);
@@ -336,17 +335,17 @@ void ARMELFStreamer::EmitFnEnd() {
                             MCSymbolRefExpr::VK_ARM_PREL31,
                             getContext());
 
-  EmitValue(FnStartRef, 4, 0);
+  EmitValue(FnStartRef, 4);
 
   if (CantUnwind) {
-    EmitIntValue(EXIDX_CANTUNWIND, 4, 0);
+    EmitIntValue(EXIDX_CANTUNWIND, 4);
   } else if (ExTab) {
     // Emit a reference to the unwind opcodes in the ".ARM.extab" section.
     const MCSymbolRefExpr *ExTabEntryRef =
       MCSymbolRefExpr::Create(ExTab,
                               MCSymbolRefExpr::VK_ARM_PREL31,
                               getContext());
-    EmitValue(ExTabEntryRef, 4, 0);
+    EmitValue(ExTabEntryRef, 4);
   } else {
     // For the __aeabi_unwind_cpp_pr0, we have to emit the unwind opcodes in
     // the second word of exception index table entry.  The size of the unwind
@@ -356,7 +355,7 @@ void ARMELFStreamer::EmitFnEnd() {
     assert(Opcodes.size() == 4u &&
            "Unwind opcode size for __aeabi_cpp_unwind_pr0 must be equal to 4");
     EmitBytes(StringRef(reinterpret_cast<const char*>(Opcodes.data()),
-                        Opcodes.size()), 0);
+                        Opcodes.size()));
   }
 
   // Switch to the section containing FnStart
@@ -377,13 +376,13 @@ void ARMELFStreamer::FlushPendingOffset() {
   }
 }
 
-void ARMELFStreamer::FlushUnwindOpcodes(bool AllowCompactModel0) {
+void ARMELFStreamer::FlushUnwindOpcodes(bool NoHandlerData) {
   // Emit the unwind opcode to restore $sp.
   if (UsedFP) {
-    const MCRegisterInfo &MRI = getContext().getRegisterInfo();
+    const MCRegisterInfo *MRI = getContext().getRegisterInfo();
     int64_t LastRegSaveSPOffset = SPOffset - PendingOffset;
     UnwindOpAsm.EmitSPOffset(LastRegSaveSPOffset - FPOffset);
-    UnwindOpAsm.EmitSetSP(MRI.getEncodingValue(FPReg));
+    UnwindOpAsm.EmitSetSP(MRI->getEncodingValue(FPReg));
   } else {
     FlushPendingOffset();
   }
@@ -394,7 +393,7 @@ void ARMELFStreamer::FlushUnwindOpcodes(bool AllowCompactModel0) {
   // For compact model 0, we have to emit the unwind opcodes in the .ARM.exidx
   // section.  Thus, we don't have to create an entry in the .ARM.extab
   // section.
-  if (AllowCompactModel0 && PersonalityIndex == AEABI_UNWIND_CPP_PR0)
+  if (NoHandlerData && PersonalityIndex == AEABI_UNWIND_CPP_PR0)
     return;
 
   // Switch to .ARM.extab section.
@@ -412,12 +411,22 @@ void ARMELFStreamer::FlushUnwindOpcodes(bool AllowCompactModel0) {
                               MCSymbolRefExpr::VK_ARM_PREL31,
                               getContext());
 
-    EmitValue(PersonalityRef, 4, 0);
+    EmitValue(PersonalityRef, 4);
   }
 
   // Emit unwind opcodes
   EmitBytes(StringRef(reinterpret_cast<const char *>(Opcodes.data()),
-                      Opcodes.size()), 0);
+                      Opcodes.size()));
+
+  // According to ARM EHABI section 9.2, if the __aeabi_unwind_cpp_pr1() or
+  // __aeabi_unwind_cpp_pr2() is used, then the handler data must be emitted
+  // after the unwind opcodes.  The handler data consists of several 32-bit
+  // words, and should be terminated by zero.
+  //
+  // In case that the .handlerdata directive is not specified by the
+  // programmer, we should emit zero to terminate the handler data.
+  if (NoHandlerData && !Personality)
+    EmitIntValue(0, 4);
 }
 
 void ARMELFStreamer::EmitHandlerData() {
@@ -458,9 +467,9 @@ void ARMELFStreamer::EmitRegSave(const SmallVectorImpl<unsigned> &RegList,
   // Collect the registers in the register list
   unsigned Count = 0;
   uint32_t Mask = 0;
-  const MCRegisterInfo &MRI = getContext().getRegisterInfo();
+  const MCRegisterInfo *MRI = getContext().getRegisterInfo();
   for (size_t i = 0; i < RegList.size(); ++i) {
-    unsigned Reg = MRI.getEncodingValue(RegList[i]);
+    unsigned Reg = MRI->getEncodingValue(RegList[i]);
     assert(Reg < (IsVector ? 32U : 16U) && "Register out of range");
     unsigned Bit = (1u << Reg);
     if ((Mask & Bit) == 0) {
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
index 2aa1010..a18d465 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
@@ -315,6 +315,8 @@ public:
                                           unsigned EncodedValue) const;
   unsigned NEONThumb2DupPostEncoder(const MCInst &MI,
                                     unsigned EncodedValue) const;
+  unsigned NEONThumb2V8PostEncoder(const MCInst &MI,
+                                   unsigned EncodedValue) const;
 
   unsigned VFPThumb2PostEncoder(const MCInst &MI,
                                 unsigned EncodedValue) const;
@@ -389,6 +391,17 @@ unsigned ARMMCCodeEmitter::NEONThumb2DupPostEncoder(const MCInst &MI,
   return EncodedValue;
 }
 
+/// Post-process encoded NEON v8 instructions, and rewrite them to Thumb2 form
+/// if we are in Thumb2.
+unsigned ARMMCCodeEmitter::NEONThumb2V8PostEncoder(const MCInst &MI,
+                                                 unsigned EncodedValue) const {
+  if (isThumb2()) {
+    EncodedValue |= 0xC000000; // Set bits 27-26
+  }
+
+  return EncodedValue;
+}
+
 /// VFPThumb2PostEncoder - Post-process encoded VFP instructions and rewrite
 /// them to their Thumb2 form if we are currently in Thumb2 mode.
 unsigned ARMMCCodeEmitter::
@@ -407,7 +420,7 @@ getMachineOpValue(const MCInst &MI, const MCOperand &MO,
                   SmallVectorImpl<MCFixup> &Fixups) const {
   if (MO.isReg()) {
     unsigned Reg = MO.getReg();
-    unsigned RegNo = CTX.getRegisterInfo().getEncodingValue(Reg);
+    unsigned RegNo = CTX.getRegisterInfo()->getEncodingValue(Reg);
 
     // Q registers are encoded as 2x their register number.
     switch (Reg) {
@@ -436,7 +449,7 @@ EncodeAddrModeOpValues(const MCInst &MI, unsigned OpIdx, unsigned &Reg,
   const MCOperand &MO  = MI.getOperand(OpIdx);
   const MCOperand &MO1 = MI.getOperand(OpIdx + 1);
 
-  Reg = CTX.getRegisterInfo().getEncodingValue(MO.getReg());
+  Reg = CTX.getRegisterInfo()->getEncodingValue(MO.getReg());
 
   int32_t SImm = MO1.getImm();
   bool isAdd = true;
@@ -724,8 +737,8 @@ getThumbAddrModeRegRegOpValue(const MCInst &MI, unsigned OpIdx,
   //   {2-0} = Rn
   const MCOperand &MO1 = MI.getOperand(OpIdx);
   const MCOperand &MO2 = MI.getOperand(OpIdx + 1);
-  unsigned Rn = CTX.getRegisterInfo().getEncodingValue(MO1.getReg());
-  unsigned Rm = CTX.getRegisterInfo().getEncodingValue(MO2.getReg());
+  unsigned Rn = CTX.getRegisterInfo()->getEncodingValue(MO1.getReg());
+  unsigned Rm = CTX.getRegisterInfo()->getEncodingValue(MO2.getReg());
   return (Rm << 3) | Rn;
 }
 
@@ -741,12 +754,12 @@ getAddrModeImm12OpValue(const MCInst &MI, unsigned OpIdx,
   // If The first operand isn't a register, we have a label reference.
   const MCOperand &MO = MI.getOperand(OpIdx);
   if (!MO.isReg()) {
-    Reg = CTX.getRegisterInfo().getEncodingValue(ARM::PC);   // Rn is PC.
+    Reg = CTX.getRegisterInfo()->getEncodingValue(ARM::PC);   // Rn is PC.
     Imm12 = 0;
-    isAdd = false ; // 'U' bit is set as part of the fixup.
 
     if (MO.isExpr()) {
       const MCExpr *Expr = MO.getExpr();
+      isAdd = false ; // 'U' bit is set as part of the fixup.
 
       MCFixupKind Kind;
       if (isThumb2())
@@ -821,7 +834,7 @@ getT2AddrModeImm8s4OpValue(const MCInst &MI, unsigned OpIdx,
   // If The first operand isn't a register, we have a label reference.
   const MCOperand &MO = MI.getOperand(OpIdx);
   if (!MO.isReg()) {
-    Reg = CTX.getRegisterInfo().getEncodingValue(ARM::PC);   // Rn is PC.
+    Reg = CTX.getRegisterInfo()->getEncodingValue(ARM::PC);   // Rn is PC.
     Imm8 = 0;
     isAdd = false ; // 'U' bit is set as part of the fixup.
 
@@ -857,7 +870,7 @@ getT2AddrModeImm0_1020s4OpValue(const MCInst &MI, unsigned OpIdx,
   // {7-0}  = imm8
   const MCOperand &MO = MI.getOperand(OpIdx);
   const MCOperand &MO1 = MI.getOperand(OpIdx + 1);
-  unsigned Reg = CTX.getRegisterInfo().getEncodingValue(MO.getReg());
+  unsigned Reg = CTX.getRegisterInfo()->getEncodingValue(MO.getReg());
   unsigned Imm8 = MO1.getImm();
   return (Reg << 8) | Imm8;
 }
@@ -940,8 +953,8 @@ getLdStSORegOpValue(const MCInst &MI, unsigned OpIdx,
   const MCOperand &MO = MI.getOperand(OpIdx);
   const MCOperand &MO1 = MI.getOperand(OpIdx+1);
   const MCOperand &MO2 = MI.getOperand(OpIdx+2);
-  unsigned Rn = CTX.getRegisterInfo().getEncodingValue(MO.getReg());
-  unsigned Rm = CTX.getRegisterInfo().getEncodingValue(MO1.getReg());
+  unsigned Rn = CTX.getRegisterInfo()->getEncodingValue(MO.getReg());
+  unsigned Rm = CTX.getRegisterInfo()->getEncodingValue(MO1.getReg());
   unsigned ShImm = ARM_AM::getAM2Offset(MO2.getImm());
   bool isAdd = ARM_AM::getAM2Op(MO2.getImm()) == ARM_AM::add;
   ARM_AM::ShiftOpc ShOp = ARM_AM::getAM2ShiftOpc(MO2.getImm());
@@ -975,7 +988,7 @@ getAddrMode2OpValue(const MCInst &MI, unsigned OpIdx,
   // {12}     isAdd
   // {11-0}   imm12/Rm
   const MCOperand &MO = MI.getOperand(OpIdx);
-  unsigned Rn = CTX.getRegisterInfo().getEncodingValue(MO.getReg());
+  unsigned Rn = CTX.getRegisterInfo()->getEncodingValue(MO.getReg());
   uint32_t Binary = getAddrMode2OffsetOpValue(MI, OpIdx + 1, Fixups);
   Binary |= Rn << 14;
   return Binary;
@@ -998,7 +1011,7 @@ getAddrMode2OffsetOpValue(const MCInst &MI, unsigned OpIdx,
     ARM_AM::ShiftOpc ShOp = ARM_AM::getAM2ShiftOpc(Imm);
     Binary <<= 7;                    // Shift amount is bits [11:7]
     Binary |= getShiftOp(ShOp) << 5; // Shift type is bits [6:5]
-    Binary |= CTX.getRegisterInfo().getEncodingValue(MO.getReg()); // Rm is bits [3:0]
+    Binary |= CTX.getRegisterInfo()->getEncodingValue(MO.getReg()); // Rm is bits [3:0]
   }
   return Binary | (isAdd << 12) | (isReg << 13);
 }
@@ -1011,7 +1024,7 @@ getPostIdxRegOpValue(const MCInst &MI, unsigned OpIdx,
   const MCOperand &MO = MI.getOperand(OpIdx);
   const MCOperand &MO1 = MI.getOperand(OpIdx+1);
   bool isAdd = MO1.getImm() != 0;
-  return CTX.getRegisterInfo().getEncodingValue(MO.getReg()) | (isAdd << 4);
+  return CTX.getRegisterInfo()->getEncodingValue(MO.getReg()) | (isAdd << 4);
 }
 
 uint32_t ARMMCCodeEmitter::
@@ -1029,7 +1042,7 @@ getAddrMode3OffsetOpValue(const MCInst &MI, unsigned OpIdx,
   uint32_t Imm8 = ARM_AM::getAM3Offset(Imm);
   // if reg +/- reg, Rm will be non-zero. Otherwise, we have reg +/- imm8
   if (!isImm)
-    Imm8 = CTX.getRegisterInfo().getEncodingValue(MO.getReg());
+    Imm8 = CTX.getRegisterInfo()->getEncodingValue(MO.getReg());
   return Imm8 | (isAdd << 8) | (isImm << 9);
 }
 
@@ -1047,7 +1060,7 @@ getAddrMode3OpValue(const MCInst &MI, unsigned OpIdx,
 
   // If The first operand isn't a register, we have a label reference.
   if (!MO.isReg()) {
-    unsigned Rn = CTX.getRegisterInfo().getEncodingValue(ARM::PC);   // Rn is PC.
+    unsigned Rn = CTX.getRegisterInfo()->getEncodingValue(ARM::PC);   // Rn is PC.
 
     assert(MO.isExpr() && "Unexpected machine operand type!");
     const MCExpr *Expr = MO.getExpr();
@@ -1057,14 +1070,14 @@ getAddrMode3OpValue(const MCInst &MI, unsigned OpIdx,
     ++MCNumCPRelocations;
     return (Rn << 9) | (1 << 13);
   }
-  unsigned Rn = CTX.getRegisterInfo().getEncodingValue(MO.getReg());
+  unsigned Rn = CTX.getRegisterInfo()->getEncodingValue(MO.getReg());
   unsigned Imm = MO2.getImm();
   bool isAdd = ARM_AM::getAM3Op(Imm) == ARM_AM::add;
   bool isImm = MO1.getReg() == 0;
   uint32_t Imm8 = ARM_AM::getAM3Offset(Imm);
   // if reg +/- reg, Rm will be non-zero. Otherwise, we have reg +/- imm8
   if (!isImm)
-    Imm8 = CTX.getRegisterInfo().getEncodingValue(MO1.getReg());
+    Imm8 = CTX.getRegisterInfo()->getEncodingValue(MO1.getReg());
   return (Rn << 9) | Imm8 | (isAdd << 8) | (isImm << 13);
 }
 
@@ -1092,7 +1105,7 @@ getAddrModeISOpValue(const MCInst &MI, unsigned OpIdx,
   //   {2-0} = Rn
   const MCOperand &MO = MI.getOperand(OpIdx);
   const MCOperand &MO1 = MI.getOperand(OpIdx + 1);
-  unsigned Rn = CTX.getRegisterInfo().getEncodingValue(MO.getReg());
+  unsigned Rn = CTX.getRegisterInfo()->getEncodingValue(MO.getReg());
   unsigned Imm5 = MO1.getImm();
   return ((Imm5 & 0x1f) << 3) | Rn;
 }
@@ -1119,7 +1132,7 @@ getAddrMode5OpValue(const MCInst &MI, unsigned OpIdx,
   // If The first operand isn't a register, we have a label reference.
   const MCOperand &MO = MI.getOperand(OpIdx);
   if (!MO.isReg()) {
-    Reg = CTX.getRegisterInfo().getEncodingValue(ARM::PC);   // Rn is PC.
+    Reg = CTX.getRegisterInfo()->getEncodingValue(ARM::PC);   // Rn is PC.
     Imm8 = 0;
     isAdd = false; // 'U' bit is handled as part of the fixup.
 
@@ -1165,7 +1178,7 @@ getSORegRegOpValue(const MCInst &MI, unsigned OpIdx,
   ARM_AM::ShiftOpc SOpc = ARM_AM::getSORegShOp(MO2.getImm());
 
   // Encode Rm.
-  unsigned Binary = CTX.getRegisterInfo().getEncodingValue(MO.getReg());
+  unsigned Binary = CTX.getRegisterInfo()->getEncodingValue(MO.getReg());
 
   // Encode the shift opcode.
   unsigned SBits = 0;
@@ -1190,7 +1203,7 @@ getSORegRegOpValue(const MCInst &MI, unsigned OpIdx,
   // Encode the shift operation Rs.
   // Encode Rs bit[11:8].
   assert(ARM_AM::getSORegOffset(MO2.getImm()) == 0);
-  return Binary | (CTX.getRegisterInfo().getEncodingValue(Rs) << ARMII::RegRsShift);
+  return Binary | (CTX.getRegisterInfo()->getEncodingValue(Rs) << ARMII::RegRsShift);
 }
 
 unsigned ARMMCCodeEmitter::
@@ -1209,7 +1222,7 @@ getSORegImmOpValue(const MCInst &MI, unsigned OpIdx,
   ARM_AM::ShiftOpc SOpc = ARM_AM::getSORegShOp(MO1.getImm());
 
   // Encode Rm.
-  unsigned Binary = CTX.getRegisterInfo().getEncodingValue(MO.getReg());
+  unsigned Binary = CTX.getRegisterInfo()->getEncodingValue(MO.getReg());
 
   // Encode the shift opcode.
   unsigned SBits = 0;
@@ -1248,9 +1261,9 @@ getT2AddrModeSORegOpValue(const MCInst &MI, unsigned OpNum,
 
   // Encoded as [Rn, Rm, imm].
   // FIXME: Needs fixup support.
-  unsigned Value = CTX.getRegisterInfo().getEncodingValue(MO1.getReg());
+  unsigned Value = CTX.getRegisterInfo()->getEncodingValue(MO1.getReg());
   Value <<= 4;
-  Value |= CTX.getRegisterInfo().getEncodingValue(MO2.getReg());
+  Value |= CTX.getRegisterInfo()->getEncodingValue(MO2.getReg());
   Value <<= 2;
   Value |= MO3.getImm();
 
@@ -1264,7 +1277,7 @@ getT2AddrModeImm8OpValue(const MCInst &MI, unsigned OpNum,
   const MCOperand &MO2 = MI.getOperand(OpNum+1);
 
   // FIXME: Needs fixup support.
-  unsigned Value = CTX.getRegisterInfo().getEncodingValue(MO1.getReg());
+  unsigned Value = CTX.getRegisterInfo()->getEncodingValue(MO1.getReg());
 
   // Even though the immediate is 8 bits long, we need 9 bits in order
   // to represent the (inverse of the) sign bit.
@@ -1326,7 +1339,7 @@ getT2SORegOpValue(const MCInst &MI, unsigned OpIdx,
   ARM_AM::ShiftOpc SOpc = ARM_AM::getSORegShOp(MO1.getImm());
 
   // Encode Rm.
-  unsigned Binary = CTX.getRegisterInfo().getEncodingValue(MO.getReg());
+  unsigned Binary = CTX.getRegisterInfo()->getEncodingValue(MO.getReg());
 
   // Encode the shift opcode.
   unsigned SBits = 0;
@@ -1382,7 +1395,7 @@ getRegisterListOpValue(const MCInst &MI, unsigned Op,
 
   if (SPRRegs || DPRRegs) {
     // VLDM/VSTM
-    unsigned RegNo = CTX.getRegisterInfo().getEncodingValue(Reg);
+    unsigned RegNo = CTX.getRegisterInfo()->getEncodingValue(Reg);
     unsigned NumRegs = (MI.getNumOperands() - Op) & 0xff;
     Binary |= (RegNo & 0x1f) << 8;
     if (SPRRegs)
@@ -1391,7 +1404,7 @@ getRegisterListOpValue(const MCInst &MI, unsigned Op,
       Binary |= NumRegs * 2;
   } else {
     for (unsigned I = Op, E = MI.getNumOperands(); I < E; ++I) {
-      unsigned RegNo = CTX.getRegisterInfo().getEncodingValue(MI.getOperand(I).getReg());
+      unsigned RegNo = CTX.getRegisterInfo()->getEncodingValue(MI.getOperand(I).getReg());
       Binary |= 1 << RegNo;
     }
   }
@@ -1407,7 +1420,7 @@ getAddrMode6AddressOpValue(const MCInst &MI, unsigned Op,
   const MCOperand &Reg = MI.getOperand(Op);
   const MCOperand &Imm = MI.getOperand(Op + 1);
 
-  unsigned RegNo = CTX.getRegisterInfo().getEncodingValue(Reg.getReg());
+  unsigned RegNo = CTX.getRegisterInfo()->getEncodingValue(Reg.getReg());
   unsigned Align = 0;
 
   switch (Imm.getImm()) {
@@ -1430,7 +1443,7 @@ getAddrMode6OneLane32AddressOpValue(const MCInst &MI, unsigned Op,
   const MCOperand &Reg = MI.getOperand(Op);
   const MCOperand &Imm = MI.getOperand(Op + 1);
 
-  unsigned RegNo = CTX.getRegisterInfo().getEncodingValue(Reg.getReg());
+  unsigned RegNo = CTX.getRegisterInfo()->getEncodingValue(Reg.getReg());
   unsigned Align = 0;
 
   switch (Imm.getImm()) {
@@ -1456,7 +1469,7 @@ getAddrMode6DupAddressOpValue(const MCInst &MI, unsigned Op,
   const MCOperand &Reg = MI.getOperand(Op);
   const MCOperand &Imm = MI.getOperand(Op + 1);
 
-  unsigned RegNo = CTX.getRegisterInfo().getEncodingValue(Reg.getReg());
+  unsigned RegNo = CTX.getRegisterInfo()->getEncodingValue(Reg.getReg());
   unsigned Align = 0;
 
   switch (Imm.getImm()) {
@@ -1475,7 +1488,7 @@ getAddrMode6OffsetOpValue(const MCInst &MI, unsigned Op,
                           SmallVectorImpl<MCFixup> &Fixups) const {
   const MCOperand &MO = MI.getOperand(Op);
   if (MO.getReg() == 0) return 0x0D;
-  return CTX.getRegisterInfo().getEncodingValue(MO.getReg());
+  return CTX.getRegisterInfo()->getEncodingValue(MO.getReg());
 }
 
 unsigned ARMMCCodeEmitter::
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
index 14fd03f..caa1949 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
@@ -59,7 +59,10 @@ std::string ARM_MC::ParseARMTriple(StringRef TT, StringRef CPU) {
   std::string ARMArchFeature;
   if (Idx) {
     unsigned SubVer = TT[Idx];
-    if (SubVer >= '7' && SubVer <= '9') {
+    if (SubVer == '8') {
+      // FIXME: Parse v8 features
+      ARMArchFeature = "+v8";
+    } else if (SubVer == '7') {
       if (Len >= Idx+2 && TT[Idx+1] == 'm') {
         isThumb = true;
         if (NoCPU)