aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--lib/Target/ARM/ARMInstrNEON.td91
-rw-r--r--lib/Target/ARM/AsmParser/ARMAsmParser.cpp101
-rw-r--r--lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp22
-rw-r--r--lib/Target/ARM/InstPrinter/ARMInstPrinter.h4
-rw-r--r--test/MC/ARM/neon-vld-encoding.s41
5 files changed, 253 insertions, 6 deletions
diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td
index 4040db9..eea25a6 100644
--- a/lib/Target/ARM/ARMInstrNEON.td
+++ b/lib/Target/ARM/ARMInstrNEON.td
@@ -171,6 +171,27 @@ def VecListTwoQAllLanes : RegisterOperand<DPR,
"printVectorListTwoSpacedAllLanes"> {
let ParserMatchClass = VecListTwoQAllLanesAsmOperand;
}
+// Register list of three D registers, with "all lanes" subscripting.
+def VecListThreeDAllLanesAsmOperand : AsmOperandClass {
+ let Name = "VecListThreeDAllLanes";
+ let ParserMethod = "parseVectorList";
+ let RenderMethod = "addVecListOperands";
+}
+def VecListThreeDAllLanes : RegisterOperand<DPR,
+ "printVectorListThreeAllLanes"> {
+ let ParserMatchClass = VecListThreeDAllLanesAsmOperand;
+}
+// Register list of three D registers spaced by 2 (three sequential Q regs).
+def VecListThreeQAllLanesAsmOperand : AsmOperandClass {
+ let Name = "VecListThreeQAllLanes";
+ let ParserMethod = "parseVectorList";
+ let RenderMethod = "addVecListOperands";
+}
+def VecListThreeQAllLanes : RegisterOperand<DPR,
+ "printVectorListThreeSpacedAllLanes"> {
+ let ParserMatchClass = VecListThreeQAllLanesAsmOperand;
+}
+
// Register list of one D register, with byte lane subscripting.
def VecListOneDByteIndexAsmOperand : AsmOperandClass {
@@ -1433,9 +1454,9 @@ def VLD3DUPd16Pseudo : VLDQQPseudo<IIC_VLD3dup>;
def VLD3DUPd32Pseudo : VLDQQPseudo<IIC_VLD3dup>;
// ...with double-spaced registers (not used for codegen):
-def VLD3DUPd8x2 : VLD3DUP<{0,0,1,?}, "8">;
-def VLD3DUPd16x2 : VLD3DUP<{0,1,1,?}, "16">;
-def VLD3DUPd32x2 : VLD3DUP<{1,0,1,?}, "32">;
+def VLD3DUPq8 : VLD3DUP<{0,0,1,?}, "8">;
+def VLD3DUPq16 : VLD3DUP<{0,1,1,?}, "16">;
+def VLD3DUPq32 : VLD3DUP<{1,0,1,?}, "32">;
// ...with address register writeback:
class VLD3DUPWB<bits<4> op7_4, string Dt>
@@ -1451,9 +1472,9 @@ def VLD3DUPd8_UPD : VLD3DUPWB<{0,0,0,0}, "8">;
def VLD3DUPd16_UPD : VLD3DUPWB<{0,1,0,?}, "16">;
def VLD3DUPd32_UPD : VLD3DUPWB<{1,0,0,?}, "32">;
-def VLD3DUPd8x2_UPD : VLD3DUPWB<{0,0,1,0}, "8">;
-def VLD3DUPd16x2_UPD : VLD3DUPWB<{0,1,1,?}, "16">;
-def VLD3DUPd32x2_UPD : VLD3DUPWB<{1,0,1,?}, "32">;
+def VLD3DUPq8_UPD : VLD3DUPWB<{0,0,1,0}, "8">;
+def VLD3DUPq16_UPD : VLD3DUPWB<{0,1,1,?}, "16">;
+def VLD3DUPq32_UPD : VLD3DUPWB<{1,0,1,?}, "32">;
def VLD3DUPd8Pseudo_UPD : VLDQQWBPseudo<IIC_VLD3dupu>;
def VLD3DUPd16Pseudo_UPD : VLDQQWBPseudo<IIC_VLD3dupu>;
@@ -6036,6 +6057,64 @@ def VST2LNqWB_register_Asm_32 :
(ins VecListTwoQWordIndexed:$list, addrmode6:$addr,
rGPR:$Rm, pred:$p)>;
+// VLD3 all-lanes pseudo-instructions. These need special handling for
+// the lane index that an InstAlias can't handle, so we use these instead.
+def VLD3DUPdAsm_8 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr",
+ (ins VecListThreeDAllLanes:$list, addrmode6:$addr, pred:$p)>;
+def VLD3DUPdAsm_16 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr",
+ (ins VecListThreeDAllLanes:$list, addrmode6:$addr, pred:$p)>;
+def VLD3DUPdAsm_32 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr",
+ (ins VecListThreeDAllLanes:$list, addrmode6:$addr, pred:$p)>;
+def VLD3DUPqAsm_8 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr",
+ (ins VecListThreeQAllLanes:$list, addrmode6:$addr, pred:$p)>;
+def VLD3DUPqAsm_16 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr",
+ (ins VecListThreeQAllLanes:$list, addrmode6:$addr, pred:$p)>;
+def VLD3DUPqAsm_32 : NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr",
+ (ins VecListThreeQAllLanes:$list, addrmode6:$addr, pred:$p)>;
+
+def VLD3DUPdWB_fixed_Asm_8 :
+ NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr!",
+ (ins VecListThreeDAllLanes:$list, addrmode6:$addr, pred:$p)>;
+def VLD3DUPdWB_fixed_Asm_16 :
+ NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr!",
+ (ins VecListThreeDAllLanes:$list, addrmode6:$addr, pred:$p)>;
+def VLD3DUPdWB_fixed_Asm_32 :
+ NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr!",
+ (ins VecListThreeDAllLanes:$list, addrmode6:$addr, pred:$p)>;
+def VLD3DUPqWB_fixed_Asm_8 :
+ NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr!",
+ (ins VecListThreeQAllLanes:$list, addrmode6:$addr, pred:$p)>;
+def VLD3DUPqWB_fixed_Asm_16 :
+ NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr!",
+ (ins VecListThreeQAllLanes:$list, addrmode6:$addr, pred:$p)>;
+def VLD3DUPqWB_fixed_Asm_32 :
+ NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr!",
+ (ins VecListThreeQAllLanes:$list, addrmode6:$addr, pred:$p)>;
+def VLD3DUPdWB_register_Asm_8 :
+ NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr, $Rm",
+ (ins VecListThreeDAllLanes:$list, addrmode6:$addr,
+ rGPR:$Rm, pred:$p)>;
+def VLD3DUPdWB_register_Asm_16 :
+ NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr, $Rm",
+ (ins VecListThreeDAllLanes:$list, addrmode6:$addr,
+ rGPR:$Rm, pred:$p)>;
+def VLD3DUPdWB_register_Asm_32 :
+ NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr, $Rm",
+ (ins VecListThreeDAllLanes:$list, addrmode6:$addr,
+ rGPR:$Rm, pred:$p)>;
+def VLD3DUPqWB_register_Asm_8 :
+ NEONDataTypeAsmPseudoInst<"vld3${p}", ".8", "$list, $addr, $Rm",
+ (ins VecListThreeQAllLanes:$list, addrmode6:$addr,
+ rGPR:$Rm, pred:$p)>;
+def VLD3DUPqWB_register_Asm_16 :
+ NEONDataTypeAsmPseudoInst<"vld3${p}", ".16", "$list, $addr, $Rm",
+ (ins VecListThreeQAllLanes:$list, addrmode6:$addr,
+ rGPR:$Rm, pred:$p)>;
+def VLD3DUPqWB_register_Asm_32 :
+ NEONDataTypeAsmPseudoInst<"vld3${p}", ".32", "$list, $addr, $Rm",
+ (ins VecListThreeQAllLanes:$list, addrmode6:$addr,
+ rGPR:$Rm, pred:$p)>;
+
// VLD3 single-lane pseudo-instructions. These need special handling for
// the lane index that an InstAlias can't handle, so we use these instead.
diff --git a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
index a3657db..20b2e85 100644
--- a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
+++ b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@@ -1132,6 +1132,16 @@ public:
return VectorList.Count == 2;
}
+ bool isVecListThreeDAllLanes() const {
+ if (!isSingleSpacedVectorAllLanes()) return false;
+ return VectorList.Count == 3;
+ }
+
+ bool isVecListThreeQAllLanes() const {
+ if (!isDoubleSpacedVectorAllLanes()) return false;
+ return VectorList.Count == 3;
+ }
+
bool isSingleSpacedVectorIndexed() const {
return Kind == k_VectorListIndexed && !VectorList.isDoubleSpaced;
}
@@ -5343,6 +5353,26 @@ static unsigned getRealVLDOpcode(unsigned Opc, unsigned &Spacing) {
case ARM::VLD2LNqAsm_16: Spacing = 2; return ARM::VLD2LNq16;
case ARM::VLD2LNqAsm_32: Spacing = 2; return ARM::VLD2LNq32;
+ // VLD3DUP
+ case ARM::VLD3DUPdWB_fixed_Asm_8: Spacing = 1; return ARM::VLD3DUPd8_UPD;
+ case ARM::VLD3DUPdWB_fixed_Asm_16: Spacing = 1; return ARM::VLD3DUPd16_UPD;
+ case ARM::VLD3DUPdWB_fixed_Asm_32: Spacing = 1; return ARM::VLD3DUPd32_UPD;
+ case ARM::VLD3DUPqWB_fixed_Asm_8: Spacing = 1; return ARM::VLD3DUPq8_UPD;
+ case ARM::VLD3DUPqWB_fixed_Asm_16: Spacing = 1; return ARM::VLD3DUPq16_UPD;
+ case ARM::VLD3DUPqWB_fixed_Asm_32: Spacing = 2; return ARM::VLD3DUPq32_UPD;
+ case ARM::VLD3DUPdWB_register_Asm_8: Spacing = 1; return ARM::VLD3DUPd8_UPD;
+ case ARM::VLD3DUPdWB_register_Asm_16: Spacing = 1; return ARM::VLD3DUPd16_UPD;
+ case ARM::VLD3DUPdWB_register_Asm_32: Spacing = 1; return ARM::VLD3DUPd32_UPD;
+ case ARM::VLD3DUPqWB_register_Asm_8: Spacing = 2; return ARM::VLD3DUPq8_UPD;
+ case ARM::VLD3DUPqWB_register_Asm_16: Spacing = 2; return ARM::VLD3DUPq16_UPD;
+ case ARM::VLD3DUPqWB_register_Asm_32: Spacing = 2; return ARM::VLD3DUPq32_UPD;
+ case ARM::VLD3DUPdAsm_8: Spacing = 1; return ARM::VLD3DUPd8;
+ case ARM::VLD3DUPdAsm_16: Spacing = 1; return ARM::VLD3DUPd16;
+ case ARM::VLD3DUPdAsm_32: Spacing = 1; return ARM::VLD3DUPd32;
+ case ARM::VLD3DUPqAsm_8: Spacing = 2; return ARM::VLD3DUPq8;
+ case ARM::VLD3DUPqAsm_16: Spacing = 2; return ARM::VLD3DUPq16;
+ case ARM::VLD3DUPqAsm_32: Spacing = 2; return ARM::VLD3DUPq32;
+
// VLD3LN
case ARM::VLD3LNdWB_fixed_Asm_8: Spacing = 1; return ARM::VLD3LNd8_UPD;
case ARM::VLD3LNdWB_fixed_Asm_16: Spacing = 1; return ARM::VLD3LNd16_UPD;
@@ -6061,6 +6091,77 @@ processInstruction(MCInst &Inst,
return true;
}
+ // VLD3DUP single 3-element structure to all lanes instructions.
+ case ARM::VLD3DUPdAsm_8:
+ case ARM::VLD3DUPdAsm_16:
+ case ARM::VLD3DUPdAsm_32:
+ case ARM::VLD3DUPqAsm_8:
+ case ARM::VLD3DUPqAsm_16:
+ case ARM::VLD3DUPqAsm_32: {
+ MCInst TmpInst;
+ unsigned Spacing;
+ TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing));
+ TmpInst.addOperand(Inst.getOperand(0)); // Vd
+ TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+ Spacing));
+ TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+ Spacing * 2));
+ TmpInst.addOperand(Inst.getOperand(1)); // Rn
+ TmpInst.addOperand(Inst.getOperand(2)); // alignment
+ TmpInst.addOperand(Inst.getOperand(3)); // CondCode
+ TmpInst.addOperand(Inst.getOperand(4));
+ Inst = TmpInst;
+ return true;
+ }
+
+ case ARM::VLD3DUPdWB_fixed_Asm_8:
+ case ARM::VLD3DUPdWB_fixed_Asm_16:
+ case ARM::VLD3DUPdWB_fixed_Asm_32:
+ case ARM::VLD3DUPqWB_fixed_Asm_8:
+ case ARM::VLD3DUPqWB_fixed_Asm_16:
+ case ARM::VLD3DUPqWB_fixed_Asm_32: {
+ MCInst TmpInst;
+ unsigned Spacing;
+ TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing));
+ TmpInst.addOperand(Inst.getOperand(0)); // Vd
+ TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+ Spacing));
+ TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+ Spacing * 2));
+ TmpInst.addOperand(Inst.getOperand(1)); // Rn
+ TmpInst.addOperand(Inst.getOperand(1)); // Rn_wb == tied Rn
+ TmpInst.addOperand(Inst.getOperand(2)); // alignment
+ TmpInst.addOperand(MCOperand::CreateReg(0)); // Rm
+ TmpInst.addOperand(Inst.getOperand(3)); // CondCode
+ TmpInst.addOperand(Inst.getOperand(4));
+ Inst = TmpInst;
+ return true;
+ }
+
+ case ARM::VLD3DUPdWB_register_Asm_8:
+ case ARM::VLD3DUPdWB_register_Asm_16:
+ case ARM::VLD3DUPdWB_register_Asm_32:
+ case ARM::VLD3DUPqWB_register_Asm_8:
+ case ARM::VLD3DUPqWB_register_Asm_16:
+ case ARM::VLD3DUPqWB_register_Asm_32: {
+ MCInst TmpInst;
+ unsigned Spacing;
+ TmpInst.setOpcode(getRealVLDOpcode(Inst.getOpcode(), Spacing));
+ TmpInst.addOperand(Inst.getOperand(0)); // Vd
+ TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+ Spacing));
+ TmpInst.addOperand(MCOperand::CreateReg(Inst.getOperand(0).getReg() +
+ Spacing * 2));
+ TmpInst.addOperand(Inst.getOperand(1)); // Rn
+ TmpInst.addOperand(Inst.getOperand(1)); // Rn_wb == tied Rn
+ TmpInst.addOperand(Inst.getOperand(2)); // alignment
+ TmpInst.addOperand(Inst.getOperand(3)); // Rm
+ TmpInst.addOperand(Inst.getOperand(4)); // CondCode
+ TmpInst.addOperand(Inst.getOperand(5));
+ Inst = TmpInst;
+ return true;
+ }
+
// VLD3 multiple 3-element structure instructions.
case ARM::VLD3dAsm_8:
case ARM::VLD3dAsm_16:
diff --git a/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp b/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp
index 27d5de8..455a487 100644
--- a/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp
+++ b/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp
@@ -1067,6 +1067,17 @@ void ARMInstPrinter::printVectorListTwoAllLanes(const MCInst *MI,
<< getRegisterName(MI->getOperand(OpNum).getReg() + 1) << "[]}";
}
+void ARMInstPrinter::printVectorListThreeAllLanes(const MCInst *MI,
+ unsigned OpNum,
+ raw_ostream &O) {
+ // Normally, it's not safe to use register enum values directly with
+ // addition to get the next register, but for VFP registers, the
+ // sort order is guaranteed because they're all of the form D<n>.
+ O << "{" << getRegisterName(MI->getOperand(OpNum).getReg()) << "[], "
+ << getRegisterName(MI->getOperand(OpNum).getReg() + 1) << "[], "
+ << getRegisterName(MI->getOperand(OpNum).getReg() + 2) << "[]}";
+}
+
void ARMInstPrinter::printVectorListTwoSpaced(const MCInst *MI, unsigned OpNum,
raw_ostream &O) {
// Normally, it's not safe to use register enum values directly with
@@ -1086,6 +1097,17 @@ void ARMInstPrinter::printVectorListTwoSpacedAllLanes(const MCInst *MI,
<< getRegisterName(MI->getOperand(OpNum).getReg() + 2) << "[]}";
}
+void ARMInstPrinter::printVectorListThreeSpacedAllLanes(const MCInst *MI,
+ unsigned OpNum,
+ raw_ostream &O) {
+ // Normally, it's not safe to use register enum values directly with
+ // addition to get the next register, but for VFP registers, the
+ // sort order is guaranteed because they're all of the form D<n>.
+ O << "{" << getRegisterName(MI->getOperand(OpNum).getReg()) << "[], "
+ << getRegisterName(MI->getOperand(OpNum).getReg() + 2) << "[], "
+ << getRegisterName(MI->getOperand(OpNum).getReg() + 2) << "[]}";
+}
+
void ARMInstPrinter::printVectorListThreeSpaced(const MCInst *MI,
unsigned OpNum,
raw_ostream &O) {
diff --git a/lib/Target/ARM/InstPrinter/ARMInstPrinter.h b/lib/Target/ARM/InstPrinter/ARMInstPrinter.h
index 4af116c..ffcc79b 100644
--- a/lib/Target/ARM/InstPrinter/ARMInstPrinter.h
+++ b/lib/Target/ARM/InstPrinter/ARMInstPrinter.h
@@ -139,10 +139,14 @@ public:
raw_ostream &O);
void printVectorListTwoAllLanes(const MCInst *MI, unsigned OpNum,
raw_ostream &O);
+ void printVectorListThreeAllLanes(const MCInst *MI, unsigned OpNum,
+ raw_ostream &O);
void printVectorListTwoSpaced(const MCInst *MI, unsigned OpNum,
raw_ostream &O);
void printVectorListTwoSpacedAllLanes(const MCInst *MI, unsigned OpNum,
raw_ostream &O);
+ void printVectorListThreeSpacedAllLanes(const MCInst *MI, unsigned OpNum,
+ raw_ostream &O);
void printVectorListThreeSpaced(const MCInst *MI, unsigned OpNum,
raw_ostream &O);
void printVectorListFourSpaced(const MCInst *MI, unsigned OpNum,
diff --git a/test/MC/ARM/neon-vld-encoding.s b/test/MC/ARM/neon-vld-encoding.s
index ba9b218..de6b8fb 100644
--- a/test/MC/ARM/neon-vld-encoding.s
+++ b/test/MC/ARM/neon-vld-encoding.s
@@ -336,6 +336,47 @@
@ CHECK: vld3.32 {d5[0], d7[0], d9[0]}, [r4]! @ encoding: [0x4d,0x5a,0xa4,0xf4]
+ vld3.8 {d16[], d17[], d18[]}, [r1]
+ vld3.16 {d16[], d17[], d18[]}, [r2]
+ vld3.32 {d16[], d17[], d18[]}, [r3]
+ vld3.8 {d17[], d19[], d21[]}, [r7]
+ vld3.16 {d17[], d19[], d21[]}, [r7]
+ vld3.32 {d16[], d18[], d20[]}, [r8]
+
+ vld3.s8 {d16[], d17[], d18[]}, [r1]!
+ vld3.s16 {d16[], d17[], d18[]}, [r2]!
+ vld3.s32 {d16[], d17[], d18[]}, [r3]!
+ vld3.u8 {d17[], d19[], d21[]}, [r7]!
+ vld3.u16 {d17[], d19[], d21[]}, [r7]!
+ vld3.u32 {d16[], d18[], d20[]}, [r8]!
+
+ vld3.p8 {d16[], d17[], d18[]}, [r1], r8
+ vld3.p16 {d16[], d17[], d18[]}, [r2], r7
+ vld3.f32 {d16[], d17[], d18[]}, [r3], r5
+ vld3.i8 {d16[], d18[], d20[]}, [r6], r3
+ vld3.i16 {d16[], d18[], d20[]}, [r6], r3
+ vld3.i32 {d17[], d19[], d21[]}, [r9], r4
+
+@ CHECK: vld3.8 {d16[], d17[], d18[]}, [r1] @ encoding: [0x0f,0x0e,0xe1,0xf4]
+@ CHECK: vld3.16 {d16[], d17[], d18[]}, [r2] @ encoding: [0x4f,0x0e,0xe2,0xf4]
+@ CHECK: vld3.32 {d16[], d17[], d18[]}, [r3] @ encoding: [0x8f,0x0e,0xe3,0xf4]
+@ CHECK: vld3.8 {d17[], d19[], d21[]}, [r7] @ encoding: [0x2f,0x1e,0xe7,0xf4]
+@ CHECK: vld3.16 {d17[], d19[], d21[]}, [r7] @ encoding: [0x6f,0x1e,0xe7,0xf4]
+@ CHECK: vld3.32 {d16[], d18[], d20[]}, [r8] @ encoding: [0xaf,0x0e,0xe8,0xf4]
+@ CHECK: vld3.8 {d16[], d17[], d18[]}, [r1]! @ encoding: [0x0d,0x0e,0xe1,0xf4]
+@ CHECK: vld3.16 {d16[], d17[], d18[]}, [r2]! @ encoding: [0x4d,0x0e,0xe2,0xf4]
+@ CHECK: vld3.32 {d16[], d17[], d18[]}, [r3]! @ encoding: [0x8d,0x0e,0xe3,0xf4]
+@ CHECK: vld3.8 {d17[], d18[], d19[]}, [r7]! @ encoding: [0x2d,0x1e,0xe7,0xf4]
+@ CHECK: vld3.16 {d17[], d18[], d19[]}, [r7]! @ encoding: [0x6d,0x1e,0xe7,0xf4]
+@ CHECK: vld3.32 {d16[], d18[], d20[]}, [r8]! @ encoding: [0xad,0x0e,0xe8,0xf4]
+@ CHECK: vld3.8 {d16[], d17[], d18[]}, [r1], r8 @ encoding: [0x08,0x0e,0xe1,0xf4]
+@ CHECK: vld3.16 {d16[], d17[], d18[]}, [r2], r7 @ encoding: [0x47,0x0e,0xe2,0xf4]
+@ CHECK: vld3.32 {d16[], d17[], d18[]}, [r3], r5 @ encoding: [0x85,0x0e,0xe3,0xf4]
+@ CHECK: vld3.8 {d16[], d18[], d20[]}, [r6], r3 @ encoding: [0x23,0x0e,0xe6,0xf4]
+@ CHECK: vld3.16 {d16[], d18[], d20[]}, [r6], r3 @ encoding: [0x63,0x0e,0xe6,0xf4]
+@ CHECK: vld3.32 {d17[], d19[], d21[]}, [r9], r4 @ encoding: [0xa4,0x1e,0xe9,0xf4]
+
+
vld4.8 {d16[1], d17[1], d18[1], d19[1]}, [r1]
vld4.16 {d16[1], d17[1], d18[1], d19[1]}, [r2]
vld4.32 {d16[1], d17[1], d18[1], d19[1]}, [r3]