aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorRafael Espindola <rafael.espindola@gmail.com>2013-10-10 15:15:17 +0000
committerRafael Espindola <rafael.espindola@gmail.com>2013-10-10 15:15:17 +0000
commit812ddcc50f8bc3ec6ce115863ff2263815906aaf (patch)
tree09d78a3a26f09e84735e4195e566584f0996b91a
parentd622bef31d11a5a6429fe7fad557c9b111e96f69 (diff)
downloadexternal_llvm-812ddcc50f8bc3ec6ce115863ff2263815906aaf.zip
external_llvm-812ddcc50f8bc3ec6ce115863ff2263815906aaf.tar.gz
external_llvm-812ddcc50f8bc3ec6ce115863ff2263815906aaf.tar.bz2
Revert "Implement AArch64 vector load/store multiple N-element structure class SIMD(lselem). Including following 14 instructions: 4 ld1 insts: load multiple 1-element structure to sequential 1/2/3/4 registers. ld2/ld3/ld4: load multiple N-element structure to sequential N registers (N=2,3,4). 4 st1 insts: store multiple 1-element structure from sequential 1/2/3/4 registers. st2/st3/st4: store multiple N-element structure from sequential N registers (N = 2,3,4)."
This reverts commit r192352. It broke the build. git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@192354 91177308-0d34-0410-b5e6-96231b3b80d8
-rw-r--r--include/llvm/CodeGen/ValueTypes.h2
-rw-r--r--lib/Target/AArch64/AArch64ISelDAGToDAG.cpp298
-rw-r--r--lib/Target/AArch64/AArch64ISelLowering.cpp54
-rw-r--r--lib/Target/AArch64/AArch64ISelLowering.h4
-rw-r--r--lib/Target/AArch64/AArch64InstrFormats.td18
-rw-r--r--lib/Target/AArch64/AArch64InstrNEON.td126
-rw-r--r--lib/Target/AArch64/AArch64RegisterInfo.td101
-rw-r--r--lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp166
-rw-r--r--lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp53
-rw-r--r--lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp30
-rw-r--r--lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h3
-rw-r--r--lib/Target/AArch64/Utils/AArch64BaseInfo.h44
-rw-r--r--test/CodeGen/AArch64/neon-simd-ldst-multi-elem.ll1228
-rw-r--r--test/MC/AArch64/neon-diagnostics.s221
-rw-r--r--test/MC/AArch64/neon-simd-ldst-multi-elem.s463
15 files changed, 3 insertions, 2808 deletions
diff --git a/include/llvm/CodeGen/ValueTypes.h b/include/llvm/CodeGen/ValueTypes.h
index 79f3233..2e8f637 100644
--- a/include/llvm/CodeGen/ValueTypes.h
+++ b/include/llvm/CodeGen/ValueTypes.h
@@ -208,7 +208,7 @@ namespace llvm {
bool is64BitVector() const {
return (SimpleTy == MVT::v8i8 || SimpleTy == MVT::v4i16 ||
SimpleTy == MVT::v2i32 || SimpleTy == MVT::v1i64 ||
- SimpleTy == MVT::v1f64 || SimpleTy == MVT::v2f32);
+ SimpleTy == MVT::v2f32);
}
/// is128BitVector - Return true if this is a 128-bit vector type.
diff --git a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index 3b0dd64..a865564 100644
--- a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -109,23 +109,6 @@ public:
SDNode* Select(SDNode*);
private:
- /// Select NEON load intrinsics. NumVecs should be 1, 2, 3 or 4.
- SDNode *SelectVLD(SDNode *N, unsigned NumVecs, const uint16_t *Opcode);
-
- /// Select NEON store intrinsics. NumVecs should be 1, 2, 3 or 4.
- SDNode *SelectVST(SDNode *N, unsigned NumVecs, const uint16_t *Opcodes);
-
- // Form pairs of consecutive 64-bit/128-bit registers.
- SDNode *createDPairNode(SDValue V0, SDValue V1);
- SDNode *createQPairNode(SDValue V0, SDValue V1);
-
- // Form sequences of 3 consecutive 64-bit/128-bit registers.
- SDNode *createDTripleNode(SDValue V0, SDValue V1, SDValue V2);
- SDNode *createQTripleNode(SDValue V0, SDValue V1, SDValue V2);
-
- // Form sequences of 4 consecutive 64-bit/128-bit registers.
- SDNode *createDQuadNode(SDValue V0, SDValue V1, SDValue V2, SDValue V3);
- SDNode *createQQuadNode(SDValue V0, SDValue V1, SDValue V2, SDValue V3);
};
}
@@ -407,221 +390,6 @@ SDNode *AArch64DAGToDAGISel::SelectAtomic(SDNode *Node, unsigned Op8,
&Ops[0], Ops.size());
}
-SDNode *AArch64DAGToDAGISel::createDPairNode(SDValue V0, SDValue V1) {
- SDLoc dl(V0.getNode());
- SDValue RegClass =
- CurDAG->getTargetConstant(AArch64::DPairRegClassID, MVT::i32);
- SDValue SubReg0 = CurDAG->getTargetConstant(AArch64::dsub_0, MVT::i32);
- SDValue SubReg1 = CurDAG->getTargetConstant(AArch64::dsub_1, MVT::i32);
- const SDValue Ops[] = { RegClass, V0, SubReg0, V1, SubReg1 };
- return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::v2i64,
- Ops);
-}
-
-SDNode *AArch64DAGToDAGISel::createQPairNode(SDValue V0, SDValue V1) {
- SDLoc dl(V0.getNode());
- SDValue RegClass =
- CurDAG->getTargetConstant(AArch64::QPairRegClassID, MVT::i32);
- SDValue SubReg0 = CurDAG->getTargetConstant(AArch64::qsub_0, MVT::i32);
- SDValue SubReg1 = CurDAG->getTargetConstant(AArch64::qsub_1, MVT::i32);
- const SDValue Ops[] = { RegClass, V0, SubReg0, V1, SubReg1 };
- return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::v4i64,
- Ops);
-}
-
-SDNode *AArch64DAGToDAGISel::createDTripleNode(SDValue V0, SDValue V1,
- SDValue V2) {
- SDLoc dl(V0.getNode());
- SDValue RegClass =
- CurDAG->getTargetConstant(AArch64::DTripleRegClassID, MVT::i32);
- SDValue SubReg0 = CurDAG->getTargetConstant(AArch64::dsub_0, MVT::i32);
- SDValue SubReg1 = CurDAG->getTargetConstant(AArch64::dsub_1, MVT::i32);
- SDValue SubReg2 = CurDAG->getTargetConstant(AArch64::dsub_2, MVT::i32);
- const SDValue Ops[] = { RegClass, V0, SubReg0, V1, SubReg1, V2, SubReg2 };
- return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped,
- Ops);
-}
-
-SDNode *AArch64DAGToDAGISel::createQTripleNode(SDValue V0, SDValue V1,
- SDValue V2) {
- SDLoc dl(V0.getNode());
- SDValue RegClass =
- CurDAG->getTargetConstant(AArch64::QTripleRegClassID, MVT::i32);
- SDValue SubReg0 = CurDAG->getTargetConstant(AArch64::qsub_0, MVT::i32);
- SDValue SubReg1 = CurDAG->getTargetConstant(AArch64::qsub_1, MVT::i32);
- SDValue SubReg2 = CurDAG->getTargetConstant(AArch64::qsub_2, MVT::i32);
- const SDValue Ops[] = { RegClass, V0, SubReg0, V1, SubReg1, V2, SubReg2 };
- return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped,
- Ops);
-}
-
-SDNode *AArch64DAGToDAGISel::createDQuadNode(SDValue V0, SDValue V1, SDValue V2,
- SDValue V3) {
- SDLoc dl(V0.getNode());
- SDValue RegClass =
- CurDAG->getTargetConstant(AArch64::DQuadRegClassID, MVT::i32);
- SDValue SubReg0 = CurDAG->getTargetConstant(AArch64::dsub_0, MVT::i32);
- SDValue SubReg1 = CurDAG->getTargetConstant(AArch64::dsub_1, MVT::i32);
- SDValue SubReg2 = CurDAG->getTargetConstant(AArch64::dsub_2, MVT::i32);
- SDValue SubReg3 = CurDAG->getTargetConstant(AArch64::dsub_3, MVT::i32);
- const SDValue Ops[] = { RegClass, V0, SubReg0, V1, SubReg1, V2, SubReg2, V3,
- SubReg3 };
- return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::v4i64,
- Ops);
-}
-
-SDNode *AArch64DAGToDAGISel::createQQuadNode(SDValue V0, SDValue V1, SDValue V2,
- SDValue V3) {
- SDLoc dl(V0.getNode());
- SDValue RegClass =
- CurDAG->getTargetConstant(AArch64::QQuadRegClassID, MVT::i32);
- SDValue SubReg0 = CurDAG->getTargetConstant(AArch64::qsub_0, MVT::i32);
- SDValue SubReg1 = CurDAG->getTargetConstant(AArch64::qsub_1, MVT::i32);
- SDValue SubReg2 = CurDAG->getTargetConstant(AArch64::qsub_2, MVT::i32);
- SDValue SubReg3 = CurDAG->getTargetConstant(AArch64::qsub_3, MVT::i32);
- const SDValue Ops[] = { RegClass, V0, SubReg0, V1, SubReg1, V2, SubReg2, V3,
- SubReg3 };
- return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::v8i64,
- Ops);
-}
-
-SDNode *AArch64DAGToDAGISel::SelectVLD(SDNode *N, unsigned NumVecs,
- const uint16_t *Opcodes) {
- assert(NumVecs >= 1 && NumVecs <= 4 && "VLD NumVecs out-of-range");
-
- EVT VT = N->getValueType(0);
- unsigned OpcodeIndex;
- switch (VT.getSimpleVT().SimpleTy) {
- default: llvm_unreachable("unhandled vector load type");
- case MVT::v8i8: OpcodeIndex = 0; break;
- case MVT::v4i16: OpcodeIndex = 1; break;
- case MVT::v2f32:
- case MVT::v2i32: OpcodeIndex = 2; break;
- case MVT::v1f64:
- case MVT::v1i64: OpcodeIndex = 3; break;
- case MVT::v16i8: OpcodeIndex = 4; break;
- case MVT::v8f16:
- case MVT::v8i16: OpcodeIndex = 5; break;
- case MVT::v4f32:
- case MVT::v4i32: OpcodeIndex = 6; break;
- case MVT::v2f64:
- case MVT::v2i64: OpcodeIndex = 7; break;
- }
- unsigned Opc = Opcodes[OpcodeIndex];
-
- SmallVector<SDValue, 2> Ops;
- Ops.push_back(N->getOperand(2)); // Push back the Memory Address
- Ops.push_back(N->getOperand(0)); // Push back the Chain
-
- std::vector<EVT> ResTys;
- bool is64BitVector = VT.is64BitVector();
-
- if (NumVecs == 1)
- ResTys.push_back(VT);
- else if (NumVecs == 3)
- ResTys.push_back(MVT::Untyped);
- else {
- EVT ResTy = EVT::getVectorVT(*CurDAG->getContext(), MVT::i64,
- is64BitVector ? NumVecs : NumVecs * 2);
- ResTys.push_back(ResTy);
- }
-
- ResTys.push_back(MVT::Other); // Type of the Chain
- SDLoc dl(N);
- SDNode *VLd = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
-
- // Transfer memoperands.
- MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
- MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
- cast<MachineSDNode>(VLd)->setMemRefs(MemOp, MemOp + 1);
-
- if (NumVecs == 1)
- return VLd;
-
- // If NumVecs > 1, the return result is a super register containing 2-4
- // consecutive vector registers.
- SDValue SuperReg = SDValue(VLd, 0);
-
- unsigned Sub0 = is64BitVector ? AArch64::dsub_0 : AArch64::qsub_0;
- for (unsigned Vec = 0; Vec < NumVecs; ++Vec)
- ReplaceUses(SDValue(N, Vec),
- CurDAG->getTargetExtractSubreg(Sub0 + Vec, dl, VT, SuperReg));
- // Update users of the Chain
- ReplaceUses(SDValue(N, NumVecs), SDValue(VLd, 1));
-
- return NULL;
-}
-
-SDNode *AArch64DAGToDAGISel::SelectVST(SDNode *N, unsigned NumVecs,
- const uint16_t *Opcodes) {
- assert(NumVecs >= 1 && NumVecs <= 4 && "VST NumVecs out-of-range");
- SDLoc dl(N);
-
- MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
- MemOp[0] = cast<MemIntrinsicSDNode>(N)->getMemOperand();
-
- unsigned Vec0Idx = 3;
- EVT VT = N->getOperand(Vec0Idx).getValueType();
- unsigned OpcodeIndex;
- switch (VT.getSimpleVT().SimpleTy) {
- default: llvm_unreachable("unhandled vector store type");
- case MVT::v8i8: OpcodeIndex = 0; break;
- case MVT::v4i16: OpcodeIndex = 1; break;
- case MVT::v2f32:
- case MVT::v2i32: OpcodeIndex = 2; break;
- case MVT::v1f64:
- case MVT::v1i64: OpcodeIndex = 3; break;
- case MVT::v16i8: OpcodeIndex = 4; break;
- case MVT::v8f16:
- case MVT::v8i16: OpcodeIndex = 5; break;
- case MVT::v4f32:
- case MVT::v4i32: OpcodeIndex = 6; break;
- case MVT::v2f64:
- case MVT::v2i64: OpcodeIndex = 7; break;
- }
- unsigned Opc = Opcodes[OpcodeIndex];
-
- std::vector<EVT> ResTys;
- ResTys.push_back(MVT::Other); // Type for the Chain
-
- SmallVector<SDValue, 6> Ops;
- Ops.push_back(N->getOperand(2)); // Push back the Memory Address
-
- bool is64BitVector = VT.is64BitVector();
-
- SDValue V0 = N->getOperand(Vec0Idx + 0);
- SDValue SrcReg;
- if (NumVecs == 1)
- SrcReg = V0;
- else {
- SDValue V1 = N->getOperand(Vec0Idx + 1);
- if (NumVecs == 2)
- SrcReg = is64BitVector ? SDValue(createDPairNode(V0, V1), 0)
- : SDValue(createQPairNode(V0, V1), 0);
- else {
- SDValue V2 = N->getOperand(Vec0Idx + 2);
- if (NumVecs == 3)
- SrcReg = is64BitVector ? SDValue(createDTripleNode(V0, V1, V2), 0)
- : SDValue(createQTripleNode(V0, V1, V2), 0);
- else {
- SDValue V3 = N->getOperand(Vec0Idx + 3);
- SrcReg = is64BitVector ? SDValue(createDQuadNode(V0, V1, V2, V3), 0)
- : SDValue(createQQuadNode(V0, V1, V2, V3), 0);
- }
- }
- }
- Ops.push_back(SrcReg);
-
- // Push back the Chain
- Ops.push_back(N->getOperand(0));
-
- // Transfer memoperands.
- SDNode *VSt = CurDAG->getMachineNode(Opc, dl, ResTys, Ops);
- cast<MachineSDNode>(VSt)->setMemRefs(MemOp, MemOp + 1);
-
- return VSt;
-}
-
SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
// Dump information about the Node being selected
DEBUG(dbgs() << "Selecting: "; Node->dump(CurDAG); dbgs() << "\n");
@@ -768,72 +536,6 @@ SDNode *AArch64DAGToDAGISel::Select(SDNode *Node) {
Node = ResNode;
break;
}
- case ISD::INTRINSIC_VOID:
- case ISD::INTRINSIC_W_CHAIN: {
- unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
- switch (IntNo) {
- default:
- break;
-
- case Intrinsic::arm_neon_vld1: {
- static const uint16_t Opcodes[] = { AArch64::LD1_8B, AArch64::LD1_4H,
- AArch64::LD1_2S, AArch64::LD1_1D,
- AArch64::LD1_16B, AArch64::LD1_8H,
- AArch64::LD1_4S, AArch64::LD1_2D };
- return SelectVLD(Node, 1, Opcodes);
- }
- case Intrinsic::arm_neon_vld2: {
- static const uint16_t Opcodes[] = { AArch64::LD2_8B, AArch64::LD2_4H,
- AArch64::LD2_2S, AArch64::LD1_2V_1D,
- AArch64::LD2_16B, AArch64::LD2_8H,
- AArch64::LD2_4S, AArch64::LD2_2D };
- return SelectVLD(Node, 2, Opcodes);
- }
- case Intrinsic::arm_neon_vld3: {
- static const uint16_t Opcodes[] = { AArch64::LD3_8B, AArch64::LD3_4H,
- AArch64::LD3_2S, AArch64::LD1_3V_1D,
- AArch64::LD3_16B, AArch64::LD3_8H,
- AArch64::LD3_4S, AArch64::LD3_2D };
- return SelectVLD(Node, 3, Opcodes);
- }
- case Intrinsic::arm_neon_vld4: {
- static const uint16_t Opcodes[] = { AArch64::LD4_8B, AArch64::LD4_4H,
- AArch64::LD4_2S, AArch64::LD1_4V_1D,
- AArch64::LD4_16B, AArch64::LD4_8H,
- AArch64::LD4_4S, AArch64::LD4_2D };
- return SelectVLD(Node, 4, Opcodes);
- }
- case Intrinsic::arm_neon_vst1: {
- static const uint16_t Opcodes[] = { AArch64::ST1_8B, AArch64::ST1_4H,
- AArch64::ST1_2S, AArch64::ST1_1D,
- AArch64::ST1_16B, AArch64::ST1_8H,
- AArch64::ST1_4S, AArch64::ST1_2D };
- return SelectVST(Node, 1, Opcodes);
- }
- case Intrinsic::arm_neon_vst2: {
- static const uint16_t Opcodes[] = { AArch64::ST2_8B, AArch64::ST2_4H,
- AArch64::ST2_2S, AArch64::ST1_2V_1D,
- AArch64::ST2_16B, AArch64::ST2_8H,
- AArch64::ST2_4S, AArch64::ST2_2D };
- return SelectVST(Node, 2, Opcodes);
- }
- case Intrinsic::arm_neon_vst3: {
- static const uint16_t Opcodes[] = { AArch64::ST3_8B, AArch64::ST3_4H,
- AArch64::ST3_2S, AArch64::ST1_3V_1D,
- AArch64::ST3_16B, AArch64::ST3_8H,
- AArch64::ST3_4S, AArch64::ST3_2D };
- return SelectVST(Node, 3, Opcodes);
- }
- case Intrinsic::arm_neon_vst4: {
- static const uint16_t Opcodes[] = { AArch64::ST4_8B, AArch64::ST4_4H,
- AArch64::ST4_2S, AArch64::ST1_4V_1D,
- AArch64::ST4_16B, AArch64::ST4_8H,
- AArch64::ST4_4S, AArch64::ST4_2D };
- return SelectVST(Node, 4, Opcodes);
- }
- }
- break;
- }
default:
break; // Let generic code handle it
}
diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp
index d89213c..d70548a 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -3681,57 +3681,3 @@ AArch64TargetLowering::getRegForInlineAsmConstraint(
// constraint into a member of a register class.
return TargetLowering::getRegForInlineAsmConstraint(Constraint, VT);
}
-
-/// Represent NEON load and store intrinsics as MemIntrinsicNodes.
-/// The associated MachineMemOperands record the alignment specified
-/// in the intrinsic calls.
-bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
- const CallInst &I,
- unsigned Intrinsic) const {
- switch (Intrinsic) {
- case Intrinsic::arm_neon_vld1:
- case Intrinsic::arm_neon_vld2:
- case Intrinsic::arm_neon_vld3:
- case Intrinsic::arm_neon_vld4: {
- Info.opc = ISD::INTRINSIC_W_CHAIN;
- // Conservatively set memVT to the entire set of vectors loaded.
- uint64_t NumElts = getDataLayout()->getTypeAllocSize(I.getType()) / 8;
- Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
- Info.ptrVal = I.getArgOperand(0);
- Info.offset = 0;
- Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1);
- Info.align = cast<ConstantInt>(AlignArg)->getZExtValue();
- Info.vol = false; // volatile loads with NEON intrinsics not supported
- Info.readMem = true;
- Info.writeMem = false;
- return true;
- }
- case Intrinsic::arm_neon_vst1:
- case Intrinsic::arm_neon_vst2:
- case Intrinsic::arm_neon_vst3:
- case Intrinsic::arm_neon_vst4: {
- Info.opc = ISD::INTRINSIC_VOID;
- // Conservatively set memVT to the entire set of vectors stored.
- unsigned NumElts = 0;
- for (unsigned ArgI = 1, ArgE = I.getNumArgOperands(); ArgI < ArgE; ++ArgI) {
- Type *ArgTy = I.getArgOperand(ArgI)->getType();
- if (!ArgTy->isVectorTy())
- break;
- NumElts += getDataLayout()->getTypeAllocSize(ArgTy) / 8;
- }
- Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
- Info.ptrVal = I.getArgOperand(0);
- Info.offset = 0;
- Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1);
- Info.align = cast<ConstantInt>(AlignArg)->getZExtValue();
- Info.vol = false; // volatile stores with NEON intrinsics not supported
- Info.readMem = false;
- Info.writeMem = true;
- return true;
- }
- default:
- break;
- }
-
- return false;
-}
diff --git a/lib/Target/AArch64/AArch64ISelLowering.h b/lib/Target/AArch64/AArch64ISelLowering.h
index da7f623..3e309a9 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/lib/Target/AArch64/AArch64ISelLowering.h
@@ -281,10 +281,6 @@ public:
std::pair<unsigned, const TargetRegisterClass*>
getRegForInlineAsmConstraint(const std::string &Constraint, MVT VT) const;
-
- virtual bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I,
- unsigned Intrinsic) const LLVM_OVERRIDE;
-
private:
const InstrItineraryData *Itins;
diff --git a/lib/Target/AArch64/AArch64InstrFormats.td b/lib/Target/AArch64/AArch64InstrFormats.td
index ab4d083..5781578 100644
--- a/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/lib/Target/AArch64/AArch64InstrFormats.td
@@ -1194,23 +1194,5 @@ class NeonI_Scalar2SameMisc<bit u, bits<2> size, bits<5> opcode, dag outs, dag i
// Inherit Rd in 4-0
}
-// Format AdvSIMD vector load/store multiple N-element structure
-class NeonI_LdStMult<bit q, bit l, bits<4> opcode, bits<2> size,
- dag outs, dag ins, string asmstr,
- list<dag> patterns, InstrItinClass itin>
- : A64InstRtn<outs, ins, asmstr, patterns, itin>
-{
- let Inst{31} = 0b0;
- let Inst{30} = q;
- let Inst{29-23} = 0b0011000;
- let Inst{22} = l;
- let Inst{21-16} = 0b000000;
- let Inst{15-12} = opcode;
- let Inst{11-10} = size;
-
- // Inherit Rn in 9-5
- // Inherit Rt in 4-0
-}
-
}
diff --git a/lib/Target/AArch64/AArch64InstrNEON.td b/lib/Target/AArch64/AArch64InstrNEON.td
index 355de53..a9f6061 100644
--- a/lib/Target/AArch64/AArch64InstrNEON.td
+++ b/lib/Target/AArch64/AArch64InstrNEON.td
@@ -2982,132 +2982,6 @@ defm PMULL2vvv : NeonI_3VDL2_2Op_mull_v3<0b0, 0b1110, "pmull2",
// End of implementation for instruction class (3V Diff)
-// The followings are vector load/store multiple N-element structure
-// (class SIMD lselem).
-
-// ld1: load multiple 1-element structure to 1/2/3/4 registers.
-// ld2/ld3/ld4: load multiple N-element structure to N registers (N = 2, 3, 4).
-// The structure consists of a sequence of sets of N values.
-// The first element of the structure is placed in the first lane
-// of the first first vector, the second element in the first lane
-// of the second vector, and so on.
-// E.g. LD1_3V_2S will load 32-bit elements {A, B, C, D, E, F} sequentially into
-// the three 64-bit vectors list {BA, DC, FE}.
-// E.g. LD3_2S will load 32-bit elements {A, B, C, D, E, F} into the three
-// 64-bit vectors list {DA, EB, FC}.
-// Store instructions store multiple structure to N registers like load.
-
-
-class NeonI_LDVList<bit q, bits<4> opcode, bits<2> size,
- RegisterOperand VecList, string asmop>
- : NeonI_LdStMult<q, 1, opcode, size,
- (outs VecList:$Rt), (ins GPR64xsp:$Rn),
- asmop # "\t$Rt, [$Rn]",
- [],
- NoItinerary> {
- let mayLoad = 1;
- let neverHasSideEffects = 1;
-}
-
-multiclass LDVList_BHSD<bits<4> opcode, string List, string asmop> {
- def _8B : NeonI_LDVList<0, opcode, 0b00,
- !cast<RegisterOperand>(List # "8B_operand"), asmop>;
-
- def _4H : NeonI_LDVList<0, opcode, 0b01,
- !cast<RegisterOperand>(List # "4H_operand"), asmop>;
-
- def _2S : NeonI_LDVList<0, opcode, 0b10,
- !cast<RegisterOperand>(List # "2S_operand"), asmop>;
-
- def _16B : NeonI_LDVList<1, opcode, 0b00,
- !cast<RegisterOperand>(List # "16B_operand"), asmop>;
-
- def _8H : NeonI_LDVList<1, opcode, 0b01,
- !cast<RegisterOperand>(List # "8H_operand"), asmop>;
-
- def _4S : NeonI_LDVList<1, opcode, 0b10,
- !cast<RegisterOperand>(List # "4S_operand"), asmop>;
-
- def _2D : NeonI_LDVList<1, opcode, 0b11,
- !cast<RegisterOperand>(List # "2D_operand"), asmop>;
-}
-
-// Load multiple N-element structure to N consecutive registers (N = 1,2,3,4)
-defm LD1 : LDVList_BHSD<0b0111, "VOne", "ld1">;
-def LD1_1D : NeonI_LDVList<0, 0b0111, 0b11, VOne1D_operand, "ld1">;
-
-defm LD2 : LDVList_BHSD<0b1000, "VPair", "ld2">;
-
-defm LD3 : LDVList_BHSD<0b0100, "VTriple", "ld3">;
-
-defm LD4 : LDVList_BHSD<0b0000, "VQuad", "ld4">;
-
-// Load multiple 1-element structure to N consecutive registers (N = 2,3,4)
-defm LD1_2V : LDVList_BHSD<0b1010, "VPair", "ld1">;
-def LD1_2V_1D : NeonI_LDVList<0, 0b1010, 0b11, VPair1D_operand, "ld1">;
-
-defm LD1_3V : LDVList_BHSD<0b0110, "VTriple", "ld1">;
-def LD1_3V_1D : NeonI_LDVList<0, 0b0110, 0b11, VTriple1D_operand, "ld1">;
-
-defm LD1_4V : LDVList_BHSD<0b0010, "VQuad", "ld1">;
-def LD1_4V_1D : NeonI_LDVList<0, 0b0010, 0b11, VQuad1D_operand, "ld1">;
-
-class NeonI_STVList<bit q, bits<4> opcode, bits<2> size,
- RegisterOperand VecList, string asmop>
- : NeonI_LdStMult<q, 0, opcode, size,
- (outs), (ins GPR64xsp:$Rn, VecList:$Rt),
- asmop # "\t$Rt, [$Rn]",
- [],
- NoItinerary> {
- let mayStore = 1;
- let neverHasSideEffects = 1;
-}
-
-multiclass STVList_BHSD<bits<4> opcode, string List, string asmop> {
- def _8B : NeonI_STVList<0, opcode, 0b00,
- !cast<RegisterOperand>(List # "8B_operand"), asmop>;
-
- def _4H : NeonI_STVList<0, opcode, 0b01,
- !cast<RegisterOperand>(List # "4H_operand"), asmop>;
-
- def _2S : NeonI_STVList<0, opcode, 0b10,
- !cast<RegisterOperand>(List # "2S_operand"), asmop>;
-
- def _16B : NeonI_STVList<1, opcode, 0b00,
- !cast<RegisterOperand>(List # "16B_operand"), asmop>;
-
- def _8H : NeonI_STVList<1, opcode, 0b01,
- !cast<RegisterOperand>(List # "8H_operand"), asmop>;
-
- def _4S : NeonI_STVList<1, opcode, 0b10,
- !cast<RegisterOperand>(List # "4S_operand"), asmop>;
-
- def _2D : NeonI_STVList<1, opcode, 0b11,
- !cast<RegisterOperand>(List # "2D_operand"), asmop>;
-}
-
-// Store multiple N-element structures from N registers (N = 1,2,3,4)
-defm ST1 : STVList_BHSD<0b0111, "VOne", "st1">;
-def ST1_1D : NeonI_STVList<0, 0b0111, 0b11, VOne1D_operand, "st1">;
-
-defm ST2 : STVList_BHSD<0b1000, "VPair", "st2">;
-
-defm ST3 : STVList_BHSD<0b0100, "VTriple", "st3">;
-
-defm ST4 : STVList_BHSD<0b0000, "VQuad", "st4">;
-
-// Store multiple 1-element structures from N consecutive registers (N = 2,3,4)
-defm ST1_2V : STVList_BHSD<0b1010, "VPair", "st1">;
-def ST1_2V_1D : NeonI_STVList<0, 0b1010, 0b11, VPair1D_operand, "st1">;
-
-defm ST1_3V : STVList_BHSD<0b0110, "VTriple", "st1">;
-def ST1_3V_1D : NeonI_STVList<0, 0b0110, 0b11, VTriple1D_operand, "st1">;
-
-defm ST1_4V : STVList_BHSD<0b0010, "VQuad", "st1">;
-def ST1_4V_1D : NeonI_STVList<0, 0b0010, 0b11, VQuad1D_operand, "st1">;
-
-// End of vector load/store multiple N-element structure(class SIMD lselem)
-
// Scalar Arithmetic
class NeonI_Scalar3Same_D_size<bit u, bits<5> opcode, string asmop>
diff --git a/lib/Target/AArch64/AArch64RegisterInfo.td b/lib/Target/AArch64/AArch64RegisterInfo.td
index 5e2b196..b7a6acb 100644
--- a/lib/Target/AArch64/AArch64RegisterInfo.td
+++ b/lib/Target/AArch64/AArch64RegisterInfo.td
@@ -17,20 +17,6 @@ def sub_64 : SubRegIndex<64>;
def sub_32 : SubRegIndex<32>;
def sub_16 : SubRegIndex<16>;
def sub_8 : SubRegIndex<8>;
-
-// Note: Code depends on these having consecutive numbers.
-def qqsub : SubRegIndex<256, 256>;
-
-def qsub_0 : SubRegIndex<128>;
-def qsub_1 : SubRegIndex<128, 128>;
-def qsub_2 : ComposedSubRegIndex<qqsub, qsub_0>;
-def qsub_3 : ComposedSubRegIndex<qqsub, qsub_1>;
-
-def dsub_0 : SubRegIndex<64>;
-def dsub_1 : SubRegIndex<64, 64>;
-def dsub_2 : ComposedSubRegIndex<qsub_1, dsub_0>;
-def dsub_3 : ComposedSubRegIndex<qsub_1, dsub_1>;
-def dsub_4 : ComposedSubRegIndex<qsub_2, dsub_0>;
}
// Registers are identified with 5-bit ID numbers.
@@ -202,90 +188,3 @@ def FlagClass : RegisterClass<"AArch64", [i32], 32, (add NZCV)> {
let CopyCost = -1;
let isAllocatable = 0;
}
-
-//===----------------------------------------------------------------------===//
-// Consecutive vector registers
-//===----------------------------------------------------------------------===//
-// 2 Consecutive 64-bit registers: D0_D1, D1_D2, ..., D30_D31
-def Tuples2D : RegisterTuples<[dsub_0, dsub_1],
- [(rotl FPR64, 0), (rotl FPR64, 1)]>;
-
-// 3 Consecutive 64-bit registers: D0_D1_D2, ..., D31_D0_D1
-def Tuples3D : RegisterTuples<[dsub_0, dsub_1, dsub_2],
- [(rotl FPR64, 0), (rotl FPR64, 1),
- (rotl FPR64, 2)]>;
-
-// 4 Consecutive 64-bit registers: D0_D1_D2_D3, ..., D31_D0_D1_D2
-def Tuples4D : RegisterTuples<[dsub_0, dsub_1, dsub_2, dsub_3],
- [(rotl FPR64, 0), (rotl FPR64, 1),
- (rotl FPR64, 2), (rotl FPR64, 3)]>;
-
-// 2 Consecutive 128-bit registers: Q0_Q1, Q1_Q2, ..., Q30_Q31
-def Tuples2Q : RegisterTuples<[qsub_0, qsub_1],
- [(rotl FPR128, 0), (rotl FPR128, 1)]>;
-
-// 3 Consecutive 128-bit registers: Q0_Q1_Q2, ..., Q31_Q0_Q1
-def Tuples3Q : RegisterTuples<[qsub_0, qsub_1, qsub_2],
- [(rotl FPR128, 0), (rotl FPR128, 1),
- (rotl FPR128, 2)]>;
-
-// 4 Consecutive 128-bit registers: Q0_Q1_Q2_Q3, ..., Q31_Q0_Q1_Q2
-def Tuples4Q : RegisterTuples<[qsub_0, qsub_1, qsub_2, qsub_3],
- [(rotl FPR128, 0), (rotl FPR128, 1),
- (rotl FPR128, 2), (rotl FPR128, 3)]>;
-
-// The followings are super register classes to model 2/3/4 consecutive
-// 64-bit/128-bit registers.
-
-def DPair : RegisterClass<"AArch64", [v2i64], 64, (add Tuples2D)>;
-
-def DTriple : RegisterClass<"AArch64", [untyped], 64, (add Tuples3D)> {
- let Size = 192; // 3 x 64 bits, we have no predefined type of that size.
-}
-
-def DQuad : RegisterClass<"AArch64", [v4i64], 64, (add Tuples4D)>;
-
-def QPair : RegisterClass<"AArch64", [v4i64], 128, (add Tuples2Q)>;
-
-def QTriple : RegisterClass<"AArch64", [untyped], 128, (add Tuples3Q)> {
- let Size = 384; // 3 x 128 bits, we have no predefined type of that size.
-}
-
-def QQuad : RegisterClass<"AArch64", [v8i64], 128, (add Tuples4Q)>;
-
-
-// The followings are vector list operands
-multiclass VectorList_operands<string PREFIX, string LAYOUT, int Count,
- RegisterClass RegList> {
- def _asmoperand : AsmOperandClass {
- let Name = PREFIX # LAYOUT # Count;
- let RenderMethod = "addVectorListOperands";
- let PredicateMethod =
- "isVectorList<A64Layout::_" # LAYOUT # ", " # Count # ">";
- let ParserMethod = "ParseVectorList";
- }
-
- def _operand : RegisterOperand<RegList,
- "printVectorList<A64Layout::_" # LAYOUT # ", " # Count # ">"> {
- let ParserMatchClass =
- !cast<AsmOperandClass>(PREFIX # LAYOUT # "_asmoperand");
- }
-}
-
-multiclass VectorList_BHSD<string PREFIX, int Count, RegisterClass DRegList,
- RegisterClass QRegList> {
- defm 8B : VectorList_operands<PREFIX, "8B", Count, DRegList>;
- defm 4H : VectorList_operands<PREFIX, "4H", Count, DRegList>;
- defm 2S : VectorList_operands<PREFIX, "2S", Count, DRegList>;
- defm 1D : VectorList_operands<PREFIX, "1D", Count, DRegList>;
- defm 16B : VectorList_operands<PREFIX, "16B", Count, QRegList>;
- defm 8H : VectorList_operands<PREFIX, "8H", Count, QRegList>;
- defm 4S : VectorList_operands<PREFIX, "4S", Count, QRegList>;
- defm 2D : VectorList_operands<PREFIX, "2D", Count, QRegList>;
-}
-
-// Vector list operand with 1/2/3/4 registers: VOne8B_operand,..., VQuad2D_operand
-defm VOne : VectorList_BHSD<"VOne", 1, FPR64, FPR128>;
-defm VPair : VectorList_BHSD<"VPair", 2, DPair, QPair>;
-defm VTriple : VectorList_BHSD<"VTriple", 3, DTriple, QTriple>;
-defm VQuad : VectorList_BHSD<"VQuad", 4, DQuad, QQuad>; \ No newline at end of file
diff --git a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
index 127c7ec..51638d9 100644
--- a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
+++ b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
@@ -127,11 +127,6 @@ public:
OperandMatchResultTy
ParseSysRegOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands);
- bool TryParseVector(uint32_t &RegNum, SMLoc &RegEndLoc, StringRef &Layout,
- SMLoc &LayoutLoc);
-
- OperandMatchResultTy ParseVectorList(SmallVectorImpl<MCParsedAsmOperand *> &);
-
bool validateInstruction(MCInst &Inst,
const SmallVectorImpl<MCParsedAsmOperand*> &Operands);
@@ -159,7 +154,6 @@ private:
k_Immediate, // Including expressions referencing symbols
k_Register,
k_ShiftExtend,
- k_VectorList, // A sequential list of 1 to 4 registers.
k_SysReg, // The register operand of MRS and MSR instructions
k_Token, // The mnemonic; other raw tokens the auto-generated
k_WrappedRegister // Load/store exclusive permit a wrapped register.
@@ -195,13 +189,6 @@ private:
bool ImplicitAmount;
};
- // A vector register list is a sequential list of 1 to 4 registers.
- struct VectorListOp {
- unsigned RegNum;
- unsigned Count;
- A64Layout::VectorLayout Layout;
- };
-
struct SysRegOp {
const char *Data;
unsigned Length;
@@ -219,7 +206,6 @@ private:
struct ImmOp Imm;
struct RegOp Reg;
struct ShiftExtendOp ShiftExtend;
- struct VectorListOp VectorList;
struct SysRegOp SysReg;
struct TokOp Tok;
};
@@ -731,12 +717,6 @@ public:
return ShiftExtend.Amount == 8 || ShiftExtend.Amount == 16;
}
- template <A64Layout::VectorLayout Layout, unsigned Count>
- bool isVectorList() const {
- return Kind == k_VectorList && VectorList.Layout == Layout &&
- VectorList.Count == Count;
- }
-
template <int MemSize> bool isSImm7Scaled() const {
if (!isImm())
return false;
@@ -857,18 +837,6 @@ public:
return Op;
}
- static AArch64Operand *CreateVectorList(unsigned RegNum, unsigned Count,
- A64Layout::VectorLayout Layout,
- SMLoc S, SMLoc E) {
- AArch64Operand *Op = new AArch64Operand(k_VectorList, S, E);
- Op->VectorList.RegNum = RegNum;
- Op->VectorList.Count = Count;
- Op->VectorList.Layout = Layout;
- Op->StartLoc = S;
- Op->EndLoc = E;
- return Op;
- }
-
static AArch64Operand *CreateToken(StringRef Str, SMLoc S) {
AArch64Operand *Op = new AArch64Operand(k_Token, S, S);
Op->Tok.Data = Str.data();
@@ -1216,11 +1184,6 @@ public:
}
Inst.addOperand(MCOperand::CreateImm(Imm));
}
-
- void addVectorListOperands(MCInst &Inst, unsigned N) const {
- assert(N == 1 && "Invalid number of operands!");
- Inst.addOperand(MCOperand::CreateReg(VectorList.RegNum));
- }
};
} // end anonymous namespace.
@@ -1260,6 +1223,7 @@ AArch64AsmParser::ParseOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
else
return MatchOperand_Success;
}
+
// ... or it might be a symbolish thing
}
// Fall through
@@ -1303,7 +1267,7 @@ AArch64AsmParser::ParseOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
return ParseOperand(Operands, Mnemonic);
}
// The following will likely be useful later, but not in very early cases
- case AsmToken::LCurly: // SIMD vector list is not parsed here
+ case AsmToken::LCurly: // Weird SIMD lists
llvm_unreachable("Don't know how to deal with '{' in operand");
return MatchOperand_ParseFail;
}
@@ -1926,132 +1890,6 @@ AArch64AsmParser::ParseShiftExtend(
return MatchOperand_Success;
}
-/// Try to parse a vector register token, If it is a vector register,
-/// the token is eaten and return true. Otherwise return false.
-bool AArch64AsmParser::TryParseVector(uint32_t &RegNum, SMLoc &RegEndLoc,
- StringRef &Layout, SMLoc &LayoutLoc) {
- bool IsVector = true;
-
- if (!IdentifyRegister(RegNum, RegEndLoc, Layout, LayoutLoc))
- IsVector = false;
-
- if (!AArch64MCRegisterClasses[AArch64::FPR64RegClassID].contains(RegNum) &&
- !AArch64MCRegisterClasses[AArch64::FPR128RegClassID].contains(RegNum))
- IsVector = false;
-
- if (Layout.size() == 0)
- IsVector = false;
-
- if (!IsVector)
- Error(Parser.getTok().getLoc(), "expected vector type register");
-
- Parser.Lex(); // Eat this token.
- return IsVector;
-}
-
-
-// A vector list contains 1-4 consecutive registers.
-// Now there are two kinds of vector list when number of vector > 1:
-// (1) {Vn.layout, Vn+1.layout, ... , Vm.layout}
-// (2) {Vn.layout - Vm.layout}
-AArch64AsmParser::OperandMatchResultTy AArch64AsmParser::ParseVectorList(
- SmallVectorImpl<MCParsedAsmOperand *> &Operands) {
- if (Parser.getTok().isNot(AsmToken::LCurly)) {
- Error(Parser.getTok().getLoc(), "'{' expected");
- return MatchOperand_ParseFail;
- }
- SMLoc SLoc = Parser.getTok().getLoc();
- Parser.Lex(); // Eat '{' token.
-
- unsigned Reg, Count = 1;
- StringRef LayoutStr;
- SMLoc RegEndLoc, LayoutLoc;
- if (!TryParseVector(Reg, RegEndLoc, LayoutStr, LayoutLoc))
- return MatchOperand_ParseFail;
-
- if (Parser.getTok().is(AsmToken::Minus)) {
- Parser.Lex(); // Eat the minus.
-
- unsigned Reg2;
- StringRef LayoutStr2;
- SMLoc RegEndLoc2, LayoutLoc2;
- SMLoc RegLoc2 = Parser.getTok().getLoc();
-
- if (!TryParseVector(Reg2, RegEndLoc2, LayoutStr2, LayoutLoc2))
- return MatchOperand_ParseFail;
- unsigned Space = (Reg < Reg2) ? (Reg2 - Reg) : (Reg2 + 32 - Reg);
-
- if (LayoutStr != LayoutStr2) {
- Error(LayoutLoc2, "expected the same vector layout");
- return MatchOperand_ParseFail;
- }
- if (Space == 0 || Space > 3) {
- Error(RegLoc2, "invalid number of vectors");
- return MatchOperand_ParseFail;
- }
-
- Count += Space;
- } else {
- unsigned LastReg = Reg;
- while (Parser.getTok().is(AsmToken::Comma)) {
- Parser.Lex(); // Eat the comma.
- unsigned Reg2;
- StringRef LayoutStr2;
- SMLoc RegEndLoc2, LayoutLoc2;
- SMLoc RegLoc2 = Parser.getTok().getLoc();
-
- if (!TryParseVector(Reg2, RegEndLoc2, LayoutStr2, LayoutLoc2))
- return MatchOperand_ParseFail;
- unsigned Space = (LastReg < Reg2) ? (Reg2 - LastReg)
- : (Reg2 + 32 - LastReg);
- Count++;
-
- // The space between two vectors should be 1. And they should have the same layout.
- // Total count shouldn't be great than 4
- if (Space != 1) {
- Error(RegLoc2, "invalid space between two vectors");
- return MatchOperand_ParseFail;
- }
- if (LayoutStr != LayoutStr2) {
- Error(LayoutLoc2, "expected the same vector layout");
- return MatchOperand_ParseFail;
- }
- if (Count > 4) {
- Error(RegLoc2, "invalid number of vectors");
- return MatchOperand_ParseFail;
- }
-
- LastReg = Reg2;
- }
- }
-
- if (Parser.getTok().isNot(AsmToken::RCurly)) {
- Error(Parser.getTok().getLoc(), "'}' expected");
- return MatchOperand_ParseFail;
- }
- SMLoc ELoc = Parser.getTok().getLoc();
- Parser.Lex(); // Eat '}' token.
-
- A64Layout::VectorLayout Layout = A64StringToVectorLayout(LayoutStr);
- if (Count > 1) { // If count > 1, create vector list using super register.
- bool IsVec64 = (Layout < A64Layout::_16B) ? true : false;
- static unsigned SupRegIDs[3][2] = {
- { AArch64::QPairRegClassID, AArch64::DPairRegClassID },
- { AArch64::QTripleRegClassID, AArch64::DTripleRegClassID },
- { AArch64::QQuadRegClassID, AArch64::DQuadRegClassID }
- };
- unsigned SupRegID = SupRegIDs[Count - 2][static_cast<int>(IsVec64)];
- unsigned Sub0 = IsVec64 ? AArch64::dsub_0 : AArch64::qsub_0;
- const MCRegisterInfo *MRI = getContext().getRegisterInfo();
- Reg = MRI->getMatchingSuperReg(Reg, Sub0,
- &AArch64MCRegisterClasses[SupRegID]);
- }
- Operands.push_back(
- AArch64Operand::CreateVectorList(Reg, Count, Layout, SLoc, ELoc));
-
- return MatchOperand_Success;
-}
-
// FIXME: We would really like to be able to tablegen'erate this.
bool AArch64AsmParser::
validateInstruction(MCInst &Inst,
diff --git a/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp b/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
index 16ec0cb..b9d7c16 100644
--- a/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
+++ b/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
@@ -361,59 +361,6 @@ DecodeFPR128LoRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
return DecodeFPR128RegisterClass(Inst, RegNo, Address, Decoder);
}
-static DecodeStatus DecodeRegisterClassByID(llvm::MCInst &Inst, unsigned RegNo,
- unsigned RegID,
- const void *Decoder) {
- if (RegNo > 31)
- return MCDisassembler::Fail;
-
- uint16_t Register = getReg(Decoder, RegID, RegNo);
- Inst.addOperand(MCOperand::CreateReg(Register));
- return MCDisassembler::Success;
-}
-
-static DecodeStatus DecodeDPairRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
- uint64_t Address,
- const void *Decoder) {
- return DecodeRegisterClassByID(Inst, RegNo, AArch64::DPairRegClassID,
- Decoder);
-}
-
-static DecodeStatus DecodeQPairRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
- uint64_t Address,
- const void *Decoder) {
- return DecodeRegisterClassByID(Inst, RegNo, AArch64::QPairRegClassID,
- Decoder);
-}
-
-static DecodeStatus DecodeDTripleRegisterClass(llvm::MCInst &Inst,
- unsigned RegNo, uint64_t Address,
- const void *Decoder) {
- return DecodeRegisterClassByID(Inst, RegNo, AArch64::DTripleRegClassID,
- Decoder);
-}
-
-static DecodeStatus DecodeQTripleRegisterClass(llvm::MCInst &Inst,
- unsigned RegNo, uint64_t Address,
- const void *Decoder) {
- return DecodeRegisterClassByID(Inst, RegNo, AArch64::QTripleRegClassID,
- Decoder);
-}
-
-static DecodeStatus DecodeDQuadRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
- uint64_t Address,
- const void *Decoder) {
- return DecodeRegisterClassByID(Inst, RegNo, AArch64::DQuadRegClassID,
- Decoder);
-}
-
-static DecodeStatus DecodeQQuadRegisterClass(llvm::MCInst &Inst, unsigned RegNo,
- uint64_t Address,
- const void *Decoder) {
- return DecodeRegisterClassByID(Inst, RegNo, AArch64::QQuadRegClassID,
- Decoder);
-}
-
static DecodeStatus DecodeAddrRegExtendOperand(llvm::MCInst &Inst,
unsigned OptionHiS,
uint64_t Address,
diff --git a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp
index 51335e1..26bd797 100644
--- a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp
+++ b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp
@@ -507,33 +507,3 @@ void AArch64InstPrinter::printNeonUImm64MaskOperand(const MCInst *MI,
O << "#0x";
O.write_hex(Mask);
}
-
-// If Count > 1, there are two valid kinds of vector list:
-// (1) {Vn.layout, Vn+1.layout, ... , Vm.layout}
-// (2) {Vn.layout - Vm.layout}
-// We choose the first kind as output.
-template <A64Layout::VectorLayout Layout, unsigned Count>
-void AArch64InstPrinter::printVectorList(const MCInst *MI, unsigned OpNum,
- raw_ostream &O) {
- assert(Count >= 1 && Count <= 4 && "Invalid Number of Vectors");
-
- unsigned Reg = MI->getOperand(OpNum).getReg();
- std::string LayoutStr = A64VectorLayoutToString(Layout);
- O << "{";
- if (Count > 1) { // Print sub registers separately
- bool IsVec64 = (Layout < A64Layout::_16B) ? true : false;
- unsigned SubRegIdx = IsVec64 ? AArch64::dsub_0 : AArch64::qsub_0;
- for (unsigned I = 0; I < Count; I++) {
- std::string Name = getRegisterName(MRI.getSubReg(Reg, SubRegIdx++));
- Name[0] = 'v';
- O << Name << LayoutStr;
- if (I != Count - 1)
- O << ", ";
- }
- } else { // Print the register directly when NumVecs is 1.
- std::string Name = getRegisterName(Reg);
- Name[0] = 'v';
- O << Name << LayoutStr;
- }
- O << "}";
-}
diff --git a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h
index 28ebfc4..71c9f4a 100644
--- a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h
+++ b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h
@@ -174,9 +174,6 @@ public:
raw_ostream &O);
void printNeonUImm64MaskOperand(const MCInst *MI, unsigned OpNum,
raw_ostream &O);
-
- template <A64Layout::VectorLayout Layout, unsigned Count>
- void printVectorList(const MCInst *MI, unsigned OpNum, raw_ostream &O);
};
}
diff --git a/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/lib/Target/AArch64/Utils/AArch64BaseInfo.h
index 7db5238..e675efc 100644
--- a/lib/Target/AArch64/Utils/AArch64BaseInfo.h
+++ b/lib/Target/AArch64/Utils/AArch64BaseInfo.h
@@ -306,50 +306,6 @@ namespace A64SE {
};
}
-namespace A64Layout {
- enum VectorLayout {
- Invalid = -1,
- _8B,
- _4H,
- _2S,
- _1D,
-
- _16B,
- _8H,
- _4S,
- _2D
- };
-}
-
-inline static const char *
-A64VectorLayoutToString(A64Layout::VectorLayout Layout) {
- switch (Layout) {
- case A64Layout::_8B: return ".8b";
- case A64Layout::_4H: return ".4h";
- case A64Layout::_2S: return ".2s";
- case A64Layout::_1D: return ".1d";
- case A64Layout::_16B: return ".16b";
- case A64Layout::_8H: return ".8h";
- case A64Layout::_4S: return ".4s";
- case A64Layout::_2D: return ".2d";
- default: llvm_unreachable("Unknown Vector Layout");
- }
-}
-
-inline static A64Layout::VectorLayout
-A64StringToVectorLayout(StringRef LayoutStr) {
- return StringSwitch<A64Layout::VectorLayout>(LayoutStr)
- .Case(".8b", A64Layout::_8B)
- .Case(".4h", A64Layout::_4H)
- .Case(".2s", A64Layout::_2S)
- .Case(".1d", A64Layout::_1D)
- .Case(".16b", A64Layout::_16B)
- .Case(".8h", A64Layout::_8H)
- .Case(".4s", A64Layout::_4S)
- .Case(".2d", A64Layout::_2D)
- .Default(A64Layout::Invalid);
-}
-
namespace A64SysReg {
enum SysRegROValues {
MDCCSR_EL0 = 0x9808, // 10 011 0000 0001 000
diff --git a/test/CodeGen/AArch64/neon-simd-ldst-multi-elem.ll b/test/CodeGen/AArch64/neon-simd-ldst-multi-elem.ll
deleted file mode 100644
index 4cd76bc..0000000
--- a/test/CodeGen/AArch64/neon-simd-ldst-multi-elem.ll
+++ /dev/null
@@ -1,1228 +0,0 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
-
-%struct.int8x16x2_t = type { [2 x <16 x i8>] }
-%struct.int16x8x2_t = type { [2 x <8 x i16>] }
-%struct.int32x4x2_t = type { [2 x <4 x i32>] }
-%struct.int64x2x2_t = type { [2 x <2 x i64>] }
-%struct.float32x4x2_t = type { [2 x <4 x float>] }
-%struct.float64x2x2_t = type { [2 x <2 x double>] }
-%struct.int8x8x2_t = type { [2 x <8 x i8>] }
-%struct.int16x4x2_t = type { [2 x <4 x i16>] }
-%struct.int32x2x2_t = type { [2 x <2 x i32>] }
-%struct.int64x1x2_t = type { [2 x <1 x i64>] }
-%struct.float32x2x2_t = type { [2 x <2 x float>] }
-%struct.float64x1x2_t = type { [2 x <1 x double>] }
-%struct.int8x16x3_t = type { [3 x <16 x i8>] }
-%struct.int16x8x3_t = type { [3 x <8 x i16>] }
-%struct.int32x4x3_t = type { [3 x <4 x i32>] }
-%struct.int64x2x3_t = type { [3 x <2 x i64>] }
-%struct.float32x4x3_t = type { [3 x <4 x float>] }
-%struct.float64x2x3_t = type { [3 x <2 x double>] }
-%struct.int8x8x3_t = type { [3 x <8 x i8>] }
-%struct.int16x4x3_t = type { [3 x <4 x i16>] }
-%struct.int32x2x3_t = type { [3 x <2 x i32>] }
-%struct.int64x1x3_t = type { [3 x <1 x i64>] }
-%struct.float32x2x3_t = type { [3 x <2 x float>] }
-%struct.float64x1x3_t = type { [3 x <1 x double>] }
-%struct.int8x16x4_t = type { [4 x <16 x i8>] }
-%struct.int16x8x4_t = type { [4 x <8 x i16>] }
-%struct.int32x4x4_t = type { [4 x <4 x i32>] }
-%struct.int64x2x4_t = type { [4 x <2 x i64>] }
-%struct.float32x4x4_t = type { [4 x <4 x float>] }
-%struct.float64x2x4_t = type { [4 x <2 x double>] }
-%struct.int8x8x4_t = type { [4 x <8 x i8>] }
-%struct.int16x4x4_t = type { [4 x <4 x i16>] }
-%struct.int32x2x4_t = type { [4 x <2 x i32>] }
-%struct.int64x1x4_t = type { [4 x <1 x i64>] }
-%struct.float32x2x4_t = type { [4 x <2 x float>] }
-%struct.float64x1x4_t = type { [4 x <1 x double>] }
-
-
-define <16 x i8> @test_vld1q_s8(i8* readonly %a) {
-; CHECK: test_vld1q_s8
-; CHECK: ld1 {v{{[0-9]+}}.16b}, [x{{[0-9]+|sp}}]
- %vld1 = tail call <16 x i8> @llvm.arm.neon.vld1.v16i8(i8* %a, i32 1)
- ret <16 x i8> %vld1
-}
-
-define <8 x i16> @test_vld1q_s16(i16* readonly %a) {
-; CHECK: test_vld1q_s16
-; CHECK: ld1 {v{{[0-9]+}}.8h}, [x{{[0-9]+|sp}}]
- %1 = bitcast i16* %a to i8*
- %vld1 = tail call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %1, i32 2)
- ret <8 x i16> %vld1
-}
-
-define <4 x i32> @test_vld1q_s32(i32* readonly %a) {
-; CHECK: test_vld1q_s32
-; CHECK: ld1 {v{{[0-9]+}}.4s}, [x{{[0-9]+|sp}}]
- %1 = bitcast i32* %a to i8*
- %vld1 = tail call <4 x i32> @llvm.arm.neon.vld1.v4i32(i8* %1, i32 4)
- ret <4 x i32> %vld1
-}
-
-define <2 x i64> @test_vld1q_s64(i64* readonly %a) {
-; CHECK: test_vld1q_s64
-; CHECK: ld1 {v{{[0-9]+}}.2d}, [x{{[0-9]+|sp}}]
- %1 = bitcast i64* %a to i8*
- %vld1 = tail call <2 x i64> @llvm.arm.neon.vld1.v2i64(i8* %1, i32 8)
- ret <2 x i64> %vld1
-}
-
-define <4 x float> @test_vld1q_f32(float* readonly %a) {
-; CHECK: test_vld1q_f32
-; CHECK: ld1 {v{{[0-9]+}}.4s}, [x{{[0-9]+|sp}}]
- %1 = bitcast float* %a to i8*
- %vld1 = tail call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* %1, i32 4)
- ret <4 x float> %vld1
-}
-
-define <2 x double> @test_vld1q_f64(double* readonly %a) {
-; CHECK: test_vld1q_f64
-; CHECK: ld1 {v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
- %1 = bitcast double* %a to i8*
- %vld1 = tail call <2 x double> @llvm.arm.neon.vld1.v2f64(i8* %1, i32 8)
- ret <2 x double> %vld1
-}
-
-define <8 x i8> @test_vld1_s8(i8* readonly %a) {
-; CHECK: test_vld1_s8
-; CHECK: ld1 {v{{[0-9]+}}.8b}, [x{{[0-9]+|sp}}]
- %vld1 = tail call <8 x i8> @llvm.arm.neon.vld1.v8i8(i8* %a, i32 1)
- ret <8 x i8> %vld1
-}
-
-define <4 x i16> @test_vld1_s16(i16* readonly %a) {
-; CHECK: test_vld1_s16
-; CHECK: ld1 {v{{[0-9]+}}.4h}, [x{{[0-9]+|sp}}]
- %1 = bitcast i16* %a to i8*
- %vld1 = tail call <4 x i16> @llvm.arm.neon.vld1.v4i16(i8* %1, i32 2)
- ret <4 x i16> %vld1
-}
-
-define <2 x i32> @test_vld1_s32(i32* readonly %a) {
-; CHECK: test_vld1_s32
-; CHECK: ld1 {v{{[0-9]+}}.2s}, [x{{[0-9]+|sp}}]
- %1 = bitcast i32* %a to i8*
- %vld1 = tail call <2 x i32> @llvm.arm.neon.vld1.v2i32(i8* %1, i32 4)
- ret <2 x i32> %vld1
-}
-
-define <1 x i64> @test_vld1_s64(i64* readonly %a) {
-; CHECK: test_vld1_s64
-; CHECK: ld1 {v{{[0-9]+}}.1d}, [x{{[0-9]+|sp}}]
- %1 = bitcast i64* %a to i8*
- %vld1 = tail call <1 x i64> @llvm.arm.neon.vld1.v1i64(i8* %1, i32 8)
- ret <1 x i64> %vld1
-}
-
-define <2 x float> @test_vld1_f32(float* readonly %a) {
-; CHECK: test_vld1_f32
-; CHECK: ld1 {v{{[0-9]+}}.2s}, [x{{[0-9]+|sp}}]
- %1 = bitcast float* %a to i8*
- %vld1 = tail call <2 x float> @llvm.arm.neon.vld1.v2f32(i8* %1, i32 4)
- ret <2 x float> %vld1
-}
-
-define <1 x double> @test_vld1_f64(double* readonly %a) {
-; CHECK: test_vld1_f64
-; CHECK: ld1 {v{{[0-9]+}}.1d}, [x{{[0-9]+|sp}}]
- %1 = bitcast double* %a to i8*
- %vld1 = tail call <1 x double> @llvm.arm.neon.vld1.v1f64(i8* %1, i32 8)
- ret <1 x double> %vld1
-}
-
-define <8 x i8> @test_vld1_p8(i8* readonly %a) {
-; CHECK: test_vld1_p8
-; CHECK: ld1 {v{{[0-9]+}}.8b}, [x{{[0-9]+|sp}}]
- %vld1 = tail call <8 x i8> @llvm.arm.neon.vld1.v8i8(i8* %a, i32 1)
- ret <8 x i8> %vld1
-}
-
-define <4 x i16> @test_vld1_p16(i16* readonly %a) {
-; CHECK: test_vld1_p16
-; CHECK: ld1 {v{{[0-9]+}}.4h}, [x{{[0-9]+|sp}}]
- %1 = bitcast i16* %a to i8*
- %vld1 = tail call <4 x i16> @llvm.arm.neon.vld1.v4i16(i8* %1, i32 2)
- ret <4 x i16> %vld1
-}
-
-define %struct.int8x16x2_t @test_vld2q_s8(i8* readonly %a) {
-; CHECK: test_vld2q_s8
-; CHECK: ld2 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [x{{[0-9]+|sp}}]
- %vld2 = tail call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2.v16i8(i8* %a, i32 1)
- %vld2.fca.0.extract = extractvalue { <16 x i8>, <16 x i8> } %vld2, 0
- %vld2.fca.1.extract = extractvalue { <16 x i8>, <16 x i8> } %vld2, 1
- %.fca.0.0.insert = insertvalue %struct.int8x16x2_t undef, <16 x i8> %vld2.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int8x16x2_t %.fca.0.0.insert, <16 x i8> %vld2.fca.1.extract, 0, 1
- ret %struct.int8x16x2_t %.fca.0.1.insert
-}
-
-define %struct.int16x8x2_t @test_vld2q_s16(i16* readonly %a) {
-; CHECK: test_vld2q_s16
-; CHECK: ld2 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [x{{[0-9]+|sp}}]
- %1 = bitcast i16* %a to i8*
- %vld2 = tail call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2.v8i16(i8* %1, i32 2)
- %vld2.fca.0.extract = extractvalue { <8 x i16>, <8 x i16> } %vld2, 0
- %vld2.fca.1.extract = extractvalue { <8 x i16>, <8 x i16> } %vld2, 1
- %.fca.0.0.insert = insertvalue %struct.int16x8x2_t undef, <8 x i16> %vld2.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int16x8x2_t %.fca.0.0.insert, <8 x i16> %vld2.fca.1.extract, 0, 1
- ret %struct.int16x8x2_t %.fca.0.1.insert
-}
-
-define %struct.int32x4x2_t @test_vld2q_s32(i32* readonly %a) {
-; CHECK: test_vld2q_s32
-; CHECK: ld2 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [x{{[0-9]+|sp}}]
- %1 = bitcast i32* %a to i8*
- %vld2 = tail call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32(i8* %1, i32 4)
- %vld2.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 0
- %vld2.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 1
- %.fca.0.0.insert = insertvalue %struct.int32x4x2_t undef, <4 x i32> %vld2.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int32x4x2_t %.fca.0.0.insert, <4 x i32> %vld2.fca.1.extract, 0, 1
- ret %struct.int32x4x2_t %.fca.0.1.insert
-}
-
-define %struct.int64x2x2_t @test_vld2q_s64(i64* readonly %a) {
-; CHECK: test_vld2q_s64
-; CHECK: ld2 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [x{{[0-9]+|sp}}]
- %1 = bitcast i64* %a to i8*
- %vld2 = tail call { <2 x i64>, <2 x i64> } @llvm.arm.neon.vld2.v2i64(i8* %1, i32 8)
- %vld2.fca.0.extract = extractvalue { <2 x i64>, <2 x i64> } %vld2, 0
- %vld2.fca.1.extract = extractvalue { <2 x i64>, <2 x i64> } %vld2, 1
- %.fca.0.0.insert = insertvalue %struct.int64x2x2_t undef, <2 x i64> %vld2.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int64x2x2_t %.fca.0.0.insert, <2 x i64> %vld2.fca.1.extract, 0, 1
- ret %struct.int64x2x2_t %.fca.0.1.insert
-}
-
-define %struct.float32x4x2_t @test_vld2q_f32(float* readonly %a) {
-; CHECK: test_vld2q_f32
-; CHECK: ld2 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [x{{[0-9]+|sp}}]
- %1 = bitcast float* %a to i8*
- %vld2 = tail call { <4 x float>, <4 x float> } @llvm.arm.neon.vld2.v4f32(i8* %1, i32 4)
- %vld2.fca.0.extract = extractvalue { <4 x float>, <4 x float> } %vld2, 0
- %vld2.fca.1.extract = extractvalue { <4 x float>, <4 x float> } %vld2, 1
- %.fca.0.0.insert = insertvalue %struct.float32x4x2_t undef, <4 x float> %vld2.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.float32x4x2_t %.fca.0.0.insert, <4 x float> %vld2.fca.1.extract, 0, 1
- ret %struct.float32x4x2_t %.fca.0.1.insert
-}
-
-define %struct.float64x2x2_t @test_vld2q_f64(double* readonly %a) {
-; CHECK: test_vld2q_f64
-; CHECK: ld2 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [x{{[0-9]+|sp}}]
- %1 = bitcast double* %a to i8*
- %vld2 = tail call { <2 x double>, <2 x double> } @llvm.arm.neon.vld2.v2f64(i8* %1, i32 8)
- %vld2.fca.0.extract = extractvalue { <2 x double>, <2 x double> } %vld2, 0
- %vld2.fca.1.extract = extractvalue { <2 x double>, <2 x double> } %vld2, 1
- %.fca.0.0.insert = insertvalue %struct.float64x2x2_t undef, <2 x double> %vld2.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.float64x2x2_t %.fca.0.0.insert, <2 x double> %vld2.fca.1.extract, 0, 1
- ret %struct.float64x2x2_t %.fca.0.1.insert
-}
-
-define %struct.int8x8x2_t @test_vld2_s8(i8* readonly %a) {
-; CHECK: test_vld2_s8
-; CHECK: ld2 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [x{{[0-9]+|sp}}]
- %vld2 = tail call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2.v8i8(i8* %a, i32 1)
- %vld2.fca.0.extract = extractvalue { <8 x i8>, <8 x i8> } %vld2, 0
- %vld2.fca.1.extract = extractvalue { <8 x i8>, <8 x i8> } %vld2, 1
- %.fca.0.0.insert = insertvalue %struct.int8x8x2_t undef, <8 x i8> %vld2.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int8x8x2_t %.fca.0.0.insert, <8 x i8> %vld2.fca.1.extract, 0, 1
- ret %struct.int8x8x2_t %.fca.0.1.insert
-}
-
-define %struct.int16x4x2_t @test_vld2_s16(i16* readonly %a) {
-; CHECK: test_vld2_s16
-; CHECK: ld2 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [x{{[0-9]+|sp}}]
- %1 = bitcast i16* %a to i8*
- %vld2 = tail call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2.v4i16(i8* %1, i32 2)
- %vld2.fca.0.extract = extractvalue { <4 x i16>, <4 x i16> } %vld2, 0
- %vld2.fca.1.extract = extractvalue { <4 x i16>, <4 x i16> } %vld2, 1
- %.fca.0.0.insert = insertvalue %struct.int16x4x2_t undef, <4 x i16> %vld2.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int16x4x2_t %.fca.0.0.insert, <4 x i16> %vld2.fca.1.extract, 0, 1
- ret %struct.int16x4x2_t %.fca.0.1.insert
-}
-
-define %struct.int32x2x2_t @test_vld2_s32(i32* readonly %a) {
-; CHECK: test_vld2_s32
-; CHECK: ld2 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [x{{[0-9]+|sp}}]
- %1 = bitcast i32* %a to i8*
- %vld2 = tail call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2.v2i32(i8* %1, i32 4)
- %vld2.fca.0.extract = extractvalue { <2 x i32>, <2 x i32> } %vld2, 0
- %vld2.fca.1.extract = extractvalue { <2 x i32>, <2 x i32> } %vld2, 1
- %.fca.0.0.insert = insertvalue %struct.int32x2x2_t undef, <2 x i32> %vld2.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int32x2x2_t %.fca.0.0.insert, <2 x i32> %vld2.fca.1.extract, 0, 1
- ret %struct.int32x2x2_t %.fca.0.1.insert
-}
-
-define %struct.int64x1x2_t @test_vld2_s64(i64* readonly %a) {
-; CHECK: test_vld2_s64
-; CHECK: ld1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [x{{[0-9]+|sp}}]
- %1 = bitcast i64* %a to i8*
- %vld2 = tail call { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2.v1i64(i8* %1, i32 8)
- %vld2.fca.0.extract = extractvalue { <1 x i64>, <1 x i64> } %vld2, 0
- %vld2.fca.1.extract = extractvalue { <1 x i64>, <1 x i64> } %vld2, 1
- %.fca.0.0.insert = insertvalue %struct.int64x1x2_t undef, <1 x i64> %vld2.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int64x1x2_t %.fca.0.0.insert, <1 x i64> %vld2.fca.1.extract, 0, 1
- ret %struct.int64x1x2_t %.fca.0.1.insert
-}
-
-define %struct.float32x2x2_t @test_vld2_f32(float* readonly %a) {
-; CHECK: test_vld2_f32
-; CHECK: ld2 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [x{{[0-9]+|sp}}]
- %1 = bitcast float* %a to i8*
- %vld2 = tail call { <2 x float>, <2 x float> } @llvm.arm.neon.vld2.v2f32(i8* %1, i32 4)
- %vld2.fca.0.extract = extractvalue { <2 x float>, <2 x float> } %vld2, 0
- %vld2.fca.1.extract = extractvalue { <2 x float>, <2 x float> } %vld2, 1
- %.fca.0.0.insert = insertvalue %struct.float32x2x2_t undef, <2 x float> %vld2.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.float32x2x2_t %.fca.0.0.insert, <2 x float> %vld2.fca.1.extract, 0, 1
- ret %struct.float32x2x2_t %.fca.0.1.insert
-}
-
-define %struct.float64x1x2_t @test_vld2_f64(double* readonly %a) {
-; CHECK: test_vld2_f64
-; CHECK: ld1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [x{{[0-9]+|sp}}]
- %1 = bitcast double* %a to i8*
- %vld2 = tail call { <1 x double>, <1 x double> } @llvm.arm.neon.vld2.v1f64(i8* %1, i32 8)
- %vld2.fca.0.extract = extractvalue { <1 x double>, <1 x double> } %vld2, 0
- %vld2.fca.1.extract = extractvalue { <1 x double>, <1 x double> } %vld2, 1
- %.fca.0.0.insert = insertvalue %struct.float64x1x2_t undef, <1 x double> %vld2.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.float64x1x2_t %.fca.0.0.insert, <1 x double> %vld2.fca.1.extract, 0, 1
- ret %struct.float64x1x2_t %.fca.0.1.insert
-}
-
-define %struct.int8x16x3_t @test_vld3q_s8(i8* readonly %a) {
-; CHECK: test_vld3q_s8
-; CHECK: ld3 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [x{{[0-9]+|sp}}]
- %vld3 = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8(i8* %a, i32 1)
- %vld3.fca.0.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3, 0
- %vld3.fca.1.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3, 1
- %vld3.fca.2.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3, 2
- %.fca.0.0.insert = insertvalue %struct.int8x16x3_t undef, <16 x i8> %vld3.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int8x16x3_t %.fca.0.0.insert, <16 x i8> %vld3.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int8x16x3_t %.fca.0.1.insert, <16 x i8> %vld3.fca.2.extract, 0, 2
- ret %struct.int8x16x3_t %.fca.0.2.insert
-}
-
-define %struct.int16x8x3_t @test_vld3q_s16(i16* readonly %a) {
-; CHECK: test_vld3q_s16
-; CHECK: ld3 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [x{{[0-9]+|sp}}]
- %1 = bitcast i16* %a to i8*
- %vld3 = tail call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3.v8i16(i8* %1, i32 2)
- %vld3.fca.0.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld3, 0
- %vld3.fca.1.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld3, 1
- %vld3.fca.2.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld3, 2
- %.fca.0.0.insert = insertvalue %struct.int16x8x3_t undef, <8 x i16> %vld3.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int16x8x3_t %.fca.0.0.insert, <8 x i16> %vld3.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int16x8x3_t %.fca.0.1.insert, <8 x i16> %vld3.fca.2.extract, 0, 2
- ret %struct.int16x8x3_t %.fca.0.2.insert
-}
-
-define %struct.int32x4x3_t @test_vld3q_s32(i32* readonly %a) {
-; CHECK: test_vld3q_s32
-; CHECK: ld3 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [x{{[0-9]+|sp}}]
- %1 = bitcast i32* %a to i8*
- %vld3 = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3.v4i32(i8* %1, i32 4)
- %vld3.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 0
- %vld3.fca.1.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 1
- %vld3.fca.2.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 2
- %.fca.0.0.insert = insertvalue %struct.int32x4x3_t undef, <4 x i32> %vld3.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int32x4x3_t %.fca.0.0.insert, <4 x i32> %vld3.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int32x4x3_t %.fca.0.1.insert, <4 x i32> %vld3.fca.2.extract, 0, 2
- ret %struct.int32x4x3_t %.fca.0.2.insert
-}
-
-define %struct.int64x2x3_t @test_vld3q_s64(i64* readonly %a) {
-; CHECK: test_vld3q_s64
-; CHECK: ld3 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [x{{[0-9]+|sp}}]
- %1 = bitcast i64* %a to i8*
- %vld3 = tail call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld3.v2i64(i8* %1, i32 8)
- %vld3.fca.0.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld3, 0
- %vld3.fca.1.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld3, 1
- %vld3.fca.2.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld3, 2
- %.fca.0.0.insert = insertvalue %struct.int64x2x3_t undef, <2 x i64> %vld3.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int64x2x3_t %.fca.0.0.insert, <2 x i64> %vld3.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int64x2x3_t %.fca.0.1.insert, <2 x i64> %vld3.fca.2.extract, 0, 2
- ret %struct.int64x2x3_t %.fca.0.2.insert
-}
-
-define %struct.float32x4x3_t @test_vld3q_f32(float* readonly %a) {
-; CHECK: test_vld3q_f32
-; CHECK: ld3 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [x{{[0-9]+|sp}}]
- %1 = bitcast float* %a to i8*
- %vld3 = tail call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3.v4f32(i8* %1, i32 4)
- %vld3.fca.0.extract = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld3, 0
- %vld3.fca.1.extract = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld3, 1
- %vld3.fca.2.extract = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld3, 2
- %.fca.0.0.insert = insertvalue %struct.float32x4x3_t undef, <4 x float> %vld3.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.float32x4x3_t %.fca.0.0.insert, <4 x float> %vld3.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.float32x4x3_t %.fca.0.1.insert, <4 x float> %vld3.fca.2.extract, 0, 2
- ret %struct.float32x4x3_t %.fca.0.2.insert
-}
-
-define %struct.float64x2x3_t @test_vld3q_f64(double* readonly %a) {
-; CHECK: test_vld3q_f64
-; CHECK: ld3 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [x{{[0-9]+|sp}}]
- %1 = bitcast double* %a to i8*
- %vld3 = tail call { <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld3.v2f64(i8* %1, i32 8)
- %vld3.fca.0.extract = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld3, 0
- %vld3.fca.1.extract = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld3, 1
- %vld3.fca.2.extract = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld3, 2
- %.fca.0.0.insert = insertvalue %struct.float64x2x3_t undef, <2 x double> %vld3.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.float64x2x3_t %.fca.0.0.insert, <2 x double> %vld3.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.float64x2x3_t %.fca.0.1.insert, <2 x double> %vld3.fca.2.extract, 0, 2
- ret %struct.float64x2x3_t %.fca.0.2.insert
-}
-
-define %struct.int8x8x3_t @test_vld3_s8(i8* readonly %a) {
-; CHECK: test_vld3_s8
-; CHECK: ld3 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [x{{[0-9]+|sp}}]
- %vld3 = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3.v8i8(i8* %a, i32 1)
- %vld3.fca.0.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3, 0
- %vld3.fca.1.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3, 1
- %vld3.fca.2.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3, 2
- %.fca.0.0.insert = insertvalue %struct.int8x8x3_t undef, <8 x i8> %vld3.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int8x8x3_t %.fca.0.0.insert, <8 x i8> %vld3.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int8x8x3_t %.fca.0.1.insert, <8 x i8> %vld3.fca.2.extract, 0, 2
- ret %struct.int8x8x3_t %.fca.0.2.insert
-}
-
-define %struct.int16x4x3_t @test_vld3_s16(i16* readonly %a) {
-; CHECK: test_vld3_s16
-; CHECK: ld3 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [x{{[0-9]+|sp}}]
- %1 = bitcast i16* %a to i8*
- %vld3 = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3.v4i16(i8* %1, i32 2)
- %vld3.fca.0.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld3, 0
- %vld3.fca.1.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld3, 1
- %vld3.fca.2.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld3, 2
- %.fca.0.0.insert = insertvalue %struct.int16x4x3_t undef, <4 x i16> %vld3.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int16x4x3_t %.fca.0.0.insert, <4 x i16> %vld3.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int16x4x3_t %.fca.0.1.insert, <4 x i16> %vld3.fca.2.extract, 0, 2
- ret %struct.int16x4x3_t %.fca.0.2.insert
-}
-
-define %struct.int32x2x3_t @test_vld3_s32(i32* readonly %a) {
-; CHECK: test_vld3_s32
-; CHECK: ld3 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [x{{[0-9]+|sp}}]
- %1 = bitcast i32* %a to i8*
- %vld3 = tail call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3.v2i32(i8* %1, i32 4)
- %vld3.fca.0.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld3, 0
- %vld3.fca.1.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld3, 1
- %vld3.fca.2.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld3, 2
- %.fca.0.0.insert = insertvalue %struct.int32x2x3_t undef, <2 x i32> %vld3.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int32x2x3_t %.fca.0.0.insert, <2 x i32> %vld3.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int32x2x3_t %.fca.0.1.insert, <2 x i32> %vld3.fca.2.extract, 0, 2
- ret %struct.int32x2x3_t %.fca.0.2.insert
-}
-
-define %struct.int64x1x3_t @test_vld3_s64(i64* readonly %a) {
-; CHECK: test_vld3_s64
-; CHECK: ld1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [x{{[0-9]+|sp}}]
- %1 = bitcast i64* %a to i8*
- %vld3 = tail call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3.v1i64(i8* %1, i32 8)
- %vld3.fca.0.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld3, 0
- %vld3.fca.1.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld3, 1
- %vld3.fca.2.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld3, 2
- %.fca.0.0.insert = insertvalue %struct.int64x1x3_t undef, <1 x i64> %vld3.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int64x1x3_t %.fca.0.0.insert, <1 x i64> %vld3.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int64x1x3_t %.fca.0.1.insert, <1 x i64> %vld3.fca.2.extract, 0, 2
- ret %struct.int64x1x3_t %.fca.0.2.insert
-}
-
-define %struct.float32x2x3_t @test_vld3_f32(float* readonly %a) {
-; CHECK: test_vld3_f32
-; CHECK: ld3 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [x{{[0-9]+|sp}}]
- %1 = bitcast float* %a to i8*
- %vld3 = tail call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3.v2f32(i8* %1, i32 4)
- %vld3.fca.0.extract = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld3, 0
- %vld3.fca.1.extract = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld3, 1
- %vld3.fca.2.extract = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld3, 2
- %.fca.0.0.insert = insertvalue %struct.float32x2x3_t undef, <2 x float> %vld3.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.float32x2x3_t %.fca.0.0.insert, <2 x float> %vld3.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.float32x2x3_t %.fca.0.1.insert, <2 x float> %vld3.fca.2.extract, 0, 2
- ret %struct.float32x2x3_t %.fca.0.2.insert
-}
-
-define %struct.float64x1x3_t @test_vld3_f64(double* readonly %a) {
-; CHECK: test_vld3_f64
-; CHECK: ld1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [x{{[0-9]+|sp}}]
- %1 = bitcast double* %a to i8*
- %vld3 = tail call { <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld3.v1f64(i8* %1, i32 8)
- %vld3.fca.0.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld3, 0
- %vld3.fca.1.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld3, 1
- %vld3.fca.2.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld3, 2
- %.fca.0.0.insert = insertvalue %struct.float64x1x3_t undef, <1 x double> %vld3.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.float64x1x3_t %.fca.0.0.insert, <1 x double> %vld3.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.float64x1x3_t %.fca.0.1.insert, <1 x double> %vld3.fca.2.extract, 0, 2
- ret %struct.float64x1x3_t %.fca.0.2.insert
-}
-
-define %struct.int8x16x4_t @test_vld4q_s8(i8* readonly %a) {
-; CHECK: test_vld4q_s8
-; CHECK: ld4 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [x{{[0-9]+|sp}}]
- %vld4 = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4.v16i8(i8* %a, i32 1)
- %vld4.fca.0.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4, 0
- %vld4.fca.1.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4, 1
- %vld4.fca.2.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4, 2
- %vld4.fca.3.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4, 3
- %.fca.0.0.insert = insertvalue %struct.int8x16x4_t undef, <16 x i8> %vld4.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int8x16x4_t %.fca.0.0.insert, <16 x i8> %vld4.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int8x16x4_t %.fca.0.1.insert, <16 x i8> %vld4.fca.2.extract, 0, 2
- %.fca.0.3.insert = insertvalue %struct.int8x16x4_t %.fca.0.2.insert, <16 x i8> %vld4.fca.3.extract, 0, 3
- ret %struct.int8x16x4_t %.fca.0.3.insert
-}
-
-define %struct.int16x8x4_t @test_vld4q_s16(i16* readonly %a) {
-; CHECK: test_vld4q_s16
-; CHECK: ld4 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [x{{[0-9]+|sp}}]
- %1 = bitcast i16* %a to i8*
- %vld4 = tail call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4.v8i16(i8* %1, i32 2)
- %vld4.fca.0.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld4, 0
- %vld4.fca.1.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld4, 1
- %vld4.fca.2.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld4, 2
- %vld4.fca.3.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld4, 3
- %.fca.0.0.insert = insertvalue %struct.int16x8x4_t undef, <8 x i16> %vld4.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int16x8x4_t %.fca.0.0.insert, <8 x i16> %vld4.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int16x8x4_t %.fca.0.1.insert, <8 x i16> %vld4.fca.2.extract, 0, 2
- %.fca.0.3.insert = insertvalue %struct.int16x8x4_t %.fca.0.2.insert, <8 x i16> %vld4.fca.3.extract, 0, 3
- ret %struct.int16x8x4_t %.fca.0.3.insert
-}
-
-define %struct.int32x4x4_t @test_vld4q_s32(i32* readonly %a) {
-; CHECK: test_vld4q_s32
-; CHECK: ld4 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [x{{[0-9]+|sp}}]
- %1 = bitcast i32* %a to i8*
- %vld4 = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4.v4i32(i8* %1, i32 4)
- %vld4.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld4, 0
- %vld4.fca.1.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld4, 1
- %vld4.fca.2.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld4, 2
- %vld4.fca.3.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld4, 3
- %.fca.0.0.insert = insertvalue %struct.int32x4x4_t undef, <4 x i32> %vld4.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int32x4x4_t %.fca.0.0.insert, <4 x i32> %vld4.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int32x4x4_t %.fca.0.1.insert, <4 x i32> %vld4.fca.2.extract, 0, 2
- %.fca.0.3.insert = insertvalue %struct.int32x4x4_t %.fca.0.2.insert, <4 x i32> %vld4.fca.3.extract, 0, 3
- ret %struct.int32x4x4_t %.fca.0.3.insert
-}
-
-define %struct.int64x2x4_t @test_vld4q_s64(i64* readonly %a) {
-; CHECK: test_vld4q_s64
-; CHECK: ld4 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [x{{[0-9]+|sp}}]
- %1 = bitcast i64* %a to i8*
- %vld4 = tail call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld4.v2i64(i8* %1, i32 8)
- %vld4.fca.0.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld4, 0
- %vld4.fca.1.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld4, 1
- %vld4.fca.2.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld4, 2
- %vld4.fca.3.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld4, 3
- %.fca.0.0.insert = insertvalue %struct.int64x2x4_t undef, <2 x i64> %vld4.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int64x2x4_t %.fca.0.0.insert, <2 x i64> %vld4.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int64x2x4_t %.fca.0.1.insert, <2 x i64> %vld4.fca.2.extract, 0, 2
- %.fca.0.3.insert = insertvalue %struct.int64x2x4_t %.fca.0.2.insert, <2 x i64> %vld4.fca.3.extract, 0, 3
- ret %struct.int64x2x4_t %.fca.0.3.insert
-}
-
-define %struct.float32x4x4_t @test_vld4q_f32(float* readonly %a) {
-; CHECK: test_vld4q_f32
-; CHECK: ld4 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [x{{[0-9]+|sp}}]
- %1 = bitcast float* %a to i8*
- %vld4 = tail call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4.v4f32(i8* %1, i32 4)
- %vld4.fca.0.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld4, 0
- %vld4.fca.1.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld4, 1
- %vld4.fca.2.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld4, 2
- %vld4.fca.3.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld4, 3
- %.fca.0.0.insert = insertvalue %struct.float32x4x4_t undef, <4 x float> %vld4.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.float32x4x4_t %.fca.0.0.insert, <4 x float> %vld4.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.float32x4x4_t %.fca.0.1.insert, <4 x float> %vld4.fca.2.extract, 0, 2
- %.fca.0.3.insert = insertvalue %struct.float32x4x4_t %.fca.0.2.insert, <4 x float> %vld4.fca.3.extract, 0, 3
- ret %struct.float32x4x4_t %.fca.0.3.insert
-}
-
-define %struct.float64x2x4_t @test_vld4q_f64(double* readonly %a) {
-; CHECK: test_vld4q_f64
-; CHECK: ld4 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [x{{[0-9]+|sp}}]
- %1 = bitcast double* %a to i8*
- %vld4 = tail call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld4.v2f64(i8* %1, i32 8)
- %vld4.fca.0.extract = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld4, 0
- %vld4.fca.1.extract = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld4, 1
- %vld4.fca.2.extract = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld4, 2
- %vld4.fca.3.extract = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld4, 3
- %.fca.0.0.insert = insertvalue %struct.float64x2x4_t undef, <2 x double> %vld4.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.float64x2x4_t %.fca.0.0.insert, <2 x double> %vld4.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.float64x2x4_t %.fca.0.1.insert, <2 x double> %vld4.fca.2.extract, 0, 2
- %.fca.0.3.insert = insertvalue %struct.float64x2x4_t %.fca.0.2.insert, <2 x double> %vld4.fca.3.extract, 0, 3
- ret %struct.float64x2x4_t %.fca.0.3.insert
-}
-
-define %struct.int8x8x4_t @test_vld4_s8(i8* readonly %a) {
-; CHECK: test_vld4_s8
-; CHECK: ld4 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [x{{[0-9]+|sp}}]
- %vld4 = tail call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4.v8i8(i8* %a, i32 1)
- %vld4.fca.0.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4, 0
- %vld4.fca.1.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4, 1
- %vld4.fca.2.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4, 2
- %vld4.fca.3.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4, 3
- %.fca.0.0.insert = insertvalue %struct.int8x8x4_t undef, <8 x i8> %vld4.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int8x8x4_t %.fca.0.0.insert, <8 x i8> %vld4.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int8x8x4_t %.fca.0.1.insert, <8 x i8> %vld4.fca.2.extract, 0, 2
- %.fca.0.3.insert = insertvalue %struct.int8x8x4_t %.fca.0.2.insert, <8 x i8> %vld4.fca.3.extract, 0, 3
- ret %struct.int8x8x4_t %.fca.0.3.insert
-}
-
-define %struct.int16x4x4_t @test_vld4_s16(i16* readonly %a) {
-; CHECK: test_vld4_s16
-; CHECK: ld4 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [x{{[0-9]+|sp}}]
- %1 = bitcast i16* %a to i8*
- %vld4 = tail call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4.v4i16(i8* %1, i32 2)
- %vld4.fca.0.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld4, 0
- %vld4.fca.1.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld4, 1
- %vld4.fca.2.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld4, 2
- %vld4.fca.3.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld4, 3
- %.fca.0.0.insert = insertvalue %struct.int16x4x4_t undef, <4 x i16> %vld4.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int16x4x4_t %.fca.0.0.insert, <4 x i16> %vld4.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int16x4x4_t %.fca.0.1.insert, <4 x i16> %vld4.fca.2.extract, 0, 2
- %.fca.0.3.insert = insertvalue %struct.int16x4x4_t %.fca.0.2.insert, <4 x i16> %vld4.fca.3.extract, 0, 3
- ret %struct.int16x4x4_t %.fca.0.3.insert
-}
-
-define %struct.int32x2x4_t @test_vld4_s32(i32* readonly %a) {
-; CHECK: test_vld4_s32
-; CHECK: ld4 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [x{{[0-9]+|sp}}]
- %1 = bitcast i32* %a to i8*
- %vld4 = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4.v2i32(i8* %1, i32 4)
- %vld4.fca.0.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld4, 0
- %vld4.fca.1.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld4, 1
- %vld4.fca.2.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld4, 2
- %vld4.fca.3.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld4, 3
- %.fca.0.0.insert = insertvalue %struct.int32x2x4_t undef, <2 x i32> %vld4.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int32x2x4_t %.fca.0.0.insert, <2 x i32> %vld4.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int32x2x4_t %.fca.0.1.insert, <2 x i32> %vld4.fca.2.extract, 0, 2
- %.fca.0.3.insert = insertvalue %struct.int32x2x4_t %.fca.0.2.insert, <2 x i32> %vld4.fca.3.extract, 0, 3
- ret %struct.int32x2x4_t %.fca.0.3.insert
-}
-
-define %struct.int64x1x4_t @test_vld4_s64(i64* readonly %a) {
-; CHECK: test_vld4_s64
-; CHECK: ld1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [x{{[0-9]+|sp}}]
- %1 = bitcast i64* %a to i8*
- %vld4 = tail call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4.v1i64(i8* %1, i32 8)
- %vld4.fca.0.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld4, 0
- %vld4.fca.1.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld4, 1
- %vld4.fca.2.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld4, 2
- %vld4.fca.3.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld4, 3
- %.fca.0.0.insert = insertvalue %struct.int64x1x4_t undef, <1 x i64> %vld4.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.int64x1x4_t %.fca.0.0.insert, <1 x i64> %vld4.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.int64x1x4_t %.fca.0.1.insert, <1 x i64> %vld4.fca.2.extract, 0, 2
- %.fca.0.3.insert = insertvalue %struct.int64x1x4_t %.fca.0.2.insert, <1 x i64> %vld4.fca.3.extract, 0, 3
- ret %struct.int64x1x4_t %.fca.0.3.insert
-}
-
-define %struct.float32x2x4_t @test_vld4_f32(float* readonly %a) {
-; CHECK: test_vld4_f32
-; CHECK: ld4 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [x{{[0-9]+|sp}}]
- %1 = bitcast float* %a to i8*
- %vld4 = tail call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld4.v2f32(i8* %1, i32 4)
- %vld4.fca.0.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld4, 0
- %vld4.fca.1.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld4, 1
- %vld4.fca.2.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld4, 2
- %vld4.fca.3.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld4, 3
- %.fca.0.0.insert = insertvalue %struct.float32x2x4_t undef, <2 x float> %vld4.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.float32x2x4_t %.fca.0.0.insert, <2 x float> %vld4.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.float32x2x4_t %.fca.0.1.insert, <2 x float> %vld4.fca.2.extract, 0, 2
- %.fca.0.3.insert = insertvalue %struct.float32x2x4_t %.fca.0.2.insert, <2 x float> %vld4.fca.3.extract, 0, 3
- ret %struct.float32x2x4_t %.fca.0.3.insert
-}
-
-define %struct.float64x1x4_t @test_vld4_f64(double* readonly %a) {
-; CHECK: test_vld4_f64
-; CHECK: ld1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [x{{[0-9]+|sp}}]
- %1 = bitcast double* %a to i8*
- %vld4 = tail call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld4.v1f64(i8* %1, i32 8)
- %vld4.fca.0.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld4, 0
- %vld4.fca.1.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld4, 1
- %vld4.fca.2.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld4, 2
- %vld4.fca.3.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld4, 3
- %.fca.0.0.insert = insertvalue %struct.float64x1x4_t undef, <1 x double> %vld4.fca.0.extract, 0, 0
- %.fca.0.1.insert = insertvalue %struct.float64x1x4_t %.fca.0.0.insert, <1 x double> %vld4.fca.1.extract, 0, 1
- %.fca.0.2.insert = insertvalue %struct.float64x1x4_t %.fca.0.1.insert, <1 x double> %vld4.fca.2.extract, 0, 2
- %.fca.0.3.insert = insertvalue %struct.float64x1x4_t %.fca.0.2.insert, <1 x double> %vld4.fca.3.extract, 0, 3
- ret %struct.float64x1x4_t %.fca.0.3.insert
-}
-
-declare <16 x i8> @llvm.arm.neon.vld1.v16i8(i8*, i32)
-declare <8 x i16> @llvm.arm.neon.vld1.v8i16(i8*, i32)
-declare <4 x i32> @llvm.arm.neon.vld1.v4i32(i8*, i32)
-declare <2 x i64> @llvm.arm.neon.vld1.v2i64(i8*, i32)
-declare <4 x float> @llvm.arm.neon.vld1.v4f32(i8*, i32)
-declare <2 x double> @llvm.arm.neon.vld1.v2f64(i8*, i32)
-declare <8 x i8> @llvm.arm.neon.vld1.v8i8(i8*, i32)
-declare <4 x i16> @llvm.arm.neon.vld1.v4i16(i8*, i32)
-declare <2 x i32> @llvm.arm.neon.vld1.v2i32(i8*, i32)
-declare <1 x i64> @llvm.arm.neon.vld1.v1i64(i8*, i32)
-declare <2 x float> @llvm.arm.neon.vld1.v2f32(i8*, i32)
-declare <1 x double> @llvm.arm.neon.vld1.v1f64(i8*, i32)
-declare { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2.v16i8(i8*, i32)
-declare { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2.v8i16(i8*, i32)
-declare { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32(i8*, i32)
-declare { <2 x i64>, <2 x i64> } @llvm.arm.neon.vld2.v2i64(i8*, i32)
-declare { <4 x float>, <4 x float> } @llvm.arm.neon.vld2.v4f32(i8*, i32)
-declare { <2 x double>, <2 x double> } @llvm.arm.neon.vld2.v2f64(i8*, i32)
-declare { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2.v8i8(i8*, i32)
-declare { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2.v4i16(i8*, i32)
-declare { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2.v2i32(i8*, i32)
-declare { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2.v1i64(i8*, i32)
-declare { <2 x float>, <2 x float> } @llvm.arm.neon.vld2.v2f32(i8*, i32)
-declare { <1 x double>, <1 x double> } @llvm.arm.neon.vld2.v1f64(i8*, i32)
-declare { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8(i8*, i32)
-declare { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3.v8i16(i8*, i32)
-declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3.v4i32(i8*, i32)
-declare { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld3.v2i64(i8*, i32)
-declare { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3.v4f32(i8*, i32)
-declare { <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld3.v2f64(i8*, i32)
-declare { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3.v8i8(i8*, i32)
-declare { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3.v4i16(i8*, i32)
-declare { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3.v2i32(i8*, i32)
-declare { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3.v1i64(i8*, i32)
-declare { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3.v2f32(i8*, i32)
-declare { <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld3.v1f64(i8*, i32)
-declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4.v16i8(i8*, i32)
-declare { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4.v8i16(i8*, i32)
-declare { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4.v4i32(i8*, i32)
-declare { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld4.v2i64(i8*, i32)
-declare { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4.v4f32(i8*, i32)
-declare { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld4.v2f64(i8*, i32)
-declare { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4.v8i8(i8*, i32)
-declare { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4.v4i16(i8*, i32)
-declare { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4.v2i32(i8*, i32)
-declare { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4.v1i64(i8*, i32)
-declare { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld4.v2f32(i8*, i32)
-declare { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld4.v1f64(i8*, i32)
-
-define void @test_vst1q_s8(i8* %a, <16 x i8> %b) {
-; CHECK: test_vst1q_s8
-; CHECK: st1 {v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
- tail call void @llvm.arm.neon.vst1.v16i8(i8* %a, <16 x i8> %b, i32 1)
- ret void
-}
-
-define void @test_vst1q_s16(i16* %a, <8 x i16> %b) {
-; CHECK: test_vst1q_s16
-; CHECK: st1 {v{{[0-9]+}}.8h}, [{{x[0-9]+|sp}}]
- %1 = bitcast i16* %a to i8*
- tail call void @llvm.arm.neon.vst1.v8i16(i8* %1, <8 x i16> %b, i32 2)
- ret void
-}
-
-define void @test_vst1q_s32(i32* %a, <4 x i32> %b) {
-; CHECK: test_vst1q_s32
-; CHECK: st1 {v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
- %1 = bitcast i32* %a to i8*
- tail call void @llvm.arm.neon.vst1.v4i32(i8* %1, <4 x i32> %b, i32 4)
- ret void
-}
-
-define void @test_vst1q_s64(i64* %a, <2 x i64> %b) {
-; CHECK: test_vst1q_s64
-; CHECK: st1 {v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
- %1 = bitcast i64* %a to i8*
- tail call void @llvm.arm.neon.vst1.v2i64(i8* %1, <2 x i64> %b, i32 8)
- ret void
-}
-
-define void @test_vst1q_f32(float* %a, <4 x float> %b) {
-; CHECK: test_vst1q_f32
-; CHECK: st1 {v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
- %1 = bitcast float* %a to i8*
- tail call void @llvm.arm.neon.vst1.v4f32(i8* %1, <4 x float> %b, i32 4)
- ret void
-}
-
-define void @test_vst1q_f64(double* %a, <2 x double> %b) {
-; CHECK: test_vst1q_f64
-; CHECK: st1 {v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
- %1 = bitcast double* %a to i8*
- tail call void @llvm.arm.neon.vst1.v2f64(i8* %1, <2 x double> %b, i32 8)
- ret void
-}
-
-define void @test_vst1_s8(i8* %a, <8 x i8> %b) {
-; CHECK: test_vst1_s8
-; CHECK: st1 {v{{[0-9]+}}.8b}, [{{x[0-9]+|sp}}]
- tail call void @llvm.arm.neon.vst1.v8i8(i8* %a, <8 x i8> %b, i32 1)
- ret void
-}
-
-define void @test_vst1_s16(i16* %a, <4 x i16> %b) {
-; CHECK: test_vst1_s16
-; CHECK: st1 {v{{[0-9]+}}.4h}, [{{x[0-9]+|sp}}]
- %1 = bitcast i16* %a to i8*
- tail call void @llvm.arm.neon.vst1.v4i16(i8* %1, <4 x i16> %b, i32 2)
- ret void
-}
-
-define void @test_vst1_s32(i32* %a, <2 x i32> %b) {
-; CHECK: test_vst1_s32
-; CHECK: st1 {v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
- %1 = bitcast i32* %a to i8*
- tail call void @llvm.arm.neon.vst1.v2i32(i8* %1, <2 x i32> %b, i32 4)
- ret void
-}
-
-define void @test_vst1_s64(i64* %a, <1 x i64> %b) {
-; CHECK: test_vst1_s64
-; CHECK: st1 {v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
- %1 = bitcast i64* %a to i8*
- tail call void @llvm.arm.neon.vst1.v1i64(i8* %1, <1 x i64> %b, i32 8)
- ret void
-}
-
-define void @test_vst1_f32(float* %a, <2 x float> %b) {
-; CHECK: test_vst1_f32
-; CHECK: st1 {v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
- %1 = bitcast float* %a to i8*
- tail call void @llvm.arm.neon.vst1.v2f32(i8* %1, <2 x float> %b, i32 4)
- ret void
-}
-
-define void @test_vst1_f64(double* %a, <1 x double> %b) {
-; CHECK: test_vst1_f64
-; CHECK: st1 {v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
- %1 = bitcast double* %a to i8*
- tail call void @llvm.arm.neon.vst1.v1f64(i8* %1, <1 x double> %b, i32 8)
- ret void
-}
-
-define void @test_vst2q_s8(i8* %a, [2 x <16 x i8>] %b.coerce) {
-; CHECK: test_vst2q_s8
-; CHECK: st2 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [2 x <16 x i8>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [2 x <16 x i8>] %b.coerce, 1
- tail call void @llvm.arm.neon.vst2.v16i8(i8* %a, <16 x i8> %b.coerce.fca.0.extract, <16 x i8> %b.coerce.fca.1.extract, i32 1)
- ret void
-}
-
-define void @test_vst2q_s16(i16* %a, [2 x <8 x i16>] %b.coerce) {
-; CHECK: test_vst2q_s16
-; CHECK: st2 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [2 x <8 x i16>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [2 x <8 x i16>] %b.coerce, 1
- %1 = bitcast i16* %a to i8*
- tail call void @llvm.arm.neon.vst2.v8i16(i8* %1, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, i32 2)
- ret void
-}
-
-define void @test_vst2q_s32(i32* %a, [2 x <4 x i32>] %b.coerce) {
-; CHECK: test_vst2q_s32
-; CHECK: st2 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %b.coerce, 1
- %1 = bitcast i32* %a to i8*
- tail call void @llvm.arm.neon.vst2.v4i32(i8* %1, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, i32 4)
- ret void
-}
-
-define void @test_vst2q_s64(i64* %a, [2 x <2 x i64>] %b.coerce) {
-; CHECK: test_vst2q_s64
-; CHECK: st2 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [2 x <2 x i64>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [2 x <2 x i64>] %b.coerce, 1
- %1 = bitcast i64* %a to i8*
- tail call void @llvm.arm.neon.vst2.v2i64(i8* %1, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, i32 8)
- ret void
-}
-
-define void @test_vst2q_f32(float* %a, [2 x <4 x float>] %b.coerce) {
-; CHECK: test_vst2q_f32
-; CHECK: st2 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [2 x <4 x float>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [2 x <4 x float>] %b.coerce, 1
- %1 = bitcast float* %a to i8*
- tail call void @llvm.arm.neon.vst2.v4f32(i8* %1, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, i32 4)
- ret void
-}
-
-define void @test_vst2q_f64(double* %a, [2 x <2 x double>] %b.coerce) {
-; CHECK: test_vst2q_f64
-; CHECK: st2 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [2 x <2 x double>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [2 x <2 x double>] %b.coerce, 1
- %1 = bitcast double* %a to i8*
- tail call void @llvm.arm.neon.vst2.v2f64(i8* %1, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, i32 8)
- ret void
-}
-
-define void @test_vst2_s8(i8* %a, [2 x <8 x i8>] %b.coerce) {
-; CHECK: test_vst2_s8
-; CHECK: st2 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [2 x <8 x i8>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [2 x <8 x i8>] %b.coerce, 1
- tail call void @llvm.arm.neon.vst2.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, i32 1)
- ret void
-}
-
-define void @test_vst2_s16(i16* %a, [2 x <4 x i16>] %b.coerce) {
-; CHECK: test_vst2_s16
-; CHECK: st2 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [2 x <4 x i16>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [2 x <4 x i16>] %b.coerce, 1
- %1 = bitcast i16* %a to i8*
- tail call void @llvm.arm.neon.vst2.v4i16(i8* %1, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, i32 2)
- ret void
-}
-
-define void @test_vst2_s32(i32* %a, [2 x <2 x i32>] %b.coerce) {
-; CHECK: test_vst2_s32
-; CHECK: st2 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [2 x <2 x i32>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [2 x <2 x i32>] %b.coerce, 1
- %1 = bitcast i32* %a to i8*
- tail call void @llvm.arm.neon.vst2.v2i32(i8* %1, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, i32 4)
- ret void
-}
-
-define void @test_vst2_s64(i64* %a, [2 x <1 x i64>] %b.coerce) {
-; CHECK: test_vst2_s64
-; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [2 x <1 x i64>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [2 x <1 x i64>] %b.coerce, 1
- %1 = bitcast i64* %a to i8*
- tail call void @llvm.arm.neon.vst2.v1i64(i8* %1, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, i32 8)
- ret void
-}
-
-define void @test_vst2_f32(float* %a, [2 x <2 x float>] %b.coerce) {
-; CHECK: test_vst2_f32
-; CHECK: st2 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [2 x <2 x float>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [2 x <2 x float>] %b.coerce, 1
- %1 = bitcast float* %a to i8*
- tail call void @llvm.arm.neon.vst2.v2f32(i8* %1, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, i32 4)
- ret void
-}
-
-define void @test_vst2_f64(double* %a, [2 x <1 x double>] %b.coerce) {
-; CHECK: test_vst2_f64
-; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [2 x <1 x double>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [2 x <1 x double>] %b.coerce, 1
- %1 = bitcast double* %a to i8*
- tail call void @llvm.arm.neon.vst2.v1f64(i8* %1, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, i32 8)
- ret void
-}
-
-define void @test_vst3q_s8(i8* %a, [3 x <16 x i8>] %b.coerce) {
-; CHECK: test_vst3q_s8
-; CHECK: st3 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [3 x <16 x i8>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [3 x <16 x i8>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [3 x <16 x i8>] %b.coerce, 2
- tail call void @llvm.arm.neon.vst3.v16i8(i8* %a, <16 x i8> %b.coerce.fca.0.extract, <16 x i8> %b.coerce.fca.1.extract, <16 x i8> %b.coerce.fca.2.extract, i32 1)
- ret void
-}
-
-define void @test_vst3q_s16(i16* %a, [3 x <8 x i16>] %b.coerce) {
-; CHECK: test_vst3q_s16
-; CHECK: st3 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [3 x <8 x i16>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [3 x <8 x i16>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [3 x <8 x i16>] %b.coerce, 2
- %1 = bitcast i16* %a to i8*
- tail call void @llvm.arm.neon.vst3.v8i16(i8* %1, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, <8 x i16> %b.coerce.fca.2.extract, i32 2)
- ret void
-}
-
-define void @test_vst3q_s32(i32* %a, [3 x <4 x i32>] %b.coerce) {
-; CHECK: test_vst3q_s32
-; CHECK: st3 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [3 x <4 x i32>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [3 x <4 x i32>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [3 x <4 x i32>] %b.coerce, 2
- %1 = bitcast i32* %a to i8*
- tail call void @llvm.arm.neon.vst3.v4i32(i8* %1, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, <4 x i32> %b.coerce.fca.2.extract, i32 4)
- ret void
-}
-
-define void @test_vst3q_s64(i64* %a, [3 x <2 x i64>] %b.coerce) {
-; CHECK: test_vst3q_s64
-; CHECK: st3 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [3 x <2 x i64>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [3 x <2 x i64>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [3 x <2 x i64>] %b.coerce, 2
- %1 = bitcast i64* %a to i8*
- tail call void @llvm.arm.neon.vst3.v2i64(i8* %1, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, <2 x i64> %b.coerce.fca.2.extract, i32 8)
- ret void
-}
-
-define void @test_vst3q_f32(float* %a, [3 x <4 x float>] %b.coerce) {
-; CHECK: test_vst3q_f32
-; CHECK: st3 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [3 x <4 x float>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [3 x <4 x float>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [3 x <4 x float>] %b.coerce, 2
- %1 = bitcast float* %a to i8*
- tail call void @llvm.arm.neon.vst3.v4f32(i8* %1, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, <4 x float> %b.coerce.fca.2.extract, i32 4)
- ret void
-}
-
-define void @test_vst3q_f64(double* %a, [3 x <2 x double>] %b.coerce) {
-; CHECK: test_vst3q_f64
-; CHECK: st3 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [3 x <2 x double>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [3 x <2 x double>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [3 x <2 x double>] %b.coerce, 2
- %1 = bitcast double* %a to i8*
- tail call void @llvm.arm.neon.vst3.v2f64(i8* %1, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, <2 x double> %b.coerce.fca.2.extract, i32 8)
- ret void
-}
-
-define void @test_vst3_s8(i8* %a, [3 x <8 x i8>] %b.coerce) {
-; CHECK: test_vst3_s8
-; CHECK: st3 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [3 x <8 x i8>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [3 x <8 x i8>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [3 x <8 x i8>] %b.coerce, 2
- tail call void @llvm.arm.neon.vst3.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, <8 x i8> %b.coerce.fca.2.extract, i32 1)
- ret void
-}
-
-define void @test_vst3_s16(i16* %a, [3 x <4 x i16>] %b.coerce) {
-; CHECK: test_vst3_s16
-; CHECK: st3 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [3 x <4 x i16>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [3 x <4 x i16>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [3 x <4 x i16>] %b.coerce, 2
- %1 = bitcast i16* %a to i8*
- tail call void @llvm.arm.neon.vst3.v4i16(i8* %1, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, <4 x i16> %b.coerce.fca.2.extract, i32 2)
- ret void
-}
-
-define void @test_vst3_s32(i32* %a, [3 x <2 x i32>] %b.coerce) {
-; CHECK: test_vst3_s32
-; CHECK: st3 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [3 x <2 x i32>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [3 x <2 x i32>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [3 x <2 x i32>] %b.coerce, 2
- %1 = bitcast i32* %a to i8*
- tail call void @llvm.arm.neon.vst3.v2i32(i8* %1, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, <2 x i32> %b.coerce.fca.2.extract, i32 4)
- ret void
-}
-
-define void @test_vst3_s64(i64* %a, [3 x <1 x i64>] %b.coerce) {
-; CHECK: test_vst3_s64
-; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [3 x <1 x i64>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [3 x <1 x i64>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [3 x <1 x i64>] %b.coerce, 2
- %1 = bitcast i64* %a to i8*
- tail call void @llvm.arm.neon.vst3.v1i64(i8* %1, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, <1 x i64> %b.coerce.fca.2.extract, i32 8)
- ret void
-}
-
-define void @test_vst3_f32(float* %a, [3 x <2 x float>] %b.coerce) {
-; CHECK: test_vst3_f32
-; CHECK: st3 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [3 x <2 x float>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [3 x <2 x float>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [3 x <2 x float>] %b.coerce, 2
- %1 = bitcast float* %a to i8*
- tail call void @llvm.arm.neon.vst3.v2f32(i8* %1, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, <2 x float> %b.coerce.fca.2.extract, i32 4)
- ret void
-}
-
-define void @test_vst3_f64(double* %a, [3 x <1 x double>] %b.coerce) {
-; CHECK: test_vst3_f64
-; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [3 x <1 x double>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [3 x <1 x double>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [3 x <1 x double>] %b.coerce, 2
- %1 = bitcast double* %a to i8*
- tail call void @llvm.arm.neon.vst3.v1f64(i8* %1, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, <1 x double> %b.coerce.fca.2.extract, i32 8)
- ret void
-}
-
-define void @test_vst4q_s8(i8* %a, [4 x <16 x i8>] %b.coerce) {
-; CHECK: test_vst4q_s8
-; CHECK: st4 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [4 x <16 x i8>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [4 x <16 x i8>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [4 x <16 x i8>] %b.coerce, 2
- %b.coerce.fca.3.extract = extractvalue [4 x <16 x i8>] %b.coerce, 3
- tail call void @llvm.arm.neon.vst4.v16i8(i8* %a, <16 x i8> %b.coerce.fca.0.extract, <16 x i8> %b.coerce.fca.1.extract, <16 x i8> %b.coerce.fca.2.extract, <16 x i8> %b.coerce.fca.3.extract, i32 1)
- ret void
-}
-
-define void @test_vst4q_s16(i16* %a, [4 x <8 x i16>] %b.coerce) {
-; CHECK: test_vst4q_s16
-; CHECK: st4 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [4 x <8 x i16>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [4 x <8 x i16>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [4 x <8 x i16>] %b.coerce, 2
- %b.coerce.fca.3.extract = extractvalue [4 x <8 x i16>] %b.coerce, 3
- %1 = bitcast i16* %a to i8*
- tail call void @llvm.arm.neon.vst4.v8i16(i8* %1, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, <8 x i16> %b.coerce.fca.2.extract, <8 x i16> %b.coerce.fca.3.extract, i32 2)
- ret void
-}
-
-define void @test_vst4q_s32(i32* %a, [4 x <4 x i32>] %b.coerce) {
-; CHECK: test_vst4q_s32
-; CHECK: st4 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [4 x <4 x i32>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [4 x <4 x i32>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [4 x <4 x i32>] %b.coerce, 2
- %b.coerce.fca.3.extract = extractvalue [4 x <4 x i32>] %b.coerce, 3
- %1 = bitcast i32* %a to i8*
- tail call void @llvm.arm.neon.vst4.v4i32(i8* %1, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, <4 x i32> %b.coerce.fca.2.extract, <4 x i32> %b.coerce.fca.3.extract, i32 4)
- ret void
-}
-
-define void @test_vst4q_s64(i64* %a, [4 x <2 x i64>] %b.coerce) {
-; CHECK: test_vst4q_s64
-; CHECK: st4 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [4 x <2 x i64>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [4 x <2 x i64>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [4 x <2 x i64>] %b.coerce, 2
- %b.coerce.fca.3.extract = extractvalue [4 x <2 x i64>] %b.coerce, 3
- %1 = bitcast i64* %a to i8*
- tail call void @llvm.arm.neon.vst4.v2i64(i8* %1, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, <2 x i64> %b.coerce.fca.2.extract, <2 x i64> %b.coerce.fca.3.extract, i32 8)
- ret void
-}
-
-define void @test_vst4q_f32(float* %a, [4 x <4 x float>] %b.coerce) {
-; CHECK: test_vst4q_f32
-; CHECK: st4 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [4 x <4 x float>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [4 x <4 x float>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [4 x <4 x float>] %b.coerce, 2
- %b.coerce.fca.3.extract = extractvalue [4 x <4 x float>] %b.coerce, 3
- %1 = bitcast float* %a to i8*
- tail call void @llvm.arm.neon.vst4.v4f32(i8* %1, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, <4 x float> %b.coerce.fca.2.extract, <4 x float> %b.coerce.fca.3.extract, i32 4)
- ret void
-}
-
-define void @test_vst4q_f64(double* %a, [4 x <2 x double>] %b.coerce) {
-; CHECK: test_vst4q_f64
-; CHECK: st4 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [4 x <2 x double>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [4 x <2 x double>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [4 x <2 x double>] %b.coerce, 2
- %b.coerce.fca.3.extract = extractvalue [4 x <2 x double>] %b.coerce, 3
- %1 = bitcast double* %a to i8*
- tail call void @llvm.arm.neon.vst4.v2f64(i8* %1, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, <2 x double> %b.coerce.fca.2.extract, <2 x double> %b.coerce.fca.3.extract, i32 8)
- ret void
-}
-
-define void @test_vst4_s8(i8* %a, [4 x <8 x i8>] %b.coerce) {
-; CHECK: test_vst4_s8
-; CHECK: st4 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [4 x <8 x i8>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [4 x <8 x i8>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [4 x <8 x i8>] %b.coerce, 2
- %b.coerce.fca.3.extract = extractvalue [4 x <8 x i8>] %b.coerce, 3
- tail call void @llvm.arm.neon.vst4.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, <8 x i8> %b.coerce.fca.2.extract, <8 x i8> %b.coerce.fca.3.extract, i32 1)
- ret void
-}
-
-define void @test_vst4_s16(i16* %a, [4 x <4 x i16>] %b.coerce) {
-; CHECK: test_vst4_s16
-; CHECK: st4 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [4 x <4 x i16>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [4 x <4 x i16>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [4 x <4 x i16>] %b.coerce, 2
- %b.coerce.fca.3.extract = extractvalue [4 x <4 x i16>] %b.coerce, 3
- %1 = bitcast i16* %a to i8*
- tail call void @llvm.arm.neon.vst4.v4i16(i8* %1, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, <4 x i16> %b.coerce.fca.2.extract, <4 x i16> %b.coerce.fca.3.extract, i32 2)
- ret void
-}
-
-define void @test_vst4_s32(i32* %a, [4 x <2 x i32>] %b.coerce) {
-; CHECK: test_vst4_s32
-; CHECK: st4 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [4 x <2 x i32>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [4 x <2 x i32>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [4 x <2 x i32>] %b.coerce, 2
- %b.coerce.fca.3.extract = extractvalue [4 x <2 x i32>] %b.coerce, 3
- %1 = bitcast i32* %a to i8*
- tail call void @llvm.arm.neon.vst4.v2i32(i8* %1, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, <2 x i32> %b.coerce.fca.2.extract, <2 x i32> %b.coerce.fca.3.extract, i32 4)
- ret void
-}
-
-define void @test_vst4_s64(i64* %a, [4 x <1 x i64>] %b.coerce) {
-; CHECK: test_vst4_s64
-; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [4 x <1 x i64>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [4 x <1 x i64>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [4 x <1 x i64>] %b.coerce, 2
- %b.coerce.fca.3.extract = extractvalue [4 x <1 x i64>] %b.coerce, 3
- %1 = bitcast i64* %a to i8*
- tail call void @llvm.arm.neon.vst4.v1i64(i8* %1, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, <1 x i64> %b.coerce.fca.2.extract, <1 x i64> %b.coerce.fca.3.extract, i32 8)
- ret void
-}
-
-define void @test_vst4_f32(float* %a, [4 x <2 x float>] %b.coerce) {
-; CHECK: test_vst4_f32
-; CHECK: st4 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [4 x <2 x float>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [4 x <2 x float>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [4 x <2 x float>] %b.coerce, 2
- %b.coerce.fca.3.extract = extractvalue [4 x <2 x float>] %b.coerce, 3
- %1 = bitcast float* %a to i8*
- tail call void @llvm.arm.neon.vst4.v2f32(i8* %1, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, <2 x float> %b.coerce.fca.2.extract, <2 x float> %b.coerce.fca.3.extract, i32 4)
- ret void
-}
-
-define void @test_vst4_f64(double* %a, [4 x <1 x double>] %b.coerce) {
-; CHECK: test_vst4_f64
-; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
- %b.coerce.fca.0.extract = extractvalue [4 x <1 x double>] %b.coerce, 0
- %b.coerce.fca.1.extract = extractvalue [4 x <1 x double>] %b.coerce, 1
- %b.coerce.fca.2.extract = extractvalue [4 x <1 x double>] %b.coerce, 2
- %b.coerce.fca.3.extract = extractvalue [4 x <1 x double>] %b.coerce, 3
- %1 = bitcast double* %a to i8*
- tail call void @llvm.arm.neon.vst4.v1f64(i8* %1, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, <1 x double> %b.coerce.fca.2.extract, <1 x double> %b.coerce.fca.3.extract, i32 8)
- ret void
-}
-
-declare void @llvm.arm.neon.vst1.v16i8(i8*, <16 x i8>, i32)
-declare void @llvm.arm.neon.vst1.v8i16(i8*, <8 x i16>, i32)
-declare void @llvm.arm.neon.vst1.v4i32(i8*, <4 x i32>, i32)
-declare void @llvm.arm.neon.vst1.v2i64(i8*, <2 x i64>, i32)
-declare void @llvm.arm.neon.vst1.v4f32(i8*, <4 x float>, i32)
-declare void @llvm.arm.neon.vst1.v2f64(i8*, <2 x double>, i32)
-declare void @llvm.arm.neon.vst1.v8i8(i8*, <8 x i8>, i32)
-declare void @llvm.arm.neon.vst1.v4i16(i8*, <4 x i16>, i32)
-declare void @llvm.arm.neon.vst1.v2i32(i8*, <2 x i32>, i32)
-declare void @llvm.arm.neon.vst1.v1i64(i8*, <1 x i64>, i32)
-declare void @llvm.arm.neon.vst1.v2f32(i8*, <2 x float>, i32)
-declare void @llvm.arm.neon.vst1.v1f64(i8*, <1 x double>, i32)
-declare void @llvm.arm.neon.vst2.v16i8(i8*, <16 x i8>, <16 x i8>, i32)
-declare void @llvm.arm.neon.vst2.v8i16(i8*, <8 x i16>, <8 x i16>, i32)
-declare void @llvm.arm.neon.vst2.v4i32(i8*, <4 x i32>, <4 x i32>, i32)
-declare void @llvm.arm.neon.vst2.v2i64(i8*, <2 x i64>, <2 x i64>, i32)
-declare void @llvm.arm.neon.vst2.v4f32(i8*, <4 x float>, <4 x float>, i32)
-declare void @llvm.arm.neon.vst2.v2f64(i8*, <2 x double>, <2 x double>, i32)
-declare void @llvm.arm.neon.vst2.v8i8(i8*, <8 x i8>, <8 x i8>, i32)
-declare void @llvm.arm.neon.vst2.v4i16(i8*, <4 x i16>, <4 x i16>, i32)
-declare void @llvm.arm.neon.vst2.v2i32(i8*, <2 x i32>, <2 x i32>, i32)
-declare void @llvm.arm.neon.vst2.v1i64(i8*, <1 x i64>, <1 x i64>, i32)
-declare void @llvm.arm.neon.vst2.v2f32(i8*, <2 x float>, <2 x float>, i32)
-declare void @llvm.arm.neon.vst2.v1f64(i8*, <1 x double>, <1 x double>, i32)
-declare void @llvm.arm.neon.vst3.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, i32)
-declare void @llvm.arm.neon.vst3.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32)
-declare void @llvm.arm.neon.vst3.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, i32)
-declare void @llvm.arm.neon.vst3.v2i64(i8*, <2 x i64>, <2 x i64>, <2 x i64>, i32)
-declare void @llvm.arm.neon.vst3.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, i32)
-declare void @llvm.arm.neon.vst3.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, i32)
-declare void @llvm.arm.neon.vst3.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32)
-declare void @llvm.arm.neon.vst3.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32)
-declare void @llvm.arm.neon.vst3.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32)
-declare void @llvm.arm.neon.vst3.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, i32)
-declare void @llvm.arm.neon.vst3.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, i32)
-declare void @llvm.arm.neon.vst3.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, i32)
-declare void @llvm.arm.neon.vst4.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i32)
-declare void @llvm.arm.neon.vst4.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i32)
-declare void @llvm.arm.neon.vst4.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32)
-declare void @llvm.arm.neon.vst4.v2i64(i8*, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i32)
-declare void @llvm.arm.neon.vst4.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32)
-declare void @llvm.arm.neon.vst4.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, <2 x double>, i32)
-declare void @llvm.arm.neon.vst4.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32)
-declare void @llvm.arm.neon.vst4.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32)
-declare void @llvm.arm.neon.vst4.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32)
-declare void @llvm.arm.neon.vst4.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i32)
-declare void @llvm.arm.neon.vst4.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32)
-declare void @llvm.arm.neon.vst4.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, <1 x double>, i32) \ No newline at end of file
diff --git a/test/MC/AArch64/neon-diagnostics.s b/test/MC/AArch64/neon-diagnostics.s
index 086d487..9127ed8 100644
--- a/test/MC/AArch64/neon-diagnostics.s
+++ b/test/MC/AArch64/neon-diagnostics.s
@@ -3880,224 +3880,3 @@
// CHECK-ERROR: error: invalid operand for instruction
// CHECK-ERROR: frsqrts d8, s22, d18
// CHECK-ERROR: ^
-
-//----------------------------------------------------------------------
-// Vector load/store multiple N-element structure (class SIMD lselem)
-//----------------------------------------------------------------------
- ld1 {x3}, [x2]
- ld1 {v4}, [x0]
- ld1 {v32.16b}, [x0]
- ld1 {v15.8h}, [x32]
-// CHECK-ERROR: error: expected vector type register
-// CHECK-ERROR: ld1 {x3}, [x2]
-// CHECK-ERROR: ^
-// CHECK-ERROR: error: expected vector type register
-// CHECK-ERROR: ld1 {v4}, [x0]
-// CHECK-ERROR: ^
-// CHECK-ERROR: error: expected vector type register
-// CHECK-ERROR: ld1 {v32.16b}, [x0]
-// CHECK-ERROR: ^
-// CHECK-ERROR: error: invalid operand for instruction
-// CHECK-ERROR: ld1 {v15.8h}, [x32]
-// CHECK-ERROR: ^
-
- ld1 {v0.16b, v2.16b}, [x0]
- ld1 {v0.8h, v1.8h, v2.8h, v3.8h, v4.8h}, [x0]
- ld1 v0.8b, v1.8b}, [x0]
- ld1 {v0.8h-v4.8h}, [x0]
- ld1 {v1.8h-v1.8h}, [x0]
- ld1 {v15.8h-v17.4h}, [x15]
- ld1 {v0.8b-v2.8b, [x0]
-// CHECK-ERROR: error: invalid space between two vectors
-// CHECK-ERROR: ld1 {v0.16b, v2.16b}, [x0]
-// CHECK-ERROR: ^
-// CHECK-ERROR: error: invalid number of vectors
-// CHECK-ERROR: ld1 {v0.8h, v1.8h, v2.8h, v3.8h, v4.8h}, [x0]
-// CHECK-ERROR: ^
-// CHECK-ERROR: error: '{' expected
-// CHECK-ERROR: ld1 v0.8b, v1.8b}, [x0]
-// CHECK-ERROR: ^
-// CHECK-ERROR: error: invalid number of vectors
-// CHECK-ERROR: ld1 {v0.8h-v4.8h}, [x0]
-// CHECK-ERROR: ^
-// CHECK-ERROR: error: invalid number of vectors
-// CHECK-ERROR: ld1 {v1.8h-v1.8h}, [x0]
-// CHECK-ERROR: ^
-// CHECK-ERROR: error: expected the same vector layout
-// CHECK-ERROR: ld1 {v15.8h-v17.4h}, [x15]
-// CHECK-ERROR: ^
-// CHECK-ERROR: error: '}' expected
-// CHECK-ERROR: ld1 {v0.8b-v2.8b, [x0]
-// CHECK-ERROR: ^
-
- ld2 {v15.8h, v16.4h}, [x15]
- ld2 {v0.8b, v2.8b}, [x0]
- ld2 {v15.4h, v16.4h, v17.4h}, [x32]
- ld2 {v15.8h-v16.4h}, [x15]
- ld2 {v0.2d-v2.2d}, [x0]
-// CHECK-ERROR: error: invalid space between two vectors
-// CHECK-ERROR: ld2 {v15.8h, v16.4h}, [x15]
-// CHECK-ERROR: ^
-// CHECK-ERROR: error: invalid space between two vectors
-// CHECK-ERROR: ld2 {v0.8b, v2.8b}, [x0]
-// CHECK-ERROR: ^
-// CHECK-ERROR: error: invalid operand for instruction
-// CHECK-ERROR: ld2 {v15.4h, v16.4h, v17.4h}, [x32]
-// CHECK-ERROR: ^
-// CHECK-ERROR: error: expected the same vector layout
-// CHECK-ERROR: ld2 {v15.8h-v16.4h}, [x15]
-// CHECK-ERROR: ^
-// CHECK-ERROR: error: invalid operand for instruction
-// CHECK-ERROR: ld2 {v0.2d-v2.2d}, [x0]
-// CHECK-ERROR: ^
-
- ld3 {v15.8h, v16.8h, v17.4h}, [x15]
- ld3 {v0.8b, v1,8b, v2.8b, v3.8b}, [x0]
- ld3 {v0.8b, v2.8b, v3.8b}, [x0]
- ld3 {v15.8h-v17.4h}, [x15]
- ld3 {v31.4s-v2.4s}, [sp]
-// CHECK-ERROR: error: invalid space between two vectors
-// CHECK-ERROR: ld3 {v15.8h, v16.8h, v17.4h}, [x15]
-// CHECK-ERROR: ^
-// CHECK-ERROR: error: expected vector type register
-// CHECK-ERROR: ld3 {v0.8b, v1,8b, v2.8b, v3.8b}, [x0]
-// CHECK-ERROR: ^
-// CHECK-ERROR: error: invalid space between two vectors
-// CHECK-ERROR: ld3 {v0.8b, v2.8b, v3.8b}, [x0]
-// CHECK-ERROR: ^
-// CHECK-ERROR: error: expected the same vector layout
-// CHECK-ERROR: ld3 {v15.8h-v17.4h}, [x15]
-// CHECK-ERROR: ^
-// CHECK-ERROR: error: invalid operand for instruction
-// CHECK-ERROR: ld3 {v31.4s-v2.4s}, [sp]
-// CHECK-ERROR: ^
-
- ld4 {v15.8h, v16.8h, v17.4h, v18.8h}, [x15]
- ld4 {v0.8b, v2.8b, v3.8b, v4.8b}, [x0]
- ld4 {v15.4h, v16.4h, v17.4h, v18.4h, v19.4h}, [x31]
- ld4 {v15.8h-v18.4h}, [x15]
- ld4 {v31.2s-v1.2s}, [x31]
-// CHECK-ERROR: error: invalid space between two vectors
-// CHECK-ERROR: ld4 {v15.8h, v16.8h, v17.4h, v18.8h}, [x15]
-// CHECK-ERROR: ^
-// CHECK-ERROR: error: invalid space between two vectors
-// CHECK-ERROR: ld4 {v0.8b, v2.8b, v3.8b, v4.8b}, [x0]
-// CHECK-ERROR: ^
-// CHECK-ERROR: error: invalid number of vectors
-// CHECK-ERROR: ld4 {v15.4h, v16.4h, v17.4h, v18.4h, v19.4h}, [x31]
-// CHECK-ERROR: ^
-// CHECK-ERROR: error: expected the same vector layout
-// CHECK-ERROR: ld4 {v15.8h-v18.4h}, [x15]
-// CHECK-ERROR: ^
-// CHECK-ERROR: error: invalid operand for instruction
-// CHECK-ERROR: ld4 {v31.2s-v1.2s}, [x31]
-// CHECK-ERROR: ^
-
- st1 {x3}, [x2]
- st1 {v4}, [x0]
- st1 {v32.16b}, [x0]
- st1 {v15.8h}, [x32]
-// CHECK-ERROR: error: expected vector type register
-// CHECK-ERROR: st1 {x3}, [x2]
-// CHECK-ERROR: ^
-// CHECK-ERROR: error: expected vector type register
-// CHECK-ERROR: st1 {v4}, [x0]
-// CHECK-ERROR: ^
-// CHECK-ERROR: error: expected vector type register
-// CHECK-ERROR: st1 {v32.16b}, [x0]
-// CHECK-ERROR: ^
-// CHECK-ERROR: error: invalid operand for instruction
-// CHECK-ERROR: st1 {v15.8h}, [x32]
-// CHECK-ERROR: ^
-
- st1 {v0.16b, v2.16b}, [x0]
- st1 {v0.8h, v1.8h, v2.8h, v3.8h, v4.8h}, [x0]
- st1 v0.8b, v1.8b}, [x0]
- st1 {v0.8h-v4.8h}, [x0]
- st1 {v1.8h-v1.8h}, [x0]
- st1 {v15.8h-v17.4h}, [x15]
- st1 {v0.8b-v2.8b, [x0]
-// CHECK-ERROR: error: invalid space between two vectors
-// CHECK-ERROR: st1 {v0.16b, v2.16b}, [x0]
-// CHECK-ERROR: ^
-// CHECK-ERROR: error: invalid number of vectors
-// CHECK-ERROR: st1 {v0.8h, v1.8h, v2.8h, v3.8h, v4.8h}, [x0]
-// CHECK-ERROR: ^
-// CHECK-ERROR: error: '{' expected
-// CHECK-ERROR: st1 v0.8b, v1.8b}, [x0]
-// CHECK-ERROR: ^
-// CHECK-ERROR: error: invalid number of vectors
-// CHECK-ERROR: st1 {v0.8h-v4.8h}, [x0]
-// CHECK-ERROR: ^
-// CHECK-ERROR: error: invalid number of vectors
-// CHECK-ERROR: st1 {v1.8h-v1.8h}, [x0]
-// CHECK-ERROR: ^
-// CHECK-ERROR: error: expected the same vector layout
-// CHECK-ERROR: st1 {v15.8h-v17.4h}, [x15]
-// CHECK-ERROR: ^
-// CHECK-ERROR: error: '}' expected
-// CHECK-ERROR: st1 {v0.8b-v2.8b, [x0]
-// CHECK-ERROR: ^
-
- st2 {v15.8h, v16.4h}, [x15]
- st2 {v0.8b, v2.8b}, [x0]
- st2 {v15.4h, v16.4h, v17.4h}, [x30]
- st2 {v15.8h-v16.4h}, [x15]
- st2 {v0.2d-v2.2d}, [x0]
-// CHECK-ERROR: error: invalid space between two vectors
-// CHECK-ERROR: st2 {v15.8h, v16.4h}, [x15]
-// CHECK-ERROR: ^
-// CHECK-ERROR: error: invalid space between two vectors
-// CHECK-ERROR: st2 {v0.8b, v2.8b}, [x0]
-// CHECK-ERROR: ^
-// CHECK-ERROR: error: invalid operand for instruction
-// CHECK-ERROR: st2 {v15.4h, v16.4h, v17.4h}, [x30]
-// CHECK-ERROR: ^
-// CHECK-ERROR: error: expected the same vector layout
-// CHECK-ERROR: st2 {v15.8h-v16.4h}, [x15]
-// CHECK-ERROR: ^
-// CHECK-ERROR: error: invalid operand for instruction
-// CHECK-ERROR: st2 {v0.2d-v2.2d}, [x0]
-// CHECK-ERROR: ^
-
- st3 {v15.8h, v16.8h, v17.4h}, [x15]
- st3 {v0.8b, v1,8b, v2.8b, v3.8b}, [x0]
- st3 {v0.8b, v2.8b, v3.8b}, [x0]
- st3 {v15.8h-v17.4h}, [x15]
- st3 {v31.4s-v2.4s}, [sp]
-// CHECK-ERROR: error: invalid space between two vectors
-// CHECK-ERROR: st3 {v15.8h, v16.8h, v17.4h}, [x15]
-// CHECK-ERROR: ^
-// CHECK-ERROR: error: expected vector type register
-// CHECK-ERROR: st3 {v0.8b, v1,8b, v2.8b, v3.8b}, [x0]
-// CHECK-ERROR: ^
-// CHECK-ERROR: error: invalid space between two vectors
-// CHECK-ERROR: st3 {v0.8b, v2.8b, v3.8b}, [x0]
-// CHECK-ERROR: ^
-// CHECK-ERROR: error: expected the same vector layout
-// CHECK-ERROR: st3 {v15.8h-v17.4h}, [x15]
-// CHECK-ERROR: ^
-// CHECK-ERROR: error: invalid operand for instruction
-// CHECK-ERROR: st3 {v31.4s-v2.4s}, [sp]
-// CHECK-ERROR: ^
-
- st4 {v15.8h, v16.8h, v17.4h, v18.8h}, [x15]
- st4 {v0.8b, v2.8b, v3.8b, v4.8b}, [x0]
- st4 {v15.4h, v16.4h, v17.4h, v18.4h, v19.4h}, [x31]
- st4 {v15.8h-v18.4h}, [x15]
- st4 {v31.2s-v1.2s}, [x31]
-// CHECK-ERROR: error: invalid space between two vectors
-// CHECK-ERROR: st4 {v15.8h, v16.8h, v17.4h, v18.8h}, [x15]
-// CHECK-ERROR: ^
-// CHECK-ERROR: error: invalid space between two vectors
-// CHECK-ERROR: st4 {v0.8b, v2.8b, v3.8b, v4.8b}, [x0]
-// CHECK-ERROR: ^
-// CHECK-ERROR: error: invalid number of vectors
-// CHECK-ERROR: st4 {v15.4h, v16.4h, v17.4h, v18.4h, v19.4h}, [x31]
-// CHECK-ERROR: ^
-// CHECK-ERROR: error: expected the same vector layout
-// CHECK-ERROR: st4 {v15.8h-v18.4h}, [x15]
-// CHECK-ERROR: ^
-// CHECK-ERROR: error: invalid operand for instruction
-// CHECK-ERROR: st4 {v31.2s-v1.2s}, [x31]
-// CHECK-ERROR: ^
diff --git a/test/MC/AArch64/neon-simd-ldst-multi-elem.s b/test/MC/AArch64/neon-simd-ldst-multi-elem.s
deleted file mode 100644
index 05fe4da..0000000
--- a/test/MC/AArch64/neon-simd-ldst-multi-elem.s
+++ /dev/null
@@ -1,463 +0,0 @@
-// RUN: llvm-mc -triple=aarch64 -mattr=+neon -show-encoding < %s | FileCheck %s
-
-// Check that the assembler can handle the documented syntax for AArch64
-
-//------------------------------------------------------------------------------
-// Store multiple 1-element structures from one register
-//------------------------------------------------------------------------------
- st1 {v0.16b}, [x0]
- st1 {v15.8h}, [x15]
- st1 {v31.4s}, [sp]
- st1 {v0.2d}, [x0]
- st1 {v0.8b}, [x0]
- st1 {v15.4h}, [x15]
- st1 {v31.2s}, [sp]
- st1 {v0.1d}, [x0]
-// CHECK: st1 {v0.16b}, [x0] // encoding: [0x00,0x70,0x00,0x4c]
-// CHECK: st1 {v15.8h}, [x15] // encoding: [0xef,0x75,0x00,0x4c]
-// CHECK: st1 {v31.4s}, [sp] // encoding: [0xff,0x7b,0x00,0x4c]
-// CHECK: st1 {v0.2d}, [x0] // encoding: [0x00,0x7c,0x00,0x4c]
-// CHECK: st1 {v0.8b}, [x0] // encoding: [0x00,0x70,0x00,0x0c]
-// CHECK: st1 {v15.4h}, [x15] // encoding: [0xef,0x75,0x00,0x0c]
-// CHECK: st1 {v31.2s}, [sp] // encoding: [0xff,0x7b,0x00,0x0c]
-// CHECK: st1 {v0.1d}, [x0] // encoding: [0x00,0x7c,0x00,0x0c]
-
-//------------------------------------------------------------------------------
-// Store multiple 1-element structures from two consecutive registers
-//------------------------------------------------------------------------------
- st1 {v0.16b, v1.16b}, [x0]
- st1 {v15.8h, v16.8h}, [x15]
- st1 {v31.4s, v0.4s}, [sp]
- st1 {v0.2d, v1.2d}, [x0]
- st1 {v0.8b, v1.8b}, [x0]
- st1 {v15.4h, v16.4h}, [x15]
- st1 {v31.2s, v0.2s}, [sp]
- st1 {v0.1d, v1.1d}, [x0]
-// CHECK: st1 {v0.16b, v1.16b}, [x0] // encoding: [0x00,0xa0,0x00,0x4c]
-// CHECK: st1 {v15.8h, v16.8h}, [x15] // encoding: [0xef,0xa5,0x00,0x4c]
-// CHECK: st1 {v31.4s, v0.4s}, [sp] // encoding: [0xff,0xab,0x00,0x4c]
-// CHECK: st1 {v0.2d, v1.2d}, [x0] // encoding: [0x00,0xac,0x00,0x4c]
-// CHECK: st1 {v0.8b, v1.8b}, [x0] // encoding: [0x00,0xa0,0x00,0x0c]
-// CHECK: st1 {v15.4h, v16.4h}, [x15] // encoding: [0xef,0xa5,0x00,0x0c]
-// CHECK: st1 {v31.2s, v0.2s}, [sp] // encoding: [0xff,0xab,0x00,0x0c]
-// CHECK: st1 {v0.1d, v1.1d}, [x0] // encoding: [0x00,0xac,0x00,0x0c]
-
- st1 {v0.16b-v1.16b}, [x0]
- st1 {v15.8h-v16.8h}, [x15]
- st1 {v31.4s-v0.4s}, [sp]
- st1 {v0.2d-v1.2d}, [x0]
- st1 {v0.8b-v1.8b}, [x0]
- st1 {v15.4h-v16.4h}, [x15]
- st1 {v31.2s-v0.2s}, [sp]
- st1 {v0.1d-v1.1d}, [x0]
-// CHECK: st1 {v0.16b, v1.16b}, [x0] // encoding: [0x00,0xa0,0x00,0x4c]
-// CHECK: st1 {v15.8h, v16.8h}, [x15] // encoding: [0xef,0xa5,0x00,0x4c]
-// CHECK: st1 {v31.4s, v0.4s}, [sp] // encoding: [0xff,0xab,0x00,0x4c]
-// CHECK: st1 {v0.2d, v1.2d}, [x0] // encoding: [0x00,0xac,0x00,0x4c]
-// CHECK: st1 {v0.8b, v1.8b}, [x0] // encoding: [0x00,0xa0,0x00,0x0c]
-// CHECK: st1 {v15.4h, v16.4h}, [x15] // encoding: [0xef,0xa5,0x00,0x0c]
-// CHECK: st1 {v31.2s, v0.2s}, [sp] // encoding: [0xff,0xab,0x00,0x0c]
-// CHECK: st1 {v0.1d, v1.1d}, [x0] // encoding: [0x00,0xac,0x00,0x0c]
-
-//------------------------------------------------------------------------------
-// Store multiple 1-element structures from three consecutive registers
-//------------------------------------------------------------------------------
- st1 {v0.16b, v1.16b, v2.16b}, [x0]
- st1 {v15.8h, v16.8h, v17.8h}, [x15]
- st1 {v31.4s, v0.4s, v1.4s}, [sp]
- st1 {v0.2d, v1.2d, v2.2d}, [x0]
- st1 {v0.8b, v1.8b, v2.8b}, [x0]
- st1 {v15.4h, v16.4h, v17.4h}, [x15]
- st1 {v31.2s, v0.2s, v1.2s}, [sp]
- st1 {v0.1d, v1.1d, v2.1d}, [x0]
-// CHECK: st1 {v0.16b, v1.16b, v2.16b}, [x0] // encoding: [0x00,0x60,0x00,0x4c]
-// CHECK: st1 {v15.8h, v16.8h, v17.8h}, [x15] // encoding: [0xef,0x65,0x00,0x4c]
-// CHECK: st1 {v31.4s, v0.4s, v1.4s}, [sp] // encoding: [0xff,0x6b,0x00,0x4c]
-// CHECK: st1 {v0.2d, v1.2d, v2.2d}, [x0] // encoding: [0x00,0x6c,0x00,0x4c]
-// CHECK: st1 {v0.8b, v1.8b, v2.8b}, [x0] // encoding: [0x00,0x60,0x00,0x0c]
-// CHECK: st1 {v15.4h, v16.4h, v17.4h}, [x15] // encoding: [0xef,0x65,0x00,0x0c]
-// CHECK: st1 {v31.2s, v0.2s, v1.2s}, [sp] // encoding: [0xff,0x6b,0x00,0x0c]
-// CHECK: st1 {v0.1d, v1.1d, v2.1d}, [x0] // encoding: [0x00,0x6c,0x00,0x0c]
-
- st1 {v0.16b-v2.16b}, [x0]
- st1 {v15.8h-v17.8h}, [x15]
- st1 {v31.4s-v1.4s}, [sp]
- st1 {v0.2d-v2.2d}, [x0]
- st1 {v0.8b-v2.8b}, [x0]
- st1 {v15.4h-v17.4h}, [x15]
- st1 {v31.2s-v1.2s}, [sp]
- st1 {v0.1d-v2.1d}, [x0]
-// CHECK: st1 {v0.16b, v1.16b, v2.16b}, [x0] // encoding: [0x00,0x60,0x00,0x4c]
-// CHECK: st1 {v15.8h, v16.8h, v17.8h}, [x15] // encoding: [0xef,0x65,0x00,0x4c]
-// CHECK: st1 {v31.4s, v0.4s, v1.4s}, [sp] // encoding: [0xff,0x6b,0x00,0x4c]
-// CHECK: st1 {v0.2d, v1.2d, v2.2d}, [x0] // encoding: [0x00,0x6c,0x00,0x4c]
-// CHECK: st1 {v0.8b, v1.8b, v2.8b}, [x0] // encoding: [0x00,0x60,0x00,0x0c]
-// CHECK: st1 {v15.4h, v16.4h, v17.4h}, [x15] // encoding: [0xef,0x65,0x00,0x0c]
-// CHECK: st1 {v31.2s, v0.2s, v1.2s}, [sp] // encoding: [0xff,0x6b,0x00,0x0c]
-// CHECK: st1 {v0.1d, v1.1d, v2.1d}, [x0] // encoding: [0x00,0x6c,0x00,0x0c]
-
-//------------------------------------------------------------------------------
-// Store multiple 1-element structures from four consecutive registers
-//------------------------------------------------------------------------------
- st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0]
- st1 {v15.8h, v16.8h, v17.8h, v18.8h}, [x15]
- st1 {v31.4s, v0.4s, v1.4s, v2.4s}, [sp]
- st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [x0]
- st1 {v0.8b, v1.8b, v2.8b, v3.8b}, [x0]
- st1 {v15.4h, v16.4h, v17.4h, v18.4h}, [x15]
- st1 {v31.2s, v0.2s, v1.2s, v2.2s}, [sp]
- st1 {v0.1d, v1.1d, v2.1d, v3.1d}, [x0]
-// CHECK: st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0] // encoding: [0x00,0x20,0x00,0x4c]
-// CHECK: st1 {v15.8h, v16.8h, v17.8h, v18.8h}, [x15] // encoding: [0xef,0x25,0x00,0x4c]
-// CHECK: st1 {v31.4s, v0.4s, v1.4s, v2.4s}, [sp] // encoding: [0xff,0x2b,0x00,0x4c]
-// CHECK: st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [x0] // encoding: [0x00,0x2c,0x00,0x4c]
-// CHECK: st1 {v0.8b, v1.8b, v2.8b, v3.8b}, [x0] // encoding: [0x00,0x20,0x00,0x0c]
-// CHECK: st1 {v15.4h, v16.4h, v17.4h, v18.4h}, [x15] // encoding: [0xef,0x25,0x00,0x0c]
-// CHECK: st1 {v31.2s, v0.2s, v1.2s, v2.2s}, [sp] // encoding: [0xff,0x2b,0x00,0x0c]
-// CHECK: st1 {v0.1d, v1.1d, v2.1d, v3.1d}, [x0] // encoding: [0x00,0x2c,0x00,0x0c]
-
- st1 {v0.16b-v3.16b}, [x0]
- st1 {v15.8h-v18.8h}, [x15]
- st1 {v31.4s-v2.4s}, [sp]
- st1 {v0.2d-v3.2d}, [x0]
- st1 {v0.8b-v3.8b}, [x0]
- st1 {v15.4h-v18.4h}, [x15]
- st1 {v31.2s-v2.2s}, [sp]
- st1 {v0.1d-v3.1d}, [x0]
-// CHECK: st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0] // encoding: [0x00,0x20,0x00,0x4c]
-// CHECK: st1 {v15.8h, v16.8h, v17.8h, v18.8h}, [x15] // encoding: [0xef,0x25,0x00,0x4c]
-// CHECK: st1 {v31.4s, v0.4s, v1.4s, v2.4s}, [sp] // encoding: [0xff,0x2b,0x00,0x4c]
-// CHECK: st1 {v0.2d, v1.2d, v2.2d, v3.2d}, [x0] // encoding: [0x00,0x2c,0x00,0x4c]
-// CHECK: st1 {v0.8b, v1.8b, v2.8b, v3.8b}, [x0] // encoding: [0x00,0x20,0x00,0x0c]
-// CHECK: st1 {v15.4h, v16.4h, v17.4h, v18.4h}, [x15] // encoding: [0xef,0x25,0x00,0x0c]
-// CHECK: st1 {v31.2s, v0.2s, v1.2s, v2.2s}, [sp] // encoding: [0xff,0x2b,0x00,0x0c]
-// CHECK: st1 {v0.1d, v1.1d, v2.1d, v3.1d}, [x0] // encoding: [0x00,0x2c,0x00,0x0c]
-
-//------------------------------------------------------------------------------
-// Store multiple 2-element structures from two consecutive registers
-//------------------------------------------------------------------------------
- st2 {v0.16b, v1.16b}, [x0]
- st2 {v15.8h, v16.8h}, [x15]
- st2 {v31.4s, v0.4s}, [sp]
- st2 {v0.2d, v1.2d}, [x0]
- st2 {v0.8b, v1.8b}, [x0]
- st2 {v15.4h, v16.4h}, [x15]
- st2 {v31.2s, v0.2s}, [sp]
-// CHECK: st2 {v0.16b, v1.16b}, [x0] // encoding: [0x00,0x80,0x00,0x4c]
-// CHECK: st2 {v15.8h, v16.8h}, [x15] // encoding: [0xef,0x85,0x00,0x4c]
-// CHECK: st2 {v31.4s, v0.4s}, [sp] // encoding: [0xff,0x8b,0x00,0x4c]
-// CHECK: st2 {v0.2d, v1.2d}, [x0] // encoding: [0x00,0x8c,0x00,0x4c]
-// CHECK: st2 {v0.8b, v1.8b}, [x0] // encoding: [0x00,0x80,0x00,0x0c]
-// CHECK: st2 {v15.4h, v16.4h}, [x15] // encoding: [0xef,0x85,0x00,0x0c]
-// CHECK: st2 {v31.2s, v0.2s}, [sp] // encoding: [0xff,0x8b,0x00,0x0c]
-
- st2 {v0.16b-v1.16b}, [x0]
- st2 {v15.8h-v16.8h}, [x15]
- st2 {v31.4s-v0.4s}, [sp]
- st2 {v0.2d-v1.2d}, [x0]
- st2 {v0.8b-v1.8b}, [x0]
- st2 {v15.4h-v16.4h}, [x15]
- st2 {v31.2s-v0.2s}, [sp]
-// CHECK: st2 {v0.16b, v1.16b}, [x0] // encoding: [0x00,0x80,0x00,0x4c]
-// CHECK: st2 {v15.8h, v16.8h}, [x15] // encoding: [0xef,0x85,0x00,0x4c]
-// CHECK: st2 {v31.4s, v0.4s}, [sp] // encoding: [0xff,0x8b,0x00,0x4c]
-// CHECK: st2 {v0.2d, v1.2d}, [x0] // encoding: [0x00,0x8c,0x00,0x4c]
-// CHECK: st2 {v0.8b, v1.8b}, [x0] // encoding: [0x00,0x80,0x00,0x0c]
-// CHECK: st2 {v15.4h, v16.4h}, [x15] // encoding: [0xef,0x85,0x00,0x0c]
-// CHECK: st2 {v31.2s, v0.2s}, [sp] // encoding: [0xff,0x8b,0x00,0x0c]
-
-//------------------------------------------------------------------------------
-// Store multiple 3-element structures from three consecutive registers
-//------------------------------------------------------------------------------
- st3 {v0.16b, v1.16b, v2.16b}, [x0]
- st3 {v15.8h, v16.8h, v17.8h}, [x15]
- st3 {v31.4s, v0.4s, v1.4s}, [sp]
- st3 {v0.2d, v1.2d, v2.2d}, [x0]
- st3 {v0.8b, v1.8b, v2.8b}, [x0]
- st3 {v15.4h, v16.4h, v17.4h}, [x15]
- st3 {v31.2s, v0.2s, v1.2s}, [sp]
-// CHECK: st3 {v0.16b, v1.16b, v2.16b}, [x0] // encoding: [0x00,0x40,0x00,0x4c]
-// CHECK: st3 {v15.8h, v16.8h, v17.8h}, [x15] // encoding: [0xef,0x45,0x00,0x4c]
-// CHECK: st3 {v31.4s, v0.4s, v1.4s}, [sp] // encoding: [0xff,0x4b,0x00,0x4c]
-// CHECK: st3 {v0.2d, v1.2d, v2.2d}, [x0] // encoding: [0x00,0x4c,0x00,0x4c]
-// CHECK: st3 {v0.8b, v1.8b, v2.8b}, [x0] // encoding: [0x00,0x40,0x00,0x0c]
-// CHECK: st3 {v15.4h, v16.4h, v17.4h}, [x15] // encoding: [0xef,0x45,0x00,0x0c]
-// CHECK: st3 {v31.2s, v0.2s, v1.2s}, [sp] // encoding: [0xff,0x4b,0x00,0x0c]
-
- st3 {v0.16b-v2.16b}, [x0]
- st3 {v15.8h-v17.8h}, [x15]
- st3 {v31.4s-v1.4s}, [sp]
- st3 {v0.2d-v2.2d}, [x0]
- st3 {v0.8b-v2.8b}, [x0]
- st3 {v15.4h-v17.4h}, [x15]
- st3 {v31.2s-v1.2s}, [sp]
-// CHECK: st3 {v0.16b, v1.16b, v2.16b}, [x0] // encoding: [0x00,0x40,0x00,0x4c]
-// CHECK: st3 {v15.8h, v16.8h, v17.8h}, [x15] // encoding: [0xef,0x45,0x00,0x4c]
-// CHECK: st3 {v31.4s, v0.4s, v1.4s}, [sp] // encoding: [0xff,0x4b,0x00,0x4c]
-// CHECK: st3 {v0.2d, v1.2d, v2.2d}, [x0] // encoding: [0x00,0x4c,0x00,0x4c]
-// CHECK: st3 {v0.8b, v1.8b, v2.8b}, [x0] // encoding: [0x00,0x40,0x00,0x0c]
-// CHECK: st3 {v15.4h, v16.4h, v17.4h}, [x15] // encoding: [0xef,0x45,0x00,0x0c]
-// CHECK: st3 {v31.2s, v0.2s, v1.2s}, [sp] // encoding: [0xff,0x4b,0x00,0x0c]
-
-//------------------------------------------------------------------------------
-// Store multiple 4-element structures from four consecutive registers
-//------------------------------------------------------------------------------
- st4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0]
- st4 {v15.8h, v16.8h, v17.8h, v18.8h}, [x15]
- st4 {v31.4s, v0.4s, v1.4s, v2.4s}, [sp]
- st4 {v0.2d, v1.2d, v2.2d, v3.2d}, [x0]
- st4 {v0.8b, v1.8b, v2.8b, v3.8b}, [x0]
- st4 {v15.4h, v16.4h, v17.4h, v18.4h}, [x15]
- st4 {v31.2s, v0.2s, v1.2s, v2.2s}, [sp]
-// CHECK: st4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0] // encoding: [0x00,0x00,0x00,0x4c]
-// CHECK: st4 {v15.8h, v16.8h, v17.8h, v18.8h}, [x15] // encoding: [0xef,0x05,0x00,0x4c]
-// CHECK: st4 {v31.4s, v0.4s, v1.4s, v2.4s}, [sp] // encoding: [0xff,0x0b,0x00,0x4c]
-// CHECK: st4 {v0.2d, v1.2d, v2.2d, v3.2d}, [x0] // encoding: [0x00,0x0c,0x00,0x4c]
-// CHECK: st4 {v0.8b, v1.8b, v2.8b, v3.8b}, [x0] // encoding: [0x00,0x00,0x00,0x0c]
-// CHECK: st4 {v15.4h, v16.4h, v17.4h, v18.4h}, [x15] // encoding: [0xef,0x05,0x00,0x0c]
-// CHECK: st4 {v31.2s, v0.2s, v1.2s, v2.2s}, [sp] // encoding: [0xff,0x0b,0x00,0x0c]
-
- st4 {v0.16b-v3.16b}, [x0]
- st4 {v15.8h-v18.8h}, [x15]
- st4 {v31.4s-v2.4s}, [sp]
- st4 {v0.2d-v3.2d}, [x0]
- st4 {v0.8b-v3.8b}, [x0]
- st4 {v15.4h-v18.4h}, [x15]
- st4 {v31.2s-v2.2s}, [sp]
-// CHECK: st4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0] // encoding: [0x00,0x00,0x00,0x4c]
-// CHECK: st4 {v15.8h, v16.8h, v17.8h, v18.8h}, [x15] // encoding: [0xef,0x05,0x00,0x4c]
-// CHECK: st4 {v31.4s, v0.4s, v1.4s, v2.4s}, [sp] // encoding: [0xff,0x0b,0x00,0x4c]
-// CHECK: st4 {v0.2d, v1.2d, v2.2d, v3.2d}, [x0] // encoding: [0x00,0x0c,0x00,0x4c]
-// CHECK: st4 {v0.8b, v1.8b, v2.8b, v3.8b}, [x0] // encoding: [0x00,0x00,0x00,0x0c]
-// CHECK: st4 {v15.4h, v16.4h, v17.4h, v18.4h}, [x15] // encoding: [0xef,0x05,0x00,0x0c]
-// CHECK: st4 {v31.2s, v0.2s, v1.2s, v2.2s}, [sp] // encoding: [0xff,0x0b,0x00,0x0c]
-
-//------------------------------------------------------------------------------
-// Load multiple 1-element structures to one register
-//------------------------------------------------------------------------------
- ld1 {v0.16b}, [x0]
- ld1 {v15.8h}, [x15]
- ld1 {v31.4s}, [sp]
- ld1 {v0.2d}, [x0]
- ld1 {v0.8b}, [x0]
- ld1 {v15.4h}, [x15]
- ld1 {v31.2s}, [sp]
- ld1 {v0.1d}, [x0]
-// CHECK: ld1 {v0.16b}, [x0] // encoding: [0x00,0x70,0x40,0x4c]
-// CHECK: ld1 {v15.8h}, [x15] // encoding: [0xef,0x75,0x40,0x4c]
-// CHECK: ld1 {v31.4s}, [sp] // encoding: [0xff,0x7b,0x40,0x4c]
-// CHECK: ld1 {v0.2d}, [x0] // encoding: [0x00,0x7c,0x40,0x4c]
-// CHECK: ld1 {v0.8b}, [x0] // encoding: [0x00,0x70,0x40,0x0c]
-// CHECK: ld1 {v15.4h}, [x15] // encoding: [0xef,0x75,0x40,0x0c]
-// CHECK: ld1 {v31.2s}, [sp] // encoding: [0xff,0x7b,0x40,0x0c]
-// CHECK: ld1 {v0.1d}, [x0] // encoding: [0x00,0x7c,0x40,0x0c]
-
-//------------------------------------------------------------------------------
-// Load multiple 1-element structures to two consecutive registers
-//------------------------------------------------------------------------------
- ld1 {v0.16b, v1.16b}, [x0]
- ld1 {v15.8h, v16.8h}, [x15]
- ld1 {v31.4s, v0.4s}, [sp]
- ld1 {v0.2d, v1.2d}, [x0]
- ld1 {v0.8b, v1.8b}, [x0]
- ld1 {v15.4h, v16.4h}, [x15]
- ld1 {v31.2s, v0.2s}, [sp]
- ld1 {v0.1d, v1.1d}, [x0]
-// CHECK: ld1 {v0.16b, v1.16b}, [x0] // encoding: [0x00,0xa0,0x40,0x4c]
-// CHECK: ld1 {v15.8h, v16.8h}, [x15] // encoding: [0xef,0xa5,0x40,0x4c]
-// CHECK: ld1 {v31.4s, v0.4s}, [sp] // encoding: [0xff,0xab,0x40,0x4c]
-// CHECK: ld1 {v0.2d, v1.2d}, [x0] // encoding: [0x00,0xac,0x40,0x4c]
-// CHECK: ld1 {v0.8b, v1.8b}, [x0] // encoding: [0x00,0xa0,0x40,0x0c]
-// CHECK: ld1 {v15.4h, v16.4h}, [x15] // encoding: [0xef,0xa5,0x40,0x0c]
-// CHECK: ld1 {v31.2s, v0.2s}, [sp] // encoding: [0xff,0xab,0x40,0x0c]
-// CHECK: ld1 {v0.1d, v1.1d}, [x0] // encoding: [0x00,0xac,0x40,0x0c]
-
- ld1 {v0.16b-v1.16b}, [x0]
- ld1 {v15.8h-v16.8h}, [x15]
- ld1 {v31.4s-v0.4s}, [sp]
- ld1 {v0.2d-v1.2d}, [x0]
- ld1 {v0.8b-v1.8b}, [x0]
- ld1 {v15.4h-v16.4h}, [x15]
- ld1 {v31.2s-v0.2s}, [sp]
- ld1 {v0.1d-v1.1d}, [x0]
-// CHECK: ld1 {v0.16b, v1.16b}, [x0] // encoding: [0x00,0xa0,0x40,0x4c]
-// CHECK: ld1 {v15.8h, v16.8h}, [x15] // encoding: [0xef,0xa5,0x40,0x4c]
-// CHECK: ld1 {v31.4s, v0.4s}, [sp] // encoding: [0xff,0xab,0x40,0x4c]
-// CHECK: ld1 {v0.2d, v1.2d}, [x0] // encoding: [0x00,0xac,0x40,0x4c]
-// CHECK: ld1 {v0.8b, v1.8b}, [x0] // encoding: [0x00,0xa0,0x40,0x0c]
-// CHECK: ld1 {v15.4h, v16.4h}, [x15] // encoding: [0xef,0xa5,0x40,0x0c]
-// CHECK: ld1 {v31.2s, v0.2s}, [sp] // encoding: [0xff,0xab,0x40,0x0c]
-// CHECK: ld1 {v0.1d, v1.1d}, [x0] // encoding: [0x00,0xac,0x40,0x0c]
-
-//------------------------------------------------------------------------------
-// Load multiple 1-element structures to three consecutive registers
-//------------------------------------------------------------------------------
- ld1 {v0.16b, v1.16b, v2.16b}, [x0]
- ld1 {v15.8h, v16.8h, v17.8h}, [x15]
- ld1 {v31.4s, v0.4s, v1.4s}, [sp]
- ld1 {v0.2d, v1.2d, v2.2d}, [x0]
- ld1 {v0.8b, v1.8b, v2.8b}, [x0]
- ld1 {v15.4h, v16.4h, v17.4h}, [x15]
- ld1 {v31.2s, v0.2s, v1.2s}, [sp]
- ld1 {v0.1d, v1.1d, v2.1d}, [x0]
-// CHECK: ld1 {v0.16b, v1.16b, v2.16b}, [x0] // encoding: [0x00,0x60,0x40,0x4c]
-// CHECK: ld1 {v15.8h, v16.8h, v17.8h}, [x15] // encoding: [0xef,0x65,0x40,0x4c]
-// CHECK: ld1 {v31.4s, v0.4s, v1.4s}, [sp] // encoding: [0xff,0x6b,0x40,0x4c]
-// CHECK: ld1 {v0.2d, v1.2d, v2.2d}, [x0] // encoding: [0x00,0x6c,0x40,0x4c]
-// CHECK: ld1 {v0.8b, v1.8b, v2.8b}, [x0] // encoding: [0x00,0x60,0x40,0x0c]
-// CHECK: ld1 {v15.4h, v16.4h, v17.4h}, [x15] // encoding: [0xef,0x65,0x40,0x0c]
-// CHECK: ld1 {v31.2s, v0.2s, v1.2s}, [sp] // encoding: [0xff,0x6b,0x40,0x0c]
-// CHECK: ld1 {v0.1d, v1.1d, v2.1d}, [x0] // encoding: [0x00,0x6c,0x40,0x0c]
-
- ld1 {v0.16b-v2.16b}, [x0]
- ld1 {v15.8h-v17.8h}, [x15]
- ld1 {v31.4s-v1.4s}, [sp]
- ld1 {v0.2d-v2.2d}, [x0]
- ld1 {v0.8b-v2.8b}, [x0]
- ld1 {v15.4h-v17.4h}, [x15]
- ld1 {v31.2s-v1.2s}, [sp]
- ld1 {v0.1d-v2.1d}, [x0]
-// CHECK: ld1 {v0.16b, v1.16b, v2.16b}, [x0] // encoding: [0x00,0x60,0x40,0x4c]
-// CHECK: ld1 {v15.8h, v16.8h, v17.8h}, [x15] // encoding: [0xef,0x65,0x40,0x4c]
-// CHECK: ld1 {v31.4s, v0.4s, v1.4s}, [sp] // encoding: [0xff,0x6b,0x40,0x4c]
-// CHECK: ld1 {v0.2d, v1.2d, v2.2d}, [x0] // encoding: [0x00,0x6c,0x40,0x4c]
-// CHECK: ld1 {v0.8b, v1.8b, v2.8b}, [x0] // encoding: [0x00,0x60,0x40,0x0c]
-// CHECK: ld1 {v15.4h, v16.4h, v17.4h}, [x15] // encoding: [0xef,0x65,0x40,0x0c]
-// CHECK: ld1 {v31.2s, v0.2s, v1.2s}, [sp] // encoding: [0xff,0x6b,0x40,0x0c]
-// CHECK: ld1 {v0.1d, v1.1d, v2.1d}, [x0] // encoding: [0x00,0x6c,0x40,0x0c]
-
-//------------------------------------------------------------------------------
-// Load multiple 1-element structures to four consecutive registers
-//------------------------------------------------------------------------------
- ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0]
- ld1 {v15.8h, v16.8h, v17.8h, v18.8h}, [x15]
- ld1 {v31.4s, v0.4s, v1.4s, v2.4s}, [sp]
- ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [x0]
- ld1 {v0.8b, v1.8b, v2.8b, v3.8b}, [x0]
- ld1 {v15.4h, v16.4h, v17.4h, v18.4h}, [x15]
- ld1 {v31.2s, v0.2s, v1.2s, v2.2s}, [sp]
- ld1 {v0.1d, v1.1d, v2.1d, v3.1d}, [x0]
-// CHECK: ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0] // encoding: [0x00,0x20,0x40,0x4c]
-// CHECK: ld1 {v15.8h, v16.8h, v17.8h, v18.8h}, [x15] // encoding: [0xef,0x25,0x40,0x4c]
-// CHECK: ld1 {v31.4s, v0.4s, v1.4s, v2.4s}, [sp] // encoding: [0xff,0x2b,0x40,0x4c]
-// CHECK: ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [x0] // encoding: [0x00,0x2c,0x40,0x4c]
-// CHECK: ld1 {v0.8b, v1.8b, v2.8b, v3.8b}, [x0] // encoding: [0x00,0x20,0x40,0x0c]
-// CHECK: ld1 {v15.4h, v16.4h, v17.4h, v18.4h}, [x15] // encoding: [0xef,0x25,0x40,0x0c]
-// CHECK: ld1 {v31.2s, v0.2s, v1.2s, v2.2s}, [sp] // encoding: [0xff,0x2b,0x40,0x0c]
-// CHECK: ld1 {v0.1d, v1.1d, v2.1d, v3.1d}, [x0] // encoding: [0x00,0x2c,0x40,0x0c]
-
- ld1 {v0.16b-v3.16b}, [x0]
- ld1 {v15.8h-v18.8h}, [x15]
- ld1 {v31.4s-v2.4s}, [sp]
- ld1 {v0.2d-v3.2d}, [x0]
- ld1 {v0.8b-v3.8b}, [x0]
- ld1 {v15.4h-v18.4h}, [x15]
- ld1 {v31.2s-v2.2s}, [sp]
- ld1 {v0.1d-v3.1d}, [x0]
-// CHECK: ld1 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0] // encoding: [0x00,0x20,0x40,0x4c]
-// CHECK: ld1 {v15.8h, v16.8h, v17.8h, v18.8h}, [x15] // encoding: [0xef,0x25,0x40,0x4c]
-// CHECK: ld1 {v31.4s, v0.4s, v1.4s, v2.4s}, [sp] // encoding: [0xff,0x2b,0x40,0x4c]
-// CHECK: ld1 {v0.2d, v1.2d, v2.2d, v3.2d}, [x0] // encoding: [0x00,0x2c,0x40,0x4c]
-// CHECK: ld1 {v0.8b, v1.8b, v2.8b, v3.8b}, [x0] // encoding: [0x00,0x20,0x40,0x0c]
-// CHECK: ld1 {v15.4h, v16.4h, v17.4h, v18.4h}, [x15] // encoding: [0xef,0x25,0x40,0x0c]
-// CHECK: ld1 {v31.2s, v0.2s, v1.2s, v2.2s}, [sp] // encoding: [0xff,0x2b,0x40,0x0c]
-// CHECK: ld1 {v0.1d, v1.1d, v2.1d, v3.1d}, [x0] // encoding: [0x00,0x2c,0x40,0x0c]
-
-//------------------------------------------------------------------------------
-// Load multiple 4-element structures to two consecutive registers
-//------------------------------------------------------------------------------
- ld2 {v0.16b, v1.16b}, [x0]
- ld2 {v15.8h, v16.8h}, [x15]
- ld2 {v31.4s, v0.4s}, [sp]
- ld2 {v0.2d, v1.2d}, [x0]
- ld2 {v0.8b, v1.8b}, [x0]
- ld2 {v15.4h, v16.4h}, [x15]
- ld2 {v31.2s, v0.2s}, [sp]
-// CHECK: ld2 {v0.16b, v1.16b}, [x0] // encoding: [0x00,0x80,0x40,0x4c]
-// CHECK: ld2 {v15.8h, v16.8h}, [x15] // encoding: [0xef,0x85,0x40,0x4c]
-// CHECK: ld2 {v31.4s, v0.4s}, [sp] // encoding: [0xff,0x8b,0x40,0x4c]
-// CHECK: ld2 {v0.2d, v1.2d}, [x0] // encoding: [0x00,0x8c,0x40,0x4c]
-// CHECK: ld2 {v0.8b, v1.8b}, [x0] // encoding: [0x00,0x80,0x40,0x0c]
-// CHECK: ld2 {v15.4h, v16.4h}, [x15] // encoding: [0xef,0x85,0x40,0x0c]
-// CHECK: ld2 {v31.2s, v0.2s}, [sp] // encoding: [0xff,0x8b,0x40,0x0c]
-
- ld2 {v0.16b-v1.16b}, [x0]
- ld2 {v15.8h-v16.8h}, [x15]
- ld2 {v31.4s-v0.4s}, [sp]
- ld2 {v0.2d-v1.2d}, [x0]
- ld2 {v0.8b-v1.8b}, [x0]
- ld2 {v15.4h-v16.4h}, [x15]
- ld2 {v31.2s-v0.2s}, [sp]
-// CHECK: ld2 {v0.16b, v1.16b}, [x0] // encoding: [0x00,0x80,0x40,0x4c]
-// CHECK: ld2 {v15.8h, v16.8h}, [x15] // encoding: [0xef,0x85,0x40,0x4c]
-// CHECK: ld2 {v31.4s, v0.4s}, [sp] // encoding: [0xff,0x8b,0x40,0x4c]
-// CHECK: ld2 {v0.2d, v1.2d}, [x0] // encoding: [0x00,0x8c,0x40,0x4c]
-// CHECK: ld2 {v0.8b, v1.8b}, [x0] // encoding: [0x00,0x80,0x40,0x0c]
-// CHECK: ld2 {v15.4h, v16.4h}, [x15] // encoding: [0xef,0x85,0x40,0x0c]
-// CHECK: ld2 {v31.2s, v0.2s}, [sp] // encoding: [0xff,0x8b,0x40,0x0c]
-
-//------------------------------------------------------------------------------
-// Load multiple 3-element structures to three consecutive registers
-//------------------------------------------------------------------------------
- ld3 {v0.16b, v1.16b, v2.16b}, [x0]
- ld3 {v15.8h, v16.8h, v17.8h}, [x15]
- ld3 {v31.4s, v0.4s, v1.4s}, [sp]
- ld3 {v0.2d, v1.2d, v2.2d}, [x0]
- ld3 {v0.8b, v1.8b, v2.8b}, [x0]
- ld3 {v15.4h, v16.4h, v17.4h}, [x15]
- ld3 {v31.2s, v0.2s, v1.2s}, [sp]
-// CHECK: ld3 {v0.16b, v1.16b, v2.16b}, [x0] // encoding: [0x00,0x40,0x40,0x4c]
-// CHECK: ld3 {v15.8h, v16.8h, v17.8h}, [x15] // encoding: [0xef,0x45,0x40,0x4c]
-// CHECK: ld3 {v31.4s, v0.4s, v1.4s}, [sp] // encoding: [0xff,0x4b,0x40,0x4c]
-// CHECK: ld3 {v0.2d, v1.2d, v2.2d}, [x0] // encoding: [0x00,0x4c,0x40,0x4c]
-// CHECK: ld3 {v0.8b, v1.8b, v2.8b}, [x0] // encoding: [0x00,0x40,0x40,0x0c]
-// CHECK: ld3 {v15.4h, v16.4h, v17.4h}, [x15] // encoding: [0xef,0x45,0x40,0x0c]
-// CHECK: ld3 {v31.2s, v0.2s, v1.2s}, [sp] // encoding: [0xff,0x4b,0x40,0x0c]
-
- ld3 {v0.16b-v2.16b}, [x0]
- ld3 {v15.8h-v17.8h}, [x15]
- ld3 {v31.4s-v1.4s}, [sp]
- ld3 {v0.2d-v2.2d}, [x0]
- ld3 {v0.8b-v2.8b}, [x0]
- ld3 {v15.4h-v17.4h}, [x15]
- ld3 {v31.2s-v1.2s}, [sp]
-// CHECK: ld3 {v0.16b, v1.16b, v2.16b}, [x0] // encoding: [0x00,0x40,0x40,0x4c]
-// CHECK: ld3 {v15.8h, v16.8h, v17.8h}, [x15] // encoding: [0xef,0x45,0x40,0x4c]
-// CHECK: ld3 {v31.4s, v0.4s, v1.4s}, [sp] // encoding: [0xff,0x4b,0x40,0x4c]
-// CHECK: ld3 {v0.2d, v1.2d, v2.2d}, [x0] // encoding: [0x00,0x4c,0x40,0x4c]
-// CHECK: ld3 {v0.8b, v1.8b, v2.8b}, [x0] // encoding: [0x00,0x40,0x40,0x0c]
-// CHECK: ld3 {v15.4h, v16.4h, v17.4h}, [x15] // encoding: [0xef,0x45,0x40,0x0c]
-// CHECK: ld3 {v31.2s, v0.2s, v1.2s}, [sp] // encoding: [0xff,0x4b,0x40,0x0c]
-
-//------------------------------------------------------------------------------
-// Load multiple 4-element structures to four consecutive registers
-//------------------------------------------------------------------------------
- ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0]
- ld4 {v15.8h, v16.8h, v17.8h, v18.8h}, [x15]
- ld4 {v31.4s, v0.4s, v1.4s, v2.4s}, [sp]
- ld4 {v0.2d, v1.2d, v2.2d, v3.2d}, [x0]
- ld4 {v0.8b, v1.8b, v2.8b, v3.8b}, [x0]
- ld4 {v15.4h, v16.4h, v17.4h, v18.4h}, [x15]
- ld4 {v31.2s, v0.2s, v1.2s, v2.2s}, [sp]
-// CHECK: ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0] // encoding: [0x00,0x00,0x40,0x4c]
-// CHECK: ld4 {v15.8h, v16.8h, v17.8h, v18.8h}, [x15] // encoding: [0xef,0x05,0x40,0x4c]
-// CHECK: ld4 {v31.4s, v0.4s, v1.4s, v2.4s}, [sp] // encoding: [0xff,0x0b,0x40,0x4c]
-// CHECK: ld4 {v0.2d, v1.2d, v2.2d, v3.2d}, [x0] // encoding: [0x00,0x0c,0x40,0x4c]
-// CHECK: ld4 {v0.8b, v1.8b, v2.8b, v3.8b}, [x0] // encoding: [0x00,0x00,0x40,0x0c]
-// CHECK: ld4 {v15.4h, v16.4h, v17.4h, v18.4h}, [x15] // encoding: [0xef,0x05,0x40,0x0c]
-// CHECK: ld4 {v31.2s, v0.2s, v1.2s, v2.2s}, [sp] // encoding: [0xff,0x0b,0x40,0x0c]
-
- ld4 {v0.16b-v3.16b}, [x0]
- ld4 {v15.8h-v18.8h}, [x15]
- ld4 {v31.4s-v2.4s}, [sp]
- ld4 {v0.2d-v3.2d}, [x0]
- ld4 {v0.8b-v3.8b}, [x0]
- ld4 {v15.4h-v18.4h}, [x15]
- ld4 {v31.2s-v2.2s}, [sp]
-// CHECK: ld4 {v0.16b, v1.16b, v2.16b, v3.16b}, [x0] // encoding: [0x00,0x00,0x40,0x4c]
-// CHECK: ld4 {v15.8h, v16.8h, v17.8h, v18.8h}, [x15] // encoding: [0xef,0x05,0x40,0x4c]
-// CHECK: ld4 {v31.4s, v0.4s, v1.4s, v2.4s}, [sp] // encoding: [0xff,0x0b,0x40,0x4c]
-// CHECK: ld4 {v0.2d, v1.2d, v2.2d, v3.2d}, [x0] // encoding: [0x00,0x0c,0x40,0x4c]
-// CHECK: ld4 {v0.8b, v1.8b, v2.8b, v3.8b}, [x0] // encoding: [0x00,0x00,0x40,0x0c]
-// CHECK: ld4 {v15.4h, v16.4h, v17.4h, v18.4h}, [x15] // encoding: [0xef,0x05,0x40,0x0c]
-// CHECK: ld4 {v31.2s, v0.2s, v1.2s, v2.2s}, [sp] // encoding: [0xff,0x0b,0x40,0x0c]