aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--lib/Target/AArch64/AArch64ISelLowering.cpp165
-rw-r--r--lib/Target/AArch64/AArch64ISelLowering.h10
-rw-r--r--lib/Target/AArch64/AArch64InstrNEON.td379
-rw-r--r--test/CodeGen/AArch64/neon-perm.ll14
4 files changed, 240 insertions, 328 deletions
diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp
index 003359d..ee98b4c 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -921,6 +921,18 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
return "AArch64ISD::NEON_REV32";
case AArch64ISD::NEON_REV64:
return "AArch64ISD::NEON_REV64";
+ case AArch64ISD::NEON_UZP1:
+ return "AArch64ISD::NEON_UZP1";
+ case AArch64ISD::NEON_UZP2:
+ return "AArch64ISD::NEON_UZP2";
+ case AArch64ISD::NEON_ZIP1:
+ return "AArch64ISD::NEON_ZIP1";
+ case AArch64ISD::NEON_ZIP2:
+ return "AArch64ISD::NEON_ZIP2";
+ case AArch64ISD::NEON_TRN1:
+ return "AArch64ISD::NEON_TRN1";
+ case AArch64ISD::NEON_TRN2:
+ return "AArch64ISD::NEON_TRN2";
case AArch64ISD::NEON_LD1_UPD:
return "AArch64ISD::NEON_LD1_UPD";
case AArch64ISD::NEON_LD2_UPD:
@@ -3826,6 +3838,59 @@ AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
return false;
}
+// Check whether a BUILD_VECTOR can be represented as a VECTOR_SHUFFLE and, if
+// so, lower it through LowerVECTOR_SHUFFLE.
+//
+// Op is the BUILD_VECTOR node. The match succeeds when every operand is an
+// EXTRACT_VECTOR_ELT with a constant, in-range lane index and the extracts
+// read from either
+//   - two vectors whose type is the BUILD_VECTOR's result type, or
+//   - a single vector with the same element type and twice as many elements,
+//     which is then split into its low and high halves.
+// On success Res receives whatever LowerVECTOR_SHUFFLE returned and the
+// function returns true. Otherwise it returns false and Res is not written.
+bool AArch64TargetLowering::isKnownShuffleVector(SDValue Op, SelectionDAG &DAG,
+                                                 SDValue &Res) const {
+  SDLoc DL(Op);
+  EVT VT = Op.getValueType();
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned V0NumElts = 0;
+  int Mask[16];
+  SDValue V0, V1;
+
+  // Mask is a fixed-size buffer; refuse anything that would not fit in it.
+  if (NumElts > sizeof(Mask) / sizeof(Mask[0]))
+    return false;
+
+  // Check that all elements are extracted from at most two vectors.
+  for (unsigned i = 0; i < NumElts; ++i) {
+    SDValue Elt = Op.getOperand(i);
+    if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+      return false;
+
+    // A shuffle mask needs compile-time lane numbers, so an extract with a
+    // variable index cannot be matched.
+    ConstantSDNode *LaneC = dyn_cast<ConstantSDNode>(Elt.getOperand(1));
+    if (!LaneC)
+      return false;
+
+    SDValue Src = Elt.getOperand(0);
+    unsigned Lane = LaneC->getZExtValue();
+    // An out-of-range lane would produce a mask entry that does not refer to
+    // the vector it was extracted from.
+    if (Lane >= Src.getValueType().getVectorNumElements())
+      return false;
+
+    if (V0.getNode() == 0) {
+      V0 = Src;
+      V0NumElts = V0.getValueType().getVectorNumElements();
+    }
+    if (Src == V0) {
+      Mask[i] = Lane;
+      continue;
+    }
+
+    if (V1.getNode() == 0)
+      V1 = Src;
+    if (Src != V1)
+      return false; // A third source vector: not a two-input shuffle.
+
+    // Lanes of the second source are numbered after those of the first.
+    Mask[i] = Lane + V0NumElts;
+  }
+
+  // A single source twice as wide as the result: split it into halves so the
+  // recorded lane numbers address <low half, high half>. The element types
+  // must agree for EXTRACT_SUBVECTOR to be able to produce VT.
+  if (!V1.getNode() && V0NumElts == NumElts * 2 &&
+      V0.getValueType().getVectorElementType() == VT.getVectorElementType()) {
+    V1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V0,
+                     DAG.getConstant(NumElts, MVT::i64));
+    V0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V0,
+                     DAG.getConstant(0, MVT::i64));
+    V0NumElts = V0.getValueType().getVectorNumElements();
+  }
+
+  // Both shuffle inputs must have exactly the result type; comparing element
+  // counts alone would let a source with a different element type through.
+  if (!V1.getNode() || V0.getValueType() != VT || V1.getValueType() != VT)
+    return false;
+
+  SDValue Shuffle = DAG.getVectorShuffle(VT, DL, V0, V1, Mask);
+  Res = LowerVECTOR_SHUFFLE(Shuffle, DAG);
+  return true;
+}
+
// If this is a case we can't handle, return null and let the default
// expansion code take care of it.
SDValue
@@ -3964,7 +4029,7 @@ AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
SmallVector<SDValue, 3> Ops;
Ops.push_back(N);
Ops.push_back(Op.getOperand(I));
- Ops.push_back(DAG.getConstant(I, MVT::i32));
+ Ops.push_back(DAG.getConstant(I, MVT::i64));
N = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, &Ops[0], 3);
}
}
@@ -3980,6 +4045,11 @@ AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
if (isConstant)
return SDValue();
+ // Try to lower this BUILD_VECTOR the same way a VECTOR_SHUFFLE is lowered.
+ SDValue Shuf;
+ if (isKnownShuffleVector(Op, DAG, Shuf))
+ return Shuf;
+
// If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
// know the default expansion would otherwise fall back on something even
// worse. For a vector with one or two non-undef values, that's
@@ -3992,7 +4062,7 @@ AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
SDValue V = Op.getOperand(i);
if (V.getOpcode() == ISD::UNDEF)
continue;
- SDValue LaneIdx = DAG.getConstant(i, MVT::i32);
+ SDValue LaneIdx = DAG.getConstant(i, MVT::i64);
Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, Vec, V, LaneIdx);
}
return Vec;
@@ -4030,6 +4100,83 @@ static bool isREVMask(ArrayRef<int> M, EVT VT, unsigned BlockSize) {
return true;
}
+// isPermuteMask - Classify a shuffle mask against the lane patterns of the
+// UZP1/UZP2, ZIP1/ZIP2 and TRN1/TRN2 instructions. Returns the matching
+// AArch64ISD::NEON_* opcode, or 0 when no pattern matches or when the vector
+// has fewer than four elements.
+static unsigned isPermuteMask(ArrayRef<int> M, EVT VT) {
+  const unsigned Lanes = VT.getVectorNumElements();
+  if (Lanes < 4)
+    return 0;
+
+  // Candidate opcodes, in the order they are tried; the first match wins.
+  static const unsigned Candidates[] = {
+    AArch64ISD::NEON_UZP1, AArch64ISD::NEON_UZP2, AArch64ISD::NEON_ZIP1,
+    AArch64ISD::NEON_ZIP2, AArch64ISD::NEON_TRN1, AArch64ISD::NEON_TRN2
+  };
+  const unsigned NumCandidates = sizeof(Candidates) / sizeof(Candidates[0]);
+
+  for (unsigned C = 0; C != NumCandidates; ++C) {
+    const unsigned Opc = Candidates[C];
+
+    unsigned Idx = 0;
+    while (Idx != Lanes) {
+      // Mask value this opcode requires at result position Idx.
+      unsigned Expected;
+      switch (Opc) {
+      case AArch64ISD::NEON_UZP1:
+        Expected = Idx * 2;
+        break;
+      case AArch64ISD::NEON_UZP2:
+        Expected = Idx * 2 + 1;
+        break;
+      case AArch64ISD::NEON_ZIP1:
+        Expected = Idx / 2 + Lanes * (Idx % 2);
+        break;
+      case AArch64ISD::NEON_ZIP2:
+        Expected = (Lanes + Idx) / 2 + Lanes * (Idx % 2);
+        break;
+      case AArch64ISD::NEON_TRN1:
+        Expected = Idx + (Lanes - 1) * (Idx % 2);
+        break;
+      default: // AArch64ISD::NEON_TRN2
+        Expected = 1 + Idx + (Lanes - 1) * (Idx % 2);
+        break;
+      }
+
+      if ((unsigned)M[Idx] != Expected)
+        break;
+      ++Idx;
+    }
+
+    // Every position agreed with this candidate's pattern.
+    if (Idx == Lanes)
+      return Opc;
+  }
+
+  return 0;
+}
+
SDValue
AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
SelectionDAG &DAG) const {
@@ -4056,6 +4203,10 @@ AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
if (isREVMask(ShuffleMask, VT, 16))
return DAG.getNode(AArch64ISD::NEON_REV16, dl, VT, V1);
+ unsigned ISDNo = isPermuteMask(ShuffleMask, VT);
+ if (ISDNo)
+ return DAG.getNode(ISDNo, dl, VT, V1, V2);
+
// If the element of shuffle mask are all the same constant, we can
// transform it into either NEON_VDUP or NEON_VDUPLANE
if (ShuffleVectorSDNode::isSplatMask(&ShuffleMask[0], VT)) {
@@ -4167,10 +4318,12 @@ AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
else
EltVT = (EltSize == 64) ? MVT::i64 : MVT::i32;
- ExtV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, ExtV,
- DAG.getConstant(Mask, MVT::i64));
- InsV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, InsV, ExtV,
- DAG.getConstant(InsIndex[I], MVT::i64));
+ if (Mask >= 0) {
+ ExtV = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, ExtV,
+ DAG.getConstant(Mask, MVT::i64));
+ InsV = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, InsV, ExtV,
+ DAG.getConstant(InsIndex[I], MVT::i64));
+ }
}
return InsV;
}
diff --git a/lib/Target/AArch64/AArch64ISelLowering.h b/lib/Target/AArch64/AArch64ISelLowering.h
index a51d10f..4cc2135 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/lib/Target/AArch64/AArch64ISelLowering.h
@@ -125,6 +125,14 @@ namespace AArch64ISD {
// Vector FP move immediate
NEON_FMOVIMM,
+ // Vector permute
+ NEON_UZP1,
+ NEON_UZP2,
+ NEON_ZIP1,
+ NEON_ZIP2,
+ NEON_TRN1,
+ NEON_TRN2,
+
// Vector Element reverse
NEON_REV64,
NEON_REV32,
@@ -225,6 +233,8 @@ public:
SDLoc dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const;
+ bool isKnownShuffleVector(SDValue Op, SelectionDAG &DAG, SDValue &Res) const;
+
SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
const AArch64Subtarget *ST) const;
diff --git a/lib/Target/AArch64/AArch64InstrNEON.td b/lib/Target/AArch64/AArch64InstrNEON.td
index c0c572a..f6e747a 100644
--- a/lib/Target/AArch64/AArch64InstrNEON.td
+++ b/lib/Target/AArch64/AArch64InstrNEON.td
@@ -46,6 +46,15 @@ def SDTARMVSH : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0, 1>,
def Neon_sqrshlImm : SDNode<"AArch64ISD::NEON_QSHLs", SDTARMVSH>;
def Neon_uqrshlImm : SDNode<"AArch64ISD::NEON_QSHLu", SDTARMVSH>;
+// Type profile shared by the vector permute nodes below: one result and two
+// operands, where the result is a vector and both operands have the same type
+// as the result.
+def SDTPERMUTE : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0, 1>,
+ SDTCisSameAs<0, 2>]>;
+// One SelectionDAG node per AArch64ISD permute opcode.
+def Neon_uzp1 : SDNode<"AArch64ISD::NEON_UZP1", SDTPERMUTE>;
+def Neon_uzp2 : SDNode<"AArch64ISD::NEON_UZP2", SDTPERMUTE>;
+def Neon_zip1 : SDNode<"AArch64ISD::NEON_ZIP1", SDTPERMUTE>;
+def Neon_zip2 : SDNode<"AArch64ISD::NEON_ZIP2", SDTPERMUTE>;
+def Neon_trn1 : SDNode<"AArch64ISD::NEON_TRN1", SDTPERMUTE>;
+def Neon_trn2 : SDNode<"AArch64ISD::NEON_TRN2", SDTPERMUTE>;
+
def SDTVSHUF : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0, 1>]>;
def Neon_rev64 : SDNode<"AArch64ISD::NEON_REV64", SDTVSHUF>;
def Neon_rev32 : SDNode<"AArch64ISD::NEON_REV32", SDTVSHUF>;
@@ -2384,331 +2393,57 @@ defm FMINV : NeonI_2VAcross_3<0b1, 0b01111, 0b10, "fminv",
// The followings are for instruction class (Perm)
class NeonI_Permute<bit q, bits<2> size, bits<3> opcode,
- string asmop, RegisterOperand OpVPR, string OpS>
+ string asmop, RegisterOperand OpVPR, string OpS,
+ SDPatternOperator opnode, ValueType Ty>
: NeonI_Perm<q, size, opcode,
(outs OpVPR:$Rd), (ins OpVPR:$Rn, OpVPR:$Rm),
asmop # "\t$Rd." # OpS # ", $Rn." # OpS # ", $Rm." # OpS,
- [], NoItinerary>;
-
-multiclass NeonI_Perm_pat<bits<3> opcode, string asmop> {
- def _8b : NeonI_Permute<0b0, 0b00, opcode, asmop, VPR64, "8b">;
- def _16b : NeonI_Permute<0b1, 0b00, opcode, asmop, VPR128, "16b">;
- def _4h : NeonI_Permute<0b0, 0b01, opcode, asmop, VPR64, "4h">;
- def _8h : NeonI_Permute<0b1, 0b01, opcode, asmop, VPR128, "8h">;
- def _2s : NeonI_Permute<0b0, 0b10, opcode, asmop, VPR64, "2s">;
- def _4s : NeonI_Permute<0b1, 0b10, opcode, asmop, VPR128, "4s">;
- def _2d : NeonI_Permute<0b1, 0b11, opcode, asmop, VPR128, "2d">;
-}
-
-defm UZP1vvv : NeonI_Perm_pat<0b001, "uzp1">;
-defm TRN1vvv : NeonI_Perm_pat<0b010, "trn1">;
-defm ZIP1vvv : NeonI_Perm_pat<0b011, "zip1">;
-defm UZP2vvv : NeonI_Perm_pat<0b101, "uzp2">;
-defm TRN2vvv : NeonI_Perm_pat<0b110, "trn2">;
-defm ZIP2vvv : NeonI_Perm_pat<0b111, "zip2">;
-
-// Extract and Insert
-def NI_ei_i32 : PatFrag<(ops node:$Rn, node:$Rm, node:$Ext, node:$Ins),
- (vector_insert node:$Rn,
- (i32 (vector_extract node:$Rm, node:$Ext)),
- node:$Ins)>;
-
-def NI_ei_f32 : PatFrag<(ops node:$Rn, node:$Rm, node:$Ext, node:$Ins),
- (vector_insert node:$Rn,
- (f32 (vector_extract node:$Rm, node:$Ext)),
- node:$Ins)>;
-
-// uzp1
-def : Pat<(v16i8 (NI_ei_i32 (v16i8 (NI_ei_i32 (v16i8 (NI_ei_i32
- (v16i8 (NI_ei_i32 (v16i8 (NI_ei_i32 (v16i8 (NI_ei_i32
- (v16i8 (NI_ei_i32 (v16i8 (NI_ei_i32 (v16i8 (NI_ei_i32
- (v16i8 (NI_ei_i32 (v16i8 (NI_ei_i32 (v16i8 (NI_ei_i32
- (v16i8 (NI_ei_i32 (v16i8 (NI_ei_i32 (v16i8 (NI_ei_i32
- (v16i8 VPR128:$Rn),
- (v16i8 VPR128:$Rn), 2, 1)),
- (v16i8 VPR128:$Rn), 4, 2)),
- (v16i8 VPR128:$Rn), 6, 3)),
- (v16i8 VPR128:$Rn), 8, 4)),
- (v16i8 VPR128:$Rn), 10, 5)),
- (v16i8 VPR128:$Rn), 12, 6)),
- (v16i8 VPR128:$Rn), 14, 7)),
- (v16i8 VPR128:$Rm), 0, 8)),
- (v16i8 VPR128:$Rm), 2, 9)),
- (v16i8 VPR128:$Rm), 4, 10)),
- (v16i8 VPR128:$Rm), 6, 11)),
- (v16i8 VPR128:$Rm), 8, 12)),
- (v16i8 VPR128:$Rm), 10, 13)),
- (v16i8 VPR128:$Rm), 12, 14)),
- (v16i8 VPR128:$Rm), 14, 15)),
- (UZP1vvv_16b VPR128:$Rn, VPR128:$Rm)>;
-
-class NI_Uzp1_v8<ValueType Ty, RegisterOperand VPR, Instruction INST>
- : Pat<(Ty (NI_ei_i32 (Ty (NI_ei_i32 (Ty (NI_ei_i32 (Ty (NI_ei_i32
- (Ty (NI_ei_i32 (Ty (NI_ei_i32 (Ty (NI_ei_i32
- (Ty VPR:$Rn),
- (Ty VPR:$Rn), 2, 1)),
- (Ty VPR:$Rn), 4, 2)),
- (Ty VPR:$Rn), 6, 3)),
- (Ty VPR:$Rm), 0, 4)),
- (Ty VPR:$Rm), 2, 5)),
- (Ty VPR:$Rm), 4, 6)),
- (Ty VPR:$Rm), 6, 7)),
- (INST VPR:$Rn, VPR:$Rm)>;
-
-def : NI_Uzp1_v8<v8i8, VPR64, UZP1vvv_8b>;
-def : NI_Uzp1_v8<v8i16, VPR128, UZP1vvv_8h>;
-
-class NI_Uzp1_v4<ValueType Ty, RegisterOperand VPR, Instruction INST,
- PatFrag ei>
- : Pat<(Ty (ei (Ty (ei (Ty (ei
- (Ty VPR:$Rn),
- (Ty VPR:$Rn), 2, 1)),
- (Ty VPR:$Rm), 0, 2)),
- (Ty VPR:$Rm), 2, 3)),
- (INST VPR:$Rn, VPR:$Rm)>;
-
-def : NI_Uzp1_v4<v4i16, VPR64, UZP1vvv_4h, NI_ei_i32>;
-def : NI_Uzp1_v4<v4i32, VPR128, UZP1vvv_4s, NI_ei_i32>;
-def : NI_Uzp1_v4<v4f32, VPR128, UZP1vvv_4s, NI_ei_f32>;
-
-// uzp2
-def : Pat<(v16i8 (NI_ei_i32 (v16i8 (NI_ei_i32 (v16i8 (NI_ei_i32
- (v16i8 (NI_ei_i32 (v16i8 (NI_ei_i32 (v16i8 (NI_ei_i32
- (v16i8 (NI_ei_i32 (v16i8 (NI_ei_i32 (v16i8 (NI_ei_i32
- (v16i8 (NI_ei_i32 (v16i8 (NI_ei_i32 (v16i8 (NI_ei_i32
- (v16i8 (NI_ei_i32 (v16i8 (NI_ei_i32 (v16i8 (NI_ei_i32
- (v16i8 VPR128:$Rm),
- (v16i8 VPR128:$Rn), 1, 0)),
- (v16i8 VPR128:$Rn), 3, 1)),
- (v16i8 VPR128:$Rn), 5, 2)),
- (v16i8 VPR128:$Rn), 7, 3)),
- (v16i8 VPR128:$Rn), 9, 4)),
- (v16i8 VPR128:$Rn), 11, 5)),
- (v16i8 VPR128:$Rn), 13, 6)),
- (v16i8 VPR128:$Rn), 15, 7)),
- (v16i8 VPR128:$Rm), 1, 8)),
- (v16i8 VPR128:$Rm), 3, 9)),
- (v16i8 VPR128:$Rm), 5, 10)),
- (v16i8 VPR128:$Rm), 7, 11)),
- (v16i8 VPR128:$Rm), 9, 12)),
- (v16i8 VPR128:$Rm), 11, 13)),
- (v16i8 VPR128:$Rm), 13, 14)),
- (UZP2vvv_16b VPR128:$Rn, VPR128:$Rm)>;
-
-class NI_Uzp2_v8<ValueType Ty, RegisterOperand VPR, Instruction INST>
- : Pat<(Ty (NI_ei_i32 (Ty (NI_ei_i32 (Ty (NI_ei_i32 (Ty (NI_ei_i32
- (Ty (NI_ei_i32 (Ty (NI_ei_i32 (Ty (NI_ei_i32
- (Ty VPR:$Rm),
- (Ty VPR:$Rn), 1, 0)),
- (Ty VPR:$Rn), 3, 1)),
- (Ty VPR:$Rn), 5, 2)),
- (Ty VPR:$Rn), 7, 3)),
- (Ty VPR:$Rm), 1, 4)),
- (Ty VPR:$Rm), 3, 5)),
- (Ty VPR:$Rm), 5, 6)),
- (INST VPR:$Rn, VPR:$Rm)>;
-
-def : NI_Uzp2_v8<v8i8, VPR64, UZP2vvv_8b>;
-def : NI_Uzp2_v8<v8i16, VPR128, UZP2vvv_8h>;
-
-class NI_Uzp2_v4<ValueType Ty, RegisterOperand VPR, Instruction INST,
- PatFrag ei>
- : Pat<(Ty (ei (Ty (ei (Ty (ei
- (Ty VPR:$Rm),
- (Ty VPR:$Rn), 1, 0)),
- (Ty VPR:$Rn), 3, 1)),
- (Ty VPR:$Rm), 1, 2)),
- (INST VPR:$Rn, VPR:$Rm)>;
-
-def : NI_Uzp2_v4<v4i16, VPR64, UZP2vvv_4h, NI_ei_i32>;
-def : NI_Uzp2_v4<v4i32, VPR128, UZP2vvv_4s, NI_ei_i32>;
-def : NI_Uzp2_v4<v4f32, VPR128, UZP2vvv_4s, NI_ei_f32>;
-
-// zip1
-def : Pat<(v16i8 (NI_ei_i32 (v16i8 (NI_ei_i32 (v16i8 (NI_ei_i32
- (v16i8 (NI_ei_i32 (v16i8 (NI_ei_i32 (v16i8 (NI_ei_i32
- (v16i8 (NI_ei_i32 (v16i8 (NI_ei_i32 (v16i8 (NI_ei_i32
- (v16i8 (NI_ei_i32 (v16i8 (NI_ei_i32 (v16i8 (NI_ei_i32
- (v16i8 (NI_ei_i32 (v16i8 (NI_ei_i32 (v16i8 (NI_ei_i32
- (v16i8 VPR128:$Rn),
- (v16i8 VPR128:$Rm), 0, 1)),
- (v16i8 VPR128:$Rn), 1, 2)),
- (v16i8 VPR128:$Rm), 1, 3)),
- (v16i8 VPR128:$Rn), 2, 4)),
- (v16i8 VPR128:$Rm), 2, 5)),
- (v16i8 VPR128:$Rn), 3, 6)),
- (v16i8 VPR128:$Rm), 3, 7)),
- (v16i8 VPR128:$Rn), 4, 8)),
- (v16i8 VPR128:$Rm), 4, 9)),
- (v16i8 VPR128:$Rn), 5, 10)),
- (v16i8 VPR128:$Rm), 5, 11)),
- (v16i8 VPR128:$Rn), 6, 12)),
- (v16i8 VPR128:$Rm), 6, 13)),
- (v16i8 VPR128:$Rn), 7, 14)),
- (v16i8 VPR128:$Rm), 7, 15)),
- (ZIP1vvv_16b VPR128:$Rn, VPR128:$Rm)>;
-
-class NI_Zip1_v8<ValueType Ty, RegisterOperand VPR, Instruction INST>
- : Pat<(Ty (NI_ei_i32 (Ty (NI_ei_i32 (Ty (NI_ei_i32 (Ty (NI_ei_i32
- (Ty (NI_ei_i32 (Ty (NI_ei_i32 (Ty (NI_ei_i32
- (Ty VPR:$Rn),
- (Ty VPR:$Rm), 0, 1)),
- (Ty VPR:$Rn), 1, 2)),
- (Ty VPR:$Rm), 1, 3)),
- (Ty VPR:$Rn), 2, 4)),
- (Ty VPR:$Rm), 2, 5)),
- (Ty VPR:$Rn), 3, 6)),
- (Ty VPR:$Rm), 3, 7)),
- (INST VPR:$Rn, VPR:$Rm)>;
-
-def : NI_Zip1_v8<v8i8, VPR64, ZIP1vvv_8b>;
-def : NI_Zip1_v8<v8i16, VPR128, ZIP1vvv_8h>;
-
-class NI_Zip1_v4<ValueType Ty, RegisterOperand VPR, Instruction INST,
- PatFrag ei>
- : Pat<(Ty (ei (Ty (ei (Ty (ei
- (Ty VPR:$Rn),
- (Ty VPR:$Rm), 0, 1)),
- (Ty VPR:$Rn), 1, 2)),
- (Ty VPR:$Rm), 1, 3)),
- (INST VPR:$Rn, VPR:$Rm)>;
-
-def : NI_Zip1_v4<v4i16, VPR64, ZIP1vvv_4h, NI_ei_i32>;
-def : NI_Zip1_v4<v4i32, VPR128, ZIP1vvv_4s, NI_ei_i32>;
-def : NI_Zip1_v4<v4f32, VPR128, ZIP1vvv_4s, NI_ei_f32>;
-
-// zip2
-def : Pat<(v16i8 (NI_ei_i32 (v16i8 (NI_ei_i32 (v16i8 (NI_ei_i32
- (v16i8 (NI_ei_i32 (v16i8 (NI_ei_i32 (v16i8 (NI_ei_i32
- (v16i8 (NI_ei_i32 (v16i8 (NI_ei_i32 (v16i8 (NI_ei_i32
- (v16i8 (NI_ei_i32 (v16i8 (NI_ei_i32 (v16i8 (NI_ei_i32
- (v16i8 (NI_ei_i32 (v16i8 (NI_ei_i32 (v16i8 (NI_ei_i32
- (v16i8 VPR128:$Rm),
- (v16i8 VPR128:$Rn), 8, 0)),
- (v16i8 VPR128:$Rm), 8, 1)),
- (v16i8 VPR128:$Rn), 9, 2)),
- (v16i8 VPR128:$Rm), 9, 3)),
- (v16i8 VPR128:$Rn), 10, 4)),
- (v16i8 VPR128:$Rm), 10, 5)),
- (v16i8 VPR128:$Rn), 11, 6)),
- (v16i8 VPR128:$Rm), 11, 7)),
- (v16i8 VPR128:$Rn), 12, 8)),
- (v16i8 VPR128:$Rm), 12, 9)),
- (v16i8 VPR128:$Rn), 13, 10)),
- (v16i8 VPR128:$Rm), 13, 11)),
- (v16i8 VPR128:$Rn), 14, 12)),
- (v16i8 VPR128:$Rm), 14, 13)),
- (v16i8 VPR128:$Rn), 15, 14)),
- (ZIP2vvv_16b VPR128:$Rn, VPR128:$Rm)>;
-
-class NI_Zip2_v8<ValueType Ty, RegisterOperand VPR, Instruction INST>
- : Pat<(Ty (NI_ei_i32 (Ty (NI_ei_i32 (Ty (NI_ei_i32 (Ty (NI_ei_i32
- (Ty (NI_ei_i32 (Ty (NI_ei_i32 (Ty (NI_ei_i32
- (Ty VPR:$Rm),
- (Ty VPR:$Rn), 4, 0)),
- (Ty VPR:$Rm), 4, 1)),
- (Ty VPR:$Rn), 5, 2)),
- (Ty VPR:$Rm), 5, 3)),
- (Ty VPR:$Rn), 6, 4)),
- (Ty VPR:$Rm), 6, 5)),
- (Ty VPR:$Rn), 7, 6)),
- (INST VPR:$Rn, VPR:$Rm)>;
-
-def : NI_Zip2_v8<v8i8, VPR64, ZIP2vvv_8b>;
-def : NI_Zip2_v8<v8i16, VPR128, ZIP2vvv_8h>;
-
-class NI_Zip2_v4<ValueType Ty, RegisterOperand VPR, Instruction INST,
- PatFrag ei>
- : Pat<(Ty (ei (Ty (ei (Ty (ei
- (Ty VPR:$Rm),
- (Ty VPR:$Rn), 2, 0)),
- (Ty VPR:$Rm), 2, 1)),
- (Ty VPR:$Rn), 3, 2)),
- (INST VPR:$Rn, VPR:$Rm)>;
-
-def : NI_Zip2_v4<v4i16, VPR64, ZIP2vvv_4h, NI_ei_i32>;
-def : NI_Zip2_v4<v4i32, VPR128, ZIP2vvv_4s, NI_ei_i32>;
-def : NI_Zip2_v4<v4f32, VPR128, ZIP2vvv_4s, NI_ei_f32>;
-
-// trn1
-def : Pat<(v16i8 (NI_ei_i32 (v16i8 (NI_ei_i32 (v16i8 (NI_ei_i32
- (v16i8 (NI_ei_i32 (v16i8 (NI_ei_i32 (v16i8 (NI_ei_i32
- (v16i8 (NI_ei_i32 (v16i8 (NI_ei_i32
- (v16i8 VPR128:$Rn),
- (v16i8 VPR128:$Rm), 0, 1)),
- (v16i8 VPR128:$Rm), 2, 3)),
- (v16i8 VPR128:$Rm), 4, 5)),
- (v16i8 VPR128:$Rm), 6, 7)),
- (v16i8 VPR128:$Rm), 8, 9)),
- (v16i8 VPR128:$Rm), 10, 11)),
- (v16i8 VPR128:$Rm), 12, 13)),
- (v16i8 VPR128:$Rm), 14, 15)),
- (TRN1vvv_16b VPR128:$Rn, VPR128:$Rm)>;
-
-class NI_Trn1_v8<ValueType Ty, RegisterOperand VPR, Instruction INST>
- : Pat<(Ty (NI_ei_i32 (Ty (NI_ei_i32 (Ty (NI_ei_i32 (Ty (NI_ei_i32
- (Ty VPR:$Rn),
- (Ty VPR:$Rm), 0, 1)),
- (Ty VPR:$Rm), 2, 3)),
- (Ty VPR:$Rm), 4, 5)),
- (Ty VPR:$Rm), 6, 7)),
- (INST VPR:$Rn, VPR:$Rm)>;
-
-def : NI_Trn1_v8<v8i8, VPR64, TRN1vvv_8b>;
-def : NI_Trn1_v8<v8i16, VPR128, TRN1vvv_8h>;
-
-class NI_Trn1_v4<ValueType Ty, RegisterOperand VPR, Instruction INST,
- PatFrag ei>
- : Pat<(Ty (ei (Ty (ei
- (Ty VPR:$Rn),
- (Ty VPR:$Rm), 0, 1)),
- (Ty VPR:$Rm), 2, 3)),
- (INST VPR:$Rn, VPR:$Rm)>;
-
-def : NI_Trn1_v4<v4i16, VPR64, TRN1vvv_4h, NI_ei_i32>;
-def : NI_Trn1_v4<v4i32, VPR128, TRN1vvv_4s, NI_ei_i32>;
-def : NI_Trn1_v4<v4f32, VPR128, TRN1vvv_4s, NI_ei_f32>;
-
-// trn2
-def : Pat<(v16i8 (NI_ei_i32 (v16i8 (NI_ei_i32 (v16i8 (NI_ei_i32
- (v16i8 (NI_ei_i32 (v16i8 (NI_ei_i32 (v16i8 (NI_ei_i32
- (v16i8 (NI_ei_i32 (v16i8 (NI_ei_i32
- (v16i8 VPR128:$Rm),
- (v16i8 VPR128:$Rn), 1, 0)),
- (v16i8 VPR128:$Rn), 3, 2)),
- (v16i8 VPR128:$Rn), 5, 4)),
- (v16i8 VPR128:$Rn), 7, 6)),
- (v16i8 VPR128:$Rn), 9, 8)),
- (v16i8 VPR128:$Rn), 11, 10)),
- (v16i8 VPR128:$Rn), 13, 12)),
- (v16i8 VPR128:$Rn), 15, 14)),
- (TRN2vvv_16b VPR128:$Rn, VPR128:$Rm)>;
-
-class NI_Trn2_v8<ValueType Ty, RegisterOperand VPR, Instruction INST>
- : Pat<(Ty (NI_ei_i32 (Ty (NI_ei_i32 (Ty (NI_ei_i32 (Ty (NI_ei_i32
- (Ty VPR:$Rm),
- (Ty VPR:$Rn), 1, 0)),
- (Ty VPR:$Rn), 3, 2)),
- (Ty VPR:$Rn), 5, 4)),
- (Ty VPR:$Rn), 7, 6)),
- (INST VPR:$Rn, VPR:$Rm)>;
-
-def : NI_Trn2_v8<v8i8, VPR64, TRN2vvv_8b>;
-def : NI_Trn2_v8<v8i16, VPR128, TRN2vvv_8h>;
-
-class NI_Trn2_v4<ValueType Ty, RegisterOperand VPR, Instruction INST,
- PatFrag ei>
- : Pat<(Ty (ei (Ty (ei
- (Ty VPR:$Rm),
- (Ty VPR:$Rn), 1, 0)),
- (Ty VPR:$Rn), 3, 2)),
- (INST VPR:$Rn, VPR:$Rm)>;
-
-def : NI_Trn2_v4<v4i16, VPR64, TRN2vvv_4h, NI_ei_i32>;
-def : NI_Trn2_v4<v4i32, VPR128, TRN2vvv_4s, NI_ei_i32>;
-def : NI_Trn2_v4<v4f32, VPR128, TRN2vvv_4s, NI_ei_f32>;
-
-// End of implementation for instruction class (Perm)
+ [(set (Ty OpVPR:$Rd),
+ (Ty (opnode (Ty OpVPR:$Rn), (Ty OpVPR:$Rm))))],
+ NoItinerary>;
+
+// Defines the seven arrangement variants (_8b, _16b, _4h, _8h, _2s, _4s, _2d)
+// of one permute instruction. Each variant forwards opnode and an integer
+// vector type to NeonI_Permute, using VPR64 for the 8b/4h/2s forms and VPR128
+// for the 16b/8h/4s/2d forms.
+multiclass NeonI_Perm_pat<bits<3> opcode, string asmop,
+ SDPatternOperator opnode> {
+ def _8b : NeonI_Permute<0b0, 0b00, opcode, asmop,
+ VPR64, "8b", opnode, v8i8>;
+ def _16b : NeonI_Permute<0b1, 0b00, opcode, asmop,
+ VPR128, "16b",opnode, v16i8>;
+ def _4h : NeonI_Permute<0b0, 0b01, opcode, asmop,
+ VPR64, "4h", opnode, v4i16>;
+ def _8h : NeonI_Permute<0b1, 0b01, opcode, asmop,
+ VPR128, "8h", opnode, v8i16>;
+ def _2s : NeonI_Permute<0b0, 0b10, opcode, asmop,
+ VPR64, "2s", opnode, v2i32>;
+ def _4s : NeonI_Permute<0b1, 0b10, opcode, asmop,
+ VPR128, "4s", opnode, v4i32>;
+ def _2d : NeonI_Permute<0b1, 0b11, opcode, asmop,
+ VPR128, "2d", opnode, v2i64>;
+}
+
+// Instantiate the six permute instructions, pairing each 3-bit opcode and
+// mnemonic with its SelectionDAG node.
+defm UZP1vvv : NeonI_Perm_pat<0b001, "uzp1", Neon_uzp1>;
+defm TRN1vvv : NeonI_Perm_pat<0b010, "trn1", Neon_trn1>;
+defm ZIP1vvv : NeonI_Perm_pat<0b011, "zip1", Neon_zip1>;
+defm UZP2vvv : NeonI_Perm_pat<0b101, "uzp2", Neon_uzp2>;
+defm TRN2vvv : NeonI_Perm_pat<0b110, "trn2", Neon_trn2>;
+defm ZIP2vvv : NeonI_Perm_pat<0b111, "zip2", Neon_zip2>;
+
+// Extra selection patterns for floating-point vectors. NeonI_Perm_pat only
+// attaches integer vector types to the instructions, so v2f32, v4f32 and
+// v2f64 uses of opnode are mapped here onto the _2s, _4s and _2d variants of
+// the instruction whose name prefix is given by INS.
+multiclass NeonI_Perm_float_pat<string INS, SDPatternOperator opnode> {
+ def : Pat<(v2f32 (opnode (v2f32 VPR64:$Rn), (v2f32 VPR64:$Rm))),
+ (!cast<Instruction>(INS # "_2s") VPR64:$Rn, VPR64:$Rm)>;
+
+ def : Pat<(v4f32 (opnode (v4f32 VPR128:$Rn), (v4f32 VPR128:$Rm))),
+ (!cast<Instruction>(INS # "_4s") VPR128:$Rn, VPR128:$Rm)>;
+
+ def : Pat<(v2f64 (opnode (v2f64 VPR128:$Rn), (v2f64 VPR128:$Rm))),
+ (!cast<Instruction>(INS # "_2d") VPR128:$Rn, VPR128:$Rm)>;
+}
+
+// Add the floating-point patterns for each of the six permute instructions.
+defm : NeonI_Perm_float_pat<"UZP1vvv", Neon_uzp1>;
+defm : NeonI_Perm_float_pat<"UZP2vvv", Neon_uzp2>;
+defm : NeonI_Perm_float_pat<"ZIP1vvv", Neon_zip1>;
+defm : NeonI_Perm_float_pat<"ZIP2vvv", Neon_zip2>;
+defm : NeonI_Perm_float_pat<"TRN1vvv", Neon_trn1>;
+defm : NeonI_Perm_float_pat<"TRN2vvv", Neon_trn2>;
// The followings are for instruction class (3V Diff)
diff --git a/test/CodeGen/AArch64/neon-perm.ll b/test/CodeGen/AArch64/neon-perm.ll
index 4db4771..4e1756e 100644
--- a/test/CodeGen/AArch64/neon-perm.ll
+++ b/test/CodeGen/AArch64/neon-perm.ll
@@ -1674,3 +1674,17 @@ entry:
%.fca.0.1.insert = insertvalue %struct.poly16x8x2_t %.fca.0.0.insert, <8 x i16> %vtrn1.i, 0, 1
ret %struct.poly16x8x2_t %.fca.0.1.insert
}
+
+; Both shuffles read a single <16 x i8> input and produce an <8 x i8> result:
+; the first takes the even lanes (0, 2, ..., 14) and the second the odd lanes
+; (1, 3, ..., 15). The CHECK lines expect the upper 64 bits of the input to be
+; copied out with dup, followed by one uzp1 and one uzp2 on .8b operands.
+define %struct.uint8x8x2_t @test_uzp(<16 x i8> %y) {
+; CHECK: test_uzp:
+
+ %vuzp.i = shufflevector <16 x i8> %y, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+ %vuzp1.i = shufflevector <16 x i8> %y, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+ %.fca.0.0.insert = insertvalue %struct.uint8x8x2_t undef, <8 x i8> %vuzp.i, 0, 0
+ %.fca.0.1.insert = insertvalue %struct.uint8x8x2_t %.fca.0.0.insert, <8 x i8> %vuzp1.i, 0, 1
+ ret %struct.uint8x8x2_t %.fca.0.1.insert
+
+; CHECK: dup {{d[0-9]+}}, {{v[0-9]+}}.d[1]
+; CHECK-NEXT: uzp1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+; CHECK-NEXT: uzp2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
+}