-rw-r--r--  lib/Target/AArch64/AArch64ISelLowering.cpp |    2
-rw-r--r--  lib/Target/AArch64/AArch64InstrNEON.td     | 1293
-rw-r--r--  test/CodeGen/AArch64/neon-2velem-high.ll   |  331
3 files changed, 1022 insertions(+), 604 deletions(-)
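
The new *_lane0 patterns added below match a long operation on a vector's high half against a scalar broadcast (Neon_vdup) using the by-element instruction forms with a fixed lane index of 0. A minimal IR sketch of the kind of input they target, adapted from test_vmull_high_n_s16 in the new test file (the function name here is illustrative, and the same RUN flags as the test are assumed: -mtriple=aarch64-none-linux-gnu -mattr=+neon):

; High half of %a multiplied by %b splatted into every lane; with the
; NI_2VEL2_mul_lane0 pattern this selects the lane form
;   smull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0]
; as the test's CHECK lines expect.
declare <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16>, <4 x i16>)

define <4 x i32> @mull_high_splat(<8 x i16> %a, i16 %b) {
entry:
  ; extract the high half of %a
  %hi = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  ; splat the scalar %b into all four lanes
  %v0 = insertelement <4 x i16> undef, i16 %b, i32 0
  %v1 = insertelement <4 x i16> %v0, i16 %b, i32 1
  %v2 = insertelement <4 x i16> %v1, i16 %b, i32 2
  %v3 = insertelement <4 x i16> %v2, i16 %b, i32 3
  %res = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %hi, <4 x i16> %v3)
  ret <4 x i32> %res
}
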
diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp index 6ea4b483..5d20a96 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -4004,8 +4004,8 @@ AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, return DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Value); unsigned EltSize = VT.getVectorElementType().getSizeInBits(); - // Use VDUP for non-constant splats. if (hasDominantValue && EltSize <= 64) { + // Use VDUP for non-constant splats. if (!isConstant) { SDValue N; diff --git a/lib/Target/AArch64/AArch64InstrNEON.td b/lib/Target/AArch64/AArch64InstrNEON.td index 581ebae..8306fd5 100644 --- a/lib/Target/AArch64/AArch64InstrNEON.td +++ b/lib/Target/AArch64/AArch64InstrNEON.td @@ -6097,6 +6097,612 @@ defm TBX2 : NI_TBX_pat<0b01, 0b1, "tbx", "VPair">; defm TBX3 : NI_TBX_pat<0b10, 0b1, "tbx", "VTriple">; defm TBX4 : NI_TBX_pat<0b11, 0b1, "tbx", "VQuad">; +class NeonI_INS_main<string asmop, string Res, ValueType ResTy, + RegisterClass OpGPR, ValueType OpTy, Operand OpImm> + : NeonI_copy<0b1, 0b0, 0b0011, + (outs VPR128:$Rd), (ins VPR128:$src, OpGPR:$Rn, OpImm:$Imm), + asmop # "\t$Rd." # Res # "[$Imm], $Rn", + [(set (ResTy VPR128:$Rd), + (ResTy (vector_insert + (ResTy VPR128:$src), + (OpTy OpGPR:$Rn), + (OpImm:$Imm))))], + NoItinerary> { + bits<4> Imm; + let Constraints = "$src = $Rd"; +} + +//Insert element (vector, from main) +def INSbw : NeonI_INS_main<"ins", "b", v16i8, GPR32, i32, + neon_uimm4_bare> { + let Inst{20-16} = {Imm{3}, Imm{2}, Imm{1}, Imm{0}, 0b1}; +} +def INShw : NeonI_INS_main<"ins", "h", v8i16, GPR32, i32, + neon_uimm3_bare> { + let Inst{20-16} = {Imm{2}, Imm{1}, Imm{0}, 0b1, 0b0}; +} +def INSsw : NeonI_INS_main<"ins", "s", v4i32, GPR32, i32, + neon_uimm2_bare> { + let Inst{20-16} = {Imm{1}, Imm{0}, 0b1, 0b0, 0b0}; +} +def INSdx : NeonI_INS_main<"ins", "d", v2i64, GPR64, i64, + neon_uimm1_bare> { + let Inst{20-16} = {Imm, 0b1, 0b0, 0b0, 0b0}; +} + +def : NeonInstAlias<"mov $Rd.b[$Imm], $Rn", + (INSbw VPR128:$Rd, GPR32:$Rn, neon_uimm4_bare:$Imm), 0>; +def : NeonInstAlias<"mov $Rd.h[$Imm], $Rn", + (INShw VPR128:$Rd, GPR32:$Rn, neon_uimm3_bare:$Imm), 0>; +def : NeonInstAlias<"mov $Rd.s[$Imm], $Rn", + (INSsw VPR128:$Rd, GPR32:$Rn, neon_uimm2_bare:$Imm), 0>; +def : NeonInstAlias<"mov $Rd.d[$Imm], $Rn", + (INSdx VPR128:$Rd, GPR64:$Rn, neon_uimm1_bare:$Imm), 0>; + +class Neon_INS_main_pattern <ValueType ResTy,ValueType ExtResTy, + RegisterClass OpGPR, ValueType OpTy, + Operand OpImm, Instruction INS> + : Pat<(ResTy (vector_insert + (ResTy VPR64:$src), + (OpTy OpGPR:$Rn), + (OpImm:$Imm))), + (ResTy (EXTRACT_SUBREG + (ExtResTy (INS (ExtResTy (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64)), + OpGPR:$Rn, OpImm:$Imm)), sub_64))>; + +def INSbw_pattern : Neon_INS_main_pattern<v8i8, v16i8, GPR32, i32, + neon_uimm3_bare, INSbw>; +def INShw_pattern : Neon_INS_main_pattern<v4i16, v8i16, GPR32, i32, + neon_uimm2_bare, INShw>; +def INSsw_pattern : Neon_INS_main_pattern<v2i32, v4i32, GPR32, i32, + neon_uimm1_bare, INSsw>; +def INSdx_pattern : Neon_INS_main_pattern<v1i64, v2i64, GPR64, i64, + neon_uimm0_bare, INSdx>; + +class NeonI_INS_element<string asmop, string Res, Operand ResImm> + : NeonI_insert<0b1, 0b1, + (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn, + ResImm:$Immd, ResImm:$Immn), + asmop # "\t$Rd." # Res # "[$Immd], $Rn." 
# Res # "[$Immn]", + [], + NoItinerary> { + let Constraints = "$src = $Rd"; + bits<4> Immd; + bits<4> Immn; +} + +//Insert element (vector, from element) +def INSELb : NeonI_INS_element<"ins", "b", neon_uimm4_bare> { + let Inst{20-16} = {Immd{3}, Immd{2}, Immd{1}, Immd{0}, 0b1}; + let Inst{14-11} = {Immn{3}, Immn{2}, Immn{1}, Immn{0}}; +} +def INSELh : NeonI_INS_element<"ins", "h", neon_uimm3_bare> { + let Inst{20-16} = {Immd{2}, Immd{1}, Immd{0}, 0b1, 0b0}; + let Inst{14-11} = {Immn{2}, Immn{1}, Immn{0}, 0b0}; + // bit 11 is unspecified, but should be set to zero. +} +def INSELs : NeonI_INS_element<"ins", "s", neon_uimm2_bare> { + let Inst{20-16} = {Immd{1}, Immd{0}, 0b1, 0b0, 0b0}; + let Inst{14-11} = {Immn{1}, Immn{0}, 0b0, 0b0}; + // bits 11-12 are unspecified, but should be set to zero. +} +def INSELd : NeonI_INS_element<"ins", "d", neon_uimm1_bare> { + let Inst{20-16} = {Immd, 0b1, 0b0, 0b0, 0b0}; + let Inst{14-11} = {Immn{0}, 0b0, 0b0, 0b0}; + // bits 11-13 are unspecified, but should be set to zero. +} + +def : NeonInstAlias<"mov $Rd.b[$Immd], $Rn.b[$Immn]", + (INSELb VPR128:$Rd, VPR128:$Rn, + neon_uimm4_bare:$Immd, neon_uimm4_bare:$Immn), 0>; +def : NeonInstAlias<"mov $Rd.h[$Immd], $Rn.h[$Immn]", + (INSELh VPR128:$Rd, VPR128:$Rn, + neon_uimm3_bare:$Immd, neon_uimm3_bare:$Immn), 0>; +def : NeonInstAlias<"mov $Rd.s[$Immd], $Rn.s[$Immn]", + (INSELs VPR128:$Rd, VPR128:$Rn, + neon_uimm2_bare:$Immd, neon_uimm2_bare:$Immn), 0>; +def : NeonInstAlias<"mov $Rd.d[$Immd], $Rn.d[$Immn]", + (INSELd VPR128:$Rd, VPR128:$Rn, + neon_uimm1_bare:$Immd, neon_uimm1_bare:$Immn), 0>; + +multiclass Neon_INS_elt_pattern<ValueType ResTy, ValueType NaTy, + ValueType MidTy, Operand StImm, Operand NaImm, + Instruction INS> { +def : Pat<(ResTy (vector_insert + (ResTy VPR128:$src), + (MidTy (vector_extract + (ResTy VPR128:$Rn), + (StImm:$Immn))), + (StImm:$Immd))), + (INS (ResTy VPR128:$src), (ResTy VPR128:$Rn), + StImm:$Immd, StImm:$Immn)>; + +def : Pat <(ResTy (vector_insert + (ResTy VPR128:$src), + (MidTy (vector_extract + (NaTy VPR64:$Rn), + (NaImm:$Immn))), + (StImm:$Immd))), + (INS (ResTy VPR128:$src), + (ResTy (SUBREG_TO_REG (i64 0), (NaTy VPR64:$Rn), sub_64)), + StImm:$Immd, NaImm:$Immn)>; + +def : Pat <(NaTy (vector_insert + (NaTy VPR64:$src), + (MidTy (vector_extract + (ResTy VPR128:$Rn), + (StImm:$Immn))), + (NaImm:$Immd))), + (NaTy (EXTRACT_SUBREG + (ResTy (INS + (ResTy (SUBREG_TO_REG (i64 0), (NaTy VPR64:$src), sub_64)), + (ResTy VPR128:$Rn), + NaImm:$Immd, StImm:$Immn)), + sub_64))>; + +def : Pat <(NaTy (vector_insert + (NaTy VPR64:$src), + (MidTy (vector_extract + (NaTy VPR64:$Rn), + (NaImm:$Immn))), + (NaImm:$Immd))), + (NaTy (EXTRACT_SUBREG + (ResTy (INS + (ResTy (SUBREG_TO_REG (i64 0), (NaTy VPR64:$src), sub_64)), + (ResTy (SUBREG_TO_REG (i64 0), (NaTy VPR64:$Rn), sub_64)), + NaImm:$Immd, NaImm:$Immn)), + sub_64))>; +} + +defm : Neon_INS_elt_pattern<v4f32, v2f32, f32, neon_uimm2_bare, + neon_uimm1_bare, INSELs>; +defm : Neon_INS_elt_pattern<v2f64, v1f64, f64, neon_uimm1_bare, + neon_uimm0_bare, INSELd>; +defm : Neon_INS_elt_pattern<v16i8, v8i8, i32, neon_uimm4_bare, + neon_uimm3_bare, INSELb>; +defm : Neon_INS_elt_pattern<v8i16, v4i16, i32, neon_uimm3_bare, + neon_uimm2_bare, INSELh>; +defm : Neon_INS_elt_pattern<v4i32, v2i32, i32, neon_uimm2_bare, + neon_uimm1_bare, INSELs>; +defm : Neon_INS_elt_pattern<v2i64, v1i64, i64, neon_uimm1_bare, + neon_uimm0_bare, INSELd>; + +multiclass Neon_INS_elt_float_pattern<ValueType ResTy, ValueType NaTy, + ValueType MidTy, + RegisterClass OpFPR, Operand 
ResImm, + SubRegIndex SubIndex, Instruction INS> { +def : Pat <(ResTy (vector_insert + (ResTy VPR128:$src), + (MidTy OpFPR:$Rn), + (ResImm:$Imm))), + (INS (ResTy VPR128:$src), + (ResTy (SUBREG_TO_REG (i64 0), OpFPR:$Rn, SubIndex)), + ResImm:$Imm, + (i64 0))>; + +def : Pat <(NaTy (vector_insert + (NaTy VPR64:$src), + (MidTy OpFPR:$Rn), + (ResImm:$Imm))), + (NaTy (EXTRACT_SUBREG + (ResTy (INS + (ResTy (SUBREG_TO_REG (i64 0), (NaTy VPR64:$src), sub_64)), + (ResTy (SUBREG_TO_REG (i64 0), (MidTy OpFPR:$Rn), SubIndex)), + ResImm:$Imm, + (i64 0))), + sub_64))>; +} + +defm : Neon_INS_elt_float_pattern<v4f32, v2f32, f32, FPR32, neon_uimm2_bare, + sub_32, INSELs>; +defm : Neon_INS_elt_float_pattern<v2f64, v1f64, f64, FPR64, neon_uimm1_bare, + sub_64, INSELd>; + +class NeonI_SMOV<string asmop, string Res, bit Q, + ValueType OpTy, ValueType eleTy, + Operand OpImm, RegisterClass ResGPR, ValueType ResTy> + : NeonI_copy<Q, 0b0, 0b0101, + (outs ResGPR:$Rd), (ins VPR128:$Rn, OpImm:$Imm), + asmop # "\t$Rd, $Rn." # Res # "[$Imm]", + [(set (ResTy ResGPR:$Rd), + (ResTy (sext_inreg + (ResTy (vector_extract + (OpTy VPR128:$Rn), (OpImm:$Imm))), + eleTy)))], + NoItinerary> { + bits<4> Imm; +} + +//Signed integer move (main, from element) +def SMOVwb : NeonI_SMOV<"smov", "b", 0b0, v16i8, i8, neon_uimm4_bare, + GPR32, i32> { + let Inst{20-16} = {Imm{3}, Imm{2}, Imm{1}, Imm{0}, 0b1}; +} +def SMOVwh : NeonI_SMOV<"smov", "h", 0b0, v8i16, i16, neon_uimm3_bare, + GPR32, i32> { + let Inst{20-16} = {Imm{2}, Imm{1}, Imm{0}, 0b1, 0b0}; +} +def SMOVxb : NeonI_SMOV<"smov", "b", 0b1, v16i8, i8, neon_uimm4_bare, + GPR64, i64> { + let Inst{20-16} = {Imm{3}, Imm{2}, Imm{1}, Imm{0}, 0b1}; +} +def SMOVxh : NeonI_SMOV<"smov", "h", 0b1, v8i16, i16, neon_uimm3_bare, + GPR64, i64> { + let Inst{20-16} = {Imm{2}, Imm{1}, Imm{0}, 0b1, 0b0}; +} +def SMOVxs : NeonI_SMOV<"smov", "s", 0b1, v4i32, i32, neon_uimm2_bare, + GPR64, i64> { + let Inst{20-16} = {Imm{1}, Imm{0}, 0b1, 0b0, 0b0}; +} + +multiclass Neon_SMOVx_pattern <ValueType StTy, ValueType NaTy, + ValueType eleTy, Operand StImm, Operand NaImm, + Instruction SMOVI> { + def : Pat<(i64 (sext_inreg + (i64 (anyext + (i32 (vector_extract + (StTy VPR128:$Rn), (StImm:$Imm))))), + eleTy)), + (SMOVI VPR128:$Rn, StImm:$Imm)>; + + def : Pat<(i64 (sext + (i32 (vector_extract + (StTy VPR128:$Rn), (StImm:$Imm))))), + (SMOVI VPR128:$Rn, StImm:$Imm)>; + + def : Pat<(i64 (sext_inreg + (i64 (vector_extract + (NaTy VPR64:$Rn), (NaImm:$Imm))), + eleTy)), + (SMOVI (StTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), + NaImm:$Imm)>; + + def : Pat<(i64 (sext_inreg + (i64 (anyext + (i32 (vector_extract + (NaTy VPR64:$Rn), (NaImm:$Imm))))), + eleTy)), + (SMOVI (StTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), + NaImm:$Imm)>; + + def : Pat<(i64 (sext + (i32 (vector_extract + (NaTy VPR64:$Rn), (NaImm:$Imm))))), + (SMOVI (StTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), + NaImm:$Imm)>; +} + +defm : Neon_SMOVx_pattern<v16i8, v8i8, i8, neon_uimm4_bare, + neon_uimm3_bare, SMOVxb>; +defm : Neon_SMOVx_pattern<v8i16, v4i16, i16, neon_uimm3_bare, + neon_uimm2_bare, SMOVxh>; +defm : Neon_SMOVx_pattern<v4i32, v2i32, i32, neon_uimm2_bare, + neon_uimm1_bare, SMOVxs>; + +class Neon_SMOVw_pattern <ValueType StTy, ValueType NaTy, + ValueType eleTy, Operand StImm, Operand NaImm, + Instruction SMOVI> + : Pat<(i32 (sext_inreg + (i32 (vector_extract + (NaTy VPR64:$Rn), (NaImm:$Imm))), + eleTy)), + (SMOVI (StTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), + NaImm:$Imm)>; + +def : Neon_SMOVw_pattern<v16i8, v8i8, i8, neon_uimm4_bare, 
+ neon_uimm3_bare, SMOVwb>; +def : Neon_SMOVw_pattern<v8i16, v4i16, i16, neon_uimm3_bare, + neon_uimm2_bare, SMOVwh>; + +class NeonI_UMOV<string asmop, string Res, bit Q, + ValueType OpTy, Operand OpImm, + RegisterClass ResGPR, ValueType ResTy> + : NeonI_copy<Q, 0b0, 0b0111, + (outs ResGPR:$Rd), (ins VPR128:$Rn, OpImm:$Imm), + asmop # "\t$Rd, $Rn." # Res # "[$Imm]", + [(set (ResTy ResGPR:$Rd), + (ResTy (vector_extract + (OpTy VPR128:$Rn), (OpImm:$Imm))))], + NoItinerary> { + bits<4> Imm; +} + +//Unsigned integer move (main, from element) +def UMOVwb : NeonI_UMOV<"umov", "b", 0b0, v16i8, neon_uimm4_bare, + GPR32, i32> { + let Inst{20-16} = {Imm{3}, Imm{2}, Imm{1}, Imm{0}, 0b1}; +} +def UMOVwh : NeonI_UMOV<"umov", "h", 0b0, v8i16, neon_uimm3_bare, + GPR32, i32> { + let Inst{20-16} = {Imm{2}, Imm{1}, Imm{0}, 0b1, 0b0}; +} +def UMOVws : NeonI_UMOV<"umov", "s", 0b0, v4i32, neon_uimm2_bare, + GPR32, i32> { + let Inst{20-16} = {Imm{1}, Imm{0}, 0b1, 0b0, 0b0}; +} +def UMOVxd : NeonI_UMOV<"umov", "d", 0b1, v2i64, neon_uimm1_bare, + GPR64, i64> { + let Inst{20-16} = {Imm, 0b1, 0b0, 0b0, 0b0}; +} + +def : NeonInstAlias<"mov $Rd, $Rn.s[$Imm]", + (UMOVws GPR32:$Rd, VPR128:$Rn, neon_uimm2_bare:$Imm), 0>; +def : NeonInstAlias<"mov $Rd, $Rn.d[$Imm]", + (UMOVxd GPR64:$Rd, VPR128:$Rn, neon_uimm1_bare:$Imm), 0>; + +class Neon_UMOV_pattern <ValueType StTy, ValueType NaTy, ValueType ResTy, + Operand StImm, Operand NaImm, + Instruction SMOVI> + : Pat<(ResTy (vector_extract + (NaTy VPR64:$Rn), NaImm:$Imm)), + (SMOVI (StTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), + NaImm:$Imm)>; + +def : Neon_UMOV_pattern<v16i8, v8i8, i32, neon_uimm4_bare, + neon_uimm3_bare, UMOVwb>; +def : Neon_UMOV_pattern<v8i16, v4i16, i32, neon_uimm3_bare, + neon_uimm2_bare, UMOVwh>; +def : Neon_UMOV_pattern<v4i32, v2i32, i32, neon_uimm2_bare, + neon_uimm1_bare, UMOVws>; + +def : Pat<(i32 (and + (i32 (vector_extract + (v16i8 VPR128:$Rn), (neon_uimm4_bare:$Imm))), + 255)), + (UMOVwb VPR128:$Rn, neon_uimm4_bare:$Imm)>; + +def : Pat<(i32 (and + (i32 (vector_extract + (v8i16 VPR128:$Rn), (neon_uimm3_bare:$Imm))), + 65535)), + (UMOVwh VPR128:$Rn, neon_uimm3_bare:$Imm)>; + +def : Pat<(i64 (zext + (i32 (vector_extract + (v2i64 VPR128:$Rn), (neon_uimm1_bare:$Imm))))), + (UMOVxd VPR128:$Rn, neon_uimm1_bare:$Imm)>; + +def : Pat<(i32 (and + (i32 (vector_extract + (v8i8 VPR64:$Rn), (neon_uimm3_bare:$Imm))), + 255)), + (UMOVwb (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64), + neon_uimm3_bare:$Imm)>; + +def : Pat<(i32 (and + (i32 (vector_extract + (v4i16 VPR64:$Rn), (neon_uimm2_bare:$Imm))), + 65535)), + (UMOVwh (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64), + neon_uimm2_bare:$Imm)>; + +def : Pat<(i64 (zext + (i32 (vector_extract + (v1i64 VPR64:$Rn), (neon_uimm0_bare:$Imm))))), + (UMOVxd (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64), + neon_uimm0_bare:$Imm)>; + +// Additional copy patterns for scalar types +def : Pat<(i32 (vector_extract (v1i8 FPR8:$Rn), (i64 0))), + (UMOVwb (v16i8 + (SUBREG_TO_REG (i64 0), FPR8:$Rn, sub_8)), (i64 0))>; + +def : Pat<(i32 (vector_extract (v1i16 FPR16:$Rn), (i64 0))), + (UMOVwh (v8i16 + (SUBREG_TO_REG (i64 0), FPR16:$Rn, sub_16)), (i64 0))>; + +def : Pat<(i32 (vector_extract (v1i32 FPR32:$Rn), (i64 0))), + (FMOVws FPR32:$Rn)>; + +def : Pat<(i64 (vector_extract (v1i64 FPR64:$Rn), (i64 0))), + (FMOVxd FPR64:$Rn)>; + +def : Pat<(f64 (vector_extract (v1f64 FPR64:$Rn), (i64 0))), + (f64 FPR64:$Rn)>; + +def : Pat<(f32 (vector_extract (v1f32 FPR32:$Rn), (i64 0))), + (f32 FPR32:$Rn)>; + +def : Pat<(v1i8 (scalar_to_vector GPR32:$Rn)), + 
(v1i8 (EXTRACT_SUBREG (v16i8 + (INSbw (v16i8 (IMPLICIT_DEF)), $Rn, (i64 0))), + sub_8))>; + +def : Pat<(v1i16 (scalar_to_vector GPR32:$Rn)), + (v1i16 (EXTRACT_SUBREG (v8i16 + (INShw (v8i16 (IMPLICIT_DEF)), $Rn, (i64 0))), + sub_16))>; + +def : Pat<(v1i32 (scalar_to_vector GPR32:$src)), + (FMOVsw $src)>; + +def : Pat<(v1i64 (scalar_to_vector GPR64:$src)), + (FMOVdx $src)>; + +def : Pat<(v1f32 (scalar_to_vector (f32 FPR32:$Rn))), + (v1f32 FPR32:$Rn)>; +def : Pat<(v1f64 (scalar_to_vector (f64 FPR64:$Rn))), + (v1f64 FPR64:$Rn)>; + +def : Pat<(v1f64 (scalar_to_vector (f64 FPR64:$src))), + (FMOVdd $src)>; + +def : Pat<(v2f64 (scalar_to_vector (f64 FPR64:$src))), + (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), + (f64 FPR64:$src), sub_64)>; + +class NeonI_DUP_Elt<bit Q, string asmop, string rdlane, string rnlane, + RegisterOperand ResVPR, Operand OpImm> + : NeonI_copy<Q, 0b0, 0b0000, (outs ResVPR:$Rd), + (ins VPR128:$Rn, OpImm:$Imm), + asmop # "\t$Rd" # rdlane # ", $Rn" # rnlane # "[$Imm]", + [], + NoItinerary> { + bits<4> Imm; +} + +def DUPELT16b : NeonI_DUP_Elt<0b1, "dup", ".16b", ".b", VPR128, + neon_uimm4_bare> { + let Inst{20-16} = {Imm{3}, Imm{2}, Imm{1}, Imm{0}, 0b1}; +} + +def DUPELT8h : NeonI_DUP_Elt<0b1, "dup", ".8h", ".h", VPR128, + neon_uimm3_bare> { + let Inst{20-16} = {Imm{2}, Imm{1}, Imm{0}, 0b1, 0b0}; +} + +def DUPELT4s : NeonI_DUP_Elt<0b1, "dup", ".4s", ".s", VPR128, + neon_uimm2_bare> { + let Inst{20-16} = {Imm{1}, Imm{0}, 0b1, 0b0, 0b0}; +} + +def DUPELT2d : NeonI_DUP_Elt<0b1, "dup", ".2d", ".d", VPR128, + neon_uimm1_bare> { + let Inst{20-16} = {Imm, 0b1, 0b0, 0b0, 0b0}; +} + +def DUPELT8b : NeonI_DUP_Elt<0b0, "dup", ".8b", ".b", VPR64, + neon_uimm4_bare> { + let Inst{20-16} = {Imm{3}, Imm{2}, Imm{1}, Imm{0}, 0b1}; +} + +def DUPELT4h : NeonI_DUP_Elt<0b0, "dup", ".4h", ".h", VPR64, + neon_uimm3_bare> { + let Inst{20-16} = {Imm{2}, Imm{1}, Imm{0}, 0b1, 0b0}; +} + +def DUPELT2s : NeonI_DUP_Elt<0b0, "dup", ".2s", ".s", VPR64, + neon_uimm2_bare> { + let Inst{20-16} = {Imm{1}, Imm{0}, 0b1, 0b0, 0b0}; +} + +multiclass NeonI_DUP_Elt_pattern<Instruction DUPELT, ValueType ResTy, + ValueType OpTy,ValueType NaTy, + ValueType ExTy, Operand OpLImm, + Operand OpNImm> { +def : Pat<(ResTy (Neon_vduplane (OpTy VPR128:$Rn), OpLImm:$Imm)), + (ResTy (DUPELT (OpTy VPR128:$Rn), OpLImm:$Imm))>; + +def : Pat<(ResTy (Neon_vduplane + (NaTy VPR64:$Rn), OpNImm:$Imm)), + (ResTy (DUPELT + (ExTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), OpNImm:$Imm))>; +} +defm : NeonI_DUP_Elt_pattern<DUPELT16b, v16i8, v16i8, v8i8, v16i8, + neon_uimm4_bare, neon_uimm3_bare>; +defm : NeonI_DUP_Elt_pattern<DUPELT8b, v8i8, v16i8, v8i8, v16i8, + neon_uimm4_bare, neon_uimm3_bare>; +defm : NeonI_DUP_Elt_pattern<DUPELT8h, v8i16, v8i16, v4i16, v8i16, + neon_uimm3_bare, neon_uimm2_bare>; +defm : NeonI_DUP_Elt_pattern<DUPELT4h, v4i16, v8i16, v4i16, v8i16, + neon_uimm3_bare, neon_uimm2_bare>; +defm : NeonI_DUP_Elt_pattern<DUPELT4s, v4i32, v4i32, v2i32, v4i32, + neon_uimm2_bare, neon_uimm1_bare>; +defm : NeonI_DUP_Elt_pattern<DUPELT2s, v2i32, v4i32, v2i32, v4i32, + neon_uimm2_bare, neon_uimm1_bare>; +defm : NeonI_DUP_Elt_pattern<DUPELT2d, v2i64, v2i64, v1i64, v2i64, + neon_uimm1_bare, neon_uimm0_bare>; +defm : NeonI_DUP_Elt_pattern<DUPELT4s, v4f32, v4f32, v2f32, v4f32, + neon_uimm2_bare, neon_uimm1_bare>; +defm : NeonI_DUP_Elt_pattern<DUPELT2s, v2f32, v4f32, v2f32, v4f32, + neon_uimm2_bare, neon_uimm1_bare>; +defm : NeonI_DUP_Elt_pattern<DUPELT2d, v2f64, v2f64, v1f64, v2f64, + neon_uimm1_bare, neon_uimm0_bare>; + +def : Pat<(v2f32 (Neon_vdup 
(f32 FPR32:$Rn))), + (v2f32 (DUPELT2s + (SUBREG_TO_REG (i64 0), FPR32:$Rn, sub_32), + (i64 0)))>; +def : Pat<(v4f32 (Neon_vdup (f32 FPR32:$Rn))), + (v4f32 (DUPELT4s + (SUBREG_TO_REG (i64 0), FPR32:$Rn, sub_32), + (i64 0)))>; +def : Pat<(v2f64 (Neon_vdup (f64 FPR64:$Rn))), + (v2f64 (DUPELT2d + (SUBREG_TO_REG (i64 0), FPR64:$Rn, sub_64), + (i64 0)))>; + +class NeonI_DUP<bit Q, string asmop, string rdlane, + RegisterOperand ResVPR, ValueType ResTy, + RegisterClass OpGPR, ValueType OpTy> + : NeonI_copy<Q, 0b0, 0b0001, (outs ResVPR:$Rd), (ins OpGPR:$Rn), + asmop # "\t$Rd" # rdlane # ", $Rn", + [(set (ResTy ResVPR:$Rd), + (ResTy (Neon_vdup (OpTy OpGPR:$Rn))))], + NoItinerary>; + +def DUP16b : NeonI_DUP<0b1, "dup", ".16b", VPR128, v16i8, GPR32, i32> { + let Inst{20-16} = 0b00001; + // bits 17-20 are unspecified, but should be set to zero. +} + +def DUP8h : NeonI_DUP<0b1, "dup", ".8h", VPR128, v8i16, GPR32, i32> { + let Inst{20-16} = 0b00010; + // bits 18-20 are unspecified, but should be set to zero. +} + +def DUP4s : NeonI_DUP<0b1, "dup", ".4s", VPR128, v4i32, GPR32, i32> { + let Inst{20-16} = 0b00100; + // bits 19-20 are unspecified, but should be set to zero. +} + +def DUP2d : NeonI_DUP<0b1, "dup", ".2d", VPR128, v2i64, GPR64, i64> { + let Inst{20-16} = 0b01000; + // bit 20 is unspecified, but should be set to zero. +} + +def DUP8b : NeonI_DUP<0b0, "dup", ".8b", VPR64, v8i8, GPR32, i32> { + let Inst{20-16} = 0b00001; + // bits 17-20 are unspecified, but should be set to zero. +} + +def DUP4h : NeonI_DUP<0b0, "dup", ".4h", VPR64, v4i16, GPR32, i32> { + let Inst{20-16} = 0b00010; + // bits 18-20 are unspecified, but should be set to zero. +} + +def DUP2s : NeonI_DUP<0b0, "dup", ".2s", VPR64, v2i32, GPR32, i32> { + let Inst{20-16} = 0b00100; + // bits 19-20 are unspecified, but should be set to zero. 
+} + +// patterns for CONCAT_VECTORS +multiclass Concat_Vector_Pattern<ValueType ResTy, ValueType OpTy> { +def : Pat<(ResTy (concat_vectors (OpTy VPR64:$Rn), undef)), + (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)>; +def : Pat<(ResTy (concat_vectors (OpTy VPR64:$Rn), (OpTy VPR64:$Rm))), + (INSELd + (v2i64 (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), + (v2i64 (SUBREG_TO_REG (i64 0), VPR64:$Rm, sub_64)), + (i64 1), + (i64 0))>; +def : Pat<(ResTy (concat_vectors (OpTy VPR64:$Rn), (OpTy VPR64:$Rn))), + (DUPELT2d + (v2i64 (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), + (i64 0))> ; +} + +defm : Concat_Vector_Pattern<v16i8, v8i8>; +defm : Concat_Vector_Pattern<v8i16, v4i16>; +defm : Concat_Vector_Pattern<v4i32, v2i32>; +defm : Concat_Vector_Pattern<v2i64, v1i64>; +defm : Concat_Vector_Pattern<v4f32, v2f32>; +defm : Concat_Vector_Pattern<v2f64, v1f64>; + +//patterns for EXTRACT_SUBVECTOR +def : Pat<(v8i8 (extract_subvector (v16i8 VPR128:$Rn), (i64 0))), + (v8i8 (EXTRACT_SUBREG VPR128:$Rn, sub_64))>; +def : Pat<(v4i16 (extract_subvector (v8i16 VPR128:$Rn), (i64 0))), + (v4i16 (EXTRACT_SUBREG VPR128:$Rn, sub_64))>; +def : Pat<(v2i32 (extract_subvector (v4i32 VPR128:$Rn), (i64 0))), + (v2i32 (EXTRACT_SUBREG VPR128:$Rn, sub_64))>; +def : Pat<(v1i64 (extract_subvector (v2i64 VPR128:$Rn), (i64 0))), + (v1i64 (EXTRACT_SUBREG VPR128:$Rn, sub_64))>; +def : Pat<(v2f32 (extract_subvector (v4f32 VPR128:$Rn), (i64 0))), + (v2f32 (EXTRACT_SUBREG VPR128:$Rn, sub_64))>; +def : Pat<(v1f64 (extract_subvector (v2f64 VPR128:$Rn), (i64 0))), + (v1f64 (EXTRACT_SUBREG VPR128:$Rn, sub_64))>; + // The followings are for instruction class (3V Elem) // Variant 1 @@ -6359,6 +6965,18 @@ multiclass NI_2VE_mul_v2_pat<string subop, SDPatternOperator op> { defm FMUL_lane_v2 : NI_2VE_mul_v2_pat<"FMULve", fmul>; defm FMULX_lane_v2 : NI_2VE_mul_v2_pat<"FMULXve", int_aarch64_neon_vmulx>; +def : Pat<(v2f32 (fmul (v2f32 (Neon_vdup (f32 FPR32:$Re))), + (v2f32 VPR64:$Rn))), + (FMULve_2s4s VPR64:$Rn, (SUBREG_TO_REG (i32 0), $Re, sub_32), 0)>; + +def : Pat<(v4f32 (fmul (v4f32 (Neon_vdup (f32 FPR32:$Re))), + (v4f32 VPR128:$Rn))), + (FMULve_4s4s VPR128:$Rn, (SUBREG_TO_REG (i32 0), $Re, sub_32), 0)>; + +def : Pat<(v2f64 (fmul (v2f64 (Neon_vdup (f64 FPR64:$Re))), + (v2f64 VPR128:$Rn))), + (FMULve_2d2d VPR128:$Rn, (SUBREG_TO_REG (i64 0), $Re, sub_64), 0)>; + // The followings are patterns using fma // -ffp-contract=fast generates fma @@ -6400,6 +7018,15 @@ class NI_2VEswap_laneq<Instruction INST, Operand OpImm, SDPatternOperator op, (ResTy ResVPR:$src), (ResTy ResVPR:$Rn))), (INST ResVPR:$src, ResVPR:$Rn, OpVPR:$Re, OpImm:$Index)>; +// Pattern for lane 0 +class NI_2VEfma_lane0<Instruction INST, SDPatternOperator op, + RegisterOperand ResVPR, ValueType ResTy> + : Pat<(ResTy (op (ResTy ResVPR:$Rn), + (ResTy (Neon_vdup (f32 FPR32:$Re))), + (ResTy ResVPR:$src))), + (INST ResVPR:$src, ResVPR:$Rn, + (SUBREG_TO_REG (i32 0), $Re, sub_32), 0)>; + // Pattern for lane in 64-bit vector class NI_2VEswap_lane<Instruction INST, Operand OpImm, SDPatternOperator op, RegisterOperand ResVPR, RegisterOperand OpVPR, @@ -6427,10 +7054,16 @@ multiclass NI_2VE_fma_v2_pat<string subop, SDPatternOperator op> { neon_uimm2_bare, op, VPR64, VPR128, v2f32, v4f32, BinOpFrag<(Neon_vduplane node:$LHS, node:$RHS)>>; + def : NI_2VEfma_lane0<!cast<Instruction>(subop # "_2s4s"), + op, VPR64, v2f32>; + def : NI_2VEswap_laneq<!cast<Instruction>(subop # "_4s4s"), neon_uimm2_bare, op, VPR128, VPR128, v4f32, v4f32, BinOpFrag<(Neon_vduplane node:$LHS, node:$RHS)>>; + def : 
NI_2VEfma_lane0<!cast<Instruction>(subop # "_4s4s"), + op, VPR128, v4f32>; + def : NI_2VEswap_laneq<!cast<Instruction>(subop # "_2d2d"), neon_uimm1_bare, op, VPR128, VPR128, v2f64, v2f64, BinOpFrag<(Neon_vduplane node:$LHS, node:$RHS)>>; @@ -6448,6 +7081,15 @@ multiclass NI_2VE_fma_v2_pat<string subop, SDPatternOperator op> { defm FMLA_lane_v2_s : NI_2VE_fma_v2_pat<"FMLAvve", fma>; +// Pattern for lane 0 +class NI_2VEfms_lane0<Instruction INST, SDPatternOperator op, + RegisterOperand ResVPR, ValueType ResTy> + : Pat<(ResTy (op (ResTy (fneg ResVPR:$Rn)), + (ResTy (Neon_vdup (f32 FPR32:$Re))), + (ResTy ResVPR:$src))), + (INST ResVPR:$src, ResVPR:$Rn, + (SUBREG_TO_REG (i32 0), $Re, sub_32), 0)>; + multiclass NI_2VE_fms_v2_pat<string subop, SDPatternOperator op> { def : NI_2VEswap_laneq<!cast<Instruction>(subop # "_2s4s"), @@ -6459,6 +7101,9 @@ multiclass NI_2VE_fms_v2_pat<string subop, SDPatternOperator op> BinOpFrag<(Neon_vduplane (fneg node:$LHS), node:$RHS)>>; + def : NI_2VEfms_lane0<!cast<Instruction>(subop # "_2s4s"), + op, VPR64, v2f32>; + def : NI_2VEswap_laneq<!cast<Instruction>(subop # "_4s4s"), neon_uimm2_bare, op, VPR128, VPR128, v4f32, v4f32, BinOpFrag<(fneg (Neon_vduplane @@ -6469,6 +7114,9 @@ multiclass NI_2VE_fms_v2_pat<string subop, SDPatternOperator op> BinOpFrag<(Neon_vduplane (fneg node:$LHS), node:$RHS)>>; + def : NI_2VEfms_lane0<!cast<Instruction>(subop # "_4s4s"), + op, VPR128, v4f32>; + def : NI_2VEswap_laneq<!cast<Instruction>(subop # "_2d2d"), neon_uimm1_bare, op, VPR128, VPR128, v2f64, v2f64, BinOpFrag<(fneg (Neon_vduplane @@ -6595,6 +7243,11 @@ defm SMULLve : NI_2VE_v3_2op<0b0, 0b1010, "smull">; defm UMULLve : NI_2VE_v3_2op<0b1, 0b1010, "umull">; defm SQDMULLve : NI_2VE_v3_2op<0b0, 0b1011, "sqdmull">; +def : Pat<(v1f64 (scalar_to_vector (f64 FPR64:$src))), + (FMOVdd $src)>; +def : Pat<(v1f32 (scalar_to_vector (f32 FPR32:$src))), + (FMOVss $src)>; + // Pattern for lane in 128-bit vector class NI_2VEL2_laneq<Instruction INST, Operand OpImm, SDPatternOperator op, RegisterOperand EleOpVPR, ValueType ResTy, @@ -6618,6 +7271,14 @@ class NI_2VEL2_lane<Instruction INST, Operand OpImm, SDPatternOperator op, (INST VPR128:$src, VPR128:$Rn, (SUBREG_TO_REG (i64 0), EleOpVPR:$Re, sub_64), OpImm:$Index)>; +class NI_2VEL2_lane0<Instruction INST, SDPatternOperator op, + ValueType ResTy, ValueType OpTy, ValueType HalfOpTy, + SDPatternOperator hiop, Instruction DupInst> + : Pat<(ResTy (op (ResTy VPR128:$src), + (HalfOpTy (hiop (OpTy VPR128:$Rn))), + (HalfOpTy (Neon_vdup (i32 GPR32:$Re))))), + (INST VPR128:$src, VPR128:$Rn, (DupInst $Re), 0)>; + multiclass NI_2VEL_v3_pat<string subop, SDPatternOperator op> { def : NI_2VE_laneq<!cast<Instruction>(subop # "_4s4h"), neon_uimm3_bare, op, VPR128, VPR64, VPR128Lo, v4i32, v4i16, v8i16>; @@ -6631,6 +7292,12 @@ multiclass NI_2VEL_v3_pat<string subop, SDPatternOperator op> { def : NI_2VEL2_laneq<!cast<Instruction>(subop # "_2d4s"), neon_uimm2_bare, op, VPR128, v2i64, v4i32, v4i32, v2i32, Neon_High4S>; + def : NI_2VEL2_lane0<!cast<Instruction>(subop # "_4s8h"), + op, v4i32, v8i16, v4i16, Neon_High8H, DUP8h>; + + def : NI_2VEL2_lane0<!cast<Instruction>(subop # "_2d4s"), + op, v2i64, v4i32, v2i32, Neon_High4S, DUP4s>; + // Index can only be half of the max value for lane in 64-bit vector def : NI_2VE_lane<!cast<Instruction>(subop # "_4s4h"), neon_uimm2_bare, @@ -6674,6 +7341,15 @@ class NI_2VEL2_mul_lane<Instruction INST, Operand OpImm, SDPatternOperator op, (INST VPR128:$Rn, (SUBREG_TO_REG (i64 0), EleOpVPR:$Re, sub_64), OpImm:$Index)>; +// 
Pattern for fixed lane 0 +class NI_2VEL2_mul_lane0<Instruction INST, SDPatternOperator op, + ValueType ResTy, ValueType OpTy, ValueType HalfOpTy, + SDPatternOperator hiop, Instruction DupInst> + : Pat<(ResTy (op + (HalfOpTy (hiop (OpTy VPR128:$Rn))), + (HalfOpTy (Neon_vdup (i32 GPR32:$Re))))), + (INST VPR128:$Rn, (DupInst $Re), 0)>; + multiclass NI_2VEL_mul_v3_pat<string subop, SDPatternOperator op> { def : NI_2VE_mul_laneq<!cast<Instruction>(subop # "_4s4h"), neon_uimm3_bare, op, VPR64, VPR128Lo, v4i32, v4i16, v8i16>; @@ -6686,7 +7362,13 @@ multiclass NI_2VEL_mul_v3_pat<string subop, SDPatternOperator op> { def : NI_2VEL2_mul_laneq<!cast<Instruction>(subop # "_2d4s"), neon_uimm2_bare, op, VPR128, v2i64, v4i32, v4i32, v2i32, Neon_High4S>; + + def : NI_2VEL2_mul_lane0<!cast<Instruction>(subop # "_4s8h"), + op, v4i32, v8i16, v4i16, Neon_High8H, DUP8h>; + def : NI_2VEL2_mul_lane0<!cast<Instruction>(subop # "_2d4s"), + op, v2i64, v4i32, v2i32, Neon_High4S, DUP4s>; + // Index can only be half of the max value for lane in 64-bit vector def : NI_2VE_mul_lane<!cast<Instruction>(subop # "_4s4h"), neon_uimm2_bare, @@ -6735,6 +7417,14 @@ multiclass NI_2VEL_v3_qdma_pat<string subop, string op> { def : NI_2VEL2_laneq<!cast<Instruction>(subop # "_2d4s"), neon_uimm2_bare, !cast<PatFrag>(op # "_2d"), VPR128, v2i64, v4i32, v4i32, v2i32, Neon_High4S>; + + def : NI_2VEL2_lane0<!cast<Instruction>(subop # "_4s8h"), + !cast<PatFrag>(op # "_4s"), + v4i32, v8i16, v4i16, Neon_High8H, DUP8h>; + + def : NI_2VEL2_lane0<!cast<Instruction>(subop # "_2d4s"), + !cast<PatFrag>(op # "_2d"), + v2i64, v4i32, v2i32, Neon_High4S, DUP4s>; // Index can only be half of the max value for lane in 64-bit vector @@ -6760,609 +7450,6 @@ defm SQDMLSL_lane_v3 : NI_2VEL_v3_qdma_pat<"SQDMLSLvve", "Neon_qdmlsl">; // End of implementation for instruction class (3V Elem) -class NeonI_INS_main<string asmop, string Res, ValueType ResTy, - RegisterClass OpGPR, ValueType OpTy, Operand OpImm> - : NeonI_copy<0b1, 0b0, 0b0011, - (outs VPR128:$Rd), (ins VPR128:$src, OpGPR:$Rn, OpImm:$Imm), - asmop # "\t$Rd." 
# Res # "[$Imm], $Rn", - [(set (ResTy VPR128:$Rd), - (ResTy (vector_insert - (ResTy VPR128:$src), - (OpTy OpGPR:$Rn), - (OpImm:$Imm))))], - NoItinerary> { - bits<4> Imm; - let Constraints = "$src = $Rd"; -} - -//Insert element (vector, from main) -def INSbw : NeonI_INS_main<"ins", "b", v16i8, GPR32, i32, - neon_uimm4_bare> { - let Inst{20-16} = {Imm{3}, Imm{2}, Imm{1}, Imm{0}, 0b1}; -} -def INShw : NeonI_INS_main<"ins", "h", v8i16, GPR32, i32, - neon_uimm3_bare> { - let Inst{20-16} = {Imm{2}, Imm{1}, Imm{0}, 0b1, 0b0}; -} -def INSsw : NeonI_INS_main<"ins", "s", v4i32, GPR32, i32, - neon_uimm2_bare> { - let Inst{20-16} = {Imm{1}, Imm{0}, 0b1, 0b0, 0b0}; -} -def INSdx : NeonI_INS_main<"ins", "d", v2i64, GPR64, i64, - neon_uimm1_bare> { - let Inst{20-16} = {Imm, 0b1, 0b0, 0b0, 0b0}; -} - -def : NeonInstAlias<"mov $Rd.b[$Imm], $Rn", - (INSbw VPR128:$Rd, GPR32:$Rn, neon_uimm4_bare:$Imm), 0>; -def : NeonInstAlias<"mov $Rd.h[$Imm], $Rn", - (INShw VPR128:$Rd, GPR32:$Rn, neon_uimm3_bare:$Imm), 0>; -def : NeonInstAlias<"mov $Rd.s[$Imm], $Rn", - (INSsw VPR128:$Rd, GPR32:$Rn, neon_uimm2_bare:$Imm), 0>; -def : NeonInstAlias<"mov $Rd.d[$Imm], $Rn", - (INSdx VPR128:$Rd, GPR64:$Rn, neon_uimm1_bare:$Imm), 0>; - -class Neon_INS_main_pattern <ValueType ResTy,ValueType ExtResTy, - RegisterClass OpGPR, ValueType OpTy, - Operand OpImm, Instruction INS> - : Pat<(ResTy (vector_insert - (ResTy VPR64:$src), - (OpTy OpGPR:$Rn), - (OpImm:$Imm))), - (ResTy (EXTRACT_SUBREG - (ExtResTy (INS (ExtResTy (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64)), - OpGPR:$Rn, OpImm:$Imm)), sub_64))>; - -def INSbw_pattern : Neon_INS_main_pattern<v8i8, v16i8, GPR32, i32, - neon_uimm3_bare, INSbw>; -def INShw_pattern : Neon_INS_main_pattern<v4i16, v8i16, GPR32, i32, - neon_uimm2_bare, INShw>; -def INSsw_pattern : Neon_INS_main_pattern<v2i32, v4i32, GPR32, i32, - neon_uimm1_bare, INSsw>; -def INSdx_pattern : Neon_INS_main_pattern<v1i64, v2i64, GPR64, i64, - neon_uimm0_bare, INSdx>; - -class NeonI_INS_element<string asmop, string Res, Operand ResImm> - : NeonI_insert<0b1, 0b1, - (outs VPR128:$Rd), (ins VPR128:$src, VPR128:$Rn, - ResImm:$Immd, ResImm:$Immn), - asmop # "\t$Rd." # Res # "[$Immd], $Rn." # Res # "[$Immn]", - [], - NoItinerary> { - let Constraints = "$src = $Rd"; - bits<4> Immd; - bits<4> Immn; -} - -//Insert element (vector, from element) -def INSELb : NeonI_INS_element<"ins", "b", neon_uimm4_bare> { - let Inst{20-16} = {Immd{3}, Immd{2}, Immd{1}, Immd{0}, 0b1}; - let Inst{14-11} = {Immn{3}, Immn{2}, Immn{1}, Immn{0}}; -} -def INSELh : NeonI_INS_element<"ins", "h", neon_uimm3_bare> { - let Inst{20-16} = {Immd{2}, Immd{1}, Immd{0}, 0b1, 0b0}; - let Inst{14-11} = {Immn{2}, Immn{1}, Immn{0}, 0b0}; - // bit 11 is unspecified, but should be set to zero. -} -def INSELs : NeonI_INS_element<"ins", "s", neon_uimm2_bare> { - let Inst{20-16} = {Immd{1}, Immd{0}, 0b1, 0b0, 0b0}; - let Inst{14-11} = {Immn{1}, Immn{0}, 0b0, 0b0}; - // bits 11-12 are unspecified, but should be set to zero. -} -def INSELd : NeonI_INS_element<"ins", "d", neon_uimm1_bare> { - let Inst{20-16} = {Immd, 0b1, 0b0, 0b0, 0b0}; - let Inst{14-11} = {Immn{0}, 0b0, 0b0, 0b0}; - // bits 11-13 are unspecified, but should be set to zero. 
-} - -def : NeonInstAlias<"mov $Rd.b[$Immd], $Rn.b[$Immn]", - (INSELb VPR128:$Rd, VPR128:$Rn, - neon_uimm4_bare:$Immd, neon_uimm4_bare:$Immn), 0>; -def : NeonInstAlias<"mov $Rd.h[$Immd], $Rn.h[$Immn]", - (INSELh VPR128:$Rd, VPR128:$Rn, - neon_uimm3_bare:$Immd, neon_uimm3_bare:$Immn), 0>; -def : NeonInstAlias<"mov $Rd.s[$Immd], $Rn.s[$Immn]", - (INSELs VPR128:$Rd, VPR128:$Rn, - neon_uimm2_bare:$Immd, neon_uimm2_bare:$Immn), 0>; -def : NeonInstAlias<"mov $Rd.d[$Immd], $Rn.d[$Immn]", - (INSELd VPR128:$Rd, VPR128:$Rn, - neon_uimm1_bare:$Immd, neon_uimm1_bare:$Immn), 0>; - -multiclass Neon_INS_elt_pattern<ValueType ResTy, ValueType NaTy, - ValueType MidTy, Operand StImm, Operand NaImm, - Instruction INS> { -def : Pat<(ResTy (vector_insert - (ResTy VPR128:$src), - (MidTy (vector_extract - (ResTy VPR128:$Rn), - (StImm:$Immn))), - (StImm:$Immd))), - (INS (ResTy VPR128:$src), (ResTy VPR128:$Rn), - StImm:$Immd, StImm:$Immn)>; - -def : Pat <(ResTy (vector_insert - (ResTy VPR128:$src), - (MidTy (vector_extract - (NaTy VPR64:$Rn), - (NaImm:$Immn))), - (StImm:$Immd))), - (INS (ResTy VPR128:$src), - (ResTy (SUBREG_TO_REG (i64 0), (NaTy VPR64:$Rn), sub_64)), - StImm:$Immd, NaImm:$Immn)>; - -def : Pat <(NaTy (vector_insert - (NaTy VPR64:$src), - (MidTy (vector_extract - (ResTy VPR128:$Rn), - (StImm:$Immn))), - (NaImm:$Immd))), - (NaTy (EXTRACT_SUBREG - (ResTy (INS - (ResTy (SUBREG_TO_REG (i64 0), (NaTy VPR64:$src), sub_64)), - (ResTy VPR128:$Rn), - NaImm:$Immd, StImm:$Immn)), - sub_64))>; - -def : Pat <(NaTy (vector_insert - (NaTy VPR64:$src), - (MidTy (vector_extract - (NaTy VPR64:$Rn), - (NaImm:$Immn))), - (NaImm:$Immd))), - (NaTy (EXTRACT_SUBREG - (ResTy (INS - (ResTy (SUBREG_TO_REG (i64 0), (NaTy VPR64:$src), sub_64)), - (ResTy (SUBREG_TO_REG (i64 0), (NaTy VPR64:$Rn), sub_64)), - NaImm:$Immd, NaImm:$Immn)), - sub_64))>; -} - -defm : Neon_INS_elt_pattern<v4f32, v2f32, f32, neon_uimm2_bare, - neon_uimm1_bare, INSELs>; -defm : Neon_INS_elt_pattern<v2f64, v1f64, f64, neon_uimm1_bare, - neon_uimm0_bare, INSELd>; -defm : Neon_INS_elt_pattern<v16i8, v8i8, i32, neon_uimm4_bare, - neon_uimm3_bare, INSELb>; -defm : Neon_INS_elt_pattern<v8i16, v4i16, i32, neon_uimm3_bare, - neon_uimm2_bare, INSELh>; -defm : Neon_INS_elt_pattern<v4i32, v2i32, i32, neon_uimm2_bare, - neon_uimm1_bare, INSELs>; -defm : Neon_INS_elt_pattern<v2i64, v1i64, i64, neon_uimm1_bare, - neon_uimm0_bare, INSELd>; - -multiclass Neon_INS_elt_float_pattern<ValueType ResTy, ValueType NaTy, - ValueType MidTy, - RegisterClass OpFPR, Operand ResImm, - SubRegIndex SubIndex, Instruction INS> { -def : Pat <(ResTy (vector_insert - (ResTy VPR128:$src), - (MidTy OpFPR:$Rn), - (ResImm:$Imm))), - (INS (ResTy VPR128:$src), - (ResTy (SUBREG_TO_REG (i64 0), OpFPR:$Rn, SubIndex)), - ResImm:$Imm, - (i64 0))>; - -def : Pat <(NaTy (vector_insert - (NaTy VPR64:$src), - (MidTy OpFPR:$Rn), - (ResImm:$Imm))), - (NaTy (EXTRACT_SUBREG - (ResTy (INS - (ResTy (SUBREG_TO_REG (i64 0), (NaTy VPR64:$src), sub_64)), - (ResTy (SUBREG_TO_REG (i64 0), (MidTy OpFPR:$Rn), SubIndex)), - ResImm:$Imm, - (i64 0))), - sub_64))>; -} - -defm : Neon_INS_elt_float_pattern<v4f32, v2f32, f32, FPR32, neon_uimm2_bare, - sub_32, INSELs>; -defm : Neon_INS_elt_float_pattern<v2f64, v1f64, f64, FPR64, neon_uimm1_bare, - sub_64, INSELd>; - -class NeonI_SMOV<string asmop, string Res, bit Q, - ValueType OpTy, ValueType eleTy, - Operand OpImm, RegisterClass ResGPR, ValueType ResTy> - : NeonI_copy<Q, 0b0, 0b0101, - (outs ResGPR:$Rd), (ins VPR128:$Rn, OpImm:$Imm), - asmop # "\t$Rd, $Rn." 
# Res # "[$Imm]", - [(set (ResTy ResGPR:$Rd), - (ResTy (sext_inreg - (ResTy (vector_extract - (OpTy VPR128:$Rn), (OpImm:$Imm))), - eleTy)))], - NoItinerary> { - bits<4> Imm; -} - -//Signed integer move (main, from element) -def SMOVwb : NeonI_SMOV<"smov", "b", 0b0, v16i8, i8, neon_uimm4_bare, - GPR32, i32> { - let Inst{20-16} = {Imm{3}, Imm{2}, Imm{1}, Imm{0}, 0b1}; -} -def SMOVwh : NeonI_SMOV<"smov", "h", 0b0, v8i16, i16, neon_uimm3_bare, - GPR32, i32> { - let Inst{20-16} = {Imm{2}, Imm{1}, Imm{0}, 0b1, 0b0}; -} -def SMOVxb : NeonI_SMOV<"smov", "b", 0b1, v16i8, i8, neon_uimm4_bare, - GPR64, i64> { - let Inst{20-16} = {Imm{3}, Imm{2}, Imm{1}, Imm{0}, 0b1}; -} -def SMOVxh : NeonI_SMOV<"smov", "h", 0b1, v8i16, i16, neon_uimm3_bare, - GPR64, i64> { - let Inst{20-16} = {Imm{2}, Imm{1}, Imm{0}, 0b1, 0b0}; -} -def SMOVxs : NeonI_SMOV<"smov", "s", 0b1, v4i32, i32, neon_uimm2_bare, - GPR64, i64> { - let Inst{20-16} = {Imm{1}, Imm{0}, 0b1, 0b0, 0b0}; -} - -multiclass Neon_SMOVx_pattern <ValueType StTy, ValueType NaTy, - ValueType eleTy, Operand StImm, Operand NaImm, - Instruction SMOVI> { - def : Pat<(i64 (sext_inreg - (i64 (anyext - (i32 (vector_extract - (StTy VPR128:$Rn), (StImm:$Imm))))), - eleTy)), - (SMOVI VPR128:$Rn, StImm:$Imm)>; - - def : Pat<(i64 (sext - (i32 (vector_extract - (StTy VPR128:$Rn), (StImm:$Imm))))), - (SMOVI VPR128:$Rn, StImm:$Imm)>; - - def : Pat<(i64 (sext_inreg - (i64 (vector_extract - (NaTy VPR64:$Rn), (NaImm:$Imm))), - eleTy)), - (SMOVI (StTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), - NaImm:$Imm)>; - - def : Pat<(i64 (sext_inreg - (i64 (anyext - (i32 (vector_extract - (NaTy VPR64:$Rn), (NaImm:$Imm))))), - eleTy)), - (SMOVI (StTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), - NaImm:$Imm)>; - - def : Pat<(i64 (sext - (i32 (vector_extract - (NaTy VPR64:$Rn), (NaImm:$Imm))))), - (SMOVI (StTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), - NaImm:$Imm)>; -} - -defm : Neon_SMOVx_pattern<v16i8, v8i8, i8, neon_uimm4_bare, - neon_uimm3_bare, SMOVxb>; -defm : Neon_SMOVx_pattern<v8i16, v4i16, i16, neon_uimm3_bare, - neon_uimm2_bare, SMOVxh>; -defm : Neon_SMOVx_pattern<v4i32, v2i32, i32, neon_uimm2_bare, - neon_uimm1_bare, SMOVxs>; - -class Neon_SMOVw_pattern <ValueType StTy, ValueType NaTy, - ValueType eleTy, Operand StImm, Operand NaImm, - Instruction SMOVI> - : Pat<(i32 (sext_inreg - (i32 (vector_extract - (NaTy VPR64:$Rn), (NaImm:$Imm))), - eleTy)), - (SMOVI (StTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), - NaImm:$Imm)>; - -def : Neon_SMOVw_pattern<v16i8, v8i8, i8, neon_uimm4_bare, - neon_uimm3_bare, SMOVwb>; -def : Neon_SMOVw_pattern<v8i16, v4i16, i16, neon_uimm3_bare, - neon_uimm2_bare, SMOVwh>; - -class NeonI_UMOV<string asmop, string Res, bit Q, - ValueType OpTy, Operand OpImm, - RegisterClass ResGPR, ValueType ResTy> - : NeonI_copy<Q, 0b0, 0b0111, - (outs ResGPR:$Rd), (ins VPR128:$Rn, OpImm:$Imm), - asmop # "\t$Rd, $Rn." 
# Res # "[$Imm]", - [(set (ResTy ResGPR:$Rd), - (ResTy (vector_extract - (OpTy VPR128:$Rn), (OpImm:$Imm))))], - NoItinerary> { - bits<4> Imm; -} - -//Unsigned integer move (main, from element) -def UMOVwb : NeonI_UMOV<"umov", "b", 0b0, v16i8, neon_uimm4_bare, - GPR32, i32> { - let Inst{20-16} = {Imm{3}, Imm{2}, Imm{1}, Imm{0}, 0b1}; -} -def UMOVwh : NeonI_UMOV<"umov", "h", 0b0, v8i16, neon_uimm3_bare, - GPR32, i32> { - let Inst{20-16} = {Imm{2}, Imm{1}, Imm{0}, 0b1, 0b0}; -} -def UMOVws : NeonI_UMOV<"umov", "s", 0b0, v4i32, neon_uimm2_bare, - GPR32, i32> { - let Inst{20-16} = {Imm{1}, Imm{0}, 0b1, 0b0, 0b0}; -} -def UMOVxd : NeonI_UMOV<"umov", "d", 0b1, v2i64, neon_uimm1_bare, - GPR64, i64> { - let Inst{20-16} = {Imm, 0b1, 0b0, 0b0, 0b0}; -} - -def : NeonInstAlias<"mov $Rd, $Rn.s[$Imm]", - (UMOVws GPR32:$Rd, VPR128:$Rn, neon_uimm2_bare:$Imm), 0>; -def : NeonInstAlias<"mov $Rd, $Rn.d[$Imm]", - (UMOVxd GPR64:$Rd, VPR128:$Rn, neon_uimm1_bare:$Imm), 0>; - -class Neon_UMOV_pattern <ValueType StTy, ValueType NaTy, ValueType ResTy, - Operand StImm, Operand NaImm, - Instruction SMOVI> - : Pat<(ResTy (vector_extract - (NaTy VPR64:$Rn), NaImm:$Imm)), - (SMOVI (StTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), - NaImm:$Imm)>; - -def : Neon_UMOV_pattern<v16i8, v8i8, i32, neon_uimm4_bare, - neon_uimm3_bare, UMOVwb>; -def : Neon_UMOV_pattern<v8i16, v4i16, i32, neon_uimm3_bare, - neon_uimm2_bare, UMOVwh>; -def : Neon_UMOV_pattern<v4i32, v2i32, i32, neon_uimm2_bare, - neon_uimm1_bare, UMOVws>; - -def : Pat<(i32 (and - (i32 (vector_extract - (v16i8 VPR128:$Rn), (neon_uimm4_bare:$Imm))), - 255)), - (UMOVwb VPR128:$Rn, neon_uimm4_bare:$Imm)>; - -def : Pat<(i32 (and - (i32 (vector_extract - (v8i16 VPR128:$Rn), (neon_uimm3_bare:$Imm))), - 65535)), - (UMOVwh VPR128:$Rn, neon_uimm3_bare:$Imm)>; - -def : Pat<(i64 (zext - (i32 (vector_extract - (v2i64 VPR128:$Rn), (neon_uimm1_bare:$Imm))))), - (UMOVxd VPR128:$Rn, neon_uimm1_bare:$Imm)>; - -def : Pat<(i32 (and - (i32 (vector_extract - (v8i8 VPR64:$Rn), (neon_uimm3_bare:$Imm))), - 255)), - (UMOVwb (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64), - neon_uimm3_bare:$Imm)>; - -def : Pat<(i32 (and - (i32 (vector_extract - (v4i16 VPR64:$Rn), (neon_uimm2_bare:$Imm))), - 65535)), - (UMOVwh (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64), - neon_uimm2_bare:$Imm)>; - -def : Pat<(i64 (zext - (i32 (vector_extract - (v1i64 VPR64:$Rn), (neon_uimm0_bare:$Imm))))), - (UMOVxd (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64), - neon_uimm0_bare:$Imm)>; - -// Additional copy patterns for scalar types -def : Pat<(i32 (vector_extract (v1i8 FPR8:$Rn), (i64 0))), - (UMOVwb (v16i8 - (SUBREG_TO_REG (i64 0), FPR8:$Rn, sub_8)), (i64 0))>; - -def : Pat<(i32 (vector_extract (v1i16 FPR16:$Rn), (i64 0))), - (UMOVwh (v8i16 - (SUBREG_TO_REG (i64 0), FPR16:$Rn, sub_16)), (i64 0))>; - -def : Pat<(i32 (vector_extract (v1i32 FPR32:$Rn), (i64 0))), - (FMOVws FPR32:$Rn)>; - -def : Pat<(i64 (vector_extract (v1i64 FPR64:$Rn), (i64 0))), - (FMOVxd FPR64:$Rn)>; - -def : Pat<(f64 (vector_extract (v1f64 FPR64:$Rn), (i64 0))), - (f64 FPR64:$Rn)>; - -def : Pat<(f32 (vector_extract (v1f32 FPR32:$Rn), (i64 0))), - (f32 FPR32:$Rn)>; - -def : Pat<(v1i8 (scalar_to_vector GPR32:$Rn)), - (v1i8 (EXTRACT_SUBREG (v16i8 - (INSbw (v16i8 (IMPLICIT_DEF)), $Rn, (i64 0))), - sub_8))>; - -def : Pat<(v1i16 (scalar_to_vector GPR32:$Rn)), - (v1i16 (EXTRACT_SUBREG (v8i16 - (INShw (v8i16 (IMPLICIT_DEF)), $Rn, (i64 0))), - sub_16))>; - -def : Pat<(v1i32 (scalar_to_vector GPR32:$src)), - (FMOVsw $src)>; - -def : Pat<(v1i64 (scalar_to_vector 
GPR64:$src)), - (FMOVdx $src)>; - -def : Pat<(v1f64 (scalar_to_vector (f64 FPR64:$src))), - (FMOVdd $src)>; -def : Pat<(v1f32 (scalar_to_vector (f32 FPR32:$src))), - (FMOVss $src)>; - -def : Pat<(v2f64 (scalar_to_vector (f64 FPR64:$src))), - (INSERT_SUBREG (v2f64 (IMPLICIT_DEF)), - (f64 FPR64:$src), sub_64)>; - -class NeonI_DUP_Elt<bit Q, string asmop, string rdlane, string rnlane, - RegisterOperand ResVPR, Operand OpImm> - : NeonI_copy<Q, 0b0, 0b0000, (outs ResVPR:$Rd), - (ins VPR128:$Rn, OpImm:$Imm), - asmop # "\t$Rd" # rdlane # ", $Rn" # rnlane # "[$Imm]", - [], - NoItinerary> { - bits<4> Imm; -} - -def DUPELT16b : NeonI_DUP_Elt<0b1, "dup", ".16b", ".b", VPR128, - neon_uimm4_bare> { - let Inst{20-16} = {Imm{3}, Imm{2}, Imm{1}, Imm{0}, 0b1}; -} - -def DUPELT8h : NeonI_DUP_Elt<0b1, "dup", ".8h", ".h", VPR128, - neon_uimm3_bare> { - let Inst{20-16} = {Imm{2}, Imm{1}, Imm{0}, 0b1, 0b0}; -} - -def DUPELT4s : NeonI_DUP_Elt<0b1, "dup", ".4s", ".s", VPR128, - neon_uimm2_bare> { - let Inst{20-16} = {Imm{1}, Imm{0}, 0b1, 0b0, 0b0}; -} - -def DUPELT2d : NeonI_DUP_Elt<0b1, "dup", ".2d", ".d", VPR128, - neon_uimm1_bare> { - let Inst{20-16} = {Imm, 0b1, 0b0, 0b0, 0b0}; -} - -def DUPELT8b : NeonI_DUP_Elt<0b0, "dup", ".8b", ".b", VPR64, - neon_uimm4_bare> { - let Inst{20-16} = {Imm{3}, Imm{2}, Imm{1}, Imm{0}, 0b1}; -} - -def DUPELT4h : NeonI_DUP_Elt<0b0, "dup", ".4h", ".h", VPR64, - neon_uimm3_bare> { - let Inst{20-16} = {Imm{2}, Imm{1}, Imm{0}, 0b1, 0b0}; -} - -def DUPELT2s : NeonI_DUP_Elt<0b0, "dup", ".2s", ".s", VPR64, - neon_uimm2_bare> { - let Inst{20-16} = {Imm{1}, Imm{0}, 0b1, 0b0, 0b0}; -} - -multiclass NeonI_DUP_Elt_pattern<Instruction DUPELT, ValueType ResTy, - ValueType OpTy,ValueType NaTy, - ValueType ExTy, Operand OpLImm, - Operand OpNImm> { -def : Pat<(ResTy (Neon_vduplane (OpTy VPR128:$Rn), OpLImm:$Imm)), - (ResTy (DUPELT (OpTy VPR128:$Rn), OpLImm:$Imm))>; - -def : Pat<(ResTy (Neon_vduplane - (NaTy VPR64:$Rn), OpNImm:$Imm)), - (ResTy (DUPELT - (ExTy (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), OpNImm:$Imm))>; -} -defm : NeonI_DUP_Elt_pattern<DUPELT16b, v16i8, v16i8, v8i8, v16i8, - neon_uimm4_bare, neon_uimm3_bare>; -defm : NeonI_DUP_Elt_pattern<DUPELT8b, v8i8, v16i8, v8i8, v16i8, - neon_uimm4_bare, neon_uimm3_bare>; -defm : NeonI_DUP_Elt_pattern<DUPELT8h, v8i16, v8i16, v4i16, v8i16, - neon_uimm3_bare, neon_uimm2_bare>; -defm : NeonI_DUP_Elt_pattern<DUPELT4h, v4i16, v8i16, v4i16, v8i16, - neon_uimm3_bare, neon_uimm2_bare>; -defm : NeonI_DUP_Elt_pattern<DUPELT4s, v4i32, v4i32, v2i32, v4i32, - neon_uimm2_bare, neon_uimm1_bare>; -defm : NeonI_DUP_Elt_pattern<DUPELT2s, v2i32, v4i32, v2i32, v4i32, - neon_uimm2_bare, neon_uimm1_bare>; -defm : NeonI_DUP_Elt_pattern<DUPELT2d, v2i64, v2i64, v1i64, v2i64, - neon_uimm1_bare, neon_uimm0_bare>; -defm : NeonI_DUP_Elt_pattern<DUPELT4s, v4f32, v4f32, v2f32, v4f32, - neon_uimm2_bare, neon_uimm1_bare>; -defm : NeonI_DUP_Elt_pattern<DUPELT2s, v2f32, v4f32, v2f32, v4f32, - neon_uimm2_bare, neon_uimm1_bare>; -defm : NeonI_DUP_Elt_pattern<DUPELT2d, v2f64, v2f64, v1f64, v2f64, - neon_uimm1_bare, neon_uimm0_bare>; - -def : Pat<(v2f32 (Neon_vdup (f32 FPR32:$Rn))), - (v2f32 (DUPELT2s - (SUBREG_TO_REG (i64 0), FPR32:$Rn, sub_32), - (i64 0)))>; -def : Pat<(v4f32 (Neon_vdup (f32 FPR32:$Rn))), - (v4f32 (DUPELT4s - (SUBREG_TO_REG (i64 0), FPR32:$Rn, sub_32), - (i64 0)))>; -def : Pat<(v2f64 (Neon_vdup (f64 FPR64:$Rn))), - (v2f64 (DUPELT2d - (SUBREG_TO_REG (i64 0), FPR64:$Rn, sub_64), - (i64 0)))>; - -class NeonI_DUP<bit Q, string asmop, string rdlane, - RegisterOperand 
ResVPR, ValueType ResTy, - RegisterClass OpGPR, ValueType OpTy> - : NeonI_copy<Q, 0b0, 0b0001, (outs ResVPR:$Rd), (ins OpGPR:$Rn), - asmop # "\t$Rd" # rdlane # ", $Rn", - [(set (ResTy ResVPR:$Rd), - (ResTy (Neon_vdup (OpTy OpGPR:$Rn))))], - NoItinerary>; - -def DUP16b : NeonI_DUP<0b1, "dup", ".16b", VPR128, v16i8, GPR32, i32> { - let Inst{20-16} = 0b00001; - // bits 17-20 are unspecified, but should be set to zero. -} - -def DUP8h : NeonI_DUP<0b1, "dup", ".8h", VPR128, v8i16, GPR32, i32> { - let Inst{20-16} = 0b00010; - // bits 18-20 are unspecified, but should be set to zero. -} - -def DUP4s : NeonI_DUP<0b1, "dup", ".4s", VPR128, v4i32, GPR32, i32> { - let Inst{20-16} = 0b00100; - // bits 19-20 are unspecified, but should be set to zero. -} - -def DUP2d : NeonI_DUP<0b1, "dup", ".2d", VPR128, v2i64, GPR64, i64> { - let Inst{20-16} = 0b01000; - // bit 20 is unspecified, but should be set to zero. -} - -def DUP8b : NeonI_DUP<0b0, "dup", ".8b", VPR64, v8i8, GPR32, i32> { - let Inst{20-16} = 0b00001; - // bits 17-20 are unspecified, but should be set to zero. -} - -def DUP4h : NeonI_DUP<0b0, "dup", ".4h", VPR64, v4i16, GPR32, i32> { - let Inst{20-16} = 0b00010; - // bits 18-20 are unspecified, but should be set to zero. -} - -def DUP2s : NeonI_DUP<0b0, "dup", ".2s", VPR64, v2i32, GPR32, i32> { - let Inst{20-16} = 0b00100; - // bits 19-20 are unspecified, but should be set to zero. -} - -// patterns for CONCAT_VECTORS -multiclass Concat_Vector_Pattern<ValueType ResTy, ValueType OpTy> { -def : Pat<(ResTy (concat_vectors (OpTy VPR64:$Rn), undef)), - (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)>; -def : Pat<(ResTy (concat_vectors (OpTy VPR64:$Rn), (OpTy VPR64:$Rm))), - (INSELd - (v2i64 (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), - (v2i64 (SUBREG_TO_REG (i64 0), VPR64:$Rm, sub_64)), - (i64 1), - (i64 0))>; -def : Pat<(ResTy (concat_vectors (OpTy VPR64:$Rn), (OpTy VPR64:$Rn))), - (DUPELT2d - (v2i64 (SUBREG_TO_REG (i64 0), VPR64:$Rn, sub_64)), - (i64 0))> ; -} - -defm : Concat_Vector_Pattern<v16i8, v8i8>; -defm : Concat_Vector_Pattern<v8i16, v4i16>; -defm : Concat_Vector_Pattern<v4i32, v2i32>; -defm : Concat_Vector_Pattern<v2i64, v1i64>; -defm : Concat_Vector_Pattern<v4f32, v2f32>; -defm : Concat_Vector_Pattern<v2f64, v1f64>; - -//patterns for EXTRACT_SUBVECTOR -def : Pat<(v8i8 (extract_subvector (v16i8 VPR128:$Rn), (i64 0))), - (v8i8 (EXTRACT_SUBREG VPR128:$Rn, sub_64))>; -def : Pat<(v4i16 (extract_subvector (v8i16 VPR128:$Rn), (i64 0))), - (v4i16 (EXTRACT_SUBREG VPR128:$Rn, sub_64))>; -def : Pat<(v2i32 (extract_subvector (v4i32 VPR128:$Rn), (i64 0))), - (v2i32 (EXTRACT_SUBREG VPR128:$Rn, sub_64))>; -def : Pat<(v1i64 (extract_subvector (v2i64 VPR128:$Rn), (i64 0))), - (v1i64 (EXTRACT_SUBREG VPR128:$Rn, sub_64))>; -def : Pat<(v2f32 (extract_subvector (v4f32 VPR128:$Rn), (i64 0))), - (v2f32 (EXTRACT_SUBREG VPR128:$Rn, sub_64))>; -def : Pat<(v1f64 (extract_subvector (v2f64 VPR128:$Rn), (i64 0))), - (v1f64 (EXTRACT_SUBREG VPR128:$Rn, sub_64))>; - class NeonI_REV<string asmop, string Res, bits<2> size, bit Q, bit U, bits<5> opcode, RegisterOperand ResVPR, ValueType ResTy, SDPatternOperator Neon_Rev> diff --git a/test/CodeGen/AArch64/neon-2velem-high.ll b/test/CodeGen/AArch64/neon-2velem-high.ll new file mode 100644 index 0000000..97031d9 --- /dev/null +++ b/test/CodeGen/AArch64/neon-2velem-high.ll @@ -0,0 +1,331 @@ +; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s + +declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x 
float>) + +declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) + +declare <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32>, <2 x i32>) + +declare <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64>, <2 x i64>) + +declare <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16>, <4 x i16>) + +declare <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32>, <4 x i32>) + +declare <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64>, <2 x i64>) + +declare <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32>, <4 x i32>) + +declare <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32>, <2 x i32>) + +declare <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16>, <4 x i16>) + +declare <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32>, <2 x i32>) + +declare <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16>, <4 x i16>) + +define <4 x i32> @test_vmull_high_n_s16(<8 x i16> %a, i16 %b) { +; CHECK: test_vmull_high_n_s16: +; CHECK: smull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] +entry: + %shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %vecinit.i.i = insertelement <4 x i16> undef, i16 %b, i32 0 + %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %b, i32 1 + %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %b, i32 2 + %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %b, i32 3 + %vmull15.i.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i) + ret <4 x i32> %vmull15.i.i +} + +define <2 x i64> @test_vmull_high_n_s32(<4 x i32> %a, i32 %b) { +; CHECK: test_vmull_high_n_s32: +; CHECK: smull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] +entry: + %shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3> + %vecinit.i.i = insertelement <2 x i32> undef, i32 %b, i32 0 + %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %b, i32 1 + %vmull9.i.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i) + ret <2 x i64> %vmull9.i.i +} + +define <4 x i32> @test_vmull_high_n_u16(<8 x i16> %a, i16 %b) { +; CHECK: test_vmull_high_n_u16: +; CHECK: umull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] +entry: + %shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + %vecinit.i.i = insertelement <4 x i16> undef, i16 %b, i32 0 + %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %b, i32 1 + %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %b, i32 2 + %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %b, i32 3 + %vmull15.i.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i) + ret <4 x i32> %vmull15.i.i +} + +define <2 x i64> @test_vmull_high_n_u32(<4 x i32> %a, i32 %b) { +; CHECK: test_vmull_high_n_u32: +; CHECK: umull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0] +entry: + %shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3> + %vecinit.i.i = insertelement <2 x i32> undef, i32 %b, i32 0 + %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %b, i32 1 + %vmull9.i.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i) + ret <2 x i64> %vmull9.i.i +} + +define <4 x i32> @test_vqdmull_high_n_s16(<8 x i16> %a, i16 %b) { +; CHECK: test_vqdmull_high_n_s16: +; CHECK: sqdmull2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[0] +entry: + %shuffle.i.i = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7> + 
+  %vecinit.i.i = insertelement <4 x i16> undef, i16 %b, i32 0
+  %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %b, i32 1
+  %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %b, i32 2
+  %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %b, i32 3
+  %vqdmull15.i.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
+  ret <4 x i32> %vqdmull15.i.i
+}
+
+define <2 x i64> @test_vqdmull_high_n_s32(<4 x i32> %a, i32 %b) {
+; CHECK: test_vqdmull_high_n_s32:
+; CHECK: sqdmull2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+entry:
+  %shuffle.i.i = shufflevector <4 x i32> %a, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+  %vecinit.i.i = insertelement <2 x i32> undef, i32 %b, i32 0
+  %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %b, i32 1
+  %vqdmull9.i.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
+  ret <2 x i64> %vqdmull9.i.i
+}
+
+define <4 x i32> @test_vmlal_high_n_s16(<4 x i32> %a, <8 x i16> %b, i16 %c) {
+; CHECK: test_vmlal_high_n_s16:
+; CHECK: smlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[{{[0-9]+}}]
+entry:
+  %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %vecinit.i.i = insertelement <4 x i16> undef, i16 %c, i32 0
+  %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %c, i32 1
+  %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %c, i32 2
+  %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %c, i32 3
+  %vmull2.i.i.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
+  %add.i.i = add <4 x i32> %vmull2.i.i.i, %a
+  ret <4 x i32> %add.i.i
+}
+
+define <2 x i64> @test_vmlal_high_n_s32(<2 x i64> %a, <4 x i32> %b, i32 %c) {
+; CHECK: test_vmlal_high_n_s32:
+; CHECK: smlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}]
+entry:
+  %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+  %vecinit.i.i = insertelement <2 x i32> undef, i32 %c, i32 0
+  %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %c, i32 1
+  %vmull2.i.i.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
+  %add.i.i = add <2 x i64> %vmull2.i.i.i, %a
+  ret <2 x i64> %add.i.i
+}
+
+define <4 x i32> @test_vmlal_high_n_u16(<4 x i32> %a, <8 x i16> %b, i16 %c) {
+; CHECK: test_vmlal_high_n_u16:
+; CHECK: umlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[{{[0-9]+}}]
+entry:
+  %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %vecinit.i.i = insertelement <4 x i16> undef, i16 %c, i32 0
+  %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %c, i32 1
+  %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %c, i32 2
+  %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %c, i32 3
+  %vmull2.i.i.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
+  %add.i.i = add <4 x i32> %vmull2.i.i.i, %a
+  ret <4 x i32> %add.i.i
+}
+
+define <2 x i64> @test_vmlal_high_n_u32(<2 x i64> %a, <4 x i32> %b, i32 %c) {
+; CHECK: test_vmlal_high_n_u32:
+; CHECK: umlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}]
+entry:
+  %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+  %vecinit.i.i = insertelement <2 x i32> undef, i32 %c, i32 0
+  %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %c, i32 1
+  %vmull2.i.i.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
+  %add.i.i = add <2 x i64> %vmull2.i.i.i, %a
+  ret <2 x i64> %add.i.i
+}
+
+define <4 x i32> @test_vqdmlal_high_n_s16(<4 x i32> %a, <8 x i16> %b, i16 %c) {
+; CHECK: test_vqdmlal_high_n_s16:
+; CHECK: sqdmlal2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[{{[0-9]+}}]
+entry:
+  %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %vecinit.i.i = insertelement <4 x i16> undef, i16 %c, i32 0
+  %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %c, i32 1
+  %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %c, i32 2
+  %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %c, i32 3
+  %vqdmlal15.i.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
+  %vqdmlal17.i.i = tail call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %a, <4 x i32> %vqdmlal15.i.i)
+  ret <4 x i32> %vqdmlal17.i.i
+}
+
+define <2 x i64> @test_vqdmlal_high_n_s32(<2 x i64> %a, <4 x i32> %b, i32 %c) {
+; CHECK: test_vqdmlal_high_n_s32:
+; CHECK: sqdmlal2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}]
+entry:
+  %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+  %vecinit.i.i = insertelement <2 x i32> undef, i32 %c, i32 0
+  %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %c, i32 1
+  %vqdmlal9.i.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
+  %vqdmlal11.i.i = tail call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %a, <2 x i64> %vqdmlal9.i.i)
+  ret <2 x i64> %vqdmlal11.i.i
+}
+
+define <4 x i32> @test_vmlsl_high_n_s16(<4 x i32> %a, <8 x i16> %b, i16 %c) {
+; CHECK: test_vmlsl_high_n_s16:
+; CHECK: smlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[{{[0-9]+}}]
+entry:
+  %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %vecinit.i.i = insertelement <4 x i16> undef, i16 %c, i32 0
+  %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %c, i32 1
+  %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %c, i32 2
+  %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %c, i32 3
+  %vmull2.i.i.i = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
+  %sub.i.i = sub <4 x i32> %a, %vmull2.i.i.i
+  ret <4 x i32> %sub.i.i
+}
+
+define <2 x i64> @test_vmlsl_high_n_s32(<2 x i64> %a, <4 x i32> %b, i32 %c) {
+; CHECK: test_vmlsl_high_n_s32:
+; CHECK: smlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}]
+entry:
+  %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+  %vecinit.i.i = insertelement <2 x i32> undef, i32 %c, i32 0
+  %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %c, i32 1
+  %vmull2.i.i.i = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
+  %sub.i.i = sub <2 x i64> %a, %vmull2.i.i.i
+  ret <2 x i64> %sub.i.i
+}
+
+define <4 x i32> @test_vmlsl_high_n_u16(<4 x i32> %a, <8 x i16> %b, i16 %c) {
+; CHECK: test_vmlsl_high_n_u16:
+; CHECK: umlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[{{[0-9]+}}]
+entry:
+  %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %vecinit.i.i = insertelement <4 x i16> undef, i16 %c, i32 0
+  %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %c, i32 1
+  %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %c, i32 2
+  %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %c, i32 3
+  %vmull2.i.i.i = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
+  %sub.i.i = sub <4 x i32> %a, %vmull2.i.i.i
+  ret <4 x i32> %sub.i.i
+}
+
+define <2 x i64> @test_vmlsl_high_n_u32(<2 x i64> %a, <4 x i32> %b, i32 %c) {
+; CHECK: test_vmlsl_high_n_u32:
+; CHECK: umlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}]
+entry:
+  %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+  %vecinit.i.i = insertelement <2 x i32> undef, i32 %c, i32 0
+  %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %c, i32 1
+  %vmull2.i.i.i = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
+  %sub.i.i = sub <2 x i64> %a, %vmull2.i.i.i
+  ret <2 x i64> %sub.i.i
+}
+
+define <4 x i32> @test_vqdmlsl_high_n_s16(<4 x i32> %a, <8 x i16> %b, i16 %c) {
+; CHECK: test_vqdmlsl_high_n_s16:
+; CHECK: sqdmlsl2 {{v[0-9]+}}.4s, {{v[0-9]+}}.8h, {{v[0-9]+}}.h[{{[0-9]+}}]
+entry:
+  %shuffle.i.i = shufflevector <8 x i16> %b, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %vecinit.i.i = insertelement <4 x i16> undef, i16 %c, i32 0
+  %vecinit1.i.i = insertelement <4 x i16> %vecinit.i.i, i16 %c, i32 1
+  %vecinit2.i.i = insertelement <4 x i16> %vecinit1.i.i, i16 %c, i32 2
+  %vecinit3.i.i = insertelement <4 x i16> %vecinit2.i.i, i16 %c, i32 3
+  %vqdmlsl15.i.i = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %shuffle.i.i, <4 x i16> %vecinit3.i.i)
+  %vqdmlsl17.i.i = tail call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %a, <4 x i32> %vqdmlsl15.i.i)
+  ret <4 x i32> %vqdmlsl17.i.i
+}
+
+define <2 x i64> @test_vqdmlsl_high_n_s32(<2 x i64> %a, <4 x i32> %b, i32 %c) {
+; CHECK: test_vqdmlsl_high_n_s32:
+; CHECK: sqdmlsl2 {{v[0-9]+}}.2d, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}]
+entry:
+  %shuffle.i.i = shufflevector <4 x i32> %b, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+  %vecinit.i.i = insertelement <2 x i32> undef, i32 %c, i32 0
+  %vecinit1.i.i = insertelement <2 x i32> %vecinit.i.i, i32 %c, i32 1
+  %vqdmlsl9.i.i = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %shuffle.i.i, <2 x i32> %vecinit1.i.i)
+  %vqdmlsl11.i.i = tail call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %a, <2 x i64> %vqdmlsl9.i.i)
+  ret <2 x i64> %vqdmlsl11.i.i
+}
+
+define <2 x float> @test_vmul_n_f32(<2 x float> %a, float %b) {
+; CHECK: test_vmul_n_f32:
+; CHECK: fmul {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[0]
+entry:
+  %vecinit.i = insertelement <2 x float> undef, float %b, i32 0
+  %vecinit1.i = insertelement <2 x float> %vecinit.i, float %b, i32 1
+  %mul.i = fmul <2 x float> %vecinit1.i, %a
+  ret <2 x float> %mul.i
+}
+
+define <4 x float> @test_vmulq_n_f32(<4 x float> %a, float %b) {
+; CHECK: test_vmulq_n_f32:
+; CHECK: fmul {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[0]
+entry:
+  %vecinit.i = insertelement <4 x float> undef, float %b, i32 0
+  %vecinit1.i = insertelement <4 x float> %vecinit.i, float %b, i32 1
+  %vecinit2.i = insertelement <4 x float> %vecinit1.i, float %b, i32 2
+  %vecinit3.i = insertelement <4 x float> %vecinit2.i, float %b, i32 3
+  %mul.i = fmul <4 x float> %vecinit3.i, %a
+  ret <4 x float> %mul.i
+}
+
+define <2 x double> @test_vmulq_n_f64(<2 x double> %a, double %b) {
+; CHECK: test_vmulq_n_f64:
+; CHECK: fmul {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.d[0]
+entry:
+  %vecinit.i = insertelement <2 x double> undef, double %b, i32 0
+  %vecinit1.i = insertelement <2 x double> %vecinit.i, double %b, i32 1
+  %mul.i = fmul <2 x double> %vecinit1.i, %a
+  ret <2 x double> %mul.i
+}
+
+define <2 x float> @test_vfma_n_f32(<2 x float> %a, <2 x float> %b, float %n) {
+; CHECK: test_vfma_n_f32:
+; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[{{[0-9]+}}]
+entry:
+  %vecinit.i = insertelement <2 x float> undef, float %n, i32 0
+  %vecinit1.i = insertelement <2 x float> %vecinit.i, float %n, i32 1
+  %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %b, <2 x float> %vecinit1.i, <2 x float> %a)
+  ret <2 x float> %0
+}
+
+define <4 x float> @test_vfmaq_n_f32(<4 x float> %a, <4 x float> %b, float %n) {
+; CHECK: test_vfmaq_n_f32:
+; CHECK: fmla {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}]
+entry:
+  %vecinit.i = insertelement <4 x float> undef, float %n, i32 0
+  %vecinit1.i = insertelement <4 x float> %vecinit.i, float %n, i32 1
+  %vecinit2.i = insertelement <4 x float> %vecinit1.i, float %n, i32 2
+  %vecinit3.i = insertelement <4 x float> %vecinit2.i, float %n, i32 3
+  %0 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %b, <4 x float> %vecinit3.i, <4 x float> %a)
+  ret <4 x float> %0
+}
+
+define <2 x float> @test_vfms_n_f32(<2 x float> %a, <2 x float> %b, float %n) {
+; CHECK: test_vfms_n_f32:
+; CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[{{[0-9]+}}]
+entry:
+  %vecinit.i = insertelement <2 x float> undef, float %n, i32 0
+  %vecinit1.i = insertelement <2 x float> %vecinit.i, float %n, i32 1
+  %0 = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %b
+  %1 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %0, <2 x float> %vecinit1.i, <2 x float> %a)
+  ret <2 x float> %1
+}
+
+define <4 x float> @test_vfmsq_n_f32(<4 x float> %a, <4 x float> %b, float %n) {
+; CHECK: test_vfmsq_n_f32:
+; CHECK: fmls {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.s[{{[0-9]+}}]
+entry:
+  %vecinit.i = insertelement <4 x float> undef, float %n, i32 0
+  %vecinit1.i = insertelement <4 x float> %vecinit.i, float %n, i32 1
+  %vecinit2.i = insertelement <4 x float> %vecinit1.i, float %n, i32 2
+  %vecinit3.i = insertelement <4 x float> %vecinit2.i, float %n, i32 3
+  %0 = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %b
+  %1 = tail call <4 x float> @llvm.fma.v4f32(<4 x float> %0, <4 x float> %vecinit3.i, <4 x float> %a)
+  ret <4 x float> %1
+}
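
For context (not part of the diff): the IR above is the shape clang emits for the AArch64 ACLE *_high_n_* and *_n_* intrinsics. A chain of insertelement splats the scalar operand, a shufflevector extracts the high half of the source vector, and a widening multiply (or fmul/fma) consumes both; the patterns added in AArch64InstrNEON.td are expected to fold that whole sequence into a single by-element instruction, with no separate dup. A minimal C sketch of the source-level form, assuming clang with arm_neon.h; the function names and compile line below are illustrative, not taken from the patch:

    /* Illustrative sketch only; assumes an AArch64 clang, e.g.
     *   clang -O2 -target aarch64-none-linux-gnu -S example.c
     * The intrinsics are standard ACLE; the wrapper names are made up. */
    #include <arm_neon.h>

    /* Should exercise the test_vmlal_high_n_s16 pattern:
     * one smlal2 by-element instruction. */
    int32x4_t widen_mac_high(int32x4_t acc, int16x8_t v, int16_t s) {
      return vmlal_high_n_s16(acc, v, s);
    }

    /* Should exercise the test_vmulq_n_f32 pattern:
     * one fmul by-element instruction. */
    float32x4_t scale(float32x4_t v, float s) {
      return vmulq_n_f32(v, s);
    }

If the new patterns match, the assembly for functions like these should contain exactly the smlal2/fmul by-element forms the CHECK lines above expect.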