-rw-r--r--  include/llvm/IntrinsicsARM.td        |  6
-rw-r--r--  lib/Target/ARM/ARMISelLowering.cpp   |  7
-rw-r--r--  lib/VMCore/AutoUpgrade.cpp           |  1
-rw-r--r--  test/Bitcode/neon-intrinsics.ll      | 13
-rw-r--r--  test/CodeGen/ARM/vmul.ll             | 98
5 files changed, 113 insertions, 12 deletions
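The new signed and unsigned widening-multiply intrinsics are exercised straight from IR by the updated tests below. As a minimal standalone sketch (the function name and RUN line here are illustrative, not taken from this commit), a call written against @llvm.arm.neon.vmulls.v4i32 is no longer auto-upgraded to sext/zext plus mul and should now select to vmull.s16:

; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s

declare <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16>, <4 x i16>) nounwind readnone

define <4 x i32> @widening_mul_sketch(<4 x i16> %a, <4 x i16> %b) nounwind {
; CHECK: widening_mul_sketch:
; CHECK: vmull.s16
  %prod = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %a, <4 x i16> %b)
  ret <4 x i32> %prod
}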
diff --git a/include/llvm/IntrinsicsARM.td b/include/llvm/IntrinsicsARM.td
index 823f095..c93dd5a 100644
--- a/include/llvm/IntrinsicsARM.td
+++ b/include/llvm/IntrinsicsARM.td
@@ -129,8 +129,12 @@ let Properties = [IntrNoMem, Commutative] in {
   def int_arm_neon_vmulp : Neon_2Arg_Intrinsic;
   def int_arm_neon_vqdmulh : Neon_2Arg_Intrinsic;
   def int_arm_neon_vqrdmulh : Neon_2Arg_Intrinsic;
+  def int_arm_neon_vmulls : Neon_2Arg_Long_Intrinsic;
+  def int_arm_neon_vmullu : Neon_2Arg_Long_Intrinsic;
   def int_arm_neon_vmullp : Neon_2Arg_Long_Intrinsic;
   def int_arm_neon_vqdmull : Neon_2Arg_Long_Intrinsic;
+
+  // Vector Multiply and Accumulate/Subtract.
   def int_arm_neon_vqdmlal : Neon_3Arg_Long_Intrinsic;
   def int_arm_neon_vqdmlsl : Neon_3Arg_Long_Intrinsic;

@@ -292,7 +296,7 @@ def int_arm_neon_vcvtfp2hf
 def int_arm_neon_vcvthf2fp
     : Intrinsic<[llvm_v4f32_ty], [llvm_v4i16_ty], [IntrNoMem]>;

-// Narrowing Saturating Vector Moves.
+// Narrowing and Lengthening Vector Moves.
 def int_arm_neon_vqmovns : Neon_1Arg_Narrow_Intrinsic;
 def int_arm_neon_vqmovnu : Neon_1Arg_Narrow_Intrinsic;
 def int_arm_neon_vqmovnsu : Neon_1Arg_Narrow_Intrinsic;
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index 5ecd9b9..9dc103f 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -2157,6 +2157,13 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
     }
     return Result;
   }
+  case Intrinsic::arm_neon_vmulls:
+  case Intrinsic::arm_neon_vmullu: {
+    unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmulls)
+      ? ARMISD::VMULLs : ARMISD::VMULLu;
+    return DAG.getNode(NewOpc, Op.getDebugLoc(), Op.getValueType(),
+                       Op.getOperand(1), Op.getOperand(2));
+  }
   }
 }
diff --git a/lib/VMCore/AutoUpgrade.cpp b/lib/VMCore/AutoUpgrade.cpp
index b323540..4e578ed 100644
--- a/lib/VMCore/AutoUpgrade.cpp
+++ b/lib/VMCore/AutoUpgrade.cpp
@@ -84,7 +84,6 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
         Name.compare(14, 5, "vsubl", 5) == 0 ||
         Name.compare(14, 5, "vaddw", 5) == 0 ||
         Name.compare(14, 5, "vsubw", 5) == 0 ||
-        Name.compare(14, 5, "vmull", 5) == 0 ||
         Name.compare(14, 5, "vmlal", 5) == 0 ||
         Name.compare(14, 5, "vmlsl", 5) == 0 ||
         Name.compare(14, 5, "vabdl", 5) == 0 ||
diff --git a/test/Bitcode/neon-intrinsics.ll b/test/Bitcode/neon-intrinsics.ll
index 272cd42..feb2d74 100644
--- a/test/Bitcode/neon-intrinsics.ll
+++ b/test/Bitcode/neon-intrinsics.ll
@@ -76,20 +76,13 @@
 ; CHECK: zext <4 x i16>
 ; CHECK-NEXT: sub <4 x i32>

-; vmull should be auto-upgraded to multiply with sext/zext
-; (but vmullp should remain an intrinsic)
+; vmull* intrinsics will remain intrinsics

 ; CHECK: vmulls8
-; CHECK-NOT: arm.neon.vmulls.v8i16
-; CHECK: sext <8 x i8>
-; CHECK-NEXT: sext <8 x i8>
-; CHECK-NEXT: mul <8 x i16>
+; CHECK: arm.neon.vmulls.v8i16

 ; CHECK: vmullu16
-; CHECK-NOT: arm.neon.vmullu.v4i32
-; CHECK: zext <4 x i16>
-; CHECK-NEXT: zext <4 x i16>
-; CHECK-NEXT: mul <4 x i32>
+; CHECK: arm.neon.vmullu.v4i32

 ; CHECK: vmullp8
 ; CHECK: arm.neon.vmullp.v8i16
diff --git a/test/CodeGen/ARM/vmul.ll b/test/CodeGen/ARM/vmul.ll
index 585394e..80ba9be 100644
--- a/test/CodeGen/ARM/vmul.ll
+++ b/test/CodeGen/ARM/vmul.ll
@@ -158,6 +158,15 @@ define <8 x i16> @vmulls8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
   ret <8 x i16> %tmp5
 }

+define <8 x i16> @vmulls8_int(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK: vmulls8_int:
+;CHECK: vmull.s8
+  %tmp1 = load <8 x i8>* %A
+  %tmp2 = load <8 x i8>* %B
+  %tmp3 = call <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
+  ret <8 x i16> %tmp3
+}
+
 define <4 x i32> @vmulls16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
 ;CHECK: vmulls16:
 ;CHECK: vmull.s16
@@ -169,6 +178,15 @@ define <4 x i32> @vmulls16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
   ret <4 x i32> %tmp5
 }

+define <4 x i32> @vmulls16_int(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK: vmulls16_int:
+;CHECK: vmull.s16
+  %tmp1 = load <4 x i16>* %A
+  %tmp2 = load <4 x i16>* %B
+  %tmp3 = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
+  ret <4 x i32> %tmp3
+}
+
 define <2 x i64> @vmulls32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
 ;CHECK: vmulls32:
 ;CHECK: vmull.s32
@@ -180,6 +198,15 @@ define <2 x i64> @vmulls32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
   ret <2 x i64> %tmp5
 }

+define <2 x i64> @vmulls32_int(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK: vmulls32_int:
+;CHECK: vmull.s32
+  %tmp1 = load <2 x i32>* %A
+  %tmp2 = load <2 x i32>* %B
+  %tmp3 = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
+  ret <2 x i64> %tmp3
+}
+
 define <8 x i16> @vmullu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
 ;CHECK: vmullu8:
 ;CHECK: vmull.u8
@@ -191,6 +218,15 @@ define <8 x i16> @vmullu8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
   ret <8 x i16> %tmp5
 }

+define <8 x i16> @vmullu8_int(<8 x i8>* %A, <8 x i8>* %B) nounwind {
+;CHECK: vmullu8_int:
+;CHECK: vmull.u8
+  %tmp1 = load <8 x i8>* %A
+  %tmp2 = load <8 x i8>* %B
+  %tmp3 = call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2)
+  ret <8 x i16> %tmp3
+}
+
 define <4 x i32> @vmullu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
 ;CHECK: vmullu16:
 ;CHECK: vmull.u16
@@ -202,6 +238,15 @@ define <4 x i32> @vmullu16(<4 x i16>* %A, <4 x i16>* %B) nounwind {
   ret <4 x i32> %tmp5
 }

+define <4 x i32> @vmullu16_int(<4 x i16>* %A, <4 x i16>* %B) nounwind {
+;CHECK: vmullu16_int:
+;CHECK: vmull.u16
+  %tmp1 = load <4 x i16>* %A
+  %tmp2 = load <4 x i16>* %B
+  %tmp3 = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2)
+  ret <4 x i32> %tmp3
+}
+
 define <2 x i64> @vmullu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
 ;CHECK: vmullu32:
 ;CHECK: vmull.u32
@@ -213,6 +258,15 @@ define <2 x i64> @vmullu32(<2 x i32>* %A, <2 x i32>* %B) nounwind {
   ret <2 x i64> %tmp5
 }

+define <2 x i64> @vmullu32_int(<2 x i32>* %A, <2 x i32>* %B) nounwind {
+;CHECK: vmullu32_int:
+;CHECK: vmull.u32
+  %tmp1 = load <2 x i32>* %A
+  %tmp2 = load <2 x i32>* %B
+  %tmp3 = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2)
+  ret <2 x i64> %tmp3
+}
+
 define <8 x i16> @vmullp8(<8 x i8>* %A, <8 x i8>* %B) nounwind {
 ;CHECK: vmullp8:
 ;CHECK: vmull.p8
@@ -233,6 +287,15 @@ entry:
   ret <4 x i32> %3
 }

+define arm_aapcs_vfpcc <4 x i32> @test_vmull_lanes16_int(<4 x i16> %arg0_int16x4_t, <4 x i16> %arg1_int16x4_t) nounwind readnone {
+entry:
+; CHECK: test_vmull_lanes16_int
+; CHECK: vmull.s16 q0, d0, d1[1]
+  %0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
+  %1 = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %arg0_int16x4_t, <4 x i16> %0) ; <<4 x i32>> [#uses=1]
+  ret <4 x i32> %1
+}
+
 define arm_aapcs_vfpcc <2 x i64> @test_vmull_lanes32(<2 x i32> %arg0_int32x2_t, <2 x i32> %arg1_int32x2_t) nounwind readnone {
 entry:
 ; CHECK: test_vmull_lanes32
@@ -244,6 +307,15 @@ entry:
   ret <2 x i64> %3
 }

+define arm_aapcs_vfpcc <2 x i64> @test_vmull_lanes32_int(<2 x i32> %arg0_int32x2_t, <2 x i32> %arg1_int32x2_t) nounwind readnone {
+entry:
+; CHECK: test_vmull_lanes32_int
+; CHECK: vmull.s32 q0, d0, d1[1]
+  %0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
+  %1 = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %arg0_int32x2_t, <2 x i32> %0) ; <<2 x i64>> [#uses=1]
+  ret <2 x i64> %1
+}
+
 define arm_aapcs_vfpcc <4 x i32> @test_vmull_laneu16(<4 x i16> %arg0_uint16x4_t, <4 x i16> %arg1_uint16x4_t) nounwind readnone {
 entry:
 ; CHECK: test_vmull_laneu16
@@ -255,6 +327,15 @@ entry:
   ret <4 x i32> %3
 }

+define arm_aapcs_vfpcc <4 x i32> @test_vmull_laneu16_int(<4 x i16> %arg0_uint16x4_t, <4 x i16> %arg1_uint16x4_t) nounwind readnone {
+entry:
+; CHECK: test_vmull_laneu16_int
+; CHECK: vmull.u16 q0, d0, d1[1]
+  %0 = shufflevector <4 x i16> %arg1_uint16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1]
+  %1 = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %arg0_uint16x4_t, <4 x i16> %0) ; <<4 x i32>> [#uses=1]
+  ret <4 x i32> %1
+}
+
 define arm_aapcs_vfpcc <2 x i64> @test_vmull_laneu32(<2 x i32> %arg0_uint32x2_t, <2 x i32> %arg1_uint32x2_t) nounwind readnone {
 entry:
 ; CHECK: test_vmull_laneu32
@@ -266,6 +347,23 @@ entry:
   ret <2 x i64> %3
 }

+define arm_aapcs_vfpcc <2 x i64> @test_vmull_laneu32_int(<2 x i32> %arg0_uint32x2_t, <2 x i32> %arg1_uint32x2_t) nounwind readnone {
+entry:
+; CHECK: test_vmull_laneu32_int
+; CHECK: vmull.u32 q0, d0, d1[1]
+  %0 = shufflevector <2 x i32> %arg1_uint32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1]
+  %1 = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %arg0_uint32x2_t, <2 x i32> %0) ; <<2 x i64>> [#uses=1]
+  ret <2 x i64> %1
+}
+
+declare <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
+
+declare <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8>, <8 x i8>) nounwind readnone
+declare <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
+declare <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32>, <2 x i32>) nounwind readnone
+
 declare <8 x i16> @llvm.arm.neon.vmullp.v8i16(<8 x i8>, <8 x i8>) nounwind readnone