diff options
author | Tim Northover <tnorthover@apple.com> | 2013-08-28 12:15:16 +0000 |
---|---|---|
committer | Tim Northover <tnorthover@apple.com> | 2013-08-28 12:15:16 +0000 |
commit | c85bb78714e8e05fb3022148320ea685d7f98d60 (patch) | |
tree | 16584ec0ec1271e4d52c91d3b4e84cf4fc121a86 /test/CodeGen/ARM | |
parent | 3c380d5e28f86984b147fcd424736c498773f37e (diff) | |
download | external_llvm-c85bb78714e8e05fb3022148320ea685d7f98d60.zip external_llvm-c85bb78714e8e05fb3022148320ea685d7f98d60.tar.gz external_llvm-c85bb78714e8e05fb3022148320ea685d7f98d60.tar.bz2 |
ARM: add patterns for vqdmlal with separate vqdmull and vqadds
The vqdmlal and vqdmlls instructions are really just a fused pair consisting of
a vqdmull.sN and a vqadd.sN. This adds patterns to LLVM so that we can switch
Clang's CodeGen over to generating these instead of the special vqdmlal
intrinsics.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@189480 91177308-0d34-0410-b5e6-96231b3b80d8
Diffstat (limited to 'test/CodeGen/ARM')
-rw-r--r-- | test/CodeGen/ARM/vqdmul.ll | 90 |
1 files changed, 90 insertions, 0 deletions
diff --git a/test/CodeGen/ARM/vqdmul.ll b/test/CodeGen/ARM/vqdmul.ll index a28cae9..01bf1a4 100644 --- a/test/CodeGen/ARM/vqdmul.ll +++ b/test/CodeGen/ARM/vqdmul.ll @@ -238,6 +238,51 @@ entry: declare <4 x i32> @llvm.arm.neon.vqdmlal.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone declare <2 x i64> @llvm.arm.neon.vqdmlal.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone +define <4 x i32> @vqdmlals16_natural(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind { +;CHECK-LABEL: vqdmlals16_natural: +;CHECK: vqdmlal.s16 + %tmp1 = load <4 x i32>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = load <4 x i16>* %C + %tmp4 = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %tmp2, <4 x i16> %tmp3) + %tmp5 = call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp4) + ret <4 x i32> %tmp5 +} + +define <2 x i64> @vqdmlals32_natural(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind { +;CHECK-LABEL: vqdmlals32_natural: +;CHECK: vqdmlal.s32 + %tmp1 = load <2 x i64>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = load <2 x i32>* %C + %tmp4 = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %tmp2, <2 x i32> %tmp3) + %tmp5 = call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp4) + ret <2 x i64> %tmp5 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vqdmlal_lanes16_natural(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %arg2_int16x4_t) nounwind readnone { +entry: +; CHECK-LABEL: test_vqdmlal_lanes16_natural: +; CHECK: vqdmlal.s16 q0, d2, d3[1] + %0 = shufflevector <4 x i16> %arg2_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1] + %1 = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %arg1_int16x4_t, <4 x i16> %0) + %2 = tail call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i32> %1) + ret <4 x i32> %2 +} + +define arm_aapcs_vfpcc <2 x i64> @test_vqdmlal_lanes32_natural(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %arg2_int32x2_t) nounwind readnone { +entry: +; CHECK-LABEL: test_vqdmlal_lanes32_natural: +; CHECK: vqdmlal.s32 q0, d2, d3[1] + %0 = shufflevector <2 x i32> %arg2_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1] + %1 = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %arg1_int32x2_t, <2 x i32> %0) + %2 = call <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64> %arg0_int64x2_t, <2 x i64> %1) + ret <2 x i64> %2 +} + +declare <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <2 x i64> @llvm.arm.neon.vqadds.v2i64(<2 x i64>, <2 x i64>) nounwind readnone + define <4 x i32> @vqdmlsls16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind { ;CHECK-LABEL: vqdmlsls16: ;CHECK: vqdmlsl.s16 @@ -278,3 +323,48 @@ entry: declare <4 x i32> @llvm.arm.neon.vqdmlsl.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone declare <2 x i64> @llvm.arm.neon.vqdmlsl.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone + +define <4 x i32> @vqdmlsls16_natural(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind { +;CHECK-LABEL: vqdmlsls16_natural: +;CHECK: vqdmlsl.s16 + %tmp1 = load <4 x i32>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = load <4 x i16>* %C + %tmp4 = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %tmp2, <4 x i16> %tmp3) + %tmp5 = call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp4) + ret <4 x i32> %tmp5 +} + +define <2 x i64> @vqdmlsls32_natural(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind { +;CHECK-LABEL: vqdmlsls32_natural: +;CHECK: vqdmlsl.s32 + %tmp1 = load <2 x i64>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = load <2 x i32>* %C + %tmp4 = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %tmp2, <2 x i32> %tmp3) + %tmp5 = call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp4) + ret <2 x i64> %tmp5 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vqdmlsl_lanes16_natural(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %arg2_int16x4_t) nounwind readnone { +entry: +; CHECK-LABEL: test_vqdmlsl_lanes16_natural: +; CHECK: vqdmlsl.s16 q0, d2, d3[1] + %0 = shufflevector <4 x i16> %arg2_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1] + %1 = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %arg1_int16x4_t, <4 x i16> %0) + %2 = tail call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i32> %1) + ret <4 x i32> %2 +} + +define arm_aapcs_vfpcc <2 x i64> @test_vqdmlsl_lanes32_natural(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %arg2_int32x2_t) nounwind readnone { +entry: +; CHECK-LABEL: test_vqdmlsl_lanes32_natural: +; CHECK: vqdmlsl.s32 q0, d2, d3[1] + %0 = shufflevector <2 x i32> %arg2_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1] + %1 = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %arg1_int32x2_t, <2 x i32> %0) + %2 = call <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64> %arg0_int64x2_t, <2 x i64> %1) + ret <2 x i64> %2 +} + +declare <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <2 x i64> @llvm.arm.neon.vqsubs.v2i64(<2 x i64>, <2 x i64>) nounwind readnone |