diff options
author | Bob Wilson <bob.wilson@apple.com> | 2009-10-09 20:20:54 +0000 |
---|---|---|
committer | Bob Wilson <bob.wilson@apple.com> | 2009-10-09 20:20:54 +0000 |
commit | ff57ca95835abff5340b955ef0277117c35439db (patch) | |
tree | 817d505af4e81be7628355f79fab2ad29856bdc2 | |
parent | c7973eb023472b13237076954240d915c30c2c69 (diff) | |
download | external_llvm-ff57ca95835abff5340b955ef0277117c35439db.zip external_llvm-ff57ca95835abff5340b955ef0277117c35439db.tar.gz external_llvm-ff57ca95835abff5340b955ef0277117c35439db.tar.bz2 |
Merge a bunch of NEON tests into larger files so they run faster.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@83667 91177308-0d34-0410-b5e6-96231b3b80d8
87 files changed, 3526 insertions, 3650 deletions
diff --git a/test/CodeGen/ARM/vaba.ll b/test/CodeGen/ARM/vaba.ll index 5d58e29..e2dca46 100644 --- a/test/CodeGen/ARM/vaba.ll +++ b/test/CodeGen/ARM/vaba.ll @@ -135,3 +135,71 @@ declare <4 x i32> @llvm.arm.neon.vabas.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) no declare <16 x i8> @llvm.arm.neon.vabau.v16i8(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone declare <8 x i16> @llvm.arm.neon.vabau.v8i16(<8 x i16>, <8 x i16>, <8 x i16>) nounwind readnone declare <4 x i32> @llvm.arm.neon.vabau.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) nounwind readnone + +define <8 x i16> @vabals8(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind { +;CHECK: vabals8: +;CHECK: vabal.s8 + %tmp1 = load <8 x i16>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = load <8 x i8>* %C + %tmp4 = call <8 x i16> @llvm.arm.neon.vabals.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2, <8 x i8> %tmp3) + ret <8 x i16> %tmp4 +} + +define <4 x i32> @vabals16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind { +;CHECK: vabals16: +;CHECK: vabal.s16 + %tmp1 = load <4 x i32>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = load <4 x i16>* %C + %tmp4 = call <4 x i32> @llvm.arm.neon.vabals.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3) + ret <4 x i32> %tmp4 +} + +define <2 x i64> @vabals32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind { +;CHECK: vabals32: +;CHECK: vabal.s32 + %tmp1 = load <2 x i64>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = load <2 x i32>* %C + %tmp4 = call <2 x i64> @llvm.arm.neon.vabals.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3) + ret <2 x i64> %tmp4 +} + +define <8 x i16> @vabalu8(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind { +;CHECK: vabalu8: +;CHECK: vabal.u8 + %tmp1 = load <8 x i16>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = load <8 x i8>* %C + %tmp4 = call <8 x i16> @llvm.arm.neon.vabalu.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2, <8 x i8> %tmp3) + ret <8 x i16> %tmp4 +} + +define <4 x i32> @vabalu16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind { +;CHECK: vabalu16: +;CHECK: vabal.u16 + %tmp1 = load <4 x i32>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = load <4 x i16>* %C + %tmp4 = call <4 x i32> @llvm.arm.neon.vabalu.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3) + ret <4 x i32> %tmp4 +} + +define <2 x i64> @vabalu32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind { +;CHECK: vabalu32: +;CHECK: vabal.u32 + %tmp1 = load <2 x i64>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = load <2 x i32>* %C + %tmp4 = call <2 x i64> @llvm.arm.neon.vabalu.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3) + ret <2 x i64> %tmp4 +} + +declare <8 x i16> @llvm.arm.neon.vabals.v8i16(<8 x i16>, <8 x i8>, <8 x i8>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vabals.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i64> @llvm.arm.neon.vabals.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone + +declare <8 x i16> @llvm.arm.neon.vabalu.v8i16(<8 x i16>, <8 x i8>, <8 x i8>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vabalu.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i64> @llvm.arm.neon.vabalu.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone diff --git a/test/CodeGen/ARM/vabal.ll b/test/CodeGen/ARM/vabal.ll deleted file mode 100644 index 89efd5b..0000000 --- a/test/CodeGen/ARM/vabal.ll +++ /dev/null @@ -1,69 +0,0 @@ -; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s - -define <8 x i16> @vabals8(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind { -;CHECK: vabals8: -;CHECK: vabal.s8 - %tmp1 = load <8 x i16>* %A - %tmp2 = load <8 x i8>* %B - %tmp3 = load <8 x i8>* %C - %tmp4 = call <8 x i16> @llvm.arm.neon.vabals.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2, <8 x i8> %tmp3) - ret <8 x i16> %tmp4 -} - -define <4 x i32> @vabals16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind { -;CHECK: vabals16: -;CHECK: vabal.s16 - %tmp1 = load <4 x i32>* %A - %tmp2 = load <4 x i16>* %B - %tmp3 = load <4 x i16>* %C - %tmp4 = call <4 x i32> @llvm.arm.neon.vabals.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3) - ret <4 x i32> %tmp4 -} - -define <2 x i64> @vabals32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind { -;CHECK: vabals32: -;CHECK: vabal.s32 - %tmp1 = load <2 x i64>* %A - %tmp2 = load <2 x i32>* %B - %tmp3 = load <2 x i32>* %C - %tmp4 = call <2 x i64> @llvm.arm.neon.vabals.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3) - ret <2 x i64> %tmp4 -} - -define <8 x i16> @vabalu8(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind { -;CHECK: vabalu8: -;CHECK: vabal.u8 - %tmp1 = load <8 x i16>* %A - %tmp2 = load <8 x i8>* %B - %tmp3 = load <8 x i8>* %C - %tmp4 = call <8 x i16> @llvm.arm.neon.vabalu.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2, <8 x i8> %tmp3) - ret <8 x i16> %tmp4 -} - -define <4 x i32> @vabalu16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind { -;CHECK: vabalu16: -;CHECK: vabal.u16 - %tmp1 = load <4 x i32>* %A - %tmp2 = load <4 x i16>* %B - %tmp3 = load <4 x i16>* %C - %tmp4 = call <4 x i32> @llvm.arm.neon.vabalu.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3) - ret <4 x i32> %tmp4 -} - -define <2 x i64> @vabalu32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind { -;CHECK: vabalu32: -;CHECK: vabal.u32 - %tmp1 = load <2 x i64>* %A - %tmp2 = load <2 x i32>* %B - %tmp3 = load <2 x i32>* %C - %tmp4 = call <2 x i64> @llvm.arm.neon.vabalu.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3) - ret <2 x i64> %tmp4 -} - -declare <8 x i16> @llvm.arm.neon.vabals.v8i16(<8 x i16>, <8 x i8>, <8 x i8>) nounwind readnone -declare <4 x i32> @llvm.arm.neon.vabals.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone -declare <2 x i64> @llvm.arm.neon.vabals.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone - -declare <8 x i16> @llvm.arm.neon.vabalu.v8i16(<8 x i16>, <8 x i8>, <8 x i8>) nounwind readnone -declare <4 x i32> @llvm.arm.neon.vabalu.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone -declare <2 x i64> @llvm.arm.neon.vabalu.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone diff --git a/test/CodeGen/ARM/vabd.ll b/test/CodeGen/ARM/vabd.ll index db762bd..2b45393 100644 --- a/test/CodeGen/ARM/vabd.ll +++ b/test/CodeGen/ARM/vabd.ll @@ -145,3 +145,65 @@ declare <8 x i16> @llvm.arm.neon.vabdu.v8i16(<8 x i16>, <8 x i16>) nounwind read declare <4 x i32> @llvm.arm.neon.vabdu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone declare <4 x float> @llvm.arm.neon.vabds.v4f32(<4 x float>, <4 x float>) nounwind readnone + +define <8 x i16> @vabdls8(<8 x i8>* %A, <8 x i8>* %B) nounwind { +;CHECK: vabdls8: +;CHECK: vabdl.s8 + %tmp1 = load <8 x i8>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = call <8 x i16> @llvm.arm.neon.vabdls.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2) + ret <8 x i16> %tmp3 +} + +define <4 x i32> @vabdls16(<4 x i16>* %A, <4 x i16>* %B) nounwind { +;CHECK: vabdls16: +;CHECK: vabdl.s16 + %tmp1 = load <4 x i16>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = call <4 x i32> @llvm.arm.neon.vabdls.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2) + ret <4 x i32> %tmp3 +} + +define <2 x i64> @vabdls32(<2 x i32>* %A, <2 x i32>* %B) nounwind { +;CHECK: vabdls32: +;CHECK: vabdl.s32 + %tmp1 = load <2 x i32>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = call <2 x i64> @llvm.arm.neon.vabdls.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2) + ret <2 x i64> %tmp3 +} + +define <8 x i16> @vabdlu8(<8 x i8>* %A, <8 x i8>* %B) nounwind { +;CHECK: vabdlu8: +;CHECK: vabdl.u8 + %tmp1 = load <8 x i8>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = call <8 x i16> @llvm.arm.neon.vabdlu.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2) + ret <8 x i16> %tmp3 +} + +define <4 x i32> @vabdlu16(<4 x i16>* %A, <4 x i16>* %B) nounwind { +;CHECK: vabdlu16: +;CHECK: vabdl.u16 + %tmp1 = load <4 x i16>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = call <4 x i32> @llvm.arm.neon.vabdlu.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2) + ret <4 x i32> %tmp3 +} + +define <2 x i64> @vabdlu32(<2 x i32>* %A, <2 x i32>* %B) nounwind { +;CHECK: vabdlu32: +;CHECK: vabdl.u32 + %tmp1 = load <2 x i32>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = call <2 x i64> @llvm.arm.neon.vabdlu.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2) + ret <2 x i64> %tmp3 +} + +declare <8 x i16> @llvm.arm.neon.vabdls.v8i16(<8 x i8>, <8 x i8>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vabdls.v4i32(<4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i64> @llvm.arm.neon.vabdls.v2i64(<2 x i32>, <2 x i32>) nounwind readnone + +declare <8 x i16> @llvm.arm.neon.vabdlu.v8i16(<8 x i8>, <8 x i8>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vabdlu.v4i32(<4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i64> @llvm.arm.neon.vabdlu.v2i64(<2 x i32>, <2 x i32>) nounwind readnone diff --git a/test/CodeGen/ARM/vabdl.ll b/test/CodeGen/ARM/vabdl.ll deleted file mode 100644 index 23840f7..0000000 --- a/test/CodeGen/ARM/vabdl.ll +++ /dev/null @@ -1,63 +0,0 @@ -; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s - -define <8 x i16> @vabdls8(<8 x i8>* %A, <8 x i8>* %B) nounwind { -;CHECK: vabdls8: -;CHECK: vabdl.s8 - %tmp1 = load <8 x i8>* %A - %tmp2 = load <8 x i8>* %B - %tmp3 = call <8 x i16> @llvm.arm.neon.vabdls.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2) - ret <8 x i16> %tmp3 -} - -define <4 x i32> @vabdls16(<4 x i16>* %A, <4 x i16>* %B) nounwind { -;CHECK: vabdls16: -;CHECK: vabdl.s16 - %tmp1 = load <4 x i16>* %A - %tmp2 = load <4 x i16>* %B - %tmp3 = call <4 x i32> @llvm.arm.neon.vabdls.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2) - ret <4 x i32> %tmp3 -} - -define <2 x i64> @vabdls32(<2 x i32>* %A, <2 x i32>* %B) nounwind { -;CHECK: vabdls32: -;CHECK: vabdl.s32 - %tmp1 = load <2 x i32>* %A - %tmp2 = load <2 x i32>* %B - %tmp3 = call <2 x i64> @llvm.arm.neon.vabdls.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2) - ret <2 x i64> %tmp3 -} - -define <8 x i16> @vabdlu8(<8 x i8>* %A, <8 x i8>* %B) nounwind { -;CHECK: vabdlu8: -;CHECK: vabdl.u8 - %tmp1 = load <8 x i8>* %A - %tmp2 = load <8 x i8>* %B - %tmp3 = call <8 x i16> @llvm.arm.neon.vabdlu.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2) - ret <8 x i16> %tmp3 -} - -define <4 x i32> @vabdlu16(<4 x i16>* %A, <4 x i16>* %B) nounwind { -;CHECK: vabdlu16: -;CHECK: vabdl.u16 - %tmp1 = load <4 x i16>* %A - %tmp2 = load <4 x i16>* %B - %tmp3 = call <4 x i32> @llvm.arm.neon.vabdlu.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2) - ret <4 x i32> %tmp3 -} - -define <2 x i64> @vabdlu32(<2 x i32>* %A, <2 x i32>* %B) nounwind { -;CHECK: vabdlu32: -;CHECK: vabdl.u32 - %tmp1 = load <2 x i32>* %A - %tmp2 = load <2 x i32>* %B - %tmp3 = call <2 x i64> @llvm.arm.neon.vabdlu.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2) - ret <2 x i64> %tmp3 -} - -declare <8 x i16> @llvm.arm.neon.vabdls.v8i16(<8 x i8>, <8 x i8>) nounwind readnone -declare <4 x i32> @llvm.arm.neon.vabdls.v4i32(<4 x i16>, <4 x i16>) nounwind readnone -declare <2 x i64> @llvm.arm.neon.vabdls.v2i64(<2 x i32>, <2 x i32>) nounwind readnone - -declare <8 x i16> @llvm.arm.neon.vabdlu.v8i16(<8 x i8>, <8 x i8>) nounwind readnone -declare <4 x i32> @llvm.arm.neon.vabdlu.v4i32(<4 x i16>, <4 x i16>) nounwind readnone -declare <2 x i64> @llvm.arm.neon.vabdlu.v2i64(<2 x i32>, <2 x i32>) nounwind readnone diff --git a/test/CodeGen/ARM/vabs.ll b/test/CodeGen/ARM/vabs.ll index f1aafed..18ba61f 100644 --- a/test/CodeGen/ARM/vabs.ll +++ b/test/CodeGen/ARM/vabs.ll @@ -74,3 +74,58 @@ declare <8 x i16> @llvm.arm.neon.vabs.v8i16(<8 x i16>) nounwind readnone declare <4 x i32> @llvm.arm.neon.vabs.v4i32(<4 x i32>) nounwind readnone declare <4 x float> @llvm.arm.neon.vabs.v4f32(<4 x float>) nounwind readnone +define <8 x i8> @vqabss8(<8 x i8>* %A) nounwind { +;CHECK: vqabss8: +;CHECK: vqabs.s8 + %tmp1 = load <8 x i8>* %A + %tmp2 = call <8 x i8> @llvm.arm.neon.vqabs.v8i8(<8 x i8> %tmp1) + ret <8 x i8> %tmp2 +} + +define <4 x i16> @vqabss16(<4 x i16>* %A) nounwind { +;CHECK: vqabss16: +;CHECK: vqabs.s16 + %tmp1 = load <4 x i16>* %A + %tmp2 = call <4 x i16> @llvm.arm.neon.vqabs.v4i16(<4 x i16> %tmp1) + ret <4 x i16> %tmp2 +} + +define <2 x i32> @vqabss32(<2 x i32>* %A) nounwind { +;CHECK: vqabss32: +;CHECK: vqabs.s32 + %tmp1 = load <2 x i32>* %A + %tmp2 = call <2 x i32> @llvm.arm.neon.vqabs.v2i32(<2 x i32> %tmp1) + ret <2 x i32> %tmp2 +} + +define <16 x i8> @vqabsQs8(<16 x i8>* %A) nounwind { +;CHECK: vqabsQs8: +;CHECK: vqabs.s8 + %tmp1 = load <16 x i8>* %A + %tmp2 = call <16 x i8> @llvm.arm.neon.vqabs.v16i8(<16 x i8> %tmp1) + ret <16 x i8> %tmp2 +} + +define <8 x i16> @vqabsQs16(<8 x i16>* %A) nounwind { +;CHECK: vqabsQs16: +;CHECK: vqabs.s16 + %tmp1 = load <8 x i16>* %A + %tmp2 = call <8 x i16> @llvm.arm.neon.vqabs.v8i16(<8 x i16> %tmp1) + ret <8 x i16> %tmp2 +} + +define <4 x i32> @vqabsQs32(<4 x i32>* %A) nounwind { +;CHECK: vqabsQs32: +;CHECK: vqabs.s32 + %tmp1 = load <4 x i32>* %A + %tmp2 = call <4 x i32> @llvm.arm.neon.vqabs.v4i32(<4 x i32> %tmp1) + ret <4 x i32> %tmp2 +} + +declare <8 x i8> @llvm.arm.neon.vqabs.v8i8(<8 x i8>) nounwind readnone +declare <4 x i16> @llvm.arm.neon.vqabs.v4i16(<4 x i16>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vqabs.v2i32(<2 x i32>) nounwind readnone + +declare <16 x i8> @llvm.arm.neon.vqabs.v16i8(<16 x i8>) nounwind readnone +declare <8 x i16> @llvm.arm.neon.vqabs.v8i16(<8 x i16>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vqabs.v4i32(<4 x i32>) nounwind readnone diff --git a/test/CodeGen/ARM/vacge.ll b/test/CodeGen/ARM/vacge.ll deleted file mode 100644 index b178446..0000000 --- a/test/CodeGen/ARM/vacge.ll +++ /dev/null @@ -1,22 +0,0 @@ -; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s - -define <2 x i32> @vacgef32(<2 x float>* %A, <2 x float>* %B) nounwind { -;CHECK: vacgef32: -;CHECK: vacge.f32 - %tmp1 = load <2 x float>* %A - %tmp2 = load <2 x float>* %B - %tmp3 = call <2 x i32> @llvm.arm.neon.vacged(<2 x float> %tmp1, <2 x float> %tmp2) - ret <2 x i32> %tmp3 -} - -define <4 x i32> @vacgeQf32(<4 x float>* %A, <4 x float>* %B) nounwind { -;CHECK: vacgeQf32: -;CHECK: vacge.f32 - %tmp1 = load <4 x float>* %A - %tmp2 = load <4 x float>* %B - %tmp3 = call <4 x i32> @llvm.arm.neon.vacgeq(<4 x float> %tmp1, <4 x float> %tmp2) - ret <4 x i32> %tmp3 -} - -declare <2 x i32> @llvm.arm.neon.vacged(<2 x float>, <2 x float>) nounwind readnone -declare <4 x i32> @llvm.arm.neon.vacgeq(<4 x float>, <4 x float>) nounwind readnone diff --git a/test/CodeGen/ARM/vacgt.ll b/test/CodeGen/ARM/vacgt.ll deleted file mode 100644 index fd01163..0000000 --- a/test/CodeGen/ARM/vacgt.ll +++ /dev/null @@ -1,22 +0,0 @@ -; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s - -define <2 x i32> @vacgtf32(<2 x float>* %A, <2 x float>* %B) nounwind { -;CHECK: vacgtf32: -;CHECK: vacgt.f32 - %tmp1 = load <2 x float>* %A - %tmp2 = load <2 x float>* %B - %tmp3 = call <2 x i32> @llvm.arm.neon.vacgtd(<2 x float> %tmp1, <2 x float> %tmp2) - ret <2 x i32> %tmp3 -} - -define <4 x i32> @vacgtQf32(<4 x float>* %A, <4 x float>* %B) nounwind { -;CHECK: vacgtQf32: -;CHECK: vacgt.f32 - %tmp1 = load <4 x float>* %A - %tmp2 = load <4 x float>* %B - %tmp3 = call <4 x i32> @llvm.arm.neon.vacgtq(<4 x float> %tmp1, <4 x float> %tmp2) - ret <4 x i32> %tmp3 -} - -declare <2 x i32> @llvm.arm.neon.vacgtd(<2 x float>, <2 x float>) nounwind readnone -declare <4 x i32> @llvm.arm.neon.vacgtq(<4 x float>, <4 x float>) nounwind readnone diff --git a/test/CodeGen/ARM/vadd.ll b/test/CodeGen/ARM/vadd.ll index 9d2deb9..9fa5307 100644 --- a/test/CodeGen/ARM/vadd.ll +++ b/test/CodeGen/ARM/vadd.ll @@ -89,3 +89,189 @@ define <4 x float> @vaddQf32(<4 x float>* %A, <4 x float>* %B) nounwind { %tmp3 = add <4 x float> %tmp1, %tmp2 ret <4 x float> %tmp3 } + +define <8 x i8> @vaddhni16(<8 x i16>* %A, <8 x i16>* %B) nounwind { +;CHECK: vaddhni16: +;CHECK: vaddhn.i16 + %tmp1 = load <8 x i16>* %A + %tmp2 = load <8 x i16>* %B + %tmp3 = call <8 x i8> @llvm.arm.neon.vaddhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2) + ret <8 x i8> %tmp3 +} + +define <4 x i16> @vaddhni32(<4 x i32>* %A, <4 x i32>* %B) nounwind { +;CHECK: vaddhni32: +;CHECK: vaddhn.i32 + %tmp1 = load <4 x i32>* %A + %tmp2 = load <4 x i32>* %B + %tmp3 = call <4 x i16> @llvm.arm.neon.vaddhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2) + ret <4 x i16> %tmp3 +} + +define <2 x i32> @vaddhni64(<2 x i64>* %A, <2 x i64>* %B) nounwind { +;CHECK: vaddhni64: +;CHECK: vaddhn.i64 + %tmp1 = load <2 x i64>* %A + %tmp2 = load <2 x i64>* %B + %tmp3 = call <2 x i32> @llvm.arm.neon.vaddhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2) + ret <2 x i32> %tmp3 +} + +declare <8 x i8> @llvm.arm.neon.vaddhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone +declare <4 x i16> @llvm.arm.neon.vaddhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vaddhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone + +define <8 x i8> @vraddhni16(<8 x i16>* %A, <8 x i16>* %B) nounwind { +;CHECK: vraddhni16: +;CHECK: vraddhn.i16 + %tmp1 = load <8 x i16>* %A + %tmp2 = load <8 x i16>* %B + %tmp3 = call <8 x i8> @llvm.arm.neon.vraddhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2) + ret <8 x i8> %tmp3 +} + +define <4 x i16> @vraddhni32(<4 x i32>* %A, <4 x i32>* %B) nounwind { +;CHECK: vraddhni32: +;CHECK: vraddhn.i32 + %tmp1 = load <4 x i32>* %A + %tmp2 = load <4 x i32>* %B + %tmp3 = call <4 x i16> @llvm.arm.neon.vraddhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2) + ret <4 x i16> %tmp3 +} + +define <2 x i32> @vraddhni64(<2 x i64>* %A, <2 x i64>* %B) nounwind { +;CHECK: vraddhni64: +;CHECK: vraddhn.i64 + %tmp1 = load <2 x i64>* %A + %tmp2 = load <2 x i64>* %B + %tmp3 = call <2 x i32> @llvm.arm.neon.vraddhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2) + ret <2 x i32> %tmp3 +} + +declare <8 x i8> @llvm.arm.neon.vraddhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone +declare <4 x i16> @llvm.arm.neon.vraddhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vraddhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone + +define <8 x i16> @vaddls8(<8 x i8>* %A, <8 x i8>* %B) nounwind { +;CHECK: vaddls8: +;CHECK: vaddl.s8 + %tmp1 = load <8 x i8>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = call <8 x i16> @llvm.arm.neon.vaddls.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2) + ret <8 x i16> %tmp3 +} + +define <4 x i32> @vaddls16(<4 x i16>* %A, <4 x i16>* %B) nounwind { +;CHECK: vaddls16: +;CHECK: vaddl.s16 + %tmp1 = load <4 x i16>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = call <4 x i32> @llvm.arm.neon.vaddls.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2) + ret <4 x i32> %tmp3 +} + +define <2 x i64> @vaddls32(<2 x i32>* %A, <2 x i32>* %B) nounwind { +;CHECK: vaddls32: +;CHECK: vaddl.s32 + %tmp1 = load <2 x i32>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = call <2 x i64> @llvm.arm.neon.vaddls.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2) + ret <2 x i64> %tmp3 +} + +define <8 x i16> @vaddlu8(<8 x i8>* %A, <8 x i8>* %B) nounwind { +;CHECK: vaddlu8: +;CHECK: vaddl.u8 + %tmp1 = load <8 x i8>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = call <8 x i16> @llvm.arm.neon.vaddlu.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2) + ret <8 x i16> %tmp3 +} + +define <4 x i32> @vaddlu16(<4 x i16>* %A, <4 x i16>* %B) nounwind { +;CHECK: vaddlu16: +;CHECK: vaddl.u16 + %tmp1 = load <4 x i16>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = call <4 x i32> @llvm.arm.neon.vaddlu.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2) + ret <4 x i32> %tmp3 +} + +define <2 x i64> @vaddlu32(<2 x i32>* %A, <2 x i32>* %B) nounwind { +;CHECK: vaddlu32: +;CHECK: vaddl.u32 + %tmp1 = load <2 x i32>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = call <2 x i64> @llvm.arm.neon.vaddlu.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2) + ret <2 x i64> %tmp3 +} + +declare <8 x i16> @llvm.arm.neon.vaddls.v8i16(<8 x i8>, <8 x i8>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vaddls.v4i32(<4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i64> @llvm.arm.neon.vaddls.v2i64(<2 x i32>, <2 x i32>) nounwind readnone + +declare <8 x i16> @llvm.arm.neon.vaddlu.v8i16(<8 x i8>, <8 x i8>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vaddlu.v4i32(<4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i64> @llvm.arm.neon.vaddlu.v2i64(<2 x i32>, <2 x i32>) nounwind readnone + +define <8 x i16> @vaddws8(<8 x i16>* %A, <8 x i8>* %B) nounwind { +;CHECK: vaddws8: +;CHECK: vaddw.s8 + %tmp1 = load <8 x i16>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = call <8 x i16> @llvm.arm.neon.vaddws.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2) + ret <8 x i16> %tmp3 +} + +define <4 x i32> @vaddws16(<4 x i32>* %A, <4 x i16>* %B) nounwind { +;CHECK: vaddws16: +;CHECK: vaddw.s16 + %tmp1 = load <4 x i32>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = call <4 x i32> @llvm.arm.neon.vaddws.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2) + ret <4 x i32> %tmp3 +} + +define <2 x i64> @vaddws32(<2 x i64>* %A, <2 x i32>* %B) nounwind { +;CHECK: vaddws32: +;CHECK: vaddw.s32 + %tmp1 = load <2 x i64>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = call <2 x i64> @llvm.arm.neon.vaddws.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2) + ret <2 x i64> %tmp3 +} + +define <8 x i16> @vaddwu8(<8 x i16>* %A, <8 x i8>* %B) nounwind { +;CHECK: vaddwu8: +;CHECK: vaddw.u8 + %tmp1 = load <8 x i16>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = call <8 x i16> @llvm.arm.neon.vaddwu.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2) + ret <8 x i16> %tmp3 +} + +define <4 x i32> @vaddwu16(<4 x i32>* %A, <4 x i16>* %B) nounwind { +;CHECK: vaddwu16: +;CHECK: vaddw.u16 + %tmp1 = load <4 x i32>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = call <4 x i32> @llvm.arm.neon.vaddwu.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2) + ret <4 x i32> %tmp3 +} + +define <2 x i64> @vaddwu32(<2 x i64>* %A, <2 x i32>* %B) nounwind { +;CHECK: vaddwu32: +;CHECK: vaddw.u32 + %tmp1 = load <2 x i64>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = call <2 x i64> @llvm.arm.neon.vaddwu.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2) + ret <2 x i64> %tmp3 +} + +declare <8 x i16> @llvm.arm.neon.vaddws.v8i16(<8 x i16>, <8 x i8>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vaddws.v4i32(<4 x i32>, <4 x i16>) nounwind readnone +declare <2 x i64> @llvm.arm.neon.vaddws.v2i64(<2 x i64>, <2 x i32>) nounwind readnone + +declare <8 x i16> @llvm.arm.neon.vaddwu.v8i16(<8 x i16>, <8 x i8>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vaddwu.v4i32(<4 x i32>, <4 x i16>) nounwind readnone +declare <2 x i64> @llvm.arm.neon.vaddwu.v2i64(<2 x i64>, <2 x i32>) nounwind readnone diff --git a/test/CodeGen/ARM/vaddhn.ll b/test/CodeGen/ARM/vaddhn.ll deleted file mode 100644 index aba5712..0000000 --- a/test/CodeGen/ARM/vaddhn.ll +++ /dev/null @@ -1,32 +0,0 @@ -; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s - -define <8 x i8> @vaddhni16(<8 x i16>* %A, <8 x i16>* %B) nounwind { -;CHECK: vaddhni16: -;CHECK: vaddhn.i16 - %tmp1 = load <8 x i16>* %A - %tmp2 = load <8 x i16>* %B - %tmp3 = call <8 x i8> @llvm.arm.neon.vaddhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2) - ret <8 x i8> %tmp3 -} - -define <4 x i16> @vaddhni32(<4 x i32>* %A, <4 x i32>* %B) nounwind { -;CHECK: vaddhni32: -;CHECK: vaddhn.i32 - %tmp1 = load <4 x i32>* %A - %tmp2 = load <4 x i32>* %B - %tmp3 = call <4 x i16> @llvm.arm.neon.vaddhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2) - ret <4 x i16> %tmp3 -} - -define <2 x i32> @vaddhni64(<2 x i64>* %A, <2 x i64>* %B) nounwind { -;CHECK: vaddhni64: -;CHECK: vaddhn.i64 - %tmp1 = load <2 x i64>* %A - %tmp2 = load <2 x i64>* %B - %tmp3 = call <2 x i32> @llvm.arm.neon.vaddhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2) - ret <2 x i32> %tmp3 -} - -declare <8 x i8> @llvm.arm.neon.vaddhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone -declare <4 x i16> @llvm.arm.neon.vaddhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone -declare <2 x i32> @llvm.arm.neon.vaddhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone diff --git a/test/CodeGen/ARM/vaddl.ll b/test/CodeGen/ARM/vaddl.ll deleted file mode 100644 index 3a31b95..0000000 --- a/test/CodeGen/ARM/vaddl.ll +++ /dev/null @@ -1,63 +0,0 @@ -; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s - -define <8 x i16> @vaddls8(<8 x i8>* %A, <8 x i8>* %B) nounwind { -;CHECK: vaddls8: -;CHECK: vaddl.s8 - %tmp1 = load <8 x i8>* %A - %tmp2 = load <8 x i8>* %B - %tmp3 = call <8 x i16> @llvm.arm.neon.vaddls.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2) - ret <8 x i16> %tmp3 -} - -define <4 x i32> @vaddls16(<4 x i16>* %A, <4 x i16>* %B) nounwind { -;CHECK: vaddls16: -;CHECK: vaddl.s16 - %tmp1 = load <4 x i16>* %A - %tmp2 = load <4 x i16>* %B - %tmp3 = call <4 x i32> @llvm.arm.neon.vaddls.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2) - ret <4 x i32> %tmp3 -} - -define <2 x i64> @vaddls32(<2 x i32>* %A, <2 x i32>* %B) nounwind { -;CHECK: vaddls32: -;CHECK: vaddl.s32 - %tmp1 = load <2 x i32>* %A - %tmp2 = load <2 x i32>* %B - %tmp3 = call <2 x i64> @llvm.arm.neon.vaddls.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2) - ret <2 x i64> %tmp3 -} - -define <8 x i16> @vaddlu8(<8 x i8>* %A, <8 x i8>* %B) nounwind { -;CHECK: vaddlu8: -;CHECK: vaddl.u8 - %tmp1 = load <8 x i8>* %A - %tmp2 = load <8 x i8>* %B - %tmp3 = call <8 x i16> @llvm.arm.neon.vaddlu.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2) - ret <8 x i16> %tmp3 -} - -define <4 x i32> @vaddlu16(<4 x i16>* %A, <4 x i16>* %B) nounwind { -;CHECK: vaddlu16: -;CHECK: vaddl.u16 - %tmp1 = load <4 x i16>* %A - %tmp2 = load <4 x i16>* %B - %tmp3 = call <4 x i32> @llvm.arm.neon.vaddlu.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2) - ret <4 x i32> %tmp3 -} - -define <2 x i64> @vaddlu32(<2 x i32>* %A, <2 x i32>* %B) nounwind { -;CHECK: vaddlu32: -;CHECK: vaddl.u32 - %tmp1 = load <2 x i32>* %A - %tmp2 = load <2 x i32>* %B - %tmp3 = call <2 x i64> @llvm.arm.neon.vaddlu.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2) - ret <2 x i64> %tmp3 -} - -declare <8 x i16> @llvm.arm.neon.vaddls.v8i16(<8 x i8>, <8 x i8>) nounwind readnone -declare <4 x i32> @llvm.arm.neon.vaddls.v4i32(<4 x i16>, <4 x i16>) nounwind readnone -declare <2 x i64> @llvm.arm.neon.vaddls.v2i64(<2 x i32>, <2 x i32>) nounwind readnone - -declare <8 x i16> @llvm.arm.neon.vaddlu.v8i16(<8 x i8>, <8 x i8>) nounwind readnone -declare <4 x i32> @llvm.arm.neon.vaddlu.v4i32(<4 x i16>, <4 x i16>) nounwind readnone -declare <2 x i64> @llvm.arm.neon.vaddlu.v2i64(<2 x i32>, <2 x i32>) nounwind readnone diff --git a/test/CodeGen/ARM/vaddw.ll b/test/CodeGen/ARM/vaddw.ll deleted file mode 100644 index 6d0459e..0000000 --- a/test/CodeGen/ARM/vaddw.ll +++ /dev/null @@ -1,63 +0,0 @@ -; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s - -define <8 x i16> @vaddws8(<8 x i16>* %A, <8 x i8>* %B) nounwind { -;CHECK: vaddws8: -;CHECK: vaddw.s8 - %tmp1 = load <8 x i16>* %A - %tmp2 = load <8 x i8>* %B - %tmp3 = call <8 x i16> @llvm.arm.neon.vaddws.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2) - ret <8 x i16> %tmp3 -} - -define <4 x i32> @vaddws16(<4 x i32>* %A, <4 x i16>* %B) nounwind { -;CHECK: vaddws16: -;CHECK: vaddw.s16 - %tmp1 = load <4 x i32>* %A - %tmp2 = load <4 x i16>* %B - %tmp3 = call <4 x i32> @llvm.arm.neon.vaddws.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2) - ret <4 x i32> %tmp3 -} - -define <2 x i64> @vaddws32(<2 x i64>* %A, <2 x i32>* %B) nounwind { -;CHECK: vaddws32: -;CHECK: vaddw.s32 - %tmp1 = load <2 x i64>* %A - %tmp2 = load <2 x i32>* %B - %tmp3 = call <2 x i64> @llvm.arm.neon.vaddws.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2) - ret <2 x i64> %tmp3 -} - -define <8 x i16> @vaddwu8(<8 x i16>* %A, <8 x i8>* %B) nounwind { -;CHECK: vaddwu8: -;CHECK: vaddw.u8 - %tmp1 = load <8 x i16>* %A - %tmp2 = load <8 x i8>* %B - %tmp3 = call <8 x i16> @llvm.arm.neon.vaddwu.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2) - ret <8 x i16> %tmp3 -} - -define <4 x i32> @vaddwu16(<4 x i32>* %A, <4 x i16>* %B) nounwind { -;CHECK: vaddwu16: -;CHECK: vaddw.u16 - %tmp1 = load <4 x i32>* %A - %tmp2 = load <4 x i16>* %B - %tmp3 = call <4 x i32> @llvm.arm.neon.vaddwu.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2) - ret <4 x i32> %tmp3 -} - -define <2 x i64> @vaddwu32(<2 x i64>* %A, <2 x i32>* %B) nounwind { -;CHECK: vaddwu32: -;CHECK: vaddw.u32 - %tmp1 = load <2 x i64>* %A - %tmp2 = load <2 x i32>* %B - %tmp3 = call <2 x i64> @llvm.arm.neon.vaddwu.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2) - ret <2 x i64> %tmp3 -} - -declare <8 x i16> @llvm.arm.neon.vaddws.v8i16(<8 x i16>, <8 x i8>) nounwind readnone -declare <4 x i32> @llvm.arm.neon.vaddws.v4i32(<4 x i32>, <4 x i16>) nounwind readnone -declare <2 x i64> @llvm.arm.neon.vaddws.v2i64(<2 x i64>, <2 x i32>) nounwind readnone - -declare <8 x i16> @llvm.arm.neon.vaddwu.v8i16(<8 x i16>, <8 x i8>) nounwind readnone -declare <4 x i32> @llvm.arm.neon.vaddwu.v4i32(<4 x i32>, <4 x i16>) nounwind readnone -declare <2 x i64> @llvm.arm.neon.vaddwu.v2i64(<2 x i64>, <2 x i32>) nounwind readnone diff --git a/test/CodeGen/ARM/vand.ll b/test/CodeGen/ARM/vand.ll deleted file mode 100644 index 653a70b..0000000 --- a/test/CodeGen/ARM/vand.ll +++ /dev/null @@ -1,73 +0,0 @@ -; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s - -define <8 x i8> @v_andi8(<8 x i8>* %A, <8 x i8>* %B) nounwind { -;CHECK: v_andi8: -;CHECK: vand - %tmp1 = load <8 x i8>* %A - %tmp2 = load <8 x i8>* %B - %tmp3 = and <8 x i8> %tmp1, %tmp2 - ret <8 x i8> %tmp3 -} - -define <4 x i16> @v_andi16(<4 x i16>* %A, <4 x i16>* %B) nounwind { -;CHECK: v_andi16: -;CHECK: vand - %tmp1 = load <4 x i16>* %A - %tmp2 = load <4 x i16>* %B - %tmp3 = and <4 x i16> %tmp1, %tmp2 - ret <4 x i16> %tmp3 -} - -define <2 x i32> @v_andi32(<2 x i32>* %A, <2 x i32>* %B) nounwind { -;CHECK: v_andi32: -;CHECK: vand - %tmp1 = load <2 x i32>* %A - %tmp2 = load <2 x i32>* %B - %tmp3 = and <2 x i32> %tmp1, %tmp2 - ret <2 x i32> %tmp3 -} - -define <1 x i64> @v_andi64(<1 x i64>* %A, <1 x i64>* %B) nounwind { -;CHECK: v_andi64: -;CHECK: vand - %tmp1 = load <1 x i64>* %A - %tmp2 = load <1 x i64>* %B - %tmp3 = and <1 x i64> %tmp1, %tmp2 - ret <1 x i64> %tmp3 -} - -define <16 x i8> @v_andQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind { -;CHECK: v_andQi8: -;CHECK: vand - %tmp1 = load <16 x i8>* %A - %tmp2 = load <16 x i8>* %B - %tmp3 = and <16 x i8> %tmp1, %tmp2 - ret <16 x i8> %tmp3 -} - -define <8 x i16> @v_andQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind { -;CHECK: v_andQi16: -;CHECK: vand - %tmp1 = load <8 x i16>* %A - %tmp2 = load <8 x i16>* %B - %tmp3 = and <8 x i16> %tmp1, %tmp2 - ret <8 x i16> %tmp3 -} - -define <4 x i32> @v_andQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind { -;CHECK: v_andQi32: -;CHECK: vand - %tmp1 = load <4 x i32>* %A - %tmp2 = load <4 x i32>* %B - %tmp3 = and <4 x i32> %tmp1, %tmp2 - ret <4 x i32> %tmp3 -} - -define <2 x i64> @v_andQi64(<2 x i64>* %A, <2 x i64>* %B) nounwind { -;CHECK: v_andQi64: -;CHECK: vand - %tmp1 = load <2 x i64>* %A - %tmp2 = load <2 x i64>* %B - %tmp3 = and <2 x i64> %tmp1, %tmp2 - ret <2 x i64> %tmp3 -} diff --git a/test/CodeGen/ARM/vbic.ll b/test/CodeGen/ARM/vbic.ll deleted file mode 100644 index 2f79232..0000000 --- a/test/CodeGen/ARM/vbic.ll +++ /dev/null @@ -1,81 +0,0 @@ -; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s - -define <8 x i8> @v_bici8(<8 x i8>* %A, <8 x i8>* %B) nounwind { -;CHECK: v_bici8: -;CHECK: vbic - %tmp1 = load <8 x i8>* %A - %tmp2 = load <8 x i8>* %B - %tmp3 = xor <8 x i8> %tmp2, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 > - %tmp4 = and <8 x i8> %tmp1, %tmp3 - ret <8 x i8> %tmp4 -} - -define <4 x i16> @v_bici16(<4 x i16>* %A, <4 x i16>* %B) nounwind { -;CHECK: v_bici16: -;CHECK: vbic - %tmp1 = load <4 x i16>* %A - %tmp2 = load <4 x i16>* %B - %tmp3 = xor <4 x i16> %tmp2, < i16 -1, i16 -1, i16 -1, i16 -1 > - %tmp4 = and <4 x i16> %tmp1, %tmp3 - ret <4 x i16> %tmp4 -} - -define <2 x i32> @v_bici32(<2 x i32>* %A, <2 x i32>* %B) nounwind { -;CHECK: v_bici32: -;CHECK: vbic - %tmp1 = load <2 x i32>* %A - %tmp2 = load <2 x i32>* %B - %tmp3 = xor <2 x i32> %tmp2, < i32 -1, i32 -1 > - %tmp4 = and <2 x i32> %tmp1, %tmp3 - ret <2 x i32> %tmp4 -} - -define <1 x i64> @v_bici64(<1 x i64>* %A, <1 x i64>* %B) nounwind { -;CHECK: v_bici64: -;CHECK: vbic - %tmp1 = load <1 x i64>* %A - %tmp2 = load <1 x i64>* %B - %tmp3 = xor <1 x i64> %tmp2, < i64 -1 > - %tmp4 = and <1 x i64> %tmp1, %tmp3 - ret <1 x i64> %tmp4 -} - -define <16 x i8> @v_bicQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind { -;CHECK: v_bicQi8: -;CHECK: vbic - %tmp1 = load <16 x i8>* %A - %tmp2 = load <16 x i8>* %B - %tmp3 = xor <16 x i8> %tmp2, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 > - %tmp4 = and <16 x i8> %tmp1, %tmp3 - ret <16 x i8> %tmp4 -} - -define <8 x i16> @v_bicQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind { -;CHECK: v_bicQi16: -;CHECK: vbic - %tmp1 = load <8 x i16>* %A - %tmp2 = load <8 x i16>* %B - %tmp3 = xor <8 x i16> %tmp2, < i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1 > - %tmp4 = and <8 x i16> %tmp1, %tmp3 - ret <8 x i16> %tmp4 -} - -define <4 x i32> @v_bicQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind { -;CHECK: v_bicQi32: -;CHECK: vbic - %tmp1 = load <4 x i32>* %A - %tmp2 = load <4 x i32>* %B - %tmp3 = xor <4 x i32> %tmp2, < i32 -1, i32 -1, i32 -1, i32 -1 > - %tmp4 = and <4 x i32> %tmp1, %tmp3 - ret <4 x i32> %tmp4 -} - -define <2 x i64> @v_bicQi64(<2 x i64>* %A, <2 x i64>* %B) nounwind { -;CHECK: v_bicQi64: -;CHECK: vbic - %tmp1 = load <2 x i64>* %A - %tmp2 = load <2 x i64>* %B - %tmp3 = xor <2 x i64> %tmp2, < i64 -1, i64 -1 > - %tmp4 = and <2 x i64> %tmp1, %tmp3 - ret <2 x i64> %tmp4 -} diff --git a/test/CodeGen/ARM/vbits.ll b/test/CodeGen/ARM/vbits.ll new file mode 100644 index 0000000..e1d23a1 --- /dev/null +++ b/test/CodeGen/ARM/vbits.ll @@ -0,0 +1,507 @@ +; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s + +define <8 x i8> @v_andi8(<8 x i8>* %A, <8 x i8>* %B) nounwind { +;CHECK: v_andi8: +;CHECK: vand + %tmp1 = load <8 x i8>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = and <8 x i8> %tmp1, %tmp2 + ret <8 x i8> %tmp3 +} + +define <4 x i16> @v_andi16(<4 x i16>* %A, <4 x i16>* %B) nounwind { +;CHECK: v_andi16: +;CHECK: vand + %tmp1 = load <4 x i16>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = and <4 x i16> %tmp1, %tmp2 + ret <4 x i16> %tmp3 +} + +define <2 x i32> @v_andi32(<2 x i32>* %A, <2 x i32>* %B) nounwind { +;CHECK: v_andi32: +;CHECK: vand + %tmp1 = load <2 x i32>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = and <2 x i32> %tmp1, %tmp2 + ret <2 x i32> %tmp3 +} + +define <1 x i64> @v_andi64(<1 x i64>* %A, <1 x i64>* %B) nounwind { +;CHECK: v_andi64: +;CHECK: vand + %tmp1 = load <1 x i64>* %A + %tmp2 = load <1 x i64>* %B + %tmp3 = and <1 x i64> %tmp1, %tmp2 + ret <1 x i64> %tmp3 +} + +define <16 x i8> @v_andQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind { +;CHECK: v_andQi8: +;CHECK: vand + %tmp1 = load <16 x i8>* %A + %tmp2 = load <16 x i8>* %B + %tmp3 = and <16 x i8> %tmp1, %tmp2 + ret <16 x i8> %tmp3 +} + +define <8 x i16> @v_andQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind { +;CHECK: v_andQi16: +;CHECK: vand + %tmp1 = load <8 x i16>* %A + %tmp2 = load <8 x i16>* %B + %tmp3 = and <8 x i16> %tmp1, %tmp2 + ret <8 x i16> %tmp3 +} + +define <4 x i32> @v_andQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind { +;CHECK: v_andQi32: +;CHECK: vand + %tmp1 = load <4 x i32>* %A + %tmp2 = load <4 x i32>* %B + %tmp3 = and <4 x i32> %tmp1, %tmp2 + ret <4 x i32> %tmp3 +} + +define <2 x i64> @v_andQi64(<2 x i64>* %A, <2 x i64>* %B) nounwind { +;CHECK: v_andQi64: +;CHECK: vand + %tmp1 = load <2 x i64>* %A + %tmp2 = load <2 x i64>* %B + %tmp3 = and <2 x i64> %tmp1, %tmp2 + ret <2 x i64> %tmp3 +} + +define <8 x i8> @v_bici8(<8 x i8>* %A, <8 x i8>* %B) nounwind { +;CHECK: v_bici8: +;CHECK: vbic + %tmp1 = load <8 x i8>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = xor <8 x i8> %tmp2, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 > + %tmp4 = and <8 x i8> %tmp1, %tmp3 + ret <8 x i8> %tmp4 +} + +define <4 x i16> @v_bici16(<4 x i16>* %A, <4 x i16>* %B) nounwind { +;CHECK: v_bici16: +;CHECK: vbic + %tmp1 = load <4 x i16>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = xor <4 x i16> %tmp2, < i16 -1, i16 -1, i16 -1, i16 -1 > + %tmp4 = and <4 x i16> %tmp1, %tmp3 + ret <4 x i16> %tmp4 +} + +define <2 x i32> @v_bici32(<2 x i32>* %A, <2 x i32>* %B) nounwind { +;CHECK: v_bici32: +;CHECK: vbic + %tmp1 = load <2 x i32>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = xor <2 x i32> %tmp2, < i32 -1, i32 -1 > + %tmp4 = and <2 x i32> %tmp1, %tmp3 + ret <2 x i32> %tmp4 +} + +define <1 x i64> @v_bici64(<1 x i64>* %A, <1 x i64>* %B) nounwind { +;CHECK: v_bici64: +;CHECK: vbic + %tmp1 = load <1 x i64>* %A + %tmp2 = load <1 x i64>* %B + %tmp3 = xor <1 x i64> %tmp2, < i64 -1 > + %tmp4 = and <1 x i64> %tmp1, %tmp3 + ret <1 x i64> %tmp4 +} + +define <16 x i8> @v_bicQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind { +;CHECK: v_bicQi8: +;CHECK: vbic + %tmp1 = load <16 x i8>* %A + %tmp2 = load <16 x i8>* %B + %tmp3 = xor <16 x i8> %tmp2, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 > + %tmp4 = and <16 x i8> %tmp1, %tmp3 + ret <16 x i8> %tmp4 +} + +define <8 x i16> @v_bicQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind { +;CHECK: v_bicQi16: +;CHECK: vbic + %tmp1 = load <8 x i16>* %A + %tmp2 = load <8 x i16>* %B + %tmp3 = xor <8 x i16> %tmp2, < i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1 > + %tmp4 = and <8 x i16> %tmp1, %tmp3 + ret <8 x i16> %tmp4 +} + +define <4 x i32> @v_bicQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind { +;CHECK: v_bicQi32: +;CHECK: vbic + %tmp1 = load <4 x i32>* %A + %tmp2 = load <4 x i32>* %B + %tmp3 = xor <4 x i32> %tmp2, < i32 -1, i32 -1, i32 -1, i32 -1 > + %tmp4 = and <4 x i32> %tmp1, %tmp3 + ret <4 x i32> %tmp4 +} + +define <2 x i64> @v_bicQi64(<2 x i64>* %A, <2 x i64>* %B) nounwind { +;CHECK: v_bicQi64: +;CHECK: vbic + %tmp1 = load <2 x i64>* %A + %tmp2 = load <2 x i64>* %B + %tmp3 = xor <2 x i64> %tmp2, < i64 -1, i64 -1 > + %tmp4 = and <2 x i64> %tmp1, %tmp3 + ret <2 x i64> %tmp4 +} + +define <8 x i8> @v_eori8(<8 x i8>* %A, <8 x i8>* %B) nounwind { +;CHECK: v_eori8: +;CHECK: veor + %tmp1 = load <8 x i8>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = xor <8 x i8> %tmp1, %tmp2 + ret <8 x i8> %tmp3 +} + +define <4 x i16> @v_eori16(<4 x i16>* %A, <4 x i16>* %B) nounwind { +;CHECK: v_eori16: +;CHECK: veor + %tmp1 = load <4 x i16>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = xor <4 x i16> %tmp1, %tmp2 + ret <4 x i16> %tmp3 +} + +define <2 x i32> @v_eori32(<2 x i32>* %A, <2 x i32>* %B) nounwind { +;CHECK: v_eori32: +;CHECK: veor + %tmp1 = load <2 x i32>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = xor <2 x i32> %tmp1, %tmp2 + ret <2 x i32> %tmp3 +} + +define <1 x i64> @v_eori64(<1 x i64>* %A, <1 x i64>* %B) nounwind { +;CHECK: v_eori64: +;CHECK: veor + %tmp1 = load <1 x i64>* %A + %tmp2 = load <1 x i64>* %B + %tmp3 = xor <1 x i64> %tmp1, %tmp2 + ret <1 x i64> %tmp3 +} + +define <16 x i8> @v_eorQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind { +;CHECK: v_eorQi8: +;CHECK: veor + %tmp1 = load <16 x i8>* %A + %tmp2 = load <16 x i8>* %B + %tmp3 = xor <16 x i8> %tmp1, %tmp2 + ret <16 x i8> %tmp3 +} + +define <8 x i16> @v_eorQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind { +;CHECK: v_eorQi16: +;CHECK: veor + %tmp1 = load <8 x i16>* %A + %tmp2 = load <8 x i16>* %B + %tmp3 = xor <8 x i16> %tmp1, %tmp2 + ret <8 x i16> %tmp3 +} + +define <4 x i32> @v_eorQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind { +;CHECK: v_eorQi32: +;CHECK: veor + %tmp1 = load <4 x i32>* %A + %tmp2 = load <4 x i32>* %B + %tmp3 = xor <4 x i32> %tmp1, %tmp2 + ret <4 x i32> %tmp3 +} + +define <2 x i64> @v_eorQi64(<2 x i64>* %A, <2 x i64>* %B) nounwind { +;CHECK: v_eorQi64: +;CHECK: veor + %tmp1 = load <2 x i64>* %A + %tmp2 = load <2 x i64>* %B + %tmp3 = xor <2 x i64> %tmp1, %tmp2 + ret <2 x i64> %tmp3 +} + +define <8 x i8> @v_mvni8(<8 x i8>* %A) nounwind { +;CHECK: v_mvni8: +;CHECK: vmvn + %tmp1 = load <8 x i8>* %A + %tmp2 = xor <8 x i8> %tmp1, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 > + ret <8 x i8> %tmp2 +} + +define <4 x i16> @v_mvni16(<4 x i16>* %A) nounwind { +;CHECK: v_mvni16: +;CHECK: vmvn + %tmp1 = load <4 x i16>* %A + %tmp2 = xor <4 x i16> %tmp1, < i16 -1, i16 -1, i16 -1, i16 -1 > + ret <4 x i16> %tmp2 +} + +define <2 x i32> @v_mvni32(<2 x i32>* %A) nounwind { +;CHECK: v_mvni32: +;CHECK: vmvn + %tmp1 = load <2 x i32>* %A + %tmp2 = xor <2 x i32> %tmp1, < i32 -1, i32 -1 > + ret <2 x i32> %tmp2 +} + +define <1 x i64> @v_mvni64(<1 x i64>* %A) nounwind { +;CHECK: v_mvni64: +;CHECK: vmvn + %tmp1 = load <1 x i64>* %A + %tmp2 = xor <1 x i64> %tmp1, < i64 -1 > + ret <1 x i64> %tmp2 +} + +define <16 x i8> @v_mvnQi8(<16 x i8>* %A) nounwind { +;CHECK: v_mvnQi8: +;CHECK: vmvn + %tmp1 = load <16 x i8>* %A + %tmp2 = xor <16 x i8> %tmp1, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 > + ret <16 x i8> %tmp2 +} + +define <8 x i16> @v_mvnQi16(<8 x i16>* %A) nounwind { +;CHECK: v_mvnQi16: +;CHECK: vmvn + %tmp1 = load <8 x i16>* %A + %tmp2 = xor <8 x i16> %tmp1, < i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1 > + ret <8 x i16> %tmp2 +} + +define <4 x i32> @v_mvnQi32(<4 x i32>* %A) nounwind { +;CHECK: v_mvnQi32: +;CHECK: vmvn + %tmp1 = load <4 x i32>* %A + %tmp2 = xor <4 x i32> %tmp1, < i32 -1, i32 -1, i32 -1, i32 -1 > + ret <4 x i32> %tmp2 +} + +define <2 x i64> @v_mvnQi64(<2 x i64>* %A) nounwind { +;CHECK: v_mvnQi64: +;CHECK: vmvn + %tmp1 = load <2 x i64>* %A + %tmp2 = xor <2 x i64> %tmp1, < i64 -1, i64 -1 > + ret <2 x i64> %tmp2 +} + +define <8 x i8> @v_orri8(<8 x i8>* %A, <8 x i8>* %B) nounwind { +;CHECK: v_orri8: +;CHECK: vorr + %tmp1 = load <8 x i8>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = or <8 x i8> %tmp1, %tmp2 + ret <8 x i8> %tmp3 +} + +define <4 x i16> @v_orri16(<4 x i16>* %A, <4 x i16>* %B) nounwind { +;CHECK: v_orri16: +;CHECK: vorr + %tmp1 = load <4 x i16>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = or <4 x i16> %tmp1, %tmp2 + ret <4 x i16> %tmp3 +} + +define <2 x i32> @v_orri32(<2 x i32>* %A, <2 x i32>* %B) nounwind { +;CHECK: v_orri32: +;CHECK: vorr + %tmp1 = load <2 x i32>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = or <2 x i32> %tmp1, %tmp2 + ret <2 x i32> %tmp3 +} + +define <1 x i64> @v_orri64(<1 x i64>* %A, <1 x i64>* %B) nounwind { +;CHECK: v_orri64: +;CHECK: vorr + %tmp1 = load <1 x i64>* %A + %tmp2 = load <1 x i64>* %B + %tmp3 = or <1 x i64> %tmp1, %tmp2 + ret <1 x i64> %tmp3 +} + +define <16 x i8> @v_orrQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind { +;CHECK: v_orrQi8: +;CHECK: vorr + %tmp1 = load <16 x i8>* %A + %tmp2 = load <16 x i8>* %B + %tmp3 = or <16 x i8> %tmp1, %tmp2 + ret <16 x i8> %tmp3 +} + +define <8 x i16> @v_orrQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind { +;CHECK: v_orrQi16: +;CHECK: vorr + %tmp1 = load <8 x i16>* %A + %tmp2 = load <8 x i16>* %B + %tmp3 = or <8 x i16> %tmp1, %tmp2 + ret <8 x i16> %tmp3 +} + +define <4 x i32> @v_orrQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind { +;CHECK: v_orrQi32: +;CHECK: vorr + %tmp1 = load <4 x i32>* %A + %tmp2 = load <4 x i32>* %B + %tmp3 = or <4 x i32> %tmp1, %tmp2 + ret <4 x i32> %tmp3 +} + +define <2 x i64> @v_orrQi64(<2 x i64>* %A, <2 x i64>* %B) nounwind { +;CHECK: v_orrQi64: +;CHECK: vorr + %tmp1 = load <2 x i64>* %A + %tmp2 = load <2 x i64>* %B + %tmp3 = or <2 x i64> %tmp1, %tmp2 + ret <2 x i64> %tmp3 +} + +define <8 x i8> @v_orni8(<8 x i8>* %A, <8 x i8>* %B) nounwind { +;CHECK: v_orni8: +;CHECK: vorn + %tmp1 = load <8 x i8>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = xor <8 x i8> %tmp2, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 > + %tmp4 = or <8 x i8> %tmp1, %tmp3 + ret <8 x i8> %tmp4 +} + +define <4 x i16> @v_orni16(<4 x i16>* %A, <4 x i16>* %B) nounwind { +;CHECK: v_orni16: +;CHECK: vorn + %tmp1 = load <4 x i16>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = xor <4 x i16> %tmp2, < i16 -1, i16 -1, i16 -1, i16 -1 > + %tmp4 = or <4 x i16> %tmp1, %tmp3 + ret <4 x i16> %tmp4 +} + +define <2 x i32> @v_orni32(<2 x i32>* %A, <2 x i32>* %B) nounwind { +;CHECK: v_orni32: +;CHECK: vorn + %tmp1 = load <2 x i32>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = xor <2 x i32> %tmp2, < i32 -1, i32 -1 > + %tmp4 = or <2 x i32> %tmp1, %tmp3 + ret <2 x i32> %tmp4 +} + +define <1 x i64> @v_orni64(<1 x i64>* %A, <1 x i64>* %B) nounwind { +;CHECK: v_orni64: +;CHECK: vorn + %tmp1 = load <1 x i64>* %A + %tmp2 = load <1 x i64>* %B + %tmp3 = xor <1 x i64> %tmp2, < i64 -1 > + %tmp4 = or <1 x i64> %tmp1, %tmp3 + ret <1 x i64> %tmp4 +} + +define <16 x i8> @v_ornQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind { +;CHECK: v_ornQi8: +;CHECK: vorn + %tmp1 = load <16 x i8>* %A + %tmp2 = load <16 x i8>* %B + %tmp3 = xor <16 x i8> %tmp2, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 > + %tmp4 = or <16 x i8> %tmp1, %tmp3 + ret <16 x i8> %tmp4 +} + +define <8 x i16> @v_ornQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind { +;CHECK: v_ornQi16: +;CHECK: vorn + %tmp1 = load <8 x i16>* %A + %tmp2 = load <8 x i16>* %B + %tmp3 = xor <8 x i16> %tmp2, < i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1 > + %tmp4 = or <8 x i16> %tmp1, %tmp3 + ret <8 x i16> %tmp4 +} + +define <4 x i32> @v_ornQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind { +;CHECK: v_ornQi32: +;CHECK: vorn + %tmp1 = load <4 x i32>* %A + %tmp2 = load <4 x i32>* %B + %tmp3 = xor <4 x i32> %tmp2, < i32 -1, i32 -1, i32 -1, i32 -1 > + %tmp4 = or <4 x i32> %tmp1, %tmp3 + ret <4 x i32> %tmp4 +} + +define <2 x i64> @v_ornQi64(<2 x i64>* %A, <2 x i64>* %B) nounwind { +;CHECK: v_ornQi64: +;CHECK: vorn + %tmp1 = load <2 x i64>* %A + %tmp2 = load <2 x i64>* %B + %tmp3 = xor <2 x i64> %tmp2, < i64 -1, i64 -1 > + %tmp4 = or <2 x i64> %tmp1, %tmp3 + ret <2 x i64> %tmp4 +} + +define <8 x i8> @vtsti8(<8 x i8>* %A, <8 x i8>* %B) nounwind { +;CHECK: vtsti8: +;CHECK: vtst.i8 + %tmp1 = load <8 x i8>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = and <8 x i8> %tmp1, %tmp2 + %tmp4 = icmp ne <8 x i8> %tmp3, zeroinitializer + %tmp5 = sext <8 x i1> %tmp4 to <8 x i8> + ret <8 x i8> %tmp5 +} + +define <4 x i16> @vtsti16(<4 x i16>* %A, <4 x i16>* %B) nounwind { +;CHECK: vtsti16: +;CHECK: vtst.i16 + %tmp1 = load <4 x i16>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = and <4 x i16> %tmp1, %tmp2 + %tmp4 = icmp ne <4 x i16> %tmp3, zeroinitializer + %tmp5 = sext <4 x i1> %tmp4 to <4 x i16> + ret <4 x i16> %tmp5 +} + +define <2 x i32> @vtsti32(<2 x i32>* %A, <2 x i32>* %B) nounwind { +;CHECK: vtsti32: +;CHECK: vtst.i32 + %tmp1 = load <2 x i32>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = and <2 x i32> %tmp1, %tmp2 + %tmp4 = icmp ne <2 x i32> %tmp3, zeroinitializer + %tmp5 = sext <2 x i1> %tmp4 to <2 x i32> + ret <2 x i32> %tmp5 +} + +define <16 x i8> @vtstQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind { +;CHECK: vtstQi8: +;CHECK: vtst.i8 + %tmp1 = load <16 x i8>* %A + %tmp2 = load <16 x i8>* %B + %tmp3 = and <16 x i8> %tmp1, %tmp2 + %tmp4 = icmp ne <16 x i8> %tmp3, zeroinitializer + %tmp5 = sext <16 x i1> %tmp4 to <16 x i8> + ret <16 x i8> %tmp5 +} + +define <8 x i16> @vtstQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind { +;CHECK: vtstQi16: +;CHECK: vtst.i16 + %tmp1 = load <8 x i16>* %A + %tmp2 = load <8 x i16>* %B + %tmp3 = and <8 x i16> %tmp1, %tmp2 + %tmp4 = icmp ne <8 x i16> %tmp3, zeroinitializer + %tmp5 = sext <8 x i1> %tmp4 to <8 x i16> + ret <8 x i16> %tmp5 +} + +define <4 x i32> @vtstQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind { +;CHECK: vtstQi32: +;CHECK: vtst.i32 + %tmp1 = load <4 x i32>* %A + %tmp2 = load <4 x i32>* %B + %tmp3 = and <4 x i32> %tmp1, %tmp2 + %tmp4 = icmp ne <4 x i32> %tmp3, zeroinitializer + %tmp5 = sext <4 x i1> %tmp4 to <4 x i32> + ret <4 x i32> %tmp5 +} diff --git a/test/CodeGen/ARM/vcge.ll b/test/CodeGen/ARM/vcge.ll index b8debd9..2c16111 100644 --- a/test/CodeGen/ARM/vcge.ll +++ b/test/CodeGen/ARM/vcge.ll @@ -139,3 +139,24 @@ define <4 x i32> @vcgeQf32(<4 x float>* %A, <4 x float>* %B) nounwind { %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> ret <4 x i32> %tmp4 } + +define <2 x i32> @vacgef32(<2 x float>* %A, <2 x float>* %B) nounwind { +;CHECK: vacgef32: +;CHECK: vacge.f32 + %tmp1 = load <2 x float>* %A + %tmp2 = load <2 x float>* %B + %tmp3 = call <2 x i32> @llvm.arm.neon.vacged(<2 x float> %tmp1, <2 x float> %tmp2) + ret <2 x i32> %tmp3 +} + +define <4 x i32> @vacgeQf32(<4 x float>* %A, <4 x float>* %B) nounwind { +;CHECK: vacgeQf32: +;CHECK: vacge.f32 + %tmp1 = load <4 x float>* %A + %tmp2 = load <4 x float>* %B + %tmp3 = call <4 x i32> @llvm.arm.neon.vacgeq(<4 x float> %tmp1, <4 x float> %tmp2) + ret <4 x i32> %tmp3 +} + +declare <2 x i32> @llvm.arm.neon.vacged(<2 x float>, <2 x float>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vacgeq(<4 x float>, <4 x float>) nounwind readnone diff --git a/test/CodeGen/ARM/vcgt.ll b/test/CodeGen/ARM/vcgt.ll index eae29b2..6b11ba5 100644 --- a/test/CodeGen/ARM/vcgt.ll +++ b/test/CodeGen/ARM/vcgt.ll @@ -139,3 +139,24 @@ define <4 x i32> @vcgtQf32(<4 x float>* %A, <4 x float>* %B) nounwind { %tmp4 = sext <4 x i1> %tmp3 to <4 x i32> ret <4 x i32> %tmp4 } + +define <2 x i32> @vacgtf32(<2 x float>* %A, <2 x float>* %B) nounwind { +;CHECK: vacgtf32: +;CHECK: vacgt.f32 + %tmp1 = load <2 x float>* %A + %tmp2 = load <2 x float>* %B + %tmp3 = call <2 x i32> @llvm.arm.neon.vacgtd(<2 x float> %tmp1, <2 x float> %tmp2) + ret <2 x i32> %tmp3 +} + +define <4 x i32> @vacgtQf32(<4 x float>* %A, <4 x float>* %B) nounwind { +;CHECK: vacgtQf32: +;CHECK: vacgt.f32 + %tmp1 = load <4 x float>* %A + %tmp2 = load <4 x float>* %B + %tmp3 = call <4 x i32> @llvm.arm.neon.vacgtq(<4 x float> %tmp1, <4 x float> %tmp2) + ret <4 x i32> %tmp3 +} + +declare <2 x i32> @llvm.arm.neon.vacgtd(<2 x float>, <2 x float>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vacgtq(<4 x float>, <4 x float>) nounwind readnone diff --git a/test/CodeGen/ARM/vcls.ll b/test/CodeGen/ARM/vcls.ll deleted file mode 100644 index 43bd3f9..0000000 --- a/test/CodeGen/ARM/vcls.ll +++ /dev/null @@ -1,57 +0,0 @@ -; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s - -define <8 x i8> @vclss8(<8 x i8>* %A) nounwind { -;CHECK: vclss8: -;CHECK: vcls.s8 - %tmp1 = load <8 x i8>* %A - %tmp2 = call <8 x i8> @llvm.arm.neon.vcls.v8i8(<8 x i8> %tmp1) - ret <8 x i8> %tmp2 -} - -define <4 x i16> @vclss16(<4 x i16>* %A) nounwind { -;CHECK: vclss16: -;CHECK: vcls.s16 - %tmp1 = load <4 x i16>* %A - %tmp2 = call <4 x i16> @llvm.arm.neon.vcls.v4i16(<4 x i16> %tmp1) - ret <4 x i16> %tmp2 -} - -define <2 x i32> @vclss32(<2 x i32>* %A) nounwind { -;CHECK: vclss32: -;CHECK: vcls.s32 - %tmp1 = load <2 x i32>* %A - %tmp2 = call <2 x i32> @llvm.arm.neon.vcls.v2i32(<2 x i32> %tmp1) - ret <2 x i32> %tmp2 -} - -define <16 x i8> @vclsQs8(<16 x i8>* %A) nounwind { -;CHECK: vclsQs8: -;CHECK: vcls.s8 - %tmp1 = load <16 x i8>* %A - %tmp2 = call <16 x i8> @llvm.arm.neon.vcls.v16i8(<16 x i8> %tmp1) - ret <16 x i8> %tmp2 -} - -define <8 x i16> @vclsQs16(<8 x i16>* %A) nounwind { -;CHECK: vclsQs16: -;CHECK: vcls.s16 - %tmp1 = load <8 x i16>* %A - %tmp2 = call <8 x i16> @llvm.arm.neon.vcls.v8i16(<8 x i16> %tmp1) - ret <8 x i16> %tmp2 -} - -define <4 x i32> @vclsQs32(<4 x i32>* %A) nounwind { -;CHECK: vclsQs32: -;CHECK: vcls.s32 - %tmp1 = load <4 x i32>* %A - %tmp2 = call <4 x i32> @llvm.arm.neon.vcls.v4i32(<4 x i32> %tmp1) - ret <4 x i32> %tmp2 -} - -declare <8 x i8> @llvm.arm.neon.vcls.v8i8(<8 x i8>) nounwind readnone -declare <4 x i16> @llvm.arm.neon.vcls.v4i16(<4 x i16>) nounwind readnone -declare <2 x i32> @llvm.arm.neon.vcls.v2i32(<2 x i32>) nounwind readnone - -declare <16 x i8> @llvm.arm.neon.vcls.v16i8(<16 x i8>) nounwind readnone -declare <8 x i16> @llvm.arm.neon.vcls.v8i16(<8 x i16>) nounwind readnone -declare <4 x i32> @llvm.arm.neon.vcls.v4i32(<4 x i32>) nounwind readnone diff --git a/test/CodeGen/ARM/vclz.ll b/test/CodeGen/ARM/vclz.ll deleted file mode 100644 index ec439db..0000000 --- a/test/CodeGen/ARM/vclz.ll +++ /dev/null @@ -1,57 +0,0 @@ -; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s - -define <8 x i8> @vclz8(<8 x i8>* %A) nounwind { -;CHECK: vclz8: -;CHECK: vclz.i8 - %tmp1 = load <8 x i8>* %A - %tmp2 = call <8 x i8> @llvm.arm.neon.vclz.v8i8(<8 x i8> %tmp1) - ret <8 x i8> %tmp2 -} - -define <4 x i16> @vclz16(<4 x i16>* %A) nounwind { -;CHECK: vclz16: -;CHECK: vclz.i16 - %tmp1 = load <4 x i16>* %A - %tmp2 = call <4 x i16> @llvm.arm.neon.vclz.v4i16(<4 x i16> %tmp1) - ret <4 x i16> %tmp2 -} - -define <2 x i32> @vclz32(<2 x i32>* %A) nounwind { -;CHECK: vclz32: -;CHECK: vclz.i32 - %tmp1 = load <2 x i32>* %A - %tmp2 = call <2 x i32> @llvm.arm.neon.vclz.v2i32(<2 x i32> %tmp1) - ret <2 x i32> %tmp2 -} - -define <16 x i8> @vclzQ8(<16 x i8>* %A) nounwind { -;CHECK: vclzQ8: -;CHECK: vclz.i8 - %tmp1 = load <16 x i8>* %A - %tmp2 = call <16 x i8> @llvm.arm.neon.vclz.v16i8(<16 x i8> %tmp1) - ret <16 x i8> %tmp2 -} - -define <8 x i16> @vclzQ16(<8 x i16>* %A) nounwind { -;CHECK: vclzQ16: -;CHECK: vclz.i16 - %tmp1 = load <8 x i16>* %A - %tmp2 = call <8 x i16> @llvm.arm.neon.vclz.v8i16(<8 x i16> %tmp1) - ret <8 x i16> %tmp2 -} - -define <4 x i32> @vclzQ32(<4 x i32>* %A) nounwind { -;CHECK: vclzQ32: -;CHECK: vclz.i32 - %tmp1 = load <4 x i32>* %A - %tmp2 = call <4 x i32> @llvm.arm.neon.vclz.v4i32(<4 x i32> %tmp1) - ret <4 x i32> %tmp2 -} - -declare <8 x i8> @llvm.arm.neon.vclz.v8i8(<8 x i8>) nounwind readnone -declare <4 x i16> @llvm.arm.neon.vclz.v4i16(<4 x i16>) nounwind readnone -declare <2 x i32> @llvm.arm.neon.vclz.v2i32(<2 x i32>) nounwind readnone - -declare <16 x i8> @llvm.arm.neon.vclz.v16i8(<16 x i8>) nounwind readnone -declare <8 x i16> @llvm.arm.neon.vclz.v8i16(<8 x i16>) nounwind readnone -declare <4 x i32> @llvm.arm.neon.vclz.v4i32(<4 x i32>) nounwind readnone diff --git a/test/CodeGen/ARM/vcnt.ll b/test/CodeGen/ARM/vcnt.ll index 7e045ee..450f90d 100644 --- a/test/CodeGen/ARM/vcnt.ll +++ b/test/CodeGen/ARM/vcnt.ll @@ -18,3 +18,115 @@ define <16 x i8> @vcntQ8(<16 x i8>* %A) nounwind { declare <8 x i8> @llvm.arm.neon.vcnt.v8i8(<8 x i8>) nounwind readnone declare <16 x i8> @llvm.arm.neon.vcnt.v16i8(<16 x i8>) nounwind readnone + +define <8 x i8> @vclz8(<8 x i8>* %A) nounwind { +;CHECK: vclz8: +;CHECK: vclz.i8 + %tmp1 = load <8 x i8>* %A + %tmp2 = call <8 x i8> @llvm.arm.neon.vclz.v8i8(<8 x i8> %tmp1) + ret <8 x i8> %tmp2 +} + +define <4 x i16> @vclz16(<4 x i16>* %A) nounwind { +;CHECK: vclz16: +;CHECK: vclz.i16 + %tmp1 = load <4 x i16>* %A + %tmp2 = call <4 x i16> @llvm.arm.neon.vclz.v4i16(<4 x i16> %tmp1) + ret <4 x i16> %tmp2 +} + +define <2 x i32> @vclz32(<2 x i32>* %A) nounwind { +;CHECK: vclz32: +;CHECK: vclz.i32 + %tmp1 = load <2 x i32>* %A + %tmp2 = call <2 x i32> @llvm.arm.neon.vclz.v2i32(<2 x i32> %tmp1) + ret <2 x i32> %tmp2 +} + +define <16 x i8> @vclzQ8(<16 x i8>* %A) nounwind { +;CHECK: vclzQ8: +;CHECK: vclz.i8 + %tmp1 = load <16 x i8>* %A + %tmp2 = call <16 x i8> @llvm.arm.neon.vclz.v16i8(<16 x i8> %tmp1) + ret <16 x i8> %tmp2 +} + +define <8 x i16> @vclzQ16(<8 x i16>* %A) nounwind { +;CHECK: vclzQ16: +;CHECK: vclz.i16 + %tmp1 = load <8 x i16>* %A + %tmp2 = call <8 x i16> @llvm.arm.neon.vclz.v8i16(<8 x i16> %tmp1) + ret <8 x i16> %tmp2 +} + +define <4 x i32> @vclzQ32(<4 x i32>* %A) nounwind { +;CHECK: vclzQ32: +;CHECK: vclz.i32 + %tmp1 = load <4 x i32>* %A + %tmp2 = call <4 x i32> @llvm.arm.neon.vclz.v4i32(<4 x i32> %tmp1) + ret <4 x i32> %tmp2 +} + +declare <8 x i8> @llvm.arm.neon.vclz.v8i8(<8 x i8>) nounwind readnone +declare <4 x i16> @llvm.arm.neon.vclz.v4i16(<4 x i16>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vclz.v2i32(<2 x i32>) nounwind readnone + +declare <16 x i8> @llvm.arm.neon.vclz.v16i8(<16 x i8>) nounwind readnone +declare <8 x i16> @llvm.arm.neon.vclz.v8i16(<8 x i16>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vclz.v4i32(<4 x i32>) nounwind readnone + +define <8 x i8> @vclss8(<8 x i8>* %A) nounwind { +;CHECK: vclss8: +;CHECK: vcls.s8 + %tmp1 = load <8 x i8>* %A + %tmp2 = call <8 x i8> @llvm.arm.neon.vcls.v8i8(<8 x i8> %tmp1) + ret <8 x i8> %tmp2 +} + +define <4 x i16> @vclss16(<4 x i16>* %A) nounwind { +;CHECK: vclss16: +;CHECK: vcls.s16 + %tmp1 = load <4 x i16>* %A + %tmp2 = call <4 x i16> @llvm.arm.neon.vcls.v4i16(<4 x i16> %tmp1) + ret <4 x i16> %tmp2 +} + +define <2 x i32> @vclss32(<2 x i32>* %A) nounwind { +;CHECK: vclss32: +;CHECK: vcls.s32 + %tmp1 = load <2 x i32>* %A + %tmp2 = call <2 x i32> @llvm.arm.neon.vcls.v2i32(<2 x i32> %tmp1) + ret <2 x i32> %tmp2 +} + +define <16 x i8> @vclsQs8(<16 x i8>* %A) nounwind { +;CHECK: vclsQs8: +;CHECK: vcls.s8 + %tmp1 = load <16 x i8>* %A + %tmp2 = call <16 x i8> @llvm.arm.neon.vcls.v16i8(<16 x i8> %tmp1) + ret <16 x i8> %tmp2 +} + +define <8 x i16> @vclsQs16(<8 x i16>* %A) nounwind { +;CHECK: vclsQs16: +;CHECK: vcls.s16 + %tmp1 = load <8 x i16>* %A + %tmp2 = call <8 x i16> @llvm.arm.neon.vcls.v8i16(<8 x i16> %tmp1) + ret <8 x i16> %tmp2 +} + +define <4 x i32> @vclsQs32(<4 x i32>* %A) nounwind { +;CHECK: vclsQs32: +;CHECK: vcls.s32 + %tmp1 = load <4 x i32>* %A + %tmp2 = call <4 x i32> @llvm.arm.neon.vcls.v4i32(<4 x i32> %tmp1) + ret <4 x i32> %tmp2 +} + +declare <8 x i8> @llvm.arm.neon.vcls.v8i8(<8 x i8>) nounwind readnone +declare <4 x i16> @llvm.arm.neon.vcls.v4i16(<4 x i16>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vcls.v2i32(<2 x i32>) nounwind readnone + +declare <16 x i8> @llvm.arm.neon.vcls.v16i8(<16 x i8>) nounwind readnone +declare <8 x i16> @llvm.arm.neon.vcls.v8i16(<8 x i16>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vcls.v4i32(<4 x i32>) nounwind readnone diff --git a/test/CodeGen/ARM/vcvt.ll b/test/CodeGen/ARM/vcvt.ll index 795caa4..f4cc536 100644 --- a/test/CodeGen/ARM/vcvt.ll +++ b/test/CodeGen/ARM/vcvt.ll @@ -63,3 +63,78 @@ define <4 x float> @vcvtQ_u32tof32(<4 x i32>* %A) nounwind { %tmp2 = uitofp <4 x i32> %tmp1 to <4 x float> ret <4 x float> %tmp2 } + +define <2 x i32> @vcvt_n_f32tos32(<2 x float>* %A) nounwind { +;CHECK: vcvt_n_f32tos32: +;CHECK: vcvt.s32.f32 + %tmp1 = load <2 x float>* %A + %tmp2 = call <2 x i32> @llvm.arm.neon.vcvtfp2fxs.v2i32.v2f32(<2 x float> %tmp1, i32 1) + ret <2 x i32> %tmp2 +} + +define <2 x i32> @vcvt_n_f32tou32(<2 x float>* %A) nounwind { +;CHECK: vcvt_n_f32tou32: +;CHECK: vcvt.u32.f32 + %tmp1 = load <2 x float>* %A + %tmp2 = call <2 x i32> @llvm.arm.neon.vcvtfp2fxu.v2i32.v2f32(<2 x float> %tmp1, i32 1) + ret <2 x i32> %tmp2 +} + +define <2 x float> @vcvt_n_s32tof32(<2 x i32>* %A) nounwind { +;CHECK: vcvt_n_s32tof32: +;CHECK: vcvt.f32.s32 + %tmp1 = load <2 x i32>* %A + %tmp2 = call <2 x float> @llvm.arm.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32> %tmp1, i32 1) + ret <2 x float> %tmp2 +} + +define <2 x float> @vcvt_n_u32tof32(<2 x i32>* %A) nounwind { +;CHECK: vcvt_n_u32tof32: +;CHECK: vcvt.f32.u32 + %tmp1 = load <2 x i32>* %A + %tmp2 = call <2 x float> @llvm.arm.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32> %tmp1, i32 1) + ret <2 x float> %tmp2 +} + +declare <2 x i32> @llvm.arm.neon.vcvtfp2fxs.v2i32.v2f32(<2 x float>, i32) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vcvtfp2fxu.v2i32.v2f32(<2 x float>, i32) nounwind readnone +declare <2 x float> @llvm.arm.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32>, i32) nounwind readnone +declare <2 x float> @llvm.arm.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32>, i32) nounwind readnone + +define <4 x i32> @vcvtQ_n_f32tos32(<4 x float>* %A) nounwind { +;CHECK: vcvtQ_n_f32tos32: +;CHECK: vcvt.s32.f32 + %tmp1 = load <4 x float>* %A + %tmp2 = call <4 x i32> @llvm.arm.neon.vcvtfp2fxs.v4i32.v4f32(<4 x float> %tmp1, i32 1) + ret <4 x i32> %tmp2 +} + +define <4 x i32> @vcvtQ_n_f32tou32(<4 x float>* %A) nounwind { +;CHECK: vcvtQ_n_f32tou32: +;CHECK: vcvt.u32.f32 + %tmp1 = load <4 x float>* %A + %tmp2 = call <4 x i32> @llvm.arm.neon.vcvtfp2fxu.v4i32.v4f32(<4 x float> %tmp1, i32 1) + ret <4 x i32> %tmp2 +} + +define <4 x float> @vcvtQ_n_s32tof32(<4 x i32>* %A) nounwind { +;CHECK: vcvtQ_n_s32tof32: +;CHECK: vcvt.f32.s32 + %tmp1 = load <4 x i32>* %A + %tmp2 = call <4 x float> @llvm.arm.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32> %tmp1, i32 1) + ret <4 x float> %tmp2 +} + +define <4 x float> @vcvtQ_n_u32tof32(<4 x i32>* %A) nounwind { +;CHECK: vcvtQ_n_u32tof32: +;CHECK: vcvt.f32.u32 + %tmp1 = load <4 x i32>* %A + %tmp2 = call <4 x float> @llvm.arm.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32> %tmp1, i32 1) + ret <4 x float> %tmp2 +} + +declare <4 x i32> @llvm.arm.neon.vcvtfp2fxs.v4i32.v4f32(<4 x float>, i32) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vcvtfp2fxu.v4i32.v4f32(<4 x float>, i32) nounwind readnone +declare <4 x float> @llvm.arm.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32>, i32) nounwind readnone +declare <4 x float> @llvm.arm.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32>, i32) nounwind readnone + diff --git a/test/CodeGen/ARM/vcvt_n.ll b/test/CodeGen/ARM/vcvt_n.ll deleted file mode 100644 index 0ee9976..0000000 --- a/test/CodeGen/ARM/vcvt_n.ll +++ /dev/null @@ -1,76 +0,0 @@ -; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s - -define <2 x i32> @vcvt_f32tos32(<2 x float>* %A) nounwind { -;CHECK: vcvt_f32tos32: -;CHECK: vcvt.s32.f32 - %tmp1 = load <2 x float>* %A - %tmp2 = call <2 x i32> @llvm.arm.neon.vcvtfp2fxs.v2i32.v2f32(<2 x float> %tmp1, i32 1) - ret <2 x i32> %tmp2 -} - -define <2 x i32> @vcvt_f32tou32(<2 x float>* %A) nounwind { -;CHECK: vcvt_f32tou32: -;CHECK: vcvt.u32.f32 - %tmp1 = load <2 x float>* %A - %tmp2 = call <2 x i32> @llvm.arm.neon.vcvtfp2fxu.v2i32.v2f32(<2 x float> %tmp1, i32 1) - ret <2 x i32> %tmp2 -} - -define <2 x float> @vcvt_s32tof32(<2 x i32>* %A) nounwind { -;CHECK: vcvt_s32tof32: -;CHECK: vcvt.f32.s32 - %tmp1 = load <2 x i32>* %A - %tmp2 = call <2 x float> @llvm.arm.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32> %tmp1, i32 1) - ret <2 x float> %tmp2 -} - -define <2 x float> @vcvt_u32tof32(<2 x i32>* %A) nounwind { -;CHECK: vcvt_u32tof32: -;CHECK: vcvt.f32.u32 - %tmp1 = load <2 x i32>* %A - %tmp2 = call <2 x float> @llvm.arm.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32> %tmp1, i32 1) - ret <2 x float> %tmp2 -} - -declare <2 x i32> @llvm.arm.neon.vcvtfp2fxs.v2i32.v2f32(<2 x float>, i32) nounwind readnone -declare <2 x i32> @llvm.arm.neon.vcvtfp2fxu.v2i32.v2f32(<2 x float>, i32) nounwind readnone -declare <2 x float> @llvm.arm.neon.vcvtfxs2fp.v2f32.v2i32(<2 x i32>, i32) nounwind readnone -declare <2 x float> @llvm.arm.neon.vcvtfxu2fp.v2f32.v2i32(<2 x i32>, i32) nounwind readnone - -define <4 x i32> @vcvtQ_f32tos32(<4 x float>* %A) nounwind { -;CHECK: vcvtQ_f32tos32: -;CHECK: vcvt.s32.f32 - %tmp1 = load <4 x float>* %A - %tmp2 = call <4 x i32> @llvm.arm.neon.vcvtfp2fxs.v4i32.v4f32(<4 x float> %tmp1, i32 1) - ret <4 x i32> %tmp2 -} - -define <4 x i32> @vcvtQ_f32tou32(<4 x float>* %A) nounwind { -;CHECK: vcvtQ_f32tou32: -;CHECK: vcvt.u32.f32 - %tmp1 = load <4 x float>* %A - %tmp2 = call <4 x i32> @llvm.arm.neon.vcvtfp2fxu.v4i32.v4f32(<4 x float> %tmp1, i32 1) - ret <4 x i32> %tmp2 -} - -define <4 x float> @vcvtQ_s32tof32(<4 x i32>* %A) nounwind { -;CHECK: vcvtQ_s32tof32: -;CHECK: vcvt.f32.s32 - %tmp1 = load <4 x i32>* %A - %tmp2 = call <4 x float> @llvm.arm.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32> %tmp1, i32 1) - ret <4 x float> %tmp2 -} - -define <4 x float> @vcvtQ_u32tof32(<4 x i32>* %A) nounwind { -;CHECK: vcvtQ_u32tof32: -;CHECK: vcvt.f32.u32 - %tmp1 = load <4 x i32>* %A - %tmp2 = call <4 x float> @llvm.arm.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32> %tmp1, i32 1) - ret <4 x float> %tmp2 -} - -declare <4 x i32> @llvm.arm.neon.vcvtfp2fxs.v4i32.v4f32(<4 x float>, i32) nounwind readnone -declare <4 x i32> @llvm.arm.neon.vcvtfp2fxu.v4i32.v4f32(<4 x float>, i32) nounwind readnone -declare <4 x float> @llvm.arm.neon.vcvtfxs2fp.v4f32.v4i32(<4 x i32>, i32) nounwind readnone -declare <4 x float> @llvm.arm.neon.vcvtfxu2fp.v4f32.v4i32(<4 x i32>, i32) nounwind readnone - diff --git a/test/CodeGen/ARM/vdup.ll b/test/CodeGen/ARM/vdup.ll index cee24c8..c9a68ca 100644 --- a/test/CodeGen/ARM/vdup.ll +++ b/test/CodeGen/ARM/vdup.ll @@ -179,3 +179,91 @@ define <4 x float> @v_shuffledupQfloat2(float* %A) nounwind { %tmp2 = shufflevector <4 x float> %tmp1, <4 x float> undef, <4 x i32> zeroinitializer ret <4 x float> %tmp2 } + +define <8 x i8> @vduplane8(<8 x i8>* %A) nounwind { +;CHECK: vduplane8: +;CHECK: vdup.8 + %tmp1 = load <8 x i8>* %A + %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 > + ret <8 x i8> %tmp2 +} + +define <4 x i16> @vduplane16(<4 x i16>* %A) nounwind { +;CHECK: vduplane16: +;CHECK: vdup.16 + %tmp1 = load <4 x i16>* %A + %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 > + ret <4 x i16> %tmp2 +} + +define <2 x i32> @vduplane32(<2 x i32>* %A) nounwind { +;CHECK: vduplane32: +;CHECK: vdup.32 + %tmp1 = load <2 x i32>* %A + %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> < i32 1, i32 1 > + ret <2 x i32> %tmp2 +} + +define <2 x float> @vduplanefloat(<2 x float>* %A) nounwind { +;CHECK: vduplanefloat: +;CHECK: vdup.32 + %tmp1 = load <2 x float>* %A + %tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <2 x i32> < i32 1, i32 1 > + ret <2 x float> %tmp2 +} + +define <16 x i8> @vduplaneQ8(<8 x i8>* %A) nounwind { +;CHECK: vduplaneQ8: +;CHECK: vdup.8 + %tmp1 = load <8 x i8>* %A + %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <16 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 > + ret <16 x i8> %tmp2 +} + +define <8 x i16> @vduplaneQ16(<4 x i16>* %A) nounwind { +;CHECK: vduplaneQ16: +;CHECK: vdup.16 + %tmp1 = load <4 x i16>* %A + %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <8 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 > + ret <8 x i16> %tmp2 +} + +define <4 x i32> @vduplaneQ32(<2 x i32>* %A) nounwind { +;CHECK: vduplaneQ32: +;CHECK: vdup.32 + %tmp1 = load <2 x i32>* %A + %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 > + ret <4 x i32> %tmp2 +} + +define <4 x float> @vduplaneQfloat(<2 x float>* %A) nounwind { +;CHECK: vduplaneQfloat: +;CHECK: vdup.32 + %tmp1 = load <2 x float>* %A + %tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 > + ret <4 x float> %tmp2 +} + +define arm_apcscc <2 x i64> @foo(<2 x i64> %arg0_int64x1_t) nounwind readnone { +entry: + %0 = shufflevector <2 x i64> %arg0_int64x1_t, <2 x i64> undef, <2 x i32> <i32 1, i32 1> + ret <2 x i64> %0 +} + +define arm_apcscc <2 x i64> @bar(<2 x i64> %arg0_int64x1_t) nounwind readnone { +entry: + %0 = shufflevector <2 x i64> %arg0_int64x1_t, <2 x i64> undef, <2 x i32> <i32 0, i32 0> + ret <2 x i64> %0 +} + +define arm_apcscc <2 x double> @baz(<2 x double> %arg0_int64x1_t) nounwind readnone { +entry: + %0 = shufflevector <2 x double> %arg0_int64x1_t, <2 x double> undef, <2 x i32> <i32 1, i32 1> + ret <2 x double> %0 +} + +define arm_apcscc <2 x double> @qux(<2 x double> %arg0_int64x1_t) nounwind readnone { +entry: + %0 = shufflevector <2 x double> %arg0_int64x1_t, <2 x double> undef, <2 x i32> <i32 0, i32 0> + ret <2 x double> %0 +} diff --git a/test/CodeGen/ARM/vdup_lane.ll b/test/CodeGen/ARM/vdup_lane.ll deleted file mode 100644 index 313260a..0000000 --- a/test/CodeGen/ARM/vdup_lane.ll +++ /dev/null @@ -1,89 +0,0 @@ -; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s - -define <8 x i8> @vduplane8(<8 x i8>* %A) nounwind { -;CHECK: vduplane8: -;CHECK: vdup.8 - %tmp1 = load <8 x i8>* %A - %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <8 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 > - ret <8 x i8> %tmp2 -} - -define <4 x i16> @vduplane16(<4 x i16>* %A) nounwind { -;CHECK: vduplane16: -;CHECK: vdup.16 - %tmp1 = load <4 x i16>* %A - %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 > - ret <4 x i16> %tmp2 -} - -define <2 x i32> @vduplane32(<2 x i32>* %A) nounwind { -;CHECK: vduplane32: -;CHECK: vdup.32 - %tmp1 = load <2 x i32>* %A - %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <2 x i32> < i32 1, i32 1 > - ret <2 x i32> %tmp2 -} - -define <2 x float> @vduplanefloat(<2 x float>* %A) nounwind { -;CHECK: vduplanefloat: -;CHECK: vdup.32 - %tmp1 = load <2 x float>* %A - %tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <2 x i32> < i32 1, i32 1 > - ret <2 x float> %tmp2 -} - -define <16 x i8> @vduplaneQ8(<8 x i8>* %A) nounwind { -;CHECK: vduplaneQ8: -;CHECK: vdup.8 - %tmp1 = load <8 x i8>* %A - %tmp2 = shufflevector <8 x i8> %tmp1, <8 x i8> undef, <16 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 > - ret <16 x i8> %tmp2 -} - -define <8 x i16> @vduplaneQ16(<4 x i16>* %A) nounwind { -;CHECK: vduplaneQ16: -;CHECK: vdup.16 - %tmp1 = load <4 x i16>* %A - %tmp2 = shufflevector <4 x i16> %tmp1, <4 x i16> undef, <8 x i32> < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 > - ret <8 x i16> %tmp2 -} - -define <4 x i32> @vduplaneQ32(<2 x i32>* %A) nounwind { -;CHECK: vduplaneQ32: -;CHECK: vdup.32 - %tmp1 = load <2 x i32>* %A - %tmp2 = shufflevector <2 x i32> %tmp1, <2 x i32> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 > - ret <4 x i32> %tmp2 -} - -define <4 x float> @vduplaneQfloat(<2 x float>* %A) nounwind { -;CHECK: vduplaneQfloat: -;CHECK: vdup.32 - %tmp1 = load <2 x float>* %A - %tmp2 = shufflevector <2 x float> %tmp1, <2 x float> undef, <4 x i32> < i32 1, i32 1, i32 1, i32 1 > - ret <4 x float> %tmp2 -} - -define arm_apcscc <2 x i64> @foo(<2 x i64> %arg0_int64x1_t) nounwind readnone { -entry: - %0 = shufflevector <2 x i64> %arg0_int64x1_t, <2 x i64> undef, <2 x i32> <i32 1, i32 1> - ret <2 x i64> %0 -} - -define arm_apcscc <2 x i64> @bar(<2 x i64> %arg0_int64x1_t) nounwind readnone { -entry: - %0 = shufflevector <2 x i64> %arg0_int64x1_t, <2 x i64> undef, <2 x i32> <i32 0, i32 0> - ret <2 x i64> %0 -} - -define arm_apcscc <2 x double> @baz(<2 x double> %arg0_int64x1_t) nounwind readnone { -entry: - %0 = shufflevector <2 x double> %arg0_int64x1_t, <2 x double> undef, <2 x i32> <i32 1, i32 1> - ret <2 x double> %0 -} - -define arm_apcscc <2 x double> @qux(<2 x double> %arg0_int64x1_t) nounwind readnone { -entry: - %0 = shufflevector <2 x double> %arg0_int64x1_t, <2 x double> undef, <2 x i32> <i32 0, i32 0> - ret <2 x double> %0 -} diff --git a/test/CodeGen/ARM/veor.ll b/test/CodeGen/ARM/veor.ll deleted file mode 100644 index febceb4..0000000 --- a/test/CodeGen/ARM/veor.ll +++ /dev/null @@ -1,73 +0,0 @@ -; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s - -define <8 x i8> @v_eori8(<8 x i8>* %A, <8 x i8>* %B) nounwind { -;CHECK: v_eori8: -;CHECK: veor - %tmp1 = load <8 x i8>* %A - %tmp2 = load <8 x i8>* %B - %tmp3 = xor <8 x i8> %tmp1, %tmp2 - ret <8 x i8> %tmp3 -} - -define <4 x i16> @v_eori16(<4 x i16>* %A, <4 x i16>* %B) nounwind { -;CHECK: v_eori16: -;CHECK: veor - %tmp1 = load <4 x i16>* %A - %tmp2 = load <4 x i16>* %B - %tmp3 = xor <4 x i16> %tmp1, %tmp2 - ret <4 x i16> %tmp3 -} - -define <2 x i32> @v_eori32(<2 x i32>* %A, <2 x i32>* %B) nounwind { -;CHECK: v_eori32: -;CHECK: veor - %tmp1 = load <2 x i32>* %A - %tmp2 = load <2 x i32>* %B - %tmp3 = xor <2 x i32> %tmp1, %tmp2 - ret <2 x i32> %tmp3 -} - -define <1 x i64> @v_eori64(<1 x i64>* %A, <1 x i64>* %B) nounwind { -;CHECK: v_eori64: -;CHECK: veor - %tmp1 = load <1 x i64>* %A - %tmp2 = load <1 x i64>* %B - %tmp3 = xor <1 x i64> %tmp1, %tmp2 - ret <1 x i64> %tmp3 -} - -define <16 x i8> @v_eorQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind { -;CHECK: v_eorQi8: -;CHECK: veor - %tmp1 = load <16 x i8>* %A - %tmp2 = load <16 x i8>* %B - %tmp3 = xor <16 x i8> %tmp1, %tmp2 - ret <16 x i8> %tmp3 -} - -define <8 x i16> @v_eorQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind { -;CHECK: v_eorQi16: -;CHECK: veor - %tmp1 = load <8 x i16>* %A - %tmp2 = load <8 x i16>* %B - %tmp3 = xor <8 x i16> %tmp1, %tmp2 - ret <8 x i16> %tmp3 -} - -define <4 x i32> @v_eorQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind { -;CHECK: v_eorQi32: -;CHECK: veor - %tmp1 = load <4 x i32>* %A - %tmp2 = load <4 x i32>* %B - %tmp3 = xor <4 x i32> %tmp1, %tmp2 - ret <4 x i32> %tmp3 -} - -define <2 x i64> @v_eorQi64(<2 x i64>* %A, <2 x i64>* %B) nounwind { -;CHECK: v_eorQi64: -;CHECK: veor - %tmp1 = load <2 x i64>* %A - %tmp2 = load <2 x i64>* %B - %tmp3 = xor <2 x i64> %tmp1, %tmp2 - ret <2 x i64> %tmp3 -} diff --git a/test/CodeGen/ARM/vget_lane.ll b/test/CodeGen/ARM/vget_lane.ll index b4f093c..f0df798 100644 --- a/test/CodeGen/ARM/vget_lane.ll +++ b/test/CodeGen/ARM/vget_lane.ll @@ -1,4 +1,6 @@ -; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s +; RUN: llc < %s -mattr=+neon | FileCheck %s +target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:64:64-v128:128:128-a0:0:32" +target triple = "thumbv7-elf" define i32 @vget_lanes8(<8 x i8>* %A) nounwind { ;CHECK: vget_lanes8: @@ -91,3 +93,120 @@ define i32 @vgetQ_lanei32(<4 x i32>* %A) nounwind { %tmp3 = extractelement <4 x i32> %tmp2, i32 1 ret i32 %tmp3 } + +define arm_aapcs_vfpcc void @test_vget_laneu16() nounwind { +entry: +; CHECK: vmov.u16 r0, d0[1] + %arg0_uint16x4_t = alloca <4 x i16> ; <<4 x i16>*> [#uses=1] + %out_uint16_t = alloca i16 ; <i16*> [#uses=1] + %"alloca point" = bitcast i32 0 to i32 ; <i32> [#uses=0] + %0 = load <4 x i16>* %arg0_uint16x4_t, align 8 ; <<4 x i16>> [#uses=1] + %1 = extractelement <4 x i16> %0, i32 1 ; <i16> [#uses=1] + store i16 %1, i16* %out_uint16_t, align 2 + br label %return + +return: ; preds = %entry + ret void +} + +define arm_aapcs_vfpcc void @test_vget_laneu8() nounwind { +entry: +; CHECK: vmov.u8 r0, d0[1] + %arg0_uint8x8_t = alloca <8 x i8> ; <<8 x i8>*> [#uses=1] + %out_uint8_t = alloca i8 ; <i8*> [#uses=1] + %"alloca point" = bitcast i32 0 to i32 ; <i32> [#uses=0] + %0 = load <8 x i8>* %arg0_uint8x8_t, align 8 ; <<8 x i8>> [#uses=1] + %1 = extractelement <8 x i8> %0, i32 1 ; <i8> [#uses=1] + store i8 %1, i8* %out_uint8_t, align 1 + br label %return + +return: ; preds = %entry + ret void +} + +define arm_aapcs_vfpcc void @test_vgetQ_laneu16() nounwind { +entry: +; CHECK: vmov.u16 r0, d0[1] + %arg0_uint16x8_t = alloca <8 x i16> ; <<8 x i16>*> [#uses=1] + %out_uint16_t = alloca i16 ; <i16*> [#uses=1] + %"alloca point" = bitcast i32 0 to i32 ; <i32> [#uses=0] + %0 = load <8 x i16>* %arg0_uint16x8_t, align 16 ; <<8 x i16>> [#uses=1] + %1 = extractelement <8 x i16> %0, i32 1 ; <i16> [#uses=1] + store i16 %1, i16* %out_uint16_t, align 2 + br label %return + +return: ; preds = %entry + ret void +} + +define arm_aapcs_vfpcc void @test_vgetQ_laneu8() nounwind { +entry: +; CHECK: vmov.u8 r0, d0[1] + %arg0_uint8x16_t = alloca <16 x i8> ; <<16 x i8>*> [#uses=1] + %out_uint8_t = alloca i8 ; <i8*> [#uses=1] + %"alloca point" = bitcast i32 0 to i32 ; <i32> [#uses=0] + %0 = load <16 x i8>* %arg0_uint8x16_t, align 16 ; <<16 x i8>> [#uses=1] + %1 = extractelement <16 x i8> %0, i32 1 ; <i8> [#uses=1] + store i8 %1, i8* %out_uint8_t, align 1 + br label %return + +return: ; preds = %entry + ret void +} + +define <8 x i8> @vset_lane8(<8 x i8>* %A, i8 %B) nounwind { +;CHECK: vset_lane8: +;CHECK: vmov.8 + %tmp1 = load <8 x i8>* %A + %tmp2 = insertelement <8 x i8> %tmp1, i8 %B, i32 1 + ret <8 x i8> %tmp2 +} + +define <4 x i16> @vset_lane16(<4 x i16>* %A, i16 %B) nounwind { +;CHECK: vset_lane16: +;CHECK: vmov.16 + %tmp1 = load <4 x i16>* %A + %tmp2 = insertelement <4 x i16> %tmp1, i16 %B, i32 1 + ret <4 x i16> %tmp2 +} + +define <2 x i32> @vset_lane32(<2 x i32>* %A, i32 %B) nounwind { +;CHECK: vset_lane32: +;CHECK: vmov.32 + %tmp1 = load <2 x i32>* %A + %tmp2 = insertelement <2 x i32> %tmp1, i32 %B, i32 1 + ret <2 x i32> %tmp2 +} + +define <16 x i8> @vsetQ_lane8(<16 x i8>* %A, i8 %B) nounwind { +;CHECK: vsetQ_lane8: +;CHECK: vmov.8 + %tmp1 = load <16 x i8>* %A + %tmp2 = insertelement <16 x i8> %tmp1, i8 %B, i32 1 + ret <16 x i8> %tmp2 +} + +define <8 x i16> @vsetQ_lane16(<8 x i16>* %A, i16 %B) nounwind { +;CHECK: vsetQ_lane16: +;CHECK: vmov.16 + %tmp1 = load <8 x i16>* %A + %tmp2 = insertelement <8 x i16> %tmp1, i16 %B, i32 1 + ret <8 x i16> %tmp2 +} + +define <4 x i32> @vsetQ_lane32(<4 x i32>* %A, i32 %B) nounwind { +;CHECK: vsetQ_lane32: +;CHECK: vmov.32 + %tmp1 = load <4 x i32>* %A + %tmp2 = insertelement <4 x i32> %tmp1, i32 %B, i32 1 + ret <4 x i32> %tmp2 +} + +define arm_aapcs_vfpcc <2 x float> @test_vset_lanef32(float %arg0_float32_t, <2 x float> %arg1_float32x2_t) nounwind { +;CHECK: test_vset_lanef32: +;CHECK: fcpys +;CHECK: fcpys +entry: + %0 = insertelement <2 x float> %arg1_float32x2_t, float %arg0_float32_t, i32 1 ; <<2 x float>> [#uses=1] + ret <2 x float> %0 +} diff --git a/test/CodeGen/ARM/vget_lane2.ll b/test/CodeGen/ARM/vget_lane2.ll deleted file mode 100644 index 1981bf9..0000000 --- a/test/CodeGen/ARM/vget_lane2.ll +++ /dev/null @@ -1,63 +0,0 @@ -; RUN: llc < %s -mattr=+neon | FileCheck %s -target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:64:64-v128:128:128-a0:0:32" -target triple = "thumbv7-elf" - -define arm_aapcs_vfpcc void @test_vget_laneu16() nounwind { -entry: -; CHECK: vmov.u16 r0, d0[1] - %arg0_uint16x4_t = alloca <4 x i16> ; <<4 x i16>*> [#uses=1] - %out_uint16_t = alloca i16 ; <i16*> [#uses=1] - %"alloca point" = bitcast i32 0 to i32 ; <i32> [#uses=0] - %0 = load <4 x i16>* %arg0_uint16x4_t, align 8 ; <<4 x i16>> [#uses=1] - %1 = extractelement <4 x i16> %0, i32 1 ; <i16> [#uses=1] - store i16 %1, i16* %out_uint16_t, align 2 - br label %return - -return: ; preds = %entry - ret void -} - -define arm_aapcs_vfpcc void @test_vget_laneu8() nounwind { -entry: -; CHECK: vmov.u8 r0, d0[1] - %arg0_uint8x8_t = alloca <8 x i8> ; <<8 x i8>*> [#uses=1] - %out_uint8_t = alloca i8 ; <i8*> [#uses=1] - %"alloca point" = bitcast i32 0 to i32 ; <i32> [#uses=0] - %0 = load <8 x i8>* %arg0_uint8x8_t, align 8 ; <<8 x i8>> [#uses=1] - %1 = extractelement <8 x i8> %0, i32 1 ; <i8> [#uses=1] - store i8 %1, i8* %out_uint8_t, align 1 - br label %return - -return: ; preds = %entry - ret void -} - -define arm_aapcs_vfpcc void @test_vgetQ_laneu16() nounwind { -entry: -; CHECK: vmov.u16 r0, d0[1] - %arg0_uint16x8_t = alloca <8 x i16> ; <<8 x i16>*> [#uses=1] - %out_uint16_t = alloca i16 ; <i16*> [#uses=1] - %"alloca point" = bitcast i32 0 to i32 ; <i32> [#uses=0] - %0 = load <8 x i16>* %arg0_uint16x8_t, align 16 ; <<8 x i16>> [#uses=1] - %1 = extractelement <8 x i16> %0, i32 1 ; <i16> [#uses=1] - store i16 %1, i16* %out_uint16_t, align 2 - br label %return - -return: ; preds = %entry - ret void -} - -define arm_aapcs_vfpcc void @test_vgetQ_laneu8() nounwind { -entry: -; CHECK: vmov.u8 r0, d0[1] - %arg0_uint8x16_t = alloca <16 x i8> ; <<16 x i8>*> [#uses=1] - %out_uint8_t = alloca i8 ; <i8*> [#uses=1] - %"alloca point" = bitcast i32 0 to i32 ; <i32> [#uses=0] - %0 = load <16 x i8>* %arg0_uint8x16_t, align 16 ; <<16 x i8>> [#uses=1] - %1 = extractelement <16 x i8> %0, i32 1 ; <i8> [#uses=1] - store i8 %1, i8* %out_uint8_t, align 1 - br label %return - -return: ; preds = %entry - ret void -} diff --git a/test/CodeGen/ARM/vhadd.ll b/test/CodeGen/ARM/vhadd.ll index d767097..379e062 100644 --- a/test/CodeGen/ARM/vhadd.ll +++ b/test/CodeGen/ARM/vhadd.ll @@ -123,3 +123,127 @@ declare <4 x i32> @llvm.arm.neon.vhadds.v4i32(<4 x i32>, <4 x i32>) nounwind rea declare <16 x i8> @llvm.arm.neon.vhaddu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone declare <8 x i16> @llvm.arm.neon.vhaddu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone declare <4 x i32> @llvm.arm.neon.vhaddu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone + +define <8 x i8> @vrhadds8(<8 x i8>* %A, <8 x i8>* %B) nounwind { +;CHECK: vrhadds8: +;CHECK: vrhadd.s8 + %tmp1 = load <8 x i8>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = call <8 x i8> @llvm.arm.neon.vrhadds.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) + ret <8 x i8> %tmp3 +} + +define <4 x i16> @vrhadds16(<4 x i16>* %A, <4 x i16>* %B) nounwind { +;CHECK: vrhadds16: +;CHECK: vrhadd.s16 + %tmp1 = load <4 x i16>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = call <4 x i16> @llvm.arm.neon.vrhadds.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) + ret <4 x i16> %tmp3 +} + +define <2 x i32> @vrhadds32(<2 x i32>* %A, <2 x i32>* %B) nounwind { +;CHECK: vrhadds32: +;CHECK: vrhadd.s32 + %tmp1 = load <2 x i32>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = call <2 x i32> @llvm.arm.neon.vrhadds.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) + ret <2 x i32> %tmp3 +} + +define <8 x i8> @vrhaddu8(<8 x i8>* %A, <8 x i8>* %B) nounwind { +;CHECK: vrhaddu8: +;CHECK: vrhadd.u8 + %tmp1 = load <8 x i8>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = call <8 x i8> @llvm.arm.neon.vrhaddu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) + ret <8 x i8> %tmp3 +} + +define <4 x i16> @vrhaddu16(<4 x i16>* %A, <4 x i16>* %B) nounwind { +;CHECK: vrhaddu16: +;CHECK: vrhadd.u16 + %tmp1 = load <4 x i16>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = call <4 x i16> @llvm.arm.neon.vrhaddu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) + ret <4 x i16> %tmp3 +} + +define <2 x i32> @vrhaddu32(<2 x i32>* %A, <2 x i32>* %B) nounwind { +;CHECK: vrhaddu32: +;CHECK: vrhadd.u32 + %tmp1 = load <2 x i32>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = call <2 x i32> @llvm.arm.neon.vrhaddu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) + ret <2 x i32> %tmp3 +} + +define <16 x i8> @vrhaddQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind { +;CHECK: vrhaddQs8: +;CHECK: vrhadd.s8 + %tmp1 = load <16 x i8>* %A + %tmp2 = load <16 x i8>* %B + %tmp3 = call <16 x i8> @llvm.arm.neon.vrhadds.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) + ret <16 x i8> %tmp3 +} + +define <8 x i16> @vrhaddQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind { +;CHECK: vrhaddQs16: +;CHECK: vrhadd.s16 + %tmp1 = load <8 x i16>* %A + %tmp2 = load <8 x i16>* %B + %tmp3 = call <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) + ret <8 x i16> %tmp3 +} + +define <4 x i32> @vrhaddQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind { +;CHECK: vrhaddQs32: +;CHECK: vrhadd.s32 + %tmp1 = load <4 x i32>* %A + %tmp2 = load <4 x i32>* %B + %tmp3 = call <4 x i32> @llvm.arm.neon.vrhadds.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) + ret <4 x i32> %tmp3 +} + +define <16 x i8> @vrhaddQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind { +;CHECK: vrhaddQu8: +;CHECK: vrhadd.u8 + %tmp1 = load <16 x i8>* %A + %tmp2 = load <16 x i8>* %B + %tmp3 = call <16 x i8> @llvm.arm.neon.vrhaddu.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) + ret <16 x i8> %tmp3 +} + +define <8 x i16> @vrhaddQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind { +;CHECK: vrhaddQu16: +;CHECK: vrhadd.u16 + %tmp1 = load <8 x i16>* %A + %tmp2 = load <8 x i16>* %B + %tmp3 = call <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) + ret <8 x i16> %tmp3 +} + +define <4 x i32> @vrhaddQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind { +;CHECK: vrhaddQu32: +;CHECK: vrhadd.u32 + %tmp1 = load <4 x i32>* %A + %tmp2 = load <4 x i32>* %B + %tmp3 = call <4 x i32> @llvm.arm.neon.vrhaddu.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) + ret <4 x i32> %tmp3 +} + +declare <8 x i8> @llvm.arm.neon.vrhadds.v8i8(<8 x i8>, <8 x i8>) nounwind readnone +declare <4 x i16> @llvm.arm.neon.vrhadds.v4i16(<4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vrhadds.v2i32(<2 x i32>, <2 x i32>) nounwind readnone + +declare <8 x i8> @llvm.arm.neon.vrhaddu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone +declare <4 x i16> @llvm.arm.neon.vrhaddu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vrhaddu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone + +declare <16 x i8> @llvm.arm.neon.vrhadds.v16i8(<16 x i8>, <16 x i8>) nounwind readnone +declare <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16>, <8 x i16>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vrhadds.v4i32(<4 x i32>, <4 x i32>) nounwind readnone + +declare <16 x i8> @llvm.arm.neon.vrhaddu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone +declare <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vrhaddu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone diff --git a/test/CodeGen/ARM/vmin.ll b/test/CodeGen/ARM/vmin.ll deleted file mode 100644 index 65ed82b..0000000 --- a/test/CodeGen/ARM/vmin.ll +++ /dev/null @@ -1,147 +0,0 @@ -; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s - -define <8 x i8> @vmins8(<8 x i8>* %A, <8 x i8>* %B) nounwind { -;CHECK: vmins8: -;CHECK: vmin.s8 - %tmp1 = load <8 x i8>* %A - %tmp2 = load <8 x i8>* %B - %tmp3 = call <8 x i8> @llvm.arm.neon.vmins.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) - ret <8 x i8> %tmp3 -} - -define <4 x i16> @vmins16(<4 x i16>* %A, <4 x i16>* %B) nounwind { -;CHECK: vmins16: -;CHECK: vmin.s16 - %tmp1 = load <4 x i16>* %A - %tmp2 = load <4 x i16>* %B - %tmp3 = call <4 x i16> @llvm.arm.neon.vmins.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) - ret <4 x i16> %tmp3 -} - -define <2 x i32> @vmins32(<2 x i32>* %A, <2 x i32>* %B) nounwind { -;CHECK: vmins32: -;CHECK: vmin.s32 - %tmp1 = load <2 x i32>* %A - %tmp2 = load <2 x i32>* %B - %tmp3 = call <2 x i32> @llvm.arm.neon.vmins.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) - ret <2 x i32> %tmp3 -} - -define <8 x i8> @vminu8(<8 x i8>* %A, <8 x i8>* %B) nounwind { -;CHECK: vminu8: -;CHECK: vmin.u8 - %tmp1 = load <8 x i8>* %A - %tmp2 = load <8 x i8>* %B - %tmp3 = call <8 x i8> @llvm.arm.neon.vminu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) - ret <8 x i8> %tmp3 -} - -define <4 x i16> @vminu16(<4 x i16>* %A, <4 x i16>* %B) nounwind { -;CHECK: vminu16: -;CHECK: vmin.u16 - %tmp1 = load <4 x i16>* %A - %tmp2 = load <4 x i16>* %B - %tmp3 = call <4 x i16> @llvm.arm.neon.vminu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) - ret <4 x i16> %tmp3 -} - -define <2 x i32> @vminu32(<2 x i32>* %A, <2 x i32>* %B) nounwind { -;CHECK: vminu32: -;CHECK: vmin.u32 - %tmp1 = load <2 x i32>* %A - %tmp2 = load <2 x i32>* %B - %tmp3 = call <2 x i32> @llvm.arm.neon.vminu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) - ret <2 x i32> %tmp3 -} - -define <2 x float> @vminf32(<2 x float>* %A, <2 x float>* %B) nounwind { -;CHECK: vminf32: -;CHECK: vmin.f32 - %tmp1 = load <2 x float>* %A - %tmp2 = load <2 x float>* %B - %tmp3 = call <2 x float> @llvm.arm.neon.vmins.v2f32(<2 x float> %tmp1, <2 x float> %tmp2) - ret <2 x float> %tmp3 -} - -define <16 x i8> @vminQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind { -;CHECK: vminQs8: -;CHECK: vmin.s8 - %tmp1 = load <16 x i8>* %A - %tmp2 = load <16 x i8>* %B - %tmp3 = call <16 x i8> @llvm.arm.neon.vmins.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) - ret <16 x i8> %tmp3 -} - -define <8 x i16> @vminQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind { -;CHECK: vminQs16: -;CHECK: vmin.s16 - %tmp1 = load <8 x i16>* %A - %tmp2 = load <8 x i16>* %B - %tmp3 = call <8 x i16> @llvm.arm.neon.vmins.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) - ret <8 x i16> %tmp3 -} - -define <4 x i32> @vminQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind { -;CHECK: vminQs32: -;CHECK: vmin.s32 - %tmp1 = load <4 x i32>* %A - %tmp2 = load <4 x i32>* %B - %tmp3 = call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) - ret <4 x i32> %tmp3 -} - -define <16 x i8> @vminQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind { -;CHECK: vminQu8: -;CHECK: vmin.u8 - %tmp1 = load <16 x i8>* %A - %tmp2 = load <16 x i8>* %B - %tmp3 = call <16 x i8> @llvm.arm.neon.vminu.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) - ret <16 x i8> %tmp3 -} - -define <8 x i16> @vminQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind { -;CHECK: vminQu16: -;CHECK: vmin.u16 - %tmp1 = load <8 x i16>* %A - %tmp2 = load <8 x i16>* %B - %tmp3 = call <8 x i16> @llvm.arm.neon.vminu.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) - ret <8 x i16> %tmp3 -} - -define <4 x i32> @vminQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind { -;CHECK: vminQu32: -;CHECK: vmin.u32 - %tmp1 = load <4 x i32>* %A - %tmp2 = load <4 x i32>* %B - %tmp3 = call <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) - ret <4 x i32> %tmp3 -} - -define <4 x float> @vminQf32(<4 x float>* %A, <4 x float>* %B) nounwind { -;CHECK: vminQf32: -;CHECK: vmin.f32 - %tmp1 = load <4 x float>* %A - %tmp2 = load <4 x float>* %B - %tmp3 = call <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float> %tmp1, <4 x float> %tmp2) - ret <4 x float> %tmp3 -} - -declare <8 x i8> @llvm.arm.neon.vmins.v8i8(<8 x i8>, <8 x i8>) nounwind readnone -declare <4 x i16> @llvm.arm.neon.vmins.v4i16(<4 x i16>, <4 x i16>) nounwind readnone -declare <2 x i32> @llvm.arm.neon.vmins.v2i32(<2 x i32>, <2 x i32>) nounwind readnone - -declare <8 x i8> @llvm.arm.neon.vminu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone -declare <4 x i16> @llvm.arm.neon.vminu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone -declare <2 x i32> @llvm.arm.neon.vminu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone - -declare <2 x float> @llvm.arm.neon.vmins.v2f32(<2 x float>, <2 x float>) nounwind readnone - -declare <16 x i8> @llvm.arm.neon.vmins.v16i8(<16 x i8>, <16 x i8>) nounwind readnone -declare <8 x i16> @llvm.arm.neon.vmins.v8i16(<8 x i16>, <8 x i16>) nounwind readnone -declare <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32>, <4 x i32>) nounwind readnone - -declare <16 x i8> @llvm.arm.neon.vminu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone -declare <8 x i16> @llvm.arm.neon.vminu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone -declare <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone - -declare <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float>, <4 x float>) nounwind readnone diff --git a/test/CodeGen/ARM/vmax.ll b/test/CodeGen/ARM/vminmax.ll index 007ac56..e3527c1 100644 --- a/test/CodeGen/ARM/vmax.ll +++ b/test/CodeGen/ARM/vminmax.ll @@ -1,5 +1,151 @@ ; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s +define <8 x i8> @vmins8(<8 x i8>* %A, <8 x i8>* %B) nounwind { +;CHECK: vmins8: +;CHECK: vmin.s8 + %tmp1 = load <8 x i8>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = call <8 x i8> @llvm.arm.neon.vmins.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) + ret <8 x i8> %tmp3 +} + +define <4 x i16> @vmins16(<4 x i16>* %A, <4 x i16>* %B) nounwind { +;CHECK: vmins16: +;CHECK: vmin.s16 + %tmp1 = load <4 x i16>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = call <4 x i16> @llvm.arm.neon.vmins.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) + ret <4 x i16> %tmp3 +} + +define <2 x i32> @vmins32(<2 x i32>* %A, <2 x i32>* %B) nounwind { +;CHECK: vmins32: +;CHECK: vmin.s32 + %tmp1 = load <2 x i32>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = call <2 x i32> @llvm.arm.neon.vmins.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) + ret <2 x i32> %tmp3 +} + +define <8 x i8> @vminu8(<8 x i8>* %A, <8 x i8>* %B) nounwind { +;CHECK: vminu8: +;CHECK: vmin.u8 + %tmp1 = load <8 x i8>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = call <8 x i8> @llvm.arm.neon.vminu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) + ret <8 x i8> %tmp3 +} + +define <4 x i16> @vminu16(<4 x i16>* %A, <4 x i16>* %B) nounwind { +;CHECK: vminu16: +;CHECK: vmin.u16 + %tmp1 = load <4 x i16>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = call <4 x i16> @llvm.arm.neon.vminu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) + ret <4 x i16> %tmp3 +} + +define <2 x i32> @vminu32(<2 x i32>* %A, <2 x i32>* %B) nounwind { +;CHECK: vminu32: +;CHECK: vmin.u32 + %tmp1 = load <2 x i32>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = call <2 x i32> @llvm.arm.neon.vminu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) + ret <2 x i32> %tmp3 +} + +define <2 x float> @vminf32(<2 x float>* %A, <2 x float>* %B) nounwind { +;CHECK: vminf32: +;CHECK: vmin.f32 + %tmp1 = load <2 x float>* %A + %tmp2 = load <2 x float>* %B + %tmp3 = call <2 x float> @llvm.arm.neon.vmins.v2f32(<2 x float> %tmp1, <2 x float> %tmp2) + ret <2 x float> %tmp3 +} + +define <16 x i8> @vminQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind { +;CHECK: vminQs8: +;CHECK: vmin.s8 + %tmp1 = load <16 x i8>* %A + %tmp2 = load <16 x i8>* %B + %tmp3 = call <16 x i8> @llvm.arm.neon.vmins.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) + ret <16 x i8> %tmp3 +} + +define <8 x i16> @vminQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind { +;CHECK: vminQs16: +;CHECK: vmin.s16 + %tmp1 = load <8 x i16>* %A + %tmp2 = load <8 x i16>* %B + %tmp3 = call <8 x i16> @llvm.arm.neon.vmins.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) + ret <8 x i16> %tmp3 +} + +define <4 x i32> @vminQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind { +;CHECK: vminQs32: +;CHECK: vmin.s32 + %tmp1 = load <4 x i32>* %A + %tmp2 = load <4 x i32>* %B + %tmp3 = call <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) + ret <4 x i32> %tmp3 +} + +define <16 x i8> @vminQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind { +;CHECK: vminQu8: +;CHECK: vmin.u8 + %tmp1 = load <16 x i8>* %A + %tmp2 = load <16 x i8>* %B + %tmp3 = call <16 x i8> @llvm.arm.neon.vminu.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) + ret <16 x i8> %tmp3 +} + +define <8 x i16> @vminQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind { +;CHECK: vminQu16: +;CHECK: vmin.u16 + %tmp1 = load <8 x i16>* %A + %tmp2 = load <8 x i16>* %B + %tmp3 = call <8 x i16> @llvm.arm.neon.vminu.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) + ret <8 x i16> %tmp3 +} + +define <4 x i32> @vminQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind { +;CHECK: vminQu32: +;CHECK: vmin.u32 + %tmp1 = load <4 x i32>* %A + %tmp2 = load <4 x i32>* %B + %tmp3 = call <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) + ret <4 x i32> %tmp3 +} + +define <4 x float> @vminQf32(<4 x float>* %A, <4 x float>* %B) nounwind { +;CHECK: vminQf32: +;CHECK: vmin.f32 + %tmp1 = load <4 x float>* %A + %tmp2 = load <4 x float>* %B + %tmp3 = call <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float> %tmp1, <4 x float> %tmp2) + ret <4 x float> %tmp3 +} + +declare <8 x i8> @llvm.arm.neon.vmins.v8i8(<8 x i8>, <8 x i8>) nounwind readnone +declare <4 x i16> @llvm.arm.neon.vmins.v4i16(<4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vmins.v2i32(<2 x i32>, <2 x i32>) nounwind readnone + +declare <8 x i8> @llvm.arm.neon.vminu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone +declare <4 x i16> @llvm.arm.neon.vminu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vminu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone + +declare <2 x float> @llvm.arm.neon.vmins.v2f32(<2 x float>, <2 x float>) nounwind readnone + +declare <16 x i8> @llvm.arm.neon.vmins.v16i8(<16 x i8>, <16 x i8>) nounwind readnone +declare <8 x i16> @llvm.arm.neon.vmins.v8i16(<8 x i16>, <8 x i16>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vmins.v4i32(<4 x i32>, <4 x i32>) nounwind readnone + +declare <16 x i8> @llvm.arm.neon.vminu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone +declare <8 x i16> @llvm.arm.neon.vminu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vminu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone + +declare <4 x float> @llvm.arm.neon.vmins.v4f32(<4 x float>, <4 x float>) nounwind readnone + define <8 x i8> @vmaxs8(<8 x i8>* %A, <8 x i8>* %B) nounwind { ;CHECK: vmaxs8: ;CHECK: vmax.s8 diff --git a/test/CodeGen/ARM/vmla.ll b/test/CodeGen/ARM/vmla.ll index 58ac6b0..8405218 100644 --- a/test/CodeGen/ARM/vmla.ll +++ b/test/CodeGen/ARM/vmla.ll @@ -87,3 +87,107 @@ define <4 x float> @vmlaQf32(<4 x float>* %A, <4 x float>* %B, <4 x float>* %C) %tmp5 = add <4 x float> %tmp1, %tmp4 ret <4 x float> %tmp5 } + +define <8 x i16> @vmlals8(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind { +;CHECK: vmlals8: +;CHECK: vmlal.s8 + %tmp1 = load <8 x i16>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = load <8 x i8>* %C + %tmp4 = call <8 x i16> @llvm.arm.neon.vmlals.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2, <8 x i8> %tmp3) + ret <8 x i16> %tmp4 +} + +define <4 x i32> @vmlals16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind { +;CHECK: vmlals16: +;CHECK: vmlal.s16 + %tmp1 = load <4 x i32>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = load <4 x i16>* %C + %tmp4 = call <4 x i32> @llvm.arm.neon.vmlals.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3) + ret <4 x i32> %tmp4 +} + +define <2 x i64> @vmlals32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind { +;CHECK: vmlals32: +;CHECK: vmlal.s32 + %tmp1 = load <2 x i64>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = load <2 x i32>* %C + %tmp4 = call <2 x i64> @llvm.arm.neon.vmlals.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3) + ret <2 x i64> %tmp4 +} + +define <8 x i16> @vmlalu8(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind { +;CHECK: vmlalu8: +;CHECK: vmlal.u8 + %tmp1 = load <8 x i16>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = load <8 x i8>* %C + %tmp4 = call <8 x i16> @llvm.arm.neon.vmlalu.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2, <8 x i8> %tmp3) + ret <8 x i16> %tmp4 +} + +define <4 x i32> @vmlalu16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind { +;CHECK: vmlalu16: +;CHECK: vmlal.u16 + %tmp1 = load <4 x i32>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = load <4 x i16>* %C + %tmp4 = call <4 x i32> @llvm.arm.neon.vmlalu.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3) + ret <4 x i32> %tmp4 +} + +define <2 x i64> @vmlalu32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind { +;CHECK: vmlalu32: +;CHECK: vmlal.u32 + %tmp1 = load <2 x i64>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = load <2 x i32>* %C + %tmp4 = call <2 x i64> @llvm.arm.neon.vmlalu.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3) + ret <2 x i64> %tmp4 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vmlal_lanes16(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %arg2_int16x4_t) nounwind readnone { +entry: +; CHECK: test_vmlal_lanes16 +; CHECK: vmlal.s16 q0, d2, d3[1] + %0 = shufflevector <4 x i16> %arg2_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1] + %1 = tail call <4 x i32> @llvm.arm.neon.vmlals.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %0) ; <<4 x i32>> [#uses=1] + ret <4 x i32> %1 +} + +define arm_aapcs_vfpcc <2 x i64> @test_vmlal_lanes32(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %arg2_int32x2_t) nounwind readnone { +entry: +; CHECK: test_vmlal_lanes32 +; CHECK: vmlal.s32 q0, d2, d3[1] + %0 = shufflevector <2 x i32> %arg2_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1] + %1 = tail call <2 x i64> @llvm.arm.neon.vmlals.v2i64(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %0) ; <<2 x i64>> [#uses=1] + ret <2 x i64> %1 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vmlal_laneu16(<4 x i32> %arg0_uint32x4_t, <4 x i16> %arg1_uint16x4_t, <4 x i16> %arg2_uint16x4_t) nounwind readnone { +entry: +; CHECK: test_vmlal_laneu16 +; CHECK: vmlal.u16 q0, d2, d3[1] + %0 = shufflevector <4 x i16> %arg2_uint16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1] + %1 = tail call <4 x i32> @llvm.arm.neon.vmlalu.v4i32(<4 x i32> %arg0_uint32x4_t, <4 x i16> %arg1_uint16x4_t, <4 x i16> %0) ; <<4 x i32>> [#uses=1] + ret <4 x i32> %1 +} + +define arm_aapcs_vfpcc <2 x i64> @test_vmlal_laneu32(<2 x i64> %arg0_uint64x2_t, <2 x i32> %arg1_uint32x2_t, <2 x i32> %arg2_uint32x2_t) nounwind readnone { +entry: +; CHECK: test_vmlal_laneu32 +; CHECK: vmlal.u32 q0, d2, d3[1] + %0 = shufflevector <2 x i32> %arg2_uint32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1] + %1 = tail call <2 x i64> @llvm.arm.neon.vmlalu.v2i64(<2 x i64> %arg0_uint64x2_t, <2 x i32> %arg1_uint32x2_t, <2 x i32> %0) ; <<2 x i64>> [#uses=1] + ret <2 x i64> %1 +} + +declare <8 x i16> @llvm.arm.neon.vmlals.v8i16(<8 x i16>, <8 x i8>, <8 x i8>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vmlals.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i64> @llvm.arm.neon.vmlals.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone + +declare <8 x i16> @llvm.arm.neon.vmlalu.v8i16(<8 x i16>, <8 x i8>, <8 x i8>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vmlalu.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i64> @llvm.arm.neon.vmlalu.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone diff --git a/test/CodeGen/ARM/vmlal.ll b/test/CodeGen/ARM/vmlal.ll deleted file mode 100644 index 6e18971..0000000 --- a/test/CodeGen/ARM/vmlal.ll +++ /dev/null @@ -1,69 +0,0 @@ -; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s - -define <8 x i16> @vmlals8(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind { -;CHECK: vmlals8: -;CHECK: vmlal.s8 - %tmp1 = load <8 x i16>* %A - %tmp2 = load <8 x i8>* %B - %tmp3 = load <8 x i8>* %C - %tmp4 = call <8 x i16> @llvm.arm.neon.vmlals.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2, <8 x i8> %tmp3) - ret <8 x i16> %tmp4 -} - -define <4 x i32> @vmlals16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind { -;CHECK: vmlals16: -;CHECK: vmlal.s16 - %tmp1 = load <4 x i32>* %A - %tmp2 = load <4 x i16>* %B - %tmp3 = load <4 x i16>* %C - %tmp4 = call <4 x i32> @llvm.arm.neon.vmlals.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3) - ret <4 x i32> %tmp4 -} - -define <2 x i64> @vmlals32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind { -;CHECK: vmlals32: -;CHECK: vmlal.s32 - %tmp1 = load <2 x i64>* %A - %tmp2 = load <2 x i32>* %B - %tmp3 = load <2 x i32>* %C - %tmp4 = call <2 x i64> @llvm.arm.neon.vmlals.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3) - ret <2 x i64> %tmp4 -} - -define <8 x i16> @vmlalu8(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind { -;CHECK: vmlalu8: -;CHECK: vmlal.u8 - %tmp1 = load <8 x i16>* %A - %tmp2 = load <8 x i8>* %B - %tmp3 = load <8 x i8>* %C - %tmp4 = call <8 x i16> @llvm.arm.neon.vmlalu.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2, <8 x i8> %tmp3) - ret <8 x i16> %tmp4 -} - -define <4 x i32> @vmlalu16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind { -;CHECK: vmlalu16: -;CHECK: vmlal.u16 - %tmp1 = load <4 x i32>* %A - %tmp2 = load <4 x i16>* %B - %tmp3 = load <4 x i16>* %C - %tmp4 = call <4 x i32> @llvm.arm.neon.vmlalu.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3) - ret <4 x i32> %tmp4 -} - -define <2 x i64> @vmlalu32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind { -;CHECK: vmlalu32: -;CHECK: vmlal.u32 - %tmp1 = load <2 x i64>* %A - %tmp2 = load <2 x i32>* %B - %tmp3 = load <2 x i32>* %C - %tmp4 = call <2 x i64> @llvm.arm.neon.vmlalu.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3) - ret <2 x i64> %tmp4 -} - -declare <8 x i16> @llvm.arm.neon.vmlals.v8i16(<8 x i16>, <8 x i8>, <8 x i8>) nounwind readnone -declare <4 x i32> @llvm.arm.neon.vmlals.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone -declare <2 x i64> @llvm.arm.neon.vmlals.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone - -declare <8 x i16> @llvm.arm.neon.vmlalu.v8i16(<8 x i16>, <8 x i8>, <8 x i8>) nounwind readnone -declare <4 x i32> @llvm.arm.neon.vmlalu.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone -declare <2 x i64> @llvm.arm.neon.vmlalu.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone diff --git a/test/CodeGen/ARM/vmlal_lane.ll b/test/CodeGen/ARM/vmlal_lane.ll deleted file mode 100644 index 5bb0621..0000000 --- a/test/CodeGen/ARM/vmlal_lane.ll +++ /dev/null @@ -1,47 +0,0 @@ -; RUN: llc -mattr=+neon < %s | FileCheck %s -target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:64:64-v128:128:128-a0:0:32" -target triple = "thumbv7-elf" - -define arm_aapcs_vfpcc <4 x i32> @test_vmlal_lanes16(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %arg2_int16x4_t) nounwind readnone { -entry: -; CHECK: test_vmlal_lanes16 -; CHECK: vmlal.s16 q0, d2, d3[1] - %0 = shufflevector <4 x i16> %arg2_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1] - %1 = tail call <4 x i32> @llvm.arm.neon.vmlals.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %0) ; <<4 x i32>> [#uses=1] - ret <4 x i32> %1 -} - -declare <4 x i32> @llvm.arm.neon.vmlals.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone - -define arm_aapcs_vfpcc <2 x i64> @test_vmlal_lanes32(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %arg2_int32x2_t) nounwind readnone { -entry: -; CHECK: test_vmlal_lanes32 -; CHECK: vmlal.s32 q0, d2, d3[1] - %0 = shufflevector <2 x i32> %arg2_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1] - %1 = tail call <2 x i64> @llvm.arm.neon.vmlals.v2i64(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %0) ; <<2 x i64>> [#uses=1] - ret <2 x i64> %1 -} - -declare <2 x i64> @llvm.arm.neon.vmlals.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone - -define arm_aapcs_vfpcc <4 x i32> @test_vmlal_laneu16(<4 x i32> %arg0_uint32x4_t, <4 x i16> %arg1_uint16x4_t, <4 x i16> %arg2_uint16x4_t) nounwind readnone { -entry: -; CHECK: test_vmlal_laneu16 -; CHECK: vmlal.u16 q0, d2, d3[1] - %0 = shufflevector <4 x i16> %arg2_uint16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1] - %1 = tail call <4 x i32> @llvm.arm.neon.vmlalu.v4i32(<4 x i32> %arg0_uint32x4_t, <4 x i16> %arg1_uint16x4_t, <4 x i16> %0) ; <<4 x i32>> [#uses=1] - ret <4 x i32> %1 -} - -declare <4 x i32> @llvm.arm.neon.vmlalu.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone - -define arm_aapcs_vfpcc <2 x i64> @test_vmlal_laneu32(<2 x i64> %arg0_uint64x2_t, <2 x i32> %arg1_uint32x2_t, <2 x i32> %arg2_uint32x2_t) nounwind readnone { -entry: -; CHECK: test_vmlal_laneu32 -; CHECK: vmlal.u32 q0, d2, d3[1] - %0 = shufflevector <2 x i32> %arg2_uint32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1] - %1 = tail call <2 x i64> @llvm.arm.neon.vmlalu.v2i64(<2 x i64> %arg0_uint64x2_t, <2 x i32> %arg1_uint32x2_t, <2 x i32> %0) ; <<2 x i64>> [#uses=1] - ret <2 x i64> %1 -} - -declare <2 x i64> @llvm.arm.neon.vmlalu.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone diff --git a/test/CodeGen/ARM/vmls.ll b/test/CodeGen/ARM/vmls.ll index 0dca6e3..c89552e 100644 --- a/test/CodeGen/ARM/vmls.ll +++ b/test/CodeGen/ARM/vmls.ll @@ -87,3 +87,107 @@ define <4 x float> @vmlsQf32(<4 x float>* %A, <4 x float>* %B, <4 x float>* %C) %tmp5 = sub <4 x float> %tmp1, %tmp4 ret <4 x float> %tmp5 } + +define <8 x i16> @vmlsls8(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind { +;CHECK: vmlsls8: +;CHECK: vmlsl.s8 + %tmp1 = load <8 x i16>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = load <8 x i8>* %C + %tmp4 = call <8 x i16> @llvm.arm.neon.vmlsls.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2, <8 x i8> %tmp3) + ret <8 x i16> %tmp4 +} + +define <4 x i32> @vmlsls16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind { +;CHECK: vmlsls16: +;CHECK: vmlsl.s16 + %tmp1 = load <4 x i32>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = load <4 x i16>* %C + %tmp4 = call <4 x i32> @llvm.arm.neon.vmlsls.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3) + ret <4 x i32> %tmp4 +} + +define <2 x i64> @vmlsls32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind { +;CHECK: vmlsls32: +;CHECK: vmlsl.s32 + %tmp1 = load <2 x i64>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = load <2 x i32>* %C + %tmp4 = call <2 x i64> @llvm.arm.neon.vmlsls.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3) + ret <2 x i64> %tmp4 +} + +define <8 x i16> @vmlslu8(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind { +;CHECK: vmlslu8: +;CHECK: vmlsl.u8 + %tmp1 = load <8 x i16>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = load <8 x i8>* %C + %tmp4 = call <8 x i16> @llvm.arm.neon.vmlslu.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2, <8 x i8> %tmp3) + ret <8 x i16> %tmp4 +} + +define <4 x i32> @vmlslu16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind { +;CHECK: vmlslu16: +;CHECK: vmlsl.u16 + %tmp1 = load <4 x i32>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = load <4 x i16>* %C + %tmp4 = call <4 x i32> @llvm.arm.neon.vmlslu.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3) + ret <4 x i32> %tmp4 +} + +define <2 x i64> @vmlslu32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind { +;CHECK: vmlslu32: +;CHECK: vmlsl.u32 + %tmp1 = load <2 x i64>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = load <2 x i32>* %C + %tmp4 = call <2 x i64> @llvm.arm.neon.vmlslu.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3) + ret <2 x i64> %tmp4 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vmlsl_lanes16(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %arg2_int16x4_t) nounwind readnone { +entry: +; CHECK: test_vmlsl_lanes16 +; CHECK: vmlsl.s16 q0, d2, d3[1] + %0 = shufflevector <4 x i16> %arg2_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1] + %1 = tail call <4 x i32> @llvm.arm.neon.vmlsls.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %0) ; <<4 x i32>> [#uses=1] + ret <4 x i32> %1 +} + +define arm_aapcs_vfpcc <2 x i64> @test_vmlsl_lanes32(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %arg2_int32x2_t) nounwind readnone { +entry: +; CHECK: test_vmlsl_lanes32 +; CHECK: vmlsl.s32 q0, d2, d3[1] + %0 = shufflevector <2 x i32> %arg2_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1] + %1 = tail call <2 x i64> @llvm.arm.neon.vmlsls.v2i64(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %0) ; <<2 x i64>> [#uses=1] + ret <2 x i64> %1 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vmlsl_laneu16(<4 x i32> %arg0_uint32x4_t, <4 x i16> %arg1_uint16x4_t, <4 x i16> %arg2_uint16x4_t) nounwind readnone { +entry: +; CHECK: test_vmlsl_laneu16 +; CHECK: vmlsl.u16 q0, d2, d3[1] + %0 = shufflevector <4 x i16> %arg2_uint16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1] + %1 = tail call <4 x i32> @llvm.arm.neon.vmlslu.v4i32(<4 x i32> %arg0_uint32x4_t, <4 x i16> %arg1_uint16x4_t, <4 x i16> %0) ; <<4 x i32>> [#uses=1] + ret <4 x i32> %1 +} + +define arm_aapcs_vfpcc <2 x i64> @test_vmlsl_laneu32(<2 x i64> %arg0_uint64x2_t, <2 x i32> %arg1_uint32x2_t, <2 x i32> %arg2_uint32x2_t) nounwind readnone { +entry: +; CHECK: test_vmlsl_laneu32 +; CHECK: vmlsl.u32 q0, d2, d3[1] + %0 = shufflevector <2 x i32> %arg2_uint32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1] + %1 = tail call <2 x i64> @llvm.arm.neon.vmlslu.v2i64(<2 x i64> %arg0_uint64x2_t, <2 x i32> %arg1_uint32x2_t, <2 x i32> %0) ; <<2 x i64>> [#uses=1] + ret <2 x i64> %1 +} + +declare <8 x i16> @llvm.arm.neon.vmlsls.v8i16(<8 x i16>, <8 x i8>, <8 x i8>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vmlsls.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i64> @llvm.arm.neon.vmlsls.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone + +declare <8 x i16> @llvm.arm.neon.vmlslu.v8i16(<8 x i16>, <8 x i8>, <8 x i8>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vmlslu.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i64> @llvm.arm.neon.vmlslu.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone diff --git a/test/CodeGen/ARM/vmlsl.ll b/test/CodeGen/ARM/vmlsl.ll deleted file mode 100644 index a2be79c..0000000 --- a/test/CodeGen/ARM/vmlsl.ll +++ /dev/null @@ -1,69 +0,0 @@ -; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s - -define <8 x i16> @vmlsls8(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind { -;CHECK: vmlsls8: -;CHECK: vmlsl.s8 - %tmp1 = load <8 x i16>* %A - %tmp2 = load <8 x i8>* %B - %tmp3 = load <8 x i8>* %C - %tmp4 = call <8 x i16> @llvm.arm.neon.vmlsls.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2, <8 x i8> %tmp3) - ret <8 x i16> %tmp4 -} - -define <4 x i32> @vmlsls16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind { -;CHECK: vmlsls16: -;CHECK: vmlsl.s16 - %tmp1 = load <4 x i32>* %A - %tmp2 = load <4 x i16>* %B - %tmp3 = load <4 x i16>* %C - %tmp4 = call <4 x i32> @llvm.arm.neon.vmlsls.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3) - ret <4 x i32> %tmp4 -} - -define <2 x i64> @vmlsls32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind { -;CHECK: vmlsls32: -;CHECK: vmlsl.s32 - %tmp1 = load <2 x i64>* %A - %tmp2 = load <2 x i32>* %B - %tmp3 = load <2 x i32>* %C - %tmp4 = call <2 x i64> @llvm.arm.neon.vmlsls.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3) - ret <2 x i64> %tmp4 -} - -define <8 x i16> @vmlslu8(<8 x i16>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind { -;CHECK: vmlslu8: -;CHECK: vmlsl.u8 - %tmp1 = load <8 x i16>* %A - %tmp2 = load <8 x i8>* %B - %tmp3 = load <8 x i8>* %C - %tmp4 = call <8 x i16> @llvm.arm.neon.vmlslu.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2, <8 x i8> %tmp3) - ret <8 x i16> %tmp4 -} - -define <4 x i32> @vmlslu16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind { -;CHECK: vmlslu16: -;CHECK: vmlsl.u16 - %tmp1 = load <4 x i32>* %A - %tmp2 = load <4 x i16>* %B - %tmp3 = load <4 x i16>* %C - %tmp4 = call <4 x i32> @llvm.arm.neon.vmlslu.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3) - ret <4 x i32> %tmp4 -} - -define <2 x i64> @vmlslu32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind { -;CHECK: vmlslu32: -;CHECK: vmlsl.u32 - %tmp1 = load <2 x i64>* %A - %tmp2 = load <2 x i32>* %B - %tmp3 = load <2 x i32>* %C - %tmp4 = call <2 x i64> @llvm.arm.neon.vmlslu.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3) - ret <2 x i64> %tmp4 -} - -declare <8 x i16> @llvm.arm.neon.vmlsls.v8i16(<8 x i16>, <8 x i8>, <8 x i8>) nounwind readnone -declare <4 x i32> @llvm.arm.neon.vmlsls.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone -declare <2 x i64> @llvm.arm.neon.vmlsls.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone - -declare <8 x i16> @llvm.arm.neon.vmlslu.v8i16(<8 x i16>, <8 x i8>, <8 x i8>) nounwind readnone -declare <4 x i32> @llvm.arm.neon.vmlslu.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone -declare <2 x i64> @llvm.arm.neon.vmlslu.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone diff --git a/test/CodeGen/ARM/vmlsl_lane.ll b/test/CodeGen/ARM/vmlsl_lane.ll deleted file mode 100644 index 1effbd6..0000000 --- a/test/CodeGen/ARM/vmlsl_lane.ll +++ /dev/null @@ -1,47 +0,0 @@ -; RUN: llc -mattr=+neon < %s | FileCheck %s -target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:64:64-v128:128:128-a0:0:32" -target triple = "thumbv7-elf" - -define arm_aapcs_vfpcc <4 x i32> @test_vmlsl_lanes16(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %arg2_int16x4_t) nounwind readnone { -entry: -; CHECK: test_vmlsl_lanes16 -; CHECK: vmlsl.s16 q0, d2, d3[1] - %0 = shufflevector <4 x i16> %arg2_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1] - %1 = tail call <4 x i32> @llvm.arm.neon.vmlsls.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %0) ; <<4 x i32>> [#uses=1] - ret <4 x i32> %1 -} - -declare <4 x i32> @llvm.arm.neon.vmlsls.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone - -define arm_aapcs_vfpcc <2 x i64> @test_vmlsl_lanes32(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %arg2_int32x2_t) nounwind readnone { -entry: -; CHECK: test_vmlsl_lanes32 -; CHECK: vmlsl.s32 q0, d2, d3[1] - %0 = shufflevector <2 x i32> %arg2_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1] - %1 = tail call <2 x i64> @llvm.arm.neon.vmlsls.v2i64(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %0) ; <<2 x i64>> [#uses=1] - ret <2 x i64> %1 -} - -declare <2 x i64> @llvm.arm.neon.vmlsls.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone - -define arm_aapcs_vfpcc <4 x i32> @test_vmlsl_laneu16(<4 x i32> %arg0_uint32x4_t, <4 x i16> %arg1_uint16x4_t, <4 x i16> %arg2_uint16x4_t) nounwind readnone { -entry: -; CHECK: test_vmlsl_laneu16 -; CHECK: vmlsl.u16 q0, d2, d3[1] - %0 = shufflevector <4 x i16> %arg2_uint16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1] - %1 = tail call <4 x i32> @llvm.arm.neon.vmlslu.v4i32(<4 x i32> %arg0_uint32x4_t, <4 x i16> %arg1_uint16x4_t, <4 x i16> %0) ; <<4 x i32>> [#uses=1] - ret <4 x i32> %1 -} - -declare <4 x i32> @llvm.arm.neon.vmlslu.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone - -define arm_aapcs_vfpcc <2 x i64> @test_vmlsl_laneu32(<2 x i64> %arg0_uint64x2_t, <2 x i32> %arg1_uint32x2_t, <2 x i32> %arg2_uint32x2_t) nounwind readnone { -entry: -; CHECK: test_vmlsl_laneu32 -; CHECK: vmlsl.u32 q0, d2, d3[1] - %0 = shufflevector <2 x i32> %arg2_uint32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1] - %1 = tail call <2 x i64> @llvm.arm.neon.vmlslu.v2i64(<2 x i64> %arg0_uint64x2_t, <2 x i32> %arg1_uint32x2_t, <2 x i32> %0) ; <<2 x i64>> [#uses=1] - ret <2 x i64> %1 -} - -declare <2 x i64> @llvm.arm.neon.vmlslu.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone diff --git a/test/CodeGen/ARM/vmov.ll b/test/CodeGen/ARM/vmov.ll index f43266c..ed69f97 100644 --- a/test/CodeGen/ARM/vmov.ll +++ b/test/CodeGen/ARM/vmov.ll @@ -133,3 +133,171 @@ define <2 x i64> @v_movQi64() nounwind { ;CHECK: vmov.i64 ret <2 x i64> < i64 18374687574888349695, i64 18374687574888349695 > } + +define <8 x i16> @vmovls8(<8 x i8>* %A) nounwind { +;CHECK: vmovls8: +;CHECK: vmovl.s8 + %tmp1 = load <8 x i8>* %A + %tmp2 = call <8 x i16> @llvm.arm.neon.vmovls.v8i16(<8 x i8> %tmp1) + ret <8 x i16> %tmp2 +} + +define <4 x i32> @vmovls16(<4 x i16>* %A) nounwind { +;CHECK: vmovls16: +;CHECK: vmovl.s16 + %tmp1 = load <4 x i16>* %A + %tmp2 = call <4 x i32> @llvm.arm.neon.vmovls.v4i32(<4 x i16> %tmp1) + ret <4 x i32> %tmp2 +} + +define <2 x i64> @vmovls32(<2 x i32>* %A) nounwind { +;CHECK: vmovls32: +;CHECK: vmovl.s32 + %tmp1 = load <2 x i32>* %A + %tmp2 = call <2 x i64> @llvm.arm.neon.vmovls.v2i64(<2 x i32> %tmp1) + ret <2 x i64> %tmp2 +} + +define <8 x i16> @vmovlu8(<8 x i8>* %A) nounwind { +;CHECK: vmovlu8: +;CHECK: vmovl.u8 + %tmp1 = load <8 x i8>* %A + %tmp2 = call <8 x i16> @llvm.arm.neon.vmovlu.v8i16(<8 x i8> %tmp1) + ret <8 x i16> %tmp2 +} + +define <4 x i32> @vmovlu16(<4 x i16>* %A) nounwind { +;CHECK: vmovlu16: +;CHECK: vmovl.u16 + %tmp1 = load <4 x i16>* %A + %tmp2 = call <4 x i32> @llvm.arm.neon.vmovlu.v4i32(<4 x i16> %tmp1) + ret <4 x i32> %tmp2 +} + +define <2 x i64> @vmovlu32(<2 x i32>* %A) nounwind { +;CHECK: vmovlu32: +;CHECK: vmovl.u32 + %tmp1 = load <2 x i32>* %A + %tmp2 = call <2 x i64> @llvm.arm.neon.vmovlu.v2i64(<2 x i32> %tmp1) + ret <2 x i64> %tmp2 +} + +declare <8 x i16> @llvm.arm.neon.vmovls.v8i16(<8 x i8>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vmovls.v4i32(<4 x i16>) nounwind readnone +declare <2 x i64> @llvm.arm.neon.vmovls.v2i64(<2 x i32>) nounwind readnone + +declare <8 x i16> @llvm.arm.neon.vmovlu.v8i16(<8 x i8>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vmovlu.v4i32(<4 x i16>) nounwind readnone +declare <2 x i64> @llvm.arm.neon.vmovlu.v2i64(<2 x i32>) nounwind readnone + +define <8 x i8> @vmovni16(<8 x i16>* %A) nounwind { +;CHECK: vmovni16: +;CHECK: vmovn.i16 + %tmp1 = load <8 x i16>* %A + %tmp2 = call <8 x i8> @llvm.arm.neon.vmovn.v8i8(<8 x i16> %tmp1) + ret <8 x i8> %tmp2 +} + +define <4 x i16> @vmovni32(<4 x i32>* %A) nounwind { +;CHECK: vmovni32: +;CHECK: vmovn.i32 + %tmp1 = load <4 x i32>* %A + %tmp2 = call <4 x i16> @llvm.arm.neon.vmovn.v4i16(<4 x i32> %tmp1) + ret <4 x i16> %tmp2 +} + +define <2 x i32> @vmovni64(<2 x i64>* %A) nounwind { +;CHECK: vmovni64: +;CHECK: vmovn.i64 + %tmp1 = load <2 x i64>* %A + %tmp2 = call <2 x i32> @llvm.arm.neon.vmovn.v2i32(<2 x i64> %tmp1) + ret <2 x i32> %tmp2 +} + +declare <8 x i8> @llvm.arm.neon.vmovn.v8i8(<8 x i16>) nounwind readnone +declare <4 x i16> @llvm.arm.neon.vmovn.v4i16(<4 x i32>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vmovn.v2i32(<2 x i64>) nounwind readnone + +define <8 x i8> @vqmovns16(<8 x i16>* %A) nounwind { +;CHECK: vqmovns16: +;CHECK: vqmovn.s16 + %tmp1 = load <8 x i16>* %A + %tmp2 = call <8 x i8> @llvm.arm.neon.vqmovns.v8i8(<8 x i16> %tmp1) + ret <8 x i8> %tmp2 +} + +define <4 x i16> @vqmovns32(<4 x i32>* %A) nounwind { +;CHECK: vqmovns32: +;CHECK: vqmovn.s32 + %tmp1 = load <4 x i32>* %A + %tmp2 = call <4 x i16> @llvm.arm.neon.vqmovns.v4i16(<4 x i32> %tmp1) + ret <4 x i16> %tmp2 +} + +define <2 x i32> @vqmovns64(<2 x i64>* %A) nounwind { +;CHECK: vqmovns64: +;CHECK: vqmovn.s64 + %tmp1 = load <2 x i64>* %A + %tmp2 = call <2 x i32> @llvm.arm.neon.vqmovns.v2i32(<2 x i64> %tmp1) + ret <2 x i32> %tmp2 +} + +define <8 x i8> @vqmovnu16(<8 x i16>* %A) nounwind { +;CHECK: vqmovnu16: +;CHECK: vqmovn.u16 + %tmp1 = load <8 x i16>* %A + %tmp2 = call <8 x i8> @llvm.arm.neon.vqmovnu.v8i8(<8 x i16> %tmp1) + ret <8 x i8> %tmp2 +} + +define <4 x i16> @vqmovnu32(<4 x i32>* %A) nounwind { +;CHECK: vqmovnu32: +;CHECK: vqmovn.u32 + %tmp1 = load <4 x i32>* %A + %tmp2 = call <4 x i16> @llvm.arm.neon.vqmovnu.v4i16(<4 x i32> %tmp1) + ret <4 x i16> %tmp2 +} + +define <2 x i32> @vqmovnu64(<2 x i64>* %A) nounwind { +;CHECK: vqmovnu64: +;CHECK: vqmovn.u64 + %tmp1 = load <2 x i64>* %A + %tmp2 = call <2 x i32> @llvm.arm.neon.vqmovnu.v2i32(<2 x i64> %tmp1) + ret <2 x i32> %tmp2 +} + +define <8 x i8> @vqmovuns16(<8 x i16>* %A) nounwind { +;CHECK: vqmovuns16: +;CHECK: vqmovun.s16 + %tmp1 = load <8 x i16>* %A + %tmp2 = call <8 x i8> @llvm.arm.neon.vqmovnsu.v8i8(<8 x i16> %tmp1) + ret <8 x i8> %tmp2 +} + +define <4 x i16> @vqmovuns32(<4 x i32>* %A) nounwind { +;CHECK: vqmovuns32: +;CHECK: vqmovun.s32 + %tmp1 = load <4 x i32>* %A + %tmp2 = call <4 x i16> @llvm.arm.neon.vqmovnsu.v4i16(<4 x i32> %tmp1) + ret <4 x i16> %tmp2 +} + +define <2 x i32> @vqmovuns64(<2 x i64>* %A) nounwind { +;CHECK: vqmovuns64: +;CHECK: vqmovun.s64 + %tmp1 = load <2 x i64>* %A + %tmp2 = call <2 x i32> @llvm.arm.neon.vqmovnsu.v2i32(<2 x i64> %tmp1) + ret <2 x i32> %tmp2 +} + +declare <8 x i8> @llvm.arm.neon.vqmovns.v8i8(<8 x i16>) nounwind readnone +declare <4 x i16> @llvm.arm.neon.vqmovns.v4i16(<4 x i32>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vqmovns.v2i32(<2 x i64>) nounwind readnone + +declare <8 x i8> @llvm.arm.neon.vqmovnu.v8i8(<8 x i16>) nounwind readnone +declare <4 x i16> @llvm.arm.neon.vqmovnu.v4i16(<4 x i32>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vqmovnu.v2i32(<2 x i64>) nounwind readnone + +declare <8 x i8> @llvm.arm.neon.vqmovnsu.v8i8(<8 x i16>) nounwind readnone +declare <4 x i16> @llvm.arm.neon.vqmovnsu.v4i16(<4 x i32>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vqmovnsu.v2i32(<2 x i64>) nounwind readnone diff --git a/test/CodeGen/ARM/vmovl.ll b/test/CodeGen/ARM/vmovl.ll deleted file mode 100644 index beb1613..0000000 --- a/test/CodeGen/ARM/vmovl.ll +++ /dev/null @@ -1,57 +0,0 @@ -; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s - -define <8 x i16> @vmovls8(<8 x i8>* %A) nounwind { -;CHECK: vmovls8: -;CHECK: vmovl.s8 - %tmp1 = load <8 x i8>* %A - %tmp2 = call <8 x i16> @llvm.arm.neon.vmovls.v8i16(<8 x i8> %tmp1) - ret <8 x i16> %tmp2 -} - -define <4 x i32> @vmovls16(<4 x i16>* %A) nounwind { -;CHECK: vmovls16: -;CHECK: vmovl.s16 - %tmp1 = load <4 x i16>* %A - %tmp2 = call <4 x i32> @llvm.arm.neon.vmovls.v4i32(<4 x i16> %tmp1) - ret <4 x i32> %tmp2 -} - -define <2 x i64> @vmovls32(<2 x i32>* %A) nounwind { -;CHECK: vmovls32: -;CHECK: vmovl.s32 - %tmp1 = load <2 x i32>* %A - %tmp2 = call <2 x i64> @llvm.arm.neon.vmovls.v2i64(<2 x i32> %tmp1) - ret <2 x i64> %tmp2 -} - -define <8 x i16> @vmovlu8(<8 x i8>* %A) nounwind { -;CHECK: vmovlu8: -;CHECK: vmovl.u8 - %tmp1 = load <8 x i8>* %A - %tmp2 = call <8 x i16> @llvm.arm.neon.vmovlu.v8i16(<8 x i8> %tmp1) - ret <8 x i16> %tmp2 -} - -define <4 x i32> @vmovlu16(<4 x i16>* %A) nounwind { -;CHECK: vmovlu16: -;CHECK: vmovl.u16 - %tmp1 = load <4 x i16>* %A - %tmp2 = call <4 x i32> @llvm.arm.neon.vmovlu.v4i32(<4 x i16> %tmp1) - ret <4 x i32> %tmp2 -} - -define <2 x i64> @vmovlu32(<2 x i32>* %A) nounwind { -;CHECK: vmovlu32: -;CHECK: vmovl.u32 - %tmp1 = load <2 x i32>* %A - %tmp2 = call <2 x i64> @llvm.arm.neon.vmovlu.v2i64(<2 x i32> %tmp1) - ret <2 x i64> %tmp2 -} - -declare <8 x i16> @llvm.arm.neon.vmovls.v8i16(<8 x i8>) nounwind readnone -declare <4 x i32> @llvm.arm.neon.vmovls.v4i32(<4 x i16>) nounwind readnone -declare <2 x i64> @llvm.arm.neon.vmovls.v2i64(<2 x i32>) nounwind readnone - -declare <8 x i16> @llvm.arm.neon.vmovlu.v8i16(<8 x i8>) nounwind readnone -declare <4 x i32> @llvm.arm.neon.vmovlu.v4i32(<4 x i16>) nounwind readnone -declare <2 x i64> @llvm.arm.neon.vmovlu.v2i64(<2 x i32>) nounwind readnone diff --git a/test/CodeGen/ARM/vmovn.ll b/test/CodeGen/ARM/vmovn.ll deleted file mode 100644 index 7bf049f..0000000 --- a/test/CodeGen/ARM/vmovn.ll +++ /dev/null @@ -1,29 +0,0 @@ -; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s - -define <8 x i8> @vmovni16(<8 x i16>* %A) nounwind { -;CHECK: vmovni16: -;CHECK: vmovn.i16 - %tmp1 = load <8 x i16>* %A - %tmp2 = call <8 x i8> @llvm.arm.neon.vmovn.v8i8(<8 x i16> %tmp1) - ret <8 x i8> %tmp2 -} - -define <4 x i16> @vmovni32(<4 x i32>* %A) nounwind { -;CHECK: vmovni32: -;CHECK: vmovn.i32 - %tmp1 = load <4 x i32>* %A - %tmp2 = call <4 x i16> @llvm.arm.neon.vmovn.v4i16(<4 x i32> %tmp1) - ret <4 x i16> %tmp2 -} - -define <2 x i32> @vmovni64(<2 x i64>* %A) nounwind { -;CHECK: vmovni64: -;CHECK: vmovn.i64 - %tmp1 = load <2 x i64>* %A - %tmp2 = call <2 x i32> @llvm.arm.neon.vmovn.v2i32(<2 x i64> %tmp1) - ret <2 x i32> %tmp2 -} - -declare <8 x i8> @llvm.arm.neon.vmovn.v8i8(<8 x i16>) nounwind readnone -declare <4 x i16> @llvm.arm.neon.vmovn.v4i16(<4 x i32>) nounwind readnone -declare <2 x i32> @llvm.arm.neon.vmovn.v2i32(<2 x i64>) nounwind readnone diff --git a/test/CodeGen/ARM/vmul.ll b/test/CodeGen/ARM/vmul.ll index 83ec55e..325da5d 100644 --- a/test/CodeGen/ARM/vmul.ll +++ b/test/CodeGen/ARM/vmul.ll @@ -92,3 +92,166 @@ define <16 x i8> @vmulQp8(<16 x i8>* %A, <16 x i8>* %B) nounwind { declare <8 x i8> @llvm.arm.neon.vmulp.v8i8(<8 x i8>, <8 x i8>) nounwind readnone declare <16 x i8> @llvm.arm.neon.vmulp.v16i8(<16 x i8>, <16 x i8>) nounwind readnone + +define arm_aapcs_vfpcc <2 x float> @test_vmul_lanef32(<2 x float> %arg0_float32x2_t, <2 x float> %arg1_float32x2_t) nounwind readnone { +entry: +; CHECK: test_vmul_lanef32: +; CHECK: vmul.f32 d0, d0, d1[0] + %0 = shufflevector <2 x float> %arg1_float32x2_t, <2 x float> undef, <2 x i32> zeroinitializer ; <<2 x float>> [#uses=1] + %1 = fmul <2 x float> %0, %arg0_float32x2_t ; <<2 x float>> [#uses=1] + ret <2 x float> %1 +} + +define arm_aapcs_vfpcc <4 x i16> @test_vmul_lanes16(<4 x i16> %arg0_int16x4_t, <4 x i16> %arg1_int16x4_t) nounwind readnone { +entry: +; CHECK: test_vmul_lanes16: +; CHECK: vmul.i16 d0, d0, d1[1] + %0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses$ + %1 = mul <4 x i16> %0, %arg0_int16x4_t ; <<4 x i16>> [#uses=1] + ret <4 x i16> %1 +} + +define arm_aapcs_vfpcc <2 x i32> @test_vmul_lanes32(<2 x i32> %arg0_int32x2_t, <2 x i32> %arg1_int32x2_t) nounwind readnone { +entry: +; CHECK: test_vmul_lanes32: +; CHECK: vmul.i32 d0, d0, d1[1] + %0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1] + %1 = mul <2 x i32> %0, %arg0_int32x2_t ; <<2 x i32>> [#uses=1] + ret <2 x i32> %1 +} + +define arm_aapcs_vfpcc <4 x float> @test_vmulQ_lanef32(<4 x float> %arg0_float32x4_t, <2 x float> %arg1_float32x2_t) nounwind readnone { +entry: +; CHECK: test_vmulQ_lanef32: +; CHECK: vmul.f32 q0, q0, d2[1] + %0 = shufflevector <2 x float> %arg1_float32x2_t, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x float>$ + %1 = fmul <4 x float> %0, %arg0_float32x4_t ; <<4 x float>> [#uses=1] + ret <4 x float> %1 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vmulQ_lanes16(<8 x i16> %arg0_int16x8_t, <4 x i16> %arg1_int16x4_t) nounwind readnone { +entry: +; CHECK: test_vmulQ_lanes16: +; CHECK: vmul.i16 q0, q0, d2[1] + %0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> + %1 = mul <8 x i16> %0, %arg0_int16x8_t ; <<8 x i16>> [#uses=1] + ret <8 x i16> %1 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vmulQ_lanes32(<4 x i32> %arg0_int32x4_t, <2 x i32> %arg1_int32x2_t) nounwind readnone { +entry: +; CHECK: test_vmulQ_lanes32: +; CHECK: vmul.i32 q0, q0, d2[1] + %0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i32>> [#uses$ + %1 = mul <4 x i32> %0, %arg0_int32x4_t ; <<4 x i32>> [#uses=1] + ret <4 x i32> %1 +} + +define <8 x i16> @vmulls8(<8 x i8>* %A, <8 x i8>* %B) nounwind { +;CHECK: vmulls8: +;CHECK: vmull.s8 + %tmp1 = load <8 x i8>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = call <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2) + ret <8 x i16> %tmp3 +} + +define <4 x i32> @vmulls16(<4 x i16>* %A, <4 x i16>* %B) nounwind { +;CHECK: vmulls16: +;CHECK: vmull.s16 + %tmp1 = load <4 x i16>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2) + ret <4 x i32> %tmp3 +} + +define <2 x i64> @vmulls32(<2 x i32>* %A, <2 x i32>* %B) nounwind { +;CHECK: vmulls32: +;CHECK: vmull.s32 + %tmp1 = load <2 x i32>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2) + ret <2 x i64> %tmp3 +} + +define <8 x i16> @vmullu8(<8 x i8>* %A, <8 x i8>* %B) nounwind { +;CHECK: vmullu8: +;CHECK: vmull.u8 + %tmp1 = load <8 x i8>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2) + ret <8 x i16> %tmp3 +} + +define <4 x i32> @vmullu16(<4 x i16>* %A, <4 x i16>* %B) nounwind { +;CHECK: vmullu16: +;CHECK: vmull.u16 + %tmp1 = load <4 x i16>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2) + ret <4 x i32> %tmp3 +} + +define <2 x i64> @vmullu32(<2 x i32>* %A, <2 x i32>* %B) nounwind { +;CHECK: vmullu32: +;CHECK: vmull.u32 + %tmp1 = load <2 x i32>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2) + ret <2 x i64> %tmp3 +} + +define <8 x i16> @vmullp8(<8 x i8>* %A, <8 x i8>* %B) nounwind { +;CHECK: vmullp8: +;CHECK: vmull.p8 + %tmp1 = load <8 x i8>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = call <8 x i16> @llvm.arm.neon.vmullp.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2) + ret <8 x i16> %tmp3 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vmull_lanes16(<4 x i16> %arg0_int16x4_t, <4 x i16> %arg1_int16x4_t) nounwind readnone { +entry: +; CHECK: test_vmull_lanes16 +; CHECK: vmull.s16 q0, d0, d1[1] + %0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1] + %1 = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %arg0_int16x4_t, <4 x i16> %0) ; <<4 x i32>> [#uses=1] + ret <4 x i32> %1 +} + +define arm_aapcs_vfpcc <2 x i64> @test_vmull_lanes32(<2 x i32> %arg0_int32x2_t, <2 x i32> %arg1_int32x2_t) nounwind readnone { +entry: +; CHECK: test_vmull_lanes32 +; CHECK: vmull.s32 q0, d0, d1[1] + %0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1] + %1 = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %arg0_int32x2_t, <2 x i32> %0) ; <<2 x i64>> [#uses=1] + ret <2 x i64> %1 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vmull_laneu16(<4 x i16> %arg0_uint16x4_t, <4 x i16> %arg1_uint16x4_t) nounwind readnone { +entry: +; CHECK: test_vmull_laneu16 +; CHECK: vmull.u16 q0, d0, d1[1] + %0 = shufflevector <4 x i16> %arg1_uint16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1] + %1 = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %arg0_uint16x4_t, <4 x i16> %0) ; <<4 x i32>> [#uses=1] + ret <4 x i32> %1 +} + +define arm_aapcs_vfpcc <2 x i64> @test_vmull_laneu32(<2 x i32> %arg0_uint32x2_t, <2 x i32> %arg1_uint32x2_t) nounwind readnone { +entry: +; CHECK: test_vmull_laneu32 +; CHECK: vmull.u32 q0, d0, d1[1] + %0 = shufflevector <2 x i32> %arg1_uint32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1] + %1 = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %arg0_uint32x2_t, <2 x i32> %0) ; <<2 x i64>> [#uses=1] + ret <2 x i64> %1 +} + +declare <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8>, <8 x i8>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32>, <2 x i32>) nounwind readnone + +declare <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8>, <8 x i8>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32>, <2 x i32>) nounwind readnone + +declare <8 x i16> @llvm.arm.neon.vmullp.v8i16(<8 x i8>, <8 x i8>) nounwind readnone diff --git a/test/CodeGen/ARM/vmul_lane.ll b/test/CodeGen/ARM/vmul_lane.ll deleted file mode 100644 index 7edd873..0000000 --- a/test/CodeGen/ARM/vmul_lane.ll +++ /dev/null @@ -1,57 +0,0 @@ -; RUN: llc -mattr=+neon < %s | FileCheck %s -target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:64:64-v128:128:128-a0:0:32" -target triple = "thumbv7-elf" - -define arm_aapcs_vfpcc <2 x float> @test_vmul_lanef32(<2 x float> %arg0_float32x2_t, <2 x float> %arg1_float32x2_t) nounwind readnone { -entry: -; CHECK: test_vmul_lanef32: -; CHECK: vmul.f32 d0, d0, d1[0] - %0 = shufflevector <2 x float> %arg1_float32x2_t, <2 x float> undef, <2 x i32> zeroinitializer ; <<2 x float>> [#uses=1] - %1 = fmul <2 x float> %0, %arg0_float32x2_t ; <<2 x float>> [#uses=1] - ret <2 x float> %1 -} - -define arm_aapcs_vfpcc <4 x i16> @test_vmul_lanes16(<4 x i16> %arg0_int16x4_t, <4 x i16> %arg1_int16x4_t) nounwind readnone { -entry: -; CHECK: test_vmul_lanes16: -; CHECK: vmul.i16 d0, d0, d1[1] - %0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses$ - %1 = mul <4 x i16> %0, %arg0_int16x4_t ; <<4 x i16>> [#uses=1] - ret <4 x i16> %1 -} - -define arm_aapcs_vfpcc <2 x i32> @test_vmul_lanes32(<2 x i32> %arg0_int32x2_t, <2 x i32> %arg1_int32x2_t) nounwind readnone { -entry: -; CHECK: test_vmul_lanes32: -; CHECK: vmul.i32 d0, d0, d1[1] - %0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1] - %1 = mul <2 x i32> %0, %arg0_int32x2_t ; <<2 x i32>> [#uses=1] - ret <2 x i32> %1 -} - -define arm_aapcs_vfpcc <4 x float> @test_vmulQ_lanef32(<4 x float> %arg0_float32x4_t, <2 x float> %arg1_float32x2_t) nounwind readnone { -entry: -; CHECK: test_vmulQ_lanef32: -; CHECK: vmul.f32 q0, q0, d2[1] - %0 = shufflevector <2 x float> %arg1_float32x2_t, <2 x float> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x float>$ - %1 = fmul <4 x float> %0, %arg0_float32x4_t ; <<4 x float>> [#uses=1] - ret <4 x float> %1 -} - -define arm_aapcs_vfpcc <8 x i16> @test_vmulQ_lanes16(<8 x i16> %arg0_int16x8_t, <4 x i16> %arg1_int16x4_t) nounwind readnone { -entry: -; CHECK: test_vmulQ_lanes16: -; CHECK: vmul.i16 q0, q0, d2[1] - %0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> - %1 = mul <8 x i16> %0, %arg0_int16x8_t ; <<8 x i16>> [#uses=1] - ret <8 x i16> %1 -} - -define arm_aapcs_vfpcc <4 x i32> @test_vmulQ_lanes32(<4 x i32> %arg0_int32x4_t, <2 x i32> %arg1_int32x2_t) nounwind readnone { -entry: -; CHECK: test_vmulQ_lanes32: -; CHECK: vmul.i32 q0, q0, d2[1] - %0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i32>> [#uses$ - %1 = mul <4 x i32> %0, %arg0_int32x4_t ; <<4 x i32>> [#uses=1] - ret <4 x i32> %1 -} diff --git a/test/CodeGen/ARM/vmull.ll b/test/CodeGen/ARM/vmull.ll deleted file mode 100644 index 56149da..0000000 --- a/test/CodeGen/ARM/vmull.ll +++ /dev/null @@ -1,74 +0,0 @@ -; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s - -define <8 x i16> @vmulls8(<8 x i8>* %A, <8 x i8>* %B) nounwind { -;CHECK: vmulls8: -;CHECK: vmull.s8 - %tmp1 = load <8 x i8>* %A - %tmp2 = load <8 x i8>* %B - %tmp3 = call <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2) - ret <8 x i16> %tmp3 -} - -define <4 x i32> @vmulls16(<4 x i16>* %A, <4 x i16>* %B) nounwind { -;CHECK: vmulls16: -;CHECK: vmull.s16 - %tmp1 = load <4 x i16>* %A - %tmp2 = load <4 x i16>* %B - %tmp3 = call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2) - ret <4 x i32> %tmp3 -} - -define <2 x i64> @vmulls32(<2 x i32>* %A, <2 x i32>* %B) nounwind { -;CHECK: vmulls32: -;CHECK: vmull.s32 - %tmp1 = load <2 x i32>* %A - %tmp2 = load <2 x i32>* %B - %tmp3 = call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2) - ret <2 x i64> %tmp3 -} - -define <8 x i16> @vmullu8(<8 x i8>* %A, <8 x i8>* %B) nounwind { -;CHECK: vmullu8: -;CHECK: vmull.u8 - %tmp1 = load <8 x i8>* %A - %tmp2 = load <8 x i8>* %B - %tmp3 = call <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2) - ret <8 x i16> %tmp3 -} - -define <4 x i32> @vmullu16(<4 x i16>* %A, <4 x i16>* %B) nounwind { -;CHECK: vmullu16: -;CHECK: vmull.u16 - %tmp1 = load <4 x i16>* %A - %tmp2 = load <4 x i16>* %B - %tmp3 = call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2) - ret <4 x i32> %tmp3 -} - -define <2 x i64> @vmullu32(<2 x i32>* %A, <2 x i32>* %B) nounwind { -;CHECK: vmullu32: -;CHECK: vmull.u32 - %tmp1 = load <2 x i32>* %A - %tmp2 = load <2 x i32>* %B - %tmp3 = call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2) - ret <2 x i64> %tmp3 -} - -define <8 x i16> @vmullp8(<8 x i8>* %A, <8 x i8>* %B) nounwind { -;CHECK: vmullp8: -;CHECK: vmull.p8 - %tmp1 = load <8 x i8>* %A - %tmp2 = load <8 x i8>* %B - %tmp3 = call <8 x i16> @llvm.arm.neon.vmullp.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2) - ret <8 x i16> %tmp3 -} - -declare <8 x i16> @llvm.arm.neon.vmulls.v8i16(<8 x i8>, <8 x i8>) nounwind readnone -declare <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16>, <4 x i16>) nounwind readnone -declare <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32>, <2 x i32>) nounwind readnone - -declare <8 x i16> @llvm.arm.neon.vmullu.v8i16(<8 x i8>, <8 x i8>) nounwind readnone -declare <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16>, <4 x i16>) nounwind readnone -declare <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32>, <2 x i32>) nounwind readnone - -declare <8 x i16> @llvm.arm.neon.vmullp.v8i16(<8 x i8>, <8 x i8>) nounwind readnone diff --git a/test/CodeGen/ARM/vmull_lane.ll b/test/CodeGen/ARM/vmull_lane.ll deleted file mode 100644 index 72cb3b1..0000000 --- a/test/CodeGen/ARM/vmull_lane.ll +++ /dev/null @@ -1,47 +0,0 @@ -; RUN: llc -mattr=+neon < %s | FileCheck %s -target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:64:64-v128:128:128-a0:0:32" -target triple = "thumbv7-elf" - -define arm_aapcs_vfpcc <4 x i32> @test_vmull_lanes16(<4 x i16> %arg0_int16x4_t, <4 x i16> %arg1_int16x4_t) nounwind readnone { -entry: -; CHECK: test_vmull_lanes16 -; CHECK: vmull.s16 q0, d0, d1[1] - %0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1] - %1 = tail call <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16> %arg0_int16x4_t, <4 x i16> %0) ; <<4 x i32>> [#uses=1] - ret <4 x i32> %1 -} - -declare <4 x i32> @llvm.arm.neon.vmulls.v4i32(<4 x i16>, <4 x i16>) nounwind readnone - -define arm_aapcs_vfpcc <2 x i64> @test_vmull_lanes32(<2 x i32> %arg0_int32x2_t, <2 x i32> %arg1_int32x2_t) nounwind readnone { -entry: -; CHECK: test_vmull_lanes32 -; CHECK: vmull.s32 q0, d0, d1[1] - %0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1] - %1 = tail call <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32> %arg0_int32x2_t, <2 x i32> %0) ; <<2 x i64>> [#uses=1] - ret <2 x i64> %1 -} - -declare <2 x i64> @llvm.arm.neon.vmulls.v2i64(<2 x i32>, <2 x i32>) nounwind readnone - -define arm_aapcs_vfpcc <4 x i32> @test_vmull_laneu16(<4 x i16> %arg0_uint16x4_t, <4 x i16> %arg1_uint16x4_t) nounwind readnone { -entry: -; CHECK: test_vmull_laneu16 -; CHECK: vmull.u16 q0, d0, d1[1] - %0 = shufflevector <4 x i16> %arg1_uint16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1] - %1 = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %arg0_uint16x4_t, <4 x i16> %0) ; <<4 x i32>> [#uses=1] - ret <4 x i32> %1 -} - -declare <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16>, <4 x i16>) nounwind readnone - -define arm_aapcs_vfpcc <2 x i64> @test_vmull_laneu32(<2 x i32> %arg0_uint32x2_t, <2 x i32> %arg1_uint32x2_t) nounwind readnone { -entry: -; CHECK: test_vmull_laneu32 -; CHECK: vmull.u32 q0, d0, d1[1] - %0 = shufflevector <2 x i32> %arg1_uint32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1] - %1 = tail call <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32> %arg0_uint32x2_t, <2 x i32> %0) ; <<2 x i64>> [#uses=1] - ret <2 x i64> %1 -} - -declare <2 x i64> @llvm.arm.neon.vmullu.v2i64(<2 x i32>, <2 x i32>) nounwind readnone diff --git a/test/CodeGen/ARM/vmvn.ll b/test/CodeGen/ARM/vmvn.ll deleted file mode 100644 index d756b76..0000000 --- a/test/CodeGen/ARM/vmvn.ll +++ /dev/null @@ -1,65 +0,0 @@ -; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s - -define <8 x i8> @v_mvni8(<8 x i8>* %A) nounwind { -;CHECK: v_mvni8: -;CHECK: vmvn - %tmp1 = load <8 x i8>* %A - %tmp2 = xor <8 x i8> %tmp1, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 > - ret <8 x i8> %tmp2 -} - -define <4 x i16> @v_mvni16(<4 x i16>* %A) nounwind { -;CHECK: v_mvni16: -;CHECK: vmvn - %tmp1 = load <4 x i16>* %A - %tmp2 = xor <4 x i16> %tmp1, < i16 -1, i16 -1, i16 -1, i16 -1 > - ret <4 x i16> %tmp2 -} - -define <2 x i32> @v_mvni32(<2 x i32>* %A) nounwind { -;CHECK: v_mvni32: -;CHECK: vmvn - %tmp1 = load <2 x i32>* %A - %tmp2 = xor <2 x i32> %tmp1, < i32 -1, i32 -1 > - ret <2 x i32> %tmp2 -} - -define <1 x i64> @v_mvni64(<1 x i64>* %A) nounwind { -;CHECK: v_mvni64: -;CHECK: vmvn - %tmp1 = load <1 x i64>* %A - %tmp2 = xor <1 x i64> %tmp1, < i64 -1 > - ret <1 x i64> %tmp2 -} - -define <16 x i8> @v_mvnQi8(<16 x i8>* %A) nounwind { -;CHECK: v_mvnQi8: -;CHECK: vmvn - %tmp1 = load <16 x i8>* %A - %tmp2 = xor <16 x i8> %tmp1, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 > - ret <16 x i8> %tmp2 -} - -define <8 x i16> @v_mvnQi16(<8 x i16>* %A) nounwind { -;CHECK: v_mvnQi16: -;CHECK: vmvn - %tmp1 = load <8 x i16>* %A - %tmp2 = xor <8 x i16> %tmp1, < i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1 > - ret <8 x i16> %tmp2 -} - -define <4 x i32> @v_mvnQi32(<4 x i32>* %A) nounwind { -;CHECK: v_mvnQi32: -;CHECK: vmvn - %tmp1 = load <4 x i32>* %A - %tmp2 = xor <4 x i32> %tmp1, < i32 -1, i32 -1, i32 -1, i32 -1 > - ret <4 x i32> %tmp2 -} - -define <2 x i64> @v_mvnQi64(<2 x i64>* %A) nounwind { -;CHECK: v_mvnQi64: -;CHECK: vmvn - %tmp1 = load <2 x i64>* %A - %tmp2 = xor <2 x i64> %tmp1, < i64 -1, i64 -1 > - ret <2 x i64> %tmp2 -} diff --git a/test/CodeGen/ARM/vneg.ll b/test/CodeGen/ARM/vneg.ll index e560bdd..7764e87 100644 --- a/test/CodeGen/ARM/vneg.ll +++ b/test/CodeGen/ARM/vneg.ll @@ -63,3 +63,59 @@ define <4 x float> @vnegQf32(<4 x float>* %A) nounwind { %tmp2 = sub <4 x float> < float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00 >, %tmp1 ret <4 x float> %tmp2 } + +define <8 x i8> @vqnegs8(<8 x i8>* %A) nounwind { +;CHECK: vqnegs8: +;CHECK: vqneg.s8 + %tmp1 = load <8 x i8>* %A + %tmp2 = call <8 x i8> @llvm.arm.neon.vqneg.v8i8(<8 x i8> %tmp1) + ret <8 x i8> %tmp2 +} + +define <4 x i16> @vqnegs16(<4 x i16>* %A) nounwind { +;CHECK: vqnegs16: +;CHECK: vqneg.s16 + %tmp1 = load <4 x i16>* %A + %tmp2 = call <4 x i16> @llvm.arm.neon.vqneg.v4i16(<4 x i16> %tmp1) + ret <4 x i16> %tmp2 +} + +define <2 x i32> @vqnegs32(<2 x i32>* %A) nounwind { +;CHECK: vqnegs32: +;CHECK: vqneg.s32 + %tmp1 = load <2 x i32>* %A + %tmp2 = call <2 x i32> @llvm.arm.neon.vqneg.v2i32(<2 x i32> %tmp1) + ret <2 x i32> %tmp2 +} + +define <16 x i8> @vqnegQs8(<16 x i8>* %A) nounwind { +;CHECK: vqnegQs8: +;CHECK: vqneg.s8 + %tmp1 = load <16 x i8>* %A + %tmp2 = call <16 x i8> @llvm.arm.neon.vqneg.v16i8(<16 x i8> %tmp1) + ret <16 x i8> %tmp2 +} + +define <8 x i16> @vqnegQs16(<8 x i16>* %A) nounwind { +;CHECK: vqnegQs16: +;CHECK: vqneg.s16 + %tmp1 = load <8 x i16>* %A + %tmp2 = call <8 x i16> @llvm.arm.neon.vqneg.v8i16(<8 x i16> %tmp1) + ret <8 x i16> %tmp2 +} + +define <4 x i32> @vqnegQs32(<4 x i32>* %A) nounwind { +;CHECK: vqnegQs32: +;CHECK: vqneg.s32 + %tmp1 = load <4 x i32>* %A + %tmp2 = call <4 x i32> @llvm.arm.neon.vqneg.v4i32(<4 x i32> %tmp1) + ret <4 x i32> %tmp2 +} + +declare <8 x i8> @llvm.arm.neon.vqneg.v8i8(<8 x i8>) nounwind readnone +declare <4 x i16> @llvm.arm.neon.vqneg.v4i16(<4 x i16>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vqneg.v2i32(<2 x i32>) nounwind readnone + +declare <16 x i8> @llvm.arm.neon.vqneg.v16i8(<16 x i8>) nounwind readnone +declare <8 x i16> @llvm.arm.neon.vqneg.v8i16(<8 x i16>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vqneg.v4i32(<4 x i32>) nounwind readnone diff --git a/test/CodeGen/ARM/vorn.ll b/test/CodeGen/ARM/vorn.ll deleted file mode 100644 index bef6d37..0000000 --- a/test/CodeGen/ARM/vorn.ll +++ /dev/null @@ -1,81 +0,0 @@ -; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s - -define <8 x i8> @v_orni8(<8 x i8>* %A, <8 x i8>* %B) nounwind { -;CHECK: v_orni8: -;CHECK: vorn - %tmp1 = load <8 x i8>* %A - %tmp2 = load <8 x i8>* %B - %tmp3 = xor <8 x i8> %tmp2, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 > - %tmp4 = or <8 x i8> %tmp1, %tmp3 - ret <8 x i8> %tmp4 -} - -define <4 x i16> @v_orni16(<4 x i16>* %A, <4 x i16>* %B) nounwind { -;CHECK: v_orni16: -;CHECK: vorn - %tmp1 = load <4 x i16>* %A - %tmp2 = load <4 x i16>* %B - %tmp3 = xor <4 x i16> %tmp2, < i16 -1, i16 -1, i16 -1, i16 -1 > - %tmp4 = or <4 x i16> %tmp1, %tmp3 - ret <4 x i16> %tmp4 -} - -define <2 x i32> @v_orni32(<2 x i32>* %A, <2 x i32>* %B) nounwind { -;CHECK: v_orni32: -;CHECK: vorn - %tmp1 = load <2 x i32>* %A - %tmp2 = load <2 x i32>* %B - %tmp3 = xor <2 x i32> %tmp2, < i32 -1, i32 -1 > - %tmp4 = or <2 x i32> %tmp1, %tmp3 - ret <2 x i32> %tmp4 -} - -define <1 x i64> @v_orni64(<1 x i64>* %A, <1 x i64>* %B) nounwind { -;CHECK: v_orni64: -;CHECK: vorn - %tmp1 = load <1 x i64>* %A - %tmp2 = load <1 x i64>* %B - %tmp3 = xor <1 x i64> %tmp2, < i64 -1 > - %tmp4 = or <1 x i64> %tmp1, %tmp3 - ret <1 x i64> %tmp4 -} - -define <16 x i8> @v_ornQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind { -;CHECK: v_ornQi8: -;CHECK: vorn - %tmp1 = load <16 x i8>* %A - %tmp2 = load <16 x i8>* %B - %tmp3 = xor <16 x i8> %tmp2, < i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1 > - %tmp4 = or <16 x i8> %tmp1, %tmp3 - ret <16 x i8> %tmp4 -} - -define <8 x i16> @v_ornQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind { -;CHECK: v_ornQi16: -;CHECK: vorn - %tmp1 = load <8 x i16>* %A - %tmp2 = load <8 x i16>* %B - %tmp3 = xor <8 x i16> %tmp2, < i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1 > - %tmp4 = or <8 x i16> %tmp1, %tmp3 - ret <8 x i16> %tmp4 -} - -define <4 x i32> @v_ornQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind { -;CHECK: v_ornQi32: -;CHECK: vorn - %tmp1 = load <4 x i32>* %A - %tmp2 = load <4 x i32>* %B - %tmp3 = xor <4 x i32> %tmp2, < i32 -1, i32 -1, i32 -1, i32 -1 > - %tmp4 = or <4 x i32> %tmp1, %tmp3 - ret <4 x i32> %tmp4 -} - -define <2 x i64> @v_ornQi64(<2 x i64>* %A, <2 x i64>* %B) nounwind { -;CHECK: v_ornQi64: -;CHECK: vorn - %tmp1 = load <2 x i64>* %A - %tmp2 = load <2 x i64>* %B - %tmp3 = xor <2 x i64> %tmp2, < i64 -1, i64 -1 > - %tmp4 = or <2 x i64> %tmp1, %tmp3 - ret <2 x i64> %tmp4 -} diff --git a/test/CodeGen/ARM/vorr.ll b/test/CodeGen/ARM/vorr.ll deleted file mode 100644 index e9777ab..0000000 --- a/test/CodeGen/ARM/vorr.ll +++ /dev/null @@ -1,73 +0,0 @@ -; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s - -define <8 x i8> @v_orri8(<8 x i8>* %A, <8 x i8>* %B) nounwind { -;CHECK: v_orri8: -;CHECK: vorr - %tmp1 = load <8 x i8>* %A - %tmp2 = load <8 x i8>* %B - %tmp3 = or <8 x i8> %tmp1, %tmp2 - ret <8 x i8> %tmp3 -} - -define <4 x i16> @v_orri16(<4 x i16>* %A, <4 x i16>* %B) nounwind { -;CHECK: v_orri16: -;CHECK: vorr - %tmp1 = load <4 x i16>* %A - %tmp2 = load <4 x i16>* %B - %tmp3 = or <4 x i16> %tmp1, %tmp2 - ret <4 x i16> %tmp3 -} - -define <2 x i32> @v_orri32(<2 x i32>* %A, <2 x i32>* %B) nounwind { -;CHECK: v_orri32: -;CHECK: vorr - %tmp1 = load <2 x i32>* %A - %tmp2 = load <2 x i32>* %B - %tmp3 = or <2 x i32> %tmp1, %tmp2 - ret <2 x i32> %tmp3 -} - -define <1 x i64> @v_orri64(<1 x i64>* %A, <1 x i64>* %B) nounwind { -;CHECK: v_orri64: -;CHECK: vorr - %tmp1 = load <1 x i64>* %A - %tmp2 = load <1 x i64>* %B - %tmp3 = or <1 x i64> %tmp1, %tmp2 - ret <1 x i64> %tmp3 -} - -define <16 x i8> @v_orrQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind { -;CHECK: v_orrQi8: -;CHECK: vorr - %tmp1 = load <16 x i8>* %A - %tmp2 = load <16 x i8>* %B - %tmp3 = or <16 x i8> %tmp1, %tmp2 - ret <16 x i8> %tmp3 -} - -define <8 x i16> @v_orrQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind { -;CHECK: v_orrQi16: -;CHECK: vorr - %tmp1 = load <8 x i16>* %A - %tmp2 = load <8 x i16>* %B - %tmp3 = or <8 x i16> %tmp1, %tmp2 - ret <8 x i16> %tmp3 -} - -define <4 x i32> @v_orrQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind { -;CHECK: v_orrQi32: -;CHECK: vorr - %tmp1 = load <4 x i32>* %A - %tmp2 = load <4 x i32>* %B - %tmp3 = or <4 x i32> %tmp1, %tmp2 - ret <4 x i32> %tmp3 -} - -define <2 x i64> @v_orrQi64(<2 x i64>* %A, <2 x i64>* %B) nounwind { -;CHECK: v_orrQi64: -;CHECK: vorr - %tmp1 = load <2 x i64>* %A - %tmp2 = load <2 x i64>* %B - %tmp3 = or <2 x i64> %tmp1, %tmp2 - ret <2 x i64> %tmp3 -} diff --git a/test/CodeGen/ARM/vpadd.ll b/test/CodeGen/ARM/vpadd.ll index 69cfb23..2125573 100644 --- a/test/CodeGen/ARM/vpadd.ll +++ b/test/CodeGen/ARM/vpadd.ll @@ -41,3 +41,115 @@ declare <4 x i16> @llvm.arm.neon.vpadd.v4i16(<4 x i16>, <4 x i16>) nounwind read declare <2 x i32> @llvm.arm.neon.vpadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone declare <2 x float> @llvm.arm.neon.vpadd.v2f32(<2 x float>, <2 x float>) nounwind readnone + +define <4 x i16> @vpaddls8(<8 x i8>* %A) nounwind { +;CHECK: vpaddls8: +;CHECK: vpaddl.s8 + %tmp1 = load <8 x i8>* %A + %tmp2 = call <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8> %tmp1) + ret <4 x i16> %tmp2 +} + +define <2 x i32> @vpaddls16(<4 x i16>* %A) nounwind { +;CHECK: vpaddls16: +;CHECK: vpaddl.s16 + %tmp1 = load <4 x i16>* %A + %tmp2 = call <2 x i32> @llvm.arm.neon.vpaddls.v2i32.v4i16(<4 x i16> %tmp1) + ret <2 x i32> %tmp2 +} + +define <1 x i64> @vpaddls32(<2 x i32>* %A) nounwind { +;CHECK: vpaddls32: +;CHECK: vpaddl.s32 + %tmp1 = load <2 x i32>* %A + %tmp2 = call <1 x i64> @llvm.arm.neon.vpaddls.v1i64.v2i32(<2 x i32> %tmp1) + ret <1 x i64> %tmp2 +} + +define <4 x i16> @vpaddlu8(<8 x i8>* %A) nounwind { +;CHECK: vpaddlu8: +;CHECK: vpaddl.u8 + %tmp1 = load <8 x i8>* %A + %tmp2 = call <4 x i16> @llvm.arm.neon.vpaddlu.v4i16.v8i8(<8 x i8> %tmp1) + ret <4 x i16> %tmp2 +} + +define <2 x i32> @vpaddlu16(<4 x i16>* %A) nounwind { +;CHECK: vpaddlu16: +;CHECK: vpaddl.u16 + %tmp1 = load <4 x i16>* %A + %tmp2 = call <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16> %tmp1) + ret <2 x i32> %tmp2 +} + +define <1 x i64> @vpaddlu32(<2 x i32>* %A) nounwind { +;CHECK: vpaddlu32: +;CHECK: vpaddl.u32 + %tmp1 = load <2 x i32>* %A + %tmp2 = call <1 x i64> @llvm.arm.neon.vpaddlu.v1i64.v2i32(<2 x i32> %tmp1) + ret <1 x i64> %tmp2 +} + +define <8 x i16> @vpaddlQs8(<16 x i8>* %A) nounwind { +;CHECK: vpaddlQs8: +;CHECK: vpaddl.s8 + %tmp1 = load <16 x i8>* %A + %tmp2 = call <8 x i16> @llvm.arm.neon.vpaddls.v8i16.v16i8(<16 x i8> %tmp1) + ret <8 x i16> %tmp2 +} + +define <4 x i32> @vpaddlQs16(<8 x i16>* %A) nounwind { +;CHECK: vpaddlQs16: +;CHECK: vpaddl.s16 + %tmp1 = load <8 x i16>* %A + %tmp2 = call <4 x i32> @llvm.arm.neon.vpaddls.v4i32.v8i16(<8 x i16> %tmp1) + ret <4 x i32> %tmp2 +} + +define <2 x i64> @vpaddlQs32(<4 x i32>* %A) nounwind { +;CHECK: vpaddlQs32: +;CHECK: vpaddl.s32 + %tmp1 = load <4 x i32>* %A + %tmp2 = call <2 x i64> @llvm.arm.neon.vpaddls.v2i64.v4i32(<4 x i32> %tmp1) + ret <2 x i64> %tmp2 +} + +define <8 x i16> @vpaddlQu8(<16 x i8>* %A) nounwind { +;CHECK: vpaddlQu8: +;CHECK: vpaddl.u8 + %tmp1 = load <16 x i8>* %A + %tmp2 = call <8 x i16> @llvm.arm.neon.vpaddlu.v8i16.v16i8(<16 x i8> %tmp1) + ret <8 x i16> %tmp2 +} + +define <4 x i32> @vpaddlQu16(<8 x i16>* %A) nounwind { +;CHECK: vpaddlQu16: +;CHECK: vpaddl.u16 + %tmp1 = load <8 x i16>* %A + %tmp2 = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> %tmp1) + ret <4 x i32> %tmp2 +} + +define <2 x i64> @vpaddlQu32(<4 x i32>* %A) nounwind { +;CHECK: vpaddlQu32: +;CHECK: vpaddl.u32 + %tmp1 = load <4 x i32>* %A + %tmp2 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %tmp1) + ret <2 x i64> %tmp2 +} + +declare <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vpaddls.v2i32.v4i16(<4 x i16>) nounwind readnone +declare <1 x i64> @llvm.arm.neon.vpaddls.v1i64.v2i32(<2 x i32>) nounwind readnone + +declare <4 x i16> @llvm.arm.neon.vpaddlu.v4i16.v8i8(<8 x i8>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16>) nounwind readnone +declare <1 x i64> @llvm.arm.neon.vpaddlu.v1i64.v2i32(<2 x i32>) nounwind readnone + +declare <8 x i16> @llvm.arm.neon.vpaddls.v8i16.v16i8(<16 x i8>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vpaddls.v4i32.v8i16(<8 x i16>) nounwind readnone +declare <2 x i64> @llvm.arm.neon.vpaddls.v2i64.v4i32(<4 x i32>) nounwind readnone + +declare <8 x i16> @llvm.arm.neon.vpaddlu.v8i16.v16i8(<16 x i8>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16>) nounwind readnone +declare <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32>) nounwind readnone diff --git a/test/CodeGen/ARM/vpaddl.ll b/test/CodeGen/ARM/vpaddl.ll deleted file mode 100644 index 44c4e30..0000000 --- a/test/CodeGen/ARM/vpaddl.ll +++ /dev/null @@ -1,113 +0,0 @@ -; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s - -define <4 x i16> @vpaddls8(<8 x i8>* %A) nounwind { -;CHECK: vpaddls8: -;CHECK: vpaddl.s8 - %tmp1 = load <8 x i8>* %A - %tmp2 = call <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8> %tmp1) - ret <4 x i16> %tmp2 -} - -define <2 x i32> @vpaddls16(<4 x i16>* %A) nounwind { -;CHECK: vpaddls16: -;CHECK: vpaddl.s16 - %tmp1 = load <4 x i16>* %A - %tmp2 = call <2 x i32> @llvm.arm.neon.vpaddls.v2i32.v4i16(<4 x i16> %tmp1) - ret <2 x i32> %tmp2 -} - -define <1 x i64> @vpaddls32(<2 x i32>* %A) nounwind { -;CHECK: vpaddls32: -;CHECK: vpaddl.s32 - %tmp1 = load <2 x i32>* %A - %tmp2 = call <1 x i64> @llvm.arm.neon.vpaddls.v1i64.v2i32(<2 x i32> %tmp1) - ret <1 x i64> %tmp2 -} - -define <4 x i16> @vpaddlu8(<8 x i8>* %A) nounwind { -;CHECK: vpaddlu8: -;CHECK: vpaddl.u8 - %tmp1 = load <8 x i8>* %A - %tmp2 = call <4 x i16> @llvm.arm.neon.vpaddlu.v4i16.v8i8(<8 x i8> %tmp1) - ret <4 x i16> %tmp2 -} - -define <2 x i32> @vpaddlu16(<4 x i16>* %A) nounwind { -;CHECK: vpaddlu16: -;CHECK: vpaddl.u16 - %tmp1 = load <4 x i16>* %A - %tmp2 = call <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16> %tmp1) - ret <2 x i32> %tmp2 -} - -define <1 x i64> @vpaddlu32(<2 x i32>* %A) nounwind { -;CHECK: vpaddlu32: -;CHECK: vpaddl.u32 - %tmp1 = load <2 x i32>* %A - %tmp2 = call <1 x i64> @llvm.arm.neon.vpaddlu.v1i64.v2i32(<2 x i32> %tmp1) - ret <1 x i64> %tmp2 -} - -define <8 x i16> @vpaddlQs8(<16 x i8>* %A) nounwind { -;CHECK: vpaddlQs8: -;CHECK: vpaddl.s8 - %tmp1 = load <16 x i8>* %A - %tmp2 = call <8 x i16> @llvm.arm.neon.vpaddls.v8i16.v16i8(<16 x i8> %tmp1) - ret <8 x i16> %tmp2 -} - -define <4 x i32> @vpaddlQs16(<8 x i16>* %A) nounwind { -;CHECK: vpaddlQs16: -;CHECK: vpaddl.s16 - %tmp1 = load <8 x i16>* %A - %tmp2 = call <4 x i32> @llvm.arm.neon.vpaddls.v4i32.v8i16(<8 x i16> %tmp1) - ret <4 x i32> %tmp2 -} - -define <2 x i64> @vpaddlQs32(<4 x i32>* %A) nounwind { -;CHECK: vpaddlQs32: -;CHECK: vpaddl.s32 - %tmp1 = load <4 x i32>* %A - %tmp2 = call <2 x i64> @llvm.arm.neon.vpaddls.v2i64.v4i32(<4 x i32> %tmp1) - ret <2 x i64> %tmp2 -} - -define <8 x i16> @vpaddlQu8(<16 x i8>* %A) nounwind { -;CHECK: vpaddlQu8: -;CHECK: vpaddl.u8 - %tmp1 = load <16 x i8>* %A - %tmp2 = call <8 x i16> @llvm.arm.neon.vpaddlu.v8i16.v16i8(<16 x i8> %tmp1) - ret <8 x i16> %tmp2 -} - -define <4 x i32> @vpaddlQu16(<8 x i16>* %A) nounwind { -;CHECK: vpaddlQu16: -;CHECK: vpaddl.u16 - %tmp1 = load <8 x i16>* %A - %tmp2 = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> %tmp1) - ret <4 x i32> %tmp2 -} - -define <2 x i64> @vpaddlQu32(<4 x i32>* %A) nounwind { -;CHECK: vpaddlQu32: -;CHECK: vpaddl.u32 - %tmp1 = load <4 x i32>* %A - %tmp2 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %tmp1) - ret <2 x i64> %tmp2 -} - -declare <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8>) nounwind readnone -declare <2 x i32> @llvm.arm.neon.vpaddls.v2i32.v4i16(<4 x i16>) nounwind readnone -declare <1 x i64> @llvm.arm.neon.vpaddls.v1i64.v2i32(<2 x i32>) nounwind readnone - -declare <4 x i16> @llvm.arm.neon.vpaddlu.v4i16.v8i8(<8 x i8>) nounwind readnone -declare <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16>) nounwind readnone -declare <1 x i64> @llvm.arm.neon.vpaddlu.v1i64.v2i32(<2 x i32>) nounwind readnone - -declare <8 x i16> @llvm.arm.neon.vpaddls.v8i16.v16i8(<16 x i8>) nounwind readnone -declare <4 x i32> @llvm.arm.neon.vpaddls.v4i32.v8i16(<8 x i16>) nounwind readnone -declare <2 x i64> @llvm.arm.neon.vpaddls.v2i64.v4i32(<4 x i32>) nounwind readnone - -declare <8 x i16> @llvm.arm.neon.vpaddlu.v8i16.v16i8(<16 x i8>) nounwind readnone -declare <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16>) nounwind readnone -declare <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32>) nounwind readnone diff --git a/test/CodeGen/ARM/vpmax.ll b/test/CodeGen/ARM/vpmax.ll deleted file mode 100644 index b2a4a6e..0000000 --- a/test/CodeGen/ARM/vpmax.ll +++ /dev/null @@ -1,74 +0,0 @@ -; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s - -define <8 x i8> @vpmaxs8(<8 x i8>* %A, <8 x i8>* %B) nounwind { -;CHECK: vpmaxs8: -;CHECK: vpmax.s8 - %tmp1 = load <8 x i8>* %A - %tmp2 = load <8 x i8>* %B - %tmp3 = call <8 x i8> @llvm.arm.neon.vpmaxs.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) - ret <8 x i8> %tmp3 -} - -define <4 x i16> @vpmaxs16(<4 x i16>* %A, <4 x i16>* %B) nounwind { -;CHECK: vpmaxs16: -;CHECK: vpmax.s16 - %tmp1 = load <4 x i16>* %A - %tmp2 = load <4 x i16>* %B - %tmp3 = call <4 x i16> @llvm.arm.neon.vpmaxs.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) - ret <4 x i16> %tmp3 -} - -define <2 x i32> @vpmaxs32(<2 x i32>* %A, <2 x i32>* %B) nounwind { -;CHECK: vpmaxs32: -;CHECK: vpmax.s32 - %tmp1 = load <2 x i32>* %A - %tmp2 = load <2 x i32>* %B - %tmp3 = call <2 x i32> @llvm.arm.neon.vpmaxs.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) - ret <2 x i32> %tmp3 -} - -define <8 x i8> @vpmaxu8(<8 x i8>* %A, <8 x i8>* %B) nounwind { -;CHECK: vpmaxu8: -;CHECK: vpmax.u8 - %tmp1 = load <8 x i8>* %A - %tmp2 = load <8 x i8>* %B - %tmp3 = call <8 x i8> @llvm.arm.neon.vpmaxu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) - ret <8 x i8> %tmp3 -} - -define <4 x i16> @vpmaxu16(<4 x i16>* %A, <4 x i16>* %B) nounwind { -;CHECK: vpmaxu16: -;CHECK: vpmax.u16 - %tmp1 = load <4 x i16>* %A - %tmp2 = load <4 x i16>* %B - %tmp3 = call <4 x i16> @llvm.arm.neon.vpmaxu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) - ret <4 x i16> %tmp3 -} - -define <2 x i32> @vpmaxu32(<2 x i32>* %A, <2 x i32>* %B) nounwind { -;CHECK: vpmaxu32: -;CHECK: vpmax.u32 - %tmp1 = load <2 x i32>* %A - %tmp2 = load <2 x i32>* %B - %tmp3 = call <2 x i32> @llvm.arm.neon.vpmaxu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) - ret <2 x i32> %tmp3 -} - -define <2 x float> @vpmaxf32(<2 x float>* %A, <2 x float>* %B) nounwind { -;CHECK: vpmaxf32: -;CHECK: vpmax.f32 - %tmp1 = load <2 x float>* %A - %tmp2 = load <2 x float>* %B - %tmp3 = call <2 x float> @llvm.arm.neon.vpmaxs.v2f32(<2 x float> %tmp1, <2 x float> %tmp2) - ret <2 x float> %tmp3 -} - -declare <8 x i8> @llvm.arm.neon.vpmaxs.v8i8(<8 x i8>, <8 x i8>) nounwind readnone -declare <4 x i16> @llvm.arm.neon.vpmaxs.v4i16(<4 x i16>, <4 x i16>) nounwind readnone -declare <2 x i32> @llvm.arm.neon.vpmaxs.v2i32(<2 x i32>, <2 x i32>) nounwind readnone - -declare <8 x i8> @llvm.arm.neon.vpmaxu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone -declare <4 x i16> @llvm.arm.neon.vpmaxu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone -declare <2 x i32> @llvm.arm.neon.vpmaxu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone - -declare <2 x float> @llvm.arm.neon.vpmaxs.v2f32(<2 x float>, <2 x float>) nounwind readnone diff --git a/test/CodeGen/ARM/vpmin.ll b/test/CodeGen/ARM/vpminmax.ll index d27043f..b75bcc9 100644 --- a/test/CodeGen/ARM/vpmin.ll +++ b/test/CodeGen/ARM/vpminmax.ll @@ -72,3 +72,76 @@ declare <4 x i16> @llvm.arm.neon.vpminu.v4i16(<4 x i16>, <4 x i16>) nounwind rea declare <2 x i32> @llvm.arm.neon.vpminu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone declare <2 x float> @llvm.arm.neon.vpmins.v2f32(<2 x float>, <2 x float>) nounwind readnone + +define <8 x i8> @vpmaxs8(<8 x i8>* %A, <8 x i8>* %B) nounwind { +;CHECK: vpmaxs8: +;CHECK: vpmax.s8 + %tmp1 = load <8 x i8>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = call <8 x i8> @llvm.arm.neon.vpmaxs.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) + ret <8 x i8> %tmp3 +} + +define <4 x i16> @vpmaxs16(<4 x i16>* %A, <4 x i16>* %B) nounwind { +;CHECK: vpmaxs16: +;CHECK: vpmax.s16 + %tmp1 = load <4 x i16>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = call <4 x i16> @llvm.arm.neon.vpmaxs.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) + ret <4 x i16> %tmp3 +} + +define <2 x i32> @vpmaxs32(<2 x i32>* %A, <2 x i32>* %B) nounwind { +;CHECK: vpmaxs32: +;CHECK: vpmax.s32 + %tmp1 = load <2 x i32>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = call <2 x i32> @llvm.arm.neon.vpmaxs.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) + ret <2 x i32> %tmp3 +} + +define <8 x i8> @vpmaxu8(<8 x i8>* %A, <8 x i8>* %B) nounwind { +;CHECK: vpmaxu8: +;CHECK: vpmax.u8 + %tmp1 = load <8 x i8>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = call <8 x i8> @llvm.arm.neon.vpmaxu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) + ret <8 x i8> %tmp3 +} + +define <4 x i16> @vpmaxu16(<4 x i16>* %A, <4 x i16>* %B) nounwind { +;CHECK: vpmaxu16: +;CHECK: vpmax.u16 + %tmp1 = load <4 x i16>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = call <4 x i16> @llvm.arm.neon.vpmaxu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) + ret <4 x i16> %tmp3 +} + +define <2 x i32> @vpmaxu32(<2 x i32>* %A, <2 x i32>* %B) nounwind { +;CHECK: vpmaxu32: +;CHECK: vpmax.u32 + %tmp1 = load <2 x i32>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = call <2 x i32> @llvm.arm.neon.vpmaxu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) + ret <2 x i32> %tmp3 +} + +define <2 x float> @vpmaxf32(<2 x float>* %A, <2 x float>* %B) nounwind { +;CHECK: vpmaxf32: +;CHECK: vpmax.f32 + %tmp1 = load <2 x float>* %A + %tmp2 = load <2 x float>* %B + %tmp3 = call <2 x float> @llvm.arm.neon.vpmaxs.v2f32(<2 x float> %tmp1, <2 x float> %tmp2) + ret <2 x float> %tmp3 +} + +declare <8 x i8> @llvm.arm.neon.vpmaxs.v8i8(<8 x i8>, <8 x i8>) nounwind readnone +declare <4 x i16> @llvm.arm.neon.vpmaxs.v4i16(<4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vpmaxs.v2i32(<2 x i32>, <2 x i32>) nounwind readnone + +declare <8 x i8> @llvm.arm.neon.vpmaxu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone +declare <4 x i16> @llvm.arm.neon.vpmaxu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vpmaxu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone + +declare <2 x float> @llvm.arm.neon.vpmaxs.v2f32(<2 x float>, <2 x float>) nounwind readnone diff --git a/test/CodeGen/ARM/vqRdmulh_lane.ll b/test/CodeGen/ARM/vqRdmulh_lane.ll deleted file mode 100644 index f308c52..0000000 --- a/test/CodeGen/ARM/vqRdmulh_lane.ll +++ /dev/null @@ -1,47 +0,0 @@ -; RUN: llc -mattr=+neon < %s | FileCheck %s -target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:64:64-v128:128:128-a0:0:32" -target triple = "thumbv7-elf" - -define arm_aapcs_vfpcc <8 x i16> @test_vqRdmulhQ_lanes16(<8 x i16> %arg0_int16x8_t, <4 x i16> %arg1_int16x4_t) nounwind readnone { -entry: -; CHECK: test_vqRdmulhQ_lanes16 -; CHECK: vqrdmulh.s16 q0, q0, d2[1] - %0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> ; <<8 x i16>> [#uses=1] - %1 = tail call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %arg0_int16x8_t, <8 x i16> %0) ; <<8 x i16>> [#uses=1] - ret <8 x i16> %1 -} - -declare <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16>, <8 x i16>) nounwind readnone - -define arm_aapcs_vfpcc <4 x i32> @test_vqRdmulhQ_lanes32(<4 x i32> %arg0_int32x4_t, <2 x i32> %arg1_int32x2_t) nounwind readnone { -entry: -; CHECK: test_vqRdmulhQ_lanes32 -; CHECK: vqrdmulh.s32 q0, q0, d2[1] - %0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i32>> [#uses=1] - %1 = tail call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i32> %0) ; <<4 x i32>> [#uses=1] - ret <4 x i32> %1 -} - -declare <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32>, <4 x i32>) nounwind readnone - -define arm_aapcs_vfpcc <4 x i16> @test_vqRdmulh_lanes16(<4 x i16> %arg0_int16x4_t, <4 x i16> %arg1_int16x4_t) nounwind readnone { -entry: -; CHECK: test_vqRdmulh_lanes16 -; CHECK: vqrdmulh.s16 d0, d0, d1[1] - %0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1] - %1 = tail call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %arg0_int16x4_t, <4 x i16> %0) ; <<4 x i16>> [#uses=1] - ret <4 x i16> %1 -} - -declare <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16>, <4 x i16>) nounwind readnone - -define arm_aapcs_vfpcc <2 x i32> @test_vqRdmulh_lanes32(<2 x i32> %arg0_int32x2_t, <2 x i32> %arg1_int32x2_t) nounwind readnone { -entry: -; CHECK: test_vqRdmulh_lanes32 -; CHECK: vqrdmulh.s32 d0, d0, d1[1] - %0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1] - %1 = tail call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %arg0_int32x2_t, <2 x i32> %0) ; <<2 x i32>> [#uses=1] - ret <2 x i32> %1 -} - -declare <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32>, <2 x i32>) nounwind readnone diff --git a/test/CodeGen/ARM/vqabs.ll b/test/CodeGen/ARM/vqabs.ll deleted file mode 100644 index d923557..0000000 --- a/test/CodeGen/ARM/vqabs.ll +++ /dev/null @@ -1,57 +0,0 @@ -; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s - -define <8 x i8> @vqabss8(<8 x i8>* %A) nounwind { -;CHECK: vqabss8: -;CHECK: vqabs.s8 - %tmp1 = load <8 x i8>* %A - %tmp2 = call <8 x i8> @llvm.arm.neon.vqabs.v8i8(<8 x i8> %tmp1) - ret <8 x i8> %tmp2 -} - -define <4 x i16> @vqabss16(<4 x i16>* %A) nounwind { -;CHECK: vqabss16: -;CHECK: vqabs.s16 - %tmp1 = load <4 x i16>* %A - %tmp2 = call <4 x i16> @llvm.arm.neon.vqabs.v4i16(<4 x i16> %tmp1) - ret <4 x i16> %tmp2 -} - -define <2 x i32> @vqabss32(<2 x i32>* %A) nounwind { -;CHECK: vqabss32: -;CHECK: vqabs.s32 - %tmp1 = load <2 x i32>* %A - %tmp2 = call <2 x i32> @llvm.arm.neon.vqabs.v2i32(<2 x i32> %tmp1) - ret <2 x i32> %tmp2 -} - -define <16 x i8> @vqabsQs8(<16 x i8>* %A) nounwind { -;CHECK: vqabsQs8: -;CHECK: vqabs.s8 - %tmp1 = load <16 x i8>* %A - %tmp2 = call <16 x i8> @llvm.arm.neon.vqabs.v16i8(<16 x i8> %tmp1) - ret <16 x i8> %tmp2 -} - -define <8 x i16> @vqabsQs16(<8 x i16>* %A) nounwind { -;CHECK: vqabsQs16: -;CHECK: vqabs.s16 - %tmp1 = load <8 x i16>* %A - %tmp2 = call <8 x i16> @llvm.arm.neon.vqabs.v8i16(<8 x i16> %tmp1) - ret <8 x i16> %tmp2 -} - -define <4 x i32> @vqabsQs32(<4 x i32>* %A) nounwind { -;CHECK: vqabsQs32: -;CHECK: vqabs.s32 - %tmp1 = load <4 x i32>* %A - %tmp2 = call <4 x i32> @llvm.arm.neon.vqabs.v4i32(<4 x i32> %tmp1) - ret <4 x i32> %tmp2 -} - -declare <8 x i8> @llvm.arm.neon.vqabs.v8i8(<8 x i8>) nounwind readnone -declare <4 x i16> @llvm.arm.neon.vqabs.v4i16(<4 x i16>) nounwind readnone -declare <2 x i32> @llvm.arm.neon.vqabs.v2i32(<2 x i32>) nounwind readnone - -declare <16 x i8> @llvm.arm.neon.vqabs.v16i8(<16 x i8>) nounwind readnone -declare <8 x i16> @llvm.arm.neon.vqabs.v8i16(<8 x i16>) nounwind readnone -declare <4 x i32> @llvm.arm.neon.vqabs.v4i32(<4 x i32>) nounwind readnone diff --git a/test/CodeGen/ARM/vqdmlal.ll b/test/CodeGen/ARM/vqdmlal.ll deleted file mode 100644 index 3e4b7f5..0000000 --- a/test/CodeGen/ARM/vqdmlal.ll +++ /dev/null @@ -1,24 +0,0 @@ -; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s - -define <4 x i32> @vqdmlals16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind { -;CHECK: vqdmlals16: -;CHECK: vqdmlal.s16 - %tmp1 = load <4 x i32>* %A - %tmp2 = load <4 x i16>* %B - %tmp3 = load <4 x i16>* %C - %tmp4 = call <4 x i32> @llvm.arm.neon.vqdmlal.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3) - ret <4 x i32> %tmp4 -} - -define <2 x i64> @vqdmlals32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind { -;CHECK: vqdmlals32: -;CHECK: vqdmlal.s32 - %tmp1 = load <2 x i64>* %A - %tmp2 = load <2 x i32>* %B - %tmp3 = load <2 x i32>* %C - %tmp4 = call <2 x i64> @llvm.arm.neon.vqdmlal.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3) - ret <2 x i64> %tmp4 -} - -declare <4 x i32> @llvm.arm.neon.vqdmlal.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone -declare <2 x i64> @llvm.arm.neon.vqdmlal.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone diff --git a/test/CodeGen/ARM/vqdmlal_lanes.ll b/test/CodeGen/ARM/vqdmlal_lanes.ll deleted file mode 100644 index ff532f3..0000000 --- a/test/CodeGen/ARM/vqdmlal_lanes.ll +++ /dev/null @@ -1,25 +0,0 @@ -; RUN: llc -mattr=+neon < %s | FileCheck %s -target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:64:64-v128:128:128-a0:0:32" -target triple = "thumbv7-elf" - -define arm_aapcs_vfpcc <4 x i32> @test_vqdmlal_lanes16(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %arg2_int16x4_t) nounwind readnone { -entry: -; CHECK: test_vqdmlal_lanes16 -; CHECK: vqdmlal.s16 q0, d2, d3[1] - %0 = shufflevector <4 x i16> %arg2_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1] - %1 = tail call <4 x i32> @llvm.arm.neon.vqdmlal.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %0) ; <<4 x i32>> [#uses=1] - ret <4 x i32> %1 -} - -declare <4 x i32> @llvm.arm.neon.vqdmlal.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone - -define arm_aapcs_vfpcc <2 x i64> @test_vqdmlal_lanes32(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %arg2_int32x2_t) nounwind readnone { -entry: -; CHECK: test_vqdmlal_lanes32 -; CHECK: vqdmlal.s32 q0, d2, d3[1] - %0 = shufflevector <2 x i32> %arg2_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1] - %1 = tail call <2 x i64> @llvm.arm.neon.vqdmlal.v2i64(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %0) ; <<2 x i64>> [#uses=1] - ret <2 x i64> %1 -} - -declare <2 x i64> @llvm.arm.neon.vqdmlal.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone diff --git a/test/CodeGen/ARM/vqdmlsl.ll b/test/CodeGen/ARM/vqdmlsl.ll deleted file mode 100644 index ab3c51f..0000000 --- a/test/CodeGen/ARM/vqdmlsl.ll +++ /dev/null @@ -1,24 +0,0 @@ -; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s - -define <4 x i32> @vqdmlsls16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind { -;CHECK: vqdmlsls16: -;CHECK: vqdmlsl.s16 - %tmp1 = load <4 x i32>* %A - %tmp2 = load <4 x i16>* %B - %tmp3 = load <4 x i16>* %C - %tmp4 = call <4 x i32> @llvm.arm.neon.vqdmlsl.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3) - ret <4 x i32> %tmp4 -} - -define <2 x i64> @vqdmlsls32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind { -;CHECK: vqdmlsls32: -;CHECK: vqdmlsl.s32 - %tmp1 = load <2 x i64>* %A - %tmp2 = load <2 x i32>* %B - %tmp3 = load <2 x i32>* %C - %tmp4 = call <2 x i64> @llvm.arm.neon.vqdmlsl.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3) - ret <2 x i64> %tmp4 -} - -declare <4 x i32> @llvm.arm.neon.vqdmlsl.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone -declare <2 x i64> @llvm.arm.neon.vqdmlsl.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone diff --git a/test/CodeGen/ARM/vqdmlsl_lanes.ll b/test/CodeGen/ARM/vqdmlsl_lanes.ll deleted file mode 100644 index 1a834ff..0000000 --- a/test/CodeGen/ARM/vqdmlsl_lanes.ll +++ /dev/null @@ -1,25 +0,0 @@ -; RUN: llc -mattr=+neon < %s | FileCheck %s -target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:64:64-v128:128:128-a0:0:32" -target triple = "thumbv7-elf" - -define arm_aapcs_vfpcc <4 x i32> @test_vqdmlsl_lanes16(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %arg2_int16x4_t) nounwind readnone { -entry: -; CHECK: test_vqdmlsl_lanes16 -; CHECK: vqdmlsl.s16 q0, d2, d3[1] - %0 = shufflevector <4 x i16> %arg2_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1] - %1 = tail call <4 x i32> @llvm.arm.neon.vqdmlsl.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %0) ; <<4 x i32>> [#uses=1] - ret <4 x i32> %1 -} - -declare <4 x i32> @llvm.arm.neon.vqdmlsl.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone - -define arm_aapcs_vfpcc <2 x i64> @test_vqdmlsl_lanes32(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %arg2_int32x2_t) nounwind readnone { -entry: -; CHECK: test_vqdmlsl_lanes32 -; CHECK: vqdmlsl.s32 q0, d2, d3[1] - %0 = shufflevector <2 x i32> %arg2_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1] - %1 = tail call <2 x i64> @llvm.arm.neon.vqdmlsl.v2i64(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %0) ; <<2 x i64>> [#uses=1] - ret <2 x i64> %1 -} - -declare <2 x i64> @llvm.arm.neon.vqdmlsl.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone diff --git a/test/CodeGen/ARM/vqdmul.ll b/test/CodeGen/ARM/vqdmul.ll new file mode 100644 index 0000000..8dcc7f7 --- /dev/null +++ b/test/CodeGen/ARM/vqdmul.ll @@ -0,0 +1,281 @@ +; RUN: llc -mattr=+neon < %s | FileCheck %s +target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:64:64-v128:128:128-a0:0:32" +target triple = "thumbv7-elf" + +define <4 x i16> @vqdmulhs16(<4 x i16>* %A, <4 x i16>* %B) nounwind { +;CHECK: vqdmulhs16: +;CHECK: vqdmulh.s16 + %tmp1 = load <4 x i16>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) + ret <4 x i16> %tmp3 +} + +define <2 x i32> @vqdmulhs32(<2 x i32>* %A, <2 x i32>* %B) nounwind { +;CHECK: vqdmulhs32: +;CHECK: vqdmulh.s32 + %tmp1 = load <2 x i32>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) + ret <2 x i32> %tmp3 +} + +define <8 x i16> @vqdmulhQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind { +;CHECK: vqdmulhQs16: +;CHECK: vqdmulh.s16 + %tmp1 = load <8 x i16>* %A + %tmp2 = load <8 x i16>* %B + %tmp3 = call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) + ret <8 x i16> %tmp3 +} + +define <4 x i32> @vqdmulhQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind { +;CHECK: vqdmulhQs32: +;CHECK: vqdmulh.s32 + %tmp1 = load <4 x i32>* %A + %tmp2 = load <4 x i32>* %B + %tmp3 = call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) + ret <4 x i32> %tmp3 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vqdmulhQ_lanes16(<8 x i16> %arg0_int16x8_t, <4 x i16> %arg1_int16x4_t) nounwind readnone { +entry: +; CHECK: test_vqdmulhQ_lanes16 +; CHECK: vqdmulh.s16 q0, q0, d2[1] + %0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> ; <<8 x i16>> [#uses=1] + %1 = tail call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> %arg0_int16x8_t, <8 x i16> %0) ; <<8 x i16>> [#uses=1] + ret <8 x i16> %1 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vqdmulhQ_lanes32(<4 x i32> %arg0_int32x4_t, <2 x i32> %arg1_int32x2_t) nounwind readnone { +entry: +; CHECK: test_vqdmulhQ_lanes32 +; CHECK: vqdmulh.s32 q0, q0, d2[1] + %0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i32>> [#uses=1] + %1 = tail call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i32> %0) ; <<4 x i32>> [#uses=1] + ret <4 x i32> %1 +} + +define arm_aapcs_vfpcc <4 x i16> @test_vqdmulh_lanes16(<4 x i16> %arg0_int16x4_t, <4 x i16> %arg1_int16x4_t) nounwind readnone { +entry: +; CHECK: test_vqdmulh_lanes16 +; CHECK: vqdmulh.s16 d0, d0, d1[1] + %0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1] + %1 = tail call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> %arg0_int16x4_t, <4 x i16> %0) ; <<4 x i16>> [#uses=1] + ret <4 x i16> %1 +} + +define arm_aapcs_vfpcc <2 x i32> @test_vqdmulh_lanes32(<2 x i32> %arg0_int32x2_t, <2 x i32> %arg1_int32x2_t) nounwind readnone { +entry: +; CHECK: test_vqdmulh_lanes32 +; CHECK: vqdmulh.s32 d0, d0, d1[1] + %0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1] + %1 = tail call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> %arg0_int32x2_t, <2 x i32> %0) ; <<2 x i32>> [#uses=1] + ret <2 x i32> %1 +} + +declare <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32>, <2 x i32>) nounwind readnone + +declare <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16>, <8 x i16>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32>, <4 x i32>) nounwind readnone + +define <4 x i16> @vqrdmulhs16(<4 x i16>* %A, <4 x i16>* %B) nounwind { +;CHECK: vqrdmulhs16: +;CHECK: vqrdmulh.s16 + %tmp1 = load <4 x i16>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) + ret <4 x i16> %tmp3 +} + +define <2 x i32> @vqrdmulhs32(<2 x i32>* %A, <2 x i32>* %B) nounwind { +;CHECK: vqrdmulhs32: +;CHECK: vqrdmulh.s32 + %tmp1 = load <2 x i32>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) + ret <2 x i32> %tmp3 +} + +define <8 x i16> @vqrdmulhQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind { +;CHECK: vqrdmulhQs16: +;CHECK: vqrdmulh.s16 + %tmp1 = load <8 x i16>* %A + %tmp2 = load <8 x i16>* %B + %tmp3 = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) + ret <8 x i16> %tmp3 +} + +define <4 x i32> @vqrdmulhQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind { +;CHECK: vqrdmulhQs32: +;CHECK: vqrdmulh.s32 + %tmp1 = load <4 x i32>* %A + %tmp2 = load <4 x i32>* %B + %tmp3 = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) + ret <4 x i32> %tmp3 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vqRdmulhQ_lanes16(<8 x i16> %arg0_int16x8_t, <4 x i16> %arg1_int16x4_t) nounwind readnone { +entry: +; CHECK: test_vqRdmulhQ_lanes16 +; CHECK: vqrdmulh.s16 q0, q0, d2[1] + %0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> ; <<8 x i16>> [#uses=1] + %1 = tail call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %arg0_int16x8_t, <8 x i16> %0) ; <<8 x i16>> [#uses=1] + ret <8 x i16> %1 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vqRdmulhQ_lanes32(<4 x i32> %arg0_int32x4_t, <2 x i32> %arg1_int32x2_t) nounwind readnone { +entry: +; CHECK: test_vqRdmulhQ_lanes32 +; CHECK: vqrdmulh.s32 q0, q0, d2[1] + %0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i32>> [#uses=1] + %1 = tail call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i32> %0) ; <<4 x i32>> [#uses=1] + ret <4 x i32> %1 +} + +define arm_aapcs_vfpcc <4 x i16> @test_vqRdmulh_lanes16(<4 x i16> %arg0_int16x4_t, <4 x i16> %arg1_int16x4_t) nounwind readnone { +entry: +; CHECK: test_vqRdmulh_lanes16 +; CHECK: vqrdmulh.s16 d0, d0, d1[1] + %0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1] + %1 = tail call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %arg0_int16x4_t, <4 x i16> %0) ; <<4 x i16>> [#uses=1] + ret <4 x i16> %1 +} + +define arm_aapcs_vfpcc <2 x i32> @test_vqRdmulh_lanes32(<2 x i32> %arg0_int32x2_t, <2 x i32> %arg1_int32x2_t) nounwind readnone { +entry: +; CHECK: test_vqRdmulh_lanes32 +; CHECK: vqrdmulh.s32 d0, d0, d1[1] + %0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1] + %1 = tail call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %arg0_int32x2_t, <2 x i32> %0) ; <<2 x i32>> [#uses=1] + ret <2 x i32> %1 +} + +declare <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32>, <2 x i32>) nounwind readnone +declare <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32>, <2 x i32>) nounwind readnone + +declare <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16>, <8 x i16>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32>, <4 x i32>) nounwind readnone + +define <4 x i32> @vqdmulls16(<4 x i16>* %A, <4 x i16>* %B) nounwind { +;CHECK: vqdmulls16: +;CHECK: vqdmull.s16 + %tmp1 = load <4 x i16>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2) + ret <4 x i32> %tmp3 +} + +define <2 x i64> @vqdmulls32(<2 x i32>* %A, <2 x i32>* %B) nounwind { +;CHECK: vqdmulls32: +;CHECK: vqdmull.s32 + %tmp1 = load <2 x i32>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2) + ret <2 x i64> %tmp3 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vqdmull_lanes16(<4 x i16> %arg0_int16x4_t, <4 x i16> %arg1_int16x4_t) nounwind readnone { +entry: +; CHECK: test_vqdmull_lanes16 +; CHECK: vqdmull.s16 q0, d0, d1[1] + %0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1] + %1 = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %arg0_int16x4_t, <4 x i16> %0) ; <<4 x i32>> [#uses=1] + ret <4 x i32> %1 +} + +define arm_aapcs_vfpcc <2 x i64> @test_vqdmull_lanes32(<2 x i32> %arg0_int32x2_t, <2 x i32> %arg1_int32x2_t) nounwind readnone { +entry: +; CHECK: test_vqdmull_lanes32 +; CHECK: vqdmull.s32 q0, d0, d1[1] + %0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1] + %1 = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %arg0_int32x2_t, <2 x i32> %0) ; <<2 x i64>> [#uses=1] + ret <2 x i64> %1 +} + +declare <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32>, <2 x i32>) nounwind readnone + +define <4 x i32> @vqdmlals16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind { +;CHECK: vqdmlals16: +;CHECK: vqdmlal.s16 + %tmp1 = load <4 x i32>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = load <4 x i16>* %C + %tmp4 = call <4 x i32> @llvm.arm.neon.vqdmlal.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3) + ret <4 x i32> %tmp4 +} + +define <2 x i64> @vqdmlals32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind { +;CHECK: vqdmlals32: +;CHECK: vqdmlal.s32 + %tmp1 = load <2 x i64>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = load <2 x i32>* %C + %tmp4 = call <2 x i64> @llvm.arm.neon.vqdmlal.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3) + ret <2 x i64> %tmp4 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vqdmlal_lanes16(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %arg2_int16x4_t) nounwind readnone { +entry: +; CHECK: test_vqdmlal_lanes16 +; CHECK: vqdmlal.s16 q0, d2, d3[1] + %0 = shufflevector <4 x i16> %arg2_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1] + %1 = tail call <4 x i32> @llvm.arm.neon.vqdmlal.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %0) ; <<4 x i32>> [#uses=1] + ret <4 x i32> %1 +} + +define arm_aapcs_vfpcc <2 x i64> @test_vqdmlal_lanes32(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %arg2_int32x2_t) nounwind readnone { +entry: +; CHECK: test_vqdmlal_lanes32 +; CHECK: vqdmlal.s32 q0, d2, d3[1] + %0 = shufflevector <2 x i32> %arg2_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1] + %1 = tail call <2 x i64> @llvm.arm.neon.vqdmlal.v2i64(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %0) ; <<2 x i64>> [#uses=1] + ret <2 x i64> %1 +} + +declare <4 x i32> @llvm.arm.neon.vqdmlal.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i64> @llvm.arm.neon.vqdmlal.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone + +define <4 x i32> @vqdmlsls16(<4 x i32>* %A, <4 x i16>* %B, <4 x i16>* %C) nounwind { +;CHECK: vqdmlsls16: +;CHECK: vqdmlsl.s16 + %tmp1 = load <4 x i32>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = load <4 x i16>* %C + %tmp4 = call <4 x i32> @llvm.arm.neon.vqdmlsl.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2, <4 x i16> %tmp3) + ret <4 x i32> %tmp4 +} + +define <2 x i64> @vqdmlsls32(<2 x i64>* %A, <2 x i32>* %B, <2 x i32>* %C) nounwind { +;CHECK: vqdmlsls32: +;CHECK: vqdmlsl.s32 + %tmp1 = load <2 x i64>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = load <2 x i32>* %C + %tmp4 = call <2 x i64> @llvm.arm.neon.vqdmlsl.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2, <2 x i32> %tmp3) + ret <2 x i64> %tmp4 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vqdmlsl_lanes16(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %arg2_int16x4_t) nounwind readnone { +entry: +; CHECK: test_vqdmlsl_lanes16 +; CHECK: vqdmlsl.s16 q0, d2, d3[1] + %0 = shufflevector <4 x i16> %arg2_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1] + %1 = tail call <4 x i32> @llvm.arm.neon.vqdmlsl.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i16> %arg1_int16x4_t, <4 x i16> %0) ; <<4 x i32>> [#uses=1] + ret <4 x i32> %1 +} + +define arm_aapcs_vfpcc <2 x i64> @test_vqdmlsl_lanes32(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %arg2_int32x2_t) nounwind readnone { +entry: +; CHECK: test_vqdmlsl_lanes32 +; CHECK: vqdmlsl.s32 q0, d2, d3[1] + %0 = shufflevector <2 x i32> %arg2_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1] + %1 = tail call <2 x i64> @llvm.arm.neon.vqdmlsl.v2i64(<2 x i64> %arg0_int64x2_t, <2 x i32> %arg1_int32x2_t, <2 x i32> %0) ; <<2 x i64>> [#uses=1] + ret <2 x i64> %1 +} + +declare <4 x i32> @llvm.arm.neon.vqdmlsl.v4i32(<4 x i32>, <4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i64> @llvm.arm.neon.vqdmlsl.v2i64(<2 x i64>, <2 x i32>, <2 x i32>) nounwind readnone diff --git a/test/CodeGen/ARM/vqdmulh.ll b/test/CodeGen/ARM/vqdmulh.ll deleted file mode 100644 index c9e6dd9..0000000 --- a/test/CodeGen/ARM/vqdmulh.ll +++ /dev/null @@ -1,85 +0,0 @@ -; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s - -define <4 x i16> @vqdmulhs16(<4 x i16>* %A, <4 x i16>* %B) nounwind { -;CHECK: vqdmulhs16: -;CHECK: vqdmulh.s16 - %tmp1 = load <4 x i16>* %A - %tmp2 = load <4 x i16>* %B - %tmp3 = call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) - ret <4 x i16> %tmp3 -} - -define <2 x i32> @vqdmulhs32(<2 x i32>* %A, <2 x i32>* %B) nounwind { -;CHECK: vqdmulhs32: -;CHECK: vqdmulh.s32 - %tmp1 = load <2 x i32>* %A - %tmp2 = load <2 x i32>* %B - %tmp3 = call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) - ret <2 x i32> %tmp3 -} - -define <8 x i16> @vqdmulhQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind { -;CHECK: vqdmulhQs16: -;CHECK: vqdmulh.s16 - %tmp1 = load <8 x i16>* %A - %tmp2 = load <8 x i16>* %B - %tmp3 = call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) - ret <8 x i16> %tmp3 -} - -define <4 x i32> @vqdmulhQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind { -;CHECK: vqdmulhQs32: -;CHECK: vqdmulh.s32 - %tmp1 = load <4 x i32>* %A - %tmp2 = load <4 x i32>* %B - %tmp3 = call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) - ret <4 x i32> %tmp3 -} - -declare <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16>, <4 x i16>) nounwind readnone -declare <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32>, <2 x i32>) nounwind readnone - -declare <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16>, <8 x i16>) nounwind readnone -declare <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32>, <4 x i32>) nounwind readnone - -define <4 x i16> @vqrdmulhs16(<4 x i16>* %A, <4 x i16>* %B) nounwind { -;CHECK: vqrdmulhs16: -;CHECK: vqrdmulh.s16 - %tmp1 = load <4 x i16>* %A - %tmp2 = load <4 x i16>* %B - %tmp3 = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) - ret <4 x i16> %tmp3 -} - -define <2 x i32> @vqrdmulhs32(<2 x i32>* %A, <2 x i32>* %B) nounwind { -;CHECK: vqrdmulhs32: -;CHECK: vqrdmulh.s32 - %tmp1 = load <2 x i32>* %A - %tmp2 = load <2 x i32>* %B - %tmp3 = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) - ret <2 x i32> %tmp3 -} - -define <8 x i16> @vqrdmulhQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind { -;CHECK: vqrdmulhQs16: -;CHECK: vqrdmulh.s16 - %tmp1 = load <8 x i16>* %A - %tmp2 = load <8 x i16>* %B - %tmp3 = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) - ret <8 x i16> %tmp3 -} - -define <4 x i32> @vqrdmulhQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind { -;CHECK: vqrdmulhQs32: -;CHECK: vqrdmulh.s32 - %tmp1 = load <4 x i32>* %A - %tmp2 = load <4 x i32>* %B - %tmp3 = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) - ret <4 x i32> %tmp3 -} - -declare <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16>, <4 x i16>) nounwind readnone -declare <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32>, <2 x i32>) nounwind readnone - -declare <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16>, <8 x i16>) nounwind readnone -declare <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32>, <4 x i32>) nounwind readnone diff --git a/test/CodeGen/ARM/vqdmulh_lane.ll b/test/CodeGen/ARM/vqdmulh_lane.ll deleted file mode 100644 index 874f5f3..0000000 --- a/test/CodeGen/ARM/vqdmulh_lane.ll +++ /dev/null @@ -1,47 +0,0 @@ -; RUN: llc -mattr=+neon < %s | FileCheck %s -target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:64:64-v128:128:128-a0:0:32" -target triple = "thumbv7-elf" - -define arm_aapcs_vfpcc <8 x i16> @test_vqdmulhQ_lanes16(<8 x i16> %arg0_int16x8_t, <4 x i16> %arg1_int16x4_t) nounwind readnone { -entry: -; CHECK: test_vqdmulhQ_lanes16 -; CHECK: vqdmulh.s16 q0, q0, d2[1] - %0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1> ; <<8 x i16>> [#uses=1] - %1 = tail call <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16> %arg0_int16x8_t, <8 x i16> %0) ; <<8 x i16>> [#uses=1] - ret <8 x i16> %1 -} - -declare <8 x i16> @llvm.arm.neon.vqdmulh.v8i16(<8 x i16>, <8 x i16>) nounwind readnone - -define arm_aapcs_vfpcc <4 x i32> @test_vqdmulhQ_lanes32(<4 x i32> %arg0_int32x4_t, <2 x i32> %arg1_int32x2_t) nounwind readnone { -entry: -; CHECK: test_vqdmulhQ_lanes32 -; CHECK: vqdmulh.s32 q0, q0, d2[1] - %0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i32>> [#uses=1] - %1 = tail call <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32> %arg0_int32x4_t, <4 x i32> %0) ; <<4 x i32>> [#uses=1] - ret <4 x i32> %1 -} - -declare <4 x i32> @llvm.arm.neon.vqdmulh.v4i32(<4 x i32>, <4 x i32>) nounwind readnone - -define arm_aapcs_vfpcc <4 x i16> @test_vqdmulh_lanes16(<4 x i16> %arg0_int16x4_t, <4 x i16> %arg1_int16x4_t) nounwind readnone { -entry: -; CHECK: test_vqdmulh_lanes16 -; CHECK: vqdmulh.s16 d0, d0, d1[1] - %0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1] - %1 = tail call <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16> %arg0_int16x4_t, <4 x i16> %0) ; <<4 x i16>> [#uses=1] - ret <4 x i16> %1 -} - -declare <4 x i16> @llvm.arm.neon.vqdmulh.v4i16(<4 x i16>, <4 x i16>) nounwind readnone - -define arm_aapcs_vfpcc <2 x i32> @test_vqdmulh_lanes32(<2 x i32> %arg0_int32x2_t, <2 x i32> %arg1_int32x2_t) nounwind readnone { -entry: -; CHECK: test_vqdmulh_lanes32 -; CHECK: vqdmulh.s32 d0, d0, d1[1] - %0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1] - %1 = tail call <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32> %arg0_int32x2_t, <2 x i32> %0) ; <<2 x i32>> [#uses=1] - ret <2 x i32> %1 -} - -declare <2 x i32> @llvm.arm.neon.vqdmulh.v2i32(<2 x i32>, <2 x i32>) nounwind readnone diff --git a/test/CodeGen/ARM/vqdmull.ll b/test/CodeGen/ARM/vqdmull.ll deleted file mode 100644 index 9062536..0000000 --- a/test/CodeGen/ARM/vqdmull.ll +++ /dev/null @@ -1,22 +0,0 @@ -; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s - -define <4 x i32> @vqdmulls16(<4 x i16>* %A, <4 x i16>* %B) nounwind { -;CHECK: vqdmulls16: -;CHECK: vqdmull.s16 - %tmp1 = load <4 x i16>* %A - %tmp2 = load <4 x i16>* %B - %tmp3 = call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2) - ret <4 x i32> %tmp3 -} - -define <2 x i64> @vqdmulls32(<2 x i32>* %A, <2 x i32>* %B) nounwind { -;CHECK: vqdmulls32: -;CHECK: vqdmull.s32 - %tmp1 = load <2 x i32>* %A - %tmp2 = load <2 x i32>* %B - %tmp3 = call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2) - ret <2 x i64> %tmp3 -} - -declare <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16>, <4 x i16>) nounwind readnone -declare <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32>, <2 x i32>) nounwind readnone diff --git a/test/CodeGen/ARM/vqdmull_lane.ll b/test/CodeGen/ARM/vqdmull_lane.ll deleted file mode 100644 index 21f4e94..0000000 --- a/test/CodeGen/ARM/vqdmull_lane.ll +++ /dev/null @@ -1,25 +0,0 @@ -; RUN: llc -mattr=+neon < %s | FileCheck %s -target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:64:64-v128:128:128-a0:0:32" -target triple = "thumbv7-elf" - -define arm_aapcs_vfpcc <4 x i32> @test_vqdmull_lanes16(<4 x i16> %arg0_int16x4_t, <4 x i16> %arg1_int16x4_t) nounwind readnone { -entry: -; CHECK: test_vqdmull_lanes16 -; CHECK: vqdmull.s16 q0, d0, d1[1] - %0 = shufflevector <4 x i16> %arg1_int16x4_t, <4 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1> ; <<4 x i16>> [#uses=1] - %1 = tail call <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16> %arg0_int16x4_t, <4 x i16> %0) ; <<4 x i32>> [#uses=1] - ret <4 x i32> %1 -} - -declare <4 x i32> @llvm.arm.neon.vqdmull.v4i32(<4 x i16>, <4 x i16>) nounwind readnone - -define arm_aapcs_vfpcc <2 x i64> @test_vqdmull_lanes32(<2 x i32> %arg0_int32x2_t, <2 x i32> %arg1_int32x2_t) nounwind readnone { -entry: -; CHECK: test_vqdmull_lanes32 -; CHECK: vqdmull.s32 q0, d0, d1[1] - %0 = shufflevector <2 x i32> %arg1_int32x2_t, <2 x i32> undef, <2 x i32> <i32 1, i32 1> ; <<2 x i32>> [#uses=1] - %1 = tail call <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32> %arg0_int32x2_t, <2 x i32> %0) ; <<2 x i64>> [#uses=1] - ret <2 x i64> %1 -} - -declare <2 x i64> @llvm.arm.neon.vqdmull.v2i64(<2 x i32>, <2 x i32>) nounwind readnone diff --git a/test/CodeGen/ARM/vqmovn.ll b/test/CodeGen/ARM/vqmovn.ll deleted file mode 100644 index 96c84d8..0000000 --- a/test/CodeGen/ARM/vqmovn.ll +++ /dev/null @@ -1,85 +0,0 @@ -; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s - -define <8 x i8> @vqmovns16(<8 x i16>* %A) nounwind { -;CHECK: vqmovns16: -;CHECK: vqmovn.s16 - %tmp1 = load <8 x i16>* %A - %tmp2 = call <8 x i8> @llvm.arm.neon.vqmovns.v8i8(<8 x i16> %tmp1) - ret <8 x i8> %tmp2 -} - -define <4 x i16> @vqmovns32(<4 x i32>* %A) nounwind { -;CHECK: vqmovns32: -;CHECK: vqmovn.s32 - %tmp1 = load <4 x i32>* %A - %tmp2 = call <4 x i16> @llvm.arm.neon.vqmovns.v4i16(<4 x i32> %tmp1) - ret <4 x i16> %tmp2 -} - -define <2 x i32> @vqmovns64(<2 x i64>* %A) nounwind { -;CHECK: vqmovns64: -;CHECK: vqmovn.s64 - %tmp1 = load <2 x i64>* %A - %tmp2 = call <2 x i32> @llvm.arm.neon.vqmovns.v2i32(<2 x i64> %tmp1) - ret <2 x i32> %tmp2 -} - -define <8 x i8> @vqmovnu16(<8 x i16>* %A) nounwind { -;CHECK: vqmovnu16: -;CHECK: vqmovn.u16 - %tmp1 = load <8 x i16>* %A - %tmp2 = call <8 x i8> @llvm.arm.neon.vqmovnu.v8i8(<8 x i16> %tmp1) - ret <8 x i8> %tmp2 -} - -define <4 x i16> @vqmovnu32(<4 x i32>* %A) nounwind { -;CHECK: vqmovnu32: -;CHECK: vqmovn.u32 - %tmp1 = load <4 x i32>* %A - %tmp2 = call <4 x i16> @llvm.arm.neon.vqmovnu.v4i16(<4 x i32> %tmp1) - ret <4 x i16> %tmp2 -} - -define <2 x i32> @vqmovnu64(<2 x i64>* %A) nounwind { -;CHECK: vqmovnu64: -;CHECK: vqmovn.u64 - %tmp1 = load <2 x i64>* %A - %tmp2 = call <2 x i32> @llvm.arm.neon.vqmovnu.v2i32(<2 x i64> %tmp1) - ret <2 x i32> %tmp2 -} - -define <8 x i8> @vqmovuns16(<8 x i16>* %A) nounwind { -;CHECK: vqmovuns16: -;CHECK: vqmovun.s16 - %tmp1 = load <8 x i16>* %A - %tmp2 = call <8 x i8> @llvm.arm.neon.vqmovnsu.v8i8(<8 x i16> %tmp1) - ret <8 x i8> %tmp2 -} - -define <4 x i16> @vqmovuns32(<4 x i32>* %A) nounwind { -;CHECK: vqmovuns32: -;CHECK: vqmovun.s32 - %tmp1 = load <4 x i32>* %A - %tmp2 = call <4 x i16> @llvm.arm.neon.vqmovnsu.v4i16(<4 x i32> %tmp1) - ret <4 x i16> %tmp2 -} - -define <2 x i32> @vqmovuns64(<2 x i64>* %A) nounwind { -;CHECK: vqmovuns64: -;CHECK: vqmovun.s64 - %tmp1 = load <2 x i64>* %A - %tmp2 = call <2 x i32> @llvm.arm.neon.vqmovnsu.v2i32(<2 x i64> %tmp1) - ret <2 x i32> %tmp2 -} - -declare <8 x i8> @llvm.arm.neon.vqmovns.v8i8(<8 x i16>) nounwind readnone -declare <4 x i16> @llvm.arm.neon.vqmovns.v4i16(<4 x i32>) nounwind readnone -declare <2 x i32> @llvm.arm.neon.vqmovns.v2i32(<2 x i64>) nounwind readnone - -declare <8 x i8> @llvm.arm.neon.vqmovnu.v8i8(<8 x i16>) nounwind readnone -declare <4 x i16> @llvm.arm.neon.vqmovnu.v4i16(<4 x i32>) nounwind readnone -declare <2 x i32> @llvm.arm.neon.vqmovnu.v2i32(<2 x i64>) nounwind readnone - -declare <8 x i8> @llvm.arm.neon.vqmovnsu.v8i8(<8 x i16>) nounwind readnone -declare <4 x i16> @llvm.arm.neon.vqmovnsu.v4i16(<4 x i32>) nounwind readnone -declare <2 x i32> @llvm.arm.neon.vqmovnsu.v2i32(<2 x i64>) nounwind readnone diff --git a/test/CodeGen/ARM/vqneg.ll b/test/CodeGen/ARM/vqneg.ll deleted file mode 100644 index f55a132..0000000 --- a/test/CodeGen/ARM/vqneg.ll +++ /dev/null @@ -1,57 +0,0 @@ -; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s - -define <8 x i8> @vqnegs8(<8 x i8>* %A) nounwind { -;CHECK: vqnegs8: -;CHECK: vqneg.s8 - %tmp1 = load <8 x i8>* %A - %tmp2 = call <8 x i8> @llvm.arm.neon.vqneg.v8i8(<8 x i8> %tmp1) - ret <8 x i8> %tmp2 -} - -define <4 x i16> @vqnegs16(<4 x i16>* %A) nounwind { -;CHECK: vqnegs16: -;CHECK: vqneg.s16 - %tmp1 = load <4 x i16>* %A - %tmp2 = call <4 x i16> @llvm.arm.neon.vqneg.v4i16(<4 x i16> %tmp1) - ret <4 x i16> %tmp2 -} - -define <2 x i32> @vqnegs32(<2 x i32>* %A) nounwind { -;CHECK: vqnegs32: -;CHECK: vqneg.s32 - %tmp1 = load <2 x i32>* %A - %tmp2 = call <2 x i32> @llvm.arm.neon.vqneg.v2i32(<2 x i32> %tmp1) - ret <2 x i32> %tmp2 -} - -define <16 x i8> @vqnegQs8(<16 x i8>* %A) nounwind { -;CHECK: vqnegQs8: -;CHECK: vqneg.s8 - %tmp1 = load <16 x i8>* %A - %tmp2 = call <16 x i8> @llvm.arm.neon.vqneg.v16i8(<16 x i8> %tmp1) - ret <16 x i8> %tmp2 -} - -define <8 x i16> @vqnegQs16(<8 x i16>* %A) nounwind { -;CHECK: vqnegQs16: -;CHECK: vqneg.s16 - %tmp1 = load <8 x i16>* %A - %tmp2 = call <8 x i16> @llvm.arm.neon.vqneg.v8i16(<8 x i16> %tmp1) - ret <8 x i16> %tmp2 -} - -define <4 x i32> @vqnegQs32(<4 x i32>* %A) nounwind { -;CHECK: vqnegQs32: -;CHECK: vqneg.s32 - %tmp1 = load <4 x i32>* %A - %tmp2 = call <4 x i32> @llvm.arm.neon.vqneg.v4i32(<4 x i32> %tmp1) - ret <4 x i32> %tmp2 -} - -declare <8 x i8> @llvm.arm.neon.vqneg.v8i8(<8 x i8>) nounwind readnone -declare <4 x i16> @llvm.arm.neon.vqneg.v4i16(<4 x i16>) nounwind readnone -declare <2 x i32> @llvm.arm.neon.vqneg.v2i32(<2 x i32>) nounwind readnone - -declare <16 x i8> @llvm.arm.neon.vqneg.v16i8(<16 x i8>) nounwind readnone -declare <8 x i16> @llvm.arm.neon.vqneg.v8i16(<8 x i16>) nounwind readnone -declare <4 x i32> @llvm.arm.neon.vqneg.v4i32(<4 x i32>) nounwind readnone diff --git a/test/CodeGen/ARM/vqrshl.ll b/test/CodeGen/ARM/vqrshl.ll deleted file mode 100644 index f94e92f..0000000 --- a/test/CodeGen/ARM/vqrshl.ll +++ /dev/null @@ -1,165 +0,0 @@ -; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s - -define <8 x i8> @vqrshls8(<8 x i8>* %A, <8 x i8>* %B) nounwind { -;CHECK: vqrshls8: -;CHECK: vqrshl.s8 - %tmp1 = load <8 x i8>* %A - %tmp2 = load <8 x i8>* %B - %tmp3 = call <8 x i8> @llvm.arm.neon.vqrshifts.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) - ret <8 x i8> %tmp3 -} - -define <4 x i16> @vqrshls16(<4 x i16>* %A, <4 x i16>* %B) nounwind { -;CHECK: vqrshls16: -;CHECK: vqrshl.s16 - %tmp1 = load <4 x i16>* %A - %tmp2 = load <4 x i16>* %B - %tmp3 = call <4 x i16> @llvm.arm.neon.vqrshifts.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) - ret <4 x i16> %tmp3 -} - -define <2 x i32> @vqrshls32(<2 x i32>* %A, <2 x i32>* %B) nounwind { -;CHECK: vqrshls32: -;CHECK: vqrshl.s32 - %tmp1 = load <2 x i32>* %A - %tmp2 = load <2 x i32>* %B - %tmp3 = call <2 x i32> @llvm.arm.neon.vqrshifts.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) - ret <2 x i32> %tmp3 -} - -define <1 x i64> @vqrshls64(<1 x i64>* %A, <1 x i64>* %B) nounwind { -;CHECK: vqrshls64: -;CHECK: vqrshl.s64 - %tmp1 = load <1 x i64>* %A - %tmp2 = load <1 x i64>* %B - %tmp3 = call <1 x i64> @llvm.arm.neon.vqrshifts.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2) - ret <1 x i64> %tmp3 -} - -define <8 x i8> @vqrshlu8(<8 x i8>* %A, <8 x i8>* %B) nounwind { -;CHECK: vqrshlu8: -;CHECK: vqrshl.u8 - %tmp1 = load <8 x i8>* %A - %tmp2 = load <8 x i8>* %B - %tmp3 = call <8 x i8> @llvm.arm.neon.vqrshiftu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) - ret <8 x i8> %tmp3 -} - -define <4 x i16> @vqrshlu16(<4 x i16>* %A, <4 x i16>* %B) nounwind { -;CHECK: vqrshlu16: -;CHECK: vqrshl.u16 - %tmp1 = load <4 x i16>* %A - %tmp2 = load <4 x i16>* %B - %tmp3 = call <4 x i16> @llvm.arm.neon.vqrshiftu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) - ret <4 x i16> %tmp3 -} - -define <2 x i32> @vqrshlu32(<2 x i32>* %A, <2 x i32>* %B) nounwind { -;CHECK: vqrshlu32: -;CHECK: vqrshl.u32 - %tmp1 = load <2 x i32>* %A - %tmp2 = load <2 x i32>* %B - %tmp3 = call <2 x i32> @llvm.arm.neon.vqrshiftu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) - ret <2 x i32> %tmp3 -} - -define <1 x i64> @vqrshlu64(<1 x i64>* %A, <1 x i64>* %B) nounwind { -;CHECK: vqrshlu64: -;CHECK: vqrshl.u64 - %tmp1 = load <1 x i64>* %A - %tmp2 = load <1 x i64>* %B - %tmp3 = call <1 x i64> @llvm.arm.neon.vqrshiftu.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2) - ret <1 x i64> %tmp3 -} - -define <16 x i8> @vqrshlQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind { -;CHECK: vqrshlQs8: -;CHECK: vqrshl.s8 - %tmp1 = load <16 x i8>* %A - %tmp2 = load <16 x i8>* %B - %tmp3 = call <16 x i8> @llvm.arm.neon.vqrshifts.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) - ret <16 x i8> %tmp3 -} - -define <8 x i16> @vqrshlQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind { -;CHECK: vqrshlQs16: -;CHECK: vqrshl.s16 - %tmp1 = load <8 x i16>* %A - %tmp2 = load <8 x i16>* %B - %tmp3 = call <8 x i16> @llvm.arm.neon.vqrshifts.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) - ret <8 x i16> %tmp3 -} - -define <4 x i32> @vqrshlQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind { -;CHECK: vqrshlQs32: -;CHECK: vqrshl.s32 - %tmp1 = load <4 x i32>* %A - %tmp2 = load <4 x i32>* %B - %tmp3 = call <4 x i32> @llvm.arm.neon.vqrshifts.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) - ret <4 x i32> %tmp3 -} - -define <2 x i64> @vqrshlQs64(<2 x i64>* %A, <2 x i64>* %B) nounwind { -;CHECK: vqrshlQs64: -;CHECK: vqrshl.s64 - %tmp1 = load <2 x i64>* %A - %tmp2 = load <2 x i64>* %B - %tmp3 = call <2 x i64> @llvm.arm.neon.vqrshifts.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2) - ret <2 x i64> %tmp3 -} - -define <16 x i8> @vqrshlQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind { -;CHECK: vqrshlQu8: -;CHECK: vqrshl.u8 - %tmp1 = load <16 x i8>* %A - %tmp2 = load <16 x i8>* %B - %tmp3 = call <16 x i8> @llvm.arm.neon.vqrshiftu.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) - ret <16 x i8> %tmp3 -} - -define <8 x i16> @vqrshlQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind { -;CHECK: vqrshlQu16: -;CHECK: vqrshl.u16 - %tmp1 = load <8 x i16>* %A - %tmp2 = load <8 x i16>* %B - %tmp3 = call <8 x i16> @llvm.arm.neon.vqrshiftu.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) - ret <8 x i16> %tmp3 -} - -define <4 x i32> @vqrshlQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind { -;CHECK: vqrshlQu32: -;CHECK: vqrshl.u32 - %tmp1 = load <4 x i32>* %A - %tmp2 = load <4 x i32>* %B - %tmp3 = call <4 x i32> @llvm.arm.neon.vqrshiftu.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) - ret <4 x i32> %tmp3 -} - -define <2 x i64> @vqrshlQu64(<2 x i64>* %A, <2 x i64>* %B) nounwind { -;CHECK: vqrshlQu64: -;CHECK: vqrshl.u64 - %tmp1 = load <2 x i64>* %A - %tmp2 = load <2 x i64>* %B - %tmp3 = call <2 x i64> @llvm.arm.neon.vqrshiftu.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2) - ret <2 x i64> %tmp3 -} - -declare <8 x i8> @llvm.arm.neon.vqrshifts.v8i8(<8 x i8>, <8 x i8>) nounwind readnone -declare <4 x i16> @llvm.arm.neon.vqrshifts.v4i16(<4 x i16>, <4 x i16>) nounwind readnone -declare <2 x i32> @llvm.arm.neon.vqrshifts.v2i32(<2 x i32>, <2 x i32>) nounwind readnone -declare <1 x i64> @llvm.arm.neon.vqrshifts.v1i64(<1 x i64>, <1 x i64>) nounwind readnone - -declare <8 x i8> @llvm.arm.neon.vqrshiftu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone -declare <4 x i16> @llvm.arm.neon.vqrshiftu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone -declare <2 x i32> @llvm.arm.neon.vqrshiftu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone -declare <1 x i64> @llvm.arm.neon.vqrshiftu.v1i64(<1 x i64>, <1 x i64>) nounwind readnone - -declare <16 x i8> @llvm.arm.neon.vqrshifts.v16i8(<16 x i8>, <16 x i8>) nounwind readnone -declare <8 x i16> @llvm.arm.neon.vqrshifts.v8i16(<8 x i16>, <8 x i16>) nounwind readnone -declare <4 x i32> @llvm.arm.neon.vqrshifts.v4i32(<4 x i32>, <4 x i32>) nounwind readnone -declare <2 x i64> @llvm.arm.neon.vqrshifts.v2i64(<2 x i64>, <2 x i64>) nounwind readnone - -declare <16 x i8> @llvm.arm.neon.vqrshiftu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone -declare <8 x i16> @llvm.arm.neon.vqrshiftu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone -declare <4 x i32> @llvm.arm.neon.vqrshiftu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone -declare <2 x i64> @llvm.arm.neon.vqrshiftu.v2i64(<2 x i64>, <2 x i64>) nounwind readnone diff --git a/test/CodeGen/ARM/vqrshrn.ll b/test/CodeGen/ARM/vqrshrn.ll deleted file mode 100644 index 41c2e7c..0000000 --- a/test/CodeGen/ARM/vqrshrn.ll +++ /dev/null @@ -1,85 +0,0 @@ -; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s - -define <8 x i8> @vqrshrns8(<8 x i16>* %A) nounwind { -;CHECK: vqrshrns8: -;CHECK: vqrshrn.s16 - %tmp1 = load <8 x i16>* %A - %tmp2 = call <8 x i8> @llvm.arm.neon.vqrshiftns.v8i8(<8 x i16> %tmp1, <8 x i16> < i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8 >) - ret <8 x i8> %tmp2 -} - -define <4 x i16> @vqrshrns16(<4 x i32>* %A) nounwind { -;CHECK: vqrshrns16: -;CHECK: vqrshrn.s32 - %tmp1 = load <4 x i32>* %A - %tmp2 = call <4 x i16> @llvm.arm.neon.vqrshiftns.v4i16(<4 x i32> %tmp1, <4 x i32> < i32 -16, i32 -16, i32 -16, i32 -16 >) - ret <4 x i16> %tmp2 -} - -define <2 x i32> @vqrshrns32(<2 x i64>* %A) nounwind { -;CHECK: vqrshrns32: -;CHECK: vqrshrn.s64 - %tmp1 = load <2 x i64>* %A - %tmp2 = call <2 x i32> @llvm.arm.neon.vqrshiftns.v2i32(<2 x i64> %tmp1, <2 x i64> < i64 -32, i64 -32 >) - ret <2 x i32> %tmp2 -} - -define <8 x i8> @vqrshrnu8(<8 x i16>* %A) nounwind { -;CHECK: vqrshrnu8: -;CHECK: vqrshrn.u16 - %tmp1 = load <8 x i16>* %A - %tmp2 = call <8 x i8> @llvm.arm.neon.vqrshiftnu.v8i8(<8 x i16> %tmp1, <8 x i16> < i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8 >) - ret <8 x i8> %tmp2 -} - -define <4 x i16> @vqrshrnu16(<4 x i32>* %A) nounwind { -;CHECK: vqrshrnu16: -;CHECK: vqrshrn.u32 - %tmp1 = load <4 x i32>* %A - %tmp2 = call <4 x i16> @llvm.arm.neon.vqrshiftnu.v4i16(<4 x i32> %tmp1, <4 x i32> < i32 -16, i32 -16, i32 -16, i32 -16 >) - ret <4 x i16> %tmp2 -} - -define <2 x i32> @vqrshrnu32(<2 x i64>* %A) nounwind { -;CHECK: vqrshrnu32: -;CHECK: vqrshrn.u64 - %tmp1 = load <2 x i64>* %A - %tmp2 = call <2 x i32> @llvm.arm.neon.vqrshiftnu.v2i32(<2 x i64> %tmp1, <2 x i64> < i64 -32, i64 -32 >) - ret <2 x i32> %tmp2 -} - -define <8 x i8> @vqrshruns8(<8 x i16>* %A) nounwind { -;CHECK: vqrshruns8: -;CHECK: vqrshrun.s16 - %tmp1 = load <8 x i16>* %A - %tmp2 = call <8 x i8> @llvm.arm.neon.vqrshiftnsu.v8i8(<8 x i16> %tmp1, <8 x i16> < i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8 >) - ret <8 x i8> %tmp2 -} - -define <4 x i16> @vqrshruns16(<4 x i32>* %A) nounwind { -;CHECK: vqrshruns16: -;CHECK: vqrshrun.s32 - %tmp1 = load <4 x i32>* %A - %tmp2 = call <4 x i16> @llvm.arm.neon.vqrshiftnsu.v4i16(<4 x i32> %tmp1, <4 x i32> < i32 -16, i32 -16, i32 -16, i32 -16 >) - ret <4 x i16> %tmp2 -} - -define <2 x i32> @vqrshruns32(<2 x i64>* %A) nounwind { -;CHECK: vqrshruns32: -;CHECK: vqrshrun.s64 - %tmp1 = load <2 x i64>* %A - %tmp2 = call <2 x i32> @llvm.arm.neon.vqrshiftnsu.v2i32(<2 x i64> %tmp1, <2 x i64> < i64 -32, i64 -32 >) - ret <2 x i32> %tmp2 -} - -declare <8 x i8> @llvm.arm.neon.vqrshiftns.v8i8(<8 x i16>, <8 x i16>) nounwind readnone -declare <4 x i16> @llvm.arm.neon.vqrshiftns.v4i16(<4 x i32>, <4 x i32>) nounwind readnone -declare <2 x i32> @llvm.arm.neon.vqrshiftns.v2i32(<2 x i64>, <2 x i64>) nounwind readnone - -declare <8 x i8> @llvm.arm.neon.vqrshiftnu.v8i8(<8 x i16>, <8 x i16>) nounwind readnone -declare <4 x i16> @llvm.arm.neon.vqrshiftnu.v4i16(<4 x i32>, <4 x i32>) nounwind readnone -declare <2 x i32> @llvm.arm.neon.vqrshiftnu.v2i32(<2 x i64>, <2 x i64>) nounwind readnone - -declare <8 x i8> @llvm.arm.neon.vqrshiftnsu.v8i8(<8 x i16>, <8 x i16>) nounwind readnone -declare <4 x i16> @llvm.arm.neon.vqrshiftnsu.v4i16(<4 x i32>, <4 x i32>) nounwind readnone -declare <2 x i32> @llvm.arm.neon.vqrshiftnsu.v2i32(<2 x i64>, <2 x i64>) nounwind readnone diff --git a/test/CodeGen/ARM/vqshl.ll b/test/CodeGen/ARM/vqshl.ll index e8fdcd3..e4d29a3 100644 --- a/test/CodeGen/ARM/vqshl.ll +++ b/test/CodeGen/ARM/vqshl.ll @@ -365,3 +365,167 @@ declare <16 x i8> @llvm.arm.neon.vqshiftsu.v16i8(<16 x i8>, <16 x i8>) nounwind declare <8 x i16> @llvm.arm.neon.vqshiftsu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone declare <4 x i32> @llvm.arm.neon.vqshiftsu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone declare <2 x i64> @llvm.arm.neon.vqshiftsu.v2i64(<2 x i64>, <2 x i64>) nounwind readnone + +define <8 x i8> @vqrshls8(<8 x i8>* %A, <8 x i8>* %B) nounwind { +;CHECK: vqrshls8: +;CHECK: vqrshl.s8 + %tmp1 = load <8 x i8>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = call <8 x i8> @llvm.arm.neon.vqrshifts.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) + ret <8 x i8> %tmp3 +} + +define <4 x i16> @vqrshls16(<4 x i16>* %A, <4 x i16>* %B) nounwind { +;CHECK: vqrshls16: +;CHECK: vqrshl.s16 + %tmp1 = load <4 x i16>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = call <4 x i16> @llvm.arm.neon.vqrshifts.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) + ret <4 x i16> %tmp3 +} + +define <2 x i32> @vqrshls32(<2 x i32>* %A, <2 x i32>* %B) nounwind { +;CHECK: vqrshls32: +;CHECK: vqrshl.s32 + %tmp1 = load <2 x i32>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = call <2 x i32> @llvm.arm.neon.vqrshifts.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) + ret <2 x i32> %tmp3 +} + +define <1 x i64> @vqrshls64(<1 x i64>* %A, <1 x i64>* %B) nounwind { +;CHECK: vqrshls64: +;CHECK: vqrshl.s64 + %tmp1 = load <1 x i64>* %A + %tmp2 = load <1 x i64>* %B + %tmp3 = call <1 x i64> @llvm.arm.neon.vqrshifts.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2) + ret <1 x i64> %tmp3 +} + +define <8 x i8> @vqrshlu8(<8 x i8>* %A, <8 x i8>* %B) nounwind { +;CHECK: vqrshlu8: +;CHECK: vqrshl.u8 + %tmp1 = load <8 x i8>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = call <8 x i8> @llvm.arm.neon.vqrshiftu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) + ret <8 x i8> %tmp3 +} + +define <4 x i16> @vqrshlu16(<4 x i16>* %A, <4 x i16>* %B) nounwind { +;CHECK: vqrshlu16: +;CHECK: vqrshl.u16 + %tmp1 = load <4 x i16>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = call <4 x i16> @llvm.arm.neon.vqrshiftu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) + ret <4 x i16> %tmp3 +} + +define <2 x i32> @vqrshlu32(<2 x i32>* %A, <2 x i32>* %B) nounwind { +;CHECK: vqrshlu32: +;CHECK: vqrshl.u32 + %tmp1 = load <2 x i32>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = call <2 x i32> @llvm.arm.neon.vqrshiftu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) + ret <2 x i32> %tmp3 +} + +define <1 x i64> @vqrshlu64(<1 x i64>* %A, <1 x i64>* %B) nounwind { +;CHECK: vqrshlu64: +;CHECK: vqrshl.u64 + %tmp1 = load <1 x i64>* %A + %tmp2 = load <1 x i64>* %B + %tmp3 = call <1 x i64> @llvm.arm.neon.vqrshiftu.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2) + ret <1 x i64> %tmp3 +} + +define <16 x i8> @vqrshlQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind { +;CHECK: vqrshlQs8: +;CHECK: vqrshl.s8 + %tmp1 = load <16 x i8>* %A + %tmp2 = load <16 x i8>* %B + %tmp3 = call <16 x i8> @llvm.arm.neon.vqrshifts.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) + ret <16 x i8> %tmp3 +} + +define <8 x i16> @vqrshlQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind { +;CHECK: vqrshlQs16: +;CHECK: vqrshl.s16 + %tmp1 = load <8 x i16>* %A + %tmp2 = load <8 x i16>* %B + %tmp3 = call <8 x i16> @llvm.arm.neon.vqrshifts.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) + ret <8 x i16> %tmp3 +} + +define <4 x i32> @vqrshlQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind { +;CHECK: vqrshlQs32: +;CHECK: vqrshl.s32 + %tmp1 = load <4 x i32>* %A + %tmp2 = load <4 x i32>* %B + %tmp3 = call <4 x i32> @llvm.arm.neon.vqrshifts.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) + ret <4 x i32> %tmp3 +} + +define <2 x i64> @vqrshlQs64(<2 x i64>* %A, <2 x i64>* %B) nounwind { +;CHECK: vqrshlQs64: +;CHECK: vqrshl.s64 + %tmp1 = load <2 x i64>* %A + %tmp2 = load <2 x i64>* %B + %tmp3 = call <2 x i64> @llvm.arm.neon.vqrshifts.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2) + ret <2 x i64> %tmp3 +} + +define <16 x i8> @vqrshlQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind { +;CHECK: vqrshlQu8: +;CHECK: vqrshl.u8 + %tmp1 = load <16 x i8>* %A + %tmp2 = load <16 x i8>* %B + %tmp3 = call <16 x i8> @llvm.arm.neon.vqrshiftu.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) + ret <16 x i8> %tmp3 +} + +define <8 x i16> @vqrshlQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind { +;CHECK: vqrshlQu16: +;CHECK: vqrshl.u16 + %tmp1 = load <8 x i16>* %A + %tmp2 = load <8 x i16>* %B + %tmp3 = call <8 x i16> @llvm.arm.neon.vqrshiftu.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) + ret <8 x i16> %tmp3 +} + +define <4 x i32> @vqrshlQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind { +;CHECK: vqrshlQu32: +;CHECK: vqrshl.u32 + %tmp1 = load <4 x i32>* %A + %tmp2 = load <4 x i32>* %B + %tmp3 = call <4 x i32> @llvm.arm.neon.vqrshiftu.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) + ret <4 x i32> %tmp3 +} + +define <2 x i64> @vqrshlQu64(<2 x i64>* %A, <2 x i64>* %B) nounwind { +;CHECK: vqrshlQu64: +;CHECK: vqrshl.u64 + %tmp1 = load <2 x i64>* %A + %tmp2 = load <2 x i64>* %B + %tmp3 = call <2 x i64> @llvm.arm.neon.vqrshiftu.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2) + ret <2 x i64> %tmp3 +} + +declare <8 x i8> @llvm.arm.neon.vqrshifts.v8i8(<8 x i8>, <8 x i8>) nounwind readnone +declare <4 x i16> @llvm.arm.neon.vqrshifts.v4i16(<4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vqrshifts.v2i32(<2 x i32>, <2 x i32>) nounwind readnone +declare <1 x i64> @llvm.arm.neon.vqrshifts.v1i64(<1 x i64>, <1 x i64>) nounwind readnone + +declare <8 x i8> @llvm.arm.neon.vqrshiftu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone +declare <4 x i16> @llvm.arm.neon.vqrshiftu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vqrshiftu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone +declare <1 x i64> @llvm.arm.neon.vqrshiftu.v1i64(<1 x i64>, <1 x i64>) nounwind readnone + +declare <16 x i8> @llvm.arm.neon.vqrshifts.v16i8(<16 x i8>, <16 x i8>) nounwind readnone +declare <8 x i16> @llvm.arm.neon.vqrshifts.v8i16(<8 x i16>, <8 x i16>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vqrshifts.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <2 x i64> @llvm.arm.neon.vqrshifts.v2i64(<2 x i64>, <2 x i64>) nounwind readnone + +declare <16 x i8> @llvm.arm.neon.vqrshiftu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone +declare <8 x i16> @llvm.arm.neon.vqrshiftu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vqrshiftu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <2 x i64> @llvm.arm.neon.vqrshiftu.v2i64(<2 x i64>, <2 x i64>) nounwind readnone diff --git a/test/CodeGen/ARM/vqshrn.ll b/test/CodeGen/ARM/vqshrn.ll index 0e00c86..5da7943 100644 --- a/test/CodeGen/ARM/vqshrn.ll +++ b/test/CodeGen/ARM/vqshrn.ll @@ -83,3 +83,87 @@ declare <2 x i32> @llvm.arm.neon.vqshiftnu.v2i32(<2 x i64>, <2 x i64>) nounwind declare <8 x i8> @llvm.arm.neon.vqshiftnsu.v8i8(<8 x i16>, <8 x i16>) nounwind readnone declare <4 x i16> @llvm.arm.neon.vqshiftnsu.v4i16(<4 x i32>, <4 x i32>) nounwind readnone declare <2 x i32> @llvm.arm.neon.vqshiftnsu.v2i32(<2 x i64>, <2 x i64>) nounwind readnone + +define <8 x i8> @vqrshrns8(<8 x i16>* %A) nounwind { +;CHECK: vqrshrns8: +;CHECK: vqrshrn.s16 + %tmp1 = load <8 x i16>* %A + %tmp2 = call <8 x i8> @llvm.arm.neon.vqrshiftns.v8i8(<8 x i16> %tmp1, <8 x i16> < i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8 >) + ret <8 x i8> %tmp2 +} + +define <4 x i16> @vqrshrns16(<4 x i32>* %A) nounwind { +;CHECK: vqrshrns16: +;CHECK: vqrshrn.s32 + %tmp1 = load <4 x i32>* %A + %tmp2 = call <4 x i16> @llvm.arm.neon.vqrshiftns.v4i16(<4 x i32> %tmp1, <4 x i32> < i32 -16, i32 -16, i32 -16, i32 -16 >) + ret <4 x i16> %tmp2 +} + +define <2 x i32> @vqrshrns32(<2 x i64>* %A) nounwind { +;CHECK: vqrshrns32: +;CHECK: vqrshrn.s64 + %tmp1 = load <2 x i64>* %A + %tmp2 = call <2 x i32> @llvm.arm.neon.vqrshiftns.v2i32(<2 x i64> %tmp1, <2 x i64> < i64 -32, i64 -32 >) + ret <2 x i32> %tmp2 +} + +define <8 x i8> @vqrshrnu8(<8 x i16>* %A) nounwind { +;CHECK: vqrshrnu8: +;CHECK: vqrshrn.u16 + %tmp1 = load <8 x i16>* %A + %tmp2 = call <8 x i8> @llvm.arm.neon.vqrshiftnu.v8i8(<8 x i16> %tmp1, <8 x i16> < i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8 >) + ret <8 x i8> %tmp2 +} + +define <4 x i16> @vqrshrnu16(<4 x i32>* %A) nounwind { +;CHECK: vqrshrnu16: +;CHECK: vqrshrn.u32 + %tmp1 = load <4 x i32>* %A + %tmp2 = call <4 x i16> @llvm.arm.neon.vqrshiftnu.v4i16(<4 x i32> %tmp1, <4 x i32> < i32 -16, i32 -16, i32 -16, i32 -16 >) + ret <4 x i16> %tmp2 +} + +define <2 x i32> @vqrshrnu32(<2 x i64>* %A) nounwind { +;CHECK: vqrshrnu32: +;CHECK: vqrshrn.u64 + %tmp1 = load <2 x i64>* %A + %tmp2 = call <2 x i32> @llvm.arm.neon.vqrshiftnu.v2i32(<2 x i64> %tmp1, <2 x i64> < i64 -32, i64 -32 >) + ret <2 x i32> %tmp2 +} + +define <8 x i8> @vqrshruns8(<8 x i16>* %A) nounwind { +;CHECK: vqrshruns8: +;CHECK: vqrshrun.s16 + %tmp1 = load <8 x i16>* %A + %tmp2 = call <8 x i8> @llvm.arm.neon.vqrshiftnsu.v8i8(<8 x i16> %tmp1, <8 x i16> < i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8 >) + ret <8 x i8> %tmp2 +} + +define <4 x i16> @vqrshruns16(<4 x i32>* %A) nounwind { +;CHECK: vqrshruns16: +;CHECK: vqrshrun.s32 + %tmp1 = load <4 x i32>* %A + %tmp2 = call <4 x i16> @llvm.arm.neon.vqrshiftnsu.v4i16(<4 x i32> %tmp1, <4 x i32> < i32 -16, i32 -16, i32 -16, i32 -16 >) + ret <4 x i16> %tmp2 +} + +define <2 x i32> @vqrshruns32(<2 x i64>* %A) nounwind { +;CHECK: vqrshruns32: +;CHECK: vqrshrun.s64 + %tmp1 = load <2 x i64>* %A + %tmp2 = call <2 x i32> @llvm.arm.neon.vqrshiftnsu.v2i32(<2 x i64> %tmp1, <2 x i64> < i64 -32, i64 -32 >) + ret <2 x i32> %tmp2 +} + +declare <8 x i8> @llvm.arm.neon.vqrshiftns.v8i8(<8 x i16>, <8 x i16>) nounwind readnone +declare <4 x i16> @llvm.arm.neon.vqrshiftns.v4i16(<4 x i32>, <4 x i32>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vqrshiftns.v2i32(<2 x i64>, <2 x i64>) nounwind readnone + +declare <8 x i8> @llvm.arm.neon.vqrshiftnu.v8i8(<8 x i16>, <8 x i16>) nounwind readnone +declare <4 x i16> @llvm.arm.neon.vqrshiftnu.v4i16(<4 x i32>, <4 x i32>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vqrshiftnu.v2i32(<2 x i64>, <2 x i64>) nounwind readnone + +declare <8 x i8> @llvm.arm.neon.vqrshiftnsu.v8i8(<8 x i16>, <8 x i16>) nounwind readnone +declare <4 x i16> @llvm.arm.neon.vqrshiftnsu.v4i16(<4 x i32>, <4 x i32>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vqrshiftnsu.v2i32(<2 x i64>, <2 x i64>) nounwind readnone diff --git a/test/CodeGen/ARM/vraddhn.ll b/test/CodeGen/ARM/vraddhn.ll deleted file mode 100644 index 40e5cea..0000000 --- a/test/CodeGen/ARM/vraddhn.ll +++ /dev/null @@ -1,32 +0,0 @@ -; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s - -define <8 x i8> @vraddhni16(<8 x i16>* %A, <8 x i16>* %B) nounwind { -;CHECK: vraddhni16: -;CHECK: vraddhn.i16 - %tmp1 = load <8 x i16>* %A - %tmp2 = load <8 x i16>* %B - %tmp3 = call <8 x i8> @llvm.arm.neon.vraddhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2) - ret <8 x i8> %tmp3 -} - -define <4 x i16> @vraddhni32(<4 x i32>* %A, <4 x i32>* %B) nounwind { -;CHECK: vraddhni32: -;CHECK: vraddhn.i32 - %tmp1 = load <4 x i32>* %A - %tmp2 = load <4 x i32>* %B - %tmp3 = call <4 x i16> @llvm.arm.neon.vraddhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2) - ret <4 x i16> %tmp3 -} - -define <2 x i32> @vraddhni64(<2 x i64>* %A, <2 x i64>* %B) nounwind { -;CHECK: vraddhni64: -;CHECK: vraddhn.i64 - %tmp1 = load <2 x i64>* %A - %tmp2 = load <2 x i64>* %B - %tmp3 = call <2 x i32> @llvm.arm.neon.vraddhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2) - ret <2 x i32> %tmp3 -} - -declare <8 x i8> @llvm.arm.neon.vraddhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone -declare <4 x i16> @llvm.arm.neon.vraddhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone -declare <2 x i32> @llvm.arm.neon.vraddhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone diff --git a/test/CodeGen/ARM/vrec.ll b/test/CodeGen/ARM/vrec.ll new file mode 100644 index 0000000..99989e9 --- /dev/null +++ b/test/CodeGen/ARM/vrec.ll @@ -0,0 +1,119 @@ +; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s + +define <2 x i32> @vrecpei32(<2 x i32>* %A) nounwind { +;CHECK: vrecpei32: +;CHECK: vrecpe.u32 + %tmp1 = load <2 x i32>* %A + %tmp2 = call <2 x i32> @llvm.arm.neon.vrecpe.v2i32(<2 x i32> %tmp1) + ret <2 x i32> %tmp2 +} + +define <4 x i32> @vrecpeQi32(<4 x i32>* %A) nounwind { +;CHECK: vrecpeQi32: +;CHECK: vrecpe.u32 + %tmp1 = load <4 x i32>* %A + %tmp2 = call <4 x i32> @llvm.arm.neon.vrecpe.v4i32(<4 x i32> %tmp1) + ret <4 x i32> %tmp2 +} + +define <2 x float> @vrecpef32(<2 x float>* %A) nounwind { +;CHECK: vrecpef32: +;CHECK: vrecpe.f32 + %tmp1 = load <2 x float>* %A + %tmp2 = call <2 x float> @llvm.arm.neon.vrecpe.v2f32(<2 x float> %tmp1) + ret <2 x float> %tmp2 +} + +define <4 x float> @vrecpeQf32(<4 x float>* %A) nounwind { +;CHECK: vrecpeQf32: +;CHECK: vrecpe.f32 + %tmp1 = load <4 x float>* %A + %tmp2 = call <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float> %tmp1) + ret <4 x float> %tmp2 +} + +declare <2 x i32> @llvm.arm.neon.vrecpe.v2i32(<2 x i32>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vrecpe.v4i32(<4 x i32>) nounwind readnone + +declare <2 x float> @llvm.arm.neon.vrecpe.v2f32(<2 x float>) nounwind readnone +declare <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float>) nounwind readnone + +define <2 x float> @vrecpsf32(<2 x float>* %A, <2 x float>* %B) nounwind { +;CHECK: vrecpsf32: +;CHECK: vrecps.f32 + %tmp1 = load <2 x float>* %A + %tmp2 = load <2 x float>* %B + %tmp3 = call <2 x float> @llvm.arm.neon.vrecps.v2f32(<2 x float> %tmp1, <2 x float> %tmp2) + ret <2 x float> %tmp3 +} + +define <4 x float> @vrecpsQf32(<4 x float>* %A, <4 x float>* %B) nounwind { +;CHECK: vrecpsQf32: +;CHECK: vrecps.f32 + %tmp1 = load <4 x float>* %A + %tmp2 = load <4 x float>* %B + %tmp3 = call <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float> %tmp1, <4 x float> %tmp2) + ret <4 x float> %tmp3 +} + +declare <2 x float> @llvm.arm.neon.vrecps.v2f32(<2 x float>, <2 x float>) nounwind readnone +declare <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float>, <4 x float>) nounwind readnone + +define <2 x i32> @vrsqrtei32(<2 x i32>* %A) nounwind { +;CHECK: vrsqrtei32: +;CHECK: vrsqrte.u32 + %tmp1 = load <2 x i32>* %A + %tmp2 = call <2 x i32> @llvm.arm.neon.vrsqrte.v2i32(<2 x i32> %tmp1) + ret <2 x i32> %tmp2 +} + +define <4 x i32> @vrsqrteQi32(<4 x i32>* %A) nounwind { +;CHECK: vrsqrteQi32: +;CHECK: vrsqrte.u32 + %tmp1 = load <4 x i32>* %A + %tmp2 = call <4 x i32> @llvm.arm.neon.vrsqrte.v4i32(<4 x i32> %tmp1) + ret <4 x i32> %tmp2 +} + +define <2 x float> @vrsqrtef32(<2 x float>* %A) nounwind { +;CHECK: vrsqrtef32: +;CHECK: vrsqrte.f32 + %tmp1 = load <2 x float>* %A + %tmp2 = call <2 x float> @llvm.arm.neon.vrsqrte.v2f32(<2 x float> %tmp1) + ret <2 x float> %tmp2 +} + +define <4 x float> @vrsqrteQf32(<4 x float>* %A) nounwind { +;CHECK: vrsqrteQf32: +;CHECK: vrsqrte.f32 + %tmp1 = load <4 x float>* %A + %tmp2 = call <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float> %tmp1) + ret <4 x float> %tmp2 +} + +declare <2 x i32> @llvm.arm.neon.vrsqrte.v2i32(<2 x i32>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vrsqrte.v4i32(<4 x i32>) nounwind readnone + +declare <2 x float> @llvm.arm.neon.vrsqrte.v2f32(<2 x float>) nounwind readnone +declare <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float>) nounwind readnone + +define <2 x float> @vrsqrtsf32(<2 x float>* %A, <2 x float>* %B) nounwind { +;CHECK: vrsqrtsf32: +;CHECK: vrsqrts.f32 + %tmp1 = load <2 x float>* %A + %tmp2 = load <2 x float>* %B + %tmp3 = call <2 x float> @llvm.arm.neon.vrsqrts.v2f32(<2 x float> %tmp1, <2 x float> %tmp2) + ret <2 x float> %tmp3 +} + +define <4 x float> @vrsqrtsQf32(<4 x float>* %A, <4 x float>* %B) nounwind { +;CHECK: vrsqrtsQf32: +;CHECK: vrsqrts.f32 + %tmp1 = load <4 x float>* %A + %tmp2 = load <4 x float>* %B + %tmp3 = call <4 x float> @llvm.arm.neon.vrsqrts.v4f32(<4 x float> %tmp1, <4 x float> %tmp2) + ret <4 x float> %tmp3 +} + +declare <2 x float> @llvm.arm.neon.vrsqrts.v2f32(<2 x float>, <2 x float>) nounwind readnone +declare <4 x float> @llvm.arm.neon.vrsqrts.v4f32(<4 x float>, <4 x float>) nounwind readnone diff --git a/test/CodeGen/ARM/vrecpe.ll b/test/CodeGen/ARM/vrecpe.ll deleted file mode 100644 index 72e904e..0000000 --- a/test/CodeGen/ARM/vrecpe.ll +++ /dev/null @@ -1,39 +0,0 @@ -; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s - -define <2 x i32> @vrecpei32(<2 x i32>* %A) nounwind { -;CHECK: vrecpei32: -;CHECK: vrecpe.u32 - %tmp1 = load <2 x i32>* %A - %tmp2 = call <2 x i32> @llvm.arm.neon.vrecpe.v2i32(<2 x i32> %tmp1) - ret <2 x i32> %tmp2 -} - -define <4 x i32> @vrecpeQi32(<4 x i32>* %A) nounwind { -;CHECK: vrecpeQi32: -;CHECK: vrecpe.u32 - %tmp1 = load <4 x i32>* %A - %tmp2 = call <4 x i32> @llvm.arm.neon.vrecpe.v4i32(<4 x i32> %tmp1) - ret <4 x i32> %tmp2 -} - -define <2 x float> @vrecpef32(<2 x float>* %A) nounwind { -;CHECK: vrecpef32: -;CHECK: vrecpe.f32 - %tmp1 = load <2 x float>* %A - %tmp2 = call <2 x float> @llvm.arm.neon.vrecpe.v2f32(<2 x float> %tmp1) - ret <2 x float> %tmp2 -} - -define <4 x float> @vrecpeQf32(<4 x float>* %A) nounwind { -;CHECK: vrecpeQf32: -;CHECK: vrecpe.f32 - %tmp1 = load <4 x float>* %A - %tmp2 = call <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float> %tmp1) - ret <4 x float> %tmp2 -} - -declare <2 x i32> @llvm.arm.neon.vrecpe.v2i32(<2 x i32>) nounwind readnone -declare <4 x i32> @llvm.arm.neon.vrecpe.v4i32(<4 x i32>) nounwind readnone - -declare <2 x float> @llvm.arm.neon.vrecpe.v2f32(<2 x float>) nounwind readnone -declare <4 x float> @llvm.arm.neon.vrecpe.v4f32(<4 x float>) nounwind readnone diff --git a/test/CodeGen/ARM/vrecps.ll b/test/CodeGen/ARM/vrecps.ll deleted file mode 100644 index 71e8f86..0000000 --- a/test/CodeGen/ARM/vrecps.ll +++ /dev/null @@ -1,22 +0,0 @@ -; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s - -define <2 x float> @vrecpsf32(<2 x float>* %A, <2 x float>* %B) nounwind { -;CHECK: vrecpsf32: -;CHECK: vrecps.f32 - %tmp1 = load <2 x float>* %A - %tmp2 = load <2 x float>* %B - %tmp3 = call <2 x float> @llvm.arm.neon.vrecps.v2f32(<2 x float> %tmp1, <2 x float> %tmp2) - ret <2 x float> %tmp3 -} - -define <4 x float> @vrecpsQf32(<4 x float>* %A, <4 x float>* %B) nounwind { -;CHECK: vrecpsQf32: -;CHECK: vrecps.f32 - %tmp1 = load <4 x float>* %A - %tmp2 = load <4 x float>* %B - %tmp3 = call <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float> %tmp1, <4 x float> %tmp2) - ret <4 x float> %tmp3 -} - -declare <2 x float> @llvm.arm.neon.vrecps.v2f32(<2 x float>, <2 x float>) nounwind readnone -declare <4 x float> @llvm.arm.neon.vrecps.v4f32(<4 x float>, <4 x float>) nounwind readnone diff --git a/test/CodeGen/ARM/vrhadd.ll b/test/CodeGen/ARM/vrhadd.ll deleted file mode 100644 index d47dfc4..0000000 --- a/test/CodeGen/ARM/vrhadd.ll +++ /dev/null @@ -1,125 +0,0 @@ -; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s - -define <8 x i8> @vrhadds8(<8 x i8>* %A, <8 x i8>* %B) nounwind { -;CHECK: vrhadds8: -;CHECK: vrhadd.s8 - %tmp1 = load <8 x i8>* %A - %tmp2 = load <8 x i8>* %B - %tmp3 = call <8 x i8> @llvm.arm.neon.vrhadds.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) - ret <8 x i8> %tmp3 -} - -define <4 x i16> @vrhadds16(<4 x i16>* %A, <4 x i16>* %B) nounwind { -;CHECK: vrhadds16: -;CHECK: vrhadd.s16 - %tmp1 = load <4 x i16>* %A - %tmp2 = load <4 x i16>* %B - %tmp3 = call <4 x i16> @llvm.arm.neon.vrhadds.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) - ret <4 x i16> %tmp3 -} - -define <2 x i32> @vrhadds32(<2 x i32>* %A, <2 x i32>* %B) nounwind { -;CHECK: vrhadds32: -;CHECK: vrhadd.s32 - %tmp1 = load <2 x i32>* %A - %tmp2 = load <2 x i32>* %B - %tmp3 = call <2 x i32> @llvm.arm.neon.vrhadds.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) - ret <2 x i32> %tmp3 -} - -define <8 x i8> @vrhaddu8(<8 x i8>* %A, <8 x i8>* %B) nounwind { -;CHECK: vrhaddu8: -;CHECK: vrhadd.u8 - %tmp1 = load <8 x i8>* %A - %tmp2 = load <8 x i8>* %B - %tmp3 = call <8 x i8> @llvm.arm.neon.vrhaddu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) - ret <8 x i8> %tmp3 -} - -define <4 x i16> @vrhaddu16(<4 x i16>* %A, <4 x i16>* %B) nounwind { -;CHECK: vrhaddu16: -;CHECK: vrhadd.u16 - %tmp1 = load <4 x i16>* %A - %tmp2 = load <4 x i16>* %B - %tmp3 = call <4 x i16> @llvm.arm.neon.vrhaddu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) - ret <4 x i16> %tmp3 -} - -define <2 x i32> @vrhaddu32(<2 x i32>* %A, <2 x i32>* %B) nounwind { -;CHECK: vrhaddu32: -;CHECK: vrhadd.u32 - %tmp1 = load <2 x i32>* %A - %tmp2 = load <2 x i32>* %B - %tmp3 = call <2 x i32> @llvm.arm.neon.vrhaddu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) - ret <2 x i32> %tmp3 -} - -define <16 x i8> @vrhaddQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind { -;CHECK: vrhaddQs8: -;CHECK: vrhadd.s8 - %tmp1 = load <16 x i8>* %A - %tmp2 = load <16 x i8>* %B - %tmp3 = call <16 x i8> @llvm.arm.neon.vrhadds.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) - ret <16 x i8> %tmp3 -} - -define <8 x i16> @vrhaddQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind { -;CHECK: vrhaddQs16: -;CHECK: vrhadd.s16 - %tmp1 = load <8 x i16>* %A - %tmp2 = load <8 x i16>* %B - %tmp3 = call <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) - ret <8 x i16> %tmp3 -} - -define <4 x i32> @vrhaddQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind { -;CHECK: vrhaddQs32: -;CHECK: vrhadd.s32 - %tmp1 = load <4 x i32>* %A - %tmp2 = load <4 x i32>* %B - %tmp3 = call <4 x i32> @llvm.arm.neon.vrhadds.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) - ret <4 x i32> %tmp3 -} - -define <16 x i8> @vrhaddQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind { -;CHECK: vrhaddQu8: -;CHECK: vrhadd.u8 - %tmp1 = load <16 x i8>* %A - %tmp2 = load <16 x i8>* %B - %tmp3 = call <16 x i8> @llvm.arm.neon.vrhaddu.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) - ret <16 x i8> %tmp3 -} - -define <8 x i16> @vrhaddQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind { -;CHECK: vrhaddQu16: -;CHECK: vrhadd.u16 - %tmp1 = load <8 x i16>* %A - %tmp2 = load <8 x i16>* %B - %tmp3 = call <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) - ret <8 x i16> %tmp3 -} - -define <4 x i32> @vrhaddQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind { -;CHECK: vrhaddQu32: -;CHECK: vrhadd.u32 - %tmp1 = load <4 x i32>* %A - %tmp2 = load <4 x i32>* %B - %tmp3 = call <4 x i32> @llvm.arm.neon.vrhaddu.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) - ret <4 x i32> %tmp3 -} - -declare <8 x i8> @llvm.arm.neon.vrhadds.v8i8(<8 x i8>, <8 x i8>) nounwind readnone -declare <4 x i16> @llvm.arm.neon.vrhadds.v4i16(<4 x i16>, <4 x i16>) nounwind readnone -declare <2 x i32> @llvm.arm.neon.vrhadds.v2i32(<2 x i32>, <2 x i32>) nounwind readnone - -declare <8 x i8> @llvm.arm.neon.vrhaddu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone -declare <4 x i16> @llvm.arm.neon.vrhaddu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone -declare <2 x i32> @llvm.arm.neon.vrhaddu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone - -declare <16 x i8> @llvm.arm.neon.vrhadds.v16i8(<16 x i8>, <16 x i8>) nounwind readnone -declare <8 x i16> @llvm.arm.neon.vrhadds.v8i16(<8 x i16>, <8 x i16>) nounwind readnone -declare <4 x i32> @llvm.arm.neon.vrhadds.v4i32(<4 x i32>, <4 x i32>) nounwind readnone - -declare <16 x i8> @llvm.arm.neon.vrhaddu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone -declare <8 x i16> @llvm.arm.neon.vrhaddu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone -declare <4 x i32> @llvm.arm.neon.vrhaddu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone diff --git a/test/CodeGen/ARM/vrshl.ll b/test/CodeGen/ARM/vrshl.ll deleted file mode 100644 index 48fd388..0000000 --- a/test/CodeGen/ARM/vrshl.ll +++ /dev/null @@ -1,293 +0,0 @@ -; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s - -define <8 x i8> @vrshls8(<8 x i8>* %A, <8 x i8>* %B) nounwind { -;CHECK: vrshls8: -;CHECK: vrshl.s8 - %tmp1 = load <8 x i8>* %A - %tmp2 = load <8 x i8>* %B - %tmp3 = call <8 x i8> @llvm.arm.neon.vrshifts.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) - ret <8 x i8> %tmp3 -} - -define <4 x i16> @vrshls16(<4 x i16>* %A, <4 x i16>* %B) nounwind { -;CHECK: vrshls16: -;CHECK: vrshl.s16 - %tmp1 = load <4 x i16>* %A - %tmp2 = load <4 x i16>* %B - %tmp3 = call <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) - ret <4 x i16> %tmp3 -} - -define <2 x i32> @vrshls32(<2 x i32>* %A, <2 x i32>* %B) nounwind { -;CHECK: vrshls32: -;CHECK: vrshl.s32 - %tmp1 = load <2 x i32>* %A - %tmp2 = load <2 x i32>* %B - %tmp3 = call <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) - ret <2 x i32> %tmp3 -} - -define <1 x i64> @vrshls64(<1 x i64>* %A, <1 x i64>* %B) nounwind { -;CHECK: vrshls64: -;CHECK: vrshl.s64 - %tmp1 = load <1 x i64>* %A - %tmp2 = load <1 x i64>* %B - %tmp3 = call <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2) - ret <1 x i64> %tmp3 -} - -define <8 x i8> @vrshlu8(<8 x i8>* %A, <8 x i8>* %B) nounwind { -;CHECK: vrshlu8: -;CHECK: vrshl.u8 - %tmp1 = load <8 x i8>* %A - %tmp2 = load <8 x i8>* %B - %tmp3 = call <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) - ret <8 x i8> %tmp3 -} - -define <4 x i16> @vrshlu16(<4 x i16>* %A, <4 x i16>* %B) nounwind { -;CHECK: vrshlu16: -;CHECK: vrshl.u16 - %tmp1 = load <4 x i16>* %A - %tmp2 = load <4 x i16>* %B - %tmp3 = call <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) - ret <4 x i16> %tmp3 -} - -define <2 x i32> @vrshlu32(<2 x i32>* %A, <2 x i32>* %B) nounwind { -;CHECK: vrshlu32: -;CHECK: vrshl.u32 - %tmp1 = load <2 x i32>* %A - %tmp2 = load <2 x i32>* %B - %tmp3 = call <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) - ret <2 x i32> %tmp3 -} - -define <1 x i64> @vrshlu64(<1 x i64>* %A, <1 x i64>* %B) nounwind { -;CHECK: vrshlu64: -;CHECK: vrshl.u64 - %tmp1 = load <1 x i64>* %A - %tmp2 = load <1 x i64>* %B - %tmp3 = call <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2) - ret <1 x i64> %tmp3 -} - -define <16 x i8> @vrshlQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind { -;CHECK: vrshlQs8: -;CHECK: vrshl.s8 - %tmp1 = load <16 x i8>* %A - %tmp2 = load <16 x i8>* %B - %tmp3 = call <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) - ret <16 x i8> %tmp3 -} - -define <8 x i16> @vrshlQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind { -;CHECK: vrshlQs16: -;CHECK: vrshl.s16 - %tmp1 = load <8 x i16>* %A - %tmp2 = load <8 x i16>* %B - %tmp3 = call <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) - ret <8 x i16> %tmp3 -} - -define <4 x i32> @vrshlQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind { -;CHECK: vrshlQs32: -;CHECK: vrshl.s32 - %tmp1 = load <4 x i32>* %A - %tmp2 = load <4 x i32>* %B - %tmp3 = call <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) - ret <4 x i32> %tmp3 -} - -define <2 x i64> @vrshlQs64(<2 x i64>* %A, <2 x i64>* %B) nounwind { -;CHECK: vrshlQs64: -;CHECK: vrshl.s64 - %tmp1 = load <2 x i64>* %A - %tmp2 = load <2 x i64>* %B - %tmp3 = call <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2) - ret <2 x i64> %tmp3 -} - -define <16 x i8> @vrshlQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind { -;CHECK: vrshlQu8: -;CHECK: vrshl.u8 - %tmp1 = load <16 x i8>* %A - %tmp2 = load <16 x i8>* %B - %tmp3 = call <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) - ret <16 x i8> %tmp3 -} - -define <8 x i16> @vrshlQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind { -;CHECK: vrshlQu16: -;CHECK: vrshl.u16 - %tmp1 = load <8 x i16>* %A - %tmp2 = load <8 x i16>* %B - %tmp3 = call <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) - ret <8 x i16> %tmp3 -} - -define <4 x i32> @vrshlQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind { -;CHECK: vrshlQu32: -;CHECK: vrshl.u32 - %tmp1 = load <4 x i32>* %A - %tmp2 = load <4 x i32>* %B - %tmp3 = call <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) - ret <4 x i32> %tmp3 -} - -define <2 x i64> @vrshlQu64(<2 x i64>* %A, <2 x i64>* %B) nounwind { -;CHECK: vrshlQu64: -;CHECK: vrshl.u64 - %tmp1 = load <2 x i64>* %A - %tmp2 = load <2 x i64>* %B - %tmp3 = call <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2) - ret <2 x i64> %tmp3 -} - -define <8 x i8> @vrshrs8(<8 x i8>* %A) nounwind { -;CHECK: vrshrs8: -;CHECK: vrshr.s8 - %tmp1 = load <8 x i8>* %A - %tmp2 = call <8 x i8> @llvm.arm.neon.vrshifts.v8i8(<8 x i8> %tmp1, <8 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >) - ret <8 x i8> %tmp2 -} - -define <4 x i16> @vrshrs16(<4 x i16>* %A) nounwind { -;CHECK: vrshrs16: -;CHECK: vrshr.s16 - %tmp1 = load <4 x i16>* %A - %tmp2 = call <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16> %tmp1, <4 x i16> < i16 -16, i16 -16, i16 -16, i16 -16 >) - ret <4 x i16> %tmp2 -} - -define <2 x i32> @vrshrs32(<2 x i32>* %A) nounwind { -;CHECK: vrshrs32: -;CHECK: vrshr.s32 - %tmp1 = load <2 x i32>* %A - %tmp2 = call <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32> %tmp1, <2 x i32> < i32 -32, i32 -32 >) - ret <2 x i32> %tmp2 -} - -define <1 x i64> @vrshrs64(<1 x i64>* %A) nounwind { -;CHECK: vrshrs64: -;CHECK: vrshr.s64 - %tmp1 = load <1 x i64>* %A - %tmp2 = call <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64> %tmp1, <1 x i64> < i64 -64 >) - ret <1 x i64> %tmp2 -} - -define <8 x i8> @vrshru8(<8 x i8>* %A) nounwind { -;CHECK: vrshru8: -;CHECK: vrshr.u8 - %tmp1 = load <8 x i8>* %A - %tmp2 = call <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8> %tmp1, <8 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >) - ret <8 x i8> %tmp2 -} - -define <4 x i16> @vrshru16(<4 x i16>* %A) nounwind { -;CHECK: vrshru16: -;CHECK: vrshr.u16 - %tmp1 = load <4 x i16>* %A - %tmp2 = call <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16> %tmp1, <4 x i16> < i16 -16, i16 -16, i16 -16, i16 -16 >) - ret <4 x i16> %tmp2 -} - -define <2 x i32> @vrshru32(<2 x i32>* %A) nounwind { -;CHECK: vrshru32: -;CHECK: vrshr.u32 - %tmp1 = load <2 x i32>* %A - %tmp2 = call <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32> %tmp1, <2 x i32> < i32 -32, i32 -32 >) - ret <2 x i32> %tmp2 -} - -define <1 x i64> @vrshru64(<1 x i64>* %A) nounwind { -;CHECK: vrshru64: -;CHECK: vrshr.u64 - %tmp1 = load <1 x i64>* %A - %tmp2 = call <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64> %tmp1, <1 x i64> < i64 -64 >) - ret <1 x i64> %tmp2 -} - -define <16 x i8> @vrshrQs8(<16 x i8>* %A) nounwind { -;CHECK: vrshrQs8: -;CHECK: vrshr.s8 - %tmp1 = load <16 x i8>* %A - %tmp2 = call <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8> %tmp1, <16 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >) - ret <16 x i8> %tmp2 -} - -define <8 x i16> @vrshrQs16(<8 x i16>* %A) nounwind { -;CHECK: vrshrQs16: -;CHECK: vrshr.s16 - %tmp1 = load <8 x i16>* %A - %tmp2 = call <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16> %tmp1, <8 x i16> < i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16 >) - ret <8 x i16> %tmp2 -} - -define <4 x i32> @vrshrQs32(<4 x i32>* %A) nounwind { -;CHECK: vrshrQs32: -;CHECK: vrshr.s32 - %tmp1 = load <4 x i32>* %A - %tmp2 = call <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32> %tmp1, <4 x i32> < i32 -32, i32 -32, i32 -32, i32 -32 >) - ret <4 x i32> %tmp2 -} - -define <2 x i64> @vrshrQs64(<2 x i64>* %A) nounwind { -;CHECK: vrshrQs64: -;CHECK: vrshr.s64 - %tmp1 = load <2 x i64>* %A - %tmp2 = call <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64> %tmp1, <2 x i64> < i64 -64, i64 -64 >) - ret <2 x i64> %tmp2 -} - -define <16 x i8> @vrshrQu8(<16 x i8>* %A) nounwind { -;CHECK: vrshrQu8: -;CHECK: vrshr.u8 - %tmp1 = load <16 x i8>* %A - %tmp2 = call <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8> %tmp1, <16 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >) - ret <16 x i8> %tmp2 -} - -define <8 x i16> @vrshrQu16(<8 x i16>* %A) nounwind { -;CHECK: vrshrQu16: -;CHECK: vrshr.u16 - %tmp1 = load <8 x i16>* %A - %tmp2 = call <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16> %tmp1, <8 x i16> < i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16 >) - ret <8 x i16> %tmp2 -} - -define <4 x i32> @vrshrQu32(<4 x i32>* %A) nounwind { -;CHECK: vrshrQu32: -;CHECK: vrshr.u32 - %tmp1 = load <4 x i32>* %A - %tmp2 = call <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32> %tmp1, <4 x i32> < i32 -32, i32 -32, i32 -32, i32 -32 >) - ret <4 x i32> %tmp2 -} - -define <2 x i64> @vrshrQu64(<2 x i64>* %A) nounwind { -;CHECK: vrshrQu64: -;CHECK: vrshr.u64 - %tmp1 = load <2 x i64>* %A - %tmp2 = call <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64> %tmp1, <2 x i64> < i64 -64, i64 -64 >) - ret <2 x i64> %tmp2 -} - -declare <8 x i8> @llvm.arm.neon.vrshifts.v8i8(<8 x i8>, <8 x i8>) nounwind readnone -declare <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16>, <4 x i16>) nounwind readnone -declare <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32>, <2 x i32>) nounwind readnone -declare <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64>, <1 x i64>) nounwind readnone - -declare <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone -declare <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone -declare <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone -declare <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64>, <1 x i64>) nounwind readnone - -declare <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8>, <16 x i8>) nounwind readnone -declare <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16>, <8 x i16>) nounwind readnone -declare <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32>, <4 x i32>) nounwind readnone -declare <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64>, <2 x i64>) nounwind readnone - -declare <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone -declare <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone -declare <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone -declare <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64>, <2 x i64>) nounwind readnone diff --git a/test/CodeGen/ARM/vrshrn.ll b/test/CodeGen/ARM/vrshrn.ll deleted file mode 100644 index b0dedef..0000000 --- a/test/CodeGen/ARM/vrshrn.ll +++ /dev/null @@ -1,29 +0,0 @@ -; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s - -define <8 x i8> @vrshrns8(<8 x i16>* %A) nounwind { -;CHECK: vrshrns8: -;CHECK: vrshrn.i16 - %tmp1 = load <8 x i16>* %A - %tmp2 = call <8 x i8> @llvm.arm.neon.vrshiftn.v8i8(<8 x i16> %tmp1, <8 x i16> < i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8 >) - ret <8 x i8> %tmp2 -} - -define <4 x i16> @vrshrns16(<4 x i32>* %A) nounwind { -;CHECK: vrshrns16: -;CHECK: vrshrn.i32 - %tmp1 = load <4 x i32>* %A - %tmp2 = call <4 x i16> @llvm.arm.neon.vrshiftn.v4i16(<4 x i32> %tmp1, <4 x i32> < i32 -16, i32 -16, i32 -16, i32 -16 >) - ret <4 x i16> %tmp2 -} - -define <2 x i32> @vrshrns32(<2 x i64>* %A) nounwind { -;CHECK: vrshrns32: -;CHECK: vrshrn.i64 - %tmp1 = load <2 x i64>* %A - %tmp2 = call <2 x i32> @llvm.arm.neon.vrshiftn.v2i32(<2 x i64> %tmp1, <2 x i64> < i64 -32, i64 -32 >) - ret <2 x i32> %tmp2 -} - -declare <8 x i8> @llvm.arm.neon.vrshiftn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone -declare <4 x i16> @llvm.arm.neon.vrshiftn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone -declare <2 x i32> @llvm.arm.neon.vrshiftn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone diff --git a/test/CodeGen/ARM/vrsqrte.ll b/test/CodeGen/ARM/vrsqrte.ll deleted file mode 100644 index ef0aa8a..0000000 --- a/test/CodeGen/ARM/vrsqrte.ll +++ /dev/null @@ -1,39 +0,0 @@ -; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s - -define <2 x i32> @vrsqrtei32(<2 x i32>* %A) nounwind { -;CHECK: vrsqrtei32: -;CHECK: vrsqrte.u32 - %tmp1 = load <2 x i32>* %A - %tmp2 = call <2 x i32> @llvm.arm.neon.vrsqrte.v2i32(<2 x i32> %tmp1) - ret <2 x i32> %tmp2 -} - -define <4 x i32> @vrsqrteQi32(<4 x i32>* %A) nounwind { -;CHECK: vrsqrteQi32: -;CHECK: vrsqrte.u32 - %tmp1 = load <4 x i32>* %A - %tmp2 = call <4 x i32> @llvm.arm.neon.vrsqrte.v4i32(<4 x i32> %tmp1) - ret <4 x i32> %tmp2 -} - -define <2 x float> @vrsqrtef32(<2 x float>* %A) nounwind { -;CHECK: vrsqrtef32: -;CHECK: vrsqrte.f32 - %tmp1 = load <2 x float>* %A - %tmp2 = call <2 x float> @llvm.arm.neon.vrsqrte.v2f32(<2 x float> %tmp1) - ret <2 x float> %tmp2 -} - -define <4 x float> @vrsqrteQf32(<4 x float>* %A) nounwind { -;CHECK: vrsqrteQf32: -;CHECK: vrsqrte.f32 - %tmp1 = load <4 x float>* %A - %tmp2 = call <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float> %tmp1) - ret <4 x float> %tmp2 -} - -declare <2 x i32> @llvm.arm.neon.vrsqrte.v2i32(<2 x i32>) nounwind readnone -declare <4 x i32> @llvm.arm.neon.vrsqrte.v4i32(<4 x i32>) nounwind readnone - -declare <2 x float> @llvm.arm.neon.vrsqrte.v2f32(<2 x float>) nounwind readnone -declare <4 x float> @llvm.arm.neon.vrsqrte.v4f32(<4 x float>) nounwind readnone diff --git a/test/CodeGen/ARM/vrsqrts.ll b/test/CodeGen/ARM/vrsqrts.ll deleted file mode 100644 index 99f4714..0000000 --- a/test/CodeGen/ARM/vrsqrts.ll +++ /dev/null @@ -1,22 +0,0 @@ -; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s - -define <2 x float> @vrsqrtsf32(<2 x float>* %A, <2 x float>* %B) nounwind { -;CHECK: vrsqrtsf32: -;CHECK: vrsqrts.f32 - %tmp1 = load <2 x float>* %A - %tmp2 = load <2 x float>* %B - %tmp3 = call <2 x float> @llvm.arm.neon.vrsqrts.v2f32(<2 x float> %tmp1, <2 x float> %tmp2) - ret <2 x float> %tmp3 -} - -define <4 x float> @vrsqrtsQf32(<4 x float>* %A, <4 x float>* %B) nounwind { -;CHECK: vrsqrtsQf32: -;CHECK: vrsqrts.f32 - %tmp1 = load <4 x float>* %A - %tmp2 = load <4 x float>* %B - %tmp3 = call <4 x float> @llvm.arm.neon.vrsqrts.v4f32(<4 x float> %tmp1, <4 x float> %tmp2) - ret <4 x float> %tmp3 -} - -declare <2 x float> @llvm.arm.neon.vrsqrts.v2f32(<2 x float>, <2 x float>) nounwind readnone -declare <4 x float> @llvm.arm.neon.vrsqrts.v4f32(<4 x float>, <4 x float>) nounwind readnone diff --git a/test/CodeGen/ARM/vrsubhn.ll b/test/CodeGen/ARM/vrsubhn.ll deleted file mode 100644 index cd7b995..0000000 --- a/test/CodeGen/ARM/vrsubhn.ll +++ /dev/null @@ -1,32 +0,0 @@ -; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s - -define <8 x i8> @vrsubhni16(<8 x i16>* %A, <8 x i16>* %B) nounwind { -;CHECK: vrsubhni16: -;CHECK: vrsubhn.i16 - %tmp1 = load <8 x i16>* %A - %tmp2 = load <8 x i16>* %B - %tmp3 = call <8 x i8> @llvm.arm.neon.vrsubhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2) - ret <8 x i8> %tmp3 -} - -define <4 x i16> @vrsubhni32(<4 x i32>* %A, <4 x i32>* %B) nounwind { -;CHECK: vrsubhni32: -;CHECK: vrsubhn.i32 - %tmp1 = load <4 x i32>* %A - %tmp2 = load <4 x i32>* %B - %tmp3 = call <4 x i16> @llvm.arm.neon.vrsubhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2) - ret <4 x i16> %tmp3 -} - -define <2 x i32> @vrsubhni64(<2 x i64>* %A, <2 x i64>* %B) nounwind { -;CHECK: vrsubhni64: -;CHECK: vrsubhn.i64 - %tmp1 = load <2 x i64>* %A - %tmp2 = load <2 x i64>* %B - %tmp3 = call <2 x i32> @llvm.arm.neon.vrsubhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2) - ret <2 x i32> %tmp3 -} - -declare <8 x i8> @llvm.arm.neon.vrsubhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone -declare <4 x i16> @llvm.arm.neon.vrsubhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone -declare <2 x i32> @llvm.arm.neon.vrsubhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone diff --git a/test/CodeGen/ARM/vset_lane.ll b/test/CodeGen/ARM/vset_lane.ll deleted file mode 100644 index 65d246d..0000000 --- a/test/CodeGen/ARM/vset_lane.ll +++ /dev/null @@ -1,58 +0,0 @@ -; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s - -define <8 x i8> @vset_lane8(<8 x i8>* %A, i8 %B) nounwind { -;CHECK: vset_lane8: -;CHECK: vmov.8 - %tmp1 = load <8 x i8>* %A - %tmp2 = insertelement <8 x i8> %tmp1, i8 %B, i32 1 - ret <8 x i8> %tmp2 -} - -define <4 x i16> @vset_lane16(<4 x i16>* %A, i16 %B) nounwind { -;CHECK: vset_lane16: -;CHECK: vmov.16 - %tmp1 = load <4 x i16>* %A - %tmp2 = insertelement <4 x i16> %tmp1, i16 %B, i32 1 - ret <4 x i16> %tmp2 -} - -define <2 x i32> @vset_lane32(<2 x i32>* %A, i32 %B) nounwind { -;CHECK: vset_lane32: -;CHECK: vmov.32 - %tmp1 = load <2 x i32>* %A - %tmp2 = insertelement <2 x i32> %tmp1, i32 %B, i32 1 - ret <2 x i32> %tmp2 -} - -define <16 x i8> @vsetQ_lane8(<16 x i8>* %A, i8 %B) nounwind { -;CHECK: vsetQ_lane8: -;CHECK: vmov.8 - %tmp1 = load <16 x i8>* %A - %tmp2 = insertelement <16 x i8> %tmp1, i8 %B, i32 1 - ret <16 x i8> %tmp2 -} - -define <8 x i16> @vsetQ_lane16(<8 x i16>* %A, i16 %B) nounwind { -;CHECK: vsetQ_lane16: -;CHECK: vmov.16 - %tmp1 = load <8 x i16>* %A - %tmp2 = insertelement <8 x i16> %tmp1, i16 %B, i32 1 - ret <8 x i16> %tmp2 -} - -define <4 x i32> @vsetQ_lane32(<4 x i32>* %A, i32 %B) nounwind { -;CHECK: vsetQ_lane32: -;CHECK: vmov.32 - %tmp1 = load <4 x i32>* %A - %tmp2 = insertelement <4 x i32> %tmp1, i32 %B, i32 1 - ret <4 x i32> %tmp2 -} - -define arm_aapcs_vfpcc <2 x float> @test_vset_lanef32(float %arg0_float32_t, <2 x float> %arg1_float32x2_t) nounwind { -;CHECK: test_vset_lanef32: -;CHECK: fcpys -;CHECK: fcpys -entry: - %0 = insertelement <2 x float> %arg1_float32x2_t, float %arg0_float32_t, i32 1 ; <<2 x float>> [#uses=1] - ret <2 x float> %0 -} diff --git a/test/CodeGen/ARM/vshift.ll b/test/CodeGen/ARM/vshift.ll index 996858f..f3cbec7 100644 --- a/test/CodeGen/ARM/vshift.ll +++ b/test/CodeGen/ARM/vshift.ll @@ -280,6 +280,13 @@ define <2 x i64> @vlshrQi64(<2 x i64>* %A) nounwind { ret <2 x i64> %tmp2 } +; Example that requires splitting and expanding a vector shift. +define <2 x i64> @update(<2 x i64> %val) nounwind readnone { +entry: + %shr = lshr <2 x i64> %val, < i64 2, i64 2 > ; <<2 x i64>> [#uses=1] + ret <2 x i64> %shr +} + define <8 x i8> @vashrs8(<8 x i8>* %A, <8 x i8>* %B) nounwind { ;CHECK: vashrs8: ;CHECK: vneg.s8 diff --git a/test/CodeGen/ARM/vshift_split.ll b/test/CodeGen/ARM/vshift_split.ll deleted file mode 100644 index f05921f..0000000 --- a/test/CodeGen/ARM/vshift_split.ll +++ /dev/null @@ -1,8 +0,0 @@ -; RUN: llc < %s -march=arm -mattr=-neon - -; Example that requires splitting and expanding a vector shift. -define <2 x i64> @update(<2 x i64> %val) nounwind readnone { -entry: - %shr = lshr <2 x i64> %val, < i64 2, i64 2 > ; <<2 x i64>> [#uses=1] - ret <2 x i64> %shr -} diff --git a/test/CodeGen/ARM/vshl.ll b/test/CodeGen/ARM/vshl.ll index b0d0315..818e71b 100644 --- a/test/CodeGen/ARM/vshl.ll +++ b/test/CodeGen/ARM/vshl.ll @@ -360,3 +360,295 @@ declare <16 x i8> @llvm.arm.neon.vshiftu.v16i8(<16 x i8>, <16 x i8>) nounwind re declare <8 x i16> @llvm.arm.neon.vshiftu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone declare <4 x i32> @llvm.arm.neon.vshiftu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone declare <2 x i64> @llvm.arm.neon.vshiftu.v2i64(<2 x i64>, <2 x i64>) nounwind readnone + +define <8 x i8> @vrshls8(<8 x i8>* %A, <8 x i8>* %B) nounwind { +;CHECK: vrshls8: +;CHECK: vrshl.s8 + %tmp1 = load <8 x i8>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = call <8 x i8> @llvm.arm.neon.vrshifts.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) + ret <8 x i8> %tmp3 +} + +define <4 x i16> @vrshls16(<4 x i16>* %A, <4 x i16>* %B) nounwind { +;CHECK: vrshls16: +;CHECK: vrshl.s16 + %tmp1 = load <4 x i16>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = call <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) + ret <4 x i16> %tmp3 +} + +define <2 x i32> @vrshls32(<2 x i32>* %A, <2 x i32>* %B) nounwind { +;CHECK: vrshls32: +;CHECK: vrshl.s32 + %tmp1 = load <2 x i32>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = call <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) + ret <2 x i32> %tmp3 +} + +define <1 x i64> @vrshls64(<1 x i64>* %A, <1 x i64>* %B) nounwind { +;CHECK: vrshls64: +;CHECK: vrshl.s64 + %tmp1 = load <1 x i64>* %A + %tmp2 = load <1 x i64>* %B + %tmp3 = call <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2) + ret <1 x i64> %tmp3 +} + +define <8 x i8> @vrshlu8(<8 x i8>* %A, <8 x i8>* %B) nounwind { +;CHECK: vrshlu8: +;CHECK: vrshl.u8 + %tmp1 = load <8 x i8>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = call <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) + ret <8 x i8> %tmp3 +} + +define <4 x i16> @vrshlu16(<4 x i16>* %A, <4 x i16>* %B) nounwind { +;CHECK: vrshlu16: +;CHECK: vrshl.u16 + %tmp1 = load <4 x i16>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = call <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) + ret <4 x i16> %tmp3 +} + +define <2 x i32> @vrshlu32(<2 x i32>* %A, <2 x i32>* %B) nounwind { +;CHECK: vrshlu32: +;CHECK: vrshl.u32 + %tmp1 = load <2 x i32>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = call <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) + ret <2 x i32> %tmp3 +} + +define <1 x i64> @vrshlu64(<1 x i64>* %A, <1 x i64>* %B) nounwind { +;CHECK: vrshlu64: +;CHECK: vrshl.u64 + %tmp1 = load <1 x i64>* %A + %tmp2 = load <1 x i64>* %B + %tmp3 = call <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64> %tmp1, <1 x i64> %tmp2) + ret <1 x i64> %tmp3 +} + +define <16 x i8> @vrshlQs8(<16 x i8>* %A, <16 x i8>* %B) nounwind { +;CHECK: vrshlQs8: +;CHECK: vrshl.s8 + %tmp1 = load <16 x i8>* %A + %tmp2 = load <16 x i8>* %B + %tmp3 = call <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) + ret <16 x i8> %tmp3 +} + +define <8 x i16> @vrshlQs16(<8 x i16>* %A, <8 x i16>* %B) nounwind { +;CHECK: vrshlQs16: +;CHECK: vrshl.s16 + %tmp1 = load <8 x i16>* %A + %tmp2 = load <8 x i16>* %B + %tmp3 = call <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) + ret <8 x i16> %tmp3 +} + +define <4 x i32> @vrshlQs32(<4 x i32>* %A, <4 x i32>* %B) nounwind { +;CHECK: vrshlQs32: +;CHECK: vrshl.s32 + %tmp1 = load <4 x i32>* %A + %tmp2 = load <4 x i32>* %B + %tmp3 = call <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) + ret <4 x i32> %tmp3 +} + +define <2 x i64> @vrshlQs64(<2 x i64>* %A, <2 x i64>* %B) nounwind { +;CHECK: vrshlQs64: +;CHECK: vrshl.s64 + %tmp1 = load <2 x i64>* %A + %tmp2 = load <2 x i64>* %B + %tmp3 = call <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2) + ret <2 x i64> %tmp3 +} + +define <16 x i8> @vrshlQu8(<16 x i8>* %A, <16 x i8>* %B) nounwind { +;CHECK: vrshlQu8: +;CHECK: vrshl.u8 + %tmp1 = load <16 x i8>* %A + %tmp2 = load <16 x i8>* %B + %tmp3 = call <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) + ret <16 x i8> %tmp3 +} + +define <8 x i16> @vrshlQu16(<8 x i16>* %A, <8 x i16>* %B) nounwind { +;CHECK: vrshlQu16: +;CHECK: vrshl.u16 + %tmp1 = load <8 x i16>* %A + %tmp2 = load <8 x i16>* %B + %tmp3 = call <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) + ret <8 x i16> %tmp3 +} + +define <4 x i32> @vrshlQu32(<4 x i32>* %A, <4 x i32>* %B) nounwind { +;CHECK: vrshlQu32: +;CHECK: vrshl.u32 + %tmp1 = load <4 x i32>* %A + %tmp2 = load <4 x i32>* %B + %tmp3 = call <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) + ret <4 x i32> %tmp3 +} + +define <2 x i64> @vrshlQu64(<2 x i64>* %A, <2 x i64>* %B) nounwind { +;CHECK: vrshlQu64: +;CHECK: vrshl.u64 + %tmp1 = load <2 x i64>* %A + %tmp2 = load <2 x i64>* %B + %tmp3 = call <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2) + ret <2 x i64> %tmp3 +} + +define <8 x i8> @vrshrs8(<8 x i8>* %A) nounwind { +;CHECK: vrshrs8: +;CHECK: vrshr.s8 + %tmp1 = load <8 x i8>* %A + %tmp2 = call <8 x i8> @llvm.arm.neon.vrshifts.v8i8(<8 x i8> %tmp1, <8 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >) + ret <8 x i8> %tmp2 +} + +define <4 x i16> @vrshrs16(<4 x i16>* %A) nounwind { +;CHECK: vrshrs16: +;CHECK: vrshr.s16 + %tmp1 = load <4 x i16>* %A + %tmp2 = call <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16> %tmp1, <4 x i16> < i16 -16, i16 -16, i16 -16, i16 -16 >) + ret <4 x i16> %tmp2 +} + +define <2 x i32> @vrshrs32(<2 x i32>* %A) nounwind { +;CHECK: vrshrs32: +;CHECK: vrshr.s32 + %tmp1 = load <2 x i32>* %A + %tmp2 = call <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32> %tmp1, <2 x i32> < i32 -32, i32 -32 >) + ret <2 x i32> %tmp2 +} + +define <1 x i64> @vrshrs64(<1 x i64>* %A) nounwind { +;CHECK: vrshrs64: +;CHECK: vrshr.s64 + %tmp1 = load <1 x i64>* %A + %tmp2 = call <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64> %tmp1, <1 x i64> < i64 -64 >) + ret <1 x i64> %tmp2 +} + +define <8 x i8> @vrshru8(<8 x i8>* %A) nounwind { +;CHECK: vrshru8: +;CHECK: vrshr.u8 + %tmp1 = load <8 x i8>* %A + %tmp2 = call <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8> %tmp1, <8 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >) + ret <8 x i8> %tmp2 +} + +define <4 x i16> @vrshru16(<4 x i16>* %A) nounwind { +;CHECK: vrshru16: +;CHECK: vrshr.u16 + %tmp1 = load <4 x i16>* %A + %tmp2 = call <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16> %tmp1, <4 x i16> < i16 -16, i16 -16, i16 -16, i16 -16 >) + ret <4 x i16> %tmp2 +} + +define <2 x i32> @vrshru32(<2 x i32>* %A) nounwind { +;CHECK: vrshru32: +;CHECK: vrshr.u32 + %tmp1 = load <2 x i32>* %A + %tmp2 = call <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32> %tmp1, <2 x i32> < i32 -32, i32 -32 >) + ret <2 x i32> %tmp2 +} + +define <1 x i64> @vrshru64(<1 x i64>* %A) nounwind { +;CHECK: vrshru64: +;CHECK: vrshr.u64 + %tmp1 = load <1 x i64>* %A + %tmp2 = call <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64> %tmp1, <1 x i64> < i64 -64 >) + ret <1 x i64> %tmp2 +} + +define <16 x i8> @vrshrQs8(<16 x i8>* %A) nounwind { +;CHECK: vrshrQs8: +;CHECK: vrshr.s8 + %tmp1 = load <16 x i8>* %A + %tmp2 = call <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8> %tmp1, <16 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >) + ret <16 x i8> %tmp2 +} + +define <8 x i16> @vrshrQs16(<8 x i16>* %A) nounwind { +;CHECK: vrshrQs16: +;CHECK: vrshr.s16 + %tmp1 = load <8 x i16>* %A + %tmp2 = call <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16> %tmp1, <8 x i16> < i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16 >) + ret <8 x i16> %tmp2 +} + +define <4 x i32> @vrshrQs32(<4 x i32>* %A) nounwind { +;CHECK: vrshrQs32: +;CHECK: vrshr.s32 + %tmp1 = load <4 x i32>* %A + %tmp2 = call <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32> %tmp1, <4 x i32> < i32 -32, i32 -32, i32 -32, i32 -32 >) + ret <4 x i32> %tmp2 +} + +define <2 x i64> @vrshrQs64(<2 x i64>* %A) nounwind { +;CHECK: vrshrQs64: +;CHECK: vrshr.s64 + %tmp1 = load <2 x i64>* %A + %tmp2 = call <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64> %tmp1, <2 x i64> < i64 -64, i64 -64 >) + ret <2 x i64> %tmp2 +} + +define <16 x i8> @vrshrQu8(<16 x i8>* %A) nounwind { +;CHECK: vrshrQu8: +;CHECK: vrshr.u8 + %tmp1 = load <16 x i8>* %A + %tmp2 = call <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8> %tmp1, <16 x i8> < i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8, i8 -8 >) + ret <16 x i8> %tmp2 +} + +define <8 x i16> @vrshrQu16(<8 x i16>* %A) nounwind { +;CHECK: vrshrQu16: +;CHECK: vrshr.u16 + %tmp1 = load <8 x i16>* %A + %tmp2 = call <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16> %tmp1, <8 x i16> < i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16, i16 -16 >) + ret <8 x i16> %tmp2 +} + +define <4 x i32> @vrshrQu32(<4 x i32>* %A) nounwind { +;CHECK: vrshrQu32: +;CHECK: vrshr.u32 + %tmp1 = load <4 x i32>* %A + %tmp2 = call <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32> %tmp1, <4 x i32> < i32 -32, i32 -32, i32 -32, i32 -32 >) + ret <4 x i32> %tmp2 +} + +define <2 x i64> @vrshrQu64(<2 x i64>* %A) nounwind { +;CHECK: vrshrQu64: +;CHECK: vrshr.u64 + %tmp1 = load <2 x i64>* %A + %tmp2 = call <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64> %tmp1, <2 x i64> < i64 -64, i64 -64 >) + ret <2 x i64> %tmp2 +} + +declare <8 x i8> @llvm.arm.neon.vrshifts.v8i8(<8 x i8>, <8 x i8>) nounwind readnone +declare <4 x i16> @llvm.arm.neon.vrshifts.v4i16(<4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vrshifts.v2i32(<2 x i32>, <2 x i32>) nounwind readnone +declare <1 x i64> @llvm.arm.neon.vrshifts.v1i64(<1 x i64>, <1 x i64>) nounwind readnone + +declare <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8>, <8 x i8>) nounwind readnone +declare <4 x i16> @llvm.arm.neon.vrshiftu.v4i16(<4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vrshiftu.v2i32(<2 x i32>, <2 x i32>) nounwind readnone +declare <1 x i64> @llvm.arm.neon.vrshiftu.v1i64(<1 x i64>, <1 x i64>) nounwind readnone + +declare <16 x i8> @llvm.arm.neon.vrshifts.v16i8(<16 x i8>, <16 x i8>) nounwind readnone +declare <8 x i16> @llvm.arm.neon.vrshifts.v8i16(<8 x i16>, <8 x i16>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vrshifts.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <2 x i64> @llvm.arm.neon.vrshifts.v2i64(<2 x i64>, <2 x i64>) nounwind readnone + +declare <16 x i8> @llvm.arm.neon.vrshiftu.v16i8(<16 x i8>, <16 x i8>) nounwind readnone +declare <8 x i16> @llvm.arm.neon.vrshiftu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vrshiftu.v4i32(<4 x i32>, <4 x i32>) nounwind readnone +declare <2 x i64> @llvm.arm.neon.vrshiftu.v2i64(<2 x i64>, <2 x i64>) nounwind readnone diff --git a/test/CodeGen/ARM/vshrn.ll b/test/CodeGen/ARM/vshrn.ll index 01324fa..e2544f4 100644 --- a/test/CodeGen/ARM/vshrn.ll +++ b/test/CodeGen/ARM/vshrn.ll @@ -27,3 +27,31 @@ define <2 x i32> @vshrns32(<2 x i64>* %A) nounwind { declare <8 x i8> @llvm.arm.neon.vshiftn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone declare <4 x i16> @llvm.arm.neon.vshiftn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone declare <2 x i32> @llvm.arm.neon.vshiftn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone + +define <8 x i8> @vrshrns8(<8 x i16>* %A) nounwind { +;CHECK: vrshrns8: +;CHECK: vrshrn.i16 + %tmp1 = load <8 x i16>* %A + %tmp2 = call <8 x i8> @llvm.arm.neon.vrshiftn.v8i8(<8 x i16> %tmp1, <8 x i16> < i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8 >) + ret <8 x i8> %tmp2 +} + +define <4 x i16> @vrshrns16(<4 x i32>* %A) nounwind { +;CHECK: vrshrns16: +;CHECK: vrshrn.i32 + %tmp1 = load <4 x i32>* %A + %tmp2 = call <4 x i16> @llvm.arm.neon.vrshiftn.v4i16(<4 x i32> %tmp1, <4 x i32> < i32 -16, i32 -16, i32 -16, i32 -16 >) + ret <4 x i16> %tmp2 +} + +define <2 x i32> @vrshrns32(<2 x i64>* %A) nounwind { +;CHECK: vrshrns32: +;CHECK: vrshrn.i64 + %tmp1 = load <2 x i64>* %A + %tmp2 = call <2 x i32> @llvm.arm.neon.vrshiftn.v2i32(<2 x i64> %tmp1, <2 x i64> < i64 -32, i64 -32 >) + ret <2 x i32> %tmp2 +} + +declare <8 x i8> @llvm.arm.neon.vrshiftn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone +declare <4 x i16> @llvm.arm.neon.vrshiftn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vrshiftn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone diff --git a/test/CodeGen/ARM/vsub.ll b/test/CodeGen/ARM/vsub.ll index 5880442..8f0055f 100644 --- a/test/CodeGen/ARM/vsub.ll +++ b/test/CodeGen/ARM/vsub.ll @@ -89,3 +89,189 @@ define <4 x float> @vsubQf32(<4 x float>* %A, <4 x float>* %B) nounwind { %tmp3 = sub <4 x float> %tmp1, %tmp2 ret <4 x float> %tmp3 } + +define <8 x i8> @vsubhni16(<8 x i16>* %A, <8 x i16>* %B) nounwind { +;CHECK: vsubhni16: +;CHECK: vsubhn.i16 + %tmp1 = load <8 x i16>* %A + %tmp2 = load <8 x i16>* %B + %tmp3 = call <8 x i8> @llvm.arm.neon.vsubhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2) + ret <8 x i8> %tmp3 +} + +define <4 x i16> @vsubhni32(<4 x i32>* %A, <4 x i32>* %B) nounwind { +;CHECK: vsubhni32: +;CHECK: vsubhn.i32 + %tmp1 = load <4 x i32>* %A + %tmp2 = load <4 x i32>* %B + %tmp3 = call <4 x i16> @llvm.arm.neon.vsubhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2) + ret <4 x i16> %tmp3 +} + +define <2 x i32> @vsubhni64(<2 x i64>* %A, <2 x i64>* %B) nounwind { +;CHECK: vsubhni64: +;CHECK: vsubhn.i64 + %tmp1 = load <2 x i64>* %A + %tmp2 = load <2 x i64>* %B + %tmp3 = call <2 x i32> @llvm.arm.neon.vsubhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2) + ret <2 x i32> %tmp3 +} + +declare <8 x i8> @llvm.arm.neon.vsubhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone +declare <4 x i16> @llvm.arm.neon.vsubhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vsubhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone + +define <8 x i8> @vrsubhni16(<8 x i16>* %A, <8 x i16>* %B) nounwind { +;CHECK: vrsubhni16: +;CHECK: vrsubhn.i16 + %tmp1 = load <8 x i16>* %A + %tmp2 = load <8 x i16>* %B + %tmp3 = call <8 x i8> @llvm.arm.neon.vrsubhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2) + ret <8 x i8> %tmp3 +} + +define <4 x i16> @vrsubhni32(<4 x i32>* %A, <4 x i32>* %B) nounwind { +;CHECK: vrsubhni32: +;CHECK: vrsubhn.i32 + %tmp1 = load <4 x i32>* %A + %tmp2 = load <4 x i32>* %B + %tmp3 = call <4 x i16> @llvm.arm.neon.vrsubhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2) + ret <4 x i16> %tmp3 +} + +define <2 x i32> @vrsubhni64(<2 x i64>* %A, <2 x i64>* %B) nounwind { +;CHECK: vrsubhni64: +;CHECK: vrsubhn.i64 + %tmp1 = load <2 x i64>* %A + %tmp2 = load <2 x i64>* %B + %tmp3 = call <2 x i32> @llvm.arm.neon.vrsubhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2) + ret <2 x i32> %tmp3 +} + +declare <8 x i8> @llvm.arm.neon.vrsubhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone +declare <4 x i16> @llvm.arm.neon.vrsubhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone +declare <2 x i32> @llvm.arm.neon.vrsubhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone + +define <8 x i16> @vsubls8(<8 x i8>* %A, <8 x i8>* %B) nounwind { +;CHECK: vsubls8: +;CHECK: vsubl.s8 + %tmp1 = load <8 x i8>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = call <8 x i16> @llvm.arm.neon.vsubls.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2) + ret <8 x i16> %tmp3 +} + +define <4 x i32> @vsubls16(<4 x i16>* %A, <4 x i16>* %B) nounwind { +;CHECK: vsubls16: +;CHECK: vsubl.s16 + %tmp1 = load <4 x i16>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = call <4 x i32> @llvm.arm.neon.vsubls.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2) + ret <4 x i32> %tmp3 +} + +define <2 x i64> @vsubls32(<2 x i32>* %A, <2 x i32>* %B) nounwind { +;CHECK: vsubls32: +;CHECK: vsubl.s32 + %tmp1 = load <2 x i32>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = call <2 x i64> @llvm.arm.neon.vsubls.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2) + ret <2 x i64> %tmp3 +} + +define <8 x i16> @vsublu8(<8 x i8>* %A, <8 x i8>* %B) nounwind { +;CHECK: vsublu8: +;CHECK: vsubl.u8 + %tmp1 = load <8 x i8>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = call <8 x i16> @llvm.arm.neon.vsublu.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2) + ret <8 x i16> %tmp3 +} + +define <4 x i32> @vsublu16(<4 x i16>* %A, <4 x i16>* %B) nounwind { +;CHECK: vsublu16: +;CHECK: vsubl.u16 + %tmp1 = load <4 x i16>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = call <4 x i32> @llvm.arm.neon.vsublu.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2) + ret <4 x i32> %tmp3 +} + +define <2 x i64> @vsublu32(<2 x i32>* %A, <2 x i32>* %B) nounwind { +;CHECK: vsublu32: +;CHECK: vsubl.u32 + %tmp1 = load <2 x i32>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = call <2 x i64> @llvm.arm.neon.vsublu.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2) + ret <2 x i64> %tmp3 +} + +declare <8 x i16> @llvm.arm.neon.vsubls.v8i16(<8 x i8>, <8 x i8>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vsubls.v4i32(<4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i64> @llvm.arm.neon.vsubls.v2i64(<2 x i32>, <2 x i32>) nounwind readnone + +declare <8 x i16> @llvm.arm.neon.vsublu.v8i16(<8 x i8>, <8 x i8>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vsublu.v4i32(<4 x i16>, <4 x i16>) nounwind readnone +declare <2 x i64> @llvm.arm.neon.vsublu.v2i64(<2 x i32>, <2 x i32>) nounwind readnone + +define <8 x i16> @vsubws8(<8 x i16>* %A, <8 x i8>* %B) nounwind { +;CHECK: vsubws8: +;CHECK: vsubw.s8 + %tmp1 = load <8 x i16>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = call <8 x i16> @llvm.arm.neon.vsubws.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2) + ret <8 x i16> %tmp3 +} + +define <4 x i32> @vsubws16(<4 x i32>* %A, <4 x i16>* %B) nounwind { +;CHECK: vsubws16: +;CHECK: vsubw.s16 + %tmp1 = load <4 x i32>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = call <4 x i32> @llvm.arm.neon.vsubws.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2) + ret <4 x i32> %tmp3 +} + +define <2 x i64> @vsubws32(<2 x i64>* %A, <2 x i32>* %B) nounwind { +;CHECK: vsubws32: +;CHECK: vsubw.s32 + %tmp1 = load <2 x i64>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = call <2 x i64> @llvm.arm.neon.vsubws.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2) + ret <2 x i64> %tmp3 +} + +define <8 x i16> @vsubwu8(<8 x i16>* %A, <8 x i8>* %B) nounwind { +;CHECK: vsubwu8: +;CHECK: vsubw.u8 + %tmp1 = load <8 x i16>* %A + %tmp2 = load <8 x i8>* %B + %tmp3 = call <8 x i16> @llvm.arm.neon.vsubwu.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2) + ret <8 x i16> %tmp3 +} + +define <4 x i32> @vsubwu16(<4 x i32>* %A, <4 x i16>* %B) nounwind { +;CHECK: vsubwu16: +;CHECK: vsubw.u16 + %tmp1 = load <4 x i32>* %A + %tmp2 = load <4 x i16>* %B + %tmp3 = call <4 x i32> @llvm.arm.neon.vsubwu.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2) + ret <4 x i32> %tmp3 +} + +define <2 x i64> @vsubwu32(<2 x i64>* %A, <2 x i32>* %B) nounwind { +;CHECK: vsubwu32: +;CHECK: vsubw.u32 + %tmp1 = load <2 x i64>* %A + %tmp2 = load <2 x i32>* %B + %tmp3 = call <2 x i64> @llvm.arm.neon.vsubwu.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2) + ret <2 x i64> %tmp3 +} + +declare <8 x i16> @llvm.arm.neon.vsubws.v8i16(<8 x i16>, <8 x i8>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vsubws.v4i32(<4 x i32>, <4 x i16>) nounwind readnone +declare <2 x i64> @llvm.arm.neon.vsubws.v2i64(<2 x i64>, <2 x i32>) nounwind readnone + +declare <8 x i16> @llvm.arm.neon.vsubwu.v8i16(<8 x i16>, <8 x i8>) nounwind readnone +declare <4 x i32> @llvm.arm.neon.vsubwu.v4i32(<4 x i32>, <4 x i16>) nounwind readnone +declare <2 x i64> @llvm.arm.neon.vsubwu.v2i64(<2 x i64>, <2 x i32>) nounwind readnone diff --git a/test/CodeGen/ARM/vsubhn.ll b/test/CodeGen/ARM/vsubhn.ll deleted file mode 100644 index 93645ef..0000000 --- a/test/CodeGen/ARM/vsubhn.ll +++ /dev/null @@ -1,32 +0,0 @@ -; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s - -define <8 x i8> @vsubhni16(<8 x i16>* %A, <8 x i16>* %B) nounwind { -;CHECK: vsubhni16: -;CHECK: vsubhn.i16 - %tmp1 = load <8 x i16>* %A - %tmp2 = load <8 x i16>* %B - %tmp3 = call <8 x i8> @llvm.arm.neon.vsubhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2) - ret <8 x i8> %tmp3 -} - -define <4 x i16> @vsubhni32(<4 x i32>* %A, <4 x i32>* %B) nounwind { -;CHECK: vsubhni32: -;CHECK: vsubhn.i32 - %tmp1 = load <4 x i32>* %A - %tmp2 = load <4 x i32>* %B - %tmp3 = call <4 x i16> @llvm.arm.neon.vsubhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2) - ret <4 x i16> %tmp3 -} - -define <2 x i32> @vsubhni64(<2 x i64>* %A, <2 x i64>* %B) nounwind { -;CHECK: vsubhni64: -;CHECK: vsubhn.i64 - %tmp1 = load <2 x i64>* %A - %tmp2 = load <2 x i64>* %B - %tmp3 = call <2 x i32> @llvm.arm.neon.vsubhn.v2i32(<2 x i64> %tmp1, <2 x i64> %tmp2) - ret <2 x i32> %tmp3 -} - -declare <8 x i8> @llvm.arm.neon.vsubhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone -declare <4 x i16> @llvm.arm.neon.vsubhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone -declare <2 x i32> @llvm.arm.neon.vsubhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone diff --git a/test/CodeGen/ARM/vsubl.ll b/test/CodeGen/ARM/vsubl.ll deleted file mode 100644 index 9a9bcdb..0000000 --- a/test/CodeGen/ARM/vsubl.ll +++ /dev/null @@ -1,63 +0,0 @@ -; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s - -define <8 x i16> @vsubls8(<8 x i8>* %A, <8 x i8>* %B) nounwind { -;CHECK: vsubls8: -;CHECK: vsubl.s8 - %tmp1 = load <8 x i8>* %A - %tmp2 = load <8 x i8>* %B - %tmp3 = call <8 x i16> @llvm.arm.neon.vsubls.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2) - ret <8 x i16> %tmp3 -} - -define <4 x i32> @vsubls16(<4 x i16>* %A, <4 x i16>* %B) nounwind { -;CHECK: vsubls16: -;CHECK: vsubl.s16 - %tmp1 = load <4 x i16>* %A - %tmp2 = load <4 x i16>* %B - %tmp3 = call <4 x i32> @llvm.arm.neon.vsubls.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2) - ret <4 x i32> %tmp3 -} - -define <2 x i64> @vsubls32(<2 x i32>* %A, <2 x i32>* %B) nounwind { -;CHECK: vsubls32: -;CHECK: vsubl.s32 - %tmp1 = load <2 x i32>* %A - %tmp2 = load <2 x i32>* %B - %tmp3 = call <2 x i64> @llvm.arm.neon.vsubls.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2) - ret <2 x i64> %tmp3 -} - -define <8 x i16> @vsublu8(<8 x i8>* %A, <8 x i8>* %B) nounwind { -;CHECK: vsublu8: -;CHECK: vsubl.u8 - %tmp1 = load <8 x i8>* %A - %tmp2 = load <8 x i8>* %B - %tmp3 = call <8 x i16> @llvm.arm.neon.vsublu.v8i16(<8 x i8> %tmp1, <8 x i8> %tmp2) - ret <8 x i16> %tmp3 -} - -define <4 x i32> @vsublu16(<4 x i16>* %A, <4 x i16>* %B) nounwind { -;CHECK: vsublu16: -;CHECK: vsubl.u16 - %tmp1 = load <4 x i16>* %A - %tmp2 = load <4 x i16>* %B - %tmp3 = call <4 x i32> @llvm.arm.neon.vsublu.v4i32(<4 x i16> %tmp1, <4 x i16> %tmp2) - ret <4 x i32> %tmp3 -} - -define <2 x i64> @vsublu32(<2 x i32>* %A, <2 x i32>* %B) nounwind { -;CHECK: vsublu32: -;CHECK: vsubl.u32 - %tmp1 = load <2 x i32>* %A - %tmp2 = load <2 x i32>* %B - %tmp3 = call <2 x i64> @llvm.arm.neon.vsublu.v2i64(<2 x i32> %tmp1, <2 x i32> %tmp2) - ret <2 x i64> %tmp3 -} - -declare <8 x i16> @llvm.arm.neon.vsubls.v8i16(<8 x i8>, <8 x i8>) nounwind readnone -declare <4 x i32> @llvm.arm.neon.vsubls.v4i32(<4 x i16>, <4 x i16>) nounwind readnone -declare <2 x i64> @llvm.arm.neon.vsubls.v2i64(<2 x i32>, <2 x i32>) nounwind readnone - -declare <8 x i16> @llvm.arm.neon.vsublu.v8i16(<8 x i8>, <8 x i8>) nounwind readnone -declare <4 x i32> @llvm.arm.neon.vsublu.v4i32(<4 x i16>, <4 x i16>) nounwind readnone -declare <2 x i64> @llvm.arm.neon.vsublu.v2i64(<2 x i32>, <2 x i32>) nounwind readnone diff --git a/test/CodeGen/ARM/vsubw.ll b/test/CodeGen/ARM/vsubw.ll deleted file mode 100644 index e6b6b5c..0000000 --- a/test/CodeGen/ARM/vsubw.ll +++ /dev/null @@ -1,63 +0,0 @@ -; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s - -define <8 x i16> @vsubws8(<8 x i16>* %A, <8 x i8>* %B) nounwind { -;CHECK: vsubws8: -;CHECK: vsubw.s8 - %tmp1 = load <8 x i16>* %A - %tmp2 = load <8 x i8>* %B - %tmp3 = call <8 x i16> @llvm.arm.neon.vsubws.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2) - ret <8 x i16> %tmp3 -} - -define <4 x i32> @vsubws16(<4 x i32>* %A, <4 x i16>* %B) nounwind { -;CHECK: vsubws16: -;CHECK: vsubw.s16 - %tmp1 = load <4 x i32>* %A - %tmp2 = load <4 x i16>* %B - %tmp3 = call <4 x i32> @llvm.arm.neon.vsubws.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2) - ret <4 x i32> %tmp3 -} - -define <2 x i64> @vsubws32(<2 x i64>* %A, <2 x i32>* %B) nounwind { -;CHECK: vsubws32: -;CHECK: vsubw.s32 - %tmp1 = load <2 x i64>* %A - %tmp2 = load <2 x i32>* %B - %tmp3 = call <2 x i64> @llvm.arm.neon.vsubws.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2) - ret <2 x i64> %tmp3 -} - -define <8 x i16> @vsubwu8(<8 x i16>* %A, <8 x i8>* %B) nounwind { -;CHECK: vsubwu8: -;CHECK: vsubw.u8 - %tmp1 = load <8 x i16>* %A - %tmp2 = load <8 x i8>* %B - %tmp3 = call <8 x i16> @llvm.arm.neon.vsubwu.v8i16(<8 x i16> %tmp1, <8 x i8> %tmp2) - ret <8 x i16> %tmp3 -} - -define <4 x i32> @vsubwu16(<4 x i32>* %A, <4 x i16>* %B) nounwind { -;CHECK: vsubwu16: -;CHECK: vsubw.u16 - %tmp1 = load <4 x i32>* %A - %tmp2 = load <4 x i16>* %B - %tmp3 = call <4 x i32> @llvm.arm.neon.vsubwu.v4i32(<4 x i32> %tmp1, <4 x i16> %tmp2) - ret <4 x i32> %tmp3 -} - -define <2 x i64> @vsubwu32(<2 x i64>* %A, <2 x i32>* %B) nounwind { -;CHECK: vsubwu32: -;CHECK: vsubw.u32 - %tmp1 = load <2 x i64>* %A - %tmp2 = load <2 x i32>* %B - %tmp3 = call <2 x i64> @llvm.arm.neon.vsubwu.v2i64(<2 x i64> %tmp1, <2 x i32> %tmp2) - ret <2 x i64> %tmp3 -} - -declare <8 x i16> @llvm.arm.neon.vsubws.v8i16(<8 x i16>, <8 x i8>) nounwind readnone -declare <4 x i32> @llvm.arm.neon.vsubws.v4i32(<4 x i32>, <4 x i16>) nounwind readnone -declare <2 x i64> @llvm.arm.neon.vsubws.v2i64(<2 x i64>, <2 x i32>) nounwind readnone - -declare <8 x i16> @llvm.arm.neon.vsubwu.v8i16(<8 x i16>, <8 x i8>) nounwind readnone -declare <4 x i32> @llvm.arm.neon.vsubwu.v4i32(<4 x i32>, <4 x i16>) nounwind readnone -declare <2 x i64> @llvm.arm.neon.vsubwu.v2i64(<2 x i64>, <2 x i32>) nounwind readnone diff --git a/test/CodeGen/ARM/vtst.ll b/test/CodeGen/ARM/vtst.ll deleted file mode 100644 index 9357314..0000000 --- a/test/CodeGen/ARM/vtst.ll +++ /dev/null @@ -1,67 +0,0 @@ -; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s - -define <8 x i8> @vtsti8(<8 x i8>* %A, <8 x i8>* %B) nounwind { -;CHECK: vtsti8: -;CHECK: vtst.i8 - %tmp1 = load <8 x i8>* %A - %tmp2 = load <8 x i8>* %B - %tmp3 = and <8 x i8> %tmp1, %tmp2 - %tmp4 = icmp ne <8 x i8> %tmp3, zeroinitializer - %tmp5 = sext <8 x i1> %tmp4 to <8 x i8> - ret <8 x i8> %tmp5 -} - -define <4 x i16> @vtsti16(<4 x i16>* %A, <4 x i16>* %B) nounwind { -;CHECK: vtsti16: -;CHECK: vtst.i16 - %tmp1 = load <4 x i16>* %A - %tmp2 = load <4 x i16>* %B - %tmp3 = and <4 x i16> %tmp1, %tmp2 - %tmp4 = icmp ne <4 x i16> %tmp3, zeroinitializer - %tmp5 = sext <4 x i1> %tmp4 to <4 x i16> - ret <4 x i16> %tmp5 -} - -define <2 x i32> @vtsti32(<2 x i32>* %A, <2 x i32>* %B) nounwind { -;CHECK: vtsti32: -;CHECK: vtst.i32 - %tmp1 = load <2 x i32>* %A - %tmp2 = load <2 x i32>* %B - %tmp3 = and <2 x i32> %tmp1, %tmp2 - %tmp4 = icmp ne <2 x i32> %tmp3, zeroinitializer - %tmp5 = sext <2 x i1> %tmp4 to <2 x i32> - ret <2 x i32> %tmp5 -} - -define <16 x i8> @vtstQi8(<16 x i8>* %A, <16 x i8>* %B) nounwind { -;CHECK: vtstQi8: -;CHECK: vtst.i8 - %tmp1 = load <16 x i8>* %A - %tmp2 = load <16 x i8>* %B - %tmp3 = and <16 x i8> %tmp1, %tmp2 - %tmp4 = icmp ne <16 x i8> %tmp3, zeroinitializer - %tmp5 = sext <16 x i1> %tmp4 to <16 x i8> - ret <16 x i8> %tmp5 -} - -define <8 x i16> @vtstQi16(<8 x i16>* %A, <8 x i16>* %B) nounwind { -;CHECK: vtstQi16: -;CHECK: vtst.i16 - %tmp1 = load <8 x i16>* %A - %tmp2 = load <8 x i16>* %B - %tmp3 = and <8 x i16> %tmp1, %tmp2 - %tmp4 = icmp ne <8 x i16> %tmp3, zeroinitializer - %tmp5 = sext <8 x i1> %tmp4 to <8 x i16> - ret <8 x i16> %tmp5 -} - -define <4 x i32> @vtstQi32(<4 x i32>* %A, <4 x i32>* %B) nounwind { -;CHECK: vtstQi32: -;CHECK: vtst.i32 - %tmp1 = load <4 x i32>* %A - %tmp2 = load <4 x i32>* %B - %tmp3 = and <4 x i32> %tmp1, %tmp2 - %tmp4 = icmp ne <4 x i32> %tmp3, zeroinitializer - %tmp5 = sext <4 x i1> %tmp4 to <4 x i32> - ret <4 x i32> %tmp5 -} |